rbbt-text 1.1.8 → 1.1.9

Sign up to get free protection for your applications and to get access to all the features.
Files changed (47) hide show
  1. checksums.yaml +4 -4
  2. data/lib/rbbt/ner/NER.rb +3 -3
  3. data/lib/rbbt/ner/abner.rb +3 -3
  4. data/lib/rbbt/ner/banner.rb +1 -1
  5. data/lib/rbbt/ner/brat.rb +2 -2
  6. data/lib/rbbt/ner/chemical_tagger.rb +1 -1
  7. data/lib/rbbt/ner/linnaeus.rb +1 -1
  8. data/lib/rbbt/ner/ngram_prefix_dictionary.rb +2 -2
  9. data/lib/rbbt/ner/oscar3.rb +1 -1
  10. data/lib/rbbt/ner/oscar4.rb +1 -1
  11. data/lib/rbbt/ner/patterns.rb +4 -4
  12. data/lib/rbbt/ner/regexpNER.rb +1 -1
  13. data/lib/rbbt/ner/token_trieNER.rb +2 -2
  14. data/lib/rbbt/nlp/genia/sentence_splitter.rb +1 -1
  15. data/lib/rbbt/nlp/nlp.rb +2 -2
  16. data/lib/rbbt/nlp/open_nlp/sentence_splitter.rb +1 -1
  17. data/lib/rbbt/{corpus → text}/corpus.rb +51 -11
  18. data/lib/rbbt/text/corpus/document.rb +361 -0
  19. data/lib/rbbt/text/corpus/document_repo.rb +68 -0
  20. data/lib/rbbt/text/corpus/sources/pmid.rb +34 -0
  21. data/lib/rbbt/text/document.rb +39 -0
  22. data/lib/rbbt/{ner → text}/segment.rb +11 -6
  23. data/lib/rbbt/{ner → text}/segment/docid.rb +1 -1
  24. data/lib/rbbt/{ner → text}/segment/named_entity.rb +2 -2
  25. data/lib/rbbt/{ner → text}/segment/relationship.rb +1 -1
  26. data/lib/rbbt/{ner → text}/segment/segmented.rb +1 -1
  27. data/lib/rbbt/{ner → text}/segment/token.rb +1 -1
  28. data/lib/rbbt/{ner → text}/segment/transformed.rb +47 -42
  29. data/test/rbbt/entity/test_document.rb +1 -0
  30. data/test/rbbt/ner/test_abner.rb +1 -0
  31. data/test/rbbt/ner/test_linnaeus.rb +1 -0
  32. data/test/rbbt/nlp/open_nlp/test_sentence_splitter.rb +0 -1
  33. data/test/rbbt/text/corpus/sources/test_pmid.rb +33 -0
  34. data/test/rbbt/text/corpus/test_document.rb +52 -0
  35. data/test/rbbt/{ner → text}/segment/test_named_entity.rb +2 -2
  36. data/test/rbbt/{ner → text}/segment/test_relationship.rb +0 -0
  37. data/test/rbbt/{ner → text}/segment/test_segmented.rb +1 -1
  38. data/test/rbbt/{ner → text}/segment/test_transformed.rb +96 -3
  39. data/test/rbbt/text/test_corpus.rb +34 -0
  40. data/test/rbbt/text/test_document.rb +58 -0
  41. data/test/rbbt/{ner → text}/test_segment.rb +2 -2
  42. data/test/test_helper.rb +3 -3
  43. metadata +32 -24
  44. data/lib/rbbt/corpus/document.rb +0 -266
  45. data/lib/rbbt/corpus/document_repo.rb +0 -137
  46. data/lib/rbbt/corpus/sources/pubmed.rb +0 -27
  47. data/lib/rbbt/entity/document.rb +0 -75
@@ -0,0 +1,52 @@
1
+ require File.join(File.expand_path(File.dirname(__FILE__)), '../../..', 'test_helper.rb')
2
+ require 'rbbt/text/corpus/document'
3
+
4
# Exercises entity definition and global-TSV persistence on Corpus::Document,
# both for entities computed one document at a time (+words+) and for
# entities computed for a batch of documents (+words2+).
class TestCorpusDocument < Test::Unit::TestCase
  # Registers the two entity types and their persistence files before each test.
  def setup
    Log.severity = 0

    Corpus::Document.define :words do
      words = self.text.split(" ")
      Segment.align(self.text, words)
    end

    # Both persistence files go under the directory created here. The paths
    # used to be spelled "anotations", which pointed them at a different
    # directory from the one that was just created.
    Open.mkdir Rbbt.tmp.test.annotations.find
    Corpus::Document.persist_in_global_tsv(:words, Rbbt.tmp.test.annotations.words.find)

    Corpus::Document.define_multiple :words2 do |documents|
      documents.collect do |doc|
        words = doc.text.split(" ")
        Segment.align(doc.text, words)
      end
    end

    Corpus::Document.persist_in_global_tsv(:words2, Rbbt.tmp.test.annotations.counts.find)
  end

  # The aligned word segments, sorted with Segment.sort, must match a plain
  # whitespace split of the text.
  def test_words
    text = "This is a test document"
    document = Corpus::Document.new(Rbbt.tmp.test.persist, "TEST:test_doc", text)
    assert_equal Segment.sort(document.words), text.split(" ")
  end

  # Prepares +words2+ for two documents in one call and checks each result.
  def test_words_multiple
    document1 = Corpus::Document.new(Rbbt.tmp.test.persist, "TEST:test_doc:1", "This is a test document")
    document2 = Corpus::Document.new(Rbbt.tmp.test.persist, "TEST:test_doc2:2", "This is a another test document")

    docs = [document1, document2]

    Corpus::Document.prepare_multiple(docs, :words2)

    assert_equal document1.words2, document1.text.split(" ")
    assert_equal document2.words2, document2.text.split(" ")

    # Fresh instances with the same docids, prepared a second time.
    # NOTE(review): presumably this exercises the already-persisted path; it
    # only checks that the call does not raise.
    document1 = Corpus::Document.new(Rbbt.tmp.test.persist, "TEST:test_doc:1", "This is a test document")
    document2 = Corpus::Document.new(Rbbt.tmp.test.persist, "TEST:test_doc2:2", "This is a another test document")

    docs = [document1, document2]

    Corpus::Document.prepare_multiple(docs, :words2)
  end
end
52
+
@@ -1,6 +1,6 @@
1
1
  require File.join(File.expand_path(File.dirname(__FILE__)), '../../..', 'test_helper.rb')
2
- require 'rbbt/ner/segment'
3
- require 'rbbt/ner/segment/named_entity'
2
+ require 'rbbt/text/segment'
3
+ require 'rbbt/text/segment/named_entity'
4
4
 
5
5
  class TestClass < Test::Unit::TestCase
6
6
  def test_info
@@ -1,5 +1,5 @@
1
1
  require File.join(File.expand_path(File.dirname(__FILE__)), '../../..', 'test_helper.rb')
2
- require 'rbbt/ner/segment/segmented'
2
+ require 'rbbt/text/segment/segmented'
3
3
 
4
4
  class TestClass < Test::Unit::TestCase
5
5
  def test_split
@@ -1,6 +1,6 @@
1
1
  require File.join(File.expand_path(File.dirname(__FILE__)), '../../..', 'test_helper.rb')
2
- require 'rbbt/ner/segment/transformed'
3
- require 'rbbt/ner/segment/named_entity'
2
+ require 'rbbt/text/segment/transformed'
3
+ require 'rbbt/text/segment/named_entity'
4
4
  require 'rexml/document'
5
5
 
6
6
  class TestClass < Test::Unit::TestCase
@@ -98,7 +98,6 @@ More recently, PPAR activators were shown to inhibit the activation of inflammat
98
98
 
99
99
  assert_equal original, a
100
100
 
101
-
102
101
  assert_equal original, a
103
102
 
104
103
  exp1, exp2 = nil, nil
@@ -286,5 +285,99 @@ More recently, PPAR activators were shown to inhibit the activation of inflammat
286
285
  end
287
286
 
288
287
  end
288
+
289
# A segment that starts mid-document (non-zero offset) must be transformed
# correctly when the entity offset is relative to the whole document.
def test_by_sentence
  a = "This is a first sentences. ILF can bind to purine-rich regulatory motifs such as the human T-cell leukemia virus-long terminal region and the interleukin-2 promoter."

  sentence_pos = a.index('.')+2
  sentence = a[sentence_pos..-1]
  Segment.setup sentence, sentence_pos

  gene1 = "ILF"
  gene1.extend NamedEntity
  gene1.offset = a.index gene1
  gene1.type = "Gene"

  # Build the expectation from the untouched sentence. Inside the block
  # +sentence+ already holds the transformed text, so calling
  # sub("ILF", "[G]") there finds nothing to replace and the assertion would
  # compare the string with itself.
  expected = sentence.sub("ILF", "[G]")

  Transformed.with_transform(sentence, [gene1], "[G]") do
    assert_equal expected, sentence
  end
end
305
+
306
# Nested transformations: viral entities are replaced first, then the human
# entities that do not overlap any viral one are replaced inside the
# already-transformed sentence.
def test_collisions
  text =<<-EOF.chomp
This is another sentence. Protein (nsp1), helicase (nsp13).
  EOF

  # Work on the second sentence only; its offset is its position in +text+.
  sentence_pos = text.index(".") + 2
  sentence = Segment.setup(text[sentence_pos..-1], sentence_pos)

  viral = %w(nsp1 nsp13)
  human = %w(helicase)

  # Names that do not occur in the text yield nil from `next`; compact drops them.
  viral = viral.collect do |e|
    next unless text.index(e)
    NamedEntity.setup(e, text.index(e), "VirGene")
  end.compact

  # Compacted like +viral+, so a missing name cannot leave a nil for
  # `overlaps` to be called on below.
  human = human.collect do |e|
    next unless text.index(e)
    NamedEntity.setup(e, text.index(e), "HumGene")
  end.compact

  clean = human.reject{|s| s.overlaps(viral).any?}

  Transformed.with_transform(sentence, viral, Proc.new{|e| "[VIRAL=#{e}]"}) do
    assert_equal sentence, "Protein ([VIRAL=nsp1]), helicase ([VIRAL=nsp13])."
    Transformed.with_transform(sentence, clean, Proc.new{|e| "[HUMAN=#{e}]"}) do
      assert_equal sentence, "Protein ([VIRAL=nsp1]), [HUMAN=helicase] ([VIRAL=nsp13])."
    end
  end
end
336
+
337
+
338
# Same scenario as the small collision test, on a long sentence with many
# entities. The expected strings are built independently with gsub! so they
# can be compared with what the nested transformations produce.
def test_collisions2
  text =<<-EOF.chomp
This is another sentence. Among the nonstructural proteins, the leader protein (nsp1), the papain-like protease (nsp3), the nsp4, the 3C-like protease (nsp5), the nsp7, the nsp8, the nsp9, the nsp10, the RNA-directed RNA polymerase (nsp12), the helicase (nsp13), the guanine-N7 methyltransferase (nsp14), the uridylate-specific endoribonuclease (nsp15), the 2'-O-methyltransferase (nsp16), and the ORF7a protein could be built on the basis of homology templates.
  EOF

  # Work on the second sentence only; its offset is its position in +text+.
  sentence_pos = text.index(".") + 2
  sentence = Segment.setup(text[sentence_pos..-1], sentence_pos)

  # Untransformed copy used to build the expected strings.
  target = sentence.dup

  viral = %w(nsp1 nsp4 nsp5 nsp7 nsp8 nsp9 nsp10 nsp12 nsp13 nsp14 nsp15 ORF7a spike)
  human = %w(helicase nsp5 nsp4 nsp3)

  # Names that do not occur in the text ("spike") yield nil from `next`;
  # compact drops them.
  viral = viral.collect do |e|
    next unless text.index(e)
    NamedEntity.setup(e, text.index(e), "VirGene")
  end.compact

  # Compacted like +viral+, so a missing name cannot leave a nil for
  # `overlaps` to be called on below.
  human = human.collect do |e|
    next unless text.index(e)
    NamedEntity.setup(e, text.index(e), "HumGene")
  end.compact

  clean = human.reject{|s| s.overlaps(viral).any?}

  tag = Misc.digest("TAG")

  viral.each do |e|
    target.gsub!(/\b#{e}\b/, "[VIRAL=#{e}-#{tag}]")
  end

  # Expected text after the viral pass only.
  target_tmp = target.dup

  clean.each do |e|
    target.gsub!(/\b#{e}\b/, "[HUMAN=#{e}-#{tag}]")
  end

  Transformed.with_transform(sentence, viral, Proc.new{|e| "[VIRAL=#{e}-#{tag}]"}) do
    assert_equal sentence, target_tmp
    Transformed.with_transform(sentence, clean, Proc.new{|e| "[HUMAN=#{e}-#{tag}]"}) do
      assert_equal sentence, target
    end
  end
end
289
382
  end
290
383
 
@@ -0,0 +1,34 @@
1
+ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
2
+ $LOAD_PATH.unshift(File.dirname(__FILE__))
3
+ require 'test/unit'
4
+ require 'rbbt-util'
5
+ require 'rbbt/text/corpus'
6
+
7
# Reopens Corpus::Document to register a :words entity for the test below:
# the document text split on whitespace.
class Corpus::Document
  define(:words) { text.split(" ") }
end
13
+
14
# Round-trip test for Corpus: a document added to a corpus can be read back
# by docid and by namespace/id, also through a second Corpus instance opened
# on the same directory.
class TestClass < Test::Unit::TestCase
  def test_document
    Log.severity = 0
    text = "This is a test document"

    TmpFile.with_file do |dir|
      # Store the document and read it back through the same instance.
      first_corpus = Corpus.new(dir)
      docid = first_corpus.add_document(text, :TEST, :test_doc)
      assert_equal text, first_corpus.docid(docid).text

      # A new instance on the same directory must see the stored document,
      # both by docid and through find.
      second_corpus = Corpus.new(dir)
      assert_equal text, second_corpus.docid(docid).text
      assert_equal text, second_corpus.find(:TEST, :test_doc).first.text
    end
  end
end
34
+
@@ -0,0 +1,58 @@
1
+ require File.join(File.expand_path(File.dirname(__FILE__)), '../..', 'test_helper.rb')
2
+ require 'rbbt/text/document'
3
+ require 'rbbt/text/corpus/sources/pmid'
4
+
5
# Tests for Document entities backed by a Corpus: text and title lookup for
# PMID documents, plus the :words and :genes entity types.
class TestDocument < Test::Unit::TestCase
  # Points Document at a test corpus and registers the :words and :genes
  # entity types with global TSV persistence.
  def setup
    Log.severity = 0
    Document.corpus = Corpus.new(Rbbt.tmp.test.document_corpus)

    Corpus::Document.define(:words) do
      Segment.align(self.text, self.text.split(" "))
    end

    Corpus::Document.define(:genes) do
      # Loaded lazily, only when genes are actually requested.
      require 'rbbt/ner/banner'
      Banner.new.match(self.text)
    end

    Corpus::Document.persist_in_global_tsv("genes")
    Corpus::Document.persist_in_global_tsv(:words)
  end

  def test_title_and_text
    doc = Document.setup('PMID:32272262')

    assert doc.text.downcase.include?("covid")
    assert_equal "High-resolution Chest CT Features and Clinical Characteristics of Patients Infected with COVID-19 in Jiangsu, China.", doc.title
  end

  def test_full_text
    doc = Document.setup('PMID:4304705')
    assert doc.text.length < doc.full_text.length
  end

  def test_words
    doc = Document.setup('PMID:32272262')
    first_word = doc.entities(:words).first
    assert first_word.respond_to?(:offset)
  end

  # Runs the same add-and-annotate round twice with identical text.
  # NOTE(review): presumably the second round exercises the persisted
  # annotations - confirm.
  def test_genes
    2.times do
      text = "This is a mention to TP53, a gene that should be found"
      doc = Document.setup(Document.corpus.add_document(text, "TEST"))
      found = doc.entities(:genes)

      assert_equal "TP53", found.first
      assert found.first.respond_to?(:offset)
    end
  end
end
58
+
@@ -1,7 +1,7 @@
1
1
  require File.join(File.expand_path(File.dirname(__FILE__)), '../..', 'test_helper.rb')
2
- require 'rbbt/ner/segment'
2
+ require 'rbbt/text/segment'
3
3
 
4
- class TestClass < Test::Unit::TestCase
4
+ class TestSegment < Test::Unit::TestCase
5
5
  def test_info
6
6
  a = "test"
7
7
  a.extend Segment
@@ -6,7 +6,7 @@ require 'rbbt'
6
6
  require 'rbbt/persist'
7
7
  require 'rbbt/util/tmpfile'
8
8
  require 'rbbt/util/log'
9
- require 'rbbt/corpus/document_repo'
9
+ require 'rbbt/text/corpus'
10
10
 
11
11
  class Test::Unit::TestCase
12
12
  def get_test_datafile(file)
@@ -22,8 +22,8 @@ class Test::Unit::TestCase
22
22
  FileUtils.rm_rf Rbbt.tmp.test.find :user
23
23
  Persist::CONNECTIONS.values.each do |c| c.close end
24
24
  Persist::CONNECTIONS.clear
25
- DocumentRepo::TC_CONNECTIONS.values.each do |c| c.close end
26
- DocumentRepo::TC_CONNECTIONS.clear
25
+ Corpus::DocumentRepo::TC_CONNECTIONS.values.each do |c| c.close end
26
+ Corpus::DocumentRepo::TC_CONNECTIONS.clear
27
27
  end
28
28
 
29
29
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rbbt-text
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.1.8
4
+ version: 1.1.9
5
5
  platform: ruby
6
6
  authors:
7
7
  - Miguel Vazquez
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-01-31 00:00:00.000000000 Z
11
+ date: 2020-04-13 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rbbt-util
@@ -78,11 +78,6 @@ files:
78
78
  - lib/rbbt/bow/bow.rb
79
79
  - lib/rbbt/bow/dictionary.rb
80
80
  - lib/rbbt/bow/misc.rb
81
- - lib/rbbt/corpus/corpus.rb
82
- - lib/rbbt/corpus/document.rb
83
- - lib/rbbt/corpus/document_repo.rb
84
- - lib/rbbt/corpus/sources/pubmed.rb
85
- - lib/rbbt/entity/document.rb
86
81
  - lib/rbbt/ner/NER.rb
87
82
  - lib/rbbt/ner/abner.rb
88
83
  - lib/rbbt/ner/banner.rb
@@ -99,17 +94,22 @@ files:
99
94
  - lib/rbbt/ner/rnorm.rb
100
95
  - lib/rbbt/ner/rnorm/cue_index.rb
101
96
  - lib/rbbt/ner/rnorm/tokens.rb
102
- - lib/rbbt/ner/segment.rb
103
- - lib/rbbt/ner/segment/docid.rb
104
- - lib/rbbt/ner/segment/named_entity.rb
105
- - lib/rbbt/ner/segment/relationship.rb
106
- - lib/rbbt/ner/segment/segmented.rb
107
- - lib/rbbt/ner/segment/token.rb
108
- - lib/rbbt/ner/segment/transformed.rb
109
97
  - lib/rbbt/ner/token_trieNER.rb
110
98
  - lib/rbbt/nlp/genia/sentence_splitter.rb
111
99
  - lib/rbbt/nlp/nlp.rb
112
100
  - lib/rbbt/nlp/open_nlp/sentence_splitter.rb
101
+ - lib/rbbt/text/corpus.rb
102
+ - lib/rbbt/text/corpus/document.rb
103
+ - lib/rbbt/text/corpus/document_repo.rb
104
+ - lib/rbbt/text/corpus/sources/pmid.rb
105
+ - lib/rbbt/text/document.rb
106
+ - lib/rbbt/text/segment.rb
107
+ - lib/rbbt/text/segment/docid.rb
108
+ - lib/rbbt/text/segment/named_entity.rb
109
+ - lib/rbbt/text/segment/relationship.rb
110
+ - lib/rbbt/text/segment/segmented.rb
111
+ - lib/rbbt/text/segment/token.rb
112
+ - lib/rbbt/text/segment/transformed.rb
113
113
  - share/install/software/ABNER
114
114
  - share/install/software/BANNER
115
115
  - share/install/software/ChemicalTagger
@@ -129,10 +129,6 @@ files:
129
129
  - test/rbbt/bow/test_dictionary.rb
130
130
  - test/rbbt/bow/test_misc.rb
131
131
  - test/rbbt/entity/test_document.rb
132
- - test/rbbt/ner/segment/test_named_entity.rb
133
- - test/rbbt/ner/segment/test_relationship.rb
134
- - test/rbbt/ner/segment/test_segmented.rb
135
- - test/rbbt/ner/segment/test_transformed.rb
136
132
  - test/rbbt/ner/test_NER.rb
137
133
  - test/rbbt/ner/test_abner.rb
138
134
  - test/rbbt/ner/test_banner.rb
@@ -146,11 +142,19 @@ files:
146
142
  - test/rbbt/ner/test_patterns.rb
147
143
  - test/rbbt/ner/test_regexpNER.rb
148
144
  - test/rbbt/ner/test_rnorm.rb
149
- - test/rbbt/ner/test_segment.rb
150
145
  - test/rbbt/ner/test_token_trieNER.rb
151
146
  - test/rbbt/nlp/genia/test_sentence_splitter.rb
152
147
  - test/rbbt/nlp/open_nlp/test_sentence_splitter.rb
153
148
  - test/rbbt/nlp/test_nlp.rb
149
+ - test/rbbt/text/corpus/sources/test_pmid.rb
150
+ - test/rbbt/text/corpus/test_document.rb
151
+ - test/rbbt/text/segment/test_named_entity.rb
152
+ - test/rbbt/text/segment/test_relationship.rb
153
+ - test/rbbt/text/segment/test_segmented.rb
154
+ - test/rbbt/text/segment/test_transformed.rb
155
+ - test/rbbt/text/test_corpus.rb
156
+ - test/rbbt/text/test_document.rb
157
+ - test/rbbt/text/test_segment.rb
154
158
  - test/test_helper.rb
155
159
  homepage: http://github.com/mikisvaz/rbbt-util
156
160
  licenses: []
@@ -178,6 +182,15 @@ test_files:
178
182
  - test/rbbt/nlp/test_nlp.rb
179
183
  - test/rbbt/nlp/open_nlp/test_sentence_splitter.rb
180
184
  - test/rbbt/nlp/genia/test_sentence_splitter.rb
185
+ - test/rbbt/text/test_document.rb
186
+ - test/rbbt/text/corpus/sources/test_pmid.rb
187
+ - test/rbbt/text/corpus/test_document.rb
188
+ - test/rbbt/text/test_segment.rb
189
+ - test/rbbt/text/test_corpus.rb
190
+ - test/rbbt/text/segment/test_transformed.rb
191
+ - test/rbbt/text/segment/test_relationship.rb
192
+ - test/rbbt/text/segment/test_named_entity.rb
193
+ - test/rbbt/text/segment/test_segmented.rb
181
194
  - test/rbbt/bow/test_bow.rb
182
195
  - test/rbbt/bow/test_misc.rb
183
196
  - test/rbbt/bow/test_dictionary.rb
@@ -194,11 +207,6 @@ test_files:
194
207
  - test/rbbt/ner/test_banner.rb
195
208
  - test/rbbt/ner/test_token_trieNER.rb
196
209
  - test/rbbt/ner/test_finder.rb
197
- - test/rbbt/ner/test_segment.rb
198
210
  - test/rbbt/ner/test_linnaeus.rb
199
- - test/rbbt/ner/segment/test_transformed.rb
200
- - test/rbbt/ner/segment/test_relationship.rb
201
- - test/rbbt/ner/segment/test_named_entity.rb
202
- - test/rbbt/ner/segment/test_segmented.rb
203
211
  - test/rbbt/ner/test_oscar4.rb
204
212
  - test/test_helper.rb
@@ -1,266 +0,0 @@
1
- require 'rbbt/ner/segment'
2
- require 'rbbt/ner/segment/segmented'
3
- require 'rbbt/tsv'
4
- require 'rbbt/resource/path'
5
- require 'rbbt/persist/tsv'
6
- require 'rbbt/util/misc'
7
- require 'json'
8
-
9
# Document (file removed in this release): a text with a docid and caches of
# annotation segments (@segments) and segment indices (@segment_indices).
# Entity types are registered with Document.define; the persist,
# persist_in_tsv and persist_in_global_tsv class methods each generate a
# load_with_persistence_<entity> method that stores the produced segments.
- class Document
10
-
11
- attr_accessor :text, :docid, :namespace, :id, :type, :hash, :segments, :segment_indices, :persist_dir, :global_persistence
12
# All arguments are optional. A given persist_dir is passed through
# Path.setup.
# NOTE(review): the guard `Path == @persist_dir` compares the Path constant
# itself with the value; `Path === @persist_dir` looks like what was
# intended - confirm.
- def initialize(persist_dir = nil, docid = nil, text = nil, global_persistence = nil)
13
- @segments = {}
14
- @segment_indices = {}
15
-
16
- if not persist_dir.nil?
17
- @persist_dir = persist_dir
18
- @persist_dir = Path.setup(@persist_dir) if not Path == @persist_dir
19
- end
20
-
21
- @global_persistence = global_persistence
22
-
23
- if not docid.nil?
24
- @docid = docid
25
- update_docid
26
- end
27
- @text = text unless text.nil?
28
- end
29
-
30
# Splits the docid on ":" into @namespace, @id, @type and @hash; the -1
# limit keeps trailing empty fields.
- def update_docid
31
- @namespace, @id, @type, @hash = docid.split(":", -1)
32
- end
33
-
34
- def docid=(docid)
35
- @docid = docid
36
- update_docid
37
- end
38
-
39
- #{{{ PERSISTENCE
40
-
41
- TSV_REPOS = {}
42
- FIELDS_FOR_ENTITY_PERSISTENCE = {}
43
# Generates load_with_persistence_<entity>, which stores the TSV built from
# produce_<entity> in the file "<persist_dir>/<entity>" through
# Persist.persist. With raw == :check it returns nil when that file already
# exists; otherwise a truthy raw returns the TSV itself, and a falsy raw
# rebuilds the segments from the TSV rows with Segment.load_tsv_values.
# fields, when given, is normalized to an array of strings.
- def self.persist(entity, fields = nil)
44
-
45
- if not fields.nil?
46
- fields = [fields] if not Array === fields
47
- fields = fields.collect{|f| f.to_s}
48
- FIELDS_FOR_ENTITY_PERSISTENCE[entity.to_s] = fields
49
- end
50
-
51
- self.class_eval <<-EOC
52
- def load_with_persistence_#{entity}(raw = false)
53
- fields = FIELDS_FOR_ENTITY_PERSISTENCE["#{ entity }"]
54
-
55
- tsv_file = File.join(@persist_dir.find, "#{ entity }")
56
-
57
- return nil if raw == :check and File.exists? tsv_file
58
-
59
- annotations = Persist.persist("Entity[#{ entity }]", :tsv, :file => tsv_file) do
60
- segments = produce_#{entity}
61
- tsv = Segment.tsv(segments, fields)
62
- end
63
-
64
- return annotations if raw
65
-
66
- annotations.unnamed = true
67
- annotations.collect{|id, annotation|
68
- Segment.load_tsv_values(text, annotation, annotations.fields)
69
- }
70
- end
71
- EOC
72
- end
73
-
74
# Generates load_with_persistence_<entity> backed by a key-value repo: the
# tsv argument when given, otherwise a TokyoCabinet store opened at
# persist_dir.annotations_by_type. A second argument that does not respond
# to :keys is treated as the fields list instead of a repo.
- def self.persist_in_tsv(entity, tsv = nil, fields = nil)
75
- if not tsv.nil? and not tsv.respond_to?(:keys)
76
- fields = tsv
77
- tsv = nil
78
- end
79
-
80
- TSV_REPOS[entity.to_s] = tsv
81
-
82
- if not fields.nil?
83
- fields = [fields] if not Array === fields
84
- fields = fields.collect{|f| f.to_s}
85
- FIELDS_FOR_ENTITY_PERSISTENCE[entity.to_s] = fields unless fields.nil?
86
- end
87
-
88
- self.class_eval <<-EOC
89
- def load_with_persistence_#{entity}(raw = false)
90
- repo = TSV_REPOS["#{ entity }"]
91
- if repo.nil?
92
- raise "No persistence file or persistence dir for persist_in_tsv" if persist_dir.nil?
93
- repo = Persist.open_tokyocabinet(persist_dir.annotations_by_type.find, true, :marshal_tsv)
94
- end
95
-
96
- fields = FIELDS_FOR_ENTITY_PERSISTENCE["#{ entity }"]
97
- if not repo.include? "#{ entity }"
98
- segments = produce_#{entity}
99
- repo.write
100
- repo["#{entity}"] = Segment.tsv(segments, fields)
101
- repo.read
102
- else
103
- if raw == :check
104
- repo.close
105
- return nil
106
- end
107
- end
108
-
109
-
110
- annotations = repo["#{entity}"]
111
-
112
- repo.close
113
-
114
-
115
- return annotations if raw
116
-
117
- annotations.unnamed = true
118
- annotations.collect{|id, annotation|
119
- Segment.load_tsv_values(text, annotation, annotations.fields)
120
- }
121
- end
122
- EOC
123
- end
124
-
125
# Generates load_with_persistence_<entity> backed by a single TSV (the tsv
# argument, or the instance's @global_persistence) whose rows are selected
# with filters on doc_field ("Document ID" by default) and entity_field
# ("Entity Type" by default). When produce_<entity> returns no segments, a
# placeholder segment set up with -1 is stored instead, and rows whose
# "Start" is -1 are skipped again when segments are loaded.
- def self.persist_in_global_tsv(entity, tsv = nil, fields = nil, doc_field = nil, entity_field = nil)
126
- doc_field ||= "Document ID"
127
- entity_field ||= "Entity Type"
128
-
129
- TSV_REPOS[entity.to_s] = tsv
130
-
131
- if not fields.nil?
132
- fields = [fields] if not Array === fields
133
- fields = fields.collect{|f| f.to_s}
134
- else
135
- fields = nil
136
- end
137
-
138
- FIELDS_FOR_ENTITY_PERSISTENCE[entity.to_s] = fields
139
-
140
- self.class_eval <<-EOC
141
- def load_with_persistence_#{entity}(raw = false)
142
- fields = FIELDS_FOR_ENTITY_PERSISTENCE["#{ entity }"]
143
-
144
- data = TSV_REPOS["#{ entity }"] || @global_persistence
145
-
146
- data.read true
147
-
148
- fields = data.fields if fields.nil? and data.respond_to? :fields
149
-
150
-
151
- if data.respond_to? :persistence_path and String === data.persistence_path
152
- data.filter(data.persistence_path + '.filters')
153
- end
154
- data.add_filter("field:#{ doc_field }", @docid)
155
- data.add_filter("field:#{ entity_field }", "#{ entity }")
156
- keys = data.keys
157
- data.pop_filter
158
- data.pop_filter
159
-
160
- if keys.empty?
161
- segments = produce_#{entity}
162
- segments << Segment.setup("No #{entity} found in document " + @docid.to_s, -1) if segments.empty?
163
- tsv = Segment.tsv(segments, *fields.reject{|f| ["#{doc_field}", "#{entity_field}", "Start", "End", "annotation_types"].include? f})
164
-
165
- tsv.add_field "#{ doc_field }" do
166
- @docid
167
- end
168
-
169
- tsv.add_field "#{ entity_field }" do
170
- "#{ entity }"
171
- end
172
-
173
- data.add_filter("field:#{ doc_field }", @docid)
174
- data.add_filter("field:#{ entity_field }", "#{ entity }")
175
- data.write true
176
- keys = tsv.collect do |key, value|
177
- data[key] = value
178
- key
179
- end
180
- data.pop_filter
181
- data.pop_filter
182
- data.read
183
-
184
- else
185
- if raw == :check
186
- data.close
187
- return nil
188
- end
189
- end
190
-
191
- return data.values if raw
192
-
193
- start_pos = data.identify_field "Start"
194
- segments = data.values_at(*keys).collect{|annotation|
195
- pos = annotation[start_pos]
196
- Segment.load_tsv_values(text, annotation, data.fields) unless [-1, "-1", [-1], ["-1"]].include? pos
197
- }.compact
198
- data.close
199
-
200
- segments
201
- end
202
- EOC
203
- end
204
-
205
-
206
# Registers an entity type: the block becomes produce_<entity>, and
# load_<entity>, <entity> and <entity>_at are generated. load_<entity> calls
# load_with_persistence_<entity> when that method exists and @persist_dir is
# set, and produce_<entity> otherwise; the result is cached in segments.
- def self.define(entity, &block)
207
- send :define_method, "produce_#{entity}", &block
208
-
209
- self.class_eval <<-EOC
210
- def load_#{entity}(raw = false)
211
- return if segments.include? "#{ entity }"
212
- if self.respond_to?("load_with_persistence_#{entity}") and not @persist_dir.nil?
213
- segments["#{entity}"] = load_with_persistence_#{entity}(raw)
214
- else
215
- segments["#{ entity }"] = produce_#{entity}
216
- end
217
- end
218
-
219
- def #{entity}(raw = false)
220
- begin
221
- entities = segments["#{ entity }"]
222
- if entities.nil?
223
- load_#{entity}(raw)
224
- entities = segments["#{ entity }"]
225
- end
226
- end
227
-
228
- entities
229
- end
230
-
231
- def #{entity}_at(pos, persist = false)
232
- segment_index("#{ entity }", persist ? File.join(@persist_dir, 'ranges') : nil)[pos]
233
- end
234
-
235
- EOC
236
- end
237
-
238
# Memoizes, per name, the Segment.index built over self.send(name). The index
# is built with :memory unless persist_dir is given, in which case the file
# "<persist_dir>/<name>.range" is used.
- def segment_index(name, persist_dir = nil)
239
- @segment_indices[name] ||= Segment.index(self.send(name), persist_dir.nil? ? :memory : File.join(persist_dir, name + '.range'))
240
- end
241
-
242
# Sets segment up as Segmented and, for each annotation name, stores what
# that name's index returns for segment.range and defines a reader method of
# the same name on the segment's singleton class. A trailing Hash is taken as
# options; :persist only has an effect when @persist_dir is set. Returns
# segment.
- def load_into(segment, *annotations)
243
- options = annotations.pop if Hash === annotations.last
244
- options ||= {}
245
-
246
- if options[:persist] and not @persist_dir.nil?
247
- persist_dir = File.join(@persist_dir, 'ranges')
248
- else
249
- persist_dir = nil
250
- end
251
-
252
- Segmented.setup(segment, {})
253
- annotations.collect do |name|
254
- name = name.to_s
255
- index = segment_index(name, persist_dir)
256
- annotations = index[segment.range]
257
- segment.segments[name] ||= {}
258
- segment.segments[name] = annotations
259
- class << segment
260
- self
261
- end.class_eval "def #{ name }; @segments['#{ name }']; end"
262
- end
263
-
264
- segment
265
- end
266
- end