rbbt-text 1.1.8 → 1.1.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. checksums.yaml +4 -4
  2. data/lib/rbbt/ner/NER.rb +3 -3
  3. data/lib/rbbt/ner/abner.rb +3 -3
  4. data/lib/rbbt/ner/banner.rb +1 -1
  5. data/lib/rbbt/ner/brat.rb +2 -2
  6. data/lib/rbbt/ner/chemical_tagger.rb +1 -1
  7. data/lib/rbbt/ner/linnaeus.rb +1 -1
  8. data/lib/rbbt/ner/ngram_prefix_dictionary.rb +2 -2
  9. data/lib/rbbt/ner/oscar3.rb +1 -1
  10. data/lib/rbbt/ner/oscar4.rb +1 -1
  11. data/lib/rbbt/ner/patterns.rb +4 -4
  12. data/lib/rbbt/ner/regexpNER.rb +1 -1
  13. data/lib/rbbt/ner/token_trieNER.rb +2 -2
  14. data/lib/rbbt/nlp/genia/sentence_splitter.rb +1 -1
  15. data/lib/rbbt/nlp/nlp.rb +2 -2
  16. data/lib/rbbt/nlp/open_nlp/sentence_splitter.rb +1 -1
  17. data/lib/rbbt/{corpus → text}/corpus.rb +51 -11
  18. data/lib/rbbt/text/corpus/document.rb +361 -0
  19. data/lib/rbbt/text/corpus/document_repo.rb +68 -0
  20. data/lib/rbbt/text/corpus/sources/pmid.rb +34 -0
  21. data/lib/rbbt/text/document.rb +39 -0
  22. data/lib/rbbt/{ner → text}/segment.rb +11 -6
  23. data/lib/rbbt/{ner → text}/segment/docid.rb +1 -1
  24. data/lib/rbbt/{ner → text}/segment/named_entity.rb +2 -2
  25. data/lib/rbbt/{ner → text}/segment/relationship.rb +1 -1
  26. data/lib/rbbt/{ner → text}/segment/segmented.rb +1 -1
  27. data/lib/rbbt/{ner → text}/segment/token.rb +1 -1
  28. data/lib/rbbt/{ner → text}/segment/transformed.rb +47 -42
  29. data/test/rbbt/entity/test_document.rb +1 -0
  30. data/test/rbbt/ner/test_abner.rb +1 -0
  31. data/test/rbbt/ner/test_linnaeus.rb +1 -0
  32. data/test/rbbt/nlp/open_nlp/test_sentence_splitter.rb +0 -1
  33. data/test/rbbt/text/corpus/sources/test_pmid.rb +33 -0
  34. data/test/rbbt/text/corpus/test_document.rb +52 -0
  35. data/test/rbbt/{ner → text}/segment/test_named_entity.rb +2 -2
  36. data/test/rbbt/{ner → text}/segment/test_relationship.rb +0 -0
  37. data/test/rbbt/{ner → text}/segment/test_segmented.rb +1 -1
  38. data/test/rbbt/{ner → text}/segment/test_transformed.rb +96 -3
  39. data/test/rbbt/text/test_corpus.rb +34 -0
  40. data/test/rbbt/text/test_document.rb +58 -0
  41. data/test/rbbt/{ner → text}/test_segment.rb +2 -2
  42. data/test/test_helper.rb +3 -3
  43. metadata +32 -24
  44. data/lib/rbbt/corpus/document.rb +0 -266
  45. data/lib/rbbt/corpus/document_repo.rb +0 -137
  46. data/lib/rbbt/corpus/sources/pubmed.rb +0 -27
  47. data/lib/rbbt/entity/document.rb +0 -75
@@ -0,0 +1,52 @@
1
+ require File.join(File.expand_path(File.dirname(__FILE__)), '../../..', 'test_helper.rb')
2
+ require 'rbbt/text/corpus/document'
3
+
4
+ class TestCorpusDocument < Test::Unit::TestCase
5
+ def setup
6
+ Log.severity = 0
7
+
8
+ Corpus::Document.define :words do
9
+ words = self.text.split(" ")
10
+ Segment.align(self.text, words)
11
+ end
12
+
13
+ Open.mkdir Rbbt.tmp.test.annotations.find
14
+ Corpus::Document.persist_in_global_tsv(:words, Rbbt.tmp.test.anotations.words.find)
15
+
16
+
17
+ Corpus::Document.define_multiple :words2 do |documents|
18
+ documents.collect do |doc|
19
+ words = doc.text.split(" ")
20
+ Segment.align(doc.text, words)
21
+ end
22
+ end
23
+
24
+ Corpus::Document.persist_in_global_tsv(:words2, Rbbt.tmp.test.anotations.counts.find)
25
+ end
26
+
27
+ def test_words
28
+ text = "This is a test document"
29
+ document = Corpus::Document.new(Rbbt.tmp.test.persist, "TEST:test_doc", text)
30
+ assert_equal Segment.sort(document.words), text.split(" ")
31
+ end
32
+
33
+ def test_words_multiple
34
+ document1 = Corpus::Document.new(Rbbt.tmp.test.persist, "TEST:test_doc:1", "This is a test document")
35
+ document2 = Corpus::Document.new(Rbbt.tmp.test.persist, "TEST:test_doc2:2", "This is a another test document")
36
+
37
+ docs = [document1, document2]
38
+
39
+ Corpus::Document.prepare_multiple(docs, :words2)
40
+
41
+ assert_equal document1.words2, document1.text.split(" ")
42
+ assert_equal document2.words2, document2.text.split(" ")
43
+
44
+ document1 = Corpus::Document.new(Rbbt.tmp.test.persist, "TEST:test_doc:1", "This is a test document")
45
+ document2 = Corpus::Document.new(Rbbt.tmp.test.persist, "TEST:test_doc2:2", "This is a another test document")
46
+
47
+ docs = [document1, document2]
48
+
49
+ Corpus::Document.prepare_multiple(docs, :words2)
50
+ end
51
+ end
52
+
@@ -1,6 +1,6 @@
1
1
  require File.join(File.expand_path(File.dirname(__FILE__)), '../../..', 'test_helper.rb')
2
- require 'rbbt/ner/segment'
3
- require 'rbbt/ner/segment/named_entity'
2
+ require 'rbbt/text/segment'
3
+ require 'rbbt/text/segment/named_entity'
4
4
 
5
5
  class TestClass < Test::Unit::TestCase
6
6
  def test_info
@@ -1,5 +1,5 @@
1
1
  require File.join(File.expand_path(File.dirname(__FILE__)), '../../..', 'test_helper.rb')
2
- require 'rbbt/ner/segment/segmented'
2
+ require 'rbbt/text/segment/segmented'
3
3
 
4
4
  class TestClass < Test::Unit::TestCase
5
5
  def test_split
@@ -1,6 +1,6 @@
1
1
  require File.join(File.expand_path(File.dirname(__FILE__)), '../../..', 'test_helper.rb')
2
- require 'rbbt/ner/segment/transformed'
3
- require 'rbbt/ner/segment/named_entity'
2
+ require 'rbbt/text/segment/transformed'
3
+ require 'rbbt/text/segment/named_entity'
4
4
  require 'rexml/document'
5
5
 
6
6
  class TestClass < Test::Unit::TestCase
@@ -98,7 +98,6 @@ More recently, PPAR activators were shown to inhibit the activation of inflammat
98
98
 
99
99
  assert_equal original, a
100
100
 
101
-
102
101
  assert_equal original, a
103
102
 
104
103
  exp1, exp2 = nil, nil
@@ -286,5 +285,99 @@ More recently, PPAR activators were shown to inhibit the activation of inflammat
286
285
  end
287
286
 
288
287
  end
288
+
289
+ def test_by_sentence
290
+ a = "This is a first sentences. ILF can bind to purine-rich regulatory motifs such as the human T-cell leukemia virus-long terminal region and the interleukin-2 promoter."
291
+
292
+ sentence_pos = a.index('.')+2
293
+ sentence = a[sentence_pos..-1]
294
+ Segment.setup sentence, sentence_pos
295
+
296
+ gene1 = "ILF"
297
+ gene1.extend NamedEntity
298
+ gene1.offset = a.index gene1
299
+ gene1.type = "Gene"
300
+
301
+ Transformed.with_transform(sentence, [gene1], "[G]") do
302
+ assert_equal sentence.sub("ILF", "[G]"), sentence
303
+ end
304
+ end
305
+
306
+ def test_collisions
307
+ text =<<-EOF.chomp
308
+ This is another sentence. Protein (nsp1), helicase (nsp13).
309
+ EOF
310
+
311
+ sentence_pos = text.index(".") + 2
312
+ sentence = Segment.setup(text[sentence_pos..-1], sentence_pos)
313
+
314
+ viral = %w(nsp1 nsp13)
315
+ human = %w(helicase)
316
+
317
+ viral = viral.collect do |e|
318
+ next unless text.index(e)
319
+ NamedEntity.setup(e, text.index(e), "VirGene")
320
+ end.compact
321
+
322
+ human = human.collect do |e|
323
+ next unless text.index(e)
324
+ NamedEntity.setup(e, text.index(e), "HumGene")
325
+ end
326
+
327
+ clean = human.reject{|s| s.overlaps(viral).any?}
328
+
329
+ Transformed.with_transform(sentence, viral, Proc.new{|e| "[VIRAL=#{e}]"}) do
330
+ assert_equal sentence, "Protein ([VIRAL=nsp1]), helicase ([VIRAL=nsp13])."
331
+ Transformed.with_transform(sentence, clean, Proc.new{|e| "[HUMAN=#{e}]"}) do
332
+ assert_equal sentence, "Protein ([VIRAL=nsp1]), [HUMAN=helicase] ([VIRAL=nsp13])."
333
+ end
334
+ end
335
+ end
336
+
337
+
338
+ def test_collisions2
339
+ text =<<-EOF.chomp
340
+ This is another sentence. Among the nonstructural proteins, the leader protein (nsp1), the papain-like protease (nsp3), the nsp4, the 3C-like protease (nsp5), the nsp7, the nsp8, the nsp9, the nsp10, the RNA-directed RNA polymerase (nsp12), the helicase (nsp13), the guanine-N7 methyltransferase (nsp14), the uridylate-specific endoribonuclease (nsp15), the 2'-O-methyltransferase (nsp16), and the ORF7a protein could be built on the basis of homology templates.
341
+ EOF
342
+
343
+ sentence_pos = text.index(".") + 2
344
+ sentence = Segment.setup(text[sentence_pos..-1], sentence_pos)
345
+
346
+ target = sentence.dup
347
+
348
+ viral = %w(nsp1 nsp4 nsp5 nsp7 nsp8 nsp9 nsp10 nsp12 nsp13 nsp14 nsp15 ORF7a spike)
349
+ human = %w(helicase nsp5 nsp4 nsp3)
350
+
351
+ viral = viral.collect do |e|
352
+ next unless text.index(e)
353
+ NamedEntity.setup(e, text.index(e), "VirGene")
354
+ end.compact
355
+
356
+ human = human.collect do |e|
357
+ next unless text.index(e)
358
+ NamedEntity.setup(e, text.index(e), "HumGene")
359
+ end
360
+
361
+ clean = human.reject{|s| s.overlaps(viral).any?}
362
+
363
+ tag = Misc.digest("TAG")
364
+
365
+ viral.each do |e|
366
+ target.gsub!(/\b#{e}\b/, "[VIRAL=#{e}-#{tag}]")
367
+ end
368
+
369
+ target_tmp = target.dup
370
+
371
+ clean.each do |e|
372
+ target.gsub!(/\b#{e}\b/, "[HUMAN=#{e}-#{tag}]")
373
+ end
374
+
375
+ Transformed.with_transform(sentence, viral, Proc.new{|e| "[VIRAL=#{e}-#{tag}]"}) do
376
+ assert_equal sentence, target_tmp
377
+ Transformed.with_transform(sentence, clean, Proc.new{|e| "[HUMAN=#{e}-#{tag}]"}) do
378
+ assert_equal sentence, target
379
+ end
380
+ end
381
+ end
289
382
  end
290
383
 
@@ -0,0 +1,34 @@
1
+ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
2
+ $LOAD_PATH.unshift(File.dirname(__FILE__))
3
+ require 'test/unit'
4
+ require 'rbbt-util'
5
+ require 'rbbt/text/corpus'
6
+
7
+ class Corpus::Document
8
+
9
+ define :words do
10
+ text.split(" ")
11
+ end
12
+ end
13
+
14
+ class TestClass < Test::Unit::TestCase
15
+ def test_document
16
+ Log.severity = 0
17
+ text = "This is a test document"
18
+
19
+ docid = nil
20
+ TmpFile.with_file do |dir|
21
+ corpus = Corpus.new dir
22
+ docid = corpus.add_document text, :TEST, :test_doc
23
+ document = corpus.docid(docid)
24
+ assert_equal text, document.text
25
+
26
+ corpus = Corpus.new dir
27
+ document = corpus.docid(docid)
28
+ assert_equal text, document.text
29
+ document = corpus.find(:TEST, :test_doc).first
30
+ assert_equal text, document.text
31
+ end
32
+ end
33
+ end
34
+
@@ -0,0 +1,58 @@
1
+ require File.join(File.expand_path(File.dirname(__FILE__)), '../..', 'test_helper.rb')
2
+ require 'rbbt/text/document'
3
+ require 'rbbt/text/corpus/sources/pmid'
4
+
5
+ class TestDocument < Test::Unit::TestCase
6
+ def setup
7
+ Log.severity = 0
8
+ Document.corpus = Corpus.new Rbbt.tmp.test.document_corpus
9
+
10
+ Corpus::Document.define :words do
11
+ words = self.text.split(" ")
12
+ Segment.align(self.text, words)
13
+ end
14
+
15
+ Corpus::Document.define :genes do
16
+ require 'rbbt/ner/banner'
17
+ Banner.new.match(self.text)
18
+ end
19
+
20
+ Corpus::Document.persist_in_global_tsv("genes")
21
+ Corpus::Document.persist_in_global_tsv(:words)
22
+ end
23
+
24
+ def test_title_and_text
25
+ document = Document.setup('PMID:32272262')
26
+
27
+ assert document.text.downcase.include?("covid")
28
+ assert_equal "High-resolution Chest CT Features and Clinical Characteristics of Patients Infected with COVID-19 in Jiangsu, China.", document.title
29
+ end
30
+
31
+ def test_full_text
32
+ document = Document.setup('PMID:4304705')
33
+ assert document.text.length < document.full_text.length
34
+ end
35
+
36
+ def test_words
37
+ document = Document.setup('PMID:32272262')
38
+ words = document.entities :words
39
+ assert words.first.respond_to?(:offset)
40
+ end
41
+
42
+ def test_genes
43
+ text = "This is a mention to TP53, a gene that should be found"
44
+ document = Document.setup(Document.corpus.add_document(text, "TEST"))
45
+ genes = document.entities :genes
46
+
47
+ assert_equal "TP53", genes.first
48
+ assert genes.first.respond_to?(:offset)
49
+
50
+ text = "This is a mention to TP53, a gene that should be found"
51
+ document = Document.setup(Document.corpus.add_document(text, "TEST"))
52
+ genes = document.entities :genes
53
+
54
+ assert_equal "TP53", genes.first
55
+ assert genes.first.respond_to?(:offset)
56
+ end
57
+ end
58
+
@@ -1,7 +1,7 @@
1
1
  require File.join(File.expand_path(File.dirname(__FILE__)), '../..', 'test_helper.rb')
2
- require 'rbbt/ner/segment'
2
+ require 'rbbt/text/segment'
3
3
 
4
- class TestClass < Test::Unit::TestCase
4
+ class TestSegment < Test::Unit::TestCase
5
5
  def test_info
6
6
  a = "test"
7
7
  a.extend Segment
@@ -6,7 +6,7 @@ require 'rbbt'
6
6
  require 'rbbt/persist'
7
7
  require 'rbbt/util/tmpfile'
8
8
  require 'rbbt/util/log'
9
- require 'rbbt/corpus/document_repo'
9
+ require 'rbbt/text/corpus'
10
10
 
11
11
  class Test::Unit::TestCase
12
12
  def get_test_datafile(file)
@@ -22,8 +22,8 @@ class Test::Unit::TestCase
22
22
  FileUtils.rm_rf Rbbt.tmp.test.find :user
23
23
  Persist::CONNECTIONS.values.each do |c| c.close end
24
24
  Persist::CONNECTIONS.clear
25
- DocumentRepo::TC_CONNECTIONS.values.each do |c| c.close end
26
- DocumentRepo::TC_CONNECTIONS.clear
25
+ Corpus::DocumentRepo::TC_CONNECTIONS.values.each do |c| c.close end
26
+ Corpus::DocumentRepo::TC_CONNECTIONS.clear
27
27
  end
28
28
 
29
29
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rbbt-text
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.1.8
4
+ version: 1.1.9
5
5
  platform: ruby
6
6
  authors:
7
7
  - Miguel Vazquez
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-01-31 00:00:00.000000000 Z
11
+ date: 2020-04-13 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rbbt-util
@@ -78,11 +78,6 @@ files:
78
78
  - lib/rbbt/bow/bow.rb
79
79
  - lib/rbbt/bow/dictionary.rb
80
80
  - lib/rbbt/bow/misc.rb
81
- - lib/rbbt/corpus/corpus.rb
82
- - lib/rbbt/corpus/document.rb
83
- - lib/rbbt/corpus/document_repo.rb
84
- - lib/rbbt/corpus/sources/pubmed.rb
85
- - lib/rbbt/entity/document.rb
86
81
  - lib/rbbt/ner/NER.rb
87
82
  - lib/rbbt/ner/abner.rb
88
83
  - lib/rbbt/ner/banner.rb
@@ -99,17 +94,22 @@ files:
99
94
  - lib/rbbt/ner/rnorm.rb
100
95
  - lib/rbbt/ner/rnorm/cue_index.rb
101
96
  - lib/rbbt/ner/rnorm/tokens.rb
102
- - lib/rbbt/ner/segment.rb
103
- - lib/rbbt/ner/segment/docid.rb
104
- - lib/rbbt/ner/segment/named_entity.rb
105
- - lib/rbbt/ner/segment/relationship.rb
106
- - lib/rbbt/ner/segment/segmented.rb
107
- - lib/rbbt/ner/segment/token.rb
108
- - lib/rbbt/ner/segment/transformed.rb
109
97
  - lib/rbbt/ner/token_trieNER.rb
110
98
  - lib/rbbt/nlp/genia/sentence_splitter.rb
111
99
  - lib/rbbt/nlp/nlp.rb
112
100
  - lib/rbbt/nlp/open_nlp/sentence_splitter.rb
101
+ - lib/rbbt/text/corpus.rb
102
+ - lib/rbbt/text/corpus/document.rb
103
+ - lib/rbbt/text/corpus/document_repo.rb
104
+ - lib/rbbt/text/corpus/sources/pmid.rb
105
+ - lib/rbbt/text/document.rb
106
+ - lib/rbbt/text/segment.rb
107
+ - lib/rbbt/text/segment/docid.rb
108
+ - lib/rbbt/text/segment/named_entity.rb
109
+ - lib/rbbt/text/segment/relationship.rb
110
+ - lib/rbbt/text/segment/segmented.rb
111
+ - lib/rbbt/text/segment/token.rb
112
+ - lib/rbbt/text/segment/transformed.rb
113
113
  - share/install/software/ABNER
114
114
  - share/install/software/BANNER
115
115
  - share/install/software/ChemicalTagger
@@ -129,10 +129,6 @@ files:
129
129
  - test/rbbt/bow/test_dictionary.rb
130
130
  - test/rbbt/bow/test_misc.rb
131
131
  - test/rbbt/entity/test_document.rb
132
- - test/rbbt/ner/segment/test_named_entity.rb
133
- - test/rbbt/ner/segment/test_relationship.rb
134
- - test/rbbt/ner/segment/test_segmented.rb
135
- - test/rbbt/ner/segment/test_transformed.rb
136
132
  - test/rbbt/ner/test_NER.rb
137
133
  - test/rbbt/ner/test_abner.rb
138
134
  - test/rbbt/ner/test_banner.rb
@@ -146,11 +142,19 @@ files:
146
142
  - test/rbbt/ner/test_patterns.rb
147
143
  - test/rbbt/ner/test_regexpNER.rb
148
144
  - test/rbbt/ner/test_rnorm.rb
149
- - test/rbbt/ner/test_segment.rb
150
145
  - test/rbbt/ner/test_token_trieNER.rb
151
146
  - test/rbbt/nlp/genia/test_sentence_splitter.rb
152
147
  - test/rbbt/nlp/open_nlp/test_sentence_splitter.rb
153
148
  - test/rbbt/nlp/test_nlp.rb
149
+ - test/rbbt/text/corpus/sources/test_pmid.rb
150
+ - test/rbbt/text/corpus/test_document.rb
151
+ - test/rbbt/text/segment/test_named_entity.rb
152
+ - test/rbbt/text/segment/test_relationship.rb
153
+ - test/rbbt/text/segment/test_segmented.rb
154
+ - test/rbbt/text/segment/test_transformed.rb
155
+ - test/rbbt/text/test_corpus.rb
156
+ - test/rbbt/text/test_document.rb
157
+ - test/rbbt/text/test_segment.rb
154
158
  - test/test_helper.rb
155
159
  homepage: http://github.com/mikisvaz/rbbt-util
156
160
  licenses: []
@@ -178,6 +182,15 @@ test_files:
178
182
  - test/rbbt/nlp/test_nlp.rb
179
183
  - test/rbbt/nlp/open_nlp/test_sentence_splitter.rb
180
184
  - test/rbbt/nlp/genia/test_sentence_splitter.rb
185
+ - test/rbbt/text/test_document.rb
186
+ - test/rbbt/text/corpus/sources/test_pmid.rb
187
+ - test/rbbt/text/corpus/test_document.rb
188
+ - test/rbbt/text/test_segment.rb
189
+ - test/rbbt/text/test_corpus.rb
190
+ - test/rbbt/text/segment/test_transformed.rb
191
+ - test/rbbt/text/segment/test_relationship.rb
192
+ - test/rbbt/text/segment/test_named_entity.rb
193
+ - test/rbbt/text/segment/test_segmented.rb
181
194
  - test/rbbt/bow/test_bow.rb
182
195
  - test/rbbt/bow/test_misc.rb
183
196
  - test/rbbt/bow/test_dictionary.rb
@@ -194,11 +207,6 @@ test_files:
194
207
  - test/rbbt/ner/test_banner.rb
195
208
  - test/rbbt/ner/test_token_trieNER.rb
196
209
  - test/rbbt/ner/test_finder.rb
197
- - test/rbbt/ner/test_segment.rb
198
210
  - test/rbbt/ner/test_linnaeus.rb
199
- - test/rbbt/ner/segment/test_transformed.rb
200
- - test/rbbt/ner/segment/test_relationship.rb
201
- - test/rbbt/ner/segment/test_named_entity.rb
202
- - test/rbbt/ner/segment/test_segmented.rb
203
211
  - test/rbbt/ner/test_oscar4.rb
204
212
  - test/test_helper.rb
@@ -1,266 +0,0 @@
1
- require 'rbbt/ner/segment'
2
- require 'rbbt/ner/segment/segmented'
3
- require 'rbbt/tsv'
4
- require 'rbbt/resource/path'
5
- require 'rbbt/persist/tsv'
6
- require 'rbbt/util/misc'
7
- require 'json'
8
-
9
- class Document
10
-
11
- attr_accessor :text, :docid, :namespace, :id, :type, :hash, :segments, :segment_indices, :persist_dir, :global_persistence
12
- def initialize(persist_dir = nil, docid = nil, text = nil, global_persistence = nil)
13
- @segments = {}
14
- @segment_indices = {}
15
-
16
- if not persist_dir.nil?
17
- @persist_dir = persist_dir
18
- @persist_dir = Path.setup(@persist_dir) if not Path == @persist_dir
19
- end
20
-
21
- @global_persistence = global_persistence
22
-
23
- if not docid.nil?
24
- @docid = docid
25
- update_docid
26
- end
27
- @text = text unless text.nil?
28
- end
29
-
30
- def update_docid
31
- @namespace, @id, @type, @hash = docid.split(":", -1)
32
- end
33
-
34
- def docid=(docid)
35
- @docid = docid
36
- update_docid
37
- end
38
-
39
- #{{{ PERSISTENCE
40
-
41
- TSV_REPOS = {}
42
- FIELDS_FOR_ENTITY_PERSISTENCE = {}
43
- def self.persist(entity, fields = nil)
44
-
45
- if not fields.nil?
46
- fields = [fields] if not Array === fields
47
- fields = fields.collect{|f| f.to_s}
48
- FIELDS_FOR_ENTITY_PERSISTENCE[entity.to_s] = fields
49
- end
50
-
51
- self.class_eval <<-EOC
52
- def load_with_persistence_#{entity}(raw = false)
53
- fields = FIELDS_FOR_ENTITY_PERSISTENCE["#{ entity }"]
54
-
55
- tsv_file = File.join(@persist_dir.find, "#{ entity }")
56
-
57
- return nil if raw == :check and File.exists? tsv_file
58
-
59
- annotations = Persist.persist("Entity[#{ entity }]", :tsv, :file => tsv_file) do
60
- segments = produce_#{entity}
61
- tsv = Segment.tsv(segments, fields)
62
- end
63
-
64
- return annotations if raw
65
-
66
- annotations.unnamed = true
67
- annotations.collect{|id, annotation|
68
- Segment.load_tsv_values(text, annotation, annotations.fields)
69
- }
70
- end
71
- EOC
72
- end
73
-
74
- def self.persist_in_tsv(entity, tsv = nil, fields = nil)
75
- if not tsv.nil? and not tsv.respond_to?(:keys)
76
- fields = tsv
77
- tsv = nil
78
- end
79
-
80
- TSV_REPOS[entity.to_s] = tsv
81
-
82
- if not fields.nil?
83
- fields = [fields] if not Array === fields
84
- fields = fields.collect{|f| f.to_s}
85
- FIELDS_FOR_ENTITY_PERSISTENCE[entity.to_s] = fields unless fields.nil?
86
- end
87
-
88
- self.class_eval <<-EOC
89
- def load_with_persistence_#{entity}(raw = false)
90
- repo = TSV_REPOS["#{ entity }"]
91
- if repo.nil?
92
- raise "No persistence file or persistence dir for persist_in_tsv" if persist_dir.nil?
93
- repo = Persist.open_tokyocabinet(persist_dir.annotations_by_type.find, true, :marshal_tsv)
94
- end
95
-
96
- fields = FIELDS_FOR_ENTITY_PERSISTENCE["#{ entity }"]
97
- if not repo.include? "#{ entity }"
98
- segments = produce_#{entity}
99
- repo.write
100
- repo["#{entity}"] = Segment.tsv(segments, fields)
101
- repo.read
102
- else
103
- if raw == :check
104
- repo.close
105
- return nil
106
- end
107
- end
108
-
109
-
110
- annotations = repo["#{entity}"]
111
-
112
- repo.close
113
-
114
-
115
- return annotations if raw
116
-
117
- annotations.unnamed = true
118
- annotations.collect{|id, annotation|
119
- Segment.load_tsv_values(text, annotation, annotations.fields)
120
- }
121
- end
122
- EOC
123
- end
124
-
125
- def self.persist_in_global_tsv(entity, tsv = nil, fields = nil, doc_field = nil, entity_field = nil)
126
- doc_field ||= "Document ID"
127
- entity_field ||= "Entity Type"
128
-
129
- TSV_REPOS[entity.to_s] = tsv
130
-
131
- if not fields.nil?
132
- fields = [fields] if not Array === fields
133
- fields = fields.collect{|f| f.to_s}
134
- else
135
- fields = nil
136
- end
137
-
138
- FIELDS_FOR_ENTITY_PERSISTENCE[entity.to_s] = fields
139
-
140
- self.class_eval <<-EOC
141
- def load_with_persistence_#{entity}(raw = false)
142
- fields = FIELDS_FOR_ENTITY_PERSISTENCE["#{ entity }"]
143
-
144
- data = TSV_REPOS["#{ entity }"] || @global_persistence
145
-
146
- data.read true
147
-
148
- fields = data.fields if fields.nil? and data.respond_to? :fields
149
-
150
-
151
- if data.respond_to? :persistence_path and String === data.persistence_path
152
- data.filter(data.persistence_path + '.filters')
153
- end
154
- data.add_filter("field:#{ doc_field }", @docid)
155
- data.add_filter("field:#{ entity_field }", "#{ entity }")
156
- keys = data.keys
157
- data.pop_filter
158
- data.pop_filter
159
-
160
- if keys.empty?
161
- segments = produce_#{entity}
162
- segments << Segment.setup("No #{entity} found in document " + @docid.to_s, -1) if segments.empty?
163
- tsv = Segment.tsv(segments, *fields.reject{|f| ["#{doc_field}", "#{entity_field}", "Start", "End", "annotation_types"].include? f})
164
-
165
- tsv.add_field "#{ doc_field }" do
166
- @docid
167
- end
168
-
169
- tsv.add_field "#{ entity_field }" do
170
- "#{ entity }"
171
- end
172
-
173
- data.add_filter("field:#{ doc_field }", @docid)
174
- data.add_filter("field:#{ entity_field }", "#{ entity }")
175
- data.write true
176
- keys = tsv.collect do |key, value|
177
- data[key] = value
178
- key
179
- end
180
- data.pop_filter
181
- data.pop_filter
182
- data.read
183
-
184
- else
185
- if raw == :check
186
- data.close
187
- return nil
188
- end
189
- end
190
-
191
- return data.values if raw
192
-
193
- start_pos = data.identify_field "Start"
194
- segments = data.values_at(*keys).collect{|annotation|
195
- pos = annotation[start_pos]
196
- Segment.load_tsv_values(text, annotation, data.fields) unless [-1, "-1", [-1], ["-1"]].include? pos
197
- }.compact
198
- data.close
199
-
200
- segments
201
- end
202
- EOC
203
- end
204
-
205
-
206
- def self.define(entity, &block)
207
- send :define_method, "produce_#{entity}", &block
208
-
209
- self.class_eval <<-EOC
210
- def load_#{entity}(raw = false)
211
- return if segments.include? "#{ entity }"
212
- if self.respond_to?("load_with_persistence_#{entity}") and not @persist_dir.nil?
213
- segments["#{entity}"] = load_with_persistence_#{entity}(raw)
214
- else
215
- segments["#{ entity }"] = produce_#{entity}
216
- end
217
- end
218
-
219
- def #{entity}(raw = false)
220
- begin
221
- entities = segments["#{ entity }"]
222
- if entities.nil?
223
- load_#{entity}(raw)
224
- entities = segments["#{ entity }"]
225
- end
226
- end
227
-
228
- entities
229
- end
230
-
231
- def #{entity}_at(pos, persist = false)
232
- segment_index("#{ entity }", persist ? File.join(@persist_dir, 'ranges') : nil)[pos]
233
- end
234
-
235
- EOC
236
- end
237
-
238
- def segment_index(name, persist_dir = nil)
239
- @segment_indices[name] ||= Segment.index(self.send(name), persist_dir.nil? ? :memory : File.join(persist_dir, name + '.range'))
240
- end
241
-
242
- def load_into(segment, *annotations)
243
- options = annotations.pop if Hash === annotations.last
244
- options ||= {}
245
-
246
- if options[:persist] and not @persist_dir.nil?
247
- persist_dir = File.join(@persist_dir, 'ranges')
248
- else
249
- persist_dir = nil
250
- end
251
-
252
- Segmented.setup(segment, {})
253
- annotations.collect do |name|
254
- name = name.to_s
255
- index = segment_index(name, persist_dir)
256
- annotations = index[segment.range]
257
- segment.segments[name] ||= {}
258
- segment.segments[name] = annotations
259
- class << segment
260
- self
261
- end.class_eval "def #{ name }; @segments['#{ name }']; end"
262
- end
263
-
264
- segment
265
- end
266
- end