rbbt-text 1.1.8 → 1.1.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. checksums.yaml +4 -4
  2. data/lib/rbbt/ner/NER.rb +3 -3
  3. data/lib/rbbt/ner/abner.rb +3 -3
  4. data/lib/rbbt/ner/banner.rb +1 -1
  5. data/lib/rbbt/ner/brat.rb +2 -2
  6. data/lib/rbbt/ner/chemical_tagger.rb +1 -1
  7. data/lib/rbbt/ner/linnaeus.rb +1 -1
  8. data/lib/rbbt/ner/ngram_prefix_dictionary.rb +2 -2
  9. data/lib/rbbt/ner/oscar3.rb +1 -1
  10. data/lib/rbbt/ner/oscar4.rb +1 -1
  11. data/lib/rbbt/ner/patterns.rb +4 -4
  12. data/lib/rbbt/ner/regexpNER.rb +1 -1
  13. data/lib/rbbt/ner/token_trieNER.rb +2 -2
  14. data/lib/rbbt/nlp/genia/sentence_splitter.rb +1 -1
  15. data/lib/rbbt/nlp/nlp.rb +2 -2
  16. data/lib/rbbt/nlp/open_nlp/sentence_splitter.rb +1 -1
  17. data/lib/rbbt/{corpus → text}/corpus.rb +51 -11
  18. data/lib/rbbt/text/corpus/document.rb +361 -0
  19. data/lib/rbbt/text/corpus/document_repo.rb +68 -0
  20. data/lib/rbbt/text/corpus/sources/pmid.rb +34 -0
  21. data/lib/rbbt/text/document.rb +39 -0
  22. data/lib/rbbt/{ner → text}/segment.rb +11 -6
  23. data/lib/rbbt/{ner → text}/segment/docid.rb +1 -1
  24. data/lib/rbbt/{ner → text}/segment/named_entity.rb +2 -2
  25. data/lib/rbbt/{ner → text}/segment/relationship.rb +1 -1
  26. data/lib/rbbt/{ner → text}/segment/segmented.rb +1 -1
  27. data/lib/rbbt/{ner → text}/segment/token.rb +1 -1
  28. data/lib/rbbt/{ner → text}/segment/transformed.rb +47 -42
  29. data/test/rbbt/entity/test_document.rb +1 -0
  30. data/test/rbbt/ner/test_abner.rb +1 -0
  31. data/test/rbbt/ner/test_linnaeus.rb +1 -0
  32. data/test/rbbt/nlp/open_nlp/test_sentence_splitter.rb +0 -1
  33. data/test/rbbt/text/corpus/sources/test_pmid.rb +33 -0
  34. data/test/rbbt/text/corpus/test_document.rb +52 -0
  35. data/test/rbbt/{ner → text}/segment/test_named_entity.rb +2 -2
  36. data/test/rbbt/{ner → text}/segment/test_relationship.rb +0 -0
  37. data/test/rbbt/{ner → text}/segment/test_segmented.rb +1 -1
  38. data/test/rbbt/{ner → text}/segment/test_transformed.rb +96 -3
  39. data/test/rbbt/text/test_corpus.rb +34 -0
  40. data/test/rbbt/text/test_document.rb +58 -0
  41. data/test/rbbt/{ner → text}/test_segment.rb +2 -2
  42. data/test/test_helper.rb +3 -3
  43. metadata +32 -24
  44. data/lib/rbbt/corpus/document.rb +0 -266
  45. data/lib/rbbt/corpus/document_repo.rb +0 -137
  46. data/lib/rbbt/corpus/sources/pubmed.rb +0 -27
  47. data/lib/rbbt/entity/document.rb +0 -75
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: ea1646b5f32644bb5872f57422534b49955f988df26df4a65c8dda592515eac3
4
- data.tar.gz: 3f6bc60546b79c76b6b35840712453616c377fcc088f321e95847f116776bef1
3
+ metadata.gz: 77391b4691e4ea2a6e5da918bc40820bae8175ff1d82f9c96a1685986605dfd7
4
+ data.tar.gz: a83dd9236502d1787f1040fb4c60a6160086515713282283e434b589c1425743
5
5
  SHA512:
6
- metadata.gz: 9376c68bad67733b5771b57ead7c962d45ff29c44362d1c51bf3480d3c3bf9f1f75284e40044fc4ed95bd94a03ab0759b3b7320bf1e3da00a0cdd82255c9395c
7
- data.tar.gz: cd25a9cd91fde366be195801d45238d555edfc94f2b06391db7db2d9f4781b34dd599514385782d6c7e22af2841c5f3322ba74bf0a3a9c1fdbe308a255f00098
6
+ metadata.gz: f69d7eb10741d2b3c7735e8e29f29625567775647d16d0261b42cce108d2f8309a2e938dad3360842a964a9c5d4fd5a2197c72618ab40971f7a65306e9c6936a
7
+ data.tar.gz: dec802a15cfc7c8c9a90ee8ec0c83af88c881ee16e071776a995554aa0661603bdd6cb7bf30162c43beccf1a423a2e8d26afc15f92544ccc08284a87a038a1b2
@@ -1,6 +1,6 @@
1
- require 'rbbt/ner/segment'
2
- require 'rbbt/ner/segment/named_entity'
3
- require 'rbbt/ner/segment/segmented'
1
+ require 'rbbt/text/segment'
2
+ require 'rbbt/text/segment/named_entity'
3
+ require 'rbbt/text/segment/segmented'
4
4
 
5
5
  class NER
6
6
  def entities(text, protect = false, *args)
@@ -1,7 +1,7 @@
1
1
  require 'rbbt'
2
2
  require 'rjb'
3
- require 'rbbt/ner/segment'
4
3
  require 'rbbt/resource'
4
+ require 'rbbt/text/segment'
5
5
  require 'rbbt/ner/NER'
6
6
 
7
7
  # Offers a Ruby interface to the Abner Named Entity Recognition Package
@@ -31,10 +31,10 @@ class Abner < NER
31
31
  # Given a chunk of text, it finds all the mentions appearing in it. It
32
32
  # returns all the mentions found, regardless of type, to be coherent
33
33
  # with the rest of NER packages in Rbbt.
34
- def match(text)
34
+ def match(text, fix_encode = true)
35
35
  return [] if text.nil? or text.empty?
36
36
 
37
- text = text.encode('utf-8', 'binary', :invalid => :replace, :undef => :replace, :replace => '')
37
+ text = text.encode('utf-8', 'binary', :invalid => :replace, :undef => :replace, :replace => '') if fix_encode
38
38
  res = @tagger.getEntities(text)
39
39
  types = res[1]
40
40
  strings = res[0]
@@ -1,7 +1,7 @@
1
1
  require 'rbbt'
2
2
  require 'rjb'
3
- require 'rbbt/ner/segment'
4
3
  require 'rbbt/ner/NER'
4
+ require 'rbbt/text/segment'
5
5
 
6
6
  # Offers a Ruby interface to the Banner Named Entity Recognition Package
7
7
  # in Java. Banner[http://banner.sourceforge.net/].
@@ -1,5 +1,5 @@
1
- require 'rbbt/ner/segment/named_entity'
2
- require 'rbbt/ner/segment/relationship'
1
+ require 'rbbt/text/segment/named_entity'
2
+ require 'rbbt/text/segment/relationship'
3
3
  module Brat
4
4
  Rbbt.claim Rbbt.software.opt.Brat, :install, "https://github.com/nlplab/brat.git"
5
5
 
@@ -1,6 +1,6 @@
1
1
  require 'rbbt'
2
2
  require 'rjb'
3
- require 'rbbt/ner/segment'
3
+ require 'rbbt/text/segment'
4
4
  require 'rbbt/ner/NER'
5
5
  require 'rbbt/util/log'
6
6
 
@@ -1,6 +1,6 @@
1
1
  require 'rjb'
2
2
  require 'rbbt'
3
- require 'rbbt/ner/segment/named_entity'
3
+ require 'rbbt/text/segment/named_entity'
4
4
 
5
5
  module Linnaeus
6
6
 
@@ -1,8 +1,8 @@
1
1
  require 'rbbt'
2
2
  require 'rbbt/util/misc'
3
3
  require 'rbbt/tsv'
4
- require 'rbbt/ner/segment'
5
- require 'rbbt/ner/segment/token'
4
+ require 'rbbt/text/segment'
5
+ require 'rbbt/text/segment/token'
6
6
  require 'rbbt/ner/NER'
7
7
  require 'inline'
8
8
 
@@ -1,7 +1,7 @@
1
1
  require 'rbbt'
2
2
  require 'rjb'
3
3
  require 'libxml'
4
- require 'rbbt/ner/segment'
4
+ require 'rbbt/text/segment'
5
5
  require 'rbbt/ner/NER'
6
6
  require 'rbbt/util/log'
7
7
 
@@ -1,7 +1,7 @@
1
1
  require 'rbbt'
2
2
  require 'rjb'
3
3
  require 'libxml'
4
- require 'rbbt/ner/segment'
4
+ require 'rbbt/text/segment'
5
5
  require 'rbbt/ner/NER'
6
6
  require 'rbbt/util/log'
7
7
 
@@ -1,7 +1,7 @@
1
- require 'rbbt/ner/segment/named_entity'
2
- require 'rbbt/ner/segment/segmented'
3
- require 'rbbt/ner/segment/transformed'
4
- require 'rbbt/ner/segment/relationship'
1
+ require 'rbbt/text/segment/named_entity'
2
+ require 'rbbt/text/segment/segmented'
3
+ require 'rbbt/text/segment/transformed'
4
+ require 'rbbt/text/segment/relationship'
5
5
  require 'rbbt/ner/regexpNER'
6
6
  require 'rbbt/ner/token_trieNER'
7
7
  require 'rbbt/nlp/nlp'
@@ -1,4 +1,4 @@
1
- require 'rbbt/ner/segment'
1
+ require 'rbbt/text/segment'
2
2
  require 'rbbt/ner/NER'
3
3
  require 'rbbt/util/simpleDSL'
4
4
 
@@ -1,7 +1,7 @@
1
1
  require 'rbbt'
2
2
  require 'rbbt/tsv'
3
- require 'rbbt/ner/segment'
4
- require 'rbbt/ner/segment/token'
3
+ require 'rbbt/text/segment'
4
+ require 'rbbt/text/segment/token'
5
5
  require 'rbbt/ner/NER'
6
6
 
7
7
  class TokenTrieNER < NER
@@ -1,5 +1,5 @@
1
1
  require 'rbbt/nlp/nlp'
2
- require 'rbbt/ner/segment'
2
+ require 'rbbt/text/segment'
3
3
  module NLP
4
4
  Rbbt.claim Rbbt.software.opt.Geniass, :install, Rbbt.share.install.software.Geniass.find
5
5
 
@@ -2,8 +2,8 @@ require 'rbbt'
2
2
  require 'rbbt/util/tmpfile'
3
3
  require 'rbbt/persist'
4
4
  require 'rbbt/resource'
5
- require 'rbbt/ner/segment'
6
- require 'rbbt/ner/segment/segmented'
5
+ require 'rbbt/text/segment'
6
+ require 'rbbt/text/segment/segmented'
7
7
  require 'rbbt/nlp/genia/sentence_splitter'
8
8
  require 'digest/md5'
9
9
 
@@ -1,6 +1,6 @@
1
1
  require 'rbbt'
2
2
  require 'rjb'
3
- require 'rbbt/ner/segment'
3
+ require 'rbbt/text/segment'
4
4
  require 'rbbt/resource'
5
5
 
6
6
  module OpenNLP
@@ -1,8 +1,17 @@
1
- require 'rbbt/corpus/document'
2
- require 'rbbt/corpus/document_repo'
1
+ require 'rbbt/text/corpus/document'
2
+ require 'rbbt/text/corpus/document_repo'
3
3
 
4
4
  class Corpus
5
+ class << self
6
+ attr_accessor :claims
7
+ def claim(namespace, &block)
8
+ @@claims = {}
9
+ @@claims[namespace] = block
10
+ end
11
+
12
+ end
5
13
  attr_accessor :corpora_path, :document_repo, :persistence_dir, :global_annotations
14
+
6
15
  def initialize(corpora_path = nil)
7
16
  @corpora_path = case
8
17
  when corpora_path.nil?
@@ -24,6 +33,7 @@ class Corpus
24
33
 
25
34
  Misc.lock(@corpora_path.document_repo) do
26
35
  @document_repo = DocumentRepo.open_tokyocabinet @corpora_path.document_repo, false
36
+ @document_repo.close
27
37
  end
28
38
 
29
39
  end
@@ -32,35 +42,65 @@ class Corpus
32
42
  File.join(persistence_dir, docid)
33
43
  end
34
44
 
45
+
46
+ def docid(docid)
47
+ begin
48
+ if @document_repo.include?(docid)
49
+ Document.new(persistence_for(docid), docid, @document_repo[docid], @global_annotations, self)
50
+ else
51
+ namespace, id, type = docid.split(":")
52
+ if @@claims.include?(namespace)
53
+
54
+ docid = self.instance_exec id, type, &(@@claims[namespace])
55
+ docid = docid.first if Array === docid
56
+ self.docid(docid)
57
+ else
58
+ raise "Document '#{ docid }' was not found." unless @document_repo.include? docid
59
+ end
60
+ end
61
+ ensure
62
+ @document_repo.close
63
+ end
64
+ end
65
+
35
66
  def document(namespace, id, type, hash)
36
67
  docid = [namespace, id, type, hash] * ":"
37
- raise "Document '#{ docid }' was not found." unless @document_repo.include? docid
38
- Document.new(persistence_for(docid), docid, @document_repo[docid], @global_annotations)
68
+ self.docid(docid)
39
69
  end
40
70
 
41
- def docid(docid)
42
- raise "Document '#{ docid }' was not found." unless @document_repo.include? docid
43
- Document.new(persistence_for(docid), docid, @document_repo[docid], @global_annotations)
71
+ def add_document(text, namespace = nil, id = nil, type = nil)
72
+ text = Misc.fixutf8(text)
73
+ hash = Digest::MD5.hexdigest(text)
74
+ @document_repo.add(text, namespace, id, type, hash)
44
75
  end
45
76
 
46
- def add_document(text, namespace, id, type = nil)
47
- hash = Digest::MD5.hexdigest(text)
77
+ def add_docid(text, docid)
78
+ namespace, id, type, hash = docid.split(":")
48
79
  @document_repo.add(text, namespace, id, type, hash)
49
80
  end
50
81
 
82
+
51
83
  def find(namespace=nil, id = nil, type = nil, hash = nil)
52
84
  @document_repo.find(namespace, id, type, hash).collect{|docid|
53
- Document.new(persistence_for(docid), docid, @document_repo[docid], @global_annotations)
85
+ self.docid(docid)
54
86
  }
55
87
  end
56
88
 
57
89
  def find_docid(docid)
58
90
  @document_repo.find_docid(docid).collect{|docid|
59
- Document.new(persistence_for(docid), docid, @document_repo[docid], @global_annotations)
91
+ self.docid(docid)
60
92
  }
61
93
  end
62
94
 
63
95
  def exists?(namespace=nil, id = nil, type = nil, hash = nil)
64
96
  find(namespace, id, type, hash).any?
65
97
  end
98
+
99
+ def [](docid)
100
+ self.docid(docid)
101
+ end
102
+
103
+ def include?(id)
104
+ @document_repo.include? id
105
+ end
66
106
  end
@@ -0,0 +1,361 @@
1
+ require 'rbbt/text/segment'
2
+ require 'rbbt/text/segment/segmented'
3
+ require 'rbbt/tsv'
4
+ require 'rbbt/resource/path'
5
+ require 'rbbt/persist/tsv'
6
+ require 'rbbt/util/misc'
7
+ require 'rbbt/text/document'
8
+ require 'json'
9
+
10
+ class Corpus
11
+ class Document
12
+
13
+ class MultipleEntity < Exception; end
14
+
15
+ attr_accessor :text, :docid, :namespace, :id, :type, :hash, :segments, :segment_indices, :persist_dir, :global_persistence, :corpus
16
+
17
+ attr_accessor :multiple_result
18
+ def initialize(persist_dir = nil, docid = nil, text = nil, global_persistence = nil, corpus = nil)
19
+ @segments = {}
20
+ @segment_indices = {}
21
+ @corpus = corpus
22
+
23
+ if not persist_dir.nil?
24
+ @persist_dir = persist_dir
25
+ @persist_dir = Path.setup(@persist_dir) if not Path == @persist_dir
26
+ end
27
+
28
+ @global_persistence = global_persistence
29
+
30
+ if not docid.nil?
31
+ @docid = docid
32
+ update_docid
33
+ end
34
+ @text = text unless text.nil?
35
+ end
36
+
37
+ def update_docid
38
+ @namespace, @id, @type, @hash = docid.split(":", -1)
39
+ end
40
+
41
+ def docid=(docid)
42
+ @docid = docid
43
+ update_docid
44
+ end
45
+
46
+ def self.define(entity, &block)
47
+ send :define_method, "produce_#{entity}", &block
48
+
49
+ self.class_eval <<-EOC, __FILE__, __LINE__
50
+ def load_#{entity}(raw = false)
51
+ return if segments.include? "#{ entity }"
52
+ if self.respond_to?("load_with_persistence_#{entity}") and not @persist_dir.nil?
53
+ segments["#{entity}"] = load_with_persistence_#{entity}(raw)
54
+ else
55
+ segments["#{ entity }"] = produce_#{entity}
56
+ end
57
+ end
58
+
59
+ def #{entity}(raw = false)
60
+ begin
61
+ entities = segments["#{ entity }"]
62
+ if entities.nil?
63
+ load_#{entity}(raw)
64
+ entities = segments["#{ entity }"]
65
+ end
66
+ end
67
+
68
+ entities
69
+ end
70
+
71
+ def #{entity}_at(pos, persist = false)
72
+ segment_index("#{ entity }", persist ? File.join(@persist_dir, 'ranges') : nil)[pos]
73
+ end
74
+
75
+ EOC
76
+ end
77
+
78
+ def self.define_multiple(entity, &block)
79
+ send :define_method, "produce_#{entity}" do
80
+ return self.multiple_result[entity] if self.multiple_result && self.multiple_result[entity]
81
+ raise MultipleEntity, "Entity #{entity} runs with multiple documents, please prepare beforehand with prepare_multiple: #{self.docid}"
82
+ end
83
+
84
+ name = "multiple_produce_#{entity}"
85
+ class << self
86
+ self
87
+ end.send :define_method, name, &block
88
+
89
+ self.class_eval <<-EOC, __FILE__, __LINE__
90
+ def load_#{entity}(raw = false)
91
+ return if segments.include? "#{ entity }"
92
+ if self.respond_to?("load_with_persistence_#{entity}") and not @persist_dir.nil?
93
+ segments["#{entity}"] = load_with_persistence_#{entity}(raw)
94
+ else
95
+ segments["#{ entity }"] = produce_#{entity}
96
+ end
97
+ end
98
+
99
+ def #{entity}(raw = false)
100
+ begin
101
+ entities = segments["#{ entity }"]
102
+ if entities.nil?
103
+ load_#{entity}(raw)
104
+ entities = segments["#{ entity }"]
105
+ end
106
+ end
107
+
108
+ entities
109
+ end
110
+
111
+ def #{entity}_at(pos, persist = false)
112
+ segment_index("#{ entity }", persist ? File.join(@persist_dir, 'ranges') : nil)[pos]
113
+ end
114
+
115
+ EOC
116
+ end
117
+
118
+ def self.prepare_multiple(docs, entity)
119
+ missing = []
120
+ docs.each do |doc|
121
+ begin
122
+ doc.send(entity)
123
+ rescue MultipleEntity
124
+ missing << doc
125
+ end
126
+ end
127
+ res = self.send("multiple_produce_#{entity.to_s}", missing)
128
+ case res
129
+ when Array
130
+ res.each_with_index do |res,i|
131
+ missing[i].multiple_result ||= {}
132
+ missing[i].multiple_result[entity] = res
133
+ end
134
+ when Hash
135
+ res.each do |document,res|
136
+ case document
137
+ when Corpus::Document
138
+ document.multiple_result[entity] = res
139
+ when String
140
+ document = missing.select{|d| d.docid == document}.first
141
+ document.multiple_result[entity] = res
142
+ end
143
+ end
144
+ end
145
+ missing.each{|doc| doc.send entity }
146
+ end
147
+
148
+
149
+ #{{{ PERSISTENCE
150
+
151
+ TSV_REPOS = {}
152
+ FIELDS_FOR_ENTITY_PERSISTENCE = {}
153
+ def self.persist(entity, fields = nil)
154
+
155
+ if not fields.nil?
156
+ fields = [fields] if not Array === fields
157
+ fields = fields.collect{|f| f.to_s}
158
+ FIELDS_FOR_ENTITY_PERSISTENCE[entity.to_s] = fields
159
+ end
160
+
161
+ self.class_eval <<-EOC, __FILE__, __LINE__
162
+ def load_with_persistence_#{entity}(raw = false)
163
+ fields = FIELDS_FOR_ENTITY_PERSISTENCE["#{ entity }"]
164
+
165
+ tsv_file = File.join(@persist_dir.find, "#{ entity }")
166
+
167
+ return nil if raw == :check and File.exists? tsv_file
168
+
169
+ annotations = Persist.persist("Entity[#{ entity }]", :tsv, :file => tsv_file) do
170
+ segments = produce_#{entity}
171
+ tsv = Segment.tsv(segments, fields)
172
+ end
173
+
174
+ return annotations if raw
175
+
176
+ annotations.unnamed = true
177
+ annotations.collect{|id, annotation|
178
+ Segment.load_tsv_values(text, annotation, annotations.fields)
179
+ }
180
+ end
181
+ EOC
182
+ end
183
+
184
+ def self.persist_in_tsv(entity, tsv = nil, fields = nil)
185
+ tsv = TSV.setup(Persist.open_tokyocabinet(tsv, false, :list), :key => "ID", :fields => ["Start", "End", "JSON", "Document ID", "Entity Type"]).tap{|t| t.unnamed = true, t.close} if Path === tsv
186
+
187
+ if ! tsv.nil? && ! tsv.respond_to?(:keys)
188
+ fields = tsv
189
+ tsv = nil
190
+ end
191
+
192
+ TSV_REPOS[entity.to_s] = tsv
193
+
194
+ if ! fields.nil?
195
+ fields = [fields] if not Array === fields
196
+ fields = fields.collect{|f| f.to_s}
197
+ FIELDS_FOR_ENTITY_PERSISTENCE[entity.to_s] = fields unless fields.nil?
198
+ end
199
+
200
+ self.class_eval <<-EOC, __FILE__, __LINE__
201
+ def load_with_persistence_#{entity}(raw = false)
202
+ repo = TSV_REPOS["#{ entity }"]
203
+ if repo.nil?
204
+ raise "No persistence file or persistence dir for persist_in_tsv" if persist_dir.nil?
205
+ repo = Persist.open_tokyocabinet(persist_dir.annotations_by_type.find, true, :marshal_tsv)
206
+ end
207
+
208
+ fields = FIELDS_FOR_ENTITY_PERSISTENCE["#{ entity }"]
209
+ begin
210
+ if ! repo.include?("#{ entity }")
211
+ segments = produce_#{entity}
212
+ repo.write_and_read do
213
+ repo["#{entity}"] = Segment.tsv(segments, fields) if segments.any?
214
+ end
215
+ else
216
+ if raw == :check
217
+ repo.close
218
+ return nil
219
+ end
220
+ end
221
+
222
+ annotations = repo["#{entity}"]
223
+
224
+ repo.close
225
+
226
+ return annotations if raw
227
+
228
+ annotations.unnamed = true
229
+ annotations.collect{|id, annotation|
230
+ Segment.load_tsv_values(text, annotation, annotations.fields)
231
+ }
232
+ ensure
233
+ repo.close
234
+ end
235
+ end
236
+ EOC
237
+ end
238
+
239
+ def self.persist_in_global_tsv(entity, tsv = nil, fields = nil, doc_field = nil, entity_field = nil)
240
+ tsv = TSV.setup(Persist.open_tokyocabinet(tsv, false, :list), :key => "ID", :fields => (fields || ["Start", "End", "JSON", "Document ID", "Entity Type"])).tap{|t| t.unnamed = true, t.close} if Path === tsv
241
+
242
+ doc_field ||= "Document ID"
243
+ entity_field ||= "Entity Type"
244
+
245
+ TSV_REPOS[entity.to_s] = tsv
246
+
247
+ if not fields.nil?
248
+ fields = [fields] if not Array === fields
249
+ fields = fields.collect{|f| f.to_s}
250
+ else
251
+ fields = nil
252
+ end
253
+
254
+ FIELDS_FOR_ENTITY_PERSISTENCE[entity.to_s] = fields
255
+
256
+ self.class_eval <<-EOC, __FILE__, __LINE__
257
+ def load_with_persistence_#{entity}(raw = false)
258
+ fields = FIELDS_FOR_ENTITY_PERSISTENCE["#{ entity }"]
259
+
260
+ data = TSV_REPOS["#{ entity }"] || @global_persistence
261
+
262
+ begin
263
+
264
+ data.read true
265
+
266
+ fields = data.fields if fields.nil? and data.respond_to? :fields
267
+
268
+
269
+ if data.respond_to? :persistence_path and String === data.persistence_path
270
+ data.filter(data.persistence_path + '.filters')
271
+ end
272
+
273
+ data.add_filter("field:#{ doc_field }", @docid) if data.fields.include?("#{doc_field}")
274
+ data.add_filter("field:#{ entity_field }", "#{ entity }") if data.fields.include?("#{entity_field}")
275
+ keys = data.keys
276
+ data.pop_filter if data.fields.include?("#{entity_field}")
277
+ data.pop_filter if data.fields.include?("#{doc_field}")
278
+
279
+ if keys.empty?
280
+ segments = produce_#{entity}
281
+ segments << Segment.setup("No #{entity} found in document " + @docid.to_s, -1) if segments.empty?
282
+ tsv = Segment.tsv(segments, *fields.reject{|f| ["#{doc_field}", "#{entity_field}", "Start", "End", "annotation_types"].include? f})
283
+
284
+ tsv.add_field "#{ doc_field }" do
285
+ @docid
286
+ end
287
+
288
+ tsv.add_field "#{ entity_field }" do
289
+ "#{ entity }"
290
+ end
291
+
292
+ data.add_filter("field:#{ doc_field }", @docid) if data.fields.include?("#{doc_field}")
293
+ data.add_filter("field:#{ entity_field }", "#{ entity }") if data.fields.include?("#{entity_field}")
294
+ data.write true
295
+ keys = tsv.collect do |key, value|
296
+ data[key] = value
297
+ key
298
+ end
299
+ data.pop_filter if data.fields.include?("#{entity_field}")
300
+ data.pop_filter if data.fields.include?("#{doc_field}")
301
+ data.read
302
+
303
+ else
304
+ if raw == :check
305
+ data.close
306
+ return nil
307
+ end
308
+ end
309
+
310
+ return data.values if raw
311
+
312
+ start_pos = data.identify_field "Start"
313
+ segments = data.values_at(*keys).collect{|annotation|
314
+ pos = annotation[start_pos]
315
+ Segment.load_tsv_values(text, annotation, data.fields) unless [-1, "-1", [-1], ["-1"]].include? pos
316
+ }.compact
317
+ data.close
318
+
319
+ segments
320
+ ensure
321
+ data.close
322
+ end
323
+
324
+ end
325
+ EOC
326
+ end
327
+
328
+ def segment_index(name, persist_dir = nil)
329
+ @segment_indices[name] ||= Segment.index(self.send(name), persist_dir.nil? ? :memory : File.join(persist_dir, name + '.range'))
330
+ end
331
+
332
+ def load_into(segment, *annotations)
333
+ options = annotations.pop if Hash === annotations.last
334
+ options ||= {}
335
+
336
+ if options[:persist] and not @persist_dir.nil?
337
+ persist_dir = File.join(@persist_dir, 'ranges')
338
+ else
339
+ persist_dir = nil
340
+ end
341
+
342
+ Segmented.setup(segment, {})
343
+ annotations.collect do |name|
344
+ name = name.to_s
345
+ index = segment_index(name, persist_dir)
346
+ annotations = index[segment.range]
347
+ segment.segments[name] ||= {}
348
+ segment.segments[name] = annotations
349
+ class << segment
350
+ self
351
+ end.class_eval "def #{ name }; @segments['#{ name }']; end", __FILE__, __LINE__
352
+ end
353
+
354
+ segment
355
+ end
356
+
357
+ def entity
358
+ Object::Document.setup(self.docid, corpus)
359
+ end
360
+ end
361
+ end