rbbt-text 1.1.8 → 1.1.9

Files changed (47)
  1. checksums.yaml +4 -4
  2. data/lib/rbbt/ner/NER.rb +3 -3
  3. data/lib/rbbt/ner/abner.rb +3 -3
  4. data/lib/rbbt/ner/banner.rb +1 -1
  5. data/lib/rbbt/ner/brat.rb +2 -2
  6. data/lib/rbbt/ner/chemical_tagger.rb +1 -1
  7. data/lib/rbbt/ner/linnaeus.rb +1 -1
  8. data/lib/rbbt/ner/ngram_prefix_dictionary.rb +2 -2
  9. data/lib/rbbt/ner/oscar3.rb +1 -1
  10. data/lib/rbbt/ner/oscar4.rb +1 -1
  11. data/lib/rbbt/ner/patterns.rb +4 -4
  12. data/lib/rbbt/ner/regexpNER.rb +1 -1
  13. data/lib/rbbt/ner/token_trieNER.rb +2 -2
  14. data/lib/rbbt/nlp/genia/sentence_splitter.rb +1 -1
  15. data/lib/rbbt/nlp/nlp.rb +2 -2
  16. data/lib/rbbt/nlp/open_nlp/sentence_splitter.rb +1 -1
  17. data/lib/rbbt/{corpus → text}/corpus.rb +51 -11
  18. data/lib/rbbt/text/corpus/document.rb +361 -0
  19. data/lib/rbbt/text/corpus/document_repo.rb +68 -0
  20. data/lib/rbbt/text/corpus/sources/pmid.rb +34 -0
  21. data/lib/rbbt/text/document.rb +39 -0
  22. data/lib/rbbt/{ner → text}/segment.rb +11 -6
  23. data/lib/rbbt/{ner → text}/segment/docid.rb +1 -1
  24. data/lib/rbbt/{ner → text}/segment/named_entity.rb +2 -2
  25. data/lib/rbbt/{ner → text}/segment/relationship.rb +1 -1
  26. data/lib/rbbt/{ner → text}/segment/segmented.rb +1 -1
  27. data/lib/rbbt/{ner → text}/segment/token.rb +1 -1
  28. data/lib/rbbt/{ner → text}/segment/transformed.rb +47 -42
  29. data/test/rbbt/entity/test_document.rb +1 -0
  30. data/test/rbbt/ner/test_abner.rb +1 -0
  31. data/test/rbbt/ner/test_linnaeus.rb +1 -0
  32. data/test/rbbt/nlp/open_nlp/test_sentence_splitter.rb +0 -1
  33. data/test/rbbt/text/corpus/sources/test_pmid.rb +33 -0
  34. data/test/rbbt/text/corpus/test_document.rb +52 -0
  35. data/test/rbbt/{ner → text}/segment/test_named_entity.rb +2 -2
  36. data/test/rbbt/{ner → text}/segment/test_relationship.rb +0 -0
  37. data/test/rbbt/{ner → text}/segment/test_segmented.rb +1 -1
  38. data/test/rbbt/{ner → text}/segment/test_transformed.rb +96 -3
  39. data/test/rbbt/text/test_corpus.rb +34 -0
  40. data/test/rbbt/text/test_document.rb +58 -0
  41. data/test/rbbt/{ner → text}/test_segment.rb +2 -2
  42. data/test/test_helper.rb +3 -3
  43. metadata +32 -24
  44. data/lib/rbbt/corpus/document.rb +0 -266
  45. data/lib/rbbt/corpus/document_repo.rb +0 -137
  46. data/lib/rbbt/corpus/sources/pubmed.rb +0 -27
  47. data/lib/rbbt/entity/document.rb +0 -75
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: ea1646b5f32644bb5872f57422534b49955f988df26df4a65c8dda592515eac3
- data.tar.gz: 3f6bc60546b79c76b6b35840712453616c377fcc088f321e95847f116776bef1
+ metadata.gz: 77391b4691e4ea2a6e5da918bc40820bae8175ff1d82f9c96a1685986605dfd7
+ data.tar.gz: a83dd9236502d1787f1040fb4c60a6160086515713282283e434b589c1425743
  SHA512:
- metadata.gz: 9376c68bad67733b5771b57ead7c962d45ff29c44362d1c51bf3480d3c3bf9f1f75284e40044fc4ed95bd94a03ab0759b3b7320bf1e3da00a0cdd82255c9395c
- data.tar.gz: cd25a9cd91fde366be195801d45238d555edfc94f2b06391db7db2d9f4781b34dd599514385782d6c7e22af2841c5f3322ba74bf0a3a9c1fdbe308a255f00098
+ metadata.gz: f69d7eb10741d2b3c7735e8e29f29625567775647d16d0261b42cce108d2f8309a2e938dad3360842a964a9c5d4fd5a2197c72618ab40971f7a65306e9c6936a
+ data.tar.gz: dec802a15cfc7c8c9a90ee8ec0c83af88c881ee16e071776a995554aa0661603bdd6cb7bf30162c43beccf1a423a2e8d26afc15f92544ccc08284a87a038a1b2
data/lib/rbbt/ner/NER.rb CHANGED
@@ -1,6 +1,6 @@
- require 'rbbt/ner/segment'
- require 'rbbt/ner/segment/named_entity'
- require 'rbbt/ner/segment/segmented'
+ require 'rbbt/text/segment'
+ require 'rbbt/text/segment/named_entity'
+ require 'rbbt/text/segment/segmented'
 
  class NER
    def entities(text, protect = false, *args)
data/lib/rbbt/ner/abner.rb CHANGED
@@ -1,7 +1,7 @@
  require 'rbbt'
  require 'rjb'
- require 'rbbt/ner/segment'
  require 'rbbt/resource'
+ require 'rbbt/text/segment'
  require 'rbbt/ner/NER'
 
  # Offers a Ruby interface to the Abner Named Entity Recognition Package
@@ -31,10 +31,10 @@ class Abner < NER
    # Given a chunk of text, it finds all the mentions appearing in it. It
    # returns all the mentions found, regardless of type, to be coherent
    # with the rest of NER packages in Rbbt.
-   def match(text)
+   def match(text, fix_encode = true)
      return [] if text.nil? or text.empty?
 
-     text = text.encode('utf-8', 'binary', :invalid => :replace, :undef => :replace, :replace => '')
+     text = text.encode('utf-8', 'binary', :invalid => :replace, :undef => :replace, :replace => '') if fix_encode
      res = @tagger.getEntities(text)
      types = res[1]
      strings = res[0]
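
Abner#match now takes a fix_encode flag, so callers that already guarantee clean UTF-8 can skip the re-encoding pass. A minimal usage sketch; the constructor call and the sample sentence are assumptions, not part of this diff, and ABNER's Java dependencies must be installed for it to run:

  require 'rbbt/ner/abner'

  tagger = Abner.new                                    # assumed default constructor
  mentions = tagger.match("p53 regulates MDM2", false)  # fix_encode = false: text passed through as-is

  mentions.each { |mention| puts mention }              # each mention is a Segment-annotated string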
data/lib/rbbt/ner/banner.rb CHANGED
@@ -1,7 +1,7 @@
  require 'rbbt'
  require 'rjb'
- require 'rbbt/ner/segment'
  require 'rbbt/ner/NER'
+ require 'rbbt/text/segment'
 
  # Offers a Ruby interface to the Banner Named Entity Recognition Package
  # in Java. Banner[http://banner.sourceforge.net/].
data/lib/rbbt/ner/brat.rb CHANGED
@@ -1,5 +1,5 @@
- require 'rbbt/ner/segment/named_entity'
- require 'rbbt/ner/segment/relationship'
+ require 'rbbt/text/segment/named_entity'
+ require 'rbbt/text/segment/relationship'
  module Brat
    Rbbt.claim Rbbt.software.opt.Brat, :install, "https://github.com/nlplab/brat.git"
 
data/lib/rbbt/ner/chemical_tagger.rb CHANGED
@@ -1,6 +1,6 @@
  require 'rbbt'
  require 'rjb'
- require 'rbbt/ner/segment'
+ require 'rbbt/text/segment'
  require 'rbbt/ner/NER'
  require 'rbbt/util/log'
 
data/lib/rbbt/ner/linnaeus.rb CHANGED
@@ -1,6 +1,6 @@
  require 'rjb'
  require 'rbbt'
- require 'rbbt/ner/segment/named_entity'
+ require 'rbbt/text/segment/named_entity'
 
  module Linnaeus
 
data/lib/rbbt/ner/ngram_prefix_dictionary.rb CHANGED
@@ -1,8 +1,8 @@
  require 'rbbt'
  require 'rbbt/util/misc'
  require 'rbbt/tsv'
- require 'rbbt/ner/segment'
- require 'rbbt/ner/segment/token'
+ require 'rbbt/text/segment'
+ require 'rbbt/text/segment/token'
  require 'rbbt/ner/NER'
  require 'inline'
 
data/lib/rbbt/ner/oscar3.rb CHANGED
@@ -1,7 +1,7 @@
  require 'rbbt'
  require 'rjb'
  require 'libxml'
- require 'rbbt/ner/segment'
+ require 'rbbt/text/segment'
  require 'rbbt/ner/NER'
  require 'rbbt/util/log'
 
data/lib/rbbt/ner/oscar4.rb CHANGED
@@ -1,7 +1,7 @@
  require 'rbbt'
  require 'rjb'
  require 'libxml'
- require 'rbbt/ner/segment'
+ require 'rbbt/text/segment'
  require 'rbbt/ner/NER'
  require 'rbbt/util/log'
 
data/lib/rbbt/ner/patterns.rb CHANGED
@@ -1,7 +1,7 @@
- require 'rbbt/ner/segment/named_entity'
- require 'rbbt/ner/segment/segmented'
- require 'rbbt/ner/segment/transformed'
- require 'rbbt/ner/segment/relationship'
+ require 'rbbt/text/segment/named_entity'
+ require 'rbbt/text/segment/segmented'
+ require 'rbbt/text/segment/transformed'
+ require 'rbbt/text/segment/relationship'
  require 'rbbt/ner/regexpNER'
  require 'rbbt/ner/token_trieNER'
  require 'rbbt/nlp/nlp'
data/lib/rbbt/ner/regexpNER.rb CHANGED
@@ -1,4 +1,4 @@
- require 'rbbt/ner/segment'
+ require 'rbbt/text/segment'
  require 'rbbt/ner/NER'
  require 'rbbt/util/simpleDSL'
 
data/lib/rbbt/ner/token_trieNER.rb CHANGED
@@ -1,7 +1,7 @@
  require 'rbbt'
  require 'rbbt/tsv'
- require 'rbbt/ner/segment'
- require 'rbbt/ner/segment/token'
+ require 'rbbt/text/segment'
+ require 'rbbt/text/segment/token'
  require 'rbbt/ner/NER'
 
  class TokenTrieNER < NER
data/lib/rbbt/nlp/genia/sentence_splitter.rb CHANGED
@@ -1,5 +1,5 @@
  require 'rbbt/nlp/nlp'
- require 'rbbt/ner/segment'
+ require 'rbbt/text/segment'
  module NLP
    Rbbt.claim Rbbt.software.opt.Geniass, :install, Rbbt.share.install.software.Geniass.find
 
data/lib/rbbt/nlp/nlp.rb CHANGED
@@ -2,8 +2,8 @@ require 'rbbt'
  require 'rbbt/util/tmpfile'
  require 'rbbt/persist'
  require 'rbbt/resource'
- require 'rbbt/ner/segment'
- require 'rbbt/ner/segment/segmented'
+ require 'rbbt/text/segment'
+ require 'rbbt/text/segment/segmented'
  require 'rbbt/nlp/genia/sentence_splitter'
  require 'digest/md5'
 
data/lib/rbbt/nlp/open_nlp/sentence_splitter.rb CHANGED
@@ -1,6 +1,6 @@
  require 'rbbt'
  require 'rjb'
- require 'rbbt/ner/segment'
+ require 'rbbt/text/segment'
  require 'rbbt/resource'
 
  module OpenNLP
data/lib/rbbt/{corpus → text}/corpus.rb RENAMED
@@ -1,8 +1,17 @@
- require 'rbbt/corpus/document'
- require 'rbbt/corpus/document_repo'
+ require 'rbbt/text/corpus/document'
+ require 'rbbt/text/corpus/document_repo'
 
  class Corpus
+   class << self
+     attr_accessor :claims
+     def claim(namespace, &block)
+       @@claims = {}
+       @@claims[namespace] = block
+     end
+
+   end
    attr_accessor :corpora_path, :document_repo, :persistence_dir, :global_annotations
+
    def initialize(corpora_path = nil)
      @corpora_path = case
      when corpora_path.nil?
@@ -24,6 +33,7 @@ class Corpus
 
    Misc.lock(@corpora_path.document_repo) do
      @document_repo = DocumentRepo.open_tokyocabinet @corpora_path.document_repo, false
+     @document_repo.close
    end
 
  end
@@ -32,35 +42,65 @@ class Corpus
      File.join(persistence_dir, docid)
    end
 
+
+   def docid(docid)
+     begin
+       if @document_repo.include?(docid)
+         Document.new(persistence_for(docid), docid, @document_repo[docid], @global_annotations, self)
+       else
+         namespace, id, type = docid.split(":")
+         if @@claims.include?(namespace)
+
+           docid = self.instance_exec id, type, &(@@claims[namespace])
+           docid = docid.first if Array === docid
+           self.docid(docid)
+         else
+           raise "Document '#{ docid }' was not found." unless @document_repo.include? docid
+         end
+       end
+     ensure
+       @document_repo.close
+     end
+   end
+
    def document(namespace, id, type, hash)
      docid = [namespace, id, type, hash] * ":"
-     raise "Document '#{ docid }' was not found." unless @document_repo.include? docid
-     Document.new(persistence_for(docid), docid, @document_repo[docid], @global_annotations)
+     self.docid(docid)
    end
 
-   def docid(docid)
-     raise "Document '#{ docid }' was not found." unless @document_repo.include? docid
-     Document.new(persistence_for(docid), docid, @document_repo[docid], @global_annotations)
+   def add_document(text, namespace = nil, id = nil, type = nil)
+     text = Misc.fixutf8(text)
+     hash = Digest::MD5.hexdigest(text)
+     @document_repo.add(text, namespace, id, type, hash)
    end
 
-   def add_document(text, namespace, id, type = nil)
-     hash = Digest::MD5.hexdigest(text)
+   def add_docid(text, docid)
+     namespace, id, type, hash = docid.split(":")
      @document_repo.add(text, namespace, id, type, hash)
    end
 
+
    def find(namespace=nil, id = nil, type = nil, hash = nil)
      @document_repo.find(namespace, id, type, hash).collect{|docid|
-       Document.new(persistence_for(docid), docid, @document_repo[docid], @global_annotations)
+       self.docid(docid)
      }
    end
 
    def find_docid(docid)
      @document_repo.find_docid(docid).collect{|docid|
-       Document.new(persistence_for(docid), docid, @document_repo[docid], @global_annotations)
+       self.docid(docid)
      }
    end
 
    def exists?(namespace=nil, id = nil, type = nil, hash = nil)
      find(namespace, id, type, hash).any?
    end
+
+   def [](docid)
+     self.docid(docid)
+   end
+
+   def include?(id)
+     @document_repo.include? id
+   end
  end
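
The reworked Corpus adds on-demand document resolution: Corpus.claim registers a block per namespace, and #docid (also reachable via #[]) either loads the document from the repository or runs the matching claim to fetch and register it first. A minimal sketch, assuming a hypothetical "MyNS" namespace and a local text store (neither comes from this diff), and assuming add_document returns the new document id, as the resolver expects the block to do:

  require 'rbbt/text/corpus'

  # Register how to fetch documents for the "MyNS" namespace. The block is
  # run with instance_exec on the corpus, so add_document is available here.
  Corpus.claim "MyNS" do |id, type|
    text = File.read("/some/local/store/#{id}.txt")  # assumed source of the raw text
    add_document(text, "MyNS", id, type)             # stores it and returns the docid
  end

  corpus = Corpus.new
  doc = corpus.docid("MyNS:12345:abstract")          # fetched through the claim on first access
  puts doc.text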
data/lib/rbbt/text/corpus/document.rb ADDED
@@ -0,0 +1,361 @@
+ require 'rbbt/text/segment'
+ require 'rbbt/text/segment/segmented'
+ require 'rbbt/tsv'
+ require 'rbbt/resource/path'
+ require 'rbbt/persist/tsv'
+ require 'rbbt/util/misc'
+ require 'rbbt/text/document'
+ require 'json'
+
+ class Corpus
+   class Document
+
+     class MultipleEntity < Exception; end
+
+     attr_accessor :text, :docid, :namespace, :id, :type, :hash, :segments, :segment_indices, :persist_dir, :global_persistence, :corpus
+
+     attr_accessor :multiple_result
+     def initialize(persist_dir = nil, docid = nil, text = nil, global_persistence = nil, corpus = nil)
+       @segments = {}
+       @segment_indices = {}
+       @corpus = corpus
+
+       if not persist_dir.nil?
+         @persist_dir = persist_dir
+         @persist_dir = Path.setup(@persist_dir) if not Path == @persist_dir
+       end
+
+       @global_persistence = global_persistence
+
+       if not docid.nil?
+         @docid = docid
+         update_docid
+       end
+       @text = text unless text.nil?
+     end
+
+     def update_docid
+       @namespace, @id, @type, @hash = docid.split(":", -1)
+     end
+
+     def docid=(docid)
+       @docid = docid
+       update_docid
+     end
+
+     def self.define(entity, &block)
+       send :define_method, "produce_#{entity}", &block
+
+       self.class_eval <<-EOC, __FILE__, __LINE__
+         def load_#{entity}(raw = false)
+           return if segments.include? "#{ entity }"
+           if self.respond_to?("load_with_persistence_#{entity}") and not @persist_dir.nil?
+             segments["#{entity}"] = load_with_persistence_#{entity}(raw)
+           else
+             segments["#{ entity }"] = produce_#{entity}
+           end
+         end
+
+         def #{entity}(raw = false)
+           begin
+             entities = segments["#{ entity }"]
+             if entities.nil?
+               load_#{entity}(raw)
+               entities = segments["#{ entity }"]
+             end
+           end
+
+           entities
+         end
+
+         def #{entity}_at(pos, persist = false)
+           segment_index("#{ entity }", persist ? File.join(@persist_dir, 'ranges') : nil)[pos]
+         end
+
+       EOC
+     end
+
+     def self.define_multiple(entity, &block)
+       send :define_method, "produce_#{entity}" do
+         return self.multiple_result[entity] if self.multiple_result && self.multiple_result[entity]
+         raise MultipleEntity, "Entity #{entity} runs with multiple documents, please prepare beforehand with prepare_multiple: #{self.docid}"
+       end
+
+       name = "multiple_produce_#{entity}"
+       class << self
+         self
+       end.send :define_method, name, &block
+
+       self.class_eval <<-EOC, __FILE__, __LINE__
+         def load_#{entity}(raw = false)
+           return if segments.include? "#{ entity }"
+           if self.respond_to?("load_with_persistence_#{entity}") and not @persist_dir.nil?
+             segments["#{entity}"] = load_with_persistence_#{entity}(raw)
+           else
+             segments["#{ entity }"] = produce_#{entity}
+           end
+         end
+
+         def #{entity}(raw = false)
+           begin
+             entities = segments["#{ entity }"]
+             if entities.nil?
+               load_#{entity}(raw)
+               entities = segments["#{ entity }"]
+             end
+           end
+
+           entities
+         end
+
+         def #{entity}_at(pos, persist = false)
+           segment_index("#{ entity }", persist ? File.join(@persist_dir, 'ranges') : nil)[pos]
+         end
+
+       EOC
+     end
+
+     def self.prepare_multiple(docs, entity)
+       missing = []
+       docs.each do |doc|
+         begin
+           doc.send(entity)
+         rescue MultipleEntity
+           missing << doc
+         end
+       end
+       res = self.send("multiple_produce_#{entity.to_s}", missing)
+       case res
+       when Array
+         res.each_with_index do |res,i|
+           missing[i].multiple_result ||= {}
+           missing[i].multiple_result[entity] = res
+         end
+       when Hash
+         res.each do |document,res|
+           case document
+           when Corpus::Document
+             document.multiple_result[entity] = res
+           when String
+             document = missing.select{|d| d.docid == document}.first
+             document.multiple_result[entity] = res
+           end
+         end
+       end
+       missing.each{|doc| doc.send entity }
+     end
+
+
+     #{{{ PERSISTENCE
+
+     TSV_REPOS = {}
+     FIELDS_FOR_ENTITY_PERSISTENCE = {}
+     def self.persist(entity, fields = nil)
+
+       if not fields.nil?
+         fields = [fields] if not Array === fields
+         fields = fields.collect{|f| f.to_s}
+         FIELDS_FOR_ENTITY_PERSISTENCE[entity.to_s] = fields
+       end
+
+       self.class_eval <<-EOC, __FILE__, __LINE__
+         def load_with_persistence_#{entity}(raw = false)
+           fields = FIELDS_FOR_ENTITY_PERSISTENCE["#{ entity }"]
+
+           tsv_file = File.join(@persist_dir.find, "#{ entity }")
+
+           return nil if raw == :check and File.exists? tsv_file
+
+           annotations = Persist.persist("Entity[#{ entity }]", :tsv, :file => tsv_file) do
+             segments = produce_#{entity}
+             tsv = Segment.tsv(segments, fields)
+           end
+
+           return annotations if raw
+
+           annotations.unnamed = true
+           annotations.collect{|id, annotation|
+             Segment.load_tsv_values(text, annotation, annotations.fields)
+           }
+         end
+       EOC
+     end
+
+     def self.persist_in_tsv(entity, tsv = nil, fields = nil)
+       tsv = TSV.setup(Persist.open_tokyocabinet(tsv, false, :list), :key => "ID", :fields => ["Start", "End", "JSON", "Document ID", "Entity Type"]).tap{|t| t.unnamed = true, t.close} if Path === tsv
+
+       if ! tsv.nil? && ! tsv.respond_to?(:keys)
+         fields = tsv
+         tsv = nil
+       end
+
+       TSV_REPOS[entity.to_s] = tsv
+
+       if ! fields.nil?
+         fields = [fields] if not Array === fields
+         fields = fields.collect{|f| f.to_s}
+         FIELDS_FOR_ENTITY_PERSISTENCE[entity.to_s] = fields unless fields.nil?
+       end
+
+       self.class_eval <<-EOC, __FILE__, __LINE__
+         def load_with_persistence_#{entity}(raw = false)
+           repo = TSV_REPOS["#{ entity }"]
+           if repo.nil?
+             raise "No persistence file or persistence dir for persist_in_tsv" if persist_dir.nil?
+             repo = Persist.open_tokyocabinet(persist_dir.annotations_by_type.find, true, :marshal_tsv)
+           end
+
+           fields = FIELDS_FOR_ENTITY_PERSISTENCE["#{ entity }"]
+           begin
+             if ! repo.include?("#{ entity }")
+               segments = produce_#{entity}
+               repo.write_and_read do
+                 repo["#{entity}"] = Segment.tsv(segments, fields) if segments.any?
+               end
+             else
+               if raw == :check
+                 repo.close
+                 return nil
+               end
+             end
+
+             annotations = repo["#{entity}"]
+
+             repo.close
+
+             return annotations if raw
+
+             annotations.unnamed = true
+             annotations.collect{|id, annotation|
+               Segment.load_tsv_values(text, annotation, annotations.fields)
+             }
+           ensure
+             repo.close
+           end
+         end
+       EOC
+     end
+
+     def self.persist_in_global_tsv(entity, tsv = nil, fields = nil, doc_field = nil, entity_field = nil)
+       tsv = TSV.setup(Persist.open_tokyocabinet(tsv, false, :list), :key => "ID", :fields => (fields || ["Start", "End", "JSON", "Document ID", "Entity Type"])).tap{|t| t.unnamed = true, t.close} if Path === tsv
+
+       doc_field ||= "Document ID"
+       entity_field ||= "Entity Type"
+
+       TSV_REPOS[entity.to_s] = tsv
+
+       if not fields.nil?
+         fields = [fields] if not Array === fields
+         fields = fields.collect{|f| f.to_s}
+       else
+         fields = nil
+       end
+
+       FIELDS_FOR_ENTITY_PERSISTENCE[entity.to_s] = fields
+
+       self.class_eval <<-EOC, __FILE__, __LINE__
+         def load_with_persistence_#{entity}(raw = false)
+           fields = FIELDS_FOR_ENTITY_PERSISTENCE["#{ entity }"]
+
+           data = TSV_REPOS["#{ entity }"] || @global_persistence
+
+           begin
+
+             data.read true
+
+             fields = data.fields if fields.nil? and data.respond_to? :fields
+
+
+             if data.respond_to? :persistence_path and String === data.persistence_path
+               data.filter(data.persistence_path + '.filters')
+             end
+
+             data.add_filter("field:#{ doc_field }", @docid) if data.fields.include?("#{doc_field}")
+             data.add_filter("field:#{ entity_field }", "#{ entity }") if data.fields.include?("#{entity_field}")
+             keys = data.keys
+             data.pop_filter if data.fields.include?("#{entity_field}")
+             data.pop_filter if data.fields.include?("#{doc_field}")
+
+             if keys.empty?
+               segments = produce_#{entity}
+               segments << Segment.setup("No #{entity} found in document " + @docid.to_s, -1) if segments.empty?
+               tsv = Segment.tsv(segments, *fields.reject{|f| ["#{doc_field}", "#{entity_field}", "Start", "End", "annotation_types"].include? f})
+
+               tsv.add_field "#{ doc_field }" do
+                 @docid
+               end
+
+               tsv.add_field "#{ entity_field }" do
+                 "#{ entity }"
+               end
+
+               data.add_filter("field:#{ doc_field }", @docid) if data.fields.include?("#{doc_field}")
+               data.add_filter("field:#{ entity_field }", "#{ entity }") if data.fields.include?("#{entity_field}")
+               data.write true
+               keys = tsv.collect do |key, value|
+                 data[key] = value
+                 key
+               end
+               data.pop_filter if data.fields.include?("#{entity_field}")
+               data.pop_filter if data.fields.include?("#{doc_field}")
+               data.read
+
+             else
+               if raw == :check
+                 data.close
+                 return nil
+               end
+             end
+
+             return data.values if raw
+
+             start_pos = data.identify_field "Start"
+             segments = data.values_at(*keys).collect{|annotation|
+               pos = annotation[start_pos]
+               Segment.load_tsv_values(text, annotation, data.fields) unless [-1, "-1", [-1], ["-1"]].include? pos
+             }.compact
+             data.close
+
+             segments
+           ensure
+             data.close
+           end
+
+         end
+       EOC
+     end
+
+     def segment_index(name, persist_dir = nil)
+       @segment_indices[name] ||= Segment.index(self.send(name), persist_dir.nil? ? :memory : File.join(persist_dir, name + '.range'))
+     end
+
+     def load_into(segment, *annotations)
+       options = annotations.pop if Hash === annotations.last
+       options ||= {}
+
+       if options[:persist] and not @persist_dir.nil?
+         persist_dir = File.join(@persist_dir, 'ranges')
+       else
+         persist_dir = nil
+       end
+
+       Segmented.setup(segment, {})
+       annotations.collect do |name|
+         name = name.to_s
+         index = segment_index(name, persist_dir)
+         annotations = index[segment.range]
+         segment.segments[name] ||= {}
+         segment.segments[name] = annotations
+         class << segment
+           self
+         end.class_eval "def #{ name }; @segments['#{ name }']; end", __FILE__, __LINE__
+       end
+
+       segment
+     end
+
+     def entity
+       Object::Document.setup(self.docid, corpus)
+     end
+   end
+ end
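
Corpus::Document.define declares an entity producer: the generated accessor caches results in doc.segments and, when a persistence strategy is registered, routes through the corresponding load_with_persistence_* method. A minimal sketch of the pattern, assuming `doc` was obtained from Corpus#docid as above; the :sentences name and the naive splitting are illustrative only, not part of this diff:

  require 'rbbt/text/corpus/document'

  # Hypothetical producer: the block runs as an instance method, so it can
  # read `text`, and it must return Segment-annotated strings.
  Corpus::Document.define :sentences do
    offset = 0
    text.split(/(?<=\.)\s+/).collect do |sentence|
      start = text.index(sentence, offset)
      offset = start + sentence.length
      Segment.setup(sentence, start)             # the string plus its character offset
    end
  end

  doc.sentences          # produced on first call, then cached in doc.segments["sentences"]
  doc.sentences_at(100)  # sentences covering character position 100, via segment_index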