rbbt-text 1.1.8 → 1.1.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/rbbt/ner/NER.rb +3 -3
- data/lib/rbbt/ner/abner.rb +3 -3
- data/lib/rbbt/ner/banner.rb +1 -1
- data/lib/rbbt/ner/brat.rb +2 -2
- data/lib/rbbt/ner/chemical_tagger.rb +1 -1
- data/lib/rbbt/ner/linnaeus.rb +1 -1
- data/lib/rbbt/ner/ngram_prefix_dictionary.rb +2 -2
- data/lib/rbbt/ner/oscar3.rb +1 -1
- data/lib/rbbt/ner/oscar4.rb +1 -1
- data/lib/rbbt/ner/patterns.rb +4 -4
- data/lib/rbbt/ner/regexpNER.rb +1 -1
- data/lib/rbbt/ner/token_trieNER.rb +2 -2
- data/lib/rbbt/nlp/genia/sentence_splitter.rb +1 -1
- data/lib/rbbt/nlp/nlp.rb +2 -2
- data/lib/rbbt/nlp/open_nlp/sentence_splitter.rb +1 -1
- data/lib/rbbt/{corpus → text}/corpus.rb +51 -11
- data/lib/rbbt/text/corpus/document.rb +361 -0
- data/lib/rbbt/text/corpus/document_repo.rb +68 -0
- data/lib/rbbt/text/corpus/sources/pmid.rb +34 -0
- data/lib/rbbt/text/document.rb +39 -0
- data/lib/rbbt/{ner → text}/segment.rb +11 -6
- data/lib/rbbt/{ner → text}/segment/docid.rb +1 -1
- data/lib/rbbt/{ner → text}/segment/named_entity.rb +2 -2
- data/lib/rbbt/{ner → text}/segment/relationship.rb +1 -1
- data/lib/rbbt/{ner → text}/segment/segmented.rb +1 -1
- data/lib/rbbt/{ner → text}/segment/token.rb +1 -1
- data/lib/rbbt/{ner → text}/segment/transformed.rb +47 -42
- data/test/rbbt/entity/test_document.rb +1 -0
- data/test/rbbt/ner/test_abner.rb +1 -0
- data/test/rbbt/ner/test_linnaeus.rb +1 -0
- data/test/rbbt/nlp/open_nlp/test_sentence_splitter.rb +0 -1
- data/test/rbbt/text/corpus/sources/test_pmid.rb +33 -0
- data/test/rbbt/text/corpus/test_document.rb +52 -0
- data/test/rbbt/{ner → text}/segment/test_named_entity.rb +2 -2
- data/test/rbbt/{ner → text}/segment/test_relationship.rb +0 -0
- data/test/rbbt/{ner → text}/segment/test_segmented.rb +1 -1
- data/test/rbbt/{ner → text}/segment/test_transformed.rb +96 -3
- data/test/rbbt/text/test_corpus.rb +34 -0
- data/test/rbbt/text/test_document.rb +58 -0
- data/test/rbbt/{ner → text}/test_segment.rb +2 -2
- data/test/test_helper.rb +3 -3
- metadata +32 -24
- data/lib/rbbt/corpus/document.rb +0 -266
- data/lib/rbbt/corpus/document_repo.rb +0 -137
- data/lib/rbbt/corpus/sources/pubmed.rb +0 -27
- data/lib/rbbt/entity/document.rb +0 -75
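The bulk of this release is a reorganization: the corpus classes and the segment modules move from rbbt/corpus and rbbt/ner into rbbt/text. The sketch below is a hypothetical downstream snippet updated for the new layout; the old paths are taken from the removed requires in the hunks below and from the rename mapping in the list above.

```ruby
# 1.1.8 layout (old paths)
# require 'rbbt/ner/segment'
# require 'rbbt/ner/segment/named_entity'
# require 'rbbt/corpus/document'

# 1.1.9 layout
require 'rbbt/text/segment'
require 'rbbt/text/segment/named_entity'
require 'rbbt/text/corpus/document'
```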
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 77391b4691e4ea2a6e5da918bc40820bae8175ff1d82f9c96a1685986605dfd7
+  data.tar.gz: a83dd9236502d1787f1040fb4c60a6160086515713282283e434b589c1425743
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: f69d7eb10741d2b3c7735e8e29f29625567775647d16d0261b42cce108d2f8309a2e938dad3360842a964a9c5d4fd5a2197c72618ab40971f7a65306e9c6936a
+  data.tar.gz: dec802a15cfc7c8c9a90ee8ec0c83af88c881ee16e071776a995554aa0661603bdd6cb7bf30162c43beccf1a423a2e8d26afc15f92544ccc08284a87a038a1b2
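The refreshed digests above cover the two archives packed inside the released gem (metadata.gz and data.tar.gz). A minimal verification sketch in Ruby, assuming the gem file has been downloaded locally; the filename below is an assumption.

```ruby
require 'digest'
require 'rubygems/package'

# Hypothetical local path to the downloaded gem; adjust as needed.
gem_file = 'rbbt-text-1.1.9.gem'

# New SHA256 digests from checksums.yaml above.
expected = {
  'metadata.gz' => '77391b4691e4ea2a6e5da918bc40820bae8175ff1d82f9c96a1685986605dfd7',
  'data.tar.gz' => 'a83dd9236502d1787f1040fb4c60a6160086515713282283e434b589c1425743',
}

# A .gem file is a tar archive whose entries include metadata.gz and data.tar.gz.
File.open(gem_file, 'rb') do |io|
  Gem::Package::TarReader.new(io) do |tar|
    tar.each do |entry|
      next unless expected.key?(entry.full_name)
      digest = Digest::SHA256.hexdigest(entry.read)
      status = digest == expected[entry.full_name] ? 'OK' : 'MISMATCH'
      puts "#{entry.full_name}: #{status}"
    end
  end
end
```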
data/lib/rbbt/ner/NER.rb
CHANGED
@@ -1,6 +1,6 @@
-require 'rbbt/
-require 'rbbt/
-require 'rbbt/
+require 'rbbt/text/segment'
+require 'rbbt/text/segment/named_entity'
+require 'rbbt/text/segment/segmented'
 
 class NER
   def entities(text, protect = false, *args)
data/lib/rbbt/ner/abner.rb
CHANGED
@@ -1,7 +1,7 @@
 require 'rbbt'
 require 'rjb'
-require 'rbbt/ner/segment'
 require 'rbbt/resource'
+require 'rbbt/text/segment'
 require 'rbbt/ner/NER'
 
 # Offers a Ruby interface to the Abner Named Entity Recognition Package
@@ -31,10 +31,10 @@ class Abner < NER
   # Given a chunk of text, it finds all the mentions appearing in it. It
   # returns all the mentions found, regardless of type, to be coherent
   # with the rest of NER packages in Rbbt.
-  def match(text)
+  def match(text, fix_encode = true)
     return [] if text.nil? or text.empty?
 
-    text = text.encode('utf-8', 'binary', :invalid => :replace, :undef => :replace, :replace => '')
+    text = text.encode('utf-8', 'binary', :invalid => :replace, :undef => :replace, :replace => '') if fix_encode
     res = @tagger.getEntities(text)
     types = res[1]
     strings = res[0]
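Abner#match now takes a fix_encode flag (default true), so callers that already guarantee valid UTF-8 can skip the re-encoding step. A minimal usage sketch, assuming the ABNER Java dependencies are installed and reachable through Rjb as rbbt-text expects; the sample sentence is illustrative only.

```ruby
require 'rbbt/ner/abner'

ner = Abner.new

text = "NF-kappa B activates IL-2 expression in T cells."

mentions = ner.match(text)         # re-encodes to UTF-8 first (previous behaviour)
mentions = ner.match(text, false)  # new: skip re-encoding when input is known-good UTF-8

mentions.each { |mention| puts "#{mention} @ #{mention.offset}" }
```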
data/lib/rbbt/ner/banner.rb
CHANGED
data/lib/rbbt/ner/brat.rb
CHANGED
data/lib/rbbt/ner/linnaeus.rb
CHANGED
data/lib/rbbt/ner/oscar3.rb
CHANGED
data/lib/rbbt/ner/oscar4.rb
CHANGED
data/lib/rbbt/ner/patterns.rb
CHANGED
@@ -1,7 +1,7 @@
-require 'rbbt/
-require 'rbbt/
-require 'rbbt/
-require 'rbbt/
+require 'rbbt/text/segment/named_entity'
+require 'rbbt/text/segment/segmented'
+require 'rbbt/text/segment/transformed'
+require 'rbbt/text/segment/relationship'
 require 'rbbt/ner/regexpNER'
 require 'rbbt/ner/token_trieNER'
 require 'rbbt/nlp/nlp'
data/lib/rbbt/ner/regexpNER.rb
CHANGED
data/lib/rbbt/nlp/nlp.rb
CHANGED
@@ -2,8 +2,8 @@ require 'rbbt'
 require 'rbbt/util/tmpfile'
 require 'rbbt/persist'
 require 'rbbt/resource'
-require 'rbbt/
-require 'rbbt/
+require 'rbbt/text/segment'
+require 'rbbt/text/segment/segmented'
 require 'rbbt/nlp/genia/sentence_splitter'
 require 'digest/md5'
 
data/lib/rbbt/{corpus → text}/corpus.rb
CHANGED
@@ -1,8 +1,17 @@
-require 'rbbt/corpus/document'
-require 'rbbt/corpus/document_repo'
+require 'rbbt/text/corpus/document'
+require 'rbbt/text/corpus/document_repo'
 
 class Corpus
+  class << self
+    attr_accessor :claims
+    def claim(namespace, &block)
+      @@claims = {}
+      @@claims[namespace] = block
+    end
+
+  end
   attr_accessor :corpora_path, :document_repo, :persistence_dir, :global_annotations
+
   def initialize(corpora_path = nil)
     @corpora_path = case
     when corpora_path.nil?
@@ -24,6 +33,7 @@ class Corpus
 
     Misc.lock(@corpora_path.document_repo) do
       @document_repo = DocumentRepo.open_tokyocabinet @corpora_path.document_repo, false
+      @document_repo.close
     end
 
   end
@@ -32,35 +42,65 @@ class Corpus
     File.join(persistence_dir, docid)
   end
 
+
+  def docid(docid)
+    begin
+      if @document_repo.include?(docid)
+        Document.new(persistence_for(docid), docid, @document_repo[docid], @global_annotations, self)
+      else
+        namespace, id, type = docid.split(":")
+        if @@claims.include?(namespace)
+
+          docid = self.instance_exec id, type, &(@@claims[namespace])
+          docid = docid.first if Array === docid
+          self.docid(docid)
+        else
+          raise "Document '#{ docid }' was not found." unless @document_repo.include? docid
+        end
+      end
+    ensure
+      @document_repo.close
+    end
+  end
+
   def document(namespace, id, type, hash)
     docid = [namespace, id, type, hash] * ":"
-
-    Document.new(persistence_for(docid), docid, @document_repo[docid], @global_annotations)
+    self.docid(docid)
   end
 
-  def
-
-
+  def add_document(text, namespace = nil, id = nil, type = nil)
+    text = Misc.fixutf8(text)
+    hash = Digest::MD5.hexdigest(text)
+    @document_repo.add(text, namespace, id, type, hash)
   end
 
-  def
-    hash =
+  def add_docid(text, docid)
+    namespace, id, type, hash = docid.split(":")
     @document_repo.add(text, namespace, id, type, hash)
   end
 
+
   def find(namespace=nil, id = nil, type = nil, hash = nil)
     @document_repo.find(namespace, id, type, hash).collect{|docid|
-
+      self.docid(docid)
     }
   end
 
   def find_docid(docid)
     @document_repo.find_docid(docid).collect{|docid|
-
+      self.docid(docid)
     }
   end
 
   def exists?(namespace=nil, id = nil, type = nil, hash = nil)
     find(namespace, id, type, hash).any?
   end
+
+  def [](docid)
+    self.docid(docid)
+  end
+
+  def include?(id)
+    @document_repo.include? id
+  end
 end
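Corpus now exposes a claim/docid mechanism: Corpus.claim registers a block for a namespace, and Corpus#docid (also reachable as corpus[docid]) runs that block when a requested document is not yet in the repository. A sketch of how this could be wired up; fetch_text is a hypothetical helper and the "PMID" namespace is only illustrative (a real claim for PubMed lives in the new corpus/sources/pmid.rb).

```ruby
require 'digest/md5'
require 'rbbt/text/corpus'

corpus = Corpus.new

# Register a claim for a namespace: the block runs in the corpus instance
# (instance_exec) and must return the docid of the document it registered.
Corpus.claim "PMID" do |id, type|
  text  = fetch_text(id)                                    # hypothetical helper
  docid = ["PMID", id, type, Digest::MD5.hexdigest(text)] * ":"
  add_docid(text, docid)
  docid
end

# Documents can also be registered directly; the MD5 of the text is the
# final field of the docid.
docid = "TEST:doc1:simple:#{Digest::MD5.hexdigest("Some text")}"
corpus.add_docid("Some text", docid)

corpus.include?(docid)  #=> true
doc = corpus[docid]     # Corpus::Document built through Corpus#docid
```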
data/lib/rbbt/text/corpus/document.rb
ADDED
@@ -0,0 +1,361 @@
+require 'rbbt/text/segment'
+require 'rbbt/text/segment/segmented'
+require 'rbbt/tsv'
+require 'rbbt/resource/path'
+require 'rbbt/persist/tsv'
+require 'rbbt/util/misc'
+require 'rbbt/text/document'
+require 'json'
+
+class Corpus
+  class Document
+
+    class MultipleEntity < Exception; end
+
+    attr_accessor :text, :docid, :namespace, :id, :type, :hash, :segments, :segment_indices, :persist_dir, :global_persistence, :corpus
+
+    attr_accessor :multiple_result
+    def initialize(persist_dir = nil, docid = nil, text = nil, global_persistence = nil, corpus = nil)
+      @segments = {}
+      @segment_indices = {}
+      @corpus = corpus
+
+      if not persist_dir.nil?
+        @persist_dir = persist_dir
+        @persist_dir = Path.setup(@persist_dir) if not Path == @persist_dir
+      end
+
+      @global_persistence = global_persistence
+
+      if not docid.nil?
+        @docid = docid
+        update_docid
+      end
+      @text = text unless text.nil?
+    end
+
+    def update_docid
+      @namespace, @id, @type, @hash = docid.split(":", -1)
+    end
+
+    def docid=(docid)
+      @docid = docid
+      update_docid
+    end
+
+    def self.define(entity, &block)
+      send :define_method, "produce_#{entity}", &block
+
+      self.class_eval <<-EOC, __FILE__, __LINE__
+        def load_#{entity}(raw = false)
+          return if segments.include? "#{ entity }"
+          if self.respond_to?("load_with_persistence_#{entity}") and not @persist_dir.nil?
+            segments["#{entity}"] = load_with_persistence_#{entity}(raw)
+          else
+            segments["#{ entity }"] = produce_#{entity}
+          end
+        end
+
+        def #{entity}(raw = false)
+          begin
+            entities = segments["#{ entity }"]
+            if entities.nil?
+              load_#{entity}(raw)
+              entities = segments["#{ entity }"]
+            end
+          end
+
+          entities
+        end
+
+        def #{entity}_at(pos, persist = false)
+          segment_index("#{ entity }", persist ? File.join(@persist_dir, 'ranges') : nil)[pos]
+        end
+
+      EOC
+    end
+
+    def self.define_multiple(entity, &block)
+      send :define_method, "produce_#{entity}" do
+        return self.multiple_result[entity] if self.multiple_result && self.multiple_result[entity]
+        raise MultipleEntity, "Entity #{entity} runs with multiple documents, please prepare beforehand with prepare_multiple: #{self.docid}"
+      end
+
+      name = "multiple_produce_#{entity}"
+      class << self
+        self
+      end.send :define_method, name, &block
+
+      self.class_eval <<-EOC, __FILE__, __LINE__
+        def load_#{entity}(raw = false)
+          return if segments.include? "#{ entity }"
+          if self.respond_to?("load_with_persistence_#{entity}") and not @persist_dir.nil?
+            segments["#{entity}"] = load_with_persistence_#{entity}(raw)
+          else
+            segments["#{ entity }"] = produce_#{entity}
+          end
+        end
+
+        def #{entity}(raw = false)
+          begin
+            entities = segments["#{ entity }"]
+            if entities.nil?
+              load_#{entity}(raw)
+              entities = segments["#{ entity }"]
+            end
+          end
+
+          entities
+        end
+
+        def #{entity}_at(pos, persist = false)
+          segment_index("#{ entity }", persist ? File.join(@persist_dir, 'ranges') : nil)[pos]
+        end
+
+      EOC
+    end
+
+    def self.prepare_multiple(docs, entity)
+      missing = []
+      docs.each do |doc|
+        begin
+          doc.send(entity)
+        rescue MultipleEntity
+          missing << doc
+        end
+      end
+      res = self.send("multiple_produce_#{entity.to_s}", missing)
+      case res
+      when Array
+        res.each_with_index do |res,i|
+          missing[i].multiple_result ||= {}
+          missing[i].multiple_result[entity] = res
+        end
+      when Hash
+        res.each do |document,res|
+          case document
+          when Corpus::Document
+            document.multiple_result[entity] = res
+          when String
+            document = missing.select{|d| d.docid == document}.first
+            document.multiple_result[entity] = res
+          end
+        end
+      end
+      missing.each{|doc| doc.send entity }
+    end
+
+
+    #{{{ PERSISTENCE
+
+    TSV_REPOS = {}
+    FIELDS_FOR_ENTITY_PERSISTENCE = {}
+    def self.persist(entity, fields = nil)
+
+      if not fields.nil?
+        fields = [fields] if not Array === fields
+        fields = fields.collect{|f| f.to_s}
+        FIELDS_FOR_ENTITY_PERSISTENCE[entity.to_s] = fields
+      end
+
+      self.class_eval <<-EOC, __FILE__, __LINE__
+        def load_with_persistence_#{entity}(raw = false)
+          fields = FIELDS_FOR_ENTITY_PERSISTENCE["#{ entity }"]
+
+          tsv_file = File.join(@persist_dir.find, "#{ entity }")
+
+          return nil if raw == :check and File.exists? tsv_file
+
+          annotations = Persist.persist("Entity[#{ entity }]", :tsv, :file => tsv_file) do
+            segments = produce_#{entity}
+            tsv = Segment.tsv(segments, fields)
+          end
+
+          return annotations if raw
+
+          annotations.unnamed = true
+          annotations.collect{|id, annotation|
+            Segment.load_tsv_values(text, annotation, annotations.fields)
+          }
+        end
+      EOC
+    end
+
+    def self.persist_in_tsv(entity, tsv = nil, fields = nil)
+      tsv = TSV.setup(Persist.open_tokyocabinet(tsv, false, :list), :key => "ID", :fields => ["Start", "End", "JSON", "Document ID", "Entity Type"]).tap{|t| t.unnamed = true, t.close} if Path === tsv
+
+      if ! tsv.nil? && ! tsv.respond_to?(:keys)
+        fields = tsv
+        tsv = nil
+      end
+
+      TSV_REPOS[entity.to_s] = tsv
+
+      if ! fields.nil?
+        fields = [fields] if not Array === fields
+        fields = fields.collect{|f| f.to_s}
+        FIELDS_FOR_ENTITY_PERSISTENCE[entity.to_s] = fields unless fields.nil?
+      end
+
+      self.class_eval <<-EOC, __FILE__, __LINE__
+        def load_with_persistence_#{entity}(raw = false)
+          repo = TSV_REPOS["#{ entity }"]
+          if repo.nil?
+            raise "No persistence file or persistence dir for persist_in_tsv" if persist_dir.nil?
+            repo = Persist.open_tokyocabinet(persist_dir.annotations_by_type.find, true, :marshal_tsv)
+          end
+
+          fields = FIELDS_FOR_ENTITY_PERSISTENCE["#{ entity }"]
+          begin
+            if ! repo.include?("#{ entity }")
+              segments = produce_#{entity}
+              repo.write_and_read do
+                repo["#{entity}"] = Segment.tsv(segments, fields) if segments.any?
+              end
+            else
+              if raw == :check
+                repo.close
+                return nil
+              end
+            end
+
+            annotations = repo["#{entity}"]
+
+            repo.close
+
+            return annotations if raw
+
+            annotations.unnamed = true
+            annotations.collect{|id, annotation|
+              Segment.load_tsv_values(text, annotation, annotations.fields)
+            }
+          ensure
+            repo.close
+          end
+        end
+      EOC
+    end
+
+    def self.persist_in_global_tsv(entity, tsv = nil, fields = nil, doc_field = nil, entity_field = nil)
+      tsv = TSV.setup(Persist.open_tokyocabinet(tsv, false, :list), :key => "ID", :fields => (fields || ["Start", "End", "JSON", "Document ID", "Entity Type"])).tap{|t| t.unnamed = true, t.close} if Path === tsv
+
+      doc_field ||= "Document ID"
+      entity_field ||= "Entity Type"
+
+      TSV_REPOS[entity.to_s] = tsv
+
+      if not fields.nil?
+        fields = [fields] if not Array === fields
+        fields = fields.collect{|f| f.to_s}
+      else
+        fields = nil
+      end
+
+      FIELDS_FOR_ENTITY_PERSISTENCE[entity.to_s] = fields
+
+      self.class_eval <<-EOC, __FILE__, __LINE__
+        def load_with_persistence_#{entity}(raw = false)
+          fields = FIELDS_FOR_ENTITY_PERSISTENCE["#{ entity }"]
+
+          data = TSV_REPOS["#{ entity }"] || @global_persistence
+
+          begin
+
+            data.read true
+
+            fields = data.fields if fields.nil? and data.respond_to? :fields
+
+
+            if data.respond_to? :persistence_path and String === data.persistence_path
+              data.filter(data.persistence_path + '.filters')
+            end
+
+            data.add_filter("field:#{ doc_field }", @docid) if data.fields.include?("#{doc_field}")
+            data.add_filter("field:#{ entity_field }", "#{ entity }") if data.fields.include?("#{entity_field}")
+            keys = data.keys
+            data.pop_filter if data.fields.include?("#{entity_field}")
+            data.pop_filter if data.fields.include?("#{doc_field}")
+
+            if keys.empty?
+              segments = produce_#{entity}
+              segments << Segment.setup("No #{entity} found in document " + @docid.to_s, -1) if segments.empty?
+              tsv = Segment.tsv(segments, *fields.reject{|f| ["#{doc_field}", "#{entity_field}", "Start", "End", "annotation_types"].include? f})
+
+              tsv.add_field "#{ doc_field }" do
+                @docid
+              end
+
+              tsv.add_field "#{ entity_field }" do
+                "#{ entity }"
+              end
+
+              data.add_filter("field:#{ doc_field }", @docid) if data.fields.include?("#{doc_field}")
+              data.add_filter("field:#{ entity_field }", "#{ entity }") if data.fields.include?("#{entity_field}")
+              data.write true
+              keys = tsv.collect do |key, value|
+                data[key] = value
+                key
+              end
+              data.pop_filter if data.fields.include?("#{entity_field}")
+              data.pop_filter if data.fields.include?("#{doc_field}")
+              data.read
+
+            else
+              if raw == :check
+                data.close
+                return nil
+              end
+            end
+
+            return data.values if raw
+
+            start_pos = data.identify_field "Start"
+            segments = data.values_at(*keys).collect{|annotation|
+              pos = annotation[start_pos]
+              Segment.load_tsv_values(text, annotation, data.fields) unless [-1, "-1", [-1], ["-1"]].include? pos
+            }.compact
+            data.close
+
+            segments
+          ensure
+            data.close
+          end
+
+        end
+      EOC
+    end
+
+    def segment_index(name, persist_dir = nil)
+      @segment_indices[name] ||= Segment.index(self.send(name), persist_dir.nil? ? :memory : File.join(persist_dir, name + '.range'))
+    end
+
+    def load_into(segment, *annotations)
+      options = annotations.pop if Hash === annotations.last
+      options ||= {}
+
+      if options[:persist] and not @persist_dir.nil?
+        persist_dir = File.join(@persist_dir, 'ranges')
+      else
+        persist_dir = nil
+      end
+
+      Segmented.setup(segment, {})
+      annotations.collect do |name|
+        name = name.to_s
+        index = segment_index(name, persist_dir)
+        annotations = index[segment.range]
+        segment.segments[name] ||= {}
+        segment.segments[name] = annotations
+        class << segment
+          self
+        end.class_eval "def #{ name }; @segments['#{ name }']; end", __FILE__, __LINE__
+      end
+
+      segment
+    end
+
+    def entity
+      Object::Document.setup(self.docid, corpus)
+    end
+  end
+end
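The new Corpus::Document centralizes entity production and persistence: Corpus::Document.define declares how an entity type is produced from the document text, persist / persist_in_tsv / persist_in_global_tsv attach caching, and the generated accessors load or produce on demand. A minimal sketch, assuming a toy "genes" entity produced by a regular expression; the pattern and entity name are illustrative only.

```ruby
require 'rbbt/text/corpus/document'

# Declare a producer: `self` inside the block is the Corpus::Document,
# so `text` is the document text. Segment.setup(string, offset) is the
# same helper used inside document.rb above.
Corpus::Document.define :genes do
  text.to_enum(:scan, /\b[A-Z][A-Z0-9]{2,}\b/).collect do
    match = Regexp.last_match
    Segment.setup(match[0].dup, match.begin(0))
  end
end

# Cache the produced segments under the document's persist_dir.
Corpus::Document.persist :genes

# Typical use through a corpus:
#   doc = corpus[docid]
#   doc.genes          # produce (or load persisted) gene mentions
#   doc.genes_at(100)  # mentions covering character position 100
```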