rbbt-text 1.2.0 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70)
  1. checksums.yaml +4 -4
  2. data/lib/rbbt/document.rb +46 -0
  3. data/lib/rbbt/document/annotation.rb +42 -0
  4. data/lib/rbbt/document/corpus.rb +38 -0
  5. data/lib/rbbt/document/corpus/pubmed.rb +33 -0
  6. data/lib/rbbt/ner/NER.rb +3 -3
  7. data/lib/rbbt/ner/abner.rb +1 -1
  8. data/lib/rbbt/ner/banner.rb +1 -1
  9. data/lib/rbbt/ner/brat.rb +1 -1
  10. data/lib/rbbt/ner/chemical_tagger.rb +1 -2
  11. data/lib/rbbt/ner/g_norm_plus.rb +19 -2
  12. data/lib/rbbt/ner/linnaeus.rb +3 -3
  13. data/lib/rbbt/ner/ngram_prefix_dictionary.rb +3 -3
  14. data/lib/rbbt/ner/oscar3.rb +1 -2
  15. data/lib/rbbt/ner/oscar4.rb +3 -3
  16. data/lib/rbbt/ner/patterns.rb +6 -5
  17. data/lib/rbbt/ner/regexpNER.rb +1 -2
  18. data/lib/rbbt/ner/token_trieNER.rb +6 -6
  19. data/lib/rbbt/nlp/genia/sentence_splitter.rb +1 -1
  20. data/lib/rbbt/nlp/nlp.rb +5 -5
  21. data/lib/rbbt/nlp/open_nlp/sentence_splitter.rb +37 -36
  22. data/lib/rbbt/segment.rb +177 -0
  23. data/lib/rbbt/segment/annotation.rb +58 -0
  24. data/lib/rbbt/segment/encoding.rb +18 -0
  25. data/lib/rbbt/{text/segment → segment}/named_entity.rb +11 -11
  26. data/lib/rbbt/segment/overlaps.rb +63 -0
  27. data/lib/rbbt/segment/range_index.rb +35 -0
  28. data/lib/rbbt/{text/segment → segment}/segmented.rb +1 -1
  29. data/lib/rbbt/segment/token.rb +23 -0
  30. data/lib/rbbt/{text/segment → segment}/transformed.rb +7 -9
  31. data/lib/rbbt/segment/tsv.rb +41 -0
  32. data/share/install/software/Linnaeus +1 -1
  33. data/test/rbbt/document/corpus/test_pubmed.rb +15 -0
  34. data/test/rbbt/document/test_annotation.rb +140 -0
  35. data/test/rbbt/document/test_corpus.rb +33 -0
  36. data/test/rbbt/ner/test_finder.rb +3 -3
  37. data/test/rbbt/ner/test_g_norm_plus.rb +11 -1
  38. data/test/rbbt/ner/test_patterns.rb +9 -9
  39. data/test/rbbt/ner/test_regexpNER.rb +14 -14
  40. data/test/rbbt/ner/test_rnorm.rb +3 -4
  41. data/test/rbbt/ner/test_token_trieNER.rb +1 -0
  42. data/test/rbbt/nlp/genia/test_sentence_splitter.rb +13 -3
  43. data/test/rbbt/nlp/open_nlp/test_sentence_splitter.rb +20 -4
  44. data/test/rbbt/segment/test_annotation.rb +40 -0
  45. data/test/rbbt/segment/test_corpus.rb +36 -0
  46. data/test/rbbt/segment/test_encoding.rb +24 -0
  47. data/test/rbbt/{text/segment → segment}/test_named_entity.rb +12 -9
  48. data/test/rbbt/segment/test_overlaps.rb +69 -0
  49. data/test/rbbt/segment/test_range_index.rb +43 -0
  50. data/test/rbbt/{text/segment → segment}/test_transformed.rb +76 -51
  51. data/test/rbbt/test_document.rb +14 -0
  52. data/test/rbbt/test_segment.rb +187 -0
  53. data/test/test_helper.rb +5 -3
  54. metadata +40 -32
  55. data/lib/rbbt/text/corpus.rb +0 -106
  56. data/lib/rbbt/text/corpus/document.rb +0 -383
  57. data/lib/rbbt/text/corpus/document_repo.rb +0 -68
  58. data/lib/rbbt/text/corpus/sources/pmid.rb +0 -34
  59. data/lib/rbbt/text/document.rb +0 -39
  60. data/lib/rbbt/text/segment.rb +0 -363
  61. data/lib/rbbt/text/segment/docid.rb +0 -46
  62. data/lib/rbbt/text/segment/relationship.rb +0 -24
  63. data/lib/rbbt/text/segment/token.rb +0 -49
  64. data/test/rbbt/text/corpus/sources/test_pmid.rb +0 -33
  65. data/test/rbbt/text/corpus/test_document.rb +0 -82
  66. data/test/rbbt/text/segment/test_relationship.rb +0 -0
  67. data/test/rbbt/text/segment/test_segmented.rb +0 -23
  68. data/test/rbbt/text/test_corpus.rb +0 -34
  69. data/test/rbbt/text/test_document.rb +0 -58
  70. data/test/rbbt/text/test_segment.rb +0 -100
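The bulk of this release is a reorganization: the old rbbt/text tree is removed and its pieces resurface under rbbt/document and rbbt/segment. Judging only from the file moves listed above (a sketch, not an authoritative migration guide; the exact 1.3.0 paths may differ), the require paths change roughly as follows:

    # Rough mapping inferred from the file list above
    require 'rbbt/document'              # replaces rbbt/text/document
    require 'rbbt/document/corpus'       # replaces rbbt/text/corpus
    require 'rbbt/segment'               # replaces rbbt/text/segment
    require 'rbbt/segment/named_entity'  # moved from rbbt/text/segment/named_entity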
data/lib/rbbt/text/corpus/document.rb (deleted)
@@ -1,383 +0,0 @@
- require 'rbbt/text/segment'
- require 'rbbt/text/segment/segmented'
- require 'rbbt/text/segment/docid'
- require 'rbbt/tsv'
- require 'rbbt/resource/path'
- require 'rbbt/persist/tsv'
- require 'rbbt/util/misc'
- require 'rbbt/text/document'
- require 'json'
-
- class Corpus
-   class Document
-
-     class MultipleEntity < Exception; end
-
-     attr_accessor :text, :docid, :namespace, :id, :type, :hash, :segments, :segment_indices, :persist_dir, :global_persistence, :corpus
-
-     attr_accessor :multiple_result
-
-     def initialize(persist_dir = nil, docid = nil, text = nil, global_persistence = nil, corpus = nil)
-       @segments = {}
-       @segment_indices = {}
-       @corpus = corpus
-
-       if not persist_dir.nil?
-         @persist_dir = persist_dir
-         @persist_dir = Path.setup(@persist_dir) if not Path == @persist_dir
-       end
-
-       @global_persistence = global_persistence
-
-       if not docid.nil?
-         @docid = docid
-         update_docid
-       end
-       @text = text unless text.nil?
-     end
-
-     def update_docid
-       @namespace, @id, @type, @hash = docid.split(":", -1)
-     end
-
-     def docid=(docid)
-       @docid = docid
-       update_docid
-     end
-
-     def self.define(entity, &block)
-       send :define_method, "produce_#{entity}" do
-         segments = self.instance_exec &block
-
-         segments.each{|s| s.docid = docid }
-       end
-
-       self.class_eval <<-EOC, __FILE__, __LINE__ + 1
-         def load_#{entity}(raw = false)
-           return if segments.include? "#{ entity }"
-           if self.respond_to?("load_with_persistence_#{entity}") and not @persist_dir.nil?
-             entities = load_with_persistence_#{entity}(raw)
-           else
-             entities = produce_#{entity}
-           end
-
-           segments["#{ entity }"] = entities
-         end
-
-         def #{entity}(raw = false)
-           begin
-             entities = segments["#{ entity }"]
-             if entities.nil?
-               load_#{entity}(raw)
-               entities = segments["#{ entity }"]
-             end
-           end
-
-           entities
-         end
-
-         def #{entity}_at(pos, persist = false)
-           segment_index("#{ entity }", persist ? File.join(@persist_dir, 'ranges') : nil)[pos]
-         end
-
-       EOC
-     end
-
-     def self.define_multiple(entity, &block)
-       send :define_method, "produce_#{entity}" do
-         if self.multiple_result && self.multiple_result[entity]
-           segments = self.multiple_result[entity]
-           return segments.each{|s| s.docid = docid }
-         end
-         raise MultipleEntity, "Entity #{entity} runs with multiple documents, please prepare beforehand with prepare_multiple: #{self.docid}"
-       end
-
-       name = "multiple_produce_#{entity}"
-       class << self
-         self
-       end.send :define_method, name, &block
-
-       self.class_eval <<-EOC, __FILE__, __LINE__ + 1
-         def load_#{entity}(raw = false)
-           return if segments.include? "#{ entity }"
-           if self.respond_to?("load_with_persistence_#{entity}") and not @persist_dir.nil?
-             entities = load_with_persistence_#{entity}(raw)
-           else
-             entities = produce_#{entity}
-           end
-
-           segments["#{ entity }"] = entities
-         end
-
-         def #{entity}(raw = false)
-           begin
-             entities = segments["#{ entity }"]
-             if entities.nil?
-               load_#{entity}(raw)
-               entities = segments["#{ entity }"]
-             end
-           end
-
-           entities
-         end
-
-         def #{entity}_at(pos, persist = false)
-           segment_index("#{ entity }", persist ? File.join(@persist_dir, 'ranges') : nil)[pos]
-         end
-
-       EOC
-     end
-
-     def self.prepare_multiple(docs, entity)
-       missing = []
-       docs.each do |doc|
-         begin
-           doc.send(entity)
-         rescue MultipleEntity
-           missing << doc
-         end
-       end
-       res = self.send("multiple_produce_#{entity.to_s}", missing) if missing.any?
-       case res
-       when Array
-         res.each_with_index do |res,i|
-           missing[i].multiple_result ||= {}
-           missing[i].multiple_result[entity] = res
-         end
-       when Hash
-         res.each do |document,res|
-           case document
-           when Corpus::Document
-             document.multiple_result[entity] = res
-           when String
-             document = missing.select{|d| d.docid == document}.first
-             document.multiple_result[entity] = res
-           end
-         end
-       end
-       missing.each{|doc|
-         doc.send entity
-       }
-     end
-
-
-     #{{{ PERSISTENCE
-
-     TSV_REPOS = {}
-     FIELDS_FOR_ENTITY_PERSISTENCE = {}
-     def self.persist(entity, fields = nil)
-
-       if not fields.nil?
-         fields = [fields] if not Array === fields
-         fields = fields.collect{|f| f.to_s}
-         FIELDS_FOR_ENTITY_PERSISTENCE[entity.to_s] = fields
-       end
-
-       self.class_eval <<-EOC, __FILE__, __LINE__
-         def load_with_persistence_#{entity}(raw = false)
-           fields = FIELDS_FOR_ENTITY_PERSISTENCE["#{ entity }"]
-
-           tsv_file = File.join(@persist_dir.find, "#{ entity }")
-
-           return nil if raw == :check and File.exists? tsv_file
-
-           annotations = Persist.persist("Entity[#{ entity }]", :tsv, :file => tsv_file) do
-             segments = produce_#{entity}
-             tsv = Segment.tsv(segments, fields)
-           end
-
-           return annotations if raw
-
-           annotations.unnamed = true
-           annotations.collect{|id, annotation|
-             Segment.load_tsv_values(text, annotation, annotations.fields)
-           }
-         end
-       EOC
-     end
-
-     def self.persist_in_tsv(entity, tsv = nil, fields = nil)
-       tsv = TSV.setup(Persist.open_tokyocabinet(tsv, false, :list), :key => "ID", :fields => ["Start", "End", "JSON", "Document ID", "Entity Type"]).tap{|t| t.unnamed = true, t.close} if Path === tsv
-
-       if ! tsv.nil? && ! tsv.respond_to?(:keys)
-         fields = tsv
-         tsv = nil
-       end
-
-       TSV_REPOS[entity.to_s] = tsv
-
-       if ! fields.nil?
-         fields = [fields] if not Array === fields
-         fields = fields.collect{|f| f.to_s}
-         FIELDS_FOR_ENTITY_PERSISTENCE[entity.to_s] = fields unless fields.nil?
-       end
-
-       self.class_eval <<-EOC, __FILE__, __LINE__ + 1
-         def load_with_persistence_#{entity}(raw = false)
-           repo = TSV_REPOS["#{ entity }"]
-           if repo.nil?
-             raise "No persistence file or persistence dir for persist_in_tsv" if persist_dir.nil?
-             repo = Persist.open_tokyocabinet(persist_dir.annotations_by_type.find, true, :marshal_tsv)
-           end
-
-           fields = FIELDS_FOR_ENTITY_PERSISTENCE["#{ entity }"]
-           begin
-             if ! repo.include?("#{ entity }")
-               segments = produce_#{entity}
-               repo.write_and_read do
-                 repo["#{entity}"] = Segment.tsv(segments, fields) if segments.any?
-               end
-             else
-               if raw == :check
-                 repo.close
-                 return nil
-               end
-             end
-
-             annotations = repo["#{entity}"]
-
-             repo.close
-
-             return annotations if raw
-
-             annotations.unnamed = true
-             annotations.collect{|id, annotation|
-               Segment.load_tsv_values(text, annotation, annotations.fields)
-             }
-           ensure
-             repo.close
-           end
-         end
-       EOC
-     end
-
-     def self.persist_in_global_tsv(entity, tsv = nil, fields = nil, doc_field = nil, entity_field = nil)
-       tsv = TSV.setup(Persist.open_tokyocabinet(tsv, false, :list), :key => "ID", :fields => (fields || ["Start", "End", "JSON", "Document ID", "Entity Type"])).tap{|t| t.unnamed = true, t.close} if Path === tsv
-
-       doc_field ||= "Document ID"
-       entity_field ||= "Entity Type"
-
-       TSV_REPOS[entity.to_s] = tsv
-
-       if not fields.nil?
-         fields = [fields] if not Array === fields
-         fields = fields.collect{|f| f.to_s}
-       else
-         fields = nil
-       end
-
-       FIELDS_FOR_ENTITY_PERSISTENCE[entity.to_s] = fields
-
-       self.class_eval <<-EOC, __FILE__, __LINE__ + 1
-         def load_with_persistence_#{entity}(raw = false)
-           fields = FIELDS_FOR_ENTITY_PERSISTENCE["#{ entity }"]
-
-           data = TSV_REPOS["#{ entity }"] || @global_persistence
-
-           begin
-
-             if data.respond_to? :persistence_path and String === data.persistence_path
-               data.filter(data.persistence_path + '.filters')
-             end
-
-             keys = data.read_and_close do
-
-               fields = data.fields if fields.nil? and data.respond_to? :fields
-
-               data.add_filter("field:#{ doc_field }", @docid) if fields.include?("#{doc_field}")
-               data.add_filter("field:#{ entity_field }", "#{ entity }") if fields.include?("#{entity_field}")
-               keys = data.keys
-               data.pop_filter if fields.include?("#{entity_field}")
-               data.pop_filter if fields.include?("#{doc_field}")
-
-               keys
-             end
-
-
-             if keys.empty?
-               segments = produce_#{entity}
-               segments << Segment.setup("No #{entity} found in document " + @docid.to_s, -1) if segments.empty?
-               tsv = Segment.tsv(segments, *fields.reject{|f| ["#{doc_field}", "#{entity_field}", "Start", "End", "annotation_types"].include? f})
-
-               tsv.add_field "#{ doc_field }" do
-                 @docid
-               end
-
-               tsv.add_field "#{ entity_field }" do
-                 "#{ entity }"
-               end
-
-               keys = data.write_and_close do
-                 data.add_filter("field:#{ doc_field }", @docid) if fields.include?("#{doc_field}")
-                 data.add_filter("field:#{ entity_field }", "#{ entity }") if fields.include?("#{entity_field}")
-                 keys = tsv.collect do |key, value|
-                   data[key] = value
-                   key
-                 end
-                 data.pop_filter if fields.include?("#{entity_field}")
-                 data.pop_filter if fields.include?("#{doc_field}")
-                 keys
-               end
-
-             else
-               return nil if raw == :check
-             end
-
-             return data.values if raw
-
-             start_pos = data.identify_field "Start"
-             data.read_and_close do
-               data.chunked_values_at(keys).collect{|annotation|
-                 begin
-                   pos = annotation[start_pos]
-                   Segment.load_tsv_values(text, annotation, fields) unless [-1, "-1", [-1], ["-1"]].include?(pos)
-                 rescue
-                   Log.exception $!
-                   iif keys
-                   iif [text, annotation]
-                 end
-
-               }.compact
-             end
-           ensure
-             data.close
-           end
-
-         end
-       EOC
-     end
-
-     def segment_index(name, persist_dir = nil)
-       @segment_indices[name] ||= Segment.index(self.send(name), persist_dir.nil? ? :memory : File.join(persist_dir, name + '.range'))
-     end
-
-     def load_into(segment, *annotations)
-       options = annotations.pop if Hash === annotations.last
-       options ||= {}
-
-       if options[:persist] and not @persist_dir.nil?
-         persist_dir = File.join(@persist_dir, 'ranges')
-       else
-         persist_dir = nil
-       end
-
-       Segmented.setup(segment, {})
-       annotations.collect do |name|
-         name = name.to_s
-         index = segment_index(name, persist_dir)
-         annotations = index[segment.range]
-         segment.segments[name] ||= {}
-         segment.segments[name] = annotations
-         class << segment
-           self
-         end.class_eval "def #{ name }; @segments['#{ name }']; end", __FILE__, __LINE__ + 1
-       end
-
-       segment
-     end
-
-     def entity
-       Object::Document.setup(self.docid, corpus)
-     end
-   end
- end
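The class removed above built its API through metaprogramming: define generated a produce_<entity> method from the given block plus lazy load_<entity>, <entity> and <entity>_at accessors that cache results in @segments, while the persist* variants swapped in TSV-backed loaders. The following stripped-down, dependency-free sketch illustrates only the caching pattern; MiniDocument and the :words entity are invented for illustration and are not part of rbbt-text:

    # Minimal illustration of the define/produce/load caching pattern used
    # by the removed Corpus::Document class; not rbbt code.
    class MiniDocument
      attr_accessor :text, :segments

      def initialize(text)
        @text = text
        @segments = {}
      end

      def self.define(entity, &block)
        # the production block runs in the instance, as with instance_exec above
        define_method("produce_#{entity}") { instance_exec(&block) }

        # produce once, then cache under the entity name
        define_method("load_#{entity}") do
          @segments[entity.to_s] ||= send("produce_#{entity}")
        end

        define_method(entity) { @segments[entity.to_s] || send("load_#{entity}") }
      end

      define :words do
        text.split(/\s+/)
      end
    end

    MiniDocument.new("one two three").words  # => ["one", "two", "three"]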
data/lib/rbbt/text/corpus/document_repo.rb (deleted)
@@ -1,68 +0,0 @@
- require 'rbbt/util/misc'
- require 'tokyocabinet'
-
- class Corpus
-   module DocumentRepo
-     class OpenError < StandardError;end
-     class KeyFormatError < StandardError;end
-
-     TC_CONNECTIONS = {}
-     def self.open_tokyocabinet(path, write)
-       database = Persist.open_tokyocabinet(path, write, :single, TokyoCabinet::BDB)
-       database.extend DocumentRepo
-       database
-     end
-
-     def docid2fields(docid)
-       docid.split(":", -1).values_at 0,1,2,3
-     end
-
-     def fields2docid(namespace = nil, id = nil, type = nil, hash = nil)
-       [namespace, id, type, hash] * ":"
-     end
-
-     def docid(docid)
-       get(docid)
-     end
-
-     def add(text, namespace, id, type, hash)
-       docid = fields2docid(namespace, id, type, hash)
-
-       return docid if self.include?(docid)
-
-       write_and_close do
-         self[docid] = text
-       end
-
-       docid
-     end
-
-     def find(namespace=nil, id = nil, type = nil, hash = nil)
-       case
-       when namespace.nil?
-         self.keys
-       when id.nil?
-         range_start = [namespace] * ":" + ':'
-         range_end = [namespace] * ":" + ';'
-         self.range(range_start, true, range_end, false)
-       when (type and hash)
-         [[namespace, id, type, hash] * ":"]
-       when hash
-         [[namespace, id, "", hash] * ":"]
-       when type
-         range_start = [namespace, id, type] * ":" + ':'
-         range_end = [namespace, id, type] * ":" + ';'
-         self.range(range_start, true, range_end, false)
-       else
-         range_start = [namespace, id] * ":" + ':'
-         range_end = [namespace, id] * ":" + ';'
-         self.range(range_start, true, range_end, false)
-       end
-     end
-
-     def find_docid(docid)
-       find(*docid2fields(docid))
-     end
-
-   end
- end
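One detail of the removed DocumentRepo worth noting: document ids are colon-joined ("namespace:id:type:hash"), and find retrieves all documents matching a partial id with a TokyoCabinet B-tree range query, relying on ';' being the ASCII character immediately after ':'. A plain-Ruby illustration of that range trick follows (the PMID keys are made up, not taken from the gem):

    # Not rbbt code: shows why [prefix + ':', prefix + ';') selects exactly
    # the keys that start with "prefix:" in a lexically sorted key space.
    keys = [
      "PMID:10022975:abstract:",
      "PMID:10022975:fulltext:",
      "PMID:31337:abstract:",
    ].sort

    prefix      = "PMID:10022975"
    range_start = prefix + ':'   # inclusive lower bound
    range_end   = prefix + ';'   # exclusive upper bound

    keys.select { |k| k >= range_start && k < range_end }
    # => ["PMID:10022975:abstract:", "PMID:10022975:fulltext:"]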