rbbt-text 1.2.0 → 1.3.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (70)
  1. checksums.yaml +4 -4
  2. data/lib/rbbt/document.rb +46 -0
  3. data/lib/rbbt/document/annotation.rb +42 -0
  4. data/lib/rbbt/document/corpus.rb +38 -0
  5. data/lib/rbbt/document/corpus/pubmed.rb +33 -0
  6. data/lib/rbbt/ner/NER.rb +3 -3
  7. data/lib/rbbt/ner/abner.rb +1 -1
  8. data/lib/rbbt/ner/banner.rb +1 -1
  9. data/lib/rbbt/ner/brat.rb +1 -1
  10. data/lib/rbbt/ner/chemical_tagger.rb +1 -2
  11. data/lib/rbbt/ner/g_norm_plus.rb +19 -2
  12. data/lib/rbbt/ner/linnaeus.rb +3 -3
  13. data/lib/rbbt/ner/ngram_prefix_dictionary.rb +3 -3
  14. data/lib/rbbt/ner/oscar3.rb +1 -2
  15. data/lib/rbbt/ner/oscar4.rb +3 -3
  16. data/lib/rbbt/ner/patterns.rb +6 -5
  17. data/lib/rbbt/ner/regexpNER.rb +1 -2
  18. data/lib/rbbt/ner/token_trieNER.rb +6 -6
  19. data/lib/rbbt/nlp/genia/sentence_splitter.rb +1 -1
  20. data/lib/rbbt/nlp/nlp.rb +5 -5
  21. data/lib/rbbt/nlp/open_nlp/sentence_splitter.rb +37 -36
  22. data/lib/rbbt/segment.rb +177 -0
  23. data/lib/rbbt/segment/annotation.rb +58 -0
  24. data/lib/rbbt/segment/encoding.rb +18 -0
  25. data/lib/rbbt/{text/segment → segment}/named_entity.rb +11 -11
  26. data/lib/rbbt/segment/overlaps.rb +63 -0
  27. data/lib/rbbt/segment/range_index.rb +35 -0
  28. data/lib/rbbt/{text/segment → segment}/segmented.rb +1 -1
  29. data/lib/rbbt/segment/token.rb +23 -0
  30. data/lib/rbbt/{text/segment → segment}/transformed.rb +7 -9
  31. data/lib/rbbt/segment/tsv.rb +41 -0
  32. data/share/install/software/Linnaeus +1 -1
  33. data/test/rbbt/document/corpus/test_pubmed.rb +15 -0
  34. data/test/rbbt/document/test_annotation.rb +140 -0
  35. data/test/rbbt/document/test_corpus.rb +33 -0
  36. data/test/rbbt/ner/test_finder.rb +3 -3
  37. data/test/rbbt/ner/test_g_norm_plus.rb +11 -1
  38. data/test/rbbt/ner/test_patterns.rb +9 -9
  39. data/test/rbbt/ner/test_regexpNER.rb +14 -14
  40. data/test/rbbt/ner/test_rnorm.rb +3 -4
  41. data/test/rbbt/ner/test_token_trieNER.rb +1 -0
  42. data/test/rbbt/nlp/genia/test_sentence_splitter.rb +13 -3
  43. data/test/rbbt/nlp/open_nlp/test_sentence_splitter.rb +20 -4
  44. data/test/rbbt/segment/test_annotation.rb +40 -0
  45. data/test/rbbt/segment/test_corpus.rb +36 -0
  46. data/test/rbbt/segment/test_encoding.rb +24 -0
  47. data/test/rbbt/{text/segment → segment}/test_named_entity.rb +12 -9
  48. data/test/rbbt/segment/test_overlaps.rb +69 -0
  49. data/test/rbbt/segment/test_range_index.rb +43 -0
  50. data/test/rbbt/{text/segment → segment}/test_transformed.rb +76 -51
  51. data/test/rbbt/test_document.rb +14 -0
  52. data/test/rbbt/test_segment.rb +187 -0
  53. data/test/test_helper.rb +5 -3
  54. metadata +40 -32
  55. data/lib/rbbt/text/corpus.rb +0 -106
  56. data/lib/rbbt/text/corpus/document.rb +0 -383
  57. data/lib/rbbt/text/corpus/document_repo.rb +0 -68
  58. data/lib/rbbt/text/corpus/sources/pmid.rb +0 -34
  59. data/lib/rbbt/text/document.rb +0 -39
  60. data/lib/rbbt/text/segment.rb +0 -363
  61. data/lib/rbbt/text/segment/docid.rb +0 -46
  62. data/lib/rbbt/text/segment/relationship.rb +0 -24
  63. data/lib/rbbt/text/segment/token.rb +0 -49
  64. data/test/rbbt/text/corpus/sources/test_pmid.rb +0 -33
  65. data/test/rbbt/text/corpus/test_document.rb +0 -82
  66. data/test/rbbt/text/segment/test_relationship.rb +0 -0
  67. data/test/rbbt/text/segment/test_segmented.rb +0 -23
  68. data/test/rbbt/text/test_corpus.rb +0 -34
  69. data/test/rbbt/text/test_document.rb +0 -58
  70. data/test/rbbt/text/test_segment.rb +0 -100
@@ -1,383 +0,0 @@
1
- require 'rbbt/text/segment'
2
- require 'rbbt/text/segment/segmented'
3
- require 'rbbt/text/segment/docid'
4
- require 'rbbt/tsv'
5
- require 'rbbt/resource/path'
6
- require 'rbbt/persist/tsv'
7
- require 'rbbt/util/misc'
8
- require 'rbbt/text/document'
9
- require 'json'
10
-
11
class Corpus
  # A single document of a corpus together with the entity annotations
  # ("segments") computed over its text.
  #
  # Entities are registered at class level with Document.define or
  # Document.define_multiple; each registration generates the instance
  # methods <entity>, load_<entity>, produce_<entity> and <entity>_at.
  # Results are cached per instance in #segments, keyed by entity name
  # (String). Document.persist, Document.persist_in_tsv and
  # Document.persist_in_global_tsv additionally generate a
  # load_with_persistence_<entity> method that the loader prefers whenever
  # the document has a persist_dir.
  class Document

    # Raised by produce_<entity> for entities registered with
    # define_multiple when no batch result has been stored for the document.
    #
    # NOTE(review): this inherits from Exception rather than StandardError,
    # so a bare `rescue` does not catch it. Left unchanged because callers
    # (and possibly the persistence layer) may depend on that.
    class MultipleEntity < Exception; end

    # NOTE(review): the :hash accessor shadows Object#hash with the fourth
    # docid component; instances are therefore probably unsafe to use as
    # Hash keys. Kept because it is part of the public interface.
    attr_accessor :text, :namespace, :id, :type, :hash, :segments, :segment_indices, :persist_dir, :global_persistence, :corpus

    # The writer is defined by hand below so that the docid components stay
    # in sync with the docid.
    attr_reader :docid

    # Per-entity results stored by Document.prepare_multiple
    # (entity => segments), or nil when nothing has been prepared.
    attr_accessor :multiple_result

    # persist_dir        - directory for persisted annotations; turned into
    #                      a Path unless it already is one.
    # docid              - "namespace:id:type:hash" identifier.
    # text               - the document text.
    # global_persistence - fallback store used by persist_in_global_tsv
    #                      entities that have no repository of their own.
    # corpus             - corpus handed to Object::Document.setup in #entity.
    def initialize(persist_dir = nil, docid = nil, text = nil, global_persistence = nil, corpus = nil)
      @segments = {}
      @segment_indices = {}
      @corpus = corpus

      unless persist_dir.nil?
        # `Path === x` (not `Path == x`) is the membership test; the latter
        # is always false and forced a second Path.setup on Path values.
        @persist_dir = Path === persist_dir ? persist_dir : Path.setup(persist_dir)
      end

      @global_persistence = global_persistence

      unless docid.nil?
        @docid = docid
        update_docid
      end
      @text = text unless text.nil?
    end

    # Splits the docid on ":" into namespace, id, type and hash. The -1
    # limit keeps trailing empty components as "" instead of dropping them.
    def update_docid
      @namespace, @id, @type, @hash = docid.split(":", -1)
    end

    # Sets the docid and refreshes the components derived from it.
    def docid=(docid)
      @docid = docid
      update_docid
    end

    # Registers +entity+. The block is evaluated in the context of the
    # document and must return the segments; each of them is tagged with
    # the document's docid.
    def self.define(entity, &block)
      define_method("produce_#{entity}") do
        produced = instance_exec(&block)
        produced.each { |s| s.docid = docid }
      end

      define_entity_accessors(entity)
    end

    # Registers +entity+ as one that is computed for several documents at
    # once. The block becomes the class method multiple_produce_<entity>,
    # which receives the list of documents. produce_<entity> only hands out
    # results previously stored by prepare_multiple and raises
    # MultipleEntity when there are none.
    def self.define_multiple(entity, &block)
      define_method("produce_#{entity}") do
        prepared = multiple_result && multiple_result[entity]
        unless prepared
          raise MultipleEntity, "Entity #{entity} runs with multiple documents, please prepare beforehand with prepare_multiple: #{self.docid}"
        end
        prepared.each { |s| s.docid = docid }
      end

      define_singleton_method("multiple_produce_#{entity}", &block)

      define_entity_accessors(entity)
    end

    # Generates the instance methods shared by define and define_multiple:
    #   load_<entity>(raw) - fills segments[<entity>] once, through
    #                        load_with_persistence_<entity> when that method
    #                        exists and a persist_dir is set, otherwise
    #                        through produce_<entity>.
    #   <entity>(raw)      - cached segments, loading them on first use.
    #   <entity>_at(pos, persist) - lookup in the entity's range index.
    def self.define_entity_accessors(entity)
      class_eval <<-EOC, __FILE__, __LINE__ + 1
        def load_#{entity}(raw = false)
          return if segments.include? "#{ entity }"
          if self.respond_to?("load_with_persistence_#{entity}") and not @persist_dir.nil?
            entities = load_with_persistence_#{entity}(raw)
          else
            entities = produce_#{entity}
          end

          segments["#{ entity }"] = entities
        end

        def #{entity}(raw = false)
          load_#{entity}(raw) if segments["#{ entity }"].nil?
          segments["#{ entity }"]
        end

        def #{entity}_at(pos, persist = false)
          segment_index("#{ entity }", persist ? File.join(@persist_dir, 'ranges') : nil)[pos]
        end
      EOC
    end
    private_class_method :define_entity_accessors

    # Wraps a single field in an Array and converts every field to String.
    def self.normalize_fields(fields)
      fields = [fields] unless Array === fields
      fields.collect { |f| f.to_s }
    end
    private_class_method :normalize_fields

    # Computes a define_multiple entity for all of +docs+ in one batch.
    #
    # Documents whose entity cannot be obtained yet (MultipleEntity) are
    # passed together to multiple_produce_<entity>. Its result is either an
    # Array aligned with those documents, or a Hash keyed by Document or by
    # docid String. Results are stored in each document's multiple_result
    # and the entity is then loaded for every pending document.
    # Returns the list of documents that had to be computed.
    def self.prepare_multiple(docs, entity)
      missing = []
      docs.each do |doc|
        begin
          doc.send(entity)
        rescue MultipleEntity
          missing << doc
        end
      end

      res = send("multiple_produce_#{entity}", missing) if missing.any?

      case res
      when Array
        res.each_with_index do |doc_res, i|
          missing[i].multiple_result ||= {}
          missing[i].multiple_result[entity] = doc_res
        end
      when Hash
        res.each do |key, doc_res|
          document = case key
                     when Corpus::Document then key
                     when String then missing.find { |d| d.docid == key }
                     end
          # Unknown keys are skipped; the affected document raises
          # MultipleEntity below instead of failing here on nil.
          next if document.nil?

          # Same lazy initialisation as the Array branch; without it a
          # fresh document fails on its nil multiple_result.
          document.multiple_result ||= {}
          document.multiple_result[entity] = doc_res
        end
      end

      missing.each { |doc| doc.send entity }
    end


    #{{{ PERSISTENCE

    # Registries filled at class-definition time, keyed by entity name
    # (String): the repository and the fields used to persist each entity.
    TSV_REPOS = {}
    FIELDS_FOR_ENTITY_PERSISTENCE = {}

    # Persists +entity+ as one TSV file named after the entity inside the
    # document's persist_dir, going through Persist.persist.
    # With raw == :check the generated loader returns nil when the file
    # already exists; with any other truthy raw it returns the TSV itself
    # instead of the segments rebuilt from it.
    def self.persist(entity, fields = nil)
      FIELDS_FOR_ENTITY_PERSISTENCE[entity.to_s] = normalize_fields(fields) unless fields.nil?

      class_eval <<-EOC, __FILE__, __LINE__ + 1
        def load_with_persistence_#{entity}(raw = false)
          fields = FIELDS_FOR_ENTITY_PERSISTENCE["#{ entity }"]

          tsv_file = File.join(@persist_dir.find, "#{ entity }")

          return nil if raw == :check and File.exist?(tsv_file)

          annotations = Persist.persist("Entity[#{ entity }]", :tsv, :file => tsv_file) do
            Segment.tsv(produce_#{entity}, fields)
          end

          return annotations if raw

          annotations.unnamed = true
          annotations.collect{|id, annotation|
            Segment.load_tsv_values(text, annotation, annotations.fields)
          }
        end
      EOC
    end

    # Persists +entity+ as one entry (keyed by entity name) of a
    # repository. +tsv+ may be a Path (opened as a TokyoCabinet-backed
    # TSV), an object responding to #keys (used as the repository), or the
    # field list when no repository is given. Without a repository the
    # generated loader opens persist_dir.annotations_by_type.
    def self.persist_in_tsv(entity, tsv = nil, fields = nil)
      if Path === tsv
        tsv = TSV.setup(Persist.open_tokyocabinet(tsv, false, :list), :key => "ID", :fields => ["Start", "End", "JSON", "Document ID", "Entity Type"]).tap do |t|
          # Two statements: the original `t.unnamed = true, t.close`
          # assigned the Array [true, t.close] to unnamed.
          t.unnamed = true
          t.close
        end
      end

      # A second argument that is not a repository is the field list.
      if !tsv.nil? && !tsv.respond_to?(:keys)
        fields = tsv
        tsv = nil
      end

      TSV_REPOS[entity.to_s] = tsv

      FIELDS_FOR_ENTITY_PERSISTENCE[entity.to_s] = normalize_fields(fields) unless fields.nil?

      class_eval <<-EOC, __FILE__, __LINE__ + 1
        def load_with_persistence_#{entity}(raw = false)
          repo = TSV_REPOS["#{ entity }"]
          if repo.nil?
            raise "No persistence file or persistence dir for persist_in_tsv" if persist_dir.nil?
            repo = Persist.open_tokyocabinet(persist_dir.annotations_by_type.find, true, :marshal_tsv)
          end

          fields = FIELDS_FOR_ENTITY_PERSISTENCE["#{ entity }"]
          begin
            if ! repo.include?("#{ entity }")
              segments = produce_#{entity}
              repo.write_and_read do
                repo["#{entity}"] = Segment.tsv(segments, fields) if segments.any?
              end
            else
              if raw == :check
                repo.close
                return nil
              end
            end

            annotations = repo["#{entity}"]

            repo.close

            return annotations if raw

            annotations.unnamed = true
            annotations.collect{|id, annotation|
              Segment.load_tsv_values(text, annotation, annotations.fields)
            }
          ensure
            repo.close
          end
        end
      EOC
    end

    # Persists +entity+ as rows of a table shared by documents and entity
    # types; rows are told apart through the +doc_field+ and +entity_field+
    # columns (default "Document ID" and "Entity Type"). When no repository
    # is registered for the entity the document's global_persistence is
    # used. If nothing is produced, a placeholder segment at position -1 is
    # stored and later filtered out when the segments are rebuilt.
    def self.persist_in_global_tsv(entity, tsv = nil, fields = nil, doc_field = nil, entity_field = nil)
      if Path === tsv
        tsv = TSV.setup(Persist.open_tokyocabinet(tsv, false, :list), :key => "ID", :fields => (fields || ["Start", "End", "JSON", "Document ID", "Entity Type"])).tap do |t|
          # Two statements: the original `t.unnamed = true, t.close`
          # assigned the Array [true, t.close] to unnamed.
          t.unnamed = true
          t.close
        end
      end

      doc_field ||= "Document ID"
      entity_field ||= "Entity Type"

      TSV_REPOS[entity.to_s] = tsv

      # Unlike the other persist variants, a nil field list is recorded too
      # (the loader then falls back to the fields of the repository).
      FIELDS_FOR_ENTITY_PERSISTENCE[entity.to_s] = fields.nil? ? nil : normalize_fields(fields)

      class_eval <<-EOC, __FILE__, __LINE__ + 1
        def load_with_persistence_#{entity}(raw = false)
          fields = FIELDS_FOR_ENTITY_PERSISTENCE["#{ entity }"]

          data = TSV_REPOS["#{ entity }"] || @global_persistence

          begin

            if data.respond_to? :persistence_path and String === data.persistence_path
              data.filter(data.persistence_path + '.filters')
            end

            keys = data.read_and_close do

              fields = data.fields if fields.nil? and data.respond_to? :fields

              data.add_filter("field:#{ doc_field }", @docid) if fields.include?("#{doc_field}")
              data.add_filter("field:#{ entity_field }", "#{ entity }") if fields.include?("#{entity_field}")
              keys = data.keys
              data.pop_filter if fields.include?("#{entity_field}")
              data.pop_filter if fields.include?("#{doc_field}")

              keys
            end


            if keys.empty?
              segments = produce_#{entity}
              segments << Segment.setup("No #{entity} found in document " + @docid.to_s, -1) if segments.empty?
              tsv = Segment.tsv(segments, *fields.reject{|f| ["#{doc_field}", "#{entity_field}", "Start", "End", "annotation_types"].include? f})

              tsv.add_field "#{ doc_field }" do
                @docid
              end

              tsv.add_field "#{ entity_field }" do
                "#{ entity }"
              end

              keys = data.write_and_close do
                data.add_filter("field:#{ doc_field }", @docid) if fields.include?("#{doc_field}")
                data.add_filter("field:#{ entity_field }", "#{ entity }") if fields.include?("#{entity_field}")
                keys = tsv.collect do |key, value|
                  data[key] = value
                  key
                end
                data.pop_filter if fields.include?("#{entity_field}")
                data.pop_filter if fields.include?("#{doc_field}")
                keys
              end

            else
              return nil if raw == :check
            end

            return data.values if raw

            start_pos = data.identify_field "Start"
            data.read_and_close do
              data.chunked_values_at(keys).collect{|annotation|
                begin
                  pos = annotation[start_pos]
                  Segment.load_tsv_values(text, annotation, fields) unless [-1, "-1", [-1], ["-1"]].include?(pos)
                rescue
                  Log.exception $!
                  iif keys
                  iif [text, annotation]
                end

              }.compact
            end
          ensure
            data.close
          end

        end
      EOC
    end

    # Range index over the segments of entity +name+, memoised per entity.
    # Built in memory unless +persist_dir+ is given, in which case it is
    # backed by the file "<name>.range" inside that directory.
    def segment_index(name, persist_dir = nil)
      @segment_indices[name] ||= Segment.index(self.send(name), persist_dir.nil? ? :memory : File.join(persist_dir, name + '.range'))
    end

    # Attaches to +segment+ the annotations of the named entities that the
    # range index returns for segment.range, and defines a reader named
    # after each entity on the segment itself. A trailing Hash is taken as
    # options; :persist => true keeps the indices under "ranges" inside the
    # document's persist_dir when there is one. Returns +segment+.
    def load_into(segment, *annotations)
      options = annotations.pop if Hash === annotations.last
      options ||= {}

      persist_dir = if options[:persist] and not @persist_dir.nil?
                      File.join(@persist_dir, 'ranges')
                    end

      Segmented.setup(segment, {})
      annotations.each do |name|
        name = name.to_s
        overlapping = segment_index(name, persist_dir)[segment.range]
        segment.segments[name] = overlapping
        segment.singleton_class.class_eval "def #{ name }; @segments['#{ name }']; end", __FILE__, __LINE__
      end

      segment
    end

    # The docid set up as an Object::Document bound to the corpus.
    def entity
      Object::Document.setup(self.docid, corpus)
    end
  end
end
@@ -1,68 +0,0 @@
1
- require 'rbbt/util/misc'
2
- require 'tokyocabinet'
3
-
4
class Corpus
  # Behaviour mixed into a key/value store that maps docids of the form
  # "namespace:id:type:hash" to document text. The including object is
  # expected to provide #get, #keys, #include?, #[]=, #write_and_close and
  # #range(start, include_start, end, include_end).
  module DocumentRepo
    class OpenError < StandardError; end
    class KeyFormatError < StandardError; end

    TC_CONNECTIONS = {}

    # Opens the database at +path+ through Persist.open_tokyocabinet and
    # extends the returned object with this module.
    def self.open_tokyocabinet(path, write)
      database = Persist.open_tokyocabinet(path, write, :single, TokyoCabinet::BDB)
      database.extend DocumentRepo
      database
    end

    # Splits a docid into [namespace, id, type, hash]. Components missing
    # from the docid come back as nil, empty ones as "".
    def docid2fields(docid)
      docid.split(":", -1).values_at(0, 1, 2, 3)
    end

    # Joins the four components with ":"; nil components become empty.
    def fields2docid(namespace = nil, id = nil, type = nil, hash = nil)
      [namespace, id, type, hash] * ":"
    end

    # Value stored under +docid+.
    def docid(docid)
      get(docid)
    end

    # Stores +text+ under the docid built from the four components, unless
    # that docid is already present. Returns the docid either way.
    def add(text, namespace, id, type, hash)
      docid = fields2docid(namespace, id, type, hash)

      return docid if self.include?(docid)

      write_and_close do
        self[docid] = text
      end

      docid
    end

    # Docids matching the given components:
    #   no namespace          -> every key
    #   namespace only        -> keys starting with "namespace:"
    #   type and hash         -> the one fully specified docid
    #   hash without type     -> the docid with an empty type
    #   type without hash     -> keys starting with "namespace:id:type:"
    #   namespace and id only -> keys starting with "namespace:id:"
    # The two fully specified cases build the docid without checking that
    # it exists in the store.
    def find(namespace = nil, id = nil, type = nil, hash = nil)
      return self.keys if namespace.nil?
      return docid_prefix_range(namespace) if id.nil?
      return [[namespace, id, type, hash] * ":"] if type and hash
      return [[namespace, id, "", hash] * ":"] if hash
      return docid_prefix_range(namespace, id, type) if type

      docid_prefix_range(namespace, id)
    end

    # Same as #find, taking a (possibly partial) docid.
    def find_docid(docid)
      find(*docid2fields(docid))
    end

    # Range scan from "<prefix>:" (inclusive) to "<prefix>;" (exclusive),
    # where the prefix is the given components joined with ":". ";" is the
    # character right after ":", so the bounds enclose exactly the keys
    # that start with "<prefix>:".
    def docid_prefix_range(*parts)
      prefix = parts * ":"
      self.range(prefix + ':', true, prefix + ';', false)
    end
    private :docid_prefix_range

  end
end