rbbt-text 1.1.9 → 1.3.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76)
  1. checksums.yaml +4 -4
  2. data/lib/rbbt/bow/bow.rb +5 -2
  3. data/lib/rbbt/bow/dictionary.rb +27 -23
  4. data/lib/rbbt/document.rb +56 -0
  5. data/lib/rbbt/document/annotation.rb +45 -0
  6. data/lib/rbbt/document/corpus.rb +61 -0
  7. data/lib/rbbt/document/corpus/pubmed.rb +33 -0
  8. data/lib/rbbt/ner/NER.rb +3 -3
  9. data/lib/rbbt/ner/abner.rb +1 -1
  10. data/lib/rbbt/ner/banner.rb +1 -1
  11. data/lib/rbbt/ner/brat.rb +1 -1
  12. data/lib/rbbt/ner/chemical_tagger.rb +1 -2
  13. data/lib/rbbt/ner/g_norm_plus.rb +42 -12
  14. data/lib/rbbt/ner/linnaeus.rb +3 -3
  15. data/lib/rbbt/ner/ngram_prefix_dictionary.rb +3 -3
  16. data/lib/rbbt/ner/oscar3.rb +1 -2
  17. data/lib/rbbt/ner/oscar4.rb +3 -3
  18. data/lib/rbbt/ner/patterns.rb +5 -5
  19. data/lib/rbbt/ner/regexpNER.rb +1 -2
  20. data/lib/rbbt/ner/token_trieNER.rb +35 -22
  21. data/lib/rbbt/nlp/genia/sentence_splitter.rb +3 -2
  22. data/lib/rbbt/nlp/nlp.rb +5 -5
  23. data/lib/rbbt/nlp/open_nlp/sentence_splitter.rb +37 -36
  24. data/lib/rbbt/nlp/spaCy.rb +52 -0
  25. data/lib/rbbt/segment.rb +179 -0
  26. data/lib/rbbt/segment/annotation.rb +58 -0
  27. data/lib/rbbt/segment/encoding.rb +18 -0
  28. data/lib/rbbt/{text/segment → segment}/named_entity.rb +11 -10
  29. data/lib/rbbt/segment/overlaps.rb +63 -0
  30. data/lib/rbbt/segment/range_index.rb +35 -0
  31. data/lib/rbbt/segment/relationship.rb +7 -0
  32. data/lib/rbbt/{text/segment → segment}/segmented.rb +1 -1
  33. data/lib/rbbt/segment/token.rb +23 -0
  34. data/lib/rbbt/{text/segment → segment}/transformed.rb +10 -8
  35. data/lib/rbbt/segment/tsv.rb +41 -0
  36. data/share/install/software/Linnaeus +1 -1
  37. data/share/install/software/OpenNLP +1 -1
  38. data/test/rbbt/document/corpus/test_pubmed.rb +15 -0
  39. data/test/rbbt/document/test_annotation.rb +140 -0
  40. data/test/rbbt/document/test_corpus.rb +33 -0
  41. data/test/rbbt/ner/test_finder.rb +3 -3
  42. data/test/rbbt/ner/test_g_norm_plus.rb +20 -3
  43. data/test/rbbt/ner/test_patterns.rb +9 -9
  44. data/test/rbbt/ner/test_regexpNER.rb +14 -14
  45. data/test/rbbt/ner/test_rnorm.rb +3 -4
  46. data/test/rbbt/ner/test_token_trieNER.rb +1 -0
  47. data/test/rbbt/nlp/genia/test_sentence_splitter.rb +37 -3
  48. data/test/rbbt/nlp/open_nlp/test_sentence_splitter.rb +20 -4
  49. data/test/rbbt/segment/test_annotation.rb +39 -0
  50. data/test/rbbt/segment/test_corpus.rb +36 -0
  51. data/test/rbbt/segment/test_encoding.rb +24 -0
  52. data/test/rbbt/{text/segment → segment}/test_named_entity.rb +15 -11
  53. data/test/rbbt/segment/test_overlaps.rb +69 -0
  54. data/test/rbbt/segment/test_range_index.rb +42 -0
  55. data/test/rbbt/{text/segment → segment}/test_transformed.rb +105 -51
  56. data/test/rbbt/test_document.rb +14 -0
  57. data/test/rbbt/test_segment.rb +182 -0
  58. data/test/test_helper.rb +5 -3
  59. data/test/test_spaCy.rb +32 -0
  60. metadata +44 -32
  61. data/lib/rbbt/text/corpus.rb +0 -106
  62. data/lib/rbbt/text/corpus/document.rb +0 -361
  63. data/lib/rbbt/text/corpus/document_repo.rb +0 -68
  64. data/lib/rbbt/text/corpus/sources/pmid.rb +0 -34
  65. data/lib/rbbt/text/document.rb +0 -39
  66. data/lib/rbbt/text/segment.rb +0 -355
  67. data/lib/rbbt/text/segment/docid.rb +0 -46
  68. data/lib/rbbt/text/segment/relationship.rb +0 -24
  69. data/lib/rbbt/text/segment/token.rb +0 -49
  70. data/test/rbbt/text/corpus/sources/test_pmid.rb +0 -33
  71. data/test/rbbt/text/corpus/test_document.rb +0 -52
  72. data/test/rbbt/text/segment/test_relationship.rb +0 -0
  73. data/test/rbbt/text/segment/test_segmented.rb +0 -23
  74. data/test/rbbt/text/test_corpus.rb +0 -34
  75. data/test/rbbt/text/test_document.rb +0 -58
  76. data/test/rbbt/text/test_segment.rb +0 -100
@@ -1,361 +0,0 @@ data/lib/rbbt/text/corpus/document.rb (file removed)
1
- require 'rbbt/text/segment'
2
- require 'rbbt/text/segment/segmented'
3
- require 'rbbt/tsv'
4
- require 'rbbt/resource/path'
5
- require 'rbbt/persist/tsv'
6
- require 'rbbt/util/misc'
7
- require 'rbbt/text/document'
8
- require 'json'
9
-
10
- class Corpus
11
- class Document
12
-
13
- class MultipleEntity < Exception; end
14
-
15
- attr_accessor :text, :docid, :namespace, :id, :type, :hash, :segments, :segment_indices, :persist_dir, :global_persistence, :corpus
16
-
17
- attr_accessor :multiple_result
18
- def initialize(persist_dir = nil, docid = nil, text = nil, global_persistence = nil, corpus = nil)
19
- @segments = {}
20
- @segment_indices = {}
21
- @corpus = corpus
22
-
23
- if not persist_dir.nil?
24
- @persist_dir = persist_dir
25
- @persist_dir = Path.setup(@persist_dir) if not Path == @persist_dir
26
- end
27
-
28
- @global_persistence = global_persistence
29
-
30
- if not docid.nil?
31
- @docid = docid
32
- update_docid
33
- end
34
- @text = text unless text.nil?
35
- end
36
-
37
- def update_docid
38
- @namespace, @id, @type, @hash = docid.split(":", -1)
39
- end
40
-
41
- def docid=(docid)
42
- @docid = docid
43
- update_docid
44
- end
45
-
46
- def self.define(entity, &block)
47
- send :define_method, "produce_#{entity}", &block
48
-
49
- self.class_eval <<-EOC, __FILE__, __LINE__
50
- def load_#{entity}(raw = false)
51
- return if segments.include? "#{ entity }"
52
- if self.respond_to?("load_with_persistence_#{entity}") and not @persist_dir.nil?
53
- segments["#{entity}"] = load_with_persistence_#{entity}(raw)
54
- else
55
- segments["#{ entity }"] = produce_#{entity}
56
- end
57
- end
58
-
59
- def #{entity}(raw = false)
60
- begin
61
- entities = segments["#{ entity }"]
62
- if entities.nil?
63
- load_#{entity}(raw)
64
- entities = segments["#{ entity }"]
65
- end
66
- end
67
-
68
- entities
69
- end
70
-
71
- def #{entity}_at(pos, persist = false)
72
- segment_index("#{ entity }", persist ? File.join(@persist_dir, 'ranges') : nil)[pos]
73
- end
74
-
75
- EOC
76
- end
77
-
78
- def self.define_multiple(entity, &block)
79
- send :define_method, "produce_#{entity}" do
80
- return self.multiple_result[entity] if self.multiple_result && self.multiple_result[entity]
81
- raise MultipleEntity, "Entity #{entity} runs with multiple documents, please prepare beforehand with prepare_multiple: #{self.docid}"
82
- end
83
-
84
- name = "multiple_produce_#{entity}"
85
- class << self
86
- self
87
- end.send :define_method, name, &block
88
-
89
- self.class_eval <<-EOC, __FILE__, __LINE__
90
- def load_#{entity}(raw = false)
91
- return if segments.include? "#{ entity }"
92
- if self.respond_to?("load_with_persistence_#{entity}") and not @persist_dir.nil?
93
- segments["#{entity}"] = load_with_persistence_#{entity}(raw)
94
- else
95
- segments["#{ entity }"] = produce_#{entity}
96
- end
97
- end
98
-
99
- def #{entity}(raw = false)
100
- begin
101
- entities = segments["#{ entity }"]
102
- if entities.nil?
103
- load_#{entity}(raw)
104
- entities = segments["#{ entity }"]
105
- end
106
- end
107
-
108
- entities
109
- end
110
-
111
- def #{entity}_at(pos, persist = false)
112
- segment_index("#{ entity }", persist ? File.join(@persist_dir, 'ranges') : nil)[pos]
113
- end
114
-
115
- EOC
116
- end
117
-
118
- def self.prepare_multiple(docs, entity)
119
- missing = []
120
- docs.each do |doc|
121
- begin
122
- doc.send(entity)
123
- rescue MultipleEntity
124
- missing << doc
125
- end
126
- end
127
- res = self.send("multiple_produce_#{entity.to_s}", missing)
128
- case res
129
- when Array
130
- res.each_with_index do |res,i|
131
- missing[i].multiple_result ||= {}
132
- missing[i].multiple_result[entity] = res
133
- end
134
- when Hash
135
- res.each do |document,res|
136
- case document
137
- when Corpus::Document
138
- document.multiple_result[entity] = res
139
- when String
140
- document = missing.select{|d| d.docid == document}.first
141
- document.multiple_result[entity] = res
142
- end
143
- end
144
- end
145
- missing.each{|doc| doc.send entity }
146
- end
147
-
148
-
149
- #{{{ PERSISTENCE
150
-
151
- TSV_REPOS = {}
152
- FIELDS_FOR_ENTITY_PERSISTENCE = {}
153
- def self.persist(entity, fields = nil)
154
-
155
- if not fields.nil?
156
- fields = [fields] if not Array === fields
157
- fields = fields.collect{|f| f.to_s}
158
- FIELDS_FOR_ENTITY_PERSISTENCE[entity.to_s] = fields
159
- end
160
-
161
- self.class_eval <<-EOC, __FILE__, __LINE__
162
- def load_with_persistence_#{entity}(raw = false)
163
- fields = FIELDS_FOR_ENTITY_PERSISTENCE["#{ entity }"]
164
-
165
- tsv_file = File.join(@persist_dir.find, "#{ entity }")
166
-
167
- return nil if raw == :check and File.exists? tsv_file
168
-
169
- annotations = Persist.persist("Entity[#{ entity }]", :tsv, :file => tsv_file) do
170
- segments = produce_#{entity}
171
- tsv = Segment.tsv(segments, fields)
172
- end
173
-
174
- return annotations if raw
175
-
176
- annotations.unnamed = true
177
- annotations.collect{|id, annotation|
178
- Segment.load_tsv_values(text, annotation, annotations.fields)
179
- }
180
- end
181
- EOC
182
- end
183
-
184
- def self.persist_in_tsv(entity, tsv = nil, fields = nil)
185
- tsv = TSV.setup(Persist.open_tokyocabinet(tsv, false, :list), :key => "ID", :fields => ["Start", "End", "JSON", "Document ID", "Entity Type"]).tap{|t| t.unnamed = true, t.close} if Path === tsv
186
-
187
- if ! tsv.nil? && ! tsv.respond_to?(:keys)
188
- fields = tsv
189
- tsv = nil
190
- end
191
-
192
- TSV_REPOS[entity.to_s] = tsv
193
-
194
- if ! fields.nil?
195
- fields = [fields] if not Array === fields
196
- fields = fields.collect{|f| f.to_s}
197
- FIELDS_FOR_ENTITY_PERSISTENCE[entity.to_s] = fields unless fields.nil?
198
- end
199
-
200
- self.class_eval <<-EOC, __FILE__, __LINE__
201
- def load_with_persistence_#{entity}(raw = false)
202
- repo = TSV_REPOS["#{ entity }"]
203
- if repo.nil?
204
- raise "No persistence file or persistence dir for persist_in_tsv" if persist_dir.nil?
205
- repo = Persist.open_tokyocabinet(persist_dir.annotations_by_type.find, true, :marshal_tsv)
206
- end
207
-
208
- fields = FIELDS_FOR_ENTITY_PERSISTENCE["#{ entity }"]
209
- begin
210
- if ! repo.include?("#{ entity }")
211
- segments = produce_#{entity}
212
- repo.write_and_read do
213
- repo["#{entity}"] = Segment.tsv(segments, fields) if segments.any?
214
- end
215
- else
216
- if raw == :check
217
- repo.close
218
- return nil
219
- end
220
- end
221
-
222
- annotations = repo["#{entity}"]
223
-
224
- repo.close
225
-
226
- return annotations if raw
227
-
228
- annotations.unnamed = true
229
- annotations.collect{|id, annotation|
230
- Segment.load_tsv_values(text, annotation, annotations.fields)
231
- }
232
- ensure
233
- repo.close
234
- end
235
- end
236
- EOC
237
- end
238
-
239
- def self.persist_in_global_tsv(entity, tsv = nil, fields = nil, doc_field = nil, entity_field = nil)
240
- tsv = TSV.setup(Persist.open_tokyocabinet(tsv, false, :list), :key => "ID", :fields => (fields || ["Start", "End", "JSON", "Document ID", "Entity Type"])).tap{|t| t.unnamed = true, t.close} if Path === tsv
241
-
242
- doc_field ||= "Document ID"
243
- entity_field ||= "Entity Type"
244
-
245
- TSV_REPOS[entity.to_s] = tsv
246
-
247
- if not fields.nil?
248
- fields = [fields] if not Array === fields
249
- fields = fields.collect{|f| f.to_s}
250
- else
251
- fields = nil
252
- end
253
-
254
- FIELDS_FOR_ENTITY_PERSISTENCE[entity.to_s] = fields
255
-
256
- self.class_eval <<-EOC, __FILE__, __LINE__
257
- def load_with_persistence_#{entity}(raw = false)
258
- fields = FIELDS_FOR_ENTITY_PERSISTENCE["#{ entity }"]
259
-
260
- data = TSV_REPOS["#{ entity }"] || @global_persistence
261
-
262
- begin
263
-
264
- data.read true
265
-
266
- fields = data.fields if fields.nil? and data.respond_to? :fields
267
-
268
-
269
- if data.respond_to? :persistence_path and String === data.persistence_path
270
- data.filter(data.persistence_path + '.filters')
271
- end
272
-
273
- data.add_filter("field:#{ doc_field }", @docid) if data.fields.include?("#{doc_field}")
274
- data.add_filter("field:#{ entity_field }", "#{ entity }") if data.fields.include?("#{entity_field}")
275
- keys = data.keys
276
- data.pop_filter if data.fields.include?("#{entity_field}")
277
- data.pop_filter if data.fields.include?("#{doc_field}")
278
-
279
- if keys.empty?
280
- segments = produce_#{entity}
281
- segments << Segment.setup("No #{entity} found in document " + @docid.to_s, -1) if segments.empty?
282
- tsv = Segment.tsv(segments, *fields.reject{|f| ["#{doc_field}", "#{entity_field}", "Start", "End", "annotation_types"].include? f})
283
-
284
- tsv.add_field "#{ doc_field }" do
285
- @docid
286
- end
287
-
288
- tsv.add_field "#{ entity_field }" do
289
- "#{ entity }"
290
- end
291
-
292
- data.add_filter("field:#{ doc_field }", @docid) if data.fields.include?("#{doc_field}")
293
- data.add_filter("field:#{ entity_field }", "#{ entity }") if data.fields.include?("#{entity_field}")
294
- data.write true
295
- keys = tsv.collect do |key, value|
296
- data[key] = value
297
- key
298
- end
299
- data.pop_filter if data.fields.include?("#{entity_field}")
300
- data.pop_filter if data.fields.include?("#{doc_field}")
301
- data.read
302
-
303
- else
304
- if raw == :check
305
- data.close
306
- return nil
307
- end
308
- end
309
-
310
- return data.values if raw
311
-
312
- start_pos = data.identify_field "Start"
313
- segments = data.values_at(*keys).collect{|annotation|
314
- pos = annotation[start_pos]
315
- Segment.load_tsv_values(text, annotation, data.fields) unless [-1, "-1", [-1], ["-1"]].include? pos
316
- }.compact
317
- data.close
318
-
319
- segments
320
- ensure
321
- data.close
322
- end
323
-
324
- end
325
- EOC
326
- end
327
-
328
- def segment_index(name, persist_dir = nil)
329
- @segment_indices[name] ||= Segment.index(self.send(name), persist_dir.nil? ? :memory : File.join(persist_dir, name + '.range'))
330
- end
331
-
332
- def load_into(segment, *annotations)
333
- options = annotations.pop if Hash === annotations.last
334
- options ||= {}
335
-
336
- if options[:persist] and not @persist_dir.nil?
337
- persist_dir = File.join(@persist_dir, 'ranges')
338
- else
339
- persist_dir = nil
340
- end
341
-
342
- Segmented.setup(segment, {})
343
- annotations.collect do |name|
344
- name = name.to_s
345
- index = segment_index(name, persist_dir)
346
- annotations = index[segment.range]
347
- segment.segments[name] ||= {}
348
- segment.segments[name] = annotations
349
- class << segment
350
- self
351
- end.class_eval "def #{ name }; @segments['#{ name }']; end", __FILE__, __LINE__
352
- end
353
-
354
- segment
355
- end
356
-
357
- def entity
358
- Object::Document.setup(self.docid, corpus)
359
- end
360
- end
361
- end
@@ -1,68 +0,0 @@ data/lib/rbbt/text/corpus/document_repo.rb (file removed)
1
- require 'rbbt/util/misc'
2
- require 'tokyocabinet'
3
-
4
- class Corpus
5
- module DocumentRepo
6
- class OpenError < StandardError;end
7
- class KeyFormatError < StandardError;end
8
-
9
- TC_CONNECTIONS = {}
10
- def self.open_tokyocabinet(path, write)
11
- database = Persist.open_tokyocabinet(path, write, :single, TokyoCabinet::BDB)
12
- database.extend DocumentRepo
13
- database
14
- end
15
-
16
- def docid2fields(docid)
17
- docid.split(":", -1).values_at 0,1,2,3
18
- end
19
-
20
- def fields2docid(namespace = nil, id = nil, type = nil, hash = nil)
21
- [namespace, id, type, hash] * ":"
22
- end
23
-
24
- def docid(docid)
25
- get(docid)
26
- end
27
-
28
- def add(text, namespace, id, type, hash)
29
- docid = fields2docid(namespace, id, type, hash)
30
-
31
- return docid if self.include?(docid)
32
-
33
- write_and_close do
34
- self[docid] = text
35
- end
36
-
37
- docid
38
- end
39
-
40
- def find(namespace=nil, id = nil, type = nil, hash = nil)
41
- case
42
- when namespace.nil?
43
- self.keys
44
- when id.nil?
45
- range_start = [namespace] * ":" + ':'
46
- range_end = [namespace] * ":" + ';'
47
- self.range(range_start, true, range_end, false)
48
- when (type and hash)
49
- [[namespace, id, type, hash] * ":"]
50
- when hash
51
- [[namespace, id, "", hash] * ":"]
52
- when type
53
- range_start = [namespace, id, type] * ":" + ':'
54
- range_end = [namespace, id, type] * ":" + ';'
55
- self.range(range_start, true, range_end, false)
56
- else
57
- range_start = [namespace, id] * ":" + ':'
58
- range_end = [namespace, id] * ":" + ';'
59
- self.range(range_start, true, range_end, false)
60
- end
61
- end
62
-
63
- def find_docid(docid)
64
- find(*docid2fields(docid))
65
- end
66
-
67
- end
68
- end
@@ -1,34 +0,0 @@ data/lib/rbbt/text/corpus/sources/pmid.rb (file removed)
1
- require 'rbbt/sources/pubmed'
2
-
3
- class Corpus
4
-
5
- NAMESPACES = {} unless defined? NAMESPACES
6
- NAMESPACES[:pubmed] = :add_pmid
7
-
8
- def add_pmid(pmid, type = nil)
9
- pmids = Array === pmid ? pmid : [pmid]
10
- type = nil if String === type and type.empty?
11
-
12
- PubMed.get_article(pmids).collect do |pmid, article|
13
- Log.debug "Loading pmid #{pmid}"
14
- if type.nil? || type.to_sym == :abstract
15
- add_document(article.abstract || "", :PMID, pmid, :abstract)
16
- elsif type.to_sym == :title
17
- add_document(article.title, :PMID, pmid, :title)
18
- else
19
- raise "No FullText available for #{ pmid }" if article.full_text.nil?
20
- add_document(article.full_text, :PMID, pmid, :fulltext)
21
- end
22
- end
23
- end
24
-
25
- def add_pubmed_query(query, max = 3000, type = nil)
26
- pmids = PubMed.query(query, max)
27
- add_pmid(pmids, type)
28
- end
29
-
30
- self.claim "PMID" do |id, type|
31
- Log.debug "Claiming #{id}"
32
- self.add_pmid(id, type)
33
- end
34
- end