rbbt-text 1.2.0 → 1.3.4

Sign up to get free protection for your applications and to get access to all the features.
Files changed (76) hide show
  1. checksums.yaml +4 -4
  2. data/lib/rbbt/bow/bow.rb +5 -2
  3. data/lib/rbbt/bow/dictionary.rb +27 -23
  4. data/lib/rbbt/document.rb +55 -0
  5. data/lib/rbbt/document/annotation.rb +45 -0
  6. data/lib/rbbt/document/corpus.rb +63 -0
  7. data/lib/rbbt/document/corpus/pubmed.rb +33 -0
  8. data/lib/rbbt/ner/NER.rb +3 -3
  9. data/lib/rbbt/ner/abner.rb +1 -1
  10. data/lib/rbbt/ner/banner.rb +1 -1
  11. data/lib/rbbt/ner/brat.rb +1 -1
  12. data/lib/rbbt/ner/chemical_tagger.rb +1 -2
  13. data/lib/rbbt/ner/g_norm_plus.rb +26 -3
  14. data/lib/rbbt/ner/linnaeus.rb +3 -3
  15. data/lib/rbbt/ner/ngram_prefix_dictionary.rb +3 -3
  16. data/lib/rbbt/ner/oscar3.rb +1 -2
  17. data/lib/rbbt/ner/oscar4.rb +3 -3
  18. data/lib/rbbt/ner/patterns.rb +5 -5
  19. data/lib/rbbt/ner/regexpNER.rb +1 -2
  20. data/lib/rbbt/ner/token_trieNER.rb +35 -22
  21. data/lib/rbbt/nlp/genia/sentence_splitter.rb +3 -2
  22. data/lib/rbbt/nlp/nlp.rb +5 -5
  23. data/lib/rbbt/nlp/open_nlp/sentence_splitter.rb +37 -36
  24. data/lib/rbbt/nlp/spaCy.rb +52 -0
  25. data/lib/rbbt/segment.rb +179 -0
  26. data/lib/rbbt/segment/annotation.rb +58 -0
  27. data/lib/rbbt/segment/encoding.rb +18 -0
  28. data/lib/rbbt/{text/segment → segment}/named_entity.rb +14 -11
  29. data/lib/rbbt/segment/overlaps.rb +63 -0
  30. data/lib/rbbt/segment/range_index.rb +35 -0
  31. data/lib/rbbt/segment/relationship.rb +7 -0
  32. data/lib/rbbt/{text/segment → segment}/segmented.rb +1 -1
  33. data/lib/rbbt/segment/token.rb +23 -0
  34. data/lib/rbbt/{text/segment → segment}/transformed.rb +12 -10
  35. data/lib/rbbt/segment/tsv.rb +41 -0
  36. data/share/install/software/Linnaeus +1 -1
  37. data/share/install/software/OpenNLP +1 -1
  38. data/test/rbbt/document/corpus/test_pubmed.rb +15 -0
  39. data/test/rbbt/document/test_annotation.rb +140 -0
  40. data/test/rbbt/document/test_corpus.rb +33 -0
  41. data/test/rbbt/ner/test_finder.rb +3 -3
  42. data/test/rbbt/ner/test_g_norm_plus.rb +20 -2
  43. data/test/rbbt/ner/test_patterns.rb +9 -9
  44. data/test/rbbt/ner/test_regexpNER.rb +14 -14
  45. data/test/rbbt/ner/test_rnorm.rb +3 -4
  46. data/test/rbbt/ner/test_token_trieNER.rb +1 -0
  47. data/test/rbbt/nlp/genia/test_sentence_splitter.rb +37 -3
  48. data/test/rbbt/nlp/open_nlp/test_sentence_splitter.rb +20 -4
  49. data/test/rbbt/segment/test_annotation.rb +39 -0
  50. data/test/rbbt/segment/test_corpus.rb +36 -0
  51. data/test/rbbt/segment/test_encoding.rb +24 -0
  52. data/test/rbbt/{text/segment → segment}/test_named_entity.rb +15 -11
  53. data/test/rbbt/segment/test_overlaps.rb +69 -0
  54. data/test/rbbt/segment/test_range_index.rb +42 -0
  55. data/test/rbbt/{text/segment → segment}/test_transformed.rb +105 -51
  56. data/test/rbbt/test_document.rb +14 -0
  57. data/test/rbbt/test_segment.rb +182 -0
  58. data/test/test_helper.rb +5 -3
  59. data/test/test_spaCy.rb +32 -0
  60. metadata +44 -32
  61. data/lib/rbbt/text/corpus.rb +0 -106
  62. data/lib/rbbt/text/corpus/document.rb +0 -383
  63. data/lib/rbbt/text/corpus/document_repo.rb +0 -68
  64. data/lib/rbbt/text/corpus/sources/pmid.rb +0 -34
  65. data/lib/rbbt/text/document.rb +0 -39
  66. data/lib/rbbt/text/segment.rb +0 -363
  67. data/lib/rbbt/text/segment/docid.rb +0 -46
  68. data/lib/rbbt/text/segment/relationship.rb +0 -24
  69. data/lib/rbbt/text/segment/token.rb +0 -49
  70. data/test/rbbt/text/corpus/sources/test_pmid.rb +0 -33
  71. data/test/rbbt/text/corpus/test_document.rb +0 -82
  72. data/test/rbbt/text/segment/test_relationship.rb +0 -0
  73. data/test/rbbt/text/segment/test_segmented.rb +0 -23
  74. data/test/rbbt/text/test_corpus.rb +0 -34
  75. data/test/rbbt/text/test_document.rb +0 -58
  76. data/test/rbbt/text/test_segment.rb +0 -100
@@ -1,68 +0,0 @@
1
- require 'rbbt/util/misc'
2
- require 'tokyocabinet'
3
-
4
class Corpus
  # Mixin for a key/value store whose keys are document ids of the form
  # "namespace:id:type:hash" and whose values are the document texts.
  #
  # The host object is expected to provide +get+, +include?+, +[]=+, +keys+,
  # +range+ and +write_and_close+.
  module DocumentRepo
    class OpenError < StandardError; end
    class KeyFormatError < StandardError; end

    TC_CONNECTIONS = {}

    # Opens the TokyoCabinet B-tree database at +path+ through Persist and
    # returns the handle extended with DocumentRepo.
    def self.open_tokyocabinet(path, write)
      # Object#extend returns its receiver, so the handle itself is returned.
      Persist.open_tokyocabinet(path, write, :single, TokyoCabinet::BDB).extend(DocumentRepo)
    end

    # Splits a docid into [namespace, id, type, hash]. Fields missing from the
    # docid come back as nil; fields present but empty come back as "".
    def docid2fields(docid)
      parts = docid.split(":", -1)
      (0..3).map { |position| parts[position] }
    end

    # Joins the four fields into a docid; nil fields become empty strings.
    def fields2docid(namespace = nil, id = nil, type = nil, hash = nil)
      [namespace, id, type, hash].join(":")
    end

    # Returns the text stored under +docid+.
    def docid(docid)
      get(docid)
    end

    # Stores +text+ under the docid built from the given fields, unless that
    # key is already present. Returns the docid either way.
    def add(text, namespace, id, type, hash)
      key = fields2docid(namespace, id, type, hash)

      unless self.include?(key)
        write_and_close { self[key] = text }
      end

      key
    end

    # Lists the docids matching the given fields. With no namespace every key
    # is returned; a fully specified docid (or id + hash) yields that single
    # key without consulting the store; anything else is a prefix range scan.
    def find(namespace = nil, id = nil, type = nil, hash = nil)
      return self.keys if namespace.nil?

      unless id.nil?
        return [[namespace, id, type, hash].join(":")] if type and hash
        return [[namespace, id, "", hash].join(":")] if hash
      end

      prefix_fields =
        if id.nil?
          [namespace]
        elsif type
          [namespace, id, type]
        else
          [namespace, id]
        end
      prefix = prefix_fields.join(":")

      # ';' is the character right after ':', so the half-open range
      # [prefix + ':', prefix + ';') covers every key that starts with "prefix:".
      self.range(prefix + ':', true, prefix + ';', false)
    end

    # Same as #find, taking the fields from an existing docid.
    def find_docid(docid)
      find(*docid2fields(docid))
    end
  end
end
@@ -1,34 +0,0 @@
1
- require 'rbbt/sources/pubmed'
2
-
3
class Corpus

  NAMESPACES = {} unless defined? NAMESPACES
  NAMESPACES[:pubmed] = :add_pmid

  # Fetches one or more PubMed articles and adds a text from each of them to
  # the corpus through +add_document+.
  #
  # pmid - a single PMID or an Array of PMIDs.
  # type - which text to store: nil, "" or :abstract for the abstract,
  #        :title for the title, anything else for the full text.
  #
  # Returns the collected results of +add_document+, one per article.
  # Raises RuntimeError when the full text is requested but not available.
  def add_pmid(pmid, type = nil)
    pmids = Array === pmid ? pmid : [pmid]
    type = nil if String === type and type.empty?

    PubMed.get_article(pmids).collect do |article_id, article|
      Log.debug "Loading pmid #{article_id}"

      case type.nil? ? :abstract : type.to_sym
      when :abstract
        # A missing abstract is stored as an empty text.
        add_document(article.abstract || "", :PMID, article_id, :abstract)
      when :title
        add_document(article.title, :PMID, article_id, :title)
      else
        raise "No FullText available for #{ article_id }" if article.full_text.nil?
        add_document(article.full_text, :PMID, article_id, :fulltext)
      end
    end
  end

  # Runs +query+ against PubMed (at most +max+ results) and adds every
  # article found, as #add_pmid does.
  def add_pubmed_query(query, max = 3000, type = nil)
    add_pmid(PubMed.query(query, max), type)
  end

  # Registers the loader used for docids in the "PMID" namespace.
  self.claim "PMID" do |id, type|
    Log.debug "Claiming #{id}"
    self.add_pmid(id, type)
  end
end
@@ -1,39 +0,0 @@
1
- require 'rbbt-util'
2
- require 'rbbt/entity'
3
-
4
- require 'rbbt/text/corpus'
5
-
6
# Entity whose value is a docid string ("namespace:id:type..."); its
# properties resolve against the corpus assigned to Document.corpus.
module Document
  extend Entity

  class << self
    attr_accessor :corpus
  end

  # The corpus entry stored under this docid.
  property :document => :single do
    Document.corpus.docid(self)
  end

  # The docid with the same namespace and id but the given type.
  property :type => :single do |type|
    namespace, id = self.split(":").values_at(0, 1)
    self.annotate([namespace, id, type.to_s].join(":"))
  end

  property :title => :single do
    type(:title).text
  end

  # NOTE(review): this looks up type "full_text", while the PubMed loader
  # appears to store full texts under "fulltext" -- confirm they resolve to
  # the same document.
  property :full_text => :single do
    type(:full_text).text
  end

  property :abstract => :single do
    type(:abstract).text
  end

  property :text => :single do
    document.text
  end

  # Calls the method named +type+ on the underlying document, forwarding
  # any extra arguments.
  property :entities => :single do |type, *args|
    document.method(type).call(*args)
  end
end
@@ -1,363 +0,0 @@
1
- require 'rbbt/annotations'
2
- require 'rbbt/fix_width_table'
3
-
4
# Annotation for a piece of text that knows where it sits inside a larger
# text: +offset+ is the 0-based position of its first character and +docid+
# is carried along for identification (see #segment_id).
module Segment
  extend Annotation
  self.annotation :offset, :docid

  # Length of the segment. Uses an inherited +segment_length+ when calling it
  # succeeds and falls back to the object's own +length+ otherwise.
  def segment_length
    begin
      super()
    rescue
      self.length
    end
  end

  #{{{ Ranges

  # Position of the last character (inclusive), or nil without an offset.
  def end
    return nil if offset.nil?
    offset.to_i + segment_length - 1
  end

  # Inclusive range of positions covered by the segment.
  # Raises RuntimeError when the segment has no offset.
  def range
    raise "No offset specified" if offset.nil?
    (offset.to_i..self.end)
  end

  # Shifts the segment forward by +offset+ positions. The offset is cleared
  # when either value is nil. Returns self.
  def pull(offset)
    if self.offset.nil? or offset.nil?
      self.offset = nil
    else
      self.offset += offset
    end

    self
  end

  # Shifts the segment backwards by +offset+ positions. The offset is cleared
  # when either value is nil. Returns self.
  def push(offset)
    if self.offset.nil? or offset.nil?
      self.offset = nil
    else
      self.offset -= offset
    end

    self
  end

  # Makes the offsets of +segments+ relative to the start of this segment.
  # With a block the shift only lasts for the duration of the block; without
  # one it is permanent. Returns +segments+.
  def make_relative(segments, &block)
    segments.each { |s| s.push offset }
    return segments unless block_given?

    begin
      yield(segments)
    ensure
      # Restore the absolute offsets even when the block raises.
      segments.each { |s| s.pull offset }
    end

    segments
  end

  # Range of the segment relative to +container+, which may be another
  # Segment with an offset or an Integer position; any other value yields the
  # absolute range.
  # Raises RuntimeError when the segment has no offset.
  def range_in(container = nil)
    raise "No offset specified" if offset.nil?
    case
    when (Segment === container and not container.offset.nil?)
      ((offset - container.offset)..(self.end - container.offset))
    when Integer === container
      ((offset - container)..(self.end - container))
    else
      range
    end
  end

  # True when +segment+ lies completely inside this segment.
  def includes?(segment)
    (segment.offset.to_i >= self.offset.to_i) and
      (segment.offset.to_i + segment.segment_length.to_i <= self.offset.to_i + self.segment_length.to_i)
  end

  # True when the start of either segment falls inside the other one.
  def overlaps?(segment)
    (segment.offset.to_i >= self.offset.to_i && segment.offset.to_i <= self.end) ||
      (self.offset.to_i >= segment.offset.to_i && self.offset.to_i <= segment.end)
  end

  # The subset of +segments+ that overlap this segment.
  def overlaps(segments)
    segments.select { |s| self.overlaps?(s) }
  end

  # The segments in +secondary+ that are overlapped by some segment in +main+.
  def self.collisions(main, secondary)
    secondary.select do |ss|
      main.any? { |ms| ms.overlaps? ss }
    end
  end

  #{{{ Sorting

  # Sorts segments by position.
  #
  # inline - when true (the default) elements that are nil or lack an offset
  #          go first, segments that do not contain each other's start are
  #          ordered by offset, and the remaining ones by length (shortest
  #          first). When false the segments are simply ordered by
  #          descending offset.
  def self.sort(segments, inline = true)
    if inline
      segments.sort do |a, b|
        # Decide "missing" before touching #offset on the other element, so a
        # nil element never receives a method call.
        a_missing = a.nil? || a.offset.nil?
        b_missing = b.nil? || b.offset.nil?

        case
        when (a_missing and b_missing)
          0
        when a_missing
          -1
        when b_missing
          +1
        when (!a.range.include?(b.offset.to_i) && !b.range.include?(a.offset.to_i))
          a.offset.to_i <=> b.offset.to_i
        else
          a.segment_length <=> b.segment_length
        end
      end
    else
      segments.sort_by { |segment| segment.offset.to_i }.reverse
    end
  end

  # Walks +sorted_segments+ from last to first and returns those that end
  # after the start of the segment visited just before them.
  def self.overlaps(sorted_segments)
    last = nil
    overlaped = []

    sorted_segments.reverse_each do |segment|
      overlaped << segment if (not last.nil?) and segment.range.end > last
      last = segment.range.begin
    end

    overlaped
  end

  # Sorted list of the segments that have an offset, with the overlapping
  # ones reported by Segment.overlaps removed.
  def self.clean_sort(segments)
    sorted = sort(segments).reject { |s| s.offset.nil? }
    overlapping = overlaps(sorted)
    overlapping.each do |s|
      sorted.delete s
    end

    sorted
  end

  #{{{ Splitting

  # Cuts +text+ into consecutive chunks at the boundaries of +segments+. Every
  # chunk is set up as a Segment with its offset in the original text. When
  # +skip_segments+ is true only the text between segments is returned.
  def self.split(text, segments, skip_segments = false)
    sorted_segments = clean_sort segments

    chunks = []
    segment_end = 0
    text_offset = 0 # position in the original text where +text+ now starts
    sorted_segments.each do |segment|
      return chunks if text.nil? or text.empty?
      next if segment.offset.nil?
      offset = segment.offset - text_offset

      # Consider segment offset. Save pre, or skip if overlap
      case
      when offset < 0 # Overlap, skip
        next
      when offset > 0 # Save pre
        chunk = text[0..offset - 1]
        Segment.setup(chunk, text_offset)
        chunks << chunk
      end

      segment_end = offset + segment.segment_length - 1

      if not skip_segments
        chunk = text[offset..segment_end]
        Segment.setup(chunk, text_offset + offset)
        chunks << chunk
      end

      # Drop the consumed prefix and remember how far into the text we are.
      text_offset += segment_end + 1
      text = text[segment_end + 1..-1]
    end

    if not text.nil? and not text.empty?
      chunk = text.dup
      Segment.setup(chunk, text_offset)
      chunks << chunk
    end

    chunks
  end

  #{{{ Align

  # Locates each of +parts+ in +text+, in order, and sets it up as a Segment
  # at the position found. Parts that cannot be found are left untouched.
  def self.align(text, parts)
    pre_offset = 0
    parts.each do |part|
      offset = text.index part
      next if offset.nil?
      Segment.setup(part, pre_offset + offset)
      # The search resumes at the last character of the part just matched.
      pre_offset += offset + part.segment_length - 1
      text = text[(offset + part.segment_length - 1)..-1]
    end
  end

  #{{{ Index

  # Positional index: maps a position to the segments covering it.
  class Index
    attr_accessor :index, :data

    # index - structure returning segment ids for a position.
    # data  - Hash from segment id to segment.
    def initialize(index, data)
      @index = index
      @data = data
    end

    # Segments registered in the index for +pos+.
    def [](pos)
      index[pos].collect { |id| data[id] }
    end
  end

  # Builds an Index over +segments+ (an Array, or a Hash whose values are
  # flattened). Segments without offset are left out of the range table. The
  # table is persisted to +persist_file+ unless it is nil or :memory.
  def self.index(segments, persist_file = :memory)
    segments = segments.values.flatten if Hash === segments

    annotation_index =
      Persist.persist("Segment_index", :fwt, :persist => (! (persist_file.nil? or persist_file == :memory)), :file => persist_file) do

        value_size = 0
        index_data = segments.collect{|segment|
          next if segment.offset.nil?
          range = segment.range
          # The table is created with the length of the longest id.
          value_size = [segment.id.length, value_size].max
          [segment.id, [range.begin, range.end]]
        }.compact

        fwt = FixWidthTable.get :memory, value_size, true
        fwt.add_range index_data

        fwt
      end

    data = {}
    segments.each do |segment| data[segment.id] = segment end
    Index.new annotation_index, data
  end

  #{{{ Save and load

  # Values of +segment+ for the TSV columns named in +fields+. "JSON",
  # "literal", "Start" and "End" are computed; any other field is taken out
  # of the segment's info hash.
  def self.tsv_values_for_segment(segment, fields)
    info = segment.info

    fields.map do |field|
      case
      when field == "JSON"
        info.to_json
      when field == "literal"
        segment.gsub(/\n|\t/, ' ')
      when field == "Start"
        segment.offset
      when field == "End"
        segment.end
      else
        info.delete(field.to_sym)
      end
    end
  end

  # Rebuilds a segment from a TSV row. The literal comes from the "literal"
  # column when there is one; otherwise it is cut out of +text+ using the
  # "Start" and "End" columns.
  def self.load_tsv_values(text, values, fields)
    info = {}
    literal_pos = fields.index "literal"

    object = if literal_pos.nil?
               ""
             else
               v = values[literal_pos]
               v = v.first if Array === v
               v
             end

    fields.each_with_index do |field, i|
      if field == "JSON"
        JSON.parse(values[i]).each do |key, value|
          info[key.to_sym] = value
        end
      else
        info[field.to_sym] = values[i]
      end
    end

    start = info.delete(:Start)
    if not (start.nil? or ((Array === start or String === start) and start.empty?))
      start = start.first if Array === start
      start = start.to_i
      info[:offset] = start

      eend = info.delete(:End)
      eend = eend.first if Array === eend
      eend = eend.to_i

      if object.empty?
        object.replace text[start..eend]
      end
    end

    info[:annotation_types] = [Segment] unless info.include? :annotation_types

    Annotated.load_entity(object, info)
  end

  # Column names for Segment.tsv. "Start" and "End" always come first;
  # :no_types and :literal are switches consumed from +fields+ (which is
  # modified in place); :all, or no remaining field, selects the annotations
  # of the first segment.
  def self.set_tsv_fields(fields, segments)
    tsv_fields = []
    add_types = ! (fields.delete(:no_types) || fields.delete("no_types") || fields.include?(:JSON) || fields.include?("JSON"))
    literal = (fields.delete(:literal) || fields.delete("literal"))
    tsv_fields << "Start" << "End"
    tsv_fields << :annotation_types if add_types
    tsv_fields << :literal if literal

    if fields.any? and not (fields == [:all] or fields == ["all"])
      tsv_fields.concat fields
    else
      tsv_fields.concat segments.first.annotations if segments.any?
    end

    tsv_fields.collect! { |f| f.to_s }
    # The offset is already represented by the "Start" column.
    tsv_fields.delete "offset"
    tsv_fields
  end

  # TSV with one row per segment, keyed by #segment_id.
  def self.tsv(segments, *fields)
    fields = set_tsv_fields fields, segments
    tsv = TSV.setup({}, :key_field => "ID", :fields => fields, :type => :double)

    segments.each do |segment|
      tsv[segment.segment_id] = self.tsv_values_for_segment(segment, fields)
    end

    tsv
  end

  # Loads every row of +tsv+ back into an entity.
  # NOTE(review): this delegates to Annotated.load_tsv_values rather than to
  # Segment.load_tsv_values defined above -- confirm that is intended.
  def self.load_tsv(tsv)
    fields = tsv.fields
    tsv.with_unnamed do
      tsv.collect do |id, values|
        Annotated.load_tsv_values(id, values, fields)
      end
    end
  end

  # The segment rendered through Log.color with +color+.
  def ansi(color)
    Log.color color, self
  end

  # "start..end" string for the segment position.
  def locus
    [offset, self.end] * ".."
  end

  # Identifier built from the docid, the locus and a digest of the
  # annotation info; only the digest when the object has no +docid+.
  def segment_id
    if self.respond_to?(:docid)
      [docid, locus, Misc.obj2digest(info)] * ":"
    else
      Misc.obj2digest(info)
    end
  end

  #def ==(other)
  #  self.text == other.text
  #end
end
363
-