rbbt-text 1.2.0 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70) hide show
  1. checksums.yaml +4 -4
  2. data/lib/rbbt/document.rb +46 -0
  3. data/lib/rbbt/document/annotation.rb +42 -0
  4. data/lib/rbbt/document/corpus.rb +38 -0
  5. data/lib/rbbt/document/corpus/pubmed.rb +33 -0
  6. data/lib/rbbt/ner/NER.rb +3 -3
  7. data/lib/rbbt/ner/abner.rb +1 -1
  8. data/lib/rbbt/ner/banner.rb +1 -1
  9. data/lib/rbbt/ner/brat.rb +1 -1
  10. data/lib/rbbt/ner/chemical_tagger.rb +1 -2
  11. data/lib/rbbt/ner/g_norm_plus.rb +19 -2
  12. data/lib/rbbt/ner/linnaeus.rb +3 -3
  13. data/lib/rbbt/ner/ngram_prefix_dictionary.rb +3 -3
  14. data/lib/rbbt/ner/oscar3.rb +1 -2
  15. data/lib/rbbt/ner/oscar4.rb +3 -3
  16. data/lib/rbbt/ner/patterns.rb +6 -5
  17. data/lib/rbbt/ner/regexpNER.rb +1 -2
  18. data/lib/rbbt/ner/token_trieNER.rb +6 -6
  19. data/lib/rbbt/nlp/genia/sentence_splitter.rb +1 -1
  20. data/lib/rbbt/nlp/nlp.rb +5 -5
  21. data/lib/rbbt/nlp/open_nlp/sentence_splitter.rb +37 -36
  22. data/lib/rbbt/segment.rb +177 -0
  23. data/lib/rbbt/segment/annotation.rb +58 -0
  24. data/lib/rbbt/segment/encoding.rb +18 -0
  25. data/lib/rbbt/{text/segment → segment}/named_entity.rb +11 -11
  26. data/lib/rbbt/segment/overlaps.rb +63 -0
  27. data/lib/rbbt/segment/range_index.rb +35 -0
  28. data/lib/rbbt/{text/segment → segment}/segmented.rb +1 -1
  29. data/lib/rbbt/segment/token.rb +23 -0
  30. data/lib/rbbt/{text/segment → segment}/transformed.rb +7 -9
  31. data/lib/rbbt/segment/tsv.rb +41 -0
  32. data/share/install/software/Linnaeus +1 -1
  33. data/test/rbbt/document/corpus/test_pubmed.rb +15 -0
  34. data/test/rbbt/document/test_annotation.rb +140 -0
  35. data/test/rbbt/document/test_corpus.rb +33 -0
  36. data/test/rbbt/ner/test_finder.rb +3 -3
  37. data/test/rbbt/ner/test_g_norm_plus.rb +11 -1
  38. data/test/rbbt/ner/test_patterns.rb +9 -9
  39. data/test/rbbt/ner/test_regexpNER.rb +14 -14
  40. data/test/rbbt/ner/test_rnorm.rb +3 -4
  41. data/test/rbbt/ner/test_token_trieNER.rb +1 -0
  42. data/test/rbbt/nlp/genia/test_sentence_splitter.rb +13 -3
  43. data/test/rbbt/nlp/open_nlp/test_sentence_splitter.rb +20 -4
  44. data/test/rbbt/segment/test_annotation.rb +40 -0
  45. data/test/rbbt/segment/test_corpus.rb +36 -0
  46. data/test/rbbt/segment/test_encoding.rb +24 -0
  47. data/test/rbbt/{text/segment → segment}/test_named_entity.rb +12 -9
  48. data/test/rbbt/segment/test_overlaps.rb +69 -0
  49. data/test/rbbt/segment/test_range_index.rb +43 -0
  50. data/test/rbbt/{text/segment → segment}/test_transformed.rb +76 -51
  51. data/test/rbbt/test_document.rb +14 -0
  52. data/test/rbbt/test_segment.rb +187 -0
  53. data/test/test_helper.rb +5 -3
  54. metadata +40 -32
  55. data/lib/rbbt/text/corpus.rb +0 -106
  56. data/lib/rbbt/text/corpus/document.rb +0 -383
  57. data/lib/rbbt/text/corpus/document_repo.rb +0 -68
  58. data/lib/rbbt/text/corpus/sources/pmid.rb +0 -34
  59. data/lib/rbbt/text/document.rb +0 -39
  60. data/lib/rbbt/text/segment.rb +0 -363
  61. data/lib/rbbt/text/segment/docid.rb +0 -46
  62. data/lib/rbbt/text/segment/relationship.rb +0 -24
  63. data/lib/rbbt/text/segment/token.rb +0 -49
  64. data/test/rbbt/text/corpus/sources/test_pmid.rb +0 -33
  65. data/test/rbbt/text/corpus/test_document.rb +0 -82
  66. data/test/rbbt/text/segment/test_relationship.rb +0 -0
  67. data/test/rbbt/text/segment/test_segmented.rb +0 -23
  68. data/test/rbbt/text/test_corpus.rb +0 -34
  69. data/test/rbbt/text/test_document.rb +0 -58
  70. data/test/rbbt/text/test_segment.rb +0 -100
@@ -1,34 +0,0 @@
1
- require 'rbbt/sources/pubmed'
2
-
3
- class Corpus
4
-
5
- NAMESPACES = {} unless defined? NAMESPACES
6
- NAMESPACES[:pubmed] = :add_pmid
7
-
8
- def add_pmid(pmid, type = nil)
9
- pmids = Array === pmid ? pmid : [pmid]
10
- type = nil if String === type and type.empty?
11
-
12
- PubMed.get_article(pmids).collect do |pmid, article|
13
- Log.debug "Loading pmid #{pmid}"
14
- if type.nil? || type.to_sym == :abstract
15
- add_document(article.abstract || "", :PMID, pmid, :abstract)
16
- elsif type.to_sym == :title
17
- add_document(article.title, :PMID, pmid, :title)
18
- else
19
- raise "No FullText available for #{ pmid }" if article.full_text.nil?
20
- add_document(article.full_text, :PMID, pmid, :fulltext)
21
- end
22
- end
23
- end
24
-
25
- def add_pubmed_query(query, max = 3000, type = nil)
26
- pmids = PubMed.query(query, max)
27
- add_pmid(pmids, type)
28
- end
29
-
30
- self.claim "PMID" do |id, type|
31
- Log.debug "Claiming #{id}"
32
- self.add_pmid(id, type)
33
- end
34
- end
@@ -1,39 +0,0 @@
1
- require 'rbbt-util'
2
- require 'rbbt/entity'
3
-
4
- require 'rbbt/text/corpus'
5
-
6
- module Document
7
- extend Entity
8
- class << self
9
- attr_accessor :corpus
10
- end
11
-
12
- property :document => :single do
13
- Document.corpus.docid(self)
14
- end
15
-
16
- property :type => :single do |type|
17
- self.annotate((self.split(":").values_at(0,1)) * ":" + ":" + type.to_s)
18
- end
19
-
20
- property :title => :single do
21
- type(:title).text
22
- end
23
-
24
- property :full_text => :single do
25
- type(:full_text).text
26
- end
27
-
28
- property :abstract => :single do
29
- type(:abstract).text
30
- end
31
-
32
- property :text => :single do
33
- document.text
34
- end
35
-
36
- property :entities => :single do |type,*args|
37
- document.method(type).call *args
38
- end
39
- end
@@ -1,363 +0,0 @@
1
- require 'rbbt/annotations'
2
- require 'rbbt/fix_width_table'
3
-
4
- module Segment
5
- extend Annotation
6
- self.annotation :offset, :docid
7
-
8
- def segment_length
9
- begin
10
- super()
11
- rescue
12
- self.length
13
- end
14
- end
15
-
16
- #{{{ Ranges
17
-
18
- def end
19
- return nil if offset.nil?
20
- offset.to_i + segment_length - 1
21
- end
22
-
23
- def range
24
- raise "No offset specified" if offset.nil?
25
- (offset.to_i..self.end)
26
- end
27
-
28
- def pull(offset)
29
- if self.offset.nil? or offset.nil?
30
- self.offset = nil
31
- else
32
- self.offset += offset
33
- end
34
-
35
- self
36
- end
37
-
38
- def push(offset)
39
- if self.offset.nil? or offset.nil?
40
- self.offset = nil
41
- else
42
- self.offset -= offset
43
- end
44
-
45
- self
46
- end
47
-
48
- def make_relative(segments, &block)
49
- if block_given?
50
- segments.each{|s| s.push offset}
51
- yield(segments)
52
- segments.each{|s| s.pull offset}
53
- else
54
- segments.each{|s| s.push offset}
55
- end
56
- end
57
-
58
- def range_in(container = nil)
59
- raise "No offset specified" if offset.nil?
60
- case
61
- when (Segment === container and not container.offset.nil?)
62
- ((offset - container.offset)..(self.end - container.offset))
63
- when Integer === container
64
- ((offset - container)..(self.end - container))
65
- else
66
- range
67
- end
68
- end
69
-
70
- def includes?(segment)
71
- (segment.offset.to_i >= self.offset.to_i) and
72
- (segment.offset.to_i + segment.segment_length.to_i <= self.offset.to_i + self.segment_length.to_i)
73
- end
74
-
75
- def overlaps?(segment)
76
- segment.offset.to_i >= self.offset.to_i && segment.offset.to_i <= self.end ||
77
- self.offset.to_i >= segment.offset.to_i && self.offset.to_i <= segment.end
78
- end
79
-
80
- def overlaps(segments)
81
- segments.select{|s| self.overlaps?(s)}
82
- end
83
-
84
-
85
- def self.collisions(main, secondary)
86
- secondary.select do |ss|
87
- main.select{|ms| ms.overlaps? ss }.any?
88
- end
89
- end
90
-
91
- #{{{ Sorting
92
-
93
- def self.sort(segments, inline = true)
94
- if inline
95
- segments.sort do |a,b|
96
- case
97
- when ((a.nil? and b.nil?) or (a.offset.nil? and b.offset.nil?))
98
- 0
99
- when (a.nil? or a.offset.nil?)
100
- -1
101
- when (b.nil? or b.offset.nil?)
102
- +1
103
- when (not a.range.include? b.offset.to_i and not b.range.include? a.offset.to_i)
104
- a.offset.to_i <=> b.offset.to_i
105
- else
106
- a.segment_length <=> b.segment_length
107
- end
108
- end
109
- else
110
- segments.sort_by do |segment| segment.offset.to_i || 0 end.reverse
111
- end
112
- end
113
-
114
- def self.overlaps(sorted_segments)
115
- last = nil
116
- overlaped = []
117
-
118
- sorted_segments.reverse.each do |segment|
119
- overlaped << segment if (not last.nil?) and segment.range.end > last
120
- last = segment.range.begin
121
- end
122
-
123
- overlaped
124
- end
125
-
126
- def self.clean_sort(segments)
127
- sorted = sort(segments).reject{|s| s.offset.nil?}
128
- overlaps = overlaps(sorted)
129
- overlaps.each do |s|
130
- sorted.delete s
131
- end
132
-
133
- sorted
134
- end
135
-
136
- #{{{ Splitting
137
-
138
- def self.split(text, segments, skip_segments = false)
139
- sorted_segments = clean_sort segments
140
-
141
- chunks = []
142
- segment_end = 0
143
- text_offset = 0
144
- sorted_segments.each do |segment|
145
- return chunks if text.nil? or text.empty?
146
- next if segment.offset.nil?
147
- offset = segment.offset - text_offset
148
-
149
- # Consider segment offset. Save pre, or skip if overlap
150
- case
151
- when offset < 0 # Overlap, skip
152
- next
153
- when offset > 0 # Save pre
154
- chunk = text[0..offset - 1]
155
- Segment.setup(chunk, text_offset)
156
- chunks << chunk
157
- end
158
-
159
- segment_end = offset + segment.segment_length - 1
160
-
161
- if not skip_segments
162
- chunk = text[offset..segment_end]
163
- Segment.setup(chunk, text_offset + offset)
164
- chunks << chunk
165
- end
166
-
167
- text_offset += segment_end + 1
168
- text = text[segment_end + 1..-1]
169
-
170
- end
171
-
172
- if not text.nil? and not text.empty?
173
- chunk = text.dup
174
- Segment.setup(chunk, text_offset)
175
- chunks << chunk
176
- end
177
-
178
- chunks
179
- end
180
-
181
-
182
- #{{{ Align
183
-
184
- def self.align(text, parts)
185
- pre_offset = 0
186
- parts.each do |part|
187
- offset = text.index part
188
- next if offset.nil?
189
- Segment.setup(part, pre_offset + offset)
190
- pre_offset += offset + part.segment_length - 1
191
- text = text[(offset + part.segment_length - 1)..-1]
192
- end
193
- end
194
-
195
- #{{{ Index
196
-
197
- class Index
198
- attr_accessor :index, :data
199
- def initialize(index, data)
200
- @index = index
201
- @data = data
202
- end
203
-
204
- def [](pos)
205
- index[pos].collect{|id| data[id]}
206
- end
207
- end
208
-
209
- def self.index(segments, persist_file = :memory)
210
- segments = segments.values.flatten if Hash === segments
211
-
212
- annotation_index =
213
- Persist.persist("Segment_index", :fwt, :persist => (! (persist_file.nil? or persist_file == :memory)), :file => persist_file) do
214
-
215
- value_size = 0
216
- index_data = segments.collect{|segment|
217
- next if segment.offset.nil?
218
- range = segment.range
219
- value_size = [segment.id.length, value_size].max
220
- [segment.id, [range.begin, range.end]]
221
- }.compact
222
-
223
- fwt = FixWidthTable.get :memory, value_size, true
224
- fwt.add_range index_data
225
-
226
- fwt
227
- end
228
-
229
- data = {}
230
- segments.each do |segment| data[segment.id] = segment end
231
- Index.new annotation_index, data
232
- end
233
-
234
- #{{{ Save and load
235
-
236
- def self.tsv_values_for_segment(segment, fields)
237
- info = segment.info
238
- values = []
239
-
240
- fields.each do |field|
241
- values << case
242
- when field == "JSON"
243
- info.to_json
244
- when field == "literal"
245
- segment.gsub(/\n|\t/, ' ')
246
- when field == "Start"
247
- segment.offset
248
- when field == "End"
249
- segment.end
250
- else
251
- info.delete(field.to_sym)
252
- end
253
- end
254
-
255
- values
256
- end
257
-
258
- def self.load_tsv_values(text, values, fields)
259
- info = {}
260
- literal_pos = fields.index "literal"
261
-
262
- object = if literal_pos.nil?
263
- ""
264
- else
265
- v = values[literal_pos]
266
- v = v.first if Array === v
267
- v
268
- end
269
-
270
- fields.each_with_index do |field, i|
271
- if field == "JSON"
272
- JSON.parse(values[i]).each do |key, value|
273
- info[key.to_sym] = value
274
- end
275
- else
276
- info[field.to_sym] = values[i]
277
- end
278
- end
279
-
280
- start = info.delete(:Start)
281
- if not (start.nil? or ((Array === start or String === start) and start.empty?))
282
- if Array === start
283
- start = start.first
284
- end
285
- start = start.to_i
286
- info[:offset] = start
287
-
288
- eend = info.delete(:End)
289
- if Array === eend
290
- eend = eend.first
291
- end
292
- eend = eend.to_i
293
-
294
- if object.empty?
295
- object.replace text[start..eend]
296
- end
297
- end
298
-
299
- info[:annotation_types] = [Segment] unless info.include? :annotation_types
300
-
301
- Annotated.load_entity(object, info)
302
- end
303
-
304
- def self.set_tsv_fields(fields, segments)
305
- tsv_fields = []
306
- add_types = ! (fields.delete(:no_types) || fields.delete("no_types") || fields.include?(:JSON) || fields.include?("JSON"))
307
- literal = (fields.delete(:literal) || fields.delete("literal"))
308
- tsv_fields << "Start" << "End"
309
- tsv_fields << :annotation_types if add_types
310
- tsv_fields << :literal if literal
311
-
312
- if fields.any? and not (fields == [:all] or fields == ["all"])
313
- tsv_fields.concat fields
314
- else
315
- tsv_fields.concat segments.first.annotations if segments.any?
316
- end
317
- tsv_fields
318
- tsv_fields.collect!{|f| f.to_s}
319
- tsv_fields.delete "offset"
320
- tsv_fields
321
- end
322
-
323
- def self.tsv(segments, *fields)
324
- fields = set_tsv_fields fields, segments
325
- tsv = TSV.setup({}, :key_field => "ID", :fields => fields, :type => :double)
326
-
327
- segments.each do |segment|
328
- tsv[segment.segment_id] = self.tsv_values_for_segment(segment, fields)
329
- end
330
-
331
- tsv
332
- end
333
-
334
- def self.load_tsv(tsv)
335
- fields = tsv.fields
336
- tsv.with_unnamed do
337
- tsv.collect do |id, values|
338
- Annotated.load_tsv_values(id, values, fields)
339
- end
340
- end
341
- end
342
-
343
- def ansi(color)
344
- Log.color color, self
345
- end
346
-
347
- def locus
348
- [offset, self.end] * ".."
349
- end
350
-
351
- def segment_id
352
- if self.respond_to?(:docid)
353
- [docid, locus, Misc.obj2digest(info)] * ":"
354
- else
355
- Misc.obj2digest(info)
356
- end
357
- end
358
-
359
- #def ==(other)
360
- # self.text == other.text
361
- #end
362
- end
363
-
@@ -1,46 +0,0 @@
1
- require 'rbbt/text/segment'
2
-
3
- #module SegmentWithDocid
4
- # extend Annotation
5
- #
6
- # self.annotation :docid
7
- #
8
- # def masked?
9
- # self[0..5] == "MASKED"
10
- # end
11
- #
12
- # def mask
13
- # return self if masked?
14
- # raise "Cannot mask an array of elements, they must be masked individually" if Array === self
15
- # raise "Cannot mask a segment with no docid" if not self.respond_to? :docid or docid.nil?
16
- # raise "Cannot mask a segment with no offset" if offset.nil?
17
- # textual_position = ["MASKED", length] * ":"
18
- # self.replace(textual_position)
19
- # self
20
- # end
21
- #
22
- # def unmasked_text
23
- # return self unless masked?
24
- # tag, length = self.split(":")
25
- # Document.setup(docid).text[offset.to_i..(offset.to_i+length.to_i-1)]
26
- # end
27
- #
28
- # def unmask
29
- # return self unless masked?
30
- # self.replace(unmasked_text)
31
- # self
32
- # end
33
- #
34
- # def str_length
35
- # self.length
36
- # end
37
- #
38
- # def masked_length
39
- # self.split(":").last.to_i
40
- # end
41
- #
42
- # def segment_length
43
- # masked? ? masked_length : str_length
44
- # end
45
- #end
46
- #