rbbt-text 1.2.0 → 1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/rbbt/document.rb +46 -0
- data/lib/rbbt/document/annotation.rb +42 -0
- data/lib/rbbt/document/corpus.rb +38 -0
- data/lib/rbbt/document/corpus/pubmed.rb +33 -0
- data/lib/rbbt/ner/NER.rb +3 -3
- data/lib/rbbt/ner/abner.rb +1 -1
- data/lib/rbbt/ner/banner.rb +1 -1
- data/lib/rbbt/ner/brat.rb +1 -1
- data/lib/rbbt/ner/chemical_tagger.rb +1 -2
- data/lib/rbbt/ner/g_norm_plus.rb +19 -2
- data/lib/rbbt/ner/linnaeus.rb +3 -3
- data/lib/rbbt/ner/ngram_prefix_dictionary.rb +3 -3
- data/lib/rbbt/ner/oscar3.rb +1 -2
- data/lib/rbbt/ner/oscar4.rb +3 -3
- data/lib/rbbt/ner/patterns.rb +6 -5
- data/lib/rbbt/ner/regexpNER.rb +1 -2
- data/lib/rbbt/ner/token_trieNER.rb +6 -6
- data/lib/rbbt/nlp/genia/sentence_splitter.rb +1 -1
- data/lib/rbbt/nlp/nlp.rb +5 -5
- data/lib/rbbt/nlp/open_nlp/sentence_splitter.rb +37 -36
- data/lib/rbbt/segment.rb +177 -0
- data/lib/rbbt/segment/annotation.rb +58 -0
- data/lib/rbbt/segment/encoding.rb +18 -0
- data/lib/rbbt/{text/segment → segment}/named_entity.rb +11 -11
- data/lib/rbbt/segment/overlaps.rb +63 -0
- data/lib/rbbt/segment/range_index.rb +35 -0
- data/lib/rbbt/{text/segment → segment}/segmented.rb +1 -1
- data/lib/rbbt/segment/token.rb +23 -0
- data/lib/rbbt/{text/segment → segment}/transformed.rb +7 -9
- data/lib/rbbt/segment/tsv.rb +41 -0
- data/share/install/software/Linnaeus +1 -1
- data/test/rbbt/document/corpus/test_pubmed.rb +15 -0
- data/test/rbbt/document/test_annotation.rb +140 -0
- data/test/rbbt/document/test_corpus.rb +33 -0
- data/test/rbbt/ner/test_finder.rb +3 -3
- data/test/rbbt/ner/test_g_norm_plus.rb +11 -1
- data/test/rbbt/ner/test_patterns.rb +9 -9
- data/test/rbbt/ner/test_regexpNER.rb +14 -14
- data/test/rbbt/ner/test_rnorm.rb +3 -4
- data/test/rbbt/ner/test_token_trieNER.rb +1 -0
- data/test/rbbt/nlp/genia/test_sentence_splitter.rb +13 -3
- data/test/rbbt/nlp/open_nlp/test_sentence_splitter.rb +20 -4
- data/test/rbbt/segment/test_annotation.rb +40 -0
- data/test/rbbt/segment/test_corpus.rb +36 -0
- data/test/rbbt/segment/test_encoding.rb +24 -0
- data/test/rbbt/{text/segment → segment}/test_named_entity.rb +12 -9
- data/test/rbbt/segment/test_overlaps.rb +69 -0
- data/test/rbbt/segment/test_range_index.rb +43 -0
- data/test/rbbt/{text/segment → segment}/test_transformed.rb +76 -51
- data/test/rbbt/test_document.rb +14 -0
- data/test/rbbt/test_segment.rb +187 -0
- data/test/test_helper.rb +5 -3
- metadata +40 -32
- data/lib/rbbt/text/corpus.rb +0 -106
- data/lib/rbbt/text/corpus/document.rb +0 -383
- data/lib/rbbt/text/corpus/document_repo.rb +0 -68
- data/lib/rbbt/text/corpus/sources/pmid.rb +0 -34
- data/lib/rbbt/text/document.rb +0 -39
- data/lib/rbbt/text/segment.rb +0 -363
- data/lib/rbbt/text/segment/docid.rb +0 -46
- data/lib/rbbt/text/segment/relationship.rb +0 -24
- data/lib/rbbt/text/segment/token.rb +0 -49
- data/test/rbbt/text/corpus/sources/test_pmid.rb +0 -33
- data/test/rbbt/text/corpus/test_document.rb +0 -82
- data/test/rbbt/text/segment/test_relationship.rb +0 -0
- data/test/rbbt/text/segment/test_segmented.rb +0 -23
- data/test/rbbt/text/test_corpus.rb +0 -34
- data/test/rbbt/text/test_document.rb +0 -58
- data/test/rbbt/text/test_segment.rb +0 -100
@@ -1,34 +0,0 @@
|
|
1
|
-
require 'rbbt/sources/pubmed'
|
2
|
-
|
3
|
-
class Corpus
|
4
|
-
|
5
|
-
NAMESPACES = {} unless defined? NAMESPACES
|
6
|
-
NAMESPACES[:pubmed] = :add_pmid
|
7
|
-
|
8
|
-
def add_pmid(pmid, type = nil)
|
9
|
-
pmids = Array === pmid ? pmid : [pmid]
|
10
|
-
type = nil if String === type and type.empty?
|
11
|
-
|
12
|
-
PubMed.get_article(pmids).collect do |pmid, article|
|
13
|
-
Log.debug "Loading pmid #{pmid}"
|
14
|
-
if type.nil? || type.to_sym == :abstract
|
15
|
-
add_document(article.abstract || "", :PMID, pmid, :abstract)
|
16
|
-
elsif type.to_sym == :title
|
17
|
-
add_document(article.title, :PMID, pmid, :title)
|
18
|
-
else
|
19
|
-
raise "No FullText available for #{ pmid }" if article.full_text.nil?
|
20
|
-
add_document(article.full_text, :PMID, pmid, :fulltext)
|
21
|
-
end
|
22
|
-
end
|
23
|
-
end
|
24
|
-
|
25
|
-
def add_pubmed_query(query, max = 3000, type = nil)
|
26
|
-
pmids = PubMed.query(query, max)
|
27
|
-
add_pmid(pmids, type)
|
28
|
-
end
|
29
|
-
|
30
|
-
self.claim "PMID" do |id, type|
|
31
|
-
Log.debug "Claiming #{id}"
|
32
|
-
self.add_pmid(id, type)
|
33
|
-
end
|
34
|
-
end
|
data/lib/rbbt/text/document.rb
DELETED
@@ -1,39 +0,0 @@
|
|
1
|
-
require 'rbbt-util'
|
2
|
-
require 'rbbt/entity'
|
3
|
-
|
4
|
-
require 'rbbt/text/corpus'
|
5
|
-
|
6
|
-
module Document
|
7
|
-
extend Entity
|
8
|
-
class << self
|
9
|
-
attr_accessor :corpus
|
10
|
-
end
|
11
|
-
|
12
|
-
property :document => :single do
|
13
|
-
Document.corpus.docid(self)
|
14
|
-
end
|
15
|
-
|
16
|
-
property :type => :single do |type|
|
17
|
-
self.annotate((self.split(":").values_at(0,1)) * ":" + ":" + type.to_s)
|
18
|
-
end
|
19
|
-
|
20
|
-
property :title => :single do
|
21
|
-
type(:title).text
|
22
|
-
end
|
23
|
-
|
24
|
-
property :full_text => :single do
|
25
|
-
type(:full_text).text
|
26
|
-
end
|
27
|
-
|
28
|
-
property :abstract => :single do
|
29
|
-
type(:abstract).text
|
30
|
-
end
|
31
|
-
|
32
|
-
property :text => :single do
|
33
|
-
document.text
|
34
|
-
end
|
35
|
-
|
36
|
-
property :entities => :single do |type,*args|
|
37
|
-
document.method(type).call *args
|
38
|
-
end
|
39
|
-
end
|
data/lib/rbbt/text/segment.rb
DELETED
@@ -1,363 +0,0 @@
|
|
1
|
-
require 'rbbt/annotations'
|
2
|
-
require 'rbbt/fix_width_table'
|
3
|
-
|
4
|
-
module Segment
|
5
|
-
extend Annotation
|
6
|
-
self.annotation :offset, :docid
|
7
|
-
|
8
|
-
def segment_length
|
9
|
-
begin
|
10
|
-
super()
|
11
|
-
rescue
|
12
|
-
self.length
|
13
|
-
end
|
14
|
-
end
|
15
|
-
|
16
|
-
#{{{ Ranges
|
17
|
-
|
18
|
-
def end
|
19
|
-
return nil if offset.nil?
|
20
|
-
offset.to_i + segment_length - 1
|
21
|
-
end
|
22
|
-
|
23
|
-
def range
|
24
|
-
raise "No offset specified" if offset.nil?
|
25
|
-
(offset.to_i..self.end)
|
26
|
-
end
|
27
|
-
|
28
|
-
def pull(offset)
|
29
|
-
if self.offset.nil? or offset.nil?
|
30
|
-
self.offset = nil
|
31
|
-
else
|
32
|
-
self.offset += offset
|
33
|
-
end
|
34
|
-
|
35
|
-
self
|
36
|
-
end
|
37
|
-
|
38
|
-
def push(offset)
|
39
|
-
if self.offset.nil? or offset.nil?
|
40
|
-
self.offset = nil
|
41
|
-
else
|
42
|
-
self.offset -= offset
|
43
|
-
end
|
44
|
-
|
45
|
-
self
|
46
|
-
end
|
47
|
-
|
48
|
-
def make_relative(segments, &block)
|
49
|
-
if block_given?
|
50
|
-
segments.each{|s| s.push offset}
|
51
|
-
yield(segments)
|
52
|
-
segments.each{|s| s.pull offset}
|
53
|
-
else
|
54
|
-
segments.each{|s| s.push offset}
|
55
|
-
end
|
56
|
-
end
|
57
|
-
|
58
|
-
def range_in(container = nil)
|
59
|
-
raise "No offset specified" if offset.nil?
|
60
|
-
case
|
61
|
-
when (Segment === container and not container.offset.nil?)
|
62
|
-
((offset - container.offset)..(self.end - container.offset))
|
63
|
-
when Integer === container
|
64
|
-
((offset - container)..(self.end - container))
|
65
|
-
else
|
66
|
-
range
|
67
|
-
end
|
68
|
-
end
|
69
|
-
|
70
|
-
def includes?(segment)
|
71
|
-
(segment.offset.to_i >= self.offset.to_i) and
|
72
|
-
(segment.offset.to_i + segment.segment_length.to_i <= self.offset.to_i + self.segment_length.to_i)
|
73
|
-
end
|
74
|
-
|
75
|
-
def overlaps?(segment)
|
76
|
-
segment.offset.to_i >= self.offset.to_i && segment.offset.to_i <= self.end ||
|
77
|
-
self.offset.to_i >= segment.offset.to_i && self.offset.to_i <= segment.end
|
78
|
-
end
|
79
|
-
|
80
|
-
def overlaps(segments)
|
81
|
-
segments.select{|s| self.overlaps?(s)}
|
82
|
-
end
|
83
|
-
|
84
|
-
|
85
|
-
def self.collisions(main, secondary)
|
86
|
-
secondary.select do |ss|
|
87
|
-
main.select{|ms| ms.overlaps? ss }.any?
|
88
|
-
end
|
89
|
-
end
|
90
|
-
|
91
|
-
#{{{ Sorting
|
92
|
-
|
93
|
-
def self.sort(segments, inline = true)
|
94
|
-
if inline
|
95
|
-
segments.sort do |a,b|
|
96
|
-
case
|
97
|
-
when ((a.nil? and b.nil?) or (a.offset.nil? and b.offset.nil?))
|
98
|
-
0
|
99
|
-
when (a.nil? or a.offset.nil?)
|
100
|
-
-1
|
101
|
-
when (b.nil? or b.offset.nil?)
|
102
|
-
+1
|
103
|
-
when (not a.range.include? b.offset.to_i and not b.range.include? a.offset.to_i)
|
104
|
-
a.offset.to_i <=> b.offset.to_i
|
105
|
-
else
|
106
|
-
a.segment_length <=> b.segment_length
|
107
|
-
end
|
108
|
-
end
|
109
|
-
else
|
110
|
-
segments.sort_by do |segment| segment.offset.to_i || 0 end.reverse
|
111
|
-
end
|
112
|
-
end
|
113
|
-
|
114
|
-
def self.overlaps(sorted_segments)
|
115
|
-
last = nil
|
116
|
-
overlaped = []
|
117
|
-
|
118
|
-
sorted_segments.reverse.each do |segment|
|
119
|
-
overlaped << segment if (not last.nil?) and segment.range.end > last
|
120
|
-
last = segment.range.begin
|
121
|
-
end
|
122
|
-
|
123
|
-
overlaped
|
124
|
-
end
|
125
|
-
|
126
|
-
def self.clean_sort(segments)
|
127
|
-
sorted = sort(segments).reject{|s| s.offset.nil?}
|
128
|
-
overlaps = overlaps(sorted)
|
129
|
-
overlaps.each do |s|
|
130
|
-
sorted.delete s
|
131
|
-
end
|
132
|
-
|
133
|
-
sorted
|
134
|
-
end
|
135
|
-
|
136
|
-
#{{{ Splitting
|
137
|
-
|
138
|
-
def self.split(text, segments, skip_segments = false)
|
139
|
-
sorted_segments = clean_sort segments
|
140
|
-
|
141
|
-
chunks = []
|
142
|
-
segment_end = 0
|
143
|
-
text_offset = 0
|
144
|
-
sorted_segments.each do |segment|
|
145
|
-
return chunks if text.nil? or text.empty?
|
146
|
-
next if segment.offset.nil?
|
147
|
-
offset = segment.offset - text_offset
|
148
|
-
|
149
|
-
# Consider segment offset. Save pre, or skip if overlap
|
150
|
-
case
|
151
|
-
when offset < 0 # Overlap, skip
|
152
|
-
next
|
153
|
-
when offset > 0 # Save pre
|
154
|
-
chunk = text[0..offset - 1]
|
155
|
-
Segment.setup(chunk, text_offset)
|
156
|
-
chunks << chunk
|
157
|
-
end
|
158
|
-
|
159
|
-
segment_end = offset + segment.segment_length - 1
|
160
|
-
|
161
|
-
if not skip_segments
|
162
|
-
chunk = text[offset..segment_end]
|
163
|
-
Segment.setup(chunk, text_offset + offset)
|
164
|
-
chunks << chunk
|
165
|
-
end
|
166
|
-
|
167
|
-
text_offset += segment_end + 1
|
168
|
-
text = text[segment_end + 1..-1]
|
169
|
-
|
170
|
-
end
|
171
|
-
|
172
|
-
if not text.nil? and not text.empty?
|
173
|
-
chunk = text.dup
|
174
|
-
Segment.setup(chunk, text_offset)
|
175
|
-
chunks << chunk
|
176
|
-
end
|
177
|
-
|
178
|
-
chunks
|
179
|
-
end
|
180
|
-
|
181
|
-
|
182
|
-
#{{{ Align
|
183
|
-
|
184
|
-
def self.align(text, parts)
|
185
|
-
pre_offset = 0
|
186
|
-
parts.each do |part|
|
187
|
-
offset = text.index part
|
188
|
-
next if offset.nil?
|
189
|
-
Segment.setup(part, pre_offset + offset)
|
190
|
-
pre_offset += offset + part.segment_length - 1
|
191
|
-
text = text[(offset + part.segment_length - 1)..-1]
|
192
|
-
end
|
193
|
-
end
|
194
|
-
|
195
|
-
#{{{ Index
|
196
|
-
|
197
|
-
class Index
|
198
|
-
attr_accessor :index, :data
|
199
|
-
def initialize(index, data)
|
200
|
-
@index = index
|
201
|
-
@data = data
|
202
|
-
end
|
203
|
-
|
204
|
-
def [](pos)
|
205
|
-
index[pos].collect{|id| data[id]}
|
206
|
-
end
|
207
|
-
end
|
208
|
-
|
209
|
-
def self.index(segments, persist_file = :memory)
|
210
|
-
segments = segments.values.flatten if Hash === segments
|
211
|
-
|
212
|
-
annotation_index =
|
213
|
-
Persist.persist("Segment_index", :fwt, :persist => (! (persist_file.nil? or persist_file == :memory)), :file => persist_file) do
|
214
|
-
|
215
|
-
value_size = 0
|
216
|
-
index_data = segments.collect{|segment|
|
217
|
-
next if segment.offset.nil?
|
218
|
-
range = segment.range
|
219
|
-
value_size = [segment.id.length, value_size].max
|
220
|
-
[segment.id, [range.begin, range.end]]
|
221
|
-
}.compact
|
222
|
-
|
223
|
-
fwt = FixWidthTable.get :memory, value_size, true
|
224
|
-
fwt.add_range index_data
|
225
|
-
|
226
|
-
fwt
|
227
|
-
end
|
228
|
-
|
229
|
-
data = {}
|
230
|
-
segments.each do |segment| data[segment.id] = segment end
|
231
|
-
Index.new annotation_index, data
|
232
|
-
end
|
233
|
-
|
234
|
-
#{{{ Save and load
|
235
|
-
|
236
|
-
def self.tsv_values_for_segment(segment, fields)
|
237
|
-
info = segment.info
|
238
|
-
values = []
|
239
|
-
|
240
|
-
fields.each do |field|
|
241
|
-
values << case
|
242
|
-
when field == "JSON"
|
243
|
-
info.to_json
|
244
|
-
when field == "literal"
|
245
|
-
segment.gsub(/\n|\t/, ' ')
|
246
|
-
when field == "Start"
|
247
|
-
segment.offset
|
248
|
-
when field == "End"
|
249
|
-
segment.end
|
250
|
-
else
|
251
|
-
info.delete(field.to_sym)
|
252
|
-
end
|
253
|
-
end
|
254
|
-
|
255
|
-
values
|
256
|
-
end
|
257
|
-
|
258
|
-
def self.load_tsv_values(text, values, fields)
|
259
|
-
info = {}
|
260
|
-
literal_pos = fields.index "literal"
|
261
|
-
|
262
|
-
object = if literal_pos.nil?
|
263
|
-
""
|
264
|
-
else
|
265
|
-
v = values[literal_pos]
|
266
|
-
v = v.first if Array === v
|
267
|
-
v
|
268
|
-
end
|
269
|
-
|
270
|
-
fields.each_with_index do |field, i|
|
271
|
-
if field == "JSON"
|
272
|
-
JSON.parse(values[i]).each do |key, value|
|
273
|
-
info[key.to_sym] = value
|
274
|
-
end
|
275
|
-
else
|
276
|
-
info[field.to_sym] = values[i]
|
277
|
-
end
|
278
|
-
end
|
279
|
-
|
280
|
-
start = info.delete(:Start)
|
281
|
-
if not (start.nil? or ((Array === start or String === start) and start.empty?))
|
282
|
-
if Array === start
|
283
|
-
start = start.first
|
284
|
-
end
|
285
|
-
start = start.to_i
|
286
|
-
info[:offset] = start
|
287
|
-
|
288
|
-
eend = info.delete(:End)
|
289
|
-
if Array === eend
|
290
|
-
eend = eend.first
|
291
|
-
end
|
292
|
-
eend = eend.to_i
|
293
|
-
|
294
|
-
if object.empty?
|
295
|
-
object.replace text[start..eend]
|
296
|
-
end
|
297
|
-
end
|
298
|
-
|
299
|
-
info[:annotation_types] = [Segment] unless info.include? :annotation_types
|
300
|
-
|
301
|
-
Annotated.load_entity(object, info)
|
302
|
-
end
|
303
|
-
|
304
|
-
def self.set_tsv_fields(fields, segments)
|
305
|
-
tsv_fields = []
|
306
|
-
add_types = ! (fields.delete(:no_types) || fields.delete("no_types") || fields.include?(:JSON) || fields.include?("JSON"))
|
307
|
-
literal = (fields.delete(:literal) || fields.delete("literal"))
|
308
|
-
tsv_fields << "Start" << "End"
|
309
|
-
tsv_fields << :annotation_types if add_types
|
310
|
-
tsv_fields << :literal if literal
|
311
|
-
|
312
|
-
if fields.any? and not (fields == [:all] or fields == ["all"])
|
313
|
-
tsv_fields.concat fields
|
314
|
-
else
|
315
|
-
tsv_fields.concat segments.first.annotations if segments.any?
|
316
|
-
end
|
317
|
-
tsv_fields
|
318
|
-
tsv_fields.collect!{|f| f.to_s}
|
319
|
-
tsv_fields.delete "offset"
|
320
|
-
tsv_fields
|
321
|
-
end
|
322
|
-
|
323
|
-
def self.tsv(segments, *fields)
|
324
|
-
fields = set_tsv_fields fields, segments
|
325
|
-
tsv = TSV.setup({}, :key_field => "ID", :fields => fields, :type => :double)
|
326
|
-
|
327
|
-
segments.each do |segment|
|
328
|
-
tsv[segment.segment_id] = self.tsv_values_for_segment(segment, fields)
|
329
|
-
end
|
330
|
-
|
331
|
-
tsv
|
332
|
-
end
|
333
|
-
|
334
|
-
def self.load_tsv(tsv)
|
335
|
-
fields = tsv.fields
|
336
|
-
tsv.with_unnamed do
|
337
|
-
tsv.collect do |id, values|
|
338
|
-
Annotated.load_tsv_values(id, values, fields)
|
339
|
-
end
|
340
|
-
end
|
341
|
-
end
|
342
|
-
|
343
|
-
def ansi(color)
|
344
|
-
Log.color color, self
|
345
|
-
end
|
346
|
-
|
347
|
-
def locus
|
348
|
-
[offset, self.end] * ".."
|
349
|
-
end
|
350
|
-
|
351
|
-
def segment_id
|
352
|
-
if self.respond_to?(:docid)
|
353
|
-
[docid, locus, Misc.obj2digest(info)] * ":"
|
354
|
-
else
|
355
|
-
Misc.obj2digest(info)
|
356
|
-
end
|
357
|
-
end
|
358
|
-
|
359
|
-
#def ==(other)
|
360
|
-
# self.text == other.text
|
361
|
-
#end
|
362
|
-
end
|
363
|
-
|
@@ -1,46 +0,0 @@
|
|
1
|
-
require 'rbbt/text/segment'
|
2
|
-
|
3
|
-
#module SegmentWithDocid
|
4
|
-
# extend Annotation
|
5
|
-
#
|
6
|
-
# self.annotation :docid
|
7
|
-
#
|
8
|
-
# def masked?
|
9
|
-
# self[0..5] == "MASKED"
|
10
|
-
# end
|
11
|
-
#
|
12
|
-
# def mask
|
13
|
-
# return self if masked?
|
14
|
-
# raise "Cannot mask an array of elements, they must be masked individually" if Array === self
|
15
|
-
# raise "Cannot mask a segment with no docid" if not self.respond_to? :docid or docid.nil?
|
16
|
-
# raise "Cannot mask a segment with no offset" if offset.nil?
|
17
|
-
# textual_position = ["MASKED", length] * ":"
|
18
|
-
# self.replace(textual_position)
|
19
|
-
# self
|
20
|
-
# end
|
21
|
-
#
|
22
|
-
# def unmasked_text
|
23
|
-
# return self unless masked?
|
24
|
-
# tag, length = self.split(":")
|
25
|
-
# Document.setup(docid).text[offset.to_i..(offset.to_i+length.to_i-1)]
|
26
|
-
# end
|
27
|
-
#
|
28
|
-
# def unmask
|
29
|
-
# return self unless masked?
|
30
|
-
# self.replace(unmasked_text)
|
31
|
-
# self
|
32
|
-
# end
|
33
|
-
#
|
34
|
-
# def str_length
|
35
|
-
# self.length
|
36
|
-
# end
|
37
|
-
#
|
38
|
-
# def masked_length
|
39
|
-
# self.split(":").last.to_i
|
40
|
-
# end
|
41
|
-
#
|
42
|
-
# def segment_length
|
43
|
-
# masked? ? masked_length : str_length
|
44
|
-
# end
|
45
|
-
#end
|
46
|
-
#
|