rbbt-text 1.1.9 → 1.3.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/rbbt/bow/bow.rb +5 -2
- data/lib/rbbt/bow/dictionary.rb +27 -23
- data/lib/rbbt/document.rb +56 -0
- data/lib/rbbt/document/annotation.rb +45 -0
- data/lib/rbbt/document/corpus.rb +61 -0
- data/lib/rbbt/document/corpus/pubmed.rb +33 -0
- data/lib/rbbt/ner/NER.rb +3 -3
- data/lib/rbbt/ner/abner.rb +1 -1
- data/lib/rbbt/ner/banner.rb +1 -1
- data/lib/rbbt/ner/brat.rb +1 -1
- data/lib/rbbt/ner/chemical_tagger.rb +1 -2
- data/lib/rbbt/ner/g_norm_plus.rb +42 -12
- data/lib/rbbt/ner/linnaeus.rb +3 -3
- data/lib/rbbt/ner/ngram_prefix_dictionary.rb +3 -3
- data/lib/rbbt/ner/oscar3.rb +1 -2
- data/lib/rbbt/ner/oscar4.rb +3 -3
- data/lib/rbbt/ner/patterns.rb +5 -5
- data/lib/rbbt/ner/regexpNER.rb +1 -2
- data/lib/rbbt/ner/token_trieNER.rb +35 -22
- data/lib/rbbt/nlp/genia/sentence_splitter.rb +3 -2
- data/lib/rbbt/nlp/nlp.rb +5 -5
- data/lib/rbbt/nlp/open_nlp/sentence_splitter.rb +37 -36
- data/lib/rbbt/nlp/spaCy.rb +52 -0
- data/lib/rbbt/segment.rb +179 -0
- data/lib/rbbt/segment/annotation.rb +58 -0
- data/lib/rbbt/segment/encoding.rb +18 -0
- data/lib/rbbt/{text/segment → segment}/named_entity.rb +11 -10
- data/lib/rbbt/segment/overlaps.rb +63 -0
- data/lib/rbbt/segment/range_index.rb +35 -0
- data/lib/rbbt/segment/relationship.rb +7 -0
- data/lib/rbbt/{text/segment → segment}/segmented.rb +1 -1
- data/lib/rbbt/segment/token.rb +23 -0
- data/lib/rbbt/{text/segment → segment}/transformed.rb +10 -8
- data/lib/rbbt/segment/tsv.rb +41 -0
- data/share/install/software/Linnaeus +1 -1
- data/share/install/software/OpenNLP +1 -1
- data/test/rbbt/document/corpus/test_pubmed.rb +15 -0
- data/test/rbbt/document/test_annotation.rb +140 -0
- data/test/rbbt/document/test_corpus.rb +33 -0
- data/test/rbbt/ner/test_finder.rb +3 -3
- data/test/rbbt/ner/test_g_norm_plus.rb +20 -3
- data/test/rbbt/ner/test_patterns.rb +9 -9
- data/test/rbbt/ner/test_regexpNER.rb +14 -14
- data/test/rbbt/ner/test_rnorm.rb +3 -4
- data/test/rbbt/ner/test_token_trieNER.rb +1 -0
- data/test/rbbt/nlp/genia/test_sentence_splitter.rb +37 -3
- data/test/rbbt/nlp/open_nlp/test_sentence_splitter.rb +20 -4
- data/test/rbbt/segment/test_annotation.rb +39 -0
- data/test/rbbt/segment/test_corpus.rb +36 -0
- data/test/rbbt/segment/test_encoding.rb +24 -0
- data/test/rbbt/{text/segment → segment}/test_named_entity.rb +15 -11
- data/test/rbbt/segment/test_overlaps.rb +69 -0
- data/test/rbbt/segment/test_range_index.rb +42 -0
- data/test/rbbt/{text/segment → segment}/test_transformed.rb +105 -51
- data/test/rbbt/test_document.rb +14 -0
- data/test/rbbt/test_segment.rb +182 -0
- data/test/test_helper.rb +5 -3
- data/test/test_spaCy.rb +32 -0
- metadata +44 -32
- data/lib/rbbt/text/corpus.rb +0 -106
- data/lib/rbbt/text/corpus/document.rb +0 -361
- data/lib/rbbt/text/corpus/document_repo.rb +0 -68
- data/lib/rbbt/text/corpus/sources/pmid.rb +0 -34
- data/lib/rbbt/text/document.rb +0 -39
- data/lib/rbbt/text/segment.rb +0 -355
- data/lib/rbbt/text/segment/docid.rb +0 -46
- data/lib/rbbt/text/segment/relationship.rb +0 -24
- data/lib/rbbt/text/segment/token.rb +0 -49
- data/test/rbbt/text/corpus/sources/test_pmid.rb +0 -33
- data/test/rbbt/text/corpus/test_document.rb +0 -52
- data/test/rbbt/text/segment/test_relationship.rb +0 -0
- data/test/rbbt/text/segment/test_segmented.rb +0 -23
- data/test/rbbt/text/test_corpus.rb +0 -34
- data/test/rbbt/text/test_document.rb +0 -58
- data/test/rbbt/text/test_segment.rb +0 -100
data/lib/rbbt/segment.rb
ADDED
@@ -0,0 +1,179 @@
|
|
1
|
+
require 'rbbt-util'
|
2
|
+
require 'rbbt/entity'
|
3
|
+
require 'rbbt/document'
|
4
|
+
|
5
|
+
# Identifier string for a segment. It is read as colon-separated fields: the
# first four fields form the document id and the fifth is a "begin..end"
# character range. The +corpus+ annotation is passed on when the document is
# looked up.
module SegID
  extend Entity
  self.annotation :corpus

  # Colon-separated fields of the identifier (memoized).
  def _parts
    @parts ||= self.split(":")
  end

  # Range parsed from the fifth field, e.g. "3..7" becomes 3..7 (memoized).
  def range
    @range ||= begin
                 bounds = _parts[4].split("..").map(&:to_i)
                 Range.new(*bounds)
               end
  end

  # DocID built from the first four fields joined back with ":" (memoized).
  def docid
    @docid ||= DocID.setup(_parts.first(4).join(":"))
  end

  # First position of the range.
  def offset
    range.begin
  end

  # Number of positions in the range, both ends included.
  def segment_length
    1 + range.end - range.begin
  end

  # Loads the document through DocID (with this id's corpus), slices it by
  # +range+ and sets the slice up as a Segment with docid and offset.
  property :segment => :single do
    source_docid = self.docid
    document = DocID.setup(source_docid, :corpus => corpus).document

    Segment.setup(document[range], :docid => source_docid, :offset => offset)
  end

  # A SegID is its own segid.
  property :segid do
    self
  end

end
|
43
|
+
|
44
|
+
module Segment
|
45
|
+
extend Entity
|
46
|
+
self.annotation :offset, :docid
|
47
|
+
|
48
|
+
# Length of the segment: the length of the underlying object.
def segment_length
  length
end

# Position of the last character: offset (nil counts as 0 through +to_i+)
# plus length, minus one.
def eend
  length - 1 + offset.to_i
end

# Inclusive range running from the offset to +eend+.
def range
  Range.new(offset.to_i, eend)
end

# SegID made of the docid and the range joined with ":", annotated with the
# optional +corpus+.
property :segid do |corpus=nil|
  SegID.setup([docid, range].join(":"), :corpus => corpus)
end

alias id segid

# A Segment is its own segment.
property :segment do
  self
end
|
69
|
+
|
70
|
+
# Sorts +segments+ by their position.
#
# With +inline+ true (the default):
# * entries that are nil or have a nil offset compare equal to each other and
#   sort before every placed segment;
# * two segments where neither range contains the other's offset are ordered
#   by offset;
# * otherwise (they overlap) the shorter segment goes first.
#
# With +inline+ false the segments are ordered by offset (nil counts as 0) and
# the result is reversed, so the highest offset comes first.
#
# Returns a new array.
def self.sort(segments, inline = true)
  return segments.sort_by { |segment| segment.offset.to_i }.reverse unless inline

  segments.sort do |a, b|
    # Evaluate the nil checks once per side so a nil element is never sent
    # +offset+ (the mixed nil / non-nil case used to raise NoMethodError).
    a_unplaced = a.nil? || a.offset.nil?
    b_unplaced = b.nil? || b.offset.nil?

    if a_unplaced && b_unplaced
      0
    elsif a_unplaced
      -1
    elsif b_unplaced
      1
    elsif !a.range.include?(b.offset.to_i) && !b.range.include?(a.offset.to_i)
      a.offset.to_i <=> b.offset.to_i
    else
      a.segment_length <=> b.segment_length
    end
  end
end
|
90
|
+
|
91
|
+
# Returns the segments of +sorted_segments+ that run into the segment that
# follows them in the list.
#
# The list is walked from last to first; a segment is reported when its range
# end reaches the beginning of the segment visited just before it (its
# successor in the list). Range ends are inclusive, so a segment whose last
# position equals the successor's first position shares a character with it
# and is reported too.
#
# The result is in walk order (later segments first).
def self.overlaps(sorted_segments)
  successor_begin = nil
  overlapped = []

  sorted_segments.reverse_each do |segment|
    overlapped << segment if successor_begin && segment.range.end >= successor_begin
    successor_begin = segment.range.begin
  end

  overlapped
end
|
102
|
+
|
103
|
+
# Sorts +segments+, drops those without an offset and then drops every
# segment reported by +overlaps+.
#
# Overlapped segments are removed by object identity. Removing them with
# Array#delete compares by ==, which would also discard any other segment
# that happens to compare equal to an overlapped one.
#
# Returns a new array of the remaining segments in sorted order.
def self.clean_sort(segments)
  sorted = sort(segments).reject { |s| s.offset.nil? }

  overlapped = {}.compare_by_identity
  overlaps(sorted).each { |s| overlapped[s] = true }

  sorted.reject { |s| overlapped.key?(s) }
end
|
112
|
+
|
113
|
+
# Cuts +text+ into consecutive chunks around +segments+.
#
# The segments are first passed through +clean_sort+. For each one, the text
# lying before it becomes a chunk, and the text it covers becomes a chunk as
# well unless +skip_segments+ is true. Whatever text is left after the last
# segment becomes a final chunk. Every chunk is set up as a Segment with its
# offset in the original text.
#
# A segment starting before the already consumed text is skipped. When the
# remaining text runs out the chunks collected so far are returned.
def self.split(text, segments, skip_segments = false)
  pieces = []
  consumed = 0

  clean_sort(segments).each do |segment|
    return pieces if text.nil? or text.empty?
    next if segment.offset.nil?

    # Position of the segment inside the text that is still unprocessed
    relative_start = segment.offset - consumed
    next if relative_start < 0 # Overlap, skip

    if relative_start > 0 # Save pre
      leading = text[0..relative_start - 1]
      Segment.setup(leading, consumed)
      pieces << leading
    end

    relative_end = relative_start + segment.segment_length - 1

    unless skip_segments
      covered = text[relative_start..relative_end]
      Segment.setup(covered, consumed + relative_start)
      pieces << covered
    end

    consumed += relative_end + 1
    text = text[relative_end + 1..-1]
  end

  unless text.nil? or text.empty?
    trailing = text.dup
    Segment.setup(trailing, consumed)
    pieces << trailing
  end

  pieces
end
|
155
|
+
|
156
|
+
# Locates each of +parts+ in +text+, in order, and sets it up as a Segment
# with the offset where it was found (and the docid of +text+, when +text+
# responds to +docid+).
#
# Each part is searched for only in the text that follows the previous match.
# Parts that are not found are left untouched. Returns +parts+.
def self.align(text, parts)
  consumed = 0
  docid = text.respond_to?(:docid) ? text.docid : nil

  parts.each do |part|
    found_at = text.index part
    next if found_at.nil?

    Segment.setup(part, consumed + found_at, docid)

    # Move past the whole match. Stopping one character short would leave the
    # last matched character in the text, so a one-character part would never
    # advance and a repeated part would be given the same offset again.
    advance = found_at + part.segment_length
    consumed += advance
    text = text[advance..-1]
  end
end
|
167
|
+
|
168
|
+
# Builds a range index; all arguments are handed unchanged to
# Segment::RangeIndex.index.
def self.index(*args)
  Segment::RangeIndex.index(*args)
end
|
171
|
+
|
172
|
+
end
|
173
|
+
|
174
|
+
require 'rbbt/segment/range_index'
|
175
|
+
require 'rbbt/segment/overlaps'
|
176
|
+
require 'rbbt/segment/transformed'
|
177
|
+
require 'rbbt/segment/segmented'
|
178
|
+
require 'rbbt/segment/encoding'
|
179
|
+
|
@@ -0,0 +1,58 @@
|
|
1
|
+
require 'rbbt-util'
|
2
|
+
require 'rbbt/segment'
|
3
|
+
require 'rbbt/entity'
|
4
|
+
|
5
|
+
# Identifier string for an annotation. It is read as colon-separated fields:
# the first five form the segment id and the sixth is the annotation type.
# The +corpus+ annotation is passed on when the segment is looked up.
module AnnotID
  extend Entity
  self.annotation :corpus

  # Colon-separated fields of the identifier (memoized).
  def _parts
    @parts ||= self.split(":")
  end

  # First five fields joined back with ":" (memoized).
  def segid
    @segid ||= _parts.first(5).join(":")
  end

  # Sixth field (memoized).
  def type
    @type ||= _parts[5]
  end

  # Resolves the segment through SegID (with this id's corpus) and sets it up
  # as a SegmentAnnotation of this id's type.
  property :annotation do
    resolved = SegID.setup(segid, :corpus => corpus).segment

    SegmentAnnotation.setup(resolved, :type => type)
  end

  # An AnnotID is its own annotid.
  property :annotid do
    self
  end

end
|
32
|
+
|
33
|
+
# A Segment that additionally carries a +type+ annotation.
module SegmentAnnotation
  extend Entity
  include Object::Segment
  self.annotation :type

  # Returns the object itself when it already is a SegID, defers to the
  # inherited +segid+ when it is a Segment, and raises otherwise.
  property :segid do
    if SegID === self
      self
    elsif Segment === self
      super()
    else
      raise "Unknown object: #{self}"
    end
  end

  # AnnotID made of the segid, the type and a digest of the annotation info
  # joined with ":", annotated with the optional +corpus+.
  property :annotid do |corpus=nil|
    fields = [segid, type, Misc.obj2digest(self.info)]
    AnnotID.setup(fields.join(":"), :corpus => corpus)
  end

  alias id annotid

  # A SegmentAnnotation is its own annotation.
  property :annotation do
    self
  end
end
|
@@ -0,0 +1,18 @@
|
|
1
|
+
require 'rbbt/segment'
|
2
|
+
module Segment
  # Collects every non-ASCII character of +text+ as a Segment whose offset is
  # the character's index in +text+.
  def self.bad_chars(text)
    text.chars.each_with_index.each_with_object([]) do |(char, position), found|
      found << Segment.setup(char, :offset => position) unless char.ascii_only?
    end
  end

  # Runs Transformed.with_transform over +text+, using the non-ASCII
  # characters as the segments and +replace+ ("?" when nil) as the
  # replacement. The block is forwarded.
  def self.ascii(text, replace = nil, &block)
    non_ascii = bad_chars(text)
    replacement = replace.nil? ? "?" : replace
    Transformed.with_transform(text, non_ascii, replacement, &block)
  end
end
|
@@ -1,17 +1,18 @@
|
|
1
|
-
require 'rbbt/
|
2
|
-
require 'rbbt/
|
1
|
+
require 'rbbt/segment'
|
2
|
+
require 'rbbt/segment/annotation'
|
3
3
|
|
4
|
-
module NamedEntity
|
4
|
+
module NamedEntity
|
5
5
|
extend Entity
|
6
6
|
include Segment
|
7
|
+
include SegmentAnnotation
|
7
8
|
|
8
|
-
self.annotation :
|
9
|
+
self.annotation :entity_type, :code, :score
|
9
10
|
|
10
11
|
def report
|
11
12
|
<<-EOF
|
12
13
|
String: #{ self }
|
13
14
|
Offset: #{ offset.inspect }
|
14
|
-
Type: #{
|
15
|
+
Type: #{entity_type.inspect}
|
15
16
|
Code: #{code.inspect}
|
16
17
|
Score: #{score.inspect}
|
17
18
|
EOF
|
@@ -20,7 +21,7 @@ Score: #{score.inspect}
|
|
20
21
|
def html
|
21
22
|
text = <<-EOF
|
22
23
|
<span class='Entity'\
|
23
|
-
#{
|
24
|
+
#{entity_type.nil? ? "" : " attr-entity-type='#{Array === entity_type ? entity_type * " " : entity_type}'"}\
|
24
25
|
#{code.nil? ? "" : " attr-entity-code='#{Array === code ? code * " " : code}'"}\
|
25
26
|
#{score.nil? ? "" : " attr-entity-score='#{Array === score ? score * " " : score}'"}\
|
26
27
|
>#{ self }</span>
|
@@ -29,13 +30,14 @@ Score: #{score.inspect}
|
|
29
30
|
end
|
30
31
|
|
31
32
|
def entity(params = nil)
|
33
|
+
code = self.code || self.dup
|
32
34
|
format, entity = code.split(":")
|
33
35
|
entity, format = format, nil if entity.nil?
|
34
|
-
|
35
|
-
if defined?(Entity) && Entity.formats.include?(
|
36
|
+
|
37
|
+
if defined?(Entity) && Entity.formats.include?(entity_type) or Entity.formats.include?(format)
|
36
38
|
params ||= {}
|
37
39
|
params[:format] = format if format and params[:format].nil?
|
38
|
-
mod = (Entity.formats[
|
40
|
+
mod = (Entity.formats[entity_type] || Entity.format[entity])
|
39
41
|
mod.setup(entity, params)
|
40
42
|
end
|
41
43
|
|
@@ -43,4 +45,3 @@ Score: #{score.inspect}
|
|
43
45
|
end
|
44
46
|
|
45
47
|
end
|
46
|
-
|
@@ -0,0 +1,63 @@
|
|
1
|
+
# Offset arithmetic and overlap checks for segments.
module Segment
  # Adds +offset+ to this segment's offset. If either value is nil the
  # segment's offset becomes nil. Returns self.
  def pull(offset)
    if self.offset.nil? or offset.nil?
      self.offset = nil
    else
      self.offset += offset
    end

    self
  end

  # Subtracts +offset+ from this segment's offset. If either value is nil the
  # segment's offset becomes nil. Returns self.
  def push(offset)
    if self.offset.nil? or offset.nil?
      self.offset = nil
    else
      self.offset -= offset
    end

    self
  end

  # Makes the offsets of +segments+ relative to this segment by pushing each
  # of them by this segment's offset.
  #
  # Without a block the segments are left shifted. With a block the shifted
  # segments are yielded and then pulled back; the pull happens in an
  # +ensure+ so the offsets are restored even when the block raises.
  #
  # Returns +segments+.
  def make_relative(segments, &block)
    segments.each { |s| s.push offset }
    return segments unless block_given?

    begin
      yield(segments)
    ensure
      segments.each { |s| s.pull offset }
    end

    segments
  end

  # Range of this segment relative to +container+: a Segment with an offset,
  # or an Integer offset. With any other container the plain +range+ is
  # returned. Raises when this segment has no offset.
  def range_in(container = nil)
    raise "No offset specified" if offset.nil?
    case
    when (Segment === container and not container.offset.nil?)
      ((offset - container.offset)..(self.eend - container.offset))
    when Integer === container
      ((offset - container)..(self.eend - container))
    else
      range
    end
  end

  # True when +segment+ starts at or after this segment and ends at or before
  # it (nil offsets and lengths count as 0 through +to_i+).
  def includes?(segment)
    (segment.offset.to_i >= self.offset.to_i) and
      (segment.offset.to_i + segment.segment_length.to_i <= self.offset.to_i + self.segment_length.to_i)
  end

  # True when either segment starts inside the other; both ends are
  # inclusive.
  def overlaps?(segment)
    segment.offset.to_i >= self.offset.to_i && segment.offset.to_i <= self.eend ||
      self.offset.to_i >= segment.offset.to_i && self.offset.to_i <= segment.eend
  end

  # Subset of +segments+ that overlap this segment.
  def overlaps(segments)
    segments.select { |s| self.overlaps?(s) }
  end

  # Subset of +secondary+ that overlaps at least one segment of +main+.
  def self.collisions(main, secondary)
    secondary.select do |ss|
      # any? stops at the first hit instead of collecting every match
      main.any? { |ms| ms.overlaps? ss }
    end
  end
end
|
@@ -0,0 +1,35 @@
|
|
1
|
+
# Extension for a range index over segments: lookups come back set up as
# SegID with the index's corpus.
module Segment::RangeIndex
  attr_accessor :corpus

  # Performs the underlying lookup and sets the result up as SegID with this
  # index's corpus.
  def [](*args)
    SegID.setup(super(*args), :corpus => corpus)
  end

  # Builds an index from segment ids to their [begin, end] positions.
  #
  # +segments+ may be a Hash, in which case its flattened values are used.
  # Segments without an offset are left out. The build goes through
  # Persist.persist, which is asked to persist only when +persist_file+ is
  # neither nil nor :memory. The result is extended with this module and
  # given +corpus+.
  def self.index(segments, corpus, persist_file = :memory)
    segments = segments.values.flatten if Hash === segments

    use_persistence = !(persist_file.nil? || persist_file == :memory)

    annotation_index =
      Persist.persist("Segment_index", :fwt, :persist => use_persistence, :file => persist_file) do

        widest = 0
        entries = []
        segments.each do |segment|
          next if segment.offset.nil?

          span = segment.range
          segid = segment.segid
          # The table is created with the longest segid length as its value size
          widest = [segid.length, widest].max
          entries << [segid, [span.begin, span.end]]
        end

        table = FixWidthTable.get(:memory, widest, true)
        table.add_range(entries)

        table
      end

    annotation_index.extend Segment::RangeIndex
    annotation_index.corpus = corpus
    annotation_index
  end

end
|
35
|
+
|
@@ -0,0 +1,23 @@
|
|
1
|
+
require 'rbbt/segment'
|
2
|
+
|
3
|
+
# A Segment that carries an additional +original+ annotation.
module Token
  extend Entity
  include Segment

  self.annotation :original

  # Splits +text+ into Tokens at every match of +split_at+.
  #
  # The text before each match becomes a token when it is not empty. When the
  # first capture group matched non-empty text, that text becomes a token of
  # its own; matches without a capture only separate tokens. Whatever text
  # remains after the last match becomes the final token. Offsets count from
  # +start+.
  def self.tokenize(text, split_at = /\s|(\(|\)|[-."':,])/, start = 0)
    tokens = []

    while (match = text.match(split_at))
      leading = match.pre_match
      tokens << Token.setup(leading, :offset => start) unless leading.empty?

      captured = match.captures
      if captured.any? && !captured.first.empty?
        tokens << Token.setup(captured.first, :offset => start + match.begin(1))
      end

      # Continue with the text after the match, keeping offsets absolute
      start += match.end(0)
      text = match.post_match
    end

    tokens << Token.setup(text, :offset => start) unless text.empty?

    tokens
  end
end
|