rbbt-text 1.1.9 → 1.3.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/rbbt/bow/bow.rb +5 -2
- data/lib/rbbt/bow/dictionary.rb +27 -23
- data/lib/rbbt/document.rb +56 -0
- data/lib/rbbt/document/annotation.rb +45 -0
- data/lib/rbbt/document/corpus.rb +61 -0
- data/lib/rbbt/document/corpus/pubmed.rb +33 -0
- data/lib/rbbt/ner/NER.rb +3 -3
- data/lib/rbbt/ner/abner.rb +1 -1
- data/lib/rbbt/ner/banner.rb +1 -1
- data/lib/rbbt/ner/brat.rb +1 -1
- data/lib/rbbt/ner/chemical_tagger.rb +1 -2
- data/lib/rbbt/ner/g_norm_plus.rb +42 -12
- data/lib/rbbt/ner/linnaeus.rb +3 -3
- data/lib/rbbt/ner/ngram_prefix_dictionary.rb +3 -3
- data/lib/rbbt/ner/oscar3.rb +1 -2
- data/lib/rbbt/ner/oscar4.rb +3 -3
- data/lib/rbbt/ner/patterns.rb +5 -5
- data/lib/rbbt/ner/regexpNER.rb +1 -2
- data/lib/rbbt/ner/token_trieNER.rb +35 -22
- data/lib/rbbt/nlp/genia/sentence_splitter.rb +3 -2
- data/lib/rbbt/nlp/nlp.rb +5 -5
- data/lib/rbbt/nlp/open_nlp/sentence_splitter.rb +37 -36
- data/lib/rbbt/nlp/spaCy.rb +52 -0
- data/lib/rbbt/segment.rb +179 -0
- data/lib/rbbt/segment/annotation.rb +58 -0
- data/lib/rbbt/segment/encoding.rb +18 -0
- data/lib/rbbt/{text/segment → segment}/named_entity.rb +11 -10
- data/lib/rbbt/segment/overlaps.rb +63 -0
- data/lib/rbbt/segment/range_index.rb +35 -0
- data/lib/rbbt/segment/relationship.rb +7 -0
- data/lib/rbbt/{text/segment → segment}/segmented.rb +1 -1
- data/lib/rbbt/segment/token.rb +23 -0
- data/lib/rbbt/{text/segment → segment}/transformed.rb +10 -8
- data/lib/rbbt/segment/tsv.rb +41 -0
- data/share/install/software/Linnaeus +1 -1
- data/share/install/software/OpenNLP +1 -1
- data/test/rbbt/document/corpus/test_pubmed.rb +15 -0
- data/test/rbbt/document/test_annotation.rb +140 -0
- data/test/rbbt/document/test_corpus.rb +33 -0
- data/test/rbbt/ner/test_finder.rb +3 -3
- data/test/rbbt/ner/test_g_norm_plus.rb +20 -3
- data/test/rbbt/ner/test_patterns.rb +9 -9
- data/test/rbbt/ner/test_regexpNER.rb +14 -14
- data/test/rbbt/ner/test_rnorm.rb +3 -4
- data/test/rbbt/ner/test_token_trieNER.rb +1 -0
- data/test/rbbt/nlp/genia/test_sentence_splitter.rb +37 -3
- data/test/rbbt/nlp/open_nlp/test_sentence_splitter.rb +20 -4
- data/test/rbbt/segment/test_annotation.rb +39 -0
- data/test/rbbt/segment/test_corpus.rb +36 -0
- data/test/rbbt/segment/test_encoding.rb +24 -0
- data/test/rbbt/{text/segment → segment}/test_named_entity.rb +15 -11
- data/test/rbbt/segment/test_overlaps.rb +69 -0
- data/test/rbbt/segment/test_range_index.rb +42 -0
- data/test/rbbt/{text/segment → segment}/test_transformed.rb +105 -51
- data/test/rbbt/test_document.rb +14 -0
- data/test/rbbt/test_segment.rb +182 -0
- data/test/test_helper.rb +5 -3
- data/test/test_spaCy.rb +32 -0
- metadata +44 -32
- data/lib/rbbt/text/corpus.rb +0 -106
- data/lib/rbbt/text/corpus/document.rb +0 -361
- data/lib/rbbt/text/corpus/document_repo.rb +0 -68
- data/lib/rbbt/text/corpus/sources/pmid.rb +0 -34
- data/lib/rbbt/text/document.rb +0 -39
- data/lib/rbbt/text/segment.rb +0 -355
- data/lib/rbbt/text/segment/docid.rb +0 -46
- data/lib/rbbt/text/segment/relationship.rb +0 -24
- data/lib/rbbt/text/segment/token.rb +0 -49
- data/test/rbbt/text/corpus/sources/test_pmid.rb +0 -33
- data/test/rbbt/text/corpus/test_document.rb +0 -52
- data/test/rbbt/text/segment/test_relationship.rb +0 -0
- data/test/rbbt/text/segment/test_segmented.rb +0 -23
- data/test/rbbt/text/test_corpus.rb +0 -34
- data/test/rbbt/text/test_document.rb +0 -58
- data/test/rbbt/text/test_segment.rb +0 -100
data/lib/rbbt/segment.rb
ADDED
@@ -0,0 +1,179 @@
|
|
1
|
+
require 'rbbt-util'
|
2
|
+
require 'rbbt/entity'
|
3
|
+
require 'rbbt/document'
|
4
|
+
|
5
|
+
# Identifier of a text segment.
#
# The identifier is a colon-separated string: its first four fields form the
# document identifier and its fifth field is the inclusive range of the
# segment written as "start..end".
module SegID
  extend Entity
  self.annotation :corpus

  # Colon-separated fields of the identifier (memoized).
  def _parts
    @parts ||= split(":")
  end

  # Inclusive range parsed from the fifth field (memoized).
  def range
    @range ||= begin
                 bounds = _parts[4].split("..").map { |bound| bound.to_i }
                 Range.new(*bounds)
               end
  end

  # Document identifier built from the first four fields (memoized).
  def docid
    @docid ||= DocID.setup(_parts.first(4).join(":"))
  end

  # First position of the segment.
  def offset
    range.begin
  end

  # Number of positions covered by the (inclusive) range.
  def segment_length
    first = range.begin
    last = range.end
    last - first + 1
  end

  # Loads the document from the corpus and returns the text under +range+
  # set up as a Segment carrying the document id and the offset.
  property :segment => :single do
    doc_id = self.docid
    source = DocID.setup(doc_id, :corpus => corpus).document

    Segment.setup(source[range], :docid => doc_id, :offset => offset)
  end

  # A SegID is its own identifier.
  property :segid do
    self
  end
end
|
43
|
+
|
44
|
+
# A piece of text annotated with the position (+offset+) at which it appears
# and the document it comes from (+docid+). Ranges are inclusive.
module Segment
  extend Entity
  self.annotation :offset, :docid

  # Number of characters covered by the segment.
  def segment_length
    length
  end

  # Position of the last character of the segment (inclusive).
  def eend
    offset.to_i + length - 1
  end

  # Inclusive range of positions covered by the segment.
  def range
    (offset.to_i..eend)
  end

  # Identifier of the segment: document id and range joined by ":".
  property :segid do |corpus=nil|
    SegID.setup([docid, range] * ":", :corpus => corpus)
  end

  alias id segid

  # A Segment is its own segment.
  property :segment do
    self
  end

  # Sorts +segments+ by position.
  #
  # With +inline+ true (the default) entries without a position (nil
  # entries, or entries whose offset is nil) come first; segments that do
  # not overlap are ordered by offset and overlapping ones by length,
  # shortest first. With +inline+ false the segments are returned by
  # descending offset.
  def self.sort(segments, inline = true)
    return segments.sort_by { |segment| segment.offset.to_i }.reverse unless inline

    segments.sort do |a, b|
      # Decide "has no position" per element first, so that a nil element is
      # never asked for its offset.
      a_unplaced = a.nil? || a.offset.nil?
      b_unplaced = b.nil? || b.offset.nil?

      if a_unplaced && b_unplaced
        0
      elsif a_unplaced
        -1
      elsif b_unplaced
        1
      elsif !a.range.include?(b.offset.to_i) && !b.range.include?(a.offset.to_i)
        a.offset.to_i <=> b.offset.to_i
      else
        a.segment_length <=> b.segment_length
      end
    end
  end

  # Walks +sorted_segments+ from last to first and returns every segment
  # whose range ends beyond the start of the segment that follows it.
  def self.overlaps(sorted_segments)
    next_start = nil
    overlapped = []

    sorted_segments.reverse_each do |segment|
      overlapped << segment if !next_start.nil? && segment.range.end > next_start
      next_start = segment.range.begin
    end

    overlapped
  end

  # Returns +segments+ sorted, without entries lacking an offset and without
  # the segments reported by Segment.overlaps.
  def self.clean_sort(segments)
    sorted = sort(segments).reject { |s| s.nil? || s.offset.nil? }
    overlapping = overlaps(sorted)

    # Drop by identity: Array#delete compares with ==, which (assuming
    # segments are strings) would also drop unrelated segments that happen
    # to have the same text as an overlapping one.
    sorted.reject { |s| overlapping.any? { |o| o.equal?(s) } }
  end

  # Cuts +text+ into consecutive chunks following the positions of
  # +segments+: the text before each segment, the segment text itself
  # (omitted when +skip_segments+ is true) and the text left after the last
  # segment. Every chunk is set up as a Segment with its offset in the
  # original text. Returns the array of chunks.
  def self.split(text, segments, skip_segments = false)
    chunks = []
    text_offset = 0 # position in the original text where +text+ now starts

    clean_sort(segments).each do |segment|
      return chunks if text.nil? || text.empty?
      next if segment.offset.nil?

      offset = segment.offset - text_offset
      next if offset < 0 # starts inside text already consumed: overlap, skip

      if offset > 0 # keep the text found before the segment
        chunk = text[0..offset - 1]
        Segment.setup(chunk, text_offset)
        chunks << chunk
      end

      segment_end = offset + segment.segment_length - 1

      unless skip_segments
        chunk = text[offset..segment_end]
        Segment.setup(chunk, text_offset + offset)
        chunks << chunk
      end

      text_offset += segment_end + 1
      text = text[segment_end + 1..-1]
    end

    unless text.nil? || text.empty?
      chunk = text.dup
      Segment.setup(chunk, text_offset)
      chunks << chunk
    end

    chunks
  end

  # Looks for each of +parts+ in +text+, in order, and sets it up in place
  # as a Segment with the position where it was found and the docid of
  # +text+ (nil when +text+ has none). Parts that are not found are left
  # untouched. Returns +parts+.
  def self.align(text, parts)
    pre_offset = 0 # position in the original text where +text+ now starts
    docid = text.respond_to?(:docid) ? text.docid : nil

    parts.each do |part|
      offset = text.index part
      next if offset.nil?

      Segment.setup(part, pre_offset + offset, docid)

      # Resume the search right after the matched part, so that consecutive
      # parts never share a character and an empty part never moves the
      # cursor backwards.
      consumed = offset + part.segment_length
      pre_offset += consumed
      text = text[consumed..-1]
    end
  end

  # Builds a range index; arguments are passed on to
  # Segment::RangeIndex.index.
  def self.index(*args)
    Segment::RangeIndex.index(*args)
  end
end
|
173
|
+
|
174
|
+
require 'rbbt/segment/range_index'
|
175
|
+
require 'rbbt/segment/overlaps'
|
176
|
+
require 'rbbt/segment/transformed'
|
177
|
+
require 'rbbt/segment/segmented'
|
178
|
+
require 'rbbt/segment/encoding'
|
179
|
+
|
@@ -0,0 +1,58 @@
|
|
1
|
+
require 'rbbt-util'
|
2
|
+
require 'rbbt/segment'
|
3
|
+
require 'rbbt/entity'
|
4
|
+
|
5
|
+
# Identifier of an annotation over a segment.
#
# The identifier is a colon-separated string: its first five fields form the
# segment identifier and its sixth field is the annotation type.
module AnnotID
  extend Entity
  self.annotation :corpus

  # Colon-separated fields of the identifier (memoized).
  def _parts
    @parts ||= split(":")
  end

  # Segment identifier: the first five fields joined back with ":" (memoized).
  def segid
    @segid ||= _parts.first(5).join(":")
  end

  # Annotation type: the sixth field (memoized).
  def type
    @type ||= _parts[5]
  end

  # Resolves the segment through SegID and sets it up as a
  # SegmentAnnotation of this type.
  property :annotation do
    target = SegID.setup(segid, :corpus => corpus).segment

    SegmentAnnotation.setup(target, :type => type)
  end

  # An AnnotID is its own identifier.
  property :annotid do
    self
  end
end
|
32
|
+
|
33
|
+
# A Segment that additionally carries an annotation +type+.
module SegmentAnnotation
  extend Entity
  include Object::Segment
  self.annotation :type

  # Segment identifier: the object itself when it already is a SegID,
  # otherwise the identifier computed by Segment.
  property :segid do
    if SegID === self
      self
    elsif Segment === self
      super()
    else
      raise "Unknown object: #{self}"
    end
  end

  # Annotation identifier: segment id, type and a digest of the annotation
  # info joined by ":".
  property :annotid do |corpus=nil|
    fields = [segid, type, Misc.obj2digest(self.info)]
    AnnotID.setup(fields.join(":"), :corpus => corpus)
  end

  alias id annotid

  # A SegmentAnnotation is its own annotation.
  property :annotation do
    self
  end
end
|
@@ -0,0 +1,18 @@
|
|
1
|
+
require 'rbbt/segment'
|
2
|
+
module Segment
  # Returns one segment per non-ASCII character of +text+, each positioned
  # at the index of the character.
  def self.bad_chars(text)
    non_ascii = text.chars.each_with_index.reject { |char, _| char.ascii_only? }

    non_ascii.map do |char, index|
      Segment.setup(char, :offset => index)
    end
  end

  # Runs Transformed.with_transform over +text+ with every non-ASCII
  # character replaced by +replace+ ("?" when nil), passing +block+ along.
  def self.ascii(text, replace = nil, &block)
    replacement = replace.nil? ? "?" : replace

    Transformed.with_transform(text, bad_chars(text), replacement, &block)
  end
end
|
@@ -1,17 +1,18 @@
|
|
1
|
-
require 'rbbt/
|
2
|
-
require 'rbbt/
|
1
|
+
require 'rbbt/segment'
|
2
|
+
require 'rbbt/segment/annotation'
|
3
3
|
|
4
|
-
module NamedEntity
|
4
|
+
module NamedEntity
|
5
5
|
extend Entity
|
6
6
|
include Segment
|
7
|
+
include SegmentAnnotation
|
7
8
|
|
8
|
-
self.annotation :
|
9
|
+
self.annotation :entity_type, :code, :score
|
9
10
|
|
10
11
|
def report
|
11
12
|
<<-EOF
|
12
13
|
String: #{ self }
|
13
14
|
Offset: #{ offset.inspect }
|
14
|
-
Type: #{
|
15
|
+
Type: #{entity_type.inspect}
|
15
16
|
Code: #{code.inspect}
|
16
17
|
Score: #{score.inspect}
|
17
18
|
EOF
|
@@ -20,7 +21,7 @@ Score: #{score.inspect}
|
|
20
21
|
def html
|
21
22
|
text = <<-EOF
|
22
23
|
<span class='Entity'\
|
23
|
-
#{
|
24
|
+
#{entity_type.nil? ? "" : " attr-entity-type='#{Array === entity_type ? entity_type * " " : entity_type}'"}\
|
24
25
|
#{code.nil? ? "" : " attr-entity-code='#{Array === code ? code * " " : code}'"}\
|
25
26
|
#{score.nil? ? "" : " attr-entity-score='#{Array === score ? score * " " : score}'"}\
|
26
27
|
>#{ self }</span>
|
@@ -29,13 +30,14 @@ Score: #{score.inspect}
|
|
29
30
|
end
|
30
31
|
|
31
32
|
def entity(params = nil)
|
33
|
+
code = self.code || self.dup
|
32
34
|
format, entity = code.split(":")
|
33
35
|
entity, format = format, nil if entity.nil?
|
34
|
-
|
35
|
-
if defined?(Entity) && Entity.formats.include?(
|
36
|
+
|
37
|
+
if defined?(Entity) && Entity.formats.include?(entity_type) or Entity.formats.include?(format)
|
36
38
|
params ||= {}
|
37
39
|
params[:format] = format if format and params[:format].nil?
|
38
|
-
mod = (Entity.formats[
|
40
|
+
mod = (Entity.formats[entity_type] || Entity.format[entity])
|
39
41
|
mod.setup(entity, params)
|
40
42
|
end
|
41
43
|
|
@@ -43,4 +45,3 @@ Score: #{score.inspect}
|
|
43
45
|
end
|
44
46
|
|
45
47
|
end
|
46
|
-
|
@@ -0,0 +1,63 @@
|
|
1
|
+
# Position arithmetic and overlap checks for segments.
module Segment
  # Adds +offset+ to the segment's offset; the offset becomes nil when
  # either value is nil. Returns the segment.
  def pull(offset)
    if self.offset.nil? || offset.nil?
      self.offset = nil
    else
      self.offset += offset
    end

    self
  end

  # Subtracts +offset+ from the segment's offset; the offset becomes nil
  # when either value is nil. Returns the segment.
  def push(offset)
    if self.offset.nil? || offset.nil?
      self.offset = nil
    else
      self.offset -= offset
    end

    self
  end

  # Makes the offsets of +segments+ relative to this segment.
  #
  # Without a block the change is permanent. With a block, the segments are
  # yielded with relative offsets and their original offsets are restored
  # afterwards, also when the block raises. Returns +segments+.
  def make_relative(segments, &block)
    segments.each { |s| s.push offset }
    return segments unless block_given?

    begin
      yield(segments)
    ensure
      # Never leave the caller's segments with relative offsets behind.
      segments.each { |s| s.pull offset }
    end

    segments
  end

  # Range of the segment relative to +container+, which may be a Segment
  # with an offset or an Integer position; any other value yields the plain
  # range. Raises if the segment has no offset.
  def range_in(container = nil)
    raise "No offset specified" if offset.nil?

    if Segment === container && !container.offset.nil?
      ((offset - container.offset)..(self.eend - container.offset))
    elsif Integer === container
      ((offset - container)..(self.eend - container))
    else
      range
    end
  end

  # True when +segment+ starts at or after this segment and ends at or
  # before it.
  def includes?(segment)
    (segment.offset.to_i >= self.offset.to_i) &&
      (segment.offset.to_i + segment.segment_length.to_i <= self.offset.to_i + self.segment_length.to_i)
  end

  # True when either segment starts within the inclusive range of the other.
  def overlaps?(segment)
    segment.offset.to_i >= self.offset.to_i && segment.offset.to_i <= self.eend ||
      self.offset.to_i >= segment.offset.to_i && self.offset.to_i <= segment.eend
  end

  # The elements of +segments+ that overlap this segment.
  def overlaps(segments)
    segments.select { |s| overlaps?(s) }
  end

  # The elements of +secondary+ that overlap at least one element of +main+.
  def self.collisions(main, secondary)
    secondary.select do |ss|
      main.any? { |ms| ms.overlaps? ss }
    end
  end
end
|
@@ -0,0 +1,35 @@
|
|
1
|
+
# Extension for a range index so that lookups return segment ids bound to a
# corpus.
module Segment::RangeIndex
  attr_accessor :corpus

  # Looks up through the extended object and sets the result up as SegID
  # with this index's corpus.
  def [](*args)
    SegID.setup(super(*args), :corpus => corpus)
  end

  # Builds a range index from +segments+ (an enumerable of segments, or a
  # Hash whose values are flattened into one). Segments without an offset
  # are left out. The index is persisted to +persist_file+ unless that is
  # nil or :memory. Returns the index extended with Segment::RangeIndex and
  # bound to +corpus+.
  def self.index(segments, corpus, persist_file = :memory)
    segments = segments.values.flatten if Hash === segments
    in_memory = persist_file.nil? || persist_file == :memory

    annotation_index = Persist.persist("Segment_index", :fwt, :persist => !in_memory, :file => persist_file) do
      placed = segments.reject { |segment| segment.offset.nil? }

      index_data = placed.map do |segment|
        positions = segment.range
        [segment.segid, [positions.begin, positions.end]]
      end

      # The table is created with the length of the longest segment id.
      value_size = index_data.map { |segid, _| segid.length }.max || 0

      fwt = FixWidthTable.get :memory, value_size, true
      fwt.add_range index_data

      fwt
    end

    annotation_index.extend Segment::RangeIndex
    annotation_index.corpus = corpus
    annotation_index
  end
end
|
35
|
+
|
@@ -0,0 +1,23 @@
|
|
1
|
+
require 'rbbt/segment'
|
2
|
+
|
3
|
+
# A Segment produced by tokenization.
module Token
  extend Entity
  include Segment

  self.annotation :original

  # Splits +text+ at every match of +split_at+ and returns the pieces as
  # Tokens whose offsets count from +start+. The text matched by the first
  # capture group of +split_at+, when present and not empty, is kept as a
  # token of its own; the rest of the match is dropped.
  def self.tokenize(text, split_at = /\s|(\(|\)|[-."':,])/, start = 0)
    tokens = []

    loop do
      matchdata = text.match(split_at)
      break unless matchdata

      leading = matchdata.pre_match
      tokens << Token.setup(leading, :offset => start) unless leading.empty?

      captures = matchdata.captures
      if captures.any? && !captures.first.empty?
        tokens << Token.setup(captures.first, :offset => start + matchdata.begin(1))
      end

      # Move past the match and keep scanning what is left.
      start += matchdata.end(0)
      text = matchdata.post_match
    end

    tokens << Token.setup(text, :offset => start) unless text.empty?

    tokens
  end
end
|