rbbt-text 0.2.1 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/bin/get_ppis.rb +52 -0
- data/lib/rbbt/bow/dictionary.rb +9 -9
- data/lib/rbbt/bow/misc.rb +86 -2
- data/lib/rbbt/corpus/corpus.rb +55 -0
- data/lib/rbbt/corpus/document.rb +289 -0
- data/lib/rbbt/corpus/document_repo.rb +115 -0
- data/lib/rbbt/corpus/sources/pubmed.rb +26 -0
- data/lib/rbbt/ner/NER.rb +7 -5
- data/lib/rbbt/ner/abner.rb +13 -2
- data/lib/rbbt/ner/annotations.rb +182 -51
- data/lib/rbbt/ner/annotations/annotated.rb +15 -0
- data/lib/rbbt/ner/annotations/named_entity.rb +37 -0
- data/lib/rbbt/ner/annotations/relations.rb +25 -0
- data/lib/rbbt/ner/annotations/token.rb +28 -0
- data/lib/rbbt/ner/annotations/transformed.rb +170 -0
- data/lib/rbbt/ner/banner.rb +8 -5
- data/lib/rbbt/ner/chemical_tagger.rb +34 -0
- data/lib/rbbt/ner/ngram_prefix_dictionary.rb +136 -0
- data/lib/rbbt/ner/oscar3.rb +1 -1
- data/lib/rbbt/ner/oscar4.rb +41 -0
- data/lib/rbbt/ner/patterns.rb +132 -0
- data/lib/rbbt/ner/rnorm.rb +141 -0
- data/lib/rbbt/ner/rnorm/cue_index.rb +80 -0
- data/lib/rbbt/ner/rnorm/tokens.rb +218 -0
- data/lib/rbbt/ner/token_trieNER.rb +185 -51
- data/lib/rbbt/nlp/genia/sentence_splitter.rb +214 -0
- data/lib/rbbt/nlp/nlp.rb +235 -0
- data/share/install/software/ABNER +0 -4
- data/share/install/software/ChemicalTagger +81 -0
- data/share/install/software/Gdep +115 -0
- data/share/install/software/Geniass +118 -0
- data/share/install/software/OSCAR4 +16 -0
- data/share/install/software/StanfordParser +15 -0
- data/share/patterns/drug_induce_disease +22 -0
- data/share/rnorm/cue_default +10 -0
- data/share/rnorm/tokens_default +86 -0
- data/share/{stopwords → wordlists/stopwords} +0 -0
- data/test/rbbt/bow/test_bow.rb +1 -1
- data/test/rbbt/bow/test_dictionary.rb +1 -1
- data/test/rbbt/bow/test_misc.rb +1 -1
- data/test/rbbt/corpus/test_corpus.rb +99 -0
- data/test/rbbt/corpus/test_document.rb +222 -0
- data/test/rbbt/ner/annotations/test_named_entity.rb +14 -0
- data/test/rbbt/ner/annotations/test_transformed.rb +175 -0
- data/test/rbbt/ner/test_abner.rb +1 -1
- data/test/rbbt/ner/test_annotations.rb +64 -2
- data/test/rbbt/ner/test_banner.rb +1 -1
- data/test/rbbt/ner/test_chemical_tagger.rb +56 -0
- data/test/rbbt/ner/test_ngram_prefix_dictionary.rb +20 -0
- data/test/rbbt/ner/{test_oscar3.rb → test_oscar4.rb} +12 -13
- data/test/rbbt/ner/test_patterns.rb +66 -0
- data/test/rbbt/ner/test_regexpNER.rb +1 -1
- data/test/rbbt/ner/test_rnorm.rb +47 -0
- data/test/rbbt/ner/test_token_trieNER.rb +60 -35
- data/test/rbbt/nlp/test_nlp.rb +88 -0
- data/test/test_helper.rb +20 -0
- metadata +93 -20
@@ -0,0 +1,115 @@
|
|
1
|
+
require 'rbbt/util/misc'
|
2
|
+
require 'tokyocabinet'
|
3
|
+
|
4
|
+
# Document store built on a Tokyo Cabinet B+ tree database. Texts are stored
# under document ids of the form "namespace:id:type:hash".
class DocumentRepo < TokyoCabinet::BDB
  class OpenError < StandardError; end
  class KeyFormatError < StandardError; end

  # Repositories opened through DocumentRepo.get, keyed by database path.
  CONNECTIONS = {} unless defined? CONNECTIONS

  # Returns the repository for +path+, opening a new one when the file does
  # not exist or no connection has been cached yet, and makes sure the
  # returned handle is in the requested mode (writable when +write+ is true,
  # read-only otherwise).
  def self.get(path, write = false)
    if !File.exist?(path) || !CONNECTIONS.include?(path)
      CONNECTIONS[path] = new(path, true)
    end

    repo = CONNECTIONS[path]

    # Only reopen when the current mode differs from the requested one; an
    # already-writable handle must stay writable when write access is asked.
    if write
      repo.write unless repo.write?
    elsif repo.write?
      repo.read
    end

    repo
  end

  alias original_open open

  # Opens the database file at @path_to_db, creating its parent directory if
  # needed. Raises OpenError when the underlying open call fails.
  def open(write = false)
    flags = write ? (TokyoCabinet::BDB::OWRITER | TokyoCabinet::BDB::OCREAT) : TokyoCabinet::BDB::OREADER

    directory = File.dirname(@path_to_db)
    FileUtils.mkdir_p directory unless File.exist?(directory)

    unless original_open(@path_to_db, flags)
      raise OpenError, "Open error: #{errmsg(ecode)}. Trying to open file #{@path_to_db}"
    end

    @write = write
  end

  # True when the database was last opened in write mode.
  def write?
    @write
  end

  # Reopens the database in write mode.
  def write
    close
    open(true)
  end

  # Reopens the database in read-only mode.
  def read
    close
    open(false)
  end

  # Opens +path+ for writing when +write+ is set or the file does not exist
  # yet; otherwise opens it read-only.
  def initialize(path, write = false)
    super()

    @path_to_db = path

    if write || !File.exist?(@path_to_db)
      setcache(100000) or raise "Error setting cache"
      open(true)
    else
      open(false)
    end
  end

  # Splits a document id into [namespace, id, type, hash]; trailing empty
  # fields are kept as empty strings, missing ones are nil.
  def docid2fields(docid)
    docid.split(":", -1).values_at(0, 1, 2, 3)
  end

  # Joins the fields into a document id; nil fields become empty strings.
  def fields2docid(namespace = nil, id = nil, type = nil, hash = nil)
    [namespace, id, type, hash] * ":"
  end

  # Returns the text stored under +docid+.
  def docid(docid)
    get(docid)
  end

  # Stores +text+ under the document id built from the fields, unless that
  # id is already present, and returns the document id.
  def add(text, namespace, id, type, hash)
    write unless write?
    docid = fields2docid(namespace, id, type, hash)
    self[docid] = text unless include?(docid)
    docid
  end

  # Lists the document ids matching the given fields. With no namespace all
  # keys are returned; when +hash+ is given the single matching id is built
  # directly; otherwise the keys sharing the given prefix are looked up.
  def find(namespace = nil, id = nil, type = nil, hash = nil)
    case
    when namespace.nil?
      keys
    when id.nil?
      keys_with_prefix(namespace)
    when (type && hash)
      [fields2docid(namespace, id, type, hash)]
    when hash
      [fields2docid(namespace, id, "", hash)]
    when type
      keys_with_prefix(namespace, id, type)
    else
      keys_with_prefix(namespace, id)
    end
  end

  # Same as #find, taking the fields from a (possibly partial) document id.
  def find_docid(docid)
    find(*docid2fields(docid))
  end

  private

  # Keys that start with the given fields followed by ":". The upper bound
  # uses ";" because it is the character right after ":" in ASCII.
  def keys_with_prefix(*fields)
    prefix = fields * ":"
    range(prefix + ':', true, prefix + ';', false)
  end
end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
require 'rbbt/sources/pubmed'
|
2
|
+
|
3
|
+
# PubMed support for Corpus: registers the :pubmed namespace and adds
# articles by PMID or by query.
class Corpus

  NAMESPACES = {} unless defined? NAMESPACES
  NAMESPACES[:pubmed] = :add_pmid

  # Adds the articles for one PMID or an Array of PMIDs and returns the
  # collected results of add_document.
  #
  # +type+ may be nil, an empty String (treated as nil) or something that
  # responds to to_sym. With no type, the abstract is added when the article
  # has no pdf_url and the full text otherwise. With type :abstract the
  # abstract is always used; any other type requests the full text and raises
  # a RuntimeError when the article has no pdf_url.
  def add_pmid(pmid, type = nil)
    pmids = Array === pmid ? pmid : [pmid]
    type = nil if String === type && type.empty?

    # The block parameter is named differently from the method argument so
    # it neither shadows nor (on Ruby 1.8) overwrites it.
    PubMed.get_article(pmids).collect do |article_pmid, article|
      use_abstract = type.nil? ? article.pdf_url.nil? : type.to_sym == :abstract

      if use_abstract
        add_document(article.text, :pubmed, article_pmid, :abstract)
      else
        raise "No FullText available for #{ article_pmid }" if article.pdf_url.nil?
        add_document(article.full_text, :pubmed, article_pmid, :fulltext)
      end
    end
  end

  # Runs +query+ against PubMed (at most +max+ results) and adds the
  # resulting PMIDs with add_pmid.
  def add_pubmed_query(query, max, type = nil)
    pmids = PubMed.query(query, max)
    add_pmid(pmids, type)
  end
end
|
data/lib/rbbt/ner/NER.rb
CHANGED
@@ -1,18 +1,20 @@
|
|
1
1
|
require 'rbbt/ner/annotations'
|
2
|
+
require 'rbbt/ner/annotations/named_entity'
|
3
|
+
require 'rbbt/ner/annotations/annotated'
|
2
4
|
|
3
5
|
class NER
|
4
|
-
def entities(text,
|
6
|
+
def entities(text, protect = false, *args)
|
5
7
|
case
|
6
8
|
when Array === text
|
7
9
|
text.collect do |element|
|
8
|
-
matches = entities(element,
|
10
|
+
matches = entities(element, protect, *args)
|
9
11
|
matches.each{|match|
|
10
|
-
match.offset += element.offset if match.offset
|
12
|
+
match.offset += element.offset if match.offset and element.offset
|
11
13
|
}
|
12
14
|
matches
|
13
15
|
end.flatten
|
14
|
-
when (Annotated === text and
|
15
|
-
entities(text.
|
16
|
+
when (Annotated === text and protect)
|
17
|
+
entities(text.split_segments(true), protect, *args)
|
16
18
|
else
|
17
19
|
match(text, *args)
|
18
20
|
end
|
data/lib/rbbt/ner/abner.rb
CHANGED
@@ -7,7 +7,7 @@ require 'rbbt/ner/NER'
|
|
7
7
|
# in Java Abner[http://www.cs.wisc.edu/~bsettles/abner/].
|
8
8
|
class Abner < NER
|
9
9
|
|
10
|
-
Rbbt.
|
10
|
+
Rbbt.software.opt.ABNER.define_as_install Rbbt.share.install.software.ABNER.find
|
11
11
|
|
12
12
|
@@JFile = Rjb::import('java.io.File')
|
13
13
|
@@Tagger = Rjb::import('abner.Tagger')
|
@@ -27,14 +27,25 @@ class Abner < NER
|
|
27
27
|
# returns all the mentions found, regardless of type, to be coherent
|
28
28
|
# with the rest of NER packages in Rbbt.
|
29
29
|
# Runs the tagger over +text+ and returns the mentions as strings, each
# annotated through NamedEntity.annotate with its type and its offset in
# +text+. Mentions are located left to right, each search starting after the
# previously located mention; a mention that cannot be located gets a nil
# offset. Returns [] for nil or empty text.
def match(text)
  return [] if text.nil? || text.empty?

  tagged = @tagger.getEntities(text)
  labels = tagged[1]
  mentions = tagged[0]

  consumed = 0
  remaining = text
  mentions.zip(labels).collect do |raw, label|
    entity = raw.to_s
    position = remaining.index(entity)

    if position
      NamedEntity.annotate(entity, position + consumed, label.to_s)
      advance = position + entity.length
      remaining = remaining[advance..-1]
      consumed += advance
    else
      NamedEntity.annotate(entity, nil, label.to_s)
    end

    entity
  end
end
|
40
51
|
|
data/lib/rbbt/ner/annotations.rb
CHANGED
@@ -1,5 +1,63 @@
|
|
1
1
|
module Segment
|
2
|
-
attr_accessor :offset
|
2
|
+
attr_accessor :offset, :docid
|
3
|
+
|
4
|
+
# Hook run when Segment is included into +base+. When +base+ provides a
# segment_types instance method, an +extended+ hook is installed on it so
# that every object extended with +base+ records the module name in its
# segment_types list (without duplicates).
def self.included(base)
  # method_defined? accepts the name regardless of whether instance_methods
  # would list it as a String (Ruby 1.8) or a Symbol (Ruby >= 1.9).
  return unless base.method_defined?(:segment_types)

  class << base
    define_method(:extended) do |object|
      object.segment_types ||= []
      type_name = to_s
      object.segment_types << type_name unless object.segment_types.include?(type_name)
    end
  end
end
|
16
|
+
|
17
|
+
# Turns +string+ into a Segment by extending it, records +offset+ and
# +docid+ on it, and returns the same string object.
def self.annotate(string, offset = nil, docid = nil)
  string.extend(Segment)
  string.offset, string.docid = offset, docid
  string
end
|
23
|
+
|
24
|
+
# MD5 hex digest built from the serialized info hash, the segment text and
# the offset (0 when the offset is nil).
def id
  signature = Misc.hash2string(info.dup)
  signature << self
  signature << (offset || 0).to_s
  Digest::MD5.hexdigest(signature)
end
|
28
|
+
|
29
|
+
# Attribute names left out of the info hash.
SKIP = %w(docid offset)

# Collects the values of the attributes that have a singleton setter
# (a singleton method whose name ends in "="), keyed by attribute name,
# leaving out the SKIP names and any attribute whose value is nil.
def info
  setter_mark = "="[0]
  attributes = {}

  singleton_methods.each do |name|
    next unless name[-1] == setter_mark

    field = name[(0..-2)]
    next unless respond_to?(field)
    next if SKIP.include?(field.to_s)

    attributes[field] = send(field)
  end

  attributes.delete_if { |_, value| value.nil? }
  attributes
end
|
39
|
+
|
40
|
+
# Rebuilds a segment from stored data.
#
# The segment text is text[start..eend] when both positions are given and
# info[:literal] otherwise. The modules named under the "segment_types"
# (or :segment_types) key are resolved with Misc.string2const and used to
# extend the segment; every other info entry is assigned through the
# matching setter when the segment has one. Finally docid and offset
# (start.to_i) are set. The caller's +info+ hash is left untouched.
def self.load(text, start, eend, info, docid = nil)
  # Work on a copy so that removing the type list does not alter the
  # caller's hash.
  info = info.dup

  string = text[start.to_i..eend.to_i] if start && eend
  string ||= info[:literal]
  string.extend Segment

  # add types
  types = info.delete("segment_types") || info.delete(:segment_types) || []
  types.each { |type| string.extend Misc.string2const(type) }

  # set info data; interpolation handles both String and Symbol keys, and
  # the check is made on the setter that is actually called
  info.each do |key, value|
    setter = "#{key}="
    string.send(setter, value) if string.respond_to?(setter)
  end

  string.docid = docid
  # NOTE(review): a nil start yields offset 0 here, as in the original.
  string.offset = start.to_i

  string
end
|
59
|
+
|
60
|
+
# {{{ Sorting and splitting
|
3
61
|
|
4
62
|
def self.sort(segments, inline = true)
|
5
63
|
if inline
|
@@ -14,21 +72,43 @@ module Segment
|
|
14
72
|
when (not a.range.include? b.offset and not b.range.include? a.offset)
|
15
73
|
a.offset <=> b.offset
|
16
74
|
else
|
17
|
-
|
75
|
+
a.length <=> b.length
|
18
76
|
end
|
19
|
-
end
|
77
|
+
end
|
20
78
|
else
|
21
|
-
segments.sort_by do |segment| segment.offset || 0 end
|
79
|
+
segments.sort_by do |segment| segment.offset || 0 end.reverse
|
80
|
+
end
|
81
|
+
end
|
82
|
+
|
83
|
+
# Walks +sorted_segments+ from last to first and returns, in that order,
# every segment whose range ends after the start of the segment visited
# just before it.
def self.overlaps(sorted_segments)
  previous_start = nil

  sorted_segments.reverse.select do |segment|
    clashes = !previous_start.nil? && segment.range.end > previous_start
    previous_start = segment.range.begin
    clashes
  end
end
|
94
|
+
|
95
|
+
# Sorts +segments+, drops those without an offset and removes the ones
# reported by overlaps, returning the remaining segments in sorted order.
def self.clean_sort(segments)
  sorted = sort(segments).reject { |segment| segment.offset.nil? }
  overlapping = overlaps(sorted)

  # Compare by identity: segments are strings, so removing with == would
  # also drop every other segment that happens to have the same text.
  sorted.reject { |segment| overlapping.any? { |other| other.equal?(segment) } }
end
|
24
104
|
|
25
|
-
def self.split(text, segments)
|
26
|
-
sorted_segments =
|
105
|
+
def self.split(text, segments, skip_segments = false)
|
106
|
+
sorted_segments = clean_sort segments
|
27
107
|
|
28
108
|
chunks = []
|
29
109
|
segment_end = 0
|
30
110
|
text_offset = 0
|
31
|
-
sorted_segments.each do |segment|
|
111
|
+
sorted_segments.reverse.each do |segment|
|
32
112
|
return chunks if text.nil? or text.empty?
|
33
113
|
next if segment.offset.nil?
|
34
114
|
offset = segment.offset - text_offset
|
@@ -45,12 +125,15 @@ module Segment
|
|
45
125
|
|
46
126
|
segment_end = offset + segment.length - 1
|
47
127
|
|
48
|
-
|
49
|
-
|
50
|
-
|
128
|
+
if not skip_segments
|
129
|
+
chunk = text[offset..segment_end]
|
130
|
+
Segment.annotate(chunk, text_offset + offset)
|
131
|
+
chunks << chunk
|
132
|
+
end
|
51
133
|
|
52
134
|
text_offset += segment_end + 1
|
53
135
|
text = text[segment_end + 1..-1]
|
136
|
+
|
54
137
|
end
|
55
138
|
|
56
139
|
if not text.nil? and text.any?
|
@@ -62,62 +145,110 @@ module Segment
|
|
62
145
|
chunks
|
63
146
|
end
|
64
147
|
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
148
|
+
# {{{ Ranges and manipulation
|
149
|
+
|
150
|
+
# Adds +offset+ to this segment's offset; the offset becomes nil when
# either value is nil. Returns self.
def pull(offset)
  current = self.offset
  self.offset = (current.nil? || offset.nil?) ? nil : current + offset
  self
end
|
159
|
+
|
160
|
+
# Subtracts +offset+ from this segment's offset; the offset becomes nil
# when either value is nil. Returns self.
def push(offset)
  current = self.offset
  self.offset = (current.nil? || offset.nil?) ? nil : current - offset
  self
end
|
169
|
+
|
170
|
+
# Calls push on each of +segments+ with this segment's offset and returns
# the collected results.
def make_relative(segments)
  segments.map do |segment|
    segment.push(offset)
  end
end
|
173
|
+
|
174
|
+
# Position of the last character of the segment (offset + length - 1), or
# nil when the segment has no offset.
def end
  start = offset
  start.nil? ? nil : start + length - 1
end
|
70
178
|
|
71
179
|
# Inclusive range from the segment's offset to its end position. Raises a
# RuntimeError when the segment has no offset.
def range
  raise "No offset specified" if offset.nil?
  Range.new(offset, self.end)
end
|
74
|
-
end
|
75
183
|
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
184
|
+
# Range of this segment expressed relative to +container+: a Segment with
# an offset, or an Integer position. For any other container the plain
# range is returned. Raises a RuntimeError when the segment has no offset.
def range_in(container = nil)
  raise "No offset specified" if offset.nil?

  shift =
    if Segment === container && !container.offset.nil?
      container.offset
    elsif Integer === container
      container
    end

  return range if shift.nil?

  ((offset - shift)..(self.end - shift))
end
|
83
195
|
|
84
|
-
def
|
85
|
-
|
196
|
+
# Locates each of +parts+ in +text+, left to right, and annotates it with
# Segment.annotate using its offset in the original text. Each search starts
# right after the previously located part; parts that cannot be found are
# left untouched. Returns +parts+.
def self.align(text, parts)
  consumed = 0
  remaining = text

  parts.each do |part|
    position = remaining.index part
    next if position.nil?

    Segment.annotate(part, consumed + position)

    # Move past the whole part so the next one cannot match inside it (and
    # an empty part does not move the cursor backwards).
    advance = position + part.length
    consumed += advance
    remaining = remaining[advance..-1]
  end
end
|
87
|
-
end
|
88
206
|
|
89
|
-
|
90
|
-
|
91
|
-
|
207
|
+
# Pairs a position index with the data it refers to: looking up a position
# returns the data entries for the ids stored at that position.
class Index
  attr_accessor :index, :data

  def initialize(index, data)
    @index, @data = index, data
  end

  # Data entries for every id the index holds at +pos+.
  def [](pos)
    index[pos].map { |id| data[id] }
  end
end
|
101
218
|
|
102
|
-
def
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
219
|
+
# Builds an Index over +segments+ (an Array, or a Hash whose values are
# flattened). The range table is built inside Persistence.persist, which is
# asked to persist only when +persistence_file+ is neither nil nor :memory.
# Segments without an offset are left out of the range table but are still
# present in the id => segment data of the returned Index.
def self.index(segments, persistence_file = :memory)
  segments = segments.values.flatten if Hash === segments

  use_persistence = !(persistence_file.nil? || persistence_file == :memory)

  annotation_index =
    Persistence.persist("Index", :Index, :fwt, :persistence => use_persistence, :persistence_file => persistence_file, :range => true) do
      index_data = segments.reject { |segment| segment.offset.nil? }.map do |segment|
        span = segment.range
        [segment.id, [span.begin, span.end]]
      end

      # Width of the longest id, 0 when there is nothing to index.
      value_size = index_data.map { |id, _| id.length }.max || 0

      fwt = FixWidthTable.get :memory, value_size, true
      fwt.add_range index_data
      fwt
    end

  data = {}
  segments.each { |segment| data[segment.id] = segment }
  Index.new annotation_index, data
end
|
243
|
+
|
111
244
|
end
|
112
245
|
|
113
|
-
module
|
246
|
+
# Segment carrying a free-text comment.
module Comment
  include Segment
  attr_accessor :comment

  # Extends +text+ with Comment and stores +comment+ on it; when no comment
  # is given the text itself is used. Returns +text+.
  def self.annotate(text, comment = nil)
    text.extend(self)
    text.comment = comment.nil? ? text : comment
    text
  end
end
|
123
|
-
|