rbbt-text 0.2.1 → 0.5.0
Sign up to get free protection for your applications and to get access to all the features.
- data/bin/get_ppis.rb +52 -0
- data/lib/rbbt/bow/dictionary.rb +9 -9
- data/lib/rbbt/bow/misc.rb +86 -2
- data/lib/rbbt/corpus/corpus.rb +55 -0
- data/lib/rbbt/corpus/document.rb +289 -0
- data/lib/rbbt/corpus/document_repo.rb +115 -0
- data/lib/rbbt/corpus/sources/pubmed.rb +26 -0
- data/lib/rbbt/ner/NER.rb +7 -5
- data/lib/rbbt/ner/abner.rb +13 -2
- data/lib/rbbt/ner/annotations.rb +182 -51
- data/lib/rbbt/ner/annotations/annotated.rb +15 -0
- data/lib/rbbt/ner/annotations/named_entity.rb +37 -0
- data/lib/rbbt/ner/annotations/relations.rb +25 -0
- data/lib/rbbt/ner/annotations/token.rb +28 -0
- data/lib/rbbt/ner/annotations/transformed.rb +170 -0
- data/lib/rbbt/ner/banner.rb +8 -5
- data/lib/rbbt/ner/chemical_tagger.rb +34 -0
- data/lib/rbbt/ner/ngram_prefix_dictionary.rb +136 -0
- data/lib/rbbt/ner/oscar3.rb +1 -1
- data/lib/rbbt/ner/oscar4.rb +41 -0
- data/lib/rbbt/ner/patterns.rb +132 -0
- data/lib/rbbt/ner/rnorm.rb +141 -0
- data/lib/rbbt/ner/rnorm/cue_index.rb +80 -0
- data/lib/rbbt/ner/rnorm/tokens.rb +218 -0
- data/lib/rbbt/ner/token_trieNER.rb +185 -51
- data/lib/rbbt/nlp/genia/sentence_splitter.rb +214 -0
- data/lib/rbbt/nlp/nlp.rb +235 -0
- data/share/install/software/ABNER +0 -4
- data/share/install/software/ChemicalTagger +81 -0
- data/share/install/software/Gdep +115 -0
- data/share/install/software/Geniass +118 -0
- data/share/install/software/OSCAR4 +16 -0
- data/share/install/software/StanfordParser +15 -0
- data/share/patterns/drug_induce_disease +22 -0
- data/share/rnorm/cue_default +10 -0
- data/share/rnorm/tokens_default +86 -0
- data/share/{stopwords → wordlists/stopwords} +0 -0
- data/test/rbbt/bow/test_bow.rb +1 -1
- data/test/rbbt/bow/test_dictionary.rb +1 -1
- data/test/rbbt/bow/test_misc.rb +1 -1
- data/test/rbbt/corpus/test_corpus.rb +99 -0
- data/test/rbbt/corpus/test_document.rb +222 -0
- data/test/rbbt/ner/annotations/test_named_entity.rb +14 -0
- data/test/rbbt/ner/annotations/test_transformed.rb +175 -0
- data/test/rbbt/ner/test_abner.rb +1 -1
- data/test/rbbt/ner/test_annotations.rb +64 -2
- data/test/rbbt/ner/test_banner.rb +1 -1
- data/test/rbbt/ner/test_chemical_tagger.rb +56 -0
- data/test/rbbt/ner/test_ngram_prefix_dictionary.rb +20 -0
- data/test/rbbt/ner/{test_oscar3.rb → test_oscar4.rb} +12 -13
- data/test/rbbt/ner/test_patterns.rb +66 -0
- data/test/rbbt/ner/test_regexpNER.rb +1 -1
- data/test/rbbt/ner/test_rnorm.rb +47 -0
- data/test/rbbt/ner/test_token_trieNER.rb +60 -35
- data/test/rbbt/nlp/test_nlp.rb +88 -0
- data/test/test_helper.rb +20 -0
- metadata +93 -20
@@ -0,0 +1,115 @@
|
|
1
|
+
require 'rbbt/util/misc'
|
2
|
+
require 'tokyocabinet'
|
3
|
+
|
4
|
+
# Document store backed by a TokyoCabinet B+ tree file. Documents are keyed by
# a docid string of the form "namespace:id:type:hash".
class DocumentRepo < TokyoCabinet::BDB
  class OpenError < StandardError; end
  class KeyFormatError < StandardError; end

  # Repositories already opened by this process, indexed by database path.
  CONNECTIONS = {} unless defined? CONNECTIONS

  # Returns the shared repository for +path+, creating the connection when it
  # is not registered yet or the file does not exist, and leaves it in write
  # mode when +write+ is true, read mode otherwise.
  #
  # NOTE(review): when the file is missing but a connection is registered, the
  # old connection is replaced without being closed -- confirm this is intended.
  def self.get(path, write = false)
    if !File.exist?(path) || !CONNECTIONS.include?(path)
      CONNECTIONS[path] = new(path, true)
    end

    d = CONNECTIONS[path]

    # Only switch modes when the current mode differs from the requested one.
    # (Previously a request for write access on an already writable
    # connection fell through to the read branch and reopened it read-only.)
    if write
      d.write unless d.write?
    else
      d.read if d.write?
    end

    d
  end

  alias original_open open

  # Opens the database file at the path given to the constructor. In write
  # mode the file is created if needed. Creates the parent directory first.
  #
  # Raises OpenError when TokyoCabinet reports a failure.
  def open(write = false)
    flags = write ? (TokyoCabinet::BDB::OWRITER | TokyoCabinet::BDB::OCREAT) : TokyoCabinet::BDB::OREADER

    directory = File.dirname(@path_to_db)
    FileUtils.mkdir_p directory unless File.exist?(directory)

    unless original_open(@path_to_db, flags)
      code = ecode
      raise OpenError, "Open error: #{errmsg(code)}. Trying to open file #{@path_to_db}"
    end

    @write = write
  end

  # True when the database was last opened in write mode.
  def write?
    @write
  end

  # Closes the database and reopens it in write mode.
  def write
    close
    open(true)
  end

  # Closes the database and reopens it in read-only mode.
  def read
    close
    open(false)
  end

  # Opens (or creates) the database at +path+. A missing file is always
  # opened in write mode so that it gets created.
  def initialize(path, write = false)
    super()

    @path_to_db = path

    if write || !File.exist?(@path_to_db)
      raise "Error setting cache" unless setcache(100000)
      open(true)
    else
      open(false)
    end
  end

  # Splits a docid into [namespace, id, type, hash]; missing trailing fields
  # come back as nil, empty fields as "".
  def docid2fields(docid)
    docid.split(":", -1).values_at 0, 1, 2, 3
  end

  # Joins the four fields into a docid; nil fields become empty strings.
  def fields2docid(namespace = nil, id = nil, type = nil, hash = nil)
    [namespace, id, type, hash] * ":"
  end

  # Returns the text stored under +docid+.
  def docid(docid)
    get(docid)
  end

  # Stores +text+ under the docid built from the given fields unless that
  # docid is already present (switching to write mode if needed). Returns the
  # docid.
  def add(text, namespace, id, type, hash)
    write unless write?
    docid = fields2docid(namespace, id, type, hash)
    self[docid] = text unless include?(docid)
    docid
  end

  # Lists docids matching the given fields. With no namespace all keys are
  # returned; when +hash+ is given the single fully specified docid is
  # returned (without checking that it exists); otherwise the keys sharing
  # the given prefix are looked up.
  def find(namespace = nil, id = nil, type = nil, hash = nil)
    case
    when namespace.nil?
      keys
    when id.nil?
      docids_with_prefix([namespace])
    when type && hash
      [[namespace, id, type, hash] * ":"]
    when hash
      [[namespace, id, "", hash] * ":"]
    when type
      docids_with_prefix([namespace, id, type])
    else
      docids_with_prefix([namespace, id])
    end
  end

  # Runs #find with the fields parsed from a (possibly partial) docid.
  def find_docid(docid)
    find(*docid2fields(docid))
  end

  private

  # Keys that start with the given fields followed by ":". The upper bound
  # uses ";" (the character right after ":") and is exclusive.
  def docids_with_prefix(fields)
    prefix = fields * ":"
    range(prefix + ':', true, prefix + ';', false)
  end
end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
require 'rbbt/sources/pubmed'
|
2
|
+
|
3
|
+
# PubMed support for Corpus: registers the :pubmed namespace and loads
# articles as documents.
class Corpus

  NAMESPACES = {} unless defined? NAMESPACES
  NAMESPACES[:pubmed] = :add_pmid

  # Adds one document per PubMed article for the given pmid (or array of
  # pmids). With no +type+ the abstract is used only when the article has no
  # pdf_url; a +type+ of "abstract" forces the abstract; any other type asks
  # for the full text and raises if the article has no pdf_url.
  # Returns the collected results of add_document.
  def add_pmid(pmid, type = nil)
    pmids = pmid.is_a?(Array) ? pmid : [pmid]
    type = nil if type.is_a?(String) && type.empty?

    PubMed.get_article(pmids).collect do |pmid, article|
      use_abstract = type.nil? ? article.pdf_url.nil? : type.to_sym == :abstract

      if use_abstract
        add_document(article.text, :pubmed, pmid, :abstract)
      elsif article.pdf_url.nil?
        raise "No FullText available for #{ pmid }"
      else
        add_document(article.full_text, :pubmed, pmid, :fulltext)
      end
    end
  end

  # Runs a PubMed query limited to +max+ results and adds every hit through
  # add_pmid.
  def add_pubmed_query(query, max, type = nil)
    add_pmid(PubMed.query(query, max), type)
  end
end
|
data/lib/rbbt/ner/NER.rb
CHANGED
@@ -1,18 +1,20 @@
|
|
1
1
|
require 'rbbt/ner/annotations'
|
2
|
+
require 'rbbt/ner/annotations/named_entity'
|
3
|
+
require 'rbbt/ner/annotations/annotated'
|
2
4
|
|
3
5
|
class NER
|
4
|
-
def entities(text,
|
6
|
+
def entities(text, protect = false, *args)
|
5
7
|
case
|
6
8
|
when Array === text
|
7
9
|
text.collect do |element|
|
8
|
-
matches = entities(element,
|
10
|
+
matches = entities(element, protect, *args)
|
9
11
|
matches.each{|match|
|
10
|
-
match.offset += element.offset if match.offset
|
12
|
+
match.offset += element.offset if match.offset and element.offset
|
11
13
|
}
|
12
14
|
matches
|
13
15
|
end.flatten
|
14
|
-
when (Annotated === text and
|
15
|
-
entities(text.
|
16
|
+
when (Annotated === text and protect)
|
17
|
+
entities(text.split_segments(true), protect, *args)
|
16
18
|
else
|
17
19
|
match(text, *args)
|
18
20
|
end
|
data/lib/rbbt/ner/abner.rb
CHANGED
@@ -7,7 +7,7 @@ require 'rbbt/ner/NER'
|
|
7
7
|
# in Java Abner[http://www.cs.wisc.edu/~bsettles/abner/].
|
8
8
|
class Abner < NER
|
9
9
|
|
10
|
-
Rbbt.
|
10
|
+
Rbbt.software.opt.ABNER.define_as_install Rbbt.share.install.software.ABNER.find
|
11
11
|
|
12
12
|
@@JFile = Rjb::import('java.io.File')
|
13
13
|
@@Tagger = Rjb::import('abner.Tagger')
|
@@ -27,14 +27,25 @@ class Abner < NER
|
|
27
27
|
# returns all the mentions found, regardless of type, to be coherent
|
28
28
|
# with the rest of NER packages in Rbbt.
|
29
29
|
# Tags +text+ with @tagger and returns the mentions as strings, each passed
# through NamedEntity.annotate with its type and its offset in the original
# text. Offsets are found by searching for each mention after the end of the
# previous one; a mention that cannot be located gets a nil offset.
def match(text)
  return [] if text.nil? || text.empty?

  tagged = @tagger.getEntities(text)
  labels = tagged[1]
  mentions = tagged[0]

  consumed = 0 # characters of the original text already skipped
  mentions.zip(labels).collect do |mention, label|
    mention = mention.to_s
    position = text.index(mention)

    if position
      NamedEntity.annotate(mention, position + consumed, label.to_s)
      past_mention = position + mention.length
      text = text[past_mention..-1]
      consumed += past_mention
    else
      NamedEntity.annotate(mention, nil, label.to_s)
    end

    mention
  end
end
|
40
51
|
|
data/lib/rbbt/ner/annotations.rb
CHANGED
@@ -1,5 +1,63 @@
|
|
1
1
|
module Segment
|
2
|
-
attr_accessor :offset
|
2
|
+
attr_accessor :offset, :docid
|
3
|
+
|
4
|
+
# Hook run when this module is included into +base+. When +base+ already
# defines a +segment_types+ instance method, +base+ receives an +extended+
# hook that records its own name in the +segment_types+ list of every object
# it extends (once per name).
def self.included(base)
  # method_defined? works whether instance_methods lists Strings (Ruby 1.8)
  # or Symbols (1.9+); comparing against a String never matched on 1.9+.
  return unless base.method_defined?(:segment_types)

  class << base
    define_method "extended" do |object|
      object.segment_types ||= []
      object.segment_types << self.to_s unless object.segment_types.include? self.to_s
    end
  end
end
|
16
|
+
|
17
|
+
# Extends +string+ with Segment, sets its offset and docid, and returns the
# same string object.
def self.annotate(string, offset = nil, docid = nil)
  string.extend(Segment)
  string.offset, string.docid = offset, docid
  string
end
|
23
|
+
|
24
|
+
# MD5 hex digest built from the serialized info hash, the segment text and
# the offset (0 when the offset is nil).
def id
  payload = Misc.hash2string(info.dup)
  payload << self << (offset || 0).to_s
  Digest::MD5.hexdigest(payload)
end
|
28
|
+
|
29
|
+
# Attributes left out of the info hash.
SKIP = %w(docid offset)

# Collects the values of every attribute that has a singleton setter (a
# method whose name ends in "="), keyed by attribute name, leaving out the
# SKIP attributes and nil values.
def info
  setter_mark = "="[0]
  attributes = {}

  singleton_methods.each do |name|
    next unless name[-1] == setter_mark
    reader = name[(0..-2)]
    next unless self.respond_to?(reader)
    next if SKIP.include? reader.to_s
    attributes[reader] = self.send(reader)
  end

  attributes.delete_if { |_, value| value.nil? }
  attributes
end
|
39
|
+
|
40
|
+
# Rebuilds a segment from stored data.
#
# text::  text the segment was taken from
# start:: first character position (inclusive); may be nil
# eend::  last character position (inclusive); may be nil
# info::  attribute hash; its "segment_types" entry (String or Symbol key)
#         is removed from the hash and lists the modules to extend the
#         segment with
# docid:: document id assigned to the segment
#
# The segment text is text[start..eend] when both positions are given,
# otherwise info[:literal]. Every remaining info entry is assigned through
# the matching setter when the segment responds to that attribute.
#
# NOTE(review): the offset is set to start.to_i, i.e. 0 when start is nil.
def self.load(text, start, eend, info, docid = nil)
  string = text[start.to_i..eend.to_i] if start and eend
  string ||= info[:literal]
  string.extend Segment

  # add types
  types = info.delete("segment_types") || info.delete(:segment_types) || []
  types.each { |type| string.extend Misc.string2const(type) }

  # set info data; interpolation builds the setter name for both String and
  # Symbol keys (Symbol has no +, so key + '=' raised for Symbol keys)
  info.each do |key, value|
    string.send("#{key}=", value) if string.respond_to?(key.to_sym)
  end

  string.docid = docid
  string.offset = start.to_i

  string
end
|
59
|
+
|
60
|
+
# {{{ Sorting and splitting
|
3
61
|
|
4
62
|
def self.sort(segments, inline = true)
|
5
63
|
if inline
|
@@ -14,21 +72,43 @@ module Segment
|
|
14
72
|
when (not a.range.include? b.offset and not b.range.include? a.offset)
|
15
73
|
a.offset <=> b.offset
|
16
74
|
else
|
17
|
-
|
75
|
+
a.length <=> b.length
|
18
76
|
end
|
19
|
-
end
|
77
|
+
end
|
20
78
|
else
|
21
|
-
segments.sort_by do |segment| segment.offset || 0 end
|
79
|
+
segments.sort_by do |segment| segment.offset || 0 end.reverse
|
80
|
+
end
|
81
|
+
end
|
82
|
+
|
83
|
+
# Walks +sorted_segments+ from last to first and returns, in that order, the
# segments whose range ends after the beginning of the segment visited just
# before them.
def self.overlaps(sorted_segments)
  previous_begin = nil

  sorted_segments.reverse.select do |segment|
    overlapping = !previous_begin.nil? && segment.range.end > previous_begin
    previous_begin = segment.range.begin
    overlapping
  end
end
|
94
|
+
|
95
|
+
# Sorts +segments+, drops those without an offset and then removes the
# segments reported by overlaps. Returns a new array.
def self.clean_sort(segments)
  sorted = sort(segments).reject { |segment| segment.offset.nil? }
  overlapping = overlaps(sorted)

  # Remove by object identity: segments are strings, so removing by equality
  # (Array#delete) would also drop every other segment with the same text.
  sorted.reject { |segment| overlapping.any? { |other| other.equal?(segment) } }
end
|
24
104
|
|
25
|
-
def self.split(text, segments)
|
26
|
-
sorted_segments =
|
105
|
+
def self.split(text, segments, skip_segments = false)
|
106
|
+
sorted_segments = clean_sort segments
|
27
107
|
|
28
108
|
chunks = []
|
29
109
|
segment_end = 0
|
30
110
|
text_offset = 0
|
31
|
-
sorted_segments.each do |segment|
|
111
|
+
sorted_segments.reverse.each do |segment|
|
32
112
|
return chunks if text.nil? or text.empty?
|
33
113
|
next if segment.offset.nil?
|
34
114
|
offset = segment.offset - text_offset
|
@@ -45,12 +125,15 @@ module Segment
|
|
45
125
|
|
46
126
|
segment_end = offset + segment.length - 1
|
47
127
|
|
48
|
-
|
49
|
-
|
50
|
-
|
128
|
+
if not skip_segments
|
129
|
+
chunk = text[offset..segment_end]
|
130
|
+
Segment.annotate(chunk, text_offset + offset)
|
131
|
+
chunks << chunk
|
132
|
+
end
|
51
133
|
|
52
134
|
text_offset += segment_end + 1
|
53
135
|
text = text[segment_end + 1..-1]
|
136
|
+
|
54
137
|
end
|
55
138
|
|
56
139
|
if not text.nil? and text.any?
|
@@ -62,62 +145,110 @@ module Segment
|
|
62
145
|
chunks
|
63
146
|
end
|
64
147
|
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
148
|
+
# {{{ Ranges and manipulation
|
149
|
+
|
150
|
+
# Adds +offset+ to this segment's offset; the offset becomes nil when either
# value is nil. Returns self.
def pull(offset)
  unknown = self.offset.nil? || offset.nil?
  self.offset = unknown ? nil : self.offset + offset
  self
end
|
159
|
+
|
160
|
+
# Subtracts +offset+ from this segment's offset; the offset becomes nil when
# either value is nil. Returns self.
def push(offset)
  unknown = self.offset.nil? || offset.nil?
  self.offset = unknown ? nil : self.offset - offset
  self
end
|
169
|
+
|
170
|
+
# Calls push with this segment's offset on every one of +segments+ (modifying
# them in place) and returns the results.
def make_relative(segments)
  segments.map do |segment|
    segment.push(offset)
  end
end
|
173
|
+
|
174
|
+
# Position of the last character of the segment (inclusive), or nil when the
# segment has no offset.
def end
  offset.nil? ? nil : offset + length - 1
end
|
70
178
|
|
71
179
|
# Inclusive range of positions covered by the segment.
# Raises when the segment has no offset.
def range
  raise "No offset specified" if offset.nil?

  Range.new(offset, self.end)
end
|
74
|
-
end
|
75
183
|
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
184
|
+
# Range of the segment relative to +container+: a Segment with an offset or
# an Integer shifts the range back by that amount; anything else yields the
# plain range. Raises when the segment has no offset.
def range_in(container = nil)
  raise "No offset specified" if offset.nil?

  shift =
    if Segment === container && !container.offset.nil?
      container.offset
    elsif Integer === container
      container
    end

  return range if shift.nil?

  ((offset - shift)..(self.end - shift))
end
|
83
195
|
|
84
|
-
def
|
85
|
-
|
196
|
+
# Annotates each of +parts+ (in place) with its offset in +text+. Parts are
# searched for in order, each one after the end of the previous match; parts
# that cannot be found are left untouched. Returns +parts+.
def self.align(text, parts)
  pre_offset = 0
  parts.each do |part|
    offset = text.index part
    next if offset.nil?
    Segment.annotate(part, pre_offset + offset)

    # Skip the whole match. Advancing by length - 1 left the last matched
    # character searchable (wrong offsets for the following parts) and
    # sliced with -1 for empty parts.
    consumed = offset + part.length
    pre_offset += consumed
    text = text[consumed..-1]
  end
end
|
87
|
-
end
|
88
206
|
|
89
|
-
|
90
|
-
|
91
|
-
|
207
|
+
# Pairs a position index (position -> list of ids) with the data the ids
# refer to (id -> object).
class Index
  attr_accessor :index, :data

  def initialize(index, data)
    @index, @data = index, data
  end

  # Objects registered for position +pos+.
  def [](pos)
    index[pos].map { |id| data[id] }
  end
end
|
101
218
|
|
102
|
-
def
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
219
|
+
# Builds an Index over +segments+ (an array, or a hash whose values are
# flattened into one). The position table is produced through
# Persistence.persist, persisted only when +persistence_file+ is neither nil
# nor :memory, and holds the id and range of every segment that has an
# offset. The data hash maps every segment id to its segment.
def self.index(segments, persistence_file = :memory)
  segments = segments.values.flatten if Hash === segments

  in_memory = persistence_file.nil? || persistence_file == :memory

  annotation_index =
    Persistence.persist("Index", :Index, :fwt, :persistence => !in_memory, :persistence_file => persistence_file, :range => true) do
      widest_id = 0
      ranges = []

      segments.each do |segment|
        next if segment.offset.nil?
        span = segment.range
        widest_id = [segment.id.length, widest_id].max
        ranges << [segment.id, [span.begin, span.end]]
      end

      table = FixWidthTable.get :memory, widest_id, true
      table.add_range ranges
      table
    end

  by_id = {}
  segments.each { |segment| by_id[segment.id] = segment }

  Index.new annotation_index, by_id
end
|
243
|
+
|
111
244
|
end
|
112
245
|
|
113
|
-
module
|
246
|
+
# Segment carrying a free-text comment.
module Comment
  include Segment
  attr_accessor :comment

  # Extends +text+ with Comment and sets its comment, which defaults to the
  # text itself. Returns +text+.
  def self.annotate(text, comment = nil)
    text.extend self
    text.comment = comment.nil? ? text : comment
    text
  end
end
|
123
|
-
|