rbbt-text 0.2.1 → 0.5.0
Sign up to get free protection for your applications and to get access to all the features.
- data/bin/get_ppis.rb +52 -0
- data/lib/rbbt/bow/dictionary.rb +9 -9
- data/lib/rbbt/bow/misc.rb +86 -2
- data/lib/rbbt/corpus/corpus.rb +55 -0
- data/lib/rbbt/corpus/document.rb +289 -0
- data/lib/rbbt/corpus/document_repo.rb +115 -0
- data/lib/rbbt/corpus/sources/pubmed.rb +26 -0
- data/lib/rbbt/ner/NER.rb +7 -5
- data/lib/rbbt/ner/abner.rb +13 -2
- data/lib/rbbt/ner/annotations.rb +182 -51
- data/lib/rbbt/ner/annotations/annotated.rb +15 -0
- data/lib/rbbt/ner/annotations/named_entity.rb +37 -0
- data/lib/rbbt/ner/annotations/relations.rb +25 -0
- data/lib/rbbt/ner/annotations/token.rb +28 -0
- data/lib/rbbt/ner/annotations/transformed.rb +170 -0
- data/lib/rbbt/ner/banner.rb +8 -5
- data/lib/rbbt/ner/chemical_tagger.rb +34 -0
- data/lib/rbbt/ner/ngram_prefix_dictionary.rb +136 -0
- data/lib/rbbt/ner/oscar3.rb +1 -1
- data/lib/rbbt/ner/oscar4.rb +41 -0
- data/lib/rbbt/ner/patterns.rb +132 -0
- data/lib/rbbt/ner/rnorm.rb +141 -0
- data/lib/rbbt/ner/rnorm/cue_index.rb +80 -0
- data/lib/rbbt/ner/rnorm/tokens.rb +218 -0
- data/lib/rbbt/ner/token_trieNER.rb +185 -51
- data/lib/rbbt/nlp/genia/sentence_splitter.rb +214 -0
- data/lib/rbbt/nlp/nlp.rb +235 -0
- data/share/install/software/ABNER +0 -4
- data/share/install/software/ChemicalTagger +81 -0
- data/share/install/software/Gdep +115 -0
- data/share/install/software/Geniass +118 -0
- data/share/install/software/OSCAR4 +16 -0
- data/share/install/software/StanfordParser +15 -0
- data/share/patterns/drug_induce_disease +22 -0
- data/share/rnorm/cue_default +10 -0
- data/share/rnorm/tokens_default +86 -0
- data/share/{stopwords → wordlists/stopwords} +0 -0
- data/test/rbbt/bow/test_bow.rb +1 -1
- data/test/rbbt/bow/test_dictionary.rb +1 -1
- data/test/rbbt/bow/test_misc.rb +1 -1
- data/test/rbbt/corpus/test_corpus.rb +99 -0
- data/test/rbbt/corpus/test_document.rb +222 -0
- data/test/rbbt/ner/annotations/test_named_entity.rb +14 -0
- data/test/rbbt/ner/annotations/test_transformed.rb +175 -0
- data/test/rbbt/ner/test_abner.rb +1 -1
- data/test/rbbt/ner/test_annotations.rb +64 -2
- data/test/rbbt/ner/test_banner.rb +1 -1
- data/test/rbbt/ner/test_chemical_tagger.rb +56 -0
- data/test/rbbt/ner/test_ngram_prefix_dictionary.rb +20 -0
- data/test/rbbt/ner/{test_oscar3.rb → test_oscar4.rb} +12 -13
- data/test/rbbt/ner/test_patterns.rb +66 -0
- data/test/rbbt/ner/test_regexpNER.rb +1 -1
- data/test/rbbt/ner/test_rnorm.rb +47 -0
- data/test/rbbt/ner/test_token_trieNER.rb +60 -35
- data/test/rbbt/nlp/test_nlp.rb +88 -0
- data/test/test_helper.rb +20 -0
- metadata +93 -20
@@ -0,0 +1,15 @@
|
|
1
|
+
require 'rbbt/ner/annotations'
|
2
|
+
# Mixin that lets a string carry a list of annotations.
module Annotated
  attr_accessor :annotations

  # Extends +string+ with Annotated and stores +annotations+ on it, falling
  # back to a new empty Array when none are given. Returns the same string
  # object that was passed in.
  def self.annotate(string, annotations = nil)
    string.extend(Annotated)
    string.tap { |annotated| annotated.annotations = annotations || [] }
  end

  # Hands this string, its annotations and the +skip_segments+ flag over to
  # Segment.split and returns whatever that call returns.
  def split_segments(skip_segments = false)
    Segment.split(self, @annotations, skip_segments)
  end
end
|
14
|
+
|
15
|
+
|
@@ -0,0 +1,37 @@
|
|
1
|
+
require 'rbbt/ner/annotations'
|
2
|
+
|
3
|
+
# Segment mixin for strings that mention an entity. Besides what Segment
# contributes, an entity carries a type, a code, a score and segment_types.
module NamedEntity
  attr_accessor :type, :code, :score, :segment_types
  include Segment

  # Extends +string+ with NamedEntity and assigns every attribute that was
  # given as non-nil; attributes passed as nil are left untouched.
  # Returns the same string.
  # NOTE(review): +offset=+ is presumably provided by Segment -- confirm.
  def self.annotate(string, offset = nil, type = nil, code = nil, score = nil)
    string.extend NamedEntity
    [[:offset=, offset], [:type=, type], [:code=, code], [:score=, score]].each do |setter, value|
      string.send(setter, value) unless value.nil?
    end
    string
  end

  # Multi-line, human readable summary: one "Label: value" line per
  # attribute, each terminated by a newline.
  def report
    [
      "String: #{ self }",
      "Offset: #{ offset.inspect }",
      "Type: #{type.inspect}",
      "Code: #{code.inspect}",
      "Score: #{score.inspect}"
    ].collect { |line| line + "\n" }.join
  end

  # Renders the entity as a <span class='Entity'> element. An
  # attr-entity-* attribute is emitted for each of type, code and score that
  # is not nil; Array values are joined with single spaces. The text is
  # inserted as-is (no HTML escaping is performed).
  def html
    attributes = [["type", type], ["code", code], ["score", score]].collect do |name, value|
      if value.nil?
        ""
      else
        rendered = Array === value ? value * " " : value
        " attr-entity-#{name}='#{rendered}'"
      end
    end

    "<span class='Entity'#{attributes.join}>#{ self }</span>"
  end
end
|
37
|
+
|
@@ -0,0 +1,25 @@
|
|
1
|
+
require 'rbbt/ner/annotations'
|
2
|
+
|
3
|
+
# Segment mixin for strings that describe a relationship between terms.
module Relationship
  attr_accessor :terms, :segment_types
  include Segment

  # Extends +string+ with Relationship and assigns +offset+ and +terms+ when
  # they are not nil. Returns the same string.
  #
  # The string is extended with this module itself, in line with the other
  # annotation mixins; the previous code extended it with +PPI+, a constant
  # this file never defines.
  # NOTE(review): +offset=+ is presumably provided by Segment -- confirm.
  def self.annotate(string, offset = nil, terms = nil)
    string.extend Relationship
    string.offset = offset unless offset.nil?
    string.terms = terms unless terms.nil?
    string
  end

  # Renders the relationship text wrapped in a <span class='Relationship'>
  # element. The text is inserted as-is (no HTML escaping is performed).
  def html
    "<span class='Relationship'>#{ self }</span>"
  end

  # NOTE(review): unfinished stub. It walks the annotation lists selected by
  # +types+ with an empty block, so it only returns the values_at result.
  # +annotations+ is not defined by this module; presumably the string is
  # expected to respond to it through another mixin -- confirm.
  def html_with_entities(*types)
    annotations.values_at(*types).each do |segments|
    end
  end
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
require 'rbbt/ner/annotations'
|
2
|
+
|
3
|
+
# Segment mixin for tokens. A token remembers the text it was created from in
# +original+.
module Token
  include Segment
  attr_accessor :original

  # Extends +string+ with Token, assigns +offset+ when it is not nil and
  # stores +original+, defaulting to a copy of the string itself.
  # Returns the same string.
  def self.annotate(string, offset = nil, original = nil)
    string.extend Token
    string.offset = offset unless offset.nil?
    string.original = original ? original : string.dup
    string
  end

  # Splits +text+ into Token strings.
  #
  # +split_at+ is matched repeatedly against the remaining text. The text in
  # front of each match becomes a token, and so does the first capture group
  # when it matched something non-empty (with the default pattern:
  # punctuation is kept as a token, whitespace is dropped). +start+ is the
  # offset given to the first character of +text+; token offsets are
  # computed relative to it.
  def self.tokenize(text, split_at = /\s|(\(|\)|[-."':,])/, start = 0)
    tokens = []
    remaining = text
    position = start

    loop do
      boundary = remaining.match(split_at)
      break unless boundary

      leading = boundary.pre_match
      tokens << Token.annotate(leading, position) unless leading.empty?

      captures = boundary.captures
      if captures.any? and not captures.first.empty?
        tokens << Token.annotate(captures.first, position + boundary.begin(1))
      end

      # Continue right after the whole match, tracking the absolute position
      position += boundary.end(0)
      remaining = boundary.post_match
    end

    tokens << Token.annotate(remaining, position) unless remaining.empty?

    tokens
  end
end
|
28
|
+
|
@@ -0,0 +1,170 @@
|
|
1
|
+
require 'rbbt/ner/annotations'
|
2
|
+
# Mixin for strings in which segments can be replaced by other text and later
# restored.
#
# Every call to #replace is one transformation "round". For each round the
# string records, per replaced segment, an entry
#   [offset, diff, orig_length, trans_length]
# (begin of the segment range as returned by +segment.range_in+, length
# difference, length of the text that was replaced, length of the replacement)
# in +transformation_offset_differences+, and the replaced text itself in
# +transformation_original+.
#
# NOTE: extending a string with this module overrides String#replace on that
# string with the segment-replacing #replace defined here.
module Transformed
  attr_accessor :transformation_offset_differences, :transformation_original

  # Replaces +segments+ in +text+ with +replacement+, yields the transformed
  # text and then undoes that last round of replacements. If the block
  # returns an Array it is treated as a list of segments found in the
  # transformed text and handed to #restore so they are adjusted; any other
  # block result is ignored. Returns what #restore returns.
  def self.with_transform(text, segments, replacement)
    require 'rbbt/util/misc'

    text.extend Transformed
    text.replace(segments, replacement)

    segments = yield text

    segments = nil unless Array === segments

    text.restore(segments, true)
  end

  # Replaces +segments+ in +text+ with +replacement+ (a String) or with the
  # result of calling the block/Proc with each segment. Returns +text+, which
  # stays transformed until #restore is called.
  def self.transform(text, segments, replacement = nil, &block)
    require 'rbbt/util/misc'

    text.extend Transformed
    text.replace(segments, replacement, &block)

    text
  end

  # Translates +pos+ by applying the recorded rounds, oldest first. Within a
  # round, the +diff+ of every entry whose offset lies before +pos+ is
  # subtracted. Returns +pos+ unchanged when nothing has been recorded.
  def transform_pos(pos)
    return pos if transformation_offset_differences.nil?
    # transformation_offset_differences are assumed to be sorted in reverse
    # order
    transformation_offset_differences.reverse.each do |trans_diff|
      acc = 0
      trans_diff.reverse.each do |offset, diff, orig_length, trans_length|
        break if offset >= pos
        acc += diff
      end
      pos = pos - acc
    end

    pos
  end

  # Applies #transform_pos to both ends of +range+.
  def transform_range(range)
    (transform_pos(range.begin)..transform_pos(range.end))
  end

  # Writes +value+ at +pos+ (a Range or an Integer) after translating it with
  # #transform_range / #transform_pos. Raises for any other kind of position.
  def transformed_set(pos, value)
    self[resolve_transformed_position(pos)] = value
  end

  # Reads the text at +pos+ (a Range or an Integer) after translating it with
  # #transform_range / #transform_pos. Raises for any other kind of position.
  def transformed_get(pos)
    self[resolve_transformed_position(pos)]
  end

  # True when either end of +segment_range+ falls strictly inside a region
  # recorded by the most recent round.
  #
  # The two checks are grouped explicitly with && / ||. The earlier
  # `a and b or c and d` form parsed as `((a and b) or c) and d`, because
  # `and` and `or` share one precedence level, and so it missed ranges that
  # begin inside a replaced region but end beyond it.
  def conflict?(segment_range)
    return false if @transformation_offset_differences.nil? or @transformation_offset_differences.empty?
    last_round = @transformation_offset_differences.last

    last_round.each do |info|
      offset, diff, orig_length, trans_length = info
      region_end = offset + trans_length

      begin_inside = segment_range.begin > offset && segment_range.begin < region_end
      end_inside = segment_range.end > offset && segment_range.end < region_end

      return true if begin_inside || end_inside
    end

    return false
  end

  # Performs one round of replacements: every segment (processed from last to
  # first, as ordered by Segment.clean_sort) that does not conflict with the
  # previous round is replaced by +replacement+ -- a String, or a Proc/block
  # called with the segment. While the Proc runs, the segment temporarily
  # holds the current text and offset at its position; both are put back
  # afterwards. The round is recorded so it can be undone by #restore.
  #
  # Raises when no replacement is given or it is neither String nor Proc.
  def replace(segments, replacement = nil, &block)
    replacement ||= block
    raise "No replacement given" if replacement.nil?

    # Collected locally and only recorded once the whole round is done, so
    # position translation inside the loop sees the previous rounds only.
    round_differences = []
    round_originals = []

    Segment.clean_sort(segments).reverse.each do |segment|
      untransformed_segment_range_here = segment.range_in(self)
      transformed_segment_range = self.transform_range(untransformed_segment_range_here)
      next if conflict?(transformed_segment_range)

      text_before_transform = self[transformed_segment_range]

      case
      when String === replacement
        transformed_text = replacement
      when Proc === replacement

        # Prepare segment with new text
        save_segment_text = segment.dup
        save_offset = segment.offset
        segment.replace text_before_transform
        segment.offset = transformed_segment_range.begin

        transformed_text = replacement.call segment

        # Restore segment with original text
        segment.replace save_segment_text
        segment.offset = save_offset
      else
        raise "Replacemente not String nor Proc"
      end
      diff = segment.length - transformed_text.length
      self[transformed_segment_range] = transformed_text

      round_differences << [untransformed_segment_range_here.begin, diff, text_before_transform.length, transformed_text.length]
      round_originals << text_before_transform
    end

    @transformation_offset_differences ||= []
    @transformation_offset_differences << round_differences
    @transformation_original ||= []
    @transformation_original << round_originals
  end

  # Undoes recorded rounds, most recent first: all of them, or just the last
  # one when +first_only+ is true. When +segments+ is given, each segment's
  # offset and text are updated from the restored string after every undone
  # round. Returns +segments+.
  #
  # A string on which no round was ever recorded is left untouched (this used
  # to raise NoMethodError on the nil list).
  def restore(segments = nil, first_only = false)
    stop = false
    while transformation_offset_differences and transformation_offset_differences.any? and not stop
      round_differences = self.transformation_offset_differences.pop
      round_originals = self.transformation_original.pop

      ranges = round_differences.collect do |offset,diff,orig_length,rep_length|
        (offset..(offset + rep_length - 1))
      end

      ranges.zip(round_originals).reverse.each do |range,text|
        self.transformed_set(range, text)
      end

      stop = true if first_only

      next if segments.nil?

      segments.each do |segment|
        r = segment.range

        s = r.begin
        e = r.end
        sdiff = 0
        ediff = 0
        round_differences.reverse.each do |offset,diff,orig_length,rep_length|
          sdiff += diff if offset < s
          ediff += diff if offset + rep_length - 1 < e
        end

        segment.offset = s + sdiff
        segment.replace self[(s+sdiff)..(e + ediff)]
      end
    end

    segments
  end

  private

  # Translates a Range or Integer position; shared by #transformed_set and
  # #transformed_get.
  def resolve_transformed_position(pos)
    case
    when Range === pos
      transform_range(pos)
    when Integer === pos
      transform_pos(pos)
    else
      raise "Text position not understood '#{pos.inspect}'. Not Range or Integer"
    end
  end
end
|
169
|
+
|
170
|
+
|
data/lib/rbbt/ner/banner.rb
CHANGED
@@ -7,7 +7,7 @@ require 'rbbt/ner/NER'
|
|
7
7
|
# in Java. Banner[http://banner.sourceforge.net/].
|
8
8
|
class Banner < NER
|
9
9
|
|
10
|
-
Rbbt.
|
10
|
+
Rbbt.software.opt.BANNER.define_as_install Rbbt.share.install.software.BANNER.find
|
11
11
|
|
12
12
|
@@JFile = Rjb::import('java.io.File')
|
13
13
|
@@SimpleTokenizer = Rjb::import('banner.tokenization.SimpleTokenizer')
|
@@ -22,9 +22,9 @@ class Banner < NER
|
|
22
22
|
# The parameters are set to default values, the only one that one
|
23
23
|
# might want to change is the modelfile to point to a custom trained
|
24
24
|
# one.
|
25
|
-
def initialize(modelfile =
|
26
|
-
lemmadir =
|
27
|
-
taggerdir =
|
25
|
+
def initialize(modelfile = Rbbt.software.opt.BANNER["gene_model.bin"].find,
|
26
|
+
lemmadir = Rbbt.software.opt.BANNER.nlpdata.lemmatiser.find,
|
27
|
+
taggerdir = Rbbt.software.opt.BANNER.nlpdata.tagger.find
|
28
28
|
)
|
29
29
|
|
30
30
|
@tokenizer = @@SimpleTokenizer.new
|
@@ -50,8 +50,10 @@ class Banner < NER
|
|
50
50
|
# Returns an array with the mention found in the provided piece of
|
51
51
|
# text.
|
52
52
|
def match(text)
|
53
|
+
return [] if text.nil?
|
53
54
|
text.gsub!(/\n/,' ')
|
54
55
|
text.gsub!(/\|/,'/') # Character | gives an error
|
56
|
+
return [] if text.strip.empty?
|
55
57
|
sentence = @@Sentence.new(text)
|
56
58
|
|
57
59
|
@tokenizer.tokenize(sentence)
|
@@ -65,7 +67,8 @@ class Banner < NER
|
|
65
67
|
mention = $1
|
66
68
|
mention.sub!(/^\s*/,'')
|
67
69
|
mention.sub!(/\s*$/,'')
|
68
|
-
|
70
|
+
offset = text.index(mention)
|
71
|
+
NamedEntity.annotate(mention, offset, 'GENE')
|
69
72
|
mention
|
70
73
|
}
|
71
74
|
res
|
@@ -0,0 +1,34 @@
|
|
1
|
+
require 'rbbt'
|
2
|
+
require 'rjb'
|
3
|
+
require 'rbbt/ner/annotations'
|
4
|
+
require 'rbbt/ner/NER'
|
5
|
+
require 'rbbt/util/log'
|
6
|
+
|
7
|
+
# NER front-end for the Java class RbbtChemicalTagger, reached through Rjb.
# Loading this class registers the ChemicalTagger install script, starts the
# JVM through Rjb::load and imports the Java class.
class ChemicalTagger < NER
  Rbbt.software.opt.ChemicalTagger.define_as_install Rbbt.share.install.software.ChemicalTagger.find

  Rjb::load(nil, ['-Xms128m','-Xmx2048m'])

  @@RbbtChemicalTagger = Rjb::import('RbbtChemicalTagger')

  # Returns the mentions found in +text+ as NamedEntity strings of type
  # "Chemical Mention". Each offset is the first position at which the
  # mention text occurs in +text+. Nil or blank text yields []. Errors raised
  # by the tagger are logged at debug level and also yield [].
  # +type+ and +memm+ are accepted but not used.
  def self.match(text, type = nil, memm = false)
    return [] if text.nil? || text.strip.empty?

    mentions =
      begin
        @@RbbtChemicalTagger.match(text)
      rescue
        Log.debug "ChemicalTagger Error: #{$!.message}"
        return []
      end

    mentions.collect do |mention|
      position = text.index(mention)
      NamedEntity.annotate(mention, position, "Chemical Mention", nil, nil)
    end
  end

  # Instance-level entry point; forwards all arguments to ChemicalTagger.match.
  def match(*args)
    ChemicalTagger.match(*args)
  end
end
|
@@ -0,0 +1,136 @@
|
|
1
|
+
require 'rbbt-util'
|
2
|
+
require 'rbbt/util/tsv'
|
3
|
+
require 'rbbt/ner/annotations'
|
4
|
+
require 'rbbt/ner/annotations/token'
|
5
|
+
require 'rbbt/ner/NER'
|
6
|
+
require 'inline'
|
7
|
+
|
8
|
+
# This code was adapted from Ashish Tendulkar (ASK MARTIN)
|
9
|
+
# Dictionary-based NER. Names are indexed by their (stripped) first three
# characters; the text is scanned and, at each candidate position, the names
# sharing that prefix are compared against the text with a small inline C
# helper.
class NGramPrefixDictionary < NER
  # Characters that end a word. The quote is written C-escaped (\') because
  # each entry is interpolated into a C character literal below.
  STOP_LETTERS = %w(\' " ( ) { } [ ] - ? ! < ; : > . ,)

  # The characters themselves, for use on the Ruby side. The last character
  # of each entry is taken so that the escaped quote contributes the quote
  # and not its leading backslash.
  STOP_LETTER_CHAR_VALUES = STOP_LETTERS.collect{|l| l[-1]}

  class << self
    inline do |builder|

      # 1 when +letter+ is a space or one of STOP_LETTERS, 0 otherwise.
      builder.c_raw <<-EOC
int is_stop_letter(char letter)
{

  if( letter == ' ' || #{STOP_LETTERS.collect{|l| "letter == '#{l}' "} * "||"} ){ return 1;}

  return 0;
}
      EOC

      # Qtrue when +cmp+ occurs in +str+ at +offset+ and is followed either
      # by the end of +str+ or by a stop letter. The end-of-string test is
      # offset + length_cmp == length_str (it used to subtract the offset),
      # and a bounds check keeps memcmp from reading past the end of +str+.
      builder.c <<-EOC
VALUE fast_start_with(VALUE str, VALUE cmp, int offset)
{
  int length_cmp = RSTRING_LEN(cmp);
  int length_str = RSTRING_LEN(str);

  if (offset < 0 || offset + length_cmp > length_str)
    return Qfalse;

  if (memcmp(RSTRING_PTR(str) + offset, RSTRING_PTR(cmp), length_cmp) == 0){
    if (offset + length_cmp == length_str || is_stop_letter(RSTRING_PTR(str)[offset + length_cmp]))
      return Qtrue;
    else
      return Qfalse;
  }

  return Qfalse;
}
      EOC
    end
  end

  # Builds the prefix index from +hash+ (code => list of names): a Hash from
  # the stripped first three characters of each name to the list of
  # [name, code] pairs starting with them.
  def self.process(hash)
    index = {}
    hash.each do |code, names|
      names.each do |name|
        ngram = name[0..2].strip
        index[ngram] ||= []
        index[ngram] << [name, code]
      end
    end
    index
  end

  # Scans +text+ for names in +index+ and returns a list of
  # [name, code, offset] triplets. After a match the scan continues right
  # after the matched name; otherwise it jumps past the next space. For each
  # position the first matching name in the index wins.
  # NOTE(review): offsets are character positions on the Ruby side but are
  # used as byte offsets in the C helper -- presumably the text is expected
  # to be single-byte; confirm for multi-byte input.
  def self.match(index, text)
    matches = []

    text_offset = 0
    text_length = text.length
    while (not text_offset.nil?) and text_offset < text_length
      text_offset += 1 if STOP_LETTER_CHAR_VALUES.include? text[text_offset]
      ngram = text[text_offset..text_offset + 2].strip

      found = nil
      if index.include? ngram

        diff = text_length - text_offset
        # Match with entries
        index[ngram].each do |name, code|
          # <= so that a name ending exactly at the end of the text can match
          if name.length <= diff
            if fast_start_with(text, name, text_offset)
              found = [name, code, text_offset]
              break
            end
          end
        end
      end

      if found.nil?
        text_offset = text.index(" ", text_offset)
        text_offset += 1 unless text_offset.nil?
      else
        matches << found
        text_offset += found.first.length
      end
    end

    matches
  end

  attr_accessor :index, :type

  # Loads +file+ as a flat TSV and indexes its names. +type+ is the entity
  # type given to the matches.
  def initialize(file, type = nil)
    tsv = TSV.new(file, :flat)
    @type = type
    tsv.unnamed = true
    @index = NGramPrefixDictionary.process(tsv)
  end

  # Returns the dictionary names found in +text+ as NamedEntity strings
  # carrying their offset, this dictionary's type and the matched code.
  def match(text)
    NGramPrefixDictionary.match(index, text).collect{|name, code, offset|
      NamedEntity.annotate(name, offset, type, code)
    }
  end
end
|
109
|
+
|
110
|
+
# Ad-hoc benchmark, run only when this file is executed directly: downloads
# the articles returned by a PubMed query and times dictionary matching with
# a JoChem lexicon subset over them.
if __FILE__ == $0
  require 'rbbt/sources/jochem'
  require 'rbbt/sources/pubmed'

  texts = PubMed.get_article(PubMed.query("GB-1a", 100)).collect do |pmid, article|
    article.text
  end

  # Repeat the list to reach roughly 150 texts. Skipped when the query
  # returned nothing, which would otherwise raise ZeroDivisionError.
  texts *= 150/texts.length unless texts.empty?

  tsv = Rbbt.share.databases.JoChem.lexicon.tsv :flat, :persistence => false, :grep => "GB"
  #tsv = Rbbt.share.databases.JoChem.lexicon.tsv :flat, :persistence => true

  tsv.unnamed = true
  ner = NGramPrefixDictionary.new(tsv)

  Misc.benchmark do
    texts.each do |text|
      ner.match(text)
    end
  end
end
|