rbbt-text 0.6.3 → 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/rbbt/corpus/document.rb +1 -0
- data/lib/rbbt/entity/document.rb +62 -18
- data/lib/rbbt/ner/abner.rb +6 -3
- data/lib/rbbt/ner/banner.rb +10 -7
- data/lib/rbbt/ner/chemical_tagger.rb +5 -3
- data/lib/rbbt/ner/finder.rb +60 -0
- data/lib/rbbt/ner/linnaeus.rb +38 -0
- data/lib/rbbt/ner/ngram_prefix_dictionary.rb +42 -48
- data/lib/rbbt/ner/oscar3.rb +9 -6
- data/lib/rbbt/ner/oscar4.rb +21 -7
- data/lib/rbbt/ner/rnorm.rb +57 -33
- data/lib/rbbt/ner/rnorm/cue_index.rb +4 -3
- data/lib/rbbt/ner/rnorm/tokens.rb +10 -4
- data/lib/rbbt/ner/segment.rb +19 -8
- data/lib/rbbt/ner/segment/docid.rb +46 -0
- data/lib/rbbt/ner/segment/named_entity.rb +1 -1
- data/lib/rbbt/ner/segment/transformed.rb +5 -3
- data/lib/rbbt/nlp/genia/sentence_splitter.rb +22 -1
- data/lib/rbbt/nlp/open_nlp/sentence_splitter.rb +74 -0
- data/share/install/software/Linnaeus +21 -0
- data/share/install/software/OpenNLP +12 -0
- data/share/rnorm/tokens_default +1 -2
- data/test/rbbt/entity/test_document.rb +66 -0
- data/test/rbbt/ner/segment/test_transformed.rb +10 -0
- data/test/rbbt/ner/test_finder.rb +34 -0
- data/test/rbbt/ner/test_linnaeus.rb +16 -0
- data/test/rbbt/ner/test_ngram_prefix_dictionary.rb +22 -0
- data/test/rbbt/ner/test_oscar4.rb +3 -3
- data/test/rbbt/ner/test_rnorm.rb +3 -3
- data/test/rbbt/nlp/open_nlp/test_sentence_splitter.rb +45 -0
- data/test/test_helper.rb +1 -1
- metadata +101 -99
- data/test/rbbt/corpus/test_corpus.rb +0 -99
- data/test/rbbt/corpus/test_document.rb +0 -236
@@ -1,99 +0,0 @@
|
|
1
|
-
require File.join(File.expand_path(File.dirname(__FILE__)), '../../test_helper.rb')
|
2
|
-
require 'rbbt/corpus/corpus'
|
3
|
-
require 'rbbt/corpus/sources/pubmed'
|
4
|
-
|
5
|
-
class Document
|
6
|
-
define :sentences do
|
7
|
-
require 'rbbt/nlp/nlp'
|
8
|
-
NLP.geniass_sentence_splitter(text)
|
9
|
-
end
|
10
|
-
|
11
|
-
define :genes do
|
12
|
-
require 'rbbt/ner/abner'
|
13
|
-
Abner.new.entities(text)
|
14
|
-
end
|
15
|
-
end
|
16
|
-
|
17
|
-
|
18
|
-
class TestCorpus < Test::Unit::TestCase
|
19
|
-
|
20
|
-
def test_add_document
|
21
|
-
pmid = "19458159"
|
22
|
-
|
23
|
-
text = PubMed.get_article(pmid).text
|
24
|
-
|
25
|
-
corpus = Corpus.new(Rbbt.tmp.test.Corpus.find :user)
|
26
|
-
|
27
|
-
assert corpus.find(:pubmed, pmid).empty?
|
28
|
-
|
29
|
-
corpus.add_document(text, :pubmed, pmid, :abstract)
|
30
|
-
|
31
|
-
assert corpus.find(:pubmed, pmid).any?
|
32
|
-
assert corpus.find(:pubmed, pmid, :fulltext).empty?
|
33
|
-
assert corpus.find(:pubmed, pmid, :abstract).any?
|
34
|
-
|
35
|
-
assert corpus.find(:pubmed, pmid).first.text =~ /SENT/
|
36
|
-
end
|
37
|
-
|
38
|
-
def test_add_pmid
|
39
|
-
pmid = "19465387"
|
40
|
-
corpus = Corpus.new(Rbbt.tmp.test.Corpus.find :user)
|
41
|
-
corpus.add_pmid(pmid, :abstract)
|
42
|
-
|
43
|
-
assert corpus.exists? :pubmed, pmid
|
44
|
-
assert corpus.exists? :pubmed, pmid, :abstract
|
45
|
-
assert_equal false, corpus.exists?(:pubmed, pmid, :fulltext)
|
46
|
-
end
|
47
|
-
|
48
|
-
def test_find_all
|
49
|
-
corpus = Corpus.new(Rbbt.tmp.test.Corpus.find :user)
|
50
|
-
corpus.add_pmid("19458159", :abstract)
|
51
|
-
corpus.add_pmid("19465387", :abstract)
|
52
|
-
|
53
|
-
all = corpus.find
|
54
|
-
|
55
|
-
assert_equal 2, all.length
|
56
|
-
assert all.select{|document| document.id == "19458159"}.any?
|
57
|
-
assert all.select{|document| document.id == "19465387"}.any?
|
58
|
-
end
|
59
|
-
|
60
|
-
def test_doc_sentences
|
61
|
-
corpus = Corpus.new(Rbbt.tmp.test.Corpus.find :user)
|
62
|
-
corpus.add_pmid("19458159", :abstract)
|
63
|
-
|
64
|
-
sentences = corpus.find.collect{|doc| doc.sentences}.flatten
|
65
|
-
|
66
|
-
assert corpus.find.first.sentences.length > 0
|
67
|
-
assert corpus.find.first.sentences.sort_by{|s| s.offset}.first =~ /Semantic features in Text/i
|
68
|
-
|
69
|
-
corpus = Corpus.new(Rbbt.tmp.test.Corpus.find :user)
|
70
|
-
assert corpus.find.first.sentences.sort_by{|s| s.offset}.first =~ /Semantic features in Text/i
|
71
|
-
end
|
72
|
-
|
73
|
-
def test_doc_genes
|
74
|
-
corpus = Corpus.new(Rbbt.tmp.test.Corpus.find :user)
|
75
|
-
corpus.add_pmid("21611789", :abstract)
|
76
|
-
|
77
|
-
assert corpus.find(:pubmed, "21611789").first.genes.include? "CDKAL1"
|
78
|
-
end
|
79
|
-
|
80
|
-
def test_genes
|
81
|
-
corpus = Corpus.new(Rbbt.tmp.test.Corpus.find :user)
|
82
|
-
corpus.add_pmid("21611789", :abstract)
|
83
|
-
|
84
|
-
assert corpus.find.collect{|d| d.genes}.flatten.include? "CDKAL1"
|
85
|
-
end
|
86
|
-
|
87
|
-
def test_index
|
88
|
-
corpus = Corpus.new(Rbbt.tmp.test.Corpus.find :user)
|
89
|
-
corpus.add_pmid("21611789", :abstract)
|
90
|
-
|
91
|
-
document = corpus.find(:pubmed, "21611789").first
|
92
|
-
|
93
|
-
genes = corpus.find.collect{|d| d.genes}.flatten.select{|gene| gene == "CDKAL1"}
|
94
|
-
|
95
|
-
assert genes.collect{|gene|
|
96
|
-
document.sentences_at(gene.offset)
|
97
|
-
}.flatten.length > 1
|
98
|
-
end
|
99
|
-
end
|
@@ -1,236 +0,0 @@
|
|
1
|
-
require File.join(File.expand_path(File.dirname(__FILE__)), '../../test_helper.rb')
|
2
|
-
require 'rbbt/corpus/document'
|
3
|
-
require 'test/unit'
|
4
|
-
|
5
|
-
module TokenEntity
|
6
|
-
extend Annotation
|
7
|
-
include Segment
|
8
|
-
self.annotation :original
|
9
|
-
end
|
10
|
-
class Document
|
11
|
-
|
12
|
-
def tokenize(text)
|
13
|
-
Token.tokenize(text).collect do |token|
|
14
|
-
TokenEntity.setup(token.dup, token.offset, token.original)
|
15
|
-
end
|
16
|
-
end
|
17
|
-
|
18
|
-
define :sentences do
|
19
|
-
require 'rbbt/nlp/nlp'
|
20
|
-
NLP.geniass_sentence_splitter(text)
|
21
|
-
end
|
22
|
-
|
23
|
-
define :tokens do
|
24
|
-
require 'rbbt/ner/segment/token'
|
25
|
-
tokenize(text)
|
26
|
-
end
|
27
|
-
|
28
|
-
define :long_words do
|
29
|
-
require 'rbbt/ner/segment/token'
|
30
|
-
tokenize(text).select{|tok| tok.length > 5}
|
31
|
-
end
|
32
|
-
|
33
|
-
define :short_words do
|
34
|
-
require 'rbbt/ner/segment/token'
|
35
|
-
tokenize(text).select{|tok| tok.length < 5}
|
36
|
-
end
|
37
|
-
|
38
|
-
define :even_words do
|
39
|
-
require 'rbbt/ner/segment/token'
|
40
|
-
tokenize(text).select{|tok| tok.length % 2 == 0}
|
41
|
-
end
|
42
|
-
|
43
|
-
define :missing do
|
44
|
-
[]
|
45
|
-
end
|
46
|
-
|
47
|
-
define :tokens_again do
|
48
|
-
raise "This should be here already"
|
49
|
-
end
|
50
|
-
end
|
51
|
-
|
52
|
-
class TestDocument < Test::Unit::TestCase
|
53
|
-
|
54
|
-
def setup
|
55
|
-
global_fields = ["Start", "End", "JSON", "Document ID", "Entity Type"]
|
56
|
-
$persistence = TSV.setup({})
|
57
|
-
$tchash_persistence = Persist.open_tokyocabinet(Rbbt.tmp.test.document.persistence.find(:user), true, :tsv)
|
58
|
-
$global_persistence = TSV.setup({}, :key => "ID", :fields => global_fields)
|
59
|
-
$tchash_global_persistence = TSV.setup(Persist.open_tokyocabinet(Rbbt.tmp.test.global.persistence.find(:user), true, :list), :key => "ID", :fields => global_fields + ["Document ID", "Entity Type"])
|
60
|
-
$tchash_global_persistence.read
|
61
|
-
$tchash_global_persistence.write
|
62
|
-
|
63
|
-
Document.class_eval do
|
64
|
-
|
65
|
-
persist :sentences
|
66
|
-
persist_in_tsv :tokens, :literal
|
67
|
-
persist_in_tsv :long_words, $tchash_persistence, :literal
|
68
|
-
persist_in_global_tsv :short_words, $global_persistence
|
69
|
-
persist_in_global_tsv :even_words, $tchash_global_persistence
|
70
|
-
persist_in_global_tsv :missing, $tchash_global_persistence
|
71
|
-
end
|
72
|
-
end
|
73
|
-
|
74
|
-
def test_annotations
|
75
|
-
|
76
|
-
text =<<-EOF
|
77
|
-
This is a
|
78
|
-
sentence. This is
|
79
|
-
another sentence.
|
80
|
-
EOF
|
81
|
-
|
82
|
-
doc = Document.new
|
83
|
-
doc.text = text
|
84
|
-
|
85
|
-
assert_equal 2, doc.sentences.length
|
86
|
-
assert_equal 10, doc.tokens.length
|
87
|
-
end
|
88
|
-
|
89
|
-
def test_annotation_load
|
90
|
-
text =<<-EOF
|
91
|
-
This is a
|
92
|
-
sentence. This is
|
93
|
-
another sentence.
|
94
|
-
EOF
|
95
|
-
|
96
|
-
doc = Document.new
|
97
|
-
doc.text = text * 10
|
98
|
-
|
99
|
-
sentence = doc.sentences.last
|
100
|
-
doc.load_into sentence, :tokens
|
101
|
-
assert_equal 5, sentence.tokens.length
|
102
|
-
assert_equal "another", sentence.tokens[2]
|
103
|
-
assert_equal sentence.offset + 0, sentence.tokens[0].offset
|
104
|
-
end
|
105
|
-
|
106
|
-
def test_annotation_persistence
|
107
|
-
text =<<-EOF
|
108
|
-
This is a
|
109
|
-
sentence. This is
|
110
|
-
another sentence.
|
111
|
-
EOF
|
112
|
-
|
113
|
-
text *= 10
|
114
|
-
|
115
|
-
TmpFile.with_file do |dir|
|
116
|
-
FileUtils.mkdir_p dir
|
117
|
-
|
118
|
-
doc = Document.new(dir)
|
119
|
-
doc.text = text
|
120
|
-
doc.sentences
|
121
|
-
|
122
|
-
doc = Document.new(dir)
|
123
|
-
doc.text = text
|
124
|
-
|
125
|
-
sentence = doc.sentences.sort_by{|sentence| sentence.offset}.last
|
126
|
-
doc.load_into sentence, :tokens
|
127
|
-
|
128
|
-
assert_equal 5, sentence.tokens.length
|
129
|
-
assert_equal "another", sentence.tokens[2]
|
130
|
-
assert_equal sentence.offset + 0, sentence.tokens[0].offset
|
131
|
-
end
|
132
|
-
end
|
133
|
-
|
134
|
-
def test_range_persistence
|
135
|
-
text =<<-EOF
|
136
|
-
This is a
|
137
|
-
sentence. This is
|
138
|
-
another sentence.
|
139
|
-
EOF
|
140
|
-
|
141
|
-
text *= 10
|
142
|
-
|
143
|
-
TmpFile.with_file do |dir|
|
144
|
-
FileUtils.mkdir_p dir
|
145
|
-
|
146
|
-
doc = Document.new(dir)
|
147
|
-
doc.text = text
|
148
|
-
|
149
|
-
sentence = doc.sentences.sort_by{|sentence| sentence.offset}.last
|
150
|
-
Misc.benchmark(1) do
|
151
|
-
doc = Document.new(dir)
|
152
|
-
doc.text = text
|
153
|
-
|
154
|
-
doc.load_into sentence, :tokens, :persist => true
|
155
|
-
assert_equal 5, sentence.tokens.length
|
156
|
-
assert_equal "another", sentence.tokens[2]
|
157
|
-
assert_equal sentence.offset + 0, sentence.tokens[0].offset
|
158
|
-
assert_equal sentence.offset + 5, sentence.tokens[1].offset
|
159
|
-
end
|
160
|
-
end
|
161
|
-
end
|
162
|
-
|
163
|
-
def test_annotation_persistence_in_tsv
|
164
|
-
text =<<-EOF
|
165
|
-
This is a
|
166
|
-
sentence. This is
|
167
|
-
another sentence.
|
168
|
-
EOF
|
169
|
-
|
170
|
-
TmpFile.with_file do |dir|
|
171
|
-
FileUtils.mkdir_p dir
|
172
|
-
|
173
|
-
|
174
|
-
doc = Document.new(dir)
|
175
|
-
doc.text = text * 10
|
176
|
-
doc.sentences
|
177
|
-
|
178
|
-
doc = Document.new(dir)
|
179
|
-
doc.text = text * 10
|
180
|
-
|
181
|
-
sentence = doc.sentences.sort_by{|sentence| sentence.offset}.last
|
182
|
-
|
183
|
-
doc.load_into sentence, :tokens, :long_words
|
184
|
-
|
185
|
-
assert_equal 5, sentence.tokens.length
|
186
|
-
assert_equal "another", sentence.tokens[2]
|
187
|
-
assert_equal sentence.offset + 0, sentence.tokens[0].offset
|
188
|
-
|
189
|
-
assert_equal 2, sentence.long_words.length
|
190
|
-
doc = Document.new(dir)
|
191
|
-
doc.text = text * 10
|
192
|
-
doc.sentences
|
193
|
-
assert_equal sentence, doc.sentences.sort_by{|sentence| sentence.offset}.last
|
194
|
-
|
195
|
-
sentence = doc.sentences.sort_by{|sentence| sentence.offset}.last
|
196
|
-
doc.load_into sentence, :tokens, :long_words
|
197
|
-
|
198
|
-
assert_equal 2, sentence.long_words.length
|
199
|
-
assert_equal %w(another sentence), sentence.long_words
|
200
|
-
assert_equal sentence.offset + "This is ".length, sentence.long_words[0].offset
|
201
|
-
end
|
202
|
-
end
|
203
|
-
|
204
|
-
def test_annotation_persistence_in_global
|
205
|
-
text =<<-EOF
|
206
|
-
This is a
|
207
|
-
sentence. This is
|
208
|
-
another sentence.
|
209
|
-
EOF
|
210
|
-
|
211
|
-
TmpFile.with_file do |dir|
|
212
|
-
FileUtils.mkdir_p dir
|
213
|
-
|
214
|
-
|
215
|
-
global_persistence = TSV.setup({}, :fields => %w(Start End annotation_types JSON) + ["Document ID", "Entity Type"])
|
216
|
-
doc = Document.new(dir, nil, nil, global_persistence)
|
217
|
-
doc.text = text * 10
|
218
|
-
doc.docid = "TEST"
|
219
|
-
|
220
|
-
doc.sentences
|
221
|
-
|
222
|
-
doc = Document.new(dir)
|
223
|
-
doc.text = text * 10
|
224
|
-
doc.docid = "TEST"
|
225
|
-
|
226
|
-
sentence = doc.sentences.sort_by{|sentence| sentence.offset}.last
|
227
|
-
|
228
|
-
doc.load_into sentence, :tokens, :long_words, :short_words, :even_words
|
229
|
-
|
230
|
-
assert_equal 3, sentence.short_words.length
|
231
|
-
assert_equal 3, sentence.even_words.length
|
232
|
-
end
|
233
|
-
end
|
234
|
-
end
|
235
|
-
|
236
|
-
|