rbbt-text 0.6.3 → 1.0.0
This diff compares the contents of two publicly available package versions as released to their public registry. The information it contains is provided for informational purposes only and reflects the changes between those versions as published.
- data/lib/rbbt/corpus/document.rb +1 -0
- data/lib/rbbt/entity/document.rb +62 -18
- data/lib/rbbt/ner/abner.rb +6 -3
- data/lib/rbbt/ner/banner.rb +10 -7
- data/lib/rbbt/ner/chemical_tagger.rb +5 -3
- data/lib/rbbt/ner/finder.rb +60 -0
- data/lib/rbbt/ner/linnaeus.rb +38 -0
- data/lib/rbbt/ner/ngram_prefix_dictionary.rb +42 -48
- data/lib/rbbt/ner/oscar3.rb +9 -6
- data/lib/rbbt/ner/oscar4.rb +21 -7
- data/lib/rbbt/ner/rnorm.rb +57 -33
- data/lib/rbbt/ner/rnorm/cue_index.rb +4 -3
- data/lib/rbbt/ner/rnorm/tokens.rb +10 -4
- data/lib/rbbt/ner/segment.rb +19 -8
- data/lib/rbbt/ner/segment/docid.rb +46 -0
- data/lib/rbbt/ner/segment/named_entity.rb +1 -1
- data/lib/rbbt/ner/segment/transformed.rb +5 -3
- data/lib/rbbt/nlp/genia/sentence_splitter.rb +22 -1
- data/lib/rbbt/nlp/open_nlp/sentence_splitter.rb +74 -0
- data/share/install/software/Linnaeus +21 -0
- data/share/install/software/OpenNLP +12 -0
- data/share/rnorm/tokens_default +1 -2
- data/test/rbbt/entity/test_document.rb +66 -0
- data/test/rbbt/ner/segment/test_transformed.rb +10 -0
- data/test/rbbt/ner/test_finder.rb +34 -0
- data/test/rbbt/ner/test_linnaeus.rb +16 -0
- data/test/rbbt/ner/test_ngram_prefix_dictionary.rb +22 -0
- data/test/rbbt/ner/test_oscar4.rb +3 -3
- data/test/rbbt/ner/test_rnorm.rb +3 -3
- data/test/rbbt/nlp/open_nlp/test_sentence_splitter.rb +45 -0
- data/test/test_helper.rb +1 -1
- metadata +101 -99
- data/test/rbbt/corpus/test_corpus.rb +0 -99
- data/test/rbbt/corpus/test_document.rb +0 -236
data/test/rbbt/corpus/test_corpus.rb
@@ -1,99 +0,0 @@
-require File.join(File.expand_path(File.dirname(__FILE__)), '../../test_helper.rb')
-require 'rbbt/corpus/corpus'
-require 'rbbt/corpus/sources/pubmed'
-
-class Document
-  define :sentences do
-    require 'rbbt/nlp/nlp'
-    NLP.geniass_sentence_splitter(text)
-  end
-
-  define :genes do
-    require 'rbbt/ner/abner'
-    Abner.new.entities(text)
-  end
-end
-
-
-class TestCorpus < Test::Unit::TestCase
-
-  def test_add_document
-    pmid = "19458159"
-
-    text = PubMed.get_article(pmid).text
-
-    corpus = Corpus.new(Rbbt.tmp.test.Corpus.find :user)
-
-    assert corpus.find(:pubmed, pmid).empty?
-
-    corpus.add_document(text, :pubmed, pmid, :abstract)
-
-    assert corpus.find(:pubmed, pmid).any?
-    assert corpus.find(:pubmed, pmid, :fulltext).empty?
-    assert corpus.find(:pubmed, pmid, :abstract).any?
-
-    assert corpus.find(:pubmed, pmid).first.text =~ /SENT/
-  end
-
-  def test_add_pmid
-    pmid = "19465387"
-    corpus = Corpus.new(Rbbt.tmp.test.Corpus.find :user)
-    corpus.add_pmid(pmid, :abstract)
-
-    assert corpus.exists? :pubmed, pmid
-    assert corpus.exists? :pubmed, pmid, :abstract
-    assert_equal false, corpus.exists?(:pubmed, pmid, :fulltext)
-  end
-
-  def test_find_all
-    corpus = Corpus.new(Rbbt.tmp.test.Corpus.find :user)
-    corpus.add_pmid("19458159", :abstract)
-    corpus.add_pmid("19465387", :abstract)
-
-    all = corpus.find
-
-    assert_equal 2, all.length
-    assert all.select{|document| document.id == "19458159"}.any?
-    assert all.select{|document| document.id == "19465387"}.any?
-  end
-
-  def test_doc_sentences
-    corpus = Corpus.new(Rbbt.tmp.test.Corpus.find :user)
-    corpus.add_pmid("19458159", :abstract)
-
-    sentences = corpus.find.collect{|doc| doc.sentences}.flatten
-
-    assert corpus.find.first.sentences.length > 0
-    assert corpus.find.first.sentences.sort_by{|s| s.offset}.first =~ /Semantic features in Text/i
-
-    corpus = Corpus.new(Rbbt.tmp.test.Corpus.find :user)
-    assert corpus.find.first.sentences.sort_by{|s| s.offset}.first =~ /Semantic features in Text/i
-  end
-
-  def test_doc_genes
-    corpus = Corpus.new(Rbbt.tmp.test.Corpus.find :user)
-    corpus.add_pmid("21611789", :abstract)
-
-    assert corpus.find(:pubmed, "21611789").first.genes.include? "CDKAL1"
-  end
-
-  def test_genes
-    corpus = Corpus.new(Rbbt.tmp.test.Corpus.find :user)
-    corpus.add_pmid("21611789", :abstract)
-
-    assert corpus.find.collect{|d| d.genes}.flatten.include? "CDKAL1"
-  end
-
-  def test_index
-    corpus = Corpus.new(Rbbt.tmp.test.Corpus.find :user)
-    corpus.add_pmid("21611789", :abstract)
-
-    document = corpus.find(:pubmed, "21611789").first
-
-    genes = corpus.find.collect{|d| d.genes}.flatten.select{|gene| gene == "CDKAL1"}
-
-    assert genes.collect{|gene|
-      document.sentences_at(gene.offset)
-    }.flatten.length > 1
-  end
-end
data/test/rbbt/corpus/test_document.rb
@@ -1,236 +0,0 @@
-require File.join(File.expand_path(File.dirname(__FILE__)), '../../test_helper.rb')
-require 'rbbt/corpus/document'
-require 'test/unit'
-
-module TokenEntity
-  extend Annotation
-  include Segment
-  self.annotation :original
-end
-class Document
-
-  def tokenize(text)
-    Token.tokenize(text).collect do |token|
-      TokenEntity.setup(token.dup, token.offset, token.original)
-    end
-  end
-
-  define :sentences do
-    require 'rbbt/nlp/nlp'
-    NLP.geniass_sentence_splitter(text)
-  end
-
-  define :tokens do
-    require 'rbbt/ner/segment/token'
-    tokenize(text)
-  end
-
-  define :long_words do
-    require 'rbbt/ner/segment/token'
-    tokenize(text).select{|tok| tok.length > 5}
-  end
-
-  define :short_words do
-    require 'rbbt/ner/segment/token'
-    tokenize(text).select{|tok| tok.length < 5}
-  end
-
-  define :even_words do
-    require 'rbbt/ner/segment/token'
-    tokenize(text).select{|tok| tok.length % 2 == 0}
-  end
-
-  define :missing do
-    []
-  end
-
-  define :tokens_again do
-    raise "This should be here already"
-  end
-end
-
-class TestDocument < Test::Unit::TestCase
-
-  def setup
-    global_fields = ["Start", "End", "JSON", "Document ID", "Entity Type"]
-    $persistence = TSV.setup({})
-    $tchash_persistence = Persist.open_tokyocabinet(Rbbt.tmp.test.document.persistence.find(:user), true, :tsv)
-    $global_persistence = TSV.setup({}, :key => "ID", :fields => global_fields)
-    $tchash_global_persistence = TSV.setup(Persist.open_tokyocabinet(Rbbt.tmp.test.global.persistence.find(:user), true, :list), :key => "ID", :fields => global_fields + ["Document ID", "Entity Type"])
-    $tchash_global_persistence.read
-    $tchash_global_persistence.write
-
-    Document.class_eval do
-
-      persist :sentences
-      persist_in_tsv :tokens, :literal
-      persist_in_tsv :long_words, $tchash_persistence, :literal
-      persist_in_global_tsv :short_words, $global_persistence
-      persist_in_global_tsv :even_words, $tchash_global_persistence
-      persist_in_global_tsv :missing, $tchash_global_persistence
-    end
-  end
-
-  def test_annotations
-
-    text =<<-EOF
-This is a
-sentence. This is
-another sentence.
-    EOF
-
-    doc = Document.new
-    doc.text = text
-
-    assert_equal 2, doc.sentences.length
-    assert_equal 10, doc.tokens.length
-  end
-
-  def test_annotation_load
-    text =<<-EOF
-This is a
-sentence. This is
-another sentence.
-    EOF
-
-    doc = Document.new
-    doc.text = text * 10
-
-    sentence = doc.sentences.last
-    doc.load_into sentence, :tokens
-    assert_equal 5, sentence.tokens.length
-    assert_equal "another", sentence.tokens[2]
-    assert_equal sentence.offset + 0, sentence.tokens[0].offset
-  end
-
-  def test_annotation_persistence
-    text =<<-EOF
-This is a
-sentence. This is
-another sentence.
-    EOF
-
-    text *= 10
-
-    TmpFile.with_file do |dir|
-      FileUtils.mkdir_p dir
-
-      doc = Document.new(dir)
-      doc.text = text
-      doc.sentences
-
-      doc = Document.new(dir)
-      doc.text = text
-
-      sentence = doc.sentences.sort_by{|sentence| sentence.offset}.last
-      doc.load_into sentence, :tokens
-
-      assert_equal 5, sentence.tokens.length
-      assert_equal "another", sentence.tokens[2]
-      assert_equal sentence.offset + 0, sentence.tokens[0].offset
-    end
-  end
-
-  def test_range_persistence
-    text =<<-EOF
-This is a
-sentence. This is
-another sentence.
-    EOF
-
-    text *= 10
-
-    TmpFile.with_file do |dir|
-      FileUtils.mkdir_p dir
-
-      doc = Document.new(dir)
-      doc.text = text
-
-      sentence = doc.sentences.sort_by{|sentence| sentence.offset}.last
-      Misc.benchmark(1) do
-        doc = Document.new(dir)
-        doc.text = text
-
-        doc.load_into sentence, :tokens, :persist => true
-        assert_equal 5, sentence.tokens.length
-        assert_equal "another", sentence.tokens[2]
-        assert_equal sentence.offset + 0, sentence.tokens[0].offset
-        assert_equal sentence.offset + 5, sentence.tokens[1].offset
-      end
-    end
-  end
-
-  def test_annotation_persistence_in_tsv
-    text =<<-EOF
-This is a
-sentence. This is
-another sentence.
-    EOF
-
-    TmpFile.with_file do |dir|
-      FileUtils.mkdir_p dir
-
-
-      doc = Document.new(dir)
-      doc.text = text * 10
-      doc.sentences
-
-      doc = Document.new(dir)
-      doc.text = text * 10
-
-      sentence = doc.sentences.sort_by{|sentence| sentence.offset}.last
-
-      doc.load_into sentence, :tokens, :long_words
-
-      assert_equal 5, sentence.tokens.length
-      assert_equal "another", sentence.tokens[2]
-      assert_equal sentence.offset + 0, sentence.tokens[0].offset
-
-      assert_equal 2, sentence.long_words.length
-      doc = Document.new(dir)
-      doc.text = text * 10
-      doc.sentences
-      assert_equal sentence, doc.sentences.sort_by{|sentence| sentence.offset}.last
-
-      sentence = doc.sentences.sort_by{|sentence| sentence.offset}.last
-      doc.load_into sentence, :tokens, :long_words
-
-      assert_equal 2, sentence.long_words.length
-      assert_equal %w(another sentence), sentence.long_words
-      assert_equal sentence.offset + "This is ".length, sentence.long_words[0].offset
-    end
-  end
-
-  def test_annotation_persistence_in_global
-    text =<<-EOF
-This is a
-sentence. This is
-another sentence.
-    EOF
-
-    TmpFile.with_file do |dir|
-      FileUtils.mkdir_p dir
-
-
-      global_persistence = TSV.setup({}, :fields => %w(Start End annotation_types JSON) + ["Document ID", "Entity Type"])
-      doc = Document.new(dir, nil, nil, global_persistence)
-      doc.text = text * 10
-      doc.docid = "TEST"
-
-      doc.sentences
-
-      doc = Document.new(dir)
-      doc.text = text * 10
-      doc.docid = "TEST"
-
-      sentence = doc.sentences.sort_by{|sentence| sentence.offset}.last
-
-      doc.load_into sentence, :tokens, :long_words, :short_words, :even_words
-
-      assert_equal 3, sentence.short_words.length
-      assert_equal 3, sentence.even_words.length
-    end
-  end
-end
-
-