rbbt-text 0.6.3 → 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,99 +0,0 @@
1
- require File.join(File.expand_path(File.dirname(__FILE__)), '../../test_helper.rb')
2
- require 'rbbt/corpus/corpus'
3
- require 'rbbt/corpus/sources/pubmed'
4
-
5
- class Document
6
- define :sentences do
7
- require 'rbbt/nlp/nlp'
8
- NLP.geniass_sentence_splitter(text)
9
- end
10
-
11
- define :genes do
12
- require 'rbbt/ner/abner'
13
- Abner.new.entities(text)
14
- end
15
- end
16
-
17
-
18
- class TestCorpus < Test::Unit::TestCase
19
-
20
- def test_add_document
21
- pmid = "19458159"
22
-
23
- text = PubMed.get_article(pmid).text
24
-
25
- corpus = Corpus.new(Rbbt.tmp.test.Corpus.find :user)
26
-
27
- assert corpus.find(:pubmed, pmid).empty?
28
-
29
- corpus.add_document(text, :pubmed, pmid, :abstract)
30
-
31
- assert corpus.find(:pubmed, pmid).any?
32
- assert corpus.find(:pubmed, pmid, :fulltext).empty?
33
- assert corpus.find(:pubmed, pmid, :abstract).any?
34
-
35
- assert corpus.find(:pubmed, pmid).first.text =~ /SENT/
36
- end
37
-
38
- def test_add_pmid
39
- pmid = "19465387"
40
- corpus = Corpus.new(Rbbt.tmp.test.Corpus.find :user)
41
- corpus.add_pmid(pmid, :abstract)
42
-
43
- assert corpus.exists? :pubmed, pmid
44
- assert corpus.exists? :pubmed, pmid, :abstract
45
- assert_equal false, corpus.exists?(:pubmed, pmid, :fulltext)
46
- end
47
-
48
- def test_find_all
49
- corpus = Corpus.new(Rbbt.tmp.test.Corpus.find :user)
50
- corpus.add_pmid("19458159", :abstract)
51
- corpus.add_pmid("19465387", :abstract)
52
-
53
- all = corpus.find
54
-
55
- assert_equal 2, all.length
56
- assert all.select{|document| document.id == "19458159"}.any?
57
- assert all.select{|document| document.id == "19465387"}.any?
58
- end
59
-
60
- def test_doc_sentences
61
- corpus = Corpus.new(Rbbt.tmp.test.Corpus.find :user)
62
- corpus.add_pmid("19458159", :abstract)
63
-
64
- sentences = corpus.find.collect{|doc| doc.sentences}.flatten
65
-
66
- assert corpus.find.first.sentences.length > 0
67
- assert corpus.find.first.sentences.sort_by{|s| s.offset}.first =~ /Semantic features in Text/i
68
-
69
- corpus = Corpus.new(Rbbt.tmp.test.Corpus.find :user)
70
- assert corpus.find.first.sentences.sort_by{|s| s.offset}.first =~ /Semantic features in Text/i
71
- end
72
-
73
- def test_doc_genes
74
- corpus = Corpus.new(Rbbt.tmp.test.Corpus.find :user)
75
- corpus.add_pmid("21611789", :abstract)
76
-
77
- assert corpus.find(:pubmed, "21611789").first.genes.include? "CDKAL1"
78
- end
79
-
80
- def test_genes
81
- corpus = Corpus.new(Rbbt.tmp.test.Corpus.find :user)
82
- corpus.add_pmid("21611789", :abstract)
83
-
84
- assert corpus.find.collect{|d| d.genes}.flatten.include? "CDKAL1"
85
- end
86
-
87
- def test_index
88
- corpus = Corpus.new(Rbbt.tmp.test.Corpus.find :user)
89
- corpus.add_pmid("21611789", :abstract)
90
-
91
- document = corpus.find(:pubmed, "21611789").first
92
-
93
- genes = corpus.find.collect{|d| d.genes}.flatten.select{|gene| gene == "CDKAL1"}
94
-
95
- assert genes.collect{|gene|
96
- document.sentences_at(gene.offset)
97
- }.flatten.length > 1
98
- end
99
- end
@@ -1,236 +0,0 @@
1
- require File.join(File.expand_path(File.dirname(__FILE__)), '../../test_helper.rb')
2
- require 'rbbt/corpus/document'
3
- require 'test/unit'
4
-
5
- module TokenEntity
6
- extend Annotation
7
- include Segment
8
- self.annotation :original
9
- end
10
- class Document
11
-
12
- def tokenize(text)
13
- Token.tokenize(text).collect do |token|
14
- TokenEntity.setup(token.dup, token.offset, token.original)
15
- end
16
- end
17
-
18
- define :sentences do
19
- require 'rbbt/nlp/nlp'
20
- NLP.geniass_sentence_splitter(text)
21
- end
22
-
23
- define :tokens do
24
- require 'rbbt/ner/segment/token'
25
- tokenize(text)
26
- end
27
-
28
- define :long_words do
29
- require 'rbbt/ner/segment/token'
30
- tokenize(text).select{|tok| tok.length > 5}
31
- end
32
-
33
- define :short_words do
34
- require 'rbbt/ner/segment/token'
35
- tokenize(text).select{|tok| tok.length < 5}
36
- end
37
-
38
- define :even_words do
39
- require 'rbbt/ner/segment/token'
40
- tokenize(text).select{|tok| tok.length % 2 == 0}
41
- end
42
-
43
- define :missing do
44
- []
45
- end
46
-
47
- define :tokens_again do
48
- raise "This should be here already"
49
- end
50
- end
51
-
52
- class TestDocument < Test::Unit::TestCase
53
-
54
- def setup
55
- global_fields = ["Start", "End", "JSON", "Document ID", "Entity Type"]
56
- $persistence = TSV.setup({})
57
- $tchash_persistence = Persist.open_tokyocabinet(Rbbt.tmp.test.document.persistence.find(:user), true, :tsv)
58
- $global_persistence = TSV.setup({}, :key => "ID", :fields => global_fields)
59
- $tchash_global_persistence = TSV.setup(Persist.open_tokyocabinet(Rbbt.tmp.test.global.persistence.find(:user), true, :list), :key => "ID", :fields => global_fields + ["Document ID", "Entity Type"])
60
- $tchash_global_persistence.read
61
- $tchash_global_persistence.write
62
-
63
- Document.class_eval do
64
-
65
- persist :sentences
66
- persist_in_tsv :tokens, :literal
67
- persist_in_tsv :long_words, $tchash_persistence, :literal
68
- persist_in_global_tsv :short_words, $global_persistence
69
- persist_in_global_tsv :even_words, $tchash_global_persistence
70
- persist_in_global_tsv :missing, $tchash_global_persistence
71
- end
72
- end
73
-
74
- def test_annotations
75
-
76
- text =<<-EOF
77
- This is a
78
- sentence. This is
79
- another sentence.
80
- EOF
81
-
82
- doc = Document.new
83
- doc.text = text
84
-
85
- assert_equal 2, doc.sentences.length
86
- assert_equal 10, doc.tokens.length
87
- end
88
-
89
- def test_annotation_load
90
- text =<<-EOF
91
- This is a
92
- sentence. This is
93
- another sentence.
94
- EOF
95
-
96
- doc = Document.new
97
- doc.text = text * 10
98
-
99
- sentence = doc.sentences.last
100
- doc.load_into sentence, :tokens
101
- assert_equal 5, sentence.tokens.length
102
- assert_equal "another", sentence.tokens[2]
103
- assert_equal sentence.offset + 0, sentence.tokens[0].offset
104
- end
105
-
106
- def test_annotation_persistence
107
- text =<<-EOF
108
- This is a
109
- sentence. This is
110
- another sentence.
111
- EOF
112
-
113
- text *= 10
114
-
115
- TmpFile.with_file do |dir|
116
- FileUtils.mkdir_p dir
117
-
118
- doc = Document.new(dir)
119
- doc.text = text
120
- doc.sentences
121
-
122
- doc = Document.new(dir)
123
- doc.text = text
124
-
125
- sentence = doc.sentences.sort_by{|sentence| sentence.offset}.last
126
- doc.load_into sentence, :tokens
127
-
128
- assert_equal 5, sentence.tokens.length
129
- assert_equal "another", sentence.tokens[2]
130
- assert_equal sentence.offset + 0, sentence.tokens[0].offset
131
- end
132
- end
133
-
134
- def test_range_persistence
135
- text =<<-EOF
136
- This is a
137
- sentence. This is
138
- another sentence.
139
- EOF
140
-
141
- text *= 10
142
-
143
- TmpFile.with_file do |dir|
144
- FileUtils.mkdir_p dir
145
-
146
- doc = Document.new(dir)
147
- doc.text = text
148
-
149
- sentence = doc.sentences.sort_by{|sentence| sentence.offset}.last
150
- Misc.benchmark(1) do
151
- doc = Document.new(dir)
152
- doc.text = text
153
-
154
- doc.load_into sentence, :tokens, :persist => true
155
- assert_equal 5, sentence.tokens.length
156
- assert_equal "another", sentence.tokens[2]
157
- assert_equal sentence.offset + 0, sentence.tokens[0].offset
158
- assert_equal sentence.offset + 5, sentence.tokens[1].offset
159
- end
160
- end
161
- end
162
-
163
- def test_annotation_persistence_in_tsv
164
- text =<<-EOF
165
- This is a
166
- sentence. This is
167
- another sentence.
168
- EOF
169
-
170
- TmpFile.with_file do |dir|
171
- FileUtils.mkdir_p dir
172
-
173
-
174
- doc = Document.new(dir)
175
- doc.text = text * 10
176
- doc.sentences
177
-
178
- doc = Document.new(dir)
179
- doc.text = text * 10
180
-
181
- sentence = doc.sentences.sort_by{|sentence| sentence.offset}.last
182
-
183
- doc.load_into sentence, :tokens, :long_words
184
-
185
- assert_equal 5, sentence.tokens.length
186
- assert_equal "another", sentence.tokens[2]
187
- assert_equal sentence.offset + 0, sentence.tokens[0].offset
188
-
189
- assert_equal 2, sentence.long_words.length
190
- doc = Document.new(dir)
191
- doc.text = text * 10
192
- doc.sentences
193
- assert_equal sentence, doc.sentences.sort_by{|sentence| sentence.offset}.last
194
-
195
- sentence = doc.sentences.sort_by{|sentence| sentence.offset}.last
196
- doc.load_into sentence, :tokens, :long_words
197
-
198
- assert_equal 2, sentence.long_words.length
199
- assert_equal %w(another sentence), sentence.long_words
200
- assert_equal sentence.offset + "This is ".length, sentence.long_words[0].offset
201
- end
202
- end
203
-
204
- def test_annotation_persistence_in_global
205
- text =<<-EOF
206
- This is a
207
- sentence. This is
208
- another sentence.
209
- EOF
210
-
211
- TmpFile.with_file do |dir|
212
- FileUtils.mkdir_p dir
213
-
214
-
215
- global_persistence = TSV.setup({}, :fields => %w(Start End annotation_types JSON) + ["Document ID", "Entity Type"])
216
- doc = Document.new(dir, nil, nil, global_persistence)
217
- doc.text = text * 10
218
- doc.docid = "TEST"
219
-
220
- doc.sentences
221
-
222
- doc = Document.new(dir)
223
- doc.text = text * 10
224
- doc.docid = "TEST"
225
-
226
- sentence = doc.sentences.sort_by{|sentence| sentence.offset}.last
227
-
228
- doc.load_into sentence, :tokens, :long_words, :short_words, :even_words
229
-
230
- assert_equal 3, sentence.short_words.length
231
- assert_equal 3, sentence.even_words.length
232
- end
233
- end
234
- end
235
-
236
-