rbbt-text 0.6.3 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,99 +0,0 @@
1
- require File.join(File.expand_path(File.dirname(__FILE__)), '../../test_helper.rb')
2
- require 'rbbt/corpus/corpus'
3
- require 'rbbt/corpus/sources/pubmed'
4
-
5
- class Document
6
- define :sentences do
7
- require 'rbbt/nlp/nlp'
8
- NLP.geniass_sentence_splitter(text)
9
- end
10
-
11
- define :genes do
12
- require 'rbbt/ner/abner'
13
- Abner.new.entities(text)
14
- end
15
- end
16
-
17
-
18
- class TestCorpus < Test::Unit::TestCase
19
-
20
- def test_add_document
21
- pmid = "19458159"
22
-
23
- text = PubMed.get_article(pmid).text
24
-
25
- corpus = Corpus.new(Rbbt.tmp.test.Corpus.find :user)
26
-
27
- assert corpus.find(:pubmed, pmid).empty?
28
-
29
- corpus.add_document(text, :pubmed, pmid, :abstract)
30
-
31
- assert corpus.find(:pubmed, pmid).any?
32
- assert corpus.find(:pubmed, pmid, :fulltext).empty?
33
- assert corpus.find(:pubmed, pmid, :abstract).any?
34
-
35
- assert corpus.find(:pubmed, pmid).first.text =~ /SENT/
36
- end
37
-
38
- def test_add_pmid
39
- pmid = "19465387"
40
- corpus = Corpus.new(Rbbt.tmp.test.Corpus.find :user)
41
- corpus.add_pmid(pmid, :abstract)
42
-
43
- assert corpus.exists? :pubmed, pmid
44
- assert corpus.exists? :pubmed, pmid, :abstract
45
- assert_equal false, corpus.exists?(:pubmed, pmid, :fulltext)
46
- end
47
-
48
- def test_find_all
49
- corpus = Corpus.new(Rbbt.tmp.test.Corpus.find :user)
50
- corpus.add_pmid("19458159", :abstract)
51
- corpus.add_pmid("19465387", :abstract)
52
-
53
- all = corpus.find
54
-
55
- assert_equal 2, all.length
56
- assert all.select{|document| document.id == "19458159"}.any?
57
- assert all.select{|document| document.id == "19465387"}.any?
58
- end
59
-
60
- def test_doc_sentences
61
- corpus = Corpus.new(Rbbt.tmp.test.Corpus.find :user)
62
- corpus.add_pmid("19458159", :abstract)
63
-
64
- sentences = corpus.find.collect{|doc| doc.sentences}.flatten
65
-
66
- assert corpus.find.first.sentences.length > 0
67
- assert corpus.find.first.sentences.sort_by{|s| s.offset}.first =~ /Semantic features in Text/i
68
-
69
- corpus = Corpus.new(Rbbt.tmp.test.Corpus.find :user)
70
- assert corpus.find.first.sentences.sort_by{|s| s.offset}.first =~ /Semantic features in Text/i
71
- end
72
-
73
- def test_doc_genes
74
- corpus = Corpus.new(Rbbt.tmp.test.Corpus.find :user)
75
- corpus.add_pmid("21611789", :abstract)
76
-
77
- assert corpus.find(:pubmed, "21611789").first.genes.include? "CDKAL1"
78
- end
79
-
80
- def test_genes
81
- corpus = Corpus.new(Rbbt.tmp.test.Corpus.find :user)
82
- corpus.add_pmid("21611789", :abstract)
83
-
84
- assert corpus.find.collect{|d| d.genes}.flatten.include? "CDKAL1"
85
- end
86
-
87
- def test_index
88
- corpus = Corpus.new(Rbbt.tmp.test.Corpus.find :user)
89
- corpus.add_pmid("21611789", :abstract)
90
-
91
- document = corpus.find(:pubmed, "21611789").first
92
-
93
- genes = corpus.find.collect{|d| d.genes}.flatten.select{|gene| gene == "CDKAL1"}
94
-
95
- assert genes.collect{|gene|
96
- document.sentences_at(gene.offset)
97
- }.flatten.length > 1
98
- end
99
- end
@@ -1,236 +0,0 @@
1
- require File.join(File.expand_path(File.dirname(__FILE__)), '../../test_helper.rb')
2
- require 'rbbt/corpus/document'
3
- require 'test/unit'
4
-
5
- module TokenEntity
6
- extend Annotation
7
- include Segment
8
- self.annotation :original
9
- end
10
- class Document
11
-
12
- def tokenize(text)
13
- Token.tokenize(text).collect do |token|
14
- TokenEntity.setup(token.dup, token.offset, token.original)
15
- end
16
- end
17
-
18
- define :sentences do
19
- require 'rbbt/nlp/nlp'
20
- NLP.geniass_sentence_splitter(text)
21
- end
22
-
23
- define :tokens do
24
- require 'rbbt/ner/segment/token'
25
- tokenize(text)
26
- end
27
-
28
- define :long_words do
29
- require 'rbbt/ner/segment/token'
30
- tokenize(text).select{|tok| tok.length > 5}
31
- end
32
-
33
- define :short_words do
34
- require 'rbbt/ner/segment/token'
35
- tokenize(text).select{|tok| tok.length < 5}
36
- end
37
-
38
- define :even_words do
39
- require 'rbbt/ner/segment/token'
40
- tokenize(text).select{|tok| tok.length % 2 == 0}
41
- end
42
-
43
- define :missing do
44
- []
45
- end
46
-
47
- define :tokens_again do
48
- raise "This should be here already"
49
- end
50
- end
51
-
52
- class TestDocument < Test::Unit::TestCase
53
-
54
- def setup
55
- global_fields = ["Start", "End", "JSON", "Document ID", "Entity Type"]
56
- $persistence = TSV.setup({})
57
- $tchash_persistence = Persist.open_tokyocabinet(Rbbt.tmp.test.document.persistence.find(:user), true, :tsv)
58
- $global_persistence = TSV.setup({}, :key => "ID", :fields => global_fields)
59
- $tchash_global_persistence = TSV.setup(Persist.open_tokyocabinet(Rbbt.tmp.test.global.persistence.find(:user), true, :list), :key => "ID", :fields => global_fields + ["Document ID", "Entity Type"])
60
- $tchash_global_persistence.read
61
- $tchash_global_persistence.write
62
-
63
- Document.class_eval do
64
-
65
- persist :sentences
66
- persist_in_tsv :tokens, :literal
67
- persist_in_tsv :long_words, $tchash_persistence, :literal
68
- persist_in_global_tsv :short_words, $global_persistence
69
- persist_in_global_tsv :even_words, $tchash_global_persistence
70
- persist_in_global_tsv :missing, $tchash_global_persistence
71
- end
72
- end
73
-
74
- def test_annotations
75
-
76
- text =<<-EOF
77
- This is a
78
- sentence. This is
79
- another sentence.
80
- EOF
81
-
82
- doc = Document.new
83
- doc.text = text
84
-
85
- assert_equal 2, doc.sentences.length
86
- assert_equal 10, doc.tokens.length
87
- end
88
-
89
- def test_annotation_load
90
- text =<<-EOF
91
- This is a
92
- sentence. This is
93
- another sentence.
94
- EOF
95
-
96
- doc = Document.new
97
- doc.text = text * 10
98
-
99
- sentence = doc.sentences.last
100
- doc.load_into sentence, :tokens
101
- assert_equal 5, sentence.tokens.length
102
- assert_equal "another", sentence.tokens[2]
103
- assert_equal sentence.offset + 0, sentence.tokens[0].offset
104
- end
105
-
106
- def test_annotation_persistence
107
- text =<<-EOF
108
- This is a
109
- sentence. This is
110
- another sentence.
111
- EOF
112
-
113
- text *= 10
114
-
115
- TmpFile.with_file do |dir|
116
- FileUtils.mkdir_p dir
117
-
118
- doc = Document.new(dir)
119
- doc.text = text
120
- doc.sentences
121
-
122
- doc = Document.new(dir)
123
- doc.text = text
124
-
125
- sentence = doc.sentences.sort_by{|sentence| sentence.offset}.last
126
- doc.load_into sentence, :tokens
127
-
128
- assert_equal 5, sentence.tokens.length
129
- assert_equal "another", sentence.tokens[2]
130
- assert_equal sentence.offset + 0, sentence.tokens[0].offset
131
- end
132
- end
133
-
134
- def test_range_persistence
135
- text =<<-EOF
136
- This is a
137
- sentence. This is
138
- another sentence.
139
- EOF
140
-
141
- text *= 10
142
-
143
- TmpFile.with_file do |dir|
144
- FileUtils.mkdir_p dir
145
-
146
- doc = Document.new(dir)
147
- doc.text = text
148
-
149
- sentence = doc.sentences.sort_by{|sentence| sentence.offset}.last
150
- Misc.benchmark(1) do
151
- doc = Document.new(dir)
152
- doc.text = text
153
-
154
- doc.load_into sentence, :tokens, :persist => true
155
- assert_equal 5, sentence.tokens.length
156
- assert_equal "another", sentence.tokens[2]
157
- assert_equal sentence.offset + 0, sentence.tokens[0].offset
158
- assert_equal sentence.offset + 5, sentence.tokens[1].offset
159
- end
160
- end
161
- end
162
-
163
- def test_annotation_persistence_in_tsv
164
- text =<<-EOF
165
- This is a
166
- sentence. This is
167
- another sentence.
168
- EOF
169
-
170
- TmpFile.with_file do |dir|
171
- FileUtils.mkdir_p dir
172
-
173
-
174
- doc = Document.new(dir)
175
- doc.text = text * 10
176
- doc.sentences
177
-
178
- doc = Document.new(dir)
179
- doc.text = text * 10
180
-
181
- sentence = doc.sentences.sort_by{|sentence| sentence.offset}.last
182
-
183
- doc.load_into sentence, :tokens, :long_words
184
-
185
- assert_equal 5, sentence.tokens.length
186
- assert_equal "another", sentence.tokens[2]
187
- assert_equal sentence.offset + 0, sentence.tokens[0].offset
188
-
189
- assert_equal 2, sentence.long_words.length
190
- doc = Document.new(dir)
191
- doc.text = text * 10
192
- doc.sentences
193
- assert_equal sentence, doc.sentences.sort_by{|sentence| sentence.offset}.last
194
-
195
- sentence = doc.sentences.sort_by{|sentence| sentence.offset}.last
196
- doc.load_into sentence, :tokens, :long_words
197
-
198
- assert_equal 2, sentence.long_words.length
199
- assert_equal %w(another sentence), sentence.long_words
200
- assert_equal sentence.offset + "This is ".length, sentence.long_words[0].offset
201
- end
202
- end
203
-
204
- def test_annotation_persistence_in_global
205
- text =<<-EOF
206
- This is a
207
- sentence. This is
208
- another sentence.
209
- EOF
210
-
211
- TmpFile.with_file do |dir|
212
- FileUtils.mkdir_p dir
213
-
214
-
215
- global_persistence = TSV.setup({}, :fields => %w(Start End annotation_types JSON) + ["Document ID", "Entity Type"])
216
- doc = Document.new(dir, nil, nil, global_persistence)
217
- doc.text = text * 10
218
- doc.docid = "TEST"
219
-
220
- doc.sentences
221
-
222
- doc = Document.new(dir)
223
- doc.text = text * 10
224
- doc.docid = "TEST"
225
-
226
- sentence = doc.sentences.sort_by{|sentence| sentence.offset}.last
227
-
228
- doc.load_into sentence, :tokens, :long_words, :short_words, :even_words
229
-
230
- assert_equal 3, sentence.short_words.length
231
- assert_equal 3, sentence.even_words.length
232
- end
233
- end
234
- end
235
-
236
-