rbbt-text 1.2.0 → 1.3.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/rbbt/bow/bow.rb +5 -2
- data/lib/rbbt/bow/dictionary.rb +27 -23
- data/lib/rbbt/document.rb +55 -0
- data/lib/rbbt/document/annotation.rb +45 -0
- data/lib/rbbt/document/corpus.rb +63 -0
- data/lib/rbbt/document/corpus/pubmed.rb +33 -0
- data/lib/rbbt/ner/NER.rb +3 -3
- data/lib/rbbt/ner/abner.rb +1 -1
- data/lib/rbbt/ner/banner.rb +1 -1
- data/lib/rbbt/ner/brat.rb +1 -1
- data/lib/rbbt/ner/chemical_tagger.rb +1 -2
- data/lib/rbbt/ner/g_norm_plus.rb +26 -3
- data/lib/rbbt/ner/linnaeus.rb +3 -3
- data/lib/rbbt/ner/ngram_prefix_dictionary.rb +3 -3
- data/lib/rbbt/ner/oscar3.rb +1 -2
- data/lib/rbbt/ner/oscar4.rb +3 -3
- data/lib/rbbt/ner/patterns.rb +5 -5
- data/lib/rbbt/ner/regexpNER.rb +1 -2
- data/lib/rbbt/ner/token_trieNER.rb +35 -22
- data/lib/rbbt/nlp/genia/sentence_splitter.rb +3 -2
- data/lib/rbbt/nlp/nlp.rb +5 -5
- data/lib/rbbt/nlp/open_nlp/sentence_splitter.rb +37 -36
- data/lib/rbbt/nlp/spaCy.rb +52 -0
- data/lib/rbbt/segment.rb +179 -0
- data/lib/rbbt/segment/annotation.rb +58 -0
- data/lib/rbbt/segment/encoding.rb +18 -0
- data/lib/rbbt/{text/segment → segment}/named_entity.rb +14 -11
- data/lib/rbbt/segment/overlaps.rb +63 -0
- data/lib/rbbt/segment/range_index.rb +35 -0
- data/lib/rbbt/segment/relationship.rb +7 -0
- data/lib/rbbt/{text/segment → segment}/segmented.rb +1 -1
- data/lib/rbbt/segment/token.rb +23 -0
- data/lib/rbbt/{text/segment → segment}/transformed.rb +12 -10
- data/lib/rbbt/segment/tsv.rb +41 -0
- data/share/install/software/Linnaeus +1 -1
- data/share/install/software/OpenNLP +1 -1
- data/test/rbbt/document/corpus/test_pubmed.rb +15 -0
- data/test/rbbt/document/test_annotation.rb +140 -0
- data/test/rbbt/document/test_corpus.rb +33 -0
- data/test/rbbt/ner/test_finder.rb +3 -3
- data/test/rbbt/ner/test_g_norm_plus.rb +20 -2
- data/test/rbbt/ner/test_patterns.rb +9 -9
- data/test/rbbt/ner/test_regexpNER.rb +14 -14
- data/test/rbbt/ner/test_rnorm.rb +3 -4
- data/test/rbbt/ner/test_token_trieNER.rb +1 -0
- data/test/rbbt/nlp/genia/test_sentence_splitter.rb +37 -3
- data/test/rbbt/nlp/open_nlp/test_sentence_splitter.rb +20 -4
- data/test/rbbt/segment/test_annotation.rb +39 -0
- data/test/rbbt/segment/test_corpus.rb +36 -0
- data/test/rbbt/segment/test_encoding.rb +24 -0
- data/test/rbbt/{text/segment → segment}/test_named_entity.rb +15 -11
- data/test/rbbt/segment/test_overlaps.rb +69 -0
- data/test/rbbt/segment/test_range_index.rb +42 -0
- data/test/rbbt/{text/segment → segment}/test_transformed.rb +105 -51
- data/test/rbbt/test_document.rb +14 -0
- data/test/rbbt/test_segment.rb +182 -0
- data/test/test_helper.rb +5 -3
- data/test/test_spaCy.rb +32 -0
- metadata +44 -32
- data/lib/rbbt/text/corpus.rb +0 -106
- data/lib/rbbt/text/corpus/document.rb +0 -383
- data/lib/rbbt/text/corpus/document_repo.rb +0 -68
- data/lib/rbbt/text/corpus/sources/pmid.rb +0 -34
- data/lib/rbbt/text/document.rb +0 -39
- data/lib/rbbt/text/segment.rb +0 -363
- data/lib/rbbt/text/segment/docid.rb +0 -46
- data/lib/rbbt/text/segment/relationship.rb +0 -24
- data/lib/rbbt/text/segment/token.rb +0 -49
- data/test/rbbt/text/corpus/sources/test_pmid.rb +0 -33
- data/test/rbbt/text/corpus/test_document.rb +0 -82
- data/test/rbbt/text/segment/test_relationship.rb +0 -0
- data/test/rbbt/text/segment/test_segmented.rb +0 -23
- data/test/rbbt/text/test_corpus.rb +0 -34
- data/test/rbbt/text/test_document.rb +0 -58
- data/test/rbbt/text/test_segment.rb +0 -100
@@ -0,0 +1,69 @@
|
|
1
|
+
require File.join(File.expand_path(File.dirname(__FILE__)), '../..', 'test_helper.rb')
|
2
|
+
require 'rbbt/segment'
|
3
|
+
require 'rbbt/segment/overlaps'
|
4
|
+
|
5
|
+
class TestOverlaps < Test::Unit::TestCase
|
6
|
+
def setup
|
7
|
+
@text = <<-EOF
|
8
|
+
This is a first sentence. More recently, PPAR activators were shown to inhibit the activation of inflammatory response genes (such as IL-2, IL-6, IL-8, TNF alpha and metalloproteases) by negatively interfering with the NF-kappa B, STAT and AP-1 signalling pathways in cells of the vascular wall.
|
9
|
+
EOF
|
10
|
+
|
11
|
+
@entities = ["PPAR", "IL-2", "IL-6", "IL-8", "TNF", "TNF alpha", "NF-kappa B", "AP-1", "STAT"].collect do |literal|
|
12
|
+
Segment.setup(literal, :offset => @text.index(literal))
|
13
|
+
end
|
14
|
+
|
15
|
+
@sentences = @text.partition(".").values_at(0, 2).collect do |sentence|
|
16
|
+
Segment.setup sentence, :offset => @text.index(sentence)
|
17
|
+
end
|
18
|
+
end
|
19
|
+
|
20
|
+
def test_make_relative
|
21
|
+
sentence = @sentences[1]
|
22
|
+
|
23
|
+
@entities.each do |e|
|
24
|
+
assert_equal e, @text[e.range]
|
25
|
+
end
|
26
|
+
|
27
|
+
sentence.make_relative @entities do
|
28
|
+
@entities.each do |e|
|
29
|
+
assert_equal e, sentence[e.range]
|
30
|
+
end
|
31
|
+
|
32
|
+
@entities.each do |e|
|
33
|
+
assert_not_equal e, @text[e.range]
|
34
|
+
end
|
35
|
+
end
|
36
|
+
|
37
|
+
@entities.each do |e|
|
38
|
+
assert_equal e, @text[e.range]
|
39
|
+
end
|
40
|
+
end
|
41
|
+
|
42
|
+
def test_range_in
|
43
|
+
sentence = @sentences[1]
|
44
|
+
|
45
|
+
@entities.each do |e|
|
46
|
+
assert_equal e.range_in(sentence).begin, sentence.index(e)
|
47
|
+
assert_equal e.range.begin - sentence.offset, sentence.index(e)
|
48
|
+
end
|
49
|
+
end
|
50
|
+
|
51
|
+
def test_includes
|
52
|
+
@entities.each do |e|
|
53
|
+
assert ! @sentences[0].include?(e)
|
54
|
+
assert @sentences[1].include?(e)
|
55
|
+
assert ! e.include?(@sentences[0])
|
56
|
+
assert ! e.include?(@sentences[1])
|
57
|
+
end
|
58
|
+
end
|
59
|
+
|
60
|
+
def test_overlaps?
|
61
|
+
@entities.each do |e|
|
62
|
+
assert ! @sentences[0].overlaps?(e)
|
63
|
+
assert @sentences[1].overlaps?(e)
|
64
|
+
assert ! e.overlaps?(@sentences[0])
|
65
|
+
assert e.overlaps?(@sentences[1])
|
66
|
+
end
|
67
|
+
end
|
68
|
+
|
69
|
+
end
|
@@ -0,0 +1,42 @@
|
|
1
|
+
require File.join(File.expand_path(File.dirname(__FILE__)), '../..', 'test_helper.rb')
|
2
|
+
require 'rbbt/document'
|
3
|
+
require 'rbbt/document/corpus'
|
4
|
+
require 'rbbt/segment'
|
5
|
+
require 'rbbt/segment/range_index'
|
6
|
+
|
7
|
+
class TestRangeIndex < Test::Unit::TestCase
|
8
|
+
def test_segment_index
|
9
|
+
text = "This sentence mentions the TP53 gene and the CDK5R1 protein"
|
10
|
+
Document.setup(text, "TEST", "test_doc1", nil)
|
11
|
+
|
12
|
+
corpus = Document::Corpus.setup({})
|
13
|
+
|
14
|
+
corpus.add_document(text)
|
15
|
+
|
16
|
+
gene1 = "TP53"
|
17
|
+
gene1.extend Segment
|
18
|
+
gene1.offset = text.index gene1
|
19
|
+
gene1.docid = text.docid
|
20
|
+
|
21
|
+
gene2 = "CDK5R1"
|
22
|
+
gene2.extend Segment
|
23
|
+
gene2.offset = text.index gene2
|
24
|
+
gene2.docid = text.docid
|
25
|
+
|
26
|
+
gene3 = "TP53 gene"
|
27
|
+
gene3.extend Segment
|
28
|
+
gene3.offset = text.index gene1
|
29
|
+
gene3.docid = text.docid
|
30
|
+
|
31
|
+
index = Segment::RangeIndex.index([gene1, gene2, gene3], corpus)
|
32
|
+
assert_equal "CDK5R1", index[gene2.offset + 1].segment.first
|
33
|
+
|
34
|
+
TmpFile.with_file do |fwt|
|
35
|
+
index = Segment::RangeIndex.index([gene1, gene2, gene3], corpus, fwt)
|
36
|
+
assert_equal %w(CDK5R1), index[gene2.offset + 1].segment
|
37
|
+
index = Segment::RangeIndex.index([gene1, gene2, gene3], corpus, fwt)
|
38
|
+
assert_equal %w(CDK5R1), index[gene2.offset + 1].segment
|
39
|
+
end
|
40
|
+
end
|
41
|
+
end
|
42
|
+
|
@@ -1,10 +1,21 @@
|
|
1
|
-
require File.join(File.expand_path(File.dirname(__FILE__)), '
|
2
|
-
require 'rbbt/
|
3
|
-
require 'rbbt/
|
1
|
+
require File.join(File.expand_path(File.dirname(__FILE__)), '../..', 'test_helper.rb')
|
2
|
+
require 'rbbt/segment/transformed'
|
3
|
+
require 'rbbt/segment/named_entity'
|
4
4
|
require 'rexml/document'
|
5
5
|
|
6
|
-
class
|
7
|
-
|
6
|
+
class TestTransformed < Test::Unit::TestCase
|
7
|
+
|
8
|
+
def setup
|
9
|
+
@text = <<-EOF
|
10
|
+
More recently, PPAR activators were shown to inhibit the activation of inflammatory response genes (such as IL-2, IL-6, IL-8, TNF alpha and metalloproteases) by negatively interfering with the NF-kappa B, STAT and AP-1 signalling pathways in cells of the vascular wall.
|
11
|
+
EOF
|
12
|
+
|
13
|
+
@entities = ["PPAR", "IL-2", "IL-6", "IL-8", "TNF alpha", "NF-kappa B", "AP-1", "STAT"].collect do |literal|
|
14
|
+
NamedEntity.setup(literal, :offset => @text.index(literal))
|
15
|
+
end
|
16
|
+
end
|
17
|
+
|
18
|
+
def test_transform
|
8
19
|
text = <<-EOF
|
9
20
|
More recently, PPAR activators were shown to inhibit the activation of inflammatory response genes (such as IL-2, IL-6, IL-8, TNF alpha and metalloproteases) by negatively interfering with the NF-kappa B, STAT and AP-1 signalling pathways in cells of the vascular wall.
|
10
21
|
EOF
|
@@ -13,52 +24,25 @@ More recently, PPAR activators were shown to inhibit the activation of inflammat
|
|
13
24
|
NamedEntity.setup(literal, :offset => text.index(literal))
|
14
25
|
end
|
15
26
|
|
16
|
-
Transformed.
|
17
|
-
|
18
|
-
end
|
27
|
+
Transformed.transform(text, entities, Proc.new{|e| "[" + e.upcase + "]" })
|
28
|
+
assert text.include? "such as [IL-2]"
|
19
29
|
end
|
20
30
|
|
21
|
-
def
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
gene1 = "TP53"
|
26
|
-
gene1.extend Segment
|
27
|
-
gene1.offset = a.index gene1
|
28
|
-
|
29
|
-
gene2 = "CDK5"
|
30
|
-
gene2.extend Segment
|
31
|
-
gene2.offset = a.index gene2
|
32
|
-
|
33
|
-
assert_equal gene1, a[gene1.range]
|
34
|
-
assert_equal gene2, a[gene2.range]
|
35
|
-
|
36
|
-
c = a.dup
|
37
|
-
|
38
|
-
c[gene2.range] = "GN"
|
39
|
-
assert_equal c, Transformed.transform(a,[gene2], "GN")
|
40
|
-
c[gene1.range] = "GN"
|
41
|
-
assert_equal c, Transformed.transform(a,[gene1], "GN")
|
42
|
-
|
43
|
-
iii a.transformation_offset_differences
|
44
|
-
raise
|
45
|
-
assert_equal gene2.offset, a.transformation_offset_differences.first.first.first
|
46
|
-
assert_equal gene1.offset, a.transformation_offset_differences.last.first.first
|
47
|
-
|
48
|
-
|
49
|
-
gene3 = "GN gene"
|
50
|
-
gene3.extend Segment
|
51
|
-
gene3.offset = a.index gene3
|
52
|
-
|
53
|
-
assert_equal gene3, a[gene3.range]
|
31
|
+
def test_with_transform
|
32
|
+
text = <<-EOF
|
33
|
+
More recently, PPAR activators were shown to inhibit the activation of inflammatory response genes (such as IL-2, IL-6, IL-8, TNF alpha and metalloproteases) by negatively interfering with the NF-kappa B, STAT and AP-1 signalling pathways in cells of the vascular wall.
|
34
|
+
EOF
|
54
35
|
|
55
|
-
|
56
|
-
|
57
|
-
|
36
|
+
entities = ["PPAR", "IL-2", "IL-6", "IL-8", "TNF alpha", "NF-kappa B", "AP-1", "STAT"].reverse.collect do |literal|
|
37
|
+
NamedEntity.setup(literal, :offset => text.index(literal))
|
38
|
+
end
|
58
39
|
|
40
|
+
Transformed.with_transform(text, entities, Proc.new{|e| "[" + e.upcase + "]" }) do
|
41
|
+
assert text.include? "such as [IL-2]"
|
42
|
+
end
|
59
43
|
end
|
60
44
|
|
61
|
-
def
|
45
|
+
def test_with_transform_2
|
62
46
|
a = "This sentence mentions the TP53 gene and the CDK5R1 protein"
|
63
47
|
original = a.dup
|
64
48
|
|
@@ -117,18 +101,47 @@ More recently, PPAR activators were shown to inhibit the activation of inflammat
|
|
117
101
|
assert_equal "CDK5R1 protein", exp2
|
118
102
|
end
|
119
103
|
|
104
|
+
def test_with_transform_sentences
|
105
|
+
a = "This first sentence mentions Bread. This sentence mentions the TP53 gene and the CDK5R1 protein"
|
106
|
+
original = a.dup
|
107
|
+
|
108
|
+
gene1 = "TP53"
|
109
|
+
gene1.extend NamedEntity
|
110
|
+
gene1.offset = a.index gene1
|
111
|
+
|
112
|
+
gene2 = "CDK5R1"
|
113
|
+
gene2.extend NamedEntity
|
114
|
+
gene2.offset = a.index gene2
|
115
|
+
|
116
|
+
bread = "Bread"
|
117
|
+
bread.extend NamedEntity
|
118
|
+
bread.offset = a.index bread
|
119
|
+
|
120
|
+
sentences = Segment.align(a, a.split(". "))
|
121
|
+
|
122
|
+
Transformed.with_transform(sentences[1], [gene1, gene2, bread], "GN") do
|
123
|
+
assert sentences[1].include?("GN gene and the GN protein")
|
124
|
+
end
|
125
|
+
|
126
|
+
Transformed.with_transform(sentences[0], [gene1, gene2, bread], "BR") do
|
127
|
+
assert sentences[0].include?("first sentence mentions BR")
|
128
|
+
end
|
129
|
+
|
130
|
+
|
131
|
+
end
|
132
|
+
|
120
133
|
def test_html
|
121
134
|
a = "This sentence mentions the TP53 gene and the CDK5R1 protein"
|
122
135
|
|
123
136
|
gene1 = "TP53"
|
124
137
|
gene1.extend NamedEntity
|
125
138
|
gene1.offset = a.index gene1
|
126
|
-
gene1.
|
139
|
+
gene1.entity_type = "Gene"
|
127
140
|
|
128
141
|
gene2 = "CDK5R1"
|
129
142
|
gene2.extend NamedEntity
|
130
143
|
gene2.offset = a.index gene2
|
131
|
-
gene2.
|
144
|
+
gene2.entity_type = "Protein"
|
132
145
|
|
133
146
|
Transformed.with_transform(a, [gene1,gene2], Proc.new{|e| e.html}) do
|
134
147
|
assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene'>TP53</span> gene and the <span class='Entity' attr-entity-type='Protein'>CDK5R1</span> protein", a
|
@@ -143,13 +156,13 @@ More recently, PPAR activators were shown to inhibit the activation of inflammat
|
|
143
156
|
gene1.extend NamedEntity
|
144
157
|
gene1.offset = a.index gene1
|
145
158
|
gene1.offset += 10
|
146
|
-
gene1.
|
159
|
+
gene1.entity_type = "Gene"
|
147
160
|
|
148
161
|
gene2 = "CDK5R1"
|
149
162
|
gene2.extend NamedEntity
|
150
163
|
gene2.offset = a.index gene2
|
151
164
|
gene2.offset += 10
|
152
|
-
gene2.
|
165
|
+
gene2.entity_type = "Protein"
|
153
166
|
|
154
167
|
Transformed.with_transform(a, [gene1,gene2], Proc.new{|e| e.html}) do
|
155
168
|
assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene'>TP53</span> gene and the <span class='Entity' attr-entity-type='Protein'>CDK5R1</span> protein", a
|
@@ -162,12 +175,12 @@ More recently, PPAR activators were shown to inhibit the activation of inflammat
|
|
162
175
|
gene1 = "TP53"
|
163
176
|
gene1.extend NamedEntity
|
164
177
|
gene1.offset = a.index gene1
|
165
|
-
gene1.
|
178
|
+
gene1.entity_type = "Gene"
|
166
179
|
|
167
180
|
gene2 = "TP53 gene"
|
168
181
|
gene2.extend NamedEntity
|
169
182
|
gene2.offset = a.index gene2
|
170
|
-
gene2.
|
183
|
+
gene2.entity_type = "Expanded Gene"
|
171
184
|
|
172
185
|
assert_equal [gene1], Segment.overlaps(Segment.sort([gene1,gene2]))
|
173
186
|
|
@@ -379,5 +392,46 @@ This is another sentence. Among the nonstructural proteins, the leader protein (
|
|
379
392
|
end
|
380
393
|
end
|
381
394
|
end
|
395
|
+
|
396
|
+
def ___test_transform
|
397
|
+
a = "This sentence mentions the TP53 gene and the CDK5 protein"
|
398
|
+
original = a.dup
|
399
|
+
|
400
|
+
gene1 = "TP53"
|
401
|
+
gene1.extend Segment
|
402
|
+
gene1.offset = a.index gene1
|
403
|
+
|
404
|
+
gene2 = "CDK5"
|
405
|
+
gene2.extend Segment
|
406
|
+
gene2.offset = a.index gene2
|
407
|
+
|
408
|
+
assert_equal gene1, a[gene1.range]
|
409
|
+
assert_equal gene2, a[gene2.range]
|
410
|
+
|
411
|
+
c = a.dup
|
412
|
+
|
413
|
+
c[gene2.range] = "GN"
|
414
|
+
assert_equal c, Transformed.transform(a,[gene2], "GN")
|
415
|
+
c[gene1.range] = "GN"
|
416
|
+
assert_equal c, Transformed.transform(a,[gene1], "GN")
|
417
|
+
|
418
|
+
iii a.transformation_offset_differences
|
419
|
+
raise
|
420
|
+
assert_equal gene2.offset, a.transformation_offset_differences.first.first.first
|
421
|
+
assert_equal gene1.offset, a.transformation_offset_differences.last.first.first
|
422
|
+
|
423
|
+
|
424
|
+
gene3 = "GN gene"
|
425
|
+
gene3.extend Segment
|
426
|
+
gene3.offset = a.index gene3
|
427
|
+
|
428
|
+
assert_equal gene3, a[gene3.range]
|
429
|
+
|
430
|
+
a.restore([gene3])
|
431
|
+
assert_equal original, a
|
432
|
+
assert_equal "TP53 gene", a[gene3.range]
|
433
|
+
|
434
|
+
end
|
435
|
+
|
382
436
|
end
|
383
437
|
|
@@ -0,0 +1,14 @@
|
|
1
|
+
require File.join(File.expand_path(File.dirname(__FILE__)), '..', 'test_helper.rb')
|
2
|
+
require 'rbbt/document'
|
3
|
+
|
4
|
+
class TestDocument < Test::Unit::TestCase
|
5
|
+
|
6
|
+
def test_docid
|
7
|
+
text = "This is a document"
|
8
|
+
Document.setup(text, "TEST", "test_doc1", nil)
|
9
|
+
|
10
|
+
assert_equal ["TEST", "test_doc1", nil, Misc.digest(text)] * ":", text.docid
|
11
|
+
end
|
12
|
+
|
13
|
+
end
|
14
|
+
|
@@ -0,0 +1,182 @@
|
|
1
|
+
require File.join(File.expand_path(File.dirname(__FILE__)), '..', 'test_helper.rb')
|
2
|
+
require 'rbbt/document'
|
3
|
+
require 'rbbt/document/corpus'
|
4
|
+
require 'rbbt/segment'
|
5
|
+
|
6
|
+
class TestSegment < Test::Unit::TestCase
|
7
|
+
def test_segment
|
8
|
+
text = "This is a document"
|
9
|
+
Document.setup(text, "TEST", "test_doc1", nil)
|
10
|
+
|
11
|
+
segment = Segment.setup("is", :offset => text.index("is"), :docid => text.docid)
|
12
|
+
|
13
|
+
assert_equal text.docid + ":" + segment.offset.to_s + ".." + segment.eend.to_s, segment.segid
|
14
|
+
end
|
15
|
+
|
16
|
+
def test_segid
|
17
|
+
text = "This is a document"
|
18
|
+
Document.setup(text, "TEST", "test_doc1", nil)
|
19
|
+
|
20
|
+
corpus = Document::Corpus.setup({})
|
21
|
+
|
22
|
+
corpus.add_document(text)
|
23
|
+
|
24
|
+
segment = Segment.setup("is", :offset => text.index("is"), :docid => text.docid)
|
25
|
+
|
26
|
+
segid = segment.segid(corpus)
|
27
|
+
|
28
|
+
segment = segid.segment
|
29
|
+
assert_equal "is", segment
|
30
|
+
end
|
31
|
+
|
32
|
+
def test_info
|
33
|
+
segment = "test"
|
34
|
+
segment.extend Segment
|
35
|
+
segment.offset = 10
|
36
|
+
assert segment.info.include? :offset
|
37
|
+
end
|
38
|
+
|
39
|
+
def test_sort
|
40
|
+
text = "This sentence mentions the TP53 gene and the CDK5R1 protein"
|
41
|
+
Document.setup(text, "TEST", "test_doc1", nil)
|
42
|
+
|
43
|
+
corpus = Document::Corpus.setup({})
|
44
|
+
|
45
|
+
corpus.add_document(text)
|
46
|
+
|
47
|
+
gene1 = "TP53"
|
48
|
+
gene1.extend Segment
|
49
|
+
gene1.offset = text.index gene1
|
50
|
+
gene1.docid = text.docid
|
51
|
+
|
52
|
+
gene2 = "CDK5R1"
|
53
|
+
gene2.extend Segment
|
54
|
+
gene2.offset = text.index gene2
|
55
|
+
gene2.docid = text.docid
|
56
|
+
|
57
|
+
assert_equal [gene1,gene2], Segment.sort([gene2,gene1])
|
58
|
+
|
59
|
+
assert_equal [gene1,gene2], Segment.sort([gene2.segid(corpus),gene1.segid(corpus)]).collect{|segid| segid.segment}
|
60
|
+
end
|
61
|
+
|
62
|
+
def test_clean_sort
|
63
|
+
text = "This sentence mentions the TP53 gene and the CDK5R1 protein"
|
64
|
+
Document.setup(text, "TEST", "test_doc1", nil)
|
65
|
+
|
66
|
+
corpus = Document::Corpus.setup({})
|
67
|
+
|
68
|
+
corpus.add_document(text)
|
69
|
+
|
70
|
+
gene1 = "TP53"
|
71
|
+
gene1.extend Segment
|
72
|
+
gene1.offset = text.index gene1
|
73
|
+
gene1.docid = text.docid
|
74
|
+
|
75
|
+
gene2 = "CDK5R1"
|
76
|
+
gene2.extend Segment
|
77
|
+
gene2.offset = text.index gene2
|
78
|
+
gene2.docid = text.docid
|
79
|
+
|
80
|
+
gene3 = "TP53 gene"
|
81
|
+
gene3.extend Segment
|
82
|
+
gene3.offset = text.index gene1
|
83
|
+
gene3.docid = text.docid
|
84
|
+
|
85
|
+
assert_equal [gene1,gene2], Segment.sort([gene2,gene1])
|
86
|
+
|
87
|
+
assert_equal [gene3,gene2], Segment.clean_sort([gene2,gene1,gene3])
|
88
|
+
end
|
89
|
+
|
90
|
+
def test_split
|
91
|
+
text = "This sentence mentions the TP53 gene and the CDK5R1 protein"
|
92
|
+
Document.setup(text, "TEST", "test_doc1", nil)
|
93
|
+
|
94
|
+
corpus = Document::Corpus.setup({})
|
95
|
+
|
96
|
+
corpus.add_document(text)
|
97
|
+
|
98
|
+
gene1 = "TP53"
|
99
|
+
gene1.extend Segment
|
100
|
+
gene1.offset = text.index gene1
|
101
|
+
gene1.docid = text.docid
|
102
|
+
|
103
|
+
gene2 = "CDK5R1"
|
104
|
+
gene2.extend Segment
|
105
|
+
gene2.offset = text.index gene2
|
106
|
+
gene2.docid = text.docid
|
107
|
+
|
108
|
+
gene3 = "TP53 gene"
|
109
|
+
gene3.extend Segment
|
110
|
+
gene3.offset = text.index gene1
|
111
|
+
gene3.docid = text.docid
|
112
|
+
|
113
|
+
assert_equal ["This sentence mentions the ", gene3, " and the ", gene2, " protein"], Segment.split(text, [gene2, gene1, gene3])
|
114
|
+
|
115
|
+
assert_equal ["This sentence mentions the ", gene3, " and the ", gene2, " protein"], Segment.split(text, [gene2, gene1, gene3].collect{|s| s.segid})
|
116
|
+
end
|
117
|
+
|
118
|
+
|
119
|
+
def test_align
|
120
|
+
text =<<-EOF
|
121
|
+
Atypical teratoid/rhabdoid tumors (AT/RTs) are highly aggressive brain tumors of early childhood poorly responding to therapy.
|
122
|
+
EOF
|
123
|
+
|
124
|
+
parts = text.split(/\W/)
|
125
|
+
Segment.align(text, parts)
|
126
|
+
|
127
|
+
assert_equal "Atypical teratoid/".length, parts.select{|s| s == "rhabdoid"}.first.offset
|
128
|
+
|
129
|
+
Document.setup(text, "TEST", "test_doc1", nil)
|
130
|
+
|
131
|
+
parts = text.split(/\W/)
|
132
|
+
Segment.align(text, parts)
|
133
|
+
|
134
|
+
assert_equal parts.first.docid, text.docid
|
135
|
+
end
|
136
|
+
|
137
|
+
def test_segment_index
|
138
|
+
text = "This sentence mentions the TP53 gene and the CDK5R1 protein"
|
139
|
+
Document.setup(text, "TEST", "test_doc1", nil)
|
140
|
+
|
141
|
+
corpus = Document::Corpus.setup({})
|
142
|
+
|
143
|
+
corpus.add_document(text)
|
144
|
+
|
145
|
+
gene1 = "TP53"
|
146
|
+
gene1.extend Segment
|
147
|
+
gene1.offset = text.index gene1
|
148
|
+
gene1.docid = text.docid
|
149
|
+
|
150
|
+
gene2 = "CDK5R1"
|
151
|
+
gene2.extend Segment
|
152
|
+
gene2.offset = text.index gene2
|
153
|
+
gene2.docid = text.docid
|
154
|
+
|
155
|
+
gene3 = "TP53 gene"
|
156
|
+
gene3.extend Segment
|
157
|
+
gene3.offset = text.index gene1
|
158
|
+
gene3.docid = text.docid
|
159
|
+
|
160
|
+
index = Segment.index([gene1, gene2, gene3], corpus)
|
161
|
+
assert_equal "CDK5R1", index[gene2.offset + 1].segment.first
|
162
|
+
|
163
|
+
TmpFile.with_file do |fwt|
|
164
|
+
index = Segment.index([gene1, gene2, gene3], corpus, fwt)
|
165
|
+
assert_equal %w(CDK5R1), index[gene2.offset + 1].segment
|
166
|
+
index = Segment.index([gene1, gene2, gene3], corpus, fwt)
|
167
|
+
assert_equal %w(CDK5R1), index[gene2.offset + 1].segment
|
168
|
+
end
|
169
|
+
|
170
|
+
index = Segment.index([gene1, gene2, gene3].collect{|s| s.segid}, corpus)
|
171
|
+
assert_equal "CDK5R1", index[gene2.offset + 1].segment.first
|
172
|
+
|
173
|
+
TmpFile.with_file do |fwt|
|
174
|
+
index = Segment.index([gene1, gene2, gene3].collect{|s| s.segid}, corpus, fwt)
|
175
|
+
assert_equal %w(CDK5R1), index[gene2.offset + 1].segment
|
176
|
+
index = Segment.index([gene1, gene2, gene3].collect{|s| s.segid}, corpus, fwt)
|
177
|
+
assert_equal %w(CDK5R1), index[gene2.offset + 1].segment
|
178
|
+
end
|
179
|
+
end
|
180
|
+
|
181
|
+
end
|
182
|
+
|