rbbt-text 1.2.0 → 1.3.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/rbbt/bow/bow.rb +5 -2
- data/lib/rbbt/bow/dictionary.rb +27 -23
- data/lib/rbbt/document.rb +55 -0
- data/lib/rbbt/document/annotation.rb +45 -0
- data/lib/rbbt/document/corpus.rb +63 -0
- data/lib/rbbt/document/corpus/pubmed.rb +33 -0
- data/lib/rbbt/ner/NER.rb +3 -3
- data/lib/rbbt/ner/abner.rb +1 -1
- data/lib/rbbt/ner/banner.rb +1 -1
- data/lib/rbbt/ner/brat.rb +1 -1
- data/lib/rbbt/ner/chemical_tagger.rb +1 -2
- data/lib/rbbt/ner/g_norm_plus.rb +26 -3
- data/lib/rbbt/ner/linnaeus.rb +3 -3
- data/lib/rbbt/ner/ngram_prefix_dictionary.rb +3 -3
- data/lib/rbbt/ner/oscar3.rb +1 -2
- data/lib/rbbt/ner/oscar4.rb +3 -3
- data/lib/rbbt/ner/patterns.rb +5 -5
- data/lib/rbbt/ner/regexpNER.rb +1 -2
- data/lib/rbbt/ner/token_trieNER.rb +35 -22
- data/lib/rbbt/nlp/genia/sentence_splitter.rb +3 -2
- data/lib/rbbt/nlp/nlp.rb +5 -5
- data/lib/rbbt/nlp/open_nlp/sentence_splitter.rb +37 -36
- data/lib/rbbt/nlp/spaCy.rb +52 -0
- data/lib/rbbt/segment.rb +179 -0
- data/lib/rbbt/segment/annotation.rb +58 -0
- data/lib/rbbt/segment/encoding.rb +18 -0
- data/lib/rbbt/{text/segment → segment}/named_entity.rb +14 -11
- data/lib/rbbt/segment/overlaps.rb +63 -0
- data/lib/rbbt/segment/range_index.rb +35 -0
- data/lib/rbbt/segment/relationship.rb +7 -0
- data/lib/rbbt/{text/segment → segment}/segmented.rb +1 -1
- data/lib/rbbt/segment/token.rb +23 -0
- data/lib/rbbt/{text/segment → segment}/transformed.rb +12 -10
- data/lib/rbbt/segment/tsv.rb +41 -0
- data/share/install/software/Linnaeus +1 -1
- data/share/install/software/OpenNLP +1 -1
- data/test/rbbt/document/corpus/test_pubmed.rb +15 -0
- data/test/rbbt/document/test_annotation.rb +140 -0
- data/test/rbbt/document/test_corpus.rb +33 -0
- data/test/rbbt/ner/test_finder.rb +3 -3
- data/test/rbbt/ner/test_g_norm_plus.rb +20 -2
- data/test/rbbt/ner/test_patterns.rb +9 -9
- data/test/rbbt/ner/test_regexpNER.rb +14 -14
- data/test/rbbt/ner/test_rnorm.rb +3 -4
- data/test/rbbt/ner/test_token_trieNER.rb +1 -0
- data/test/rbbt/nlp/genia/test_sentence_splitter.rb +37 -3
- data/test/rbbt/nlp/open_nlp/test_sentence_splitter.rb +20 -4
- data/test/rbbt/segment/test_annotation.rb +39 -0
- data/test/rbbt/segment/test_corpus.rb +36 -0
- data/test/rbbt/segment/test_encoding.rb +24 -0
- data/test/rbbt/{text/segment → segment}/test_named_entity.rb +15 -11
- data/test/rbbt/segment/test_overlaps.rb +69 -0
- data/test/rbbt/segment/test_range_index.rb +42 -0
- data/test/rbbt/{text/segment → segment}/test_transformed.rb +105 -51
- data/test/rbbt/test_document.rb +14 -0
- data/test/rbbt/test_segment.rb +182 -0
- data/test/test_helper.rb +5 -3
- data/test/test_spaCy.rb +32 -0
- metadata +44 -32
- data/lib/rbbt/text/corpus.rb +0 -106
- data/lib/rbbt/text/corpus/document.rb +0 -383
- data/lib/rbbt/text/corpus/document_repo.rb +0 -68
- data/lib/rbbt/text/corpus/sources/pmid.rb +0 -34
- data/lib/rbbt/text/document.rb +0 -39
- data/lib/rbbt/text/segment.rb +0 -363
- data/lib/rbbt/text/segment/docid.rb +0 -46
- data/lib/rbbt/text/segment/relationship.rb +0 -24
- data/lib/rbbt/text/segment/token.rb +0 -49
- data/test/rbbt/text/corpus/sources/test_pmid.rb +0 -33
- data/test/rbbt/text/corpus/test_document.rb +0 -82
- data/test/rbbt/text/segment/test_relationship.rb +0 -0
- data/test/rbbt/text/segment/test_segmented.rb +0 -23
- data/test/rbbt/text/test_corpus.rb +0 -34
- data/test/rbbt/text/test_document.rb +0 -58
- data/test/rbbt/text/test_segment.rb +0 -100
@@ -0,0 +1,69 @@
|
|
1
|
+
require File.join(File.expand_path(File.dirname(__FILE__)), '../..', 'test_helper.rb')
|
2
|
+
require 'rbbt/segment'
|
3
|
+
require 'rbbt/segment/overlaps'
|
4
|
+
|
5
|
+
class TestOverlaps < Test::Unit::TestCase
|
6
|
+
def setup
|
7
|
+
@text = <<-EOF
|
8
|
+
This is a first sentence. More recently, PPAR activators were shown to inhibit the activation of inflammatory response genes (such as IL-2, IL-6, IL-8, TNF alpha and metalloproteases) by negatively interfering with the NF-kappa B, STAT and AP-1 signalling pathways in cells of the vascular wall.
|
9
|
+
EOF
|
10
|
+
|
11
|
+
@entities = ["PPAR", "IL-2", "IL-6", "IL-8", "TNF", "TNF alpha", "NF-kappa B", "AP-1", "STAT"].collect do |literal|
|
12
|
+
Segment.setup(literal, :offset => @text.index(literal))
|
13
|
+
end
|
14
|
+
|
15
|
+
@sentences = @text.partition(".").values_at(0, 2).collect do |sentence|
|
16
|
+
Segment.setup sentence, :offset => @text.index(sentence)
|
17
|
+
end
|
18
|
+
end
|
19
|
+
|
20
|
+
def test_make_relative
|
21
|
+
sentence = @sentences[1]
|
22
|
+
|
23
|
+
@entities.each do |e|
|
24
|
+
assert_equal e, @text[e.range]
|
25
|
+
end
|
26
|
+
|
27
|
+
sentence.make_relative @entities do
|
28
|
+
@entities.each do |e|
|
29
|
+
assert_equal e, sentence[e.range]
|
30
|
+
end
|
31
|
+
|
32
|
+
@entities.each do |e|
|
33
|
+
assert_not_equal e, @text[e.range]
|
34
|
+
end
|
35
|
+
end
|
36
|
+
|
37
|
+
@entities.each do |e|
|
38
|
+
assert_equal e, @text[e.range]
|
39
|
+
end
|
40
|
+
end
|
41
|
+
|
42
|
+
def test_range_in
|
43
|
+
sentence = @sentences[1]
|
44
|
+
|
45
|
+
@entities.each do |e|
|
46
|
+
assert_equal e.range_in(sentence).begin, sentence.index(e)
|
47
|
+
assert_equal e.range.begin - sentence.offset, sentence.index(e)
|
48
|
+
end
|
49
|
+
end
|
50
|
+
|
51
|
+
def test_includes
|
52
|
+
@entities.each do |e|
|
53
|
+
assert ! @sentences[0].include?(e)
|
54
|
+
assert @sentences[1].include?(e)
|
55
|
+
assert ! e.include?(@sentences[0])
|
56
|
+
assert ! e.include?(@sentences[1])
|
57
|
+
end
|
58
|
+
end
|
59
|
+
|
60
|
+
def test_overlaps?
|
61
|
+
@entities.each do |e|
|
62
|
+
assert ! @sentences[0].overlaps?(e)
|
63
|
+
assert @sentences[1].overlaps?(e)
|
64
|
+
assert ! e.overlaps?(@sentences[0])
|
65
|
+
assert e.overlaps?(@sentences[1])
|
66
|
+
end
|
67
|
+
end
|
68
|
+
|
69
|
+
end
|
@@ -0,0 +1,42 @@
|
|
1
|
+
require File.join(File.expand_path(File.dirname(__FILE__)), '../..', 'test_helper.rb')
|
2
|
+
require 'rbbt/document'
|
3
|
+
require 'rbbt/document/corpus'
|
4
|
+
require 'rbbt/segment'
|
5
|
+
require 'rbbt/segment/range_index'
|
6
|
+
|
7
|
+
class TestRangeIndex < Test::Unit::TestCase
|
8
|
+
def test_segment_index
|
9
|
+
text = "This sentence mentions the TP53 gene and the CDK5R1 protein"
|
10
|
+
Document.setup(text, "TEST", "test_doc1", nil)
|
11
|
+
|
12
|
+
corpus = Document::Corpus.setup({})
|
13
|
+
|
14
|
+
corpus.add_document(text)
|
15
|
+
|
16
|
+
gene1 = "TP53"
|
17
|
+
gene1.extend Segment
|
18
|
+
gene1.offset = text.index gene1
|
19
|
+
gene1.docid = text.docid
|
20
|
+
|
21
|
+
gene2 = "CDK5R1"
|
22
|
+
gene2.extend Segment
|
23
|
+
gene2.offset = text.index gene2
|
24
|
+
gene2.docid = text.docid
|
25
|
+
|
26
|
+
gene3 = "TP53 gene"
|
27
|
+
gene3.extend Segment
|
28
|
+
gene3.offset = text.index gene1
|
29
|
+
gene3.docid = text.docid
|
30
|
+
|
31
|
+
index = Segment::RangeIndex.index([gene1, gene2, gene3], corpus)
|
32
|
+
assert_equal "CDK5R1", index[gene2.offset + 1].segment.first
|
33
|
+
|
34
|
+
TmpFile.with_file do |fwt|
|
35
|
+
index = Segment::RangeIndex.index([gene1, gene2, gene3], corpus, fwt)
|
36
|
+
assert_equal %w(CDK5R1), index[gene2.offset + 1].segment
|
37
|
+
index = Segment::RangeIndex.index([gene1, gene2, gene3], corpus, fwt)
|
38
|
+
assert_equal %w(CDK5R1), index[gene2.offset + 1].segment
|
39
|
+
end
|
40
|
+
end
|
41
|
+
end
|
42
|
+
|
@@ -1,10 +1,21 @@
|
|
1
|
-
require File.join(File.expand_path(File.dirname(__FILE__)), '
|
2
|
-
require 'rbbt/
|
3
|
-
require 'rbbt/
|
1
|
+
require File.join(File.expand_path(File.dirname(__FILE__)), '../..', 'test_helper.rb')
|
2
|
+
require 'rbbt/segment/transformed'
|
3
|
+
require 'rbbt/segment/named_entity'
|
4
4
|
require 'rexml/document'
|
5
5
|
|
6
|
-
class
|
7
|
-
|
6
|
+
class TestTransformed < Test::Unit::TestCase
|
7
|
+
|
8
|
+
def setup
|
9
|
+
@text = <<-EOF
|
10
|
+
More recently, PPAR activators were shown to inhibit the activation of inflammatory response genes (such as IL-2, IL-6, IL-8, TNF alpha and metalloproteases) by negatively interfering with the NF-kappa B, STAT and AP-1 signalling pathways in cells of the vascular wall.
|
11
|
+
EOF
|
12
|
+
|
13
|
+
@entities = ["PPAR", "IL-2", "IL-6", "IL-8", "TNF alpha", "NF-kappa B", "AP-1", "STAT"].collect do |literal|
|
14
|
+
NamedEntity.setup(literal, :offset => @text.index(literal))
|
15
|
+
end
|
16
|
+
end
|
17
|
+
|
18
|
+
def test_transform
|
8
19
|
text = <<-EOF
|
9
20
|
More recently, PPAR activators were shown to inhibit the activation of inflammatory response genes (such as IL-2, IL-6, IL-8, TNF alpha and metalloproteases) by negatively interfering with the NF-kappa B, STAT and AP-1 signalling pathways in cells of the vascular wall.
|
10
21
|
EOF
|
@@ -13,52 +24,25 @@ More recently, PPAR activators were shown to inhibit the activation of inflammat
|
|
13
24
|
NamedEntity.setup(literal, :offset => text.index(literal))
|
14
25
|
end
|
15
26
|
|
16
|
-
Transformed.
|
17
|
-
|
18
|
-
end
|
27
|
+
Transformed.transform(text, entities, Proc.new{|e| "[" + e.upcase + "]" })
|
28
|
+
assert text.include? "such as [IL-2]"
|
19
29
|
end
|
20
30
|
|
21
|
-
def
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
gene1 = "TP53"
|
26
|
-
gene1.extend Segment
|
27
|
-
gene1.offset = a.index gene1
|
28
|
-
|
29
|
-
gene2 = "CDK5"
|
30
|
-
gene2.extend Segment
|
31
|
-
gene2.offset = a.index gene2
|
32
|
-
|
33
|
-
assert_equal gene1, a[gene1.range]
|
34
|
-
assert_equal gene2, a[gene2.range]
|
35
|
-
|
36
|
-
c = a.dup
|
37
|
-
|
38
|
-
c[gene2.range] = "GN"
|
39
|
-
assert_equal c, Transformed.transform(a,[gene2], "GN")
|
40
|
-
c[gene1.range] = "GN"
|
41
|
-
assert_equal c, Transformed.transform(a,[gene1], "GN")
|
42
|
-
|
43
|
-
iii a.transformation_offset_differences
|
44
|
-
raise
|
45
|
-
assert_equal gene2.offset, a.transformation_offset_differences.first.first.first
|
46
|
-
assert_equal gene1.offset, a.transformation_offset_differences.last.first.first
|
47
|
-
|
48
|
-
|
49
|
-
gene3 = "GN gene"
|
50
|
-
gene3.extend Segment
|
51
|
-
gene3.offset = a.index gene3
|
52
|
-
|
53
|
-
assert_equal gene3, a[gene3.range]
|
31
|
+
def test_with_transform
|
32
|
+
text = <<-EOF
|
33
|
+
More recently, PPAR activators were shown to inhibit the activation of inflammatory response genes (such as IL-2, IL-6, IL-8, TNF alpha and metalloproteases) by negatively interfering with the NF-kappa B, STAT and AP-1 signalling pathways in cells of the vascular wall.
|
34
|
+
EOF
|
54
35
|
|
55
|
-
|
56
|
-
|
57
|
-
|
36
|
+
entities = ["PPAR", "IL-2", "IL-6", "IL-8", "TNF alpha", "NF-kappa B", "AP-1", "STAT"].reverse.collect do |literal|
|
37
|
+
NamedEntity.setup(literal, :offset => text.index(literal))
|
38
|
+
end
|
58
39
|
|
40
|
+
Transformed.with_transform(text, entities, Proc.new{|e| "[" + e.upcase + "]" }) do
|
41
|
+
assert text.include? "such as [IL-2]"
|
42
|
+
end
|
59
43
|
end
|
60
44
|
|
61
|
-
def
|
45
|
+
def test_with_transform_2
|
62
46
|
a = "This sentence mentions the TP53 gene and the CDK5R1 protein"
|
63
47
|
original = a.dup
|
64
48
|
|
@@ -117,18 +101,47 @@ More recently, PPAR activators were shown to inhibit the activation of inflammat
|
|
117
101
|
assert_equal "CDK5R1 protein", exp2
|
118
102
|
end
|
119
103
|
|
104
|
+
def test_with_transform_sentences
|
105
|
+
a = "This first sentence mentions Bread. This sentence mentions the TP53 gene and the CDK5R1 protein"
|
106
|
+
original = a.dup
|
107
|
+
|
108
|
+
gene1 = "TP53"
|
109
|
+
gene1.extend NamedEntity
|
110
|
+
gene1.offset = a.index gene1
|
111
|
+
|
112
|
+
gene2 = "CDK5R1"
|
113
|
+
gene2.extend NamedEntity
|
114
|
+
gene2.offset = a.index gene2
|
115
|
+
|
116
|
+
bread = "Bread"
|
117
|
+
bread.extend NamedEntity
|
118
|
+
bread.offset = a.index bread
|
119
|
+
|
120
|
+
sentences = Segment.align(a, a.split(". "))
|
121
|
+
|
122
|
+
Transformed.with_transform(sentences[1], [gene1, gene2, bread], "GN") do
|
123
|
+
assert sentences[1].include?("GN gene and the GN protein")
|
124
|
+
end
|
125
|
+
|
126
|
+
Transformed.with_transform(sentences[0], [gene1, gene2, bread], "BR") do
|
127
|
+
assert sentences[0].include?("first sentence mentions BR")
|
128
|
+
end
|
129
|
+
|
130
|
+
|
131
|
+
end
|
132
|
+
|
120
133
|
def test_html
|
121
134
|
a = "This sentence mentions the TP53 gene and the CDK5R1 protein"
|
122
135
|
|
123
136
|
gene1 = "TP53"
|
124
137
|
gene1.extend NamedEntity
|
125
138
|
gene1.offset = a.index gene1
|
126
|
-
gene1.
|
139
|
+
gene1.entity_type = "Gene"
|
127
140
|
|
128
141
|
gene2 = "CDK5R1"
|
129
142
|
gene2.extend NamedEntity
|
130
143
|
gene2.offset = a.index gene2
|
131
|
-
gene2.
|
144
|
+
gene2.entity_type = "Protein"
|
132
145
|
|
133
146
|
Transformed.with_transform(a, [gene1,gene2], Proc.new{|e| e.html}) do
|
134
147
|
assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene'>TP53</span> gene and the <span class='Entity' attr-entity-type='Protein'>CDK5R1</span> protein", a
|
@@ -143,13 +156,13 @@ More recently, PPAR activators were shown to inhibit the activation of inflammat
|
|
143
156
|
gene1.extend NamedEntity
|
144
157
|
gene1.offset = a.index gene1
|
145
158
|
gene1.offset += 10
|
146
|
-
gene1.
|
159
|
+
gene1.entity_type = "Gene"
|
147
160
|
|
148
161
|
gene2 = "CDK5R1"
|
149
162
|
gene2.extend NamedEntity
|
150
163
|
gene2.offset = a.index gene2
|
151
164
|
gene2.offset += 10
|
152
|
-
gene2.
|
165
|
+
gene2.entity_type = "Protein"
|
153
166
|
|
154
167
|
Transformed.with_transform(a, [gene1,gene2], Proc.new{|e| e.html}) do
|
155
168
|
assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene'>TP53</span> gene and the <span class='Entity' attr-entity-type='Protein'>CDK5R1</span> protein", a
|
@@ -162,12 +175,12 @@ More recently, PPAR activators were shown to inhibit the activation of inflammat
|
|
162
175
|
gene1 = "TP53"
|
163
176
|
gene1.extend NamedEntity
|
164
177
|
gene1.offset = a.index gene1
|
165
|
-
gene1.
|
178
|
+
gene1.entity_type = "Gene"
|
166
179
|
|
167
180
|
gene2 = "TP53 gene"
|
168
181
|
gene2.extend NamedEntity
|
169
182
|
gene2.offset = a.index gene2
|
170
|
-
gene2.
|
183
|
+
gene2.entity_type = "Expanded Gene"
|
171
184
|
|
172
185
|
assert_equal [gene1], Segment.overlaps(Segment.sort([gene1,gene2]))
|
173
186
|
|
@@ -379,5 +392,46 @@ This is another sentence. Among the nonstructural proteins, the leader protein (
|
|
379
392
|
end
|
380
393
|
end
|
381
394
|
end
|
395
|
+
|
396
|
+
def ___test_transform
|
397
|
+
a = "This sentence mentions the TP53 gene and the CDK5 protein"
|
398
|
+
original = a.dup
|
399
|
+
|
400
|
+
gene1 = "TP53"
|
401
|
+
gene1.extend Segment
|
402
|
+
gene1.offset = a.index gene1
|
403
|
+
|
404
|
+
gene2 = "CDK5"
|
405
|
+
gene2.extend Segment
|
406
|
+
gene2.offset = a.index gene2
|
407
|
+
|
408
|
+
assert_equal gene1, a[gene1.range]
|
409
|
+
assert_equal gene2, a[gene2.range]
|
410
|
+
|
411
|
+
c = a.dup
|
412
|
+
|
413
|
+
c[gene2.range] = "GN"
|
414
|
+
assert_equal c, Transformed.transform(a,[gene2], "GN")
|
415
|
+
c[gene1.range] = "GN"
|
416
|
+
assert_equal c, Transformed.transform(a,[gene1], "GN")
|
417
|
+
|
418
|
+
iii a.transformation_offset_differences
|
419
|
+
raise
|
420
|
+
assert_equal gene2.offset, a.transformation_offset_differences.first.first.first
|
421
|
+
assert_equal gene1.offset, a.transformation_offset_differences.last.first.first
|
422
|
+
|
423
|
+
|
424
|
+
gene3 = "GN gene"
|
425
|
+
gene3.extend Segment
|
426
|
+
gene3.offset = a.index gene3
|
427
|
+
|
428
|
+
assert_equal gene3, a[gene3.range]
|
429
|
+
|
430
|
+
a.restore([gene3])
|
431
|
+
assert_equal original, a
|
432
|
+
assert_equal "TP53 gene", a[gene3.range]
|
433
|
+
|
434
|
+
end
|
435
|
+
|
382
436
|
end
|
383
437
|
|
@@ -0,0 +1,14 @@
|
|
1
|
+
require File.join(File.expand_path(File.dirname(__FILE__)), '..', 'test_helper.rb')
|
2
|
+
require 'rbbt/document'
|
3
|
+
|
4
|
+
class TestDocument < Test::Unit::TestCase
|
5
|
+
|
6
|
+
def test_docid
|
7
|
+
text = "This is a document"
|
8
|
+
Document.setup(text, "TEST", "test_doc1", nil)
|
9
|
+
|
10
|
+
assert_equal ["TEST", "test_doc1", nil, Misc.digest(text)] * ":", text.docid
|
11
|
+
end
|
12
|
+
|
13
|
+
end
|
14
|
+
|
@@ -0,0 +1,182 @@
|
|
1
|
+
require File.join(File.expand_path(File.dirname(__FILE__)), '..', 'test_helper.rb')
|
2
|
+
require 'rbbt/document'
|
3
|
+
require 'rbbt/document/corpus'
|
4
|
+
require 'rbbt/segment'
|
5
|
+
|
6
|
+
class TestSegment < Test::Unit::TestCase
|
7
|
+
def test_segment
|
8
|
+
text = "This is a document"
|
9
|
+
Document.setup(text, "TEST", "test_doc1", nil)
|
10
|
+
|
11
|
+
segment = Segment.setup("is", :offset => text.index("is"), :docid => text.docid)
|
12
|
+
|
13
|
+
assert_equal text.docid + ":" + segment.offset.to_s + ".." + segment.eend.to_s, segment.segid
|
14
|
+
end
|
15
|
+
|
16
|
+
def test_segid
|
17
|
+
text = "This is a document"
|
18
|
+
Document.setup(text, "TEST", "test_doc1", nil)
|
19
|
+
|
20
|
+
corpus = Document::Corpus.setup({})
|
21
|
+
|
22
|
+
corpus.add_document(text)
|
23
|
+
|
24
|
+
segment = Segment.setup("is", :offset => text.index("is"), :docid => text.docid)
|
25
|
+
|
26
|
+
segid = segment.segid(corpus)
|
27
|
+
|
28
|
+
segment = segid.segment
|
29
|
+
assert_equal "is", segment
|
30
|
+
end
|
31
|
+
|
32
|
+
def test_info
|
33
|
+
segment = "test"
|
34
|
+
segment.extend Segment
|
35
|
+
segment.offset = 10
|
36
|
+
assert segment.info.include? :offset
|
37
|
+
end
|
38
|
+
|
39
|
+
def test_sort
|
40
|
+
text = "This sentence mentions the TP53 gene and the CDK5R1 protein"
|
41
|
+
Document.setup(text, "TEST", "test_doc1", nil)
|
42
|
+
|
43
|
+
corpus = Document::Corpus.setup({})
|
44
|
+
|
45
|
+
corpus.add_document(text)
|
46
|
+
|
47
|
+
gene1 = "TP53"
|
48
|
+
gene1.extend Segment
|
49
|
+
gene1.offset = text.index gene1
|
50
|
+
gene1.docid = text.docid
|
51
|
+
|
52
|
+
gene2 = "CDK5R1"
|
53
|
+
gene2.extend Segment
|
54
|
+
gene2.offset = text.index gene2
|
55
|
+
gene2.docid = text.docid
|
56
|
+
|
57
|
+
assert_equal [gene1,gene2], Segment.sort([gene2,gene1])
|
58
|
+
|
59
|
+
assert_equal [gene1,gene2], Segment.sort([gene2.segid(corpus),gene1.segid(corpus)]).collect{|segid| segid.segment}
|
60
|
+
end
|
61
|
+
|
62
|
+
def test_clean_sort
|
63
|
+
text = "This sentence mentions the TP53 gene and the CDK5R1 protein"
|
64
|
+
Document.setup(text, "TEST", "test_doc1", nil)
|
65
|
+
|
66
|
+
corpus = Document::Corpus.setup({})
|
67
|
+
|
68
|
+
corpus.add_document(text)
|
69
|
+
|
70
|
+
gene1 = "TP53"
|
71
|
+
gene1.extend Segment
|
72
|
+
gene1.offset = text.index gene1
|
73
|
+
gene1.docid = text.docid
|
74
|
+
|
75
|
+
gene2 = "CDK5R1"
|
76
|
+
gene2.extend Segment
|
77
|
+
gene2.offset = text.index gene2
|
78
|
+
gene2.docid = text.docid
|
79
|
+
|
80
|
+
gene3 = "TP53 gene"
|
81
|
+
gene3.extend Segment
|
82
|
+
gene3.offset = text.index gene1
|
83
|
+
gene3.docid = text.docid
|
84
|
+
|
85
|
+
assert_equal [gene1,gene2], Segment.sort([gene2,gene1])
|
86
|
+
|
87
|
+
assert_equal [gene3,gene2], Segment.clean_sort([gene2,gene1,gene3])
|
88
|
+
end
|
89
|
+
|
90
|
+
def test_split
|
91
|
+
text = "This sentence mentions the TP53 gene and the CDK5R1 protein"
|
92
|
+
Document.setup(text, "TEST", "test_doc1", nil)
|
93
|
+
|
94
|
+
corpus = Document::Corpus.setup({})
|
95
|
+
|
96
|
+
corpus.add_document(text)
|
97
|
+
|
98
|
+
gene1 = "TP53"
|
99
|
+
gene1.extend Segment
|
100
|
+
gene1.offset = text.index gene1
|
101
|
+
gene1.docid = text.docid
|
102
|
+
|
103
|
+
gene2 = "CDK5R1"
|
104
|
+
gene2.extend Segment
|
105
|
+
gene2.offset = text.index gene2
|
106
|
+
gene2.docid = text.docid
|
107
|
+
|
108
|
+
gene3 = "TP53 gene"
|
109
|
+
gene3.extend Segment
|
110
|
+
gene3.offset = text.index gene1
|
111
|
+
gene3.docid = text.docid
|
112
|
+
|
113
|
+
assert_equal ["This sentence mentions the ", gene3, " and the ", gene2, " protein"], Segment.split(text, [gene2, gene1, gene3])
|
114
|
+
|
115
|
+
assert_equal ["This sentence mentions the ", gene3, " and the ", gene2, " protein"], Segment.split(text, [gene2, gene1, gene3].collect{|s| s.segid})
|
116
|
+
end
|
117
|
+
|
118
|
+
|
119
|
+
def test_align
|
120
|
+
text =<<-EOF
|
121
|
+
Atypical teratoid/rhabdoid tumors (AT/RTs) are highly aggressive brain tumors of early childhood poorly responding to therapy.
|
122
|
+
EOF
|
123
|
+
|
124
|
+
parts = text.split(/\W/)
|
125
|
+
Segment.align(text, parts)
|
126
|
+
|
127
|
+
assert_equal "Atypical teratoid/".length, parts.select{|s| s == "rhabdoid"}.first.offset
|
128
|
+
|
129
|
+
Document.setup(text, "TEST", "test_doc1", nil)
|
130
|
+
|
131
|
+
parts = text.split(/\W/)
|
132
|
+
Segment.align(text, parts)
|
133
|
+
|
134
|
+
assert_equal parts.first.docid, text.docid
|
135
|
+
end
|
136
|
+
|
137
|
+
def test_segment_index
|
138
|
+
text = "This sentence mentions the TP53 gene and the CDK5R1 protein"
|
139
|
+
Document.setup(text, "TEST", "test_doc1", nil)
|
140
|
+
|
141
|
+
corpus = Document::Corpus.setup({})
|
142
|
+
|
143
|
+
corpus.add_document(text)
|
144
|
+
|
145
|
+
gene1 = "TP53"
|
146
|
+
gene1.extend Segment
|
147
|
+
gene1.offset = text.index gene1
|
148
|
+
gene1.docid = text.docid
|
149
|
+
|
150
|
+
gene2 = "CDK5R1"
|
151
|
+
gene2.extend Segment
|
152
|
+
gene2.offset = text.index gene2
|
153
|
+
gene2.docid = text.docid
|
154
|
+
|
155
|
+
gene3 = "TP53 gene"
|
156
|
+
gene3.extend Segment
|
157
|
+
gene3.offset = text.index gene1
|
158
|
+
gene3.docid = text.docid
|
159
|
+
|
160
|
+
index = Segment.index([gene1, gene2, gene3], corpus)
|
161
|
+
assert_equal "CDK5R1", index[gene2.offset + 1].segment.first
|
162
|
+
|
163
|
+
TmpFile.with_file do |fwt|
|
164
|
+
index = Segment.index([gene1, gene2, gene3], corpus, fwt)
|
165
|
+
assert_equal %w(CDK5R1), index[gene2.offset + 1].segment
|
166
|
+
index = Segment.index([gene1, gene2, gene3], corpus, fwt)
|
167
|
+
assert_equal %w(CDK5R1), index[gene2.offset + 1].segment
|
168
|
+
end
|
169
|
+
|
170
|
+
index = Segment.index([gene1, gene2, gene3].collect{|s| s.segid}, corpus)
|
171
|
+
assert_equal "CDK5R1", index[gene2.offset + 1].segment.first
|
172
|
+
|
173
|
+
TmpFile.with_file do |fwt|
|
174
|
+
index = Segment.index([gene1, gene2, gene3].collect{|s| s.segid}, corpus, fwt)
|
175
|
+
assert_equal %w(CDK5R1), index[gene2.offset + 1].segment
|
176
|
+
index = Segment.index([gene1, gene2, gene3].collect{|s| s.segid}, corpus, fwt)
|
177
|
+
assert_equal %w(CDK5R1), index[gene2.offset + 1].segment
|
178
|
+
end
|
179
|
+
end
|
180
|
+
|
181
|
+
end
|
182
|
+
|