rbbt-text 1.2.0 → 1.3.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76)
  1. checksums.yaml +4 -4
  2. data/lib/rbbt/bow/bow.rb +5 -2
  3. data/lib/rbbt/bow/dictionary.rb +27 -23
  4. data/lib/rbbt/document.rb +55 -0
  5. data/lib/rbbt/document/annotation.rb +45 -0
  6. data/lib/rbbt/document/corpus.rb +63 -0
  7. data/lib/rbbt/document/corpus/pubmed.rb +33 -0
  8. data/lib/rbbt/ner/NER.rb +3 -3
  9. data/lib/rbbt/ner/abner.rb +1 -1
  10. data/lib/rbbt/ner/banner.rb +1 -1
  11. data/lib/rbbt/ner/brat.rb +1 -1
  12. data/lib/rbbt/ner/chemical_tagger.rb +1 -2
  13. data/lib/rbbt/ner/g_norm_plus.rb +26 -3
  14. data/lib/rbbt/ner/linnaeus.rb +3 -3
  15. data/lib/rbbt/ner/ngram_prefix_dictionary.rb +3 -3
  16. data/lib/rbbt/ner/oscar3.rb +1 -2
  17. data/lib/rbbt/ner/oscar4.rb +3 -3
  18. data/lib/rbbt/ner/patterns.rb +5 -5
  19. data/lib/rbbt/ner/regexpNER.rb +1 -2
  20. data/lib/rbbt/ner/token_trieNER.rb +35 -22
  21. data/lib/rbbt/nlp/genia/sentence_splitter.rb +3 -2
  22. data/lib/rbbt/nlp/nlp.rb +5 -5
  23. data/lib/rbbt/nlp/open_nlp/sentence_splitter.rb +37 -36
  24. data/lib/rbbt/nlp/spaCy.rb +52 -0
  25. data/lib/rbbt/segment.rb +179 -0
  26. data/lib/rbbt/segment/annotation.rb +58 -0
  27. data/lib/rbbt/segment/encoding.rb +18 -0
  28. data/lib/rbbt/{text/segment → segment}/named_entity.rb +14 -11
  29. data/lib/rbbt/segment/overlaps.rb +63 -0
  30. data/lib/rbbt/segment/range_index.rb +35 -0
  31. data/lib/rbbt/segment/relationship.rb +7 -0
  32. data/lib/rbbt/{text/segment → segment}/segmented.rb +1 -1
  33. data/lib/rbbt/segment/token.rb +23 -0
  34. data/lib/rbbt/{text/segment → segment}/transformed.rb +12 -10
  35. data/lib/rbbt/segment/tsv.rb +41 -0
  36. data/share/install/software/Linnaeus +1 -1
  37. data/share/install/software/OpenNLP +1 -1
  38. data/test/rbbt/document/corpus/test_pubmed.rb +15 -0
  39. data/test/rbbt/document/test_annotation.rb +140 -0
  40. data/test/rbbt/document/test_corpus.rb +33 -0
  41. data/test/rbbt/ner/test_finder.rb +3 -3
  42. data/test/rbbt/ner/test_g_norm_plus.rb +20 -2
  43. data/test/rbbt/ner/test_patterns.rb +9 -9
  44. data/test/rbbt/ner/test_regexpNER.rb +14 -14
  45. data/test/rbbt/ner/test_rnorm.rb +3 -4
  46. data/test/rbbt/ner/test_token_trieNER.rb +1 -0
  47. data/test/rbbt/nlp/genia/test_sentence_splitter.rb +37 -3
  48. data/test/rbbt/nlp/open_nlp/test_sentence_splitter.rb +20 -4
  49. data/test/rbbt/segment/test_annotation.rb +39 -0
  50. data/test/rbbt/segment/test_corpus.rb +36 -0
  51. data/test/rbbt/segment/test_encoding.rb +24 -0
  52. data/test/rbbt/{text/segment → segment}/test_named_entity.rb +15 -11
  53. data/test/rbbt/segment/test_overlaps.rb +69 -0
  54. data/test/rbbt/segment/test_range_index.rb +42 -0
  55. data/test/rbbt/{text/segment → segment}/test_transformed.rb +105 -51
  56. data/test/rbbt/test_document.rb +14 -0
  57. data/test/rbbt/test_segment.rb +182 -0
  58. data/test/test_helper.rb +5 -3
  59. data/test/test_spaCy.rb +32 -0
  60. metadata +44 -32
  61. data/lib/rbbt/text/corpus.rb +0 -106
  62. data/lib/rbbt/text/corpus/document.rb +0 -383
  63. data/lib/rbbt/text/corpus/document_repo.rb +0 -68
  64. data/lib/rbbt/text/corpus/sources/pmid.rb +0 -34
  65. data/lib/rbbt/text/document.rb +0 -39
  66. data/lib/rbbt/text/segment.rb +0 -363
  67. data/lib/rbbt/text/segment/docid.rb +0 -46
  68. data/lib/rbbt/text/segment/relationship.rb +0 -24
  69. data/lib/rbbt/text/segment/token.rb +0 -49
  70. data/test/rbbt/text/corpus/sources/test_pmid.rb +0 -33
  71. data/test/rbbt/text/corpus/test_document.rb +0 -82
  72. data/test/rbbt/text/segment/test_relationship.rb +0 -0
  73. data/test/rbbt/text/segment/test_segmented.rb +0 -23
  74. data/test/rbbt/text/test_corpus.rb +0 -34
  75. data/test/rbbt/text/test_document.rb +0 -58
  76. data/test/rbbt/text/test_segment.rb +0 -100
@@ -0,0 +1,69 @@
1
+ require File.join(File.expand_path(File.dirname(__FILE__)), '../..', 'test_helper.rb')
2
+ require 'rbbt/segment'
3
+ require 'rbbt/segment/overlaps'
4
+
5
+ class TestOverlaps < Test::Unit::TestCase
6
+ def setup
7
+ @text = <<-EOF
8
+ This is a first sentence. More recently, PPAR activators were shown to inhibit the activation of inflammatory response genes (such as IL-2, IL-6, IL-8, TNF alpha and metalloproteases) by negatively interfering with the NF-kappa B, STAT and AP-1 signalling pathways in cells of the vascular wall.
9
+ EOF
10
+
11
+ @entities = ["PPAR", "IL-2", "IL-6", "IL-8", "TNF", "TNF alpha", "NF-kappa B", "AP-1", "STAT"].collect do |literal|
12
+ Segment.setup(literal, :offset => @text.index(literal))
13
+ end
14
+
15
+ @sentences = @text.partition(".").values_at(0, 2).collect do |sentence|
16
+ Segment.setup sentence, :offset => @text.index(sentence)
17
+ end
18
+ end
19
+
20
+ def test_make_relative
21
+ sentence = @sentences[1]
22
+
23
+ @entities.each do |e|
24
+ assert_equal e, @text[e.range]
25
+ end
26
+
27
+ sentence.make_relative @entities do
28
+ @entities.each do |e|
29
+ assert_equal e, sentence[e.range]
30
+ end
31
+
32
+ @entities.each do |e|
33
+ assert_not_equal e, @text[e.range]
34
+ end
35
+ end
36
+
37
+ @entities.each do |e|
38
+ assert_equal e, @text[e.range]
39
+ end
40
+ end
41
+
42
+ def test_range_in
43
+ sentence = @sentences[1]
44
+
45
+ @entities.each do |e|
46
+ assert_equal e.range_in(sentence).begin, sentence.index(e)
47
+ assert_equal e.range.begin - sentence.offset, sentence.index(e)
48
+ end
49
+ end
50
+
51
+ def test_includes
52
+ @entities.each do |e|
53
+ assert ! @sentences[0].include?(e)
54
+ assert @sentences[1].include?(e)
55
+ assert ! e.include?(@sentences[0])
56
+ assert ! e.include?(@sentences[1])
57
+ end
58
+ end
59
+
60
+ def test_overlaps?
61
+ @entities.each do |e|
62
+ assert ! @sentences[0].overlaps?(e)
63
+ assert @sentences[1].overlaps?(e)
64
+ assert ! e.overlaps?(@sentences[0])
65
+ assert e.overlaps?(@sentences[1])
66
+ end
67
+ end
68
+
69
+ end
@@ -0,0 +1,42 @@
1
+ require File.join(File.expand_path(File.dirname(__FILE__)), '../..', 'test_helper.rb')
2
+ require 'rbbt/document'
3
+ require 'rbbt/document/corpus'
4
+ require 'rbbt/segment'
5
+ require 'rbbt/segment/range_index'
6
+
7
+ class TestRangeIndex < Test::Unit::TestCase
8
+ def test_segment_index
9
+ text = "This sentence mentions the TP53 gene and the CDK5R1 protein"
10
+ Document.setup(text, "TEST", "test_doc1", nil)
11
+
12
+ corpus = Document::Corpus.setup({})
13
+
14
+ corpus.add_document(text)
15
+
16
+ gene1 = "TP53"
17
+ gene1.extend Segment
18
+ gene1.offset = text.index gene1
19
+ gene1.docid = text.docid
20
+
21
+ gene2 = "CDK5R1"
22
+ gene2.extend Segment
23
+ gene2.offset = text.index gene2
24
+ gene2.docid = text.docid
25
+
26
+ gene3 = "TP53 gene"
27
+ gene3.extend Segment
28
+ gene3.offset = text.index gene1
29
+ gene3.docid = text.docid
30
+
31
+ index = Segment::RangeIndex.index([gene1, gene2, gene3], corpus)
32
+ assert_equal "CDK5R1", index[gene2.offset + 1].segment.first
33
+
34
+ TmpFile.with_file do |fwt|
35
+ index = Segment::RangeIndex.index([gene1, gene2, gene3], corpus, fwt)
36
+ assert_equal %w(CDK5R1), index[gene2.offset + 1].segment
37
+ index = Segment::RangeIndex.index([gene1, gene2, gene3], corpus, fwt)
38
+ assert_equal %w(CDK5R1), index[gene2.offset + 1].segment
39
+ end
40
+ end
41
+ end
42
+
@@ -1,10 +1,21 @@
1
- require File.join(File.expand_path(File.dirname(__FILE__)), '../../..', 'test_helper.rb')
2
- require 'rbbt/text/segment/transformed'
3
- require 'rbbt/text/segment/named_entity'
1
+ require File.join(File.expand_path(File.dirname(__FILE__)), '../..', 'test_helper.rb')
2
+ require 'rbbt/segment/transformed'
3
+ require 'rbbt/segment/named_entity'
4
4
  require 'rexml/document'
5
5
 
6
- class TestClass < Test::Unit::TestCase
7
- def test_sort
6
+ class TestTransformed < Test::Unit::TestCase
7
+
8
+ def setup
9
+ @text = <<-EOF
10
+ More recently, PPAR activators were shown to inhibit the activation of inflammatory response genes (such as IL-2, IL-6, IL-8, TNF alpha and metalloproteases) by negatively interfering with the NF-kappa B, STAT and AP-1 signalling pathways in cells of the vascular wall.
11
+ EOF
12
+
13
+ @entities = ["PPAR", "IL-2", "IL-6", "IL-8", "TNF alpha", "NF-kappa B", "AP-1", "STAT"].collect do |literal|
14
+ NamedEntity.setup(literal, :offset => @text.index(literal))
15
+ end
16
+ end
17
+
18
+ def test_transform
8
19
  text = <<-EOF
9
20
  More recently, PPAR activators were shown to inhibit the activation of inflammatory response genes (such as IL-2, IL-6, IL-8, TNF alpha and metalloproteases) by negatively interfering with the NF-kappa B, STAT and AP-1 signalling pathways in cells of the vascular wall.
10
21
  EOF
@@ -13,52 +24,25 @@ More recently, PPAR activators were shown to inhibit the activation of inflammat
13
24
  NamedEntity.setup(literal, :offset => text.index(literal))
14
25
  end
15
26
 
16
- Transformed.with_transform(text, entities, Proc.new{|e| "[" + e.upcase + "]" }) do
17
- assert text.include? "such as [IL-2]"
18
- end
27
+ Transformed.transform(text, entities, Proc.new{|e| "[" + e.upcase + "]" })
28
+ assert text.include? "such as [IL-2]"
19
29
  end
20
30
 
21
- def ___test_transform
22
- a = "This sentence mentions the TP53 gene and the CDK5 protein"
23
- original = a.dup
24
-
25
- gene1 = "TP53"
26
- gene1.extend Segment
27
- gene1.offset = a.index gene1
28
-
29
- gene2 = "CDK5"
30
- gene2.extend Segment
31
- gene2.offset = a.index gene2
32
-
33
- assert_equal gene1, a[gene1.range]
34
- assert_equal gene2, a[gene2.range]
35
-
36
- c = a.dup
37
-
38
- c[gene2.range] = "GN"
39
- assert_equal c, Transformed.transform(a,[gene2], "GN")
40
- c[gene1.range] = "GN"
41
- assert_equal c, Transformed.transform(a,[gene1], "GN")
42
-
43
- iii a.transformation_offset_differences
44
- raise
45
- assert_equal gene2.offset, a.transformation_offset_differences.first.first.first
46
- assert_equal gene1.offset, a.transformation_offset_differences.last.first.first
47
-
48
-
49
- gene3 = "GN gene"
50
- gene3.extend Segment
51
- gene3.offset = a.index gene3
52
-
53
- assert_equal gene3, a[gene3.range]
31
+ def test_with_transform
32
+ text = <<-EOF
33
+ More recently, PPAR activators were shown to inhibit the activation of inflammatory response genes (such as IL-2, IL-6, IL-8, TNF alpha and metalloproteases) by negatively interfering with the NF-kappa B, STAT and AP-1 signalling pathways in cells of the vascular wall.
34
+ EOF
54
35
 
55
- a.restore([gene3])
56
- assert_equal original, a
57
- assert_equal "TP53 gene", a[gene3.range]
36
+ entities = ["PPAR", "IL-2", "IL-6", "IL-8", "TNF alpha", "NF-kappa B", "AP-1", "STAT"].reverse.collect do |literal|
37
+ NamedEntity.setup(literal, :offset => text.index(literal))
38
+ end
58
39
 
40
+ Transformed.with_transform(text, entities, Proc.new{|e| "[" + e.upcase + "]" }) do
41
+ assert text.include? "such as [IL-2]"
42
+ end
59
43
  end
60
44
 
61
- def test_with_transform
45
+ def test_with_transform_2
62
46
  a = "This sentence mentions the TP53 gene and the CDK5R1 protein"
63
47
  original = a.dup
64
48
 
@@ -117,18 +101,47 @@ More recently, PPAR activators were shown to inhibit the activation of inflammat
117
101
  assert_equal "CDK5R1 protein", exp2
118
102
  end
119
103
 
104
+ def test_with_transform_sentences
105
+ a = "This first sentence mentions Bread. This sentence mentions the TP53 gene and the CDK5R1 protein"
106
+ original = a.dup
107
+
108
+ gene1 = "TP53"
109
+ gene1.extend NamedEntity
110
+ gene1.offset = a.index gene1
111
+
112
+ gene2 = "CDK5R1"
113
+ gene2.extend NamedEntity
114
+ gene2.offset = a.index gene2
115
+
116
+ bread = "Bread"
117
+ bread.extend NamedEntity
118
+ bread.offset = a.index bread
119
+
120
+ sentences = Segment.align(a, a.split(". "))
121
+
122
+ Transformed.with_transform(sentences[1], [gene1, gene2, bread], "GN") do
123
+ assert sentences[1].include?("GN gene and the GN protein")
124
+ end
125
+
126
+ Transformed.with_transform(sentences[0], [gene1, gene2, bread], "BR") do
127
+ assert sentences[0].include?("first sentence mentions BR")
128
+ end
129
+
130
+
131
+ end
132
+
120
133
  def test_html
121
134
  a = "This sentence mentions the TP53 gene and the CDK5R1 protein"
122
135
 
123
136
  gene1 = "TP53"
124
137
  gene1.extend NamedEntity
125
138
  gene1.offset = a.index gene1
126
- gene1.type = "Gene"
139
+ gene1.entity_type = "Gene"
127
140
 
128
141
  gene2 = "CDK5R1"
129
142
  gene2.extend NamedEntity
130
143
  gene2.offset = a.index gene2
131
- gene2.type = "Protein"
144
+ gene2.entity_type = "Protein"
132
145
 
133
146
  Transformed.with_transform(a, [gene1,gene2], Proc.new{|e| e.html}) do
134
147
  assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene'>TP53</span> gene and the <span class='Entity' attr-entity-type='Protein'>CDK5R1</span> protein", a
@@ -143,13 +156,13 @@ More recently, PPAR activators were shown to inhibit the activation of inflammat
143
156
  gene1.extend NamedEntity
144
157
  gene1.offset = a.index gene1
145
158
  gene1.offset += 10
146
- gene1.type = "Gene"
159
+ gene1.entity_type = "Gene"
147
160
 
148
161
  gene2 = "CDK5R1"
149
162
  gene2.extend NamedEntity
150
163
  gene2.offset = a.index gene2
151
164
  gene2.offset += 10
152
- gene2.type = "Protein"
165
+ gene2.entity_type = "Protein"
153
166
 
154
167
  Transformed.with_transform(a, [gene1,gene2], Proc.new{|e| e.html}) do
155
168
  assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene'>TP53</span> gene and the <span class='Entity' attr-entity-type='Protein'>CDK5R1</span> protein", a
@@ -162,12 +175,12 @@ More recently, PPAR activators were shown to inhibit the activation of inflammat
162
175
  gene1 = "TP53"
163
176
  gene1.extend NamedEntity
164
177
  gene1.offset = a.index gene1
165
- gene1.type = "Gene"
178
+ gene1.entity_type = "Gene"
166
179
 
167
180
  gene2 = "TP53 gene"
168
181
  gene2.extend NamedEntity
169
182
  gene2.offset = a.index gene2
170
- gene2.type = "Expanded Gene"
183
+ gene2.entity_type = "Expanded Gene"
171
184
 
172
185
  assert_equal [gene1], Segment.overlaps(Segment.sort([gene1,gene2]))
173
186
 
@@ -379,5 +392,46 @@ This is another sentence. Among the nonstructural proteins, the leader protein (
379
392
  end
380
393
  end
381
394
  end
395
+
396
+ def ___test_transform
397
+ a = "This sentence mentions the TP53 gene and the CDK5 protein"
398
+ original = a.dup
399
+
400
+ gene1 = "TP53"
401
+ gene1.extend Segment
402
+ gene1.offset = a.index gene1
403
+
404
+ gene2 = "CDK5"
405
+ gene2.extend Segment
406
+ gene2.offset = a.index gene2
407
+
408
+ assert_equal gene1, a[gene1.range]
409
+ assert_equal gene2, a[gene2.range]
410
+
411
+ c = a.dup
412
+
413
+ c[gene2.range] = "GN"
414
+ assert_equal c, Transformed.transform(a,[gene2], "GN")
415
+ c[gene1.range] = "GN"
416
+ assert_equal c, Transformed.transform(a,[gene1], "GN")
417
+
418
+ iii a.transformation_offset_differences
419
+ raise
420
+ assert_equal gene2.offset, a.transformation_offset_differences.first.first.first
421
+ assert_equal gene1.offset, a.transformation_offset_differences.last.first.first
422
+
423
+
424
+ gene3 = "GN gene"
425
+ gene3.extend Segment
426
+ gene3.offset = a.index gene3
427
+
428
+ assert_equal gene3, a[gene3.range]
429
+
430
+ a.restore([gene3])
431
+ assert_equal original, a
432
+ assert_equal "TP53 gene", a[gene3.range]
433
+
434
+ end
435
+
382
436
  end
383
437
 
@@ -0,0 +1,14 @@
1
+ require File.join(File.expand_path(File.dirname(__FILE__)), '..', 'test_helper.rb')
2
+ require 'rbbt/document'
3
+
4
+ class TestDocument < Test::Unit::TestCase
5
+
6
+ def test_docid
7
+ text = "This is a document"
8
+ Document.setup(text, "TEST", "test_doc1", nil)
9
+
10
+ assert_equal ["TEST", "test_doc1", nil, Misc.digest(text)] * ":", text.docid
11
+ end
12
+
13
+ end
14
+
@@ -0,0 +1,182 @@
1
+ require File.join(File.expand_path(File.dirname(__FILE__)), '..', 'test_helper.rb')
2
+ require 'rbbt/document'
3
+ require 'rbbt/document/corpus'
4
+ require 'rbbt/segment'
5
+
6
+ class TestSegment < Test::Unit::TestCase
7
+ def test_segment
8
+ text = "This is a document"
9
+ Document.setup(text, "TEST", "test_doc1", nil)
10
+
11
+ segment = Segment.setup("is", :offset => text.index("is"), :docid => text.docid)
12
+
13
+ assert_equal text.docid + ":" + segment.offset.to_s + ".." + segment.eend.to_s, segment.segid
14
+ end
15
+
16
+ def test_segid
17
+ text = "This is a document"
18
+ Document.setup(text, "TEST", "test_doc1", nil)
19
+
20
+ corpus = Document::Corpus.setup({})
21
+
22
+ corpus.add_document(text)
23
+
24
+ segment = Segment.setup("is", :offset => text.index("is"), :docid => text.docid)
25
+
26
+ segid = segment.segid(corpus)
27
+
28
+ segment = segid.segment
29
+ assert_equal "is", segment
30
+ end
31
+
32
+ def test_info
33
+ segment = "test"
34
+ segment.extend Segment
35
+ segment.offset = 10
36
+ assert segment.info.include? :offset
37
+ end
38
+
39
+ def test_sort
40
+ text = "This sentence mentions the TP53 gene and the CDK5R1 protein"
41
+ Document.setup(text, "TEST", "test_doc1", nil)
42
+
43
+ corpus = Document::Corpus.setup({})
44
+
45
+ corpus.add_document(text)
46
+
47
+ gene1 = "TP53"
48
+ gene1.extend Segment
49
+ gene1.offset = text.index gene1
50
+ gene1.docid = text.docid
51
+
52
+ gene2 = "CDK5R1"
53
+ gene2.extend Segment
54
+ gene2.offset = text.index gene2
55
+ gene2.docid = text.docid
56
+
57
+ assert_equal [gene1,gene2], Segment.sort([gene2,gene1])
58
+
59
+ assert_equal [gene1,gene2], Segment.sort([gene2.segid(corpus),gene1.segid(corpus)]).collect{|segid| segid.segment}
60
+ end
61
+
62
+ def test_clean_sort
63
+ text = "This sentence mentions the TP53 gene and the CDK5R1 protein"
64
+ Document.setup(text, "TEST", "test_doc1", nil)
65
+
66
+ corpus = Document::Corpus.setup({})
67
+
68
+ corpus.add_document(text)
69
+
70
+ gene1 = "TP53"
71
+ gene1.extend Segment
72
+ gene1.offset = text.index gene1
73
+ gene1.docid = text.docid
74
+
75
+ gene2 = "CDK5R1"
76
+ gene2.extend Segment
77
+ gene2.offset = text.index gene2
78
+ gene2.docid = text.docid
79
+
80
+ gene3 = "TP53 gene"
81
+ gene3.extend Segment
82
+ gene3.offset = text.index gene1
83
+ gene3.docid = text.docid
84
+
85
+ assert_equal [gene1,gene2], Segment.sort([gene2,gene1])
86
+
87
+ assert_equal [gene3,gene2], Segment.clean_sort([gene2,gene1,gene3])
88
+ end
89
+
90
+ def test_split
91
+ text = "This sentence mentions the TP53 gene and the CDK5R1 protein"
92
+ Document.setup(text, "TEST", "test_doc1", nil)
93
+
94
+ corpus = Document::Corpus.setup({})
95
+
96
+ corpus.add_document(text)
97
+
98
+ gene1 = "TP53"
99
+ gene1.extend Segment
100
+ gene1.offset = text.index gene1
101
+ gene1.docid = text.docid
102
+
103
+ gene2 = "CDK5R1"
104
+ gene2.extend Segment
105
+ gene2.offset = text.index gene2
106
+ gene2.docid = text.docid
107
+
108
+ gene3 = "TP53 gene"
109
+ gene3.extend Segment
110
+ gene3.offset = text.index gene1
111
+ gene3.docid = text.docid
112
+
113
+ assert_equal ["This sentence mentions the ", gene3, " and the ", gene2, " protein"], Segment.split(text, [gene2, gene1, gene3])
114
+
115
+ assert_equal ["This sentence mentions the ", gene3, " and the ", gene2, " protein"], Segment.split(text, [gene2, gene1, gene3].collect{|s| s.segid})
116
+ end
117
+
118
+
119
+ def test_align
120
+ text =<<-EOF
121
+ Atypical teratoid/rhabdoid tumors (AT/RTs) are highly aggressive brain tumors of early childhood poorly responding to therapy.
122
+ EOF
123
+
124
+ parts = text.split(/\W/)
125
+ Segment.align(text, parts)
126
+
127
+ assert_equal "Atypical teratoid/".length, parts.select{|s| s == "rhabdoid"}.first.offset
128
+
129
+ Document.setup(text, "TEST", "test_doc1", nil)
130
+
131
+ parts = text.split(/\W/)
132
+ Segment.align(text, parts)
133
+
134
+ assert_equal parts.first.docid, text.docid
135
+ end
136
+
137
+ def test_segment_index
138
+ text = "This sentence mentions the TP53 gene and the CDK5R1 protein"
139
+ Document.setup(text, "TEST", "test_doc1", nil)
140
+
141
+ corpus = Document::Corpus.setup({})
142
+
143
+ corpus.add_document(text)
144
+
145
+ gene1 = "TP53"
146
+ gene1.extend Segment
147
+ gene1.offset = text.index gene1
148
+ gene1.docid = text.docid
149
+
150
+ gene2 = "CDK5R1"
151
+ gene2.extend Segment
152
+ gene2.offset = text.index gene2
153
+ gene2.docid = text.docid
154
+
155
+ gene3 = "TP53 gene"
156
+ gene3.extend Segment
157
+ gene3.offset = text.index gene1
158
+ gene3.docid = text.docid
159
+
160
+ index = Segment.index([gene1, gene2, gene3], corpus)
161
+ assert_equal "CDK5R1", index[gene2.offset + 1].segment.first
162
+
163
+ TmpFile.with_file do |fwt|
164
+ index = Segment.index([gene1, gene2, gene3], corpus, fwt)
165
+ assert_equal %w(CDK5R1), index[gene2.offset + 1].segment
166
+ index = Segment.index([gene1, gene2, gene3], corpus, fwt)
167
+ assert_equal %w(CDK5R1), index[gene2.offset + 1].segment
168
+ end
169
+
170
+ index = Segment.index([gene1, gene2, gene3].collect{|s| s.segid}, corpus)
171
+ assert_equal "CDK5R1", index[gene2.offset + 1].segment.first
172
+
173
+ TmpFile.with_file do |fwt|
174
+ index = Segment.index([gene1, gene2, gene3].collect{|s| s.segid}, corpus, fwt)
175
+ assert_equal %w(CDK5R1), index[gene2.offset + 1].segment
176
+ index = Segment.index([gene1, gene2, gene3].collect{|s| s.segid}, corpus, fwt)
177
+ assert_equal %w(CDK5R1), index[gene2.offset + 1].segment
178
+ end
179
+ end
180
+
181
+ end
182
+