rbbt-text 1.2.0 → 1.3.4

Sign up to get free protection for your applications and to get access to all the features.
Files changed (76) hide show
  1. checksums.yaml +4 -4
  2. data/lib/rbbt/bow/bow.rb +5 -2
  3. data/lib/rbbt/bow/dictionary.rb +27 -23
  4. data/lib/rbbt/document.rb +55 -0
  5. data/lib/rbbt/document/annotation.rb +45 -0
  6. data/lib/rbbt/document/corpus.rb +63 -0
  7. data/lib/rbbt/document/corpus/pubmed.rb +33 -0
  8. data/lib/rbbt/ner/NER.rb +3 -3
  9. data/lib/rbbt/ner/abner.rb +1 -1
  10. data/lib/rbbt/ner/banner.rb +1 -1
  11. data/lib/rbbt/ner/brat.rb +1 -1
  12. data/lib/rbbt/ner/chemical_tagger.rb +1 -2
  13. data/lib/rbbt/ner/g_norm_plus.rb +26 -3
  14. data/lib/rbbt/ner/linnaeus.rb +3 -3
  15. data/lib/rbbt/ner/ngram_prefix_dictionary.rb +3 -3
  16. data/lib/rbbt/ner/oscar3.rb +1 -2
  17. data/lib/rbbt/ner/oscar4.rb +3 -3
  18. data/lib/rbbt/ner/patterns.rb +5 -5
  19. data/lib/rbbt/ner/regexpNER.rb +1 -2
  20. data/lib/rbbt/ner/token_trieNER.rb +35 -22
  21. data/lib/rbbt/nlp/genia/sentence_splitter.rb +3 -2
  22. data/lib/rbbt/nlp/nlp.rb +5 -5
  23. data/lib/rbbt/nlp/open_nlp/sentence_splitter.rb +37 -36
  24. data/lib/rbbt/nlp/spaCy.rb +52 -0
  25. data/lib/rbbt/segment.rb +179 -0
  26. data/lib/rbbt/segment/annotation.rb +58 -0
  27. data/lib/rbbt/segment/encoding.rb +18 -0
  28. data/lib/rbbt/{text/segment → segment}/named_entity.rb +14 -11
  29. data/lib/rbbt/segment/overlaps.rb +63 -0
  30. data/lib/rbbt/segment/range_index.rb +35 -0
  31. data/lib/rbbt/segment/relationship.rb +7 -0
  32. data/lib/rbbt/{text/segment → segment}/segmented.rb +1 -1
  33. data/lib/rbbt/segment/token.rb +23 -0
  34. data/lib/rbbt/{text/segment → segment}/transformed.rb +12 -10
  35. data/lib/rbbt/segment/tsv.rb +41 -0
  36. data/share/install/software/Linnaeus +1 -1
  37. data/share/install/software/OpenNLP +1 -1
  38. data/test/rbbt/document/corpus/test_pubmed.rb +15 -0
  39. data/test/rbbt/document/test_annotation.rb +140 -0
  40. data/test/rbbt/document/test_corpus.rb +33 -0
  41. data/test/rbbt/ner/test_finder.rb +3 -3
  42. data/test/rbbt/ner/test_g_norm_plus.rb +20 -2
  43. data/test/rbbt/ner/test_patterns.rb +9 -9
  44. data/test/rbbt/ner/test_regexpNER.rb +14 -14
  45. data/test/rbbt/ner/test_rnorm.rb +3 -4
  46. data/test/rbbt/ner/test_token_trieNER.rb +1 -0
  47. data/test/rbbt/nlp/genia/test_sentence_splitter.rb +37 -3
  48. data/test/rbbt/nlp/open_nlp/test_sentence_splitter.rb +20 -4
  49. data/test/rbbt/segment/test_annotation.rb +39 -0
  50. data/test/rbbt/segment/test_corpus.rb +36 -0
  51. data/test/rbbt/segment/test_encoding.rb +24 -0
  52. data/test/rbbt/{text/segment → segment}/test_named_entity.rb +15 -11
  53. data/test/rbbt/segment/test_overlaps.rb +69 -0
  54. data/test/rbbt/segment/test_range_index.rb +42 -0
  55. data/test/rbbt/{text/segment → segment}/test_transformed.rb +105 -51
  56. data/test/rbbt/test_document.rb +14 -0
  57. data/test/rbbt/test_segment.rb +182 -0
  58. data/test/test_helper.rb +5 -3
  59. data/test/test_spaCy.rb +32 -0
  60. metadata +44 -32
  61. data/lib/rbbt/text/corpus.rb +0 -106
  62. data/lib/rbbt/text/corpus/document.rb +0 -383
  63. data/lib/rbbt/text/corpus/document_repo.rb +0 -68
  64. data/lib/rbbt/text/corpus/sources/pmid.rb +0 -34
  65. data/lib/rbbt/text/document.rb +0 -39
  66. data/lib/rbbt/text/segment.rb +0 -363
  67. data/lib/rbbt/text/segment/docid.rb +0 -46
  68. data/lib/rbbt/text/segment/relationship.rb +0 -24
  69. data/lib/rbbt/text/segment/token.rb +0 -49
  70. data/test/rbbt/text/corpus/sources/test_pmid.rb +0 -33
  71. data/test/rbbt/text/corpus/test_document.rb +0 -82
  72. data/test/rbbt/text/segment/test_relationship.rb +0 -0
  73. data/test/rbbt/text/segment/test_segmented.rb +0 -23
  74. data/test/rbbt/text/test_corpus.rb +0 -34
  75. data/test/rbbt/text/test_document.rb +0 -58
  76. data/test/rbbt/text/test_segment.rb +0 -100
@@ -0,0 +1,69 @@
1
+ require File.join(File.expand_path(File.dirname(__FILE__)), '../..', 'test_helper.rb')
2
+ require 'rbbt/segment'
3
+ require 'rbbt/segment/overlaps'
4
+
5
+ class TestOverlaps < Test::Unit::TestCase
6
+ def setup
7
+ @text = <<-EOF
8
+ This is a first sentence. More recently, PPAR activators were shown to inhibit the activation of inflammatory response genes (such as IL-2, IL-6, IL-8, TNF alpha and metalloproteases) by negatively interfering with the NF-kappa B, STAT and AP-1 signalling pathways in cells of the vascular wall.
9
+ EOF
10
+
11
+ @entities = ["PPAR", "IL-2", "IL-6", "IL-8", "TNF", "TNF alpha", "NF-kappa B", "AP-1", "STAT"].collect do |literal|
12
+ Segment.setup(literal, :offset => @text.index(literal))
13
+ end
14
+
15
+ @sentences = @text.partition(".").values_at(0, 2).collect do |sentence|
16
+ Segment.setup sentence, :offset => @text.index(sentence)
17
+ end
18
+ end
19
+
20
+ def test_make_relative
21
+ sentence = @sentences[1]
22
+
23
+ @entities.each do |e|
24
+ assert_equal e, @text[e.range]
25
+ end
26
+
27
+ sentence.make_relative @entities do
28
+ @entities.each do |e|
29
+ assert_equal e, sentence[e.range]
30
+ end
31
+
32
+ @entities.each do |e|
33
+ assert_not_equal e, @text[e.range]
34
+ end
35
+ end
36
+
37
+ @entities.each do |e|
38
+ assert_equal e, @text[e.range]
39
+ end
40
+ end
41
+
42
+ def test_range_in
43
+ sentence = @sentences[1]
44
+
45
+ @entities.each do |e|
46
+ assert_equal e.range_in(sentence).begin, sentence.index(e)
47
+ assert_equal e.range.begin - sentence.offset, sentence.index(e)
48
+ end
49
+ end
50
+
51
+ def test_includes
52
+ @entities.each do |e|
53
+ assert ! @sentences[0].include?(e)
54
+ assert @sentences[1].include?(e)
55
+ assert ! e.include?(@sentences[0])
56
+ assert ! e.include?(@sentences[1])
57
+ end
58
+ end
59
+
60
+ def test_overlaps?
61
+ @entities.each do |e|
62
+ assert ! @sentences[0].overlaps?(e)
63
+ assert @sentences[1].overlaps?(e)
64
+ assert ! e.overlaps?(@sentences[0])
65
+ assert e.overlaps?(@sentences[1])
66
+ end
67
+ end
68
+
69
+ end
@@ -0,0 +1,42 @@
1
+ require File.join(File.expand_path(File.dirname(__FILE__)), '../..', 'test_helper.rb')
2
+ require 'rbbt/document'
3
+ require 'rbbt/document/corpus'
4
+ require 'rbbt/segment'
5
+ require 'rbbt/segment/range_index'
6
+
7
+ class TestRangeIndex < Test::Unit::TestCase
8
+ def test_segment_index
9
+ text = "This sentence mentions the TP53 gene and the CDK5R1 protein"
10
+ Document.setup(text, "TEST", "test_doc1", nil)
11
+
12
+ corpus = Document::Corpus.setup({})
13
+
14
+ corpus.add_document(text)
15
+
16
+ gene1 = "TP53"
17
+ gene1.extend Segment
18
+ gene1.offset = text.index gene1
19
+ gene1.docid = text.docid
20
+
21
+ gene2 = "CDK5R1"
22
+ gene2.extend Segment
23
+ gene2.offset = text.index gene2
24
+ gene2.docid = text.docid
25
+
26
+ gene3 = "TP53 gene"
27
+ gene3.extend Segment
28
+ gene3.offset = text.index gene1
29
+ gene3.docid = text.docid
30
+
31
+ index = Segment::RangeIndex.index([gene1, gene2, gene3], corpus)
32
+ assert_equal "CDK5R1", index[gene2.offset + 1].segment.first
33
+
34
+ TmpFile.with_file do |fwt|
35
+ index = Segment::RangeIndex.index([gene1, gene2, gene3], corpus, fwt)
36
+ assert_equal %w(CDK5R1), index[gene2.offset + 1].segment
37
+ index = Segment::RangeIndex.index([gene1, gene2, gene3], corpus, fwt)
38
+ assert_equal %w(CDK5R1), index[gene2.offset + 1].segment
39
+ end
40
+ end
41
+ end
42
+
@@ -1,10 +1,21 @@
1
- require File.join(File.expand_path(File.dirname(__FILE__)), '../../..', 'test_helper.rb')
2
- require 'rbbt/text/segment/transformed'
3
- require 'rbbt/text/segment/named_entity'
1
+ require File.join(File.expand_path(File.dirname(__FILE__)), '../..', 'test_helper.rb')
2
+ require 'rbbt/segment/transformed'
3
+ require 'rbbt/segment/named_entity'
4
4
  require 'rexml/document'
5
5
 
6
- class TestClass < Test::Unit::TestCase
7
- def test_sort
6
+ class TestTransformed < Test::Unit::TestCase
7
+
8
+ def setup
9
+ @text = <<-EOF
10
+ More recently, PPAR activators were shown to inhibit the activation of inflammatory response genes (such as IL-2, IL-6, IL-8, TNF alpha and metalloproteases) by negatively interfering with the NF-kappa B, STAT and AP-1 signalling pathways in cells of the vascular wall.
11
+ EOF
12
+
13
+ @entities = ["PPAR", "IL-2", "IL-6", "IL-8", "TNF alpha", "NF-kappa B", "AP-1", "STAT"].collect do |literal|
14
+ NamedEntity.setup(literal, :offset => @text.index(literal))
15
+ end
16
+ end
17
+
18
+ def test_transform
8
19
  text = <<-EOF
9
20
  More recently, PPAR activators were shown to inhibit the activation of inflammatory response genes (such as IL-2, IL-6, IL-8, TNF alpha and metalloproteases) by negatively interfering with the NF-kappa B, STAT and AP-1 signalling pathways in cells of the vascular wall.
10
21
  EOF
@@ -13,52 +24,25 @@ More recently, PPAR activators were shown to inhibit the activation of inflammat
13
24
  NamedEntity.setup(literal, :offset => text.index(literal))
14
25
  end
15
26
 
16
- Transformed.with_transform(text, entities, Proc.new{|e| "[" + e.upcase + "]" }) do
17
- assert text.include? "such as [IL-2]"
18
- end
27
+ Transformed.transform(text, entities, Proc.new{|e| "[" + e.upcase + "]" })
28
+ assert text.include? "such as [IL-2]"
19
29
  end
20
30
 
21
- def ___test_transform
22
- a = "This sentence mentions the TP53 gene and the CDK5 protein"
23
- original = a.dup
24
-
25
- gene1 = "TP53"
26
- gene1.extend Segment
27
- gene1.offset = a.index gene1
28
-
29
- gene2 = "CDK5"
30
- gene2.extend Segment
31
- gene2.offset = a.index gene2
32
-
33
- assert_equal gene1, a[gene1.range]
34
- assert_equal gene2, a[gene2.range]
35
-
36
- c = a.dup
37
-
38
- c[gene2.range] = "GN"
39
- assert_equal c, Transformed.transform(a,[gene2], "GN")
40
- c[gene1.range] = "GN"
41
- assert_equal c, Transformed.transform(a,[gene1], "GN")
42
-
43
- iii a.transformation_offset_differences
44
- raise
45
- assert_equal gene2.offset, a.transformation_offset_differences.first.first.first
46
- assert_equal gene1.offset, a.transformation_offset_differences.last.first.first
47
-
48
-
49
- gene3 = "GN gene"
50
- gene3.extend Segment
51
- gene3.offset = a.index gene3
52
-
53
- assert_equal gene3, a[gene3.range]
31
+ def test_with_transform
32
+ text = <<-EOF
33
+ More recently, PPAR activators were shown to inhibit the activation of inflammatory response genes (such as IL-2, IL-6, IL-8, TNF alpha and metalloproteases) by negatively interfering with the NF-kappa B, STAT and AP-1 signalling pathways in cells of the vascular wall.
34
+ EOF
54
35
 
55
- a.restore([gene3])
56
- assert_equal original, a
57
- assert_equal "TP53 gene", a[gene3.range]
36
+ entities = ["PPAR", "IL-2", "IL-6", "IL-8", "TNF alpha", "NF-kappa B", "AP-1", "STAT"].reverse.collect do |literal|
37
+ NamedEntity.setup(literal, :offset => text.index(literal))
38
+ end
58
39
 
40
+ Transformed.with_transform(text, entities, Proc.new{|e| "[" + e.upcase + "]" }) do
41
+ assert text.include? "such as [IL-2]"
42
+ end
59
43
  end
60
44
 
61
- def test_with_transform
45
+ def test_with_transform_2
62
46
  a = "This sentence mentions the TP53 gene and the CDK5R1 protein"
63
47
  original = a.dup
64
48
 
@@ -117,18 +101,47 @@ More recently, PPAR activators were shown to inhibit the activation of inflammat
117
101
  assert_equal "CDK5R1 protein", exp2
118
102
  end
119
103
 
104
+ def test_with_transform_sentences
105
+ a = "This first sentence mentions Bread. This sentence mentions the TP53 gene and the CDK5R1 protein"
106
+ original = a.dup
107
+
108
+ gene1 = "TP53"
109
+ gene1.extend NamedEntity
110
+ gene1.offset = a.index gene1
111
+
112
+ gene2 = "CDK5R1"
113
+ gene2.extend NamedEntity
114
+ gene2.offset = a.index gene2
115
+
116
+ bread = "Bread"
117
+ bread.extend NamedEntity
118
+ bread.offset = a.index bread
119
+
120
+ sentences = Segment.align(a, a.split(". "))
121
+
122
+ Transformed.with_transform(sentences[1], [gene1, gene2, bread], "GN") do
123
+ assert sentences[1].include?("GN gene and the GN protein")
124
+ end
125
+
126
+ Transformed.with_transform(sentences[0], [gene1, gene2, bread], "BR") do
127
+ assert sentences[0].include?("first sentence mentions BR")
128
+ end
129
+
130
+
131
+ end
132
+
120
133
  def test_html
121
134
  a = "This sentence mentions the TP53 gene and the CDK5R1 protein"
122
135
 
123
136
  gene1 = "TP53"
124
137
  gene1.extend NamedEntity
125
138
  gene1.offset = a.index gene1
126
- gene1.type = "Gene"
139
+ gene1.entity_type = "Gene"
127
140
 
128
141
  gene2 = "CDK5R1"
129
142
  gene2.extend NamedEntity
130
143
  gene2.offset = a.index gene2
131
- gene2.type = "Protein"
144
+ gene2.entity_type = "Protein"
132
145
 
133
146
  Transformed.with_transform(a, [gene1,gene2], Proc.new{|e| e.html}) do
134
147
  assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene'>TP53</span> gene and the <span class='Entity' attr-entity-type='Protein'>CDK5R1</span> protein", a
@@ -143,13 +156,13 @@ More recently, PPAR activators were shown to inhibit the activation of inflammat
143
156
  gene1.extend NamedEntity
144
157
  gene1.offset = a.index gene1
145
158
  gene1.offset += 10
146
- gene1.type = "Gene"
159
+ gene1.entity_type = "Gene"
147
160
 
148
161
  gene2 = "CDK5R1"
149
162
  gene2.extend NamedEntity
150
163
  gene2.offset = a.index gene2
151
164
  gene2.offset += 10
152
- gene2.type = "Protein"
165
+ gene2.entity_type = "Protein"
153
166
 
154
167
  Transformed.with_transform(a, [gene1,gene2], Proc.new{|e| e.html}) do
155
168
  assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene'>TP53</span> gene and the <span class='Entity' attr-entity-type='Protein'>CDK5R1</span> protein", a
@@ -162,12 +175,12 @@ More recently, PPAR activators were shown to inhibit the activation of inflammat
162
175
  gene1 = "TP53"
163
176
  gene1.extend NamedEntity
164
177
  gene1.offset = a.index gene1
165
- gene1.type = "Gene"
178
+ gene1.entity_type = "Gene"
166
179
 
167
180
  gene2 = "TP53 gene"
168
181
  gene2.extend NamedEntity
169
182
  gene2.offset = a.index gene2
170
- gene2.type = "Expanded Gene"
183
+ gene2.entity_type = "Expanded Gene"
171
184
 
172
185
  assert_equal [gene1], Segment.overlaps(Segment.sort([gene1,gene2]))
173
186
 
@@ -379,5 +392,46 @@ This is another sentence. Among the nonstructural proteins, the leader protein (
379
392
  end
380
393
  end
381
394
  end
395
+
396
+ def ___test_transform
397
+ a = "This sentence mentions the TP53 gene and the CDK5 protein"
398
+ original = a.dup
399
+
400
+ gene1 = "TP53"
401
+ gene1.extend Segment
402
+ gene1.offset = a.index gene1
403
+
404
+ gene2 = "CDK5"
405
+ gene2.extend Segment
406
+ gene2.offset = a.index gene2
407
+
408
+ assert_equal gene1, a[gene1.range]
409
+ assert_equal gene2, a[gene2.range]
410
+
411
+ c = a.dup
412
+
413
+ c[gene2.range] = "GN"
414
+ assert_equal c, Transformed.transform(a,[gene2], "GN")
415
+ c[gene1.range] = "GN"
416
+ assert_equal c, Transformed.transform(a,[gene1], "GN")
417
+
418
+ iii a.transformation_offset_differences
419
+ raise
420
+ assert_equal gene2.offset, a.transformation_offset_differences.first.first.first
421
+ assert_equal gene1.offset, a.transformation_offset_differences.last.first.first
422
+
423
+
424
+ gene3 = "GN gene"
425
+ gene3.extend Segment
426
+ gene3.offset = a.index gene3
427
+
428
+ assert_equal gene3, a[gene3.range]
429
+
430
+ a.restore([gene3])
431
+ assert_equal original, a
432
+ assert_equal "TP53 gene", a[gene3.range]
433
+
434
+ end
435
+
382
436
  end
383
437
 
@@ -0,0 +1,14 @@
1
+ require File.join(File.expand_path(File.dirname(__FILE__)), '..', 'test_helper.rb')
2
+ require 'rbbt/document'
3
+
4
+ class TestDocument < Test::Unit::TestCase
5
+
6
+ def test_docid
7
+ text = "This is a document"
8
+ Document.setup(text, "TEST", "test_doc1", nil)
9
+
10
+ assert_equal ["TEST", "test_doc1", nil, Misc.digest(text)] * ":", text.docid
11
+ end
12
+
13
+ end
14
+
@@ -0,0 +1,182 @@
1
+ require File.join(File.expand_path(File.dirname(__FILE__)), '..', 'test_helper.rb')
2
+ require 'rbbt/document'
3
+ require 'rbbt/document/corpus'
4
+ require 'rbbt/segment'
5
+
6
+ class TestSegment < Test::Unit::TestCase
7
+ def test_segment
8
+ text = "This is a document"
9
+ Document.setup(text, "TEST", "test_doc1", nil)
10
+
11
+ segment = Segment.setup("is", :offset => text.index("is"), :docid => text.docid)
12
+
13
+ assert_equal text.docid + ":" + segment.offset.to_s + ".." + segment.eend.to_s, segment.segid
14
+ end
15
+
16
+ def test_segid
17
+ text = "This is a document"
18
+ Document.setup(text, "TEST", "test_doc1", nil)
19
+
20
+ corpus = Document::Corpus.setup({})
21
+
22
+ corpus.add_document(text)
23
+
24
+ segment = Segment.setup("is", :offset => text.index("is"), :docid => text.docid)
25
+
26
+ segid = segment.segid(corpus)
27
+
28
+ segment = segid.segment
29
+ assert_equal "is", segment
30
+ end
31
+
32
+ def test_info
33
+ segment = "test"
34
+ segment.extend Segment
35
+ segment.offset = 10
36
+ assert segment.info.include? :offset
37
+ end
38
+
39
+ def test_sort
40
+ text = "This sentence mentions the TP53 gene and the CDK5R1 protein"
41
+ Document.setup(text, "TEST", "test_doc1", nil)
42
+
43
+ corpus = Document::Corpus.setup({})
44
+
45
+ corpus.add_document(text)
46
+
47
+ gene1 = "TP53"
48
+ gene1.extend Segment
49
+ gene1.offset = text.index gene1
50
+ gene1.docid = text.docid
51
+
52
+ gene2 = "CDK5R1"
53
+ gene2.extend Segment
54
+ gene2.offset = text.index gene2
55
+ gene2.docid = text.docid
56
+
57
+ assert_equal [gene1,gene2], Segment.sort([gene2,gene1])
58
+
59
+ assert_equal [gene1,gene2], Segment.sort([gene2.segid(corpus),gene1.segid(corpus)]).collect{|segid| segid.segment}
60
+ end
61
+
62
+ def test_clean_sort
63
+ text = "This sentence mentions the TP53 gene and the CDK5R1 protein"
64
+ Document.setup(text, "TEST", "test_doc1", nil)
65
+
66
+ corpus = Document::Corpus.setup({})
67
+
68
+ corpus.add_document(text)
69
+
70
+ gene1 = "TP53"
71
+ gene1.extend Segment
72
+ gene1.offset = text.index gene1
73
+ gene1.docid = text.docid
74
+
75
+ gene2 = "CDK5R1"
76
+ gene2.extend Segment
77
+ gene2.offset = text.index gene2
78
+ gene2.docid = text.docid
79
+
80
+ gene3 = "TP53 gene"
81
+ gene3.extend Segment
82
+ gene3.offset = text.index gene1
83
+ gene3.docid = text.docid
84
+
85
+ assert_equal [gene1,gene2], Segment.sort([gene2,gene1])
86
+
87
+ assert_equal [gene3,gene2], Segment.clean_sort([gene2,gene1,gene3])
88
+ end
89
+
90
+ def test_split
91
+ text = "This sentence mentions the TP53 gene and the CDK5R1 protein"
92
+ Document.setup(text, "TEST", "test_doc1", nil)
93
+
94
+ corpus = Document::Corpus.setup({})
95
+
96
+ corpus.add_document(text)
97
+
98
+ gene1 = "TP53"
99
+ gene1.extend Segment
100
+ gene1.offset = text.index gene1
101
+ gene1.docid = text.docid
102
+
103
+ gene2 = "CDK5R1"
104
+ gene2.extend Segment
105
+ gene2.offset = text.index gene2
106
+ gene2.docid = text.docid
107
+
108
+ gene3 = "TP53 gene"
109
+ gene3.extend Segment
110
+ gene3.offset = text.index gene1
111
+ gene3.docid = text.docid
112
+
113
+ assert_equal ["This sentence mentions the ", gene3, " and the ", gene2, " protein"], Segment.split(text, [gene2, gene1, gene3])
114
+
115
+ assert_equal ["This sentence mentions the ", gene3, " and the ", gene2, " protein"], Segment.split(text, [gene2, gene1, gene3].collect{|s| s.segid})
116
+ end
117
+
118
+
119
+ def test_align
120
+ text =<<-EOF
121
+ Atypical teratoid/rhabdoid tumors (AT/RTs) are highly aggressive brain tumors of early childhood poorly responding to therapy.
122
+ EOF
123
+
124
+ parts = text.split(/\W/)
125
+ Segment.align(text, parts)
126
+
127
+ assert_equal "Atypical teratoid/".length, parts.select{|s| s == "rhabdoid"}.first.offset
128
+
129
+ Document.setup(text, "TEST", "test_doc1", nil)
130
+
131
+ parts = text.split(/\W/)
132
+ Segment.align(text, parts)
133
+
134
+ assert_equal parts.first.docid, text.docid
135
+ end
136
+
137
+ def test_segment_index
138
+ text = "This sentence mentions the TP53 gene and the CDK5R1 protein"
139
+ Document.setup(text, "TEST", "test_doc1", nil)
140
+
141
+ corpus = Document::Corpus.setup({})
142
+
143
+ corpus.add_document(text)
144
+
145
+ gene1 = "TP53"
146
+ gene1.extend Segment
147
+ gene1.offset = text.index gene1
148
+ gene1.docid = text.docid
149
+
150
+ gene2 = "CDK5R1"
151
+ gene2.extend Segment
152
+ gene2.offset = text.index gene2
153
+ gene2.docid = text.docid
154
+
155
+ gene3 = "TP53 gene"
156
+ gene3.extend Segment
157
+ gene3.offset = text.index gene1
158
+ gene3.docid = text.docid
159
+
160
+ index = Segment.index([gene1, gene2, gene3], corpus)
161
+ assert_equal "CDK5R1", index[gene2.offset + 1].segment.first
162
+
163
+ TmpFile.with_file do |fwt|
164
+ index = Segment.index([gene1, gene2, gene3], corpus, fwt)
165
+ assert_equal %w(CDK5R1), index[gene2.offset + 1].segment
166
+ index = Segment.index([gene1, gene2, gene3], corpus, fwt)
167
+ assert_equal %w(CDK5R1), index[gene2.offset + 1].segment
168
+ end
169
+
170
+ index = Segment.index([gene1, gene2, gene3].collect{|s| s.segid}, corpus)
171
+ assert_equal "CDK5R1", index[gene2.offset + 1].segment.first
172
+
173
+ TmpFile.with_file do |fwt|
174
+ index = Segment.index([gene1, gene2, gene3].collect{|s| s.segid}, corpus, fwt)
175
+ assert_equal %w(CDK5R1), index[gene2.offset + 1].segment
176
+ index = Segment.index([gene1, gene2, gene3].collect{|s| s.segid}, corpus, fwt)
177
+ assert_equal %w(CDK5R1), index[gene2.offset + 1].segment
178
+ end
179
+ end
180
+
181
+ end
182
+