rbbt-text 0.2.1 → 0.5.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (57) hide show
  1. data/bin/get_ppis.rb +52 -0
  2. data/lib/rbbt/bow/dictionary.rb +9 -9
  3. data/lib/rbbt/bow/misc.rb +86 -2
  4. data/lib/rbbt/corpus/corpus.rb +55 -0
  5. data/lib/rbbt/corpus/document.rb +289 -0
  6. data/lib/rbbt/corpus/document_repo.rb +115 -0
  7. data/lib/rbbt/corpus/sources/pubmed.rb +26 -0
  8. data/lib/rbbt/ner/NER.rb +7 -5
  9. data/lib/rbbt/ner/abner.rb +13 -2
  10. data/lib/rbbt/ner/annotations.rb +182 -51
  11. data/lib/rbbt/ner/annotations/annotated.rb +15 -0
  12. data/lib/rbbt/ner/annotations/named_entity.rb +37 -0
  13. data/lib/rbbt/ner/annotations/relations.rb +25 -0
  14. data/lib/rbbt/ner/annotations/token.rb +28 -0
  15. data/lib/rbbt/ner/annotations/transformed.rb +170 -0
  16. data/lib/rbbt/ner/banner.rb +8 -5
  17. data/lib/rbbt/ner/chemical_tagger.rb +34 -0
  18. data/lib/rbbt/ner/ngram_prefix_dictionary.rb +136 -0
  19. data/lib/rbbt/ner/oscar3.rb +1 -1
  20. data/lib/rbbt/ner/oscar4.rb +41 -0
  21. data/lib/rbbt/ner/patterns.rb +132 -0
  22. data/lib/rbbt/ner/rnorm.rb +141 -0
  23. data/lib/rbbt/ner/rnorm/cue_index.rb +80 -0
  24. data/lib/rbbt/ner/rnorm/tokens.rb +218 -0
  25. data/lib/rbbt/ner/token_trieNER.rb +185 -51
  26. data/lib/rbbt/nlp/genia/sentence_splitter.rb +214 -0
  27. data/lib/rbbt/nlp/nlp.rb +235 -0
  28. data/share/install/software/ABNER +0 -4
  29. data/share/install/software/ChemicalTagger +81 -0
  30. data/share/install/software/Gdep +115 -0
  31. data/share/install/software/Geniass +118 -0
  32. data/share/install/software/OSCAR4 +16 -0
  33. data/share/install/software/StanfordParser +15 -0
  34. data/share/patterns/drug_induce_disease +22 -0
  35. data/share/rnorm/cue_default +10 -0
  36. data/share/rnorm/tokens_default +86 -0
  37. data/share/{stopwords → wordlists/stopwords} +0 -0
  38. data/test/rbbt/bow/test_bow.rb +1 -1
  39. data/test/rbbt/bow/test_dictionary.rb +1 -1
  40. data/test/rbbt/bow/test_misc.rb +1 -1
  41. data/test/rbbt/corpus/test_corpus.rb +99 -0
  42. data/test/rbbt/corpus/test_document.rb +222 -0
  43. data/test/rbbt/ner/annotations/test_named_entity.rb +14 -0
  44. data/test/rbbt/ner/annotations/test_transformed.rb +175 -0
  45. data/test/rbbt/ner/test_abner.rb +1 -1
  46. data/test/rbbt/ner/test_annotations.rb +64 -2
  47. data/test/rbbt/ner/test_banner.rb +1 -1
  48. data/test/rbbt/ner/test_chemical_tagger.rb +56 -0
  49. data/test/rbbt/ner/test_ngram_prefix_dictionary.rb +20 -0
  50. data/test/rbbt/ner/{test_oscar3.rb → test_oscar4.rb} +12 -13
  51. data/test/rbbt/ner/test_patterns.rb +66 -0
  52. data/test/rbbt/ner/test_regexpNER.rb +1 -1
  53. data/test/rbbt/ner/test_rnorm.rb +47 -0
  54. data/test/rbbt/ner/test_token_trieNER.rb +60 -35
  55. data/test/rbbt/nlp/test_nlp.rb +88 -0
  56. data/test/test_helper.rb +20 -0
  57. metadata +93 -20
@@ -0,0 +1,14 @@
1
+ require File.join(File.expand_path(File.dirname(__FILE__)), '../../..', 'test_helper.rb')
2
+ require 'rbbt/ner/annotations'
3
+ require 'rbbt/ner/annotations/named_entity'
4
+ require 'rbbt/ner/annotations/transformed'
5
+
6
+ class TestClass < Test::Unit::TestCase
7
+ def test_info
8
+ a = "test"
9
+ a.extend NamedEntity
10
+ assert(! a.info.keys.include?("offset"))
11
+ a.offset = 10
12
+ assert a.info.keys.include? "offset"
13
+ end
14
+ end
@@ -0,0 +1,175 @@
1
+ require File.join(File.expand_path(File.dirname(__FILE__)), '../../..', 'test_helper.rb')
2
+ require 'rbbt/ner/annotations/transformed'
3
+ require 'rbbt/ner/annotations/named_entity'
4
+
5
+ class TestClass < Test::Unit::TestCase
6
+ def test_transform
7
+ a = "This sentence mentions the TP53 gene and the CDK5 protein"
8
+ original = a.dup
9
+
10
+ gene1 = "TP53"
11
+ gene1.extend NamedEntity
12
+ gene1.offset = a.index gene1
13
+
14
+ gene2 = "CDK5"
15
+ gene2.extend NamedEntity
16
+ gene2.offset = a.index gene2
17
+
18
+ assert_equal gene1, a[gene1.range]
19
+ assert_equal gene2, a[gene2.range]
20
+
21
+ c = a.dup
22
+
23
+ c[gene2.range] = "GN"
24
+ assert_equal c, Transformed.transform(a,[gene2], "GN")
25
+ c[gene1.range] = "GN"
26
+ assert_equal c, Transformed.transform(a,[gene1], "GN")
27
+
28
+ assert_equal gene2.offset, a.transformation_offset_differences.first.first.first
29
+ assert_equal gene1.offset, a.transformation_offset_differences.last.first.first
30
+
31
+
32
+ gene3 = "GN gene"
33
+ gene3.extend NamedEntity
34
+ gene3.offset = a.index gene3
35
+
36
+ assert_equal gene3, a[gene3.range]
37
+
38
+ a.restore([gene3])
39
+ assert_equal original, a
40
+ assert_equal "TP53 gene", a[gene3.range]
41
+
42
+ end
43
+
44
+ def test_with_transform
45
+ a = "This sentence mentions the TP53 gene and the CDK5R1 protein"
46
+ original = a.dup
47
+
48
+ gene1 = "TP53"
49
+ gene1.extend NamedEntity
50
+ gene1.offset = a.index gene1
51
+
52
+ gene2 = "CDK5R1"
53
+ gene2.extend NamedEntity
54
+ gene2.offset = a.index gene2
55
+
56
+ Transformed.with_transform(a, [gene1], "GN") do
57
+ assert_equal original.sub("TP53", 'GN'), a
58
+ end
59
+ assert_equal original, a
60
+
61
+ Transformed.with_transform(a, [gene1,gene2], "GN") do
62
+ assert_equal original.gsub(/TP53|CDK5R1/, 'GN'), a
63
+ end
64
+ assert_equal original, a
65
+
66
+ Transformed.with_transform(a, [gene1], "GN") do
67
+ Transformed.with_transform(a, [gene2], "GN") do
68
+ assert_equal original.gsub(/TP53|CDK5R1/, 'GN'), a
69
+ end
70
+ assert_equal original.gsub(/TP53/, 'GN'), a
71
+ end
72
+ assert_equal original, a
73
+
74
+ exp1, exp2 = nil, nil
75
+ expanded_genes = Transformed.with_transform(a, [gene1,gene2], "GN") do
76
+ exp1 = "GN gene"
77
+ exp1.extend NamedEntity
78
+ exp1.offset = a.index exp1
79
+ exp2 = "GN protein"
80
+ exp2.extend NamedEntity
81
+ exp2.offset = a.index exp2
82
+
83
+ [exp1, exp2]
84
+ end
85
+ assert_equal original, a
86
+
87
+ assert_equal "TP53 gene", exp1
88
+ assert_equal "CDK5R1 protein", exp2
89
+ end
90
+
91
+ def test_html
92
+ a = "This sentence mentions the TP53 gene and the CDK5R1 protein"
93
+
94
+ gene1 = "TP53"
95
+ gene1.extend NamedEntity
96
+ gene1.offset = a.index gene1
97
+ gene1.type = "Gene"
98
+
99
+ gene2 = "CDK5R1"
100
+ gene2.extend NamedEntity
101
+ gene2.offset = a.index gene2
102
+ gene2.type = "Protein"
103
+
104
+ Transformed.with_transform(a, [gene1,gene2], Proc.new{|e| e.html}) do
105
+ assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene'>TP53</span> gene and the <span class='Entity' attr-entity-type='Protein'>CDK5R1</span> protein", a
106
+ end
107
+ end
108
+
109
+ def test_html_with_offset
110
+ a = "This sentence mentions the TP53 gene and the CDK5R1 protein"
111
+ Segment.annotate(a, 10)
112
+
113
+ gene1 = "TP53"
114
+ gene1.extend NamedEntity
115
+ gene1.offset = a.index gene1
116
+ gene1.offset += 10
117
+ gene1.type = "Gene"
118
+
119
+ gene2 = "CDK5R1"
120
+ gene2.extend NamedEntity
121
+ gene2.offset = a.index gene2
122
+ gene2.offset += 10
123
+ gene2.type = "Protein"
124
+
125
+ Transformed.with_transform(a, [gene1,gene2], Proc.new{|e| e.html}) do
126
+ assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene'>TP53</span> gene and the <span class='Entity' attr-entity-type='Protein'>CDK5R1</span> protein", a
127
+ end
128
+ end
129
+
130
+ def test_overlap
131
+ a = "This sentence mentions the TP53 gene and the CDK5R1 protein"
132
+
133
+ gene1 = "TP53"
134
+ gene1.extend NamedEntity
135
+ gene1.offset = a.index gene1
136
+ gene1.type = "Gene"
137
+
138
+ gene2 = "TP53 gene"
139
+ gene2.extend NamedEntity
140
+ gene2.offset = a.index gene2
141
+ gene2.type = "Expanded Gene"
142
+
143
+ assert_equal [gene1], Segment.overlaps(Segment.sort([gene1,gene2]))
144
+
145
+ Transformed.with_transform(a, [gene1], Proc.new{|e| e.html}) do
146
+ assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene'>TP53</span> gene and the CDK5R1 protein", a
147
+ Transformed.with_transform(a, [gene2], Proc.new{|e| e.html}) do
148
+ assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Expanded Gene'><span class='Entity' attr-entity-type='Gene'>TP53</span> gene</span> and the CDK5R1 protein", a
149
+ end
150
+ end
151
+ end
152
+
153
+ def test_cascade_with_overlap_ignored
154
+ a = "This sentence mentions the HDL-C gene and the CDK5R1 protein"
155
+
156
+ gene1 = "HDL-C"
157
+ gene1.extend NamedEntity
158
+ gene1.offset = a.index gene1
159
+ gene1.type = "Gene"
160
+
161
+ gene2 = "-"
162
+ gene2.extend NamedEntity
163
+ gene2.offset = a.index gene2
164
+ gene2.type = "Dash"
165
+
166
+ Transformed.with_transform(a, [gene1], Proc.new{|e| e.html}) do
167
+ one = a.dup
168
+ Transformed.with_transform(a, [gene2], Proc.new{|e| e.html}) do
169
+ assert_equal one, a
170
+ end
171
+ end
172
+
173
+ end
174
+ end
175
+
@@ -1,4 +1,4 @@
1
- require File.dirname(__FILE__) + '/../../test_helper'
1
+ require File.join(File.expand_path(File.dirname(__FILE__)), '../../test_helper.rb')
2
2
  require 'rbbt/ner/abner'
3
3
  require 'test/unit'
4
4
 
@@ -1,8 +1,70 @@
1
1
  require File.join(File.expand_path(File.dirname(__FILE__)), '../..', 'test_helper.rb')
2
+ require 'rbbt/ner/annotations'
3
+ require 'rbbt/ner/annotations/named_entity'
4
+ require 'rbbt/ner/annotations/transformed'
2
5
 
3
6
  class TestClass < Test::Unit::TestCase
4
- def test_true
5
- assert true
7
+ def test_info
8
+ a = "test"
9
+ a.extend NamedEntity
10
+ a.type = "type"
11
+ assert a.info.keys.include? "type"
12
+ end
13
+
14
+ def test_segment_type
15
+ a = "test"
16
+ a.extend NamedEntity
17
+ assert a.segment_types.include? "NamedEntity"
18
+ end
19
+
20
+ def test_align
21
+ text =<<-EOF
22
+ Atypical teratoid/rhabdoid tumors (AT/RTs) are highly aggressive brain tumors of early childhood poorly responding to therapy.
23
+ EOF
24
+
25
+ parts = text.split(/\W/)
26
+ Segment.align(text, parts)
27
+
28
+ assert_equal "Atypical teratoid/".length, parts.select{|s| s == "rhabdoid"}.first.offset
29
+ end
30
+
31
+ def test_sort
32
+ a = "This sentence mentions the TP53 gene and the CDK5R1 protein"
33
+
34
+ gene1 = "TP53"
35
+ gene1.extend NamedEntity
36
+ gene1.offset = a.index gene1
37
+ gene1.type = "Gene"
38
+
39
+ gene2 = "CDK5R1"
40
+ gene2.extend NamedEntity
41
+ gene2.offset = a.index gene2
42
+ gene2.type = "Gene"
43
+
44
+ assert_equal [gene1,gene2], Segment.sort([gene2,gene1])
45
+
46
+ end
47
+
48
+ def test_clean_sort
49
+ a = "This sentence mentions the TP53 gene and the CDK5R1 protein"
50
+
51
+ gene1 = "TP53"
52
+ gene1.extend NamedEntity
53
+ gene1.offset = a.index gene1
54
+ gene1.type = "Gene"
55
+
56
+ gene2 = "CDK5R1"
57
+ gene2.extend NamedEntity
58
+ gene2.offset = a.index gene2
59
+ gene2.type = "Gene"
60
+
61
+ gene3 = "TP53 gene"
62
+ gene3.extend NamedEntity
63
+ gene3.offset = a.index gene3
64
+ gene3.type = "Gene"
65
+
66
+ assert_equal [gene3,gene2], Segment.clean_sort([gene2,gene1,gene3])
67
+
6
68
  end
7
69
  end
8
70
 
@@ -1,4 +1,4 @@
1
- require File.dirname(__FILE__) + '/../../test_helper'
1
+ require File.join(File.expand_path(File.dirname(__FILE__)), '../../test_helper.rb')
2
2
  require 'rbbt/ner/banner'
3
3
  require 'test/unit'
4
4
 
@@ -0,0 +1,56 @@
1
+ require File.join(File.expand_path(File.dirname(__FILE__)), '../../test_helper.rb')
2
+ require 'rbbt/ner/chemical_tagger'
3
+ require 'rbbt/util/tmpfile'
4
+ require 'test/unit'
5
+
6
+ class TestChemicalTagger < Test::Unit::TestCase
7
+
8
+ def test_match
9
+ begin
10
+ ner = ChemicalTagger.new
11
+ str = "Alternatively, rearrangement of O-(w-haloalkyl)esters 34 of 2-carboethoxy-N-hydroxypyridine-2-selone affords azonianaphthalenium halides 37 in 79% yield"
12
+ mentions = ner.match(str, "CM", false)
13
+
14
+ good_mentions = ["2-carboethoxy-N-hydroxypyridine-2-selone", "O-(w-haloalkyl)esters"]
15
+
16
+ good_mentions.each{|mention|
17
+ assert(mentions.include? mention)
18
+ }
19
+ rescue
20
+ puts $!.message
21
+ puts $!.backtrace
22
+ end
23
+ end
24
+
25
+ def test_ranges
26
+ begin
27
+ ner = ChemicalTagger.new
28
+ str =<<-EOF
29
+ This sentence talks about 2-carboethoxy-N-hydroxypyridine-2-selone.
30
+ This sentence talks about 2-carboethoxy-N-hydroxypyridine-2-selone.
31
+ This sentence talks about 2-carboethoxy-N-hydroxypyridine-2-selone.
32
+ This sentence talks about 2-carboethoxy-N-hydroxypyridine-2-selone.
33
+ This otherone talks about O-(w-haloalkyl)esters.
34
+ This otherone talks about O-(w-haloalkyl)esters.
35
+ This otherone talks about O-(w-haloalkyl)esters.
36
+
37
+ This otherone talks about O-(w-haloalkyl)esters.
38
+ This otherone talks about O-(w-haloalkyl)esters.
39
+ EOF
40
+
41
+ mentions = ner.match(str, "CM", false)
42
+
43
+ str_original = str.dup
44
+ mentions.each do |mention|
45
+ str[mention.range] = mention
46
+ end
47
+
48
+ assert_equal str_original, str
49
+
50
+ rescue
51
+ puts $!.message
52
+ puts $!.backtrace
53
+ end
54
+ end
55
+
56
+ end
@@ -0,0 +1,20 @@
1
+ require File.join(File.expand_path(File.dirname(__FILE__)), '../..', 'test_helper.rb')
2
+ require 'rbbt/ner/ngram_prefix_dictionary'
3
+ require 'rbbt/util/tmpfile'
4
+
5
+ class TestNGramPrefixDictionary < Test::Unit::TestCase
6
+ def test_match
7
+ lexicon =<<-EOF
8
+ C1;aa;AA;bb b
9
+ C2;11;22;3 3;bb
10
+ EOF
11
+
12
+ TmpFile.with_file(lexicon) do |file|
13
+ index = NGramPrefixDictionary.new(TSV.new(file, :flat, :sep => ';'), "test")
14
+
15
+ matches = index.match(' asdfa dsf asdf aa asdfasdf ')
16
+ assert matches.select{|m| m.code.include? 'C1'}.any?
17
+ end
18
+ end
19
+ end
20
+
@@ -1,18 +1,17 @@
1
- require File.dirname(__FILE__) + '/../../test_helper'
2
- require 'rbbt/ner/oscar3'
1
+ require File.join(File.expand_path(File.dirname(__FILE__)), '../../test_helper.rb')
2
+ require 'rbbt/ner/oscar4'
3
3
  require 'rbbt/util/tmpfile'
4
4
  require 'test/unit'
5
5
 
6
- class TestOSCAR3 < Test::Unit::TestCase
7
-
6
+ class TestOSCAR4 < Test::Unit::TestCase
8
7
 
9
8
  def test_match
10
9
  begin
11
- ner = OSCAR3.new
12
- str = "Alternatively, rearrangement of O-(ω-haloalkyl)esters 34 of 2-carboethoxy-N-hydroxypyridine-2-selone affords azonianaphthalenium halides 37 in 79% yield"
10
+ ner = OSCAR4.new
11
+ str = "Alternatively, rearrangement of O-(w-haloalkyl)esters 34 of 2-carboethoxy-N-hydroxypyridine-2-selone affords azonianaphthalenium halides 37 in 79% yield"
13
12
 
14
13
  mentions = ner.match(str, "CM", false)
15
- good_mentions = ["2-carboethoxy-N-hydroxypyridine-2-selone", "O-(ω-haloalkyl)esters"]
14
+ good_mentions = ["2-carboethoxy-N-hydroxypyridine-2-selone", "O-(w-haloalkyl)esters"]
16
15
 
17
16
  good_mentions.each{|mention|
18
17
  assert(mentions.include? mention)
@@ -25,18 +24,18 @@ class TestOSCAR3 < Test::Unit::TestCase
25
24
 
26
25
  def test_ranges
27
26
  begin
28
- ner = OSCAR3.new
27
+ ner = OSCAR4.new
29
28
  str =<<-EOF
30
29
  This sentence talks about 2-carboethoxy-N-hydroxypyridine-2-selone.
31
30
  This sentence talks about 2-carboethoxy-N-hydroxypyridine-2-selone.
32
31
  This sentence talks about 2-carboethoxy-N-hydroxypyridine-2-selone.
33
32
  This sentence talks about 2-carboethoxy-N-hydroxypyridine-2-selone.
34
- This otherone talks about O-(ω-haloalkyl)esters.
35
- This otherone talks about O-(ω-haloalkyl)esters.
36
- This otherone talks about O-(ω-haloalkyl)esters.
33
+ This otherone talks about O-(w-haloalkyl)esters.
34
+ This otherone talks about O-(w-haloalkyl)esters.
35
+ This otherone talks about O-(w-haloalkyl)esters.
37
36
 
38
- This otherone talks about O-(ω-haloalkyl)esters.
39
- This otherone talks about O-(ω-haloalkyl)esters.
37
+ This otherone talks about O-(w-haloalkyl)esters.
38
+ This otherone talks about O-(w-haloalkyl)esters.
40
39
  EOF
41
40
 
42
41
 
@@ -0,0 +1,66 @@
1
+ require File.join(File.expand_path(File.dirname(__FILE__)), '../../test_helper.rb')
2
+ require 'rbbt/ner/patterns'
3
+
4
+ class TestPatternRelExt < Test::Unit::TestCase
5
+ def test_simple_pattern
6
+ text = "Experiments have shown that TP53 interacts with CDK5 under certain conditions"
7
+
8
+ gene1 = "TP53"
9
+ NamedEntity.annotate(gene1, text.index(gene1), "Gene")
10
+
11
+ gene2 = "CDK5"
12
+ NamedEntity.annotate(gene2, text.index(gene2), "Gene")
13
+
14
+ interaction = "interacts"
15
+ NamedEntity.annotate(interaction, text.index(interaction), "Interaction")
16
+
17
+ Annotated.annotate(text, [gene1, gene2, interaction])
18
+
19
+ assert_equal "TP53 interacts with CDK5", PatternRelExt.simple_pattern(text, "GENE INTERACTION with GENE").first
20
+
21
+ end
22
+
23
+ def test_chunk_pattern
24
+ text = "Experiments have shown that TP53 found in cultivated cells interacts with CDK5 under certain conditions"
25
+
26
+ gene1 = "TP53"
27
+ NamedEntity.annotate(gene1, text.index(gene1), "Gene")
28
+
29
+ gene2 = "CDK5"
30
+ NamedEntity.annotate(gene2, text.index(gene2), "Gene")
31
+
32
+ interaction = "interacts"
33
+ NamedEntity.annotate(interaction, text.index(interaction), "Interaction")
34
+
35
+ Annotated.annotate(text, {:entities => [gene1, gene2, interaction]})
36
+
37
+ assert_equal "TP53 found in cultivated cells interacts with CDK5",
38
+ PatternRelExt.new("NP[entity:Gene] VP[stem:interacts] with NP[entity:Gene]").match_sentences([text]).first.first
39
+
40
+ assert_equal "TP53 found in cultivated cells interacts with CDK5",
41
+ PatternRelExt.new(["NP[entity:Gene] VP[stem:interacts] with NP[entity:Gene]"]).match_sentences([text]).first.first
42
+ end
43
+
44
+ def test_chunk_pattern
45
+ text = "There is a concern with the use of thiazolidinediones in patients with an increased risk of colon cancer (e.g., familial colon polyposis)."
46
+
47
+ drug = "thiazolidinediones"
48
+ NamedEntity.annotate(drug, text.index(drug), "Chemical Mention")
49
+
50
+ disease = "colon cancer"
51
+ NamedEntity.annotate(disease, text.index(disease), "disease")
52
+
53
+ Annotated.annotate(text, {:entitites => [drug, disease]})
54
+
55
+ assert_equal "thiazolidinediones in patients with an increased risk of colon cancer",
56
+ PatternRelExt.new("NP[entity:Chemical Mention] NP[stem:risk] NP[entity:disease]").match_sentences([text]).first.first
57
+
58
+ end
59
+
60
+
61
+ def test_entities_with_spaces
62
+ PatternRelExt.new("NP[entity:Gene Name]").token_trie
63
+ end
64
+
65
+
66
+ end