rbbt-text 0.2.1 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57)
  1. data/bin/get_ppis.rb +52 -0
  2. data/lib/rbbt/bow/dictionary.rb +9 -9
  3. data/lib/rbbt/bow/misc.rb +86 -2
  4. data/lib/rbbt/corpus/corpus.rb +55 -0
  5. data/lib/rbbt/corpus/document.rb +289 -0
  6. data/lib/rbbt/corpus/document_repo.rb +115 -0
  7. data/lib/rbbt/corpus/sources/pubmed.rb +26 -0
  8. data/lib/rbbt/ner/NER.rb +7 -5
  9. data/lib/rbbt/ner/abner.rb +13 -2
  10. data/lib/rbbt/ner/annotations.rb +182 -51
  11. data/lib/rbbt/ner/annotations/annotated.rb +15 -0
  12. data/lib/rbbt/ner/annotations/named_entity.rb +37 -0
  13. data/lib/rbbt/ner/annotations/relations.rb +25 -0
  14. data/lib/rbbt/ner/annotations/token.rb +28 -0
  15. data/lib/rbbt/ner/annotations/transformed.rb +170 -0
  16. data/lib/rbbt/ner/banner.rb +8 -5
  17. data/lib/rbbt/ner/chemical_tagger.rb +34 -0
  18. data/lib/rbbt/ner/ngram_prefix_dictionary.rb +136 -0
  19. data/lib/rbbt/ner/oscar3.rb +1 -1
  20. data/lib/rbbt/ner/oscar4.rb +41 -0
  21. data/lib/rbbt/ner/patterns.rb +132 -0
  22. data/lib/rbbt/ner/rnorm.rb +141 -0
  23. data/lib/rbbt/ner/rnorm/cue_index.rb +80 -0
  24. data/lib/rbbt/ner/rnorm/tokens.rb +218 -0
  25. data/lib/rbbt/ner/token_trieNER.rb +185 -51
  26. data/lib/rbbt/nlp/genia/sentence_splitter.rb +214 -0
  27. data/lib/rbbt/nlp/nlp.rb +235 -0
  28. data/share/install/software/ABNER +0 -4
  29. data/share/install/software/ChemicalTagger +81 -0
  30. data/share/install/software/Gdep +115 -0
  31. data/share/install/software/Geniass +118 -0
  32. data/share/install/software/OSCAR4 +16 -0
  33. data/share/install/software/StanfordParser +15 -0
  34. data/share/patterns/drug_induce_disease +22 -0
  35. data/share/rnorm/cue_default +10 -0
  36. data/share/rnorm/tokens_default +86 -0
  37. data/share/{stopwords → wordlists/stopwords} +0 -0
  38. data/test/rbbt/bow/test_bow.rb +1 -1
  39. data/test/rbbt/bow/test_dictionary.rb +1 -1
  40. data/test/rbbt/bow/test_misc.rb +1 -1
  41. data/test/rbbt/corpus/test_corpus.rb +99 -0
  42. data/test/rbbt/corpus/test_document.rb +222 -0
  43. data/test/rbbt/ner/annotations/test_named_entity.rb +14 -0
  44. data/test/rbbt/ner/annotations/test_transformed.rb +175 -0
  45. data/test/rbbt/ner/test_abner.rb +1 -1
  46. data/test/rbbt/ner/test_annotations.rb +64 -2
  47. data/test/rbbt/ner/test_banner.rb +1 -1
  48. data/test/rbbt/ner/test_chemical_tagger.rb +56 -0
  49. data/test/rbbt/ner/test_ngram_prefix_dictionary.rb +20 -0
  50. data/test/rbbt/ner/{test_oscar3.rb → test_oscar4.rb} +12 -13
  51. data/test/rbbt/ner/test_patterns.rb +66 -0
  52. data/test/rbbt/ner/test_regexpNER.rb +1 -1
  53. data/test/rbbt/ner/test_rnorm.rb +47 -0
  54. data/test/rbbt/ner/test_token_trieNER.rb +60 -35
  55. data/test/rbbt/nlp/test_nlp.rb +88 -0
  56. data/test/test_helper.rb +20 -0
  57. metadata +93 -20
@@ -0,0 +1,14 @@
1
+ require File.join(File.expand_path(File.dirname(__FILE__)), '../../..', 'test_helper.rb')
2
+ require 'rbbt/ner/annotations'
3
+ require 'rbbt/ner/annotations/named_entity'
4
+ require 'rbbt/ner/annotations/transformed'
5
+
6
+ class TestClass < Test::Unit::TestCase
7
+ def test_info
8
+ a = "test"
9
+ a.extend NamedEntity
10
+ assert(! a.info.keys.include?("offset"))
11
+ a.offset = 10
12
+ assert a.info.keys.include? "offset"
13
+ end
14
+ end
@@ -0,0 +1,175 @@
1
+ require File.join(File.expand_path(File.dirname(__FILE__)), '../../..', 'test_helper.rb')
2
+ require 'rbbt/ner/annotations/transformed'
3
+ require 'rbbt/ner/annotations/named_entity'
4
+
5
+ class TestClass < Test::Unit::TestCase
6
+ def test_transform
7
+ a = "This sentence mentions the TP53 gene and the CDK5 protein"
8
+ original = a.dup
9
+
10
+ gene1 = "TP53"
11
+ gene1.extend NamedEntity
12
+ gene1.offset = a.index gene1
13
+
14
+ gene2 = "CDK5"
15
+ gene2.extend NamedEntity
16
+ gene2.offset = a.index gene2
17
+
18
+ assert_equal gene1, a[gene1.range]
19
+ assert_equal gene2, a[gene2.range]
20
+
21
+ c = a.dup
22
+
23
+ c[gene2.range] = "GN"
24
+ assert_equal c, Transformed.transform(a,[gene2], "GN")
25
+ c[gene1.range] = "GN"
26
+ assert_equal c, Transformed.transform(a,[gene1], "GN")
27
+
28
+ assert_equal gene2.offset, a.transformation_offset_differences.first.first.first
29
+ assert_equal gene1.offset, a.transformation_offset_differences.last.first.first
30
+
31
+
32
+ gene3 = "GN gene"
33
+ gene3.extend NamedEntity
34
+ gene3.offset = a.index gene3
35
+
36
+ assert_equal gene3, a[gene3.range]
37
+
38
+ a.restore([gene3])
39
+ assert_equal original, a
40
+ assert_equal "TP53 gene", a[gene3.range]
41
+
42
+ end
43
+
44
+ def test_with_transform
45
+ a = "This sentence mentions the TP53 gene and the CDK5R1 protein"
46
+ original = a.dup
47
+
48
+ gene1 = "TP53"
49
+ gene1.extend NamedEntity
50
+ gene1.offset = a.index gene1
51
+
52
+ gene2 = "CDK5R1"
53
+ gene2.extend NamedEntity
54
+ gene2.offset = a.index gene2
55
+
56
+ Transformed.with_transform(a, [gene1], "GN") do
57
+ assert_equal original.sub("TP53", 'GN'), a
58
+ end
59
+ assert_equal original, a
60
+
61
+ Transformed.with_transform(a, [gene1,gene2], "GN") do
62
+ assert_equal original.gsub(/TP53|CDK5R1/, 'GN'), a
63
+ end
64
+ assert_equal original, a
65
+
66
+ Transformed.with_transform(a, [gene1], "GN") do
67
+ Transformed.with_transform(a, [gene2], "GN") do
68
+ assert_equal original.gsub(/TP53|CDK5R1/, 'GN'), a
69
+ end
70
+ assert_equal original.gsub(/TP53/, 'GN'), a
71
+ end
72
+ assert_equal original, a
73
+
74
+ exp1, exp2 = nil, nil
75
+ expanded_genes = Transformed.with_transform(a, [gene1,gene2], "GN") do
76
+ exp1 = "GN gene"
77
+ exp1.extend NamedEntity
78
+ exp1.offset = a.index exp1
79
+ exp2 = "GN protein"
80
+ exp2.extend NamedEntity
81
+ exp2.offset = a.index exp2
82
+
83
+ [exp1, exp2]
84
+ end
85
+ assert_equal original, a
86
+
87
+ assert_equal "TP53 gene", exp1
88
+ assert_equal "CDK5R1 protein", exp2
89
+ end
90
+
91
+ def test_html
92
+ a = "This sentence mentions the TP53 gene and the CDK5R1 protein"
93
+
94
+ gene1 = "TP53"
95
+ gene1.extend NamedEntity
96
+ gene1.offset = a.index gene1
97
+ gene1.type = "Gene"
98
+
99
+ gene2 = "CDK5R1"
100
+ gene2.extend NamedEntity
101
+ gene2.offset = a.index gene2
102
+ gene2.type = "Protein"
103
+
104
+ Transformed.with_transform(a, [gene1,gene2], Proc.new{|e| e.html}) do
105
+ assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene'>TP53</span> gene and the <span class='Entity' attr-entity-type='Protein'>CDK5R1</span> protein", a
106
+ end
107
+ end
108
+
109
+ def test_html_with_offset
110
+ a = "This sentence mentions the TP53 gene and the CDK5R1 protein"
111
+ Segment.annotate(a, 10)
112
+
113
+ gene1 = "TP53"
114
+ gene1.extend NamedEntity
115
+ gene1.offset = a.index gene1
116
+ gene1.offset += 10
117
+ gene1.type = "Gene"
118
+
119
+ gene2 = "CDK5R1"
120
+ gene2.extend NamedEntity
121
+ gene2.offset = a.index gene2
122
+ gene2.offset += 10
123
+ gene2.type = "Protein"
124
+
125
+ Transformed.with_transform(a, [gene1,gene2], Proc.new{|e| e.html}) do
126
+ assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene'>TP53</span> gene and the <span class='Entity' attr-entity-type='Protein'>CDK5R1</span> protein", a
127
+ end
128
+ end
129
+
130
+ def test_overlap
131
+ a = "This sentence mentions the TP53 gene and the CDK5R1 protein"
132
+
133
+ gene1 = "TP53"
134
+ gene1.extend NamedEntity
135
+ gene1.offset = a.index gene1
136
+ gene1.type = "Gene"
137
+
138
+ gene2 = "TP53 gene"
139
+ gene2.extend NamedEntity
140
+ gene2.offset = a.index gene2
141
+ gene2.type = "Expanded Gene"
142
+
143
+ assert_equal [gene1], Segment.overlaps(Segment.sort([gene1,gene2]))
144
+
145
+ Transformed.with_transform(a, [gene1], Proc.new{|e| e.html}) do
146
+ assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene'>TP53</span> gene and the CDK5R1 protein", a
147
+ Transformed.with_transform(a, [gene2], Proc.new{|e| e.html}) do
148
+ assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Expanded Gene'><span class='Entity' attr-entity-type='Gene'>TP53</span> gene</span> and the CDK5R1 protein", a
149
+ end
150
+ end
151
+ end
152
+
153
+ def test_cascade_with_overlap_ignored
154
+ a = "This sentence mentions the HDL-C gene and the CDK5R1 protein"
155
+
156
+ gene1 = "HDL-C"
157
+ gene1.extend NamedEntity
158
+ gene1.offset = a.index gene1
159
+ gene1.type = "Gene"
160
+
161
+ gene2 = "-"
162
+ gene2.extend NamedEntity
163
+ gene2.offset = a.index gene2
164
+ gene2.type = "Dash"
165
+
166
+ Transformed.with_transform(a, [gene1], Proc.new{|e| e.html}) do
167
+ one = a.dup
168
+ Transformed.with_transform(a, [gene2], Proc.new{|e| e.html}) do
169
+ assert_equal one, a
170
+ end
171
+ end
172
+
173
+ end
174
+ end
175
+
@@ -1,4 +1,4 @@
1
- require File.dirname(__FILE__) + '/../../test_helper'
1
+ require File.join(File.expand_path(File.dirname(__FILE__)), '../../test_helper.rb')
2
2
  require 'rbbt/ner/abner'
3
3
  require 'test/unit'
4
4
 
@@ -1,8 +1,70 @@
1
1
  require File.join(File.expand_path(File.dirname(__FILE__)), '../..', 'test_helper.rb')
2
+ require 'rbbt/ner/annotations'
3
+ require 'rbbt/ner/annotations/named_entity'
4
+ require 'rbbt/ner/annotations/transformed'
2
5
 
3
6
  class TestClass < Test::Unit::TestCase
4
- def test_true
5
- assert true
7
+ def test_info
8
+ a = "test"
9
+ a.extend NamedEntity
10
+ a.type = "type"
11
+ assert a.info.keys.include? "type"
12
+ end
13
+
14
+ def test_segment_type
15
+ a = "test"
16
+ a.extend NamedEntity
17
+ assert a.segment_types.include? "NamedEntity"
18
+ end
19
+
20
+ def test_align
21
+ text =<<-EOF
22
+ Atypical teratoid/rhabdoid tumors (AT/RTs) are highly aggressive brain tumors of early childhood poorly responding to therapy.
23
+ EOF
24
+
25
+ parts = text.split(/\W/)
26
+ Segment.align(text, parts)
27
+
28
+ assert_equal "Atypical teratoid/".length, parts.select{|s| s == "rhabdoid"}.first.offset
29
+ end
30
+
31
+ def test_sort
32
+ a = "This sentence mentions the TP53 gene and the CDK5R1 protein"
33
+
34
+ gene1 = "TP53"
35
+ gene1.extend NamedEntity
36
+ gene1.offset = a.index gene1
37
+ gene1.type = "Gene"
38
+
39
+ gene2 = "CDK5R1"
40
+ gene2.extend NamedEntity
41
+ gene2.offset = a.index gene2
42
+ gene2.type = "Gene"
43
+
44
+ assert_equal [gene1,gene2], Segment.sort([gene2,gene1])
45
+
46
+ end
47
+
48
+ def test_clean_sort
49
+ a = "This sentence mentions the TP53 gene and the CDK5R1 protein"
50
+
51
+ gene1 = "TP53"
52
+ gene1.extend NamedEntity
53
+ gene1.offset = a.index gene1
54
+ gene1.type = "Gene"
55
+
56
+ gene2 = "CDK5R1"
57
+ gene2.extend NamedEntity
58
+ gene2.offset = a.index gene2
59
+ gene2.type = "Gene"
60
+
61
+ gene3 = "TP53 gene"
62
+ gene3.extend NamedEntity
63
+ gene3.offset = a.index gene3
64
+ gene3.type = "Gene"
65
+
66
+ assert_equal [gene3,gene2], Segment.clean_sort([gene2,gene1,gene3])
67
+
6
68
  end
7
69
  end
8
70
 
@@ -1,4 +1,4 @@
1
- require File.dirname(__FILE__) + '/../../test_helper'
1
+ require File.join(File.expand_path(File.dirname(__FILE__)), '../../test_helper.rb')
2
2
  require 'rbbt/ner/banner'
3
3
  require 'test/unit'
4
4
 
@@ -0,0 +1,56 @@
1
+ require File.join(File.expand_path(File.dirname(__FILE__)), '../../test_helper.rb')
2
+ require 'rbbt/ner/chemical_tagger'
3
+ require 'rbbt/util/tmpfile'
4
+ require 'test/unit'
5
+
6
+ class TestChemicalTagger < Test::Unit::TestCase
7
+
8
+ def test_match
9
+ begin
10
+ ner = ChemicalTagger.new
11
+ str = "Alternatively, rearrangement of O-(w-haloalkyl)esters 34 of 2-carboethoxy-N-hydroxypyridine-2-selone affords azonianaphthalenium halides 37 in 79% yield"
12
+ mentions = ner.match(str, "CM", false)
13
+
14
+ good_mentions = ["2-carboethoxy-N-hydroxypyridine-2-selone", "O-(w-haloalkyl)esters"]
15
+
16
+ good_mentions.each{|mention|
17
+ assert(mentions.include? mention)
18
+ }
19
+ rescue
20
+ puts $!.message
21
+ puts $!.backtrace
22
+ end
23
+ end
24
+
25
+ def test_ranges
26
+ begin
27
+ ner = ChemicalTagger.new
28
+ str =<<-EOF
29
+ This sentence talks about 2-carboethoxy-N-hydroxypyridine-2-selone.
30
+ This sentence talks about 2-carboethoxy-N-hydroxypyridine-2-selone.
31
+ This sentence talks about 2-carboethoxy-N-hydroxypyridine-2-selone.
32
+ This sentence talks about 2-carboethoxy-N-hydroxypyridine-2-selone.
33
+ This otherone talks about O-(w-haloalkyl)esters.
34
+ This otherone talks about O-(w-haloalkyl)esters.
35
+ This otherone talks about O-(w-haloalkyl)esters.
36
+
37
+ This otherone talks about O-(w-haloalkyl)esters.
38
+ This otherone talks about O-(w-haloalkyl)esters.
39
+ EOF
40
+
41
+ mentions = ner.match(str, "CM", false)
42
+
43
+ str_original = str.dup
44
+ mentions.each do |mention|
45
+ str[mention.range] = mention
46
+ end
47
+
48
+ assert_equal str_original, str
49
+
50
+ rescue
51
+ puts $!.message
52
+ puts $!.backtrace
53
+ end
54
+ end
55
+
56
+ end
@@ -0,0 +1,20 @@
1
+ require File.join(File.expand_path(File.dirname(__FILE__)), '../..', 'test_helper.rb')
2
+ require 'rbbt/ner/ngram_prefix_dictionary'
3
+ require 'rbbt/util/tmpfile'
4
+
5
+ class TestNGramPrefixDictionary < Test::Unit::TestCase
6
+ def test_match
7
+ lexicon =<<-EOF
8
+ C1;aa;AA;bb b
9
+ C2;11;22;3 3;bb
10
+ EOF
11
+
12
+ TmpFile.with_file(lexicon) do |file|
13
+ index = NGramPrefixDictionary.new(TSV.new(file, :flat, :sep => ';'), "test")
14
+
15
+ matches = index.match(' asdfa dsf asdf aa asdfasdf ')
16
+ assert matches.select{|m| m.code.include? 'C1'}.any?
17
+ end
18
+ end
19
+ end
20
+
@@ -1,18 +1,17 @@
1
- require File.dirname(__FILE__) + '/../../test_helper'
2
- require 'rbbt/ner/oscar3'
1
+ require File.join(File.expand_path(File.dirname(__FILE__)), '../../test_helper.rb')
2
+ require 'rbbt/ner/oscar4'
3
3
  require 'rbbt/util/tmpfile'
4
4
  require 'test/unit'
5
5
 
6
- class TestOSCAR3 < Test::Unit::TestCase
7
-
6
+ class TestOSCAR4 < Test::Unit::TestCase
8
7
 
9
8
  def test_match
10
9
  begin
11
- ner = OSCAR3.new
12
- str = "Alternatively, rearrangement of O-(ω-haloalkyl)esters 34 of 2-carboethoxy-N-hydroxypyridine-2-selone affords azonianaphthalenium halides 37 in 79% yield"
10
+ ner = OSCAR4.new
11
+ str = "Alternatively, rearrangement of O-(w-haloalkyl)esters 34 of 2-carboethoxy-N-hydroxypyridine-2-selone affords azonianaphthalenium halides 37 in 79% yield"
13
12
 
14
13
  mentions = ner.match(str, "CM", false)
15
- good_mentions = ["2-carboethoxy-N-hydroxypyridine-2-selone", "O-(ω-haloalkyl)esters"]
14
+ good_mentions = ["2-carboethoxy-N-hydroxypyridine-2-selone", "O-(w-haloalkyl)esters"]
16
15
 
17
16
  good_mentions.each{|mention|
18
17
  assert(mentions.include? mention)
@@ -25,18 +24,18 @@ class TestOSCAR3 < Test::Unit::TestCase
25
24
 
26
25
  def test_ranges
27
26
  begin
28
- ner = OSCAR3.new
27
+ ner = OSCAR4.new
29
28
  str =<<-EOF
30
29
  This sentence talks about 2-carboethoxy-N-hydroxypyridine-2-selone.
31
30
  This sentence talks about 2-carboethoxy-N-hydroxypyridine-2-selone.
32
31
  This sentence talks about 2-carboethoxy-N-hydroxypyridine-2-selone.
33
32
  This sentence talks about 2-carboethoxy-N-hydroxypyridine-2-selone.
34
- This otherone talks about O-(ω-haloalkyl)esters.
35
- This otherone talks about O-(ω-haloalkyl)esters.
36
- This otherone talks about O-(ω-haloalkyl)esters.
33
+ This otherone talks about O-(w-haloalkyl)esters.
34
+ This otherone talks about O-(w-haloalkyl)esters.
35
+ This otherone talks about O-(w-haloalkyl)esters.
37
36
 
38
- This otherone talks about O-(ω-haloalkyl)esters.
39
- This otherone talks about O-(ω-haloalkyl)esters.
37
+ This otherone talks about O-(w-haloalkyl)esters.
38
+ This otherone talks about O-(w-haloalkyl)esters.
40
39
  EOF
41
40
 
42
41
 
@@ -0,0 +1,66 @@
1
+ require File.join(File.expand_path(File.dirname(__FILE__)), '../../test_helper.rb')
2
+ require 'rbbt/ner/patterns'
3
+
4
+ class TestPatternRelExt < Test::Unit::TestCase
5
+ def test_simple_pattern
6
+ text = "Experiments have shown that TP53 interacts with CDK5 under certain conditions"
7
+
8
+ gene1 = "TP53"
9
+ NamedEntity.annotate(gene1, text.index(gene1), "Gene")
10
+
11
+ gene2 = "CDK5"
12
+ NamedEntity.annotate(gene2, text.index(gene2), "Gene")
13
+
14
+ interaction = "interacts"
15
+ NamedEntity.annotate(interaction, text.index(interaction), "Interaction")
16
+
17
+ Annotated.annotate(text, [gene1, gene2, interaction])
18
+
19
+ assert_equal "TP53 interacts with CDK5", PatternRelExt.simple_pattern(text, "GENE INTERACTION with GENE").first
20
+
21
+ end
22
+
23
+ def test_chunk_pattern
24
+ text = "Experiments have shown that TP53 found in cultivated cells interacts with CDK5 under certain conditions"
25
+
26
+ gene1 = "TP53"
27
+ NamedEntity.annotate(gene1, text.index(gene1), "Gene")
28
+
29
+ gene2 = "CDK5"
30
+ NamedEntity.annotate(gene2, text.index(gene2), "Gene")
31
+
32
+ interaction = "interacts"
33
+ NamedEntity.annotate(interaction, text.index(interaction), "Interaction")
34
+
35
+ Annotated.annotate(text, {:entities => [gene1, gene2, interaction]})
36
+
37
+ assert_equal "TP53 found in cultivated cells interacts with CDK5",
38
+ PatternRelExt.new("NP[entity:Gene] VP[stem:interacts] with NP[entity:Gene]").match_sentences([text]).first.first
39
+
40
+ assert_equal "TP53 found in cultivated cells interacts with CDK5",
41
+ PatternRelExt.new(["NP[entity:Gene] VP[stem:interacts] with NP[entity:Gene]"]).match_sentences([text]).first.first
42
+ end
43
+
44
+ def test_chunk_pattern
45
+ text = "There is a concern with the use of thiazolidinediones in patients with an increased risk of colon cancer (e.g., familial colon polyposis)."
46
+
47
+ drug = "thiazolidinediones"
48
+ NamedEntity.annotate(drug, text.index(drug), "Chemical Mention")
49
+
50
+ disease = "colon cancer"
51
+ NamedEntity.annotate(disease, text.index(disease), "disease")
52
+
53
+ Annotated.annotate(text, {:entitites => [drug, disease]})
54
+
55
+ assert_equal "thiazolidinediones in patients with an increased risk of colon cancer",
56
+ PatternRelExt.new("NP[entity:Chemical Mention] NP[stem:risk] NP[entity:disease]").match_sentences([text]).first.first
57
+
58
+ end
59
+
60
+
61
+ def test_entities_with_spaces
62
+ PatternRelExt.new("NP[entity:Gene Name]").token_trie
63
+ end
64
+
65
+
66
+ end