rbbt-text 0.2.1 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57)
  1. data/bin/get_ppis.rb +52 -0
  2. data/lib/rbbt/bow/dictionary.rb +9 -9
  3. data/lib/rbbt/bow/misc.rb +86 -2
  4. data/lib/rbbt/corpus/corpus.rb +55 -0
  5. data/lib/rbbt/corpus/document.rb +289 -0
  6. data/lib/rbbt/corpus/document_repo.rb +115 -0
  7. data/lib/rbbt/corpus/sources/pubmed.rb +26 -0
  8. data/lib/rbbt/ner/NER.rb +7 -5
  9. data/lib/rbbt/ner/abner.rb +13 -2
  10. data/lib/rbbt/ner/annotations.rb +182 -51
  11. data/lib/rbbt/ner/annotations/annotated.rb +15 -0
  12. data/lib/rbbt/ner/annotations/named_entity.rb +37 -0
  13. data/lib/rbbt/ner/annotations/relations.rb +25 -0
  14. data/lib/rbbt/ner/annotations/token.rb +28 -0
  15. data/lib/rbbt/ner/annotations/transformed.rb +170 -0
  16. data/lib/rbbt/ner/banner.rb +8 -5
  17. data/lib/rbbt/ner/chemical_tagger.rb +34 -0
  18. data/lib/rbbt/ner/ngram_prefix_dictionary.rb +136 -0
  19. data/lib/rbbt/ner/oscar3.rb +1 -1
  20. data/lib/rbbt/ner/oscar4.rb +41 -0
  21. data/lib/rbbt/ner/patterns.rb +132 -0
  22. data/lib/rbbt/ner/rnorm.rb +141 -0
  23. data/lib/rbbt/ner/rnorm/cue_index.rb +80 -0
  24. data/lib/rbbt/ner/rnorm/tokens.rb +218 -0
  25. data/lib/rbbt/ner/token_trieNER.rb +185 -51
  26. data/lib/rbbt/nlp/genia/sentence_splitter.rb +214 -0
  27. data/lib/rbbt/nlp/nlp.rb +235 -0
  28. data/share/install/software/ABNER +0 -4
  29. data/share/install/software/ChemicalTagger +81 -0
  30. data/share/install/software/Gdep +115 -0
  31. data/share/install/software/Geniass +118 -0
  32. data/share/install/software/OSCAR4 +16 -0
  33. data/share/install/software/StanfordParser +15 -0
  34. data/share/patterns/drug_induce_disease +22 -0
  35. data/share/rnorm/cue_default +10 -0
  36. data/share/rnorm/tokens_default +86 -0
  37. data/share/{stopwords → wordlists/stopwords} +0 -0
  38. data/test/rbbt/bow/test_bow.rb +1 -1
  39. data/test/rbbt/bow/test_dictionary.rb +1 -1
  40. data/test/rbbt/bow/test_misc.rb +1 -1
  41. data/test/rbbt/corpus/test_corpus.rb +99 -0
  42. data/test/rbbt/corpus/test_document.rb +222 -0
  43. data/test/rbbt/ner/annotations/test_named_entity.rb +14 -0
  44. data/test/rbbt/ner/annotations/test_transformed.rb +175 -0
  45. data/test/rbbt/ner/test_abner.rb +1 -1
  46. data/test/rbbt/ner/test_annotations.rb +64 -2
  47. data/test/rbbt/ner/test_banner.rb +1 -1
  48. data/test/rbbt/ner/test_chemical_tagger.rb +56 -0
  49. data/test/rbbt/ner/test_ngram_prefix_dictionary.rb +20 -0
  50. data/test/rbbt/ner/{test_oscar3.rb → test_oscar4.rb} +12 -13
  51. data/test/rbbt/ner/test_patterns.rb +66 -0
  52. data/test/rbbt/ner/test_regexpNER.rb +1 -1
  53. data/test/rbbt/ner/test_rnorm.rb +47 -0
  54. data/test/rbbt/ner/test_token_trieNER.rb +60 -35
  55. data/test/rbbt/nlp/test_nlp.rb +88 -0
  56. data/test/test_helper.rb +20 -0
  57. metadata +93 -20
@@ -0,0 +1,16 @@
1
+ #!/bin/bash
2
+
3
+ INSTALL_HELPER_FILE="$1"
4
+ RBBT_SOFTWARE_DIR="$2"
5
+ source "$INSTALL_HELPER_FILE"
6
+
7
+ name='OSCAR4'
8
+ url="http://maven.ch.cam.ac.uk/m2repo/uk/ac/cam/ch/wwmm/oscar/oscar4-all/4.0.1/oscar4-all-4.0.1-with-dependencies.jar"
9
+
10
+
11
+ PKG_DIR=`opt_dir $name`
12
+ [ -d $PKG_DIR ] || mkdir -p $PKG_DIR
13
+ wget "$url" -O "$PKG_DIR/OSCAR4.jar"
14
+ ln -sf "$PKG_DIR/OSCAR4.jar" "$OPT_JAR_DIR/OSCAR4.jar"
15
+
16
+
@@ -0,0 +1,15 @@
1
+ #!/bin/bash
2
+
3
+ INSTALL_HELPER_FILE="$1"
4
+ RBBT_SOFTWARE_DIR="$2"
5
+ source "$INSTALL_HELPER_FILE"
6
+
7
+ name='StanfordParser'
8
+ url="http://nlp.stanford.edu/downloads/stanford-parser-2011-04-20.tgz"
9
+
10
+
11
+ get_src "$name" "$url"
12
+ mkdir "$OPT_DIR/$name"
13
+ cp "$OPT_BUILD_DIR/stanford-parser.jar" "$OPT_DIR/$name"
14
+ ln -s "$OPT_DIR/$name/stanford-parser.jar" "$OPT_JAR_DIR/stanford-parser.jar"
15
+
@@ -0,0 +1,22 @@
1
+ NP[disease_NN] VP[induce_HVB by_IN] NP[drug_NN]
2
+ NP[drug_NN] VP[induce_HVB] NP[disease_NN]
3
+ NP[drug_NN] VP[cause_HVB] NP[disease_NN]
4
+ NP[disease_NN] VP[cause_HVB by_IN] NP[drug_NN]
5
+ NP[disease_NN] VP[produce_HVB by_IN] NP[drug_NN]
6
+ NP[disease_NN] VP[induce_HVB by_IN] NP[injection_HNN of_IN] NP[drug_NN]
7
+ NP[drug_NN] VP[associate_HVB with_IN] NP[risk_HNN of_IN] NP[disease_NN]
8
+ NP[disease_NN] VP[induce_HVB by_IN] NP[administration_HNN of_IN] NP[drug_NN]
9
+ NP[disease_NN] VP[be_HVB] NP[effect_HNN of_IN] NP[drug_NN]
10
+ NP[drug_NN] VP[increase_HVB] NP[risk_HNN of_IN] NP[disease_NN]
11
+ NP[disease_NN] NTG[follow_VBG] NP[treatment_HNN with_IN] NP[drug_NN]
12
+ NP[disease_NN] VP[associate_HVB with_IN] NP[drug_NN therapy_HNN]
13
+ NP[disease_NN] VP[associate_HVB with_IN] NP[use_HNN of_IN] NP[drug_NN]
14
+ NP[disease_NN] VP[associate_HVB with_IN] NP[drug_NN use_HNN]
15
+ NP[disease_NN] VP[associate_HVB with_IN] NP[drug_NN treatment_HNN]
16
+ NP[disease_NN while_IN] VP[receive_HVB] NP[drug_NN]
17
+ NP[disease_NN] NTG[follow_VBG] NP[drug_NN therapy_HNN]
18
+ NP[disease_NN after_IN] VP[receive_HVB] NP[drug_NN]
19
+ NP[disease_NN] NTG[follow_VBG] NP[drug_NN administration_HNN]
20
+ NP[disease_NN due_(?:IN|JJ) to_TO] NP[drug_NN therapy_HNN]
21
+ NP[disease_NN] VP[follow_HVB] NP[treatment_HNN with_IN] NP[drug_NN]
22
+ NP[disease_NN] VP[follow_HVB] NP[drug_NN administration_HNN]
@@ -0,0 +1,10 @@
1
+ equal do |w| [w] end
2
+ standard do |w| [w.downcase.split(/\s+/).sort.join("")] end
3
+ cleaned do |w| [w.downcase.sub(/,.*/,'').sub(/\(.*\)/,'').gsub(/s(?:=\W)/,'')] end
4
+ special do |w| s = w.split.select{|w| w.is_special?}.collect{|w| w.downcase.sub(/p$/,'')} end
5
+ words do |w|
6
+ w.sub(/(.*)I$/,'\1I \1').
7
+ scan(/[a-z][a-z]+/i).
8
+ sort{|a,b| b.length <=> a.length}.
9
+ collect{|n| n.downcase}
10
+ end
@@ -0,0 +1,86 @@
1
+ require 'rbbt/util/misc'
2
+
3
+
4
+ plural = Proc.new do |t| t.sub(/s$/,'') end
5
+
6
+ tokens do
7
+
8
+ # Some (possible) single letters first
9
+ receptor /^(?:receptor|r)s?$/i
10
+ protein /^(?:protein|p)s?$/i
11
+ roman /^[IV]+$/
12
+ greek_letter do |w| $inverse_greek[w.downcase] != nil end
13
+
14
+
15
+ # Some words for removal
16
+ stopword do |w| $stopwords.include?( w.downcase_first) end
17
+ gene /genes?/i
18
+ dna
19
+ cdna
20
+ rna
21
+ mrna
22
+ trna
23
+ cdna
24
+ component
25
+ exon
26
+ intron
27
+ domain
28
+ family
29
+
30
+
31
+ # Important words
32
+ number /^(?:\d+[.,]?\d+|\d)$/
33
+ greek do |w| $greek[w.downcase] != nil end
34
+ special do |w| w.is_special? end
35
+ promoter
36
+ similar /^(homolog.*|like|related|associated)$/
37
+ ase /ase$/
38
+ in_end /in$/
39
+ end
40
+
41
+ comparisons do
42
+
43
+ compare.number do |l1,l2|
44
+ v = 0
45
+ case
46
+ when l1.empty? && l2.empty?
47
+ v = 0
48
+ when l1.sort.uniq == l2.sort.uniq
49
+ v = 3
50
+ when l1.any? && l1[0] == l2[0]
51
+ v = -3
52
+ when l1.empty? && l2 == ['1']
53
+ v = -5
54
+ else
55
+ v = -10
56
+ end
57
+ v
58
+ end
59
+
60
+ diff.promoter -10
61
+ diff.receptor -10
62
+ diff.similar -10
63
+ diff.capital -10
64
+
65
+ same.unknown 1
66
+ miss.unknown -2
67
+ extr.unknown -2
68
+
69
+ same.greek 1
70
+ miss.greek -2
71
+ extr.greek -2
72
+
73
+ same.special 4
74
+ miss.special -3
75
+ extr.special -3
76
+
77
+ transform.receptor plural
78
+ transform.protein plural
79
+
80
+ transform.roman do |t| [t.arabic, :number] end
81
+ transform.greek_letter do |t| [$inverse_greek[t.downcase], :greek] end
82
+ transform.ase do |t| [t, :special] end
83
+ transform.in_end do |t| [t, :special] end
84
+ transform.unknown do |t| [t, (t.length < 4 ? :special : :unknown)] end
85
+ end
86
+
File without changes
@@ -1,4 +1,4 @@
1
- require File.expand_path(File.dirname(__FILE__) + '/../../test_helper')
1
+ require File.join(File.expand_path(File.dirname(__FILE__)), '../../test_helper.rb')
2
2
  require 'rbbt/bow/bow'
3
3
  require 'test/unit'
4
4
 
@@ -1,4 +1,4 @@
1
- require File.dirname(__FILE__) + '/../../test_helper'
1
+ require File.join(File.expand_path(File.dirname(__FILE__)), '../../test_helper.rb')
2
2
  require 'rbbt/bow/dictionary'
3
3
  require 'rbbt/bow/bow'
4
4
  require 'test/unit'
@@ -1,4 +1,4 @@
1
- require File.expand_path(File.dirname(__FILE__) + '/../../test_helper')
1
+ require File.join(File.expand_path(File.dirname(__FILE__)), '../../test_helper.rb')
2
2
  require 'rbbt/bow/misc'
3
3
  require 'test/unit'
4
4
 
@@ -0,0 +1,99 @@
1
+ require File.join(File.expand_path(File.dirname(__FILE__)), '../../test_helper.rb')
2
+ require 'rbbt/corpus/corpus'
3
+ require 'rbbt/corpus/sources/pubmed'
4
+
5
+ class Document
6
+ define :sentences do
7
+ require 'rbbt/nlp/nlp'
8
+ NLP.geniass_sentence_splitter(text)
9
+ end
10
+
11
+ define :genes do
12
+ require 'rbbt/ner/abner'
13
+ Abner.new.entities(text)
14
+ end
15
+ end
16
+
17
+
18
+ class TestCorpus < Test::Unit::TestCase
19
+
20
+ def test_add_document
21
+ pmid = "19458159"
22
+
23
+ text = PubMed.get_article(pmid).text
24
+
25
+ corpus = Corpus.new(Rbbt.tmp.test.Corpus.find :user)
26
+
27
+ assert corpus.find(:pubmed, pmid).empty?
28
+
29
+ corpus.add_document(text, :pubmed, pmid, :abstract)
30
+
31
+ assert corpus.find(:pubmed, pmid).any?
32
+ assert corpus.find(:pubmed, pmid, :fulltext).empty?
33
+ assert corpus.find(:pubmed, pmid, :abstract).any?
34
+
35
+ assert corpus.find(:pubmed, pmid).first.text =~ /SENT/
36
+ end
37
+
38
+ def test_add_pmid
39
+ pmid = "19465387"
40
+ corpus = Corpus.new(Rbbt.tmp.test.Corpus.find :user)
41
+ corpus.add_pmid(pmid, :abstract)
42
+
43
+ assert corpus.exists? :pubmed, pmid
44
+ assert corpus.exists? :pubmed, pmid, :abstract
45
+ assert_equal false, corpus.exists?(:pubmed, pmid, :fulltext)
46
+ end
47
+
48
+ def test_find_all
49
+ corpus = Corpus.new(Rbbt.tmp.test.Corpus.find :user)
50
+ corpus.add_pmid("19458159", :abstract)
51
+ corpus.add_pmid("19465387", :abstract)
52
+
53
+ all = corpus.find
54
+
55
+ assert_equal 2, all.length
56
+ assert all.select{|document| document.id == "19458159"}.any?
57
+ assert all.select{|document| document.id == "19465387"}.any?
58
+ end
59
+
60
+ def test_doc_sentences
61
+ corpus = Corpus.new(Rbbt.tmp.test.Corpus.find :user)
62
+ corpus.add_pmid("19458159", :abstract)
63
+
64
+ sentences = corpus.find.collect{|doc| doc.sentences}.flatten
65
+
66
+ assert corpus.find.first.sentences.length > 0
67
+ assert corpus.find.first.sentences.sort_by{|s| s.offset}.first =~ /Semantic features in Text/i
68
+
69
+ corpus = Corpus.new(Rbbt.tmp.test.Corpus.find :user)
70
+ assert corpus.find.first.sentences.sort_by{|s| s.offset}.first =~ /Semantic features in Text/i
71
+ end
72
+
73
+ def test_doc_genes
74
+ corpus = Corpus.new(Rbbt.tmp.test.Corpus.find :user)
75
+ corpus.add_pmid("21611789", :abstract)
76
+
77
+ assert corpus.find(:pubmed, "21611789").first.genes.include? "CDKAL1"
78
+ end
79
+
80
+ def test_genes
81
+ corpus = Corpus.new(Rbbt.tmp.test.Corpus.find :user)
82
+ corpus.add_pmid("21611789", :abstract)
83
+
84
+ assert corpus.find.collect{|d| d.genes}.flatten.include? "CDKAL1"
85
+ end
86
+
87
+ def test_index
88
+ corpus = Corpus.new(Rbbt.tmp.test.Corpus.find :user)
89
+ corpus.add_pmid("21611789", :abstract)
90
+
91
+ document = corpus.find(:pubmed, "21611789").first
92
+
93
+ genes = corpus.find.collect{|d| d.genes}.flatten.select{|gene| gene == "CDKAL1"}
94
+
95
+ assert genes.collect{|gene|
96
+ document.sentences_at(gene.offset)
97
+ }.flatten.length > 1
98
+ end
99
+ end
@@ -0,0 +1,222 @@
1
+ require File.join(File.expand_path(File.dirname(__FILE__)), '../../test_helper.rb')
2
+ require 'rbbt/corpus/document'
3
+ require 'test/unit'
4
+
5
+ $persistence = TSV.new({})
6
+ $tchash_persistence = TCHash.get(Rbbt.tmp.test.document.persistence.find(:user), true, Persistence::TSV::TSVSerializer)
7
+ $global_persistence = TSV.new({}, :key => "ID", :fields => [ "Start", "End", "Info","Document ID", "Entity Type"])
8
+ $tchash_global_persistence = TSV.new(TCHash.get(Rbbt.tmp.test.global.persistence.find(:user), true, Persistence::TSV::StringArraySerializer), :key => "ID", :fields => [ "Start", "End", "Info","Document ID", "Entity Type"])
9
+
10
+ class Document
11
+ define :sentences do
12
+ require 'rbbt/nlp/nlp'
13
+ NLP.geniass_sentence_splitter(text)
14
+ end
15
+
16
+ define :tokens do
17
+ require 'rbbt/ner/annotations/token'
18
+ Token.tokenize(text)
19
+ end
20
+
21
+ define :long_words do
22
+ require 'rbbt/ner/annotations/token'
23
+ Token.tokenize(text).select{|tok| tok.length > 5}
24
+ end
25
+
26
+ define :short_words do
27
+ require 'rbbt/ner/annotations/token'
28
+ Token.tokenize(text).select{|tok| tok.length < 5}
29
+ end
30
+
31
+ define :even_words do
32
+ require 'rbbt/ner/annotations/token'
33
+ Token.tokenize(text).select{|tok| tok.length % 2 == 0}
34
+ end
35
+
36
+ define :missing do
37
+ []
38
+ end
39
+
40
+ define :tokens_again do
41
+ raise "This should be here already"
42
+ end
43
+
44
+ persist :sentences
45
+ persist_in_tsv :tokens
46
+ persist_in_tsv :long_words, $tchash_persistence, :Literal
47
+ persist_in_global_tsv :short_words, $global_persistence
48
+ persist_in_global_tsv :even_words, $tchash_global_persistence
49
+ persist_in_global_tsv :missing, $tchash_global_persistence
50
+ end
51
+
52
+ class TestDocument < Test::Unit::TestCase
53
+
54
+ def test_annotations
55
+
56
+ text =<<-EOF
57
+ This is a
58
+ sentence. This is
59
+ another sentence.
60
+ EOF
61
+
62
+ doc = Document.new
63
+ doc.text = text
64
+
65
+ assert_equal 2, doc.sentences.length
66
+ assert_equal 10, doc.tokens.length
67
+ end
68
+
69
+ def test_annotation_load
70
+ text =<<-EOF
71
+ This is a
72
+ sentence. This is
73
+ another sentence.
74
+ EOF
75
+
76
+ doc = Document.new
77
+ doc.text = text * 10
78
+
79
+ sentence = doc.sentences.last
80
+ doc.load_into sentence, :tokens
81
+ assert_equal 5, sentence.tokens.length
82
+ assert_equal "another", sentence.tokens[2]
83
+ assert_equal sentence.offset + 0, sentence.tokens[0].offset
84
+ end
85
+
86
+ def test_annotation_persistence
87
+ text =<<-EOF
88
+ This is a
89
+ sentence. This is
90
+ another sentence.
91
+ EOF
92
+
93
+ text *= 10
94
+
95
+ TmpFile.with_file do |dir|
96
+ FileUtils.mkdir_p dir
97
+
98
+ doc = Document.new(dir)
99
+ doc.text = text
100
+ doc.sentences
101
+
102
+ doc = Document.new(dir)
103
+ doc.text = text
104
+
105
+ sentence = doc.sentences.last
106
+ doc.load_into sentence, :tokens
107
+
108
+ assert_equal 5, sentence.tokens.length
109
+ assert_equal "another", sentence.tokens[2]
110
+ assert_equal sentence.offset + 0, sentence.tokens[0].offset
111
+ end
112
+ end
113
+
114
+ def test_range_persistence
115
+ text =<<-EOF
116
+ This is a
117
+ sentence. This is
118
+ another sentence.
119
+ EOF
120
+
121
+ text *= 10
122
+
123
+ TmpFile.with_file do |dir|
124
+ FileUtils.mkdir_p dir
125
+
126
+ doc = Document.new(dir)
127
+ doc.text = text
128
+
129
+ sentence = doc.sentences.last
130
+ Misc.benchmark(10) do
131
+ doc = Document.new(dir)
132
+ doc.text = text
133
+
134
+ doc.load_into sentence, :tokens, :persist => true
135
+ assert_equal 5, sentence.tokens.length
136
+ assert_equal "another", sentence.tokens[2]
137
+ assert_equal sentence.offset + 0, sentence.tokens[0].offset
138
+ assert_equal sentence.offset + 5, sentence.tokens[1].offset
139
+ end
140
+ end
141
+ end
142
+
143
+ def test_annotation_persistence_in_tsv
144
+ text =<<-EOF
145
+ This is a
146
+ sentence. This is
147
+ another sentence.
148
+ EOF
149
+
150
+ TmpFile.with_file do |dir|
151
+ FileUtils.mkdir_p dir
152
+
153
+
154
+ doc = Document.new(dir)
155
+ doc.text = text * 10
156
+ doc.sentences
157
+
158
+ doc = Document.new(dir)
159
+ doc.text = text * 10
160
+
161
+ sentence = doc.sentences.last
162
+
163
+ doc.load_into sentence, :tokens, :long_words
164
+
165
+ assert_equal 5, sentence.tokens.length
166
+ assert_equal "another", sentence.tokens[2]
167
+ assert_equal sentence.offset + 0, sentence.tokens[0].offset
168
+
169
+ assert_equal 2, sentence.long_words.length
170
+ assert_equal %w(another sentence), sentence.long_words
171
+ assert_equal sentence.offset + "This is ".length, sentence.long_words[0].offset
172
+ end
173
+ end
174
+
175
+ def test_annotation_persistence_in_global
176
+ text =<<-EOF
177
+ This is a
178
+ sentence. This is
179
+ another sentence.
180
+ EOF
181
+
182
+ TmpFile.with_file do |dir|
183
+ FileUtils.mkdir_p dir
184
+
185
+
186
+ doc = Document.new(dir)
187
+ doc.text = text * 10
188
+ doc.docid = "FOOF"
189
+ doc.short_words
190
+ doc.sentences
191
+
192
+ doc = Document.new(dir)
193
+ doc.text = text * 10
194
+ doc.docid = "FOOF"
195
+
196
+ sentence = doc.sentences.last
197
+
198
+ doc.load_into sentence, :tokens, :long_words, :short_words, :even_words
199
+
200
+ assert_equal 3, sentence.short_words.length
201
+ assert_equal 3, sentence.even_words.length
202
+ end
203
+ end
204
+
205
+ def test_dump
206
+ text =<<-EOF
207
+ This is a
208
+ sentence. This is
209
+ another sentence.
210
+ EOF
211
+
212
+ TmpFile.with_file do |dir|
213
+ FileUtils.mkdir_p dir
214
+
215
+ doc = Document.new(dir)
216
+ doc.text = text * 10
217
+ tsv = Document.tsv(doc.sentences, ["Literal"])
218
+ end
219
+ end
220
+ end
221
+
222
+