rbbt-text 0.2.1 → 0.5.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (57)
  1. data/bin/get_ppis.rb +52 -0
  2. data/lib/rbbt/bow/dictionary.rb +9 -9
  3. data/lib/rbbt/bow/misc.rb +86 -2
  4. data/lib/rbbt/corpus/corpus.rb +55 -0
  5. data/lib/rbbt/corpus/document.rb +289 -0
  6. data/lib/rbbt/corpus/document_repo.rb +115 -0
  7. data/lib/rbbt/corpus/sources/pubmed.rb +26 -0
  8. data/lib/rbbt/ner/NER.rb +7 -5
  9. data/lib/rbbt/ner/abner.rb +13 -2
  10. data/lib/rbbt/ner/annotations.rb +182 -51
  11. data/lib/rbbt/ner/annotations/annotated.rb +15 -0
  12. data/lib/rbbt/ner/annotations/named_entity.rb +37 -0
  13. data/lib/rbbt/ner/annotations/relations.rb +25 -0
  14. data/lib/rbbt/ner/annotations/token.rb +28 -0
  15. data/lib/rbbt/ner/annotations/transformed.rb +170 -0
  16. data/lib/rbbt/ner/banner.rb +8 -5
  17. data/lib/rbbt/ner/chemical_tagger.rb +34 -0
  18. data/lib/rbbt/ner/ngram_prefix_dictionary.rb +136 -0
  19. data/lib/rbbt/ner/oscar3.rb +1 -1
  20. data/lib/rbbt/ner/oscar4.rb +41 -0
  21. data/lib/rbbt/ner/patterns.rb +132 -0
  22. data/lib/rbbt/ner/rnorm.rb +141 -0
  23. data/lib/rbbt/ner/rnorm/cue_index.rb +80 -0
  24. data/lib/rbbt/ner/rnorm/tokens.rb +218 -0
  25. data/lib/rbbt/ner/token_trieNER.rb +185 -51
  26. data/lib/rbbt/nlp/genia/sentence_splitter.rb +214 -0
  27. data/lib/rbbt/nlp/nlp.rb +235 -0
  28. data/share/install/software/ABNER +0 -4
  29. data/share/install/software/ChemicalTagger +81 -0
  30. data/share/install/software/Gdep +115 -0
  31. data/share/install/software/Geniass +118 -0
  32. data/share/install/software/OSCAR4 +16 -0
  33. data/share/install/software/StanfordParser +15 -0
  34. data/share/patterns/drug_induce_disease +22 -0
  35. data/share/rnorm/cue_default +10 -0
  36. data/share/rnorm/tokens_default +86 -0
  37. data/share/{stopwords → wordlists/stopwords} +0 -0
  38. data/test/rbbt/bow/test_bow.rb +1 -1
  39. data/test/rbbt/bow/test_dictionary.rb +1 -1
  40. data/test/rbbt/bow/test_misc.rb +1 -1
  41. data/test/rbbt/corpus/test_corpus.rb +99 -0
  42. data/test/rbbt/corpus/test_document.rb +222 -0
  43. data/test/rbbt/ner/annotations/test_named_entity.rb +14 -0
  44. data/test/rbbt/ner/annotations/test_transformed.rb +175 -0
  45. data/test/rbbt/ner/test_abner.rb +1 -1
  46. data/test/rbbt/ner/test_annotations.rb +64 -2
  47. data/test/rbbt/ner/test_banner.rb +1 -1
  48. data/test/rbbt/ner/test_chemical_tagger.rb +56 -0
  49. data/test/rbbt/ner/test_ngram_prefix_dictionary.rb +20 -0
  50. data/test/rbbt/ner/{test_oscar3.rb → test_oscar4.rb} +12 -13
  51. data/test/rbbt/ner/test_patterns.rb +66 -0
  52. data/test/rbbt/ner/test_regexpNER.rb +1 -1
  53. data/test/rbbt/ner/test_rnorm.rb +47 -0
  54. data/test/rbbt/ner/test_token_trieNER.rb +60 -35
  55. data/test/rbbt/nlp/test_nlp.rb +88 -0
  56. data/test/test_helper.rb +20 -0
  57. metadata +93 -20
data/share/install/software/OSCAR4
@@ -0,0 +1,16 @@
1
+ #!/bin/bash
2
+
3
+ INSTALL_HELPER_FILE="$1"
4
+ RBBT_SOFTWARE_DIR="$2"
5
+ source "$INSTALL_HELPER_FILE"
6
+
7
+ name='OSCAR4'
8
+ url="http://maven.ch.cam.ac.uk/m2repo/uk/ac/cam/ch/wwmm/oscar/oscar4-all/4.0.1/oscar4-all-4.0.1-with-dependencies.jar"
9
+
10
+
11
+ PKG_DIR=`opt_dir $name`
12
+ [ -d $PKG_DIR ] || mkdir -p $PKG_DIR
13
+ wget "$url" -O "$PKG_DIR/OSCAR4.jar"
14
+ ln -sf "$PKG_DIR/OSCAR4.jar" "$OPT_JAR_DIR/OSCAR4.jar"
15
+
16
+
data/share/install/software/StanfordParser
@@ -0,0 +1,15 @@
1
+ #!/bin/bash
2
+
3
+ INSTALL_HELPER_FILE="$1"
4
+ RBBT_SOFTWARE_DIR="$2"
5
+ source "$INSTALL_HELPER_FILE"
6
+
7
+ name='StanfordParser'
8
+ url="http://nlp.stanford.edu/downloads/stanford-parser-2011-04-20.tgz"
9
+
10
+
11
+ get_src "$name" "$url"
12
+ mkdir "$OPT_DIR/$name"
13
+ cp "$OPT_BUILD_DIR/stanford-parser.jar" "$OPT_DIR/$name"
14
+ ln -s "$OPT_DIR/$name/stanford-parser.jar" "$OPT_JAR_DIR/stanford-parser.jar"
15
+
data/share/patterns/drug_induce_disease
@@ -0,0 +1,22 @@
1
+ NP[disease_NN] VP[induce_HVB by_IN] NP[drug_NN]
2
+ NP[drug_NN] VP[induce_HVB] NP[disease_NN]
3
+ NP[drug_NN] VP[cause_HVB] NP[disease_NN]
4
+ NP[disease_NN] VP[cause_HVB by_IN] NP[drug_NN]
5
+ NP[disease_NN] VP[produce_HVB by_IN] NP[drug_NN]
6
+ NP[disease_NN] VP[induce_HVB by_IN] NP[injection_HNN of_IN] NP[drug_NN]
7
+ NP[drug_NN] VP[associate_HVB with_IN] NP[risk_HNN of_IN] NP[disease_NN]
8
+ NP[disease_NN] VP[induce_HVB by_IN] NP[administration_HNN of_IN] NP[drug_NN]
9
+ NP[disease_NN] VP[be_HVB] NP[effect_HNN of_IN] NP[drug_NN]
10
+ NP[drug_NN] VP[increase_HVB] NP[risk_HNN of_IN] NP[disease_NN]
11
+ NP[disease_NN] NTG[follow_VBG] NP[treatment_HNN with_IN] NP[drug_NN]
12
+ NP[disease_NN] VP[associate_HVB with_IN] NP[drug_NN therapy_HNN]
13
+ NP[disease_NN] VP[associate_HVB with_IN] NP[use_HNN of_IN] NP[drug_NN]
14
+ NP[disease_NN] VP[associate_HVB with_IN] NP[drug_NN use_HNN]
15
+ NP[disease_NN] VP[associate_HVB with_IN] NP[drug_NN treatment_HNN]
16
+ NP[disease_NN while_IN] VP[receive_HVB] NP[drug_NN]
17
+ NP[disease_NN] NTG[follow_VBG] NP[drug_NN therapy_HNN]
18
+ NP[disease_NN after_IN] VP[receive_HVB] NP[drug_NN]
19
+ NP[disease_NN] NTG[follow_VBG] NP[drug_NN administration_HNN]
20
+ NP[disease_NN due_(?:IN|JJ) to_TO] NP[drug_NN therapy_HNN]
21
+ NP[disease_NN] VP[follow_HVB] NP[treatment_HNN with_IN] NP[drug_NN]
22
+ NP[disease_NN] VP[follow_HVB] NP[drug_NN administration_HNN]
data/share/rnorm/cue_default
@@ -0,0 +1,10 @@
1
+ equal do |w| [w] end
2
+ standard do |w| [w.downcase.split(/\s+/).sort.join("")] end
3
+ cleaned do |w| [w.downcase.sub(/,.*/,'').sub(/\(.*\)/,'').gsub(/s(?:=\W)/,'')] end
4
+ special do |w| s = w.split.select{|w| w.is_special?}.collect{|w| w.downcase.sub(/p$/,'')} end
5
+ words do |w|
6
+ w.sub(/(.*)I$/,'\1I \1').
7
+ scan(/[a-z][a-z]+/i).
8
+ sort{|a,b| b.length <=> a.length}.
9
+ collect{|n| n.downcase}
10
+ end
data/share/rnorm/tokens_default
@@ -0,0 +1,86 @@
1
+ require 'rbbt/util/misc'
2
+
3
+
4
+ plural = Proc.new do |t| t.sub(/s$/,'') end
5
+
6
+ tokens do
7
+
8
+ # Some (possible) single letters first
9
+ receptor /^(?:receptor|r)s?$/i
10
+ protein /^(?:protein|p)s?$/i
11
+ roman /^[IV]+$/
12
+ greek_letter do |w| $inverse_greek[w.downcase] != nil end
13
+
14
+
15
+ # Some words for removal
16
+ stopword do |w| $stopwords.include?( w.downcase_first) end
17
+ gene /genes?/i
18
+ dna
19
+ cdna
20
+ rna
21
+ mrna
22
+ trna
23
+ cdna
24
+ component
25
+ exon
26
+ intron
27
+ domain
28
+ family
29
+
30
+
31
+ # Important words
32
+ number /^(?:\d+[.,]?\d+|\d)$/
33
+ greek do |w| $greek[w.downcase] != nil end
34
+ special do |w| w.is_special? end
35
+ promoter
36
+ similar /^(homolog.*|like|related|associated)$/
37
+ ase /ase$/
38
+ in_end /in$/
39
+ end
40
+
41
+ comparisons do
42
+
43
+ compare.number do |l1,l2|
44
+ v = 0
45
+ case
46
+ when l1.empty? && l2.empty?
47
+ v = 0
48
+ when l1.sort.uniq == l2.sort.uniq
49
+ v = 3
50
+ when l1.any? && l1[0] == l2[0]
51
+ v = -3
52
+ when l1.empty? && l2 == ['1']
53
+ v = -5
54
+ else
55
+ v = -10
56
+ end
57
+ v
58
+ end
59
+
60
+ diff.promoter -10
61
+ diff.receptor -10
62
+ diff.similar -10
63
+ diff.capital -10
64
+
65
+ same.unknown 1
66
+ miss.unknown -2
67
+ extr.unknown -2
68
+
69
+ same.greek 1
70
+ miss.greek -2
71
+ extr.greek -2
72
+
73
+ same.special 4
74
+ miss.special -3
75
+ extr.special -3
76
+
77
+ transform.receptor plural
78
+ transform.protein plural
79
+
80
+ transform.roman do |t| [t.arabic, :number] end
81
+ transform.greek_letter do |t| [$inverse_greek[t.downcase], :greek] end
82
+ transform.ase do |t| [t, :special] end
83
+ transform.in_end do |t| [t, :special] end
84
+ transform.unknown do |t| [t, (t.length < 4 ? :special : :unknown)] end
85
+ end
86
+
data/share/{stopwords → wordlists/stopwords}
File without changes
data/test/rbbt/bow/test_bow.rb
@@ -1,4 +1,4 @@
1
- require File.expand_path(File.dirname(__FILE__) + '/../../test_helper')
1
+ require File.join(File.expand_path(File.dirname(__FILE__)), '../../test_helper.rb')
2
2
  require 'rbbt/bow/bow'
3
3
  require 'test/unit'
4
4
 
data/test/rbbt/bow/test_dictionary.rb
@@ -1,4 +1,4 @@
1
- require File.dirname(__FILE__) + '/../../test_helper'
1
+ require File.join(File.expand_path(File.dirname(__FILE__)), '../../test_helper.rb')
2
2
  require 'rbbt/bow/dictionary'
3
3
  require 'rbbt/bow/bow'
4
4
  require 'test/unit'
data/test/rbbt/bow/test_misc.rb
@@ -1,4 +1,4 @@
1
- require File.expand_path(File.dirname(__FILE__) + '/../../test_helper')
1
+ require File.join(File.expand_path(File.dirname(__FILE__)), '../../test_helper.rb')
2
2
  require 'rbbt/bow/misc'
3
3
  require 'test/unit'
4
4
 
data/test/rbbt/corpus/test_corpus.rb
@@ -0,0 +1,99 @@
1
+ require File.join(File.expand_path(File.dirname(__FILE__)), '../../test_helper.rb')
2
+ require 'rbbt/corpus/corpus'
3
+ require 'rbbt/corpus/sources/pubmed'
4
+
5
+ class Document
6
+ define :sentences do
7
+ require 'rbbt/nlp/nlp'
8
+ NLP.geniass_sentence_splitter(text)
9
+ end
10
+
11
+ define :genes do
12
+ require 'rbbt/ner/abner'
13
+ Abner.new.entities(text)
14
+ end
15
+ end
16
+
17
+
18
+ class TestCorpus < Test::Unit::TestCase
19
+
20
+ def test_add_document
21
+ pmid = "19458159"
22
+
23
+ text = PubMed.get_article(pmid).text
24
+
25
+ corpus = Corpus.new(Rbbt.tmp.test.Corpus.find :user)
26
+
27
+ assert corpus.find(:pubmed, pmid).empty?
28
+
29
+ corpus.add_document(text, :pubmed, pmid, :abstract)
30
+
31
+ assert corpus.find(:pubmed, pmid).any?
32
+ assert corpus.find(:pubmed, pmid, :fulltext).empty?
33
+ assert corpus.find(:pubmed, pmid, :abstract).any?
34
+
35
+ assert corpus.find(:pubmed, pmid).first.text =~ /SENT/
36
+ end
37
+
38
+ def test_add_pmid
39
+ pmid = "19465387"
40
+ corpus = Corpus.new(Rbbt.tmp.test.Corpus.find :user)
41
+ corpus.add_pmid(pmid, :abstract)
42
+
43
+ assert corpus.exists? :pubmed, pmid
44
+ assert corpus.exists? :pubmed, pmid, :abstract
45
+ assert_equal false, corpus.exists?(:pubmed, pmid, :fulltext)
46
+ end
47
+
48
+ def test_find_all
49
+ corpus = Corpus.new(Rbbt.tmp.test.Corpus.find :user)
50
+ corpus.add_pmid("19458159", :abstract)
51
+ corpus.add_pmid("19465387", :abstract)
52
+
53
+ all = corpus.find
54
+
55
+ assert_equal 2, all.length
56
+ assert all.select{|document| document.id == "19458159"}.any?
57
+ assert all.select{|document| document.id == "19465387"}.any?
58
+ end
59
+
60
+ def test_doc_sentences
61
+ corpus = Corpus.new(Rbbt.tmp.test.Corpus.find :user)
62
+ corpus.add_pmid("19458159", :abstract)
63
+
64
+ sentences = corpus.find.collect{|doc| doc.sentences}.flatten
65
+
66
+ assert corpus.find.first.sentences.length > 0
67
+ assert corpus.find.first.sentences.sort_by{|s| s.offset}.first =~ /Semantic features in Text/i
68
+
69
+ corpus = Corpus.new(Rbbt.tmp.test.Corpus.find :user)
70
+ assert corpus.find.first.sentences.sort_by{|s| s.offset}.first =~ /Semantic features in Text/i
71
+ end
72
+
73
+ def test_doc_genes
74
+ corpus = Corpus.new(Rbbt.tmp.test.Corpus.find :user)
75
+ corpus.add_pmid("21611789", :abstract)
76
+
77
+ assert corpus.find(:pubmed, "21611789").first.genes.include? "CDKAL1"
78
+ end
79
+
80
+ def test_genes
81
+ corpus = Corpus.new(Rbbt.tmp.test.Corpus.find :user)
82
+ corpus.add_pmid("21611789", :abstract)
83
+
84
+ assert corpus.find.collect{|d| d.genes}.flatten.include? "CDKAL1"
85
+ end
86
+
87
+ def test_index
88
+ corpus = Corpus.new(Rbbt.tmp.test.Corpus.find :user)
89
+ corpus.add_pmid("21611789", :abstract)
90
+
91
+ document = corpus.find(:pubmed, "21611789").first
92
+
93
+ genes = corpus.find.collect{|d| d.genes}.flatten.select{|gene| gene == "CDKAL1"}
94
+
95
+ assert genes.collect{|gene|
96
+ document.sentences_at(gene.offset)
97
+ }.flatten.length > 1
98
+ end
99
+ end
data/test/rbbt/corpus/test_document.rb
@@ -0,0 +1,222 @@
1
+ require File.join(File.expand_path(File.dirname(__FILE__)), '../../test_helper.rb')
2
+ require 'rbbt/corpus/document'
3
+ require 'test/unit'
4
+
5
+ $persistence = TSV.new({})
6
+ $tchash_persistence = TCHash.get(Rbbt.tmp.test.document.persistence.find(:user), true, Persistence::TSV::TSVSerializer)
7
+ $global_persistence = TSV.new({}, :key => "ID", :fields => [ "Start", "End", "Info","Document ID", "Entity Type"])
8
+ $tchash_global_persistence = TSV.new(TCHash.get(Rbbt.tmp.test.global.persistence.find(:user), true, Persistence::TSV::StringArraySerializer), :key => "ID", :fields => [ "Start", "End", "Info","Document ID", "Entity Type"])
9
+
10
+ class Document
11
+ define :sentences do
12
+ require 'rbbt/nlp/nlp'
13
+ NLP.geniass_sentence_splitter(text)
14
+ end
15
+
16
+ define :tokens do
17
+ require 'rbbt/ner/annotations/token'
18
+ Token.tokenize(text)
19
+ end
20
+
21
+ define :long_words do
22
+ require 'rbbt/ner/annotations/token'
23
+ Token.tokenize(text).select{|tok| tok.length > 5}
24
+ end
25
+
26
+ define :short_words do
27
+ require 'rbbt/ner/annotations/token'
28
+ Token.tokenize(text).select{|tok| tok.length < 5}
29
+ end
30
+
31
+ define :even_words do
32
+ require 'rbbt/ner/annotations/token'
33
+ Token.tokenize(text).select{|tok| tok.length % 2 == 0}
34
+ end
35
+
36
+ define :missing do
37
+ []
38
+ end
39
+
40
+ define :tokens_again do
41
+ raise "This should be here already"
42
+ end
43
+
44
+ persist :sentences
45
+ persist_in_tsv :tokens
46
+ persist_in_tsv :long_words, $tchash_persistence, :Literal
47
+ persist_in_global_tsv :short_words, $global_persistence
48
+ persist_in_global_tsv :even_words, $tchash_global_persistence
49
+ persist_in_global_tsv :missing, $tchash_global_persistence
50
+ end
51
+
52
+ class TestDocument < Test::Unit::TestCase
53
+
54
+ def test_annotations
55
+
56
+ text =<<-EOF
57
+ This is a
58
+ sentence. This is
59
+ another sentence.
60
+ EOF
61
+
62
+ doc = Document.new
63
+ doc.text = text
64
+
65
+ assert_equal 2, doc.sentences.length
66
+ assert_equal 10, doc.tokens.length
67
+ end
68
+
69
+ def test_annotation_load
70
+ text =<<-EOF
71
+ This is a
72
+ sentence. This is
73
+ another sentence.
74
+ EOF
75
+
76
+ doc = Document.new
77
+ doc.text = text * 10
78
+
79
+ sentence = doc.sentences.last
80
+ doc.load_into sentence, :tokens
81
+ assert_equal 5, sentence.tokens.length
82
+ assert_equal "another", sentence.tokens[2]
83
+ assert_equal sentence.offset + 0, sentence.tokens[0].offset
84
+ end
85
+
86
+ def test_annotation_persistence
87
+ text =<<-EOF
88
+ This is a
89
+ sentence. This is
90
+ another sentence.
91
+ EOF
92
+
93
+ text *= 10
94
+
95
+ TmpFile.with_file do |dir|
96
+ FileUtils.mkdir_p dir
97
+
98
+ doc = Document.new(dir)
99
+ doc.text = text
100
+ doc.sentences
101
+
102
+ doc = Document.new(dir)
103
+ doc.text = text
104
+
105
+ sentence = doc.sentences.last
106
+ doc.load_into sentence, :tokens
107
+
108
+ assert_equal 5, sentence.tokens.length
109
+ assert_equal "another", sentence.tokens[2]
110
+ assert_equal sentence.offset + 0, sentence.tokens[0].offset
111
+ end
112
+ end
113
+
114
+ def test_range_persistence
115
+ text =<<-EOF
116
+ This is a
117
+ sentence. This is
118
+ another sentence.
119
+ EOF
120
+
121
+ text *= 10
122
+
123
+ TmpFile.with_file do |dir|
124
+ FileUtils.mkdir_p dir
125
+
126
+ doc = Document.new(dir)
127
+ doc.text = text
128
+
129
+ sentence = doc.sentences.last
130
+ Misc.benchmark(10) do
131
+ doc = Document.new(dir)
132
+ doc.text = text
133
+
134
+ doc.load_into sentence, :tokens, :persist => true
135
+ assert_equal 5, sentence.tokens.length
136
+ assert_equal "another", sentence.tokens[2]
137
+ assert_equal sentence.offset + 0, sentence.tokens[0].offset
138
+ assert_equal sentence.offset + 5, sentence.tokens[1].offset
139
+ end
140
+ end
141
+ end
142
+
143
+ def test_annotation_persistence_in_tsv
144
+ text =<<-EOF
145
+ This is a
146
+ sentence. This is
147
+ another sentence.
148
+ EOF
149
+
150
+ TmpFile.with_file do |dir|
151
+ FileUtils.mkdir_p dir
152
+
153
+
154
+ doc = Document.new(dir)
155
+ doc.text = text * 10
156
+ doc.sentences
157
+
158
+ doc = Document.new(dir)
159
+ doc.text = text * 10
160
+
161
+ sentence = doc.sentences.last
162
+
163
+ doc.load_into sentence, :tokens, :long_words
164
+
165
+ assert_equal 5, sentence.tokens.length
166
+ assert_equal "another", sentence.tokens[2]
167
+ assert_equal sentence.offset + 0, sentence.tokens[0].offset
168
+
169
+ assert_equal 2, sentence.long_words.length
170
+ assert_equal %w(another sentence), sentence.long_words
171
+ assert_equal sentence.offset + "This is ".length, sentence.long_words[0].offset
172
+ end
173
+ end
174
+
175
+ def test_annotation_persistence_in_global
176
+ text =<<-EOF
177
+ This is a
178
+ sentence. This is
179
+ another sentence.
180
+ EOF
181
+
182
+ TmpFile.with_file do |dir|
183
+ FileUtils.mkdir_p dir
184
+
185
+
186
+ doc = Document.new(dir)
187
+ doc.text = text * 10
188
+ doc.docid = "FOOF"
189
+ doc.short_words
190
+ doc.sentences
191
+
192
+ doc = Document.new(dir)
193
+ doc.text = text * 10
194
+ doc.docid = "FOOF"
195
+
196
+ sentence = doc.sentences.last
197
+
198
+ doc.load_into sentence, :tokens, :long_words, :short_words, :even_words
199
+
200
+ assert_equal 3, sentence.short_words.length
201
+ assert_equal 3, sentence.even_words.length
202
+ end
203
+ end
204
+
205
+ def test_dump
206
+ text =<<-EOF
207
+ This is a
208
+ sentence. This is
209
+ another sentence.
210
+ EOF
211
+
212
+ TmpFile.with_file do |dir|
213
+ FileUtils.mkdir_p dir
214
+
215
+ doc = Document.new(dir)
216
+ doc.text = text * 10
217
+ tsv = Document.tsv(doc.sentences, ["Literal"])
218
+ end
219
+ end
220
+ end
221
+
222
+