rbbt 1.2.5 → 2.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (88) hide show
  1. checksums.yaml +7 -0
  2. data/README.rdoc +2 -138
  3. metadata +69 -214
  4. data/LICENSE +0 -20
  5. data/bin/rbbt_config +0 -245
  6. data/install_scripts/classifier/R/classify.R +0 -36
  7. data/install_scripts/classifier/Rakefile +0 -140
  8. data/install_scripts/get_abner.sh +0 -2
  9. data/install_scripts/get_banner.sh +0 -25
  10. data/install_scripts/get_biocreative.sh +0 -72
  11. data/install_scripts/get_crf++.sh +0 -26
  12. data/install_scripts/get_entrez.sh +0 -4
  13. data/install_scripts/get_go.sh +0 -4
  14. data/install_scripts/get_polysearch.sh +0 -8
  15. data/install_scripts/ner/Rakefile +0 -206
  16. data/install_scripts/ner/config/default.rb +0 -52
  17. data/install_scripts/norm/Rakefile +0 -219
  18. data/install_scripts/norm/config/cue_default.rb +0 -10
  19. data/install_scripts/norm/config/tokens_default.rb +0 -86
  20. data/install_scripts/norm/functions.sh +0 -23
  21. data/install_scripts/organisms/Ath.Rakefile +0 -55
  22. data/install_scripts/organisms/Cal.Rakefile +0 -84
  23. data/install_scripts/organisms/Cel.Rakefile +0 -109
  24. data/install_scripts/organisms/Hsa.Rakefile +0 -140
  25. data/install_scripts/organisms/Mmu.Rakefile +0 -77
  26. data/install_scripts/organisms/Rakefile +0 -43
  27. data/install_scripts/organisms/Rno.Rakefile +0 -88
  28. data/install_scripts/organisms/Sce.Rakefile +0 -66
  29. data/install_scripts/organisms/Spo.Rakefile +0 -40
  30. data/install_scripts/organisms/rake-include.rb +0 -252
  31. data/install_scripts/wordlists/consonants +0 -897
  32. data/install_scripts/wordlists/stopwords +0 -1
  33. data/lib/rbbt.rb +0 -83
  34. data/lib/rbbt/bow/bow.rb +0 -88
  35. data/lib/rbbt/bow/classifier.rb +0 -116
  36. data/lib/rbbt/bow/dictionary.rb +0 -187
  37. data/lib/rbbt/ner/abner.rb +0 -34
  38. data/lib/rbbt/ner/banner.rb +0 -73
  39. data/lib/rbbt/ner/dictionaryNER.rb +0 -98
  40. data/lib/rbbt/ner/regexpNER.rb +0 -70
  41. data/lib/rbbt/ner/rner.rb +0 -227
  42. data/lib/rbbt/ner/rnorm.rb +0 -143
  43. data/lib/rbbt/ner/rnorm/cue_index.rb +0 -80
  44. data/lib/rbbt/ner/rnorm/tokens.rb +0 -217
  45. data/lib/rbbt/sources/biocreative.rb +0 -75
  46. data/lib/rbbt/sources/biomart.rb +0 -105
  47. data/lib/rbbt/sources/entrez.rb +0 -211
  48. data/lib/rbbt/sources/go.rb +0 -85
  49. data/lib/rbbt/sources/gscholar.rb +0 -74
  50. data/lib/rbbt/sources/organism.rb +0 -241
  51. data/lib/rbbt/sources/polysearch.rb +0 -117
  52. data/lib/rbbt/sources/pubmed.rb +0 -248
  53. data/lib/rbbt/util/arrayHash.rb +0 -266
  54. data/lib/rbbt/util/filecache.rb +0 -72
  55. data/lib/rbbt/util/index.rb +0 -47
  56. data/lib/rbbt/util/misc.rb +0 -106
  57. data/lib/rbbt/util/open.rb +0 -251
  58. data/lib/rbbt/util/rake.rb +0 -183
  59. data/lib/rbbt/util/simpleDSL.rb +0 -87
  60. data/lib/rbbt/util/tmpfile.rb +0 -35
  61. data/tasks/install.rake +0 -124
  62. data/test/rbbt/bow/test_bow.rb +0 -33
  63. data/test/rbbt/bow/test_classifier.rb +0 -72
  64. data/test/rbbt/bow/test_dictionary.rb +0 -91
  65. data/test/rbbt/ner/rnorm/test_cue_index.rb +0 -57
  66. data/test/rbbt/ner/rnorm/test_tokens.rb +0 -70
  67. data/test/rbbt/ner/test_abner.rb +0 -17
  68. data/test/rbbt/ner/test_banner.rb +0 -17
  69. data/test/rbbt/ner/test_dictionaryNER.rb +0 -122
  70. data/test/rbbt/ner/test_regexpNER.rb +0 -33
  71. data/test/rbbt/ner/test_rner.rb +0 -126
  72. data/test/rbbt/ner/test_rnorm.rb +0 -47
  73. data/test/rbbt/sources/test_biocreative.rb +0 -38
  74. data/test/rbbt/sources/test_biomart.rb +0 -31
  75. data/test/rbbt/sources/test_entrez.rb +0 -49
  76. data/test/rbbt/sources/test_go.rb +0 -24
  77. data/test/rbbt/sources/test_organism.rb +0 -59
  78. data/test/rbbt/sources/test_polysearch.rb +0 -27
  79. data/test/rbbt/sources/test_pubmed.rb +0 -39
  80. data/test/rbbt/util/test_arrayHash.rb +0 -257
  81. data/test/rbbt/util/test_filecache.rb +0 -37
  82. data/test/rbbt/util/test_index.rb +0 -31
  83. data/test/rbbt/util/test_misc.rb +0 -20
  84. data/test/rbbt/util/test_open.rb +0 -110
  85. data/test/rbbt/util/test_simpleDSL.rb +0 -57
  86. data/test/rbbt/util/test_tmpfile.rb +0 -21
  87. data/test/test_helper.rb +0 -4
  88. data/test/test_rbbt.rb +0 -11
@@ -1,105 +0,0 @@
1
- require 'rbbt/util/open'
2
- require 'rbbt'
3
-
4
# This module interacts with BioMart. It performs queries against the online
# BioMart web service and synthesises a hash with the results. Requests go
# through Open.read (from 'rbbt/util/open'); the original notes that Open
# caches by default, so to obtain up-to-date results you may need to clear the
# cache from previous queries.
module BioMart

  # Raised when the BioMart response contains "Query ERROR:".
  class QueryError < StandardError; end

  # Number of additional attributes sent in a single request. Together with
  # the main attribute this gives the three attributes per query that the
  # BioMart web service is documented (below) to accept.
  ATTRIBUTES_PER_REQUEST = 2

  # XML template for a query; the <!--...--> markers are substituted in
  # BioMart.get.
  @@biomart_query_xml = <<-EOT
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE Query>
<Query virtualSchemaName = "default" formatter = "TSV" header = "0" uniqueRows = "1" count = "" datasetConfigVersion = "0.6" >
<Dataset name = "<!--DATABASE-->" interface = "default" >
<!--FILTERS-->
<!--MAIN-->
<!--ATTRIBUTES-->
</Dataset>
</Query>
  EOT

  # Performs a single request for +main+ plus the attributes in +attrs+ and
  # merges the answer into +data+ (a new hash when nil), which is returned.
  #
  # When +filters+ is nil a "with_<main>" filter is used. Raises
  # BioMart::QueryError if the response contains "Query ERROR:".
  #
  # NOTE: the original source placed this method after a bare `private`, which
  # has no effect on `def self.` methods; it therefore stays publicly callable
  # so existing callers keep working.
  # NOTE(review): the XML is appended to the URL without percent-encoding,
  # exactly as before; whether Open.read escapes it is not visible from here.
  def self.get(database, main, attrs = nil, filters = nil, data = nil)
    attrs   ||= []
    filters ||= ["with_#{main}"]
    data    ||= {}

    filter_xml    = filters.collect { |name| "<Filter name = \"#{name}\" excluded = \"0\"/>" }.join("\n")
    attribute_xml = attrs.collect { |name| "<Attribute name = \"#{name}\"/>" }.join("\n")

    # The block form of sub inserts the text verbatim (a replacement *string*
    # would have its backslash sequences interpreted).
    query = @@biomart_query_xml.
      sub(/<!--DATABASE-->/)   { database }.
      sub(/<!--FILTERS-->/)    { filter_xml }.
      sub(/<!--MAIN-->/)       { "<Attribute name = \"#{main}\" />" }.
      sub(/<!--ATTRIBUTES-->/) { attribute_xml }

    response = Open.read('http://www.biomart.org/biomart/martservice?query=' + query.gsub(/\n/, ' '))
    raise BioMart::QueryError, response if response =~ /Query ERROR:/

    response.each_line do |line|
      fields = line.chomp.split(/\t/)
      key = fields.shift
      next if key.nil? || key.empty?

      row = (data[key] ||= {})
      attrs.each do |name|
        value = fields.shift
        row[name] ||= []
        # A missing column still leaves an (empty) list for the attribute.
        row[name] << value unless value.nil?
      end
    end

    data
  end

  # This method performs a query in BioMart for a dataset and a given set of
  # attributes. There must be a main attribute that will be used as the key in
  # the result hash; optionally there may be a list of additional attributes
  # and filters. The +data+ parameter is used internally to build the result
  # incrementally (the attributes are requested ATTRIBUTES_PER_REQUEST at a
  # time, due to a limitation of the BioMart WS that only allows 3 external
  # arguments); users normally should leave it unspecified or nil.
  #
  # The result is a hash whose keys are the different values of the main
  # attribute and whose values are hashes with every other attribute as key
  # and an array with all its values (for a given value of the main attribute
  # there may be more than one value for another attribute). If +filters+ is
  # left as nil a filter is added to remove results with the main attribute
  # empty; this may cause an error if the BioMart WS does not allow filtering
  # by that attribute.
  #
  # When no additional attributes are given, one request is still made so the
  # result contains every value of the main attribute (each mapped to an
  # empty hash).
  def self.query(database, main, attrs = nil, filters = nil, data = nil)
    attrs ||= []
    data  ||= {}

    chunks = attrs.each_slice(ATTRIBUTES_PER_REQUEST).to_a
    chunks << [] if chunks.empty?

    chunks.each { |chunk| data = get(database, main, chunk, filters, data) }

    data
  end

end
105
-
@@ -1,211 +0,0 @@
1
- require 'rbbt'
2
- require 'rbbt/util/open'
3
- require 'rbbt/util/tmpfile'
4
- require 'rbbt/util/filecache'
5
- require 'rbbt/bow/bow.rb'
6
- require 'set'
7
-
8
-
9
# This module is used to parse and extract information from the
# gene_info file at Entrez Gene, as well as from the gene2pubmed file.
# Both need to be downloaded and accessible for Rbbt, which is done as
# part of a normal installation.
module Entrez

  # Raised when a required Entrez data file is not found under Rbbt.datadir.
  class NoFileError < StandardError; end

  # Given a taxonomy, or set of taxonomies, it returns an inverse hash,
  # where each key is the entrez id of a gene (column 1 of gene_info), and
  # the value is an array of synonyms in other databases. It is mostly used
  # to translate entrez ids to the native database id of the organism.
  #
  # +native+ is the zero-based column index holding the '|'-separated
  # synonyms (5 by default). +fix+ and +check+ are callables used, if
  # present, to pre-process each synonym and to decide whether it is kept.
  #
  # Rows whose synonym column is '-' or missing are skipped.
  # Raises NoFileError when the gene_info file is not installed.
  def self.entrez2native(taxs, native = nil, fix = nil, check = nil)
    gene_info = File.join(Rbbt.datadir, 'dbs/entrez/gene_info')
    raise NoFileError, "Install the Entrez gene_info file" unless File.exist? gene_info

    native ||= 5
    lexicon = {}

    with_taxonomy_subset(gene_info, taxs) do |subset|
      File.foreach(subset) do |line|
        parts = line.chomp.split(/\t/)
        synonyms = parts[native]
        next if synonyms.nil? || synonyms == '-'

        entrez = parts[1]
        synonyms.split(/\|/).each do |id|
          id = fix.call(id) if fix
          next if check && !check.call(id)

          (lexicon[entrez] ||= []) << id
        end
      end
    end

    lexicon
  end

  # For a given taxonomy, or set of taxonomies, it returns a hash with
  # genes as keys and arrays of related PubMed ids as values, as
  # extracted from the gene2pubmed file from Entrez Gene.
  #
  # Raises NoFileError when the gene2pubmed file is not installed.
  def self.entrez2pubmed(taxs)
    gene2pubmed = File.join(Rbbt.datadir, 'dbs/entrez/gene2pubmed')
    raise NoFileError, "Install the Entrez gene2pubmed file" unless File.exist? gene2pubmed

    with_taxonomy_subset(gene2pubmed, taxs) do |subset|
      Open.to_hash(subset, :native => 1, :extra => 2).each { |_code, value_lists| value_lists.flatten! }
    end
  end

  # Writes the lines of +file+ whose first column is one of +taxs+ into a
  # temporary file, yields its path and returns the block's value. The
  # temporary file is removed afterwards, even if the block raises.
  #
  # grep is run without a shell (argument-array form of Kernel#system with
  # an :out redirection), so neither the path nor the taxonomies are subject
  # to shell quoting. The pattern itself is the same basic regular
  # expression the code has always used.
  def self.with_taxonomy_subset(file, taxs)
    taxs = [taxs] unless taxs.is_a?(Array)
    pattern = "^\\(#{taxs.collect { |t| t.to_s }.join('\\|')}\\)[[:space:]]"

    tmp = TmpFile.tmp_file("entrez-")
    begin
      # grep exits non-zero when nothing matches; the (empty) output file is
      # still created, so the status is deliberately ignored.
      system('grep', '-e', pattern, file, :out => tmp)
      yield tmp
    ensure
      FileUtils.rm_f tmp
    end
  end
  private_class_method :with_taxonomy_subset

  # This class parses an xml containing the information for a particular
  # gene as served by Entrez Gene, and holds some of its information.
  #
  # Every attribute is the raw result of String#scan with one capture group:
  # an array of one-element arrays. All attributes are nil when the object
  # is built from a nil xml.
  class Gene
    attr_reader :organism, :symbol, :description, :aka, :protnames, :summary, :comentaries

    def initialize(xml)
      return if xml.nil?

      @organism    = extract(xml, 'Org-ref_taxname')
      @symbol      = extract(xml, 'Gene-ref_locus')
      @description = extract(xml, 'Gene-ref_desc')
      @aka         = extract(xml, 'Gene-ref_syn_E')
      @protnames   = extract(xml, 'Prot-ref_name_E')
      @summary     = extract(xml, 'Entrezgene_summary')
      @comentaries = extract(xml, 'Gene-commentary_text')
    end

    # Joins the text from symbol, description, aka, protnames, and
    # summary (organism and commentaries are not included).
    def text
      [@symbol, @description, @aka, @protnames, @summary].flatten.join(". ")
    end

    private

    # Returns the contents of every <tag>...</tag> element found in +xml+.
    # `.` does not cross line breaks, so an element is only matched when it
    # sits on a single line; the match is non-greedy so several elements on
    # the same line are returned separately.
    def extract(xml, tag)
      xml.scan(/<#{tag}>(.*?)<\/#{tag}>/)
    end
  end

  # Time of the last request to Entrez and minimum number of seconds to
  # leave between two requests.
  @@last = Time.now
  @@entrez_lag = 1

  # Downloads the XML for one gene id, or an array of them, from the Entrez
  # efetch service, bypassing the Open cache. Sleeps as needed so that
  # requests are at least @@entrez_lag seconds apart.
  #
  # Returns the XML of the first gene for a single id, or a hash from id to
  # XML for an array.
  #
  # NOTE(review): records are paired with the requested ids by position, as
  # in the original code; this assumes efetch answers every id, in order.
  # NOTE: the original source placed this method after a bare `private`,
  # which does not affect `def self.` methods, so it remains callable.
  def self.get_online(geneids)
    geneids_list = geneids.is_a?(Array) ? geneids.join(',') : geneids.to_s
    url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=gene&retmode=xml&id=#{geneids_list}"

    elapsed = Time.now - @@last
    sleep(@@entrez_lag - elapsed) if elapsed < @@entrez_lag

    xml = Open.read(url, :quiet => true, :nocache => true)

    @@last = Time.now

    genes = xml.scan(/(<Entrezgene>.*?<\/Entrezgene>)/m).flatten
    return genes.first unless geneids.is_a? Array

    list = {}
    genes.each_with_index { |gene, i| list[geneids[i]] = gene }
    list
  end

  # Build a file name for a gene based on the id: the id prefixed by 'gene-'
  # and followed by '.xml', passed through FileCache.clean_path.
  def self.gene_filename(id)
    FileCache.clean_path('gene-' + id.to_s + '.xml')
  end

  # Returns a Gene object for the given Entrez Gene id. If an array of
  # ids is given instead, a hash from id to Gene is returned (nil ids are
  # ignored). Returns nil for a nil id. Gene XML is read from FileCache when
  # present; otherwise it is downloaded and added to the cache.
  def self.get_gene(geneid)
    return nil if geneid.nil?

    unless Array === geneid
      filename = gene_filename geneid
      cached = FileCache.path(filename)
      return Gene.new(Open.read(cached)) if File.exist? cached

      xml = get_online(geneid)
      # Nothing came back for this id: do not store an empty cache entry.
      FileCache.add_file(filename, xml) unless xml.nil?
      return Gene.new(xml)
    end

    list = {}
    missing = []

    geneid.each do |id|
      next if id.nil?
      cached = FileCache.path(gene_filename(id))
      if File.exist? cached
        list[id] = Gene.new(Open.read(cached))
      else
        missing << id
      end
    end

    return list if missing.empty?

    get_online(missing).each do |id, xml|
      filename = gene_filename id
      FileCache.add_file(filename, xml) unless File.exist? FileCache.path(filename)
      list[id] = Gene.new(xml)
    end

    list
  end

  # Counts the words in common between a chunk of text and the text
  # found in Entrez Gene for that particular gene, divided by the sum of the
  # number of distinct words on each side. The +gene+ may be a gene
  # identifier (String or Integer) or a Gene instance; anything else, or an
  # empty word set on either side, yields 0.
  #
  # Relies on String#words, which this file gets from 'rbbt/bow/bow.rb'.
  def self.gene_text_similarity(gene, text)
    gene_text =
      case gene
      when Entrez::Gene    then gene.text
      when String, Integer then get_gene(gene).text
      else return 0
      end

    gene_words = gene_text.words.to_set
    text_words = text.words.to_set

    return 0 if gene_words.empty? || text_words.empty?

    common = gene_words.intersection(text_words)
    common.length / (gene_words.length + text_words.length).to_f
  end
end
@@ -1,85 +0,0 @@
1
- require 'rbbt'
2
-
3
-
4
# This module holds helper methods to deal with the Gene Ontology files. It
# loads the terms of the gene_ontology.obo file and provides translations
# from a GO id to its name, namespace and direct is_a ancestors.
module GO

  # Parsed terms, keyed by GO id; nil until the file has been loaded.
  @@info = nil

  # Fields that may appear several times in a term; their values are
  # collected into an array instead of overwriting each other.
  MULTIPLE_VALUE_FIELDS = %w(is_a)

  # Loads 'dbs/go/gene_ontology.obo' from Rbbt.datadir. It is called
  # automatically the first time any lookup method is used.
  #
  # Only [Term] stanzas are read: the file header and other stanza types
  # (such as [Typedef]) are ignored, as are terms without an id. Every
  # "key: value" line of a term is stored; a repeated key keeps its last
  # value unless it is listed in MULTIPLE_VALUE_FIELDS.
  #
  # The result is only published once the whole file has been parsed, so a
  # failed load (e.g. missing file) is retried on the next call.
  def self.init
    obo = File.read(File.join(Rbbt.datadir, 'dbs/go/gene_ontology.obo'))

    terms = {}
    # Split in front of every line that is a stanza header such as "[Term]".
    obo.split(/^(?=\[\w+\]\s*$)/).each do |stanza|
      header, *lines = stanza.split(/\n/)
      next unless header && header.strip == '[Term]'

      term_info = {}
      lines.each do |line|
        fields = line.match(/(.*?):(.*)/)
        next if fields.nil?

        key   = fields[1].strip
        value = fields[2].strip
        if MULTIPLE_VALUE_FIELDS.include? key
          (term_info[key] ||= []) << value
        else
          term_info[key] = value
        end
      end

      terms[term_info["id"]] = term_info unless term_info["id"].nil?
    end

    @@info = terms
  end

  # Hash with all the terms: GO id => hash of the term's fields.
  def self.info
    self.init unless @@info
    @@info
  end

  # Array with the ids of all the terms.
  def self.goterms
    info.keys
  end

  # Name of a term, or nil if the id is unknown. For an array of ids it
  # returns an array of names (nil for unknown ids) in the same order.
  def self.id2name(id)
    term_field(id, 'name')
  end

  # GO ids of the direct is_a parents of a term; [] if the id is unknown or
  # has no is_a entries. For an array of ids it returns one array of parents
  # per id that has is_a entries; ids that are unknown or have none are left
  # out, so the result may be shorter than the input.
  def self.id2ancestors(id)
    if id.kind_of? Array
      info.values_at(*id).
        select { |term| term && term['is_a'] }.
        collect { |term| parent_ids(term['is_a']) }
    else
      term = info[id]
      return [] if term.nil? || term['is_a'].nil?
      parent_ids(term['is_a'])
    end
  end

  # Namespace of a term, or nil if the id is unknown. For an array of ids
  # it returns an array of namespaces (nil for unknown ids) in the same
  # order.
  def self.id2namespace(id)
    term_field(id, 'namespace')
  end

  # Looks up +field+ for one id or, position by position, for an array.
  def self.term_field(id, field)
    if id.kind_of? Array
      info.values_at(*id).collect { |term| term[field] if term }
    else
      term = info[id]
      term.nil? ? nil : term[field]
    end
  end
  private_class_method :term_field

  # Extracts the GO id from each is_a value ("GO:0048308 ! organelle ..."),
  # dropping the values that contain none.
  def self.parent_ids(is_a_values)
    is_a_values.collect { |value| value[/GO:\d+/] }.compact
  end
  private_class_method :parent_ids

end
@@ -1,74 +0,0 @@
1
require 'mechanize'
require 'uri'
2
-
3
-
4
# Helpers that look up an article by title in Google Scholar
# (scholar.google.es) and read data from the first result.
module GoogleScholar

  # Shared Mechanize agent, created on first use.
  def self.user_agent
    @@a ||= Mechanize.new
  end

  # Fetches the results page for +title+ (searched between single quotes)
  # and returns its first 'div[@class=gs_r]' node, or nil when there is
  # none. The title is form-encoded so characters such as '&', '#' or
  # spaces cannot alter the query string.
  def self.first_result(title)
    query = URI.encode_www_form_component("'#{title}'")
    page = user_agent.get("http://scholar.google.es/scholar?q=#{query}&hl=es&lr=&lr=")
    page.search('div[@class=gs_r]').first
  end
  private_class_method :first_result

  # Returns the "cited by" link of the first result for +title+: the first
  # link whose href matches scholar?cites and whose text ends in a number.
  # Returns nil if there are no results or no such link.
  def self.citation_link(title)
    article = first_result(title)
    return nil if article.nil?

    article.search('a').find { |link| link['href'] =~ /scholar\?cites/ && link.inner_html =~ /\d+$/ }
  end

  # Returns the href of the first link in the first result for +title+ that
  # ends in .pdf or contains type=pdf; nil if there are no results or no
  # such link.
  def self.full_text_url(title)
    article = first_result(title)
    return nil if article.nil?

    link = article.search('a').find { |candidate|
      candidate['href'] =~ /\.pdf$/ || candidate['href'] =~ /type=pdf/
    }

    link && link['href']
  end

  # Number at the end of the citation link's text for +title+; 0 when there
  # is no citation link.
  def self.number_cites(title)
    link = citation_link title
    return 0 if link.nil?

    link.inner_html[/(\d+)$/, 1].to_i
  end

end
51
-
52
-
53
- #def get_citers(title)
54
- # puts title
55
- # citation_link = nil
56
- #
57
- # # Get citation page
58
- # $a.get("http://scholar.google.es/scholar?q='#{ title }'&hl=es&lr=&lr=") do |page|
59
- # citation_link = page.search('div[@class=gs_r]').first.search('a').select{|link| link['href'] =~ /scholar\?cites/ && link.inner_html =~ /\d+$/ }.first
60
- # end
61
- #
62
- # return [] if citation_link.nil?
63
- #
64
- # # Parse citations
65
- #
66
- # citers = []
67
- # $a.get("http://scholar.google.es" + citation_link['href']) do |page|
68
- # citers = page.search('div[@class=gs_r]').collect do |entry|
69
- # entry.search('h3').first.search('a').first.inner_html
70
- # end
71
- # end
72
- #
73
- # return citers
74
- #end