rbbt 1.2.5 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (88) hide show
  1. checksums.yaml +7 -0
  2. data/README.rdoc +2 -138
  3. metadata +69 -214
  4. data/LICENSE +0 -20
  5. data/bin/rbbt_config +0 -245
  6. data/install_scripts/classifier/R/classify.R +0 -36
  7. data/install_scripts/classifier/Rakefile +0 -140
  8. data/install_scripts/get_abner.sh +0 -2
  9. data/install_scripts/get_banner.sh +0 -25
  10. data/install_scripts/get_biocreative.sh +0 -72
  11. data/install_scripts/get_crf++.sh +0 -26
  12. data/install_scripts/get_entrez.sh +0 -4
  13. data/install_scripts/get_go.sh +0 -4
  14. data/install_scripts/get_polysearch.sh +0 -8
  15. data/install_scripts/ner/Rakefile +0 -206
  16. data/install_scripts/ner/config/default.rb +0 -52
  17. data/install_scripts/norm/Rakefile +0 -219
  18. data/install_scripts/norm/config/cue_default.rb +0 -10
  19. data/install_scripts/norm/config/tokens_default.rb +0 -86
  20. data/install_scripts/norm/functions.sh +0 -23
  21. data/install_scripts/organisms/Ath.Rakefile +0 -55
  22. data/install_scripts/organisms/Cal.Rakefile +0 -84
  23. data/install_scripts/organisms/Cel.Rakefile +0 -109
  24. data/install_scripts/organisms/Hsa.Rakefile +0 -140
  25. data/install_scripts/organisms/Mmu.Rakefile +0 -77
  26. data/install_scripts/organisms/Rakefile +0 -43
  27. data/install_scripts/organisms/Rno.Rakefile +0 -88
  28. data/install_scripts/organisms/Sce.Rakefile +0 -66
  29. data/install_scripts/organisms/Spo.Rakefile +0 -40
  30. data/install_scripts/organisms/rake-include.rb +0 -252
  31. data/install_scripts/wordlists/consonants +0 -897
  32. data/install_scripts/wordlists/stopwords +0 -1
  33. data/lib/rbbt.rb +0 -83
  34. data/lib/rbbt/bow/bow.rb +0 -88
  35. data/lib/rbbt/bow/classifier.rb +0 -116
  36. data/lib/rbbt/bow/dictionary.rb +0 -187
  37. data/lib/rbbt/ner/abner.rb +0 -34
  38. data/lib/rbbt/ner/banner.rb +0 -73
  39. data/lib/rbbt/ner/dictionaryNER.rb +0 -98
  40. data/lib/rbbt/ner/regexpNER.rb +0 -70
  41. data/lib/rbbt/ner/rner.rb +0 -227
  42. data/lib/rbbt/ner/rnorm.rb +0 -143
  43. data/lib/rbbt/ner/rnorm/cue_index.rb +0 -80
  44. data/lib/rbbt/ner/rnorm/tokens.rb +0 -217
  45. data/lib/rbbt/sources/biocreative.rb +0 -75
  46. data/lib/rbbt/sources/biomart.rb +0 -105
  47. data/lib/rbbt/sources/entrez.rb +0 -211
  48. data/lib/rbbt/sources/go.rb +0 -85
  49. data/lib/rbbt/sources/gscholar.rb +0 -74
  50. data/lib/rbbt/sources/organism.rb +0 -241
  51. data/lib/rbbt/sources/polysearch.rb +0 -117
  52. data/lib/rbbt/sources/pubmed.rb +0 -248
  53. data/lib/rbbt/util/arrayHash.rb +0 -266
  54. data/lib/rbbt/util/filecache.rb +0 -72
  55. data/lib/rbbt/util/index.rb +0 -47
  56. data/lib/rbbt/util/misc.rb +0 -106
  57. data/lib/rbbt/util/open.rb +0 -251
  58. data/lib/rbbt/util/rake.rb +0 -183
  59. data/lib/rbbt/util/simpleDSL.rb +0 -87
  60. data/lib/rbbt/util/tmpfile.rb +0 -35
  61. data/tasks/install.rake +0 -124
  62. data/test/rbbt/bow/test_bow.rb +0 -33
  63. data/test/rbbt/bow/test_classifier.rb +0 -72
  64. data/test/rbbt/bow/test_dictionary.rb +0 -91
  65. data/test/rbbt/ner/rnorm/test_cue_index.rb +0 -57
  66. data/test/rbbt/ner/rnorm/test_tokens.rb +0 -70
  67. data/test/rbbt/ner/test_abner.rb +0 -17
  68. data/test/rbbt/ner/test_banner.rb +0 -17
  69. data/test/rbbt/ner/test_dictionaryNER.rb +0 -122
  70. data/test/rbbt/ner/test_regexpNER.rb +0 -33
  71. data/test/rbbt/ner/test_rner.rb +0 -126
  72. data/test/rbbt/ner/test_rnorm.rb +0 -47
  73. data/test/rbbt/sources/test_biocreative.rb +0 -38
  74. data/test/rbbt/sources/test_biomart.rb +0 -31
  75. data/test/rbbt/sources/test_entrez.rb +0 -49
  76. data/test/rbbt/sources/test_go.rb +0 -24
  77. data/test/rbbt/sources/test_organism.rb +0 -59
  78. data/test/rbbt/sources/test_polysearch.rb +0 -27
  79. data/test/rbbt/sources/test_pubmed.rb +0 -39
  80. data/test/rbbt/util/test_arrayHash.rb +0 -257
  81. data/test/rbbt/util/test_filecache.rb +0 -37
  82. data/test/rbbt/util/test_index.rb +0 -31
  83. data/test/rbbt/util/test_misc.rb +0 -20
  84. data/test/rbbt/util/test_open.rb +0 -110
  85. data/test/rbbt/util/test_simpleDSL.rb +0 -57
  86. data/test/rbbt/util/test_tmpfile.rb +0 -21
  87. data/test/test_helper.rb +0 -4
  88. data/test/test_rbbt.rb +0 -11
@@ -1,105 +0,0 @@
1
- require 'rbbt/util/open'
2
- require 'rbbt'
3
-
4
# This module interacts with BioMart. It performs queries to BioMart and
# synthesises a hash with the results. Note that this module connects to the
# online BioMart WS using the Open module from 'rbbt/util/open', which offers
# caching by default. To obtain up to date results you may need to clear the
# cache from previous queries.
module BioMart

  # Raised when the BioMart response contains the 'Query ERROR:' marker. The
  # exception message is the full response text.
  class QueryError < StandardError; end

  # Template of the XML query sent to the web service. The <!--...-->
  # placeholders are substituted by BioMart.get.
  @@biomart_query_xml = [
    '<?xml version="1.0" encoding="UTF-8"?>',
    '<!DOCTYPE Query>',
    '<Query virtualSchemaName = "default" formatter = "TSV" header = "0" uniqueRows = "1" count = "" datasetConfigVersion = "0.6" >',
    '<Dataset name = "<!--DATABASE-->" interface = "default" >',
    '<!--FILTERS-->',
    '<!--MAIN-->',
    '<!--ATTRIBUTES-->',
    '</Dataset>',
    '</Query>',
    ''
  ].join("\n")

  # Performs a single request for +main+ plus the attributes in +attrs+ and
  # merges the rows into +data+ (a new hash when nil), which is returned.
  #
  # database:: name of the BioMart dataset.
  # main::     attribute used as key of the result hash.
  # attrs::    extra attribute names, in the order the columns are expected
  #            in the response (defaults to none).
  # filters::  filter names; defaults to ["with_<main>"].
  # data::     hash to extend: { main_value => { attribute => [values] } }.
  #
  # Raises BioMart::QueryError if the response contains 'Query ERROR:'.
  #
  # NOTE(review): the query XML is appended to the URL without percent
  # encoding, exactly as before; whether Open.read escapes it is not visible
  # from this file -- confirm before changing the URL format.
  def self.get(database, main, attrs = nil, filters = nil, data = nil)
    attrs   ||= []
    filters ||= ["with_#{main}"]
    data    ||= {}

    filter_xml    = filters.collect { |name| "<Filter name = \"#{ name }\" excluded = \"0\"/>" }.join("\n")
    attribute_xml = attrs.collect { |name| "<Attribute name = \"#{ name }\"/>" }.join("\n")
    main_xml      = "<Attribute name = \"#{main}\" />"

    # Block form of sub!: the replacement is inserted verbatim, so a
    # backslash in a name is never interpreted as a backreference.
    query = @@biomart_query_xml.clone
    query.sub!(/<!--DATABASE-->/)   { database }
    query.sub!(/<!--FILTERS-->/)    { filter_xml }
    query.sub!(/<!--MAIN-->/)       { main_xml }
    query.sub!(/<!--ATTRIBUTES-->/) { attribute_xml }

    response = Open.read('http://www.biomart.org/biomart/martservice?query=' + query.gsub(/\n/,' '))
    raise BioMart::QueryError, response if response =~ /Query ERROR:/

    response.each_line do |line|
      fields = line.chomp.split(/\t/)
      key    = fields.shift
      next if key.nil? || key.empty?

      row = (data[key] ||= {})
      attrs.each do |name|
        value = fields.shift
        # Every requested attribute gets a list, even when the row has no
        # value for it.
        row[name] ||= []
        row[name] << value unless value.nil?
      end
    end

    data
  end

  # This method performs a query in BioMart for a dataset and a given set of
  # attributes. There must be a main attribute that will be used as the key in
  # the result hash; optionally there may be a list of additional attributes
  # and filters. The data parameter at the end is used internally to
  # incrementally build the result: the attributes are requested two at a
  # time (plus the main one) due to a limitation of the BioMart WS that only
  # allows 3 external arguments, so users normally should leave it unspecified
  # or nil. The result is a hash, where the keys are the different values for
  # the main attribute, and the value is a hash with every other attribute as
  # key, and as value an array with all possible values (note that for a given
  # value of the main attribute, there may be more than one value for another
  # attribute). If filters is left as nil it adds a filter to the BioMart
  # query to remove results with the main attribute empty; this may cause an
  # error if the BioMart WS does not allow filtering with that attribute.
  #
  # When no additional attributes are given a single request is still made,
  # so the result contains every value of the main attribute mapped to an
  # empty hash.
  def self.query(database, main, attrs = nil, filters = nil, data = nil)
    attrs ||= []
    data  ||= {}

    chunks = []
    attrs.each_slice(2) { |pair| chunks << pair }
    # Without extra attributes there would be nothing to iterate over and the
    # main attribute would never be fetched.
    chunks << [] if chunks.empty?

    chunks.each do |chunk_attrs|
      data = get(database, main, chunk_attrs, filters, data)
    end

    data
  end

end
105
-
@@ -1,211 +0,0 @@
1
- require 'rbbt'
2
- require 'rbbt/util/open'
3
- require 'rbbt/util/tmpfile'
4
- require 'rbbt/util/filecache'
5
- require 'rbbt/bow/bow.rb'
6
- require 'set'
7
-
8
-
9
# This module is used to parse and extract information from the
# gene_info file at Entrez Gene, as well as from the gene2pubmed file.
# Both need to be downloaded and accessible for Rbbt, which is done as
# part of a normal installation.
module Entrez

  # Raised when a required Entrez data file is not found under Rbbt.datadir.
  class NoFileError < StandardError; end

  # Yields every line of the file at +path+ whose first column is one of the
  # taxonomy ids in +taxs+ (a single id or an array; ids are compared as
  # strings). The first column must be followed by a tab or a space.
  def self.each_taxonomy_line(path, taxs)
    taxs     = [taxs] unless taxs.is_a?(Array)
    prefixes = taxs.collect { |t| ["#{t}\t", "#{t} "] }.flatten

    File.open(path) do |file|
      file.each_line { |line| yield line if line.start_with?(*prefixes) }
    end
  end
  private_class_method :each_taxonomy_line

  # Given a taxonomy, or set of taxonomies, it returns an inverse hash,
  # where each key is the entrez id of a gene, and the value is an array
  # of possible synonyms in other databases. It is mostly used to translate
  # entrez ids to the native database id of the organism. The parameter
  # +native+ specifies the position of the column containing the synonyms
  # (5 by default, counting from 0); +fix+ and +check+ are Procs used, if
  # present, to pre-process each synonym and to decide whether it should be
  # kept. Rows whose synonym column is missing or '-' are skipped.
  #
  # Raises NoFileError if the gene_info file is not installed.
  def self.entrez2native(taxs, native = nil, fix = nil, check = nil)
    gene_info = File.join(Rbbt.datadir, 'dbs/entrez/gene_info')
    raise NoFileError, "Install the Entrez gene_info file" unless File.exist?(gene_info)

    native ||= 5

    lexicon = {}
    each_taxonomy_line(gene_info, taxs) do |line|
      parts    = line.chomp.split(/\t/)
      synonyms = parts[native]
      next if synonyms.nil? || synonyms == '-'

      entrez = parts[1]
      synonyms.split(/\|/).each do |id|
        id = fix.call(id) if fix
        next if check && !check.call(id)

        (lexicon[entrez] ||= []) << id
      end
    end

    lexicon
  end

  # For a given taxonomy, or set of taxonomies, it returns a hash with
  # genes as keys and arrays of related PubMed ids as values, as
  # extracted from the gene2pubmed file from Entrez Gene.
  #
  # The selected rows are written to a temporary file that is parsed with
  # Open.to_hash; the temporary file is removed even if parsing fails.
  #
  # Raises NoFileError if the gene2pubmed file is not installed.
  def self.entrez2pubmed(taxs)
    gene2pubmed = File.join(Rbbt.datadir, 'dbs/entrez/gene2pubmed')
    raise NoFileError, "Install the Entrez gene2pubmed file" unless File.exist?(gene2pubmed)

    tmp = TmpFile.tmp_file("entrez-")
    begin
      File.open(tmp, 'w') do |out|
        each_taxonomy_line(gene2pubmed, taxs) { |line| out << line }
      end

      Open.to_hash(tmp, :native => 1, :extra => 2).each { |code, value_lists| value_lists.flatten! }
    ensure
      File.delete(tmp) if File.exist?(tmp)
    end
  end

  # This class parses an xml containing the information for a particular
  # gene as served by Entrez Gene, and holds some of its information. Each
  # attribute is the result of String#scan for the corresponding element:
  # an array with one single-element array per occurrence. All attributes
  # are nil when the object is built from a nil xml.
  class Gene
    attr_reader :organism, :symbol, :description, :aka, :protnames, :summary, :comentaries

    def initialize(xml)
      return if xml.nil?

      @organism    = element_texts(xml, 'Org-ref_taxname')
      @symbol      = element_texts(xml, 'Gene-ref_locus')
      @description = element_texts(xml, 'Gene-ref_desc')
      @aka         = element_texts(xml, 'Gene-ref_syn_E')
      @protnames   = element_texts(xml, 'Prot-ref_name_E')
      @summary     = element_texts(xml, 'Entrezgene_summary')
      @comentaries = element_texts(xml, 'Gene-commentary_text')
    end

    # Joins the text from symbol, description, aka, protnames, and
    # summary. Returns an empty string when nothing was parsed.
    def text
      #[@organism, @symbol, @description, @aka, @protnames, @summary,@comentaries.join(". ")].join(". ")
      [@symbol, @description, @aka, @protnames, @summary].flatten.compact.join(". ")
    end

    private

    # Text of every <tag>...</tag> element in +xml+. The match is non-greedy
    # and uses /m so that '.' also matches newlines (in Ruby the /s flag
    # selects the Windows-31J encoding, it is not a "dot matches all" flag).
    def element_texts(xml, tag)
      name = Regexp.escape(tag)
      xml.scan(/<#{name}>(.*?)<\/#{name}>/m)
    end
  end

  # Time of the last online request and minimum number of seconds between
  # two consecutive requests.
  @@last = Time.now
  @@entrez_lag = 1

  # Downloads the xml for one gene id, or for an array of them, from the
  # Entrez efetch service, waiting if needed so that requests are at least
  # @@entrez_lag seconds apart. Returns the xml of the first gene for a
  # single id, or a hash from id to xml for an array.
  #
  # NOTE(review): for arrays the records are paired with the requested ids
  # by position; if the service omits or reorders a record the pairing will
  # be off. The id could be read from <Gene-track_geneid> instead -- confirm
  # the intended behaviour before changing it.
  def self.get_online(geneids)
    geneids_list = ( geneids.is_a?(Array) ? geneids.join(',') : geneids.to_s )
    url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=gene&retmode=xml&id=#{geneids_list}"

    elapsed = Time.now - @@last
    sleep(@@entrez_lag - elapsed) unless elapsed > @@entrez_lag

    xml = Open.read(url, :quiet => true, :nocache => true)

    @@last = Time.now

    genes = xml.scan(/(<Entrezgene>.*?<\/Entrezgene>)/m).flatten
    return genes.first unless geneids.is_a?(Array)

    list = {}
    genes.each_with_index { |gene, i| list[geneids[i]] = gene }
    list
  end

  # Build a file name for a gene based on the id: prefix the id with 'gene-',
  # add a '.xml' extension and pass the result through FileCache.clean_path.
  def self.gene_filename(id)
    FileCache.clean_path('gene-' + id.to_s + '.xml')
  end

  # Returns a Gene object for the given Entrez Gene id, or nil for a nil id.
  # If an array of ids is given instead, a hash from id to Gene is returned
  # (nil ids are ignored). Gene xml files found in the FileCache are reused;
  # the rest are downloaded with get_online and added to the cache.
  def self.get_gene(geneid)
    return nil if geneid.nil?

    unless Array === geneid
      filename = gene_filename(geneid)
      cached   = FileCache.path(filename)
      return Gene.new(Open.read(cached)) if File.exist?(cached)

      xml = get_online(geneid)
      FileCache.add_file(filename, xml)
      return Gene.new(xml)
    end

    list    = {}
    missing = []
    geneid.each do |id|
      next if id.nil?

      cached = FileCache.path(gene_filename(id))
      if File.exist?(cached)
        list[id] = Gene.new(Open.read(cached))
      else
        missing << id
      end
    end

    return list unless missing.any?

    get_online(missing).each do |id, xml|
      filename = gene_filename(id)
      FileCache.add_file(filename, xml) unless File.exist?(FileCache.path(filename))
      list[id] = Gene.new(xml)
    end

    list
  end

  # Measures the words in common between a chunk of text and the text
  # found in Entrez Gene for that particular gene: the number of shared
  # words divided by the sum of the sizes of both word sets. The +gene+ may
  # be a gene identifier (String or Integer) or a Gene instance; for
  # anything else, or when either word set is empty, 0 is returned.
  def self.gene_text_similarity(gene, text)
    case gene
    when Entrez::Gene
      gene_text = gene.text
    when String, Integer
      gene_text = get_gene(gene).text
    else
      return 0
    end

    gene_words = gene_text.words.to_set
    text_words = text.words.to_set

    return 0 if gene_words.empty? || text_words.empty?

    common = gene_words.intersection(text_words)
    common.length / (gene_words.length + text_words.length).to_f
  end
end
@@ -1,85 +0,0 @@
1
- require 'rbbt'
2
-
3
-
4
# This module holds helper methods to deal with the Gene Ontology files. It
# loads the [Term] stanzas of the gene_ontology.obo file and provides
# translations from GO ids to names, namespaces and direct 'is_a' parents.
module GO

  # Hash from term id to the fields of the term; nil until GO.init is run.
  @@info = nil

  # Fields that may appear several times in a term; their values are
  # collected in an array instead of being stored as a single string.
  MULTIPLE_VALUE_FIELDS = %w(is_a).freeze

  # This method needs to be called before any translations can be made; it is
  # called automatically the first time any of the query methods is used. It
  # loads the gene_ontology.obo file found under Rbbt.datadir and extracts
  # all the 'key: value' fields of every [Term] stanza. The file header and
  # the stanzas of any other type (e.g. [Typedef]) are ignored, as are terms
  # without an id. Returns the resulting hash.
  def self.init
    @@info = {}
    term   = nil # fields of the [Term] being read; nil outside a [Term]

    File.open(File.join(Rbbt.datadir, 'dbs/go/gene_ontology.obo')) do |obo|
      obo.each_line do |line|
        line = line.chomp
        if line =~ /\A\[([^\]]+)\]\s*\z/
          # A stanza header closes the previous stanza.
          register_term(term)
          term = ($1 == 'Term') ? {} : nil
        elsif term && line =~ /(.*?):(.*)/
          key, value = $1.strip, $2.strip
          if MULTIPLE_VALUE_FIELDS.include? key
            (term[key] ||= []) << value
          else
            term[key] = value
          end
        end
      end
    end
    register_term(term)

    @@info
  end

  # Stores a parsed term under its id; ignores nil and terms without id.
  def self.register_term(term)
    @@info[term['id']] = term if term && term['id']
  end
  private_class_method :register_term

  # GO ids referenced in the 'is_a' values of +term+.
  def self.parent_ids(term)
    term['is_a'].collect { |reference| reference[/GO:\d+/] }.compact
  end
  private_class_method :parent_ids

  # Hash from term id to a hash with the fields of the term.
  def self.info
    self.init unless @@info
    @@info
  end

  # Ids of all the terms loaded.
  def self.goterms
    info.keys
  end

  # Name of the term with the given id, or nil if the id is unknown. For an
  # array of ids it returns an array of names (nil for unknown ids).
  def self.id2name(id)
    terms = info
    if id.kind_of? Array
      terms.values_at(*id).collect { |term| term['name'] if term }
    else
      return nil if terms[id].nil?
      terms[id]['name']
    end
  end

  # Ids of the direct 'is_a' parents of the term with the given id; an empty
  # array if the id is unknown or the term has no parents. For an array of
  # ids it returns an array with one list of parents for each id that is
  # known and has parents; the rest are left out, so the positions may not
  # correspond to those in the input.
  def self.id2ancestors(id)
    terms = info
    if id.kind_of? Array
      terms.values_at(*id).
        select { |term| term && term['is_a'] }.
        collect { |term| parent_ids(term) }
    else
      return [] if terms[id].nil? || terms[id]['is_a'].nil?
      parent_ids(terms[id])
    end
  end

  # Namespace of the term with the given id, or nil if the id is unknown.
  # For an array of ids it returns an array of namespaces (nil for unknown
  # ids).
  def self.id2namespace(id)
    terms = info
    if id.kind_of? Array
      terms.values_at(*id).collect { |term| term['namespace'] if term }
    else
      return nil if terms[id].nil?
      terms[id]['namespace']
    end
  end

end
@@ -1,74 +0,0 @@
1
- require 'mechanize'
2
-
3
-
4
require 'cgi'

# Helpers that look up an article by title in Google Scholar, using
# Mechanize, and inspect the first result of the search.
module GoogleScholar

  # Mechanize agent shared by all the requests; created on first use.
  def self.user_agent
    @@a ||= Mechanize.new
  end

  # Searches for +title+ and returns the first 'div' element of class 'gs_r'
  # of the result page, or nil if there is none. The title is escaped with
  # CGI.escape so that characters such as '&', '#' or '+' do not alter the
  # query string.
  def self.first_result(title)
    page = user_agent.get("http://scholar.google.es/scholar?q='#{ CGI.escape(title.to_s) }'&hl=es&lr=&lr=")
    page.search('div[@class=gs_r]').first
  end
  private_class_method :first_result

  # Returns the link, in the first result for +title+, that points to the
  # list of citing articles: the first 'a' element whose href contains
  # 'scholar?cites' and whose text ends in a number. Returns nil if there is
  # no result or no such link.
  def self.citation_link(title)
    article = first_result(title)
    return nil if article.nil?

    article.search('a').find { |link| link['href'] =~ /scholar\?cites/ && link.inner_html =~ /\d+$/ }
  end

  # Returns the href of the first link, in the first result for +title+,
  # that ends in '.pdf' or contains 'type=pdf'. Returns nil if there is no
  # result or no such link.
  def self.full_text_url(title)
    article = first_result(title)
    return nil if article.nil?

    link = article.search('a').find { |candidate|
      candidate['href'] =~ /\.pdf$/ || candidate['href'] =~ /type=pdf/
    }
    return nil if link.nil?

    link['href']
  end

  # Number at the end of the text of the citation link for +title+, as an
  # Integer; 0 when there is no citation link.
  def self.number_cites(title)
    link = citation_link title
    return 0 if link.nil?

    link.inner_html[/(\d+)$/, 1].to_i
  end

end
51
-
52
-
53
- #def get_citers(title)
54
- # puts title
55
- # citation_link = nil
56
- #
57
- # # Get citation page
58
- # $a.get("http://scholar.google.es/scholar?q='#{ title }'&hl=es&lr=&lr=") do |page|
59
- # citation_link = page.search('div[@class=gs_r]').first.search('a').select{|link| link['href'] =~ /scholar\?cites/ && link.inner_html =~ /\d+$/ }.first
60
- # end
61
- #
62
- # return [] if citation_link.nil?
63
- #
64
- # # Parse citations
65
- #
66
- # citers = []
67
- # $a.get("http://scholar.google.es" + citation_link['href']) do |page|
68
- # citers = page.search('div[@class=gs_r]').collect do |entry|
69
- # entry.search('h3').first.search('a').first.inner_html
70
- # end
71
- # end
72
- #
73
- # return citers
74
- #end