rbbt 1.2.5 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (88) hide show
  1. checksums.yaml +7 -0
  2. data/README.rdoc +2 -138
  3. metadata +69 -214
  4. data/LICENSE +0 -20
  5. data/bin/rbbt_config +0 -245
  6. data/install_scripts/classifier/R/classify.R +0 -36
  7. data/install_scripts/classifier/Rakefile +0 -140
  8. data/install_scripts/get_abner.sh +0 -2
  9. data/install_scripts/get_banner.sh +0 -25
  10. data/install_scripts/get_biocreative.sh +0 -72
  11. data/install_scripts/get_crf++.sh +0 -26
  12. data/install_scripts/get_entrez.sh +0 -4
  13. data/install_scripts/get_go.sh +0 -4
  14. data/install_scripts/get_polysearch.sh +0 -8
  15. data/install_scripts/ner/Rakefile +0 -206
  16. data/install_scripts/ner/config/default.rb +0 -52
  17. data/install_scripts/norm/Rakefile +0 -219
  18. data/install_scripts/norm/config/cue_default.rb +0 -10
  19. data/install_scripts/norm/config/tokens_default.rb +0 -86
  20. data/install_scripts/norm/functions.sh +0 -23
  21. data/install_scripts/organisms/Ath.Rakefile +0 -55
  22. data/install_scripts/organisms/Cal.Rakefile +0 -84
  23. data/install_scripts/organisms/Cel.Rakefile +0 -109
  24. data/install_scripts/organisms/Hsa.Rakefile +0 -140
  25. data/install_scripts/organisms/Mmu.Rakefile +0 -77
  26. data/install_scripts/organisms/Rakefile +0 -43
  27. data/install_scripts/organisms/Rno.Rakefile +0 -88
  28. data/install_scripts/organisms/Sce.Rakefile +0 -66
  29. data/install_scripts/organisms/Spo.Rakefile +0 -40
  30. data/install_scripts/organisms/rake-include.rb +0 -252
  31. data/install_scripts/wordlists/consonants +0 -897
  32. data/install_scripts/wordlists/stopwords +0 -1
  33. data/lib/rbbt.rb +0 -83
  34. data/lib/rbbt/bow/bow.rb +0 -88
  35. data/lib/rbbt/bow/classifier.rb +0 -116
  36. data/lib/rbbt/bow/dictionary.rb +0 -187
  37. data/lib/rbbt/ner/abner.rb +0 -34
  38. data/lib/rbbt/ner/banner.rb +0 -73
  39. data/lib/rbbt/ner/dictionaryNER.rb +0 -98
  40. data/lib/rbbt/ner/regexpNER.rb +0 -70
  41. data/lib/rbbt/ner/rner.rb +0 -227
  42. data/lib/rbbt/ner/rnorm.rb +0 -143
  43. data/lib/rbbt/ner/rnorm/cue_index.rb +0 -80
  44. data/lib/rbbt/ner/rnorm/tokens.rb +0 -217
  45. data/lib/rbbt/sources/biocreative.rb +0 -75
  46. data/lib/rbbt/sources/biomart.rb +0 -105
  47. data/lib/rbbt/sources/entrez.rb +0 -211
  48. data/lib/rbbt/sources/go.rb +0 -85
  49. data/lib/rbbt/sources/gscholar.rb +0 -74
  50. data/lib/rbbt/sources/organism.rb +0 -241
  51. data/lib/rbbt/sources/polysearch.rb +0 -117
  52. data/lib/rbbt/sources/pubmed.rb +0 -248
  53. data/lib/rbbt/util/arrayHash.rb +0 -266
  54. data/lib/rbbt/util/filecache.rb +0 -72
  55. data/lib/rbbt/util/index.rb +0 -47
  56. data/lib/rbbt/util/misc.rb +0 -106
  57. data/lib/rbbt/util/open.rb +0 -251
  58. data/lib/rbbt/util/rake.rb +0 -183
  59. data/lib/rbbt/util/simpleDSL.rb +0 -87
  60. data/lib/rbbt/util/tmpfile.rb +0 -35
  61. data/tasks/install.rake +0 -124
  62. data/test/rbbt/bow/test_bow.rb +0 -33
  63. data/test/rbbt/bow/test_classifier.rb +0 -72
  64. data/test/rbbt/bow/test_dictionary.rb +0 -91
  65. data/test/rbbt/ner/rnorm/test_cue_index.rb +0 -57
  66. data/test/rbbt/ner/rnorm/test_tokens.rb +0 -70
  67. data/test/rbbt/ner/test_abner.rb +0 -17
  68. data/test/rbbt/ner/test_banner.rb +0 -17
  69. data/test/rbbt/ner/test_dictionaryNER.rb +0 -122
  70. data/test/rbbt/ner/test_regexpNER.rb +0 -33
  71. data/test/rbbt/ner/test_rner.rb +0 -126
  72. data/test/rbbt/ner/test_rnorm.rb +0 -47
  73. data/test/rbbt/sources/test_biocreative.rb +0 -38
  74. data/test/rbbt/sources/test_biomart.rb +0 -31
  75. data/test/rbbt/sources/test_entrez.rb +0 -49
  76. data/test/rbbt/sources/test_go.rb +0 -24
  77. data/test/rbbt/sources/test_organism.rb +0 -59
  78. data/test/rbbt/sources/test_polysearch.rb +0 -27
  79. data/test/rbbt/sources/test_pubmed.rb +0 -39
  80. data/test/rbbt/util/test_arrayHash.rb +0 -257
  81. data/test/rbbt/util/test_filecache.rb +0 -37
  82. data/test/rbbt/util/test_index.rb +0 -31
  83. data/test/rbbt/util/test_misc.rb +0 -20
  84. data/test/rbbt/util/test_open.rb +0 -110
  85. data/test/rbbt/util/test_simpleDSL.rb +0 -57
  86. data/test/rbbt/util/test_tmpfile.rb +0 -21
  87. data/test/test_helper.rb +0 -4
  88. data/test/test_rbbt.rb +0 -11
@@ -1,241 +0,0 @@
1
- require 'rbbt'
2
- require 'rbbt/util/open'
3
- require 'rbbt/util/index'
4
-
5
- # This module contains some Organism centric functionalities. Each organism is
6
- # identified by a keyword.
7
- module Organism
8
-
9
- # Raised when trying to access information for an organism that has not been
10
- # prepared already.
11
- class OrganismNotProcessedError < StandardError; end
12
-
13
- # Return the list of all supported organisms. The prepared flag is used to
14
- # show only those that have been prepared.
15
- def self.all(prepared = true)
16
- if prepared
17
- Dir.glob(File.join(Rbbt.datadir,'/organisms/') + '/*/identifiers').collect{|f| File.basename(File.dirname(f))}
18
- else
19
- Dir.glob(File.join(Rbbt.datadir,'/organisms/') + '/*').select{|f| File.directory? f}.collect{|f| File.basename(f)}
20
- end
21
- end
22
-
23
-
24
- # Return the complete name of an organism. The org parameter is the organism
25
- # keyword
26
- def self.name(org)
27
- raise OrganismNotProcessedError, "Missing 'name' file" if ! File.exists? File.join(Rbbt.datadir,"organisms/#{ org }/name")
28
- Open.read(File.join(Rbbt.datadir,"organisms/#{ org }/name"))
29
- end
30
-
31
- # Hash linking all the organism long names with their keywords in Rbbt. It's
32
- # the inverse of the name method.
33
- NAME2ORG = {}
34
- Organism::all.each{|org|
35
- name = Organism.name(org).strip.downcase
36
- NAME2ORG[name] = org
37
- }
38
-
39
-
40
- # Return the key word associated with an organism.
41
- def self.name2org(name)
42
- NAME2ORG[name.strip.downcase]
43
- end
44
-
45
- # FIXME: The NER related stuff is harder to install, that's why we hide the
46
- # requires next to where they are needed, next to options
47
-
48
- # Return a NER object which could be of RNER, Abner or Banner class, this is
49
- # selected using the type parameter.
50
- def self.ner(org, type=:rner, options = {})
51
-
52
- case type.to_sym
53
- when :abner
54
- require 'rbbt/ner/abner'
55
- return Abner.new
56
- when :banner
57
- require 'rbbt/ner/banner'
58
- return Banner.new
59
- when :rner
60
- require 'rbbt/ner/rner'
61
- model = options[:model]
62
- model ||= File.join(Rbbt.datadir,"ner/model/#{ org }") if File.exist? File.join(Rbbt.datadir,"ner/model/#{ org }")
63
- model ||= File.join(Rbbt.datadir,'ner/model/BC2')
64
- return NER.new(model)
65
- else
66
- raise "Ner type (#{ type }) unknown"
67
- end
68
-
69
- end
70
-
71
- # Return a normalization object.
72
- def self.norm(org, to_entrez = nil)
73
- require 'rbbt/ner/rnorm'
74
- if to_entrez.nil?
75
- to_entrez = id_index(org, :native => 'Entrez Gene Id', :other => [supported_ids(org).first])
76
- end
77
-
78
- token_file = File.join(Rbbt.datadir, 'norm','config',org.to_s + '.config')
79
- if !File.exists? token_file
80
- token_file = nil
81
- end
82
-
83
- Normalizer.new(File.join(Rbbt.datadir,"organisms/#{ org }/lexicon"), :to_entrez => to_entrez, :file => token_file, :max_candidates => 20)
84
- end
85
-
86
- # Returns a hash with the names associated with each gene id. The ids are
87
- # in Rbbt native format for that organism.
88
- def self.lexicon(org, options = {})
89
- options = {:sep => "\t|\\|", :flatten => true}.merge(options)
90
- Open.to_hash(File.join(Rbbt.datadir,"organisms/#{ org }/lexicon"),options)
91
- end
92
-
93
- # Returns a hash with the list of go terms for each gene id. Gene ids are in
94
- # Rbbt native format for that organism.
95
- def self.goterms(org)
96
- Open.to_hash(File.join(Rbbt.datadir,"organisms/#{ org }/gene.go"), :flatten => true)
97
- end
98
-
99
- # Return list of PubMed ids associated to the organism. Determined using a
100
- # PubMed query with the name of the organism
101
- def self.literature(org)
102
- Open.read(File.join(Rbbt.datadir,"organisms/#{ org }/all.pmid")).scan(/\d+/)
103
- end
104
-
105
- # Return hash that associates genes to a list of PubMed ids.
106
- def self.gene_literature(org)
107
- Open.to_hash(File.join(Rbbt.datadir,"organisms/#{ org }/gene.pmid"), :flatten => true)
108
- end
109
-
110
- # Return hash that associates genes to a list of PubMed ids. Includes only
111
- # those found to support GO term associations.
112
- def self.gene_literature_go(org)
113
- Open.to_hash(File.join(Rbbt.datadir,"organisms/#{ org }/gene_go.pmid"), :flatten => true)
114
- end
115
-
116
- # Returns a list with the names of the id formats supported for an organism.
117
- # If examples are produced, the list is of [format, example] pairs.
118
- #
119
- # *Options:*
120
- #
121
- # *examples:* Include example ids for each format
122
- def self.supported_ids(org, options = {})
123
- formats = []
124
- examples = [] if options[:examples]
125
- i= 0
126
- Open.read(File.join(Rbbt.datadir,"organisms/#{ org }/identifiers")).each_line{|l|
127
- if i == 0
128
- i += 1
129
- next unless l=~/^\s*#/
130
- formats = Open.fields(l.sub(/^[\s#]+/,'')).collect{|n| n.strip}
131
- return formats unless examples
132
- next
133
- end
134
-
135
- if Open.fields(l).select{|name| name && name =~ /\w/}.length > examples.compact.length
136
- examples = Open.fields(l).collect{|name| name.split(/\|/).first}
137
- end
138
- i += 1
139
- }
140
-
141
- formats.zip(examples)
142
- end
143
-
144
- # Creates a hash where each possible id is associated with the names of the
145
- # formats (it's potentially possible for different formats to have the same
146
- # id). This is used in the guessIdFormat method.
147
- def self.id_formats(org)
148
- id_types = {}
149
- formats = supported_ids(org)
150
-
151
- text = Open.read(File.join(Rbbt.datadir,"organisms/#{ org }/identifiers"))
152
-
153
- if text.respond_to? :collect
154
- lines = text.collect
155
- else
156
- lines = text.lines
157
- end
158
-
159
- lines.each{|l|
160
- ids_per_type = Open.fields(l)
161
- formats.zip(ids_per_type).each{|p|
162
- format = p[0]
163
- p[1] ||= ""
164
- ids = p[1].split(/\|/)
165
- ids.each{|id|
166
- next if id.nil? || id == ""
167
- id_types[id.downcase] ||= []
168
- id_types[id.downcase] << format unless id_types[id.downcase].include? format
169
- }
170
- }
171
- }
172
-
173
- return id_types
174
- end
175
-
176
- def self.guessIdFormat(formats, query)
177
- query = query.compact.collect{|gene| gene.downcase}.uniq
178
- if String === formats
179
- formats = id_formats(formats)
180
- end
181
-
182
- return nil if formats.values.empty?
183
- values = formats.values_at(*query)
184
- return nil if values.empty?
185
-
186
- format_count = {}
187
- values.compact.collect{|types| types.uniq}.flatten.each{|f|
188
- format_count[f] ||= 0
189
- format_count[f] += 1
190
- }
191
-
192
- return nil if format_count.values.empty?
193
- format_count.select{|k,v| v > (query.length / 10)}.sort{|a,b| b[1] <=> a[1]}.first
194
- end
195
-
196
- def self.id_position(supported_ids, id_name, options = {})
197
- pos = 0
198
- supported_ids.each_with_index{|id, i|
199
- if id.strip == id_name.strip || !options[:case_sensitive] && id.strip.downcase == id_name.strip.downcase
200
- pos = i;
201
- end
202
- }
203
- pos
204
- end
205
-
206
- def self.id_index(org, options = {})
207
- native = options[:native]
208
- other = options[:other]
209
- options[:case_sensitive] = false if options[:case_sensitive].nil?
210
-
211
- if native.nil? and other.nil?
212
- Index.index(File.join(Rbbt.datadir,"organisms/#{ org }/identifiers"), options)
213
- else
214
- supported = Organism.supported_ids(org)
215
-
216
- first = nil
217
- if native
218
- first = id_position(supported,native,options)
219
- raise "No match for native format '#{ native }'" if first.nil?
220
- else
221
- first = 0
222
- end
223
-
224
- rest = nil
225
- if other
226
- rest = other.collect{|name| id_position(supported,name, options)}
227
- else
228
- rest = (0..supported.length - 1).to_a - [first]
229
- end
230
-
231
- options[:native] = first
232
- options[:extra] = rest
233
- options[:sep] = "\t"
234
- index = Index.index(File.join(Rbbt.datadir,"organisms/#{ org }/identifiers"), options)
235
-
236
- index
237
- end
238
- end
239
-
240
- end
241
-
@@ -1,117 +0,0 @@
1
- require 'rbbt'
2
- require 'rbbt/util/open'
3
- require 'rbbt/ner/regexpNER'
4
- require 'rbbt/ner/dictionaryNER'
5
-
6
- # Find terms in the Polysearch thesauri using simple regular expression
7
- # matching. Note that the first time the methods are used the corresponding
8
- # thesauri are loaded into memory. The available thesauri are: disease, drug,
9
- # metabolite, organ, subcellular (subcellular localization) and tissue.
10
- module Polysearch
11
-
12
-
13
- @@names = {}
14
- def self.type_names(type) #:nodoc:
15
- @@names[type] ||= Open.to_hash(File.join(Rbbt.datadir,'dbs','polysearch',type.to_s + '.txt'), :single => true)
16
- end
17
-
18
-
19
- @@indexes = {}
20
- def self.type_index(type) #:nodoc:
21
- if $stopwords
22
- stopwords = $stopwords
23
- else
24
- stopwords = []
25
- end
26
-
27
- case type.to_sym
28
- when :disease
29
- stopwords << 'use'
30
- end
31
-
32
- @@indexes[type] ||= RegExpNER.new(File.join(Rbbt.datadir,'dbs','polysearch',type.to_s + '.txt'), :stopwords => stopwords)
33
- end
34
-
35
- # Find matches in a string of text, the types array specifies which thesauri
36
- # to use, if nil it will use all.
37
- def self.match(text, types = nil)
38
- if types.nil?
39
- types = %w(disease drug metabolite organ subcellular tissue)
40
- end
41
-
42
- types = [types] unless Array === types
43
- types = types.sort
44
-
45
- matches = {}
46
- types.collect{|type|
47
- matches.merge!(type_index(type).match(text))
48
- }
49
-
50
- matches
51
- end
52
-
53
- # Transform the code into a name, type is the thesaurus to use
54
- def self.name(type, code)
55
- type_names(type)[code]
56
- end
57
-
58
- end
59
-
60
- if __FILE__ == $0
61
- text =<<-EOT
62
-
63
- Background Microorganisms adapt their transcriptome by integrating
64
- multiple chemical and physical signals from their environment. Shake-flask
65
- cultivation does not allow precise manipulation of individual culture
66
- parameters and therefore precludes a quantitative analysis of the
67
- (combinatorial) influence of these parameters on transcriptional
68
- regulation. Steady-state chemostat cultures, which do enable accurate
69
- control, measurement and manipulation of individual cultivation parameters
70
- (e.g. specific growth rate, temperature, identity of the growth-limiting
71
- nutrient) appear to provide a promising experimental platform for such a
72
- combinatorial analysis. Results A microarray compendium of 170
73
- steady-state chemostat cultures of the yeast Saccharomyces cerevisiae is
74
- presented and analyzed. The 170 microarrays encompass 55 unique
75
- conditions, which can be characterized by the combined settings of 10
76
- different cultivation parameters. By applying a regression model to assess
77
- the impact of (combinations of) cultivation parameters on the
78
- transcriptome, most S. cerevisiae genes were shown to be influenced by
79
- multiple cultivation parameters, and in many cases by combinatorial
80
- effects of cultivation parameters. The inclusion of these combinatorial
81
- effects in the regression model led to higher explained variance of the
82
- gene expression patterns and resulted in higher function enrichment in
83
- subsequent analysis. We further demonstrate the usefulness of the
84
- compendium and regression analysis for interpretation of shake-flask-based
85
- transcriptome studies and for guiding functional analysis of
86
- (uncharacterized) genes and pathways. Conclusions Modeling the
87
- combinatorial effects of environmental parameters on the transcriptome is
88
- crucial for understanding transcriptional regulation. Chemostat
89
- cultivation offers a powerful tool for such an approach. Keywords:
90
- chemostat steady state samples Cerebellar stroke syndrome
91
-
92
-
93
- EOT
94
-
95
- require 'benchmark'
96
- require 'ruby-prof'
97
-
98
- puts Benchmark.measure{
99
- p Polysearch.match(text,'disease')
100
- }
101
-
102
-
103
- RubyProf.start
104
-
105
- Polysearch.match(text,'disease')
106
-
107
- result = RubyProf.stop
108
-
109
- # Print a flat profile to text
110
- printer = RubyProf::FlatPrinter.new(result)
111
- printer.print(STDOUT, 0)
112
-
113
- puts Benchmark.measure{
114
- 10.times{ p Polysearch.match(text,'disease') }
115
- }
116
-
117
- end
@@ -1,248 +0,0 @@
1
- require 'rbbt/util/filecache'
2
- require 'rbbt/util/open'
3
- require 'rbbt/sources/gscholar'
4
- require 'rbbt'
5
- require 'libxml'
6
-
7
- # This module offers an interface with PubMed, to perform queries, and
8
- # retrieve simple information from articles. It uses the caching
9
- # services of Rbbt.
10
- module PubMed
11
-
12
- private
13
- @@last = Time.now
14
- @@pubmed_lag = 1
15
- def self.get_online(pmids)
16
-
17
- pmid_list = ( pmids.is_a?(Array) ? pmids.join(',') : pmids.to_s )
18
- url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&retmode=xml&id=#{pmid_list}"
19
-
20
- diff = Time.now - @@last
21
- sleep @@pubmed_lag - diff unless diff > @@pubmed_lag
22
-
23
- xml = Open.read(url, :quiet => true, :nocache => true)
24
-
25
- @@last = Time.now
26
-
27
- articles = xml.scan(/(<PubmedArticle>.*?<\/PubmedArticle>)/smu).flatten
28
-
29
- if pmids.is_a? Array
30
- list = {}
31
- articles.each{|article|
32
- pmid = article.scan(/<PMID>(.*?)<\/PMID>/).flatten.first
33
- list[pmid] = article
34
- }
35
- return list
36
- else
37
- return articles.first
38
- end
39
-
40
- end
41
-
42
- public
43
-
44
- # Processes the xml of an article as served by MedLine and extracts
45
- # the abstract, title and journal information
46
- class Article
47
-
48
-
49
- XML_KEYS = [
50
- [:title , "ArticleTitle"],
51
- [:journal , "Journal/Title"],
52
- [:issue , "Journal/JournalIssue/Issue"],
53
- [:volume , "Journal/JournalIssue/Volume"],
54
- [:issn , "Journal/ISSN"],
55
- [:year , "Journal/JournalIssue/PubDate/Year"],
56
- [:month , "Journal/JournalIssue/PubDate/Month"],
57
- [:pages , "Pagination/MedlinePgn"],
58
- [:abstract , "Abstract/AbstractText"],
59
- ]
60
-
61
- PMC_PDF_URL = "http://www.ncbi.nlm.nih.gov/pmc/articles/PMCID/pdf/"
62
-
63
- def self.escape_title(title)
64
- title.gsub(/(\w*[A-Z][A-Z]+\w*)/, '{\1}')
65
- end
66
-
67
- def self.make_bibentry(lastname, year, title)
68
- words = title.downcase.scan(/\w+/)
69
- if words.first.length > 3
70
- abrev = words.first
71
- else
72
- abrev = words[0..2].collect{|w| w.chars.first} * ""
73
- end
74
- [lastname.gsub(/\s/,'_'), year || "NOYEAR", abrev] * ""
75
- end
76
- def self.parse_xml(xml)
77
- parser = LibXML::XML::Parser.string(xml)
78
- pubmed = parser.parse.find("/PubmedArticle").first
79
- medline = pubmed.find("MedlineCitation").first
80
- article = medline.find("Article").first
81
-
82
- info = {}
83
-
84
- info[:pmid] = medline.find("PMID").first.content
85
-
86
- XML_KEYS.each do |p|
87
- name, key = p
88
- node = article.find(key).first
89
-
90
- next if node.nil?
91
-
92
- info[name] = node.content
93
- end
94
-
95
- bibentry = nil
96
- info[:author] = article.find("AuthorList/Author").collect do |author|
97
- begin
98
- lastname = author.find("LastName").first.content
99
- if author.find("ForeName").first.nil?
100
- forename = nil
101
- else
102
- forename = author.find("ForeName").first.content.split(/\s/).collect{|word| if word.length == 1; then word + '.'; else word; end} * " "
103
- end
104
- bibentry ||= make_bibentry lastname, info[:year], info[:title]
105
- rescue
106
- end
107
- [lastname, forename] * ", "
108
- end * " and "
109
-
110
- info[:bibentry] = bibentry.downcase if bibentry
111
-
112
- info[:pmc_pdf] = pubmed.find("PubmedData/ArticleIdList/ArticleId").select{|id| id[:IdType] == "pmc"}.first
113
-
114
- if info[:pmc_pdf]
115
- info[:pmc_pdf] = PMC_PDF_URL.sub(/PMCID/, info[:pmc_pdf].content)
116
- end
117
-
118
- info
119
- end
120
-
121
- attr_accessor :title, :abstract, :journal, :author, :pmid, :bibentry, :pmc_pdf, :gscholar_pdf, :pdf_url
122
- attr_accessor *XML_KEYS.collect{|p| p.first }
123
-
124
- def initialize(xml)
125
- if xml && ! xml.empty?
126
- info = PubMed::Article.parse_xml xml
127
- info.each do |key, value|
128
- self.send("#{ key }=", value)
129
- end
130
- end
131
- end
132
-
133
- def pdf_url
134
- return pmc_pdf if pmc_pdf
135
- @gscholar_pdf ||= GoogleScholar::full_text_url title
136
- end
137
-
138
- def full_text
139
- return nil if pdf_url.nil?
140
-
141
- text = nil
142
- TmpFile.with_file do |pdf|
143
-
144
- # Change user-agent, oh well...
145
- `wget --user-agent=firefox #{ pdf_url } -O #{ pdf }`
146
- TmpFile.with_file do |txt|
147
- `pdftotext #{ pdf } #{ txt }`
148
- text = Open.read(txt) if File.exists? txt
149
- end
150
- end
151
-
152
- text
153
- end
154
-
155
- def bibtex
156
- keys = [:author] + XML_KEYS.collect{|p| p.first } - [:bibentry]
157
- bibtex = "@article{#{bibentry},\n"
158
-
159
- keys.each do |key|
160
- next if self.send(key).nil?
161
-
162
- case key
163
-
164
- when :title
165
- bibtex += " title = { #{ PubMed::Article.escape_title title } },\n"
166
-
167
- when :issue
168
- bibtex += " number = { #{ issue } },\n"
169
-
170
- else
171
- bibtex += " #{ key } = { #{ self.send(key) } },\n"
172
- end
173
-
174
- end
175
-
176
- bibtex += " fulltext = { #{ pdf_url } },\n" if pdf_url
177
- bibtex += " pmid = { #{ pmid } }\n}"
178
-
179
-
180
- bibtex
181
- end
182
-
183
- # Join the text from title and abstract
184
- def text
185
- [title, abstract].join("\n")
186
- end
187
- end
188
-
189
- # Returns the Article object containing the information for the PubMed
190
- # ID specified as an argument. If +pmid+ is an array instead of a single
191
- # identifier it returns a hash with the Article object for each id.
192
- # It uses the Rbbt cache to save the articles xml.
193
- def self.get_article(pmid)
194
-
195
- if pmid.is_a? Array
196
- missing = []
197
- list = {}
198
-
199
- pmid.each{|p|
200
- filename = p.to_s + '.xml'
201
- if File.exists? FileCache.path(filename)
202
- list[p] = Article.new(Open.read(FileCache.path(filename)))
203
- else
204
- missing << p
205
- end
206
- }
207
-
208
- return list unless missing.any?
209
- chunk_size = [100, missing.length].min
210
- chunks = (missing.length.to_f / chunk_size).ceil
211
-
212
- articles = {}
213
- chunks.times do |chunk|
214
- pmids = missing[(chunk * chunk_size)..((chunk + 1) *chunk_size)]
215
- articles.merge!(get_online(pmids))
216
- end
217
-
218
- articles.each{|p, xml|
219
- filename = p + '.xml'
220
- FileCache.add_file(filename,xml, :force => true)
221
- list[p] = Article.new(xml)
222
- }
223
-
224
- return list
225
-
226
- else
227
- filename = pmid.to_s + '.xml'
228
-
229
- if File.exists? FileCache.path(filename)
230
- return Article.new(Open.read(FileCache.path(filename)))
231
- else
232
- xml = get_online(pmid)
233
- FileCache.add_file(filename,xml)
234
-
235
- return Article.new(xml)
236
- end
237
- end
238
- end
239
-
240
- # Performs the specified query and returns an array with the PubMed
241
- # Ids returned. +retmax+ can be used to limit the number of ids
242
- # returned; if it is not specified, 30000 is used.
243
- def self.query(query, retmax=nil)
244
- retmax ||= 30000
245
-
246
- Open.read("http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?retmax=#{retmax}&db=pubmed&term=#{query}",:quiet => true, :nocache => true).scan(/<Id>(\d+)<\/Id>/).flatten
247
- end
248
- end