rbbt 1.2.1 → 1.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,74 @@
1
require 'cgi'
require 'mechanize'
2
+
3
+
4
# Small scraper for Google Scholar result pages, built on Mechanize.
#
# Every lookup searches for the quoted title and inspects only the first
# result block (+div+ with class +gs_r+) of the returned page.
module GoogleScholar
  # Shared Mechanize agent, created on first use and reused afterwards.
  def self.user_agent
    @user_agent ||= Mechanize.new
  end

  # Builds the search URL for +title+. The title is URL-escaped so that
  # characters such as '&', '#' or '+' cannot break the query string.
  def self.search_url(title)
    "http://scholar.google.es/scholar?q='#{ CGI.escape(title.to_s) }'&hl=es&lr=&lr="
  end
  private_class_method :search_url

  # Fetches the result page for +title+ and returns its first result block,
  # or nil when the page lists no results.
  def self.first_result(title)
    user_agent.get(search_url(title)).search('div[@class=gs_r]').first
  end
  private_class_method :first_result

  # Returns the "cited by" link of the first result for +title+: the first
  # anchor whose href matches 'scholar?cites' and whose text ends in digits.
  # Returns nil when there is no result or no such anchor.
  def self.citation_link(title)
    article = first_result(title)
    return nil if article.nil?

    article.search('a').detect do |link|
      link['href'] =~ /scholar\?cites/ && link.inner_html =~ /\d+$/
    end
  end

  # Returns the href of the first anchor in the first result for +title+
  # that looks like a PDF (href ends in '.pdf' or contains 'type=pdf').
  # Returns nil when there is no result or no such anchor.
  def self.full_text_url(title)
    article = first_result(title)
    return nil if article.nil?

    link = article.search('a').detect do |anchor|
      anchor['href'] =~ /\.pdf$/ || anchor['href'] =~ /type=pdf/
    end

    link && link['href']
  end

  # Returns the number shown at the end of the "cited by" link text for
  # +title+, or 0 when no such link is found.
  def self.number_cites(title)
    link = citation_link(title)
    return 0 if link.nil?

    # citation_link only returns links whose text ends in digits.
    link.inner_html[/(\d+)$/, 1].to_i
  end
end
51
+
52
+
53
+ #def get_citers(title)
54
+ # puts title
55
+ # citation_link = nil
56
+ #
57
+ # # Get citation page
58
+ # $a.get("http://scholar.google.es/scholar?q='#{ title }'&hl=es&lr=&lr=") do |page|
59
+ # citation_link = page.search('div[@class=gs_r]').first.search('a').select{|link| link['href'] =~ /scholar\?cites/ && link.inner_html =~ /\d+$/ }.first
60
+ # end
61
+ #
62
+ # return [] if citation_link.nil?
63
+ #
64
+ # # Parse citations
65
+ #
66
+ # citers = []
67
+ # $a.get("http://scholar.google.es" + citation_link['href']) do |page|
68
+ # citers = page.search('div[@class=gs_r]').collect do |entry|
69
+ # entry.search('h3').first.search('a').first.inner_html
70
+ # end
71
+ # end
72
+ #
73
+ # return citers
74
+ #end
@@ -127,12 +127,12 @@ module Organism
127
127
  if i == 0
128
128
  i += 1
129
129
  next unless l=~/^\s*#/
130
- formats = Open.fields(l.sub(/^[\s#]+/,'')).collect{|n| n.strip}
130
+ formats = Open.fields(l.sub(/^[\s#]+/,'')).collect{|n| n.strip}
131
131
  return formats unless examples
132
132
  next
133
133
  end
134
134
 
135
- if Open.fields(l).select{|name| name && name =~ /\w/}.length > examples.length
135
+ if Open.fields(l).select{|name| name && name =~ /\w/}.length > examples.compact.length
136
136
  examples = Open.fields(l).collect{|name| name.split(/\|/).first}
137
137
  end
138
138
  i += 1
@@ -216,6 +216,7 @@ module Organism
216
216
  first = nil
217
217
  if native
218
218
  first = id_position(supported,native,options)
219
+ raise "No match for native format '#{ native }'"
219
220
  else
220
221
  first = 0
221
222
  end
@@ -1,6 +1,8 @@
1
1
  require 'rbbt/util/filecache'
2
2
  require 'rbbt/util/open'
3
+ require 'rbbt/sources/gscholar'
3
4
  require 'rbbt'
5
+ require 'libxml'
4
6
 
5
7
  # This module offers an interface with PubMed, to perform queries, and
6
8
  # retrieve simple information from articles. It uses the caching
@@ -42,17 +44,115 @@ module PubMed
42
44
  # Processes the xml with an articles as served by MedLine and extracts
43
45
  # the abstract, title and journal information
44
46
  class Article
45
- attr_reader :title, :abstract, :journal
47
+
48
+
49
+ XML_KEYS = [
50
+ [:title , "ArticleTitle"],
51
+ [:journal , "Journal/Title"],
52
+ [:issue , "Journal/JournalIssue/Issue"],
53
+ [:volume , "Journal/JournalIssue/Volume"],
54
+ [:issn , "Journal/ISSN"],
55
+ [:year , "Journal/JournalIssue/PubDate/Year"],
56
+ [:pages , "Pagination/MedlinePgn"],
57
+ [:abstract , "Abstract/AbstractText"],
58
+ ]
59
+
60
+ PMC_PDF_URL = "http://www.ncbi.nlm.nih.gov/pmc/articles/PMCID/pdf/"
61
+
62
+ def self.escape_title(title)
63
+ title.gsub(/(\w*[A-Z][A-Z]+\w*)/, '{\1}')
64
+ end
65
+
66
+ def self.parse_xml(xml)
67
+ parser = LibXML::XML::Parser.string(xml)
68
+ pubmed = parser.parse.find("/PubmedArticle").first
69
+ medline = pubmed.find("MedlineCitation").first
70
+ article = medline.find("Article").first
71
+
72
+ info = {}
73
+
74
+ info[:pmid] = medline.find("PMID").first.content
75
+
76
+ XML_KEYS.each do |p|
77
+ name, key = p
78
+ node = article.find(key).first
79
+
80
+ next if node.nil?
81
+
82
+ info[name] = node.content
83
+ end
84
+
85
+ bibentry = nil
86
+ info[:author] = article.find("AuthorList/Author").collect do |author|
87
+ lastname = author.find("LastName").first.content
88
+ if author.find("ForeName").first.nil?
89
+ forename = nil
90
+ else
91
+ forename = author.find("ForeName").first.content.split(/\s/).collect{|word| if word.length == 1; then word + '.'; else word; end} * " "
92
+ end
93
+ bibentry ||= [lastname, (info[:year] || "NOYEAR"), info[:title].scan(/\w+/)[0]] * ""
94
+ [lastname, forename] * ", "
95
+ end * " and "
96
+
97
+ info[:bibentry] = bibentry.downcase
98
+
99
+ info[:pmc_pdf] = pubmed.find("PubmedData/ArticleIdList/ArticleId").select{|id| id[:IdType] == "pmc"}.first
100
+
101
+ if info[:pmc_pdf]
102
+ info[:pmc_pdf] = PMC_PDF_URL.sub(/PMCID/, info[:pmc_pdf].content)
103
+ end
104
+
105
+ info
106
+ end
107
+
108
+ attr_accessor :title, :abstract, :journal, :author, :pmid, :bibentry, :pmc_pdf, :gscholar_pdf, :pdf_url
109
+ attr_accessor *XML_KEYS.collect{|p| p.first }
110
+
46
111
  def initialize(xml)
47
- xml ||= ""
48
- @abstract = $1 if xml.match(/<AbstractText>(.*)<\/AbstractText>/sm)
49
- @title = $1 if xml.match(/<ArticleTitle>(.*)<\/ArticleTitle>/sm)
50
- @journal = $1 if xml.match(/<Title>(.*)<\/Title>/sm)
112
+ if xml && ! xml.empty?
113
+ info = PubMed::Article.parse_xml xml
114
+ info.each do |key, value|
115
+ self.send("#{ key }=", value)
116
+ end
117
+ end
118
+ end
119
+
120
+ def pdf_url
121
+ return pmc_pdf if pmc_pdf
122
+ @gscholar_pdf ||= GoogleScholar::full_text_url title
123
+ end
124
+
125
+ def bibtex
126
+ keys = [:author] + XML_KEYS.collect{|p| p.first } - [:bibentry]
127
+ bibtex = "@article{#{bibentry},\n"
128
+
129
+ keys.each do |key|
130
+ next if self.send(key).nil?
131
+
132
+ case key
133
+
134
+ when :title
135
+ bibtex += " title = { #{ PubMed::Article.escape_title title } },\n"
136
+
137
+ when :issue
138
+ bibtex += " number = { #{ issue } },\n"
139
+
140
+ else
141
+ bibtex += " #{ key } = { #{ self.send(key) } },\n"
142
+ end
143
+
144
+ end
145
+
146
+ bibtex += " fulltext = { #{ pdf_url } },\n" if pdf_url
147
+ bibtex += " pmid = { #{ pmid } }\n}"
148
+
149
+
150
+ bibtex
51
151
  end
52
152
 
53
153
  # Join the text from title and abstract
54
154
  def text
55
- [@title, @abstract].join("\n")
155
+ [title, abstract].join("\n")
56
156
  end
57
157
  end
58
158
 
@@ -78,7 +178,7 @@ module PubMed
78
178
  return list unless missing.any?
79
179
  chunk_size = [100, missing.length].min
80
180
  chunks = (missing.length.to_f / chunk_size).ceil
81
-
181
+
82
182
  articles = {}
83
183
  chunks.times do |chunk|
84
184
  pmids = missing[(chunk * chunk_size)..((chunk + 1) *chunk_size)]
@@ -6,6 +6,16 @@ require 'rbbt/util/tmpfile'
6
6
  # for accessing remote files. It supports caching the files.
7
7
  module Open
8
8
 
9
# Return a Proc to use in the :select parameter of the Open.to_hash method.
# It selects those lines whose chosen field is present in the entities
# array.
#
# Options:
# * :field => position of the field to compare (default 0, the first one).
# * :sep   => string or regexp used to split the line (default "\t").
#
# The line terminator is removed before splitting, as Open.fields does, so
# that the last field of a line can be matched as well.
def self.func_match_field(entities, options = {})
  settings  = { :field => 0, :sep => "\t" }.merge(options)
  position  = settings[:field]
  separator = settings[:sep]

  Proc.new do |line|
    entities.include?(line.chomp.split(separator)[position])
  end
end
18
+
9
19
  def self.fields(line, sep = "\t")
10
20
  chunks = line.chomp.split(/(#{sep})/).select{|c| c !~ /^#{sep}$/ }
11
21
  if line =~ /#{sep}$/
@@ -176,10 +186,12 @@ module Open
176
186
  # * :single => for each key select only the first of the values, instead of the complete array.
177
187
  # * :fix => A Proc that is called to pre-process the line
178
188
  # * :exclude => A Proc that is called to check if the line must be excluded from the process.
189
+ # * :select => A Proc that is called to check if the line must be selected to process.
179
190
  def self.to_hash(input, options = {})
180
191
  native = options[:native] || 0
181
192
  extra = options[:extra]
182
193
  exclude = options[:exclude]
194
+ select = options[:select]
183
195
  fix = options[:fix]
184
196
  sep = options[:sep] || "\t"
185
197
  sep2 = options[:sep2] || "|"
@@ -200,6 +212,7 @@ module Open
200
212
  content.each_line{|l|
201
213
  l = fix.call(l) if fix
202
214
  next if exclude and exclude.call(l)
215
+ next if select and ! select.call(l)
203
216
 
204
217
  row_fields = self.fields(l, sep)
205
218
  id = row_fields[native]
@@ -89,6 +89,19 @@ row2 a d e r
89
89
  assert_equal(["","",""] , Open.fields("\t\t") )
90
90
  end
91
91
 
92
# Open.to_hash combined with a :select Proc built by Open.func_match_field
# must keep the rows whose first field is listed and drop the others.
def test_select_field
  content = <<-EOD
row1 a b 3
row1 aa bb 33
row2 a d e r
  EOD

  TmpFile.with_file(content) do |path|
    only_row1 = Open.func_match_field(%w(row1), :sep => " ")
    hash = Open.to_hash(path, :select => only_row1, :sep => " ")

    assert ! hash.include?('row2')
    assert hash.include?('row1')
  end
end
92
105
 
93
106
 
94
107
 
metadata CHANGED
@@ -1,7 +1,12 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rbbt
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.2.1
4
+ prerelease: false
5
+ segments:
6
+ - 1
7
+ - 2
8
+ - 2
9
+ version: 1.2.2
5
10
  platform: ruby
6
11
  authors:
7
12
  - Miguel Vazquez
@@ -9,59 +14,71 @@ autorequire:
9
14
  bindir: bin
10
15
  cert_chain: []
11
16
 
12
- date: 2010-02-15 00:00:00 +01:00
17
+ date: 2010-05-27 00:00:00 +02:00
13
18
  default_executable: rbbt_config
14
19
  dependencies:
15
20
  - !ruby/object:Gem::Dependency
16
21
  name: rake
17
- type: :runtime
18
- version_requirement:
19
- version_requirements: !ruby/object:Gem::Requirement
22
+ prerelease: false
23
+ requirement: &id001 !ruby/object:Gem::Requirement
20
24
  requirements:
21
25
  - - ">="
22
26
  - !ruby/object:Gem::Version
27
+ segments:
28
+ - 0
29
+ - 8
30
+ - 4
23
31
  version: 0.8.4
24
- version:
32
+ type: :runtime
33
+ version_requirements: *id001
25
34
  - !ruby/object:Gem::Dependency
26
35
  name: simpleconsole
27
- type: :runtime
28
- version_requirement:
29
- version_requirements: !ruby/object:Gem::Requirement
36
+ prerelease: false
37
+ requirement: &id002 !ruby/object:Gem::Requirement
30
38
  requirements:
31
39
  - - ">="
32
40
  - !ruby/object:Gem::Version
41
+ segments:
42
+ - 0
33
43
  version: "0"
34
- version:
44
+ type: :runtime
45
+ version_requirements: *id002
35
46
  - !ruby/object:Gem::Dependency
36
47
  name: stemmer
37
- type: :runtime
38
- version_requirement:
39
- version_requirements: !ruby/object:Gem::Requirement
48
+ prerelease: false
49
+ requirement: &id003 !ruby/object:Gem::Requirement
40
50
  requirements:
41
51
  - - ">="
42
52
  - !ruby/object:Gem::Version
53
+ segments:
54
+ - 0
43
55
  version: "0"
44
- version:
56
+ type: :runtime
57
+ version_requirements: *id003
45
58
  - !ruby/object:Gem::Dependency
46
59
  name: progress-monitor
47
- type: :runtime
48
- version_requirement:
49
- version_requirements: !ruby/object:Gem::Requirement
60
+ prerelease: false
61
+ requirement: &id004 !ruby/object:Gem::Requirement
50
62
  requirements:
51
63
  - - ">="
52
64
  - !ruby/object:Gem::Version
65
+ segments:
66
+ - 0
53
67
  version: "0"
54
- version:
68
+ type: :runtime
69
+ version_requirements: *id004
55
70
  - !ruby/object:Gem::Dependency
56
71
  name: simpleconsole
57
- type: :runtime
58
- version_requirement:
59
- version_requirements: !ruby/object:Gem::Requirement
72
+ prerelease: false
73
+ requirement: &id005 !ruby/object:Gem::Requirement
60
74
  requirements:
61
75
  - - ">="
62
76
  - !ruby/object:Gem::Version
77
+ segments:
78
+ - 0
63
79
  version: "0"
64
- version:
80
+ type: :runtime
81
+ version_requirements: *id005
65
82
  description: |-
66
83
  This toolbox includes modules for text-mining, like Named Entity Recognition and Normalization and document
67
84
  classification, as well as data integration modules that interface with PubMed, Entrez Gene, BioMart.
@@ -118,6 +135,7 @@ files:
118
135
  - lib/rbbt/sources/biomart.rb
119
136
  - lib/rbbt/sources/entrez.rb
120
137
  - lib/rbbt/sources/go.rb
138
+ - lib/rbbt/sources/gscholar.rb
121
139
  - lib/rbbt/sources/organism.rb
122
140
  - lib/rbbt/sources/polysearch.rb
123
141
  - lib/rbbt/sources/pubmed.rb
@@ -145,18 +163,20 @@ required_ruby_version: !ruby/object:Gem::Requirement
145
163
  requirements:
146
164
  - - ">="
147
165
  - !ruby/object:Gem::Version
166
+ segments:
167
+ - 0
148
168
  version: "0"
149
- version:
150
169
  required_rubygems_version: !ruby/object:Gem::Requirement
151
170
  requirements:
152
171
  - - ">="
153
172
  - !ruby/object:Gem::Version
173
+ segments:
174
+ - 0
154
175
  version: "0"
155
- version:
156
176
  requirements: []
157
177
 
158
178
  rubyforge_project:
159
- rubygems_version: 1.3.5
179
+ rubygems_version: 1.3.6
160
180
  signing_key:
161
181
  specification_version: 3
162
182
  summary: Bioinformatics and text mining toolbox