rbbt 1.2.1 → 1.2.2

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,74 @@
1
+ require 'mechanize'
2
+
3
+
4
require 'cgi'

# Scrapes Google Scholar result pages (scholar.google.es, Spanish interface)
# to find, for an article title, its citation link, its citation count and a
# link to a full-text PDF. Only the first result of the search is examined.
module GoogleScholar
  # Returns the Mechanize agent shared by every query, creating it on first
  # use.
  def self.user_agent
    @user_agent ||= Mechanize.new
  end

  # Builds the search URL for +title+. The title is wrapped in single quotes,
  # as before, and the whole query value is URL-escaped so that characters
  # such as '&', '#' or spaces in a title cannot alter the request.
  def self.search_url(title)
    "http://scholar.google.es/scholar?q=#{ CGI.escape("'#{ title }'") }&hl=es&lr=&lr="
  end
  private_class_method :search_url

  # Fetches the result page for +title+ and returns its first result entry
  # (the first div with class gs_r), or nil when the page has none.
  def self.first_result(title)
    user_agent.get(search_url(title)).search('div[@class=gs_r]').first
  end
  private_class_method :first_result

  # Returns the link element of the first result whose href points to a
  # "scholar?cites" page and whose text ends in a number, or nil when there is
  # no result or no such link.
  def self.citation_link(title)
    article = first_result(title)
    return nil if article.nil?

    article.search('a').select { |link|
      link['href'] =~ /scholar\?cites/ && link.inner_html =~ /\d+$/
    }.first
  end

  # Returns the href of the first link in the first result that looks like a
  # PDF (href ending in ".pdf" or containing "type=pdf"), or nil when there is
  # no result or no such link.
  def self.full_text_url(title)
    article = first_result(title)
    return nil if article.nil?

    pdf_link = article.search('a').select { |link|
      link['href'] =~ /\.pdf$/ || link['href'] =~ /type=pdf/
    }.first

    return nil if pdf_link.nil?

    pdf_link['href']
  end

  # Returns the number found at the end of the citation link text for
  # +title+, or 0 when no citation link is found.
  def self.number_cites(title)
    link = citation_link(title)
    return 0 if link.nil?

    # citation_link only returns links whose text ends in digits.
    link.inner_html[/(\d+)$/, 1].to_i
  end
end
51
+
52
+
53
+ #def get_citers(title)
54
+ # puts title
55
+ # citation_link = nil
56
+ #
57
+ # # Get citation page
58
+ # $a.get("http://scholar.google.es/scholar?q='#{ title }'&hl=es&lr=&lr=") do |page|
59
+ # citation_link = page.search('div[@class=gs_r]').first.search('a').select{|link| link['href'] =~ /scholar\?cites/ && link.inner_html =~ /\d+$/ }.first
60
+ # end
61
+ #
62
+ # return [] if citation_link.nil?
63
+ #
64
+ # # Parse citations
65
+ #
66
+ # citers = []
67
+ # $a.get("http://scholar.google.es" + citation_link['href']) do |page|
68
+ # citers = page.search('div[@class=gs_r]').collect do |entry|
69
+ # entry.search('h3').first.search('a').first.inner_html
70
+ # end
71
+ # end
72
+ #
73
+ # return citers
74
+ #end
@@ -127,12 +127,12 @@ module Organism
127
127
  if i == 0
128
128
  i += 1
129
129
  next unless l=~/^\s*#/
130
- formats = Open.fields(l.sub(/^[\s#]+/,'')).collect{|n| n.strip}
130
+ formats = Open.fields(l.sub(/^[\s#]+/,'')).collect{|n| n.strip}
131
131
  return formats unless examples
132
132
  next
133
133
  end
134
134
 
135
- if Open.fields(l).select{|name| name && name =~ /\w/}.length > examples.length
135
+ if Open.fields(l).select{|name| name && name =~ /\w/}.length > examples.compact.length
136
136
  examples = Open.fields(l).collect{|name| name.split(/\|/).first}
137
137
  end
138
138
  i += 1
@@ -216,6 +216,7 @@ module Organism
216
216
  first = nil
217
217
  if native
218
218
  first = id_position(supported,native,options)
219
+ raise "No match for native format '#{ native }'"
219
220
  else
220
221
  first = 0
221
222
  end
@@ -1,6 +1,8 @@
1
1
  require 'rbbt/util/filecache'
2
2
  require 'rbbt/util/open'
3
+ require 'rbbt/sources/gscholar'
3
4
  require 'rbbt'
5
+ require 'libxml'
4
6
 
5
7
  # This module offers an interface with PubMed, to perform queries, and
6
8
  # retrieve simple information from articles. It uses the caching
@@ -42,17 +44,115 @@ module PubMed
42
44
  # Processes the XML of an article as served by MedLine and extracts
43
45
  # the abstract, title and journal information
44
46
  class Article
45
- attr_reader :title, :abstract, :journal
47
+
48
+
49
+ XML_KEYS = [
50
+ [:title , "ArticleTitle"],
51
+ [:journal , "Journal/Title"],
52
+ [:issue , "Journal/JournalIssue/Issue"],
53
+ [:volume , "Journal/JournalIssue/Volume"],
54
+ [:issn , "Journal/ISSN"],
55
+ [:year , "Journal/JournalIssue/PubDate/Year"],
56
+ [:pages , "Pagination/MedlinePgn"],
57
+ [:abstract , "Abstract/AbstractText"],
58
+ ]
59
+
60
+ PMC_PDF_URL = "http://www.ncbi.nlm.nih.gov/pmc/articles/PMCID/pdf/"
61
+
62
+ def self.escape_title(title)
63
+ title.gsub(/(\w*[A-Z][A-Z]+\w*)/, '{\1}')
64
+ end
65
+
66
+ def self.parse_xml(xml)
67
+ parser = LibXML::XML::Parser.string(xml)
68
+ pubmed = parser.parse.find("/PubmedArticle").first
69
+ medline = pubmed.find("MedlineCitation").first
70
+ article = medline.find("Article").first
71
+
72
+ info = {}
73
+
74
+ info[:pmid] = medline.find("PMID").first.content
75
+
76
+ XML_KEYS.each do |p|
77
+ name, key = p
78
+ node = article.find(key).first
79
+
80
+ next if node.nil?
81
+
82
+ info[name] = node.content
83
+ end
84
+
85
+ bibentry = nil
86
+ info[:author] = article.find("AuthorList/Author").collect do |author|
87
+ lastname = author.find("LastName").first.content
88
+ if author.find("ForeName").first.nil?
89
+ forename = nil
90
+ else
91
+ forename = author.find("ForeName").first.content.split(/\s/).collect{|word| if word.length == 1; then word + '.'; else word; end} * " "
92
+ end
93
+ bibentry ||= [lastname, (info[:year] || "NOYEAR"), info[:title].scan(/\w+/)[0]] * ""
94
+ [lastname, forename] * ", "
95
+ end * " and "
96
+
97
+ info[:bibentry] = bibentry.downcase
98
+
99
+ info[:pmc_pdf] = pubmed.find("PubmedData/ArticleIdList/ArticleId").select{|id| id[:IdType] == "pmc"}.first
100
+
101
+ if info[:pmc_pdf]
102
+ info[:pmc_pdf] = PMC_PDF_URL.sub(/PMCID/, info[:pmc_pdf].content)
103
+ end
104
+
105
+ info
106
+ end
107
+
108
+ attr_accessor :title, :abstract, :journal, :author, :pmid, :bibentry, :pmc_pdf, :gscholar_pdf, :pdf_url
109
+ attr_accessor *XML_KEYS.collect{|p| p.first }
110
+
46
111
  def initialize(xml)
47
- xml ||= ""
48
- @abstract = $1 if xml.match(/<AbstractText>(.*)<\/AbstractText>/sm)
49
- @title = $1 if xml.match(/<ArticleTitle>(.*)<\/ArticleTitle>/sm)
50
- @journal = $1 if xml.match(/<Title>(.*)<\/Title>/sm)
112
+ if xml && ! xml.empty?
113
+ info = PubMed::Article.parse_xml xml
114
+ info.each do |key, value|
115
+ self.send("#{ key }=", value)
116
+ end
117
+ end
118
+ end
119
+
120
+ def pdf_url
121
+ return pmc_pdf if pmc_pdf
122
+ @gscholar_pdf ||= GoogleScholar::full_text_url title
123
+ end
124
+
125
+ def bibtex
126
+ keys = [:author] + XML_KEYS.collect{|p| p.first } - [:bibentry]
127
+ bibtex = "@article{#{bibentry},\n"
128
+
129
+ keys.each do |key|
130
+ next if self.send(key).nil?
131
+
132
+ case key
133
+
134
+ when :title
135
+ bibtex += " title = { #{ PubMed::Article.escape_title title } },\n"
136
+
137
+ when :issue
138
+ bibtex += " number = { #{ issue } },\n"
139
+
140
+ else
141
+ bibtex += " #{ key } = { #{ self.send(key) } },\n"
142
+ end
143
+
144
+ end
145
+
146
+ bibtex += " fulltext = { #{ pdf_url } },\n" if pdf_url
147
+ bibtex += " pmid = { #{ pmid } }\n}"
148
+
149
+
150
+ bibtex
51
151
  end
52
152
 
53
153
  # Join the text from title and abstract
54
154
  def text
55
- [@title, @abstract].join("\n")
155
+ [title, abstract].join("\n")
56
156
  end
57
157
  end
58
158
 
@@ -78,7 +178,7 @@ module PubMed
78
178
  return list unless missing.any?
79
179
  chunk_size = [100, missing.length].min
80
180
  chunks = (missing.length.to_f / chunk_size).ceil
81
-
181
+
82
182
  articles = {}
83
183
  chunks.times do |chunk|
84
184
  pmids = missing[(chunk * chunk_size)..((chunk + 1) *chunk_size)]
@@ -6,6 +6,16 @@ require 'rbbt/util/tmpfile'
6
6
  # for accessing remote files. It supports caching the files.
7
7
  module Open
8
8
 
9
+ # Return a Proc to use in the :select parameter of the Open.to_hash method.
10
+ # It selects those lines with the content of the first field present on the
11
+ # entities array. The field can be chosen to be a different one in the
12
+ # options hash, also the separation string or regexp to determine fields.
13
+ def self.func_match_field(entities, options = {})
14
+ field, sep = {:field => 0, :sep => "\t"}.merge(options).values_at(:field, :sep)
15
+
16
+ Proc.new {|line| entities.include? line.split(sep)[field] }
17
+ end
18
+
9
19
  def self.fields(line, sep = "\t")
10
20
  chunks = line.chomp.split(/(#{sep})/).select{|c| c !~ /^#{sep}$/ }
11
21
  if line =~ /#{sep}$/
@@ -176,10 +186,12 @@ module Open
176
186
  # * :single => for each key select only the first of the values, instead of the complete array.
177
187
  # * :fix => A Proc that is called to pre-process the line
178
188
  # * :exclude => A Proc that is called to check if the line must be excluded from the process.
189
+ # * :select => A Proc that is called to check if the line must be selected to process.
179
190
  def self.to_hash(input, options = {})
180
191
  native = options[:native] || 0
181
192
  extra = options[:extra]
182
193
  exclude = options[:exclude]
194
+ select = options[:select]
183
195
  fix = options[:fix]
184
196
  sep = options[:sep] || "\t"
185
197
  sep2 = options[:sep2] || "|"
@@ -200,6 +212,7 @@ module Open
200
212
  content.each_line{|l|
201
213
  l = fix.call(l) if fix
202
214
  next if exclude and exclude.call(l)
215
+ next if select and ! select.call(l)
203
216
 
204
217
  row_fields = self.fields(l, sep)
205
218
  id = row_fields[native]
@@ -89,6 +89,19 @@ row2 a d e r
89
89
  assert_equal(["","",""] , Open.fields("\t\t") )
90
90
  end
91
91
 
92
+ def test_select_field
93
+ data =<<-EOD
94
+ row1 a b 3
95
+ row1 aa bb 33
96
+ row2 a d e r
97
+ EOD
98
+
99
+ TmpFile.with_file(data) do |file|
100
+ data = Open.to_hash(file, :select => Open.func_match_field(%w(row1), :sep => " "), :sep => " ")
101
+ assert ! data.include?('row2')
102
+ assert data.include?('row1')
103
+ end
104
+ end
92
105
 
93
106
 
94
107
 
metadata CHANGED
@@ -1,7 +1,12 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rbbt
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.2.1
4
+ prerelease: false
5
+ segments:
6
+ - 1
7
+ - 2
8
+ - 2
9
+ version: 1.2.2
5
10
  platform: ruby
6
11
  authors:
7
12
  - Miguel Vazquez
@@ -9,59 +14,71 @@ autorequire:
9
14
  bindir: bin
10
15
  cert_chain: []
11
16
 
12
- date: 2010-02-15 00:00:00 +01:00
17
+ date: 2010-05-27 00:00:00 +02:00
13
18
  default_executable: rbbt_config
14
19
  dependencies:
15
20
  - !ruby/object:Gem::Dependency
16
21
  name: rake
17
- type: :runtime
18
- version_requirement:
19
- version_requirements: !ruby/object:Gem::Requirement
22
+ prerelease: false
23
+ requirement: &id001 !ruby/object:Gem::Requirement
20
24
  requirements:
21
25
  - - ">="
22
26
  - !ruby/object:Gem::Version
27
+ segments:
28
+ - 0
29
+ - 8
30
+ - 4
23
31
  version: 0.8.4
24
- version:
32
+ type: :runtime
33
+ version_requirements: *id001
25
34
  - !ruby/object:Gem::Dependency
26
35
  name: simpleconsole
27
- type: :runtime
28
- version_requirement:
29
- version_requirements: !ruby/object:Gem::Requirement
36
+ prerelease: false
37
+ requirement: &id002 !ruby/object:Gem::Requirement
30
38
  requirements:
31
39
  - - ">="
32
40
  - !ruby/object:Gem::Version
41
+ segments:
42
+ - 0
33
43
  version: "0"
34
- version:
44
+ type: :runtime
45
+ version_requirements: *id002
35
46
  - !ruby/object:Gem::Dependency
36
47
  name: stemmer
37
- type: :runtime
38
- version_requirement:
39
- version_requirements: !ruby/object:Gem::Requirement
48
+ prerelease: false
49
+ requirement: &id003 !ruby/object:Gem::Requirement
40
50
  requirements:
41
51
  - - ">="
42
52
  - !ruby/object:Gem::Version
53
+ segments:
54
+ - 0
43
55
  version: "0"
44
- version:
56
+ type: :runtime
57
+ version_requirements: *id003
45
58
  - !ruby/object:Gem::Dependency
46
59
  name: progress-monitor
47
- type: :runtime
48
- version_requirement:
49
- version_requirements: !ruby/object:Gem::Requirement
60
+ prerelease: false
61
+ requirement: &id004 !ruby/object:Gem::Requirement
50
62
  requirements:
51
63
  - - ">="
52
64
  - !ruby/object:Gem::Version
65
+ segments:
66
+ - 0
53
67
  version: "0"
54
- version:
68
+ type: :runtime
69
+ version_requirements: *id004
55
70
  - !ruby/object:Gem::Dependency
56
71
  name: simpleconsole
57
- type: :runtime
58
- version_requirement:
59
- version_requirements: !ruby/object:Gem::Requirement
72
+ prerelease: false
73
+ requirement: &id005 !ruby/object:Gem::Requirement
60
74
  requirements:
61
75
  - - ">="
62
76
  - !ruby/object:Gem::Version
77
+ segments:
78
+ - 0
63
79
  version: "0"
64
- version:
80
+ type: :runtime
81
+ version_requirements: *id005
65
82
  description: |-
66
83
  This toolbox includes modules for text-mining, like Named Entity Recognition and Normalization and document
67
84
  classification, as well as data integration modules that interface with PubMed, Entrez Gene, BioMart.
@@ -118,6 +135,7 @@ files:
118
135
  - lib/rbbt/sources/biomart.rb
119
136
  - lib/rbbt/sources/entrez.rb
120
137
  - lib/rbbt/sources/go.rb
138
+ - lib/rbbt/sources/gscholar.rb
121
139
  - lib/rbbt/sources/organism.rb
122
140
  - lib/rbbt/sources/polysearch.rb
123
141
  - lib/rbbt/sources/pubmed.rb
@@ -145,18 +163,20 @@ required_ruby_version: !ruby/object:Gem::Requirement
145
163
  requirements:
146
164
  - - ">="
147
165
  - !ruby/object:Gem::Version
166
+ segments:
167
+ - 0
148
168
  version: "0"
149
- version:
150
169
  required_rubygems_version: !ruby/object:Gem::Requirement
151
170
  requirements:
152
171
  - - ">="
153
172
  - !ruby/object:Gem::Version
173
+ segments:
174
+ - 0
154
175
  version: "0"
155
- version:
156
176
  requirements: []
157
177
 
158
178
  rubyforge_project:
159
- rubygems_version: 1.3.5
179
+ rubygems_version: 1.3.6
160
180
  signing_key:
161
181
  specification_version: 3
162
182
  summary: Bioinformatics and text mining toolbox