google-scholar 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +15 -0
- data/.gitignore +18 -0
- data/.rspec +2 -0
- data/Gemfile +4 -0
- data/Guardfile +8 -0
- data/LICENSE.txt +22 -0
- data/README.md +29 -0
- data/Rakefile +5 -0
- data/google-scholar.gemspec +29 -0
- data/lib/google/scholar.rb +29 -0
- data/lib/google/scholar/article_enumerator.rb +32 -0
- data/lib/google/scholar/article_summary.rb +27 -0
- data/lib/google/scholar/author.rb +31 -0
- data/lib/google/scholar/author_enumerator.rb +32 -0
- data/lib/google/scholar/author_search.rb +10 -0
- data/lib/google/scholar/base.rb +15 -0
- data/lib/google/scholar/document.rb +31 -0
- data/lib/google/scholar/document/authors_document.rb +25 -0
- data/lib/google/scholar/document/authors_profile_document.rb +14 -0
- data/lib/google/scholar/scraper.rb +44 -0
- data/lib/google/scholar/version.rb +5 -0
- data/spec/fixtures/article_part.htm +1 -0
- data/spec/fixtures/author_profile_page.htm +1 -0
- data/spec/fixtures/author_result_page.htm +1 -0
- data/spec/fixtures/author_result_page_has_next.htm +1 -0
- data/spec/fixtures/full_article.htm +1 -0
- data/spec/fixtures/single_author_page.htm +14 -0
- data/spec/lib/google/scholar/article_enumerator_spec.rb +32 -0
- data/spec/lib/google/scholar/article_summary_spec.rb +41 -0
- data/spec/lib/google/scholar/author_enumerator_spec.rb +33 -0
- data/spec/lib/google/scholar/author_search_spec.rb +20 -0
- data/spec/lib/google/scholar/author_spec.rb +66 -0
- data/spec/lib/google/scholar/base_spec.rb +15 -0
- data/spec/lib/google/scholar/document/authors_document_spec.rb +103 -0
- data/spec/lib/google/scholar/document/authors_profile_document_spec.rb +40 -0
- data/spec/lib/google/scholar/scraper_spec.rb +39 -0
- data/spec/lib/google/scholar_spec.rb +21 -0
- data/spec/spec_helper.rb +19 -0
- metadata +196 -0
checksums.yaml
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
---
|
2
|
+
!binary "U0hBMQ==":
|
3
|
+
metadata.gz: !binary |-
|
4
|
+
OGM2NTE5Njc2ZTYxODM1NGQxNTI4ODg3MzY3ZmVmNzlhNjJiMjhmMA==
|
5
|
+
data.tar.gz: !binary |-
|
6
|
+
ZGRlYzQ3MDc3MGEyMGUwMGQ4NDAyM2QwNzFlNzUzY2Q2MmNmZTBjYg==
|
7
|
+
!binary "U0hBNTEy":
|
8
|
+
metadata.gz: !binary |-
|
9
|
+
ZjE0YzY2NjA5YjBlNDBiODI4N2VkMmFkNDZjYmI5OGEyYzEyNjA2ZjdlMjk2
|
10
|
+
YzljNWRkMTlmZTE4N2I2YzIyMmQ4M2UzMGIzYTQzMWIzMzc0MTMzNDJjODEx
|
11
|
+
NzYxODQ4ZmVkMzY1M2M0ZTBlNjY0ZjBmY2Y2Mjg3ODg5YmEzYjE=
|
12
|
+
data.tar.gz: !binary |-
|
13
|
+
Y2Q2MDJiMjFiMWY5MDg5MjIwMDFjMWNkMWU0YWEyODZlMzIzMWVkNjEzZTNk
|
14
|
+
ZWE1YmQxYjIzODMzNDJjZjA4MjJkY2U3ODRiMmMzNjdiZWI2OTM0Y2FhYWIy
|
15
|
+
YWUxYjcyYjFhMGZlZTQxNjk1ZDE2MWYyNDRiYzRjMzQ5NGZkMWY=
|
data/.gitignore
ADDED
data/.rspec
ADDED
data/Gemfile
ADDED
data/Guardfile
ADDED
@@ -0,0 +1,8 @@
|
|
1
|
+
# A sample Guardfile
|
2
|
+
# More info at https://github.com/guard/guard#readme
|
3
|
+
|
4
|
+
# RSpec guard: maps changed files to the specs that should be run for them.
guard :rspec, :cli => "--color --format documentation" do
  # Spec files themselves.
  watch(%r{^spec/.+_spec\.rb$})

  # Library files map to the spec with the mirrored path under spec/lib.
  watch(%r{^lib/(.+)\.rb$}) do |match|
    "spec/lib/#{match[1]}_spec.rb"
  end

  # The spec helper maps to the whole spec directory.
  watch('spec/spec_helper.rb') do
    "spec"
  end
end
|
data/LICENSE.txt
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2013 Trey Terrell
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
# Google::Scholar
|
2
|
+
|
3
|
+
Google Scholar interface. Currently only works for Author searches.
|
4
|
+
|
5
|
+
## Installation
|
6
|
+
|
7
|
+
Add this line to your application's Gemfile:
|
8
|
+
|
9
|
+
gem 'google-scholar'
|
10
|
+
|
11
|
+
And then execute:
|
12
|
+
|
13
|
+
$ bundle
|
14
|
+
|
15
|
+
Or install it yourself as:
|
16
|
+
|
17
|
+
$ gem install google-scholar
|
18
|
+
|
19
|
+
## Usage
|
20
|
+
|
21
|
+
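Call `Google::Scholar::Base.search_author("Author Name")` and iterate over `authors` on the returned search object; each author exposes `name`, `id`, `citation_count`, `author_url` and `articles`.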
|
22
|
+
|
23
|
+
## Contributing
|
24
|
+
|
25
|
+
1. Fork it
|
26
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
27
|
+
3. Commit your changes (`git commit -am 'Add some feature'`)
|
28
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
29
|
+
5. Create new Pull Request
|
data/Rakefile
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
# coding: utf-8
# Gem specification for google-scholar.

# Put this gem's lib/ directory on the load path so the version file can be required.
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'google/scholar/version'
require 'rbconfig'

Gem::Specification.new do |spec|
  # Identity and authorship.
  spec.name = "google-scholar"
  spec.version = Google::Scholar::VERSION
  spec.authors = ["Trey Terrell"]
  spec.email = ["trey.terrell@oregonstate.edu"]
  spec.description = %q{Google Scholar interface. Currently only works for Author searches.}
  spec.summary = %q{Google Scholar interface. Currently only works for Author searches.}
  spec.homepage = ""
  spec.license = "MIT"

  # Packaged files are whatever git tracks; executables and tests are picked out by path.
  spec.files = `git ls-files`.split($/)
  spec.executables = spec.files.grep(%r{^bin/}) { |path| File.basename(path) }
  spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ["lib"]

  # Runtime dependency.
  spec.add_dependency 'nokogiri', '~> 1.5.0'

  # Development dependencies, as [name, *version requirements].
  [
    ["bundler", "~> 1.3"],
    ["rake"],
    ['rspec', '~> 2.13'],
    ['guard', '~> 1.8.0'],
    ['guard-rspec']
  ].each do |gem_name, *requirements|
    spec.add_development_dependency(gem_name, *requirements)
  end

  # wdm is only added when the target OS reported by RbConfig is a Windows build.
  if RbConfig::CONFIG['target_os'] =~ /mswin|mingw/i
    spec.add_development_dependency 'wdm', '>= 0.1.0'
  end
end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
require "google/scholar/version"
|
2
|
+
require "google/scholar/base"
|
3
|
+
require "google/scholar/scraper"
|
4
|
+
require "google/scholar/document"
|
5
|
+
require "google/scholar/author"
|
6
|
+
require "google/scholar/author_search"
|
7
|
+
require "google/scholar/author_enumerator"
|
8
|
+
require "google/scholar/article_summary"
|
9
|
+
require "google/scholar/article_enumerator"
|
10
|
+
require "google/scholar/document/authors_document"
|
11
|
+
require "google/scholar/document/authors_profile_document"
|
12
|
+
require 'cgi'
|
13
|
+
module Google
  # Top-level namespace of the gem; holds the URL helpers used by the other classes.
  module Scholar
    # Host name that all generated URLs point at.
    #
    # @return [String]
    def self.google_root
      "scholar.google.com"
    end

    # URL scheme prefix, including the "://" separator.
    #
    # @return [String]
    def self.http_scheme
      "http://"
    end

    # Scheme and host joined together, without a trailing slash.
    #
    # @return [String]
    def self.google_url
      "#{http_scheme}#{google_root}"
    end

    # Builds the author search URL for the given name. The name is wrapped in
    # an `author:"..."` query and CGI-escaped before being appended.
    #
    # @TODO May want to move this to AuthorSearch.
    # @param author [#to_s] author name to search for
    # @return [String]
    def self.author_search_url(author)
      query = ::CGI.escape("author:\"#{author}\"")
      # Reuse google_url so the base URL is assembled in exactly one place.
      "#{google_url}/citations?view_op=search_authors&hl=en&mauthors=#{query}"
    end
  end
end
|
@@ -0,0 +1,32 @@
|
|
1
|
+
require 'enumerator'
|
2
|
+
module Google
  module Scholar
    # Enumerates the articles of every document held by a scraper, asking the
    # scraper to load a further page whenever the documents loaded so far are
    # exhausted and the scraper reports that more pages exist.
    class ArticleEnumerator
      include Enumerable

      # @param scraper [#documents, #has_more_pages?, #load_next_page]
      def initialize(scraper)
        @scraper = scraper
        @documents = scraper.documents
      end

      # Yields each article of each document in order.
      #
      # @yieldparam article [Object] one element of a document's `articles`
      # @return [Enumerator, nil] an Enumerator when called without a block,
      #   otherwise nil
      def each
        return enum_for(:each) unless block_given?

        index = 0
        while (document = @documents[index])
          document.articles.each { |article| yield(article) }
          index += 1
          # Documents already loaded are consumed before another page is requested.
          next if index < @documents.length
          break unless @scraper.has_more_pages?

          @scraper.load_next_page
          @documents = @scraper.documents
        end
      end
    end
  end
end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
module Google
  module Scholar
    # Read-only view over one article row. Every accessor runs a CSS query
    # against the wrapped node and memoizes the result.
    class ArticleSummary
      # @param doc [#css] node for a single article row
      def initialize(doc)
        @document = doc
      end

      # @return [String] text of the first link in the title column
      def title
        @title ||= @document.css("#col-title a:first").text
      end

      # @return [String] text of the first span in the title column
      def authors
        @authors ||= @document.css("#col-title span:first").text
      end

      # @return [String] text of the last span in the title column
      def publisher
        @publisher ||= @document.css("#col-title span:last").text
      end

      # @return [Integer] cited-by link text converted with to_i (0 when absent)
      def citations
        @citations ||= @document.css("#col-citedby a:first").text.to_i
      end

      # @return [Integer] year column text converted with to_i (0 when absent)
      def year
        @year ||= @document.css("#col-year").text.to_i
      end

      # Absolute URL of the article page, built from the title link's href.
      #
      # @return [String, nil] nil when the row has no title link or the link
      #   has no href (previously this raised NoMethodError)
      def full_article_url
        return @full_article_url if @full_article_url

        link = @document.css("#col-title a:first").first
        href = link && link["href"]
        @full_article_url = "#{Google::Scholar.google_url}#{href}" if href
      end
    end
  end
end
|
@@ -0,0 +1,31 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
module Google
  module Scholar
    # One author taken from a search result node. Summary data (name, id,
    # citation count, profile URL) is read from that node; the full profile
    # is fetched through a Scraper the first time it is needed.
    class Author
      # @param document [#css] node describing a single author search result
      def initialize(document)
        @summary_doc = document
      end

      # @return [String] text of the first link in the last cell, stripped
      #   and with embedded newlines removed
      def name
        # The original used gsub('\n', ''), whose single-quoted pattern is a
        # literal backslash followed by "n" and therefore never matched a newline.
        @name ||= @summary_doc.css("td:last a:first").text.strip.delete("\n")
      end

      # @return [Integer] last whitespace-separated token of the last text
      #   node in the last cell, converted with to_i
      def citation_count
        @citation_count ||= @summary_doc.css("td:last").children.select(&:text?).last.text.split(" ").last.to_i
      end

      # @return [String] absolute profile URL with "&pagesize=100" appended
      def author_url
        @author_url ||= "#{Google::Scholar.google_url}#{@summary_doc.css("td:last a").first.attr("href")}&pagesize=100"
      end

      # @return [Object] first document loaded by a Scraper for {#author_url}
      def full_profile
        @full_profile ||= Google::Scholar::Scraper.new(author_url).documents.first
      end

      # @return [Integer] first "#stats td.cit-data" cell of the full profile,
      #   converted with to_i
      def citations
        full_profile.css("#stats td.cit-data").first.text.to_i
      end

      # Value of the `user` query parameter in the profile link.
      #
      # @return [String, nil] nil when the link has no `user=` parameter
      def id
        # [^&]+ stops at the next parameter; the former greedy (.*)& swallowed
        # later parameters and raised when `user` was the last one.
        @id ||= @summary_doc.css("td:last a:first").attr("href").to_s[/user=([^&]+)/, 1]
      end

      # @return [Google::Scholar::ArticleEnumerator] articles of the full profile
      def articles
        @articles ||= Google::Scholar::ArticleEnumerator.new(Google::Scholar::Scraper.new(nil, full_profile))
      end
    end
  end
end
|
@@ -0,0 +1,32 @@
|
|
1
|
+
require 'enumerator'
|
2
|
+
module Google
  module Scholar
    # Enumerates the authors of every document held by a scraper, asking the
    # scraper to load a further page whenever the documents loaded so far are
    # exhausted and the scraper reports that more pages exist.
    class AuthorEnumerator
      include Enumerable

      # @param scraper [#documents, #has_more_pages?, #load_next_page]
      def initialize(scraper)
        @scraper = scraper
        @documents = scraper.documents
      end

      # Yields each author of each document in order.
      #
      # @yieldparam author [Object] one element of a document's `authors`
      # @return [Enumerator, nil] an Enumerator when called without a block,
      #   otherwise nil
      def each
        return enum_for(:each) unless block_given?

        index = 0
        while (document = @documents[index])
          document.authors.each { |author| yield(author) }
          index += 1
          # Documents already loaded are consumed before another page is requested.
          next if index < @documents.length
          break unless @scraper.has_more_pages?

          @scraper.load_next_page
          @documents = @scraper.documents
        end
      end
    end
  end
end
|
@@ -0,0 +1,10 @@
|
|
1
|
+
module Google
  module Scholar
    # Search whose results are exposed as an enumeration of authors.
    class AuthorSearch < Google::Scholar::Base
      # Memoized enumerator over the scraper's authors.
      #
      # @return [Google::Scholar::AuthorEnumerator, nil] nil when the first
      #   scraped document is not an AuthorsDocument
      def authors
        first_document = @scraper.documents.first
        if first_document.is_a?(Google::Scholar::AuthorsDocument)
          @authors ||= Google::Scholar::AuthorEnumerator.new(@scraper)
        end
      end
    end
  end
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
module Google
  module Scholar
    # Common parent of the search classes: owns the scraper for a URL.
    class Base
      attr_accessor :scraper

      # @param url [String] URL handed to a new Scraper
      def initialize(url)
        @scraper = Google::Scholar::Scraper.new(url)
        self
      end

      class << self
        # Builds an AuthorSearch for the author search URL of the given name.
        #
        # @param author [String] author name
        # @return [Google::Scholar::AuthorSearch]
        def search_author(author)
          search_url = Google::Scholar.author_search_url(author)
          Google::Scholar::AuthorSearch.new(search_url)
        end
      end
    end
  end
end
|
@@ -0,0 +1,31 @@
|
|
1
|
+
module Google
  module Scholar
    # Wrapper around a parsed page. Any message the wrapped document responds
    # to is forwarded to it; on top of that the wrapper knows how to find the
    # pager's "Next" link.
    class Document
      attr_reader :document

      # @param nokogiri_document [#css] the parsed page to wrap
      def initialize(nokogiri_document)
        @document = nokogiri_document
      end

      # Forwards unknown messages to the wrapped document when it supports them.
      def method_missing(meth, *args, &block)
        return super unless @document.respond_to?(meth)

        @document.send(meth, *args, &block)
      end

      # Keeps respond_to? consistent with the forwarding done in method_missing.
      def respond_to_missing?(meth, include_private = false)
        @document.respond_to?(meth) || super
      end

      # @return [Boolean] true when a pager link whose content includes "Next" exists
      def has_next_page?
        !next_page_link.nil?
      end

      # @return [String, nil] absolute URL of the next page, or nil when there is none
      def next_page_url
        link = next_page_link
        "#{Google::Scholar.google_url}#{link.attr("href")}" if link
      end

      private

      # First pager link whose content includes "Next", or nil.
      def next_page_link
        @document.css('.cit-dgb .cit-dark-link').find { |link| link.content.include?("Next") }
      end
    end
  end
end
|
@@ -0,0 +1,25 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
require 'open-uri'
|
3
|
+
module Google
  module Scholar
    # Document for an author search result page.
    class AuthorsDocument < Document
      # A page is valid when its lower-cased content mentions "authors" and
      # does not contain "didn't match any".
      #
      # @return [Boolean]
      def valid?
        page_text = content.downcase
        return false if page_text.include?("didn't match any")

        page_text.include?("authors")
      end

      # @return [Integer] number of '.g-unit' elements on the page
      def authors_count
        css('.g-unit').length
      end

      # Authors built from the page's '.g-unit' elements; the list is cached.
      #
      # @param force [Boolean] rebuild the list even when it is already cached
      # @return [Array<Google::Scholar::Author>]
      def authors(force=false)
        @authors = nil if force
        @authors ||= css('.g-unit').map { |unit| Google::Scholar::Author.new(unit) }
      end

      # @return [Google::Scholar::Author, nil] last entry of {#authors}
      def last_author
        authors.last
      end
    end
  end
end
|
@@ -0,0 +1,14 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
require 'open-uri'
|
3
|
+
module Google
  module Scholar
    # Document for an author's profile page.
    class AuthorsProfileDocument < Document
      # Article summaries built from the ".cit-table tr.item" rows; the list
      # is cached in @citations.
      #
      # @param force [Boolean] rebuild the list even when it is already cached
      # @return [Array<Google::Scholar::ArticleSummary>]
      def articles(force=false)
        @citations = nil if force
        @citations ||= css(".cit-table tr.item").map { |row| Google::Scholar::ArticleSummary.new(row) }
      end
    end
  end
end
|
@@ -0,0 +1,44 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
require 'open-uri'
|
3
|
+
module Google
  module Scholar
    # Loads pages and keeps them as an ordered list of document wrappers,
    # appending the following page on request.
    class Scraper
      attr_accessor :documents

      # @param url [String, nil] page to load immediately; skipped when nil
      # @param initial_document [Object, nil] already-loaded document placed
      #   ahead of anything loaded from url
      def initialize(url, initial_document = nil)
        @documents = []
        @documents << initial_document if initial_document
        @documents << self.class.load_url(url) if url
      end

      # Picks the wrapper class for a URL from its query string.
      #
      # @param url [String]
      # @return [Class] AuthorsDocument for "view_op=search_authors",
      #   AuthorsProfileDocument when any argument contains "user=",
      #   otherwise Document
      def self.class_lookup(url = "")
        arguments = url.split("?")
        arguments = arguments[1].split("&") if arguments.length > 1
        return Google::Scholar::AuthorsDocument if arguments.include?("view_op=search_authors")
        return Google::Scholar::AuthorsProfileDocument if arguments.any? { |argument| argument.include?("user=") }

        Google::Scholar::Document
      end

      # @return [Boolean] true when every held document reports valid?
      #   (also true when no document is held)
      def valid?
        @documents.all?(&:valid?)
      end

      # Loads the page after the last held document and appends it.
      #
      # @return [Array, nil] the document list, or nil when there is no further page
      def load_next_page
        return unless has_more_pages?

        @documents << self.class.load_url(@documents.last.next_page_url)
      end

      # Fetches a URL and wraps the parsed HTML in the class chosen by class_lookup.
      #
      # @param url [String] absolute http or https URL
      # @return [Google::Scholar::Document] or one of its subclasses
      # @raise [RuntimeError] when the URL's scheme is not http or https
      def self.load_url(url)
        uri = URI(url)
        raise "Invalid scheme for #{url}" unless %w[http https].include?(uri.scheme)

        # URI#open (from open-uri) is used instead of Kernel#open, which no
        # longer opens URLs on Ruby 3.0+; the block form also closes the stream.
        uri.open { |page| class_lookup(url).new(Nokogiri::HTML(page)) }
      end

      # @return [Boolean] whether the last held document has a next page;
      #   false when no document is held
      def has_more_pages?
        last_document = @documents.last
        !last_document.nil? && last_document.has_next_page?
      end
    end
  end
end
|