google-scholar 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +15 -0
- data/.gitignore +18 -0
- data/.rspec +2 -0
- data/Gemfile +4 -0
- data/Guardfile +8 -0
- data/LICENSE.txt +22 -0
- data/README.md +29 -0
- data/Rakefile +5 -0
- data/google-scholar.gemspec +29 -0
- data/lib/google/scholar.rb +29 -0
- data/lib/google/scholar/article_enumerator.rb +32 -0
- data/lib/google/scholar/article_summary.rb +27 -0
- data/lib/google/scholar/author.rb +31 -0
- data/lib/google/scholar/author_enumerator.rb +32 -0
- data/lib/google/scholar/author_search.rb +10 -0
- data/lib/google/scholar/base.rb +15 -0
- data/lib/google/scholar/document.rb +31 -0
- data/lib/google/scholar/document/authors_document.rb +25 -0
- data/lib/google/scholar/document/authors_profile_document.rb +14 -0
- data/lib/google/scholar/scraper.rb +44 -0
- data/lib/google/scholar/version.rb +5 -0
- data/spec/fixtures/article_part.htm +1 -0
- data/spec/fixtures/author_profile_page.htm +1 -0
- data/spec/fixtures/author_result_page.htm +1 -0
- data/spec/fixtures/author_result_page_has_next.htm +1 -0
- data/spec/fixtures/full_article.htm +1 -0
- data/spec/fixtures/single_author_page.htm +14 -0
- data/spec/lib/google/scholar/article_enumerator_spec.rb +32 -0
- data/spec/lib/google/scholar/article_summary_spec.rb +41 -0
- data/spec/lib/google/scholar/author_enumerator_spec.rb +33 -0
- data/spec/lib/google/scholar/author_search_spec.rb +20 -0
- data/spec/lib/google/scholar/author_spec.rb +66 -0
- data/spec/lib/google/scholar/base_spec.rb +15 -0
- data/spec/lib/google/scholar/document/authors_document_spec.rb +103 -0
- data/spec/lib/google/scholar/document/authors_profile_document_spec.rb +40 -0
- data/spec/lib/google/scholar/scraper_spec.rb +39 -0
- data/spec/lib/google/scholar_spec.rb +21 -0
- data/spec/spec_helper.rb +19 -0
- metadata +196 -0
checksums.yaml
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
---
|
2
|
+
!binary "U0hBMQ==":
|
3
|
+
metadata.gz: !binary |-
|
4
|
+
OGM2NTE5Njc2ZTYxODM1NGQxNTI4ODg3MzY3ZmVmNzlhNjJiMjhmMA==
|
5
|
+
data.tar.gz: !binary |-
|
6
|
+
ZGRlYzQ3MDc3MGEyMGUwMGQ4NDAyM2QwNzFlNzUzY2Q2MmNmZTBjYg==
|
7
|
+
!binary "U0hBNTEy":
|
8
|
+
metadata.gz: !binary |-
|
9
|
+
ZjE0YzY2NjA5YjBlNDBiODI4N2VkMmFkNDZjYmI5OGEyYzEyNjA2ZjdlMjk2
|
10
|
+
YzljNWRkMTlmZTE4N2I2YzIyMmQ4M2UzMGIzYTQzMWIzMzc0MTMzNDJjODEx
|
11
|
+
NzYxODQ4ZmVkMzY1M2M0ZTBlNjY0ZjBmY2Y2Mjg3ODg5YmEzYjE=
|
12
|
+
data.tar.gz: !binary |-
|
13
|
+
Y2Q2MDJiMjFiMWY5MDg5MjIwMDFjMWNkMWU0YWEyODZlMzIzMWVkNjEzZTNk
|
14
|
+
ZWE1YmQxYjIzODMzNDJjZjA4MjJkY2U3ODRiMmMzNjdiZWI2OTM0Y2FhYWIy
|
15
|
+
YWUxYjcyYjFhMGZlZTQxNjk1ZDE2MWYyNDRiYzRjMzQ5NGZkMWY=
|
data/.gitignore
ADDED
data/.rspec
ADDED
data/Gemfile
ADDED
data/Guardfile
ADDED
@@ -0,0 +1,8 @@
|
|
1
|
+
# A sample Guardfile
|
2
|
+
# More info at https://github.com/guard/guard#readme
|
3
|
+
|
4
|
+
# Run RSpec through Guard with coloured, documentation-style output.
guard(:rspec, cli: "--color --format documentation") do
  # A changed spec file is watched directly.
  watch(%r{^spec/.+_spec\.rb$})

  # A changed file under lib/ maps to the spec with the mirrored path.
  watch(%r{^lib/(.+)\.rb$}) do |match|
    "spec/lib/#{match[1]}_spec.rb"
  end

  # A change to the shared helper maps to the whole spec directory.
  watch('spec/spec_helper.rb') do
    "spec"
  end
end
|
data/LICENSE.txt
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2013 Trey Terrell
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
# Google::Scholar
|
2
|
+
|
3
|
+
TODO: Write a gem description
|
4
|
+
|
5
|
+
## Installation
|
6
|
+
|
7
|
+
Add this line to your application's Gemfile:
|
8
|
+
|
9
|
+
gem 'google-scholar'
|
10
|
+
|
11
|
+
And then execute:
|
12
|
+
|
13
|
+
$ bundle
|
14
|
+
|
15
|
+
Or install it yourself as:
|
16
|
+
|
17
|
+
$ gem install google-scholar
|
18
|
+
|
19
|
+
## Usage
|
20
|
+
|
21
|
+
TODO: Write usage instructions here
|
22
|
+
|
23
|
+
## Contributing
|
24
|
+
|
25
|
+
1. Fork it
|
26
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
27
|
+
3. Commit your changes (`git commit -am 'Add some feature'`)
|
28
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
29
|
+
5. Create new Pull Request
|
data/Rakefile
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
lib = File.expand_path('../lib', __FILE__)
|
3
|
+
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
|
+
require 'google/scholar/version'
|
5
|
+
require 'rbconfig'
|
6
|
+
# Packaging metadata for the google-scholar gem.
Gem::Specification.new do |gem|
  # Identity and authorship.
  gem.name        = "google-scholar"
  gem.version     = Google::Scholar::VERSION
  gem.authors     = ["Trey Terrell"]
  gem.email       = ["trey.terrell@oregonstate.edu"]
  gem.description = %q{Google Scholar interface. Currently only works for Author searches.}
  gem.summary     = %q{Google Scholar interface. Currently only works for Author searches.}
  gem.homepage    = ""
  gem.license     = "MIT"

  # Everything tracked by git is packaged; executables and test files are
  # picked out of that list by path prefix.
  gem.files         = `git ls-files`.split($/)
  gem.executables   = gem.files.grep(%r{^bin/}) { |path| File.basename(path) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.require_paths = ["lib"]

  # Runtime dependency.
  gem.add_dependency 'nokogiri', '~> 1.5.0'

  # Development-only dependencies.
  gem.add_development_dependency "bundler", "~> 1.3"
  gem.add_development_dependency "rake"
  gem.add_development_dependency 'rspec', '~> 2.13'
  gem.add_development_dependency 'guard', '~> 1.8.0'
  gem.add_development_dependency 'guard-rspec'

  # wdm is only added when the target OS reported by RbConfig is Windows.
  if RbConfig::CONFIG['target_os'] =~ /mswin|mingw/i
    gem.add_development_dependency 'wdm', '>= 0.1.0'
  end
end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
require "google/scholar/version"
|
2
|
+
require "google/scholar/base"
|
3
|
+
require "google/scholar/scraper"
|
4
|
+
require "google/scholar/document"
|
5
|
+
require "google/scholar/author"
|
6
|
+
require "google/scholar/author_search"
|
7
|
+
require "google/scholar/author_enumerator"
|
8
|
+
require "google/scholar/article_summary"
|
9
|
+
require "google/scholar/article_enumerator"
|
10
|
+
require "google/scholar/document/authors_document"
|
11
|
+
require "google/scholar/document/authors_profile_document"
|
12
|
+
require 'cgi'
|
13
|
+
module Google
  # URL helpers shared by the rest of the library.
  module Scholar
    class << self
      # Host name that all requests are built against.
      def google_root
        "scholar.google.com"
      end

      # URL scheme prefix, including the "://" separator.
      def http_scheme
        "http://"
      end

      # Scheme and host joined together, with no trailing slash.
      def google_url
        "#{http_scheme}#{google_root}"
      end

      # @TODO May want to move this to AuthorSearch.
      #
      # Builds the author-search URL for +author+. The name is wrapped as
      # author:"<name>" and CGI-escaped before being placed in the
      # mauthors query parameter.
      def author_search_url(author)
        query = ::CGI::escape("author:\"#{author}\"")
        "#{google_url}/citations?view_op=search_authors&hl=en&mauthors=#{query}"
      end
    end
  end
end
|
@@ -0,0 +1,32 @@
|
|
1
|
+
require 'enumerator'
|
2
|
+
module Google
  module Scholar
    # Enumerates the articles of every document held by a scraper, asking the
    # scraper to load further pages once the already-loaded ones run out.
    class ArticleEnumerator
      include Enumerable

      # scraper must respond to #documents, #has_more_pages? and
      # #load_next_page.
      def initialize(scraper)
        @scraper = scraper
        @documents = scraper.documents
      end

      # Yields each article of each document in order. Returns an Enumerator
      # when called without a block, instead of failing on the first yield.
      def each
        return enum_for(:each) unless block_given?

        index = 0
        page = @documents.first
        while page
          page.articles.each { |article| yield(article) }
          index += 1
          # Use an already-loaded document if there is one; otherwise try to
          # pull the next page from the scraper.
          page = @documents[index] || fetch_page(index)
        end
      end

      private

      # Asks the scraper for its next page and returns the document at
      # +index+, or nil when the scraper reports no further pages.
      def fetch_page(index)
        return nil unless @scraper.has_more_pages?
        @scraper.load_next_page
        @documents = @scraper.documents
        @documents[index]
      end
    end
  end
end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
module Google
  module Scholar
    # Lazily-read view over one article row. Every reader runs a CSS query
    # against the wrapped node on first use and caches the result.
    class ArticleSummary
      # doc must respond to #css.
      def initialize(doc)
        @document = doc
      end

      # Text of the first link in the title column.
      def title
        @title ||= title_link.text
      end

      # Text of the first span in the title column.
      def authors
        @authors ||= text_at("#col-title span:first")
      end

      # Text of the last span in the title column.
      def publisher
        @publisher ||= text_at("#col-title span:last")
      end

      # Integer parsed from the first link in the cited-by column.
      def citations
        @citations ||= text_at("#col-citedby a:first").to_i
      end

      # Integer parsed from the year column.
      def year
        @year ||= text_at("#col-year").to_i
      end

      # Site root followed by the href of the title link.
      def full_article_url
        return @full_article_url if @full_article_url
        @full_article_url = "#{Google::Scholar.google_url}#{title_link.attr("href").text}"
      end

      private

      # Node set for the title link, shared by #title and #full_article_url.
      def title_link
        @document.css("#col-title a:first")
      end

      # Text content of whatever +selector+ matches.
      def text_at(selector)
        @document.css(selector).text
      end
    end
  end
end
|
@@ -0,0 +1,31 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
module Google
  module Scholar
    # One author taken from a search-result node, with lazy access to the
    # author's full profile page and articles.
    class Author
      # document is the result node for this author; it must respond to #css.
      def initialize(document)
        @summary_doc = document
      end

      # Display name: text of the first link in the last cell, stripped and
      # with embedded newlines removed. The newline is written as "\n"
      # because a single-quoted '\n' is a backslash followed by "n" and
      # would never match a line break.
      def name
        @name ||= @summary_doc.css("td:last a:first").text.strip.gsub("\n", '')
      end

      # Integer parsed from the final whitespace-separated token of the last
      # text node in the last cell.
      def citation_count
        @citation_count ||= @summary_doc.css("td:last").children.select { |node| node.text? }.last.text.split(" ").last.to_i
      end

      # Absolute profile URL with pagesize=100 appended.
      def author_url
        @author_url ||= "#{Google::Scholar.google_url}#{@summary_doc.css("td:last a").first.attr("href")}&pagesize=100"
      end

      # First document scraped from #author_url; fetched once and cached.
      def full_profile
        @full_profile ||= Google::Scholar::Scraper.new(author_url).documents.first
      end

      # Integer parsed from the first "#stats td.cit-data" cell of the full
      # profile page.
      def citations
        full_profile.css("#stats td.cit-data").first.text.to_i
      end

      # Value of the user= query parameter in the profile link. The match
      # stops at the next "&", so later parameters are not captured, and it
      # still works when user= is the last parameter. Returns nil when the
      # link has no user= parameter.
      def id
        @id ||= @summary_doc.css("td:last a:first").attr("href").to_s[/user=([^&]+)/, 1]
      end

      # Enumerator over the author's articles, seeded with the full profile.
      def articles
        @articles ||= Google::Scholar::ArticleEnumerator.new(Google::Scholar::Scraper.new(nil, full_profile))
      end
    end
  end
end
|
@@ -0,0 +1,32 @@
|
|
1
|
+
require 'enumerator'
|
2
|
+
module Google
  module Scholar
    # Enumerates the authors of every document held by a scraper, asking the
    # scraper to load further pages once the already-loaded ones run out.
    class AuthorEnumerator
      include Enumerable

      # scraper must respond to #documents, #has_more_pages? and
      # #load_next_page.
      def initialize(scraper)
        @scraper = scraper
        @documents = scraper.documents
      end

      # Yields each author of each document in order. Returns an Enumerator
      # when called without a block, instead of failing on the first yield.
      def each
        return enum_for(:each) unless block_given?

        index = 0
        page = @documents.first
        while page
          page.authors.each { |author| yield(author) }
          index += 1
          # Use an already-loaded document if there is one; otherwise try to
          # pull the next page from the scraper.
          page = @documents[index] || fetch_page(index)
        end
      end

      private

      # Asks the scraper for its next page and returns the document at
      # +index+, or nil when the scraper reports no further pages.
      def fetch_page(index)
        return nil unless @scraper.has_more_pages?
        @scraper.load_next_page
        @documents = @scraper.documents
        @documents[index]
      end
    end
  end
end
|
@@ -0,0 +1,10 @@
|
|
1
|
+
module Google
  module Scholar
    # Search whose results are author listings.
    class AuthorSearch < Google::Scholar::Base
      # Enumerator over the authors found, built once and cached. Returns nil
      # when the first scraped document is not an AuthorsDocument.
      def authors
        first_page = @scraper.documents.first
        if first_page.kind_of?(Google::Scholar::AuthorsDocument)
          @authors ||= Google::Scholar::AuthorEnumerator.new(@scraper)
        end
      end
    end
  end
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
module Google
  module Scholar
    # Common parent for searches: owns the scraper created for the URL.
    class Base
      attr_accessor :scraper

      class << self
        # Builds an AuthorSearch for the author-search URL of +author+.
        def search_author(author)
          Google::Scholar::AuthorSearch.new(Google::Scholar.author_search_url(author))
        end
      end

      # Creates a scraper for +url+ and keeps it in #scraper.
      def initialize(url)
        @scraper = Google::Scholar::Scraper.new(url)
      end
    end
  end
end
|
@@ -0,0 +1,31 @@
|
|
1
|
+
module Google
  module Scholar
    # Wrapper around a parsed page. Any message the wrapper does not define
    # is forwarded to the wrapped document.
    class Document
      attr_reader :document

      # nokogiri_document is the parsed page to wrap.
      def initialize(nokogiri_document)
        @document = nokogiri_document
      end

      # Forwards messages the wrapped document responds to; everything else
      # falls through to the default NoMethodError.
      def method_missing(meth, *args, &block)
        if @document.respond_to?(meth)
          @document.public_send(meth, *args, &block)
        else
          super
        end
      end

      # Keeps respond_to? and method(...) in step with method_missing.
      def respond_to_missing?(meth, include_private = false)
        @document.respond_to?(meth, include_private) || super
      end

      # True when one of the pager links contains the text "Next".
      def has_next_page?
        !next_page_link.nil?
      end

      # Absolute URL of the "Next" pager link, or nil when there is none.
      def next_page_url
        link = next_page_link
        return nil unless link
        "#{Google::Scholar.google_url}#{link.attr("href")}"
      end

      private

      # First pager link whose content includes "Next", or nil.
      def next_page_link
        @document.css('.cit-dgb .cit-dark-link').find { |link| link.content.include?("Next") }
      end
    end
  end
end
|
@@ -0,0 +1,25 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
require 'open-uri'
|
3
|
+
module Google
  module Scholar
    # Document for an author-search result page.
    class AuthorsDocument < Document
      # True when the page text mentions "authors" and does not contain
      # "didn't match any" (both compared case-insensitively).
      def valid?
        has_authors = !content.downcase.index("authors").nil?
        no_matches  = !content.downcase.index("didn't match any").nil?
        has_authors && !no_matches
      end

      # Number of ".g-unit" nodes on the page.
      def authors_count
        css('.g-unit').length
      end

      # Author objects, one per ".g-unit" node. The list is cached; pass
      # true to rebuild it.
      def authors(force=false)
        if force || !@authors
          @authors = css('.g-unit').map { |node| Google::Scholar::Author.new(node) }
        end
        @authors
      end

      # Final entry of #authors.
      def last_author
        authors.last
      end
    end
  end
end
|
@@ -0,0 +1,14 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
require 'open-uri'
|
3
|
+
module Google
  module Scholar
    # Document for a single author's profile page.
    class AuthorsProfileDocument < Document
      # ArticleSummary objects, one per ".cit-table tr.item" row. The list
      # is cached; pass true to rebuild it.
      def articles(force=false)
        if force || !@citations
          @citations = css(".cit-table tr.item").map { |row| Google::Scholar::ArticleSummary.new(row) }
        end
        @citations
      end
    end
  end
end
|
@@ -0,0 +1,44 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
require 'open-uri'
|
3
|
+
module Google
  module Scholar
    # Loads pages and keeps them, in load order, in #documents.
    class Scraper
      attr_accessor :documents

      # Starts with +initial_document+ (when given), followed by the page
      # loaded from +url+ (when given).
      def initialize(url, initial_document=nil)
        @documents = []
        @documents << initial_document if initial_document
        @documents << self.class.load_url(url) if url
      end

      # Picks the Document class for +url+ from its query string:
      # view_op=search_authors selects AuthorsDocument, any parameter
      # containing "user=" selects AuthorsProfileDocument, and everything
      # else gets the plain Document.
      def self.class_lookup(url="")
        parts = url.split("?")
        # Without a query string the whole URL is checked as one argument.
        arguments = parts.length > 1 ? parts[1].split("&") : parts
        return Google::Scholar::AuthorsDocument if arguments.include?("view_op=search_authors")
        return Google::Scholar::AuthorsProfileDocument if arguments.any? { |argument| argument.include?("user=") }
        Google::Scholar::Document
      end

      # True when every loaded document reports itself valid.
      def valid?
        @documents.all? { |document| document.valid? }
      end

      # Loads the page after the last loaded one and appends it. Does
      # nothing when there is no further page.
      def load_next_page
        return unless has_more_pages?
        @documents << self.class.load_url(@documents.last.next_page_url)
      end

      # Fetches +url+, parses it and wraps it in the class chosen by
      # class_lookup. Raises RuntimeError unless the scheme is http or
      # https. The page is opened through the URI object, not Kernel#open,
      # because Kernel#open does not handle URLs on Ruby 3.0 and later; the
      # block form closes the stream once parsing is done.
      def self.load_url(url)
        uri = URI(url)
        raise "Invalid scheme for #{url}" unless %w[http https].include?(uri.scheme)
        parsed = uri.open { |io| Nokogiri::HTML(io) }
        class_lookup(url).new(parsed)
      end

      # True when the last loaded document has a next page; false when no
      # document has been loaded yet.
      def has_more_pages?
        last_document = @documents.last
        return false unless last_document
        last_document.has_next_page?
      end
    end
  end
end
|