google-scholar 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (39) hide show
  1. checksums.yaml +15 -0
  2. data/.gitignore +18 -0
  3. data/.rspec +2 -0
  4. data/Gemfile +4 -0
  5. data/Guardfile +8 -0
  6. data/LICENSE.txt +22 -0
  7. data/README.md +29 -0
  8. data/Rakefile +5 -0
  9. data/google-scholar.gemspec +29 -0
  10. data/lib/google/scholar.rb +29 -0
  11. data/lib/google/scholar/article_enumerator.rb +32 -0
  12. data/lib/google/scholar/article_summary.rb +27 -0
  13. data/lib/google/scholar/author.rb +31 -0
  14. data/lib/google/scholar/author_enumerator.rb +32 -0
  15. data/lib/google/scholar/author_search.rb +10 -0
  16. data/lib/google/scholar/base.rb +15 -0
  17. data/lib/google/scholar/document.rb +31 -0
  18. data/lib/google/scholar/document/authors_document.rb +25 -0
  19. data/lib/google/scholar/document/authors_profile_document.rb +14 -0
  20. data/lib/google/scholar/scraper.rb +44 -0
  21. data/lib/google/scholar/version.rb +5 -0
  22. data/spec/fixtures/article_part.htm +1 -0
  23. data/spec/fixtures/author_profile_page.htm +1 -0
  24. data/spec/fixtures/author_result_page.htm +1 -0
  25. data/spec/fixtures/author_result_page_has_next.htm +1 -0
  26. data/spec/fixtures/full_article.htm +1 -0
  27. data/spec/fixtures/single_author_page.htm +14 -0
  28. data/spec/lib/google/scholar/article_enumerator_spec.rb +32 -0
  29. data/spec/lib/google/scholar/article_summary_spec.rb +41 -0
  30. data/spec/lib/google/scholar/author_enumerator_spec.rb +33 -0
  31. data/spec/lib/google/scholar/author_search_spec.rb +20 -0
  32. data/spec/lib/google/scholar/author_spec.rb +66 -0
  33. data/spec/lib/google/scholar/base_spec.rb +15 -0
  34. data/spec/lib/google/scholar/document/authors_document_spec.rb +103 -0
  35. data/spec/lib/google/scholar/document/authors_profile_document_spec.rb +40 -0
  36. data/spec/lib/google/scholar/scraper_spec.rb +39 -0
  37. data/spec/lib/google/scholar_spec.rb +21 -0
  38. data/spec/spec_helper.rb +19 -0
  39. metadata +196 -0
@@ -0,0 +1,15 @@
1
+ ---
2
+ !binary "U0hBMQ==":
3
+ metadata.gz: !binary |-
4
+ OGM2NTE5Njc2ZTYxODM1NGQxNTI4ODg3MzY3ZmVmNzlhNjJiMjhmMA==
5
+ data.tar.gz: !binary |-
6
+ ZGRlYzQ3MDc3MGEyMGUwMGQ4NDAyM2QwNzFlNzUzY2Q2MmNmZTBjYg==
7
+ !binary "U0hBNTEy":
8
+ metadata.gz: !binary |-
9
+ ZjE0YzY2NjA5YjBlNDBiODI4N2VkMmFkNDZjYmI5OGEyYzEyNjA2ZjdlMjk2
10
+ YzljNWRkMTlmZTE4N2I2YzIyMmQ4M2UzMGIzYTQzMWIzMzc0MTMzNDJjODEx
11
+ NzYxODQ4ZmVkMzY1M2M0ZTBlNjY0ZjBmY2Y2Mjg3ODg5YmEzYjE=
12
+ data.tar.gz: !binary |-
13
+ Y2Q2MDJiMjFiMWY5MDg5MjIwMDFjMWNkMWU0YWEyODZlMzIzMWVkNjEzZTNk
14
+ ZWE1YmQxYjIzODMzNDJjZjA4MjJkY2U3ODRiMmMzNjdiZWI2OTM0Y2FhYWIy
15
+ YWUxYjcyYjFhMGZlZTQxNjk1ZDE2MWYyNDRiYzRjMzQ5NGZkMWY=
@@ -0,0 +1,18 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
18
+ .idea/*
data/.rspec ADDED
@@ -0,0 +1,2 @@
1
+ --color
2
+ --format documentation
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in google-scholar.gemspec
4
+ gemspec
@@ -0,0 +1,8 @@
# Guard configuration: re-run RSpec when spec or library files change.
# DSL reference: https://github.com/guard/guard#readme

guard :rspec, cli: "--color --format documentation" do
  # A changed spec file re-runs itself.
  watch(%r{^spec/.+_spec\.rb$})
  # A changed library file re-runs the spec at the mirrored path under spec/lib.
  watch(%r{^lib/(.+)\.rb$}) { |match| "spec/lib/#{match[1]}_spec.rb" }
  # A changed spec helper re-runs the whole spec directory.
  watch('spec/spec_helper.rb') { "spec" }
end
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2013 Trey Terrell
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,29 @@
1
+ # Google::Scholar
2
+
3
+ Google Scholar interface. Currently only works for Author searches.
4
+
5
+ ## Installation
6
+
7
+ Add this line to your application's Gemfile:
8
+
9
+ gem 'google-scholar'
10
+
11
+ And then execute:
12
+
13
+ $ bundle
14
+
15
+ Or install it yourself as:
16
+
17
+ $ gem install google-scholar
18
+
19
+ ## Usage
20
+
21
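+ Call `Google::Scholar::Base.search_author("Jane Doe").authors` to enumerate the matching authors; each author exposes `name`, `citation_count`, `id` and `articles`.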
22
+
23
+ ## Contributing
24
+
25
+ 1. Fork it
26
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
27
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
28
+ 4. Push to the branch (`git push origin my-new-feature`)
29
+ 5. Create new Pull Request
@@ -0,0 +1,5 @@
1
+ require "bundler/gem_tasks"
2
+ require 'rspec/core/rake_task'
# Define the `spec` task, which runs the RSpec suite.
RSpec::Core::RakeTask.new('spec')

# A bare `rake` invocation runs the specs.
task default: :spec
@@ -0,0 +1,29 @@
# coding: utf-8
# Gem specification for google-scholar.

# Put lib/ on the load path so the version constant can be required below.
lib_dir = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib_dir) unless $LOAD_PATH.include?(lib_dir)
require 'google/scholar/version'
require 'rbconfig'

Gem::Specification.new do |spec|
  # The same sentence is used for both the description and the summary.
  blurb = %q{Google Scholar interface. Currently only works for Author searches.}

  spec.name          = "google-scholar"
  spec.version       = Google::Scholar::VERSION
  spec.authors       = ["Trey Terrell"]
  spec.email         = ["trey.terrell@oregonstate.edu"]
  spec.description   = blurb
  spec.summary       = blurb
  spec.homepage      = ""
  spec.license       = "MIT"

  # Package exactly what git tracks.
  spec.files         = `git ls-files`.split($/)
  spec.executables   = spec.files.grep(%r{^bin/}).map { |path| File.basename(path) }
  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ["lib"]

  spec.add_dependency 'nokogiri', '~> 1.5.0'

  spec.add_development_dependency "bundler", "~> 1.3"
  spec.add_development_dependency "rake"
  spec.add_development_dependency 'rspec', '~> 2.13'
  spec.add_development_dependency 'guard', '~> 1.8.0'
  spec.add_development_dependency 'guard-rspec'

  # wdm is only added when the target OS is Windows (mswin/mingw).
  on_windows = RbConfig::CONFIG['target_os'] =~ /mswin|mingw/i
  spec.add_development_dependency 'wdm', '>= 0.1.0' if on_windows
end
@@ -0,0 +1,29 @@
1
+ require "google/scholar/version"
2
+ require "google/scholar/base"
3
+ require "google/scholar/scraper"
4
+ require "google/scholar/document"
5
+ require "google/scholar/author"
6
+ require "google/scholar/author_search"
7
+ require "google/scholar/author_enumerator"
8
+ require "google/scholar/article_summary"
9
+ require "google/scholar/article_enumerator"
10
+ require "google/scholar/document/authors_document"
11
+ require "google/scholar/document/authors_profile_document"
12
+ require 'cgi'
module Google
  # Top-level namespace of the gem; exposes the URL pieces used to reach
  # Google Scholar.
  module Scholar
    class << self
      # @return [String] host name of Google Scholar
      def google_root
        "scholar.google.com"
      end

      # @return [String] URL scheme prefix, including the "://" separator
      def http_scheme
        "http://"
      end

      # @return [String] scheme plus host, without a trailing slash
      def google_url
        "#{http_scheme}#{google_root}"
      end

      # Builds the author-search URL for the given name.
      #
      # @TODO May want to move this to AuthorSearch.
      # @param author [String] author name; wrapped in an author:"..." query
      #   and CGI-escaped
      # @return [String] absolute search URL
      def author_search_url(author)
        query = ::CGI::escape("author:\"#{author}\"")
        "#{http_scheme}#{google_root}/citations?view_op=search_authors&hl=en&mauthors=#{query}"
      end
    end
  end
end
@@ -0,0 +1,32 @@
1
+ require 'enumerator'
module Google
  module Scholar
    # Enumerates the articles of every document held by a scraper, asking the
    # scraper for further pages only once the loaded ones are exhausted.
    class ArticleEnumerator
      include Enumerable

      # @param scraper [#documents, #has_more_pages?, #load_next_page]
      #   source of documents; each document must respond to #articles
      def initialize(scraper)
        @scraper = scraper
        @documents = scraper.documents
      end

      # Yields each article of each document in order.
      #
      # @yieldparam article [Object] one element of a document's #articles
      # @return [Enumerator] when no block is given
      # @return [self] when a block is given
      def each
        return enum_for(:each) unless block_given?

        index = 0
        while (page = @documents[index])
          page.articles.each { |article| yield(article) }
          index += 1
          # More already-loaded documents remain: keep walking them.
          next if index < @documents.length
          break unless @scraper.has_more_pages?

          # Loaded documents are used up; fetch the next page and re-read the
          # list. If nothing was appended, the loop condition ends the walk.
          @scraper.load_next_page
          @documents = @scraper.documents
        end
        self
      end
    end
  end
end
@@ -0,0 +1,27 @@
module Google
  module Scholar
    # Read-only accessors over one article node; every value is looked up via
    # CSS selectors on first use and then cached.
    class ArticleSummary
      # @param doc [#css] node to query (presumably one row of a profile's
      #   article table - confirm against the caller)
      def initialize(doc)
        @document = doc
      end

      # @return [String] text of the first link in the title column
      def title
        @title ||= text_at("#col-title a:first")
      end

      # @return [String] text of the first span in the title column
      def authors
        @authors ||= text_at("#col-title span:first")
      end

      # @return [String] text of the last span in the title column
      def publisher
        @publisher ||= text_at("#col-title span:last")
      end

      # @return [Integer] cited-by count (0 when the text is not numeric)
      def citations
        @citations ||= text_at("#col-citedby a:first").to_i
      end

      # @return [Integer] publication year (0 when the text is not numeric)
      def year
        @year ||= text_at("#col-year").to_i
      end

      # @return [String] absolute URL built from the title link's href
      def full_article_url
        @full_article_url ||= begin
          root = Google::Scholar.google_url
          href = @document.css("#col-title a:first").attr("href").text
          "#{root}#{href}"
        end
      end

      private

      # Text content of whatever the selector matches in the wrapped node.
      def text_at(selector)
        @document.css(selector).text
      end
    end
  end
end
@@ -0,0 +1,31 @@
1
+ require 'nokogiri'
module Google
  module Scholar
    # One author from a search-result node. Summary fields are read from that
    # node; the full profile is fetched only when it is first needed.
    class Author
      # @param document [#css] node holding this author's summary markup
      def initialize(document)
        @summary_doc = document
      end

      # @return [String] link text with surrounding whitespace and any
      #   embedded newlines removed
      def name
        # Double-quoted "\n" is a real newline; the single-quoted '\n' used
        # before was a literal backslash-n and never matched.
        @name ||= @summary_doc.css("td:last a:first").text.strip.delete("\n")
      end

      # @return [Integer] last whitespace-separated token of the last text
      #   node in the final cell, as an integer
      def citation_count
        @citation_count ||= @summary_doc.css("td:last").children.select(&:text?).last.text.split(" ").last.to_i
      end

      # @return [String] absolute profile URL with pagesize=100 appended
      def author_url
        @author_url ||= "#{Google::Scholar.google_url}#{@summary_doc.css("td:last a").first.attr("href")}&pagesize=100"
      end

      # @return [Object] first document the scraper loads from #author_url
      def full_profile
        @full_profile ||= Google::Scholar::Scraper.new(author_url).documents.first
      end

      # @return [Integer] first "#stats td.cit-data" cell of the full profile
      def citations
        full_profile.css("#stats td.cit-data").first.text.to_i
      end

      # @return [String, nil] value of the `user` query parameter in the
      #   profile link, or nil when the link has none
      def id
        # Stop at the next "&" so trailing parameters are not captured, and
        # do not require one so "user=..." may be the last parameter.
        @id ||= @summary_doc.css("td:last a:first").attr("href").to_s[/user=([^&]+)/, 1]
      end

      # @return [ArticleEnumerator] articles, starting from the full profile
      def articles
        @articles ||= Google::Scholar::ArticleEnumerator.new(Google::Scholar::Scraper.new(nil, self.full_profile))
      end
    end
  end
end
@@ -0,0 +1,32 @@
1
+ require 'enumerator'
module Google
  module Scholar
    # Enumerates the authors of every document held by a scraper, asking the
    # scraper for further pages only once the loaded ones are exhausted.
    class AuthorEnumerator
      include Enumerable

      # @param scraper [#documents, #has_more_pages?, #load_next_page]
      #   source of documents; each document must respond to #authors
      def initialize(scraper)
        @scraper = scraper
        @documents = scraper.documents
      end

      # Yields each author of each document in order.
      #
      # @yieldparam author [Object] one element of a document's #authors
      # @return [Enumerator] when no block is given
      # @return [self] when a block is given
      def each
        return enum_for(:each) unless block_given?

        index = 0
        while (page = @documents[index])
          page.authors.each { |author| yield(author) }
          index += 1
          # More already-loaded documents remain: keep walking them.
          next if index < @documents.length
          break unless @scraper.has_more_pages?

          # Loaded documents are used up; fetch the next page and re-read the
          # list. If nothing was appended, the loop condition ends the walk.
          @scraper.load_next_page
          @documents = @scraper.documents
        end
        self
      end
    end
  end
end
@@ -0,0 +1,10 @@
module Google
  module Scholar
    # Author search built on the scraper that Base#initialize creates.
    class AuthorSearch < Google::Scholar::Base
      # @return [AuthorEnumerator, nil] enumerator over the scraper's authors,
      #   or nil when the first loaded document is not an AuthorsDocument
      def authors
        first_page = @scraper.documents.first
        return nil unless first_page.is_a?(Google::Scholar::AuthorsDocument)

        @authors ||= Google::Scholar::AuthorEnumerator.new(@scraper)
      end
    end
  end
end
@@ -0,0 +1,15 @@
module Google
  module Scholar
    # Common parent for searches: owns the scraper created for a URL.
    class Base
      attr_accessor :scraper

      class << self
        # Starts an author search for the given name.
        #
        # @param author [String] name passed to Google::Scholar.author_search_url
        # @return [AuthorSearch] search object created for that URL
        def search_author(author)
          search_url = Google::Scholar.author_search_url(author)
          Google::Scholar::AuthorSearch.new(search_url)
        end
      end

      # @param url [String] address handed to a new Scraper
      def initialize(url)
        @scraper = Google::Scholar::Scraper.new(url)
      end
    end
  end
end
@@ -0,0 +1,31 @@
module Google
  module Scholar
    # Wraps a parsed document, forwards unknown methods to it, and adds
    # helpers for following the "Next" pagination link.
    class Document
      attr_reader :document

      # @param nokogiri_document [#css] parsed document to wrap
      def initialize(nokogiri_document)
        @document = nokogiri_document
      end

      # Forwards any method the wrapped document responds to.
      def method_missing(meth, *args, &block)
        return super unless @document.respond_to?(meth)

        @document.send(meth, *args, &block)
      end

      # Keeps #respond_to? and #method in step with the forwarding above.
      def respond_to_missing?(meth, include_private = false)
        @document.respond_to?(meth) || super
      end

      # @return [Boolean] true when a pagination link containing "Next" exists
      def has_next_page?
        !next_page_link.nil?
      end

      # @return [String, nil] absolute URL of the next page, or nil when there
      #   is no "Next" link
      def next_page_url
        link = next_page_link
        return nil unless link

        "#{Google::Scholar.google_url}#{link.attr("href")}"
      end

      private

      # First pagination link whose content contains "Next", or nil.
      def next_page_link
        @document.css('.cit-dgb .cit-dark-link').find { |link| link.content.include?("Next") }
      end
    end
  end
end
@@ -0,0 +1,25 @@
1
+ require 'nokogiri'
2
+ require 'open-uri'
module Google
  module Scholar
    # Document for an author-search result page.
    class AuthorsDocument < Document
      # @return [Boolean] true when the page text mentions "authors" and does
      #   not contain the "didn't match any" message (case-insensitive)
      def valid?
        page_text = self.content.downcase
        page_text.include?("authors") && !page_text.include?("didn't match any")
      end

      # @return [Integer] number of '.g-unit' nodes on the page
      def authors_count
        self.css('.g-unit').length
      end

      # Authors built from the page's '.g-unit' nodes; the list is cached.
      #
      # @param force [Boolean] rebuild the list even when one is cached
      # @return [Array<Author>]
      def authors(force=false)
        @authors = nil if force
        @authors ||= self.css('.g-unit').map { |node| Google::Scholar::Author.new(node) }
      end

      # @return [Author, nil] last author on the page, nil when there are none
      def last_author
        authors.last
      end
    end
  end
end
@@ -0,0 +1,14 @@
1
+ require 'nokogiri'
2
+ require 'open-uri'
module Google
  module Scholar
    # Document for a single author's profile page.
    class AuthorsProfileDocument < Document
      # Article summaries built from the ".cit-table tr.item" rows; the list
      # is cached in @citations.
      #
      # @param force [Boolean] rebuild the list even when one is cached
      # @return [Array<ArticleSummary>]
      def articles(force=false)
        @citations = nil if force
        @citations ||= self.css(".cit-table tr.item").map { |row| Google::Scholar::ArticleSummary.new(row) }
      end
    end
  end
end
@@ -0,0 +1,44 @@
1
+ require 'nokogiri'
2
+ require 'open-uri'
module Google
  module Scholar
    # Loads pages into Document objects and keeps them in fetch order.
    class Scraper
      attr_accessor :documents

      # @param url [String, nil] page to load immediately; nil skips loading
      # @param initial_document [Document, nil] already-loaded document, placed
      #   ahead of whatever +url+ yields
      def initialize(url, initial_document=nil)
        @documents = []
        @documents << initial_document if initial_document
        @documents << self.class.load_url(url) if url
      end

      # Picks the Document class for a URL from its query string.
      #
      # @param url [String]
      # @return [Class] AuthorsDocument, AuthorsProfileDocument or Document
      def self.class_lookup(url="")
        arguments = url.split("?")
        arguments = arguments[1].split("&") if arguments.length > 1
        return Google::Scholar::AuthorsDocument if arguments.include?("view_op=search_authors")
        return Google::Scholar::AuthorsProfileDocument if arguments.any? { |argument| argument.include?("user=") }

        Google::Scholar::Document
      end

      # @return [Boolean] true when every loaded document reports valid?
      #   (vacuously true when none are loaded)
      def valid?
        @documents.all? { |document| document.valid? }
      end

      # Fetches the page after the most recent document and appends it.
      #
      # @return [Array, nil] the documents list, or nil when no page follows
      def load_next_page
        return unless has_more_pages?

        @documents << self.class.load_url(@documents.last.next_page_url)
      end

      # Downloads and parses +url+ into the Document class chosen by
      # .class_lookup.
      #
      # @param url [String] absolute http or https URL
      # @raise [RuntimeError] when the scheme is missing or not http/https
      def self.load_url(url)
        uri = URI(url)
        raise "Invalid scheme for #{url}" unless %w[http https].include?(uri.scheme)

        # Open through the URI object: bare Kernel#open stopped accepting URLs
        # in Ruby 3.0. The block form closes the handle after parsing.
        uri.open { |io| class_lookup(url).new(Nokogiri::HTML(io)) }
      end

      # @return [Boolean] whether the most recent document links to a next
      #   page; false when nothing has been loaded
      def has_more_pages?
        newest = @documents.last
        !newest.nil? && newest.has_next_page?
      end
    end
  end
end