google-scholar 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39) hide show
  1. checksums.yaml +15 -0
  2. data/.gitignore +18 -0
  3. data/.rspec +2 -0
  4. data/Gemfile +4 -0
  5. data/Guardfile +8 -0
  6. data/LICENSE.txt +22 -0
  7. data/README.md +29 -0
  8. data/Rakefile +5 -0
  9. data/google-scholar.gemspec +29 -0
  10. data/lib/google/scholar.rb +29 -0
  11. data/lib/google/scholar/article_enumerator.rb +32 -0
  12. data/lib/google/scholar/article_summary.rb +27 -0
  13. data/lib/google/scholar/author.rb +31 -0
  14. data/lib/google/scholar/author_enumerator.rb +32 -0
  15. data/lib/google/scholar/author_search.rb +10 -0
  16. data/lib/google/scholar/base.rb +15 -0
  17. data/lib/google/scholar/document.rb +31 -0
  18. data/lib/google/scholar/document/authors_document.rb +25 -0
  19. data/lib/google/scholar/document/authors_profile_document.rb +14 -0
  20. data/lib/google/scholar/scraper.rb +44 -0
  21. data/lib/google/scholar/version.rb +5 -0
  22. data/spec/fixtures/article_part.htm +1 -0
  23. data/spec/fixtures/author_profile_page.htm +1 -0
  24. data/spec/fixtures/author_result_page.htm +1 -0
  25. data/spec/fixtures/author_result_page_has_next.htm +1 -0
  26. data/spec/fixtures/full_article.htm +1 -0
  27. data/spec/fixtures/single_author_page.htm +14 -0
  28. data/spec/lib/google/scholar/article_enumerator_spec.rb +32 -0
  29. data/spec/lib/google/scholar/article_summary_spec.rb +41 -0
  30. data/spec/lib/google/scholar/author_enumerator_spec.rb +33 -0
  31. data/spec/lib/google/scholar/author_search_spec.rb +20 -0
  32. data/spec/lib/google/scholar/author_spec.rb +66 -0
  33. data/spec/lib/google/scholar/base_spec.rb +15 -0
  34. data/spec/lib/google/scholar/document/authors_document_spec.rb +103 -0
  35. data/spec/lib/google/scholar/document/authors_profile_document_spec.rb +40 -0
  36. data/spec/lib/google/scholar/scraper_spec.rb +39 -0
  37. data/spec/lib/google/scholar_spec.rb +21 -0
  38. data/spec/spec_helper.rb +19 -0
  39. metadata +196 -0
@@ -0,0 +1,15 @@
1
+ ---
2
+ !binary "U0hBMQ==":
3
+ metadata.gz: !binary |-
4
+ OGM2NTE5Njc2ZTYxODM1NGQxNTI4ODg3MzY3ZmVmNzlhNjJiMjhmMA==
5
+ data.tar.gz: !binary |-
6
+ ZGRlYzQ3MDc3MGEyMGUwMGQ4NDAyM2QwNzFlNzUzY2Q2MmNmZTBjYg==
7
+ !binary "U0hBNTEy":
8
+ metadata.gz: !binary |-
9
+ ZjE0YzY2NjA5YjBlNDBiODI4N2VkMmFkNDZjYmI5OGEyYzEyNjA2ZjdlMjk2
10
+ YzljNWRkMTlmZTE4N2I2YzIyMmQ4M2UzMGIzYTQzMWIzMzc0MTMzNDJjODEx
11
+ NzYxODQ4ZmVkMzY1M2M0ZTBlNjY0ZjBmY2Y2Mjg3ODg5YmEzYjE=
12
+ data.tar.gz: !binary |-
13
+ Y2Q2MDJiMjFiMWY5MDg5MjIwMDFjMWNkMWU0YWEyODZlMzIzMWVkNjEzZTNk
14
+ ZWE1YmQxYjIzODMzNDJjZjA4MjJkY2U3ODRiMmMzNjdiZWI2OTM0Y2FhYWIy
15
+ YWUxYjcyYjFhMGZlZTQxNjk1ZDE2MWYyNDRiYzRjMzQ5NGZkMWY=
@@ -0,0 +1,18 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
18
+ .idea/*
data/.rspec ADDED
@@ -0,0 +1,2 @@
1
+ --color
2
+ --format documentation
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in google-scholar.gemspec
4
+ gemspec
@@ -0,0 +1,8 @@
1
+ # A sample Guardfile
2
+ # More info at https://github.com/guard/guard#readme
3
+
4
# Re-run specs automatically while developing; the CLI flags mirror .rspec.
guard(:rspec, :cli => "--color --format documentation") do
  # A changed spec file re-runs itself.
  watch(%r{^spec/.+_spec\.rb$})

  # A changed library file re-runs the spec at the mirrored path under spec/lib.
  watch(%r{^lib/(.+)\.rb$}) do |match|
    "spec/lib/#{match[1]}_spec.rb"
  end

  # A changed spec helper re-runs the whole spec directory.
  watch('spec/spec_helper.rb') do
    "spec"
  end
end
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2013 Trey Terrell
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,29 @@
1
+ # Google::Scholar
2
+
3
+ Google Scholar interface. Currently only works for Author searches.
4
+
5
+ ## Installation
6
+
7
+ Add this line to your application's Gemfile:
8
+
9
+ gem 'google-scholar'
10
+
11
+ And then execute:
12
+
13
+ $ bundle
14
+
15
+ Or install it yourself as:
16
+
17
+ $ gem install google-scholar
18
+
19
+ ## Usage
20
+
21
+ TODO: Write usage instructions here
22
+
23
+ ## Contributing
24
+
25
+ 1. Fork it
26
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
27
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
28
+ 4. Push to the branch (`git push origin my-new-feature`)
29
+ 5. Create new Pull Request
@@ -0,0 +1,5 @@
1
+ require "bundler/gem_tasks"
2
+ require 'rspec/core/rake_task'
3
# Defines the `spec` rake task, which runs the RSpec suite.
RSpec::Core::RakeTask.new('spec')

# Plain `rake` runs the specs.
task({ :default => :spec })
@@ -0,0 +1,29 @@
1
+ # coding: utf-8
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'google/scholar/version'
5
+ require 'rbconfig'
6
# Gem metadata for google-scholar. The version is read from
# Google::Scholar::VERSION and the packaged file list comes from git.
Gem::Specification.new do |spec|
  # Identity
  spec.name = "google-scholar"
  spec.version = Google::Scholar::VERSION
  spec.authors = ["Trey Terrell"]
  spec.email = ["trey.terrell@oregonstate.edu"]
  spec.description = "Google Scholar interface. Currently only works for Author searches."
  spec.summary = "Google Scholar interface. Currently only works for Author searches."
  spec.homepage = ""
  spec.license = "MIT"

  # Packaged files: everything tracked by git, split on the record separator.
  tracked_files = `git ls-files`.split($/)
  spec.files = tracked_files
  spec.executables = spec.files.grep(%r{^bin/}) { |path| File.basename(path) }
  spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ["lib"]

  # Runtime dependency
  spec.add_dependency('nokogiri', '~> 1.5.0')

  # Development dependencies, declared in the same order as before.
  [
    ["bundler", "~> 1.3"],
    ["rake"],
    ['rspec', '~> 2.13'],
    ['guard', '~> 1.8.0'],
    ['guard-rspec']
  ].each do |requirement|
    spec.add_development_dependency(*requirement)
  end

  # wdm is only declared when the target OS is Windows.
  if RbConfig::CONFIG['target_os'] =~ /mswin|mingw/i
    spec.add_development_dependency('wdm', '>= 0.1.0')
  end
end
@@ -0,0 +1,29 @@
1
+ require "google/scholar/version"
2
+ require "google/scholar/base"
3
+ require "google/scholar/scraper"
4
+ require "google/scholar/document"
5
+ require "google/scholar/author"
6
+ require "google/scholar/author_search"
7
+ require "google/scholar/author_enumerator"
8
+ require "google/scholar/article_summary"
9
+ require "google/scholar/article_enumerator"
10
+ require "google/scholar/document/authors_document"
11
+ require "google/scholar/document/authors_profile_document"
12
+ require 'cgi'
13
+ module Google
14
+ module Scholar
15
+ def self.google_root
16
+ "scholar.google.com"
17
+ end
18
+ def self.http_scheme
19
+ "http://"
20
+ end
21
+ def self.google_url
22
+ "#{self.http_scheme}#{self.google_root}"
23
+ end
24
+ # @TODO May want to move this to AuthorSearch.
25
+ def self.author_search_url(author)
26
+ "#{self.http_scheme}#{self.google_root}/citations?view_op=search_authors&hl=en&mauthors=#{::CGI::escape("author:\"#{author}\"")}"
27
+ end
28
+ end
29
+ end
@@ -0,0 +1,32 @@
1
+ require 'enumerator'
2
module Google
  module Scholar
    # Enumerates the articles of every page held by a scraper, asking the
    # scraper to load further pages only when iteration actually reaches them.
    #
    # The scraper must respond to #documents, #has_more_pages? and
    # #load_next_page; every document must respond to #articles.
    class ArticleEnumerator
      include Enumerable

      # @param scraper [#documents, #has_more_pages?, #load_next_page]
      def initialize(scraper)
        @scraper = scraper
        @documents = scraper.documents
      end

      # Yields each article of each page, in page order.
      #
      # @yieldparam article [Object] one element of a document's #articles
      # @return [Enumerator] when called without a block
      # @return [self] when called with a block
      def each(&block)
        return enum_for(:each) unless block

        index = 0
        while (document = @documents[index])
          document.articles.each(&block)
          index += 1
          # Pages already loaded are consumed before asking for more.
          next if index < @documents.length
          break unless @scraper.has_more_pages?

          @scraper.load_next_page
          # Re-read the list in case the scraper replaced its array.
          @documents = @scraper.documents
        end
        self
      end
    end
  end
end
@@ -0,0 +1,27 @@
1
module Google
  module Scholar
    # Read-only view over one article row of an author profile page.
    #
    # Every accessor queries the wrapped node with a CSS selector on first
    # use and memoizes the result.
    class ArticleSummary
      # @param doc [#css] node for a single article row
      #   (AuthorsProfileDocument#articles passes ".cit-table tr.item" rows)
      def initialize(doc)
        @document = doc
      end

      # @return [String] text of the first link in the title column
      def title
        @title ||= @document.css("#col-title a:first").text
      end

      # @return [String] text of the first span in the title column
      def authors
        @authors ||= @document.css("#col-title span:first").text
      end

      # @return [String] text of the last span in the title column
      def publisher
        @publisher ||= @document.css("#col-title span:last").text
      end

      # @return [Integer] "cited by" count; 0 when the cell has no number
      def citations
        @citations ||= @document.css("#col-citedby a:first").text.to_i
      end

      # @return [Integer] year column as an integer; 0 when it is blank
      def year
        @year ||= @document.css("#col-year").text.to_i
      end

      # @return [String, nil] absolute URL of the article page, or nil when
      #   the row has no title link (this case used to raise NoMethodError)
      def full_article_url
        @full_article_url ||= begin
          link = @document.css("#col-title a:first").first
          href = link && link["href"]
          "#{Google::Scholar.google_url}#{href}" if href
        end
      end
    end
  end
end
@@ -0,0 +1,31 @@
1
+ require 'nokogiri'
2
module Google
  module Scholar
    # One author taken from an author search result page.
    #
    # Cheap attributes (name, id, citation_count, author_url) are read from
    # the search-result node given to the constructor. #full_profile,
    # #citations and #articles load the author's profile page through a
    # Scraper the first time they are needed.
    class Author
      # @param document [#css] node for a single search result
      #   (AuthorsDocument#authors passes ".g-unit" elements)
      def initialize(document)
        @summary_doc = document
      end

      # @return [String] link text of the result, trimmed, with embedded line
      #   breaks collapsed to a single space
      def name
        # The previous gsub('\n', '') used a single-quoted literal, i.e. a
        # backslash followed by "n", so real newlines were never touched.
        @name ||= @summary_doc.css("td:last a:first").text.strip.gsub(/\s*\n\s*/, " ")
      end

      # @return [Integer] last whitespace-separated token of the last text
      #   node in the final cell, converted with to_i
      def citation_count
        @citation_count ||= begin
          text_nodes = @summary_doc.css("td:last").children.select { |node| node.text? }
          text_nodes.last.text.split(" ").last.to_i
        end
      end

      # @return [String] absolute profile URL, requesting 100 rows per page
      def author_url
        @author_url ||= "#{Google::Scholar.google_url}#{@summary_doc.css("td:last a").first.attr("href")}&pagesize=100"
      end

      # @return [Object] first document a Scraper produces for #author_url
      def full_profile
        @full_profile ||= Google::Scholar::Scraper.new(author_url).documents.first
      end

      # @return [Integer] first "#stats td.cit-data" value on the profile page
      def citations
        full_profile.css("#stats td.cit-data").first.text.to_i
      end

      # @return [String, nil] value of the "user" query parameter in the
      #   profile link, or nil when the link has none
      def id
        # Stop at the next "&" so extra parameters are not captured, and
        # accept "user=" as the final parameter.
        @id ||= @summary_doc.css("td:last a:first").attr("href").to_s[/user=([^&]+)/, 1]
      end

      # @return [Google::Scholar::ArticleEnumerator] enumerator over the
      #   articles of the profile, seeded with the already loaded first page
      def articles
        @articles ||= Google::Scholar::ArticleEnumerator.new(Google::Scholar::Scraper.new(nil, full_profile))
      end
    end
  end
end
@@ -0,0 +1,32 @@
1
+ require 'enumerator'
2
module Google
  module Scholar
    # Enumerates the authors of every page held by a scraper, asking the
    # scraper to load further pages only when iteration actually reaches them.
    #
    # The scraper must respond to #documents, #has_more_pages? and
    # #load_next_page; every document must respond to #authors.
    class AuthorEnumerator
      include Enumerable

      # @param scraper [#documents, #has_more_pages?, #load_next_page]
      def initialize(scraper)
        @scraper = scraper
        @documents = scraper.documents
      end

      # Yields each author of each page, in page order.
      #
      # @yieldparam author [Object] one element of a document's #authors
      # @return [Enumerator] when called without a block
      # @return [self] when called with a block
      def each(&block)
        return enum_for(:each) unless block

        index = 0
        while (document = @documents[index])
          document.authors.each(&block)
          index += 1
          # Pages already loaded are consumed before asking for more.
          next if index < @documents.length
          break unless @scraper.has_more_pages?

          @scraper.load_next_page
          # Re-read the list in case the scraper replaced its array.
          @documents = @scraper.documents
        end
        self
      end
    end
  end
end
@@ -0,0 +1,10 @@
1
module Google
  module Scholar
    # Search whose results are authors. Construction and the @scraper
    # instance variable come from the Base superclass.
    class AuthorSearch < Google::Scholar::Base
      # @return [Google::Scholar::AuthorEnumerator, nil] memoized enumerator
      #   over the result authors, or nil when the first loaded document is
      #   not an AuthorsDocument
      def authors
        first_page = @scraper.documents.first
        if first_page.is_a?(Google::Scholar::AuthorsDocument)
          @authors ||= Google::Scholar::AuthorEnumerator.new(@scraper)
        end
      end
    end
  end
end
@@ -0,0 +1,15 @@
1
module Google
  module Scholar
    # Common base for searches: wraps a Scraper created for one URL.
    class Base
      # @return [Google::Scholar::Scraper] scraper created for this search
      attr_accessor :scraper

      # @param url [String] URL handed to Google::Scholar::Scraper.new
      def initialize(url)
        # No trailing `self`: the return value of initialize is ignored by .new.
        @scraper = Google::Scholar::Scraper.new(url)
      end

      # Starts an author search.
      #
      # @param author [#to_s] author name, passed to Google::Scholar.author_search_url
      # @return [Google::Scholar::AuthorSearch] search built for the resulting URL
      def self.search_author(author)
        Google::Scholar::AuthorSearch.new(Google::Scholar.author_search_url(author))
      end
    end
  end
end
@@ -0,0 +1,31 @@
1
module Google
  module Scholar
    # Thin wrapper around a parsed page. Any message the wrapper does not
    # define itself is forwarded to the wrapped document, so the wrapper can
    # be queried (e.g. with #css) like the document it holds.
    class Document
      # @return [Object] the wrapped document (Scraper.load_url passes the
      #   result of Nokogiri::HTML)
      attr_reader :document

      # @param nokogiri_document [#css] parsed page to wrap
      def initialize(nokogiri_document)
        @document = nokogiri_document
      end

      # Forwards unknown messages to the wrapped document.
      # @raise [NoMethodError] when the wrapped document does not respond either
      def method_missing(meth, *args, &block)
        return @document.send(meth, *args, &block) if @document.respond_to?(meth)

        super
      end

      # Keeps respond_to? and method() consistent with method_missing.
      def respond_to_missing?(meth, include_private = false)
        @document.respond_to?(meth, include_private) || super
      end

      # @return [Boolean] true when a pagination link containing "Next" exists
      def has_next_page?
        !find_next_page_link.nil?
      end

      # @return [String, nil] absolute URL of the next page, or nil when
      #   there is no "Next" link
      def next_page_url
        link = find_next_page_link
        link && "#{Google::Scholar.google_url}#{link.attr("href")}"
      end

      private

      # First pagination link whose content includes "Next", or nil.
      def find_next_page_link
        @document.css('.cit-dgb .cit-dark-link').find { |link| link.content.include?("Next") }
      end
    end
  end
end
@@ -0,0 +1,25 @@
1
+ require 'nokogiri'
2
+ require 'open-uri'
3
module Google
  module Scholar
    # Document for an author search result page. #content and #css are not
    # defined here; they are reached through the Document superclass.
    class AuthorsDocument < Document
      # @return [Boolean] true when the page text mentions "authors" and does
      #   not contain the "didn't match any" notice (case-insensitive)
      def valid?
        mentions_authors = !self.content.downcase.index("authors").nil?
        reports_no_match = !self.content.downcase.index("didn't match any").nil?
        mentions_authors && !reports_no_match
      end

      # @return [Integer] number of ".g-unit" elements on the page
      def authors_count
        self.css('.g-unit').length
      end

      # Authors found on the page, one per ".g-unit" element.
      #
      # @param force [Boolean] rebuild the list even when it is already cached
      # @return [Array<Google::Scholar::Author>]
      def authors(force=false)
        if @authors.nil? || @authors == false || force
          @authors = []
          self.css('.g-unit').each do |unit|
            @authors.push(Google::Scholar::Author.new(unit))
          end
        end
        @authors
      end

      # @return [Google::Scholar::Author, nil] last author on the page
      def last_author
        authors.last
      end
    end
  end
end
@@ -0,0 +1,14 @@
1
+ require 'nokogiri'
2
+ require 'open-uri'
3
module Google
  module Scholar
    # Document for an author profile page. #css is reached through the
    # Document superclass.
    class AuthorsProfileDocument < Document
      # Article summaries on the page, one per ".cit-table tr.item" row.
      #
      # @param force [Boolean] rebuild the list even when it is already cached
      # @return [Array<Google::Scholar::ArticleSummary>]
      def articles(force=false)
        if !@citations || force
          @citations = []
          self.css(".cit-table tr.item").each do |item_row|
            @citations.push(Google::Scholar::ArticleSummary.new(item_row))
          end
        end
        @citations
      end
    end
  end
end
@@ -0,0 +1,44 @@
1
+ require 'nokogiri'
2
+ require 'open-uri'
3
module Google
  module Scholar
    # Loads pages and keeps them as an ordered list of Document objects,
    # following "next page" links on request.
    class Scraper
      # @return [Array] documents loaded so far, in load order
      attr_accessor :documents

      # @param url [String, nil] page to load immediately; skipped when nil
      # @param initial_document [Object, nil] already loaded document placed
      #   first in the list
      def initialize(url, initial_document = nil)
        @documents = []
        @documents << initial_document if initial_document
        @documents << self.class.load_url(url) if url
      end

      # Picks the Document class for a URL from its query parameters.
      #
      # @param url [String]
      # @return [Class] AuthorsDocument for "view_op=search_authors",
      #   AuthorsProfileDocument when a parameter contains "user=",
      #   otherwise Document
      def self.class_lookup(url = "")
        arguments = url.split("?")
        # Without a "?" the single unsplit piece is inspected as-is.
        arguments = arguments[1].split("&") if arguments.length > 1

        return Google::Scholar::AuthorsDocument if arguments.include?("view_op=search_authors")
        return Google::Scholar::AuthorsProfileDocument if arguments.any? { |argument| argument.include?("user=") }

        Google::Scholar::Document
      end

      # Fetches and parses a page.
      #
      # @param url [String] absolute http or https URL
      # @return [Google::Scholar::Document] instance of the class chosen by
      #   class_lookup, wrapping the parsed page
      # @raise [RuntimeError] when the URL scheme is not http or https
      def self.load_url(url)
        uri = URI(url)
        raise "Invalid scheme for #{url}" unless %w[http https].include?(uri.scheme)

        # URI#open (from open-uri) is used because Kernel#open no longer
        # opens URLs on Ruby 3.0 and later.
        class_lookup(url).new(Nokogiri::HTML(uri.open))
      end

      # @return [Boolean] true when every loaded document reports valid?
      def valid?
        @documents.all? { |document| document.valid? }
      end

      # Loads the page after the last loaded one and appends it.
      #
      # @return [Array, nil] the document list, or nil when there is no next page
      def load_next_page
        return unless has_more_pages?

        @documents << self.class.load_url(@documents.last.next_page_url)
      end

      # @return [Boolean] whether the last loaded document has a next page
      def has_more_pages?
        @documents.last.has_next_page?
      end
    end
  end
end