gentle-scholar 1.0.2 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 9a16b27e1ce933a215cfc7d0e8ba1763f0c24aec
4
- data.tar.gz: 778624c2dcc3146444938c40869775192ab65edc
3
+ metadata.gz: aa9d57fcccb8fd1fea02fe1a2651942dcbbf3940
4
+ data.tar.gz: 0f9d9a785ff33dba50e9418f9d892e3991e09e15
5
5
  SHA512:
6
- metadata.gz: d1f88ec34776d3abadca4c53ba613151c12c0b77df0dd993aef403ec72a7aba3aa27485728e543f0cfa0a308c6c2b6674d7761394beea3543c299f37b79eb2d7
7
- data.tar.gz: 462726a60bf05d720f9503390f4e3ebed7cdbc55d825631275a40ccb7aed4d2b65f8ea8aa2afd9e6434f261e765b428af1331243ed609f0d60e3aa8ddacbcc95
6
+ metadata.gz: 814517c4c9ed75066116a33106013645f14a0d67e964f89b92db1ae8bfebbca4b5a82822d349c97ff9081f3093dbdfd5946db2dfee90f9de0209588a8a52b8bd
7
+ data.tar.gz: 38af181ecc696bc79056c638ac96337806c363e3178c0fa30f36ae2f9e43161156975fedc42fa388c87ec55b94dda259fc8c61e707954b1209fed23cbf0c5842
@@ -0,0 +1,2 @@
1
+ *.gem
2
+ Gemfile.lock
@@ -0,0 +1,9 @@
1
+ language: ruby
2
+ rvm:
3
+ - ruby-head
4
+ - 2.1.0
5
+ - 1.9.3
6
+ - jruby-head
7
+ - jruby-19mode
8
+ only:
9
+ - master
data/Gemfile ADDED
@@ -0,0 +1,8 @@
1
+ source 'https://rubygems.org'
2
+
3
+ gem 'pry' # allows us to use pry for bundle console
4
+ gemspec
5
+
6
+ group :test do
7
+ gem 'rake'
8
+ end
@@ -0,0 +1,42 @@
1
+ # gentle-scholar gem
2
+ [![Gem Version](https://badge.fury.io/rb/gentle-scholar.svg)](http://badge.fury.io/rb/gentle-scholar)
3
+ [![Build Status](https://travis-ci.org/soumyaray/gentle-scholar.svg?branch=v1.0.1)](https://travis-ci.org/soumyaray/gentle-scholar)
4
+
5
+ Gem to extract Google Scholar publication details given the URL of a publication
6
+
7
+ ## About:
8
+ **NOTE:** This gem is not meant to search or crawl through Google Scholar.
9
+ This also will not extract bibtex (please see the excellent
10
+ [gscholar](https://rubygems.org/gems/gscholar) gem for that).
11
+
12
+ This gem is for publishing academics to extract their own publication
13
+ information from Google Scholar. It can retrieve standard citation information
14
+ (authors, journal, title, etc.). Additionally, it can retrieve number of
15
+ citations reported by Google Scholar, link to citations page, chart of
16
+ citations over time, and link to author's Google Scholar profile.
17
+
18
+ ## Usage:
19
+
20
+ Given a Google Scholar article such as:
21
+
22
+ http://scholar.google.com.tw/citations?view_op=view_citation&hl=en&user=6WjiSOwAAAAJ&citation_for_view=6WjiSOwAAAAJ:9yKSN-GCB0IC
23
+
24
+ Retrieve information by copying the author and article ID from the URL:
25
+
26
+ sec_paper = GScholarPub.new('6WjiSOwAAAAJ:u5HHmVD_uO8C')
27
+
28
+ This returns:
29
+
30
+ #<GScholarPub:0x007fb0bc18df70
31
+ @article_url="http://pubsonline.informs.org/doi/abs/10.1287/isre.1100.0340",
32
+ @authors=[["Soumya", "Ray"], ["Sung", "S", "Kim"], ["James", "G", "Morris"]],
33
+ @chart_url="http://www.google.com/chart?chs=475x90&cht=bvs&chf=bg,s,e8f4f7&chco=1111cc&chbh=r,2.0,0.0&chxt=x,y&chxr=1,0,5,5&chd=t:100.0,80.0,100.0
34
+ @cites=14,
35
+ @cites_url="http://scholar.google.com/scholar?oi=bibs&hl=en&oe=ASCII&cites=11777343089817755068",
36
+ @description="The highly competitive and rapidly changing market for online services is becoming increasingly effective at locking users in through
37
+ @issue="1",
38
+ @journal="Information Systems Research",
39
+ @pages="197-213",
40
+ @title="Research Note\u0097Online Users' Switching Costs: Their Nature and Formation",
41
+ @volume="23"
42
+ >
@@ -0,0 +1,7 @@
1
+ require 'rake/testtask'
2
+
3
+ task :default => [:spec]
4
+
5
+ Rake::TestTask.new(name=:spec) do |t|
6
+ t.pattern = "spec/*_spec.rb"
7
+ end
@@ -0,0 +1,10 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'gentle-scholar'
4
+
5
+ pub = GentleScholar::Publication.get_from_http('6WjiSOwAAAAJ:9yKSN-GCB0IC')
6
+
7
+ puts 'Publication Details'
8
+ pub.each do |field, data|
9
+ puts "#{field}: #{pub[field]}"
10
+ end
@@ -0,0 +1,20 @@
1
+ $:.push File.expand_path("../lib", __FILE__)
2
+ require 'gentle-scholar/version'
3
+
4
+ Gem::Specification.new do |s|
5
+ s.name = 'gentle-scholar'
6
+ s.version = GentleScholar::VERSION
7
+ s.add_development_dependency 'minitest'
8
+ s.add_development_dependency 'minitest-rg'
9
+ s.add_runtime_dependency 'nokogiri', '>= 1.6.2'
10
+ s.add_runtime_dependency 'typhoeus', '>= 0.6.8'
11
+ s.date = '2014-05-27'
12
+ s.summary = 'Google Scholar infor extractor'
13
+ s.description = 'Extract author/paper info from Google Scholar'
14
+ s.authors = ['Soumya Ray']
15
+ s.email = 'soumya.ray@gmail.com'
16
+ s.files = `git ls-files`.split("\n")
17
+ s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
18
+ s.homepage = 'https://github.com/soumyaray/gentle-scholar'
19
+ s.license = 'MIT'
20
+ end
@@ -0,0 +1 @@
1
+ require 'gentle-scholar/publication.rb'
@@ -0,0 +1,80 @@
1
+ # Author:: Soumya Ray (mailto: soumya.ray@gmail.com)
2
+ # License:: MIT
3
+ require 'typhoeus'
4
+ require 'nokogiri'
5
+ require 'date'
6
+
7
+ module GentleScholar
8
+ # This class loads a single publication from Google scholar and returns
9
+ # all its attributes, including dynamic attributes like number of citations
10
+ class Publication
11
+ GS_HOST_URL = 'http://scholar.google.com'
12
+ GS_CIT_URL = "#{GS_HOST_URL}/citations?view_op=view_citation&hl=en"
13
+
14
+ SCAN_STR = {
15
+ gscholar_url: "//div[contains(@class,'g-section cit-dgb')]"\
16
+ "/div/table/tr/td/a",
17
+ cites: "//div[contains(@id,'scholar_sec')]/div/a",
18
+ cites_url: "//div[contains(@id,'scholar_sec')]/div/a",
19
+ title: '//div[@id="title"]/a',
20
+ article_url: '//div[@id="title"]/a',
21
+ chart_url: '//div[contains(@class,"cit-dd")]/img'
22
+ }
23
+
24
+ SCAN_LAMBDAS = {
25
+ cites: ->(x) { x.text[/\d+/].to_i },
26
+ cites_url: ->(x) { x[0].attributes['href'].value },
27
+ title: ->(x) { x.text },
28
+ article_url: ->(x) { x.attr('href').value },
29
+ chart_url: ->(x) { x.attr('src').value },
30
+ gscholar_url: ->(x) { GS_HOST_URL + x.attr('href').value }
31
+ }
32
+
33
+ TABLE_ATTR = {
34
+ authors: 'Authors',
35
+ date: 'Publication date',
36
+ journal: 'Journal name',
37
+ volume: 'Volume',
38
+ issue: 'Issue',
39
+ pages: 'Pages',
40
+ publisher: 'Publisher',
41
+ description: 'Description'
42
+ }
43
+
44
+ TABLE_LAMBDAS = {
45
+ authors: ->(x) { x.split(/,/).map { |a| a.split(' ') } },
46
+ date: ->(x) { Date.strptime(x, '%Y/%m/%d') }
47
+ }
48
+
49
+ def self.get_from_http(scholar_pub_id)
50
+ auth_id, pub_id = scholar_pub_id.split(/:/)
51
+ url = GS_CIT_URL + '&user=' + auth_id \
52
+ + '&citation_for_view=' + auth_id + ':' + pub_id
53
+ res = Typhoeus::Request.new(url).run
54
+ doc = Nokogiri::HTML(res.response_body)
55
+
56
+ extract_html_elements(doc).merge(extract_html_table(doc))
57
+ end
58
+
59
+ def self.extract_html_elements(doc)
60
+ xpath = Hash[SCAN_STR.map { |elem, path| [elem, doc.xpath(path)] }]
61
+ elements = SCAN_LAMBDAS.map do |key, lam|
62
+ if xpath[key].any? then [key, lam.call(xpath[key])] end
63
+ end
64
+
65
+ Hash[elements.compact]
66
+ end
67
+
68
+ def self.extract_html_table(doc)
69
+ # lambda gets text from right html column given name in left column
70
+ table_extract = lambda do |name|
71
+ doc.xpath("//div[starts-with(.,'#{name}')]")[0].children[1].text
72
+ end
73
+
74
+ elements = Hash[TABLE_ATTR.map { |k, v| [k, table_extract.call(v)] }]
75
+ elements.merge(
76
+ Hash[TABLE_LAMBDAS.map { |key, lam| [key, lam.call(elements[key])] }]
77
+ )
78
+ end
79
+ end
80
+ end
@@ -0,0 +1,3 @@
1
+ module GentleScholar
2
+ VERSION = '2.0.0'
3
+ end
@@ -0,0 +1,3 @@
1
+ require './lib/gentle-scholar.rb'
2
+
3
+ SEC_PAPER = GentleScholar::Publication.get_from_http('6WjiSOwAAAAJ:u5HHmVD_uO8C')
@@ -0,0 +1,76 @@
1
+ require 'minitest/autorun'
2
+ require 'minitest/rg'
3
+ require './spec/minitest_helper.rb'
4
+
5
+ describe 'Publication', 'A single publication listing' do
6
+
7
+ # let (:@sec_paper) do
8
+ # GScholarPub.new('6WjiSOwAAAAJ:u5HHmVD_uO8C')
9
+ # end
10
+
11
+ describe 'when it is a paper' do
12
+ #
13
+ # before do
14
+ # @sec_paper = GScholarPub.new('6WjiSOwAAAAJ:u5HHmVD_uO8C')
15
+ # end
16
+
17
+ before do
18
+ @sec_paper = SEC_PAPER
19
+ end
20
+
21
+ it 'has the right title' do
22
+ @sec_paper[:title].must_equal 'Security Assurance: How Online Service '\
23
+ 'Providers Can Influence Security Control Perceptions and Gain Trust'
24
+ end
25
+
26
+ it 'has some number of cites' do
27
+ @sec_paper[:cites].must_be :>, 0
28
+ end
29
+
30
+ it 'has some url for listing cites' do
31
+ @sec_paper[:cites_url].must_match /http:\/\/.*/
32
+ end
33
+
34
+ it 'has some url for citations chart' do
35
+ @sec_paper[:chart_url].must_match /http:\/\/.*/
36
+ end
37
+
38
+ it 'has some url for the pulished article' do
39
+ @sec_paper[:article_url].must_match /http:\/\/.*/
40
+ end
41
+
42
+ it 'has the right author(s) (as nested array)' do
43
+ @sec_paper[:authors].must_equal [['Soumya', 'Ray'], ['Terence', 'Ow'], ['Sung', 'S', 'Kim']]
44
+ end
45
+
46
+ it 'has a publication date' do
47
+ @sec_paper[:date].must_be_instance_of Date
48
+ end
49
+
50
+ it 'has the right journal\'s name' do
51
+ @sec_paper[:journal].must_equal 'Decision Sciences'
52
+ end
53
+
54
+ it 'has the right volume number (as string)' do
55
+ @sec_paper[:volume].must_equal '42'
56
+ end
57
+
58
+ it 'has the right issue number (as string)' do
59
+ @sec_paper[:issue].must_equal '2'
60
+ end
61
+
62
+ it 'has the right page numbers' do
63
+ @sec_paper[:pages].must_equal '391-412'
64
+ end
65
+
66
+ it 'has the right publisher' do
67
+ @sec_paper[:publisher].must_equal 'Blackwell Publishing Inc'
68
+ end
69
+
70
+ it 'has some url for the main citations apage' do
71
+ @sec_paper[:gscholar_url].must_match /citations/
72
+ end
73
+
74
+ end
75
+
76
+ end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: gentle-scholar
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.2
4
+ version: 2.0.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Soumya Ray
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-05-25 00:00:00.000000000 Z
11
+ date: 2014-05-27 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: minitest
@@ -72,7 +72,18 @@ executables: []
72
72
  extensions: []
73
73
  extra_rdoc_files: []
74
74
  files:
75
- - lib/publication.rb
75
+ - ".gitignore"
76
+ - ".travis.yml"
77
+ - Gemfile
78
+ - README.md
79
+ - Rakefile
80
+ - bin/gentle
81
+ - gentle-scholar.gemspec
82
+ - lib/gentle-scholar.rb
83
+ - lib/gentle-scholar/publication.rb
84
+ - lib/gentle-scholar/version.rb
85
+ - spec/minitest_helper.rb
86
+ - spec/publication_spec.rb
76
87
  homepage: https://github.com/soumyaray/gentle-scholar
77
88
  licenses:
78
89
  - MIT
@@ -97,4 +108,6 @@ rubygems_version: 2.1.11
97
108
  signing_key:
98
109
  specification_version: 4
99
110
  summary: Google Scholar infor extractor
100
- test_files: []
111
+ test_files:
112
+ - spec/minitest_helper.rb
113
+ - spec/publication_spec.rb
@@ -1,78 +0,0 @@
1
- # Author:: Soumya Ray (mailto: soumya.ray@gmail.com)
2
- # License:: MIT
3
-
4
- require 'typhoeus'
5
- require 'nokogiri'
6
- require 'date'
7
-
8
- # This class loads a single publication from Google scholar and returns
9
- # all its attributes, including dynamic attributes like number of citations
10
- class GScholarPub
11
- GSCHOLAR_HOST_URL = 'http://scholar.google.com'
12
- GSCHOLAR_CIT_URL =
13
- 'http://scholar.google.com/citations?view_op=view_citation&hl=en'
14
-
15
- attr_reader :title, :cites, :cites_url, :chart_url, :article_url
16
- attr_reader :authors, :date, :journal, :volume, :issue, :pages, :publisher
17
- attr_reader :description, :gscholar_url
18
- # TODO: @doc only for development, testing modes
19
- attr_reader :doc
20
-
21
- SCAN_STR = {
22
- gscholar_url:
23
- "//div[contains(@class,'g-section cit-dgb')]/div/table/tr/td/a",
24
- cites: "//div[contains(@id,'scholar_sec')]/div/a",
25
- title: '//div[@id="title"]/a',
26
- article_url: '//div[@id="title"]/a',
27
- chart_url: '//div[contains(@class,"cit-dd")]/img'
28
- }
29
-
30
- TABLE_ATTR = {
31
- authors: 'Authors',
32
- date: 'Publication date',
33
- journal: 'Journal name',
34
- volume: 'Volume',
35
- issue: 'Issue',
36
- pages: 'Pages',
37
- publisher: 'Publisher',
38
- description: 'Description'
39
- }
40
-
41
- def initialize(scholar_pub_id)
42
- auth_id, pub_id = scholar_pub_id.split(/:/)
43
- url = GSCHOLAR_CIT_URL + '&user=' + auth_id \
44
- + '&citation_for_view=' + auth_id + ':' + pub_id
45
- res = Typhoeus::Request.new(url).run
46
- @doc = Nokogiri::HTML(res.response_body)
47
-
48
- extract_html_elements
49
- extract_html_table
50
- end
51
-
52
- def extract_html_elements
53
- @cites = @doc.xpath(SCAN_STR[:cites]).text[/\d+/].to_i
54
- @cites_url = @doc.xpath(SCAN_STR[:cites])[0].attributes['href'].value
55
-
56
- @title = @doc.xpath(SCAN_STR[:title]).text
57
- @article_url = @doc.xpath(SCAN_STR[:article_url]).attr('href').value
58
-
59
- @chart_url = @doc.xpath(SCAN_STR[:chart_url]).attr('src').value
60
-
61
- @gscholar_url = GSCHOLAR_HOST_URL + @doc.xpath(
62
- SCAN_STR[:gscholar_url]).attr('href').value
63
- end
64
-
65
- def extract_html_table
66
- # lambda gets text from right html column given name in left column
67
- table_pick = lambda do |name|
68
- @doc.xpath("//div[starts-with(.,'#{name}')]")[0].children[1].text
69
- end
70
-
71
- TABLE_ATTR.each do |k, v|
72
- instance_variable_set("@#{k}", table_pick.call(v))
73
- end
74
-
75
- @authors = @authors.split(/,/).map { |a| a.split(' ') }
76
- @date = Date.strptime(@date, '%Y/%m/%d')
77
- end
78
- end