gentle-scholar 1.0.2 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +2 -0
- data/.travis.yml +9 -0
- data/Gemfile +8 -0
- data/README.md +42 -0
- data/Rakefile +7 -0
- data/bin/gentle +10 -0
- data/gentle-scholar.gemspec +20 -0
- data/lib/gentle-scholar.rb +1 -0
- data/lib/gentle-scholar/publication.rb +80 -0
- data/lib/gentle-scholar/version.rb +3 -0
- data/spec/minitest_helper.rb +3 -0
- data/spec/publication_spec.rb +76 -0
- metadata +17 -4
- data/lib/publication.rb +0 -78
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: aa9d57fcccb8fd1fea02fe1a2651942dcbbf3940
|
4
|
+
data.tar.gz: 0f9d9a785ff33dba50e9418f9d892e3991e09e15
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 814517c4c9ed75066116a33106013645f14a0d67e964f89b92db1ae8bfebbca4b5a82822d349c97ff9081f3093dbdfd5946db2dfee90f9de0209588a8a52b8bd
|
7
|
+
data.tar.gz: 38af181ecc696bc79056c638ac96337806c363e3178c0fa30f36ae2f9e43161156975fedc42fa388c87ec55b94dda259fc8c61e707954b1209fed23cbf0c5842
|
data/.gitignore
ADDED
data/.travis.yml
ADDED
data/Gemfile
ADDED
data/README.md
ADDED
@@ -0,0 +1,42 @@
|
|
1
|
+
# gentle-scholar gem
|
2
|
+
[Gem Version](http://badge.fury.io/rb/gentle-scholar)
|
3
|
+
[Build Status](https://travis-ci.org/soumyaray/gentle-scholar)
|
4
|
+
|
5
|
+
Gem to extract a Google Scholar publication's details given the URL of that publication
|
6
|
+
|
7
|
+
## About:
|
8
|
+
**NOTE:** This gem is not meant to search or crawl through Google Scholar.
|
9
|
+
This also will not extract bibtex (please see the excellent
|
10
|
+
[gscholar](https://rubygems.org/gems/gscholar) gem for that).
|
11
|
+
|
12
|
+
This gem is for publishing academics to extract their own publication
|
13
|
+
information from Google Scholar. It can retrieve standard citation information
|
14
|
+
(authors, journal, title, etc.). Additionally, it can retrieve number of
|
15
|
+
citations reported by Google Scholar, link to citations page, chart of
|
16
|
+
citations over time, and link to author's Google Scholar profile.
|
17
|
+
|
18
|
+
## Usage:
|
19
|
+
|
20
|
+
Given a Google Scholar article such as:
|
21
|
+
|
22
|
+
http://scholar.google.com.tw/citations?view_op=view_citation&hl=en&user=6WjiSOwAAAAJ&citation_for_view=6WjiSOwAAAAJ:9yKSN-GCB0IC
|
23
|
+
|
24
|
+
Retrieve information by copying the author and article ID from the URL:
|
25
|
+
|
26
|
+
sec_paper = GScholarPub.new('6WjiSOwAAAAJ:u5HHmVD_uO8C')
|
27
|
+
|
28
|
+
This returns:
|
29
|
+
|
30
|
+
#<GScholarPub:0x007fb0bc18df70
|
31
|
+
@article_url="http://pubsonline.informs.org/doi/abs/10.1287/isre.1100.0340",
|
32
|
+
@authors=[["Soumya", "Ray"], ["Sung", "S", "Kim"], ["James", "G", "Morris"]],
|
33
|
+
@chart_url="http://www.google.com/chart?chs=475x90&cht=bvs&chf=bg,s,e8f4f7&chco=1111cc&chbh=r,2.0,0.0&chxt=x,y&chxr=1,0,5,5&chd=t:100.0,80.0,100.0
|
34
|
+
@cites=14,
|
35
|
+
@cites_url="http://scholar.google.com/scholar?oi=bibs&hl=en&oe=ASCII&cites=11777343089817755068",
|
36
|
+
@description="The highly competitive and rapidly changing market for online services is becoming increasingly effective at locking users in through
|
37
|
+
@issue="1",
|
38
|
+
@journal="Information Systems Research",
|
39
|
+
@pages="197-213",
|
40
|
+
@title="Research Note\u0097Online Users' Switching Costs: Their Nature and Formation",
|
41
|
+
@volume="23"
|
42
|
+
>
|
data/Rakefile
ADDED
data/bin/gentle
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
$:.push File.expand_path("../lib", __FILE__)
|
2
|
+
require 'gentle-scholar/version'
|
3
|
+
|
4
|
+
Gem::Specification.new do |s|
|
5
|
+
s.name = 'gentle-scholar'
|
6
|
+
s.version = GentleScholar::VERSION
|
7
|
+
s.add_development_dependency 'minitest'
|
8
|
+
s.add_development_dependency 'minitest-rg'
|
9
|
+
s.add_runtime_dependency 'nokogiri', '>= 1.6.2'
|
10
|
+
s.add_runtime_dependency 'typhoeus', '>= 0.6.8'
|
11
|
+
s.date = '2014-05-27'
|
12
|
+
s.summary = 'Google Scholar infor extractor'
|
13
|
+
s.description = 'Extract author/paper info from Google Scholar'
|
14
|
+
s.authors = ['Soumya Ray']
|
15
|
+
s.email = 'soumya.ray@gmail.com'
|
16
|
+
s.files = `git ls-files`.split("\n")
|
17
|
+
s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
|
18
|
+
s.homepage = 'https://github.com/soumyaray/gentle-scholar'
|
19
|
+
s.license = 'MIT'
|
20
|
+
end
|
@@ -0,0 +1 @@
|
|
1
|
+
require 'gentle-scholar/publication.rb'
|
@@ -0,0 +1,80 @@
|
|
1
|
+
# Author:: Soumya Ray (mailto: soumya.ray@gmail.com)
|
2
|
+
# License:: MIT
|
3
|
+
require 'typhoeus'
|
4
|
+
require 'nokogiri'
|
5
|
+
require 'date'
|
6
|
+
|
7
|
+
module GentleScholar
|
8
|
+
# This class loads a single publication from Google scholar and returns
|
9
|
+
# all its attributes, including dynamic attributes like number of citations
|
10
|
+
class Publication
|
11
|
+
GS_HOST_URL = 'http://scholar.google.com'
|
12
|
+
GS_CIT_URL = "#{GS_HOST_URL}/citations?view_op=view_citation&hl=en"
|
13
|
+
|
14
|
+
SCAN_STR = {
|
15
|
+
gscholar_url: "//div[contains(@class,'g-section cit-dgb')]"\
|
16
|
+
"/div/table/tr/td/a",
|
17
|
+
cites: "//div[contains(@id,'scholar_sec')]/div/a",
|
18
|
+
cites_url: "//div[contains(@id,'scholar_sec')]/div/a",
|
19
|
+
title: '//div[@id="title"]/a',
|
20
|
+
article_url: '//div[@id="title"]/a',
|
21
|
+
chart_url: '//div[contains(@class,"cit-dd")]/img'
|
22
|
+
}
|
23
|
+
|
24
|
+
SCAN_LAMBDAS = {
|
25
|
+
cites: ->(x) { x.text[/\d+/].to_i },
|
26
|
+
cites_url: ->(x) { x[0].attributes['href'].value },
|
27
|
+
title: ->(x) { x.text },
|
28
|
+
article_url: ->(x) { x.attr('href').value },
|
29
|
+
chart_url: ->(x) { x.attr('src').value },
|
30
|
+
gscholar_url: ->(x) { GS_HOST_URL + x.attr('href').value }
|
31
|
+
}
|
32
|
+
|
33
|
+
TABLE_ATTR = {
|
34
|
+
authors: 'Authors',
|
35
|
+
date: 'Publication date',
|
36
|
+
journal: 'Journal name',
|
37
|
+
volume: 'Volume',
|
38
|
+
issue: 'Issue',
|
39
|
+
pages: 'Pages',
|
40
|
+
publisher: 'Publisher',
|
41
|
+
description: 'Description'
|
42
|
+
}
|
43
|
+
|
44
|
+
TABLE_LAMBDAS = {
|
45
|
+
authors: ->(x) { x.split(/,/).map { |a| a.split(' ') } },
|
46
|
+
date: ->(x) { Date.strptime(x, '%Y/%m/%d') }
|
47
|
+
}
|
48
|
+
|
49
|
+
def self.get_from_http(scholar_pub_id)
|
50
|
+
auth_id, pub_id = scholar_pub_id.split(/:/)
|
51
|
+
url = GS_CIT_URL + '&user=' + auth_id \
|
52
|
+
+ '&citation_for_view=' + auth_id + ':' + pub_id
|
53
|
+
res = Typhoeus::Request.new(url).run
|
54
|
+
doc = Nokogiri::HTML(res.response_body)
|
55
|
+
|
56
|
+
extract_html_elements(doc).merge(extract_html_table(doc))
|
57
|
+
end
|
58
|
+
|
59
|
+
def self.extract_html_elements(doc)
|
60
|
+
xpath = Hash[SCAN_STR.map { |elem, path| [elem, doc.xpath(path)] }]
|
61
|
+
elements = SCAN_LAMBDAS.map do |key, lam|
|
62
|
+
if xpath[key].any? then [key, lam.call(xpath[key])] end
|
63
|
+
end
|
64
|
+
|
65
|
+
Hash[elements.compact]
|
66
|
+
end
|
67
|
+
|
68
|
+
def self.extract_html_table(doc)
|
69
|
+
# lambda gets text from right html column given name in left column
|
70
|
+
table_extract = lambda do |name|
|
71
|
+
doc.xpath("//div[starts-with(.,'#{name}')]")[0].children[1].text
|
72
|
+
end
|
73
|
+
|
74
|
+
elements = Hash[TABLE_ATTR.map { |k, v| [k, table_extract.call(v)] }]
|
75
|
+
elements.merge(
|
76
|
+
Hash[TABLE_LAMBDAS.map { |key, lam| [key, lam.call(elements[key])] }]
|
77
|
+
)
|
78
|
+
end
|
79
|
+
end
|
80
|
+
end
|
@@ -0,0 +1,76 @@
|
|
1
|
+
require 'minitest/autorun'
|
2
|
+
require 'minitest/rg'
|
3
|
+
require './spec/minitest_helper.rb'
|
4
|
+
|
5
|
+
describe 'Publication', 'A single publication listing' do
|
6
|
+
|
7
|
+
# let (:@sec_paper) do
|
8
|
+
# GScholarPub.new('6WjiSOwAAAAJ:u5HHmVD_uO8C')
|
9
|
+
# end
|
10
|
+
|
11
|
+
describe 'when it is a paper' do
|
12
|
+
#
|
13
|
+
# before do
|
14
|
+
# @sec_paper = GScholarPub.new('6WjiSOwAAAAJ:u5HHmVD_uO8C')
|
15
|
+
# end
|
16
|
+
|
17
|
+
before do
|
18
|
+
@sec_paper = SEC_PAPER
|
19
|
+
end
|
20
|
+
|
21
|
+
it 'has the right title' do
|
22
|
+
@sec_paper[:title].must_equal 'Security Assurance: How Online Service '\
|
23
|
+
'Providers Can Influence Security Control Perceptions and Gain Trust'
|
24
|
+
end
|
25
|
+
|
26
|
+
it 'has some number of cites' do
|
27
|
+
@sec_paper[:cites].must_be :>, 0
|
28
|
+
end
|
29
|
+
|
30
|
+
it 'has some url for listing cites' do
|
31
|
+
@sec_paper[:cites_url].must_match /http:\/\/.*/
|
32
|
+
end
|
33
|
+
|
34
|
+
it 'has some url for citations chart' do
|
35
|
+
@sec_paper[:chart_url].must_match /http:\/\/.*/
|
36
|
+
end
|
37
|
+
|
38
|
+
it 'has some url for the pulished article' do
|
39
|
+
@sec_paper[:article_url].must_match /http:\/\/.*/
|
40
|
+
end
|
41
|
+
|
42
|
+
it 'has the right author(s) (as nested array)' do
|
43
|
+
@sec_paper[:authors].must_equal [['Soumya', 'Ray'], ['Terence', 'Ow'], ['Sung', 'S', 'Kim']]
|
44
|
+
end
|
45
|
+
|
46
|
+
it 'has a publication date' do
|
47
|
+
@sec_paper[:date].must_be_instance_of Date
|
48
|
+
end
|
49
|
+
|
50
|
+
it 'has the right journal\'s name' do
|
51
|
+
@sec_paper[:journal].must_equal 'Decision Sciences'
|
52
|
+
end
|
53
|
+
|
54
|
+
it 'has the right volume number (as string)' do
|
55
|
+
@sec_paper[:volume].must_equal '42'
|
56
|
+
end
|
57
|
+
|
58
|
+
it 'has the right issue number (as string)' do
|
59
|
+
@sec_paper[:issue].must_equal '2'
|
60
|
+
end
|
61
|
+
|
62
|
+
it 'has the right page numbers' do
|
63
|
+
@sec_paper[:pages].must_equal '391-412'
|
64
|
+
end
|
65
|
+
|
66
|
+
it 'has the right publisher' do
|
67
|
+
@sec_paper[:publisher].must_equal 'Blackwell Publishing Inc'
|
68
|
+
end
|
69
|
+
|
70
|
+
it 'has some url for the main citations apage' do
|
71
|
+
@sec_paper[:gscholar_url].must_match /citations/
|
72
|
+
end
|
73
|
+
|
74
|
+
end
|
75
|
+
|
76
|
+
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: gentle-scholar
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version:
|
4
|
+
version: 2.0.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Soumya Ray
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-05-
|
11
|
+
date: 2014-05-27 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: minitest
|
@@ -72,7 +72,18 @@ executables: []
|
|
72
72
|
extensions: []
|
73
73
|
extra_rdoc_files: []
|
74
74
|
files:
|
75
|
-
-
|
75
|
+
- ".gitignore"
|
76
|
+
- ".travis.yml"
|
77
|
+
- Gemfile
|
78
|
+
- README.md
|
79
|
+
- Rakefile
|
80
|
+
- bin/gentle
|
81
|
+
- gentle-scholar.gemspec
|
82
|
+
- lib/gentle-scholar.rb
|
83
|
+
- lib/gentle-scholar/publication.rb
|
84
|
+
- lib/gentle-scholar/version.rb
|
85
|
+
- spec/minitest_helper.rb
|
86
|
+
- spec/publication_spec.rb
|
76
87
|
homepage: https://github.com/soumyaray/gentle-scholar
|
77
88
|
licenses:
|
78
89
|
- MIT
|
@@ -97,4 +108,6 @@ rubygems_version: 2.1.11
|
|
97
108
|
signing_key:
|
98
109
|
specification_version: 4
|
99
110
|
summary: Google Scholar infor extractor
|
100
|
-
test_files:
|
111
|
+
test_files:
|
112
|
+
- spec/minitest_helper.rb
|
113
|
+
- spec/publication_spec.rb
|
data/lib/publication.rb
DELETED
@@ -1,78 +0,0 @@
|
|
1
|
-
# Author:: Soumya Ray (mailto: soumya.ray@gmail.com)
|
2
|
-
# License:: MIT
|
3
|
-
|
4
|
-
require 'typhoeus'
|
5
|
-
require 'nokogiri'
|
6
|
-
require 'date'
|
7
|
-
|
8
|
-
# This class loads a single publication from Google scholar and returns
|
9
|
-
# all its attributes, including dynamic attributes like number of citations
|
10
|
-
class GScholarPub
|
11
|
-
GSCHOLAR_HOST_URL = 'http://scholar.google.com'
|
12
|
-
GSCHOLAR_CIT_URL =
|
13
|
-
'http://scholar.google.com/citations?view_op=view_citation&hl=en'
|
14
|
-
|
15
|
-
attr_reader :title, :cites, :cites_url, :chart_url, :article_url
|
16
|
-
attr_reader :authors, :date, :journal, :volume, :issue, :pages, :publisher
|
17
|
-
attr_reader :description, :gscholar_url
|
18
|
-
# TODO: @doc only for development, testing modes
|
19
|
-
attr_reader :doc
|
20
|
-
|
21
|
-
SCAN_STR = {
|
22
|
-
gscholar_url:
|
23
|
-
"//div[contains(@class,'g-section cit-dgb')]/div/table/tr/td/a",
|
24
|
-
cites: "//div[contains(@id,'scholar_sec')]/div/a",
|
25
|
-
title: '//div[@id="title"]/a',
|
26
|
-
article_url: '//div[@id="title"]/a',
|
27
|
-
chart_url: '//div[contains(@class,"cit-dd")]/img'
|
28
|
-
}
|
29
|
-
|
30
|
-
TABLE_ATTR = {
|
31
|
-
authors: 'Authors',
|
32
|
-
date: 'Publication date',
|
33
|
-
journal: 'Journal name',
|
34
|
-
volume: 'Volume',
|
35
|
-
issue: 'Issue',
|
36
|
-
pages: 'Pages',
|
37
|
-
publisher: 'Publisher',
|
38
|
-
description: 'Description'
|
39
|
-
}
|
40
|
-
|
41
|
-
def initialize(scholar_pub_id)
|
42
|
-
auth_id, pub_id = scholar_pub_id.split(/:/)
|
43
|
-
url = GSCHOLAR_CIT_URL + '&user=' + auth_id \
|
44
|
-
+ '&citation_for_view=' + auth_id + ':' + pub_id
|
45
|
-
res = Typhoeus::Request.new(url).run
|
46
|
-
@doc = Nokogiri::HTML(res.response_body)
|
47
|
-
|
48
|
-
extract_html_elements
|
49
|
-
extract_html_table
|
50
|
-
end
|
51
|
-
|
52
|
-
def extract_html_elements
|
53
|
-
@cites = @doc.xpath(SCAN_STR[:cites]).text[/\d+/].to_i
|
54
|
-
@cites_url = @doc.xpath(SCAN_STR[:cites])[0].attributes['href'].value
|
55
|
-
|
56
|
-
@title = @doc.xpath(SCAN_STR[:title]).text
|
57
|
-
@article_url = @doc.xpath(SCAN_STR[:article_url]).attr('href').value
|
58
|
-
|
59
|
-
@chart_url = @doc.xpath(SCAN_STR[:chart_url]).attr('src').value
|
60
|
-
|
61
|
-
@gscholar_url = GSCHOLAR_HOST_URL + @doc.xpath(
|
62
|
-
SCAN_STR[:gscholar_url]).attr('href').value
|
63
|
-
end
|
64
|
-
|
65
|
-
def extract_html_table
|
66
|
-
# lambda gets text from right html column given name in left column
|
67
|
-
table_pick = lambda do |name|
|
68
|
-
@doc.xpath("//div[starts-with(.,'#{name}')]")[0].children[1].text
|
69
|
-
end
|
70
|
-
|
71
|
-
TABLE_ATTR.each do |k, v|
|
72
|
-
instance_variable_set("@#{k}", table_pick.call(v))
|
73
|
-
end
|
74
|
-
|
75
|
-
@authors = @authors.split(/,/).map { |a| a.split(' ') }
|
76
|
-
@date = Date.strptime(@date, '%Y/%m/%d')
|
77
|
-
end
|
78
|
-
end
|