gentle-scholar 1.0.2 → 2.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +2 -0
- data/.travis.yml +9 -0
- data/Gemfile +8 -0
- data/README.md +42 -0
- data/Rakefile +7 -0
- data/bin/gentle +10 -0
- data/gentle-scholar.gemspec +20 -0
- data/lib/gentle-scholar.rb +1 -0
- data/lib/gentle-scholar/publication.rb +80 -0
- data/lib/gentle-scholar/version.rb +3 -0
- data/spec/minitest_helper.rb +3 -0
- data/spec/publication_spec.rb +76 -0
- metadata +17 -4
- data/lib/publication.rb +0 -78
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: aa9d57fcccb8fd1fea02fe1a2651942dcbbf3940
|
4
|
+
data.tar.gz: 0f9d9a785ff33dba50e9418f9d892e3991e09e15
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 814517c4c9ed75066116a33106013645f14a0d67e964f89b92db1ae8bfebbca4b5a82822d349c97ff9081f3093dbdfd5946db2dfee90f9de0209588a8a52b8bd
|
7
|
+
data.tar.gz: 38af181ecc696bc79056c638ac96337806c363e3178c0fa30f36ae2f9e43161156975fedc42fa388c87ec55b94dda259fc8c61e707954b1209fed23cbf0c5842
|
data/.gitignore
ADDED
data/.travis.yml
ADDED
data/Gemfile
ADDED
data/README.md
ADDED
@@ -0,0 +1,42 @@
|
|
1
|
+
# gentle-scholar gem
|
2
|
+
[![Gem Version](https://badge.fury.io/rb/gentle-scholar.svg)](http://badge.fury.io/rb/gentle-scholar)
|
3
|
+
[![Build Status](https://travis-ci.org/soumyaray/gentle-scholar.svg?branch=v1.0.1)](https://travis-ci.org/soumyaray/gentle-scholar)
|
4
|
+
|
5
|
+
Gem to extract a Google Scholar publication given the URL of the publication
|
6
|
+
|
7
|
+
## About:
|
8
|
+
**NOTE:** This gem is not meant to search or crawl through Google Scholar.
|
9
|
+
This also will not extract bibtex (please see the excellent
|
10
|
+
[gscholar](https://rubygems.org/gems/gscholar) gem for that).
|
11
|
+
|
12
|
+
This gem is for publishing academics to extract their own publication
|
13
|
+
information from Google Scholar. It can retrieve standard citation information
|
14
|
+
(authors, journal, title, etc.). Additionally, it can retrieve number of
|
15
|
+
citations reported by Google Scholar, link to citations page, chart of
|
16
|
+
citations over time, and link to author's Google Scholar profile.
|
17
|
+
|
18
|
+
## Usage:
|
19
|
+
|
20
|
+
Given a Google Scholar article such as:
|
21
|
+
|
22
|
+
http://scholar.google.com.tw/citations?view_op=view_citation&hl=en&user=6WjiSOwAAAAJ&citation_for_view=6WjiSOwAAAAJ:9yKSN-GCB0IC
|
23
|
+
|
24
|
+
Retrieve information by copying the author and article ID from the URL:
|
25
|
+
|
26
|
+
sec_paper = GScholarPub.new('6WjiSOwAAAAJ:u5HHmVD_uO8C')
|
27
|
+
|
28
|
+
This returns:
|
29
|
+
|
30
|
+
#<GScholarPub:0x007fb0bc18df70
|
31
|
+
@article_url="http://pubsonline.informs.org/doi/abs/10.1287/isre.1100.0340",
|
32
|
+
@authors=[["Soumya", "Ray"], ["Sung", "S", "Kim"], ["James", "G", "Morris"]],
|
33
|
+
@chart_url="http://www.google.com/chart?chs=475x90&cht=bvs&chf=bg,s,e8f4f7&chco=1111cc&chbh=r,2.0,0.0&chxt=x,y&chxr=1,0,5,5&chd=t:100.0,80.0,100.0
|
34
|
+
@cites=14,
|
35
|
+
@cites_url="http://scholar.google.com/scholar?oi=bibs&hl=en&oe=ASCII&cites=11777343089817755068",
|
36
|
+
@description="The highly competitive and rapidly changing market for online services is becoming increasingly effective at locking users in through
|
37
|
+
@issue="1",
|
38
|
+
@journal="Information Systems Research",
|
39
|
+
@pages="197-213",
|
40
|
+
@title="Research Note\u0097Online Users' Switching Costs: Their Nature and Formation",
|
41
|
+
@volume="23"
|
42
|
+
>
|
data/Rakefile
ADDED
data/bin/gentle
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
$:.push File.expand_path("../lib", __FILE__)
|
2
|
+
require 'gentle-scholar/version'
|
3
|
+
|
4
|
+
# Gem specification for gentle-scholar: identity and descriptive metadata
# first, then the packaged file lists, then the dependencies.
Gem::Specification.new do |spec|
  spec.name        = 'gentle-scholar'
  spec.version     = GentleScholar::VERSION
  spec.date        = '2014-05-27'
  spec.summary     = 'Google Scholar infor extractor'
  spec.description = 'Extract author/paper info from Google Scholar'
  spec.authors     = ['Soumya Ray']
  spec.email       = 'soumya.ray@gmail.com'
  spec.homepage    = 'https://github.com/soumyaray/gentle-scholar'
  spec.license     = 'MIT'

  # Both lists are taken from what git tracks at build time.
  tracked_files = `git ls-files`
  tracked_tests = `git ls-files -- {test,spec,features}/*`
  spec.files      = tracked_files.split("\n")
  spec.test_files = tracked_tests.split("\n")

  spec.add_development_dependency 'minitest'
  spec.add_development_dependency 'minitest-rg'
  spec.add_runtime_dependency 'nokogiri', '>= 1.6.2'
  spec.add_runtime_dependency 'typhoeus', '>= 0.6.8'
end
|
@@ -0,0 +1 @@
|
|
1
|
+
require 'gentle-scholar/publication.rb'
|
@@ -0,0 +1,80 @@
|
|
1
|
+
# Author:: Soumya Ray (mailto: soumya.ray@gmail.com)
|
2
|
+
# License:: MIT
|
3
|
+
require 'typhoeus'
|
4
|
+
require 'nokogiri'
|
5
|
+
require 'date'
|
6
|
+
|
7
|
+
module GentleScholar
  # Loads a single publication page from Google Scholar and returns its
  # attributes as a Hash keyed by symbol, including dynamic attributes such
  # as the number of citations. Every method is class-level.
  class Publication
    # Raised when a request to Google Scholar comes back without a page body.
    class FetchError < StandardError; end

    GS_HOST_URL = 'http://scholar.google.com'.freeze
    GS_CIT_URL = "#{GS_HOST_URL}/citations?view_op=view_citation&hl=en".freeze

    # XPath queries for the values scraped from individual page elements.
    SCAN_STR = {
      gscholar_url: "//div[contains(@class,'g-section cit-dgb')]"\
                    '/div/table/tr/td/a',
      cites:        "//div[contains(@id,'scholar_sec')]/div/a",
      cites_url:    "//div[contains(@id,'scholar_sec')]/div/a",
      title:        '//div[@id="title"]/a',
      article_url:  '//div[@id="title"]/a',
      chart_url:    '//div[contains(@class,"cit-dd")]/img'
    }.freeze

    # Converters turning the nodes matched by the same-named SCAN_STR query
    # into the final value.
    SCAN_LAMBDAS = {
      cites:        ->(x) { x.text[/\d+/].to_i },
      cites_url:    ->(x) { x[0].attributes['href'].value },
      title:        ->(x) { x.text },
      article_url:  ->(x) { x.attr('href').value },
      chart_url:    ->(x) { x.attr('src').value },
      gscholar_url: ->(x) { GS_HOST_URL + x.attr('href').value }
    }.freeze

    # Left-column labels of the publication table, keyed by result attribute.
    TABLE_ATTR = {
      authors:     'Authors',
      date:        'Publication date',
      journal:     'Journal name',
      volume:      'Volume',
      issue:       'Issue',
      pages:       'Pages',
      publisher:   'Publisher',
      description: 'Description'
    }.freeze

    # Post-processing for table values that should not stay plain strings.
    TABLE_LAMBDAS = {
      authors: ->(x) { x.split(/,/).map { |a| a.split(' ') } },
      date:    ->(x) { parse_date(x) }
    }.freeze

    # Fetches a publication page and extracts all known attributes.
    #
    # scholar_pub_id:: String of the form '<author_id>:<publication_id>'
    #
    # Returns a Hash of the attributes found on the page; attributes that
    # are absent from the page are left out of the Hash.
    # Raises ArgumentError when the id does not contain both parts, and
    # FetchError when the response has no body.
    def self.get_from_http(scholar_pub_id)
      auth_id, pub_id = scholar_pub_id.to_s.split(/:/)
      if auth_id.to_s.empty? || pub_id.to_s.empty?
        raise ArgumentError, "expected '<author_id>:<publication_id>', " \
                             "got #{scholar_pub_id.inspect}"
      end

      url = "#{GS_CIT_URL}&user=#{auth_id}" \
            "&citation_for_view=#{auth_id}:#{pub_id}"
      body = Typhoeus::Request.new(url).run.response_body
      # Fail loudly here rather than return a Hash with nothing in it.
      if body.nil? || body.empty?
        raise FetchError, "empty response from Google Scholar for #{url}"
      end

      doc = Nokogiri::HTML(body)
      extract_html_elements(doc).merge(extract_html_table(doc))
    end

    # Extracts the SCAN_STR attributes from a parsed page.
    #
    # doc:: object answering +xpath(query)+ with a node collection
    #
    # Returns a Hash holding only the attributes whose query matched.
    def self.extract_html_elements(doc)
      SCAN_LAMBDAS.each_with_object({}) do |(key, lam), found|
        nodes = doc.xpath(SCAN_STR[key])
        found[key] = lam.call(nodes) if nodes.any?
      end
    end

    # Extracts the TABLE_ATTR attributes from a parsed page by looking up
    # each label in the left column and reading the text of the right one.
    #
    # doc:: object answering +xpath(query)+ with a node collection
    #
    # Returns a Hash holding only the attributes whose row exists. Rows
    # missing from the page are skipped instead of raising.
    def self.extract_html_table(doc)
      elements = TABLE_ATTR.each_with_object({}) do |(key, label), found|
        row = doc.xpath("//div[starts-with(.,'#{label}')]")[0]
        cell = row && row.children[1]
        found[key] = cell.text if cell
      end

      TABLE_LAMBDAS.each do |key, lam|
        elements[key] = lam.call(elements[key]) if elements.key?(key)
      end
      elements
    end

    # Parses 'YYYY/MM/DD', and also the shorter 'YYYY/MM' and 'YYYY' forms
    # (presumably what Scholar shows when only part of the date is known);
    # missing month/day default to 1.
    # Raises ArgumentError for anything else.
    def self.parse_date(text)
      parts = text.to_s.strip.split('/')
      unless (1..3).cover?(parts.size)
        raise ArgumentError, "invalid publication date: #{text.inspect}"
      end

      year, month, day = parts.map { |part| Integer(part, 10) }
      Date.new(year, month || 1, day || 1)
    end
    private_class_method :parse_date
  end
end
|
@@ -0,0 +1,76 @@
|
|
1
|
+
require 'minitest/autorun'
|
2
|
+
require 'minitest/rg'
|
3
|
+
require './spec/minitest_helper.rb'
|
4
|
+
|
5
|
+
# Checks the attribute Hash extracted for one known Google Scholar paper.
describe 'Publication', 'A single publication listing' do
  describe 'when it is a paper' do
    # SEC_PAPER is not defined in this file; presumably the spec helper
    # builds it once so the examples share a single extraction result.
    before do
      @sec_paper = SEC_PAPER
    end

    it 'has the right title' do
      @sec_paper[:title].must_equal 'Security Assurance: How Online Service '\
        'Providers Can Influence Security Control Perceptions and Gain Trust'
    end

    it 'has some number of cites' do
      @sec_paper[:cites].must_be :>, 0
    end

    # Regexps are passed in parentheses so Ruby does not warn about an
    # ambiguous first argument.
    it 'has some url for listing cites' do
      @sec_paper[:cites_url].must_match(%r{http://.*})
    end

    it 'has some url for citations chart' do
      @sec_paper[:chart_url].must_match(%r{http://.*})
    end

    it 'has some url for the published article' do
      @sec_paper[:article_url].must_match(%r{http://.*})
    end

    it 'has the right author(s) (as nested array)' do
      @sec_paper[:authors].must_equal [
        %w[Soumya Ray], %w[Terence Ow], %w[Sung S Kim]
      ]
    end

    it 'has a publication date' do
      @sec_paper[:date].must_be_instance_of Date
    end

    it 'has the right journal\'s name' do
      @sec_paper[:journal].must_equal 'Decision Sciences'
    end

    it 'has the right volume number (as string)' do
      @sec_paper[:volume].must_equal '42'
    end

    it 'has the right issue number (as string)' do
      @sec_paper[:issue].must_equal '2'
    end

    it 'has the right page numbers' do
      @sec_paper[:pages].must_equal '391-412'
    end

    it 'has the right publisher' do
      @sec_paper[:publisher].must_equal 'Blackwell Publishing Inc'
    end

    it 'has some url for the main citations page' do
      @sec_paper[:gscholar_url].must_match(/citations/)
    end
  end
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: gentle-scholar
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version:
|
4
|
+
version: 2.0.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Soumya Ray
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-05-
|
11
|
+
date: 2014-05-27 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: minitest
|
@@ -72,7 +72,18 @@ executables: []
|
|
72
72
|
extensions: []
|
73
73
|
extra_rdoc_files: []
|
74
74
|
files:
|
75
|
-
-
|
75
|
+
- ".gitignore"
|
76
|
+
- ".travis.yml"
|
77
|
+
- Gemfile
|
78
|
+
- README.md
|
79
|
+
- Rakefile
|
80
|
+
- bin/gentle
|
81
|
+
- gentle-scholar.gemspec
|
82
|
+
- lib/gentle-scholar.rb
|
83
|
+
- lib/gentle-scholar/publication.rb
|
84
|
+
- lib/gentle-scholar/version.rb
|
85
|
+
- spec/minitest_helper.rb
|
86
|
+
- spec/publication_spec.rb
|
76
87
|
homepage: https://github.com/soumyaray/gentle-scholar
|
77
88
|
licenses:
|
78
89
|
- MIT
|
@@ -97,4 +108,6 @@ rubygems_version: 2.1.11
|
|
97
108
|
signing_key:
|
98
109
|
specification_version: 4
|
99
110
|
summary: Google Scholar infor extractor
|
100
|
-
test_files:
|
111
|
+
test_files:
|
112
|
+
- spec/minitest_helper.rb
|
113
|
+
- spec/publication_spec.rb
|
data/lib/publication.rb
DELETED
@@ -1,78 +0,0 @@
|
|
1
|
-
# Author:: Soumya Ray (mailto: soumya.ray@gmail.com)
|
2
|
-
# License:: MIT
|
3
|
-
|
4
|
-
require 'typhoeus'
|
5
|
-
require 'nokogiri'
|
6
|
-
require 'date'
|
7
|
-
|
8
|
-
# This class loads a single publication from Google scholar and returns
|
9
|
-
# all its attributes, including dynamic attributes like number of citations
|
10
|
-
class GScholarPub
|
11
|
-
GSCHOLAR_HOST_URL = 'http://scholar.google.com'
|
12
|
-
GSCHOLAR_CIT_URL =
|
13
|
-
'http://scholar.google.com/citations?view_op=view_citation&hl=en'
|
14
|
-
|
15
|
-
attr_reader :title, :cites, :cites_url, :chart_url, :article_url
|
16
|
-
attr_reader :authors, :date, :journal, :volume, :issue, :pages, :publisher
|
17
|
-
attr_reader :description, :gscholar_url
|
18
|
-
# TODO: @doc only for development, testing modes
|
19
|
-
attr_reader :doc
|
20
|
-
|
21
|
-
SCAN_STR = {
|
22
|
-
gscholar_url:
|
23
|
-
"//div[contains(@class,'g-section cit-dgb')]/div/table/tr/td/a",
|
24
|
-
cites: "//div[contains(@id,'scholar_sec')]/div/a",
|
25
|
-
title: '//div[@id="title"]/a',
|
26
|
-
article_url: '//div[@id="title"]/a',
|
27
|
-
chart_url: '//div[contains(@class,"cit-dd")]/img'
|
28
|
-
}
|
29
|
-
|
30
|
-
TABLE_ATTR = {
|
31
|
-
authors: 'Authors',
|
32
|
-
date: 'Publication date',
|
33
|
-
journal: 'Journal name',
|
34
|
-
volume: 'Volume',
|
35
|
-
issue: 'Issue',
|
36
|
-
pages: 'Pages',
|
37
|
-
publisher: 'Publisher',
|
38
|
-
description: 'Description'
|
39
|
-
}
|
40
|
-
|
41
|
-
def initialize(scholar_pub_id)
|
42
|
-
auth_id, pub_id = scholar_pub_id.split(/:/)
|
43
|
-
url = GSCHOLAR_CIT_URL + '&user=' + auth_id \
|
44
|
-
+ '&citation_for_view=' + auth_id + ':' + pub_id
|
45
|
-
res = Typhoeus::Request.new(url).run
|
46
|
-
@doc = Nokogiri::HTML(res.response_body)
|
47
|
-
|
48
|
-
extract_html_elements
|
49
|
-
extract_html_table
|
50
|
-
end
|
51
|
-
|
52
|
-
def extract_html_elements
|
53
|
-
@cites = @doc.xpath(SCAN_STR[:cites]).text[/\d+/].to_i
|
54
|
-
@cites_url = @doc.xpath(SCAN_STR[:cites])[0].attributes['href'].value
|
55
|
-
|
56
|
-
@title = @doc.xpath(SCAN_STR[:title]).text
|
57
|
-
@article_url = @doc.xpath(SCAN_STR[:article_url]).attr('href').value
|
58
|
-
|
59
|
-
@chart_url = @doc.xpath(SCAN_STR[:chart_url]).attr('src').value
|
60
|
-
|
61
|
-
@gscholar_url = GSCHOLAR_HOST_URL + @doc.xpath(
|
62
|
-
SCAN_STR[:gscholar_url]).attr('href').value
|
63
|
-
end
|
64
|
-
|
65
|
-
def extract_html_table
|
66
|
-
# lambda gets text from right html column given name in left column
|
67
|
-
table_pick = lambda do |name|
|
68
|
-
@doc.xpath("//div[starts-with(.,'#{name}')]")[0].children[1].text
|
69
|
-
end
|
70
|
-
|
71
|
-
TABLE_ATTR.each do |k, v|
|
72
|
-
instance_variable_set("@#{k}", table_pick.call(v))
|
73
|
-
end
|
74
|
-
|
75
|
-
@authors = @authors.split(/,/).map { |a| a.split(' ') }
|
76
|
-
@date = Date.strptime(@date, '%Y/%m/%d')
|
77
|
-
end
|
78
|
-
end
|