gentle-scholar 1.0.2 → 2.0.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 9a16b27e1ce933a215cfc7d0e8ba1763f0c24aec
4
- data.tar.gz: 778624c2dcc3146444938c40869775192ab65edc
3
+ metadata.gz: aa9d57fcccb8fd1fea02fe1a2651942dcbbf3940
4
+ data.tar.gz: 0f9d9a785ff33dba50e9418f9d892e3991e09e15
5
5
  SHA512:
6
- metadata.gz: d1f88ec34776d3abadca4c53ba613151c12c0b77df0dd993aef403ec72a7aba3aa27485728e543f0cfa0a308c6c2b6674d7761394beea3543c299f37b79eb2d7
7
- data.tar.gz: 462726a60bf05d720f9503390f4e3ebed7cdbc55d825631275a40ccb7aed4d2b65f8ea8aa2afd9e6434f261e765b428af1331243ed609f0d60e3aa8ddacbcc95
6
+ metadata.gz: 814517c4c9ed75066116a33106013645f14a0d67e964f89b92db1ae8bfebbca4b5a82822d349c97ff9081f3093dbdfd5946db2dfee90f9de0209588a8a52b8bd
7
+ data.tar.gz: 38af181ecc696bc79056c638ac96337806c363e3178c0fa30f36ae2f9e43161156975fedc42fa388c87ec55b94dda259fc8c61e707954b1209fed23cbf0c5842
@@ -0,0 +1,2 @@
1
+ *.gem
2
+ Gemfile.lock
@@ -0,0 +1,9 @@
1
# Travis CI configuration: run the suite against each listed Ruby.
language: ruby
rvm:
  - ruby-head
  - 2.1.0
  - 1.9.3
  - jruby-head
  - jruby-19mode
# Restrict builds to master. `only` is a sub-key of `branches`; as a
# top-level key it is not recognised.
branches:
  only:
    - master
data/Gemfile ADDED
@@ -0,0 +1,8 @@
1
source 'https://rubygems.org'

# Pull in the dependencies declared in the gemspec.
gemspec

# pry allows us to use pry for `bundle console`.
gem 'pry'

# rake is only needed to run the test task.
gem 'rake', group: :test
@@ -0,0 +1,42 @@
1
+ # gentle-scholar gem
2
+ [![Gem Version](https://badge.fury.io/rb/gentle-scholar.svg)](http://badge.fury.io/rb/gentle-scholar)
3
+ [![Build Status](https://travis-ci.org/soumyaray/gentle-scholar.svg?branch=v1.0.1)](https://travis-ci.org/soumyaray/gentle-scholar)
4
+
5
+ Gem to extract Google Scholar publication information given the URL of a publication
6
+
7
+ ## About:
8
+ **NOTE:** This gem is not meant to search or crawl through Google Scholar.
9
+ This also will not extract bibtex (please see the excellent
10
+ [gscholar](https://rubygems.org/gems/gscholar) gem for that).
11
+
12
+ This gem is for publishing academics to extract their own publication
13
+ information from Google Scholar. It can retrieve standard citation information
14
+ (authors, journal, title, etc.). Additionally, it can retrieve number of
15
+ citations reported by Google Scholar, link to citations page, chart of
16
+ citations over time, and link to author's Google Scholar profile.
17
+
18
+ ## Usage:
19
+
20
+ Given a google scholar article such as:
21
+
22
+ http://scholar.google.com.tw/citations?view_op=view_citation&hl=en&user=6WjiSOwAAAAJ&citation_for_view=6WjiSOwAAAAJ:9yKSN-GCB0IC
23
+
24
+ Retrieve information by copying the author and article ID from the URL:
25
+
26
+ sec_paper = GentleScholar::Publication.get_from_http('6WjiSOwAAAAJ:u5HHmVD_uO8C')
27
+
28
+ This returns:
29
+
30
+ #<GScholarPub:0x007fb0bc18df70
31
+ @article_url="http://pubsonline.informs.org/doi/abs/10.1287/isre.1100.0340",
32
+ @authors=[["Soumya", "Ray"], ["Sung", "S", "Kim"], ["James", "G", "Morris"]],
33
+ @chart_url="http://www.google.com/chart?chs=475x90&cht=bvs&chf=bg,s,e8f4f7&chco=1111cc&chbh=r,2.0,0.0&chxt=x,y&chxr=1,0,5,5&chd=t:100.0,80.0,100.0
34
+ @cites=14,
35
+ @cites_url="http://scholar.google.com/scholar?oi=bibs&hl=en&oe=ASCII&cites=11777343089817755068",
36
+ @description="The highly competitive and rapidly changing market for online services is becoming increasingly effective at locking users in through
37
+ @issue="1",
38
+ @journal="Information Systems Research",
39
+ @pages="197-213",
40
+ @title="Research Note\u0097Online Users' Switching Costs: Their Nature and Formation",
41
+ @volume="23"
42
+ >
@@ -0,0 +1,7 @@
1
+ require 'rake/testtask'
2
+
3
# Running bare `rake` runs the spec task.
task default: [:spec]

# Define the `spec` task over every *_spec.rb file directly under spec/.
# The task name is passed directly; the previous `name=:spec` form also
# created an unused local variable.
Rake::TestTask.new(:spec) do |t|
  t.pattern = 'spec/*_spec.rb'
end
@@ -0,0 +1,10 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'gentle-scholar'
4
+
5
# Demo: fetch one publication by its '<author_id>:<publication_id>' key and
# print every attribute that was extracted.
pub = GentleScholar::Publication.get_from_http('6WjiSOwAAAAJ:9yKSN-GCB0IC')

puts 'Publication Details'
pub.each do |field, data|
  # Use the yielded value instead of looking the key up a second time.
  puts "#{field}: #{data}"
end
@@ -0,0 +1,20 @@
1
+ $:.push File.expand_path("../lib", __FILE__)
2
+ require 'gentle-scholar/version'
3
+
4
# Gem specification for gentle-scholar. The version number is read from
# GentleScholar::VERSION so it is maintained in a single place.
Gem::Specification.new do |s|
  s.name        = 'gentle-scholar'
  s.version     = GentleScholar::VERSION
  s.date        = '2014-05-27'
  s.summary     = 'Google Scholar info extractor'
  s.description = 'Extract author/paper info from Google Scholar'
  s.authors     = ['Soumya Ray']
  s.email       = 'soumya.ray@gmail.com'
  s.homepage    = 'https://github.com/soumyaray/gentle-scholar'
  s.license     = 'MIT'

  # Package whatever git tracks; files under test/, spec/ and features/
  # are additionally registered as the gem's test files.
  s.files       = `git ls-files`.split("\n")
  s.test_files  = `git ls-files -- {test,spec,features}/*`.split("\n")

  s.add_runtime_dependency 'nokogiri', '>= 1.6.2'
  s.add_runtime_dependency 'typhoeus', '>= 0.6.8'
  s.add_development_dependency 'minitest'
  s.add_development_dependency 'minitest-rg'
end
@@ -0,0 +1 @@
1
+ require 'gentle-scholar/publication.rb'
@@ -0,0 +1,80 @@
1
+ # Author:: Soumya Ray (mailto: soumya.ray@gmail.com)
2
+ # License:: MIT
3
+ require 'typhoeus'
4
+ require 'nokogiri'
5
+ require 'date'
6
+
7
module GentleScholar
  # Loads a single publication page from Google Scholar and returns its
  # attributes as a Hash keyed by symbol, including dynamic attributes such
  # as the number of citations.
  #
  # Attributes that cannot be found in the page are omitted from the result
  # rather than raising, so callers should treat every key as optional.
  class Publication
    GS_HOST_URL = 'http://scholar.google.com'.freeze
    GS_CIT_URL = "#{GS_HOST_URL}/citations?view_op=view_citation&hl=en".freeze

    # XPath expression locating the node(s) for each page element.
    SCAN_STR = {
      gscholar_url: "//div[contains(@class,'g-section cit-dgb')]"\
                    "/div/table/tr/td/a",
      cites:        "//div[contains(@id,'scholar_sec')]/div/a",
      cites_url:    "//div[contains(@id,'scholar_sec')]/div/a",
      title:        '//div[@id="title"]/a',
      article_url:  '//div[@id="title"]/a',
      chart_url:    '//div[contains(@class,"cit-dd")]/img'
    }.freeze

    # Converts the node set matched by SCAN_STR[key] into the final value.
    # The iteration order of this hash is the key order of the result.
    SCAN_LAMBDAS = {
      cites:        ->(x) { x.text[/\d+/].to_i },
      cites_url:    ->(x) { x[0].attributes['href'].value },
      title:        ->(x) { x.text },
      article_url:  ->(x) { x.attr('href').value },
      chart_url:    ->(x) { x.attr('src').value },
      gscholar_url: ->(x) { GS_HOST_URL + x.attr('href').value }
    }.freeze

    # Label shown in the left column of the details table for each attribute.
    TABLE_ATTR = {
      authors:     'Authors',
      date:        'Publication date',
      journal:     'Journal name',
      volume:      'Volume',
      issue:       'Issue',
      pages:       'Pages',
      publisher:   'Publisher',
      description: 'Description'
    }.freeze

    # Optional post-processing of the raw table text; attributes without an
    # entry here are returned as plain strings.
    TABLE_LAMBDAS = {
      authors: ->(x) { x.split(/,/).map { |a| a.split(' ') } },
      date:    ->(x) { parse_date(x) }
    }.freeze

    # Fetches a publication page over HTTP and extracts its attributes.
    #
    # @param scholar_pub_id [String] '<author_id>:<publication_id>', as found
    #   in the citation_for_view parameter of the publication's URL
    # @return [Hash{Symbol => Object}] the extracted attributes
    # @raise [ArgumentError] if the id is not of the expected form
    def self.get_from_http(scholar_pub_id)
      # Build (and validate) the URL before touching the network.
      url = citation_url(scholar_pub_id)
      res = Typhoeus::Request.new(url).run
      doc = Nokogiri::HTML(res.response_body)

      extract_html_elements(doc).merge(extract_html_table(doc))
    end

    # Extracts the attributes that are located by XPath (SCAN_STR) and
    # converted by SCAN_LAMBDAS. Elements with no matching node are skipped.
    #
    # @param doc [#xpath] parsed page
    # @return [Hash{Symbol => Object}]
    def self.extract_html_elements(doc)
      SCAN_LAMBDAS.each_with_object({}) do |(key, convert), found|
        nodes = doc.xpath(SCAN_STR[key])
        next if nodes.empty?

        found[key] = convert.call(nodes)
      end
    end

    # Extracts the attributes listed in the details table (TABLE_ATTR): for
    # each label, the text of the second child of the first div whose text
    # starts with that label. Rows that are absent from the page are skipped
    # instead of raising NoMethodError on nil, matching the behaviour of
    # extract_html_elements.
    #
    # @param doc [#xpath] parsed page
    # @return [Hash{Symbol => Object}]
    def self.extract_html_table(doc)
      TABLE_ATTR.each_with_object({}) do |(key, label), fields|
        row = doc.xpath("//div[starts-with(.,'#{label}')]").first
        cell = row && row.children[1]
        next unless cell

        text = cell.text
        convert = TABLE_LAMBDAS[key]
        fields[key] = convert ? convert.call(text) : text
      end
    end

    # Builds the citation page URL for an '<author_id>:<publication_id>' key.
    # Raises ArgumentError up front rather than failing later with a
    # TypeError while concatenating nil.
    def self.citation_url(scholar_pub_id)
      auth_id, pub_id = scholar_pub_id.to_s.split(/:/)
      if auth_id.to_s.empty? || pub_id.to_s.empty?
        raise ArgumentError,
              "expected '<author_id>:<publication_id>', " \
              "got #{scholar_pub_id.inspect}"
      end

      "#{GS_CIT_URL}&user=#{auth_id}&citation_for_view=#{auth_id}:#{pub_id}"
    end

    # Parses 'YYYY/MM/DD' into a Date. Also accepts 'YYYY/MM' and 'YYYY'
    # (missing parts default to 1), which a strict '%Y/%m/%d' parse rejects.
    # Raises ArgumentError on anything else.
    def self.parse_date(text)
      parts = text.to_s.split('/').map { |part| Integer(part, 10) }
      unless (1..3).cover?(parts.size)
        raise ArgumentError, "invalid date: #{text.inspect}"
      end

      Date.new(*parts)
    end

    private_class_method :citation_url, :parse_date
  end
end
@@ -0,0 +1,3 @@
1
module GentleScholar
  # Gem version string; frozen so it cannot be mutated at runtime.
  VERSION = '2.0.0'.freeze
end
@@ -0,0 +1,3 @@
1
+ require './lib/gentle-scholar.rb'
2
+
3
# Shared fixture: the publication is fetched once, when this helper is
# loaded, and reused by the specs.
SEC_PAPER =
  GentleScholar::Publication.get_from_http('6WjiSOwAAAAJ:u5HHmVD_uO8C')
@@ -0,0 +1,76 @@
1
+ require 'minitest/autorun'
2
+ require 'minitest/rg'
3
+ require './spec/minitest_helper.rb'
4
+
5
# Checks the attributes extracted for one known paper. SEC_PAPER is the
# shared fixture defined in spec/minitest_helper.rb.
describe 'Publication', 'A single publication listing' do
  describe 'when it is a paper' do
    before do
      @sec_paper = SEC_PAPER
    end

    it 'has the right title' do
      @sec_paper[:title].must_equal(
        'Security Assurance: How Online Service '\
        'Providers Can Influence Security Control Perceptions and Gain Trust'
      )
    end

    it 'has some number of cites' do
      @sec_paper[:cites].must_be :>, 0
    end

    # Regexp arguments are parenthesised: a bare `must_match /.../` is parsed
    # with an "ambiguous first argument" warning.
    it 'has some url for listing cites' do
      @sec_paper[:cites_url].must_match(%r{http://.*})
    end

    it 'has some url for citations chart' do
      @sec_paper[:chart_url].must_match(%r{http://.*})
    end

    it 'has some url for the published article' do
      @sec_paper[:article_url].must_match(%r{http://.*})
    end

    it 'has the right author(s) (as nested array)' do
      @sec_paper[:authors].must_equal(
        [%w[Soumya Ray], %w[Terence Ow], %w[Sung S Kim]]
      )
    end

    it 'has a publication date' do
      @sec_paper[:date].must_be_instance_of Date
    end

    it 'has the right journal\'s name' do
      @sec_paper[:journal].must_equal 'Decision Sciences'
    end

    it 'has the right volume number (as string)' do
      @sec_paper[:volume].must_equal '42'
    end

    it 'has the right issue number (as string)' do
      @sec_paper[:issue].must_equal '2'
    end

    it 'has the right page numbers' do
      @sec_paper[:pages].must_equal '391-412'
    end

    it 'has the right publisher' do
      @sec_paper[:publisher].must_equal 'Blackwell Publishing Inc'
    end

    it 'has some url for the main citations page' do
      @sec_paper[:gscholar_url].must_match(/citations/)
    end
  end
end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: gentle-scholar
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.2
4
+ version: 2.0.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Soumya Ray
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-05-25 00:00:00.000000000 Z
11
+ date: 2014-05-27 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: minitest
@@ -72,7 +72,18 @@ executables: []
72
72
  extensions: []
73
73
  extra_rdoc_files: []
74
74
  files:
75
- - lib/publication.rb
75
+ - ".gitignore"
76
+ - ".travis.yml"
77
+ - Gemfile
78
+ - README.md
79
+ - Rakefile
80
+ - bin/gentle
81
+ - gentle-scholar.gemspec
82
+ - lib/gentle-scholar.rb
83
+ - lib/gentle-scholar/publication.rb
84
+ - lib/gentle-scholar/version.rb
85
+ - spec/minitest_helper.rb
86
+ - spec/publication_spec.rb
76
87
  homepage: https://github.com/soumyaray/gentle-scholar
77
88
  licenses:
78
89
  - MIT
@@ -97,4 +108,6 @@ rubygems_version: 2.1.11
97
108
  signing_key:
98
109
  specification_version: 4
99
110
  summary: Google Scholar infor extractor
100
- test_files: []
111
+ test_files:
112
+ - spec/minitest_helper.rb
113
+ - spec/publication_spec.rb
@@ -1,78 +0,0 @@
1
- # Author:: Soumya Ray (mailto: soumya.ray@gmail.com)
2
- # License:: MIT
3
-
4
- require 'typhoeus'
5
- require 'nokogiri'
6
- require 'date'
7
-
8
- # This class loads a single publication from Google scholar and returns
9
- # all its attributes, including dynamic attributes like number of citations
10
- class GScholarPub
11
- GSCHOLAR_HOST_URL = 'http://scholar.google.com'
12
- GSCHOLAR_CIT_URL =
13
- 'http://scholar.google.com/citations?view_op=view_citation&hl=en'
14
-
15
- attr_reader :title, :cites, :cites_url, :chart_url, :article_url
16
- attr_reader :authors, :date, :journal, :volume, :issue, :pages, :publisher
17
- attr_reader :description, :gscholar_url
18
- # TODO: @doc only for development, testing modes
19
- attr_reader :doc
20
-
21
- SCAN_STR = {
22
- gscholar_url:
23
- "//div[contains(@class,'g-section cit-dgb')]/div/table/tr/td/a",
24
- cites: "//div[contains(@id,'scholar_sec')]/div/a",
25
- title: '//div[@id="title"]/a',
26
- article_url: '//div[@id="title"]/a',
27
- chart_url: '//div[contains(@class,"cit-dd")]/img'
28
- }
29
-
30
- TABLE_ATTR = {
31
- authors: 'Authors',
32
- date: 'Publication date',
33
- journal: 'Journal name',
34
- volume: 'Volume',
35
- issue: 'Issue',
36
- pages: 'Pages',
37
- publisher: 'Publisher',
38
- description: 'Description'
39
- }
40
-
41
- def initialize(scholar_pub_id)
42
- auth_id, pub_id = scholar_pub_id.split(/:/)
43
- url = GSCHOLAR_CIT_URL + '&user=' + auth_id \
44
- + '&citation_for_view=' + auth_id + ':' + pub_id
45
- res = Typhoeus::Request.new(url).run
46
- @doc = Nokogiri::HTML(res.response_body)
47
-
48
- extract_html_elements
49
- extract_html_table
50
- end
51
-
52
- def extract_html_elements
53
- @cites = @doc.xpath(SCAN_STR[:cites]).text[/\d+/].to_i
54
- @cites_url = @doc.xpath(SCAN_STR[:cites])[0].attributes['href'].value
55
-
56
- @title = @doc.xpath(SCAN_STR[:title]).text
57
- @article_url = @doc.xpath(SCAN_STR[:article_url]).attr('href').value
58
-
59
- @chart_url = @doc.xpath(SCAN_STR[:chart_url]).attr('src').value
60
-
61
- @gscholar_url = GSCHOLAR_HOST_URL + @doc.xpath(
62
- SCAN_STR[:gscholar_url]).attr('href').value
63
- end
64
-
65
- def extract_html_table
66
- # lambda gets text from right html column given name in left column
67
- table_pick = lambda do |name|
68
- @doc.xpath("//div[starts-with(.,'#{name}')]")[0].children[1].text
69
- end
70
-
71
- TABLE_ATTR.each do |k, v|
72
- instance_variable_set("@#{k}", table_pick.call(v))
73
- end
74
-
75
- @authors = @authors.split(/,/).map { |a| a.split(' ') }
76
- @date = Date.strptime(@date, '%Y/%m/%d')
77
- end
78
- end