gared 0.0.7

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: d11aa7725f44fab48661ca7739a9f6a41e1ee0c686bc3c3df02279654b427616
4
+ data.tar.gz: c88f9cb2fdd8884f1f954e183dc54b055f6a38f5225b36a5ca375a08606e5436
5
+ SHA512:
6
+ metadata.gz: 3a24efea632ca0dc83f1e8f7ae7859926ae94073afe259504ff40873906d146dcbeb28ea7ae6614a18400051e800add1fcd1c5627149e4041d31a04f178e9c6a
7
+ data.tar.gz: cb7d1a344a0a432f88273054e21e98304b87e77f57b1131b0b37a85d56e66c7db4129f60e35b73181c5c4ba78875f56f58ff95c3eab31b6eefae5919fae75d3e
data/lib/gared.rb ADDED
@@ -0,0 +1,12 @@
1
# Entry point of the gem: requiring this file loads the data classes
# (Publication, Holding, Person) and every bibliographic source adapter.
module Gared
  require 'gared/publication'
  require 'gared/holding'
  require 'gared/person'
  require 'gared/primo'
  require 'gared/aleph'
  require 'gared/hebrewbooks'
  require 'gared/idea'
  require 'gared/googlebooks'
  # jpress.rb is packaged with the gem but was never loaded, which left
  # Gared::Jpress undefined after `require 'gared'`.
  require 'gared/jpress'
  # ...

end
@@ -0,0 +1,80 @@
1
+ # Z39.50 values according to https://www.loc.gov/z3950/agency/bib1.html
2
+ # and NLI info according to http://web.nli.org.il/sites/NLI/Hebrew/infochannels/librarians/Pages/Z39.50.aspx
3
+ # Name of Database: NNL01
4
+ # Host name: aleph.nli.org.il
5
+ # IP address: 192.114.7.200
6
+ # Port: 9991
7
+ # Character-set: UTF-8
8
+ # We support the following record syntaxes:
9
+ # USMARC, OPAC, XML
10
+ # We support the following word searches:
11
+ # 1016, 1017, 1,1003, 1004,4,21,30,31,7,12,1007,1031,1007,5028,1033
12
+ # We support the following phrase searches:
13
+ # 7,12,1,1003,1004,4,21,15
14
+ # We support the following sorts:
15
+ # 1,4,30,31,1003
16
+
17
module Gared
  require 'zoom'
  require 'nokogiri'

  # Z39.50 client for an Aleph catalogue. The field scraping follows the
  # National Library of Israel's MARC usage; no attempt is made to be generic.
  class Aleph
    # host, port, database: Z39.50 connection details of the catalogue.
    # syntax: preferred record syntax requested from the server.
    def initialize(host, port, database, syntax = 'USMARC')
      @options = {host: host, port: port, database: database, syntax: syntax}
    end

    # Not implemented yet; returns nil.
    def query_persons(q)
    end

    # Not implemented yet; returns nil.
    def query_person(person)
    end

    # Not implemented yet; returns nil.
    def query_publications(q)
    end

    # Not implemented yet; returns nil.
    def query_publication(publication)
    end

    # Searches the catalogue by author (Bib-1 use attribute 1003) and returns
    # an Array of Publication objects, each carrying one Holding.
    #
    # Returns nil when the search yields no records at all (kept for backward
    # compatibility), and an Array (possibly empty) otherwise. Records without
    # a 090$a field are skipped; they may be archival files or other
    # non-publications.
    def query_publications_by_person(person)
      publications = nil
      # The result is captured in an outer local so that it does not depend on
      # what ZOOM::Connection.open itself returns when given a block.
      ZOOM::Connection.open(@options[:host], @options[:port]) do |conn|
        conn.database_name = @options[:database] # 'aleph.nli.org.il',9991
        conn.preferred_record_syntax = @options[:syntax]
        records = conn.search(author_query(person)).records
        unless records.nil? || records.empty?
          publications = []
          records.each do |record|
            publication = publication_from(record)
            publications << publication unless publication.nil?
          end
        end
      end
      publications
    end

    private

    # Builds the PQF author query. Double quotes and backslashes in the name
    # are backslash-escaped so they cannot terminate the quoted term early.
    def author_query(person)
      escaped = person.to_s.gsub(/["\\]/) { |ch| "\\#{ch}" }
      "@attr 1=1003 @attr 2=3 @attr 4=1 \"#{escaped}\""
    end

    # Converts one result record into a Publication with a single Holding,
    # or returns nil when the record has no 090$a field.
    def publication_from(record)
      xml = Nokogiri::Slop(record.xml)
      xml.remove_namespaces! # namespaces would otherwise defeat the plain XPath lookups below

      holding_id = subfield_texts(xml, '090', 'a').first
      return nil if holding_id.nil?

      publication = Publication.new
      publication.author_line = subfield_texts(xml, '100', 'a').first
      publication.title = subfield_texts(xml, '245', 'a').first
      publication.notes = subfield_texts(xml, '500', 'a').join("\n")

      holding = Holding.new
      holding.source_id = holding_id
      holding.source_name = @options[:database]
      publication.add_holding(holding)
      publication
    end

    # Returns the text of every matching datafield/subfield, in document
    # order; an empty Array when the field is absent.
    def subfield_texts(xml, tag, code)
      xml.xpath("//datafield[@tag='#{tag}']/subfield[@code='#{code}']").map { |node| node.text }
    end
  end
end
@@ -0,0 +1,39 @@
1
+ require 'rest-client'
2
+
3
module Gared
  require 'json'
  require 'uri'

  # Google Books adapter: finds volumes by author that have a PDF available.
  class Googlebooks
    # api_key: Google Books API key.
    # page_size: results per request; a String or an Integer is accepted.
    def initialize(api_key, page_size = '40')
      @options = {api_key: api_key, maxResults: page_size}
    end

    # Returns an Array of Publication objects (each with one Holding) for the
    # volumes by +person+ whose PDF is marked available. Returns an empty
    # Array when nothing matches.
    def query_publications_by_person(person)
      # The page size may be configured as a String (the default is '40');
      # it must be an Integer before it is used in start-index arithmetic.
      page_size = @options[:maxResults].to_i
      url = "https://www.googleapis.com/books/v1/volumes?q=inauthor:#{URI.encode_www_form_component(person)}&filter=full&key=#{@options[:api_key]}&maxResults=#{@options[:maxResults]}"
      resp = JSON.parse(RestClient.get(url))

      total = resp['totalItems'].to_i
      recs = resp['items'] || []
      start_at = 0
      # Stop when a page comes back empty, so a reported total larger than
      # what is actually served cannot loop forever. A non-positive page size
      # could never advance, so it disables paging.
      while recs.length < total && page_size > 0
        start_at += page_size
        sleep 2 # respect the server and avoid flood-blocking
        page = JSON.parse(RestClient.get(url + "&startIndex=#{start_at}"))['items']
        break if page.nil? || page.empty?
        recs += page
      end

      ret = []
      recs.each do |r|
        pdf = (r['accessInfo'] || {})['pdf'] || {}
        next unless pdf['isAvailable']
        p = Publication.new
        p.source_id = r['id']
        p.title = (r['volumeInfo'] || {})['title']
        h = Holding.new
        h.source_id = r['id']
        h.source_name = 'Google Books'
        p.add_holding(h)
        ret << p
      end
      ret
    end
  end
end
@@ -0,0 +1,47 @@
1
module Gared
  # Scraper for hebrewbooks.org, driven through a headless Chrome session.
  class Hebrewbooks
    require 'watir'

    # Starts the headless Chrome browser used for every query.
    def initialize
      @browser = Watir::Browser.new :chrome, options: {args: ['--no-sandbox', '--headless']}
    end

    # Placeholder: not implemented, returns nil.
    def query_persons(q)
    end

    # Placeholder: not implemented, returns nil.
    def query_person(person)
    end

    # Placeholder: not implemented, returns nil.
    def query_publications(q)
    end

    # Placeholder: not implemented, returns nil.
    def query_publication(publication)
    end

    # Submits the site's author search for +person+ and returns one
    # Publication (with a single Holding) per row of the first result page.
    # TODO: support multiple result pages
    def query_publications_by_person(person)
      @browser.goto 'http://hebrewbooks.org/home.aspx'
      @browser.wait
      @browser.text_field(id: 'cpMstr_author').set(person)
      @browser.form(id: 'form1').submit # get publications by person
      @browser.wait

      rows = @browser.div(id: 'dbresults').trs
      found = []
      return found unless rows.size > 0

      rows.each do |row|
        pub = Publication.new
        pub.title = row.tds[0].text
        pub.author_line = row.tds[1].text
        pub.source_id = row.tds[0].a.href

        holding = Holding.new
        holding.source_id = row.tds[0].a.href
        holding.source_name = 'Hebrewbooks'
        pub.add_holding(holding)

        found << pub
      end
      found
    end
  end
end
@@ -0,0 +1,6 @@
1
module Gared
  # A copy of a publication held by one source.
  class Holding
    # Identifier of the item within the source.
    attr_accessor :source_id
    # Name of the source holding the item.
    attr_accessor :source_name
  end
end
data/lib/gared/idea.rb ADDED
@@ -0,0 +1,63 @@
1
module Gared
  # Scraper for an IDEA-based library OPAC, driven through headless Chrome.
  class Idea
    require 'watir'

    # opac_url: address of the OPAC search page to query.
    def initialize(opac_url)
      @browser = Watir::Browser.new :chrome, options: {args: ['--no-sandbox', '--headless']}
      @options = {opac_url: opac_url}
    end

    # Placeholder: not implemented, returns nil.
    def query_persons(q)
    end

    # Placeholder: not implemented, returns nil.
    def query_person(person)
    end

    # Placeholder: not implemented, returns nil.
    def query_publications(q)
    end

    # Placeholder: not implemented, returns nil.
    def query_publication(publication)
    end

    # Runs the OPAC's "quick search" for +person+ (not necessarily an author
    # search!) and returns an Array of Publication objects, each with one
    # Holding. Returns an empty Array when there is no results list.
    # TODO: support multiple result pages
    def query_publications_by_person(person)
      @browser.goto @options[:opac_url]
      @browser.wait
      @browser.text_field(id: 'get_var_0').set(person)
      @browser.input(id: 'cb_update_0').click
      @browser.wait

      found = []
      return found unless @browser.div(id: 'results_list').exists?

      rows = @browser.div(id: 'results_list').table.rows
      return found unless rows.size > 0

      # First pass: read everything the results page offers. The detail-page
      # URL is parked in source_id until the second pass.
      rows.each do |row|
        cell = row.tr.tds[1]
        detail_url = cell.h5.a.href
        pub = Publication.new
        pub.title = cell.ps[0].text
        pub.author_line = cell.ps[1].text.sub('מחבר: ','')
        pub.pub_year = cell.ps[2].text.sub('שנה לועזית:','').sub('שנת הוצאה:','')
        pub.source_id = detail_url
        found << pub
      end

      # Second pass: visit each detail page to pick up the system ID. This
      # happens only after the results page has been fully read, because it
      # navigates the browser away from that page.
      found.each do |pub|
        @browser.goto pub.source_id
        @browser.wait
        pub.source_id = @browser.tr(id: '1').span(:class => 'bidie').text
        # The Holdings screen is not visited: whether there is more than one
        # copy does not matter here.
        holding = Holding.new
        holding.source_id = pub.source_id
        holding.source_name = @options[:opac_url]
        pub.add_holding(holding)
      end

      found
    end
  end
end
@@ -0,0 +1,4 @@
1
module Gared
  # Placeholder class; it currently defines no behaviour.
  class Jpress; end
end
@@ -0,0 +1,5 @@
1
module Gared
  # A person record exposing three read/write attributes.
  class Person
    attr_accessor :name
    attr_accessor :aliases
    attr_accessor :source_id
  end
end
@@ -0,0 +1,59 @@
1
+ require 'rest-client'
2
+ # require 'exlibris-primo' # using this gem doesn't support searching with facet filtering...
3
+
4
module Gared
  require 'json'
  require 'uri'

  # Adapter for the Primo "brief search" X-Service.
  class Primo
    # Number of records requested per page.
    BULK_SIZE = 50

    # url: address of the brief-search service.
    # institution: Primo institution code sent with every request.
    def initialize(url, institution)
      @options = {url: url, institution: institution}
    end

    # Not implemented yet; returns nil.
    def query_persons(q)
    end

    # Not implemented yet; returns nil.
    def query_person(person)
    end

    # Not implemented yet; returns nil.
    def query_publications(q)
    end

    # Not implemented yet; returns nil.
    def query_publication(publication)
    end

    # return in-memory Publication instances with associated Holdings
    #
    # Best effort: any StandardError (network, JSON, unexpected response
    # shape) is printed and an empty Array is returned. Interrupts and exit
    # requests are no longer swallowed.
    def query_publications_by_person(person)
      ret = []
      begin
        docset = fetch_docset(person, 1)
        total = docset['@TOTALHITS'].to_i
        recs = docs_in(docset) # stash the records
        start_at = 1
        while recs.length < total
          start_at += BULK_SIZE
          page = docs_in(fetch_docset(person, start_at))
          break if page.empty? # the server has nothing more; avoid looping forever
          recs += page
          sleep 1 # respect the server and avoid flood-blocking
        end
        recs.each do |r|
          deets = r['PrimoNMBib']['record']['display']
          p = Publication.new
          p.title = deets['title']
          p.author_line = deets['creator']
          p.notes = deets['subject']
          p.publisher_line = deets['publisher']
          p.pub_year = deets['creationdate']
          p.source_id = r['PrimoNMBib']['record']['control']['recordid']
          h = Holding.new
          h.source_id = p.source_id
          h.source_name = 'Primo:'+@options[:institution]
          p.add_holding(h)
          ret << p
        end
      rescue StandardError => e
        puts e
      end
      ret
    end

    private

    # Builds the search URL for one page of book records by +person+,
    # starting at 1-based index +start_at+. The name is form-encoded so that
    # characters such as '&' cannot break the query string.
    def search_url(person, start_at)
      @options[:url] + "?institution=#{@options[:institution]}&query=creator,contains,#{URI.encode_www_form_component(person)}&indx=#{start_at}&bulkSize=#{BULK_SIZE}&query=facet_rtype,exact,books&json=true"
    end

    # Fetches one page and returns its DOCSET hash.
    def fetch_docset(person, start_at)
      JSON.parse(RestClient.get(search_url(person, start_at)))['SEGMENTS']['JAGROOT']['RESULT']['DOCSET']
    end

    # Returns the page's records as an Array. DOC is absent when there are no
    # hits; a lone hit is assumed to possibly arrive as a bare object rather
    # than a one-element array, so both shapes are normalised.
    # NOTE(review): confirm the single-hit shape against the live service.
    def docs_in(docset)
      docs = docset['DOC']
      return [] if docs.nil?
      docs.is_a?(Array) ? docs : [docs]
    end
  end
end
@@ -0,0 +1,14 @@
1
module Gared
  # In-memory bibliographic record together with the holdings found for it.
  class Publication
    attr_accessor :title, :author_line, :publisher_line, :pub_year,
                  :language, :notes, :source_id, :holdings

    # A new publication starts with an empty holdings list.
    def initialize
      @holdings = []
    end

    # Appends +holding+ to the holdings list and returns that list.
    def add_holding(holding)
      @holdings.push(holding)
    end
  end
end
@@ -0,0 +1,61 @@
1
+ require 'minitest/autorun'
2
+ require 'gared'
3
+
4
# Live integration tests: every test talks to the real remote service, and
# the browser-based ones additionally need chromedriver to be installed.
class GaredTest < Minitest::Test

  def test_primo_query_publicatios_by_person
    puts "Testing Primo"
    primo = Gared::Primo.new('http://primo.nli.org.il/PrimoWebServices/xservice/search/brief', 'NNL')
    refute_nil primo
    recs = primo.query_publications_by_person('אילנאה')
    refute_nil recs
    refute_empty(recs)
    refute_empty(recs[0].title)
  end

  def test_aleph_query_publicatios_by_person
    puts "Testing Aleph"
    aleph = Gared::Aleph.new('aleph.nli.org.il', 9991, 'NNL01')
    refute_nil aleph
    recs = aleph.query_publications_by_person('אילנאה')
    refute_nil recs
    refute_empty(recs)
    refute_empty(recs[0].title)
  end

  def test_googlebooks_query_publicatios_by_person
    puts "Testing Google Books"
    # The bundled key was made just for testing this gem. Please do not abuse.
    # Set GOOGLE_BOOKS_API_KEY to run the test with a key of your own.
    gb = Gared::Googlebooks.new(ENV.fetch('GOOGLE_BOOKS_API_KEY', 'AIzaSyCE2WFqTPdxAz1wv2f33hMfPWIF4tcocgM'))
    refute_nil gb
    recs = gb.query_publications_by_person('מנדלי')
    refute_nil recs
    refute_empty(recs)
    refute_empty(recs[0].title)
  end

  def test_hebrewbooks_query_publicatios_by_person
    skip("Skipping testing Hebrewbooks because chromedriver not found") unless chromedriver_available?
    puts "Testing Hebrewbooks"
    hb = Gared::Hebrewbooks.new
    refute_nil hb
    recs = hb.query_publications_by_person('שיין')
    refute_nil recs
    refute_empty(recs)
    refute_empty(recs[0].title)
  end

  def test_idea_query_publicatios_by_person
    skip("Skipping testing IDEA because chromedriver not found") unless chromedriver_available?
    puts "Testing IDEA"
    idea = Gared::Idea.new('http://infocenters.co.il/RAANANA/')
    refute_nil idea
    recs = idea.query_publications_by_person('גפלה, אופיר')
    refute_nil recs
    refute_empty(recs)
    refute_empty(recs[0].title)
    recs = idea.query_publications_by_person('אילנאהסןסן') # nonsense
    refute_nil recs
    assert_empty(recs)
  end

  private

  # True when a chromedriver binary can be run. Running a missing command
  # with backticks raises Errno::ENOENT instead of returning an empty string,
  # so the error must be rescued for the skip to take effect.
  def chromedriver_available?
    `chromedriver -v` =~ /ChromeDriver/ ? true : false
  rescue Errno::ENOENT
    false
  end

end
metadata ADDED
@@ -0,0 +1,125 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: gared
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.7
5
+ platform: ruby
6
+ authors:
7
+ - Asaf Bartov
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2018-06-29 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: zoom
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '0.5'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '0.5'
27
+ - !ruby/object:Gem::Dependency
28
+ name: watir
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '6'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '6'
41
+ - !ruby/object:Gem::Dependency
42
+ name: rest-client
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '2'
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '2'
55
+ - !ruby/object:Gem::Dependency
56
+ name: nokogiri
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: '1.8'
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: '1.8'
69
+ - !ruby/object:Gem::Dependency
70
+ name: minitest
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - "~>"
74
+ - !ruby/object:Gem::Version
75
+ version: '5.1'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - "~>"
81
+ - !ruby/object:Gem::Version
82
+ version: '5.1'
83
+ description: A set of scrapers for bibliographic records of Hebrew titles
84
+ email: asaf.bartov@gmail.com
85
+ executables: []
86
+ extensions: []
87
+ extra_rdoc_files: []
88
+ files:
89
+ - lib/gared.rb
90
+ - lib/gared/aleph.rb
91
+ - lib/gared/googlebooks.rb
92
+ - lib/gared/hebrewbooks.rb
93
+ - lib/gared/holding.rb
94
+ - lib/gared/idea.rb
95
+ - lib/gared/jpress.rb
96
+ - lib/gared/person.rb
97
+ - lib/gared/primo.rb
98
+ - lib/gared/publication.rb
99
+ - test/test_gared.rb
100
+ homepage: https://gitlab.com/abartov/gared
101
+ licenses:
102
+ - MIT
103
+ metadata: {}
104
+ post_install_message:
105
+ rdoc_options: []
106
+ require_paths:
107
+ - lib
108
+ required_ruby_version: !ruby/object:Gem::Requirement
109
+ requirements:
110
+ - - ">="
111
+ - !ruby/object:Gem::Version
112
+ version: '0'
113
+ required_rubygems_version: !ruby/object:Gem::Requirement
114
+ requirements:
115
+ - - ">="
116
+ - !ruby/object:Gem::Version
117
+ version: '0'
118
+ requirements: []
119
+ rubyforge_project:
120
+ rubygems_version: 2.7.7
121
+ signing_key:
122
+ specification_version: 4
123
+ summary: Scrape Hebrew bibliography sources
124
+ test_files:
125
+ - test/test_gared.rb