gared 0.0.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: d11aa7725f44fab48661ca7739a9f6a41e1ee0c686bc3c3df02279654b427616
4
+ data.tar.gz: c88f9cb2fdd8884f1f954e183dc54b055f6a38f5225b36a5ca375a08606e5436
5
+ SHA512:
6
+ metadata.gz: 3a24efea632ca0dc83f1e8f7ae7859926ae94073afe259504ff40873906d146dcbeb28ea7ae6614a18400051e800add1fcd1c5627149e4041d31a04f178e9c6a
7
+ data.tar.gz: cb7d1a344a0a432f88273054e21e98304b87e77f57b1131b0b37a85d56e66c7db4129f60e35b73181c5c4ba78875f56f58ff95c3eab31b6eefae5919fae75d3e
data/lib/gared.rb ADDED
@@ -0,0 +1,12 @@
1
# Entry point of the gem: loads the value objects first, then every
# catalogue scraper, in the order listed below.
module Gared
  %w[
    publication
    holding
    person
    primo
    aleph
    hebrewbooks
    idea
    googlebooks
  ].each do |component|
    require "gared/#{component}"
  end
  # ...
end
@@ -0,0 +1,80 @@
1
+ # Z39.50 values according to https://www.loc.gov/z3950/agency/bib1.html
2
+ # and NLI info according to http://web.nli.org.il/sites/NLI/Hebrew/infochannels/librarians/Pages/Z39.50.aspx
3
+ # Name of Database: NNL01
4
+ # Host name: aleph.nli.org.il
5
+ # IP address: 192.114.7.200
6
+ # Port: 9991
7
+ # Character-set: UTF-8
8
+ # We support the following record syntaxes:
9
+ # USMARC, OPAC, XML
10
+ # We support the following word searches:
11
+ # 1016, 1017, 1,1003, 1004,4,21,30,31,7,12,1007,1031,1007,5028,1033
12
+ # We support the following phrase searches:
13
+ # 7,12,1,1003,1004,4,21,15
14
+ # We support the following sorts:
15
+ # 1,4,30,31,1003
16
+
17
module Gared
  require 'zoom'
  require 'nokogiri'

  # Z39.50 client for an Aleph catalogue. The MARC scraping is tailored to the
  # National Library of Israel's usage; no attempt is made to be generic.
  class Aleph
    # Bib-1 search attributes sent with every author query:
    # use=1003 (author), relation=3 (equal), structure=1 (phrase).
    AUTHOR_PHRASE_ATTRS = '@attr 1=1003 @attr 2=3 @attr 4=1'.freeze

    # host, port - address of the Z39.50 server (e.g. 'aleph.nli.org.il', 9991)
    # database   - database name on that server; also used as the holding's source name
    # syntax     - preferred record syntax requested from the server
    def initialize(host, port, database, syntax = 'USMARC')
      @options = {host: host, port: port, database: database, syntax: syntax}
    end

    # Not implemented yet; always returns nil.
    def query_persons(q)
    end

    # Not implemented yet; always returns nil.
    def query_person(person)
    end

    # Not implemented yet; always returns nil.
    def query_publications(q)
    end

    # Not implemented yet; always returns nil.
    def query_publication(publication)
    end

    # Searches the catalogue for records whose author phrase equals +person+.
    #
    # Returns an Array of Publication objects, each carrying one Holding, or
    # nil when the search yields no records at all. Records without a 090$a
    # field are left out of the result.
    def query_publications_by_person(person)
      ZOOM::Connection.open(@options[:host], @options[:port]) do |conn|
        conn.database_name = @options[:database]
        conn.preferred_record_syntax = @options[:syntax]
        records = conn.search("#{AUTHOR_PHRASE_ATTRS} \"#{person}\"").records
        return nil if records.nil? || records.empty?

        ret = []
        records.each do |record|
          publication = publication_from(record)
          ret << publication unless publication.nil?
        end
        return ret
      end
    end

    private

    # Builds a Publication from one result record, or returns nil when the
    # record has no 090$a value.
    def publication_from(record)
      xml = Nokogiri::Slop(record.xml)
      xml.remove_namespaces! # the XPath expressions below are namespace-free

      # ignore records with no holdings; they may be archival files or other non-publications
      holding_id = first_subfield_a(xml, '090')
      return nil if holding_id.nil?

      publication = Publication.new
      publication.author_line = first_subfield_a(xml, '100')
      publication.title = first_subfield_a(xml, '245')
      publication.notes = subfield_a_nodes(xml, '500').map { |note| note.text }.join("\n")

      holding = Holding.new
      holding.source_id = holding_id
      holding.source_name = @options[:database]
      publication.add_holding(holding)
      publication
    end

    # All subfield "a" nodes of the datafields carrying the given MARC tag.
    def subfield_a_nodes(xml, tag)
      xml.xpath("//datafield[@tag='#{tag}']/subfield[@code='a']")
    end

    # Text of the first subfield "a" under the given MARC tag, or nil when absent.
    def first_subfield_a(xml, tag)
      node = subfield_a_nodes(xml, tag).first
      node.nil? ? nil : node.text
    end
  end
end
@@ -0,0 +1,39 @@
1
+ require 'rest-client'
2
+
3
module Gared
  require 'json'
  require 'uri'

  # Google Books volumes search, restricted to fully viewable volumes that are
  # available as PDF.
  class Googlebooks
    API_URL = 'https://www.googleapis.com/books/v1/volumes'.freeze
    PAGE_DELAY = 2 # seconds between page requests; respect the server and avoid flood-blocking

    # api_key   - Google Books API key
    # page_size - results per request; an Integer or a numeric String.
    #             It is coerced to an Integer because it is also used to
    #             advance the start index between pages.
    def initialize(api_key, page_size = '40')
      @options = {api_key: api_key, maxResults: Integer(page_size)}
    end

    # Searches for volumes whose author matches +person+.
    #
    # Returns an Array of Publication objects (possibly empty), each carrying
    # one Holding named 'Google Books'. Volumes without an available PDF are
    # skipped.
    def query_publications_by_person(person)
      ret = []
      fetch_all_volumes(person).each do |rec|
        next unless pdf_available?(rec)

        publication = Publication.new
        publication.source_id = rec['id']
        publication.title = (rec['volumeInfo'] || {})['title']
        holding = Holding.new
        holding.source_id = rec['id']
        holding.source_name = 'Google Books'
        publication.add_holding(holding)
        ret << publication
      end
      ret
    end

    private

    # Search URL for the first page; follow-up pages append a startIndex.
    def search_url(person)
      "#{API_URL}?q=inauthor:#{URI.encode_www_form_component(person)}" \
        "&filter=full&key=#{@options[:api_key]}&maxResults=#{@options[:maxResults]}"
    end

    # Fetches one URL and parses the JSON body.
    def fetch_page(url)
      JSON.parse(RestClient.get(url))
    end

    # Collects the raw volume records of every result page.
    def fetch_all_volumes(person)
      url = search_url(person)
      first_page = fetch_page(url)
      total = first_page['totalItems'].to_i
      recs = first_page['items'] || []
      start_at = 0
      while recs.length < total
        start_at += @options[:maxResults]
        sleep PAGE_DELAY
        page = fetch_page("#{url}&startIndex=#{start_at}")['items']
        # An empty page means the reported total was too high; stop rather than loop forever.
        break if page.nil? || page.empty?

        recs += page
      end
      recs
    end

    # True when the record reports an available PDF; missing keys count as unavailable.
    def pdf_available?(rec)
      pdf = (rec['accessInfo'] || {})['pdf'] || {}
      pdf['isAvailable'] ? true : false
    end
  end
end
@@ -0,0 +1,47 @@
1
module Gared
  # Scraper for hebrewbooks.org, driven through a headless Chrome browser.
  class Hebrewbooks
    require 'watir'

    # Starts the headless Chrome instance used for all subsequent queries.
    def initialize
      chrome_args = ['--no-sandbox', '--headless']
      @browser = Watir::Browser.new :chrome, options: {args: chrome_args}
    end

    # Not implemented yet; always returns nil.
    def query_persons(q)
    end

    # Not implemented yet; always returns nil.
    def query_person(person)
    end

    # Not implemented yet; always returns nil.
    def query_publications(q)
    end

    # Not implemented yet; always returns nil.
    def query_publication(publication)
    end

    # Submits the site's author search for +person+ and returns an Array of
    # Publication objects built from the first page of results, each carrying
    # one Holding named 'Hebrewbooks'.
    # TODO: support multiple result pages
    def query_publications_by_person(person)
      @browser.goto 'http://hebrewbooks.org/home.aspx'
      @browser.wait
      author_field = @browser.text_field(id: 'cpMstr_author')
      author_field.set(person)
      @browser.form(id: 'form1').submit # get publications by person
      @browser.wait

      rows = @browser.div(id: 'dbresults').trs
      found = []
      return found unless rows.size > 0

      rows.each do |row|
        title_cell = row.tds[0]
        publication = Publication.new
        publication.title = title_cell.text
        publication.author_line = row.tds[1].text
        publication.source_id = title_cell.a.href

        holding = Holding.new
        holding.source_id = title_cell.a.href
        holding.source_name = 'Hebrewbooks'
        publication.add_holding(holding)
        found << publication
      end
      found
    end
  end
end
@@ -0,0 +1,6 @@
1
module Gared
  # A record of where a publication was found: the identifier used by the
  # source and the name of that source.
  class Holding
    # Identifier of the item within the source.
    attr_accessor :source_id
    # Name of the source that holds the item.
    attr_accessor :source_name
  end
end
data/lib/gared/idea.rb ADDED
@@ -0,0 +1,63 @@
1
module Gared
  # Scraper for IDEA OPAC web catalogues, driven through a headless Chrome browser.
  class Idea
    require 'watir'

    # opac_url - address of the catalogue's search page; also used as the
    #            source name of every Holding produced.
    def initialize(opac_url)
      chrome_args = ['--no-sandbox', '--headless']
      @browser = Watir::Browser.new :chrome, options: {args: chrome_args}
      @options = {opac_url: opac_url}
    end

    # Not implemented yet; always returns nil.
    def query_persons(q)
    end

    # Not implemented yet; always returns nil.
    def query_person(person)
    end

    # Not implemented yet; always returns nil.
    def query_publications(q)
    end

    # Not implemented yet; always returns nil.
    def query_publication(publication)
    end

    # Runs the catalogue's quick search for +person+ and returns an Array of
    # Publication objects from the first result page (empty when the page
    # shows no result list). Each publication gets one Holding.
    # TODO: support multiple result pages
    def query_publications_by_person(person)
      @browser.goto @options[:opac_url]
      @browser.wait
      search_box = @browser.text_field(id: 'get_var_0')
      search_box.set(person)
      @browser.input(id: 'cb_update_0').click # "quick search" - not necessarily by author!
      @browser.wait

      found = []
      return found unless @browser.div(id: 'results_list').exists?

      rows = @browser.div(id: 'results_list').table.rows
      return found unless rows.size > 0

      # First pass: read everything the result list offers. The item link is
      # parked in source_id until the second pass replaces it.
      rows.each do |row|
        cell = row.tr.tds[1]
        link = cell.h5.a.href
        publication = Publication.new
        publication.title = cell.ps[0].text
        publication.author_line = cell.ps[1].text.sub('מחבר: ','')
        publication.pub_year = cell.ps[2].text.sub('שנה לועזית:','').sub('שנת הוצאה:','')
        publication.source_id = link
        found << publication
      end

      # Second pass: this navigates away from the result list, so it must run
      # only after every row has been read. Visit each item page to pick up
      # the system ID.
      found.each do |publication|
        @browser.goto publication.source_id
        @browser.wait
        publication.source_id = @browser.tr(id: '1').span(:class => 'bidie').text
        # The holdings screen is not visited: whether more than one copy
        # exists is of no interest here.
        holding = Holding.new
        holding.source_id = publication.source_id
        holding.source_name = @options[:opac_url]
        publication.add_holding(holding)
      end
      found
    end
  end
end
@@ -0,0 +1,4 @@
1
module Gared
  # Placeholder for a JPress source; it defines no behaviour yet.
  class Jpress; end
end
@@ -0,0 +1,5 @@
1
module Gared
  # A person as known to a bibliographic source.
  class Person
    # Name of the person.
    attr_accessor :name
    # Alternative names of the person.
    attr_accessor :aliases
    # Identifier of the person within the source.
    attr_accessor :source_id
  end
end
@@ -0,0 +1,59 @@
1
+ require 'rest-client'
2
+ # require 'exlibris-primo' # using this gem doesn't support searching with facet filtering...
3
+
4
module Gared
  require 'json'
  require 'uri'

  # Client for the Primo brief-search X-service, restricted to book records.
  class Primo
    PAGE_SIZE = 50 # records requested per call (bulkSize)
    PAGE_DELAY = 1 # seconds between page requests; respect the server and avoid flood-blocking

    # url         - address of the brief-search X-service
    # institution - Primo institution code; also used in each Holding's source name
    def initialize(url, institution)
      @options = {url: url, institution: institution}
    end

    # Not implemented yet; always returns nil.
    def query_persons(q)
    end

    # Not implemented yet; always returns nil.
    def query_person(person)
    end

    # Not implemented yet; always returns nil.
    def query_publications(q)
    end

    # Not implemented yet; always returns nil.
    def query_publication(publication)
    end

    # Returns in-memory Publication instances with associated Holdings for
    # every book whose creator contains +person+.
    #
    # Best effort: any StandardError raised while fetching or parsing is
    # printed to standard output, and whatever was collected up to that point
    # (possibly nothing) is returned.
    def query_publications_by_person(person)
      ret = []
      begin
        fetch_all_docs(person).each { |doc| ret << publication_from(doc) }
      rescue StandardError => e
        puts e
      end
      ret
    end

    private

    # Search URL for the page of results starting at the 1-based index +start_at+.
    def search_url(person, start_at)
      "#{@options[:url]}?institution=#{@options[:institution]}" \
        "&query=creator,contains,#{URI.encode_www_form_component(person)}" \
        "&indx=#{start_at}&bulkSize=#{PAGE_SIZE}&query=facet_rtype,exact,books&json=true"
    end

    # Fetches one page and returns its DOCSET node.
    def fetch_docset(person, start_at)
      json = JSON.parse(RestClient.get(search_url(person, start_at)))
      json['SEGMENTS']['JAGROOT']['RESULT']['DOCSET']
    end

    # Records of a DOCSET as an Array.
    # NOTE(review): a lone result presumably arrives as a single Hash rather
    # than a one-element Array - confirm against the service.
    def docs_in(docset)
      docs = docset['DOC']
      docs.is_a?(Hash) ? [docs] : Array(docs)
    end

    # Collects the raw records of every result page.
    def fetch_all_docs(person)
      start_at = 1
      docset = fetch_docset(person, start_at)
      total = docset['@TOTALHITS'].to_i
      recs = docs_in(docset)
      while recs.length < total
        start_at += PAGE_SIZE
        sleep PAGE_DELAY
        page = docs_in(fetch_docset(person, start_at))
        # An empty page means no more records are coming; stop rather than loop forever.
        break if page.empty?

        recs += page
      end
      recs
    end

    # Maps one raw record onto a Publication carrying a single Holding.
    def publication_from(doc)
      record = doc['PrimoNMBib']['record']
      deets = record['display']
      publication = Publication.new
      publication.title = deets['title']
      publication.author_line = deets['creator']
      publication.notes = deets['subject']
      publication.publisher_line = deets['publisher']
      publication.pub_year = deets['creationdate']
      publication.source_id = record['control']['recordid']

      holding = Holding.new
      holding.source_id = publication.source_id
      holding.source_name = "Primo:#{@options[:institution]}"
      publication.add_holding(holding)
      publication
    end
  end
end
@@ -0,0 +1,14 @@
1
module Gared
  # A bibliographic record together with the holdings it was found in.
  class Publication
    # Descriptive fields; all start out as nil.
    attr_accessor :title, :publisher_line, :author_line, :notes
    attr_accessor :source_id, :language, :pub_year
    # Array of the holdings attached so far.
    attr_accessor :holdings

    # Creates a publication with no holdings.
    def initialize
      @holdings = []
    end

    # Appends +holding+ to the holdings list and returns that list.
    def add_holding(holding)
      @holdings.push(holding)
    end
  end
end
@@ -0,0 +1,61 @@
1
+ require 'minitest/autorun'
2
+ require 'gared'
3
+
4
# Live integration tests: every test talks to the real remote service, so
# network access is required. The browser-driven tests are skipped when
# chromedriver is not installed.
class GaredTest < Minitest::Test

  def test_primo_query_publications_by_person
    puts "Testing Primo"
    primo = Gared::Primo.new('http://primo.nli.org.il/PrimoWebServices/xservice/search/brief', 'NNL')
    refute_nil primo
    recs = primo.query_publications_by_person('אילנאה')
    refute_nil recs
    refute_empty(recs)
    refute_empty(recs[0].title)
  end

  def test_aleph_query_publications_by_person
    puts "Testing Aleph"
    aleph = Gared::Aleph.new('aleph.nli.org.il', 9991, 'NNL01')
    refute_nil aleph
    recs = aleph.query_publications_by_person('אילנאה')
    refute_nil recs
    refute_empty(recs)
    refute_empty(recs[0].title)
  end

  def test_googlebooks_query_publications_by_person
    puts "Testing Google Books"
    # NOTE(review): an API key is committed here (made by the author just for
    # testing this gem; please do not abuse). Set GOOGLE_BOOKS_API_KEY to use
    # your own key instead.
    gb = Gared::Googlebooks.new(ENV.fetch('GOOGLE_BOOKS_API_KEY', 'AIzaSyCE2WFqTPdxAz1wv2f33hMfPWIF4tcocgM'))
    refute_nil gb
    recs = gb.query_publications_by_person('מנדלי')
    refute_nil recs
    refute_empty(recs)
    refute_empty(recs[0].title)
  end

  def test_hebrewbooks_query_publications_by_person
    skip("Skipping testing Hebrewbooks because chromedriver not found") unless chromedriver_available?
    puts "Testing Hebrewbooks"
    hb = Gared::Hebrewbooks.new
    refute_nil hb
    recs = hb.query_publications_by_person('שיין')
    refute_nil recs
    refute_empty(recs)
    refute_empty(recs[0].title)
  end

  def test_idea_query_publications_by_person
    skip("Skipping testing IDEA because chromedriver not found") unless chromedriver_available?
    puts "Testing IDEA"
    idea = Gared::Idea.new('http://infocenters.co.il/RAANANA/')
    refute_nil idea
    recs = idea.query_publications_by_person('גפלה, אופיר')
    refute_nil recs
    refute_empty(recs)
    refute_empty(recs[0].title)
    recs = idea.query_publications_by_person('אילנאהסןסן') # nonsense
    refute_nil recs
    assert_empty(recs)
  end

  private

  # True when `chromedriver -v` runs and identifies itself. Running a missing
  # executable through backticks raises instead of returning empty output, so
  # the error is turned into false to let the tests skip instead of erroring.
  def chromedriver_available?
    (`chromedriver -v` =~ /ChromeDriver/) ? true : false
  rescue SystemCallError
    false
  end

end
metadata ADDED
@@ -0,0 +1,125 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: gared
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.7
5
+ platform: ruby
6
+ authors:
7
+ - Asaf Bartov
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2018-06-29 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: zoom
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '0.5'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '0.5'
27
+ - !ruby/object:Gem::Dependency
28
+ name: watir
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '6'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '6'
41
+ - !ruby/object:Gem::Dependency
42
+ name: rest-client
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '2'
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '2'
55
+ - !ruby/object:Gem::Dependency
56
+ name: nokogiri
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: '1.8'
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: '1.8'
69
+ - !ruby/object:Gem::Dependency
70
+ name: minitest
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - "~>"
74
+ - !ruby/object:Gem::Version
75
+ version: '5.1'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - "~>"
81
+ - !ruby/object:Gem::Version
82
+ version: '5.1'
83
+ description: A set of scrapers for bibliographic records of Hebrew titles
84
+ email: asaf.bartov@gmail.com
85
+ executables: []
86
+ extensions: []
87
+ extra_rdoc_files: []
88
+ files:
89
+ - lib/gared.rb
90
+ - lib/gared/aleph.rb
91
+ - lib/gared/googlebooks.rb
92
+ - lib/gared/hebrewbooks.rb
93
+ - lib/gared/holding.rb
94
+ - lib/gared/idea.rb
95
+ - lib/gared/jpress.rb
96
+ - lib/gared/person.rb
97
+ - lib/gared/primo.rb
98
+ - lib/gared/publication.rb
99
+ - test/test_gared.rb
100
+ homepage: https://gitlab.com/abartov/gared
101
+ licenses:
102
+ - MIT
103
+ metadata: {}
104
+ post_install_message:
105
+ rdoc_options: []
106
+ require_paths:
107
+ - lib
108
+ required_ruby_version: !ruby/object:Gem::Requirement
109
+ requirements:
110
+ - - ">="
111
+ - !ruby/object:Gem::Version
112
+ version: '0'
113
+ required_rubygems_version: !ruby/object:Gem::Requirement
114
+ requirements:
115
+ - - ">="
116
+ - !ruby/object:Gem::Version
117
+ version: '0'
118
+ requirements: []
119
+ rubyforge_project:
120
+ rubygems_version: 2.7.7
121
+ signing_key:
122
+ specification_version: 4
123
+ summary: Scrape Hebrew bibliography sources
124
+ test_files:
125
+ - test/test_gared.rb