gared 0.0.21 → 0.0.26
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/gared/googlebooks.rb +6 -2
- data/lib/gared/jpress.rb +0 -31
- data/lib/gared/nli_api.rb +93 -0
- data/lib/gared/primo.rb +9 -4
- data/lib/gared.rb +1 -1
- data/test/test_gared.rb +38 -16
- metadata +4 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 3857f28b69b7cd0d1080208215cbf1bfc57a30a3af5df8162a98aee5b7558939
|
4
|
+
data.tar.gz: a3cf8fbe5018b902db45f6d70c8223f32101c6eb747b070a7f7dc56a8a37c48c
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 69f3b46c6fa96f5bf2c2f440fbc9bf93bbe57ec8ac61e84a9621289e67b919342558a323a9b14008d58f7472e48651ee43bd584879b011d78741a6212b7e81b1
|
7
|
+
data.tar.gz: f822f3a7f95e6c4ca38f78843a4309b764a624d053234bc880b672ec57cf011b88b3089f5a40836f328a8e1a366cb6660500609449a11eed16d5b0c7a86ac64e
|
data/lib/gared/googlebooks.rb
CHANGED
@@ -5,9 +5,13 @@ module Gared
|
|
5
5
|
def initialize(api_key, page_size = '40')
|
6
6
|
@options = {api_key: api_key, maxResults: page_size}
|
7
7
|
end
|
8
|
-
|
8
|
+
def uri_escape(s)
|
9
|
+
p = URI::Parser.new
|
10
|
+
return p.escape(s)
|
11
|
+
end
|
12
|
+
|
9
13
|
def query_publications_by_person(person, ctx = nil)
|
10
|
-
url = "https://www.googleapis.com/books/v1/volumes?q=inauthor:#{
|
14
|
+
url = "https://www.googleapis.com/books/v1/volumes?q=inauthor:#{uri_escape(person)}&filter=full&key=#{@options[:api_key]}&maxResults=#{@options[:maxResults]}"
|
11
15
|
resp = JSON.parse(RestClient.get(url))
|
12
16
|
|
13
17
|
total = resp['totalItems']
|
data/lib/gared/jpress.rb
CHANGED
@@ -1,35 +1,4 @@
|
|
1
1
|
module Gared
|
2
2
|
class Jpress
|
3
|
-
require 'watir'
|
4
|
-
|
5
|
-
def initialize
|
6
|
-
@browser = Watir::Browser.new :chrome, options: {args: ['--no-sandbox', '--headless']}
|
7
|
-
end
|
8
|
-
|
9
|
-
def query_persons(q)
|
10
|
-
end
|
11
|
-
|
12
|
-
def query_person(person)
|
13
|
-
end
|
14
|
-
|
15
|
-
def query_publications(q)
|
16
|
-
end
|
17
|
-
|
18
|
-
def query_publication(publication)
|
19
|
-
end
|
20
|
-
|
21
|
-
# return in-memory Publication instances with associated Holdings
|
22
|
-
def query_publications_by_person(person, ctx = nil)
|
23
|
-
@browser.goto 'http://web.nli.org.il/sites/JPress/Hebrew/Pages/default.aspx'
|
24
|
-
@browser.wait
|
25
|
-
|
26
|
-
t = @browser.text_field(id: 'ctl00_PlaceHolderHeader_PlaceHolderSearchArea_ctl00_ctl00_ctl00_SD01C0892_InputKeywords') # srsly, Micro$oft
|
27
|
-
t.set(person)
|
28
|
-
@browser.a(id: 'ctl00_PlaceHolderHeader_PlaceHolderSearchArea_ctl00_ctl00_ctl00_SD01C0892_go').click # "quick search" - not necessarily by author!
|
29
|
-
@browser.wait
|
30
|
-
ret = []
|
31
|
-
begin
|
32
|
-
|
33
|
-
end
|
34
3
|
end
|
35
4
|
end
|
@@ -0,0 +1,93 @@
|
|
1
|
+
require 'rest-client'
|
2
|
+
|
3
|
+
module Gared
  # Client for a JSON search endpoint of the NLI API (the test suite points it at
  # https://api.nli.org.il/openlibrary/search).
  #
  # It looks up Hebrew-language books by creator and converts each returned record
  # into an in-memory Publication carrying a single Holding.
  class Nli_Api
    # Prefix under which short Dublin Core field names are looked up in a record.
    DC_PREFIX = 'http://purl.org/dc/elements/1.1/'.freeze

    # Matches any value that contains an http(s) URL.
    # (The previous pattern, /https?:[^\s]\/\//, demanded an extra character between
    # the colon and the slashes and therefore never matched a normal URL.)
    URL_PATTERN = %r{https?://\S}.freeze

    # @param url [String] base URL of the search endpoint
    # @param api_key [String] API key appended to every request
    def initialize(url, api_key)
      @options = { url: url, api_key: api_key }
    end

    # Percent-escapes +s+ so it can be embedded in a request URL.
    #
    # @param s [String]
    # @return [String]
    def uri_escape(s)
      URI::Parser.new.escape(s)
    end

    # Not implemented for this source.
    def query_persons(q)
    end

    # Not implemented for this source.
    def query_person(person)
    end

    # Not implemented for this source.
    def query_publications(q)
    end

    # Not implemented for this source.
    def query_publication(publication)
    end

    # Reads one field of a record as a display string.
    #
    # Keys starting with '@' (e.g. '@id') are looked up verbatim; any other key is
    # treated as a short Dublin Core name and looked up under DC_PREFIX.
    #
    # @param record [Hash, nil] one record as parsed from the API response
    # @param key [String] '@'-key or short Dublin Core field name
    # @return [String] the value; multiple values joined with '; '; '' when the
    #   record is nil, the key is absent, or the value is neither String nor Array
    def fetch_value_by_dc_key(record, key)
      return '' if record.nil?

      full_key = key[0] == '@' ? key : DC_PREFIX + key
      stringify_value(record[full_key])
    end

    # return in-memory Publication instances with associated Holdings
    #
    # Failures are reported on stdout and never raised: a record that cannot be
    # converted is skipped, and a failure of the query itself ends the run with
    # whatever has been collected so far.
    #
    # @param person [String] creator name to search for (escaped internally)
    # @param ctx [Object, nil] passed through unchanged to Publication.new
    # @return [Array<Publication>]
    def query_publications_by_person(person, ctx = nil)
      ret = []
      begin
        escaped_person = uri_escape(person)
        # first run obtain counts for the query
        total = count_matches(escaped_person)
        # then start loading the results
        fetch_all_records(escaped_person, total).each do |r|
          begin
            ret << build_publication(r, ctx)
          rescue StandardError => e # StandardError only, so interrupts/exit still propagate
            puts e
          end
        end
        # TODO: also collect IIIF links for the *subset* of titles that have them, using the availability_type param. No way to get that in the above query -- the fields are not emitted.
        # the URL is like https://api.nli.org.il/openlibrary/search?api_key=(((KEY)))&query=title,contains,querystring&availability_type=online_and_api_access&material_type=books
      rescue StandardError => e
        puts e
      end
      ret
    end

    private

    # Turns a raw record value into a string: Strings as-is, Arrays as their
    # '@value' members (or the bare elements, when they are not hashes) joined
    # with '; ', anything else as ''.
    def stringify_value(raw)
      case raw
      when String then raw
      when Array then raw.map { |item| item.is_a?(Hash) ? item['@value'] : item }.join('; ')
      else ''
      end
    end

    # Builds the search URL for Hebrew books by the (already escaped) creator;
    # +extra+ is the trailing query parameter selecting count mode or a page.
    def search_url(escaped_person, extra)
      "#{@options[:url]}?api_key=#{@options[:api_key]}&query=creator,contains,#{escaped_person},AND;language,exact,heb&sort_field=title&material_type=book&#{extra}"
    end

    # Performs a GET request and parses the response body as JSON.
    def get_json(url)
      # NOTE(review): certificate verification is switched off although the API key
      # travels in the URL -- confirm whether the endpoint still needs VERIFY_NONE.
      JSON.parse(RestClient::Resource.new(url, verify_ssl: OpenSSL::SSL::VERIFY_NONE).get)
    end

    # Asks the server how many records match; 0 when the count is missing.
    def count_matches(escaped_person)
      get_json(search_url(escaped_person, 'count_mode=true')).dig('countInfos', 'total').to_i
    end

    # Loads result pages until +total+ records have been collected.
    def fetch_all_records(escaped_person, total)
      records = []
      page = 1
      while records.length < total
        url = search_url(escaped_person, "result_page=#{page}")
        puts "DBG: retrieving results page #{page}"
        batch = get_json(url)
        # An empty or non-array page means no more data will come; without this
        # guard the loop would request pages forever.
        break if !batch.is_a?(Array) || batch.empty?

        records += batch
        page += 1
        # sleep 1 # respect the server and avoid flood-blocking
      end
      records
    end

    # Converts one API record into a Publication with a single Holding.
    def build_publication(r, ctx)
      pub = Publication.new(ctx)
      pub.title = fetch_value_by_dc_key(r, 'title')
      pub.author_line = fetch_value_by_dc_key(r, 'creator')
      pub.language = fetch_value_by_dc_key(r, 'language')
      pub.notes = "#{fetch_value_by_dc_key(r, 'format')}\n#{fetch_value_by_dc_key(r, 'subject')}"
      pub.publisher_line = fetch_value_by_dc_key(r, 'publisher')
      pub.pub_year = fetch_value_by_dc_key(r, 'non_standard_date')
      pub.source_id = fetch_value_by_dc_key(r, '@id')
      # collect additional URLS from record, for clients to be able to determine whether a scanned object exists
      # Values are read by their literal record key: sending these keys through
      # fetch_value_by_dc_key would prefix them with DC_PREFIX a second time.
      urls = r.keys.map { |key| stringify_value(r[key]) }.select { |val| val =~ URL_PATTERN }
      pub.additional_urls = urls unless urls.empty?
      holding = Holding.new
      holding.source_id = pub.source_id
      holding.source_name = 'NLI API'
      holding.location = fetch_value_by_dc_key(r, 'recordid')
      pub.add_holding(holding)
      pub
    end
  end
end
|
data/lib/gared/primo.rb
CHANGED
@@ -6,7 +6,11 @@ module Gared
|
|
6
6
|
def initialize(url, institution)
|
7
7
|
@options = {url: url, institution: institution}
|
8
8
|
end
|
9
|
-
|
9
|
+
def uri_escape(s)
|
10
|
+
p = URI::Parser.new
|
11
|
+
return p.escape(s)
|
12
|
+
end
|
13
|
+
|
10
14
|
def query_persons(q)
|
11
15
|
end
|
12
16
|
|
@@ -23,14 +27,14 @@ module Gared
|
|
23
27
|
def query_publications_by_person(person, ctx = nil)
|
24
28
|
ret = []
|
25
29
|
begin
|
26
|
-
url = @options[:url]+"?institution=#{@options[:institution]}&query=creator,contains,#{
|
30
|
+
url = @options[:url]+"?institution=#{@options[:institution]}&query=creator,contains,#{uri_escape(person)}&indx=1&bulkSize=50&query=facet_rtype,exact,books&json=true"
|
27
31
|
json = JSON.parse(RestClient.get(url))
|
28
32
|
total = json['SEGMENTS']['JAGROOT']['RESULT']['DOCSET']['@TOTALHITS'].to_i
|
29
33
|
start_at = 1
|
30
34
|
recs = json['SEGMENTS']['JAGROOT']['RESULT']['DOCSET']['DOC'] # stash the records
|
31
35
|
while recs.length < total
|
32
36
|
start_at += 50
|
33
|
-
url = @options[:url]+"?institution=#{@options[:institution]}&query=creator,contains,#{
|
37
|
+
url = @options[:url]+"?institution=#{@options[:institution]}&query=creator,contains,#{uri_escape(person)}&indx=#{start_at}&bulkSize=50&query=facet_rtype,exact,books&json=true"
|
34
38
|
json = JSON.parse(RestClient.get(url))
|
35
39
|
recs += json['SEGMENTS']['JAGROOT']['RESULT']['DOCSET']['DOC']
|
36
40
|
sleep 1 # respect the server and avoid flood-blocking
|
@@ -55,7 +59,8 @@ module Gared
|
|
55
59
|
h = Holding.new
|
56
60
|
h.source_id = p.source_id
|
57
61
|
h.source_name = 'Primo:'+@options[:institution]
|
58
|
-
|
62
|
+
|
63
|
+
h.location = r['LIBRARIES']['LIBRARY'][0].nil? ? r['LIBRARIES']['LIBRARY']['callNumber'] : r['LIBRARIES']['LIBRARY'][0]['callNumber'] # there seem to be two cases, different between NLI and TAU, for example
|
59
64
|
p.add_holding(h)
|
60
65
|
ret << p
|
61
66
|
rescue Exception
|
data/lib/gared.rb
CHANGED
data/test/test_gared.rb
CHANGED
@@ -1,31 +1,53 @@
|
|
1
|
+
require 'minitest/byebug' if ENV['DEBUG']
|
1
2
|
require 'minitest/autorun'
|
2
3
|
require 'gared'
|
3
4
|
|
4
5
|
class GaredTest < Minitest::Test
|
5
6
|
|
6
|
-
def
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
7
|
+
def test_nli_api_query_publicatios_by_person
|
8
|
+
if ENV['NLI_API_KEY'].nil?
|
9
|
+
puts "skipping NLI API test because NLI_API_KEY envvar is not set"
|
10
|
+
return
|
11
|
+
end
|
12
|
+
puts "Testing NLI API"
|
13
|
+
nli = Gared::Nli_Api.new('https://api.nli.org.il/openlibrary/search', ENV['NLI_API_KEY'])
|
14
|
+
refute_nil nli
|
15
|
+
#recs = nli.query_publications_by_person('ביאליק')
|
16
|
+
recs = nli.query_publications_by_person('אילנאה')
|
11
17
|
refute_nil recs
|
12
18
|
refute_empty(recs)
|
13
19
|
refute_empty(recs[0].title)
|
14
20
|
end
|
15
21
|
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
refute_empty(recs
|
24
|
-
|
22
|
+
# temporarily disabled until we find another Primo server to test against
|
23
|
+
# def test_primo_query_publicatios_by_person
|
24
|
+
# puts "Testing Primo"
|
25
|
+
# primo = Gared::Primo.new('http://primo.nli.org.il/PrimoWebServices/xservice/search/brief', 'NNL')
|
26
|
+
# refute_nil primo
|
27
|
+
# recs = primo.query_publications_by_person('אילנאה')
|
28
|
+
# refute_nil recs
|
29
|
+
# refute_empty(recs)
|
30
|
+
# refute_empty(recs[0].title)
|
31
|
+
# end
|
32
|
+
|
33
|
+
# temporarily disabled until we find another Aleph server to test against
|
34
|
+
# def test_aleph_query_publicatios_by_person
|
35
|
+
# puts "Testing Aleph"
|
36
|
+
# aleph = Gared::Aleph.new('aleph.nli.org.il', 9991, 'NNL01')
|
37
|
+
# refute_nil aleph
|
38
|
+
# recs = aleph.query_publications_by_person('אילנאה')
|
39
|
+
# refute_nil recs
|
40
|
+
# refute_empty(recs)
|
41
|
+
# refute_empty(recs[0].title)
|
42
|
+
# end
|
25
43
|
|
26
44
|
def test_googlebooks_query_publicatios_by_person
|
45
|
+
if ENV['GOOGLE_API_KEY'].nil?
|
46
|
+
puts "skipping Google Books API test because GOOGLE_API_KEY envvar is not set"
|
47
|
+
return
|
48
|
+
end
|
27
49
|
puts "Testing Google Books"
|
28
|
-
gb = Gared::Googlebooks.new('
|
50
|
+
gb = Gared::Googlebooks.new(ENV['GOOGLE_API_KEY'])
|
29
51
|
refute_nil gb
|
30
52
|
recs = gb.query_publications_by_person('מנדלי')
|
31
53
|
refute_nil recs
|
@@ -58,4 +80,4 @@ class GaredTest < Minitest::Test
|
|
58
80
|
assert_empty(recs)
|
59
81
|
end
|
60
82
|
|
61
|
-
end
|
83
|
+
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: gared
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.21
|
4
|
+
version: 0.0.26
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Asaf Bartov
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2022-03-17 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: zoom
|
@@ -93,6 +93,7 @@ files:
|
|
93
93
|
- lib/gared/holding.rb
|
94
94
|
- lib/gared/idea.rb
|
95
95
|
- lib/gared/jpress.rb
|
96
|
+
- lib/gared/nli_api.rb
|
96
97
|
- lib/gared/person.rb
|
97
98
|
- lib/gared/primo.rb
|
98
99
|
- lib/gared/publication.rb
|
@@ -116,8 +117,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
116
117
|
- !ruby/object:Gem::Version
|
117
118
|
version: '0'
|
118
119
|
requirements: []
|
119
|
-
|
120
|
-
rubygems_version: 2.7.7
|
120
|
+
rubygems_version: 3.1.4
|
121
121
|
signing_key:
|
122
122
|
specification_version: 4
|
123
123
|
summary: Scrape Hebrew bibliography sources
|