RubyGems - gared - Versions diffs - 0.0.7 - Mend

gared 0.0.7

Files changed (13) hide show

checksums.yaml ADDED Viewed

@@ -0,0 +1,7 @@
+---
+SHA256:
+  metadata.gz: d11aa7725f44fab48661ca7739a9f6a41e1ee0c686bc3c3df02279654b427616
+  data.tar.gz: c88f9cb2fdd8884f1f954e183dc54b055f6a38f5225b36a5ca375a08606e5436
+SHA512:
+  metadata.gz: 3a24efea632ca0dc83f1e8f7ae7859926ae94073afe259504ff40873906d146dcbeb28ea7ae6614a18400051e800add1fcd1c5627149e4041d31a04f178e9c6a
+  data.tar.gz: cb7d1a344a0a432f88273054e21e98304b87e77f57b1131b0b37a85d56e66c7db4129f60e35b73181c5c4ba78875f56f58ff95c3eab31b6eefae5919fae75d3e

data/lib/gared.rb ADDED Viewed

@@ -0,0 +1,12 @@
+# Entry point of the gem: opens the Gared namespace and loads its parts.
+# The requires sit inside the module body, but `require` is global, so this
+# is purely a layout choice.
+module Gared
+  # Plain data objects shared by all sources.
+  require 'gared/publication'
+  require 'gared/holding'
+  require 'gared/person'
+  # One class per bibliographic source; each exposes query_publications_by_person.
+  require 'gared/primo'
+  require 'gared/aleph'
+  require 'gared/hebrewbooks'
+  require 'gared/idea'
+  require 'gared/googlebooks'
+  # ...
+end

data/lib/gared/aleph.rb ADDED Viewed

@@ -0,0 +1,80 @@
+# Z39.50 values according to https://www.loc.gov/z3950/agency/bib1.html
+# and NLI info according to http://web.nli.org.il/sites/NLI/Hebrew/infochannels/librarians/Pages/Z39.50.aspx
+# Name of Database: NNL01
+# Host name: aleph.nli.org.il
+# IP address: 192.114.7.200
+# Port: 9991
+# Character-set: UTF-8
+# We support the following record syntaxes:
+# USMARC, OPAC, XML
+# We support the following word searches:
+# 1016, 1017, 1,1003, 1004,4,21,30,31,7,12,1007,1031,1007,5028,1033
+# We support the following phrase searches:
+# 7,12,1,1003,1004,4,21,15
+# We support the following sorts:
+# 1,4,30,31,1003
+module Gared
+  require 'zoom'
+  require 'nokogiri'
+  class Aleph
+    def initialize(host, port, database, syntax = 'USMARC')
+      @options = {host: host, port: port, database: database, syntax: syntax}
+    end
+    def query_persons(q)
+    end
+    def query_person(person)
+    end
+    def query_publications(q)
+    end
+    def query_publication(publication)
+    end
+    def query_publications_by_person(person)
+      ZOOM::Connection.open(@options[:host], @options[:port]) do |conn|
+        conn.database_name = @options[:database] # 'aleph.nli.org.il',9991
+        conn.preferred_record_syntax = @options[:syntax]
+        rset = conn.search("@attr 1=1003 @attr 2=3 @attr 4=1 \"#{person}\"")
+        rr = rset.records
+        return nil if rr.nil? or rr.empty?
+        ret = []
+        rr.each do |r|
+          xml = Nokogiri::Slop(r.xml)
+          xml.remove_namespaces! # keeps biting me :)
+          # these scrapes are based on the National Library of Israel usage. No attempt to make it generic. :)
+          p = Publication.new
+          begin
+            p.author_line = xml.xpath('//datafield[@tag=\'100\']/subfield[@code=\'a\']')[0].text
+            # puts "author: #{p.author_line}" # DEBUG
+          rescue
+            nil
+          end
+          begin
+            p.title = xml.xpath('//datafield[@tag=\'245\']/subfield[@code=\'a\']')[0].text
+            # puts "title: #{p.title}" # DEBUG
+          rescue
+            nil
+          end
+          begin
+            p.notes = xml.xpath('//datafield[@tag=\'500\']/subfield[@code=\'a\']').collect{|note| note.text}.join("\n")
+          rescue
+            nil
+          end
+          begin
+            h = Holding.new
+            h.source_id = xml.xpath('//datafield[@tag=\'090\']/subfield[@code=\'a\']')[0].text
+            h.source_name = @options[:database]
+            p.add_holding(h)
+            ret << p
+          rescue
+            nil # ignore records with no holdings; they may be archival files or other non-publications
+          end #
+        end
+        return ret
+      end
+    end
+  end
+end

data/lib/gared/googlebooks.rb ADDED Viewed

@@ -0,0 +1,39 @@
+require 'rest-client'
+module Gared
+  class Googlebooks
+    def initialize(api_key, page_size = '40')
+      @options = {api_key: api_key, maxResults: page_size}
+    end
+    def query_publications_by_person(person)
+      url = "https://www.googleapis.com/books/v1/volumes?q=inauthor:#{URI.escape(person)}&filter=full&key=#{@options[:api_key]}&maxResults=#{@options[:maxResults]}"
+      resp = JSON.parse(RestClient.get(url))
+      total = resp['totalItems']
+      ret = []
+      if total > 0
+        start_at = 0
+        recs = resp['items']
+        while recs.length < total
+          start_at += @options[:maxResults]
+          resp = JSON.parse(RestClient.get(url+"&startIndex=#{start_at}"))
+          recs += resp['items']
+          sleep 2 # respect the server and avoid flood-blocking
+        end
+        recs.each do |r|
+          next unless r['accessInfo']['pdf']['isAvailable']
+          p = Publication.new
+          p.source_id = r['id']
+          p.title = r['volumeInfo']['title']
+          h = Holding.new
+          h.source_id = r['id']
+          h.source_name = 'Google Books'
+          p.add_holding(h)
+          ret << p
+        end
+      end
+      return ret
+    end
+  end
+end

data/lib/gared/hebrewbooks.rb ADDED Viewed

@@ -0,0 +1,47 @@
+module Gared
+  class Hebrewbooks
+    require 'watir'
+    def initialize
+      @browser = Watir::Browser.new :chrome, options: {args: ['--no-sandbox', '--headless']}
+    end
+    def query_persons(q)
+    end
+    def query_person(person)
+    end
+    def query_publications(q)
+    end
+    def query_publication(publication)
+    end
+    def query_publications_by_person(person)
+      @browser.goto 'http://hebrewbooks.org/home.aspx'
+      @browser.wait
+      t = @browser.text_field(id: 'cpMstr_author')
+      t.set(person)
+      @browser.form(id: 'form1').submit # get publications by person
+      @browser.wait
+      trs = @browser.div(id: 'dbresults').trs
+      ret = []
+      if trs.size > 0
+        trs.each do |tr|
+          p = Publication.new
+          p.title = tr.tds[0].text
+          p.author_line = tr.tds[1].text
+          p.source_id = tr.tds[0].a.href
+          h = Holding.new
+          h.source_id = tr.tds[0].a.href
+          h.source_name = 'Hebrewbooks'
+          p.add_holding(h)
+          ret << p
+        end
+      end
+      return ret
+      # TODO: support multiple result pages
+    end
+  end
+end

data/lib/gared/holding.rb ADDED Viewed

@@ -0,0 +1,6 @@
+module Gared
+  class Holding
+    attr_accessor :source_id, :source_name
+  end
+end

data/lib/gared/idea.rb ADDED Viewed

@@ -0,0 +1,63 @@
+module Gared
+  class Idea
+    require 'watir'
+    def initialize(opac_url)
+      @browser = Watir::Browser.new :chrome, options: {args: ['--no-sandbox', '--headless']}
+      @options = {opac_url: opac_url}
+    end
+    def query_persons(q)
+    end
+    def query_person(person)
+    end
+    def query_publications(q)
+    end
+    def query_publication(publication)
+    end
+    def query_publications_by_person(person)
+      @browser.goto @options[:opac_url]
+      @browser.wait
+      t = @browser.text_field(id: 'get_var_0')
+      t.set(person)
+      @browser.input(id: 'cb_update_0').click # "quick search" - not necessarily by author!
+      @browser.wait
+      ret = []
+      results = @browser.div(id: 'results_list')
+      if results.exists?
+        trs = @browser.div(id: 'results_list').table.rows
+        if trs.size > 0
+          trs.each do |tr|
+            item = tr.tr.tds[1]
+            urlpart = item.h5.a.href
+            p = Publication.new
+            p.title = item.ps[0].text
+            p.author_line = item.ps[1].text.sub('מחבר: ','')
+            p.pub_year = item.ps[2].text.sub('שנה לועזית:','').sub('שנת הוצאה:','')
+            p.source_id = urlpart
+            ret << p
+          end
+          # now that we've extracted everything useful from this page, iterate over the results to pick up the system ID
+          ret.each do |item|
+            @browser.goto item.source_id
+            @browser.wait
+            item.source_id = @browser.tr(id: '1').span(:class => 'bidie').text
+            # doesn't look like there's much more to learn from the Holdings screen, since we don't care if there's more than one copy or not
+            # @browser.goto @browser.ul(id: 'itemTabs').li(id: '2').a.href # check holdings
+            h = Holding.new
+            h.source_id = item.source_id
+            h.source_name = @options[:opac_url]
+            item.add_holding(h)
+          end
+        end
+      end
+      return ret
+      # TODO: support multiple result pages
+    end
+  end
+end

data/lib/gared/jpress.rb ADDED Viewed

@@ -0,0 +1,4 @@
+module Gared
+  # Empty placeholder: no methods are defined yet.
+  # NOTE(review): presumably reserved for a future JPress scraper -- confirm.
+  class Jpress
+  end
+end

data/lib/gared/person.rb ADDED Viewed

@@ -0,0 +1,5 @@
+module Gared
+  class Person
+    attr_accessor :name, :aliases, :source_id
+  end
+end

data/lib/gared/primo.rb ADDED Viewed

@@ -0,0 +1,59 @@
+require 'rest-client'
+# require 'exlibris-primo' # using this gem doesn't support searching with facet filtering...
+module Gared
+  class Primo
+    def initialize(url, institution)
+      @options = {url: url, institution: institution}
+    end
+    def query_persons(q)
+    end
+    def query_person(person)
+    end
+    def query_publications(q)
+    end
+    def query_publication(publication)
+    end
+    # return in-memory Publication instances with associated Holdings
+    def query_publications_by_person(person)
+      ret = []
+      begin
+        url = @options[:url]+"?institution=#{@options[:institution]}&query=creator,contains,#{URI.escape(person)}&indx=1&bulkSize=50&query=facet_rtype,exact,books}&json=true"
+        json = JSON.parse(RestClient.get(url))
+        total = json['SEGMENTS']['JAGROOT']['RESULT']['DOCSET']['@TOTALHITS'].to_i
+        start_at = 1
+        recs = json['SEGMENTS']['JAGROOT']['RESULT']['DOCSET']['DOC'] # stash the records
+        while recs.length < total
+          start_at += 50
+          url = @options[:url]+"?institution=#{@options[:institution]}&query=creator,contains,#{URI.escape(person)}&indx=#{start_at}&bulkSize=50&query=facet_rtype,exact,books}&json=true"
+          json = JSON.parse(RestClient.get(url))
+          recs += json['SEGMENTS']['JAGROOT']['RESULT']['DOCSET']['DOC']
+          sleep 1 # respect the server and avoid flood-blocking
+        end
+        recs.each do |r|
+          deets = r['PrimoNMBib']['record']['display']
+          p = Publication.new
+          p.title = deets['title']
+          p.author_line = deets['creator']
+          p.notes = deets['subject']
+          p.publisher_line = deets['publisher']
+          p.pub_year = deets['creationdate']
+          p.source_id = r['PrimoNMBib']['record']['control']['recordid']
+          h = Holding.new
+          h.source_id = p.source_id
+          h.source_name = 'Primo:'+@options[:institution]
+          p.add_holding(h)
+          ret << p
+        end
+      rescue Exception
+        puts $!
+      end
+      return ret
+    end
+  end
+end

data/lib/gared/publication.rb ADDED Viewed

@@ -0,0 +1,14 @@
+module Gared
+  class Publication
+    attr_accessor :title, :publisher_line, :author_line, :notes, :source_id, :holdings, :language, :pub_year
+    def initialize
+      @holdings = []
+    end
+    def add_holding(holding)
+      @holdings << holding
+    end
+  end
+end

data/test/test_gared.rb ADDED Viewed

@@ -0,0 +1,61 @@
+require 'minitest/autorun'
+require 'gared'
+class GaredTest < Minitest::Test
+  def test_primo_query_publicatios_by_person
+    puts "Testing Primo"
+    primo = Gared::Primo.new('http://primo.nli.org.il/PrimoWebServices/xservice/search/brief', 'NNL')
+    refute_nil primo
+    recs = primo.query_publications_by_person('אילנאה')
+    refute_nil recs
+    refute_empty(recs)
+    refute_empty(recs[0].title)
+  end
+  def test_aleph_query_publicatios_by_person
+    puts "Testing Aleph"
+    aleph = Gared::Aleph.new('aleph.nli.org.il', 9991, 'NNL01')
+    refute_nil aleph
+    recs = aleph.query_publications_by_person('אילנאה')
+    refute_nil recs
+    refute_empty(recs)
+    refute_empty(recs[0].title)
+  end
+  def test_googlebooks_query_publicatios_by_person
+    puts "Testing Google Books"
+    gb = Gared::Googlebooks.new('AIzaSyCE2WFqTPdxAz1wv2f33hMfPWIF4tcocgM') # a key I made just for testing this gem. Please do not abuse.
+    refute_nil gb
+    recs = gb.query_publications_by_person('מנדלי')
+    refute_nil recs
+    refute_empty(recs)
+    refute_empty(recs[0].title)
+  end
+  def test_hebrewbooks_query_publicatios_by_person
+    skip("Skipping testing Hebrewbooks because chromedriver not found") unless `chromedriver -v` =~ /ChromeDriver/
+    puts "Testing Hebrewbooks"
+    hb = Gared::Hebrewbooks.new
+    refute_nil hb
+    recs = hb.query_publications_by_person('שיין')
+    refute_nil recs
+    refute_empty(recs)
+    refute_empty(recs[0].title)
+  end
+  def test_idea_query_publicatios_by_person
+    skip("Skipping testing IDEA because chromedriver not found") unless `chromedriver -v` =~ /ChromeDriver/
+    puts "Testing IDEA"
+    idea = Gared::Idea.new('http://infocenters.co.il/RAANANA/')
+    refute_nil idea
+    recs = idea.query_publications_by_person('גפלה, אופיר')
+    refute_nil recs
+    refute_empty(recs)
+    refute_empty(recs[0].title)
+    recs = idea.query_publications_by_person('אילנאהסןסן') # nonsense
+    refute_nil recs
+    assert_empty(recs)
+  end
+end

metadata ADDED Viewed

@@ -0,0 +1,125 @@
+--- !ruby/object:Gem::Specification
+name: gared
+version: !ruby/object:Gem::Version
+  version: 0.0.7
+platform: ruby
+authors:
+- Asaf Bartov
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2018-06-29 00:00:00.000000000 Z
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: zoom
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '0.5'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '0.5'
+- !ruby/object:Gem::Dependency
+  name: watir
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '6'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '6'
+- !ruby/object:Gem::Dependency
+  name: rest-client
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '2'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '2'
+- !ruby/object:Gem::Dependency
+  name: nokogiri
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.8'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.8'
+- !ruby/object:Gem::Dependency
+  name: minitest
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '5.1'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '5.1'
+description: A set of scrapers for bibliographic records of Hebrew titles
+email: asaf.bartov@gmail.com
+executables: []
+extensions: []
+extra_rdoc_files: []
+files:
+- lib/gared.rb
+- lib/gared/aleph.rb
+- lib/gared/googlebooks.rb
+- lib/gared/hebrewbooks.rb
+- lib/gared/holding.rb
+- lib/gared/idea.rb
+- lib/gared/jpress.rb
+- lib/gared/person.rb
+- lib/gared/primo.rb
+- lib/gared/publication.rb
+- test/test_gared.rb
+homepage: https://gitlab.com/abartov/gared
+licenses:
+- MIT
+metadata: {}
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: '0'
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: '0'
+requirements: []
+rubyforge_project:
+rubygems_version: 2.7.7
+signing_key:
+specification_version: 4
+summary: Scrape Hebrew bibliography sources
+test_files:
+- test/test_gared.rb