RubyGems - gared - Versions diffs - 0.0.7 - Mend

gared 0.0.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (13) hide show

checksums.yaml ADDED Viewed

@@ -0,0 +1,7 @@
+---
+SHA256:
+  metadata.gz: d11aa7725f44fab48661ca7739a9f6a41e1ee0c686bc3c3df02279654b427616
+  data.tar.gz: c88f9cb2fdd8884f1f954e183dc54b055f6a38f5225b36a5ca375a08606e5436
+SHA512:
+  metadata.gz: 3a24efea632ca0dc83f1e8f7ae7859926ae94073afe259504ff40873906d146dcbeb28ea7ae6614a18400051e800add1fcd1c5627149e4041d31a04f178e9c6a
+  data.tar.gz: cb7d1a344a0a432f88273054e21e98304b87e77f57b1131b0b37a85d56e66c7db4129f60e35b73181c5c4ba78875f56f58ff95c3eab31b6eefae5919fae75d3e

data/lib/gared.rb ADDED Viewed

@@ -0,0 +1,12 @@
+module Gared
+  require 'gared/publication'
+  require 'gared/holding'
+  require 'gared/person'
+  require 'gared/primo'
+  require 'gared/aleph'
+  require 'gared/hebrewbooks'
+  require 'gared/idea'
+  require 'gared/googlebooks'
+  # ...
+end

data/lib/gared/aleph.rb ADDED Viewed

@@ -0,0 +1,80 @@
+# Z39.50 values according to https://www.loc.gov/z3950/agency/bib1.html
+# and NLI info according to http://web.nli.org.il/sites/NLI/Hebrew/infochannels/librarians/Pages/Z39.50.aspx
+# Name of Database: NNL01
+# Host name: aleph.nli.org.il
+# IP address: 192.114.7.200
+# Port: 9991
+# Character-set: UTF-8
+# We support the following record syntaxes:
+# USMARC, OPAC, XML
+# We support the following word searches:
+# 1016, 1017, 1,1003, 1004,4,21,30,31,7,12,1007,1031,1007,5028,1033
+# We support the following phrase searches:
+# 7,12,1,1003,1004,4,21,15
+# We support the following sorts:
+# 1,4,30,31,1003
+module Gared
+  require 'zoom'
+  require 'nokogiri'
+  class Aleph
+    def initialize(host, port, database, syntax = 'USMARC')
+      @options = {host: host, port: port, database: database, syntax: syntax}
+    end
+    def query_persons(q)
+    end
+    def query_person(person)
+    end
+    def query_publications(q)
+    end
+    def query_publication(publication)
+    end
+    def query_publications_by_person(person)
+      ZOOM::Connection.open(@options[:host], @options[:port]) do |conn|
+        conn.database_name = @options[:database] # 'aleph.nli.org.il',9991
+        conn.preferred_record_syntax = @options[:syntax]
+        rset = conn.search("@attr 1=1003 @attr 2=3 @attr 4=1 \"#{person}\"")
+        rr = rset.records
+        return nil if rr.nil? or rr.empty?
+        ret = []
+        rr.each do |r|
+          xml = Nokogiri::Slop(r.xml)
+          xml.remove_namespaces! # keeps biting me :)
+          # these scrapes are based on the National Library of Israel usage. No attempt to make it generic. :)
+          p = Publication.new
+          begin
+            p.author_line = xml.xpath('//datafield[@tag=\'100\']/subfield[@code=\'a\']')[0].text
+            # puts "author: #{p.author_line}" # DEBUG
+          rescue
+            nil
+          end
+          begin
+            p.title = xml.xpath('//datafield[@tag=\'245\']/subfield[@code=\'a\']')[0].text
+            # puts "title: #{p.title}" # DEBUG
+          rescue
+            nil
+          end
+          begin
+            p.notes = xml.xpath('//datafield[@tag=\'500\']/subfield[@code=\'a\']').collect{|note| note.text}.join("\n")
+          rescue
+            nil
+          end
+          begin
+            h = Holding.new
+            h.source_id = xml.xpath('//datafield[@tag=\'090\']/subfield[@code=\'a\']')[0].text
+            h.source_name = @options[:database]
+            p.add_holding(h)
+            ret << p
+          rescue
+            nil # ignore records with no holdings; they may be archival files or other non-publications
+          end #
+        end
+        return ret
+      end
+    end
+  end
+end

data/lib/gared/googlebooks.rb ADDED Viewed

@@ -0,0 +1,39 @@
+require 'rest-client'
+module Gared
+  class Googlebooks
+    def initialize(api_key, page_size = '40')
+      @options = {api_key: api_key, maxResults: page_size}
+    end
+    def query_publications_by_person(person)
+      url = "https://www.googleapis.com/books/v1/volumes?q=inauthor:#{URI.escape(person)}&filter=full&key=#{@options[:api_key]}&maxResults=#{@options[:maxResults]}"
+      resp = JSON.parse(RestClient.get(url))
+      total = resp['totalItems']
+      ret = []
+      if total > 0
+        start_at = 0
+        recs = resp['items']
+        while recs.length < total
+          start_at += @options[:maxResults]
+          resp = JSON.parse(RestClient.get(url+"&startIndex=#{start_at}"))
+          recs += resp['items']
+          sleep 2 # respect the server and avoid flood-blocking
+        end
+        recs.each do |r|
+          next unless r['accessInfo']['pdf']['isAvailable']
+          p = Publication.new
+          p.source_id = r['id']
+          p.title = r['volumeInfo']['title']
+          h = Holding.new
+          h.source_id = r['id']
+          h.source_name = 'Google Books'
+          p.add_holding(h)
+          ret << p
+        end
+      end
+      return ret
+    end
+  end
+end

data/lib/gared/hebrewbooks.rb ADDED Viewed

@@ -0,0 +1,47 @@
+module Gared
+  class Hebrewbooks
+    require 'watir'
+    def initialize
+      @browser = Watir::Browser.new :chrome, options: {args: ['--no-sandbox', '--headless']}
+    end
+    def query_persons(q)
+    end
+    def query_person(person)
+    end
+    def query_publications(q)
+    end
+    def query_publication(publication)
+    end
+    def query_publications_by_person(person)
+      @browser.goto 'http://hebrewbooks.org/home.aspx'
+      @browser.wait
+      t = @browser.text_field(id: 'cpMstr_author')
+      t.set(person)
+      @browser.form(id: 'form1').submit # get publications by person
+      @browser.wait
+      trs = @browser.div(id: 'dbresults').trs
+      ret = []
+      if trs.size > 0
+        trs.each do |tr|
+          p = Publication.new
+          p.title = tr.tds[0].text
+          p.author_line = tr.tds[1].text
+          p.source_id = tr.tds[0].a.href
+          h = Holding.new
+          h.source_id = tr.tds[0].a.href
+          h.source_name = 'Hebrewbooks'
+          p.add_holding(h)
+          ret << p
+        end
+      end
+      return ret
+      # TODO: support multiple result pages
+    end
+  end
+end

data/lib/gared/holding.rb ADDED Viewed

@@ -0,0 +1,6 @@
+module Gared
+  class Holding
+    attr_accessor :source_id, :source_name
+  end
+end

data/lib/gared/idea.rb ADDED Viewed

@@ -0,0 +1,63 @@
+module Gared
+  class Idea
+    require 'watir'
+    def initialize(opac_url)
+      @browser = Watir::Browser.new :chrome, options: {args: ['--no-sandbox', '--headless']}
+      @options = {opac_url: opac_url}
+    end
+    def query_persons(q)
+    end
+    def query_person(person)
+    end
+    def query_publications(q)
+    end
+    def query_publication(publication)
+    end
+    def query_publications_by_person(person)
+      @browser.goto @options[:opac_url]
+      @browser.wait
+      t = @browser.text_field(id: 'get_var_0')
+      t.set(person)
+      @browser.input(id: 'cb_update_0').click # "quick search" - not necessarily by author!
+      @browser.wait
+      ret = []
+      results = @browser.div(id: 'results_list')
+      if results.exists?
+        trs = @browser.div(id: 'results_list').table.rows
+        if trs.size > 0
+          trs.each do |tr|
+            item = tr.tr.tds[1]
+            urlpart = item.h5.a.href
+            p = Publication.new
+            p.title = item.ps[0].text
+            p.author_line = item.ps[1].text.sub('מחבר: ','')
+            p.pub_year = item.ps[2].text.sub('שנה לועזית:','').sub('שנת הוצאה:','')
+            p.source_id = urlpart
+            ret << p
+          end
+          # now that we've extracted everything useful from this page, iterate over the results to pick up the system ID
+          ret.each do |item|
+            @browser.goto item.source_id
+            @browser.wait
+            item.source_id = @browser.tr(id: '1').span(:class => 'bidie').text
+            # doesn't look like there's much more to learn from the Holdings screen, since we don't care if there's more than one copy or not
+            # @browser.goto @browser.ul(id: 'itemTabs').li(id: '2').a.href # check holdings
+            h = Holding.new
+            h.source_id = item.source_id
+            h.source_name = @options[:opac_url]
+            item.add_holding(h)
+          end
+        end
+      end
+      return ret
+      # TODO: support multiple result pages
+    end
+  end
+end

data/lib/gared/jpress.rb ADDED Viewed

@@ -0,0 +1,4 @@
+module Gared
+  class Jpress
+  end
+end

data/lib/gared/person.rb ADDED Viewed

@@ -0,0 +1,5 @@
+module Gared
+  class Person
+    attr_accessor :name, :aliases, :source_id
+  end
+end

data/lib/gared/primo.rb ADDED Viewed

@@ -0,0 +1,59 @@
+require 'rest-client'
+# require 'exlibris-primo' # using this gem doesn't support searching with facet filtering...
+module Gared
+  class Primo
+    def initialize(url, institution)
+      @options = {url: url, institution: institution}
+    end
+    def query_persons(q)
+    end
+    def query_person(person)
+    end
+    def query_publications(q)
+    end
+    def query_publication(publication)
+    end
+    # return in-memory Publication instances with associated Holdings
+    def query_publications_by_person(person)
+      ret = []
+      begin
+        url = @options[:url]+"?institution=#{@options[:institution]}&query=creator,contains,#{URI.escape(person)}&indx=1&bulkSize=50&query=facet_rtype,exact,books}&json=true"
+        json = JSON.parse(RestClient.get(url))
+        total = json['SEGMENTS']['JAGROOT']['RESULT']['DOCSET']['@TOTALHITS'].to_i
+        start_at = 1
+        recs = json['SEGMENTS']['JAGROOT']['RESULT']['DOCSET']['DOC'] # stash the records
+        while recs.length < total
+          start_at += 50
+          url = @options[:url]+"?institution=#{@options[:institution]}&query=creator,contains,#{URI.escape(person)}&indx=#{start_at}&bulkSize=50&query=facet_rtype,exact,books}&json=true"
+          json = JSON.parse(RestClient.get(url))
+          recs += json['SEGMENTS']['JAGROOT']['RESULT']['DOCSET']['DOC']
+          sleep 1 # respect the server and avoid flood-blocking
+        end
+        recs.each do |r|
+          deets = r['PrimoNMBib']['record']['display']
+          p = Publication.new
+          p.title = deets['title']
+          p.author_line = deets['creator']
+          p.notes = deets['subject']
+          p.publisher_line = deets['publisher']
+          p.pub_year = deets['creationdate']
+          p.source_id = r['PrimoNMBib']['record']['control']['recordid']
+          h = Holding.new
+          h.source_id = p.source_id
+          h.source_name = 'Primo:'+@options[:institution]
+          p.add_holding(h)
+          ret << p
+        end
+      rescue Exception
+        puts $!
+      end
+      return ret
+    end
+  end
+end

data/lib/gared/publication.rb ADDED Viewed

@@ -0,0 +1,14 @@
+module Gared
+  class Publication
+    attr_accessor :title, :publisher_line, :author_line, :notes, :source_id, :holdings, :language, :pub_year
+    def initialize
+      @holdings = []
+    end
+    def add_holding(holding)
+      @holdings << holding
+    end
+  end
+end

data/test/test_gared.rb ADDED Viewed

@@ -0,0 +1,61 @@
+require 'minitest/autorun'
+require 'gared'
+class GaredTest < Minitest::Test
+  def test_primo_query_publicatios_by_person
+    puts "Testing Primo"
+    primo = Gared::Primo.new('http://primo.nli.org.il/PrimoWebServices/xservice/search/brief', 'NNL')
+    refute_nil primo
+    recs = primo.query_publications_by_person('אילנאה')
+    refute_nil recs
+    refute_empty(recs)
+    refute_empty(recs[0].title)
+  end
+  def test_aleph_query_publicatios_by_person
+    puts "Testing Aleph"
+    aleph = Gared::Aleph.new('aleph.nli.org.il', 9991, 'NNL01')
+    refute_nil aleph
+    recs = aleph.query_publications_by_person('אילנאה')
+    refute_nil recs
+    refute_empty(recs)
+    refute_empty(recs[0].title)
+  end
+  def test_googlebooks_query_publicatios_by_person
+    puts "Testing Google Books"
+    gb = Gared::Googlebooks.new('AIzaSyCE2WFqTPdxAz1wv2f33hMfPWIF4tcocgM') # a key I made just for testing this gem. Please do not abuse.
+    refute_nil gb
+    recs = gb.query_publications_by_person('מנדלי')
+    refute_nil recs
+    refute_empty(recs)
+    refute_empty(recs[0].title)
+  end
+  def test_hebrewbooks_query_publicatios_by_person
+    skip("Skipping testing Hebrewbooks because chromedriver not found") unless `chromedriver -v` =~ /ChromeDriver/
+    puts "Testing Hebrewbooks"
+    hb = Gared::Hebrewbooks.new
+    refute_nil hb
+    recs = hb.query_publications_by_person('שיין')
+    refute_nil recs
+    refute_empty(recs)
+    refute_empty(recs[0].title)
+  end
+  def test_idea_query_publicatios_by_person
+    skip("Skipping testing IDEA because chromedriver not found") unless `chromedriver -v` =~ /ChromeDriver/
+    puts "Testing IDEA"
+    idea = Gared::Idea.new('http://infocenters.co.il/RAANANA/')
+    refute_nil idea
+    recs = idea.query_publications_by_person('גפלה, אופיר')
+    refute_nil recs
+    refute_empty(recs)
+    refute_empty(recs[0].title)
+    recs = idea.query_publications_by_person('אילנאהסןסן') # nonsense
+    refute_nil recs
+    assert_empty(recs)
+  end
+end

metadata ADDED Viewed

@@ -0,0 +1,125 @@
+--- !ruby/object:Gem::Specification
+name: gared
+version: !ruby/object:Gem::Version
+  version: 0.0.7
+platform: ruby
+authors:
+- Asaf Bartov
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2018-06-29 00:00:00.000000000 Z
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: zoom
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '0.5'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '0.5'
+- !ruby/object:Gem::Dependency
+  name: watir
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '6'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '6'
+- !ruby/object:Gem::Dependency
+  name: rest-client
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '2'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '2'
+- !ruby/object:Gem::Dependency
+  name: nokogiri
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.8'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.8'
+- !ruby/object:Gem::Dependency
+  name: minitest
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '5.1'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '5.1'
+description: A set of scrapers for bibliographic records of Hebrew titles
+email: asaf.bartov@gmail.com
+executables: []
+extensions: []
+extra_rdoc_files: []
+files:
+- lib/gared.rb
+- lib/gared/aleph.rb
+- lib/gared/googlebooks.rb
+- lib/gared/hebrewbooks.rb
+- lib/gared/holding.rb
+- lib/gared/idea.rb
+- lib/gared/jpress.rb
+- lib/gared/person.rb
+- lib/gared/primo.rb
+- lib/gared/publication.rb
+- test/test_gared.rb
+homepage: https://gitlab.com/abartov/gared
+licenses:
+- MIT
+metadata: {}
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: '0'
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: '0'
+requirements: []
+rubyforge_project:
+rubygems_version: 2.7.7
+signing_key:
+specification_version: 4
+summary: Scrape Hebrew bibliography sources
+test_files:
+- test/test_gared.rb