RubyGems - bookshark - Versions diffs - 1.0.1 → 1.0.3 - Mend

bookshark 1.0.1 → 1.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (10)

checksums.yaml +4 -4
data/bookshark.gemspec +2 -0
data/lib/bookshark.rb +113 -93
data/lib/bookshark/extractors/bibliographical_book_extractor.rb +10 -3
data/lib/bookshark/extractors/nlg/base.rb +110 -0
data/lib/bookshark/extractors/nlg/book_extractor.rb +28 -0
data/lib/bookshark/version.rb +1 -1
data/spec/test_data/eager_book_184923.json +3 -3
data/spec/test_data/search_01.json +4 -4
metadata +32 -2

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: d35fee946c6b6dcf4ca740d89ba3a9cb89f36a94
-  data.tar.gz: ff928cdadd16b132adc9f193ff5c7f565a0b0398
+  metadata.gz: '03852f73c9246676ff20b75b0a893998e967f2c7'
+  data.tar.gz: 7fe938710c2e9344563395e5db7cf3e7eaf76b25
 SHA512:
-  metadata.gz: 2dac9ad4842172d896a488fa60baaf13d862c8d2e3b1e68b3a9f5e6ee28bec5136d4ed7d084c4bf6fa0f5f1cfd3224241958467c7df99929ea8cfc1c5ad92abc
-  data.tar.gz: e8f9dcb4f20e0a2330a91588c6dfbb306a74820ee9ed7fa7564097013c16244de46ffc3a636a751126f19ce9c1a32b209fb254d7533d901d6f77a00a6b8b5100
+  metadata.gz: 3ccd4cfd0e82aa6918304e82df18f382f2b7e33470c9f52ff362b4897aaf53b703a04bd2689871925293d7e084d3e0008c74aa289fb36a106be7dca20c01ee0f
+  data.tar.gz: fb4590ee8c1a24402f48b3502ddb77e24fbbde2c988a18fb62fe867762950f351d3271512773602bd9cec47d7e51a6cb94de1e2d695cf7c0d8704a39bc72f905

data/bookshark.gemspec CHANGED

@@ -24,9 +24,11 @@ Gem::Specification.new do |spec|
   spec.add_dependency "sanitize", "~> 4.0"
   spec.add_dependency "json", "~> 1.8"
   spec.add_dependency "htmlentities", "~> 4.3"
+  spec.add_dependency "marc", "~> 1.0"
   spec.add_development_dependency "bundler", ">= 1.6"
   spec.add_development_dependency "rake", "~> 10.0"
   spec.add_development_dependency 'rspec', "~> 3.2"
   spec.add_development_dependency "webmock", "~> 1.2"
+  spec.add_development_dependency "pry-byebug", "~> 3.4"
 end

data/lib/bookshark.rb CHANGED

@@ -7,6 +7,7 @@ require 'bookshark/extractors/book_extractor'
 require 'bookshark/extractors/bibliographical_book_extractor'
 require 'bookshark/extractors/publisher_extractor'
 require 'bookshark/extractors/search'
+require 'bookshark/extractors/nlg/book_extractor'
 require 'bookshark/crawlers/base'
 require 'bookshark/crawlers/publisher_crawler'
@@ -22,8 +23,8 @@ module Bookshark
   def self.root
     # File.dirname __dir__ # Works only on ruby > 2.0.0
     File.expand_path(File.join(File.dirname(__FILE__), '../'))
-  end
+  end
   def self.path_to_storage
     File.join root, 'lib/bookshark/storage'
   end
@@ -31,7 +32,7 @@ module Bookshark
   class Extractor
     include FileManager
-    attr_accessor :site, :format
+    attr_accessor :site, :format
     def initialize(options = {})
       options = DEFAULTS.merge(options)
@@ -44,9 +45,9 @@ module Bookshark
       options[:format] ||= @format
       author_extractor = Biblionet::Extractors::AuthorExtractor.new
-      author = author_extractor.load_and_extract_author(uri)
-      response = {}
+      author = author_extractor.load_and_extract_author(uri)
+      response = {}
       response[:author] = !author.nil? ? [author] : []
       response = change_format(response, options[:format])
       return response
@@ -58,90 +59,109 @@ module Bookshark
       publisher_extractor = Biblionet::Extractors::PublisherExtractor.new
       publisher = publisher_extractor.load_and_extract_publisher(uri)
-      response = {}
+      response = {}
       response[:publisher] = !publisher.nil? ? [publisher] : []
       response = change_format(response, options[:format])
       response = publisher_extractor.decode_text(response)
       return response
-      # return uri
-    end
+      # return uri
+    end
     def book(options = {})
-      book_extractor = Biblionet::Extractors::BookExtractor.new
-      if book_extractor.present?(options[:isbn])
-        search_engine = Biblionet::Extractors::Search.new
-        options[:id]  = search_engine.search_by_isbn(options[:isbn])
-      end
+      options[:site] ||= @site
-      uri = process_options(options, __method__)
-      options[:format]  ||= @format
-      options[:eager]   ||= false
-      options[:nilify]  ||= false
-      if options[:eager]
-        book = eager_extract_book(uri)
-      else
-        book = book_extractor.load_and_extract_book(uri)
-      end
+      if options[:site] == 'biblionet'
+        book_extractor = Biblionet::Extractors::BookExtractor.new
-      response = {}
-      response[:book] = !book.nil? ? [book] : []
+        if book_extractor.present?(options[:isbn])
+          search_engine = Biblionet::Extractors::Search.new
+          options[:id]  = search_engine.search_by_isbn(options[:isbn])
+        end
-      return nil if response[:book].empty? and options[:nilify]
-      response = change_format(response, options[:format])
-      response = book_extractor.decode_text(response) if response.class == "String"
-      return response
+        uri = process_options(options, __method__)
+        options[:format]  ||= @format
+        options[:eager]   ||= false
+        options[:nilify]  ||= false
+        if options[:eager]
+          book = eager_extract_book(uri)
+        else
+          book = book_extractor.load_and_extract_book(uri)
+        end
+        response = {}
+        response[:book] = !book.nil? ? [book] : []
+        return nil if response[:book].empty? and options[:nilify]
+        response = change_format(response, options[:format])
+        response = book_extractor.decode_text(response) if response.class == "String"
+        return response
+      elsif options[:site] == 'nlg'
+        book_extractor = Nlg::Extractors::BookExtractor.new
+        options[:format] ||= @format
+        # if !options[:uri].nil?
+        #   uri = "#{options[:uri]}/Export?style=MARCXML"
+        # elsif !options[:id].nil?
+        #   uri = "http://nbib.nlg.gr/Record/#{options[:id]}/Export?style=MARCXML"
+        # end
+        book = book_extractor.load_and_extract_book(options[:id])
+        response = {}
+        response[:book] = !book.nil? ? [book] : []
+      end
     end
     # def bibliographical_book(options = {})
     #   bibliographical_book_extractor = Biblionet::Extractors::BibliographicalBookExtractor.new
     #   uri = "http://www.biblionet.gr/main.asp?page=results&Titlesid=#{options[:id]}"
     #   options[:format]  ||= @format
     #   book = bibliographical_book_extractor.load_and_extract_book(uri)
-    #   response = {}
+    #   response = {}
     #   response[:book] = !book.nil? ? [book] : []
     #   response = change_format(response, options[:format])
-    #   response = bibliographical_book_extractor.decode_text(response)
-    # end
+    #   response = bibliographical_book_extractor.decode_text(response)
+    # end
     # puts Bookshark::Extractor.new(format: 'pretty_json').bibliographical_book(id: 103788)
     def category(options = {})
       uri = process_options(options, __method__)
-      options[:format] ||= @format
+      options[:format] ||= @format
       category_extractor = Biblionet::Extractors::CategoryExtractor.new
       category = category_extractor.extract_categories_from(uri)
-      response = {}
+      response = {}
       response[:category] = !category.nil? ? [category] : []
       response = change_format(response, options[:format])
-      return response
+      return response
     end
     def search(options = {})
       options[:format]        ||= @format
-      options[:results_type]  ||= 'metadata'
+      options[:results_type]  ||= 'metadata'
       search_engine  = Biblionet::Extractors::Search.new
       search_results = search_engine.perform_search(options)
-      response = {}
+      response = {}
       response[:book] = search_results
       response = change_format(response, options[:format])
-      return response
+      return response
     end
     # def books_from_storage
@@ -165,22 +185,22 @@ module Bookshark
         record = book(id: book_id, local: true, format: format, nilify: true)
         dir_to_save = Bookshark.path_to_storage + '/' + 'json_book_records/' + "#{((book_id-1)/1000)}/" + "book_#{book_id}.json"
         save_to(dir_to_save, record) unless record.nil?
       end
     end
-    def extract_from_storage_and_save(metadata_type, source_dir, target_dir)
+    def extract_from_storage_and_save(metadata_type, source_dir, target_dir)
       list_directories(path: Bookshark.path_to_storage + '/' + source_dir).each do |dir|
-        dir_to_save = dir.gsub(source_dir, target_dir)
+        dir_to_save = dir.gsub(source_dir, target_dir)
         list_files(path: dir, extension: 'html', all:true).each do |file|
-          puts "Extracting from file: " + file.to_s
+          puts "Extracting from file: " + file.to_s
           # Extract publisher metadata form local file.
-          options = {uri: file, format: 'pretty_json', local: true}
+          options = {uri: file, format: 'pretty_json', local: true}
           case metadata_type
           when 'author'
             record = author(options)
@@ -189,16 +209,16 @@ module Bookshark
           # when 'book'
           #   record = book(options)
           when 'category'
-            record = category(options)
-          end
+            record = category(options)
+          end
           # Prepare a path to save the new file.
           filename  = File.basename(file,".*")
           path_to_save = "#{dir_to_save}#{filename}.json"
-          # Save to file.
+          # Save to file.
           save_to("#{path_to_save}", record)
         end # unless File.directory?(dir_to_save) # if dir.end_with? '/195/'
       end
     end
@@ -209,9 +229,9 @@ module Bookshark
       # end
       category_extractor = Biblionet::Extractors::CategoryExtractor.new
       all_categories = Hash.new
       list_files(path: 'storage/raw_ddc_pages', extension: 'html', all:true).each do |file|
-        categories = category_extractor.extract_categories_from(file)
+        categories = category_extractor.extract_categories_from(file)
         all_categories.merge!(categories) unless categories.nil? or categories.empty?
       end
@@ -228,19 +248,19 @@ module Bookshark
       list_directories(path: 'storage/raw_html_pages').each do |dir|
         dir_to_save = dir.gsub(/raw_html_pages/, 'books')
-        list_files(path: dir, extension: 'html', all:true).each do |file|
+        list_files(path: dir, extension: 'html', all:true).each do |file|
           # Load the book from html file and parse the data.
           # pp "Parsing book: #{file}"
           pp file
           book = bp.load_and_extract_book(file)
           # Prepare a path to save the new file.
           filename  = File.basename(file,".*")
           path_to_save = "#{dir_to_save}#{filename}.json"
-          # Save to file.
+          # Save to file.
           bp.save_to("#{path_to_save}", JSON.pretty_generate(book))
           # pp "Book #{file} saved!"
         end unless File.directory?(dir_to_save) # if dir.end_with? '/195/'
@@ -266,11 +286,11 @@ module Bookshark
           url_method    = 'book'
           local_path    = "html_book_pages/#{((id-1)/1000)}/book_#{id}.html"
         when 'category'
-          url_method    = 'index'
-          local_path    = "html_ddc_pages/#{((id-1)/1000)}/ddc_#{id}.html"
+          url_method    = 'index'
+          local_path    = "html_ddc_pages/#{((id-1)/1000)}/ddc_#{id}.html"
         else
           puts "Called from unknown method. Probably its rspec."
-        end
+        end
         options[:local] ||= false
         url = "#{Bookshark::path_to_storage}/#{local_path}" if options[:local]
@@ -279,7 +299,7 @@ module Bookshark
       uri = options[:uri] ||= url
       return uri
-    end
+    end
     def change_format(hash, format)
       case format
@@ -288,10 +308,10 @@ module Bookshark
       when 'json'
         hash = hash.to_json
       when 'pretty_json'
-        hash = JSON.pretty_generate(hash)
+        hash = JSON.pretty_generate(hash)
       end
       return hash
-    end
+    end
     def eager_extract_book(uri)
       book_extractor      = Biblionet::Extractors::BookExtractor.new
@@ -301,13 +321,13 @@ module Bookshark
       book = book_extractor.load_and_extract_book(uri)
-      tmp_data = []
+      tmp_data = []
       book[:author].each do |author|
-        tmp_data << author_extractor.load_and_extract_author("http://www.biblionet.gr/author/#{author[:b_id]}")
+        tmp_data << author_extractor.load_and_extract_author("http://www.biblionet.gr/author/#{author[:b_id]}")
       end
-      book[:author] = tmp_data
-      tmp_data, tmp_hash = [], {}
+      book[:author] = tmp_data
+      tmp_data, tmp_hash = [], {}
       book[:contributors].each do |job, contributors|
         contributors.each do |contributor|
           tmp_data << author_extractor.load_and_extract_author("http://www.biblionet.gr/author/#{contributor[:b_id]}")
@@ -317,19 +337,19 @@ module Bookshark
       end
       book[:contributors] = tmp_hash
-      tmp_data, tmp_hash = [], {}
+      tmp_data, tmp_hash = [], {}
       book[:category].each do |category|
         tmp_data << category_extractor.extract_categories_from("http://www.biblionet.gr/index/#{category[:b_id]}")
       end
-      book[:category] = tmp_data
-      tmp_data = []
-      tmp_data << publisher_extractor.load_and_extract_publisher("http://www.biblionet.gr/com/#{book[:publisher][:b_id]}")
+      book[:category] = tmp_data
+      tmp_data = []
+      tmp_data << publisher_extractor.load_and_extract_publisher("http://www.biblionet.gr/com/#{book[:publisher][:b_id]}")
       book[:publisher] = tmp_data
       book
-    end
+    end
   end
@@ -339,7 +359,7 @@ module Bookshark
     def initialize(options = {})
       options = DEFAULTS.merge(options)
-      @site   = options[:site]
+      @site   = options[:site]
     end
     def publishers
@@ -362,11 +382,11 @@ module Bookshark
       crawler.crawl_and_save
     end
-  end
+  end
 #   module Biblionet
 #     class Extract
-#       class << self
+#       class << self
 #         def author(uri=nil)
 #           author_extractor = BiblionetParser::Core::AuthorExtractor.new
 #           author_extractor.load_and_extract_author(uri)
@@ -384,7 +404,7 @@ module Bookshark
 #       end
 #     end
-#   end
+#   end
 end
@@ -467,4 +487,4 @@ end
 # Problematic at biblionet
 # http://biblionet.gr/book/196388
 # http://biblionet.gr/book/196386
-# http://biblionet.gr/book/195525
+# http://biblionet.gr/book/195525

data/lib/bookshark/extractors/bibliographical_book_extractor.rb CHANGED

@@ -170,10 +170,17 @@ module Biblionet
               text: publisher_node.text,
               b_id: (publisher_node[:href].split("/"))[2]
             }
-            after_last_author_text = @nodeset
+            last_author = @nodeset
               .xpath("//a[@class='booklink' and @href[contains(.,'/author/') ]][last()]").last
-              .next_sibling.text.strip
-            puts after_last_author_text
+            if !last_author.nil? && !last_author.empty?
+              after_last_author_text = last_author.next_sibling.text.strip
+            else
+              last_book = @nodeset
+                .xpath("//a[@class='booklink' and @href[contains(.,'/book/') ]][last()]").last
+              after_last_author_text = last_book.next_sibling.text.strip
+            end
             details_hash[:publication] = {
               year: after_last_author_text[/(?<=: )\d+(?=\.)/],
               version: after_last_author_text[/(?<=- )\d+(?=η)/],

data/lib/bookshark/extractors/nlg/base.rb ADDED

@@ -0,0 +1,110 @@
+#!/bin/env ruby
+# encoding: utf-8
+require 'rubygems'
+require 'json'
+require 'logger'
+require 'pp'
+require 'marc'
+require 'htmlentities'
+module Nlg
+  module Extractors
+    class Base
+      attr_reader :url, :nlg_id, :page
+      def initialize(id=nil)
+        load_page(id)
+      end
+      def load_page(id=nil)
+        load_page_by_id(id) unless id.nil?
+      end
+      def load_page_by_id(id)
+        begin
+          @nlg_id = id unless id.nil? # id is expected to be the last number.
+          @url = "http://nbib.nlg.gr/Record/#{@nlg_id}/Export?style=MARCXML"
+          pp "Downloading page: #{@url}"
+          Net::HTTP.start("nbib.nlg.gr") do |http|
+            response = http.get("/Record/#{@nlg_id}/Export?style=MARCXML")
+            pp response.content_type
+            pp response.code
+            raise EmptyPageError.new(@url) unless response.content_type == "text/xml" && response.code == "200"
+            @page = response.body
+          end
+        rescue Errno::ENOENT => e
+          pp "Page: #{@url} NOT FOUND."
+          pp e
+        rescue EmptyPageError => e
+          pp "Page: #{@url} is EMPTY."
+          pp e
+          @page = nil
+        rescue OpenURI::HTTPError => e
+          pp e
+          pp e.io.status
+        rescue StandardError => e
+          pp "Generic error #{e.class}. Will wait for 2 minutes and then try again."
+          pp e
+          sleep(120)
+          retry
+        end
+      end
+      # Decodes text with escaped html entities and returns the decoded text.
+      #
+      # ==== Params:
+      #
+      # +encoded_text+:: the text which contains encoded entities
+      #
+      def decode_text(encoded_text)
+        self.class.decode_text(encoded_text)
+      end
+      def self.decode_text(encoded_text)
+        # encoded_text = File.read(encoded_file_path)
+        coder = HTMLEntities.new
+        coder.decode(encoded_text)
+      end
+      def present?(value)
+        return (not value.nil? and not value.empty?) ? true : false
+      end
+    end
+    # Raised when a page is considered empty.
+    #
+    class EmptyPageError < StandardError
+      attr_reader :url
+      def initialize(url)
+        @url = url
+        msg = "Page: #{url} is not valid xml so it is considered EMPTY."
+        super(msg)
+      end
+    end
+    # Raised when something unexpected or in wrong format is parsed.
+    #
+    class NoIdeaWhatThisIsError < StandardError
+      attr_reader :nlg_id, :the_unexpected
+      def initialize(nlg_id, the_unexpected)
+        @nlg_id = nlg_id
+        @the_unexpected = the_unexpected
+        msg = "We have no idea what this: #{the_unexpected} is. At book #{nlg_id}"
+        super(msg)
+      end
+    end
+  end
+end

data/lib/bookshark/extractors/nlg/book_extractor.rb ADDED

@@ -0,0 +1,28 @@
+#!/bin/env ruby
+# encoding: utf-8
+require_relative 'base'
+module Nlg
+  module Extractors
+    class BookExtractor < Base
+      attr_reader :book
+      def initialize(id=nil)
+        super(id)
+        extract_book unless id.nil? or @page.nil?
+      end
+      def load_and_extract_book(id=nil)
+        load_page(id)
+        extract_book unless id.nil? or @page.nil?
+      end
+      def extract_book(nlg_id=@nlg_id, book_page=@page)
+        puts "should extract book #{nlg_id} from nlg"
+      end
+    end
+  end
+end

data/lib/bookshark/version.rb CHANGED

@@ -1,3 +1,3 @@
 module Bookshark
-  VERSION = "1.0.1"
+  VERSION = "1.0.3"
 end

data/spec/test_data/eager_book_184923.json CHANGED

@@ -42,10 +42,10 @@
           "name": "Πανεπιστημιακές Εκδόσεις Κρήτης",
           "owner": "Στέφανος Τραχανάς",
           "bookstores": {
-            "Υποκατάστημα": {
+            "&Upsilon;&pi;&omicron;&kappa;&alpha;&tau;&#940;&sigma;&tau;&eta;&mu;&alpha;": {
               "address": [
-                "Κλεισόβης 3",
-                "106 77 Αθήνα"
+                "&Kappa;&lambda;&epsilon;&iota;&sigma;&#972;&beta;&eta;&sigmaf; 3",
+                "106 77 &Alpha;&theta;&#942;&nu;&alpha;"
               ],
               "telephone": [
                 "210 38490203"

data/spec/test_data/search_01.json CHANGED

@@ -372,7 +372,7 @@
       "format": "Βιβλίο",
       "original_language": null,
       "original_title": null,
-      "price": "6,85",
+      "price": "6,82",
       "availability": "Κυκλοφορεί",
       "last_update": null,
       "series": {
@@ -421,7 +421,7 @@
       "format": "Βιβλίο",
       "original_language": null,
       "original_title": null,
-      "price": "3,73",
+      "price": "3,71",
       "availability": "Κυκλοφορεί",
       "last_update": null,
       "series": {
@@ -445,7 +445,7 @@
       "contributors": {
       },
       "publisher": {
-        "text": "Δωδώνη Εκδοτική ΕΠΕ",
+        "text": "Δωδώνη",
         "b_id": "1"
       },
       "isbn": "960-248-541-8",
@@ -470,7 +470,7 @@
       "format": "Βιβλίο",
       "original_language": null,
       "original_title": null,
-      "price": "10,60",
+      "price": "10,55",
       "availability": "Κυκλοφορεί",
       "last_update": null,
       "series": {

metadata CHANGED

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: bookshark
 version: !ruby/object:Gem::Version
-  version: 1.0.1
+  version: 1.0.3
 platform: ruby
 authors:
 - Dimitris Klisiaris
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2017-02-23 00:00:00.000000000 Z
+date: 2017-02-27 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: nokogiri
@@ -72,6 +72,20 @@ dependencies:
     - - "~>"
       - !ruby/object:Gem::Version
         version: '4.3'
+- !ruby/object:Gem::Dependency
+  name: marc
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.0'
 - !ruby/object:Gem::Dependency
   name: bundler
   requirement: !ruby/object:Gem::Requirement
@@ -128,6 +142,20 @@ dependencies:
     - - "~>"
       - !ruby/object:Gem::Version
         version: '1.2'
+- !ruby/object:Gem::Dependency
+  name: pry-byebug
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '3.4'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '3.4'
 description: Extracts book, author, publisher and category metadata from biblionet.gr.
 email:
 - dklisiaris@gmail.com
@@ -155,6 +183,8 @@ files:
 - lib/bookshark/extractors/bibliographical_book_extractor.rb
 - lib/bookshark/extractors/book_extractor.rb
 - lib/bookshark/extractors/category_extractor.rb
+- lib/bookshark/extractors/nlg/base.rb
+- lib/bookshark/extractors/nlg/book_extractor.rb
 - lib/bookshark/extractors/publisher_extractor.rb
 - lib/bookshark/extractors/search.rb
 - lib/bookshark/storage/file_manager.rb