RubyGems - bookshark - Versions diffs - 1.0.1 → 1.0.3 - Mend

bookshark 1.0.1 → 1.0.3

Files changed (10) hide show

checksums.yaml +4 -4
data/bookshark.gemspec +2 -0
data/lib/bookshark.rb +113 -93
data/lib/bookshark/extractors/bibliographical_book_extractor.rb +10 -3
data/lib/bookshark/extractors/nlg/base.rb +110 -0
data/lib/bookshark/extractors/nlg/book_extractor.rb +28 -0
data/lib/bookshark/version.rb +1 -1
data/spec/test_data/eager_book_184923.json +3 -3
data/spec/test_data/search_01.json +4 -4
metadata +32 -2

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: d35fee946c6b6dcf4ca740d89ba3a9cb89f36a94
-  data.tar.gz: ff928cdadd16b132adc9f193ff5c7f565a0b0398
+  metadata.gz: '03852f73c9246676ff20b75b0a893998e967f2c7'
+  data.tar.gz: 7fe938710c2e9344563395e5db7cf3e7eaf76b25
 SHA512:
-  metadata.gz: 2dac9ad4842172d896a488fa60baaf13d862c8d2e3b1e68b3a9f5e6ee28bec5136d4ed7d084c4bf6fa0f5f1cfd3224241958467c7df99929ea8cfc1c5ad92abc
-  data.tar.gz: e8f9dcb4f20e0a2330a91588c6dfbb306a74820ee9ed7fa7564097013c16244de46ffc3a636a751126f19ce9c1a32b209fb254d7533d901d6f77a00a6b8b5100
+  metadata.gz: 3ccd4cfd0e82aa6918304e82df18f382f2b7e33470c9f52ff362b4897aaf53b703a04bd2689871925293d7e084d3e0008c74aa289fb36a106be7dca20c01ee0f
+  data.tar.gz: fb4590ee8c1a24402f48b3502ddb77e24fbbde2c988a18fb62fe867762950f351d3271512773602bd9cec47d7e51a6cb94de1e2d695cf7c0d8704a39bc72f905

data/bookshark.gemspec CHANGED

@@ -24,9 +24,11 @@ Gem::Specification.new do |spec|
   spec.add_dependency "sanitize", "~> 4.0"
   spec.add_dependency "json", "~> 1.8"
   spec.add_dependency "htmlentities", "~> 4.3"
+  spec.add_dependency "marc", "~> 1.0"
   spec.add_development_dependency "bundler", ">= 1.6"
   spec.add_development_dependency "rake", "~> 10.0"
   spec.add_development_dependency 'rspec', "~> 3.2"
   spec.add_development_dependency "webmock", "~> 1.2"
+  spec.add_development_dependency "pry-byebug", "~> 3.4"
 end

data/lib/bookshark.rb CHANGED

@@ -7,6 +7,7 @@ require 'bookshark/extractors/book_extractor'
 require 'bookshark/extractors/bibliographical_book_extractor'
 require 'bookshark/extractors/publisher_extractor'
 require 'bookshark/extractors/search'
+require 'bookshark/extractors/nlg/book_extractor'
 require 'bookshark/crawlers/base'
 require 'bookshark/crawlers/publisher_crawler'
@@ -22,8 +23,8 @@ module Bookshark
   def self.root
     # File.dirname __dir__ # Works only on ruby > 2.0.0
     File.expand_path(File.join(File.dirname(__FILE__), '../'))
-  end
+  end
   def self.path_to_storage
     File.join root, 'lib/bookshark/storage'
   end
@@ -31,7 +32,7 @@ module Bookshark
   class Extractor
     include FileManager
-    attr_accessor :site, :format
+    attr_accessor :site, :format
     def initialize(options = {})
       options = DEFAULTS.merge(options)
@@ -44,9 +45,9 @@ module Bookshark
       options[:format] ||= @format
       author_extractor = Biblionet::Extractors::AuthorExtractor.new
-      author = author_extractor.load_and_extract_author(uri)
-      response = {}
+      author = author_extractor.load_and_extract_author(uri)
+      response = {}
       response[:author] = !author.nil? ? [author] : []
       response = change_format(response, options[:format])
       return response
@@ -58,90 +59,109 @@ module Bookshark
       publisher_extractor = Biblionet::Extractors::PublisherExtractor.new
       publisher = publisher_extractor.load_and_extract_publisher(uri)
-      response = {}
+      response = {}
       response[:publisher] = !publisher.nil? ? [publisher] : []
       response = change_format(response, options[:format])
       response = publisher_extractor.decode_text(response)
       return response
-      # return uri
-    end
+      # return uri
+    end
     def book(options = {})
-      book_extractor = Biblionet::Extractors::BookExtractor.new
-      if book_extractor.present?(options[:isbn])
-        search_engine = Biblionet::Extractors::Search.new
-        options[:id]  = search_engine.search_by_isbn(options[:isbn])
-      end
+      options[:site] ||= @site
-      uri = process_options(options, __method__)
-      options[:format]  ||= @format
-      options[:eager]   ||= false
-      options[:nilify]  ||= false
-      if options[:eager]
-        book = eager_extract_book(uri)
-      else
-        book = book_extractor.load_and_extract_book(uri)
-      end
+      if options[:site] == 'biblionet'
+        book_extractor = Biblionet::Extractors::BookExtractor.new
-      response = {}
-      response[:book] = !book.nil? ? [book] : []
+        if book_extractor.present?(options[:isbn])
+          search_engine = Biblionet::Extractors::Search.new
+          options[:id]  = search_engine.search_by_isbn(options[:isbn])
+        end
-      return nil if response[:book].empty? and options[:nilify]
-      response = change_format(response, options[:format])
-      response = book_extractor.decode_text(response) if response.class == "String"
-      return response
+        uri = process_options(options, __method__)
+        options[:format]  ||= @format
+        options[:eager]   ||= false
+        options[:nilify]  ||= false
+        if options[:eager]
+          book = eager_extract_book(uri)
+        else
+          book = book_extractor.load_and_extract_book(uri)
+        end
+        response = {}
+        response[:book] = !book.nil? ? [book] : []
+        return nil if response[:book].empty? and options[:nilify]
+        response = change_format(response, options[:format])
+        response = book_extractor.decode_text(response) if response.class == "String"
+        return response
+      elsif options[:site] == 'nlg'
+        book_extractor = Nlg::Extractors::BookExtractor.new
+        options[:format] ||= @format
+        # if !options[:uri].nil?
+        #   uri = "#{options[:uri]}/Export?style=MARCXML"
+        # elsif !options[:id].nil?
+        #   uri = "http://nbib.nlg.gr/Record/#{options[:id]}/Export?style=MARCXML"
+        # end
+        book = book_extractor.load_and_extract_book(options[:id])
+        response = {}
+        response[:book] = !book.nil? ? [book] : []
+      end
     end
     # def bibliographical_book(options = {})
     #   bibliographical_book_extractor = Biblionet::Extractors::BibliographicalBookExtractor.new
     #   uri = "http://www.biblionet.gr/main.asp?page=results&Titlesid=#{options[:id]}"
     #   options[:format]  ||= @format
     #   book = bibliographical_book_extractor.load_and_extract_book(uri)
-    #   response = {}
+    #   response = {}
     #   response[:book] = !book.nil? ? [book] : []
     #   response = change_format(response, options[:format])
-    #   response = bibliographical_book_extractor.decode_text(response)
-    # end
+    #   response = bibliographical_book_extractor.decode_text(response)
+    # end
     # puts Bookshark::Extractor.new(format: 'pretty_json').bibliographical_book(id: 103788)
     def category(options = {})
       uri = process_options(options, __method__)
-      options[:format] ||= @format
+      options[:format] ||= @format
       category_extractor = Biblionet::Extractors::CategoryExtractor.new
       category = category_extractor.extract_categories_from(uri)
-      response = {}
+      response = {}
       response[:category] = !category.nil? ? [category] : []
       response = change_format(response, options[:format])
-      return response
+      return response
     end
     def search(options = {})
       options[:format]        ||= @format
-      options[:results_type]  ||= 'metadata'
+      options[:results_type]  ||= 'metadata'
       search_engine  = Biblionet::Extractors::Search.new
       search_results = search_engine.perform_search(options)
-      response = {}
+      response = {}
       response[:book] = search_results
       response = change_format(response, options[:format])
-      return response
+      return response
     end
     # def books_from_storage
@@ -165,22 +185,22 @@ module Bookshark
         record = book(id: book_id, local: true, format: format, nilify: true)
         dir_to_save = Bookshark.path_to_storage + '/' + 'json_book_records/' + "#{((book_id-1)/1000)}/" + "book_#{book_id}.json"
         save_to(dir_to_save, record) unless record.nil?
       end
     end
-    def extract_from_storage_and_save(metadata_type, source_dir, target_dir)
+    def extract_from_storage_and_save(metadata_type, source_dir, target_dir)
       list_directories(path: Bookshark.path_to_storage + '/' + source_dir).each do |dir|
-        dir_to_save = dir.gsub(source_dir, target_dir)
+        dir_to_save = dir.gsub(source_dir, target_dir)
         list_files(path: dir, extension: 'html', all:true).each do |file|
-          puts "Extracting from file: " + file.to_s
+          puts "Extracting from file: " + file.to_s
           # Extract publisher metadata form local file.
-          options = {uri: file, format: 'pretty_json', local: true}
+          options = {uri: file, format: 'pretty_json', local: true}
           case metadata_type
           when 'author'
             record = author(options)
@@ -189,16 +209,16 @@ module Bookshark
           # when 'book'
           #   record = book(options)
           when 'category'
-            record = category(options)
-          end
+            record = category(options)
+          end
           # Prepare a path to save the new file.
           filename  = File.basename(file,".*")
           path_to_save = "#{dir_to_save}#{filename}.json"
-          # Save to file.
+          # Save to file.
           save_to("#{path_to_save}", record)
         end # unless File.directory?(dir_to_save) # if dir.end_with? '/195/'
       end
     end
@@ -209,9 +229,9 @@ module Bookshark
       # end
       category_extractor = Biblionet::Extractors::CategoryExtractor.new
       all_categories = Hash.new
       list_files(path: 'storage/raw_ddc_pages', extension: 'html', all:true).each do |file|
-        categories = category_extractor.extract_categories_from(file)
+        categories = category_extractor.extract_categories_from(file)
         all_categories.merge!(categories) unless categories.nil? or categories.empty?
       end
@@ -228,19 +248,19 @@ module Bookshark
       list_directories(path: 'storage/raw_html_pages').each do |dir|
         dir_to_save = dir.gsub(/raw_html_pages/, 'books')
-        list_files(path: dir, extension: 'html', all:true).each do |file|
+        list_files(path: dir, extension: 'html', all:true).each do |file|
           # Load the book from html file and parse the data.
           # pp "Parsing book: #{file}"
           pp file
           book = bp.load_and_extract_book(file)
           # Prepare a path to save the new file.
           filename  = File.basename(file,".*")
           path_to_save = "#{dir_to_save}#{filename}.json"
-          # Save to file.
+          # Save to file.
           bp.save_to("#{path_to_save}", JSON.pretty_generate(book))
           # pp "Book #{file} saved!"
         end unless File.directory?(dir_to_save) # if dir.end_with? '/195/'
@@ -266,11 +286,11 @@ module Bookshark
           url_method    = 'book'
           local_path    = "html_book_pages/#{((id-1)/1000)}/book_#{id}.html"
         when 'category'
-          url_method    = 'index'
-          local_path    = "html_ddc_pages/#{((id-1)/1000)}/ddc_#{id}.html"
+          url_method    = 'index'
+          local_path    = "html_ddc_pages/#{((id-1)/1000)}/ddc_#{id}.html"
         else
           puts "Called from unknown method. Probably its rspec."
-        end
+        end
         options[:local] ||= false
         url = "#{Bookshark::path_to_storage}/#{local_path}" if options[:local]
@@ -279,7 +299,7 @@ module Bookshark
       uri = options[:uri] ||= url
       return uri
-    end
+    end
     def change_format(hash, format)
       case format
@@ -288,10 +308,10 @@ module Bookshark
       when 'json'
         hash = hash.to_json
       when 'pretty_json'
-        hash = JSON.pretty_generate(hash)
+        hash = JSON.pretty_generate(hash)
       end
       return hash
-    end
+    end
     def eager_extract_book(uri)
       book_extractor      = Biblionet::Extractors::BookExtractor.new
@@ -301,13 +321,13 @@ module Bookshark
       book = book_extractor.load_and_extract_book(uri)
-      tmp_data = []
+      tmp_data = []
       book[:author].each do |author|
-        tmp_data << author_extractor.load_and_extract_author("http://www.biblionet.gr/author/#{author[:b_id]}")
+        tmp_data << author_extractor.load_and_extract_author("http://www.biblionet.gr/author/#{author[:b_id]}")
       end
-      book[:author] = tmp_data
-      tmp_data, tmp_hash = [], {}
+      book[:author] = tmp_data
+      tmp_data, tmp_hash = [], {}
       book[:contributors].each do |job, contributors|
         contributors.each do |contributor|
           tmp_data << author_extractor.load_and_extract_author("http://www.biblionet.gr/author/#{contributor[:b_id]}")
@@ -317,19 +337,19 @@ module Bookshark
       end
       book[:contributors] = tmp_hash
-      tmp_data, tmp_hash = [], {}
+      tmp_data, tmp_hash = [], {}
       book[:category].each do |category|
         tmp_data << category_extractor.extract_categories_from("http://www.biblionet.gr/index/#{category[:b_id]}")
       end
-      book[:category] = tmp_data
-      tmp_data = []
-      tmp_data << publisher_extractor.load_and_extract_publisher("http://www.biblionet.gr/com/#{book[:publisher][:b_id]}")
+      book[:category] = tmp_data
+      tmp_data = []
+      tmp_data << publisher_extractor.load_and_extract_publisher("http://www.biblionet.gr/com/#{book[:publisher][:b_id]}")
       book[:publisher] = tmp_data
       book
-    end
+    end
   end
@@ -339,7 +359,7 @@ module Bookshark
     def initialize(options = {})
       options = DEFAULTS.merge(options)
-      @site   = options[:site]
+      @site   = options[:site]
     end
     def publishers
@@ -362,11 +382,11 @@ module Bookshark
       crawler.crawl_and_save
     end
-  end
+  end
 #   module Biblionet
 #     class Extract
-#       class << self
+#       class << self
 #         def author(uri=nil)
 #           author_extractor = BiblionetParser::Core::AuthorExtractor.new
 #           author_extractor.load_and_extract_author(uri)
@@ -384,7 +404,7 @@ module Bookshark
 #       end
 #     end
-#   end
+#   end
 end
@@ -467,4 +487,4 @@ end
 # Problematic at biblionet
 # http://biblionet.gr/book/196388
 # http://biblionet.gr/book/196386
-# http://biblionet.gr/book/195525
+# http://biblionet.gr/book/195525

data/lib/bookshark/extractors/bibliographical_book_extractor.rb CHANGED

@@ -170,10 +170,17 @@ module Biblionet
               text: publisher_node.text,
               b_id: (publisher_node[:href].split("/"))[2]
             }
-            after_last_author_text = @nodeset
+            last_author = @nodeset
               .xpath("//a[@class='booklink' and @href[contains(.,'/author/') ]][last()]").last
-              .next_sibling.text.strip
-            puts after_last_author_text
+            if !last_author.nil? && !last_author.empty?
+              after_last_author_text = last_author.next_sibling.text.strip
+            else
+              last_book = @nodeset
+                .xpath("//a[@class='booklink' and @href[contains(.,'/book/') ]][last()]").last
+              after_last_author_text = last_book.next_sibling.text.strip
+            end
             details_hash[:publication] = {
               year: after_last_author_text[/(?<=: )\d+(?=\.)/],
               version: after_last_author_text[/(?<=- )\d+(?=η)/],

data/lib/bookshark/extractors/nlg/base.rb ADDED

@@ -0,0 +1,110 @@
+#!/bin/env ruby
+# encoding: utf-8
+require 'rubygems'
+require 'json'
+require 'logger'
+require 'pp'
+require 'marc'
+require 'htmlentities'
+module Nlg
+  module Extractors
+    class Base
+      attr_reader :url, :nlg_id, :page
+      def initialize(id=nil)
+        load_page(id)
+      end
+      def load_page(id=nil)
+        load_page_by_id(id) unless id.nil?
+      end
+      def load_page_by_id(id)
+        begin
+          @nlg_id = id unless id.nil? # id is expected to be the last number.
+          @url = "http://nbib.nlg.gr/Record/#{@nlg_id}/Export?style=MARCXML"
+          pp "Downloading page: #{@url}"
+          Net::HTTP.start("nbib.nlg.gr") do |http|
+            response = http.get("/Record/#{@nlg_id}/Export?style=MARCXML")
+            pp response.content_type
+            pp response.code
+            raise EmptyPageError.new(@url) unless response.content_type == "text/xml" && response.code == "200"
+            @page = response.body
+          end
+        rescue Errno::ENOENT => e
+          pp "Page: #{@url} NOT FOUND."
+          pp e
+        rescue EmptyPageError => e
+          pp "Page: #{@url} is EMPTY."
+          pp e
+          @page = nil
+        rescue OpenURI::HTTPError => e
+          pp e
+          pp e.io.status
+        rescue StandardError => e
+          pp "Generic error #{e.class}. Will wait for 2 minutes and then try again."
+          pp e
+          sleep(120)
+          retry
+        end
+      end
+      # Decodes text with escaped html entities and returns the decoded text.
+      #
+      # ==== Params:
+      #
+      # +encoded_text+:: the text which contains encoded entities
+      #
+      def decode_text(encoded_text)
+        self.class.decode_text(encoded_text)
+      end
+      def self.decode_text(encoded_text)
+        # encoded_text = File.read(encoded_file_path)
+        coder = HTMLEntities.new
+        coder.decode(encoded_text)
+      end
+      def present?(value)
+        return (not value.nil? and not value.empty?) ? true : false
+      end
+    end
+    # Raised when a page is considered empty.
+    #
+    class EmptyPageError < StandardError
+      attr_reader :url
+      def initialize(url)
+        @url = url
+        msg = "Page: #{url} is not valid xml so it is considered EMPTY."
+        super(msg)
+      end
+    end
+    # Raised when something unexpected or in wrong format is parsed.
+    #
+    class NoIdeaWhatThisIsError < StandardError
+      attr_reader :nlg_id, :the_unexpected
+      def initialize(nlg_id, the_unexpected)
+        @nlg_id = nlg_id
+        @the_unexpected = the_unexpected
+        msg = "We have no idea what this: #{the_unexpected} is. At book #{nlg_id}"
+        super(msg)
+      end
+    end
+  end
+end

data/lib/bookshark/extractors/nlg/book_extractor.rb ADDED

@@ -0,0 +1,28 @@
+#!/bin/env ruby
+# encoding: utf-8
+require_relative 'base'
+module Nlg
+  module Extractors
+    class BookExtractor < Base
+      attr_reader :book
+      def initialize(id=nil)
+        super(id)
+        extract_book unless id.nil? or @page.nil?
+      end
+      def load_and_extract_book(id=nil)
+        load_page(id)
+        extract_book unless id.nil? or @page.nil?
+      end
+      def extract_book(nlg_id=@nlg_id, book_page=@page)
+        puts "should extract book #{nlg_id} from nlg"
+      end
+    end
+  end
+end

data/lib/bookshark/version.rb CHANGED

@@ -1,3 +1,3 @@
 module Bookshark
-  VERSION = "1.0.1"
+  VERSION = "1.0.3"
 end

data/spec/test_data/eager_book_184923.json CHANGED

@@ -42,10 +42,10 @@
           "name": "Πανεπιστημιακές Εκδόσεις Κρήτης",
           "owner": "Στέφανος Τραχανάς",
           "bookstores": {
-            "Υποκατάστημα": {
+            "&Upsilon;&pi;&omicron;&kappa;&alpha;&tau;&#940;&sigma;&tau;&eta;&mu;&alpha;": {
               "address": [
-                "Κλεισόβης 3",
-                "106 77 Αθήνα"
+                "&Kappa;&lambda;&epsilon;&iota;&sigma;&#972;&beta;&eta;&sigmaf; 3",
+                "106 77 &Alpha;&theta;&#942;&nu;&alpha;"
               ],
               "telephone": [
                 "210 38490203"

data/spec/test_data/search_01.json CHANGED

@@ -372,7 +372,7 @@
       "format": "Βιβλίο",
       "original_language": null,
       "original_title": null,
-      "price": "6,85",
+      "price": "6,82",
       "availability": "Κυκλοφορεί",
       "last_update": null,
       "series": {
@@ -421,7 +421,7 @@
       "format": "Βιβλίο",
       "original_language": null,
       "original_title": null,
-      "price": "3,73",
+      "price": "3,71",
       "availability": "Κυκλοφορεί",
       "last_update": null,
       "series": {
@@ -445,7 +445,7 @@
       "contributors": {
       },
       "publisher": {
-        "text": "Δωδώνη Εκδοτική ΕΠΕ",
+        "text": "Δωδώνη",
         "b_id": "1"
       },
       "isbn": "960-248-541-8",
@@ -470,7 +470,7 @@
       "format": "Βιβλίο",
       "original_language": null,
       "original_title": null,
-      "price": "10,60",
+      "price": "10,55",
       "availability": "Κυκλοφορεί",
       "last_update": null,
       "series": {

metadata CHANGED

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: bookshark
 version: !ruby/object:Gem::Version
-  version: 1.0.1
+  version: 1.0.3
 platform: ruby
 authors:
 - Dimitris Klisiaris
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2017-02-23 00:00:00.000000000 Z
+date: 2017-02-27 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: nokogiri
@@ -72,6 +72,20 @@ dependencies:
     - - "~>"
       - !ruby/object:Gem::Version
         version: '4.3'
+- !ruby/object:Gem::Dependency
+  name: marc
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.0'
 - !ruby/object:Gem::Dependency
   name: bundler
   requirement: !ruby/object:Gem::Requirement
@@ -128,6 +142,20 @@ dependencies:
     - - "~>"
       - !ruby/object:Gem::Version
         version: '1.2'
+- !ruby/object:Gem::Dependency
+  name: pry-byebug
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '3.4'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '3.4'
 description: Extracts book, author, publisher and category metadata from biblionet.gr.
 email:
 - dklisiaris@gmail.com
@@ -155,6 +183,8 @@ files:
 - lib/bookshark/extractors/bibliographical_book_extractor.rb
 - lib/bookshark/extractors/book_extractor.rb
 - lib/bookshark/extractors/category_extractor.rb
+- lib/bookshark/extractors/nlg/base.rb
+- lib/bookshark/extractors/nlg/book_extractor.rb
 - lib/bookshark/extractors/publisher_extractor.rb
 - lib/bookshark/extractors/search.rb
 - lib/bookshark/storage/file_manager.rb