bookshark 1.0.0.alpha.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,4 @@
+ require "bundler/gem_tasks"
+
+ Dir.glob('tasks/**/*.rake').each(&method(:import))
+
@@ -0,0 +1,29 @@
+ # coding: utf-8
+ lib = File.expand_path('../lib', __FILE__)
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
+ require 'bookshark/version'
+
+ Gem::Specification.new do |spec|
+   spec.name = "bookshark"
+   spec.version = Bookshark::VERSION
+   spec.authors = ["Dimitris Klisiaris"]
+   spec.email = ["dklisiaris@gmail.com"]
+   spec.summary = %q{Book metadata extractor from biblionet.gr.}
+   spec.description = %q{Extracts book, author, publisher and category metadata from biblionet.gr.}
+   spec.homepage = "https://github.com/dklisiaris/bookshark"
+   spec.license = "MIT"
+
+   spec.files = `git ls-files -z`.split("\x0")
+   spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
+   spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
+   spec.require_paths = ["lib"]
+
+   spec.add_dependency "nokogiri", "~> 1.6", ">= 1.6.6"
+   spec.add_dependency "sanitize", "~> 3.1"
+   spec.add_dependency "json", "~> 1.8"
+   spec.add_dependency "htmlentities", "~> 4.3"
+
+   spec.add_development_dependency "bundler", "~> 1.7"
+   spec.add_development_dependency "rake", "~> 10.0"
+   spec.add_development_dependency 'rspec', "~> 3.1"
+ end
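
Because 1.0.0.alpha.2 is a prerelease, RubyGems and Bundler skip it under the usual version constraints; a hypothetical Gemfile pin that opts into it explicitly (the command-line equivalent is gem install bookshark --pre):

gem 'bookshark', '1.0.0.alpha.2'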
@@ -0,0 +1,371 @@
+ require "bookshark/version"
+ require 'bookshark/storage/file_manager'
+
+ require 'bookshark/extractors/author_extractor'
+ require 'bookshark/extractors/category_extractor'
+ require 'bookshark/extractors/book_extractor'
+ require 'bookshark/extractors/publisher_extractor'
+ require 'bookshark/extractors/search'
+
+ require 'bookshark/crawlers/base'
+ require 'bookshark/crawlers/publisher_crawler'
+
+ module Bookshark
+   DEFAULTS = {
+     site: 'biblionet',
+     format: 'hash'
+   }
+
+   def self.root
+     File.dirname __dir__
+   end
+
+   def self.path_to_storage
+     File.join root, 'lib/bookshark/storage'
+   end
+
+
+   class Extractor
+     include FileManager
+     attr_accessor :site, :format
+
+     def initialize(options = {})
+       options = DEFAULTS.merge(options)
+       @site = options[:site]
+       @format = options[:format]
+     end
+
+     def author(options = {})
+       uri = process_options(options, __method__)
+       options[:format] ||= @format
+
+       author_extractor = Biblionet::Extractors::AuthorExtractor.new
+       author = author_extractor.load_and_extract_author(uri)
+
+       response = {}
+       response[:author] = [author]
+       response = change_format(response, options[:format])
+       return response
+     end
+
+     def publisher(options = {})
+       uri = process_options(options, __method__)
+       options[:format] ||= @format
+
+       publisher_extractor = Biblionet::Extractors::PublisherExtractor.new
+       publisher = publisher_extractor.load_and_extract_publisher(uri)
+
+       response = {}
+       response[:publisher] = [publisher]
+       response = change_format(response, options[:format])
+       response = publisher_extractor.decode_text(response)
+
+       return response
+       # return uri
+     end
+
+     def book(options = {})
+       book_extractor = Biblionet::Extractors::BookExtractor.new
+
+       uri = process_options(options, __method__)
+       options[:format] ||= @format
+       options[:eager] ||= false
+
+       if options[:eager]
+         book = eager_extract_book(uri)
+       else
+         book = book_extractor.load_and_extract_book(uri)
+       end
+
+       response = {}
+       response[:book] = [book]
+       response = change_format(response, options[:format])
+       response = book_extractor.decode_text(response)
+
+       return response
+     end
+
+     def category(options = {})
+       uri = process_options(options, __method__)
+       options[:format] ||= @format
+
+       category_extractor = Biblionet::Extractors::CategoryExtractor.new
+       category = category_extractor.extract_categories_from(uri)
+
+       response = {}
+       response[:category] = [category]
+       response = change_format(response, options[:format])
+
+       return response
+     end
+
+     def search(options = {})
+       options[:format] ||= @format
+       options[:results_type] ||= 'metadata'
+
+       search_engine = Biblionet::Extractors::Search.new
+       search_results = search_engine.perform_search(options)
+
+       response = {}
+       response[:book] = search_results
+       response = change_format(response, options[:format])
+
+       return response
+     end
+
+     def parse_all_categories(will_save=false)
+       # list_directories('raw_ddc_pages').each do |dir|
+       #   p dir
+       # end
+       category_extractor = Biblionet::Extractors::CategoryExtractor.new
+       all_categories = Hash.new
+
+       list_files(path: 'storage/raw_ddc_pages', extension: 'html', all: true).each do |file|
+         categories = category_extractor.extract_categories_from(file)
+         all_categories.merge!(categories) unless categories.nil? or categories.empty?
+       end
+
+       if will_save
+         all_categories_json = all_categories.to_json
+         save_to('storage/all_categories.json', all_categories_json)
+       end
+
+       all_categories
+     end
+
+     def parse_all_books
+       bp = Biblionet::Extractors::BookExtractor.new
+
+       list_directories(path: 'storage/raw_html_pages').each do |dir|
+         dir_to_save = dir.gsub(/raw_html_pages/, 'books')
+
+         list_files(path: dir, extension: 'html', all: true).each do |file|
+
+           # Load the book from html file and parse the data.
+           # pp "Parsing book: #{file}"
+           pp file
+           book = bp.load_and_extract_book(file)
+
+           # Prepare a path to save the new file.
+           filename = File.basename(file, ".*")
+           path_to_save = "#{dir_to_save}#{filename}.json"
+
+           # Save to file.
+           bp.save_to("#{path_to_save}", JSON.pretty_generate(book))
+           # pp "Book #{file} saved!"
+         end unless File.directory?(dir_to_save) # if dir.end_with? '/195/'
+       end
+     end
+
+     private
+
+     def process_options(options = {}, caller = nil)
+       # puts caller_locations(1,1)[0].label
+       # options[:format] ||= @format
+       puts caller
+       id = options[:id]
+
+       if id
+         case caller.to_s
+         when 'author'
+           url_method = 'author'
+           local_path = "html_author_pages/#{((id-1)/1000)}/author_#{id}.html"
+         when 'publisher'
+           url_method = 'com'
+           local_path = "html_publisher_pages/#{((id-1)/100)}/publisher_#{id}.html"
+         when 'book'
+           url_method = 'book'
+           local_path = "html_book_pages/#{((id-1)/1000)}/book_#{id}.html"
+         when 'category'
+           url_method = 'index'
+           local_path = "html_ddc_pages/#{((id-1)/1000)}/ddc_#{id}.html"
+         else
+           puts "Called from an unknown method. Probably it's rspec."
+         end
+
+         options[:local] ||= false
+         url = "#{Bookshark::path_to_storage}/#{local_path}" if options[:local]
+         url = "http://www.biblionet.gr/#{url_method}/#{id}" unless options[:local]
+       end
+       uri = options[:uri] ||= url
+
+       return uri
+     end
+
+     def change_format(hash, format)
+       case format
+       when 'hash'
+         return hash
+       when 'json'
+         hash = hash.to_json
+       when 'pretty_json'
+         hash = JSON.pretty_generate(hash)
+       end
+       return hash
+     end
+
+     def eager_extract_book(uri)
+       book_extractor = Biblionet::Extractors::BookExtractor.new
+       author_extractor = Biblionet::Extractors::AuthorExtractor.new
+       publisher_extractor = Biblionet::Extractors::PublisherExtractor.new
+       category_extractor = Biblionet::Extractors::CategoryExtractor.new
+
+       book = book_extractor.load_and_extract_book(uri)
+
+       tmp_data = []
+       book[:author].each do |author|
+         tmp_data << author_extractor.load_and_extract_author("http://www.biblionet.gr/author/#{author[:b_id]}")
+       end
+       book[:author] = tmp_data
+
+       tmp_data, tmp_hash = [], {}
+       book[:contributors].each do |job, contributors|
+         contributors.each do |contributor|
+           tmp_data << author_extractor.load_and_extract_author("http://www.biblionet.gr/author/#{contributor[:b_id]}")
+         end
+         tmp_hash[job] = tmp_data
+         tmp_data = []
+       end
+       book[:contributors] = tmp_hash
+
+       tmp_data, tmp_hash = [], {}
+       book[:category].each do |category|
+         tmp_data << category_extractor.extract_categories_from("http://www.biblionet.gr/index/#{category[:b_id]}")
+       end
+       book[:category] = tmp_data
+
+       tmp_data = []
+       tmp_data << publisher_extractor.load_and_extract_publisher("http://www.biblionet.gr/com/#{book[:publisher][:b_id]}")
+       book[:publisher] = tmp_data
+
+       book
+     end
+
+   end
+
+
+   class Crawler
+     include FileManager
+     attr_accessor :site
+
+     def initialize(options = {})
+       options = DEFAULTS.merge(options)
+       @site = options[:site]
+     end
+
+     def publishers
+       # crawler = Biblionet::Crawlers::Base.new(start: 1, finish: 100, step: 10)
+       # crawler.spider do |url, path|
+       #   puts "URL: #{url}, PATH: #{path}"
+       # end
+       # puts Biblionet::Extractors::Base.new("http://www.biblionet.gr/com/245").page
+       crawler = Biblionet::Crawlers::PublisherCrawler.new
+       crawler.crawl_and_save
+     end
+
+   end
+
+   # module Biblionet
+   #   class Extract
+   #     class << self
+   #       def author(uri=nil)
+   #         author_extractor = BiblionetParser::Core::AuthorExtractor.new
+   #         author_extractor.load_and_extract_author(uri)
+   #       end
+
+   #       def book(uri=nil)
+   #         bp = BiblionetParser::Core::BookParser.new
+   #         bp.load_and_parse_book(uri)
+   #       end
+
+   #       def categories(uri=nil)
+   #         category_extractor = BiblionetParser::Core::DDCParser.new
+   #         category_extractor.extract_categories_from(uri)
+   #       end
+
+   #     end
+   #   end
+   # end
+ end
+
+
+ # ae = BiblionetParser::Core::AuthorExtractor.new
+ # ae.load_and_extract_author('storage/html_author_pages/0/author_5.html')
+
+
+ # Biblionet::Extract.author('storage/html_author_pages/0/author_5.html')
+ # Biblionet::Extract.author('storage/html_author_pages/2/author_2423.html')
+ # Biblionet::Extract.author('storage/html_author_pages/0/author_764.html')
+ # Biblionet::Extract.author('storage/html_author_pages/0/author_435.html')
+
+ # bib = Bibliotheca.new
+ # categories = bib.parse_all_categories(true)
+
+ # p bib.list_files(path: 'raw_html_pages/2', extension: 'html')
+ # p bib.list_directories
+ # p categories[787]
+ # categories = 'test'
+ # bib.save_to('all_categories_test.json', categories)
+
+ # bp = BiblionetParser::Core::BookParser.new
+ # bp.load_and_parse_book('storage/raw_html_pages/96/book_96592.html') # BAD Book --no image
+ # bp.load_and_parse_book('storage/raw_html_pages/96/book_96937.html') # BAD Book --award
+ # bp.load_and_parse_book('storage/raw_html_pages/78/book_78836.html') # BAD Book --multiple awards
+ # bp.load_and_parse_book('storage/raw_html_pages/149/book_149345.html') # BAD Book --2 sets of details (ebooks, normals)
+ # bp.load_and_parse_book('storage/raw_html_pages/149/book_149402.html') # BAD Book --2 sets of details (normals, reviews)
+ # bp.load_and_parse_book('storage/raw_html_pages/149/book_149278.html') # BAD Book --3 sets of details (ebooks, normals, reviews)
+ # bp.load_and_parse_book('storage/raw_html_pages/149/book_149647.html')
+ # puts JSON.pretty_generate(bp.book)
+
+ # bp.load_and_parse_book('storage/raw_html_pages/70/book_70076.html') # BAD Book --Has comma inside award
+
+ # bp.load_and_parse_book('storage/raw_html_pages/70/book_70828.html') # BAD Book --No author. Collective Work
+ # puts JSON.pretty_generate(bp.book)
+
+ # bp.load_and_parse_book('storage/raw_html_pages/70/book_70829.html') # BAD Book --No author, No publisher. Collective Work
+ # puts JSON.pretty_generate(bp.book)
+
+ # bp.load_and_parse_book('storage/raw_html_pages/145/book_145326.html') # BAD Book --ISMN instead of ISBN
+
+ # bp.load_and_parse_book('storage/raw_html_pages/45/book_45455.html') # BAD Book --No author. Has contributors.
+ # puts JSON.pretty_generate(bp.book)
+
+
+ # bp.load_and_parse_book('storage/raw_html_pages/132/book_132435.html') # BAD Book --Two authors.
+ # puts JSON.pretty_generate(bp.book)
+
+ # bp.load_and_parse_book('storage/raw_html_pages/133/book_133435.html') # GOOD Book
+
+ # puts JSON.pretty_generate(bp.book)
+
+ # ddcp = BiblionetParser::Core::DDCParser.new('storage/raw_ddc_pages/0/ddc_298.html')
+ # pp all = ddcp.categories
+ # pp cur = ddcp.categories.values.last
+ # pp sel = ddcp.categories["2703"]
+
+ # bp.parse_book('12351', bp.page)
+
+ # bp.save_page('storage/mits_ts/mits1.json')
+
+ # pp bp.url='http://www.biblionet.gr/book/123351'
+ # pp bp.page
+
+ # pp bib.list_directories(path: 'storage/raw_html_pages')
+ # pp bib.list_files(path: "storage/raw_html_pages/24/", extension: 'html')
+
+ # bib = Bibliotheca.new
+ # bib.parse_all_books
+
+ # Good cases:
+ # 'storage/raw_html_pages/123/book_123351.html'
+ # 'storage/raw_html_pages/17/book_17351.html'
+ # 'storage/raw_html_pages/133/book_133435.html'
+
+ # Special book cases to check out:
+ # 'storage/raw_html_pages/96/book_96592.html' --no image
+ # 'storage/raw_html_pages/96/book_96937.html'
+
+ # Problematic at biblionet
+ # http://biblionet.gr/book/196388
+ # http://biblionet.gr/book/196386
+ # http://biblionet.gr/book/195525
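
For orientation, a minimal usage sketch of the Extractor API defined above. The ids are hypothetical example values, and the calls assume biblionet.gr is reachable:

require 'bookshark'

# Render responses as pretty-printed JSON instead of the default hash.
extractor = Bookshark::Extractor.new(format: 'pretty_json')

# With no :uri option, process_options builds the remote URL
# http://www.biblionet.gr/author/10207 from the id.
puts extractor.author(id: 10207)

# eager: true routes through eager_extract_book, which also resolves the
# book's authors, contributors, categories and publisher with extra requests.
puts extractor.book(id: 103788, eager: true)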
@@ -0,0 +1,42 @@
+ require 'rubygems'
+ require 'nokogiri'
+ require 'open-uri'
+ require 'fileutils'
+
+ require File.expand_path(File.join(File.dirname(__FILE__), '../extractors', 'base'))
+
+ DEFAULTS = {
+   folder: 'storage/html_author_pages',
+   base_url: 'http://www.biblionet.gr/author/',
+   extension: '.html',
+   first_id: 1,
+   last_id: 112000,
+   step: 1000
+ }
+
+ def crawl_and_save(options={})
+   options = DEFAULTS.merge(options)
+
+   start_id = options[:first_id] + options[:step] - 1
+   last_id = options[:last_id]
+   step = options[:step]
+
+   start_id.step(last_id, step) do |last|
+     first = last - step + 1
+     subfolder = (last/step - 1).to_s
+     path = "#{options[:folder]}/#{subfolder}/"
+
+     # Create a new directory (does nothing if directory exists)
+     FileUtils.mkdir_p path
+
+     first.upto(last) do |id|
+       file_to_save = "#{path}author_#{id}#{options[:extension]}"
+       url_to_download = "#{options[:base_url]}#{id}/"
+
+       downloader = Biblionet::Core::Base.new(url_to_download)
+       downloader.save_page(file_to_save) unless downloader.page.nil?
+
+     end
+   end
+
+ end
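
A minimal sketch of driving crawl_and_save above, assuming smaller bounds than the defaults:

# Fetches author pages 1..2000 into storage/html_author_pages/0/ and .../1/,
# skipping any page the downloader fails to load.
crawl_and_save(first_id: 1, last_id: 2000, step: 1000)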
@@ -0,0 +1,46 @@
+ require File.expand_path(File.join(File.dirname(__FILE__), '../extractors', 'base'))
+
+ module Biblionet
+   module Crawlers
+
+     class Base
+       def initialize(options = {})
+         @folder = options[:folder] ||= 'lib/bookshark/storage/html_base_pages'
+         @base_url = options[:base_url] ||= 'http://www.biblionet.gr/base/'
+         @page_type = options[:page_type] ||= 'base'
+         @extension = options[:extension] ||= '.html'
+         @start = options[:start] ||= 1
+         @finish = options[:finish] ||= 10000
+         @step = options[:step] ||= 1000
+       end
+
+       def spider
+         start = @start + @step - 1
+         finish = @finish
+
+         start.step(finish, @step) do |last|
+           first = last - @step + 1
+           subfolder = (last/@step - 1).to_s
+           path = "#{@folder}/#{subfolder}/"
+
+           # Create a new directory (does nothing if directory exists)
+           # FileUtils.mkdir_p path
+
+           first.upto(last) do |id|
+             file_to_save = "#{path}#{@page_type}_#{id}#{@extension}"
+             url_to_download = "#{@base_url}#{id}/"
+
+             yield(url_to_download, file_to_save)
+             # downloader = Biblionet::Core::Base.new(url_to_download)
+             # downloader.save_page(file_to_save) unless downloader.page.nil?
+
+           end
+         end
+       end
+
+
+     end
+
+
+   end
+ end
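
The commented-out snippet in Crawler#publishers above shows how this generic crawler is meant to be driven; spelled out as a sketch:

crawler = Biblionet::Crawlers::Base.new(start: 1, finish: 100, step: 10)
crawler.spider do |url, path|
  # Each iteration yields a biblionet URL and the local path to save it under,
  # e.g. "http://www.biblionet.gr/base/1/" and
  # "lib/bookshark/storage/html_base_pages/0/base_1.html".
  puts "URL: #{url}, PATH: #{path}"
end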