bookshark 1.0.0.alpha.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,82 @@
+ require_relative 'base'
+
+ module Biblionet
+   module Extractors
+
+     class CategoryExtractor < Base
+       attr_reader :categories
+
+       def initialize(uri=nil)
+         super(uri)
+         extract_categories unless uri.nil?
+       end
+
+       def extract_categories(category_page=@page)
+         page = Nokogiri::HTML(category_page)
+         parent, previous_indent, previous_id = nil, nil, nil
+
+         @categories = page.xpath("//a[@class='menu' and @href[contains(.,'/index/')]]").map do |category|
+           # Extract from the href the id used by biblionet. --- DdC url: http://biblionet.gr/index/id ---
+           biblionet_id = category[:href].split(/\//).last
+
+           # Get the text before <a>. It is expected to be a number of space characters.
+           spaces = category.previous_sibling.text # TODO: make sure the text is only spaces
+           # Indent size
+           indent = spaces.size
+
+           # Determine parent-child-sibling relationships based on indent.
+           # Indent size seems to be inconsistent, so it is better to compare sizes than to use them directly.
+           if (indent <=> previous_indent).nil?
+             previous_indent = indent
+           elsif (indent <=> previous_indent) > 0
+             parent = previous_id
+             previous_indent = indent
+           end
+
+           previous_id = biblionet_id
+
+           # Extract DdC id and DdC text.
+           category = process_category(category.text)
+
+           category.merge!(parent: parent)
+
+           {biblionet_id => category.clone}
+         end.reduce({}, :update) unless @page.nil? # Merge the per-category hashes into one hash keyed by biblionet id.
+
+         unless @categories.nil?
+           @categories[:current] = @categories[@biblionet_id.to_s].clone
+           @categories[:current][:b_id] = @biblionet_id
+         end
+
+         return @categories
+       end
+
+       def extract_categories_from(uri=nil)
+         load_page(uri)
+         extract_categories unless uri.nil?
+       end
+
+       private
+
+       def process_category(category)
+         # Matches the digits inside [] in text like: [889.09300] Νεοελληνική λογοτεχνία - Ιστορία και κριτική (300)
+         ddc_re = /(\[\d*(?:[\.|\s]\d*)*\])/
+
+         # Matches [digits] and (digits) in text like: [889.09300] Νεοελληνική λογοτεχνία - Ιστορία και κριτική (300)
+         non_text_re = /\s*(\[.*\]|\(\d*\))\s*/
+
+         category_ddc = category.scan(ddc_re).join.gsub(/[\[\]]/, '')
+         category_name = category.gsub(non_text_re, '').strip
+
+         { ddc: category_ddc, name: category_name }
+       end
+
+     end
+
+   end
+ end
+
+ # categoryp = CategoryExtractor.new("raw_category_pages/0/category_787.html")
+ # categoryp.extract_categories
+
+ # categoryp.filepath="category_1.html"
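
A minimal usage sketch for CategoryExtractor, assuming Base#load_page fetches the page and sets @biblionet_id; the /index/<id> URL shape follows the links the extractor itself scrapes, and id 787 mirrors the commented example above:

    extractor = Biblionet::Extractors::CategoryExtractor.new('http://www.biblionet.gr/index/787')
    # @categories maps each biblionet id to { ddc:, name:, parent: }, plus a :current
    # entry describing the category whose page was parsed.
    current = extractor.categories[:current]
    puts "#{current[:ddc]} #{current[:name]} (biblionet id #{current[:b_id]})"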
@@ -0,0 +1,138 @@
+ require_relative 'base'
+
+ module Biblionet
+   module Extractors
+
+     class PublisherExtractor < Base
+       attr_reader :publisher
+
+       def initialize(uri=nil)
+         super(uri)
+         extract_publisher unless uri.nil? or @page.nil?
+       end
+
+       def load_and_extract_publisher(uri=nil)
+         load_page(uri)
+         extract_publisher unless uri.nil? or @page.nil?
+       end
+
+       def extract_publisher(biblionet_id=@biblionet_id, publisher_page=@page)
+         puts "Extracting publisher: #{biblionet_id}"
+         page = PublisherDataExtractor.new(publisher_page)
+
+         headquarters = page.headquarters
+         bookstores = page.bookstores
+         bookstores['Έδρα'] = headquarters # 'Έδρα' means "headquarters".
+
+         publisher_hash = {}
+         publisher_hash[:name] = page.name
+         publisher_hash[:owner] = page.owner
+         publisher_hash[:bookstores] = bookstores
+         publisher_hash[:b_id] = biblionet_id
+
+         return @publisher = publisher_hash
+       end
+
+     end
+
+     class PublisherDataExtractor
+       attr_reader :nodeset
+
+       def initialize(document)
+         # No need to operate on the whole page. Just on the part containing the content.
+         content_re = /<!-- CONTENT START -->.*<!-- CONTENT END -->/m
+         if (content_re.match(document)).nil?
+           puts document
+         end
+         content = content_re.match(document)[0]
+
+         @nodeset = Nokogiri::HTML(content)
+       end
+
+       def name
+         @nodeset.css('h1.page_title').text.strip
+       end
+
+       def owner
+         # The owner is the text that sits between the page title and the details table.
+         return (@nodeset.xpath("//h1[@class='page_title'][1]/following::text()") & @nodeset.xpath("//table[@class='book_details'][1]/preceding::text()")).text.strip
+       end
+
+       def headquarters
+         headquarters_hash = {}
+         temp_array = []
+         current_key = nil
+         last_key = nil
+
+         @nodeset.xpath("//table[@class='book_details'][1]//tr").each do |item|
+           key = item.children[0].text.strip
+           current_key = key.end_with?(":") ? key[0..-2] : last_key
+           value = item.children[1].text.strip
+
+           unless key.empty? and value.empty?
+             if current_key == last_key
+               # A row that continues the previous key: collect its values into an array.
+               temp_array << headquarters_hash[current_key] unless headquarters_hash[current_key].is_a?(Array)
+               temp_array << value.gsub(/,$/, '').strip unless value.empty?
+               headquarters_hash[current_key] = temp_array
+             else
+               temp_array = []
+               headquarters_hash[current_key] = value.gsub(/,$/, '').strip
+             end
+           end
+
+           last_key = current_key
+         end
+
+         # Change the keys. Use the same ones as in bookstores.
+         mappings = {"Διεύθυνση" => :address, "Τηλ" => :telephone, "FAX" => :fax, "E-mail" => :email, "Web site" => :website}
+         headquarters_hash = Hash[headquarters_hash.map {|k, v| [mappings[k], v] }]
+         headquarters_hash[:website] = headquarters_hash[:website].split(',').map(&:strip) if headquarters_hash[:website].to_s.include? ','
+
+         return headquarters_hash
+       end
+
+       def bookstores
+         bookstores_hash = Hash.new { |h,k| h[k] = {} }
+         address_array = []
+         tel_array = []
+
+         # Default key ('Βιβλιοπωλείο' means "bookstore") in case there is none.
+         key = 'Βιβλιοπωλείο'
+
+         @nodeset.css('//p[align="justify"]').inner_html.split('<br>').map(&:strip).reject(&:empty?).each do |item|
+           regex_tel = /\d{3} \d{7}/
+           regex_tk = /\d{3} \d{2}/
+           regex_email = /([\w+\-].?)+@[a-z\d\-]+(\.[a-z]+)*\.[a-z]+/i
+           regex_url = /((http(?:s)?\:\/\/)?[a-zA-Z0-9\-]+(?:\.[a-zA-Z0-9\-]+)*\.[a-zA-Z]{2,6}(?:\/?|(?:\/[\w\-]+)*)(?:\/?|\/\w+\.[a-zA-Z]{2,4}(?:\?[\w]+\=[\w\-]+)?)?(?:\&[\w]+\=[\w\-]+)*)/ix
+
+           if item.end_with?(":")
+             key = item[0..-2]
+             address_array = []
+             tel_array = []
+           elsif (item.start_with?("Fax") or item.start_with?("fax")) and item =~ regex_tel
+             bookstores_hash[key][:fax] = item.gsub(/[^\d ]/, '').strip # Keep only digits and spaces.
+           elsif item =~ regex_tel
+             tel_array << item.gsub(/[^\d ]/, '').strip
+             bookstores_hash[key][:telephone] = tel_array
+           elsif item =~ regex_tk
+             address_array << item.gsub(/,$/, '').strip
+             bookstores_hash[key][:address] = address_array
+           elsif item =~ regex_email
+             bookstores_hash[key][:email] = (regex_email.match(item))[0]
+           elsif item =~ regex_url
+             bookstores_hash[key][:website] = item[regex_url, 1]
+           else
+             address_array << item.gsub(/,$/, '').strip
+             bookstores_hash[key][:address] = address_array
+           end
+         end
+
+         return bookstores_hash
+       end
+
+     end
+
+   end
+ end
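
A usage sketch for PublisherExtractor; the /com/<id> URL pattern is an assumption (inferred from the com/com_ID parameters of the search form below), and publisher id 20 mirrors the spec data:

    extractor = Biblionet::Extractors::PublisherExtractor.new('http://www.biblionet.gr/com/20')
    publisher = extractor.publisher
    puts publisher[:name]            # from h1.page_title
    puts publisher[:bookstores].keys # bookstore names, including the 'Έδρα' (headquarters) entry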
@@ -0,0 +1,104 @@
+ require_relative 'book_extractor'
+
+ module Biblionet
+   module Extractors
+
+     class Search < BookExtractor
+       def initialize(options = {})
+         perform_search(options) unless options.empty?
+       end
+
+       def perform_search(options = {})
+         search_url = build_search_url(options)
+         load_page(URI.encode(search_url)) # The page gets loaded into the @page variable.
+
+         book_ids = []
+
+         # No need to operate on the whole page. Just on the part containing the books.
+         content_re = /<!-- CONTENT START -->.*<!-- CONTENT END -->/m
+         if (content_re.match(@page)).nil?
+           puts @page
+         end
+         content = content_re.match(@page)[0]
+
+         nodeset = Nokogiri::HTML(content)
+         nodeset.xpath("//a[@class='booklink' and @href[contains(.,'/book/')]]").each do |item|
+           book_ids << item[:href].split("/")[2]
+         end
+
+         books = []
+
+         if options[:results_type] == 'ids'
+           return book_ids
+         elsif options[:results_type] == 'metadata'
+           book_ids.each do |id|
+             url = "http://www.biblionet.gr/book/#{id}"
+             books << load_and_extract_book(url)
+           end
+         end
+
+         return books
+       end
+
+       def build_search_url(options = {})
+         title = present?(options[:title]) ? options[:title].gsub(' ','+') : ''
+         author = present?(options[:author]) ? options[:author].gsub(' ','+') : ''
+         publisher = present?(options[:publisher]) ? options[:publisher].gsub(' ','+') : ''
+         category = present?(options[:category]) ? options[:category].gsub(' ','+') : ''
+
+         title_split = options[:title_split] || '1'
+         book_id = options[:book_id] || ''
+         isbn = options[:isbn] || ''
+         author_id = options[:author_id] || ''
+         publisher_id = options[:publisher_id] || ''
+         category_id = options[:category_id] || ''
+         after_year = options[:after_year] || ''
+         before_year = options[:before_year] || ''
+
+         url_builder = StringBuilder.new
+         url_builder.append('http://www.biblionet.gr/main.asp?page=results')
+         url_builder.append('&title=')
+         url_builder.append(title)
+         url_builder.append('&TitleSplit=')
+         url_builder.append(title_split)
+         url_builder.append('&Titlesid=')
+         url_builder.append(book_id)
+         url_builder.append('&isbn=')
+         url_builder.append(isbn)
+         url_builder.append('&person=')
+         url_builder.append(author)
+         url_builder.append('&person_ID=')
+         url_builder.append(author_id)
+         url_builder.append('&com=')
+         url_builder.append(publisher)
+         url_builder.append('&com_ID=')
+         url_builder.append(publisher_id)
+         url_builder.append('&from=')
+         url_builder.append(after_year)
+         url_builder.append('&untill=')
+         url_builder.append(before_year)
+         url_builder.append('&subject=')
+         url_builder.append(category)
+         url_builder.append('&subject_ID=')
+         url_builder.append(category_id)
+         url_builder.build
+       end
+
+     end
+
+     class StringBuilder
+       def initialize
+         @string = []
+       end
+
+       def append(text)
+         @string << text
+       end
+
+       def build
+         @string.join
+       end
+     end
+
+   end
+ end
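
A usage sketch for Search, with the same options the spec file below exercises; 'σημεια και τερατα' is a Greek title query ("signs and wonders") that URI.encode percent-escapes:

    search = Biblionet::Extractors::Search.new
    ids   = search.perform_search(title: 'σημεια και τερατα', results_type: 'ids')      # array of book id strings
    books = search.perform_search(title: 'σημεια και τερατα', results_type: 'metadata') # full book hashes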
@@ -0,0 +1,103 @@
+ require 'fileutils'
+ require 'json'
+
+ module FileManager
+
+   DEFAULTS = {
+     path: '',
+     all: false,
+     extension: '',
+   }
+
+   # Lists directories in the current path or in the path specified by the options hash.
+   #
+   # ==== Attributes
+   #
+   # * +options+ - The options hash accepts options for a more specialized directory search operation.
+   #
+   # ==== Options
+   #
+   # * +:path+ - The path where the directory search will happen.
+   # * +:all+ - If true, recursive search is enabled.
+   #
+   def list_directories(options = {})
+     options = DEFAULTS.merge(options)
+
+     path = options[:path]
+     all = options[:all]
+
+     path = "#{path}/" unless path == '' or path.end_with?('/')
+     path = path+'**/' if all
+
+     Dir.glob("#{path}*/")
+   end
+
+   # Returns a list of all files in the current directory or as specified in the options hash.
+   #
+   # ==== Attributes
+   #
+   # * +options+ - The options hash accepts options for a more specialized file search operation.
+   #
+   # ==== Options
+   #
+   # * +:path+ - The path where the file search will happen.
+   # * +:extension+ - The extension of the target files.
+   # * +:all+ - If true, recursive search is enabled.
+   #
+   # ==== Examples
+   #
+   #   files = list_files
+   #   files = list_files path: 'html_pages'
+   #   files = list_files path: 'raw_html_pages/2', extension: 'html'
+   #   files = list_files(path: 'ddc_pages', extension: 'json', all: true).each do |file|
+   #     file.do_something
+   #   end
+   #
+   def list_files(options = {})
+     options = DEFAULTS.merge(options)
+
+     path = options[:path]
+     all = options[:all]
+     extension = options[:extension]
+
+     extension = ".#{extension}" unless extension == '' or extension.start_with?('.')
+     file_wildcard = "*#{extension}"
+
+     path = "#{path}/" unless path == '' or path.end_with?('/')
+     path = path+'**/' if all
+
+     Dir.glob("#{path}#{file_wildcard}")
+   end
+
+   # Saves some text/string to a file.
+   #
+   # ==== Attributes
+   #
+   # * +path+ - The path to the file (including the filename) where the content will be saved.
+   # * +content+ - The text which will be saved to the file.
+   #
+   # ==== Examples
+   #
+   #   save_to('data_pages/categories/cat_15.txt', 'Some text')
+   #
+   def save_to(path, content)
+     begin
+       dir = File.dirname(path)
+       # Create the directory (does nothing if it already exists).
+       FileUtils.mkdir_p dir
+
+       File.open(path, "w") do |f|
+         f.write(content)
+       end
+     rescue StandardError => e
+       puts e
+     end
+   end
+
+ end
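
A usage sketch for the FileManager mixin, combining the examples from its own documentation; PageStore is a hypothetical host class:

    class PageStore
      include FileManager # mixes in list_files, list_directories, save_to
    end

    store = PageStore.new
    store.save_to('data_pages/categories/cat_15.txt', 'Some text')
    store.list_files(path: 'ddc_pages', extension: 'json', all: true).each do |file|
      puts file
    end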
@@ -0,0 +1,3 @@
+ module Bookshark
+   VERSION = "1.0.0.alpha.2"
+ end
@@ -0,0 +1,96 @@
+ require 'spec_helper'
+
+ describe Bookshark::Extractor do
+   subject { Bookshark::Extractor.new(format: 'pretty_json') }
+   let(:author_13219)      { JSON.pretty_generate(JSON.parse(open(File.expand_path("test_data/author_13219.json", File.dirname(__FILE__))).read)) }
+   let(:publisher_20)      { JSON.pretty_generate(JSON.parse(open(File.expand_path("test_data/publisher_20.json", File.dirname(__FILE__))).read)) }
+   let(:category_1041)     { JSON.pretty_generate(JSON.parse(open(File.expand_path("test_data/category_1041.json", File.dirname(__FILE__))).read)) }
+   let(:book_103788)       { JSON.pretty_generate(JSON.parse(open(File.expand_path("test_data/book_103788.json", File.dirname(__FILE__))).read)) }
+   let(:eager_book_184923) { JSON.pretty_generate(JSON.parse(open(File.expand_path("test_data/eager_book_184923.json", File.dirname(__FILE__))).read)) }
+   let(:search_01)         { JSON.pretty_generate(JSON.parse(open(File.expand_path("test_data/search_01.json", File.dirname(__FILE__))).read)) }
+   let(:search_ids_01)     { JSON.pretty_generate(JSON.parse(open(File.expand_path("test_data/search_ids_01.json", File.dirname(__FILE__))).read)) }
+
+   describe '#author' do
+     context 'from remote html source' do
+       it 'reads html from the web and extracts author data' do
+         expect(subject.author(id: 13219)).to eq author_13219
+       end
+     end
+
+     context 'from local storage' do
+       it 'reads html from file and extracts author data' do
+         file_name = File.expand_path("test_data/author_13219.html", File.dirname(__FILE__))
+         expect(subject.author(uri: file_name)).to eq author_13219
+       end
+     end
+   end
+
+   describe '#publisher' do
+     context 'extract from remote html source' do
+       it 'reads html from the web and extracts publisher data' do
+         expect(subject.publisher(id: 20)).to eq publisher_20
+       end
+     end
+   end
+
+   describe '#category' do
+     context 'extract from remote html source' do
+       it 'reads html from the web and extracts category data' do
+         expect(subject.category(id: 1041)).to eq category_1041
+       end
+     end
+   end
+
+   describe '#book' do
+     context 'extract from remote html source' do
+       it 'reads html from the web and extracts book data' do
+         expect(subject.book(id: 103788)).to eq book_103788
+       end
+
+       it 'reads html from the web and eager extracts all book and reference data' do
+         expect(subject.book(id: 184923, eager: true)).to eq eager_book_184923
+       end
+     end
+   end
+
+   describe '#search' do
+     context 'extract from remote html source' do
+       it 'builds a search url and extracts book ids from search page' do
+         expect(subject.search(title: 'σημεια και τερατα', results_type: 'ids')).to eq search_ids_01
+       end
+
+       it 'builds a search url and extracts book data from search page' do
+         expect(subject.search(title: 'σημεια και τερατα', results_type: 'metadata')).to eq search_01
+       end
+     end
+   end
+
+   describe '#process_options' do
+     context 'with valid options' do
+       it 'returns a biblionet url when there is no local option set' do
+         expect(subject.send(:process_options, {id: 56}, 'author')).to eq("http://www.biblionet.gr/author/56")
+       end
+
+       it 'returns a local path when the local option is set to true' do
+         expect(subject.send(:process_options, {id: 56, local: true}, 'author')).to eq("#{Bookshark::path_to_storage}/html_author_pages/0/author_56.html")
+       end
+
+       it 'returns the given uri' do
+         expect(subject.send(:process_options, {uri: 'http://www.biblionet.gr/book/5487', id: 56, local: true}, 'book')).to eq("http://www.biblionet.gr/book/5487")
+       end
+
+       it 'returns the given uri if uri option is set even if other options are set' do
+         expect(subject.send(:process_options, {uri: 'http://www.biblionet.gr/book/87', id: 56, local: true}, 'book')).to eq("http://www.biblionet.gr/book/87")
+       end
+     end
+
+     context 'with invalid options' do
+       it 'returns the given uri' do
+
+       end
+     end
+   end
+
+ end