bookshark 1.0.0.alpha.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
 - data/.gitignore +20 -0
 - data/.rspec +3 -0
 - data/Gemfile +4 -0
 - data/LICENSE.txt +22 -0
 - data/README.md +453 -0
 - data/Rakefile +4 -0
 - data/bookshark.gemspec +29 -0
 - data/lib/bookshark.rb +371 -0
 - data/lib/bookshark/crawlers/author_crawler.rb +42 -0
 - data/lib/bookshark/crawlers/base.rb +46 -0
 - data/lib/bookshark/crawlers/book_crawler.rb +55 -0
 - data/lib/bookshark/crawlers/category_crawler.rb +55 -0
 - data/lib/bookshark/crawlers/publisher_crawler.rb +35 -0
 - data/lib/bookshark/extractors/author_extractor.rb +116 -0
 - data/lib/bookshark/extractors/base.rb +187 -0
 - data/lib/bookshark/extractors/book_extractor.rb +453 -0
 - data/lib/bookshark/extractors/category_extractor.rb +82 -0
 - data/lib/bookshark/extractors/publisher_extractor.rb +138 -0
 - data/lib/bookshark/extractors/search.rb +104 -0
 - data/lib/bookshark/storage/file_manager.rb +103 -0
 - data/lib/bookshark/version.rb +3 -0
 - data/spec/bookshark_spec.rb +96 -0
 - data/spec/spec_helper.rb +1 -0
 - data/spec/test_data/author_13219.html +313 -0
 - data/spec/test_data/author_13219.json +23 -0
 - data/spec/test_data/book_103788.json +49 -0
 - data/spec/test_data/category_1041.json +42 -0
 - data/spec/test_data/eager_book_184923.json +215 -0
 - data/spec/test_data/publisher_20.json +43 -0
 - data/spec/test_data/search_01.json +355 -0
 - data/spec/test_data/search_ids_01.json +13 -0
 - data/tasks/console.rake +4 -0
 - data/tasks/rspec.rake +3 -0
 - metadata +191 -0
 
    
        data/Rakefile
    ADDED
    
    
    
        data/bookshark.gemspec
    ADDED
    
    | 
         @@ -0,0 +1,29 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            # coding: utf-8
            # Gem specification for bookshark. lib/ is put on the load path first so
            # that the version constant can be required from the source tree.
            lib_dir = File.expand_path('../lib', __FILE__)
            $LOAD_PATH.unshift(lib_dir) unless $LOAD_PATH.include?(lib_dir)
            require 'bookshark/version'

            Gem::Specification.new do |spec|
              # Identity and authorship.
              spec.name        = "bookshark"
              spec.version     = Bookshark::VERSION
              spec.authors     = ["Dimitris Klisiaris"]
              spec.email       = ["dklisiaris@gmail.com"]
              spec.summary     = "Book metadata extractor from biblionet.gr."
              spec.description = "Extracts book, author, publisher and category metadata from biblionet.gr."
              spec.homepage    = "https://github.com/dklisiaris/bookshark"
              spec.license     = "MIT"

              # Packaged files are whatever git tracks (NUL-separated listing).
              spec.files         = `git ls-files -z`.split("\x0")
              spec.executables   = spec.files.grep(%r{^bin/}).map { |path| File.basename(path) }
              spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
              spec.require_paths = ["lib"]

              # Runtime dependencies, declared in their original order.
              {
                "nokogiri"     => ["~> 1.6", ">= 1.6.6"],
                "sanitize"     => ["~> 3.1"],
                "json"         => ["~> 1.8"],
                "htmlentities" => ["~> 4.3"]
              }.each { |gem_name, requirements| spec.add_dependency(gem_name, *requirements) }

              # Development-only dependencies.
              {
                "bundler" => "~> 1.7",
                "rake"    => "~> 10.0",
                "rspec"   => "~> 3.1"
              }.each { |gem_name, requirement| spec.add_development_dependency(gem_name, requirement) }
            end
         
     | 
    
        data/lib/bookshark.rb
    ADDED
    
    | 
         @@ -0,0 +1,371 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            require "bookshark/version"
         
     | 
| 
      
 2 
     | 
    
         
            +
            require 'bookshark/storage/file_manager'
         
     | 
| 
      
 3 
     | 
    
         
            +
             
     | 
| 
      
 4 
     | 
    
         
            +
            require 'bookshark/extractors/author_extractor'
         
     | 
| 
      
 5 
     | 
    
         
            +
            require 'bookshark/extractors/category_extractor'
         
     | 
| 
      
 6 
     | 
    
         
            +
            require 'bookshark/extractors/book_extractor'
         
     | 
| 
      
 7 
     | 
    
         
            +
            require 'bookshark/extractors/publisher_extractor'
         
     | 
| 
      
 8 
     | 
    
         
            +
            require 'bookshark/extractors/search'
         
     | 
| 
      
 9 
     | 
    
         
            +
             
     | 
| 
      
 10 
     | 
    
         
            +
            require 'bookshark/crawlers/base'
         
     | 
| 
      
 11 
     | 
    
         
            +
            require 'bookshark/crawlers/publisher_crawler'
         
     | 
| 
      
 12 
     | 
    
         
            +
             
     | 
| 
      
 13 
     | 
    
         
            +
            module Bookshark
         
     | 
| 
      
 14 
     | 
    
         
            +
              # Fallback option values; the Extractor and Crawler constructors merge
              # the caller's options over these.
              DEFAULTS = { site: 'biblionet', format: 'hash' }
         
     | 
| 
      
 18 
     | 
    
         
            +
             
     | 
| 
      
 19 
     | 
    
         
            +
              # Returns the parent of the directory that contains this file.
              def self.root
                File.expand_path('..', __dir__)
              end
         
     | 
| 
      
 22 
     | 
    
         
            +
             
     | 
| 
      
 23 
     | 
    
         
            +
              # Returns the lib/bookshark/storage path located under +root+.
              def self.path_to_storage
                File.join(root, 'lib', 'bookshark', 'storage')
              end
         
     | 
| 
      
 26 
     | 
    
         
            +
             
     | 
| 
      
 27 
     | 
    
         
            +
             
     | 
| 
      
 28 
     | 
    
         
            +
              class Extractor
         
     | 
| 
      
 29 
     | 
    
         
            +
                include FileManager
         
     | 
| 
      
 30 
     | 
    
         
            +
                attr_accessor :site, :format    
         
     | 
| 
      
 31 
     | 
    
         
            +
             
     | 
| 
      
 32 
     | 
    
         
            +
                # Builds an extractor. Keys missing from +options+ fall back to DEFAULTS.
                #
                # options - Hash that may carry :site and :format.
                def initialize(options = {})
                  settings = DEFAULTS.merge(options)
                  @site, @format = settings.values_at(:site, :format)
                end
         
     | 
| 
      
 37 
     | 
    
         
            +
             
     | 
| 
      
 38 
     | 
    
         
            +
                # Extracts one author page and wraps the result under the :author key.
                #
                # options - Hash; passed to process_options to resolve the page
                #           location. :format overrides the instance format and is
                #           written back into the hash when absent.
                #
                # Returns the response in the requested format.
                def author(options = {})
                  uri = process_options(options, __method__)
                  options[:format] ||= @format

                  extracted = Biblionet::Extractors::AuthorExtractor.new.load_and_extract_author(uri)
                  change_format({ author: [extracted] }, options[:format])
                end
         
     | 
| 
      
 50 
     | 
    
         
            +
             
     | 
| 
      
 51 
     | 
    
         
            +
                # Extracts one publisher page and wraps the result under the
                # :publisher key.
                #
                # options - Hash; passed to process_options to resolve the page
                #           location. :format overrides the instance format and is
                #           written back into the hash when absent.
                #
                # Returns the formatted response after it has been passed through the
                # extractor's decode_text.
                def publisher(options = {})
                  uri = process_options(options, __method__)
                  options[:format] ||= @format

                  extractor = Biblionet::Extractors::PublisherExtractor.new
                  payload   = { publisher: [extractor.load_and_extract_publisher(uri)] }
                  extractor.decode_text(change_format(payload, options[:format]))
                end
         
     | 
| 
      
 66 
     | 
    
         
            +
             
     | 
| 
      
 67 
     | 
    
         
            +
                # Extracts one book page and wraps the result under the :book key.
                #
                # options - Hash; passed to process_options to resolve the page
                #           location. :format overrides the instance format; a truthy
                #           :eager routes extraction through eager_extract_book.
                #           Both keys are written back into the hash when absent.
                #
                # Returns the formatted response after it has been passed through the
                # extractor's decode_text.
                def book(options = {})
                  extractor = Biblionet::Extractors::BookExtractor.new

                  uri = process_options(options, __method__)
                  options[:format] ||= @format
                  options[:eager]  ||= false

                  record = options[:eager] ? eager_extract_book(uri) : extractor.load_and_extract_book(uri)
                  extractor.decode_text(change_format({ book: [record] }, options[:format]))
                end
         
     | 
| 
      
 87 
     | 
    
         
            +
             
     | 
| 
      
 88 
     | 
    
         
            +
                # Extracts one category page and wraps the result under the :category
                # key.
                #
                # options - Hash; passed to process_options to resolve the page
                #           location. :format overrides the instance format and is
                #           written back into the hash when absent.
                #
                # Returns the response in the requested format.
                def category(options = {})
                  uri = process_options(options, __method__)
                  options[:format] ||= @format

                  found = Biblionet::Extractors::CategoryExtractor.new.extract_categories_from(uri)
                  change_format({ category: [found] }, options[:format])
                end
         
     | 
| 
      
 101 
     | 
    
         
            +
             
     | 
| 
      
 102 
     | 
    
         
            +
                # Runs a search and wraps the results under the :book key.
                #
                # options - Hash handed to the search engine's perform_search.
                #           :format defaults to the instance format and :results_type
                #           to 'metadata'; both are written back into the hash.
                #
                # Returns the response in the requested format.
                def search(options = {})
                  options[:format]       ||= @format
                  options[:results_type] ||= 'metadata'

                  found = Biblionet::Extractors::Search.new.perform_search(options)
                  change_format({ book: found }, options[:format])
                end
         
     | 
| 
      
 115 
     | 
    
         
            +
             
     | 
| 
      
 116 
     | 
    
         
            +
                # Extracts categories from every html file listed under
                # 'storage/raw_ddc_pages' and merges them into a single hash.
                #
                # will_save - when truthy, the merged hash is also written as JSON to
                #             'storage/all_categories.json' via save_to.
                #
                # Returns the merged categories hash.
                def parse_all_categories(will_save=false)
                  extractor      = Biblionet::Extractors::CategoryExtractor.new
                  all_categories = {}

                  list_files(path: 'storage/raw_ddc_pages', extension: 'html', all: true).each do |file|
                    found = extractor.extract_categories_from(file)
                    # Files that yield nothing contribute nothing to the merge.
                    next if found.nil? || found.empty?

                    all_categories.merge!(found)
                  end

                  save_to('storage/all_categories.json', all_categories.to_json) if will_save

                  all_categories
                end
         
     | 
| 
      
 135 
     | 
    
         
            +
             
     | 
| 
      
 136 
     | 
    
         
            +
                # Walks the directories listed under 'storage/raw_html_pages', extracts
                # a book from each html file and saves it as pretty-printed JSON in
                # the matching 'books' directory. A source directory is skipped when
                # its target directory already exists.
                def parse_all_books
                  extractor = Biblionet::Extractors::BookExtractor.new

                  list_directories(path: 'storage/raw_html_pages').each do |dir|
                    dir_to_save = dir.gsub(/raw_html_pages/, 'books')
                    next if File.directory?(dir_to_save)

                    list_files(path: dir, extension: 'html', all: true).each do |file|
                      pp file
                      book = extractor.load_and_extract_book(file)

                      # Same base name as the html file, with a .json extension.
                      target = "#{dir_to_save}#{File.basename(file, ".*")}.json"
                      extractor.save_to(target, JSON.pretty_generate(book))
                    end
                  end
                end
         
     | 
| 
      
 159 
     | 
    
         
            +
             
     | 
| 
      
 160 
     | 
    
         
            +
                private
         
     | 
| 
      
 161 
     | 
    
         
            +
             
     | 
| 
      
 162 
     | 
    
         
            +
                # Resolves the location of the page a public extraction method should
                # load.
                #
                # options - Hash that may carry:
                #           :id    - record id (Integer, or a String holding an integer).
                #           :local - truthy to point at a stored html page under
                #                    Bookshark::path_to_storage instead of the site.
                #           :uri   - explicit location; wins over anything derived
                #                    from :id.
                # caller  - name of the calling method ('author', 'publisher', 'book'
                #           or 'category', as String or Symbol).
                #
                # Side effects: when :id is given, options[:local] is defaulted to
                # false; options[:uri] is always set to the returned value.
                #
                # Returns the resolved location String, or nil when neither :uri nor a
                # usable :id/caller pair is available.
                def process_options(options = {}, caller = nil)
                  id  = options[:id]
                  url = nil

                  if id
                    # Site url segment, stored-page name and ids-per-directory for each caller.
                    url_method, page_type, bucket_size =
                      case caller.to_s
                      when 'author'    then ['author', 'author',    1000]
                      when 'publisher' then ['com',    'publisher', 100]
                      when 'book'      then ['book',   'book',      1000]
                      when 'category'  then ['index',  'ddc',       1000]
                      end

                    options[:local] ||= false

                    if url_method.nil?
                      # Unknown caller: report on stderr and derive nothing, rather
                      # than building a location with empty segments.
                      warn "Called from unknown method. Probably its rspec."
                    elsif options[:local]
                      # The directory number is only needed for stored pages, so
                      # remote lookups never do arithmetic on the id.
                      bucket = (Integer(id) - 1) / bucket_size
                      url = "#{Bookshark::path_to_storage}/html_#{page_type}_pages/#{bucket}/#{page_type}_#{id}.html"
                    else
                      url = "http://www.biblionet.gr/#{url_method}/#{id}"
                    end
                  end

                  options[:uri] ||= url
                end
         
     | 
| 
      
 194 
     | 
    
         
            +
             
     | 
| 
      
 195 
     | 
    
         
            +
                # Converts a response hash to the requested output format.
                #
                # hash   - the response to convert.
                # format - 'json' for compact JSON, 'pretty_json' for pretty-printed
                #          JSON; 'hash' or any other value leaves the hash untouched.
                #
                # Returns a JSON String or the hash itself.
                def change_format(hash, format)
                  case format
                  when 'json'        then hash.to_json
                  when 'pretty_json' then JSON.pretty_generate(hash)
                  else hash
                  end
                end
         
     | 
| 
      
 206 
     | 
    
         
            +
             
     | 
| 
      
 207 
     | 
    
         
            +
                # Extracts a book and then replaces its :author, :contributors,
                # :category and :publisher entries with the records extracted from
                # the corresponding biblionet.gr pages, looked up by each entry's
                # :b_id.
                #
                # uri - location of the book page.
                #
                # Returns the book hash with the expanded entries.
                def eager_extract_book(uri)
                  book_extractor      = Biblionet::Extractors::BookExtractor.new
                  author_extractor    = Biblionet::Extractors::AuthorExtractor.new
                  publisher_extractor = Biblionet::Extractors::PublisherExtractor.new
                  category_extractor  = Biblionet::Extractors::CategoryExtractor.new

                  book = book_extractor.load_and_extract_book(uri)

                  book[:author] = book[:author].map do |author|
                    author_extractor.load_and_extract_author("http://www.biblionet.gr/author/#{author[:b_id]}")
                  end

                  # Contributors stay grouped by job; each group becomes a list of
                  # extracted author records.
                  book[:contributors] = book[:contributors].each_with_object({}) do |(job, people), by_job|
                    by_job[job] = people.map do |contributor|
                      author_extractor.load_and_extract_author("http://www.biblionet.gr/author/#{contributor[:b_id]}")
                    end
                  end

                  book[:category] = book[:category].map do |category|
                    category_extractor.extract_categories_from("http://www.biblionet.gr/index/#{category[:b_id]}")
                  end

                  # The single publisher entry becomes a one-element list.
                  book[:publisher] = [
                    publisher_extractor.load_and_extract_publisher("http://www.biblionet.gr/com/#{book[:publisher][:b_id]}")
                  ]

                  book
                end
         
     | 
| 
      
 243 
     | 
    
         
            +
              
         
     | 
| 
      
 244 
     | 
    
         
            +
              end
         
     | 
| 
      
 245 
     | 
    
         
            +
             
     | 
| 
      
 246 
     | 
    
         
            +
             
     | 
| 
      
 247 
     | 
    
         
            +
              # Front-end object for launching the biblionet crawlers.
              # Mixes in FileManager and remembers the configured +site+.
              class Crawler
                include FileManager

                attr_accessor :site

                # @param options [Hash] overrides merged on top of DEFAULTS;
                #   only the resulting :site entry is read here.
                def initialize(options = {})
                  @site = DEFAULTS.merge(options)[:site]
                end

                # Builds a PublisherCrawler with no arguments and runs its
                # +crawl_and_save+, returning whatever that call returns.
                #
                # Earlier experiments, kept for reference:
                #   crawler = Biblionet::Crawlers::Base.new(start:1, finish:100, step:10)
                #   crawler.spider do |url, path|
                #     puts "URL: #{url}, PATH: #{path}"
                #   end
                #   puts Biblionet::Extractors::Base.new("http://www.biblionet.gr/com/245").page
                def publishers
                  Biblionet::Crawlers::PublisherCrawler.new.crawl_and_save
                end
              end
         
     | 
| 
      
 267 
     | 
    
         
            +
             
     | 
| 
      
 268 
     | 
    
         
            +
            #   module Biblionet
         
     | 
| 
      
 269 
     | 
    
         
            +
            #     class Extract
         
     | 
| 
      
 270 
     | 
    
         
            +
            #       class << self      
         
     | 
| 
      
 271 
     | 
    
         
            +
            #         def author(uri=nil)
         
     | 
| 
      
 272 
     | 
    
         
            +
            #           author_extractor = BiblionetParser::Core::AuthorExtractor.new
         
     | 
| 
      
 273 
     | 
    
         
            +
            #           author_extractor.load_and_extract_author(uri)
         
     | 
| 
      
 274 
     | 
    
         
            +
            #         end
         
     | 
| 
      
 275 
     | 
    
         
            +
             
     | 
| 
      
 276 
     | 
    
         
            +
            #         def book(uri=nil)
         
     | 
| 
      
 277 
     | 
    
         
            +
            #           bp = BiblionetParser::Core::BookParser.new
         
     | 
| 
      
 278 
     | 
    
         
            +
            #           bp.load_and_parse_book(uri)
         
     | 
| 
      
 279 
     | 
    
         
            +
            #         end
         
     | 
| 
      
 280 
     | 
    
         
            +
             
     | 
| 
      
 281 
     | 
    
         
            +
            #         def categories(uri=nil)
         
     | 
| 
      
 282 
     | 
    
         
            +
            #           category_extractor = BiblionetParser::Core::DDCParser.new
         
     | 
| 
      
 283 
     | 
    
         
            +
            #           category_extractor.extract_categories_from(uri)
         
     | 
| 
      
 284 
     | 
    
         
            +
            #         end
         
     | 
| 
      
 285 
     | 
    
         
            +
             
     | 
| 
      
 286 
     | 
    
         
            +
            #       end
         
     | 
| 
      
 287 
     | 
    
         
            +
            #     end
         
     | 
| 
      
 288 
     | 
    
         
            +
            #   end  
         
     | 
| 
      
 289 
     | 
    
         
            +
            end
         
     | 
| 
      
 290 
     | 
    
         
            +
             
     | 
| 
      
 291 
     | 
    
         
            +
             
     | 
| 
      
 292 
     | 
    
         
            +
            # ae = BiblionetParser::Core::AuthorExtractor.new
         
     | 
| 
      
 293 
     | 
    
         
            +
            # ae.load_and_extract_author('storage/html_author_pages/0/author_5.html')
         
     | 
| 
      
 294 
     | 
    
         
            +
             
     | 
| 
      
 295 
     | 
    
         
            +
             
     | 
| 
      
 296 
     | 
    
         
            +
            # Biblionet::Extract.author('storage/html_author_pages/0/author_5.html')
         
     | 
| 
      
 297 
     | 
    
         
            +
            # Biblionet::Extract.author('storage/html_author_pages/2/author_2423.html')
         
     | 
| 
      
 298 
     | 
    
         
            +
            # Biblionet::Extract.author('storage/html_author_pages/0/author_764.html')
         
     | 
| 
      
 299 
     | 
    
         
            +
            # Biblionet::Extract.author('storage/html_author_pages/0/author_435.html')
         
     | 
| 
      
 300 
     | 
    
         
            +
             
     | 
| 
      
 301 
     | 
    
         
            +
            # bib = Bibliotheca.new
         
     | 
| 
      
 302 
     | 
    
         
            +
            # categories = bib.parse_all_categories(true)
         
     | 
| 
      
 303 
     | 
    
         
            +
             
     | 
| 
      
 304 
     | 
    
         
            +
            # p bib.list_files(path: 'raw_html_pages/2', extension:'html')
         
     | 
| 
      
 305 
     | 
    
         
            +
            # p bib.list_directories
         
     | 
| 
      
 306 
     | 
    
         
            +
            # p categories[787]
         
     | 
| 
      
 307 
     | 
    
         
            +
            # categories = 'test'
         
     | 
| 
      
 308 
     | 
    
         
            +
            # bib.save_to('all_categories_test.json', categories)
         
     | 
| 
      
 309 
     | 
    
         
            +
             
     | 
| 
      
 310 
     | 
    
         
            +
            # bp = BiblionetParser::Core::BookParser.new
         
     | 
| 
      
 311 
     | 
    
         
            +
            # bp.load_and_parse_book('storage/raw_html_pages/96/book_96592.html') # BAD Book --no image
         
     | 
| 
      
 312 
     | 
    
         
            +
            # bp.load_and_parse_book('storage/raw_html_pages/96/book_96937.html') # BAD Book --award
         
     | 
| 
      
 313 
     | 
    
         
            +
            # bp.load_and_parse_book('storage/raw_html_pages/78/book_78836.html') # BAD Book --multiple awards
         
     | 
| 
      
 314 
     | 
    
         
            +
            # bp.load_and_parse_book('storage/raw_html_pages/149/book_149345.html') # BAD Book --2 sets of details (ebooks, normals)
         
     | 
| 
      
 315 
     | 
    
         
            +
            # bp.load_and_parse_book('storage/raw_html_pages/149/book_149402.html') # BAD Book --2 sets of details (normals, reviews)
         
     | 
| 
      
 316 
     | 
    
         
            +
            # bp.load_and_parse_book('storage/raw_html_pages/149/book_149278.html') # BAD Book --3 sets of details (ebooks, normals, reviews)
         
     | 
| 
      
 317 
     | 
    
         
            +
            # bp.load_and_parse_book('storage/raw_html_pages/149/book_149647.html')
         
     | 
| 
      
 318 
     | 
    
         
            +
            # puts JSON.pretty_generate(bp.book)
         
     | 
| 
      
 319 
     | 
    
         
            +
             
     | 
| 
      
 320 
     | 
    
         
            +
            # bp.load_and_parse_book('storage/raw_html_pages/70/book_70076.html') # BAD Book --Has comma inside award
         
     | 
| 
      
 321 
     | 
    
         
            +
             
     | 
| 
      
 322 
     | 
    
         
            +
            # bp.load_and_parse_book('storage/raw_html_pages/70/book_70828.html') # BAD Book --No author. Collective Work
         
     | 
| 
      
 323 
     | 
    
         
            +
            # puts JSON.pretty_generate(bp.book)
         
     | 
| 
      
 324 
     | 
    
         
            +
             
     | 
| 
      
 325 
     | 
    
         
            +
            # bp.load_and_parse_book('storage/raw_html_pages/70/book_70829.html') # BAD Book --No author, No publisher. Collective Work
         
     | 
| 
      
 326 
     | 
    
         
            +
            # puts JSON.pretty_generate(bp.book)
         
     | 
| 
      
 327 
     | 
    
         
            +
             
     | 
| 
      
 328 
     | 
    
         
            +
            # bp.load_and_parse_book('storage/raw_html_pages/145/book_145326.html') # BAD Book --ISMN instead of ISBN
         
     | 
| 
      
 329 
     | 
    
         
            +
             
     | 
| 
      
 330 
     | 
    
         
            +
            # bp.load_and_parse_book('storage/raw_html_pages/45/book_45455.html') # BAD Book --No author. Has contributors.
         
     | 
| 
      
 331 
     | 
    
         
            +
            # puts JSON.pretty_generate(bp.book)
         
     | 
| 
      
 332 
     | 
    
         
            +
             
     | 
| 
      
 333 
     | 
    
         
            +
             
     | 
| 
      
 334 
     | 
    
         
            +
            # bp.load_and_parse_book('storage/raw_html_pages/132/book_132435.html') # BAD Book --Two authors.
         
     | 
| 
      
 335 
     | 
    
         
            +
            # puts JSON.pretty_generate(bp.book)
         
     | 
| 
      
 336 
     | 
    
         
            +
             
     | 
| 
      
 337 
     | 
    
         
            +
            # bp.load_and_parse_book('storage/raw_html_pages/133/book_133435.html') # GOOD Book
         
     | 
| 
      
 338 
     | 
    
         
            +
             
     | 
| 
      
 339 
     | 
    
         
            +
            # puts JSON.pretty_generate(bp.book)
         
     | 
| 
      
 340 
     | 
    
         
            +
             
     | 
| 
      
 341 
     | 
    
         
            +
            # ddcp = BiblionetParser::Core::DDCParser.new('storage/raw_ddc_pages/0/ddc_298.html')
         
     | 
| 
      
 342 
     | 
    
         
            +
            # pp all = ddcp.categories
         
     | 
| 
      
 343 
     | 
    
         
            +
            # pp cur = ddcp.categories.values.last
         
     | 
| 
      
 344 
     | 
    
         
            +
            # pp sel = ddcp.categories["2703"]
         
     | 
| 
      
 345 
     | 
    
         
            +
             
     | 
| 
      
 346 
     | 
    
         
            +
            # bp.parse_book('12351', bp.page)
         
     | 
| 
      
 347 
     | 
    
         
            +
             
     | 
| 
      
 348 
     | 
    
         
            +
            # bp.save_page('storage/mits_ts/mits1.json')
         
     | 
| 
      
 349 
     | 
    
         
            +
             
     | 
| 
      
 350 
     | 
    
         
            +
            # pp bp.url='http://www.biblionet.gr/book/123351'
         
     | 
| 
      
 351 
     | 
    
         
            +
            # pp bp.page
         
     | 
| 
      
 352 
     | 
    
         
            +
             
     | 
| 
      
 353 
     | 
    
         
            +
            # pp bib.list_directories(path: 'storage/raw_html_pages')
         
     | 
| 
      
 354 
     | 
    
         
            +
            # pp bib.list_files(path: "storage/raw_html_pages/24/", extension: 'html')
         
     | 
| 
      
 355 
     | 
    
         
            +
             
     | 
| 
      
 356 
     | 
    
         
            +
            # bib = Bibliotheca.new
         
     | 
| 
      
 357 
     | 
    
         
            +
            # bib.parse_all_books
         
     | 
| 
      
 358 
     | 
    
         
            +
             
     | 
| 
      
 359 
     | 
    
         
            +
            # Good cases:
         
     | 
| 
      
 360 
     | 
    
         
            +
            # 'storage/raw_html_pages/123/book_123351.html'
         
     | 
| 
      
 361 
     | 
    
         
            +
            # 'storage/raw_html_pages/17/book_17351.html'
         
     | 
| 
      
 362 
     | 
    
         
            +
            # 'storage/raw_html_pages/133/book_133435.html'
         
     | 
| 
      
 363 
     | 
    
         
            +
             
     | 
| 
      
 364 
     | 
    
         
            +
            # Special book cases to check out:
         
     | 
| 
      
 365 
     | 
    
         
            +
            # 'storage/raw_html_pages/96/book_96592.html' --no image
         
     | 
| 
      
 366 
     | 
    
         
            +
            # 'storage/raw_html_pages/96/book_96937.html'
         
     | 
| 
      
 367 
     | 
    
         
            +
             
     | 
| 
      
 368 
     | 
    
         
            +
            # Problematic at biblionet
         
     | 
| 
      
 369 
     | 
    
         
            +
            # http://biblionet.gr/book/196388
         
     | 
| 
      
 370 
     | 
    
         
            +
            # http://biblionet.gr/book/196386
         
     | 
| 
      
 371 
     | 
    
         
            +
            # http://biblionet.gr/book/195525
         
     | 
| 
         @@ -0,0 +1,42 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            require 'rubygems'
         
     | 
| 
      
 2 
     | 
    
         
            +
            require 'nokogiri'
         
     | 
| 
      
 3 
     | 
    
         
            +
            require 'open-uri'
         
     | 
| 
      
 4 
     | 
    
         
            +
            require 'fileutils'
         
     | 
| 
      
 5 
     | 
    
         
            +
             
     | 
| 
      
 6 
     | 
    
         
            +
            require File.expand_path(File.join(File.dirname(__FILE__), '../extractors', 'base'))
         
     | 
| 
      
 7 
     | 
    
         
            +
             
     | 
| 
      
 8 
     | 
    
         
            +
            # Default settings used by +crawl_and_save+ for author pages.
            # Frozen so the shared defaults cannot be modified by accident;
            # callers override values by passing their own options hash.
            DEFAULTS = {
              folder: 'storage/html_author_pages',
              base_url: 'http://www.biblionet.gr/author/',
              extension: '.html',
              first_id: 1,
              last_id: 112000,
              step: 1000
            }.freeze

            # Downloads every author page whose id lies in first_id..last_id and
            # stores each one under "<folder>/<batch index>/author_<id><extension>".
            #
            # Ids are handled in batches of +step+; each batch gets its own
            # numbered subfolder. A final batch shorter than +step+ is crawled too
            # (it used to be skipped, so the last ids of a range that is not a
            # multiple of +step+ were never downloaded).
            #
            # @param options [Hash] overrides for DEFAULTS (:folder, :base_url,
            #   :extension, :first_id, :last_id, :step)
            #
            # NOTE(review): the downloader is referenced as Biblionet::Core::Base
            # while this file requires '../extractors/base' -- confirm that
            # constant is still defined there.
            def crawl_and_save(options = {})
              options  = DEFAULTS.merge(options)
              first_id = options[:first_id]
              last_id  = options[:last_id]
              step     = options[:step]

              first_id.step(last_id, step) do |first|
                # Clamp the upper bound so a trailing partial batch is not lost.
                last = [first + step - 1, last_id].min
                # Equals the old "last / step - 1" index for every full batch.
                subfolder = ((first - 1) / step).to_s
                path      = "#{options[:folder]}/#{subfolder}/"

                # Create a new directory (does nothing if directory exists)
                FileUtils.mkdir_p path

                first.upto(last) do |id|
                  file_to_save    = "#{path}author_#{id}#{options[:extension]}"
                  url_to_download = "#{options[:base_url]}#{id}/"

                  downloader = Biblionet::Core::Base.new(url_to_download)
                  # Only write a file when the downloader reports a page.
                  downloader.save_page(file_to_save) unless downloader.page.nil?
                end
              end
            end
         
     | 
| 
         @@ -0,0 +1,46 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            require File.expand_path(File.join(File.dirname(__FILE__), '../extractors', 'base'))
         
     | 
| 
      
 2 
     | 
    
         
            +
             
     | 
| 
      
 3 
     | 
    
         
            +
            module Biblionet
              module Crawlers

                # Walks a numeric id range in batches and hands the caller a
                # (download url, target file path) pair for every id. It performs no
                # download and creates no directory itself; that is left to the
                # block given to +spider+.
                class Base
                  # @param options [Hash] optional settings; a missing or nil entry
                  #   falls back to the default shown below. The hash itself is
                  #   not modified, so frozen or shared hashes are safe to pass.
                  # @option options [String]  :folder    root folder for generated paths
                  # @option options [String]  :base_url  url prefix placed before each id
                  # @option options [String]  :page_type file name prefix
                  # @option options [String]  :extension file name suffix
                  # @option options [Integer] :start     first id
                  # @option options [Integer] :finish    last id (inclusive)
                  # @option options [Integer] :step      ids per batch / subfolder
                  def initialize(options = {})
                    @folder     = options[:folder]    || 'lib/bookshark/storage/html_base_pages'
                    @base_url   = options[:base_url]  || 'http://www.biblionet.gr/base/'
                    @page_type  = options[:page_type] || 'base'
                    @extension  = options[:extension] || '.html'
                    @start      = options[:start]     || 1
                    @finish     = options[:finish]    || 10000
                    @step       = options[:step]      || 1000
                  end

                  # Yields +url+ and +file path+ for every id in start..finish.
                  #
                  # Ids are grouped in batches of +step+; each batch maps to its own
                  # numbered subfolder of +folder+. A final batch shorter than +step+
                  # is yielded as well (it used to be skipped entirely).
                  #
                  # @yieldparam url  [String] "<base_url><id>/"
                  # @yieldparam path [String] "<folder>/<batch>/<page_type>_<id><extension>"
                  # @return [Enumerator] when called without a block
                  def spider
                    return enum_for(:spider) unless block_given?

                    @start.step(@finish, @step) do |first|
                      # Clamp the upper bound so a trailing partial batch is kept.
                      last = [first + @step - 1, @finish].min
                      # Equals the old "last / step - 1" index for every full batch.
                      subfolder = ((first - 1) / @step).to_s
                      path      = "#{@folder}/#{subfolder}/"

                      first.upto(last) do |id|
                        file_to_save    = "#{path}#{@page_type}_#{id}#{@extension}"
                        url_to_download = "#{@base_url}#{id}/"

                        yield(url_to_download, file_to_save)
                      end
                    end
                  end

                end

              end
            end
         
     |