bookshark 1.0.0.alpha.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,82 @@
+ require_relative 'base'
+
+ module Biblionet
+   module Extractors
+
+     class CategoryExtractor < Base
+       attr_reader :categories
+
+       def initialize(uri=nil)
+         super(uri)
+         extract_categories unless uri.nil?
+       end
+
+       def extract_categories(category_page=@page)
+         page = Nokogiri::HTML(category_page)
+         parent, previous_indent, previous_id = nil, nil, nil
+
+         @categories = page.xpath("//a[@class='menu' and @href[contains(.,'/index/')]]").map do |category|
+           # Extract from the href the id used by biblionet. --- DdC url: http://biblionet.gr/index/id ---
+           biblionet_id = category[:href].split(/\//).last
+
+           # Get the text before <a>. It is expected to be a number of space characters.
+           spaces = category.previous_sibling.text # TODO: make sure the text is only spaces
+           # Indent size
+           indent = spaces.size
+
+           # Determine parent-child-sibling relationships based on indent.
+           # Indent size seems to be inconsistent, so it is better to compare sizes than to use them directly.
+           if (indent <=> previous_indent).nil?
+             previous_indent = indent
+           elsif (indent <=> previous_indent) > 0
+             parent = previous_id
+             previous_indent = indent
+           end
+
+           previous_id = biblionet_id
+
+           # Extract DdC id and DdC text.
+           category = process_category(category.text)
+
+           category.merge!(parent: parent)
+
+           {biblionet_id => category.clone}
+         end.reduce({}, :update) unless @page.nil? # Merge the per-category hashes into one hash keyed by biblionet id.
+
+         unless @categories.nil?
+           @categories[:current] = @categories[@biblionet_id.to_s].clone
+           @categories[:current][:b_id] = @biblionet_id
+         end
+
+         return @categories
+       end
+
+       def extract_categories_from(uri=nil)
+         load_page(uri)
+         extract_categories unless uri.nil?
+       end
+
+       private
+
+       def process_category(category)
+         # Matches the digits inside [] in text like: [889.09300] Νεοελληνική λογοτεχνία - Ιστορία και κριτική (300)
+         ddc_re = /(\[\d*(?:[\.|\s]\d*)*\])/
+
+         # Matches [digits] and (digits) in text like: [889.09300] Νεοελληνική λογοτεχνία - Ιστορία και κριτική (300)
+         non_text_re = /\s*(\[.*\]|\(\d*\))\s*/
+
+         category_ddc = category.scan(ddc_re).join.gsub(/[\[\]]/, '')
+         category_name = category.gsub(non_text_re, '').strip
+
+         { ddc: category_ddc, name: category_name }
+       end
+
+     end
+
+   end
+ end
+
+ # categoryp = CategoryExtractor.new("raw_category_pages/0/category_787.html")
+ # categoryp.extract_categories
+
+ # categoryp.filepath="category_1.html"
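
A minimal usage sketch for CategoryExtractor, assuming Base#load_page fetches the page and sets @biblionet_id; the /index/<id> URL shape follows the links the extractor itself scrapes, and id 787 mirrors the commented example above:

    extractor = Biblionet::Extractors::CategoryExtractor.new('http://www.biblionet.gr/index/787')
    # @categories maps each biblionet id to { ddc:, name:, parent: }, plus a :current
    # entry describing the category whose page was parsed.
    current = extractor.categories[:current]
    puts "#{current[:ddc]} #{current[:name]} (biblionet id #{current[:b_id]})"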
@@ -0,0 +1,138 @@
+ require_relative 'base'
+
+ module Biblionet
+   module Extractors
+
+     class PublisherExtractor < Base
+       attr_reader :publisher
+
+       def initialize(uri=nil)
+         super(uri)
+         extract_publisher unless uri.nil? or @page.nil?
+       end
+
+       def load_and_extract_publisher(uri=nil)
+         load_page(uri)
+         extract_publisher unless uri.nil? or @page.nil?
+       end
+
+       def extract_publisher(biblionet_id=@biblionet_id, publisher_page=@page)
+         puts "Extracting publisher: #{biblionet_id}"
+         page = PublisherDataExtractor.new(publisher_page)
+
+         headquarters = page.headquarters
+         bookstores = page.bookstores
+         bookstores['Έδρα'] = headquarters # 'Έδρα' means "headquarters".
+
+         publisher_hash = {}
+         publisher_hash[:name] = page.name
+         publisher_hash[:owner] = page.owner
+         publisher_hash[:bookstores] = bookstores
+         publisher_hash[:b_id] = biblionet_id
+
+         return @publisher = publisher_hash
+       end
+
+     end
+
+     class PublisherDataExtractor
+       attr_reader :nodeset
+
+       def initialize(document)
+         # No need to operate on the whole page. Just on the part containing the content.
+         content_re = /<!-- CONTENT START -->.*<!-- CONTENT END -->/m
+         if (content_re.match(document)).nil?
+           puts document
+         end
+         content = content_re.match(document)[0]
+
+         @nodeset = Nokogiri::HTML(content)
+       end
+
+       def name
+         @nodeset.css('h1.page_title').text.strip
+       end
+
+       def owner
+         # The owner is the text that sits between the page title and the details table.
+         return (@nodeset.xpath("//h1[@class='page_title'][1]/following::text()") & @nodeset.xpath("//table[@class='book_details'][1]/preceding::text()")).text.strip
+       end
+
+       def headquarters
+         headquarters_hash = {}
+         temp_array = []
+         current_key = nil
+         last_key = nil
+
+         @nodeset.xpath("//table[@class='book_details'][1]//tr").each do |item|
+           key = item.children[0].text.strip
+           current_key = key.end_with?(":") ? key[0..-2] : last_key
+           value = item.children[1].text.strip
+
+           unless key.empty? and value.empty?
+             if current_key == last_key
+               # A row that continues the previous key: collect its values into an array.
+               temp_array << headquarters_hash[current_key] unless headquarters_hash[current_key].is_a?(Array)
+               temp_array << value.gsub(/,$/, '').strip unless value.empty?
+               headquarters_hash[current_key] = temp_array
+             else
+               temp_array = []
+               headquarters_hash[current_key] = value.gsub(/,$/, '').strip
+             end
+           end
+
+           last_key = current_key
+         end
+
+         # Change the keys. Use the same ones as in bookstores.
+         mappings = {"Διεύθυνση" => :address, "Τηλ" => :telephone, "FAX" => :fax, "E-mail" => :email, "Web site" => :website}
+         headquarters_hash = Hash[headquarters_hash.map {|k, v| [mappings[k], v] }]
+         headquarters_hash[:website] = headquarters_hash[:website].split(',').map(&:strip) if headquarters_hash[:website].to_s.include? ','
+
+         return headquarters_hash
+       end
+
+       def bookstores
+         bookstores_hash = Hash.new { |h,k| h[k] = {} }
+         address_array = []
+         tel_array = []
+
+         # Default key ('Βιβλιοπωλείο' means "bookstore") in case there is none.
+         key = 'Βιβλιοπωλείο'
+
+         @nodeset.css('//p[align="justify"]').inner_html.split('<br>').map(&:strip).reject(&:empty?).each do |item|
+           regex_tel = /\d{3} \d{7}/
+           regex_tk = /\d{3} \d{2}/
+           regex_email = /([\w+\-].?)+@[a-z\d\-]+(\.[a-z]+)*\.[a-z]+/i
+           regex_url = /((http(?:s)?\:\/\/)?[a-zA-Z0-9\-]+(?:\.[a-zA-Z0-9\-]+)*\.[a-zA-Z]{2,6}(?:\/?|(?:\/[\w\-]+)*)(?:\/?|\/\w+\.[a-zA-Z]{2,4}(?:\?[\w]+\=[\w\-]+)?)?(?:\&[\w]+\=[\w\-]+)*)/ix
+
+           if item.end_with?(":")
+             key = item[0..-2]
+             address_array = []
+             tel_array = []
+           elsif (item.start_with?("Fax") or item.start_with?("fax")) and item =~ regex_tel
+             bookstores_hash[key][:fax] = item.gsub(/[^\d ]/, '').strip # Keep only digits and spaces.
+           elsif item =~ regex_tel
+             tel_array << item.gsub(/[^\d ]/, '').strip
+             bookstores_hash[key][:telephone] = tel_array
+           elsif item =~ regex_tk
+             address_array << item.gsub(/,$/, '').strip
+             bookstores_hash[key][:address] = address_array
+           elsif item =~ regex_email
+             bookstores_hash[key][:email] = (regex_email.match(item))[0]
+           elsif item =~ regex_url
+             bookstores_hash[key][:website] = item[regex_url, 1]
+           else
+             address_array << item.gsub(/,$/, '').strip
+             bookstores_hash[key][:address] = address_array
+           end
+         end
+
+         return bookstores_hash
+       end
+
+     end
+
+   end
+ end
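
A usage sketch for PublisherExtractor; the /com/<id> URL pattern is an assumption (inferred from the com/com_ID parameters of the search form below), and publisher id 20 mirrors the spec data:

    extractor = Biblionet::Extractors::PublisherExtractor.new('http://www.biblionet.gr/com/20')
    publisher = extractor.publisher
    puts publisher[:name]            # from h1.page_title
    puts publisher[:bookstores].keys # bookstore names, including the 'Έδρα' (headquarters) entry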
@@ -0,0 +1,104 @@
+ require_relative 'book_extractor'
+
+ module Biblionet
+   module Extractors
+
+     class Search < BookExtractor
+       def initialize(options = {})
+         perform_search(options) unless options.empty?
+       end
+
+       def perform_search(options = {})
+         search_url = build_search_url(options)
+         load_page(URI.encode(search_url)) # The page gets loaded into the @page variable.
+
+         book_ids = []
+
+         # No need to operate on the whole page. Just on the part containing the books.
+         content_re = /<!-- CONTENT START -->.*<!-- CONTENT END -->/m
+         if (content_re.match(@page)).nil?
+           puts @page
+         end
+         content = content_re.match(@page)[0]
+
+         nodeset = Nokogiri::HTML(content)
+         nodeset.xpath("//a[@class='booklink' and @href[contains(.,'/book/')]]").each do |item|
+           book_ids << item[:href].split("/")[2]
+         end
+
+         books = []
+
+         if options[:results_type] == 'ids'
+           return book_ids
+         elsif options[:results_type] == 'metadata'
+           book_ids.each do |id|
+             url = "http://www.biblionet.gr/book/#{id}"
+             books << load_and_extract_book(url)
+           end
+         end
+
+         return books
+       end
+
+       def build_search_url(options = {})
+         title = present?(options[:title]) ? options[:title].gsub(' ','+') : ''
+         author = present?(options[:author]) ? options[:author].gsub(' ','+') : ''
+         publisher = present?(options[:publisher]) ? options[:publisher].gsub(' ','+') : ''
+         category = present?(options[:category]) ? options[:category].gsub(' ','+') : ''
+
+         title_split = options[:title_split] || '1'
+         book_id = options[:book_id] || ''
+         isbn = options[:isbn] || ''
+         author_id = options[:author_id] || ''
+         publisher_id = options[:publisher_id] || ''
+         category_id = options[:category_id] || ''
+         after_year = options[:after_year] || ''
+         before_year = options[:before_year] || ''
+
+         url_builder = StringBuilder.new
+         url_builder.append('http://www.biblionet.gr/main.asp?page=results')
+         url_builder.append('&title=')
+         url_builder.append(title)
+         url_builder.append('&TitleSplit=')
+         url_builder.append(title_split)
+         url_builder.append('&Titlesid=')
+         url_builder.append(book_id)
+         url_builder.append('&isbn=')
+         url_builder.append(isbn)
+         url_builder.append('&person=')
+         url_builder.append(author)
+         url_builder.append('&person_ID=')
+         url_builder.append(author_id)
+         url_builder.append('&com=')
+         url_builder.append(publisher)
+         url_builder.append('&com_ID=')
+         url_builder.append(publisher_id)
+         url_builder.append('&from=')
+         url_builder.append(after_year)
+         url_builder.append('&untill=')
+         url_builder.append(before_year)
+         url_builder.append('&subject=')
+         url_builder.append(category)
+         url_builder.append('&subject_ID=')
+         url_builder.append(category_id)
+         url_builder.build
+       end
+
+     end
+
+     class StringBuilder
+       def initialize
+         @string = []
+       end
+
+       def append(text)
+         @string << text
+       end
+
+       def build
+         @string.join
+       end
+     end
+
+   end
+ end
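
A usage sketch for Search, with the same options the spec file below exercises; 'σημεια και τερατα' is a Greek title query ("signs and wonders") that URI.encode percent-escapes:

    search = Biblionet::Extractors::Search.new
    ids   = search.perform_search(title: 'σημεια και τερατα', results_type: 'ids')      # array of book id strings
    books = search.perform_search(title: 'σημεια και τερατα', results_type: 'metadata') # full book hashes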
@@ -0,0 +1,103 @@
+ require 'fileutils'
+ require 'json'
+
+ module FileManager
+
+   DEFAULTS = {
+     path: '',
+     all: false,
+     extension: '',
+   }
+
+   # Lists directories in the current path or in the path specified by the options hash.
+   #
+   # ==== Attributes
+   #
+   # * +options+ - The options hash accepts options for a more specialized directory search operation.
+   #
+   # ==== Options
+   #
+   # * +:path+ - The path where the directory search will happen.
+   # * +:all+ - If true, recursive search is enabled.
+   #
+   def list_directories(options = {})
+     options = DEFAULTS.merge(options)
+
+     path = options[:path]
+     all = options[:all]
+
+     path = "#{path}/" unless path == '' or path.end_with?('/')
+     path = path+'**/' if all
+
+     Dir.glob("#{path}*/")
+   end
+
+   # Returns a list of all files in the current directory or as specified in the options hash.
+   #
+   # ==== Attributes
+   #
+   # * +options+ - The options hash accepts options for a more specialized file search operation.
+   #
+   # ==== Options
+   #
+   # * +:path+ - The path where the file search will happen.
+   # * +:extension+ - The extension of the target files.
+   # * +:all+ - If true, recursive search is enabled.
+   #
+   # ==== Examples
+   #
+   #   files = list_files
+   #   files = list_files path: 'html_pages'
+   #   files = list_files path: 'raw_html_pages/2', extension: 'html'
+   #   files = list_files(path: 'ddc_pages', extension: 'json', all: true).each do |file|
+   #     file.do_something
+   #   end
+   #
+   def list_files(options = {})
+     options = DEFAULTS.merge(options)
+
+     path = options[:path]
+     all = options[:all]
+     extension = options[:extension]
+
+     extension = ".#{extension}" unless extension == '' or extension.start_with?('.')
+     file_wildcard = "*#{extension}"
+
+     path = "#{path}/" unless path == '' or path.end_with?('/')
+     path = path+'**/' if all
+
+     Dir.glob("#{path}#{file_wildcard}")
+   end
+
+   # Saves some text/string to a file.
+   #
+   # ==== Attributes
+   #
+   # * +path+ - The path to the file (including the filename) where the content will be saved.
+   # * +content+ - The text which will be saved to the file.
+   #
+   # ==== Examples
+   #
+   #   save_to('data_pages/categories/cat_15.txt', 'Some text')
+   #
+   def save_to(path, content)
+     begin
+       dir = File.dirname(path)
+       # Create the directory (does nothing if it already exists).
+       FileUtils.mkdir_p dir
+
+       File.open(path, "w") do |f|
+         f.write(content)
+       end
+     rescue StandardError => e
+       puts e
+     end
+   end
+
+ end
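
A usage sketch for the FileManager mixin, combining the examples from its own documentation; PageStore is a hypothetical host class:

    class PageStore
      include FileManager # mixes in list_files, list_directories, save_to
    end

    store = PageStore.new
    store.save_to('data_pages/categories/cat_15.txt', 'Some text')
    store.list_files(path: 'ddc_pages', extension: 'json', all: true).each do |file|
      puts file
    end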
@@ -0,0 +1,3 @@
+ module Bookshark
+   VERSION = "1.0.0.alpha.2"
+ end
@@ -0,0 +1,96 @@
+ require 'spec_helper'
+
+ describe Bookshark::Extractor do
+   subject { Bookshark::Extractor.new(format: 'pretty_json') }
+   let(:author_13219)      { JSON.pretty_generate(JSON.parse(open(File.expand_path("test_data/author_13219.json", File.dirname(__FILE__))).read)) }
+   let(:publisher_20)      { JSON.pretty_generate(JSON.parse(open(File.expand_path("test_data/publisher_20.json", File.dirname(__FILE__))).read)) }
+   let(:category_1041)     { JSON.pretty_generate(JSON.parse(open(File.expand_path("test_data/category_1041.json", File.dirname(__FILE__))).read)) }
+   let(:book_103788)       { JSON.pretty_generate(JSON.parse(open(File.expand_path("test_data/book_103788.json", File.dirname(__FILE__))).read)) }
+   let(:eager_book_184923) { JSON.pretty_generate(JSON.parse(open(File.expand_path("test_data/eager_book_184923.json", File.dirname(__FILE__))).read)) }
+   let(:search_01)         { JSON.pretty_generate(JSON.parse(open(File.expand_path("test_data/search_01.json", File.dirname(__FILE__))).read)) }
+   let(:search_ids_01)     { JSON.pretty_generate(JSON.parse(open(File.expand_path("test_data/search_ids_01.json", File.dirname(__FILE__))).read)) }
+
+   describe '#author' do
+     context 'from remote html source' do
+       it 'reads html from the web and extracts author data' do
+         expect(subject.author(id: 13219)).to eq author_13219
+       end
+     end
+
+     context 'from local storage' do
+       it 'reads html from file and extracts author data' do
+         file_name = File.expand_path("test_data/author_13219.html", File.dirname(__FILE__))
+         expect(subject.author(uri: file_name)).to eq author_13219
+       end
+     end
+   end
+
+   describe '#publisher' do
+     context 'extract from remote html source' do
+       it 'reads html from the web and extracts publisher data' do
+         expect(subject.publisher(id: 20)).to eq publisher_20
+       end
+     end
+   end
+
+   describe '#category' do
+     context 'extract from remote html source' do
+       it 'reads html from the web and extracts category data' do
+         expect(subject.category(id: 1041)).to eq category_1041
+       end
+     end
+   end
+
+   describe '#book' do
+     context 'extract from remote html source' do
+       it 'reads html from the web and extracts book data' do
+         expect(subject.book(id: 103788)).to eq book_103788
+       end
+
+       it 'reads html from the web and eager extracts all book and reference data' do
+         expect(subject.book(id: 184923, eager: true)).to eq eager_book_184923
+       end
+     end
+   end
+
+   describe '#search' do
+     context 'extract from remote html source' do
+       it 'builds a search url and extracts book ids from search page' do
+         expect(subject.search(title: 'σημεια και τερατα', results_type: 'ids')).to eq search_ids_01
+       end
+
+       it 'builds a search url and extracts book data from search page' do
+         expect(subject.search(title: 'σημεια και τερατα', results_type: 'metadata')).to eq search_01
+       end
+     end
+   end
+
+   describe '#process_options' do
+     context 'with valid options' do
+       it 'returns a biblionet url when there is no local option set' do
+         expect(subject.send(:process_options, {id: 56}, 'author')).to eq("http://www.biblionet.gr/author/56")
+       end
+
+       it 'returns a local path when the local option is set to true' do
+         expect(subject.send(:process_options, {id: 56, local: true}, 'author')).to eq("#{Bookshark::path_to_storage}/html_author_pages/0/author_56.html")
+       end
+
+       it 'returns the given uri' do
+         expect(subject.send(:process_options, {uri: 'http://www.biblionet.gr/book/5487', id: 56, local: true}, 'book')).to eq("http://www.biblionet.gr/book/5487")
+       end
+
+       it 'returns the given uri if uri option is set even if other options are set' do
+         expect(subject.send(:process_options, {uri: 'http://www.biblionet.gr/book/87', id: 56, local: true}, 'book')).to eq("http://www.biblionet.gr/book/87")
+       end
+     end
+
+     context 'with invalid options' do
+       it 'returns the given uri' do
+
+       end
+     end
+   end
+
+ end