bookshark 1.0.0.alpha.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,4 @@
+ require "bundler/gem_tasks"
+
+ Dir.glob('tasks/**/*.rake').each(&method(:import))
+
@@ -0,0 +1,29 @@
+ # coding: utf-8
+ lib = File.expand_path('../lib', __FILE__)
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
+ require 'bookshark/version'
+
+ Gem::Specification.new do |spec|
+   spec.name = "bookshark"
+   spec.version = Bookshark::VERSION
+   spec.authors = ["Dimitris Klisiaris"]
+   spec.email = ["dklisiaris@gmail.com"]
+   spec.summary = %q{Book metadata extractor from biblionet.gr.}
+   spec.description = %q{Extracts book, author, publisher and category metadata from biblionet.gr.}
+   spec.homepage = "https://github.com/dklisiaris/bookshark"
+   spec.license = "MIT"
+
+   spec.files = `git ls-files -z`.split("\x0")
+   spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
+   spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
+   spec.require_paths = ["lib"]
+
+   spec.add_dependency "nokogiri", "~> 1.6", ">= 1.6.6"
+   spec.add_dependency "sanitize", "~> 3.1"
+   spec.add_dependency "json", "~> 1.8"
+   spec.add_dependency "htmlentities", "~> 4.3"
+
+   spec.add_development_dependency "bundler", "~> 1.7"
+   spec.add_development_dependency "rake", "~> 10.0"
+   spec.add_development_dependency 'rspec', "~> 3.1"
+ end
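
Because 1.0.0.alpha.2 is a prerelease, RubyGems and Bundler skip it under the usual version constraints; a hypothetical Gemfile pin that opts into it explicitly (the command-line equivalent is gem install bookshark --pre):

gem 'bookshark', '1.0.0.alpha.2'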
@@ -0,0 +1,371 @@
+ require "bookshark/version"
+ require 'bookshark/storage/file_manager'
+
+ require 'bookshark/extractors/author_extractor'
+ require 'bookshark/extractors/category_extractor'
+ require 'bookshark/extractors/book_extractor'
+ require 'bookshark/extractors/publisher_extractor'
+ require 'bookshark/extractors/search'
+
+ require 'bookshark/crawlers/base'
+ require 'bookshark/crawlers/publisher_crawler'
+
+ module Bookshark
+   DEFAULTS = {
+     site: 'biblionet',
+     format: 'hash'
+   }
+
+   def self.root
+     File.dirname __dir__
+   end
+
+   def self.path_to_storage
+     File.join root, 'lib/bookshark/storage'
+   end
+
+
+   class Extractor
+     include FileManager
+     attr_accessor :site, :format
+
+     def initialize(options = {})
+       options = DEFAULTS.merge(options)
+       @site = options[:site]
+       @format = options[:format]
+     end
+
+     def author(options = {})
+       uri = process_options(options, __method__)
+       options[:format] ||= @format
+
+       author_extractor = Biblionet::Extractors::AuthorExtractor.new
+       author = author_extractor.load_and_extract_author(uri)
+
+       response = {}
+       response[:author] = [author]
+       response = change_format(response, options[:format])
+       return response
+     end
+
+     def publisher(options = {})
+       uri = process_options(options, __method__)
+       options[:format] ||= @format
+
+       publisher_extractor = Biblionet::Extractors::PublisherExtractor.new
+       publisher = publisher_extractor.load_and_extract_publisher(uri)
+
+       response = {}
+       response[:publisher] = [publisher]
+       response = change_format(response, options[:format])
+       response = publisher_extractor.decode_text(response)
+
+       return response
+       # return uri
+     end
+
+     def book(options = {})
+       book_extractor = Biblionet::Extractors::BookExtractor.new
+
+       uri = process_options(options, __method__)
+       options[:format] ||= @format
+       options[:eager] ||= false
+
+       if options[:eager]
+         book = eager_extract_book(uri)
+       else
+         book = book_extractor.load_and_extract_book(uri)
+       end
+
+       response = {}
+       response[:book] = [book]
+       response = change_format(response, options[:format])
+       response = book_extractor.decode_text(response)
+
+       return response
+     end
+
+     def category(options = {})
+       uri = process_options(options, __method__)
+       options[:format] ||= @format
+
+       category_extractor = Biblionet::Extractors::CategoryExtractor.new
+       category = category_extractor.extract_categories_from(uri)
+
+       response = {}
+       response[:category] = [category]
+       response = change_format(response, options[:format])
+
+       return response
+     end
+
+     def search(options = {})
+       options[:format] ||= @format
+       options[:results_type] ||= 'metadata'
+
+       search_engine = Biblionet::Extractors::Search.new
+       search_results = search_engine.perform_search(options)
+
+       response = {}
+       response[:book] = search_results
+       response = change_format(response, options[:format])
+
+       return response
+     end
+
+     def parse_all_categories(will_save=false)
+       # list_directories('raw_ddc_pages').each do |dir|
+       #   p dir
+       # end
+       category_extractor = Biblionet::Extractors::CategoryExtractor.new
+       all_categories = Hash.new
+
+       list_files(path: 'storage/raw_ddc_pages', extension: 'html', all: true).each do |file|
+         categories = category_extractor.extract_categories_from(file)
+         all_categories.merge!(categories) unless categories.nil? or categories.empty?
+       end
+
+       if will_save
+         all_categories_json = all_categories.to_json
+         save_to('storage/all_categories.json', all_categories_json)
+       end
+
+       all_categories
+     end
+
+     def parse_all_books
+       bp = Biblionet::Extractors::BookExtractor.new
+
+       list_directories(path: 'storage/raw_html_pages').each do |dir|
+         dir_to_save = dir.gsub(/raw_html_pages/, 'books')
+
+         list_files(path: dir, extension: 'html', all: true).each do |file|
+
+           # Load the book from html file and parse the data.
+           # pp "Parsing book: #{file}"
+           pp file
+           book = bp.load_and_extract_book(file)
+
+           # Prepare a path to save the new file.
+           filename = File.basename(file, ".*")
+           path_to_save = "#{dir_to_save}#{filename}.json"
+
+           # Save to file.
+           bp.save_to("#{path_to_save}", JSON.pretty_generate(book))
+           # pp "Book #{file} saved!"
+         end unless File.directory?(dir_to_save) # if dir.end_with? '/195/'
+       end
+     end
+
+     private
+
+     def process_options(options = {}, caller = nil)
+       # puts caller_locations(1,1)[0].label
+       # options[:format] ||= @format
+       puts caller
+       id = options[:id]
+
+       if id
+         case caller.to_s
+         when 'author'
+           url_method = 'author'
+           local_path = "html_author_pages/#{((id-1)/1000)}/author_#{id}.html"
+         when 'publisher'
+           url_method = 'com'
+           local_path = "html_publisher_pages/#{((id-1)/100)}/publisher_#{id}.html"
+         when 'book'
+           url_method = 'book'
+           local_path = "html_book_pages/#{((id-1)/1000)}/book_#{id}.html"
+         when 'category'
+           url_method = 'index'
+           local_path = "html_ddc_pages/#{((id-1)/1000)}/ddc_#{id}.html"
+         else
+           puts "Called from an unknown method. Probably it's rspec."
+         end
+
+         options[:local] ||= false
+         url = "#{Bookshark::path_to_storage}/#{local_path}" if options[:local]
+         url = "http://www.biblionet.gr/#{url_method}/#{id}" unless options[:local]
+       end
+       uri = options[:uri] ||= url
+
+       return uri
+     end
+
+     def change_format(hash, format)
+       case format
+       when 'hash'
+         return hash
+       when 'json'
+         hash = hash.to_json
+       when 'pretty_json'
+         hash = JSON.pretty_generate(hash)
+       end
+       return hash
+     end
+
+     def eager_extract_book(uri)
+       book_extractor = Biblionet::Extractors::BookExtractor.new
+       author_extractor = Biblionet::Extractors::AuthorExtractor.new
+       publisher_extractor = Biblionet::Extractors::PublisherExtractor.new
+       category_extractor = Biblionet::Extractors::CategoryExtractor.new
+
+       book = book_extractor.load_and_extract_book(uri)
+
+       tmp_data = []
+       book[:author].each do |author|
+         tmp_data << author_extractor.load_and_extract_author("http://www.biblionet.gr/author/#{author[:b_id]}")
+       end
+       book[:author] = tmp_data
+
+       tmp_data, tmp_hash = [], {}
+       book[:contributors].each do |job, contributors|
+         contributors.each do |contributor|
+           tmp_data << author_extractor.load_and_extract_author("http://www.biblionet.gr/author/#{contributor[:b_id]}")
+         end
+         tmp_hash[job] = tmp_data
+         tmp_data = []
+       end
+       book[:contributors] = tmp_hash
+
+       tmp_data, tmp_hash = [], {}
+       book[:category].each do |category|
+         tmp_data << category_extractor.extract_categories_from("http://www.biblionet.gr/index/#{category[:b_id]}")
+       end
+       book[:category] = tmp_data
+
+       tmp_data = []
+       tmp_data << publisher_extractor.load_and_extract_publisher("http://www.biblionet.gr/com/#{book[:publisher][:b_id]}")
+       book[:publisher] = tmp_data
+
+       book
+     end
+
+   end
+
+
+   class Crawler
+     include FileManager
+     attr_accessor :site
+
+     def initialize(options = {})
+       options = DEFAULTS.merge(options)
+       @site = options[:site]
+     end
+
+     def publishers
+       # crawler = Biblionet::Crawlers::Base.new(start: 1, finish: 100, step: 10)
+       # crawler.spider do |url, path|
+       #   puts "URL: #{url}, PATH: #{path}"
+       # end
+       # puts Biblionet::Extractors::Base.new("http://www.biblionet.gr/com/245").page
+       crawler = Biblionet::Crawlers::PublisherCrawler.new
+       crawler.crawl_and_save
+     end
+
+   end
+
+   # module Biblionet
+   #   class Extract
+   #     class << self
+   #       def author(uri=nil)
+   #         author_extractor = BiblionetParser::Core::AuthorExtractor.new
+   #         author_extractor.load_and_extract_author(uri)
+   #       end
+
+   #       def book(uri=nil)
+   #         bp = BiblionetParser::Core::BookParser.new
+   #         bp.load_and_parse_book(uri)
+   #       end
+
+   #       def categories(uri=nil)
+   #         category_extractor = BiblionetParser::Core::DDCParser.new
+   #         category_extractor.extract_categories_from(uri)
+   #       end
+
+   #     end
+   #   end
+   # end
+ end
+
+
+ # ae = BiblionetParser::Core::AuthorExtractor.new
+ # ae.load_and_extract_author('storage/html_author_pages/0/author_5.html')
+
+
+ # Biblionet::Extract.author('storage/html_author_pages/0/author_5.html')
+ # Biblionet::Extract.author('storage/html_author_pages/2/author_2423.html')
+ # Biblionet::Extract.author('storage/html_author_pages/0/author_764.html')
+ # Biblionet::Extract.author('storage/html_author_pages/0/author_435.html')
+
+ # bib = Bibliotheca.new
+ # categories = bib.parse_all_categories(true)
+
+ # p bib.list_files(path: 'raw_html_pages/2', extension: 'html')
+ # p bib.list_directories
+ # p categories[787]
+ # categories = 'test'
+ # bib.save_to('all_categories_test.json', categories)
+
+ # bp = BiblionetParser::Core::BookParser.new
+ # bp.load_and_parse_book('storage/raw_html_pages/96/book_96592.html') # BAD Book --no image
+ # bp.load_and_parse_book('storage/raw_html_pages/96/book_96937.html') # BAD Book --award
+ # bp.load_and_parse_book('storage/raw_html_pages/78/book_78836.html') # BAD Book --multiple awards
+ # bp.load_and_parse_book('storage/raw_html_pages/149/book_149345.html') # BAD Book --2 sets of details (ebooks, normals)
+ # bp.load_and_parse_book('storage/raw_html_pages/149/book_149402.html') # BAD Book --2 sets of details (normals, reviews)
+ # bp.load_and_parse_book('storage/raw_html_pages/149/book_149278.html') # BAD Book --3 sets of details (ebooks, normals, reviews)
+ # bp.load_and_parse_book('storage/raw_html_pages/149/book_149647.html')
+ # puts JSON.pretty_generate(bp.book)
+
+ # bp.load_and_parse_book('storage/raw_html_pages/70/book_70076.html') # BAD Book --Has comma inside award
+
+ # bp.load_and_parse_book('storage/raw_html_pages/70/book_70828.html') # BAD Book --No author. Collective Work
+ # puts JSON.pretty_generate(bp.book)
+
+ # bp.load_and_parse_book('storage/raw_html_pages/70/book_70829.html') # BAD Book --No author, No publisher. Collective Work
+ # puts JSON.pretty_generate(bp.book)
+
+ # bp.load_and_parse_book('storage/raw_html_pages/145/book_145326.html') # BAD Book --ISMN instead of ISBN
+
+ # bp.load_and_parse_book('storage/raw_html_pages/45/book_45455.html') # BAD Book --No author. Has contributors.
+ # puts JSON.pretty_generate(bp.book)
+
+
+ # bp.load_and_parse_book('storage/raw_html_pages/132/book_132435.html') # BAD Book --Two authors.
+ # puts JSON.pretty_generate(bp.book)
+
+ # bp.load_and_parse_book('storage/raw_html_pages/133/book_133435.html') # GOOD Book
+
+ # puts JSON.pretty_generate(bp.book)
+
+ # ddcp = BiblionetParser::Core::DDCParser.new('storage/raw_ddc_pages/0/ddc_298.html')
+ # pp all = ddcp.categories
+ # pp cur = ddcp.categories.values.last
+ # pp sel = ddcp.categories["2703"]
+
+ # bp.parse_book('12351', bp.page)
+
+ # bp.save_page('storage/mits_ts/mits1.json')
+
+ # pp bp.url='http://www.biblionet.gr/book/123351'
+ # pp bp.page
+
+ # pp bib.list_directories(path: 'storage/raw_html_pages')
+ # pp bib.list_files(path: "storage/raw_html_pages/24/", extension: 'html')
+
+ # bib = Bibliotheca.new
+ # bib.parse_all_books
+
+ # Good cases:
+ # 'storage/raw_html_pages/123/book_123351.html'
+ # 'storage/raw_html_pages/17/book_17351.html'
+ # 'storage/raw_html_pages/133/book_133435.html'
+
+ # Special book cases to check out:
+ # 'storage/raw_html_pages/96/book_96592.html' --no image
+ # 'storage/raw_html_pages/96/book_96937.html'
+
+ # Problematic at biblionet
+ # http://biblionet.gr/book/196388
+ # http://biblionet.gr/book/196386
+ # http://biblionet.gr/book/195525
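
For orientation, a minimal usage sketch of the Extractor API defined above. The ids are hypothetical example values, and the calls assume biblionet.gr is reachable:

require 'bookshark'

# Render responses as pretty-printed JSON instead of the default hash.
extractor = Bookshark::Extractor.new(format: 'pretty_json')

# With no :uri option, process_options builds the remote URL
# http://www.biblionet.gr/author/10207 from the id.
puts extractor.author(id: 10207)

# eager: true routes through eager_extract_book, which also resolves the
# book's authors, contributors, categories and publisher with extra requests.
puts extractor.book(id: 103788, eager: true)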
@@ -0,0 +1,42 @@
+ require 'rubygems'
+ require 'nokogiri'
+ require 'open-uri'
+ require 'fileutils'
+
+ require File.expand_path(File.join(File.dirname(__FILE__), '../extractors', 'base'))
+
+ DEFAULTS = {
+   folder: 'storage/html_author_pages',
+   base_url: 'http://www.biblionet.gr/author/',
+   extension: '.html',
+   first_id: 1,
+   last_id: 112000,
+   step: 1000
+ }
+
+ def crawl_and_save(options={})
+   options = DEFAULTS.merge(options)
+
+   start_id = options[:first_id] + options[:step] - 1
+   last_id = options[:last_id]
+   step = options[:step]
+
+   start_id.step(last_id, step) do |last|
+     first = last - step + 1
+     subfolder = (last/step - 1).to_s
+     path = "#{options[:folder]}/#{subfolder}/"
+
+     # Create a new directory (does nothing if directory exists)
+     FileUtils.mkdir_p path
+
+     first.upto(last) do |id|
+       file_to_save = "#{path}author_#{id}#{options[:extension]}"
+       url_to_download = "#{options[:base_url]}#{id}/"
+
+       downloader = Biblionet::Core::Base.new(url_to_download)
+       downloader.save_page(file_to_save) unless downloader.page.nil?
+
+     end
+   end
+
+ end
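
A minimal sketch of driving crawl_and_save above, assuming smaller bounds than the defaults:

# Fetches author pages 1..2000 into storage/html_author_pages/0/ and .../1/,
# skipping any page the downloader fails to load.
crawl_and_save(first_id: 1, last_id: 2000, step: 1000)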
@@ -0,0 +1,46 @@
+ require File.expand_path(File.join(File.dirname(__FILE__), '../extractors', 'base'))
+
+ module Biblionet
+   module Crawlers
+
+     class Base
+       def initialize(options = {})
+         @folder = options[:folder] ||= 'lib/bookshark/storage/html_base_pages'
+         @base_url = options[:base_url] ||= 'http://www.biblionet.gr/base/'
+         @page_type = options[:page_type] ||= 'base'
+         @extension = options[:extension] ||= '.html'
+         @start = options[:start] ||= 1
+         @finish = options[:finish] ||= 10000
+         @step = options[:step] ||= 1000
+       end
+
+       def spider
+         start = @start + @step - 1
+         finish = @finish
+
+         start.step(finish, @step) do |last|
+           first = last - @step + 1
+           subfolder = (last/@step - 1).to_s
+           path = "#{@folder}/#{subfolder}/"
+
+           # Create a new directory (does nothing if directory exists)
+           # FileUtils.mkdir_p path
+
+           first.upto(last) do |id|
+             file_to_save = "#{path}#{@page_type}_#{id}#{@extension}"
+             url_to_download = "#{@base_url}#{id}/"
+
+             yield(url_to_download, file_to_save)
+             # downloader = Biblionet::Core::Base.new(url_to_download)
+             # downloader.save_page(file_to_save) unless downloader.page.nil?
+
+           end
+         end
+       end
+
+
+     end
+
+
+   end
+ end
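
The commented-out snippet in Crawler#publishers above shows how this generic crawler is meant to be driven; spelled out as a sketch:

crawler = Biblionet::Crawlers::Base.new(start: 1, finish: 100, step: 10)
crawler.spider do |url, path|
  # Each iteration yields a biblionet URL and the local path to save it under,
  # e.g. "http://www.biblionet.gr/base/1/" and
  # "lib/bookshark/storage/html_base_pages/0/base_1.html".
  puts "URL: #{url}, PATH: #{path}"
end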