bookshark 1.0.0.alpha.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +20 -0
- data/.rspec +3 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +22 -0
- data/README.md +453 -0
- data/Rakefile +4 -0
- data/bookshark.gemspec +29 -0
- data/lib/bookshark.rb +371 -0
- data/lib/bookshark/crawlers/author_crawler.rb +42 -0
- data/lib/bookshark/crawlers/base.rb +46 -0
- data/lib/bookshark/crawlers/book_crawler.rb +55 -0
- data/lib/bookshark/crawlers/category_crawler.rb +55 -0
- data/lib/bookshark/crawlers/publisher_crawler.rb +35 -0
- data/lib/bookshark/extractors/author_extractor.rb +116 -0
- data/lib/bookshark/extractors/base.rb +187 -0
- data/lib/bookshark/extractors/book_extractor.rb +453 -0
- data/lib/bookshark/extractors/category_extractor.rb +82 -0
- data/lib/bookshark/extractors/publisher_extractor.rb +138 -0
- data/lib/bookshark/extractors/search.rb +104 -0
- data/lib/bookshark/storage/file_manager.rb +103 -0
- data/lib/bookshark/version.rb +3 -0
- data/spec/bookshark_spec.rb +96 -0
- data/spec/spec_helper.rb +1 -0
- data/spec/test_data/author_13219.html +313 -0
- data/spec/test_data/author_13219.json +23 -0
- data/spec/test_data/book_103788.json +49 -0
- data/spec/test_data/category_1041.json +42 -0
- data/spec/test_data/eager_book_184923.json +215 -0
- data/spec/test_data/publisher_20.json +43 -0
- data/spec/test_data/search_01.json +355 -0
- data/spec/test_data/search_ids_01.json +13 -0
- data/tasks/console.rake +4 -0
- data/tasks/rspec.rake +3 -0
- metadata +191 -0
data/Rakefile
ADDED
data/bookshark.gemspec
ADDED
@@ -0,0 +1,29 @@
+# coding: utf-8
+lib = File.expand_path('../lib', __FILE__)
+$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
+require 'bookshark/version'
+
+Gem::Specification.new do |spec|
+  spec.name          = "bookshark"
+  spec.version       = Bookshark::VERSION
+  spec.authors       = ["Dimitris Klisiaris"]
+  spec.email         = ["dklisiaris@gmail.com"]
+  spec.summary       = %q{Book metadata extractor from biblionet.gr.}
+  spec.description   = %q{Extracts book, author, publisher and category metadata from biblionet.gr.}
+  spec.homepage      = "https://github.com/dklisiaris/bookshark"
+  spec.license       = "MIT"
+
+  spec.files         = `git ls-files -z`.split("\x0")
+  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
+  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
+  spec.require_paths = ["lib"]
+
+  spec.add_dependency "nokogiri", "~> 1.6", ">= 1.6.6"
+  spec.add_dependency "sanitize", "~> 3.1"
+  spec.add_dependency "json", "~> 1.8"
+  spec.add_dependency "htmlentities", "~> 4.3"
+
+  spec.add_development_dependency "bundler", "~> 1.7"
+  spec.add_development_dependency "rake", "~> 10.0"
+  spec.add_development_dependency 'rspec', "~> 3.1"
+end
data/lib/bookshark.rb
ADDED
@@ -0,0 +1,371 @@
+require "bookshark/version"
+require 'bookshark/storage/file_manager'
+
+require 'bookshark/extractors/author_extractor'
+require 'bookshark/extractors/category_extractor'
+require 'bookshark/extractors/book_extractor'
+require 'bookshark/extractors/publisher_extractor'
+require 'bookshark/extractors/search'
+
+require 'bookshark/crawlers/base'
+require 'bookshark/crawlers/publisher_crawler'
+
+module Bookshark
+  DEFAULTS = {
+    site:   'biblionet',
+    format: 'hash'
+  }
+
+  def self.root
+    File.dirname __dir__
+  end
+
+  def self.path_to_storage
+    File.join root, 'lib/bookshark/storage'
+  end
+
+
+  class Extractor
+    include FileManager
+    attr_accessor :site, :format
+
+    def initialize(options = {})
+      options = DEFAULTS.merge(options)
+      @site   = options[:site]
+      @format = options[:format]
+    end
+
+    def author(options = {})
+      uri = process_options(options, __method__)
+      options[:format] ||= @format
+
+      author_extractor = Biblionet::Extractors::AuthorExtractor.new
+      author = author_extractor.load_and_extract_author(uri)
+
+      response = {}
+      response[:author] = [author]
+      response = change_format(response, options[:format])
+      return response
+    end
+
+    def publisher(options = {})
+      uri = process_options(options, __method__)
+      options[:format] ||= @format
+
+      publisher_extractor = Biblionet::Extractors::PublisherExtractor.new
+      publisher = publisher_extractor.load_and_extract_publisher(uri)
+
+      response = {}
+      response[:publisher] = [publisher]
+      response = change_format(response, options[:format])
+      response = publisher_extractor.decode_text(response)
+
+      return response
+      # return uri
+    end
+
+    def book(options = {})
+      book_extractor = Biblionet::Extractors::BookExtractor.new
+
+      uri = process_options(options, __method__)
+      options[:format] ||= @format
+      options[:eager]  ||= false
+
+      if options[:eager]
+        book = eager_extract_book(uri)
+      else
+        book = book_extractor.load_and_extract_book(uri)
+      end
+
+      response = {}
+      response[:book] = [book]
+      response = change_format(response, options[:format])
+      response = book_extractor.decode_text(response)
+
+      return response
+    end
+
+    def category(options = {})
+      uri = process_options(options, __method__)
+      options[:format] ||= @format
+
+      category_extractor = Biblionet::Extractors::CategoryExtractor.new
+      category = category_extractor.extract_categories_from(uri)
+
+      response = {}
+      response[:category] = [category]
+      response = change_format(response, options[:format])
+
+      return response
+    end
+
+    def search(options = {})
+      options[:format]       ||= @format
+      options[:results_type] ||= 'metadata'
+
+      search_engine = Biblionet::Extractors::Search.new
+      search_results = search_engine.perform_search(options)
+
+      response = {}
+      response[:book] = search_results
+      response = change_format(response, options[:format])
+
+      return response
+    end
+
+    def parse_all_categories(will_save=false)
+      # list_directories('raw_ddc_pages').each do |dir|
+      #   p dir
+      # end
+      category_extractor = Biblionet::Extractors::CategoryExtractor.new
+      all_categories = Hash.new
+
+      list_files(path: 'storage/raw_ddc_pages', extension: 'html', all:true).each do |file|
+        categories = category_extractor.extract_categories_from(file)
+        all_categories.merge!(categories) unless categories.nil? or categories.empty?
+      end
+
+      if will_save
+        all_categories_json = all_categories.to_json
+        save_to('storage/all_categories.json',all_categories_json)
+      end
+
+      all_categories
+    end
+
+    def parse_all_books
+      bp = Biblionet::Extractors::BookExtractor.new
+
+      list_directories(path: 'storage/raw_html_pages').each do |dir|
+        dir_to_save = dir.gsub(/raw_html_pages/, 'books')
+
+        list_files(path: dir, extension: 'html', all:true).each do |file|
+
+          # Load the book from html file and parse the data.
+          # pp "Parsing book: #{file}"
+          pp file
+          book = bp.load_and_extract_book(file)
+
+          # Prepare a path to save the new file.
+          filename = File.basename(file,".*")
+          path_to_save = "#{dir_to_save}#{filename}.json"
+
+          # Save to file.
+          bp.save_to("#{path_to_save}", JSON.pretty_generate(book))
+          # pp "Book #{file} saved!"
+        end unless File.directory?(dir_to_save) # if dir.end_with? '/195/'
+      end
+    end
+
+    private
+
+    def process_options(options = {}, caller = nil)
+      # puts caller_locations(1,1)[0].label
+      # options[:format] ||= @format
+      puts caller
+      id = options[:id]
+
+      if id
+        case caller.to_s
+        when 'author'
+          url_method = 'author'
+          local_path = "html_author_pages/#{((id-1)/1000)}/author_#{id}.html"
+        when 'publisher'
+          url_method = 'com'
+          local_path = "html_publisher_pages/#{((id-1)/100)}/publisher_#{id}.html"
+        when 'book'
+          url_method = 'book'
+          local_path = "html_book_pages/#{((id-1)/1000)}/book_#{id}.html"
+        when 'category'
+          url_method = 'index'
+          local_path = "html_ddc_pages/#{((id-1)/1000)}/ddc_#{id}.html"
+        else
+          puts "Called from unknown method. Probably its rspec."
+        end
+
+        options[:local] ||= false
+        url = "#{Bookshark::path_to_storage}/#{local_path}" if options[:local]
+        url = "http://www.biblionet.gr/#{url_method}/#{id}" unless options[:local]
+      end
+      uri = options[:uri] ||= url
+
+      return uri
+    end
+
+    def change_format(hash, format)
+      case format
+      when 'hash'
+        return hash
+      when 'json'
+        hash = hash.to_json
+      when 'pretty_json'
+        hash = JSON.pretty_generate(hash)
+      end
+      return hash
+    end
+
+    def eager_extract_book(uri)
+      book_extractor      = Biblionet::Extractors::BookExtractor.new
+      author_extractor    = Biblionet::Extractors::AuthorExtractor.new
+      publisher_extractor = Biblionet::Extractors::PublisherExtractor.new
+      category_extractor  = Biblionet::Extractors::CategoryExtractor.new
+
+      book = book_extractor.load_and_extract_book(uri)
+
+      tmp_data = []
+      book[:author].each do |author|
+        tmp_data << author_extractor.load_and_extract_author("http://www.biblionet.gr/author/#{author[:b_id]}")
+      end
+      book[:author] = tmp_data
+
+      tmp_data, tmp_hash = [], {}
+      book[:contributors].each do |job, contributors|
+        contributors.each do |contributor|
+          tmp_data << author_extractor.load_and_extract_author("http://www.biblionet.gr/author/#{contributor[:b_id]}")
+        end
+        tmp_hash[job] = tmp_data
+        tmp_data = []
+      end
+      book[:contributors] = tmp_hash
+
+      tmp_data, tmp_hash = [], {}
+      book[:category].each do |category|
+        tmp_data << category_extractor.extract_categories_from("http://www.biblionet.gr/index/#{category[:b_id]}")
+      end
+      book[:category] = tmp_data
+
+      tmp_data = []
+      tmp_data << publisher_extractor.load_and_extract_publisher("http://www.biblionet.gr/com/#{book[:publisher][:b_id]}")
+      book[:publisher] = tmp_data
+
+      book
+    end
+
+  end
+
+
+  class Crawler
+    include FileManager
+    attr_accessor :site
+
+    def initialize(options = {})
+      options = DEFAULTS.merge(options)
+      @site = options[:site]
+    end
+
+    def publishers
+      # crawler = Biblionet::Crawlers::Base.new(start:1, finish:100, step:10)
+      # crawler.spider do |url, path|
+      #   puts "URL: #{url}, PATH: #{path}"
+      # end
+      # puts Biblionet::Extractors::Base.new("http://www.biblionet.gr/com/245").page
+      crawler = Biblionet::Crawlers::PublisherCrawler.new
+      crawler.crawl_and_save
+    end
+
+  end
+
+  # module Biblionet
+  #   class Extract
+  #     class << self
+  #       def author(uri=nil)
+  #         author_extractor = BiblionetParser::Core::AuthorExtractor.new
+  #         author_extractor.load_and_extract_author(uri)
+  #       end
+
+  #       def book(uri=nil)
+  #         bp = BiblionetParser::Core::BookParser.new
+  #         bp.load_and_parse_book(uri)
+  #       end
+
+  #       def categories(uri=nil)
+  #         category_extractor = BiblionetParser::Core::DDCParser.new
+  #         category_extractor.extract_categories_from(uri)
+  #       end
+
+  #     end
+  #   end
+  # end
+end
+
+
+# ae = BiblionetParser::Core::AuthorExtractor.new
+# ae.load_and_extract_author('storage/html_author_pages/0/author_5.html')
+
+
+# Biblionet::Extract.author('storage/html_author_pages/0/author_5.html')
+# Biblionet::Extract.author('storage/html_author_pages/2/author_2423.html')
+# Biblionet::Extract.author('storage/html_author_pages/0/author_764.html')
+# Biblionet::Extract.author('storage/html_author_pages/0/author_435.html')
+
+# bib = Bibliotheca.new
+# categories = bib.parse_all_categories(true)
+
+# p bib.list_files(path: 'raw_html_pages/2', extension:'html')
+# p bib.list_directories
+# p categories[787]
+# categories = 'test'
+# bib.save_to('all_categories_test.json', categories)
+
+# bp = BiblionetParser::Core::BookParser.new
+# bp.load_and_parse_book('storage/raw_html_pages/96/book_96592.html') # BAD Book --no image
+# bp.load_and_parse_book('storage/raw_html_pages/96/book_96937.html') # BAD Book --award
+# bp.load_and_parse_book('storage/raw_html_pages/78/book_78836.html') # BAD Book --multiple awards
+# bp.load_and_parse_book('storage/raw_html_pages/149/book_149345.html') # BAD Book --2 sets of details (ebooks, normals)
+# bp.load_and_parse_book('storage/raw_html_pages/149/book_149402.html') # BAD Book --2 sets of details (normals, reviews)
+# bp.load_and_parse_book('storage/raw_html_pages/149/book_149278.html') # BAD Book --3 sets of details (ebooks, normals, reviews)
+# bp.load_and_parse_book('storage/raw_html_pages/149/book_149647.html')
+# puts JSON.pretty_generate(bp.book)
+
+# bp.load_and_parse_book('storage/raw_html_pages/70/book_70076.html') # BAD Book --Has comma inside award
+
+# bp.load_and_parse_book('storage/raw_html_pages/70/book_70828.html') # BAD Book --No author. Collective Work
+# puts JSON.pretty_generate(bp.book)
+
+# bp.load_and_parse_book('storage/raw_html_pages/70/book_70829.html') # BAD Book --No author, No publisher. Collective Work
+# puts JSON.pretty_generate(bp.book)
+
+# bp.load_and_parse_book('storage/raw_html_pages/145/book_145326.html') # BAD Book --ISMN istead of ISBN
+
+# bp.load_and_parse_book('storage/raw_html_pages/45/book_45455.html') # BAD Book --No author. Has contributors.
+# puts JSON.pretty_generate(bp.book)
+
+
+# bp.load_and_parse_book('storage/raw_html_pages/132/book_132435.html') # BAD Book --Two authors.
+# puts JSON.pretty_generate(bp.book)
+
+# bp.load_and_parse_book('storage/raw_html_pages/133/book_133435.html') # GOOD Book
+
+# puts JSON.pretty_generate(bp.book)
+
+# ddcp = BiblionetParser::Core::DDCParser.new('storage/raw_ddc_pages/0/ddc_298.html')
+# pp all = ddcp.categories
+# pp cur = ddcp.categories.values.last
+# pp sel = ddcp.categories["2703"]
+
+# bp.parse_book('12351', bp.page)
+
+# bp.save_page('storage/mits_ts/mits1.json')
+
+# pp bp.url='http://www.biblionet.gr/book/123351'
+# pp bp.page
+
+# pp bib.list_directories(path: 'storage/raw_html_pages')
+# pp bib.list_files(path: "storage/raw_html_pages/24/", extension: 'html')
+
+# bib = Bibliotheca.new
+# bib.parse_all_books
+
+# Good cases:
+# 'storage/raw_html_pages/123/book_123351.html'
+# 'storage/raw_html_pages/17/book_17351.html'
+# 'storage/raw_html_pages/133/book_133435.html'
+
+# Special book cases to check out:
+# 'storage/raw_html_pages/96/book_96592.html' --no image
+# 'storage/raw_html_pages/96/book_96937.html'
+
+# Problematic at biblionet
+# http://biblionet.gr/book/196388
+# http://biblionet.gr/book/196386
+# http://biblionet.gr/book/195525
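The lib/bookshark.rb entry point above defines the gem's public API: Bookshark::Extractor wraps the per-type extractors and accepts an :id (or a :uri), a :format of 'hash', 'json' or 'pretty_json', plus an :eager flag for books that follows author, contributor, category and publisher links. A minimal usage sketch based only on the code in this hunk; the ids are borrowed from the spec/test_data fixture names:

require 'bookshark'

# :format defaults to 'hash'; 'json' and 'pretty_json' are converted in change_format.
extractor = Bookshark::Extractor.new(format: 'pretty_json')

# process_options turns the id into http://www.biblionet.gr/author/13219.
puts extractor.author(id: 13219)

# eager: true routes through eager_extract_book, embedding full author,
# contributor, category and publisher records instead of bare references.
puts extractor.book(id: 184923, eager: true)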
data/lib/bookshark/crawlers/author_crawler.rb
ADDED
@@ -0,0 +1,42 @@
+require 'rubygems'
+require 'nokogiri'
+require 'open-uri'
+require 'fileutils'
+
+require File.expand_path(File.join(File.dirname(__FILE__), '../extractors', 'base'))
+
+DEFAULTS = {
+  folder:    'storage/html_author_pages',
+  base_url:  'http://www.biblionet.gr/author/',
+  extension: '.html',
+  first_id:  1,
+  last_id:   112000,
+  step:      1000
+}
+
+def crawl_and_save(options={})
+  options = DEFAULTS.merge(options)
+
+  start_id = options[:first_id] + options[:step] - 1
+  last_id  = options[:last_id]
+  step     = options[:step]
+
+  start_id.step(last_id, step) do |last|
+    first = last - step + 1
+    subfolder = (last/step - 1).to_s
+    path = "#{options[:folder]}/#{subfolder}/"
+
+    # Create a new directory (does nothing if directory exists)
+    FileUtils.mkdir_p path
+
+    first.upto(last) do |id|
+      file_to_save = "#{path}author_#{id}#{options[:extension]}"
+      url_to_download = "#{options[:base_url]}#{id}/"
+
+      downloader = Biblionet::Core::Base.new(url_to_download)
+      downloader.save_page(file_to_save) unless downloader.page.nil?
+
+    end
+  end
+
+end
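The crawler shards pages into numbered subfolders, one per :step ids. With the default step of 1000, the subfolder expression (last/step - 1) files ids 1..1000 under 0/, 1001..2000 under 1/, and so on, which is the same bucket process_options in lib/bookshark.rb later recomputes as (id-1)/1000. A small worked check of that arithmetic:

step = 1000
[1, 1000, 1001, 13219].each do |id|
  last = ((id - 1) / step + 1) * step  # upper id of the bucket being filled
  puts "author_#{id}.html -> #{last / step - 1}/"
end
# prints 0/, 0/, 1/ and 13/, matching (id-1)/1000 for each id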
data/lib/bookshark/crawlers/base.rb
ADDED
@@ -0,0 +1,46 @@
+require File.expand_path(File.join(File.dirname(__FILE__), '../extractors', 'base'))
+
+module Biblionet
+  module Crawlers
+
+    class Base
+      def initialize(options = {})
+        @folder    = options[:folder]    ||= 'lib/bookshark/storage/html_base_pages'
+        @base_url  = options[:base_url]  ||= 'http://www.biblionet.gr/base/'
+        @page_type = options[:page_type] ||= 'base'
+        @extension = options[:extension] ||= '.html'
+        @start     = options[:start]     ||= 1
+        @finish    = options[:finish]    ||= 10000
+        @step      = options[:step]      ||= 1000
+      end
+
+      def spider
+        start  = @start + @step - 1
+        finish = @finish
+
+        start.step(finish, @step) do |last|
+          first = last - @step + 1
+          subfolder = (last/@step - 1).to_s
+          path = "#{@folder}/#{subfolder}/"
+
+          # Create a new directory (does nothing if directory exists)
+          # FileUtils.mkdir_p path
+
+          first.upto(last) do |id|
+            file_to_save = "#{path}#{@page_type}_#{id}#{@extension}"
+            url_to_download = "#{@base_url}#{id}/"
+
+            yield(url_to_download, file_to_save)
+            # downloader = Biblionet::Core::Base.new(url_to_download)
+            # downloader.save_page(file_to_save) unless downloader.page.nil?
+
+          end
+        end
+      end
+
+
+    end
+
+
+  end
+end
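Crawlers::Base separates URL generation from fetching: spider walks the id range bucket by bucket and yields each (url, file path) pair to the caller's block, leaving the actual download commented out. A sketch of driving it directly, in the spirit of the commented-out example inside Crawler#publishers in lib/bookshark.rb (the folder value here is illustrative):

# Enumerate the first 2000 publisher pages without fetching anything.
crawler = Biblionet::Crawlers::Base.new(
  base_url:  'http://www.biblionet.gr/com/',
  page_type: 'publisher',
  folder:    'lib/bookshark/storage/html_publisher_pages',
  finish:    2000
)
crawler.spider do |url, path|
  puts "URL: #{url}, PATH: #{path}"  # fetch and save here, e.g. via an extractor
end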