bookshark 1.0.0.alpha.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +20 -0
- data/.rspec +3 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +22 -0
- data/README.md +453 -0
- data/Rakefile +4 -0
- data/bookshark.gemspec +29 -0
- data/lib/bookshark.rb +371 -0
- data/lib/bookshark/crawlers/author_crawler.rb +42 -0
- data/lib/bookshark/crawlers/base.rb +46 -0
- data/lib/bookshark/crawlers/book_crawler.rb +55 -0
- data/lib/bookshark/crawlers/category_crawler.rb +55 -0
- data/lib/bookshark/crawlers/publisher_crawler.rb +35 -0
- data/lib/bookshark/extractors/author_extractor.rb +116 -0
- data/lib/bookshark/extractors/base.rb +187 -0
- data/lib/bookshark/extractors/book_extractor.rb +453 -0
- data/lib/bookshark/extractors/category_extractor.rb +82 -0
- data/lib/bookshark/extractors/publisher_extractor.rb +138 -0
- data/lib/bookshark/extractors/search.rb +104 -0
- data/lib/bookshark/storage/file_manager.rb +103 -0
- data/lib/bookshark/version.rb +3 -0
- data/spec/bookshark_spec.rb +96 -0
- data/spec/spec_helper.rb +1 -0
- data/spec/test_data/author_13219.html +313 -0
- data/spec/test_data/author_13219.json +23 -0
- data/spec/test_data/book_103788.json +49 -0
- data/spec/test_data/category_1041.json +42 -0
- data/spec/test_data/eager_book_184923.json +215 -0
- data/spec/test_data/publisher_20.json +43 -0
- data/spec/test_data/search_01.json +355 -0
- data/spec/test_data/search_ids_01.json +13 -0
- data/tasks/console.rake +4 -0
- data/tasks/rspec.rake +3 -0
- metadata +191 -0
data/lib/bookshark/extractors/book_extractor.rb
@@ -0,0 +1,453 @@
+require_relative 'base'
+require 'sanitize'
+
+module Biblionet
+  module Extractors
+
+    class BookExtractor < Base
+      attr_reader :book
+
+      def initialize(uri=nil)
+        super(uri)
+        extract_book unless uri.nil?
+      end
+
+      def load_and_extract_book(uri=nil)
+        load_page(uri)
+        extract_book unless uri.nil?
+      end
+
+      # Converts the parsed contributors string to hash.
+      # String must have been processed into the following form:
+      # job1: contributor1, contributor2 job2: contributor3
+      # The returned hash is in form: {job1 => ["contributor1","contributor2"],job2 => ["contributor3"]}
+      def proccess_contributors(raw_contributors)
+        contributors = Hash.new
+        partners = Array.new
+        job = :author
+        raw_contributors.each do |cb|
+          if cb.is_a?(String) and cb.end_with? ":"
+            job = cb[0..-2]
+            partners.clear
+          else
+            partners << cb
+            contributors[job] = partners.clone
+          end
+        end unless raw_contributors.nil? or raw_contributors.empty?
+
+        return contributors
+      end
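
For reference, the snippet below is illustrative and not part of the packaged file: it sketches the transformation proccess_contributors performs, assuming the mixed array of job-label strings and contributor hashes that BookDataExtractor#contributors (further down in this file) produces, and assuming Base#initialize tolerates a nil URI as its default argument suggests. Names and IDs are made up.

    # assuming book_extractor.rb and its dependencies (base.rb, nokogiri, sanitize) are loaded
    extractor = Biblionet::Extractors::BookExtractor.new    # no URI, so nothing is fetched or extracted
    raw = [{ name: 'Author One', b_id: '123' },             # entries before any "job:" label are filed under :author
           'Μετάφραση:',                                     # a label string switches the current job key
           { name: 'Translator One', b_id: '456' }]
    extractor.proccess_contributors(raw)
    # => { :author => [{ name: 'Author One', b_id: '123' }],
    #      "Μετάφραση" => [{ name: 'Translator One', b_id: '456' }] }
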
+
+      def proccess_details(details)
+        details_hash = Hash.new
+
+        details.each do |detail|
+          date_regex = /(^\d{4}$)/
+          status_regex = /^\[\p{Word}+(?:\s*[\'\-\+\s]\s*\p{Word}+)*\]$/
+          detail = decode_text(detail)
+
+          begin
+            if detail =~ date_regex
+              #puts "Publication Year: #{detail}"
+              details_hash[:publication_year] = detail
+            elsif detail.end_with? "σελ."
+              pages = detail.gsub(/[^\d]/, '')
+              #puts "Pages: #{pages}"
+              details_hash[:pages] = pages
+            elsif detail.start_with? "ISBN-13"
+              isbn_13 = detail.gsub(/ISBN-13 /, "")
+              details_hash[:isbn_13] = isbn_13
+              #puts "ISBN: #{isbn_13}"
+            elsif detail.start_with? "ISBN"
+              isbn = detail.gsub(/ISBN /, "")
+              #puts "ISBN: #{isbn}"
+              details_hash[:isbn] = isbn
+            elsif detail =~ status_regex
+              status = detail.gsub(/\[|\]/, '')
+              #puts "Status: #{status}"
+              details_hash[:status] = status
+            elsif detail.start_with? "Τιμή"
+              price = detail.gsub(/[^\d,\d]/, '')
+              #puts "Price: #{price}"
+              details_hash[:price] = price
+            elsif detail.start_with? '<img src="/images/award.jpg" border="0" title="Βραβείο">'
+              award = Sanitize.clean(detail).strip
+              details_hash[:awards] = [] if details_hash[:awards].nil?
+              details_hash[:awards] << award
+            elsif detail.start_with? "ISMN" #Special typo case
+              isbn = detail.gsub(/ISMN /, "")
+              #puts "ISBN: #{isbn}"
+              details_hash[:isbn] = isbn
+            else
+              raise NoIdeaWhatThisIsError.new(@biblionet_id, detail)
+            end
+          rescue NoIdeaWhatThisIsError => e
+            pp e
+          end
+        end
+
+        return details_hash
+      end
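
Likewise illustrative and not part of the packaged file: given detail strings of the kinds the branches above test for (sample values made up), and assuming decode_text from Base leaves plain strings unchanged, proccess_details maps them roughly as follows.

    extractor = Biblionet::Extractors::BookExtractor.new    # as above, built without a URI
    details = ['2003', '168 σελ.', 'ISBN 960-14-0157-7', '[Κυκλοφορεί]', 'Τιμή 10,50 €']
    extractor.proccess_details(details)
    # => { :publication_year => '2003',
    #      :pages            => '168',
    #      :isbn             => '960-14-0157-7',
    #      :status           => 'Κυκλοφορεί',
    #      :price            => '10,50' }
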
+
+      def proccess_ddc(ddc, extract_parents = false)
+        # Matches only the digits inside [] in text like: [889.09300] Νεοελληνική λογοτεχνία - Ιστορία και κριτική (300)
+        id_re = /(\[DDC\:\s\d*(?:[\.|\s]\d*)*\])/
+
+        # Matches [digits] and (digits) in text like: [889.09300] Νεοελληνική λογοτεχνία - Ιστορία και κριτική (300)
+        non_text_re = /\s*(\[.*\]|\(.*\))\s*/
+
+        # Gets the DDC part from text and removes anything but digits in [DDC: digits].
+        ddc_id = ddc.scan(id_re).join.gsub(/[\[\]DDC: ]/, '') # Gets the DDC part from text.
+
+        # Extracts the parent tree of current ddc.
+        # ddcparser.parse(ddc_id)
+
+        # Gets text by removing anything but text.
+        ddc_text = ddc.gsub(non_text_re, '').strip
+
+        ddc_hash = { ddc: ddc_id, name: ddc_text }
+        return ddc_hash
+      end
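
Also for reference only: assuming a subject link whose text carries the category name followed by a [DDC: …] tag, which is what the regexes above expect, proccess_ddc splits it into the numeric id and the plain name.

    extractor = Biblionet::Extractors::BookExtractor.new
    extractor.proccess_ddc('Νεοελληνική λογοτεχνία - Ιστορία και κριτική [DDC: 889.09300]')
    # => { :ddc => '889.09300', :name => 'Νεοελληνική λογοτεχνία - Ιστορία και κριτική' }
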
+
+
+      def extract_book(biblionet_id=@biblionet_id, book_page=@page)
+        # log = Logger.new(File.new(File.dirname(__dir__).to_s + "/logs/book_parsing.log",'a+'))
+        log = Logger.new(STDOUT)
+
+        page = BookDataExtractor.new(book_page)
+
+        book_hash = Hash.new
+
+        begin
+          img = page.image
+          raise NoImageError.new(biblionet_id) if img.nil?
+        rescue NoImageError => e
+          pp e
+          log.warn(e.message)
+        rescue StandardError => e
+          pp err_msg = "Error #{e} at book: #{biblionet_id}"
+          log.error(err_msg)
+        end
+
+        book_hash[:title] = page.title
+        book_hash[:subtitle] = page.subtitle
+        book_hash[:image] = img
+
+        contributors = proccess_contributors(page.contributors)
+
+        author = contributors[:author]
+        contributors.delete(:author)
+
+        # If author is empty, maybe it's a collective work.
+        if author.nil? or author.empty?
+          if page.collective_work?
+            # author = 'Συλλογικό έργο'
+            author = ['Συλλογικό έργο']
+          else
+            pp err_msg = "No author has been found at book: #{biblionet_id}"
+            log.warn(err_msg)
+            author = []
+          end
+        end
+
+        book_hash[:author] = author
+        book_hash[:contributors] = contributors
+        book_hash[:publisher] = page.publisher
+
+        details = page.details
+        if details.nil?
+          pp err_msg = "No details at book: #{biblionet_id}"
+          log.error(err_msg)
+        end
+
+        details_hash = proccess_details(details)
+
+        book_hash[:publication_year] = details_hash[:publication_year]
+        book_hash[:pages] = details_hash[:pages]
+        book_hash[:isbn] = details_hash[:isbn]
+        book_hash[:isbn_13] = details_hash[:isbn_13].nil? ? nil : details_hash[:isbn_13]
+        book_hash[:status] = details_hash[:status]
+        book_hash[:price] = details_hash[:price]
+        book_hash[:award] = page.awards
+
+
+        book_hash[:description] = page.description
+
+        ddcs = page.ddcs.map do |ddc|
+          # Extract from href the ddc id used by biblionet. --- DdC url http://biblionet.gr/index/id ---
+          ddc_biblionet_id = ddc[:href].split(/\//).last
+          # Extract DdC id and DdC text.
+          ddc = proccess_ddc(ddc.text)
+
+          ddc.merge!(b_id: ddc_biblionet_id)
+
+        end
+
+
+        book_hash[:category] = ddcs
+        book_hash[:b_id] = biblionet_id
+
+        return @book = book_hash
+      end
+    end
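
Putting the class together, a hypothetical end-to-end call (not part of the diff) would look roughly like this, assuming Base#load_page fetches the page for a biblionet.gr book id such as 103788 or 184923 (the ids used in spec/test_data) and that the URL shape below is the one Base expects.

    extractor = Biblionet::Extractors::BookExtractor.new('http://www.biblionet.gr/book/103788/')
    book = extractor.book       # the hash built by extract_book
    book[:title]                # title text from the page
    book[:author]               # array of { name:, b_id: } hashes, or ['Συλλογικό έργο']
    book[:isbn]                 # from the details block
    book[:category]             # array of { ddc:, name:, b_id: } hashes

    # The same instance can be pointed at another page:
    extractor.load_and_extract_book('http://www.biblionet.gr/book/184923/')
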
+
+    class BookDataExtractor
+      attr_reader :nodeset
+
+      def initialize(document)
+        # No need to operate on whole page. Just on part containing the book.
+        content_re = /<!-- CONTENT START -->.*<!-- CONTENT END -->/m
+        if (content_re.match(document)).nil?
+          puts document
+        end
+        content = content_re.match(document)[0]
+
+        @nodeset = Nokogiri::HTML(content)
+      end
+
+      def image
+        img_node = nil
+        img_nodes = @nodeset.xpath("/html/body//img").each do |i|
+          img_candidate = i.xpath("//img[@src[contains(.,'/covers/')]][1]")
+          img_node = img_candidate unless img_candidate.nil? or img_candidate.empty?
+        end
+
+        img = img_node.nil? ? nil : BASE_URL+(img_node.first)[:src]
+
+        return img
+      end
+
+      def title
+        @nodeset.css('h1.book_title').text
+      end
+
+      def subtitle
+        subtitle = nil
+        @nodeset.xpath("//h1[@class='book_title']").each do |item|
+          if item.next_element.name == 'br' and item.next_element.next.name != 'br'
+            subtitle = item.next_element.next.text.strip
+          end
+        end
+        subtitle
+      end
+
+      def publisher
+        publisher_hash = {}
+        @nodeset.xpath("//a[@class='booklink' and @href[contains(.,'/com/') ]]").each do |item|
+          publisher_hash[:name] = item.text
+          publisher_hash[:b_id] = (item[:href].split("/"))[2]
+        end
+        publisher_hash
+      end
+
+      def contributors
+        contributors = []
+        @nodeset.xpath("//a[@class='booklink' and @href[contains(.,'/author/') ]]").each do |item|
+          pre_text = item.previous.text.strip
+          contributors << pre_text unless pre_text == ',' or !pre_text.end_with? ':'
+          contributor = {}
+          contributor[:name] = item.text
+          contributor[:b_id] = (item[:href].split("/"))[2]
+          contributors << contributor
+        end
+        # Alternative way based on intersecting sets
+        # set_A = "//a[@class='booklink' and @href[contains(.,'/com/') ]][1]/preceding::text()"
+        # set_B = "//a[@class='booklink' and @href[not(contains(.,'/com/')) ]][1]/following::text()"
+
+        # others = book.xpath("#{set_A}[count(.|#{set_B}) = count(#{set_B})]").map do |other|
+        #   text = other.inner_text.strip
+        #   other = text == "," ? nil : text
+        # end.compact
+        contributors
+      end
+
+      def details
+        details = @nodeset.css('.book_details')[0].inner_html.gsub(/(^\d,\d)|(\D,|,\D)(?=[^\]]*(?:\[|$))/, "<br>").split("<br>").map(&:strip).reject!(&:empty?)
+        if details.nil?
+          details = @nodeset.css('.book_details')[1].inner_html.gsub(/(^\d,\d)|(\D,|,\D)(?=[^\]]*(?:\[|$))/, "<br>").split("<br>").map(&:strip).reject!(&:empty?)
+        end
+
+        return details
+      end
+
+      def description
+        desc = @nodeset.css('p').last.inner_html #.to_s.gsub(/<br>/,'\\n')
+        desc = Sanitize.clean(desc, elements: ['br'])
+
+        if (desc =~ /\p{Word}{3,}/).nil?
+          return nil
+        else
+          return desc
+        end
+      end
+
+      def ddcs
+        @nodeset.xpath("//a[@class='subjectlink' and @href[contains(.,'/index/') ]]")
+      end
+
+      def collective_work?
+        return @nodeset.at_css('h1.book_title').parent.text.include?('Συλλογικό έργο') ? true : false
+      end
+
+      # Special case in which there is no author but there are contributors
+      def has_contributors_but_no_authors?
+        node_start = "//h1[@class='book_title']/following::text()"
+        node_end = "//a[@class='booklink' and @href[contains(.,'/author/') ]][1]/preceding::text()"
+        between = (@nodeset.xpath(node_start) & @nodeset.xpath(node_end)).text.strip
+
+        if !between.empty? and between.end_with? ':'
+          true
+        else
+          false
+        end
+      end
+
+      def awards
+        awards = []
+        @nodeset.xpath("//a[@class='booklink' and @href[contains(.,'page=showaward') ]]").each do |item|
+          award = {name: item.text, year: item.next_sibling.text.strip.gsub(/[^\d]/, '')}
+          awards << award
+        end
+
+        return awards
+      end
+
+    end
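
For reference, BookDataExtractor only needs the markup between the CONTENT START/END markers, so a minimal made-up fragment is enough to exercise the simple accessors; the HTML below is illustrative, not taken from biblionet.gr.

    html = '<!-- CONTENT START --><h1 class="book_title">Τίτλος βιβλίου</h1>' \
           '<a class="booklink" href="/com/245/ekdotis">Εκδότης Α</a><!-- CONTENT END -->'
    data = Biblionet::Extractors::BookDataExtractor.new(html)
    data.title      # => "Τίτλος βιβλίου"
    data.publisher  # => { :name => "Εκδότης Α", :b_id => "245" }
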
+
+    # Raised when a book has no image.
+    #
+    class NoImageError < StandardError
+      attr_reader :biblionet_id
+
+      def initialize(biblionet_id)
+        msg = "This book has no image. At book #{biblionet_id}"
+        super(msg)
+      end
+    end
+
+  end
+end
+
+
+# Both methods write a file
+# File.open('book_133435_decoded.html', 'w') { |file| file.write(dec) }
+# File.write('filename', 'content')
+
+# puts decode_file('book_133435.html')
+
+# biblionet_id = '123351'
+
+# biblionet_id = '17351'
+
+# biblionet_id = '133435'
+
+# page = Nokogiri::HTML(open("book_#{biblionet_id}.html"))
+
+# book_hash = Hash.new
+
+# book = page.css('//tr/td[width="180"][valign="top"][align="left"]')
+
+
+# img = (page.xpath("/html/body//img[@src[contains(.,'/covers/')]][1]").first)['src']
+# book_hash['image'] = BASE_URL+img
+
+# title = page.css('h1.book_title').text
+# book_hash['title'] = title
+
+# author = page.css('a.booklink').first.text
+# book_hash['author'] = author
+
+# # others = page.xpath("//a[@class='booklink' and @href[not(contains(.,'/com/')) ]]")
+
+# publisher = page.xpath("//a[@class='booklink' and @href[contains(.,'/com/') ]][1]").text
+# book_hash['publisher'] = publisher
+
+# A = "//a[@class='booklink' and @href[contains(.,'/com/') ]][1]/preceding::text()"
+# B = "//a[@class='booklink' and @href[not(contains(.,'/com/')) ]][1]/following::text()"
+
+# others = book.xpath("#{A}[count(.|#{B}) = count(#{B})]").inner_text
+
+# others = others.split(/\n/).map(&:strip).reject!(&:empty?)
+
+# details = page.css('.book_details').inner_html.gsub(/(^\d,\d)|(\D,)|(,\D)/, "<br>").split("<br>").map(&:strip).reject!(&:empty?)
+# details_hash = proccess_details(details)
+
+# book_hash['publication_year'] = details_hash['publication_year']
+# book_hash['pages'] = details_hash['pages']
+# book_hash['isbn'] = details_hash['isbn']
+# book_hash['isbn_13'] = details_hash['isbn_13'].nil? ? nil : details_hash['isbn_13']
+# book_hash['status'] = details_hash['status']
+# book_hash['price'] = details_hash['price']
+
+# contributors = proccess_contributors(others)
+# book_hash['contributors'] = contributors
+
+# # puts test.xpath("#{A}[count(.|#{B}) = count(#{B})]")
+
+# # puts author.search('/following::node()')
+
+# desc = page.css('p').last.inner_html #.to_s.gsub(/<br>/,'\\n')
+# desc = Sanitize.clean(desc, elements: ['br'])
+
+# if (desc =~ /\p{Word}{3,}/).nil?
+#   book_hash['description'] = nil
+# else
+#   book_hash['description'] = desc
+# end
+
+# ddcs = page.xpath("//a[@class='subjectlink' and @href[contains(.,'/index/') ]]").map do |ddc|
+#   # Extract from href the ddc id used by biblionet. --- DdC url http://biblionet.gr/index/id ---
+#   ddc_biblionet_id = ddc[:href].split(/\//).last
+#   # Extact DdC id and DdC text.
+#   ddc = proccess_ddc(ddc.text)
+
+#   ddc.merge!(b_id: ddc_biblionet_id)
+
+# end
+
+
+# book_hash['ddc_ids'] = ddcs
+# book_hash['biblionet_id'] = biblionet_id
+
+# book_json = book_hash.to_json
+
+# puts book_json_pretty = JSON.pretty_generate(book_hash)
+
+# File.open("book_#{biblionet_id}.json","w") do |f|
+#   f.write(book_json)
+# end
+
+# def contributors(n)
+#   contributors = []
+#   n.xpath("//a[@class='booklink' and @href[contains(.,'/author/') ]]").each do |item|
+#     pre_text = item.previous.text.strip
+#     contributors << pre_text unless pre_text == ',' or !pre_text.end_with? ':'
+#     contributor = {}
+#     contributor['name'] = item.text
+#     contributor['b_id'] = (item[:href].split("/"))[2]
+#     contributors << contributor
+#   end
+#   contributors
+# end
+
+# c = contributors(n4)
+
+# def proccess_contributors(raw_contributors)
+#   contributors = Hash.new
+#   partners = Array.new
+#   job = "author"
+#   raw_contributors.each do |cb|
+#     if cb.is_a?(String) and cb.end_with? ":"
+#       job = cb[0..-2]
+#       partners.clear
+#     else
+#       partners << cb
+#       contributors[job] = partners.clone
+#     end
+#   end unless raw_contributors.nil? or raw_contributors.empty?
+
+#   return contributors
+# end
+
+# c2 = proccess_contributors(c)
+