bookshark 1.0.0.alpha.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,453 @@
1
+ require_relative 'base'
2
+ require 'sanitize'
3
+
4
+ module Biblionet
5
+ module Extractors
6
+
7
+ class BookExtractor < Base
8
+ attr_reader :book
9
+
10
# Builds the extractor and, when a uri is supplied, extracts the book
# right away.
#
# uri - value forwarded to the parent initializer; nil (the default)
#       skips the extraction step.
def initialize(uri=nil)
  super(uri)
  return if uri.nil?

  extract_book
end
14
+
15
# Loads the page for the given uri through load_page and then extracts the
# book from it.
#
# uri - value forwarded to load_page; when nil (the default) the page load
#       is still attempted but no extraction takes place.
#
# Returns the result of extract_book, or nil when uri is nil.
def load_and_extract_book(uri=nil)
  load_page(uri)
  return if uri.nil?

  extract_book
end
19
+
20
# Groups the flat contributor list produced by the page parser by job.
#
# raw_contributors - Array mixing job labels (Strings ending in ":") and
#                    contributor entries, in page order. May be nil or empty.
#                    Entries that appear before any label are filed under
#                    the :author key.
#
# Example
#
#   proccess_contributors(['c1', 'job1:', 'c2', 'c3'])
#   # => { :author => ['c1'], 'job1' => ['c2', 'c3'] }
#
# Returns a Hash mapping each job (the label without its trailing ":") to the
# Array of contributors listed after it. A label that is not followed by any
# contributor produces no key; a label that appears twice keeps only the
# contributors of its last occurrence.
def proccess_contributors(raw_contributors)
  contributors = {}
  return contributors if raw_contributors.nil? || raw_contributors.empty?

  job = :author
  partners = []

  raw_contributors.each do |cb|
    if cb.is_a?(String) && cb.end_with?(':')
      job = cb[0..-2]
      # Start a fresh list instead of clearing the old one, so the array
      # already stored for the previous job is left untouched and no
      # per-item copy is needed.
      partners = []
    else
      partners << cb
      contributors[job] = partners
    end
  end

  contributors
end
40
+
41
# Turns the raw detail fragments of a book page into a Hash.
#
# details - Array of Strings, one per detail fragment. nil is treated as an
#           empty list (extract_book may hand over nil when the page has no
#           details).
#
# Every fragment is first passed through decode_text (a helper defined
# outside this method) and then classified, in this order:
#
#   exactly four digits               -> :publication_year
#   ends with "σελ."                  -> :pages (digits only)
#   starts with "ISBN-13"             -> :isbn_13
#   starts with "ISBN"                -> :isbn
#   bracketed status text "[...]"     -> :status (brackets removed)
#   starts with "Τιμή"                -> :price (digits and commas only)
#   starts with the award <img> tag   -> appended to :awards
#   starts with "ISMN"                -> :isbn (special typo case)
#
# A fragment that matches none of the above is reported with pp as a
# NoIdeaWhatThisIsError and skipped.
#
# Returns the Hash of recognised details; empty when nothing was recognised.
def proccess_details(details)
  details_hash = {}
  return details_hash if details.nil?

  # Built once rather than on every iteration.
  date_regex = /(^\d{4}$)/
  status_regex = /^\[\p{Word}+(?:\s*[\'\-\+\s]\s*\p{Word}+)*\]$/
  award_prefix = '<img src="/images/award.jpg" border="0" title="Βραβείο">'

  details.each do |detail|
    detail = decode_text(detail)

    if detail =~ date_regex
      details_hash[:publication_year] = detail
    elsif detail.end_with? "σελ."
      details_hash[:pages] = detail.gsub(/[^\d]/, '')
    elsif detail.start_with? "ISBN-13"
      details_hash[:isbn_13] = detail.gsub(/ISBN-13 /, "")
    elsif detail.start_with? "ISBN"
      details_hash[:isbn] = detail.gsub(/ISBN /, "")
    elsif detail =~ status_regex
      details_hash[:status] = detail.gsub(/\[|\]/, '')
    elsif detail.start_with? "Τιμή"
      # Keep digits and the decimal comma only.
      details_hash[:price] = detail.gsub(/[^\d,]/, '')
    elsif detail.start_with? award_prefix
      details_hash[:awards] ||= []
      details_hash[:awards] << Sanitize.clean(detail).strip
    elsif detail.start_with? "ISMN" # Special typo case
      details_hash[:isbn] = detail.gsub(/ISMN /, "")
    else
      # Unknown fragment: report it and carry on with the remaining ones.
      pp NoIdeaWhatThisIsError.new(@biblionet_id, detail)
    end
  end

  details_hash
end
91
+
92
# Splits the text of a DDC subject link into its classification number and
# its descriptive name.
#
# ddc             - String such as "[DDC: 889.09300] Some subject (300)".
# extract_parents - accepted for interface compatibility; not used by this
#                   method.
#
# Returns a Hash with two keys:
#   :ddc  - what remains of every "[DDC: ...]" group after removing the
#           brackets, the letters "D" and "C", colons and spaces; an empty
#           String when there is no such group.
#   :name - the input with every "[...]" and "(...)" part (and the
#           whitespace around it) removed, then stripped.
def proccess_ddc(ddc, extract_parents = false)
  bracketed_ids = ddc.scan(/(\[DDC\:\s\d*(?:[\.|\s]\d*)*\])/)
  label = ddc.gsub(/\s*(\[.*\]|\(.*\))\s*/, '')

  {
    ddc: bracketed_ids.join.gsub(/[\[\]DDC: ]/, ''),
    name: label.strip
  }
end
111
+
112
+
113
# Collects every piece of book data found on a page into a single Hash.
#
# biblionet_id - id used in log messages and stored under :b_id
#                (defaults to @biblionet_id).
# book_page    - page content handed to BookDataExtractor (defaults to @page).
#
# Problems that do not prevent extraction (no cover image, no author, no
# details) are printed with pp and written to the logger; extraction then
# continues with nil or empty values for the affected fields.
#
# Returns the book Hash, which is also stored in @book.
def extract_book(biblionet_id=@biblionet_id, book_page=@page)
  # log = Logger.new(File.new(File.dirname(__dir__).to_s + "/logs/book_parsing.log",'a+'))
  log = Logger.new(STDOUT)

  page = BookDataExtractor.new(book_page)

  book_hash = {}

  # A missing or unreadable cover is only reported; img stays nil then.
  img = nil
  begin
    img = page.image
    raise NoImageError.new(biblionet_id) if img.nil?
  rescue NoImageError => e
    pp e
    log.warn(e.message)
  rescue StandardError => e
    pp err_msg = "Error #{e} at book: #{biblionet_id}"
    log.error(err_msg)
  end

  book_hash[:title] = page.title
  book_hash[:subtitle] = page.subtitle
  book_hash[:image] = img

  contributors = proccess_contributors(page.contributors)

  # The author list is reported separately from the other contributors.
  author = contributors.delete(:author)

  # If author is empty, maybe its a collective work.
  if author.nil? || author.empty?
    if page.collective_work?
      # author = 'Συλλογικό έργο'
      author = ['Συλλογικό έργο']
    else
      pp err_msg = "No author has been found at book: #{biblionet_id}"
      log.warn(err_msg)
      author = []
    end
  end

  book_hash[:author] = author
  book_hash[:contributors] = contributors
  book_hash[:publisher] = page.publisher

  details = page.details
  if details.nil?
    pp err_msg = "No details at book: #{biblionet_id}"
    log.error(err_msg)
    # Continue with an empty list so the problem reported above does not
    # turn into a NoMethodError while processing the details.
    details = []
  end

  details_hash = proccess_details(details)

  [:publication_year, :pages, :isbn, :isbn_13, :status, :price].each do |key|
    book_hash[key] = details_hash[key]
  end
  book_hash[:award] = page.awards

  book_hash[:description] = page.description

  ddcs = page.ddcs.map do |ddc_link|
    # Extract from href the ddc id used by biblionet. --- DdC url http://biblionet.gr/index/id ---
    ddc_biblionet_id = ddc_link[:href].split(/\//).last
    # Extract DdC id and DdC text, then attach the biblionet id.
    proccess_ddc(ddc_link.text).merge!(b_id: ddc_biblionet_id)
  end

  book_hash[:category] = ddcs
  book_hash[:b_id] = biblionet_id

  @book = book_hash
end
192
+ end
193
+
194
+ class BookDataExtractor
195
+ attr_reader :nodeset
196
+
197
# Parses the book section of a raw page into @nodeset.
#
# document - String with the page HTML. Only the part between the
#            "<!-- CONTENT START -->" and "<!-- CONTENT END -->" markers
#            is parsed.
#
# Raises ArgumentError when the document does not contain that section
# (previously this surfaced as a NoMethodError on nil after dumping the
# whole document to stdout).
def initialize(document)
  # No need to operate on whole page. Just on part containing the book.
  content_re = /<!-- CONTENT START -->.*<!-- CONTENT END -->/m
  content = content_re.match(document)

  if content.nil?
    raise ArgumentError, 'Document has no <!-- CONTENT START --> ... <!-- CONTENT END --> section'
  end

  @nodeset = Nokogiri::HTML(content[0])
end
207
+
208
# Finds the cover image of the book.
#
# Returns BASE_URL followed by the src of the first <img> in the parsed
# content whose src contains "/covers/", or nil when there is no such image.
def image
  # One lookup is enough: the query is absolute, so repeating it for every
  # <img> on the page (as before) always produced the same node.
  cover = @nodeset.at_xpath("//img[@src[contains(.,'/covers/')]]")

  cover.nil? ? nil : BASE_URL + cover[:src]
end
219
+
220
# Returns the text of the h1.book_title element(s) of the parsed content.
def title
  heading = @nodeset.css('h1.book_title')
  heading.text
end
223
+
224
# Looks for a subtitle right after the book title.
#
# A subtitle is taken to be the node that follows the <br> placed directly
# after an h1.book_title heading, provided that node is not itself a <br>.
# When several headings qualify, the last one wins.
#
# Returns the stripped subtitle text, or nil when none is found (including
# when the heading has no following element or the <br> has no following
# node, cases that previously raised NoMethodError).
def subtitle
  subtitle = nil

  @nodeset.xpath("//h1[@class='book_title']").each do |item|
    line_break = item.next_element
    next if line_break.nil? || line_break.name != 'br'

    candidate = line_break.next
    next if candidate.nil? || candidate.name == 'br'

    subtitle = candidate.text.strip
  end

  subtitle
end
233
+
234
# Reads the publisher from the "booklink" anchors whose href contains "/com/".
#
# Returns a Hash with :name (the link text) and :b_id (the third "/"-separated
# piece of the href). When several links match, the values of the last one
# are kept; when none matches, the Hash is empty.
def publisher
  company_links = @nodeset.xpath("//a[@class='booklink' and @href[contains(.,'/com/') ]]")

  company_links.each_with_object({}) do |link, info|
    info[:name] = link.text
    info[:b_id] = link[:href].split("/")[2]
  end
end
242
+
243
# Lists the contributors of the book in page order.
#
# Every "booklink" anchor whose href contains "/author/" yields a Hash with
# :name (the link text) and :b_id (the third "/"-separated piece of the
# href). When the node right before the link holds a job label (text ending
# in ":"), that label is inserted as a String just before the Hash.
#
# Returns an Array mixing label Strings and contributor Hashes; this is the
# input format proccess_contributors works on.
def contributors
  contributors = []
  @nodeset.xpath("//a[@class='booklink' and @href[contains(.,'/author/') ]]").each do |item|
    # A link may be the first node of its parent; treat that as "no label"
    # instead of failing on nil.
    previous = item.previous
    pre_text = previous.nil? ? '' : previous.text.strip

    # Only job labels are kept; separators such as "," never end in ":".
    contributors << pre_text if pre_text.end_with?(':')

    contributors << { name: item.text, b_id: item[:href].split("/")[2] }
  end
  # Alternative way based on intersecting sets
  # set_A = "//a[@class='booklink' and @href[contains(.,'/com/') ]][1]/preceding::text()"
  # set_B = "//a[@class='booklink' and @href[not(contains(.,'/com/')) ]][1]/following::text()"

  # others = book.xpath("#{set_A}[count(.|#{set_B}) = count(#{set_B})]").map do |other|
  #   text = other.inner_text.strip
  #   other = text == "," ? nil : text
  # end.compact
  contributors
end
263
+
264
# Splits the book details markup into a list of trimmed fragments.
#
# The inner HTML of the first .book_details element is cut at "<br>" tags and
# at the commas matched by the separator pattern below; the pieces are
# stripped and blank ones removed. If that yields nil, the second
# .book_details element is processed the same way.
#
# NOTE(review): blank pieces are removed with reject!, which returns nil when
# there was nothing to remove. The fallback to the second element (and a nil
# return value) therefore happens exactly when an element produced no blank
# piece. That behaviour is preserved here as-is; confirm whether it is
# intended before changing it.
#
# Returns an Array of Strings, or nil (see the note above).
def details
  separator_re = /(^\d,\d)|(\D,|,\D)(?=[^\]]*(?:\[|$))/
  blocks = @nodeset.css('.book_details')

  fragments_of = lambda do |block|
    pieces = block.inner_html.gsub(separator_re, "<br>").split("<br>").map(&:strip)
    pieces.reject!(&:empty?)
  end

  found = fragments_of.call(blocks[0])
  found = fragments_of.call(blocks[1]) if found.nil?

  found
end
272
+
273
# Extracts the book description from the last <p> of the parsed content.
#
# The paragraph's inner HTML is cleaned with Sanitize, keeping only <br>
# elements.
#
# Returns the cleaned description, or nil when the content has no <p> at all
# (previously a NoMethodError) or when the cleaned text does not contain at
# least three consecutive word characters.
def description
  paragraph = @nodeset.css('p').last #.to_s.gsub(/<br>/,'\\n')
  return nil if paragraph.nil?

  desc = Sanitize.clean(paragraph.inner_html, elements: ['br'])

  desc =~ /\p{Word}{3,}/ ? desc : nil
end
283
+
284
# Returns the "subjectlink" anchors of the parsed content whose href
# contains "/index/" (the DDC category links), as given back by xpath.
def ddcs
  subject_links = "//a[@class='subjectlink' and @href[contains(.,'/index/') ]]"
  @nodeset.xpath(subject_links)
end
287
+
288
# Tells whether the page marks the book as a collective work.
#
# Returns true when the text of the element containing the h1.book_title
# heading includes 'Συλλογικό έργο'; false otherwise, including when the
# heading (or its parent) is missing, which previously raised NoMethodError.
def collective_work?
  heading = @nodeset.at_css('h1.book_title')
  return false if heading.nil? || heading.parent.nil?

  heading.parent.text.include?('Συλλογικό έργο')
end
291
+
292
# Special case in which there is no author but there are contributors.
#
# Takes the text nodes located after the h1.book_title heading and before
# the first "/author/" booklink, joins and strips them.
#
# Returns true when that text ends with ":", false otherwise (an empty text
# never ends with ":").
def has_contributors_but_no_authors?
  after_title = @nodeset.xpath("//h1[@class='book_title']/following::text()")
  before_first_author = @nodeset.xpath("//a[@class='booklink' and @href[contains(.,'/author/') ]][1]/preceding::text()")

  between = (after_title & before_first_author).text.strip

  between.end_with?(':')
end
304
+
305
# Lists the awards mentioned on the page.
#
# Every "booklink" anchor whose href contains "page=showaward" yields a Hash
# with :name (the link text) and :year (the digits found in the node that
# follows the link). A link without a following node gets an empty year
# instead of raising NoMethodError.
#
# Returns an Array of Hashes; empty when the page lists no awards.
def awards
  @nodeset.xpath("//a[@class='booklink' and @href[contains(.,'page=showaward') ]]").map do |item|
    trailing = item.next_sibling
    year = trailing.nil? ? '' : trailing.text.strip.gsub(/[^\d]/, '')

    { name: item.text, year: year }
  end
end
314
+
315
+ end
316
+
317
# Raised when a book has no image.
#
# The id of the affected book is kept in biblionet_id and is also part of
# the error message.
class NoImageError < StandardError
  attr_reader :biblionet_id

  # biblionet_id - id of the book that has no image.
  def initialize(biblionet_id)
    # Store the id so the reader declared above returns it (it was never
    # assigned before and always answered nil).
    @biblionet_id = biblionet_id
    super("This book has no image. At book #{biblionet_id}")
  end
end
327
+
328
+ end
329
+ end
330
+
331
+
332
+ # Both methods write a file
333
+ # File.open('book_133435_decoded.html', 'w') { |file| file.write(dec) }
334
+ # File.write('filename', 'content')
335
+
336
+ # puts decode_file('book_133435.html')
337
+
338
+ # biblionet_id = '123351'
339
+
340
+ # biblionet_id = '17351'
341
+
342
+ # biblionet_id = '133435'
343
+
344
+ # page = Nokogiri::HTML(open("book_#{biblionet_id}.html"))
345
+
346
+ # book_hash = Hash.new
347
+
348
+ # book = page.css('//tr/td[width="180"][valign="top"][align="left"]')
349
+
350
+
351
+ # img = (page.xpath("/html/body//img[@src[contains(.,'/covers/')]][1]").first)['src']
352
+ # book_hash['image'] = BASE_URL+img
353
+
354
+ # title = page.css('h1.book_title').text
355
+ # book_hash['title'] = title
356
+
357
+ # author = page.css('a.booklink').first.text
358
+ # book_hash['author'] = author
359
+
360
+ # # others = page.xpath("//a[@class='booklink' and @href[not(contains(.,'/com/')) ]]")
361
+
362
+ # publisher = page.xpath("//a[@class='booklink' and @href[contains(.,'/com/') ]][1]").text
363
+ # book_hash['publisher'] = publisher
364
+
365
+ # A = "//a[@class='booklink' and @href[contains(.,'/com/') ]][1]/preceding::text()"
366
+ # B = "//a[@class='booklink' and @href[not(contains(.,'/com/')) ]][1]/following::text()"
367
+
368
+ # others = book.xpath("#{A}[count(.|#{B}) = count(#{B})]").inner_text
369
+
370
+ # others = others.split(/\n/).map(&:strip).reject!(&:empty?)
371
+
372
+ # details = page.css('.book_details').inner_html.gsub(/(^\d,\d)|(\D,)|(,\D)/, "<br>").split("<br>").map(&:strip).reject!(&:empty?)
373
+ # details_hash = proccess_details(details)
374
+
375
+ # book_hash['publication_year'] = details_hash['publication_year']
376
+ # book_hash['pages'] = details_hash['pages']
377
+ # book_hash['isbn'] = details_hash['isbn']
378
+ # book_hash['isbn_13'] = details_hash['isbn_13'].nil? ? nil : details_hash['isbn_13']
379
+ # book_hash['status'] = details_hash['status']
380
+ # book_hash['price'] = details_hash['price']
381
+
382
+ # contributors = proccess_contributors(others)
383
+ # book_hash['contributors'] = contributors
384
+
385
+ # # puts test.xpath("#{A}[count(.|#{B}) = count(#{B})]")
386
+
387
+ # # puts author.search('/following::node()')
388
+
389
+ # desc = page.css('p').last.inner_html #.to_s.gsub(/<br>/,'\\n')
390
+ # desc = Sanitize.clean(desc, elements: ['br'])
391
+
392
+ # if (desc =~ /\p{Word}{3,}/).nil?
393
+ # book_hash['description'] = nil
394
+ # else
395
+ # book_hash['description'] = desc
396
+ # end
397
+
398
+ # ddcs = page.xpath("//a[@class='subjectlink' and @href[contains(.,'/index/') ]]").map do |ddc|
399
+ # # Extract from href the ddc id used by biblionet. --- DdC url http://biblionet.gr/index/id ---
400
+ # ddc_biblionet_id = ddc[:href].split(/\//).last
401
+ # # Extact DdC id and DdC text.
402
+ # ddc = proccess_ddc(ddc.text)
403
+
404
+ # ddc.merge!(b_id: ddc_biblionet_id)
405
+
406
+ # end
407
+
408
+
409
+ # book_hash['ddc_ids'] = ddcs
410
+ # book_hash['biblionet_id'] = biblionet_id
411
+
412
+ # book_json = book_hash.to_json
413
+
414
+ # puts book_json_pretty = JSON.pretty_generate(book_hash)
415
+
416
+ # File.open("book_#{biblionet_id}.json","w") do |f|
417
+ # f.write(book_json)
418
+ # end
419
+
420
+ # def contributors(n)
421
+ # contributors = []
422
+ # n.xpath("//a[@class='booklink' and @href[contains(.,'/author/') ]]").each do |item|
423
+ # pre_text = item.previous.text.strip
424
+ # contributors << pre_text unless pre_text == ',' or !pre_text.end_with? ':'
425
+ # contributor = {}
426
+ # contributor['name'] = item.text
427
+ # contributor['b_id'] = (item[:href].split("/"))[2]
428
+ # contributors << contributor
429
+ # end
430
+ # contributors
431
+ # end
432
+
433
+ # c = contributors(n4)
434
+
435
+ # def proccess_contributors(raw_contributors)
436
+ # contributors = Hash.new
437
+ # partners = Array.new
438
+ # job = "author"
439
+ # raw_contributors.each do |cb|
440
+ # if cb.is_a?(String) and cb.end_with? ":"
441
+ # job = cb[0..-2]
442
+ # partners.clear
443
+ # else
444
+ # partners << cb
445
+ # contributors[job] = partners.clone
446
+ # end
447
+ # end unless raw_contributors.nil? or raw_contributors.empty?
448
+
449
+ # return contributors
450
+ # end
451
+
452
+ # c2 = proccess_contributors(c)
453
+