bookshark 1.0.4 → 1.0.6

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 3c55195dacb83de8db6a4e40bc79d042abd3dbca
4
- data.tar.gz: 07eb3cc3e019b52bea3a26e03fea7957d5a5ea8a
3
+ metadata.gz: 140903206c1308ba6eda3c1c1ee6a910be0a28b3
4
+ data.tar.gz: d1891f2c7ba96340201e5bd2936a07ca83032e4b
5
5
  SHA512:
6
- metadata.gz: 9754f7035b8d85724f7ba0cd1cff7a471f454feca7f3d03759a5f18a0d7d9902ab67498dec57021d9d118203ffc8dad12f32b6dcf4dee8f392813221b61cbe21
7
- data.tar.gz: ac70590c2eb3424e13105c9aa410505daef18f51e778252c8aa39ddc9a358560d469633521ef315a91479299efb7124f5bb648e864614cf6ba2ffc457e2a6cbb
6
+ metadata.gz: 1a0fa064e9fc157ad837682e520d5f287546ebb63631887f0108892f0d12118ba2ec3f0cf20e7a0def18a02e8213958c87963a7cbbfb9f05548ef57d11d8fc12
7
+ data.tar.gz: a51c475919cb2c55c5be0b1bda1765edfb9cc20ca1b68ea75fa9eefbfcc576d141ba6c1f6bd2744da3e0cb63dbc8de23a303dfecd0587dbc8ab62e8c9ab4221f
@@ -116,6 +116,20 @@ module Biblionet
116
116
 
117
117
  end
118
118
 
119
+ # Splits availability with date like "Υπό Έκδοση 4/2017" to availablity and last update
120
+ unless details_hash[:availability].nil?
121
+ availability = details_hash[:availability]
122
+ details_hash[:availability] = "Υπό Έκδοση" if availability.include? "Υπό Έκδοση"
123
+
124
+ if details_hash[:last_update].nil?
125
+ if availability =~ /(\d{1,2}\/)?\d{1,2}\/\d{2,}/
126
+ last_update = availability.match(/(\d{1,2}\/)?\d{1,2}\/\d{2,}/)[0]
127
+ last_update = "10/" + last_update if last_update.split('/').length == 2
128
+ details_hash[:last_update] = last_update
129
+ end
130
+ end
131
+ end
132
+
119
133
  pre_details_text = @nodeset.xpath("//span[@class='small'][1]/preceding::text()").text
120
134
  pre_details_text = BibliographicalBookExtractor.decode_text(pre_details_text)
121
135
 
@@ -6,22 +6,22 @@ require_relative 'bibliographical_book_extractor'
6
6
  require 'sanitize'
7
7
 
8
8
  module Biblionet
9
- module Extractors
10
-
11
- class BookExtractor < Base
9
+ module Extractors
10
+
11
+ class BookExtractor < Base
12
12
  attr_reader :book
13
13
 
14
14
  def initialize(uri=nil)
15
- super(uri)
16
- extract_book unless uri.nil? or @page.nil?
15
+ super(uri)
16
+ extract_book unless uri.nil? or @page.nil?
17
17
  end
18
18
 
19
19
  def load_and_extract_book(uri=nil)
20
20
  load_page(uri)
21
21
  extract_book unless uri.nil? or @page.nil?
22
- end
22
+ end
23
23
 
24
- # Converts the parsed contributors string to hash.
24
+ # Converts the parsed contributors string to hash.
25
25
  # String must have been processed into the following form:
26
26
  # job1: contributor1, contributor2 job2: contributor3
27
27
  # The returned hash is in form: {job1 => ["contributor1","contributor2"],job2 => ["contributor3"]}
@@ -36,18 +36,18 @@ module Biblionet
36
36
  else
37
37
  partners << cb
38
38
  contributors[job] = partners.clone
39
- end
39
+ end
40
40
  end unless raw_contributors.nil? or raw_contributors.empty?
41
-
41
+
42
42
  return contributors
43
43
  end
44
44
 
45
45
  def proccess_details(details)
46
46
  details_hash = Hash.new
47
-
48
- details.each do |detail|
47
+
48
+ details.each do |detail|
49
49
  date_regex = /(^\d{4}$)/
50
- status_regex = /^\[\p{Word}+(?:\s*[\'\-\+\s]\s*\p{Word}+)*\]$/
50
+ status_regex = /^\[\p{Word}+(?:\s*[\'\-\+\s]\s*\p{Word}+)*\]$/
51
51
  detail = decode_text(detail)
52
52
 
53
53
  begin
@@ -59,11 +59,11 @@ module Biblionet
59
59
  #puts "Pages: #{pages}"
60
60
  details_hash[:pages] = pages
61
61
  elsif detail.start_with? "ISBN-13"
62
- isbn_13 = detail.gsub(/ISBN-13 /, "")
62
+ isbn_13 = detail.gsub(/ISBN-13 /, "").gsub("&Chi","X")
63
63
  details_hash[:isbn_13] = isbn_13
64
- #puts "ISBN: #{isbn_13}"
64
+ #puts "ISBN: #{isbn_13}"
65
65
  elsif detail.start_with? "ISBN"
66
- isbn = detail.gsub(/ISBN /, "")
66
+ isbn = detail.gsub(/ISBN /, "").gsub("&Chi","X")
67
67
  #puts "ISBN: #{isbn}"
68
68
  details_hash[:isbn] = isbn
69
69
  elsif detail =~ status_regex
@@ -81,12 +81,12 @@ module Biblionet
81
81
  elsif detail.start_with? "ISMN" #Special typo case
82
82
  isbn = detail.gsub(/ISMN /, "")
83
83
  #puts "ISBN: #{isbn}"
84
- details_hash[:isbn] = isbn
85
- else
84
+ details_hash[:isbn] = isbn
85
+ else
86
86
  raise NoIdeaWhatThisIsError.new(@biblionet_id, detail)
87
87
  end
88
88
  rescue NoIdeaWhatThisIsError => e
89
- pp e
89
+ pp e
90
90
  end
91
91
  end
92
92
 
@@ -94,86 +94,86 @@ module Biblionet
94
94
  end
95
95
 
96
96
  def proccess_ddc(ddc, extract_parents = false)
97
- # Matches only the digits inside [] in text like: [889.09300] Νεοελληνική λογοτεχνία - Ιστορία και κριτική (300)
97
+ # Matches only the digits inside [] in text like: [889.09300] Νεοελληνική λογοτεχνία - Ιστορία και κριτική (300)
98
98
  id_re = /(\[DDC\:\s\d*(?:[\.|\s]\d*)*\])/
99
99
 
100
- # Matches [digits] and (digits) in text like: [889.09300] Νεοελληνική λογοτεχνία - Ιστορία και κριτική (300)
100
+ # Matches [digits] and (digits) in text like: [889.09300] Νεοελληνική λογοτεχνία - Ιστορία και κριτική (300)
101
101
  non_text_re = /\s*(\[.*\]|\(.*\))\s*/
102
-
103
- # Gets the dcc part from text and removes anything but digits in [DDC: digits].
104
- ddc_id = ddc.scan(id_re).join.gsub(/[\[\]DDC: ]/, '') # Gets the dcc part from text.
102
+
103
+ # Gets the dcc part from text and removes anything but digits in [DDC: digits].
104
+ ddc_id = ddc.scan(id_re).join.gsub(/[\[\]DDC: ]/, '') # Gets the dcc part from text.
105
105
 
106
106
  # Extracts the parent tree of current ddc.
107
- # ddcparser.parse(ddc_id)
107
+ # ddcparser.parse(ddc_id)
108
108
 
109
109
  # Gets text by reomoving anything but text.
110
110
  ddc_text = ddc.gsub(non_text_re, '').strip
111
111
 
112
- ddc_hash = { ddc: ddc_id, name: ddc_text }
112
+ ddc_hash = { ddc: ddc_id, name: ddc_text }
113
113
  return ddc_hash
114
- end
114
+ end
115
115
 
116
116
 
117
- def extract_book(biblionet_id=@biblionet_id, book_page=@page)
117
+ def extract_book(biblionet_id=@biblionet_id, book_page=@page)
118
118
  # log = Logger.new(File.new(File.dirname(__dir__).to_s + "/logs/book_parsing.log",'a+'))
119
119
  log = Logger.new(STDOUT)
120
-
120
+
121
121
  page = BookDataExtractor.new(book_page)
122
122
 
123
123
  # End extraction if BookDataExtractor couldnt create a nodeset
124
124
  return nil if page.nodeset.nil?
125
125
 
126
126
 
127
- book_hash = Hash.new
127
+ book_hash = Hash.new
128
128
 
129
- begin
130
- img = page.image
129
+ begin
130
+ img = page.image
131
131
  raise NoImageError.new(biblionet_id) if img.nil?
132
132
  rescue NoImageError => e
133
- pp e
134
- log.warn(e.message)
133
+ pp e
134
+ log.warn(e.message)
135
135
  rescue StandardError => e
136
- pp err_msg = "Error #{e} at book: #{biblionet_id}"
137
- log.error(err_msg)
136
+ pp err_msg = "Error #{e} at book: #{biblionet_id}"
137
+ log.error(err_msg)
138
138
  end
139
139
 
140
- book_hash[:title] = page.title
141
- book_hash[:subtitle] = page.subtitle
142
- book_hash[:image] = img
143
-
140
+ book_hash[:title] = page.title
141
+ book_hash[:subtitle] = page.subtitle
142
+ book_hash[:image] = img
143
+
144
144
  contributors = proccess_contributors(page.contributors)
145
145
 
146
146
  author = contributors[:author]
147
147
  contributors.delete(:author)
148
-
148
+
149
149
  # If author is empty, maybe its a collective work.
150
150
  if author.nil? or author.empty?
151
- if page.collective_work?
151
+ if page.collective_work?
152
152
  # author = 'Συλλογικό έργο'
153
153
  author = ['Συλλογικό έργο']
154
- else
155
- pp err_msg = "No author has been found at book: #{biblionet_id}"
156
- log.warn(err_msg)
157
- author = []
154
+ else
155
+ pp err_msg = "No author has been found at book: #{biblionet_id}"
156
+ log.warn(err_msg)
157
+ author = []
158
158
  end
159
159
  end
160
160
 
161
161
  book_hash[:author] = author
162
- book_hash[:contributors] = contributors
162
+ book_hash[:contributors] = contributors
163
163
  book_hash[:publisher] = page.publisher
164
164
 
165
165
  details = page.details
166
166
  if details.nil?
167
167
  pp err_msg = "No details at book: #{biblionet_id}"
168
- log.error(err_msg)
169
- end
168
+ log.error(err_msg)
169
+ end
170
170
 
171
171
  details_hash = proccess_details(details)
172
172
 
173
173
  # book_hash[:publication_year] = details_hash[:publication_year]
174
174
  # book_hash[:pages] = details_hash[:pages]
175
175
  book_hash[:isbn] = details_hash[:isbn]
176
-
176
+
177
177
  if details_hash[:isbn_13].nil?
178
178
  if present?(details_hash[:isbn]) and (details_hash[:isbn].strip.gsub('-','').length == 13)
179
179
  book_hash[:isbn_13] = book_hash[:isbn]
@@ -192,10 +192,10 @@ module Biblionet
192
192
 
193
193
  book_hash[:description] = page.description
194
194
 
195
- ddcs = page.ddcs.map do |ddc|
195
+ ddcs = page.ddcs.map do |ddc|
196
196
  # Extract from href the ddc id used by biblionet. --- DdC url http://biblionet.gr/index/id ---
197
197
  ddc_biblionet_id = ddc[:href].split(/\//).last
198
- # Extact DdC id and DdC text.
198
+ # Extact DdC id and DdC text.
199
199
  ddc = proccess_ddc(ddc.text)
200
200
 
201
201
  ddc.merge!(b_id: ddc_biblionet_id)
@@ -217,21 +217,21 @@ module Biblionet
217
217
  # uri = "http://www.biblionet.gr/main.asp?page=results&Titlesid=#{biblionet_id}"
218
218
 
219
219
  bibliographical_book_extractor = Biblionet::Extractors::BibliographicalBookExtractor.new
220
- bibliographical_details = bibliographical_book_extractor.load_and_extract_book(uri)
220
+ bibliographical_details = bibliographical_book_extractor.load_and_extract_book(uri)
221
221
 
222
222
  book_hash[:publisher] = bibliographical_details[:publisher]
223
- book_hash[:publication] = bibliographical_details[:publication]
223
+ book_hash[:publication] = bibliographical_details[:publication]
224
224
 
225
- book_hash[:format] = bibliographical_details[:format]
225
+ book_hash[:format] = bibliographical_details[:format]
226
226
 
227
227
  book_hash[:original_language] = bibliographical_details[:original_language]
228
228
  book_hash[:original_title] = bibliographical_details[:original_title]
229
229
 
230
- book_hash[:price] = bibliographical_details[:price]
230
+ book_hash[:price] = bibliographical_details[:price]
231
231
  book_hash[:availability] = bibliographical_details[:availability]
232
232
  book_hash[:last_update] = bibliographical_details[:last_update]
233
-
234
- book_hash[:series] = bibliographical_details[:series]
233
+
234
+ book_hash[:series] = bibliographical_details[:series]
235
235
 
236
236
  physical_description_hash = {}
237
237
  physical_description_hash[:pages] = details_hash[:pages]
@@ -239,10 +239,10 @@ module Biblionet
239
239
  physical_description_hash[:cover_type] = bibliographical_details[:cover_type]
240
240
 
241
241
  book_hash[:physical_description] = physical_description_hash
242
-
243
242
 
244
- return @book = book_hash
245
- end
243
+
244
+ return @book = book_hash
245
+ end
246
246
  end
247
247
 
248
248
  class BookDataExtractor
@@ -255,25 +255,25 @@ module Biblionet
255
255
  puts document
256
256
  end
257
257
  content = content_re.match(document)[0] unless (content_re.match(document)).nil?
258
-
258
+
259
259
  # If content is nil, there is something wrong with the html, so return nil
260
260
  if content.nil?
261
261
  @nodeset = nil
262
262
  else
263
- @nodeset = Nokogiri::HTML(content)
264
- end
263
+ @nodeset = Nokogiri::HTML(content)
264
+ end
265
265
  end
266
266
 
267
267
  def image
268
268
  img_node = nil
269
269
  img_nodes = @nodeset.xpath("/html/body//img").each do |i|
270
- img_candidate = i.xpath("//img[@src[contains(.,'/covers/')]][1]")
271
- img_node = img_candidate unless img_candidate.nil? or img_candidate.empty?
272
- end
270
+ img_candidate = i.xpath("//img[@src[contains(.,'/covers/')]][1]")
271
+ img_node = img_candidate unless img_candidate.nil? or img_candidate.empty?
272
+ end
273
273
 
274
- img = img_node.nil? ? nil : BASE_URL+(img_node.first)[:src]
274
+ img = img_node.nil? ? nil : BASE_URL+(img_node.first)[:src]
275
275
 
276
- return img
276
+ return img
277
277
  end
278
278
 
279
279
  def title
@@ -292,7 +292,7 @@ module Biblionet
292
292
 
293
293
  def publisher
294
294
  publisher_hash = {}
295
- @nodeset.xpath("//a[@class='booklink' and @href[contains(.,'/com/') ]]").each do |item|
295
+ @nodeset.xpath("//a[@class='booklink' and @href[contains(.,'/com/') ]]").each do |item|
296
296
  publisher_hash[:name] = item.text
297
297
  publisher_hash[:b_id] = (item[:href].split("/"))[2]
298
298
  end
@@ -301,12 +301,12 @@ module Biblionet
301
301
 
302
302
  def contributors
303
303
  contributors = []
304
- @nodeset.xpath("//a[@class='booklink' and @href[contains(.,'/author/') ]]").each do |item|
305
- pre_text = item.previous.text.strip
304
+ @nodeset.xpath("//a[@class='booklink' and @href[contains(.,'/author/') ]]").each do |item|
305
+ pre_text = item.previous.text.strip
306
306
  contributors << pre_text unless pre_text == ',' or !pre_text.end_with? ':'
307
307
  contributor = {}
308
- contributor[:name] = item.text
309
- contributor[:b_id] = (item[:href].split("/"))[2]
308
+ contributor[:name] = item.text
309
+ contributor[:b_id] = (item[:href].split("/"))[2]
310
310
  contributors << contributor
311
311
  end
312
312
  # Alternative way based on intersecting sets
@@ -315,19 +315,19 @@ module Biblionet
315
315
 
316
316
  # others = book.xpath("#{set_A}[count(.|#{set_B}) = count(#{set_B})]").map do |other|
317
317
  # text = other.inner_text.strip
318
- # other = text == "," ? nil : text
319
- # end.compact
318
+ # other = text == "," ? nil : text
319
+ # end.compact
320
320
  contributors
321
- end
321
+ end
322
322
 
323
323
  def details
324
324
  details = @nodeset.css('.book_details')[0].inner_html.gsub(/(^\d,\d)|(\D,|,\D)(?=[^\]]*(?:\[|$))/, "<br>").split("<br>").map(&:strip).reject!(&:empty?)
325
325
  if details.nil?
326
- details = @nodeset.css('.book_details')[1].inner_html.gsub(/(^\d,\d)|(\D,|,\D)(?=[^\]]*(?:\[|$))/, "<br>").split("<br>").map(&:strip).reject!(&:empty?)
326
+ details = @nodeset.css('.book_details')[1].inner_html.gsub(/(^\d,\d)|(\D,|,\D)(?=[^\]]*(?:\[|$))/, "<br>").split("<br>").map(&:strip).reject!(&:empty?)
327
327
  end
328
328
 
329
- return details
330
- end
329
+ return details
330
+ end
331
331
 
332
332
  def description
333
333
  desc = @nodeset.css('p').last.inner_html #.to_s.gsub(/<br>/,'\\n')
@@ -338,51 +338,51 @@ module Biblionet
338
338
  else
339
339
  return desc
340
340
  end
341
- end
341
+ end
342
342
 
343
343
  def ddcs
344
344
  @nodeset.xpath("//a[@class='subjectlink' and @href[contains(.,'/index/') ]]")
345
- end
345
+ end
346
346
 
347
347
  def collective_work?
348
348
  return @nodeset.at_css('h1.book_title').parent.text.include?('Συλλογικό έργο') ? true : false
349
- end
349
+ end
350
350
 
351
351
  # Special case in which there is no author but there are contributors
352
352
  def has_contributors_but_no_authors?
353
353
  node_start = "//h1[@class='book_title']/following::text()"
354
354
  node_end = "//a[@class='booklink' and @href[contains(.,'/author/') ]][1]/preceding::text()"
355
355
  between = (@nodeset.xpath(node_start) & @nodeset.xpath(node_end)).text.strip
356
-
357
- if !between.empty? and between.end_with? ':'
356
+
357
+ if !between.empty? and between.end_with? ':'
358
358
  true
359
359
  else
360
360
  false
361
361
  end
362
- end
362
+ end
363
363
 
364
364
  def awards
365
- awards = []
365
+ awards = []
366
366
  @nodeset.xpath("//a[@class='booklink' and @href[contains(.,'page=showaward') ]]").each do |item|
367
- award = {name: item.text, year: item.next_sibling.text.strip.gsub(/[^\d]/, '')}
367
+ award = {name: item.text, year: item.next_sibling.text.strip.gsub(/[^\d]/, '')}
368
368
  awards << award
369
369
  end
370
-
370
+
371
371
  return awards
372
- end
372
+ end
373
373
 
374
374
  end
375
375
 
376
376
  # Raised when a book has no image.
377
377
  #
378
- class NoImageError < StandardError
378
+ class NoImageError < StandardError
379
379
  attr_reader :biblionet_id
380
380
 
381
- def initialize(biblionet_id)
381
+ def initialize(biblionet_id)
382
382
  msg = "This book has no image. At book #{biblionet_id}"
383
383
  super(msg)
384
- end
385
- end
384
+ end
385
+ end
386
386
 
387
387
  end
388
388
  end
@@ -454,10 +454,10 @@ end
454
454
  # book_hash['description'] = desc
455
455
  # end
456
456
 
457
- # ddcs = page.xpath("//a[@class='subjectlink' and @href[contains(.,'/index/') ]]").map do |ddc|
457
+ # ddcs = page.xpath("//a[@class='subjectlink' and @href[contains(.,'/index/') ]]").map do |ddc|
458
458
  # # Extract from href the ddc id used by biblionet. --- DdC url http://biblionet.gr/index/id ---
459
459
  # ddc_biblionet_id = ddc[:href].split(/\//).last
460
- # # Extact DdC id and DdC text.
460
+ # # Extact DdC id and DdC text.
461
461
  # ddc = proccess_ddc(ddc.text)
462
462
 
463
463
  # ddc.merge!(b_id: ddc_biblionet_id)
@@ -477,17 +477,17 @@ end
477
477
  # end
478
478
 
479
479
  # def contributors(n)
480
- # contributors = []
481
- # n.xpath("//a[@class='booklink' and @href[contains(.,'/author/') ]]").each do |item|
482
- # pre_text = item.previous.text.strip
480
+ # contributors = []
481
+ # n.xpath("//a[@class='booklink' and @href[contains(.,'/author/') ]]").each do |item|
482
+ # pre_text = item.previous.text.strip
483
483
  # contributors << pre_text unless pre_text == ',' or !pre_text.end_with? ':'
484
484
  # contributor = {}
485
- # contributor['name'] = item.text
486
- # contributor['b_id'] = (item[:href].split("/"))[2]
485
+ # contributor['name'] = item.text
486
+ # contributor['b_id'] = (item[:href].split("/"))[2]
487
487
  # contributors << contributor
488
- # end
488
+ # end
489
489
  # contributors
490
- # end
490
+ # end
491
491
 
492
492
  # c = contributors(n4)
493
493
 
@@ -502,9 +502,9 @@ end
502
502
  # else
503
503
  # partners << cb
504
504
  # contributors[job] = partners.clone
505
- # end
505
+ # end
506
506
  # end unless raw_contributors.nil? or raw_contributors.empty?
507
-
507
+
508
508
  # return contributors
509
509
  # end
510
510
 
@@ -1,3 +1,3 @@
1
1
  module Bookshark
2
- VERSION = "1.0.4"
2
+ VERSION = "1.0.6"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: bookshark
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.4
4
+ version: 1.0.6
5
5
  platform: ruby
6
6
  authors:
7
7
  - Dimitris Klisiaris
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2017-02-28 00:00:00.000000000 Z
11
+ date: 2017-05-01 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: nokogiri