bookshark 1.0.4 → 1.0.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 3c55195dacb83de8db6a4e40bc79d042abd3dbca
4
- data.tar.gz: 07eb3cc3e019b52bea3a26e03fea7957d5a5ea8a
3
+ metadata.gz: 140903206c1308ba6eda3c1c1ee6a910be0a28b3
4
+ data.tar.gz: d1891f2c7ba96340201e5bd2936a07ca83032e4b
5
5
  SHA512:
6
- metadata.gz: 9754f7035b8d85724f7ba0cd1cff7a471f454feca7f3d03759a5f18a0d7d9902ab67498dec57021d9d118203ffc8dad12f32b6dcf4dee8f392813221b61cbe21
7
- data.tar.gz: ac70590c2eb3424e13105c9aa410505daef18f51e778252c8aa39ddc9a358560d469633521ef315a91479299efb7124f5bb648e864614cf6ba2ffc457e2a6cbb
6
+ metadata.gz: 1a0fa064e9fc157ad837682e520d5f287546ebb63631887f0108892f0d12118ba2ec3f0cf20e7a0def18a02e8213958c87963a7cbbfb9f05548ef57d11d8fc12
7
+ data.tar.gz: a51c475919cb2c55c5be0b1bda1765edfb9cc20ca1b68ea75fa9eefbfcc576d141ba6c1f6bd2744da3e0cb63dbc8de23a303dfecd0587dbc8ab62e8c9ab4221f
@@ -116,6 +116,20 @@ module Biblionet
116
116
 
117
117
  end
118
118
 
119
+ # Splits availability with date like "Υπό Έκδοση 4/2017" to availablity and last update
120
+ unless details_hash[:availability].nil?
121
+ availability = details_hash[:availability]
122
+ details_hash[:availability] = "Υπό Έκδοση" if availability.include? "Υπό Έκδοση"
123
+
124
+ if details_hash[:last_update].nil?
125
+ if availability =~ /(\d{1,2}\/)?\d{1,2}\/\d{2,}/
126
+ last_update = availability.match(/(\d{1,2}\/)?\d{1,2}\/\d{2,}/)[0]
127
+ last_update = "10/" + last_update if last_update.split('/').length == 2
128
+ details_hash[:last_update] = last_update
129
+ end
130
+ end
131
+ end
132
+
119
133
  pre_details_text = @nodeset.xpath("//span[@class='small'][1]/preceding::text()").text
120
134
  pre_details_text = BibliographicalBookExtractor.decode_text(pre_details_text)
121
135
 
@@ -6,22 +6,22 @@ require_relative 'bibliographical_book_extractor'
6
6
  require 'sanitize'
7
7
 
8
8
  module Biblionet
9
- module Extractors
10
-
11
- class BookExtractor < Base
9
+ module Extractors
10
+
11
+ class BookExtractor < Base
12
12
  attr_reader :book
13
13
 
14
14
  def initialize(uri=nil)
15
- super(uri)
16
- extract_book unless uri.nil? or @page.nil?
15
+ super(uri)
16
+ extract_book unless uri.nil? or @page.nil?
17
17
  end
18
18
 
19
19
  def load_and_extract_book(uri=nil)
20
20
  load_page(uri)
21
21
  extract_book unless uri.nil? or @page.nil?
22
- end
22
+ end
23
23
 
24
- # Converts the parsed contributors string to hash.
24
+ # Converts the parsed contributors string to hash.
25
25
  # String must have been processed into the following form:
26
26
  # job1: contributor1, contributor2 job2: contributor3
27
27
  # The returned hash is in form: {job1 => ["contributor1","contributor2"],job2 => ["contributor3"]}
@@ -36,18 +36,18 @@ module Biblionet
36
36
  else
37
37
  partners << cb
38
38
  contributors[job] = partners.clone
39
- end
39
+ end
40
40
  end unless raw_contributors.nil? or raw_contributors.empty?
41
-
41
+
42
42
  return contributors
43
43
  end
44
44
 
45
45
  def proccess_details(details)
46
46
  details_hash = Hash.new
47
-
48
- details.each do |detail|
47
+
48
+ details.each do |detail|
49
49
  date_regex = /(^\d{4}$)/
50
- status_regex = /^\[\p{Word}+(?:\s*[\'\-\+\s]\s*\p{Word}+)*\]$/
50
+ status_regex = /^\[\p{Word}+(?:\s*[\'\-\+\s]\s*\p{Word}+)*\]$/
51
51
  detail = decode_text(detail)
52
52
 
53
53
  begin
@@ -59,11 +59,11 @@ module Biblionet
59
59
  #puts "Pages: #{pages}"
60
60
  details_hash[:pages] = pages
61
61
  elsif detail.start_with? "ISBN-13"
62
- isbn_13 = detail.gsub(/ISBN-13 /, "")
62
+ isbn_13 = detail.gsub(/ISBN-13 /, "").gsub("&Chi","X")
63
63
  details_hash[:isbn_13] = isbn_13
64
- #puts "ISBN: #{isbn_13}"
64
+ #puts "ISBN: #{isbn_13}"
65
65
  elsif detail.start_with? "ISBN"
66
- isbn = detail.gsub(/ISBN /, "")
66
+ isbn = detail.gsub(/ISBN /, "").gsub("&Chi","X")
67
67
  #puts "ISBN: #{isbn}"
68
68
  details_hash[:isbn] = isbn
69
69
  elsif detail =~ status_regex
@@ -81,12 +81,12 @@ module Biblionet
81
81
  elsif detail.start_with? "ISMN" #Special typo case
82
82
  isbn = detail.gsub(/ISMN /, "")
83
83
  #puts "ISBN: #{isbn}"
84
- details_hash[:isbn] = isbn
85
- else
84
+ details_hash[:isbn] = isbn
85
+ else
86
86
  raise NoIdeaWhatThisIsError.new(@biblionet_id, detail)
87
87
  end
88
88
  rescue NoIdeaWhatThisIsError => e
89
- pp e
89
+ pp e
90
90
  end
91
91
  end
92
92
 
@@ -94,86 +94,86 @@ module Biblionet
94
94
  end
95
95
 
96
96
  def proccess_ddc(ddc, extract_parents = false)
97
- # Matches only the digits inside [] in text like: [889.09300] Νεοελληνική λογοτεχνία - Ιστορία και κριτική (300)
97
+ # Matches only the digits inside [] in text like: [889.09300] Νεοελληνική λογοτεχνία - Ιστορία και κριτική (300)
98
98
  id_re = /(\[DDC\:\s\d*(?:[\.|\s]\d*)*\])/
99
99
 
100
- # Matches [digits] and (digits) in text like: [889.09300] Νεοελληνική λογοτεχνία - Ιστορία και κριτική (300)
100
+ # Matches [digits] and (digits) in text like: [889.09300] Νεοελληνική λογοτεχνία - Ιστορία και κριτική (300)
101
101
  non_text_re = /\s*(\[.*\]|\(.*\))\s*/
102
-
103
- # Gets the dcc part from text and removes anything but digits in [DDC: digits].
104
- ddc_id = ddc.scan(id_re).join.gsub(/[\[\]DDC: ]/, '') # Gets the dcc part from text.
102
+
103
+ # Gets the dcc part from text and removes anything but digits in [DDC: digits].
104
+ ddc_id = ddc.scan(id_re).join.gsub(/[\[\]DDC: ]/, '') # Gets the dcc part from text.
105
105
 
106
106
  # Extracts the parent tree of current ddc.
107
- # ddcparser.parse(ddc_id)
107
+ # ddcparser.parse(ddc_id)
108
108
 
109
109
  # Gets text by reomoving anything but text.
110
110
  ddc_text = ddc.gsub(non_text_re, '').strip
111
111
 
112
- ddc_hash = { ddc: ddc_id, name: ddc_text }
112
+ ddc_hash = { ddc: ddc_id, name: ddc_text }
113
113
  return ddc_hash
114
- end
114
+ end
115
115
 
116
116
 
117
- def extract_book(biblionet_id=@biblionet_id, book_page=@page)
117
+ def extract_book(biblionet_id=@biblionet_id, book_page=@page)
118
118
  # log = Logger.new(File.new(File.dirname(__dir__).to_s + "/logs/book_parsing.log",'a+'))
119
119
  log = Logger.new(STDOUT)
120
-
120
+
121
121
  page = BookDataExtractor.new(book_page)
122
122
 
123
123
  # End extraction if BookDataExtractor couldnt create a nodeset
124
124
  return nil if page.nodeset.nil?
125
125
 
126
126
 
127
- book_hash = Hash.new
127
+ book_hash = Hash.new
128
128
 
129
- begin
130
- img = page.image
129
+ begin
130
+ img = page.image
131
131
  raise NoImageError.new(biblionet_id) if img.nil?
132
132
  rescue NoImageError => e
133
- pp e
134
- log.warn(e.message)
133
+ pp e
134
+ log.warn(e.message)
135
135
  rescue StandardError => e
136
- pp err_msg = "Error #{e} at book: #{biblionet_id}"
137
- log.error(err_msg)
136
+ pp err_msg = "Error #{e} at book: #{biblionet_id}"
137
+ log.error(err_msg)
138
138
  end
139
139
 
140
- book_hash[:title] = page.title
141
- book_hash[:subtitle] = page.subtitle
142
- book_hash[:image] = img
143
-
140
+ book_hash[:title] = page.title
141
+ book_hash[:subtitle] = page.subtitle
142
+ book_hash[:image] = img
143
+
144
144
  contributors = proccess_contributors(page.contributors)
145
145
 
146
146
  author = contributors[:author]
147
147
  contributors.delete(:author)
148
-
148
+
149
149
  # If author is empty, maybe its a collective work.
150
150
  if author.nil? or author.empty?
151
- if page.collective_work?
151
+ if page.collective_work?
152
152
  # author = 'Συλλογικό έργο'
153
153
  author = ['Συλλογικό έργο']
154
- else
155
- pp err_msg = "No author has been found at book: #{biblionet_id}"
156
- log.warn(err_msg)
157
- author = []
154
+ else
155
+ pp err_msg = "No author has been found at book: #{biblionet_id}"
156
+ log.warn(err_msg)
157
+ author = []
158
158
  end
159
159
  end
160
160
 
161
161
  book_hash[:author] = author
162
- book_hash[:contributors] = contributors
162
+ book_hash[:contributors] = contributors
163
163
  book_hash[:publisher] = page.publisher
164
164
 
165
165
  details = page.details
166
166
  if details.nil?
167
167
  pp err_msg = "No details at book: #{biblionet_id}"
168
- log.error(err_msg)
169
- end
168
+ log.error(err_msg)
169
+ end
170
170
 
171
171
  details_hash = proccess_details(details)
172
172
 
173
173
  # book_hash[:publication_year] = details_hash[:publication_year]
174
174
  # book_hash[:pages] = details_hash[:pages]
175
175
  book_hash[:isbn] = details_hash[:isbn]
176
-
176
+
177
177
  if details_hash[:isbn_13].nil?
178
178
  if present?(details_hash[:isbn]) and (details_hash[:isbn].strip.gsub('-','').length == 13)
179
179
  book_hash[:isbn_13] = book_hash[:isbn]
@@ -192,10 +192,10 @@ module Biblionet
192
192
 
193
193
  book_hash[:description] = page.description
194
194
 
195
- ddcs = page.ddcs.map do |ddc|
195
+ ddcs = page.ddcs.map do |ddc|
196
196
  # Extract from href the ddc id used by biblionet. --- DdC url http://biblionet.gr/index/id ---
197
197
  ddc_biblionet_id = ddc[:href].split(/\//).last
198
- # Extact DdC id and DdC text.
198
+ # Extact DdC id and DdC text.
199
199
  ddc = proccess_ddc(ddc.text)
200
200
 
201
201
  ddc.merge!(b_id: ddc_biblionet_id)
@@ -217,21 +217,21 @@ module Biblionet
217
217
  # uri = "http://www.biblionet.gr/main.asp?page=results&Titlesid=#{biblionet_id}"
218
218
 
219
219
  bibliographical_book_extractor = Biblionet::Extractors::BibliographicalBookExtractor.new
220
- bibliographical_details = bibliographical_book_extractor.load_and_extract_book(uri)
220
+ bibliographical_details = bibliographical_book_extractor.load_and_extract_book(uri)
221
221
 
222
222
  book_hash[:publisher] = bibliographical_details[:publisher]
223
- book_hash[:publication] = bibliographical_details[:publication]
223
+ book_hash[:publication] = bibliographical_details[:publication]
224
224
 
225
- book_hash[:format] = bibliographical_details[:format]
225
+ book_hash[:format] = bibliographical_details[:format]
226
226
 
227
227
  book_hash[:original_language] = bibliographical_details[:original_language]
228
228
  book_hash[:original_title] = bibliographical_details[:original_title]
229
229
 
230
- book_hash[:price] = bibliographical_details[:price]
230
+ book_hash[:price] = bibliographical_details[:price]
231
231
  book_hash[:availability] = bibliographical_details[:availability]
232
232
  book_hash[:last_update] = bibliographical_details[:last_update]
233
-
234
- book_hash[:series] = bibliographical_details[:series]
233
+
234
+ book_hash[:series] = bibliographical_details[:series]
235
235
 
236
236
  physical_description_hash = {}
237
237
  physical_description_hash[:pages] = details_hash[:pages]
@@ -239,10 +239,10 @@ module Biblionet
239
239
  physical_description_hash[:cover_type] = bibliographical_details[:cover_type]
240
240
 
241
241
  book_hash[:physical_description] = physical_description_hash
242
-
243
242
 
244
- return @book = book_hash
245
- end
243
+
244
+ return @book = book_hash
245
+ end
246
246
  end
247
247
 
248
248
  class BookDataExtractor
@@ -255,25 +255,25 @@ module Biblionet
255
255
  puts document
256
256
  end
257
257
  content = content_re.match(document)[0] unless (content_re.match(document)).nil?
258
-
258
+
259
259
  # If content is nil, there is something wrong with the html, so return nil
260
260
  if content.nil?
261
261
  @nodeset = nil
262
262
  else
263
- @nodeset = Nokogiri::HTML(content)
264
- end
263
+ @nodeset = Nokogiri::HTML(content)
264
+ end
265
265
  end
266
266
 
267
267
  def image
268
268
  img_node = nil
269
269
  img_nodes = @nodeset.xpath("/html/body//img").each do |i|
270
- img_candidate = i.xpath("//img[@src[contains(.,'/covers/')]][1]")
271
- img_node = img_candidate unless img_candidate.nil? or img_candidate.empty?
272
- end
270
+ img_candidate = i.xpath("//img[@src[contains(.,'/covers/')]][1]")
271
+ img_node = img_candidate unless img_candidate.nil? or img_candidate.empty?
272
+ end
273
273
 
274
- img = img_node.nil? ? nil : BASE_URL+(img_node.first)[:src]
274
+ img = img_node.nil? ? nil : BASE_URL+(img_node.first)[:src]
275
275
 
276
- return img
276
+ return img
277
277
  end
278
278
 
279
279
  def title
@@ -292,7 +292,7 @@ module Biblionet
292
292
 
293
293
  def publisher
294
294
  publisher_hash = {}
295
- @nodeset.xpath("//a[@class='booklink' and @href[contains(.,'/com/') ]]").each do |item|
295
+ @nodeset.xpath("//a[@class='booklink' and @href[contains(.,'/com/') ]]").each do |item|
296
296
  publisher_hash[:name] = item.text
297
297
  publisher_hash[:b_id] = (item[:href].split("/"))[2]
298
298
  end
@@ -301,12 +301,12 @@ module Biblionet
301
301
 
302
302
  def contributors
303
303
  contributors = []
304
- @nodeset.xpath("//a[@class='booklink' and @href[contains(.,'/author/') ]]").each do |item|
305
- pre_text = item.previous.text.strip
304
+ @nodeset.xpath("//a[@class='booklink' and @href[contains(.,'/author/') ]]").each do |item|
305
+ pre_text = item.previous.text.strip
306
306
  contributors << pre_text unless pre_text == ',' or !pre_text.end_with? ':'
307
307
  contributor = {}
308
- contributor[:name] = item.text
309
- contributor[:b_id] = (item[:href].split("/"))[2]
308
+ contributor[:name] = item.text
309
+ contributor[:b_id] = (item[:href].split("/"))[2]
310
310
  contributors << contributor
311
311
  end
312
312
  # Alternative way based on intersecting sets
@@ -315,19 +315,19 @@ module Biblionet
315
315
 
316
316
  # others = book.xpath("#{set_A}[count(.|#{set_B}) = count(#{set_B})]").map do |other|
317
317
  # text = other.inner_text.strip
318
- # other = text == "," ? nil : text
319
- # end.compact
318
+ # other = text == "," ? nil : text
319
+ # end.compact
320
320
  contributors
321
- end
321
+ end
322
322
 
323
323
  def details
324
324
  details = @nodeset.css('.book_details')[0].inner_html.gsub(/(^\d,\d)|(\D,|,\D)(?=[^\]]*(?:\[|$))/, "<br>").split("<br>").map(&:strip).reject!(&:empty?)
325
325
  if details.nil?
326
- details = @nodeset.css('.book_details')[1].inner_html.gsub(/(^\d,\d)|(\D,|,\D)(?=[^\]]*(?:\[|$))/, "<br>").split("<br>").map(&:strip).reject!(&:empty?)
326
+ details = @nodeset.css('.book_details')[1].inner_html.gsub(/(^\d,\d)|(\D,|,\D)(?=[^\]]*(?:\[|$))/, "<br>").split("<br>").map(&:strip).reject!(&:empty?)
327
327
  end
328
328
 
329
- return details
330
- end
329
+ return details
330
+ end
331
331
 
332
332
  def description
333
333
  desc = @nodeset.css('p').last.inner_html #.to_s.gsub(/<br>/,'\\n')
@@ -338,51 +338,51 @@ module Biblionet
338
338
  else
339
339
  return desc
340
340
  end
341
- end
341
+ end
342
342
 
343
343
  def ddcs
344
344
  @nodeset.xpath("//a[@class='subjectlink' and @href[contains(.,'/index/') ]]")
345
- end
345
+ end
346
346
 
347
347
  def collective_work?
348
348
  return @nodeset.at_css('h1.book_title').parent.text.include?('Συλλογικό έργο') ? true : false
349
- end
349
+ end
350
350
 
351
351
  # Special case in which there is no author but there are contributors
352
352
  def has_contributors_but_no_authors?
353
353
  node_start = "//h1[@class='book_title']/following::text()"
354
354
  node_end = "//a[@class='booklink' and @href[contains(.,'/author/') ]][1]/preceding::text()"
355
355
  between = (@nodeset.xpath(node_start) & @nodeset.xpath(node_end)).text.strip
356
-
357
- if !between.empty? and between.end_with? ':'
356
+
357
+ if !between.empty? and between.end_with? ':'
358
358
  true
359
359
  else
360
360
  false
361
361
  end
362
- end
362
+ end
363
363
 
364
364
  def awards
365
- awards = []
365
+ awards = []
366
366
  @nodeset.xpath("//a[@class='booklink' and @href[contains(.,'page=showaward') ]]").each do |item|
367
- award = {name: item.text, year: item.next_sibling.text.strip.gsub(/[^\d]/, '')}
367
+ award = {name: item.text, year: item.next_sibling.text.strip.gsub(/[^\d]/, '')}
368
368
  awards << award
369
369
  end
370
-
370
+
371
371
  return awards
372
- end
372
+ end
373
373
 
374
374
  end
375
375
 
376
376
  # Raised when a book has no image.
377
377
  #
378
- class NoImageError < StandardError
378
+ class NoImageError < StandardError
379
379
  attr_reader :biblionet_id
380
380
 
381
- def initialize(biblionet_id)
381
+ def initialize(biblionet_id)
382
382
  msg = "This book has no image. At book #{biblionet_id}"
383
383
  super(msg)
384
- end
385
- end
384
+ end
385
+ end
386
386
 
387
387
  end
388
388
  end
@@ -454,10 +454,10 @@ end
454
454
  # book_hash['description'] = desc
455
455
  # end
456
456
 
457
- # ddcs = page.xpath("//a[@class='subjectlink' and @href[contains(.,'/index/') ]]").map do |ddc|
457
+ # ddcs = page.xpath("//a[@class='subjectlink' and @href[contains(.,'/index/') ]]").map do |ddc|
458
458
  # # Extract from href the ddc id used by biblionet. --- DdC url http://biblionet.gr/index/id ---
459
459
  # ddc_biblionet_id = ddc[:href].split(/\//).last
460
- # # Extact DdC id and DdC text.
460
+ # # Extact DdC id and DdC text.
461
461
  # ddc = proccess_ddc(ddc.text)
462
462
 
463
463
  # ddc.merge!(b_id: ddc_biblionet_id)
@@ -477,17 +477,17 @@ end
477
477
  # end
478
478
 
479
479
  # def contributors(n)
480
- # contributors = []
481
- # n.xpath("//a[@class='booklink' and @href[contains(.,'/author/') ]]").each do |item|
482
- # pre_text = item.previous.text.strip
480
+ # contributors = []
481
+ # n.xpath("//a[@class='booklink' and @href[contains(.,'/author/') ]]").each do |item|
482
+ # pre_text = item.previous.text.strip
483
483
  # contributors << pre_text unless pre_text == ',' or !pre_text.end_with? ':'
484
484
  # contributor = {}
485
- # contributor['name'] = item.text
486
- # contributor['b_id'] = (item[:href].split("/"))[2]
485
+ # contributor['name'] = item.text
486
+ # contributor['b_id'] = (item[:href].split("/"))[2]
487
487
  # contributors << contributor
488
- # end
488
+ # end
489
489
  # contributors
490
- # end
490
+ # end
491
491
 
492
492
  # c = contributors(n4)
493
493
 
@@ -502,9 +502,9 @@ end
502
502
  # else
503
503
  # partners << cb
504
504
  # contributors[job] = partners.clone
505
- # end
505
+ # end
506
506
  # end unless raw_contributors.nil? or raw_contributors.empty?
507
-
507
+
508
508
  # return contributors
509
509
  # end
510
510
 
@@ -1,3 +1,3 @@
1
1
  module Bookshark
2
- VERSION = "1.0.4"
2
+ VERSION = "1.0.6"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: bookshark
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.4
4
+ version: 1.0.6
5
5
  platform: ruby
6
6
  authors:
7
7
  - Dimitris Klisiaris
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2017-02-28 00:00:00.000000000 Z
11
+ date: 2017-05-01 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: nokogiri