bookshark 1.0.1 → 1.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bookshark.gemspec +2 -0
- data/lib/bookshark.rb +113 -93
- data/lib/bookshark/extractors/bibliographical_book_extractor.rb +10 -3
- data/lib/bookshark/extractors/nlg/base.rb +110 -0
- data/lib/bookshark/extractors/nlg/book_extractor.rb +28 -0
- data/lib/bookshark/version.rb +1 -1
- data/spec/test_data/eager_book_184923.json +3 -3
- data/spec/test_data/search_01.json +4 -4
- metadata +32 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: '03852f73c9246676ff20b75b0a893998e967f2c7'
|
4
|
+
data.tar.gz: 7fe938710c2e9344563395e5db7cf3e7eaf76b25
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 3ccd4cfd0e82aa6918304e82df18f382f2b7e33470c9f52ff362b4897aaf53b703a04bd2689871925293d7e084d3e0008c74aa289fb36a106be7dca20c01ee0f
|
7
|
+
data.tar.gz: fb4590ee8c1a24402f48b3502ddb77e24fbbde2c988a18fb62fe867762950f351d3271512773602bd9cec47d7e51a6cb94de1e2d695cf7c0d8704a39bc72f905
|
data/bookshark.gemspec
CHANGED
@@ -24,9 +24,11 @@ Gem::Specification.new do |spec|
|
|
24
24
|
spec.add_dependency "sanitize", "~> 4.0"
|
25
25
|
spec.add_dependency "json", "~> 1.8"
|
26
26
|
spec.add_dependency "htmlentities", "~> 4.3"
|
27
|
+
spec.add_dependency "marc", "~> 1.0"
|
27
28
|
|
28
29
|
spec.add_development_dependency "bundler", ">= 1.6"
|
29
30
|
spec.add_development_dependency "rake", "~> 10.0"
|
30
31
|
spec.add_development_dependency 'rspec', "~> 3.2"
|
31
32
|
spec.add_development_dependency "webmock", "~> 1.2"
|
33
|
+
spec.add_development_dependency "pry-byebug", "~> 3.4"
|
32
34
|
end
|
data/lib/bookshark.rb
CHANGED
@@ -7,6 +7,7 @@ require 'bookshark/extractors/book_extractor'
|
|
7
7
|
require 'bookshark/extractors/bibliographical_book_extractor'
|
8
8
|
require 'bookshark/extractors/publisher_extractor'
|
9
9
|
require 'bookshark/extractors/search'
|
10
|
+
require 'bookshark/extractors/nlg/book_extractor'
|
10
11
|
|
11
12
|
require 'bookshark/crawlers/base'
|
12
13
|
require 'bookshark/crawlers/publisher_crawler'
|
@@ -22,8 +23,8 @@ module Bookshark
|
|
22
23
|
def self.root
|
23
24
|
# File.dirname __dir__ # Works only on ruby > 2.0.0
|
24
25
|
File.expand_path(File.join(File.dirname(__FILE__), '../'))
|
25
|
-
end
|
26
|
-
|
26
|
+
end
|
27
|
+
|
27
28
|
def self.path_to_storage
|
28
29
|
File.join root, 'lib/bookshark/storage'
|
29
30
|
end
|
@@ -31,7 +32,7 @@ module Bookshark
|
|
31
32
|
|
32
33
|
class Extractor
|
33
34
|
include FileManager
|
34
|
-
attr_accessor :site, :format
|
35
|
+
attr_accessor :site, :format
|
35
36
|
|
36
37
|
def initialize(options = {})
|
37
38
|
options = DEFAULTS.merge(options)
|
@@ -44,9 +45,9 @@ module Bookshark
|
|
44
45
|
options[:format] ||= @format
|
45
46
|
|
46
47
|
author_extractor = Biblionet::Extractors::AuthorExtractor.new
|
47
|
-
author = author_extractor.load_and_extract_author(uri)
|
48
|
-
|
49
|
-
response = {}
|
48
|
+
author = author_extractor.load_and_extract_author(uri)
|
49
|
+
|
50
|
+
response = {}
|
50
51
|
response[:author] = !author.nil? ? [author] : []
|
51
52
|
response = change_format(response, options[:format])
|
52
53
|
return response
|
@@ -58,90 +59,109 @@ module Bookshark
|
|
58
59
|
|
59
60
|
publisher_extractor = Biblionet::Extractors::PublisherExtractor.new
|
60
61
|
publisher = publisher_extractor.load_and_extract_publisher(uri)
|
61
|
-
|
62
|
-
response = {}
|
62
|
+
|
63
|
+
response = {}
|
63
64
|
response[:publisher] = !publisher.nil? ? [publisher] : []
|
64
65
|
response = change_format(response, options[:format])
|
65
66
|
response = publisher_extractor.decode_text(response)
|
66
67
|
|
67
68
|
return response
|
68
|
-
# return uri
|
69
|
-
end
|
69
|
+
# return uri
|
70
|
+
end
|
70
71
|
|
71
72
|
def book(options = {})
|
72
|
-
|
73
|
-
|
74
|
-
if book_extractor.present?(options[:isbn])
|
75
|
-
search_engine = Biblionet::Extractors::Search.new
|
76
|
-
options[:id] = search_engine.search_by_isbn(options[:isbn])
|
77
|
-
end
|
73
|
+
options[:site] ||= @site
|
78
74
|
|
79
|
-
|
80
|
-
|
81
|
-
options[:eager] ||= false
|
82
|
-
options[:nilify] ||= false
|
83
|
-
|
84
|
-
if options[:eager]
|
85
|
-
book = eager_extract_book(uri)
|
86
|
-
else
|
87
|
-
book = book_extractor.load_and_extract_book(uri)
|
88
|
-
end
|
75
|
+
if options[:site] == 'biblionet'
|
76
|
+
book_extractor = Biblionet::Extractors::BookExtractor.new
|
89
77
|
|
90
|
-
|
91
|
-
|
78
|
+
if book_extractor.present?(options[:isbn])
|
79
|
+
search_engine = Biblionet::Extractors::Search.new
|
80
|
+
options[:id] = search_engine.search_by_isbn(options[:isbn])
|
81
|
+
end
|
92
82
|
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
83
|
+
uri = process_options(options, __method__)
|
84
|
+
options[:format] ||= @format
|
85
|
+
options[:eager] ||= false
|
86
|
+
options[:nilify] ||= false
|
87
|
+
|
88
|
+
if options[:eager]
|
89
|
+
book = eager_extract_book(uri)
|
90
|
+
else
|
91
|
+
book = book_extractor.load_and_extract_book(uri)
|
92
|
+
end
|
93
|
+
|
94
|
+
response = {}
|
95
|
+
response[:book] = !book.nil? ? [book] : []
|
96
|
+
|
97
|
+
return nil if response[:book].empty? and options[:nilify]
|
98
|
+
|
99
|
+
response = change_format(response, options[:format])
|
100
|
+
|
101
|
+
response = book_extractor.decode_text(response) if response.class == "String"
|
102
|
+
|
103
|
+
return response
|
104
|
+
elsif options[:site] == 'nlg'
|
105
|
+
book_extractor = Nlg::Extractors::BookExtractor.new
|
106
|
+
|
107
|
+
options[:format] ||= @format
|
108
|
+
|
109
|
+
# if !options[:uri].nil?
|
110
|
+
# uri = "#{options[:uri]}/Export?style=MARCXML"
|
111
|
+
# elsif !options[:id].nil?
|
112
|
+
# uri = "http://nbib.nlg.gr/Record/#{options[:id]}/Export?style=MARCXML"
|
113
|
+
# end
|
114
|
+
|
115
|
+
book = book_extractor.load_and_extract_book(options[:id])
|
116
|
+
|
117
|
+
response = {}
|
118
|
+
response[:book] = !book.nil? ? [book] : []
|
119
|
+
end
|
100
120
|
end
|
101
121
|
|
102
|
-
|
122
|
+
|
103
123
|
# def bibliographical_book(options = {})
|
104
124
|
# bibliographical_book_extractor = Biblionet::Extractors::BibliographicalBookExtractor.new
|
105
|
-
|
125
|
+
|
106
126
|
# uri = "http://www.biblionet.gr/main.asp?page=results&Titlesid=#{options[:id]}"
|
107
127
|
# options[:format] ||= @format
|
108
|
-
|
128
|
+
|
109
129
|
# book = bibliographical_book_extractor.load_and_extract_book(uri)
|
110
|
-
|
111
|
-
# response = {}
|
130
|
+
|
131
|
+
# response = {}
|
112
132
|
# response[:book] = !book.nil? ? [book] : []
|
113
133
|
# response = change_format(response, options[:format])
|
114
|
-
# response = bibliographical_book_extractor.decode_text(response)
|
115
|
-
# end
|
116
|
-
|
134
|
+
# response = bibliographical_book_extractor.decode_text(response)
|
135
|
+
# end
|
136
|
+
|
117
137
|
# puts Bookshark::Extractor.new(format: 'pretty_json').bibliographical_book(id: 103788)
|
118
138
|
|
119
139
|
def category(options = {})
|
120
140
|
uri = process_options(options, __method__)
|
121
|
-
options[:format] ||= @format
|
141
|
+
options[:format] ||= @format
|
122
142
|
|
123
143
|
category_extractor = Biblionet::Extractors::CategoryExtractor.new
|
124
144
|
category = category_extractor.extract_categories_from(uri)
|
125
145
|
|
126
|
-
response = {}
|
146
|
+
response = {}
|
127
147
|
response[:category] = !category.nil? ? [category] : []
|
128
148
|
response = change_format(response, options[:format])
|
129
|
-
|
130
|
-
return response
|
149
|
+
|
150
|
+
return response
|
131
151
|
end
|
132
152
|
|
133
153
|
def search(options = {})
|
134
154
|
options[:format] ||= @format
|
135
|
-
options[:results_type] ||= 'metadata'
|
155
|
+
options[:results_type] ||= 'metadata'
|
136
156
|
|
137
157
|
search_engine = Biblionet::Extractors::Search.new
|
138
158
|
search_results = search_engine.perform_search(options)
|
139
159
|
|
140
|
-
response = {}
|
160
|
+
response = {}
|
141
161
|
response[:book] = search_results
|
142
162
|
response = change_format(response, options[:format])
|
143
|
-
|
144
|
-
return response
|
163
|
+
|
164
|
+
return response
|
145
165
|
end
|
146
166
|
|
147
167
|
# def books_from_storage
|
@@ -165,22 +185,22 @@ module Bookshark
|
|
165
185
|
record = book(id: book_id, local: true, format: format, nilify: true)
|
166
186
|
|
167
187
|
dir_to_save = Bookshark.path_to_storage + '/' + 'json_book_records/' + "#{((book_id-1)/1000)}/" + "book_#{book_id}.json"
|
168
|
-
|
188
|
+
|
169
189
|
save_to(dir_to_save, record) unless record.nil?
|
170
190
|
end
|
171
191
|
end
|
172
192
|
|
173
193
|
|
174
|
-
def extract_from_storage_and_save(metadata_type, source_dir, target_dir)
|
194
|
+
def extract_from_storage_and_save(metadata_type, source_dir, target_dir)
|
175
195
|
list_directories(path: Bookshark.path_to_storage + '/' + source_dir).each do |dir|
|
176
|
-
dir_to_save = dir.gsub(source_dir, target_dir)
|
196
|
+
dir_to_save = dir.gsub(source_dir, target_dir)
|
177
197
|
|
178
198
|
list_files(path: dir, extension: 'html', all:true).each do |file|
|
179
|
-
puts "Extracting from file: " + file.to_s
|
199
|
+
puts "Extracting from file: " + file.to_s
|
180
200
|
|
181
201
|
# Extract publisher metadata form local file.
|
182
|
-
options = {uri: file, format: 'pretty_json', local: true}
|
183
|
-
|
202
|
+
options = {uri: file, format: 'pretty_json', local: true}
|
203
|
+
|
184
204
|
case metadata_type
|
185
205
|
when 'author'
|
186
206
|
record = author(options)
|
@@ -189,16 +209,16 @@ module Bookshark
|
|
189
209
|
# when 'book'
|
190
210
|
# record = book(options)
|
191
211
|
when 'category'
|
192
|
-
record = category(options)
|
193
|
-
end
|
212
|
+
record = category(options)
|
213
|
+
end
|
194
214
|
|
195
215
|
# Prepare a path to save the new file.
|
196
216
|
filename = File.basename(file,".*")
|
197
217
|
path_to_save = "#{dir_to_save}#{filename}.json"
|
198
|
-
|
199
|
-
# Save to file.
|
218
|
+
|
219
|
+
# Save to file.
|
200
220
|
save_to("#{path_to_save}", record)
|
201
|
-
|
221
|
+
|
202
222
|
end # unless File.directory?(dir_to_save) # if dir.end_with? '/195/'
|
203
223
|
end
|
204
224
|
end
|
@@ -209,9 +229,9 @@ module Bookshark
|
|
209
229
|
# end
|
210
230
|
category_extractor = Biblionet::Extractors::CategoryExtractor.new
|
211
231
|
all_categories = Hash.new
|
212
|
-
|
232
|
+
|
213
233
|
list_files(path: 'storage/raw_ddc_pages', extension: 'html', all:true).each do |file|
|
214
|
-
categories = category_extractor.extract_categories_from(file)
|
234
|
+
categories = category_extractor.extract_categories_from(file)
|
215
235
|
all_categories.merge!(categories) unless categories.nil? or categories.empty?
|
216
236
|
end
|
217
237
|
|
@@ -228,19 +248,19 @@ module Bookshark
|
|
228
248
|
|
229
249
|
list_directories(path: 'storage/raw_html_pages').each do |dir|
|
230
250
|
dir_to_save = dir.gsub(/raw_html_pages/, 'books')
|
231
|
-
|
232
|
-
list_files(path: dir, extension: 'html', all:true).each do |file|
|
233
|
-
|
251
|
+
|
252
|
+
list_files(path: dir, extension: 'html', all:true).each do |file|
|
253
|
+
|
234
254
|
# Load the book from html file and parse the data.
|
235
255
|
# pp "Parsing book: #{file}"
|
236
256
|
pp file
|
237
257
|
book = bp.load_and_extract_book(file)
|
238
|
-
|
258
|
+
|
239
259
|
# Prepare a path to save the new file.
|
240
260
|
filename = File.basename(file,".*")
|
241
261
|
path_to_save = "#{dir_to_save}#{filename}.json"
|
242
|
-
|
243
|
-
# Save to file.
|
262
|
+
|
263
|
+
# Save to file.
|
244
264
|
bp.save_to("#{path_to_save}", JSON.pretty_generate(book))
|
245
265
|
# pp "Book #{file} saved!"
|
246
266
|
end unless File.directory?(dir_to_save) # if dir.end_with? '/195/'
|
@@ -266,11 +286,11 @@ module Bookshark
|
|
266
286
|
url_method = 'book'
|
267
287
|
local_path = "html_book_pages/#{((id-1)/1000)}/book_#{id}.html"
|
268
288
|
when 'category'
|
269
|
-
url_method = 'index'
|
270
|
-
local_path = "html_ddc_pages/#{((id-1)/1000)}/ddc_#{id}.html"
|
289
|
+
url_method = 'index'
|
290
|
+
local_path = "html_ddc_pages/#{((id-1)/1000)}/ddc_#{id}.html"
|
271
291
|
else
|
272
292
|
puts "Called from unknown method. Probably its rspec."
|
273
|
-
end
|
293
|
+
end
|
274
294
|
|
275
295
|
options[:local] ||= false
|
276
296
|
url = "#{Bookshark::path_to_storage}/#{local_path}" if options[:local]
|
@@ -279,7 +299,7 @@ module Bookshark
|
|
279
299
|
uri = options[:uri] ||= url
|
280
300
|
|
281
301
|
return uri
|
282
|
-
end
|
302
|
+
end
|
283
303
|
|
284
304
|
def change_format(hash, format)
|
285
305
|
case format
|
@@ -288,10 +308,10 @@ module Bookshark
|
|
288
308
|
when 'json'
|
289
309
|
hash = hash.to_json
|
290
310
|
when 'pretty_json'
|
291
|
-
hash = JSON.pretty_generate(hash)
|
311
|
+
hash = JSON.pretty_generate(hash)
|
292
312
|
end
|
293
313
|
return hash
|
294
|
-
end
|
314
|
+
end
|
295
315
|
|
296
316
|
def eager_extract_book(uri)
|
297
317
|
book_extractor = Biblionet::Extractors::BookExtractor.new
|
@@ -301,13 +321,13 @@ module Bookshark
|
|
301
321
|
|
302
322
|
book = book_extractor.load_and_extract_book(uri)
|
303
323
|
|
304
|
-
tmp_data = []
|
324
|
+
tmp_data = []
|
305
325
|
book[:author].each do |author|
|
306
|
-
tmp_data << author_extractor.load_and_extract_author("http://www.biblionet.gr/author/#{author[:b_id]}")
|
326
|
+
tmp_data << author_extractor.load_and_extract_author("http://www.biblionet.gr/author/#{author[:b_id]}")
|
307
327
|
end
|
308
|
-
book[:author] = tmp_data
|
309
|
-
|
310
|
-
tmp_data, tmp_hash = [], {}
|
328
|
+
book[:author] = tmp_data
|
329
|
+
|
330
|
+
tmp_data, tmp_hash = [], {}
|
311
331
|
book[:contributors].each do |job, contributors|
|
312
332
|
contributors.each do |contributor|
|
313
333
|
tmp_data << author_extractor.load_and_extract_author("http://www.biblionet.gr/author/#{contributor[:b_id]}")
|
@@ -317,19 +337,19 @@ module Bookshark
|
|
317
337
|
end
|
318
338
|
book[:contributors] = tmp_hash
|
319
339
|
|
320
|
-
tmp_data, tmp_hash = [], {}
|
340
|
+
tmp_data, tmp_hash = [], {}
|
321
341
|
book[:category].each do |category|
|
322
342
|
tmp_data << category_extractor.extract_categories_from("http://www.biblionet.gr/index/#{category[:b_id]}")
|
323
343
|
end
|
324
|
-
book[:category] = tmp_data
|
325
|
-
|
326
|
-
tmp_data = []
|
327
|
-
tmp_data << publisher_extractor.load_and_extract_publisher("http://www.biblionet.gr/com/#{book[:publisher][:b_id]}")
|
344
|
+
book[:category] = tmp_data
|
345
|
+
|
346
|
+
tmp_data = []
|
347
|
+
tmp_data << publisher_extractor.load_and_extract_publisher("http://www.biblionet.gr/com/#{book[:publisher][:b_id]}")
|
328
348
|
book[:publisher] = tmp_data
|
329
349
|
|
330
350
|
book
|
331
|
-
end
|
332
|
-
|
351
|
+
end
|
352
|
+
|
333
353
|
end
|
334
354
|
|
335
355
|
|
@@ -339,7 +359,7 @@ module Bookshark
|
|
339
359
|
|
340
360
|
def initialize(options = {})
|
341
361
|
options = DEFAULTS.merge(options)
|
342
|
-
@site = options[:site]
|
362
|
+
@site = options[:site]
|
343
363
|
end
|
344
364
|
|
345
365
|
def publishers
|
@@ -362,11 +382,11 @@ module Bookshark
|
|
362
382
|
crawler.crawl_and_save
|
363
383
|
end
|
364
384
|
|
365
|
-
end
|
385
|
+
end
|
366
386
|
|
367
387
|
# module Biblionet
|
368
388
|
# class Extract
|
369
|
-
# class << self
|
389
|
+
# class << self
|
370
390
|
# def author(uri=nil)
|
371
391
|
# author_extractor = BiblionetParser::Core::AuthorExtractor.new
|
372
392
|
# author_extractor.load_and_extract_author(uri)
|
@@ -384,7 +404,7 @@ module Bookshark
|
|
384
404
|
|
385
405
|
# end
|
386
406
|
# end
|
387
|
-
# end
|
407
|
+
# end
|
388
408
|
end
|
389
409
|
|
390
410
|
|
@@ -467,4 +487,4 @@ end
|
|
467
487
|
# Problematic at biblionet
|
468
488
|
# http://biblionet.gr/book/196388
|
469
489
|
# http://biblionet.gr/book/196386
|
470
|
-
# http://biblionet.gr/book/195525
|
490
|
+
# http://biblionet.gr/book/195525
|
@@ -170,10 +170,17 @@ module Biblionet
|
|
170
170
|
text: publisher_node.text,
|
171
171
|
b_id: (publisher_node[:href].split("/"))[2]
|
172
172
|
}
|
173
|
-
|
173
|
+
last_author = @nodeset
|
174
174
|
.xpath("//a[@class='booklink' and @href[contains(.,'/author/') ]][last()]").last
|
175
|
-
|
176
|
-
|
175
|
+
|
176
|
+
if !last_author.nil? && !last_author.empty?
|
177
|
+
after_last_author_text = last_author.next_sibling.text.strip
|
178
|
+
else
|
179
|
+
last_book = @nodeset
|
180
|
+
.xpath("//a[@class='booklink' and @href[contains(.,'/book/') ]][last()]").last
|
181
|
+
after_last_author_text = last_book.next_sibling.text.strip
|
182
|
+
end
|
183
|
+
|
177
184
|
details_hash[:publication] = {
|
178
185
|
year: after_last_author_text[/(?<=: )\d+(?=\.)/],
|
179
186
|
version: after_last_author_text[/(?<=- )\d+(?=η)/],
|
@@ -0,0 +1,110 @@
|
|
1
|
+
#!/bin/env ruby
|
2
|
+
# encoding: utf-8
|
3
|
+
|
4
|
+
require 'rubygems'
|
5
|
+
require 'json'
|
6
|
+
require 'logger'
|
7
|
+
require 'pp'
|
8
|
+
require 'marc'
|
9
|
+
require 'htmlentities'
|
10
|
+
|
11
|
+
module Nlg
|
12
|
+
module Extractors
|
13
|
+
|
14
|
+
class Base
|
15
|
+
|
16
|
+
attr_reader :url, :nlg_id, :page
|
17
|
+
|
18
|
+
def initialize(id=nil)
|
19
|
+
load_page(id)
|
20
|
+
end
|
21
|
+
|
22
|
+
def load_page(id=nil)
|
23
|
+
load_page_by_id(id) unless id.nil?
|
24
|
+
end
|
25
|
+
|
26
|
+
def load_page_by_id(id)
|
27
|
+
begin
|
28
|
+
@nlg_id = id unless id.nil? # id is expected to be the last number.
|
29
|
+
@url = "http://nbib.nlg.gr/Record/#{@nlg_id}/Export?style=MARCXML"
|
30
|
+
|
31
|
+
pp "Downloading page: #{@url}"
|
32
|
+
|
33
|
+
Net::HTTP.start("nbib.nlg.gr") do |http|
|
34
|
+
response = http.get("/Record/#{@nlg_id}/Export?style=MARCXML")
|
35
|
+
pp response.content_type
|
36
|
+
pp response.code
|
37
|
+
raise EmptyPageError.new(@url) unless response.content_type == "text/xml" && response.code == "200"
|
38
|
+
|
39
|
+
@page = response.body
|
40
|
+
end
|
41
|
+
|
42
|
+
rescue Errno::ENOENT => e
|
43
|
+
pp "Page: #{@url} NOT FOUND."
|
44
|
+
pp e
|
45
|
+
rescue EmptyPageError => e
|
46
|
+
pp "Page: #{@url} is EMPTY."
|
47
|
+
pp e
|
48
|
+
@page = nil
|
49
|
+
rescue OpenURI::HTTPError => e
|
50
|
+
pp e
|
51
|
+
pp e.io.status
|
52
|
+
rescue StandardError => e
|
53
|
+
pp "Generic error #{e.class}. Will wait for 2 minutes and then try again."
|
54
|
+
pp e
|
55
|
+
sleep(120)
|
56
|
+
retry
|
57
|
+
end
|
58
|
+
end
|
59
|
+
|
60
|
+
# Decodes text with escaped html entities and returns the decoded text.
|
61
|
+
#
|
62
|
+
# ==== Params:
|
63
|
+
#
|
64
|
+
# +encoded_text+:: the text which contains encoded entities
|
65
|
+
#
|
66
|
+
def decode_text(encoded_text)
|
67
|
+
self.class.decode_text(encoded_text)
|
68
|
+
end
|
69
|
+
|
70
|
+
def self.decode_text(encoded_text)
|
71
|
+
# encoded_text = File.read(encoded_file_path)
|
72
|
+
coder = HTMLEntities.new
|
73
|
+
coder.decode(encoded_text)
|
74
|
+
end
|
75
|
+
|
76
|
+
def present?(value)
|
77
|
+
return (not value.nil? and not value.empty?) ? true : false
|
78
|
+
end
|
79
|
+
|
80
|
+
end
|
81
|
+
|
82
|
+
# Raised when a page is considered empty.
|
83
|
+
#
|
84
|
+
class EmptyPageError < StandardError
|
85
|
+
attr_reader :url
|
86
|
+
|
87
|
+
def initialize(url)
|
88
|
+
@url = url
|
89
|
+
|
90
|
+
msg = "Page: #{url} is not valid xml so it is considered EMPTY."
|
91
|
+
super(msg)
|
92
|
+
end
|
93
|
+
end
|
94
|
+
|
95
|
+
# Raised when something unexpected or in wrong format is parsed.
|
96
|
+
#
|
97
|
+
class NoIdeaWhatThisIsError < StandardError
|
98
|
+
attr_reader :nlg_id, :the_unexpected
|
99
|
+
|
100
|
+
def initialize(nlg_id, the_unexpected)
|
101
|
+
@nlg_id = nlg_id
|
102
|
+
@the_unexpected = the_unexpected
|
103
|
+
|
104
|
+
msg = "We have no idea what this: #{the_unexpected} is. At book #{nlg_id}"
|
105
|
+
super(msg)
|
106
|
+
end
|
107
|
+
end
|
108
|
+
|
109
|
+
end
|
110
|
+
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
#!/bin/env ruby
|
2
|
+
# encoding: utf-8
|
3
|
+
|
4
|
+
require_relative 'base'
|
5
|
+
|
6
|
+
module Nlg
|
7
|
+
module Extractors
|
8
|
+
|
9
|
+
class BookExtractor < Base
|
10
|
+
attr_reader :book
|
11
|
+
|
12
|
+
def initialize(id=nil)
|
13
|
+
super(id)
|
14
|
+
extract_book unless id.nil? or @page.nil?
|
15
|
+
end
|
16
|
+
|
17
|
+
def load_and_extract_book(id=nil)
|
18
|
+
load_page(id)
|
19
|
+
extract_book unless id.nil? or @page.nil?
|
20
|
+
end
|
21
|
+
|
22
|
+
def extract_book(nlg_id=@nlg_id, book_page=@page)
|
23
|
+
puts "should extract book #{nlg_id} from nlg"
|
24
|
+
end
|
25
|
+
|
26
|
+
end
|
27
|
+
end
|
28
|
+
end
|
data/lib/bookshark/version.rb
CHANGED
@@ -42,10 +42,10 @@
|
|
42
42
|
"name": "Πανεπιστημιακές Εκδόσεις Κρήτης",
|
43
43
|
"owner": "Στέφανος Τραχανάς",
|
44
44
|
"bookstores": {
|
45
|
-
"
|
45
|
+
"Υποκατάστημα": {
|
46
46
|
"address": [
|
47
|
-
"
|
48
|
-
"106 77
|
47
|
+
"Κλεισόβης 3",
|
48
|
+
"106 77 Αθήνα"
|
49
49
|
],
|
50
50
|
"telephone": [
|
51
51
|
"210 38490203"
|
@@ -372,7 +372,7 @@
|
|
372
372
|
"format": "Βιβλίο",
|
373
373
|
"original_language": null,
|
374
374
|
"original_title": null,
|
375
|
-
"price": "6,
|
375
|
+
"price": "6,82",
|
376
376
|
"availability": "Κυκλοφορεί",
|
377
377
|
"last_update": null,
|
378
378
|
"series": {
|
@@ -421,7 +421,7 @@
|
|
421
421
|
"format": "Βιβλίο",
|
422
422
|
"original_language": null,
|
423
423
|
"original_title": null,
|
424
|
-
"price": "3,
|
424
|
+
"price": "3,71",
|
425
425
|
"availability": "Κυκλοφορεί",
|
426
426
|
"last_update": null,
|
427
427
|
"series": {
|
@@ -445,7 +445,7 @@
|
|
445
445
|
"contributors": {
|
446
446
|
},
|
447
447
|
"publisher": {
|
448
|
-
"text": "Δωδώνη
|
448
|
+
"text": "Δωδώνη",
|
449
449
|
"b_id": "1"
|
450
450
|
},
|
451
451
|
"isbn": "960-248-541-8",
|
@@ -470,7 +470,7 @@
|
|
470
470
|
"format": "Βιβλίο",
|
471
471
|
"original_language": null,
|
472
472
|
"original_title": null,
|
473
|
-
"price": "10,
|
473
|
+
"price": "10,55",
|
474
474
|
"availability": "Κυκλοφορεί",
|
475
475
|
"last_update": null,
|
476
476
|
"series": {
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: bookshark
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.
|
4
|
+
version: 1.0.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Dimitris Klisiaris
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2017-02-
|
11
|
+
date: 2017-02-27 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|
@@ -72,6 +72,20 @@ dependencies:
|
|
72
72
|
- - "~>"
|
73
73
|
- !ruby/object:Gem::Version
|
74
74
|
version: '4.3'
|
75
|
+
- !ruby/object:Gem::Dependency
|
76
|
+
name: marc
|
77
|
+
requirement: !ruby/object:Gem::Requirement
|
78
|
+
requirements:
|
79
|
+
- - "~>"
|
80
|
+
- !ruby/object:Gem::Version
|
81
|
+
version: '1.0'
|
82
|
+
type: :runtime
|
83
|
+
prerelease: false
|
84
|
+
version_requirements: !ruby/object:Gem::Requirement
|
85
|
+
requirements:
|
86
|
+
- - "~>"
|
87
|
+
- !ruby/object:Gem::Version
|
88
|
+
version: '1.0'
|
75
89
|
- !ruby/object:Gem::Dependency
|
76
90
|
name: bundler
|
77
91
|
requirement: !ruby/object:Gem::Requirement
|
@@ -128,6 +142,20 @@ dependencies:
|
|
128
142
|
- - "~>"
|
129
143
|
- !ruby/object:Gem::Version
|
130
144
|
version: '1.2'
|
145
|
+
- !ruby/object:Gem::Dependency
|
146
|
+
name: pry-byebug
|
147
|
+
requirement: !ruby/object:Gem::Requirement
|
148
|
+
requirements:
|
149
|
+
- - "~>"
|
150
|
+
- !ruby/object:Gem::Version
|
151
|
+
version: '3.4'
|
152
|
+
type: :development
|
153
|
+
prerelease: false
|
154
|
+
version_requirements: !ruby/object:Gem::Requirement
|
155
|
+
requirements:
|
156
|
+
- - "~>"
|
157
|
+
- !ruby/object:Gem::Version
|
158
|
+
version: '3.4'
|
131
159
|
description: Extracts book, author, publisher and category metadata from biblionet.gr.
|
132
160
|
email:
|
133
161
|
- dklisiaris@gmail.com
|
@@ -155,6 +183,8 @@ files:
|
|
155
183
|
- lib/bookshark/extractors/bibliographical_book_extractor.rb
|
156
184
|
- lib/bookshark/extractors/book_extractor.rb
|
157
185
|
- lib/bookshark/extractors/category_extractor.rb
|
186
|
+
- lib/bookshark/extractors/nlg/base.rb
|
187
|
+
- lib/bookshark/extractors/nlg/book_extractor.rb
|
158
188
|
- lib/bookshark/extractors/publisher_extractor.rb
|
159
189
|
- lib/bookshark/extractors/search.rb
|
160
190
|
- lib/bookshark/storage/file_manager.rb
|