bookshark 1.0.0.beta.5 → 1.0.0.pre.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +26 -8
- data/bookshark.gemspec +1 -1
- data/lib/bookshark.rb +19 -1
- data/lib/bookshark/extractors/base.rb +4 -0
- data/lib/bookshark/extractors/bibliographical_book_extractor.rb +172 -0
- data/lib/bookshark/extractors/book_extractor.rb +33 -5
- data/lib/bookshark/extractors/publisher_extractor.rb +1 -0
- data/lib/bookshark/version.rb +1 -1
- data/spec/bookshark_spec.rb +4 -0
- data/spec/test_data/book_103788.json +22 -6
- data/spec/test_data/eager_book_184923.json +22 -6
- data/spec/test_data/search_01.json +185 -55
- metadata +5 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 7e2e0077beb0dadb4ef9df9fcce7b7d55d54e8a6
|
4
|
+
data.tar.gz: 712b2769781e8b1ea55ecb25a501d886a939e307
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: a2348fa9b757abe44be69c0d21e5c61bc41e87eaae80d11bafff115201f40793d2724d6fe5f518c3c3f26308696febd4c772f8a7e989f085949ea6ab584c7264
|
7
|
+
data.tar.gz: 221cdadd0bcaf5df156cb668465b6a60e7471a5a650ddcce703a602a646dac19dee008a4de088ed037695289d7fa983792d1a5397ea40cfe843a39bf50155bb0
|
data/README.md
CHANGED
@@ -13,7 +13,7 @@ The representation of bibliographic metadata in JSON is inspired by [BibJSON](ht
|
|
13
13
|
Add this line to your application's Gemfile:
|
14
14
|
|
15
15
|
```ruby
|
16
|
-
gem 'bookshark', "~> 1.0.0.
|
16
|
+
gem 'bookshark', "~> 1.0.0.pre"
|
17
17
|
```
|
18
18
|
|
19
19
|
And then execute:
|
@@ -145,29 +145,47 @@ The expected result of a book extraction is something like this:
|
|
145
145
|
]
|
146
146
|
},
|
147
147
|
"publisher": {
|
148
|
-
"
|
148
|
+
"text": "Εκδοτικός Οίκος Α. Α. Λιβάνη",
|
149
149
|
"b_id": "271"
|
150
150
|
},
|
151
|
-
"publication_year": "2006",
|
152
|
-
"pages": "326",
|
153
151
|
"isbn": "960-14-1157-7",
|
154
152
|
"isbn_13": "978-960-14-1157-6",
|
155
|
-
"status": "Κυκλοφορεί",
|
156
|
-
"price": "16,31",
|
157
153
|
"award": [
|
154
|
+
|
158
155
|
],
|
159
156
|
"description": "Τι είναι πιο επικίνδυνο, ένα όπλο ή μια πισίνα; Τι κοινό έχουν οι δάσκαλοι με τους παλαιστές του σούμο;...",
|
160
157
|
"category": [
|
161
158
|
{
|
162
159
|
"ddc": "330",
|
163
|
-
"
|
160
|
+
"name": "Οικονομία",
|
164
161
|
"b_id": "142"
|
165
162
|
}
|
166
163
|
],
|
167
|
-
"b_id": "103788"
|
164
|
+
"b_id": "103788",
|
165
|
+
"publication": {
|
166
|
+
"year": "2006",
|
167
|
+
"version": "1",
|
168
|
+
"place": "Αθήνα"
|
169
|
+
},
|
170
|
+
"format": "Βιβλίο",
|
171
|
+
"original_language": "αγγλικά",
|
172
|
+
"original_title": "Freakonomics",
|
173
|
+
"price": "16,31",
|
174
|
+
"availability": "Κυκλοφορεί",
|
175
|
+
"last_update": "27/1/2006",
|
176
|
+
"series": {
|
177
|
+
"name": "Οικονομία",
|
178
|
+
"volume": null
|
179
|
+
},
|
180
|
+
"physical_description": {
|
181
|
+
"pages": "326",
|
182
|
+
"size": "21x14",
|
183
|
+
"cover_type": "Μαλακό εξώφυλλο"
|
184
|
+
}
|
168
185
|
}
|
169
186
|
]
|
170
187
|
}
|
188
|
+
|
171
189
|
```
|
172
190
|
Here is a [Book Sample](https://gist.github.com/dklisiaris/a6f3d6f37806186f3c79) extracted with eager option enabled.
|
173
191
|
|
data/bookshark.gemspec
CHANGED
@@ -21,7 +21,7 @@ Gem::Specification.new do |spec|
|
|
21
21
|
spec.require_paths = ["lib"]
|
22
22
|
|
23
23
|
spec.add_dependency "nokogiri", "~> 1.6", ">= 1.6.6"
|
24
|
-
spec.add_dependency "sanitize", "~>
|
24
|
+
spec.add_dependency "sanitize", "~> 4.0"
|
25
25
|
spec.add_dependency "json", "~> 1.8"
|
26
26
|
spec.add_dependency "htmlentities", "~> 4.3"
|
27
27
|
|
data/lib/bookshark.rb
CHANGED
@@ -4,6 +4,7 @@ require 'bookshark/storage/file_manager'
|
|
4
4
|
require 'bookshark/extractors/author_extractor'
|
5
5
|
require 'bookshark/extractors/category_extractor'
|
6
6
|
require 'bookshark/extractors/book_extractor'
|
7
|
+
require 'bookshark/extractors/bibliographical_book_extractor'
|
7
8
|
require 'bookshark/extractors/publisher_extractor'
|
8
9
|
require 'bookshark/extractors/search'
|
9
10
|
|
@@ -91,6 +92,23 @@ module Bookshark
|
|
91
92
|
return response
|
92
93
|
end
|
93
94
|
|
95
|
+
|
96
|
+
# def bibliographical_book(options = {})
|
97
|
+
# bibliographical_book_extractor = Biblionet::Extractors::BibliographicalBookExtractor.new
|
98
|
+
|
99
|
+
# uri = "http://www.biblionet.gr/main.asp?page=results&Titlesid=#{options[:id]}"
|
100
|
+
# options[:format] ||= @format
|
101
|
+
|
102
|
+
# book = bibliographical_book_extractor.load_and_extract_book(uri)
|
103
|
+
|
104
|
+
# response = {}
|
105
|
+
# response[:book] = !book.nil? ? [book] : []
|
106
|
+
# response = change_format(response, options[:format])
|
107
|
+
# response = bibliographical_book_extractor.decode_text(response)
|
108
|
+
# end
|
109
|
+
|
110
|
+
# puts Bookshark::Extractor.new(format: 'pretty_json').bibliographical_book(id: 103788)
|
111
|
+
|
94
112
|
def category(options = {})
|
95
113
|
uri = process_options(options, __method__)
|
96
114
|
options[:format] ||= @format
|
@@ -231,7 +249,7 @@ module Bookshark
|
|
231
249
|
local_path = "html_book_pages/#{((id-1)/1000)}/book_#{id}.html"
|
232
250
|
when 'category'
|
233
251
|
url_method = 'index'
|
234
|
-
local_path = "html_ddc_pages/#{((id-1)/1000)}/ddc_#{id}.html"
|
252
|
+
local_path = "html_ddc_pages/#{((id-1)/1000)}/ddc_#{id}.html"
|
235
253
|
else
|
236
254
|
puts "Called from unknown method. Probably its rspec."
|
237
255
|
end
|
@@ -141,6 +141,10 @@ module Biblionet
|
|
141
141
|
# +encoded_text+:: the text which contains encoded entities
|
142
142
|
#
|
143
143
|
def decode_text(encoded_text)
|
144
|
+
self.class.decode_text(encoded_text)
|
145
|
+
end
|
146
|
+
|
147
|
+
def self.decode_text(encoded_text)
|
144
148
|
# encoded_text = File.read(encoded_file_path)
|
145
149
|
coder = HTMLEntities.new
|
146
150
|
coder.decode(encoded_text)
|
@@ -0,0 +1,172 @@
|
|
1
|
+
#!/bin/env ruby
|
2
|
+
# encoding: utf-8
|
3
|
+
|
4
|
+
require_relative 'base'
|
5
|
+
|
6
|
+
|
7
|
+
module Biblionet
|
8
|
+
module Extractors
|
9
|
+
|
10
|
+
# Extracts the bibliographical record of a single book from a Biblionet
# results page and exposes it as a plain Hash.
#
# Page loading is inherited from Base.
# NOTE(review): Base is defined in another file; this class presumably relies
# on it to populate @page and @biblionet_id -- confirm against base.rb.
class BibliographicalBookExtractor < Base
  # The Hash produced by the most recent successful extraction (nil before that).
  attr_reader :bibliographical_book

  # Keys copied from BibliographicalBookDataExtractor#details into the result,
  # in the order they appear in the resulting Hash. Keys that the page did not
  # provide are still present, with a nil value.
  EXTRACTED_KEYS = [
    :original_language,
    :original_title,
    :last_update,
    :cover_type,
    :availability,
    :price,
    :series,
    :physical_size,
    :format,
    :publisher,
    :publication
  ]

  # uri - optional location of the page to load; when given and the page
  #       could be loaded, the book is extracted immediately.
  def initialize(uri=nil)
    super(uri)
    extract_bibliographical_book unless uri.nil? || @page.nil?
  end

  # Loads the page at +uri+ and extracts the book from it.
  #
  # Returns the extracted Hash, or nil when no uri was given, the page could
  # not be loaded, or the page has no recognisable content section.
  def load_and_extract_book(uri=nil)
    load_page(uri)
    extract_bibliographical_book unless uri.nil? || @page.nil?
  end

  # Builds the bibliographical Hash out of an already loaded page.
  #
  # biblionet_id - kept for interface compatibility; not used by the extraction.
  # book_page    - raw HTML of the results page (defaults to the loaded page).
  #
  # Returns the Hash (also stored in #bibliographical_book), or nil when the
  # data extractor could not build a nodeset from the page.
  def extract_bibliographical_book(biblionet_id=@biblionet_id, book_page=@page)
    page = BibliographicalBookDataExtractor.new(book_page)

    # Stop here if the data extractor could not create a nodeset.
    return nil if page.nodeset.nil?

    extracted_details = page.details

    @bibliographical_book = EXTRACTED_KEYS.each_with_object({}) do |key, book|
      book[key] = extracted_details[key]
    end
  end

end
|
55
|
+
|
56
|
+
# Pulls individual bibliographical fields out of the HTML of a Biblionet
# results page. Only the part of the page between the CONTENT START / CONTENT
# END markers is parsed.
class BibliographicalBookDataExtractor
  # Parsed content section, or nil when the markers were not found.
  attr_reader :nodeset

  # No need to operate on the whole page, just on the part containing the book.
  CONTENT_REGEX = /<!-- CONTENT START -->.*<!-- CONTENT END -->/m

  # Labels that prefix the corresponding detail lines.
  ORIGINAL_LANGUAGE_LABEL = "Γλώσσα πρωτοτύπου:"
  ORIGINAL_TITLE_LABEL    = "Τίτλος πρωτοτύπου:"

  # Patterns tried against every <br>-separated detail line. When several
  # lines match the same pattern, the last matching line wins.
  DETAIL_PATTERNS = {
    :isbn         => /(?<= )\d+-\d+-\d+-\d+(?= |,)/,
    :isbn_13      => /\d+-\d+-\d+-\d+-\d+/,
    :last_update  => /\d{1,2}\/\d{1,2}\/\d{2,4}/,
    :cover_type   => /(?<=\()\p{Word}+( \p{Word}+)?(?=\))/,
    :availability => /(?<=\[).+(?=\])/,
    :price        => /(?<=€ )\d+,\d*/
  }

  # "(Series name · 12)" and "(Series name)" forms, plus their parts.
  SERIES_REGEX           = /(?<=\()\p{Word}+( \S)?( \p{Word}+( \S)?)* · \d+(?=\))/
  SERIES_NO_VOLUME_REGEX = /(?<=\()\p{Word}+( \S)?( \p{Word}+( \S)?)*(?=\))/
  SERIES_NAME_REGEX      = /\p{Word}+( \S)?( \p{Word}+( \S)?)*(?= ·)/
  SERIES_VOLUME_REGEX    = /(?<=· )\d+/

  PHYSICAL_SIZE_REGEX = /\d+x\d+/
  FORMAT_REGEX        = /(?<=\[).+(?=\])/
  YEAR_REGEX          = /(?<=, )\d+(?=\.)/
  VERSION_REGEX       = /(?<=- )\d+(?=η)/
  PLACE_REGEX         = /(?<=- )\p{Word}+( \S)?( \p{Word}+( \S)?)*(?= :)/

  # Format reported when the title is not followed by a "[...]" marker.
  DEFAULT_FORMAT = 'Βιβλίο'

  DETAILS_XPATH     = "//span[@class='small'][1]"
  PRE_DETAILS_XPATH = "//span[@class='small'][1]/preceding::text()"
  TITLE_XPATH       = "//a[@class='booklink' and @href[contains(.,'/book/') ]][1]"
  PUBLISHER_XPATH   = "//a[@class='booklink' and @href[contains(.,'/com/') ]][1]"

  # document - raw HTML of the page.
  #
  # If the content markers are missing there is something wrong with the
  # html, so the nodeset is left nil and callers are expected to check it.
  def initialize(document)
    content = CONTENT_REGEX.match(document)
    @nodeset = content.nil? ? nil : Nokogiri::HTML(content[0])
  end

  # Returns the pattern for a "<width>x<height>" physical size.
  # NOTE(review): looks like an unfinished stub -- it only returns the regex
  # and never inspects the page; kept for interface compatibility.
  def size
    /\d+x\d+/
  end

  # Returns the pattern for the volume number of a series.
  # NOTE(review): looks like an unfinished stub -- it only returns the regex
  # and never inspects the page; kept for interface compatibility.
  def series
    /(?<=· )\d+/
  end

  # Collects every field this extractor knows about.
  #
  # Returns a Hash that always contains :series, :physical_size, :format,
  # :publisher and :publication, plus whichever of :original_language,
  # :original_title, :isbn, :isbn_13, :last_update, :cover_type,
  # :availability and :price were found in the detail lines. Returns an
  # empty Hash when the page had no content section (nodeset is nil).
  def details
    return {} if @nodeset.nil?

    details_hash     = line_details
    pre_details_text = decode(@nodeset.xpath(PRE_DETAILS_XPATH).text)
    publisher_node   = @nodeset.xpath(PUBLISHER_XPATH).first

    details_hash[:series]        = series_from(pre_details_text)
    details_hash[:physical_size] = pre_details_text[PHYSICAL_SIZE_REGEX]
    details_hash[:format]        = book_format
    details_hash[:publisher]     = publisher_from(publisher_node)
    details_hash[:publication]   = publication_from(publisher_node, pre_details_text)

    details_hash
  end

  private

  # Decodes HTML entities using the extractor's shared decoder.
  def decode(text)
    BibliographicalBookExtractor.decode_text(text)
  end

  # Scans the <br>-separated lines of the details span.
  def line_details
    found = {}

    @nodeset.xpath(DETAILS_XPATH).inner_html.split('<br>').each do |raw_line|
      line = decode(raw_line)

      if line.start_with?(ORIGINAL_LANGUAGE_LABEL)
        found[:original_language] = line.gsub(ORIGINAL_LANGUAGE_LABEL, "").strip
      elsif line.start_with?(ORIGINAL_TITLE_LABEL)
        found[:original_title] = line.gsub(ORIGINAL_TITLE_LABEL, "").strip
      end

      DETAIL_PATTERNS.each do |key, pattern|
        value = line[pattern]
        found[key] = value unless value.nil?
      end
    end

    found
  end

  # Reads the series name and volume from the text preceding the details span.
  # Returns an empty Hash when no series is mentioned.
  def series_from(text)
    with_volume = text[SERIES_REGEX]

    unless with_volume.nil?
      series_hash = {}
      name   = with_volume[SERIES_NAME_REGEX]
      volume = with_volume[SERIES_VOLUME_REGEX]
      series_hash[:name]   = name   unless name.nil?
      series_hash[:volume] = volume unless volume.nil?
      return series_hash
    end

    name_only = text[SERIES_NO_VOLUME_REGEX]
    name_only.nil? ? {} : { :name => name_only, :volume => nil }
  end

  # Reads the "[...]" marker that follows the title link; falls back to
  # DEFAULT_FORMAT when the link, its sibling or the marker is missing.
  def book_format
    title_node = @nodeset.xpath(TITLE_XPATH).first
    sibling    = title_node.nil? ? nil : title_node.next_sibling
    marker     = sibling.nil? ? nil : sibling.text.strip[FORMAT_REGEX]

    marker.nil? ? DEFAULT_FORMAT : marker
  end

  # Name and Biblionet id of the publisher; both nil when the page has no
  # publisher link.
  def publisher_from(publisher_node)
    return { :text => nil, :b_id => nil } if publisher_node.nil?

    # The id is the third "/"-separated segment of the link's href.
    { :text => publisher_node.text, :b_id => publisher_node[:href].split("/")[2] }
  end

  # Year (taken from the text right after the publisher link), edition
  # number and place of publication (taken from the text before the details).
  def publication_from(publisher_node, pre_details_text)
    following = publisher_node.nil? ? nil : publisher_node.next_sibling
    after_publisher_text = following.nil? ? nil : decode(following.text)

    {
      :year    => after_publisher_text.nil? ? nil : after_publisher_text[YEAR_REGEX],
      :version => pre_details_text[VERSION_REGEX],
      :place   => pre_details_text[PLACE_REGEX]
    }
  end

end
|
170
|
+
|
171
|
+
end
|
172
|
+
end
|
@@ -2,6 +2,7 @@
|
|
2
2
|
# encoding: utf-8
|
3
3
|
|
4
4
|
require_relative 'base'
|
5
|
+
require_relative 'bibliographical_book_extractor'
|
5
6
|
require 'sanitize'
|
6
7
|
|
7
8
|
module Biblionet
|
@@ -169,12 +170,12 @@ module Biblionet
|
|
169
170
|
|
170
171
|
details_hash = proccess_details(details)
|
171
172
|
|
172
|
-
book_hash[:publication_year] = details_hash[:publication_year]
|
173
|
-
book_hash[:pages] = details_hash[:pages]
|
173
|
+
# book_hash[:publication_year] = details_hash[:publication_year]
|
174
|
+
# book_hash[:pages] = details_hash[:pages]
|
174
175
|
book_hash[:isbn] = details_hash[:isbn]
|
175
176
|
book_hash[:isbn_13] = details_hash[:isbn_13].nil? ? nil : details_hash[:isbn_13]
|
176
|
-
book_hash[:status] = details_hash[:status]
|
177
|
-
book_hash[:price] = details_hash[:price]
|
177
|
+
# book_hash[:status] = details_hash[:status]
|
178
|
+
# book_hash[:price] = details_hash[:price]
|
178
179
|
book_hash[:award] = page.awards
|
179
180
|
|
180
181
|
|
@@ -192,7 +193,34 @@ module Biblionet
|
|
192
193
|
|
193
194
|
|
194
195
|
book_hash[:category] = ddcs
|
195
|
-
book_hash[:b_id] = biblionet_id
|
196
|
+
book_hash[:b_id] = biblionet_id
|
197
|
+
|
198
|
+
uri = "http://www.biblionet.gr/main.asp?page=results&Titlesid=#{biblionet_id}"
|
199
|
+
|
200
|
+
bibliographical_book_extractor = Biblionet::Extractors::BibliographicalBookExtractor.new
|
201
|
+
bibliographical_details = bibliographical_book_extractor.load_and_extract_book(uri)
|
202
|
+
|
203
|
+
book_hash[:publisher] = bibliographical_details[:publisher]
|
204
|
+
book_hash[:publication] = bibliographical_details[:publication]
|
205
|
+
|
206
|
+
book_hash[:format] = bibliographical_details[:format]
|
207
|
+
|
208
|
+
book_hash[:original_language] = bibliographical_details[:original_language]
|
209
|
+
book_hash[:original_title] = bibliographical_details[:original_title]
|
210
|
+
|
211
|
+
book_hash[:price] = bibliographical_details[:price]
|
212
|
+
book_hash[:availability] = bibliographical_details[:availability]
|
213
|
+
book_hash[:last_update] = bibliographical_details[:last_update]
|
214
|
+
|
215
|
+
book_hash[:series] = bibliographical_details[:series]
|
216
|
+
|
217
|
+
physical_description_hash = {}
|
218
|
+
physical_description_hash[:pages] = details_hash[:pages]
|
219
|
+
physical_description_hash[:size] = bibliographical_details[:physical_size]
|
220
|
+
physical_description_hash[:cover_type] = bibliographical_details[:cover_type]
|
221
|
+
|
222
|
+
book_hash[:physical_description] = physical_description_hash
|
223
|
+
|
196
224
|
|
197
225
|
return @book = book_hash
|
198
226
|
end
|
@@ -99,6 +99,7 @@ module Biblionet
|
|
99
99
|
headquarters_hash = Hash[headquarters_hash.map {|k, v| [mappings[k], v] }]
|
100
100
|
headquarters_hash[:telephone] = [headquarters_hash[:telephone]] unless headquarters_hash[:telephone].kind_of?(Array) or headquarters_hash[:telephone].nil?
|
101
101
|
headquarters_hash[:website] = headquarters_hash[:website].split(',').map(&:strip) if (headquarters_hash[:website] and headquarters_hash[:website].include? ',')
|
102
|
+
headquarters_hash[:address] = [headquarters_hash[:address]] unless headquarters_hash[:address].kind_of?(Array)
|
102
103
|
|
103
104
|
return headquarters_hash
|
104
105
|
end
|
data/lib/bookshark/version.rb
CHANGED
data/spec/bookshark_spec.rb
CHANGED
@@ -23,15 +23,11 @@
|
|
23
23
|
]
|
24
24
|
},
|
25
25
|
"publisher": {
|
26
|
-
"
|
26
|
+
"text": "Εκδοτικός Οίκος Α. Α. Λιβάνη",
|
27
27
|
"b_id": "271"
|
28
28
|
},
|
29
|
-
"publication_year": "2006",
|
30
|
-
"pages": "326",
|
31
29
|
"isbn": "960-14-1157-7",
|
32
30
|
"isbn_13": "978-960-14-1157-6",
|
33
|
-
"status": "Κυκλοφορεί",
|
34
|
-
"price": "16,31",
|
35
31
|
"award": [
|
36
32
|
|
37
33
|
],
|
@@ -43,7 +39,27 @@
|
|
43
39
|
"b_id": "142"
|
44
40
|
}
|
45
41
|
],
|
46
|
-
"b_id": "103788"
|
42
|
+
"b_id": "103788",
|
43
|
+
"publication": {
|
44
|
+
"year": "2006",
|
45
|
+
"version": "1",
|
46
|
+
"place": "Αθήνα"
|
47
|
+
},
|
48
|
+
"format": "Βιβλίο",
|
49
|
+
"original_language": "αγγλικά",
|
50
|
+
"original_title": "Freakonomics",
|
51
|
+
"price": "16,31",
|
52
|
+
"availability": "Κυκλοφορεί",
|
53
|
+
"last_update": "27/1/2006",
|
54
|
+
"series": {
|
55
|
+
"name": "Οικονομία",
|
56
|
+
"volume": null
|
57
|
+
},
|
58
|
+
"physical_description": {
|
59
|
+
"pages": "326",
|
60
|
+
"size": "21x14",
|
61
|
+
"cover_type": "Μαλακό εξώφυλλο"
|
62
|
+
}
|
47
63
|
}
|
48
64
|
]
|
49
65
|
}
|
@@ -71,12 +71,8 @@
|
|
71
71
|
"b_id": "112"
|
72
72
|
}
|
73
73
|
],
|
74
|
-
"publication_year": "2012",
|
75
|
-
"pages": "345",
|
76
74
|
"isbn": "978-960-524-394-4",
|
77
75
|
"isbn_13": null,
|
78
|
-
"status": "Κυκλοφορεί",
|
79
|
-
"price": "16,00",
|
80
76
|
"award": [
|
81
77
|
|
82
78
|
],
|
@@ -209,7 +205,27 @@
|
|
209
205
|
}
|
210
206
|
}
|
211
207
|
],
|
212
|
-
"b_id": "184923"
|
208
|
+
"b_id": "184923",
|
209
|
+
"publication": {
|
210
|
+
"year": "2012",
|
211
|
+
"version": "1",
|
212
|
+
"place": "Ηράκλειο Κρήτης"
|
213
|
+
},
|
214
|
+
"format": "Βιβλίο",
|
215
|
+
"original_language": null,
|
216
|
+
"original_title": null,
|
217
|
+
"price": "16,00",
|
218
|
+
"availability": "Κυκλοφορεί",
|
219
|
+
"last_update": "12/12/2012",
|
220
|
+
"series": {
|
221
|
+
"name": "Εκλαΐκευση της Επιστήμης",
|
222
|
+
"volume": null
|
223
|
+
},
|
224
|
+
"physical_description": {
|
225
|
+
"pages": "345",
|
226
|
+
"size": "24x17",
|
227
|
+
"cover_type": "Μαλακό εξώφυλλο"
|
228
|
+
}
|
213
229
|
}
|
214
230
|
]
|
215
|
-
}
|
231
|
+
}
|
@@ -13,15 +13,11 @@
|
|
13
13
|
"contributors": {
|
14
14
|
},
|
15
15
|
"publisher": {
|
16
|
-
"
|
16
|
+
"text": "Λογοσοφία",
|
17
17
|
"b_id": "7628"
|
18
18
|
},
|
19
|
-
"publication_year": "2007",
|
20
|
-
"pages": "181",
|
21
19
|
"isbn": "978-960-89288-3-1",
|
22
20
|
"isbn_13": null,
|
23
|
-
"status": "Κυκλοφορεί",
|
24
|
-
"price": "13,52",
|
25
21
|
"award": [
|
26
22
|
|
27
23
|
],
|
@@ -33,7 +29,25 @@
|
|
33
29
|
"b_id": "3"
|
34
30
|
}
|
35
31
|
],
|
36
|
-
"b_id": "119000"
|
32
|
+
"b_id": "119000",
|
33
|
+
"publication": {
|
34
|
+
"year": "2007",
|
35
|
+
"version": "1",
|
36
|
+
"place": "Αθήνα"
|
37
|
+
},
|
38
|
+
"format": "Βιβλίο",
|
39
|
+
"original_language": null,
|
40
|
+
"original_title": null,
|
41
|
+
"price": "13,52",
|
42
|
+
"availability": "Κυκλοφορεί",
|
43
|
+
"last_update": "9/5/2007",
|
44
|
+
"series": {
|
45
|
+
},
|
46
|
+
"physical_description": {
|
47
|
+
"pages": "181",
|
48
|
+
"size": "21x14",
|
49
|
+
"cover_type": "Μαλακό εξώφυλλο"
|
50
|
+
}
|
37
51
|
},
|
38
52
|
{
|
39
53
|
"title": "Σημεία και τέρατα της οικονομίας",
|
@@ -58,15 +72,11 @@
|
|
58
72
|
]
|
59
73
|
},
|
60
74
|
"publisher": {
|
61
|
-
"
|
75
|
+
"text": "Εκδοτικός Οίκος Α. Α. Λιβάνη",
|
62
76
|
"b_id": "271"
|
63
77
|
},
|
64
|
-
"publication_year": "2006",
|
65
|
-
"pages": "326",
|
66
78
|
"isbn": "960-14-1157-7",
|
67
79
|
"isbn_13": "978-960-14-1157-6",
|
68
|
-
"status": "Κυκλοφορεί",
|
69
|
-
"price": "16,31",
|
70
80
|
"award": [
|
71
81
|
|
72
82
|
],
|
@@ -78,7 +88,27 @@
|
|
78
88
|
"b_id": "142"
|
79
89
|
}
|
80
90
|
],
|
81
|
-
"b_id": "103788"
|
91
|
+
"b_id": "103788",
|
92
|
+
"publication": {
|
93
|
+
"year": "2006",
|
94
|
+
"version": "1",
|
95
|
+
"place": "Αθήνα"
|
96
|
+
},
|
97
|
+
"format": "Βιβλίο",
|
98
|
+
"original_language": "αγγλικά",
|
99
|
+
"original_title": "Freakonomics",
|
100
|
+
"price": "16,31",
|
101
|
+
"availability": "Κυκλοφορεί",
|
102
|
+
"last_update": "27/1/2006",
|
103
|
+
"series": {
|
104
|
+
"name": "Οικονομία",
|
105
|
+
"volume": null
|
106
|
+
},
|
107
|
+
"physical_description": {
|
108
|
+
"pages": "326",
|
109
|
+
"size": "21x14",
|
110
|
+
"cover_type": "Μαλακό εξώφυλλο"
|
111
|
+
}
|
82
112
|
},
|
83
113
|
{
|
84
114
|
"title": "Και άλλα σημεία και τέρατα από την ιστορία",
|
@@ -103,15 +133,11 @@
|
|
103
133
|
]
|
104
134
|
},
|
105
135
|
"publisher": {
|
106
|
-
"
|
136
|
+
"text": "Modern Times",
|
107
137
|
"b_id": "191"
|
108
138
|
},
|
109
|
-
"publication_year": "2004",
|
110
|
-
"pages": "62",
|
111
139
|
"isbn": "960-397-927-9",
|
112
140
|
"isbn_13": "978-960-397-927-2",
|
113
|
-
"status": "Κυκλοφορεί",
|
114
|
-
"price": "15,14",
|
115
141
|
"award": [
|
116
142
|
|
117
143
|
],
|
@@ -123,7 +149,25 @@
|
|
123
149
|
"b_id": "2456"
|
124
150
|
}
|
125
151
|
],
|
126
|
-
"b_id": "87815"
|
152
|
+
"b_id": "87815",
|
153
|
+
"publication": {
|
154
|
+
"year": "2004",
|
155
|
+
"version": null,
|
156
|
+
"place": "Αθήνα"
|
157
|
+
},
|
158
|
+
"format": "Βιβλίο",
|
159
|
+
"original_language": "αγγλικά",
|
160
|
+
"original_title": "Even more horrible history",
|
161
|
+
"price": "15,14",
|
162
|
+
"availability": "Κυκλοφορεί",
|
163
|
+
"last_update": null,
|
164
|
+
"series": {
|
165
|
+
},
|
166
|
+
"physical_description": {
|
167
|
+
"pages": "62",
|
168
|
+
"size": "28x22",
|
169
|
+
"cover_type": "Σκληρό εξώφυλλο"
|
170
|
+
}
|
127
171
|
},
|
128
172
|
{
|
129
173
|
"title": "Σημεία και τέρατα από την ιστορία",
|
@@ -148,15 +192,11 @@
|
|
148
192
|
]
|
149
193
|
},
|
150
194
|
"publisher": {
|
151
|
-
"
|
195
|
+
"text": "Modern Times",
|
152
196
|
"b_id": "191"
|
153
197
|
},
|
154
|
-
"publication_year": "2004",
|
155
|
-
"pages": "78",
|
156
198
|
"isbn": "960-397-926-0",
|
157
199
|
"isbn_13": "978-960-397-926-5",
|
158
|
-
"status": "Κυκλοφορεί",
|
159
|
-
"price": "15,14",
|
160
200
|
"award": [
|
161
201
|
|
162
202
|
],
|
@@ -168,7 +208,25 @@
|
|
168
208
|
"b_id": "2456"
|
169
209
|
}
|
170
210
|
],
|
171
|
-
"b_id": "87812"
|
211
|
+
"b_id": "87812",
|
212
|
+
"publication": {
|
213
|
+
"year": "2004",
|
214
|
+
"version": null,
|
215
|
+
"place": "Αθήνα"
|
216
|
+
},
|
217
|
+
"format": "Βιβλίο",
|
218
|
+
"original_language": "αγγλικά",
|
219
|
+
"original_title": "Horrible history",
|
220
|
+
"price": "15,14",
|
221
|
+
"availability": "Κυκλοφορεί",
|
222
|
+
"last_update": null,
|
223
|
+
"series": {
|
224
|
+
},
|
225
|
+
"physical_description": {
|
226
|
+
"pages": "78",
|
227
|
+
"size": "28x22",
|
228
|
+
"cover_type": "Σκληρό εξώφυλλο"
|
229
|
+
}
|
172
230
|
},
|
173
231
|
{
|
174
232
|
"title": "Σημεία και τέρατα",
|
@@ -183,15 +241,11 @@
|
|
183
241
|
"contributors": {
|
184
242
|
},
|
185
243
|
"publisher": {
|
186
|
-
"
|
244
|
+
"text": "Κέδρος",
|
187
245
|
"b_id": "21"
|
188
246
|
},
|
189
|
-
"publication_year": "1994",
|
190
|
-
"pages": "126",
|
191
247
|
"isbn": "960-04-0941-2",
|
192
248
|
"isbn_13": "978-960-04-0941-3",
|
193
|
-
"status": "Κυκλοφορεί",
|
194
|
-
"price": "9,17",
|
195
249
|
"award": [
|
196
250
|
|
197
251
|
],
|
@@ -203,7 +257,25 @@
|
|
203
257
|
"b_id": "9"
|
204
258
|
}
|
205
259
|
],
|
206
|
-
"b_id": "15839"
|
260
|
+
"b_id": "15839",
|
261
|
+
"publication": {
|
262
|
+
"year": "1994",
|
263
|
+
"version": "2",
|
264
|
+
"place": "Αθήνα"
|
265
|
+
},
|
266
|
+
"format": "Βιβλίο",
|
267
|
+
"original_language": null,
|
268
|
+
"original_title": null,
|
269
|
+
"price": "9,17",
|
270
|
+
"availability": "Κυκλοφορεί",
|
271
|
+
"last_update": "28/7/2010",
|
272
|
+
"series": {
|
273
|
+
},
|
274
|
+
"physical_description": {
|
275
|
+
"pages": "126",
|
276
|
+
"size": "21x14",
|
277
|
+
"cover_type": "Μαλακό εξώφυλλο"
|
278
|
+
}
|
207
279
|
},
|
208
280
|
{
|
209
281
|
"title": "Επίσημη αγραμματοσύνη",
|
@@ -218,15 +290,11 @@
|
|
218
290
|
"contributors": {
|
219
291
|
},
|
220
292
|
"publisher": {
|
221
|
-
"
|
293
|
+
"text": "Περιοδικό Πνευματική Ζωή",
|
222
294
|
"b_id": "6770"
|
223
295
|
},
|
224
|
-
"publication_year": null,
|
225
|
-
"pages": "48",
|
226
296
|
"isbn": null,
|
227
297
|
"isbn_13": null,
|
228
|
-
"status": "Κυκλοφορεί",
|
229
|
-
"price": null,
|
230
298
|
"award": [
|
231
299
|
|
232
300
|
],
|
@@ -238,7 +306,27 @@
|
|
238
306
|
"b_id": "1459"
|
239
307
|
}
|
240
308
|
],
|
241
|
-
"b_id": "77381"
|
309
|
+
"b_id": "77381",
|
310
|
+
"publication": {
|
311
|
+
"year": null,
|
312
|
+
"version": null,
|
313
|
+
"place": "Αθήνα"
|
314
|
+
},
|
315
|
+
"format": "Βιβλίο",
|
316
|
+
"original_language": null,
|
317
|
+
"original_title": null,
|
318
|
+
"price": null,
|
319
|
+
"availability": "Κυκλοφορεί",
|
320
|
+
"last_update": null,
|
321
|
+
"series": {
|
322
|
+
"name": "Νεοελληνικά Αφιερώματα",
|
323
|
+
"volume": "21"
|
324
|
+
},
|
325
|
+
"physical_description": {
|
326
|
+
"pages": "48",
|
327
|
+
"size": "24x17",
|
328
|
+
"cover_type": "Μαλακό εξώφυλλο"
|
329
|
+
}
|
242
330
|
},
|
243
331
|
{
|
244
332
|
"title": "Ο όλεθρος της πυραμίδος Καμπαλά",
|
@@ -259,15 +347,11 @@
|
|
259
347
|
]
|
260
348
|
},
|
261
349
|
"publisher": {
|
262
|
-
"
|
350
|
+
"text": "Μπίμπης Στερέωμα",
|
263
351
|
"b_id": "244"
|
264
352
|
},
|
265
|
-
"publication_year": null,
|
266
|
-
"pages": "269",
|
267
353
|
"isbn": null,
|
268
354
|
"isbn_13": null,
|
269
|
-
"status": "Κυκλοφορεί",
|
270
|
-
"price": "6,85",
|
271
355
|
"award": [
|
272
356
|
|
273
357
|
],
|
@@ -279,7 +363,25 @@
|
|
279
363
|
"b_id": "179"
|
280
364
|
}
|
281
365
|
],
|
282
|
-
"b_id": "46856"
|
366
|
+
"b_id": "46856",
|
367
|
+
"publication": {
|
368
|
+
"year": null,
|
369
|
+
"version": null,
|
370
|
+
"place": "Θεσσαλονίκη"
|
371
|
+
},
|
372
|
+
"format": "Βιβλίο",
|
373
|
+
"original_language": null,
|
374
|
+
"original_title": null,
|
375
|
+
"price": "6,85",
|
376
|
+
"availability": "Κυκλοφορεί",
|
377
|
+
"last_update": null,
|
378
|
+
"series": {
|
379
|
+
},
|
380
|
+
"physical_description": {
|
381
|
+
"pages": "269",
|
382
|
+
"size": "21x14",
|
383
|
+
"cover_type": "Μαλακό εξώφυλλο"
|
384
|
+
}
|
283
385
|
},
|
284
386
|
{
|
285
387
|
"title": "Η επανάσταση κατά της νέας τάξης",
|
@@ -294,15 +396,11 @@
|
|
294
396
|
"contributors": {
|
295
397
|
},
|
296
398
|
"publisher": {
|
297
|
-
"
|
399
|
+
"text": "Μπίμπης Στερέωμα",
|
298
400
|
"b_id": "244"
|
299
401
|
},
|
300
|
-
"publication_year": null,
|
301
|
-
"pages": "71",
|
302
402
|
"isbn": null,
|
303
403
|
"isbn_13": null,
|
304
|
-
"status": "Κυκλοφορεί",
|
305
|
-
"price": "3,73",
|
306
404
|
"award": [
|
307
405
|
|
308
406
|
],
|
@@ -314,7 +412,25 @@
|
|
314
412
|
"b_id": "2336"
|
315
413
|
}
|
316
414
|
],
|
317
|
-
"b_id": "46763"
|
415
|
+
"b_id": "46763",
|
416
|
+
"publication": {
|
417
|
+
"year": null,
|
418
|
+
"version": null,
|
419
|
+
"place": "Θεσσαλονίκη"
|
420
|
+
},
|
421
|
+
"format": "Βιβλίο",
|
422
|
+
"original_language": null,
|
423
|
+
"original_title": null,
|
424
|
+
"price": "3,73",
|
425
|
+
"availability": "Κυκλοφορεί",
|
426
|
+
"last_update": null,
|
427
|
+
"series": {
|
428
|
+
},
|
429
|
+
"physical_description": {
|
430
|
+
"pages": "71",
|
431
|
+
"size": "21x14",
|
432
|
+
"cover_type": "Μαλακό εξώφυλλο"
|
433
|
+
}
|
318
434
|
},
|
319
435
|
{
|
320
436
|
"title": "Επιχείρησις: Μαργαριτάρια",
|
@@ -329,15 +445,11 @@
|
|
329
445
|
"contributors": {
|
330
446
|
},
|
331
447
|
"publisher": {
|
332
|
-
"
|
448
|
+
"text": "Δωδώνη Εκδοτική ΕΠΕ",
|
333
449
|
"b_id": "1"
|
334
450
|
},
|
335
|
-
"publication_year": null,
|
336
|
-
"pages": "188",
|
337
451
|
"isbn": "960-248-541-8",
|
338
452
|
"isbn_13": "978-960-248-541-5",
|
339
|
-
"status": "Κυκλοφορεί",
|
340
|
-
"price": "10,60",
|
341
453
|
"award": [
|
342
454
|
|
343
455
|
],
|
@@ -349,7 +461,25 @@
|
|
349
461
|
"b_id": "1309"
|
350
462
|
}
|
351
463
|
],
|
352
|
-
"b_id": "33301"
|
464
|
+
"b_id": "33301",
|
465
|
+
"publication": {
|
466
|
+
"year": null,
|
467
|
+
"version": null,
|
468
|
+
"place": "Αθήνα"
|
469
|
+
},
|
470
|
+
"format": "Βιβλίο",
|
471
|
+
"original_language": null,
|
472
|
+
"original_title": null,
|
473
|
+
"price": "10,60",
|
474
|
+
"availability": "Κυκλοφορεί",
|
475
|
+
"last_update": null,
|
476
|
+
"series": {
|
477
|
+
},
|
478
|
+
"physical_description": {
|
479
|
+
"pages": "188",
|
480
|
+
"size": "20x13",
|
481
|
+
"cover_type": "Μαλακό εξώφυλλο"
|
482
|
+
}
|
353
483
|
}
|
354
484
|
]
|
355
|
-
}
|
485
|
+
}
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: bookshark
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.0.beta.5
|
4
|
+
version: 1.0.0.pre.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Dimitris Klisiaris
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-
|
11
|
+
date: 2015-05-03 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|
@@ -36,14 +36,14 @@ dependencies:
|
|
36
36
|
requirements:
|
37
37
|
- - "~>"
|
38
38
|
- !ruby/object:Gem::Version
|
39
|
-
version: '
|
39
|
+
version: '4.0'
|
40
40
|
type: :runtime
|
41
41
|
prerelease: false
|
42
42
|
version_requirements: !ruby/object:Gem::Requirement
|
43
43
|
requirements:
|
44
44
|
- - "~>"
|
45
45
|
- !ruby/object:Gem::Version
|
46
|
-
version: '
|
46
|
+
version: '4.0'
|
47
47
|
- !ruby/object:Gem::Dependency
|
48
48
|
name: json
|
49
49
|
requirement: !ruby/object:Gem::Requirement
|
@@ -137,6 +137,7 @@ files:
|
|
137
137
|
- lib/bookshark/crawlers/publisher_crawler.rb
|
138
138
|
- lib/bookshark/extractors/author_extractor.rb
|
139
139
|
- lib/bookshark/extractors/base.rb
|
140
|
+
- lib/bookshark/extractors/bibliographical_book_extractor.rb
|
140
141
|
- lib/bookshark/extractors/book_extractor.rb
|
141
142
|
- lib/bookshark/extractors/category_extractor.rb
|
142
143
|
- lib/bookshark/extractors/publisher_extractor.rb
|