bookshark 1.0.0.beta.5 → 1.0.0.pre.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: a592c6e055f501c19f4a7dea23bad79446b6b28c
4
- data.tar.gz: 797820d896b398e6294f9804a4a42f151682a027
3
+ metadata.gz: 7e2e0077beb0dadb4ef9df9fcce7b7d55d54e8a6
4
+ data.tar.gz: 712b2769781e8b1ea55ecb25a501d886a939e307
5
5
  SHA512:
6
- metadata.gz: 1f8f7b1b0b0f7312549964153afe8198aab00b0acabdb9693c5e9f7c8242e90a19357fb3507397df6cd5b3278cb1ceb22d2b45dfb4646851e698b5cf2bfeb96d
7
- data.tar.gz: 73e4f60098595c1e5b20101bf66c5d24d3cc2e2659c5776a7973635a9789149e5a6d2a2a4a69ff7af126314f25f9b2e31824fd131879f0b3b5939eff55462604
6
+ metadata.gz: a2348fa9b757abe44be69c0d21e5c61bc41e87eaae80d11bafff115201f40793d2724d6fe5f518c3c3f26308696febd4c772f8a7e989f085949ea6ab584c7264
7
+ data.tar.gz: 221cdadd0bcaf5df156cb668465b6a60e7471a5a650ddcce703a602a646dac19dee008a4de088ed037695289d7fa983792d1a5397ea40cfe843a39bf50155bb0
data/README.md CHANGED
@@ -13,7 +13,7 @@ The representation of bibliographic metadata in JSON is inspired by [BibJSON](ht
13
13
  Add this line to your application's Gemfile:
14
14
 
15
15
  ```ruby
16
- gem 'bookshark', "~> 1.0.0.beta"
16
+ gem 'bookshark', "~> 1.0.0.pre"
17
17
  ```
18
18
 
19
19
  And then execute:
@@ -145,29 +145,47 @@ The expected result of a book extraction is something like this:
145
145
  ]
146
146
  },
147
147
  "publisher": {
148
- "name": "Εκδοτικός Οίκος Α. Α. Λιβάνη",
148
+ "text": "Εκδοτικός Οίκος Α. Α. Λιβάνη",
149
149
  "b_id": "271"
150
150
  },
151
- "publication_year": "2006",
152
- "pages": "326",
153
151
  "isbn": "960-14-1157-7",
154
152
  "isbn_13": "978-960-14-1157-6",
155
- "status": "Κυκλοφορεί",
156
- "price": "16,31",
157
153
  "award": [
154
+
158
155
  ],
159
156
  "description": "Τι είναι πιο επικίνδυνο, ένα όπλο ή μια πισίνα; Τι κοινό έχουν οι δάσκαλοι με τους παλαιστές του σούμο;...",
160
157
  "category": [
161
158
  {
162
159
  "ddc": "330",
163
- "text": "Οικονομία",
160
+ "name": "Οικονομία",
164
161
  "b_id": "142"
165
162
  }
166
163
  ],
167
- "b_id": "103788"
164
+ "b_id": "103788",
165
+ "publication": {
166
+ "year": "2006",
167
+ "version": "1",
168
+ "place": "Αθήνα"
169
+ },
170
+ "format": "Βιβλίο",
171
+ "original_language": "αγγλικά",
172
+ "original_title": "Freakonomics",
173
+ "price": "16,31",
174
+ "availability": "Κυκλοφορεί",
175
+ "last_update": "27/1/2006",
176
+ "series": {
177
+ "name": "Οικονομία",
178
+ "volume": null
179
+ },
180
+ "physical_description": {
181
+ "pages": "326",
182
+ "size": "21x14",
183
+ "cover_type": "Μαλακό εξώφυλλο"
184
+ }
168
185
  }
169
186
  ]
170
187
  }
188
+
171
189
  ```
172
190
  Here is a [Book Sample](https://gist.github.com/dklisiaris/a6f3d6f37806186f3c79) extracted with eager option enabled.
173
191
 
data/bookshark.gemspec CHANGED
@@ -21,7 +21,7 @@ Gem::Specification.new do |spec|
21
21
  spec.require_paths = ["lib"]
22
22
 
23
23
  spec.add_dependency "nokogiri", "~> 1.6", ">= 1.6.6"
24
- spec.add_dependency "sanitize", "~> 3.1"
24
+ spec.add_dependency "sanitize", "~> 4.0"
25
25
  spec.add_dependency "json", "~> 1.8"
26
26
  spec.add_dependency "htmlentities", "~> 4.3"
27
27
 
data/lib/bookshark.rb CHANGED
@@ -4,6 +4,7 @@ require 'bookshark/storage/file_manager'
4
4
  require 'bookshark/extractors/author_extractor'
5
5
  require 'bookshark/extractors/category_extractor'
6
6
  require 'bookshark/extractors/book_extractor'
7
+ require 'bookshark/extractors/bibliographical_book_extractor'
7
8
  require 'bookshark/extractors/publisher_extractor'
8
9
  require 'bookshark/extractors/search'
9
10
 
@@ -91,6 +92,23 @@ module Bookshark
91
92
  return response
92
93
  end
93
94
 
95
+
96
+ # def bibliographical_book(options = {})
97
+ # bibliographical_book_extractor = Biblionet::Extractors::BibliographicalBookExtractor.new
98
+
99
+ # uri = "http://www.biblionet.gr/main.asp?page=results&Titlesid=#{options[:id]}"
100
+ # options[:format] ||= @format
101
+
102
+ # book = bibliographical_book_extractor.load_and_extract_book(uri)
103
+
104
+ # response = {}
105
+ # response[:book] = !book.nil? ? [book] : []
106
+ # response = change_format(response, options[:format])
107
+ # response = bibliographical_book_extractor.decode_text(response)
108
+ # end
109
+
110
+ # puts Bookshark::Extractor.new(format: 'pretty_json').bibliographical_book(id: 103788)
111
+
94
112
  def category(options = {})
95
113
  uri = process_options(options, __method__)
96
114
  options[:format] ||= @format
@@ -231,7 +249,7 @@ module Bookshark
231
249
  local_path = "html_book_pages/#{((id-1)/1000)}/book_#{id}.html"
232
250
  when 'category'
233
251
  url_method = 'index'
234
- local_path = "html_ddc_pages/#{((id-1)/1000)}/ddc_#{id}.html"
252
+ local_path = "html_ddc_pages/#{((id-1)/1000)}/ddc_#{id}.html"
235
253
  else
236
254
  puts "Called from unknown method. Probably its rspec."
237
255
  end
@@ -141,6 +141,10 @@ module Biblionet
141
141
  # +encoded_text+:: the text which contains encoded entities
142
142
  #
143
143
  def decode_text(encoded_text)
144
+ self.class.decode_text(encoded_text)
145
+ end
146
+
147
+ def self.decode_text(encoded_text)
144
148
  # encoded_text = File.read(encoded_file_path)
145
149
  coder = HTMLEntities.new
146
150
  coder.decode(encoded_text)
@@ -0,0 +1,172 @@
1
+ #!/bin/env ruby
2
+ # encoding: utf-8
3
+
4
+ require_relative 'base'
5
+
6
+
7
+ module Biblionet
8
+ module Extractors
9
+
10
+ class BibliographicalBookExtractor < Base
11
+ attr_reader :bibliographical_book
12
+
13
+ def initialize(uri=nil)
14
+ super(uri)
15
+ extract_bibliographical_book unless uri.nil? or @page.nil?
16
+ end
17
+
18
+ def load_and_extract_book(uri=nil)
19
+ load_page(uri)
20
+ extract_bibliographical_book unless uri.nil? or @page.nil?
21
+ end
22
+
23
+ def extract_bibliographical_book(biblionet_id=@biblionet_id, book_page=@page)
24
+ # log = Logger.new(File.new(File.dirname(__dir__).to_s + "/logs/book_parsing.log",'a+'))
25
+ log = Logger.new(STDOUT)
26
+
27
+ page = BibliographicalBookDataExtractor.new(book_page)
28
+
29
+ # End extraction if BookDataExtractor couldnt create a nodeset
30
+ return nil if page.nodeset.nil?
31
+
32
+ bibliographical_book_hash = Hash.new
33
+
34
+ extracted_details = page.details
35
+
36
+ bibliographical_book_hash[:original_language] = extracted_details[:original_language]
37
+ bibliographical_book_hash[:original_title] = extracted_details[:original_title]
38
+ bibliographical_book_hash[:last_update] = extracted_details[:last_update]
39
+ bibliographical_book_hash[:cover_type] = extracted_details[:cover_type]
40
+ bibliographical_book_hash[:availability] = extracted_details[:availability]
41
+ bibliographical_book_hash[:price] = extracted_details[:price]
42
+
43
+ bibliographical_book_hash[:series] = extracted_details[:series]
44
+ bibliographical_book_hash[:physical_size] = extracted_details[:physical_size]
45
+
46
+ bibliographical_book_hash[:format] = extracted_details[:format]
47
+
48
+ bibliographical_book_hash[:publisher] = extracted_details[:publisher]
49
+ bibliographical_book_hash[:publication] = extracted_details[:publication]
50
+
51
+ return @bibliographical_book = bibliographical_book_hash
52
+ end
53
+
54
+ end
55
+
56
+ class BibliographicalBookDataExtractor
57
+ attr_reader :nodeset
58
+
59
+ def initialize(document)
60
+ # No need to operate on whole page. Just on part containing the book.
61
+ content_re = /<!-- CONTENT START -->.*<!-- CONTENT END -->/m
62
+ if (content_re.match(document)).nil?
63
+ puts document
64
+ end
65
+ content = content_re.match(document)[0] unless (content_re.match(document)).nil?
66
+
67
+ # If content is nil, there is something wrong with the html, so return nil
68
+ if content.nil?
69
+ @nodeset = nil
70
+ else
71
+ @nodeset = Nokogiri::HTML(content)
72
+ end
73
+ end
74
+
75
+ def size
76
+ size_regex = /\d+x\d+/
77
+ end
78
+
79
+ def series
80
+ series_regex = /(?<=\()\p{Word}+( \p{Word}+)* · \d+(?=\))/
81
+ series_name_regex = /\p{Word}+( \p{Word}+)*(?= ·)/
82
+ series_volume_regex = /(?<=· )\d+/
83
+ end
84
+
85
+ def details
86
+ details_hash = {}
87
+ isbn_regex = /(?<= )\d+-\d+-\d+-\d+(?= |,)/
88
+ isbn_13_regex = /\d+-\d+-\d+-\d+-\d+/
89
+ last_update_regex = /\d{1,2}\/\d{1,2}\/\d{2,4}/
90
+ cover_type_regex = /(?<=\()\p{Word}+( \p{Word}+)?(?=\))/
91
+ availability_regex = /(?<=\[).+(?=\])/
92
+ price_regex = /(?<=€ )\d+,\d*/
93
+
94
+ @nodeset.xpath("//span[@class='small'][1]").inner_html.split('<br>').each do |detail|
95
+ detail = BibliographicalBookExtractor.decode_text(detail)
96
+
97
+ if detail.start_with? "Γλώσσα πρωτοτύπου:"
98
+ original_language = detail.gsub(/Γλώσσα πρωτοτύπου:/, "").strip
99
+ details_hash[:original_language] = original_language
100
+ elsif detail.start_with? "Τίτλος πρωτοτύπου:"
101
+ original_title = detail.gsub(/Τίτλος πρωτοτύπου:/, "").strip
102
+ details_hash[:original_title] = original_title
103
+ end
104
+
105
+ details_hash[:isbn] = detail[isbn_regex] if detail =~ isbn_regex
106
+
107
+ details_hash[:isbn_13] = detail[isbn_13_regex] if detail =~ isbn_13_regex
108
+
109
+ details_hash[:last_update] = detail[last_update_regex] if detail =~ last_update_regex
110
+
111
+ details_hash[:cover_type] = detail[cover_type_regex] if detail =~ cover_type_regex
112
+
113
+ details_hash[:availability] = detail[availability_regex] if detail =~ availability_regex
114
+
115
+ details_hash[:price] = detail[price_regex] if detail =~ price_regex
116
+
117
+ end
118
+
119
+ pre_details_text = @nodeset.xpath("//span[@class='small'][1]/preceding::text()").text
120
+ pre_details_text = BibliographicalBookExtractor.decode_text(pre_details_text)
121
+
122
+ series_regex = /(?<=\()\p{Word}+( \S)?( \p{Word}+( \S)?)* · \d+(?=\))/
123
+ series_regex_no_vol = /(?<=\()\p{Word}+( \S)?( \p{Word}+( \S)?)*(?=\))/
124
+ series_name_regex = /\p{Word}+( \S)?( \p{Word}+( \S)?)*(?= ·)/
125
+ series_volume_regex = /(?<=· )\d+/
126
+ physical_size_regex = /\d+x\d+/
127
+
128
+ series_hash = {}
129
+ if pre_details_text =~ series_regex
130
+ series = pre_details_text[series_regex]
131
+ series_hash[:name] = series[series_name_regex] if series =~ series_name_regex
132
+ series_hash[:volume] = series[series_volume_regex] if series =~ series_volume_regex
133
+ elsif pre_details_text =~ series_regex_no_vol
134
+ series = pre_details_text[series_regex_no_vol]
135
+ series_hash[:name] = series
136
+ series_hash[:volume] = nil
137
+ end
138
+
139
+ details_hash[:series] = series_hash
140
+
141
+ details_hash[:physical_size] = (pre_details_text =~ physical_size_regex) ? pre_details_text[physical_size_regex] : nil
142
+
143
+ format_regex = /(?<=\[).+(?=\])/
144
+
145
+ after_title_text = @nodeset.xpath("//a[@class='booklink' and @href[contains(.,'/book/') ]][1]").first.next_sibling.text.strip
146
+ format = after_title_text[format_regex] if after_title_text =~ format_regex
147
+
148
+ details_hash[:format] = format.nil? ? 'Βιβλίο' : format
149
+
150
+ publisher_node = @nodeset.xpath("//a[@class='booklink' and @href[contains(.,'/com/') ]][1]").first
151
+ publisher_hash = {}
152
+ publisher_hash[:text] = publisher_node.text
153
+ publisher_hash[:b_id] = (publisher_node[:href].split("/"))[2]
154
+
155
+ pre_publisher_text = BibliographicalBookExtractor.decode_text(publisher_node.previous_sibling.text)
156
+ after_publisher_text = BibliographicalBookExtractor.decode_text(publisher_node.next_sibling.text)
157
+
158
+ publication_hash = {}
159
+ publication_hash[:year] = after_publisher_text[/(?<=, )\d+(?=\.)/]
160
+ publication_hash[:version] = pre_details_text[/(?<=- )\d+(?=η)/]
161
+ publication_hash[:place] = pre_details_text[/(?<=- )\p{Word}+( \S)?( \p{Word}+( \S)?)*(?= :)/]
162
+
163
+ details_hash[:publisher] = publisher_hash
164
+ details_hash[:publication] = publication_hash
165
+
166
+ details_hash
167
+ end
168
+
169
+ end
170
+
171
+ end
172
+ end
@@ -2,6 +2,7 @@
2
2
  # encoding: utf-8
3
3
 
4
4
  require_relative 'base'
5
+ require_relative 'bibliographical_book_extractor'
5
6
  require 'sanitize'
6
7
 
7
8
  module Biblionet
@@ -169,12 +170,12 @@ module Biblionet
169
170
 
170
171
  details_hash = proccess_details(details)
171
172
 
172
- book_hash[:publication_year] = details_hash[:publication_year]
173
- book_hash[:pages] = details_hash[:pages]
173
+ # book_hash[:publication_year] = details_hash[:publication_year]
174
+ # book_hash[:pages] = details_hash[:pages]
174
175
  book_hash[:isbn] = details_hash[:isbn]
175
176
  book_hash[:isbn_13] = details_hash[:isbn_13].nil? ? nil : details_hash[:isbn_13]
176
- book_hash[:status] = details_hash[:status]
177
- book_hash[:price] = details_hash[:price]
177
+ # book_hash[:status] = details_hash[:status]
178
+ # book_hash[:price] = details_hash[:price]
178
179
  book_hash[:award] = page.awards
179
180
 
180
181
 
@@ -192,7 +193,34 @@ module Biblionet
192
193
 
193
194
 
194
195
  book_hash[:category] = ddcs
195
- book_hash[:b_id] = biblionet_id
196
+ book_hash[:b_id] = biblionet_id
197
+
198
+ uri = "http://www.biblionet.gr/main.asp?page=results&Titlesid=#{biblionet_id}"
199
+
200
+ bibliographical_book_extractor = Biblionet::Extractors::BibliographicalBookExtractor.new
201
+ bibliographical_details = bibliographical_book_extractor.load_and_extract_book(uri)
202
+
203
+ book_hash[:publisher] = bibliographical_details[:publisher]
204
+ book_hash[:publication] = bibliographical_details[:publication]
205
+
206
+ book_hash[:format] = bibliographical_details[:format]
207
+
208
+ book_hash[:original_language] = bibliographical_details[:original_language]
209
+ book_hash[:original_title] = bibliographical_details[:original_title]
210
+
211
+ book_hash[:price] = bibliographical_details[:price]
212
+ book_hash[:availability] = bibliographical_details[:availability]
213
+ book_hash[:last_update] = bibliographical_details[:last_update]
214
+
215
+ book_hash[:series] = bibliographical_details[:series]
216
+
217
+ physical_description_hash = {}
218
+ physical_description_hash[:pages] = details_hash[:pages]
219
+ physical_description_hash[:size] = bibliographical_details[:physical_size]
220
+ physical_description_hash[:cover_type] = bibliographical_details[:cover_type]
221
+
222
+ book_hash[:physical_description] = physical_description_hash
223
+
196
224
 
197
225
  return @book = book_hash
198
226
  end
@@ -99,6 +99,7 @@ module Biblionet
99
99
  headquarters_hash = Hash[headquarters_hash.map {|k, v| [mappings[k], v] }]
100
100
  headquarters_hash[:telephone] = [headquarters_hash[:telephone]] unless headquarters_hash[:telephone].kind_of?(Array) or headquarters_hash[:telephone].nil?
101
101
  headquarters_hash[:website] = headquarters_hash[:website].split(',').map(&:strip) if (headquarters_hash[:website] and headquarters_hash[:website].include? ',')
102
+ headquarters_hash[:address] = [headquarters_hash[:address]] unless headquarters_hash[:address].kind_of?(Array)
102
103
 
103
104
  return headquarters_hash
104
105
  end
@@ -1,3 +1,3 @@
1
1
  module Bookshark
2
- VERSION = "1.0.0.beta.5"
2
+ VERSION = "1.0.0.pre.1"
3
3
  end
@@ -206,4 +206,8 @@ describe Bookshark::Extractor do
206
206
  end
207
207
  end
208
208
 
209
+ describe 'Biblionet::Extractors::BibliographicalBook' do
210
+
211
+ end
212
+
209
213
  end
@@ -23,15 +23,11 @@
23
23
  ]
24
24
  },
25
25
  "publisher": {
26
- "name": "Εκδοτικός Οίκος Α. Α. Λιβάνη",
26
+ "text": "Εκδοτικός Οίκος Α. Α. Λιβάνη",
27
27
  "b_id": "271"
28
28
  },
29
- "publication_year": "2006",
30
- "pages": "326",
31
29
  "isbn": "960-14-1157-7",
32
30
  "isbn_13": "978-960-14-1157-6",
33
- "status": "Κυκλοφορεί",
34
- "price": "16,31",
35
31
  "award": [
36
32
 
37
33
  ],
@@ -43,7 +39,27 @@
43
39
  "b_id": "142"
44
40
  }
45
41
  ],
46
- "b_id": "103788"
42
+ "b_id": "103788",
43
+ "publication": {
44
+ "year": "2006",
45
+ "version": "1",
46
+ "place": "Αθήνα"
47
+ },
48
+ "format": "Βιβλίο",
49
+ "original_language": "αγγλικά",
50
+ "original_title": "Freakonomics",
51
+ "price": "16,31",
52
+ "availability": "Κυκλοφορεί",
53
+ "last_update": "27/1/2006",
54
+ "series": {
55
+ "name": "Οικονομία",
56
+ "volume": null
57
+ },
58
+ "physical_description": {
59
+ "pages": "326",
60
+ "size": "21x14",
61
+ "cover_type": "Μαλακό εξώφυλλο"
62
+ }
47
63
  }
48
64
  ]
49
65
  }
@@ -71,12 +71,8 @@
71
71
  "b_id": "112"
72
72
  }
73
73
  ],
74
- "publication_year": "2012",
75
- "pages": "345",
76
74
  "isbn": "978-960-524-394-4",
77
75
  "isbn_13": null,
78
- "status": "Κυκλοφορεί",
79
- "price": "16,00",
80
76
  "award": [
81
77
 
82
78
  ],
@@ -209,7 +205,27 @@
209
205
  }
210
206
  }
211
207
  ],
212
- "b_id": "184923"
208
+ "b_id": "184923",
209
+ "publication": {
210
+ "year": "2012",
211
+ "version": "1",
212
+ "place": "Ηράκλειο Κρήτης"
213
+ },
214
+ "format": "Βιβλίο",
215
+ "original_language": null,
216
+ "original_title": null,
217
+ "price": "16,00",
218
+ "availability": "Κυκλοφορεί",
219
+ "last_update": "12/12/2012",
220
+ "series": {
221
+ "name": "Εκλαΐκευση της Επιστήμης",
222
+ "volume": null
223
+ },
224
+ "physical_description": {
225
+ "pages": "345",
226
+ "size": "24x17",
227
+ "cover_type": "Μαλακό εξώφυλλο"
228
+ }
213
229
  }
214
230
  ]
215
- }
231
+ }
@@ -13,15 +13,11 @@
13
13
  "contributors": {
14
14
  },
15
15
  "publisher": {
16
- "name": "Λογοσοφία",
16
+ "text": "Λογοσοφία",
17
17
  "b_id": "7628"
18
18
  },
19
- "publication_year": "2007",
20
- "pages": "181",
21
19
  "isbn": "978-960-89288-3-1",
22
20
  "isbn_13": null,
23
- "status": "Κυκλοφορεί",
24
- "price": "13,52",
25
21
  "award": [
26
22
 
27
23
  ],
@@ -33,7 +29,25 @@
33
29
  "b_id": "3"
34
30
  }
35
31
  ],
36
- "b_id": "119000"
32
+ "b_id": "119000",
33
+ "publication": {
34
+ "year": "2007",
35
+ "version": "1",
36
+ "place": "Αθήνα"
37
+ },
38
+ "format": "Βιβλίο",
39
+ "original_language": null,
40
+ "original_title": null,
41
+ "price": "13,52",
42
+ "availability": "Κυκλοφορεί",
43
+ "last_update": "9/5/2007",
44
+ "series": {
45
+ },
46
+ "physical_description": {
47
+ "pages": "181",
48
+ "size": "21x14",
49
+ "cover_type": "Μαλακό εξώφυλλο"
50
+ }
37
51
  },
38
52
  {
39
53
  "title": "Σημεία και τέρατα της οικονομίας",
@@ -58,15 +72,11 @@
58
72
  ]
59
73
  },
60
74
  "publisher": {
61
- "name": "Εκδοτικός Οίκος Α. Α. Λιβάνη",
75
+ "text": "Εκδοτικός Οίκος Α. Α. Λιβάνη",
62
76
  "b_id": "271"
63
77
  },
64
- "publication_year": "2006",
65
- "pages": "326",
66
78
  "isbn": "960-14-1157-7",
67
79
  "isbn_13": "978-960-14-1157-6",
68
- "status": "Κυκλοφορεί",
69
- "price": "16,31",
70
80
  "award": [
71
81
 
72
82
  ],
@@ -78,7 +88,27 @@
78
88
  "b_id": "142"
79
89
  }
80
90
  ],
81
- "b_id": "103788"
91
+ "b_id": "103788",
92
+ "publication": {
93
+ "year": "2006",
94
+ "version": "1",
95
+ "place": "Αθήνα"
96
+ },
97
+ "format": "Βιβλίο",
98
+ "original_language": "αγγλικά",
99
+ "original_title": "Freakonomics",
100
+ "price": "16,31",
101
+ "availability": "Κυκλοφορεί",
102
+ "last_update": "27/1/2006",
103
+ "series": {
104
+ "name": "Οικονομία",
105
+ "volume": null
106
+ },
107
+ "physical_description": {
108
+ "pages": "326",
109
+ "size": "21x14",
110
+ "cover_type": "Μαλακό εξώφυλλο"
111
+ }
82
112
  },
83
113
  {
84
114
  "title": "Και άλλα σημεία και τέρατα από την ιστορία",
@@ -103,15 +133,11 @@
103
133
  ]
104
134
  },
105
135
  "publisher": {
106
- "name": "Modern Times",
136
+ "text": "Modern Times",
107
137
  "b_id": "191"
108
138
  },
109
- "publication_year": "2004",
110
- "pages": "62",
111
139
  "isbn": "960-397-927-9",
112
140
  "isbn_13": "978-960-397-927-2",
113
- "status": "Κυκλοφορεί",
114
- "price": "15,14",
115
141
  "award": [
116
142
 
117
143
  ],
@@ -123,7 +149,25 @@
123
149
  "b_id": "2456"
124
150
  }
125
151
  ],
126
- "b_id": "87815"
152
+ "b_id": "87815",
153
+ "publication": {
154
+ "year": "2004",
155
+ "version": null,
156
+ "place": "Αθήνα"
157
+ },
158
+ "format": "Βιβλίο",
159
+ "original_language": "αγγλικά",
160
+ "original_title": "Even more horrible history",
161
+ "price": "15,14",
162
+ "availability": "Κυκλοφορεί",
163
+ "last_update": null,
164
+ "series": {
165
+ },
166
+ "physical_description": {
167
+ "pages": "62",
168
+ "size": "28x22",
169
+ "cover_type": "Σκληρό εξώφυλλο"
170
+ }
127
171
  },
128
172
  {
129
173
  "title": "Σημεία και τέρατα από την ιστορία",
@@ -148,15 +192,11 @@
148
192
  ]
149
193
  },
150
194
  "publisher": {
151
- "name": "Modern Times",
195
+ "text": "Modern Times",
152
196
  "b_id": "191"
153
197
  },
154
- "publication_year": "2004",
155
- "pages": "78",
156
198
  "isbn": "960-397-926-0",
157
199
  "isbn_13": "978-960-397-926-5",
158
- "status": "Κυκλοφορεί",
159
- "price": "15,14",
160
200
  "award": [
161
201
 
162
202
  ],
@@ -168,7 +208,25 @@
168
208
  "b_id": "2456"
169
209
  }
170
210
  ],
171
- "b_id": "87812"
211
+ "b_id": "87812",
212
+ "publication": {
213
+ "year": "2004",
214
+ "version": null,
215
+ "place": "Αθήνα"
216
+ },
217
+ "format": "Βιβλίο",
218
+ "original_language": "αγγλικά",
219
+ "original_title": "Horrible history",
220
+ "price": "15,14",
221
+ "availability": "Κυκλοφορεί",
222
+ "last_update": null,
223
+ "series": {
224
+ },
225
+ "physical_description": {
226
+ "pages": "78",
227
+ "size": "28x22",
228
+ "cover_type": "Σκληρό εξώφυλλο"
229
+ }
172
230
  },
173
231
  {
174
232
  "title": "Σημεία και τέρατα",
@@ -183,15 +241,11 @@
183
241
  "contributors": {
184
242
  },
185
243
  "publisher": {
186
- "name": "Κέδρος",
244
+ "text": "Κέδρος",
187
245
  "b_id": "21"
188
246
  },
189
- "publication_year": "1994",
190
- "pages": "126",
191
247
  "isbn": "960-04-0941-2",
192
248
  "isbn_13": "978-960-04-0941-3",
193
- "status": "Κυκλοφορεί",
194
- "price": "9,17",
195
249
  "award": [
196
250
 
197
251
  ],
@@ -203,7 +257,25 @@
203
257
  "b_id": "9"
204
258
  }
205
259
  ],
206
- "b_id": "15839"
260
+ "b_id": "15839",
261
+ "publication": {
262
+ "year": "1994",
263
+ "version": "2",
264
+ "place": "Αθήνα"
265
+ },
266
+ "format": "Βιβλίο",
267
+ "original_language": null,
268
+ "original_title": null,
269
+ "price": "9,17",
270
+ "availability": "Κυκλοφορεί",
271
+ "last_update": "28/7/2010",
272
+ "series": {
273
+ },
274
+ "physical_description": {
275
+ "pages": "126",
276
+ "size": "21x14",
277
+ "cover_type": "Μαλακό εξώφυλλο"
278
+ }
207
279
  },
208
280
  {
209
281
  "title": "Επίσημη αγραμματοσύνη",
@@ -218,15 +290,11 @@
218
290
  "contributors": {
219
291
  },
220
292
  "publisher": {
221
- "name": "Περιοδικό Πνευματική Ζωή",
293
+ "text": "Περιοδικό Πνευματική Ζωή",
222
294
  "b_id": "6770"
223
295
  },
224
- "publication_year": null,
225
- "pages": "48",
226
296
  "isbn": null,
227
297
  "isbn_13": null,
228
- "status": "Κυκλοφορεί",
229
- "price": null,
230
298
  "award": [
231
299
 
232
300
  ],
@@ -238,7 +306,27 @@
238
306
  "b_id": "1459"
239
307
  }
240
308
  ],
241
- "b_id": "77381"
309
+ "b_id": "77381",
310
+ "publication": {
311
+ "year": null,
312
+ "version": null,
313
+ "place": "Αθήνα"
314
+ },
315
+ "format": "Βιβλίο",
316
+ "original_language": null,
317
+ "original_title": null,
318
+ "price": null,
319
+ "availability": "Κυκλοφορεί",
320
+ "last_update": null,
321
+ "series": {
322
+ "name": "Νεοελληνικά Αφιερώματα",
323
+ "volume": "21"
324
+ },
325
+ "physical_description": {
326
+ "pages": "48",
327
+ "size": "24x17",
328
+ "cover_type": "Μαλακό εξώφυλλο"
329
+ }
242
330
  },
243
331
  {
244
332
  "title": "Ο όλεθρος της πυραμίδος Καμπαλά",
@@ -259,15 +347,11 @@
259
347
  ]
260
348
  },
261
349
  "publisher": {
262
- "name": "Μπίμπης Στερέωμα",
350
+ "text": "Μπίμπης Στερέωμα",
263
351
  "b_id": "244"
264
352
  },
265
- "publication_year": null,
266
- "pages": "269",
267
353
  "isbn": null,
268
354
  "isbn_13": null,
269
- "status": "Κυκλοφορεί",
270
- "price": "6,85",
271
355
  "award": [
272
356
 
273
357
  ],
@@ -279,7 +363,25 @@
279
363
  "b_id": "179"
280
364
  }
281
365
  ],
282
- "b_id": "46856"
366
+ "b_id": "46856",
367
+ "publication": {
368
+ "year": null,
369
+ "version": null,
370
+ "place": "Θεσσαλονίκη"
371
+ },
372
+ "format": "Βιβλίο",
373
+ "original_language": null,
374
+ "original_title": null,
375
+ "price": "6,85",
376
+ "availability": "Κυκλοφορεί",
377
+ "last_update": null,
378
+ "series": {
379
+ },
380
+ "physical_description": {
381
+ "pages": "269",
382
+ "size": "21x14",
383
+ "cover_type": "Μαλακό εξώφυλλο"
384
+ }
283
385
  },
284
386
  {
285
387
  "title": "Η επανάσταση κατά της νέας τάξης",
@@ -294,15 +396,11 @@
294
396
  "contributors": {
295
397
  },
296
398
  "publisher": {
297
- "name": "Μπίμπης Στερέωμα",
399
+ "text": "Μπίμπης Στερέωμα",
298
400
  "b_id": "244"
299
401
  },
300
- "publication_year": null,
301
- "pages": "71",
302
402
  "isbn": null,
303
403
  "isbn_13": null,
304
- "status": "Κυκλοφορεί",
305
- "price": "3,73",
306
404
  "award": [
307
405
 
308
406
  ],
@@ -314,7 +412,25 @@
314
412
  "b_id": "2336"
315
413
  }
316
414
  ],
317
- "b_id": "46763"
415
+ "b_id": "46763",
416
+ "publication": {
417
+ "year": null,
418
+ "version": null,
419
+ "place": "Θεσσαλονίκη"
420
+ },
421
+ "format": "Βιβλίο",
422
+ "original_language": null,
423
+ "original_title": null,
424
+ "price": "3,73",
425
+ "availability": "Κυκλοφορεί",
426
+ "last_update": null,
427
+ "series": {
428
+ },
429
+ "physical_description": {
430
+ "pages": "71",
431
+ "size": "21x14",
432
+ "cover_type": "Μαλακό εξώφυλλο"
433
+ }
318
434
  },
319
435
  {
320
436
  "title": "Επιχείρησις: Μαργαριτάρια",
@@ -329,15 +445,11 @@
329
445
  "contributors": {
330
446
  },
331
447
  "publisher": {
332
- "name": "Δωδώνη Εκδοτική ΕΠΕ",
448
+ "text": "Δωδώνη Εκδοτική ΕΠΕ",
333
449
  "b_id": "1"
334
450
  },
335
- "publication_year": null,
336
- "pages": "188",
337
451
  "isbn": "960-248-541-8",
338
452
  "isbn_13": "978-960-248-541-5",
339
- "status": "Κυκλοφορεί",
340
- "price": "10,60",
341
453
  "award": [
342
454
 
343
455
  ],
@@ -349,7 +461,25 @@
349
461
  "b_id": "1309"
350
462
  }
351
463
  ],
352
- "b_id": "33301"
464
+ "b_id": "33301",
465
+ "publication": {
466
+ "year": null,
467
+ "version": null,
468
+ "place": "Αθήνα"
469
+ },
470
+ "format": "Βιβλίο",
471
+ "original_language": null,
472
+ "original_title": null,
473
+ "price": "10,60",
474
+ "availability": "Κυκλοφορεί",
475
+ "last_update": null,
476
+ "series": {
477
+ },
478
+ "physical_description": {
479
+ "pages": "188",
480
+ "size": "20x13",
481
+ "cover_type": "Μαλακό εξώφυλλο"
482
+ }
353
483
  }
354
484
  ]
355
- }
485
+ }
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: bookshark
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.0.beta.5
4
+ version: 1.0.0.pre.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Dimitris Klisiaris
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-04-18 00:00:00.000000000 Z
11
+ date: 2015-05-03 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: nokogiri
@@ -36,14 +36,14 @@ dependencies:
36
36
  requirements:
37
37
  - - "~>"
38
38
  - !ruby/object:Gem::Version
39
- version: '3.1'
39
+ version: '4.0'
40
40
  type: :runtime
41
41
  prerelease: false
42
42
  version_requirements: !ruby/object:Gem::Requirement
43
43
  requirements:
44
44
  - - "~>"
45
45
  - !ruby/object:Gem::Version
46
- version: '3.1'
46
+ version: '4.0'
47
47
  - !ruby/object:Gem::Dependency
48
48
  name: json
49
49
  requirement: !ruby/object:Gem::Requirement
@@ -137,6 +137,7 @@ files:
137
137
  - lib/bookshark/crawlers/publisher_crawler.rb
138
138
  - lib/bookshark/extractors/author_extractor.rb
139
139
  - lib/bookshark/extractors/base.rb
140
+ - lib/bookshark/extractors/bibliographical_book_extractor.rb
140
141
  - lib/bookshark/extractors/book_extractor.rb
141
142
  - lib/bookshark/extractors/category_extractor.rb
142
143
  - lib/bookshark/extractors/publisher_extractor.rb