bookshark 1.0.0.beta.5 → 1.0.0.pre.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: a592c6e055f501c19f4a7dea23bad79446b6b28c
4
- data.tar.gz: 797820d896b398e6294f9804a4a42f151682a027
3
+ metadata.gz: 7e2e0077beb0dadb4ef9df9fcce7b7d55d54e8a6
4
+ data.tar.gz: 712b2769781e8b1ea55ecb25a501d886a939e307
5
5
  SHA512:
6
- metadata.gz: 1f8f7b1b0b0f7312549964153afe8198aab00b0acabdb9693c5e9f7c8242e90a19357fb3507397df6cd5b3278cb1ceb22d2b45dfb4646851e698b5cf2bfeb96d
7
- data.tar.gz: 73e4f60098595c1e5b20101bf66c5d24d3cc2e2659c5776a7973635a9789149e5a6d2a2a4a69ff7af126314f25f9b2e31824fd131879f0b3b5939eff55462604
6
+ metadata.gz: a2348fa9b757abe44be69c0d21e5c61bc41e87eaae80d11bafff115201f40793d2724d6fe5f518c3c3f26308696febd4c772f8a7e989f085949ea6ab584c7264
7
+ data.tar.gz: 221cdadd0bcaf5df156cb668465b6a60e7471a5a650ddcce703a602a646dac19dee008a4de088ed037695289d7fa983792d1a5397ea40cfe843a39bf50155bb0
data/README.md CHANGED
@@ -13,7 +13,7 @@ The representation of bibliographic metadata in JSON is inspired by [BibJSON](ht
13
13
  Add this line to your application's Gemfile:
14
14
 
15
15
  ```ruby
16
- gem 'bookshark', "~> 1.0.0.beta"
16
+ gem 'bookshark', "~> 1.0.0.pre"
17
17
  ```
18
18
 
19
19
  And then execute:
@@ -145,29 +145,47 @@ The expected result of a book extraction is something like this:
145
145
  ]
146
146
  },
147
147
  "publisher": {
148
- "name": "Εκδοτικός Οίκος Α. Α. Λιβάνη",
148
+ "text": "Εκδοτικός Οίκος Α. Α. Λιβάνη",
149
149
  "b_id": "271"
150
150
  },
151
- "publication_year": "2006",
152
- "pages": "326",
153
151
  "isbn": "960-14-1157-7",
154
152
  "isbn_13": "978-960-14-1157-6",
155
- "status": "Κυκλοφορεί",
156
- "price": "16,31",
157
153
  "award": [
154
+
158
155
  ],
159
156
  "description": "Τι είναι πιο επικίνδυνο, ένα όπλο ή μια πισίνα; Τι κοινό έχουν οι δάσκαλοι με τους παλαιστές του σούμο;...",
160
157
  "category": [
161
158
  {
162
159
  "ddc": "330",
163
- "text": "Οικονομία",
160
+ "name": "Οικονομία",
164
161
  "b_id": "142"
165
162
  }
166
163
  ],
167
- "b_id": "103788"
164
+ "b_id": "103788",
165
+ "publication": {
166
+ "year": "2006",
167
+ "version": "1",
168
+ "place": "Αθήνα"
169
+ },
170
+ "format": "Βιβλίο",
171
+ "original_language": "αγγλικά",
172
+ "original_title": "Freakonomics",
173
+ "price": "16,31",
174
+ "availability": "Κυκλοφορεί",
175
+ "last_update": "27/1/2006",
176
+ "series": {
177
+ "name": "Οικονομία",
178
+ "volume": null
179
+ },
180
+ "physical_description": {
181
+ "pages": "326",
182
+ "size": "21x14",
183
+ "cover_type": "Μαλακό εξώφυλλο"
184
+ }
168
185
  }
169
186
  ]
170
187
  }
188
+
171
189
  ```
172
190
  Here is a [Book Sample](https://gist.github.com/dklisiaris/a6f3d6f37806186f3c79) extracted with eager option enabled.
173
191
 
data/bookshark.gemspec CHANGED
@@ -21,7 +21,7 @@ Gem::Specification.new do |spec|
21
21
  spec.require_paths = ["lib"]
22
22
 
23
23
  spec.add_dependency "nokogiri", "~> 1.6", ">= 1.6.6"
24
- spec.add_dependency "sanitize", "~> 3.1"
24
+ spec.add_dependency "sanitize", "~> 4.0"
25
25
  spec.add_dependency "json", "~> 1.8"
26
26
  spec.add_dependency "htmlentities", "~> 4.3"
27
27
 
data/lib/bookshark.rb CHANGED
@@ -4,6 +4,7 @@ require 'bookshark/storage/file_manager'
4
4
  require 'bookshark/extractors/author_extractor'
5
5
  require 'bookshark/extractors/category_extractor'
6
6
  require 'bookshark/extractors/book_extractor'
7
+ require 'bookshark/extractors/bibliographical_book_extractor'
7
8
  require 'bookshark/extractors/publisher_extractor'
8
9
  require 'bookshark/extractors/search'
9
10
 
@@ -91,6 +92,23 @@ module Bookshark
91
92
  return response
92
93
  end
93
94
 
95
+
96
+ # def bibliographical_book(options = {})
97
+ # bibliographical_book_extractor = Biblionet::Extractors::BibliographicalBookExtractor.new
98
+
99
+ # uri = "http://www.biblionet.gr/main.asp?page=results&Titlesid=#{options[:id]}"
100
+ # options[:format] ||= @format
101
+
102
+ # book = bibliographical_book_extractor.load_and_extract_book(uri)
103
+
104
+ # response = {}
105
+ # response[:book] = !book.nil? ? [book] : []
106
+ # response = change_format(response, options[:format])
107
+ # response = bibliographical_book_extractor.decode_text(response)
108
+ # end
109
+
110
+ # puts Bookshark::Extractor.new(format: 'pretty_json').bibliographical_book(id: 103788)
111
+
94
112
  def category(options = {})
95
113
  uri = process_options(options, __method__)
96
114
  options[:format] ||= @format
@@ -231,7 +249,7 @@ module Bookshark
231
249
  local_path = "html_book_pages/#{((id-1)/1000)}/book_#{id}.html"
232
250
  when 'category'
233
251
  url_method = 'index'
234
- local_path = "html_ddc_pages/#{((id-1)/1000)}/ddc_#{id}.html"
252
+ local_path = "html_ddc_pages/#{((id-1)/1000)}/ddc_#{id}.html"
235
253
  else
236
254
  puts "Called from unknown method. Probably its rspec."
237
255
  end
@@ -141,6 +141,10 @@ module Biblionet
141
141
  # +encoded_text+:: the text which contains encoded entities
142
142
  #
143
143
  def decode_text(encoded_text)
144
+ self.class.decode_text(encoded_text)
145
+ end
146
+
147
+ def self.decode_text(encoded_text)
144
148
  # encoded_text = File.read(encoded_file_path)
145
149
  coder = HTMLEntities.new
146
150
  coder.decode(encoded_text)
@@ -0,0 +1,172 @@
1
+ #!/bin/env ruby
2
+ # encoding: utf-8
3
+
4
+ require_relative 'base'
5
+
6
+
7
+ module Biblionet
8
+ module Extractors
9
+
10
+ class BibliographicalBookExtractor < Base
11
+ attr_reader :bibliographical_book
12
+
13
+ def initialize(uri=nil)
14
+ super(uri)
15
+ extract_bibliographical_book unless uri.nil? or @page.nil?
16
+ end
17
+
18
+ def load_and_extract_book(uri=nil)
19
+ load_page(uri)
20
+ extract_bibliographical_book unless uri.nil? or @page.nil?
21
+ end
22
+
23
+ def extract_bibliographical_book(biblionet_id=@biblionet_id, book_page=@page)
24
+ # log = Logger.new(File.new(File.dirname(__dir__).to_s + "/logs/book_parsing.log",'a+'))
25
+ log = Logger.new(STDOUT)
26
+
27
+ page = BibliographicalBookDataExtractor.new(book_page)
28
+
29
+ # End extraction if BibliographicalBookDataExtractor couldn't create a nodeset
30
+ return nil if page.nodeset.nil?
31
+
32
+ bibliographical_book_hash = Hash.new
33
+
34
+ extracted_details = page.details
35
+
36
+ bibliographical_book_hash[:original_language] = extracted_details[:original_language]
37
+ bibliographical_book_hash[:original_title] = extracted_details[:original_title]
38
+ bibliographical_book_hash[:last_update] = extracted_details[:last_update]
39
+ bibliographical_book_hash[:cover_type] = extracted_details[:cover_type]
40
+ bibliographical_book_hash[:availability] = extracted_details[:availability]
41
+ bibliographical_book_hash[:price] = extracted_details[:price]
42
+
43
+ bibliographical_book_hash[:series] = extracted_details[:series]
44
+ bibliographical_book_hash[:physical_size] = extracted_details[:physical_size]
45
+
46
+ bibliographical_book_hash[:format] = extracted_details[:format]
47
+
48
+ bibliographical_book_hash[:publisher] = extracted_details[:publisher]
49
+ bibliographical_book_hash[:publication] = extracted_details[:publication]
50
+
51
+ return @bibliographical_book = bibliographical_book_hash
52
+ end
53
+
54
+ end
55
+
56
+ class BibliographicalBookDataExtractor
57
+ attr_reader :nodeset
58
+
59
+ def initialize(document)
60
+ # No need to operate on the whole page, only on the part containing the book.
61
+ content_re = /<!-- CONTENT START -->.*<!-- CONTENT END -->/m
62
+ if (content_re.match(document)).nil?
63
+ puts document
64
+ end
65
+ content = content_re.match(document)[0] unless (content_re.match(document)).nil?
66
+
67
+ # If content is nil, there is something wrong with the HTML, so set the nodeset to nil
68
+ if content.nil?
69
+ @nodeset = nil
70
+ else
71
+ @nodeset = Nokogiri::HTML(content)
72
+ end
73
+ end
74
+
75
+ def size
76
+ size_regex = /\d+x\d+/
77
+ end
78
+
79
+ def series
80
+ series_regex = /(?<=\()\p{Word}+( \p{Word}+)* · \d+(?=\))/
81
+ series_name_regex = /\p{Word}+( \p{Word}+)*(?= ·)/
82
+ series_volume_regex = /(?<=· )\d+/
83
+ end
84
+
85
+ def details
86
+ details_hash = {}
87
+ isbn_regex = /(?<= )\d+-\d+-\d+-\d+(?= |,)/
88
+ isbn_13_regex = /\d+-\d+-\d+-\d+-\d+/
89
+ last_update_regex = /\d{1,2}\/\d{1,2}\/\d{2,4}/
90
+ cover_type_regex = /(?<=\()\p{Word}+( \p{Word}+)?(?=\))/
91
+ availability_regex = /(?<=\[).+(?=\])/
92
+ price_regex = /(?<=€ )\d+,\d*/
93
+
94
+ @nodeset.xpath("//span[@class='small'][1]").inner_html.split('<br>').each do |detail|
95
+ detail = BibliographicalBookExtractor.decode_text(detail)
96
+
97
+ if detail.start_with? "Γλώσσα πρωτοτύπου:"
98
+ original_language = detail.gsub(/Γλώσσα πρωτοτύπου:/, "").strip
99
+ details_hash[:original_language] = original_language
100
+ elsif detail.start_with? "Τίτλος πρωτοτύπου:"
101
+ original_title = detail.gsub(/Τίτλος πρωτοτύπου:/, "").strip
102
+ details_hash[:original_title] = original_title
103
+ end
104
+
105
+ details_hash[:isbn] = detail[isbn_regex] if detail =~ isbn_regex
106
+
107
+ details_hash[:isbn_13] = detail[isbn_13_regex] if detail =~ isbn_13_regex
108
+
109
+ details_hash[:last_update] = detail[last_update_regex] if detail =~ last_update_regex
110
+
111
+ details_hash[:cover_type] = detail[cover_type_regex] if detail =~ cover_type_regex
112
+
113
+ details_hash[:availability] = detail[availability_regex] if detail =~ availability_regex
114
+
115
+ details_hash[:price] = detail[price_regex] if detail =~ price_regex
116
+
117
+ end
118
+
119
+ pre_details_text = @nodeset.xpath("//span[@class='small'][1]/preceding::text()").text
120
+ pre_details_text = BibliographicalBookExtractor.decode_text(pre_details_text)
121
+
122
+ series_regex = /(?<=\()\p{Word}+( \S)?( \p{Word}+( \S)?)* · \d+(?=\))/
123
+ series_regex_no_vol = /(?<=\()\p{Word}+( \S)?( \p{Word}+( \S)?)*(?=\))/
124
+ series_name_regex = /\p{Word}+( \S)?( \p{Word}+( \S)?)*(?= ·)/
125
+ series_volume_regex = /(?<=· )\d+/
126
+ physical_size_regex = /\d+x\d+/
127
+
128
+ series_hash = {}
129
+ if pre_details_text =~ series_regex
130
+ series = pre_details_text[series_regex]
131
+ series_hash[:name] = series[series_name_regex] if series =~ series_name_regex
132
+ series_hash[:volume] = series[series_volume_regex] if series =~ series_volume_regex
133
+ elsif pre_details_text =~ series_regex_no_vol
134
+ series = pre_details_text[series_regex_no_vol]
135
+ series_hash[:name] = series
136
+ series_hash[:volume] = nil
137
+ end
138
+
139
+ details_hash[:series] = series_hash
140
+
141
+ details_hash[:physical_size] = (pre_details_text =~ physical_size_regex) ? pre_details_text[physical_size_regex] : nil
142
+
143
+ format_regex = /(?<=\[).+(?=\])/
144
+
145
+ after_title_text = @nodeset.xpath("//a[@class='booklink' and @href[contains(.,'/book/') ]][1]").first.next_sibling.text.strip
146
+ format = after_title_text[format_regex] if after_title_text =~ format_regex
147
+
148
+ details_hash[:format] = format.nil? ? 'Βιβλίο' : format
149
+
150
+ publisher_node = @nodeset.xpath("//a[@class='booklink' and @href[contains(.,'/com/') ]][1]").first
151
+ publisher_hash = {}
152
+ publisher_hash[:text] = publisher_node.text
153
+ publisher_hash[:b_id] = (publisher_node[:href].split("/"))[2]
154
+
155
+ pre_publisher_text = BibliographicalBookExtractor.decode_text(publisher_node.previous_sibling.text)
156
+ after_publisher_text = BibliographicalBookExtractor.decode_text(publisher_node.next_sibling.text)
157
+
158
+ publication_hash = {}
159
+ publication_hash[:year] = after_publisher_text[/(?<=, )\d+(?=\.)/]
160
+ publication_hash[:version] = pre_details_text[/(?<=- )\d+(?=η)/]
161
+ publication_hash[:place] = pre_details_text[/(?<=- )\p{Word}+( \S)?( \p{Word}+( \S)?)*(?= :)/]
162
+
163
+ details_hash[:publisher] = publisher_hash
164
+ details_hash[:publication] = publication_hash
165
+
166
+ details_hash
167
+ end
168
+
169
+ end
170
+
171
+ end
172
+ end
@@ -2,6 +2,7 @@
2
2
  # encoding: utf-8
3
3
 
4
4
  require_relative 'base'
5
+ require_relative 'bibliographical_book_extractor'
5
6
  require 'sanitize'
6
7
 
7
8
  module Biblionet
@@ -169,12 +170,12 @@ module Biblionet
169
170
 
170
171
  details_hash = proccess_details(details)
171
172
 
172
- book_hash[:publication_year] = details_hash[:publication_year]
173
- book_hash[:pages] = details_hash[:pages]
173
+ # book_hash[:publication_year] = details_hash[:publication_year]
174
+ # book_hash[:pages] = details_hash[:pages]
174
175
  book_hash[:isbn] = details_hash[:isbn]
175
176
  book_hash[:isbn_13] = details_hash[:isbn_13].nil? ? nil : details_hash[:isbn_13]
176
- book_hash[:status] = details_hash[:status]
177
- book_hash[:price] = details_hash[:price]
177
+ # book_hash[:status] = details_hash[:status]
178
+ # book_hash[:price] = details_hash[:price]
178
179
  book_hash[:award] = page.awards
179
180
 
180
181
 
@@ -192,7 +193,34 @@ module Biblionet
192
193
 
193
194
 
194
195
  book_hash[:category] = ddcs
195
- book_hash[:b_id] = biblionet_id
196
+ book_hash[:b_id] = biblionet_id
197
+
198
+ uri = "http://www.biblionet.gr/main.asp?page=results&Titlesid=#{biblionet_id}"
199
+
200
+ bibliographical_book_extractor = Biblionet::Extractors::BibliographicalBookExtractor.new
201
+ bibliographical_details = bibliographical_book_extractor.load_and_extract_book(uri)
202
+
203
+ book_hash[:publisher] = bibliographical_details[:publisher]
204
+ book_hash[:publication] = bibliographical_details[:publication]
205
+
206
+ book_hash[:format] = bibliographical_details[:format]
207
+
208
+ book_hash[:original_language] = bibliographical_details[:original_language]
209
+ book_hash[:original_title] = bibliographical_details[:original_title]
210
+
211
+ book_hash[:price] = bibliographical_details[:price]
212
+ book_hash[:availability] = bibliographical_details[:availability]
213
+ book_hash[:last_update] = bibliographical_details[:last_update]
214
+
215
+ book_hash[:series] = bibliographical_details[:series]
216
+
217
+ physical_description_hash = {}
218
+ physical_description_hash[:pages] = details_hash[:pages]
219
+ physical_description_hash[:size] = bibliographical_details[:physical_size]
220
+ physical_description_hash[:cover_type] = bibliographical_details[:cover_type]
221
+
222
+ book_hash[:physical_description] = physical_description_hash
223
+
196
224
 
197
225
  return @book = book_hash
198
226
  end
@@ -99,6 +99,7 @@ module Biblionet
99
99
  headquarters_hash = Hash[headquarters_hash.map {|k, v| [mappings[k], v] }]
100
100
  headquarters_hash[:telephone] = [headquarters_hash[:telephone]] unless headquarters_hash[:telephone].kind_of?(Array) or headquarters_hash[:telephone].nil?
101
101
  headquarters_hash[:website] = headquarters_hash[:website].split(',').map(&:strip) if (headquarters_hash[:website] and headquarters_hash[:website].include? ',')
102
+ headquarters_hash[:address] = [headquarters_hash[:address]] unless headquarters_hash[:address].kind_of?(Array)
102
103
 
103
104
  return headquarters_hash
104
105
  end
@@ -1,3 +1,3 @@
1
1
  module Bookshark
2
- VERSION = "1.0.0.beta.5"
2
+ VERSION = "1.0.0.pre.1"
3
3
  end
@@ -206,4 +206,8 @@ describe Bookshark::Extractor do
206
206
  end
207
207
  end
208
208
 
209
+ describe 'Biblionet::Extractors::BibliographicalBook' do
210
+
211
+ end
212
+
209
213
  end
@@ -23,15 +23,11 @@
23
23
  ]
24
24
  },
25
25
  "publisher": {
26
- "name": "Εκδοτικός Οίκος Α. Α. Λιβάνη",
26
+ "text": "Εκδοτικός Οίκος Α. Α. Λιβάνη",
27
27
  "b_id": "271"
28
28
  },
29
- "publication_year": "2006",
30
- "pages": "326",
31
29
  "isbn": "960-14-1157-7",
32
30
  "isbn_13": "978-960-14-1157-6",
33
- "status": "Κυκλοφορεί",
34
- "price": "16,31",
35
31
  "award": [
36
32
 
37
33
  ],
@@ -43,7 +39,27 @@
43
39
  "b_id": "142"
44
40
  }
45
41
  ],
46
- "b_id": "103788"
42
+ "b_id": "103788",
43
+ "publication": {
44
+ "year": "2006",
45
+ "version": "1",
46
+ "place": "Αθήνα"
47
+ },
48
+ "format": "Βιβλίο",
49
+ "original_language": "αγγλικά",
50
+ "original_title": "Freakonomics",
51
+ "price": "16,31",
52
+ "availability": "Κυκλοφορεί",
53
+ "last_update": "27/1/2006",
54
+ "series": {
55
+ "name": "Οικονομία",
56
+ "volume": null
57
+ },
58
+ "physical_description": {
59
+ "pages": "326",
60
+ "size": "21x14",
61
+ "cover_type": "Μαλακό εξώφυλλο"
62
+ }
47
63
  }
48
64
  ]
49
65
  }
@@ -71,12 +71,8 @@
71
71
  "b_id": "112"
72
72
  }
73
73
  ],
74
- "publication_year": "2012",
75
- "pages": "345",
76
74
  "isbn": "978-960-524-394-4",
77
75
  "isbn_13": null,
78
- "status": "Κυκλοφορεί",
79
- "price": "16,00",
80
76
  "award": [
81
77
 
82
78
  ],
@@ -209,7 +205,27 @@
209
205
  }
210
206
  }
211
207
  ],
212
- "b_id": "184923"
208
+ "b_id": "184923",
209
+ "publication": {
210
+ "year": "2012",
211
+ "version": "1",
212
+ "place": "Ηράκλειο Κρήτης"
213
+ },
214
+ "format": "Βιβλίο",
215
+ "original_language": null,
216
+ "original_title": null,
217
+ "price": "16,00",
218
+ "availability": "Κυκλοφορεί",
219
+ "last_update": "12/12/2012",
220
+ "series": {
221
+ "name": "Εκλαΐκευση της Επιστήμης",
222
+ "volume": null
223
+ },
224
+ "physical_description": {
225
+ "pages": "345",
226
+ "size": "24x17",
227
+ "cover_type": "Μαλακό εξώφυλλο"
228
+ }
213
229
  }
214
230
  ]
215
- }
231
+ }
@@ -13,15 +13,11 @@
13
13
  "contributors": {
14
14
  },
15
15
  "publisher": {
16
- "name": "Λογοσοφία",
16
+ "text": "Λογοσοφία",
17
17
  "b_id": "7628"
18
18
  },
19
- "publication_year": "2007",
20
- "pages": "181",
21
19
  "isbn": "978-960-89288-3-1",
22
20
  "isbn_13": null,
23
- "status": "Κυκλοφορεί",
24
- "price": "13,52",
25
21
  "award": [
26
22
 
27
23
  ],
@@ -33,7 +29,25 @@
33
29
  "b_id": "3"
34
30
  }
35
31
  ],
36
- "b_id": "119000"
32
+ "b_id": "119000",
33
+ "publication": {
34
+ "year": "2007",
35
+ "version": "1",
36
+ "place": "Αθήνα"
37
+ },
38
+ "format": "Βιβλίο",
39
+ "original_language": null,
40
+ "original_title": null,
41
+ "price": "13,52",
42
+ "availability": "Κυκλοφορεί",
43
+ "last_update": "9/5/2007",
44
+ "series": {
45
+ },
46
+ "physical_description": {
47
+ "pages": "181",
48
+ "size": "21x14",
49
+ "cover_type": "Μαλακό εξώφυλλο"
50
+ }
37
51
  },
38
52
  {
39
53
  "title": "Σημεία και τέρατα της οικονομίας",
@@ -58,15 +72,11 @@
58
72
  ]
59
73
  },
60
74
  "publisher": {
61
- "name": "Εκδοτικός Οίκος Α. Α. Λιβάνη",
75
+ "text": "Εκδοτικός Οίκος Α. Α. Λιβάνη",
62
76
  "b_id": "271"
63
77
  },
64
- "publication_year": "2006",
65
- "pages": "326",
66
78
  "isbn": "960-14-1157-7",
67
79
  "isbn_13": "978-960-14-1157-6",
68
- "status": "Κυκλοφορεί",
69
- "price": "16,31",
70
80
  "award": [
71
81
 
72
82
  ],
@@ -78,7 +88,27 @@
78
88
  "b_id": "142"
79
89
  }
80
90
  ],
81
- "b_id": "103788"
91
+ "b_id": "103788",
92
+ "publication": {
93
+ "year": "2006",
94
+ "version": "1",
95
+ "place": "Αθήνα"
96
+ },
97
+ "format": "Βιβλίο",
98
+ "original_language": "αγγλικά",
99
+ "original_title": "Freakonomics",
100
+ "price": "16,31",
101
+ "availability": "Κυκλοφορεί",
102
+ "last_update": "27/1/2006",
103
+ "series": {
104
+ "name": "Οικονομία",
105
+ "volume": null
106
+ },
107
+ "physical_description": {
108
+ "pages": "326",
109
+ "size": "21x14",
110
+ "cover_type": "Μαλακό εξώφυλλο"
111
+ }
82
112
  },
83
113
  {
84
114
  "title": "Και άλλα σημεία και τέρατα από την ιστορία",
@@ -103,15 +133,11 @@
103
133
  ]
104
134
  },
105
135
  "publisher": {
106
- "name": "Modern Times",
136
+ "text": "Modern Times",
107
137
  "b_id": "191"
108
138
  },
109
- "publication_year": "2004",
110
- "pages": "62",
111
139
  "isbn": "960-397-927-9",
112
140
  "isbn_13": "978-960-397-927-2",
113
- "status": "Κυκλοφορεί",
114
- "price": "15,14",
115
141
  "award": [
116
142
 
117
143
  ],
@@ -123,7 +149,25 @@
123
149
  "b_id": "2456"
124
150
  }
125
151
  ],
126
- "b_id": "87815"
152
+ "b_id": "87815",
153
+ "publication": {
154
+ "year": "2004",
155
+ "version": null,
156
+ "place": "Αθήνα"
157
+ },
158
+ "format": "Βιβλίο",
159
+ "original_language": "αγγλικά",
160
+ "original_title": "Even more horrible history",
161
+ "price": "15,14",
162
+ "availability": "Κυκλοφορεί",
163
+ "last_update": null,
164
+ "series": {
165
+ },
166
+ "physical_description": {
167
+ "pages": "62",
168
+ "size": "28x22",
169
+ "cover_type": "Σκληρό εξώφυλλο"
170
+ }
127
171
  },
128
172
  {
129
173
  "title": "Σημεία και τέρατα από την ιστορία",
@@ -148,15 +192,11 @@
148
192
  ]
149
193
  },
150
194
  "publisher": {
151
- "name": "Modern Times",
195
+ "text": "Modern Times",
152
196
  "b_id": "191"
153
197
  },
154
- "publication_year": "2004",
155
- "pages": "78",
156
198
  "isbn": "960-397-926-0",
157
199
  "isbn_13": "978-960-397-926-5",
158
- "status": "Κυκλοφορεί",
159
- "price": "15,14",
160
200
  "award": [
161
201
 
162
202
  ],
@@ -168,7 +208,25 @@
168
208
  "b_id": "2456"
169
209
  }
170
210
  ],
171
- "b_id": "87812"
211
+ "b_id": "87812",
212
+ "publication": {
213
+ "year": "2004",
214
+ "version": null,
215
+ "place": "Αθήνα"
216
+ },
217
+ "format": "Βιβλίο",
218
+ "original_language": "αγγλικά",
219
+ "original_title": "Horrible history",
220
+ "price": "15,14",
221
+ "availability": "Κυκλοφορεί",
222
+ "last_update": null,
223
+ "series": {
224
+ },
225
+ "physical_description": {
226
+ "pages": "78",
227
+ "size": "28x22",
228
+ "cover_type": "Σκληρό εξώφυλλο"
229
+ }
172
230
  },
173
231
  {
174
232
  "title": "Σημεία και τέρατα",
@@ -183,15 +241,11 @@
183
241
  "contributors": {
184
242
  },
185
243
  "publisher": {
186
- "name": "Κέδρος",
244
+ "text": "Κέδρος",
187
245
  "b_id": "21"
188
246
  },
189
- "publication_year": "1994",
190
- "pages": "126",
191
247
  "isbn": "960-04-0941-2",
192
248
  "isbn_13": "978-960-04-0941-3",
193
- "status": "Κυκλοφορεί",
194
- "price": "9,17",
195
249
  "award": [
196
250
 
197
251
  ],
@@ -203,7 +257,25 @@
203
257
  "b_id": "9"
204
258
  }
205
259
  ],
206
- "b_id": "15839"
260
+ "b_id": "15839",
261
+ "publication": {
262
+ "year": "1994",
263
+ "version": "2",
264
+ "place": "Αθήνα"
265
+ },
266
+ "format": "Βιβλίο",
267
+ "original_language": null,
268
+ "original_title": null,
269
+ "price": "9,17",
270
+ "availability": "Κυκλοφορεί",
271
+ "last_update": "28/7/2010",
272
+ "series": {
273
+ },
274
+ "physical_description": {
275
+ "pages": "126",
276
+ "size": "21x14",
277
+ "cover_type": "Μαλακό εξώφυλλο"
278
+ }
207
279
  },
208
280
  {
209
281
  "title": "Επίσημη αγραμματοσύνη",
@@ -218,15 +290,11 @@
218
290
  "contributors": {
219
291
  },
220
292
  "publisher": {
221
- "name": "Περιοδικό Πνευματική Ζωή",
293
+ "text": "Περιοδικό Πνευματική Ζωή",
222
294
  "b_id": "6770"
223
295
  },
224
- "publication_year": null,
225
- "pages": "48",
226
296
  "isbn": null,
227
297
  "isbn_13": null,
228
- "status": "Κυκλοφορεί",
229
- "price": null,
230
298
  "award": [
231
299
 
232
300
  ],
@@ -238,7 +306,27 @@
238
306
  "b_id": "1459"
239
307
  }
240
308
  ],
241
- "b_id": "77381"
309
+ "b_id": "77381",
310
+ "publication": {
311
+ "year": null,
312
+ "version": null,
313
+ "place": "Αθήνα"
314
+ },
315
+ "format": "Βιβλίο",
316
+ "original_language": null,
317
+ "original_title": null,
318
+ "price": null,
319
+ "availability": "Κυκλοφορεί",
320
+ "last_update": null,
321
+ "series": {
322
+ "name": "Νεοελληνικά Αφιερώματα",
323
+ "volume": "21"
324
+ },
325
+ "physical_description": {
326
+ "pages": "48",
327
+ "size": "24x17",
328
+ "cover_type": "Μαλακό εξώφυλλο"
329
+ }
242
330
  },
243
331
  {
244
332
  "title": "Ο όλεθρος της πυραμίδος Καμπαλά",
@@ -259,15 +347,11 @@
259
347
  ]
260
348
  },
261
349
  "publisher": {
262
- "name": "Μπίμπης Στερέωμα",
350
+ "text": "Μπίμπης Στερέωμα",
263
351
  "b_id": "244"
264
352
  },
265
- "publication_year": null,
266
- "pages": "269",
267
353
  "isbn": null,
268
354
  "isbn_13": null,
269
- "status": "Κυκλοφορεί",
270
- "price": "6,85",
271
355
  "award": [
272
356
 
273
357
  ],
@@ -279,7 +363,25 @@
279
363
  "b_id": "179"
280
364
  }
281
365
  ],
282
- "b_id": "46856"
366
+ "b_id": "46856",
367
+ "publication": {
368
+ "year": null,
369
+ "version": null,
370
+ "place": "Θεσσαλονίκη"
371
+ },
372
+ "format": "Βιβλίο",
373
+ "original_language": null,
374
+ "original_title": null,
375
+ "price": "6,85",
376
+ "availability": "Κυκλοφορεί",
377
+ "last_update": null,
378
+ "series": {
379
+ },
380
+ "physical_description": {
381
+ "pages": "269",
382
+ "size": "21x14",
383
+ "cover_type": "Μαλακό εξώφυλλο"
384
+ }
283
385
  },
284
386
  {
285
387
  "title": "Η επανάσταση κατά της νέας τάξης",
@@ -294,15 +396,11 @@
294
396
  "contributors": {
295
397
  },
296
398
  "publisher": {
297
- "name": "Μπίμπης Στερέωμα",
399
+ "text": "Μπίμπης Στερέωμα",
298
400
  "b_id": "244"
299
401
  },
300
- "publication_year": null,
301
- "pages": "71",
302
402
  "isbn": null,
303
403
  "isbn_13": null,
304
- "status": "Κυκλοφορεί",
305
- "price": "3,73",
306
404
  "award": [
307
405
 
308
406
  ],
@@ -314,7 +412,25 @@
314
412
  "b_id": "2336"
315
413
  }
316
414
  ],
317
- "b_id": "46763"
415
+ "b_id": "46763",
416
+ "publication": {
417
+ "year": null,
418
+ "version": null,
419
+ "place": "Θεσσαλονίκη"
420
+ },
421
+ "format": "Βιβλίο",
422
+ "original_language": null,
423
+ "original_title": null,
424
+ "price": "3,73",
425
+ "availability": "Κυκλοφορεί",
426
+ "last_update": null,
427
+ "series": {
428
+ },
429
+ "physical_description": {
430
+ "pages": "71",
431
+ "size": "21x14",
432
+ "cover_type": "Μαλακό εξώφυλλο"
433
+ }
318
434
  },
319
435
  {
320
436
  "title": "Επιχείρησις: Μαργαριτάρια",
@@ -329,15 +445,11 @@
329
445
  "contributors": {
330
446
  },
331
447
  "publisher": {
332
- "name": "Δωδώνη Εκδοτική ΕΠΕ",
448
+ "text": "Δωδώνη Εκδοτική ΕΠΕ",
333
449
  "b_id": "1"
334
450
  },
335
- "publication_year": null,
336
- "pages": "188",
337
451
  "isbn": "960-248-541-8",
338
452
  "isbn_13": "978-960-248-541-5",
339
- "status": "Κυκλοφορεί",
340
- "price": "10,60",
341
453
  "award": [
342
454
 
343
455
  ],
@@ -349,7 +461,25 @@
349
461
  "b_id": "1309"
350
462
  }
351
463
  ],
352
- "b_id": "33301"
464
+ "b_id": "33301",
465
+ "publication": {
466
+ "year": null,
467
+ "version": null,
468
+ "place": "Αθήνα"
469
+ },
470
+ "format": "Βιβλίο",
471
+ "original_language": null,
472
+ "original_title": null,
473
+ "price": "10,60",
474
+ "availability": "Κυκλοφορεί",
475
+ "last_update": null,
476
+ "series": {
477
+ },
478
+ "physical_description": {
479
+ "pages": "188",
480
+ "size": "20x13",
481
+ "cover_type": "Μαλακό εξώφυλλο"
482
+ }
353
483
  }
354
484
  ]
355
- }
485
+ }
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: bookshark
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.0.beta.5
4
+ version: 1.0.0.pre.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Dimitris Klisiaris
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-04-18 00:00:00.000000000 Z
11
+ date: 2015-05-03 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: nokogiri
@@ -36,14 +36,14 @@ dependencies:
36
36
  requirements:
37
37
  - - "~>"
38
38
  - !ruby/object:Gem::Version
39
- version: '3.1'
39
+ version: '4.0'
40
40
  type: :runtime
41
41
  prerelease: false
42
42
  version_requirements: !ruby/object:Gem::Requirement
43
43
  requirements:
44
44
  - - "~>"
45
45
  - !ruby/object:Gem::Version
46
- version: '3.1'
46
+ version: '4.0'
47
47
  - !ruby/object:Gem::Dependency
48
48
  name: json
49
49
  requirement: !ruby/object:Gem::Requirement
@@ -137,6 +137,7 @@ files:
137
137
  - lib/bookshark/crawlers/publisher_crawler.rb
138
138
  - lib/bookshark/extractors/author_extractor.rb
139
139
  - lib/bookshark/extractors/base.rb
140
+ - lib/bookshark/extractors/bibliographical_book_extractor.rb
140
141
  - lib/bookshark/extractors/book_extractor.rb
141
142
  - lib/bookshark/extractors/category_extractor.rb
142
143
  - lib/bookshark/extractors/publisher_extractor.rb