bookshark 1.0.0.pre.2 → 1.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (78) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +30 -30
  3. data/bookshark.gemspec +1 -0
  4. data/lib/bookshark.rb +35 -7
  5. data/lib/bookshark/crawlers/base.rb +11 -9
  6. data/lib/bookshark/crawlers/bibliographical_record_crawler.rb +43 -0
  7. data/lib/bookshark/crawlers/book_crawler.rb +42 -54
  8. data/lib/bookshark/extractors/bibliographical_book_extractor.rb +57 -35
  9. data/lib/bookshark/extractors/book_extractor.rb +9 -1
  10. data/lib/bookshark/storage/json_book_records/0/book_63.json +61 -0
  11. data/lib/bookshark/storage/json_book_records/0/book_67.json +53 -0
  12. data/lib/bookshark/storage/json_book_records/2/book_2110.json +59 -0
  13. data/lib/bookshark/storage/json_book_records/2/book_2111.json +65 -0
  14. data/lib/bookshark/storage/json_book_records/2/book_2112.json +69 -0
  15. data/lib/bookshark/storage/json_book_records/2/book_2113.json +59 -0
  16. data/lib/bookshark/storage/json_book_records/2/book_2114.json +67 -0
  17. data/lib/bookshark/storage/json_book_records/2/book_2115.json +71 -0
  18. data/lib/bookshark/storage/json_book_records/2/book_2116.json +63 -0
  19. data/lib/bookshark/storage/json_book_records/2/book_2117.json +61 -0
  20. data/lib/bookshark/storage/json_book_records/2/book_2118.json +83 -0
  21. data/lib/bookshark/storage/json_book_records/2/book_2119.json +69 -0
  22. data/lib/bookshark/storage/json_book_records/2/book_2120.json +69 -0
  23. data/lib/bookshark/storage/json_book_records/2/book_2121.json +63 -0
  24. data/lib/bookshark/storage/json_book_records/2/book_2122.json +72 -0
  25. data/lib/bookshark/storage/json_book_records/2/book_2123.json +67 -0
  26. data/lib/bookshark/storage/json_book_records/2/book_2124.json +72 -0
  27. data/lib/bookshark/storage/json_book_records/2/book_2125.json +67 -0
  28. data/lib/bookshark/storage/json_book_records/2/book_2126.json +72 -0
  29. data/lib/bookshark/storage/json_book_records/2/book_2127.json +61 -0
  30. data/lib/bookshark/storage/json_book_records/2/book_2128.json +61 -0
  31. data/lib/bookshark/storage/json_book_records/2/book_2129.json +61 -0
  32. data/lib/bookshark/storage/json_book_records/2/book_2130.json +72 -0
  33. data/lib/bookshark/storage/json_book_records/2/book_2131.json +55 -0
  34. data/lib/bookshark/storage/json_book_records/2/book_2132.json +61 -0
  35. data/lib/bookshark/storage/json_book_records/2/book_2133.json +61 -0
  36. data/lib/bookshark/storage/json_book_records/2/book_2134.json +61 -0
  37. data/lib/bookshark/storage/json_book_records/2/book_2135.json +55 -0
  38. data/lib/bookshark/storage/json_book_records/2/book_2136.json +67 -0
  39. data/lib/bookshark/storage/json_book_records/2/book_2137.json +67 -0
  40. data/lib/bookshark/storage/json_book_records/2/book_2138.json +57 -0
  41. data/lib/bookshark/storage/json_book_records/2/book_2139.json +67 -0
  42. data/lib/bookshark/storage/json_book_records/2/book_2140.json +53 -0
  43. data/lib/bookshark/storage/json_book_records/2/book_2141.json +61 -0
  44. data/lib/bookshark/storage/json_book_records/2/book_2142.json +67 -0
  45. data/lib/bookshark/storage/json_book_records/2/book_2143.json +65 -0
  46. data/lib/bookshark/storage/json_book_records/2/book_2144.json +64 -0
  47. data/lib/bookshark/storage/json_book_records/2/book_2145.json +53 -0
  48. data/lib/bookshark/storage/json_book_records/2/book_2146.json +70 -0
  49. data/lib/bookshark/storage/json_book_records/2/book_2147.json +67 -0
  50. data/lib/bookshark/storage/json_book_records/2/book_2148.json +66 -0
  51. data/lib/bookshark/storage/json_book_records/2/book_2149.json +72 -0
  52. data/lib/bookshark/storage/json_book_records/2/book_2150.json +53 -0
  53. data/lib/bookshark/storage/json_book_records/2/book_2151.json +67 -0
  54. data/lib/bookshark/storage/json_book_records/2/book_2152.json +67 -0
  55. data/lib/bookshark/storage/json_book_records/2/book_2153.json +67 -0
  56. data/lib/bookshark/storage/json_book_records/2/book_2154.json +67 -0
  57. data/lib/bookshark/storage/json_book_records/2/book_2155.json +67 -0
  58. data/lib/bookshark/storage/json_book_records/2/book_2156.json +76 -0
  59. data/lib/bookshark/storage/json_book_records/2/book_2157.json +65 -0
  60. data/lib/bookshark/storage/json_book_records/2/book_2158.json +77 -0
  61. data/lib/bookshark/storage/json_book_records/2/book_2159.json +76 -0
  62. data/lib/bookshark/storage/json_book_records/2/book_2160.json +67 -0
  63. data/lib/bookshark/storage/json_book_records/2/book_2161.json +61 -0
  64. data/lib/bookshark/storage/json_book_records/2/book_2162.json +65 -0
  65. data/lib/bookshark/storage/json_book_records/2/book_2163.json +68 -0
  66. data/lib/bookshark/storage/json_book_records/2/book_2164.json +59 -0
  67. data/lib/bookshark/storage/json_book_records/2/book_2165.json +59 -0
  68. data/lib/bookshark/storage/json_book_records/2/book_2166.json +53 -0
  69. data/lib/bookshark/storage/json_book_records/2/book_2167.json +53 -0
  70. data/lib/bookshark/storage/json_book_records/2/book_2168.json +53 -0
  71. data/lib/bookshark/storage/json_book_records/2/book_2169.json +53 -0
  72. data/lib/bookshark/storage/json_book_records/2/book_2170.json +53 -0
  73. data/lib/bookshark/version.rb +1 -1
  74. data/spec/bookshark_spec.rb +62 -46
  75. data/spec/spec_helper.rb +2 -1
  76. data/spec/test_data/bg_record_103788.html +1 -0
  77. data/spec/test_data/book_103788.html +1 -0
  78. metadata +88 -5
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 55cf891db20054fa8dee78ebb422af2a2d2cb25c
4
- data.tar.gz: f5c777f653ab0a52ba7e3d071d89f0bd62763954
3
+ metadata.gz: d35fee946c6b6dcf4ca740d89ba3a9cb89f36a94
4
+ data.tar.gz: ff928cdadd16b132adc9f193ff5c7f565a0b0398
5
5
  SHA512:
6
- metadata.gz: 83e1699c160bd1c578c1335e8b6a4e490598e3588da23c29549d47f7f2860f4d3995c326f583e582d115c06ba8c57a8df224f5a3d77ea3b4f45db95ebe2e91a9
7
- data.tar.gz: 0364e7d5cd6c6f6a01863c1f29bd8a0a6e14f5a2cf44bb66ecee76f86f136ca2da479573e3d64b457939f1f7b2a8bd6ae6f9652818d5ea1984d6b2d4a3f3e42a
6
+ metadata.gz: 2dac9ad4842172d896a488fa60baaf13d862c8d2e3b1e68b3a9f5e6ee28bec5136d4ed7d084c4bf6fa0f5f1cfd3224241958467c7df99929ea8cfc1c5ad92abc
7
+ data.tar.gz: e8f9dcb4f20e0a2330a91588c6dfbb306a74820ee9ed7fa7564097013c16244de46ffc3a636a751126f19ce9c1a32b209fb254d7533d901d6f77a00a6b8b5100
data/README.md CHANGED
@@ -13,7 +13,7 @@ The representation of bibliographic metadata in JSON is inspired by [BibJSON](ht
13
13
  Add this line to your application's Gemfile:
14
14
 
15
15
  ```ruby
16
- gem 'bookshark', "~> 1.0.0.pre"
16
+ gem 'bookshark', "~> 1.0"
17
17
  ```
18
18
 
19
19
  And then execute:
@@ -22,7 +22,7 @@ And then execute:
22
22
 
23
23
  Or install it yourself as:
24
24
 
25
- $ gem install bookshark --pre
25
+ $ gem install bookshark
26
26
 
27
27
  Require and include bookshark in your class/module.
28
28
 
@@ -85,7 +85,7 @@ extractor.book(isbn: '9789601411576')
85
85
  # Extract book with id 103788 from website
86
86
  extractor.book(id: 103788)
87
87
 
88
- # Extract book from the provided webpage
88
+ # Extract book from the provided webpage
89
89
  extractor.book(uri: 'http://biblionet.gr/book/103788/')
90
90
 
91
91
  # Extract book with id 103788 from local storage
@@ -93,12 +93,12 @@ extractor.book(id: 103788, local: true)
93
93
  ```
94
94
  For more options, like book's title or author, use the search method which is described below.
95
95
 
96
- **Book Options**
96
+ **Book Options**
97
97
  (Recommended option is to use just the id and let bookshark generate the uri):
98
98
 
99
99
  * id : The id of book on the corresponding site (Integer)
100
100
  * uri : The url of book web page or the path to local file.
101
- * local : Boolean value. Has page been saved locally? (default is false)
101
+ * local : Boolean value. Has page been saved locally? (default is false)
102
102
  * format : The format in which the extracted data are returned
103
103
  * hash (default)
104
104
  * json
@@ -112,9 +112,9 @@ puts Bookshark::Extractor.new(format: 'pretty_json').book(id: 185281)
112
112
 
113
113
  #### Eager Extraction
114
114
 
115
- Each book has some attributes such as authors, contributors, categories etc which are actually references to other objects.
116
- By default when extracting a book, you get only names of these objects and references to their pages.
117
- With eager option set to true, each of these objects' data is extracted and the produced output contains complete information about every object.
115
+ Each book has some attributes such as authors, contributors, categories etc which are actually references to other objects.
116
+ By default when extracting a book, you get only names of these objects and references to their pages.
117
+ With eager option set to true, each of these objects' data is extracted and the produced output contains complete information about every object.
118
118
  Eager extraction doesn't work with local option enabled.
119
119
 
120
120
  ```ruby
@@ -215,24 +215,24 @@ extractor.search(title: 'αρχοντας', author: 'τολκιν', results_type
215
215
  ```
216
216
  Searching and extracting several books can be very slow at times, so instead of extracting every single book you may prefer only the ids of found books. In that case pass the option `results_type: 'ids'`.
217
217
 
218
- **Search Options**:
218
+ **Search Options**:
219
219
  With enough options you can customize your query to your needs. It is recommended to use at least two of the search options.
220
220
 
221
- * title (The title of book to search)
222
- * author (The author's last name is enough for filter the search)
221
+ * title (The title of book to search)
222
 + * author (The author's last name is enough to filter the search)
223
223
  * publisher
224
224
  * category
225
225
  * title_split
226
226
  * 0 (The exact title phrase must be matched)
227
- * 1 (Default - All the words in title must be matched in whatever order)
227
+ * 1 (Default - All the words in title must be matched in whatever order)
228
228
  * 2 (At least one word should match)
229
- * book_id (Providing id means only one book should returned)
230
- * isbn
231
- * author_id (ID of the selected author)
232
- * publisher_id
233
- * category_id
234
- * after_year (Published this year or later)
235
- * before_year (Published this year or before)
229
 + * book_id (Providing id means only one book should be returned)
230
+ * isbn
231
+ * author_id (ID of the selected author)
232
+ * publisher_id
233
+ * category_id
234
+ * after_year (Published this year or later)
235
+ * before_year (Published this year or before)
236
236
  * results_type
237
237
  * metadata (Default - Every book is extracted and an array of metadata is returned)
238
238
  * ids (Only ids are returned)
@@ -243,7 +243,7 @@ With enought options you can customize your query to your needs. It is recommend
243
243
 
244
244
  Results with ids option look like that:
245
245
 
246
- ```json
246
+ ```json
247
247
  {
248
248
  "book": [
249
249
  "119000",
@@ -271,7 +271,7 @@ Normally results are multiple books like the ones in book extractors:
271
271
  {
272
272
  "title": "Σημεία και τέρατα της οικονομίας",
273
273
  "subtitle": "Η κρυφή πλευρά των πάντων",
274
- "... Rest of Metadata ...": "... condensed ..."
274
+ "... Rest of Metadata ...": "... condensed ..."
275
275
  },
276
276
  {
277
277
  "title": "Και άλλα σημεία και τέρατα από την ιστορία",
@@ -281,7 +281,7 @@ Normally results are multiple books like the ones in book extractors:
281
281
  {
282
282
  "title": "Σημεία και τέρατα από την ιστορία",
283
283
  "subtitle": null,
284
- "... Rest of Metadata ...": "... condensed ..."
284
+ "... Rest of Metadata ...": "... condensed ..."
285
285
  }
286
286
  ]
287
287
  }
@@ -304,7 +304,7 @@ extractor.author(uri: 'storage/html_author_pages/2/author_2423.html', local: tru
304
304
  **Author Options**: (Recommended option is to use just the id and let bookshark generate the uri):
305
305
  * id : The id of author on the corresponding site (Integer)
306
306
  * uri : The url of author web page or the path to local file.
307
- * local : Boolean value. Has page been saved locally? (default is false)
307
+ * local : Boolean value. Has page been saved locally? (default is false)
308
308
 
309
309
  The expected result of an author extraction is something like this:
310
310
 
@@ -329,7 +329,7 @@ The expected result of an author extraction is something like this:
329
329
  ]
330
330
  }
331
331
  ```
332
- The convention here is that there is never just a single author, but instead the author hash is stored inside an array.
332
+ The convention here is that there is never just a single author, but instead the author hash is stored inside an array.
333
333
  So, it is easy to include metadata for multiple authors or even for multiple types of entities such as publishers or books on the same json file.
334
334
 
335
335
  ### Extract Publisher Data
@@ -342,7 +342,7 @@ extractor = Extractor.new(format: 'pretty_json')
342
342
  # Extract publisher with id 20 from website
343
343
  extractor.publisher(id: 20)
344
344
 
345
- # Extract publisher from the provided webpage
345
+ # Extract publisher from the provided webpage
346
346
  extractor.publisher(uri: 'http://biblionet.gr/com/20/')
347
347
 
348
348
  # Extract publisher with id 20 from local storage
@@ -352,7 +352,7 @@ extractor.publisher(id: 20, local: true)
352
352
 
353
353
  * id : The id of publisher on the corresponding site (Integer)
354
354
  * uri : The url of publisher web page or the path to local file.
355
- * local : Boolean value. Has page been saved locally? (default is false)
355
+ * local : Boolean value. Has page been saved locally? (default is false)
356
356
  * format : The format in which the extracted data are returned
357
357
  * hash (default)
358
358
  * json
@@ -397,7 +397,7 @@ The expected result of an author extraction is something like this:
397
397
  ],
398
398
  "fax": "210 3650069",
399
399
  "email": "info@patakis.gr",
400
- "website": "www.patakis.gr"
400
+ "website": "www.patakis.gr"
401
401
  }
402
402
  },
403
403
  "b_id": "20"
@@ -415,7 +415,7 @@ extractor = Extractor.new(format: 'pretty_json')
415
415
  # Extract category with id 1041 from website
416
416
  extractor.category(id: 1041)
417
417
 
418
- # Extract category from the provided webpage
418
+ # Extract category from the provided webpage
419
419
  extractor.category(uri: 'http://biblionet.gr/index/1041/')
420
420
 
421
421
  # Extract category with id 1041 from local storage
@@ -425,7 +425,7 @@ extractor.category(id: 1041, local: true)
425
425
 
426
426
  * id : The id of category on the corresponding site (Integer)
427
427
  * uri : The url of category web page or the path to local file.
428
- * local : Boolean value. Has page been saved locally? (default is false)
428
+ * local : Boolean value. Has page been saved locally? (default is false)
429
429
  * format : The format in which the extracted data are returned
430
430
  * hash (default)
431
431
  * json
@@ -490,7 +490,7 @@ Take a look at this table:
490
490
  |---------|:-----------:|----------------------------------|
491
491
  | 103788 | book | http://biblionet.gr/book/103788 |
492
492
  | 10207 | author | http://biblionet.gr/author/10207 |
493
- | 20 | publisher | http://biblionet.gr/com/20 |
493
+ | 20 | publisher | http://biblionet.gr/com/20 |
494
494
  | 1041 | category | http://biblionet.gr/index/1041 |
495
495
 
496
496
  So if you want to use the uri option provide the target webpage's url as seen above without any slugs after the id.
@@ -28,4 +28,5 @@ Gem::Specification.new do |spec|
28
28
  spec.add_development_dependency "bundler", ">= 1.6"
29
29
  spec.add_development_dependency "rake", "~> 10.0"
30
30
  spec.add_development_dependency 'rspec', "~> 3.2"
31
+ spec.add_development_dependency "webmock", "~> 1.2"
31
32
  end
@@ -10,6 +10,8 @@ require 'bookshark/extractors/search'
10
10
 
11
11
  require 'bookshark/crawlers/base'
12
12
  require 'bookshark/crawlers/publisher_crawler'
13
+ require 'bookshark/crawlers/book_crawler'
14
+ require 'bookshark/crawlers/bibliographical_record_crawler'
13
15
 
14
16
  module Bookshark
15
17
  DEFAULTS ||= {
@@ -76,7 +78,8 @@ module Bookshark
76
78
 
77
79
  uri = process_options(options, __method__)
78
80
  options[:format] ||= @format
79
- options[:eager] ||= false
81
+ options[:eager] ||= false
82
+ options[:nilify] ||= false
80
83
 
81
84
  if options[:eager]
82
85
  book = eager_extract_book(uri)
@@ -86,8 +89,12 @@ module Bookshark
86
89
 
87
90
  response = {}
88
91
  response[:book] = !book.nil? ? [book] : []
92
+
93
+ return nil if response[:book].empty? and options[:nilify]
94
+
89
95
  response = change_format(response, options[:format])
90
- response = book_extractor.decode_text(response)
96
+
97
+ response = book_extractor.decode_text(response) if response.class == "String"
91
98
 
92
99
  return response
93
100
  end
@@ -137,9 +144,9 @@ module Bookshark
137
144
  return response
138
145
  end
139
146
 
140
- def books_from_storage
141
- extract_from_storage_and_save('book', 'html_book_pages', 'json_book_pages')
142
- end
147
+ # def books_from_storage
148
+ # extract_from_storage_and_save('book', 'html_book_pages', 'json_book_pages')
149
+ # end
143
150
 
144
151
  def authors_from_storage
145
152
  extract_from_storage_and_save('author', 'html_author_pages', 'json_author_pages')
@@ -153,6 +160,17 @@ module Bookshark
153
160
  extract_from_storage_and_save('category', 'html_category_pages', 'json_category_pages')
154
161
  end
155
162
 
163
+ def extract_books_from_storage_and_save(start_id, finish_id, format = 'pretty_json')
164
+ start_id.upto(finish_id) do |book_id|
165
+ record = book(id: book_id, local: true, format: format, nilify: true)
166
+
167
+ dir_to_save = Bookshark.path_to_storage + '/' + 'json_book_records/' + "#{((book_id-1)/1000)}/" + "book_#{book_id}.json"
168
+
169
+ save_to(dir_to_save, record) unless record.nil?
170
+ end
171
+ end
172
+
173
+
156
174
  def extract_from_storage_and_save(metadata_type, source_dir, target_dir)
157
175
  list_directories(path: Bookshark.path_to_storage + '/' + source_dir).each do |dir|
158
176
  dir_to_save = dir.gsub(source_dir, target_dir)
@@ -168,8 +186,8 @@ module Bookshark
168
186
  record = author(options)
169
187
  when 'publisher'
170
188
  record = publisher(options)
171
- when 'book'
172
- record = book(options)
189
+ # when 'book'
190
+ # record = book(options)
173
191
  when 'category'
174
192
  record = category(options)
175
193
  end
@@ -334,6 +352,16 @@ module Bookshark
334
352
  crawler.crawl_and_save
335
353
  end
336
354
 
355
+ def books(options = {})
356
+ crawler = Biblionet::Crawlers::BookCrawler.new(options)
357
+ crawler.crawl_and_save
358
+ end
359
+
360
+ def bibliographical_records(options = {})
361
+ crawler = Biblionet::Crawlers::BibliographicalRecordCrawler.new(options)
362
+ crawler.crawl_and_save
363
+ end
364
+
337
365
  end
338
366
 
339
367
  # module Biblionet
@@ -5,13 +5,14 @@ module Biblionet
5
5
 
6
6
  class Base
7
7
  def initialize(options = {})
8
- @folder = options[:folder] ||= 'lib/bookshark/storage/html_base_pages'
9
- @base_url = options[:base_url] ||= 'http://www.biblionet.gr/base/'
10
- @page_type = options[:page_type] ||= 'base'
11
- @extension = options[:extension] ||= '.html'
12
- @start = options[:start] ||= 1
13
- @finish = options[:finish] ||= 10000
14
- @step = options[:step] ||= 1000
8
+ @folder = options[:folder] ||= 'lib/bookshark/storage/html_base_pages'
9
+ @base_url = options[:base_url] ||= 'http://www.biblionet.gr/base/'
10
+ @page_type = options[:page_type] ||= 'base'
11
+ @extension = options[:extension] ||= '.html'
12
+ @save_only_content = options[:save_only_content] ||= false
13
+ @start = options[:start] ||= 1
14
+ @finish = options[:finish] ||= 10000
15
+ @step = options[:step] ||= 1000
15
16
  end
16
17
 
17
18
  def spider
@@ -20,7 +21,8 @@ module Biblionet
20
21
 
21
22
  start.step(finish, @step) do |last|
22
23
  first = last - @step + 1
23
- subfolder = (last/@step - 1).to_s
24
+ subfolder = (last/@step - 1).to_s
25
+ slash = (@page_type != 'bg_record') ? '/' : ''
24
26
  path = "#{@folder}/#{subfolder}/"
25
27
 
26
28
  # Create a new directory (does nothing if directory exists)
@@ -28,7 +30,7 @@ module Biblionet
28
30
 
29
31
  first.upto(last) do |id|
30
32
  file_to_save = "#{path}#{@page_type}_#{id}#{@extension}"
31
- url_to_download = "#{@base_url}#{id}/"
33
+ url_to_download = "#{@base_url}#{id}#{slash}"
32
34
 
33
35
  yield(url_to_download, file_to_save)
34
36
  # downloader = Biblionet::Core::Base.new(url_to_download)
@@ -0,0 +1,43 @@
1
+ require_relative 'base'
2
+
3
+ module Biblionet
4
+ module Crawlers
5
+
6
+ class BibliographicalRecordCrawler < Base
7
+ def initialize(options = {})
8
+ options[:folder] ||= 'lib/bookshark/storage/html_book_pages'
9
+ options[:base_url] ||= 'http://www.biblionet.gr/main.asp?page=results&Titlesid='
10
+ options[:page_type] ||= 'bg_record'
11
+ options[:extension] ||= '.html'
12
+ options[:save_only_content] ||= true
13
+ options[:start] ||= 176001
14
+ options[:finish] ||= 180000
15
+ options[:step] ||= 1000
16
+ super(options)
17
+ end
18
+
19
+ def crawl_and_save
20
+ downloader = Extractors::Base.new
21
+
22
+ spider do |url_to_download, file_to_save|
23
+ downloader.load_page(url_to_download)
24
+
25
+ # Create a new directory (does nothing if directory exists)
26
+ path = File.dirname(file_to_save)
27
+ FileUtils.mkdir_p path unless File.directory?(path)
28
+
29
+ # No need to download the whole page. Just the part containing the book.
30
+ if @save_only_content
31
+ content_re = /<!-- CONTENT START -->.*<!-- CONTENT END -->/m
32
+ content = content_re.match(downloader.page)[0] unless (content_re.match(downloader.page)).nil?
33
+ downloader.save_to(file_to_save, content) unless downloader.page.nil? or downloader.page.length < 1024
34
+ else
35
+ downloader.save_page(file_to_save) unless downloader.page.nil? or downloader.page.length < 1024
36
+ end
37
+
38
+ end
39
+ end
40
+ end
41
+
42
+ end
43
+ end
@@ -1,55 +1,43 @@
1
- require 'rubygems'
2
- require 'nokogiri'
3
- require 'open-uri'
4
- require 'fileutils'
5
-
6
- require File.expand_path(File.join(File.dirname(__FILE__), '../extractors', 'base'))
7
- # page = Nokogiri::HTML(open("raw_html_pages/book_45454.html"))
8
- # puts page.class # => Nokogiri::HTML::Document
9
- # puts page
10
-
11
- FOLDER = 'html_book_pages'
12
- BASE_URL = 'http://www.biblionet.gr/book/'
13
- EXTENSION = '.html'
14
-
15
- 301000.step(400000, 1000) do |last|
16
- # saved_pages = 0
17
- # empty_pages = 0
18
-
19
- first = last - 1000 + 1
20
- subfolder = (last/1000 - 1).to_s
21
- path = "#{FOLDER}/#{subfolder}/"
22
-
23
- # Create a new directory (does nothing if directory exists)
24
- FileUtils.mkdir_p path
25
-
26
- first.upto(last) do |id|
27
- file_to_save = "#{path}book_#{id}#{EXTENSION}"
28
- url_to_download = "#{BASE_URL}#{id}/"
29
-
30
- downloader = Biblionet::Core::Base.new(url_to_download)
31
- downloader.save_page(file_to_save) unless downloader.page.nil?
32
-
33
- # open(url_to_parse) do |uri|
34
- # puts "Parsing page: #{url_to_parse}"
35
- # page = uri.read.gsub(/\s+/, " ")
36
- # # doc = Nokogiri::HTML(page)
37
- # # body = doc.at('title').inner_html
38
- # # puts body
39
- # if page.include? "</body>"
40
- # puts "Saving page: #{file_to_save}"
41
- # open(file_to_save, "w") do |file|
42
- # file.write(page)
43
- # end
44
- # saved_pages += 1
45
- # else
46
- # puts "Page #{file_to_save} seems to be empty..."
47
- # empty_pages += 1
48
- # end
49
- # end
50
- end
51
-
52
- # puts "Saved Pages: #{saved_pages}"
53
- # puts "Empty Pages: #{empty_pages}"
1
+ require_relative 'base'
2
+
3
+ module Biblionet
4
+ module Crawlers
5
+
6
+ class BookCrawler < Base
7
+ def initialize(options = {})
8
+ options[:folder] ||= 'lib/bookshark/storage/html_book_pages'
9
+ options[:base_url] ||= 'http://www.biblionet.gr/book/'
10
+ options[:page_type] ||= 'book'
11
+ options[:extension] ||= '.html'
12
+ options[:save_only_content] ||= true
13
+ options[:start] ||= 1
14
+ options[:finish] ||= 10000
15
+ options[:step] ||= 1000
16
+ super(options)
17
+ end
18
+
19
+ def crawl_and_save
20
+ downloader = Extractors::Base.new
21
+
22
+ spider do |url_to_download, file_to_save|
23
+ downloader.load_page(url_to_download)
24
+
25
+ # Create a new directory (does nothing if directory exists)
26
+ path = File.dirname(file_to_save)
27
+ FileUtils.mkdir_p path unless File.directory?(path)
28
+
29
+ # No need to download the whole page. Just the part containing the book.
30
+ if @save_only_content
31
+ content_re = /<!-- CONTENT START -->.*<!-- CONTENT END -->/m
32
+ content = content_re.match(downloader.page)[0] unless (content_re.match(downloader.page)).nil?
33
+ downloader.save_to(file_to_save, content) unless downloader.page.nil? or downloader.page.length < 1024
34
+ else
35
+ downloader.save_page(file_to_save) unless downloader.page.nil? or downloader.page.length < 1024
36
+ end
37
+ end
38
+ end
39
+
40
+ end
54
41
 
55
- end
42
+ end
43
+ end