bookshark 1.0.0.pre.2 → 1.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +30 -30
- data/bookshark.gemspec +1 -0
- data/lib/bookshark.rb +35 -7
- data/lib/bookshark/crawlers/base.rb +11 -9
- data/lib/bookshark/crawlers/bibliographical_record_crawler.rb +43 -0
- data/lib/bookshark/crawlers/book_crawler.rb +42 -54
- data/lib/bookshark/extractors/bibliographical_book_extractor.rb +57 -35
- data/lib/bookshark/extractors/book_extractor.rb +9 -1
- data/lib/bookshark/storage/json_book_records/0/book_63.json +61 -0
- data/lib/bookshark/storage/json_book_records/0/book_67.json +53 -0
- data/lib/bookshark/storage/json_book_records/2/book_2110.json +59 -0
- data/lib/bookshark/storage/json_book_records/2/book_2111.json +65 -0
- data/lib/bookshark/storage/json_book_records/2/book_2112.json +69 -0
- data/lib/bookshark/storage/json_book_records/2/book_2113.json +59 -0
- data/lib/bookshark/storage/json_book_records/2/book_2114.json +67 -0
- data/lib/bookshark/storage/json_book_records/2/book_2115.json +71 -0
- data/lib/bookshark/storage/json_book_records/2/book_2116.json +63 -0
- data/lib/bookshark/storage/json_book_records/2/book_2117.json +61 -0
- data/lib/bookshark/storage/json_book_records/2/book_2118.json +83 -0
- data/lib/bookshark/storage/json_book_records/2/book_2119.json +69 -0
- data/lib/bookshark/storage/json_book_records/2/book_2120.json +69 -0
- data/lib/bookshark/storage/json_book_records/2/book_2121.json +63 -0
- data/lib/bookshark/storage/json_book_records/2/book_2122.json +72 -0
- data/lib/bookshark/storage/json_book_records/2/book_2123.json +67 -0
- data/lib/bookshark/storage/json_book_records/2/book_2124.json +72 -0
- data/lib/bookshark/storage/json_book_records/2/book_2125.json +67 -0
- data/lib/bookshark/storage/json_book_records/2/book_2126.json +72 -0
- data/lib/bookshark/storage/json_book_records/2/book_2127.json +61 -0
- data/lib/bookshark/storage/json_book_records/2/book_2128.json +61 -0
- data/lib/bookshark/storage/json_book_records/2/book_2129.json +61 -0
- data/lib/bookshark/storage/json_book_records/2/book_2130.json +72 -0
- data/lib/bookshark/storage/json_book_records/2/book_2131.json +55 -0
- data/lib/bookshark/storage/json_book_records/2/book_2132.json +61 -0
- data/lib/bookshark/storage/json_book_records/2/book_2133.json +61 -0
- data/lib/bookshark/storage/json_book_records/2/book_2134.json +61 -0
- data/lib/bookshark/storage/json_book_records/2/book_2135.json +55 -0
- data/lib/bookshark/storage/json_book_records/2/book_2136.json +67 -0
- data/lib/bookshark/storage/json_book_records/2/book_2137.json +67 -0
- data/lib/bookshark/storage/json_book_records/2/book_2138.json +57 -0
- data/lib/bookshark/storage/json_book_records/2/book_2139.json +67 -0
- data/lib/bookshark/storage/json_book_records/2/book_2140.json +53 -0
- data/lib/bookshark/storage/json_book_records/2/book_2141.json +61 -0
- data/lib/bookshark/storage/json_book_records/2/book_2142.json +67 -0
- data/lib/bookshark/storage/json_book_records/2/book_2143.json +65 -0
- data/lib/bookshark/storage/json_book_records/2/book_2144.json +64 -0
- data/lib/bookshark/storage/json_book_records/2/book_2145.json +53 -0
- data/lib/bookshark/storage/json_book_records/2/book_2146.json +70 -0
- data/lib/bookshark/storage/json_book_records/2/book_2147.json +67 -0
- data/lib/bookshark/storage/json_book_records/2/book_2148.json +66 -0
- data/lib/bookshark/storage/json_book_records/2/book_2149.json +72 -0
- data/lib/bookshark/storage/json_book_records/2/book_2150.json +53 -0
- data/lib/bookshark/storage/json_book_records/2/book_2151.json +67 -0
- data/lib/bookshark/storage/json_book_records/2/book_2152.json +67 -0
- data/lib/bookshark/storage/json_book_records/2/book_2153.json +67 -0
- data/lib/bookshark/storage/json_book_records/2/book_2154.json +67 -0
- data/lib/bookshark/storage/json_book_records/2/book_2155.json +67 -0
- data/lib/bookshark/storage/json_book_records/2/book_2156.json +76 -0
- data/lib/bookshark/storage/json_book_records/2/book_2157.json +65 -0
- data/lib/bookshark/storage/json_book_records/2/book_2158.json +77 -0
- data/lib/bookshark/storage/json_book_records/2/book_2159.json +76 -0
- data/lib/bookshark/storage/json_book_records/2/book_2160.json +67 -0
- data/lib/bookshark/storage/json_book_records/2/book_2161.json +61 -0
- data/lib/bookshark/storage/json_book_records/2/book_2162.json +65 -0
- data/lib/bookshark/storage/json_book_records/2/book_2163.json +68 -0
- data/lib/bookshark/storage/json_book_records/2/book_2164.json +59 -0
- data/lib/bookshark/storage/json_book_records/2/book_2165.json +59 -0
- data/lib/bookshark/storage/json_book_records/2/book_2166.json +53 -0
- data/lib/bookshark/storage/json_book_records/2/book_2167.json +53 -0
- data/lib/bookshark/storage/json_book_records/2/book_2168.json +53 -0
- data/lib/bookshark/storage/json_book_records/2/book_2169.json +53 -0
- data/lib/bookshark/storage/json_book_records/2/book_2170.json +53 -0
- data/lib/bookshark/version.rb +1 -1
- data/spec/bookshark_spec.rb +62 -46
- data/spec/spec_helper.rb +2 -1
- data/spec/test_data/bg_record_103788.html +1 -0
- data/spec/test_data/book_103788.html +1 -0
- metadata +88 -5
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: d35fee946c6b6dcf4ca740d89ba3a9cb89f36a94
|
4
|
+
data.tar.gz: ff928cdadd16b132adc9f193ff5c7f565a0b0398
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 2dac9ad4842172d896a488fa60baaf13d862c8d2e3b1e68b3a9f5e6ee28bec5136d4ed7d084c4bf6fa0f5f1cfd3224241958467c7df99929ea8cfc1c5ad92abc
|
7
|
+
data.tar.gz: e8f9dcb4f20e0a2330a91588c6dfbb306a74820ee9ed7fa7564097013c16244de46ffc3a636a751126f19ce9c1a32b209fb254d7533d901d6f77a00a6b8b5100
|
data/README.md
CHANGED
@@ -13,7 +13,7 @@ The representation of bibliographic metadata in JSON is inspired by [BibJSON](ht
|
|
13
13
|
Add this line to your application's Gemfile:
|
14
14
|
|
15
15
|
```ruby
|
16
|
-
gem 'bookshark', "~> 1.0
|
16
|
+
gem 'bookshark', "~> 1.0"
|
17
17
|
```
|
18
18
|
|
19
19
|
And then execute:
|
@@ -22,7 +22,7 @@ And then execute:
|
|
22
22
|
|
23
23
|
Or install it yourself as:
|
24
24
|
|
25
|
-
$ gem install bookshark
|
25
|
+
$ gem install bookshark
|
26
26
|
|
27
27
|
Require and include bookshark in your class/module.
|
28
28
|
|
@@ -85,7 +85,7 @@ extractor.book(isbn: '9789601411576')
|
|
85
85
|
# Extract book with id 103788 from website
|
86
86
|
extractor.book(id: 103788)
|
87
87
|
|
88
|
-
# Extract book from the provided webpage
|
88
|
+
# Extract book from the provided webpage
|
89
89
|
extractor.book(uri: 'http://biblionet.gr/book/103788/')
|
90
90
|
|
91
91
|
# Extract book with id 103788 from local storage
|
@@ -93,12 +93,12 @@ extractor.book(id: 103788, local: true)
|
|
93
93
|
```
|
94
94
|
For more options, like book's title or author, use the search method which is described below.
|
95
95
|
|
96
|
-
**Book Options**
|
96
|
+
**Book Options**
|
97
97
|
(The recommended option is to use just the id and let bookshark generate the uri):
|
98
98
|
|
99
99
|
* id : The id of book on the corresponding site (Integer)
|
100
100
|
* uri : The url of book web page or the path to local file.
|
101
|
-
* local : Boolean value. Has page been saved locally? (default is false)
|
101
|
+
* local : Boolean value. Has page been saved locally? (default is false)
|
102
102
|
* format : The format in which the extracted data are returned
|
103
103
|
* hash (default)
|
104
104
|
* json
|
@@ -112,9 +112,9 @@ puts Bookshark::Extractor.new(format: 'pretty_json').book(id: 185281)
|
|
112
112
|
|
113
113
|
#### Eager Extraction
|
114
114
|
|
115
|
-
Each book has some attributes such as authors, contributors, categories etc which are actually references to other objects.
|
116
|
-
By default when extracting a book, you get only names of these objects and references to their pages.
|
117
|
-
With eager option set to true, each of these objects' data is extracted and the produced output contains complete information about every object.
|
115
|
+
Each book has some attributes such as authors, contributors, categories etc which are actually references to other objects.
|
116
|
+
By default when extracting a book, you get only names of these objects and references to their pages.
|
117
|
+
With eager option set to true, each of these objects' data is extracted and the produced output contains complete information about every object.
|
118
118
|
Eager extraction doesn't work with local option enabled.
|
119
119
|
|
120
120
|
```ruby
|
@@ -215,24 +215,24 @@ extractor.search(title: 'αρχοντας', author: 'τολκιν', results_type
|
|
215
215
|
```
|
216
216
|
Searching and extracting several books can be very slow at times, so instead of extracting every single book you may prefer only the ids of found books. In that case pass the option `results_type: 'ids'`.
|
217
217
|
|
218
|
-
**Search Options**:
|
218
|
+
**Search Options**:
|
219
219
|
With enough options you can customize your query to your needs. It is recommended to use at least two of the search options.
|
220
220
|
|
221
|
-
* title (The title of book to search)
|
222
|
-
* author (The author's last name is enough for filter the search)
|
221
|
+
* title (The title of book to search)
|
222
|
+
* author (The author's last name is enough for filter the search)
|
223
223
|
* publisher
|
224
224
|
* category
|
225
225
|
* title_split
|
226
226
|
* 0 (The exact title phrase must be matched)
|
227
|
-
* 1 (Default - All the words in title must be matched in whatever order)
|
227
|
+
* 1 (Default - All the words in title must be matched in whatever order)
|
228
228
|
* 2 (At least one word should match)
|
229
|
-
* book_id (Providing id means only one book should returned)
|
230
|
-
* isbn
|
231
|
-
* author_id (ID of the selected author)
|
232
|
-
* publisher_id
|
233
|
-
* category_id
|
234
|
-
* after_year (Published this year or later)
|
235
|
-
* before_year (Published this year or before)
|
229
|
+
* book_id (Providing id means only one book should returned)
|
230
|
+
* isbn
|
231
|
+
* author_id (ID of the selected author)
|
232
|
+
* publisher_id
|
233
|
+
* category_id
|
234
|
+
* after_year (Published this year or later)
|
235
|
+
* before_year (Published this year or before)
|
236
236
|
* results_type
|
237
237
|
* metadata (Default - Every book is extracted and an array of metadata is returned)
|
238
238
|
* ids (Only ids are returned)
|
@@ -243,7 +243,7 @@ With enought options you can customize your query to your needs. It is recommend
|
|
243
243
|
|
244
244
|
Results with the ids option look like this:
|
245
245
|
|
246
|
-
```json
|
246
|
+
```json
|
247
247
|
{
|
248
248
|
"book": [
|
249
249
|
"119000",
|
@@ -271,7 +271,7 @@ Normally results are multiple books like the ones in book extractors:
|
|
271
271
|
{
|
272
272
|
"title": "Σημεία και τέρατα της οικονομίας",
|
273
273
|
"subtitle": "Η κρυφή πλευρά των πάντων",
|
274
|
-
"... Rest of Metadata ...": "... condensed ..."
|
274
|
+
"... Rest of Metadata ...": "... condensed ..."
|
275
275
|
},
|
276
276
|
{
|
277
277
|
"title": "Και άλλα σημεία και τέρατα από την ιστορία",
|
@@ -281,7 +281,7 @@ Normally results are multiple books like the ones in book extractors:
|
|
281
281
|
{
|
282
282
|
"title": "Σημεία και τέρατα από την ιστορία",
|
283
283
|
"subtitle": null,
|
284
|
-
"... Rest of Metadata ...": "... condensed ..."
|
284
|
+
"... Rest of Metadata ...": "... condensed ..."
|
285
285
|
}
|
286
286
|
]
|
287
287
|
}
|
@@ -304,7 +304,7 @@ extractor.author(uri: 'storage/html_author_pages/2/author_2423.html', local: tru
|
|
304
304
|
**Author Options**: (The recommended option is to use just the id and let bookshark generate the uri):
|
305
305
|
* id : The id of author on the corresponding site (Integer)
|
306
306
|
* uri : The url of author web page or the path to local file.
|
307
|
-
* local : Boolean value. Has page been saved locally? (default is false)
|
307
|
+
* local : Boolean value. Has page been saved locally? (default is false)
|
308
308
|
|
309
309
|
The expected result of an author extraction is something like this:
|
310
310
|
|
@@ -329,7 +329,7 @@ The expected result of an author extraction is something like this:
|
|
329
329
|
]
|
330
330
|
}
|
331
331
|
```
|
332
|
-
The convention here is that there is never just a single author, but instead the author hash is stored inside an array.
|
332
|
+
The convention here is that there is never just a single author, but instead the author hash is stored inside an array.
|
333
333
|
So, it is easy to include metadata for multiple authors, or even for multiple types of entities such as publishers or books, in the same JSON file.
|
334
334
|
|
335
335
|
### Extract Publisher Data
|
@@ -342,7 +342,7 @@ extractor = Extractor.new(format: 'pretty_json')
|
|
342
342
|
# Extract publisher with id 20 from website
|
343
343
|
extractor.publisher(id: 20)
|
344
344
|
|
345
|
-
# Extract publisher from the provided webpage
|
345
|
+
# Extract publisher from the provided webpage
|
346
346
|
extractor.publisher(uri: 'http://biblionet.gr/com/20/')
|
347
347
|
|
348
348
|
# Extract publisher with id 20 from local storage
|
@@ -352,7 +352,7 @@ extractor.publisher(id: 20, local: true)
|
|
352
352
|
|
353
353
|
* id : The id of publisher on the corresponding site (Integer)
|
354
354
|
* uri : The url of publisher web page or the path to local file.
|
355
|
-
* local : Boolean value. Has page been saved locally? (default is false)
|
355
|
+
* local : Boolean value. Has page been saved locally? (default is false)
|
356
356
|
* format : The format in which the extracted data are returned
|
357
357
|
* hash (default)
|
358
358
|
* json
|
@@ -397,7 +397,7 @@ The expected result of an author extraction is something like this:
|
|
397
397
|
],
|
398
398
|
"fax": "210 3650069",
|
399
399
|
"email": "info@patakis.gr",
|
400
|
-
"website": "www.patakis.gr"
|
400
|
+
"website": "www.patakis.gr"
|
401
401
|
}
|
402
402
|
},
|
403
403
|
"b_id": "20"
|
@@ -415,7 +415,7 @@ extractor = Extractor.new(format: 'pretty_json')
|
|
415
415
|
# Extract category with id 1041 from website
|
416
416
|
extractor.category(id: 1041)
|
417
417
|
|
418
|
-
# Extract category from the provided webpage
|
418
|
+
# Extract category from the provided webpage
|
419
419
|
extractor.category(uri: 'http://biblionet.gr/index/1041/')
|
420
420
|
|
421
421
|
# Extract category with id 1041 from local storage
|
@@ -425,7 +425,7 @@ extractor.category(id: 1041, local: true)
|
|
425
425
|
|
426
426
|
* id : The id of category on the corresponding site (Integer)
|
427
427
|
* uri : The url of category web page or the path to local file.
|
428
|
-
* local : Boolean value. Has page been saved locally? (default is false)
|
428
|
+
* local : Boolean value. Has page been saved locally? (default is false)
|
429
429
|
* format : The format in which the extracted data are returned
|
430
430
|
* hash (default)
|
431
431
|
* json
|
@@ -490,7 +490,7 @@ Take a look at this table:
|
|
490
490
|
|---------|:-----------:|----------------------------------|
|
491
491
|
| 103788 | book | http://biblionet.gr/book/103788 |
|
492
492
|
| 10207 | author | http://biblionet.gr/author/10207 |
|
493
|
-
| 20 | publisher | http://biblionet.gr/com/20 |
|
493
|
+
| 20 | publisher | http://biblionet.gr/com/20 |
|
494
494
|
| 1041 | category | http://biblionet.gr/index/1041 |
|
495
495
|
|
496
496
|
So if you want to use the uri option, provide the target webpage's url as seen above, without any slugs after the id.
|
data/bookshark.gemspec
CHANGED
data/lib/bookshark.rb
CHANGED
@@ -10,6 +10,8 @@ require 'bookshark/extractors/search'
|
|
10
10
|
|
11
11
|
require 'bookshark/crawlers/base'
|
12
12
|
require 'bookshark/crawlers/publisher_crawler'
|
13
|
+
require 'bookshark/crawlers/book_crawler'
|
14
|
+
require 'bookshark/crawlers/bibliographical_record_crawler'
|
13
15
|
|
14
16
|
module Bookshark
|
15
17
|
DEFAULTS ||= {
|
@@ -76,7 +78,8 @@ module Bookshark
|
|
76
78
|
|
77
79
|
uri = process_options(options, __method__)
|
78
80
|
options[:format] ||= @format
|
79
|
-
options[:eager] ||= false
|
81
|
+
options[:eager] ||= false
|
82
|
+
options[:nilify] ||= false
|
80
83
|
|
81
84
|
if options[:eager]
|
82
85
|
book = eager_extract_book(uri)
|
@@ -86,8 +89,12 @@ module Bookshark
|
|
86
89
|
|
87
90
|
response = {}
|
88
91
|
response[:book] = !book.nil? ? [book] : []
|
92
|
+
|
93
|
+
return nil if response[:book].empty? and options[:nilify]
|
94
|
+
|
89
95
|
response = change_format(response, options[:format])
|
90
|
-
|
96
|
+
|
97
|
+
response = book_extractor.decode_text(response) if response.class == "String"
|
91
98
|
|
92
99
|
return response
|
93
100
|
end
|
@@ -137,9 +144,9 @@ module Bookshark
|
|
137
144
|
return response
|
138
145
|
end
|
139
146
|
|
140
|
-
def books_from_storage
|
141
|
-
|
142
|
-
end
|
147
|
+
# def books_from_storage
|
148
|
+
# extract_from_storage_and_save('book', 'html_book_pages', 'json_book_pages')
|
149
|
+
# end
|
143
150
|
|
144
151
|
def authors_from_storage
|
145
152
|
extract_from_storage_and_save('author', 'html_author_pages', 'json_author_pages')
|
@@ -153,6 +160,17 @@ module Bookshark
|
|
153
160
|
extract_from_storage_and_save('category', 'html_category_pages', 'json_category_pages')
|
154
161
|
end
|
155
162
|
|
163
|
+
def extract_books_from_storage_and_save(start_id, finish_id, format = 'pretty_json')
|
164
|
+
start_id.upto(finish_id) do |book_id|
|
165
|
+
record = book(id: book_id, local: true, format: format, nilify: true)
|
166
|
+
|
167
|
+
dir_to_save = Bookshark.path_to_storage + '/' + 'json_book_records/' + "#{((book_id-1)/1000)}/" + "book_#{book_id}.json"
|
168
|
+
|
169
|
+
save_to(dir_to_save, record) unless record.nil?
|
170
|
+
end
|
171
|
+
end
|
172
|
+
|
173
|
+
|
156
174
|
def extract_from_storage_and_save(metadata_type, source_dir, target_dir)
|
157
175
|
list_directories(path: Bookshark.path_to_storage + '/' + source_dir).each do |dir|
|
158
176
|
dir_to_save = dir.gsub(source_dir, target_dir)
|
@@ -168,8 +186,8 @@ module Bookshark
|
|
168
186
|
record = author(options)
|
169
187
|
when 'publisher'
|
170
188
|
record = publisher(options)
|
171
|
-
when 'book'
|
172
|
-
|
189
|
+
# when 'book'
|
190
|
+
# record = book(options)
|
173
191
|
when 'category'
|
174
192
|
record = category(options)
|
175
193
|
end
|
@@ -334,6 +352,16 @@ module Bookshark
|
|
334
352
|
crawler.crawl_and_save
|
335
353
|
end
|
336
354
|
|
355
|
+
def books(options = {})
|
356
|
+
crawler = Biblionet::Crawlers::BookCrawler.new(options)
|
357
|
+
crawler.crawl_and_save
|
358
|
+
end
|
359
|
+
|
360
|
+
def bibliographical_records(options = {})
|
361
|
+
crawler = Biblionet::Crawlers::BibliographicalRecordCrawler.new(options)
|
362
|
+
crawler.crawl_and_save
|
363
|
+
end
|
364
|
+
|
337
365
|
end
|
338
366
|
|
339
367
|
# module Biblionet
|
@@ -5,13 +5,14 @@ module Biblionet
|
|
5
5
|
|
6
6
|
class Base
|
7
7
|
def initialize(options = {})
|
8
|
-
@folder
|
9
|
-
@base_url
|
10
|
-
@page_type
|
11
|
-
@extension
|
12
|
-
@
|
13
|
-
@
|
14
|
-
@
|
8
|
+
@folder = options[:folder] ||= 'lib/bookshark/storage/html_base_pages'
|
9
|
+
@base_url = options[:base_url] ||= 'http://www.biblionet.gr/base/'
|
10
|
+
@page_type = options[:page_type] ||= 'base'
|
11
|
+
@extension = options[:extension] ||= '.html'
|
12
|
+
@save_only_content = options[:save_only_content] ||= false
|
13
|
+
@start = options[:start] ||= 1
|
14
|
+
@finish = options[:finish] ||= 10000
|
15
|
+
@step = options[:step] ||= 1000
|
15
16
|
end
|
16
17
|
|
17
18
|
def spider
|
@@ -20,7 +21,8 @@ module Biblionet
|
|
20
21
|
|
21
22
|
start.step(finish, @step) do |last|
|
22
23
|
first = last - @step + 1
|
23
|
-
subfolder = (last/@step - 1).to_s
|
24
|
+
subfolder = (last/@step - 1).to_s
|
25
|
+
slash = (@page_type != 'bg_record') ? '/' : ''
|
24
26
|
path = "#{@folder}/#{subfolder}/"
|
25
27
|
|
26
28
|
# Create a new directory (does nothing if directory exists)
|
@@ -28,7 +30,7 @@ module Biblionet
|
|
28
30
|
|
29
31
|
first.upto(last) do |id|
|
30
32
|
file_to_save = "#{path}#{@page_type}_#{id}#{@extension}"
|
31
|
-
url_to_download = "#{@base_url}#{id}
|
33
|
+
url_to_download = "#{@base_url}#{id}#{slash}"
|
32
34
|
|
33
35
|
yield(url_to_download, file_to_save)
|
34
36
|
# downloader = Biblionet::Core::Base.new(url_to_download)
|
@@ -0,0 +1,43 @@
|
|
1
|
+
require_relative 'base'
|
2
|
+
|
3
|
+
module Biblionet
|
4
|
+
module Crawlers
|
5
|
+
|
6
|
+
class BibliographicalRecordCrawler < Base
|
7
|
+
def initialize(options = {})
|
8
|
+
options[:folder] ||= 'lib/bookshark/storage/html_book_pages'
|
9
|
+
options[:base_url] ||= 'http://www.biblionet.gr/main.asp?page=results&Titlesid='
|
10
|
+
options[:page_type] ||= 'bg_record'
|
11
|
+
options[:extension] ||= '.html'
|
12
|
+
options[:save_only_content] ||= true
|
13
|
+
options[:start] ||= 176001
|
14
|
+
options[:finish] ||= 180000
|
15
|
+
options[:step] ||= 1000
|
16
|
+
super(options)
|
17
|
+
end
|
18
|
+
|
19
|
+
def crawl_and_save
|
20
|
+
downloader = Extractors::Base.new
|
21
|
+
|
22
|
+
spider do |url_to_download, file_to_save|
|
23
|
+
downloader.load_page(url_to_download)
|
24
|
+
|
25
|
+
# Create a new directory (does nothing if directory exists)
|
26
|
+
path = File.dirname(file_to_save)
|
27
|
+
FileUtils.mkdir_p path unless File.directory?(path)
|
28
|
+
|
29
|
+
# No need to download the whole page. Just the part containing the book.
|
30
|
+
if @save_only_content
|
31
|
+
content_re = /<!-- CONTENT START -->.*<!-- CONTENT END -->/m
|
32
|
+
content = content_re.match(downloader.page)[0] unless (content_re.match(downloader.page)).nil?
|
33
|
+
downloader.save_to(file_to_save, content) unless downloader.page.nil? or downloader.page.length < 1024
|
34
|
+
else
|
35
|
+
downloader.save_page(file_to_save) unless downloader.page.nil? or downloader.page.length < 1024
|
36
|
+
end
|
37
|
+
|
38
|
+
end
|
39
|
+
end
|
40
|
+
end
|
41
|
+
|
42
|
+
end
|
43
|
+
end
|
@@ -1,55 +1,43 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
# open(file_to_save, "w") do |file|
|
42
|
-
# file.write(page)
|
43
|
-
# end
|
44
|
-
# saved_pages += 1
|
45
|
-
# else
|
46
|
-
# puts "Page #{file_to_save} seems to be empty..."
|
47
|
-
# empty_pages += 1
|
48
|
-
# end
|
49
|
-
# end
|
50
|
-
end
|
51
|
-
|
52
|
-
# puts "Saved Pages: #{saved_pages}"
|
53
|
-
# puts "Empty Pages: #{empty_pages}"
|
1
|
+
require_relative 'base'
|
2
|
+
|
3
|
+
module Biblionet
|
4
|
+
module Crawlers
|
5
|
+
|
6
|
+
class BookCrawler < Base
|
7
|
+
def initialize(options = {})
|
8
|
+
options[:folder] ||= 'lib/bookshark/storage/html_book_pages'
|
9
|
+
options[:base_url] ||= 'http://www.biblionet.gr/book/'
|
10
|
+
options[:page_type] ||= 'book'
|
11
|
+
options[:extension] ||= '.html'
|
12
|
+
options[:save_only_content] ||= true
|
13
|
+
options[:start] ||= 1
|
14
|
+
options[:finish] ||= 10000
|
15
|
+
options[:step] ||= 1000
|
16
|
+
super(options)
|
17
|
+
end
|
18
|
+
|
19
|
+
def crawl_and_save
|
20
|
+
downloader = Extractors::Base.new
|
21
|
+
|
22
|
+
spider do |url_to_download, file_to_save|
|
23
|
+
downloader.load_page(url_to_download)
|
24
|
+
|
25
|
+
# Create a new directory (does nothing if directory exists)
|
26
|
+
path = File.dirname(file_to_save)
|
27
|
+
FileUtils.mkdir_p path unless File.directory?(path)
|
28
|
+
|
29
|
+
# No need to download the whole page. Just the part containing the book.
|
30
|
+
if @save_only_content
|
31
|
+
content_re = /<!-- CONTENT START -->.*<!-- CONTENT END -->/m
|
32
|
+
content = content_re.match(downloader.page)[0] unless (content_re.match(downloader.page)).nil?
|
33
|
+
downloader.save_to(file_to_save, content) unless downloader.page.nil? or downloader.page.length < 1024
|
34
|
+
else
|
35
|
+
downloader.save_page(file_to_save) unless downloader.page.nil? or downloader.page.length < 1024
|
36
|
+
end
|
37
|
+
end
|
38
|
+
end
|
39
|
+
|
40
|
+
end
|
54
41
|
|
55
|
-
end
|
42
|
+
end
|
43
|
+
end
|