scruber 0.1.4 → 0.1.5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/core_ext/const_missing.rb +9 -0
- data/lib/scruber/cli/project_generator.rb +2 -0
- data/lib/scruber/cli/templates/application.tt +1 -0
- data/lib/scruber/cli.rb +2 -0
- data/lib/scruber/core/configuration.rb +24 -1
- data/lib/scruber/core/crawler.rb +165 -7
- data/lib/scruber/core/extensions/base.rb +10 -0
- data/lib/scruber/core/extensions/csv_output.rb +21 -0
- data/lib/scruber/core/extensions/log.rb +39 -0
- data/lib/scruber/core/extensions/loop.rb +34 -0
- data/lib/scruber/core/extensions/parser_aliases.rb +24 -0
- data/lib/scruber/core/extensions/queue_aliases.rb +15 -0
- data/lib/scruber/core/extensions/seed.rb +23 -0
- data/lib/scruber/fetcher_adapters/abstract_adapter.rb +14 -14
- data/lib/scruber/fetcher_adapters/typhoeus_fetcher.rb +1 -1
- data/lib/scruber/queue_adapters/abstract_adapter.rb +149 -10
- data/lib/scruber/queue_adapters/memory.rb +139 -9
- data/lib/scruber/version.rb +1 -1
- data/lib/scruber.rb +23 -8
- data/scruber.gemspec +6 -4
- metadata +71 -10
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 4661c429c6b33a12841569c62835cc587e7f9464
|
4
|
+
data.tar.gz: 9cb87a48248746b30d2ece9db71136560629bdcb
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: c86674726ae45109383e8d0712612895136384eba1414e061c7d1de3dd9b699af6ab4022de037a537b1377a95b46be2aba10b4e9cea1664449ec880ae1b8189d
|
7
|
+
data.tar.gz: 738d34f37fdc629eb3f4755fffeb70e640e53bd7e33333357af0f9af286d648dd92846220a7ef0f9d06918e412fdf8c801a0b25e15cf74b912b0e0459e04009d
|
@@ -19,6 +19,8 @@ module Scruber
|
|
19
19
|
raise ::Thor::Error, "ERROR: #{path} already exists." if File.exist?(path)
|
20
20
|
say "Creating scruber project at #{path}"
|
21
21
|
FileUtils.mkdir_p(path)
|
22
|
+
FileUtils.mkdir_p(path+'/lib')
|
23
|
+
FileUtils.mkdir_p(path+'/log')
|
22
24
|
end
|
23
25
|
|
24
26
|
def create_files
|
data/lib/scruber/cli.rb
CHANGED
@@ -16,6 +16,7 @@ module Scruber
|
|
16
16
|
register Generators, 'generate', 'generate [GENERATOR]', 'Generate something'
|
17
17
|
|
18
18
|
desc 'start', 'Run scraper'
|
19
|
+
method_option :silent, :type => :boolean, :aliases => '-s', default: false
|
19
20
|
def start(name)
|
20
21
|
if defined?(APP_PATH)
|
21
22
|
scraper_path = Scruber::AppSearcher.find_scraper(name, APP_PATH)
|
@@ -28,6 +29,7 @@ module Scruber
|
|
28
29
|
ENV['SCRUBER_SCRAPER_NAME'] = File.basename(scraper_path).gsub(/\.rb\Z/, '').underscore
|
29
30
|
say "starting #{ENV['SCRUBER_SCRAPER_NAME']}"
|
30
31
|
|
32
|
+
Scruber.configuration.silent = options[:silent]
|
31
33
|
require scraper_path
|
32
34
|
else
|
33
35
|
raise ::Thor::Error, "ERROR: Scruber project not found."
|
@@ -1,12 +1,26 @@
|
|
1
1
|
module Scruber
|
2
2
|
module Core
|
3
|
+
#
|
4
|
+
# Configuration class
|
5
|
+
#
|
6
|
+
# @author Ivan Goncharov
|
7
|
+
# @attr [Symbol] fetcher_adapter Fetcher adapter name
|
8
|
+
# @attr [Hash] fetcher_options Fetcher options, see {Scruber::FetcherAdapters::AbstractAdapter} options
|
9
|
+
# @attr [Symbol] fetcher_agent_adapter Fetcher agent adapter name
|
10
|
+
# @attr [Hash] fetcher_agent_options Fetcher agent options, see {Scruber::Helpers::FetcherAgentAdapters::AbstractAdapter}
|
11
|
+
# @attr [Symbol] queue_adapter Queue adapter name
|
12
|
+
# @attr [Hash] queue_options Queue options, see {Scruber::QueueAdapters::AbstractAdapter}
|
13
|
+
# @attr [Array<String>] autoload_paths Array with paths for autoloading classes
|
14
|
+
# @attr [Boolean] silent Don't output anything if true
|
3
15
|
class Configuration
|
4
16
|
attr_accessor :fetcher_adapter,
|
5
17
|
:fetcher_options,
|
6
18
|
:fetcher_agent_adapter,
|
7
19
|
:fetcher_agent_options,
|
8
20
|
:queue_adapter,
|
9
|
-
:queue_options
|
21
|
+
:queue_options,
|
22
|
+
:autoload_paths,
|
23
|
+
:silent
|
10
24
|
|
11
25
|
def initialize
|
12
26
|
@fetcher_adapter = :typhoeus_fetcher
|
@@ -15,8 +29,15 @@ module Scruber
|
|
15
29
|
@fetcher_agent_options = {}
|
16
30
|
@queue_adapter = :memory
|
17
31
|
@queue_options = {}
|
32
|
+
@autoload_paths = []
|
33
|
+
@silent = false
|
18
34
|
end
|
19
35
|
|
36
|
+
#
|
37
|
+
# Merge options from hash
|
38
|
+
# @param options [Hash] options
|
39
|
+
#
|
40
|
+
# @return [void]
|
20
41
|
def merge_options(options)
|
21
42
|
@fetcher_adapter = options.fetch(:fetcher_adapter){ @fetcher_adapter }
|
22
43
|
@fetcher_options.merge! options.fetch(:fetcher_options){ {} }
|
@@ -24,6 +45,8 @@ module Scruber
|
|
24
45
|
@fetcher_agent_options.merge! options.fetch(:fetcher_agent_options){ {} }
|
25
46
|
@queue_adapter = options.fetch(:queue_adapter){ @queue_adapter }
|
26
47
|
@queue_options.merge! options.fetch(:queue_options){ {} }
|
48
|
+
@autoload_paths += options.fetch(:autoload_paths){ [] }
|
49
|
+
@silent = options.fetch(:silent){ false }
|
27
50
|
end
|
28
51
|
end
|
29
52
|
end
|
data/lib/scruber/core/crawler.rb
CHANGED
@@ -1,8 +1,33 @@
|
|
1
1
|
module Scruber
|
2
2
|
module Core
|
3
|
+
#
|
4
|
+
# Crawler class
|
5
|
+
#
|
6
|
+
# Main class-runner for scrapers.
|
7
|
+
#
|
8
|
+
# @example Simple scraper
|
9
|
+
# Scruber::Core::Crawler.new(:simple) do
|
10
|
+
# get 'http://example.com'
|
11
|
+
# parse :html do |page,html|
|
12
|
+
# puts html.at('title').text
|
13
|
+
# end
|
14
|
+
# end
|
15
|
+
#
|
16
|
+
# @author Ivan Goncharov
|
17
|
+
#
|
3
18
|
class Crawler
|
4
19
|
attr_reader :queue, :fetcher, :scraper_name
|
5
20
|
|
21
|
+
#
|
22
|
+
# Initialize crawler with scraper name and/or with options
|
23
|
+
#
|
24
|
+
# Crawler.new(:sample, fetcher_adapter: :custom)
|
25
|
+
# Crawler.new(:sample)
|
26
|
+
# Crawler.new(fetcher_adapter: :custom)
|
27
|
+
#
|
28
|
+
# @param args [Array] if first arg is a Symbol, it will be used as scraper_name, hash will be used as configuration options (see {Scruber::Core::Configuration})
|
29
|
+
#
|
30
|
+
# @return [Scruber::Core::Crawler] [description]
|
6
31
|
def initialize(*args)
|
7
32
|
if args.first.is_a?(Hash)
|
8
33
|
scraper_name = nil
|
@@ -14,23 +39,29 @@ module Scruber
|
|
14
39
|
@scraper_name = scraper_name.present? ? scraper_name : ENV['SCRUBER_SCRAPER_NAME']
|
15
40
|
raise Scruber::ArgumentError.new("Scraper name is empty. Pass it to `Scruber.run :name do` or through ENV['SCRUBER_SCRAPER_NAME']") if @scraper_name.blank?
|
16
41
|
@scraper_name = @scraper_name.to_sym
|
17
|
-
Scruber.configuration.merge_options(options)
|
18
42
|
@callbacks_options = {}
|
19
43
|
@callbacks = {}
|
44
|
+
@on_page_error_callback = nil
|
20
45
|
@on_complete_callbacks = []
|
21
|
-
|
46
|
+
|
47
|
+
Scruber.configuration.merge_options(options)
|
48
|
+
ActiveSupport::Dependencies.autoload_paths = Scruber.configuration.autoload_paths
|
49
|
+
|
50
|
+
@queue = Scruber::Queue.new(scraper_name: @scraper_name)
|
22
51
|
@fetcher = Scruber::Fetcher.new
|
52
|
+
initialize_progressbar
|
23
53
|
load_extenstions
|
24
54
|
end
|
25
55
|
|
26
56
|
#
|
27
|
-
#
|
57
|
+
# Crawling engine
|
28
58
|
#
|
29
59
|
# @param block [Proc] crawler body
|
30
60
|
def run(&block)
|
31
61
|
instance_eval &block
|
32
62
|
while @queue.has_work? do
|
33
63
|
@fetcher.run @queue
|
64
|
+
show_progress
|
34
65
|
while page = @queue.fetch_downloaded do
|
35
66
|
if @callbacks[page.page_type.to_sym]
|
36
67
|
processed_page = process_page(page, page.page_type.to_sym)
|
@@ -38,16 +69,40 @@ module Scruber
|
|
38
69
|
page.processed! unless page.sent_to_redownload?
|
39
70
|
end
|
40
71
|
end
|
72
|
+
if @on_page_error_callback
|
73
|
+
while page = @queue.fetch_error do
|
74
|
+
instance_exec page, &(@on_page_error_callback)
|
75
|
+
end
|
76
|
+
end
|
41
77
|
end
|
42
78
|
@on_complete_callbacks.sort_by{|c| -c[0] }.each do |(_,callback)|
|
43
79
|
instance_exec &(callback)
|
44
80
|
end
|
45
81
|
end
|
46
82
|
|
83
|
+
#
|
84
|
+
# Register parser
|
85
|
+
#
|
86
|
+
# @param page_type [Symbol] type of page
|
87
|
+
# @param options [Hash] options for parser
|
88
|
+
# @option options [Symbol] :format format of page. Scruber automatically process
|
89
|
+
# page body depends on this format. For example :json or :html
|
90
|
+
# @param block [Proc] body of parser
|
91
|
+
#
|
92
|
+
# @return [void]
|
47
93
|
def parser(page_type, options={}, &block)
|
48
94
|
register_callback(page_type, options, &block)
|
49
95
|
end
|
50
96
|
|
97
|
+
#
|
98
|
+
# Method missing callback. Scruber allows to register
|
99
|
+
# regexp and proc body to process calls
|
100
|
+
#
|
101
|
+
# @param method_sym [Symbol] missing method name
|
102
|
+
# @param arguments [Array] arguments
|
103
|
+
# @param block [Proc] block (if passed)
|
104
|
+
#
|
105
|
+
# @return [type] [description]
|
51
106
|
def method_missing(method_sym, *arguments, &block)
|
52
107
|
Scruber::Core::Crawler._registered_method_missings.each do |(pattern, func)|
|
53
108
|
if (scan_results = method_sym.to_s.scan(pattern)).present?
|
@@ -68,34 +123,137 @@ module Scruber
|
|
68
123
|
end
|
69
124
|
|
70
125
|
class << self
|
126
|
+
|
127
|
+
#
|
128
|
+
# Register method missing callback
|
129
|
+
#
|
130
|
+
# @param pattern [Regexp] Regexp to match missing name
|
131
|
+
# @param block [Proc] Body to process missing method
|
132
|
+
#
|
133
|
+
# @return [void]
|
71
134
|
def register_method_missing(pattern, &block)
|
72
135
|
_registered_method_missings[pattern] = block
|
73
136
|
end
|
74
137
|
|
138
|
+
#
|
139
|
+
# Registered method missing callbacks dictionary
|
140
|
+
#
|
141
|
+
# @return [Hash] callbacks
|
75
142
|
def _registered_method_missings
|
76
143
|
@registered_method_missings ||= {}
|
77
144
|
end
|
78
145
|
end
|
79
146
|
|
147
|
+
#
|
148
|
+
# Register callback which will be executed when
|
149
|
+
# downloading and parsing will be completed.
|
150
|
+
# For example when you need to write results to file,
|
151
|
+
# or to close files.
|
152
|
+
# @example Close file descriptors
|
153
|
+
# on_complete -1 do
|
154
|
+
# Scruber::Core::Extensions::CsvOutput.close_all
|
155
|
+
# end
|
156
|
+
#
|
157
|
+
# @param priority [Integer] priority of this callback
|
158
|
+
# @param block [Proc] body of callback
|
159
|
+
#
|
160
|
+
# @return [void]
|
161
|
+
def on_complete(priority=1, &block)
|
162
|
+
@on_complete_callbacks.push [priority,block]
|
163
|
+
end
|
164
|
+
|
165
|
+
#
|
166
|
+
# Register callback which will be executed for
|
167
|
+
# error pages, like 404 or 500
|
168
|
+
# Attention! You should call one of these methods for page
|
169
|
+
# to prevent infinite loop: page.processed!, page.delete, page.redownload!(0)
|
170
|
+
# @example Processing error page
|
171
|
+
# on_page_error do |page|
|
172
|
+
# if page.response_body =~ /distil/
|
173
|
+
# page.redownload!(0)
|
174
|
+
# elsif page.response_code == 404
|
175
|
+
# get page.at('a.moved_to').attr('href')
|
176
|
+
# page.processed!
|
177
|
+
# else
|
178
|
+
# page.delete
|
179
|
+
# end
|
180
|
+
# end
|
181
|
+
#
|
182
|
+
# @param block [Proc] body of callback
|
183
|
+
#
|
184
|
+
# @return [void]
|
185
|
+
def on_page_error(&block)
|
186
|
+
@on_page_error_callback = block
|
187
|
+
end
|
188
|
+
|
80
189
|
private
|
81
190
|
|
191
|
+
#
|
192
|
+
# Register parser
|
193
|
+
#
|
194
|
+
# @param page_type [Symbol] type of page
|
195
|
+
# @param options [Hash] options for parser
|
196
|
+
# @option options [Symbol] :format format of page. Scruber automatically process
|
197
|
+
# page body depends on this format. For example :json or :html
|
198
|
+
# @param block [Proc] body of parser
|
199
|
+
#
|
200
|
+
# @return [void]
|
82
201
|
def register_callback(page_type, options, &block)
|
83
202
|
@callbacks_options[page_type.to_sym] = options || {}
|
84
203
|
@callbacks[page_type.to_sym] = block
|
85
204
|
end
|
86
205
|
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
206
|
+
#
|
207
|
+
# Process page body depends on format of this page
|
208
|
+
# For example, if page_format = :html, then
|
209
|
+
# it will return Nokogiri::HTML(page.response_body)
|
210
|
+
#
|
211
|
+
# @param page [Page] page from queue
|
212
|
+
# @param page_type [Symbol] name of parser
|
213
|
+
#
|
214
|
+
# @return [Object] depends on page_type it will return different objects
|
91
215
|
def process_page(page, page_type)
|
92
216
|
page_format = @callbacks_options[page_type].fetch(:format){ nil }
|
93
217
|
Scruber::Core::PageFormat.process(page, page_format)
|
94
218
|
end
|
95
219
|
|
220
|
+
#
|
221
|
+
# Loads all extensions
|
222
|
+
#
|
223
|
+
# @return [void]
|
96
224
|
def load_extenstions
|
97
225
|
Scruber::Core::Extensions::Base.descendants.each(&:register)
|
98
226
|
end
|
227
|
+
|
228
|
+
#
|
229
|
+
# Initialize progressbar, that shows progress in console
|
230
|
+
#
|
231
|
+
# @return [void]
|
232
|
+
def initialize_progressbar
|
233
|
+
unless Scruber.configuration.silent
|
234
|
+
@progressbar = PowerBar.new
|
235
|
+
@progressbar.settings.tty.finite.template.main = "${<msg>} ${<bar> }\e[0m \e[33;1m${<percent>%} (${<done>/<total>})"
|
236
|
+
@progressbar.settings.tty.finite.template.padchar = "\e[30;1m#{@progressbar.settings.tty.finite.template.padchar}"
|
237
|
+
@progressbar.settings.tty.finite.template.barchar = "\e[34;1m#{@progressbar.settings.tty.finite.template.barchar}"
|
238
|
+
@progressbar.settings.tty.finite.template.exit = "\e[?25h\e[0m" # clean up after us
|
239
|
+
@progressbar.settings.tty.finite.template.close = "\e[?25h\e[0m\n" # clean up after us
|
240
|
+
@progressbar.settings.tty.finite.output = Proc.new{ |s|
|
241
|
+
$stderr.print s
|
242
|
+
}
|
243
|
+
end
|
244
|
+
end
|
245
|
+
|
246
|
+
#
|
247
|
+
# Output progress to console
|
248
|
+
#
|
249
|
+
# @return [void]
|
250
|
+
def show_progress
|
251
|
+
if @progressbar
|
252
|
+
s = queue.size
|
253
|
+
@progressbar.show({:msg => @proggress_status, :done => queue.downloaded_count, :total => s}) unless s.zero?
|
254
|
+
end
|
255
|
+
end
|
99
256
|
end
|
257
|
+
|
100
258
|
end
|
101
259
|
end
|
@@ -1,12 +1,22 @@
|
|
1
1
|
module Scruber
|
2
2
|
module Core
|
3
3
|
module Extensions
|
4
|
+
#
|
5
|
+
# Base class for extensions
|
6
|
+
# @abstract
|
7
|
+
#
|
8
|
+
# @author [revis0r]
|
9
|
+
#
|
4
10
|
class Base
|
5
11
|
module CoreMethods
|
6
12
|
|
7
13
|
end
|
8
14
|
|
9
15
|
class << self
|
16
|
+
#
|
17
|
+
# Register extension in crawler core
|
18
|
+
#
|
19
|
+
# @return [void]
|
10
20
|
def register
|
11
21
|
Scruber::Core::Crawler.include self.const_get(:CoreMethods)
|
12
22
|
end
|
@@ -1,6 +1,20 @@
|
|
1
1
|
module Scruber
|
2
2
|
module Core
|
3
3
|
module Extensions
|
4
|
+
#
|
5
|
+
# Helper to write csv files
|
6
|
+
# @example Writing log and products data
|
7
|
+
# Scruber.run :simple do
|
8
|
+
# csv_file Scruber.root.join('log.csv'), col_sep: ';'
|
9
|
+
# csv_products_file Scruber.root.join('products.csv'), col_sep: ';'
|
10
|
+
#
|
11
|
+
# csv_out [Time.now.to_i, 'sample log record']
|
12
|
+
# csv_product_out ['ID', 'Title']
|
13
|
+
# csv_product_out ['1', 'Soap']
|
14
|
+
# end
|
15
|
+
#
|
16
|
+
# @author Ivan Goncharov
|
17
|
+
#
|
4
18
|
class CsvOutput < Base
|
5
19
|
module CoreMethods
|
6
20
|
def csv_file(path, options={})
|
@@ -16,6 +30,13 @@ module Scruber
|
|
16
30
|
Scruber::Core::Extensions::CsvOutput.csv_out :default, fields
|
17
31
|
end
|
18
32
|
|
33
|
+
#
|
34
|
+
# Registering method missing callbacks on including
|
35
|
+
# to crawling class
|
36
|
+
#
|
37
|
+
# @param base [Class] class where module was included
|
38
|
+
#
|
39
|
+
# @return [void]
|
19
40
|
def self.included(base)
|
20
41
|
Scruber::Core::Crawler.register_method_missing /\Acsv_(\w+)_file\Z/ do |meth, scan_results, args|
|
21
42
|
file_id = scan_results.first.first.to_sym
|
@@ -0,0 +1,39 @@
|
|
1
|
+
module Scruber
|
2
|
+
module Core
|
3
|
+
module Extensions
|
4
|
+
#
|
5
|
+
# Logging class
|
6
|
+
# Writes logs to file and console, depending on configuration
|
7
|
+
#
|
8
|
+
# @author Ivan Goncharov
|
9
|
+
#
|
10
|
+
class Log < Base
|
11
|
+
module CoreMethods
|
12
|
+
#
|
13
|
+
# Writing log
|
14
|
+
#
|
15
|
+
# @param text [String] text
|
16
|
+
# @param color [Symbol] color of text to write
|
17
|
+
#
|
18
|
+
# @return [void]
|
19
|
+
def log(text, color=:white)
|
20
|
+
Scruber.logger.info(scraper_name){ text } rescue nil
|
21
|
+
if @progressbar
|
22
|
+
@progressbar.print "#{Paint[text, color]}\n"
|
23
|
+
end
|
24
|
+
end
|
25
|
+
|
26
|
+
#
|
27
|
+
# Setting status for console progressbar
|
28
|
+
#
|
29
|
+
# @param text [String] text
|
30
|
+
#
|
31
|
+
# @return [void]
|
32
|
+
def set_status(text)
|
33
|
+
@proggress_status = text
|
34
|
+
end
|
35
|
+
end
|
36
|
+
end
|
37
|
+
end
|
38
|
+
end
|
39
|
+
end
|
@@ -1,13 +1,47 @@
|
|
1
1
|
module Scruber
|
2
2
|
module Core
|
3
3
|
module Extensions
|
4
|
+
#
|
5
|
+
# Helper for reading dictionaries.
|
6
|
+
# @example Adding dictionary and reading it
|
7
|
+
# Scruber.run :sample do
|
8
|
+
# add_dictionary :zip_codes_usa, Scruber.root.join('dict', 'zip_codes_usa.csv'), :csv
|
9
|
+
# seed do
|
10
|
+
# loop :zip_codes_usa, state: 'NY' do |row|
|
11
|
+
# get 'https://example.com/by_zip/'+row['zip'].to_s
|
12
|
+
# end
|
13
|
+
# end
|
14
|
+
# end
|
15
|
+
#
|
16
|
+
# @author Ivan Goncharov
|
17
|
+
#
|
4
18
|
class Loop < Base
|
5
19
|
module CoreMethods
|
20
|
+
#
|
21
|
+
# Iterate records from dictionary
|
22
|
+
#
|
23
|
+
# @param dictionary [Symbol] name of dictionary
|
24
|
+
# @param options [Hash] search conditions
|
25
|
+
# @param block [Proc] body, yields row of dictionary
|
26
|
+
#
|
27
|
+
# @return [void]
|
6
28
|
def loop(dictionary, options={}, &block)
|
7
29
|
Scruber::Core::Extensions::Loop.loop dictionary, options do |*args|
|
8
30
|
instance_exec *args, &block
|
9
31
|
end
|
10
32
|
end
|
33
|
+
|
34
|
+
#
|
35
|
+
# Registering dictionary in system
|
36
|
+
#
|
37
|
+
# @param name [Symbol] name of dictionary
|
38
|
+
# @param file_path [String] path to file
|
39
|
+
# @param file_type [Symbol] type of file, :xml, :csv, etc..
|
40
|
+
#
|
41
|
+
# @return [void]
|
42
|
+
def add_dictionary(name, file_path, file_type)
|
43
|
+
Scruber::Core::Extensions::Loop.add_dictionary(name, file_path, file_type)
|
44
|
+
end
|
11
45
|
end
|
12
46
|
|
13
47
|
class << self
|
@@ -1,6 +1,30 @@
|
|
1
1
|
module Scruber
|
2
2
|
module Core
|
3
3
|
module Extensions
|
4
|
+
#
|
5
|
+
# DSL for registering parsers.
|
6
|
+
# @example Sample of DSL
|
7
|
+
# Scruber.run :sample do
|
8
|
+
# get 'https://example.com'
|
9
|
+
# get_product 'https://example.com/product1.html'
|
10
|
+
#
|
11
|
+
# # Parsing https://example.com
|
12
|
+
# parse :html do |page,doc|
|
13
|
+
# log doc.at('title').text
|
14
|
+
# end
|
15
|
+
#
|
16
|
+
# # Parsing https://example.com/product1.html
|
17
|
+
# parse_product :html do |page,doc|
|
18
|
+
# log doc.at('title').text
|
19
|
+
# end
|
20
|
+
# # Alias to
|
21
|
+
# # parser :product, format: :html do |page,doc|
|
22
|
+
# # log doc.at('title').text
|
23
|
+
# # end
|
24
|
+
# end
|
25
|
+
#
|
26
|
+
# @author Ivan Goncharov
|
27
|
+
#
|
4
28
|
class ParserAliases < Base
|
5
29
|
module CoreMethods
|
6
30
|
def parse(*args, &block)
|
@@ -1,6 +1,21 @@
|
|
1
1
|
module Scruber
|
2
2
|
module Core
|
3
3
|
module Extensions
|
4
|
+
#
|
5
|
+
# DSL for adding pages to queue
|
6
|
+
# @example Sample of DSL
|
7
|
+
# Scruber.run :sample do
|
8
|
+
# get_product 'https://example.com/product1.html'
|
9
|
+
# # Alias to
|
10
|
+
# # queue.add 'https://example.com/product1.html', page_type: :product
|
11
|
+
#
|
12
|
+
# post_subscribe 'https://example.com/subscribe', body: { email: 'sample@example.com' }
|
13
|
+
# # Alias to
|
14
|
+
# # queue.add 'https://example.com/subscribe', method: :post, page_type: :subscribe, body: { email: 'sample@example.com' }
|
15
|
+
# end
|
16
|
+
#
|
17
|
+
# @author Ivan Goncharov
|
18
|
+
#
|
4
19
|
class QueueAliases < Base
|
5
20
|
module CoreMethods
|
6
21
|
%w(get post head).each do |meth|
|
@@ -0,0 +1,23 @@
|
|
1
|
+
module Scruber
|
2
|
+
module Core
|
3
|
+
module Extensions
|
4
|
+
|
5
|
+
#
|
6
|
+
# Seed DSL
|
7
|
+
# Seed block executes only when queue was not initialized yet
|
8
|
+
# (queue has no pages at all, processed or pending)
|
9
|
+
#
|
10
|
+
# @author Ivan Goncharov
|
11
|
+
#
|
12
|
+
class Seed < Base
|
13
|
+
module CoreMethods
|
14
|
+
def seed(&block)
|
15
|
+
unless queue.initialized?
|
16
|
+
instance_exec &block
|
17
|
+
end
|
18
|
+
end
|
19
|
+
end
|
20
|
+
end
|
21
|
+
end
|
22
|
+
end
|
23
|
+
end
|
@@ -32,8 +32,18 @@ module Scruber
|
|
32
32
|
if page.max_retry_times.nil?
|
33
33
|
page.max_retry_times = @max_retry_times
|
34
34
|
end
|
35
|
+
if page.max_retry_times && page.retry_count >= page.max_retry_times.to_i
|
36
|
+
page.retry_at = 1.year.from_now.to_i
|
37
|
+
end
|
35
38
|
else
|
36
|
-
|
39
|
+
# Monkey patch to prevent redownloading of 404 pages
|
40
|
+
# and processing 404 pages by regular parsers
|
41
|
+
if page.response_code == 404
|
42
|
+
page.retry_count = 1 if page.retry_count.nil? || page.retry_count.zero?
|
43
|
+
page.max_retry_times = page.retry_count
|
44
|
+
else
|
45
|
+
page.fetched_at = Time.now.to_i
|
46
|
+
end
|
37
47
|
end
|
38
48
|
page
|
39
49
|
end
|
@@ -89,26 +99,16 @@ module Scruber
|
|
89
99
|
|
90
100
|
def bad_response?(page)
|
91
101
|
case page.response_code
|
92
|
-
when 0
|
93
|
-
true
|
94
|
-
when 1
|
95
|
-
true
|
96
|
-
when 100..199
|
102
|
+
when 0..1
|
97
103
|
true
|
98
|
-
when 200
|
99
|
-
false
|
100
|
-
when 201..299
|
104
|
+
when 200..299
|
101
105
|
false
|
102
106
|
when 300..399
|
103
107
|
@options.fetch(:followlocation) { false }
|
104
108
|
when 404
|
105
109
|
false
|
106
110
|
when 407
|
107
|
-
raise "
|
108
|
-
when 400..499
|
109
|
-
true
|
110
|
-
when 500..599
|
111
|
-
true
|
111
|
+
raise "RejectedByProxy"
|
112
112
|
else
|
113
113
|
true
|
114
114
|
end
|