scruber 0.1.4 → 0.1.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/core_ext/const_missing.rb +9 -0
- data/lib/scruber/cli/project_generator.rb +2 -0
- data/lib/scruber/cli/templates/application.tt +1 -0
- data/lib/scruber/cli.rb +2 -0
- data/lib/scruber/core/configuration.rb +24 -1
- data/lib/scruber/core/crawler.rb +165 -7
- data/lib/scruber/core/extensions/base.rb +10 -0
- data/lib/scruber/core/extensions/csv_output.rb +21 -0
- data/lib/scruber/core/extensions/log.rb +39 -0
- data/lib/scruber/core/extensions/loop.rb +34 -0
- data/lib/scruber/core/extensions/parser_aliases.rb +24 -0
- data/lib/scruber/core/extensions/queue_aliases.rb +15 -0
- data/lib/scruber/core/extensions/seed.rb +23 -0
- data/lib/scruber/fetcher_adapters/abstract_adapter.rb +14 -14
- data/lib/scruber/fetcher_adapters/typhoeus_fetcher.rb +1 -1
- data/lib/scruber/queue_adapters/abstract_adapter.rb +149 -10
- data/lib/scruber/queue_adapters/memory.rb +139 -9
- data/lib/scruber/version.rb +1 -1
- data/lib/scruber.rb +23 -8
- data/scruber.gemspec +6 -4
- metadata +71 -10
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 4661c429c6b33a12841569c62835cc587e7f9464
+  data.tar.gz: 9cb87a48248746b30d2ece9db71136560629bdcb
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: c86674726ae45109383e8d0712612895136384eba1414e061c7d1de3dd9b699af6ab4022de037a537b1377a95b46be2aba10b4e9cea1664449ec880ae1b8189d
+  data.tar.gz: 738d34f37fdc629eb3f4755fffeb70e640e53bd7e33333357af0f9af286d648dd92846220a7ef0f9d06918e412fdf8c801a0b25e15cf74b912b0e0459e04009d
data/lib/scruber/cli/project_generator.rb CHANGED
@@ -19,6 +19,8 @@ module Scruber
         raise ::Thor::Error, "ERROR: #{path} already exists." if File.exist?(path)
         say "Creating scruber project at #{path}"
         FileUtils.mkdir_p(path)
+        FileUtils.mkdir_p(path+'/lib')
+        FileUtils.mkdir_p(path+'/log')
       end

       def create_files
data/lib/scruber/cli.rb CHANGED
@@ -16,6 +16,7 @@ module Scruber
     register Generators, 'generate', 'generate [GENERATOR]', 'Generate something'

     desc 'start', 'Run scraper'
+    method_option :silent, :type => :boolean, :aliases => '-s', default: false
     def start(name)
       if defined?(APP_PATH)
         scraper_path = Scruber::AppSearcher.find_scraper(name, APP_PATH)
@@ -28,6 +29,7 @@ module Scruber
         ENV['SCRUBER_SCRAPER_NAME'] = File.basename(scraper_path).gsub(/\.rb\Z/, '').underscore
         say "starting #{ENV['SCRUBER_SCRAPER_NAME']}"

+        Scruber.configuration.silent = options[:silent]
         require scraper_path
       else
         raise ::Thor::Error, "ERROR: Scruber project not found."
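Note: the new `--silent` option (alias `-s`) is copied into `Scruber.configuration.silent` before the scraper file is required, e.g. `scruber start products -s` (the scraper name here is illustrative). The same setting can be reached from code; a minimal sketch, assuming `Scruber.run` forwards its options hash to `Scruber::Core::Crawler.new` as the crawler diff below suggests:

    require 'scruber'

    # Run with the console progressbar disabled; log output still
    # goes to the logger.
    Scruber.run :products, silent: true do
      get 'http://example.com'

      parse :html do |page, html|
        log html.at('title').text
      end
    end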
data/lib/scruber/core/configuration.rb CHANGED
@@ -1,12 +1,26 @@
 module Scruber
   module Core
+    #
+    # Configuration class
+    #
+    # @author Ivan Goncharov
+    # @attr [Symbol] fetcher_adapter Fetcher adapter name
+    # @attr [Hash] fetcher_options Fetcher options, see {Scruber::FetcherAdapters::AbstractAdapter} options
+    # @attr [Symbol] fetcher_agent_adapter Fetcher agent adapter name
+    # @attr [Hash] fetcher_agent_options Fetcher agent options, see {Scruber::Helpers::FetcherAgentAdapters::AbstractAdapter}
+    # @attr [Symbol] queue_adapter Queue adapter name
+    # @attr [Hash] queue_options Queue options, see {Scruber::QueueAdapters::AbstractAdapter}
+    # @attr [Array<String>] autoload_paths Array with paths for autoloading classes
+    # @attr [Boolean] silent Don't output anything if true
     class Configuration
       attr_accessor :fetcher_adapter,
                     :fetcher_options,
                     :fetcher_agent_adapter,
                     :fetcher_agent_options,
                     :queue_adapter,
-                    :queue_options
+                    :queue_options,
+                    :autoload_paths,
+                    :silent

       def initialize
         @fetcher_adapter = :typhoeus_fetcher
@@ -15,8 +29,15 @@ module Scruber
         @fetcher_agent_options = {}
         @queue_adapter = :memory
         @queue_options = {}
+        @autoload_paths = []
+        @silent = false
       end

+      #
+      # Merge options from hash
+      # @param options [Hash] options
+      #
+      # @return [void]
       def merge_options(options)
         @fetcher_adapter = options.fetch(:fetcher_adapter){ @fetcher_adapter }
         @fetcher_options.merge! options.fetch(:fetcher_options){ {} }
@@ -24,6 +45,8 @@ module Scruber
         @fetcher_agent_options.merge! options.fetch(:fetcher_agent_options){ {} }
         @queue_adapter = options.fetch(:queue_adapter){ @queue_adapter }
         @queue_options.merge! options.fetch(:queue_options){ {} }
+        @autoload_paths += options.fetch(:autoload_paths){ [] }
+        @silent = options.fetch(:silent){ false }
       end
     end
   end
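Note the asymmetry in `merge_options`: adapter names are replaced, option hashes are merged, `autoload_paths` are appended, and `silent` falls back to `false` rather than to its current value whenever the key is absent. A short sketch of the resulting behavior; the option values are illustrative only:

    config = Scruber::Core::Configuration.new
    config.silent = true

    config.merge_options(queue_adapter: :memory, autoload_paths: ['lib'])

    config.queue_adapter   # => :memory   (scalar setting, replaced)
    config.autoload_paths  # => ["lib"]   (appended to the existing array)
    config.silent          # => false     (:silent key absent, so it resets)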
data/lib/scruber/core/crawler.rb CHANGED
@@ -1,8 +1,33 @@
 module Scruber
   module Core
+    #
+    # Crawler class
+    #
+    # Main class-runner for scrapers.
+    #
+    # @example Simple scraper
+    #   Scruber::Core::Crawler.new(:simple) do
+    #     get 'http://example.com'
+    #     parse :html do |page,html|
+    #       puts html.at('title').text
+    #     end
+    #   end
+    #
+    # @author Ivan Goncharov
+    #
     class Crawler
       attr_reader :queue, :fetcher, :scraper_name

+      #
+      # Initialize crawler with scraper name and/or with options
+      #
+      #   Crawler.new(:sample, fetcher_adapter: :custom)
+      #   Crawler.new(:sample)
+      #   Crawler.new(fetcher_adapter: :custom)
+      #
+      # @param args [Array] if first arg is a Symbol, it will be used as scraper_name, hash will me used as configuration options (see {Scruber::Core::Configuration})
+      #
+      # @return [Scruber::Core::Crawler] [description]
       def initialize(*args)
         if args.first.is_a?(Hash)
           scraper_name = nil
@@ -14,23 +39,29 @@ module Scruber
         @scraper_name = scraper_name.present? ? scraper_name : ENV['SCRUBER_SCRAPER_NAME']
         raise Scruber::ArgumentError.new("Scraper name is empty. Pass it to `Scruber.run :name do` or through ENV['SCRUBER_SCRAPER_NAME']") if @scraper_name.blank?
         @scraper_name = @scraper_name.to_sym
-        Scruber.configuration.merge_options(options)
         @callbacks_options = {}
         @callbacks = {}
+        @on_page_error_callback = nil
         @on_complete_callbacks = []
-
+
+        Scruber.configuration.merge_options(options)
+        ActiveSupport::Dependencies.autoload_paths = Scruber.configuration.autoload_paths
+
+        @queue = Scruber::Queue.new(scraper_name: @scraper_name)
         @fetcher = Scruber::Fetcher.new
+        initialize_progressbar
         load_extenstions
       end

       #
-      #
+      # Crawling engine
       #
       # @param block [Proc] crawler body
       def run(&block)
         instance_eval &block
         while @queue.has_work? do
           @fetcher.run @queue
+          show_progress
           while page = @queue.fetch_downloaded do
             if @callbacks[page.page_type.to_sym]
               processed_page = process_page(page, page.page_type.to_sym)
@@ -38,16 +69,40 @@ module Scruber
               page.processed! unless page.sent_to_redownload?
             end
           end
+          if @on_page_error_callback
+            while page = @queue.fetch_error do
+              instance_exec page, &(@on_page_error_callback)
+            end
+          end
         end
         @on_complete_callbacks.sort_by{|c| -c[0] }.each do |(_,callback)|
           instance_exec &(callback)
         end
       end

+      #
+      # Register parser
+      #
+      # @param page_type [Symbol] type of page
+      # @param options [Hash] options for parser
+      # @option options [Symbol] :format format of page. Scruber automatically process
+      #   page body depends on this format. For example :json or :html
+      # @param block [Proc] body of parser
+      #
+      # @return [void]
       def parser(page_type, options={}, &block)
         register_callback(page_type, options, &block)
       end

+      #
+      # Method missing callback. Scruber allows to register
+      # regexp and proc body to process calls
+      #
+      # @param method_sym [Symbol] missing method name
+      # @param arguments [Array] arguments
+      # @param block [Proc] block (if passed)
+      #
+      # @return [type] [description]
       def method_missing(method_sym, *arguments, &block)
         Scruber::Core::Crawler._registered_method_missings.each do |(pattern, func)|
           if (scan_results = method_sym.to_s.scan(pattern)).present?
@@ -68,34 +123,137 @@ module Scruber
       end

       class << self
+
+        #
+        # Register method missing callback
+        #
+        # @param pattern [Regexp] Regexp to match missing name
+        # @param block [Proc] Body to process missing method
+        #
+        # @return [void]
         def register_method_missing(pattern, &block)
           _registered_method_missings[pattern] = block
         end

+        #
+        # Registered method missing callbacks dictionary
+        #
+        # @return [Hash] callbacks
        def _registered_method_missings
          @registered_method_missings ||= {}
        end
      end

+      #
+      # Register callback which will be executed when
+      # downloading and parsing will be completed.
+      # For example when you need to write results to file,
+      # or to close files.
+      # @example Close file descriptors
+      #   on_complete -1 do
+      #     Scruber::Core::Extensions::CsvOutput.close_all
+      #   end
+      #
+      # @param priority [Integer] priority of this callback
+      # @param block [Proc] body of callback
+      #
+      # @return [void]
+      def on_complete(priority=1, &block)
+        @on_complete_callbacks.push [priority,block]
+      end
+
+      #
+      # Register callback which will be executed for
+      # error pages, like 404 or 500
+      # Attention! You should call one of these methods for page
+      # to prevent infinite loop: page.processed!, page.delete, page.redownload!(0)
+      # @example Processing error page
+      #   on_page_error do |page|
+      #     if page.response_body =~ /distil/
+      #       page.page.redownload!(0)
+      #     elsif page.response_code == /404/
+      #       get page.at('a.moved_to').attr('href')
+      #       page.processed!
+      #     else
+      #       page.delete
+      #     end
+      #   end
+      #
+      # @param block [Proc] body of callback
+      #
+      # @return [void]
+      def on_page_error(&block)
+        @on_page_error_callback = block
+      end
+
       private

+      #
+      # Register parser
+      #
+      # @param page_type [Symbol] type of page
+      # @param options [Hash] options for parser
+      # @option options [Symbol] :format format of page. Scruber automatically process
+      #   page body depends on this format. For example :json or :html
+      # @param block [Proc] body of parser
+      #
+      # @return [void]
       def register_callback(page_type, options, &block)
         @callbacks_options[page_type.to_sym] = options || {}
         @callbacks[page_type.to_sym] = block
       end

-
-
-
-
+      #
+      # Process page body depends on format of this page
+      # For example, if page_format = :html, then
+      # it will return Nokogiri::HTML(page.response_body)
+      #
+      # @param page [Page] page from queue
+      # @param page_type [Symbol] name of parser
+      #
+      # @return [Object] depends on page_type it will return different objects
       def process_page(page, page_type)
         page_format = @callbacks_options[page_type].fetch(:format){ nil }
         Scruber::Core::PageFormat.process(page, page_format)
       end

+      #
+      # Loads all extensions
+      #
+      # @return [void]
       def load_extenstions
         Scruber::Core::Extensions::Base.descendants.each(&:register)
       end
+
+      #
+      # Initialize progressbar, that shows progress in console
+      #
+      # @return [void]
+      def initialize_progressbar
+        unless Scruber.configuration.silent
+          @progressbar = PowerBar.new
+          @progressbar.settings.tty.finite.template.main = "${<msg>} ${<bar> }\e[0m \e[33;1m${<percent>%} (${<done>/<total>})"
+          @progressbar.settings.tty.finite.template.padchar = "\e[30;1m#{@progressbar.settings.tty.finite.template.padchar}"
+          @progressbar.settings.tty.finite.template.barchar = "\e[34;1m#{@progressbar.settings.tty.finite.template.barchar}"
+          @progressbar.settings.tty.finite.template.exit = "\e[?25h\e[0m" # clean up after us
+          @progressbar.settings.tty.finite.template.close = "\e[?25h\e[0m\n" # clean up after us
+          @progressbar.settings.tty.finite.output = Proc.new{ |s|
+            $stderr.print s
+          }
+        end
+      end
+
+      #
+      # Out progress to console
+      #
+      # @return [void]
+      def show_progress
+        if @progressbar
+          s = queue.size
+          @progressbar.show({:msg => @proggress_status, :done => queue.downloaded_count, :total => s}) unless s.zero?
+        end
+      end
     end
+
   end
 end
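Together, `on_page_error` and the prioritized `on_complete` callbacks round out the run loop: error pages are drained via `queue.fetch_error` after each fetch pass, and completion callbacks fire in descending priority order once the queue has no work left. A sketch combining both, using only calls shown in this diff:

    Scruber.run :sample do
      get 'http://example.com/catalog'

      parse :html do |page, html|
        log html.at('title').text
      end

      # Must end with page.processed!, page.delete or page.redownload!(0),
      # otherwise the same error page is yielded forever.
      on_page_error do |page|
        if page.response_code == 404
          page.delete
        else
          page.redownload!(0)
        end
      end

      # Lower priority runs later; -1 is the documented slot for closing files.
      on_complete(-1) do
        log 'crawl finished'
      end
    end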
data/lib/scruber/core/extensions/base.rb CHANGED
@@ -1,12 +1,22 @@
 module Scruber
   module Core
     module Extensions
+      #
+      # Base class for extensions
+      # @abstract
+      #
+      # @author [revis0r]
+      #
       class Base
         module CoreMethods

         end

         class << self
+          #
+          # Register extension in crawler core
+          #
+          # @return [void]
           def register
             Scruber::Core::Crawler.include self.const_get(:CoreMethods)
           end
data/lib/scruber/core/extensions/csv_output.rb CHANGED
@@ -1,6 +1,20 @@
 module Scruber
   module Core
     module Extensions
+      #
+      # Helper to write csv files
+      # @example Writing log and products data
+      #   Scruber.run :simple do
+      #     csv_file Scruber.root.join('log.csv'), col_sep: ';'
+      #     csv_products_file Scruber.root.join('products.csv'), col_sep: ';'
+      #
+      #     csv_out [Time.now.to_i, 'sample log record']
+      #     csv_product_out ['ID', 'Title']
+      #     csv_product_out ['1', 'Soap']
+      #   end
+      #
+      # @author Ivan Goncharov
+      #
       class CsvOutput < Base
         module CoreMethods
           def csv_file(path, options={})
@@ -16,6 +30,13 @@ module Scruber
             Scruber::Core::Extensions::CsvOutput.csv_out :default, fields
           end

+          #
+          # Registering method missing callbacks on including
+          # to crawling class
+          #
+          # @param base [Class] class where module was included
+          #
+          # @return [void]
           def self.included(base)
             Scruber::Core::Crawler.register_method_missing /\Acsv_(\w+)_file\Z/ do |meth, scan_results, args|
               file_id = scan_results.first.first.to_sym
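The `self.included` hook is what turns names like `csv_products_file` and `csv_product_out` from the example above into real calls: it registers a regexp with `Crawler.register_method_missing`, and the crawler's `method_missing` matches unknown method names against it. A sketch of the same mechanism with a made-up `greet_*` pattern, assuming (as the CsvOutput usage suggests) that the block receives the method name, the regexp scan results, and the call arguments:

    # Hypothetical pattern; greet_world would print "hello, world".
    Scruber::Core::Crawler.register_method_missing /\Agreet_(\w+)\Z/ do |meth, scan_results, args|
      name = scan_results.first.first
      puts "hello, #{name}"
    end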
data/lib/scruber/core/extensions/log.rb ADDED
@@ -0,0 +1,39 @@
+module Scruber
+  module Core
+    module Extensions
+      #
+      # Logging class
+      # Allows to write logs to file and console, depends on configuration
+      #
+      # @author Ivan Goncharov
+      #
+      class Log < Base
+        module CoreMethods
+          #
+          # Writing log
+          #
+          # @param text [String] text
+          # @param color [Symbol] color of text to write
+          #
+          # @return [void]
+          def log(text, color=:white)
+            Scruber.logger.info(scraper_name){ text } rescue nil
+            if @progressbar
+              @progressbar.print "#{Paint[text, color]}\n"
+            end
+          end
+
+          #
+          # Setting status for console progressbar
+          #
+          # @param text [String] text
+          #
+          # @return [void]
+          def set_status(text)
+            @proggress_status = text
+          end
+        end
+      end
+    end
+  end
+end
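`log` writes through `Scruber.logger` and, when the progressbar is active (i.e. not silent), also prints a colored line to the console; `set_status` feeds the `${<msg>}` slot of the progressbar template shown in the crawler diff. A sketch of both inside a scraper (the color is whatever `Paint` accepts):

    Scruber.run :sample do
      get 'http://example.com'

      parse :html do |page, html|
        set_status 'parsing catalog'       # shown in the progressbar line
        log html.at('title').text, :green  # logger plus green console output
      end
    end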
data/lib/scruber/core/extensions/loop.rb CHANGED
@@ -1,13 +1,47 @@
 module Scruber
   module Core
     module Extensions
+      #
+      # Helper for reading dictionaries.
+      # @example Adding dictionary and reading it
+      #   Scruber.run :sample do
+      #     add_dictionary :zip_codes_usa, Scruber.root.join('dict', 'zip_codes_usa.csv'), :csv
+      #     seed do
+      #       loop :zip_codes_usa, state: 'NY' do |row|
+      #         get 'https://example.com/by_zip/'+row['zip'].to_s
+      #       end
+      #     end
+      #   end
+      #
+      # @author Ivan Goncharov
+      #
       class Loop < Base
         module CoreMethods
+          #
+          # Iterate records from dictionary
+          #
+          # @param dictionary [Symbol] name of dictionary
+          # @param options [Hash] search conditions
+          # @param block [Proc] body, yields row of dictionary
+          #
+          # @return [void]
           def loop(dictionary, options={}, &block)
             Scruber::Core::Extensions::Loop.loop dictionary, options do |*args|
               instance_exec *args, &block
             end
           end
+
+          #
+          # Registering dictionary in system
+          #
+          # @param name [Symbol] name of dictionary
+          # @param file_path [String] path to file
+          # @param file_type [Symbol] type of file, :xml, :csv, etc..
+          #
+          # @return [void]
+          def add_dictionary(name, file_path, file_type)
+            Scruber::Core::Extensions::Loop.add_dictionary(name, file_path, file_type)
+          end
         end

         class << self
data/lib/scruber/core/extensions/parser_aliases.rb CHANGED
@@ -1,6 +1,30 @@
 module Scruber
   module Core
     module Extensions
+      #
+      # DSL for registering parsers.
+      # @example Sample of DSL
+      #   Scruber.run :sample do
+      #     get 'https://example.com'
+      #     get_product 'https://example.com/product1.html'
+      #
+      #     # Parsing https://example.com
+      #     parse :html do |page,doc|
+      #       log doc.at('title').text
+      #     end
+      #
+      #     # Parsing https://example.com/product1.html
+      #     parse_product :html do |page,doc|
+      #       log doc.at('title').text
+      #     end
+      #     # Alias to
+      #     # parser :product, format: :html do |page,doc|
+      #     #   log doc.at('title').text
+      #     # end
+      #   end
+      #
+      # @author Ivan Gocharov
+      #
       class ParserAliases < Base
         module CoreMethods
           def parse(*args, &block)
data/lib/scruber/core/extensions/queue_aliases.rb CHANGED
@@ -1,6 +1,21 @@
 module Scruber
   module Core
     module Extensions
+      #
+      # DSL for adding pages to queue
+      # @example Sample of DSL
+      #   Scruber.run :sample do
+      #     get_product 'https://example.com/product1.html'
+      #     # Alias to
+      #     # queue.add 'https://example.com/product1.html', page_type: :product
+      #
+      #     post_subscribe 'https://example.com/subscribe', body: { email: 'sample@example.com' }
+      #     # Alias to
+      #     # queue.add 'https://example.com/product1.html', method: :post, page_type: :subscribe, body: { email: 'sample@example.com' }
+      #   end
+      #
+      # @author Ivan Gocharov
+      #
       class QueueAliases < Base
         module CoreMethods
           %w(get post head).each do |meth|
data/lib/scruber/core/extensions/seed.rb ADDED
@@ -0,0 +1,23 @@
+module Scruber
+  module Core
+    module Extensions
+
+      #
+      # Seed DSL
+      # Seed block executes only when queue was not initialized yet
+      # (queue has no any page, processed or pending)
+      #
+      # @author Ivan Goncharov
+      #
+      class Seed < Base
+        module CoreMethods
+          def seed(&block)
+            unless queue.initialized?
+              instance_exec &block
+            end
+          end
+        end
+      end
+    end
+  end
+end
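`seed` makes scrapers resumable: because the block runs only while `queue.initialized?` is false, restarting a half-finished crawl skips re-enqueueing the start pages and continues from the pending queue. A minimal sketch:

    Scruber.run :sample do
      seed do
        get 'http://example.com/catalog'  # enqueued on the first run only
      end

      parse :html do |page, html|
        log html.at('title').text
      end
    end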
data/lib/scruber/fetcher_adapters/abstract_adapter.rb CHANGED
@@ -32,8 +32,18 @@ module Scruber
         if page.max_retry_times.nil?
           page.max_retry_times = @max_retry_times
         end
+        if page.max_retry_times && page.retry_count >= page.max_retry_times.to_i
+          page.retry_at = 1.year.from_now.to_i
+        end
       else
-
+        # Monkey patch to prevent redownloading of 404 pages
+        # and processing 404 pages by regular parsers
+        if page.response_code == 404
+          page.retry_count = 1 if page.retry_count.nil? || page.retry_count.zero?
+          page.max_retry_times = page.retry_count
+        else
+          page.fetched_at = Time.now.to_i
+        end
       end
       page
     end
@@ -89,26 +99,16 @@ module Scruber

     def bad_response?(page)
       case page.response_code
-      when 0
-        true
-      when 1
-        true
-      when 100..199
+      when 0..1
         true
-      when 200
-        false
-      when 201..299
+      when 200..299
         false
       when 300..399
         @options.fetch(:followlocation) { false }
       when 404
         false
       when 407
-        raise "
-      when 400..499
-        true
-      when 500..599
-        true
+        raise "RejectedByProxy"
       else
         true
       end