scruber 0.1.4 → 0.1.5

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 8deee66960a3768ace0af72a5cb1eced62c90329
-  data.tar.gz: a0d3f330d8b838aee078f2d752226a1e5432b311
+  metadata.gz: 4661c429c6b33a12841569c62835cc587e7f9464
+  data.tar.gz: 9cb87a48248746b30d2ece9db71136560629bdcb
 SHA512:
-  metadata.gz: 30df32ccd86afde913d47483e9f327b94869c52a21f7c1a43f442ef8a1f138a1500d0746d4641a4542937aa2dbba7e53e4697c95b234cfdc3d07eeb8ab3d13ed
-  data.tar.gz: 4e57023647a62f7f312a8a77b89097920e7ab6750c1fd6562d1b6b6b4b3ff239b484f08aed6920a259365100ba4efe9d3f168b1a873183a377c877261d49ba15
+  metadata.gz: c86674726ae45109383e8d0712612895136384eba1414e061c7d1de3dd9b699af6ab4022de037a537b1377a95b46be2aba10b4e9cea1664449ec880ae1b8189d
+  data.tar.gz: 738d34f37fdc629eb3f4755fffeb70e640e53bd7e33333357af0f9af286d648dd92846220a7ef0f9d06918e412fdf8c801a0b25e15cf74b912b0e0459e04009d
@@ -0,0 +1,9 @@
+class Object
+  def self.const_missing(name)
+    if Scruber::Helpers.const_defined?(name)
+      Scruber::Helpers.const_get(name)
+    else
+      super
+    end
+  end
+end
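
Note: with this hook, helper classes can be referenced without their namespace. A minimal sketch of the effect (the constant name is taken from the configuration docs below):

    Scruber::Helpers::FetcherAgentAdapters   # full path still works
    FetcherAgentAdapters                     # now resolves via Object.const_missing
                                             # => Scruber::Helpers::FetcherAgentAdapters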
@@ -19,6 +19,8 @@ module Scruber
         raise ::Thor::Error, "ERROR: #{path} already exists." if File.exist?(path)
         say "Creating scruber project at #{path}"
         FileUtils.mkdir_p(path)
+        FileUtils.mkdir_p(path+'/lib')
+        FileUtils.mkdir_p(path+'/log')
       end
 
       def create_files
@@ -15,4 +15,5 @@ Scruber.configure do |config|
   config.fetcher_agent_options = {}
   config.queue_adapter = :<%= options[:queue] %>
   config.queue_options = {}
+  config.autoload_paths << Scruber.root.join('lib')
 end
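
Note: together with the const_missing hook and the new lib/ directory, generated projects now autoload code from lib/ on first reference. A sketch with a hypothetical class:

    # lib/price_parser.rb (hypothetical example)
    class PriceParser
      def self.parse(text)
        text[/[\d.]+/].to_f
      end
    end

    # inside a scraper, no require needed:
    # PriceParser.parse('$ 19.99')  # => 19.99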
data/lib/scruber/cli.rb CHANGED
@@ -16,6 +16,7 @@ module Scruber
     register Generators, 'generate', 'generate [GENERATOR]', 'Generate something'
 
     desc 'start', 'Run scraper'
+    method_option :silent, :type => :boolean, :aliases => '-s', default: false
     def start(name)
       if defined?(APP_PATH)
         scraper_path = Scruber::AppSearcher.find_scraper(name, APP_PATH)
@@ -28,6 +29,7 @@ module Scruber
         ENV['SCRUBER_SCRAPER_NAME'] = File.basename(scraper_path).gsub(/\.rb\Z/, '').underscore
         say "starting #{ENV['SCRUBER_SCRAPER_NAME']}"
 
+        Scruber.configuration.silent = options[:silent]
         require scraper_path
       else
         raise ::Thor::Error, "ERROR: Scruber project not found."
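
Note: usage of the new flag from a project directory (the scraper name is illustrative):

    scruber start products -s   # suppress the progress bar
    scruber start products      # default: progress bar on stderr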
@@ -1,12 +1,26 @@
 module Scruber
   module Core
+    #
+    # Configuration class
+    #
+    # @author Ivan Goncharov
+    # @attr [Symbol] fetcher_adapter Fetcher adapter name
+    # @attr [Hash] fetcher_options Fetcher options, see {Scruber::FetcherAdapters::AbstractAdapter} options
+    # @attr [Symbol] fetcher_agent_adapter Fetcher agent adapter name
+    # @attr [Hash] fetcher_agent_options Fetcher agent options, see {Scruber::Helpers::FetcherAgentAdapters::AbstractAdapter}
+    # @attr [Symbol] queue_adapter Queue adapter name
+    # @attr [Hash] queue_options Queue options, see {Scruber::QueueAdapters::AbstractAdapter}
+    # @attr [Array<String>] autoload_paths Paths to autoload classes from
+    # @attr [Boolean] silent Suppress all console output when true
     class Configuration
       attr_accessor :fetcher_adapter,
                     :fetcher_options,
                     :fetcher_agent_adapter,
                     :fetcher_agent_options,
                     :queue_adapter,
-                    :queue_options
+                    :queue_options,
+                    :autoload_paths,
+                    :silent
 
       def initialize
         @fetcher_adapter = :typhoeus_fetcher
@@ -15,8 +29,15 @@ module Scruber
         @fetcher_agent_options = {}
         @queue_adapter = :memory
         @queue_options = {}
+        @autoload_paths = []
+        @silent = false
       end
 
+      #
+      # Merge options from a hash
+      # @param options [Hash] options
+      #
+      # @return [void]
       def merge_options(options)
         @fetcher_adapter = options.fetch(:fetcher_adapter){ @fetcher_adapter }
         @fetcher_options.merge! options.fetch(:fetcher_options){ {} }
@@ -24,6 +45,8 @@ module Scruber
         @fetcher_agent_options.merge! options.fetch(:fetcher_agent_options){ {} }
         @queue_adapter = options.fetch(:queue_adapter){ @queue_adapter }
         @queue_options.merge! options.fetch(:queue_options){ {} }
+        @autoload_paths += options.fetch(:autoload_paths){ [] }
+        @silent = options.fetch(:silent){ false }
       end
     end
   end
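
Note: a sketch of how the new options are set globally and overridden per run (values are illustrative):

    Scruber.configure do |config|
      config.autoload_paths << Scruber.root.join('lib')
      config.silent = true
    end

    # merge_options lets a single crawler override the defaults:
    Scruber::Core::Crawler.new(:sample, silent: false, autoload_paths: ['./extra'])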
@@ -1,8 +1,33 @@
 module Scruber
   module Core
+    #
+    # Crawler class
+    #
+    # Main runner class for scrapers.
+    #
+    # @example Simple scraper
+    #   Scruber::Core::Crawler.new(:simple) do
+    #     get 'http://example.com'
+    #     parse :html do |page,html|
+    #       puts html.at('title').text
+    #     end
+    #   end
+    #
+    # @author Ivan Goncharov
+    #
     class Crawler
       attr_reader :queue, :fetcher, :scraper_name
 
+      #
+      # Initialize crawler with a scraper name and/or options
+      #
+      #   Crawler.new(:sample, fetcher_adapter: :custom)
+      #   Crawler.new(:sample)
+      #   Crawler.new(fetcher_adapter: :custom)
+      #
+      # @param args [Array] if the first argument is a Symbol, it is used as scraper_name; a Hash is used as configuration options (see {Scruber::Core::Configuration})
+      #
+      # @return [Scruber::Core::Crawler]
       def initialize(*args)
         if args.first.is_a?(Hash)
           scraper_name = nil
@@ -14,23 +39,29 @@ module Scruber
         @scraper_name = scraper_name.present? ? scraper_name : ENV['SCRUBER_SCRAPER_NAME']
         raise Scruber::ArgumentError.new("Scraper name is empty. Pass it to `Scruber.run :name do` or through ENV['SCRUBER_SCRAPER_NAME']") if @scraper_name.blank?
         @scraper_name = @scraper_name.to_sym
-        Scruber.configuration.merge_options(options)
         @callbacks_options = {}
         @callbacks = {}
+        @on_page_error_callback = nil
         @on_complete_callbacks = []
-        @queue = Scruber::Queue.new(scraper_name: scraper_name)
+
+        Scruber.configuration.merge_options(options)
+        ActiveSupport::Dependencies.autoload_paths = Scruber.configuration.autoload_paths
+
+        @queue = Scruber::Queue.new(scraper_name: @scraper_name)
         @fetcher = Scruber::Fetcher.new
+        initialize_progressbar
         load_extenstions
       end
 
       #
-      # Run crawling.
+      # Crawling engine
       #
       # @param block [Proc] crawler body
       def run(&block)
         instance_eval &block
         while @queue.has_work? do
           @fetcher.run @queue
+          show_progress
           while page = @queue.fetch_downloaded do
             if @callbacks[page.page_type.to_sym]
               processed_page = process_page(page, page.page_type.to_sym)
@@ -38,16 +69,40 @@ module Scruber
               page.processed! unless page.sent_to_redownload?
             end
           end
+          if @on_page_error_callback
+            while page = @queue.fetch_error do
+              instance_exec page, &(@on_page_error_callback)
+            end
+          end
         end
         @on_complete_callbacks.sort_by{|c| -c[0] }.each do |(_,callback)|
           instance_exec &(callback)
         end
       end
 
+      #
+      # Register parser
+      #
+      # @param page_type [Symbol] type of page
+      # @param options [Hash] options for parser
+      # @option options [Symbol] :format format of the page. Scruber automatically processes
+      #   the page body depending on this format, e.g. :json or :html
+      # @param block [Proc] body of parser
+      #
+      # @return [void]
       def parser(page_type, options={}, &block)
         register_callback(page_type, options, &block)
       end
 
+      #
+      # Method missing callback. Scruber allows registering
+      # a regexp with a proc body to handle such calls
+      #
+      # @param method_sym [Symbol] missing method name
+      # @param arguments [Array] arguments
+      # @param block [Proc] block (if passed)
+      #
+      # @return [Object] result of the matched handler
       def method_missing(method_sym, *arguments, &block)
         Scruber::Core::Crawler._registered_method_missings.each do |(pattern, func)|
           if (scan_results = method_sym.to_s.scan(pattern)).present?
@@ -68,34 +123,137 @@ module Scruber
         end
       end
 
       class << self
+
+        #
+        # Register method missing callback
+        #
+        # @param pattern [Regexp] Regexp to match the missing method name
+        # @param block [Proc] body to handle the missing method
+        #
+        # @return [void]
         def register_method_missing(pattern, &block)
           _registered_method_missings[pattern] = block
         end
 
+        #
+        # Dictionary of registered method missing callbacks
+        #
+        # @return [Hash] callbacks
        def _registered_method_missings
           @registered_method_missings ||= {}
         end
       end
 
+      #
+      # Register callback which will be executed when
+      # downloading and parsing are complete.
+      # Useful, for example, to write results to a file
+      # or to close file handles.
+      # @example Close file descriptors
+      #   on_complete -1 do
+      #     Scruber::Core::Extensions::CsvOutput.close_all
+      #   end
+      #
+      # @param priority [Integer] priority of this callback
+      # @param block [Proc] body of callback
+      #
+      # @return [void]
+      def on_complete(priority=1, &block)
+        @on_complete_callbacks.push [priority,block]
+      end
+
+      #
+      # Register callback which will be executed for
+      # error pages, like 404 or 500.
+      # Attention! To prevent an infinite loop, call one of
+      # page.processed!, page.delete or page.redownload!(0) on the page.
+      # @example Processing error page
+      #   on_page_error do |page|
+      #     if page.response_body =~ /distil/
+      #       page.redownload!(0)
+      #     elsif page.response_code == 404
+      #       get page.at('a.moved_to').attr('href')
+      #       page.processed!
+      #     else
+      #       page.delete
+      #     end
+      #   end
+      #
+      # @param block [Proc] body of callback
+      #
+      # @return [void]
+      def on_page_error(&block)
+        @on_page_error_callback = block
+      end
+
       private
 
+      #
+      # Register parser callback
+      #
+      # @param page_type [Symbol] type of page
+      # @param options [Hash] options for parser
+      # @option options [Symbol] :format format of the page. Scruber automatically processes
+      #   the page body depending on this format, e.g. :json or :html
+      # @param block [Proc] body of parser
+      #
+      # @return [void]
       def register_callback(page_type, options, &block)
         @callbacks_options[page_type.to_sym] = options || {}
         @callbacks[page_type.to_sym] = block
       end
 
-      def on_complete(priority=1, &block)
-        @on_complete_callbacks.push [priority,block]
-      end
-
+      #
+      # Process page body depending on the page format.
+      # For example, if page_format is :html, it returns
+      # Nokogiri::HTML(page.response_body)
+      #
+      # @param page [Page] page from queue
+      # @param page_type [Symbol] name of parser
+      #
+      # @return [Object] object whose type depends on the page format
       def process_page(page, page_type)
         page_format = @callbacks_options[page_type].fetch(:format){ nil }
         Scruber::Core::PageFormat.process(page, page_format)
       end
 
+      #
+      # Loads all extensions
+      #
+      # @return [void]
       def load_extenstions
         Scruber::Core::Extensions::Base.descendants.each(&:register)
       end
+
+      #
+      # Initialize the progress bar shown in the console
+      #
+      # @return [void]
+      def initialize_progressbar
+        unless Scruber.configuration.silent
+          @progressbar = PowerBar.new
+          @progressbar.settings.tty.finite.template.main = "${<msg>} ${<bar> }\e[0m \e[33;1m${<percent>%} (${<done>/<total>})"
+          @progressbar.settings.tty.finite.template.padchar = "\e[30;1m#{@progressbar.settings.tty.finite.template.padchar}"
+          @progressbar.settings.tty.finite.template.barchar = "\e[34;1m#{@progressbar.settings.tty.finite.template.barchar}"
+          @progressbar.settings.tty.finite.template.exit = "\e[?25h\e[0m" # clean up after us
+          @progressbar.settings.tty.finite.template.close = "\e[?25h\e[0m\n" # clean up after us
+          @progressbar.settings.tty.finite.output = Proc.new{ |s|
+            $stderr.print s
+          }
+        end
+      end
+
+      #
+      # Print progress to the console
+      #
+      # @return [void]
+      def show_progress
+        if @progressbar
+          s = queue.size
+          @progressbar.show({:msg => @proggress_status, :done => queue.downloaded_count, :total => s}) unless s.zero?
+        end
+      end
     end
+
   end
 end
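
Note: a minimal sketch tying the new callbacks together (URL and selector are illustrative):

    Scruber.run :sample do
      get 'https://example.com'

      parse :html do |page, doc|
        log doc.at('title').text
      end

      on_page_error do |page|
        page.delete   # or page.processed! / page.redownload!(0), required to avoid an infinite loop
      end

      on_complete -1 do
        log 'crawling finished'
      end
    end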
@@ -1,12 +1,22 @@
 module Scruber
   module Core
     module Extensions
+      #
+      # Base class for extensions
+      # @abstract
+      #
+      # @author [revis0r]
+      #
       class Base
         module CoreMethods
 
         end
 
         class << self
+          #
+          # Register extension in crawler core
+          #
+          # @return [void]
           def register
             Scruber::Core::Crawler.include self.const_get(:CoreMethods)
           end
@@ -1,6 +1,20 @@
 module Scruber
   module Core
     module Extensions
+      #
+      # Helper to write CSV files
+      # @example Writing log and products data
+      #   Scruber.run :simple do
+      #     csv_file Scruber.root.join('log.csv'), col_sep: ';'
+      #     csv_products_file Scruber.root.join('products.csv'), col_sep: ';'
+      #
+      #     csv_out [Time.now.to_i, 'sample log record']
+      #     csv_product_out ['ID', 'Title']
+      #     csv_product_out ['1', 'Soap']
+      #   end
+      #
+      # @author Ivan Goncharov
+      #
       class CsvOutput < Base
         module CoreMethods
           def csv_file(path, options={})
@@ -16,6 +30,13 @@ module Scruber
             Scruber::Core::Extensions::CsvOutput.csv_out :default, fields
           end
 
+          #
+          # Registers method missing callbacks when included
+          # into the crawler class
+          #
+          # @param base [Class] class where the module was included
+          #
+          # @return [void]
           def self.included(base)
             Scruber::Core::Crawler.register_method_missing /\Acsv_(\w+)_file\Z/ do |meth, scan_results, args|
               file_id = scan_results.first.first.to_sym
@@ -0,0 +1,39 @@
+module Scruber
+  module Core
+    module Extensions
+      #
+      # Logging class
+      # Writes logs to a file and to the console, depending on configuration
+      #
+      # @author Ivan Goncharov
+      #
+      class Log < Base
+        module CoreMethods
+          #
+          # Write a log record
+          #
+          # @param text [String] text
+          # @param color [Symbol] color of the text
+          #
+          # @return [void]
+          def log(text, color=:white)
+            Scruber.logger.info(scraper_name){ text } rescue nil
+            if @progressbar
+              @progressbar.print "#{Paint[text, color]}\n"
+            end
+          end
+
+          #
+          # Set the status text of the console progress bar
+          #
+          # @param text [String] text
+          #
+          # @return [void]
+          def set_status(text)
+            @proggress_status = text
+          end
+        end
+      end
+    end
+  end
+end
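
Note: a sketch of these helpers inside a scraper (text and color are illustrative):

    Scruber.run :sample do
      get 'https://example.com'

      parse :html do |page, doc|
        set_status 'parsing homepage'      # becomes the progress bar message
        log doc.at('title').text, :green   # written to the log file and the console
      end
    end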
@@ -1,13 +1,47 @@
 module Scruber
   module Core
     module Extensions
+      #
+      # Helper for reading dictionaries.
+      # @example Adding a dictionary and reading it
+      #   Scruber.run :sample do
+      #     add_dictionary :zip_codes_usa, Scruber.root.join('dict', 'zip_codes_usa.csv'), :csv
+      #     seed do
+      #       loop :zip_codes_usa, state: 'NY' do |row|
+      #         get 'https://example.com/by_zip/'+row['zip'].to_s
+      #       end
+      #     end
+      #   end
+      #
+      # @author Ivan Goncharov
+      #
       class Loop < Base
         module CoreMethods
+          #
+          # Iterate over records of a dictionary
+          #
+          # @param dictionary [Symbol] name of dictionary
+          # @param options [Hash] search conditions
+          # @param block [Proc] body, yields each row of the dictionary
+          #
+          # @return [void]
           def loop(dictionary, options={}, &block)
             Scruber::Core::Extensions::Loop.loop dictionary, options do |*args|
               instance_exec *args, &block
             end
           end
+
+          #
+          # Register a dictionary in the system
+          #
+          # @param name [Symbol] name of dictionary
+          # @param file_path [String] path to file
+          # @param file_type [Symbol] type of file, :xml, :csv, etc.
+          #
+          # @return [void]
+          def add_dictionary(name, file_path, file_type)
+            Scruber::Core::Extensions::Loop.add_dictionary(name, file_path, file_type)
+          end
         end
 
         class << self
@@ -1,6 +1,30 @@
 module Scruber
   module Core
     module Extensions
+      #
+      # DSL for registering parsers.
+      # @example Sample of DSL
+      #   Scruber.run :sample do
+      #     get 'https://example.com'
+      #     get_product 'https://example.com/product1.html'
+      #
+      #     # Parsing https://example.com
+      #     parse :html do |page,doc|
+      #       log doc.at('title').text
+      #     end
+      #
+      #     # Parsing https://example.com/product1.html
+      #     parse_product :html do |page,doc|
+      #       log doc.at('title').text
+      #     end
+      #     # Alias for
+      #     # parser :product, format: :html do |page,doc|
+      #     #   log doc.at('title').text
+      #     # end
+      #   end
+      #
+      # @author Ivan Goncharov
+      #
       class ParserAliases < Base
         module CoreMethods
           def parse(*args, &block)
@@ -1,6 +1,21 @@
 module Scruber
   module Core
     module Extensions
+      #
+      # DSL for adding pages to the queue
+      # @example Sample of DSL
+      #   Scruber.run :sample do
+      #     get_product 'https://example.com/product1.html'
+      #     # Alias for
+      #     # queue.add 'https://example.com/product1.html', page_type: :product
+      #
+      #     post_subscribe 'https://example.com/subscribe', body: { email: 'sample@example.com' }
+      #     # Alias for
+      #     # queue.add 'https://example.com/subscribe', method: :post, page_type: :subscribe, body: { email: 'sample@example.com' }
+      #   end
+      #
+      # @author Ivan Goncharov
+      #
       class QueueAliases < Base
         module CoreMethods
           %w(get post head).each do |meth|
@@ -0,0 +1,23 @@
+module Scruber
+  module Core
+    module Extensions
+
+      #
+      # Seed DSL
+      # A seed block executes only when the queue has not been initialized yet
+      # (the queue has no pages, processed or pending)
+      #
+      # @author Ivan Goncharov
+      #
+      class Seed < Base
+        module CoreMethods
+          def seed(&block)
+            unless queue.initialized?
+              instance_exec &block
+            end
+          end
+        end
+      end
+    end
+  end
+end
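
Note: a sketch of the seed DSL. The block runs only on the first launch, so a restarted scraper resumes its queue instead of re-adding start pages (URL illustrative):

    Scruber.run :sample do
      seed do
        get 'https://example.com/catalog'   # enqueued only while the queue is empty
      end

      parse :html do |page, doc|
        log doc.at('title').text
      end
    end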
@@ -32,8 +32,18 @@ module Scruber
           if page.max_retry_times.nil?
             page.max_retry_times = @max_retry_times
           end
+          if page.max_retry_times && page.retry_count >= page.max_retry_times.to_i
+            page.retry_at = 1.year.from_now.to_i
+          end
         else
-          page.fetched_at = Time.now.to_i
+          # Monkey patch to prevent redownloading 404 pages
+          # and processing them with regular parsers
+          if page.response_code == 404
+            page.retry_count = 1 if page.retry_count.nil? || page.retry_count.zero?
+            page.max_retry_times = page.retry_count
+          else
+            page.fetched_at = Time.now.to_i
+          end
         end
         page
       end
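
Note: the net effect, sketched: a 404 response is never marked fetched, its retry budget is exhausted immediately, and (given the new callback above) the page surfaces through the error queue instead of a regular parser:

    Scruber.run :sample do
      get 'https://example.com/maybe-gone'   # suppose the server answers 404

      on_page_error do |page|
        log "got #{page.response_code}, dropping page", :red
        page.delete   # processed!/delete/redownload! prevents an endless loop
      end
    end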
@@ -89,26 +99,16 @@ module Scruber
 
       def bad_response?(page)
         case page.response_code
-        when 0
-          true
-        when 1
-          true
-        when 100..199
+        when 0..1
           true
-        when 200
-          false
-        when 201..299
+        when 200..299
           false
         when 300..399
           @options.fetch(:followlocation) { false }
         when 404
           false
         when 407
-          raise "RejectedByProxyError"
-        when 400..499
-          true
-        when 500..599
-          true
+          raise "RejectedByProxy"
         else
           true
         end
@@ -64,7 +64,7 @@ module Scruber
       page.response_total_time = response.total_time
 
       if response.timed_out?
-        page[:response_code] = 1
+        page.response_code = 1
       end
 
       page = after_request_callback(page)