scruber 0.1.4 → 0.1.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 8deee66960a3768ace0af72a5cb1eced62c90329
-  data.tar.gz: a0d3f330d8b838aee078f2d752226a1e5432b311
+  metadata.gz: 4661c429c6b33a12841569c62835cc587e7f9464
+  data.tar.gz: 9cb87a48248746b30d2ece9db71136560629bdcb
 SHA512:
-  metadata.gz: 30df32ccd86afde913d47483e9f327b94869c52a21f7c1a43f442ef8a1f138a1500d0746d4641a4542937aa2dbba7e53e4697c95b234cfdc3d07eeb8ab3d13ed
-  data.tar.gz: 4e57023647a62f7f312a8a77b89097920e7ab6750c1fd6562d1b6b6b4b3ff239b484f08aed6920a259365100ba4efe9d3f168b1a873183a377c877261d49ba15
+  metadata.gz: c86674726ae45109383e8d0712612895136384eba1414e061c7d1de3dd9b699af6ab4022de037a537b1377a95b46be2aba10b4e9cea1664449ec880ae1b8189d
+  data.tar.gz: 738d34f37fdc629eb3f4755fffeb70e640e53bd7e33333357af0f9af286d648dd92846220a7ef0f9d06918e412fdf8c801a0b25e15cf74b912b0e0459e04009d
@@ -0,0 +1,9 @@
+class Object
+  def self.const_missing(name)
+    if Scruber::Helpers.const_defined?(name)
+      Scruber::Helpers.const_get(name)
+    else
+      super
+    end
+  end
+end
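This new file routes unresolved top-level constants to the Scruber::Helpers namespace. A minimal sketch of the effect, assuming a helper class Scruber::Helpers::FetcherAgent exists (the name is illustrative):

    # Before this hook only the full path worked:
    #   Scruber::Helpers::FetcherAgent
    # Now the short form resolves via Object.const_missing:
    FetcherAgent  # => Scruber::Helpers::FetcherAgent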
@@ -19,6 +19,8 @@ module Scruber
       raise ::Thor::Error, "ERROR: #{path} already exists." if File.exist?(path)
       say "Creating scruber project at #{path}"
       FileUtils.mkdir_p(path)
+      FileUtils.mkdir_p(path+'/lib')
+      FileUtils.mkdir_p(path+'/log')
     end

     def create_files
@@ -15,4 +15,5 @@ Scruber.configure do |config|
   config.fetcher_agent_options = {}
   config.queue_adapter = :<%= options[:queue] %>
   config.queue_options = {}
+  config.autoload_paths << Scruber.root.join('lib')
 end
data/lib/scruber/cli.rb CHANGED
@@ -16,6 +16,7 @@ module Scruber
     register Generators, 'generate', 'generate [GENERATOR]', 'Generate something'

     desc 'start', 'Run scraper'
+    method_option :silent, :type => :boolean, :aliases => '-s', default: false
     def start(name)
       if defined?(APP_PATH)
         scraper_path = Scruber::AppSearcher.find_scraper(name, APP_PATH)
@@ -28,6 +29,7 @@ module Scruber
         ENV['SCRUBER_SCRAPER_NAME'] = File.basename(scraper_path).gsub(/\.rb\Z/, '').underscore
         say "starting #{ENV['SCRUBER_SCRAPER_NAME']}"

+        Scruber.configuration.silent = options[:silent]
         require scraper_path
       else
         raise ::Thor::Error, "ERROR: Scruber project not found."
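With this option in place, a scraper could presumably be started without console output via `scruber start my_scraper -s` (or `--silent`); the flag is copied into the configuration before the scraper file is required:

    Scruber.configuration.silent  # => true when started with -s, suppressing the progressbar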
@@ -1,12 +1,26 @@
 module Scruber
   module Core
+    #
+    # Configuration class
+    #
+    # @author Ivan Goncharov
+    # @attr [Symbol] fetcher_adapter Fetcher adapter name
+    # @attr [Hash] fetcher_options Fetcher options, see {Scruber::FetcherAdapters::AbstractAdapter} options
+    # @attr [Symbol] fetcher_agent_adapter Fetcher agent adapter name
+    # @attr [Hash] fetcher_agent_options Fetcher agent options, see {Scruber::Helpers::FetcherAgentAdapters::AbstractAdapter}
+    # @attr [Symbol] queue_adapter Queue adapter name
+    # @attr [Hash] queue_options Queue options, see {Scruber::QueueAdapters::AbstractAdapter}
+    # @attr [Array<String>] autoload_paths Array with paths for autoloading classes
+    # @attr [Boolean] silent Don't output anything if true
     class Configuration
       attr_accessor :fetcher_adapter,
                     :fetcher_options,
                     :fetcher_agent_adapter,
                     :fetcher_agent_options,
                     :queue_adapter,
-                    :queue_options
+                    :queue_options,
+                    :autoload_paths,
+                    :silent

       def initialize
         @fetcher_adapter = :typhoeus_fetcher
@@ -15,8 +29,15 @@ module Scruber
         @fetcher_agent_options = {}
         @queue_adapter = :memory
         @queue_options = {}
+        @autoload_paths = []
+        @silent = false
       end

+      #
+      # Merge options from hash
+      # @param options [Hash] options
+      #
+      # @return [void]
       def merge_options(options)
         @fetcher_adapter = options.fetch(:fetcher_adapter){ @fetcher_adapter }
         @fetcher_options.merge! options.fetch(:fetcher_options){ {} }
@@ -24,6 +45,8 @@ module Scruber
         @fetcher_agent_options.merge! options.fetch(:fetcher_agent_options){ {} }
         @queue_adapter = options.fetch(:queue_adapter){ @queue_adapter }
         @queue_options.merge! options.fetch(:queue_options){ {} }
+        @autoload_paths += options.fetch(:autoload_paths){ [] }
+        @silent = options.fetch(:silent){ false }
       end
     end
   end
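Combined with the generated project template above, the two new options would be set like this (a sketch; setting silent directly in the configure block is an assumption, since the CLI normally sets it via --silent):

    Scruber.configure do |config|
      config.autoload_paths << Scruber.root.join('lib')  # classes in lib/ are autoloaded
      config.silent = true                               # disable the console progressbar
    end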
@@ -1,8 +1,33 @@
 module Scruber
   module Core
+    #
+    # Crawler class
+    #
+    # Main class-runner for scrapers.
+    #
+    # @example Simple scraper
+    #   Scruber::Core::Crawler.new(:simple) do
+    #     get 'http://example.com'
+    #     parse :html do |page,html|
+    #       puts html.at('title').text
+    #     end
+    #   end
+    #
+    # @author Ivan Goncharov
+    #
     class Crawler
       attr_reader :queue, :fetcher, :scraper_name

+      #
+      # Initialize crawler with scraper name and/or with options
+      #
+      #   Crawler.new(:sample, fetcher_adapter: :custom)
+      #   Crawler.new(:sample)
+      #   Crawler.new(fetcher_adapter: :custom)
+      #
+      # @param args [Array] if the first arg is a Symbol, it will be used as scraper_name; a hash will be used as configuration options (see {Scruber::Core::Configuration})
+      #
+      # @return [Scruber::Core::Crawler]
       def initialize(*args)
         if args.first.is_a?(Hash)
           scraper_name = nil
@@ -14,23 +39,29 @@ module Scruber
         @scraper_name = scraper_name.present? ? scraper_name : ENV['SCRUBER_SCRAPER_NAME']
         raise Scruber::ArgumentError.new("Scraper name is empty. Pass it to `Scruber.run :name do` or through ENV['SCRUBER_SCRAPER_NAME']") if @scraper_name.blank?
         @scraper_name = @scraper_name.to_sym
-        Scruber.configuration.merge_options(options)
         @callbacks_options = {}
         @callbacks = {}
+        @on_page_error_callback = nil
         @on_complete_callbacks = []
-        @queue = Scruber::Queue.new(scraper_name: scraper_name)
+
+        Scruber.configuration.merge_options(options)
+        ActiveSupport::Dependencies.autoload_paths = Scruber.configuration.autoload_paths
+
+        @queue = Scruber::Queue.new(scraper_name: @scraper_name)
         @fetcher = Scruber::Fetcher.new
+        initialize_progressbar
         load_extenstions
       end

       #
-      # Run crawling.
+      # Crawling engine
       #
       # @param block [Proc] crawler body
       def run(&block)
         instance_eval &block
         while @queue.has_work? do
           @fetcher.run @queue
+          show_progress
           while page = @queue.fetch_downloaded do
             if @callbacks[page.page_type.to_sym]
               processed_page = process_page(page, page.page_type.to_sym)
@@ -38,16 +69,40 @@ module Scruber
               page.processed! unless page.sent_to_redownload?
             end
           end
+          if @on_page_error_callback
+            while page = @queue.fetch_error do
+              instance_exec page, &(@on_page_error_callback)
+            end
+          end
         end
         @on_complete_callbacks.sort_by{|c| -c[0] }.each do |(_,callback)|
           instance_exec &(callback)
         end
       end

+      #
+      # Register parser
+      #
+      # @param page_type [Symbol] type of page
+      # @param options [Hash] options for parser
+      # @option options [Symbol] :format format of page. Scruber automatically processes
+      #   the page body depending on this format, for example :json or :html
+      # @param block [Proc] body of parser
+      #
+      # @return [void]
       def parser(page_type, options={}, &block)
         register_callback(page_type, options, &block)
       end

+      #
+      # Method missing callback. Scruber allows registering
+      # a regexp and a proc body to process such calls
+      #
+      # @param method_sym [Symbol] missing method name
+      # @param arguments [Array] arguments
+      # @param block [Proc] block (if passed)
+      #
+      # @return [Object]
       def method_missing(method_sym, *arguments, &block)
         Scruber::Core::Crawler._registered_method_missings.each do |(pattern, func)|
           if (scan_results = method_sym.to_s.scan(pattern)).present?
@@ -68,34 +123,137 @@ module Scruber
         end
       end

       class << self
+
+        #
+        # Register method missing callback
+        #
+        # @param pattern [Regexp] Regexp to match missing name
+        # @param block [Proc] Body to process missing method
+        #
+        # @return [void]
         def register_method_missing(pattern, &block)
           _registered_method_missings[pattern] = block
         end

+        #
+        # Registered method missing callbacks dictionary
+        #
+        # @return [Hash] callbacks
         def _registered_method_missings
           @registered_method_missings ||= {}
         end
       end

+      #
+      # Register callback which will be executed when
+      # downloading and parsing are complete.
+      # For example when you need to write results to a file,
+      # or to close files.
+      # @example Close file descriptors
+      #   on_complete -1 do
+      #     Scruber::Core::Extensions::CsvOutput.close_all
+      #   end
+      #
+      # @param priority [Integer] priority of this callback
+      # @param block [Proc] body of callback
+      #
+      # @return [void]
+      def on_complete(priority=1, &block)
+        @on_complete_callbacks.push [priority,block]
+      end
+
+      #
+      # Register callback which will be executed for
+      # error pages, like 404 or 500.
+      # Attention! You should call one of these methods for the page
+      # to prevent an infinite loop: page.processed!, page.delete, page.redownload!(0)
+      # @example Processing error page
+      #   on_page_error do |page|
+      #     if page.response_body =~ /distil/
+      #       page.redownload!(0)
+      #     elsif page.response_code == 404
+      #       get page.at('a.moved_to').attr('href')
+      #       page.processed!
+      #     else
+      #       page.delete
+      #     end
+      #   end
+      #
+      # @param block [Proc] body of callback
+      #
+      # @return [void]
+      def on_page_error(&block)
+        @on_page_error_callback = block
+      end
+
       private

+      #
+      # Register parser
+      #
+      # @param page_type [Symbol] type of page
+      # @param options [Hash] options for parser
+      # @option options [Symbol] :format format of page. Scruber automatically processes
+      #   the page body depending on this format, for example :json or :html
+      # @param block [Proc] body of parser
+      #
+      # @return [void]
       def register_callback(page_type, options, &block)
         @callbacks_options[page_type.to_sym] = options || {}
         @callbacks[page_type.to_sym] = block
       end

-      def on_complete(priority=1, &block)
-        @on_complete_callbacks.push [priority,block]
-      end
-
+      #
+      # Process page body depending on the format of this page.
+      # For example, if page_format = :html, then
+      # it will return Nokogiri::HTML(page.response_body)
+      #
+      # @param page [Page] page from queue
+      # @param page_type [Symbol] name of parser
+      #
+      # @return [Object] depending on page_type it will return different objects
       def process_page(page, page_type)
         page_format = @callbacks_options[page_type].fetch(:format){ nil }
         Scruber::Core::PageFormat.process(page, page_format)
       end

+      #
+      # Loads all extensions
+      #
+      # @return [void]
       def load_extenstions
         Scruber::Core::Extensions::Base.descendants.each(&:register)
       end
+
+      #
+      # Initialize progressbar that shows progress in console
+      #
+      # @return [void]
+      def initialize_progressbar
+        unless Scruber.configuration.silent
+          @progressbar = PowerBar.new
+          @progressbar.settings.tty.finite.template.main = "${<msg>} ${<bar> }\e[0m \e[33;1m${<percent>%} (${<done>/<total>})"
+          @progressbar.settings.tty.finite.template.padchar = "\e[30;1m#{@progressbar.settings.tty.finite.template.padchar}"
+          @progressbar.settings.tty.finite.template.barchar = "\e[34;1m#{@progressbar.settings.tty.finite.template.barchar}"
+          @progressbar.settings.tty.finite.template.exit = "\e[?25h\e[0m" # clean up after us
+          @progressbar.settings.tty.finite.template.close = "\e[?25h\e[0m\n" # clean up after us
+          @progressbar.settings.tty.finite.output = Proc.new{ |s|
+            $stderr.print s
+          }
+        end
+      end
+
+      #
+      # Output progress to console
+      #
+      # @return [void]
+      def show_progress
+        if @progressbar
+          s = queue.size
+          @progressbar.show({:msg => @proggress_status, :done => queue.downloaded_count, :total => s}) unless s.zero?
+        end
+      end
     end
+
   end
 end
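A sketch of a scraper using the two callbacks introduced here, assembled from the @example blocks above (the URL, the selector, and the use of log are illustrative):

    Scruber.run :sample do
      get 'https://example.com'

      parse :html do |page, doc|
        log doc.at('title').text
      end

      on_page_error do |page|   # new in 0.1.5: drains queue.fetch_error
        page.delete             # always finish with processed!/delete/redownload!
      end

      on_complete -1 do         # now part of the public DSL
        Scruber::Core::Extensions::CsvOutput.close_all
      end
    end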
@@ -1,12 +1,22 @@
 module Scruber
   module Core
     module Extensions
+      #
+      # Base class for extensions
+      # @abstract
+      #
+      # @author [revis0r]
+      #
       class Base
         module CoreMethods

         end

         class << self
+          #
+          # Register extension in crawler core
+          #
+          # @return [void]
           def register
             Scruber::Core::Crawler.include self.const_get(:CoreMethods)
           end
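Since register simply mixes an extension's CoreMethods into the Crawler, and load_extenstions registers every Base descendant, a minimal custom extension would presumably look like this (Shout is a made-up name used purely for illustration):

    module Scruber
      module Core
        module Extensions
          class Shout < Base            # hypothetical extension
            module CoreMethods
              # becomes available inside Scruber.run blocks once registered
              def shout(text)
                log text.upcase
              end
            end
          end
        end
      end
    end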
@@ -1,6 +1,20 @@
 module Scruber
   module Core
     module Extensions
+      #
+      # Helper to write csv files
+      # @example Writing log and products data
+      #   Scruber.run :simple do
+      #     csv_file Scruber.root.join('log.csv'), col_sep: ';'
+      #     csv_products_file Scruber.root.join('products.csv'), col_sep: ';'
+      #
+      #     csv_out [Time.now.to_i, 'sample log record']
+      #     csv_product_out ['ID', 'Title']
+      #     csv_product_out ['1', 'Soap']
+      #   end
+      #
+      # @author Ivan Goncharov
+      #
       class CsvOutput < Base
         module CoreMethods
           def csv_file(path, options={})
@@ -16,6 +30,13 @@ module Scruber
             Scruber::Core::Extensions::CsvOutput.csv_out :default, fields
           end

+          #
+          # Registers method missing callbacks when included
+          # into the crawler class
+          #
+          # @param base [Class] class where module was included
+          #
+          # @return [void]
           def self.included(base)
             Scruber::Core::Crawler.register_method_missing /\Acsv_(\w+)_file\Z/ do |meth, scan_results, args|
               file_id = scan_results.first.first.to_sym
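For reference, the default-file variant of this DSL, taken from the @example above (no API beyond what the diff shows is assumed):

    Scruber.run :simple do
      csv_file Scruber.root.join('log.csv'), col_sep: ';'  # default output file
      csv_out [Time.now.to_i, 'sample log record']         # appends a row to it
    end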
@@ -0,0 +1,39 @@
+module Scruber
+  module Core
+    module Extensions
+      #
+      # Logging class
+      # Allows writing logs to file and console, depending on configuration
+      #
+      # @author Ivan Goncharov
+      #
+      class Log < Base
+        module CoreMethods
+          #
+          # Writing log
+          #
+          # @param text [String] text
+          # @param color [Symbol] color of text to write
+          #
+          # @return [void]
+          def log(text, color=:white)
+            Scruber.logger.info(scraper_name){ text } rescue nil
+            if @progressbar
+              @progressbar.print "#{Paint[text, color]}\n"
+            end
+          end
+
+          #
+          # Setting status for console progressbar
+          #
+          # @param text [String] text
+          #
+          # @return [void]
+          def set_status(text)
+            @proggress_status = text
+          end
+        end
+      end
+    end
+  end
+end
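A sketch of the two helpers inside a parser block (page.url is assumed to exist on the queue's page object; colors come from the Paint gem and :green is an arbitrary choice):

    Scruber.run :sample do
      get 'https://example.com'
      parse :html do |page, doc|
        set_status "parsing #{page.url}"   # shown in the progressbar line
        log doc.at('title').text, :green   # written to the log file and console
      end
    end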
@@ -1,13 +1,47 @@
 module Scruber
   module Core
     module Extensions
+      #
+      # Helper for reading dictionaries.
+      # @example Adding dictionary and reading it
+      #   Scruber.run :sample do
+      #     add_dictionary :zip_codes_usa, Scruber.root.join('dict', 'zip_codes_usa.csv'), :csv
+      #     seed do
+      #       loop :zip_codes_usa, state: 'NY' do |row|
+      #         get 'https://example.com/by_zip/'+row['zip'].to_s
+      #       end
+      #     end
+      #   end
+      #
+      # @author Ivan Goncharov
+      #
       class Loop < Base
         module CoreMethods
+          #
+          # Iterate records from dictionary
+          #
+          # @param dictionary [Symbol] name of dictionary
+          # @param options [Hash] search conditions
+          # @param block [Proc] body, yields row of dictionary
+          #
+          # @return [void]
           def loop(dictionary, options={}, &block)
             Scruber::Core::Extensions::Loop.loop dictionary, options do |*args|
               instance_exec *args, &block
             end
           end
+
+          #
+          # Registering dictionary in system
+          #
+          # @param name [Symbol] name of dictionary
+          # @param file_path [String] path to file
+          # @param file_type [Symbol] type of file, :xml, :csv, etc.
+          #
+          # @return [void]
+          def add_dictionary(name, file_path, file_type)
+            Scruber::Core::Extensions::Loop.add_dictionary(name, file_path, file_type)
+          end
         end

         class << self
@@ -1,6 +1,30 @@
 module Scruber
   module Core
     module Extensions
+      #
+      # DSL for registering parsers.
+      # @example Sample of DSL
+      #   Scruber.run :sample do
+      #     get 'https://example.com'
+      #     get_product 'https://example.com/product1.html'
+      #
+      #     # Parsing https://example.com
+      #     parse :html do |page,doc|
+      #       log doc.at('title').text
+      #     end
+      #
+      #     # Parsing https://example.com/product1.html
+      #     parse_product :html do |page,doc|
+      #       log doc.at('title').text
+      #     end
+      #     # Alias to
+      #     # parser :product, format: :html do |page,doc|
+      #     #   log doc.at('title').text
+      #     # end
+      #   end
+      #
+      # @author Ivan Goncharov
+      #
       class ParserAliases < Base
         module CoreMethods
           def parse(*args, &block)
@@ -1,6 +1,21 @@
 module Scruber
   module Core
     module Extensions
+      #
+      # DSL for adding pages to queue
+      # @example Sample of DSL
+      #   Scruber.run :sample do
+      #     get_product 'https://example.com/product1.html'
+      #     # Alias to
+      #     # queue.add 'https://example.com/product1.html', page_type: :product
+      #
+      #     post_subscribe 'https://example.com/subscribe', body: { email: 'sample@example.com' }
+      #     # Alias to
+      #     # queue.add 'https://example.com/subscribe', method: :post, page_type: :subscribe, body: { email: 'sample@example.com' }
+      #   end
+      #
+      # @author Ivan Goncharov
+      #
       class QueueAliases < Base
         module CoreMethods
           %w(get post head).each do |meth|
@@ -0,0 +1,23 @@
+module Scruber
+  module Core
+    module Extensions
+
+      #
+      # Seed DSL
+      # Seed block executes only when the queue was not initialized yet
+      # (the queue has no pages, processed or pending)
+      #
+      # @author Ivan Goncharov
+      #
+      class Seed < Base
+        module CoreMethods
+          def seed(&block)
+            unless queue.initialized?
+              instance_exec &block
+            end
+          end
+        end
+      end
+    end
+  end
+end
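The seed block is the natural place for initial URLs; with a persistent queue, a restarted scraper skips it and resumes the existing queue. A sketch:

    Scruber.run :sample do
      seed do
        get 'https://example.com/catalog'  # runs only while the queue is uninitialized
      end

      parse :html do |page, doc|
        log doc.at('title').text
      end
    end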
@@ -32,8 +32,18 @@ module Scruber
         if page.max_retry_times.nil?
           page.max_retry_times = @max_retry_times
         end
+        if page.max_retry_times && page.retry_count >= page.max_retry_times.to_i
+          page.retry_at = 1.year.from_now.to_i
+        end
       else
-        page.fetched_at = Time.now.to_i
+        # Monkey patch to prevent redownloading of 404 pages
+        # and processing 404 pages by regular parsers
+        if page.response_code == 404
+          page.retry_count = 1 if page.retry_count.nil? || page.retry_count.zero?
+          page.max_retry_times = page.retry_count
+        else
+          page.fetched_at = Time.now.to_i
+        end
       end
       page
     end
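A hedged trace of the 404 branch above; the interaction with fetch_downloaded and fetch_error is inferred from the run loop earlier in this diff, not stated in the adapter itself:

    page.response_code    # => 404
    # after the callback:
    page.retry_count      # => 1
    page.max_retry_times  # => 1, retry budget exhausted, so no redownload
    page.fetched_at       # => nil, so regular parsers never see this page;
                          #    it presumably surfaces through on_page_error instead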
@@ -89,26 +99,16 @@ module Scruber

     def bad_response?(page)
       case page.response_code
-      when 0
-        true
-      when 1
-        true
-      when 100..199
+      when 0..1
         true
-      when 200
-        false
-      when 201..299
+      when 200..299
         false
       when 300..399
         @options.fetch(:followlocation) { false }
       when 404
         false
       when 407
-        raise "RejectedByProxyError"
-      when 400..499
-        true
-      when 500..599
-        true
+        raise "RejectedByProxy"
       else
         true
       end
@@ -64,7 +64,7 @@ module Scruber
       page.response_total_time = response.total_time

       if response.timed_out?
-        page[:response_code] = 1
+        page.response_code = 1
       end

       page = after_request_callback(page)