scruber 0.1.4 → 0.1.5

data/lib/scruber/queue_adapters/abstract_adapter.rb CHANGED
@@ -1,12 +1,44 @@
  module Scruber
  module QueueAdapters
+ #
+ # Abstract Queue Adapter
+ #
+ # @abstract
+ # @author Ivan Goncharov
+ #
  class AbstractAdapter
-
+ #
+ # Queue page wrapper
+ #
+ # @author Ivan Goncharov
+ #
+ # @attr [Object] id ID of page. Autogenerated if not passed
+ # @attr [String] url URL of page
+ # @attr [String] method Request method: get, post or head
+ # @attr [String] user_agent Fixed User-Agent for requesting this page
+ # @attr [Hash] headers Headers for requesting this page
+ # @attr [Object] fetcher_agent_id ID of the FetcherAgent assigned to this page
+ # @attr [Object] proxy_id ID of the proxy assigned to this page
+ # @attr [String] response_body Response body
+ # @attr [Integer] response_code Response code
+ # @attr [Hash] response_headers Response headers
+ # @attr [Float] response_total_time Response total time
+ # @attr [Integer] retry_at Earliest timestamp of the next retry
+ # @attr [Integer] fetched_at Download completion timestamp
+ # @attr [Integer] retry_count Number of download attempts
+ # @attr [Integer] max_retry_times Max number of download attempts
+ # @attr [Integer] enqueued_at Timestamp when the page was added to the queue
+ # @attr [String] page_type Page type
+ # @attr [Scruber::QueueAdapters::AbstractAdapter] queue Queue object
+ # @attr [Integer] priority Priority of page in queue for fetcher
+ # @attr [Integer] processed_at Timestamp when the page was processed by the parser
+ # @attr [Hash] options All options
  class Page
- attr_accessor :url,
+ attr_accessor :id,
+ :url,
  :method,
  :user_agent,
- :post_body,
+ :body,
  :headers,
  :fetcher_agent_id,
  :proxy_id,
@@ -25,14 +57,16 @@ module Scruber
  :processed_at,
  :options
 
- def initialize(queue, url, options={})
+ def initialize(queue, options={})
  @queue = queue
- @url = url
 
  options = options.with_indifferent_access
+ @options = options
+ @id = options.fetch(:id) { generate_page_id }
+ @url = options.fetch(:url) { raise "URL not provided" }
  @method = options.fetch(:method) { :get }
  @user_agent = options.fetch(:user_agent) { nil }
- @post_body = options.fetch(:post_body) { nil }
+ @body = options.fetch(:body) { nil }
  @headers = options.fetch(:headers) { {} }
  @fetcher_agent_id = options.fetch(:fetcher_agent_id) { nil }
  @proxy_id = options.fetch(:proxy_id) { nil }
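The constructor no longer takes the URL positionally: it now arrives through the options hash, alongside the `post_body` → `body` rename and the autogenerated id. A minimal sketch of the new call shape (the Memory adapter from this release is used purely for illustration):

```ruby
queue = Scruber::QueueAdapters::Memory.new

page = Scruber::QueueAdapters::Memory::Page.new(queue, url: 'https://example.com',
                                                       method: :post,
                                                       body: 'q=1')
page.id   # autogenerated MD5 fingerprint, since no :id was passed
page.body # => "q=1" -- renamed from post_body in this release

Scruber::QueueAdapters::Memory::Page.new(queue, method: :get)
# => RuntimeError: URL not provided
```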
@@ -49,13 +83,16 @@ module Scruber
  # @queue = options.fetch(:queue) { 'default' }
  @priority = options.fetch(:priority) { 0 }
  @processed_at = options.fetch(:processed_at) { 0 }
- @options = options
 
  @_fetcher_agent = false
  @_proxy = false
  @_redownload = false
  end
 
+ #
+ # Returns the FetcherAgent assigned to this page
+ #
+ # @return [Scruber::Helpers::FetcherAgent] Agent object
  def fetcher_agent
  if @_fetcher_agent == false
  @_fetcher_agent = (@fetcher_agent_id ? Scruber::Helpers::FetcherAgent.find(@fetcher_agent_id) : nil)
@@ -64,6 +101,10 @@ module Scruber
  end
  end
 
+ #
+ # Returns the proxy assigned to this page
+ #
+ # @return [Proxy] proxy object
  def proxy
  if @_proxy == false
  @_proxy = (@proxy_id ? Scruber::Helpers::ProxyRotator.find(@proxy_id) : nil)
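Both `fetcher_agent` and `proxy` memoize with a `false` sentinel rather than `||=`, because `nil` is a legitimate cached result (no agent or proxy assigned) and `||=` would re-run the lookup on every call. An illustrative sketch of the difference, with `find_proxy` as a hypothetical stand-in for the rotator lookup:

```ruby
class Lookup
  attr_reader :calls

  def initialize
    @calls = 0
    @_proxy = false             # sentinel: "not looked up yet"
  end

  def find_proxy                # hypothetical stand-in for ProxyRotator.find
    @calls += 1
    nil                         # no proxy assigned
  end

  def proxy
    @_proxy = find_proxy if @_proxy == false
    @_proxy                     # nil is cached like any other value
  end
end

l = Lookup.new
3.times { l.proxy }
l.calls # => 1; with `@_proxy ||= find_proxy` it would be 3
```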
@@ -72,6 +113,10 @@ module Scruber
  end
  end
 
+ #
+ # Returns cookies from response headers
+ #
+ # @return [Array] array of cookies
  def response_cookies
  cookies = self.response_headers['Set-Cookie']
  if cookies.blank?
@@ -93,50 +138,144 @@ module Scruber
  instance_variable_get("@#{k.to_s}")
  end
 
+ #
+ # Delete page from queue
+ #
+ # @return [void]
  def delete
  raise NotImplementedError
  end
 
+ #
+ # Mark page as processed by parser and save it
+ #
+ # @return [void]
  def processed!
  @processed_at = Time.now.to_i
  @_redownload = false
  save
  end
 
- def redownload!
+ #
+ # Mark page as pending and return it to the queue
+ #
+ # @param new_retry_count [Integer] new retry count; allows resetting the retry counter
+ #
+ # @return [void]
+ def redownload!(new_retry_count=nil)
  @_redownload = true
 
- @processed_at = nil
- @retry_count += 1
+ @processed_at = 0
+ if new_retry_count
+ @retry_count = new_retry_count
+ else
+ @retry_count += 1
+ end
  @fetched_at = 0
  @response_body = nil
  save
  end
 
+ #
+ # Whether the page was marked for redownloading
+ #
+ # @return [Boolean] true if the page needs to be redownloaded
  def sent_to_redownload?
  @_redownload
  end
+
+ private
+
+ def generate_page_id
+ Digest::MD5.hexdigest @options.slice(:method, :url, :headers, :body).to_json
+ end
  end
 
  def initialize(options={})
  @options = options
  end
 
+ #
+ # Add page to queue
+ # @param url [String] URL of page
+ # @param options [Hash] Other options, see {Scruber::QueueAdapters::AbstractAdapter::Page}
+ #
+ # @return [void]
  def add(url, options={})
  raise NotImplementedError
  end
 
+ #
+ # Search for a page by id
+ # @param id [Object] id of page
+ #
+ # @return [Page] page object
+ def find(id)
+ raise NotImplementedError
+ end
+
+ #
+ # Size of queue
+ #
+ # @return [Integer] count of pages in queue
+ def size
+ raise NotImplementedError
+ end
+
+ #
+ # Fetch pending pages for downloading
+ # @param count [Integer] count of pages to fetch
+ #
+ # @return [Scruber::QueueAdapters::AbstractAdapter::Page|Array<Scruber::QueueAdapters::AbstractAdapter::Page>] page if count is nil, or array of pages if count > 0
  def fetch_pending(count=nil)
  raise NotImplementedError
  end
 
+ #
+ # Fetch downloaded but not yet processed pages for parsing
+ # @param count [Integer] count of pages to fetch
+ #
+ # @return [Scruber::QueueAdapters::AbstractAdapter::Page|Array<Scruber::QueueAdapters::AbstractAdapter::Page>] page if count is nil, or array of pages if count > 0
  def fetch_downloaded(count=nil)
  raise NotImplementedError
  end
 
+ #
+ # Fetch error pages
+ # @param count [Integer] count of pages to fetch
+ #
+ # @return [Scruber::QueueAdapters::AbstractAdapter::Page|Array<Scruber::QueueAdapters::AbstractAdapter::Page>] page if count is nil, or array of pages if count > 0
+ def fetch_error(count=nil)
+ raise NotImplementedError
+ end
+
+ #
+ # Count of downloaded pages.
+ # Used to show download progress.
+ #
+ # @return [Integer] count of downloaded pages
+ def downloaded_count
+ raise NotImplementedError
+ end
+
+ #
+ # Check if the queue was initialized.
+ # Used by the `seed` method: if the queue was already
+ # initialized, there is no need to run the seed block.
+ #
+ # @return [Boolean] true if queue already was initialized
  def initialized?
  raise NotImplementedError
  end
+
+ #
+ # Used by Core. Checks for pages that are
+ # not yet downloaded or not yet parsed.
+ #
+ # @return [Boolean] true if queue still has work for scraper
+ def has_work?
+ raise NotImplementedError
+ end
  end
  end
  end
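The new private `generate_page_id` fingerprints a page from its request parameters, so two pages with the same method, url, headers, and body get the same id. A standalone sketch of the hashing, using plain stdlib calls outside the gem:

```ruby
require 'digest'
require 'json'

def page_id_for(options)
  # Same recipe as generate_page_id: hash the request-defining options
  Digest::MD5.hexdigest options.slice(:method, :url, :headers, :body).to_json
end

a = page_id_for(method: :get, url: 'https://example.com')
b = page_id_for(method: :get, url: 'https://example.com')
a == b # => true; identical requests share an id, which is what enables deduplication
```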
data/lib/scruber/queue_adapters/memory.rb CHANGED
@@ -1,46 +1,118 @@
  module Scruber
  module QueueAdapters
+ #
+ # Memory Queue Adapter
+ #
+ # Simple queue adapter which stores pages in memory.
+ # A good fit for small scrapes: easy to use, with no
+ # database to set up, but no way to reparse pages
+ # if something goes wrong.
+ #
+ # @author Ivan Goncharov
+ #
  class Memory < AbstractAdapter
  attr_reader :error_pages
 
+ #
+ # Queue item class
+ #
+ # @author Ivan Goncharov
+ #
+ # @attr (see Scruber::QueueAdapters::AbstractAdapter::Page)
+ #
  class Page < Scruber::QueueAdapters::AbstractAdapter::Page
+
+ #
+ # Save page
+ #
+ # Depending on page attributes, pushes the page
+ # to the pending, downloaded, or error queue.
+ #
+ # @return [void]
  def save
  if self.processed_at.to_i > 0
- nil
+ @queue.add_processed_page self
  elsif self.fetched_at > 0
  @queue.add_downloaded self
  elsif self.max_retry_times && self.retry_count >= self.max_retry_times.to_i
  @queue.add_error_page self
  else
- @queue.push self
+ @queue.add self
  end
  end
 
+ #
+ # Delete page from all queues
+ #
+ # @return [void]
  def delete
  @queue.delete self
  end
  end
 
+ #
+ # Queue initializer
+ # @param options [Hash] See {Scruber::QueueAdapters::AbstractAdapter#initialize}
+ #
+ # @return [Scruber::QueueAdapters::Memory] adapter instance
  def initialize(options={})
  super(options)
+ @processed_ids = []
  @queue = []
  @downloaded_pages = []
  @error_pages = []
  end
 
- def push(url_or_page, options={})
- if url_or_page.is_a?(Page)
- @queue.push url_or_page
- else
- @queue.push Page.new(self, url_or_page, options)
+ #
+ # Add page to queue
+ # @param url_or_page [String|Page] URL of page or Page object
+ # @param options [Hash] Other options, see {Scruber::QueueAdapters::AbstractAdapter::Page}
+ #
+ # @return [void]
+ def add(url_or_page, options={})
+ unless url_or_page.is_a?(Page)
+ url_or_page = Page.new(self, options.merge(url: url_or_page))
+ end
+ @queue.push(url_or_page) unless @processed_ids.include?(url_or_page.id) || find(url_or_page.id)
+ end
+ alias_method :push, :add
+
+ #
+ # Search for a page by id
+ # @param id [Object] id of page
+ #
+ # @return [Page] page
+ def find(id)
+ [@queue, @downloaded_pages, @error_pages].each do |q|
+ q.each do |i|
+ return i if i.id == id
+ end
  end
+ nil
  end
- alias_method :add, :push
 
- def queue_size
+ #
+ # Size of queue
+ #
+ # @return [Integer] count of pages in queue
+ def size
  @queue.count
  end
 
+ #
+ # Count of downloaded pages.
+ # Used to show download progress.
+ #
+ # @return [Integer] count of downloaded pages
+ def downloaded_count
+ @downloaded_pages.count
+ end
+
+ #
+ # Fetch downloaded but not yet processed pages for parsing
+ # @param count [Integer] count of pages to fetch
+ #
+ # @return [Scruber::QueueAdapters::AbstractAdapter::Page|Array<Scruber::QueueAdapters::AbstractAdapter::Page>] page if count is nil, or array of pages if count > 0
  def fetch_downloaded(count=nil)
  if count.nil?
  @downloaded_pages.shift
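`push` is now an alias of the renamed `add`, and both consult the fingerprint id before enqueueing, so duplicates are silently dropped. A quick sketch:

```ruby
queue = Scruber::QueueAdapters::Memory.new

queue.add 'https://example.com'
queue.push 'https://example.com'             # alias of add; same fingerprint id
queue.size                                   # => 1, duplicate was skipped

queue.add 'https://example.com', method: :post, body: 'q=1'
queue.size                                   # => 2, different fingerprint
```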
@@ -49,6 +121,24 @@ module Scruber
  end
  end
 
+ #
+ # Fetch error pages
+ # @param count [Integer] count of pages to fetch
+ #
+ # @return [Scruber::QueueAdapters::AbstractAdapter::Page|Array<Scruber::QueueAdapters::AbstractAdapter::Page>] page if count is nil, or array of pages if count > 0
+ def fetch_error(count=nil)
+ if count.nil?
+ @error_pages.shift
+ else
+ @error_pages.shift(count)
+ end
+ end
+
+ #
+ # Fetch pending pages for downloading
+ # @param count [Integer] count of pages to fetch
+ #
+ # @return [Scruber::QueueAdapters::AbstractAdapter::Page|Array<Scruber::QueueAdapters::AbstractAdapter::Page>] page if count is nil, or array of pages if count > 0
  def fetch_pending(count=nil)
  if count.nil?
  @queue.shift
@@ -57,24 +147,64 @@ module Scruber
  end
  end
 
+ #
+ # Internal method to add a page to the downloaded queue
+ #
+ # @param page [Scruber::QueueAdapters::Memory::Page] page
+ #
+ # @return [void]
  def add_downloaded(page)
  @downloaded_pages.push page
  end
 
+ #
+ # Internal method to add a page to the error queue
+ #
+ # @param page [Scruber::QueueAdapters::Memory::Page] page
+ #
+ # @return [void]
  def add_error_page(page)
  @error_pages.push page
  end
 
+ #
+ # Saves the processed page id to prevent
+ # identical pages from being re-added to the queue
+ #
+ # @param page [Page] page
+ #
+ # @return [void]
+ def add_processed_page(page)
+ @processed_ids.push page.id
+ end
+
+ #
+ # Used by Core. Checks for pages that are
+ # not yet downloaded or not yet parsed.
+ #
+ # @return [Boolean] true if queue still has work for scraper
  def has_work?
  @queue.count > 0 || @downloaded_pages.count > 0
  end
 
+ #
+ # Delete page from all internal queues
+ #
+ # @param page [Scruber::QueueAdapters::Memory::Page] page
+ #
+ # @return [void]
  def delete(page)
  @queue -= [page]
  @downloaded_pages -= [page]
  @error_pages -= [page]
  end
 
+ #
+ # Check if the queue was initialized.
+ # Used by the `seed` method: if the queue was already
+ # initialized, there is no need to run the seed block.
+ #
+ # @return [Boolean] true if queue already was initialized
  def initialized?
  @queue.present? || @downloaded_pages.present? || @error_pages.present?
  end
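End to end, a page moves pending → downloaded → processed, and `has_work?` is what Core polls to decide whether the scraper keeps running. A sketch of the lifecycle (`fetched_at` is normally set by the fetcher; here it is assigned by hand for illustration):

```ruby
queue = Scruber::QueueAdapters::Memory.new
queue.add 'https://example.com'

page = queue.fetch_pending
page.fetched_at = Time.now.to_i
page.save                  # fetched_at > 0, so it lands in the downloaded queue
queue.downloaded_count     # => 1

page = queue.fetch_downloaded
page.processed!            # processed_at > 0: the id is stored in @processed_ids

queue.add 'https://example.com'
queue.has_work?            # => false, the processed id blocks re-enqueueing
```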
data/lib/scruber/version.rb CHANGED
@@ -1,3 +1,3 @@
  module Scruber
- VERSION = "0.1.4"
+ VERSION = "0.1.5"
  end
data/lib/scruber.rb CHANGED
@@ -3,7 +3,11 @@ require 'nokogiri'
  require 'http-cookie'
  require 'pickup'
  require 'csv'
+ require 'paint'
+ require 'powerbar'
+ require 'core_ext/const_missing'
  require 'active_support'
+ require 'active_support/dependencies'
  require 'active_support/core_ext/object'
  require 'active_support/core_ext/hash'
 
@@ -22,6 +26,8 @@ require "scruber/core/page_format/html"
 
  require "scruber/core/extensions/base"
  require "scruber/core/extensions/loop"
+ require "scruber/core/extensions/log"
+ require "scruber/core/extensions/seed"
  require "scruber/core/extensions/csv_output"
  require "scruber/core/extensions/queue_aliases"
  require "scruber/core/extensions/parser_aliases"
@@ -30,16 +36,13 @@ require "scruber/helpers/dictionary_reader"
  require "scruber/helpers/dictionary_reader/xml"
  require "scruber/helpers/dictionary_reader/csv"
 
- # require "scruber/core/configuration"
- # require "scruber/core/configuration"
-
  module Scruber
  class ArgumentError < ::ArgumentError; end
+
  module Core
- autoload :Configuration, "scruber/core/configuration"
- autoload :Crawler, "scruber/core/crawler"
+ autoload :Configuration, "scruber/core/configuration"
+ autoload :Crawler, "scruber/core/crawler"
  end
-
  module Helpers
  autoload :UserAgentRotator, "scruber/helpers/user_agent_rotator"
  autoload :ProxyRotator, "scruber/helpers/proxy_rotator"
@@ -51,11 +54,11 @@ module Scruber
  end
 
  class << self
- attr_writer :configuration
+ attr_writer :configuration, :logger
 
  def run(*args, &block)
  raise "You need a block to build!" unless block_given?
-
+
  Core::Crawler.new(*args).run(&block)
  end
 
@@ -63,8 +66,20 @@ module Scruber
  @configuration ||= Core::Configuration.new
  end
 
+ def logger
+ @logger ||= Scruber.root.nil? ? nil : Logger.new(Scruber.root.join('log', 'crawler.log'))
+ end
+
  def configure(&block)
  yield configuration
  end
+
+ def root
+ if defined?(APP_PATH)
+ Pathname.new(File.expand_path('../../', APP_PATH))
+ else
+ nil
+ end
+ end
  end
  end
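The new `Scruber.root` resolves the project directory from the `APP_PATH` constant (presumably defined by a generated project's executable, much as Rails binstubs do), and `Scruber.logger` lazily opens `log/crawler.log` beneath it. Expected behavior, sketched:

```ruby
require 'logger'

# Inside a generated project where the executable defines APP_PATH:
Scruber.root    # => Pathname of the project directory
Scruber.logger  # => Logger writing to <root>/log/crawler.log

# Outside a project, APP_PATH is undefined:
Scruber.root    # => nil
Scruber.logger  # => nil, until one is assigned via the new writer:
Scruber.logger = Logger.new($stdout)
```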
data/scruber.gemspec CHANGED
@@ -30,11 +30,13 @@ Gem::Specification.new do |spec|
  spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
  spec.require_paths = ["lib"]
 
- spec.add_dependency "typhoeus", "1.1.2"
- spec.add_dependency "pickup", "0.0.11"
- spec.add_dependency "nokogiri", "1.8.2"
+ spec.add_dependency "typhoeus", '~> 1.1', '>= 1.1.2'
+ spec.add_dependency "pickup", "~> 0.0.11"
+ spec.add_dependency "nokogiri", '~> 1.8', '>= 1.8.2'
  spec.add_dependency "http-cookie", "1.0.3"
- spec.add_dependency "activesupport", "5.1.5"
+ spec.add_dependency "activesupport", '~> 5.1', '>= 5.1.5'
+ spec.add_dependency "powerbar", '~> 2.0', '>= 2.0.1'
+ spec.add_dependency "paint", '~> 2.0', '>= 2.0.1'
  spec.add_runtime_dependency "thor", "0.20.0"
  spec.add_development_dependency "bundler", "~> 1.15"
  spec.add_development_dependency "rake", "~> 10.0"