scruber 0.1.4 → 0.1.5

This diff shows the contents of publicly released package versions as published to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
@@ -1,12 +1,44 @@
 module Scruber
   module QueueAdapters
+    #
+    # Abstract Queue Adapter
+    #
+    # @abstract
+    # @author Ivan Goncharov
+    #
     class AbstractAdapter
-
+      #
+      # Queue page wrapper
+      #
+      # @author Ivan Goncharov
+      #
+      # @attr [Object] id ID of page. Will be autogenerated if not passed
+      # @attr [String] url URL of page
+      # @attr [String] method Request method, post, get, head
+      # @attr [String] user_agent Fixed User-Agent for requesting this page
+      # @attr [Hash] headers Headers for requesting this page
+      # @attr [Object] fetcher_agent_id ID of FetcherAgent, assigned to this page
+      # @attr [Object] proxy_id ID of proxy, assigned to this page
+      # @attr [String] response_body Response body
+      # @attr [Integer] response_code Response code
+      # @attr [Hash] response_headers Response headers
+      # @attr [Float] response_total_time Response total time
+      # @attr [Integer] retry_at Minimal timestamp of next retry
+      # @attr [Integer] fetched_at Download completion timestamp
+      # @attr [Integer] retry_count Number of download attempts
+      # @attr [Integer] max_retry_times Max number of download attempts
+      # @attr [Integer] enqueued_at Timestamp added to the queue
+      # @attr [String] page_type Page type
+      # @attr [Scruber::QueueAdapters::AbstractAdapter::Page] queue Queue object
+      # @attr [Integer] priority Priority of page in queue for fetcher
+      # @attr [Integer] processed_at Processed by parser timestamp
+      # @attr [Hash] options All options
       class Page
-        attr_accessor :url,
+        attr_accessor :id,
+          :url,
           :method,
           :user_agent,
-          :post_body,
+          :body,
           :headers,
           :fetcher_agent_id,
           :proxy_id,
@@ -25,14 +57,16 @@ module Scruber
           :processed_at,
           :options

-        def initialize(queue, url, options={})
+        def initialize(queue, options={})
           @queue = queue
-          @url = url

           options = options.with_indifferent_access
+          @options = options
+          @id = options.fetch(:id) { generate_page_id }
+          @url = options.fetch(:url) { raise "URL not provided" }
           @method = options.fetch(:method) { :get }
           @user_agent = options.fetch(:user_agent) { nil }
-          @post_body = options.fetch(:post_body) { nil }
+          @body = options.fetch(:body) { nil }
           @headers = options.fetch(:headers) { {} }
           @fetcher_agent_id = options.fetch(:fetcher_agent_id) { nil }
           @proxy_id = options.fetch(:proxy_id) { nil }
@@ -49,13 +83,16 @@ module Scruber
           # @queue = options.fetch(:queue) { 'default' }
           @priority = options.fetch(:priority) { 0 }
           @processed_at = options.fetch(:processed_at) { 0 }
-          @options = options

           @_fetcher_agent = false
           @_proxy = false
           @_redownload = false
         end

+        #
+        # Returns assigned to this page FetcherAgent
+        #
+        # @return [Scruber::Helpers::FetcherAgent] Agent object
         def fetcher_agent
           if @_fetcher_agent == false
             @_fetcher_agent = (@fetcher_agent_id ? Scruber::Helpers::FetcherAgent.find(@fetcher_agent_id) : nil)
@@ -64,6 +101,10 @@ module Scruber
           end
         end

+        #
+        # Returns assigned to this page proxy
+        #
+        # @return [Proxy] proxy object
         def proxy
           if @_proxy == false
             @_proxy = (@proxy_id ? Scruber::Helpers::ProxyRotator.find(@proxy_id) : nil)
@@ -72,6 +113,10 @@ module Scruber
           end
         end

+        #
+        # Returns cookies from response headers
+        #
+        # @return [Array] array of cookies
         def response_cookies
           cookies = self.response_headers['Set-Cookie']
           if cookies.blank?
@@ -93,50 +138,144 @@ module Scruber
           instance_variable_get("@#{k.to_s}")
         end

+        #
+        # Delete page from queue
+        #
+        # @return [void]
         def delete
           raise NotImplementedError
         end

+        #
+        # Mark page as processed by parser and save it
+        #
+        # @return [void]
         def processed!
           @processed_at = Time.now.to_i
           @_redownload = false
           save
         end

-        def redownload!
+        #
+        # Mark page as pending and return to queue
+        #
+        # @param new_retry_count [Integer] new count of reties. Allows to reset retries count
+        #
+        # @return [void]
+        def redownload!(new_retry_count=nil)
           @_redownload = true

-          @processed_at = nil
-          @retry_count += 1
+          @processed_at = 0
+          if new_retry_count
+            @retry_count = new_retry_count
+          else
+            @retry_count += 1
+          end
           @fetched_at = 0
           @response_body = nil
           save
         end

+        #
+        # Marked as page for redownloading
+        #
+        # @return [Boolean] true if need to redownload
         def sent_to_redownload?
           @_redownload
         end
+
+        private
+
+        def generate_page_id
+          Digest::MD5.hexdigest @options.slice(:method, :url, :headers, :body).to_json
+        end
       end

       def initialize(options={})
         @options = options
       end

+      #
+      # Add page to queue
+      # @param url [String] URL of page
+      # @param options [Hash] Other options, see {Scruber::QueueAdapters::AbstractAdapter::Page}
+      #
+      # @return [void]
       def add(url, options={})
         raise NotImplementedError
       end

+      #
+      # Search page by id
+      # @param id [Object] id of page
+      #
+      # @return [Page] page object
+      def find(id)
+        raise NotImplementedError
+      end
+
+      #
+      # Size of queue
+      #
+      # @return [Integer] count of pages in queue
+      def size
+        raise NotImplementedError
+      end
+
+      #
+      # Fetch pending page for fetching
+      # @param count [Integer] count of pages to fetch
+      #
+      # @return [Scruber::QueueAdapters::AbstractAdapter::Page|Array<Scruber::QueueAdapters::AbstractAdapter::Page>] page of count = nil, or array of pages of count > 0
       def fetch_pending(count=nil)
         raise NotImplementedError
       end

+      #
+      # Fetch downloaded and not processed pages for feching
+      # @param count [Integer] count of pages to fetch
+      #
+      # @return [Scruber::QueueAdapters::AbstractAdapter::Page|Array<Scruber::QueueAdapters::AbstractAdapter::Page>] page of count = nil, or array of pages of count > 0
       def fetch_downloaded(count=nil)
         raise NotImplementedError
       end

+      #
+      # Fetch error page
+      # @param count [Integer] count of pages to fetch
+      #
+      # @return [Scruber::QueueAdapters::AbstractAdapter::Page|Array<Scruber::QueueAdapters::AbstractAdapter::Page>] page of count = nil, or array of pages of count > 0
+      def fetch_error(count=nil)
+        raise NotImplementedError
+      end
+
+      #
+      # Count of downloaded pages
+      # Using to show downloading progress.
+      #
+      # @return [Integer] count of downloaded pages
+      def downloaded_count
+        raise NotImplementedError
+      end
+
+      #
+      # Check if queue was initialized.
+      # Using for `seed` method. If queue was initialized,
+      # then no need to run seed block.
+      #
+      # @return [Boolean] true if queue already was initialized
      def initialized?
        raise NotImplementedError
      end
+
+      #
+      # Used by Core. It checks for pages that are
+      # not downloaded or not parsed yet.
+      #
+      # @return [Boolean] true if queue still has work for scraper
+      def has_work?
+        raise NotImplementedError
+      end
     end
   end
 end
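The most visible behavioural change in the abstract adapter is the `Page` constructor: the URL now travels inside the options hash, `post_body` is renamed to `body`, and every page gets an `id` that defaults to an MD5 digest of the request-defining options. Callers that previously wrote `Page.new(queue, url)` now pass `Page.new(queue, url: url)`, and code that set `post_body:` must switch to `body:`. The snippet below is a minimal plain-Ruby sketch of that id derivation; it mirrors the `generate_page_id` shown above but does not use the gem itself, and the option values are made up for illustration:

    require 'digest'
    require 'json'

    # Request-defining options, as the new Page#initialize collects them.
    options = { method: :get, url: 'https://example.com/page/1', headers: {}, body: nil }

    # 0.1.5 derives the default page id from exactly these four keys,
    # so adding the same request again later yields the same id.
    page_id = Digest::MD5.hexdigest(options.slice(:method, :url, :headers, :body).to_json)
    puts page_id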
@@ -1,46 +1,118 @@
 module Scruber
   module QueueAdapters
+    #
+    # Memory Queue Adapter
+    #
+    # Simple queue adapted which stores pages in memory.
+    # Nice solution for small scrapes.
+    # Easy to use. No need to setup any database, but
+    # no ability to reparse pages if something went wrong.
+    #
+    # @author Ivan Goncharov
+    #
     class Memory < AbstractAdapter
       attr_reader :error_pages

+      #
+      # Queue item class
+      #
+      # @author Ivan Goncharov
+      #
+      # @attr (see Scruber::QueueAdapters::AbstractAdapter::Page)
+      #
       class Page < Scruber::QueueAdapters::AbstractAdapter::Page
+
+        #
+        # Save page
+        #
+        # Depends on page attributes it push page
+        # to pending, downloaded or error queue.
+        #
+        # @return [void]
         def save
           if self.processed_at.to_i > 0
-            nil
+            @queue.add_processed_page self
           elsif self.fetched_at > 0
             @queue.add_downloaded self
           elsif self.max_retry_times && self.retry_count >= self.max_retry_times.to_i
             @queue.add_error_page self
           else
-            @queue.push self
+            @queue.add self
           end
         end

+        #
+        # Delete page from all queues
+        #
+        # @return [void]
         def delete
           @queue.delete self
         end
       end

+      #
+      # Queue initializer
+      # @param options [Hash] See {Scruber::QueueAdapters::AbstractAdapter#initializer}
+      #
+      # @return [Scruber::QueueAdapters::Memory] class instance
       def initialize(options={})
         super(options)
+        @processed_ids = []
         @queue = []
         @downloaded_pages = []
         @error_pages = []
       end

-      def push(url_or_page, options={})
-        if url_or_page.is_a?(Page)
-          @queue.push url_or_page
-        else
-          @queue.push Page.new(self, url_or_page, options)
+      #
+      # Add page to queue
+      # @param url_or_page [String|Page] URL of page or Page object
+      # @param options [Hash] Other options, see {Scruber::QueueAdapters::AbstractAdapter::Page}
+      #
+      # @return [void]
+      def add(url_or_page, options={})
+        unless url_or_page.is_a?(Page)
+          url_or_page = Page.new(self, options.merge(url: url_or_page))
+        end
+        @queue.push(url_or_page) unless @processed_ids.include?(url_or_page.id) || find(url_or_page.id)
+      end
+      alias_method :push, :add
+
+      #
+      # Search page by id
+      # @param id [Object] id of page
+      #
+      # @return [Page] page
+      def find(id)
+        [@queue, @downloaded_pages, @error_pages].each do |q|
+          q.each do |i|
+            return i if i.id == id
+          end
         end
+        nil
       end
-      alias_method :add, :push

-      def queue_size
+      #
+      # Size of queue
+      #
+      # @return [Integer] count of pages in queue
+      def size
         @queue.count
       end

+      #
+      # Count of downloaded pages
+      # Using to show downloading progress.
+      #
+      # @return [Integer] count of downloaded pages
+      def downloaded_count
+        @downloaded_pages.count
+      end
+
+      #
+      # Fetch downloaded and not processed pages for feching
+      # @param count [Integer] count of pages to fetch
+      #
+      # @return [Scruber::QueueAdapters::AbstractAdapter::Page|Array<Scruber::QueueAdapters::AbstractAdapter::Page>] page of count = nil, or array of pages of count > 0
       def fetch_downloaded(count=nil)
         if count.nil?
           @downloaded_pages.shift
@@ -49,6 +121,24 @@ module Scruber
         end
       end

+      #
+      # Fetch error page
+      # @param count [Integer] count of pages to fetch
+      #
+      # @return [Scruber::QueueAdapters::AbstractAdapter::Page|Array<Scruber::QueueAdapters::AbstractAdapter::Page>] page of count = nil, or array of pages of count > 0
+      def fetch_error(count=nil)
+        if count.nil?
+          @error_pages.shift
+        else
+          @error_pages.shift(count)
+        end
+      end
+
+      #
+      # Fetch pending page for fetching
+      # @param count [Integer] count of pages to fetch
+      #
+      # @return [Scruber::QueueAdapters::AbstractAdapter::Page|Array<Scruber::QueueAdapters::AbstractAdapter::Page>] page of count = nil, or array of pages of count > 0
       def fetch_pending(count=nil)
         if count.nil?
           @queue.shift
@@ -57,24 +147,64 @@ module Scruber
         end
       end

+      #
+      # Internal method to add page to downloaded queue
+      #
+      # @param page [Scruber::QueueAdapters::Memory::Page] page
+      #
+      # @return [void]
       def add_downloaded(page)
         @downloaded_pages.push page
       end

+      #
+      # Internal method to add page to error queue
+      #
+      # @param page [Scruber::QueueAdapters::Memory::Page] page
+      #
+      # @return [void]
       def add_error_page(page)
         @error_pages.push page
       end

+      #
+      # Saving processed page id to prevent
+      # adding identical pages to queue
+      #
+      # @param page [Page] page
+      #
+      # @return [void]
+      def add_processed_page(page)
+        @processed_ids.push page.id
+      end
+
+      #
+      # Used by Core. It checks for pages that are
+      # not downloaded or not parsed yet.
+      #
+      # @return [Boolean] true if queue still has work for scraper
       def has_work?
         @queue.count > 0 || @downloaded_pages.count > 0
       end

+      #
+      # Delete page from all internal queues
+      #
+      # @param page [Scruber::QueueAdapters::Memory::Page] page
+      #
+      # @return [void]
       def delete(page)
         @queue -= [page]
         @downloaded_pages -= [page]
         @error_pages -= [page]
       end

+      #
+      # Check if queue was initialized.
+      # Using for `seed` method. If queue was initialized,
+      # then no need to run seed block.
+      #
+      # @return [Boolean] true if queue already was initialized
       def initialized?
         @queue.present? || @downloaded_pages.present? || @error_pages.present?
       end
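Putting the renamed queue API together, here is a rough usage sketch of the in-memory adapter under 0.1.5. It assumes the gem and its dependencies are installed and that `require 'scruber'` loads the queue adapters; the URLs and response body are placeholders:

    require 'scruber'  # assumes scruber 0.1.5 is installed and loads its queue adapters

    queue = Scruber::QueueAdapters::Memory.new

    queue.add('https://example.com/a')   # was `push` in 0.1.4; `push` survives as an alias
    queue.add('https://example.com/a')   # same auto-generated id, so the duplicate is skipped
    puts queue.size                      # was `queue_size` in 0.1.4  => 1

    page = queue.fetch_pending           # shift the next pending page off the queue
    page.fetched_at    = Time.now.to_i
    page.response_body = '<html></html>'
    page.save                            # Page#save routes it to the downloaded queue
    puts queue.downloaded_count          # => 1

Note that `Page#save` now records processed ids via `add_processed_page`, which is what lets `add` skip requests that were already handled instead of re-enqueuing them.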
@@ -1,3 +1,3 @@
 module Scruber
-  VERSION = "0.1.4"
+  VERSION = "0.1.5"
 end
data/lib/scruber.rb CHANGED
@@ -3,7 +3,11 @@ require 'nokogiri'
 require 'http-cookie'
 require 'pickup'
 require 'csv'
+require 'paint'
+require 'powerbar'
+require 'core_ext/const_missing'
 require 'active_support'
+require 'active_support/dependencies'
 require 'active_support/core_ext/object'
 require 'active_support/core_ext/hash'

@@ -22,6 +26,8 @@ require "scruber/core/page_format/html"

 require "scruber/core/extensions/base"
 require "scruber/core/extensions/loop"
+require "scruber/core/extensions/log"
+require "scruber/core/extensions/seed"
 require "scruber/core/extensions/csv_output"
 require "scruber/core/extensions/queue_aliases"
 require "scruber/core/extensions/parser_aliases"
@@ -30,16 +36,13 @@ require "scruber/helpers/dictionary_reader"
 require "scruber/helpers/dictionary_reader/xml"
 require "scruber/helpers/dictionary_reader/csv"

-# require "scruber/core/configuration"
-# require "scruber/core/configuration"
-
 module Scruber
   class ArgumentError < ::ArgumentError; end
+
   module Core
-    autoload :Configuration, "scruber/core/configuration"
-    autoload :Crawler, "scruber/core/crawler"
+    autoload :Configuration, "scruber/core/configuration"
+    autoload :Crawler, "scruber/core/crawler"
   end
-
   module Helpers
     autoload :UserAgentRotator, "scruber/helpers/user_agent_rotator"
     autoload :ProxyRotator, "scruber/helpers/proxy_rotator"
@@ -51,11 +54,11 @@ module Scruber
   end

   class << self
-    attr_writer :configuration
+    attr_writer :configuration, :logger

     def run(*args, &block)
       raise "You need a block to build!" unless block_given?
-
+
       Core::Crawler.new(*args).run(&block)
     end

@@ -63,8 +66,20 @@ module Scruber
       @configuration ||= Core::Configuration.new
     end

+    def logger
+      @logger ||= Scruber.root.nil? ? nil : Logger.new(Scruber.root.join('log', 'crawler.log'))
+    end
+
     def configure(&block)
       yield configuration
     end
+
+    def root
+      if defined?(APP_PATH)
+        Pathname.new(File.expand_path('../../', APP_PATH))
+      else
+        nil
+      end
+    end
   end
 end
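The new `Scruber.root` and `Scruber.logger` helpers are project-dependent: `root` is only non-nil when an `APP_PATH` constant is defined, and `logger` then lazily creates a file logger at `log/crawler.log` under that root. A small sketch of how this behaves outside a generated project, assuming the gem is installed:

    require 'scruber'
    require 'logger'

    # Without an APP_PATH constant there is no project root, so the default
    # logger is nil rather than a file logger under log/crawler.log.
    Scruber.root    # => nil
    Scruber.logger  # => nil

    # The new `attr_writer :logger` lets you supply your own logger instead.
    Scruber.logger = Logger.new($stdout)
    Scruber.logger.info 'queue drained'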
data/scruber.gemspec CHANGED
@@ -30,11 +30,13 @@ Gem::Specification.new do |spec|
   spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
   spec.require_paths = ["lib"]

-  spec.add_dependency "typhoeus", "1.1.2"
-  spec.add_dependency "pickup", "0.0.11"
-  spec.add_dependency "nokogiri", "1.8.2"
+  spec.add_dependency "typhoeus", '~> 1.1', '>= 1.1.2'
+  spec.add_dependency "pickup", "~> 0.0.11"
+  spec.add_dependency "nokogiri", '~> 1.8', '>= 1.8.2'
   spec.add_dependency "http-cookie", "1.0.3"
-  spec.add_dependency "activesupport", "5.1.5"
+  spec.add_dependency "activesupport", '~> 5.1', '>= 5.1.5'
+  spec.add_dependency "powerbar", '~> 2.0', '>= 2.0.1'
+  spec.add_dependency "paint", '~> 2.0', '>= 2.0.1'
   spec.add_runtime_dependency "thor", "0.20.0"
   spec.add_development_dependency "bundler", "~> 1.15"
   spec.add_development_dependency "rake", "~> 10.0"
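The gemspec moves from exact version pins to pessimistic constraints for typhoeus, pickup, nokogiri and activesupport, and adds powerbar and paint (presumably for the new progress-bar and coloured log output). To see what a combined `'~> 1.8', '>= 1.8.2'` constraint accepts, RubyGems' own `Gem::Requirement` can be queried directly:

    # Gem::Requirement ships with RubyGems, so this runs on a bare Ruby install.
    req = Gem::Requirement.new('~> 1.8', '>= 1.8.2')

    puts req.satisfied_by?(Gem::Version.new('1.8.1'))   # false - below the explicit floor
    puts req.satisfied_by?(Gem::Version.new('1.8.5'))   # true
    puts req.satisfied_by?(Gem::Version.new('1.10.0'))  # true  - still within ~> 1.8 (< 2.0)
    puts req.satisfied_by?(Gem::Version.new('2.0.0'))   # false - excluded by ~> 1.8

In practice this means applications depending on scruber 0.1.5 can resolve newer patch and minor releases of these gems instead of being locked to the single versions required by 0.1.4.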