scruber 0.1.6 → 0.1.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/scruber.rb +2 -1
- data/lib/scruber/core/crawler.rb +48 -48
- data/lib/scruber/core/page_format/json.rb +13 -0
- data/lib/scruber/fetcher_adapters/abstract_adapter.rb +17 -0
- data/lib/scruber/fetcher_adapters/typhoeus_fetcher.rb +1 -1
- data/lib/scruber/queue_adapters/abstract_adapter.rb +10 -0
- data/lib/scruber/version.rb +1 -1
- data/scruber.gemspec +1 -0
- data/spec/core/extensions/loop_spec.rb +7 -4
- data/spec/core/extensions/parser_aliases_spec.rb +33 -25
- data/spec/core/extensions/queue_aliases_spec.rb +30 -22
- data/spec/core/extensions/seed_spec.rb +12 -11
- data/spec/helpers/fetcher_agent_adapters/abstract_adapter_spec.rb +3 -2
- data/spec/scruber_spec.rb +53 -47
- data/spec/support/queue/queue_adapter.rb +5 -0
- metadata +23 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: b74171eb49e730929f3b303cc1b65f7d55eb54f5
+  data.tar.gz: 67debe50e980a21e66fc9966e509b9c4fb65fc2b
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 2f997b9b072270c8a28dc00f6dcde03a9041adb8f1fc6245d862f4423e3d155bf9e6fe478c11fe0d6081a878407fdc35792fe3a17dbf93b3f566daeb3fd22a60
+  data.tar.gz: a5d4f2bafe347e9a775bf01cbfd2170678596fd133cb788a444099a03a95bcc7f72116bd725f32506766f9dbb0756c06f1e3dc909e526d684cfa14e3535f6ae2
data/lib/scruber.rb
CHANGED
@@ -23,6 +23,7 @@ require "scruber/core/page_format"
 require "scruber/core/page_format/base"
 require "scruber/core/page_format/xml"
 require "scruber/core/page_format/html"
+require "scruber/core/page_format/json"
 
 require "scruber/core/extensions/base"
 require "scruber/core/extensions/loop"
@@ -58,7 +59,7 @@ module Scruber
 
     def run(*args, &block)
       raise "You need a block to build!" unless block_given?
-      
+
       Core::Crawler.new(*args).run(&block)
     end
 
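The new data/lib/scruber/core/page_format/json.rb required above is listed in the summary (+13 lines) but is not expanded in this diff. Purely as a hedged sketch, by analogy with the existing :html format (which yields a Nokogiri document as the second block argument), a parser registered with format: :json would receive the decoded response body alongside the page:

# Hypothetical usage of the new :json page format; the block argument
# name `data` and its exact type are assumptions, not shown in this diff.
Scruber.run :api_sample do
  queue.add "http://example.com/items.json"

  parser :seed, format: :json do |page, data|
    puts data.inspect
  end
end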
data/lib/scruber/core/crawler.rb
CHANGED
@@ -1,10 +1,10 @@
 module Scruber
   module Core
-    # 
+    #
     # Crawler class
-    # 
+    #
     # Main class-runner for scrapers.
-    # 
+    #
     # @example Simple scraper
     #   Scruber::Core::Crawler.new(:simple) do
     #     get 'http://example.com'
@@ -12,29 +12,29 @@ module Scruber
     #       puts html.at('title').text
     #     end
     #   end
-    # 
+    #
     # @author Ivan Goncharov
-    # 
+    #
     class Crawler
       attr_reader :queue, :fetcher, :scraper_name
 
-      # 
+      #
       # Initialize crawler with scraper name and/or with options
-      # 
+      #
       #   Crawler.new(:sample, fetcher_adapter: :custom)
       #   Crawler.new(:sample)
       #   Crawler.new(fetcher_adapter: :custom)
-      # 
+      #
      # @param args [Array] if first arg is a Symbol, it will be used as scraper_name, hash will me used as configuration options (see {Scruber::Core::Configuration})
-      # 
+      #
       # @return [Scruber::Core::Crawler] [description]
       def initialize(*args)
         if args.first.is_a?(Hash)
           scraper_name = nil
-          options = args.first
+          @options = args.first
         else
-          scraper_name, options = args
-          options ||= {}
+          scraper_name, @options = args
+          @options ||= {}
         end
         @scraper_name = scraper_name.present? ? scraper_name : ENV['SCRUBER_SCRAPER_NAME']
         raise Scruber::ArgumentError.new("Scraper name is empty. Pass it to `Scruber.run :name do` or through ENV['SCRUBER_SCRAPER_NAME']") if @scraper_name.blank?
@@ -44,7 +44,7 @@ module Scruber
         @on_page_error_callback = nil
         @on_complete_callbacks = []
 
-        Scruber.configuration.merge_options(options)
+        Scruber.configuration.merge_options(@options)
         ActiveSupport::Dependencies.autoload_paths = Scruber.configuration.autoload_paths
 
         @queue = Scruber::Queue.new(scraper_name: @scraper_name)
@@ -53,9 +53,9 @@ module Scruber
         load_extenstions
       end
 
-      # 
+      #
       # Crawling engine
-      # 
+      #
       # @param block [Proc] crawler body
       def run(&block)
         instance_eval &block
@@ -75,33 +75,33 @@ module Scruber
             end
           end
         end
-        @on_complete_callbacks.sort_by{|c| -c[0] }.
+        @on_complete_callbacks.sort_by{|c| -c[0] }.map do |(_,callback)|
           instance_exec &(callback)
-        end
+        end.first
       end
 
-      # 
+      #
       # Register parser
-      # 
+      #
       # @param page_type [Symbol] type of page
       # @param options [Hash] options for parser
-      # @option options [Symbol] :format format of page. Scruber automatically process 
+      # @option options [Symbol] :format format of page. Scruber automatically process
       #   page body depends on this format. For example :json or :html
       # @param block [Proc] body of parser
-      # 
+      #
       # @return [void]
       def parser(page_type, options={}, &block)
         register_callback(page_type, options, &block)
       end
 
-      # 
+      #
       # Method missing callback. Scruber allows to register
       # regexp and proc body to process calls
-      # 
+      #
       # @param method_sym [Symbol] missing method name
       # @param arguments [Array] arguments
       # @param block [Proc] block (if passed)
-      # 
+      #
       # @return [type] [description]
       def method_missing(method_sym, *arguments, &block)
         Scruber::Core::Crawler._registered_method_missings.each do |(pattern, func)|
@@ -124,27 +124,27 @@ module Scruber
 
       class << self
 
-        # 
+        #
         # Register method missing callback
-        # 
+        #
         # @param pattern [Regexp] Regexp to match missing name
         # @param block [Proc] Body to process missing method
-        # 
+        #
         # @return [void]
         def register_method_missing(pattern, &block)
          _registered_method_missings[pattern] = block
        end
 
-        # 
+        #
        # Registered method missing callbacks dictionary
-        # 
+        #
        # @return [Hash] callbacks
        def _registered_method_missings
          @registered_method_missings ||= {}
        end
      end
 
-      # 
+      #
      # Register callback which will be executed when
      # downloading and parsing will be completed.
      # For example when you need to write results to file,
@@ -153,16 +153,16 @@ module Scruber
       #     on_complete -1 do
       #       Scruber::Core::Extensions::CsvOutput.close_all
       #     end
-      # 
+      #
       # @param priority [Integer] priority of this callback
       # @param block [Proc] body of callback
-      # 
+      #
       # @return [void]
       def on_complete(priority=1, &block)
         @on_complete_callbacks.push [priority,block]
       end
 
-      # 
+      #
       # Register callback which will be executed for
       # error pages, like 404 or 500
       # Attention! You should call one of these methods for page
@@ -178,9 +178,9 @@ module Scruber
       #       page.delete
       #     end
       #   end
-      # 
+      #
       # @param block [Proc] body of callback
-      # 
+      #
       # @return [void]
       def on_page_error(&block)
         @on_page_error_callback = block
@@ -188,46 +188,46 @@ module Scruber
 
       private
 
-      # 
+      #
       # Register parser
-      # 
+      #
       # @param page_type [Symbol] type of page
       # @param options [Hash] options for parser
-      # @option options [Symbol] :format format of page. Scruber automatically process 
+      # @option options [Symbol] :format format of page. Scruber automatically process
       #   page body depends on this format. For example :json or :html
       # @param block [Proc] body of parser
-      # 
+      #
       # @return [void]
       def register_callback(page_type, options, &block)
         @callbacks_options[page_type.to_sym] = options || {}
         @callbacks[page_type.to_sym] = block
       end
 
-      # 
+      #
       # Process page body depends on format of this page
       # For example, if page_format = :html, then
       # it will return Nokogiri::HTML(page.response_body)
-      # 
+      #
       # @param page [Page] page from queue
       # @param page_type [Symbol] name of parser
-      # 
+      #
       # @return [Object] depends on page_type it will return different objects
       def process_page(page, page_type)
         page_format = @callbacks_options[page_type].fetch(:format){ nil }
         Scruber::Core::PageFormat.process(page, page_format)
       end
 
-      # 
+      #
       # Loads all extensions
-      # 
+      #
       # @return [void]
       def load_extenstions
         Scruber::Core::Extensions::Base.descendants.each(&:register)
       end
 
-      # 
+      #
       # Initialize progressbar, that shows progress in console
-      # 
+      #
       # @return [void]
       def initialize_progressbar
         unless Scruber.configuration.silent
@@ -243,9 +243,9 @@ module Scruber
         end
       end
 
-      # 
+      #
       # Out progress to console
-      # 
+      #
       # @return [void]
       def show_progress
         if @progressbar
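The rewritten tail of run above is the behavioral change behind most of the spec updates below: on_complete callbacks are now executed via map (sorted by descending priority), and run returns the value of the first, highest-priority callback. A minimal sketch of handing a result back to the caller, mirroring the updated specs:

# `Scruber.run` now returns the value of the highest-priority
# on_complete block, so a scraper can return a result directly.
title = Scruber.run :sample do
  get 'http://example.com'

  parse :html do |page, doc|
    @title = doc.at('title').text
  end

  on_complete { @title }
end
puts title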
data/lib/scruber/fetcher_adapters/abstract_adapter.rb
CHANGED
@@ -1,3 +1,5 @@
+require 'charlock_holmes'
+
 module Scruber
   module FetcherAdapters
     class AbstractAdapter
@@ -45,9 +47,24 @@ module Scruber
             page.fetched_at = Time.now.to_i
           end
         end
+        if page.response_headers
+          page.response_headers = page.response_headers.inject({}) {|acc, (k,v)| acc[k.gsub('.', '_')] = v.is_a?(Array) ? v.map{|v1| convert_to_utf8(v1) } : convert_to_utf8(v); acc }
+        end
+        page.response_body = convert_to_utf8(page.response_body)
         page
       end
 
+      def convert_to_utf8(text)
+        unless text.to_s.empty?
+          detection = CharlockHolmes::EncodingDetector.detect(text)
+          if detection && detection[:encoding].present?
+            text = CharlockHolmes::Converter.convert(text, detection[:encoding], 'UTF-8') rescue text
+          end
+        end
+
+        text
+      end
+
       def headers_for(page)
         if page.fetcher_agent
           headers = page.fetcher_agent.headers
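The new convert_to_utf8 step normalizes response bodies and header values to UTF-8 via the charlock_holmes gem (declared in the gemspec below). A standalone sketch of the same detect-then-convert pattern, runnable outside Scruber:

require 'charlock_holmes'

body = "caf\xE9".b  # e.g. ISO-8859-1 bytes returned by a server

detection = CharlockHolmes::EncodingDetector.detect(body)
if detection && detection[:encoding]
  # Fall back to the original bytes if conversion fails, as the gem code does
  body = (CharlockHolmes::Converter.convert(body, detection[:encoding], 'UTF-8') rescue body)
end

puts body  # "café", now valid UTF-8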
data/lib/scruber/fetcher_adapters/typhoeus_fetcher.rb
CHANGED
@@ -60,7 +60,7 @@ module Scruber
       def on_complete_callback(page, response)
         page.response_code = response.code
         page.response_body = response.body
-        page.response_headers = response.
+        page.response_headers = response.headers
         page.response_total_time = response.total_time
 
         if response.timed_out?
data/lib/scruber/queue_adapters/abstract_adapter.rb
CHANGED
@@ -134,6 +134,16 @@ module Scruber
           raise NotImplementedError
         end
 
+
+        #
+        # Join url of current page with another path or url
+        # @param link_url [String] link
+        #
+        # @return [String] joined url
+        def url_join(link_url)
+          URI.join(url, link_url).to_s
+        end
+
         def [](k)
           instance_variable_get("@#{k.to_s}")
         end
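Page#url_join is a thin wrapper over Ruby's URI.join, so links resolve against the current page's URL with standard RFC 3986 semantics (this matches the new shared example in the specs below):

require 'uri'

# An absolute path replaces the base path entirely
URI.join('http://example.com/product1', '/abc').to_s
# => "http://example.com/abc"

# A relative path resolves against the base directory
URI.join('http://example.com/catalog/page1', 'page2').to_s
# => "http://example.com/catalog/page2"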
data/lib/scruber/version.rb
CHANGED
data/scruber.gemspec
CHANGED
@@ -37,6 +37,7 @@ Gem::Specification.new do |spec|
   spec.add_dependency "activesupport", '~> 5.1', '>= 5.1.5'
   spec.add_dependency "powerbar", '~> 2.0', '>= 2.0.1'
   spec.add_dependency "paint", '~> 2.0', '>= 2.0.1'
+  spec.add_dependency "charlock_holmes", '~> 0.7', '>= 0.7.6'
   spec.add_runtime_dependency "thor", "0.20.0"
   spec.add_development_dependency "bundler", "~> 1.15"
   spec.add_development_dependency "rake", "~> 10.0"
data/spec/core/extensions/loop_spec.rb
CHANGED
@@ -11,16 +11,19 @@ RSpec.describe Scruber::Core::Extensions::Loop do
 
     it "should add dictionary and read info" do
       Scruber::Core::Extensions::Loop.register
-
-      Scruber.run :sample do
+      zip_codes = Scruber.run :sample do
         add_dictionary :zip_codes_usa, File.expand_path(File.dirname(__FILE__))+'/dict.csv', :csv
+        @zip_codes = []
+
         seed do
           loop :zip_codes_usa, state: 'NY' do |row|
-
+            @zip_codes.push row['zip']
           end
         end
+
+        on_complete { @zip_codes }
       end
-      expect(
+      expect(zip_codes).to eq(['10001', '10002'])
     end
   end
 end
data/spec/core/extensions/parser_aliases_spec.rb
CHANGED
@@ -19,15 +19,17 @@ RSpec.describe Scruber::Core::Extensions::ParserAliases do
 
       stub_request(:get, "http://example.com").to_return(body: 'Example Domain')
 
-      Scruber.run :sample do
+      page = Scruber.run :sample do
        get "http://example.com"
-
+
        parse do |page|
-
+          @queue_page = page
        end
+
+        on_complete { @queue_page }
      end
-      expect(
-      expect(
+      expect(page.url).to eq("http://example.com")
+      expect(page.page_type.to_s).to eq("seed")
    end
 
    it "should register parser with custom page_type" do
@@ -35,16 +37,18 @@ RSpec.describe Scruber::Core::Extensions::ParserAliases do
 
      stub_request(:post, "http://example.com").to_return(body: 'Example Domain')
 
-      Scruber.run :sample do
+      page = Scruber.run :sample do
        post_product "http://example.com"
-
+
        parse_product do |page|
-
+          @queue_page = page
        end
+
+        on_complete { @queue_page }
      end
-      expect(
-      expect(
-      expect(
+      expect(page.url).to eq("http://example.com")
+      expect(page.method.to_s).to eq("post")
+      expect(page.page_type.to_s).to eq("product")
    end
  end
 
@@ -54,17 +58,19 @@ RSpec.describe Scruber::Core::Extensions::ParserAliases do
 
      stub_request(:get, "http://example.com").to_return(body: '<div><span>Example Domain</span></div>')
 
-      Scruber.run :sample do
+      page, doc = Scruber.run :sample do
        get "http://example.com"
-
+
        parse :html do |page,doc|
-
-
+          @queue_page = page
+          @doc = doc
        end
+
+        on_complete { [@queue_page, @doc] }
      end
-      expect(
-      expect(
-      expect(
+      expect(doc.at('span').text).to eq("Example Domain")
+      expect(page.page_type.to_s).to eq("seed")
+      expect(page.method.to_s).to eq("get")
    end
 
    it "should register parser with custom page_type" do
@@ -72,17 +78,19 @@ RSpec.describe Scruber::Core::Extensions::ParserAliases do
 
      stub_request(:post, "http://example.com").to_return(body: '<div><span>Example Post</span></div>')
 
-      Scruber.run :sample do
+      page, doc = Scruber.run :sample do
        post_product "http://example.com"
-
+
        parse_product :html do |page,doc|
-
-
+          @queue_page = page
+          @doc = doc
        end
+
+        on_complete { [@queue_page, @doc] }
      end
-      expect(
-      expect(
-      expect(
+      expect(doc.at('span').text).to eq("Example Post")
+      expect(page.method.to_s).to eq("post")
+      expect(page.page_type.to_s).to eq("product")
    end
  end
 end
data/spec/core/extensions/queue_aliases_spec.rb
CHANGED
@@ -19,25 +19,29 @@ RSpec.describe Scruber::Core::Extensions::QueueAliases do
    it "should add page to queue" do
      described_class.register
 
-      Scruber.run :sample do
+      page = Scruber.run :sample do
        get "http://example.com"
-
+
+        @queue_page = queue.fetch_pending
+        on_complete { @queue_page }
      end
-      expect(
-      expect(
-      expect(
+      expect(page.url).to eq("http://example.com")
+      expect(page.method.to_s).to eq("get")
+      expect(page.page_type.to_s).to eq("seed")
    end
 
    it "should add page to queue" do
      described_class.register
 
-      Scruber.run :sample do
+      page = Scruber.run :sample do
        post_product "http://example.com"
-
+
+        @queue_page = queue.fetch_pending
+        on_complete { @queue_page }
      end
-      expect(
-      expect(
-      expect(
+      expect(page.url).to eq("http://example.com")
+      expect(page.method.to_s).to eq("post")
+      expect(page.page_type).to eq("product")
    end
  end
 
@@ -45,27 +49,31 @@ RSpec.describe Scruber::Core::Extensions::QueueAliases do
    it "should add page to queue" do
      described_class.register
 
-      Scruber.run :sample do
+      page = Scruber.run :sample do
        get "http://example.com", user_agent: 'Agent 1'
-
+
+        @queue_page = queue.fetch_pending
+        on_complete { @queue_page }
      end
-      expect(
-      expect(
-      expect(
-      expect(
+      expect(page.url).to eq("http://example.com")
+      expect(page.method.to_s).to eq("get")
+      expect(page.page_type.to_s).to eq("seed")
+      expect(page.user_agent).to eq('Agent 1')
    end
 
    it "should add page to queue" do
      described_class.register
 
-      Scruber.run :sample do
+      page = Scruber.run :sample do
        post_product "http://example.com", user_agent: 'Agent 1'
-
+
+        @queue_page = queue.fetch_pending
+        on_complete { @queue_page }
      end
-      expect(
-      expect(
-      expect(
-      expect(
+      expect(page.url).to eq("http://example.com")
+      expect(page.method.to_s).to eq("post")
+      expect(page.page_type).to eq("product")
+      expect(page.user_agent).to eq('Agent 1')
    end
  end
 end
data/spec/core/extensions/seed_spec.rb
CHANGED
@@ -14,31 +14,32 @@ RSpec.describe Scruber::Core::Extensions::Seed do
    stub_request(:get, "http://example.com").to_return(body: '<div><a>Main</a></div>')
    stub_request(:get, "http://example.com/contacts").to_return(body: '<div><a>Contacts</a></div>')
  end
-  
+
  it "should execute seed block" do
-
-    Scruber.run :sample do
+    queue_size = Scruber.run :sample do
      seed do
        get 'http://example.com'
      end
-
+      @queue_size = queue.size
+      on_complete { @queue_size }
    end
-    expect(
+    expect(queue_size).to eq(1)
  end
 
  it "should not execute seed block" do
-
-    Scruber.run :sample do
+    queue_size, page = Scruber.run :sample do
      seed do
        get 'http://example.com'
      end
      seed do
        get 'http://example.com/contacts'
      end
-
-
+      @queue_size = queue.size
+      @queue_page = queue.fetch_pending
+
+      on_complete { [@queue_size, @queue_page] }
    end
-    expect(
-    expect(
+    expect(queue_size).to eq(1)
+    expect(page.url).to eq("http://example.com")
  end
 end
data/spec/helpers/fetcher_agent_adapters/abstract_adapter_spec.rb
CHANGED
@@ -13,7 +13,7 @@ RSpec.describe Scruber::Helpers::FetcherAgentAdapters::AbstractAdapter do
      cookie_jar: cookie_jar_string,
      disable_proxy: true
  end
-  
+
  it "set values" do
    expect(agent.id).to eq(1)
    expect(agent.user_agent).to eq('Scruber')
@@ -33,7 +33,8 @@ RSpec.describe Scruber::Helpers::FetcherAgentAdapters::AbstractAdapter do
  end
 
  it "parse cookies from page" do
-    page = Scruber::QueueAdapters::AbstractAdapter::Page.new(nil, url: 'http://example.com', response_headers: {"Connection" => "keep-alive","Set-Cookie" => "__cfduid=dc8db498b1e419b7943052a69c8e9d1d01504311966; expires=Sun, 02-Sep
+    page = Scruber::QueueAdapters::AbstractAdapter::Page.new(nil, url: 'http://example.com', response_headers: {"Connection" => "keep-alive","Set-Cookie" => "__cfduid=dc8db498b1e419b7943052a69c8e9d1d01504311966; expires=Sun, 02-Sep-#{Date.today.year+1} 00:26:06 GMT; path=/; domain=example.com; HttpOnly"})
+    puts page.response_cookies.inspect
    agent.parse_cookies_from_page!(page)
    expect(agent.cookie_for('http://example.com')).to eq('__cfduid=dc8db498b1e419b7943052a69c8e9d1d01504311966; feed_flow=top')
  end
data/spec/scruber_spec.rb
CHANGED
@@ -20,7 +20,7 @@ RSpec.describe Scruber do
      config.fetcher_adapter = :typhoeus_fetcher
    end
  end
-  
+
  it "returns :typhoeus_fetcher as fetcher" do
    expect(Scruber.configuration.fetcher_adapter).to eq(:typhoeus_fetcher)
  end
@@ -34,70 +34,70 @@ RSpec.describe Scruber do
 
    it "should set scraper name from ENV" do
      ENV['SCRUBER_SCRAPER_NAME'] = 'sample'
-      Scruber.run do
-
+      name = Scruber.run do
+        on_complete { scraper_name }
      end
-      expect(
+      expect(name).to eq(:sample)
    end
  end
 
  context "with args" do
    it "should set scraper name from first arg" do
-      Scruber.run :sample1 do
-
+      name = Scruber.run :sample1 do
+        on_complete { scraper_name }
      end
-      expect(
+      expect(name).to eq(:sample1)
    end
 
    it "should set scraper name from first arg, and options from second" do
-      Scruber.run :sample2, queue_adapter: :test do
-
-        $opt = Scruber.configuration.queue_adapter
+      name, opt = Scruber.run :sample2, queue_adapter: :test do
+        on_complete { [scraper_name, Scruber.configuration.queue_adapter] }
      end
-      expect(
-      expect(
+      expect(name).to eq(:sample2)
+      expect(opt).to eq(:test)
    end
 
    it "options from first arg and scraper_name from ENV" do
      ENV['SCRUBER_SCRAPER_NAME'] = 'sample'
-      Scruber.run queue_adapter: :test2 do
-
-        $opt = Scruber.configuration.queue_adapter
+      name, opt = Scruber.run queue_adapter: :test2 do
+        on_complete { [scraper_name, Scruber.configuration.queue_adapter] }
      end
-      expect(
-      expect(
+      expect(name).to eq(:sample)
+      expect(opt).to eq(:test2)
    end
 
    it "should raise error if passed only options without ENV" do
      ENV['SCRUBER_SCRAPER_NAME'] = nil
-      expect { Scruber.run(queue_adapter: :test2) {
+      expect { Scruber.run(queue_adapter: :test2) { scraper_name } }.to raise_error(Scruber::ArgumentError)
    end
  end
 
  it "simple example" do
    stub_request(:get, "http://example.com").to_return(body: 'Example Domain')
 
-    Scruber.run :sample do
+    body = Scruber.run :sample do
      queue.add "http://example.com"
-
+
      parser :seed do |page|
-
+        @page_response_body = page.response_body
      end
+      on_complete { @page_response_body }
    end
-    expect(
+    expect(body).to eq('Example Domain')
  end
 
  it "should return Nokogiri object" do
    stub_request(:get, "http://example.com/contacts.html").to_return(body: '<div><a>Contacts</a></div>')
 
-    Scruber.run :sample do
+    title = Scruber.run :sample do
      queue.add "http://example.com/contacts.html"
-
+
      parser :seed, format: :html do |page, html|
-
+        @title = html.at('a').text
      end
+      on_complete { @title }
    end
-    expect(
+    expect(title).to eq('Contacts')
  end
 
  context "complex example" do
@@ -107,10 +107,9 @@ RSpec.describe Scruber do
      stub_request(:get, "http://example.com/product2").to_return(body: '<div><h1>Product 2</h1></div>')
      stub_request(:get, "http://example.com/product3").to_return(body: '<div><h1>Product 3</h1></div>')
 
-
-      Scruber.run :sample do
+      products = Scruber.run :sample do
        get "http://example.com/catalog"
-
+
        parse :html do |page, doc|
          doc.search('a').each do |a|
            get_product URI.join(page.url, a.attr('href')).to_s
@@ -118,29 +117,34 @@ RSpec.describe Scruber do
        end
 
        parse_product :html do |page,doc|
-
+          @products ||= []
+          @products.push doc.at('h1').text
        end
+
+        on_complete { @products }
      end
-      expect(
+      expect(products.sort).to eq((1..3).map{|i| "Product #{i}"}.sort)
    end
 
    it "should redownload page and increase retry" do
      stub_request(:get, "http://example.com/").to_return(body: '<div>blocked</div>').times(2).then.to_return(body: '<div><h1>Product</h1></div>')
 
-      Scruber.run :sample do
+      title, retry_count = Scruber.run :sample do
        get "http://example.com/"
-
+
        parse :html do |page, doc|
          if page.response_body =~ /blocked/
            page.redownload!
          else
-
-
+            @title = doc.at('h1').text
+            @retry_count = page.retry_count
          end
        end
+
+        on_complete { [@title, @retry_count] }
      end
-      expect(
-      expect(
+      expect(title).to eq('Product')
+      expect(retry_count).to eq(2)
    end
  end
 
@@ -148,39 +152,41 @@ RSpec.describe Scruber do
    it "should process 500 error page" do
      stub_request(:get, "http://example.com").to_return(body: '<div><h1>500</h1></div>', status: 500)
 
-
-      Scruber.run :sample do
+      error_title = Scruber.run :sample do
        get "http://example.com", max_retry_times: 1
 
        parse :html do |page,doc|
-
+          @error_title = doc.at('h1').text
        end
 
        on_page_error do |page|
-
+          @error_title = page.response_body
          page.processed!
        end
+
+        on_complete { @error_title }
      end
-      expect(
+      expect(error_title).to eq('<div><h1>500</h1></div>')
    end
 
    it "should process 404 error page" do
      stub_request(:get, "http://example.com").to_return(body: '<div><h1>404</h1></div>', status: 404)
 
-
-      Scruber.run :sample do
+      error_title = Scruber.run :sample do
        get "http://example.com", max_retry_times: 1
 
        parse :html do |page,doc|
-
+          @error_title = doc.at('h1').text
        end
 
        on_page_error do |page|
-
+          @error_title = page.response_body
          page.processed!
        end
+
+        on_complete { @error_title }
      end
-      expect(
+      expect(error_title).to eq('<div><h1>404</h1></div>')
    end
  end
 end
data/spec/support/queue/queue_adapter.rb
CHANGED
@@ -167,5 +167,10 @@ RSpec.shared_examples "queue_adapter" do
      expect(page1.id).not_to be_blank
      expect(page1.id).not_to eq(page2.id)
    end
+
+    it "should join url" do
+      page1 = page_class.new queue, url: "http://example.com/product1"
+      expect(page1.url_join('/abc')).to eq("http://example.com/abc")
+    end
  end
 end
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: scruber
 version: !ruby/object:Gem::Version
-  version: 0.1.
+  version: 0.1.9
 platform: ruby
 authors:
 - Ivan Goncharov
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2018-
+date: 2018-09-06 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: typhoeus
@@ -138,6 +138,26 @@ dependencies:
   - - ">="
     - !ruby/object:Gem::Version
       version: 2.0.1
+- !ruby/object:Gem::Dependency
+  name: charlock_holmes
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '0.7'
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 0.7.6
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '0.7'
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 0.7.6
 - !ruby/object:Gem::Dependency
   name: thor
   requirement: !ruby/object:Gem::Requirement
@@ -252,6 +272,7 @@ files:
 - lib/scruber/core/page_format.rb
 - lib/scruber/core/page_format/base.rb
 - lib/scruber/core/page_format/html.rb
+- lib/scruber/core/page_format/json.rb
 - lib/scruber/core/page_format/xml.rb
 - lib/scruber/fetcher.rb
 - lib/scruber/fetcher_adapters/abstract_adapter.rb