scruber 0.1.6 → 0.1.9
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/scruber.rb +2 -1
- data/lib/scruber/core/crawler.rb +48 -48
- data/lib/scruber/core/page_format/json.rb +13 -0
- data/lib/scruber/fetcher_adapters/abstract_adapter.rb +17 -0
- data/lib/scruber/fetcher_adapters/typhoeus_fetcher.rb +1 -1
- data/lib/scruber/queue_adapters/abstract_adapter.rb +10 -0
- data/lib/scruber/version.rb +1 -1
- data/scruber.gemspec +1 -0
- data/spec/core/extensions/loop_spec.rb +7 -4
- data/spec/core/extensions/parser_aliases_spec.rb +33 -25
- data/spec/core/extensions/queue_aliases_spec.rb +30 -22
- data/spec/core/extensions/seed_spec.rb +12 -11
- data/spec/helpers/fetcher_agent_adapters/abstract_adapter_spec.rb +3 -2
- data/spec/scruber_spec.rb +53 -47
- data/spec/support/queue/queue_adapter.rb +5 -0
- metadata +23 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: b74171eb49e730929f3b303cc1b65f7d55eb54f5
|
4
|
+
data.tar.gz: 67debe50e980a21e66fc9966e509b9c4fb65fc2b
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 2f997b9b072270c8a28dc00f6dcde03a9041adb8f1fc6245d862f4423e3d155bf9e6fe478c11fe0d6081a878407fdc35792fe3a17dbf93b3f566daeb3fd22a60
|
7
|
+
data.tar.gz: a5d4f2bafe347e9a775bf01cbfd2170678596fd133cb788a444099a03a95bcc7f72116bd725f32506766f9dbb0756c06f1e3dc909e526d684cfa14e3535f6ae2
|
data/lib/scruber.rb
CHANGED
@@ -23,6 +23,7 @@ require "scruber/core/page_format"
|
|
23
23
|
require "scruber/core/page_format/base"
|
24
24
|
require "scruber/core/page_format/xml"
|
25
25
|
require "scruber/core/page_format/html"
|
26
|
+
require "scruber/core/page_format/json"
|
26
27
|
|
27
28
|
require "scruber/core/extensions/base"
|
28
29
|
require "scruber/core/extensions/loop"
|
@@ -58,7 +59,7 @@ module Scruber
|
|
58
59
|
|
59
60
|
def run(*args, &block)
|
60
61
|
raise "You need a block to build!" unless block_given?
|
61
|
-
|
62
|
+
|
62
63
|
Core::Crawler.new(*args).run(&block)
|
63
64
|
end
|
64
65
|
|
data/lib/scruber/core/crawler.rb
CHANGED
@@ -1,10 +1,10 @@
|
|
1
1
|
module Scruber
|
2
2
|
module Core
|
3
|
-
#
|
3
|
+
#
|
4
4
|
# Crawler class
|
5
|
-
#
|
5
|
+
#
|
6
6
|
# Main class-runner for scrapers.
|
7
|
-
#
|
7
|
+
#
|
8
8
|
# @example Simple scraper
|
9
9
|
# Scruber::Core::Crawler.new(:simple) do
|
10
10
|
# get 'http://example.com'
|
@@ -12,29 +12,29 @@ module Scruber
|
|
12
12
|
# puts html.at('title').text
|
13
13
|
# end
|
14
14
|
# end
|
15
|
-
#
|
15
|
+
#
|
16
16
|
# @author Ivan Goncharov
|
17
|
-
#
|
17
|
+
#
|
18
18
|
class Crawler
|
19
19
|
attr_reader :queue, :fetcher, :scraper_name
|
20
20
|
|
21
|
-
#
|
21
|
+
#
|
22
22
|
# Initialize crawler with scraper name and/or with options
|
23
|
-
#
|
23
|
+
#
|
24
24
|
# Crawler.new(:sample, fetcher_adapter: :custom)
|
25
25
|
# Crawler.new(:sample)
|
26
26
|
# Crawler.new(fetcher_adapter: :custom)
|
27
|
-
#
|
27
|
+
#
|
28
28
|
# @param args [Array] if first arg is a Symbol, it will be used as scraper_name, hash will be used as configuration options (see {Scruber::Core::Configuration})
|
29
|
-
#
|
29
|
+
#
|
30
30
|
# @return [Scruber::Core::Crawler] [description]
|
31
31
|
def initialize(*args)
|
32
32
|
if args.first.is_a?(Hash)
|
33
33
|
scraper_name = nil
|
34
|
-
options = args.first
|
34
|
+
@options = args.first
|
35
35
|
else
|
36
|
-
scraper_name, options = args
|
37
|
-
options ||= {}
|
36
|
+
scraper_name, @options = args
|
37
|
+
@options ||= {}
|
38
38
|
end
|
39
39
|
@scraper_name = scraper_name.present? ? scraper_name : ENV['SCRUBER_SCRAPER_NAME']
|
40
40
|
raise Scruber::ArgumentError.new("Scraper name is empty. Pass it to `Scruber.run :name do` or through ENV['SCRUBER_SCRAPER_NAME']") if @scraper_name.blank?
|
@@ -44,7 +44,7 @@ module Scruber
|
|
44
44
|
@on_page_error_callback = nil
|
45
45
|
@on_complete_callbacks = []
|
46
46
|
|
47
|
-
Scruber.configuration.merge_options(options)
|
47
|
+
Scruber.configuration.merge_options(@options)
|
48
48
|
ActiveSupport::Dependencies.autoload_paths = Scruber.configuration.autoload_paths
|
49
49
|
|
50
50
|
@queue = Scruber::Queue.new(scraper_name: @scraper_name)
|
@@ -53,9 +53,9 @@ module Scruber
|
|
53
53
|
load_extenstions
|
54
54
|
end
|
55
55
|
|
56
|
-
#
|
56
|
+
#
|
57
57
|
# Crawling engine
|
58
|
-
#
|
58
|
+
#
|
59
59
|
# @param block [Proc] crawler body
|
60
60
|
def run(&block)
|
61
61
|
instance_eval &block
|
@@ -75,33 +75,33 @@ module Scruber
|
|
75
75
|
end
|
76
76
|
end
|
77
77
|
end
|
78
|
-
@on_complete_callbacks.sort_by{|c| -c[0] }.
|
78
|
+
@on_complete_callbacks.sort_by{|c| -c[0] }.map do |(_,callback)|
|
79
79
|
instance_exec &(callback)
|
80
|
-
end
|
80
|
+
end.first
|
81
81
|
end
|
82
82
|
|
83
|
-
#
|
83
|
+
#
|
84
84
|
# Register parser
|
85
|
-
#
|
85
|
+
#
|
86
86
|
# @param page_type [Symbol] type of page
|
87
87
|
# @param options [Hash] options for parser
|
88
|
-
# @option options [Symbol] :format format of page. Scruber automatically process
|
88
|
+
# @option options [Symbol] :format format of page. Scruber automatically process
|
89
89
|
# page body depends on this format. For example :json or :html
|
90
90
|
# @param block [Proc] body of parser
|
91
|
-
#
|
91
|
+
#
|
92
92
|
# @return [void]
|
93
93
|
def parser(page_type, options={}, &block)
|
94
94
|
register_callback(page_type, options, &block)
|
95
95
|
end
|
96
96
|
|
97
|
-
#
|
97
|
+
#
|
98
98
|
# Method missing callback. Scruber allows to register
|
99
99
|
# regexp and proc body to process calls
|
100
|
-
#
|
100
|
+
#
|
101
101
|
# @param method_sym [Symbol] missing method name
|
102
102
|
# @param arguments [Array] arguments
|
103
103
|
# @param block [Proc] block (if passed)
|
104
|
-
#
|
104
|
+
#
|
105
105
|
# @return [type] [description]
|
106
106
|
def method_missing(method_sym, *arguments, &block)
|
107
107
|
Scruber::Core::Crawler._registered_method_missings.each do |(pattern, func)|
|
@@ -124,27 +124,27 @@ module Scruber
|
|
124
124
|
|
125
125
|
class << self
|
126
126
|
|
127
|
-
#
|
127
|
+
#
|
128
128
|
# Register method missing callback
|
129
|
-
#
|
129
|
+
#
|
130
130
|
# @param pattern [Regexp] Regexp to match missing name
|
131
131
|
# @param block [Proc] Body to process missing method
|
132
|
-
#
|
132
|
+
#
|
133
133
|
# @return [void]
|
134
134
|
def register_method_missing(pattern, &block)
|
135
135
|
_registered_method_missings[pattern] = block
|
136
136
|
end
|
137
137
|
|
138
|
-
#
|
138
|
+
#
|
139
139
|
# Registered method missing callbacks dictionary
|
140
|
-
#
|
140
|
+
#
|
141
141
|
# @return [Hash] callbacks
|
142
142
|
def _registered_method_missings
|
143
143
|
@registered_method_missings ||= {}
|
144
144
|
end
|
145
145
|
end
|
146
146
|
|
147
|
-
#
|
147
|
+
#
|
148
148
|
# Register callback which will be executed when
|
149
149
|
# downloading and parsing will be completed.
|
150
150
|
# For example when you need to write results to file,
|
@@ -153,16 +153,16 @@ module Scruber
|
|
153
153
|
# on_complete -1 do
|
154
154
|
# Scruber::Core::Extensions::CsvOutput.close_all
|
155
155
|
# end
|
156
|
-
#
|
156
|
+
#
|
157
157
|
# @param priority [Integer] priority of this callback
|
158
158
|
# @param block [Proc] body of callback
|
159
|
-
#
|
159
|
+
#
|
160
160
|
# @return [void]
|
161
161
|
def on_complete(priority=1, &block)
|
162
162
|
@on_complete_callbacks.push [priority,block]
|
163
163
|
end
|
164
164
|
|
165
|
-
#
|
165
|
+
#
|
166
166
|
# Register callback which will be executed for
|
167
167
|
# error pages, like 404 or 500
|
168
168
|
# Attention! You should call one of these methods for page
|
@@ -178,9 +178,9 @@ module Scruber
|
|
178
178
|
# page.delete
|
179
179
|
# end
|
180
180
|
# end
|
181
|
-
#
|
181
|
+
#
|
182
182
|
# @param block [Proc] body of callback
|
183
|
-
#
|
183
|
+
#
|
184
184
|
# @return [void]
|
185
185
|
def on_page_error(&block)
|
186
186
|
@on_page_error_callback = block
|
@@ -188,46 +188,46 @@ module Scruber
|
|
188
188
|
|
189
189
|
private
|
190
190
|
|
191
|
-
#
|
191
|
+
#
|
192
192
|
# Register parser
|
193
|
-
#
|
193
|
+
#
|
194
194
|
# @param page_type [Symbol] type of page
|
195
195
|
# @param options [Hash] options for parser
|
196
|
-
# @option options [Symbol] :format format of page. Scruber automatically process
|
196
|
+
# @option options [Symbol] :format format of page. Scruber automatically process
|
197
197
|
# page body depends on this format. For example :json or :html
|
198
198
|
# @param block [Proc] body of parser
|
199
|
-
#
|
199
|
+
#
|
200
200
|
# @return [void]
|
201
201
|
def register_callback(page_type, options, &block)
|
202
202
|
@callbacks_options[page_type.to_sym] = options || {}
|
203
203
|
@callbacks[page_type.to_sym] = block
|
204
204
|
end
|
205
205
|
|
206
|
-
#
|
206
|
+
#
|
207
207
|
# Process page body depends on format of this page
|
208
208
|
# For example, if page_format = :html, then
|
209
209
|
# it will return Nokogiri::HTML(page.response_body)
|
210
|
-
#
|
210
|
+
#
|
211
211
|
# @param page [Page] page from queue
|
212
212
|
# @param page_type [Symbol] name of parser
|
213
|
-
#
|
213
|
+
#
|
214
214
|
# @return [Object] depends on page_type it will return different objects
|
215
215
|
def process_page(page, page_type)
|
216
216
|
page_format = @callbacks_options[page_type].fetch(:format){ nil }
|
217
217
|
Scruber::Core::PageFormat.process(page, page_format)
|
218
218
|
end
|
219
219
|
|
220
|
-
#
|
220
|
+
#
|
221
221
|
# Loads all extensions
|
222
|
-
#
|
222
|
+
#
|
223
223
|
# @return [void]
|
224
224
|
def load_extenstions
|
225
225
|
Scruber::Core::Extensions::Base.descendants.each(&:register)
|
226
226
|
end
|
227
227
|
|
228
|
-
#
|
228
|
+
#
|
229
229
|
# Initialize progressbar, that shows progress in console
|
230
|
-
#
|
230
|
+
#
|
231
231
|
# @return [void]
|
232
232
|
def initialize_progressbar
|
233
233
|
unless Scruber.configuration.silent
|
@@ -243,9 +243,9 @@ module Scruber
|
|
243
243
|
end
|
244
244
|
end
|
245
245
|
|
246
|
-
#
|
246
|
+
#
|
247
247
|
# Output progress to console
|
248
|
-
#
|
248
|
+
#
|
249
249
|
# @return [void]
|
250
250
|
def show_progress
|
251
251
|
if @progressbar
|
@@ -1,3 +1,5 @@
|
|
1
|
+
require 'charlock_holmes'
|
2
|
+
|
1
3
|
module Scruber
|
2
4
|
module FetcherAdapters
|
3
5
|
class AbstractAdapter
|
@@ -45,9 +47,24 @@ module Scruber
|
|
45
47
|
page.fetched_at = Time.now.to_i
|
46
48
|
end
|
47
49
|
end
|
50
|
+
if page.response_headers
|
51
|
+
page.response_headers = page.response_headers.inject({}) {|acc, (k,v)| acc[k.gsub('.', '_')] = v.is_a?(Array) ? v.map{|v1| convert_to_utf8(v1) } : convert_to_utf8(v); acc }
|
52
|
+
end
|
53
|
+
page.response_body = convert_to_utf8(page.response_body)
|
48
54
|
page
|
49
55
|
end
|
50
56
|
|
57
|
+
def convert_to_utf8(text)
|
58
|
+
unless text.to_s.empty?
|
59
|
+
detection = CharlockHolmes::EncodingDetector.detect(text)
|
60
|
+
if detection && detection[:encoding].present?
|
61
|
+
text = CharlockHolmes::Converter.convert(text, detection[:encoding], 'UTF-8') rescue text
|
62
|
+
end
|
63
|
+
end
|
64
|
+
|
65
|
+
text
|
66
|
+
end
|
67
|
+
|
51
68
|
def headers_for(page)
|
52
69
|
if page.fetcher_agent
|
53
70
|
headers = page.fetcher_agent.headers
|
@@ -60,7 +60,7 @@ module Scruber
|
|
60
60
|
def on_complete_callback(page, response)
|
61
61
|
page.response_code = response.code
|
62
62
|
page.response_body = response.body
|
63
|
-
page.response_headers = response.
|
63
|
+
page.response_headers = response.headers
|
64
64
|
page.response_total_time = response.total_time
|
65
65
|
|
66
66
|
if response.timed_out?
|
@@ -134,6 +134,16 @@ module Scruber
|
|
134
134
|
raise NotImplementedError
|
135
135
|
end
|
136
136
|
|
137
|
+
|
138
|
+
#
|
139
|
+
# Join url of current page with another path or url
|
140
|
+
# @param link_url [String] link
|
141
|
+
#
|
142
|
+
# @return [String] joined url
|
143
|
+
def url_join(link_url)
|
144
|
+
URI.join(url, link_url).to_s
|
145
|
+
end
|
146
|
+
|
137
147
|
def [](k)
|
138
148
|
instance_variable_get("@#{k.to_s}")
|
139
149
|
end
|
data/lib/scruber/version.rb
CHANGED
data/scruber.gemspec
CHANGED
@@ -37,6 +37,7 @@ Gem::Specification.new do |spec|
|
|
37
37
|
spec.add_dependency "activesupport", '~> 5.1', '>= 5.1.5'
|
38
38
|
spec.add_dependency "powerbar", '~> 2.0', '>= 2.0.1'
|
39
39
|
spec.add_dependency "paint", '~> 2.0', '>= 2.0.1'
|
40
|
+
spec.add_dependency "charlock_holmes", '~> 0.7', '>= 0.7.6'
|
40
41
|
spec.add_runtime_dependency "thor", "0.20.0"
|
41
42
|
spec.add_development_dependency "bundler", "~> 1.15"
|
42
43
|
spec.add_development_dependency "rake", "~> 10.0"
|
@@ -11,16 +11,19 @@ RSpec.describe Scruber::Core::Extensions::Loop do
|
|
11
11
|
|
12
12
|
it "should add dictionary and read info" do
|
13
13
|
Scruber::Core::Extensions::Loop.register
|
14
|
-
|
15
|
-
Scruber.run :sample do
|
14
|
+
zip_codes = Scruber.run :sample do
|
16
15
|
add_dictionary :zip_codes_usa, File.expand_path(File.dirname(__FILE__))+'/dict.csv', :csv
|
16
|
+
@zip_codes = []
|
17
|
+
|
17
18
|
seed do
|
18
19
|
loop :zip_codes_usa, state: 'NY' do |row|
|
19
|
-
|
20
|
+
@zip_codes.push row['zip']
|
20
21
|
end
|
21
22
|
end
|
23
|
+
|
24
|
+
on_complete { @zip_codes }
|
22
25
|
end
|
23
|
-
expect(
|
26
|
+
expect(zip_codes).to eq(['10001', '10002'])
|
24
27
|
end
|
25
28
|
end
|
26
29
|
end
|
@@ -19,15 +19,17 @@ RSpec.describe Scruber::Core::Extensions::ParserAliases do
|
|
19
19
|
|
20
20
|
stub_request(:get, "http://example.com").to_return(body: 'Example Domain')
|
21
21
|
|
22
|
-
Scruber.run :sample do
|
22
|
+
page = Scruber.run :sample do
|
23
23
|
get "http://example.com"
|
24
|
-
|
24
|
+
|
25
25
|
parse do |page|
|
26
|
-
|
26
|
+
@queue_page = page
|
27
27
|
end
|
28
|
+
|
29
|
+
on_complete { @queue_page }
|
28
30
|
end
|
29
|
-
expect(
|
30
|
-
expect(
|
31
|
+
expect(page.url).to eq("http://example.com")
|
32
|
+
expect(page.page_type.to_s).to eq("seed")
|
31
33
|
end
|
32
34
|
|
33
35
|
it "should register parser with custom page_type" do
|
@@ -35,16 +37,18 @@ RSpec.describe Scruber::Core::Extensions::ParserAliases do
|
|
35
37
|
|
36
38
|
stub_request(:post, "http://example.com").to_return(body: 'Example Domain')
|
37
39
|
|
38
|
-
Scruber.run :sample do
|
40
|
+
page = Scruber.run :sample do
|
39
41
|
post_product "http://example.com"
|
40
|
-
|
42
|
+
|
41
43
|
parse_product do |page|
|
42
|
-
|
44
|
+
@queue_page = page
|
43
45
|
end
|
46
|
+
|
47
|
+
on_complete { @queue_page }
|
44
48
|
end
|
45
|
-
expect(
|
46
|
-
expect(
|
47
|
-
expect(
|
49
|
+
expect(page.url).to eq("http://example.com")
|
50
|
+
expect(page.method.to_s).to eq("post")
|
51
|
+
expect(page.page_type.to_s).to eq("product")
|
48
52
|
end
|
49
53
|
end
|
50
54
|
|
@@ -54,17 +58,19 @@ RSpec.describe Scruber::Core::Extensions::ParserAliases do
|
|
54
58
|
|
55
59
|
stub_request(:get, "http://example.com").to_return(body: '<div><span>Example Domain</span></div>')
|
56
60
|
|
57
|
-
Scruber.run :sample do
|
61
|
+
page, doc = Scruber.run :sample do
|
58
62
|
get "http://example.com"
|
59
|
-
|
63
|
+
|
60
64
|
parse :html do |page,doc|
|
61
|
-
|
62
|
-
|
65
|
+
@queue_page = page
|
66
|
+
@doc = doc
|
63
67
|
end
|
68
|
+
|
69
|
+
on_complete { [@queue_page, @doc] }
|
64
70
|
end
|
65
|
-
expect(
|
66
|
-
expect(
|
67
|
-
expect(
|
71
|
+
expect(doc.at('span').text).to eq("Example Domain")
|
72
|
+
expect(page.page_type.to_s).to eq("seed")
|
73
|
+
expect(page.method.to_s).to eq("get")
|
68
74
|
end
|
69
75
|
|
70
76
|
it "should register parser with custom page_type" do
|
@@ -72,17 +78,19 @@ RSpec.describe Scruber::Core::Extensions::ParserAliases do
|
|
72
78
|
|
73
79
|
stub_request(:post, "http://example.com").to_return(body: '<div><span>Example Post</span></div>')
|
74
80
|
|
75
|
-
Scruber.run :sample do
|
81
|
+
page, doc = Scruber.run :sample do
|
76
82
|
post_product "http://example.com"
|
77
|
-
|
83
|
+
|
78
84
|
parse_product :html do |page,doc|
|
79
|
-
|
80
|
-
|
85
|
+
@queue_page = page
|
86
|
+
@doc = doc
|
81
87
|
end
|
88
|
+
|
89
|
+
on_complete { [@queue_page, @doc] }
|
82
90
|
end
|
83
|
-
expect(
|
84
|
-
expect(
|
85
|
-
expect(
|
91
|
+
expect(doc.at('span').text).to eq("Example Post")
|
92
|
+
expect(page.method.to_s).to eq("post")
|
93
|
+
expect(page.page_type.to_s).to eq("product")
|
86
94
|
end
|
87
95
|
end
|
88
96
|
end
|
@@ -19,25 +19,29 @@ RSpec.describe Scruber::Core::Extensions::QueueAliases do
|
|
19
19
|
it "should add page to queue" do
|
20
20
|
described_class.register
|
21
21
|
|
22
|
-
Scruber.run :sample do
|
22
|
+
page = Scruber.run :sample do
|
23
23
|
get "http://example.com"
|
24
|
-
|
24
|
+
|
25
|
+
@queue_page = queue.fetch_pending
|
26
|
+
on_complete { @queue_page }
|
25
27
|
end
|
26
|
-
expect(
|
27
|
-
expect(
|
28
|
-
expect(
|
28
|
+
expect(page.url).to eq("http://example.com")
|
29
|
+
expect(page.method.to_s).to eq("get")
|
30
|
+
expect(page.page_type.to_s).to eq("seed")
|
29
31
|
end
|
30
32
|
|
31
33
|
it "should add page to queue" do
|
32
34
|
described_class.register
|
33
35
|
|
34
|
-
Scruber.run :sample do
|
36
|
+
page = Scruber.run :sample do
|
35
37
|
post_product "http://example.com"
|
36
|
-
|
38
|
+
|
39
|
+
@queue_page = queue.fetch_pending
|
40
|
+
on_complete { @queue_page }
|
37
41
|
end
|
38
|
-
expect(
|
39
|
-
expect(
|
40
|
-
expect(
|
42
|
+
expect(page.url).to eq("http://example.com")
|
43
|
+
expect(page.method.to_s).to eq("post")
|
44
|
+
expect(page.page_type).to eq("product")
|
41
45
|
end
|
42
46
|
end
|
43
47
|
|
@@ -45,27 +49,31 @@ RSpec.describe Scruber::Core::Extensions::QueueAliases do
|
|
45
49
|
it "should add page to queue" do
|
46
50
|
described_class.register
|
47
51
|
|
48
|
-
Scruber.run :sample do
|
52
|
+
page = Scruber.run :sample do
|
49
53
|
get "http://example.com", user_agent: 'Agent 1'
|
50
|
-
|
54
|
+
|
55
|
+
@queue_page = queue.fetch_pending
|
56
|
+
on_complete { @queue_page }
|
51
57
|
end
|
52
|
-
expect(
|
53
|
-
expect(
|
54
|
-
expect(
|
55
|
-
expect(
|
58
|
+
expect(page.url).to eq("http://example.com")
|
59
|
+
expect(page.method.to_s).to eq("get")
|
60
|
+
expect(page.page_type.to_s).to eq("seed")
|
61
|
+
expect(page.user_agent).to eq('Agent 1')
|
56
62
|
end
|
57
63
|
|
58
64
|
it "should add page to queue" do
|
59
65
|
described_class.register
|
60
66
|
|
61
|
-
Scruber.run :sample do
|
67
|
+
page = Scruber.run :sample do
|
62
68
|
post_product "http://example.com", user_agent: 'Agent 1'
|
63
|
-
|
69
|
+
|
70
|
+
@queue_page = queue.fetch_pending
|
71
|
+
on_complete { @queue_page }
|
64
72
|
end
|
65
|
-
expect(
|
66
|
-
expect(
|
67
|
-
expect(
|
68
|
-
expect(
|
73
|
+
expect(page.url).to eq("http://example.com")
|
74
|
+
expect(page.method.to_s).to eq("post")
|
75
|
+
expect(page.page_type).to eq("product")
|
76
|
+
expect(page.user_agent).to eq('Agent 1')
|
69
77
|
end
|
70
78
|
end
|
71
79
|
end
|
@@ -14,31 +14,32 @@ RSpec.describe Scruber::Core::Extensions::Seed do
|
|
14
14
|
stub_request(:get, "http://example.com").to_return(body: '<div><a>Main</a></div>')
|
15
15
|
stub_request(:get, "http://example.com/contacts").to_return(body: '<div><a>Contacts</a></div>')
|
16
16
|
end
|
17
|
-
|
17
|
+
|
18
18
|
it "should execute seed block" do
|
19
|
-
|
20
|
-
Scruber.run :sample do
|
19
|
+
queue_size = Scruber.run :sample do
|
21
20
|
seed do
|
22
21
|
get 'http://example.com'
|
23
22
|
end
|
24
|
-
|
23
|
+
@queue_size = queue.size
|
24
|
+
on_complete { @queue_size }
|
25
25
|
end
|
26
|
-
expect(
|
26
|
+
expect(queue_size).to eq(1)
|
27
27
|
end
|
28
28
|
|
29
29
|
it "should not execute seed block" do
|
30
|
-
|
31
|
-
Scruber.run :sample do
|
30
|
+
queue_size, page = Scruber.run :sample do
|
32
31
|
seed do
|
33
32
|
get 'http://example.com'
|
34
33
|
end
|
35
34
|
seed do
|
36
35
|
get 'http://example.com/contacts'
|
37
36
|
end
|
38
|
-
|
39
|
-
|
37
|
+
@queue_size = queue.size
|
38
|
+
@queue_page = queue.fetch_pending
|
39
|
+
|
40
|
+
on_complete { [@queue_size, @queue_page] }
|
40
41
|
end
|
41
|
-
expect(
|
42
|
-
expect(
|
42
|
+
expect(queue_size).to eq(1)
|
43
|
+
expect(page.url).to eq("http://example.com")
|
43
44
|
end
|
44
45
|
end
|
@@ -13,7 +13,7 @@ RSpec.describe Scruber::Helpers::FetcherAgentAdapters::AbstractAdapter do
|
|
13
13
|
cookie_jar: cookie_jar_string,
|
14
14
|
disable_proxy: true
|
15
15
|
end
|
16
|
-
|
16
|
+
|
17
17
|
it "set values" do
|
18
18
|
expect(agent.id).to eq(1)
|
19
19
|
expect(agent.user_agent).to eq('Scruber')
|
@@ -33,7 +33,8 @@ RSpec.describe Scruber::Helpers::FetcherAgentAdapters::AbstractAdapter do
|
|
33
33
|
end
|
34
34
|
|
35
35
|
it "parse cookies from page" do
|
36
|
-
page = Scruber::QueueAdapters::AbstractAdapter::Page.new(nil, url: 'http://example.com', response_headers: {"Connection" => "keep-alive","Set-Cookie" => "__cfduid=dc8db498b1e419b7943052a69c8e9d1d01504311966; expires=Sun, 02-Sep
|
36
|
+
page = Scruber::QueueAdapters::AbstractAdapter::Page.new(nil, url: 'http://example.com', response_headers: {"Connection" => "keep-alive","Set-Cookie" => "__cfduid=dc8db498b1e419b7943052a69c8e9d1d01504311966; expires=Sun, 02-Sep-#{Date.today.year+1} 00:26:06 GMT; path=/; domain=example.com; HttpOnly"})
|
37
|
+
puts page.response_cookies.inspect
|
37
38
|
agent.parse_cookies_from_page!(page)
|
38
39
|
expect(agent.cookie_for('http://example.com')).to eq('__cfduid=dc8db498b1e419b7943052a69c8e9d1d01504311966; feed_flow=top')
|
39
40
|
end
|
data/spec/scruber_spec.rb
CHANGED
@@ -20,7 +20,7 @@ RSpec.describe Scruber do
|
|
20
20
|
config.fetcher_adapter = :typhoeus_fetcher
|
21
21
|
end
|
22
22
|
end
|
23
|
-
|
23
|
+
|
24
24
|
it "returns :typhoeus_fetcher as fetcher" do
|
25
25
|
expect(Scruber.configuration.fetcher_adapter).to eq(:typhoeus_fetcher)
|
26
26
|
end
|
@@ -34,70 +34,70 @@ RSpec.describe Scruber do
|
|
34
34
|
|
35
35
|
it "should set scraper name from ENV" do
|
36
36
|
ENV['SCRUBER_SCRAPER_NAME'] = 'sample'
|
37
|
-
Scruber.run do
|
38
|
-
|
37
|
+
name = Scruber.run do
|
38
|
+
on_complete { scraper_name }
|
39
39
|
end
|
40
|
-
expect(
|
40
|
+
expect(name).to eq(:sample)
|
41
41
|
end
|
42
42
|
end
|
43
43
|
|
44
44
|
context "with args" do
|
45
45
|
it "should set scraper name from first arg" do
|
46
|
-
Scruber.run :sample1 do
|
47
|
-
|
46
|
+
name = Scruber.run :sample1 do
|
47
|
+
on_complete { scraper_name }
|
48
48
|
end
|
49
|
-
expect(
|
49
|
+
expect(name).to eq(:sample1)
|
50
50
|
end
|
51
51
|
|
52
52
|
it "should set scraper name from first arg, and options from second" do
|
53
|
-
Scruber.run :sample2, queue_adapter: :test do
|
54
|
-
|
55
|
-
$opt = Scruber.configuration.queue_adapter
|
53
|
+
name, opt = Scruber.run :sample2, queue_adapter: :test do
|
54
|
+
on_complete { [scraper_name, Scruber.configuration.queue_adapter] }
|
56
55
|
end
|
57
|
-
expect(
|
58
|
-
expect(
|
56
|
+
expect(name).to eq(:sample2)
|
57
|
+
expect(opt).to eq(:test)
|
59
58
|
end
|
60
59
|
|
61
60
|
it "options from first arg and scraper_name from ENV" do
|
62
61
|
ENV['SCRUBER_SCRAPER_NAME'] = 'sample'
|
63
|
-
Scruber.run queue_adapter: :test2 do
|
64
|
-
|
65
|
-
$opt = Scruber.configuration.queue_adapter
|
62
|
+
name, opt = Scruber.run queue_adapter: :test2 do
|
63
|
+
on_complete { [scraper_name, Scruber.configuration.queue_adapter] }
|
66
64
|
end
|
67
|
-
expect(
|
68
|
-
expect(
|
65
|
+
expect(name).to eq(:sample)
|
66
|
+
expect(opt).to eq(:test2)
|
69
67
|
end
|
70
68
|
|
71
69
|
it "should raise error if passed only options without ENV" do
|
72
70
|
ENV['SCRUBER_SCRAPER_NAME'] = nil
|
73
|
-
expect { Scruber.run(queue_adapter: :test2) {
|
71
|
+
expect { Scruber.run(queue_adapter: :test2) { scraper_name } }.to raise_error(Scruber::ArgumentError)
|
74
72
|
end
|
75
73
|
end
|
76
74
|
|
77
75
|
it "simple example" do
|
78
76
|
stub_request(:get, "http://example.com").to_return(body: 'Example Domain')
|
79
77
|
|
80
|
-
Scruber.run :sample do
|
78
|
+
body = Scruber.run :sample do
|
81
79
|
queue.add "http://example.com"
|
82
|
-
|
80
|
+
|
83
81
|
parser :seed do |page|
|
84
|
-
|
82
|
+
@page_response_body = page.response_body
|
85
83
|
end
|
84
|
+
on_complete { @page_response_body }
|
86
85
|
end
|
87
|
-
expect(
|
86
|
+
expect(body).to eq('Example Domain')
|
88
87
|
end
|
89
88
|
|
90
89
|
it "should return Nokogiri object" do
|
91
90
|
stub_request(:get, "http://example.com/contacts.html").to_return(body: '<div><a>Contacts</a></div>')
|
92
91
|
|
93
|
-
Scruber.run :sample do
|
92
|
+
title = Scruber.run :sample do
|
94
93
|
queue.add "http://example.com/contacts.html"
|
95
|
-
|
94
|
+
|
96
95
|
parser :seed, format: :html do |page, html|
|
97
|
-
|
96
|
+
@title = html.at('a').text
|
98
97
|
end
|
98
|
+
on_complete { @title }
|
99
99
|
end
|
100
|
-
expect(
|
100
|
+
expect(title).to eq('Contacts')
|
101
101
|
end
|
102
102
|
|
103
103
|
context "complex example" do
|
@@ -107,10 +107,9 @@ RSpec.describe Scruber do
|
|
107
107
|
stub_request(:get, "http://example.com/product2").to_return(body: '<div><h1>Product 2</h1></div>')
|
108
108
|
stub_request(:get, "http://example.com/product3").to_return(body: '<div><h1>Product 3</h1></div>')
|
109
109
|
|
110
|
-
|
111
|
-
Scruber.run :sample do
|
110
|
+
products = Scruber.run :sample do
|
112
111
|
get "http://example.com/catalog"
|
113
|
-
|
112
|
+
|
114
113
|
parse :html do |page, doc|
|
115
114
|
doc.search('a').each do |a|
|
116
115
|
get_product URI.join(page.url, a.attr('href')).to_s
|
@@ -118,29 +117,34 @@ RSpec.describe Scruber do
|
|
118
117
|
end
|
119
118
|
|
120
119
|
parse_product :html do |page,doc|
|
121
|
-
|
120
|
+
@products ||= []
|
121
|
+
@products.push doc.at('h1').text
|
122
122
|
end
|
123
|
+
|
124
|
+
on_complete { @products }
|
123
125
|
end
|
124
|
-
expect(
|
126
|
+
expect(products.sort).to eq((1..3).map{|i| "Product #{i}"}.sort)
|
125
127
|
end
|
126
128
|
|
127
129
|
it "should redownload page and increase retry" do
|
128
130
|
stub_request(:get, "http://example.com/").to_return(body: '<div>blocked</div>').times(2).then.to_return(body: '<div><h1>Product</h1></div>')
|
129
131
|
|
130
|
-
Scruber.run :sample do
|
132
|
+
title, retry_count = Scruber.run :sample do
|
131
133
|
get "http://example.com/"
|
132
|
-
|
134
|
+
|
133
135
|
parse :html do |page, doc|
|
134
136
|
if page.response_body =~ /blocked/
|
135
137
|
page.redownload!
|
136
138
|
else
|
137
|
-
|
138
|
-
|
139
|
+
@title = doc.at('h1').text
|
140
|
+
@retry_count = page.retry_count
|
139
141
|
end
|
140
142
|
end
|
143
|
+
|
144
|
+
on_complete { [@title, @retry_count] }
|
141
145
|
end
|
142
|
-
expect(
|
143
|
-
expect(
|
146
|
+
expect(title).to eq('Product')
|
147
|
+
expect(retry_count).to eq(2)
|
144
148
|
end
|
145
149
|
end
|
146
150
|
|
@@ -148,39 +152,41 @@ RSpec.describe Scruber do
|
|
148
152
|
it "should process 500 error page" do
|
149
153
|
stub_request(:get, "http://example.com").to_return(body: '<div><h1>500</h1></div>', status: 500)
|
150
154
|
|
151
|
-
|
152
|
-
Scruber.run :sample do
|
155
|
+
error_title = Scruber.run :sample do
|
153
156
|
get "http://example.com", max_retry_times: 1
|
154
157
|
|
155
158
|
parse :html do |page,doc|
|
156
|
-
|
159
|
+
@error_title = doc.at('h1').text
|
157
160
|
end
|
158
161
|
|
159
162
|
on_page_error do |page|
|
160
|
-
|
163
|
+
@error_title = page.response_body
|
161
164
|
page.processed!
|
162
165
|
end
|
166
|
+
|
167
|
+
on_complete { @error_title }
|
163
168
|
end
|
164
|
-
expect(
|
169
|
+
expect(error_title).to eq('<div><h1>500</h1></div>')
|
165
170
|
end
|
166
171
|
|
167
172
|
it "should process 404 error page" do
|
168
173
|
stub_request(:get, "http://example.com").to_return(body: '<div><h1>404</h1></div>', status: 404)
|
169
174
|
|
170
|
-
|
171
|
-
Scruber.run :sample do
|
175
|
+
error_title = Scruber.run :sample do
|
172
176
|
get "http://example.com", max_retry_times: 1
|
173
177
|
|
174
178
|
parse :html do |page,doc|
|
175
|
-
|
179
|
+
@error_title = doc.at('h1').text
|
176
180
|
end
|
177
181
|
|
178
182
|
on_page_error do |page|
|
179
|
-
|
183
|
+
@error_title = page.response_body
|
180
184
|
page.processed!
|
181
185
|
end
|
186
|
+
|
187
|
+
on_complete { @error_title }
|
182
188
|
end
|
183
|
-
expect(
|
189
|
+
expect(error_title).to eq('<div><h1>404</h1></div>')
|
184
190
|
end
|
185
191
|
end
|
186
192
|
end
|
@@ -167,5 +167,10 @@ RSpec.shared_examples "queue_adapter" do
|
|
167
167
|
expect(page1.id).not_to be_blank
|
168
168
|
expect(page1.id).not_to eq(page2.id)
|
169
169
|
end
|
170
|
+
|
171
|
+
it "should join url" do
|
172
|
+
page1 = page_class.new queue, url: "http://example.com/product1"
|
173
|
+
expect(page1.url_join('/abc')).to eq("http://example.com/abc")
|
174
|
+
end
|
170
175
|
end
|
171
176
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: scruber
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.6
|
4
|
+
version: 0.1.9
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ivan Goncharov
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2018-
|
11
|
+
date: 2018-09-06 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: typhoeus
|
@@ -138,6 +138,26 @@ dependencies:
|
|
138
138
|
- - ">="
|
139
139
|
- !ruby/object:Gem::Version
|
140
140
|
version: 2.0.1
|
141
|
+
- !ruby/object:Gem::Dependency
|
142
|
+
name: charlock_holmes
|
143
|
+
requirement: !ruby/object:Gem::Requirement
|
144
|
+
requirements:
|
145
|
+
- - "~>"
|
146
|
+
- !ruby/object:Gem::Version
|
147
|
+
version: '0.7'
|
148
|
+
- - ">="
|
149
|
+
- !ruby/object:Gem::Version
|
150
|
+
version: 0.7.6
|
151
|
+
type: :runtime
|
152
|
+
prerelease: false
|
153
|
+
version_requirements: !ruby/object:Gem::Requirement
|
154
|
+
requirements:
|
155
|
+
- - "~>"
|
156
|
+
- !ruby/object:Gem::Version
|
157
|
+
version: '0.7'
|
158
|
+
- - ">="
|
159
|
+
- !ruby/object:Gem::Version
|
160
|
+
version: 0.7.6
|
141
161
|
- !ruby/object:Gem::Dependency
|
142
162
|
name: thor
|
143
163
|
requirement: !ruby/object:Gem::Requirement
|
@@ -252,6 +272,7 @@ files:
|
|
252
272
|
- lib/scruber/core/page_format.rb
|
253
273
|
- lib/scruber/core/page_format/base.rb
|
254
274
|
- lib/scruber/core/page_format/html.rb
|
275
|
+
- lib/scruber/core/page_format/json.rb
|
255
276
|
- lib/scruber/core/page_format/xml.rb
|
256
277
|
- lib/scruber/fetcher.rb
|
257
278
|
- lib/scruber/fetcher_adapters/abstract_adapter.rb
|