scruber 0.1.5 → 0.1.6
- checksums.yaml +4 -4
- data/lib/scruber/version.rb +1 -1
- data/scruber.gemspec +1 -1
- data/spec/core/extensions/csv_output_spec.rb +44 -0
- data/spec/core/extensions/dict.csv +4 -0
- data/spec/core/extensions/log_spec.rb +25 -0
- data/spec/core/extensions/loop_spec.rb +26 -0
- data/spec/core/extensions/parser_aliases_spec.rb +89 -0
- data/spec/core/extensions/queue_aliases_spec.rb +72 -0
- data/spec/core/extensions/seed_spec.rb +44 -0
- data/spec/fetcher.rb +27 -0
- data/spec/helpers/dictionary_reader/dict.csv +4 -0
- data/spec/helpers/dictionary_reader/dict.xml +5 -0
- data/spec/helpers/dictionary_reader/dict_records.xml +5 -0
- data/spec/helpers/dictionary_reader/dictionary_reader_csv_spec.rb +36 -0
- data/spec/helpers/dictionary_reader/dictionary_reader_xml_spec.rb +46 -0
- data/spec/helpers/fetcher_agent_adapters/abstract_adapter_spec.rb +46 -0
- data/spec/helpers/fetcher_agent_adapters/memory_spec.rb +45 -0
- data/spec/helpers/proxy_rotator/proxy_rotator_proxy_spec.rb +21 -0
- data/spec/helpers/proxy_rotator/proxy_rotator_spec.rb +118 -0
- data/spec/helpers/user_agent_rotator/user_agent_rotator_spec.rb +145 -0
- data/spec/helpers/user_agent_rotator/user_agent_rotator_user_agent_spec.rb +40 -0
- data/spec/helpers/user_agent_rotator/user_agents.xml +6 -0
- data/spec/queue_adapter/memory_spec.rb +15 -0
- data/spec/queue_spec.rb +27 -0
- data/spec/scruber_spec.rb +198 -0
- data/spec/spec_helper.rb +36 -0
- data/spec/support/queue/queue_adapter.rb +171 -0
- metadata +26 -1
data/spec/scruber_spec.rb
ADDED
@@ -0,0 +1,198 @@
+require "spec_helper"
+
+RSpec.describe Scruber do
+  before do
+    Scruber::Helpers::UserAgentRotator.configure do
+      clean
+      set_filter :all
+      add "Scruber 1.0", tags: [:robot, :scruber]
+      add "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36", tags: [:desktop, :chrome, :macos]
+    end
+  end
+
+  it "has a version number" do
+    expect(Scruber::VERSION).not_to be nil
+  end
+
+  describe "configurable" do
+    before do
+      Scruber.configure do |config|
+        config.fetcher_adapter = :typhoeus_fetcher
+      end
+    end
+
+    it "returns :typhoeus_fetcher as fetcher" do
+      expect(Scruber.configuration.fetcher_adapter).to eq(:typhoeus_fetcher)
+    end
+  end
+
+  describe "#run" do
+    context "without args" do
+      it "should raise error" do
+        expect { Scruber.run { $title = scraper_name } }.to raise_error(Scruber::ArgumentError)
+      end
+
+      it "should set scraper name from ENV" do
+        ENV['SCRUBER_SCRAPER_NAME'] = 'sample'
+        Scruber.run do
+          $scraper_name = scraper_name
+        end
+        expect($scraper_name).to eq(:sample)
+      end
+    end
+
+    context "with args" do
+      it "should set scraper name from first arg" do
+        Scruber.run :sample1 do
+          $scraper_name = scraper_name
+        end
+        expect($scraper_name).to eq(:sample1)
+      end
+
+      it "should set scraper name from first arg, and options from second" do
+        Scruber.run :sample2, queue_adapter: :test do
+          $scraper_name = scraper_name
+          $opt = Scruber.configuration.queue_adapter
+        end
+        expect($scraper_name).to eq(:sample2)
+        expect($opt).to eq(:test)
+      end
+
+      it "options from first arg and scraper_name from ENV" do
+        ENV['SCRUBER_SCRAPER_NAME'] = 'sample'
+        Scruber.run queue_adapter: :test2 do
+          $scraper_name = scraper_name
+          $opt = Scruber.configuration.queue_adapter
+        end
+        expect($scraper_name).to eq(:sample)
+        expect($opt).to eq(:test2)
+      end
+
+      it "should raise error if passed only options without ENV" do
+        ENV['SCRUBER_SCRAPER_NAME'] = nil
+        expect { Scruber.run(queue_adapter: :test2) { $title = scraper_name } }.to raise_error(Scruber::ArgumentError)
+      end
+    end
+
+    it "simple example" do
+      stub_request(:get, "http://example.com").to_return(body: 'Example Domain')
+
+      Scruber.run :sample do
+        queue.add "http://example.com"
+
+        parser :seed do |page|
+          $title = page.response_body
+        end
+      end
+      expect($title).to eq('Example Domain')
+    end
+
+    it "should return Nokogiri object" do
+      stub_request(:get, "http://example.com/contacts.html").to_return(body: '<div><a>Contacts</a></div>')
+
+      Scruber.run :sample do
+        queue.add "http://example.com/contacts.html"
+
+        parser :seed, format: :html do |page, html|
+          $title = html.at('a').text
+        end
+      end
+      expect($title).to eq('Contacts')
+    end
+
+    context "complex example" do
+      it "should parse pages in 2 steps" do
+        stub_request(:get, "http://example.com/catalog").to_return(body: '<div><a href="/product1">Product 1</a><a href="/product2">Product 2</a><a href="/product3">Product 3</a></div>')
+        stub_request(:get, "http://example.com/product1").to_return(body: '<div><h1>Product 1</h1></div>')
+        stub_request(:get, "http://example.com/product2").to_return(body: '<div><h1>Product 2</h1></div>')
+        stub_request(:get, "http://example.com/product3").to_return(body: '<div><h1>Product 3</h1></div>')
+
+        $products = []
+        Scruber.run :sample do
+          get "http://example.com/catalog"
+
+          parse :html do |page, doc|
+            doc.search('a').each do |a|
+              get_product URI.join(page.url, a.attr('href')).to_s
+            end
+          end
+
+          parse_product :html do |page, doc|
+            $products.push doc.at('h1').text
+          end
+        end
+        expect($products.sort).to eq((1..3).map{ |i| "Product #{i}" }.sort)
+      end
+
+      it "should redownload page and increase retry" do
+        stub_request(:get, "http://example.com/").to_return(body: '<div>blocked</div>').times(2).then.to_return(body: '<div><h1>Product</h1></div>')
+
+        Scruber.run :sample do
+          get "http://example.com/"
+
+          parse :html do |page, doc|
+            if page.response_body =~ /blocked/
+              page.redownload!
+            else
+              $title = doc.at('h1').text
+              $retry_count = page.retry_count
+            end
+          end
+        end
+        expect($title).to eq('Product')
+        expect($retry_count).to eq(2)
+      end
+    end
+
+    context "processing error examples" do
+      it "should process 500 error page" do
+        stub_request(:get, "http://example.com").to_return(body: '<div><h1>500</h1></div>', status: 500)
+
+        $error_title = nil
+        Scruber.run :sample do
+          get "http://example.com", max_retry_times: 1
+
+          parse :html do |page, doc|
+            $error_title = doc.at('h1').text
+          end
+
+          on_page_error do |page|
+            $error_title = page.response_body
+            page.processed!
+          end
+        end
+        expect($error_title).to eq('<div><h1>500</h1></div>')
+      end
+
+      it "should process 404 error page" do
+        stub_request(:get, "http://example.com").to_return(body: '<div><h1>404</h1></div>', status: 404)
+
+        $error_title = nil
+        Scruber.run :sample do
+          get "http://example.com", max_retry_times: 1
+
+          parse :html do |page, doc|
+            $error_title = doc.at('h1').text
+          end
+
+          on_page_error do |page|
+            $error_title = page.response_body
+            page.processed!
+          end
+        end
+        expect($error_title).to eq('<div><h1>404</h1></div>')
+      end
+    end
+  end
+
+  describe "#root" do
+    it "should return nil without APP_PATH defined" do
+      expect(Scruber.root).to eq(nil)
+    end
+
+    it "should return path object" do
+      APP_PATH = '/tmp/a/b/'
+      expect(Scruber.root.to_s).to eq('/tmp')
+    end
+  end
+end
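Taken together, this spec documents the public scraping DSL: Scruber.run names the scraper (or reads SCRUBER_SCRAPER_NAME from ENV), queue.add/get enqueue pages, parser/parse register handlers (format: :html yields a Nokogiri document), and on_page_error receives pages that exhaust max_retry_times. Below is a minimal standalone sketch assembled only from those calls; the catalog URL, the :catalog_demo name, and the selectors are illustrative placeholders, not part of the gem.

require "uri"
require "scruber"

# Hypothetical target site; every URL and selector here is illustrative.
Scruber.run :catalog_demo do
  get "http://example.com/catalog"

  # Seed parser: collect product links and queue them for the product parser,
  # mirroring the two-step catalog example in the spec above.
  parse :html do |page, doc|
    doc.search('a').each do |a|
      get_product URI.join(page.url, a.attr('href')).to_s
    end
  end

  # Product parser, registered through the parse_product alias.
  parse_product :html do |page, doc|
    puts doc.at('h1').text
  end

  # Pages that exhaust their retries land here instead of a parser.
  on_page_error do |page|
    page.processed!
  end
end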
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,36 @@
+require "bundler/setup"
+require "scruber"
+require 'webmock/rspec'
+
+Encoding.default_external = Encoding::UTF_8
+Encoding.default_internal = Encoding::UTF_8
+
+Dir[File.expand_path(File.dirname(__FILE__))+"/support/**/*.rb"].each { |f| require f }
+
+Scruber::Helpers::UserAgentRotator.configure do
+  set_filter :all
+  add "Scruber 1.0", tags: [:robot, :scruber]
+  add "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36", tags: [:desktop, :chrome, :macos]
+end
+
+RSpec.configure do |config|
+  # Enable flags like --only-failures and --next-failure
+  config.example_status_persistence_file_path = ".rspec_status"
+
+  # Disable RSpec exposing methods globally on `Module` and `main`
+  config.disable_monkey_patching!
+
+  config.expect_with :rspec do |c|
+    c.syntax = :expect
+  end
+
+  # Use color in STDOUT
+  config.color = true
+
+  # Use color not only in STDOUT but also in pagers and files
+  config.tty = true
+
+  # Use the specified formatter
+  config.formatter = :progress # :documentation, :html, :textmate
+
+end
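The spec_helper configures Scruber::Helpers::UserAgentRotator once for the whole suite, registering agents tagged :robot/:scruber and :desktop/:chrome/:macos. A hedged sketch of narrowing rotation by tag, reusing only the DSL calls this diff actually shows (clean, set_filter, add); that set_filter accepts a tag list rather than only :all is an assumption here, not something this diff confirms.

# Assumption: set_filter takes tags to restrict rotation; this diff
# only demonstrates set_filter :all.
Scruber::Helpers::UserAgentRotator.configure do
  clean                  # drop any previously registered agents
  set_filter [:desktop]  # assumed: rotate only :desktop-tagged agents
  add "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36", tags: [:desktop, :chrome, :macos]
end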
data/spec/support/queue/queue_adapter.rb
ADDED
@@ -0,0 +1,171 @@
+require "spec_helper"
+
+RSpec.shared_examples "queue_adapter" do
+
+  it "should update page" do
+    queue.add "http://example.com"
+    page = queue.fetch_pending
+    page.url = "http://example.net"
+    page.save
+    page = queue.fetch_pending
+    expect(page.url).to eq("http://example.net")
+  end
+
+  it "should update page and fetch downloaded page" do
+    queue.add "http://example.com"
+    page = queue.fetch_pending
+    page.fetched_at = Time.now.to_i
+    page.save
+    pending_page = queue.fetch_pending
+    downloaded_page = queue.fetch_downloaded
+    expect(pending_page).to eq(nil)
+    expect(downloaded_page.url).to eq("http://example.com")
+  end
+
+  describe "processing errors page" do
+    it "should fetch error page" do
+      queue.add "http://example.com"
+      page = queue.fetch_pending
+      page.retry_count = 5
+      page.max_retry_times = 5
+      page.save
+      error_page = queue.fetch_error
+      expect(error_page).not_to eq(nil)
+      expect(error_page.id).to eq(page.id)
+    end
+
+    it "should return page to downloading" do
+      queue.add "http://example.com"
+      page = queue.fetch_pending
+      page.retry_count = 5
+      page.max_retry_times = 5
+      page.save
+      error_page = queue.fetch_error
+      error_page.redownload!(0)
+      pending_page = queue.fetch_pending
+      err_page = queue.fetch_error
+      d_page = queue.fetch_downloaded
+      expect(error_page.id).to eq(pending_page.id)
+      expect(err_page).to be_nil
+      expect(d_page).to be_nil
+    end
+
+    it "should delete page from queue" do
+      queue.add "http://example.com"
+      page = queue.fetch_pending
+      page.retry_count = 5
+      page.max_retry_times = 5
+      page.save
+      error_page = queue.fetch_error
+      error_page.delete
+      pending_page = queue.fetch_pending
+      err_page = queue.fetch_error
+      d_page = queue.fetch_downloaded
+      expect(pending_page).to be_nil
+      expect(err_page).to be_nil
+      expect(d_page).to be_nil
+    end
+
+    it "should process page" do
+      queue.add "http://example.com"
+      page = queue.fetch_pending
+      page.retry_count = 5
+      page.max_retry_times = 5
+      page.save
+      error_page = queue.fetch_error
+      error_page.processed!
+      pending_page = queue.fetch_pending
+      err_page = queue.fetch_error
+      d_page = queue.fetch_downloaded
+      expect(pending_page).to be_nil
+      expect(err_page).to be_nil
+      expect(d_page).to be_nil
+      queue.add "http://example.com"
+      pending_page = queue.fetch_pending
+      expect(pending_page).to be_nil
+    end
+  end
+
+  context "#add" do
+    it "queue page for downloading" do
+      queue.add "http://example.com"
+      expect(queue.size).to eq(1)
+    end
+
+    it "should not add the same page twice" do
+      queue.add "http://example.com"
+      expect(queue.size).to eq(1)
+      queue.add "http://example.com"
+      expect(queue.size).to eq(1)
+    end
+
+    it "should not add the same page twice even if page was processed" do
+      queue.add "http://example.com"
+      page = queue.fetch_pending
+      page.fetched_at = Time.now.to_i
+      page.save
+      downloaded_page = queue.fetch_downloaded
+      downloaded_page.processed!
+      queue.add "http://example.com"
+      page = queue.fetch_pending
+      expect(page).to eq(nil)
+    end
+  end
+
+  context "#save" do
+    it "should delete page" do
+      queue.add "http://example.abc"
+      page = queue.fetch_pending
+      page.fetched_at = Time.now.to_i
+      page.save
+      page.delete
+      page = queue.fetch_downloaded
+
+      expect(page).to eq(nil)
+    end
+
+    it "should save additional arguments" do
+      queue.add "http://example.abc", id: 'abc', test_id: '1'
+      page = queue.find 'abc'
+
+      expect(page.options[:test_id]).to eq('1')
+    end
+
+    it "should not override page" do
+      queue.add "http://example.abc", id: 'abc'
+      page = queue.find 'abc'
+      page.fetched_at = 1
+      page.save
+      page = queue.find 'abc'
+      expect(page.fetched_at).to eq(1)
+      queue.add "http://example.abc", id: 'abc'
+      page = queue.find 'abc'
+      expect(page.fetched_at).to eq(1)
+    end
+  end
+
+  context "#processed!" do
+    it "should update page and set processed_at" do
+      queue.add "http://example.com"
+      page = queue.fetch_pending
+      page.fetched_at = Time.now.to_i
+      page.save
+      downloaded_page = queue.fetch_downloaded
+      downloaded_page.processed!
+      downloaded_page2 = queue.fetch_downloaded
+      expect(downloaded_page2).to eq(nil)
+      expect(downloaded_page.processed_at).to be >= 0
+    end
+  end
+
+  describe "Page" do
+    let(:page_class){ described_class.const_get(:Page) }
+
+    it "should generate different ids for different urls" do
+      page1 = page_class.new queue, url: "http://example.com/product1"
+      page2 = page_class.new queue, url: "http://example.com/product2"
+      expect(page1.id).not_to be_blank
+      expect(page1.id).not_to eq(page2.id)
+    end
+  end
+end
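These shared examples are written against whatever adapter includes them: each example calls a queue helper and the "Page" group resolves described_class.const_get(:Page). A sketch of how spec/queue_adapter/memory_spec.rb (listed above but not shown in this diff) might wire them in; the constant name follows lib/scruber/queue_adapters/memory.rb from the file list, and the zero-argument constructor is an assumption.

require "spec_helper"

# Assumed wiring: described_class is the adapter class (the shared examples
# resolve described_class.const_get(:Page)), and `queue` is a fresh adapter.
RSpec.describe Scruber::QueueAdapters::Memory do
  let(:queue){ described_class.new }  # constructor args are an assumption

  it_behaves_like "queue_adapter"
end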
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: scruber
 version: !ruby/object:Gem::Version
-  version: 0.1.5
+  version: 0.1.6
 platform: ruby
 authors:
 - Ivan Goncharov
@@ -269,6 +269,31 @@ files:
 - lib/scruber/queue_adapters/memory.rb
 - lib/scruber/version.rb
 - scruber.gemspec
+- spec/core/extensions/csv_output_spec.rb
+- spec/core/extensions/dict.csv
+- spec/core/extensions/log_spec.rb
+- spec/core/extensions/loop_spec.rb
+- spec/core/extensions/parser_aliases_spec.rb
+- spec/core/extensions/queue_aliases_spec.rb
+- spec/core/extensions/seed_spec.rb
+- spec/fetcher.rb
+- spec/helpers/dictionary_reader/dict.csv
+- spec/helpers/dictionary_reader/dict.xml
+- spec/helpers/dictionary_reader/dict_records.xml
+- spec/helpers/dictionary_reader/dictionary_reader_csv_spec.rb
+- spec/helpers/dictionary_reader/dictionary_reader_xml_spec.rb
+- spec/helpers/fetcher_agent_adapters/abstract_adapter_spec.rb
+- spec/helpers/fetcher_agent_adapters/memory_spec.rb
+- spec/helpers/proxy_rotator/proxy_rotator_proxy_spec.rb
+- spec/helpers/proxy_rotator/proxy_rotator_spec.rb
+- spec/helpers/user_agent_rotator/user_agent_rotator_spec.rb
+- spec/helpers/user_agent_rotator/user_agent_rotator_user_agent_spec.rb
+- spec/helpers/user_agent_rotator/user_agents.xml
+- spec/queue_adapter/memory_spec.rb
+- spec/queue_spec.rb
+- spec/scruber_spec.rb
+- spec/spec_helper.rb
+- spec/support/queue/queue_adapter.rb
 homepage: https://github.com/scruber/scruber
 licenses:
 - MIT