scruber 0.1.5 → 0.1.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/scruber/version.rb +1 -1
- data/scruber.gemspec +1 -1
- data/spec/core/extensions/csv_output_spec.rb +44 -0
- data/spec/core/extensions/dict.csv +4 -0
- data/spec/core/extensions/log_spec.rb +25 -0
- data/spec/core/extensions/loop_spec.rb +26 -0
- data/spec/core/extensions/parser_aliases_spec.rb +89 -0
- data/spec/core/extensions/queue_aliases_spec.rb +72 -0
- data/spec/core/extensions/seed_spec.rb +44 -0
- data/spec/fetcher.rb +27 -0
- data/spec/helpers/dictionary_reader/dict.csv +4 -0
- data/spec/helpers/dictionary_reader/dict.xml +5 -0
- data/spec/helpers/dictionary_reader/dict_records.xml +5 -0
- data/spec/helpers/dictionary_reader/dictionary_reader_csv_spec.rb +36 -0
- data/spec/helpers/dictionary_reader/dictionary_reader_xml_spec.rb +46 -0
- data/spec/helpers/fetcher_agent_adapters/abstract_adapter_spec.rb +46 -0
- data/spec/helpers/fetcher_agent_adapters/memory_spec.rb +45 -0
- data/spec/helpers/proxy_rotator/proxy_rotator_proxy_spec.rb +21 -0
- data/spec/helpers/proxy_rotator/proxy_rotator_spec.rb +118 -0
- data/spec/helpers/user_agent_rotator/user_agent_rotator_spec.rb +145 -0
- data/spec/helpers/user_agent_rotator/user_agent_rotator_user_agent_spec.rb +40 -0
- data/spec/helpers/user_agent_rotator/user_agents.xml +6 -0
- data/spec/queue_adapter/memory_spec.rb +15 -0
- data/spec/queue_spec.rb +27 -0
- data/spec/scruber_spec.rb +198 -0
- data/spec/spec_helper.rb +36 -0
- data/spec/support/queue/queue_adapter.rb +171 -0
- metadata +26 -1
data/spec/scruber_spec.rb
ADDED
@@ -0,0 +1,198 @@
|
|
1
|
+
require "spec_helper"
|
2
|
+
|
3
|
+
RSpec.describe Scruber do
|
4
|
+
before do
|
5
|
+
Scruber::Helpers::UserAgentRotator.configure do
|
6
|
+
clean
|
7
|
+
set_filter :all
|
8
|
+
add "Scruber 1.0", tags: [:robot, :scruber]
|
9
|
+
add "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36", tags: [:desktop, :chrome, :macos]
|
10
|
+
end
|
11
|
+
end
|
12
|
+
|
13
|
+
it "has a version number" do
|
14
|
+
expect(Scruber::VERSION).not_to be nil
|
15
|
+
end
|
16
|
+
|
17
|
+
describe "configurable" do
|
18
|
+
before do
|
19
|
+
Scruber.configure do |config|
|
20
|
+
config.fetcher_adapter = :typhoeus_fetcher
|
21
|
+
end
|
22
|
+
end
|
23
|
+
|
24
|
+
it "returns :typhoeus_fetcher as fetcher" do
|
25
|
+
expect(Scruber.configuration.fetcher_adapter).to eq(:typhoeus_fetcher)
|
26
|
+
end
|
27
|
+
end
|
28
|
+
|
29
|
+
describe "#run" do
|
30
|
+
context "without args" do
|
31
|
+
it "should raise error" do
|
32
|
+
expect { Scruber.run { $title = scraper_name } }.to raise_error(Scruber::ArgumentError)
|
33
|
+
end
|
34
|
+
|
35
|
+
it "should set scraper name from ENV" do
|
36
|
+
ENV['SCRUBER_SCRAPER_NAME'] = 'sample'
|
37
|
+
Scruber.run do
|
38
|
+
$scraper_name = scraper_name
|
39
|
+
end
|
40
|
+
expect($scraper_name).to eq(:sample)
|
41
|
+
end
|
42
|
+
end
|
43
|
+
|
44
|
+
context "with args" do
|
45
|
+
it "should set scraper name from first arg" do
|
46
|
+
Scruber.run :sample1 do
|
47
|
+
$scraper_name = scraper_name
|
48
|
+
end
|
49
|
+
expect($scraper_name).to eq(:sample1)
|
50
|
+
end
|
51
|
+
|
52
|
+
it "should set scraper name from first arg, and options from second" do
|
53
|
+
Scruber.run :sample2, queue_adapter: :test do
|
54
|
+
$scraper_name = scraper_name
|
55
|
+
$opt = Scruber.configuration.queue_adapter
|
56
|
+
end
|
57
|
+
expect($scraper_name).to eq(:sample2)
|
58
|
+
expect($opt).to eq(:test)
|
59
|
+
end
|
60
|
+
|
61
|
+
it "options from first arg and scraper_name from ENV" do
|
62
|
+
ENV['SCRUBER_SCRAPER_NAME'] = 'sample'
|
63
|
+
Scruber.run queue_adapter: :test2 do
|
64
|
+
$scraper_name = scraper_name
|
65
|
+
$opt = Scruber.configuration.queue_adapter
|
66
|
+
end
|
67
|
+
expect($scraper_name).to eq(:sample)
|
68
|
+
expect($opt).to eq(:test2)
|
69
|
+
end
|
70
|
+
|
71
|
+
it "should raise error if passed only options without ENV" do
|
72
|
+
ENV['SCRUBER_SCRAPER_NAME'] = nil
|
73
|
+
expect { Scruber.run(queue_adapter: :test2) { $title = scraper_name } }.to raise_error(Scruber::ArgumentError)
|
74
|
+
end
|
75
|
+
end
|
76
|
+
|
77
|
+
it "simple example" do
|
78
|
+
stub_request(:get, "http://example.com").to_return(body: 'Example Domain')
|
79
|
+
|
80
|
+
Scruber.run :sample do
|
81
|
+
queue.add "http://example.com"
|
82
|
+
|
83
|
+
parser :seed do |page|
|
84
|
+
$title = page.response_body
|
85
|
+
end
|
86
|
+
end
|
87
|
+
expect($title).to eq('Example Domain')
|
88
|
+
end
|
89
|
+
|
90
|
+
it "should return Nokogiri object" do
|
91
|
+
stub_request(:get, "http://example.com/contacts.html").to_return(body: '<div><a>Contacts</a></div>')
|
92
|
+
|
93
|
+
Scruber.run :sample do
|
94
|
+
queue.add "http://example.com/contacts.html"
|
95
|
+
|
96
|
+
parser :seed, format: :html do |page, html|
|
97
|
+
$title = html.at('a').text
|
98
|
+
end
|
99
|
+
end
|
100
|
+
expect($title).to eq('Contacts')
|
101
|
+
end
|
102
|
+
|
103
|
+
context "complex example" do
|
104
|
+
it "should parse pages in 2 steps" do
|
105
|
+
stub_request(:get, "http://example.com/catalog").to_return(body: '<div><a href="/product1">Product 1</a><a href="/product2">Product 2</a><a href="/product3">Product 3</a></div>')
|
106
|
+
stub_request(:get, "http://example.com/product1").to_return(body: '<div><h1>Product 1</h1></div>')
|
107
|
+
stub_request(:get, "http://example.com/product2").to_return(body: '<div><h1>Product 2</h1></div>')
|
108
|
+
stub_request(:get, "http://example.com/product3").to_return(body: '<div><h1>Product 3</h1></div>')
|
109
|
+
|
110
|
+
$products = []
|
111
|
+
Scruber.run :sample do
|
112
|
+
get "http://example.com/catalog"
|
113
|
+
|
114
|
+
parse :html do |page, doc|
|
115
|
+
doc.search('a').each do |a|
|
116
|
+
get_product URI.join(page.url, a.attr('href')).to_s
|
117
|
+
end
|
118
|
+
end
|
119
|
+
|
120
|
+
parse_product :html do |page,doc|
|
121
|
+
$products.push doc.at('h1').text
|
122
|
+
end
|
123
|
+
end
|
124
|
+
expect($products.sort).to eq((1..3).map{|i| "Product #{i}"}.sort)
|
125
|
+
end
|
126
|
+
|
127
|
+
it "should redownload page and increase retry" do
|
128
|
+
stub_request(:get, "http://example.com/").to_return(body: '<div>blocked</div>').times(2).then.to_return(body: '<div><h1>Product</h1></div>')
|
129
|
+
|
130
|
+
Scruber.run :sample do
|
131
|
+
get "http://example.com/"
|
132
|
+
|
133
|
+
parse :html do |page, doc|
|
134
|
+
if page.response_body =~ /blocked/
|
135
|
+
page.redownload!
|
136
|
+
else
|
137
|
+
$title = doc.at('h1').text
|
138
|
+
$retry_count = page.retry_count
|
139
|
+
end
|
140
|
+
end
|
141
|
+
end
|
142
|
+
expect($title).to eq('Product')
|
143
|
+
expect($retry_count).to eq(2)
|
144
|
+
end
|
145
|
+
end
|
146
|
+
|
147
|
+
context "processing error examples" do
|
148
|
+
it "should process 500 error page" do
|
149
|
+
stub_request(:get, "http://example.com").to_return(body: '<div><h1>500</h1></div>', status: 500)
|
150
|
+
|
151
|
+
$error_title = nil
|
152
|
+
Scruber.run :sample do
|
153
|
+
get "http://example.com", max_retry_times: 1
|
154
|
+
|
155
|
+
parse :html do |page,doc|
|
156
|
+
$error_title = doc.at('h1').text
|
157
|
+
end
|
158
|
+
|
159
|
+
on_page_error do |page|
|
160
|
+
$error_title = page.response_body
|
161
|
+
page.processed!
|
162
|
+
end
|
163
|
+
end
|
164
|
+
expect($error_title).to eq('<div><h1>500</h1></div>')
|
165
|
+
end
|
166
|
+
|
167
|
+
it "should process 404 error page" do
|
168
|
+
stub_request(:get, "http://example.com").to_return(body: '<div><h1>404</h1></div>', status: 404)
|
169
|
+
|
170
|
+
$error_title = nil
|
171
|
+
Scruber.run :sample do
|
172
|
+
get "http://example.com", max_retry_times: 1
|
173
|
+
|
174
|
+
parse :html do |page,doc|
|
175
|
+
$error_title = doc.at('h1').text
|
176
|
+
end
|
177
|
+
|
178
|
+
on_page_error do |page|
|
179
|
+
$error_title = page.response_body
|
180
|
+
page.processed!
|
181
|
+
end
|
182
|
+
end
|
183
|
+
expect($error_title).to eq('<div><h1>404</h1></div>')
|
184
|
+
end
|
185
|
+
end
|
186
|
+
end
|
187
|
+
|
188
|
+
describe "#root" do
|
189
|
+
it "should return nil without APP_PATH defined" do
|
190
|
+
expect(Scruber.root).to eq(nil)
|
191
|
+
end
|
192
|
+
|
193
|
+
it "should return path object" do
|
194
|
+
APP_PATH='/tmp/a/b/'
|
195
|
+
expect(Scruber.root.to_s).to eq('/tmp')
|
196
|
+
end
|
197
|
+
end
|
198
|
+
end
|
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,36 @@
|
|
1
|
+
require "bundler/setup"
|
2
|
+
require "scruber"
|
3
|
+
require 'webmock/rspec'
|
4
|
+
|
5
|
+
Encoding.default_external = Encoding::UTF_8
|
6
|
+
Encoding.default_internal = Encoding::UTF_8
|
7
|
+
|
8
|
+
Dir[File.expand_path(File.dirname(__FILE__))+"/support/**/*.rb"].each { |f| require f }
|
9
|
+
|
10
|
+
Scruber::Helpers::UserAgentRotator.configure do
|
11
|
+
set_filter :all
|
12
|
+
add "Scruber 1.0", tags: [:robot, :scruber]
|
13
|
+
add "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36", tags: [:desktop, :chrome, :macos]
|
14
|
+
end
|
15
|
+
|
16
|
+
RSpec.configure do |config|
|
17
|
+
# Enable flags like --only-failures and --next-failure
|
18
|
+
config.example_status_persistence_file_path = ".rspec_status"
|
19
|
+
|
20
|
+
# Disable RSpec exposing methods globally on `Module` and `main`
|
21
|
+
config.disable_monkey_patching!
|
22
|
+
|
23
|
+
config.expect_with :rspec do |c|
|
24
|
+
c.syntax = :expect
|
25
|
+
end
|
26
|
+
|
27
|
+
# Use color in STDOUT
|
28
|
+
config.color = true
|
29
|
+
|
30
|
+
# Use color not only in STDOUT but also in pagers and files
|
31
|
+
config.tty = true
|
32
|
+
|
33
|
+
# Use the specified formatter
|
34
|
+
config.formatter = :progress # :documentation, :html, :textmate
|
35
|
+
|
36
|
+
end
|
data/spec/support/queue/queue_adapter.rb
ADDED
@@ -0,0 +1,171 @@
|
|
1
|
+
require "spec_helper"
|
2
|
+
|
3
|
+
RSpec.shared_examples "queue_adapter" do
|
4
|
+
|
5
|
+
it "should update page" do
|
6
|
+
queue.add "http://example.com"
|
7
|
+
page = queue.fetch_pending
|
8
|
+
page.url = "http://example.net"
|
9
|
+
page.save
|
10
|
+
page = queue.fetch_pending
|
11
|
+
expect(page.url).to eq("http://example.net")
|
12
|
+
end
|
13
|
+
|
14
|
+
it "should update page and fetch downloaded page" do
|
15
|
+
queue.add "http://example.com"
|
16
|
+
page = queue.fetch_pending
|
17
|
+
page.fetched_at = Time.now.to_i
|
18
|
+
page.save
|
19
|
+
pending_page = queue.fetch_pending
|
20
|
+
downloaded_page = queue.fetch_downloaded
|
21
|
+
expect(pending_page).to eq(nil)
|
22
|
+
expect(downloaded_page.url).to eq("http://example.com")
|
23
|
+
end
|
24
|
+
|
25
|
+
describe "processing errors page" do
|
26
|
+
it "should fetch error page" do
|
27
|
+
queue.add "http://example.com"
|
28
|
+
page = queue.fetch_pending
|
29
|
+
page.retry_count = 5
|
30
|
+
page.max_retry_times = 5
|
31
|
+
page.save
|
32
|
+
error_page = queue.fetch_error
|
33
|
+
expect(error_page).not_to eq(nil)
|
34
|
+
expect(error_page.id).to eq(page.id)
|
35
|
+
end
|
36
|
+
|
37
|
+
it "should return page to downloading" do
|
38
|
+
queue.add "http://example.com"
|
39
|
+
page = queue.fetch_pending
|
40
|
+
page.retry_count = 5
|
41
|
+
page.max_retry_times = 5
|
42
|
+
page.save
|
43
|
+
error_page = queue.fetch_error
|
44
|
+
error_page.redownload!(0)
|
45
|
+
pending_page = queue.fetch_pending
|
46
|
+
err_page = queue.fetch_error
|
47
|
+
d_page = queue.fetch_downloaded
|
48
|
+
expect(error_page.id).to eq(pending_page.id)
|
49
|
+
expect(err_page).to be_nil
|
50
|
+
expect(d_page).to be_nil
|
51
|
+
end
|
52
|
+
|
53
|
+
it "should delete page from queue" do
|
54
|
+
queue.add "http://example.com"
|
55
|
+
page = queue.fetch_pending
|
56
|
+
page.retry_count = 5
|
57
|
+
page.max_retry_times = 5
|
58
|
+
page.save
|
59
|
+
error_page = queue.fetch_error
|
60
|
+
error_page.delete
|
61
|
+
pending_page = queue.fetch_pending
|
62
|
+
err_page = queue.fetch_error
|
63
|
+
d_page = queue.fetch_downloaded
|
64
|
+
expect(pending_page).to be_nil
|
65
|
+
expect(err_page).to be_nil
|
66
|
+
expect(d_page).to be_nil
|
67
|
+
end
|
68
|
+
|
69
|
+
it "should process page" do
|
70
|
+
queue.add "http://example.com"
|
71
|
+
page = queue.fetch_pending
|
72
|
+
page.retry_count = 5
|
73
|
+
page.max_retry_times = 5
|
74
|
+
page.save
|
75
|
+
error_page = queue.fetch_error
|
76
|
+
error_page.processed!
|
77
|
+
pending_page = queue.fetch_pending
|
78
|
+
err_page = queue.fetch_error
|
79
|
+
d_page = queue.fetch_downloaded
|
80
|
+
expect(pending_page).to be_nil
|
81
|
+
expect(err_page).to be_nil
|
82
|
+
expect(d_page).to be_nil
|
83
|
+
queue.add "http://example.com"
|
84
|
+
pending_page = queue.fetch_pending
|
85
|
+
expect(pending_page).to be_nil
|
86
|
+
end
|
87
|
+
end
|
88
|
+
|
89
|
+
context "#add" do
|
90
|
+
it "queue page for downloading" do
|
91
|
+
queue.add "http://example.com"
|
92
|
+
expect(queue.size).to eq(1)
|
93
|
+
end
|
94
|
+
|
95
|
+
it "should not add the same page twice" do
|
96
|
+
queue.add "http://example.com"
|
97
|
+
expect(queue.size).to eq(1)
|
98
|
+
queue.add "http://example.com"
|
99
|
+
expect(queue.size).to eq(1)
|
100
|
+
end
|
101
|
+
|
102
|
+
it "should not add the same page twice even if page was processed" do
|
103
|
+
queue.add "http://example.com"
|
104
|
+
page = queue.fetch_pending
|
105
|
+
page.fetched_at = Time.now.to_i
|
106
|
+
page.save
|
107
|
+
downloaded_page = queue.fetch_downloaded
|
108
|
+
downloaded_page.processed!
|
109
|
+
queue.add "http://example.com"
|
110
|
+
page = queue.fetch_pending
|
111
|
+
expect(page).to eq(nil)
|
112
|
+
end
|
113
|
+
end
|
114
|
+
|
115
|
+
context "#save" do
|
116
|
+
it "should delete page" do
|
117
|
+
queue.add "http://example.abc"
|
118
|
+
page = queue.fetch_pending
|
119
|
+
page.fetched_at = Time.now.to_i
|
120
|
+
page.save
|
121
|
+
page.delete
|
122
|
+
page = queue.fetch_downloaded
|
123
|
+
|
124
|
+
expect(page).to eq(nil)
|
125
|
+
end
|
126
|
+
|
127
|
+
it "should save additional arguments" do
|
128
|
+
queue.add "http://example.abc", id: 'abc', test_id: '1'
|
129
|
+
page = queue.find 'abc'
|
130
|
+
|
131
|
+
expect(page.options[:test_id]).to eq('1')
|
132
|
+
end
|
133
|
+
|
134
|
+
it "should not override page" do
|
135
|
+
queue.add "http://example.abc", id: 'abc'
|
136
|
+
page = queue.find 'abc'
|
137
|
+
page.fetched_at = 1
|
138
|
+
page.save
|
139
|
+
page = queue.find 'abc'
|
140
|
+
expect(page.fetched_at).to eq(1)
|
141
|
+
queue.add "http://example.abc", id: 'abc'
|
142
|
+
page = queue.find 'abc'
|
143
|
+
expect(page.fetched_at).to eq(1)
|
144
|
+
end
|
145
|
+
end
|
146
|
+
|
147
|
+
context "#processed!" do
|
148
|
+
it "should update page and set processed_at" do
|
149
|
+
queue.add "http://example.com"
|
150
|
+
page = queue.fetch_pending
|
151
|
+
page.fetched_at = Time.now.to_i
|
152
|
+
page.save
|
153
|
+
downloaded_page = queue.fetch_downloaded
|
154
|
+
downloaded_page.processed!
|
155
|
+
downloaded_page2 = queue.fetch_downloaded
|
156
|
+
expect(downloaded_page2).to eq(nil)
|
157
|
+
expect(downloaded_page.processed_at).to be >= 0
|
158
|
+
end
|
159
|
+
end
|
160
|
+
|
161
|
+
describe "Page" do
|
162
|
+
let(:page_class){ described_class.const_get(:Page) }
|
163
|
+
|
164
|
+
it "should generate different ids for different urls" do
|
165
|
+
page1 = page_class.new queue, url: "http://example.com/product1"
|
166
|
+
page2 = page_class.new queue, url: "http://example.com/product2"
|
167
|
+
expect(page1.id).not_to be_blank
|
168
|
+
expect(page1.id).not_to eq(page2.id)
|
169
|
+
end
|
170
|
+
end
|
171
|
+
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: scruber
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.5
|
4
|
+
version: 0.1.6
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ivan Goncharov
|
@@ -269,6 +269,31 @@ files:
|
|
269
269
|
- lib/scruber/queue_adapters/memory.rb
|
270
270
|
- lib/scruber/version.rb
|
271
271
|
- scruber.gemspec
|
272
|
+
- spec/core/extensions/csv_output_spec.rb
|
273
|
+
- spec/core/extensions/dict.csv
|
274
|
+
- spec/core/extensions/log_spec.rb
|
275
|
+
- spec/core/extensions/loop_spec.rb
|
276
|
+
- spec/core/extensions/parser_aliases_spec.rb
|
277
|
+
- spec/core/extensions/queue_aliases_spec.rb
|
278
|
+
- spec/core/extensions/seed_spec.rb
|
279
|
+
- spec/fetcher.rb
|
280
|
+
- spec/helpers/dictionary_reader/dict.csv
|
281
|
+
- spec/helpers/dictionary_reader/dict.xml
|
282
|
+
- spec/helpers/dictionary_reader/dict_records.xml
|
283
|
+
- spec/helpers/dictionary_reader/dictionary_reader_csv_spec.rb
|
284
|
+
- spec/helpers/dictionary_reader/dictionary_reader_xml_spec.rb
|
285
|
+
- spec/helpers/fetcher_agent_adapters/abstract_adapter_spec.rb
|
286
|
+
- spec/helpers/fetcher_agent_adapters/memory_spec.rb
|
287
|
+
- spec/helpers/proxy_rotator/proxy_rotator_proxy_spec.rb
|
288
|
+
- spec/helpers/proxy_rotator/proxy_rotator_spec.rb
|
289
|
+
- spec/helpers/user_agent_rotator/user_agent_rotator_spec.rb
|
290
|
+
- spec/helpers/user_agent_rotator/user_agent_rotator_user_agent_spec.rb
|
291
|
+
- spec/helpers/user_agent_rotator/user_agents.xml
|
292
|
+
- spec/queue_adapter/memory_spec.rb
|
293
|
+
- spec/queue_spec.rb
|
294
|
+
- spec/scruber_spec.rb
|
295
|
+
- spec/spec_helper.rb
|
296
|
+
- spec/support/queue/queue_adapter.rb
|
272
297
|
homepage: https://github.com/scruber/scruber
|
273
298
|
licenses:
|
274
299
|
- MIT
|