scraper_clients 9.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/README.md +26 -0
- data/bin/pry +17 -0
- data/bin/rspec +17 -0
- data/data/user_agents.txt +204 -0
- data/lib/clients.rb +24 -0
- data/lib/clients/errors.rb +16 -0
- data/lib/clients/ftp_client.rb +17 -0
- data/lib/clients/http_client.rb +152 -0
- data/lib/clients/http_client/response.rb +57 -0
- data/lib/clients/proxy6_client.rb +70 -0
- data/lib/clients/proxy_client.rb +14 -0
- data/lib/clients/proxy_list_client.rb +38 -0
- data/lib/clients/recaptcha/client.rb +48 -0
- data/lib/clients/recaptcha/response.rb +15 -0
- data/lib/clients/recaptcha/solver.rb +115 -0
- data/lib/clients/tor_client.rb +146 -0
- data/lib/clients/url_decoder.rb +8 -0
- data/lib/clients/version.rb +3 -0
- data/spec/lib/clients/http_client/response_spec.rb +197 -0
- data/spec/lib/clients/http_client_spec.rb +221 -0
- data/spec/lib/clients/tor_client_spec.rb +34 -0
- data/spec/spec_helper.rb +66 -0
- metadata +168 -0
@@ -0,0 +1,197 @@
|
|
1
|
+
require "spec_helper"
|
2
|
+
|
3
|
+
module Clients
|
4
|
+
class HttpClient
|
5
|
+
RSpec.describe Response do
|
6
|
+
# Default fixtures: status 200, no headers, literal body "BODY".
# Nested groups override individual pieces with their own `let`.
let(:status) { 200 }
let(:headers) { {} }
let(:body) { "BODY" }

# Raw HTTP::Response assembled from the fixtures above.
let(:response) do
  HTTP::Response.new(status: status, version: "1.1", headers: headers, body: body)
end

# Object under test: the described class wrapped around the raw response.
subject { described_class.new(response) }
|
18
|
+
|
19
|
+
# Checks the #success? predicate for a 200 status (expected truthy)
# and a 502 status (expected falsy).
describe "#success?" do
  context "when response has succeeded" do
    let(:status) { 200 }

    it("returns true") { expect(subject).to be_success }
  end

  context "when response has failed" do
    let(:status) { 502 }

    it("returns false") { expect(subject).not_to be_success }
  end
end
|
34
|
+
|
35
|
+
# Checks the #fail? predicate for a 400 status (expected truthy)
# and a 201 status (expected falsy).
describe "#fail?" do
  context "when response has failed" do
    let(:status) { 400 }

    it("returns true") { expect(subject).to be_fail }
  end

  context "when response has succeeded" do
    let(:status) { 201 }

    it("returns false") { expect(subject).not_to be_fail }
  end
end
|
50
|
+
|
51
|
+
# Covers #to_s with and without the force_utf8 option.
#
# Bodies that need a specific encoding label are built from a `dup` of the
# literal, so the literal itself is never mutated (string literals are frozen
# under `# frozen_string_literal: true`).
describe "#to_s" do
  it "returns response body" do
    expect(subject.to_s).to eq("BODY")
  end

  context "when force_utf8 hasn't been provided" do
    let(:body) { "\x89PNG\r\n\x1A\n\x00\x00\x00" }

    it "sets force_utf8 to FALSE by default" do
      expect(subject.to_s).to eq(body)
    end
  end

  context "when force_utf8 is FALSE" do
    shared_examples "unmodified body" do
      it "returns unmodified response body" do
        expect(subject.to_s(force_utf8: false)).to eq(body)
      end
    end

    context "when response doesn't have valid charset" do
      # UTF-8 bytes deliberately mislabelled as CP1251.
      let(:body) { "Correct Ответ".dup.force_encoding(Encoding::CP1251) }

      include_examples "unmodified body"
    end

    context "when response is binary" do
      let(:body) { "\x89PNG\r\n\x1A\n\x00\x00\x00" }

      include_examples "unmodified body"
    end
  end

  context "when force_utf8 is TRUE" do
    context "when response doesn't have valid charset" do
      # UTF-8 bytes deliberately mislabelled as CP1251.
      let(:body) { "Correct Ответ".dup.force_encoding(Encoding::CP1251) }

      it "returns response body in UTF-8 encoding" do
        # Named `text` so the local does not shadow the `response` helper.
        text = subject.to_s(force_utf8: true)

        expect(text.encoding).to eq(Encoding::UTF_8)
        expect(text).to eq("Correct Ответ")
      end
    end

    context "when response has valid charset - windows-1251" do
      let(:headers) { { "Content-Type" => "text/html; charset=windows-1251" } }
      # Genuinely transcoded to CP1251, matching the declared charset.
      let(:body) { "Correct Ответ".encode(Encoding::CP1251) }

      it "returns response body in UTF-8 encoding" do
        text = subject.to_s(force_utf8: true)

        expect(text.encoding).to eq(Encoding::UTF_8)
        expect(text).to eq("Correct Ответ")
      end
    end
  end
end
|
110
|
+
|
111
|
+
# Covers #to_html: the body is expected to come back as a
# Nokogiri::HTML::Document, optionally after forcing UTF-8.
describe "#to_html" do
  it "returns parsed response body" do
    html = subject.to_html

    expect(html).to be_an_instance_of(Nokogiri::HTML::Document)
    expect(html.to_s).to include("<body><p>BODY</p></body>")
  end

  context "when force_utf8 is TRUE" do
    # UTF-8 bytes deliberately mislabelled as CP1251; `dup` keeps the
    # literal itself unmodified.
    let(:body) { "Correct Ответ".dup.force_encoding(Encoding::CP1251) }

    it "returns parsed response body in valid UTF-8 encoding" do
      html = subject.to_html(force_utf8: true)

      expect(html).to be_an_instance_of(Nokogiri::HTML::Document)
      expect(html.to_s).to include("<body><p>Correct Ответ</p></body>")
    end
  end
end
|
128
|
+
|
129
|
+
# Covers #to_xml: the body is expected to come back as a
# Nokogiri::XML::Document with the CDATA section preserved.
describe "#to_xml" do
  let(:body) { "<node><![CDATA[ Brazil ]]></node>" }

  it "returns parsed response body" do
    xml = subject.to_xml

    expect(xml).to be_an_instance_of(Nokogiri::XML::Document)
    expect(xml.to_s).to include("<node><![CDATA[ Brazil ]]></node>")
  end

  # NOTE(review): this group is skipped and the reason is not recorded here —
  # confirm whether force_utf8 is expected to work for XML.
  context "when force_utf8 is TRUE", skip: true do
    # UTF-8 bytes deliberately mislabelled as CP1251; `dup` keeps the
    # literal itself unmodified.
    let(:body) { "<node><![CDATA[ Бразилия ]]></node>".dup.force_encoding(Encoding::CP1251) }

    it "returns parsed response body in valid UTF-8 encoding" do
      xml = subject.to_xml(force_utf8: true)

      expect(xml).to be_an_instance_of(Nokogiri::XML::Document)
      expect(xml.to_s).to include("<node><![CDATA[ Бразилия ]]></node>")
    end
  end
end
|
148
|
+
|
149
|
+
# Covers #to_json: a JSON array body is expected to come back as an array of
# hashes with symbol keys, with or without force_utf8.
describe "#to_json" do
  let(:parsed_body) do
    [{ brand: "Фирма ZANUSSI", product_code: "91460370200" }]
  end
  let(:body) { "[{\"brand\":\"Фирма ZANUSSI\",\"product_code\":\"91460370200\"}]" }

  it "returns parsed json body" do
    expect(subject.to_json).to eq(parsed_body)
  end

  context "when force_utf8 is TRUE" do
    # Same JSON bytes as the outer body, relabelled as CP1251.
    let(:body) { super().force_encoding(Encoding::CP1251) }

    it "returns parsed json body" do
      expect(subject.to_json(force_utf8: true)).to eq(parsed_body)
    end
  end
end
|
165
|
+
|
166
|
+
# Covers #to_io: the body is expected to be exposed as a readable StringIO.
describe "#to_io" do
  let(:body) { "IO BODY" }

  it "returns response as StringIO" do
    stream = subject.to_io

    expect(stream).to be_an_instance_of(StringIO)
    expect(stream.read).to eq("IO BODY")
  end
end
|
175
|
+
|
176
|
+
|
177
|
+
# Covers #stream. Here the response comes from a stubbed GET made through
# Clients::HttpClient rather than from a hand-built HTTP::Response.
describe "#stream" do
  let(:url) { "http://example.com" }
  let(:response) { Clients::HttpClient.new.get(url) }

  before { stub_request(:get, url).and_return(body: body) }

  it "streams response body" do
    # With an argument of 1 the body is expected one character per yield.
    expect { |probe| subject.stream(1, &probe) }
      .to yield_successive_args(*%w[B O D Y])
  end

  context "buffer size is not specified" do
    it "streams response body" do
      expect { |probe| subject.stream(&probe) }
        .to yield_successive_args("BODY")
    end
  end
end
|
195
|
+
end
|
196
|
+
end
|
197
|
+
end
|
@@ -0,0 +1,221 @@
|
|
1
|
+
require "spec_helper"
|
2
|
+
|
3
|
+
module Clients
|
4
|
+
RSpec.describe HttpClient do
|
5
|
+
# Default client under test, built with no arguments; groups that need a
# proxy define their own subject.
subject { described_class.new }
|
6
|
+
|
7
|
+
# Covers #get against WebMock stubs: the request itself, redirect handling
# for each follow_redirects setting, block-based request customisation and
# the wrapped response.
describe "#get" do
  let(:url) { "http://ya.ru/index.html" }
  let(:response) { subject.get(url) }

  before do
    stubbed_headers = { content_type: "image/png; charset=UTF-8" }

    stub_request(:get, url)
      .and_return(status: 202, body: "RESPONSE", headers: stubbed_headers)
  end

  it "makes a request to given url" do
    response # evaluating the lazy helper performs the request

    expect(WebMock).to have_requested(:get, url)
  end

  # Shared by the contexts in which the 301 is expected to be followed.
  shared_examples "follow_redirects" do
    let(:url) { "http://ya.ru/redirect" }
    let(:redirect_to_url) { "http://ya.ru/final" }

    before do
      stub_request(:get, url)
        .and_return(status: 301, headers: { location: redirect_to_url })
      stub_request(:get, redirect_to_url)
        .and_return(status: 200, body: "RESPONSE")
    end

    it "follows redirects" do
      expect(response.to_s).to eq("RESPONSE")

      [url, redirect_to_url].each do |visited|
        expect(WebMock).to have_requested(:get, visited).once
      end
    end
  end

  context "when follow_redirects is true" do
    let(:response) { subject.get(url, follow_redirects: true) }

    include_examples "follow_redirects"
  end

  context "when follow_redirects is nil" do
    # The option is simply omitted here.
    let(:response) { subject.get(url) }

    include_examples "follow_redirects"
  end

  context "when follow_redirects is false" do
    let(:url) { "http://ya.ru/redirect" }
    let(:redirect_to_url) { "http://ya.ru/final" }
    let(:response) { subject.get(url, follow_redirects: false) }

    before do
      stub_request(:get, url)
        .and_return(status: 301, headers: { location: redirect_to_url })
    end

    it "DOESN'T follow redirects" do
      expect(response.status).to eq(301)
      expect(response.headers["Location"]).to eq(redirect_to_url)

      expect(WebMock).to have_requested(:get, url).once
      expect(WebMock).not_to have_requested(:get, redirect_to_url)
    end
  end

  it "modifies request from the block" do
    subject.get(url) { |request| request.headers(cookie: "was_here=1;") }

    expected_headers = { "Cookie" => "was_here=1;" }
    expect(WebMock).to have_requested(:get, url).with(headers: expected_headers).once
  end

  it "returns wrapped response" do
    expect(response).to be_an_instance_of(HttpClient::Response)
    expect(response.to_s).to eq("RESPONSE")
    expect(response.code).to eq(202)
    expect(response.mime_type).to eq("image/png")
  end
end
|
93
|
+
|
94
|
+
# Covers #head against a WebMock stub that answers with status 202.
describe "#head" do
  let(:url) { "http://ya.ru/index.html" }
  let(:response) { subject.head(url) }

  before { stub_request(:head, url).and_return(status: 202) }

  it "makes a request to given url" do
    response # evaluating the lazy helper performs the request

    expect(WebMock).to have_requested(:head, url)
  end

  it "returns wrapped response" do
    expect(response).to be_an_instance_of(HttpClient::Response)
    expect(response.code).to eq(202)
    expect(response.uri.to_s).to eq(url)
  end
end
|
113
|
+
|
114
|
+
|
115
|
+
# Covers #proxy?: true for a client built with a proxy, false for the default
# client.
describe "#proxy?" do
  context "when proxy has been used" do
    subject { described_class.new(proxy: proxy) }

    let(:proxy) { instance_spy("Clients::TorProxy") }

    it "returns true" do
      answer = subject.proxy?

      expect(answer).to eq(true)
    end
  end

  context "when proxy has NOT been used" do
    it "returns false" do
      answer = subject.proxy?

      expect(answer).to eq(false)
    end
  end
end
|
131
|
+
|
132
|
+
# Covers #has_cookies?: true once a cookie has been added to the client's
# jar, false for a fresh client.
describe "#has_cookies?" do
  context "when client has cookies" do
    let(:cookie) { HTTP::Cookie.new("group", "admin", domain: "example.com", path: "/") }

    it "returns true" do
      subject.cookies << cookie

      expect(subject).to have_cookies
    end
  end

  # Previously labelled "when proxy has NOT been used" (copied from the
  # #proxy? group); this context is about an empty cookie jar.
  context "when client has NO cookies" do
    it "returns false" do
      expect(subject).not_to have_cookies
    end
  end
end
|
148
|
+
|
149
|
+
# Covers #store_cookies: cookies from a separate jar are merged into the
# client's own jar and sent with the next request.
describe "#store_cookies" do
  let(:old_cookie) { HTTP::Cookie.new("group", "admin", domain: "example.com", path: "/") }
  let(:new_cookie) { HTTP::Cookie.new("uid", "u12345", domain: "ya.ru", path: "/admin") }
  let(:cookies) { HTTP::CookieJar.new }

  before do
    subject.cookies << old_cookie # already held by the client
    cookies << new_cookie         # arrives via #store_cookies
  end

  it "adds given cookies from the response" do
    subject.store_cookies cookies

    expect(subject.cookies.to_a).to contain_exactly(old_cookie, new_cookie)
  end

  # NOTE(review): both cookies are expected on a request to an unrelated host,
  # so the client apparently sends stored cookies without domain/path
  # matching — confirm this is intended.
  it "sends new and old cookies with the new request" do
    url = "https://placeholder.com"
    stub_request(:get, url).and_return(status: 200)

    subject.store_cookies cookies
    subject.get(url)

    expect(WebMock).to have_requested(:get, url)
      .with(headers: { "Cookie" => "group=admin; uid=u12345" })
      .once
  end
end
|
176
|
+
|
177
|
+
# Covers #reset_cookies: the client's jar is expected to be empty afterwards.
describe "#reset_cookies" do
  let(:cookie) do
    HTTP::Cookie.new("group", "admin", domain: "example.com", path: "/")
  end

  it "reset client cookies" do
    subject.cookies << cookie

    subject.reset_cookies

    expect(subject.cookies).to be_empty
  end
end
|
186
|
+
|
187
|
+
# Covers #reset_user_agent: after a reset the client is expected to report
# whatever #sample_user_agent returns.
describe "#reset_user_agent" do
  it "reset client user agent" do
    # Read once up front, then assert on a second read.
    subject.user_agent
    expect(subject.user_agent).not_to be_empty

    # The sampled value is not deterministic, so pin it for the assertion.
    allow(subject).to receive(:sample_user_agent).and_return("UA")
    subject.reset_user_agent

    expect(subject.user_agent).to eq("UA")
  end
end
|
200
|
+
|
201
|
+
# Covers #reset_proxy: delegates to the proxy's #reset! when a proxy was
# given, and must not raise when there is none.
describe "#reset_proxy" do
  context "when proxy has been used" do
    subject { described_class.new(proxy: proxy) }

    let(:proxy) { instance_spy("Clients::TorProxy") }

    it "calls reset on proxy" do
      subject.reset_proxy

      expect(proxy).to have_received(:reset!).once
    end
  end

  context "when proxy has NOT been used" do
    subject { described_class.new }

    it "does nothing" do
      expect { subject.reset_proxy }.not_to raise_error
    end
  end
end
|
220
|
+
end
|
221
|
+
end
|
@@ -0,0 +1,34 @@
|
|
1
|
+
require "spec_helper"
|
2
|
+
|
3
|
+
module Clients
  # Spec for TorClient#switch_identity. Net::Telnet.new is stubbed to return a
  # plain double and #sleep is stubbed on the subject, so the example neither
  # uses a real Telnet object nor pauses.
  RSpec.describe TorClient do
    let(:ok_prompt) { "250 OK\n" }
    let(:new_route_signal) { "SIGNAL NEWNYM" }
    let(:localhost) { double("telnet") }

    subject { described_class.new }

    describe "#switch_identity" do
      before do
        allow(Net::Telnet).to receive(:new).and_return localhost
        allow(localhost).to receive("cmd").and_return(ok_prompt)
        allow(localhost).to receive("close")

        allow(subject).to receive(:sleep).and_return(0)
      end

      # NOTE(review): skipped without a recorded reason — confirm whether the
      # expected counts below still match the throttling rules.
      it "throttles tor switch route command by 10 seconds", skip: true do
        time = Time.now

        # Six attempts at fixed offsets (in seconds) from the same start time.
        [0, 2, 3, 5, 11, 15].each do |offset|
          Timecop.freeze(time + offset) { subject.switch_identity }
        end

        expect(subject).to have_received(:sleep).exactly(4).times
        expect(localhost).to have_received("cmd").with(new_route_signal).exactly(4).times
      end
    end
  end
end
|