scraper_clients 9.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/README.md +26 -0
- data/bin/pry +17 -0
- data/bin/rspec +17 -0
- data/data/user_agents.txt +204 -0
- data/lib/clients.rb +24 -0
- data/lib/clients/errors.rb +16 -0
- data/lib/clients/ftp_client.rb +17 -0
- data/lib/clients/http_client.rb +152 -0
- data/lib/clients/http_client/response.rb +57 -0
- data/lib/clients/proxy6_client.rb +70 -0
- data/lib/clients/proxy_client.rb +14 -0
- data/lib/clients/proxy_list_client.rb +38 -0
- data/lib/clients/recaptcha/client.rb +48 -0
- data/lib/clients/recaptcha/response.rb +15 -0
- data/lib/clients/recaptcha/solver.rb +115 -0
- data/lib/clients/tor_client.rb +146 -0
- data/lib/clients/url_decoder.rb +8 -0
- data/lib/clients/version.rb +3 -0
- data/spec/lib/clients/http_client/response_spec.rb +197 -0
- data/spec/lib/clients/http_client_spec.rb +221 -0
- data/spec/lib/clients/tor_client_spec.rb +34 -0
- data/spec/spec_helper.rb +66 -0
- metadata +168 -0
@@ -0,0 +1,197 @@
|
|
1
|
+
require "spec_helper"
|
2
|
+
|
3
|
+
module Clients
|
4
|
+
class HttpClient
|
5
|
+
RSpec.describe Response do
|
6
|
+
# Default ingredients of the raw HTTP::Response; nested groups override
# individual pieces (status, headers, body) where they need something else.
let(:status) { 200 }
let(:headers) { {} }
let(:body) { "BODY" }

# Raw response that gets handed to the wrapper under test.
let(:response) do
  HTTP::Response.new(status: status, version: "1.1", headers: headers, body: body)
end

# Wrapper instance built around the raw response.
subject { described_class.new(response) }
|
18
|
+
|
19
|
+
# `be_success` is RSpec's predicate matcher for `success?`:
# status 200 must satisfy it, status 502 must not.
describe "#success?" do
  context "when response has succeeded" do
    let(:status) { 200 }

    it "returns true" do
      expect(subject).to be_success
    end
  end

  context "when response has failed" do
    let(:status) { 502 }

    it "returns false" do
      expect(subject).not_to be_success
    end
  end
end
|
34
|
+
|
35
|
+
# `be_fail` is RSpec's predicate matcher for `fail?`:
# status 400 must satisfy it, status 201 must not.
describe "#fail?" do
  context "when response has failed" do
    let(:status) { 400 }

    it "returns true" do
      expect(subject).to be_fail
    end
  end

  context "when response has succeeded" do
    let(:status) { 201 }

    it "returns false" do
      expect(subject).not_to be_fail
    end
  end
end
|
50
|
+
|
51
|
+
# Covers #to_s with the force_utf8 option omitted, set to false and set to true.
describe "#to_s" do
  it "returns response body" do
    expect(subject.to_s).to eq("BODY")
  end

  context "when force_utf8 hasn't been provided" do
    # Starts with the PNG signature; byte 0x89 is not valid UTF-8.
    let(:body) { "\x89PNG\r\n\x1A\n\x00\x00\x00" }

    it "sets force_utf8 to FALSE by default" do
      expect(subject.to_s).to eq(body)
    end
  end

  context "when force_utf8 is FALSE" do
    shared_examples "unmodified body" do
      it "returns unmodified response body" do
        expect(subject.to_s(force_utf8: false)).to eq(body)
      end
    end

    context "when response doesn't have valid charset" do
      # The literal's bytes are only relabelled as CP1251, not transcoded.
      let(:body) { "Correct Ответ".force_encoding Encoding::CP1251 }

      include_examples "unmodified body"
    end

    context "when response is binary" do
      let(:body) { "\x89PNG\r\n\x1A\n\x00\x00\x00" }

      include_examples "unmodified body"
    end
  end

  context "when force_utf8 is TRUE" do
    context "when response doesn't have valid charset" do
      # No charset header is set here; the body is merely relabelled as CP1251.
      let(:body) { "Correct Ответ".force_encoding Encoding::CP1251 }

      it "returns response body in UTF-8 encoding" do
        # Named `text` so the String result does not shadow `let(:response)`.
        text = subject.to_s(force_utf8: true)

        expect(text.encoding).to eq(Encoding::UTF_8)
        expect(text).to eq("Correct Ответ")
      end
    end

    context "when response has valid charset - windows-1251" do
      let(:headers) { { "Content-Type" => "text/html; charset=windows-1251" } }
      # Here the body really is transcoded to CP1251, matching the header.
      let(:body) { "Correct Ответ".encode Encoding::CP1251 }

      it "returns response body in UTF-8 encoding" do
        text = subject.to_s(force_utf8: true)

        expect(text.encoding).to eq(Encoding::UTF_8)
        expect(text).to eq("Correct Ответ")
      end
    end
  end
end
|
110
|
+
|
111
|
+
# #to_html must return a Nokogiri HTML document whose markup wraps the body
# text in <body><p>…</p></body>, with and without force_utf8.
describe "#to_html" do
  it "returns parsed response body" do
    document = subject.to_html

    expect(document).to be_an_instance_of(Nokogiri::HTML::Document)
    expect(document.to_s).to include("<body><p>BODY</p></body>")
  end

  context "when force_utf8 is TRUE" do
    # The literal's bytes are only relabelled as CP1251, not transcoded.
    let(:body) { "Correct Ответ".force_encoding Encoding::CP1251 }

    it "returns parsed response body in valid UTF_8 encoding" do
      document = subject.to_html(force_utf8: true)

      expect(document).to be_an_instance_of(Nokogiri::HTML::Document)
      expect(document.to_s).to include("<body><p>Correct Ответ</p></body>")
    end
  end
end
|
128
|
+
|
129
|
+
# #to_xml must return a Nokogiri XML document that keeps the CDATA node intact.
describe "#to_xml" do
  let(:body) { "<node><![CDATA[ Brazil ]]></node>" }

  it "returns parsed response body" do
    document = subject.to_xml

    expect(document).to be_an_instance_of(Nokogiri::XML::Document)
    expect(document.to_s).to include("<node><![CDATA[ Brazil ]]></node>")
  end

  # NOTE(review): this group is skipped via metadata and the file gives no
  # reason — confirm whether force_utf8 is expected to work for XML.
  context "when force_utf8 is TRUE", skip: true do
    let(:body) { "<node><![CDATA[ Бразилия ]]></node>".force_encoding Encoding::CP1251 }

    it "returns parsed response body in valid UTF_8 encoding" do
      document = subject.to_xml(force_utf8: true)

      expect(document).to be_an_instance_of(Nokogiri::XML::Document)
      expect(document.to_s).to include("<node><![CDATA[ Бразилия ]]></node>")
    end
  end
end
|
148
|
+
|
149
|
+
# #to_json is expected to parse the body into an array of hashes with symbol keys.
describe "#to_json" do
  let(:parsed_body) { [{ brand: "Фирма ZANUSSI", product_code: "91460370200" }] }
  let(:body) { "[{\"brand\":\"Фирма ZANUSSI\",\"product_code\":\"91460370200\"}]" }

  it "returns parsed json body" do
    expect(subject.to_json).to eq(parsed_body)
  end

  context "when force_utf8 is TRUE" do
    # Same JSON text as the outer body, relabelled as CP1251 without transcoding.
    let(:body) { super().force_encoding(Encoding::CP1251) }

    it "returns parsed json body" do
      expect(subject.to_json(force_utf8: true)).to eq(parsed_body)
    end
  end
end
|
165
|
+
|
166
|
+
# #to_io must expose the body as a readable StringIO.
describe "#to_io" do
  let(:body) { "IO BODY" }

  it "returns response as StringIO" do
    readable = subject.to_io

    expect(readable).to be_an_instance_of(StringIO)
    expect(readable.read).to eq("IO BODY")
  end
end
|
175
|
+
|
176
|
+
|
177
|
+
# #stream must yield the body in chunks: one character at a time when called
# with 1, and the whole body in a single chunk when called without arguments.
describe "#stream" do
  let(:url) { "http://example.com" }
  # Overrides the hand-built response with one fetched through HttpClient
  # from the endpoint stubbed below.
  let(:response) { Clients::HttpClient.new.get(url) }

  before { stub_request(:get, url).and_return(body: body) }

  it "streams response body" do
    expect { |probe| subject.stream(1, &probe) }
      .to yield_successive_args("B", "O", "D", "Y")
  end

  context "buffer size is not specified" do
    it "streams response body" do
      expect { |probe| subject.stream(&probe) }.to yield_successive_args("BODY")
    end
  end
end
|
195
|
+
end
|
196
|
+
end
|
197
|
+
end
|
@@ -0,0 +1,221 @@
|
|
1
|
+
require "spec_helper"
|
2
|
+
|
3
|
+
module Clients
|
4
|
+
RSpec.describe HttpClient do
|
5
|
+
# Client built with no arguments; the "when proxy has been used" contexts
# further down override this subject with an instance that receives a proxy.
subject { described_class.new }
|
6
|
+
|
7
|
+
# Exercises #get against WebMock stubs: plain request, redirect handling,
# request customisation through a block, and the wrapped response.
describe "#get" do
  let(:url) { "http://ya.ru/index.html" }
  let(:response) { subject.get(url) }

  before do
    stub_request(:get, url).and_return(
      status: 202,
      body: "RESPONSE",
      headers: { content_type: "image/png; charset=UTF-8" }
    )
  end

  it "makes a request to given url" do
    response
    expect(WebMock).to have_requested(:get, url)
  end

  # Shared by the contexts in which the client is expected to follow a 301.
  shared_examples "follow_redirects" do
    let(:url) { "http://ya.ru/redirect" }
    let(:redirect_to_url) { "http://ya.ru/final" }

    before do
      stub_request(:get, url)
        .and_return(status: 301, headers: { location: redirect_to_url })
      stub_request(:get, redirect_to_url)
        .and_return(status: 200, body: "RESPONSE")
    end

    it "follows redirects" do
      expect(response.to_s).to eq("RESPONSE")
      expect(WebMock).to have_requested(:get, url).once
      expect(WebMock).to have_requested(:get, redirect_to_url).once
    end
  end

  context "when follow_redirects is true" do
    let(:response) { subject.get(url, follow_redirects: true) }

    include_examples "follow_redirects"
  end

  context "when follow_redirects is nil" do
    # The option is simply left out of the call.
    let(:response) { subject.get(url) }

    include_examples "follow_redirects"
  end

  context "when follow_redirects is false" do
    let(:url) { "http://ya.ru/redirect" }
    let(:redirect_to_url) { "http://ya.ru/final" }
    let(:response) { subject.get(url, follow_redirects: false) }

    before do
      stub_request(:get, url)
        .and_return(status: 301, headers: { location: redirect_to_url })
    end

    it "DOESN'T follow redirects" do
      expect(response.status).to eq(301)
      expect(response.headers["Location"]).to eq(redirect_to_url)
      expect(WebMock).to have_requested(:get, url).once
      expect(WebMock).not_to have_requested(:get, redirect_to_url)
    end
  end

  it "modifies request from the block" do
    subject.get(url) { |request| request.headers(cookie: "was_here=1;") }

    expect(WebMock).to have_requested(:get, url)
      .with(headers: { "Cookie" => "was_here=1;" })
      .once
  end

  it "returns wrapped response" do
    expect(response).to be_an_instance_of(HttpClient::Response)
    expect(response.to_s).to eq("RESPONSE")
    expect(response.code).to eq(202)
    expect(response.mime_type).to eq("image/png")
  end
end
|
93
|
+
|
94
|
+
# Exercises #head against a stubbed HEAD endpoint that answers with 202.
describe "#head" do
  let(:url) { "http://ya.ru/index.html" }
  let(:response) { subject.head(url) }

  before { stub_request(:head, url).and_return(status: 202) }

  it "makes a request to given url" do
    response
    expect(WebMock).to have_requested(:head, url)
  end

  it "returns wrapped response" do
    expect(response).to be_an_instance_of(HttpClient::Response)
    expect(response.code).to eq(202)
    expect(response.uri.to_s).to eq(url)
  end
end
|
113
|
+
|
114
|
+
|
115
|
+
# #proxy? must be true for a client constructed with a proxy and false otherwise.
describe "#proxy?" do
  context "when proxy has been used" do
    subject { described_class.new(proxy: proxy) }

    let(:proxy) { instance_spy("Clients::TorProxy") }

    it "returns true" do
      expect(subject.proxy?).to eq(true)
    end
  end

  context "when proxy has NOT been used" do
    it "returns false" do
      expect(subject.proxy?).to eq(false)
    end
  end
end
|
131
|
+
|
132
|
+
# `have_cookies` is RSpec's predicate matcher for `has_cookies?`: it must hold
# once a cookie has been added to the client and must not hold before that.
describe "#has_cookies?" do
  context "when client has cookies" do
    let(:cookie) do
      HTTP::Cookie.new("group", "admin", domain: "example.com", path: "/")
    end

    it "returns true" do
      subject.cookies << cookie
      expect(subject).to have_cookies
    end
  end

  # Previously labelled "when proxy has NOT been used" (copied from the
  # #proxy? group); the example is about a client without cookies.
  context "when client has NO cookies" do
    it "returns false" do
      expect(subject).not_to have_cookies
    end
  end
end
|
148
|
+
|
149
|
+
# #store_cookies must merge a foreign cookie jar into the client's own jar and
# the merged cookies must be sent with the next request.
describe "#store_cookies" do
  let(:old_cookie) do
    HTTP::Cookie.new("group", "admin", domain: "example.com", path: "/")
  end
  let(:new_cookie) do
    HTTP::Cookie.new("uid", "u12345", domain: "ya.ru", path: "/admin")
  end
  # Separate jar standing in for cookies that arrived with a response.
  let(:cookies) { HTTP::CookieJar.new }

  before do
    subject.cookies << old_cookie
    cookies << new_cookie
  end

  it "adds given cookies from the response" do
    subject.store_cookies(cookies)

    expect(subject.cookies.to_a).to contain_exactly(old_cookie, new_cookie)
  end

  it "sends new and old cookies with the new request" do
    url = "https://placeholder.com"
    stub_request(:get, url).and_return(status: 200)

    subject.store_cookies(cookies)
    subject.get(url)

    expect(WebMock).to have_requested(:get, url)
      .with(headers: { "Cookie" => "group=admin; uid=u12345" })
      .once
  end
end
|
176
|
+
|
177
|
+
# #reset_cookies must leave the client with an empty cookie collection.
describe "#reset_cookies" do
  let(:cookie) do
    HTTP::Cookie.new("group", "admin", domain: "example.com", path: "/")
  end

  it "reset client cookies" do
    jar = subject.cookies
    jar << cookie

    subject.reset_cookies

    # Read the collection again from the client rather than reusing `jar`.
    expect(subject.cookies).to be_empty
  end
end
|
186
|
+
|
187
|
+
# #reset_user_agent must make the client pick a fresh user agent.
describe "#reset_user_agent" do
  it "reset client user agent" do
    client = subject

    # Read the user agent before stubbing so a real value is in place first.
    client.user_agent
    expect(client.user_agent).not_to be_empty

    # Need to stub sample, because it's not deterministic
    allow(client).to receive(:sample_user_agent).and_return("UA")

    client.reset_user_agent

    expect(client.user_agent).to eq("UA")
  end
end
|
200
|
+
|
201
|
+
# #reset_proxy must delegate to the proxy's `reset!` when a proxy is present
# and must not raise when there is none.
describe "#reset_proxy" do
  context "when proxy has been used" do
    subject { described_class.new(proxy: proxy) }

    let(:proxy) { instance_spy("Clients::TorProxy") }

    it "calls reset on proxy" do
      subject.reset_proxy

      expect(proxy).to have_received(:reset!).once
    end
  end

  context "when proxy has NOT been used" do
    subject { described_class.new }

    it "does nothing" do
      expect { subject.reset_proxy }.not_to raise_error
    end
  end
end
|
220
|
+
end
|
221
|
+
end
|
@@ -0,0 +1,34 @@
|
|
1
|
+
require "spec_helper"
|
2
|
+
|
3
|
+
module Clients
  # Spec for TorClient#switch_identity with the telnet control connection
  # replaced by a test double.
  RSpec.describe TorClient do
    subject { described_class.new }

    # Stand-in for the object returned by Net::Telnet.new.
    let(:localhost) { double("telnet") }
    let(:ok_prompt) { "250 OK\n" }
    let(:new_route_signal) { "SIGNAL NEWNYM" }

    describe "#switch_identity" do
      before do
        allow(Net::Telnet).to receive(:new).and_return(localhost)
        allow(localhost).to receive("cmd").and_return(ok_prompt)
        allow(localhost).to receive("close")

        # Stub sleep on the client so the example never actually waits.
        allow(subject).to receive(:sleep).and_return(0)
      end

      # Skipped via metadata; the file does not say why.
      it "throttles tor switch route command by 10 seconds", skip: true do
        started_at = Time.now

        # Six calls at these second offsets from the start time.
        [0, 2, 3, 5, 11, 15].each do |offset|
          Timecop.freeze(started_at + offset) { subject.switch_identity }
        end

        expect(subject).to have_received(:sleep).exactly(4)
        expect(localhost).to have_received("cmd").with(new_route_signal).exactly(4)
      end
    end
  end
end
|