scraper_clients 9.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,8 @@
require "uri"

module Clients
  # Utilities for undoing percent-encoding in URLs.
  module UrlDecoder
    # `URI.decode` was removed in Ruby 3.0. The RFC 2396 parser still exposes
    # the same unescaping algorithm: it replaces `%XX` sequences only and
    # leaves `+` untouched.
    UNESCAPER = URI::RFC2396_Parser.new
    private_constant :UNESCAPER

    # Repeatedly percent-decodes +url+ until decoding no longer changes it,
    # so multiply-encoded input (e.g. "%2520") is fully unwrapped.
    #
    # @param url [String, nil] possibly (multiply) percent-encoded URL
    # @return [String, nil] the fully decoded URL; +nil+ is passed through
    def self.decode(url)
      return url if url.nil?

      loop do
        decoded = UNESCAPER.unescape(url)
        # A pass that changes nothing means a fixed point has been reached.
        return decoded if decoded == url

        url = decoded
      end
    end
  end
end
@@ -0,0 +1,3 @@
module Clients
  # Version string of the library. Frozen so that it cannot be mutated
  # in place by code that reads it.
  VERSION = "9.0.0".freeze
end
@@ -0,0 +1,197 @@
require "spec_helper"

module Clients
  class HttpClient
    # Specs for HttpClient::Response. The subject is built from an
    # HTTP::Response; contexts override the `status`, `headers` and `body`
    # lets to shape that underlying response.
    RSpec.describe Response do
      let(:headers) { {} }
      let(:body) { "BODY" }
      let(:status) { 200 }
      let(:response) do
        HTTP::Response.new(
          status: status,
          version: "1.1",
          headers: headers,
          body: body
        )
      end
      subject { described_class.new(response) }

      describe "#success?" do
        context "when response has succeeded" do
          let(:status) { 200 }
          it "returns true" do
            is_expected.to be_success
          end
        end

        context "when response has failed" do
          let(:status) { 502 }
          it "returns false" do
            is_expected.not_to be_success
          end
        end
      end

      describe "#fail?" do
        context "when response has failed" do
          let(:status) { 400 }
          it "returns true" do
            is_expected.to be_fail
          end
        end

        context "when response has succeeded" do
          let(:status) { 201 }
          it "returns false" do
            is_expected.not_to be_fail
          end
        end
      end

      describe "#to_s" do
        it "returns response body" do
          expect(subject.to_s).to eq("BODY")
        end

        context "when force_utf8 hasn't been provided" do
          # Binary payload: must come back untouched when no option is given.
          let(:body) { "\x89PNG\r\n\x1A\n\x00\x00\x00" }
          it "sets force_utf8 to FALSE by default" do
            expect(subject.to_s).to eq(body)
          end
        end

        context "when force_utf8 is FALSE" do
          shared_examples "unmodified body" do
            it "returns unmodified response body" do
              expect(subject.to_s(force_utf8: false)).to eq(body)
            end
          end

          context "when response doesn't have valid charset" do
            let(:body) { "Correct Ответ".force_encoding Encoding::CP1251 }
            include_examples "unmodified body"
          end

          context "when response is binary" do
            let(:body) { "\x89PNG\r\n\x1A\n\x00\x00\x00" }
            include_examples "unmodified body"
          end
        end

        context "when force_utf8 is TRUE" do
          context "when response doesn't have valid charset" do
            # UTF-8 bytes mislabelled as CP1251, with no charset header.
            let(:body) { "Correct Ответ".force_encoding Encoding::CP1251 }

            it "returns response body in UTF-8 encoding" do
              # Named `decoded` so it does not shadow the `response` let.
              decoded = subject.to_s force_utf8: true

              expect(decoded.encoding).to eq(Encoding::UTF_8)
              expect(decoded).to eq("Correct Ответ")
            end
          end

          context "when response has valid charset - windows-1251" do
            let(:headers) do
              {
                "Content-Type" => "text/html; charset=windows-1251"
              }
            end
            # Genuinely transcoded to CP1251, matching the declared charset.
            let(:body) { "Correct Ответ".encode Encoding::CP1251 }

            it "returns response body in UTF-8 encoding" do
              decoded = subject.to_s force_utf8: true

              expect(decoded.encoding).to eq(Encoding::UTF_8)
              expect(decoded).to eq("Correct Ответ")
            end
          end
        end
      end

      describe "#to_html" do
        it "returns parsed response body" do
          html = subject.to_html
          expect(html).to be_an_instance_of(Nokogiri::HTML::Document)
          expect(html.to_s).to include("<body><p>BODY</p></body>")
        end

        context "when force_utf8 is TRUE" do
          let(:body) { "Correct Ответ".force_encoding Encoding::CP1251 }

          it "returns parsed response body in valid UTF_8 encoding" do
            html = subject.to_html(force_utf8: true)
            expect(html).to be_an_instance_of(Nokogiri::HTML::Document)
            expect(html.to_s).to include("<body><p>Correct Ответ</p></body>")
          end
        end
      end

      describe "#to_xml" do
        let(:body) { "<node><![CDATA[ Brazil ]]></node>" }

        it "returns parsed response body" do
          xml = subject.to_xml
          expect(xml).to be_an_instance_of(Nokogiri::XML::Document)
          expect(xml.to_s).to include("<node><![CDATA[ Brazil ]]></node>")
        end

        # Skipped via metadata; the reason is not recorded in this file.
        context "when force_utf8 is TRUE", skip: true do
          let(:body) { "<node><![CDATA[ Бразилия ]]></node>".force_encoding Encoding::CP1251 }

          it "returns parsed response body in valid UTF_8 encoding" do
            xml = subject.to_xml(force_utf8: true)
            expect(xml).to be_an_instance_of(Nokogiri::XML::Document)
            expect(xml.to_s).to include("<node><![CDATA[ Бразилия ]]></node>")
          end
        end
      end

      describe "#to_json" do
        let(:body) { "[{\"brand\":\"Фирма ZANUSSI\",\"product_code\":\"91460370200\"}]" }
        # The expectation uses symbol keys.
        let(:parsed_body) { [{brand: "Фирма ZANUSSI", product_code: "91460370200"}] }

        it "returns parsed json body" do
          expect(subject.to_json).to eq parsed_body
        end

        context "when force_utf8 is TRUE" do
          # Same JSON bytes as the outer `body`, relabelled as CP1251.
          let(:body) { super().force_encoding Encoding::CP1251 }

          it "returns parsed json body" do
            expect(subject.to_json(force_utf8: true)).to eq parsed_body
          end
        end
      end

      describe "#to_io" do
        let(:body) { "IO BODY" }

        it "returns response as StringIO" do
          io = subject.to_io
          expect(io).to be_an_instance_of(StringIO)
          expect(io.read).to eq("IO BODY")
        end
      end

      describe "#stream" do
        let(:url) { "http://example.com" }
        # Here the response comes from a real client call against a stubbed
        # request, instead of a hand-built HTTP::Response.
        let(:response) { Clients::HttpClient.new.get(url) }

        before do
          stub_request(:get, url).and_return(body: body)
        end

        it "streams response body" do
          expect { |b| subject.stream(1, &b) }.to yield_successive_args("B", "O", "D", "Y")
        end

        context "buffer size is not specified" do
          it "streams response body" do
            expect { |b| subject.stream(&b) }.to yield_successive_args("BODY")
          end
        end
      end
    end
  end
end
@@ -0,0 +1,221 @@
require "spec_helper"

module Clients
  # Specs for HttpClient: request helpers (#get, #head), redirect handling,
  # proxy and cookie predicates, and the reset_* methods. HTTP traffic is
  # stubbed with WebMock (`stub_request` / `have_requested`).
  RSpec.describe HttpClient do
    subject { described_class.new }

    describe "#get" do
      let(:url) { "http://ya.ru/index.html" }
      let(:response) { subject.get(url) }

      before do
        stub_request(:get, url).and_return(
          status: 202,
          body: "RESPONSE",
          headers: {
            content_type: "image/png; charset=UTF-8"
          }
        )
      end

      it "makes a request to given url" do
        response
        expect(WebMock).to have_requested(:get, url)
      end

      # Shared by the contexts where redirects are expected to be followed;
      # the including context supplies `response`.
      shared_examples "follow_redirects" do
        let(:url) { "http://ya.ru/redirect" }
        let(:redirect_to_url) { "http://ya.ru/final" }

        before do
          stub_request(:get, url).and_return(
            status: 301,
            headers: { location: redirect_to_url }
          )
          stub_request(:get, redirect_to_url).and_return(
            status: 200,
            body: "RESPONSE"
          )
        end

        it "follows redirects" do
          expect(response.to_s).to eq("RESPONSE")
          expect(WebMock).to have_requested(:get, url).once
          expect(WebMock).to have_requested(:get, redirect_to_url).once
        end
      end

      context "when follow_redirects is true" do
        let(:response) { subject.get(url, follow_redirects: true) }
        include_examples "follow_redirects"
      end

      context "when follow_redirects is nil" do
        # Option omitted entirely: redirects are still expected to be followed.
        let(:response) { subject.get(url) }
        include_examples "follow_redirects"
      end

      context "when follow_redirects is false" do
        let(:url) { "http://ya.ru/redirect" }
        let(:redirect_to_url) { "http://ya.ru/final" }
        let(:response) { subject.get(url, follow_redirects: false) }

        before do
          stub_request(:get, url).and_return(
            status: 301,
            headers: { location: redirect_to_url }
          )
        end

        it "DOESN'T follow redirects" do
          expect(response.status).to eq(301)
          expect(response.headers["Location"]).to eq(redirect_to_url)
          expect(WebMock).to have_requested(:get, url).once
          expect(WebMock).not_to have_requested(:get, redirect_to_url)
        end
      end

      it "modifies request from the block" do
        subject.get(url) do |request|
          request.headers(cookie: "was_here=1;")
        end

        expect(WebMock).to have_requested(:get, url).with(headers: { "Cookie" => "was_here=1;" }).once
      end

      it "returns wrapped response" do
        expect(response).to be_an_instance_of(HttpClient::Response)
        expect(response.to_s).to eq("RESPONSE")
        expect(response.code).to eq(202)
        expect(response.mime_type).to eq("image/png")
      end
    end

    describe "#head" do
      let(:url) { "http://ya.ru/index.html" }
      let(:response) { subject.head(url) }

      before do
        stub_request(:head, url).and_return(status: 202)
      end

      it "makes a request to given url" do
        response
        expect(WebMock).to have_requested(:head, url)
      end

      it "returns wrapped response" do
        expect(response).to be_an_instance_of(HttpClient::Response)
        expect(response.code).to eq(202)
        expect(response.uri.to_s).to eq(url)
      end
    end

    describe "#proxy?" do
      context "when proxy has been used" do
        let(:proxy) { instance_spy("Clients::TorProxy") }
        subject { described_class.new proxy: proxy }

        it "returns true" do
          expect(subject.proxy?).to eq(true)
        end
      end

      context "when proxy has NOT been used" do
        it "returns false" do
          expect(subject.proxy?).to eq(false)
        end
      end
    end

    describe "#has_cookies?" do
      context "when client has cookies" do
        let(:cookie) { HTTP::Cookie.new("group", "admin", domain: "example.com", path: "/") }

        it "returns true" do
          subject.cookies << cookie
          expect(subject).to have_cookies
        end
      end

      context "when client has NO cookies" do
        it "returns false" do
          expect(subject).not_to have_cookies
        end
      end
    end

    describe "#store_cookies" do
      let(:old_cookie) { HTTP::Cookie.new("group", "admin", domain: "example.com", path: "/") }
      let(:new_cookie) { HTTP::Cookie.new("uid", "u12345", domain: "ya.ru", path: "/admin") }
      let(:cookies) { HTTP::CookieJar.new }

      before do
        # The client starts with one cookie; the jar passed to
        # #store_cookies carries a different one.
        subject.cookies << old_cookie
        cookies << new_cookie
      end

      it "adds given cookies from the response" do
        subject.store_cookies cookies
        expect(subject.cookies.to_a).to contain_exactly(old_cookie, new_cookie)
      end

      it "sends new and old cookies with the new request" do
        url = "https://placeholder.com"
        stub_request(:get, url).and_return(status: 200)

        subject.store_cookies cookies
        subject.get(url)

        expect(WebMock).to have_requested(:get, url)
          .with(headers: { "Cookie" => "group=admin; uid=u12345" })
          .once
      end
    end

    describe "#reset_cookies" do
      let(:cookie) { HTTP::Cookie.new("group", "admin", domain: "example.com", path: "/") }

      it "resets client cookies" do
        subject.cookies << cookie
        subject.reset_cookies
        expect(subject.cookies).to be_empty
      end
    end

    describe "#reset_user_agent" do
      it "resets client user agent" do
        subject.user_agent
        expect(subject.user_agent).not_to be_empty

        # Need to stub sample, because it's not deterministic
        allow(subject).to receive(:sample_user_agent).and_return("UA")

        subject.reset_user_agent

        expect(subject.user_agent).to eq("UA")
      end
    end

    describe "#reset_proxy" do
      context "when proxy has been used" do
        let(:proxy) { instance_spy("Clients::TorProxy") }
        subject { described_class.new proxy: proxy }

        it "calls reset on proxy" do
          subject.reset_proxy
          expect(proxy).to have_received(:reset!).once
        end
      end

      context "when proxy has NOT been used" do
        # Uses the top-level subject, which is built without a proxy.
        it "does nothing" do
          expect { subject.reset_proxy }.not_to raise_error
        end
      end
    end
  end
end
@@ -0,0 +1,34 @@
require "spec_helper"

module Clients
  # Specs for TorClient#switch_identity. Net::Telnet is replaced with a
  # double, so no control connection is opened during the examples.
  RSpec.describe TorClient do
    let(:ok_prompt) { "250 OK\n" }
    let(:new_route_signal) { "SIGNAL NEWNYM" }
    let(:localhost) { double("telnet") }
    subject { described_class.new }

    describe "#switch_identity" do
      before do
        allow(Net::Telnet).to receive(:new).and_return localhost
        allow(localhost).to receive(:cmd).and_return(ok_prompt)
        allow(localhost).to receive(:close)

        # Stub sleep on the subject so the example does not actually wait.
        allow(subject).to receive(:sleep).and_return(0)
      end

      # Skipped via metadata; the reason is not recorded in this file.
      it "throttles tor switch route command by 10 seconds", skip: true do
        time = Time.now

        # Six calls at t+0, +2, +3, +5, +11 and +15 seconds.
        Timecop.freeze(time) { subject.switch_identity }
        Timecop.freeze(time + 2) { subject.switch_identity }
        Timecop.freeze(time + 3) { subject.switch_identity }
        Timecop.freeze(time + 5) { subject.switch_identity }
        Timecop.freeze(time + 11) { subject.switch_identity }
        Timecop.freeze(time + 15) { subject.switch_identity }

        expect(subject).to have_received(:sleep).exactly(4).times
        expect(localhost).to have_received(:cmd).with(new_route_signal).exactly(4).times
      end
    end
  end
end