scraper_clients 9.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,8 @@
require "uri"

module Clients
  # Utility for undoing percent-encoding that may have been applied to a URL
  # more than once (e.g. "%2520" -> "%20" -> " ").
  module UrlDecoder
    # RFC 2396 parser whose #unescape is the drop-in replacement for
    # URI.decode, which was removed in Ruby 3.0. Unlike CGI.unescape it
    # leaves a literal "+" untouched.
    PARSER = URI::RFC2396_Parser.new
    private_constant :PARSER

    # Percent-decodes +url+ repeatedly until another pass no longer changes it.
    #
    # @param url [String, nil] possibly multi-encoded URL
    # @return [String, nil] the fully decoded URL; +nil+ is passed through
    def self.decode(url)
      # nil is returned as-is instead of raising.
      return url if url.nil?

      loop do
        decoded = PARSER.unescape(url)
        # Fixed point reached: nothing left to decode.
        return decoded if decoded == url

        url = decoded
      end
    end
  end
end
@@ -0,0 +1,3 @@
module Clients
  # Version string of this package release.
  # Frozen so the constant cannot be mutated in place at runtime.
  VERSION = "9.0.0".freeze
end
@@ -0,0 +1,197 @@
require "spec_helper"

module Clients
  class HttpClient
    # Specs for HttpClient::Response, constructed here from a hand-built
    # HTTP::Response (or, for #stream, from a stubbed HttpClient#get call).
    #
    # NOTE: several fixtures call String#force_encoding on string literals,
    # which mutates the literal in place, so this file must not enable
    # `frozen_string_literal`.
    RSpec.describe Response do
      # Defaults for the underlying HTTP response; contexts override them.
      let(:headers) { {} }
      let(:body) { "BODY" }
      let(:status) { 200 }
      let(:response) {
        HTTP::Response.new(
          status: status,
          version: "1.1",
          headers: headers,
          body: body
        )
      }
      subject { described_class.new(response) }

      describe "#success?" do
        context "when response has succeeded" do
          let(:status) { 200 }
          it "returns true" do
            is_expected.to be_success
          end
        end

        context "when response has failed" do
          let(:status) { 502 }
          it "returns false" do
            is_expected.not_to be_success
          end
        end
      end

      describe "#fail?" do
        context "when response has failed" do
          let(:status) { 400 }
          it "returns true" do
            is_expected.to be_fail
          end
        end

        context "when response has succeeded" do
          let(:status) { 201 }
          it "returns false" do
            is_expected.not_to be_fail
          end
        end
      end

      describe "#to_s" do
        it "returns response body" do
          expect(subject.to_s).to eq("BODY")
        end

        context "when force_utf8 hasn't been provided" do
          # Binary payload (PNG signature bytes) must come back untouched.
          let(:body) { "\x89PNG\r\n\x1A\n\x00\x00\x00" }
          it "sets force_utf8 to FALSE by default" do
            expect(subject.to_s).to eq(body)
          end
        end

        context "when force_utf8 is FALSE" do
          shared_examples "unmodified body" do
            it "returns unmodified response body" do
              expect(subject.to_s(force_utf8: false)).to eq(body)
            end
          end

          context "when response doesn't have valid charset" do
            # force_encoding only relabels the bytes; it does not transcode.
            let(:body) { "Correct Ответ".force_encoding Encoding::CP1251 }
            include_examples "unmodified body"
          end

          context "when response is binary" do
            let(:body) { "\x89PNG\r\n\x1A\n\x00\x00\x00" }
            include_examples "unmodified body"
          end
        end

        context "when force_utf8 is TRUE" do
          context "when response doesn't have valid charset" do
            let(:body) { "Correct Ответ".force_encoding Encoding::CP1251 }

            it "returns response body in UTF-8 encoding" do
              response = subject.to_s force_utf8: true

              expect(response.encoding).to eq(Encoding::UTF_8)
              expect(response).to eq("Correct Ответ")
            end
          end

          context "when response have valid charset - windows-1251" do
            let(:headers) {
              {
                "Content-Type" => "text/html; charset=windows-1251"
              }
            }
            # Here the body really is transcoded to CP1251, matching the header.
            let(:body) { "Correct Ответ".encode Encoding::CP1251 }

            it "returns response body in UTF-8 encoding" do
              response = subject.to_s force_utf8: true

              expect(response.encoding).to eq(Encoding::UTF_8)
              expect(response).to eq("Correct Ответ")
            end
          end
        end
      end

      describe "#to_html" do
        it "returns parsed response body" do
          html = subject.to_html
          expect(html).to be_an_instance_of(Nokogiri::HTML::Document)
          expect(html.to_s).to include("<body><p>BODY</p></body>")
        end

        context "when force_utf8 is TRUE" do
          let(:body) { "Correct Ответ".force_encoding Encoding::CP1251 }

          it "returns parsed response body in valid UTF_8 encoding" do
            html = subject.to_html(force_utf8: true)
            expect(html).to be_an_instance_of(Nokogiri::HTML::Document)
            expect(html.to_s).to include("<body><p>Correct Ответ</p></body>")
          end
        end
      end

      describe "#to_xml" do
        let(:body) { "<node><![CDATA[ Brazil ]]></node>" }

        it "returns parsed response body" do
          xml = subject.to_xml
          expect(xml).to be_an_instance_of(Nokogiri::XML::Document)
          expect(xml.to_s).to include("<node><![CDATA[ Brazil ]]></node>")
        end

        # NOTE(review): this context is skipped; the reason is not recorded here.
        context "when force_utf8 is TRUE", skip: true do
          let(:body) { "<node><![CDATA[ Бразилия ]]></node>".force_encoding Encoding::CP1251 }

          it "returns parsed response body in valid UTF_8 encoding" do
            xml = subject.to_xml(force_utf8: true)
            expect(xml).to be_an_instance_of(Nokogiri::XML::Document)
            expect(xml.to_s).to include("<node><![CDATA[ Бразилия ]]></node>")
          end
        end
      end

      describe "#to_json" do
        let(:body) { "[{\"brand\":\"Фирма ZANUSSI\",\"product_code\":\"91460370200\"}]" }
        # The expectation uses symbol keys.
        let(:parsed_body) { [{brand: "Фирма ZANUSSI", product_code: "91460370200"}] }

        it "returns parsed json body" do
          expect(subject.to_json).to eq parsed_body
        end

        context "when force_utf8 is TRUE" do
          # Reuses the outer JSON body, relabelled as CP1251.
          let(:body) { super().force_encoding Encoding::CP1251 }

          it "returns parsed json body" do
            expect(subject.to_json(force_utf8: true)).to eq parsed_body
          end
        end
      end

      describe "#to_io" do
        let(:body) { "IO BODY" }

        it "returns response as StringIO" do
          io = subject.to_io
          expect(io).to be_an_instance_of(StringIO)
          expect(io.read).to eq("IO BODY")
        end
      end


      describe "#stream" do
        let(:url) { "http://example.com" }
        # Obtained through a stubbed client request rather than the
        # hand-built HTTP::Response used elsewhere in this file.
        let(:response) { Clients::HttpClient.new.get(url) }

        before do
          stub_request(:get, url).and_return(body: body)
        end

        it "streams response body" do
          expect { |b| subject.stream(1, &b) }.to yield_successive_args("B", "O", "D", "Y")
        end

        context "buffer size is not specified" do
          it "streams response body" do
            expect { |b| subject.stream(&b) }.to yield_successive_args("BODY")
          end
        end
      end
    end
  end
end
@@ -0,0 +1,221 @@
require "spec_helper"

module Clients
  # Specs for HttpClient: request helpers (#get, #head), proxy handling,
  # cookie storage and the reset_* helpers. HTTP traffic is stubbed with
  # WebMock's stub_request.
  RSpec.describe HttpClient do
    subject { described_class.new }

    describe "#get" do
      let(:url) { "http://ya.ru/index.html" }
      let(:response) { subject.get(url) }

      before do
        stub_request(:get, url).and_return(
          status: 202,
          body: "RESPONSE",
          headers: {
            content_type: "image/png; charset=UTF-8"
          }
        )
      end

      it "makes a request to given url" do
        response
        expect(WebMock).to have_requested(:get, url)
      end

      # Shared by the contexts where redirects are expected to be followed;
      # each including context supplies its own `response`.
      shared_examples "follow_redirects" do
        let(:url) { "http://ya.ru/redirect" }
        let(:redirect_to_url) { "http://ya.ru/final" }

        before do
          stub_request(:get, url).and_return(
            status: 301,
            headers: { location: redirect_to_url }
          )
          stub_request(:get, redirect_to_url).and_return(
            status: 200,
            body: "RESPONSE"
          )
        end

        it "follows redirects" do
          expect(response.to_s).to eq("RESPONSE")
          expect(WebMock).to have_requested(:get, url).once
          expect(WebMock).to have_requested(:get, redirect_to_url).once
        end
      end

      context "when follow_redirects is true" do
        let(:response) { subject.get(url, follow_redirects: true) }
        include_examples "follow_redirects"
      end

      context "when follow_redirects is nil" do
        # Option omitted: redirects are expected to be followed here too.
        let(:response) { subject.get(url) }
        include_examples "follow_redirects"
      end

      context "when follow_redirects is false" do
        let(:url) { "http://ya.ru/redirect" }
        let(:redirect_to_url) { "http://ya.ru/final" }
        let(:response) { subject.get(url, follow_redirects: false) }

        before do
          stub_request(:get, url).and_return(
            status: 301,
            headers: { location: redirect_to_url }
          )
        end

        it "DOESN'T follow redirects" do
          expect(response.status).to eq(301)
          expect(response.headers["Location"]).to eq(redirect_to_url)
          expect(WebMock).to have_requested(:get, url).once
          expect(WebMock).not_to have_requested(:get, redirect_to_url)
        end
      end

      it "modifies request from the block" do
        subject.get(url) do |request|
          request.headers(cookie: "was_here=1;")
        end

        expect(WebMock).to have_requested(:get, url).with(headers: { "Cookie" => "was_here=1;" }).once
      end

      it "returns wrapped response" do
        expect(response).to be_an_instance_of(HttpClient::Response)
        expect(response.to_s).to eq("RESPONSE")
        expect(response.code).to eq(202)
        expect(response.mime_type).to eq("image/png")
      end
    end

    describe "#head" do
      let(:url) { "http://ya.ru/index.html" }
      let(:response) { subject.head(url) }

      before do
        stub_request(:head, url).and_return(status: 202)
      end

      it "makes a request to given url" do
        response
        expect(WebMock).to have_requested(:head, url)
      end

      it "returns wrapped response" do
        expect(response).to be_an_instance_of(HttpClient::Response)
        expect(response.code).to eq(202)
        expect(response.uri.to_s).to eq(url)
      end
    end


    describe "#proxy?" do
      context "when proxy has been used" do
        let(:proxy) { instance_spy("Clients::TorProxy") }
        subject { described_class.new proxy: proxy }

        it "returns true" do
          expect(subject.proxy?).to eq(true)
        end
      end

      context "when proxy has NOT been used" do
        it "returns false" do
          expect(subject.proxy?).to eq(false)
        end
      end
    end

    describe "#has_cookies?" do
      context "when client has cookies" do
        let(:cookie) { HTTP::Cookie.new("group", "admin", domain: "example.com", path: "/") }

        it "returns true" do
          subject.cookies << cookie
          expect(subject).to have_cookies
        end
      end

      context "when client has NO cookies" do
        it "returns false" do
          expect(subject).not_to have_cookies
        end
      end
    end

    describe "#store_cookies" do
      let(:old_cookie) { HTTP::Cookie.new("group", "admin", domain: "example.com", path: "/") }
      let(:new_cookie) { HTTP::Cookie.new("uid", "u12345", domain: "ya.ru", path: "/admin") }
      let(:cookies) { HTTP::CookieJar.new }

      before do
        # The client starts with one cookie; the jar passed to
        # #store_cookies holds a different one.
        subject.cookies << old_cookie
        cookies << new_cookie
      end

      it "adds given cookies from the response" do
        subject.store_cookies cookies
        expect(subject.cookies.to_a).to contain_exactly(old_cookie, new_cookie)
      end

      it "sends new and old cookies with the new request" do
        url = "https://placeholder.com"
        stub_request(:get, url).and_return(status: 200)

        subject.store_cookies cookies
        subject.get(url)

        expect(WebMock).to have_requested(:get, url)
          .with(headers: { "Cookie" => "group=admin; uid=u12345" })
          .once
      end
    end

    describe "#reset_cookies" do
      let(:cookie) { HTTP::Cookie.new("group", "admin", domain: "example.com", path: "/") }

      it "resets client cookies" do
        subject.cookies << cookie
        subject.reset_cookies
        expect(subject.cookies).to be_empty
      end
    end

    describe "#reset_user_agent" do
      it "resets client user agent" do
        subject.user_agent
        expect(subject.user_agent).not_to be_empty

        # Need to stub sample, because it's not deterministic
        allow(subject).to receive(:sample_user_agent).and_return("UA")

        subject.reset_user_agent

        expect(subject.user_agent).to eq("UA")
      end
    end

    describe "#reset_proxy" do
      context "when proxy has been used" do
        let(:proxy) { instance_spy("Clients::TorProxy") }
        subject { described_class.new proxy: proxy }

        it "calls reset on proxy" do
          subject.reset_proxy
          expect(proxy).to have_received(:reset!).once
        end
      end

      context "when proxy has NOT been used" do
        subject { described_class.new }

        it "does nothing" do
          expect { subject.reset_proxy }.not_to raise_error
        end
      end
    end
  end
end
@@ -0,0 +1,34 @@
require "spec_helper"

module Clients
  # Specs for TorClient#switch_identity. Net::Telnet is replaced with a
  # double, and `sleep` is stubbed on the subject so no real waiting happens.
  RSpec.describe TorClient do
    subject { described_class.new }

    let(:localhost) { double("telnet") }
    let(:new_route_signal) { "SIGNAL NEWNYM" }
    let(:ok_prompt) { "250 OK\n" }

    describe "#switch_identity" do
      before do
        # Every telnet session handed to the client is the same double.
        allow(Net::Telnet).to receive(:new).and_return(localhost)
        allow(localhost).to receive("cmd").and_return(ok_prompt)
        allow(localhost).to receive("close")

        allow(subject).to receive(:sleep).and_return(0)
      end

      # NOTE(review): this example is skipped; the reason is not recorded here.
      it "throttles tor switch route command by 10 seconds", skip: true do
        started_at = Time.now

        # Six calls at offsets 0, 2, 3, 5, 11 and 15 seconds from the start.
        Timecop.freeze(started_at) { subject.switch_identity }
        Timecop.freeze(started_at + 2) { subject.switch_identity }
        Timecop.freeze(started_at + 3) { subject.switch_identity }
        Timecop.freeze(started_at + 5) { subject.switch_identity }
        Timecop.freeze(started_at + 11) { subject.switch_identity }
        Timecop.freeze(started_at + 15) { subject.switch_identity }

        expect(subject).to have_received(:sleep).exactly(4)
        expect(localhost).to have_received("cmd").with(new_route_signal).exactly(4)
      end
    end
  end
end