tiny_grabber 0.2.0 → 0.2.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +15 -5
- data/lib/tiny_grabber.rb +10 -8
- data/lib/tiny_grabber/agent.rb +45 -11
- data/lib/tiny_grabber/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: c48375a5210123b907c6ebe2a7911f1c7090188a
|
4
|
+
data.tar.gz: 19b5052e7e79e876a40368d9e061a9aba032b7b4
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 498cffe418cabb9917dad706d8df5d6281fcc119418016dcd775169346fd5108fcdbf0af5c8d75cbb1472acfdef96d4bc82e037cfc339de729fd864f1dcdafc2
|
7
|
+
data.tar.gz: 582d3bf2fa5cde9a54c2862d5f5d237218d483f69dadd8749eba93a07ab58a17ecfd1d40aaac5197823a155363bd51ae1196f5de033c33239432cd7b76d281b6
|
data/README.md
CHANGED
@@ -28,7 +28,7 @@ Or install it yourself as:
|
|
28
28
|
require 'tiny_grabber'
|
29
29
|
|
30
30
|
read_timeout = 300
|
31
|
-
user_agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/
|
31
|
+
user_agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36'
|
32
32
|
proxy = { ip: 'xx.xx.xx.xx', port: 'xxxx' }
|
33
33
|
headers1 = { 'Content-Type' => 'text/html; charset=utf-8' }
|
34
34
|
headers2 = { 'Content-Type' => 'text/html; charset=utf-8', 'Connection' => 'keep-alive' }
|
@@ -54,13 +54,16 @@ tg.cookies = cookies
|
|
54
54
|
|
55
55
|
# Make response with GET method
|
56
56
|
response = tg.get 'https://whoer.net/ru', headers
|
57
|
+
# Reset headers and cookies
|
58
|
+
tg.reset
|
57
59
|
# Make response with POST method
|
58
|
-
response = tg.
|
60
|
+
response = tg.post 'https://whoer.net/ru', params, headers
|
61
|
+
|
59
62
|
|
60
63
|
# Make singleton response with GET method
|
61
|
-
response = TinyGrabber.get 'https://whoer.net/ru',
|
64
|
+
response = TinyGrabber.get 'https://whoer.net/ru', { debug: true, read_timeout: read_timeout, ... }
|
62
65
|
# Make singleton response with POST method
|
63
|
-
response = TinyGrabber.post 'https://whoer.net/ru', params,
|
66
|
+
response = TinyGrabber.post 'https://whoer.net/ru', params, { debug: true, read_timeout: read_timeout, ... }
|
64
67
|
|
65
68
|
# Get Nokogiri object from response HTML
|
66
69
|
ng = response.ng
|
@@ -70,11 +73,18 @@ response.code
|
|
70
73
|
response.cookies
|
71
74
|
# Get response headers
|
72
75
|
response.headers
|
73
|
-
|
76
|
+
# Get response HTML
|
77
|
+
response.body
|
74
78
|
```
|
75
79
|
|
76
80
|
## Changelog
|
77
81
|
|
82
|
+
* *v 0.2.1*
|
83
|
+
* Set a random user_agent from the list if one is not set
|
84
|
+
* Remove headers attribute from singleton methods
|
85
|
+
* Remove the transfer-encoding header for chained requests
|
86
|
+
* Add reset method to delete headers and cookies
|
87
|
+
|
78
88
|
* *v 0.2.0*
|
79
89
|
* It is now possible to create a TinyGrabber object
|
80
90
|
* Change order of parameters for singleton request
|
data/lib/tiny_grabber.rb
CHANGED
@@ -47,9 +47,9 @@ class TinyGrabber
|
|
47
47
|
# @param url Resource link
|
48
48
|
# @param config Agent configuration options (e.g. debug, read_timeout)
|
49
49
|
#
|
50
|
-
def self.get url,
|
50
|
+
def self.get url, config = {}
|
51
51
|
initialize config
|
52
|
-
@agent.fetch url, :get
|
52
|
+
@agent.fetch url, :get
|
53
53
|
end
|
54
54
|
|
55
55
|
|
@@ -59,7 +59,7 @@ class TinyGrabber
|
|
59
59
|
# @param params Request post data
|
60
60
|
# @param headers Request header
|
61
61
|
#
|
62
|
-
def post url, params = {}, headers = {}
|
62
|
+
def post url, params = {}, headers = {}
|
63
63
|
@agent.fetch url, :post, headers, params
|
64
64
|
end
|
65
65
|
|
@@ -69,9 +69,9 @@ class TinyGrabber
|
|
69
69
|
# @param url Resource link
|
70
70
|
# @param config Agent configuration options (e.g. debug, read_timeout)
|
71
71
|
#
|
72
|
-
def self.post url, params = {},
|
72
|
+
def self.post url, params = {}, config = {}
|
73
73
|
initialize config
|
74
|
-
@agent.fetch url, :post,
|
74
|
+
@agent.fetch url, :post, {}, params
|
75
75
|
end
|
76
76
|
|
77
77
|
|
@@ -173,7 +173,9 @@ class TinyGrabber
|
|
173
173
|
@agent.cookies = cookies
|
174
174
|
end
|
175
175
|
|
176
|
-
|
177
|
-
|
178
|
-
|
176
|
+
# Call RESET agent method
|
177
|
+
#
|
178
|
+
def reset
|
179
|
+
@agent.reset
|
180
|
+
end
|
179
181
|
end
|
data/lib/tiny_grabber/agent.rb
CHANGED
@@ -18,13 +18,37 @@ class TinyGrabber::Agent
|
|
18
18
|
# Cookies
|
19
19
|
attr_accessor :cookies
|
20
20
|
|
21
|
+
# Agent aliases taken from http://www.useragentstring.com/pages/Chrome/
|
22
|
+
AGENT_ALIASES = [
|
23
|
+
# Chrome
|
24
|
+
'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36',
|
25
|
+
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.1 Safari/537.36',
|
26
|
+
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36',
|
27
|
+
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36',
|
28
|
+
# Firefox
|
29
|
+
'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0) Gecko/20100101 Firefox/40.1',
|
30
|
+
'Mozilla/5.0 (Windows NT 6.3; rv:36.0) Gecko/20100101 Firefox/36.0',
|
31
|
+
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10; rv:33.0) Gecko/20100101 Firefox/33.0',
|
32
|
+
'Mozilla/5.0 (X11; Linux i586; rv:31.0) Gecko/20100101 Firefox/31.0',
|
33
|
+
# Internet Explorer
|
34
|
+
'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; AS; rv:11.0) like Gecko',
|
35
|
+
'Mozilla/5.0 (compatible; MSIE 10.6; Windows NT 6.1; Trident/5.0; InfoPath.2; SLCC1; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; .NET CLR 2.0.50727) 3gpp-gba UNTRUSTED/1.0',
|
36
|
+
'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 7.0; InfoPath.3; .NET CLR 3.1.40767; Trident/6.0; en-IN)',
|
37
|
+
'Mozilla/5.0 (compatible; MSIE 10.0; Macintosh; Intel Mac OS X 10_7_3; Trident/6.0)',
|
38
|
+
# Opera
|
39
|
+
'Opera/9.80 (X11; Linux i686; Ubuntu/14.10) Presto/2.12.388 Version/12.16',
|
40
|
+
'Opera/9.80 (Windows NT 6.0) Presto/2.12.388 Version/12.14',
|
41
|
+
'Mozilla/5.0 (Windows NT 6.0; rv:2.0) Gecko/20100101 Firefox/4.0 Opera 12.14',
|
42
|
+
'Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52',
|
43
|
+
]
|
44
|
+
|
21
45
|
# Initialization object
|
22
46
|
#
|
23
47
|
def initialize
|
24
48
|
@debug = false
|
25
49
|
|
26
50
|
# Initialize variables agent attributes
|
27
|
-
@user_agent =
|
51
|
+
@user_agent = AGENT_ALIASES[rand(AGENT_ALIASES.count) - 1]
|
28
52
|
@proxy = []
|
29
53
|
@basic_auth = {}
|
30
54
|
@headers = {}
|
@@ -126,7 +150,7 @@ class TinyGrabber::Agent
|
|
126
150
|
if @debug
|
127
151
|
p "#{debug_initial_word} =============================="
|
128
152
|
p "#{debug_initial_word} #{method.upcase} #{url}"
|
129
|
-
p "#{debug_initial_word} #{params}"
|
153
|
+
p "#{debug_initial_word} -> [params] = #{params}"
|
130
154
|
p "#{debug_initial_word} ------------------------------"
|
131
155
|
end
|
132
156
|
set_uri url
|
@@ -160,7 +184,7 @@ class TinyGrabber::Agent
|
|
160
184
|
if @debug
|
161
185
|
debug_filename = "log/#{method.upcase}_#{@uri.to_s.gsub(/[\/:]/, '_').gsub(/_+/, '_')}"
|
162
186
|
File.open(debug_filename, 'wb') { |f| f << @response.body } if @debug
|
163
|
-
p "#{debug_initial_word}
|
187
|
+
p "#{debug_initial_word} <- [html_file] = #{debug_filename}"
|
164
188
|
end
|
165
189
|
@response
|
166
190
|
end
|
@@ -173,7 +197,7 @@ class TinyGrabber::Agent
|
|
173
197
|
def set_uri url
|
174
198
|
# It's magic work with escaped url
|
175
199
|
@uri = URI(URI.escape(URI.unescape(url)))
|
176
|
-
p "#{debug_initial_word}
|
200
|
+
p "#{debug_initial_word} -> [uri] = #{@uri}" if @debug
|
177
201
|
end
|
178
202
|
|
179
203
|
|
@@ -181,7 +205,7 @@ class TinyGrabber::Agent
|
|
181
205
|
#
|
182
206
|
def set_user_agent
|
183
207
|
@headers['User-Agent'] = @user_agent
|
184
|
-
p "#{debug_initial_word} user_agent = #{@user_agent}" if @debug
|
208
|
+
p "#{debug_initial_word} -> [user_agent] = #{@user_agent}" if @debug
|
185
209
|
end
|
186
210
|
|
187
211
|
|
@@ -189,7 +213,7 @@ class TinyGrabber::Agent
|
|
189
213
|
#
|
190
214
|
def set_basic_auth
|
191
215
|
@request.basic_auth @basic_auth[:username], @basic_auth[:password]
|
192
|
-
p "#{debug_initial_word} basic_auth = #{@basic_auth}" if @debug
|
216
|
+
p "#{debug_initial_word} -> [basic_auth] = #{@basic_auth}" if @debug
|
193
217
|
end
|
194
218
|
|
195
219
|
|
@@ -197,7 +221,7 @@ class TinyGrabber::Agent
|
|
197
221
|
#
|
198
222
|
def set_headers
|
199
223
|
@headers.each { |k, v| @request.add_field(String(k), v) }
|
200
|
-
p "#{debug_initial_word} headers = #{@headers}" if @debug
|
224
|
+
p "#{debug_initial_word} -> [headers] = #{@headers}" if @debug
|
201
225
|
end
|
202
226
|
|
203
227
|
|
@@ -205,7 +229,7 @@ class TinyGrabber::Agent
|
|
205
229
|
#
|
206
230
|
def set_cookies
|
207
231
|
@request['Cookie'] = @cookies
|
208
|
-
p "#{debug_initial_word} cookies = #{@cookies}" if @debug
|
232
|
+
p "#{debug_initial_word} -> [cookies] = #{@cookies}" if @debug
|
209
233
|
end
|
210
234
|
|
211
235
|
|
@@ -215,7 +239,7 @@ class TinyGrabber::Agent
|
|
215
239
|
def send_request
|
216
240
|
@http.start(@uri.host, @uri.port, use_ssl: @uri.scheme == 'https') do |http|
|
217
241
|
http.read_timeout = @read_timeout
|
218
|
-
p "#{debug_initial_word} read_timeout = #{@read_timeout}" if @debug
|
242
|
+
p "#{debug_initial_word} -> [read_timeout] = #{@read_timeout}" if @debug
|
219
243
|
http.request(@request)
|
220
244
|
end
|
221
245
|
end
|
@@ -225,7 +249,9 @@ class TinyGrabber::Agent
|
|
225
249
|
#
|
226
250
|
def save_headers
|
227
251
|
@headers = @response.headers
|
228
|
-
|
252
|
+
# Delete header TRANSFER_ENCODING for chain of requests
|
253
|
+
@headers.delete('transfer-encoding')
|
254
|
+
p "#{debug_initial_word} <- [headers] = #{@headers}" if @debug
|
229
255
|
end
|
230
256
|
|
231
257
|
|
@@ -233,7 +259,15 @@ class TinyGrabber::Agent
|
|
233
259
|
#
|
234
260
|
def save_cookies
|
235
261
|
@cookies = @response.cookies
|
236
|
-
p "#{debug_initial_word}
|
262
|
+
p "#{debug_initial_word} <- [cookies] = #{@cookies}" if @debug
|
263
|
+
end
|
264
|
+
|
265
|
+
|
266
|
+
# Clears headers and cookies
|
267
|
+
#
|
268
|
+
def reset
|
269
|
+
@headers = {}
|
270
|
+
@cookies = nil
|
237
271
|
end
|
238
272
|
|
239
273
|
|
data/lib/tiny_grabber/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: tiny_grabber
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.
|
4
|
+
version: 0.2.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Aleksandr Chernyshev
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-06-
|
11
|
+
date: 2016-06-03 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: socksify
|