tiny_grabber 0.2.8 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile +4 -0
- data/README.md +5 -0
- data/Rakefile +3 -3
- data/lib/tiny_grabber/agent.rb +283 -277
- data/lib/tiny_grabber/debug.rb +10 -56
- data/lib/tiny_grabber/http.rb +6 -8
- data/lib/tiny_grabber/version.rb +1 -1
- data/lib/tiny_grabber.rb +24 -34
- data/tiny_grabber.gemspec +3 -3
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 96bdccdeb24ccdbbb99e4cb2e73bf1450aca4e4f
|
4
|
+
data.tar.gz: 785bed54953a6faa41325aa8d8dda56f688b638d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 4256e04d8e42b404c09e32ad66bd99c4c1f38680be021fcb6e36e6e8a1e86e65be548945d853583fad0177d53e2732a275ec804079531757d477af91bd403674
|
7
|
+
data.tar.gz: 46f84bf20a8c0e4de69e3bea855c5d3a26bd66332c75fbb7e2e6ed1805364c05262e10d897f4ea458931ac842819baee55f0a3c4cd7ed731ce7207c2bdd8d149
|
data/Gemfile
CHANGED
data/README.md
CHANGED
@@ -126,6 +126,11 @@ response.body
|
|
126
126
|
|
127
127
|
## Changelog
|
128
128
|
|
129
|
+
* *v 0.2.9*
|
130
|
+
* Added the `follow_location` agent attribute for following redirects
|
131
|
+
* Use the HTTP 302 response code and the `Location` header for redirecting
|
132
|
+
* Follow the meta refresh URL
|
133
|
+
* Refactored code for rubocop
|
129
134
|
* *v 0.2.8*
|
130
135
|
* Added processing Accept headers
|
131
136
|
* *v 0.2.7*
|
data/Rakefile
CHANGED
data/lib/tiny_grabber/agent.rb
CHANGED
@@ -2,334 +2,340 @@
|
|
2
2
|
# Initialize connect with Resource
|
3
3
|
# Setting connect attributes
|
4
4
|
#
|
5
|
-
class TinyGrabber
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
#
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
#
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
#
|
71
|
-
def debug= debug
|
72
|
-
debug = var_to_sym(debug, true)
|
73
|
-
if debug.is_a?(Hash)
|
74
|
-
@debug.active = debug[:active]
|
75
|
-
@debug.destination = debug[:destination]
|
76
|
-
@debug.save_html = debug[:save_html]
|
77
|
-
elsif debug.is_a?(TrueClass)
|
78
|
-
@debug.active = true
|
5
|
+
class TinyGrabber
|
6
|
+
class Agent
|
7
|
+
# Debug configuration
|
8
|
+
attr_writer :debug
|
9
|
+
# Max time to execute request
|
10
|
+
attr_writer :read_timeout
|
11
|
+
# Web browser name
|
12
|
+
attr_writer :user_agent
|
13
|
+
# Remote proxy configuration
|
14
|
+
attr_accessor :proxy
|
15
|
+
# Basic authentication configuration
|
16
|
+
attr_writer :basic_auth
|
17
|
+
# Headers
|
18
|
+
attr_writer :headers
|
19
|
+
# Cookies
|
20
|
+
attr_writer :cookies
|
21
|
+
# Set verify mode
|
22
|
+
attr_writer :verify_mode
|
23
|
+
# Follow location
|
24
|
+
attr_writer :follow_location
|
25
|
+
|
26
|
+
# Agent aliases given from http://www.useragentstring.com/pages/Chrome/
|
27
|
+
AGENT_ALIASES = [
|
28
|
+
# Chrome
|
29
|
+
'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36',
|
30
|
+
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.1 Safari/537.36',
|
31
|
+
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36',
|
32
|
+
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36',
|
33
|
+
# Firefox
|
34
|
+
'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0) Gecko/20100101 Firefox/40.1',
|
35
|
+
'Mozilla/5.0 (Windows NT 6.3; rv:36.0) Gecko/20100101 Firefox/36.0',
|
36
|
+
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10; rv:33.0) Gecko/20100101 Firefox/33.0',
|
37
|
+
'Mozilla/5.0 (X11; Linux i586; rv:31.0) Gecko/20100101 Firefox/31.0',
|
38
|
+
# Internet Explorer
|
39
|
+
'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; AS; rv:11.0) like Gecko',
|
40
|
+
'Mozilla/5.0 (compatible; MSIE 10.6; Windows NT 6.1; Trident/5.0; InfoPath.2; SLCC1; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; .NET CLR 2.0.50727) 3gpp-gba UNTRUSTED/1.0',
|
41
|
+
'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 7.0; InfoPath.3; .NET CLR 3.1.40767; Trident/6.0; en-IN)',
|
42
|
+
'Mozilla/5.0 (compatible; MSIE 10.0; Macintosh; Intel Mac OS X 10_7_3; Trident/6.0)',
|
43
|
+
# Opera
|
44
|
+
'Opera/9.80 (X11; Linux i686; Ubuntu/14.10) Presto/2.12.388 Version/12.16',
|
45
|
+
'Opera/9.80 (Windows NT 6.0) Presto/2.12.388 Version/12.14',
|
46
|
+
'Mozilla/5.0 (Windows NT 6.0; rv:2.0) Gecko/20100101 Firefox/4.0 Opera 12.14',
|
47
|
+
'Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52'
|
48
|
+
].freeze
|
49
|
+
|
50
|
+
# Initialization object
|
51
|
+
#
|
52
|
+
def initialize
|
53
|
+
@debug = Debug.new
|
54
|
+
|
55
|
+
# Initialize variables agent attributes
|
56
|
+
@user_agent = AGENT_ALIASES[rand(AGENT_ALIASES.count) - 1]
|
57
|
+
@proxy = []
|
58
|
+
@basic_auth = {}
|
59
|
+
@headers = {}
|
60
|
+
@cookies = nil
|
61
|
+
@follow_location = false
|
62
|
+
@read_timeout = 10
|
63
|
+
# Initialize variable for URI object
|
64
|
+
@uri = nil
|
65
|
+
# Initialize variable for Net::HTTP request object
|
66
|
+
@http = Net::HTTP
|
67
|
+
# Initialize variable for Net::HTTP response object
|
68
|
+
@response = nil
|
69
|
+
@verify_mode = OpenSSL::SSL::VERIFY_NONE
|
79
70
|
end
|
80
|
-
end
|
81
|
-
|
82
71
|
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
#
|
97
|
-
def user_agent= user_agent
|
98
|
-
fail 'attribute user_agent must be String' unless user_agent.is_a?(String)
|
99
|
-
@user_agent = user_agent
|
100
|
-
end
|
101
|
-
|
102
|
-
|
103
|
-
# Initialize Net::HTTP connection through proxy provider
|
104
|
-
# TYPE attribute distribute proxy type on SOCKS4(5) and HTTP(s)
|
105
|
-
#
|
106
|
-
# @param proxy Proxy configuration
|
107
|
-
#
|
108
|
-
def proxy= proxy
|
109
|
-
if proxy.is_a?(String)
|
110
|
-
ip, port, type = proxy.split(':')
|
111
|
-
fail 'attribute proxy must be in format ip:port' unless ip and port
|
112
|
-
type ||= :http
|
113
|
-
proxy = { ip: ip, port: port, type: type }
|
72
|
+
# Set debug configuration
|
73
|
+
#
|
74
|
+
# @param debug
|
75
|
+
#
|
76
|
+
def debug=(debug)
|
77
|
+
debug = var_to_sym(debug, true)
|
78
|
+
if debug.is_a?(Hash)
|
79
|
+
@debug.active = debug[:active]
|
80
|
+
@debug.destination = debug[:destination]
|
81
|
+
@debug.save_html = debug[:save_html]
|
82
|
+
elsif debug.is_a?(TrueClass)
|
83
|
+
@debug.active = true
|
84
|
+
end
|
114
85
|
end
|
115
|
-
proxy = var_to_sym(proxy)
|
116
|
-
fail 'attribute proxy must be Hash' unless proxy.is_a?(Hash)
|
117
|
-
fail 'attribute proxy must contain :ip and :port keys' unless proxy[:ip] and proxy[:port]
|
118
86
|
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
87
|
+
# Set READ_TIMEOUT agent attribute
|
88
|
+
#
|
89
|
+
# @param read_timeout Maximum time to wait for a read
|
90
|
+
#
|
91
|
+
def read_timeout=(read_timeout)
|
92
|
+
raise 'attribute read_timeout must be Integer' unless read_timeout.is_a?(Integer)
|
93
|
+
@read_timeout = read_timeout
|
124
94
|
end
|
125
|
-
end
|
126
|
-
|
127
|
-
|
128
|
-
# Set BASIC_AUTH agent attribute
|
129
|
-
#
|
130
|
-
# @param basic_auth Authentification configuration
|
131
|
-
#
|
132
|
-
def basic_auth= basic_auth
|
133
|
-
basic_auth = var_to_sym(basic_auth)
|
134
|
-
fail 'attribute basic_auth must be Hash' unless basic_auth.is_a?(Hash)
|
135
|
-
fail 'attribute basic_auth must contain :username and :password keys' unless basic_auth[:username] and basic_auth[:password]
|
136
|
-
@basic_auth = basic_auth
|
137
|
-
end
|
138
|
-
|
139
|
-
|
140
|
-
# Set HEADERS agent attribute
|
141
|
-
#
|
142
|
-
# @param headers Request headers
|
143
|
-
#
|
144
|
-
def headers= headers
|
145
|
-
fail 'attribute headers must be Hash' unless headers.is_a?(Hash)
|
146
|
-
@headers = headers
|
147
|
-
end
|
148
95
|
|
96
|
+
# Set USER_AGENT agent attribute
|
97
|
+
#
|
98
|
+
# @param user_agent Web browser name
|
99
|
+
#
|
100
|
+
def user_agent=(user_agent)
|
101
|
+
raise 'attribute user_agent must be String' unless user_agent.is_a?(String)
|
102
|
+
@user_agent = user_agent
|
103
|
+
end
|
149
104
|
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
105
|
+
# Initialize Net::HTTP connection through proxy provider
|
106
|
+
# The TYPE attribute selects the proxy type: SOCKS4(5) or HTTP(S)
|
107
|
+
#
|
108
|
+
# @param proxy Proxy configuration
|
109
|
+
#
|
110
|
+
def proxy=(proxy)
|
111
|
+
if proxy.is_a?(String)
|
112
|
+
ip, port, type = proxy.split(':')
|
113
|
+
raise 'attribute proxy must be in format ip:port' unless ip && port
|
114
|
+
type ||= :http
|
115
|
+
proxy = { ip: ip, port: port, type: type }
|
116
|
+
end
|
117
|
+
proxy = var_to_sym(proxy)
|
118
|
+
raise 'attribute proxy must be Hash' unless proxy.is_a?(Hash)
|
119
|
+
raise 'attribute proxy must contain :ip and :port keys' unless proxy[:ip] && proxy[:port]
|
120
|
+
|
121
|
+
@proxy = proxy
|
122
|
+
@http = if [:socks, 'socks'].include? proxy[:type]
|
123
|
+
Net::HTTP.SOCKSProxy(proxy[:ip].to_s, proxy[:port].to_s)
|
124
|
+
else
|
125
|
+
Net::HTTP::Proxy(proxy[:ip], proxy[:port])
|
126
|
+
end
|
127
|
+
end
|
160
128
|
|
129
|
+
# Set BASIC_AUTH agent attribute
|
130
|
+
#
|
131
|
+
# @param basic_auth Authentication configuration
|
132
|
+
#
|
133
|
+
def basic_auth=(basic_auth)
|
134
|
+
basic_auth = var_to_sym(basic_auth)
|
135
|
+
raise 'attribute basic_auth must be Hash' unless basic_auth.is_a?(Hash)
|
136
|
+
raise 'attribute basic_auth must contain :username and :password keys' unless basic_auth[:username] && basic_auth[:password]
|
137
|
+
@basic_auth = basic_auth
|
138
|
+
end
|
161
139
|
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
140
|
+
# Set HEADERS agent attribute
|
141
|
+
#
|
142
|
+
# @param headers Request headers
|
143
|
+
#
|
144
|
+
def headers=(headers)
|
145
|
+
raise 'attribute headers must be Hash' unless headers.is_a?(Hash)
|
146
|
+
@headers = headers
|
147
|
+
end
|
169
148
|
|
149
|
+
# Set COOKIES agent attribute
|
150
|
+
#
|
151
|
+
# @param cookies Request cookies
|
152
|
+
#
|
153
|
+
def cookies=(cookies)
|
154
|
+
cookies = var_to_sym(cookies)
|
155
|
+
cookies = cookies.to_a.map { |x| "#{x[0]}=#{x[1]}" }.join('&') if cookies.is_a?(Hash)
|
156
|
+
raise 'attribute cookies must be String' unless cookies.is_a?(String)
|
157
|
+
@cookies = cookies
|
158
|
+
end
|
170
159
|
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
@
|
186
|
-
@debug.save '------------------------------'
|
160
|
+
# Set verify_mode
|
161
|
+
#
|
162
|
+
# @param verify_mode SSL verify_mode
|
163
|
+
#
|
164
|
+
# def verify_mode=(verify_mode)
|
165
|
+
# @verify_mode = verify_mode
|
166
|
+
# end
|
167
|
+
|
168
|
+
# Init follow location for redirect
|
169
|
+
#
|
170
|
+
# @param follow_location Follow location flag
|
171
|
+
#
|
172
|
+
def follow_location=(follow_location)
|
173
|
+
raise 'attribute follow_location must be Boolean' unless follow_location.is_a?(TrueClass) || follow_location.is_a?(FalseClass)
|
174
|
+
@follow_location = follow_location
|
187
175
|
end
|
188
|
-
|
189
|
-
|
176
|
+
|
177
|
+
# Fetch request for GET and POST HTTP methods
|
178
|
+
# Setting USER_AGENT, BASIC_AUTH, HEADERS, COOKIES request attribute
|
179
|
+
# Make response and save COOKIES for next requests
|
180
|
+
#
|
181
|
+
# @param url Resource link
|
182
|
+
# @param method Request method
|
183
|
+
# @param headers Request header
|
184
|
+
# @param params Request additional params
|
185
|
+
#
|
186
|
+
def fetch(url, method = :get, headers = {}, params = {})
|
187
|
+
if @debug.active
|
188
|
+
@debug.save '=============================='
|
189
|
+
@debug.save "#{method.upcase} #{url}"
|
190
|
+
@debug.save "-> [proxy] = #{@proxy}" if @proxy
|
191
|
+
@debug.save "-> [params] = #{params}"
|
192
|
+
@debug.save '------------------------------'
|
193
|
+
end
|
194
|
+
convert_to_uri url
|
195
|
+
case method
|
190
196
|
when :get
|
191
197
|
@request = Net::HTTP::Get.new(@uri.request_uri)
|
192
198
|
when :post
|
193
199
|
@request = Net::HTTP::Post.new(@uri.request_uri)
|
194
200
|
@request.set_form_data(params)
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
|
202
|
-
|
201
|
+
end
|
202
|
+
set_user_agent if @user_agent
|
203
|
+
set_basic_auth unless @basic_auth.empty?
|
204
|
+
@headers = headers unless headers.empty?
|
205
|
+
set_headers unless @headers.empty?
|
206
|
+
set_cookies if @cookies
|
207
|
+
@response = send_request
|
208
|
+
case @response
|
203
209
|
# HTTP response code 1xx
|
204
210
|
when Net::HTTPInformation
|
205
|
-
@debug.save
|
211
|
+
@debug.save '<- [response] = Net::HTTPInformation' if @debug.active
|
206
212
|
# HTTP response code 2xx
|
207
213
|
when Net::HTTPSuccess
|
208
214
|
save_headers if @response.header
|
209
215
|
save_cookies if @response.cookies
|
210
216
|
@debug.save "<- [response] = #{@response.code} Net::HTTPSuccess" if @debug.active
|
217
|
+
# Follow meta refresh
|
218
|
+
if @follow_location
|
219
|
+
refresh = @response.ng.at_css('meta[http-equiv="refresh"]')
|
220
|
+
@response = fetch refresh.attr('content').gsub(/\A.*?(http)/, 'http') if refresh
|
221
|
+
end
|
211
222
|
# HTTP response code 3xx
|
212
223
|
when Net::HTTPRedirection
|
213
224
|
@debug.save "<- [response] = #{@response.code} Net::HTTPRedirection" if @debug.active
|
225
|
+
@debug.save 'try curl user_agent: tg.user_agent=\'curl\''
|
226
|
+
# Follow location
|
227
|
+
@response = fetch @response.header['Location'] if @follow_location
|
214
228
|
# HTTP response code 4xx
|
215
229
|
when Net::HTTPClientError
|
216
230
|
@debug.save "<- [response] = #{@response.code} Net::HTTPClientError" if @debug.active
|
217
231
|
# HTTP response code 5xx
|
218
232
|
when Net::HTTPServerError
|
219
233
|
@debug.save "<- [response] = #{@response.code} Net::HTTPServerError" if @debug.active
|
234
|
+
end
|
235
|
+
@debug.save_to_file @response.body if @debug.save_html
|
236
|
+
@response
|
220
237
|
end
|
221
|
-
@debug.save_to_file @response.body if @debug.save_html
|
222
|
-
@response
|
223
|
-
end
|
224
|
-
|
225
|
-
|
226
|
-
# Initialize URI object from request url
|
227
|
-
#
|
228
|
-
# @param url Request link
|
229
|
-
#
|
230
|
-
def set_uri url
|
231
|
-
# It's magic work with escaped url
|
232
|
-
@uri = URI(URI.escape(URI.unescape(url)))
|
233
|
-
@debug.save "-> [uri] = #{@uri}" if @debug.active
|
234
|
-
end
|
235
|
-
|
236
|
-
|
237
|
-
# Set USER_AGENT request attribute
|
238
|
-
#
|
239
|
-
def set_user_agent
|
240
|
-
@headers['User-Agent'] = @user_agent
|
241
|
-
@debug.save "-> [user_agent] = #{@user_agent}" if @debug.active
|
242
|
-
end
|
243
238
|
|
239
|
+
# Initialize URI object from request url
|
240
|
+
#
|
241
|
+
# @param url Request link
|
242
|
+
#
|
243
|
+
def convert_to_uri(url)
|
244
|
+
# Unescape, then re-escape, so an already-escaped URL is not escaped twice
|
245
|
+
@uri = URI(URI.escape(URI.unescape(url)))
|
246
|
+
@debug.save "-> [uri] = #{@uri}" if @debug.active
|
247
|
+
end
|
244
248
|
|
245
|
-
|
246
|
-
|
247
|
-
|
248
|
-
|
249
|
-
|
250
|
-
|
249
|
+
# Set USER_AGENT request attribute
|
250
|
+
#
|
251
|
+
def set_user_agent
|
252
|
+
@headers['User-Agent'] = @user_agent
|
253
|
+
@debug.save "-> [user_agent] = #{@user_agent}" if @debug.active
|
254
|
+
end
|
251
255
|
|
256
|
+
# Set BASIC_AUTH request authentication
|
257
|
+
#
|
258
|
+
def set_basic_auth
|
259
|
+
@request.basic_auth @basic_auth[:username], @basic_auth[:password]
|
260
|
+
@debug.save "-> [basic_auth] = #{@basic_auth}" if @debug.active
|
261
|
+
end
|
252
262
|
|
253
|
-
|
254
|
-
|
255
|
-
|
256
|
-
|
257
|
-
|
258
|
-
|
263
|
+
# Set request HEADERS
|
264
|
+
#
|
265
|
+
def set_headers
|
266
|
+
@headers.each do |k, v|
|
267
|
+
k = String(k)
|
268
|
+
case k
|
259
269
|
when 'Accept'
|
260
270
|
@request[k] = v
|
261
271
|
else
|
262
272
|
@request.add_field(k, v)
|
273
|
+
end
|
263
274
|
end
|
275
|
+
@debug.save "-> [headers] = #{@headers}" if @debug.active
|
264
276
|
end
|
265
|
-
@debug.save "-> [headers] = #{@headers}" if @debug.active
|
266
|
-
end
|
267
|
-
|
268
277
|
|
269
|
-
|
270
|
-
|
271
|
-
|
272
|
-
|
273
|
-
|
274
|
-
end
|
275
|
-
|
276
|
-
|
277
|
-
# Send request and get response
|
278
|
-
# Use SSL connect for HTTPS link scheme
|
279
|
-
#
|
280
|
-
def send_request
|
281
|
-
@http.start(@uri.host, @uri.port, use_ssl: @uri.scheme == 'https', verify_mode: @verify_mode, read_timeout: @read_timeout) do |http|
|
282
|
-
@debug.save "-> [read_timeout] = #{@read_timeout}" if @debug.active
|
283
|
-
http.request(@request)
|
278
|
+
# Set request COOKIES
|
279
|
+
#
|
280
|
+
def set_cookies
|
281
|
+
@request['Cookie'] = @cookies
|
282
|
+
@debug.save "-> [cookies] = #{@cookies}" if @debug.active
|
284
283
|
end
|
285
|
-
end
|
286
|
-
|
287
|
-
|
288
|
-
# Save response headers in agent attribute
|
289
|
-
#
|
290
|
-
def save_headers
|
291
|
-
@headers = @response.headers
|
292
|
-
# Delete header TRANSFER_ENCODING for chain of requests
|
293
|
-
@headers.delete('transfer-encoding')
|
294
|
-
@debug.save "<- [headers] = #{@headers}" if @debug.active
|
295
|
-
end
|
296
284
|
|
285
|
+
# Send request and get response
|
286
|
+
# Use SSL connect for HTTPS link scheme
|
287
|
+
#
|
288
|
+
def send_request
|
289
|
+
@http.start(@uri.host, @uri.port, use_ssl: @uri.scheme == 'https', verify_mode: @verify_mode, read_timeout: @read_timeout) do |http|
|
290
|
+
@debug.save "-> [read_timeout] = #{@read_timeout}" if @debug.active
|
291
|
+
http.request(@request)
|
292
|
+
end
|
293
|
+
end
|
297
294
|
|
298
|
-
|
299
|
-
|
300
|
-
|
301
|
-
|
302
|
-
|
303
|
-
|
295
|
+
# Save response headers in agent attribute
|
296
|
+
#
|
297
|
+
def save_headers
|
298
|
+
@headers = @response.headers
|
299
|
+
# Delete header TRANSFER_ENCODING for chain of requests
|
300
|
+
@headers.delete('transfer-encoding')
|
301
|
+
@debug.save "<- [headers] = #{@headers}" if @debug.active
|
302
|
+
end
|
304
303
|
|
304
|
+
# Save response cookies in agent attribute
|
305
|
+
#
|
306
|
+
def save_cookies
|
307
|
+
@cookies = @response.cookies
|
308
|
+
@debug.save "<- [cookies] = #{@cookies}" if @debug.active
|
309
|
+
end
|
305
310
|
|
306
|
-
|
307
|
-
|
308
|
-
|
309
|
-
|
310
|
-
|
311
|
-
|
311
|
+
# Clears headers and cookies
|
312
|
+
#
|
313
|
+
def reset
|
314
|
+
@headers = {}
|
315
|
+
@cookies = nil
|
316
|
+
end
|
312
317
|
|
313
|
-
|
314
|
-
|
315
|
-
|
316
|
-
|
317
|
-
|
318
|
-
|
319
|
-
|
320
|
-
|
321
|
-
|
322
|
-
|
323
|
-
|
324
|
-
|
325
|
-
|
326
|
-
|
318
|
+
# Recursively convert hash keys (and, optionally, strings) to symbols
|
319
|
+
#
|
320
|
+
# @param var Variable to convert
|
321
|
+
#
|
322
|
+
def var_to_sym(var, str_to_sym = false)
|
323
|
+
if var.is_a?(Hash)
|
324
|
+
result = {}
|
325
|
+
var.each do |k, v|
|
326
|
+
result[k.to_sym] = var_to_sym(v, str_to_sym)
|
327
|
+
end
|
328
|
+
elsif var.is_a?(Array)
|
329
|
+
result = []
|
330
|
+
var.each do |v|
|
331
|
+
result << var_to_sym(v, str_to_sym)
|
332
|
+
end
|
333
|
+
elsif var.is_a?(String)
|
334
|
+
result = str_to_sym ? var.to_sym : var
|
335
|
+
else
|
336
|
+
result = var
|
327
337
|
end
|
328
|
-
|
329
|
-
result = str_to_sym ? var.to_sym : var
|
330
|
-
else
|
331
|
-
result = var
|
338
|
+
result
|
332
339
|
end
|
333
|
-
result
|
334
340
|
end
|
335
|
-
end
|
341
|
+
end
|
data/lib/tiny_grabber/debug.rb
CHANGED
@@ -15,76 +15,30 @@ class Debug
|
|
15
15
|
@save_html = false
|
16
16
|
end
|
17
17
|
|
18
|
-
|
19
|
-
# Set debug active flag
|
20
|
-
#
|
21
|
-
# @param active Flag
|
22
|
-
#
|
23
|
-
def active= active
|
24
|
-
@active = active
|
25
|
-
end
|
26
|
-
|
27
|
-
|
28
|
-
# Get debug active flag
|
29
|
-
def active
|
30
|
-
@active
|
31
|
-
end
|
32
|
-
|
33
|
-
|
34
|
-
# Set debug destination
|
35
|
-
#
|
36
|
-
# @param destination Save log to file or print
|
37
|
-
#
|
38
|
-
def destination= destination
|
39
|
-
@destination = destination
|
40
|
-
end
|
41
|
-
|
42
|
-
|
43
|
-
# Get debug destination
|
44
|
-
#
|
45
|
-
def destination
|
46
|
-
@destination
|
47
|
-
end
|
48
|
-
|
49
|
-
|
50
|
-
# Set debug flag to save response HTML to file
|
51
|
-
#
|
52
|
-
# @param save_html Flag
|
53
|
-
def save_html= save_html
|
54
|
-
@save_html = save_html
|
55
|
-
end
|
56
|
-
|
57
|
-
|
58
|
-
# Get debug flag to save response HTML to file
|
59
|
-
#
|
60
|
-
def save_html
|
61
|
-
@save_html
|
62
|
-
end
|
63
|
-
|
64
|
-
|
65
18
|
# Save log information
|
66
19
|
#
|
67
20
|
# @param message Message body
|
68
21
|
#
|
69
|
-
def save
|
22
|
+
def save(message)
|
70
23
|
message = "TG | #{Time.now.strftime('%Y%m%d-%H%M%S')} | #{message}"
|
71
24
|
case @destination
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
25
|
+
when :file
|
26
|
+
save_to_file message
|
27
|
+
when :print
|
28
|
+
p message
|
76
29
|
end
|
77
30
|
end
|
78
31
|
|
79
|
-
|
80
32
|
# Save log information to file
|
81
33
|
#
|
82
34
|
# @param message Message body
|
83
35
|
#
|
84
|
-
def save_to_file
|
36
|
+
def save_to_file(message)
|
37
|
+
# Encode message for correct Unix encoding
|
38
|
+
message = message.force_encoding('utf-8')
|
85
39
|
debug_path = "#{Dir.pwd}/log"
|
86
|
-
Dir.mkdir(debug_path,
|
40
|
+
Dir.mkdir(debug_path, 0o775) unless File.exist? debug_path
|
87
41
|
filename = "#{Time.now.strftime('%Y%m%d')}.log"
|
88
42
|
File.open("#{debug_path}/#{filename}", 'a+') { |f| f << "#{message}\r\n" }
|
89
43
|
end
|
90
|
-
end
|
44
|
+
end
|
data/lib/tiny_grabber/http.rb
CHANGED
@@ -4,29 +4,27 @@ require 'nokogiri'
|
|
4
4
|
module Net
|
5
5
|
# Success response class
|
6
6
|
class HTTPOK
|
7
|
-
|
8
7
|
# Nokogiri object of response
|
9
8
|
#
|
10
9
|
def ng
|
11
|
-
Nokogiri::HTML(
|
10
|
+
Nokogiri::HTML(body)
|
12
11
|
end
|
13
12
|
|
14
13
|
# Response Cookies
|
15
14
|
#
|
16
15
|
def cookies
|
17
|
-
cookies =
|
16
|
+
cookies = get_fields('set-cookie')
|
18
17
|
if cookies
|
19
18
|
cookies.map { |cookie| cookie.gsub(/\A([^;]+).*\Z/, '\1') }.join('&')
|
20
|
-
else
|
21
|
-
nil
|
22
19
|
end
|
23
20
|
end
|
24
21
|
|
25
|
-
|
26
22
|
# Response Headers
|
27
23
|
#
|
28
24
|
def headers
|
29
|
-
|
25
|
+
header.to_hash.each_with_object({}) do |header_key, header_value|
|
26
|
+
header_value[header_key] = header_value.first
|
27
|
+
end
|
30
28
|
end
|
31
29
|
end
|
32
|
-
end
|
30
|
+
end
|
data/lib/tiny_grabber/version.rb
CHANGED
data/lib/tiny_grabber.rb
CHANGED
@@ -10,17 +10,15 @@ require 'tiny_grabber/http'
|
|
10
10
|
# Main class for TinyGrabber
|
11
11
|
#
|
12
12
|
class TinyGrabber
|
13
|
-
|
14
13
|
# Initialize a new TinyGrabber user agent.
|
15
14
|
#
|
16
15
|
def initialize
|
17
16
|
@agent = TinyGrabber::Agent.new
|
18
17
|
end
|
19
18
|
|
20
|
-
|
21
19
|
# Singleton > Initialize a new TinyGrabber user agent.
|
22
20
|
#
|
23
|
-
def self.initialize
|
21
|
+
def self.initialize(config = {})
|
24
22
|
@agent = TinyGrabber::Agent.new
|
25
23
|
|
26
24
|
@agent.debug = config[:debug] if config[:debug]
|
@@ -32,13 +30,12 @@ class TinyGrabber
|
|
32
30
|
@agent.cookies = config[:cookies] if config[:cookies]
|
33
31
|
end
|
34
32
|
|
35
|
-
|
36
33
|
# HTTP::GET request
|
37
34
|
#
|
38
35
|
# @param url Resource link
|
39
36
|
# @param headers Request header
|
40
37
|
#
|
41
|
-
def get
|
38
|
+
def get(url, headers = {})
|
42
39
|
@agent.fetch url, :get, headers
|
43
40
|
end
|
44
41
|
|
@@ -47,129 +44,115 @@ class TinyGrabber
|
|
47
44
|
# @param url Resource link
|
48
45
|
# @param headers Request header
|
49
46
|
#
|
50
|
-
def self.get
|
51
|
-
initialize
|
47
|
+
def self.get(url, config = {})
|
48
|
+
initialize config
|
52
49
|
@agent.fetch url, :get
|
53
50
|
end
|
54
51
|
|
55
|
-
|
56
52
|
# HTTP::POST request
|
57
53
|
#
|
58
54
|
# @param url Resource link
|
59
55
|
# @param params Request post data
|
60
56
|
# @param headers Request header
|
61
57
|
#
|
62
|
-
def post
|
58
|
+
def post(url, params = {}, headers = {})
|
63
59
|
@agent.fetch url, :post, headers, params
|
64
60
|
end
|
65
61
|
|
66
|
-
|
67
62
|
# Singleton > HTTP::POST request
|
68
63
|
#
|
69
64
|
# @param url Resource link
|
70
65
|
# @param headers Request header
|
71
66
|
#
|
72
|
-
def self.post
|
73
|
-
initialize
|
67
|
+
def self.post(url, params = {}, config = {})
|
68
|
+
initialize config
|
74
69
|
@agent.fetch url, :post, {}, params
|
75
70
|
end
|
76
71
|
|
77
|
-
|
78
72
|
# Set DEBUG flag
|
79
73
|
#
|
80
74
|
# @param debug Flag to start debug
|
81
75
|
#
|
82
|
-
def debug=
|
76
|
+
def debug=(debug)
|
83
77
|
@agent.debug = debug
|
84
78
|
end
|
85
79
|
|
86
|
-
|
87
80
|
# Read READ_TIMEOUT agent attribute
|
88
81
|
#
|
89
82
|
def read_timeout
|
90
83
|
@agent.read_timeout
|
91
84
|
end
|
92
85
|
|
93
|
-
|
94
86
|
# Set READ_TIMEOUT agent attribute
|
95
87
|
#
|
96
88
|
# @param read_timeout Waiting time to reading
|
97
89
|
#
|
98
|
-
def read_timeout=
|
90
|
+
def read_timeout=(read_timeout)
|
99
91
|
@agent.read_timeout = read_timeout
|
100
92
|
end
|
101
93
|
|
102
|
-
|
103
94
|
# Read USER_AGENT agent attribute
|
104
95
|
#
|
105
96
|
def user_agent
|
106
97
|
@agent.user_agent
|
107
98
|
end
|
108
99
|
|
109
|
-
|
110
100
|
# Set USER_AGENT agent attribute
|
111
101
|
#
|
112
102
|
# @param user_agent Web browser name
|
113
103
|
#
|
114
|
-
def user_agent=
|
104
|
+
def user_agent=(user_agent)
|
115
105
|
@agent.user_agent = user_agent
|
116
106
|
end
|
117
107
|
|
118
|
-
|
119
108
|
# Read PROXY agent attribute
|
120
109
|
#
|
121
110
|
def proxy
|
122
111
|
@agent.proxy
|
123
112
|
end
|
124
113
|
|
125
|
-
|
126
114
|
# Set PROXY agent attribute
|
127
115
|
#
|
128
116
|
# @param proxy Proxy configuration
|
129
117
|
#
|
130
|
-
def proxy=
|
118
|
+
def proxy=(proxy)
|
131
119
|
@agent.proxy = proxy
|
132
120
|
end
|
133
121
|
|
134
|
-
|
135
122
|
# Set BASIC_AUTH agent attribute
|
136
123
|
#
|
137
124
|
# @param username Authentication username
|
138
125
|
# @param password Authentication password
|
139
126
|
#
|
140
|
-
def basic_auth
|
127
|
+
def basic_auth(username, password)
|
141
128
|
@agent.basic_auth = { username: username, password: password }
|
142
129
|
end
|
143
130
|
|
144
|
-
|
145
131
|
# Read HEADERS agent attribute
|
146
132
|
#
|
147
133
|
def headers
|
148
134
|
@agent.headers
|
149
135
|
end
|
150
136
|
|
151
|
-
|
152
137
|
# Set HEADERS agent attribute
|
153
138
|
#
|
154
139
|
# @param headers Request headers
|
155
140
|
#
|
156
|
-
def headers=
|
141
|
+
def headers=(headers)
|
157
142
|
@agent.headers = headers
|
158
143
|
end
|
159
144
|
|
160
|
-
|
161
145
|
# Read COOKIES agent attribute
|
162
146
|
#
|
163
147
|
def cookies
|
164
148
|
@agent.cookies
|
165
149
|
end
|
166
150
|
|
167
|
-
|
168
151
|
# Set COOKIES agent attribute
|
169
152
|
#
|
170
153
|
# @param cookies Request cookies
|
171
154
|
#
|
172
|
-
def cookies=
|
155
|
+
def cookies=(cookies)
|
173
156
|
@agent.cookies = cookies
|
174
157
|
end
|
175
158
|
|
@@ -179,12 +162,19 @@ class TinyGrabber
|
|
179
162
|
@agent.reset
|
180
163
|
end
|
181
164
|
|
182
|
-
|
183
165
|
# Set verify_mode
|
184
166
|
#
|
185
167
|
# @param verify_mode SSL verify mode
|
186
168
|
#
|
187
|
-
def verify_mode=
|
169
|
+
def verify_mode=(verify_mode)
|
188
170
|
@agent.verify_mode = verify_mode
|
189
171
|
end
|
190
|
-
|
172
|
+
|
173
|
+
# Set follow_location
|
174
|
+
#
|
175
|
+
# @param follow_location Follow location flag
|
176
|
+
#
|
177
|
+
def follow_location=(follow_location)
|
178
|
+
@agent.follow_location = follow_location
|
179
|
+
end
|
180
|
+
end
|
data/tiny_grabber.gemspec
CHANGED
@@ -6,11 +6,11 @@ require 'tiny_grabber/version'
|
|
6
6
|
Gem::Specification.new do |spec|
|
7
7
|
spec.name = 'tiny_grabber'
|
8
8
|
spec.version = TinyGrabber::VERSION
|
9
|
-
spec.authors = [
|
9
|
+
spec.authors = ['Aleksandr Chernyshev']
|
10
10
|
spec.email = ['moroznoeytpo@gmail.com']
|
11
11
|
|
12
|
-
spec.summary =
|
13
|
-
spec.description =
|
12
|
+
spec.summary = 'Tiny grabber'
|
13
|
+
spec.description = 'Simple gem for grabbing remote web page.'
|
14
14
|
spec.homepage = 'https://github.com/moroznoeytpo/tiny_grabber'
|
15
15
|
spec.license = 'MIT'
|
16
16
|
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: tiny_grabber
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Aleksandr Chernyshev
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-
|
11
|
+
date: 2016-09-12 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: socksify
|