http_crawler 0.3.0.0 → 0.3.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/http_crawler.gemspec +1 -1
- data/lib/http_crawler/client.rb +63 -14
- data/lib/http_crawler/http/response.rb +22 -1
- data/lib/http_crawler/proxy/test_proxy_api/client.rb +0 -1
- data/lib/http_crawler/proxy.rb +4 -2
- data/lib/http_crawler/version.rb +1 -1
- data/lib/http_crawler.rb +3 -0
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 3d3f0a5acf0654b1ccba309c40a7153866850ce8f5be508e06cf587f23cf8a1d
|
4
|
+
data.tar.gz: 38ca43bafdecec27eb57078de5b07dcc07a0d30b75807a44895eb7441c0860ff
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 98d2057ce312a8beef8508ef411f2e1d3b7f65c0588c19f3dffc1a71f1c85a259e812e8f7fb48b66f0b25120a012eccafaba096e4d1ca1647e3e7fdb790fd6d8
|
7
|
+
data.tar.gz: 6f8b27afbf39767e1484ff62247e007fb666a8d49d6893968be85a93925b8b2f7207ef06e7843902d08d270af3041320f9b70e4f38248108841c4d1be3359b6c
|
data/http_crawler.gemspec
CHANGED
@@ -10,7 +10,7 @@ Gem::Specification.new do |spec|
|
|
10
10
|
spec.email = ["1336098842@qq.com"]
|
11
11
|
|
12
12
|
spec.summary = %q{http 爬虫。}
|
13
|
-
spec.description = %q{初级开发工程师,基于
|
13
|
+
spec.description = %q{初级开发工程师,基于 http 写的爬虫扩展包。}
|
14
14
|
spec.homepage = "https://rubygems.org/gems/http_crawler"
|
15
15
|
spec.license = "MIT"
|
16
16
|
|
data/lib/http_crawler/client.rb
CHANGED
@@ -23,8 +23,14 @@ module HttpCrawler
|
|
23
23
|
end
|
24
24
|
end
|
25
25
|
|
26
|
-
attr_reader :uri
|
27
26
|
|
27
|
+
attr_accessor :max_error_num
|
28
|
+
# 最大错误重试次数
|
29
|
+
def max_error_num
|
30
|
+
@max_error_num ||= 1
|
31
|
+
end
|
32
|
+
|
33
|
+
attr_reader :uri
|
28
34
|
# init_uri 如果未初始化@uri,则会报错
|
29
35
|
# 继承类需要实现 @uri = URI("http://host")
|
30
36
|
#
|
@@ -34,9 +40,9 @@ module HttpCrawler
|
|
34
40
|
|
35
41
|
# 初始化超时时间
|
36
42
|
def init_timeout
|
37
|
-
@connect_time =
|
38
|
-
@write_time =
|
39
|
-
@read_time =
|
43
|
+
@connect_time = 3
|
44
|
+
@write_time = 3
|
45
|
+
@read_time = 3
|
40
46
|
end
|
41
47
|
|
42
48
|
# 初始化 ssl 协议
|
@@ -169,16 +175,6 @@ module HttpCrawler
|
|
169
175
|
h
|
170
176
|
end
|
171
177
|
|
172
|
-
# 发送 get 请求
|
173
|
-
def get(path, params = {})
|
174
|
-
http.get((@uri + path).to_s, :params => params, :ssl_context => @ctx)
|
175
|
-
end
|
176
|
-
|
177
|
-
# 发送 post 请求
|
178
|
-
def post(path, params = {})
|
179
|
-
http.post((@uri + path).to_s, :form => params, :ssl_context => @ctx)
|
180
|
-
end
|
181
|
-
|
182
178
|
#
|
183
179
|
# init_uri 如果未初始化@uri,则会报错
|
184
180
|
# 继承类需要重定义 init_uri
|
@@ -200,6 +196,59 @@ module HttpCrawler
|
|
200
196
|
@proxy_params = {key: "#{self.class}"}
|
201
197
|
end
|
202
198
|
|
199
|
+
# 发送 get 请求
|
200
|
+
def get(path, params = {})
|
201
|
+
request {http.get((@uri + path).to_s, :params => params, :ssl_context => @ctx)}
|
202
|
+
end
|
203
|
+
|
204
|
+
# 发送 post 请求
|
205
|
+
def post(path, params = {})
|
206
|
+
request {http.post((@uri + path).to_s, :form => params, :ssl_context => @ctx)}
|
207
|
+
end
|
208
|
+
|
209
|
+
# 请求的响应
|
210
|
+
attr_accessor :response
|
211
|
+
protected :response=
|
212
|
+
|
213
|
+
# 出现如果验证码,切换代理
|
214
|
+
def validation_to_proxy?(r = response)
|
215
|
+
# 判断是否出现验证码
|
216
|
+
if r.validation_page?
|
217
|
+
# 触发验证码切换代理
|
218
|
+
self.update_proxy?
|
219
|
+
# 成功处理
|
220
|
+
return true
|
221
|
+
else
|
222
|
+
return false
|
223
|
+
end
|
224
|
+
end
|
225
|
+
|
226
|
+
protected
|
227
|
+
|
228
|
+
|
229
|
+
# 发送请求
|
230
|
+
def request(&block)
|
231
|
+
raise "必须定义块" unless block_given?
|
232
|
+
n = max_error_num
|
233
|
+
begin
|
234
|
+
block.call
|
235
|
+
rescue HTTP::TimeoutError
|
236
|
+
# 超时错误切换代理
|
237
|
+
if self.update_proxy?
|
238
|
+
retry
|
239
|
+
else
|
240
|
+
raise error
|
241
|
+
end
|
242
|
+
rescue => error
|
243
|
+
# 错误尝试次数
|
244
|
+
if n <= 0
|
245
|
+
raise error
|
246
|
+
else
|
247
|
+
n -= 1
|
248
|
+
retry
|
249
|
+
end
|
250
|
+
end
|
251
|
+
end
|
203
252
|
end
|
204
253
|
end
|
205
254
|
|
@@ -3,7 +3,7 @@ module HTTP
|
|
3
3
|
|
4
4
|
# 解压并转码 body 数据
|
5
5
|
def decoding_body
|
6
|
-
@decoding_body ||= self.to_s
|
6
|
+
@decoding_body ||= self.body.to_s
|
7
7
|
end
|
8
8
|
|
9
9
|
# def decoding_body
|
@@ -48,5 +48,26 @@ module HTTP
|
|
48
48
|
return time
|
49
49
|
end
|
50
50
|
|
51
|
+
|
52
|
+
# 验证码判断
|
53
|
+
attr_accessor :validations
|
54
|
+
|
55
|
+
def validations
|
56
|
+
@validations ||= []
|
57
|
+
end
|
58
|
+
|
59
|
+
# 是否验证码界面
|
60
|
+
def validation_page?
|
61
|
+
# 正则匹配数组 validations 的所有匹配值
|
62
|
+
validations.each do |regular|
|
63
|
+
if decoding_body[regular]
|
64
|
+
Rails.logger.warn("触发验证信息")
|
65
|
+
Rails.logger.warn(decoding_body[(decoding_body =~ regular)..100])
|
66
|
+
return true
|
67
|
+
end
|
68
|
+
end
|
69
|
+
return false
|
70
|
+
end
|
71
|
+
|
51
72
|
end # class Net::HTTPResponse
|
52
73
|
end
|
data/lib/http_crawler/proxy.rb
CHANGED
data/lib/http_crawler/version.rb
CHANGED
data/lib/http_crawler.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: http_crawler
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.0.0
|
4
|
+
version: 0.3.0.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- jagger
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2019-02-
|
11
|
+
date: 2019-02-17 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rspec
|
@@ -108,7 +108,7 @@ dependencies:
|
|
108
108
|
- - "~>"
|
109
109
|
- !ruby/object:Gem::Version
|
110
110
|
version: 0.2.1
|
111
|
-
description: 初级开发工程师,基于
|
111
|
+
description: 初级开发工程师,基于 http 写的爬虫扩展包。
|
112
112
|
email:
|
113
113
|
- 1336098842@qq.com
|
114
114
|
executables: []
|