http_crawler 0.3.0.0 → 0.3.0.1
This diff shows the changes between two publicly released versions of this package, as they appear in their public registry. It is provided for informational purposes only.
- checksums.yaml +4 -4
- data/http_crawler.gemspec +1 -1
- data/lib/http_crawler/client.rb +63 -14
- data/lib/http_crawler/http/response.rb +22 -1
- data/lib/http_crawler/proxy/test_proxy_api/client.rb +0 -1
- data/lib/http_crawler/proxy.rb +4 -2
- data/lib/http_crawler/version.rb +1 -1
- data/lib/http_crawler.rb +3 -0
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 3d3f0a5acf0654b1ccba309c40a7153866850ce8f5be508e06cf587f23cf8a1d
+  data.tar.gz: 38ca43bafdecec27eb57078de5b07dcc07a0d30b75807a44895eb7441c0860ff
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 98d2057ce312a8beef8508ef411f2e1d3b7f65c0588c19f3dffc1a71f1c85a259e812e8f7fb48b66f0b25120a012eccafaba096e4d1ca1647e3e7fdb790fd6d8
+  data.tar.gz: 6f8b27afbf39767e1484ff62247e007fb666a8d49d6893968be85a93925b8b2f7207ef06e7843902d08d270af3041320f9b70e4f38248108841c4d1be3359b6c
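These sums can be reproduced from the published archive. A minimal Ruby sketch, assuming http_crawler-0.3.0.1.gem has already been fetched (e.g. via gem fetch http_crawler -v 0.3.0.1) and unpacked so that metadata.gz and data.tar.gz sit in the current directory:

    require 'digest'

    # A .gem file is a tar archive whose members include metadata.gz and
    # data.tar.gz; hash each member and compare against checksums.yaml.
    puts Digest::SHA256.file("metadata.gz").hexdigest
    puts Digest::SHA512.file("data.tar.gz").hexdigest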
data/http_crawler.gemspec
CHANGED
@@ -10,7 +10,7 @@ Gem::Specification.new do |spec|
   spec.email = ["1336098842@qq.com"]

   spec.summary = %q{http 爬虫。}
-  spec.description = %q{初级开发工程师,基于
+  spec.description = %q{初级开发工程师,基于 http 写的爬虫扩展包。}
   spec.homepage = "https://rubygems.org/gems/http_crawler"
   spec.license = "MIT"

The previously truncated description string is completed; in English it reads roughly "a crawler extension package based on http, written by a junior development engineer" (the summary is "http crawler.").
data/lib/http_crawler/client.rb
CHANGED
@@ -23,8 +23,14 @@ module HttpCrawler
       end
     end

-    attr_reader :uri

+    attr_accessor :max_error_num
+    # Maximum number of retries after an error
+    def max_error_num
+      @max_error_num ||= 1
+    end
+
+    attr_reader :uri
     # init_uri raises an error if @uri has not been initialized
     # Subclasses must implement @uri = URI("http://host")
     #
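The new accessor is paired with a hand-written reader, so the retry limit is writable but lazily defaults to 1. The same pattern in isolation (the Example class is illustrative, not part of the gem):

    class Example
      attr_accessor :max_error_num

      # The explicit reader shadows the one generated by attr_accessor
      # and supplies a default on first access.
      def max_error_num
        @max_error_num ||= 1
      end
    end

    e = Example.new
    e.max_error_num       # => 1 (lazy default)
    e.max_error_num = 5
    e.max_error_num       # => 5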
@@ -34,9 +40,9 @@ module HttpCrawler

     # Initialize the timeouts
     def init_timeout
-      @connect_time =
-      @write_time =
-      @read_time =
+      @connect_time = 3
+      @write_time = 3
+      @read_time = 3
     end

     # Initialize the SSL protocol
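The hardcoded 3-second defaults presumably feed the http gem's timeout chain; the wiring is outside this hunk, but the standalone equivalent of the new values would be:

    require 'http'

    # Equivalent standalone call with the new defaults; how Client passes
    # @connect_time/@write_time/@read_time into this chain is not shown here.
    HTTP.timeout(connect: 3, write: 3, read: 3).get("http://example.com")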
@@ -169,16 +175,6 @@ module HttpCrawler
       h
     end

-    # Send a GET request
-    def get(path, params = {})
-      http.get((@uri + path).to_s, :params => params, :ssl_context => @ctx)
-    end
-
-    # Send a POST request
-    def post(path, params = {})
-      http.post((@uri + path).to_s, :form => params, :ssl_context => @ctx)
-    end
-
     #
     # init_uri raises an error if @uri has not been initialized
     # Subclasses must redefine init_uri
@@ -200,6 +196,59 @@ module HttpCrawler
       @proxy_params = {key: "#{self.class}"}
     end

+    # Send a GET request
+    def get(path, params = {})
+      request { http.get((@uri + path).to_s, :params => params, :ssl_context => @ctx) }
+    end
+
+    # Send a POST request
+    def post(path, params = {})
+      request { http.post((@uri + path).to_s, :form => params, :ssl_context => @ctx) }
+    end
+
+    # The response to the most recent request
+    attr_accessor :response
+    protected :response=
+
+    # Switch the proxy if a captcha appears
+    def validation_to_proxy?(r = response)
+      # Check whether a captcha page was returned
+      if r.validation_page?
+        # The captcha triggers a proxy switch
+        self.update_proxy?
+        # Handled successfully
+        return true
+      else
+        return false
+      end
+    end
+
+    protected
+
+
+    # Send a request
+    def request(&block)
+      raise "A block must be given" unless block_given?
+      n = max_error_num
+      begin
+        block.call
+      rescue HTTP::TimeoutError => error
+        # Switch the proxy on a timeout error
+        if self.update_proxy?
+          retry
+        else
+          raise error
+        end
+      rescue => error
+        # Remaining retry attempts
+        if n <= 0
+          raise error
+        else
+          n -= 1
+          retry
+        end
+      end
+    end
   end
 end

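With this hunk, get and post funnel through the protected request wrapper: generic errors are retried up to max_error_num times, and timeouts trigger a proxy switch via update_proxy?. A sketch of the resulting call pattern; MyClient, its URL, and the zero-argument constructor are assumptions, with init_uri being the hook the base class documents above:

    require 'http_crawler'

    # Illustrative subclass; the base class requires init_uri to set @uri.
    class MyClient < HttpCrawler::Client
      def init_uri
        @uri = URI("http://example.com")
      end
    end

    client = MyClient.new
    client.max_error_num = 3   # retry generic errors up to 3 times (default 1)
    response = client.get("/search", q: "ruby")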
data/lib/http_crawler/http/response.rb
CHANGED
@@ -3,7 +3,7 @@ module HTTP

     # Decompress and transcode the body data
     def decoding_body
-      @decoding_body ||= self.to_s
+      @decoding_body ||= self.body.to_s
     end

     # def decoding_body
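Because the method memoizes into @decoding_body, the body is decoded once per response and repeated calls return the cached string (continuing the illustrative client from above):

    response = client.get("/")
    response.decoding_body    # decodes and caches the body string
    response.decoding_body.equal?(response.decoding_body)  # => true, same object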
@@ -48,5 +48,26 @@ module HTTP
       return time
     end

+
+    # Captcha detection
+    attr_accessor :validations
+
+    def validations
+      @validations ||= []
+    end
+
+    # Is this a captcha page?
+    def validation_page?
+      # Match the decoded body against every pattern in the validations array
+      validations.each do |regular|
+        if decoding_body[regular]
+          Rails.logger.warn("Captcha validation triggered")
+          Rails.logger.warn(decoding_body[(decoding_body =~ regular)..100])
+          return true
+        end
+      end
+      return false
+    end
+
   end # class Net::HTTPResponse
 end
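validations starts out empty, so validation_page? matches nothing until patterns are supplied; note also that it logs through Rails.logger, which assumes a Rails environment. A hedged sketch of wiring it together with validation_to_proxy? from client.rb (the captcha regex is an illustrative guess, not one shipped with the gem):

    response = client.get("/")
    # Patterns the decoded body is scanned for; empty by default.
    response.validations = [/captcha|verify/i]

    if client.validation_to_proxy?(response)
      # A captcha page was detected and update_proxy? fired; retry the call.
      response = client.get("/")
    end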
data/lib/http_crawler/proxy.rb
CHANGED
data/lib/http_crawler/version.rb
CHANGED
data/lib/http_crawler.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: http_crawler
 version: !ruby/object:Gem::Version
-  version: 0.3.0.0
+  version: 0.3.0.1
 platform: ruby
 authors:
 - jagger
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2019-02-
+date: 2019-02-17 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rspec
@@ -108,7 +108,7 @@ dependencies:
     - - "~>"
       - !ruby/object:Gem::Version
         version: 0.2.1
-description: 初级开发工程师,基于
+description: 初级开发工程师,基于 http 写的爬虫扩展包。
 email:
 - 1336098842@qq.com
 executables: []