http_crawler 0.3.0.0 → 0.3.0.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 870f70e609b513a6bdf5176e1ab2960d19e7b4f04d60c746845e6e27343288c7
4
- data.tar.gz: 6706312c08462cb4dee0f8c2eac15b7882131832dc49f6cb336032ffdc0ab68d
3
+ metadata.gz: 3d3f0a5acf0654b1ccba309c40a7153866850ce8f5be508e06cf587f23cf8a1d
4
+ data.tar.gz: 38ca43bafdecec27eb57078de5b07dcc07a0d30b75807a44895eb7441c0860ff
5
5
  SHA512:
6
- metadata.gz: 3c47791191aca7f3065eee2c9e223e3196f0890a4da8cd4bf8dee697eae18133758ac3225d49503d6ece004bc55474bbd9486b140b2d1588c92ee23d8b06663d
7
- data.tar.gz: ff4da1b65de1431b9e6aa800e63d9e0b4c97ee942b68760136deb3bd819c08fc82504159e1a1ea14b6368458e8fa6a7568521a12d0eff259c637e2092791ea6f
6
+ metadata.gz: 98d2057ce312a8beef8508ef411f2e1d3b7f65c0588c19f3dffc1a71f1c85a259e812e8f7fb48b66f0b25120a012eccafaba096e4d1ca1647e3e7fdb790fd6d8
7
+ data.tar.gz: 6f8b27afbf39767e1484ff62247e007fb666a8d49d6893968be85a93925b8b2f7207ef06e7843902d08d270af3041320f9b70e4f38248108841c4d1be3359b6c
data/http_crawler.gemspec CHANGED
@@ -10,7 +10,7 @@ Gem::Specification.new do |spec|
10
10
  spec.email = ["1336098842@qq.com"]
11
11
 
12
12
  spec.summary = %q{http 爬虫。}
13
- spec.description = %q{初级开发工程师,基于net/http 写的爬虫扩展包。}
13
+ spec.description = %q{初级开发工程师,基于 http 写的爬虫扩展包。}
14
14
  spec.homepage = "https://rubygems.org/gems/http_crawler"
15
15
  spec.license = "MIT"
16
16
 
@@ -23,8 +23,14 @@ module HttpCrawler
23
23
  end
24
24
  end
25
25
 
26
- attr_reader :uri
27
26
 
27
# Maximum number of retries after a request error (consumed by #request).
#
# Fix: the original declared `attr_accessor :max_error_num`, but the
# generated reader was immediately shadowed by the custom reader below.
# `attr_writer` keeps the same external interface (public writer + public
# reader with a default) without the dead generated method.
attr_writer :max_error_num

# Defaults to 1 retry when never assigned.
def max_error_num
  @max_error_num ||= 1
end
32
+
33
+ attr_reader :uri
28
34
  # init_uri 如果未初始化@uri,则会报错
29
35
  # 继承类需要实现 @uri = URI("http://host")
30
36
  #
@@ -34,9 +40,9 @@ module HttpCrawler
34
40
 
35
41
# Initialise the per-phase network timeouts, in seconds
# (connect / write / read), used when building the HTTP client.
def init_timeout
  @connect_time, @write_time, @read_time = 3, 3, 3
end
41
47
 
42
48
  # 初始化 ssl 协议
@@ -169,16 +175,6 @@ module HttpCrawler
169
175
  h
170
176
  end
171
177
 
172
- # 发送 get 请求
173
- def get(path, params = {})
174
- http.get((@uri + path).to_s, :params => params, :ssl_context => @ctx)
175
- end
176
-
177
- # 发送 post 请求
178
- def post(path, params = {})
179
- http.post((@uri + path).to_s, :form => params, :ssl_context => @ctx)
180
- end
181
-
182
178
  #
183
179
  # init_uri 如果未初始化@uri,则会报错
184
180
  # 继承类需要重定义 init_uri
@@ -200,6 +196,59 @@ module HttpCrawler
200
196
  @proxy_params = {key: "#{self.class}"}
201
197
  end
202
198
 
199
# Issue a GET request for +path+ (resolved against @uri), routed through
# #request so errors/timeouts get the retry handling.
def get(path, params = {})
  request { http.get((@uri + path).to_s, params: params, ssl_context: @ctx) }
end

# Issue a POST request for +path+ with form-encoded +params+, routed
# through #request for retry handling.
def post(path, params = {})
  request { http.post((@uri + path).to_s, form: params, ssl_context: @ctx) }
end
208
+
209
# The response of the most recent request.
attr_accessor :response
# Writer is protected: only the client (and peers) may record a response.
protected :response=
212
+
213
# If the response +r+ is a verification (captcha) page, trigger a proxy
# switch. Returns true when a captcha was detected (and the switch was
# attempted), false otherwise.
def validation_to_proxy?(r = response)
  # No captcha page: nothing to do.
  return false unless r.validation_page?
  # Captcha detected — switch to another proxy.
  update_proxy?
  true
end
225
+
226
protected

# Run the given request block, retrying on failure.
#
# - HTTP::TimeoutError: switch proxy via #update_proxy? and retry; if no
#   proxy switch is possible, re-raise the timeout error.
# - Any other StandardError: retry up to #max_error_num times, then re-raise.
#
# Fix: the original timeout branch did `rescue HTTP::TimeoutError` without
# `=> error`, yet executed `raise error` — which raised NameError/NoMethodError
# instead of re-raising the timeout. The exception is now bound.
#
# NOTE(review): timeout retries are unbounded as long as update_proxy?
# keeps returning truthy — confirm this is intended.
def request(&block)
  raise "必须定义块" unless block_given?
  n = max_error_num
  begin
    block.call
  rescue HTTP::TimeoutError => error
    # Timeout: try the request again through a fresh proxy.
    raise error unless self.update_proxy?
    retry
  rescue => error
    # Bounded retry for all other errors.
    raise error if n <= 0
    n -= 1
    retry
  end
end
203
252
  end
204
253
  end
205
254
 
@@ -3,7 +3,7 @@ module HTTP
3
3
 
4
4
# Decompress/transcode the response body and memoise it as a String.
def decoding_body
  @decoding_body ||= body.to_s
end
8
8
 
9
9
  # def decoding_body
@@ -48,5 +48,26 @@ module HTTP
48
48
  return time
49
49
  end
50
50
 
51
+
52
# Captcha / verification-page detection.
#
# Fix: the original `attr_accessor :validations` generated a reader that was
# immediately shadowed by the custom reader below; attr_writer keeps the
# same external interface without the dead method.
attr_writer :validations

# Regexps that identify a verification (captcha) page in the body.
# Defaults to an empty list (no detection).
def validations
  @validations ||= []
end

# True if the decoded body matches any registered pattern.
#
# Fix: the original scanned the body twice per pattern
# (`decoding_body[regular]` then `decoding_body =~ regular`); a single
# `match` now provides both the hit test and the match offset.
#
# NOTE(review): Rails.logger is referenced, but Rails is not a declared
# dependency of this gem — outside a Rails app a match raises NameError.
# Consider an injectable logger.
def validation_page?
  validations.each do |pattern|
    match = pattern.match(decoding_body)
    next unless match
    Rails.logger.warn("触发验证信息")
    # Same slice as the original: from the match position up to absolute
    # index 100 of the body.
    Rails.logger.warn(decoding_body[match.begin(0)..100])
    return true
  end
  false
end
71
+
51
72
  end # class Net::HTTPResponse
52
73
  end
@@ -4,7 +4,6 @@ module HttpCrawler
4
4
  module TestProxyApi
5
5
  class Client
6
6
 
7
- include(HttpCrawler::Client)
8
7
  include(HttpCrawler::Proxy::Client)
9
8
 
10
9
 
@@ -1,8 +1,7 @@
1
-
2
-
3
1
  module HttpCrawler
4
2
  module Proxy
5
3
 
4
+ include(HttpCrawler::Client)
6
5
  class << self
7
6
 
8
7
  # 接收格式
@@ -15,6 +14,9 @@ module HttpCrawler
15
14
 
16
15
  end
17
16
 
17
# Maximum error-retry count for proxy clients; defaults to 0 (no retries).
# NOTE(review): presumably overrides the base client default of 1 when this
# module is mixed in — confirm against HttpCrawler::Client#max_error_num.
def max_error_num
  @max_error_num ||= 0
end
18
20
 
19
21
  end
20
22
  end
@@ -1,3 +1,3 @@
1
1
  module HttpCrawler
2
- VERSION = "0.3.0.0"
2
+ VERSION = "0.3.0.1"
3
3
  end
data/lib/http_crawler.rb CHANGED
@@ -10,4 +10,7 @@ load 'http_crawler/proxy.rb'
10
10
 
11
11
  module HttpCrawler
12
12
  # Your code goes here...
13
# NOTE(review): looks like leftover debug/scratch code released with the
# gem — prints a meaningless literal and has a non-descriptive name.
# Consider removing in the next version.
def self.a
  puts "112"
end
13
16
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: http_crawler
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.0.0
4
+ version: 0.3.0.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - jagger
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2019-02-16 00:00:00.000000000 Z
11
+ date: 2019-02-17 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rspec
@@ -108,7 +108,7 @@ dependencies:
108
108
  - - "~>"
109
109
  - !ruby/object:Gem::Version
110
110
  version: 0.2.1
111
- description: 初级开发工程师,基于net/http 写的爬虫扩展包。
111
+ description: 初级开发工程师,基于 http 写的爬虫扩展包。
112
112
  email:
113
113
  - 1336098842@qq.com
114
114
  executables: []