http_crawler 0.3.0.0 → 0.3.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: 870f70e609b513a6bdf5176e1ab2960d19e7b4f04d60c746845e6e27343288c7
- data.tar.gz: 6706312c08462cb4dee0f8c2eac15b7882131832dc49f6cb336032ffdc0ab68d
+ metadata.gz: 3d3f0a5acf0654b1ccba309c40a7153866850ce8f5be508e06cf587f23cf8a1d
+ data.tar.gz: 38ca43bafdecec27eb57078de5b07dcc07a0d30b75807a44895eb7441c0860ff
  SHA512:
- metadata.gz: 3c47791191aca7f3065eee2c9e223e3196f0890a4da8cd4bf8dee697eae18133758ac3225d49503d6ece004bc55474bbd9486b140b2d1588c92ee23d8b06663d
- data.tar.gz: ff4da1b65de1431b9e6aa800e63d9e0b4c97ee942b68760136deb3bd819c08fc82504159e1a1ea14b6368458e8fa6a7568521a12d0eff259c637e2092791ea6f
+ metadata.gz: 98d2057ce312a8beef8508ef411f2e1d3b7f65c0588c19f3dffc1a71f1c85a259e812e8f7fb48b66f0b25120a012eccafaba096e4d1ca1647e3e7fdb790fd6d8
+ data.tar.gz: 6f8b27afbf39767e1484ff62247e007fb666a8d49d6893968be85a93925b8b2f7207ef06e7843902d08d270af3041320f9b70e4f38248108841c4d1be3359b6c
data/http_crawler.gemspec CHANGED
@@ -10,7 +10,7 @@ Gem::Specification.new do |spec|
  spec.email = ["1336098842@qq.com"]

  spec.summary = %q{An http crawler.}
- spec.description = %q{A crawler extension package based on net/http, written by a junior developer.}
+ spec.description = %q{A crawler extension package based on http, written by a junior developer.}
  spec.homepage = "https://rubygems.org/gems/http_crawler"
  spec.license = "MIT"

@@ -23,8 +23,14 @@ module HttpCrawler
  end
  end

- attr_reader :uri

+ attr_accessor :max_error_num
+ # Maximum number of retries after an error
+ def max_error_num
+ @max_error_num ||= 1
+ end
+
+ attr_reader :uri
  # init_uri raises an error if @uri has not been initialized
  # Subclasses must implement @uri = URI("http://host")
  #
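
The hunk above gives every client a retry budget: max_error_num defaults to 1 and can be changed through the new attr_accessor or by overriding the reader. A minimal sketch of a crawler client under those assumptions — the namespace, host, and retry count are hypothetical, and the include follows the mixin pattern the proxy code in this diff uses; only the @uri contract and max_error_num come from the hunk itself:

    require 'uri'
    require 'http_crawler'

    module HttpCrawler
      module Example                          # hypothetical namespace
        class Client
          include(HttpCrawler::Client)

          # The comments above require subclasses to set @uri.
          def init_uri
            @uri = URI("http://example.com")  # hypothetical host
          end

          # Optional override: retry failed requests three times instead of once.
          def max_error_num
            @max_error_num ||= 3
          end
        end
      end
    end
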
@@ -34,9 +40,9 @@ module HttpCrawler

  # Initialize timeouts
  def init_timeout
- @connect_time = 5
- @write_time = 2
- @read_time = 5
+ @connect_time = 3
+ @write_time = 3
+ @read_time = 3
  end

  # Initialize the SSL protocol
@@ -169,16 +175,6 @@ module HttpCrawler
  h
  end

- # Send a GET request
- def get(path, params = {})
- http.get((@uri + path).to_s, :params => params, :ssl_context => @ctx)
- end
-
- # Send a POST request
- def post(path, params = {})
- http.post((@uri + path).to_s, :form => params, :ssl_context => @ctx)
- end
-
  #
  # init_uri raises an error if @uri has not been initialized
  # Subclasses must redefine init_uri
@@ -200,6 +196,59 @@ module HttpCrawler
  @proxy_params = {key: "#{self.class}"}
  end

+ # Send a GET request
+ def get(path, params = {})
+ request {http.get((@uri + path).to_s, :params => params, :ssl_context => @ctx)}
+ end
+
+ # Send a POST request
+ def post(path, params = {})
+ request {http.post((@uri + path).to_s, :form => params, :ssl_context => @ctx)}
+ end
+
+ # The response from the request
+ attr_accessor :response
+ protected :response=
+
+ # If a captcha appears, switch the proxy
+ def validation_to_proxy?(r = response)
+ # Check whether a captcha appeared
+ if r.validation_page?
+ # Captcha triggered: switch the proxy
+ self.update_proxy?
+ # Handled successfully
+ return true
+ else
+ return false
+ end
+ end
+
+ protected
+
+
+ # Send the request
+ def request(&block)
+ raise "A block must be given" unless block_given?
+ n = max_error_num
+ begin
+ block.call
+ rescue HTTP::TimeoutError
+ # Switch the proxy on a timeout error
+ if self.update_proxy?
+ retry
+ else
+ raise error
+ end
+ rescue => error
+ # Error retry count
+ if n <= 0
+ raise error
+ else
+ n -= 1
+ retry
+ end
+ end
+ end
  end
  end
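
The relocated get and post now run inside request: ordinary exceptions are retried up to max_error_num times, and HTTP::TimeoutError asks update_proxy? (defined elsewhere in the client, outside this diff) for a fresh proxy before retrying. A hedged usage sketch, assuming the methods stay public at their new location; the client instance, paths, and parameters are hypothetical:

    # 'client' stands for an instance of a class that mixes in HttpCrawler::Client;
    # how it is constructed is not part of this diff.
    response = client.get("/search", keyword: "ruby")          # retried via request

    # post sends the params as form data through the same wrapper.
    response = client.post("/login", user: "demo", pass: "x")  # hypothetical form

    # If the page turned out to be a captcha, switch the proxy and fetch again.
    # (Returns false unless patterns were assigned to response.validations —
    # see the response hunk further down.)
    if client.validation_to_proxy?(response)
      response = client.get("/search", keyword: "ruby")
    end
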
@@ -3,7 +3,7 @@ module HTTP

  # Decompress and transcode the body data
  def decoding_body
- @decoding_body ||= self.to_s
+ @decoding_body ||= self.body.to_s
  end

  # def decoding_body
@@ -48,5 +48,26 @@ module HTTP
  return time
  end

+
+ # Captcha detection
+ attr_accessor :validations
+
+ def validations
+ @validations ||= []
+ end
+
+ # Is this a captcha page?
+ def validation_page?
+ # Match against every pattern in the validations array
+ validations.each do |regular|
+ if decoding_body[regular]
+ Rails.logger.warn("Captcha triggered")
+ Rails.logger.warn(decoding_body[(decoding_body =~ regular)..100])
+ return true
+ end
+ end
+ return false
+ end
+
  end # class Net::HTTPResponse
  end
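
The response object now carries its own captcha check: every pattern in validations is matched against decoding_body (read from self.body since the fix above), and a hit logs a warning through Rails.logger and returns true. A short sketch with hypothetical patterns:

    # Patterns are illustrative; use whatever identifies a captcha page.
    response.validations = [/captcha/i, /verification code/i]

    if response.validation_page?
      # A snippet of the matching body has already been logged as a warning.
      client.validation_to_proxy?(response)   # let the client switch proxies
    end
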
@@ -4,7 +4,6 @@ module HttpCrawler
  module TestProxyApi
  class Client

- include(HttpCrawler::Client)
  include(HttpCrawler::Proxy::Client)


@@ -1,8 +1,7 @@
-
-
  module HttpCrawler
  module Proxy

+ include(HttpCrawler::Client)
  class << self

  # Accepted format
@@ -15,6 +14,9 @@ module HttpCrawler

  end

+ def max_error_num
+ @max_error_num ||= 0
+ end

  end
  end
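
Together with the removal of include(HttpCrawler::Client) from the TestProxyApi client above, the Proxy module now mixes in HttpCrawler::Client itself and pins max_error_num at 0, so proxy-API requests fail fast rather than retrying. A rough sketch of a proxy client under that assumption; the nested module name and endpoint are hypothetical:

    module HttpCrawler
      module Proxy
        module MyProxyApi                        # hypothetical, mirrors TestProxyApi
          class Client
            include(HttpCrawler::Proxy::Client)  # same include as TestProxyApi

            def init_uri
              @uri = URI("http://proxy.example.com/api")  # hypothetical endpoint
            end
          end
        end
      end
    end

    # With max_error_num of 0, a failing request from this client raises on the
    # first error instead of retrying.
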
@@ -1,3 +1,3 @@
  module HttpCrawler
- VERSION = "0.3.0.0"
+ VERSION = "0.3.0.1"
  end
data/lib/http_crawler.rb CHANGED
@@ -10,4 +10,7 @@ load 'http_crawler/proxy.rb'

  module HttpCrawler
  # Your code goes here...
+ def self.a
+ puts "112"
+ end
  end
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: http_crawler
  version: !ruby/object:Gem::Version
- version: 0.3.0.0
+ version: 0.3.0.1
  platform: ruby
  authors:
  - jagger
  autorequire:
  bindir: exe
  cert_chain: []
- date: 2019-02-16 00:00:00.000000000 Z
+ date: 2019-02-17 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
  name: rspec
@@ -108,7 +108,7 @@ dependencies:
  - - "~>"
  - !ruby/object:Gem::Version
  version: 0.2.1
- description: A crawler extension package based on net/http, written by a junior developer.
+ description: A crawler extension package based on http, written by a junior developer.
  email:
  - 1336098842@qq.com
  executables: []