http_crawler 0.2.3.0 → 0.2.3.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 52f21f9a41ac75989b37da494ca3cf4321c7cf5e
4
- data.tar.gz: e9fe5e402ac1f6333471adafaf997f81af2459c5
3
+ metadata.gz: e5a5b180d6051c3c0b343770e3d39f3711856b35
4
+ data.tar.gz: 4f74144ae22355a73e8775a41a85d5bba4a182ea
5
5
  SHA512:
6
- metadata.gz: fa99a2f0bdcce2dab203238c0aa0fe20dc970d925d359e538f58027fe9b7670cfae48d70c8eb3154a9343d9aa75dbe6c7ada2a64f88e036be9393829cc3184e6
7
- data.tar.gz: 1fc66e66594a5d9907170f4d71ce7e6b34e4c3691520ef33c4eb30bd778f744c1abf55fc249e0e125b0a1d7ca8f3dcacc159233da3e40f2653406c1ff786a90c
6
+ metadata.gz: 9a53292a05ad53fb701c1e9cb98075ff8a8daf185b0d4737a9ee821ae8e4a4ca3243120f2613feddb50ddc99a1ab10fcac4ddf363af8e5f627e5ffdedb3d8237
7
+ data.tar.gz: '09308f3773e6e9f8f6e30fd62c4adf8c2719d0297bf734e7726afcabaaa0851038ded52b27f1064d715d3d7d84359db252904a53554653e94592a3fa3bd4d8a1'
@@ -1,4 +1,3 @@
1
-
2
1
  module HttpCrawler
3
2
  class HTTP < Net::HTTP
4
3
 
@@ -59,7 +58,7 @@ module HttpCrawler
59
58
  Rails.logger.debug("@@proxy_list 为空进行更新")
60
59
  proxy_client = HttpCrawler::Proxy.for(proxy_api)
61
60
  proxy_r = proxy_client.get_proxy(key: proxy_key)
62
- @@proxy_list << proxy_r.parsing
61
+ @@proxy_list << proxy_r.parsing unless proxy_r.parsing.blank?
63
62
  Rails.logger.debug("@@proxy_list => #{@@proxy_list}")
64
63
  sleep(1)
65
64
  end
@@ -125,13 +124,13 @@ module HttpCrawler
125
124
  Rails.logger.warn "Net::HTTPServerError 5XX to #{address}"
126
125
  server_error_sleep
127
126
  # 重新请求
128
- get_fetch(uri_or_path, initheader, dest, &block)
127
+ get_fetch(uri_or_path, initheader, dest, limit - 1, &block)
129
128
  when Net::HTTPProxyAuthenticationRequired then
130
129
  Rails.logger.warn "Net::HTTPProxyAuthenticationRequired 407 to proxy:[#{@proxy_address}:#{@proxy_port}] =>#{address}"
131
130
  if update_proxy?
132
131
  server_error_sleep
133
132
  # 重新请求
134
- get_fetch(uri_or_path, initheader, dest, &block)
133
+ get_fetch(uri_or_path, initheader, dest, limit - 1, &block)
135
134
  else
136
135
  response.error!
137
136
  end
@@ -33,9 +33,10 @@ module Net
33
33
  # 通过 CharDet 判断编码格式
34
34
  encoding = CharDet.detect(@decoding_body)["encoding"] unless encoding
35
35
 
36
+
36
37
  # 进行转码
37
38
  begin
38
- @decoding_body.force_encoding(encoding).encode!('utf-8') if encoding != @decoding_body.encoding
39
+ @decoding_body.force_encoding(encoding).encode!('utf-8') if encoding && encoding != @decoding_body.encoding
39
40
  rescue => e
40
41
  # 转码错误后再次使用 CharDet 判断编码格式后进行转码
41
42
  cd = CharDet.detect(@decoding_body)["encoding"]
@@ -43,6 +44,9 @@ module Net
43
44
  @decoding_body.force_encoding(cd).encode!('utf-8') if encoding != @decoding_body.encoding
44
45
  else
45
46
  # 还是转码错误则抛出异常
47
+ Rails.logger.debug "encoding => #{encoding}"
48
+ Rails.logger.debug "cd => #{cd}"
49
+ Rails.logger.debug "@decoding_body[0..200] => #{@decoding_body[0..200]}"
46
50
  raise e
47
51
  end
48
52
  end
@@ -1,3 +1,3 @@
1
1
  module HttpCrawler
2
- VERSION = "0.2.3.0"
2
+ VERSION = "0.2.3.1"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: http_crawler
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.3.0
4
+ version: 0.2.3.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - jagger
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2019-01-28 00:00:00.000000000 Z
11
+ date: 2019-01-30 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rspec