http_crawler 0.2.3.0 → 0.2.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 52f21f9a41ac75989b37da494ca3cf4321c7cf5e
4
- data.tar.gz: e9fe5e402ac1f6333471adafaf997f81af2459c5
3
+ metadata.gz: e5a5b180d6051c3c0b343770e3d39f3711856b35
4
+ data.tar.gz: 4f74144ae22355a73e8775a41a85d5bba4a182ea
5
5
  SHA512:
6
- metadata.gz: fa99a2f0bdcce2dab203238c0aa0fe20dc970d925d359e538f58027fe9b7670cfae48d70c8eb3154a9343d9aa75dbe6c7ada2a64f88e036be9393829cc3184e6
7
- data.tar.gz: 1fc66e66594a5d9907170f4d71ce7e6b34e4c3691520ef33c4eb30bd778f744c1abf55fc249e0e125b0a1d7ca8f3dcacc159233da3e40f2653406c1ff786a90c
6
+ metadata.gz: 9a53292a05ad53fb701c1e9cb98075ff8a8daf185b0d4737a9ee821ae8e4a4ca3243120f2613feddb50ddc99a1ab10fcac4ddf363af8e5f627e5ffdedb3d8237
7
+ data.tar.gz: '09308f3773e6e9f8f6e30fd62c4adf8c2719d0297bf734e7726afcabaaa0851038ded52b27f1064d715d3d7d84359db252904a53554653e94592a3fa3bd4d8a1'
@@ -1,4 +1,3 @@
1
-
2
1
  module HttpCrawler
3
2
  class HTTP < Net::HTTP
4
3
 
@@ -59,7 +58,7 @@ module HttpCrawler
59
58
  Rails.logger.debug("@@proxy_list 为空进行更新")
60
59
  proxy_client = HttpCrawler::Proxy.for(proxy_api)
61
60
  proxy_r = proxy_client.get_proxy(key: proxy_key)
62
- @@proxy_list << proxy_r.parsing
61
+ @@proxy_list << proxy_r.parsing unless proxy_r.parsing.blank?
63
62
  Rails.logger.debug("@@proxy_list => #{@@proxy_list}")
64
63
  sleep(1)
65
64
  end
@@ -125,13 +124,13 @@ module HttpCrawler
125
124
  Rails.logger.warn "Net::HTTPServerError 5XX to #{address}"
126
125
  server_error_sleep
127
126
  # 重新请求
128
- get_fetch(uri_or_path, initheader, dest, &block)
127
+ get_fetch(uri_or_path, initheader, dest, limit - 1, &block)
129
128
  when Net::HTTPProxyAuthenticationRequired then
130
129
  Rails.logger.warn "Net::HTTPProxyAuthenticationRequired 407 to proxy:[#{@proxy_address}:#{@proxy_port}] =>#{address}"
131
130
  if update_proxy?
132
131
  server_error_sleep
133
132
  # 重新请求
134
- get_fetch(uri_or_path, initheader, dest, &block)
133
+ get_fetch(uri_or_path, initheader, dest, limit - 1, &block)
135
134
  else
136
135
  response.error!
137
136
  end
@@ -33,9 +33,10 @@ module Net
33
33
  # 通过 CharDet 判断编码格式
34
34
  encoding = CharDet.detect(@decoding_body)["encoding"] unless encoding
35
35
 
36
+
36
37
  # 进行转码
37
38
  begin
38
- @decoding_body.force_encoding(encoding).encode!('utf-8') if encoding != @decoding_body.encoding
39
+ @decoding_body.force_encoding(encoding).encode!('utf-8') if encoding && encoding != @decoding_body.encoding
39
40
  rescue => e
40
41
  # 转码错误后再次使用 CharDet 判断编码格式后进行转码
41
42
  cd = CharDet.detect(@decoding_body)["encoding"]
@@ -43,6 +44,9 @@ module Net
43
44
  @decoding_body.force_encoding(cd).encode!('utf-8') if encoding != @decoding_body.encoding
44
45
  else
45
46
  # 还是转码错误则抛出异常
47
+ Rails.logger.debug "encoding => #{encoding}"
48
+ Rails.logger.debug "cd => #{cd}"
49
+ Rails.logger.debug "@decoding_body[0..200] => #{@decoding_body[0..200]}"
46
50
  raise e
47
51
  end
48
52
  end
@@ -1,3 +1,3 @@
1
1
  module HttpCrawler
2
- VERSION = "0.2.3.0"
2
+ VERSION = "0.2.3.1"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: http_crawler
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.3.0
4
+ version: 0.2.3.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - jagger
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2019-01-28 00:00:00.000000000 Z
11
+ date: 2019-01-30 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rspec