http_crawler 0.2.3.0 → 0.2.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/http_crawler/http.rb +3 -4
- data/lib/http_crawler/net/response.rb +5 -1
- data/lib/http_crawler/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: e5a5b180d6051c3c0b343770e3d39f3711856b35
|
4
|
+
data.tar.gz: 4f74144ae22355a73e8775a41a85d5bba4a182ea
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 9a53292a05ad53fb701c1e9cb98075ff8a8daf185b0d4737a9ee821ae8e4a4ca3243120f2613feddb50ddc99a1ab10fcac4ddf363af8e5f627e5ffdedb3d8237
|
7
|
+
data.tar.gz: '09308f3773e6e9f8f6e30fd62c4adf8c2719d0297bf734e7726afcabaaa0851038ded52b27f1064d715d3d7d84359db252904a53554653e94592a3fa3bd4d8a1'
|
data/lib/http_crawler/http.rb
CHANGED
@@ -1,4 +1,3 @@
|
|
1
|
-
|
2
1
|
module HttpCrawler
|
3
2
|
class HTTP < Net::HTTP
|
4
3
|
|
@@ -59,7 +58,7 @@ module HttpCrawler
|
|
59
58
|
Rails.logger.debug("@@proxy_list 为空进行更新")
|
60
59
|
proxy_client = HttpCrawler::Proxy.for(proxy_api)
|
61
60
|
proxy_r = proxy_client.get_proxy(key: proxy_key)
|
62
|
-
@@proxy_list << proxy_r.parsing
|
61
|
+
@@proxy_list << proxy_r.parsing unless proxy_r.parsing.blank?
|
63
62
|
Rails.logger.debug("@@proxy_list => #{@@proxy_list}")
|
64
63
|
sleep(1)
|
65
64
|
end
|
@@ -125,13 +124,13 @@ module HttpCrawler
|
|
125
124
|
Rails.logger.warn "Net::HTTPServerError 5XX to #{address}"
|
126
125
|
server_error_sleep
|
127
126
|
# 重新请求
|
128
|
-
get_fetch(uri_or_path, initheader, dest, &block)
|
127
|
+
get_fetch(uri_or_path, initheader, dest, limit - 1, &block)
|
129
128
|
when Net::HTTPProxyAuthenticationRequired then
|
130
129
|
Rails.logger.warn "Net::HTTPProxyAuthenticationRequired 407 to proxy:[#{@proxy_address}:#{@proxy_port}] =>#{address}"
|
131
130
|
if update_proxy?
|
132
131
|
server_error_sleep
|
133
132
|
# 重新请求
|
134
|
-
get_fetch(uri_or_path, initheader, dest, &block)
|
133
|
+
get_fetch(uri_or_path, initheader, dest, limit - 1, &block)
|
135
134
|
else
|
136
135
|
response.error!
|
137
136
|
end
|
data/lib/http_crawler/net/response.rb
CHANGED
@@ -33,9 +33,10 @@ module Net
|
|
33
33
|
# 通过 CharDet 判断编码格式
|
34
34
|
encoding = CharDet.detect(@decoding_body)["encoding"] unless encoding
|
35
35
|
|
36
|
+
|
36
37
|
# 进行转码
|
37
38
|
begin
|
38
|
-
@decoding_body.force_encoding(encoding).encode!('utf-8') if encoding != @decoding_body.encoding
|
39
|
+
@decoding_body.force_encoding(encoding).encode!('utf-8') if encoding && encoding != @decoding_body.encoding
|
39
40
|
rescue => e
|
40
41
|
# 转码错误后再次使用 CharDet 判断编码格式后进行转码
|
41
42
|
cd = CharDet.detect(@decoding_body)["encoding"]
|
@@ -43,6 +44,9 @@ module Net
|
|
43
44
|
@decoding_body.force_encoding(cd).encode!('utf-8') if encoding != @decoding_body.encoding
|
44
45
|
else
|
45
46
|
# 还是转码错误则抛出异常
|
47
|
+
Rails.logger.debug "encoding => #{encoding}"
|
48
|
+
Rails.logger.debug "cd => #{cd}"
|
49
|
+
Rails.logger.debug "@decoding_body[0..200] => #{@decoding_body[0..200]}"
|
46
50
|
raise e
|
47
51
|
end
|
48
52
|
end
|
data/lib/http_crawler/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: http_crawler
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.3.0
|
4
|
+
version: 0.2.3.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- jagger
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2019-01-
|
11
|
+
date: 2019-01-30 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rspec
|