http_crawler 0.2.3.0 → 0.2.3.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/http_crawler/http.rb +3 -4
- data/lib/http_crawler/net/response.rb +5 -1
- data/lib/http_crawler/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: e5a5b180d6051c3c0b343770e3d39f3711856b35
|
4
|
+
data.tar.gz: 4f74144ae22355a73e8775a41a85d5bba4a182ea
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 9a53292a05ad53fb701c1e9cb98075ff8a8daf185b0d4737a9ee821ae8e4a4ca3243120f2613feddb50ddc99a1ab10fcac4ddf363af8e5f627e5ffdedb3d8237
|
7
|
+
data.tar.gz: '09308f3773e6e9f8f6e30fd62c4adf8c2719d0297bf734e7726afcabaaa0851038ded52b27f1064d715d3d7d84359db252904a53554653e94592a3fa3bd4d8a1'
|
data/lib/http_crawler/http.rb
CHANGED
@@ -1,4 +1,3 @@
|
|
1
|
-
|
2
1
|
module HttpCrawler
|
3
2
|
class HTTP < Net::HTTP
|
4
3
|
|
@@ -59,7 +58,7 @@ module HttpCrawler
|
|
59
58
|
Rails.logger.debug("@@proxy_list 为空进行更新")
|
60
59
|
proxy_client = HttpCrawler::Proxy.for(proxy_api)
|
61
60
|
proxy_r = proxy_client.get_proxy(key: proxy_key)
|
62
|
-
@@proxy_list << proxy_r.parsing
|
61
|
+
@@proxy_list << proxy_r.parsing unless proxy_r.parsing.blank?
|
63
62
|
Rails.logger.debug("@@proxy_list => #{@@proxy_list}")
|
64
63
|
sleep(1)
|
65
64
|
end
|
@@ -125,13 +124,13 @@ module HttpCrawler
|
|
125
124
|
Rails.logger.warn "Net::HTTPServerError 5XX to #{address}"
|
126
125
|
server_error_sleep
|
127
126
|
# 重新请求
|
128
|
-
get_fetch(uri_or_path, initheader, dest, &block)
|
127
|
+
get_fetch(uri_or_path, initheader, dest, limit - 1, &block)
|
129
128
|
when Net::HTTPProxyAuthenticationRequired then
|
130
129
|
Rails.logger.warn "Net::HTTPProxyAuthenticationRequired 407 to proxy:[#{@proxy_address}:#{@proxy_port}] =>#{address}"
|
131
130
|
if update_proxy?
|
132
131
|
server_error_sleep
|
133
132
|
# 重新请求
|
134
|
-
get_fetch(uri_or_path, initheader, dest, &block)
|
133
|
+
get_fetch(uri_or_path, initheader, dest, limit - 1, &block)
|
135
134
|
else
|
136
135
|
response.error!
|
137
136
|
end
|
data/lib/http_crawler/net/response.rb
CHANGED
@@ -33,9 +33,10 @@ module Net
|
|
33
33
|
# 通过 CharDet 判断编码格式
|
34
34
|
encoding = CharDet.detect(@decoding_body)["encoding"] unless encoding
|
35
35
|
|
36
|
+
|
36
37
|
# 进行转码
|
37
38
|
begin
|
38
|
-
@decoding_body.force_encoding(encoding).encode!('utf-8') if encoding != @decoding_body.encoding
|
39
|
+
@decoding_body.force_encoding(encoding).encode!('utf-8') if encoding && encoding != @decoding_body.encoding
|
39
40
|
rescue => e
|
40
41
|
# 转码错误后再次使用 CharDet 判断编码格式后进行转码
|
41
42
|
cd = CharDet.detect(@decoding_body)["encoding"]
|
@@ -43,6 +44,9 @@ module Net
|
|
43
44
|
@decoding_body.force_encoding(cd).encode!('utf-8') if encoding != @decoding_body.encoding
|
44
45
|
else
|
45
46
|
# 还是转码错误则抛出异常
|
47
|
+
Rails.logger.debug "encoding => #{encoding}"
|
48
|
+
Rails.logger.debug "cd => #{cd}"
|
49
|
+
Rails.logger.debug "@decoding_body[0..200] => #{@decoding_body[0..200]}"
|
46
50
|
raise e
|
47
51
|
end
|
48
52
|
end
|
data/lib/http_crawler/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: http_crawler
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.3.0
|
4
|
+
version: 0.2.3.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- jagger
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2019-01-
|
11
|
+
date: 2019-01-30 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rspec
|