http_crawler 0.2.3.1 → 0.2.3.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/http_crawler/client.rb +5 -0
- data/lib/http_crawler/http.rb +46 -20
- data/lib/http_crawler/version.rb +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: c93afa1f523cd20478bf06f854f1588a2ec9e248
|
4
|
+
data.tar.gz: 55fcaf8ad7870b6fdf89a63731b1ea8814083cc1
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: c72785e4e5ce5a19d62f0371df23b7ad18e2446e50f89a02bff8cc8953e074713b0ad995b22b6c8e39adbffeef83783d889caa90b3de53bf701416f0e2035590
|
7
|
+
data.tar.gz: f092ae6bb1a5cfe704675b1cf8792a93a01dd9365486a89c5fd3663079019ecffb68931f2c25ccef6ee4356f735a2dc03c32539511f6309b20424650d35e0b99
|
data/lib/http_crawler/client.rb
CHANGED
data/lib/http_crawler/http.rb
CHANGED
@@ -9,12 +9,15 @@ module HttpCrawler
|
|
9
9
|
attr_accessor :proxy_key
|
10
10
|
# 请求错误后的重复最大请求次数
|
11
11
|
attr_accessor :max_error_num
|
12
|
+
# 错误的url地址,存的是正则
|
13
|
+
attr_accessor :error_urls
|
12
14
|
|
13
15
|
def initialize(address, port = nil)
|
14
16
|
super(address, port)
|
15
17
|
@max_error_num = 2
|
16
18
|
@error_num = 0
|
17
19
|
@proxy_key = "default"
|
20
|
+
@error_urls = []
|
18
21
|
end
|
19
22
|
|
20
23
|
def http_error_sleep
|
@@ -54,48 +57,61 @@ module HttpCrawler
|
|
54
57
|
# 通过调用 api 获取代理或者通过自定义设置代理
|
55
58
|
def get_proxy
|
56
59
|
|
57
|
-
while @@proxy_list.blank?
|
58
|
-
|
60
|
+
# while @@proxy_list.blank?
|
61
|
+
# Rails.logger.debug("@@proxy_list 为空进行更新")
|
62
|
+
# proxy_client = HttpCrawler::Proxy.for(proxy_api)
|
63
|
+
# proxy_r = proxy_client.get_proxy(key: proxy_key)
|
64
|
+
# @@proxy_list << proxy_r.parsing unless proxy_r.parsing.blank?
|
65
|
+
# Rails.logger.debug("@@proxy_list => #{@@proxy_list}")
|
66
|
+
# sleep(1)
|
67
|
+
# end
|
68
|
+
# p = @@proxy_list.delete_at(0)
|
69
|
+
|
70
|
+
proxy_ip = nil
|
71
|
+
begin
|
72
|
+
Rails.logger.debug("开始获取代理IP")
|
59
73
|
proxy_client = HttpCrawler::Proxy.for(proxy_api)
|
60
74
|
proxy_r = proxy_client.get_proxy(key: proxy_key)
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
75
|
+
proxy_ip = proxy_r.parsing unless proxy_r.parsing.blank?
|
76
|
+
if proxy_ip.blank?
|
77
|
+
Rails.logger.warn "无最新代理等待5秒后重新获取"
|
78
|
+
else
|
79
|
+
break
|
80
|
+
end
|
81
|
+
sleep(5)
|
82
|
+
end while true
|
67
83
|
|
68
|
-
Rails.logger.debug("当前IP => #{@proxy_address}:#{@proxy_port},获取最新代理 => #{
|
84
|
+
Rails.logger.debug("当前IP => #{@proxy_address}:#{@proxy_port},获取最新代理 => #{proxy_ip}")
|
69
85
|
|
70
|
-
unless
|
86
|
+
unless proxy_ip && proxy_ip["p_addr"] && proxy_ip["p_port"]
|
71
87
|
Rails.logger.warn "无最新代理等待5秒后重新获取"
|
72
88
|
sleep(5)
|
73
|
-
|
89
|
+
proxy_ip = get_proxy
|
74
90
|
end
|
75
91
|
|
76
|
-
if (@proxy_address ==
|
92
|
+
if (@proxy_address == proxy_ip["p_addr"] && @proxy_port == proxy_ip["p_port"])
|
77
93
|
Rails.logger.warn "无最新代理等待5秒后重新获取"
|
78
94
|
sleep(5)
|
79
|
-
|
95
|
+
proxy_ip = get_proxy
|
80
96
|
end
|
81
|
-
|
97
|
+
proxy_ip
|
82
98
|
end
|
83
99
|
|
84
|
-
def update_proxy(
|
85
|
-
if
|
100
|
+
def update_proxy(proxy_ip = {})
|
101
|
+
if proxy_ip.blank?
|
86
102
|
proxy(get_proxy)
|
87
103
|
else
|
88
|
-
proxy(
|
104
|
+
proxy(proxy_ip)
|
89
105
|
end
|
90
106
|
end
|
91
107
|
|
92
108
|
# 如果自动更新代理 则更新代理返回 true,否则返回false
|
93
|
-
def update_proxy?(
|
109
|
+
def update_proxy?(proxy_ip = {})
|
94
110
|
if auto_proxy
|
95
|
-
if
|
111
|
+
if proxy_ip.blank?
|
96
112
|
proxy(get_proxy)
|
97
113
|
else
|
98
|
-
proxy(
|
114
|
+
proxy(proxy_ip)
|
99
115
|
end
|
100
116
|
return true
|
101
117
|
else
|
@@ -118,6 +134,11 @@ module HttpCrawler
|
|
118
134
|
when Net::HTTPRedirection then
|
119
135
|
location = response['location']
|
120
136
|
Rails.logger.warn "redirected to #{location}"
|
137
|
+
@error_urls.each do |url_string|
|
138
|
+
if location =~ /#{url_string}/
|
139
|
+
raise "跳转到异常url => #{location}"
|
140
|
+
end
|
141
|
+
end
|
121
142
|
# 传入 location 进行跳转
|
122
143
|
get_fetch(location, initheader, dest, limit - 1, &block)
|
123
144
|
when Net::HTTPServerError then
|
@@ -152,6 +173,11 @@ module HttpCrawler
|
|
152
173
|
when Net::HTTPRedirection then
|
153
174
|
location = response['location']
|
154
175
|
Rails.logger.warn "redirected to #{location}"
|
176
|
+
@error_urls.each do |url_string|
|
177
|
+
if location =~ /#{url_string}/
|
178
|
+
raise "跳转到异常url => #{location}"
|
179
|
+
end
|
180
|
+
end
|
155
181
|
# 传入 location 进行跳转
|
156
182
|
get_fetch(location, initheader, dest, 9, &block)
|
157
183
|
when Net::HTTPServerError then
|
data/lib/http_crawler/version.rb
CHANGED