http_crawler 0.2.3.1 → 0.2.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/http_crawler/client.rb +5 -0
- data/lib/http_crawler/http.rb +46 -20
- data/lib/http_crawler/version.rb +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: c93afa1f523cd20478bf06f854f1588a2ec9e248
|
4
|
+
data.tar.gz: 55fcaf8ad7870b6fdf89a63731b1ea8814083cc1
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: c72785e4e5ce5a19d62f0371df23b7ad18e2446e50f89a02bff8cc8953e074713b0ad995b22b6c8e39adbffeef83783d889caa90b3de53bf701416f0e2035590
|
7
|
+
data.tar.gz: f092ae6bb1a5cfe704675b1cf8792a93a01dd9365486a89c5fd3663079019ecffb68931f2c25ccef6ee4356f735a2dc03c32539511f6309b20424650d35e0b99
|
data/lib/http_crawler/client.rb
CHANGED
data/lib/http_crawler/http.rb
CHANGED
@@ -9,12 +9,15 @@ module HttpCrawler
|
|
9
9
|
attr_accessor :proxy_key
|
10
10
|
# 请求错误后的重复最大请求次数
|
11
11
|
attr_accessor :max_error_num
|
12
|
+
# 错误的url地址,存的是正则
|
13
|
+
attr_accessor :error_urls
|
12
14
|
|
13
15
|
def initialize(address, port = nil)
|
14
16
|
super(address, port)
|
15
17
|
@max_error_num = 2
|
16
18
|
@error_num = 0
|
17
19
|
@proxy_key = "default"
|
20
|
+
@error_urls = []
|
18
21
|
end
|
19
22
|
|
20
23
|
def http_error_sleep
|
@@ -54,48 +57,61 @@ module HttpCrawler
|
|
54
57
|
# 通过调用 api 获取代理或者通过自定义设置代理
|
55
58
|
def get_proxy
|
56
59
|
|
57
|
-
while @@proxy_list.blank?
|
58
|
-
|
60
|
+
# while @@proxy_list.blank?
|
61
|
+
# Rails.logger.debug("@@proxy_list 为空进行更新")
|
62
|
+
# proxy_client = HttpCrawler::Proxy.for(proxy_api)
|
63
|
+
# proxy_r = proxy_client.get_proxy(key: proxy_key)
|
64
|
+
# @@proxy_list << proxy_r.parsing unless proxy_r.parsing.blank?
|
65
|
+
# Rails.logger.debug("@@proxy_list => #{@@proxy_list}")
|
66
|
+
# sleep(1)
|
67
|
+
# end
|
68
|
+
# p = @@proxy_list.delete_at(0)
|
69
|
+
|
70
|
+
proxy_ip = nil
|
71
|
+
begin
|
72
|
+
Rails.logger.debug("开始获取代理IP")
|
59
73
|
proxy_client = HttpCrawler::Proxy.for(proxy_api)
|
60
74
|
proxy_r = proxy_client.get_proxy(key: proxy_key)
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
75
|
+
proxy_ip = proxy_r.parsing unless proxy_r.parsing.blank?
|
76
|
+
if proxy_ip.blank?
|
77
|
+
Rails.logger.warn "无最新代理等待5秒后重新获取"
|
78
|
+
else
|
79
|
+
break
|
80
|
+
end
|
81
|
+
sleep(5)
|
82
|
+
end while true
|
67
83
|
|
68
|
-
Rails.logger.debug("当前IP => #{@proxy_address}:#{@proxy_port},获取最新代理 => #{
|
84
|
+
Rails.logger.debug("当前IP => #{@proxy_address}:#{@proxy_port},获取最新代理 => #{proxy_ip}")
|
69
85
|
|
70
|
-
unless
|
86
|
+
unless proxy_ip && proxy_ip["p_addr"] && proxy_ip["p_port"]
|
71
87
|
Rails.logger.warn "无最新代理等待5秒后重新获取"
|
72
88
|
sleep(5)
|
73
|
-
|
89
|
+
proxy_ip = get_proxy
|
74
90
|
end
|
75
91
|
|
76
|
-
if (@proxy_address ==
|
92
|
+
if (@proxy_address == proxy_ip["p_addr"] && @proxy_port == proxy_ip["p_port"])
|
77
93
|
Rails.logger.warn "无最新代理等待5秒后重新获取"
|
78
94
|
sleep(5)
|
79
|
-
|
95
|
+
proxy_ip = get_proxy
|
80
96
|
end
|
81
|
-
|
97
|
+
proxy_ip
|
82
98
|
end
|
83
99
|
|
84
|
-
def update_proxy(
|
85
|
-
if
|
100
|
+
def update_proxy(proxy_ip = {})
|
101
|
+
if proxy_ip.blank?
|
86
102
|
proxy(get_proxy)
|
87
103
|
else
|
88
|
-
proxy(
|
104
|
+
proxy(proxy_ip)
|
89
105
|
end
|
90
106
|
end
|
91
107
|
|
92
108
|
# 如果自动更新代理 则更新代理返回 true,否则返回false
|
93
|
-
def update_proxy?(
|
109
|
+
def update_proxy?(proxy_ip = {})
|
94
110
|
if auto_proxy
|
95
|
-
if
|
111
|
+
if proxy_ip.blank?
|
96
112
|
proxy(get_proxy)
|
97
113
|
else
|
98
|
-
proxy(
|
114
|
+
proxy(proxy_ip)
|
99
115
|
end
|
100
116
|
return true
|
101
117
|
else
|
@@ -118,6 +134,11 @@ module HttpCrawler
|
|
118
134
|
when Net::HTTPRedirection then
|
119
135
|
location = response['location']
|
120
136
|
Rails.logger.warn "redirected to #{location}"
|
137
|
+
@error_urls.each do |url_string|
|
138
|
+
if location =~ /#{url_string}/
|
139
|
+
raise "跳转到异常url => #{location}"
|
140
|
+
end
|
141
|
+
end
|
121
142
|
# 传入 location 进行跳转
|
122
143
|
get_fetch(location, initheader, dest, limit - 1, &block)
|
123
144
|
when Net::HTTPServerError then
|
@@ -152,6 +173,11 @@ module HttpCrawler
|
|
152
173
|
when Net::HTTPRedirection then
|
153
174
|
location = response['location']
|
154
175
|
Rails.logger.warn "redirected to #{location}"
|
176
|
+
@error_urls.each do |url_string|
|
177
|
+
if location =~ /#{url_string}/
|
178
|
+
raise "跳转到异常url => #{location}"
|
179
|
+
end
|
180
|
+
end
|
155
181
|
# 传入 location 进行跳转
|
156
182
|
get_fetch(location, initheader, dest, 9, &block)
|
157
183
|
when Net::HTTPServerError then
|
data/lib/http_crawler/version.rb
CHANGED