http_crawler 0.3.1.4 → 0.3.1.5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/http_crawler/client.rb +13 -5
- data/lib/http_crawler/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a7976a9fa4410e543a29dcbf761b79c2aa2b73f89604b7108daab863e3bf76d0
|
4
|
+
data.tar.gz: 349418d39a8346ac6844f3600541b982a6d7cbef445cd565b15ef2e8ef8cac2c
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 932954f758d3136a124ea0b80da7bf1198cb8b86f0e1c52c1a838a119a307ad72bc572e78200b5d9e7ca25e4e3b8f2e645da4d1923e60b5f59294fa323ad5f94
|
7
|
+
data.tar.gz: e3b5598ddab7ed79bc18b445a120f0d91466fa701599339c480d5f882c4c305fc4aae966af5c64bbcfbfe989c6eee14494895fd89e512b710dede2705b086ea0
|
data/lib/http_crawler/client.rb
CHANGED
@@ -82,10 +82,12 @@ module HttpCrawler
|
|
82
82
|
attr_accessor :header
|
83
83
|
# 头文件相关方法
|
84
84
|
def header(parameter = {})
|
85
|
+
parameter = parameter.symbolize_keys
|
85
86
|
@header ||= init_header
|
86
87
|
end
|
87
88
|
|
88
89
|
def init_header(parameter = {})
|
90
|
+
parameter = parameter.symbolize_keys
|
89
91
|
@header = {
|
90
92
|
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
|
91
93
|
"Accept-Encoding": "gzip, br",
|
@@ -97,20 +99,24 @@ module HttpCrawler
|
|
97
99
|
end
|
98
100
|
|
99
101
|
def update_header(parameter = {})
|
102
|
+
parameter = parameter.symbolize_keys
|
100
103
|
@header = init_header
|
101
104
|
end
|
102
105
|
|
103
106
|
attr_accessor :cookies
|
104
107
|
# cookies相关方法
|
105
108
|
def cookies(parameter = {})
|
109
|
+
parameter = parameter.symbolize_keys
|
106
110
|
@cookies ||= init_cookies
|
107
111
|
end
|
108
112
|
|
109
113
|
def init_cookies(parameter = {})
|
114
|
+
parameter = parameter.symbolize_keys
|
110
115
|
@cookies = {}
|
111
116
|
end
|
112
117
|
|
113
118
|
def update_cookies(parameter = {})
|
119
|
+
parameter = parameter.symbolize_keys
|
114
120
|
nil
|
115
121
|
end
|
116
122
|
|
@@ -136,10 +142,11 @@ module HttpCrawler
|
|
136
142
|
|
137
143
|
# 调用代理 api使用的参数
|
138
144
|
def proxy_params
|
139
|
-
@proxy_params ||= {
|
145
|
+
@proxy_params ||= {key: "default"}
|
140
146
|
end
|
141
147
|
|
142
148
|
def update_proxy(proxy = {})
|
149
|
+
proxy = proxy.symbolize_keys
|
143
150
|
if (proxy.blank?)
|
144
151
|
@proxy = get_proxy
|
145
152
|
else
|
@@ -167,7 +174,7 @@ module HttpCrawler
|
|
167
174
|
begin
|
168
175
|
Rails.logger.debug("开始获取代理IP")
|
169
176
|
proxy_client = HttpCrawler::Proxy.for(proxy_api)
|
170
|
-
proxy_r = proxy_client.get_proxy(proxy_params)
|
177
|
+
proxy_r = proxy_client.get_proxy(proxy_params.symbolize_keys)
|
171
178
|
proxy_ip = proxy_r.results unless proxy_r.results.blank?
|
172
179
|
if proxy_ip.blank?
|
173
180
|
Rails.logger.warn "无最新代理等待5秒后重新获取:proxy 为空"
|
@@ -176,10 +183,10 @@ module HttpCrawler
|
|
176
183
|
end
|
177
184
|
sleep(5)
|
178
185
|
end while true
|
179
|
-
|
186
|
+
proxy_ip = proxy_ip.symbolize_keys
|
180
187
|
Rails.logger.debug("当前IP => #{@proxy},获取最新代理 => #{proxy_ip}")
|
181
188
|
|
182
|
-
unless proxy_ip[
|
189
|
+
unless proxy_ip[:p_addr] && proxy_ip[:p_port]
|
183
190
|
Rails.logger.warn "无最新代理等待5秒后重新获取:p_addr 或 p_port 为空"
|
184
191
|
sleep(5)
|
185
192
|
proxy_ip = get_proxy
|
@@ -211,7 +218,7 @@ module HttpCrawler
|
|
211
218
|
h = HTTP.follow(max_hops: 5)
|
212
219
|
|
213
220
|
# 添加代理
|
214
|
-
h = h.via(@proxy[
|
221
|
+
h = h.via(@proxy[:p_addr], @proxy[:p_port].to_i, @proxy[:p_user], @proxy[:p_pass]) unless (@proxy.blank?)
|
215
222
|
|
216
223
|
# 添加头文件
|
217
224
|
h = h.headers(header) if header
|
@@ -236,6 +243,7 @@ module HttpCrawler
|
|
236
243
|
# 继承类需要重定义 init_uri
|
237
244
|
#
|
238
245
|
def initialize(parameter = {})
|
246
|
+
parameter = parameter.symbolize_keys
|
239
247
|
# 初始化 uri
|
240
248
|
init_uri
|
241
249
|
|
data/lib/http_crawler/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: http_crawler
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.1.4
|
4
|
+
version: 0.3.1.5
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- jagger
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2019-03-
|
11
|
+
date: 2019-03-08 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rspec
|