http_crawler 0.3.0.3 → 0.3.0.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +0 -2
- data/lib/http_crawler.rb +9 -0
- data/lib/http_crawler/client.rb +62 -20
- data/lib/http_crawler/proxy.rb +0 -8
- data/lib/http_crawler/proxy/client.rb +17 -2
- data/lib/http_crawler/proxy/test_proxy_api/client.rb +1 -3
- data/lib/http_crawler/version.rb +1 -1
- data/lib/http_crawler/web/baidu/client.rb +1 -3
- data/lib/http_crawler/web/client.rb +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: e44504aa656dea432bd96e28cc1908aac5c6164d1aa5ab9399da5b05db77b5b8
|
4
|
+
data.tar.gz: df36de8464939d97436941534bd441f7ee83b60b12d83c6547e9ed986d109276
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: a9baf0b81a3888c11d0b8d3a908fe805e9e498d32f6418aabb0f2c6392a39d9c354b74ce75e8c1673b2c2ea9b67305539b0a0c363905893e03be09050dfdb2be
|
7
|
+
data.tar.gz: ccaff3ba7029675ba6d6109b235b9a274f5f20638f9d59d37892cf179c24a0cbf8f2d49fdd69ea730471ca81677cd47f2080aac1fe73e404461dbd9a75a23c99
|
data/README.md
CHANGED
data/lib/http_crawler.rb
CHANGED
@@ -2,6 +2,15 @@ require 'json'
|
|
2
2
|
require 'digest/md5'
|
3
3
|
require 'nokogiri'
|
4
4
|
|
5
|
+
# 此段代码用于解决 require_dependency 是 rails 的内置方法 必须要先引用 Rails的包才能用的bug
|
6
|
+
class << self.class
|
7
|
+
def require_rename
|
8
|
+
# require 取别名 require_dependency
|
9
|
+
alias_method :require_dependency, :require
|
10
|
+
end
|
11
|
+
end
|
12
|
+
self.class.require_rename
|
13
|
+
|
5
14
|
# 千万不能使用 require 或者 load,这样的话 Rails 调试的时候就不能热加载了
|
6
15
|
require_dependency 'http_crawler/errors.rb'
|
7
16
|
require_dependency 'http_crawler/common.rb'
|
data/lib/http_crawler/client.rb
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
require_dependency File.dirname(__FILE__) + '/http/response.rb'
|
2
2
|
|
3
3
|
module HttpCrawler
|
4
|
-
|
4
|
+
class Client
|
5
5
|
|
6
6
|
class << self
|
7
7
|
|
@@ -9,8 +9,8 @@ module HttpCrawler
|
|
9
9
|
# web_name = "biquge_duquanben"
|
10
10
|
# 返回 HttpCrawler::Web::BiqugeDuquanben::Client 实例
|
11
11
|
#
|
12
|
-
def for(web_name
|
13
|
-
"HttpCrawler::Web::#{web_name.camelize}::Client".constantize.new(
|
12
|
+
def for(web_name)
|
13
|
+
"HttpCrawler::Web::#{web_name.camelize}::Client".constantize.new()
|
14
14
|
end
|
15
15
|
|
16
16
|
#
|
@@ -19,7 +19,11 @@ module HttpCrawler
|
|
19
19
|
# 返回 HttpCrawler::Web::BiqugeDuquanben::Client 实例
|
20
20
|
#
|
21
21
|
def for_module(module_name, *args)
|
22
|
-
"#{module_name}::Client".constantize.new(
|
22
|
+
"#{module_name}::Client".constantize.new()
|
23
|
+
end
|
24
|
+
|
25
|
+
def for_uri(path)
|
26
|
+
self.new(uri: path)
|
23
27
|
end
|
24
28
|
end
|
25
29
|
|
@@ -38,6 +42,25 @@ module HttpCrawler
|
|
38
42
|
@uri = nil
|
39
43
|
end
|
40
44
|
|
45
|
+
# 更新uri
|
46
|
+
def update_uri(uri_or_path)
|
47
|
+
case uri_or_path
|
48
|
+
when URI
|
49
|
+
@uri = uri_or_path
|
50
|
+
when String
|
51
|
+
if uri_or_path =~ /^http/
|
52
|
+
@uri = URI(uri_or_path)
|
53
|
+
else
|
54
|
+
@uri = @uri + uri_or_path
|
55
|
+
end
|
56
|
+
else
|
57
|
+
raise ArgumentError, uri_or_path
|
58
|
+
end
|
59
|
+
# 初始化 ssl 协议
|
60
|
+
self.init_ssl
|
61
|
+
self.uri
|
62
|
+
end
|
63
|
+
|
41
64
|
# 初始化超时时间
|
42
65
|
def init_timeout
|
43
66
|
@connect_time = 5
|
@@ -55,23 +78,30 @@ module HttpCrawler
|
|
55
78
|
end
|
56
79
|
|
57
80
|
# 头文件相关方法
|
58
|
-
def header
|
81
|
+
def header(parameter = {})
|
59
82
|
@header ||= init_header
|
60
83
|
end
|
61
84
|
|
62
|
-
def init_header
|
63
|
-
|
85
|
+
def init_header(parameter = {})
|
86
|
+
@header = {}
|
64
87
|
end
|
65
88
|
|
66
89
|
def update_header(parameter = {})
|
67
90
|
nil
|
68
91
|
end
|
69
92
|
|
70
|
-
# cookies
|
71
|
-
def cookies
|
72
|
-
@cookies ||=
|
93
|
+
# cookies相关方法
|
94
|
+
def cookies(parameter = {})
|
95
|
+
@cookies ||= init_cookies
|
96
|
+
end
|
97
|
+
|
98
|
+
def init_cookies(parameter = {})
|
99
|
+
@cookies = {}
|
73
100
|
end
|
74
101
|
|
102
|
+
def update_cookies(parameter = {})
|
103
|
+
nil
|
104
|
+
end
|
75
105
|
|
76
106
|
# 代理设置
|
77
107
|
def auto_proxy=(value)
|
@@ -150,9 +180,9 @@ module HttpCrawler
|
|
150
180
|
end
|
151
181
|
|
152
182
|
|
153
|
-
# 初始化
|
183
|
+
# 初始化init_client参数
|
154
184
|
def init_client
|
155
|
-
|
185
|
+
nil
|
156
186
|
end
|
157
187
|
|
158
188
|
# 初始化http请求前置条件
|
@@ -179,9 +209,15 @@ module HttpCrawler
|
|
179
209
|
# init_uri 如果未初始化@uri,则会报错
|
180
210
|
# 继承类需要重定义 init_uri
|
181
211
|
#
|
182
|
-
def initialize
|
212
|
+
def initialize(parameter = {})
|
183
213
|
# 初始化 uri
|
184
|
-
|
214
|
+
init_uri
|
215
|
+
|
216
|
+
# 如果自定义uri
|
217
|
+
if parameter[:uri]
|
218
|
+
raise "Client uri为重复初始化" if uri
|
219
|
+
update_uri(parameter[:uri])
|
220
|
+
end
|
185
221
|
|
186
222
|
# 初始化超时时间
|
187
223
|
init_timeout
|
@@ -198,12 +234,20 @@ module HttpCrawler
|
|
198
234
|
|
199
235
|
# 发送 get 请求
|
200
236
|
def get(path, params = {})
|
201
|
-
|
237
|
+
raise "Client uri为空" unless self.uri
|
238
|
+
request {http.get((self.uri + path).to_s, :params => params, :ssl_context => @ctx)}
|
239
|
+
end
|
240
|
+
|
241
|
+
# 直接发送uri的get请求
|
242
|
+
def get_uri
|
243
|
+
raise "Client uri为空" unless self.uri
|
244
|
+
request {http.get(self.uri.to_s, :ssl_context => @ctx)}
|
202
245
|
end
|
203
246
|
|
204
247
|
# 发送 post 请求
|
205
248
|
def post(path, params = {})
|
206
|
-
|
249
|
+
raise "Client uri为空" unless self.uri
|
250
|
+
request {http.post((self.uri + path).to_s, :form => params, :ssl_context => @ctx)}
|
207
251
|
end
|
208
252
|
|
209
253
|
# 请求的响应
|
@@ -233,7 +277,7 @@ module HttpCrawler
|
|
233
277
|
begin
|
234
278
|
block.call
|
235
279
|
rescue => error
|
236
|
-
|
280
|
+
Rails.logger.debug error.class
|
237
281
|
case error
|
238
282
|
when HTTP::TimeoutError
|
239
283
|
# 超时错误切换代理
|
@@ -242,7 +286,6 @@ module HttpCrawler
|
|
242
286
|
else
|
243
287
|
raise error
|
244
288
|
end
|
245
|
-
|
246
289
|
else
|
247
290
|
# 错误尝试次数
|
248
291
|
if n <= 0
|
@@ -252,9 +295,8 @@ module HttpCrawler
|
|
252
295
|
retry
|
253
296
|
end
|
254
297
|
end
|
255
|
-
|
256
298
|
end
|
257
|
-
end
|
299
|
+
end # def request(&block)
|
258
300
|
end
|
259
301
|
end
|
260
302
|
|
data/lib/http_crawler/proxy.rb
CHANGED
@@ -1,7 +1,5 @@
|
|
1
1
|
module HttpCrawler
|
2
2
|
module Proxy
|
3
|
-
|
4
|
-
include(HttpCrawler::Client)
|
5
3
|
class << self
|
6
4
|
|
7
5
|
# 接收格式
|
@@ -11,13 +9,7 @@ module HttpCrawler
|
|
11
9
|
def for(web_name, *arg)
|
12
10
|
"HttpCrawler::Proxy::#{web_name.camelize}::Client".constantize.new(*arg)
|
13
11
|
end
|
14
|
-
|
15
12
|
end
|
16
|
-
|
17
|
-
def max_error_num
|
18
|
-
@max_error_num ||= 0
|
19
|
-
end
|
20
|
-
|
21
13
|
end
|
22
14
|
end
|
23
15
|
|
@@ -1,7 +1,22 @@
|
|
1
|
-
|
2
1
|
module HttpCrawler
|
3
2
|
module Proxy
|
4
|
-
|
3
|
+
class Client < HttpCrawler::Client
|
4
|
+
|
5
|
+
class << self
|
6
|
+
|
7
|
+
# 接收格式
|
8
|
+
# web_name = "test_proxy_api"
|
9
|
+
# 返回 HttpCrawler::Proxy::TestProxyApi::Client 实例
|
10
|
+
#
|
11
|
+
def for(web_name, *arg)
|
12
|
+
"HttpCrawler::Proxy::#{web_name.camelize}::Client".constantize.new(*arg)
|
13
|
+
end
|
14
|
+
|
15
|
+
end
|
16
|
+
|
17
|
+
def max_error_num
|
18
|
+
@max_error_num ||= 0
|
19
|
+
end
|
5
20
|
|
6
21
|
end
|
7
22
|
end
|
data/lib/http_crawler/version.rb
CHANGED