http_crawler 0.3.0.3 → 0.3.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +0 -2
- data/lib/http_crawler.rb +9 -0
- data/lib/http_crawler/client.rb +62 -20
- data/lib/http_crawler/proxy.rb +0 -8
- data/lib/http_crawler/proxy/client.rb +17 -2
- data/lib/http_crawler/proxy/test_proxy_api/client.rb +1 -3
- data/lib/http_crawler/version.rb +1 -1
- data/lib/http_crawler/web/baidu/client.rb +1 -3
- data/lib/http_crawler/web/client.rb +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: e44504aa656dea432bd96e28cc1908aac5c6164d1aa5ab9399da5b05db77b5b8
+  data.tar.gz: df36de8464939d97436941534bd441f7ee83b60b12d83c6547e9ed986d109276
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: a9baf0b81a3888c11d0b8d3a908fe805e9e498d32f6418aabb0f2c6392a39d9c354b74ce75e8c1673b2c2ea9b67305539b0a0c363905893e03be09050dfdb2be
+  data.tar.gz: ccaff3ba7029675ba6d6109b235b9a274f5f20638f9d59d37892cf179c24a0cbf8f2d49fdd69ea730471ca81677cd47f2080aac1fe73e404461dbd9a75a23c99
data/README.md
CHANGED
data/lib/http_crawler.rb
CHANGED
@@ -2,6 +2,15 @@ require 'json'
 require 'digest/md5'
 require 'nokogiri'
 
+# Works around the fact that require_dependency is a Rails built-in method
+# and can only be used after the Rails packages have been required
+class << self.class
+  def require_rename
+    # alias require as require_dependency
+    alias_method :require_dependency, :require
+  end
+end
+self.class.require_rename
+
 # Never use require or load here, otherwise hot reloading breaks when debugging under Rails
 require_dependency 'http_crawler/errors.rb'
 require_dependency 'http_crawler/common.rb'
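For context on the hunk above: require_dependency comes from ActiveSupport, so loading this gem outside a Rails app would otherwise raise NoMethodError on the require_dependency calls that follow. A minimal standalone sketch of the same aliasing idea (not the gem's exact code, which goes through the Object singleton class):

    # Outside Rails, require_dependency is undefined; fall back to Kernel#require.
    unless defined?(require_dependency)
      alias require_dependency require
    end

    # Both of these now load the standard library exactly once.
    require_dependency 'json'
    require 'json'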
data/lib/http_crawler/client.rb
CHANGED
@@ -1,7 +1,7 @@
 require_dependency File.dirname(__FILE__) + '/http/response.rb'
 
 module HttpCrawler
-
+  class Client
 
     class << self
 
@@ -9,8 +9,8 @@ module HttpCrawler
       # web_name = "biquge_duquanben"
       # returns an HttpCrawler::Web::BiqugeDuquanben::Client instance
       #
-      def for(web_name
-        "HttpCrawler::Web::#{web_name.camelize}::Client".constantize.new(
+      def for(web_name)
+        "HttpCrawler::Web::#{web_name.camelize}::Client".constantize.new()
       end
 
       #
@@ -19,7 +19,11 @@ module HttpCrawler
       # returns an HttpCrawler::Web::BiqugeDuquanben::Client instance
       #
       def for_module(module_name, *args)
-        "#{module_name}::Client".constantize.new(
+        "#{module_name}::Client".constantize.new()
+      end
+
+      def for_uri(path)
+        self.new(uri: path)
       end
     end
 
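The three constructors above are convention-over-configuration factories: `for` and `for_module` build a class name from a string and instantiate it (`camelize`/`constantize` come from ActiveSupport), while the new `for_uri` binds a plain client to an explicit address. A hypothetical usage sketch, assuming the gem and its `HttpCrawler::Web::Baidu::Client` subclass are loaded:

    require 'http_crawler'

    # "baidu".camelize => "Baidu"; resolves HttpCrawler::Web::Baidu::Client
    client = HttpCrawler::Client.for("baidu")

    # Same lookup, but the namespace is given explicitly
    client = HttpCrawler::Client.for_module("HttpCrawler::Web::Baidu")

    # No subclass at all: a bare Client pointed at one absolute URL
    client = HttpCrawler::Client.for_uri("http://www.example.com/")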
@@ -38,6 +42,25 @@ module HttpCrawler
       @uri = nil
     end
 
+    # update the uri
+    def update_uri(uri_or_path)
+      case uri_or_path
+      when URI
+        @uri = uri_or_path
+      when String
+        if uri_or_path =~ /^http/
+          @uri = URI(uri_or_path)
+        else
+          @uri = @uri + uri_or_path
+        end
+      else
+        raise ArgumentError, uri_or_path
+      end
+      # initialize the ssl protocol
+      self.init_ssl
+      self.uri
+    end
+
     # initialize the timeouts
     def init_timeout
       @connect_time = 5
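update_uri's String branch leans on URI#+ (an alias of URI#merge), so relative paths are resolved against the current @uri per RFC 3986, while absolute "http..." strings replace it. A quick illustration with the standard library alone:

    require 'uri'

    base = URI("http://www.example.com/books/")

    # Relative strings merge against the base, as in update_uri's else branch:
    puts base + "chapter-1.html"   # => http://www.example.com/books/chapter-1.html
    puts base + "/search"          # => http://www.example.com/search

    # An absolute string would instead be re-parsed wholesale:
    puts URI("https://other.example.com/api")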
@@ -55,23 +78,30 @@ module HttpCrawler
     end
 
     # header-related methods
-    def header
+    def header(parameter = {})
       @header ||= init_header
     end
 
-    def init_header
-
+    def init_header(parameter = {})
+      @header = {}
     end
 
     def update_header(parameter = {})
       nil
     end
 
-    # cookies
-    def cookies
-      @cookies ||=
+    # cookies-related methods
+    def cookies(parameter = {})
+      @cookies ||= init_cookies
+    end
+
+    def init_cookies(parameter = {})
+      @cookies = {}
     end
 
+    def update_cookies(parameter = {})
+      nil
+    end
 
     # proxy settings
     def auto_proxy=(value)
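header and cookies now share one memoized template-method shape: the reader caches, init_* supplies the default, update_* is a no-op hook. Subclasses customize by overriding the init_* hooks, roughly like this sketch (the ExampleSite namespace and header value are invented for illustration):

    module HttpCrawler
      module Web
        module ExampleSite               # hypothetical crawler namespace
          class Client < HttpCrawler::Client
            def init_uri
              @uri = URI("http://www.example.com/")
            end

            # Lazily picked up by the memoizing #header reader
            def init_header(parameter = {})
              @header = { "User-Agent" => "Mozilla/5.0 (compatible)" }
            end
          end
        end
      end
    end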
@@ -150,9 +180,9 @@ module HttpCrawler
     end
 
 
-    # initialization
+    # initialize the init_client parameters
     def init_client
-
+      nil
     end
 
     # initialize the http request preconditions
@@ -179,9 +209,15 @@ module HttpCrawler
     # init_uri raises an error if @uri has not been initialized
     # subclasses must redefine init_uri
     #
-    def initialize
+    def initialize(parameter = {})
       # initialize the uri
-
+      init_uri
+
+      # if a custom uri was given
+      if parameter[:uri]
+        raise "Client uri为重复初始化" if uri
+        update_uri(parameter[:uri])
+      end
 
       # initialize the timeouts
       init_timeout
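Because initialize now takes a parameter hash, a base client can be pointed at an arbitrary endpoint without writing a subclass; the raise guards against passing `uri:` to a subclass whose init_uri already set @uri. Assuming the gem is loaded, these two calls are equivalent (for_uri is sugar over new):

    client = HttpCrawler::Client.new(uri: "http://www.example.com/")
    client = HttpCrawler::Client.for_uri("http://www.example.com/")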
@@ -198,12 +234,20 @@ module HttpCrawler
 
     # send a get request
     def get(path, params = {})
-
+      raise "Client uri为空" unless self.uri
+      request {http.get((self.uri + path).to_s, :params => params, :ssl_context => @ctx)}
+    end
+
+    # send a get request directly to the uri
+    def get_uri
+      raise "Client uri为空" unless self.uri
+      request {http.get(self.uri.to_s, :ssl_context => @ctx)}
     end
 
     # send a post request
     def post(path, params = {})
-
+      raise "Client uri为空" unless self.uri
+      request {http.post((self.uri + path).to_s, :form => params, :ssl_context => @ctx)}
     end
 
     # the response of the request
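Each verb now guards against a missing base URI and resolves its path with the same URI#+ rule before delegating to the underlying http gem client. A hypothetical call sequence (paths and params invented):

    client = HttpCrawler::Client.for_uri("http://www.example.com/")

    client.get("search", q: "ruby")       # GET  /search?q=ruby
    client.post("login", user: "alice")   # POST /login with form params
    client.get_uri                        # GET the base URI itself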
@@ -233,7 +277,7 @@ module HttpCrawler
       begin
         block.call
       rescue => error
-
+        Rails.logger.debug error.class
         case error
         when HTTP::TimeoutError
           # switch proxies on timeout errors
@@ -242,7 +286,6 @@ module HttpCrawler
           else
             raise error
           end
-
         else
           # number of error attempts
           if n <= 0
@@ -252,9 +295,8 @@ module HttpCrawler
             retry
           end
         end
-
       end
-    end
+    end # def request(&block)
   end
 end
 
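The request wrapper these verbs funnel through follows a begin/rescue/retry shape: log the error class, rotate the proxy on HTTP::TimeoutError, re-raise anything else, and stop once the attempt budget n is spent. A standalone sketch of that control flow (names and the budget are illustrative, not the gem's exact internals):

    require 'http'   # the http gem, which defines HTTP::TimeoutError

    def request_with_retry(max_attempts = 3)
      n = max_attempts
      begin
        yield
      rescue HTTP::TimeoutError => error
        n -= 1                 # spend one attempt from the budget
        raise error if n <= 0  # budget exhausted: propagate the timeout
        retry                  # e.g. after switching to a fresh proxy
      end
    end

    # request_with_retry { HTTP.get("http://www.example.com/").to_s }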
data/lib/http_crawler/proxy.rb
CHANGED
@@ -1,7 +1,5 @@
 module HttpCrawler
   module Proxy
-
-    include(HttpCrawler::Client)
     class << self
 
       # accepted format
@@ -11,13 +9,7 @@ module HttpCrawler
       def for(web_name, *arg)
         "HttpCrawler::Proxy::#{web_name.camelize}::Client".constantize.new(*arg)
       end
-
     end
-
-    def max_error_num
-      @max_error_num ||= 0
-    end
-
   end
 end
 
data/lib/http_crawler/proxy/client.rb
CHANGED
@@ -1,7 +1,22 @@
-
 module HttpCrawler
   module Proxy
-
+    class Client < HttpCrawler::Client
+
+      class << self
+
+        # accepted format
+        # web_name = "test_proxy_api"
+        # returns an HttpCrawler::Proxy::TestProxyApi::Client instance
+        #
+        def for(web_name, *arg)
+          "HttpCrawler::Proxy::#{web_name.camelize}::Client".constantize.new(*arg)
+        end
+
+      end
+
+      def max_error_num
+        @max_error_num ||= 0
+      end
 
     end
   end
data/lib/http_crawler/version.rb
CHANGED