http_crawler 0.3.0.3 → 0.3.0.4

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 2aa108c02ecc7a8d922b6aa5843f902fc0d048e1a8f447a3e5d0b49abecb62a9
-  data.tar.gz: 2fe4d070340b90e4f90df03ccce76f17256ed992085060769f985d08c0ed383f
+  metadata.gz: e44504aa656dea432bd96e28cc1908aac5c6164d1aa5ab9399da5b05db77b5b8
+  data.tar.gz: df36de8464939d97436941534bd441f7ee83b60b12d83c6547e9ed986d109276
 SHA512:
-  metadata.gz: fa391e5ea9b16a84e28ce788964c24345882fa4ca906cd25efe16769e3e0dcc1708122094c92637dd32451958042ed2592b6c950586696e0075060ecbc8ea5c2
-  data.tar.gz: d65c3597e646a2e245e248248bff09afe6d1aa862f7eebe7f7348704eee1c97b6139a06c1bac0ae3f1a830e4ff88398d126e279869ba64994d12ecd44bab6bd4
+  metadata.gz: a9baf0b81a3888c11d0b8d3a908fe805e9e498d32f6418aabb0f2c6392a39d9c354b74ce75e8c1673b2c2ea9b67305539b0a0c363905893e03be09050dfdb2be
+  data.tar.gz: ccaff3ba7029675ba6d6109b235b9a274f5f20638f9d59d37892cf179c24a0cbf8f2d49fdd69ea730471ca81677cd47f2080aac1fe73e404461dbd9a75a23c99
data/README.md CHANGED
@@ -45,11 +45,9 @@ client.index # fetch the home page
 
 ```ruby
 client = HttpCrawler::Proxy::TestProxyApi::Client.new
-client.index # fetch the home page
 ```
 
 ### Call through an alias
 ```ruby
 client = HttpCrawler::Proxy.for("test_proxy_api") #
-client.index # fetch the home page
 ```
data/lib/http_crawler.rb CHANGED
@@ -2,6 +2,15 @@ require 'json'
 require 'digest/md5'
 require 'nokogiri'
 
+# This works around require_dependency being a Rails built-in method that can only be used once the Rails gems have been required
+class << self.class
+  def require_rename
+    # Alias require to the name require_dependency
+    alias_method :require_dependency, :require
+  end
+end
+self.class.require_rename
+
 # Never use require or load here, otherwise hot reloading breaks when debugging under Rails
 require_dependency 'http_crawler/errors.rb'
 require_dependency 'http_crawler/common.rb'
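The aliasing above exists so the gem's own files can still be loaded with `require_dependency` when Rails is not present. A minimal sketch of the same fallback under that assumption (standalone Ruby, no Rails loaded; this is a hypothetical equivalent, not the gem's exact code):

```ruby
# If require_dependency is not defined (ActiveSupport absent), alias it
# to plain require on Kernel so bare calls work at the top level.
unless defined?(require_dependency)
  module Kernel
    alias_method :require_dependency, :require
  end
end

require_dependency 'json' # now behaves exactly like require 'json'
```

Under Rails the real `require_dependency` is already defined and keeps its hot-reload tracking; outside Rails the alias degrades gracefully to `require`.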
data/lib/http_crawler/client.rb CHANGED
@@ -1,7 +1,7 @@
 require_dependency File.dirname(__FILE__) + '/http/response.rb'
 
 module HttpCrawler
-  module Client
+  class Client
 
     class << self
 
@@ -9,8 +9,8 @@ module HttpCrawler
       # web_name = "biquge_duquanben"
       # returns an HttpCrawler::Web::BiqugeDuquanben::Client instance
       #
-      def for(web_name, *args)
-        "HttpCrawler::Web::#{web_name.camelize}::Client".constantize.new(*args)
+      def for(web_name)
+        "HttpCrawler::Web::#{web_name.camelize}::Client".constantize.new()
       end
 
       #
@@ -19,7 +19,11 @@ module HttpCrawler
       # returns an HttpCrawler::Web::BiqugeDuquanben::Client instance
       #
       def for_module(module_name, *args)
-        "#{module_name}::Client".constantize.new(*args)
+        "#{module_name}::Client".constantize.new()
+      end
+
+      def for_uri(path)
+        self.new(uri: path)
       end
     end
 
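These factory methods resolve client classes by name via ActiveSupport's `camelize`/`constantize`. A usage sketch, assuming the gem is loaded (the Baidu client is the only concrete web client visible in this diff; `for_uri` is new in this release):

```ruby
# Look up a client class from a snake_case name:
client = HttpCrawler::Client.for("baidu")
# => #<HttpCrawler::Web::Baidu::Client ...>

# Look up by module instead of by name:
client = HttpCrawler::Client.for_module("HttpCrawler::Web::Baidu")

# New: build a client directly from a URL (delegates to new(uri: path)):
client = HttpCrawler::Web::Client.for_uri("https://www.baidu.com")
```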
@@ -38,6 +42,25 @@ module HttpCrawler
       @uri = nil
     end
 
+    # Update the uri
+    def update_uri(uri_or_path)
+      case uri_or_path
+      when URI
+        @uri = uri_or_path
+      when String
+        if uri_or_path =~ /^http/
+          @uri = URI(uri_or_path)
+        else
+          @uri = @uri + uri_or_path
+        end
+      else
+        raise ArgumentError, uri_or_path
+      end
+      # Initialize the ssl protocol
+      self.init_ssl
+      self.uri
+    end
+
     # Initialize the timeouts
     def init_timeout
       @connect_time = 5
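`update_uri` accepts a `URI` object, an absolute `http(s)` URL string, or a relative path that is merged onto the current `@uri`; anything else raises `ArgumentError`. A sketch of the three branches (the example.com values are illustrative):

```ruby
client = HttpCrawler::Web::Baidu::Client.new  # @uri = URI("https://www.baidu.com")

client.update_uri(URI("https://example.com"))  # URI: taken as-is
client.update_uri("https://example.com/news")  # "http..." String: parsed with URI()
client.update_uri("/search?q=ruby")            # other String: merged via URI#+
client.uri.to_s                                # => "https://example.com/search?q=ruby"
# client.update_uri(42)                        # anything else: ArgumentError
```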
@@ -55,23 +78,30 @@ module HttpCrawler
     end
 
     # Header-related methods
-    def header
+    def header(parameter = {})
       @header ||= init_header
     end
 
-    def init_header
-      nil
+    def init_header(parameter = {})
+      @header = {}
     end
 
     def update_header(parameter = {})
       nil
     end
 
-    # cookies
-    def cookies
-      @cookies ||= {}
+    # Cookie-related methods
+    def cookies(parameter = {})
+      @cookies ||= init_cookies
+    end
+
+    def init_cookies(parameter = {})
+      @cookies = {}
     end
 
+    def update_cookies(parameter = {})
+      nil
+    end
 
     # Proxy settings
     def auto_proxy=(value)
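`init_header` and `init_cookies` are now the override points for per-site defaults, and both memoizing readers (`header`, `cookies`) fall back to them. A subclass sketch under that contract, assuming the gem is loaded (the Example site and the header/cookie values are hypothetical):

```ruby
module HttpCrawler
  module Web
    module Example                         # hypothetical site client
      class Client < HttpCrawler::Web::Client
        def init_uri
          @uri = URI("https://example.com")
        end

        def init_header(parameter = {})
          @header = { "User-Agent" => "http_crawler sample" }
        end

        def init_cookies(parameter = {})
          @cookies = { "locale" => "en" }
        end
      end
    end
  end
end

HttpCrawler::Client.for("example").header  # => {"User-Agent"=>"http_crawler sample"}
```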
@@ -150,9 +180,9 @@
     end
 
 
-    # Initialize http parameters
+    # Initialize init_client parameters
     def init_client
-
+      nil
     end
 
     # Initialize http request preconditions
@@ -179,9 +209,15 @@
     # init_uri raises an error if @uri has not been initialized
     # subclasses must redefine init_uri
     #
-    def initialize
+    def initialize(parameter = {})
       # Initialize the uri
-      raise "Client uri is empty" unless init_uri
+      init_uri
+
+      # If a custom uri was passed in
+      if parameter[:uri]
+        raise "Client uri is already initialized" if uri
+        update_uri(parameter[:uri])
+      end
 
       # Initialize the timeouts
       init_timeout
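`initialize` no longer raises when `init_uri` leaves `@uri` unset; instead it optionally accepts a `uri:` parameter, and guards against overriding a uri the subclass already set. A sketch (the URL is illustrative):

```ruby
# Generic client pointed at an arbitrary host:
client = HttpCrawler::Web::Client.new(uri: "https://example.com")

# Baidu's init_uri already assigns @uri, so passing uri: raises
# "Client uri is already initialized":
# HttpCrawler::Web::Baidu::Client.new(uri: "https://example.com")
```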
@@ -198,12 +234,20 @@
 
     # Send a get request
     def get(path, params = {})
-      request {http.get((@uri + path).to_s, :params => params, :ssl_context => @ctx)}
+      raise "Client uri is empty" unless self.uri
+      request {http.get((self.uri + path).to_s, :params => params, :ssl_context => @ctx)}
+    end
+
+    # Send a get request for the uri itself
+    def get_uri
+      raise "Client uri is empty" unless self.uri
+      request {http.get(self.uri.to_s, :ssl_context => @ctx)}
     end
 
     # Send a post request
     def post(path, params = {})
-      request {http.post((@uri + path).to_s, :form => params, :ssl_context => @ctx)}
+      raise "Client uri is empty" unless self.uri
+      request {http.post((self.uri + path).to_s, :form => params, :ssl_context => @ctx)}
     end
 
     # Response of the request
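With construction no longer guaranteeing a uri, each request helper now checks `self.uri` first. A usage sketch (paths and params are illustrative):

```ruby
client = HttpCrawler::Web::Baidu::Client.new

res = client.get("/s", wd: "ruby")   # GET https://www.baidu.com/s?wd=ruby
res = client.post("/api", key: "v")  # POST with a form-encoded body

# get_uri requests @uri itself, with no path merging:
client.update_uri("https://www.baidu.com/robots.txt")
res = client.get_uri

HttpCrawler::Web::Client.new.get("/x")  # raises "Client uri is empty"
```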
@@ -233,7 +277,7 @@
       begin
         block.call
       rescue => error
-
+        Rails.logger.debug error.class
         case error
         when HTTP::TimeoutError
           # Switch the proxy on timeout errors
@@ -242,7 +286,6 @@
         else
           raise error
         end
-
       else
         # Number of error retries
         if n <= 0
@@ -252,9 +295,8 @@
           retry
         end
       end
-
       end
-      end
+      end # def request(&block)
   end
 end
 
data/lib/http_crawler/proxy.rb CHANGED
@@ -1,7 +1,5 @@
 module HttpCrawler
   module Proxy
-
-    include(HttpCrawler::Client)
     class << self
 
       # Accepted format
@@ -11,13 +9,7 @@ module HttpCrawler
       def for(web_name, *arg)
         "HttpCrawler::Proxy::#{web_name.camelize}::Client".constantize.new(*arg)
       end
-
     end
-
-    def max_error_num
-      @max_error_num ||= 0
-    end
-
   end
 end
 
data/lib/http_crawler/proxy/client.rb CHANGED
@@ -1,7 +1,22 @@
-
 module HttpCrawler
   module Proxy
-    module Client
+    class Client < HttpCrawler::Client
+
+      class << self
+
+        # Accepted format
+        # web_name = "test_proxy_api"
+        # returns an HttpCrawler::Proxy::TestProxyApi::Client instance
+        #
+        def for(web_name, *arg)
+          "HttpCrawler::Proxy::#{web_name.camelize}::Client".constantize.new(*arg)
+        end
+
+      end
+
+      def max_error_num
+        @max_error_num ||= 0
+      end
 
     end
   end
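`for` and `max_error_num` move from the `Proxy` namespace module onto the new `Proxy::Client` base class, so lookups work the same from either entry point. A sketch (TestProxyApi is the only proxy client shown in this diff):

```ruby
client = HttpCrawler::Proxy.for("test_proxy_api")
# same resolution through the class-level factory:
client = HttpCrawler::Proxy::Client.for("test_proxy_api")

client.max_error_num  # => 0 unless a subclass overrides it
```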
data/lib/http_crawler/proxy/test_proxy_api/client.rb CHANGED
@@ -2,9 +2,7 @@
 module HttpCrawler
   module Proxy
     module TestProxyApi
-      class Client
-
-        include(HttpCrawler::Proxy::Client)
+      class Client < HttpCrawler::Proxy::Client
 
 
 
data/lib/http_crawler/version.rb CHANGED
@@ -1,3 +1,3 @@
 module HttpCrawler
-  VERSION = "0.3.0.3"
+  VERSION = "0.3.0.4"
 end
data/lib/http_crawler/web/baidu/client.rb CHANGED
@@ -2,9 +2,7 @@
 module HttpCrawler
   module Web
     module Baidu
-      class Client
-
-        include(HttpCrawler::Client)
+      class Client < HttpCrawler::Web::Client
 
       def init_uri
         @uri = URI("https://www.baidu.com")
data/lib/http_crawler/web/client.rb CHANGED
@@ -2,7 +2,7 @@
 
 module HttpCrawler
   module Web
-    module Client
+    class Client < HttpCrawler::Client
     end
   end
 end
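The theme of this release: every `module Client` mixin becomes a subclass, so shared behavior flows through inheritance instead of `include(HttpCrawler::Client)`. A quick check of the resulting ancestry, assuming the gem is loaded:

```ruby
HttpCrawler::Web::Baidu::Client.ancestors.take(4)
# => [HttpCrawler::Web::Baidu::Client,
#     HttpCrawler::Web::Client,
#     HttpCrawler::Client,
#     Object]
```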
metadata CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: http_crawler
 version: !ruby/object:Gem::Version
-  version: 0.3.0.3
+  version: 0.3.0.4
 platform: ruby
 authors:
 - jagger