http_crawler 0.3.0.3 → 0.3.0.4

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 2aa108c02ecc7a8d922b6aa5843f902fc0d048e1a8f447a3e5d0b49abecb62a9
-  data.tar.gz: 2fe4d070340b90e4f90df03ccce76f17256ed992085060769f985d08c0ed383f
+  metadata.gz: e44504aa656dea432bd96e28cc1908aac5c6164d1aa5ab9399da5b05db77b5b8
+  data.tar.gz: df36de8464939d97436941534bd441f7ee83b60b12d83c6547e9ed986d109276
 SHA512:
-  metadata.gz: fa391e5ea9b16a84e28ce788964c24345882fa4ca906cd25efe16769e3e0dcc1708122094c92637dd32451958042ed2592b6c950586696e0075060ecbc8ea5c2
-  data.tar.gz: d65c3597e646a2e245e248248bff09afe6d1aa862f7eebe7f7348704eee1c97b6139a06c1bac0ae3f1a830e4ff88398d126e279869ba64994d12ecd44bab6bd4
+  metadata.gz: a9baf0b81a3888c11d0b8d3a908fe805e9e498d32f6418aabb0f2c6392a39d9c354b74ce75e8c1673b2c2ea9b67305539b0a0c363905893e03be09050dfdb2be
+  data.tar.gz: ccaff3ba7029675ba6d6109b235b9a274f5f20638f9d59d37892cf179c24a0cbf8f2d49fdd69ea730471ca81677cd47f2080aac1fe73e404461dbd9a75a23c99
data/README.md CHANGED
@@ -45,11 +45,9 @@ client.index # fetch the homepage
 
 ```ruby
 client = HttpCrawler::Proxy::TestProxyApi::Client.new
-client.index # fetch the homepage
 ```
 
 ### Call via an alias
 ```ruby
 client = HttpCrawler::Proxy.for("test_proxy_api") #
-client.index # fetch the homepage
 ```
data/lib/http_crawler.rb CHANGED
@@ -2,6 +2,15 @@ require 'json'
 require 'digest/md5'
 require 'nokogiri'
 
+# This works around require_dependency being a Rails built-in method:
+# it is normally only usable after the Rails gems have been required.
+class << self.class
+  def require_rename
+    # Alias require as require_dependency
+    alias_method :require_dependency, :require
+  end
+end
+self.class.require_rename
+
 # Never use require or load here, otherwise hot reloading breaks when debugging under Rails
 require_dependency 'http_crawler/errors.rb'
 require_dependency 'http_crawler/common.rb'
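Note on the hunk above: the singleton-class dance is just a way to alias `require_dependency` to plain `require`, so the gem can load outside Rails while keeping Rails-friendly call sites. A minimal sketch of the same idea, assuming a plain (non-Rails) Ruby process:

```ruby
# Minimal sketch, assuming a non-Rails process where
# require_dependency is not already defined.
module Kernel
  unless private_method_defined?(:require_dependency)
    # Fall back to plain require; this loses Rails' reload
    # semantics but keeps the same call sites working.
    alias_method :require_dependency, :require
  end
end

require_dependency 'json' # now resolves with or without Rails loaded
```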
data/lib/http_crawler/client.rb CHANGED
@@ -1,7 +1,7 @@
 require_dependency File.dirname(__FILE__) + '/http/response.rb'
 
 module HttpCrawler
-  module Client
+  class Client
 
     class << self
 
@@ -9,8 +9,8 @@ module HttpCrawler
       # web_name = "biquge_duquanben"
       # returns an instance of HttpCrawler::Web::BiqugeDuquanben::Client
       #
-      def for(web_name, *args)
-        "HttpCrawler::Web::#{web_name.camelize}::Client".constantize.new(*args)
+      def for(web_name)
+        "HttpCrawler::Web::#{web_name.camelize}::Client".constantize.new()
       end
 
       #
@@ -19,7 +19,11 @@ module HttpCrawler
       # returns an instance of HttpCrawler::Web::BiqugeDuquanben::Client
       #
       def for_module(module_name, *args)
-        "#{module_name}::Client".constantize.new(*args)
+        "#{module_name}::Client".constantize.new()
+      end
+
+      def for_uri(path)
+        self.new(uri: path)
       end
     end
 
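Note: the new `for_uri` builds a client directly from a URL instead of a registered web name. A hedged usage sketch (example.com is a placeholder host):

```ruby
# The base Client#init_uri leaves @uri nil, so the uri: parameter
# supplies it through update_uri.
client = HttpCrawler::Client.for_uri("https://example.com")
client.get_uri       # GET https://example.com
client.get("/about") # GET https://example.com/about
```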
@@ -38,6 +42,25 @@ module HttpCrawler
       @uri = nil
     end
 
+    # Update the uri
+    def update_uri(uri_or_path)
+      case uri_or_path
+      when URI
+        @uri = uri_or_path
+      when String
+        if uri_or_path =~ /^http/
+          @uri = URI(uri_or_path)
+        else
+          @uri = @uri + uri_or_path
+        end
+      else
+        raise ArgumentError, uri_or_path
+      end
+      # Initialize the ssl context
+      self.init_ssl
+      self.uri
+    end
+
     # Initialize the timeout settings
     def init_timeout
       @connect_time = 5
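Note: `update_uri` accepts either a `URI` or a `String`, and merges non-`http...` strings onto the current uri as relative paths. A hedged sketch of the three branches (placeholder hosts):

```ruby
client = HttpCrawler::Client.for_uri("https://example.com/a")
client.update_uri(URI("https://example.org")) # URI: replaces @uri outright
client.update_uri("https://example.net/a")    # "http..." String: parsed as a new URI
client.update_uri("b.html")                   # other String: merged onto @uri
client.uri.to_s                               # => "https://example.net/b.html"
client.update_uri(42)                         # anything else raises ArgumentError
```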
@@ -55,23 +78,30 @@ module HttpCrawler
     end
 
     # Header-related methods
-    def header
+    def header(parameter = {})
       @header ||= init_header
     end
 
-    def init_header
-      nil
+    def init_header(parameter = {})
+      @header = {}
     end
 
     def update_header(parameter = {})
       nil
     end
 
-    # cookies
-    def cookies
-      @cookies ||= {}
+    # Cookie-related methods
+    def cookies(parameter = {})
+      @cookies ||= init_cookies
+    end
+
+    def init_cookies(parameter = {})
+      @cookies = {}
     end
 
+    def update_cookies(parameter = {})
+      nil
+    end
 
     # Proxy settings
     def auto_proxy=(value)
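Note: with `init_header` and `init_cookies` now assigning real defaults, subclasses can seed their own values by overriding these hooks. A hedged sketch (the Example client and its values are hypothetical):

```ruby
module HttpCrawler
  module Web
    module Example # hypothetical site client, for illustration only
      class Client < HttpCrawler::Web::Client
        def init_uri
          @uri = URI("https://example.com")
        end

        # Seed default headers instead of the empty hash.
        def init_header(parameter = {})
          @header = { "User-Agent" => "http_crawler example" }
        end

        # Seed default cookies instead of the empty hash.
        def init_cookies(parameter = {})
          @cookies = { "session" => "placeholder" }
        end
      end
    end
  end
end
```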
@@ -150,9 +180,9 @@ module HttpCrawler
     end
 
 
-    # Initialize http parameters
+    # Initialize init_client parameters
     def init_client
-
+      nil
     end
 
     # Initialize http request preconditions
@@ -179,9 +209,15 @@ module HttpCrawler
     # init_uri raises an error if @uri was not initialized
     # subclasses must redefine init_uri
     #
-    def initialize
+    def initialize(parameter = {})
       # Initialize the uri
-      raise "Client uri is empty" unless init_uri
+      init_uri
+
+      # Apply a custom uri, if given
+      if parameter[:uri]
+        raise "Client uri already initialized" if uri
+        update_uri(parameter[:uri])
+      end
 
       # Initialize the timeout settings
       init_timeout
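Note: under the reworked `initialize`, `init_uri` no longer has to set `@uri`, but passing `uri:` to a client whose `init_uri` already set one raises. A hedged sketch using the Baidu client from this diff:

```ruby
# Fine: init_uri supplies the uri.
client = HttpCrawler::Web::Baidu::Client.new

# Raises: the uri was already initialized by init_uri.
HttpCrawler::Web::Baidu::Client.new(uri: "https://www.baidu.com")
```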
@@ -198,12 +234,20 @@ module HttpCrawler
 
     # Send a GET request
     def get(path, params = {})
-      request {http.get((@uri + path).to_s, :params => params, :ssl_context => @ctx)}
+      raise "Client uri is empty" unless self.uri
+      request {http.get((self.uri + path).to_s, :params => params, :ssl_context => @ctx)}
+    end
+
+    # Send a GET request to the uri itself
+    def get_uri
+      raise "Client uri is empty" unless self.uri
+      request {http.get(self.uri.to_s, :ssl_context => @ctx)}
     end
 
     # Send a POST request
     def post(path, params = {})
-      request {http.post((@uri + path).to_s, :form => params, :ssl_context => @ctx)}
+      raise "Client uri is empty" unless self.uri
+      request {http.post((self.uri + path).to_s, :form => params, :ssl_context => @ctx)}
     end
 
     # The response to the request
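Note: all three request helpers now guard against a missing uri before dispatching. A hedged usage sketch (placeholder host and paths):

```ruby
client = HttpCrawler::Client.for_uri("https://example.com")
client.get("/search", q: "ruby")   # GET  https://example.com/search?q=ruby
client.post("/login", user: "bot") # POST https://example.com/login (form-encoded)
client.get_uri                     # GET  https://example.com

HttpCrawler::Client.new.get("/x")  # raises: the uri was never initialized
```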
@@ -233,7 +277,7 @@ module HttpCrawler
       begin
         block.call
       rescue => error
-
+        Rails.logger.debug error.class
         case error
         when HTTP::TimeoutError
           # Switch proxy on timeout errors
@@ -242,7 +286,6 @@ module HttpCrawler
         else
           raise error
         end
-
       else
         # Error attempt count
         if n <= 0
@@ -252,9 +295,8 @@ module HttpCrawler
             retry
           end
         end
-
         end
-        end
+        end # def request(&block)
       end
     end
 
data/lib/http_crawler/proxy.rb CHANGED
@@ -1,7 +1,5 @@
 module HttpCrawler
   module Proxy
-
-    include(HttpCrawler::Client)
     class << self
 
       # Accepted format:
@@ -11,13 +9,7 @@ module HttpCrawler
       def for(web_name, *arg)
         "HttpCrawler::Proxy::#{web_name.camelize}::Client".constantize.new(*arg)
       end
-
     end
-
-    def max_error_num
-      @max_error_num ||= 0
-    end
-
   end
 end
 
data/lib/http_crawler/proxy/client.rb CHANGED
@@ -1,7 +1,22 @@
-
 module HttpCrawler
   module Proxy
-    module Client
+    class Client < HttpCrawler::Client
+
+      class << self
+
+        # Accepted format:
+        # web_name = "test_proxy_api"
+        # returns an instance of HttpCrawler::Proxy::TestProxyApi::Client
+        #
+        def for(web_name, *arg)
+          "HttpCrawler::Proxy::#{web_name.camelize}::Client".constantize.new(*arg)
+        end
+
+      end
+
+      def max_error_num
+        @max_error_num ||= 0
+      end
 
     end
   end
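Note: this hunk is the heart of the 0.3.0.4 restructuring — shared client behavior moves from `include`d modules to a class hierarchy (`HttpCrawler::Client` → `HttpCrawler::Proxy::Client` / `HttpCrawler::Web::Client` → concrete clients). A hedged sketch of a new proxy client under the class-based scheme (MyProxyApi and its endpoint are hypothetical):

```ruby
module HttpCrawler
  module Proxy
    module MyProxyApi # hypothetical proxy source
      class Client < HttpCrawler::Proxy::Client
        def init_uri
          @uri = URI("http://127.0.0.1:8000") # assumed local proxy API
        end
      end
    end
  end
end

# Resolved by name through HttpCrawler::Proxy.for, as before:
client = HttpCrawler::Proxy.for("my_proxy_api")
client.max_error_num # => 0, inherited from HttpCrawler::Proxy::Client
```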
data/lib/http_crawler/proxy/test_proxy_api/client.rb CHANGED
@@ -2,9 +2,7 @@
 module HttpCrawler
   module Proxy
     module TestProxyApi
-      class Client
-
-        include(HttpCrawler::Proxy::Client)
+      class Client < HttpCrawler::Proxy::Client
 
 
 
data/lib/http_crawler/version.rb CHANGED
@@ -1,3 +1,3 @@
 module HttpCrawler
-  VERSION = "0.3.0.3"
+  VERSION = "0.3.0.4"
 end
data/lib/http_crawler/web/baidu/client.rb CHANGED
@@ -2,9 +2,7 @@
 module HttpCrawler
   module Web
     module Baidu
-      class Client
-
-        include(HttpCrawler::Client)
+      class Client < HttpCrawler::Web::Client
 
       def init_uri
         @uri = URI("https://www.baidu.com")
data/lib/http_crawler/web/client.rb CHANGED
@@ -2,7 +2,7 @@
 
 module HttpCrawler
   module Web
-    module Client
+    class Client < HttpCrawler::Client
     end
   end
 end
metadata CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: http_crawler
 version: !ruby/object:Gem::Version
-  version: 0.3.0.3
+  version: 0.3.0.4
 platform: ruby
 authors:
 - jagger