http_crawler 0.3.0.2 → 0.3.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: e47fc7ceac8e7335c7d873104a8ca7f504885af1c19a0802d23c1986d4ae5588
4
- data.tar.gz: 392e793eae03814c1f3475e7515124d51b8adcdffdec9065873a90c800765225
3
+ metadata.gz: 2aa108c02ecc7a8d922b6aa5843f902fc0d048e1a8f447a3e5d0b49abecb62a9
4
+ data.tar.gz: 2fe4d070340b90e4f90df03ccce76f17256ed992085060769f985d08c0ed383f
5
5
  SHA512:
6
- metadata.gz: fb7ba4091d7320d1fcbb3926edb060fd55155156c34cf42b7ea1b67e1b8eba3c0cdf317a2f53d8094dee3672a17058dd57f688da6a89b4f86cfcdedad5bda42f
7
- data.tar.gz: cd6001c16fbbff9023fe26c739fe270c62176849d3a4809d7bfa1aff4dd74856b6a8db95297c312d4fe56334dc7c8f04772d6eeb8a97f8d2de9a9df841c8a2ab
6
+ metadata.gz: fa391e5ea9b16a84e28ce788964c24345882fa4ca906cd25efe16769e3e0dcc1708122094c92637dd32451958042ed2592b6c950586696e0075060ecbc8ea5c2
7
+ data.tar.gz: d65c3597e646a2e245e248248bff09afe6d1aa862f7eebe7f7348704eee1c97b6139a06c1bac0ae3f1a830e4ff88398d126e279869ba64994d12ecd44bab6bd4
@@ -5,8 +5,19 @@
5
5
  </component>
6
6
  <component name="NewModuleRootManager">
7
7
  <content url="file://$MODULE_DIR$" />
8
- <orderEntry type="inheritedJdk" />
8
+ <orderEntry type="jdk" jdkName="RVM: ruby-2.4.1 [rails5.1.6]" jdkType="RUBY_SDK" />
9
9
  <orderEntry type="sourceFolder" forTests="false" />
10
- <orderEntry type="library" scope="PROVIDED" name="bundler (v1.16.6, RVM: ruby-2.4.1) [gem]" level="application" />
10
+ <orderEntry type="library" scope="PROVIDED" name="brotli (v0.2.2, RVM: ruby-2.4.1 [rails5.1.6]) [gem]" level="application" />
11
+ <orderEntry type="library" scope="PROVIDED" name="bundler (v1.16.6, RVM: ruby-2.4.1 [rails5.1.6]) [gem]" level="application" />
12
+ <orderEntry type="library" scope="PROVIDED" name="diff-lcs (v1.3, RVM: ruby-2.4.1 [rails5.1.6]) [gem]" level="application" />
13
+ <orderEntry type="library" scope="PROVIDED" name="guess_html_encoding (v0.0.11, RVM: ruby-2.4.1 [rails5.1.6]) [gem]" level="application" />
14
+ <orderEntry type="library" scope="PROVIDED" name="mini_portile2 (v2.4.0, RVM: ruby-2.4.1 [rails5.1.6]) [gem]" level="application" />
15
+ <orderEntry type="library" scope="PROVIDED" name="nokogiri (v1.10.1, RVM: ruby-2.4.1 [rails5.1.6]) [gem]" level="application" />
16
+ <orderEntry type="library" scope="PROVIDED" name="rchardet (v1.8.0, RVM: ruby-2.4.1 [rails5.1.6]) [gem]" level="application" />
17
+ <orderEntry type="library" scope="PROVIDED" name="rspec (v3.8.0, RVM: ruby-2.4.1 [rails5.1.6]) [gem]" level="application" />
18
+ <orderEntry type="library" scope="PROVIDED" name="rspec-core (v3.8.0, RVM: ruby-2.4.1 [rails5.1.6]) [gem]" level="application" />
19
+ <orderEntry type="library" scope="PROVIDED" name="rspec-mocks (v3.8.0, RVM: ruby-2.4.1 [rails5.1.6]) [gem]" level="application" />
20
+ <orderEntry type="library" scope="PROVIDED" name="rspec-support (v3.8.0, RVM: ruby-2.4.1 [rails5.1.6]) [gem]" level="application" />
21
+ <orderEntry type="library" scope="PROVIDED" name="ruby-readability (v0.7.0, RVM: ruby-2.4.1 [rails5.1.6]) [gem]" level="application" />
11
22
  </component>
12
23
  </module>
@@ -1,4 +1,4 @@
1
- load File.dirname(__FILE__) + '/http/response.rb'
1
+ require_dependency File.dirname(__FILE__) + '/http/response.rb'
2
2
 
3
3
  module HttpCrawler
4
4
  module Client
@@ -40,9 +40,9 @@ module HttpCrawler
40
40
 
41
41
  # 初始化超时时间
42
42
  def init_timeout
43
- @connect_time = 3
44
- @write_time = 3
45
- @read_time = 3
43
+ @connect_time = 5
44
+ @write_time = 5
45
+ @read_time = 5
46
46
  end
47
47
 
48
48
  # 初始化 ssl 协议
@@ -1,2 +1,2 @@
1
- load File.dirname(__FILE__) + '/common/object.rb'
2
- load File.dirname(__FILE__) + '/common/string.rb'
1
+ require_dependency File.dirname(__FILE__) + '/common/object.rb'
2
+ require_dependency File.dirname(__FILE__) + '/common/string.rb'
@@ -1,11 +1,63 @@
1
1
  module HTTP
2
2
  class Response
3
3
 
4
+
4
5
  # 解压并转码 body 数据
5
6
  def decoding_body
6
- @decoding_body ||= self.body.to_s
7
+
8
+ return @decoding_body if @decoding_body
9
+ return nil unless self.body
10
+
11
+ # 数据解压
12
+ case self.headers['Content-Encoding']
13
+ when 'gzip' then
14
+ sio = StringIO.new(self.body.to_s)
15
+ gz = Zlib::GzipReader.new(sio)
16
+ @decoding_body = gz.read()
17
+ when 'br'
18
+ @decoding_body = Brotli.inflate(self.body.to_s)
19
+ # when 'deflate'
20
+ # # 可能错误代码 暂时没解决 deflate 编码格式
21
+ # @decoding_body = Zlib::Inflate.inflate(self.body.to_s)
22
+ else
23
+ @decoding_body = self.body.to_s
24
+ end
25
+
26
+ # @decoding_body = self.body.to_s
27
+
28
+ # 判断解压后数据编码格式
29
+
30
+ # 从header取编码格式
31
+ encoding = self.headers['Content-Type'][/charset=([^, ;"]*)/, 1] if self.headers['Content-Type']
32
+
33
+ # 从html中的 charset 取编码格式
34
+ # 不能使用,因为 decoding_body 还未转码,直接使用可能报错: ArgumentError: invalid byte sequence in UTF-8
35
+ # encoding = @decoding_body[/charset=([^, ;"]*)/, 1] unless encoding
36
+
37
+ # 通过 CharDet 判断编码格式
38
+ encoding = CharDet.detect(@decoding_body)["encoding"] unless encoding
39
+
40
+
41
+ # 进行转码
42
+ begin
43
+ @decoding_body.force_encoding(encoding).encode!('utf-8') if encoding && encoding != @decoding_body.encoding
44
+ rescue => e
45
+ # 转码错误后再次使用 CharDet 判断编码格式后进行转码
46
+ cd = CharDet.detect(@decoding_body)["encoding"]
47
+ if (cd && cd != encoding)
48
+ @decoding_body.force_encoding(cd).encode!('utf-8') if encoding != @decoding_body.encoding
49
+ else
50
+ # 还是转码错误则抛出异常
51
+ Rails.logger.debug "encoding => #{encoding}"
52
+ Rails.logger.debug "cd => #{cd}"
53
+ Rails.logger.debug "@decoding_body[0..200] => #{@decoding_body[0..200]}"
54
+ raise e
55
+ end
56
+ end
57
+
7
58
  end
8
59
 
60
+ alias_method :dec, :decoding_body
9
61
  # def decoding_body
10
62
 
11
63
  def html
@@ -19,13 +71,17 @@ module HTTP
19
71
  end
20
72
 
21
73
  # 通过readability 解析数据
74
+ # [Readability::Document]
22
75
  def readability
23
76
  @readability ||= Readability::Document.new(decoding_body, {do_not_guess_encoding: true})
24
77
  end
25
78
 
79
+ def content
80
+ Nokogiri::HTML(readability.content).text
81
+ end
26
82
  # 解析
27
83
  def parsing
28
- json
84
+ self.json
29
85
  end
30
86
 
31
87
  # 获取解析结果
@@ -63,7 +119,7 @@ module HTTP
63
119
  regular_num = decoding_body =~ regular
64
120
  if regular_num
65
121
  Rails.logger.warn("触发验证信息")
66
- Rails.logger.warn(decoding_body[regular_num..(regular_num + 100)])
122
+ Rails.logger.warn(decoding_body[regular_num..(regular_num + 300)])
67
123
  return true
68
124
  end
69
125
  end
@@ -21,4 +21,4 @@ module HttpCrawler
21
21
  end
22
22
  end
23
23
 
24
- load File.dirname(__FILE__) + '/proxy/client.rb'
24
+ require_dependency File.dirname(__FILE__) + '/proxy/client.rb'
@@ -1,3 +1,3 @@
1
1
  module HttpCrawler
2
- VERSION = "0.3.0.2"
2
+ VERSION = "0.3.0.3"
3
3
  end
@@ -5,4 +5,4 @@ module HttpCrawler
5
5
  end
6
6
  end
7
7
 
8
- load File.dirname(__FILE__) + '/web/client.rb'
8
+ require_dependency File.dirname(__FILE__) + '/web/client.rb'
data/lib/http_crawler.rb CHANGED
@@ -1,17 +1,14 @@
1
- require 'net/http'
2
1
  require 'json'
3
2
  require 'digest/md5'
4
3
  require 'nokogiri'
5
4
 
6
- require 'http_crawler/errors.rb'
7
- load 'http_crawler/common.rb'
8
- load 'http_crawler/client.rb'
9
- load 'http_crawler/web.rb'
10
- load 'http_crawler/proxy.rb'
5
+ # 千万不能使用 require 或者 load,这样的话 Rails 调试的时候就不能热加载了
6
+ require_dependency 'http_crawler/errors.rb'
7
+ require_dependency 'http_crawler/common.rb'
8
+ require_dependency 'http_crawler/client.rb'
9
+ require_dependency 'http_crawler/web.rb'
10
+ require_dependency 'http_crawler/proxy.rb'
11
11
 
12
12
  module HttpCrawler
13
13
  # Your code goes here...
14
- def self.a
15
- puts "112"
16
- end
17
14
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: http_crawler
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.0.2
4
+ version: 0.3.0.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - jagger
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2019-02-19 00:00:00.000000000 Z
11
+ date: 2019-02-22 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rspec
@@ -137,7 +137,6 @@ files:
137
137
  - lib/http_crawler/common/object.rb
138
138
  - lib/http_crawler/common/string.rb
139
139
  - lib/http_crawler/errors.rb
140
- - lib/http_crawler/http.rb
141
140
  - lib/http_crawler/http/response.rb
142
141
  - lib/http_crawler/proxy.rb
143
142
  - lib/http_crawler/proxy/README.md
@@ -1,260 +0,0 @@
1
- module HttpCrawler
2
- class HTTP < Net::HTTP
3
-
4
- # 自动获取代理,true 表示自动获取代理 、false 表示不自动获取
5
- attr_accessor :auto_proxy
6
- # 代理API的别名 主要关联 HttpCrawler::Proxy中维护的代理API
7
- attr_accessor :proxy_api
8
-
9
- def proxy_api
10
- @proxy_api ||= "my"
11
- end
12
-
13
- # 调用自己的代理池所需要的主键 key
14
- attr_accessor :proxy_key
15
- # 请求错误后的重复最大请求次数
16
- attr_accessor :max_error_num
17
- # 错误的url地址,存的是正则
18
- attr_accessor :error_urls
19
-
20
- def initialize(address, port = nil)
21
- super(address, port)
22
- @max_error_num = 2
23
- @error_num = 0
24
- @proxy_key = "default"
25
- @error_urls = []
26
- end
27
-
28
- def http_error_sleep
29
- sleep(0.5)
30
- end
31
-
32
- def server_error_sleep
33
- sleep(3)
34
- end
35
-
36
-
37
- @@proxy_list = []
38
- # 为 @http 重设代理
39
- def proxy(p = {})
40
-
41
- raise '代理设置 p_addr 不能为空' unless p["p_addr"]
42
- raise '代理设置 p_port 不能为空' unless p["p_port"]
43
-
44
- p["p_user"] ||= nil
45
- p["p_pass"] ||= nil
46
-
47
- Rails.logger.info("切换代理至 => #{p}")
48
- # 设为 false 否则不会启用代理
49
- @proxy_from_env = false
50
-
51
- # 初始化代理数据
52
- @proxy_address = p["p_addr"]
53
- @proxy_port = p["p_port"]
54
- @proxy_user = p["p_user"]
55
- @proxy_pass = p["p_pass"]
56
-
57
- end
58
-
59
- # 通过调用 api 获取代理或者通过自定义设置代理
60
- def get_proxy
61
-
62
- # while @@proxy_list.blank?
63
- # Rails.logger.debug("@@proxy_list 为空进行更新")
64
- # proxy_client = HttpCrawler::Proxy.for(proxy_api)
65
- # proxy_r = proxy_client.get_proxy(key: proxy_key)
66
- # @@proxy_list << proxy_r.parsing unless proxy_r.parsing.blank?
67
- # Rails.logger.debug("@@proxy_list => #{@@proxy_list}")
68
- # sleep(1)
69
- # end
70
- # p = @@proxy_list.delete_at(0)
71
-
72
- proxy_ip = nil
73
- begin
74
- Rails.logger.debug("开始获取代理IP")
75
- proxy_client = HttpCrawler::Proxy.for(proxy_api)
76
- proxy_r = proxy_client.get_proxy(key: proxy_key)
77
- proxy_ip = proxy_r.parsing unless proxy_r.parsing.blank?
78
- if proxy_ip.blank?
79
- Rails.logger.warn "无最新代理等待5秒后重新获取"
80
- else
81
- break
82
- end
83
- sleep(5)
84
- end while true
85
-
86
- Rails.logger.debug("当前IP => #{@proxy_address}:#{@proxy_port},获取最新代理 => #{proxy_ip}")
87
-
88
- unless proxy_ip && proxy_ip["p_addr"] && proxy_ip["p_port"]
89
- Rails.logger.warn "无最新代理等待5秒后重新获取"
90
- sleep(5)
91
- proxy_ip = get_proxy
92
- end
93
-
94
- if (@proxy_address == proxy_ip["p_addr"] && @proxy_port == proxy_ip["p_port"])
95
- Rails.logger.warn "无最新代理等待5秒后重新获取"
96
- sleep(5)
97
- proxy_ip = get_proxy
98
- end
99
- proxy_ip
100
- end
101
-
102
- def update_proxy(proxy_ip = {})
103
- if proxy_ip.blank?
104
- proxy(get_proxy)
105
- else
106
- proxy(proxy_ip)
107
- end
108
- end
109
-
110
- # 如果自动更新代理 则更新代理返回 true,否则返回false
111
- def update_proxy?(proxy_ip = {})
112
- if auto_proxy
113
- if proxy_ip.blank?
114
- proxy(get_proxy)
115
- else
116
- proxy(proxy_ip)
117
- end
118
- return true
119
- else
120
- return false
121
- end
122
- end
123
-
124
-
125
- # 重定向请求
126
- def get_fetch(uri_or_path, initheader = nil, dest = nil, limit = 10, &block)
127
- # You should choose a better exception.
128
- raise ArgumentError, 'too many HTTP repeated' if limit == 0
129
- # 更新uri_or_path
130
- uri_or_path = URI.encode(uri_or_path) if String === uri_or_path && CharDet.detect(uri_or_path)["encoding"] != "ascii"
131
-
132
- response = get(uri_or_path, initheader, dest, &block)
133
- case response
134
- when Net::HTTPSuccess then
135
- response
136
- when Net::HTTPRedirection then
137
- location = response['location']
138
- Rails.logger.warn "redirected to #{location}"
139
- @error_urls.each do |url_string|
140
- if location =~ /#{url_string}/
141
- raise "跳转到异常url => #{location}"
142
- end
143
- end
144
- # 传入 location 进行跳转
145
- get_fetch(location, initheader, dest, limit - 1, &block)
146
- when Net::HTTPServerError then
147
- Rails.logger.warn "Net::HTTPServerError 5XX to #{address}"
148
- server_error_sleep
149
- # 重新请求
150
- get_fetch(uri_or_path, initheader, dest, limit - 1, &block)
151
- when Net::HTTPProxyAuthenticationRequired then
152
- Rails.logger.warn "Net::HTTPProxyAuthenticationRequired 407 to proxy:[#{@proxy_address}:#{@proxy_port}] =>#{address}"
153
- if update_proxy?
154
- server_error_sleep
155
- # 重新请求
156
- get_fetch(uri_or_path, initheader, dest, limit - 1, &block)
157
- else
158
- response.error!
159
- end
160
- else
161
- Rails.logger.debug uri_or_path
162
- Rails.logger.debug initheader
163
- Rails.logger.debug response.body
164
- response.error!
165
- end
166
- end
167
-
168
- # 重定向请求
169
- def post_fetch(uri_or_path, data, initheader = nil, dest = nil, &block)
170
- # 更新uri_or_path 如果 uri_or_path 是 String类型 同时 又不是 ascii编码格式就进行转码
171
- uri_or_path = URI.encode(uri_or_path) if String === uri_or_path && CharDet.detect(uri_or_path)["encoding"] != "ascii"
172
- # Rails.logger.debug "post_fetch => #{uri_or_path}"
173
- response = post(uri_or_path, data, initheader, dest, &block)
174
- case response
175
- when Net::HTTPSuccess then
176
- response
177
- when Net::HTTPRedirection then
178
- location = response['location']
179
- Rails.logger.warn "redirected to #{location}"
180
- @error_urls.each do |url_string|
181
- if location =~ /#{url_string}/
182
- raise "跳转到异常url => #{location}"
183
- end
184
- end
185
- # 传入 location 进行跳转
186
- get_fetch(location, initheader, dest, 9, &block)
187
- when Net::HTTPServerError then
188
- Rails.logger.warn "Net::HTTPServerError 5XX to #{address}"
189
- server_error_sleep
190
- # 重新请求
191
- post_fetch(uri_or_path, initheader, dest, &block)
192
- when Net::HTTPProxyAuthenticationRequired then
193
- Rails.logger.warn "Net::HTTPProxyAuthenticationRequired 407 to proxy:[#{@proxy_address}:#{@proxy_port}] =>#{address}"
194
- if update_proxy?
195
- server_error_sleep
196
- # 重新请求
197
- post_fetch(uri_or_path, initheader, dest, &block)
198
- else
199
- response.error!
200
- end
201
- else
202
- server_error_sleep
203
- response.error!
204
- end
205
- end
206
-
207
- # def post_fetch
208
-
209
- #
210
- # 重写 发送请求的方法
211
- #
212
- def request(req, body = nil, &block)
213
- begin
214
- Rails.logger.debug("#{req.class} => #{use_ssl? ? "https://" : "http://" }#{address}:#{port}#{req.path}") if started?
215
- Rails.logger.debug("body => #{body}") if started? && body
216
- super(req, body, &block)
217
- rescue => error
218
- Rails.logger.error "出错了! 错误类型 => #{error.class}"
219
- if started?
220
- # started? 是为了判断是否结束http请求,如果不添加则会处理2次异常
221
- Rails.logger.error("#{req.class} => #{use_ssl? ? "https://" : "http://" }#{address}:#{port}#{req.path}")
222
- Rails.logger.error("body => #{body}") if body
223
- raise error
224
- else
225
- http_error_sleep
226
- # 最大错误尝试次数
227
- if @error_num < @max_error_num
228
- @error_num += 1
229
- retry # 这将把控制移到 begin 的开头
230
- else
231
-
232
- # 超过最大错误限制 判断错误类型
233
- case error
234
- when EOFError
235
- Rails.logger.warn "EOFError!"
236
- when Timeout::Error
237
- Rails.logger.warn "请求超时!"
238
- when Net::HTTPServerException
239
- Rails.logger.warn "代理失效:[#{proxy_address}:#{proxy_port}]"
240
- when Errno::ECONNREFUSED
241
- Rails.logger.warn "Errno::ECONNREFUSED"
242
- else
243
- raise error
244
- end
245
-
246
- if update_proxy?
247
- @error_num = 0
248
- retry # 这将把控制移到 begin 的开头
249
- else
250
- raise error
251
- end
252
- end
253
- end
254
- end # begin
255
- end # def request(req, body = nil, &block)
256
- end
257
- end
258
-
259
-
260
- load File.dirname(__FILE__) + '/http/response.rb'