http_crawler 0.3.0.2 → 0.3.0.3

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: e47fc7ceac8e7335c7d873104a8ca7f504885af1c19a0802d23c1986d4ae5588
4
- data.tar.gz: 392e793eae03814c1f3475e7515124d51b8adcdffdec9065873a90c800765225
3
+ metadata.gz: 2aa108c02ecc7a8d922b6aa5843f902fc0d048e1a8f447a3e5d0b49abecb62a9
4
+ data.tar.gz: 2fe4d070340b90e4f90df03ccce76f17256ed992085060769f985d08c0ed383f
5
5
  SHA512:
6
- metadata.gz: fb7ba4091d7320d1fcbb3926edb060fd55155156c34cf42b7ea1b67e1b8eba3c0cdf317a2f53d8094dee3672a17058dd57f688da6a89b4f86cfcdedad5bda42f
7
- data.tar.gz: cd6001c16fbbff9023fe26c739fe270c62176849d3a4809d7bfa1aff4dd74856b6a8db95297c312d4fe56334dc7c8f04772d6eeb8a97f8d2de9a9df841c8a2ab
6
+ metadata.gz: fa391e5ea9b16a84e28ce788964c24345882fa4ca906cd25efe16769e3e0dcc1708122094c92637dd32451958042ed2592b6c950586696e0075060ecbc8ea5c2
7
+ data.tar.gz: d65c3597e646a2e245e248248bff09afe6d1aa862f7eebe7f7348704eee1c97b6139a06c1bac0ae3f1a830e4ff88398d126e279869ba64994d12ecd44bab6bd4
@@ -5,8 +5,19 @@
5
5
  </component>
6
6
  <component name="NewModuleRootManager">
7
7
  <content url="file://$MODULE_DIR$" />
8
- <orderEntry type="inheritedJdk" />
8
+ <orderEntry type="jdk" jdkName="RVM: ruby-2.4.1 [rails5.1.6]" jdkType="RUBY_SDK" />
9
9
  <orderEntry type="sourceFolder" forTests="false" />
10
- <orderEntry type="library" scope="PROVIDED" name="bundler (v1.16.6, RVM: ruby-2.4.1) [gem]" level="application" />
10
+ <orderEntry type="library" scope="PROVIDED" name="brotli (v0.2.2, RVM: ruby-2.4.1 [rails5.1.6]) [gem]" level="application" />
11
+ <orderEntry type="library" scope="PROVIDED" name="bundler (v1.16.6, RVM: ruby-2.4.1 [rails5.1.6]) [gem]" level="application" />
12
+ <orderEntry type="library" scope="PROVIDED" name="diff-lcs (v1.3, RVM: ruby-2.4.1 [rails5.1.6]) [gem]" level="application" />
13
+ <orderEntry type="library" scope="PROVIDED" name="guess_html_encoding (v0.0.11, RVM: ruby-2.4.1 [rails5.1.6]) [gem]" level="application" />
14
+ <orderEntry type="library" scope="PROVIDED" name="mini_portile2 (v2.4.0, RVM: ruby-2.4.1 [rails5.1.6]) [gem]" level="application" />
15
+ <orderEntry type="library" scope="PROVIDED" name="nokogiri (v1.10.1, RVM: ruby-2.4.1 [rails5.1.6]) [gem]" level="application" />
16
+ <orderEntry type="library" scope="PROVIDED" name="rchardet (v1.8.0, RVM: ruby-2.4.1 [rails5.1.6]) [gem]" level="application" />
17
+ <orderEntry type="library" scope="PROVIDED" name="rspec (v3.8.0, RVM: ruby-2.4.1 [rails5.1.6]) [gem]" level="application" />
18
+ <orderEntry type="library" scope="PROVIDED" name="rspec-core (v3.8.0, RVM: ruby-2.4.1 [rails5.1.6]) [gem]" level="application" />
19
+ <orderEntry type="library" scope="PROVIDED" name="rspec-mocks (v3.8.0, RVM: ruby-2.4.1 [rails5.1.6]) [gem]" level="application" />
20
+ <orderEntry type="library" scope="PROVIDED" name="rspec-support (v3.8.0, RVM: ruby-2.4.1 [rails5.1.6]) [gem]" level="application" />
21
+ <orderEntry type="library" scope="PROVIDED" name="ruby-readability (v0.7.0, RVM: ruby-2.4.1 [rails5.1.6]) [gem]" level="application" />
11
22
  </component>
12
23
  </module>
@@ -1,4 +1,4 @@
1
- load File.dirname(__FILE__) + '/http/response.rb'
1
+ require_dependency File.dirname(__FILE__) + '/http/response.rb'
2
2
 
3
3
  module HttpCrawler
4
4
  module Client
@@ -40,9 +40,9 @@ module HttpCrawler
40
40
 
41
41
  # 初始化超时时间
42
42
  def init_timeout
43
- @connect_time = 3
44
- @write_time = 3
45
- @read_time = 3
43
+ @connect_time = 5
44
+ @write_time = 5
45
+ @read_time = 5
46
46
  end
47
47
 
48
48
  # 初始化 ssl 协议
@@ -1,2 +1,2 @@
1
- load File.dirname(__FILE__) + '/common/object.rb'
2
- load File.dirname(__FILE__) + '/common/string.rb'
1
+ require_dependency File.dirname(__FILE__) + '/common/object.rb'
2
+ require_dependency File.dirname(__FILE__) + '/common/string.rb'
@@ -1,11 +1,63 @@
1
1
  module HTTP
2
2
  class Response
3
3
 
4
+
4
5
  # 解压并转码 body 数据
5
6
  def decoding_body
6
- @decoding_body ||= self.body.to_s
7
+
8
+ return @decoding_body if @decoding_body
9
+ return nil unless self.body
10
+
11
+ # 数据解压
12
+ case self.headers['Content-Encoding']
13
+ when 'gzip' then
14
+ sio = StringIO.new(self.body.to_s)
15
+ gz = Zlib::GzipReader.new(sio)
16
+ @decoding_body = gz.read()
17
+ when 'br'
18
+ @decoding_body = Brotli.inflate(self.body.to_s)
19
+ # when 'deflate'
20
+ # # 可能错误代码 暂时没解决 deflate 编码格式
21
+ # @decoding_body = Zlib::Inflate.inflate(self.body.to_s)
22
+ else
23
+ @decoding_body = self.body.to_s
24
+ end
25
+
26
+ # @decoding_body = self.body.to_s
27
+
28
+ # 判断解压后数据编码格式
29
+
30
+ # 从header取编码格式
31
+ encoding = self.headers['Content-Type'][/charset=([^, ;"]*)/, 1] if self.headers['Content-Type']
32
+
33
+ # 从html中的 charset 取编码格式
34
+ # 不能使用,因为 decoding_body 还未转码,直接使用可能报错: ArgumentError: invalid byte sequence in UTF-8
35
+ # encoding = @decoding_body[/charset=([^, ;"]*)/, 1] unless encoding
36
+
37
+ # 通过 CharDet 判断编码格式
38
+ encoding = CharDet.detect(@decoding_body)["encoding"] unless encoding
39
+
40
+
41
+ # 进行转码
42
+ begin
43
+ @decoding_body.force_encoding(encoding).encode!('utf-8') if encoding && encoding != @decoding_body.encoding
44
+ rescue => e
45
+ # 转码错误后再次使用 CharDet 判断编码格式后进行转码
46
+ cd = CharDet.detect(@decoding_body)["encoding"]
47
+ if (cd && cd != encoding)
48
+ @decoding_body.force_encoding(cd).encode!('utf-8') if encoding != @decoding_body.encoding
49
+ else
50
+ # 还是转码错误则抛出异常
51
+ Rails.logger.debug "encoding => #{encoding}"
52
+ Rails.logger.debug "cd => #{cd}"
53
+ Rails.logger.debug "@decoding_body[0..200] => #{@decoding_body[0..200]}"
54
+ raise e
55
+ end
56
+ end
57
+
7
58
  end
8
59
 
60
+ alias_method :dec, :decoding_body
9
61
  # def decoding_body
10
62
 
11
63
  def html
@@ -19,13 +71,17 @@ module HTTP
19
71
  end
20
72
 
21
73
  # 通过readability 解析数据
74
+ # [Readability::Document]
22
75
  def readability
23
76
  @readability ||= Readability::Document.new(decoding_body, {do_not_guess_encoding: true})
24
77
  end
25
78
 
79
+ def content
80
+ Nokogiri::HTML(readability.content).text
81
+ end
26
82
  # 解析
27
83
  def parsing
28
- json
84
+ self.json
29
85
  end
30
86
 
31
87
  # 获取解析结果
@@ -63,7 +119,7 @@ module HTTP
63
119
  regular_num = decoding_body =~ regular
64
120
  if regular_num
65
121
  Rails.logger.warn("触发验证信息")
66
- Rails.logger.warn(decoding_body[regular_num..(regular_num + 100)])
122
+ Rails.logger.warn(decoding_body[regular_num..(regular_num + 300)])
67
123
  return true
68
124
  end
69
125
  end
@@ -21,4 +21,4 @@ module HttpCrawler
21
21
  end
22
22
  end
23
23
 
24
- load File.dirname(__FILE__) + '/proxy/client.rb'
24
+ require_dependency File.dirname(__FILE__) + '/proxy/client.rb'
@@ -1,3 +1,3 @@
1
1
  module HttpCrawler
2
- VERSION = "0.3.0.2"
2
+ VERSION = "0.3.0.3"
3
3
  end
@@ -5,4 +5,4 @@ module HttpCrawler
5
5
  end
6
6
  end
7
7
 
8
- load File.dirname(__FILE__) + '/web/client.rb'
8
+ require_dependency File.dirname(__FILE__) + '/web/client.rb'
data/lib/http_crawler.rb CHANGED
@@ -1,17 +1,14 @@
1
- require 'net/http'
2
1
  require 'json'
3
2
  require 'digest/md5'
4
3
  require 'nokogiri'
5
4
 
6
- require 'http_crawler/errors.rb'
7
- load 'http_crawler/common.rb'
8
- load 'http_crawler/client.rb'
9
- load 'http_crawler/web.rb'
10
- load 'http_crawler/proxy.rb'
5
+ # 千万不能使用 require 或者 load,这样的话 Rails 调试的时候就不能热加载了
6
+ require_dependency 'http_crawler/errors.rb'
7
+ require_dependency 'http_crawler/common.rb'
8
+ require_dependency 'http_crawler/client.rb'
9
+ require_dependency 'http_crawler/web.rb'
10
+ require_dependency 'http_crawler/proxy.rb'
11
11
 
12
12
  module HttpCrawler
13
13
  # Your code goes here...
14
- def self.a
15
- puts "112"
16
- end
17
14
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: http_crawler
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.0.2
4
+ version: 0.3.0.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - jagger
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2019-02-19 00:00:00.000000000 Z
11
+ date: 2019-02-22 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rspec
@@ -137,7 +137,6 @@ files:
137
137
  - lib/http_crawler/common/object.rb
138
138
  - lib/http_crawler/common/string.rb
139
139
  - lib/http_crawler/errors.rb
140
- - lib/http_crawler/http.rb
141
140
  - lib/http_crawler/http/response.rb
142
141
  - lib/http_crawler/proxy.rb
143
142
  - lib/http_crawler/proxy/README.md
@@ -1,260 +0,0 @@
1
- module HttpCrawler
2
- class HTTP < Net::HTTP
3
-
4
- # 自动获取代理,true 表示自动获取代理 、false 表示不自动获取
5
- attr_accessor :auto_proxy
6
- # 代理API的别名 主要关联 HttpCrawler::Proxy中维护的代理API
7
- attr_accessor :proxy_api
8
-
9
- def proxy_api
10
- @proxy_api ||= "my"
11
- end
12
-
13
- # 调用自己的代理池所需要的主键 key
14
- attr_accessor :proxy_key
15
- # 请求错误后的重复最大请求次数
16
- attr_accessor :max_error_num
17
- # 错误的url地址,存的是正则
18
- attr_accessor :error_urls
19
-
20
- def initialize(address, port = nil)
21
- super(address, port)
22
- @max_error_num = 2
23
- @error_num = 0
24
- @proxy_key = "default"
25
- @error_urls = []
26
- end
27
-
28
- def http_error_sleep
29
- sleep(0.5)
30
- end
31
-
32
- def server_error_sleep
33
- sleep(3)
34
- end
35
-
36
-
37
- @@proxy_list = []
38
- # 为 @http 重设代理
39
- def proxy(p = {})
40
-
41
- raise '代理设置 p_addr 不能为空' unless p["p_addr"]
42
- raise '代理设置 p_port 不能为空' unless p["p_port"]
43
-
44
- p["p_user"] ||= nil
45
- p["p_pass"] ||= nil
46
-
47
- Rails.logger.info("切换代理至 => #{p}")
48
- # 设为 false 否则不会启用代理
49
- @proxy_from_env = false
50
-
51
- # 初始化代理数据
52
- @proxy_address = p["p_addr"]
53
- @proxy_port = p["p_port"]
54
- @proxy_user = p["p_user"]
55
- @proxy_pass = p["p_pass"]
56
-
57
- end
58
-
59
- # 通过调用 api 获取代理或者通过自定义设置代理
60
- def get_proxy
61
-
62
- # while @@proxy_list.blank?
63
- # Rails.logger.debug("@@proxy_list 为空进行更新")
64
- # proxy_client = HttpCrawler::Proxy.for(proxy_api)
65
- # proxy_r = proxy_client.get_proxy(key: proxy_key)
66
- # @@proxy_list << proxy_r.parsing unless proxy_r.parsing.blank?
67
- # Rails.logger.debug("@@proxy_list => #{@@proxy_list}")
68
- # sleep(1)
69
- # end
70
- # p = @@proxy_list.delete_at(0)
71
-
72
- proxy_ip = nil
73
- begin
74
- Rails.logger.debug("开始获取代理IP")
75
- proxy_client = HttpCrawler::Proxy.for(proxy_api)
76
- proxy_r = proxy_client.get_proxy(key: proxy_key)
77
- proxy_ip = proxy_r.parsing unless proxy_r.parsing.blank?
78
- if proxy_ip.blank?
79
- Rails.logger.warn "无最新代理等待5秒后重新获取"
80
- else
81
- break
82
- end
83
- sleep(5)
84
- end while true
85
-
86
- Rails.logger.debug("当前IP => #{@proxy_address}:#{@proxy_port},获取最新代理 => #{proxy_ip}")
87
-
88
- unless proxy_ip && proxy_ip["p_addr"] && proxy_ip["p_port"]
89
- Rails.logger.warn "无最新代理等待5秒后重新获取"
90
- sleep(5)
91
- proxy_ip = get_proxy
92
- end
93
-
94
- if (@proxy_address == proxy_ip["p_addr"] && @proxy_port == proxy_ip["p_port"])
95
- Rails.logger.warn "无最新代理等待5秒后重新获取"
96
- sleep(5)
97
- proxy_ip = get_proxy
98
- end
99
- proxy_ip
100
- end
101
-
102
- def update_proxy(proxy_ip = {})
103
- if proxy_ip.blank?
104
- proxy(get_proxy)
105
- else
106
- proxy(proxy_ip)
107
- end
108
- end
109
-
110
- # 如果自动更新代理 则更新代理返回 true,否则返回false
111
- def update_proxy?(proxy_ip = {})
112
- if auto_proxy
113
- if proxy_ip.blank?
114
- proxy(get_proxy)
115
- else
116
- proxy(proxy_ip)
117
- end
118
- return true
119
- else
120
- return false
121
- end
122
- end
123
-
124
-
125
- # 重定向请求
126
- def get_fetch(uri_or_path, initheader = nil, dest = nil, limit = 10, &block)
127
- # You should choose a better exception.
128
- raise ArgumentError, 'too many HTTP repeated' if limit == 0
129
- # 更新uri_or_path
130
- uri_or_path = URI.encode(uri_or_path) if String === uri_or_path && CharDet.detect(uri_or_path)["encoding"] != "ascii"
131
-
132
- response = get(uri_or_path, initheader, dest, &block)
133
- case response
134
- when Net::HTTPSuccess then
135
- response
136
- when Net::HTTPRedirection then
137
- location = response['location']
138
- Rails.logger.warn "redirected to #{location}"
139
- @error_urls.each do |url_string|
140
- if location =~ /#{url_string}/
141
- raise "跳转到异常url => #{location}"
142
- end
143
- end
144
- # 传入 location 进行跳转
145
- get_fetch(location, initheader, dest, limit - 1, &block)
146
- when Net::HTTPServerError then
147
- Rails.logger.warn "Net::HTTPServerError 5XX to #{address}"
148
- server_error_sleep
149
- # 重新请求
150
- get_fetch(uri_or_path, initheader, dest, limit - 1, &block)
151
- when Net::HTTPProxyAuthenticationRequired then
152
- Rails.logger.warn "Net::HTTPProxyAuthenticationRequired 407 to proxy:[#{@proxy_address}:#{@proxy_port}] =>#{address}"
153
- if update_proxy?
154
- server_error_sleep
155
- # 重新请求
156
- get_fetch(uri_or_path, initheader, dest, limit - 1, &block)
157
- else
158
- response.error!
159
- end
160
- else
161
- Rails.logger.debug uri_or_path
162
- Rails.logger.debug initheader
163
- Rails.logger.debug response.body
164
- response.error!
165
- end
166
- end
167
-
168
- # 重定向请求
169
- def post_fetch(uri_or_path, data, initheader = nil, dest = nil, &block)
170
- # 更新uri_or_path 如果 uri_or_path 是 String类型 同时 又不是 ascii编码格式就进行转码
171
- uri_or_path = URI.encode(uri_or_path) if String === uri_or_path && CharDet.detect(uri_or_path)["encoding"] != "ascii"
172
- # Rails.logger.debug "post_fetch => #{uri_or_path}"
173
- response = post(uri_or_path, data, initheader, dest, &block)
174
- case response
175
- when Net::HTTPSuccess then
176
- response
177
- when Net::HTTPRedirection then
178
- location = response['location']
179
- Rails.logger.warn "redirected to #{location}"
180
- @error_urls.each do |url_string|
181
- if location =~ /#{url_string}/
182
- raise "跳转到异常url => #{location}"
183
- end
184
- end
185
- # 传入 location 进行跳转
186
- get_fetch(location, initheader, dest, 9, &block)
187
- when Net::HTTPServerError then
188
- Rails.logger.warn "Net::HTTPServerError 5XX to #{address}"
189
- server_error_sleep
190
- # 重新请求
191
- post_fetch(uri_or_path, initheader, dest, &block)
192
- when Net::HTTPProxyAuthenticationRequired then
193
- Rails.logger.warn "Net::HTTPProxyAuthenticationRequired 407 to proxy:[#{@proxy_address}:#{@proxy_port}] =>#{address}"
194
- if update_proxy?
195
- server_error_sleep
196
- # 重新请求
197
- post_fetch(uri_or_path, initheader, dest, &block)
198
- else
199
- response.error!
200
- end
201
- else
202
- server_error_sleep
203
- response.error!
204
- end
205
- end
206
-
207
- # def post_fetch
208
-
209
- #
210
- # 重写 发送请求的方法
211
- #
212
- def request(req, body = nil, &block)
213
- begin
214
- Rails.logger.debug("#{req.class} => #{use_ssl? ? "https://" : "http://" }#{address}:#{port}#{req.path}") if started?
215
- Rails.logger.debug("body => #{body}") if started? && body
216
- super(req, body, &block)
217
- rescue => error
218
- Rails.logger.error "出错了! 错误类型 => #{error.class}"
219
- if started?
220
- # started? 是为了判断是否结束http请求,如果不添加则会处理2次异常
221
- Rails.logger.error("#{req.class} => #{use_ssl? ? "https://" : "http://" }#{address}:#{port}#{req.path}")
222
- Rails.logger.error("body => #{body}") if body
223
- raise error
224
- else
225
- http_error_sleep
226
- # 最大错误尝试次数
227
- if @error_num < @max_error_num
228
- @error_num += 1
229
- retry # 这将把控制移到 begin 的开头
230
- else
231
-
232
- # 超过最大错误限制 判断错误类型
233
- case error
234
- when EOFError
235
- Rails.logger.warn "EOFError!"
236
- when Timeout::Error
237
- Rails.logger.warn "请求超时!"
238
- when Net::HTTPServerException
239
- Rails.logger.warn "代理失效:[#{proxy_address}:#{proxy_port}]"
240
- when Errno::ECONNREFUSED
241
- Rails.logger.warn "Errno::ECONNREFUSED"
242
- else
243
- raise error
244
- end
245
-
246
- if update_proxy?
247
- @error_num = 0
248
- retry # 这将把控制移到 begin 的开头
249
- else
250
- raise error
251
- end
252
- end
253
- end
254
- end # begin
255
- end # def request(req, body = nil, &block)
256
- end
257
- end
258
-
259
-
260
- load File.dirname(__FILE__) + '/http/response.rb'