http_crawler 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +9 -0
- data/.idea/vcs.xml +6 -0
- data/.rspec +1 -0
- data/CODE_OF_CONDUCT.md +74 -0
- data/Gemfile +10 -0
- data/README.md +55 -0
- data/Rakefile +2 -0
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/http_crawler.gemspec +45 -0
- data/lib/http_crawler/client.rb +88 -0
- data/lib/http_crawler/http.rb +211 -0
- data/lib/http_crawler/net/http.rb +7 -0
- data/lib/http_crawler/net/response.rb +96 -0
- data/lib/http_crawler/object.rb +13 -0
- data/lib/http_crawler/proxy/README.md +3 -0
- data/lib/http_crawler/proxy/client.rb +7 -0
- data/lib/http_crawler/proxy/response.rb +10 -0
- data/lib/http_crawler/proxy/test_proxy_api/README.md +18 -0
- data/lib/http_crawler/proxy/test_proxy_api/client.rb +29 -0
- data/lib/http_crawler/proxy/test_proxy_api/response/get_proxy.rb +24 -0
- data/lib/http_crawler/proxy/test_proxy_api/response.rb +12 -0
- data/lib/http_crawler/proxy.rb +18 -0
- data/lib/http_crawler/string.rb +9 -0
- data/lib/http_crawler/version.rb +3 -0
- data/lib/http_crawler/web/README.md +4 -0
- data/lib/http_crawler/web/baidu/README.md +19 -0
- data/lib/http_crawler/web/baidu/client.rb +25 -0
- data/lib/http_crawler/web/baidu/response/index.rb +16 -0
- data/lib/http_crawler/web/baidu/response.rb +10 -0
- data/lib/http_crawler/web.rb +7 -0
- data/lib/http_crawler.rb +9 -0
- metadata +175 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
+---
+SHA1:
+  metadata.gz: f034023a8c50c41be3d4e423d39fa2ad44e75930
+  data.tar.gz: 745c8a86a328387f8b8c7ef68e351c5d63d4d61a
+SHA512:
+  metadata.gz: aab1febfdc72a126e9edb1b496b661d236d3ae6689d5bf29350d4bffdbb4e9551e03487c2a7fc01d6ec7f15ff55d0872db8f90fb791474834982168aca88e531
+  data.tar.gz: 317de9a4ef0d5423b57de20cb9220cdecd598d431d67f03b025fd05c0a2ce2d013eec4e3da816ce50a35722c161aef01a90ee376add0e0ebc06c779b0e296370
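For reference, these digests can be reproduced locally. A minimal sketch, assuming a downloaded `http_crawler-0.2.0.gem` has been unpacked in the current directory (a `.gem` is a tar archive containing `metadata.gz` and `data.tar.gz`):

```ruby
require "digest"

# Recompute the SHA1/SHA512 digests that checksums.yaml records
# for the two members of the .gem archive.
%w[metadata.gz data.tar.gz].each do |name|
  bytes = File.binread(name)
  puts "#{name} SHA1:   #{Digest::SHA1.hexdigest(bytes)}"
  puts "#{name} SHA512: #{Digest::SHA512.hexdigest(bytes)}"
end
```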
data/.gitignore
ADDED
data/.idea/vcs.xml
ADDED
data/.rspec
ADDED
@@ -0,0 +1 @@
+--require spec_helper
data/CODE_OF_CONDUCT.md
ADDED
@@ -0,0 +1,74 @@
+# Contributor Covenant Code of Conduct
+
+## Our Pledge
+
+In the interest of fostering an open and welcoming environment, we as
+contributors and maintainers pledge to making participation in our project and
+our community a harassment-free experience for everyone, regardless of age, body
+size, disability, ethnicity, gender identity and expression, level of experience,
+nationality, personal appearance, race, religion, or sexual identity and
+orientation.
+
+## Our Standards
+
+Examples of behavior that contributes to creating a positive environment
+include:
+
+* Using welcoming and inclusive language
+* Being respectful of differing viewpoints and experiences
+* Gracefully accepting constructive criticism
+* Focusing on what is best for the community
+* Showing empathy towards other community members
+
+Examples of unacceptable behavior by participants include:
+
+* The use of sexualized language or imagery and unwelcome sexual attention or
+advances
+* Trolling, insulting/derogatory comments, and personal or political attacks
+* Public or private harassment
+* Publishing others' private information, such as a physical or electronic
+address, without explicit permission
+* Other conduct which could reasonably be considered inappropriate in a
+professional setting
+
+## Our Responsibilities
+
+Project maintainers are responsible for clarifying the standards of acceptable
+behavior and are expected to take appropriate and fair corrective action in
+response to any instances of unacceptable behavior.
+
+Project maintainers have the right and responsibility to remove, edit, or
+reject comments, commits, code, wiki edits, issues, and other contributions
+that are not aligned to this Code of Conduct, or to ban temporarily or
+permanently any contributor for other behaviors that they deem inappropriate,
+threatening, offensive, or harmful.
+
+## Scope
+
+This Code of Conduct applies both within project spaces and in public spaces
+when an individual is representing the project or its community. Examples of
+representing a project or community include using an official project e-mail
+address, posting via an official social media account, or acting as an appointed
+representative at an online or offline event. Representation of a project may be
+further defined and clarified by project maintainers.
+
+## Enforcement
+
+Instances of abusive, harassing, or otherwise unacceptable behavior may be
+reported by contacting the project team at 1336098842@qq.com. All
+complaints will be reviewed and investigated and will result in a response that
+is deemed necessary and appropriate to the circumstances. The project team is
+obligated to maintain confidentiality with regard to the reporter of an incident.
+Further details of specific enforcement policies may be posted separately.
+
+Project maintainers who do not follow or enforce the Code of Conduct in good
+faith may face temporary or permanent repercussions as determined by other
+members of the project's leadership.
+
+## Attribution
+
+This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
+available at [http://contributor-covenant.org/version/1/4][version]
+
+[homepage]: http://contributor-covenant.org
+[version]: http://contributor-covenant.org/version/1/4/
data/Gemfile
ADDED
@@ -0,0 +1,10 @@
+source "https://rubygems.org"
+
+git_source(:github) {|repo_name| "https://github.com/#{repo_name}" }
+# gem 'rchardet', '~> 1.8'
+# gem 'nokogiri', '~> 1.8.4'
+#
+# gem "ruby-readability", :require => 'readability'
+
+# Specify your gem's dependencies in http_crawler.gemspec
+gemspec
data/README.md
ADDED
@@ -0,0 +1,55 @@
+# HttpCrawler
+
+Welcome to your new gem! In this directory, you'll find the files you need to be able to package up your Ruby library into a gem. Put your Ruby code in the file `lib/http_crawler`. To experiment with that code, run `bin/console` for an interactive prompt.
+
+TODO: Delete this and the text above, and describe your gem
+
+## Installation
+
+Add this line to your application's Gemfile:
+
+```ruby
+gem 'http_crawler'
+```
+
+And then execute:
+
+    $ bundle
+
+Or install it yourself as:
+
+    $ gem install http_crawler
+
+
+## Example: maintaining a Baidu crawler
+
+
+### Calling via the client class
+
+```ruby
+client = HttpCrawler::Web::Baidu::Client.new
+client.index # fetch the home page
+```
+
+### Calling via an alias
+```ruby
+client = HttpCrawler::Client.for("baidu") #
+client.index # fetch the home page
+```
+
+
+## Example: test proxy API
+
+
+### Calling via the client class
+
+```ruby
+client = HttpCrawler::Proxy::TestProxyApi::Client.new
+client.get_proxy # fetch a proxy
+```
+
+### Calling via an alias
+```ruby
+client = HttpCrawler::Proxy.for("test_proxy_api") #
+client.get_proxy # fetch a proxy
+```
data/Rakefile
ADDED
data/bin/console
ADDED
@@ -0,0 +1,14 @@
+#!/usr/bin/env ruby
+
+require "bundler/setup"
+require "http_crawler"
+
+# You can add fixtures and/or initialization code here to make experimenting
+# with your gem easier. You can also use a different console, if you like.
+
+# (If you use this, don't forget to add pry to your Gemfile!)
+# require "pry"
+# Pry.start
+
+require "irb"
+IRB.start(__FILE__)
data/bin/setup
ADDED
data/http_crawler.gemspec
ADDED
@@ -0,0 +1,45 @@
+# coding: utf-8
+lib = File.expand_path("../lib", __FILE__)
+$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
+require "http_crawler/version"
+
+Gem::Specification.new do |spec|
+  spec.name          = "http_crawler"
+  spec.version       = HttpCrawler::VERSION
+  spec.authors       = ["jagger"]
+  spec.email         = ["1336098842@qq.com"]
+
+  spec.summary       = %q{HTTP crawler.}
+  spec.description   = %q{A crawler extension package built on net/http by a junior developer.}
+  spec.homepage      = "https://rubygems.org/gems/http_crawler"
+  spec.license       = "MIT"
+
+  # Prevent pushing this gem to RubyGems.org. To allow pushes either set the 'allowed_push_host'
+  # to allow pushing to a single host or delete this section to allow pushing to any host.
+  if spec.respond_to?(:metadata)
+    spec.metadata["allowed_push_host"] = "https://rubygems.org"
+  else
+    raise "RubyGems 2.0 or newer is required to protect against " \
+      "public gem pushes."
+  end
+
+  spec.files = `git ls-files -z`.split("\x0").reject do |f|
+    f.match(%r{^(test|spec|features)/})
+  end
+
+  spec.files += Dir['lib/**/*.rb']
+
+  spec.bindir        = "exe"
+  spec.executables   = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
+  spec.require_paths = ["lib"]
+
+  spec.add_development_dependency "rspec", "~> 3.8"
+  spec.add_development_dependency "bundler", "~> 1.15"
+  spec.add_development_dependency "rake", "~> 10.0"
+
+  spec.add_dependency "rchardet", "~> 1.8"
+  spec.add_dependency "nokogiri", "~> 1.8"
+  spec.add_dependency "ruby-readability", "~> 0.7.0"
+  spec.add_dependency "brotli", "~> 0.2.1"
+
+end
data/lib/http_crawler/client.rb
ADDED
@@ -0,0 +1,88 @@
+load File.dirname(__FILE__) + '/http.rb'
+load File.dirname(__FILE__) + '/object.rb'
+load File.dirname(__FILE__) + '/string.rb'
+
+module HttpCrawler
+  module Client
+
+    class << self
+
+      # Accepted format:
+      #   web_name = "biquge_duquanben"
+      # Returns an instance of HttpCrawler::Web::BiqugeDuquanben::Client
+      #
+      def for(web_name, *args)
+        "HttpCrawler::Web::#{web_name.camelize}::Client".constantize.new(*args)
+      end
+
+      #
+      # Accepted format:
+      #   module_name = "HttpCrawler::Web::BiqugeDuquanben"
+      # Returns an instance of HttpCrawler::Web::BiqugeDuquanben::Client
+      #
+      def for_module(module_name, *args)
+        "#{module_name}::Client".constantize.new(*args)
+      end
+    end
+
+    attr_reader :http, :uri
+
+    #
+    # raises if init_uri did not initialize @uri
+    # subclasses must redefine init_uri
+    #
+    def initialize
+      raise "Client uri is empty" unless init_uri
+      @http = HttpCrawler::HTTP.new(uri.host, uri.port)
+
+      @http.use_ssl = (uri.scheme == "https")
+
+      @http.open_timeout = 5
+      @http.read_timeout = 5
+      @http.proxy_key = "#{self.class}"
+      init_http
+
+      Rails.logger.debug "proxy_key => #{@http.proxy_key}"
+    end
+
+    # initialize the http parameters
+    def init_http
+
+    end
+
+    # raises if init_uri did not initialize @uri
+    # subclasses must implement it as @uri = URI("http://host")
+    #
+    def init_uri
+      @uri = nil
+    end
+
+    def header
+      @header ||= init_header
+    end
+
+    def init_header
+      nil
+    end
+
+    def update_header(parameter = {})
+      nil
+    end
+
+    def update_proxy(proxy = {})
+      @http.update_proxy(proxy)
+    end
+
+    def auto_proxy=(value)
+      Rails.logger.debug "automatically updating proxy"
+      @http.auto_proxy = value
+      @http.update_proxy if (value == true && @http.proxy? == false)
+    end
+
+    # is this a captcha/verification page?
+    def validation_page?(*arg)
+      false
+    end
+
+  end
+end
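For orientation, `Client.for` turns a snake_case name into a class via ActiveSupport's `camelize`/`constantize`, so a site is added by defining a `Client` under `HttpCrawler::Web`. A minimal sketch with a hypothetical `example` site (it assumes `activesupport` is available, and note the mixin logs through `Rails.logger`, so outside Rails a logger shim would be needed):

```ruby
require "uri"
require "active_support/core_ext/string" # camelize/constantize used by Client.for

module HttpCrawler
  module Web
    module Example # hypothetical site module
      class Client
        include HttpCrawler::Client

        # The mixin's initialize raises unless init_uri sets @uri.
        def init_uri
          @uri = URI("http://example.com/")
        end
      end
    end
  end
end

# "example".camelize => "Example", so this resolves
# HttpCrawler::Web::Example::Client and instantiates it.
client = HttpCrawler::Client.for("example")
```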
data/lib/http_crawler/http.rb
ADDED
@@ -0,0 +1,211 @@
+load File.dirname(__FILE__) + '/net/http.rb'
+load File.dirname(__FILE__) + '/net/response.rb'
+
+module HttpCrawler
+  class HTTP < Net::HTTP
+
+    # automatic proxy fetching: true fetches proxies automatically, false does not
+    attr_accessor :auto_proxy
+    # alias of the proxy API; maps to the proxy APIs maintained under HttpCrawler::Proxy
+    attr_accessor :proxy_api
+    # primary key used when calling our own proxy pool
+    attr_accessor :proxy_key
+    # maximum number of repeat requests after a request error
+    attr_accessor :max_error_num
+
+    def initialize(address, port = nil)
+      super(address, port)
+      @max_error_num = 2
+      @error_num = 0
+      @proxy_key = "default"
+    end
+
+    def http_error_sleep
+      sleep(0.5)
+    end
+
+    def server_error_sleep
+      sleep(3)
+    end
+
+    def proxy_api
+      @proxy_api ||= "my"
+    end
+    @@proxy_list = []
+    # reset the proxy for this connection
+    def proxy(p = {})
+
+      raise 'proxy setting p_addr cannot be empty' unless p["p_addr"]
+      raise 'proxy setting p_port cannot be empty' unless p["p_port"]
+
+      p["p_user"] ||= nil
+      p["p_pass"] ||= nil
+
+      Rails.logger.info("switching proxy to => #{p}")
+      # must be set to false, otherwise the proxy is never used
+      @proxy_from_env = false
+
+      # initialize the proxy data
+      @proxy_address = p["p_addr"]
+      @proxy_port = p["p_port"]
+      @proxy_user = p["p_user"]
+      @proxy_pass = p["p_pass"]
+
+    end
+
+    # fetch a proxy by calling the API, or set a custom one
+    def get_proxy
+
+      while @@proxy_list.blank?
+        Rails.logger.debug("@@proxy_list is empty, refreshing")
+        proxy_client = HttpCrawler::Proxy.for(proxy_api)
+        proxy_r = proxy_client.get_proxy(key: proxy_key)
+        @@proxy_list += proxy_r.parsing # parsing returns an array of proxy hashes
+        Rails.logger.debug("@@proxy_list => #{@@proxy_list}")
+        sleep(1)
+      end
+
+      p = @@proxy_list.delete_at(0)
+
+      Rails.logger.debug("current IP => #{@proxy_address}:#{@proxy_port}, latest proxy => #{p}")
+
+      unless p && p["p_addr"] && p["p_port"]
+        Rails.logger.warn "no new proxy, waiting 5 seconds before fetching again"
+        sleep(5)
+        p = get_proxy
+      end
+
+      if (@proxy_address == p["p_addr"] && @proxy_port == p["p_port"])
+        Rails.logger.warn "no new proxy, waiting 5 seconds before fetching again"
+        sleep(5)
+        p = get_proxy
+      end
+      p
+    end
+
+    def update_proxy(p = {})
+      if p.blank?
+        proxy(get_proxy)
+      else
+        proxy(p)
+      end
+    end
+
+    # if automatic proxy updating is on, update the proxy and return true; otherwise return false
+    def update_proxy?(p = {})
+      if auto_proxy
+        if p.blank?
+          proxy(get_proxy)
+        else
+          proxy(p)
+        end
+        return true
+      else
+        return false
+      end
+    end
+
+
+    # GET with redirect handling
+    def get_fetch(uri_or_path, initheader = nil, dest = nil, limit = 10, &block)
+      # You should choose a better exception.
+      raise ArgumentError, 'too many HTTP redirects' if limit == 0
+      # normalize uri_or_path
+      uri_or_path = URI.encode(uri_or_path) if String === uri_or_path && CharDet.detect(uri_or_path)["encoding"] != "ascii"
+
+      response = get(uri_or_path, initheader, dest, &block)
+      case response
+      when Net::HTTPSuccess then
+        response
+      when Net::HTTPRedirection then
+        location = response['location']
+        Rails.logger.warn "redirected to #{location}"
+        # follow the location header
+        get_fetch(location, initheader, dest, limit - 1, &block)
+      when Net::HTTPServerError then
+        Rails.logger.warn "Net::HTTPServerError 5XX to #{address}"
+        server_error_sleep
+        # retry the request
+        get_fetch(uri_or_path, initheader, dest, limit - 1, &block)
+      else
+        server_error_sleep
+        response.error!
+      end
+    end
+
+    # POST with redirect handling
+    def post_fetch(uri_or_path, data, initheader = nil, dest = nil, &block)
+      # normalize uri_or_path: transcode it if it is a String and not ASCII-encoded
+      uri_or_path = URI.encode(uri_or_path) if String === uri_or_path && CharDet.detect(uri_or_path)["encoding"] != "ascii"
+      Rails.logger.debug "post_fetch => #{uri_or_path}"
+      response = post(uri_or_path, data, initheader, dest, &block)
+      case response
+      when Net::HTTPSuccess then
+        response
+      when Net::HTTPRedirection then
+        location = response['location']
+        Rails.logger.warn "redirected to #{location}"
+        # follow the location header
+        get_fetch(location, initheader, dest, 9, &block)
+      when Net::HTTPServerError then
+        Rails.logger.warn "Net::HTTPServerError 5XX to #{address}"
+        server_error_sleep
+        # retry the request
+        post_fetch(uri_or_path, data, initheader, dest, &block)
+      else
+        server_error_sleep
+        response.error!
+      end
+    end
+
+    # def post_fetch
+
+    #
+    # override the request method
+    #
+    def request(req, body = nil, &block)
+      begin
+        Rails.logger.debug("#{req.class} => #{use_ssl? ? "https://" : "http://" }#{address}:#{port}#{req.path}") if started?
+        super(req, body, &block)
+      rescue => error
+        if started?
+          # started? tells whether the HTTP request already finished; without this check the exception would be handled twice
+          raise error
+        else
+          # maximum number of error retries
+          if @error_num < @max_error_num
+            @error_num += 1
+            http_error_sleep
+            retry # this returns control to the top of the begin block
+          else
+            # over the retry limit, inspect the error type
+            case error
+            when Net::HTTPFatalError
+              raise error
+            when EOFError
+              Rails.logger.warn "EOFError!"
+              if update_proxy?
+                proxy(get_proxy)
+                http_error_sleep
+                retry # this returns control to the top of the begin block
+              else
+                raise error
+              end
+            when Timeout::Error
+              Rails.logger.warn "request timed out!"
+              if update_proxy?
+                @error_num = 0
+                http_error_sleep
+                retry # this returns control to the top of the begin block
+              else
+                raise error
+              end
+            else
+              raise error
+            end
+          end
+        end
+      end # begin
+    end # def request(req, body = nil, &block)
+  end
+end
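A short usage sketch of the class above (hypothetical host; it assumes the gem and its dependencies are loaded, `rchardet` provides `CharDet`, and a `Rails.logger` is present, since every branch logs through it):

```ruby
http = HttpCrawler::HTTP.new("www.example.com", 443) # hypothetical host
http.use_ssl = true
http.open_timeout = 5
http.read_timeout = 5

# get_fetch follows 3xx redirects (at most 10 hops), sleeps and retries
# on 5xx, and raises via response.error! for anything else.
response = http.get_fetch("/")
puts response.code
```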
data/lib/http_crawler/net/response.rb
ADDED
@@ -0,0 +1,96 @@
+module Net
+  class HTTPResponse
+
+    # decompress the body and transcode it
+    def decoding_body
+
+      return @decoding_body if @decoding_body
+      return nil unless body
+
+      # decompress the data
+      case header['Content-Encoding']
+      when 'gzip' then
+        sio = StringIO.new(body)
+        gz = Zlib::GzipReader.new(sio)
+        @decoding_body = gz.read()
+      when 'br'
+        @decoding_body = Brotli.inflate(body)
+      when 'deflate'
+        # possibly incorrect; the deflate encoding is not fully handled yet
+        @decoding_body = Zlib::Inflate.inflate(body)
+      else
+        @decoding_body = body
+      end
+
+      # determine the character encoding of the decompressed data
+
+      # take the encoding from the header
+      encoding = header['Content-Type'][/charset=([^, ;"]*)/, 1]
+
+      # take the encoding from the charset inside the html
+      encoding = @decoding_body[/charset=([^, ;"]*)/, 1] unless encoding
+
+      # detect the encoding with CharDet
+      encoding = CharDet.detect(@decoding_body)["encoding"] unless encoding
+
+      # transcode
+      begin
+        @decoding_body.force_encoding(encoding).encode!('utf-8') if encoding != @decoding_body.encoding
+      rescue => e
+        # on a transcoding error, re-detect the encoding with CharDet and transcode again
+        cd = CharDet.detect(@decoding_body)["encoding"]
+        if (cd && cd != encoding)
+          @decoding_body.force_encoding(cd).encode!('utf-8') if encoding != @decoding_body.encoding
+        else
+          # if it still fails, re-raise the exception
+          raise e
+        end
+      end
+
+      @decoding_body
+    end
+
+    # def decoding_body
+
+    def html
+      @html ||= Nokogiri::HTML(decoding_body)
+    end
+
+    def json
+      @json ||= JSON.parse(decoding_body)
+      @json = JSON.parse(@json) if String === @json
+      @json
+    end
+
+    # parse the data with readability
+    def readability
+      @readability ||= Readability::Document.new(decoding_body, {do_not_guess_encoding: true})
+    end
+
+    # parse
+    def parsing
+      nil
+    end
+
+    def get_date(str)
+      time = Time.now
+      case str
+      when /^(\d{1,2})小时前$/ # "N hours ago"
+        time = time - $1.to_i.hours
+      when /^(\d{1,2})月(\d{1,2})日$/ # "month/day"
+        time = Time.local(time.year, $1.to_i, $2.to_i)
+      when /^(\d{4})年(\d{1,2})月(\d{1,2})日$/ # "year/month/day"
+        time = Time.local($1.to_i, $2.to_i, $3.to_i)
+      when /^(\d{1,2})月(\d{1,2})日[ ]{0,3}(\d{1,2}):(\d{1,2})$/ # 09月30日 12:04
+        time = Time.local(time.year, $1.to_i, $2.to_i, $3.to_i, $4.to_i)
+      end
+      return time
+    end
+
+
+    # site-verification check: true means normal data, false means a verification page appeared
+    def web_verify(*arg)
+      true
+    end
+  end # class Net::HTTPResponse
+end
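These helpers chain naturally off a fetched response. A hedged sketch of their use (assumes `nokogiri` is loaded, and ActiveSupport's `Integer#hours` for the first `get_date` branch):

```ruby
response = http.get_fetch("/") # any Net::HTTPResponse works

response.decoding_body              # body, decompressed and transcoded to UTF-8
response.html.css("title").text     # parsed once, memoized as a Nokogiri document
response.get_date("3小时前")         # => a Time 3 hours ago ("N hours ago")
response.get_date("2018年12月28日") # => Time.local(2018, 12, 28)
```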
data/lib/http_crawler/proxy/test_proxy_api/README.md
ADDED
@@ -0,0 +1,18 @@
+# Example: test proxy API
+
+
+### Calling via the client class
+
+```ruby
+client = HttpCrawler::Proxy::TestProxyApi::Client.new
+client.get_proxy # fetch a proxy
+```
+
+### Calling via an alias
+```ruby
+client = HttpCrawler::Proxy.for("test_proxy_api") #
+client.get_proxy # fetch a proxy
+```
+
+### response.rb
+Maintains the handling methods for the different response payloads
data/lib/http_crawler/proxy/test_proxy_api/client.rb
ADDED
@@ -0,0 +1,29 @@
+module HttpCrawler
+  module Proxy
+    module TestProxyApi
+      class Client
+
+        include(HttpCrawler::Client)
+        include(HttpCrawler::Proxy::Client)
+
+        class << self
+          def new(*args)
+            @client ||= super(*args)
+          end
+        end
+
+        def init_uri
+          @uri = URI("http://127.0.0.1:1111/")
+        end
+
+        # http://39.108.59.38:7772/Tools/proxyIP.ashx?OrderNumber=ccd4c8912691f28861a1ed048fec88dc&poolIndex=22717&cache=1&qty=2
+        def get_proxy(parameter = {})
+          r = http.get_fetch("/api/get_proxy")
+          r.extend(HttpCrawler::Proxy::TestProxyApi::Response::GetProxy)
+        end
+
+      end
+    end # module TestProxyApi
+  end # module Proxy
+end # module HttpCrawler
+
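One detail worth flagging: the overridden `new` memoizes the instance in a class-level `@client`, so the client behaves as a per-class singleton. A small sketch of the effect:

```ruby
a = HttpCrawler::Proxy::TestProxyApi::Client.new
b = HttpCrawler::Proxy::TestProxyApi::Client.new
a.equal?(b) # => true; every call returns the first, memoized instance
```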
data/lib/http_crawler/proxy/test_proxy_api/response/get_proxy.rb
ADDED
@@ -0,0 +1,24 @@
+# Query
+module HttpCrawler
+  module Proxy
+    module TestProxyApi
+      module Response
+        module GetProxy
+          def parsing
+            array = []
+            decoding_body.scan(/([^\n\r:]*):([^\n\r]*)/) do |v1, v2|
+              if v1 =~ /\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}/
+                array[array.length] = {"p_addr" => v1, "p_port" => v2, "p_user" => nil, "p_pass" => nil}
+              else
+                Rails.logger.warn decoding_body
+              end
+            end
+            array
+          end
+        end # module GetProxy
+      end # module Response
+    end # module TestProxyApi
+  end # module Proxy
+end # module HttpCrawler
+
+
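`parsing` assumes the proxy API returns plain text with one `ip:port` pair per line. A quick illustration of the extraction on hypothetical input:

```ruby
body = "1.2.3.4:8080\n5.6.7.8:3128\n"
array = []
# Same regexes as parsing: split each line at the colon, keep IP-looking hosts.
body.scan(/([^\n\r:]*):([^\n\r]*)/) do |v1, v2|
  array << {"p_addr" => v1, "p_port" => v2} if v1 =~ /\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}/
end
array # => [{"p_addr"=>"1.2.3.4", "p_port"=>"8080"}, {"p_addr"=>"5.6.7.8", "p_port"=>"3128"}]
```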
data/lib/http_crawler/proxy.rb
ADDED
@@ -0,0 +1,18 @@
+module HttpCrawler
+  module Proxy
+
+    class << self
+
+      # Accepted format:
+      #   web_name = "feilong"
+      # Returns an instance of HttpCrawler::Proxy::Feilong::Client
+      #
+      def for(web_name, *arg)
+        "HttpCrawler::Proxy::#{web_name.camelize}::Client".constantize.new(*arg)
+      end
+
+    end
+
+
+  end
+end
data/lib/http_crawler/web/baidu/README.md
ADDED
@@ -0,0 +1,19 @@
+# Example: maintaining a Baidu crawler
+
+
+### Calling via the client class
+
+```ruby
+client = HttpCrawler::Web::Baidu::Client.new
+client.index # fetch the home page
+```
+
+### Calling via an alias
+```ruby
+client = HttpCrawler::Client.for("baidu") #
+client.index # fetch the home page
+```
+
+
+### response.rb
+Maintains the handling methods for the different response payloads
data/lib/http_crawler/web/baidu/client.rb
ADDED
@@ -0,0 +1,25 @@
+module HttpCrawler
+  module Web
+    module Baidu
+      class Client
+        include(HttpCrawler::Client)
+
+        def init_http
+          @http.open_timeout = 3
+          @http.read_timeout = 3
+        end
+
+        def init_uri
+          @uri = URI("https://www.baidu.com/")
+        end
+
+        def index(parameter = {})
+          r = http.get_fetch("/", header)
+          r.extend(HttpCrawler::Web::Baidu::Response::Index)
+        end
+
+      end
+    end # module Baidu
+  end # module Web
+end # module HttpCrawler
+
data/lib/http_crawler/web/baidu/response/index.rb
ADDED
@@ -0,0 +1,16 @@
+# Query
+module HttpCrawler
+  module Web
+    module Baidu
+      module Response
+        module Index
+          def parsing(parameter = {})
+            html
+          end
+        end # module Index
+      end # module Response
+    end # module Baidu
+  end # module Web
+end # module HttpCrawler
+
+
data/lib/http_crawler.rb
ADDED
metadata
ADDED
@@ -0,0 +1,175 @@
+--- !ruby/object:Gem::Specification
+name: http_crawler
+version: !ruby/object:Gem::Version
+  version: 0.2.0
+platform: ruby
+authors:
+- jagger
+autorequire:
+bindir: exe
+cert_chain: []
+date: 2018-12-28 00:00:00.000000000 Z
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: rspec
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '3.8'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '3.8'
+- !ruby/object:Gem::Dependency
+  name: bundler
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.15'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.15'
+- !ruby/object:Gem::Dependency
+  name: rake
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '10.0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '10.0'
+- !ruby/object:Gem::Dependency
+  name: rchardet
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.8'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.8'
+- !ruby/object:Gem::Dependency
+  name: nokogiri
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.8'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.8'
+- !ruby/object:Gem::Dependency
+  name: ruby-readability
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 0.7.0
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 0.7.0
+- !ruby/object:Gem::Dependency
+  name: brotli
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 0.2.1
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 0.2.1
+description: A crawler extension package built on net/http by a junior developer.
+email:
+- 1336098842@qq.com
+executables: []
+extensions: []
+extra_rdoc_files: []
+files:
+- ".gitignore"
+- ".idea/vcs.xml"
+- ".rspec"
+- CODE_OF_CONDUCT.md
+- Gemfile
+- README.md
+- Rakefile
+- bin/console
+- bin/setup
+- http_crawler.gemspec
+- lib/http_crawler.rb
+- lib/http_crawler/client.rb
+- lib/http_crawler/http.rb
+- lib/http_crawler/net/http.rb
+- lib/http_crawler/net/response.rb
+- lib/http_crawler/object.rb
+- lib/http_crawler/proxy.rb
+- lib/http_crawler/proxy/README.md
+- lib/http_crawler/proxy/client.rb
+- lib/http_crawler/proxy/response.rb
+- lib/http_crawler/proxy/test_proxy_api/README.md
+- lib/http_crawler/proxy/test_proxy_api/client.rb
+- lib/http_crawler/proxy/test_proxy_api/response.rb
+- lib/http_crawler/proxy/test_proxy_api/response/get_proxy.rb
+- lib/http_crawler/string.rb
+- lib/http_crawler/version.rb
+- lib/http_crawler/web.rb
+- lib/http_crawler/web/README.md
+- lib/http_crawler/web/baidu/README.md
+- lib/http_crawler/web/baidu/client.rb
+- lib/http_crawler/web/baidu/response.rb
+- lib/http_crawler/web/baidu/response/index.rb
+homepage: https://rubygems.org/gems/http_crawler
+licenses:
+- MIT
+metadata:
+  allowed_push_host: https://rubygems.org
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: '0'
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: '0'
+requirements: []
+rubyforge_project:
+rubygems_version: 2.6.14
+signing_key:
+specification_version: 4
+summary: HTTP crawler.
+test_files: []