list_spider 2.2.0 → 2.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +84 -84
- data/.rdoc_options +23 -23
- data/.rubocop.yml +48 -48
- data/English_README.md +169 -169
- data/Gemfile +6 -6
- data/Gemfile.lock +12 -11
- data/README.md +181 -181
- data/Rakefile +2 -2
- data/bin/console +14 -14
- data/bin/setup +8 -8
- data/check_code.sh +2 -2
- data/lib/file_filter.rb +72 -72
- data/lib/list_spider.rb +297 -297
- data/lib/list_spider/version.rb +3 -3
- data/lib/spider_helper.rb +110 -110
- data/list_spider.gemspec +31 -31
- data/spider_example.rb +27 -27
- data/spider_example_2.rb +29 -29
- metadata +6 -5
data/Gemfile
CHANGED
@@ -1,6 +1,6 @@
|
|
1
|
-
source 'https://rubygems.org'
|
2
|
-
|
3
|
-
git_source(:github) { |repo_name| "https://github.com/#{repo_name}" }
|
4
|
-
|
5
|
-
# Specify your gem's dependencies in list_spider.gemspec
|
6
|
-
gemspec
|
1
|
+
source 'https://rubygems.org'
|
2
|
+
|
3
|
+
git_source(:github) { |repo_name| "https://github.com/#{repo_name}" }
|
4
|
+
|
5
|
+
# Specify your gem's dependencies in list_spider.gemspec
|
6
|
+
gemspec
|
data/Gemfile.lock
CHANGED
@@ -1,16 +1,16 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
list_spider (2.0
|
4
|
+
list_spider (2.3.0)
|
5
5
|
em-http-request (~> 1.1, >= 1.1.3)
|
6
|
-
nokogiri (~> 1.
|
6
|
+
nokogiri (~> 1.10)
|
7
7
|
rchardet (~> 1.6, >= 1.6.1)
|
8
8
|
|
9
9
|
GEM
|
10
10
|
remote: https://rubygems.org/
|
11
11
|
specs:
|
12
|
-
addressable (2.
|
13
|
-
public_suffix (>= 2.0.2, <
|
12
|
+
addressable (2.7.0)
|
13
|
+
public_suffix (>= 2.0.2, < 5.0)
|
14
14
|
cookiejar (0.3.3)
|
15
15
|
em-http-request (1.1.5)
|
16
16
|
addressable (>= 2.3.4)
|
@@ -20,17 +20,18 @@ GEM
|
|
20
20
|
http_parser.rb (>= 0.6.0)
|
21
21
|
em-socksify (0.3.2)
|
22
22
|
eventmachine (>= 1.0.0.beta.4)
|
23
|
-
eventmachine (1.2.
|
23
|
+
eventmachine (1.2.7-x64-mingw32)
|
24
24
|
http_parser.rb (0.6.0)
|
25
|
-
mini_portile2 (2.
|
26
|
-
nokogiri (1.
|
27
|
-
mini_portile2 (~> 2.
|
28
|
-
public_suffix (
|
25
|
+
mini_portile2 (2.4.0)
|
26
|
+
nokogiri (1.10.7-x64-mingw32)
|
27
|
+
mini_portile2 (~> 2.4.0)
|
28
|
+
public_suffix (4.0.2)
|
29
29
|
rake (10.5.0)
|
30
|
-
rchardet (1.
|
30
|
+
rchardet (1.8.0)
|
31
31
|
|
32
32
|
PLATFORMS
|
33
33
|
ruby
|
34
|
+
x64-mingw32
|
34
35
|
|
35
36
|
DEPENDENCIES
|
36
37
|
bundler (~> 1.16)
|
@@ -38,4 +39,4 @@ DEPENDENCIES
|
|
38
39
|
rake (~> 10.0)
|
39
40
|
|
40
41
|
BUNDLED WITH
|
41
|
-
1.
|
42
|
+
1.17.1
|
data/README.md
CHANGED
@@ -1,181 +1,181 @@
|
|
1
|
-
# 关于list_spider
|
2
|
-
|
3
|
-
list_spider是一个基于[em-http-request](https://github.com/igrigorik/em-http-request)的爬虫工具。
|
4
|
-
|
5
|
-
许多情况下,爬虫的工作是爬取链接,解析返回数据,从中提取链接,继续爬取,list_spider就是适用这种场景的爬虫工具。
|
6
|
-
|
7
|
-
## 功能特点
|
8
|
-
* 去重过滤 (使用本地文件路径做唯一性校验)。
|
9
|
-
|
10
|
-
* 支持UTF-8编码转换。
|
11
|
-
|
12
|
-
* 默认增量爬取,已爬取的不再重复爬取(可以通过选项强制重新获取)。
|
13
|
-
|
14
|
-
* 自由设置最大并发数和爬取任务间隔时间。
|
15
|
-
|
16
|
-
* 支持http所有选项设置。
|
17
|
-
|
18
|
-
## 开始
|
19
|
-
|
20
|
-
```ruby
|
21
|
-
gem install list_spider
|
22
|
-
```
|
23
|
-
|
24
|
-
或者添加到Gemfile
|
25
|
-
|
26
|
-
```ruby
|
27
|
-
gem 'list_spider'
|
28
|
-
```
|
29
|
-
|
30
|
-
## 使用方法
|
31
|
-
```ruby
|
32
|
-
require 'list_spider'
|
33
|
-
|
34
|
-
DOWNLOAD_DIR = 'coolshell/'.freeze
|
35
|
-
|
36
|
-
@next_list = []
|
37
|
-
|
38
|
-
def parse_index_item(e)
|
39
|
-
content = File.read(e.local_path)
|
40
|
-
doc = Nokogiri::HTML(content)
|
41
|
-
list_group = doc.css('h2.entry-title')
|
42
|
-
link_list = list_group.css('a')
|
43
|
-
|
44
|
-
link_list.each do |link|
|
45
|
-
href = link['href']
|
46
|
-
local_path = DOWNLOAD_DIR + link.content + '.html'
|
47
|
-
# 可以存入数据库后续处理
|
48
|
-
@next_list << TaskStruct.new(href, local_path)
|
49
|
-
end
|
50
|
-
end
|
51
|
-
|
52
|
-
task_list = []
|
53
|
-
task_list << TaskStruct.new(
|
54
|
-
'https://coolshell.cn/',
|
55
|
-
DOWNLOAD_DIR + 'index.html',
|
56
|
-
parse_method: method(:parse_index_item)
|
57
|
-
)
|
58
|
-
|
59
|
-
ListSpider.get_list(task_list)
|
60
|
-
ListSpider.get_list(@next_list, max: 60)
|
61
|
-
```
|
62
|
-
|
63
|
-
## 或者使用更简单的一步完成
|
64
|
-
```ruby
|
65
|
-
require 'list_spider'
|
66
|
-
|
67
|
-
DOWNLOAD_DIR = 'coolshell/'.freeze
|
68
|
-
|
69
|
-
def parse_index_item(e)
|
70
|
-
content = File.read(e.local_path)
|
71
|
-
doc = Nokogiri::HTML(content)
|
72
|
-
list_group = doc.css('h2.entry-title')
|
73
|
-
link_list = list_group.css('a')
|
74
|
-
|
75
|
-
link_list.each do |link|
|
76
|
-
href = link['href']
|
77
|
-
local_path = DOWNLOAD_DIR + link.content + '.html'
|
78
|
-
ListSpider.add_task(TaskStruct.new(href, local_path))
|
79
|
-
end
|
80
|
-
end
|
81
|
-
|
82
|
-
# get_one是封装了get_list的简化形式,方便一个任务时调用
|
83
|
-
ListSpider.get_one(
|
84
|
-
TaskStruct.new(
|
85
|
-
'https://coolshell.cn/',
|
86
|
-
DOWNLOAD_DIR + 'index.html',
|
87
|
-
parse_method: method(:parse_index_item)
|
88
|
-
),
|
89
|
-
max: 60
|
90
|
-
)
|
91
|
-
```
|
92
|
-
|
93
|
-
## get_list/get_one参数
|
94
|
-
```
|
95
|
-
# down_list: 要请求的TaskStruct数组
|
96
|
-
# interval: 任务间隔,默认为0。若参数为Range对象,则随机间隔Range范围内的秒数。若设为RANDOM_TIME则随机间隔3到10秒。
|
97
|
-
# max: 最大并发数,默认为50。若设为NO_LIMIT_CONCURRENT,则所有请求任务全部一起并发执行
|
98
|
-
|
99
|
-
get_list(down_list, interval: DEFAULT_INTERVAL, max: DEFAULT_CONCURRNET_MAX)
|
100
|
-
get_one(task, interval: DEFAULT_INTERVAL, max: DEFAULT_CONCURRNET_MAX)
|
101
|
-
```
|
102
|
-
|
103
|
-
## 下面是TaskStruct可以设置的选项,与[em-http-request](https://github.com/igrigorik/em-http-request)基本一致
|
104
|
-
|
105
|
-
```ruby
|
106
|
-
new(href, # 请求链接
|
107
|
-
local_path, # 保存数据的本地路径(此路径作为去重标准)
|
108
|
-
# http方法,取值::get, :head, :delete, :put, :post, :patch, :options
|
109
|
-
http_method: :get,
|
110
|
-
custom_data: nil, # 自定义数据
|
111
|
-
parse_method: nil, # 解析保存文件的回调,参数是TaskStruct对象本身
|
112
|
-
# 请求成功后的回调,此时可能没有保存文件,比如301,404
|
113
|
-
# 参数是TaskStruct对象本身和对应的EventMachine::HttpRequest对象
|
114
|
-
# http_req.response_header.status 状态码
|
115
|
-
# http_req.response_header 返回头
|
116
|
-
# http_req.response 返回体
|
117
|
-
callback: nil,
|
118
|
-
# 请求失败后的回调
|
119
|
-
# 参数是TaskStruct对象本身和对应的EventMachine::HttpRequest对象
|
120
|
-
errback: nil,
|
121
|
-
stream_callback: nil, # 流数据处理回调
|
122
|
-
convert_to_utf8: false, # 是否转换为utf8编码
|
123
|
-
overwrite_exist: false, # 是否覆盖现有文件
|
124
|
-
# 请求设置
|
125
|
-
redirects: 3, # 重定向次数
|
126
|
-
keepalive: nil, # (暂不支持复用)
|
127
|
-
file: nil, # 要上传的文件路径
|
128
|
-
path: nil, # 请求路径,在流水线方式请求时有用(暂不支持)
|
129
|
-
query: nil, # 查询字符串,可以是string或hash类型
|
130
|
-
body: nil, # 请求体,可以是string或hash类型
|
131
|
-
head: nil, # 请求头
|
132
|
-
# 连接设置
|
133
|
-
connect_timeout: 60, # 连接超时时间
|
134
|
-
inactivity_timeout: nil, # 连接后超时时间
|
135
|
-
# ssl设置
|
136
|
-
# ssl: {
|
137
|
-
# :private_key_file => '/tmp/server.key',
|
138
|
-
# :cert_chain_file => '/tmp/server.crt',
|
139
|
-
# :verify_peer => false
|
140
|
-
# }
|
141
|
-
ssl: nil,
|
142
|
-
# bind: {
|
143
|
-
# :host => '123.123.123.123', # use a specific interface for outbound request
|
144
|
-
# :port => '123'
|
145
|
-
# }
|
146
|
-
bind: nil,
|
147
|
-
# 代理设置
|
148
|
-
# proxy: {
|
149
|
-
# :host => '127.0.0.1', # proxy address
|
150
|
-
# :port => 9000, # proxy port
|
151
|
-
# :type => :socks5 # default proxy mode is HTTP proxy, change to :socks5 if required
|
152
|
-
|
153
|
-
# :authorization => ['user', 'pass'] # proxy authorization header
|
154
|
-
# }
|
155
|
-
proxy: nil)
|
156
|
-
```
|
157
|
-
|
158
|
-
## 回调函数形式
|
159
|
-
|
160
|
-
```ruby
|
161
|
-
# 文件成功保存后调用,通过parse_method参数传入
|
162
|
-
def parse_eresponse(task_struct)
|
163
|
-
# ...
|
164
|
-
end
|
165
|
-
|
166
|
-
# http请求成功后调用,通过callback参数传入
|
167
|
-
def call_back(task_struct, http_req)
|
168
|
-
# http_req 是EventMachine::HttpRequest对象
|
169
|
-
# http_req.response_header.status
|
170
|
-
# ...
|
171
|
-
end
|
172
|
-
|
173
|
-
# http请求出错后调用,通过errback参数传入
|
174
|
-
def err_back(task_struct, http_req)
|
175
|
-
# ...
|
176
|
-
end
|
177
|
-
```
|
178
|
-
|
179
|
-
## License
|
180
|
-
|
181
|
-
(MIT License) - Copyright (c) 2016 Charles Zhang
|
1
|
+
# 关于list_spider
|
2
|
+
|
3
|
+
list_spider是一个基于[em-http-request](https://github.com/igrigorik/em-http-request)的爬虫工具。
|
4
|
+
|
5
|
+
许多情况下,爬虫的工作是爬取链接,解析返回数据,从中提取链接,继续爬取,list_spider就是适用这种场景的爬虫工具。
|
6
|
+
|
7
|
+
## 功能特点
|
8
|
+
* 去重过滤 (使用本地文件路径做唯一性校验)。
|
9
|
+
|
10
|
+
* 支持UTF-8编码转换。
|
11
|
+
|
12
|
+
* 默认增量爬取,已爬取的不再重复爬取(可以通过选项强制重新获取)。
|
13
|
+
|
14
|
+
* 自由设置最大并发数和爬取任务间隔时间。
|
15
|
+
|
16
|
+
* 支持http所有选项设置。
|
17
|
+
|
18
|
+
## 开始
|
19
|
+
|
20
|
+
```ruby
|
21
|
+
gem install list_spider
|
22
|
+
```
|
23
|
+
|
24
|
+
或者添加到Gemfile
|
25
|
+
|
26
|
+
```ruby
|
27
|
+
gem 'list_spider'
|
28
|
+
```
|
29
|
+
|
30
|
+
## 使用方法
|
31
|
+
```ruby
|
32
|
+
require 'list_spider'
|
33
|
+
|
34
|
+
DOWNLOAD_DIR = 'coolshell/'.freeze
|
35
|
+
|
36
|
+
@next_list = []
|
37
|
+
|
38
|
+
def parse_index_item(e)
|
39
|
+
content = File.read(e.local_path)
|
40
|
+
doc = Nokogiri::HTML(content)
|
41
|
+
list_group = doc.css('h2.entry-title')
|
42
|
+
link_list = list_group.css('a')
|
43
|
+
|
44
|
+
link_list.each do |link|
|
45
|
+
href = link['href']
|
46
|
+
local_path = DOWNLOAD_DIR + link.content + '.html'
|
47
|
+
# 可以存入数据库后续处理
|
48
|
+
@next_list << TaskStruct.new(href, local_path)
|
49
|
+
end
|
50
|
+
end
|
51
|
+
|
52
|
+
task_list = []
|
53
|
+
task_list << TaskStruct.new(
|
54
|
+
'https://coolshell.cn/',
|
55
|
+
DOWNLOAD_DIR + 'index.html',
|
56
|
+
parse_method: method(:parse_index_item)
|
57
|
+
)
|
58
|
+
|
59
|
+
ListSpider.get_list(task_list)
|
60
|
+
ListSpider.get_list(@next_list, max: 60)
|
61
|
+
```
|
62
|
+
|
63
|
+
## 或者使用更简单的一步完成
|
64
|
+
```ruby
|
65
|
+
require 'list_spider'
|
66
|
+
|
67
|
+
DOWNLOAD_DIR = 'coolshell/'.freeze
|
68
|
+
|
69
|
+
def parse_index_item(e)
|
70
|
+
content = File.read(e.local_path)
|
71
|
+
doc = Nokogiri::HTML(content)
|
72
|
+
list_group = doc.css('h2.entry-title')
|
73
|
+
link_list = list_group.css('a')
|
74
|
+
|
75
|
+
link_list.each do |link|
|
76
|
+
href = link['href']
|
77
|
+
local_path = DOWNLOAD_DIR + link.content + '.html'
|
78
|
+
ListSpider.add_task(TaskStruct.new(href, local_path))
|
79
|
+
end
|
80
|
+
end
|
81
|
+
|
82
|
+
# get_one是封装了get_list的简化形式,方便一个任务时调用
|
83
|
+
ListSpider.get_one(
|
84
|
+
TaskStruct.new(
|
85
|
+
'https://coolshell.cn/',
|
86
|
+
DOWNLOAD_DIR + 'index.html',
|
87
|
+
parse_method: method(:parse_index_item)
|
88
|
+
),
|
89
|
+
max: 60
|
90
|
+
)
|
91
|
+
```
|
92
|
+
|
93
|
+
## get_list/get_one参数
|
94
|
+
```
|
95
|
+
# down_list: 要请求的TaskStruct数组
|
96
|
+
# interval: 任务间隔,默认为0。若参数为Range对象,则随机间隔Range范围内的秒数。若设为RANDOM_TIME则随机间隔3到10秒。
|
97
|
+
# max: 最大并发数,默认为50。若设为NO_LIMIT_CONCURRENT,则所有请求任务全部一起并发执行
|
98
|
+
|
99
|
+
get_list(down_list, interval: DEFAULT_INTERVAL, max: DEFAULT_CONCURRNET_MAX)
|
100
|
+
get_one(task, interval: DEFAULT_INTERVAL, max: DEFAULT_CONCURRNET_MAX)
|
101
|
+
```
|
102
|
+
|
103
|
+
## 下面是TaskStruct可以设置的选项,与[em-http-request](https://github.com/igrigorik/em-http-request)基本一致
|
104
|
+
|
105
|
+
```ruby
|
106
|
+
new(href, # 请求链接
|
107
|
+
local_path, # 保存数据的本地路径(此路径作为去重标准)
|
108
|
+
# http方法,取值::get, :head, :delete, :put, :post, :patch, :options
|
109
|
+
http_method: :get,
|
110
|
+
custom_data: nil, # 自定义数据
|
111
|
+
parse_method: nil, # 解析保存文件的回调,参数是TaskStruct对象本身
|
112
|
+
# 请求成功后的回调,此时可能没有保存文件,比如301,404
|
113
|
+
# 参数是TaskStruct对象本身和对应的EventMachine::HttpRequest对象
|
114
|
+
# http_req.response_header.status 状态码
|
115
|
+
# http_req.response_header 返回头
|
116
|
+
# http_req.response 返回体
|
117
|
+
callback: nil,
|
118
|
+
# 请求失败后的回调
|
119
|
+
# 参数是TaskStruct对象本身和对应的EventMachine::HttpRequest对象
|
120
|
+
errback: nil,
|
121
|
+
stream_callback: nil, # 流数据处理回调
|
122
|
+
convert_to_utf8: false, # 是否转换为utf8编码
|
123
|
+
overwrite_exist: false, # 是否覆盖现有文件
|
124
|
+
# 请求设置
|
125
|
+
redirects: 3, # 重定向次数
|
126
|
+
keepalive: nil, # (暂不支持复用)
|
127
|
+
file: nil, # 要上传的文件路径
|
128
|
+
path: nil, # 请求路径,在流水线方式请求时有用(暂不支持)
|
129
|
+
query: nil, # 查询字符串,可以是string或hash类型
|
130
|
+
body: nil, # 请求体,可以是string或hash类型
|
131
|
+
head: nil, # 请求头
|
132
|
+
# 连接设置
|
133
|
+
connect_timeout: 60, # 连接超时时间
|
134
|
+
inactivity_timeout: nil, # 连接后超时时间
|
135
|
+
# ssl设置
|
136
|
+
# ssl: {
|
137
|
+
# :private_key_file => '/tmp/server.key',
|
138
|
+
# :cert_chain_file => '/tmp/server.crt',
|
139
|
+
# :verify_peer => false
|
140
|
+
# }
|
141
|
+
ssl: nil,
|
142
|
+
# bind: {
|
143
|
+
# :host => '123.123.123.123', # use a specific interface for outbound request
|
144
|
+
# :port => '123'
|
145
|
+
# }
|
146
|
+
bind: nil,
|
147
|
+
# 代理设置
|
148
|
+
# proxy: {
|
149
|
+
# :host => '127.0.0.1', # proxy address
|
150
|
+
# :port => 9000, # proxy port
|
151
|
+
# :type => :socks5 # default proxy mode is HTTP proxy, change to :socks5 if required
|
152
|
+
|
153
|
+
# :authorization => ['user', 'pass'] # proxy authorization header
|
154
|
+
# }
|
155
|
+
proxy: nil)
|
156
|
+
```
|
157
|
+
|
158
|
+
## 回调函数形式
|
159
|
+
|
160
|
+
```ruby
|
161
|
+
# 文件成功保存后调用,通过parse_method参数传入
|
162
|
+
def parse_eresponse(task_struct)
|
163
|
+
# ...
|
164
|
+
end
|
165
|
+
|
166
|
+
# http请求成功后调用,通过callback参数传入
|
167
|
+
def call_back(task_struct, http_req)
|
168
|
+
# http_req 是EventMachine::HttpRequest对象
|
169
|
+
# http_req.response_header.status
|
170
|
+
# ...
|
171
|
+
end
|
172
|
+
|
173
|
+
# http请求出错后调用,通过errback参数传入
|
174
|
+
def err_back(task_struct, http_req)
|
175
|
+
# ...
|
176
|
+
end
|
177
|
+
```
|
178
|
+
|
179
|
+
## License
|
180
|
+
|
181
|
+
(MIT License) - Copyright (c) 2016 Charles Zhang
|
data/Rakefile
CHANGED
@@ -1,2 +1,2 @@
|
|
1
|
-
require 'bundler/gem_tasks'
|
2
|
-
task default: :spec
|
1
|
+
require 'bundler/gem_tasks'
|
2
|
+
task default: :spec
|
data/bin/console
CHANGED
@@ -1,14 +1,14 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
|
-
|
3
|
-
require 'bundler/setup'
|
4
|
-
require 'list_spider'
|
5
|
-
|
6
|
-
# You can add fixtures and/or initialization code here to make experimenting
|
7
|
-
# with your gem easier. You can also use a different console, if you like.
|
8
|
-
|
9
|
-
# (If you use this, don't forget to add pry to your Gemfile!)
|
10
|
-
# require "pry"
|
11
|
-
# Pry.start
|
12
|
-
|
13
|
-
require 'irb'
|
14
|
-
IRB.start(__FILE__)
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'bundler/setup'
|
4
|
+
require 'list_spider'
|
5
|
+
|
6
|
+
# You can add fixtures and/or initialization code here to make experimenting
|
7
|
+
# with your gem easier. You can also use a different console, if you like.
|
8
|
+
|
9
|
+
# (If you use this, don't forget to add pry to your Gemfile!)
|
10
|
+
# require "pry"
|
11
|
+
# Pry.start
|
12
|
+
|
13
|
+
require 'irb'
|
14
|
+
IRB.start(__FILE__)
|