list_spider 2.0.2 → 2.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.rdoc_options +23 -0
- data/.rubocop.yml +2 -2
- data/English_README.md +169 -0
- data/Gemfile.lock +1 -1
- data/README.md +124 -129
- data/lib/list_spider.rb +48 -44
- data/lib/list_spider/version.rb +1 -1
- data/list_spider.gemspec +2 -2
- data/spider_example.rb +2 -4
- data/spider_example_2.rb +2 -2
- metadata +7 -12
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 2af55a6c3577dc734aa7ee545cef217059abfc7be4724eaac9cf94126b869b0e
|
4
|
+
data.tar.gz: 48e8f116b91e36613b05958f173a1bc168c0c6daa163fd137266515c3a19c2b7
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 778ae0918059fd2edea3a02081cf479d054521f216afa789f4d9131708b8339486f39b9f7603e303de91f4c03b1bb7ebf30e6b45ac0921fe0c29640743df9e5d
|
7
|
+
data.tar.gz: bcfc6df857085630faf802f3cff9d21653c2d8ced9b2595a3bc92a8093d8883cd470132b770801bc2e4977ad17e5f82aea726a2ad600ab5d1560150dede7c20f
|
data/.rdoc_options
ADDED
@@ -0,0 +1,23 @@
|
|
1
|
+
--- !ruby/object:RDoc::Options
|
2
|
+
encoding: UTF-8
|
3
|
+
static_path: []
|
4
|
+
rdoc_include:
|
5
|
+
- "."
|
6
|
+
- "/Users/zhangchao/github/list_spider"
|
7
|
+
charset: UTF-8
|
8
|
+
exclude:
|
9
|
+
hyperlink_all: false
|
10
|
+
line_numbers: false
|
11
|
+
locale:
|
12
|
+
locale_dir: locale
|
13
|
+
locale_name:
|
14
|
+
main_page:
|
15
|
+
markup: markdown
|
16
|
+
output_decoration: true
|
17
|
+
page_dir:
|
18
|
+
show_hash: false
|
19
|
+
tab_width: 8
|
20
|
+
template_stylesheets: []
|
21
|
+
title:
|
22
|
+
visibility: :protected
|
23
|
+
webcvs:
|
data/.rubocop.yml
CHANGED
@@ -18,9 +18,9 @@ Style/Documentation:
|
|
18
18
|
Enabled: false
|
19
19
|
Lint/AmbiguousRegexpLiteral:
|
20
20
|
Enabled: false
|
21
|
-
|
21
|
+
Layout/DefEndAlignment:
|
22
22
|
AutoCorrect: true
|
23
|
-
|
23
|
+
Layout/EndAlignment:
|
24
24
|
AutoCorrect: true
|
25
25
|
Style/BracesAroundHashParameters:
|
26
26
|
Enabled: false
|
data/English_README.md
ADDED
@@ -0,0 +1,169 @@
|
|
1
|
+
# list_spider
|
2
|
+
|
3
|
+
A URL list spider based on em-http-request.
|
4
|
+
|
5
|
+
Often a crawl consists of fetching a list of URLs, parsing the results to extract more links, and crawling again. This gem is built for exactly that workflow.
|
6
|
+
|
7
|
+
## Features
|
8
|
+
* Duplicate URL filtering (based on local path, so you can customize the deduplication behavior).
|
9
|
+
|
10
|
+
* Convert to UTF-8 support.
|
11
|
+
|
12
|
+
* Incremental crawling support (files that already exist locally are not fetched again).
|
13
|
+
|
14
|
+
* Customizable concurrency limit and interval between tasks.
|
15
|
+
|
16
|
+
* Http options support.
|
17
|
+
|
18
|
+
## Getting started
|
19
|
+
|
20
|
+
```ruby
|
21
|
+
gem install list_spider
|
22
|
+
```
|
23
|
+
|
24
|
+
Or add it to your Gemfile
|
25
|
+
|
26
|
+
```ruby
|
27
|
+
gem 'list_spider'
|
28
|
+
```
|
29
|
+
|
30
|
+
## Use like this
|
31
|
+
```ruby
|
32
|
+
require 'list_spider'
|
33
|
+
|
34
|
+
DOWNLOAD_DIR = 'coolshell/'.freeze
|
35
|
+
|
36
|
+
@next_list = []
|
37
|
+
|
38
|
+
def parse_index_item(e)
|
39
|
+
content = File.read(e.local_path)
|
40
|
+
doc = Nokogiri::HTML(content)
|
41
|
+
list_group = doc.css('h2.entry-title')
|
42
|
+
link_list = list_group.css('a')
|
43
|
+
|
44
|
+
link_list.each do |link|
|
45
|
+
href = link['href']
|
46
|
+
local_path = DOWNLOAD_DIR + link.content + '.html'
|
47
|
+
# or you can save them to database for later use
|
48
|
+
@next_list << TaskStruct.new(href, local_path)
|
49
|
+
end
|
50
|
+
end
|
51
|
+
|
52
|
+
task_list = []
|
53
|
+
task_list << TaskStruct.new(
|
54
|
+
'https://coolshell.cn/',
|
55
|
+
DOWNLOAD_DIR + 'index.html',
|
56
|
+
parse_method: method(:parse_index_item)
|
57
|
+
)
|
58
|
+
|
59
|
+
ListSpider.get_list(task_list)
|
60
|
+
ListSpider.get_list(@next_list, max: 60)
|
61
|
+
```
|
62
|
+
|
63
|
+
## Or in one step
|
64
|
+
```ruby
|
65
|
+
require 'list_spider'
|
66
|
+
|
67
|
+
DOWNLOAD_DIR = 'coolshell/'.freeze
|
68
|
+
|
69
|
+
def parse_index_item(e)
|
70
|
+
content = File.read(e.local_path)
|
71
|
+
doc = Nokogiri::HTML(content)
|
72
|
+
list_group = doc.css('h2.entry-title')
|
73
|
+
link_list = list_group.css('a')
|
74
|
+
|
75
|
+
link_list.each do |link|
|
76
|
+
href = link['href']
|
77
|
+
local_path = DOWNLOAD_DIR + link.content + '.html'
|
78
|
+
ListSpider.add_task(TaskStruct.new(href, local_path))
|
79
|
+
end
|
80
|
+
end
|
81
|
+
|
82
|
+
# get_one is a simple function for one taskstruct situation
|
83
|
+
ListSpider.get_one(
|
84
|
+
TaskStruct.new(
|
85
|
+
'https://coolshell.cn/',
|
86
|
+
DOWNLOAD_DIR + 'index.html',
|
87
|
+
parse_method: method(:parse_index_item)
|
88
|
+
),
|
89
|
+
max: 60
|
90
|
+
)
|
91
|
+
```
|
92
|
+
|
93
|
+
## And there are many options you can use
|
94
|
+
|
95
|
+
```ruby
|
96
|
+
def initialize(href, # 请求链接
|
97
|
+
local_path, # 保存数据的本地路径(此路径作为去重标准)
|
98
|
+
# http方法,取值::get, :head, :delete, :put, :post, :patch, :options
|
99
|
+
http_method: :get,
|
100
|
+
custom_data: nil, # 自定义数据
|
101
|
+
parse_method: nil, # 解析保存文件的回调,参数是TaskStruct对象本身
|
102
|
+
# 请求成功后的回调,此时可能没有保存文件,比如301,404
|
103
|
+
# 参数是TaskStruct对象本身和对应的EventMachine::HttpRequest对象
|
104
|
+
# http.response_header.status 状态码
|
105
|
+
# http.response_header 返回头
|
106
|
+
# http.response 返回体
|
107
|
+
callback: nil,
|
108
|
+
# 请求失败后的回调
|
109
|
+
# 参数是TaskStruct对象本身和对应的EventMachine::HttpRequest对象
|
110
|
+
errback: nil,
|
111
|
+
stream_callback: nil, # 流数据处理回调
|
112
|
+
convert_to_utf8: false, # 是否转换为utf8编码
|
113
|
+
overwrite_exist: false, # 是否覆盖现有文件
|
114
|
+
# request options
|
115
|
+
redirects: 3, # 重定向次数
|
116
|
+
keepalive: nil, # (暂不支持复用)
|
117
|
+
file: nil, # 要上传的文件路径
|
118
|
+
path: nil, # 请求路径,在流水线方式请求时有用(暂不支持)
|
119
|
+
query: nil, # 查询字符串,可以是string或hash类型
|
120
|
+
body: nil, # 请求体,可以是string或hash类型
|
121
|
+
head: nil, # 请求头
|
122
|
+
# connection options
|
123
|
+
connect_timeout: 60, # 连接超时时间
|
124
|
+
inactivity_timeout: nil, # 连接后超时时间
|
125
|
+
# ssl设置
|
126
|
+
# ssl: {
|
127
|
+
# :private_key_file => '/tmp/server.key',
|
128
|
+
# :cert_chain_file => '/tmp/server.crt',
|
129
|
+
# :verify_peer => false
|
130
|
+
# }
|
131
|
+
ssl: nil,
|
132
|
+
# bind: {
|
133
|
+
# :host => '123.123.123.123', # use a specific interface for outbound request
|
134
|
+
# :port => '123'
|
135
|
+
# }
|
136
|
+
bind: nil,
|
137
|
+
# 代理设置
|
138
|
+
# proxy: {
|
139
|
+
# :host => '127.0.0.1', # proxy address
|
140
|
+
# :port => 9000, # proxy port
|
141
|
+
# :type => :socks5 # default proxy mode is HTTP proxy, change to :socks5 if required
|
142
|
+
|
143
|
+
# :authorization => ['user', 'pass'] # proxy authorization header
|
144
|
+
# }
|
145
|
+
proxy: nil)
|
146
|
+
```
|
147
|
+
|
148
|
+
## Callback methods form
|
149
|
+
|
150
|
+
```ruby
|
151
|
+
# called when the file is saved successfully
|
152
|
+
def parse_eresponse(task_struct)
|
153
|
+
# ...
|
154
|
+
end
|
155
|
+
|
156
|
+
def call_back(task_struct, http_req)
|
157
|
+
# http_req is a EventMachine::HttpRequest object
|
158
|
+
# http_req.response_header.status
|
159
|
+
# ...
|
160
|
+
end
|
161
|
+
|
162
|
+
def err_back(task_struct, http_req)
|
163
|
+
# ...
|
164
|
+
end
|
165
|
+
```
|
166
|
+
|
167
|
+
### License
|
168
|
+
|
169
|
+
(MIT License) - Copyright (c) 2016 Charles Zhang
|
data/Gemfile.lock
CHANGED
data/README.md
CHANGED
@@ -1,186 +1,181 @@
|
|
1
|
-
# list_spider
|
1
|
+
# 关于list_spider
|
2
2
|
|
3
|
-
|
3
|
+
list_spider是一个基于[em-http-request](https://github.com/igrigorik/em-http-request)的爬虫工具。
|
4
4
|
|
5
|
-
|
5
|
+
许多情况下,爬虫的工作是爬取链接,解析返回数据,从中提取链接,继续爬取,list_spider就是适用这种场景的爬虫工具。
|
6
6
|
|
7
|
-
##
|
8
|
-
*
|
7
|
+
## 功能特点
|
8
|
+
* 去重过滤 (使用本地文件路径做唯一性校验)。
|
9
9
|
|
10
|
-
*
|
10
|
+
* 支持UTF-8编码转换。
|
11
11
|
|
12
|
-
*
|
12
|
+
* 默认增量爬取,已爬取的不再重复爬取(可以通过选项强制重新获取)。
|
13
13
|
|
14
|
-
*
|
14
|
+
* 自由设置最大并发数和爬取任务间隔时间。
|
15
15
|
|
16
|
-
*
|
16
|
+
* 支持http所有选项设置。
|
17
17
|
|
18
|
-
##
|
18
|
+
## 开始
|
19
19
|
|
20
|
-
|
20
|
+
```ruby
|
21
|
+
gem install list_spider
|
22
|
+
```
|
23
|
+
|
24
|
+
或者添加到Gemfile
|
25
|
+
|
26
|
+
```ruby
|
27
|
+
gem 'list_spider'
|
28
|
+
```
|
21
29
|
|
22
|
-
##
|
30
|
+
## 使用方法
|
23
31
|
```ruby
|
24
32
|
require 'list_spider'
|
25
33
|
|
26
|
-
DOWNLOAD_DIR = 'coolshell/'
|
34
|
+
DOWNLOAD_DIR = 'coolshell/'.freeze
|
27
35
|
|
28
|
-
|
36
|
+
@next_list = []
|
29
37
|
|
30
|
-
def parse_index_item(
|
31
|
-
content = File.read(
|
38
|
+
def parse_index_item(e)
|
39
|
+
content = File.read(e.local_path)
|
32
40
|
doc = Nokogiri::HTML(content)
|
33
|
-
list_group = doc.css(
|
34
|
-
link_list = list_group.css(
|
41
|
+
list_group = doc.css('h2.entry-title')
|
42
|
+
link_list = list_group.css('a')
|
35
43
|
|
36
44
|
link_list.each do |link|
|
37
45
|
href = link['href']
|
38
|
-
local_path = DOWNLOAD_DIR + link.content +
|
39
|
-
#
|
40
|
-
|
46
|
+
local_path = DOWNLOAD_DIR + link.content + '.html'
|
47
|
+
# 可以存入数据库后续处理
|
48
|
+
@next_list << TaskStruct.new(href, local_path)
|
41
49
|
end
|
42
50
|
end
|
43
51
|
|
44
52
|
task_list = []
|
45
|
-
task_list << TaskStruct.new(
|
53
|
+
task_list << TaskStruct.new(
|
54
|
+
'https://coolshell.cn/',
|
55
|
+
DOWNLOAD_DIR + 'index.html',
|
56
|
+
parse_method: method(:parse_index_item)
|
57
|
+
)
|
46
58
|
|
47
59
|
ListSpider.get_list(task_list)
|
48
|
-
ListSpider.get_list(
|
49
|
-
|
60
|
+
ListSpider.get_list(@next_list, max: 60)
|
50
61
|
```
|
51
62
|
|
52
|
-
##
|
63
|
+
## 或者使用更简单的一步完成
|
53
64
|
```ruby
|
54
65
|
require 'list_spider'
|
55
66
|
|
56
|
-
DOWNLOAD_DIR = 'coolshell/'
|
67
|
+
DOWNLOAD_DIR = 'coolshell/'.freeze
|
57
68
|
|
58
|
-
def parse_index_item(
|
59
|
-
|
60
|
-
content = File.read(file_name)
|
69
|
+
def parse_index_item(e)
|
70
|
+
content = File.read(e.local_path)
|
61
71
|
doc = Nokogiri::HTML(content)
|
62
|
-
list_group = doc.css(
|
63
|
-
link_list = list_group.css(
|
72
|
+
list_group = doc.css('h2.entry-title')
|
73
|
+
link_list = list_group.css('a')
|
64
74
|
|
65
75
|
link_list.each do |link|
|
66
76
|
href = link['href']
|
67
|
-
local_path = DOWNLOAD_DIR + link.content +
|
77
|
+
local_path = DOWNLOAD_DIR + link.content + '.html'
|
68
78
|
ListSpider.add_task(TaskStruct.new(href, local_path))
|
69
79
|
end
|
70
80
|
end
|
71
81
|
|
72
|
-
#get_one
|
73
|
-
ListSpider.get_one(
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
82
|
+
# get_one是封装了get_list的简化形式,方便一个任务时调用
|
83
|
+
ListSpider.get_one(
|
84
|
+
TaskStruct.new(
|
85
|
+
'https://coolshell.cn/',
|
86
|
+
DOWNLOAD_DIR + 'index.html',
|
87
|
+
parse_method: method(:parse_index_item)
|
88
|
+
),
|
89
|
+
max: 60
|
90
|
+
)
|
79
91
|
```
|
80
92
|
|
81
|
-
##
|
82
|
-
|
83
|
-
```ruby
|
84
|
-
def parse_response(file_name)
|
85
|
-
#...
|
86
|
-
end
|
87
|
-
|
88
|
-
|
89
|
-
# custom_data is passed by TaskStruct's custom_data param
|
90
|
-
|
91
|
-
def parse_response(file_name, custom_data)
|
92
|
-
#...
|
93
|
-
end
|
94
|
-
|
95
|
-
|
96
|
-
# response_header is a EventMachine::HttpResponseHeader object
|
97
|
-
# you can use it like this:
|
98
|
-
# response_header.status
|
99
|
-
# response_header.cookie
|
100
|
-
# response_header['Last-Modified']
|
101
|
-
|
102
|
-
def parse_response(file_name, custom_data, response_header)
|
103
|
-
response_header.status
|
104
|
-
response_header['Last-Modified']
|
105
|
-
|
106
|
-
#...
|
107
|
-
end
|
108
|
-
|
109
|
-
# req is a EventMachine::HttpClientOptions object
|
110
|
-
# you can use it like this:
|
111
|
-
# req.body
|
112
|
-
# req.headers
|
113
|
-
# req.uri
|
114
|
-
# req.host
|
115
|
-
# req.port
|
116
|
-
def parse_response(file_name, custom_data, response_header, req)
|
117
|
-
puts req.body
|
118
|
-
puts req.headers
|
119
|
-
puts req.uri
|
120
|
-
puts req.host
|
121
|
-
puts req.port
|
122
|
-
|
123
|
-
#...
|
124
|
-
end
|
125
|
-
|
126
|
-
```
|
127
|
-
|
128
|
-
## And there are many options you can use
|
129
|
-
|
130
|
-
```ruby
|
131
|
-
TaskStruct.new(href, local_path, http_method: :get, params: {}, custom_data: nil, parse_method: nil, header: nil)
|
93
|
+
## get_list/get_one参数
|
132
94
|
```
|
95
|
+
# down_list: 要请求的TaskStruct数组
|
96
|
+
# interval: 任务间隔,默认为0。若参数为Range对象,则随机间隔Range范围内的秒数。若设为RANDOM_TIME则随机间隔3到10秒。
|
97
|
+
# max: 最大并发数,默认为50。若设为NO_LIMIT_CONCURRENT,则所有请求任务全部一起并发执行
|
133
98
|
|
134
|
-
|
135
|
-
|
136
|
-
ListSpider.get_list(down_list, interval: 0, max: ListSpider::NO_LIMIT_CONCURRENT)
|
137
|
-
|
138
|
-
#sleep random time, often used in site which limit spider
|
139
|
-
ListSpider.get_list(down_list, interval: ListSpider::RANDOM_TIME, max: 1)
|
140
|
-
|
141
|
-
#set random time range
|
142
|
-
ListSpider.get_list(down_list, interval: (1..10), max: 1)
|
143
|
-
|
99
|
+
get_list(down_list, interval: DEFAULT_INTERVAL, max: DEFAULT_CONCURRNET_MAX)
|
100
|
+
get_one(task, interval: DEFAULT_INTERVAL, max: DEFAULT_CONCURRNET_MAX)
|
144
101
|
```
|
145
102
|
|
146
|
-
|
103
|
+
## 下面是TaskStruct可以设置的选项,与[em-http-request](https://github.com/igrigorik/em-http-request)基本一致
|
147
104
|
|
148
105
|
```ruby
|
149
|
-
#
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
#
|
156
|
-
|
157
|
-
|
158
|
-
#
|
159
|
-
|
160
|
-
|
161
|
-
#
|
162
|
-
|
163
|
-
|
164
|
-
#
|
165
|
-
|
166
|
-
|
106
|
+
new(href, # 请求链接
|
107
|
+
local_path, # 保存数据的本地路径(此路径作为去重标准)
|
108
|
+
# http方法,取值::get, :head, :delete, :put, :post, :patch, :options
|
109
|
+
http_method: :get,
|
110
|
+
custom_data: nil, # 自定义数据
|
111
|
+
parse_method: nil, # 解析保存文件的回调,参数是TaskStruct对象本身
|
112
|
+
# 请求成功后的回调,此时可能没有保存文件,比如301,404
|
113
|
+
# 参数是TaskStruct对象本身和对应的EventMachine::HttpRequest对象
|
114
|
+
# http_req.response_header.status 状态码
|
115
|
+
# http_req.response_header 返回头
|
116
|
+
# http_req.response 返回体
|
117
|
+
callback: nil,
|
118
|
+
# 请求失败后的回调
|
119
|
+
# 参数是TaskStruct对象本身和对应的EventMachine::HttpRequest对象
|
120
|
+
errback: nil,
|
121
|
+
stream_callback: nil, # 流数据处理回调
|
122
|
+
convert_to_utf8: false, # 是否转换为utf8编码
|
123
|
+
overwrite_exist: false, # 是否覆盖现有文件
|
124
|
+
# 请求设置
|
125
|
+
redirects: 3, # 重定向次数
|
126
|
+
keepalive: nil, # (暂不支持复用)
|
127
|
+
file: nil, # 要上传的文件路径
|
128
|
+
path: nil, # 请求路径,在流水线方式请求时有用(暂不支持)
|
129
|
+
query: nil, # 查询字符串,可以是string或hash类型
|
130
|
+
body: nil, # 请求体,可以是string或hash类型
|
131
|
+
head: nil, # 请求头
|
132
|
+
# 连接设置
|
133
|
+
connect_timeout: 60, # 连接超时时间
|
134
|
+
inactivity_timeout: nil, # 连接后超时时间
|
135
|
+
# ssl设置
|
136
|
+
# ssl: {
|
137
|
+
# :private_key_file => '/tmp/server.key',
|
138
|
+
# :cert_chain_file => '/tmp/server.crt',
|
139
|
+
# :verify_peer => false
|
140
|
+
# }
|
141
|
+
ssl: nil,
|
142
|
+
# bind: {
|
143
|
+
# :host => '123.123.123.123', # use a specific interface for outbound request
|
144
|
+
# :port => '123'
|
145
|
+
# }
|
146
|
+
bind: nil,
|
147
|
+
# 代理设置
|
148
|
+
# proxy: {
|
149
|
+
# :host => '127.0.0.1', # proxy address
|
150
|
+
# :port => 9000, # proxy port
|
151
|
+
# :type => :socks5 # default proxy mode is HTTP proxy, change to :socks5 if required
|
152
|
+
|
153
|
+
# :authorization => ['user', 'pass'] # proxy authorization header
|
154
|
+
# }
|
155
|
+
proxy: nil)
|
167
156
|
```
|
168
157
|
|
169
|
-
##
|
158
|
+
## 回调函数形式
|
170
159
|
|
171
160
|
```ruby
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
FileFilter.check_save_result(CustomConfig::DIR + '*', size_threshold: 300)
|
161
|
+
# 文件成功保存后调用,通过parse_method参数传入
|
162
|
+
def parse_eresponse(task_struct)
|
163
|
+
# ...
|
164
|
+
end
|
177
165
|
|
178
|
-
#
|
179
|
-
|
166
|
+
# http请求成功后调用,通过callback参数传入
|
167
|
+
def call_back(task_struct, http_req)
|
168
|
+
# http_req 是EventMachine::HttpRequest对象
|
169
|
+
# http_req.response_header.status
|
170
|
+
# ...
|
171
|
+
end
|
180
172
|
|
181
|
-
|
173
|
+
# http请求出错后调用,通过errback参数传入
|
174
|
+
def err_back(task_struct, http_req)
|
175
|
+
# ...
|
176
|
+
end
|
182
177
|
```
|
183
178
|
|
184
|
-
|
179
|
+
## License
|
185
180
|
|
186
181
|
(MIT License) - Copyright (c) 2016 Charles Zhang
|
data/lib/list_spider.rb
CHANGED
@@ -4,10 +4,16 @@ require 'nokogiri'
|
|
4
4
|
require 'fileutils'
|
5
5
|
require 'set'
|
6
6
|
require 'addressable/uri'
|
7
|
-
require File.expand_path('
|
8
|
-
require File.expand_path('
|
7
|
+
require File.expand_path('spider_helper', __dir__)
|
8
|
+
require File.expand_path('file_filter', __dir__)
|
9
9
|
|
10
|
+
# 爬取任务类
|
10
11
|
class TaskStruct
|
12
|
+
# * href 请求链接
|
13
|
+
# * local_path 保存数据的本地路径(此路径作为去重标准)
|
14
|
+
# * http_method http方法,取值::get, :head, :delete, :put, :post, :patch, :options
|
15
|
+
# * custom_data 自定义数据
|
16
|
+
# * parse_method 解析保存文件的回调,参数是TaskStruct对象本身
|
11
17
|
def initialize(href, # 请求链接
|
12
18
|
local_path, # 保存数据的本地路径(此路径作为去重标准)
|
13
19
|
# http方法,取值::get, :head, :delete, :put, :post, :patch, :options
|
@@ -16,9 +22,9 @@ class TaskStruct
|
|
16
22
|
parse_method: nil, # 解析保存文件的回调,参数是TaskStruct对象本身
|
17
23
|
# 请求成功后的回调,此时可能没有保存文件,比如301,404
|
18
24
|
# 参数是TaskStruct对象本身和对应的EventMachine::HttpRequest对象
|
19
|
-
#
|
20
|
-
#
|
21
|
-
#
|
25
|
+
# http_req.response_header.status 状态码
|
26
|
+
# http_req.response_header 返回头
|
27
|
+
# http_req.response 返回体
|
22
28
|
callback: nil,
|
23
29
|
# 请求失败后的回调
|
24
30
|
# 参数是TaskStruct对象本身和对应的EventMachine::HttpRequest对象
|
@@ -26,7 +32,7 @@ class TaskStruct
|
|
26
32
|
stream_callback: nil, # 流数据处理回调
|
27
33
|
convert_to_utf8: false, # 是否转换为utf8编码
|
28
34
|
overwrite_exist: false, # 是否覆盖现有文件
|
29
|
-
#
|
35
|
+
# 请求设置
|
30
36
|
redirects: 3, # 重定向次数
|
31
37
|
keepalive: nil, # (暂不支持复用)
|
32
38
|
file: nil, # 要上传的文件路径
|
@@ -34,7 +40,7 @@ class TaskStruct
|
|
34
40
|
query: nil, # 查询字符串,可以是string或hash类型
|
35
41
|
body: nil, # 请求体,可以是string或hash类型
|
36
42
|
head: nil, # 请求头
|
37
|
-
#
|
43
|
+
# 连接设置
|
38
44
|
connect_timeout: 60, # 连接超时时间
|
39
45
|
inactivity_timeout: nil, # 连接后超时时间
|
40
46
|
# ssl设置
|
@@ -112,6 +118,41 @@ module ListSpider
|
|
112
118
|
@local_path_set = Set.new
|
113
119
|
|
114
120
|
class << self
|
121
|
+
def get_list(down_list, interval: DEFAULT_INTERVAL, max: DEFAULT_CONCURRNET_MAX)
|
122
|
+
if interval.is_a? Range
|
123
|
+
@random_time_range = interval
|
124
|
+
interval = RANDOM_TIME
|
125
|
+
end
|
126
|
+
|
127
|
+
@down_list = filter_list(down_list)
|
128
|
+
@interval = interval
|
129
|
+
@max = max
|
130
|
+
@max = @down_list.size if @max == NO_LIMIT_CONCURRENT
|
131
|
+
@succeed_size = 0
|
132
|
+
@failed_size = 0
|
133
|
+
|
134
|
+
puts "total size:#{@down_list.size}"
|
135
|
+
event_machine_start_list(next_task, method(:complete))
|
136
|
+
end
|
137
|
+
|
138
|
+
def get_one(task, interval: DEFAULT_INTERVAL, max: DEFAULT_CONCURRNET_MAX)
|
139
|
+
get_list([task], interval: interval, max: max)
|
140
|
+
end
|
141
|
+
|
142
|
+
def add_task(task)
|
143
|
+
if task.is_a? Array
|
144
|
+
need_down_list = filter_list(task)
|
145
|
+
@down_list += need_down_list
|
146
|
+
elsif task.is_a?TaskStruct
|
147
|
+
need_down_list = filter_list([task])
|
148
|
+
@down_list += need_down_list
|
149
|
+
else
|
150
|
+
puts "error task type:#{task.class}"
|
151
|
+
end
|
152
|
+
end
|
153
|
+
|
154
|
+
private
|
155
|
+
|
115
156
|
def event_machine_down(link_struct_list, callback = nil)
|
116
157
|
failed_list = []
|
117
158
|
succeed_list = []
|
@@ -247,43 +288,6 @@ module ListSpider
|
|
247
288
|
end
|
248
289
|
need_down_list
|
249
290
|
end
|
250
|
-
|
251
|
-
def get_list(down_list, interval: DEFAULT_INTERVAL, max: DEFAULT_CONCURRNET_MAX)
|
252
|
-
if interval.is_a? Range
|
253
|
-
@random_time_range = interval
|
254
|
-
interval = RANDOM_TIME
|
255
|
-
end
|
256
|
-
|
257
|
-
@down_list = []
|
258
|
-
|
259
|
-
need_down_list = filter_list(down_list)
|
260
|
-
|
261
|
-
@down_list += need_down_list
|
262
|
-
@interval = interval
|
263
|
-
@max = max
|
264
|
-
@max = @down_list.size if @max == NO_LIMIT_CONCURRENT
|
265
|
-
@succeed_size = 0
|
266
|
-
@failed_size = 0
|
267
|
-
|
268
|
-
puts "total size:#{@down_list.size}"
|
269
|
-
event_machine_start_list(next_task, method(:complete))
|
270
|
-
end
|
271
|
-
|
272
|
-
def get_one(task, interval: DEFAULT_INTERVAL, max: DEFAULT_CONCURRNET_MAX)
|
273
|
-
get_list([task], interval: interval, max: max)
|
274
|
-
end
|
275
|
-
|
276
|
-
def add_task(task)
|
277
|
-
if task.is_a? Array
|
278
|
-
need_down_list = filter_list(task)
|
279
|
-
@down_list += need_down_list
|
280
|
-
elsif task.is_a?TaskStruct
|
281
|
-
need_down_list = filter_list([task])
|
282
|
-
@down_list += need_down_list
|
283
|
-
else
|
284
|
-
puts "error task type:#{task.class}"
|
285
|
-
end
|
286
|
-
end
|
287
291
|
end
|
288
292
|
|
289
293
|
Signal.trap('INT') do
|
data/lib/list_spider/version.rb
CHANGED
data/list_spider.gemspec
CHANGED
@@ -1,5 +1,5 @@
|
|
1
1
|
|
2
|
-
lib = File.expand_path('
|
2
|
+
lib = File.expand_path('lib', __dir__)
|
3
3
|
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
4
|
require 'list_spider/version'
|
5
5
|
|
@@ -26,6 +26,6 @@ Gem::Specification.new do |spec|
|
|
26
26
|
spec.add_development_dependency 'rake', '~> 10.0'
|
27
27
|
|
28
28
|
spec.add_dependency 'em-http-request', '~> 1.1', '>= 1.1.3'
|
29
|
-
spec.add_dependency 'nokogiri', '
|
29
|
+
spec.add_dependency 'nokogiri', '>= 1.8.5'
|
30
30
|
spec.add_dependency 'rchardet', '~> 1.6', '>= 1.6.1'
|
31
31
|
end
|
data/spider_example.rb
CHANGED
@@ -1,5 +1,5 @@
|
|
1
|
-
|
2
|
-
require File.expand_path('../lib/list_spider', __FILE__)
|
1
|
+
require 'list_spider'
|
2
|
+
# require File.expand_path('../lib/list_spider', __FILE__)
|
3
3
|
|
4
4
|
DOWNLOAD_DIR = 'coolshell/'.freeze
|
5
5
|
|
@@ -16,8 +16,6 @@ def parse_index_item(e)
|
|
16
16
|
end
|
17
17
|
end
|
18
18
|
|
19
|
-
# ListSpider.convert_to_utf8 = true
|
20
|
-
|
21
19
|
# get_one is a simple function for one taskstruct situation
|
22
20
|
ListSpider.get_one(
|
23
21
|
TaskStruct.new(
|
data/spider_example_2.rb
CHANGED
@@ -4,8 +4,8 @@ DOWNLOAD_DIR = 'coolshell/'.freeze
|
|
4
4
|
|
5
5
|
@next_list = []
|
6
6
|
|
7
|
-
def parse_index_item(
|
8
|
-
content = File.read(
|
7
|
+
def parse_index_item(e)
|
8
|
+
content = File.read(e.local_path)
|
9
9
|
doc = Nokogiri::HTML(content)
|
10
10
|
list_group = doc.css('h2.entry-title')
|
11
11
|
link_list = list_group.css('a')
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: list_spider
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 2.0
|
4
|
+
version: 2.1.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Charles Zhang
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2019-06-21 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -62,22 +62,16 @@ dependencies:
|
|
62
62
|
name: nokogiri
|
63
63
|
requirement: !ruby/object:Gem::Requirement
|
64
64
|
requirements:
|
65
|
-
- - "~>"
|
66
|
-
- !ruby/object:Gem::Version
|
67
|
-
version: '1.6'
|
68
65
|
- - ">="
|
69
66
|
- !ruby/object:Gem::Version
|
70
|
-
version: 1.
|
67
|
+
version: 1.8.5
|
71
68
|
type: :runtime
|
72
69
|
prerelease: false
|
73
70
|
version_requirements: !ruby/object:Gem::Requirement
|
74
71
|
requirements:
|
75
|
-
- - "~>"
|
76
|
-
- !ruby/object:Gem::Version
|
77
|
-
version: '1.6'
|
78
72
|
- - ">="
|
79
73
|
- !ruby/object:Gem::Version
|
80
|
-
version: 1.
|
74
|
+
version: 1.8.5
|
81
75
|
- !ruby/object:Gem::Dependency
|
82
76
|
name: rchardet
|
83
77
|
requirement: !ruby/object:Gem::Requirement
|
@@ -106,7 +100,9 @@ extensions: []
|
|
106
100
|
extra_rdoc_files: []
|
107
101
|
files:
|
108
102
|
- ".gitignore"
|
103
|
+
- ".rdoc_options"
|
109
104
|
- ".rubocop.yml"
|
105
|
+
- English_README.md
|
110
106
|
- Gemfile
|
111
107
|
- Gemfile.lock
|
112
108
|
- README.md
|
@@ -140,8 +136,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
140
136
|
- !ruby/object:Gem::Version
|
141
137
|
version: '0'
|
142
138
|
requirements: []
|
143
|
-
|
144
|
-
rubygems_version: 2.7.3
|
139
|
+
rubygems_version: 3.0.1
|
145
140
|
signing_key:
|
146
141
|
specification_version: 4
|
147
142
|
summary: List Spider
|