list_spider 2.0.2 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rdoc_options +23 -0
- data/.rubocop.yml +2 -2
- data/English_README.md +169 -0
- data/Gemfile.lock +1 -1
- data/README.md +124 -129
- data/lib/list_spider.rb +48 -44
- data/lib/list_spider/version.rb +1 -1
- data/list_spider.gemspec +2 -2
- data/spider_example.rb +2 -4
- data/spider_example_2.rb +2 -2
- metadata +7 -12
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 2af55a6c3577dc734aa7ee545cef217059abfc7be4724eaac9cf94126b869b0e
|
4
|
+
data.tar.gz: 48e8f116b91e36613b05958f173a1bc168c0c6daa163fd137266515c3a19c2b7
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 778ae0918059fd2edea3a02081cf479d054521f216afa789f4d9131708b8339486f39b9f7603e303de91f4c03b1bb7ebf30e6b45ac0921fe0c29640743df9e5d
|
7
|
+
data.tar.gz: bcfc6df857085630faf802f3cff9d21653c2d8ced9b2595a3bc92a8093d8883cd470132b770801bc2e4977ad17e5f82aea726a2ad600ab5d1560150dede7c20f
|
data/.rdoc_options
ADDED
@@ -0,0 +1,23 @@
|
|
1
|
+
--- !ruby/object:RDoc::Options
|
2
|
+
encoding: UTF-8
|
3
|
+
static_path: []
|
4
|
+
rdoc_include:
|
5
|
+
- "."
|
6
|
+
- "/Users/zhangchao/github/list_spider"
|
7
|
+
charset: UTF-8
|
8
|
+
exclude:
|
9
|
+
hyperlink_all: false
|
10
|
+
line_numbers: false
|
11
|
+
locale:
|
12
|
+
locale_dir: locale
|
13
|
+
locale_name:
|
14
|
+
main_page:
|
15
|
+
markup: markdown
|
16
|
+
output_decoration: true
|
17
|
+
page_dir:
|
18
|
+
show_hash: false
|
19
|
+
tab_width: 8
|
20
|
+
template_stylesheets: []
|
21
|
+
title:
|
22
|
+
visibility: :protected
|
23
|
+
webcvs:
|
data/.rubocop.yml
CHANGED
@@ -18,9 +18,9 @@ Style/Documentation:
|
|
18
18
|
Enabled: false
|
19
19
|
Lint/AmbiguousRegexpLiteral:
|
20
20
|
Enabled: false
|
21
|
-
|
21
|
+
Layout/DefEndAlignment:
|
22
22
|
AutoCorrect: true
|
23
|
-
|
23
|
+
Layout/EndAlignment:
|
24
24
|
AutoCorrect: true
|
25
25
|
Style/BracesAroundHashParameters:
|
26
26
|
Enabled: false
|
data/English_README.md
ADDED
@@ -0,0 +1,169 @@
|
|
1
|
+
# list_spider
|
2
|
+
|
3
|
+
A url list spider based on em-http-request.
|
4
|
+
|
5
|
+
Often we only need to crawl a list of URLs, parse the results to extract more URLs, and crawl again. list_spider is built for exactly this purpose.
|
6
|
+
|
7
|
+
## Features
|
8
|
+
* Duplicate URL filtering (based on the local path, so you can customize the behavior).
|
9
|
+
|
10
|
+
* Convert to UTF-8 support.
|
11
|
+
|
12
|
+
* Incremental crawling support (files that already exist are not fetched again).
|
13
|
+
|
14
|
+
* Customizable maximum concurrency and interval between tasks.
|
15
|
+
|
16
|
+
* HTTP options support.
|
17
|
+
|
18
|
+
## Getting started
|
19
|
+
|
20
|
+
```ruby
|
21
|
+
gem install list_spider
|
22
|
+
```
|
23
|
+
|
24
|
+
Or add it to your Gemfile
|
25
|
+
|
26
|
+
```ruby
|
27
|
+
gem 'list_spider'
|
28
|
+
```
|
29
|
+
|
30
|
+
## Use like this
|
31
|
+
```ruby
|
32
|
+
require 'list_spider'
|
33
|
+
|
34
|
+
DOWNLOAD_DIR = 'coolshell/'.freeze
|
35
|
+
|
36
|
+
@next_list = []
|
37
|
+
|
38
|
+
def parse_index_item(e)
|
39
|
+
content = File.read(e.local_path)
|
40
|
+
doc = Nokogiri::HTML(content)
|
41
|
+
list_group = doc.css('h2.entry-title')
|
42
|
+
link_list = list_group.css('a')
|
43
|
+
|
44
|
+
link_list.each do |link|
|
45
|
+
href = link['href']
|
46
|
+
local_path = DOWNLOAD_DIR + link.content + '.html'
|
47
|
+
# or you can save them to database for later use
|
48
|
+
@next_list << TaskStruct.new(href, local_path)
|
49
|
+
end
|
50
|
+
end
|
51
|
+
|
52
|
+
task_list = []
|
53
|
+
task_list << TaskStruct.new(
|
54
|
+
'https://coolshell.cn/',
|
55
|
+
DOWNLOAD_DIR + 'index.html',
|
56
|
+
parse_method: method(:parse_index_item)
|
57
|
+
)
|
58
|
+
|
59
|
+
ListSpider.get_list(task_list)
|
60
|
+
ListSpider.get_list(@next_list, max: 60)
|
61
|
+
```
|
62
|
+
|
63
|
+
## Or in one step
|
64
|
+
```ruby
|
65
|
+
require 'list_spider'
|
66
|
+
|
67
|
+
DOWNLOAD_DIR = 'coolshell/'.freeze
|
68
|
+
|
69
|
+
def parse_index_item(e)
|
70
|
+
content = File.read(e.local_path)
|
71
|
+
doc = Nokogiri::HTML(content)
|
72
|
+
list_group = doc.css('h2.entry-title')
|
73
|
+
link_list = list_group.css('a')
|
74
|
+
|
75
|
+
link_list.each do |link|
|
76
|
+
href = link['href']
|
77
|
+
local_path = DOWNLOAD_DIR + link.content + '.html'
|
78
|
+
ListSpider.add_task(TaskStruct.new(href, local_path))
|
79
|
+
end
|
80
|
+
end
|
81
|
+
|
82
|
+
# get_one is a convenience wrapper around get_list for the single-task case
|
83
|
+
ListSpider.get_one(
|
84
|
+
TaskStruct.new(
|
85
|
+
'https://coolshell.cn/',
|
86
|
+
DOWNLOAD_DIR + 'index.html',
|
87
|
+
parse_method: method(:parse_index_item)
|
88
|
+
),
|
89
|
+
max: 60
|
90
|
+
)
|
91
|
+
```
|
92
|
+
|
93
|
+
## And there are many options you can use
|
94
|
+
|
95
|
+
```ruby
|
96
|
+
def initialize(href, # 请求链接
|
97
|
+
local_path, # 保存数据的本地路径(此路径作为去重标准)
|
98
|
+
# http方法,取值::get, :head, :delete, :put, :post, :patch, :options
|
99
|
+
http_method: :get,
|
100
|
+
custom_data: nil, # 自定义数据
|
101
|
+
parse_method: nil, # 解析保存文件的回调,参数是TaskStruct对象本身
|
102
|
+
# 请求成功后的回调,此时可能没有保存文件,比如301,404
|
103
|
+
# 参数是TaskStruct对象本身和对应的EventMachine::HttpRequest对象
|
104
|
+
# http.response_header.status 状态码
|
105
|
+
# http.response_header 返回头
|
106
|
+
# http.response 返回体
|
107
|
+
callback: nil,
|
108
|
+
# 请求失败后的回调
|
109
|
+
# 参数是TaskStruct对象本身和对应的EventMachine::HttpRequest对象
|
110
|
+
errback: nil,
|
111
|
+
stream_callback: nil, # 流数据处理回调
|
112
|
+
convert_to_utf8: false, # 是否转换为utf8编码
|
113
|
+
overwrite_exist: false, # 是否覆盖现有文件
|
114
|
+
# request options
|
115
|
+
redirects: 3, # 重定向次数
|
116
|
+
keepalive: nil, # (暂不支持复用)
|
117
|
+
file: nil, # 要上传的文件路径
|
118
|
+
path: nil, # 请求路径,在流水线方式请求时有用(暂不支持)
|
119
|
+
query: nil, # 查询字符串,可以是string或hash类型
|
120
|
+
body: nil, # 请求体,可以是string或hash类型
|
121
|
+
head: nil, # 请求头
|
122
|
+
# connection options
|
123
|
+
connect_timeout: 60, # 连接超时时间
|
124
|
+
inactivity_timeout: nil, # 连接后超时时间
|
125
|
+
# ssl设置
|
126
|
+
# ssl: {
|
127
|
+
# :private_key_file => '/tmp/server.key',
|
128
|
+
# :cert_chain_file => '/tmp/server.crt',
|
129
|
+
# :verify_peer => false
|
130
|
+
# }
|
131
|
+
ssl: nil,
|
132
|
+
# bind: {
|
133
|
+
# :host => '123.123.123.123', # use a specific interface for outbound request
|
134
|
+
# :port => '123'
|
135
|
+
# }
|
136
|
+
bind: nil,
|
137
|
+
# 代理设置
|
138
|
+
# proxy: {
|
139
|
+
# :host => '127.0.0.1', # proxy address
|
140
|
+
# :port => 9000, # proxy port
|
141
|
+
# :type => :socks5 # default proxy mode is HTTP proxy, change to :socks5 if required
|
142
|
+
|
143
|
+
# :authorization => ['user', 'pass'] # proxy authorization header
|
144
|
+
# }
|
145
|
+
proxy: nil)
|
146
|
+
```
|
147
|
+
|
148
|
+
## Callback method signatures
|
149
|
+
|
150
|
+
```ruby
|
151
|
+
# called when the file is saved successfully
|
152
|
+
def parse_eresponse(task_struct)
|
153
|
+
# ...
|
154
|
+
end
|
155
|
+
|
156
|
+
def call_back(task_struct, http_req)
|
157
|
+
# http_req is an EventMachine::HttpRequest object
|
158
|
+
# http_req.response_header.status
|
159
|
+
# ...
|
160
|
+
end
|
161
|
+
|
162
|
+
def err_back(task_struct, http_req)
|
163
|
+
# ...
|
164
|
+
end
|
165
|
+
```
|
166
|
+
|
167
|
+
### License
|
168
|
+
|
169
|
+
(MIT License) - Copyright (c) 2016 Charles Zhang
|
data/Gemfile.lock
CHANGED
data/README.md
CHANGED
@@ -1,186 +1,181 @@
|
|
1
|
-
# list_spider
|
1
|
+
# 关于list_spider
|
2
2
|
|
3
|
-
|
3
|
+
list_spider是一个基于[em-http-request](https://github.com/igrigorik/em-http-request)的爬虫工具。
|
4
4
|
|
5
|
-
|
5
|
+
许多情况下,爬虫的工作是爬取链接,解析返回数据,从中提取链接,继续爬取,list_spider就是适用这种场景的爬虫工具。
|
6
6
|
|
7
|
-
##
|
8
|
-
*
|
7
|
+
## 功能特点
|
8
|
+
* 去重过滤 (使用本地文件路径做唯一性校验)。
|
9
9
|
|
10
|
-
*
|
10
|
+
* 支持UTF-8编码转换。
|
11
11
|
|
12
|
-
*
|
12
|
+
* 默认增量爬取,已爬取的不再重复爬取(可以通过选项强制重新获取)。
|
13
13
|
|
14
|
-
*
|
14
|
+
* 自由设置最大并发数和爬取任务间隔时间。
|
15
15
|
|
16
|
-
*
|
16
|
+
* 支持http所有选项设置。
|
17
17
|
|
18
|
-
##
|
18
|
+
## 开始
|
19
19
|
|
20
|
-
|
20
|
+
```ruby
|
21
|
+
gem install list_spider
|
22
|
+
```
|
23
|
+
|
24
|
+
或者添加到Gemfile
|
25
|
+
|
26
|
+
```ruby
|
27
|
+
gem 'list_spider'
|
28
|
+
```
|
21
29
|
|
22
|
-
##
|
30
|
+
## 使用方法
|
23
31
|
```ruby
|
24
32
|
require 'list_spider'
|
25
33
|
|
26
|
-
DOWNLOAD_DIR = 'coolshell/'
|
34
|
+
DOWNLOAD_DIR = 'coolshell/'.freeze
|
27
35
|
|
28
|
-
|
36
|
+
@next_list = []
|
29
37
|
|
30
|
-
def parse_index_item(
|
31
|
-
content = File.read(
|
38
|
+
def parse_index_item(e)
|
39
|
+
content = File.read(e.local_path)
|
32
40
|
doc = Nokogiri::HTML(content)
|
33
|
-
list_group = doc.css(
|
34
|
-
link_list = list_group.css(
|
41
|
+
list_group = doc.css('h2.entry-title')
|
42
|
+
link_list = list_group.css('a')
|
35
43
|
|
36
44
|
link_list.each do |link|
|
37
45
|
href = link['href']
|
38
|
-
local_path = DOWNLOAD_DIR + link.content +
|
39
|
-
#
|
40
|
-
|
46
|
+
local_path = DOWNLOAD_DIR + link.content + '.html'
|
47
|
+
# 可以存入数据库后续处理
|
48
|
+
@next_list << TaskStruct.new(href, local_path)
|
41
49
|
end
|
42
50
|
end
|
43
51
|
|
44
52
|
task_list = []
|
45
|
-
task_list << TaskStruct.new(
|
53
|
+
task_list << TaskStruct.new(
|
54
|
+
'https://coolshell.cn/',
|
55
|
+
DOWNLOAD_DIR + 'index.html',
|
56
|
+
parse_method: method(:parse_index_item)
|
57
|
+
)
|
46
58
|
|
47
59
|
ListSpider.get_list(task_list)
|
48
|
-
ListSpider.get_list(
|
49
|
-
|
60
|
+
ListSpider.get_list(@next_list, max: 60)
|
50
61
|
```
|
51
62
|
|
52
|
-
##
|
63
|
+
## 或者使用更简单的一步完成
|
53
64
|
```ruby
|
54
65
|
require 'list_spider'
|
55
66
|
|
56
|
-
DOWNLOAD_DIR = 'coolshell/'
|
67
|
+
DOWNLOAD_DIR = 'coolshell/'.freeze
|
57
68
|
|
58
|
-
def parse_index_item(
|
59
|
-
|
60
|
-
content = File.read(file_name)
|
69
|
+
def parse_index_item(e)
|
70
|
+
content = File.read(e.local_path)
|
61
71
|
doc = Nokogiri::HTML(content)
|
62
|
-
list_group = doc.css(
|
63
|
-
link_list = list_group.css(
|
72
|
+
list_group = doc.css('h2.entry-title')
|
73
|
+
link_list = list_group.css('a')
|
64
74
|
|
65
75
|
link_list.each do |link|
|
66
76
|
href = link['href']
|
67
|
-
local_path = DOWNLOAD_DIR + link.content +
|
77
|
+
local_path = DOWNLOAD_DIR + link.content + '.html'
|
68
78
|
ListSpider.add_task(TaskStruct.new(href, local_path))
|
69
79
|
end
|
70
80
|
end
|
71
81
|
|
72
|
-
#get_one
|
73
|
-
ListSpider.get_one(
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
82
|
+
# get_one是封装了get_list的简化形式,方便一个任务时调用
|
83
|
+
ListSpider.get_one(
|
84
|
+
TaskStruct.new(
|
85
|
+
'https://coolshell.cn/',
|
86
|
+
DOWNLOAD_DIR + 'index.html',
|
87
|
+
parse_method: method(:parse_index_item)
|
88
|
+
),
|
89
|
+
max: 60
|
90
|
+
)
|
79
91
|
```
|
80
92
|
|
81
|
-
##
|
82
|
-
|
83
|
-
```ruby
|
84
|
-
def parse_response(file_name)
|
85
|
-
#...
|
86
|
-
end
|
87
|
-
|
88
|
-
|
89
|
-
# custom_data is passed by TaskStruct's custom_data param
|
90
|
-
|
91
|
-
def parse_response(file_name, custom_data)
|
92
|
-
#...
|
93
|
-
end
|
94
|
-
|
95
|
-
|
96
|
-
# response_header is a EventMachine::HttpResponseHeader object
|
97
|
-
# you can use it like this:
|
98
|
-
# response_header.status
|
99
|
-
# response_header.cookie
|
100
|
-
# response_header['Last-Modified']
|
101
|
-
|
102
|
-
def parse_response(file_name, custom_data, response_header)
|
103
|
-
response_header.status
|
104
|
-
response_header['Last-Modified']
|
105
|
-
|
106
|
-
#...
|
107
|
-
end
|
108
|
-
|
109
|
-
# req is a EventMachine::HttpClientOptions object
|
110
|
-
# you can use it like this:
|
111
|
-
# req.body
|
112
|
-
# req.headers
|
113
|
-
# req.uri
|
114
|
-
# req.host
|
115
|
-
# req.port
|
116
|
-
def parse_response(file_name, custom_data, response_header, req)
|
117
|
-
puts req.body
|
118
|
-
puts req.headers
|
119
|
-
puts req.uri
|
120
|
-
puts req.host
|
121
|
-
puts req.port
|
122
|
-
|
123
|
-
#...
|
124
|
-
end
|
125
|
-
|
126
|
-
```
|
127
|
-
|
128
|
-
## And there are many options you can use
|
129
|
-
|
130
|
-
```ruby
|
131
|
-
TaskStruct.new(href, local_path, http_method: :get, params: {}, custom_data: nil, parse_method: nil, header: nil)
|
93
|
+
## get_list/get_one参数
|
132
94
|
```
|
95
|
+
# down_list: 要请求的TaskStruct数组
|
96
|
+
# interval: 任务间隔,默认为0。若参数为Range对象,则随机间隔Range范围内的秒数。若设为RANDOM_TIME则随机间隔3到10秒。
|
97
|
+
# max: 最大并发数,默认为50。若设为NO_LIMIT_CONCURRENT,则所有请求任务全部一起并发执行
|
133
98
|
|
134
|
-
|
135
|
-
|
136
|
-
ListSpider.get_list(down_list, interval: 0, max: ListSpider::NO_LIMIT_CONCURRENT)
|
137
|
-
|
138
|
-
#sleep random time, often used in site which limit spider
|
139
|
-
ListSpider.get_list(down_list, interval: ListSpider::RANDOM_TIME, max: 1)
|
140
|
-
|
141
|
-
#set random time range
|
142
|
-
ListSpider.get_list(down_list, interval: (1..10), max: 1)
|
143
|
-
|
99
|
+
get_list(down_list, interval: DEFAULT_INTERVAL, max: DEFAULT_CONCURRNET_MAX)
|
100
|
+
get_one(task, interval: DEFAULT_INTERVAL, max: DEFAULT_CONCURRNET_MAX)
|
144
101
|
```
|
145
102
|
|
146
|
-
|
103
|
+
## 下面是TaskStruct可以设置的选项,与[em-http-request](https://github.com/igrigorik/em-http-request)基本一致
|
147
104
|
|
148
105
|
```ruby
|
149
|
-
#
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
#
|
156
|
-
|
157
|
-
|
158
|
-
#
|
159
|
-
|
160
|
-
|
161
|
-
#
|
162
|
-
|
163
|
-
|
164
|
-
#
|
165
|
-
|
166
|
-
|
106
|
+
new(href, # 请求链接
|
107
|
+
local_path, # 保存数据的本地路径(此路径作为去重标准)
|
108
|
+
# http方法,取值::get, :head, :delete, :put, :post, :patch, :options
|
109
|
+
http_method: :get,
|
110
|
+
custom_data: nil, # 自定义数据
|
111
|
+
parse_method: nil, # 解析保存文件的回调,参数是TaskStruct对象本身
|
112
|
+
# 请求成功后的回调,此时可能没有保存文件,比如301,404
|
113
|
+
# 参数是TaskStruct对象本身和对应的EventMachine::HttpRequest对象
|
114
|
+
# http_req.response_header.status 状态码
|
115
|
+
# http_req.response_header 返回头
|
116
|
+
# http_req.response 返回体
|
117
|
+
callback: nil,
|
118
|
+
# 请求失败后的回调
|
119
|
+
# 参数是TaskStruct对象本身和对应的EventMachine::HttpRequest对象
|
120
|
+
errback: nil,
|
121
|
+
stream_callback: nil, # 流数据处理回调
|
122
|
+
convert_to_utf8: false, # 是否转换为utf8编码
|
123
|
+
overwrite_exist: false, # 是否覆盖现有文件
|
124
|
+
# 请求设置
|
125
|
+
redirects: 3, # 重定向次数
|
126
|
+
keepalive: nil, # (暂不支持复用)
|
127
|
+
file: nil, # 要上传的文件路径
|
128
|
+
path: nil, # 请求路径,在流水线方式请求时有用(暂不支持)
|
129
|
+
query: nil, # 查询字符串,可以是string或hash类型
|
130
|
+
body: nil, # 请求体,可以是string或hash类型
|
131
|
+
head: nil, # 请求头
|
132
|
+
# 连接设置
|
133
|
+
connect_timeout: 60, # 连接超时时间
|
134
|
+
inactivity_timeout: nil, # 连接后超时时间
|
135
|
+
# ssl设置
|
136
|
+
# ssl: {
|
137
|
+
# :private_key_file => '/tmp/server.key',
|
138
|
+
# :cert_chain_file => '/tmp/server.crt',
|
139
|
+
# :verify_peer => false
|
140
|
+
# }
|
141
|
+
ssl: nil,
|
142
|
+
# bind: {
|
143
|
+
# :host => '123.123.123.123', # use a specific interface for outbound request
|
144
|
+
# :port => '123'
|
145
|
+
# }
|
146
|
+
bind: nil,
|
147
|
+
# 代理设置
|
148
|
+
# proxy: {
|
149
|
+
# :host => '127.0.0.1', # proxy address
|
150
|
+
# :port => 9000, # proxy port
|
151
|
+
# :type => :socks5 # default proxy mode is HTTP proxy, change to :socks5 if required
|
152
|
+
|
153
|
+
# :authorization => ['user', 'pass'] # proxy authorization header
|
154
|
+
# }
|
155
|
+
proxy: nil)
|
167
156
|
```
|
168
157
|
|
169
|
-
##
|
158
|
+
## 回调函数形式
|
170
159
|
|
171
160
|
```ruby
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
FileFilter.check_save_result(CustomConfig::DIR + '*', size_threshold: 300)
|
161
|
+
# 文件成功保存后调用,通过parse_method参数传入
|
162
|
+
def parse_eresponse(task_struct)
|
163
|
+
# ...
|
164
|
+
end
|
177
165
|
|
178
|
-
#
|
179
|
-
|
166
|
+
# http请求成功后调用,通过callback参数传入
|
167
|
+
def call_back(task_struct, http_req)
|
168
|
+
# http_req 是EventMachine::HttpRequest对象
|
169
|
+
# http_req.response_header.status
|
170
|
+
# ...
|
171
|
+
end
|
180
172
|
|
181
|
-
|
173
|
+
# http请求出错后调用,通过errback参数传入
|
174
|
+
def err_back(task_struct, http_req)
|
175
|
+
# ...
|
176
|
+
end
|
182
177
|
```
|
183
178
|
|
184
|
-
|
179
|
+
## License
|
185
180
|
|
186
181
|
(MIT License) - Copyright (c) 2016 Charles Zhang
|
data/lib/list_spider.rb
CHANGED
@@ -4,10 +4,16 @@ require 'nokogiri'
|
|
4
4
|
require 'fileutils'
|
5
5
|
require 'set'
|
6
6
|
require 'addressable/uri'
|
7
|
-
require File.expand_path('
|
8
|
-
require File.expand_path('
|
7
|
+
require File.expand_path('spider_helper', __dir__)
|
8
|
+
require File.expand_path('file_filter', __dir__)
|
9
9
|
|
10
|
+
# 爬取任务类
|
10
11
|
class TaskStruct
|
12
|
+
# * href 请求链接
|
13
|
+
# * local_path 保存数据的本地路径(此路径作为去重标准)
|
14
|
+
# * http_method http方法,取值::get, :head, :delete, :put, :post, :patch, :options
|
15
|
+
# * custom_data 自定义数据
|
16
|
+
# * parse_method 解析保存文件的回调,参数是TaskStruct对象本身
|
11
17
|
def initialize(href, # 请求链接
|
12
18
|
local_path, # 保存数据的本地路径(此路径作为去重标准)
|
13
19
|
# http方法,取值::get, :head, :delete, :put, :post, :patch, :options
|
@@ -16,9 +22,9 @@ class TaskStruct
|
|
16
22
|
parse_method: nil, # 解析保存文件的回调,参数是TaskStruct对象本身
|
17
23
|
# 请求成功后的回调,此时可能没有保存文件,比如301,404
|
18
24
|
# 参数是TaskStruct对象本身和对应的EventMachine::HttpRequest对象
|
19
|
-
#
|
20
|
-
#
|
21
|
-
#
|
25
|
+
# http_req.response_header.status 状态码
|
26
|
+
# http_req.response_header 返回头
|
27
|
+
# http_req.response 返回体
|
22
28
|
callback: nil,
|
23
29
|
# 请求失败后的回调
|
24
30
|
# 参数是TaskStruct对象本身和对应的EventMachine::HttpRequest对象
|
@@ -26,7 +32,7 @@ class TaskStruct
|
|
26
32
|
stream_callback: nil, # 流数据处理回调
|
27
33
|
convert_to_utf8: false, # 是否转换为utf8编码
|
28
34
|
overwrite_exist: false, # 是否覆盖现有文件
|
29
|
-
#
|
35
|
+
# 请求设置
|
30
36
|
redirects: 3, # 重定向次数
|
31
37
|
keepalive: nil, # (暂不支持复用)
|
32
38
|
file: nil, # 要上传的文件路径
|
@@ -34,7 +40,7 @@ class TaskStruct
|
|
34
40
|
query: nil, # 查询字符串,可以是string或hash类型
|
35
41
|
body: nil, # 请求体,可以是string或hash类型
|
36
42
|
head: nil, # 请求头
|
37
|
-
#
|
43
|
+
# 连接设置
|
38
44
|
connect_timeout: 60, # 连接超时时间
|
39
45
|
inactivity_timeout: nil, # 连接后超时时间
|
40
46
|
# ssl设置
|
@@ -112,6 +118,41 @@ module ListSpider
|
|
112
118
|
@local_path_set = Set.new
|
113
119
|
|
114
120
|
class << self
|
121
|
+
def get_list(down_list, interval: DEFAULT_INTERVAL, max: DEFAULT_CONCURRNET_MAX)
|
122
|
+
if interval.is_a? Range
|
123
|
+
@random_time_range = interval
|
124
|
+
interval = RANDOM_TIME
|
125
|
+
end
|
126
|
+
|
127
|
+
@down_list = filter_list(down_list)
|
128
|
+
@interval = interval
|
129
|
+
@max = max
|
130
|
+
@max = @down_list.size if @max == NO_LIMIT_CONCURRENT
|
131
|
+
@succeed_size = 0
|
132
|
+
@failed_size = 0
|
133
|
+
|
134
|
+
puts "total size:#{@down_list.size}"
|
135
|
+
event_machine_start_list(next_task, method(:complete))
|
136
|
+
end
|
137
|
+
|
138
|
+
def get_one(task, interval: DEFAULT_INTERVAL, max: DEFAULT_CONCURRNET_MAX)
|
139
|
+
get_list([task], interval: interval, max: max)
|
140
|
+
end
|
141
|
+
|
142
|
+
def add_task(task)
|
143
|
+
if task.is_a? Array
|
144
|
+
need_down_list = filter_list(task)
|
145
|
+
@down_list += need_down_list
|
146
|
+
elsif task.is_a?TaskStruct
|
147
|
+
need_down_list = filter_list([task])
|
148
|
+
@down_list += need_down_list
|
149
|
+
else
|
150
|
+
puts "error task type:#{task.class}"
|
151
|
+
end
|
152
|
+
end
|
153
|
+
|
154
|
+
private
|
155
|
+
|
115
156
|
def event_machine_down(link_struct_list, callback = nil)
|
116
157
|
failed_list = []
|
117
158
|
succeed_list = []
|
@@ -247,43 +288,6 @@ module ListSpider
|
|
247
288
|
end
|
248
289
|
need_down_list
|
249
290
|
end
|
250
|
-
|
251
|
-
def get_list(down_list, interval: DEFAULT_INTERVAL, max: DEFAULT_CONCURRNET_MAX)
|
252
|
-
if interval.is_a? Range
|
253
|
-
@random_time_range = interval
|
254
|
-
interval = RANDOM_TIME
|
255
|
-
end
|
256
|
-
|
257
|
-
@down_list = []
|
258
|
-
|
259
|
-
need_down_list = filter_list(down_list)
|
260
|
-
|
261
|
-
@down_list += need_down_list
|
262
|
-
@interval = interval
|
263
|
-
@max = max
|
264
|
-
@max = @down_list.size if @max == NO_LIMIT_CONCURRENT
|
265
|
-
@succeed_size = 0
|
266
|
-
@failed_size = 0
|
267
|
-
|
268
|
-
puts "total size:#{@down_list.size}"
|
269
|
-
event_machine_start_list(next_task, method(:complete))
|
270
|
-
end
|
271
|
-
|
272
|
-
def get_one(task, interval: DEFAULT_INTERVAL, max: DEFAULT_CONCURRNET_MAX)
|
273
|
-
get_list([task], interval: interval, max: max)
|
274
|
-
end
|
275
|
-
|
276
|
-
def add_task(task)
|
277
|
-
if task.is_a? Array
|
278
|
-
need_down_list = filter_list(task)
|
279
|
-
@down_list += need_down_list
|
280
|
-
elsif task.is_a?TaskStruct
|
281
|
-
need_down_list = filter_list([task])
|
282
|
-
@down_list += need_down_list
|
283
|
-
else
|
284
|
-
puts "error task type:#{task.class}"
|
285
|
-
end
|
286
|
-
end
|
287
291
|
end
|
288
292
|
|
289
293
|
Signal.trap('INT') do
|
data/lib/list_spider/version.rb
CHANGED
data/list_spider.gemspec
CHANGED
@@ -1,5 +1,5 @@
|
|
1
1
|
|
2
|
-
lib = File.expand_path('
|
2
|
+
lib = File.expand_path('lib', __dir__)
|
3
3
|
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
4
|
require 'list_spider/version'
|
5
5
|
|
@@ -26,6 +26,6 @@ Gem::Specification.new do |spec|
|
|
26
26
|
spec.add_development_dependency 'rake', '~> 10.0'
|
27
27
|
|
28
28
|
spec.add_dependency 'em-http-request', '~> 1.1', '>= 1.1.3'
|
29
|
-
spec.add_dependency 'nokogiri', '
|
29
|
+
spec.add_dependency 'nokogiri', '>= 1.8.5'
|
30
30
|
spec.add_dependency 'rchardet', '~> 1.6', '>= 1.6.1'
|
31
31
|
end
|
data/spider_example.rb
CHANGED
@@ -1,5 +1,5 @@
|
|
1
|
-
|
2
|
-
require File.expand_path('../lib/list_spider', __FILE__)
|
1
|
+
require 'list_spider'
|
2
|
+
# require File.expand_path('../lib/list_spider', __FILE__)
|
3
3
|
|
4
4
|
DOWNLOAD_DIR = 'coolshell/'.freeze
|
5
5
|
|
@@ -16,8 +16,6 @@ def parse_index_item(e)
|
|
16
16
|
end
|
17
17
|
end
|
18
18
|
|
19
|
-
# ListSpider.convert_to_utf8 = true
|
20
|
-
|
21
19
|
# get_one is a simple function for one taskstruct situation
|
22
20
|
ListSpider.get_one(
|
23
21
|
TaskStruct.new(
|
data/spider_example_2.rb
CHANGED
@@ -4,8 +4,8 @@ DOWNLOAD_DIR = 'coolshell/'.freeze
|
|
4
4
|
|
5
5
|
@next_list = []
|
6
6
|
|
7
|
-
def parse_index_item(
|
8
|
-
content = File.read(
|
7
|
+
def parse_index_item(e)
|
8
|
+
content = File.read(e.local_path)
|
9
9
|
doc = Nokogiri::HTML(content)
|
10
10
|
list_group = doc.css('h2.entry-title')
|
11
11
|
link_list = list_group.css('a')
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: list_spider
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 2.0
|
4
|
+
version: 2.1.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Charles Zhang
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2019-06-21 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -62,22 +62,16 @@ dependencies:
|
|
62
62
|
name: nokogiri
|
63
63
|
requirement: !ruby/object:Gem::Requirement
|
64
64
|
requirements:
|
65
|
-
- - "~>"
|
66
|
-
- !ruby/object:Gem::Version
|
67
|
-
version: '1.6'
|
68
65
|
- - ">="
|
69
66
|
- !ruby/object:Gem::Version
|
70
|
-
version: 1.
|
67
|
+
version: 1.8.5
|
71
68
|
type: :runtime
|
72
69
|
prerelease: false
|
73
70
|
version_requirements: !ruby/object:Gem::Requirement
|
74
71
|
requirements:
|
75
|
-
- - "~>"
|
76
|
-
- !ruby/object:Gem::Version
|
77
|
-
version: '1.6'
|
78
72
|
- - ">="
|
79
73
|
- !ruby/object:Gem::Version
|
80
|
-
version: 1.
|
74
|
+
version: 1.8.5
|
81
75
|
- !ruby/object:Gem::Dependency
|
82
76
|
name: rchardet
|
83
77
|
requirement: !ruby/object:Gem::Requirement
|
@@ -106,7 +100,9 @@ extensions: []
|
|
106
100
|
extra_rdoc_files: []
|
107
101
|
files:
|
108
102
|
- ".gitignore"
|
103
|
+
- ".rdoc_options"
|
109
104
|
- ".rubocop.yml"
|
105
|
+
- English_README.md
|
110
106
|
- Gemfile
|
111
107
|
- Gemfile.lock
|
112
108
|
- README.md
|
@@ -140,8 +136,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
140
136
|
- !ruby/object:Gem::Version
|
141
137
|
version: '0'
|
142
138
|
requirements: []
|
143
|
-
|
144
|
-
rubygems_version: 2.7.3
|
139
|
+
rubygems_version: 3.0.1
|
145
140
|
signing_key:
|
146
141
|
specification_version: 4
|
147
142
|
summary: List Spider
|