list_spider 2.3.0 → 2.4.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +84 -84
- data/.rdoc_options +23 -23
- data/.rubocop.yml +48 -48
- data/English_README.md +169 -169
- data/Gemfile +6 -6
- data/README.md +181 -181
- data/Rakefile +2 -2
- data/bin/console +14 -14
- data/bin/setup +8 -8
- data/check_code.sh +2 -2
- data/lib/file_filter.rb +72 -72
- data/lib/list_spider.rb +298 -297
- data/lib/list_spider/version.rb +3 -3
- data/lib/spider_helper.rb +110 -110
- data/list_spider.gemspec +31 -31
- data/spider_example.rb +27 -27
- data/spider_example_2.rb +29 -29
- metadata +3 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: e5cbae2e4b21b6976d8d6edd2a441914ebeb6db4bd6dabadb5ae1e0f9458698f
|
4
|
+
data.tar.gz: ebbf650a9eab92b50eeea5217cab6cd3ffa4b79036ebf4de552f11fe4435ee5d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 88569ec65a348f547d503c7b87a133beaf3066ae459755d4c34a7b3fcb2f73158c9993d95a595af592b451e34c3798f907deed7da2097153d3a021ad7f9088cb
|
7
|
+
data.tar.gz: 9fd5d151359f5c19102f6fd2a7679a9fdbdb88576b1b6fbe82950d6f43acff1b7fbd6ecf4eed9f67e6d7e80e793701b04d3469fa7649e6a12e5804ea24626ce4
|
data/.gitignore
CHANGED
@@ -1,84 +1,84 @@
|
|
1
|
-
*.gem
|
2
|
-
*.rbc
|
3
|
-
/.config
|
4
|
-
/coverage/
|
5
|
-
/InstalledFiles
|
6
|
-
/pkg/
|
7
|
-
/spec/reports/
|
8
|
-
/spec/examples.txt
|
9
|
-
/test/tmp/
|
10
|
-
/test/version_tmp/
|
11
|
-
/tmp/
|
12
|
-
|
13
|
-
## Specific to RubyMotion:
|
14
|
-
.dat*
|
15
|
-
.repl_history
|
16
|
-
build/
|
17
|
-
|
18
|
-
## Documentation cache and generated files:
|
19
|
-
/.yardoc/
|
20
|
-
/_yardoc/
|
21
|
-
/doc/
|
22
|
-
/rdoc/
|
23
|
-
|
24
|
-
## Environment normalisation:
|
25
|
-
/.bundle/
|
26
|
-
/vendor/bundle
|
27
|
-
/lib/bundler/man/
|
28
|
-
|
29
|
-
# for a library or gem, you might want to ignore these files since the code is
|
30
|
-
# intended to run in multiple environments; otherwise, check them in:
|
31
|
-
# Gemfile.lock
|
32
|
-
# .ruby-version
|
33
|
-
# .ruby-gemset
|
34
|
-
|
35
|
-
# unless supporting rvm < 1.11.0 or doing something fancy, ignore this:
|
36
|
-
.rvmrc
|
37
|
-
|
38
|
-
.DS_Store
|
39
|
-
.AppleDouble
|
40
|
-
.LSOverride
|
41
|
-
|
42
|
-
# Icon must end with two \r
|
43
|
-
Icon
|
44
|
-
|
45
|
-
|
46
|
-
# Thumbnails
|
47
|
-
._*
|
48
|
-
|
49
|
-
# Files that might appear in the root of a volume
|
50
|
-
.DocumentRevisions-V100
|
51
|
-
.fseventsd
|
52
|
-
.Spotlight-V100
|
53
|
-
.TemporaryItems
|
54
|
-
.Trashes
|
55
|
-
.VolumeIcon.icns
|
56
|
-
|
57
|
-
# Directories potentially created on remote AFP share
|
58
|
-
.AppleDB
|
59
|
-
.AppleDesktop
|
60
|
-
Network Trash Folder
|
61
|
-
Temporary Items
|
62
|
-
.apdisk
|
63
|
-
|
64
|
-
# Windows image file caches
|
65
|
-
Thumbs.db
|
66
|
-
ehthumbs.db
|
67
|
-
|
68
|
-
# Folder config file
|
69
|
-
Desktop.ini
|
70
|
-
|
71
|
-
# Recycle Bin used on file shares
|
72
|
-
$RECYCLE.BIN/
|
73
|
-
|
74
|
-
# Windows Installer files
|
75
|
-
*.cab
|
76
|
-
*.msi
|
77
|
-
*.msm
|
78
|
-
*.msp
|
79
|
-
|
80
|
-
# Windows shortcuts
|
81
|
-
*.lnk
|
82
|
-
|
83
|
-
rubocopresult
|
84
|
-
coolshell
|
1
|
+
*.gem
|
2
|
+
*.rbc
|
3
|
+
/.config
|
4
|
+
/coverage/
|
5
|
+
/InstalledFiles
|
6
|
+
/pkg/
|
7
|
+
/spec/reports/
|
8
|
+
/spec/examples.txt
|
9
|
+
/test/tmp/
|
10
|
+
/test/version_tmp/
|
11
|
+
/tmp/
|
12
|
+
|
13
|
+
## Specific to RubyMotion:
|
14
|
+
.dat*
|
15
|
+
.repl_history
|
16
|
+
build/
|
17
|
+
|
18
|
+
## Documentation cache and generated files:
|
19
|
+
/.yardoc/
|
20
|
+
/_yardoc/
|
21
|
+
/doc/
|
22
|
+
/rdoc/
|
23
|
+
|
24
|
+
## Environment normalisation:
|
25
|
+
/.bundle/
|
26
|
+
/vendor/bundle
|
27
|
+
/lib/bundler/man/
|
28
|
+
|
29
|
+
# for a library or gem, you might want to ignore these files since the code is
|
30
|
+
# intended to run in multiple environments; otherwise, check them in:
|
31
|
+
# Gemfile.lock
|
32
|
+
# .ruby-version
|
33
|
+
# .ruby-gemset
|
34
|
+
|
35
|
+
# unless supporting rvm < 1.11.0 or doing something fancy, ignore this:
|
36
|
+
.rvmrc
|
37
|
+
|
38
|
+
.DS_Store
|
39
|
+
.AppleDouble
|
40
|
+
.LSOverride
|
41
|
+
|
42
|
+
# Icon must end with two \r
|
43
|
+
Icon
|
44
|
+
|
45
|
+
|
46
|
+
# Thumbnails
|
47
|
+
._*
|
48
|
+
|
49
|
+
# Files that might appear in the root of a volume
|
50
|
+
.DocumentRevisions-V100
|
51
|
+
.fseventsd
|
52
|
+
.Spotlight-V100
|
53
|
+
.TemporaryItems
|
54
|
+
.Trashes
|
55
|
+
.VolumeIcon.icns
|
56
|
+
|
57
|
+
# Directories potentially created on remote AFP share
|
58
|
+
.AppleDB
|
59
|
+
.AppleDesktop
|
60
|
+
Network Trash Folder
|
61
|
+
Temporary Items
|
62
|
+
.apdisk
|
63
|
+
|
64
|
+
# Windows image file caches
|
65
|
+
Thumbs.db
|
66
|
+
ehthumbs.db
|
67
|
+
|
68
|
+
# Folder config file
|
69
|
+
Desktop.ini
|
70
|
+
|
71
|
+
# Recycle Bin used on file shares
|
72
|
+
$RECYCLE.BIN/
|
73
|
+
|
74
|
+
# Windows Installer files
|
75
|
+
*.cab
|
76
|
+
*.msi
|
77
|
+
*.msm
|
78
|
+
*.msp
|
79
|
+
|
80
|
+
# Windows shortcuts
|
81
|
+
*.lnk
|
82
|
+
|
83
|
+
rubocopresult
|
84
|
+
coolshell
|
data/.rdoc_options
CHANGED
@@ -1,23 +1,23 @@
|
|
1
|
-
--- !ruby/object:RDoc::Options
|
2
|
-
encoding: UTF-8
|
3
|
-
static_path: []
|
4
|
-
rdoc_include:
|
5
|
-
- "."
|
6
|
-
- "/Users/zhangchao/github/list_spider"
|
7
|
-
charset: UTF-8
|
8
|
-
exclude:
|
9
|
-
hyperlink_all: false
|
10
|
-
line_numbers: false
|
11
|
-
locale:
|
12
|
-
locale_dir: locale
|
13
|
-
locale_name:
|
14
|
-
main_page:
|
15
|
-
markup: markdown
|
16
|
-
output_decoration: true
|
17
|
-
page_dir:
|
18
|
-
show_hash: false
|
19
|
-
tab_width: 8
|
20
|
-
template_stylesheets: []
|
21
|
-
title:
|
22
|
-
visibility: :protected
|
23
|
-
webcvs:
|
1
|
+
--- !ruby/object:RDoc::Options
|
2
|
+
encoding: UTF-8
|
3
|
+
static_path: []
|
4
|
+
rdoc_include:
|
5
|
+
- "."
|
6
|
+
- "/Users/zhangchao/github/list_spider"
|
7
|
+
charset: UTF-8
|
8
|
+
exclude:
|
9
|
+
hyperlink_all: false
|
10
|
+
line_numbers: false
|
11
|
+
locale:
|
12
|
+
locale_dir: locale
|
13
|
+
locale_name:
|
14
|
+
main_page:
|
15
|
+
markup: markdown
|
16
|
+
output_decoration: true
|
17
|
+
page_dir:
|
18
|
+
show_hash: false
|
19
|
+
tab_width: 8
|
20
|
+
template_stylesheets: []
|
21
|
+
title:
|
22
|
+
visibility: :protected
|
23
|
+
webcvs:
|
data/.rubocop.yml
CHANGED
@@ -1,48 +1,48 @@
|
|
1
|
-
Metrics/LineLength:
|
2
|
-
Max: 120
|
3
|
-
Metrics/MethodLength:
|
4
|
-
Max: 50
|
5
|
-
Metrics/ParameterLists:
|
6
|
-
Max: 12
|
7
|
-
Metrics/AbcSize:
|
8
|
-
Max: 50
|
9
|
-
Metrics/CyclomaticComplexity:
|
10
|
-
Max: 10
|
11
|
-
Metrics/PerceivedComplexity:
|
12
|
-
Max: 10
|
13
|
-
Style/GuardClause:
|
14
|
-
MinBodyLength: 5
|
15
|
-
Style/AsciiComments:
|
16
|
-
Enabled: false
|
17
|
-
Style/Documentation:
|
18
|
-
Enabled: false
|
19
|
-
Lint/AmbiguousRegexpLiteral:
|
20
|
-
Enabled: false
|
21
|
-
Layout/DefEndAlignment:
|
22
|
-
AutoCorrect: true
|
23
|
-
Layout/EndAlignment:
|
24
|
-
AutoCorrect: true
|
25
|
-
Style/BracesAroundHashParameters:
|
26
|
-
Enabled: false
|
27
|
-
Style/ClassAndModuleChildren:
|
28
|
-
Enabled: false
|
29
|
-
Style/AutoResourceCleanup:
|
30
|
-
Enabled: true
|
31
|
-
Style/CollectionMethods:
|
32
|
-
Enabled: true
|
33
|
-
Style/Encoding:
|
34
|
-
Enabled: true
|
35
|
-
Style/MethodCalledOnDoEndBlock:
|
36
|
-
Enabled: true
|
37
|
-
Layout/MultilineAssignmentLayout:
|
38
|
-
Enabled: true
|
39
|
-
Style/OptionHash:
|
40
|
-
Enabled: true
|
41
|
-
Style/StringMethods:
|
42
|
-
Enabled: true
|
43
|
-
Style/SymbolArray:
|
44
|
-
Enabled: true
|
45
|
-
Style/NonNilCheck:
|
46
|
-
IncludeSemanticChanges: true
|
47
|
-
Style/Send:
|
48
|
-
Enabled: true
|
1
|
+
Metrics/LineLength:
|
2
|
+
Max: 120
|
3
|
+
Metrics/MethodLength:
|
4
|
+
Max: 50
|
5
|
+
Metrics/ParameterLists:
|
6
|
+
Max: 12
|
7
|
+
Metrics/AbcSize:
|
8
|
+
Max: 50
|
9
|
+
Metrics/CyclomaticComplexity:
|
10
|
+
Max: 10
|
11
|
+
Metrics/PerceivedComplexity:
|
12
|
+
Max: 10
|
13
|
+
Style/GuardClause:
|
14
|
+
MinBodyLength: 5
|
15
|
+
Style/AsciiComments:
|
16
|
+
Enabled: false
|
17
|
+
Style/Documentation:
|
18
|
+
Enabled: false
|
19
|
+
Lint/AmbiguousRegexpLiteral:
|
20
|
+
Enabled: false
|
21
|
+
Layout/DefEndAlignment:
|
22
|
+
AutoCorrect: true
|
23
|
+
Layout/EndAlignment:
|
24
|
+
AutoCorrect: true
|
25
|
+
Style/BracesAroundHashParameters:
|
26
|
+
Enabled: false
|
27
|
+
Style/ClassAndModuleChildren:
|
28
|
+
Enabled: false
|
29
|
+
Style/AutoResourceCleanup:
|
30
|
+
Enabled: true
|
31
|
+
Style/CollectionMethods:
|
32
|
+
Enabled: true
|
33
|
+
Style/Encoding:
|
34
|
+
Enabled: true
|
35
|
+
Style/MethodCalledOnDoEndBlock:
|
36
|
+
Enabled: true
|
37
|
+
Layout/MultilineAssignmentLayout:
|
38
|
+
Enabled: true
|
39
|
+
Style/OptionHash:
|
40
|
+
Enabled: true
|
41
|
+
Style/StringMethods:
|
42
|
+
Enabled: true
|
43
|
+
Style/SymbolArray:
|
44
|
+
Enabled: true
|
45
|
+
Style/NonNilCheck:
|
46
|
+
IncludeSemanticChanges: true
|
47
|
+
Style/Send:
|
48
|
+
Enabled: true
|
data/English_README.md
CHANGED
@@ -1,169 +1,169 @@
|
|
1
|
-
# list_spider
|
2
|
-
|
3
|
-
A url list spider based on em-http-request.
|
4
|
-
|
5
|
-
Many times we only need to spider by url list then parse them and spider again. This is for the purpose.
|
6
|
-
|
7
|
-
## Features
|
8
|
-
* Duplicate url filtering (based on local path, so you can custom your behavior).
|
9
|
-
|
10
|
-
* Convert to UTF-8 support.
|
11
|
-
|
12
|
-
* Increased spider support (don't spider exist).
|
13
|
-
|
14
|
-
* Customize concurrent number and interval between task.
|
15
|
-
|
16
|
-
* Http options support.
|
17
|
-
|
18
|
-
## Getting started
|
19
|
-
|
20
|
-
```ruby
|
21
|
-
gem install list_spider
|
22
|
-
```
|
23
|
-
|
24
|
-
Or add it to your Gemfile
|
25
|
-
|
26
|
-
```ruby
|
27
|
-
gem 'list_spider'
|
28
|
-
```
|
29
|
-
|
30
|
-
## Use like this
|
31
|
-
```ruby
|
32
|
-
require 'list_spider'
|
33
|
-
|
34
|
-
DOWNLOAD_DIR = 'coolshell/'.freeze
|
35
|
-
|
36
|
-
@next_list = []
|
37
|
-
|
38
|
-
def parse_index_item(e)
|
39
|
-
content = File.read(e.local_path)
|
40
|
-
doc = Nokogiri::HTML(content)
|
41
|
-
list_group = doc.css('h2.entry-title')
|
42
|
-
link_list = list_group.css('a')
|
43
|
-
|
44
|
-
link_list.each do |link|
|
45
|
-
href = link['href']
|
46
|
-
local_path = DOWNLOAD_DIR + link.content + '.html'
|
47
|
-
# or you can save them to database for later use
|
48
|
-
@next_list << TaskStruct.new(href, local_path)
|
49
|
-
end
|
50
|
-
end
|
51
|
-
|
52
|
-
task_list = []
|
53
|
-
task_list << TaskStruct.new(
|
54
|
-
'https://coolshell.cn/',
|
55
|
-
DOWNLOAD_DIR + 'index.html',
|
56
|
-
parse_method: method(:parse_index_item)
|
57
|
-
)
|
58
|
-
|
59
|
-
ListSpider.get_list(task_list)
|
60
|
-
ListSpider.get_list(@next_list, max: 60)
|
61
|
-
```
|
62
|
-
|
63
|
-
## Or in one step
|
64
|
-
```ruby
|
65
|
-
require 'list_spider'
|
66
|
-
|
67
|
-
DOWNLOAD_DIR = 'coolshell/'.freeze
|
68
|
-
|
69
|
-
def parse_index_item(e)
|
70
|
-
content = File.read(e.local_path)
|
71
|
-
doc = Nokogiri::HTML(content)
|
72
|
-
list_group = doc.css('h2.entry-title')
|
73
|
-
link_list = list_group.css('a')
|
74
|
-
|
75
|
-
link_list.each do |link|
|
76
|
-
href = link['href']
|
77
|
-
local_path = DOWNLOAD_DIR + link.content + '.html'
|
78
|
-
ListSpider.add_task(TaskStruct.new(href, local_path))
|
79
|
-
end
|
80
|
-
end
|
81
|
-
|
82
|
-
# get_one is a simple function for one taskstruct situation
|
83
|
-
ListSpider.get_one(
|
84
|
-
TaskStruct.new(
|
85
|
-
'https://coolshell.cn/',
|
86
|
-
DOWNLOAD_DIR + 'index.html',
|
87
|
-
parse_method: method(:parse_index_item)
|
88
|
-
),
|
89
|
-
max: 60
|
90
|
-
)
|
91
|
-
```
|
92
|
-
|
93
|
-
## And there are many options you can use
|
94
|
-
|
95
|
-
```ruby
|
96
|
-
def initialize(href, # 请求链接
|
97
|
-
local_path, # 保存数据的本地路径(此路径作为去重标准)
|
98
|
-
# http方法,取值::get, :head, :delete, :put, :post, :patch, :options
|
99
|
-
http_method: :get,
|
100
|
-
custom_data: nil, # 自定义数据
|
101
|
-
parse_method: nil, # 解析保存文件的回调,参数是TaskStruct对象本身
|
102
|
-
# 请求成功后的回调,此时可能没有保存文件,比如301,404
|
103
|
-
# 参数是TaskStruct对象本身和对应的EventMachine::HttpRequest对象
|
104
|
-
# http.response_header.status 状态码
|
105
|
-
# http.response_header 返回头
|
106
|
-
# http.response 返回体
|
107
|
-
callback: nil,
|
108
|
-
# 请求失败后的回调
|
109
|
-
# 参数是TaskStruct对象本身和对应的EventMachine::HttpRequest对象
|
110
|
-
errback: nil,
|
111
|
-
stream_callback: nil, # 流数据处理回调
|
112
|
-
convert_to_utf8: false, # 是否转换为utf8编码
|
113
|
-
overwrite_exist: false, # 是否覆盖现有文件
|
114
|
-
# request options
|
115
|
-
redirects: 3, # 重定向次数
|
116
|
-
keepalive: nil, # (暂不支持复用)
|
117
|
-
file: nil, # 要上传的文件路径
|
118
|
-
path: nil, # 请求路径,在流水线方式请求时有用(暂不支持)
|
119
|
-
query: nil, # 查询字符串,可以是string或hash类型
|
120
|
-
body: nil, # 请求体,可以是string或hash类型
|
121
|
-
head: nil, # 请求头
|
122
|
-
# connection options
|
123
|
-
connect_timeout: 60, # 连接超时时间
|
124
|
-
inactivity_timeout: nil, # 连接后超时时间
|
125
|
-
# ssl设置
|
126
|
-
# ssl: {
|
127
|
-
# :private_key_file => '/tmp/server.key',
|
128
|
-
# :cert_chain_file => '/tmp/server.crt',
|
129
|
-
# :verify_peer => false
|
130
|
-
# }
|
131
|
-
ssl: nil,
|
132
|
-
# bind: {
|
133
|
-
# :host => '123.123.123.123', # use a specific interface for outbound request
|
134
|
-
# :port => '123'
|
135
|
-
# }
|
136
|
-
bind: nil,
|
137
|
-
# 代理设置
|
138
|
-
# proxy: {
|
139
|
-
# :host => '127.0.0.1', # proxy address
|
140
|
-
# :port => 9000, # proxy port
|
141
|
-
# :type => :socks5 # default proxy mode is HTTP proxy, change to :socks5 if required
|
142
|
-
|
143
|
-
# :authorization => ['user', 'pass'] # proxy authorization header
|
144
|
-
# }
|
145
|
-
proxy: nil)
|
146
|
-
```
|
147
|
-
|
148
|
-
## Callback methods form
|
149
|
-
|
150
|
-
```ruby
|
151
|
-
# called when the file is saved successfully
|
152
|
-
def parse_eresponse(task_struct)
|
153
|
-
# ...
|
154
|
-
end
|
155
|
-
|
156
|
-
def call_back(task_struct, http_req)
|
157
|
-
# http_req is a EventMachine::HttpRequest object
|
158
|
-
# http_req.response_header.status
|
159
|
-
# ...
|
160
|
-
end
|
161
|
-
|
162
|
-
def err_back(task_struct, http_req)
|
163
|
-
# ...
|
164
|
-
end
|
165
|
-
```
|
166
|
-
|
167
|
-
### License
|
168
|
-
|
169
|
-
(MIT License) - Copyright (c) 2016 Charles Zhang
|
1
|
+
# list_spider
|
2
|
+
|
3
|
+
A url list spider based on em-http-request.
|
4
|
+
|
5
|
+
Many times we only need to spider by url list then parse them and spider again. This is for the purpose.
|
6
|
+
|
7
|
+
## Features
|
8
|
+
* Duplicate url filtering (based on local path, so you can custom your behavior).
|
9
|
+
|
10
|
+
* Convert to UTF-8 support.
|
11
|
+
|
12
|
+
* Increased spider support (don't spider exist).
|
13
|
+
|
14
|
+
* Customize concurrent number and interval between task.
|
15
|
+
|
16
|
+
* Http options support.
|
17
|
+
|
18
|
+
## Getting started
|
19
|
+
|
20
|
+
```ruby
|
21
|
+
gem install list_spider
|
22
|
+
```
|
23
|
+
|
24
|
+
Or add it to your Gemfile
|
25
|
+
|
26
|
+
```ruby
|
27
|
+
gem 'list_spider'
|
28
|
+
```
|
29
|
+
|
30
|
+
## Use like this
|
31
|
+
```ruby
|
32
|
+
require 'list_spider'
|
33
|
+
|
34
|
+
DOWNLOAD_DIR = 'coolshell/'.freeze
|
35
|
+
|
36
|
+
@next_list = []
|
37
|
+
|
38
|
+
def parse_index_item(e)
|
39
|
+
content = File.read(e.local_path)
|
40
|
+
doc = Nokogiri::HTML(content)
|
41
|
+
list_group = doc.css('h2.entry-title')
|
42
|
+
link_list = list_group.css('a')
|
43
|
+
|
44
|
+
link_list.each do |link|
|
45
|
+
href = link['href']
|
46
|
+
local_path = DOWNLOAD_DIR + link.content + '.html'
|
47
|
+
# or you can save them to database for later use
|
48
|
+
@next_list << TaskStruct.new(href, local_path)
|
49
|
+
end
|
50
|
+
end
|
51
|
+
|
52
|
+
task_list = []
|
53
|
+
task_list << TaskStruct.new(
|
54
|
+
'https://coolshell.cn/',
|
55
|
+
DOWNLOAD_DIR + 'index.html',
|
56
|
+
parse_method: method(:parse_index_item)
|
57
|
+
)
|
58
|
+
|
59
|
+
ListSpider.get_list(task_list)
|
60
|
+
ListSpider.get_list(@next_list, max: 60)
|
61
|
+
```
|
62
|
+
|
63
|
+
## Or in one step
|
64
|
+
```ruby
|
65
|
+
require 'list_spider'
|
66
|
+
|
67
|
+
DOWNLOAD_DIR = 'coolshell/'.freeze
|
68
|
+
|
69
|
+
def parse_index_item(e)
|
70
|
+
content = File.read(e.local_path)
|
71
|
+
doc = Nokogiri::HTML(content)
|
72
|
+
list_group = doc.css('h2.entry-title')
|
73
|
+
link_list = list_group.css('a')
|
74
|
+
|
75
|
+
link_list.each do |link|
|
76
|
+
href = link['href']
|
77
|
+
local_path = DOWNLOAD_DIR + link.content + '.html'
|
78
|
+
ListSpider.add_task(TaskStruct.new(href, local_path))
|
79
|
+
end
|
80
|
+
end
|
81
|
+
|
82
|
+
# get_one is a simple function for one taskstruct situation
|
83
|
+
ListSpider.get_one(
|
84
|
+
TaskStruct.new(
|
85
|
+
'https://coolshell.cn/',
|
86
|
+
DOWNLOAD_DIR + 'index.html',
|
87
|
+
parse_method: method(:parse_index_item)
|
88
|
+
),
|
89
|
+
max: 60
|
90
|
+
)
|
91
|
+
```
|
92
|
+
|
93
|
+
## And there are many options you can use
|
94
|
+
|
95
|
+
```ruby
|
96
|
+
def initialize(href, # 请求链接
|
97
|
+
local_path, # 保存数据的本地路径(此路径作为去重标准)
|
98
|
+
# http方法,取值::get, :head, :delete, :put, :post, :patch, :options
|
99
|
+
http_method: :get,
|
100
|
+
custom_data: nil, # 自定义数据
|
101
|
+
parse_method: nil, # 解析保存文件的回调,参数是TaskStruct对象本身
|
102
|
+
# 请求成功后的回调,此时可能没有保存文件,比如301,404
|
103
|
+
# 参数是TaskStruct对象本身和对应的EventMachine::HttpRequest对象
|
104
|
+
# http.response_header.status 状态码
|
105
|
+
# http.response_header 返回头
|
106
|
+
# http.response 返回体
|
107
|
+
callback: nil,
|
108
|
+
# 请求失败后的回调
|
109
|
+
# 参数是TaskStruct对象本身和对应的EventMachine::HttpRequest对象
|
110
|
+
errback: nil,
|
111
|
+
stream_callback: nil, # 流数据处理回调
|
112
|
+
convert_to_utf8: false, # 是否转换为utf8编码
|
113
|
+
overwrite_exist: false, # 是否覆盖现有文件
|
114
|
+
# request options
|
115
|
+
redirects: 3, # 重定向次数
|
116
|
+
keepalive: nil, # (暂不支持复用)
|
117
|
+
file: nil, # 要上传的文件路径
|
118
|
+
path: nil, # 请求路径,在流水线方式请求时有用(暂不支持)
|
119
|
+
query: nil, # 查询字符串,可以是string或hash类型
|
120
|
+
body: nil, # 请求体,可以是string或hash类型
|
121
|
+
head: nil, # 请求头
|
122
|
+
# connection options
|
123
|
+
connect_timeout: 60, # 连接超时时间
|
124
|
+
inactivity_timeout: nil, # 连接后超时时间
|
125
|
+
# ssl设置
|
126
|
+
# ssl: {
|
127
|
+
# :private_key_file => '/tmp/server.key',
|
128
|
+
# :cert_chain_file => '/tmp/server.crt',
|
129
|
+
# :verify_peer => false
|
130
|
+
# }
|
131
|
+
ssl: nil,
|
132
|
+
# bind: {
|
133
|
+
# :host => '123.123.123.123', # use a specific interface for outbound request
|
134
|
+
# :port => '123'
|
135
|
+
# }
|
136
|
+
bind: nil,
|
137
|
+
# 代理设置
|
138
|
+
# proxy: {
|
139
|
+
# :host => '127.0.0.1', # proxy address
|
140
|
+
# :port => 9000, # proxy port
|
141
|
+
# :type => :socks5 # default proxy mode is HTTP proxy, change to :socks5 if required
|
142
|
+
|
143
|
+
# :authorization => ['user', 'pass'] # proxy authorization header
|
144
|
+
# }
|
145
|
+
proxy: nil)
|
146
|
+
```
|
147
|
+
|
148
|
+
## Callback methods form
|
149
|
+
|
150
|
+
```ruby
|
151
|
+
# called when the file is saved successfully
|
152
|
+
def parse_eresponse(task_struct)
|
153
|
+
# ...
|
154
|
+
end
|
155
|
+
|
156
|
+
def call_back(task_struct, http_req)
|
157
|
+
# http_req is a EventMachine::HttpRequest object
|
158
|
+
# http_req.response_header.status
|
159
|
+
# ...
|
160
|
+
end
|
161
|
+
|
162
|
+
def err_back(task_struct, http_req)
|
163
|
+
# ...
|
164
|
+
end
|
165
|
+
```
|
166
|
+
|
167
|
+
### License
|
168
|
+
|
169
|
+
(MIT License) - Copyright (c) 2016 Charles Zhang
|