list_spider 2.2.0 → 2.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +84 -84
- data/.rdoc_options +23 -23
- data/.rubocop.yml +48 -48
- data/English_README.md +169 -169
- data/Gemfile +6 -6
- data/Gemfile.lock +12 -11
- data/README.md +181 -181
- data/Rakefile +2 -2
- data/bin/console +14 -14
- data/bin/setup +8 -8
- data/check_code.sh +2 -2
- data/lib/file_filter.rb +72 -72
- data/lib/list_spider.rb +297 -297
- data/lib/list_spider/version.rb +3 -3
- data/lib/spider_helper.rb +110 -110
- data/list_spider.gemspec +31 -31
- data/spider_example.rb +27 -27
- data/spider_example_2.rb +29 -29
- metadata +6 -5
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: d2fe3c4889a58bacda6fda400b4156c2adf4fbe774bdf9f15596958a1909d890
|
4
|
+
data.tar.gz: db682834ba36559dfc4bbcb704151b4eddea315b614d1c0ef9aa9d170aa69f58
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: a259eb91eda3257d3be3df839274a4d40a3c11a4dc2fa0b3a57e580cef6aa180f06d27b05a2b2ee30ceb0e4bcbf037f55e36351d2034ae41f6636dcadaa14aa3
|
7
|
+
data.tar.gz: 9d2062238acf39de5fd8c86d47eb7106640b31ebf87ff42de245a6e911a2efb3e54e7d4592ede162eb248cc33b59e00482781c023a6b1c739b65b703bc8a39e5
|
data/.gitignore
CHANGED
@@ -1,84 +1,84 @@
|
|
1
|
-
*.gem
|
2
|
-
*.rbc
|
3
|
-
/.config
|
4
|
-
/coverage/
|
5
|
-
/InstalledFiles
|
6
|
-
/pkg/
|
7
|
-
/spec/reports/
|
8
|
-
/spec/examples.txt
|
9
|
-
/test/tmp/
|
10
|
-
/test/version_tmp/
|
11
|
-
/tmp/
|
12
|
-
|
13
|
-
## Specific to RubyMotion:
|
14
|
-
.dat*
|
15
|
-
.repl_history
|
16
|
-
build/
|
17
|
-
|
18
|
-
## Documentation cache and generated files:
|
19
|
-
/.yardoc/
|
20
|
-
/_yardoc/
|
21
|
-
/doc/
|
22
|
-
/rdoc/
|
23
|
-
|
24
|
-
## Environment normalisation:
|
25
|
-
/.bundle/
|
26
|
-
/vendor/bundle
|
27
|
-
/lib/bundler/man/
|
28
|
-
|
29
|
-
# for a library or gem, you might want to ignore these files since the code is
|
30
|
-
# intended to run in multiple environments; otherwise, check them in:
|
31
|
-
# Gemfile.lock
|
32
|
-
# .ruby-version
|
33
|
-
# .ruby-gemset
|
34
|
-
|
35
|
-
# unless supporting rvm < 1.11.0 or doing something fancy, ignore this:
|
36
|
-
.rvmrc
|
37
|
-
|
38
|
-
.DS_Store
|
39
|
-
.AppleDouble
|
40
|
-
.LSOverride
|
41
|
-
|
42
|
-
# Icon must end with two \r
|
43
|
-
Icon
|
44
|
-
|
45
|
-
|
46
|
-
# Thumbnails
|
47
|
-
._*
|
48
|
-
|
49
|
-
# Files that might appear in the root of a volume
|
50
|
-
.DocumentRevisions-V100
|
51
|
-
.fseventsd
|
52
|
-
.Spotlight-V100
|
53
|
-
.TemporaryItems
|
54
|
-
.Trashes
|
55
|
-
.VolumeIcon.icns
|
56
|
-
|
57
|
-
# Directories potentially created on remote AFP share
|
58
|
-
.AppleDB
|
59
|
-
.AppleDesktop
|
60
|
-
Network Trash Folder
|
61
|
-
Temporary Items
|
62
|
-
.apdisk
|
63
|
-
|
64
|
-
# Windows image file caches
|
65
|
-
Thumbs.db
|
66
|
-
ehthumbs.db
|
67
|
-
|
68
|
-
# Folder config file
|
69
|
-
Desktop.ini
|
70
|
-
|
71
|
-
# Recycle Bin used on file shares
|
72
|
-
$RECYCLE.BIN/
|
73
|
-
|
74
|
-
# Windows Installer files
|
75
|
-
*.cab
|
76
|
-
*.msi
|
77
|
-
*.msm
|
78
|
-
*.msp
|
79
|
-
|
80
|
-
# Windows shortcuts
|
81
|
-
*.lnk
|
82
|
-
|
83
|
-
rubocopresult
|
84
|
-
coolshell
|
1
|
+
*.gem
|
2
|
+
*.rbc
|
3
|
+
/.config
|
4
|
+
/coverage/
|
5
|
+
/InstalledFiles
|
6
|
+
/pkg/
|
7
|
+
/spec/reports/
|
8
|
+
/spec/examples.txt
|
9
|
+
/test/tmp/
|
10
|
+
/test/version_tmp/
|
11
|
+
/tmp/
|
12
|
+
|
13
|
+
## Specific to RubyMotion:
|
14
|
+
.dat*
|
15
|
+
.repl_history
|
16
|
+
build/
|
17
|
+
|
18
|
+
## Documentation cache and generated files:
|
19
|
+
/.yardoc/
|
20
|
+
/_yardoc/
|
21
|
+
/doc/
|
22
|
+
/rdoc/
|
23
|
+
|
24
|
+
## Environment normalisation:
|
25
|
+
/.bundle/
|
26
|
+
/vendor/bundle
|
27
|
+
/lib/bundler/man/
|
28
|
+
|
29
|
+
# for a library or gem, you might want to ignore these files since the code is
|
30
|
+
# intended to run in multiple environments; otherwise, check them in:
|
31
|
+
# Gemfile.lock
|
32
|
+
# .ruby-version
|
33
|
+
# .ruby-gemset
|
34
|
+
|
35
|
+
# unless supporting rvm < 1.11.0 or doing something fancy, ignore this:
|
36
|
+
.rvmrc
|
37
|
+
|
38
|
+
.DS_Store
|
39
|
+
.AppleDouble
|
40
|
+
.LSOverride
|
41
|
+
|
42
|
+
# Icon must end with two \r
|
43
|
+
Icon
|
44
|
+
|
45
|
+
|
46
|
+
# Thumbnails
|
47
|
+
._*
|
48
|
+
|
49
|
+
# Files that might appear in the root of a volume
|
50
|
+
.DocumentRevisions-V100
|
51
|
+
.fseventsd
|
52
|
+
.Spotlight-V100
|
53
|
+
.TemporaryItems
|
54
|
+
.Trashes
|
55
|
+
.VolumeIcon.icns
|
56
|
+
|
57
|
+
# Directories potentially created on remote AFP share
|
58
|
+
.AppleDB
|
59
|
+
.AppleDesktop
|
60
|
+
Network Trash Folder
|
61
|
+
Temporary Items
|
62
|
+
.apdisk
|
63
|
+
|
64
|
+
# Windows image file caches
|
65
|
+
Thumbs.db
|
66
|
+
ehthumbs.db
|
67
|
+
|
68
|
+
# Folder config file
|
69
|
+
Desktop.ini
|
70
|
+
|
71
|
+
# Recycle Bin used on file shares
|
72
|
+
$RECYCLE.BIN/
|
73
|
+
|
74
|
+
# Windows Installer files
|
75
|
+
*.cab
|
76
|
+
*.msi
|
77
|
+
*.msm
|
78
|
+
*.msp
|
79
|
+
|
80
|
+
# Windows shortcuts
|
81
|
+
*.lnk
|
82
|
+
|
83
|
+
rubocopresult
|
84
|
+
coolshell
|
data/.rdoc_options
CHANGED
@@ -1,23 +1,23 @@
|
|
1
|
-
--- !ruby/object:RDoc::Options
|
2
|
-
encoding: UTF-8
|
3
|
-
static_path: []
|
4
|
-
rdoc_include:
|
5
|
-
- "."
|
6
|
-
- "/Users/zhangchao/github/list_spider"
|
7
|
-
charset: UTF-8
|
8
|
-
exclude:
|
9
|
-
hyperlink_all: false
|
10
|
-
line_numbers: false
|
11
|
-
locale:
|
12
|
-
locale_dir: locale
|
13
|
-
locale_name:
|
14
|
-
main_page:
|
15
|
-
markup: markdown
|
16
|
-
output_decoration: true
|
17
|
-
page_dir:
|
18
|
-
show_hash: false
|
19
|
-
tab_width: 8
|
20
|
-
template_stylesheets: []
|
21
|
-
title:
|
22
|
-
visibility: :protected
|
23
|
-
webcvs:
|
1
|
+
--- !ruby/object:RDoc::Options
|
2
|
+
encoding: UTF-8
|
3
|
+
static_path: []
|
4
|
+
rdoc_include:
|
5
|
+
- "."
|
6
|
+
- "/Users/zhangchao/github/list_spider"
|
7
|
+
charset: UTF-8
|
8
|
+
exclude:
|
9
|
+
hyperlink_all: false
|
10
|
+
line_numbers: false
|
11
|
+
locale:
|
12
|
+
locale_dir: locale
|
13
|
+
locale_name:
|
14
|
+
main_page:
|
15
|
+
markup: markdown
|
16
|
+
output_decoration: true
|
17
|
+
page_dir:
|
18
|
+
show_hash: false
|
19
|
+
tab_width: 8
|
20
|
+
template_stylesheets: []
|
21
|
+
title:
|
22
|
+
visibility: :protected
|
23
|
+
webcvs:
|
data/.rubocop.yml
CHANGED
@@ -1,48 +1,48 @@
|
|
1
|
-
Metrics/LineLength:
|
2
|
-
Max: 120
|
3
|
-
Metrics/MethodLength:
|
4
|
-
Max: 50
|
5
|
-
Metrics/ParameterLists:
|
6
|
-
Max: 12
|
7
|
-
Metrics/AbcSize:
|
8
|
-
Max: 50
|
9
|
-
Metrics/CyclomaticComplexity:
|
10
|
-
Max: 10
|
11
|
-
Metrics/PerceivedComplexity:
|
12
|
-
Max: 10
|
13
|
-
Style/GuardClause:
|
14
|
-
MinBodyLength: 5
|
15
|
-
Style/AsciiComments:
|
16
|
-
Enabled: false
|
17
|
-
Style/Documentation:
|
18
|
-
Enabled: false
|
19
|
-
Lint/AmbiguousRegexpLiteral:
|
20
|
-
Enabled: false
|
21
|
-
Layout/DefEndAlignment:
|
22
|
-
AutoCorrect: true
|
23
|
-
Layout/EndAlignment:
|
24
|
-
AutoCorrect: true
|
25
|
-
Style/BracesAroundHashParameters:
|
26
|
-
Enabled: false
|
27
|
-
Style/ClassAndModuleChildren:
|
28
|
-
Enabled: false
|
29
|
-
Style/AutoResourceCleanup:
|
30
|
-
Enabled: true
|
31
|
-
Style/CollectionMethods:
|
32
|
-
Enabled: true
|
33
|
-
Style/Encoding:
|
34
|
-
Enabled: true
|
35
|
-
Style/MethodCalledOnDoEndBlock:
|
36
|
-
Enabled: true
|
37
|
-
Layout/MultilineAssignmentLayout:
|
38
|
-
Enabled: true
|
39
|
-
Style/OptionHash:
|
40
|
-
Enabled: true
|
41
|
-
Style/StringMethods:
|
42
|
-
Enabled: true
|
43
|
-
Style/SymbolArray:
|
44
|
-
Enabled: true
|
45
|
-
Style/NonNilCheck:
|
46
|
-
IncludeSemanticChanges: true
|
47
|
-
Style/Send:
|
48
|
-
Enabled: true
|
1
|
+
Metrics/LineLength:
|
2
|
+
Max: 120
|
3
|
+
Metrics/MethodLength:
|
4
|
+
Max: 50
|
5
|
+
Metrics/ParameterLists:
|
6
|
+
Max: 12
|
7
|
+
Metrics/AbcSize:
|
8
|
+
Max: 50
|
9
|
+
Metrics/CyclomaticComplexity:
|
10
|
+
Max: 10
|
11
|
+
Metrics/PerceivedComplexity:
|
12
|
+
Max: 10
|
13
|
+
Style/GuardClause:
|
14
|
+
MinBodyLength: 5
|
15
|
+
Style/AsciiComments:
|
16
|
+
Enabled: false
|
17
|
+
Style/Documentation:
|
18
|
+
Enabled: false
|
19
|
+
Lint/AmbiguousRegexpLiteral:
|
20
|
+
Enabled: false
|
21
|
+
Layout/DefEndAlignment:
|
22
|
+
AutoCorrect: true
|
23
|
+
Layout/EndAlignment:
|
24
|
+
AutoCorrect: true
|
25
|
+
Style/BracesAroundHashParameters:
|
26
|
+
Enabled: false
|
27
|
+
Style/ClassAndModuleChildren:
|
28
|
+
Enabled: false
|
29
|
+
Style/AutoResourceCleanup:
|
30
|
+
Enabled: true
|
31
|
+
Style/CollectionMethods:
|
32
|
+
Enabled: true
|
33
|
+
Style/Encoding:
|
34
|
+
Enabled: true
|
35
|
+
Style/MethodCalledOnDoEndBlock:
|
36
|
+
Enabled: true
|
37
|
+
Layout/MultilineAssignmentLayout:
|
38
|
+
Enabled: true
|
39
|
+
Style/OptionHash:
|
40
|
+
Enabled: true
|
41
|
+
Style/StringMethods:
|
42
|
+
Enabled: true
|
43
|
+
Style/SymbolArray:
|
44
|
+
Enabled: true
|
45
|
+
Style/NonNilCheck:
|
46
|
+
IncludeSemanticChanges: true
|
47
|
+
Style/Send:
|
48
|
+
Enabled: true
|
data/English_README.md
CHANGED
@@ -1,169 +1,169 @@
|
|
1
|
-
# list_spider
|
2
|
-
|
3
|
-
A url list spider based on em-http-request.
|
4
|
-
|
5
|
-
Many times we only need to spider by url list then parse them and spider again. This is for the purpose.
|
6
|
-
|
7
|
-
## Features
|
8
|
-
* Duplicate url filtering (based on local path, so you can custom your behavior).
|
9
|
-
|
10
|
-
* Convert to UTF-8 support.
|
11
|
-
|
12
|
-
* Increased spider support (don't spider exist).
|
13
|
-
|
14
|
-
* Customize concurrent number and interval between task.
|
15
|
-
|
16
|
-
* Http options support.
|
17
|
-
|
18
|
-
## Getting started
|
19
|
-
|
20
|
-
```ruby
|
21
|
-
gem install list_spider
|
22
|
-
```
|
23
|
-
|
24
|
-
Or add it to your Gemfile
|
25
|
-
|
26
|
-
```ruby
|
27
|
-
gem 'list_spider'
|
28
|
-
```
|
29
|
-
|
30
|
-
## Use like this
|
31
|
-
```ruby
|
32
|
-
require 'list_spider'
|
33
|
-
|
34
|
-
DOWNLOAD_DIR = 'coolshell/'.freeze
|
35
|
-
|
36
|
-
@next_list = []
|
37
|
-
|
38
|
-
def parse_index_item(e)
|
39
|
-
content = File.read(e.local_path)
|
40
|
-
doc = Nokogiri::HTML(content)
|
41
|
-
list_group = doc.css('h2.entry-title')
|
42
|
-
link_list = list_group.css('a')
|
43
|
-
|
44
|
-
link_list.each do |link|
|
45
|
-
href = link['href']
|
46
|
-
local_path = DOWNLOAD_DIR + link.content + '.html'
|
47
|
-
# or you can save them to database for later use
|
48
|
-
@next_list << TaskStruct.new(href, local_path)
|
49
|
-
end
|
50
|
-
end
|
51
|
-
|
52
|
-
task_list = []
|
53
|
-
task_list << TaskStruct.new(
|
54
|
-
'https://coolshell.cn/',
|
55
|
-
DOWNLOAD_DIR + 'index.html',
|
56
|
-
parse_method: method(:parse_index_item)
|
57
|
-
)
|
58
|
-
|
59
|
-
ListSpider.get_list(task_list)
|
60
|
-
ListSpider.get_list(@next_list, max: 60)
|
61
|
-
```
|
62
|
-
|
63
|
-
## Or in one step
|
64
|
-
```ruby
|
65
|
-
require 'list_spider'
|
66
|
-
|
67
|
-
DOWNLOAD_DIR = 'coolshell/'.freeze
|
68
|
-
|
69
|
-
def parse_index_item(e)
|
70
|
-
content = File.read(e.local_path)
|
71
|
-
doc = Nokogiri::HTML(content)
|
72
|
-
list_group = doc.css('h2.entry-title')
|
73
|
-
link_list = list_group.css('a')
|
74
|
-
|
75
|
-
link_list.each do |link|
|
76
|
-
href = link['href']
|
77
|
-
local_path = DOWNLOAD_DIR + link.content + '.html'
|
78
|
-
ListSpider.add_task(TaskStruct.new(href, local_path))
|
79
|
-
end
|
80
|
-
end
|
81
|
-
|
82
|
-
# get_one is a simple function for one taskstruct situation
|
83
|
-
ListSpider.get_one(
|
84
|
-
TaskStruct.new(
|
85
|
-
'https://coolshell.cn/',
|
86
|
-
DOWNLOAD_DIR + 'index.html',
|
87
|
-
parse_method: method(:parse_index_item)
|
88
|
-
),
|
89
|
-
max: 60
|
90
|
-
)
|
91
|
-
```
|
92
|
-
|
93
|
-
## And there are many options you can use
|
94
|
-
|
95
|
-
```ruby
|
96
|
-
def initialize(href, # 请求链接
|
97
|
-
local_path, # 保存数据的本地路径(此路径作为去重标准)
|
98
|
-
# http方法,取值::get, :head, :delete, :put, :post, :patch, :options
|
99
|
-
http_method: :get,
|
100
|
-
custom_data: nil, # 自定义数据
|
101
|
-
parse_method: nil, # 解析保存文件的回调,参数是TaskStruct对象本身
|
102
|
-
# 请求成功后的回调,此时可能没有保存文件,比如301,404
|
103
|
-
# 参数是TaskStruct对象本身和对应的EventMachine::HttpRequest对象
|
104
|
-
# http.response_header.status 状态码
|
105
|
-
# http.response_header 返回头
|
106
|
-
# http.response 返回体
|
107
|
-
callback: nil,
|
108
|
-
# 请求失败后的回调
|
109
|
-
# 参数是TaskStruct对象本身和对应的EventMachine::HttpRequest对象
|
110
|
-
errback: nil,
|
111
|
-
stream_callback: nil, # 流数据处理回调
|
112
|
-
convert_to_utf8: false, # 是否转换为utf8编码
|
113
|
-
overwrite_exist: false, # 是否覆盖现有文件
|
114
|
-
# request options
|
115
|
-
redirects: 3, # 重定向次数
|
116
|
-
keepalive: nil, # (暂不支持复用)
|
117
|
-
file: nil, # 要上传的文件路径
|
118
|
-
path: nil, # 请求路径,在流水线方式请求时有用(暂不支持)
|
119
|
-
query: nil, # 查询字符串,可以是string或hash类型
|
120
|
-
body: nil, # 请求体,可以是string或hash类型
|
121
|
-
head: nil, # 请求头
|
122
|
-
# connection options
|
123
|
-
connect_timeout: 60, # 连接超时时间
|
124
|
-
inactivity_timeout: nil, # 连接后超时时间
|
125
|
-
# ssl设置
|
126
|
-
# ssl: {
|
127
|
-
# :private_key_file => '/tmp/server.key',
|
128
|
-
# :cert_chain_file => '/tmp/server.crt',
|
129
|
-
# :verify_peer => false
|
130
|
-
# }
|
131
|
-
ssl: nil,
|
132
|
-
# bind: {
|
133
|
-
# :host => '123.123.123.123', # use a specific interface for outbound request
|
134
|
-
# :port => '123'
|
135
|
-
# }
|
136
|
-
bind: nil,
|
137
|
-
# 代理设置
|
138
|
-
# proxy: {
|
139
|
-
# :host => '127.0.0.1', # proxy address
|
140
|
-
# :port => 9000, # proxy port
|
141
|
-
# :type => :socks5 # default proxy mode is HTTP proxy, change to :socks5 if required
|
142
|
-
|
143
|
-
# :authorization => ['user', 'pass'] # proxy authorization header
|
144
|
-
# }
|
145
|
-
proxy: nil)
|
146
|
-
```
|
147
|
-
|
148
|
-
## Callback methods form
|
149
|
-
|
150
|
-
```ruby
|
151
|
-
# called when the file is saved successfully
|
152
|
-
def parse_eresponse(task_struct)
|
153
|
-
# ...
|
154
|
-
end
|
155
|
-
|
156
|
-
def call_back(task_struct, http_req)
|
157
|
-
# http_req is a EventMachine::HttpRequest object
|
158
|
-
# http_req.response_header.status
|
159
|
-
# ...
|
160
|
-
end
|
161
|
-
|
162
|
-
def err_back(task_struct, http_req)
|
163
|
-
# ...
|
164
|
-
end
|
165
|
-
```
|
166
|
-
|
167
|
-
### License
|
168
|
-
|
169
|
-
(MIT License) - Copyright (c) 2016 Charles Zhang
|
1
|
+
# list_spider
|
2
|
+
|
3
|
+
A url list spider based on em-http-request.
|
4
|
+
|
5
|
+
Many times we only need to spider by url list then parse them and spider again. This is for the purpose.
|
6
|
+
|
7
|
+
## Features
|
8
|
+
* Duplicate url filtering (based on local path, so you can custom your behavior).
|
9
|
+
|
10
|
+
* Convert to UTF-8 support.
|
11
|
+
|
12
|
+
* Increased spider support (don't spider exist).
|
13
|
+
|
14
|
+
* Customize concurrent number and interval between task.
|
15
|
+
|
16
|
+
* Http options support.
|
17
|
+
|
18
|
+
## Getting started
|
19
|
+
|
20
|
+
```ruby
|
21
|
+
gem install list_spider
|
22
|
+
```
|
23
|
+
|
24
|
+
Or add it to your Gemfile
|
25
|
+
|
26
|
+
```ruby
|
27
|
+
gem 'list_spider'
|
28
|
+
```
|
29
|
+
|
30
|
+
## Use like this
|
31
|
+
```ruby
|
32
|
+
require 'list_spider'
|
33
|
+
|
34
|
+
DOWNLOAD_DIR = 'coolshell/'.freeze
|
35
|
+
|
36
|
+
@next_list = []
|
37
|
+
|
38
|
+
def parse_index_item(e)
|
39
|
+
content = File.read(e.local_path)
|
40
|
+
doc = Nokogiri::HTML(content)
|
41
|
+
list_group = doc.css('h2.entry-title')
|
42
|
+
link_list = list_group.css('a')
|
43
|
+
|
44
|
+
link_list.each do |link|
|
45
|
+
href = link['href']
|
46
|
+
local_path = DOWNLOAD_DIR + link.content + '.html'
|
47
|
+
# or you can save them to database for later use
|
48
|
+
@next_list << TaskStruct.new(href, local_path)
|
49
|
+
end
|
50
|
+
end
|
51
|
+
|
52
|
+
task_list = []
|
53
|
+
task_list << TaskStruct.new(
|
54
|
+
'https://coolshell.cn/',
|
55
|
+
DOWNLOAD_DIR + 'index.html',
|
56
|
+
parse_method: method(:parse_index_item)
|
57
|
+
)
|
58
|
+
|
59
|
+
ListSpider.get_list(task_list)
|
60
|
+
ListSpider.get_list(@next_list, max: 60)
|
61
|
+
```
|
62
|
+
|
63
|
+
## Or in one step
|
64
|
+
```ruby
|
65
|
+
require 'list_spider'
|
66
|
+
|
67
|
+
DOWNLOAD_DIR = 'coolshell/'.freeze
|
68
|
+
|
69
|
+
def parse_index_item(e)
|
70
|
+
content = File.read(e.local_path)
|
71
|
+
doc = Nokogiri::HTML(content)
|
72
|
+
list_group = doc.css('h2.entry-title')
|
73
|
+
link_list = list_group.css('a')
|
74
|
+
|
75
|
+
link_list.each do |link|
|
76
|
+
href = link['href']
|
77
|
+
local_path = DOWNLOAD_DIR + link.content + '.html'
|
78
|
+
ListSpider.add_task(TaskStruct.new(href, local_path))
|
79
|
+
end
|
80
|
+
end
|
81
|
+
|
82
|
+
# get_one is a simple function for one taskstruct situation
|
83
|
+
ListSpider.get_one(
|
84
|
+
TaskStruct.new(
|
85
|
+
'https://coolshell.cn/',
|
86
|
+
DOWNLOAD_DIR + 'index.html',
|
87
|
+
parse_method: method(:parse_index_item)
|
88
|
+
),
|
89
|
+
max: 60
|
90
|
+
)
|
91
|
+
```
|
92
|
+
|
93
|
+
## And there are many options you can use
|
94
|
+
|
95
|
+
```ruby
|
96
|
+
def initialize(href, # 请求链接
|
97
|
+
local_path, # 保存数据的本地路径(此路径作为去重标准)
|
98
|
+
# http方法,取值::get, :head, :delete, :put, :post, :patch, :options
|
99
|
+
http_method: :get,
|
100
|
+
custom_data: nil, # 自定义数据
|
101
|
+
parse_method: nil, # 解析保存文件的回调,参数是TaskStruct对象本身
|
102
|
+
# 请求成功后的回调,此时可能没有保存文件,比如301,404
|
103
|
+
# 参数是TaskStruct对象本身和对应的EventMachine::HttpRequest对象
|
104
|
+
# http.response_header.status 状态码
|
105
|
+
# http.response_header 返回头
|
106
|
+
# http.response 返回体
|
107
|
+
callback: nil,
|
108
|
+
# 请求失败后的回调
|
109
|
+
# 参数是TaskStruct对象本身和对应的EventMachine::HttpRequest对象
|
110
|
+
errback: nil,
|
111
|
+
stream_callback: nil, # 流数据处理回调
|
112
|
+
convert_to_utf8: false, # 是否转换为utf8编码
|
113
|
+
overwrite_exist: false, # 是否覆盖现有文件
|
114
|
+
# request options
|
115
|
+
redirects: 3, # 重定向次数
|
116
|
+
keepalive: nil, # (暂不支持复用)
|
117
|
+
file: nil, # 要上传的文件路径
|
118
|
+
path: nil, # 请求路径,在流水线方式请求时有用(暂不支持)
|
119
|
+
query: nil, # 查询字符串,可以是string或hash类型
|
120
|
+
body: nil, # 请求体,可以是string或hash类型
|
121
|
+
head: nil, # 请求头
|
122
|
+
# connection options
|
123
|
+
connect_timeout: 60, # 连接超时时间
|
124
|
+
inactivity_timeout: nil, # 连接后超时时间
|
125
|
+
# ssl设置
|
126
|
+
# ssl: {
|
127
|
+
# :private_key_file => '/tmp/server.key',
|
128
|
+
# :cert_chain_file => '/tmp/server.crt',
|
129
|
+
# :verify_peer => false
|
130
|
+
# }
|
131
|
+
ssl: nil,
|
132
|
+
# bind: {
|
133
|
+
# :host => '123.123.123.123', # use a specific interface for outbound request
|
134
|
+
# :port => '123'
|
135
|
+
# }
|
136
|
+
bind: nil,
|
137
|
+
# 代理设置
|
138
|
+
# proxy: {
|
139
|
+
# :host => '127.0.0.1', # proxy address
|
140
|
+
# :port => 9000, # proxy port
|
141
|
+
# :type => :socks5 # default proxy mode is HTTP proxy, change to :socks5 if required
|
142
|
+
|
143
|
+
# :authorization => ['user', 'pass'] # proxy authorization header
|
144
|
+
# }
|
145
|
+
proxy: nil)
|
146
|
+
```
|
147
|
+
|
148
|
+
## Callback methods form
|
149
|
+
|
150
|
+
```ruby
|
151
|
+
# called when the file is saved successfully
|
152
|
+
def parse_eresponse(task_struct)
|
153
|
+
# ...
|
154
|
+
end
|
155
|
+
|
156
|
+
def call_back(task_struct, http_req)
|
157
|
+
# http_req is a EventMachine::HttpRequest object
|
158
|
+
# http_req.response_header.status
|
159
|
+
# ...
|
160
|
+
end
|
161
|
+
|
162
|
+
def err_back(task_struct, http_req)
|
163
|
+
# ...
|
164
|
+
end
|
165
|
+
```
|
166
|
+
|
167
|
+
### License
|
168
|
+
|
169
|
+
(MIT License) - Copyright (c) 2016 Charles Zhang
|