list_spider 1.0.0 → 2.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.rdoc_options +23 -0
- data/.rubocop.yml +2 -2
- data/English_README.md +169 -0
- data/Gemfile.lock +41 -0
- data/README.md +124 -129
- data/lib/file_filter.rb +4 -2
- data/lib/list_spider/version.rb +1 -1
- data/lib/list_spider.rb +152 -148
- data/lib/spider_helper.rb +7 -5
- data/list_spider.gemspec +2 -2
- data/spider_example.rb +2 -4
- data/spider_example_2.rb +2 -2
- metadata +8 -12
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 39600b837bb18841d083c7b50dbaadf82e72c3013f690129af6786efec193a39
|
4
|
+
data.tar.gz: 4128e673c551e3fcc2c1f9d4a3302407bcf7bc26829a4957d04ebc0505d5ce07
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: f900e8f76086f37239872d9b4452f5d735799100879ac16570d29c9570837adca52c3c9e37c725913920a68add7784bc2f94e2cef42663c54930ae5b3e37ec50
|
7
|
+
data.tar.gz: 90495a4dae2552c3f41e55f0efa61fef0511581eb2e13d90256e0a585c48f7fdb2af167cd8c6daa98ca80c2229d970187fbcec3db4a8edd43738f76f79c18951
|
data/.rdoc_options
ADDED
@@ -0,0 +1,23 @@
|
|
1
|
+
--- !ruby/object:RDoc::Options
|
2
|
+
encoding: UTF-8
|
3
|
+
static_path: []
|
4
|
+
rdoc_include:
|
5
|
+
- "."
|
6
|
+
- "/Users/zhangchao/github/list_spider"
|
7
|
+
charset: UTF-8
|
8
|
+
exclude:
|
9
|
+
hyperlink_all: false
|
10
|
+
line_numbers: false
|
11
|
+
locale:
|
12
|
+
locale_dir: locale
|
13
|
+
locale_name:
|
14
|
+
main_page:
|
15
|
+
markup: markdown
|
16
|
+
output_decoration: true
|
17
|
+
page_dir:
|
18
|
+
show_hash: false
|
19
|
+
tab_width: 8
|
20
|
+
template_stylesheets: []
|
21
|
+
title:
|
22
|
+
visibility: :protected
|
23
|
+
webcvs:
|
data/.rubocop.yml
CHANGED
@@ -18,9 +18,9 @@ Style/Documentation:
|
|
18
18
|
Enabled: false
|
19
19
|
Lint/AmbiguousRegexpLiteral:
|
20
20
|
Enabled: false
|
21
|
-
|
21
|
+
Layout/DefEndAlignment:
|
22
22
|
AutoCorrect: true
|
23
|
-
|
23
|
+
Layout/EndAlignment:
|
24
24
|
AutoCorrect: true
|
25
25
|
Style/BracesAroundHashParameters:
|
26
26
|
Enabled: false
|
data/English_README.md
ADDED
@@ -0,0 +1,169 @@
|
|
1
|
+
# list_spider
|
2
|
+
|
3
|
+
A URL list spider based on em-http-request.
|
4
|
+
|
5
|
+
Often a crawl only needs to fetch a list of URLs, parse the responses, extract more URLs from them, and fetch again. list_spider is built for exactly this workflow.
|
6
|
+
|
7
|
+
## Features
|
8
|
+
* Duplicate URL filtering (based on the local path, so you can customize the behavior).
|
9
|
+
|
10
|
+
* Optional conversion of downloaded content to UTF-8.
|
11
|
+
|
12
|
+
* Incremental crawling (files that already exist locally are not fetched again unless overwriting is requested).
|
13
|
+
|
14
|
+
* Configurable maximum concurrency and interval between tasks.
|
15
|
+
|
16
|
+
* Support for HTTP request and connection options.
|
17
|
+
|
18
|
+
## Getting started
|
19
|
+
|
20
|
+
```sh
|
21
|
+
gem install list_spider
|
22
|
+
```
|
23
|
+
|
24
|
+
Or add it to your Gemfile
|
25
|
+
|
26
|
+
```ruby
|
27
|
+
gem 'list_spider'
|
28
|
+
```
|
29
|
+
|
30
|
+
## Use like this
|
31
|
+
```ruby
|
32
|
+
require 'list_spider'
|
33
|
+
|
34
|
+
DOWNLOAD_DIR = 'coolshell/'.freeze
|
35
|
+
|
36
|
+
@next_list = []
|
37
|
+
|
38
|
+
def parse_index_item(e)
|
39
|
+
content = File.read(e.local_path)
|
40
|
+
doc = Nokogiri::HTML(content)
|
41
|
+
list_group = doc.css('h2.entry-title')
|
42
|
+
link_list = list_group.css('a')
|
43
|
+
|
44
|
+
link_list.each do |link|
|
45
|
+
href = link['href']
|
46
|
+
local_path = DOWNLOAD_DIR + link.content + '.html'
|
47
|
+
# or you can save them to database for later use
|
48
|
+
@next_list << TaskStruct.new(href, local_path)
|
49
|
+
end
|
50
|
+
end
|
51
|
+
|
52
|
+
task_list = []
|
53
|
+
task_list << TaskStruct.new(
|
54
|
+
'https://coolshell.cn/',
|
55
|
+
DOWNLOAD_DIR + 'index.html',
|
56
|
+
parse_method: method(:parse_index_item)
|
57
|
+
)
|
58
|
+
|
59
|
+
ListSpider.get_list(task_list)
|
60
|
+
ListSpider.get_list(@next_list, max: 60)
|
61
|
+
```
|
62
|
+
|
63
|
+
## Or in one step
|
64
|
+
```ruby
|
65
|
+
require 'list_spider'
|
66
|
+
|
67
|
+
DOWNLOAD_DIR = 'coolshell/'.freeze
|
68
|
+
|
69
|
+
def parse_index_item(e)
|
70
|
+
content = File.read(e.local_path)
|
71
|
+
doc = Nokogiri::HTML(content)
|
72
|
+
list_group = doc.css('h2.entry-title')
|
73
|
+
link_list = list_group.css('a')
|
74
|
+
|
75
|
+
link_list.each do |link|
|
76
|
+
href = link['href']
|
77
|
+
local_path = DOWNLOAD_DIR + link.content + '.html'
|
78
|
+
ListSpider.add_task(TaskStruct.new(href, local_path))
|
79
|
+
end
|
80
|
+
end
|
81
|
+
|
82
|
+
# get_one is a convenience wrapper around get_list for a single TaskStruct
|
83
|
+
ListSpider.get_one(
|
84
|
+
TaskStruct.new(
|
85
|
+
'https://coolshell.cn/',
|
86
|
+
DOWNLOAD_DIR + 'index.html',
|
87
|
+
parse_method: method(:parse_index_item)
|
88
|
+
),
|
89
|
+
max: 60
|
90
|
+
)
|
91
|
+
```
|
92
|
+
|
93
|
+
## And there are many options you can use
|
94
|
+
|
95
|
+
```ruby
|
96
|
+
def initialize(href, # 请求链接
|
97
|
+
local_path, # 保存数据的本地路径(此路径作为去重标准)
|
98
|
+
# http方法,取值::get, :head, :delete, :put, :post, :patch, :options
|
99
|
+
http_method: :get,
|
100
|
+
custom_data: nil, # 自定义数据
|
101
|
+
parse_method: nil, # 解析保存文件的回调,参数是TaskStruct对象本身
|
102
|
+
# 请求成功后的回调,此时可能没有保存文件,比如301,404
|
103
|
+
# 参数是TaskStruct对象本身和对应的EventMachine::HttpRequest对象
|
104
|
+
# http.response_header.status 状态码
|
105
|
+
# http.response_header 返回头
|
106
|
+
# http.response 返回体
|
107
|
+
callback: nil,
|
108
|
+
# 请求失败后的回调
|
109
|
+
# 参数是TaskStruct对象本身和对应的EventMachine::HttpRequest对象
|
110
|
+
errback: nil,
|
111
|
+
stream_callback: nil, # 流数据处理回调
|
112
|
+
convert_to_utf8: false, # 是否转换为utf8编码
|
113
|
+
overwrite_exist: false, # 是否覆盖现有文件
|
114
|
+
# request options
|
115
|
+
redirects: 3, # 重定向次数
|
116
|
+
keepalive: nil, # (暂不支持复用)
|
117
|
+
file: nil, # 要上传的文件路径
|
118
|
+
path: nil, # 请求路径,在流水线方式请求时有用(暂不支持)
|
119
|
+
query: nil, # 查询字符串,可以是string或hash类型
|
120
|
+
body: nil, # 请求体,可以是string或hash类型
|
121
|
+
head: nil, # 请求头
|
122
|
+
# connection options
|
123
|
+
connect_timeout: 60, # 连接超时时间
|
124
|
+
inactivity_timeout: nil, # 连接后超时时间
|
125
|
+
# ssl设置
|
126
|
+
# ssl: {
|
127
|
+
# :private_key_file => '/tmp/server.key',
|
128
|
+
# :cert_chain_file => '/tmp/server.crt',
|
129
|
+
# :verify_peer => false
|
130
|
+
# }
|
131
|
+
ssl: nil,
|
132
|
+
# bind: {
|
133
|
+
# :host => '123.123.123.123', # use a specific interface for outbound request
|
134
|
+
# :port => '123'
|
135
|
+
# }
|
136
|
+
bind: nil,
|
137
|
+
# 代理设置
|
138
|
+
# proxy: {
|
139
|
+
# :host => '127.0.0.1', # proxy address
|
140
|
+
# :port => 9000, # proxy port
|
141
|
+
# :type => :socks5 # default proxy mode is HTTP proxy, change to :socks5 if required
|
142
|
+
|
143
|
+
# :authorization => ['user', 'pass'] # proxy authorization header
|
144
|
+
# }
|
145
|
+
proxy: nil)
|
146
|
+
```
|
147
|
+
|
148
|
+
## Callback method signatures
|
149
|
+
|
150
|
+
```ruby
|
151
|
+
# called when the file is saved successfully
|
152
|
+
def parse_eresponse(task_struct)
|
153
|
+
# ...
|
154
|
+
end
|
155
|
+
|
156
|
+
def call_back(task_struct, http_req)
|
157
|
+
# http_req is an EventMachine::HttpRequest object
|
158
|
+
# http_req.response_header.status
|
159
|
+
# ...
|
160
|
+
end
|
161
|
+
|
162
|
+
def err_back(task_struct, http_req)
|
163
|
+
# ...
|
164
|
+
end
|
165
|
+
```
|
166
|
+
|
167
|
+
## License
|
168
|
+
|
169
|
+
(MIT License) - Copyright (c) 2016 Charles Zhang
|
data/Gemfile.lock
ADDED
@@ -0,0 +1,41 @@
|
|
1
|
+
PATH
|
2
|
+
remote: .
|
3
|
+
specs:
|
4
|
+
list_spider (2.0.2)
|
5
|
+
em-http-request (~> 1.1, >= 1.1.3)
|
6
|
+
nokogiri (~> 1.6, >= 1.6.7)
|
7
|
+
rchardet (~> 1.6, >= 1.6.1)
|
8
|
+
|
9
|
+
GEM
|
10
|
+
remote: https://rubygems.org/
|
11
|
+
specs:
|
12
|
+
addressable (2.5.2)
|
13
|
+
public_suffix (>= 2.0.2, < 4.0)
|
14
|
+
cookiejar (0.3.3)
|
15
|
+
em-http-request (1.1.5)
|
16
|
+
addressable (>= 2.3.4)
|
17
|
+
cookiejar (!= 0.3.1)
|
18
|
+
em-socksify (>= 0.3)
|
19
|
+
eventmachine (>= 1.0.3)
|
20
|
+
http_parser.rb (>= 0.6.0)
|
21
|
+
em-socksify (0.3.2)
|
22
|
+
eventmachine (>= 1.0.0.beta.4)
|
23
|
+
eventmachine (1.2.5)
|
24
|
+
http_parser.rb (0.6.0)
|
25
|
+
mini_portile2 (2.3.0)
|
26
|
+
nokogiri (1.8.2)
|
27
|
+
mini_portile2 (~> 2.3.0)
|
28
|
+
public_suffix (3.0.2)
|
29
|
+
rake (10.5.0)
|
30
|
+
rchardet (1.7.0)
|
31
|
+
|
32
|
+
PLATFORMS
|
33
|
+
ruby
|
34
|
+
|
35
|
+
DEPENDENCIES
|
36
|
+
bundler (~> 1.16)
|
37
|
+
list_spider!
|
38
|
+
rake (~> 10.0)
|
39
|
+
|
40
|
+
BUNDLED WITH
|
41
|
+
1.16.1
|
data/README.md
CHANGED
@@ -1,186 +1,181 @@
|
|
1
|
-
# list_spider
|
1
|
+
# 关于list_spider
|
2
2
|
|
3
|
-
|
3
|
+
list_spider是一个基于[em-http-request](https://github.com/igrigorik/em-http-request)的爬虫工具。
|
4
4
|
|
5
|
-
|
5
|
+
许多情况下,爬虫的工作是爬取链接,解析返回数据,从中提取链接,继续爬取,list_spider就是适用这种场景的爬虫工具。
|
6
6
|
|
7
|
-
##
|
8
|
-
*
|
7
|
+
## 功能特点
|
8
|
+
* 去重过滤 (使用本地文件路径做唯一性校验)。
|
9
9
|
|
10
|
-
*
|
10
|
+
* 支持UTF-8编码转换。
|
11
11
|
|
12
|
-
*
|
12
|
+
* 默认增量爬取,已爬取的不再重复爬取(可以通过选项强制重新获取)。
|
13
13
|
|
14
|
-
*
|
14
|
+
* 自由设置最大并发数和爬取任务间隔时间。
|
15
15
|
|
16
|
-
*
|
16
|
+
* 支持http所有选项设置。
|
17
17
|
|
18
|
-
##
|
18
|
+
## 开始
|
19
19
|
|
20
|
-
|
20
|
+
```sh
|
21
|
+
gem install list_spider
|
22
|
+
```
|
23
|
+
|
24
|
+
或者添加到Gemfile
|
25
|
+
|
26
|
+
```ruby
|
27
|
+
gem 'list_spider'
|
28
|
+
```
|
21
29
|
|
22
|
-
##
|
30
|
+
## 使用方法
|
23
31
|
```ruby
|
24
32
|
require 'list_spider'
|
25
33
|
|
26
|
-
DOWNLOAD_DIR = 'coolshell/'
|
34
|
+
DOWNLOAD_DIR = 'coolshell/'.freeze
|
27
35
|
|
28
|
-
|
36
|
+
@next_list = []
|
29
37
|
|
30
|
-
def parse_index_item(
|
31
|
-
content = File.read(
|
38
|
+
def parse_index_item(e)
|
39
|
+
content = File.read(e.local_path)
|
32
40
|
doc = Nokogiri::HTML(content)
|
33
|
-
list_group = doc.css(
|
34
|
-
link_list = list_group.css(
|
41
|
+
list_group = doc.css('h2.entry-title')
|
42
|
+
link_list = list_group.css('a')
|
35
43
|
|
36
44
|
link_list.each do |link|
|
37
45
|
href = link['href']
|
38
|
-
local_path = DOWNLOAD_DIR + link.content +
|
39
|
-
#
|
40
|
-
|
46
|
+
local_path = DOWNLOAD_DIR + link.content + '.html'
|
47
|
+
# 可以存入数据库后续处理
|
48
|
+
@next_list << TaskStruct.new(href, local_path)
|
41
49
|
end
|
42
50
|
end
|
43
51
|
|
44
52
|
task_list = []
|
45
|
-
task_list << TaskStruct.new(
|
53
|
+
task_list << TaskStruct.new(
|
54
|
+
'https://coolshell.cn/',
|
55
|
+
DOWNLOAD_DIR + 'index.html',
|
56
|
+
parse_method: method(:parse_index_item)
|
57
|
+
)
|
46
58
|
|
47
59
|
ListSpider.get_list(task_list)
|
48
|
-
ListSpider.get_list(
|
49
|
-
|
60
|
+
ListSpider.get_list(@next_list, max: 60)
|
50
61
|
```
|
51
62
|
|
52
|
-
##
|
63
|
+
## 或者使用更简单的一步完成
|
53
64
|
```ruby
|
54
65
|
require 'list_spider'
|
55
66
|
|
56
|
-
DOWNLOAD_DIR = 'coolshell/'
|
67
|
+
DOWNLOAD_DIR = 'coolshell/'.freeze
|
57
68
|
|
58
|
-
def parse_index_item(
|
59
|
-
|
60
|
-
content = File.read(file_name)
|
69
|
+
def parse_index_item(e)
|
70
|
+
content = File.read(e.local_path)
|
61
71
|
doc = Nokogiri::HTML(content)
|
62
|
-
list_group = doc.css(
|
63
|
-
link_list = list_group.css(
|
72
|
+
list_group = doc.css('h2.entry-title')
|
73
|
+
link_list = list_group.css('a')
|
64
74
|
|
65
75
|
link_list.each do |link|
|
66
76
|
href = link['href']
|
67
|
-
local_path = DOWNLOAD_DIR + link.content +
|
77
|
+
local_path = DOWNLOAD_DIR + link.content + '.html'
|
68
78
|
ListSpider.add_task(TaskStruct.new(href, local_path))
|
69
79
|
end
|
70
80
|
end
|
71
81
|
|
72
|
-
#get_one
|
73
|
-
ListSpider.get_one(
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
82
|
+
# get_one是封装了get_list的简化形式,方便一个任务时调用
|
83
|
+
ListSpider.get_one(
|
84
|
+
TaskStruct.new(
|
85
|
+
'https://coolshell.cn/',
|
86
|
+
DOWNLOAD_DIR + 'index.html',
|
87
|
+
parse_method: method(:parse_index_item)
|
88
|
+
),
|
89
|
+
max: 60
|
90
|
+
)
|
79
91
|
```
|
80
92
|
|
81
|
-
##
|
82
|
-
|
83
|
-
```ruby
|
84
|
-
def parse_response(file_name)
|
85
|
-
#...
|
86
|
-
end
|
87
|
-
|
88
|
-
|
89
|
-
# extra_data is passed by TaskStruct's extra_data param
|
90
|
-
|
91
|
-
def parse_response(file_name, extra_data)
|
92
|
-
#...
|
93
|
-
end
|
94
|
-
|
95
|
-
|
96
|
-
# response_header is a EventMachine::HttpResponseHeader object
|
97
|
-
# you can use it like this:
|
98
|
-
# response_header.status
|
99
|
-
# response_header.cookie
|
100
|
-
# response_header['Last-Modified']
|
101
|
-
|
102
|
-
def parse_response(file_name, extra_data, response_header)
|
103
|
-
response_header.status
|
104
|
-
response_header['Last-Modified']
|
105
|
-
|
106
|
-
#...
|
107
|
-
end
|
108
|
-
|
109
|
-
# req is a EventMachine::HttpClientOptions object
|
110
|
-
# you can use it like this:
|
111
|
-
# req.body
|
112
|
-
# req.headers
|
113
|
-
# req.uri
|
114
|
-
# req.host
|
115
|
-
# req.port
|
116
|
-
def parse_response(file_name, extra_data, response_header, req)
|
117
|
-
puts req.body
|
118
|
-
puts req.headers
|
119
|
-
puts req.uri
|
120
|
-
puts req.host
|
121
|
-
puts req.port
|
122
|
-
|
123
|
-
#...
|
124
|
-
end
|
125
|
-
|
126
|
-
```
|
127
|
-
|
128
|
-
## And there are many options you can use
|
129
|
-
|
130
|
-
```ruby
|
131
|
-
TaskStruct.new(href, local_path, http_method: :get, params: {}, extra_data: nil, parse_method: nil, header: nil)
|
93
|
+
## get_list/get_one参数
|
132
94
|
```
|
95
|
+
# down_list: 要请求的TaskStruct数组
|
96
|
+
# interval: 任务间隔,默认为0。若参数为Range对象,则随机间隔Range范围内的秒数。若设为RANDOM_TIME则随机间隔3到10秒。
|
97
|
+
# max: 最大并发数,默认为50。若设为NO_LIMIT_CONCURRENT,则所有请求任务全部一起并发执行
|
133
98
|
|
134
|
-
|
135
|
-
|
136
|
-
ListSpider.get_list(down_list, interval: 0, max: ListSpider::NO_LIMIT_CONCURRENT)
|
137
|
-
|
138
|
-
#sleep random time, often used in site which limit spider
|
139
|
-
ListSpider.get_list(down_list, interval: ListSpider::RANDOM_TIME, max: 1)
|
140
|
-
|
141
|
-
#set random time range
|
142
|
-
ListSpider.get_list(down_list, interval: (1..10), max: 1)
|
143
|
-
|
99
|
+
get_list(down_list, interval: DEFAULT_INTERVAL, max: DEFAULT_CONCURRNET_MAX)
|
100
|
+
get_one(task, interval: DEFAULT_INTERVAL, max: DEFAULT_CONCURRNET_MAX)
|
144
101
|
```
|
145
102
|
|
146
|
-
|
103
|
+
## 下面是TaskStruct可以设置的选项,与[em-http-request](https://github.com/igrigorik/em-http-request)基本一致
|
147
104
|
|
148
105
|
```ruby
|
149
|
-
#
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
#
|
156
|
-
|
157
|
-
|
158
|
-
#
|
159
|
-
|
160
|
-
|
161
|
-
#
|
162
|
-
|
163
|
-
|
164
|
-
#
|
165
|
-
|
166
|
-
|
106
|
+
new(href, # 请求链接
|
107
|
+
local_path, # 保存数据的本地路径(此路径作为去重标准)
|
108
|
+
# http方法,取值::get, :head, :delete, :put, :post, :patch, :options
|
109
|
+
http_method: :get,
|
110
|
+
custom_data: nil, # 自定义数据
|
111
|
+
parse_method: nil, # 解析保存文件的回调,参数是TaskStruct对象本身
|
112
|
+
# 请求成功后的回调,此时可能没有保存文件,比如301,404
|
113
|
+
# 参数是TaskStruct对象本身和对应的EventMachine::HttpRequest对象
|
114
|
+
# http_req.response_header.status 状态码
|
115
|
+
# http_req.response_header 返回头
|
116
|
+
# http_req.response 返回体
|
117
|
+
callback: nil,
|
118
|
+
# 请求失败后的回调
|
119
|
+
# 参数是TaskStruct对象本身和对应的EventMachine::HttpRequest对象
|
120
|
+
errback: nil,
|
121
|
+
stream_callback: nil, # 流数据处理回调
|
122
|
+
convert_to_utf8: false, # 是否转换为utf8编码
|
123
|
+
overwrite_exist: false, # 是否覆盖现有文件
|
124
|
+
# 请求设置
|
125
|
+
redirects: 3, # 重定向次数
|
126
|
+
keepalive: nil, # (暂不支持复用)
|
127
|
+
file: nil, # 要上传的文件路径
|
128
|
+
path: nil, # 请求路径,在流水线方式请求时有用(暂不支持)
|
129
|
+
query: nil, # 查询字符串,可以是string或hash类型
|
130
|
+
body: nil, # 请求体,可以是string或hash类型
|
131
|
+
head: nil, # 请求头
|
132
|
+
# 连接设置
|
133
|
+
connect_timeout: 60, # 连接超时时间
|
134
|
+
inactivity_timeout: nil, # 连接后超时时间
|
135
|
+
# ssl设置
|
136
|
+
# ssl: {
|
137
|
+
# :private_key_file => '/tmp/server.key',
|
138
|
+
# :cert_chain_file => '/tmp/server.crt',
|
139
|
+
# :verify_peer => false
|
140
|
+
# }
|
141
|
+
ssl: nil,
|
142
|
+
# bind: {
|
143
|
+
# :host => '123.123.123.123', # use a specific interface for outbound request
|
144
|
+
# :port => '123'
|
145
|
+
# }
|
146
|
+
bind: nil,
|
147
|
+
# 代理设置
|
148
|
+
# proxy: {
|
149
|
+
# :host => '127.0.0.1', # proxy address
|
150
|
+
# :port => 9000, # proxy port
|
151
|
+
# :type => :socks5 # default proxy mode is HTTP proxy, change to :socks5 if required
|
152
|
+
|
153
|
+
# :authorization => ['user', 'pass'] # proxy authorization header
|
154
|
+
# }
|
155
|
+
proxy: nil)
|
167
156
|
```
|
168
157
|
|
169
|
-
##
|
158
|
+
## 回调函数形式
|
170
159
|
|
171
160
|
```ruby
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
FileFilter.check_save_result(CustomConfig::DIR + '*', size_threshold: 300)
|
161
|
+
# 文件成功保存后调用,通过parse_method参数传入
|
162
|
+
def parse_eresponse(task_struct)
|
163
|
+
# ...
|
164
|
+
end
|
177
165
|
|
178
|
-
#
|
179
|
-
|
166
|
+
# http请求成功后调用,通过callback参数传入
|
167
|
+
def call_back(task_struct, http_req)
|
168
|
+
# http_req 是EventMachine::HttpRequest对象
|
169
|
+
# http_req.response_header.status
|
170
|
+
# ...
|
171
|
+
end
|
180
172
|
|
181
|
-
|
173
|
+
# http请求出错后调用,通过errback参数传入
|
174
|
+
def err_back(task_struct, http_req)
|
175
|
+
# ...
|
176
|
+
end
|
182
177
|
```
|
183
178
|
|
184
|
-
|
179
|
+
## License
|
185
180
|
|
186
181
|
(MIT License) - Copyright (c) 2016 Charles Zhang
|
data/lib/file_filter.rb
CHANGED
@@ -2,7 +2,8 @@
|
|
2
2
|
class FileFilter
|
3
3
|
# 4033
|
4
4
|
# 920
|
5
|
-
def initialize(dir_pattern, size_threshold: 1000,
|
5
|
+
def initialize(dir_pattern, size_threshold: 1000,
|
6
|
+
cust_judge: nil, process_block: nil)
|
6
7
|
@dir_pattern = dir_pattern
|
7
8
|
@size_threshold = size_threshold
|
8
9
|
@cust_judge = cust_judge ? cust_judge : method(:default_judge)
|
@@ -53,7 +54,8 @@ class FileFilter
|
|
53
54
|
).start
|
54
55
|
end
|
55
56
|
|
56
|
-
def self.check_save_result(dir_pattern, save_file_name: 'filtered_file.txt',
|
57
|
+
def self.check_save_result(dir_pattern, save_file_name: 'filtered_file.txt',
|
58
|
+
size_threshold: 1000, cust_judge: nil)
|
57
59
|
result_file = File.open(save_file_name, 'wt')
|
58
60
|
FileFilter.new(
|
59
61
|
dir_pattern,
|
data/lib/list_spider/version.rb
CHANGED
data/lib/list_spider.rb
CHANGED
@@ -4,26 +4,108 @@ require 'nokogiri'
|
|
4
4
|
require 'fileutils'
|
5
5
|
require 'set'
|
6
6
|
require 'addressable/uri'
|
7
|
-
require File.expand_path('
|
8
|
-
require File.expand_path('
|
7
|
+
require File.expand_path('spider_helper', __dir__)
|
8
|
+
require File.expand_path('file_filter', __dir__)
|
9
9
|
|
10
|
+
# 爬取任务类
|
10
11
|
class TaskStruct
|
11
|
-
|
12
|
+
# * href 请求链接
|
13
|
+
# * local_path 保存数据的本地路径(此路径作为去重标准)
|
14
|
+
# * http_method http方法,取值::get, :head, :delete, :put, :post, :patch, :options
|
15
|
+
# * custom_data 自定义数据
|
16
|
+
# * parse_method 解析保存文件的回调,参数是TaskStruct对象本身
|
17
|
+
def initialize(href, # 请求链接
|
18
|
+
local_path, # 保存数据的本地路径(此路径作为去重标准)
|
19
|
+
# http方法,取值::get, :head, :delete, :put, :post, :patch, :options
|
20
|
+
http_method: :get,
|
21
|
+
custom_data: nil, # 自定义数据
|
22
|
+
parse_method: nil, # 解析保存文件的回调,参数是TaskStruct对象本身
|
23
|
+
# 请求成功后的回调,此时可能没有保存文件,比如301,404
|
24
|
+
# 参数是TaskStruct对象本身和对应的EventMachine::HttpRequest对象
|
25
|
+
# http_req.response_header.status 状态码
|
26
|
+
# http_req.response_header 返回头
|
27
|
+
# http_req.response 返回体
|
28
|
+
callback: nil,
|
29
|
+
# 请求失败后的回调
|
30
|
+
# 参数是TaskStruct对象本身和对应的EventMachine::HttpRequest对象
|
31
|
+
errback: nil,
|
32
|
+
stream_callback: nil, # 流数据处理回调
|
33
|
+
convert_to_utf8: false, # 是否转换为utf8编码
|
34
|
+
overwrite_exist: false, # 是否覆盖现有文件
|
35
|
+
# 请求设置
|
36
|
+
redirects: 3, # 重定向次数
|
37
|
+
keepalive: nil, # (暂不支持复用)
|
38
|
+
file: nil, # 要上传的文件路径
|
39
|
+
path: nil, # 请求路径,在流水线方式请求时有用(暂不支持)
|
40
|
+
query: nil, # 查询字符串,可以是string或hash类型
|
41
|
+
body: nil, # 请求体,可以是string或hash类型
|
42
|
+
head: nil, # 请求头
|
43
|
+
# 连接设置
|
44
|
+
connect_timeout: 60, # 连接超时时间
|
45
|
+
inactivity_timeout: nil, # 连接后超时时间
|
46
|
+
# ssl设置
|
47
|
+
# ssl: {
|
48
|
+
# :private_key_file => '/tmp/server.key',
|
49
|
+
# :cert_chain_file => '/tmp/server.crt',
|
50
|
+
# :verify_peer => false
|
51
|
+
# }
|
52
|
+
ssl: nil,
|
53
|
+
# bind: {
|
54
|
+
# :host => '123.123.123.123', # use a specific interface for outbound request
|
55
|
+
# :port => '123'
|
56
|
+
# }
|
57
|
+
bind: nil,
|
58
|
+
# 代理设置
|
59
|
+
# proxy: {
|
60
|
+
# :host => '127.0.0.1', # proxy address
|
61
|
+
# :port => 9000, # proxy port
|
62
|
+
# :type => :socks5 # default proxy mode is HTTP proxy, change to :socks5 if required
|
63
|
+
|
64
|
+
# :authorization => ['user', 'pass'] # proxy authorization header
|
65
|
+
# }
|
66
|
+
proxy: nil)
|
12
67
|
@href = href
|
13
|
-
@href = SpiderHelper.string_to_uri(@href) if @href.class == ''.class
|
14
68
|
@local_path = local_path
|
15
69
|
@http_method = http_method
|
16
|
-
@
|
17
|
-
@extra_data = extra_data
|
70
|
+
@custom_data = custom_data
|
18
71
|
@parse_method = parse_method
|
19
|
-
@
|
72
|
+
@callback = callback
|
73
|
+
@errback = errback
|
74
|
+
@stream_callback = stream_callback
|
75
|
+
@convert_to_utf8 = convert_to_utf8
|
76
|
+
@overwrite_exist = overwrite_exist
|
77
|
+
|
78
|
+
@request_options = {
|
79
|
+
redirects: redirects,
|
80
|
+
keepalive: keepalive,
|
81
|
+
file: file,
|
82
|
+
path: path,
|
83
|
+
query: query,
|
84
|
+
body: body,
|
85
|
+
head: head
|
86
|
+
}.compact
|
87
|
+
|
88
|
+
@connection_options = {
|
89
|
+
connect_timeout: connect_timeout,
|
90
|
+
inactivity_timeout: inactivity_timeout,
|
91
|
+
ssl: ssl,
|
92
|
+
bind: bind,
|
93
|
+
proxy: proxy
|
94
|
+
}.compact
|
20
95
|
end
|
21
96
|
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
97
|
+
attr_accessor :href, :local_path,
|
98
|
+
:http_method,
|
99
|
+
:custom_data,
|
100
|
+
:request_object,
|
101
|
+
:parse_method,
|
102
|
+
:callback,
|
103
|
+
:errback,
|
104
|
+
:stream_callback,
|
105
|
+
:convert_to_utf8,
|
106
|
+
:overwrite_exist,
|
107
|
+
:request_options,
|
108
|
+
:connection_options
|
27
109
|
end
|
28
110
|
|
29
111
|
module ListSpider
|
@@ -33,33 +115,44 @@ module ListSpider
|
|
33
115
|
DEFAULT_INTERVAL = 0
|
34
116
|
|
35
117
|
@random_time_range = 3..10
|
36
|
-
@convert_to_utf8 = false
|
37
|
-
@connection_opts = { connect_timeout: 60 }
|
38
|
-
@overwrite_exist = false
|
39
|
-
@max_redirects = 10
|
40
118
|
@local_path_set = Set.new
|
41
119
|
|
42
120
|
class << self
|
43
|
-
|
121
|
+
def get_list(down_list, interval: DEFAULT_INTERVAL, max: DEFAULT_CONCURRNET_MAX)
|
122
|
+
if interval.is_a? Range
|
123
|
+
@random_time_range = interval
|
124
|
+
interval = RANDOM_TIME
|
125
|
+
end
|
126
|
+
|
127
|
+
@down_list = filter_list(down_list)
|
128
|
+
@interval = interval
|
129
|
+
@max = max
|
130
|
+
@max = @down_list.size if @max == NO_LIMIT_CONCURRENT
|
131
|
+
@succeed_size = 0
|
132
|
+
@failed_size = 0
|
44
133
|
|
45
|
-
|
46
|
-
|
47
|
-
proxy: {
|
48
|
-
host: proxy_addr,
|
49
|
-
port: proxy_port
|
50
|
-
}
|
51
|
-
}
|
52
|
-
@connection_opts[:proxy][:authorization] = [username, password] if username && password
|
134
|
+
puts "total size:#{@down_list.size}"
|
135
|
+
event_machine_start_list(next_task, method(:complete))
|
53
136
|
end
|
54
137
|
|
55
|
-
def
|
56
|
-
|
138
|
+
def get_one(task, interval: DEFAULT_INTERVAL, max: DEFAULT_CONCURRNET_MAX)
|
139
|
+
get_list([task], interval: interval, max: max)
|
57
140
|
end
|
58
141
|
|
59
|
-
def
|
60
|
-
|
142
|
+
def add_task(task)
|
143
|
+
if task.is_a? Array
|
144
|
+
need_down_list = filter_list(task)
|
145
|
+
@down_list += need_down_list
|
146
|
+
elsif task.is_a?TaskStruct
|
147
|
+
need_down_list = filter_list([task])
|
148
|
+
@down_list += need_down_list
|
149
|
+
else
|
150
|
+
puts "error task type:#{task.class}"
|
151
|
+
end
|
61
152
|
end
|
62
153
|
|
154
|
+
private
|
155
|
+
|
63
156
|
def event_machine_down(link_struct_list, callback = nil)
|
64
157
|
failed_list = []
|
65
158
|
succeed_list = []
|
@@ -67,78 +160,47 @@ module ListSpider
|
|
67
160
|
begin_time = Time.now
|
68
161
|
|
69
162
|
for_each_proc =
|
70
|
-
proc do |
|
71
|
-
|
72
|
-
if
|
73
|
-
|
74
|
-
elsif defined? @header_option
|
75
|
-
opt[:head] = @header_option
|
76
|
-
end
|
163
|
+
proc do |task_struct|
|
164
|
+
http_req = EventMachine::HttpRequest.new(task_struct.href, task_struct.connection_options).public_send(task_struct.http_method, task_struct.request_options)
|
165
|
+
http_req.stream { |chunk| stream_callback.call(chunk) } if task_struct.stream_callback
|
166
|
+
task_struct.request_object = http_req
|
77
167
|
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
if @connection_opts
|
82
|
-
EventMachine::HttpRequest.new(e.href, @connection_opts).post opt
|
83
|
-
else
|
84
|
-
EventMachine::HttpRequest.new(e.href).post opt
|
85
|
-
end
|
86
|
-
else
|
87
|
-
if @connection_opts
|
88
|
-
opt[:query] = e.params unless e.params.empty?
|
89
|
-
w = EventMachine::HttpRequest.new(e.href, @connection_opts).get opt
|
90
|
-
else
|
91
|
-
w = EventMachine::HttpRequest.new(e.href).get opt
|
92
|
-
end
|
93
|
-
end
|
94
|
-
|
95
|
-
e.request_object = w
|
168
|
+
http_req.callback do
|
169
|
+
s = http_req.response_header.status
|
170
|
+
puts "#{Time.now}, http status code: #{s}"
|
96
171
|
|
97
|
-
|
98
|
-
|
99
|
-
puts s
|
100
|
-
if s != 404
|
101
|
-
local_dir = File.dirname(e.local_path)
|
172
|
+
if s == 200
|
173
|
+
local_dir = File.dirname(task_struct.local_path)
|
102
174
|
FileUtils.mkdir_p(local_dir) unless Dir.exist?(local_dir)
|
103
175
|
begin
|
104
|
-
File.open(
|
176
|
+
File.open(task_struct.local_path, 'wb') do |f|
|
105
177
|
f << if @convert_to_utf8 == true
|
106
|
-
SpiderHelper.to_utf8(
|
178
|
+
SpiderHelper.to_utf8(http_req.response)
|
107
179
|
else
|
108
|
-
|
180
|
+
http_req.response
|
109
181
|
end
|
110
182
|
end
|
111
|
-
|
112
|
-
|
113
|
-
|
183
|
+
call_parse_method(task_struct)
|
184
|
+
succeed_list << task_struct
|
185
|
+
rescue StandardError => exception
|
186
|
+
puts exception
|
114
187
|
end
|
115
188
|
end
|
189
|
+
task_struct.callback.call(task_struct, http_req) if task_struct.callback
|
116
190
|
end
|
117
|
-
w.errback do
|
118
|
-
puts "errback:#{w.response_header},retry..."
|
119
|
-
puts e.href
|
120
|
-
puts w.response_header.status
|
121
191
|
|
122
|
-
|
123
|
-
|
124
|
-
ret = SpiderHelper.direct_http_get(e.href, e.local_path, convert_to_utf8: @convert_to_utf8)
|
125
|
-
elsif e.http_method == :post
|
126
|
-
ret = SpiderHelper.direct_http_post(e.href, e.local_path, e.params, convert_to_utf8: @convert_to_utf8)
|
127
|
-
end
|
192
|
+
http_req.errback do
|
193
|
+
puts "#{Time.now}, #{task_struct.href}, error: #{http_req.error}"
|
128
194
|
|
129
|
-
if
|
130
|
-
succeed_list << e
|
131
|
-
else
|
132
|
-
failed_list << e
|
133
|
-
end
|
195
|
+
task_struct.errback.call(task_struct, http_req) if task_struct.errback
|
134
196
|
end
|
135
197
|
|
136
198
|
begin
|
137
|
-
multi.add
|
199
|
+
multi.add task_struct.local_path, http_req
|
138
200
|
rescue StandardError => exception
|
139
201
|
puts exception
|
140
|
-
puts
|
141
|
-
puts
|
202
|
+
puts task_struct.href
|
203
|
+
puts task_struct.local_path
|
142
204
|
stop_machine
|
143
205
|
end
|
144
206
|
end
|
@@ -170,38 +232,15 @@ module ListSpider
|
|
170
232
|
@down_list.shift(@max)
|
171
233
|
end
|
172
234
|
|
173
|
-
def call_parse_method(
|
174
|
-
|
175
|
-
if pm
|
176
|
-
case pm.arity
|
177
|
-
when 1
|
178
|
-
pm.call(e.local_path)
|
179
|
-
when 2
|
180
|
-
pm.call(e.local_path, e.extra_data)
|
181
|
-
when 3
|
182
|
-
res_header = nil
|
183
|
-
res_header = e.request_object.response_header if e.request_object
|
184
|
-
pm.call(e.local_path, e.extra_data, res_header)
|
185
|
-
when 4
|
186
|
-
res_header = nil
|
187
|
-
res_header = e.request_object.response_header if e.request_object
|
188
|
-
|
189
|
-
req = nil
|
190
|
-
req = e.request_object.req if e.request_object
|
191
|
-
|
192
|
-
pm.call(e.local_path, e.extra_data, res_header, req)
|
193
|
-
else
|
194
|
-
puts "Error! The number of arguments is:#{pm.arity}. While expected number is 1, 2, 3, 4"
|
195
|
-
end
|
196
|
-
end
|
235
|
+
def call_parse_method(task_struct)
|
236
|
+
task_struct.parse_method.call(task_struct) if task_struct.parse_method
|
197
237
|
end
|
198
238
|
|
199
239
|
def complete(_multi, success_list, failed_list)
|
200
240
|
@succeed_size += success_list.size
|
201
241
|
@failed_size += failed_list.size
|
202
|
-
success_list
|
203
|
-
|
204
|
-
end
|
242
|
+
@succeed_list.concat(success_list)
|
243
|
+
@failed_list.concat(failed_list)
|
205
244
|
|
206
245
|
todo = next_task
|
207
246
|
|
@@ -223,6 +262,8 @@ module ListSpider
|
|
223
262
|
|
224
263
|
def event_machine_start_list(down_list, callback = nil)
|
225
264
|
EventMachine.run do
|
265
|
+
@succeed_list = []
|
266
|
+
@failed_list = []
|
226
267
|
@begin_time = Time.now
|
227
268
|
if down_list.empty?
|
228
269
|
if callback
|
@@ -239,7 +280,7 @@ module ListSpider
|
|
239
280
|
def filter_list(down_list)
|
240
281
|
need_down_list = []
|
241
282
|
down_list.each do |ts|
|
242
|
-
if
|
283
|
+
if !ts.overwrite_exist && File.exist?(ts.local_path)
|
243
284
|
call_parse_method(ts)
|
244
285
|
elsif @local_path_set.add?(ts.local_path)
|
245
286
|
need_down_list << ts
|
@@ -247,43 +288,6 @@ module ListSpider
|
|
247
288
|
end
|
248
289
|
need_down_list
|
249
290
|
end
|
250
|
-
|
251
|
-
def get_list(down_list, interval: DEFAULT_INTERVAL, max: DEFAULT_CONCURRNET_MAX)
|
252
|
-
if interval.is_a? Range
|
253
|
-
@random_time_range = interval
|
254
|
-
interval = RANDOM_TIME
|
255
|
-
end
|
256
|
-
|
257
|
-
@down_list = []
|
258
|
-
|
259
|
-
need_down_list = filter_list(down_list)
|
260
|
-
|
261
|
-
@down_list += need_down_list
|
262
|
-
@interval = interval
|
263
|
-
@max = max
|
264
|
-
@max = @down_list.size if @max == NO_LIMIT_CONCURRENT
|
265
|
-
@succeed_size = 0
|
266
|
-
@failed_size = 0
|
267
|
-
|
268
|
-
puts "total size:#{@down_list.size}"
|
269
|
-
event_machine_start_list(next_task, method(:complete))
|
270
|
-
end
|
271
|
-
|
272
|
-
def get_one(task, interval: DEFAULT_INTERVAL, max: DEFAULT_CONCURRNET_MAX)
|
273
|
-
get_list([task], interval: interval, max: max)
|
274
|
-
end
|
275
|
-
|
276
|
-
def add_task(task)
|
277
|
-
if task.is_a? Array
|
278
|
-
need_down_list = filter_list(task)
|
279
|
-
@down_list += need_down_list
|
280
|
-
elsif task.is_a?TaskStruct
|
281
|
-
need_down_list = filter_list([task])
|
282
|
-
@down_list += need_down_list
|
283
|
-
else
|
284
|
-
puts "error task type:#{task.class}"
|
285
|
-
end
|
286
|
-
end
|
287
291
|
end
|
288
292
|
|
289
293
|
Signal.trap('INT') do
|
data/lib/spider_helper.rb
CHANGED
@@ -3,8 +3,9 @@ require 'net/http'
|
|
3
3
|
|
4
4
|
module SpiderHelper
|
5
5
|
class << self
|
6
|
-
def direct_http_get(href, local_path, params: nil,
|
7
|
-
|
6
|
+
def direct_http_get(href, local_path, params: nil,
|
7
|
+
header: nil, convert_to_utf8: false)
|
8
|
+
href = string_to_uri(href.to_s) unless href.is_a?(Addressable::URI)
|
8
9
|
|
9
10
|
begin
|
10
11
|
href.query = URI.encode_www_form(params) if params
|
@@ -35,8 +36,9 @@ module SpiderHelper
|
|
35
36
|
false
|
36
37
|
end
|
37
38
|
|
38
|
-
def direct_http_post(href, local_path, params,
|
39
|
-
|
39
|
+
def direct_http_post(href, local_path, params,
|
40
|
+
header: nil, convert_to_utf8: false)
|
41
|
+
href = string_to_uri(href.to_s) unless href.is_a?(Addressable::URI)
|
40
42
|
|
41
43
|
begin
|
42
44
|
req = Net::HTTP::Post.new(href)
|
@@ -72,7 +74,7 @@ module SpiderHelper
|
|
72
74
|
|
73
75
|
def string_to_uri(href)
|
74
76
|
l = href
|
75
|
-
l.sub!('http:///', 'http://')
|
77
|
+
l.sub!('http:///', 'http://')
|
76
78
|
l = Addressable::URI.parse(l)
|
77
79
|
l.normalize!
|
78
80
|
end
|
data/list_spider.gemspec
CHANGED
@@ -1,5 +1,5 @@
|
|
1
1
|
|
2
|
-
lib = File.expand_path('
|
2
|
+
lib = File.expand_path('lib', __dir__)
|
3
3
|
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
4
|
require 'list_spider/version'
|
5
5
|
|
@@ -26,6 +26,6 @@ Gem::Specification.new do |spec|
|
|
26
26
|
spec.add_development_dependency 'rake', '~> 10.0'
|
27
27
|
|
28
28
|
spec.add_dependency 'em-http-request', '~> 1.1', '>= 1.1.3'
|
29
|
-
spec.add_dependency 'nokogiri', '~> 1.
|
29
|
+
spec.add_dependency 'nokogiri', '~> 1.11'
|
30
30
|
spec.add_dependency 'rchardet', '~> 1.6', '>= 1.6.1'
|
31
31
|
end
|
data/spider_example.rb
CHANGED
@@ -3,8 +3,8 @@ require 'list_spider'
|
|
3
3
|
|
4
4
|
DOWNLOAD_DIR = 'coolshell/'.freeze
|
5
5
|
|
6
|
-
def parse_index_item(
|
7
|
-
content = File.read(
|
6
|
+
def parse_index_item(e)
|
7
|
+
content = File.read(e.local_path)
|
8
8
|
doc = Nokogiri::HTML(content)
|
9
9
|
list_group = doc.css('h2.entry-title')
|
10
10
|
link_list = list_group.css('a')
|
@@ -16,8 +16,6 @@ def parse_index_item(file_name)
|
|
16
16
|
end
|
17
17
|
end
|
18
18
|
|
19
|
-
# ListSpider.convert_to_utf8 = true
|
20
|
-
|
21
19
|
# get_one is a simple function for one taskstruct situation
|
22
20
|
ListSpider.get_one(
|
23
21
|
TaskStruct.new(
|
data/spider_example_2.rb
CHANGED
@@ -4,8 +4,8 @@ DOWNLOAD_DIR = 'coolshell/'.freeze
|
|
4
4
|
|
5
5
|
@next_list = []
|
6
6
|
|
7
|
-
def parse_index_item(
|
8
|
-
content = File.read(
|
7
|
+
def parse_index_item(e)
|
8
|
+
content = File.read(e.local_path)
|
9
9
|
doc = Nokogiri::HTML(content)
|
10
10
|
list_group = doc.css('h2.entry-title')
|
11
11
|
link_list = list_group.css('a')
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: list_spider
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version:
|
4
|
+
version: 2.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Charles Zhang
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2019-09-09 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -64,20 +64,14 @@ dependencies:
|
|
64
64
|
requirements:
|
65
65
|
- - "~>"
|
66
66
|
- !ruby/object:Gem::Version
|
67
|
-
version: '1.
|
68
|
-
- - ">="
|
69
|
-
- !ruby/object:Gem::Version
|
70
|
-
version: 1.6.7
|
67
|
+
version: '1.11'
|
71
68
|
type: :runtime
|
72
69
|
prerelease: false
|
73
70
|
version_requirements: !ruby/object:Gem::Requirement
|
74
71
|
requirements:
|
75
72
|
- - "~>"
|
76
73
|
- !ruby/object:Gem::Version
|
77
|
-
version: '1.
|
78
|
-
- - ">="
|
79
|
-
- !ruby/object:Gem::Version
|
80
|
-
version: 1.6.7
|
74
|
+
version: '1.11'
|
81
75
|
- !ruby/object:Gem::Dependency
|
82
76
|
name: rchardet
|
83
77
|
requirement: !ruby/object:Gem::Requirement
|
@@ -106,8 +100,11 @@ extensions: []
|
|
106
100
|
extra_rdoc_files: []
|
107
101
|
files:
|
108
102
|
- ".gitignore"
|
103
|
+
- ".rdoc_options"
|
109
104
|
- ".rubocop.yml"
|
105
|
+
- English_README.md
|
110
106
|
- Gemfile
|
107
|
+
- Gemfile.lock
|
111
108
|
- README.md
|
112
109
|
- Rakefile
|
113
110
|
- bin/console
|
@@ -139,8 +136,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
139
136
|
- !ruby/object:Gem::Version
|
140
137
|
version: '0'
|
141
138
|
requirements: []
|
142
|
-
|
143
|
-
rubygems_version: 2.7.3
|
139
|
+
rubygems_version: 3.0.3
|
144
140
|
signing_key:
|
145
141
|
specification_version: 4
|
146
142
|
summary: List Spider
|