list_spider 2.3.0 → 2.4.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +84 -84
- data/.rdoc_options +23 -23
- data/.rubocop.yml +48 -48
- data/English_README.md +169 -169
- data/Gemfile +6 -6
- data/README.md +181 -181
- data/Rakefile +2 -2
- data/bin/console +14 -14
- data/bin/setup +8 -8
- data/check_code.sh +2 -2
- data/lib/file_filter.rb +72 -72
- data/lib/list_spider.rb +298 -297
- data/lib/list_spider/version.rb +3 -3
- data/lib/spider_helper.rb +110 -110
- data/list_spider.gemspec +31 -31
- data/spider_example.rb +27 -27
- data/spider_example_2.rb +29 -29
- metadata +3 -4
data/Gemfile
CHANGED
@@ -1,6 +1,6 @@
|
|
1
|
-
source 'https://rubygems.org'
|
2
|
-
|
3
|
-
git_source(:github) { |repo_name| "https://github.com/#{repo_name}" }
|
4
|
-
|
5
|
-
# Specify your gem's dependencies in list_spider.gemspec
|
6
|
-
gemspec
|
1
|
+
source 'https://rubygems.org'
|
2
|
+
|
3
|
+
git_source(:github) { |repo_name| "https://github.com/#{repo_name}" }
|
4
|
+
|
5
|
+
# Specify your gem's dependencies in list_spider.gemspec
|
6
|
+
gemspec
|
data/README.md
CHANGED
@@ -1,181 +1,181 @@
|
|
1
|
-
# 关于list_spider
|
2
|
-
|
3
|
-
list_spider是一个基于[em-http-request](https://github.com/igrigorik/em-http-request)的爬虫工具。
|
4
|
-
|
5
|
-
许多情况下,爬虫的工作是爬取链接,解析返回数据,从中提取链接,继续爬取,list_spider就是适用这种场景的爬虫工具。
|
6
|
-
|
7
|
-
## 功能特点
|
8
|
-
* 去重过滤 (使用本地文件路径做唯一性校验)。
|
9
|
-
|
10
|
-
* 支持UTF-8编码转换。
|
11
|
-
|
12
|
-
* 默认增量爬取,已爬取的不再重复爬取(可以通过选项强制重新获取)。
|
13
|
-
|
14
|
-
* 自由设置最大并发数和爬取任务间隔时间。
|
15
|
-
|
16
|
-
* 支持http所有选项设置。
|
17
|
-
|
18
|
-
## 开始
|
19
|
-
|
20
|
-
```ruby
|
21
|
-
gem install list_spider
|
22
|
-
```
|
23
|
-
|
24
|
-
或者添加到Gemfile
|
25
|
-
|
26
|
-
```ruby
|
27
|
-
gem 'list_spider'
|
28
|
-
```
|
29
|
-
|
30
|
-
## 使用方法
|
31
|
-
```ruby
|
32
|
-
require 'list_spider'
|
33
|
-
|
34
|
-
DOWNLOAD_DIR = 'coolshell/'.freeze
|
35
|
-
|
36
|
-
@next_list = []
|
37
|
-
|
38
|
-
def parse_index_item(e)
|
39
|
-
content = File.read(e.local_path)
|
40
|
-
doc = Nokogiri::HTML(content)
|
41
|
-
list_group = doc.css('h2.entry-title')
|
42
|
-
link_list = list_group.css('a')
|
43
|
-
|
44
|
-
link_list.each do |link|
|
45
|
-
href = link['href']
|
46
|
-
local_path = DOWNLOAD_DIR + link.content + '.html'
|
47
|
-
# 可以存入数据库后续处理
|
48
|
-
@next_list << TaskStruct.new(href, local_path)
|
49
|
-
end
|
50
|
-
end
|
51
|
-
|
52
|
-
task_list = []
|
53
|
-
task_list << TaskStruct.new(
|
54
|
-
'https://coolshell.cn/',
|
55
|
-
DOWNLOAD_DIR + 'index.html',
|
56
|
-
parse_method: method(:parse_index_item)
|
57
|
-
)
|
58
|
-
|
59
|
-
ListSpider.get_list(task_list)
|
60
|
-
ListSpider.get_list(@next_list, max: 60)
|
61
|
-
```
|
62
|
-
|
63
|
-
## 或者使用更简单的一步完成
|
64
|
-
```ruby
|
65
|
-
require 'list_spider'
|
66
|
-
|
67
|
-
DOWNLOAD_DIR = 'coolshell/'.freeze
|
68
|
-
|
69
|
-
def parse_index_item(e)
|
70
|
-
content = File.read(e.local_path)
|
71
|
-
doc = Nokogiri::HTML(content)
|
72
|
-
list_group = doc.css('h2.entry-title')
|
73
|
-
link_list = list_group.css('a')
|
74
|
-
|
75
|
-
link_list.each do |link|
|
76
|
-
href = link['href']
|
77
|
-
local_path = DOWNLOAD_DIR + link.content + '.html'
|
78
|
-
ListSpider.add_task(TaskStruct.new(href, local_path))
|
79
|
-
end
|
80
|
-
end
|
81
|
-
|
82
|
-
# get_one是封装了get_list的简化形式,方便一个任务时调用
|
83
|
-
ListSpider.get_one(
|
84
|
-
TaskStruct.new(
|
85
|
-
'https://coolshell.cn/',
|
86
|
-
DOWNLOAD_DIR + 'index.html',
|
87
|
-
parse_method: method(:parse_index_item)
|
88
|
-
),
|
89
|
-
max: 60
|
90
|
-
)
|
91
|
-
```
|
92
|
-
|
93
|
-
## get_list/get_one参数
|
94
|
-
```
|
95
|
-
# down_list: 要请求的TaskStruct数组
|
96
|
-
# interval: 任务间隔,默认为0。若参数为Range对象,则随机间隔Range范围内的秒数。若设为RANDOM_TIME则随机间隔3到10秒。
|
97
|
-
# max: 最大并发数,默认为50。若设为NO_LIMIT_CONCURRENT,则所有请求任务全部一起并发执行
|
98
|
-
|
99
|
-
get_list(down_list, interval: DEFAULT_INTERVAL, max: DEFAULT_CONCURRNET_MAX)
|
100
|
-
get_one(task, interval: DEFAULT_INTERVAL, max: DEFAULT_CONCURRNET_MAX)
|
101
|
-
```
|
102
|
-
|
103
|
-
## 下面是TaskStruct可以设置的选项,与[em-http-request](https://github.com/igrigorik/em-http-request)基本一致
|
104
|
-
|
105
|
-
```ruby
|
106
|
-
new(href, # 请求链接
|
107
|
-
local_path, # 保存数据的本地路径(此路径作为去重标准)
|
108
|
-
# http方法,取值::get, :head, :delete, :put, :post, :patch, :options
|
109
|
-
http_method: :get,
|
110
|
-
custom_data: nil, # 自定义数据
|
111
|
-
parse_method: nil, # 解析保存文件的回调,参数是TaskStruct对象本身
|
112
|
-
# 请求成功后的回调,此时可能没有保存文件,比如301,404
|
113
|
-
# 参数是TaskStruct对象本身和对应的EventMachine::HttpRequest对象
|
114
|
-
# http_req.response_header.status 状态码
|
115
|
-
# http_req.response_header 返回头
|
116
|
-
# http_req.response 返回体
|
117
|
-
callback: nil,
|
118
|
-
# 请求失败后的回调
|
119
|
-
# 参数是TaskStruct对象本身和对应的EventMachine::HttpRequest对象
|
120
|
-
errback: nil,
|
121
|
-
stream_callback: nil, # 流数据处理回调
|
122
|
-
convert_to_utf8: false, # 是否转换为utf8编码
|
123
|
-
overwrite_exist: false, # 是否覆盖现有文件
|
124
|
-
# 请求设置
|
125
|
-
redirects: 3, # 重定向次数
|
126
|
-
keepalive: nil, # (暂不支持复用)
|
127
|
-
file: nil, # 要上传的文件路径
|
128
|
-
path: nil, # 请求路径,在流水线方式请求时有用(暂不支持)
|
129
|
-
query: nil, # 查询字符串,可以是string或hash类型
|
130
|
-
body: nil, # 请求体,可以是string或hash类型
|
131
|
-
head: nil, # 请求头
|
132
|
-
# 连接设置
|
133
|
-
connect_timeout: 60, # 连接超时时间
|
134
|
-
inactivity_timeout: nil, # 连接后超时时间
|
135
|
-
# ssl设置
|
136
|
-
# ssl: {
|
137
|
-
# :private_key_file => '/tmp/server.key',
|
138
|
-
# :cert_chain_file => '/tmp/server.crt',
|
139
|
-
# :verify_peer => false
|
140
|
-
# }
|
141
|
-
ssl: nil,
|
142
|
-
# bind: {
|
143
|
-
# :host => '123.123.123.123', # use a specific interface for outbound request
|
144
|
-
# :port => '123'
|
145
|
-
# }
|
146
|
-
bind: nil,
|
147
|
-
# 代理设置
|
148
|
-
# proxy: {
|
149
|
-
# :host => '127.0.0.1', # proxy address
|
150
|
-
# :port => 9000, # proxy port
|
151
|
-
# :type => :socks5 # default proxy mode is HTTP proxy, change to :socks5 if required
|
152
|
-
|
153
|
-
# :authorization => ['user', 'pass'] # proxy authorization header
|
154
|
-
# }
|
155
|
-
proxy: nil)
|
156
|
-
```
|
157
|
-
|
158
|
-
## 回调函数形式
|
159
|
-
|
160
|
-
```ruby
|
161
|
-
# 文件成功保存后调用,通过parse_method参数传入
|
162
|
-
def parse_eresponse(task_struct)
|
163
|
-
# ...
|
164
|
-
end
|
165
|
-
|
166
|
-
# http请求成功后调用,通过callback参数传入
|
167
|
-
def call_back(task_struct, http_req)
|
168
|
-
# http_req 是EventMachine::HttpRequest对象
|
169
|
-
# http_req.response_header.status
|
170
|
-
# ...
|
171
|
-
end
|
172
|
-
|
173
|
-
# http请求出错后调用,通过errback参数传入
|
174
|
-
def err_back(task_struct, http_req)
|
175
|
-
# ...
|
176
|
-
end
|
177
|
-
```
|
178
|
-
|
179
|
-
## License
|
180
|
-
|
181
|
-
(MIT License) - Copyright (c) 2016 Charles Zhang
|
1
|
+
# 关于list_spider
|
2
|
+
|
3
|
+
list_spider是一个基于[em-http-request](https://github.com/igrigorik/em-http-request)的爬虫工具。
|
4
|
+
|
5
|
+
许多情况下,爬虫的工作是爬取链接,解析返回数据,从中提取链接,继续爬取,list_spider就是适用这种场景的爬虫工具。
|
6
|
+
|
7
|
+
## 功能特点
|
8
|
+
* 去重过滤 (使用本地文件路径做唯一性校验)。
|
9
|
+
|
10
|
+
* 支持UTF-8编码转换。
|
11
|
+
|
12
|
+
* 默认增量爬取,已爬取的不再重复爬取(可以通过选项强制重新获取)。
|
13
|
+
|
14
|
+
* 自由设置最大并发数和爬取任务间隔时间。
|
15
|
+
|
16
|
+
* 支持http所有选项设置。
|
17
|
+
|
18
|
+
## 开始
|
19
|
+
|
20
|
+
```sh
|
21
|
+
gem install list_spider
|
22
|
+
```
|
23
|
+
|
24
|
+
或者添加到Gemfile
|
25
|
+
|
26
|
+
```ruby
|
27
|
+
gem 'list_spider'
|
28
|
+
```
|
29
|
+
|
30
|
+
## 使用方法
|
31
|
+
```ruby
|
32
|
+
require 'list_spider'
|
33
|
+
|
34
|
+
DOWNLOAD_DIR = 'coolshell/'.freeze
|
35
|
+
|
36
|
+
@next_list = []
|
37
|
+
|
38
|
+
def parse_index_item(e)
|
39
|
+
content = File.read(e.local_path)
|
40
|
+
doc = Nokogiri::HTML(content)
|
41
|
+
list_group = doc.css('h2.entry-title')
|
42
|
+
link_list = list_group.css('a')
|
43
|
+
|
44
|
+
link_list.each do |link|
|
45
|
+
href = link['href']
|
46
|
+
local_path = DOWNLOAD_DIR + link.content + '.html'
|
47
|
+
# 可以存入数据库后续处理
|
48
|
+
@next_list << TaskStruct.new(href, local_path)
|
49
|
+
end
|
50
|
+
end
|
51
|
+
|
52
|
+
task_list = []
|
53
|
+
task_list << TaskStruct.new(
|
54
|
+
'https://coolshell.cn/',
|
55
|
+
DOWNLOAD_DIR + 'index.html',
|
56
|
+
parse_method: method(:parse_index_item)
|
57
|
+
)
|
58
|
+
|
59
|
+
ListSpider.get_list(task_list)
|
60
|
+
ListSpider.get_list(@next_list, max: 60)
|
61
|
+
```
|
62
|
+
|
63
|
+
## 或者使用更简单的一步完成
|
64
|
+
```ruby
|
65
|
+
require 'list_spider'
|
66
|
+
|
67
|
+
DOWNLOAD_DIR = 'coolshell/'.freeze
|
68
|
+
|
69
|
+
def parse_index_item(e)
|
70
|
+
content = File.read(e.local_path)
|
71
|
+
doc = Nokogiri::HTML(content)
|
72
|
+
list_group = doc.css('h2.entry-title')
|
73
|
+
link_list = list_group.css('a')
|
74
|
+
|
75
|
+
link_list.each do |link|
|
76
|
+
href = link['href']
|
77
|
+
local_path = DOWNLOAD_DIR + link.content + '.html'
|
78
|
+
ListSpider.add_task(TaskStruct.new(href, local_path))
|
79
|
+
end
|
80
|
+
end
|
81
|
+
|
82
|
+
# get_one是封装了get_list的简化形式,方便一个任务时调用
|
83
|
+
ListSpider.get_one(
|
84
|
+
TaskStruct.new(
|
85
|
+
'https://coolshell.cn/',
|
86
|
+
DOWNLOAD_DIR + 'index.html',
|
87
|
+
parse_method: method(:parse_index_item)
|
88
|
+
),
|
89
|
+
max: 60
|
90
|
+
)
|
91
|
+
```
|
92
|
+
|
93
|
+
## get_list/get_one参数
|
94
|
+
```
|
95
|
+
# down_list: 要请求的TaskStruct数组
|
96
|
+
# interval: 任务间隔,默认为0。若参数为Range对象,则随机间隔Range范围内的秒数。若设为RANDOM_TIME则随机间隔3到10秒。
|
97
|
+
# max: 最大并发数,默认为50。若设为NO_LIMIT_CONCURRENT,则所有请求任务全部一起并发执行
|
98
|
+
|
99
|
+
get_list(down_list, interval: DEFAULT_INTERVAL, max: DEFAULT_CONCURRNET_MAX)
|
100
|
+
get_one(task, interval: DEFAULT_INTERVAL, max: DEFAULT_CONCURRNET_MAX)
|
101
|
+
```
|
102
|
+
|
103
|
+
## 下面是TaskStruct可以设置的选项,与[em-http-request](https://github.com/igrigorik/em-http-request)基本一致
|
104
|
+
|
105
|
+
```ruby
|
106
|
+
new(href, # 请求链接
|
107
|
+
local_path, # 保存数据的本地路径(此路径作为去重标准)
|
108
|
+
# http方法,取值::get, :head, :delete, :put, :post, :patch, :options
|
109
|
+
http_method: :get,
|
110
|
+
custom_data: nil, # 自定义数据
|
111
|
+
parse_method: nil, # 解析保存文件的回调,参数是TaskStruct对象本身
|
112
|
+
# 请求成功后的回调,此时可能没有保存文件,比如301,404
|
113
|
+
# 参数是TaskStruct对象本身和对应的EventMachine::HttpRequest对象
|
114
|
+
# http_req.response_header.status 状态码
|
115
|
+
# http_req.response_header 返回头
|
116
|
+
# http_req.response 返回体
|
117
|
+
callback: nil,
|
118
|
+
# 请求失败后的回调
|
119
|
+
# 参数是TaskStruct对象本身和对应的EventMachine::HttpRequest对象
|
120
|
+
errback: nil,
|
121
|
+
stream_callback: nil, # 流数据处理回调
|
122
|
+
convert_to_utf8: false, # 是否转换为utf8编码
|
123
|
+
overwrite_exist: false, # 是否覆盖现有文件
|
124
|
+
# 请求设置
|
125
|
+
redirects: 3, # 重定向次数
|
126
|
+
keepalive: nil, # (暂不支持复用)
|
127
|
+
file: nil, # 要上传的文件路径
|
128
|
+
path: nil, # 请求路径,在流水线方式请求时有用(暂不支持)
|
129
|
+
query: nil, # 查询字符串,可以是string或hash类型
|
130
|
+
body: nil, # 请求体,可以是string或hash类型
|
131
|
+
head: nil, # 请求头
|
132
|
+
# 连接设置
|
133
|
+
connect_timeout: 60, # 连接超时时间
|
134
|
+
inactivity_timeout: nil, # 连接后超时时间
|
135
|
+
# ssl设置
|
136
|
+
# ssl: {
|
137
|
+
# :private_key_file => '/tmp/server.key',
|
138
|
+
# :cert_chain_file => '/tmp/server.crt',
|
139
|
+
# :verify_peer => false
|
140
|
+
# }
|
141
|
+
ssl: nil,
|
142
|
+
# bind: {
|
143
|
+
# :host => '123.123.123.123', # use a specific interface for outbound request
|
144
|
+
# :port => '123'
|
145
|
+
# }
|
146
|
+
bind: nil,
|
147
|
+
# 代理设置
|
148
|
+
# proxy: {
|
149
|
+
# :host => '127.0.0.1', # proxy address
|
150
|
+
# :port => 9000, # proxy port
|
151
|
+
# :type => :socks5, # default proxy mode is HTTP proxy, change to :socks5 if required
|
152
|
+
|
153
|
+
# :authorization => ['user', 'pass'] # proxy authorization header
|
154
|
+
# }
|
155
|
+
proxy: nil)
|
156
|
+
```
|
157
|
+
|
158
|
+
## 回调函数形式
|
159
|
+
|
160
|
+
```ruby
|
161
|
+
# 文件成功保存后调用,通过parse_method参数传入
|
162
|
+
def parse_eresponse(task_struct)
|
163
|
+
# ...
|
164
|
+
end
|
165
|
+
|
166
|
+
# http请求成功后调用,通过callback参数传入
|
167
|
+
def call_back(task_struct, http_req)
|
168
|
+
# http_req 是EventMachine::HttpRequest对象
|
169
|
+
# http_req.response_header.status
|
170
|
+
# ...
|
171
|
+
end
|
172
|
+
|
173
|
+
# http请求出错后调用,通过errback参数传入
|
174
|
+
def err_back(task_struct, http_req)
|
175
|
+
# ...
|
176
|
+
end
|
177
|
+
```
|
178
|
+
|
179
|
+
## License
|
180
|
+
|
181
|
+
(MIT License) - Copyright (c) 2016 Charles Zhang
|
data/Rakefile
CHANGED
@@ -1,2 +1,2 @@
|
|
1
|
-
require 'bundler/gem_tasks'
|
2
|
-
task default: :spec
|
1
|
+
require 'bundler/gem_tasks'
|
2
|
+
task default: :spec
|
data/bin/console
CHANGED
@@ -1,14 +1,14 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
|
-
|
3
|
-
require 'bundler/setup'
|
4
|
-
require 'list_spider'
|
5
|
-
|
6
|
-
# You can add fixtures and/or initialization code here to make experimenting
|
7
|
-
# with your gem easier. You can also use a different console, if you like.
|
8
|
-
|
9
|
-
# (If you use this, don't forget to add pry to your Gemfile!)
|
10
|
-
# require "pry"
|
11
|
-
# Pry.start
|
12
|
-
|
13
|
-
require 'irb'
|
14
|
-
IRB.start(__FILE__)
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'bundler/setup'
|
4
|
+
require 'list_spider'
|
5
|
+
|
6
|
+
# You can add fixtures and/or initialization code here to make experimenting
|
7
|
+
# with your gem easier. You can also use a different console, if you like.
|
8
|
+
|
9
|
+
# (If you use this, don't forget to add pry to your Gemfile!)
|
10
|
+
# require "pry"
|
11
|
+
# Pry.start
|
12
|
+
|
13
|
+
require 'irb'
|
14
|
+
IRB.start(__FILE__)
|
data/bin/setup
CHANGED
@@ -1,8 +1,8 @@
|
|
1
|
-
#!/usr/bin/env bash
|
2
|
-
set -euo pipefail
|
3
|
-
IFS=$'\n\t'
|
4
|
-
set -vx
|
5
|
-
|
6
|
-
bundle install
|
7
|
-
|
8
|
-
# Do any other automated setup that you need to do here
|
1
|
+
#!/usr/bin/env bash
|
2
|
+
set -euo pipefail
|
3
|
+
IFS=$'\n\t'
|
4
|
+
set -vx
|
5
|
+
|
6
|
+
bundle install
|
7
|
+
|
8
|
+
# Do any other automated setup that you need to do here
|
data/check_code.sh
CHANGED
@@ -1,3 +1,3 @@
|
|
1
|
-
#!/bin/sh
|
2
|
-
|
1
|
+
#!/bin/sh
|
2
|
+
|
3
3
|
rubocop -a -D -f simple -o rubocopresult
|
data/lib/file_filter.rb
CHANGED
@@ -1,72 +1,72 @@
|
|
1
|
-
|
2
|
-
class FileFilter
|
3
|
-
# 4033
|
4
|
-
# 920
|
5
|
-
def initialize(dir_pattern, size_threshold: 1000,
|
6
|
-
cust_judge: nil, process_block: nil)
|
7
|
-
@dir_pattern = dir_pattern
|
8
|
-
@size_threshold = size_threshold
|
9
|
-
@cust_judge = cust_judge ? cust_judge : method(:default_judge)
|
10
|
-
@total = 0
|
11
|
-
@process_block = process_block
|
12
|
-
end
|
13
|
-
|
14
|
-
def default_judge(f)
|
15
|
-
File.size(f) <= @size_threshold
|
16
|
-
end
|
17
|
-
|
18
|
-
def filter_file(f)
|
19
|
-
if @cust_judge.call(f)
|
20
|
-
@total += 1
|
21
|
-
@process_block.call(f)
|
22
|
-
end
|
23
|
-
end
|
24
|
-
|
25
|
-
def start
|
26
|
-
Dir.glob(@dir_pattern) do |f|
|
27
|
-
filter_file(f)
|
28
|
-
end
|
29
|
-
puts "total:#{@total}"
|
30
|
-
end
|
31
|
-
|
32
|
-
def self.delete(dir_pattern, size_threshold: 1000, cust_judge: nil)
|
33
|
-
FileFilter.new(
|
34
|
-
dir_pattern,
|
35
|
-
size_threshold: size_threshold,
|
36
|
-
cust_judge: cust_judge,
|
37
|
-
process_block:
|
38
|
-
proc do |f|
|
39
|
-
puts "deleted file: #{f}"
|
40
|
-
File.delete(f)
|
41
|
-
end
|
42
|
-
).start
|
43
|
-
end
|
44
|
-
|
45
|
-
def self.check(dir_pattern, size_threshold: 1000, cust_judge: nil)
|
46
|
-
FileFilter.new(
|
47
|
-
dir_pattern,
|
48
|
-
size_threshold: size_threshold,
|
49
|
-
cust_judge: cust_judge,
|
50
|
-
process_block:
|
51
|
-
proc do |f|
|
52
|
-
puts "filterd file: #{f}"
|
53
|
-
end
|
54
|
-
).start
|
55
|
-
end
|
56
|
-
|
57
|
-
def self.check_save_result(dir_pattern, save_file_name: 'filtered_file.txt',
|
58
|
-
size_threshold: 1000, cust_judge: nil)
|
59
|
-
result_file = File.open(save_file_name, 'wt')
|
60
|
-
FileFilter.new(
|
61
|
-
dir_pattern,
|
62
|
-
size_threshold: size_threshold,
|
63
|
-
cust_judge: cust_judge,
|
64
|
-
process_block:
|
65
|
-
proc do |f|
|
66
|
-
puts "filterd file: #{f}"
|
67
|
-
result_file << f << "\n"
|
68
|
-
end
|
69
|
-
).start
|
70
|
-
result_file.close
|
71
|
-
end
|
72
|
-
end
|
1
|
+
|
2
|
+
# Walks the paths matched by a glob pattern, selects the ones accepted by a
# "judge" callable and hands every selected path to a processing callable.
#
# With the default judge a path is selected when it is a regular file whose
# size is at most +size_threshold+ bytes. The class-level helpers (+delete+,
# +check+, +check_save_result+) wire up common processing callables.
class FileFilter
  # Number of paths selected so far (incremented by #filter_file).
  attr_reader :total

  # @param dir_pattern [String] glob pattern handed to Dir.glob
  # @param size_threshold [Integer] inclusive upper size bound, in bytes,
  #   used by the default judge
  # @param cust_judge [#call, nil] callable that receives a path and returns
  #   a truthy value to select it; the default judge is used when nil
  # @param process_block [#call, nil] callable invoked with each selected
  #   path; when nil, selected paths are only counted
  def initialize(dir_pattern, size_threshold: 1000,
                 cust_judge: nil, process_block: nil)
    @dir_pattern = dir_pattern
    @size_threshold = size_threshold
    @cust_judge = cust_judge || method(:default_judge)
    @total = 0
    @process_block = process_block
  end

  # Default selection rule: a regular file no larger than the threshold.
  # Directories matched by the glob are rejected so that helpers such as
  # FileFilter.delete never call File.delete on a directory.
  #
  # @param f [String] path to test
  # @return [Boolean]
  def default_judge(f)
    File.file?(f) && File.size(f) <= @size_threshold
  end

  # Runs the judge on one path; when it is selected, counts it and passes it
  # to the processing callable (if one was given).
  #
  # @param f [String] path to test
  # @return [Object, nil] result of the processing callable, or nil
  def filter_file(f)
    return unless @cust_judge.call(f)

    @total += 1
    # process_block is optional (defaults to nil), so guard the call.
    @process_block.call(f) if @process_block
  end

  # Filters every path matched by the glob pattern and prints the number of
  # selected paths as "total:<n>".
  #
  # @return [nil]
  def start
    Dir.glob(@dir_pattern) { |f| filter_file(f) }
    puts "total:#{@total}"
  end

  # Deletes every selected path, printing each one.
  #
  # @param dir_pattern [String] glob pattern handed to Dir.glob
  # @param size_threshold [Integer] see #initialize
  # @param cust_judge [#call, nil] see #initialize
  # @return [nil]
  def self.delete(dir_pattern, size_threshold: 1000, cust_judge: nil)
    FileFilter.new(
      dir_pattern,
      size_threshold: size_threshold,
      cust_judge: cust_judge,
      process_block: proc do |f|
        puts "deleted file: #{f}"
        File.delete(f)
      end
    ).start
  end

  # Prints every selected path without modifying anything.
  #
  # @param dir_pattern [String] glob pattern handed to Dir.glob
  # @param size_threshold [Integer] see #initialize
  # @param cust_judge [#call, nil] see #initialize
  # @return [nil]
  def self.check(dir_pattern, size_threshold: 1000, cust_judge: nil)
    FileFilter.new(
      dir_pattern,
      size_threshold: size_threshold,
      cust_judge: cust_judge,
      process_block: proc { |f| puts "filterd file: #{f}" }
    ).start
  end

  # Prints every selected path and also writes the paths, one per line, to
  # +save_file_name+ (the file is truncated first).
  #
  # @param dir_pattern [String] glob pattern handed to Dir.glob
  # @param save_file_name [String] path of the report file to write
  # @param size_threshold [Integer] see #initialize
  # @param cust_judge [#call, nil] see #initialize
  # @return [nil]
  def self.check_save_result(dir_pattern, save_file_name: 'filtered_file.txt',
                             size_threshold: 1000, cust_judge: nil)
    # Block form guarantees the report file is closed even when the judge or
    # the glob traversal raises.
    File.open(save_file_name, 'wt') do |result_file|
      FileFilter.new(
        dir_pattern,
        size_threshold: size_threshold,
        cust_judge: cust_judge,
        process_block: proc do |f|
          puts "filterd file: #{f}"
          result_file << f << "\n"
        end
      ).start
    end
  end
end
|