list_spider 0.2.0 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/file_filter.rb +71 -0
- data/lib/list_spider.rb +12 -6
- metadata +3 -3
- data/lib/delete_unvalid.rb +0 -40
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 38d3b1cc9c12998f7c0112f9f328d93e0c3909c8
|
4
|
+
data.tar.gz: 2e46c3201a979f17f3bafcbd7c2a215c666b29aa
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 25a9be2402259455a04e5c1cee6767de829841670a0967a2435e7a9616e94552e71371ab8ef69fd5e57fd640f71b205c2ae894798682a3583ec80ad2d7ac2b6d
|
7
|
+
data.tar.gz: cbe88b9d30421f6c1cf600d54755ad7df699e2186003c8df76b748317a3bedadd1b8c3536434c6cc11e34a9a59ece4ed4cfbfe712238538ab595d3ac4a121654
|
data/lib/file_filter.rb
ADDED
@@ -0,0 +1,71 @@
|
|
1
|
+
|
2
|
+
# Walks the paths matched by a glob pattern, asks a judge callable whether each
# path should be selected, and hands every selected path to a processing
# callable while counting how many were selected.
class FileFilter
  # 4033
  # 920
  # NOTE(review): the two figures above are unexplained in the original --
  # presumably sample file sizes in bytes; confirm with the author.

  # @param dir_pattern [String] glob pattern handed to Dir.glob
  # @param size_threshold [Integer] inclusive upper size bound (bytes) used by
  #   the default judge
  # @param cust_judge [#call, nil] callable receiving a path; a truthy result
  #   selects the path. Falls back to the size-based #default_judge.
  # @param process_block [#call, nil] callable invoked with every selected
  #   path; when nil the selected paths are only counted
  def initialize(dir_pattern, size_threshold: 1000, cust_judge: nil, process_block: nil)
    @dir_pattern = dir_pattern
    @size_threshold = size_threshold
    @cust_judge = cust_judge || method(:default_judge)
    @total = 0
    @process_block = process_block
  end

  # Default judge: selects regular files whose size does not exceed the
  # threshold. Directories are never selected: their size is not a content
  # size, and File.delete raises when handed a directory.
  #
  # @param f [String] path to test
  # @return [Boolean]
  def default_judge(f)
    File.file?(f) && File.size(f) <= @size_threshold
  end

  # Runs the judge on one path; on a match bumps the counter and calls the
  # processing callable, if one was supplied.
  #
  # @param f [String] path to test
  def filter_file(f)
    return unless @cust_judge.call(f)

    @total += 1
    # The constructor allows process_block to be nil, so guard the call.
    @process_block.call(f) if @process_block
  end

  # Filters every path matched by the glob pattern, then prints the number of
  # selected paths.
  def start
    Dir.glob(@dir_pattern) { |f| filter_file(f) }
    puts "total:#{@total}"
  end

  # Deletes every selected file, printing each deleted path.
  #
  # @param dir_pattern [String] glob pattern
  # @param size_threshold [Integer] see #initialize
  # @param cust_judge [#call, nil] see #initialize
  def self.delete(dir_pattern, size_threshold: 1000, cust_judge: nil)
    remover = proc do |f|
      puts "deleted file: #{f}"
      File.delete(f)
    end
    new(dir_pattern,
        size_threshold: size_threshold,
        cust_judge: cust_judge,
        process_block: remover).start
  end

  # Prints every selected file without modifying anything.
  #
  # @param dir_pattern [String] glob pattern
  # @param size_threshold [Integer] see #initialize
  # @param cust_judge [#call, nil] see #initialize
  def self.check(dir_pattern, size_threshold: 1000, cust_judge: nil)
    reporter = proc do |f|
      puts "filterd file: #{f}"
    end
    new(dir_pattern,
        size_threshold: size_threshold,
        cust_judge: cust_judge,
        process_block: reporter).start
  end

  # Prints every selected file and also writes the paths, one per line, to
  # save_file_name. The block form of File.open guarantees the result file is
  # flushed and closed before this method returns, even if filtering raises.
  #
  # @param dir_pattern [String] glob pattern
  # @param save_file_name [String] path of the result file (overwritten)
  # @param size_threshold [Integer] see #initialize
  # @param cust_judge [#call, nil] see #initialize
  def self.check_save_result(dir_pattern, save_file_name: 'filtered_file.txt', size_threshold: 1000, cust_judge: nil)
    File.open(save_file_name, 'wt') do |result_file|
      recorder = proc do |f|
        puts "filterd file: #{f}"
        result_file << f << "\n"
      end
      new(dir_pattern,
          size_threshold: size_threshold,
          cust_judge: cust_judge,
          process_block: recorder).start
    end
  end
end
|
data/lib/list_spider.rb
CHANGED
@@ -4,10 +4,10 @@ require 'fileutils'
|
|
4
4
|
require 'set'
|
5
5
|
require "addressable/uri"
|
6
6
|
require File.expand_path('../spider_helper', __FILE__)
|
7
|
-
require File.expand_path('../
|
7
|
+
require File.expand_path('../file_filter', __FILE__)
|
8
8
|
|
9
9
|
class TaskStruct
|
10
|
-
def initialize(href, local_path, http_method: :get, params: {}, extra_data: nil, parse_method: nil)
|
10
|
+
def initialize(href, local_path, http_method: :get, params: {}, extra_data: nil, parse_method: nil, header: nil)
|
11
11
|
@origin_href = href
|
12
12
|
@href = href
|
13
13
|
if @href.class == "".class
|
@@ -18,13 +18,14 @@ class TaskStruct
|
|
18
18
|
@params = params
|
19
19
|
@extra_data = extra_data
|
20
20
|
@parse_method = parse_method
|
21
|
+
@header = header
|
21
22
|
end
|
22
23
|
|
23
24
|
def == (o)
|
24
|
-
o.class == self.class && o.href == href && o.local_path == local_path && o.http_method == http_method && o.params == params && o.extra_data == extra_data
|
25
|
+
o.class == self.class && o.href == href && o.local_path == local_path && o.http_method == http_method && o.params == params && o.extra_data == extra_data && o.header == header
|
25
26
|
end
|
26
27
|
|
27
|
-
attr_accessor :origin_href , :href, :local_path, :http_method, :params, :extra_data, :parse_method, :request_object
|
28
|
+
attr_accessor :origin_href , :href, :local_path, :http_method, :params, :extra_data, :parse_method, :request_object, :header
|
28
29
|
|
29
30
|
end
|
30
31
|
|
@@ -73,7 +74,12 @@ module ListSpider
|
|
73
74
|
for_each_proc = proc do |e|
|
74
75
|
opt = {}
|
75
76
|
opt = {:redirects => @max_redirects}
|
76
|
-
|
77
|
+
if e.header
|
78
|
+
opt[:head] = e.header
|
79
|
+
elsif defined? @header_option
|
80
|
+
opt[:head] = @header_option
|
81
|
+
end
|
82
|
+
|
77
83
|
if e.http_method == :post
|
78
84
|
opt[:body] = e.params unless e.params.empty?
|
79
85
|
if @connection_opts
|
@@ -182,7 +188,7 @@ module ListSpider
|
|
182
188
|
|
183
189
|
pm.call(e.local_path, e.extra_data, res_header, req)
|
184
190
|
else
|
185
|
-
puts "Error! The number of arguments is:#{pm.arity}. While expected number is 1, 2, 3"
|
191
|
+
puts "Error! The number of arguments is:#{pm.arity}. While expected number is 1, 2, 3, 4"
|
186
192
|
end
|
187
193
|
end
|
188
194
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: list_spider
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.0
|
4
|
+
version: 0.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Charles Zhang
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-05-
|
11
|
+
date: 2016-05-16 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: em-http-request
|
@@ -56,7 +56,7 @@ executables: []
|
|
56
56
|
extensions: []
|
57
57
|
extra_rdoc_files: []
|
58
58
|
files:
|
59
|
-
- lib/delete_unvalid.rb
|
59
|
+
- lib/file_filter.rb
|
60
60
|
- lib/list_spider.rb
|
61
61
|
- lib/spider_helper.rb
|
62
62
|
homepage: https://github.com/chinazhangchao/list_spider
|
data/lib/delete_unvalid.rb
DELETED
@@ -1,40 +0,0 @@
|
|
1
|
-
|
2
|
-
# Deletes the files matched by a glob pattern that a judge callable selects,
# printing each deleted path and a final count.
class DeleteUnvalid
  # 4033
  # 920
  # NOTE(review): the two figures above are unexplained in the original --
  # presumably sample file sizes in bytes; confirm with the author.

  # @param dir_pattern [String] glob pattern handed to Dir.glob
  # @param size_threshold [Integer] inclusive upper size bound (bytes) used by
  #   the default judge
  # @param cust_judge [#call, nil] callable receiving a path; a truthy result
  #   marks the path for deletion. Falls back to #default_judge.
  def initialize(dir_pattern, size_threshold: 1000, cust_judge: nil)
    @dir_pattern = dir_pattern
    @size_threshold = size_threshold
    @cust_judge = cust_judge || method(:default_judge)
    @total = 0
  end

  # Default judge: selects regular files whose size does not exceed the
  # threshold. Directories are never selected, because File.delete raises on
  # a directory and would abort the whole run.
  #
  # @param f [String] path to test
  # @return [Boolean]
  def default_judge(f)
    File.file?(f) && File.size(f) <= @size_threshold
  end

  # Deletes one path if the judge selects it, counting and printing it first.
  # (The misspelled method name is kept for backward compatibility.)
  #
  # @param f [String] path to test
  def delete_unvaild(f)
    return unless @cust_judge.call(f)

    @total += 1
    puts "deleted file: #{f}"
    File.delete(f)
  end

  # Processes every path matched by the glob pattern, then prints the number
  # of deleted files.
  def start
    Dir.glob(@dir_pattern) { |f| delete_unvaild(f) }
    puts "delete total:#{@total}"
  end

  # Convenience entry point: builds an instance and runs it.
  #
  # @param dir_pattern [String] glob pattern
  # @param size_threshold [Integer] see #initialize
  # @param cust_judge [#call, nil] see #initialize
  def self.delete(dir_pattern, size_threshold: 1000, cust_judge: nil)
    new(dir_pattern, size_threshold: size_threshold, cust_judge: cust_judge).start
  end
end
|