list_spider 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/file_filter.rb +71 -0
- data/lib/list_spider.rb +12 -6
- metadata +3 -3
- data/lib/delete_unvalid.rb +0 -40
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 38d3b1cc9c12998f7c0112f9f328d93e0c3909c8
|
4
|
+
data.tar.gz: 2e46c3201a979f17f3bafcbd7c2a215c666b29aa
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 25a9be2402259455a04e5c1cee6767de829841670a0967a2435e7a9616e94552e71371ab8ef69fd5e57fd640f71b205c2ae894798682a3583ec80ad2d7ac2b6d
|
7
|
+
data.tar.gz: cbe88b9d30421f6c1cf600d54755ad7df699e2186003c8df76b748317a3bedadd1b8c3536434c6cc11e34a9a59ece4ed4cfbfe712238538ab595d3ac4a121654
|
data/lib/file_filter.rb
ADDED
@@ -0,0 +1,71 @@
|
|
1
|
+
|
2
|
+
# Walks every path matched by a glob pattern, asks a "judge" callable whether
# the path should be selected, and hands each selected path to an optional
# processing callable. A running count of selected paths is printed at the end.
#
# The class-level helpers (+delete+, +check+, +check_save_result+) wire up the
# processing callables used for deleting, listing and recording matches.
class FileFilter
  # NOTE(review): the original source carried the bare comments "4033" and
  # "920" at this point with no explanation; their meaning is not evident
  # from the code -- confirm with the author before relying on them.

  # @param dir_pattern [String] glob pattern handed to Dir.glob
  # @param size_threshold [Integer] upper size bound (bytes, inclusive) used
  #   by the default judge
  # @param cust_judge [#call, nil] callable receiving a path and returning a
  #   truthy value when the path should be selected; when nil the size-based
  #   default judge is used
  # @param process_block [#call, nil] callable invoked with every selected
  #   path; when nil, selected paths are only counted
  def initialize(dir_pattern, size_threshold: 1000, cust_judge: nil, process_block: nil)
    @dir_pattern = dir_pattern
    @size_threshold = size_threshold
    @cust_judge = cust_judge || method(:default_judge)
    @total = 0
    @process_block = process_block
  end

  # Default selection rule: a regular file whose size does not exceed the
  # configured threshold.
  #
  # Directories are rejected explicitly: Dir.glob can yield them, and the
  # delete helper would otherwise call File.delete on a directory and raise.
  #
  # @param f [String] path to inspect
  # @return [Boolean]
  def default_judge(f)
    File.file?(f) && File.size(f) <= @size_threshold
  end

  # Counts +f+ and forwards it to the processing callable when the judge
  # accepts it.
  #
  # @param f [String] path to evaluate
  # @return [Object, nil] result of the processing callable, or nil when the
  #   path was rejected or no processing callable was supplied
  def filter_file(f)
    return unless @cust_judge.call(f)

    @total += 1
    # process_block is optional (defaults to nil in #initialize), so guard
    # the call instead of raising NoMethodError on the first match.
    @process_block.call(f) if @process_block
  end

  # Runs the filter over every path matched by the glob pattern and prints
  # the number of selected paths.
  #
  # @return [nil]
  def start
    Dir.glob(@dir_pattern) { |f| filter_file(f) }
    puts "total:#{@total}"
  end

  # Deletes every selected file, announcing each one on stdout.
  #
  # @param dir_pattern [String] glob pattern handed to Dir.glob
  # @param size_threshold [Integer] see #initialize
  # @param cust_judge [#call, nil] see #initialize
  # @return [nil]
  def self.delete(dir_pattern, size_threshold: 1000, cust_judge: nil)
    deleter = proc do |f|
      puts "deleted file: #{f}"
      File.delete(f)
    end

    FileFilter.new(dir_pattern,
                   size_threshold: size_threshold,
                   cust_judge: cust_judge,
                   process_block: deleter).start
  end

  # Lists every selected file on stdout without touching it.
  #
  # @param dir_pattern [String] glob pattern handed to Dir.glob
  # @param size_threshold [Integer] see #initialize
  # @param cust_judge [#call, nil] see #initialize
  # @return [nil]
  def self.check(dir_pattern, size_threshold: 1000, cust_judge: nil)
    reporter = proc { |f| puts "filterd file: #{f}" }

    FileFilter.new(dir_pattern,
                   size_threshold: size_threshold,
                   cust_judge: cust_judge,
                   process_block: reporter).start
  end

  # Lists every selected file on stdout and also writes the paths, one per
  # line, to +save_file_name+.
  #
  # The result file is opened with the block form of File.open so that it is
  # flushed and closed when the run finishes, even if the run raises.
  #
  # @param dir_pattern [String] glob pattern handed to Dir.glob
  # @param save_file_name [String] path of the file receiving the result list
  # @param size_threshold [Integer] see #initialize
  # @param cust_judge [#call, nil] see #initialize
  # @return [nil]
  def self.check_save_result(dir_pattern, save_file_name: 'filtered_file.txt', size_threshold: 1000, cust_judge: nil)
    File.open(save_file_name, 'wt') do |result_file|
      recorder = proc do |f|
        puts "filterd file: #{f}"
        result_file << f << "\n"
      end

      FileFilter.new(dir_pattern,
                     size_threshold: size_threshold,
                     cust_judge: cust_judge,
                     process_block: recorder).start
    end
  end
end
|
data/lib/list_spider.rb
CHANGED
@@ -4,10 +4,10 @@ require 'fileutils'
|
|
4
4
|
require 'set'
|
5
5
|
require "addressable/uri"
|
6
6
|
require File.expand_path('../spider_helper', __FILE__)
|
7
|
-
require File.expand_path('../delete_unvalid', __FILE__)
|
7
|
+
require File.expand_path('../file_filter', __FILE__)
|
8
8
|
|
9
9
|
class TaskStruct
|
10
|
-
def initialize(href, local_path, http_method: :get, params: {}, extra_data: nil, parse_method: nil)
|
10
|
+
def initialize(href, local_path, http_method: :get, params: {}, extra_data: nil, parse_method: nil, header: nil)
|
11
11
|
@origin_href = href
|
12
12
|
@href = href
|
13
13
|
if @href.class == "".class
|
@@ -18,13 +18,14 @@ class TaskStruct
|
|
18
18
|
@params = params
|
19
19
|
@extra_data = extra_data
|
20
20
|
@parse_method = parse_method
|
21
|
+
@header = header
|
21
22
|
end
|
22
23
|
|
23
24
|
def == (o)
|
24
|
-
o.class == self.class && o.href == href && o.local_path == local_path && o.http_method == http_method && o.params == params && o.extra_data == extra_data
|
25
|
+
o.class == self.class && o.href == href && o.local_path == local_path && o.http_method == http_method && o.params == params && o.extra_data == extra_data && o.header == header
|
25
26
|
end
|
26
27
|
|
27
|
-
attr_accessor :origin_href , :href, :local_path, :http_method, :params, :extra_data, :parse_method, :request_object
|
28
|
+
attr_accessor :origin_href , :href, :local_path, :http_method, :params, :extra_data, :parse_method, :request_object, :header
|
28
29
|
|
29
30
|
end
|
30
31
|
|
@@ -73,7 +74,12 @@ module ListSpider
|
|
73
74
|
for_each_proc = proc do |e|
|
74
75
|
opt = {}
|
75
76
|
opt = {:redirects => @max_redirects}
|
76
|
-
|
77
|
+
if e.header
|
78
|
+
opt[:head] = e.header
|
79
|
+
elsif defined? @header_option
|
80
|
+
opt[:head] = @header_option
|
81
|
+
end
|
82
|
+
|
77
83
|
if e.http_method == :post
|
78
84
|
opt[:body] = e.params unless e.params.empty?
|
79
85
|
if @connection_opts
|
@@ -182,7 +188,7 @@ module ListSpider
|
|
182
188
|
|
183
189
|
pm.call(e.local_path, e.extra_data, res_header, req)
|
184
190
|
else
|
185
|
-
puts "Error! The number of arguments is:#{pm.arity}. While expected number is 1, 2, 3"
|
191
|
+
puts "Error! The number of arguments is:#{pm.arity}. While expected number is 1, 2, 3, 4"
|
186
192
|
end
|
187
193
|
end
|
188
194
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: list_spider
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.0
|
4
|
+
version: 0.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Charles Zhang
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-05-
|
11
|
+
date: 2016-05-16 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: em-http-request
|
@@ -56,7 +56,7 @@ executables: []
|
|
56
56
|
extensions: []
|
57
57
|
extra_rdoc_files: []
|
58
58
|
files:
|
59
|
-
- lib/delete_unvalid.rb
|
59
|
+
- lib/file_filter.rb
|
60
60
|
- lib/list_spider.rb
|
61
61
|
- lib/spider_helper.rb
|
62
62
|
homepage: https://github.com/chinazhangchao/list_spider
|
data/lib/delete_unvalid.rb
DELETED
@@ -1,40 +0,0 @@
|
|
1
|
-
|
2
|
-
class DeleteUnvalid
|
3
|
-
# 4033
|
4
|
-
# 920
|
5
|
-
def initialize(dir_pattern, size_threshold: 1000, cust_judge: nil)
|
6
|
-
@dir_pattern = dir_pattern
|
7
|
-
@size_threshold = size_threshold
|
8
|
-
if cust_judge
|
9
|
-
@cust_judge = cust_judge
|
10
|
-
else
|
11
|
-
@cust_judge = method(:default_judge)
|
12
|
-
end
|
13
|
-
@total = 0
|
14
|
-
end
|
15
|
-
|
16
|
-
def default_judge(f)
|
17
|
-
File.size(f) <= @size_threshold
|
18
|
-
end
|
19
|
-
|
20
|
-
def delete_unvaild(f)
|
21
|
-
if @cust_judge.call(f)
|
22
|
-
@total += 1
|
23
|
-
puts "deleted file: #{f}"
|
24
|
-
File.delete(f)
|
25
|
-
end
|
26
|
-
end
|
27
|
-
|
28
|
-
def start
|
29
|
-
Dir.glob(@dir_pattern) do |f|
|
30
|
-
# puts f
|
31
|
-
delete_unvaild(f)
|
32
|
-
end
|
33
|
-
puts "delete total:#{@total}"
|
34
|
-
end
|
35
|
-
|
36
|
-
def self.delete(dir_pattern, size_threshold: 1000, cust_judge: nil)
|
37
|
-
DeleteUnvalid.new(dir_pattern, size_threshold: size_threshold, cust_judge: cust_judge).start
|
38
|
-
end
|
39
|
-
|
40
|
-
end
|