finddups 0.1.0 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -1
- data/README.md +8 -7
- data/exe/finddups +35 -101
- data/lib/dup_finder.rb +142 -0
- data/lib/finddups/version.rb +1 -1
- metadata +8 -7
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: d1bdd387a3145a5a49a979e946be3668e2c8da432928f69f4e6e22518b2bd8ba
|
4
|
+
data.tar.gz: 78de1056705cfb1afee0e3c6e5dd4a54b80107e8466e875d23aaeaaeaaad1a2a
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 60c7b54ca17cfc307bdd9f0eb1102b2c3a8b041a4b314de6fbcd3e2f5aa61fb9a598e651a9614a03f0589f5ad2257db504890bedc1b5f1119ea41b96c9605d6d
|
7
|
+
data.tar.gz: 65f26108b4480c66f5b0e80f4927873438eeb2b486b6c04f5637eb06aba30b4c384e3b9584cf22a4ef03cabb2193c7b3f15271104132d92b2b62c0381a7a043a
|
data/Gemfile.lock
CHANGED
data/README.md
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
# Finddups
|
2
2
|
|
3
3
|
Shows duplicate files within a list of directories and outputs as JSON.
|
4
|
+
This is accomplished by generating a hash digest of each file and comparing the hashes.
|
4
5
|
|
5
6
|
## Installation
|
6
7
|
|
@@ -9,17 +10,17 @@ Shows duplicate files within a list of directories and outputs as JSON.
|
|
9
10
|
## Usage
|
10
11
|
|
11
12
|
```
|
12
|
-
finddups
|
13
|
+
finddups 0.2.0
|
13
14
|
Usage: finddups [dirs] [options]
|
14
15
|
-i, --ignore path ignore paths
|
15
|
-
--atime (default) Use file access time to sort duplicates
|
16
|
-
--mtime Use file modification time to sort duplicates
|
17
|
-
--ctime Use file change time to sort duplicates
|
18
|
-
(the time at which directory information about the file was changed, not the file itself)
|
19
|
-
-t, --threads threads Number of threads to use (default 16)
|
20
16
|
-d, --depth depth Max depth to search
|
17
|
+
--[no-]cache Perform caching
|
18
|
+
--[no-]cache-to-tmp Save cache files to /tmp/file_hashes
|
19
|
+
-o, --output path Output file path
|
20
|
+
--alg alg Hashing algorithm (SHA1, MD5)
|
21
|
+
--[no-]ignore-empty Ignore empty files
|
21
22
|
-h, --help Show this help
|
22
|
-
-v
|
23
|
+
-v, --version Show version
|
23
24
|
```
|
24
25
|
|
25
26
|
Example:
|
data/exe/finddups
CHANGED
@@ -2,53 +2,54 @@
|
|
2
2
|
|
3
3
|
require 'digest/sha1'
|
4
4
|
require 'json'
|
5
|
+
require 'yaml'
|
5
6
|
require 'fileutils'
|
6
|
-
require 'thread'
|
7
7
|
require 'optparse'
|
8
8
|
|
9
9
|
$LOAD_PATH.unshift(File.join(__dir__, '..', 'lib'))
|
10
10
|
|
11
11
|
require 'finddups'
|
12
|
+
require 'dup_finder'
|
12
13
|
|
13
|
-
|
14
|
-
sort: 'atime',
|
15
|
-
depth: Float::INFINITY,
|
16
|
-
ignore: [],
|
17
|
-
threads: 16,
|
18
|
-
}
|
14
|
+
options = {}
|
19
15
|
|
20
16
|
optparser = OptionParser.new do |opts|
|
21
17
|
opts.banner = <<~BANNER
|
22
|
-
finddups
|
18
|
+
finddups #{Finddups::VERSION}
|
23
19
|
Usage: #{File.basename(__FILE__)} [dirs] [options]
|
24
20
|
BANNER
|
25
21
|
|
26
22
|
opts.on("-i path", "--ignore path", "ignore paths") do |path|
|
27
|
-
|
23
|
+
options[:ignore] ||= []
|
24
|
+
options[:ignore] << path
|
28
25
|
end
|
29
26
|
|
30
|
-
opts.on("
|
31
|
-
|
27
|
+
opts.on("-d depth", "--depth depth", "Max depth to search") do |depth|
|
28
|
+
options[:depth] = depth.to_i
|
32
29
|
end
|
33
30
|
|
34
|
-
opts.on("--
|
35
|
-
|
31
|
+
opts.on("--[no-]cache", "Perform caching") do |value|
|
32
|
+
options[:cache] = value
|
36
33
|
end
|
37
34
|
|
38
|
-
|
39
|
-
|
40
|
-
(the time at which directory information about the file was changed, not the file itself)
|
41
|
-
DESC
|
42
|
-
opts.on("--ctime", desc) do
|
43
|
-
@options[:sort] = 'ctime'
|
35
|
+
opts.on("--[no-]cache-to-tmp", "Save cache files to /tmp/file_hashes") do |value|
|
36
|
+
options[:cache_to_tmp] = value
|
44
37
|
end
|
45
38
|
|
46
|
-
opts.on("-
|
47
|
-
|
39
|
+
opts.on("-o path", "--output path", "Output file path") do |path|
|
40
|
+
options[:output] = path
|
48
41
|
end
|
49
42
|
|
50
|
-
opts.on("
|
51
|
-
|
43
|
+
opts.on("--alg alg", "Hashing algorithm (SHA1, MD5)") do |alg|
|
44
|
+
options[:alg] = alg.downcase.to_sym
|
45
|
+
unless %i[md5 sha1].include?(options[:alg])
|
46
|
+
$stderr.puts "Unsupported algorithm: #{options[:alg]}"
|
47
|
+
exit 1
|
48
|
+
end
|
49
|
+
end
|
50
|
+
|
51
|
+
opts.on("--[no-]ignore-empty", "Ignore empty files") do |value|
|
52
|
+
options[:ignore_empty] = value
|
52
53
|
end
|
53
54
|
|
54
55
|
opts.on("-h", "--help", "Show this help") do
|
@@ -56,8 +57,8 @@ optparser = OptionParser.new do |opts|
|
|
56
57
|
exit
|
57
58
|
end
|
58
59
|
|
59
|
-
opts.on("-v", "Show version") do
|
60
|
-
puts "finddups
|
60
|
+
opts.on("-v", "--version", "Show version") do
|
61
|
+
puts "finddups #{Finddups::VERSION}"
|
61
62
|
exit
|
62
63
|
end
|
63
64
|
end
|
@@ -69,83 +70,16 @@ if ARGV.empty? || ARGV.any? { |entry| !File.directory?(entry) }
|
|
69
70
|
exit 1
|
70
71
|
end
|
71
72
|
|
72
|
-
#
|
73
|
-
# = Prog =
|
74
|
-
# ========
|
75
|
-
|
76
|
-
search_dirs = ARGV
|
77
|
-
trash_dir = "/tmp/duplicates/"
|
78
|
-
|
79
|
-
@mutex = Mutex.new
|
80
|
-
@queue = []
|
81
|
-
|
82
|
-
def search(directory, depth = 0)
|
83
|
-
# puts "Searching: #{directory}"
|
84
|
-
|
85
|
-
# Skips
|
86
|
-
return @duplicates if @options[:ignore].include?(File.basename(directory))
|
87
|
-
|
88
|
-
Dir.entries(directory).each do |entry|
|
89
|
-
next if entry.start_with?('.')
|
90
|
-
path = File.join(directory, entry)
|
91
|
-
|
92
|
-
if File.directory?(path)
|
93
|
-
if depth < @options[:depth]
|
94
|
-
@queue.push -> { search(path, depth + 1) }
|
95
|
-
end
|
96
|
-
elsif File.symlink?(path)
|
97
|
-
next
|
98
|
-
else
|
99
|
-
begin
|
100
|
-
digest = Digest::SHA1.hexdigest(File.read(path))
|
101
|
-
@mutex.synchronize do
|
102
|
-
@duplicates[digest] ||= []
|
103
|
-
@duplicates[digest] << path
|
104
|
-
end
|
105
|
-
rescue Errno::EINVAL => e
|
106
|
-
$stderr.puts "#{path}: #{e}"
|
107
|
-
end
|
108
|
-
end
|
109
|
-
end
|
110
|
-
@duplicates
|
111
|
-
end
|
73
|
+
# Prog ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
112
74
|
|
113
|
-
|
75
|
+
dup_finder = DupFinder.new(options: options)
|
76
|
+
ARGV.each { |path| dup_finder.queue(path) }
|
77
|
+
dup_finder.search
|
114
78
|
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
until @queue.empty?
|
120
|
-
threads = []
|
121
|
-
@options[:threads].times do
|
122
|
-
_proc = @queue.shift
|
123
|
-
threads << Thread.new { _proc.call } if _proc
|
124
|
-
end
|
125
|
-
threads.each(&:join)
|
126
|
-
end
|
127
|
-
|
128
|
-
# Trim non dups
|
129
|
-
@duplicates = @duplicates
|
130
|
-
.values
|
131
|
-
.reject do |files|
|
132
|
-
files.length < 2
|
133
|
-
end
|
134
|
-
|
135
|
-
# Sort
|
136
|
-
@duplicates.each do |dups|
|
137
|
-
dups = dups.sort do |a, b|
|
138
|
-
case @options[:sort]
|
139
|
-
when 'atime'
|
140
|
-
File.atime(a) <=> File.atime(a)
|
141
|
-
when 'mtime'
|
142
|
-
File.mtime(a) <=> File.mtime(a)
|
143
|
-
when 'ctime'
|
144
|
-
File.ctime(a) <=> File.ctime(a)
|
145
|
-
else
|
146
|
-
a.length <=> b.length
|
147
|
-
end
|
79
|
+
if options[:output]
|
80
|
+
File.open(options[:output], 'w') do |file|
|
81
|
+
file.write(JSON.pretty_generate(dup_finder.duplicate_entries))
|
148
82
|
end
|
83
|
+
else
|
84
|
+
$stdout.puts JSON.pretty_generate(dup_finder.duplicate_entries)
|
149
85
|
end
|
150
|
-
|
151
|
-
$stdout.puts JSON.pretty_generate(@duplicates)
|
data/lib/dup_finder.rb
ADDED
@@ -0,0 +1,142 @@
|
|
1
|
+
class DupFinder
|
2
|
+
attr_accessor :options
|
3
|
+
|
4
|
+
def initialize(hashed: {}, options: {})
|
5
|
+
@queue = []
|
6
|
+
@hashed = hashed
|
7
|
+
@options = {
|
8
|
+
depth: Float::INFINITY,
|
9
|
+
ignore: [],
|
10
|
+
ignore_empty: true,
|
11
|
+
cache: true,
|
12
|
+
alg: :md5,
|
13
|
+
cache_to_tmp: true,
|
14
|
+
}.merge(options)
|
15
|
+
@cache = {}
|
16
|
+
end
|
17
|
+
|
18
|
+
def queue(directory, depth = 0)
|
19
|
+
@queue << [directory, depth]
|
20
|
+
end
|
21
|
+
|
22
|
+
def search
|
23
|
+
until @queue.empty?
|
24
|
+
directory, depth = @queue.shift
|
25
|
+
|
26
|
+
unless depth > @options[:depth]
|
27
|
+
hash_entries(directory)
|
28
|
+
end
|
29
|
+
end
|
30
|
+
|
31
|
+
self
|
32
|
+
end
|
33
|
+
|
34
|
+
def duplicate_entries
|
35
|
+
dups = {}
|
36
|
+
|
37
|
+
@hashed.each do |(path, hash)|
|
38
|
+
dups[hash] ||= []
|
39
|
+
dups[hash] << path
|
40
|
+
end
|
41
|
+
|
42
|
+
dups.values.reject { |paths| paths.length < 2 }
|
43
|
+
end
|
44
|
+
|
45
|
+
private # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
46
|
+
|
47
|
+
def ignored?(path)
|
48
|
+
return true if @options[:ignore_empty] && File.zero?(path)
|
49
|
+
|
50
|
+
@options[:ignore].any? do |pattern|
|
51
|
+
File.fnmatch(pattern, path, File::FNM_EXTGLOB)
|
52
|
+
end
|
53
|
+
end
|
54
|
+
|
55
|
+
def hash_entries(directory, depth: 0)
|
56
|
+
Dir.entries(directory).each do |entry|
|
57
|
+
next if entry.start_with?('.')
|
58
|
+
path = File.join(directory, entry)
|
59
|
+
|
60
|
+
Signal.trap("INT") do
|
61
|
+
write_cache(directory) if @options[:cache]
|
62
|
+
exit 1
|
63
|
+
end
|
64
|
+
|
65
|
+
next if File.symlink?(path)
|
66
|
+
next if ignored?(path)
|
67
|
+
|
68
|
+
if File.directory?(path)
|
69
|
+
queue(path, depth + 1)
|
70
|
+
else
|
71
|
+
if @options[:cache]
|
72
|
+
digest = cached_hash(path)
|
73
|
+
else
|
74
|
+
digest = hash_entry(path)
|
75
|
+
end
|
76
|
+
end
|
77
|
+
end
|
78
|
+
|
79
|
+
write_cache(directory) if @options[:cache]
|
80
|
+
end
|
81
|
+
|
82
|
+
def cached_hash(path)
|
83
|
+
directory = File.dirname(path)
|
84
|
+
file_name = File.basename(path)
|
85
|
+
@cache[directory] ||= load_cache(directory)
|
86
|
+
@cache[directory][file_name] ||= {}
|
87
|
+
meta_data = @cache[directory][file_name]
|
88
|
+
|
89
|
+
if meta_data.key?(options[:alg])
|
90
|
+
meta_data[options[:alg]]
|
91
|
+
else
|
92
|
+
meta_data[options[:alg]] = hash_entry(path)
|
93
|
+
end
|
94
|
+
end
|
95
|
+
|
96
|
+
def cache_path(directory)
|
97
|
+
if @options[:cache_to_tmp]
|
98
|
+
File.join("/tmp/file_hashes", File.expand_path(directory), "hashes.yml")
|
99
|
+
else
|
100
|
+
File.join(directory, "hashes.yml")
|
101
|
+
end
|
102
|
+
end
|
103
|
+
|
104
|
+
def load_cache(directory)
|
105
|
+
File.exist?(cache_path(directory)) ? YAML.load_file(cache_path(directory)) : {}
|
106
|
+
end
|
107
|
+
|
108
|
+
def write_cache(directory)
|
109
|
+
@cache[directory] ||= {}
|
110
|
+
@cache[directory].keep_if do |file_name, hash|
|
111
|
+
File.exist?(File.join(directory, file_name))
|
112
|
+
end
|
113
|
+
|
114
|
+
FileUtils.mkdir_p(File.dirname(cache_path(directory)))
|
115
|
+
|
116
|
+
File.open(cache_path(directory), 'w') do |file|
|
117
|
+
file.write(YAML.dump(@cache[directory]))
|
118
|
+
end
|
119
|
+
end
|
120
|
+
|
121
|
+
def hash_entry(path)
|
122
|
+
return @hashed[path] if @hashed.key?(path)
|
123
|
+
|
124
|
+
digest =
|
125
|
+
case @options[:alg]
|
126
|
+
when :md5 then Digest::MD5.new
|
127
|
+
when :sha1 then Digest::SHA1.new
|
128
|
+
end
|
129
|
+
|
130
|
+
File.open(path, 'rb') do |file|
|
131
|
+
while data = file.read(1024 * 1024)
|
132
|
+
digest.update(data)
|
133
|
+
end
|
134
|
+
end
|
135
|
+
|
136
|
+
@hashed[path] = digest.hexdigest
|
137
|
+
digest.hexdigest
|
138
|
+
rescue Errno::EINVAL => e
|
139
|
+
$stderr.puts "#{path}: #{e}"
|
140
|
+
nil
|
141
|
+
end
|
142
|
+
end
|
data/lib/finddups/version.rb
CHANGED
metadata
CHANGED
@@ -1,16 +1,16 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: finddups
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Alex Clink
|
8
|
-
autorequire:
|
8
|
+
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2023-05-02 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
|
-
description:
|
13
|
+
description:
|
14
14
|
email:
|
15
15
|
- code@alexclink.com
|
16
16
|
executables:
|
@@ -30,6 +30,7 @@ files:
|
|
30
30
|
- bin/setup
|
31
31
|
- exe/finddups
|
32
32
|
- finddups.gemspec
|
33
|
+
- lib/dup_finder.rb
|
33
34
|
- lib/finddups.rb
|
34
35
|
- lib/finddups/version.rb
|
35
36
|
homepage: https://github.com/SleepingInsomniac/finddups
|
@@ -38,7 +39,7 @@ licenses:
|
|
38
39
|
metadata:
|
39
40
|
homepage_uri: https://github.com/SleepingInsomniac/finddups
|
40
41
|
source_code_uri: https://github.com/SleepingInsomniac/finddups
|
41
|
-
post_install_message:
|
42
|
+
post_install_message:
|
42
43
|
rdoc_options: []
|
43
44
|
require_paths:
|
44
45
|
- lib
|
@@ -53,8 +54,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
53
54
|
- !ruby/object:Gem::Version
|
54
55
|
version: '0'
|
55
56
|
requirements: []
|
56
|
-
rubygems_version: 3.
|
57
|
-
signing_key:
|
57
|
+
rubygems_version: 3.4.9
|
58
|
+
signing_key:
|
58
59
|
specification_version: 4
|
59
60
|
summary: Shows duplicate files within a list of directories and outputs as JSON.
|
60
61
|
test_files: []
|