finddups 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -1
- data/README.md +8 -7
- data/exe/finddups +35 -101
- data/lib/dup_finder.rb +142 -0
- data/lib/finddups/version.rb +1 -1
- metadata +8 -7
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: d1bdd387a3145a5a49a979e946be3668e2c8da432928f69f4e6e22518b2bd8ba
|
4
|
+
data.tar.gz: 78de1056705cfb1afee0e3c6e5dd4a54b80107e8466e875d23aaeaaeaaad1a2a
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 60c7b54ca17cfc307bdd9f0eb1102b2c3a8b041a4b314de6fbcd3e2f5aa61fb9a598e651a9614a03f0589f5ad2257db504890bedc1b5f1119ea41b96c9605d6d
|
7
|
+
data.tar.gz: 65f26108b4480c66f5b0e80f4927873438eeb2b486b6c04f5637eb06aba30b4c384e3b9584cf22a4ef03cabb2193c7b3f15271104132d92b2b62c0381a7a043a
|
data/Gemfile.lock
CHANGED
data/README.md
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
# Finddups
|
2
2
|
|
3
3
|
Shows duplicate files within a list of directories and outputs as JSON.
|
4
|
+
This is accomplished by generating a hash digest for each file and comparing the hashes.
|
4
5
|
|
5
6
|
## Installation
|
6
7
|
|
@@ -9,17 +10,17 @@ Shows duplicate files within a list of directories and outputs as JSON.
|
|
9
10
|
## Usage
|
10
11
|
|
11
12
|
```
|
12
|
-
finddups
|
13
|
+
finddups 0.2.0
|
13
14
|
Usage: finddups [dirs] [options]
|
14
15
|
-i, --ignore path ignore paths
|
15
|
-
--atime (default) Use file access time to sort duplicates
|
16
|
-
--mtime Use file modification time to sort duplicates
|
17
|
-
--ctime Use file change time to sort duplicates
|
18
|
-
(the time at which directory information about the file was changed, not the file itself)
|
19
|
-
-t, --threads threads Number of threads to use (default 16)
|
20
16
|
-d, --depth depth Max depth to search
|
17
|
+
--[no-]cache Perform caching
|
18
|
+
--[no-]cache-to-tmp Save cache files to /tmp/file_hashes
|
19
|
+
-o, --output path Output file path
|
20
|
+
--alg alg Hashing algorithm (SHA1, MD5)
|
21
|
+
--[no-]ignore-empty Ignore empty files
|
21
22
|
-h, --help Show this help
|
22
|
-
-v
|
23
|
+
-v, --version Show version
|
23
24
|
```
|
24
25
|
|
25
26
|
Example:
|
data/exe/finddups
CHANGED
@@ -2,53 +2,54 @@
|
|
2
2
|
|
3
3
|
require 'digest/sha1'
|
4
4
|
require 'json'
|
5
|
+
require 'yaml'
|
5
6
|
require 'fileutils'
|
6
|
-
require 'thread'
|
7
7
|
require 'optparse'
|
8
8
|
|
9
9
|
$LOAD_PATH.unshift(File.join(__dir__, '..', 'lib'))
|
10
10
|
|
11
11
|
require 'finddups'
|
12
|
+
require 'dup_finder'
|
12
13
|
|
13
|
-
|
14
|
-
sort: 'atime',
|
15
|
-
depth: Float::INFINITY,
|
16
|
-
ignore: [],
|
17
|
-
threads: 16,
|
18
|
-
}
|
14
|
+
options = {}
|
19
15
|
|
20
16
|
optparser = OptionParser.new do |opts|
|
21
17
|
opts.banner = <<~BANNER
|
22
|
-
finddups
|
18
|
+
finddups #{Finddups::VERSION}
|
23
19
|
Usage: #{File.basename(__FILE__)} [dirs] [options]
|
24
20
|
BANNER
|
25
21
|
|
26
22
|
opts.on("-i path", "--ignore path", "ignore paths") do |path|
|
27
|
-
|
23
|
+
options[:ignore] ||= []
|
24
|
+
options[:ignore] << path
|
28
25
|
end
|
29
26
|
|
30
|
-
opts.on("
|
31
|
-
|
27
|
+
opts.on("-d depth", "--depth depth", "Max depth to search") do |depth|
|
28
|
+
options[:depth] = depth.to_i
|
32
29
|
end
|
33
30
|
|
34
|
-
opts.on("--
|
35
|
-
|
31
|
+
opts.on("--[no-]cache", "Perform caching") do |value|
|
32
|
+
options[:cache] = value
|
36
33
|
end
|
37
34
|
|
38
|
-
|
39
|
-
|
40
|
-
(the time at which directory information about the file was changed, not the file itself)
|
41
|
-
DESC
|
42
|
-
opts.on("--ctime", desc) do
|
43
|
-
@options[:sort] = 'ctime'
|
35
|
+
opts.on("--[no-]cache-to-tmp", "Save cache files to /tmp/file_hashes") do |value|
|
36
|
+
options[:cache_to_tmp] = value
|
44
37
|
end
|
45
38
|
|
46
|
-
opts.on("-
|
47
|
-
|
39
|
+
opts.on("-o path", "--output path", "Output file path") do |path|
|
40
|
+
options[:output] = path
|
48
41
|
end
|
49
42
|
|
50
|
-
opts.on("
|
51
|
-
|
43
|
+
opts.on("--alg alg", "Hashing algorithm (SHA1, MD5)") do |alg|
|
44
|
+
options[:alg] = alg.downcase.to_sym
|
45
|
+
unless %i[md5 sha1].include?(options[:alg])
|
46
|
+
$stderr.puts "Unsupported algorithm: #{options[:alg]}"
|
47
|
+
exit 1
|
48
|
+
end
|
49
|
+
end
|
50
|
+
|
51
|
+
opts.on("--[no-]ignore-empty", "Ignore empty files") do |value|
|
52
|
+
options[:ignore_empty] = value
|
52
53
|
end
|
53
54
|
|
54
55
|
opts.on("-h", "--help", "Show this help") do
|
@@ -56,8 +57,8 @@ optparser = OptionParser.new do |opts|
|
|
56
57
|
exit
|
57
58
|
end
|
58
59
|
|
59
|
-
opts.on("-v", "Show version") do
|
60
|
-
puts "finddups
|
60
|
+
opts.on("-v", "--version", "Show version") do
|
61
|
+
puts "finddups #{Finddups::VERSION}"
|
61
62
|
exit
|
62
63
|
end
|
63
64
|
end
|
@@ -69,83 +70,16 @@ if ARGV.empty? || ARGV.any? { |entry| !File.directory?(entry) }
|
|
69
70
|
exit 1
|
70
71
|
end
|
71
72
|
|
72
|
-
#
|
73
|
-
# = Prog =
|
74
|
-
# ========
|
75
|
-
|
76
|
-
search_dirs = ARGV
|
77
|
-
trash_dir = "/tmp/duplicates/"
|
78
|
-
|
79
|
-
@mutex = Mutex.new
|
80
|
-
@queue = []
|
81
|
-
|
82
|
-
def search(directory, depth = 0)
|
83
|
-
# puts "Searching: #{directory}"
|
84
|
-
|
85
|
-
# Skips
|
86
|
-
return @duplicates if @options[:ignore].include?(File.basename(directory))
|
87
|
-
|
88
|
-
Dir.entries(directory).each do |entry|
|
89
|
-
next if entry.start_with?('.')
|
90
|
-
path = File.join(directory, entry)
|
91
|
-
|
92
|
-
if File.directory?(path)
|
93
|
-
if depth < @options[:depth]
|
94
|
-
@queue.push -> { search(path, depth + 1) }
|
95
|
-
end
|
96
|
-
elsif File.symlink?(path)
|
97
|
-
next
|
98
|
-
else
|
99
|
-
begin
|
100
|
-
digest = Digest::SHA1.hexdigest(File.read(path))
|
101
|
-
@mutex.synchronize do
|
102
|
-
@duplicates[digest] ||= []
|
103
|
-
@duplicates[digest] << path
|
104
|
-
end
|
105
|
-
rescue Errno::EINVAL => e
|
106
|
-
$stderr.puts "#{path}: #{e}"
|
107
|
-
end
|
108
|
-
end
|
109
|
-
end
|
110
|
-
@duplicates
|
111
|
-
end
|
73
|
+
# Prog ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
112
74
|
|
113
|
-
|
75
|
+
dup_finder = DupFinder.new(options: options)
|
76
|
+
ARGV.each { |path| dup_finder.queue(path) }
|
77
|
+
dup_finder.search
|
114
78
|
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
until @queue.empty?
|
120
|
-
threads = []
|
121
|
-
@options[:threads].times do
|
122
|
-
_proc = @queue.shift
|
123
|
-
threads << Thread.new { _proc.call } if _proc
|
124
|
-
end
|
125
|
-
threads.each(&:join)
|
126
|
-
end
|
127
|
-
|
128
|
-
# Trim non dups
|
129
|
-
@duplicates = @duplicates
|
130
|
-
.values
|
131
|
-
.reject do |files|
|
132
|
-
files.length < 2
|
133
|
-
end
|
134
|
-
|
135
|
-
# Stort
|
136
|
-
@duplicates.each do |dups|
|
137
|
-
dups = dups.sort do |a, b|
|
138
|
-
case @options[:sort]
|
139
|
-
when 'atime'
|
140
|
-
File.atime(a) <=> File.atime(a)
|
141
|
-
when 'mtime'
|
142
|
-
File.mtime(a) <=> File.mtime(a)
|
143
|
-
when 'ctime'
|
144
|
-
File.ctime(a) <=> File.ctime(a)
|
145
|
-
else
|
146
|
-
a.length <=> b.length
|
147
|
-
end
|
79
|
+
if options[:output]
|
80
|
+
File.open(options[:output], 'w') do |file|
|
81
|
+
file.write(JSON.pretty_generate(dup_finder.duplicate_entries))
|
148
82
|
end
|
83
|
+
else
|
84
|
+
$stdout.puts JSON.pretty_generate(dup_finder.duplicate_entries)
|
149
85
|
end
|
150
|
-
|
151
|
-
$stdout.puts JSON.pretty_generate(@duplicates)
|
data/lib/dup_finder.rb
ADDED
@@ -0,0 +1,142 @@
|
|
1
|
+
class DupFinder
|
2
|
+
attr_accessor :options
|
3
|
+
|
4
|
+
def initialize(hashed: {}, options: {})
|
5
|
+
@queue = []
|
6
|
+
@hashed = hashed
|
7
|
+
@options = {
|
8
|
+
depth: Float::INFINITY,
|
9
|
+
ignore: [],
|
10
|
+
ignore_empty: true,
|
11
|
+
cache: true,
|
12
|
+
alg: :md5,
|
13
|
+
cache_to_tmp: true,
|
14
|
+
}.merge(options)
|
15
|
+
@cache = {}
|
16
|
+
end
|
17
|
+
|
18
|
+
def queue(directory, depth = 0)
|
19
|
+
@queue << [directory, depth]
|
20
|
+
end
|
21
|
+
|
22
|
+
def search
|
23
|
+
until @queue.empty?
|
24
|
+
directory, depth = @queue.shift
|
25
|
+
|
26
|
+
unless depth > @options[:depth]
|
27
|
+
hash_entries(directory)
|
28
|
+
end
|
29
|
+
end
|
30
|
+
|
31
|
+
self
|
32
|
+
end
|
33
|
+
|
34
|
+
def duplicate_entries
|
35
|
+
dups = {}
|
36
|
+
|
37
|
+
@hashed.each do |(path, hash)|
|
38
|
+
dups[hash] ||= []
|
39
|
+
dups[hash] << path
|
40
|
+
end
|
41
|
+
|
42
|
+
dups.values.reject { |paths| paths.length < 2 }
|
43
|
+
end
|
44
|
+
|
45
|
+
private # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
46
|
+
|
47
|
+
def ignored?(path)
|
48
|
+
return true if @options[:ignore_empty] && File.zero?(path)
|
49
|
+
|
50
|
+
@options[:ignore].any? do |pattern|
|
51
|
+
File.fnmatch(pattern, path, File::FNM_EXTGLOB)
|
52
|
+
end
|
53
|
+
end
|
54
|
+
|
55
|
+
def hash_entries(directory, depth: 0)
|
56
|
+
Dir.entries(directory).each do |entry|
|
57
|
+
next if entry.start_with?('.')
|
58
|
+
path = File.join(directory, entry)
|
59
|
+
|
60
|
+
Signal.trap("INT") do
|
61
|
+
write_cache(directory) if @options[:cache]
|
62
|
+
exit 1
|
63
|
+
end
|
64
|
+
|
65
|
+
next if File.symlink?(path)
|
66
|
+
next if ignored?(path)
|
67
|
+
|
68
|
+
if File.directory?(path)
|
69
|
+
queue(path, depth + 1)
|
70
|
+
else
|
71
|
+
if @options[:cache]
|
72
|
+
digest = cached_hash(path)
|
73
|
+
else
|
74
|
+
digest = hash_entry(path)
|
75
|
+
end
|
76
|
+
end
|
77
|
+
end
|
78
|
+
|
79
|
+
write_cache(directory) if @options[:cache]
|
80
|
+
end
|
81
|
+
|
82
|
+
def cached_hash(path)
|
83
|
+
directory = File.dirname(path)
|
84
|
+
file_name = File.basename(path)
|
85
|
+
@cache[directory] ||= load_cache(directory)
|
86
|
+
@cache[directory][file_name] ||= {}
|
87
|
+
meta_data = @cache[directory][file_name]
|
88
|
+
|
89
|
+
if meta_data.key?(options[:alg])
|
90
|
+
meta_data[options[:alg]]
|
91
|
+
else
|
92
|
+
meta_data[options[:alg]] = hash_entry(path)
|
93
|
+
end
|
94
|
+
end
|
95
|
+
|
96
|
+
def cache_path(directory)
|
97
|
+
if @options[:cache_to_tmp]
|
98
|
+
File.join("/tmp/file_hashes", File.expand_path(directory), "hashes.yml")
|
99
|
+
else
|
100
|
+
File.join(directory, "hashes.yml")
|
101
|
+
end
|
102
|
+
end
|
103
|
+
|
104
|
+
def load_cache(directory)
|
105
|
+
File.exist?(cache_path(directory)) ? YAML.load_file(cache_path(directory)) : {}
|
106
|
+
end
|
107
|
+
|
108
|
+
def write_cache(directory)
|
109
|
+
@cache[directory] ||= {}
|
110
|
+
@cache[directory].keep_if do |file_name, hash|
|
111
|
+
File.exist?(File.join(directory, file_name))
|
112
|
+
end
|
113
|
+
|
114
|
+
FileUtils.mkdir_p(File.dirname(cache_path(directory)))
|
115
|
+
|
116
|
+
File.open(cache_path(directory), 'w') do |file|
|
117
|
+
file.write(YAML.dump(@cache[directory]))
|
118
|
+
end
|
119
|
+
end
|
120
|
+
|
121
|
+
def hash_entry(path)
|
122
|
+
return @hashed[path] if @hashed.key?(path)
|
123
|
+
|
124
|
+
digest =
|
125
|
+
case @options[:alg]
|
126
|
+
when :md5 then Digest::MD5.new
|
127
|
+
when :sha1 then Digest::SHA1.new
|
128
|
+
end
|
129
|
+
|
130
|
+
File.open(path, 'rb') do |file|
|
131
|
+
while data = file.read(1024 * 1024)
|
132
|
+
digest.update(data)
|
133
|
+
end
|
134
|
+
end
|
135
|
+
|
136
|
+
@hashed[path] = digest.hexdigest
|
137
|
+
digest.hexdigest
|
138
|
+
rescue Errno::EINVAL => e
|
139
|
+
$stderr.puts "#{path}: #{e}"
|
140
|
+
nil
|
141
|
+
end
|
142
|
+
end
|
data/lib/finddups/version.rb
CHANGED
metadata
CHANGED
@@ -1,16 +1,16 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: finddups
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Alex Clink
|
8
|
-
autorequire:
|
8
|
+
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2023-05-02 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
|
-
description:
|
13
|
+
description:
|
14
14
|
email:
|
15
15
|
- code@alexclink.com
|
16
16
|
executables:
|
@@ -30,6 +30,7 @@ files:
|
|
30
30
|
- bin/setup
|
31
31
|
- exe/finddups
|
32
32
|
- finddups.gemspec
|
33
|
+
- lib/dup_finder.rb
|
33
34
|
- lib/finddups.rb
|
34
35
|
- lib/finddups/version.rb
|
35
36
|
homepage: https://github.com/SleepingInsomniac/finddups
|
@@ -38,7 +39,7 @@ licenses:
|
|
38
39
|
metadata:
|
39
40
|
homepage_uri: https://github.com/SleepingInsomniac/finddups
|
40
41
|
source_code_uri: https://github.com/SleepingInsomniac/finddups
|
41
|
-
post_install_message:
|
42
|
+
post_install_message:
|
42
43
|
rdoc_options: []
|
43
44
|
require_paths:
|
44
45
|
- lib
|
@@ -53,8 +54,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
53
54
|
- !ruby/object:Gem::Version
|
54
55
|
version: '0'
|
55
56
|
requirements: []
|
56
|
-
rubygems_version: 3.
|
57
|
-
signing_key:
|
57
|
+
rubygems_version: 3.4.9
|
58
|
+
signing_key:
|
58
59
|
specification_version: 4
|
59
60
|
summary: Shows duplicate files within a list of directories and outputs as JSON.
|
60
61
|
test_files: []
|