finddups 0.1.0 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 1cb43cbb4ba1ea49a71b5d369cffbd0decf21f0c77764b2252d4211b7b1ccb65
4
- data.tar.gz: cc04d2a23bcb19c8b8849e815e847df2472f681d46e6e7007d8cf8880acae1c6
3
+ metadata.gz: d1bdd387a3145a5a49a979e946be3668e2c8da432928f69f4e6e22518b2bd8ba
4
+ data.tar.gz: 78de1056705cfb1afee0e3c6e5dd4a54b80107e8466e875d23aaeaaeaaad1a2a
5
5
  SHA512:
6
- metadata.gz: 1c46b2d368ba239f703bf20a4c5297cf03d9f9275ea5e924a87112a2d9d92d8488e4b1e37453aec7b81b4b294629d2fe39a0d6e76114981eefa8b1c23e4b61d7
7
- data.tar.gz: ec31d5d9d66ac0081c8d0571a9a7783006df85f63f5a0711b1fbc10e3e2249fb2eb758804eabe6cc4db740c07c4517586686d1e9c1e5a140f9d766314ceb9dca
6
+ metadata.gz: 60c7b54ca17cfc307bdd9f0eb1102b2c3a8b041a4b314de6fbcd3e2f5aa61fb9a598e651a9614a03f0589f5ad2257db504890bedc1b5f1119ea41b96c9605d6d
7
+ data.tar.gz: 65f26108b4480c66f5b0e80f4927873438eeb2b486b6c04f5637eb06aba30b4c384e3b9584cf22a4ef03cabb2193c7b3f15271104132d92b2b62c0381a7a043a
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- finddups (0.1.0)
4
+ finddups (0.2.0)
5
5
 
6
6
  GEM
7
7
  remote: https://rubygems.org/
data/README.md CHANGED
@@ -1,6 +1,7 @@
1
1
  # Finddups
2
2
 
3
3
  Shows duplicate files within a list of directories and outputs as JSON.
4
+ This is accomplished by generating a hash digest for each file and comparing the hashes.
4
5
 
5
6
  ## Installation
6
7
 
@@ -9,17 +10,17 @@ Shows duplicate files within a list of directories and outputs as JSON.
9
10
  ## Usage
10
11
 
11
12
  ```
12
- finddups (version 0.1.0)
13
+ finddups 0.2.0
13
14
  Usage: finddups [dirs] [options]
14
15
  -i, --ignore path ignore paths
15
- --atime (default) Use file access time to sort duplicates
16
- --mtime Use file modification time to sort duplicates
17
- --ctime Use file change time to sort duplicates
18
- (the time at which directory information about the file was changed, not the file itself)
19
- -t, --threads threads Number of threads to use (default 16)
20
16
  -d, --depth depth Max depth to search
17
+ --[no-]cache Perform caching
18
+ --[no-]cache-to-tmp Save cache files to /tmp/file_hashes
19
+ -o, --output path Output file path
20
+ --alg alg Hashing algorithm (SHA1, MD5)
21
+ --[no-]ignore-empty Ignore empty files
21
22
  -h, --help Show this help
22
- -v Show version
23
+ -v, --version Show version
23
24
  ```
24
25
 
25
26
  Example:
data/exe/finddups CHANGED
@@ -2,53 +2,54 @@
2
2
 
3
3
  require 'digest/sha1'
4
4
  require 'json'
5
+ require 'yaml'
5
6
  require 'fileutils'
6
- require 'thread'
7
7
  require 'optparse'
8
8
 
9
9
  $LOAD_PATH.unshift(File.join(__dir__, '..', 'lib'))
10
10
 
11
11
  require 'finddups'
12
+ require 'dup_finder'
12
13
 
13
- @options = {
14
- sort: 'atime',
15
- depth: Float::INFINITY,
16
- ignore: [],
17
- threads: 16,
18
- }
14
+ options = {}
19
15
 
20
16
  optparser = OptionParser.new do |opts|
21
17
  opts.banner = <<~BANNER
22
- finddups (version #{Finddups::VERSION})
18
+ finddups #{Finddups::VERSION}
23
19
  Usage: #{File.basename(__FILE__)} [dirs] [options]
24
20
  BANNER
25
21
 
26
22
  opts.on("-i path", "--ignore path", "ignore paths") do |path|
27
- @options[:ignore] << path
23
+ options[:ignore] ||= []
24
+ options[:ignore] << path
28
25
  end
29
26
 
30
- opts.on("--atime", "(default) Use file access time to sort duplicates") do
31
- @options[:sort] = 'atime'
27
+ opts.on("-d depth", "--depth depth", "Max depth to search") do |depth|
28
+ options[:depth] = depth.to_i
32
29
  end
33
30
 
34
- opts.on("--mtime", "Use file modification time to sort duplicates") do
35
- @options[:sort] = 'mtime'
31
+ opts.on("--[no-]cache", "Perform caching") do |value|
32
+ options[:cache] = value
36
33
  end
37
34
 
38
- desc = <<~DESC
39
- Use file change time to sort duplicates
40
- (the time at which directory information about the file was changed, not the file itself)
41
- DESC
42
- opts.on("--ctime", desc) do
43
- @options[:sort] = 'ctime'
35
+ opts.on("--[no-]cache-to-tmp", "Save cache files to /tmp/file_hashes") do |value|
36
+ options[:cache_to_tmp] = value
44
37
  end
45
38
 
46
- opts.on("-t threads", "--threads threads", "Number of threads to use (default 16)") do |threads|
47
- @options[:threads] = threads.to_i
39
+ opts.on("-o path", "--output path", "Output file path") do |path|
40
+ options[:output] = path
48
41
  end
49
42
 
50
- opts.on("-d depth", "--depth depth", "Max depth to search") do |depth|
51
- @options[:depth] = depth.to_i
43
+ opts.on("--alg alg", "Hashing algorithm (SHA1, MD5)") do |alg|
44
+ options[:alg] = alg.downcase.to_sym
45
+ unless %i[md5 sha1].include?(options[:alg])
46
+ $stderr.puts "Unsupported algorithm: #{options[:alg]}"
47
+ exit 1
48
+ end
49
+ end
50
+
51
+ opts.on("--[no-]ignore-empty", "Ignore empty files") do |value|
52
+ options[:ignore_empty] = value
52
53
  end
53
54
 
54
55
  opts.on("-h", "--help", "Show this help") do
@@ -56,8 +57,8 @@ optparser = OptionParser.new do |opts|
56
57
  exit
57
58
  end
58
59
 
59
- opts.on("-v", "Show version") do
60
- puts "finddups (version #{Finddups::VERSION})"
60
+ opts.on("-v", "--version", "Show version") do
61
+ puts "finddups #{Finddups::VERSION}"
61
62
  exit
62
63
  end
63
64
  end
@@ -69,83 +70,16 @@ if ARGV.empty? || ARGV.any? { |entry| !File.directory?(entry) }
69
70
  exit 1
70
71
  end
71
72
 
72
- # ========
73
- # = Prog =
74
- # ========
75
-
76
- search_dirs = ARGV
77
- trash_dir = "/tmp/duplicates/"
78
-
79
- @mutex = Mutex.new
80
- @queue = []
81
-
82
- def search(directory, depth = 0)
83
- # puts "Searching: #{directory}"
84
-
85
- # Skips
86
- return @duplicates if @options[:ignore].include?(File.basename(directory))
87
-
88
- Dir.entries(directory).each do |entry|
89
- next if entry.start_with?('.')
90
- path = File.join(directory, entry)
91
-
92
- if File.directory?(path)
93
- if depth < @options[:depth]
94
- @queue.push -> { search(path, depth + 1) }
95
- end
96
- elsif File.symlink?(path)
97
- next
98
- else
99
- begin
100
- digest = Digest::SHA1.hexdigest(File.read(path))
101
- @mutex.synchronize do
102
- @duplicates[digest] ||= []
103
- @duplicates[digest] << path
104
- end
105
- rescue Errno::EINVAL => e
106
- $stderr.puts "#{path}: #{e}"
107
- end
108
- end
109
- end
110
- @duplicates
111
- end
73
+ # Prog ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
112
74
 
113
- @duplicates = {}
75
+ dup_finder = DupFinder.new(options: options)
76
+ ARGV.each { |path| dup_finder.queue(path) }
77
+ dup_finder.search
114
78
 
115
- search_dirs.each do |search_dir|
116
- @queue.push -> { search(search_dir) }
117
- end
118
-
119
- until @queue.empty?
120
- threads = []
121
- @options[:threads].times do
122
- _proc = @queue.shift
123
- threads << Thread.new { _proc.call } if _proc
124
- end
125
- threads.each(&:join)
126
- end
127
-
128
- # Trim non dups
129
- @duplicates = @duplicates
130
- .values
131
- .reject do |files|
132
- files.length < 2
133
- end
134
-
135
- # Stort
136
- @duplicates.each do |dups|
137
- dups = dups.sort do |a, b|
138
- case @options[:sort]
139
- when 'atime'
140
- File.atime(a) <=> File.atime(a)
141
- when 'mtime'
142
- File.mtime(a) <=> File.mtime(a)
143
- when 'ctime'
144
- File.ctime(a) <=> File.ctime(a)
145
- else
146
- a.length <=> b.length
147
- end
79
+ if options[:output]
80
+ File.open(options[:output], 'w') do |file|
81
+ file.write(JSON.pretty_generate(dup_finder.duplicate_entries))
148
82
  end
83
+ else
84
+ $stdout.puts JSON.pretty_generate(dup_finder.duplicate_entries)
149
85
  end
150
-
151
- $stdout.puts JSON.pretty_generate(@duplicates)
data/lib/dup_finder.rb ADDED
@@ -0,0 +1,142 @@
1
+ class DupFinder
2
+ attr_accessor :options
3
+
4
+ def initialize(hashed: {}, options: {})
5
+ @queue = []
6
+ @hashed = hashed
7
+ @options = {
8
+ depth: Float::INFINITY,
9
+ ignore: [],
10
+ ignore_empty: true,
11
+ cache: true,
12
+ alg: :md5,
13
+ cache_to_tmp: true,
14
+ }.merge(options)
15
+ @cache = {}
16
+ end
17
+
18
+ def queue(directory, depth = 0)
19
+ @queue << [directory, depth]
20
+ end
21
+
22
+ def search
23
+ until @queue.empty?
24
+ directory, depth = @queue.shift
25
+
26
+ unless depth > @options[:depth]
27
+ hash_entries(directory)
28
+ end
29
+ end
30
+
31
+ self
32
+ end
33
+
34
+ def duplicate_entries
35
+ dups = {}
36
+
37
+ @hashed.each do |(path, hash)|
38
+ dups[hash] ||= []
39
+ dups[hash] << path
40
+ end
41
+
42
+ dups.values.reject { |paths| paths.length < 2 }
43
+ end
44
+
45
+ private # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
46
+
47
+ def ignored?(path)
48
+ return true if @options[:ignore_empty] && File.zero?(path)
49
+
50
+ @options[:ignore].any? do |pattern|
51
+ File.fnmatch(pattern, path, File::FNM_EXTGLOB)
52
+ end
53
+ end
54
+
55
+ def hash_entries(directory, depth: 0)
56
+ Dir.entries(directory).each do |entry|
57
+ next if entry.start_with?('.')
58
+ path = File.join(directory, entry)
59
+
60
+ Signal.trap("INT") do
61
+ write_cache(directory) if @options[:cache]
62
+ exit 1
63
+ end
64
+
65
+ next if File.symlink?(path)
66
+ next if ignored?(path)
67
+
68
+ if File.directory?(path)
69
+ queue(path, depth + 1)
70
+ else
71
+ if @options[:cache]
72
+ digest = cached_hash(path)
73
+ else
74
+ digest = hash_entry(path)
75
+ end
76
+ end
77
+ end
78
+
79
+ write_cache(directory) if @options[:cache]
80
+ end
81
+
82
+ def cached_hash(path)
83
+ directory = File.dirname(path)
84
+ file_name = File.basename(path)
85
+ @cache[directory] ||= load_cache(directory)
86
+ @cache[directory][file_name] ||= {}
87
+ meta_data = @cache[directory][file_name]
88
+
89
+ if meta_data.key?(options[:alg])
90
+ meta_data[options[:alg]]
91
+ else
92
+ meta_data[options[:alg]] = hash_entry(path)
93
+ end
94
+ end
95
+
96
+ def cache_path(directory)
97
+ if @options[:cache_to_tmp]
98
+ File.join("/tmp/file_hashes", File.expand_path(directory), "hashes.yml")
99
+ else
100
+ File.join(directory, "hashes.yml")
101
+ end
102
+ end
103
+
104
+ def load_cache(directory)
105
+ File.exist?(cache_path(directory)) ? YAML.load_file(cache_path(directory)) : {}
106
+ end
107
+
108
+ def write_cache(directory)
109
+ @cache[directory] ||= {}
110
+ @cache[directory].keep_if do |file_name, hash|
111
+ File.exist?(File.join(directory, file_name))
112
+ end
113
+
114
+ FileUtils.mkdir_p(File.dirname(cache_path(directory)))
115
+
116
+ File.open(cache_path(directory), 'w') do |file|
117
+ file.write(YAML.dump(@cache[directory]))
118
+ end
119
+ end
120
+
121
+ def hash_entry(path)
122
+ return @hashed[path] if @hashed.key?(path)
123
+
124
+ digest =
125
+ case @options[:alg]
126
+ when :md5 then Digest::MD5.new
127
+ when :sha1 then Digest::SHA1.new
128
+ end
129
+
130
+ File.open(path, 'rb') do |file|
131
+ while data = file.read(1024 * 1024)
132
+ digest.update(data)
133
+ end
134
+ end
135
+
136
+ @hashed[path] = digest.hexdigest
137
+ digest.hexdigest
138
+ rescue Errno::EINVAL => e
139
+ $stderr.puts "#{path}: #{e}"
140
+ nil
141
+ end
142
+ end
@@ -1,3 +1,3 @@
1
1
  module Finddups
2
- VERSION = "0.1.0"
2
+ VERSION = "0.2.0"
3
3
  end
metadata CHANGED
@@ -1,16 +1,16 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: finddups
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Alex Clink
8
- autorequire:
8
+ autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2020-06-03 00:00:00.000000000 Z
11
+ date: 2023-05-02 00:00:00.000000000 Z
12
12
  dependencies: []
13
- description:
13
+ description:
14
14
  email:
15
15
  - code@alexclink.com
16
16
  executables:
@@ -30,6 +30,7 @@ files:
30
30
  - bin/setup
31
31
  - exe/finddups
32
32
  - finddups.gemspec
33
+ - lib/dup_finder.rb
33
34
  - lib/finddups.rb
34
35
  - lib/finddups/version.rb
35
36
  homepage: https://github.com/SleepingInsomniac/finddups
@@ -38,7 +39,7 @@ licenses:
38
39
  metadata:
39
40
  homepage_uri: https://github.com/SleepingInsomniac/finddups
40
41
  source_code_uri: https://github.com/SleepingInsomniac/finddups
41
- post_install_message:
42
+ post_install_message:
42
43
  rdoc_options: []
43
44
  require_paths:
44
45
  - lib
@@ -53,8 +54,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
53
54
  - !ruby/object:Gem::Version
54
55
  version: '0'
55
56
  requirements: []
56
- rubygems_version: 3.1.3
57
- signing_key:
57
+ rubygems_version: 3.4.9
58
+ signing_key:
58
59
  specification_version: 4
59
60
  summary: Shows duplicate files within a list of directories and outputs as JSON.
60
61
  test_files: []