finddups 0.1.0 → 0.2.0

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
-   metadata.gz: 1cb43cbb4ba1ea49a71b5d369cffbd0decf21f0c77764b2252d4211b7b1ccb65
-   data.tar.gz: cc04d2a23bcb19c8b8849e815e847df2472f681d46e6e7007d8cf8880acae1c6
+   metadata.gz: d1bdd387a3145a5a49a979e946be3668e2c8da432928f69f4e6e22518b2bd8ba
+   data.tar.gz: 78de1056705cfb1afee0e3c6e5dd4a54b80107e8466e875d23aaeaaeaaad1a2a
  SHA512:
-   metadata.gz: 1c46b2d368ba239f703bf20a4c5297cf03d9f9275ea5e924a87112a2d9d92d8488e4b1e37453aec7b81b4b294629d2fe39a0d6e76114981eefa8b1c23e4b61d7
-   data.tar.gz: ec31d5d9d66ac0081c8d0571a9a7783006df85f63f5a0711b1fbc10e3e2249fb2eb758804eabe6cc4db740c07c4517586686d1e9c1e5a140f9d766314ceb9dca
+   metadata.gz: 60c7b54ca17cfc307bdd9f0eb1102b2c3a8b041a4b314de6fbcd3e2f5aa61fb9a598e651a9614a03f0589f5ad2257db504890bedc1b5f1119ea41b96c9605d6d
+   data.tar.gz: 65f26108b4480c66f5b0e80f4927873438eeb2b486b6c04f5637eb06aba30b4c384e3b9584cf22a4ef03cabb2193c7b3f15271104132d92b2b62c0381a7a043a
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
  PATH
    remote: .
    specs:
-     finddups (0.1.0)
+     finddups (0.2.0)

  GEM
    remote: https://rubygems.org/
data/README.md CHANGED
@@ -1,6 +1,7 @@
  # Finddups

  Shows duplicate files within a list of directories and outputs as JSON.
+ This is accomplished by generating a hash digest for each file and comparing the hashes.

  ## Installation

@@ -9,17 +10,17 @@ Shows duplicate files within a list of directories and outputs as JSON.
  ## Usage

  ```
- finddups (version 0.1.0)
+ finddups 0.2.0
  Usage: finddups [dirs] [options]
      -i, --ignore path         ignore paths
-     --atime                   (default) Use file access time to sort duplicates
-     --mtime                   Use file modification time to sort duplicates
-     --ctime                   Use file change time to sort duplicates
-                               (the time at which directory information about the file was changed, not the file itself)
-     -t, --threads threads     Number of threads to use (default 16)
      -d, --depth depth         Max depth to search
+     --[no-]cache              Perform caching
+     --[no-]cache-to-tmp       Save cache files to /tmp/file_hashes
+     -o, --output path         Output file path
+     --alg alg                 Hashing algorithm (SHA1, MD5)
+     --[no-]ignore-empty       Ignore empty files
      -h, --help                Show this help
-     -v                        Show version
+     -v, --version             Show version
  ```

  Example:
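The approach the README now describes — compute a digest for every file, then group paths that share a digest — reduces to a few lines of plain Ruby. The sketch below is illustrative only, not the gem's code; the scanned directory is a placeholder:

```ruby
# Illustrative sketch of hash-and-compare duplicate detection (not finddups itself).
require 'digest/sha1'
require 'json'

base = '/tmp/example_dir' # placeholder directory
groups = Hash.new { |hash, key| hash[key] = [] }

Dir.glob('**/*', base: base).each do |relative|
  path = File.join(base, relative)
  next unless File.file?(path)

  # Files with identical content produce identical digests.
  groups[Digest::SHA1.file(path).hexdigest] << path
end

duplicates = groups.values.select { |paths| paths.length > 1 }
puts JSON.pretty_generate(duplicates)
```

The gem layers directory queueing, ignore patterns, and digest caching on top of this core idea, as the diffs below show.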
data/exe/finddups CHANGED
@@ -2,53 +2,54 @@

  require 'digest/sha1'
  require 'json'
+ require 'yaml'
  require 'fileutils'
- require 'thread'
  require 'optparse'

  $LOAD_PATH.unshift(File.join(__dir__, '..', 'lib'))

  require 'finddups'
+ require 'dup_finder'

- @options = {
-   sort: 'atime',
-   depth: Float::INFINITY,
-   ignore: [],
-   threads: 16,
- }
+ options = {}

  optparser = OptionParser.new do |opts|
    opts.banner = <<~BANNER
-     finddups (version #{Finddups::VERSION})
+     finddups #{Finddups::VERSION}
      Usage: #{File.basename(__FILE__)} [dirs] [options]
    BANNER

    opts.on("-i path", "--ignore path", "ignore paths") do |path|
-     @options[:ignore] << path
+     options[:ignore] ||= []
+     options[:ignore] << path
    end

-   opts.on("--atime", "(default) Use file access time to sort duplicates") do
-     @options[:sort] = 'atime'
+   opts.on("-d depth", "--depth depth", "Max depth to search") do |depth|
+     options[:depth] = depth.to_i
    end

-   opts.on("--mtime", "Use file modification time to sort duplicates") do
-     @options[:sort] = 'mtime'
+   opts.on("--[no-]cache", "Perform caching") do |value|
+     options[:cache] = value
    end

-   desc = <<~DESC
-     Use file change time to sort duplicates
-     (the time at which directory information about the file was changed, not the file itself)
-   DESC
-   opts.on("--ctime", desc) do
-     @options[:sort] = 'ctime'
+   opts.on("--[no-]cache-to-tmp", "Save cache files to /tmp/file_hashes") do |value|
+     options[:cache_to_tmp] = value
    end

-   opts.on("-t threads", "--threads threads", "Number of threads to use (default 16)") do |threads|
-     @options[:threads] = threads.to_i
+   opts.on("-o path", "--output path", "Output file path") do |path|
+     options[:output] = path
    end

-   opts.on("-d depth", "--depth depth", "Max depth to search") do |depth|
-     @options[:depth] = depth.to_i
+   opts.on("--alg alg", "Hashing algorithm (SHA1, MD5)") do |alg|
+     options[:alg] = alg.downcase.to_sym
+     unless %i[md5 sha1].include?(options[:alg])
+       $stderr.puts "Unsupported algorithm: #{options[:alg]}"
+       exit 1
+     end
+   end
+
+   opts.on("--[no-]ignore-empty", "Ignore empty files") do |value|
+     options[:ignore_empty] = value
    end

    opts.on("-h", "--help", "Show this help") do
@@ -56,8 +57,8 @@ optparser = OptionParser.new do |opts|
      exit
    end

-   opts.on("-v", "Show version") do
-     puts "finddups (version #{Finddups::VERSION})"
+   opts.on("-v", "--version", "Show version") do
+     puts "finddups #{Finddups::VERSION}"
      exit
    end
  end
@@ -69,83 +70,16 @@ if ARGV.empty? || ARGV.any? { |entry| !File.directory?(entry) }
    exit 1
  end

- # ========
- # = Prog =
- # ========
-
- search_dirs = ARGV
- trash_dir = "/tmp/duplicates/"
-
- @mutex = Mutex.new
- @queue = []
-
- def search(directory, depth = 0)
-   # puts "Searching: #{directory}"
-
-   # Skips
-   return @duplicates if @options[:ignore].include?(File.basename(directory))
-
-   Dir.entries(directory).each do |entry|
-     next if entry.start_with?('.')
-     path = File.join(directory, entry)
-
-     if File.directory?(path)
-       if depth < @options[:depth]
-         @queue.push -> { search(path, depth + 1) }
-       end
-     elsif File.symlink?(path)
-       next
-     else
-       begin
-         digest = Digest::SHA1.hexdigest(File.read(path))
-         @mutex.synchronize do
-           @duplicates[digest] ||= []
-           @duplicates[digest] << path
-         end
-       rescue Errno::EINVAL => e
-         $stderr.puts "#{path}: #{e}"
-       end
-     end
-   end
-   @duplicates
- end
+ # Prog ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

- @duplicates = {}
+ dup_finder = DupFinder.new(options: options)
+ ARGV.each { |path| dup_finder.queue(path) }
+ dup_finder.search

- search_dirs.each do |search_dir|
-   @queue.push -> { search(search_dir) }
- end
-
- until @queue.empty?
-   threads = []
-   @options[:threads].times do
-     _proc = @queue.shift
-     threads << Thread.new { _proc.call } if _proc
-   end
-   threads.each(&:join)
- end
-
- # Trim non dups
- @duplicates = @duplicates
-   .values
-   .reject do |files|
-     files.length < 2
-   end
-
- # Stort
- @duplicates.each do |dups|
-   dups = dups.sort do |a, b|
-     case @options[:sort]
-     when 'atime'
-       File.atime(a) <=> File.atime(a)
-     when 'mtime'
-       File.mtime(a) <=> File.mtime(a)
-     when 'ctime'
-       File.ctime(a) <=> File.ctime(a)
-     else
-       a.length <=> b.length
-     end
+ if options[:output]
+   File.open(options[:output], 'w') do |file|
+     file.write(JSON.pretty_generate(dup_finder.duplicate_entries))
    end
+ else
+   $stdout.puts JSON.pretty_generate(dup_finder.duplicate_entries)
  end
-
- $stdout.puts JSON.pretty_generate(@duplicates)
data/lib/dup_finder.rb ADDED
@@ -0,0 +1,142 @@
+ class DupFinder
+   attr_accessor :options
+
+   def initialize(hashed: {}, options: {})
+     @queue = []
+     @hashed = hashed
+     @options = {
+       depth: Float::INFINITY,
+       ignore: [],
+       ignore_empty: true,
+       cache: true,
+       alg: :md5,
+       cache_to_tmp: true,
+     }.merge(options)
+     @cache = {}
+   end
+
+   def queue(directory, depth = 0)
+     @queue << [directory, depth]
+   end
+
+   def search
+     until @queue.empty?
+       directory, depth = @queue.shift
+
+       unless depth > @options[:depth]
+         hash_entries(directory)
+       end
+     end
+
+     self
+   end
+
+   def duplicate_entries
+     dups = {}
+
+     @hashed.each do |(path, hash)|
+       dups[hash] ||= []
+       dups[hash] << path
+     end
+
+     dups.values.reject { |paths| paths.length < 2 }
+   end
+
+   private # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+   def ignored?(path)
+     return true if @options[:ignore_empty] && File.zero?(path)
+
+     @options[:ignore].any? do |pattern|
+       File.fnmatch(pattern, path, File::FNM_EXTGLOB)
+     end
+   end
+
+   def hash_entries(directory, depth: 0)
+     Dir.entries(directory).each do |entry|
+       next if entry.start_with?('.')
+       path = File.join(directory, entry)
+
+       Signal.trap("INT") do
+         write_cache(directory) if @options[:cache]
+         exit 1
+       end
+
+       next if File.symlink?(path)
+       next if ignored?(path)
+
+       if File.directory?(path)
+         queue(path, depth + 1)
+       else
+         if @options[:cache]
+           digest = cached_hash(path)
+         else
+           digest = hash_entry(path)
+         end
+       end
+     end
+
+     write_cache(directory) if @options[:cache]
+   end
+
+   def cached_hash(path)
+     directory = File.dirname(path)
+     file_name = File.basename(path)
+     @cache[directory] ||= load_cache(directory)
+     @cache[directory][file_name] ||= {}
+     meta_data = @cache[directory][file_name]
+
+     if meta_data.key?(options[:alg])
+       meta_data[options[:alg]]
+     else
+       meta_data[options[:alg]] = hash_entry(path)
+     end
+   end
+
+   def cache_path(directory)
+     if @options[:cache_to_tmp]
+       File.join("/tmp/file_hashes", File.expand_path(directory), "hashes.yml")
+     else
+       File.join(directory, "hashes.yml")
+     end
+   end
+
+   def load_cache(directory)
+     File.exist?(cache_path(directory)) ? YAML.load_file(cache_path(directory)) : {}
+   end
+
+   def write_cache(directory)
+     @cache[directory] ||= {}
+     @cache[directory].keep_if do |file_name, hash|
+       File.exist?(File.join(directory, file_name))
+     end
+
+     FileUtils.mkdir_p(File.dirname(cache_path(directory)))
+
+     File.open(cache_path(directory), 'w') do |file|
+       file.write(YAML.dump(@cache[directory]))
+     end
+   end
+
+   def hash_entry(path)
+     return @hashed[path] if @hashed.key?(path)
+
+     digest =
+       case @options[:alg]
+       when :md5 then Digest::MD5.new
+       when :sha1 then Digest::SHA1.new
+       end
+
+     File.open(path, 'rb') do |file|
+       while data = file.read(1024 * 1024)
+         digest.update(data)
+       end
+     end
+
+     @hashed[path] = digest.hexdigest
+     digest.hexdigest
+   rescue Errno::EINVAL => e
+     $stderr.puts "#{path}: #{e}"
+     nil
+   end
+ end
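Tying the pieces together: as the exe/finddups diff above shows, the CLI builds a DupFinder, queues each directory from ARGV, calls search, and serializes duplicate_entries. Below is a minimal standalone usage sketch along the same lines; the directory paths are hypothetical, and caching is disabled here so nothing is written to /tmp/file_hashes:

```ruby
# Assumes the gem's lib directory is on the load path, as exe/finddups arranges.
require 'digest'
require 'json'
require 'dup_finder'

# Defaults come from DupFinder#initialize (MD5, caching on); override as needed.
finder = DupFinder.new(options: { alg: :sha1, cache: false })

%w[/tmp/photos /tmp/backups].each { |dir| finder.queue(dir) } # hypothetical paths
finder.search

# duplicate_entries returns an array of path groups that share a digest.
puts JSON.pretty_generate(finder.duplicate_entries)
```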
data/lib/finddups/version.rb CHANGED
@@ -1,3 +1,3 @@
  module Finddups
-   VERSION = "0.1.0"
+   VERSION = "0.2.0"
  end
metadata CHANGED
@@ -1,16 +1,16 @@
  --- !ruby/object:Gem::Specification
  name: finddups
  version: !ruby/object:Gem::Version
-   version: 0.1.0
+   version: 0.2.0
  platform: ruby
  authors:
  - Alex Clink
- autorequire:
+ autorequire:
  bindir: exe
  cert_chain: []
- date: 2020-06-03 00:00:00.000000000 Z
+ date: 2023-05-02 00:00:00.000000000 Z
  dependencies: []
- description:
+ description:
  email:
  - code@alexclink.com
  executables:
@@ -30,6 +30,7 @@ files:
  - bin/setup
  - exe/finddups
  - finddups.gemspec
+ - lib/dup_finder.rb
  - lib/finddups.rb
  - lib/finddups/version.rb
  homepage: https://github.com/SleepingInsomniac/finddups
@@ -38,7 +39,7 @@ licenses:
  metadata:
    homepage_uri: https://github.com/SleepingInsomniac/finddups
    source_code_uri: https://github.com/SleepingInsomniac/finddups
- post_install_message:
+ post_install_message:
  rdoc_options: []
  require_paths:
  - lib
@@ -53,8 +54,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
  - !ruby/object:Gem::Version
    version: '0'
  requirements: []
- rubygems_version: 3.1.3
- signing_key:
+ rubygems_version: 3.4.9
+ signing_key:
  specification_version: 4
  summary: Shows duplicate files within a list of directories and outputs as JSON.
  test_files: []