finddup 0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (6) hide show
  1. data/LICENSE.txt +21 -0
  2. data/README.txt +22 -0
  3. data/VERSION +1 -0
  4. data/bin/finddup +5 -0
  5. data/lib/finddup.rb +162 -0
  6. metadata +57 -0
data/LICENSE.txt ADDED
@@ -0,0 +1,21 @@
1
+ Copyright 2012 Juha-Jarmo Heinonen
2
+ o@sorsacode.com
3
+
4
+ Permission is hereby granted, free of charge, to any person obtaining
5
+ a copy of this software and associated documentation files (the
6
+ "Software"), to deal in the Software without restriction, including
7
+ without limitation the rights to use, copy, modify, merge, publish,
8
+ distribute, sublicense, and/or sell copies of the Software, and to
9
+ permit persons to whom the Software is furnished to do so, subject to
10
+ the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be
13
+ included in all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
19
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
20
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
21
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.txt ADDED
@@ -0,0 +1,22 @@
1
+ This utility finds duplicate files. It's as simple as that.
2
+
3
+ To install finddup, you must have ruby installed, then just run this:
4
+ gem install finddup
5
+
6
+ Usage:
7
+ By default, finddup searches for duplicate files under your current working directory.
8
+ If you want to search another directory, just use that as the only argument.
9
+
10
+ Output:
11
+ If there are no duplicates to find, finddup simply doesn't output anything.
12
+ While it's searching, it's updating the status line like this:
13
+ - Scrolling throbber (of -/|\ characters) while it's reading big files.
14
+ - A simple dot (.), when it's scanning a directory
15
+ - An asterisk (*), when it's found a duplicate
16
+ - An exclamation mark (!), when it's found a big file (over 2MB by default).
17
+ After the searching, it does another pass for the big files, comparing their sizes first.
18
+ Duplicates are reported in groups of two or more files with the same content.
19
+ Each group in the output begins with "Duplicate files:" and is terminated with an extra "\n".
20
+
21
+ Homepage and source repository:
22
+ http://github.com/jammi/finddup
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.1
data/bin/finddup ADDED
@@ -0,0 +1,5 @@
1
#!ruby
# Command-line launcher for finddup.

# On Ctrl-C (SIGINT), leave via a plain `exit` instead of Ruby's default
# interrupt handling.
Signal.trap('INT') { exit }

# Loading the library file is what starts the search.
require_relative '../lib/finddup.rb'
data/lib/finddup.rb ADDED
@@ -0,0 +1,162 @@
1
+ #!ruby
2
+
3
require 'digest/sha2'

# Recursive duplicate-file finder.
#
# Constructing an instance runs the whole program: it validates the command
# line, walks the source directory, hashes file contents with SHA-256 and
# prints every group of files that share a digest.
#
# Files larger than BIGFILE_SIZE are not hashed during the directory walk.
# They are collected and hashed in a second pass, and only when at least one
# other big file has exactly the same size.
class DupFind
  # User-facing strings and the throbber frames.
  STR = {
    :usage => "usage: #{$0} [path]\n[path] is the root directory of searches, and defaults to the current directory.",
    :dir_not_found => "Error! Directory not found: ",
    :dir_not_dir => "Error! Not a directory: ",
    :empty_head => "Empty files:",
    :dup_head => "Duplicate files:",
    :dup_tail => "",
    :arr_delimit => "\n",
    :progrstr => [' - ',' / ',' | ',' \ ']
  }.freeze

  # Files strictly larger than this are deferred to the size-matching pass.
  BIGFILE_SIZE = 2*1024*1024 # 2 MiB

  # Chunk size used when File::Stat#blksize is unavailable (nil) or not positive.
  DEFAULT_BLKSIZE = 65536

  # Runs the complete search for the given argument list and prints the report.
  #
  # argv - Array of command-line arguments; at most one entry (the root path).
  def initialize( argv )
    @argv = argv
    check_argv
    @files_by_sum = {}        # digest => [paths having that digest]
    @ignores = [ '.', '..' ]  # directory entries never visited
    @empty = []               # zero-length files (collected, not reported)
    @dup = []                 # digests shared by two or more files, in discovery order
    @bigfiles = []            # [path, stat] pairs deferred to find_big_by_size
    @progrstate = 0
    @lastchr = '...'
    @progrlast = 0
    progress ' _ '
    scan @src_path
    #putsarr( :empty_head, @empty ) unless @empty.empty?
    find_big_by_size
    # Return to column 0 only after the big-file pass, so status characters
    # printed during that pass are overwritten by the report instead of
    # being prefixed to it.
    print "\r"
    @dup.each do |digest|
      putsarr( :dup_head, @files_by_sum[digest] )
      puts STR[:dup_tail]
    end
  end

  # Prints the usage text and terminates the process.
  def help
    puts STR[:usage]
    exit
  end

  # Thin wrappers around the File predicates.
  def exists?( path ); File.exist?( path ); end
  def dir?( path ); File.directory?( path ); end
  def file?( path ); File.file?( path ); end
  def read?( path ); File.readable?( path ); end
  def symlink?( path ); File.symlink?( path ); end

  # Writes the message stored under +what+, followed by +where+, to stderr
  # and terminates with a non-zero exit status so callers can detect failure.
  def err( what, where='' )
    warn "#{STR.fetch( what )}#{where}"
    exit 1
  end

  # Aborts via #err unless +path+ exists and is a directory.
  def check_src_dir( path )
    err :dir_not_found, path unless exists? path
    err :dir_not_dir, path unless dir? path
  end

  # Absolute form of +path+, resolved against +parent+ when given.
  def full_path( path, parent=nil ); File.expand_path( path, parent ); end

  # True when the directory entry name +fn+ must be skipped.
  def ignore?( fn ); @ignores.include?( fn ); end

  # Validates @argv and stores the absolute root directory in @src_path.
  # Shows the usage text when more than one argument is given; falls back to
  # the current working directory when none is given.
  def check_argv
    help if @argv.length > 1
    src_path = full_path( @argv.empty? ? Dir.pwd : @argv.first )
    check_src_dir( src_path )
    @src_path = src_path
  end

  # Starts a fresh SHA-256 computation in @sha.
  def newsha; @sha = Digest::SHA256.new; end

  # Updates the one-slot status display on stdout.
  #
  # progchr - status string to show. When nil, the next throbber frame is
  #           used, but at most once every 0.1 seconds.
  def progress( progchr=nil )
    if progchr.nil?
      time_now = Time.now.to_f
      # Throttled: nothing new to show, leave the display and state alone.
      return unless time_now - @progrlast > 0.1
      frames = STR[:progrstr]
      @progrstate = 0 if @progrstate == frames.length
      progchr = frames[@progrstate]
      @progrstate += 1
      @progrlast = time_now
    end
    return if @lastchr == progchr
    @lastchr = progchr
    print "\r#{progchr}"
    $stdout.flush
  end

  # Feeds the whole file into @sha in one read. The read is binary so the
  # digest covers the exact bytes on disk, the same as #digest_large.
  def digest_small( path )
    @sha << File.binread( path )
  end

  # Feeds the file into @sha in chunks of +blksize+ bytes, advancing the
  # throbber between chunks. The block form closes the file even on error.
  def digest_large( path, blksize=65536 )
    File.open( path, 'rb' ) do |f|
      progress
      while ( chunk = f.read( blksize ) )
        @sha << chunk
        progress
      end
    end
  end

  # Returns the binary SHA-256 digest of the file at +path+.
  #
  # fstat - stat-like object responding to #size and #blksize.
  def read_digest( path, fstat )
    blksize = fstat.blksize
    # blksize can be nil on platforms that do not report it.
    blksize = DEFAULT_BLKSIZE unless blksize && blksize > 0
    newsha
    if fstat.size < blksize
      digest_small( path )
    else
      digest_large( path, blksize )
    end
    @sha.digest
  end

  # Records +path+ under +digest+ and returns true when that digest was
  # already known. The digest is queued in @dup the first time a second file
  # with it shows up.
  def is_dup?( digest, path )
    if @files_by_sum.key?( digest )
      progress ' * '
      known = @files_by_sum[ digest ]
      @dup << digest if known.length == 1
      known << path
      return true
    end
    @files_by_sum[ digest ] = [ path ]
    false
  end

  # Classifies or hashes one file.
  #
  # Without +fstat+ the file is stat'ed first: empty files go to @empty, big
  # files are deferred to @bigfiles, and neither is hashed. With +fstat+
  # (the second pass) the file is always hashed.
  def handle_file( path, fstat=nil )
    if fstat.nil?
      fstat = File.stat( path )
      if fstat.size.zero?
        @empty << path
        return
      end
      if fstat.size > BIGFILE_SIZE
        @bigfiles << [ path, fstat ]
        progress ' ! '
        return
      end
    end
    is_dup?( read_digest( path, fstat ), path )
  end

  # Recursively walks +parent+, handling every readable regular file.
  # Symlinks are never followed.
  def scan( parent )
    progress ' . '
    begin
      entries = Dir.entries( parent )
    rescue Errno::EACCES, Errno::ENOENT
      # Unreadable or vanished directory: skip it, the same way unreadable
      # files are skipped below, rather than aborting the whole search.
      return
    end
    entries.each do |fn|
      next if ignore? fn
      path = full_path( fn, parent )
      next if symlink? path
      if dir? path
        scan path
      elsif file?( path ) && read?( path )
        handle_file path
      end
    end
  end

  # Second pass: hashes the deferred big files, but only those sharing their
  # exact size with at least one other big file. Each such file is hashed
  # exactly once, so a file can never be reported as a duplicate of itself.
  # Leaves @bigfiles empty.
  def find_big_by_size
    pending = @bigfiles
    @bigfiles = []
    pending.group_by { |_path, fstat| fstat.size }.each_value do |same_size|
      next if same_size.length < 2
      same_size.each { |path, fstat| handle_file( path, fstat ) }
    end
  end

  # Prints the heading stored under +head+ followed by the sorted entries of
  # +arr+, one per line.
  def putsarr( head, arr )
    puts STR[head]
    puts arr.sort.join(STR[:arr_delimit])
  end
end
162
+ DupFind.new(ARGV)
metadata ADDED
@@ -0,0 +1,57 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: finddup
3
+ version: !ruby/object:Gem::Version
4
+ version: "0.1"
5
+ platform: ruby
6
+ authors:
7
+ - Juha-Jarmo Heinonen
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2013-01-05 00:00:00 +02:00
13
+ default_executable:
14
+ dependencies: []
15
+
16
+ description: This utility finds duplicate files. It's as simple as that.
17
+ email: o@sorsacode.com
18
+ executables:
19
+ - finddup
20
+ extensions: []
21
+
22
+ extra_rdoc_files: []
23
+
24
+ files:
25
+ - lib/finddup.rb
26
+ - bin/finddup
27
+ - README.txt
28
+ - LICENSE.txt
29
+ - VERSION
30
+ has_rdoc: false
31
+ homepage: http://github.com/jammi/finddup/
32
+ post_install_message:
33
+ rdoc_options: []
34
+
35
+ require_paths:
36
+ - lib
37
+ required_ruby_version: !ruby/object:Gem::Requirement
38
+ requirements:
39
+ - - ">="
40
+ - !ruby/object:Gem::Version
41
+ version: 1.9.1
42
+ version:
43
+ required_rubygems_version: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: "0"
48
+ version:
49
+ requirements: []
50
+
51
+ rubyforge_project:
52
+ rubygems_version: 1.3.1
53
+ signing_key:
54
+ specification_version: 2
55
+ summary: A simple recursive duplicate file finder
56
+ test_files: []
57
+