RubyGems - finddup - Versions diffs - 0.1 - Mend

finddup 0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (6) hide show

data/LICENSE.txt ADDED Viewed

@@ -0,0 +1,21 @@
+Copyright 2012 Juha-Jarmo Heinonen
+o@sorsacode.com
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

data/README.txt ADDED Viewed

@@ -0,0 +1,22 @@
+This utility finds duplicate files. It's as simple as that.
+To install finddup, you must have ruby installed, then just run this:
+  gem install finddup
+Usage:
+Finddup by default finds files under your working directory.
+If you want to search another directory, just use that as the only argument.
+Output:
+If there are no duplicates to find, finddup simply doesn't output anything.
+While it's searching, it's updating the status line like this:
+ - Scrolling throbber (of -/|\ characters) while it's reading big files.
+ - A simple dot (.), when it's scanning a directory
+ - An asterisk (*), when it's found a duplicate
+ - An exclamation mark (!), when it's found a big file (over 2MB by default).
+After the searching, it does another pass for the big files, comparing their sizes first.
+Duplicates are reported in groups of two or more files with the same content.
+The output is delimited by "Duplicate files:" and terminated with an extra "\n".
+Homepage and source repository:
+http://github.com/jammi/finddup

data/VERSION ADDED Viewed

	@@ -0,0 +1 @@
1	+ 0.1

data/bin/finddup ADDED Viewed

@@ -0,0 +1,5 @@
#!ruby
# Command-line launcher for finddup.

# Leave quietly when the user presses Ctrl-C.
Signal.trap( 'INT' ) { exit }

# Loading the library is what starts the search: lib/finddup.rb ends by
# calling DupFind.new(ARGV).
require_relative '../lib/finddup.rb'

data/lib/finddup.rb ADDED Viewed

@@ -0,0 +1,162 @@
+#!ruby
require 'digest/sha2'

# Recursive duplicate-file finder.
#
# Creating an instance runs the whole search: the argument list is validated,
# the directory tree is scanned, and every group of files sharing a SHA-256
# digest is printed to standard output under a "Duplicate files:" heading.
# While the search runs, a short status indicator is rewritten in place
# (using "\r") on standard output.
class DupFind
  # User-facing messages and the frames of the progress throbber.
  STR = {
    :usage => "usage: #{$0} [path]\n[path] is the root directory of searches, and defaults to the current directory.",
    :dir_not_found => "Error! Directory not found: ",
    :dir_not_dir => "Error! Not a directory: ",
    :dir_unreadable => "Warning! Skipping unreadable directory: ",
    :empty_head => "Empty files:",
    :dup_head => "Duplicate files:",
    :dup_tail => "",
    :arr_delimit => "\n",
    :progrstr => [' - ',' / ',' | ',' \ ']
  }.freeze

  # Files larger than this are not digested during the directory scan; they
  # are queued and only digested if another queued file has the same size.
  BIGFILE_SIZE = 2*1024*1024 # 2 MiB

  # Chunk size used when File::Stat#blksize is unavailable (it is nil on
  # platforms that do not report it).
  DEFAULT_BLKSIZE = 65536

  # Runs the complete search and prints the report.
  #
  # argv - Array of command-line arguments: empty (search the working
  #        directory) or one directory path. More than one argument prints
  #        the usage text and exits; a missing or non-directory path prints
  #        an error and exits with status 1.
  def initialize( argv )
    @argv = argv; check_argv
    @files_by_sum = {}          # digest => paths having that digest
    @ignores = [ '.', '..' ]
    @empty = []                 # zero-length files (collected, not reported)
    @dup = []                   # digests owned by two or more files
    @bigfiles = []              # [ path, stat ] pairs deferred to the second pass
    @progrstate = 0
    @lastchr = '...'
    @progrlast = 0
    progress ' _ '
    scan @src_path
    #putsarr( :empty_head, @empty ) unless @empty.empty?
    find_big_by_size
    # Return to column 0 only after the last progress update, so that the
    # report does not start in the middle of the status line.
    print "\r"
    @dup.each do |digest|
      putsarr( :dup_head, @files_by_sum[digest] )
      puts STR[:dup_tail]
    end
  end

  # Prints the usage text and terminates the process.
  def help; puts STR[:usage]; exit; end

  # Thin wrappers around the File predicates of the same meaning.
  def exists?( path ); File.exist?( path ); end
  def dir?( path ); File.directory?( path ); end
  def file?( path ); File.file?( path ); end
  def read?( path ); File.readable?( path ); end
  def symlink?( path ); File.symlink?( path ); end

  # Writes the message stored under STR[what], followed by +where+, to
  # standard error and terminates with a failing exit status.
  def err( what, where='' ); warn STR[what]+where; exit 1; end

  # Aborts via #err unless +path+ exists and is a directory.
  def check_src_dir( path )
    err :dir_not_found, path unless exists? path
    err   :dir_not_dir, path unless    dir? path
  end

  # Absolute form of +path+, resolved against +parent+ when given.
  def full_path( path, parent=nil ); File.expand_path( path, parent ); end

  # True for directory entries that must never be visited ('.' and '..').
  def ignore?( fn ); @ignores.include?( fn ); end

  # Validates @argv and stores the absolute search root in @src_path.
  def check_argv
    help if @argv.length > 1
    src_dir = @argv.empty? ? Dir.pwd : @argv.first
    src_path = full_path( src_dir )
    check_src_dir( src_path )
    @src_path = src_path
  end

  # Starts a fresh SHA-256 computation in @sha.
  def newsha; @sha = Digest::SHA256.new; end

  # Updates the in-place status indicator.
  #
  # progchr - the indicator to show; when nil the next throbber frame is
  #           used, at most once every 0.1 seconds.
  def progress( progchr=nil )
    if progchr.nil?
      time_now = Time.now.to_f
      # No frame is due yet: leave the status line and @lastchr untouched.
      return unless time_now - @progrlast > 0.1
      frames = STR[:progrstr]
      progchr = frames[@progrstate]
      @progrstate = ( @progrstate + 1 ) % frames.length
      @progrlast = time_now
    end
    return if @lastchr == progchr
    @lastchr = progchr
    print "\r#{progchr}"
    $stdout.flush
  end

  # Feeds the whole file into @sha in one read. The read is binary so the
  # bytes hashed are the same ones #digest_large would hash.
  def digest_small( path )
    @sha << File.binread( path )
  end

  # Feeds the file into @sha in chunks of +blksize+ bytes, advancing the
  # throbber between chunks. The block form of File.open closes the handle
  # even if reading raises.
  def digest_large( path, blksize=65536 )
    File.open( path, 'rb' ) do |f|
      progress
      while ( fdata = f.read( blksize ) )
        @sha << fdata
        progress
      end
    end
  end

  # Returns the binary SHA-256 digest of the file at +path+.
  #
  # fstat - object responding to #size and #blksize (normally a File::Stat).
  #         Files smaller than one block are read in one go, larger ones in
  #         block-sized chunks.
  def read_digest( path, fstat )
    blksize = fstat.blksize
    blksize = DEFAULT_BLKSIZE if blksize.nil? || blksize <= 0
    newsha
    if fstat.size < blksize
      digest_small( path )
    else
      digest_large( path, blksize )
    end
    @sha.digest
  end

  # Records +path+ under +digest+. Returns true when the digest was already
  # known (a duplicate), false when this is its first occurrence. A digest is
  # appended to @dup the moment its second file shows up.
  def is_dup?( digest, path )
    if @files_by_sum.has_key? digest
      progress ' * '
      known = @files_by_sum[ digest ]
      @dup << digest if known.length == 1
      known << path
      return true
    end
    @files_by_sum[ digest ] = [ path ]
    false
  end

  # Processes a single regular file.
  #
  # Without +fstat+ (first pass) the file is stat'ed: empty files are noted
  # in @empty and files over BIGFILE_SIZE are queued in @bigfiles; both
  # return nil without being digested. With +fstat+ (second pass) the file is
  # always digested. Returns the result of #is_dup? when a digest was taken.
  def handle_file( path, fstat=nil )
    if fstat.nil?
      fstat = File.stat( path )
      if fstat.size == 0
        @empty << path
        return
      end
      if fstat.size > BIGFILE_SIZE
        @bigfiles << [ path, fstat ]
        progress ' ! '
        return
      end
    end
    digest = read_digest( path, fstat )
    is_dup?( digest, path )
  end

  # Recursively walks +parent+, handling every readable regular file.
  # Symbolic links (to files or directories) are skipped.
  def scan( parent )
    progress ' . '
    begin
      entries = Dir.entries( parent )
    rescue SystemCallError
      # One unreadable directory should not abort the whole search.
      warn "\r" + STR[:dir_unreadable] + parent
      return
    end
    entries.each do |fn|
      next if ignore? fn
      path = full_path( fn, parent )
      next if symlink? path
      if dir? path
        scan path
      elsif file?( path ) && read?( path )
        handle_file path
      end
    end
  end

  # Second pass over the queued big files: only files that share their size
  # with at least one other queued file can be duplicates, so only those are
  # digested. Each candidate is digested exactly once, so a path is never
  # listed more than once in a duplicate group. The queue is left empty.
  def find_big_by_size
    by_size = @bigfiles.group_by { |_path, fstat| fstat.size }
    @bigfiles.clear
    by_size.each_value do |group|
      next if group.length < 2
      group.each { |path, fstat| handle_file( path, fstat ) }
    end
  end

  # Prints the heading stored under STR[head] followed by the sorted items
  # of +arr+, one per line.
  def putsarr( head, arr )
    puts STR[head]
    puts arr.sort.join(STR[:arr_delimit])
  end
end
# Start the search as soon as this file is loaded; the bin/finddup launcher
# depends on this load-time side effect.
DupFind.new( ARGV )

metadata ADDED Viewed

@@ -0,0 +1,57 @@
+--- !ruby/object:Gem::Specification
+name: finddup
+version: !ruby/object:Gem::Version
+  version: "0.1"
+platform: ruby
+authors:
+- Juha-Jarmo Heinonen
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2013-01-05 00:00:00 +02:00
+default_executable:
+dependencies: []
+description: This utility finds duplicate files. It's as simple as that.
+email: o@sorsacode.com
+executables:
+- finddup
+extensions: []
+extra_rdoc_files: []
+files:
+- lib/finddup.rb
+- bin/finddup
+- README.txt
+- LICENSE.txt
+- VERSION
+has_rdoc: false
+homepage: http://github.com/jammi/finddup/
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: 1.9.1
+  version:
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: "0"
+  version:
+requirements: []
+rubyforge_project:
+rubygems_version: 1.3.1
+signing_key:
+specification_version: 2
+summary: A simple recursive duplicate file finder
+test_files: []