archivededup 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 832d1375b57959d1b4ed4704261478f15d80611f3e8a59eeca5365e2c188c56b
4
+ data.tar.gz: 3f93fc4ba314347ee306a45554239cfb8e1ed7ffa0ef0da2a1f88abe54d1540e
5
+ SHA512:
6
+ metadata.gz: 715f06a1e280d06873d5ce900c7368b5f87d4509fa0fd1a4c3efc46c30a333675e40980a3297f6584eefdb90e3319f1954b29d4ab58468e8d281133324ec9d3e
7
+ data.tar.gz: 344bfd59b8af9d48430d8e97cc4e2f3594fea58061e5230afc6b901b142ca8c2bb9fa9a1c3234cf4a1ad1b0423ae011217eb8e2dd53c8089341fec605c744a2c
data/.rspec ADDED
@@ -0,0 +1,3 @@
1
+ --format documentation
2
+ --color
3
+ --require spec_helper
data/Gemfile ADDED
@@ -0,0 +1,10 @@
1
+ # frozen_string_literal: true
2
+
3
+ source "https://rubygems.org"
4
+
5
+ gemspec
6
+
7
+ gem "dbm", "~> 1.1"
8
+ gem "rake", "~> 13.0"
9
+ gem "rspec", "~> 3.12"
10
+
data/Gemfile.lock ADDED
@@ -0,0 +1,36 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ archivededup (0.1.0)
5
+
6
+ GEM
7
+ remote: https://rubygems.org/
8
+ specs:
9
+ dbm (1.1.0)
10
+ diff-lcs (1.5.0)
11
+ rake (13.0.6)
12
+ rspec (3.12.0)
13
+ rspec-core (~> 3.12.0)
14
+ rspec-expectations (~> 3.12.0)
15
+ rspec-mocks (~> 3.12.0)
16
+ rspec-core (3.12.2)
17
+ rspec-support (~> 3.12.0)
18
+ rspec-expectations (3.12.3)
19
+ diff-lcs (>= 1.2.0, < 2.0)
20
+ rspec-support (~> 3.12.0)
21
+ rspec-mocks (3.12.5)
22
+ diff-lcs (>= 1.2.0, < 2.0)
23
+ rspec-support (~> 3.12.0)
24
+ rspec-support (3.12.1)
25
+
26
+ PLATFORMS
27
+ x86_64-darwin-21
28
+
29
+ DEPENDENCIES
30
+ archivededup!
31
+ dbm (~> 1.1)
32
+ rake (~> 13.0)
33
+ rspec (~> 3.12)
34
+
35
+ BUNDLED WITH
36
+ 2.3.18
data/LICENSE.txt ADDED
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2023 Sam Baskinger
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,33 @@
1
+ # Archivededup
2
+
3
+ Archivededup deduplicates files from an archive directory. It records a hash of every file in a database and uses that database to find, report, and optionally remove duplicate files.
4
+
5
+ By default the `archivededup` command scans the `Archive` directory in your home directory and stores its database in `archivededup.db`.
6
+
7
+ ## Installation
8
+
9
+ Install the gem and add to the application's Gemfile by executing:
10
+
11
+ $ bundle add archivededup
12
+
13
+ If bundler is not being used to manage dependencies, install the gem by executing:
14
+
15
+ $ gem install archivededup
16
+
17
+ ## Usage
18
+
19
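+ Run `archivededup --build --dir=DIRECTORY --db=DATABASE_FILE` to build the database, `archivededup --check` to report all duplicates, and `archivededup --removedups` to remove the duplicates that `--check` reports. Use `--threads=COUNT` to choose how many threads are used.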
20
+
21
+ ## Development
22
+
23
+ After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
24
+
25
+ To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and the created tag, and push the `.gem` file to [rubygems.org](https://rubygems.org).
26
+
27
+ ## Contributing
28
+
29
+ Bug reports and pull requests are welcome on GitHub at https://github.com/basking2/archivededup.
30
+
31
+ ## License
32
+
33
+ The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
data/Rakefile ADDED
@@ -0,0 +1,8 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "bundler/gem_tasks"
4
+ require "rspec/core/rake_task"
5
+
6
+ RSpec::Core::RakeTask.new(:spec)
7
+
8
+ task default: :spec
@@ -0,0 +1,39 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "lib/archivededup/version"
4
+
5
# Gem specification for archivededup, a command-line tool that deduplicates
# files from an archive directory using a database.
Gem::Specification.new do |spec|
  spec.name = "archivededup"
  spec.version = Archivededup::VERSION
  spec.authors = ["Sam Baskinger"]
  spec.email = ["basking2@yahoo.com"]

  spec.summary = "Deduplicate files from an archive directory."
  spec.description = "Deduplicate files from an archive directory using a database."
  spec.homepage = "https://github.com/basking2/archivededup"
  spec.license = "MIT"
  spec.required_ruby_version = ">= 2.6.0"

  spec.metadata["homepage_uri"] = spec.homepage
  spec.metadata["source_code_uri"] = "https://github.com/basking2/archivededup"

  # Specify which files should be added to the gem when it is released.
  # The `git ls-files -z` loads the files in the RubyGem that have been added into git.
  spec.files = Dir.chdir(__dir__) do
    `git ls-files -z`.split("\x0").reject do |f|
      (f == __FILE__) || f.match?(%r{\A(?:(?:bin|test|spec|features)/|\.(?:git|travis|circleci)|appveyor)})
    end
  end
  spec.bindir = "exe"
  spec.executables = spec.files.grep(%r{\Aexe/}) { |f| File.basename(f) }
  spec.require_paths = ["lib"]

  # The executable and lib/archivededup/db.rb both `require 'dbm'`, so the
  # gem must declare it at runtime; listing it only in the Gemfile leaves an
  # installed gem unable to start where dbm is not already available.
  spec.add_dependency "dbm", "~> 1.1"

  # For more information and examples about making a new gem, check out our
  # guide at: https://bundler.io/guides/creating_gem.html
end
data/exe/archivededup ADDED
@@ -0,0 +1,104 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+
4
+ require 'dbm'
5
+ require 'digest'
6
+ require 'thread'
7
+ require 'yaml'
8
+ require 'optparse'
9
+
10
+ require 'archivededup/db'
11
+ require 'archivededup/paralleltask'
12
+ require 'archivededup/filepicker'
13
+
14
# Command-line entry point: builds the duplicate database for a directory
# tree, reports the duplicates it recorded, or removes them.

# Defaults; each one can be overridden by a command-line option below.
# Dir.home is used instead of ENV['HOME'] so an unset HOME does not make
# File.join fail with a TypeError.
archive = File.join(Dir.home, 'Archive')
dbfile = 'archivededup.db'
threads = 16
action = 'check'

# Paths under these directories are never considered for deduplication.
PROTECTED_PATH = %r{/(?:Documents|Archive/Prairie Music)/}.freeze

# A duplicate set is only processed when at least one of its files has one
# of these extensions.
DEDUP_EXTENSION = /\.(?:png|jpeg|jpg|mp4|mov|mp3|mod|xfc|gif|m4v|3g2|mpg)\z/i.freeze

OptionParser.new do |opts|
  opts.on('-b', '--build', "Build a database.") do
    action = 'build'
  end

  opts.on('-c', '--check', "Check a database and report all duplicates.") do
    action = 'check'
  end

  opts.on('--removedups', "Check a database and remove all duplicates --check reports.") do
    action = 'remove'
  end

  opts.on('-d', '--dir=directory', 'Directory to dedup.') do |o|
    archive = o
  end

  opts.on('-D', '--db=database_file', 'Dup database file.') do |o|
    dbfile = o
  end

  opts.on('-t', '--threads=count', Integer, 'How many threads to use.') do |o|
    threads = o
  end
end.parse!

# With no worker threads nothing would ever drain the work queue.
abort "Thread count must be at least 1 (got #{threads})." if threads < 1

db = Archivededup::Db.new(dbfile)

begin
  case action
  when 'build'
    parallel_task = Archivededup::ParallelTask.new

    # Workers index each path they are handed.
    parallel_task.start(threads) { |path| db.add_file(path) }

    # Walk the archive and feed every file path to the workers.
    parallel_task.scatter do |queue|
      db.add_directory(archive) { |path| queue.enq path }
    end

  when 'check', 'remove'
    # Check and remove are nearly the same except for the actual delete operation.
    do_delete = action == 'remove'

    filepicker = Archivededup::FilePicker.new

    db.each_dup do |_key, record|
      # Skip files in the protected directories. They are special.
      files = record['files'].reject { |f| f.match?(PROTECTED_PATH) }

      # The database may be older than the directory tree. Ignore entries
      # whose file is gone so that the last surviving copy is never deleted
      # in favour of a "keep" file that no longer exists.
      files = files.select { |f| File.file?(f) }

      # If the filters left fewer than two files there is nothing to dedup.
      next if files.length < 2

      # Skip duplicate sets that contain no file with an expected extension.
      unless files.any? { |f| f.match?(DEDUP_EXTENSION) }
        files.each { |f| puts "Not deduping #{f}." }
        next
      end

      filekeep = filepicker.pick(files)

      puts "Keep #{filekeep}."
      files.each do |f|
        next if f == filekeep

        puts " Remove #{f}."
        next unless do_delete

        # Two names for one underlying file (e.g. a hard link): deleting
        # is refused rather than risk removing the kept data.
        raise "File #{f} is the same as keep file #{filekeep}. Will not delete." if File.identical?(filekeep, f)

        File.unlink(f)
      end
    end
  end
ensure
  # Always close the database, even when an action fails part-way through.
  db.close
end
@@ -0,0 +1,113 @@
1
+
2
+ require 'dbm'
3
+ require 'digest'
4
+ require 'thread'
5
+ require 'yaml'
6
+
7
module Archivededup
  # Index of files backed by a DBM database.
  #
  # Two kinds of record are stored, both as YAML:
  # * "name - <path>" => { dir: <path>, hash: <md5 hex> } for every indexed file
  # * "hash - <md5 hex>" => { 'files' => [<path>, ...], 'hash' => <md5 hex> }
  #   listing the files that share that content.
  #
  # Database access is serialised with a mutex because the command-line tool
  # calls #add_file from several worker threads at once.
  class Db
    NAME_PREFIX = 'name - '
    HASH_PREFIX = 'hash - '

    # Number of bytes read per chunk when hashing or comparing files.
    CHUNK_SIZE = 4096

    # Opens the database file, creating it if it does not exist.
    #
    # @param dbfile [String] path of the DBM database
    def initialize(dbfile)
      @db = DBM.new(dbfile, 0755, DBM::WRCREAT)
      @lock = Mutex.new
    end

    # Yields the key and decoded record of every "hash - " entry.
    #
    # @yieldparam key [String] the database key
    # @yieldparam record [Hash] the decoded record
    def each_hash
      hash_records.each { |key, record| yield key, record }
    end

    # Yields the key and decoded record of every "hash - " entry that lists
    # more than one file.
    #
    # @yieldparam key [String] the database key
    # @yieldparam record [Hash] the decoded record
    def each_dup
      hash_records
        .select { |_key, record| record['files'].length > 1 }
        .each { |key, record| yield key, record }
    end

    # Recursively walks +dir+ and yields the path of every non-directory
    # entry. Entries whose name starts with a dot are skipped, so hidden
    # files and hidden directories are ignored. Nothing is written to the
    # database by this method.
    #
    # @param dir [String] directory to walk
    # @yieldparam path [String] path of a file found under +dir+
    def add_directory(dir, &blk)
      # Dir.each_child never yields "." or ".." and closes the directory
      # handle once the iteration ends.
      Dir.each_child(dir) do |dent|
        next if dent.start_with?('.')

        path = File.join(dir, dent)

        if File.directory?(path)
          add_directory(path, &blk)
        elsif blk
          blk.call(path)
        end
      end
    end

    # Indexes one file. A file that already has a "name - " record is left
    # alone. Otherwise its MD5 is recorded, and when another file with the
    # same MD5 is already known the two are compared byte for byte before
    # the new path is appended to that hash record.
    #
    # @param d [String] path of the file to index
    def add_file(d)
      name_key = NAME_PREFIX + d

      # Check before hashing so known files are not read again.
      if @lock.synchronize { @db.has_key?(name_key) }
        puts "Had #{d}."
        return
      end

      # Hashing is the slow part and touches no shared state, so it runs
      # outside the lock.
      digest = file_digest(d)

      # The read-modify-write of the hash record must be atomic; otherwise
      # two threads adding identical files can overwrite each other's update.
      @lock.synchronize { record_file(d, name_key, digest) }
    end

    # Closes the underlying database.
    def close
      @db.close
    end

    private

    # Returns [key, decoded record] pairs for all "hash - " entries. The
    # snapshot is taken under the lock; callers iterate it afterwards.
    def hash_records
      @lock.synchronize do
        @db.select { |key, _value| key.start_with?(HASH_PREFIX) }
           .map { |key, value| [key, YAML.safe_load(value)] }
      end
    end

    # Writes the name record for +path+ and creates or extends the hash
    # record for +digest+. Must be called with the lock held.
    def record_file(path, name_key, digest)
      hash_key = HASH_PREFIX + digest

      @db[name_key] = YAML.dump({ dir: path, hash: digest })

      if @db.has_key?(hash_key)
        record = YAML.safe_load(@db[hash_key])

        # Only files that are byte-identical to the first recorded file are
        # listed as duplicates; a file that merely shares the MD5 is not
        # added to the hash record.
        if same_content?(path, record['files'][0])
          record['files'] << path
          puts "Dups found: #{record}"
          @db[hash_key] = YAML.dump(record)
        end
      else
        puts "Storing #{path}."
        @db[hash_key] = YAML.dump({ 'files' => [path], 'hash' => digest })
      end
    end

    # Returns the MD5 hex digest of the file's bytes, read in binary mode.
    def file_digest(path)
      digest = Digest::MD5.new
      File.open(path, 'rb') do |io|
        buffer = String.new
        digest.update(buffer) while io.read(CHUNK_SIZE, buffer)
      end
      digest.hexdigest
    end

    # Returns true when both files contain exactly the same bytes.
    def same_content?(path_a, path_b)
      return false unless File.size(path_a) == File.size(path_b)

      File.open(path_a, 'rb') do |io_a|
        File.open(path_b, 'rb') do |io_b|
          loop do
            chunk_a = io_a.read(CHUNK_SIZE)
            chunk_b = io_b.read(CHUNK_SIZE)
            # A nil chunk means end of file: equal only if both ended together.
            return chunk_a == chunk_b if chunk_a.nil? || chunk_b.nil?
            return false unless chunk_a == chunk_b
          end
        end
      end
    end
  end
end
@@ -0,0 +1,90 @@
1
+
2
+ require 'time'
3
+
4
+
5
module Archivededup

  # Comparator that prefers a file name containing a plausible date over one
  # that does not. Patterns are tried from most to least specific, so a name
  # with a full date beats a name that only carries a year.
  module FileNamePickerByDate

    # Date patterns, most specific first. Group 1 is the year and group 2,
    # when present, is the month.
    TIMES = [
      /(\d\d\d\d)-(\d?\d)-\d?\d/, # eg 2021-09-08
      /(\d\d\d\d)(\d\d)\d\d/, # eg 20210908
      /(\d\d\d\d)-(\d\d)/, # eg 2021-09
      /(\d\d\d\d)(\d\d)/, # eg 202109
      /(\d\d\d\d)/, # eg 2021
    ].freeze

    # Returns true when +m+ is a match whose year is within 1979..2100 and
    # whose month, if the pattern captured one, is within 1..12.
    #
    # @param m [MatchData, nil] result of matching one of the date patterns
    # @return [Boolean]
    def self.goodmatch?(m)
      return false if m.nil?
      return false unless (1979..2100).cover?(m[1].to_i)

      # Year-only patterns capture no month; a captured month must be a real
      # month, so "00" is rejected.
      month = m[2]
      month.nil? || (1..12).cover?(month.to_i)
    end

    # Compares two file names by the dates they contain.
    #
    # @param a [String] first file name
    # @param b [String] second file name
    # @return [Integer] -1 if +a+ is better, 1 if +b+ is better, 0 if no
    #   pattern distinguishes them
    def self.call(a, b)
      TIMES.each do |re|
        a_dated = goodmatch?(re.match(a))
        b_dated = goodmatch?(re.match(b))

        # Both or neither match this pattern: it cannot decide, try the next.
        next if a_dated == b_dated

        return a_dated ? -1 : 1
      end

      0
    end
  end

  # From a list of file names, pick the file that provides the most information
  # and so should be kept.
  #
  class FilePicker
    def initialize
      # A list of ways to compare file names.
      # These are applied in-order until one returns non-zero.
      @comparators = [
        FileNamePickerByDate,
        # Longer names sort first.
        proc { |a, b| b.length <=> a.length },
      ]
    end

    # Returns the preferred file name from +filelist+.
    #
    # @param filelist [Array<String>] candidate file names
    # @return [String, nil] the preferred name, or nil for an empty list
    def pick(filelist)
      # Only the best element is needed, so take the minimum under the
      # comparator chain rather than sorting the whole list.
      filelist.min do |a, b|
        # Default to "equal" when no comparator can decide.
        order = 0

        # Run comparators until one selects a better file name.
        @comparators.each do |comparator|
          order = comparator.call(a, b)
          break unless order.zero?
        end

        order
      end
    end
  end
end
@@ -0,0 +1,61 @@
1
+ require 'thread'
2
+
3
module Archivededup

  # Producer-side handle for a work queue that limits how far the producer
  # can run ahead of the consumers.
  class QueueAppender
    # The producer waits once more than this many items are pending.
    MAX_PENDING = 100

    # Seconds between checks when the wrapped queue cannot block by itself.
    POLL_INTERVAL = 0.05

    # @param queue [Queue, SizedQueue] queue that items are appended to
    def initialize(queue)
      @queue = queue
    end

    # Appends +d+ to the queue, waiting first while too many items are pending.
    #
    # @param d [Object] work item
    def enq d
      # A queue with a capacity (SizedQueue) blocks inside #enq, so polling is
      # only needed for unbounded queues.
      unless @queue.respond_to?(:max)
        sleep POLL_INTERVAL while @queue.length > MAX_PENDING
      end
      @queue.enq d
    end

    alias :<< :enq
  end

  # Runs a block on a fixed number of worker threads that are fed from a
  # shared queue. Call #start with the work block, then #scatter to enqueue
  # the items and wait for the workers to finish.
  class ParallelTask
    def initialize
      @queue = nil
      @threads = []
    end

    # Creates the queue and starts the worker threads. Each worker calls the
    # given block with every item it dequeues and stops when it dequeues nil,
    # which is what a closed, empty queue returns.
    #
    # @param threads [Integer] number of worker threads
    # @yieldparam d [Object] a dequeued work item
    # @return [Array<Thread>] the started threads
    # @raise [ArgumentError] if no block is given
    def start(threads=16, &task)
      raise ArgumentError, 'ParallelTask#start requires a block' unless task

      # A bounded queue makes the producer block, and wake immediately, when
      # the workers fall behind; capacity matches the previous polling
      # threshold (more than MAX_PENDING items pending).
      @queue = SizedQueue.new(QueueAppender::MAX_PENDING + 1)

      # Start the threads.
      @threads = threads.times.map do
        Thread.new(@queue) do |queue|
          until (d = queue.deq).nil?
            task.call(d)
          end
        end
      end
    end

    # Yields a QueueAppender for filling the queue, then closes the queue and
    # joins the worker threads.
    #
    # @yieldparam appender [QueueAppender]
    # @return [Array<Thread>] the joined threads
    # @raise [RuntimeError] if #start has not been called
    def scatter
      raise 'ParallelTask#start must be called before #scatter' if @queue.nil?

      begin
        yield QueueAppender.new(@queue)
      ensure
        # Close and join even when the producer block raises, so the workers
        # are not left blocked on the queue.
        @queue.close
        @threads.each(&:join)
      end

      @threads
    end

  end
end
@@ -0,0 +1,3 @@
1
module Archivededup
  # Version of the archivededup gem; read by the gemspec.
  # Frozen so the constant cannot be mutated in place.
  VERSION = "0.1.0".freeze
end
@@ -0,0 +1,6 @@
1
+ require "archivededup/version"
2
+
3
# Root namespace of the archivededup gem.
module Archivededup
  # Base class for errors raised by this gem.
  class Error < StandardError; end
end
@@ -0,0 +1,4 @@
1
+ module Archivededup
2
+ VERSION: String
3
+ # See the writing guide of rbs: https://github.com/ruby/rbs#guides
4
+ end
metadata ADDED
@@ -0,0 +1,60 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: archivededup
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Sam Baskinger
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2023-07-04 00:00:00.000000000 Z
12
+ dependencies: []
13
+ description: Deduplicate files from an archive directory using a database.
14
+ email:
15
+ - basking2@yahoo.com
16
+ executables:
17
+ - archivededup
18
+ extensions: []
19
+ extra_rdoc_files: []
20
+ files:
21
+ - ".rspec"
22
+ - Gemfile
23
+ - Gemfile.lock
24
+ - LICENSE.txt
25
+ - README.md
26
+ - Rakefile
27
+ - archivededup.gemspec
28
+ - exe/archivededup
29
+ - lib/archivededup.rb
30
+ - lib/archivededup/db.rb
31
+ - lib/archivededup/filepicker.rb
32
+ - lib/archivededup/paralleltask.rb
33
+ - lib/archivededup/version.rb
34
+ - sig/archivededup.rbs
35
+ homepage: https://github.com/basking2/archivededup
36
+ licenses:
37
+ - MIT
38
+ metadata:
39
+ homepage_uri: https://github.com/basking2/archivededup
40
+ source_code_uri: https://github.com/basking2/archivededup
41
+ post_install_message:
42
+ rdoc_options: []
43
+ require_paths:
44
+ - lib
45
+ required_ruby_version: !ruby/object:Gem::Requirement
46
+ requirements:
47
+ - - ">="
48
+ - !ruby/object:Gem::Version
49
+ version: 2.6.0
50
+ required_rubygems_version: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ requirements: []
56
+ rubygems_version: 3.3.7
57
+ signing_key:
58
+ specification_version: 4
59
+ summary: Deduplicate files from an archive directory.
60
+ test_files: []