archivededup 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 832d1375b57959d1b4ed4704261478f15d80611f3e8a59eeca5365e2c188c56b
4
+ data.tar.gz: 3f93fc4ba314347ee306a45554239cfb8e1ed7ffa0ef0da2a1f88abe54d1540e
5
+ SHA512:
6
+ metadata.gz: 715f06a1e280d06873d5ce900c7368b5f87d4509fa0fd1a4c3efc46c30a333675e40980a3297f6584eefdb90e3319f1954b29d4ab58468e8d281133324ec9d3e
7
+ data.tar.gz: 344bfd59b8af9d48430d8e97cc4e2f3594fea58061e5230afc6b901b142ca8c2bb9fa9a1c3234cf4a1ad1b0423ae011217eb8e2dd53c8089341fec605c744a2c
data/.rspec ADDED
@@ -0,0 +1,3 @@
1
+ --format documentation
2
+ --color
3
+ --require spec_helper
data/Gemfile ADDED
@@ -0,0 +1,10 @@
1
+ # frozen_string_literal: true
2
+
3
+ source "https://rubygems.org"
4
+
5
+ gemspec
6
+
7
+ gem "dbm", "~> 1.1"
8
+ gem "rake", "~> 13.0"
9
+ gem "rspec", "~> 3.12"
10
+
data/Gemfile.lock ADDED
@@ -0,0 +1,36 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ archivededup (0.1.0)
5
+
6
+ GEM
7
+ remote: https://rubygems.org/
8
+ specs:
9
+ dbm (1.1.0)
10
+ diff-lcs (1.5.0)
11
+ rake (13.0.6)
12
+ rspec (3.12.0)
13
+ rspec-core (~> 3.12.0)
14
+ rspec-expectations (~> 3.12.0)
15
+ rspec-mocks (~> 3.12.0)
16
+ rspec-core (3.12.2)
17
+ rspec-support (~> 3.12.0)
18
+ rspec-expectations (3.12.3)
19
+ diff-lcs (>= 1.2.0, < 2.0)
20
+ rspec-support (~> 3.12.0)
21
+ rspec-mocks (3.12.5)
22
+ diff-lcs (>= 1.2.0, < 2.0)
23
+ rspec-support (~> 3.12.0)
24
+ rspec-support (3.12.1)
25
+
26
+ PLATFORMS
27
+ x86_64-darwin-21
28
+
29
+ DEPENDENCIES
30
+ archivededup!
31
+ dbm (~> 1.1)
32
+ rake (~> 13.0)
33
+ rspec (~> 3.12)
34
+
35
+ BUNDLED WITH
36
+ 2.3.18
data/LICENSE.txt ADDED
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2023 Sam Baskinger
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,33 @@
1
+ # Archivededup
2
+
3
+ Archivededup deduplicates the files in an archive directory. It records a checksum of every file in a DBM database and then reports, or on request deletes, files whose contents are identical.
4
+
5
+ By default the `archivededup` executable works on the `~/Archive` directory and keeps its database in a file named `archivededup.db` in the current directory.
6
+
7
+ ## Installation
8
+
9
+ Install the gem and add to the application's Gemfile by executing:
10
+
11
+ $ bundle add archivededup
12
+
13
+ If bundler is not being used to manage dependencies, install the gem by executing:
14
+
15
+ $ gem install archivededup
16
+
17
+ ## Usage
18
+
19
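+ Build the database with `archivededup --build --dir DIRECTORY --db DATABASE_FILE`, list the duplicates it found with `archivededup --check --db DATABASE_FILE`, and delete them with `archivededup --removedups --db DATABASE_FILE`. The `--threads COUNT` option sets how many worker threads are used while building the database.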
20
+
21
+ ## Development
22
+
23
+ After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
24
+
25
+ To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and the created tag, and push the `.gem` file to [rubygems.org](https://rubygems.org).
26
+
27
+ ## Contributing
28
+
29
+ Bug reports and pull requests are welcome on GitHub at https://github.com/basking2/archivededup.
30
+
31
+ ## License
32
+
33
+ The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
data/Rakefile ADDED
@@ -0,0 +1,8 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "bundler/gem_tasks"
4
+ require "rspec/core/rake_task"
5
+
6
+ RSpec::Core::RakeTask.new(:spec)
7
+
8
+ task default: :spec
@@ -0,0 +1,39 @@
1
# frozen_string_literal: true

require_relative "lib/archivededup/version"

# Gem specification for archivededup, a command line tool that deduplicates
# the files of an archive directory with the help of a DBM database.
Gem::Specification.new do |spec|
  spec.name = "archivededup"
  spec.version = Archivededup::VERSION
  spec.authors = ["Sam Baskinger"]
  spec.email = ["basking2@yahoo.com"]

  spec.summary = "Deduplicate files from an archive directory."
  spec.description = "Deduplicate files from an archive directory using a database."
  spec.homepage = "https://github.com/basking2/archivededup"
  spec.license = "MIT"
  spec.required_ruby_version = ">= 2.6.0"

  spec.metadata["homepage_uri"] = spec.homepage
  spec.metadata["source_code_uri"] = "https://github.com/basking2/archivededup"

  # Specify which files should be added to the gem when it is released.
  # The `git ls-files -z` loads the files in the RubyGem that have been added into git.
  # `git ls-files` prints paths relative to the repository root, so the gemspec
  # itself has to be recognised by its base name rather than by `__FILE__`.
  gemspec = File.basename(__FILE__)
  spec.files = Dir.chdir(__dir__) do
    `git ls-files -z`.split("\x0").reject do |f|
      (f == gemspec) || f.match?(%r{\A(?:(?:bin|test|spec|features)/|\.(?:git|travis|circleci)|appveyor)})
    end
  end
  spec.bindir = "exe"
  spec.executables = spec.files.grep(%r{\Aexe/}) { |f| File.basename(f) }
  spec.require_paths = ["lib"]

  # The executable and lib/archivededup/db.rb both `require 'dbm'`, which is
  # not bundled with current Ruby versions, so it must be installed with the gem.
  spec.add_dependency "dbm", "~> 1.1"

  # For more information and examples about making a new gem, check out our
  # guide at: https://bundler.io/guides/creating_gem.html
end
data/exe/archivededup ADDED
@@ -0,0 +1,104 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+
4
+ require 'dbm'
5
+ require 'digest'
6
+ require 'thread'
7
+ require 'yaml'
8
+ require 'optparse'
9
+
10
+ require 'archivededup/db'
11
+ require 'archivededup/paralleltask'
12
+ require 'archivededup/filepicker'
13
+
14
# Paths containing one of these directories are never treated as duplicates.
# The Documents directory is special.
PROTECTED_PATHS = %r{/(?:Documents|Archive/Prairie Music)/}.freeze

# A group of duplicates is only deduplicated when at least one of its files
# carries one of these extensions.
DEDUP_EXTENSIONS = /\.(?:png|jpeg|jpg|mp4|mov|mp3|mod|xfc|gif|m4v|3g2|mpg)\z/i.freeze

# Defaults; each of them can be overridden on the command line.
archive = File.join(Dir.home, 'Archive')
dbfile = 'archivededup.db'
threads = 16
action = 'check'

OptionParser.new do |opts|
  opts.on('-b', '--build', "Build a database.") { action = 'build' }
  opts.on('-c', '--check', "Check a database and report all duplicates.") { action = 'check' }
  opts.on('--removedups', "Check a database and remove all duplicates --check reports.") { action = 'remove' }
  opts.on('-d', '--dir=directory', 'Directory to dedup.') { |o| archive = o }
  opts.on('-D', '--db=database_file', 'Dup database file.') { |o| dbfile = o }
  opts.on('-t', '--threads=count', Integer, 'How many threads to use.') { |o| threads = o }
end.parse!

# Without at least one worker nothing would ever drain the work queue.
abort 'The thread count must be at least 1.' if threads < 1

db = Archivededup::Db.new(dbfile)

begin
  case action
  when 'build'
    parallel_task = Archivededup::ParallelTask.new

    # Workers: index every path taken from the queue.
    parallel_task.start(threads) { |path| db.add_file(path) }

    # Producer: walk the archive, fill the queue and feed the threads.
    parallel_task.scatter do |queue|
      db.add_directory(archive) { |path| queue.enq path }
    end

  when 'check', 'remove'
    # Check and remove are nearly the same except for the actual delete operation.
    do_delete = action == 'remove'
    filepicker = Archivededup::FilePicker.new

    db.each_dup do |_key, record|
      candidates = record['files'].reject { |f| f.match?(PROTECTED_PATHS) }

      # The database is not updated when files are deleted, so it may list
      # files that are already gone. Dropping them here makes sure the file
      # that is kept really exists before any other copy is removed.
      candidates = candidates.select { |f| File.file?(f) }

      # If the filters left fewer than two files there is nothing to dedup.
      next if candidates.length < 2

      # Skip groups in which no file has an expected extension.
      if candidates.none? { |f| f.match?(DEDUP_EXTENSIONS) }
        candidates.each { |f| puts "Not deduping #{f}." }
        next
      end

      filekeep = filepicker.pick(candidates)
      puts "Keep #{filekeep}."

      candidates.each do |f|
        next if f == filekeep

        puts " Remove #{f}."
        next unless do_delete

        # Two names of the very same file (for example hard links) are not
        # copies; refuse to continue rather than risk touching the kept file.
        raise "File #{f} is the same as keep file #{filekeep}. Will not delete." if File.identical?(filekeep, f)

        File.unlink(f)
      end
    end
  end
ensure
  # Close the database even when the run is aborted by an error.
  db.close
end
@@ -0,0 +1,113 @@
1
+
2
+ require 'dbm'
3
+ require 'digest'
4
+ require 'thread'
5
+ require 'yaml'
6
+
7
module Archivededup
  # Index of file names and content hashes stored in a DBM database.
  #
  # Two kinds of records are kept, both serialised as YAML:
  # * "name - <path>" - one per indexed file, holding the path and its MD5 hash.
  # * "hash - <md5>"  - one per distinct content, holding the list of files
  #   ('files') that share that content and the hash itself ('hash').
  class Db
    NAME_PREFIX = 'name - '
    HASH_PREFIX = 'hash - '
    CHUNK_SIZE = 4096

    # Opens the database file, creating it when it does not exist yet.
    #
    # @param dbfile [String] path of the DBM database
    def initialize(dbfile)
      @db = DBM.new(dbfile, 0755, DBM::WRCREAT)
      # add_file is called from several worker threads at once; every access
      # to the shared database handle goes through this lock.
      @lock = Mutex.new
    end

    # Yields the key and the decoded record of every "hash - " entry.
    def each_hash
      hash_records.each { |key, record| yield key, record }
    end

    # Yields the key and the decoded record of every "hash - " entry that
    # lists more than one file, i.e. every group of duplicates.
    def each_dup
      hash_records.each do |key, record|
        yield key, record if record['files'].length > 1
      end
    end

    # Walks +dir+ recursively and yields the path of every file found.
    # Entries whose name starts with a dot are skipped, directories included.
    #
    # @param dir [String] directory to scan
    # @yieldparam path [String] path of a file below +dir+
    def add_directory(dir, &blk)
      Dir.each_child(dir) do |entry|
        next if entry.start_with?('.')

        path = File.join(dir, entry)

        if File.directory?(path)
          add_directory(path, &blk)
        elsif blk
          blk.call(path)
        end
      end
    end

    # Indexes the file +d+. A path that is already indexed is reported and
    # left alone. Otherwise the file is hashed and either starts a new hash
    # record or, when its content is byte-for-byte equal to the first file of
    # an existing record, is appended to that record as a duplicate.
    #
    # @param d [String] path of the file to index
    def add_file(d)
      name_key = NAME_PREFIX + d

      # Look the name up before hashing so known files are not read again.
      if @lock.synchronize { @db.key?(name_key) }
        puts "Had #{d}."
        return
      end

      # Hash outside the lock so the worker threads can read files concurrently.
      digest = Digest::MD5.file(d).hexdigest

      @lock.synchronize { store(d, name_key, digest) }
    end

    # Closes the underlying database.
    def close
      @db.close
    end

    private

    # Returns [key, record] pairs for all "hash - " entries. The entries are
    # collected completely before anything is decoded or handed to a caller.
    def hash_records
      entries = @lock.synchronize { @db.select { |key, _value| key.start_with?(HASH_PREFIX) } }
      # NOTE(review): YAML.load reads data from the database file; only point
      # this tool at database files you trust.
      entries.map { |key, value| [key, YAML.load(value)] }
    end

    # Writes the records for +path+. Must be called with the lock held.
    def store(path, name_key, digest)
      # Another thread may have indexed the same path while this one hashed it.
      if @db.key?(name_key)
        puts "Had #{path}."
        return
      end

      @db[name_key] = YAML.dump({ dir: path, hash: digest })

      hash_key = HASH_PREFIX + digest

      unless @db.key?(hash_key)
        puts "Storing #{path}."
        @db[hash_key] = YAML.dump({ 'files' => [path], 'hash' => digest })
        return
      end

      record = YAML.load(@db[hash_key])

      # Equal hashes do not prove equal content; compare the bytes as well.
      return unless same_content?(path, record['files'][0])

      record['files'] << path
      puts "Dups found: #{record}"
      @db[hash_key] = YAML.dump(record)
    end

    # True when both files hold exactly the same bytes.
    def same_content?(path_a, path_b)
      return false unless File.size(path_a) == File.size(path_b)

      File.open(path_a, 'rb') do |io_a|
        File.open(path_b, 'rb') do |io_b|
          loop do
            chunk_a = io_a.read(CHUNK_SIZE)
            chunk_b = io_b.read(CHUNK_SIZE)

            # read returns nil at end of file; the files are equal only when
            # both end at the same time.
            return chunk_a == chunk_b if chunk_a.nil? || chunk_b.nil?
            return false unless chunk_a == chunk_b
          end
        end
      end
    end
  end
end
@@ -0,0 +1,90 @@
1
+
2
+ require 'time'
3
+
4
+
5
module Archivededup

  # Comparator that prefers the file name containing a plausible date.
  module FileNamePickerByDate

    # Date patterns, from the most to the least specific. The first capture
    # is the year, the optional second capture the month.
    PATTERNS = [
      /(\d\d\d\d)-(\d?\d)-\d?\d/, # eg 2021-09-08
      /(\d\d\d\d)(\d\d)\d\d/, # eg 20210908
      /(\d\d\d\d)-(\d\d)/, # eg 2021-09
      /(\d\d\d\d)(\d\d)/, # eg 202109
      /(\d\d\d\d)/, # eg 2021
    ].freeze

    # Tells whether the match +m+ looks like a real date: the year must lie
    # between 1979 and 2100 and the month, when the pattern captured one,
    # between 1 and 12.
    #
    # @param m [MatchData, nil] result of matching one of the patterns
    # @return [Boolean]
    def self.goodmatch?(m)
      return false if m.nil?

      # m[n] is nil when the pattern has no such group; nil.to_i is 0.
      return false unless (1979..2100).cover?(m[1].to_i)

      m[2].nil? || (1..12).cover?(m[2].to_i)
    end

    # Compares two file names. The patterns are tried in order and the first
    # one that matches exactly one of the names decides.
    #
    # @return [Integer] -1 when +a+ is the better name, 1 when +b+ is,
    #   0 when no pattern tells them apart
    def self.call(a, b)
      PATTERNS.each do |re|
        a_dated = goodmatch?(re.match(a))
        b_dated = goodmatch?(re.match(b))

        next if a_dated == b_dated

        return a_dated ? -1 : 1
      end

      0
    end
  end

  # From a list of file names, pick the file that provides the most information
  # and so should be kept.
  #
  class FilePicker
    def initialize
      # A list of ways to compare file names.
      # These are applied in-order until one returns non-zero.
      @comparators = [
        FileNamePickerByDate,
        # Prefer the longer name.
        proc { |a, b| b.length <=> a.length },
        # Final tie-breaker so the choice does not depend on the input order.
        proc { |a, b| a <=> b },
      ]
    end

    # Returns the name from +filelist+ that should be kept, or nil when the
    # list is empty.
    #
    # @param filelist [Array<String>] candidate file names
    # @return [String, nil]
    def pick(filelist)
      filelist.min { |a, b| compare(a, b) }
    end

    private

    # Runs the comparators until one selects a better file name and returns
    # its result; returns 0 when none of them does.
    def compare(a, b)
      @comparators.each do |comparator|
        result = comparator.call(a, b)
        return result unless result.zero?
      end

      0
    end
  end
end
@@ -0,0 +1,61 @@
1
+ require 'thread'
2
+
3
module Archivededup

  # Producer side of a work queue. Limits the number of items waiting in the
  # queue so that the producer cannot run far ahead of the consumers.
  class QueueAppender
    # Number of waiting items above which the producer is held back.
    MAX_PENDING = 100
    # Seconds to wait between two checks of an unbounded queue.
    POLL_INTERVAL = 0.05

    # @param queue [Queue, SizedQueue] queue the items are appended to
    def initialize(queue)
      @queue = queue
    end

    # Appends +d+ to the queue, waiting while the queue is full.
    def enq(d)
      # A SizedQueue (it responds to #max) blocks in #enq by itself when it is
      # full; only an unbounded queue has to be polled.
      unless @queue.respond_to?(:max)
        sleep POLL_INTERVAL while @queue.length > MAX_PENDING
      end

      @queue.enq d
    end

    alias << enq
  end

  # Runs a block on a pool of threads, fed through a queue.
  #
  # Call #start with the block that processes one item, then #scatter with a
  # block that produces the items.
  class ParallelTask
    def initialize
      @queue = nil
      @threads = []
    end

    # Starts the worker threads. Each of them passes the items it takes from
    # the queue to the given block until the queue is closed and empty. A nil
    # item also stops the worker that receives it.
    #
    # @param threads [Integer] number of worker threads
    # @yieldparam d [Object] an item appended during #scatter
    # @return [Array<Thread>] the started threads
    def start(threads=16, &block)
      raise ArgumentError, 'ParallelTask#start requires a block' unless block

      # The bounded queue blocks the producer when the workers fall behind.
      @queue = SizedQueue.new(QueueAppender::MAX_PENDING)

      @threads = threads.times.map do
        Thread.new(@queue) do |queue|
          # deq returns nil once the queue has been closed and drained.
          until (item = queue.deq).nil?
            block.call(item)
          end
        end
      end
    end

    # Yields a QueueAppender to the given block so it can feed the workers,
    # then closes the queue and waits for the workers to finish.
    #
    # @yieldparam queue [QueueAppender]
    def scatter
      raise ArgumentError, 'ParallelTask#start must be called before #scatter' if @queue.nil?

      begin
        yield QueueAppender.new(@queue)
      ensure
        # Close the queue even if the producer failed; otherwise the workers
        # would wait for more items forever.
        @queue.close
      end

      # Join the threads before exiting.
      @threads.each(&:join)
    end

  end
end
@@ -0,0 +1,3 @@
1
module Archivededup
  # Version of the archivededup gem. Frozen so the constant cannot be
  # modified in place.
  VERSION = "0.1.0".freeze
end
@@ -0,0 +1,6 @@
1
require "archivededup/version"
require "archivededup/db"
require "archivededup/filepicker"
require "archivededup/paralleltask"

# Entry point of the library: requiring "archivededup" loads the version
# constant together with the Db, FilePicker and ParallelTask classes.
module Archivededup
  # Error class of this gem.
  # NOTE(review): nothing in the visible sources raises it yet.
  class Error < StandardError; end
end
@@ -0,0 +1,4 @@
1
+ module Archivededup
2
+ VERSION: String
3
+ # See the writing guide of rbs: https://github.com/ruby/rbs#guides
4
+ end
metadata ADDED
@@ -0,0 +1,60 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: archivededup
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Sam Baskinger
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2023-07-04 00:00:00.000000000 Z
12
+ dependencies: []
13
+ description: Deduplicate files from an archive directory using a database.
14
+ email:
15
+ - basking2@yahoo.com
16
+ executables:
17
+ - archivededup
18
+ extensions: []
19
+ extra_rdoc_files: []
20
+ files:
21
+ - ".rspec"
22
+ - Gemfile
23
+ - Gemfile.lock
24
+ - LICENSE.txt
25
+ - README.md
26
+ - Rakefile
27
+ - archivededup.gemspec
28
+ - exe/archivededup
29
+ - lib/archivededup.rb
30
+ - lib/archivededup/db.rb
31
+ - lib/archivededup/filepicker.rb
32
+ - lib/archivededup/paralleltask.rb
33
+ - lib/archivededup/version.rb
34
+ - sig/archivededup.rbs
35
+ homepage: https://github.com/basking2/archivededup
36
+ licenses:
37
+ - MIT
38
+ metadata:
39
+ homepage_uri: https://github.com/basking2/archivededup
40
+ source_code_uri: https://github.com/basking2/archivededup
41
+ post_install_message:
42
+ rdoc_options: []
43
+ require_paths:
44
+ - lib
45
+ required_ruby_version: !ruby/object:Gem::Requirement
46
+ requirements:
47
+ - - ">="
48
+ - !ruby/object:Gem::Version
49
+ version: 2.6.0
50
+ required_rubygems_version: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ requirements: []
56
+ rubygems_version: 3.3.7
57
+ signing_key:
58
+ specification_version: 4
59
+ summary: Deduplicate files from an archive directory.
60
+ test_files: []