active_storage_dedup 1.0.0.alpha
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.rspec +3 -0
- data/.rubocop.yml +8 -0
- data/CHANGELOG.md +5 -0
- data/CODE_OF_CONDUCT.md +132 -0
- data/LICENSE.txt +21 -0
- data/README.md +419 -0
- data/Rakefile +12 -0
- data/lib/active_storage_dedup/attachment_extension.rb +39 -0
- data/lib/active_storage_dedup/attachment_options.rb +21 -0
- data/lib/active_storage_dedup/blob_deduplication.rb +139 -0
- data/lib/active_storage_dedup/changes_extension.rb +70 -0
- data/lib/active_storage_dedup/configuration.rb +78 -0
- data/lib/active_storage_dedup/deduplication_job.rb +100 -0
- data/lib/active_storage_dedup/railtie.rb +48 -0
- data/lib/active_storage_dedup/version.rb +5 -0
- data/lib/active_storage_dedup.rb +16 -0
- data/lib/generators/active_storage_dedup/install_generator.rb +36 -0
- data/lib/generators/active_storage_dedup/templates/README +44 -0
- data/lib/generators/active_storage_dedup/templates/add_active_storage_dedup.rb.erb +41 -0
- data/lib/generators/active_storage_dedup/templates/initializer.rb +42 -0
- data/lib/tasks/active_storage_dedup.rake +101 -0
- data/sig/active_storage_dedup.rbs +4 -0
- metadata +164 -0
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
namespace :active_storage_dedup do
|
|
4
|
+
# Prints one section per (checksum, service_name) pair shared by more than one
# blob, then a summary with the number of groups, the number of redundant
# blobs, and the bytes they occupy. Read-only: nothing is modified.
desc "Report duplicate blobs grouped by checksum and service"
task report_duplicates: :environment do
  puts "Scanning for duplicate blobs...\n\n"

  # One row per (checksum, service_name) pair that occurs more than once,
  # largest groups first. Loaded into an array up front: calling #count on the
  # grouped relation would issue another grouped COUNT query rather than
  # return the number of groups.
  duplicate_groups = ActiveStorage::Blob
                     .select("checksum, service_name, COUNT(*) as blob_count")
                     .group(:checksum, :service_name)
                     .having("COUNT(*) > 1")
                     .order("blob_count DESC")
                     .to_a

  if duplicate_groups.empty?
    puts "No duplicate blobs found!"
    next
  end

  total_duplicates = 0
  total_wasted_storage = 0

  duplicate_groups.each do |group|
    # Load the group once, oldest first, so the keeper and the duplicate list
    # come from the same result set. The id tiebreaker keeps the split stable
    # when several blobs share a created_at value.
    blobs = ActiveStorage::Blob
            .where(checksum: group.checksum, service_name: group.service_name)
            .order(:created_at, :id)
            .to_a

    keeper, *duplicates = blobs
    # The group may have been removed between the two queries.
    next if keeper.nil?

    # Calculate wasted storage (size of duplicate blobs)
    wasted_bytes = duplicates.sum(&:byte_size)
    total_wasted_storage += wasted_bytes

    puts "Checksum: #{group.checksum}"
    puts "Service: #{group.service_name}"
    puts "Filename: #{keeper.filename}"
    puts "Total blobs: #{blobs.size}"
    puts "Keeper blob ID: #{keeper.id} (#{keeper.attachments.count} attachments)"
    puts "Duplicate blob IDs: #{duplicates.map(&:id).join(', ')}"
    puts "Total attachments across duplicates: #{duplicates.sum { |b| b.attachments.count }}"
    puts "Wasted storage: #{format_bytes(wasted_bytes)}"
    puts "-" * 80
    puts

    total_duplicates += duplicates.size
  end

  puts "\nSummary:"
  puts "Total duplicate groups: #{duplicate_groups.size}"
  puts "Total duplicate blobs: #{total_duplicates}"
  puts "Total wasted storage: #{format_bytes(total_wasted_storage)}"
end
|
|
54
|
+
|
|
55
|
+
# Runs ActiveStorageDedup::DeduplicationJob synchronously in the current
# process and prints a start and a finish message around it.
desc "Clean up all duplicate blobs by merging them (sanity check)"
task cleanup_all: :environment do
  puts "Running sanity check to find and merge duplicate blobs...\n\n"

  # perform_now executes inline, so the closing message is only printed after
  # the job has returned.
  dedup_job = ActiveStorageDedup::DeduplicationJob
  dedup_job.perform_now

  puts "\nCleanup complete! Check logs for details."
end
|
|
64
|
+
|
|
65
|
+
# Walks every blob, compares its stored reference_count with the number of
# attachments it currently has, and rewrites the column where they differ.
# Progress is printed every 100 blobs, followed by final totals.
desc "Backfill reference_count for existing blobs"
task backfill_reference_count: :environment do
  puts "Backfilling reference_count for all blobs...\n\n"

  total_blobs = ActiveStorage::Blob.count
  updated = 0

  # Positions start at 1 so they can be printed and tested directly.
  ActiveStorage::Blob.find_each.with_index(1) do |blob, position|
    attached = blob.attachments.count
    # A missing (nil) counter is compared as zero.
    # NOTE(review): a nil reference_count on a blob with no attachments is
    # therefore left as nil rather than written as 0 - confirm this is intended.
    stored = blob.reference_count || 0

    unless attached == stored
      # update_column writes the value directly, without validations or callbacks.
      blob.update_column(:reference_count, attached)
      updated += 1
    end

    puts "Processed #{position}/#{total_blobs} blobs..." if (position % 100).zero?
  end

  puts "\nBackfill complete!"
  puts "Total blobs: #{total_blobs}"
  puts "Updated: #{updated}"
end
|
|
90
|
+
|
|
91
|
+
# Helper method to format bytes into human-readable format.
#
# Uses 1024-based units from B up to TB; anything larger stays in TB.
#
# @param bytes [Numeric, nil] number of bytes; nil is treated as zero
# @return [String] "0 B" for zero or nil, otherwise the value with two
#   decimals followed by its unit (negative values keep their sign)
def format_bytes(bytes)
  return "0 B" if bytes.nil? || bytes.zero?

  units = %w[B KB MB GB TB]
  sign = bytes.negative? ? "-" : ""
  value = bytes.abs.to_f
  exp = 0

  # Repeated division is exact at unit boundaries, whereas a floored ratio of
  # logarithms can land one unit too low there. It also keeps values below
  # one byte in "B" instead of producing a negative unit index.
  while value >= 1024 && exp < units.length - 1
    value /= 1024
    exp += 1
  end

  format("%s%.2f %s", sign, value, units[exp])
end
|
|
101
|
+
end
|
metadata
ADDED
|
@@ -0,0 +1,164 @@
|
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
|
2
|
+
name: active_storage_dedup
|
|
3
|
+
version: !ruby/object:Gem::Version
|
|
4
|
+
version: 1.0.0.alpha
|
|
5
|
+
platform: ruby
|
|
6
|
+
authors:
|
|
7
|
+
- coderhs
|
|
8
|
+
bindir: exe
|
|
9
|
+
cert_chain: []
|
|
10
|
+
date: 1980-01-02 00:00:00.000000000 Z
|
|
11
|
+
dependencies:
|
|
12
|
+
- !ruby/object:Gem::Dependency
|
|
13
|
+
name: rails
|
|
14
|
+
requirement: !ruby/object:Gem::Requirement
|
|
15
|
+
requirements:
|
|
16
|
+
- - ">="
|
|
17
|
+
- !ruby/object:Gem::Version
|
|
18
|
+
version: 6.0.0
|
|
19
|
+
type: :runtime
|
|
20
|
+
prerelease: false
|
|
21
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
22
|
+
requirements:
|
|
23
|
+
- - ">="
|
|
24
|
+
- !ruby/object:Gem::Version
|
|
25
|
+
version: 6.0.0
|
|
26
|
+
- !ruby/object:Gem::Dependency
|
|
27
|
+
name: activestorage
|
|
28
|
+
requirement: !ruby/object:Gem::Requirement
|
|
29
|
+
requirements:
|
|
30
|
+
- - ">="
|
|
31
|
+
- !ruby/object:Gem::Version
|
|
32
|
+
version: 6.0.0
|
|
33
|
+
type: :runtime
|
|
34
|
+
prerelease: false
|
|
35
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
36
|
+
requirements:
|
|
37
|
+
- - ">="
|
|
38
|
+
- !ruby/object:Gem::Version
|
|
39
|
+
version: 6.0.0
|
|
40
|
+
- !ruby/object:Gem::Dependency
|
|
41
|
+
name: rspec
|
|
42
|
+
requirement: !ruby/object:Gem::Requirement
|
|
43
|
+
requirements:
|
|
44
|
+
- - "~>"
|
|
45
|
+
- !ruby/object:Gem::Version
|
|
46
|
+
version: '3.0'
|
|
47
|
+
type: :development
|
|
48
|
+
prerelease: false
|
|
49
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
50
|
+
requirements:
|
|
51
|
+
- - "~>"
|
|
52
|
+
- !ruby/object:Gem::Version
|
|
53
|
+
version: '3.0'
|
|
54
|
+
- !ruby/object:Gem::Dependency
|
|
55
|
+
name: rspec-rails
|
|
56
|
+
requirement: !ruby/object:Gem::Requirement
|
|
57
|
+
requirements:
|
|
58
|
+
- - ">="
|
|
59
|
+
- !ruby/object:Gem::Version
|
|
60
|
+
version: '0'
|
|
61
|
+
type: :development
|
|
62
|
+
prerelease: false
|
|
63
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
64
|
+
requirements:
|
|
65
|
+
- - ">="
|
|
66
|
+
- !ruby/object:Gem::Version
|
|
67
|
+
version: '0'
|
|
68
|
+
- !ruby/object:Gem::Dependency
|
|
69
|
+
name: sqlite3
|
|
70
|
+
requirement: !ruby/object:Gem::Requirement
|
|
71
|
+
requirements:
|
|
72
|
+
- - ">="
|
|
73
|
+
- !ruby/object:Gem::Version
|
|
74
|
+
version: '0'
|
|
75
|
+
type: :development
|
|
76
|
+
prerelease: false
|
|
77
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
78
|
+
requirements:
|
|
79
|
+
- - ">="
|
|
80
|
+
- !ruby/object:Gem::Version
|
|
81
|
+
version: '0'
|
|
82
|
+
- !ruby/object:Gem::Dependency
|
|
83
|
+
name: simplecov
|
|
84
|
+
requirement: !ruby/object:Gem::Requirement
|
|
85
|
+
requirements:
|
|
86
|
+
- - ">="
|
|
87
|
+
- !ruby/object:Gem::Version
|
|
88
|
+
version: '0'
|
|
89
|
+
type: :development
|
|
90
|
+
prerelease: false
|
|
91
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
92
|
+
requirements:
|
|
93
|
+
- - ">="
|
|
94
|
+
- !ruby/object:Gem::Version
|
|
95
|
+
version: '0'
|
|
96
|
+
- !ruby/object:Gem::Dependency
|
|
97
|
+
name: combustion
|
|
98
|
+
requirement: !ruby/object:Gem::Requirement
|
|
99
|
+
requirements:
|
|
100
|
+
- - "~>"
|
|
101
|
+
- !ruby/object:Gem::Version
|
|
102
|
+
version: '1.3'
|
|
103
|
+
type: :development
|
|
104
|
+
prerelease: false
|
|
105
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
106
|
+
requirements:
|
|
107
|
+
- - "~>"
|
|
108
|
+
- !ruby/object:Gem::Version
|
|
109
|
+
version: '1.3'
|
|
110
|
+
description: Prevents duplicate file uploads in Active Storage by reusing existing
|
|
111
|
+
blobs with matching checksums and service names
|
|
112
|
+
email:
|
|
113
|
+
- mailme@hsps.in
|
|
114
|
+
executables: []
|
|
115
|
+
extensions: []
|
|
116
|
+
extra_rdoc_files: []
|
|
117
|
+
files:
|
|
118
|
+
- ".rspec"
|
|
119
|
+
- ".rubocop.yml"
|
|
120
|
+
- CHANGELOG.md
|
|
121
|
+
- CODE_OF_CONDUCT.md
|
|
122
|
+
- LICENSE.txt
|
|
123
|
+
- README.md
|
|
124
|
+
- Rakefile
|
|
125
|
+
- lib/active_storage_dedup.rb
|
|
126
|
+
- lib/active_storage_dedup/attachment_extension.rb
|
|
127
|
+
- lib/active_storage_dedup/attachment_options.rb
|
|
128
|
+
- lib/active_storage_dedup/blob_deduplication.rb
|
|
129
|
+
- lib/active_storage_dedup/changes_extension.rb
|
|
130
|
+
- lib/active_storage_dedup/configuration.rb
|
|
131
|
+
- lib/active_storage_dedup/deduplication_job.rb
|
|
132
|
+
- lib/active_storage_dedup/railtie.rb
|
|
133
|
+
- lib/active_storage_dedup/version.rb
|
|
134
|
+
- lib/generators/active_storage_dedup/install_generator.rb
|
|
135
|
+
- lib/generators/active_storage_dedup/templates/README
|
|
136
|
+
- lib/generators/active_storage_dedup/templates/add_active_storage_dedup.rb.erb
|
|
137
|
+
- lib/generators/active_storage_dedup/templates/initializer.rb
|
|
138
|
+
- lib/tasks/active_storage_dedup.rake
|
|
139
|
+
- sig/active_storage_dedup.rbs
|
|
140
|
+
homepage: https://github.com/coderhs/active_storage_dedup
|
|
141
|
+
licenses:
|
|
142
|
+
- MIT
|
|
143
|
+
metadata:
|
|
144
|
+
homepage_uri: https://github.com/coderhs/active_storage_dedup
|
|
145
|
+
source_code_uri: https://github.com/coderhs/active_storage_dedup
|
|
146
|
+
changelog_uri: https://github.com/coderhs/active_storage_dedup/blob/main/CHANGELOG.md
|
|
147
|
+
rdoc_options: []
|
|
148
|
+
require_paths:
|
|
149
|
+
- lib
|
|
150
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
|
151
|
+
requirements:
|
|
152
|
+
- - ">="
|
|
153
|
+
- !ruby/object:Gem::Version
|
|
154
|
+
version: 2.7.0
|
|
155
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
156
|
+
requirements:
|
|
157
|
+
- - ">="
|
|
158
|
+
- !ruby/object:Gem::Version
|
|
159
|
+
version: '0'
|
|
160
|
+
requirements: []
|
|
161
|
+
rubygems_version: 3.6.7
|
|
162
|
+
specification_version: 4
|
|
163
|
+
summary: Deduplication for Active Storage uploads
|
|
164
|
+
test_files: []
|