active_storage_dedup 1.0.0.alpha

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,101 @@
1
+ # frozen_string_literal: true
2
+
3
+ namespace :active_storage_dedup do
4
# Prints a report of blobs that share the same checksum and service_name.
# Within each group the oldest blob is shown as the "keeper" and every other
# blob as a duplicate; the byte_size of the duplicates is summed as wasted
# storage. The task only reads and prints, it does not modify any record.
desc "Report duplicate blobs grouped by checksum and service"
task report_duplicates: :environment do
  puts "Scanning for duplicate blobs...\n\n"

  # One row per (checksum, service_name) pair that occurs more than once.
  # The result is materialised with to_a because the summary needs the number
  # of groups: calling count on a grouped relation runs another aggregate
  # query and does not return an Integer.
  duplicate_groups = ActiveStorage::Blob
                     .select("checksum, service_name, COUNT(*) as blob_count")
                     .group(:checksum, :service_name)
                     .having("COUNT(*) > 1")
                     .order("blob_count DESC")
                     .to_a

  if duplicate_groups.empty?
    puts "No duplicate blobs found!"
    next # leave the task body early
  end

  total_duplicates = 0
  total_wasted_storage = 0

  duplicate_groups.each do |group|
    # Load the group once so keeper and duplicates come from the same result
    # set; id is a tie-breaker so equal created_at values give a stable order.
    blobs = ActiveStorage::Blob
            .where(checksum: group.checksum, service_name: group.service_name)
            .order(:created_at, :id)
            .to_a

    keeper, *duplicates = blobs

    # Calculate wasted storage (size of duplicate blobs)
    wasted_bytes = duplicates.sum(&:byte_size)
    total_wasted_storage += wasted_bytes

    puts "Checksum: #{group.checksum}"
    puts "Service: #{group.service_name}"
    puts "Filename: #{keeper.filename}"
    puts "Total blobs: #{blobs.size}"
    puts "Keeper blob ID: #{keeper.id} (#{keeper.attachments.count} attachments)"
    puts "Duplicate blob IDs: #{duplicates.map(&:id).join(', ')}"
    puts "Total attachments across duplicates: #{duplicates.sum { |b| b.attachments.count }}"
    puts "Wasted storage: #{format_bytes(wasted_bytes)}"
    puts "-" * 80
    puts

    total_duplicates += duplicates.size
  end

  puts "\nSummary:"
  puts "Total duplicate groups: #{duplicate_groups.size}"
  puts "Total duplicate blobs: #{total_duplicates}"
  puts "Total wasted storage: #{format_bytes(total_wasted_storage)}"
end
54
+
55
# Runs ActiveStorageDedup::DeduplicationJob synchronously (perform_now) and
# prints a banner before and after; the job itself is defined elsewhere.
desc "Clean up all duplicate blobs by merging them (sanity check)"
task cleanup_all: [:environment] do
  puts "Running sanity check to find and merge duplicate blobs...\n\n"

  # Executed inline rather than enqueued, so the task returns once the job has run.
  dedup_job = ActiveStorageDedup::DeduplicationJob
  dedup_job.perform_now

  puts "\nCleanup complete! Check logs for details."
end
64
+
65
# Walks every blob and rewrites reference_count whenever it differs from the
# number of attachments the blob currently has. Progress is printed every
# 100 blobs and a short summary at the end.
desc "Backfill reference_count for existing blobs"
task backfill_reference_count: :environment do
  puts "Backfilling reference_count for all blobs...\n\n"

  total_blobs = ActiveStorage::Blob.count
  corrected = 0
  processed = 0

  ActiveStorage::Blob.find_each do |blob|
    attachment_total = blob.attachments.count
    stored_total = blob.reference_count || 0 # a nil counter is treated as zero

    unless attachment_total == stored_total
      # update_column writes the value directly to the column.
      blob.update_column(:reference_count, attachment_total)
      corrected += 1
    end

    processed += 1
    puts "Processed #{processed}/#{total_blobs} blobs..." if (processed % 100).zero?
  end

  puts "\nBackfill complete!"
  puts "Total blobs: #{total_blobs}"
  puts "Updated: #{corrected}"
end
90
+
91
# Formats a byte count as a human-readable string using 1024-based units.
#
# The unit is chosen by repeatedly dividing by 1024 instead of taking a
# logarithm, so negative and fractional inputs no longer raise or pick the
# wrong unit, and a value that would print as "1024.00" of one unit is shown
# as "1.00" of the next unit. Values beyond the largest unit stay in TB.
#
# NOTE(review): this is a plain `def` inside the rake namespace block, so it
# is not scoped to the namespace - confirm that is acceptable for the host app.
#
# @param bytes [Numeric] byte count to render
# @return [String] "0 B" for zero, otherwise the value with two decimals
#   followed by one of B, KB, MB, GB, TB
def format_bytes(bytes)
  return "0 B" if bytes.zero?

  units = %w[B KB MB GB TB]
  value = bytes.to_f
  exp = 0

  # Compare the rounded magnitude so 1023.999 KB is promoted to 1.00 MB
  # rather than being printed as "1024.00 KB".
  while exp < units.length - 1 && value.abs.round(2) >= 1024
    value /= 1024
    exp += 1
  end

  format("%.2f %s", value, units[exp])
end
101
+ end
@@ -0,0 +1,4 @@
1
+ # RBS signature for the ActiveStorageDedup namespace; it declares only the VERSION constant.
+ module ActiveStorageDedup
2
+ VERSION: String
3
+ # Further signatures can be added here; see the RBS writing guide: https://github.com/ruby/rbs#guides
4
+ end
metadata ADDED
@@ -0,0 +1,164 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: active_storage_dedup
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.0.0.alpha
5
+ platform: ruby
6
+ authors:
7
+ - coderhs
8
+ bindir: exe
9
+ cert_chain: []
10
+ date: 1980-01-02 00:00:00.000000000 Z
11
+ dependencies:
12
+ - !ruby/object:Gem::Dependency
13
+ name: rails
14
+ requirement: !ruby/object:Gem::Requirement
15
+ requirements:
16
+ - - ">="
17
+ - !ruby/object:Gem::Version
18
+ version: 6.0.0
19
+ type: :runtime
20
+ prerelease: false
21
+ version_requirements: !ruby/object:Gem::Requirement
22
+ requirements:
23
+ - - ">="
24
+ - !ruby/object:Gem::Version
25
+ version: 6.0.0
26
+ - !ruby/object:Gem::Dependency
27
+ name: activestorage
28
+ requirement: !ruby/object:Gem::Requirement
29
+ requirements:
30
+ - - ">="
31
+ - !ruby/object:Gem::Version
32
+ version: 6.0.0
33
+ type: :runtime
34
+ prerelease: false
35
+ version_requirements: !ruby/object:Gem::Requirement
36
+ requirements:
37
+ - - ">="
38
+ - !ruby/object:Gem::Version
39
+ version: 6.0.0
40
+ - !ruby/object:Gem::Dependency
41
+ name: rspec
42
+ requirement: !ruby/object:Gem::Requirement
43
+ requirements:
44
+ - - "~>"
45
+ - !ruby/object:Gem::Version
46
+ version: '3.0'
47
+ type: :development
48
+ prerelease: false
49
+ version_requirements: !ruby/object:Gem::Requirement
50
+ requirements:
51
+ - - "~>"
52
+ - !ruby/object:Gem::Version
53
+ version: '3.0'
54
+ - !ruby/object:Gem::Dependency
55
+ name: rspec-rails
56
+ requirement: !ruby/object:Gem::Requirement
57
+ requirements:
58
+ - - ">="
59
+ - !ruby/object:Gem::Version
60
+ version: '0'
61
+ type: :development
62
+ prerelease: false
63
+ version_requirements: !ruby/object:Gem::Requirement
64
+ requirements:
65
+ - - ">="
66
+ - !ruby/object:Gem::Version
67
+ version: '0'
68
+ - !ruby/object:Gem::Dependency
69
+ name: sqlite3
70
+ requirement: !ruby/object:Gem::Requirement
71
+ requirements:
72
+ - - ">="
73
+ - !ruby/object:Gem::Version
74
+ version: '0'
75
+ type: :development
76
+ prerelease: false
77
+ version_requirements: !ruby/object:Gem::Requirement
78
+ requirements:
79
+ - - ">="
80
+ - !ruby/object:Gem::Version
81
+ version: '0'
82
+ - !ruby/object:Gem::Dependency
83
+ name: simplecov
84
+ requirement: !ruby/object:Gem::Requirement
85
+ requirements:
86
+ - - ">="
87
+ - !ruby/object:Gem::Version
88
+ version: '0'
89
+ type: :development
90
+ prerelease: false
91
+ version_requirements: !ruby/object:Gem::Requirement
92
+ requirements:
93
+ - - ">="
94
+ - !ruby/object:Gem::Version
95
+ version: '0'
96
+ - !ruby/object:Gem::Dependency
97
+ name: combustion
98
+ requirement: !ruby/object:Gem::Requirement
99
+ requirements:
100
+ - - "~>"
101
+ - !ruby/object:Gem::Version
102
+ version: '1.3'
103
+ type: :development
104
+ prerelease: false
105
+ version_requirements: !ruby/object:Gem::Requirement
106
+ requirements:
107
+ - - "~>"
108
+ - !ruby/object:Gem::Version
109
+ version: '1.3'
110
+ description: Prevents duplicate file uploads in Active Storage by reusing existing
111
+ blobs with matching checksums and service names
112
+ email:
113
+ - mailme@hsps.in
114
+ executables: []
115
+ extensions: []
116
+ extra_rdoc_files: []
117
+ files:
118
+ - ".rspec"
119
+ - ".rubocop.yml"
120
+ - CHANGELOG.md
121
+ - CODE_OF_CONDUCT.md
122
+ - LICENSE.txt
123
+ - README.md
124
+ - Rakefile
125
+ - lib/active_storage_dedup.rb
126
+ - lib/active_storage_dedup/attachment_extension.rb
127
+ - lib/active_storage_dedup/attachment_options.rb
128
+ - lib/active_storage_dedup/blob_deduplication.rb
129
+ - lib/active_storage_dedup/changes_extension.rb
130
+ - lib/active_storage_dedup/configuration.rb
131
+ - lib/active_storage_dedup/deduplication_job.rb
132
+ - lib/active_storage_dedup/railtie.rb
133
+ - lib/active_storage_dedup/version.rb
134
+ - lib/generators/active_storage_dedup/install_generator.rb
135
+ - lib/generators/active_storage_dedup/templates/README
136
+ - lib/generators/active_storage_dedup/templates/add_active_storage_dedup.rb.erb
137
+ - lib/generators/active_storage_dedup/templates/initializer.rb
138
+ - lib/tasks/active_storage_dedup.rake
139
+ - sig/active_storage_dedup.rbs
140
+ homepage: https://github.com/coderhs/active_storage_dedup
141
+ licenses:
142
+ - MIT
143
+ metadata:
144
+ homepage_uri: https://github.com/coderhs/active_storage_dedup
145
+ source_code_uri: https://github.com/coderhs/active_storage_dedup
146
+ changelog_uri: https://github.com/coderhs/active_storage_dedup/blob/main/CHANGELOG.md
147
+ rdoc_options: []
148
+ require_paths:
149
+ - lib
150
+ required_ruby_version: !ruby/object:Gem::Requirement
151
+ requirements:
152
+ - - ">="
153
+ - !ruby/object:Gem::Version
154
+ version: 2.7.0
155
+ required_rubygems_version: !ruby/object:Gem::Requirement
156
+ requirements:
157
+ - - ">="
158
+ - !ruby/object:Gem::Version
159
+ version: '0'
160
+ requirements: []
161
+ rubygems_version: 3.6.7
162
+ specification_version: 4
163
+ summary: Deduplication for Active Storage uploads
164
+ test_files: []