pumice 0.7.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/LICENSE +21 -0
- data/README.md +962 -0
- data/lib/pumice/analyzer.rb +67 -0
- data/lib/pumice/configuration.rb +330 -0
- data/lib/pumice/dsl.rb +267 -0
- data/lib/pumice/dump_generator.rb +115 -0
- data/lib/pumice/empty_sanitizer.rb +38 -0
- data/lib/pumice/generators/column_classification.rb +58 -0
- data/lib/pumice/generators/install_generator.rb +33 -0
- data/lib/pumice/generators/sanitizer_generator.rb +107 -0
- data/lib/pumice/generators/templates/initializer.rb.erb +51 -0
- data/lib/pumice/generators/templates/sanitizer.rb.erb +32 -0
- data/lib/pumice/generators/templates/sanitizer_spec.rb.erb +15 -0
- data/lib/pumice/generators/test_generator.rb +20 -0
- data/lib/pumice/helpers.rb +141 -0
- data/lib/pumice/logger.rb +105 -0
- data/lib/pumice/output.rb +81 -0
- data/lib/pumice/progress.rb +42 -0
- data/lib/pumice/pruner.rb +157 -0
- data/lib/pumice/pruning/analyzer.rb +207 -0
- data/lib/pumice/railtie.rb +15 -0
- data/lib/pumice/rspec.rb +101 -0
- data/lib/pumice/runner.rb +66 -0
- data/lib/pumice/safe_scrubber.rb +341 -0
- data/lib/pumice/sanitizer.rb +336 -0
- data/lib/pumice/soft_scrubbing/policy.rb +104 -0
- data/lib/pumice/soft_scrubbing.rb +101 -0
- data/lib/pumice/validator.rb +113 -0
- data/lib/pumice/version.rb +5 -0
- data/lib/pumice.rb +23 -0
- data/lib/tasks/db_scrub.rake +616 -0
- metadata +132 -0
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Pumice
  # Reports table sizes and row counts for the database behind
  # ActiveRecord::Base.connection. The size report reads pg_tables and calls
  # pg_total_relation_size, so it targets PostgreSQL.
  class Analyzer
    # One row of the size report: table name, human-readable size, raw byte count.
    TableSize = Struct.new(:name, :size, :bytes, keyword_init: true)
    # Row count for a single table.
    RowCount = Struct.new(:table, :count, keyword_init: true)

    # Schema-qualified reference to the current pg_tables row.
    # quote_ident keeps mixed-case / special-character names intact; a bare
    # schemaname||'.'||tablename is parsed as an identifier and case-folded,
    # which makes the lookup fail for such tables.
    QUALIFIED_RELATION_SQL =
      "(quote_ident(schemaname) || '.' || quote_ident(tablename))::regclass"
    private_constant :QUALIFIED_RELATION_SQL

    # limit:  maximum number of tables returned by #table_sizes (coerced with to_i)
    # schema: schema whose tables are measured
    # tables: table names to count rows for; falls back to
    #         Pumice.config.sensitive_tables when nil
    def initialize(limit: 20, schema: 'public', tables: nil)
      @limit = limit
      @schema = schema
      @tables = Array(tables || Pumice.config.sensitive_tables).map(&:to_s)
    end

    # Returns an Array of TableSize, largest first (memoized).
    def table_sizes
      @sizes ||= fetch_table_sizes
    end

    # Sum of the byte sizes reported by #table_sizes (memoized).
    def total_bytes
      @total ||= table_sizes.sum(&:bytes)
    end

    # Returns an Array of RowCount for the configured tables (memoized).
    # Tables whose count query fails are left out.
    def row_counts
      @row_counts ||= fetch_row_counts
    end

    private

    def fetch_table_sizes
      conn = ActiveRecord::Base.connection

      sql = <<~SQL
        SELECT
          tablename,
          pg_size_pretty(pg_total_relation_size(#{QUALIFIED_RELATION_SQL})) AS size,
          pg_total_relation_size(#{QUALIFIED_RELATION_SQL}) AS bytes
        FROM pg_tables
        WHERE schemaname = #{conn.quote(@schema)}
        ORDER BY pg_total_relation_size(#{QUALIFIED_RELATION_SQL}) DESC
        LIMIT #{@limit.to_i};
      SQL

      conn.execute(sql).map do |row|
        TableSize.new(
          name: row['tablename'],
          size: row['size'],
          bytes: row['bytes'].to_i
        )
      end
    end

    def fetch_row_counts
      @tables.filter_map do |table|
        count = fetch_row_count(table)
        RowCount.new(table: table, count: count) if count
      end
    end

    # Returns the row count as an Integer, or nil when the query raises
    # (best-effort: the caller drops nil results).
    def fetch_row_count(table_name)
      conn = ActiveRecord::Base.connection
      quoted_table = conn.quote_table_name(table_name)
      conn.execute("SELECT COUNT(*) FROM #{quoted_table}").first['count'].to_i
    rescue StandardError
      nil
    end
  end
end
|
|
@@ -0,0 +1,330 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative 'soft_scrubbing/policy'
|
|
4
|
+
|
|
5
|
+
module Pumice
  # Holds every Pumice setting. An instance is created lazily by Pumice.config
  # and mutated inside Pumice.configure { |c| ... }.
  class Configuration
    attr_accessor :verbose, :strict, :continue_on_error,
                  :allow_keep_undefined_columns,
                  :sensitive_email_model, :sensitive_email_column, :default_verification,
                  # Validator configuration
                  :sensitive_token_columns,       # Token columns to verify are cleared (e.g., Devise tokens)
                  :sensitive_external_id_columns, # External ID columns to verify are cleared
                  # Safe scrub configuration
                  :source_database_url,           # Source database (read-only, never modified)
                  :target_database_url,           # Target database (scrubbed copy)
                  :export_path,                   # Optional: path to export the scrubbed dump
                  :export_format,                 # Export format: :custom (pg_dump -Fc) or :plain (SQL)
                  :require_readonly_source        # Enforce read-only source credentials (default: false, warns only)

    # Readers only: the writers defined below normalize their input.
    attr_reader :sensitive_tables, :sensitive_email_domains

    # Writers only: the readers defined below return normalized hashes.
    # pruning deletes old records before sanitization (see #pruning).
    attr_writer :soft_scrubbing, :pruning

    # Default verification policy for bulk operations.
    # Used when `verify_all` is called without a block.
    # Receives (model_class, bulk_operation) and returns a verification proc.
    # The bulk_operation hash contains :type (:truncate, :delete, :destroy) and :scope (optional block).
    # Any other :type yields nil.
    #
    # NOTE(review): for a scoped delete/destroy the scope block itself is
    # returned; whether `.none?` is then applied to its result is decided by
    # the caller, which is not visible here - confirm.
    DEFAULT_VERIFICATION_POLICY = lambda do |_model_class, bulk_operation|
      case bulk_operation[:type]
      when :truncate
        -> { count.zero? }
      when :delete, :destroy
        if bulk_operation[:scope]
          # Re-run the scope and check .none?
          bulk_operation[:scope]
        else
          -> { count.zero? }
        end
      end
    end

    def initialize
      @verbose = false
      @strict = true
      @continue_on_error = false
      @soft_scrubbing = false # Disabled by default; set to hash to enable
      @allow_keep_undefined_columns = true
      @sensitive_tables = []
      @sensitive_email_domains = []
      @sensitive_email_model = 'User'
      @sensitive_email_column = 'email'
      @default_verification = DEFAULT_VERIFICATION_POLICY
      # Validator defaults (Devise-compatible)
      @sensitive_token_columns = %w[reset_password_token confirmation_token]
      @sensitive_external_id_columns = []
      # Safe scrub defaults
      @source_database_url = nil
      @target_database_url = nil
      @export_path = nil
      @export_format = :custom
      @require_readonly_source = false # Warn by default, set true to enforce
      # Pruning defaults
      @pruning = false # Disabled by default; set to hash to enable
    end

    # Returns true if soft_scrubbing config is set (i.e. assigned a Hash)
    def soft_scrubbing_configured?
      @soft_scrubbing.is_a?(Hash)
    end

    # Returns the normalized soft_scrubbing configuration
    # ({ context:, policy: }), or nil when soft scrubbing is not configured.
    def soft_scrubbing
      return nil unless soft_scrubbing_configured?

      {
        context: @soft_scrubbing.fetch(:context, nil),
        policy: resolve_soft_scrubbing_policy
      }
    end

    # Returns true if pruning config is set (not checking ENV)
    def pruning_configured?
      @pruning.is_a?(Hash)
    end

    # Returns the normalized pruning configuration, or nil when
    # Pumice.pruning_enabled? is false.
    # Raises ArgumentError unless exactly one of older_than:/newer_than: is given.
    def pruning
      return nil unless Pumice.pruning_enabled?

      validate_pruning_config!

      {
        older_than: @pruning[:older_than],
        newer_than: @pruning[:newer_than],
        column: @pruning.fetch(:column, :created_at).to_sym,
        # normalize_collection also accepts a single name or nil
        only: normalize_collection(@pruning.fetch(:only, [])),
        except: normalize_collection(@pruning.fetch(:except, [])),
        analyzer: normalize_analyzer_config(@pruning.fetch(:analyzer, {}))
      }
    end

    # Resolves source_database_url, handling the :auto sentinel.
    # Returns a concrete URL string or nil.
    def resolved_source_database_url
      case @source_database_url
      when :auto then database_url_from_rails_config
      when String then @source_database_url
      end
    end

    def sensitive_tables=(value)
      @sensitive_tables = normalize_collection(value)
    end

    def sensitive_email_domains=(value)
      @sensitive_email_domains = normalize_collection(value)
    end

    # Appends table names, skipping ones already present.
    def add_sensitive_tables(value)
      @sensitive_tables |= normalize_collection(value)
    end

    # Appends email domains, skipping ones already present.
    def add_sensitive_email_domains(value)
      @sensitive_email_domains |= normalize_collection(value)
    end

    private

    # Resolves the policy from if:/unless: options
    # if: and unless: are mutually exclusive (if: takes precedence)
    def resolve_soft_scrubbing_policy
      if_condition = @soft_scrubbing[:if]
      unless_condition = @soft_scrubbing[:unless]

      if if_condition
        if_condition
      elsif unless_condition
        # Invert the unless condition
        ->(record, viewer) { !unless_condition.call(record, viewer) }
      else
        # Default: always scrub
        ->(_record, _viewer) { true }
      end
    end

    def validate_pruning_config!
      has_older = @pruning.key?(:older_than)
      has_newer = @pruning.key?(:newer_than)

      if has_older && has_newer
        raise ArgumentError,
              "Pruning config cannot specify both older_than and newer_than. " \
              "Use one or the other."
      end

      return if has_older || has_newer

      raise ArgumentError,
            "Pruning config requires either older_than: or newer_than: to specify " \
            "which records to prune."
    end

    def normalize_analyzer_config(analyzer_config)
      {
        table_patterns: Array(analyzer_config.fetch(:table_patterns, [])).map(&:to_s),
        min_table_size: analyzer_config.fetch(:min_table_size, 10_000_000), # 10 MB
        min_row_count: analyzer_config.fetch(:min_row_count, 1000)
      }
    end

    def database_url_from_rails_config
      config_hash = ActiveRecord::Base.connection_db_config.configuration_hash

      # Staging/production: config already has a url key
      return config_hash[:url] if config_hash[:url].present?

      # Development/test: build from components
      build_database_url(config_hash)
    end

    # Builds a postgresql:// URL from a database config hash.
    # Returns nil for non-postgresql adapters or when no database is named.
    def build_database_url(config_hash)
      return nil unless config_hash[:adapter] == 'postgresql'

      host = config_hash[:host] || 'localhost'
      port = config_hash[:port] || 5432
      database = config_hash[:database]
      username = config_hash[:username]
      password = config_hash[:password]

      return nil if database.blank?

      userinfo = if username.present? && password.present?
                   "#{encode_userinfo(username)}:#{encode_userinfo(password)}@"
                 elsif username.present?
                   "#{encode_userinfo(username)}@"
                 else
                   ''
                 end

      "postgresql://#{userinfo}#{host}:#{port}/#{database}"
    end

    # Percent-encodes a credential for the userinfo part of a URL.
    # encode_www_form_component uses form encoding, where a space becomes "+";
    # in userinfo a "+" is a literal plus sign, so spaces must be "%20".
    # (A literal "+" in the input is already emitted as "%2B".)
    def encode_userinfo(value)
      URI.encode_www_form_component(value).gsub('+', '%20')
    end

    # Coerces a single value, nil, or (nested) array into a flat array of strings.
    def normalize_collection(value)
      Array(value).flatten.compact.map(&:to_s)
    end
  end

  # The process-wide Configuration, created on first use.
  def self.config
    @config ||= Configuration.new
  end

  # Yields the configuration, then initializes soft scrubbing if it was configured.
  def self.configure
    yield(config)
    Pumice::SoftScrubbing.init! if config.soft_scrubbing_configured?
  end

  def self.dry_run?
    ENV['DRY_RUN'] == 'true'
  end

  def self.verbose?
    config.verbose
  end

  def self.strict?
    config.strict
  end

  def self.soft_scrubbing?
    config.soft_scrubbing_configured?
  end

  def self.allow_keep_undefined_columns?
    config.allow_keep_undefined_columns
  end

  def self.soft_scrubbing_context=(context)
    SoftScrubbing::Policy.context = context
  end

  def self.soft_scrubbing_context
    SoftScrubbing::Policy.current
  end

  def self.with_soft_scrubbing_context(context, &block)
    SoftScrubbing::Policy.with_context(context, &block)
  end

  def self.soft_scrubbing_enabled_for?(record)
    SoftScrubbing::Policy.enabled_for?(record)
  end

  # Returns true if pruning is configured and not disabled by ENV
  # Set PRUNE=false to disable pruning without changing config
  def self.pruning_enabled?
    return false if ENV['PRUNE'] == 'false'

    config.pruning_configured?
  end

  # Returns true if the given table should be pruned
  def self.prune_table?(table_name)
    return false unless pruning_enabled?

    pruning_config = config.pruning
    table = table_name.to_s

    if pruning_config[:only].present?
      pruning_config[:only].include?(table)
    elsif pruning_config[:except].present?
      !pruning_config[:except].include?(table)
    else
      true # Prune all tables if no filter specified
    end
  end

  # Returns the registered sanitizer whose model_class equals the given class,
  # or Pumice::EmptySanitizer when none matches. Matches are cached per model.
  def self.sanitizer_for(model_class)
    @sanitizer_map ||= {}
    @sanitizer_map[model_class] ||= sanitizers.find do |sanitizer|
      sanitizer.model_class == model_class
    rescue StandardError
      # A sanitizer whose model_class raises simply does not match.
      nil
    end

    @sanitizer_map[model_class] || Pumice::EmptySanitizer
  end

  # Runs lint! on every registered sanitizer, prints a report to stdout and
  # returns the collected issues (empty when everything is covered).
  def self.lint!
    issues = []

    sanitizers.each do |sanitizer|
      issues.concat(sanitizer.lint!)
    end

    if issues.any?
      puts "\n🔍 Pumice Lint Errors:\n"
      issues.each { |issue| puts "  ❌ #{issue}" }
      puts "\n"
    else
      puts "\n✅ Pumice: All sanitizers have complete coverage\n"
    end

    issues
  end

  def self.sanitizers
    @sanitizers ||= []
  end

  # Adds a sanitizer to the registry unless it is already registered.
  def self.register(sanitizer)
    sanitizers << sanitizer unless sanitizers.include?(sanitizer)
  end

  # Clears the registry, the sanitizer cache, the configuration and the
  # soft-scrubbing policy state.
  def self.reset!
    @sanitizers = []
    @sanitizer_map = {}
    @config = nil
    SoftScrubbing::Policy.reset!
  end
end
|
data/lib/pumice/dsl.rb
ADDED
|
@@ -0,0 +1,267 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Pumice
  # DSL methods for defining sanitizer behavior.
  # Extended by Sanitizer subclasses to provide scrub/keep declarations.
  module DSL
    # Columns never reported as undefined by #undefined_columns.
    PROTECTED_COLUMNS = %w[id created_at updated_at].freeze

    # Explicitly declare the model this sanitizer handles.
    # Uses pluralized form (like has_many):
    #   sanitizes :users                                    # infers User
    #   sanitizes :admin_users, class_name: 'Admin::User'   # namespaced model
    #   sanitizes :users, class_name: User                  # explicit constant
    # A String or Symbol class_name is constantized; anything else is used as-is.
    def sanitizes(model_name, class_name: model_name.to_s.classify)
      @model_class = case class_name
                     when String, Symbol then class_name.to_s.constantize
                     else class_name
                     end
    end

    # The model handled by this sanitizer; inferred from the sanitizer's
    # class name when `sanitizes` was not called.
    def model_class
      @model_class ||= infer_model_class
    end

    # Override the auto-derived friendly name for rake tasks
    # Example: friendly_name 'legacy_users' in LegacyUserDataSanitizer
    # Called without an argument it returns the current friendly name.
    def friendly_name(name = nil)
      if name
        @friendly_name = name.to_s
      else
        @friendly_name || infer_friendly_name
      end
    end

    # Define a scrubbing rule for a column
    def scrub(name, &block)
      scrubbed[name] = block
    end

    # Mark columns as safe to keep unchanged (not PII)
    # Names are stored as symbols; repeated names are stored once.
    def keep(*names)
      @kept = kept | names.map(&:to_sym)
    end

    # UNSAFE: Keep all columns not explicitly declared via scrub or keep.
    # Bypasses PII review - use only for development/testing.
    # Disable with: Pumice.configure { |c| c.allow_keep_undefined_columns = false }
    #
    # Order matters: the undefined columns are computed when this method is
    # called, so it only sees scrub/keep declarations made before it.
    def keep_undefined_columns!
      unless Pumice.allow_keep_undefined_columns?
        raise "keep_undefined_columns! is disabled. " \
              "This method bypasses PII review and should not be used in production."
      end

      @kept = kept | undefined_columns.map(&:to_sym)
    end

    # Prune Operation
    # Removes matching records BEFORE record-by-record scrubbing.
    # Use when you want to delete old/irrelevant records AND scrub the survivors.
    #
    # Unlike bulk operations (truncate!, delete_all, destroy_all) which are terminal,
    # prune is a pre-step: it deletes matching records, then scrubbing continues
    # on the remaining records.
    #
    # Examples:
    #   prune { where(created_at: ..1.year.ago) }   # Delete old, scrub the rest
    #   prune { where(status: 'archived') }         # Delete archived, scrub active
    def prune(&scope)
      raise ArgumentError, 'prune requires a block' unless scope

      @prune_operation = { scope: scope }
    end

    # Convenience: prune records older than the given age.
    # Accepts a duration (1.year), DateTime, or date string ("2024-01-01").
    # The cutoff is resolved once, when this method is called.
    #
    # Examples:
    #   prune_older_than 1.year
    #   prune_older_than 90.days
    #   prune_older_than DateTime.new(2024, 1, 1)
    #   prune_older_than "2024-01-01"
    def prune_older_than(age, column: :created_at)
      cutoff = resolve_prune_cutoff(age)
      prune { where(column => ...cutoff) }
    end

    # Convenience: prune records newer than the given age.
    # Accepts a duration (1.year), DateTime, or date string ("2024-01-01").
    # The cutoff is resolved once, when this method is called.
    #
    # Examples:
    #   prune_newer_than 1.year
    #   prune_newer_than 30.days
    #   prune_newer_than "2025-06-01"
    def prune_newer_than(age, column: :created_at)
      cutoff = resolve_prune_cutoff(age)
      prune { where(column => cutoff..) }
    end

    # The declared prune operation ({ scope: }), or nil.
    def prune_operation
      @prune_operation
    end

    # Bulk Operations (Terminal)
    # These replace record-by-record sanitization with fast bulk SQL operations.
    # No scrubbing runs after a bulk operation.
    # Use for audit logs, sessions, and other high-volume tables.

    # TRUNCATE TABLE - fastest, resets auto-increment, no conditions
    # Examples:
    #   truncate!
    #   truncate!(verify: true)  # verifies count.zero? after truncation
    def truncate!(verify: false)
      @bulk_operation = { type: :truncate }
      verify_all if verify
    end

    # DELETE with optional scope - fast, no callbacks/associations
    # Examples:
    #   delete_all                                 # deletes all records
    #   delete_all { where(item_type: 'User') }    # deletes matching records
    #   delete_all(verify: true) { where(...) }    # verifies scope.none? after deletion
    def delete_all(verify: false, &scope)
      @bulk_operation = { type: :delete, scope: scope }
      verify_all if verify
    end

    # DESTROY with optional scope - runs callbacks, handles associations
    # Examples:
    #   destroy_all                                  # destroys all records
    #   destroy_all { where(attachable_id: nil) }    # destroys orphaned records
    #   destroy_all(verify: true) { where(...) }     # verifies scope.none? after destruction
    def destroy_all(verify: false, &scope)
      @bulk_operation = { type: :destroy, scope: scope }
      verify_all if verify
    end

    # The declared bulk operation ({ type:, scope: }), or nil.
    def bulk_operation
      @bulk_operation
    end

    # Verification
    # Define post-sanitization checks to confirm the operation succeeded.

    # Verify after all records are processed (bulk or record-by-record)
    # Block executes in model scope and should return truthy for success.
    # Without a block the verification is flagged with use_default: true.
    # Examples:
    #   verify_all                                    # uses default for bulk ops
    #   verify_all { where(item_type: SENSITIVE_TYPES).none? }
    #   verify_all "No sensitive data should remain" do
    #     where(pii_column: true).count.zero?
    #   end
    def verify_all(message = nil, &block)
      @verification = if block
                        { message: message, block: block }
                      else
                        { message: message, use_default: true }
                      end
    end

    # Verify each record after sanitization (record-by-record only)
    # Block receives the sanitized record and should return truthy for success.
    # Examples:
    #   verify_each { |record| !record.email.include?('@gmail.com') }
    #   verify_each "Record should not contain real email" do |record|
    #     record.email.end_with?('@example.com')
    #   end
    def verify_each(message = nil, &block)
      raise ArgumentError, 'verify_each requires a block' unless block

      @record_verification = { message: message, block: block }
    end

    def verification
      @verification
    end

    def record_verification
      @record_verification
    end

    # Hash of column name => scrub block, in declaration order.
    def scrubbed
      @scrubbed ||= {}
    end

    # Array of kept column names (symbols).
    def kept
      @kept ||= []
    end

    def scrubbed_columns
      scrubbed.keys.map(&:to_s)
    end

    def scrubbed_column?(name)
      scrubbed_columns.include?(name.to_s)
    end

    def kept_columns
      kept.map(&:to_s)
    end

    # Every column named in a scrub or keep declaration, as strings.
    def defined_columns
      scrubbed_columns + kept_columns
    end

    # Model columns that are neither declared nor protected.
    def undefined_columns
      model_class.column_names - defined_columns - PROTECTED_COLUMNS
    end

    # Declared columns that the model no longer has.
    def stale_columns
      defined_columns - model_class.column_names
    end

    # Returns an array of human-readable issues for this sanitizer
    # (empty when there are none). A model that cannot be resolved is
    # reported as a single issue instead of raising.
    def lint!
      issues = []
      undefined = undefined_columns
      stale = stale_columns

      # Bulk operations are terminal — no scrubbing happens, so column coverage is irrelevant
      if undefined.any? && !bulk_operation
        issues << "#{name} (#{model_class.name}) has undefined columns: #{undefined.join(', ')}"
      end

      if stale.any?
        issues << "#{name} (#{model_class.name}) has stale columns (removed from model): #{stale.join(', ')}"
      end

      if bulk_operation && (scrubbed.any? || kept.any?)
        ignored = (scrubbed_columns + kept_columns).join(', ')
        issues << "#{name} uses a terminal bulk operation (#{bulk_operation[:type]}) but also declares scrub/keep columns (#{ignored}). These will be ignored."
      end

      issues
    rescue NameError, RuntimeError => e
      ["#{name} references a model that doesn't exist: #{e.message}"]
    end

    private

    def infer_model_class
      # e.g. UserSanitizer -> User, StudentSanitizer -> Student
      model_name = name.delete_suffix('Sanitizer')
      model_name.constantize
    rescue NameError
      raise "Could not infer model for #{name}. Use `sanitizes :model_names` to specify explicitly."
    end

    def infer_friendly_name
      name.delete_suffix('Sanitizer').underscore.pluralize
    end

    # Turns an age into a concrete cutoff: a Duration becomes `age.ago`,
    # date/time objects pass through, strings are parsed with DateTime.parse.
    # Raises ArgumentError for any other type.
    def resolve_prune_cutoff(age)
      case age
      when ActiveSupport::Duration
        age.ago
      when DateTime, Time, Date
        age
      when String
        DateTime.parse(age)
      else
        raise ArgumentError,
              "prune cutoff must be a Duration (1.year), DateTime, or date string, got #{age.class}"
      end
    end
  end
end
|