pumice 0.7.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,336 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Pumice
4
# Raised by Sanitizer.validate_coverage! when a sanitizer has columns with no
# scrub definition.
class UndefinedAttributeError < StandardError
end
5
# Raised when a record-level or model-level verification check returns a
# falsy result after scrubbing.
class VerificationError < StandardError
end
6
+
7
+ class Sanitizer
8
+ extend Pumice::DSL
9
+ include Pumice::Helpers
10
+
11
+ class << self
12
# Ruby hook fired whenever a class inherits from Sanitizer. After letting the
# normal hook chain run, the new subclass is handed to Pumice.register.
def inherited(subclass)
  super(subclass)
  Pumice.register(subclass)
end
16
+
17
# Computes sanitized values without writing anything to the database.
#
#   sanitize(record)         # => hash of every scrubbed attribute
#   sanitize(record, :attr)  # => the scrubbed value of one attribute
#
# raw_value is forwarded to Sanitizer#scrub as the value to scrub when a
# single attribute is requested; it is ignored for the whole-record form.
# The work runs inside with_seed_for(record).
def sanitize(record, attr_name = nil, raw_value: nil)
  with_seed_for(record) do
    sanitizer = new(record)
    if attr_name
      sanitizer.scrub(attr_name, raw_value)
    else
      sanitizer.scrub_all
    end
  end
end
26
+
27
# Destructive counterpart of sanitize: computes the scrubbed value(s) and
# hands them to persist.
#
#   scrub!(record)         # persists every scrubbed attribute
#   scrub!(record, :attr)  # persists one scrubbed attribute
#
# Returns whatever sanitize returned (a hash, or a single value).
def scrub!(record, attr_name = nil)
  sanitize(record, attr_name).tap do |scrubbed_values|
    persist(record, attr_name, scrubbed_values)
  end
end
35
+
36
# Batch entry point: sanitizes every record of this sanitizer's model.
#
# * If a bulk operation (truncate!, delete_all, destroy_all) is defined, it
#   runs instead of record-by-record sanitization.
# * Otherwise, if a prune operation is defined, matching records are deleted
#   first and the remaining records are scrubbed one by one.
#
# Coverage is validated up front in strict mode (unless a bulk operation
# replaces per-column scrubbing), and model-level verification runs
# afterwards except in dry-run mode.
#
# A NameError raised along the way is treated as "model not found" and the
# sanitizer is skipped with a log line. NoMethodError is a subclass of
# NameError, so it is re-raised explicitly: a typo inside a scrub block
# must surface as an error instead of being reported as a missing model.
def scrub_all!
  validate_coverage! if Pumice.strict? && !bulk_operation

  logger.initialize_stats
  logger.log_start(name)

  if Pumice.dry_run? && !bulk_operation && scrubbed.any?
    logger.log_progress("Columns: #{scrubbed_columns.join(', ')}")
  end

  count = if bulk_operation
            run_bulk_operation
          else
            pruned = prune_operation ? run_prune : 0
            pruned + run_record_sanitization
          end

  run_verification unless Pumice.dry_run?

  logger.log_complete(name, count)
rescue NameError => e
  raise if e.is_a?(NoMethodError)

  logger.log_progress("Skipping #{name} (model not found)")
end
65
+
66
+ private
67
+
68
# Executes the configured bulk operation and returns the number of affected
# records. In dry-run mode nothing is modified: the records that would be
# hit are counted (within the operation's scope, if one was given) and the
# count is logged. An unrecognised operation type yields nil.
def run_bulk_operation
  operation = bulk_operation
  kind = operation[:type]

  unless Pumice.dry_run?
    return case kind
           when :truncate then run_truncate
           when :delete then run_delete(operation[:scope])
           when :destroy then run_destroy(operation[:scope])
           end
  end

  target = operation[:scope] ? model_class.instance_exec(&operation[:scope]) : model_class
  affected = target.count
  logger.log_progress("[DRY RUN] Would #{kind} #{affected} records")
  affected
end
90
+
91
# Truncates the model's table and returns the row count taken just before
# the truncate.
#
# The statement is issued on the model's own connection rather than
# ActiveRecord::Base.connection, so models bound to a different database
# are truncated where their table lives.
def run_truncate
  table = model_class.table_name
  count = model_class.count
  model_class.connection.truncate(table)
  logger.log_progress("Truncated #{table}")
  count
end
98
+
99
# Removes records with delete_all, limited to the relation produced by
# scope_block when one is given, otherwise across the whole model.
# Logs and returns the value delete_all reports.
def run_delete(scope_block)
  target = if scope_block
             model_class.instance_exec(&scope_block)
           else
             model_class.all
           end

  target.delete_all.tap do |deleted|
    logger.log_progress("Deleted #{deleted} records")
  end
end
105
+
106
# Removes records with destroy_all, limited to the relation produced by
# scope_block when one is given, otherwise across the whole model.
# Logs and returns the size of whatever destroy_all returned.
def run_destroy(scope_block)
  target = if scope_block
             model_class.instance_exec(&scope_block)
           else
             model_class.all
           end

  destroyed = target.destroy_all.count
  logger.log_progress("Destroyed #{destroyed} records")
  destroyed
end
112
+
113
# Deletes the records selected by the prune operation's scope and returns
# how many were removed. In dry-run mode the scope is only counted and the
# would-be total is logged; nothing is deleted.
def run_prune
  doomed = model_class.instance_exec(&prune_operation[:scope])
  dry = Pumice.dry_run?

  total = dry ? doomed.count : doomed.delete_all
  logger.log_progress(dry ? "[DRY RUN] Would prune #{total} records" : "Pruned #{total} records")
  total
end
127
+
128
# Scrubs every record of the model one at a time via find_each, running the
# per-record verification after each write (skipped in dry-run mode).
#
# A failure on one record is logged; it is re-raised unless
# Pumice.config.continue_on_error is set, in which case that record is
# simply not counted. Returns the number of records processed successfully.
def run_record_sanitization
  expected = model_class.count
  bar = Pumice::Progress.new(title: model_class.name, total: expected)
  processed = 0

  model_class.find_each do |record|
    begin
      scrub!(record)
      run_record_verification(record) unless Pumice.dry_run?
      processed += 1
      bar.increment
    rescue StandardError => e
      logger.log_error(name, e)
      raise unless Pumice.config.continue_on_error
    end
  end

  bar.finish
  processed
end
144
+
145
# Runs the per-record verification block, if one is configured, against the
# freshly reloaded record so the check sees persisted values.
#
# Raises VerificationError (after logging) when the block returns a falsy
# value; the configured message is used when present, otherwise a default
# naming the sanitizer and record ID.
def run_record_verification(record)
  spec = record_verification
  return unless spec

  check = spec[:block]
  custom_message = spec[:message]

  record.reload
  return if check.call(record)

  failure = custom_message || "Record verification failed for #{name} (ID: #{record.id})"
  logger.log_progress("VERIFICATION FAILED: #{failure}")
  raise VerificationError, failure
end
162
+
163
# Dispatches model-level verification: an explicit block takes precedence,
# otherwise the default policy is used when :use_default is set. Does
# nothing when no verification is configured.
def run_verification
  spec = verification
  return unless spec

  custom_check = spec[:block]
  return execute_verification(custom_check, spec[:message]) if custom_check

  execute_default_verification(spec[:message]) if spec[:use_default]
end
172
+
173
# Evaluates a user-supplied verification block in the context of the model
# class. A truthy result is logged as a pass; a falsy one is logged and
# raised as VerificationError, using message when given.
def execute_verification(block, message)
  if model_class.instance_exec(&block)
    logger.log_progress("Verification passed")
  else
    failure = message || "Verification failed for #{name}"
    logger.log_progress("VERIFICATION FAILED: #{failure}")
    raise VerificationError, failure
  end
end
184
+
185
# Runs the configured default verification policy for a bulk operation.
#
# The policy (Pumice.config.default_verification) is called with the model
# class and the bulk operation and must return a block. That block is
# evaluated on the model class; if its result responds to none? (for
# example a relation of leftover records) the check passes only when
# none? is true, otherwise the result itself is treated as the verdict.
#
# Raises ArgumentError when no bulk operation is defined, and
# VerificationError when the check fails.
def execute_default_verification(message)
  operation = bulk_operation
  unless operation
    raise ArgumentError,
          "#{name}: verify_all without a block requires a bulk operation (truncate!, delete_all, destroy_all)"
  end

  policy_block = Pumice.config.default_verification.call(model_class, operation)
  outcome = model_class.instance_exec(&policy_block)
  outcome = outcome.none? if outcome.respond_to?(:none?)

  if outcome
    logger.log_progress("Verification passed")
  else
    failure = message || "Verification failed for #{name}"
    logger.log_progress("VERIFICATION FAILED: #{failure}")
    raise VerificationError, failure
  end
end
213
+
214
# Routes a sanitize result to the right writer: the whole-record writer when
# no attribute name was given, the single-attribute writer otherwise.
def persist(record, attr_name, result)
  return persist_record(record, result) unless attr_name

  persist_attribute(record, attr_name, result)
end
221
+
222
# Writes a hash of scrubbed values to the record with update_columns and
# logs the record ID. In dry-run mode nothing is written; instead a
# :would_sanitize entry is logged, listing old → new values per attribute
# when verbose, or only the attribute names otherwise.
def persist_record(record, data)
  unless Pumice.dry_run?
    record.update_columns(data)
    return logger.log_record(:sanitized, "ID #{record.id}")
  end

  summary =
    if Pumice.verbose?
      diffs = data.map do |attr, new_val|
        "#{attr} (#{record.read_attribute(attr).inspect} → #{new_val.inspect})"
      end
      "ID #{record.id}: #{diffs.join(', ')}"
    else
      "ID #{record.id} — #{data.keys.join(', ')}"
    end

  logger.log_record(:would_sanitize, summary)
end
238
+
239
# Writes one scrubbed value with update_column and logs it. In dry-run mode
# the current and would-be values are logged and nothing is written.
def persist_attribute(record, attr_name, value)
  unless Pumice.dry_run?
    record.update_column(attr_name, value)
    return logger.log_record(:sanitized, "ID #{record.id}.#{attr_name}")
  end

  before = record.read_attribute(attr_name)
  logger.log_record(:would_sanitize, "ID #{record.id}.#{attr_name} (#{before.inspect} → #{value.inspect})")
end
248
+
249
# Seeds Faker from the record's identity for the duration of the block so
# the same record always produces the same fake data, then restores the
# previous generator.
#
# The seed is the record's id when it converts to an integer (unchanged
# from earlier behaviour). Other ids, such as UUID strings, are turned into
# a stable integer from the bytes of their string form; Random.new rejects
# non-integer seeds, and String#hash differs between processes. Records
# without an id fall back to object_id.
#
# NOTE(review): thread-safety rests on Faker >= 3.0 keeping Config.random
# per thread, as the original comment states — not verifiable from here.
def with_seed_for(record)
  previous = Faker::Config.random
  key = record&.id || record.object_id
  seed = key.respond_to?(:to_int) ? key.to_int : key.to_s.unpack1('H*').to_i(16)
  Faker::Config.random = Random.new(seed)
  yield
ensure
  Faker::Config.random = previous
end
258
+
259
# Strict-mode guard: raises UndefinedAttributeError naming every column
# reported by undefined_columns. Returns nil when there are none.
def validate_coverage!
  unless undefined_columns.empty?
    hint = "Add scrub(:column) { value } for each, or set Pumice.configure { |c| c.strict = false }"
    raise UndefinedAttributeError,
          "#{name} is missing definitions for: #{undefined_columns.join(', ')}. #{hint}"
  end
end
266
+
267
# Single access point for the logging backend used by the class-level
# operations; always Pumice::Logger.
def logger
  Pumice::Logger
end
270
+ end
271
+
272
+ attr_reader :record
273
+
274
# Wraps one record; scrub blocks are evaluated against this instance and
# reach the record through #record.
def initialize(record)
  @record = record
end
277
+
278
# Returns the scrubbed value for one attribute.
#
# The value handed to the scrub block is raw_value when it is truthy,
# otherwise the record's current value for attr_name. Attributes with no
# registered scrub block are returned unchanged. The block runs with this
# sanitizer instance as self.
def scrub(attr_name, raw_value = nil)
  value = raw_value || record.send(attr_name)
  scrubber = self.class.scrubbed[attr_name.to_sym]

  scrubber ? instance_exec(value, &scrubber) : value
end
285
+
286
# Builds a hash of attribute name => scrubbed value covering every
# attribute that has a scrub definition on this sanitizer class.
def scrub_all
  self.class.scrubbed.keys.map { |attr_name| [attr_name, scrub(attr_name)] }.to_h
end
291
+
292
# Reads the attribute straight from the wrapped record, without applying
# any scrub block. Handy inside scrub blocks:
#
#   scrub(:email) { "#{raw(:first_name)}.#{raw(:last_name)}@example.test" }
def raw(attr_name)
  record.public_send(attr_name)
end
298
+
299
# Lets scrub blocks reference attributes as bare method calls. Resolution
# order:
#   1. raw_<attr>  → original value from the record (raw(:attr))
#   2. <attr> with a scrub definition → scrubbed value (scrub(:attr))
#   3. anything else the record responds to → forwarded to the record
# Unknown names fall through to the default NoMethodError behaviour.
def method_missing(method_name, *args, &block)
  if raw_attribute_method?(method_name)
    raw(extract_raw_attribute_name(method_name))
  elsif self.class.scrubbed_column?(method_name)
    scrub(method_name)
  elsif record.respond_to?(method_name)
    record.public_send(method_name, *args, &block)
  else
    super
  end
end
317
+
318
# Keeps respond_to? in step with method_missing: raw_<attr> readers,
# scrubbed columns and anything the record responds to are all reported.
def respond_to_missing?(method_name, include_private = false)
  handled_here = raw_attribute_method?(method_name) || self.class.scrubbed_column?(method_name)

  handled_here || record.respond_to?(method_name, include_private) || super
end
324
+
325
+ private
326
+
327
# True-ish when method_name looks like raw_<something> and the record
# responds to <something>.
def raw_attribute_method?(method_name)
  return false unless method_name.to_s.start_with?('raw_')

  record.respond_to?(extract_raw_attribute_name(method_name))
end
331
+
332
# Strips one leading "raw_" from the method name and returns the remainder
# as a symbol (:raw_email → :email). Names without the prefix come back
# unchanged.
def extract_raw_attribute_name(method_name)
  method_name.to_s.sub(/\Araw_/, '').to_sym
end
335
+ end
336
+ end
@@ -0,0 +1,104 @@
1
+ # frozen_string_literal: true
2
+
3
module Pumice
  module SoftScrubbing
    # Policy decides whether soft scrubbing applies to a record for the
    # current viewer context.
    #
    # TRANSITIONAL: the check is currently a single on/off decision. Planned
    # for later versions:
    #   - per-attribute policies (SSN vs email follow different rules)
    #   - role-graduated scrubbing (admin/manager/user see different levels)
    #   - viewer context passed into scrub blocks for conditional masking
    #
    # See lib/pumice/README.md for the roadmap.
    module Policy
      extend self

      # Thread.current slot holding the viewer context.
      THREAD_KEY = :pumice_soft_scrub_context
      # Thread.current slot recording that a context was explicitly assigned.
      CONTEXT_SET_KEY = :pumice_soft_scrub_context_set

      # Stores the viewer context and marks it as explicitly set, so a nil
      # context ("nobody logged in") can be told apart from "never set".
      def context=(context)
        Thread.current[THREAD_KEY] = context
        Thread.current[CONTEXT_SET_KEY] = true
      end

      # Resolves the stored context (or the configured fallback) into the
      # viewer object, optionally with respect to a record.
      def current(record = nil)
        resolve(Thread.current[THREAD_KEY], record)
      end

      # True only when a context has been explicitly assigned. Used to
      # distinguish "no logged-in user" from "not in a request context".
      def context_set?
        Thread.current[CONTEXT_SET_KEY] == true
      end

      # Runs the block with the given context, then restores whatever context
      # and set-flag were in place before, even if the block raises.
      def with_context(context)
        saved_context = Thread.current[THREAD_KEY]
        saved_flag = Thread.current[CONTEXT_SET_KEY]
        begin
          self.context = context
          yield
        ensure
          Thread.current[THREAD_KEY] = saved_context
          Thread.current[CONTEXT_SET_KEY] = saved_flag
        end
      end

      # Runs the block with soft scrubbing effectively switched off (context
      # and set-flag cleared), then restores the previous state. Intended for
      # authentication/session code that must skip policy checks.
      def without_context
        saved_context = Thread.current[THREAD_KEY]
        saved_flag = Thread.current[CONTEXT_SET_KEY]
        begin
          reset!
          yield
        ensure
          Thread.current[THREAD_KEY] = saved_context
          Thread.current[CONTEXT_SET_KEY] = saved_flag
        end
      end

      # Whether the record should be soft-scrubbed right now. Always false
      # when the feature is off or no context was set (e.g. during boot);
      # otherwise the configured policy callable decides, given the record
      # and the resolved viewer.
      def enabled_for?(record)
        return false unless Pumice.soft_scrubbing? && context_set?

        Pumice.config.soft_scrubbing[:policy].call(record, current(record))
      end

      # Clears the stored context and the set-flag.
      def reset!
        Thread.current[THREAD_KEY] = nil
        Thread.current[CONTEXT_SET_KEY] = nil
      end

      private

      # Turns a raw context into a viewer. A nil raw context falls back to
      # the configured :context (when soft scrubbing is on). Procs are called
      # (with the record unless they take no arguments), symbols/strings are
      # looked up via resolve_symbol, anything else is returned as-is.
      def resolve(raw_context, record)
        fallback = Pumice.soft_scrubbing? ? Pumice.config.soft_scrubbing[:context] : nil
        candidate = raw_context.nil? ? fallback : raw_context

        if candidate.is_a?(Proc)
          candidate.arity.zero? ? candidate.call : candidate.call(record)
        elsif candidate.is_a?(Symbol) || candidate.is_a?(String)
          resolve_symbol(candidate.to_sym, record)
        else
          candidate
        end
      end

      # Looks a name up, in order, on: the record, the Pumice module, a
      # Current constant if one is defined, and finally Thread.current.
      # Returns nil when none of them knows the name.
      def resolve_symbol(method_name, record)
        return record.public_send(method_name) if record&.respond_to?(method_name)
        return Pumice.public_send(method_name) if Pumice.respond_to?(method_name)
        return Current.public_send(method_name) if defined?(Current) && Current.respond_to?(method_name)

        Thread.current[method_name] if Thread.current.key?(method_name)
      end
    end
  end
end
@@ -0,0 +1,101 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Pumice
4
+ module SoftScrubbing
5
+ extend ActiveSupport::Concern
6
+
7
+ RECURSION_GUARD_KEY = :pumice_soft_scrub_in_progress
8
+
9
+ # System attributes that should never be scrubbed (needed for Rails internals)
10
+ SYSTEM_ATTRIBUTES = %w[id created_at updated_at].freeze
11
+
12
+ module AttributeInterceptor
13
# Intercepts attribute reads and substitutes the soft-scrubbed value when
# the policy says this record should be masked.
#
# The normal value (super) is returned when:
#   - a read is already being intercepted (recursion guard),
#   - soft scrubbing is not configured,
#   - the attribute is one of SYSTEM_ATTRIBUTES,
#   - the policy does not apply to this record, or
#   - the model's sanitizer has no scrub definition for the attribute.
#
# The recursion guard is raised for the duration of the policy check and
# scrub, and always lowered again in the ensure.
def _read_attribute(attr_name)
  guard_key = Pumice::SoftScrubbing::RECURSION_GUARD_KEY

  if Thread.current[guard_key] ||
     !Pumice.soft_scrubbing? ||
     Pumice::SoftScrubbing::SYSTEM_ATTRIBUTES.include?(attr_name.to_s)
    return super
  end

  begin
    Thread.current[guard_key] = true

    return super unless Pumice.soft_scrubbing_enabled_for?(self)

    sanitizer = Pumice.sanitizer_for(self.class)
    if sanitizer.scrubbed_column?(attr_name)
      soft_scrubbed_value(attr_name, sanitizer)
    else
      super
    end
  ensure
    Thread.current[guard_key] = false
  end
end
38
+
39
# Drops every cached soft-scrubbed value before delegating to the regular
# reload, so values are recomputed from the reloaded attributes.
def reload(*)
  @_soft_scrubbed_cache = nil
  super
end
43
+
44
# Evicts the cached soft-scrubbed value for the attribute (keyed by its
# string name) before performing the normal write.
def write_attribute(attr_name, value)
  cache = @_soft_scrubbed_cache
  cache.delete(attr_name.to_s) unless cache.nil?
  super
end
48
+
49
+ private
50
+
51
# Returns the soft-scrubbed value for an attribute, computing it at most
# once per record until the cache entry is invalidated.
#
# The cache is keyed by the attribute's string name. write_attribute evicts
# entries by attr_name.to_s, so storing under the caller's Symbol would
# leave a stale entry behind after a write. Presence is tested with key?
# so scrubbed values of nil or false are cached too, instead of being
# recomputed on every read.
#
# The raw value comes from @attributes rather than a reader, so the lookup
# does not re-enter the interceptor.
def soft_scrubbed_value(attr_name, sanitizer)
  key = attr_name.to_s
  cache = (@_soft_scrubbed_cache ||= {})
  return cache[key] if cache.key?(key)

  raw_value = @attributes.fetch_value(key)
  cache[key] = sanitizer.sanitize(self, attr_name, raw_value: raw_value)
end
58
+ end
59
+
60
# One-time boot hook that switches the feature on: prepends the attribute
# interceptor onto ActiveRecord::Base, registers a reloader callback that
# loads the app's sanitizers, and logs that initialization happened.
# Subsequent calls are no-ops.
def self.init!
  unless @initialized
    ActiveRecord::Base.prepend(AttributeInterceptor)
    @initialized = true

    # Load sanitizers through Rails' reloader.
    Rails.application.reloader.to_prepare { Pumice::SoftScrubbing.eager_load_sanitizers! }

    Rails.logger.info("[Pumice] Soft scrubbing initialized")
  end
end
74
+
75
# Whether init! has completed; strictly true or false.
def self.initialized?
  true == @initialized
end
78
+
79
# Debugging aid (console use only): clears the initialized flag and runs
# init! again.
def self.reinit!
  @initialized = false
  init!
end
84
+
85
# Loads every sanitizer under app/sanitizers by constantizing the constant
# name derived from each file's relative path. A file whose constant cannot
# be resolved is logged as a warning and skipped. Does nothing when the
# directory does not exist.
def self.eager_load_sanitizers!
  root = Rails.root.join('app/sanitizers')
  return unless root.exist?

  Dir[root.join('**/*.rb')].sort.each do |file|
    relative = Pathname.new(file).relative_path_from(root).to_s
    const_name = relative.delete_suffix('.rb').camelize

    const_name.constantize
  rescue NameError => e
    Rails.logger.warn("[Pumice] Could not load sanitizer #{const_name}: #{e.message}")
  end
end
100
+ end
101
+ end
@@ -0,0 +1,113 @@
1
+ # frozen_string_literal: true
2
+
3
module Pumice
  # Audits the configured "sensitive email" model after a scrub run and
  # reports whether real-looking data is still present.
  #
  # Checks performed by #run, in order:
  #   1. no email ends in any of the configured real domains
  #   2. at least one email ends in @example.test (informational: a failed
  #      check is recorded but adds no error)
  #   3. every configured token column is NULL for all rows
  #   4. every configured external-ID column is NULL for all rows
  class Validator
    # Outcome of a run: the error messages plus one Check per performed check.
    Result = Struct.new(:errors, :checks, keyword_init: true) do
      # True when no check produced an error message.
      def passed?
        errors.empty?
      end
    end

    # One performed check: its name, the count it observed, and pass/fail.
    Check = Struct.new(:name, :count, :passed, keyword_init: true)

    # email_domains - domains treated as "real"; defaults to
    # Pumice.config.sensitive_email_domains. A single value is accepted.
    def initialize(email_domains: nil)
      @email_domains = Array(email_domains || Pumice.config.sensitive_email_domains)
    end

    # Runs every check and returns a Result.
    # Raises NameError when the configured email model cannot be resolved.
    def run
      errors = []
      checks = []

      email_check = check_real_emails
      errors.concat(email_check[:errors])
      checks << email_check[:check]

      checks << check_test_emails

      [check_cleared_tokens, check_external_ids].each do |outcome|
        errors.concat(outcome[:errors])
        checks.concat(outcome[:checks])
      end

      Result.new(errors: errors, checks: checks)
    end

    private

    # The model class holding email addresses, resolved from configuration
    # and memoized. Re-raises NameError with a configuration hint.
    def email_model
      @email_model ||= Pumice.config.sensitive_email_model.constantize
    rescue NameError
      raise NameError,
            "Pumice validator: model '#{Pumice.config.sensitive_email_model}' not found. " \
            "Set config.sensitive_email_model to your app's user model (e.g. 'Account', 'Member')."
    end

    # Name of the email column, from configuration (memoized).
    def email_column
      @email_column ||= Pumice.config.sensitive_email_column
    end

    # One error per real domain that still has matching emails. The Check's
    # count is the number of offending domains, not the number of emails.
    def check_real_emails
      errors = @email_domains.each_with_object([]) do |domain, found|
        count = email_model.where("#{email_column} LIKE ?", "%@#{domain}").count
        found << "Found #{count} emails with real domain #{domain}" if count > 0
      end

      {
        errors: errors,
        check: Check.new(name: 'real_email_domains', count: errors.size, passed: errors.empty?)
      }
    end

    # Passes when at least one email uses the @example.test domain.
    def check_test_emails
      count = email_model.where("#{email_column} LIKE ?", "%@example.test").count
      Check.new(name: 'test_emails', count: count, passed: count > 0)
    end

    # Every configured token column must be NULL on all rows.
    def check_cleared_tokens
      check_columns_cleared(Pumice.config.sensitive_token_columns)
    end

    # Every configured external-ID column must be NULL on all rows.
    def check_external_ids
      check_columns_cleared(Pumice.config.sensitive_external_id_columns, note: ' (should be cleared)')
    end

    # Shared implementation for the "column must be NULL everywhere" checks.
    # Columns the model does not have are skipped without producing a Check.
    # note is appended verbatim to each error message.
    def check_columns_cleared(columns, note: '')
      errors = []
      checks = []

      columns.each do |column|
        next unless email_model.column_names.include?(column.to_s)

        count = email_model.where.not(column => nil).count
        errors << "Found #{count} users with #{column}#{note}" if count > 0
        checks << Check.new(name: column.to_s, count: count, passed: count == 0)
      end

      { errors: errors, checks: checks }
    end
  end
end
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+
3
module Pumice
  # Release number of the gem.
  VERSION = '0.7.1'
end
data/lib/pumice.rb ADDED
@@ -0,0 +1,23 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'pumice/version'
4
+
5
+ module Pumice; end
6
+
7
+ require_relative 'pumice/configuration'
8
+ require_relative 'pumice/output'
9
+ require_relative 'pumice/progress'
10
+ require_relative 'pumice/helpers'
11
+ require_relative 'pumice/logger'
12
+ require_relative 'pumice/dsl'
13
+ require_relative 'pumice/sanitizer'
14
+ require_relative 'pumice/empty_sanitizer'
15
+ require_relative 'pumice/soft_scrubbing'
16
+ require_relative 'pumice/analyzer'
17
+ require_relative 'pumice/validator'
18
+ require_relative 'pumice/runner'
19
+ require_relative 'pumice/dump_generator'
20
+ require_relative 'pumice/pruner'
21
+ require_relative 'pumice/pruning/analyzer'
22
+ require_relative 'pumice/safe_scrubber'
23
+ require_relative 'pumice/railtie' if defined?(Rails::Railtie)