pumice 0.7.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,67 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Pumice
4
# Reports the largest tables of a PostgreSQL schema and the row counts of a
# configured list of tables, using the current ActiveRecord connection.
class Analyzer
  # One entry of the size report: table name, pretty-printed size, raw bytes.
  TableSize = Struct.new(:name, :size, :bytes, keyword_init: true)
  # Row count of a single table.
  RowCount = Struct.new(:table, :count, keyword_init: true)

  # SQL expression that resolves a pg_tables row to its relation.
  # quote_ident keeps mixed-case / special-character identifiers intact;
  # a bare schemaname||'.'||tablename would be case-folded by the regclass
  # lookup and fail for such tables.
  RELATION_SQL = "(quote_ident(schemaname) || '.' || quote_ident(tablename))::regclass".freeze
  private_constant :RELATION_SQL

  # limit:  maximum number of tables returned by #table_sizes
  # schema: schema whose tables are measured
  # tables: tables to count rows for; falls back to
  #         Pumice.config.sensitive_tables when nil
  def initialize(limit: 20, schema: 'public', tables: nil)
    @limit = limit
    @schema = schema
    @tables = Array(tables || Pumice.config.sensitive_tables).map(&:to_s)
  end

  # Memoized array of TableSize, largest first.
  def table_sizes
    @sizes ||= fetch_table_sizes
  end

  # Memoized sum of the byte sizes reported by #table_sizes.
  def total_bytes
    @total ||= table_sizes.sum { |s| s.bytes.to_i }
  end

  # Memoized array of RowCount for every table that could be counted.
  def row_counts
    @row_counts ||= fetch_row_counts
  end

  private

  # Queries pg_tables for the biggest tables in @schema.
  def fetch_table_sizes
    conn = ActiveRecord::Base.connection

    sql = <<~SQL
      SELECT
        tablename,
        pg_size_pretty(pg_total_relation_size(#{RELATION_SQL})) AS size,
        pg_total_relation_size(#{RELATION_SQL}) AS bytes
      FROM pg_tables
      WHERE schemaname = #{conn.quote(@schema)}
      ORDER BY pg_total_relation_size(#{RELATION_SQL}) DESC
      LIMIT #{@limit.to_i};
    SQL

    conn.execute(sql).map do |row|
      TableSize.new(
        name: row['tablename'],
        size: row['size'],
        bytes: row['bytes'].to_i
      )
    end
  end

  # Counts each configured table, dropping those whose count failed.
  def fetch_row_counts
    @tables.filter_map do |table|
      count = fetch_row_count(table)
      RowCount.new(table: table, count: count) if count
    end
  end

  # Returns the row count of table_name, or nil when the query raises.
  # The rescue is deliberate best-effort: a failing table is skipped by
  # #fetch_row_counts instead of aborting the whole report.
  def fetch_row_count(table_name)
    conn = ActiveRecord::Base.connection
    quoted_table = conn.quote_table_name(table_name)
    conn.execute("SELECT COUNT(*) FROM #{quoted_table}").first['count'].to_i
  rescue StandardError
    nil
  end
end
67
+ end
@@ -0,0 +1,330 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'soft_scrubbing/policy'
4
+
5
+ module Pumice
6
# Holds all Pumice settings. Collection-like options are normalized when they
# are assigned, hash-style options (soft_scrubbing, pruning) when they are read.
class Configuration
  # Default verification policy for bulk operations.
  # Used when `verify_all` is called without a block.
  # Receives (model_class, bulk_operation) and returns a verification proc.
  # The bulk_operation hash contains :type (:truncate, :delete, :destroy) and :scope (optional block).
  #
  # NOTE(review): for a scoped delete/destroy the scope block itself is
  # returned, so the ".none?" check is presumably applied by whoever runs the
  # proc — confirm against the verification runner.
  DEFAULT_VERIFICATION_POLICY = lambda do |_model_class, bulk_operation|
    case bulk_operation[:type]
    when :truncate
      -> { count.zero? }
    when :delete, :destroy
      bulk_operation[:scope] || -> { count.zero? }
    end
  end

  attr_accessor :verbose, :strict, :continue_on_error,
                :allow_keep_undefined_columns,
                :sensitive_email_model, :sensitive_email_column, :default_verification

  # Validator configuration
  attr_accessor :sensitive_token_columns,       # Token columns to verify are cleared (e.g., Devise tokens)
                :sensitive_external_id_columns  # External ID columns to verify are cleared

  # Safe scrub configuration
  attr_accessor :source_database_url,     # Source database (read-only, never modified)
                :target_database_url,     # Target database (scrubbed copy)
                :export_path,             # Optional: path to export the scrubbed dump
                :export_format,           # Export format: :custom (pg_dump -Fc) or :plain (SQL)
                :require_readonly_source  # Enforce read-only source credentials (default: false, warns only)

  # Writers for these two are defined below because they normalize their input.
  attr_reader :sensitive_tables, :sensitive_email_domains

  # Readers for these two are defined below because they return normalized hashes.
  attr_writer :soft_scrubbing,
              :pruning # Delete old records before sanitization (see #pruning)

  def initialize
    @verbose = false
    @strict = true
    @continue_on_error = false
    @soft_scrubbing = false # Disabled by default; set to hash to enable
    @allow_keep_undefined_columns = true
    @sensitive_tables = []
    @sensitive_email_domains = []
    @sensitive_email_model = 'User'
    @sensitive_email_column = 'email'
    @default_verification = DEFAULT_VERIFICATION_POLICY
    # Validator defaults (Devise-compatible)
    @sensitive_token_columns = %w[reset_password_token confirmation_token]
    @sensitive_external_id_columns = []
    # Safe scrub defaults
    @source_database_url = nil
    @target_database_url = nil
    @export_path = nil
    @export_format = :custom
    @require_readonly_source = false # Warn by default, set true to enforce
    # Pruning defaults
    @pruning = false # Disabled by default; set to hash to enable
  end

  # Returns true if soft_scrubbing config is set (i.e. it is a Hash).
  def soft_scrubbing_configured?
    @soft_scrubbing.is_a?(Hash)
  end

  # Returns the normalized soft_scrubbing configuration
  # ({ context:, policy: }), or nil when soft scrubbing is not configured.
  def soft_scrubbing
    return nil unless soft_scrubbing_configured?

    {
      context: @soft_scrubbing.fetch(:context, nil),
      policy: resolve_soft_scrubbing_policy
    }
  end

  # Returns true if pruning config is set (not checking ENV)
  def pruning_configured?
    @pruning.is_a?(Hash)
  end

  # Returns the normalized pruning configuration, or nil when
  # Pumice.pruning_enabled? is false.
  # Raises ArgumentError unless exactly one of older_than:/newer_than: is given.
  # :only and :except accept a single name or a list of names.
  def pruning
    return nil unless Pumice.pruning_enabled?

    validate_pruning_config!

    {
      older_than: @pruning[:older_than],
      newer_than: @pruning[:newer_than],
      column: @pruning.fetch(:column, :created_at).to_sym,
      only: normalize_collection(@pruning.fetch(:only, [])),
      except: normalize_collection(@pruning.fetch(:except, [])),
      analyzer: normalize_analyzer_config(@pruning.fetch(:analyzer, {}))
    }
  end

  # Resolves source_database_url, handling the :auto sentinel.
  # Returns a concrete URL string or nil.
  def resolved_source_database_url
    case @source_database_url
    when :auto then database_url_from_rails_config
    when String then @source_database_url
    end
  end

  # Replaces the sensitive table list (stored as an array of strings).
  def sensitive_tables=(value)
    @sensitive_tables = normalize_collection(value)
  end

  # Replaces the sensitive e-mail domain list (stored as an array of strings).
  def sensitive_email_domains=(value)
    @sensitive_email_domains = normalize_collection(value)
  end

  # Adds tables to the sensitive list without creating duplicates.
  def add_sensitive_tables(value)
    @sensitive_tables |= normalize_collection(value)
  end

  # Adds domains to the sensitive list without creating duplicates.
  def add_sensitive_email_domains(value)
    @sensitive_email_domains |= normalize_collection(value)
  end

  private

  # Resolves the policy from if:/unless: options.
  # When both are given, if: wins and unless: is ignored.
  def resolve_soft_scrubbing_policy
    if_condition = @soft_scrubbing[:if]
    unless_condition = @soft_scrubbing[:unless]

    if if_condition
      if_condition
    elsif unless_condition
      # Invert the unless condition
      ->(record, viewer) { !unless_condition.call(record, viewer) }
    else
      # Default: always scrub
      ->(_record, _viewer) { true }
    end
  end

  # Raises ArgumentError unless exactly one of :older_than / :newer_than
  # is present as a key in the pruning hash.
  def validate_pruning_config!
    has_older = @pruning.key?(:older_than)
    has_newer = @pruning.key?(:newer_than)

    if has_older && has_newer
      raise ArgumentError,
            "Pruning config cannot specify both older_than and newer_than. " \
            "Use one or the other."
    end

    unless has_older || has_newer
      raise ArgumentError,
            "Pruning config requires either older_than: or newer_than: to specify " \
            "which records to prune."
    end
  end

  # Fills in defaults for the pruning analyzer sub-config.
  def normalize_analyzer_config(analyzer_config)
    {
      table_patterns: Array(analyzer_config.fetch(:table_patterns, [])).map(&:to_s),
      min_table_size: analyzer_config.fetch(:min_table_size, 10_000_000), # 10 MB
      min_row_count: analyzer_config.fetch(:min_row_count, 1000)
    }
  end

  # Derives a database URL from the current ActiveRecord connection config.
  def database_url_from_rails_config
    config_hash = ActiveRecord::Base.connection_db_config.configuration_hash

    # Staging/production: config already has a url key
    return config_hash[:url] if config_hash[:url].present?

    # Development/test: build from components
    build_database_url(config_hash)
  end

  # Builds a postgresql:// URL from host/port/database/credentials.
  # Returns nil for non-postgresql adapters or when no database is named.
  def build_database_url(config_hash)
    return nil unless config_hash[:adapter] == 'postgresql'

    database = config_hash[:database]
    return nil if database.blank?

    host = config_hash[:host] || 'localhost'
    port = config_hash[:port] || 5432
    userinfo = build_userinfo(config_hash[:username], config_hash[:password])

    "postgresql://#{userinfo}#{host}:#{port}/#{database}"
  end

  # Returns "user:password@", "user@" or "" (a password without a username
  # is ignored).
  def build_userinfo(username, password)
    return '' unless username.present?

    parts = [username, password].select(&:present?).map { |part| escape_userinfo(part) }
    "#{parts.join(':')}@"
  end

  # Percent-encodes a userinfo component. encode_www_form_component is
  # form encoding and writes spaces as "+", which a URI consumer does not
  # turn back into a space; a literal "+" is already emitted as %2B, so every
  # remaining "+" stands for a space and is rewritten to %20.
  def escape_userinfo(value)
    URI.encode_www_form_component(value).gsub('+', '%20')
  end

  # Coerces nil / scalar / nested arrays into a flat array of strings.
  def normalize_collection(value)
    Array(value).flatten.compact.map(&:to_s)
  end
end
217
+
218
# Returns the shared Configuration, creating it the first time it is needed.
def self.config
  return @config if @config

  @config = Configuration.new
end
221
+
222
# Yields the configuration to the caller's block, then initializes soft
# scrubbing when the block left it configured.
def self.configure
  yield config
  return unless config.soft_scrubbing_configured?

  Pumice::SoftScrubbing.init!
end
226
+
227
# True only when the DRY_RUN environment variable is exactly "true".
def self.dry_run?
  ENV.fetch('DRY_RUN', nil) == 'true'
end
230
+
231
# Reads the `verbose` setting from the configuration.
def self.verbose?
  settings = config
  settings.verbose
end
234
+
235
# Reads the `strict` setting from the configuration.
def self.strict?
  settings = config
  settings.strict
end
238
+
239
# True when the configuration reports soft scrubbing as configured.
def self.soft_scrubbing?
  settings = config
  settings.soft_scrubbing_configured?
end
242
+
243
# Reads the `allow_keep_undefined_columns` setting from the configuration.
def self.allow_keep_undefined_columns?
  settings = config
  settings.allow_keep_undefined_columns
end
246
+
247
# Hands the given context to SoftScrubbing::Policy.context=.
def self.soft_scrubbing_context=(context)
  policy = SoftScrubbing::Policy
  policy.context = context
end
250
+
251
# Returns whatever SoftScrubbing::Policy.current reports.
def self.soft_scrubbing_context
  policy = SoftScrubbing::Policy
  policy.current
end
254
+
255
# Forwards the context and the caller's block to
# SoftScrubbing::Policy.with_context and returns its result.
def self.with_soft_scrubbing_context(context, &block)
  policy = SoftScrubbing::Policy
  policy.with_context(context, &block)
end
258
+
259
# Asks SoftScrubbing::Policy.enabled_for? about the given record.
def self.soft_scrubbing_enabled_for?(record)
  policy = SoftScrubbing::Policy
  policy.enabled_for?(record)
end
262
+
263
# Pruning is enabled when the configuration has a pruning hash and the PRUNE
# environment variable is not the string "false".
# PRUNE=false switches pruning off without touching the configuration.
def self.pruning_enabled?
  ENV['PRUNE'] != 'false' && config.pruning_configured?
end
270
+
271
# Decides whether the given table takes part in pruning.
# Always false while pruning is disabled. Otherwise a non-empty :only list
# acts as an allow-list, else a non-empty :except list acts as a deny-list,
# and with neither every table is pruned.
def self.prune_table?(table_name)
  return false unless pruning_enabled?

  rules = config.pruning
  candidate = table_name.to_s

  return rules[:only].include?(candidate) if rules[:only].present?
  return !rules[:except].include?(candidate) if rules[:except].present?

  true # Prune all tables if no filter specified
end
286
+
287
# Looks up the registered sanitizer whose model_class equals the given class,
# caching hits per model class. A sanitizer whose model_class raises a
# StandardError is treated as a non-match. Falls back to
# Pumice::EmptySanitizer when nothing matches.
def self.sanitizer_for(model_class)
  cache = (@sanitizer_map ||= {})

  cache[model_class] ||= sanitizers.find do |candidate|
    begin
      candidate.model_class == model_class
    rescue StandardError
      nil
    end
  end

  cache[model_class] || Pumice::EmptySanitizer
end
297
+
298
# Runs lint! on every registered sanitizer, prints a report to stdout and
# returns the collected issues.
def self.lint!
  issues = sanitizers.each_with_object([]) do |sanitizer, collected|
    collected.concat(sanitizer.lint!)
  end

  if issues.none?
    puts "\n✅ Pumice: All sanitizers have complete coverage\n"
  else
    puts "\n🔍 Pumice Lint Errors:\n"
    issues.each { |issue| puts " ❌ #{issue}" }
    puts "\n"
  end

  issues
end
315
+
316
# Returns the list of registered sanitizers, starting it empty on first use.
def self.sanitizers
  return @sanitizers if @sanitizers

  @sanitizers = []
end
319
+
320
# Adds a sanitizer to the registry unless it is already registered.
def self.register(sanitizer)
  return if sanitizers.include?(sanitizer)

  sanitizers << sanitizer
end
323
+
324
# Drops the registered sanitizers, the sanitizer lookup cache and the
# configuration, then resets SoftScrubbing::Policy.
def self.reset!
  @sanitizers = []
  @sanitizer_map = {}
  @config = nil

  policy = SoftScrubbing::Policy
  policy.reset!
end
330
+ end
data/lib/pumice/dsl.rb ADDED
@@ -0,0 +1,267 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Pumice
4
# DSL methods for defining sanitizer behavior.
# Extended by Sanitizer subclasses to provide scrub/keep declarations.
# All state lives in instance variables of the extending class.
module DSL
  # Columns that never need a scrub/keep declaration.
  PROTECTED_COLUMNS = %w[id created_at updated_at].freeze

  # Explicitly declare the model this sanitizer handles.
  # Uses pluralized form (like has_many):
  #   sanitizes :users                                   # infers User
  #   sanitizes :admin_users, class_name: 'Admin::User'  # namespaced model
  #   sanitizes :users, class_name: User                 # explicit constant
  def sanitizes(model_name, class_name: model_name.to_s.classify)
    @model_class = if class_name.is_a?(String)
                     class_name.constantize
                   else
                     class_name
                   end
  end

  # The model handled by this sanitizer; inferred from the sanitizer's own
  # name when `sanitizes` was never called.
  def model_class
    @model_class ||= infer_model_class
  end

  # Override the auto-derived friendly name for rake tasks
  # Example: friendly_name 'legacy_users' in LegacyUserDataSanitizer
  # Without an argument, returns the current (explicit or inferred) name.
  def friendly_name(name = nil)
    if name
      @friendly_name = name.to_s
    else
      @friendly_name || infer_friendly_name
    end
  end

  # Define a scrubbing rule for a column
  def scrub(name, &block)
    @scrubbed ||= {}
    @scrubbed[name] = block
  end

  # Mark columns as safe to keep unchanged (not PII).
  # Accepts names individually or as arrays (`keep :a, :b` / `keep %i[a b]`);
  # a column kept more than once is recorded only once.
  def keep(*names)
    @kept = kept | names.flatten.map(&:to_sym)
  end

  # UNSAFE: Keep all columns not explicitly declared via scrub or keep.
  # Bypasses PII review - use only for development/testing.
  # Disable with: Pumice.configure { |c| c.allow_keep_undefined_columns = false }
  def keep_undefined_columns!
    unless Pumice.allow_keep_undefined_columns?
      raise "keep_undefined_columns! is disabled. " \
            "This method bypasses PII review and should not be used in production."
    end

    @kept = kept | undefined_columns.map(&:to_sym)
  end

  # Prune Operation
  # Removes matching records BEFORE record-by-record scrubbing.
  # Use when you want to delete old/irrelevant records AND scrub the survivors.
  #
  # Unlike bulk operations (truncate!, delete_all, destroy_all) which are terminal,
  # prune is a pre-step: it deletes matching records, then scrubbing continues
  # on the remaining records.
  #
  # Examples:
  #   prune { where(created_at: ..1.year.ago) }   # Delete old, scrub the rest
  #   prune { where(status: 'archived') }         # Delete archived, scrub active
  def prune(&scope)
    raise ArgumentError, 'prune requires a block' unless scope

    @prune_operation = { scope: scope }
  end

  # Convenience: prune records older than the given age.
  # Accepts a duration (1.year), DateTime, or date string ("2024-01-01").
  # The cutoff is resolved once, when this declaration is evaluated.
  #
  # Examples:
  #   prune_older_than 1.year
  #   prune_older_than 90.days
  #   prune_older_than DateTime.new(2024, 1, 1)
  #   prune_older_than "2024-01-01"
  def prune_older_than(age, column: :created_at)
    cutoff = resolve_prune_cutoff(age)
    prune { where(column => ...cutoff) }
  end

  # Convenience: prune records newer than the given age.
  # Accepts a duration (1.year), DateTime, or date string ("2024-01-01").
  # The cutoff is resolved once, when this declaration is evaluated.
  #
  # Examples:
  #   prune_newer_than 1.year
  #   prune_newer_than 30.days
  #   prune_newer_than "2025-06-01"
  def prune_newer_than(age, column: :created_at)
    cutoff = resolve_prune_cutoff(age)
    prune { where(column => cutoff..) }
  end

  # The declared prune operation ({ scope: }) or nil.
  def prune_operation
    @prune_operation
  end

  # Bulk Operations (Terminal)
  # These replace record-by-record sanitization with fast bulk SQL operations.
  # No scrubbing runs after a bulk operation.
  # Use for audit logs, sessions, and other high-volume tables.

  # TRUNCATE TABLE - fastest, resets auto-increment, no conditions
  # Examples:
  #   truncate!
  #   truncate!(verify: true)  # verifies count.zero? after truncation
  def truncate!(verify: false)
    @bulk_operation = { type: :truncate }
    verify_all if verify
  end

  # DELETE with optional scope - fast, no callbacks/associations
  # Examples:
  #   delete_all                                  # deletes all records
  #   delete_all { where(item_type: 'User') }     # deletes matching records
  #   delete_all(verify: true) { where(...) }     # verifies scope.none? after deletion
  def delete_all(verify: false, &scope)
    @bulk_operation = { type: :delete, scope: scope }
    verify_all if verify
  end

  # DESTROY with optional scope - runs callbacks, handles associations
  # Examples:
  #   destroy_all                                 # destroys all records
  #   destroy_all { where(attachable_id: nil) }   # destroys orphaned records
  #   destroy_all(verify: true) { where(...) }    # verifies scope.none? after destruction
  def destroy_all(verify: false, &scope)
    @bulk_operation = { type: :destroy, scope: scope }
    verify_all if verify
  end

  # The declared bulk operation ({ type:, scope: }) or nil.
  def bulk_operation
    @bulk_operation
  end

  # Verification
  # Define post-sanitization checks to confirm the operation succeeded.

  # Verify after all records are processed (bulk or record-by-record)
  # Block executes in model scope and should return truthy for success.
  # Without a block the declaration is flagged with use_default: true.
  # Examples:
  #   verify_all                                  # uses default for bulk ops
  #   verify_all { where(item_type: SENSITIVE_TYPES).none? }
  #   verify_all "No sensitive data should remain" do
  #     where(pii_column: true).count.zero?
  #   end
  def verify_all(message = nil, &block)
    @verification = if block
                      { message: message, block: block }
                    else
                      { message: message, use_default: true }
                    end
  end

  # Verify each record after sanitization (record-by-record only)
  # Block receives the sanitized record and should return truthy for success.
  # Examples:
  #   verify_each { |record| !record.email.include?('@gmail.com') }
  #   verify_each "Record should not contain real email" do |record|
  #     record.email.end_with?('@example.com')
  #   end
  def verify_each(message = nil, &block)
    raise ArgumentError, 'verify_each requires a block' unless block

    @record_verification = { message: message, block: block }
  end

  # The verify_all declaration or nil.
  def verification
    @verification
  end

  # The verify_each declaration or nil.
  def record_verification
    @record_verification
  end

  # Hash of column name => scrub block.
  def scrubbed
    @scrubbed ||= {}
  end

  # Array of kept column names as symbols.
  def kept
    @kept ||= []
  end

  # Scrubbed column names as strings.
  def scrubbed_columns
    scrubbed.keys.map(&:to_s)
  end

  # True when a scrub rule exists for the column (string or symbol).
  def scrubbed_column?(name)
    scrubbed_columns.include?(name.to_s)
  end

  # Kept column names as strings.
  def kept_columns
    kept.map(&:to_s)
  end

  # Every column declared through scrub or keep.
  def defined_columns
    scrubbed_columns + kept_columns
  end

  # Model columns that are neither declared nor protected.
  def undefined_columns
    model_class.column_names - defined_columns - PROTECTED_COLUMNS
  end

  # Declared columns that the model does not have.
  def stale_columns
    defined_columns - model_class.column_names
  end

  # Returns an array of human-readable problems with this sanitizer's
  # declarations. A NameError or RuntimeError raised along the way (e.g. while
  # resolving the model) is reported as a single issue instead of propagating.
  def lint!
    issues = []

    # Bulk operations are terminal — no scrubbing happens, so column coverage is irrelevant
    if undefined_columns.any? && !bulk_operation
      issues << "#{name} (#{model_class.name}) has undefined columns: #{undefined_columns.join(', ')}"
    end

    if stale_columns.any?
      issues << "#{name} (#{model_class.name}) has stale columns (removed from model): #{stale_columns.join(', ')}"
    end

    if bulk_operation && (scrubbed.any? || kept.any?)
      ignored = (scrubbed_columns + kept_columns).join(', ')
      issues << "#{name} uses a terminal bulk operation (#{bulk_operation[:type]}) but also declares scrub/keep columns (#{ignored}). These will be ignored."
    end

    issues
  rescue NameError, RuntimeError => e
    ["#{name} references a model that doesn't exist: #{e.message}"]
  end

  private

  # Derives the model from the sanitizer name.
  def infer_model_class
    # e.g. UserSanitizer -> User, StudentSanitizer -> Student
    model_name = name.delete_suffix('Sanitizer')
    model_name.constantize
  rescue NameError
    raise "Could not infer model for #{name}. Use `sanitizes :model_names` to specify explicitly."
  end

  # Sanitizer name without the "Sanitizer" suffix, underscored and pluralized.
  def infer_friendly_name
    name.delete_suffix('Sanitizer').underscore.pluralize
  end

  # Turns a duration, time-like object or date string into a cutoff value.
  # Raises ArgumentError for any other type.
  def resolve_prune_cutoff(age)
    case age
    when ActiveSupport::Duration
      age.ago
    when DateTime, Time, Date
      age
    when String
      DateTime.parse(age)
    else
      raise ArgumentError,
            "prune cutoff must be a Duration (1.year), DateTime, or date string, got #{age.class}"
    end
  end
end
267
+ end