data_redactor 0.5.0 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +48 -1
- data/ext/data_redactor/custom_patterns.c +123 -0
- data/ext/data_redactor/custom_patterns.h +25 -0
- data/ext/data_redactor/data_redactor.c +26 -1001
- data/ext/data_redactor/extconf.rb +4 -0
- data/ext/data_redactor/patterns.c +455 -0
- data/ext/data_redactor/patterns.h +16 -0
- data/ext/data_redactor/placeholder.c +54 -0
- data/ext/data_redactor/placeholder.h +30 -0
- data/ext/data_redactor/redact.c +160 -0
- data/ext/data_redactor/redact.h +35 -0
- data/ext/data_redactor/scan.c +131 -0
- data/ext/data_redactor/scan.h +12 -0
- data/ext/data_redactor/tags.h +24 -0
- data/lib/data_redactor/integrations/logger.rb +42 -0
- data/lib/data_redactor/integrations/rack.rb +121 -0
- data/lib/data_redactor/integrations/rails.rb +38 -0
- data/lib/data_redactor/version.rb +2 -1
- data/lib/data_redactor.rb +247 -45
- data/readme.md +110 -24
- metadata +48 -5
data/lib/data_redactor.rb
CHANGED
|
@@ -1,7 +1,42 @@
|
|
|
1
|
+
require "set"
|
|
1
2
|
require_relative "data_redactor/version"
|
|
2
3
|
require_relative "data_redactor/data_redactor" # loads the compiled .so
|
|
3
4
|
|
|
5
|
+
# High-performance regex-based redactor for sensitive data.
|
|
6
|
+
#
|
|
7
|
+
# DataRedactor scans text for sensitive patterns (API keys, IBANs, national
|
|
8
|
+
# IDs, emails, phone numbers, etc.) and replaces matches with a configurable
|
|
9
|
+
# placeholder. The matching is done by a C extension backed by POSIX
|
|
10
|
+
# +regex.h+, so it is fast enough to run inline on large payloads.
|
|
11
|
+
#
|
|
12
|
+
# @example Basic redaction
|
|
13
|
+
# DataRedactor.redact("key is AKIAIOSFODNN7EXAMPLE")
|
|
14
|
+
# # => "key is [REDACTED]"
|
|
15
|
+
#
|
|
16
|
+
# @example Filter by tag or pattern name
|
|
17
|
+
# DataRedactor.redact(text, only: :credentials)
|
|
18
|
+
# DataRedactor.redact(text, except: [:contact, :network])
|
|
19
|
+
# DataRedactor.redact(text, only: :contact, except: ["email"])
|
|
20
|
+
# DataRedactor.redact(text, only: ["aws_access_key_id"])
|
|
21
|
+
#
|
|
22
|
+
# @example Custom placeholder
|
|
23
|
+
# DataRedactor.redact(text, placeholder: "***")
|
|
24
|
+
# DataRedactor.redact(text, placeholder: :tagged) # => "[REDACTED:CONTACT]"
|
|
25
|
+
# DataRedactor.redact(text, placeholder: :hash) # => "[CONTACT_a3f9]"
|
|
26
|
+
#
|
|
27
|
+
# @example Audit / dry-run
|
|
28
|
+
# DataRedactor.scan(text)
|
|
29
|
+
# # => { redacted: "...", matches: [{tag:, name:, value:, start:, length:}, ...] }
|
|
30
|
+
#
|
|
31
|
+
# @example Custom pattern
|
|
32
|
+
# DataRedactor.add_pattern(name: "employee_id", regex: "EMP-[0-9]{6}")
|
|
4
33
|
module DataRedactor
|
|
34
|
+
# Map of tag symbol to the integer bit used by the C layer.
|
|
35
|
+
#
|
|
36
|
+
# The keys of this hash are the canonical list of supported tags; pass any
|
|
37
|
+
# of them to {redact} or {scan} via +only:+ / +except:+.
|
|
38
|
+
#
|
|
39
|
+
# @return [Hash{Symbol => Integer}] frozen tag-to-bit map
|
|
5
40
|
TAGS = {
|
|
6
41
|
credentials: TAG_CREDENTIALS,
|
|
7
42
|
financial: TAG_FINANCIAL,
|
|
@@ -14,70 +49,148 @@ module DataRedactor
|
|
|
14
49
|
custom: TAG_CUSTOM
|
|
15
50
|
}.freeze
|
|
16
51
|
|
|
52
|
+
# Raised when a tag symbol passed to +only:+ / +except:+ / +tag:+ is not in {TAGS}.
|
|
17
53
|
class UnknownTagError < ArgumentError; end
|
|
54
|
+
|
|
55
|
+
# Raised when a String passed via +only:+ / +except:+ does not match any
|
|
56
|
+
# registered pattern name. See {pattern_names}.
|
|
57
|
+
class UnknownPatternError < ArgumentError; end
|
|
58
|
+
|
|
59
|
+
# Raised by {add_pattern} when the supplied regex is not valid POSIX ERE,
|
|
60
|
+
# uses Ruby-only syntax (+\d+, +\s+, lookaround, non-greedy, etc.), or
|
|
61
|
+
# contains capture groups while +boundary: true+ is requested.
|
|
18
62
|
class InvalidPatternError < ArgumentError; end
|
|
19
63
|
|
|
64
|
+
# @api private
|
|
20
65
|
# Capture groups break boundary-wrapper group index assumptions ([1],[2],[3] shift).
|
|
21
66
|
CAPTURE_GROUP_RE = /(?<!\\)\((?!\?:)/.freeze
|
|
22
67
|
|
|
68
|
+
# @api private
|
|
23
69
|
# Ruby regex syntax that has no POSIX ERE equivalent.
|
|
24
70
|
RUBY_ONLY_SYNTAX_RE = /\\[dDwWsShHbB]|\(\?[<!=]|\(\?<[a-zA-Z]|\(\?[imx]|[*+?]\?/.freeze
|
|
25
71
|
|
|
72
|
+
# Default placeholder used when +placeholder:+ is not given to {redact}.
|
|
26
73
|
PLACEHOLDER_DEFAULT = "[REDACTED]"
|
|
27
74
|
|
|
28
75
|
module_function
|
|
29
76
|
|
|
77
|
+
# List of supported tag symbols.
|
|
78
|
+
#
|
|
79
|
+
# @return [Array<Symbol>] every key from {TAGS}
|
|
30
80
|
def tags
|
|
31
81
|
TAGS.keys
|
|
32
82
|
end
|
|
33
83
|
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
84
|
+
# List of every pattern name the redactor knows about.
|
|
85
|
+
#
|
|
86
|
+
# Includes the {BUILTIN_PATTERN_NAMES} plus any names registered via
|
|
87
|
+
# {add_pattern}. Useful for discovering what String values +only:+ /
|
|
88
|
+
# +except:+ accept, and for filtering / debugging.
|
|
89
|
+
#
|
|
90
|
+
# @return [Array<String>] built-in names first (in execution order),
|
|
91
|
+
# then custom names in registration order.
|
|
92
|
+
def pattern_names
|
|
93
|
+
BUILTIN_PATTERN_NAMES + _custom_patterns.map { |h| h[:name] }
|
|
94
|
+
end
|
|
45
95
|
|
|
96
|
+
# Redact every match of the configured patterns in +text+.
|
|
97
|
+
#
|
|
98
|
+
# +only:+ and +except:+ both accept a single value or an Array, mixing:
|
|
99
|
+
# - **Symbols** — tag names from {TAGS} (e.g. +:contact+, +:credentials+).
|
|
100
|
+
# - **Strings** — specific pattern names from {pattern_names} (e.g. +"email"+).
|
|
101
|
+
#
|
|
102
|
+
# They can be combined: +only: :contact, except: ["email"]+ means
|
|
103
|
+
# "redact every contact pattern except email." Symbols give you tag-level
|
|
104
|
+
# control; Strings give you per-pattern precision.
|
|
105
|
+
#
|
|
106
|
+
# **Precedence:** a pattern is redacted iff
|
|
107
|
+
# +(only is nil OR pattern matches only:)+ AND +(pattern does not match except:)+.
|
|
108
|
+
# +except:+ always wins over +only:+ when they overlap — e.g.
|
|
109
|
+
# +only: :contact, except: :contact+ redacts nothing (a no-op),
|
|
110
|
+
# and +only: ["email"], except: ["email"]+ likewise skips email entirely.
|
|
111
|
+
#
|
|
112
|
+
# @param text [String] input string. Returned unchanged if no patterns match.
|
|
113
|
+
# @param only [Symbol, String, Array, nil] include only the given tag(s)
|
|
114
|
+
# and/or pattern name(s).
|
|
115
|
+
# @param except [Symbol, String, Array, nil] exclude the given tag(s)
|
|
116
|
+
# and/or pattern name(s). May be combined with +only:+.
|
|
117
|
+
# @param placeholder [String, :tagged, :hash] replacement strategy.
|
|
118
|
+
# A String is used verbatim. +:tagged+ produces +[REDACTED:TAGNAME]+.
|
|
119
|
+
# +:hash+ produces a deterministic +[TAGNAME_xxxx]+ token (4-hex djb2)
|
|
120
|
+
# so the same input value always maps to the same token.
|
|
121
|
+
# @return [String] a new string with every match replaced.
|
|
122
|
+
# @raise [ArgumentError] if +placeholder:+ is not a String/:tagged/:hash.
|
|
123
|
+
# @raise [UnknownTagError] if any Symbol in +only:+/+except:+ is not in {TAGS}.
|
|
124
|
+
# @raise [UnknownPatternError] if any String in +only:+/+except:+ is not in {pattern_names}.
|
|
125
|
+
#
|
|
126
|
+
# @example
|
|
127
|
+
# DataRedactor.redact("token sk_live_abc123", only: :credentials)
|
|
128
|
+
# DataRedactor.redact(text, only: [:contact, "aws_access_key_id"])
|
|
129
|
+
# DataRedactor.redact(text, only: :contact, except: ["email"])
|
|
130
|
+
def redact(text, only: nil, except: nil, placeholder: PLACEHOLDER_DEFAULT)
|
|
131
|
+
enable_bits = build_enable_bits(only, except)
|
|
46
132
|
ph_mode, ph_str = resolve_placeholder(placeholder)
|
|
47
|
-
_redact(text,
|
|
133
|
+
_redact(text, ph_mode, ph_str, enable_bits)
|
|
48
134
|
end
|
|
49
135
|
|
|
50
|
-
# Scan text
|
|
136
|
+
# Scan +text+ and return both the redacted string and per-match metadata.
|
|
137
|
+
#
|
|
138
|
+
# Useful for auditing, false-positive tuning, and compliance pipelines.
|
|
139
|
+
# +:start+ and +:length+ are byte offsets into the *original* string, so
|
|
140
|
+
# +text.byteslice(m[:start], m[:length]) == m[:value]+.
|
|
51
141
|
#
|
|
52
|
-
#
|
|
53
|
-
#
|
|
54
|
-
#
|
|
142
|
+
# @param text [String] input string.
|
|
143
|
+
# @param only [Symbol, String, Array, nil] same semantics as {redact}.
|
|
144
|
+
# @param except [Symbol, String, Array, nil] same semantics as {redact}.
|
|
145
|
+
# @return [Hash{Symbol => Object}] +{ redacted: String, matches:
|
|
146
|
+
# Array<Hash> }+. Each match hash has +:tag+ (Symbol), +:name+ (String),
|
|
147
|
+
# +:value+ (String), +:start+ (Integer byte offset), +:length+ (Integer).
|
|
148
|
+
# @raise [UnknownTagError] if any Symbol in +only:+/+except:+ is not in {TAGS}.
|
|
149
|
+
# @raise [UnknownPatternError] if any String in +only:+/+except:+ is not in {pattern_names}.
|
|
150
|
+
#
|
|
151
|
+
# @example
|
|
152
|
+
# DataRedactor.scan("user@example.com")
|
|
153
|
+
# # => { redacted: "[REDACTED]",
|
|
154
|
+
# # matches: [{tag: :contact, name: "email",
|
|
155
|
+
# # value: "user@example.com", start: 0, length: 16}] }
|
|
55
156
|
def scan(text, only: nil, except: nil)
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
mask =
|
|
59
|
-
if only
|
|
60
|
-
bits_for(only)
|
|
61
|
-
elsif except
|
|
62
|
-
TAG_ALL & ~bits_for(except)
|
|
63
|
-
else
|
|
64
|
-
TAG_ALL
|
|
65
|
-
end
|
|
66
|
-
|
|
67
|
-
result = _scan(text, mask)
|
|
157
|
+
enable_bits = build_enable_bits(only, except)
|
|
158
|
+
result = _scan(text, enable_bits)
|
|
68
159
|
# Normalise: convert tag string from C (uppercase) back to the Symbol used in TAGS
|
|
69
|
-
result[:matches].each
|
|
70
|
-
m[:tag] = m[:tag].to_s.downcase.to_sym
|
|
71
|
-
end
|
|
160
|
+
result[:matches].each { |m| m[:tag] = m[:tag].to_s.downcase.to_sym }
|
|
72
161
|
result
|
|
73
162
|
end
|
|
74
163
|
|
|
75
|
-
#
|
|
164
|
+
# Register a custom redaction pattern.
|
|
76
165
|
#
|
|
77
|
-
#
|
|
78
|
-
#
|
|
79
|
-
#
|
|
80
|
-
#
|
|
166
|
+
# Patterns must be valid POSIX ERE. Ruby-only syntax (+\d+, +\s+, +\w+,
|
|
167
|
+
# +\b+, lookaround, non-greedy quantifiers, named groups) is rejected
|
|
168
|
+
# at registration time, never at redaction time.
|
|
169
|
+
#
|
|
170
|
+
# If a pattern with the same +name+ is already registered, it is replaced
|
|
171
|
+
# (the old compiled +regex_t+ is freed).
|
|
172
|
+
#
|
|
173
|
+
# @param name [String] unique identifier for this pattern. Used by {remove_pattern}.
|
|
174
|
+
# @param regex [String, Regexp] POSIX ERE source. A Regexp is accepted
|
|
175
|
+
# for convenience but only its +.source+ is used; flags are ignored.
|
|
176
|
+
# @param tag [Symbol] one of {TAGS} keys. Defaults to +:custom+.
|
|
177
|
+
# @param boundary [Boolean] when true, the pattern is wrapped with
|
|
178
|
+
# +(^|[^0-9A-Za-z])(...)([^0-9A-Za-z]|$)+ so it only matches when not
|
|
179
|
+
# embedded in a longer alphanumeric token. Incompatible with patterns
|
|
180
|
+
# that contain capture groups.
|
|
181
|
+
# @return [Boolean] +true+ on success.
|
|
182
|
+
# @raise [ArgumentError] if +name+ is not a non-empty String, or +regex+
|
|
183
|
+
# is neither a String nor a Regexp.
|
|
184
|
+
# @raise [InvalidPatternError] if the pattern uses Ruby-only syntax,
|
|
185
|
+
# contains capture groups while +boundary: true+, or fails +regcomp+.
|
|
186
|
+
# @raise [UnknownTagError] if +tag+ is not in {TAGS}.
|
|
187
|
+
#
|
|
188
|
+
# @example
|
|
189
|
+
# DataRedactor.add_pattern(name: "employee_id", regex: "EMP-[0-9]{6}")
|
|
190
|
+
# DataRedactor.add_pattern(name: "internal_key",
|
|
191
|
+
# regex: /INT-[A-Z]{3}/,
|
|
192
|
+
# tag: :credentials,
|
|
193
|
+
# boundary: true)
|
|
81
194
|
def add_pattern(name:, regex:, tag: :custom, boundary: false)
|
|
82
195
|
raise ArgumentError, "name must be a non-empty String" \
|
|
83
196
|
unless name.is_a?(String) && !name.empty?
|
|
@@ -105,10 +218,20 @@ module DataRedactor
|
|
|
105
218
|
_add_pattern(name, source, tag_bit, boundary ? 1 : 0)
|
|
106
219
|
end
|
|
107
220
|
|
|
221
|
+
# Remove a previously registered custom pattern.
|
|
222
|
+
#
|
|
223
|
+
# @param name [String, Symbol] the +name+ used in {add_pattern}.
|
|
224
|
+
# @return [Boolean] +true+ if a pattern was removed, +false+ if no
|
|
225
|
+
# pattern with that name was registered.
|
|
108
226
|
def remove_pattern(name)
|
|
109
227
|
_remove_pattern(name.to_s)
|
|
110
228
|
end
|
|
111
229
|
|
|
230
|
+
# List every currently registered custom pattern.
|
|
231
|
+
#
|
|
232
|
+
# @return [Array<Hash{Symbol => Object}>] one hash per pattern with keys
|
|
233
|
+
# +:name+ (String), +:source+ (String — the POSIX ERE source),
|
|
234
|
+
# +:tag+ (Symbol), +:boundary+ (Boolean).
|
|
112
235
|
def custom_patterns
|
|
113
236
|
_custom_patterns.map do |h|
|
|
114
237
|
{ name: h[:name], source: h[:source], tag: TAGS.key(h[:tag_bit]) || :custom,
|
|
@@ -116,22 +239,101 @@ module DataRedactor
|
|
|
116
239
|
end
|
|
117
240
|
end
|
|
118
241
|
|
|
242
|
+
# Remove every registered custom pattern.
|
|
243
|
+
#
|
|
244
|
+
# Mostly useful in test suites that need a clean slate between examples.
|
|
245
|
+
#
|
|
246
|
+
# @return [nil]
|
|
119
247
|
def clear_custom_patterns!
|
|
120
248
|
_clear_custom_patterns
|
|
121
249
|
end
|
|
122
250
|
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
251
|
+
# @api private
|
|
252
|
+
# Split a mixed Symbol/String filter list into +(tag_bitmask, name_set)+.
|
|
253
|
+
#
|
|
254
|
+
# @param entries [nil, Symbol, String, Array]
|
|
255
|
+
# @return [Array(Integer, Set<String>)] tag bits OR-ed together; set of
|
|
256
|
+
# pattern-name Strings.
|
|
257
|
+
# @raise [UnknownTagError] for unknown Symbols.
|
|
258
|
+
# @raise [UnknownPatternError] for unknown Strings.
|
|
259
|
+
def split_filter(entries)
|
|
260
|
+
bits = 0
|
|
261
|
+
names = Set.new
|
|
262
|
+
return [bits, names] if entries.nil?
|
|
263
|
+
Array(entries).each do |e|
|
|
264
|
+
case e
|
|
265
|
+
when Symbol
|
|
266
|
+
bit = TAGS[e] or raise UnknownTagError,
|
|
267
|
+
"unknown tag #{e.inspect}; valid tags: #{TAGS.keys.inspect}"
|
|
268
|
+
bits |= bit
|
|
269
|
+
when String
|
|
270
|
+
unless pattern_names.include?(e)
|
|
271
|
+
raise UnknownPatternError,
|
|
272
|
+
"unknown pattern name #{e.inspect}; see DataRedactor.pattern_names"
|
|
273
|
+
end
|
|
274
|
+
names << e
|
|
275
|
+
else
|
|
276
|
+
raise ArgumentError,
|
|
277
|
+
"only:/except: entries must be a Symbol (tag) or String (pattern name), got #{e.inspect}"
|
|
278
|
+
end
|
|
279
|
+
end
|
|
280
|
+
[bits, names]
|
|
281
|
+
end
|
|
282
|
+
|
|
283
|
+
# @api private
|
|
284
|
+
# Build the per-pattern enable bit-list passed to the C layer.
|
|
285
|
+
#
|
|
286
|
+
# The list has one Integer (0 or 1) per pattern in execution order:
|
|
287
|
+
# built-ins first (NUM_PATTERNS entries), then currently registered custom
|
|
288
|
+
# patterns in registration order. C iterates by index and skips zeros.
|
|
289
|
+
#
|
|
290
|
+
# Semantics of +only:+ / +except:+ — both accept a mix of Symbols (tags)
|
|
291
|
+
# and Strings (pattern names):
|
|
292
|
+
# enabled(p) iff
|
|
293
|
+
# (only is nil OR p.tag ∈ only_tags OR p.name ∈ only_names)
|
|
294
|
+
# AND p.tag ∉ except_tags AND p.name ∉ except_names
|
|
295
|
+
#
|
|
296
|
+
# @return [Array<Integer>] same length as built-ins + customs.
|
|
297
|
+
def build_enable_bits(only, except)
|
|
298
|
+
only_bits, only_names = split_filter(only)
|
|
299
|
+
except_bits, except_names = split_filter(except)
|
|
300
|
+
only_present = !only.nil?
|
|
301
|
+
|
|
302
|
+
bits = Array.new(BUILTIN_PATTERN_NAMES.length + _custom_patterns.length, 0)
|
|
303
|
+
|
|
304
|
+
BUILTIN_PATTERN_NAMES.each_with_index do |name, i|
|
|
305
|
+
tag_bit = BUILTIN_PATTERN_TAG_BITS[i]
|
|
306
|
+
bits[i] = 1 if pattern_enabled?(name, tag_bit, only_present,
|
|
307
|
+
only_bits, only_names,
|
|
308
|
+
except_bits, except_names)
|
|
128
309
|
end
|
|
310
|
+
|
|
311
|
+
_custom_patterns.each_with_index do |h, i|
|
|
312
|
+
bits[BUILTIN_PATTERN_NAMES.length + i] = 1 if pattern_enabled?(
|
|
313
|
+
h[:name], h[:tag_bit], only_present,
|
|
314
|
+
only_bits, only_names, except_bits, except_names)
|
|
315
|
+
end
|
|
316
|
+
|
|
317
|
+
bits
|
|
129
318
|
end
|
|
130
319
|
|
|
131
|
-
#
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
320
|
+
# @api private
|
|
321
|
+
def pattern_enabled?(name, tag_bit, only_present, only_bits, only_names,
|
|
322
|
+
except_bits, except_names)
|
|
323
|
+
return false if (tag_bit & except_bits) != 0
|
|
324
|
+
return false if except_names.include?(name)
|
|
325
|
+
return true unless only_present
|
|
326
|
+
return true if (tag_bit & only_bits) != 0
|
|
327
|
+
only_names.include?(name)
|
|
328
|
+
end
|
|
329
|
+
|
|
330
|
+
# @api private
|
|
331
|
+
# Translate the user-facing +placeholder:+ value into the +(mode_int, str)+
|
|
332
|
+
# pair the C layer expects.
|
|
333
|
+
#
|
|
334
|
+
# @param placeholder [String, :tagged, :hash]
|
|
335
|
+
# @return [Array(Integer, String)]
|
|
336
|
+
# @raise [ArgumentError] if +placeholder+ is none of the accepted values.
|
|
135
337
|
def resolve_placeholder(placeholder)
|
|
136
338
|
case placeholder
|
|
137
339
|
when :tagged then [PH_MODE_TAGGED, ""]
|
data/readme.md
CHANGED
|
@@ -1,5 +1,9 @@
|
|
|
1
1
|
# DataRedactor
|
|
2
2
|
|
|
3
|
+
[Gem Version](https://rubygems.org/gems/data_redactor)
|
|
4
|
+
[CI](https://github.com/danielefrisanco/data_redactor/actions/workflows/ci.yml)
|
|
5
|
+
[License](LICENSE)
|
|
6
|
+
|
|
3
7
|
A Ruby gem with a C extension for high-performance regex-based redaction of sensitive data from strings.
|
|
4
8
|
|
|
5
9
|
## What it does
|
|
@@ -16,25 +20,34 @@ DataRedactor.redact(text)
|
|
|
16
20
|
# => "User CF is [REDACTED] and key is [REDACTED]"
|
|
17
21
|
```
|
|
18
22
|
|
|
19
|
-
### Filtering by tag
|
|
23
|
+
### Filtering by tag or pattern name
|
|
20
24
|
|
|
21
|
-
|
|
25
|
+
`only:` and `except:` both accept a single value or an Array, mixing **Symbols** (tag names) and **Strings** (specific pattern names).
|
|
22
26
|
|
|
23
27
|
```ruby
|
|
24
28
|
DataRedactor.tags
|
|
25
|
-
# => [:credentials, :financial, :tax_id, :national_id, :contact, :network, :travel, :other]
|
|
29
|
+
# => [:credentials, :financial, :tax_id, :national_id, :contact, :network, :travel, :other, :custom]
|
|
30
|
+
|
|
31
|
+
DataRedactor.pattern_names
|
|
32
|
+
# => ["aws_s3_presigned_url", "aws_access_key_id", "email", "phone_e164", "ipv4", ...]
|
|
26
33
|
|
|
27
|
-
#
|
|
34
|
+
# Tag-level filtering
|
|
28
35
|
DataRedactor.redact(text, only: [:credentials])
|
|
36
|
+
DataRedactor.redact(text, except: :contact)
|
|
29
37
|
|
|
30
|
-
#
|
|
31
|
-
DataRedactor.redact(text,
|
|
38
|
+
# Single specific pattern
|
|
39
|
+
DataRedactor.redact(text, only: ["aws_access_key_id"])
|
|
32
40
|
|
|
33
|
-
#
|
|
34
|
-
DataRedactor.redact(text, only: :
|
|
41
|
+
# Mix — every credentials pattern PLUS aws_access_key_id (even if it lived in another tag)
|
|
42
|
+
DataRedactor.redact(text, only: [:credentials, "aws_access_key_id"])
|
|
43
|
+
|
|
44
|
+
# Combine — every contact pattern EXCEPT email
|
|
45
|
+
DataRedactor.redact(text, only: :contact, except: ["email"])
|
|
35
46
|
```
|
|
36
47
|
|
|
37
|
-
|
|
48
|
+
**Precedence:** a pattern is redacted iff `(only is nil OR matches only:)` AND `(does not match except:)`. `except:` always wins when the two overlap, so `only: :contact, except: :contact` produces a no-op (everything is excluded).
|
|
49
|
+
|
|
50
|
+
**Errors:** an unknown tag Symbol raises `DataRedactor::UnknownTagError`; an unknown pattern name String raises `DataRedactor::UnknownPatternError`.
|
|
38
51
|
|
|
39
52
|
### Configurable placeholder
|
|
40
53
|
|
|
@@ -84,9 +97,10 @@ result = DataRedactor.scan("User AKIAIOSFODNN7EXAMPLE logged in from 192.168.1.1
|
|
|
84
97
|
m = result[:matches].first
|
|
85
98
|
original_text.byteslice(m[:start], m[:length]) # => "AKIAIOSFODNN7EXAMPLE"
|
|
86
99
|
|
|
87
|
-
# Accepts the same
|
|
100
|
+
# Accepts the same filters as redact (tags + specific pattern names)
|
|
88
101
|
DataRedactor.scan(text, only: :credentials)
|
|
89
102
|
DataRedactor.scan(text, except: :network)
|
|
103
|
+
DataRedactor.scan(text, only: :contact, except: ["email"])
|
|
90
104
|
```
|
|
91
105
|
|
|
92
106
|
### Custom patterns
|
|
@@ -114,21 +128,81 @@ DataRedactor.clear_custom_patterns! # mostly for test suites
|
|
|
114
128
|
|
|
115
129
|
**`boundary: true`** — wraps the pattern with `(^|[^0-9A-Za-z])(PATTERN)([^0-9A-Za-z]|$)` so it only fires when the token is not embedded in a longer alphanumeric string. Incompatible with patterns that contain capture groups.
|
|
116
130
|
|
|
117
|
-
##
|
|
131
|
+
## Integrations
|
|
132
|
+
|
|
133
|
+
Optional adapters for Logger, Rails, and Rack. None are loaded automatically — `require` only what you use, and the gem adds zero runtime dependencies in the gemspec.
|
|
134
|
+
|
|
135
|
+
### Logger formatter
|
|
136
|
+
|
|
137
|
+
Drop-in `Logger::Formatter` replacement that scrubs every emitted line:
|
|
138
|
+
|
|
139
|
+
```ruby
|
|
140
|
+
require "data_redactor/integrations/logger"
|
|
141
|
+
|
|
142
|
+
logger = Logger.new($stdout)
|
|
143
|
+
logger.formatter = DataRedactor::Integrations::Logger.new
|
|
144
|
+
logger.info("Auth failed for alice@example.com")
|
|
145
|
+
# => I, [...] -- : Auth failed for [REDACTED]
|
|
146
|
+
```
|
|
147
|
+
|
|
148
|
+
Wraps an inner formatter (defaults to `Logger::Formatter`), so it composes with structured loggers. Forwards `only:`, `except:`, `placeholder:` to `DataRedactor.redact`. Exception messages and arbitrary objects are scrubbed too — the wrapped object is passed unchanged to the inner formatter so the exception cause chain is preserved; only the rendered string is redacted.
|
|
149
|
+
|
|
150
|
+
### Rails `filter_parameters` adapter
|
|
151
|
+
|
|
152
|
+
```ruby
|
|
153
|
+
# config/initializers/filter_parameter_logging.rb
|
|
154
|
+
require "data_redactor/integrations/rails"
|
|
155
|
+
|
|
156
|
+
Rails.application.config.filter_parameters += [
|
|
157
|
+
DataRedactor::Integrations::Rails.filter
|
|
158
|
+
]
|
|
159
|
+
```
|
|
160
|
+
|
|
161
|
+
Returns a `(key, value)` proc compatible with Rails' parameter filter. String values are mutated in place via `String#replace` so Rails sees the redacted value. Non-strings are left alone. Accepts the same `only:`/`except:`/`placeholder:` kwargs.
|
|
162
|
+
|
|
163
|
+
### Rack middleware
|
|
164
|
+
|
|
165
|
+
```ruby
|
|
166
|
+
# config.ru
|
|
167
|
+
require "data_redactor/integrations/rack"
|
|
168
|
+
|
|
169
|
+
use DataRedactor::Integrations::Rack, scrub: [:body, :headers]
|
|
170
|
+
run MyApp
|
|
171
|
+
```
|
|
172
|
+
|
|
173
|
+
`scrub:` selects which surfaces to redact (default `[:body, :headers]`):
|
|
174
|
+
|
|
175
|
+
- **`:body`** — buffers the response body, runs `DataRedactor.redact` over it, returns it as a single chunk. Drops the `Content-Length` header so the server recomputes (the redacted body may differ in byte length).
|
|
176
|
+
- **`:headers`** — scrubs sensitive **response** headers (`Set-Cookie`, `Authorization`, `X-Api-Key`, `X-Auth-Token`, `X-Access-Token`) in place, and sensitive **request** headers (`HTTP_AUTHORIZATION`, `HTTP_PROXY_AUTHORIZATION`, `HTTP_COOKIE`, `HTTP_X_API_KEY`, `HTTP_X_AUTH_TOKEN`, `HTTP_X_ACCESS_TOKEN`) in the env hash so any downstream middleware that logs them sees redacted values.
|
|
177
|
+
|
|
178
|
+
Pass a subset (e.g. `scrub: [:headers]`) to opt out of body wrapping. Forwards `only:`/`except:`/`placeholder:` to `DataRedactor.redact`. Unknown surfaces raise `ArgumentError` at boot.
|
|
179
|
+
|
|
180
|
+
> **Body wrapping is buffering.** The middleware reads the entire response body into memory before scanning. For streaming endpoints (SSE, large file downloads, Rack::Hijack) use `scrub: [:headers]` and rely on the Logger formatter for application logs instead.
|
|
181
|
+
|
|
182
|
+
## Detected patterns (85 total)
|
|
183
|
+
|
|
184
|
+
The table below is a representative sample. Use `DataRedactor.pattern_names` for the canonical, machine-readable list — it stays in sync with the C extension automatically.
|
|
118
185
|
|
|
119
186
|
### Cloud & API secrets
|
|
120
187
|
|
|
121
188
|
| # | Pattern | Example |
|
|
122
189
|
|---|---|---|
|
|
123
|
-
|
|
|
124
|
-
|
|
|
125
|
-
|
|
|
126
|
-
|
|
|
127
|
-
|
|
|
128
|
-
|
|
|
129
|
-
|
|
|
130
|
-
|
|
|
131
|
-
|
|
|
190
|
+
| — | AWS Access Key ID | `AKIAIOSFODNN7EXAMPLE` |
|
|
191
|
+
| — | AWS Secret Access Key | 40-character base64 string |
|
|
192
|
+
| — | Google API Key | `AIzaSyXXXX...` |
|
|
193
|
+
| — | GitHub Personal Access Token | `github_pat_XXXX...` |
|
|
194
|
+
| — | GitHub Classic PAT / OAuth | `ghp_XXXX...` / `gho_XXXX...` |
|
|
195
|
+
| — | Slack Webhook URL | `https://hooks.slack.com/services/T.../B.../...` |
|
|
196
|
+
| — | Stripe Secret Key | `sk_live_XXXX...` |
|
|
197
|
+
| — | Anthropic API Key | `sk-ant-api03-XXXX...` |
|
|
198
|
+
| — | OpenAI Project API Key | `sk-proj-XXXX...` |
|
|
199
|
+
| — | GitLab Personal Access Token | `glpat-XXXX...` |
|
|
200
|
+
| — | DigitalOcean PAT | `dop_v1_XXXX...` |
|
|
201
|
+
| — | Databricks API Token | `dapiXXXX...` |
|
|
202
|
+
| — | Sentry DSN | `https://KEY@oNNN.ingest.sentry.io/PID` |
|
|
203
|
+
| — | PEM Private Key header | `-----BEGIN RSA PRIVATE KEY-----` |
|
|
204
|
+
| — | Scaleway Access Key | `SCW12345ABCDE6789FGHIJ` |
|
|
205
|
+
| — | UUID v4 / Scaleway Secret Key | `550e8400-e29b-41d4-a716-446655440000` |
|
|
132
206
|
|
|
133
207
|
### Travel documents
|
|
134
208
|
|
|
@@ -205,10 +279,16 @@ redactor/
|
|
|
205
279
|
│ └── version.rb
|
|
206
280
|
├── ext/
|
|
207
281
|
│ └── data_redactor/
|
|
208
|
-
│ ├── extconf.rb
|
|
209
|
-
│
|
|
282
|
+
│ ├── extconf.rb # Checks for C headers, generates Makefile (globs *.c)
|
|
283
|
+
│ ├── data_redactor.c # Entry point: Init_data_redactor only
|
|
284
|
+
│ ├── patterns.{c,h} # Built-in pattern table + compiled regex_t array
|
|
285
|
+
│ ├── placeholder.{c,h} # write_placeholder, djb2 hash, tag_name_for_bit
|
|
286
|
+
│ ├── redact.{c,h} # _redact + replace_all_matches + wrap_boundary
|
|
287
|
+
│ ├── scan.{c,h} # _scan + byte-offset replacement-log macros
|
|
288
|
+
│ ├── custom_patterns.{c,h} # Dynamic registry: add/remove/clear/list
|
|
289
|
+
│ └── tags.h # TAG_* bit constants
|
|
210
290
|
└── spec/
|
|
211
|
-
└── data_redactor_spec.rb # RSpec tests
|
|
291
|
+
└── data_redactor_spec.rb # RSpec tests — at least one example per pattern, plus filter / placeholder / custom-pattern coverage
|
|
212
292
|
```
|
|
213
293
|
|
|
214
294
|
## Requirements
|
|
@@ -245,7 +325,7 @@ bundle exec rake
|
|
|
245
325
|
|
|
246
326
|
## How it works
|
|
247
327
|
|
|
248
|
-
1. At load time, `Init_data_redactor` compiles all
|
|
328
|
+
1. At load time, `Init_data_redactor` compiles all 85 regex patterns once using `regcomp` (POSIX ERE) and stores them as static `regex_t` structs. Patterns marked as boundary-wrapped are expanded with `wrap_boundary()` before compilation.
|
|
249
329
|
2. `DataRedactor.redact(text)` receives a Ruby `String`, converts it to a C `char*` via `StringValueCStr`, and runs each compiled pattern in sequence on a working buffer.
|
|
250
330
|
3. For each pattern, `replace_all_matches` iterates using `regexec`, copies non-matching segments to a fresh output buffer, and inserts the configured placeholder (`[REDACTED]` by default) in place of each match. For boundary-wrapped patterns, `regexec` is called with `nmatch=4` and sub-match groups `[1]`/`[3]` identify the boundary characters so they are preserved verbatim.
|
|
251
331
|
4. The output buffer is grown with `realloc` as needed. After all patterns are applied the result is returned as a Ruby `String` via `rb_str_new_cstr`. All intermediate `malloc`/`strdup` allocations are explicitly `free`d.
|
|
@@ -254,6 +334,12 @@ bundle exec rake
|
|
|
254
334
|
|
|
255
335
|
All C-side buffers are heap-allocated with `malloc`/`strdup` and freed before the function returns. The only Ruby-managed allocation is the final return value from `rb_str_new_cstr`. No Ruby objects are created mid-processing, so GC cannot collect anything out from under the C code.
|
|
256
336
|
|
|
337
|
+
## Thread safety
|
|
338
|
+
|
|
339
|
+
`DataRedactor.redact` and `DataRedactor.scan` are safe to call concurrently from multiple threads. Built-in patterns are compiled into a static `regex_t` array at load time and never mutated afterward, and each call allocates its own working buffers. POSIX `regexec` is documented as thread-safe.
|
|
340
|
+
|
|
341
|
+
`DataRedactor.add_pattern`, `remove_pattern`, and `clear_custom_patterns!` mutate a shared dynamic array and are **not** thread-safe. Register custom patterns once at boot — before spawning worker threads or forking — and they will be visible (read-only) to every subsequent `redact`/`scan` call.
|
|
342
|
+
|
|
257
343
|
## Versioning
|
|
258
344
|
|
|
259
345
|
This project follows [Semantic Versioning 2.0.0](https://semver.org/spec/v2.0.0.html). Until `1.0.0`, minor versions may introduce breaking changes; from `1.0.0` onward, breaking changes will only land in major versions. See [CHANGELOG.md](CHANGELOG.md) for the release history.
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: data_redactor
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.
|
|
4
|
+
version: 0.7.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Daniele Frisanco
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: bin
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2026-05-
|
|
11
|
+
date: 2026-05-09 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: rake-compiler
|
|
@@ -38,10 +38,39 @@ dependencies:
|
|
|
38
38
|
- - "~>"
|
|
39
39
|
- !ruby/object:Gem::Version
|
|
40
40
|
version: '3.12'
|
|
41
|
+
- !ruby/object:Gem::Dependency
|
|
42
|
+
name: yard
|
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
|
44
|
+
requirements:
|
|
45
|
+
- - "~>"
|
|
46
|
+
- !ruby/object:Gem::Version
|
|
47
|
+
version: '0.9'
|
|
48
|
+
type: :development
|
|
49
|
+
prerelease: false
|
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
51
|
+
requirements:
|
|
52
|
+
- - "~>"
|
|
53
|
+
- !ruby/object:Gem::Version
|
|
54
|
+
version: '0.9'
|
|
55
|
+
- !ruby/object:Gem::Dependency
|
|
56
|
+
name: rack
|
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
|
58
|
+
requirements:
|
|
59
|
+
- - ">="
|
|
60
|
+
- !ruby/object:Gem::Version
|
|
61
|
+
version: '2.0'
|
|
62
|
+
type: :development
|
|
63
|
+
prerelease: false
|
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
65
|
+
requirements:
|
|
66
|
+
- - ">="
|
|
67
|
+
- !ruby/object:Gem::Version
|
|
68
|
+
version: '2.0'
|
|
41
69
|
description: A Ruby gem with a C extension for high-performance scanning and redaction
|
|
42
|
-
of
|
|
43
|
-
phone numbers, and PII from 15+ countries.
|
|
44
|
-
to
|
|
70
|
+
of 85 sensitive patterns — API keys, tokens, credentials, IBANs, national IDs, emails,
|
|
71
|
+
phone numbers, and PII from 15+ countries. Optional Logger formatter, Rails filter_parameters
|
|
72
|
+
adapter, and Rack middleware. Designed to sanitize text before sending to LLMs,
|
|
73
|
+
logging systems, or any public/third-party API.
|
|
45
74
|
email:
|
|
46
75
|
- daniele.frisanco@gmail.com
|
|
47
76
|
executables: []
|
|
@@ -51,9 +80,23 @@ extra_rdoc_files: []
|
|
|
51
80
|
files:
|
|
52
81
|
- CHANGELOG.md
|
|
53
82
|
- LICENSE
|
|
83
|
+
- ext/data_redactor/custom_patterns.c
|
|
84
|
+
- ext/data_redactor/custom_patterns.h
|
|
54
85
|
- ext/data_redactor/data_redactor.c
|
|
55
86
|
- ext/data_redactor/extconf.rb
|
|
87
|
+
- ext/data_redactor/patterns.c
|
|
88
|
+
- ext/data_redactor/patterns.h
|
|
89
|
+
- ext/data_redactor/placeholder.c
|
|
90
|
+
- ext/data_redactor/placeholder.h
|
|
91
|
+
- ext/data_redactor/redact.c
|
|
92
|
+
- ext/data_redactor/redact.h
|
|
93
|
+
- ext/data_redactor/scan.c
|
|
94
|
+
- ext/data_redactor/scan.h
|
|
95
|
+
- ext/data_redactor/tags.h
|
|
56
96
|
- lib/data_redactor.rb
|
|
97
|
+
- lib/data_redactor/integrations/logger.rb
|
|
98
|
+
- lib/data_redactor/integrations/rack.rb
|
|
99
|
+
- lib/data_redactor/integrations/rails.rb
|
|
57
100
|
- lib/data_redactor/version.rb
|
|
58
101
|
- readme.md
|
|
59
102
|
homepage: https://github.com/danielefrisanco/data_redactor
|