data_redactor 0.5.0 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +20 -1
- data/ext/data_redactor/custom_patterns.c +123 -0
- data/ext/data_redactor/custom_patterns.h +25 -0
- data/ext/data_redactor/data_redactor.c +26 -1001
- data/ext/data_redactor/extconf.rb +4 -0
- data/ext/data_redactor/patterns.c +430 -0
- data/ext/data_redactor/patterns.h +16 -0
- data/ext/data_redactor/placeholder.c +54 -0
- data/ext/data_redactor/placeholder.h +30 -0
- data/ext/data_redactor/redact.c +160 -0
- data/ext/data_redactor/redact.h +35 -0
- data/ext/data_redactor/scan.c +131 -0
- data/ext/data_redactor/scan.h +12 -0
- data/ext/data_redactor/tags.h +24 -0
- data/lib/data_redactor/version.rb +2 -1
- data/lib/data_redactor.rb +247 -45
- data/readme.md +43 -15
- metadata +27 -2
data/lib/data_redactor.rb
CHANGED
|
@@ -1,7 +1,42 @@
|
|
|
1
|
+
require "set"
|
|
1
2
|
require_relative "data_redactor/version"
|
|
2
3
|
require_relative "data_redactor/data_redactor" # loads the compiled .so
|
|
3
4
|
|
|
5
|
+
# High-performance regex-based redactor for sensitive data.
|
|
6
|
+
#
|
|
7
|
+
# DataRedactor scans text for sensitive patterns (API keys, IBANs, national
|
|
8
|
+
# IDs, emails, phone numbers, etc.) and replaces matches with a configurable
|
|
9
|
+
# placeholder. The matching is done by a C extension backed by POSIX
|
|
10
|
+
# +regex.h+, so it is fast enough to run inline on large payloads.
|
|
11
|
+
#
|
|
12
|
+
# @example Basic redaction
|
|
13
|
+
# DataRedactor.redact("key is AKIAIOSFODNN7EXAMPLE")
|
|
14
|
+
# # => "key is [REDACTED]"
|
|
15
|
+
#
|
|
16
|
+
# @example Filter by tag or pattern name
|
|
17
|
+
# DataRedactor.redact(text, only: :credentials)
|
|
18
|
+
# DataRedactor.redact(text, except: [:contact, :network])
|
|
19
|
+
# DataRedactor.redact(text, only: :contact, except: ["email"])
|
|
20
|
+
# DataRedactor.redact(text, only: ["aws_access_key_id"])
|
|
21
|
+
#
|
|
22
|
+
# @example Custom placeholder
|
|
23
|
+
# DataRedactor.redact(text, placeholder: "***")
|
|
24
|
+
# DataRedactor.redact(text, placeholder: :tagged) # => "[REDACTED:CONTACT]"
|
|
25
|
+
# DataRedactor.redact(text, placeholder: :hash) # => "[CONTACT_a3f9]"
|
|
26
|
+
#
|
|
27
|
+
# @example Audit / dry-run
|
|
28
|
+
# DataRedactor.scan(text)
|
|
29
|
+
# # => { redacted: "...", matches: [{tag:, name:, value:, start:, length:}, ...] }
|
|
30
|
+
#
|
|
31
|
+
# @example Custom pattern
|
|
32
|
+
# DataRedactor.add_pattern(name: "employee_id", regex: "EMP-[0-9]{6}")
|
|
4
33
|
module DataRedactor
|
|
34
|
+
# Map of tag symbol to the integer bit used by the C layer.
|
|
35
|
+
#
|
|
36
|
+
# The keys of this hash are the canonical list of supported tags; pass any
|
|
37
|
+
# of them to {redact} or {scan} via +only:+ / +except:+.
|
|
38
|
+
#
|
|
39
|
+
# @return [Hash{Symbol => Integer}] frozen tag-to-bit map
|
|
5
40
|
TAGS = {
|
|
6
41
|
credentials: TAG_CREDENTIALS,
|
|
7
42
|
financial: TAG_FINANCIAL,
|
|
@@ -14,70 +49,148 @@ module DataRedactor
|
|
|
14
49
|
custom: TAG_CUSTOM
|
|
15
50
|
}.freeze
|
|
16
51
|
|
|
52
|
+
# Raised when a tag symbol passed to +only:+ / +except:+ / +tag:+ is not in {TAGS}.
|
|
17
53
|
class UnknownTagError < ArgumentError; end
|
|
54
|
+
|
|
55
|
+
# Raised when a String passed via +only:+ / +except:+ does not match any
|
|
56
|
+
# registered pattern name. See {pattern_names}.
|
|
57
|
+
class UnknownPatternError < ArgumentError; end
|
|
58
|
+
|
|
59
|
+
# Raised by {add_pattern} when the supplied regex is not valid POSIX ERE,
|
|
60
|
+
# uses Ruby-only syntax (+\d+, +\s+, lookaround, non-greedy, etc.), or
|
|
61
|
+
# contains capture groups while +boundary: true+ is requested.
|
|
18
62
|
class InvalidPatternError < ArgumentError; end
|
|
19
63
|
|
|
64
|
+
# @api private
|
|
20
65
|
# Capture groups break boundary-wrapper group index assumptions ([1],[2],[3] shift).
|
|
21
66
|
CAPTURE_GROUP_RE = /(?<!\\)\((?!\?:)/.freeze
|
|
22
67
|
|
|
68
|
+
# @api private
|
|
23
69
|
# Ruby regex syntax that has no POSIX ERE equivalent.
|
|
24
70
|
RUBY_ONLY_SYNTAX_RE = /\\[dDwWsShHbB]|\(\?[<!=]|\(\?<[a-zA-Z]|\(\?[imx]|[*+?]\?/.freeze
|
|
25
71
|
|
|
72
|
+
# Default placeholder used when +placeholder:+ is not given to {redact}.
|
|
26
73
|
# Frozen so the shared default cannot be mutated in place by a caller; this
# matches the other constants in this module, which are all frozen.
PLACEHOLDER_DEFAULT = "[REDACTED]".freeze
|
|
27
74
|
|
|
28
75
|
module_function
|
|
29
76
|
|
|
77
|
+
# List of supported tag symbols.
|
|
78
|
+
#
|
|
79
|
+
# @return [Array<Symbol>] every key from {TAGS}
|
|
30
80
|
def tags
  # Hash#keys hands back a fresh Array, so callers may mutate the result
  # without touching the frozen TAGS map.
  TAGS.keys
end
|
|
33
83
|
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
84
|
+
# List of every pattern name the redactor knows about.
|
|
85
|
+
#
|
|
86
|
+
# Includes the {BUILTIN_PATTERN_NAMES} plus any names registered via
|
|
87
|
+
# {add_pattern}. Useful for discovering what String values +only:+ /
|
|
88
|
+
# +except:+ accept, and for filtering / debugging.
|
|
89
|
+
#
|
|
90
|
+
# @return [Array<String>] built-in names first (in execution order),
|
|
91
|
+
# then custom names in registration order.
|
|
92
|
+
def pattern_names
  # Custom entries come back from the C layer as hashes; only the name is needed here.
  custom_names = _custom_patterns.map { |entry| entry[:name] }
  # Built-ins first, then customs in registration order.
  [*BUILTIN_PATTERN_NAMES, *custom_names]
end
|
|
45
95
|
|
|
96
|
+
# Redact every match of the configured patterns in +text+.
|
|
97
|
+
#
|
|
98
|
+
# +only:+ and +except:+ both accept a single value or an Array, mixing:
|
|
99
|
+
# - **Symbols** — tag names from {TAGS} (e.g. +:contact+, +:credentials+).
|
|
100
|
+
# - **Strings** — specific pattern names from {pattern_names} (e.g. +"email"+).
|
|
101
|
+
#
|
|
102
|
+
# They can be combined: +only: :contact, except: ["email"]+ means
|
|
103
|
+
# "redact every contact pattern except email." Symbols give you tag-level
|
|
104
|
+
# control; Strings give you per-pattern precision.
|
|
105
|
+
#
|
|
106
|
+
# **Precedence:** a pattern is redacted iff
|
|
107
|
+
# +(only is nil OR pattern matches only:)+ AND +(pattern does not match except:)+.
|
|
108
|
+
# +except:+ always wins over +only:+ when they overlap — e.g.
|
|
109
|
+
# +only: :contact, except: :contact+ produces an empty redaction (no-op),
|
|
110
|
+
# and +only: ["email"], except: ["email"]+ likewise skips email entirely.
|
|
111
|
+
#
|
|
112
|
+
# @param text [String] input string. Returned unchanged if no patterns match.
|
|
113
|
+
# @param only [Symbol, String, Array, nil] include only the given tag(s)
|
|
114
|
+
# and/or pattern name(s).
|
|
115
|
+
# @param except [Symbol, String, Array, nil] exclude the given tag(s)
|
|
116
|
+
# and/or pattern name(s). May be combined with +only:+.
|
|
117
|
+
# @param placeholder [String, :tagged, :hash] replacement strategy.
|
|
118
|
+
# A String is used verbatim. +:tagged+ produces +[REDACTED:TAGNAME]+.
|
|
119
|
+
# +:hash+ produces a deterministic +[TAGNAME_xxxx]+ token (4-hex djb2)
|
|
120
|
+
# so the same input value always maps to the same token.
|
|
121
|
+
# @return [String] a new string with every match replaced.
|
|
122
|
+
# @raise [ArgumentError] if +placeholder:+ is not a String/:tagged/:hash.
|
|
123
|
+
# @raise [UnknownTagError] if any Symbol in +only:+/+except:+ is not in {TAGS}.
|
|
124
|
+
# @raise [UnknownPatternError] if any String in +only:+/+except:+ is not in {pattern_names}.
|
|
125
|
+
#
|
|
126
|
+
# @example
|
|
127
|
+
# DataRedactor.redact("token sk_live_abc123", only: :credentials)
|
|
128
|
+
# DataRedactor.redact(text, only: [:contact, "aws_access_key_id"])
|
|
129
|
+
# DataRedactor.redact(text, only: :contact, except: ["email"])
|
|
130
|
+
def redact(text, only: nil, except: nil, placeholder: PLACEHOLDER_DEFAULT)
  # Filters are resolved first, so an unknown tag or pattern name raises
  # before the placeholder argument is inspected.
  pattern_flags = build_enable_bits(only, except)
  mode, literal = resolve_placeholder(placeholder)
  _redact(text, mode, literal, pattern_flags)
end
|
|
49
135
|
|
|
50
|
-
# Scan text
|
|
136
|
+
# Scan +text+ and return both the redacted string and per-match metadata.
|
|
137
|
+
#
|
|
138
|
+
# Useful for auditing, false-positive tuning, and compliance pipelines.
|
|
139
|
+
# +:start+ and +:length+ are byte offsets into the *original* string, so
|
|
140
|
+
# +text.byteslice(m[:start], m[:length]) == m[:value]+.
|
|
51
141
|
#
|
|
52
|
-
#
|
|
53
|
-
#
|
|
54
|
-
#
|
|
142
|
+
# @param text [String] input string.
|
|
143
|
+
# @param only [Symbol, String, Array, nil] same semantics as {redact}.
|
|
144
|
+
# @param except [Symbol, String, Array, nil] same semantics as {redact}.
|
|
145
|
+
# @return [Hash{Symbol => Object}] +{ redacted: String, matches:
|
|
146
|
+
# Array<Hash> }+. Each match hash has +:tag+ (Symbol), +:name+ (String),
|
|
147
|
+
# +:value+ (String), +:start+ (Integer byte offset), +:length+ (Integer).
|
|
148
|
+
# @raise [UnknownTagError] if any Symbol in +only:+/+except:+ is not in {TAGS}.
|
|
149
|
+
# @raise [UnknownPatternError] if any String in +only:+/+except:+ is not in {pattern_names}.
|
|
150
|
+
#
|
|
151
|
+
# @example
|
|
152
|
+
# DataRedactor.scan("user@example.com")
|
|
153
|
+
# # => { redacted: "[REDACTED]",
|
|
154
|
+
# # matches: [{tag: :contact, name: "email",
|
|
155
|
+
# # value: "user@example.com", start: 0, length: 16}] }
|
|
55
156
|
def scan(text, only: nil, except: nil)
  report = _scan(text, build_enable_bits(only, except))
  # The C layer reports each tag as an uppercase name; convert it back to the
  # lowercase Symbol form used as keys in TAGS.
  report[:matches].each do |match|
    match[:tag] = match[:tag].to_s.downcase.to_sym
  end
  report
end
|
|
74
163
|
|
|
75
|
-
#
|
|
164
|
+
# Register a custom redaction pattern.
|
|
76
165
|
#
|
|
77
|
-
#
|
|
78
|
-
#
|
|
79
|
-
#
|
|
80
|
-
#
|
|
166
|
+
# Patterns must be valid POSIX ERE. Ruby-only syntax (+\d+, +\s+, +\w+,
|
|
167
|
+
# +\b+, lookaround, non-greedy quantifiers, named groups) is rejected
|
|
168
|
+
# at registration time, never at redaction time.
|
|
169
|
+
#
|
|
170
|
+
# If a pattern with the same +name+ is already registered, it is replaced
|
|
171
|
+
# (the old compiled +regex_t+ is freed).
|
|
172
|
+
#
|
|
173
|
+
# @param name [String] unique identifier for this pattern. Used by {remove_pattern}.
|
|
174
|
+
# @param regex [String, Regexp] POSIX ERE source. A Regexp is accepted
|
|
175
|
+
# for convenience but only its +.source+ is used; flags are ignored.
|
|
176
|
+
# @param tag [Symbol] one of {TAGS} keys. Defaults to +:custom+.
|
|
177
|
+
# @param boundary [Boolean] when true, the pattern is wrapped with
|
|
178
|
+
# +(^|[^0-9A-Za-z])(...)([^0-9A-Za-z]|$)+ so it only matches when not
|
|
179
|
+
# embedded in a longer alphanumeric token. Incompatible with patterns
|
|
180
|
+
# that contain capture groups.
|
|
181
|
+
# @return [Boolean] +true+ on success.
|
|
182
|
+
# @raise [ArgumentError] if +name+ is not a non-empty String, or +regex+
|
|
183
|
+
# is neither a String nor a Regexp.
|
|
184
|
+
# @raise [InvalidPatternError] if the pattern uses Ruby-only syntax,
|
|
185
|
+
# contains capture groups while +boundary: true+, or fails +regcomp+.
|
|
186
|
+
# @raise [UnknownTagError] if +tag+ is not in {TAGS}.
|
|
187
|
+
#
|
|
188
|
+
# @example
|
|
189
|
+
# DataRedactor.add_pattern(name: "employee_id", regex: "EMP-[0-9]{6}")
|
|
190
|
+
# DataRedactor.add_pattern(name: "internal_key",
|
|
191
|
+
# regex: /INT-[A-Z]{3}/,
|
|
192
|
+
# tag: :credentials,
|
|
193
|
+
# boundary: true)
|
|
81
194
|
def add_pattern(name:, regex:, tag: :custom, boundary: false)
|
|
82
195
|
raise ArgumentError, "name must be a non-empty String" \
|
|
83
196
|
unless name.is_a?(String) && !name.empty?
|
|
@@ -105,10 +218,20 @@ module DataRedactor
|
|
|
105
218
|
_add_pattern(name, source, tag_bit, boundary ? 1 : 0)
|
|
106
219
|
end
|
|
107
220
|
|
|
221
|
+
# Remove a previously registered custom pattern.
|
|
222
|
+
#
|
|
223
|
+
# @param name [String, Symbol] the +name+ used in {add_pattern}.
|
|
224
|
+
# @return [Boolean] +true+ if a pattern was removed, +false+ if no
|
|
225
|
+
# pattern with that name was registered.
|
|
108
226
|
def remove_pattern(name)
  # add_pattern only accepts String names, so a Symbol is normalised to its
  # String form before the lookup.
  key = name.to_s
  _remove_pattern(key)
end
|
|
111
229
|
|
|
230
|
+
# List every currently registered custom pattern.
|
|
231
|
+
#
|
|
232
|
+
# @return [Array<Hash{Symbol => Object}>] one hash per pattern with keys
|
|
233
|
+
# +:name+ (String), +:source+ (String — the POSIX ERE source),
|
|
234
|
+
# +:tag+ (Symbol), +:boundary+ (Boolean).
|
|
112
235
|
def custom_patterns
|
|
113
236
|
_custom_patterns.map do |h|
|
|
114
237
|
{ name: h[:name], source: h[:source], tag: TAGS.key(h[:tag_bit]) || :custom,
|
|
@@ -116,22 +239,101 @@ module DataRedactor
|
|
|
116
239
|
end
|
|
117
240
|
end
|
|
118
241
|
|
|
242
|
+
# Remove every registered custom pattern.
|
|
243
|
+
#
|
|
244
|
+
# Mostly useful in test suites that need a clean slate between examples.
|
|
245
|
+
#
|
|
246
|
+
# @return [nil]
|
|
119
247
|
def clear_custom_patterns!
  # Pure delegation: the result is whatever the C-level clear returns.
  _clear_custom_patterns
end
|
|
122
250
|
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
251
|
+
# @api private
|
|
252
|
+
# Split a mixed Symbol/String filter list into +(tag_bitmask, name_set)+.
|
|
253
|
+
#
|
|
254
|
+
# @param entries [nil, Symbol, String, Array]
|
|
255
|
+
# @return [Array(Integer, Set<String>)] tag bits OR-ed together; set of
|
|
256
|
+
# pattern-name Strings.
|
|
257
|
+
# @raise [UnknownTagError] for unknown Symbols.
|
|
258
|
+
# @raise [UnknownPatternError] for unknown Strings.
|
|
259
|
+
def split_filter(entries)
  bits = 0
  names = Set.new
  return [bits, names] if entries.nil?

  # Looked up lazily and at most once per call: pattern_names queries the
  # custom-pattern registry and allocates a new Array on every invocation, so
  # calling it per String entry would repeat that work for each name.
  known_names = nil

  # Array() lets callers pass either a single Symbol/String or a list of them.
  Array(entries).each do |entry|
    case entry
    when Symbol
      # Explicit guard rather than `value or raise`, so the failure path does
      # not depend on the low precedence of `or`.
      unless TAGS.key?(entry)
        raise UnknownTagError,
              "unknown tag #{entry.inspect}; valid tags: #{TAGS.keys.inspect}"
      end
      bits |= TAGS[entry]
    when String
      known_names ||= pattern_names
      unless known_names.include?(entry)
        raise UnknownPatternError,
              "unknown pattern name #{entry.inspect}; see DataRedactor.pattern_names"
      end
      names << entry
    else
      raise ArgumentError,
            "only:/except: entries must be a Symbol (tag) or String (pattern name), got #{entry.inspect}"
    end
  end

  [bits, names]
end
|
|
282
|
+
|
|
283
|
+
# @api private
|
|
284
|
+
# Build the per-pattern enable bit-list passed to the C layer.
|
|
285
|
+
#
|
|
286
|
+
# The list has one Integer (0 or 1) per pattern in execution order:
|
|
287
|
+
# built-ins first (NUM_PATTERNS entries), then currently registered custom
|
|
288
|
+
# patterns in registration order. C iterates by index and skips zeros.
|
|
289
|
+
#
|
|
290
|
+
# Semantics of +only:+ / +except:+ — both accept a mix of Symbols (tags)
|
|
291
|
+
# and Strings (pattern names):
|
|
292
|
+
# enabled(p) iff
|
|
293
|
+
# (only is nil OR p.tag ∈ only_tags OR p.name ∈ only_names)
|
|
294
|
+
# AND p.tag ∉ except_tags AND p.name ∉ except_names
|
|
295
|
+
#
|
|
296
|
+
# @return [Array<Integer>] same length as built-ins + customs.
|
|
297
|
+
def build_enable_bits(only, except)
  only_bits, only_names = split_filter(only)
  except_bits, except_names = split_filter(except)
  # An explicit (even empty) only: list restricts the result; nil means "no restriction".
  only_present = !only.nil?

  # Read the custom registry exactly once. Sizing the list from one read and
  # iterating a second read could disagree if a pattern were added or removed
  # in between, leaving the list length out of step with the written indices.
  customs = _custom_patterns

  flag_for = lambda do |name, tag_bit|
    enabled = pattern_enabled?(name, tag_bit, only_present,
                               only_bits, only_names,
                               except_bits, except_names)
    enabled ? 1 : 0
  end

  # Built-ins first (names and tag bits are parallel arrays), then customs in
  # registration order.
  builtin_flags = BUILTIN_PATTERN_NAMES.each_with_index.map do |name, i|
    flag_for.call(name, BUILTIN_PATTERN_TAG_BITS[i])
  end
  custom_flags = customs.map { |h| flag_for.call(h[:name], h[:tag_bit]) }

  builtin_flags + custom_flags
end
|
|
130
319
|
|
|
131
|
-
#
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
320
|
+
# @api private
|
|
321
|
+
def pattern_enabled?(name, tag_bit, only_present, only_bits, only_names,
                     except_bits, except_names)
  # except: is evaluated first, so it wins whenever it overlaps with only:.
  excluded = (tag_bit & except_bits) != 0 || except_names.include?(name)
  return false if excluded

  # With no only: filter everything not excluded is enabled; otherwise the
  # pattern must be selected by tag bit or by name.
  !only_present || (tag_bit & only_bits) != 0 || only_names.include?(name)
end
|
|
329
|
+
|
|
330
|
+
# @api private
|
|
331
|
+
# Translate the user-facing +placeholder:+ value into the +(mode_int, str)+
|
|
332
|
+
# pair the C layer expects.
|
|
333
|
+
#
|
|
334
|
+
# @param placeholder [String, :tagged, :hash]
|
|
335
|
+
# @return [Array(Integer, String)]
|
|
336
|
+
# @raise [ArgumentError] if +placeholder+ is none of the accepted values.
|
|
135
337
|
def resolve_placeholder(placeholder)
|
|
136
338
|
case placeholder
|
|
137
339
|
when :tagged then [PH_MODE_TAGGED, ""]
|
data/readme.md
CHANGED
|
@@ -1,5 +1,9 @@
|
|
|
1
1
|
# DataRedactor
|
|
2
2
|
|
|
3
|
+
[Gem Version](https://rubygems.org/gems/data_redactor)
|
|
4
|
+
[CI](https://github.com/danielefrisanco/data_redactor/actions/workflows/ci.yml)
|
|
5
|
+
[License](LICENSE)
|
|
6
|
+
|
|
3
7
|
A Ruby gem with a C extension for high-performance regex-based redaction of sensitive data from strings.
|
|
4
8
|
|
|
5
9
|
## What it does
|
|
@@ -16,25 +20,34 @@ DataRedactor.redact(text)
|
|
|
16
20
|
# => "User CF is [REDACTED] and key is [REDACTED]"
|
|
17
21
|
```
|
|
18
22
|
|
|
19
|
-
### Filtering by tag
|
|
23
|
+
### Filtering by tag or pattern name
|
|
20
24
|
|
|
21
|
-
|
|
25
|
+
`only:` and `except:` both accept a single value or an Array, mixing **Symbols** (tag names) and **Strings** (specific pattern names).
|
|
22
26
|
|
|
23
27
|
```ruby
|
|
24
28
|
DataRedactor.tags
|
|
25
|
-
# => [:credentials, :financial, :tax_id, :national_id, :contact, :network, :travel, :other]
|
|
29
|
+
# => [:credentials, :financial, :tax_id, :national_id, :contact, :network, :travel, :other, :custom]
|
|
30
|
+
|
|
31
|
+
DataRedactor.pattern_names
|
|
32
|
+
# => ["aws_s3_presigned_url", "aws_access_key_id", "email", "phone_e164", "ipv4", ...]
|
|
26
33
|
|
|
27
|
-
#
|
|
34
|
+
# Tag-level filtering
|
|
28
35
|
DataRedactor.redact(text, only: [:credentials])
|
|
36
|
+
DataRedactor.redact(text, except: :contact)
|
|
29
37
|
|
|
30
|
-
#
|
|
31
|
-
DataRedactor.redact(text,
|
|
38
|
+
# Single specific pattern
|
|
39
|
+
DataRedactor.redact(text, only: ["aws_access_key_id"])
|
|
32
40
|
|
|
33
|
-
#
|
|
34
|
-
DataRedactor.redact(text, only: :
|
|
41
|
+
# Mix — every credentials pattern PLUS aws_access_key_id (even if it lived in another tag)
|
|
42
|
+
DataRedactor.redact(text, only: [:credentials, "aws_access_key_id"])
|
|
43
|
+
|
|
44
|
+
# Combine — every contact pattern EXCEPT email
|
|
45
|
+
DataRedactor.redact(text, only: :contact, except: ["email"])
|
|
35
46
|
```
|
|
36
47
|
|
|
37
|
-
|
|
48
|
+
**Precedence:** a pattern is redacted iff `(only is nil OR matches only:)` AND `(does not match except:)`. `except:` always wins when the two overlap, so `only: :contact, except: :contact` produces a no-op (everything is excluded).
|
|
49
|
+
|
|
50
|
+
**Errors:** an unknown tag Symbol raises `DataRedactor::UnknownTagError`; an unknown pattern name String raises `DataRedactor::UnknownPatternError`.
|
|
38
51
|
|
|
39
52
|
### Configurable placeholder
|
|
40
53
|
|
|
@@ -84,9 +97,10 @@ result = DataRedactor.scan("User AKIAIOSFODNN7EXAMPLE logged in from 192.168.1.1
|
|
|
84
97
|
m = result[:matches].first
|
|
85
98
|
original_text.byteslice(m[:start], m[:length]) # => "AKIAIOSFODNN7EXAMPLE"
|
|
86
99
|
|
|
87
|
-
# Accepts the same
|
|
100
|
+
# Accepts the same filters as redact (tags + specific pattern names)
|
|
88
101
|
DataRedactor.scan(text, only: :credentials)
|
|
89
102
|
DataRedactor.scan(text, except: :network)
|
|
103
|
+
DataRedactor.scan(text, only: :contact, except: ["email"])
|
|
90
104
|
```
|
|
91
105
|
|
|
92
106
|
### Custom patterns
|
|
@@ -114,7 +128,9 @@ DataRedactor.clear_custom_patterns! # mostly for test suites
|
|
|
114
128
|
|
|
115
129
|
**`boundary: true`** — wraps the pattern with `(^|[^0-9A-Za-z])(PATTERN)([^0-9A-Za-z]|$)` so it only fires when the token is not embedded in a longer alphanumeric string. Incompatible with patterns that contain capture groups.
|
|
116
130
|
|
|
117
|
-
## Detected patterns (
|
|
131
|
+
## Detected patterns (79 total)
|
|
132
|
+
|
|
133
|
+
The table below is a representative sample. Use `DataRedactor.pattern_names` for the canonical, machine-readable list — it stays in sync with the C extension automatically.
|
|
118
134
|
|
|
119
135
|
### Cloud & API secrets
|
|
120
136
|
|
|
@@ -205,10 +221,16 @@ redactor/
|
|
|
205
221
|
│ └── version.rb
|
|
206
222
|
├── ext/
|
|
207
223
|
│ └── data_redactor/
|
|
208
|
-
│ ├── extconf.rb
|
|
209
|
-
│
|
|
224
|
+
│ ├── extconf.rb # Checks for C headers, generates Makefile (globs *.c)
|
|
225
|
+
│ ├── data_redactor.c # Entry point: Init_data_redactor only
|
|
226
|
+
│ ├── patterns.{c,h} # Built-in pattern table + compiled regex_t array
|
|
227
|
+
│ ├── placeholder.{c,h} # write_placeholder, djb2 hash, tag_name_for_bit
|
|
228
|
+
│ ├── redact.{c,h} # _redact + replace_all_matches + wrap_boundary
|
|
229
|
+
│ ├── scan.{c,h} # _scan + byte-offset replacement-log macros
|
|
230
|
+
│ ├── custom_patterns.{c,h} # Dynamic registry: add/remove/clear/list
|
|
231
|
+
│ └── tags.h # TAG_* bit constants
|
|
210
232
|
└── spec/
|
|
211
|
-
└── data_redactor_spec.rb # RSpec tests
|
|
233
|
+
└── data_redactor_spec.rb # RSpec tests — at least one example per pattern, plus filter / placeholder / custom-pattern coverage
|
|
212
234
|
```
|
|
213
235
|
|
|
214
236
|
## Requirements
|
|
@@ -245,7 +267,7 @@ bundle exec rake
|
|
|
245
267
|
|
|
246
268
|
## How it works
|
|
247
269
|
|
|
248
|
-
1. At load time, `Init_data_redactor` compiles all
|
|
270
|
+
1. At load time, `Init_data_redactor` compiles all 79 regex patterns once using `regcomp` (POSIX ERE) and stores them as static `regex_t` structs. Patterns marked as boundary-wrapped are expanded with `wrap_boundary()` before compilation.
|
|
249
271
|
2. `DataRedactor.redact(text)` receives a Ruby `String`, converts it to a C `char*` via `StringValueCStr`, and runs each compiled pattern in sequence on a working buffer.
|
|
250
272
|
3. For each pattern, `replace_all_matches` iterates using `regexec`, copies non-matching segments to a fresh output buffer, and inserts the placeholder (`[REDACTED]` by default) in place of each match. For boundary-wrapped patterns, `regexec` is called with `nmatch=4` and sub-match groups `[1]`/`[3]` identify the boundary characters so they are preserved verbatim.
|
|
251
273
|
4. The output buffer is grown with `realloc` as needed. After all patterns are applied the result is returned as a Ruby `String` via `rb_str_new_cstr`. All intermediate `malloc`/`strdup` allocations are explicitly `free`d.
|
|
@@ -254,6 +276,12 @@ bundle exec rake
|
|
|
254
276
|
|
|
255
277
|
All C-side buffers are heap-allocated with `malloc`/`strdup` and freed before the function returns. The only Ruby-managed allocation is the final return value from `rb_str_new_cstr`. No Ruby objects are created mid-processing, so GC cannot collect anything out from under the C code.
|
|
256
278
|
|
|
279
|
+
## Thread safety
|
|
280
|
+
|
|
281
|
+
`DataRedactor.redact` and `DataRedactor.scan` are safe to call concurrently from multiple threads. Built-in patterns are compiled into a static `regex_t` array at load time and never mutated afterward, and each call allocates its own working buffers. POSIX `regexec` is documented as thread-safe.
|
|
282
|
+
|
|
283
|
+
`DataRedactor.add_pattern`, `remove_pattern`, and `clear_custom_patterns!` mutate a shared dynamic array and are **not** thread-safe. Register custom patterns once at boot — before spawning worker threads or forking — and they will be visible (read-only) to every subsequent `redact`/`scan` call.
|
|
284
|
+
|
|
257
285
|
## Versioning
|
|
258
286
|
|
|
259
287
|
This project follows [Semantic Versioning 2.0.0](https://semver.org/spec/v2.0.0.html). Until `1.0.0`, minor versions may introduce breaking changes; from `1.0.0` onward, breaking changes will only land in major versions. See [CHANGELOG.md](CHANGELOG.md) for the release history.
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: data_redactor
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.
|
|
4
|
+
version: 0.6.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Daniele Frisanco
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: bin
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2026-05-
|
|
11
|
+
date: 2026-05-08 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: rake-compiler
|
|
@@ -38,6 +38,20 @@ dependencies:
|
|
|
38
38
|
- - "~>"
|
|
39
39
|
- !ruby/object:Gem::Version
|
|
40
40
|
version: '3.12'
|
|
41
|
+
- !ruby/object:Gem::Dependency
|
|
42
|
+
name: yard
|
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
|
44
|
+
requirements:
|
|
45
|
+
- - "~>"
|
|
46
|
+
- !ruby/object:Gem::Version
|
|
47
|
+
version: '0.9'
|
|
48
|
+
type: :development
|
|
49
|
+
prerelease: false
|
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
51
|
+
requirements:
|
|
52
|
+
- - "~>"
|
|
53
|
+
- !ruby/object:Gem::Version
|
|
54
|
+
version: '0.9'
|
|
41
55
|
description: A Ruby gem with a C extension for high-performance scanning and redaction
|
|
42
56
|
of 79 sensitive patterns — API keys, tokens, credentials, IBANs, national IDs, emails,
|
|
43
57
|
phone numbers, and PII from 15+ countries. Designed to sanitize text before sending
|
|
@@ -51,8 +65,19 @@ extra_rdoc_files: []
|
|
|
51
65
|
files:
|
|
52
66
|
- CHANGELOG.md
|
|
53
67
|
- LICENSE
|
|
68
|
+
- ext/data_redactor/custom_patterns.c
|
|
69
|
+
- ext/data_redactor/custom_patterns.h
|
|
54
70
|
- ext/data_redactor/data_redactor.c
|
|
55
71
|
- ext/data_redactor/extconf.rb
|
|
72
|
+
- ext/data_redactor/patterns.c
|
|
73
|
+
- ext/data_redactor/patterns.h
|
|
74
|
+
- ext/data_redactor/placeholder.c
|
|
75
|
+
- ext/data_redactor/placeholder.h
|
|
76
|
+
- ext/data_redactor/redact.c
|
|
77
|
+
- ext/data_redactor/redact.h
|
|
78
|
+
- ext/data_redactor/scan.c
|
|
79
|
+
- ext/data_redactor/scan.h
|
|
80
|
+
- ext/data_redactor/tags.h
|
|
56
81
|
- lib/data_redactor.rb
|
|
57
82
|
- lib/data_redactor/version.rb
|
|
58
83
|
- readme.md
|