data_redactor 0.5.0 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/data_redactor.rb CHANGED
@@ -1,7 +1,42 @@
1
+ require "set"
1
2
  require_relative "data_redactor/version"
2
3
  require_relative "data_redactor/data_redactor" # loads the compiled .so
3
4
 
5
+ # High-performance regex-based redactor for sensitive data.
6
+ #
7
+ # DataRedactor scans text for sensitive patterns (API keys, IBANs, national
8
+ # IDs, emails, phone numbers, etc.) and replaces matches with a configurable
9
+ # placeholder. The matching is done by a C extension backed by POSIX
10
+ # +regex.h+, so it is fast enough to run inline on large payloads.
11
+ #
12
+ # @example Basic redaction
13
+ # DataRedactor.redact("key is AKIAIOSFODNN7EXAMPLE")
14
+ # # => "key is [REDACTED]"
15
+ #
16
+ # @example Filter by tag or pattern name
17
+ # DataRedactor.redact(text, only: :credentials)
18
+ # DataRedactor.redact(text, except: [:contact, :network])
19
+ # DataRedactor.redact(text, only: :contact, except: ["email"])
20
+ # DataRedactor.redact(text, only: ["aws_access_key_id"])
21
+ #
22
+ # @example Custom placeholder
23
+ # DataRedactor.redact(text, placeholder: "***")
24
+ # DataRedactor.redact(text, placeholder: :tagged) # => "[REDACTED:CONTACT]"
25
+ # DataRedactor.redact(text, placeholder: :hash) # => "[CONTACT_a3f9]"
26
+ #
27
+ # @example Audit / dry-run
28
+ # DataRedactor.scan(text)
29
+ # # => { redacted: "...", matches: [{tag:, name:, value:, start:, length:}, ...] }
30
+ #
31
+ # @example Custom pattern
32
+ # DataRedactor.add_pattern(name: "employee_id", regex: "EMP-[0-9]{6}")
4
33
  module DataRedactor
34
+ # Map of tag symbol to the integer bit used by the C layer.
35
+ #
36
+ # The keys of this hash are the canonical list of supported tags; pass any
37
+ # of them to {redact} or {scan} via +only:+ / +except:+.
38
+ #
39
+ # @return [Hash{Symbol => Integer}] frozen tag-to-bit map
5
40
  TAGS = {
6
41
  credentials: TAG_CREDENTIALS,
7
42
  financial: TAG_FINANCIAL,
@@ -14,70 +49,148 @@ module DataRedactor
14
49
  custom: TAG_CUSTOM
15
50
  }.freeze
16
51
 
52
+ # Raised when a tag symbol passed to +only:+ / +except:+ / +tag:+ is not in {TAGS}.
17
53
  class UnknownTagError < ArgumentError; end
54
+
55
+ # Raised when a String passed via +only:+ / +except:+ does not match any
56
+ # registered pattern name. See {pattern_names}.
57
+ class UnknownPatternError < ArgumentError; end
58
+
59
+ # Raised by {add_pattern} when the supplied regex is not valid POSIX ERE,
60
+ # uses Ruby-only syntax (+\d+, +\s+, lookaround, non-greedy, etc.), or
61
+ # contains capture groups while +boundary: true+ is requested.
18
62
  class InvalidPatternError < ArgumentError; end
19
63
 
64
+ # @api private
20
65
  # Capture groups break boundary-wrapper group index assumptions ([1],[2],[3] shift).
21
66
  CAPTURE_GROUP_RE = /(?<!\\)\((?!\?:)/.freeze
22
67
 
68
+ # @api private
23
69
  # Ruby regex syntax that has no POSIX ERE equivalent.
24
70
  RUBY_ONLY_SYNTAX_RE = /\\[dDwWsShHbB]|\(\?[<!=]|\(\?<[a-zA-Z]|\(\?[imx]|[*+?]\?/.freeze
25
71
 
72
+ # Default placeholder used when +placeholder:+ is not given to {redact}.
26
73
  PLACEHOLDER_DEFAULT = "[REDACTED]"
27
74
 
28
75
  module_function
29
76
 
77
+ # List of supported tag symbols.
78
+ #
79
+ # @return [Array<Symbol>] every key from {TAGS}
30
80
  def tags
31
81
  TAGS.keys
32
82
  end
33
83
 
34
- def redact(text, only: nil, except: nil, placeholder: PLACEHOLDER_DEFAULT)
35
- raise ArgumentError, "pass only: or except:, not both" if only && except
36
-
37
- mask =
38
- if only
39
- bits_for(only)
40
- elsif except
41
- TAG_ALL & ~bits_for(except)
42
- else
43
- TAG_ALL
44
- end
84
+ # List of every pattern name the redactor knows about.
85
+ #
86
+ # Includes the {BUILTIN_PATTERN_NAMES} plus any names registered via
87
+ # {add_pattern}. Useful for discovering what String values +only:+ /
88
+ # +except:+ accept, and for filtering / debugging.
89
+ #
90
+ # @return [Array<String>] built-in names first (in execution order),
91
+ # then custom names in registration order.
92
+ def pattern_names
93
+ BUILTIN_PATTERN_NAMES + _custom_patterns.map { |h| h[:name] }
94
+ end
45
95
 
96
+ # Redact every match of the configured patterns in +text+.
97
+ #
98
+ # +only:+ and +except:+ both accept a single value or an Array, mixing:
99
+ # - **Symbols** — tag names from {TAGS} (e.g. +:contact+, +:credentials+).
100
+ # - **Strings** — specific pattern names from {pattern_names} (e.g. +"email"+).
101
+ #
102
+ # They can be combined: +only: :contact, except: ["email"]+ means
103
+ # "redact every contact pattern except email." Symbols give you tag-level
104
+ # control; Strings give you per-pattern precision.
105
+ #
106
+ # **Precedence:** a pattern is redacted iff
107
+ # +(only is nil OR pattern matches only:)+ AND +(pattern does not match except:)+.
108
+ # +except:+ always wins over +only:+ when they overlap — e.g.
109
+ # +only: :contact, except: :contact+ produces an empty redaction (no-op),
110
+ # and +only: ["email"], except: ["email"]+ likewise skips email entirely.
111
+ #
112
+ # @param text [String] input string. Returned unchanged if no patterns match.
113
+ # @param only [Symbol, String, Array, nil] include only the given tag(s)
114
+ # and/or pattern name(s).
115
+ # @param except [Symbol, String, Array, nil] exclude the given tag(s)
116
+ # and/or pattern name(s). May be combined with +only:+.
117
+ # @param placeholder [String, :tagged, :hash] replacement strategy.
118
+ # A String is used verbatim. +:tagged+ produces +[REDACTED:TAGNAME]+.
119
+ # +:hash+ produces a deterministic +[TAGNAME_xxxx]+ token (4-hex djb2)
120
+ # so the same input value always maps to the same token.
121
+ # @return [String] a new string with every match replaced.
122
+ # @raise [ArgumentError] if +placeholder:+ is not a String/:tagged/:hash.
123
+ # @raise [UnknownTagError] if any Symbol in +only:+/+except:+ is not in {TAGS}.
124
+ # @raise [UnknownPatternError] if any String in +only:+/+except:+ is not in {pattern_names}.
125
+ #
126
+ # @example
127
+ # DataRedactor.redact("token sk_live_abc123", only: :credentials)
128
+ # DataRedactor.redact(text, only: [:contact, "aws_access_key_id"])
129
+ # DataRedactor.redact(text, only: :contact, except: ["email"])
130
+ def redact(text, only: nil, except: nil, placeholder: PLACEHOLDER_DEFAULT)
131
+ enable_bits = build_enable_bits(only, except)
46
132
  ph_mode, ph_str = resolve_placeholder(placeholder)
47
- _redact(text, mask, ph_mode, ph_str)
133
+ _redact(text, ph_mode, ph_str, enable_bits)
48
134
  end
49
135
 
50
- # Scan text without necessarily redacting it.
136
+ # Scan +text+ and return both the redacted string and per-match metadata.
137
+ #
138
+ # Useful for auditing, false-positive tuning, and compliance pipelines.
139
+ # +:start+ and +:length+ are byte offsets into the *original* string, so
140
+ # +text.byteslice(m[:start], m[:length]) == m[:value]+.
51
141
  #
52
- # Returns { redacted: String, matches: [{tag:, name:, value:, start:, length:}, ...] }
53
- # The :tag value is a Symbol matching one of DataRedactor.tags.
54
- # :start and :length are byte offsets into the original string.
142
+ # @param text [String] input string.
143
+ # @param only [Symbol, String, Array, nil] same semantics as {redact}.
144
+ # @param except [Symbol, String, Array, nil] same semantics as {redact}.
145
+ # @return [Hash{Symbol => Object}] +{ redacted: String, matches:
146
+ # Array<Hash> }+. Each match hash has +:tag+ (Symbol), +:name+ (String),
147
+ # +:value+ (String), +:start+ (Integer byte offset), +:length+ (Integer).
148
+ # @raise [UnknownTagError] if any Symbol in +only:+/+except:+ is not in {TAGS}.
149
+ # @raise [UnknownPatternError] if any String in +only:+/+except:+ is not in {pattern_names}.
150
+ #
151
+ # @example
152
+ # DataRedactor.scan("user@example.com")
153
+ # # => { redacted: "[REDACTED]",
154
+ # # matches: [{tag: :contact, name: "email",
155
+ # # value: "user@example.com", start: 0, length: 16}] }
55
156
  def scan(text, only: nil, except: nil)
56
- raise ArgumentError, "pass only: or except:, not both" if only && except
57
-
58
- mask =
59
- if only
60
- bits_for(only)
61
- elsif except
62
- TAG_ALL & ~bits_for(except)
63
- else
64
- TAG_ALL
65
- end
66
-
67
- result = _scan(text, mask)
157
+ enable_bits = build_enable_bits(only, except)
158
+ result = _scan(text, enable_bits)
68
159
  # Normalise: convert tag string from C (uppercase) back to the Symbol used in TAGS
69
- result[:matches].each do |m|
70
- m[:tag] = m[:tag].to_s.downcase.to_sym
71
- end
160
+ result[:matches].each { |m| m[:tag] = m[:tag].to_s.downcase.to_sym }
72
161
  result
73
162
  end
74
163
 
75
- # Add (or replace) a custom redaction pattern.
164
+ # Register a custom redaction pattern.
76
165
  #
77
- # name: unique identifier string
78
- # regex: String (POSIX ERE) or Regexp; Ruby-only syntax raises InvalidPatternError
79
- # tag: one of the TAGS keys (default :custom), or any built-in tag
80
- # boundary: wrap with word-boundary guards; incompatible with capture groups
166
+ # Patterns must be valid POSIX ERE. Ruby-only syntax (+\d+, +\s+, +\w+,
167
+ # +\b+, lookaround, non-greedy quantifiers, named groups) is rejected
168
+ # at registration time, never at redaction time.
169
+ #
170
+ # If a pattern with the same +name+ is already registered, it is replaced
171
+ # (the old compiled +regex_t+ is freed).
172
+ #
173
+ # @param name [String] unique identifier for this pattern. Used by {remove_pattern}.
174
+ # @param regex [String, Regexp] POSIX ERE source. A Regexp is accepted
175
+ # for convenience but only its +.source+ is used; flags are ignored.
176
+ # @param tag [Symbol] one of {TAGS} keys. Defaults to +:custom+.
177
+ # @param boundary [Boolean] when true, the pattern is wrapped with
178
+ # +(^|[^0-9A-Za-z])(...)([^0-9A-Za-z]|$)+ so it only matches when not
179
+ # embedded in a longer alphanumeric token. Incompatible with patterns
180
+ # that contain capture groups.
181
+ # @return [Boolean] +true+ on success.
182
+ # @raise [ArgumentError] if +name+ is not a non-empty String, or +regex+
183
+ # is neither a String nor a Regexp.
184
+ # @raise [InvalidPatternError] if the pattern uses Ruby-only syntax,
185
+ # contains capture groups while +boundary: true+, or fails +regcomp+.
186
+ # @raise [UnknownTagError] if +tag+ is not in {TAGS}.
187
+ #
188
+ # @example
189
+ # DataRedactor.add_pattern(name: "employee_id", regex: "EMP-[0-9]{6}")
190
+ # DataRedactor.add_pattern(name: "internal_key",
191
+ # regex: /INT-[A-Z]{3}/,
192
+ # tag: :credentials,
193
+ # boundary: true)
81
194
  def add_pattern(name:, regex:, tag: :custom, boundary: false)
82
195
  raise ArgumentError, "name must be a non-empty String" \
83
196
  unless name.is_a?(String) && !name.empty?
@@ -105,10 +218,20 @@ module DataRedactor
105
218
  _add_pattern(name, source, tag_bit, boundary ? 1 : 0)
106
219
  end
107
220
 
221
+ # Remove a previously registered custom pattern.
222
+ #
223
+ # @param name [String, Symbol] the +name+ used in {add_pattern}.
224
+ # @return [Boolean] +true+ if a pattern was removed, +false+ if no
225
+ # pattern with that name was registered.
108
226
  def remove_pattern(name)
109
227
  _remove_pattern(name.to_s)
110
228
  end
111
229
 
230
+ # List every currently registered custom pattern.
231
+ #
232
+ # @return [Array<Hash{Symbol => Object}>] one hash per pattern with keys
233
+ # +:name+ (String), +:source+ (String — the POSIX ERE source),
234
+ # +:tag+ (Symbol), +:boundary+ (Boolean).
112
235
  def custom_patterns
113
236
  _custom_patterns.map do |h|
114
237
  { name: h[:name], source: h[:source], tag: TAGS.key(h[:tag_bit]) || :custom,
@@ -116,22 +239,101 @@ module DataRedactor
116
239
  end
117
240
  end
118
241
 
242
+ # Remove every registered custom pattern.
243
+ #
244
+ # Mostly useful in test suites that need a clean slate between examples.
245
+ #
246
+ # @return [nil]
119
247
  def clear_custom_patterns!
120
248
  _clear_custom_patterns
121
249
  end
122
250
 
123
- def bits_for(tag_list)
124
- Array(tag_list).inject(0) do |acc, tag|
125
- bit = TAGS[tag] or raise UnknownTagError,
126
- "unknown tag #{tag.inspect}; valid tags: #{TAGS.keys.inspect}"
127
- acc | bit
251
+ # @api private
252
+ # Split a mixed Symbol/String filter list into +(tag_bitmask, name_set)+.
253
+ #
254
+ # @param entries [nil, Symbol, String, Array]
255
+ # @return [Array(Integer, Set<String>)] tag bits OR-ed together; set of
256
+ # pattern-name Strings.
257
+ # @raise [UnknownTagError] for unknown Symbols.
258
+ # @raise [UnknownPatternError] for unknown Strings.
259
+ def split_filter(entries)
260
+ bits = 0
261
+ names = Set.new
262
+ return [bits, names] if entries.nil?
263
+ Array(entries).each do |e|
264
+ case e
265
+ when Symbol
266
+ bit = TAGS[e] or raise UnknownTagError,
267
+ "unknown tag #{e.inspect}; valid tags: #{TAGS.keys.inspect}"
268
+ bits |= bit
269
+ when String
270
+ unless pattern_names.include?(e)
271
+ raise UnknownPatternError,
272
+ "unknown pattern name #{e.inspect}; see DataRedactor.pattern_names"
273
+ end
274
+ names << e
275
+ else
276
+ raise ArgumentError,
277
+ "only:/except: entries must be a Symbol (tag) or String (pattern name), got #{e.inspect}"
278
+ end
279
+ end
280
+ [bits, names]
281
+ end
282
+
283
+ # @api private
284
+ # Build the per-pattern enable bit-list passed to the C layer.
285
+ #
286
+ # The list has one Integer (0 or 1) per pattern in execution order:
287
+ # built-ins first (NUM_PATTERNS entries), then currently registered custom
288
+ # patterns in registration order. C iterates by index and skips zeros.
289
+ #
290
+ # Semantics of +only:+ / +except:+ — both accept a mix of Symbols (tags)
291
+ # and Strings (pattern names):
292
+ # enabled(p) iff
293
+ # (only is nil OR p.tag ∈ only_tags OR p.name ∈ only_names)
294
+ # AND p.tag ∉ except_tags AND p.name ∉ except_names
295
+ #
296
+ # @return [Array<Integer>] same length as built-ins + customs.
297
+ def build_enable_bits(only, except)
298
+ only_bits, only_names = split_filter(only)
299
+ except_bits, except_names = split_filter(except)
300
+ only_present = !only.nil?
301
+
302
+ bits = Array.new(BUILTIN_PATTERN_NAMES.length + _custom_patterns.length, 0)
303
+
304
+ BUILTIN_PATTERN_NAMES.each_with_index do |name, i|
305
+ tag_bit = BUILTIN_PATTERN_TAG_BITS[i]
306
+ bits[i] = 1 if pattern_enabled?(name, tag_bit, only_present,
307
+ only_bits, only_names,
308
+ except_bits, except_names)
128
309
  end
310
+
311
+ _custom_patterns.each_with_index do |h, i|
312
+ bits[BUILTIN_PATTERN_NAMES.length + i] = 1 if pattern_enabled?(
313
+ h[:name], h[:tag_bit], only_present,
314
+ only_bits, only_names, except_bits, except_names)
315
+ end
316
+
317
+ bits
129
318
  end
130
319
 
131
- # Returns [ph_mode_int, ph_str] for the C layer.
132
- # placeholder: "***" -> plain string
133
- # placeholder: :tagged -> "[REDACTED:TAGNAME]"
134
- # placeholder: :hash -> "[TAGNAME_xxxx]"
320
+ # @api private
321
+ def pattern_enabled?(name, tag_bit, only_present, only_bits, only_names,
322
+ except_bits, except_names)
323
+ return false if (tag_bit & except_bits) != 0
324
+ return false if except_names.include?(name)
325
+ return true unless only_present
326
+ return true if (tag_bit & only_bits) != 0
327
+ only_names.include?(name)
328
+ end
329
+
330
+ # @api private
331
+ # Translate the user-facing +placeholder:+ value into the +(mode_int, str)+
332
+ # pair the C layer expects.
333
+ #
334
+ # @param placeholder [String, :tagged, :hash]
335
+ # @return [Array(Integer, String)]
336
+ # @raise [ArgumentError] if +placeholder+ is none of the accepted values.
135
337
  def resolve_placeholder(placeholder)
136
338
  case placeholder
137
339
  when :tagged then [PH_MODE_TAGGED, ""]
data/readme.md CHANGED
@@ -1,5 +1,9 @@
1
1
  # DataRedactor
2
2
 
3
+ [![Gem Version](https://badge.fury.io/rb/data_redactor.svg)](https://rubygems.org/gems/data_redactor)
4
+ [![CI](https://github.com/danielefrisanco/data_redactor/actions/workflows/ci.yml/badge.svg)](https://github.com/danielefrisanco/data_redactor/actions/workflows/ci.yml)
5
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](LICENSE)
6
+
3
7
  A Ruby gem with a C extension for high-performance regex-based redaction of sensitive data from strings.
4
8
 
5
9
  ## What it does
@@ -16,25 +20,34 @@ DataRedactor.redact(text)
16
20
  # => "User CF is [REDACTED] and key is [REDACTED]"
17
21
  ```
18
22
 
19
- ### Filtering by tag
23
+ ### Filtering by tag or pattern name
20
24
 
21
- Every pattern belongs to one tag. Use `only:` to redact a subset, or `except:` to skip one.
25
+ `only:` and `except:` both accept a single value or an Array, mixing **Symbols** (tag names) and **Strings** (specific pattern names).
22
26
 
23
27
  ```ruby
24
28
  DataRedactor.tags
25
- # => [:credentials, :financial, :tax_id, :national_id, :contact, :network, :travel, :other]
29
+ # => [:credentials, :financial, :tax_id, :national_id, :contact, :network, :travel, :other, :custom]
30
+
31
+ DataRedactor.pattern_names
32
+ # => ["aws_s3_presigned_url", "aws_access_key_id", "email", "phone_e164", "ipv4", ...]
26
33
 
27
- # Only redact API keys / tokens / private keys
34
+ # Tag-level filtering
28
35
  DataRedactor.redact(text, only: [:credentials])
36
+ DataRedactor.redact(text, except: :contact)
29
37
 
30
- # Redact everything except contact info (emails, phone numbers)
31
- DataRedactor.redact(text, except: [:contact])
38
+ # Single specific pattern
39
+ DataRedactor.redact(text, only: ["aws_access_key_id"])
32
40
 
33
- # Single symbol works too
34
- DataRedactor.redact(text, only: :financial)
41
+ # Mix every credentials pattern PLUS aws_access_key_id (even if it lived in another tag)
42
+ DataRedactor.redact(text, only: [:credentials, "aws_access_key_id"])
43
+
44
+ # Combine — every contact pattern EXCEPT email
45
+ DataRedactor.redact(text, only: :contact, except: ["email"])
35
46
  ```
36
47
 
37
- Passing an unknown tag raises `DataRedactor::UnknownTagError`. Passing both `only:` and `except:` raises `ArgumentError`.
48
+ **Precedence:** a pattern is redacted iff `(only is nil OR matches only:)` AND `(does not match except:)`. `except:` always wins when the two overlap, so `only: :contact, except: :contact` produces a no-op (everything is excluded).
49
+
50
+ **Errors:** an unknown tag Symbol raises `DataRedactor::UnknownTagError`; an unknown pattern name String raises `DataRedactor::UnknownPatternError`.
38
51
 
39
52
  ### Configurable placeholder
40
53
 
@@ -84,9 +97,10 @@ result = DataRedactor.scan("User AKIAIOSFODNN7EXAMPLE logged in from 192.168.1.1
84
97
  m = result[:matches].first
85
98
  original_text.byteslice(m[:start], m[:length]) # => "AKIAIOSFODNN7EXAMPLE"
86
99
 
87
- # Accepts the same tag filters as redact
100
+ # Accepts the same filters as redact (tags + specific pattern names)
88
101
  DataRedactor.scan(text, only: :credentials)
89
102
  DataRedactor.scan(text, except: :network)
103
+ DataRedactor.scan(text, only: :contact, except: ["email"])
90
104
  ```
91
105
 
92
106
  ### Custom patterns
@@ -114,7 +128,9 @@ DataRedactor.clear_custom_patterns! # mostly for test suites
114
128
 
115
129
  **`boundary: true`** — wraps the pattern with `(^|[^0-9A-Za-z])(PATTERN)([^0-9A-Za-z]|$)` so it only fires when the token is not embedded in a longer alphanumeric string. Incompatible with patterns that contain capture groups.
116
130
 
117
- ## Detected patterns (49 total)
131
+ ## Detected patterns (79 total)
132
+
133
+ The table below is a representative sample. Use `DataRedactor.pattern_names` for the canonical, machine-readable list — it stays in sync with the C extension automatically.
118
134
 
119
135
  ### Cloud & API secrets
120
136
 
@@ -205,10 +221,16 @@ redactor/
205
221
  │ └── version.rb
206
222
  ├── ext/
207
223
  │ └── data_redactor/
208
- │ ├── extconf.rb # Checks for C headers, generates Makefile
209
- └── data_redactor.c # C extension: regex compilation + redaction
224
+ │ ├── extconf.rb # Checks for C headers, generates Makefile (globs *.c)
225
+ │ ├── data_redactor.c # Entry point: Init_data_redactor only
226
+ │ ├── patterns.{c,h} # Built-in pattern table + compiled regex_t array
227
+ │ ├── placeholder.{c,h} # write_placeholder, djb2 hash, tag_name_for_bit
228
+ │ ├── redact.{c,h} # _redact + replace_all_matches + wrap_boundary
229
+ │ ├── scan.{c,h} # _scan + byte-offset replacement-log macros
230
+ │ ├── custom_patterns.{c,h} # Dynamic registry: add/remove/clear/list
231
+ │ └── tags.h # TAG_* bit constants
210
232
  └── spec/
211
- └── data_redactor_spec.rb # RSpec tests (61 examples, one per pattern)
233
+ └── data_redactor_spec.rb # RSpec tests: at least one example per pattern, plus filter / placeholder / custom-pattern coverage
212
234
  ```
213
235
 
214
236
  ## Requirements
@@ -245,7 +267,7 @@ bundle exec rake
245
267
 
246
268
  ## How it works
247
269
 
248
- 1. At load time, `Init_data_redactor` compiles all 49 regex patterns once using `regcomp` (POSIX ERE) and stores them as static `regex_t` structs. Patterns marked as boundary-wrapped are expanded with `wrap_boundary()` before compilation.
270
+ 1. At load time, `Init_data_redactor` compiles all 79 regex patterns once using `regcomp` (POSIX ERE) and stores them as static `regex_t` structs. Patterns marked as boundary-wrapped are expanded with `wrap_boundary()` before compilation.
249
271
  2. `DataRedactor.redact(text)` receives a Ruby `String`, converts it to a C `char*` via `StringValueCStr`, and runs each compiled pattern in sequence on a working buffer.
250
272
  3. For each pattern, `replace_all_matches` iterates using `regexec`, copies non-matching segments to a fresh output buffer, and inserts `[REDACTED]` in place of each match. For boundary-wrapped patterns, `regexec` is called with `nmatch=4` and sub-match groups `[1]`/`[3]` identify the boundary characters so they are preserved verbatim.
251
273
  4. The output buffer is grown with `realloc` as needed. After all patterns are applied the result is returned as a Ruby `String` via `rb_str_new_cstr`. All intermediate `malloc`/`strdup` allocations are explicitly `free`d.
@@ -254,6 +276,12 @@ bundle exec rake
254
276
 
255
277
  All C-side buffers are heap-allocated with `malloc`/`strdup` and freed before the function returns. The only Ruby-managed allocation is the final return value from `rb_str_new_cstr`. No Ruby objects are created mid-processing, so GC cannot collect anything out from under the C code.
256
278
 
279
+ ## Thread safety
280
+
281
+ `DataRedactor.redact` and `DataRedactor.scan` are safe to call concurrently from multiple threads. Built-in patterns are compiled into a static `regex_t` array at load time and never mutated afterward, and each call allocates its own working buffers. POSIX `regexec` is documented as thread-safe.
282
+
283
+ `DataRedactor.add_pattern`, `remove_pattern`, and `clear_custom_patterns!` mutate a shared dynamic array and are **not** thread-safe. Register custom patterns once at boot — before spawning worker threads or forking — and they will be visible (read-only) to every subsequent `redact`/`scan` call.
284
+
257
285
  ## Versioning
258
286
 
259
287
  This project follows [Semantic Versioning 2.0.0](https://semver.org/spec/v2.0.0.html). Until `1.0.0`, minor versions may introduce breaking changes; from `1.0.0` onward, breaking changes will only land in major versions. See [CHANGELOG.md](CHANGELOG.md) for the release history.
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: data_redactor
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.5.0
4
+ version: 0.6.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Daniele Frisanco
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2026-05-02 00:00:00.000000000 Z
11
+ date: 2026-05-08 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rake-compiler
@@ -38,6 +38,20 @@ dependencies:
38
38
  - - "~>"
39
39
  - !ruby/object:Gem::Version
40
40
  version: '3.12'
41
+ - !ruby/object:Gem::Dependency
42
+ name: yard
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '0.9'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '0.9'
41
55
  description: A Ruby gem with a C extension for high-performance scanning and redaction
42
56
  of 79 sensitive patterns — API keys, tokens, credentials, IBANs, national IDs, emails,
43
57
  phone numbers, and PII from 15+ countries. Designed to sanitize text before sending
@@ -51,8 +65,19 @@ extra_rdoc_files: []
51
65
  files:
52
66
  - CHANGELOG.md
53
67
  - LICENSE
68
+ - ext/data_redactor/custom_patterns.c
69
+ - ext/data_redactor/custom_patterns.h
54
70
  - ext/data_redactor/data_redactor.c
55
71
  - ext/data_redactor/extconf.rb
72
+ - ext/data_redactor/patterns.c
73
+ - ext/data_redactor/patterns.h
74
+ - ext/data_redactor/placeholder.c
75
+ - ext/data_redactor/placeholder.h
76
+ - ext/data_redactor/redact.c
77
+ - ext/data_redactor/redact.h
78
+ - ext/data_redactor/scan.c
79
+ - ext/data_redactor/scan.h
80
+ - ext/data_redactor/tags.h
56
81
  - lib/data_redactor.rb
57
82
  - lib/data_redactor/version.rb
58
83
  - readme.md