data_redactor 0.9.0 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,78 @@
1
+ #ifndef DATA_REDACTOR_MATCHER_H
2
+ #define DATA_REDACTOR_MATCHER_H
3
+
4
+ #include <stddef.h>
5
+
6
+ /*
7
+ * The v19 multi-pattern engine: per-pattern lazy DFA (NFA -> bytecode ->
8
+ * interned DFA) with two selective merges (pure-digit run pass, IBAN union
9
+ * pass) and the v19.1 EOL-at-buffer-end fix. Zero dependencies beyond libc.
10
+ * See docs/research_log.md (v15..v19) for the derivation, and
11
+ * prototypes/multi_matcher_v1/ for the standalone prototype this is ported from.
12
+ *
13
+ * Built-in pattern engines are sourced from the gem's pattern arrays
14
+ * (pattern_strings[]/boundary_wrapped[]/pattern_required_literal[]), NOT a
15
+ * compile-time table. Custom patterns (DataRedactor.add_pattern) are appended
16
+ * at ids [NUM_PATTERNS .. NUM_PATTERNS+custom_count) and always take the
17
+ * per-pattern path (never folded into the selective merges) — documented as a
18
+ * divergence in TODO.md §1d.
19
+ *
20
+ * Thread-safety: the engine uses file-scope mutable scan scratch. Phase 1 does
21
+ * NOT release the GVL during a scan, so concurrent redact/scan calls are
22
+ * serialized by MRI's GVL. See TODO.md §1d "Phase 1 — not done yet" for the
23
+ * per-call re-entrancy work this defers.
24
+ */
25
+
26
+ typedef struct {
27
+ int pattern_id; /* index into the built-in + custom pattern space */
28
+ size_t start; /* byte offset of the CORE span in the ORIGINAL input */
29
+ size_t length; /* byte length of the CORE span */
30
+ } mm_match_t;
31
+
32
+ /* Build the built-in engines from the gem's pattern arrays. Idempotent;
33
+ * called once from Init_data_redactor. */
34
+ void mm_init(void);
35
+
36
+ /* Append one engine for a custom pattern whose CORE regex is `regex`.
37
+ * `boundary` mirrors custom_patterns[].boundary (wrap with the boundary group).
38
+ * Must be called in registration order so the custom id == NUM_PATTERNS + slot.
39
+ * Returns 0 on success, non-zero if the engine's own parser rejects the regex
40
+ * (treated as a bug-to-fix, not a silent fallback — see Section E). */
41
+ int mm_add(const char *regex, int boundary);
42
+
43
+ /* Free the engine at custom slot `idx` (0-based among customs) and compact the
44
+ * custom engine array so the remaining customs keep registration order. */
45
+ void mm_remove(int idx);
46
+
47
+ /* Free every custom engine (DataRedactor.clear_custom_patterns!). */
48
+ void mm_clear_custom(void);
49
+
50
+ /*
51
+ * Scan `input` once and write up to `max` match events to `out`, returning the
52
+ * count. `enable_bits[i]` (i in [0, n_bits)) gates pattern i; a NULL or short
53
+ * array disables out-of-range patterns. Events carry ORIGINAL-frame offsets.
54
+ *
55
+ * Events are NOT pre-resolved for cross-pattern overlap — the caller applies
56
+ * the index-order greedy claim (mm_resolve) to reproduce the gem's sequential
57
+ * per-pattern rewrite semantics.
58
+ */
59
+ size_t mm_scan(const char *input, size_t len,
60
+ const int *enable_bits, size_t n_bits,
61
+ mm_match_t *out, size_t max);
62
+
63
+ /*
64
+ * Resolve raw scan events into the non-overlapping set the gem's sequential
65
+ * per-pattern rewrite would produce: in (pattern_id, start) order, keep an
66
+ * event iff its CORE span does not overlap an already-kept span. Sorts `ev`
67
+ * in place and returns the kept count (compacted to the front of `ev`), in
68
+ * ascending start order. `n` is the number of events in `ev`; no pattern-id bound is passed.
69
+ */
70
+ size_t mm_resolve(mm_match_t *ev, size_t n);
71
+
72
+ /* Pattern name for an id (built-in or custom), or NULL if out of range. */
73
+ const char *mm_pattern_name(int id);
74
+
75
+ /* Total number of engines currently built (NUM_PATTERNS + custom count). */
76
+ int mm_pattern_count(void);
77
+
78
+ #endif /* DATA_REDACTOR_MATCHER_H */
@@ -1,5 +1,6 @@
1
1
  #include "patterns.h"
2
2
  #include "tags.h"
3
+ #include <stddef.h> /* NULL for pattern_required_literal entries */
3
4
 
4
5
  regex_t compiled_patterns[NUM_PATTERNS];
5
6
 
@@ -271,6 +272,124 @@ const char *pattern_names[NUM_PATTERNS] = {
271
272
  "polish_pesel_2" /* 87 */
272
273
  };
273
274
 
275
+ /*
276
+ * Required literal substrings for the pre-filter. See pattern_required_literal
277
+ * in patterns.h for the contract. Conservative: only literals provably required
278
+ * by the regex source are listed; the rest are NULL (pattern runs always).
279
+ *
280
+ * For boundary-wrapped patterns (boundary_wrapped[i] == 1), note that the
281
+ * compiled regex is wrapped with (^|[^0-9A-Za-z])(...)([^0-9A-Za-z]|$), so the
282
+ * required literal of the core pattern is still the required literal of the
283
+ * wrapped form — the wrapper only adds boundary-char classes, no literals.
284
+ *
285
+ * The 2-letter IBAN country prefixes are case-sensitive in the regex source
286
+ * (e.g. "DE", "IT"), so the memmem pre-filter is case-sensitive too. This is
287
+ * consistent with today's matching behaviour.
288
+ */
289
+ const char *pattern_required_literal[NUM_PATTERNS] = {
290
+ /* ---- Tier 1: Full URLs ---- */
291
+ "amazonaws.com", /* 0: AWS S3 presigned URL */
292
+ "webhook.office.com", /* 1: Microsoft Teams webhook */
293
+ "hooks.slack.com", /* 2: Slack webhook URL */
294
+ "mongodb", /* 3: MongoDB connection string — "mongodb" or "mongodb+srv" both contain it */
295
+ "ingest.sentry.io", /* 4: Sentry DSN */
296
+ "://", /* 5: URI with embedded password — scheme://...:...@... */
297
+
298
+ /* ---- Tier 2: Long prefixed tokens ---- */
299
+ "github_pat_", /* 6 */
300
+ "eyJ", /* 7: JWT — header and payload segments start "eyJ" (the signature need not); at least the first must */
301
+ "eyJrIjoi", /* 8: Grafana API token */
302
+ "ssh-", /* 9: SSH public key */
303
+ NULL, /* 10: Bearer token — "[Bb]earer " has two forms, no single literal. Could memmem twice but skip for now. */
304
+ "sk-ant-api", /* 11: Anthropic API key */
305
+ "sk-proj-", /* 12: OpenAI project API key */
306
+ "AIza", /* 13: Google API key */
307
+ NULL, /* 14: AWS access key ID — many prefix alternations (AKIA|ABIA|...); skip pre-filter */
308
+ NULL, /* 15: AWS secret access key — pure base64, no literal */
309
+ "SG.", /* 16: SendGrid API key */
310
+ "amzn.mws.", /* 17: Amazon MWS auth token */
311
+ NULL, /* 18: LaunchDarkly — "api-" or "sdk-"; no single literal */
312
+ "ghp_", /* 19: GitHub classic PAT */
313
+ "gho_", /* 20: GitHub OAuth token */
314
+ "sk_live_", /* 21: Stripe secret key */
315
+ "pk_", /* 22: ClickUp API key */
316
+ "glpat-", /* 23: GitLab PAT */
317
+ "dop_v1_", /* 24: DigitalOcean PAT */
318
+ "dapi", /* 25: Databricks API token */
319
+ "SCW", /* 26: Scaleway access key */
320
+ "-----BEGIN ", /* 27: PEM private key header */
321
+ "-----BEGIN PGP PRIVATE KEY BLOCK-----", /* 28 — full literal, exact match */
322
+ "hvs.", /* 29: HashiCorp Vault service token */
323
+ "hvb.", /* 30: HashiCorp Vault batch token */
324
+ ".atlasv1.", /* 31: HashiCorp Terraform Cloud API token */
325
+
326
+ /* ---- Tier 3: IBANs ---- */
327
+ "HU", /* 32 */
328
+ "PL", /* 33 */
329
+ "FR", /* 34 */
330
+ "IT", /* 35 */
331
+ "PT", /* 36 */
332
+ "ES", /* 37 */
333
+ "CZ", /* 38 */
334
+ "RO", /* 39 */
335
+ "SE", /* 40 */
336
+ "DE", /* 41 */
337
+ "IE", /* 42 */
338
+ "CH", /* 43 */
339
+ "AT", /* 44 */
340
+ "NL", /* 45 */
341
+ "DK", /* 46 */
342
+ "FI", /* 47 */
343
+ "BE", /* 48 */
344
+ "NO", /* 49 */
345
+
346
+ /* ---- Tier 4: Structured formats ---- */
347
+ "@", /* 50: email — '@' is rare in typical text, great filter */
348
+ NULL, /* 51: phone E.164 — '+' is too common to filter usefully (URLs, code) */
349
+ NULL, /* 52: Brazilian CNPJ — pure digits + separators, no useful literal */
350
+ NULL, /* 53: Brazilian CPF — same */
351
+ NULL, /* 54: UUID v4 — '-' too common to filter usefully */
352
+ NULL, /* 55: IPv4 — digits + '.', no useful literal */
353
+ NULL, /* 56: credit card — pure digit alternations */
354
+ NULL, /* 57: Indian Aadhaar — digits + '-' or ' ' too common */
355
+
356
+ /* ---- Tier 5: Letter-anchored ---- */
357
+ NULL, /* 58: Mexican CURP — letter classes only */
358
+ NULL, /* 59: Italian CF omocodia — letter classes only */
359
+ NULL, /* 60: Italian CF basic — letter classes only */
360
+ NULL, /* 61: UK NIN — letter classes only */
361
+ NULL, /* 62: Spanish NIE — [XYZ] + digits + letter */
362
+ NULL, /* 63: passport with letter prefix — too generic */
363
+
364
+ /* ---- Tier 6: Boundary-wrapped structured ---- */
365
+ NULL, /* 64: Korean RRN — digits + '-' */
366
+ "756.", /* 65: Swiss AHV — always starts with "756." */
367
+ NULL, /* 66: Finnish HETU — digits + [-+A] */
368
+ NULL, /* 67: Swedish personnummer — digits + [-+] */
369
+ NULL, /* 68: Danish CPR — digits + '-' */
370
+ NULL, /* 69: Czech rodne cislo — digits + optional '/' */
371
+ NULL, /* 70: US SSN — digits + '-' */
372
+ NULL, /* 71: US ITIN — starts "9", but '9' is too common */
373
+ NULL, /* 72: Canadian SIN — digits + '-' */
374
+ NULL, /* 73: Australian TFN — digits + '-' or ' ' */
375
+ NULL, /* 74: Indian PAN — letters + digits, no required literal */
376
+ NULL, /* 75: Spanish DNI — 8 digits + letter */
377
+ NULL, /* 76: Hungarian Tax ID — starts "8", too common */
378
+
379
+ /* ---- Tier 7: Boundary-wrapped pure digits ---- */
380
+ NULL, /* 77: French NIR — pure digits */
381
+ NULL, /* 78: South African ID — pure digits */
382
+ NULL, /* 79: Romanian CNP — pure digits */
383
+ NULL, /* 80: Japanese My Number — pure digits */
384
+ NULL, /* 81: Polish PESEL — pure digits */
385
+ NULL, /* 82: Belgian National Number — pure digits */
386
+ NULL, /* 83: Norwegian Fødselsnummer — pure digits */
387
+ NULL, /* 84: passport 9 digits — pure digits */
388
+ NULL, /* 85: Dutch BSN — pure digits */
389
+ NULL, /* 86: Austrian Abgabenkontonummer — pure digits */
390
+ NULL /* 87: Polish PESEL duplicate — pure digits */
391
+ };
392
+
274
393
  /*
275
394
  * Raw patterns. Boundary-wrapped patterns are stored unwrapped here;
276
395
  * the wrapper is applied in Init_data_redactor at compile time.
@@ -10,6 +10,17 @@ extern const int boundary_wrapped[NUM_PATTERNS];
10
10
  extern const int pattern_tags[NUM_PATTERNS];
11
11
  extern const char *pattern_names[NUM_PATTERNS];
12
12
 
13
+ /*
14
+ * Optional case-sensitive literal substring that the input must contain for
15
+ * the pattern to have any chance of matching. NULL means no pre-filter — the
16
+ * pattern runs unconditionally. A non-NULL literal must be a string the
17
+ * regex *requires* (a wrong assignment here is a silent false negative).
18
+ * The redactor memmem()'s the input for the literal before invoking regexec;
19
+ * if absent, the pattern is skipped entirely. Big win for typical inputs
20
+ * where most patterns don't match — saves the per-call O(N) regexec setup.
21
+ */
22
+ extern const char *pattern_required_literal[NUM_PATTERNS];
23
+
13
24
  /* Compiled at Init_data_redactor time. */
14
25
  extern regex_t compiled_patterns[NUM_PATTERNS];
15
26
 
@@ -2,6 +2,8 @@
2
2
  #include "patterns.h"
3
3
  #include "placeholder.h"
4
4
  #include "custom_patterns.h"
5
+ #include "matcher.h"
6
+ #include "tags.h"
5
7
  #include <string.h>
6
8
  #include <stdlib.h>
7
9
  #include <stdio.h>
@@ -29,7 +31,13 @@ char *wrap_boundary(const char *core) {
29
31
  char *replace_all_matches(regex_t *pattern, const char *input,
30
32
  int use_boundary, const placeholder_t *ph) {
31
33
  size_t ph_max = max_placeholder_len(ph);
32
- size_t out_cap = strlen(input) * 2 + 512;
34
+ size_t in_len = strlen(input);
35
+
36
+ /* Worst case per input byte: it is either copied verbatim (1 byte out) or
37
+ * it is one byte of a match replaced by the longest placeholder (ph_max
38
+ * bytes out). A single byte is never both, but bounding each byte by
39
+ * (1 + ph_max) is safe and sized once — no per-match strlen, no realloc. */
40
+ size_t out_cap = in_len * (ph_max + 1) + 1;
33
41
  char *output = (char *)malloc(out_cap);
34
42
  if (!output) return NULL;
35
43
 
@@ -63,14 +71,6 @@ char *replace_all_matches(regex_t *pattern, const char *input,
63
71
 
64
72
  size_t ph_len = write_placeholder(ph_buf, ph, cursor + core_so, core_len);
65
73
 
66
- size_t needed = out_len + prefix_len + ph_len + suffix_len + strlen(cursor + full_eo) + 1;
67
- if (needed > out_cap) {
68
- out_cap = needed * 2;
69
- char *tmp = (char *)realloc(output, out_cap);
70
- if (!tmp) { free(output); free(ph_buf); return NULL; }
71
- output = tmp;
72
- }
73
-
74
74
  memcpy(output + out_len, cursor, prefix_len);
75
75
  out_len += prefix_len;
76
76
 
@@ -92,13 +92,6 @@ char *replace_all_matches(regex_t *pattern, const char *input,
92
92
  free(ph_buf);
93
93
 
94
94
  size_t tail_len = strlen(cursor);
95
- size_t needed = out_len + tail_len + 1;
96
- if (needed > out_cap) {
97
- out_cap = needed;
98
- char *tmp = (char *)realloc(output, out_cap);
99
- if (!tmp) { free(output); return NULL; }
100
- output = tmp;
101
- }
102
95
  memcpy(output + out_len, cursor, tail_len);
103
96
  out_len += tail_len;
104
97
  output[out_len] = '\0';
@@ -113,6 +106,79 @@ static inline int enable_bit(VALUE rb_enable_bits, long i) {
113
106
  return RTEST(v) && NUM2INT(v) != 0;
114
107
  }
115
108
 
109
+ /* Copy the first NUM_PATTERNS entries of the enable_bits Array into a C int[].
110
+ * Only the built-in slice is needed: the v19 engine runs built-ins only; custom
111
+ * patterns are gated separately in the glibc loop. Caller frees. */
112
+ static int *builtin_enable_bits(VALUE rb_enable_bits) {
113
+ int *bits = (int *)malloc((size_t)NUM_PATTERNS * sizeof(int));
114
+ if (!bits) return NULL;
115
+ long alen = RARRAY_LEN(rb_enable_bits);
116
+ for (int i = 0; i < NUM_PATTERNS; i++) {
117
+ if (i < alen) {
118
+ VALUE v = rb_ary_entry(rb_enable_bits, i);
119
+ bits[i] = (RTEST(v) && NUM2INT(v) != 0) ? 1 : 0;
120
+ } else {
121
+ bits[i] = 0;
122
+ }
123
+ }
124
+ return bits;
125
+ }
126
+
127
+ /* Redact the built-in patterns from `input` (len bytes) with the v19 engine,
128
+ * resolved to today's sequential semantics. Returns a newly malloc'd
129
+ * NUL-terminated C string (caller frees) and writes its length to *out_len_p.
130
+ * `bits` gates the built-ins (length NUM_PATTERNS). */
131
+ static char *redact_builtins(const char *input, size_t in_len, const int *bits,
132
+ int ph_mode, const char *ph_str_plain,
133
+ size_t *out_len_p) {
134
+ /* Scan + resolve. Grow and rescan if the buffer fills exactly (possible
135
+ * truncation), so no built-in match is ever silently dropped. */
136
+ size_t cap = in_len / 4 + 16;
137
+ mm_match_t *ev = NULL;
138
+ size_t n;
139
+ for (;;) {
140
+ mm_match_t *grown = (mm_match_t *)realloc(ev, cap * sizeof(mm_match_t));
141
+ if (!grown) { free(ev); return NULL; }
142
+ ev = grown;
143
+ n = mm_scan(input, in_len, bits, (size_t)NUM_PATTERNS, ev, cap);
144
+ if (n < cap) break;
145
+ cap *= 2;
146
+ }
147
+ n = mm_resolve(ev, n);
148
+
149
+ placeholder_t ph;
150
+ ph.mode = ph_mode;
151
+ /* Size against the widest placeholder (longest tag name) so one allocation
152
+ * covers any per-event tag. Each input byte maps to at most (ph_max+1) out
153
+ * bytes (verbatim, or one byte of a CORE span replaced by ph_max). */
154
+ ph.str = (ph_mode == PLACEHOLDER_MODE_PLAIN) ? ph_str_plain : "NATIONAL_ID";
155
+ size_t ph_max = max_placeholder_len(&ph);
156
+
157
+ size_t out_cap = in_len * (ph_max + 1) + 1;
158
+ char *output = (char *)malloc(out_cap);
159
+ char *ph_buf = (char *)malloc(ph_max + 1);
160
+ if (!output || !ph_buf) { free(output); free(ph_buf); free(ev); return NULL; }
161
+
162
+ size_t out_len = 0, cur = 0;
163
+ for (size_t i = 0; i < n; i++) {
164
+ size_t s = ev[i].start, l = ev[i].length;
165
+ if (s > cur) { memcpy(output + out_len, input + cur, s - cur); out_len += s - cur; }
166
+ ph.str = (ph_mode == PLACEHOLDER_MODE_PLAIN)
167
+ ? ph_str_plain
168
+ : tag_name_for_bit(pattern_tags[ev[i].pattern_id]);
169
+ size_t pl = write_placeholder(ph_buf, &ph, input + s, l);
170
+ memcpy(output + out_len, ph_buf, pl); out_len += pl;
171
+ cur = s + l;
172
+ }
173
+ if (cur < in_len) { memcpy(output + out_len, input + cur, in_len - cur); out_len += in_len - cur; }
174
+ output[out_len] = '\0';
175
+
176
+ free(ph_buf);
177
+ free(ev);
178
+ *out_len_p = out_len;
179
+ return output;
180
+ }
181
+
116
182
  VALUE rb_data_redactor_redact(VALUE self, VALUE rb_text,
117
183
  VALUE rb_ph_mode, VALUE rb_ph_str,
118
184
  VALUE rb_enable_bits) {
@@ -123,25 +189,25 @@ VALUE rb_data_redactor_redact(VALUE self, VALUE rb_text,
123
189
  int ph_mode = NUM2INT(rb_ph_mode);
124
190
  const char *ph_str_plain = StringValueCStr(rb_ph_str);
125
191
 
126
- const char *input = StringValueCStr(rb_text);
127
- char *working = strdup(input);
128
- if (!working) rb_raise(rb_eNoMemError, "strdup failed");
129
-
192
+ const char *input = RSTRING_PTR(rb_text);
193
+ size_t in_len = (size_t)RSTRING_LEN(rb_text);
194
+
195
+ /* Stage 1: built-ins through the fast v19 engine (single pass, resolved to
196
+ * earlier-index-wins). */
197
+ int *bits = builtin_enable_bits(rb_enable_bits);
198
+ if (!bits) rb_raise(rb_eNoMemError, "enable_bits allocation failed");
199
+ size_t work_len = 0;
200
+ char *working = redact_builtins(input, in_len, bits, ph_mode, ph_str_plain, &work_len);
201
+ free(bits);
202
+ if (!working) rb_raise(rb_eNoMemError, "built-in redaction allocation failed");
203
+
204
+ /* Stage 2: custom patterns through the glibc regexec path, on the buffer the
205
+ * built-ins already rewrote — preserving the sequential built-ins→customs
206
+ * order and full UTF-8 matching for user regex (see Gap 2 hybrid split). The
207
+ * "[REDACTED…]" placeholders introduce none of any custom pattern's literals
208
+ * incidentally beyond what today already did. */
130
209
  placeholder_t ph;
131
210
  ph.mode = ph_mode;
132
-
133
- for (int i = 0; i < NUM_PATTERNS; i++) {
134
- if (!enable_bit(rb_enable_bits, i)) continue;
135
- ph.str = (ph_mode == PLACEHOLDER_MODE_PLAIN)
136
- ? ph_str_plain
137
- : tag_name_for_bit(pattern_tags[i]);
138
- char *result = replace_all_matches(&compiled_patterns[i], working,
139
- boundary_wrapped[i], &ph);
140
- free(working);
141
- if (!result) rb_raise(rb_eNoMemError, "replace_all_matches allocation failed");
142
- working = result;
143
- }
144
-
145
211
  for (int i = 0; i < custom_count; i++) {
146
212
  if (!enable_bit(rb_enable_bits, NUM_PATTERNS + i)) continue;
147
213
  ph.str = (ph_mode == PLACEHOLDER_MODE_PLAIN)
@@ -156,5 +222,12 @@ VALUE rb_data_redactor_redact(VALUE self, VALUE rb_text,
156
222
 
157
223
  VALUE rb_result = rb_str_new_cstr(working);
158
224
  free(working);
225
+ /* Preserve the input's encoding. We go through Ruby's force_encoding rather
226
+ * than the C rb_enc_* API because pulling in ruby/encoding.h drags in
227
+ * onigmo.h, whose regex_t collides with the POSIX <regex.h> this TU uses for
228
+ * the custom-pattern path. Placeholders are pure ASCII, valid in every
229
+ * encoding the gem accepts. */
230
+ rb_funcall(rb_result, rb_intern("force_encoding"), 1,
231
+ rb_funcall(rb_text, rb_intern("encoding"), 0));
159
232
  return rb_result;
160
233
  }