RubyGems - data_redactor - Versions diffs - 0.9.0 → 0.10.0 - Mend

data_redactor 0.9.0 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (14) hide show

checksums.yaml +4 -4
data/CHANGELOG.md +27 -0
data/ext/data_redactor/custom_patterns.c +5 -0
data/ext/data_redactor/data_redactor.c +15 -1
data/ext/data_redactor/matcher.c +1193 -0
data/ext/data_redactor/matcher.h +78 -0
data/ext/data_redactor/patterns.c +119 -0
data/ext/data_redactor/patterns.h +11 -0
data/ext/data_redactor/redact.c +106 -33
data/ext/data_redactor/scan.c +141 -92
data/lib/data_redactor/version.rb +1 -1
data/lib/data_redactor.rb +75 -1
data/readme.md +56 -5
metadata +31 -1

data/ext/data_redactor/matcher.h ADDED Viewed

@@ -0,0 +1,78 @@
+#ifndef DATA_REDACTOR_MATCHER_H
+#define DATA_REDACTOR_MATCHER_H
+#include <stddef.h>
+/*
+ * The v19 multi-pattern engine: per-pattern lazy DFA (NFA -> bytecode ->
+ * interned DFA) with two selective merges (pure-digit run pass, IBAN union
+ * pass) and the v19.1 EOL-at-buffer-end fix. Zero dependencies beyond libc.
+ * See docs/research_log.md (v15..v19) for the derivation, and
+ * prototypes/multi_matcher_v1/ for the standalone prototype this is ported from.
+ *
+ * Built-in pattern engines are sourced from the gem's pattern arrays
+ * (pattern_strings[]/boundary_wrapped[]/pattern_required_literal[]), NOT a
+ * compile-time table. Custom patterns (DataRedactor.add_pattern) are appended
+ * at ids [NUM_PATTERNS .. NUM_PATTERNS+custom_count) and always take the
+ * per-pattern path (never folded into the selective merges) — documented as a
+ * divergence in TODO.md §1d.
+ *
+ * Thread-safety: the engine uses file-scope mutable scan scratch. Phase 1 does
+ * NOT release the GVL during a scan, so concurrent redact/scan calls are
+ * serialized by MRI's GVL. See TODO.md §1d "Phase 1 — not done yet" for the
+ * per-call re-entrancy work this defers.
+ */
+typedef struct {
+    int    pattern_id;  /* index into the built-in + custom pattern space */
+    size_t start;       /* byte offset of the CORE span in the ORIGINAL input */
+    size_t length;      /* byte length of the CORE span */
+} mm_match_t;
+/* Build the built-in engines from the gem's pattern arrays. Idempotent;
+ * called once from Init_data_redactor. */
+void mm_init(void);
+/* Append one engine for a custom pattern whose CORE regex is `regex`.
+ * `boundary` mirrors custom_patterns[].boundary (wrap with the boundary group).
+ * Must be called in registration order so the custom id == NUM_PATTERNS + slot.
+ * Returns 0 on success, non-zero if the engine's own parser rejects the regex
+ * (treated as a bug-to-fix, not a silent fallback — see Section E). */
+int  mm_add(const char *regex, int boundary);
+/* Free the engine at custom slot `idx` (0-based among customs) and compact the
+ * custom engine array so the remaining customs keep registration order. */
+void mm_remove(int idx);
+/* Free every custom engine (DataRedactor.clear_custom_patterns!). */
+void mm_clear_custom(void);
+/*
+ * Scan `input` once and write up to `max` match events to `out`, returning the
+ * count. `enable_bits[i]` gates pattern i for i in [0, n_bits); a NULL array,
+ * or one shorter than the pattern count, leaves the patterns it does not
+ * cover disabled. Events carry ORIGINAL-frame offsets.
+ *
+ * Events are NOT pre-resolved for cross-pattern overlap — the caller applies
+ * the index-order greedy claim (mm_resolve) to reproduce the gem's sequential
+ * per-pattern rewrite semantics.
+ */
+size_t mm_scan(const char *input, size_t len,
+               const int *enable_bits, size_t n_bits,
+               mm_match_t *out, size_t max);
+/*
+ * Resolve raw scan events into the non-overlapping set the gem's sequential
+ * per-pattern rewrite would produce: in (pattern_id, start) order, keep an
+ * event iff its CORE span does not overlap an already-kept span. Sorts `ev`
+ * in place and returns the kept count (compacted to the front of `ev`), in
+ * ascending start order. `n` is the number of events in `ev`.
+ */
+size_t mm_resolve(mm_match_t *ev, size_t n);
+/* Pattern name for an id (built-in or custom), or NULL if out of range. */
+const char *mm_pattern_name(int id);
+/* Total number of engines currently built (NUM_PATTERNS + custom count). */
+int mm_pattern_count(void);
+#endif /* DATA_REDACTOR_MATCHER_H */

data/ext/data_redactor/patterns.c CHANGED Viewed

@@ -1,5 +1,6 @@
 #include "patterns.h"
 #include "tags.h"
+#include <stddef.h>  /* NULL for pattern_required_literal entries */
 regex_t compiled_patterns[NUM_PATTERNS];
@@ -271,6 +272,124 @@ const char *pattern_names[NUM_PATTERNS] = {
     "polish_pesel_2"                 /* 87 */
 };
+/*
+ * Required literal substrings for the pre-filter. See pattern_required_literal
+ * in patterns.h for the contract. Conservative: only literals provably required
+ * by the regex source are listed; the rest are NULL (pattern runs always).
+ *
+ * Boundary-wrapped patterns (boundary_wrapped[i] == 1) must consider that the
+ * compiled regex is wrapped with (^|[^0-9A-Za-z])(...)([^0-9A-Za-z]|$), so the
+ * required literal of the core pattern is still the required literal of the
+ * wrapped form — the wrapper only adds boundary-char classes, no literals.
+ *
+ * The 2-letter IBAN country prefixes are case-sensitive in the regex source
+ * (e.g. "DE", "IT"), so the memmem pre-filter is case-sensitive too. This is
+ * consistent with today's matching behaviour.
+ */
+const char *pattern_required_literal[NUM_PATTERNS] = {
+    /* ---- Tier 1: Full URLs ---- */
+    "amazonaws.com",  /*  0: AWS S3 presigned URL */
+    "webhook.office.com",  /*  1: Microsoft Teams webhook */
+    "hooks.slack.com",     /*  2: Slack webhook URL */
+    "mongodb",        /*  3: MongoDB connection string — "mongodb" or "mongodb+srv" both contain it */
+    "ingest.sentry.io",    /*  4: Sentry DSN */
+    "://",            /*  5: URI with embedded password — scheme://...:...@... */
+    /* ---- Tier 2: Long prefixed tokens ---- */
+    "github_pat_",    /*  6 */
+    "eyJ",            /*  7: JWT — the header and payload segments start "eyJ"; at least the first must */
+    "eyJrIjoi",       /*  8: Grafana API token */
+    "ssh-",           /*  9: SSH public key */
+    NULL,             /* 10: Bearer token — "[Bb]earer " has two forms, no single literal. Could memmem twice but skip for now. */
+    "sk-ant-api",     /* 11: Anthropic API key */
+    "sk-proj-",       /* 12: OpenAI project API key */
+    "AIza",           /* 13: Google API key */
+    NULL,             /* 14: AWS access key ID — many prefix alternations (AKIA|ABIA|...); skip pre-filter */
+    NULL,             /* 15: AWS secret access key — pure base64, no literal */
+    "SG.",            /* 16: SendGrid API key */
+    "amzn.mws.",      /* 17: Amazon MWS auth token */
+    NULL,             /* 18: LaunchDarkly — "api-" or "sdk-"; no single literal */
+    "ghp_",           /* 19: GitHub classic PAT */
+    "gho_",           /* 20: GitHub OAuth token */
+    "sk_live_",       /* 21: Stripe secret key */
+    "pk_",            /* 22: ClickUp API key */
+    "glpat-",         /* 23: GitLab PAT */
+    "dop_v1_",        /* 24: DigitalOcean PAT */
+    "dapi",           /* 25: Databricks API token */
+    "SCW",            /* 26: Scaleway access key */
+    "-----BEGIN ",    /* 27: PEM private key header */
+    "-----BEGIN PGP PRIVATE KEY BLOCK-----",  /* 28 — full literal, exact match */
+    "hvs.",           /* 29: HashiCorp Vault service token */
+    "hvb.",           /* 30: HashiCorp Vault batch token */
+    ".atlasv1.",      /* 31: HashiCorp Terraform Cloud API token */
+    /* ---- Tier 3: IBANs ---- */
+    "HU",             /* 32 */
+    "PL",             /* 33 */
+    "FR",             /* 34 */
+    "IT",             /* 35 */
+    "PT",             /* 36 */
+    "ES",             /* 37 */
+    "CZ",             /* 38 */
+    "RO",             /* 39 */
+    "SE",             /* 40 */
+    "DE",             /* 41 */
+    "IE",             /* 42 */
+    "CH",             /* 43 */
+    "AT",             /* 44 */
+    "NL",             /* 45 */
+    "DK",             /* 46 */
+    "FI",             /* 47 */
+    "BE",             /* 48 */
+    "NO",             /* 49 */
+    /* ---- Tier 4: Structured formats ---- */
+    "@",              /* 50: email — '@' is rare in typical text, great filter */
+    NULL,             /* 51: phone E.164 — '+' is too common to filter usefully (URLs, code) */
+    NULL,             /* 52: Brazilian CNPJ — pure digits + separators, no useful literal */
+    NULL,             /* 53: Brazilian CPF — same */
+    NULL,             /* 54: UUID v4 — '-' too common to filter usefully */
+    NULL,             /* 55: IPv4 — digits + '.', no useful literal */
+    NULL,             /* 56: credit card — pure digit alternations */
+    NULL,             /* 57: Indian Aadhaar — digits + '-' or ' ' too common */
+    /* ---- Tier 5: Letter-anchored ---- */
+    NULL,             /* 58: Mexican CURP — letter classes only */
+    NULL,             /* 59: Italian CF omocodia — letter classes only */
+    NULL,             /* 60: Italian CF basic — letter classes only */
+    NULL,             /* 61: UK NIN — letter classes only */
+    NULL,             /* 62: Spanish NIE — [XYZ] + digits + letter */
+    NULL,             /* 63: passport with letter prefix — too generic */
+    /* ---- Tier 6: Boundary-wrapped structured ---- */
+    NULL,             /* 64: Korean RRN — digits + '-' */
+    "756.",           /* 65: Swiss AHV — always starts with "756." */
+    NULL,             /* 66: Finnish HETU — digits + [-+A] */
+    NULL,             /* 67: Swedish personnummer — digits + [-+] */
+    NULL,             /* 68: Danish CPR — digits + '-' */
+    NULL,             /* 69: Czech rodne cislo — digits + optional '/' */
+    NULL,             /* 70: US SSN — digits + '-' */
+    NULL,             /* 71: US ITIN — starts "9", but '9' is too common */
+    NULL,             /* 72: Canadian SIN — digits + '-' */
+    NULL,             /* 73: Australian TFN — digits + '-' or ' ' */
+    NULL,             /* 74: Indian PAN — letters + digits, no required literal */
+    NULL,             /* 75: Spanish DNI — 8 digits + letter */
+    NULL,             /* 76: Hungarian Tax ID — starts "8", too common */
+    /* ---- Tier 7: Boundary-wrapped pure digits ---- */
+    NULL,             /* 77: French NIR — pure digits */
+    NULL,             /* 78: South African ID — pure digits */
+    NULL,             /* 79: Romanian CNP — pure digits */
+    NULL,             /* 80: Japanese My Number — pure digits */
+    NULL,             /* 81: Polish PESEL — pure digits */
+    NULL,             /* 82: Belgian National Number — pure digits */
+    NULL,             /* 83: Norwegian Fødselsnummer — pure digits */
+    NULL,             /* 84: passport 9 digits — pure digits */
+    NULL,             /* 85: Dutch BSN — pure digits */
+    NULL,             /* 86: Austrian Abgabenkontonummer — pure digits */
+    NULL              /* 87: Polish PESEL duplicate — pure digits */
+};
 /*
  * Raw patterns. Boundary-wrapped patterns are stored unwrapped here;
  * the wrapper is applied in Init_data_redactor at compile time.

data/ext/data_redactor/patterns.h CHANGED Viewed

@@ -10,6 +10,17 @@ extern const int   boundary_wrapped[NUM_PATTERNS];
 extern const int   pattern_tags[NUM_PATTERNS];
 extern const char *pattern_names[NUM_PATTERNS];
+/*
+ * Optional case-sensitive literal substring that the input must contain for
+ * the pattern to have any chance of matching. NULL means no pre-filter — the
+ * pattern runs unconditionally. A non-NULL literal must be a string the
+ * regex *requires* (a wrong assignment here is a silent false negative).
+ * The redactor memmem()'s the input for the literal before invoking regexec;
+ * if absent, the pattern is skipped entirely. Big win for typical inputs
+ * where most patterns don't match — saves the per-call O(N) regexec setup.
+ */
+extern const char *pattern_required_literal[NUM_PATTERNS];
 /* Compiled at Init_data_redactor time. */
 extern regex_t compiled_patterns[NUM_PATTERNS];

data/ext/data_redactor/redact.c CHANGED Viewed

@@ -2,6 +2,8 @@
 #include "patterns.h"
 #include "placeholder.h"
 #include "custom_patterns.h"
+#include "matcher.h"
+#include "tags.h"
 #include <string.h>
 #include <stdlib.h>
 #include <stdio.h>
@@ -29,7 +31,13 @@ char *wrap_boundary(const char *core) {
 char *replace_all_matches(regex_t *pattern, const char *input,
                           int use_boundary, const placeholder_t *ph) {
     size_t ph_max   = max_placeholder_len(ph);
-    size_t out_cap  = strlen(input) * 2 + 512;
+    size_t in_len   = strlen(input);
+    /* Worst case per input byte: it is either copied verbatim (1 byte out) or
+     * it is one byte of a match replaced by the longest placeholder (ph_max
+     * bytes out). A single byte is never both, but bounding each byte by
+     * (1 + ph_max) is safe and sized once — no per-match strlen, no realloc. */
+    size_t out_cap  = in_len * (ph_max + 1) + 1;
     char *output = (char *)malloc(out_cap);
     if (!output) return NULL;
@@ -63,14 +71,6 @@ char *replace_all_matches(regex_t *pattern, const char *input,
         size_t ph_len = write_placeholder(ph_buf, ph, cursor + core_so, core_len);
-        size_t needed = out_len + prefix_len + ph_len + suffix_len + strlen(cursor + full_eo) + 1;
-        if (needed > out_cap) {
-            out_cap = needed * 2;
-            char *tmp = (char *)realloc(output, out_cap);
-            if (!tmp) { free(output); free(ph_buf); return NULL; }
-            output = tmp;
-        }
         memcpy(output + out_len, cursor, prefix_len);
         out_len += prefix_len;
@@ -92,13 +92,6 @@ char *replace_all_matches(regex_t *pattern, const char *input,
     free(ph_buf);
     size_t tail_len = strlen(cursor);
-    size_t needed = out_len + tail_len + 1;
-    if (needed > out_cap) {
-        out_cap = needed;
-        char *tmp = (char *)realloc(output, out_cap);
-        if (!tmp) { free(output); return NULL; }
-        output = tmp;
-    }
     memcpy(output + out_len, cursor, tail_len);
     out_len += tail_len;
     output[out_len] = '\0';
@@ -113,6 +106,79 @@ static inline int enable_bit(VALUE rb_enable_bits, long i) {
     return RTEST(v) && NUM2INT(v) != 0;
 }
+/* Copy the first NUM_PATTERNS entries of the enable_bits Array into a C int[].
+ * Only the built-in slice is needed: the v19 engine runs built-ins only; custom
+ * patterns are gated separately in the glibc loop. Caller frees. */
+static int *builtin_enable_bits(VALUE rb_enable_bits) {
+    int *bits = (int *)malloc((size_t)NUM_PATTERNS * sizeof(int));
+    if (!bits) return NULL;
+    long alen = RARRAY_LEN(rb_enable_bits);
+    for (int i = 0; i < NUM_PATTERNS; i++) {
+        if (i < alen) {
+            VALUE v = rb_ary_entry(rb_enable_bits, i);
+            bits[i] = (RTEST(v) && NUM2INT(v) != 0) ? 1 : 0;
+        } else {
+            bits[i] = 0;
+        }
+    }
+    return bits;
+}
+/* Redact the built-in patterns from `input` (len bytes) with the v19 engine,
+ * resolved to today's sequential semantics. Returns a newly malloc'd
+ * NUL-terminated C string (caller frees) and writes its length to *out_len_p.
+ * `bits` gates the built-ins (length NUM_PATTERNS). */
+static char *redact_builtins(const char *input, size_t in_len, const int *bits,
+                             int ph_mode, const char *ph_str_plain,
+                             size_t *out_len_p) {
+    /* Scan + resolve. Grow and rescan if the buffer fills exactly (possible
+     * truncation), so no built-in match is ever silently dropped. */
+    size_t cap = in_len / 4 + 16;
+    mm_match_t *ev = NULL;
+    size_t n;
+    for (;;) {
+        mm_match_t *grown = (mm_match_t *)realloc(ev, cap * sizeof(mm_match_t));
+        if (!grown) { free(ev); return NULL; }
+        ev = grown;
+        n = mm_scan(input, in_len, bits, (size_t)NUM_PATTERNS, ev, cap);
+        if (n < cap) break;
+        cap *= 2;
+    }
+    n = mm_resolve(ev, n);
+    placeholder_t ph;
+    ph.mode = ph_mode;
+    /* Size against the widest placeholder (longest tag name) so one allocation
+     * covers any per-event tag. Each input byte maps to at most (ph_max+1) out
+     * bytes (verbatim, or one byte of a CORE span replaced by ph_max). */
+    ph.str = (ph_mode == PLACEHOLDER_MODE_PLAIN) ? ph_str_plain : "NATIONAL_ID";
+    size_t ph_max = max_placeholder_len(&ph);
+    size_t out_cap = in_len * (ph_max + 1) + 1;
+    char *output = (char *)malloc(out_cap);
+    char *ph_buf = (char *)malloc(ph_max + 1);
+    if (!output || !ph_buf) { free(output); free(ph_buf); free(ev); return NULL; }
+    size_t out_len = 0, cur = 0;
+    for (size_t i = 0; i < n; i++) {
+        size_t s = ev[i].start, l = ev[i].length;
+        if (s > cur) { memcpy(output + out_len, input + cur, s - cur); out_len += s - cur; }
+        ph.str = (ph_mode == PLACEHOLDER_MODE_PLAIN)
+                     ? ph_str_plain
+                     : tag_name_for_bit(pattern_tags[ev[i].pattern_id]);
+        size_t pl = write_placeholder(ph_buf, &ph, input + s, l);
+        memcpy(output + out_len, ph_buf, pl); out_len += pl;
+        cur = s + l;
+    }
+    if (cur < in_len) { memcpy(output + out_len, input + cur, in_len - cur); out_len += in_len - cur; }
+    output[out_len] = '\0';
+    free(ph_buf);
+    free(ev);
+    *out_len_p = out_len;
+    return output;
+}
 VALUE rb_data_redactor_redact(VALUE self, VALUE rb_text,
                               VALUE rb_ph_mode, VALUE rb_ph_str,
                               VALUE rb_enable_bits) {
@@ -123,25 +189,25 @@ VALUE rb_data_redactor_redact(VALUE self, VALUE rb_text,
     int ph_mode = NUM2INT(rb_ph_mode);
     const char *ph_str_plain = StringValueCStr(rb_ph_str);
-    const char *input = StringValueCStr(rb_text);
-    char *working = strdup(input);
-    if (!working) rb_raise(rb_eNoMemError, "strdup failed");
+    const char *input = RSTRING_PTR(rb_text);
+    size_t in_len = (size_t)RSTRING_LEN(rb_text);
+    /* Stage 1: built-ins through the fast v19 engine (single pass, resolved to
+     * earlier-index-wins). */
+    int *bits = builtin_enable_bits(rb_enable_bits);
+    if (!bits) rb_raise(rb_eNoMemError, "enable_bits allocation failed");
+    size_t work_len = 0;
+    char *working = redact_builtins(input, in_len, bits, ph_mode, ph_str_plain, &work_len);
+    free(bits);
+    if (!working) rb_raise(rb_eNoMemError, "built-in redaction allocation failed");
+    /* Stage 2: custom patterns through the glibc regexec path, on the buffer the
+     * built-ins already rewrote — preserving the sequential built-ins→customs
+     * order and full UTF-8 matching for user regex (see Gap 2 hybrid split). The
+     * "[REDACTED…]" placeholders add no incidental text for a custom pattern to
+     * match beyond what the previous sequential implementation already did. */
     placeholder_t ph;
     ph.mode = ph_mode;
-    for (int i = 0; i < NUM_PATTERNS; i++) {
-        if (!enable_bit(rb_enable_bits, i)) continue;
-        ph.str = (ph_mode == PLACEHOLDER_MODE_PLAIN)
-                     ? ph_str_plain
-                     : tag_name_for_bit(pattern_tags[i]);
-        char *result = replace_all_matches(&compiled_patterns[i], working,
-                                           boundary_wrapped[i], &ph);
-        free(working);
-        if (!result) rb_raise(rb_eNoMemError, "replace_all_matches allocation failed");
-        working = result;
-    }
     for (int i = 0; i < custom_count; i++) {
         if (!enable_bit(rb_enable_bits, NUM_PATTERNS + i)) continue;
         ph.str = (ph_mode == PLACEHOLDER_MODE_PLAIN)
@@ -156,5 +222,12 @@ VALUE rb_data_redactor_redact(VALUE self, VALUE rb_text,
     VALUE rb_result = rb_str_new_cstr(working);
     free(working);
+    /* Preserve the input's encoding. We go through Ruby's force_encoding rather
+     * than the C rb_enc_* API because pulling in ruby/encoding.h drags in
+     * onigmo.h, whose regex_t collides with the POSIX <regex.h> this TU uses for
+     * the custom-pattern path. Placeholders are pure ASCII, valid in every
+     * encoding the gem accepts. */
+    rb_funcall(rb_result, rb_intern("force_encoding"), 1,
+               rb_funcall(rb_text, rb_intern("encoding"), 0));
     return rb_result;
 }