RubyGems - data_redactor - Versions diffs - 0.13.0 → 0.14.0 - Mend

data_redactor 0.13.0 → 0.14.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8) hide show

checksums.yaml +4 -4
data/CHANGELOG.md +22 -1
data/README.md +16 -0
data/ext/data_redactor/matcher.c +20 -0
data/ext/data_redactor/patterns.c +40 -5
data/ext/data_redactor/patterns.h +10 -1
data/lib/data_redactor/version.rb +1 -1
metadata +1 -1

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: fa978ec8daa0c8285f48283bc251a512c9263202046100ac03ad39ef4889e070
-  data.tar.gz: 6b5ff39107b5948bcf499fa122e2af4c342d46fb520e94c6d0d3d1f128f30781
+  metadata.gz: b29290519836ca25d5188a5ef4da2585bd7f11faa0c072927863c637fb618eeb
+  data.tar.gz: 465091099d2fcf4b990d4e4259c3c4ad549588839d918d831c9747236f84e864
 SHA512:
-  metadata.gz: 4169cf320312e05e77d5c6fe699f1516f031f6ba097fb9a327e9c7686c462d0c96759b58f03eb2aab024178e01998f6c7607f2ac758eb773e6681b68f8b0717e
-  data.tar.gz: 92a9d114b28305d2da038571259ff3d819371e1276aa66aea436d820e4e540084a0e0c2e555013e53fc454abe50029058e9ca7fcdc239591b86685700cf29d62
+  metadata.gz: fbc51cb331674163af43d4e952bce6ec936db4e3235ca356082a83211ae552d84409bebcdffcb364c09ed8099504ac8418d2fffed3d273d4392d762d99098d59
+  data.tar.gz: e57d9545b5acec4ca25c1c5a30b1987d3d9769f027725e6613f396e7b2bedbe352620278a6cb8c613d5e1a1c1ecabb0e446bf2a4b6ae0339bedf8a4563a33b01

data/CHANGELOG.md CHANGED Viewed

@@ -7,6 +7,26 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ## [Unreleased]
+## [0.14.0] - 2026-06-17
+### Added
+- **Key-name-anchored secret redaction** (`:credentials`). A new pattern tier
+  redacts a secret by the *name of the field it is assigned to*, for values with
+  no distinctive shape of their own — the primary case being an `.env` file or
+  config blob passed through the redactor. Anchored on the key words `password`,
+  `passwd`, `pwd`, `secret`, `token`, `api_key`, `apikey`, `access_key`, and
+  `client_secret` (case-insensitive), followed by `=` or `:` (dotenv and YAML
+  styles), with quoted (`"..."`/`'...'`) or unquoted (≥6 chars) values. Only the
+  **value** is redacted; the key is kept so logs stay greppable
+  (`PASSWORD=[REDACTED]`). Compound key names match whether the secret word is a
+  suffix or prefix segment (`POSTGRES_DB_PASSWORD=`, `PASSWORD_POSTGRES=`).
+  Requires the assignment separator, so the word in prose ("reset your password")
+  is not a false positive.
+- `examples/` directory with runnable, copy-pasteable usage scripts for every
+  feature (core redaction, scan/dry-run, custom patterns, deep/JSON traversal,
+  and the Logger / Rack / Rails / LLM integrations). Repo-only — not packaged in
+  the gem. Linked from the README.
 ## [0.13.0] - 2026-06-13
 ### Changed
@@ -255,7 +275,8 @@ features as 0.7.1 plus the pipeline fix.
 - `DataRedactor.redact(text)` module function returning the input with every match replaced by `[REDACTED]`.
 - RSpec suite with one example per pattern.
-[Unreleased]: https://github.com/danielefrisanco/data_redactor/compare/v0.13.0...HEAD
+[Unreleased]: https://github.com/danielefrisanco/data_redactor/compare/v0.14.0...HEAD
+[0.14.0]: https://github.com/danielefrisanco/data_redactor/compare/v0.13.0...v0.14.0
 [0.13.0]: https://github.com/danielefrisanco/data_redactor/compare/v0.11.0...v0.13.0
 [0.11.0]: https://github.com/danielefrisanco/data_redactor/compare/v0.10.1...v0.11.0
 [0.10.1]: https://github.com/danielefrisanco/data_redactor/compare/v0.10.0...v0.10.1

data/README.md CHANGED Viewed

@@ -46,6 +46,12 @@ DataRedactor.redact(text)
 # => "User CF is [REDACTED] and key is [REDACTED]"
 ```
+Prefer runnable code? The [`examples/`](examples/) directory has self-contained,
+copy-pasteable scripts for every feature below — core redaction, scan/dry-run,
+custom patterns, deep/JSON traversal, and the Logger / Rack / Rails / LLM
+integrations. Run any of them with `bundle exec ruby examples/<name>.rb` (see
+[examples/README.md](examples/README.md)).
 ### Filtering by tag or pattern name
 `only:` and `except:` both accept a single value or an Array, mixing **Symbols** (tag names) and **Strings** (specific pattern names).
@@ -415,6 +421,16 @@ redactor/
 │       └── tags.h                # TAG_* bit constants
 ├── spec/
 │   └── data_redactor_spec.rb     # RSpec tests — at least one example per pattern, plus filter / placeholder / custom-pattern coverage
+├── examples/                     # Repo-only runnable usage scripts (not packaged in the gem)
+│   ├── README.md                 # Index + how to run
+│   ├── basic_redact.rb           # redact, tag filters, placeholder modes
+│   ├── scan_report.rb            # scan dry-run with byte offsets
+│   ├── custom_pattern.rb         # add_pattern + name_pattern
+│   ├── deep_and_json.rb          # redact_deep / redact_json
+│   ├── logger.rb                 # Logger::Formatter integration
+│   ├── rack_middleware.rb        # Rack middleware (body + headers)
+│   ├── rails_filter.rb           # filter_parameters adapter
+│   └── llm_payload.rb            # Claude / OpenAI message + response redaction
 ├── benchmark/                    # Repo-only perf scripts (not packaged in the gem)
 │   ├── README.md                 # How to run, what each script measures
 │   ├── support/corpus.rb         # Shared payload builders + pure-Ruby baseline redactor

data/ext/data_redactor/matcher.c CHANGED Viewed

@@ -406,6 +406,7 @@ typedef struct {
     int         has_first_filter;
     int         use_dfa;
     int         boundary_wrapped;
+    int         keyname_anchored;
     int         has_eol;
     size_t      max_len;
     /* selective-merge membership (built-ins only; customs never join a merge) */
@@ -1014,6 +1015,24 @@ static size_t scan_one(int p, scan_state_t *state, const char *input, size_t len
                     !isalnum((unsigned char)input[core_so])) core_so++;
                 if (core_eo > core_so &&
                     !isalnum((unsigned char)input[core_eo-1])) core_eo--;
+            } else if (eng->keyname_anchored) {
+                /* The match is KEY<sep>VALUE (e.g. PASSWORD="hunter2"). We redact
+                 * only VALUE and keep KEY<sep> so logs stay greppable. The value
+                 * grammar forbids '=' and ':' unquoted, so the FIRST separator in
+                 * the span unambiguously ends the key. Advance past it, then past
+                 * any spaces/tabs that follow it, and strip one opening quote plus
+                 * its matching closing quote when the value is quoted. */
+                size_t s = core_so;
+                while (s < core_eo && input[s] != '=' && input[s] != ':') s++;
+                if (s < core_eo) s++;                       /* skip the separator */
+                while (s < core_eo &&
+                       (input[s] == ' ' || input[s] == '\t')) s++;
+                if (s < core_eo &&
+                    (input[s] == '"' || input[s] == '\'')) {
+                    char q = input[s];
+                    s++;
+                    if (core_eo > s && input[core_eo-1] == q) core_eo--;
+                }
+                core_so = s;
             }
             if (count < max)
                 out[count++] = (mm_match_t){p, core_so, core_eo - core_so};
@@ -1150,6 +1169,7 @@ void mm_init(void) {
     for (int p = 0; p < NUM_PATTERNS; p++) {
         engine_t *eng = eng_grow_one();
         engine_build(eng, pattern_strings[p], boundary_wrapped[p], pattern_names[p]);
+        eng->keyname_anchored = keyname_anchored[p];
         const char *lit = pattern_required_literal[p];
         if (lit) {

data/ext/data_redactor/patterns.c CHANGED Viewed

@@ -120,7 +120,17 @@ const int boundary_wrapped[NUM_PATTERNS] = {
     1, /* 84: Passport 9 digits */
     1, /* 85: Dutch BSN (8-9 digits) */
     1, /* 86: Austrian Abgabenkontonummer (9 digits) */
-    1  /* 87: Polish PESEL duplicate */
+    1, /* 87: Polish PESEL duplicate */
+    0  /* 88: Key-name-anchored secret (KEY=VALUE / KEY: VALUE) */
+};
+/*
+ * keyname_anchored[i] == 1 marks a KEY<sep>VALUE pattern whose match span has
+ * the key + separator (and any quotes) stripped so only VALUE is redacted.
+ * Mutually exclusive with boundary_wrapped[] above. See patterns.h.
+ */
+const int keyname_anchored[NUM_PATTERNS] = {
+    [88] = 1,
 };
 /*
@@ -178,7 +188,8 @@ const int pattern_tags[NUM_PATTERNS] = {
     TAG_TRAVEL,       /* 84: passport 9 digits */
     TAG_NATIONAL_ID,  /* 85: Dutch BSN */
     TAG_TAX_ID,       /* 86: Austrian Abgabenkontonummer */
-    TAG_NATIONAL_ID   /* 87: Polish PESEL duplicate */
+    TAG_NATIONAL_ID,  /* 87: Polish PESEL duplicate */
+    TAG_CREDENTIALS   /* 88: Key-name-anchored secret */
 };
 const char *pattern_names[NUM_PATTERNS] = {
@@ -269,7 +280,8 @@ const char *pattern_names[NUM_PATTERNS] = {
     "passport_9digits",              /* 84 */
     "dutch_bsn",                     /* 85 */
     "austrian_abgabenkontonummer",   /* 86 */
-    "polish_pesel_2"                 /* 87 */
+    "polish_pesel_2",                /* 87 */
+    "keyname_anchored_secret"        /* 88 */
 };
 /*
@@ -387,7 +399,8 @@ const char *pattern_required_literal[NUM_PATTERNS] = {
     NULL,             /* 84: passport 9 digits — pure digits */
     NULL,             /* 85: Dutch BSN — pure digits */
     NULL,             /* 86: Austrian Abgabenkontonummer — pure digits */
-    NULL              /* 87: Polish PESEL duplicate — pure digits */
+    NULL,             /* 87: Polish PESEL duplicate — pure digits */
+    NULL              /* 88: Key-name-anchored — key name is an alternation, no single required literal */
 };
 /*
@@ -587,5 +600,27 @@ const char *pattern_strings[NUM_PATTERNS] = {
     /* 86: Austrian Abgabenkontonummer (9 digits) */
     "[0-9]{9}",
     /* 87: Polish PESEL duplicate */
-    "[0-9]{11}"
+    "[0-9]{11}",
+    /* 88: Key-name-anchored secret (dotenv KEY=VALUE / YAML KEY: VALUE).
+     * POSIX ERE has no /i, so each key name is char-class case-folded by hand.
+     * Keys ordered longest-first so leftmost-longest picks the full name.
+     * The key word may be surrounded by other key-name chars on either side
+     * (unanchored left; [A-Za-z0-9_]* right) so compound names match both ways:
+     * POSTGRES_DB_PASSWORD= (key word last) and PASSWORD_POSTGRES= (key word first).
+     * Separator is = or : with optional surrounding space. Value is either a
+     * quoted run ("..."/'...') or an unquoted token of >=6 chars that stops at
+     * whitespace, quotes, ; , : =. The matcher strips key+sep (keyname_anchored)
+     * so only the value is redacted, the full compound key name is kept. */
+    "([Cc][Ll][Ii][Ee][Nn][Tt]_[Ss][Ee][Cc][Rr][Ee][Tt]"
+    "|[Aa][Cc][Cc][Ee][Ss][Ss]_[Kk][Ee][Yy]"
+    "|[Aa][Pp][Ii]_[Kk][Ee][Yy]"
+    "|[Aa][Pp][Ii][Kk][Ee][Yy]"
+    "|[Pp][Aa][Ss][Ss][Ww][Oo][Rr][Dd]"
+    "|[Pp][Aa][Ss][Ss][Ww][Dd]"
+    "|[Ss][Ee][Cc][Rr][Ee][Tt]"
+    "|[Tt][Oo][Kk][Ee][Nn]"
+    "|[Pp][Ww][Dd])"
+    "[A-Za-z0-9_]*"
+    "[[:space:]]*[=:][[:space:]]*"
+    "(\"[^\"]+\"|'[^']+'|[^[:space:]\"';,:=]{6,})"
 };

data/ext/data_redactor/patterns.h CHANGED Viewed

@@ -3,13 +3,22 @@
 #include <regex.h>
-#define NUM_PATTERNS 88
+#define NUM_PATTERNS 89
 extern const char *pattern_strings[NUM_PATTERNS];
 extern const int   boundary_wrapped[NUM_PATTERNS];
 extern const int   pattern_tags[NUM_PATTERNS];
 extern const char *pattern_names[NUM_PATTERNS];
+/*
+ * Key-name-anchored patterns match KEY<sep>VALUE (e.g. PASSWORD="hunter2") and
+ * redact only VALUE, preserving KEY<sep> so logs stay greppable. The matcher
+ * strips the key+separator (and surrounding quotes/whitespace) from the match
+ * span; see the keyname_anchored branch in matcher.c's match emission. These
+ * are mutually exclusive with boundary_wrapped[] (a span has one strip rule).
+ */
+extern const int   keyname_anchored[NUM_PATTERNS];
 /*
  * Optional case-sensitive literal substring that the input must contain for
  * the pattern to have any chance of matching. NULL means no pre-filter — the

data/lib/data_redactor/version.rb CHANGED Viewed

@@ -1,4 +1,4 @@
 module DataRedactor
   # Current gem version. Follows {https://semver.org Semantic Versioning 2.0.0}.
-  VERSION = "0.13.0"
+  VERSION = "0.14.0"
 end

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: data_redactor
 version: !ruby/object:Gem::Version
-  version: 0.13.0
+  version: 0.14.0
 platform: ruby
 authors:
 - Daniele Frisanco