data_redactor 0.13.0 → 0.14.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: fa978ec8daa0c8285f48283bc251a512c9263202046100ac03ad39ef4889e070
4
- data.tar.gz: 6b5ff39107b5948bcf499fa122e2af4c342d46fb520e94c6d0d3d1f128f30781
3
+ metadata.gz: b29290519836ca25d5188a5ef4da2585bd7f11faa0c072927863c637fb618eeb
4
+ data.tar.gz: 465091099d2fcf4b990d4e4259c3c4ad549588839d918d831c9747236f84e864
5
5
  SHA512:
6
- metadata.gz: 4169cf320312e05e77d5c6fe699f1516f031f6ba097fb9a327e9c7686c462d0c96759b58f03eb2aab024178e01998f6c7607f2ac758eb773e6681b68f8b0717e
7
- data.tar.gz: 92a9d114b28305d2da038571259ff3d819371e1276aa66aea436d820e4e540084a0e0c2e555013e53fc454abe50029058e9ca7fcdc239591b86685700cf29d62
6
+ metadata.gz: fbc51cb331674163af43d4e952bce6ec936db4e3235ca356082a83211ae552d84409bebcdffcb364c09ed8099504ac8418d2fffed3d273d4392d762d99098d59
7
+ data.tar.gz: e57d9545b5acec4ca25c1c5a30b1987d3d9769f027725e6613f396e7b2bedbe352620278a6cb8c613d5e1a1c1ecabb0e446bf2a4b6ae0339bedf8a4563a33b01
data/CHANGELOG.md CHANGED
@@ -7,6 +7,26 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
7
7
 
8
8
  ## [Unreleased]
9
9
 
10
+ ## [0.14.0] - 2026-06-17
11
+
12
+ ### Added
13
+ - **Key-name-anchored secret redaction** (`:credentials`). A new pattern tier
14
+ redacts a secret by the *name of the field it is assigned to*, for values with
15
+ no distinctive shape of their own — the primary case being an `.env` file or
16
+ config blob passed through the redactor. Anchored on the key words `password`,
17
+ `passwd`, `pwd`, `secret`, `token`, `api_key`, `apikey`, `access_key`, and
18
+ `client_secret` (case-insensitive), followed by `=` or `:` (dotenv and YAML
19
+ styles), with quoted (`"..."`/`'...'`) or unquoted (≥6 chars) values. Only the
20
+ **value** is redacted; the key is kept so logs stay greppable
21
+ (`PASSWORD=[REDACTED]`). Compound key names match whether the secret word is a
22
+ suffix or prefix segment (`POSTGRES_DB_PASSWORD=`, `PASSWORD_POSTGRES=`).
23
+ Requires the assignment separator, so the word in prose ("reset your password")
24
+ is not a false positive.
25
+ - `examples/` directory with runnable, copy-pasteable usage scripts for every
26
+ feature (core redaction, scan/dry-run, custom patterns, deep/JSON traversal,
27
+ and the Logger / Rack / Rails / LLM integrations). Repo-only — not packaged in
28
+ the gem. Linked from the README.
29
+
10
30
  ## [0.13.0] - 2026-06-13
11
31
 
12
32
  ### Changed
@@ -255,7 +275,8 @@ features as 0.7.1 plus the pipeline fix.
255
275
  - `DataRedactor.redact(text)` module function returning the input with every match replaced by `[REDACTED]`.
256
276
  - RSpec suite with one example per pattern.
257
277
 
258
- [Unreleased]: https://github.com/danielefrisanco/data_redactor/compare/v0.13.0...HEAD
278
+ [Unreleased]: https://github.com/danielefrisanco/data_redactor/compare/v0.14.0...HEAD
279
+ [0.14.0]: https://github.com/danielefrisanco/data_redactor/compare/v0.13.0...v0.14.0
259
280
  [0.13.0]: https://github.com/danielefrisanco/data_redactor/compare/v0.11.0...v0.13.0
260
281
  [0.11.0]: https://github.com/danielefrisanco/data_redactor/compare/v0.10.1...v0.11.0
261
282
  [0.10.1]: https://github.com/danielefrisanco/data_redactor/compare/v0.10.0...v0.10.1
data/README.md CHANGED
@@ -46,6 +46,12 @@ DataRedactor.redact(text)
46
46
  # => "User CF is [REDACTED] and key is [REDACTED]"
47
47
  ```
48
48
 
49
+ Prefer runnable code? The [`examples/`](examples/) directory has self-contained,
50
+ copy-pasteable scripts for every feature below — core redaction, scan/dry-run,
51
+ custom patterns, deep/JSON traversal, and the Logger / Rack / Rails / LLM
52
+ integrations. Run any of them with `bundle exec ruby examples/<name>.rb` (see
53
+ [examples/README.md](examples/README.md)).
54
+
49
55
  ### Filtering by tag or pattern name
50
56
 
51
57
  `only:` and `except:` both accept a single value or an Array, mixing **Symbols** (tag names) and **Strings** (specific pattern names).
@@ -415,6 +421,16 @@ redactor/
415
421
  │ └── tags.h # TAG_* bit constants
416
422
  ├── spec/
417
423
  │ └── data_redactor_spec.rb # RSpec tests — at least one example per pattern, plus filter / placeholder / custom-pattern coverage
424
+ ├── examples/ # Repo-only runnable usage scripts (not packaged in the gem)
425
+ │ ├── README.md # Index + how to run
426
+ │ ├── basic_redact.rb # redact, tag filters, placeholder modes
427
+ │ ├── scan_report.rb # scan dry-run with byte offsets
428
+ │ ├── custom_pattern.rb # add_pattern + name_pattern
429
+ │ ├── deep_and_json.rb # redact_deep / redact_json
430
+ │ ├── logger.rb # Logger::Formatter integration
431
+ │ ├── rack_middleware.rb # Rack middleware (body + headers)
432
+ │ ├── rails_filter.rb # filter_parameters adapter
433
+ │ └── llm_payload.rb # Claude / OpenAI message + response redaction
418
434
  ├── benchmark/ # Repo-only perf scripts (not packaged in the gem)
419
435
  │ ├── README.md # How to run, what each script measures
420
436
  │ ├── support/corpus.rb # Shared payload builders + pure-Ruby baseline redactor
@@ -406,6 +406,7 @@ typedef struct {
406
406
  int has_first_filter;
407
407
  int use_dfa;
408
408
  int boundary_wrapped;
409
+ int keyname_anchored;
409
410
  int has_eol;
410
411
  size_t max_len;
411
412
  /* selective-merge membership (built-ins only; customs never join a merge) */
@@ -1014,6 +1015,24 @@ static size_t scan_one(int p, scan_state_t *state, const char *input, size_t len
1014
1015
  !isalnum((unsigned char)input[core_so])) core_so++;
1015
1016
  if (core_eo > core_so &&
1016
1017
  !isalnum((unsigned char)input[core_eo-1])) core_eo--;
1018
+ } else if (eng->keyname_anchored) {
1019
+ /* The match is KEY<sep>VALUE (e.g. PASSWORD="hunter2"). We redact
1020
+ * only VALUE and keep KEY<sep> so logs stay greppable. The value
1021
+ * grammar forbids '=' and ':' unquoted, so the FIRST separator in
1022
+ * the span unambiguously ends the key. Advance past it, then past
1023
+ * any following spaces/tabs and a single opening/closing quote. */
1024
+ size_t s = core_so;
1025
+ while (s < core_eo && input[s] != '=' && input[s] != ':') s++;
1026
+ if (s < core_eo) s++; /* skip the separator */
1027
+ while (s < core_eo &&
1028
+ (input[s] == ' ' || input[s] == '\t')) s++;
1029
+ if (s < core_eo &&
1030
+ (input[s] == '"' || input[s] == '\'')) {
1031
+ char q = input[s];
1032
+ s++;
1033
+ if (core_eo > s && input[core_eo-1] == q) core_eo--;
1034
+ }
1035
+ core_so = s;
1017
1036
  }
1018
1037
  if (count < max)
1019
1038
  out[count++] = (mm_match_t){p, core_so, core_eo - core_so};
@@ -1150,6 +1169,7 @@ void mm_init(void) {
1150
1169
  for (int p = 0; p < NUM_PATTERNS; p++) {
1151
1170
  engine_t *eng = eng_grow_one();
1152
1171
  engine_build(eng, pattern_strings[p], boundary_wrapped[p], pattern_names[p]);
1172
+ eng->keyname_anchored = keyname_anchored[p];
1153
1173
 
1154
1174
  const char *lit = pattern_required_literal[p];
1155
1175
  if (lit) {
@@ -120,7 +120,17 @@ const int boundary_wrapped[NUM_PATTERNS] = {
120
120
  1, /* 84: Passport 9 digits */
121
121
  1, /* 85: Dutch BSN (8-9 digits) */
122
122
  1, /* 86: Austrian Abgabenkontonummer (9 digits) */
123
- 1 /* 87: Polish PESEL duplicate */
123
+ 1, /* 87: Polish PESEL duplicate */
124
+ 0 /* 88: Key-name-anchored secret (KEY=VALUE / KEY: VALUE) */
125
+ };
126
+
127
+ /*
128
+ * keyname_anchored[i] == 1 marks a KEY<sep>VALUE pattern whose match span has
129
+ * the key + separator (and any quotes) stripped so only VALUE is redacted.
130
+ * Mutually exclusive with boundary_wrapped[] above. See patterns.h.
131
+ */
132
+ const int keyname_anchored[NUM_PATTERNS] = {
133
+ [88] = 1,
124
134
  };
125
135
 
126
136
  /*
@@ -178,7 +188,8 @@ const int pattern_tags[NUM_PATTERNS] = {
178
188
  TAG_TRAVEL, /* 84: passport 9 digits */
179
189
  TAG_NATIONAL_ID, /* 85: Dutch BSN */
180
190
  TAG_TAX_ID, /* 86: Austrian Abgabenkontonummer */
181
- TAG_NATIONAL_ID /* 87: Polish PESEL duplicate */
191
+ TAG_NATIONAL_ID, /* 87: Polish PESEL duplicate */
192
+ TAG_CREDENTIALS /* 88: Key-name-anchored secret */
182
193
  };
183
194
 
184
195
  const char *pattern_names[NUM_PATTERNS] = {
@@ -269,7 +280,8 @@ const char *pattern_names[NUM_PATTERNS] = {
269
280
  "passport_9digits", /* 84 */
270
281
  "dutch_bsn", /* 85 */
271
282
  "austrian_abgabenkontonummer", /* 86 */
272
- "polish_pesel_2" /* 87 */
283
+ "polish_pesel_2", /* 87 */
284
+ "keyname_anchored_secret" /* 88 */
273
285
  };
274
286
 
275
287
  /*
@@ -387,7 +399,8 @@ const char *pattern_required_literal[NUM_PATTERNS] = {
387
399
  NULL, /* 84: passport 9 digits — pure digits */
388
400
  NULL, /* 85: Dutch BSN — pure digits */
389
401
  NULL, /* 86: Austrian Abgabenkontonummer — pure digits */
390
- NULL /* 87: Polish PESEL duplicate — pure digits */
402
+ NULL, /* 87: Polish PESEL duplicate — pure digits */
403
+ NULL /* 88: Key-name-anchored — key name is an alternation, no single required literal */
391
404
  };
392
405
 
393
406
  /*
@@ -587,5 +600,27 @@ const char *pattern_strings[NUM_PATTERNS] = {
587
600
  /* 86: Austrian Abgabenkontonummer (9 digits) */
588
601
  "[0-9]{9}",
589
602
  /* 87: Polish PESEL duplicate */
590
- "[0-9]{11}"
603
+ "[0-9]{11}",
604
+ /* 88: Key-name-anchored secret (dotenv KEY=VALUE / YAML KEY: VALUE).
605
+ * POSIX ERE has no /i, so each key name is char-class case-folded by hand.
606
+ * Key order does not affect matching: no key name is a prefix of another.
607
+ * The key word may be surrounded by other key-name chars on either side
608
+ * (unanchored left; [A-Za-z0-9_]* right) so compound names match both ways:
609
+ * POSTGRES_DB_PASSWORD= (key word last) and PASSWORD_POSTGRES= (key word first).
610
+ * Separator is = or : with optional surrounding space. Value is either a
611
+ * quoted run ("..."/'...') or an unquoted token of >=6 chars that stops at
612
+ * whitespace, quotes, ; , : =. The matcher strips key+sep (keyname_anchored)
613
+ * so only the value is redacted, the full compound key name is kept. */
614
+ "([Cc][Ll][Ii][Ee][Nn][Tt]_[Ss][Ee][Cc][Rr][Ee][Tt]"
615
+ "|[Aa][Cc][Cc][Ee][Ss][Ss]_[Kk][Ee][Yy]"
616
+ "|[Aa][Pp][Ii]_[Kk][Ee][Yy]"
617
+ "|[Aa][Pp][Ii][Kk][Ee][Yy]"
618
+ "|[Pp][Aa][Ss][Ss][Ww][Oo][Rr][Dd]"
619
+ "|[Pp][Aa][Ss][Ss][Ww][Dd]"
620
+ "|[Ss][Ee][Cc][Rr][Ee][Tt]"
621
+ "|[Tt][Oo][Kk][Ee][Nn]"
622
+ "|[Pp][Ww][Dd])"
623
+ "[A-Za-z0-9_]*"
624
+ "[[:space:]]*[=:][[:space:]]*"
625
+ "(\"[^\"]+\"|'[^']+'|[^[:space:]\"';,:=]{6,})"
591
626
  };
@@ -3,13 +3,22 @@
3
3
 
4
4
  #include <regex.h>
5
5
 
6
- #define NUM_PATTERNS 88
6
+ #define NUM_PATTERNS 89
7
7
 
8
8
  extern const char *pattern_strings[NUM_PATTERNS];
9
9
  extern const int boundary_wrapped[NUM_PATTERNS];
10
10
  extern const int pattern_tags[NUM_PATTERNS];
11
11
  extern const char *pattern_names[NUM_PATTERNS];
12
12
 
13
+ /*
14
+ * Key-name-anchored patterns match KEY<sep>VALUE (e.g. PASSWORD="hunter2") and
15
+ * redact only VALUE, preserving KEY<sep> so logs stay greppable. The matcher
16
+ * strips the key+separator (and surrounding quotes/whitespace) from the match
17
+ * span; see the keyname_anchored branch in matcher.c's match emission. These
18
+ * are mutually exclusive with boundary_wrapped[] (a span has one strip rule).
19
+ */
20
+ extern const int keyname_anchored[NUM_PATTERNS];
21
+
13
22
  /*
14
23
  * Optional case-sensitive literal substring that the input must contain for
15
24
  * the pattern to have any chance of matching. NULL means no pre-filter — the
@@ -1,4 +1,4 @@
1
1
  module DataRedactor
2
2
  # Current gem version. Follows {https://semver.org Semantic Versioning 2.0.0}.
3
- VERSION = "0.13.0"
3
+ VERSION = "0.14.0"
4
4
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: data_redactor
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.13.0
4
+ version: 0.14.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Daniele Frisanco