data_redactor 0.11.0 → 0.14.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +46 -1
- data/README.md +20 -4
- data/ext/data_redactor/custom_patterns.c +34 -4
- data/ext/data_redactor/custom_patterns.h +10 -0
- data/ext/data_redactor/extconf.rb +5 -0
- data/ext/data_redactor/matcher.c +184 -63
- data/ext/data_redactor/patterns.c +40 -5
- data/ext/data_redactor/patterns.h +10 -1
- data/ext/data_redactor/redact.c +50 -2
- data/ext/data_redactor/scan.c +5 -1
- data/lib/data_redactor/version.rb +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: b29290519836ca25d5188a5ef4da2585bd7f11faa0c072927863c637fb618eeb
|
|
4
|
+
data.tar.gz: 465091099d2fcf4b990d4e4259c3c4ad549588839d918d831c9747236f84e864
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: fbc51cb331674163af43d4e952bce6ec936db4e3235ca356082a83211ae552d84409bebcdffcb364c09ed8099504ac8418d2fffed3d273d4392d762d99098d59
|
|
7
|
+
data.tar.gz: e57d9545b5acec4ca25c1c5a30b1987d3d9769f027725e6613f396e7b2bedbe352620278a6cb8c613d5e1a1c1ecabb0e446bf2a4b6ae0339bedf8a4563a33b01
|
data/CHANGELOG.md
CHANGED
|
@@ -7,6 +7,49 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
|
|
7
7
|
|
|
8
8
|
## [Unreleased]
|
|
9
9
|
|
|
10
|
+
## [0.14.0] - 2026-06-17
|
|
11
|
+
|
|
12
|
+
### Added
|
|
13
|
+
- **Key-name-anchored secret redaction** (`:credentials`). A new pattern tier
|
|
14
|
+
redacts a secret by the *name of the field it is assigned to*, for values with
|
|
15
|
+
no distinctive shape of their own — the primary case being an `.env` file or
|
|
16
|
+
config blob passed through the redactor. Anchored on the key words `password`,
|
|
17
|
+
`passwd`, `pwd`, `secret`, `token`, `api_key`, `apikey`, `access_key`, and
|
|
18
|
+
`client_secret` (case-insensitive), followed by `=` or `:` (dotenv and YAML
|
|
19
|
+
styles), with quoted (`"..."`/`'...'`) or unquoted (≥6 chars) values. Only the
|
|
20
|
+
**value** is redacted; the key is kept so logs stay greppable
|
|
21
|
+
(`PASSWORD=[REDACTED]`). Compound key names match whether the secret word is a
|
|
22
|
+
prefix or suffix segment (`POSTGRES_DB_PASSWORD=`, `PASSWORD_POSTGRES=`).
|
|
23
|
+
Requires the assignment separator, so the word in prose ("reset your password")
|
|
24
|
+
is not a false positive.
|
|
25
|
+
- `examples/` directory with runnable, copy-pasteable usage scripts for every
|
|
26
|
+
feature (core redaction, scan/dry-run, custom patterns, deep/JSON traversal,
|
|
27
|
+
and the Logger / Rack / Rails / LLM integrations). Repo-only — not packaged in
|
|
28
|
+
the gem. Linked from the README.
|
|
29
|
+
|
|
30
|
+
## [0.13.0] - 2026-06-13
|
|
31
|
+
|
|
32
|
+
### Changed
|
|
33
|
+
- **Custom-pattern registration is now thread-safe.** `add_pattern`,
|
|
34
|
+
`remove_pattern`, and `clear_custom_patterns!` are guarded by a mutex shared
|
|
35
|
+
with the `redact`/`scan` custom-pattern loop, so patterns may be registered,
|
|
36
|
+
removed, or cleared from any thread at any time — including at runtime from a
|
|
37
|
+
request handler — without coordinating with in-flight redactions. The previous
|
|
38
|
+
"register custom patterns at boot only" caveat is lifted. (The C extension now
|
|
39
|
+
links `-lpthread` on glibc; no-op on musl and macOS where pthread is in libc.)
|
|
40
|
+
- **`redact` releases the GVL for large inputs.** The v19 engine's per-scan
|
|
41
|
+
mutable state (NFA scratch and the lazy DFA cache) moved into per-thread
|
|
42
|
+
storage, making the engine re-entrant. `redact` now releases the GVL
|
|
43
|
+
(`rb_thread_call_without_gvl`) around the built-in scan for inputs above a few
|
|
44
|
+
KB, so a large redaction on one thread no longer blocks other Ruby threads.
|
|
45
|
+
Small inputs keep the GVL. No public API change; output is byte-for-byte
|
|
46
|
+
identical (verified by a differential gate over ~6000 inputs). The per-thread
|
|
47
|
+
DFA cache's allocation floor was tuned so this adds ~0.86 MB per scanning
|
|
48
|
+
thread (down from a naive ~3.2 MB), with no throughput change. Per-thread scan
|
|
49
|
+
state is freed at thread exit (via a `pthread_key` destructor), so processes
|
|
50
|
+
that churn many short-lived scanning threads do not accumulate dead caches —
|
|
51
|
+
RSS stays flat across thousands of threads.
|
|
52
|
+
|
|
10
53
|
## [0.11.0] - 2026-06-10
|
|
11
54
|
|
|
12
55
|
### Added
|
|
@@ -232,7 +275,9 @@ features as 0.7.1 plus the pipeline fix.
|
|
|
232
275
|
- `DataRedactor.redact(text)` module function returning the input with every match replaced by `[REDACTED]`.
|
|
233
276
|
- RSpec suite with one example per pattern.
|
|
234
277
|
|
|
235
|
-
[Unreleased]: https://github.com/danielefrisanco/data_redactor/compare/v0.11.0...HEAD
|
|
278
|
+
[Unreleased]: https://github.com/danielefrisanco/data_redactor/compare/v0.14.0...HEAD
|
|
279
|
+
[0.14.0]: https://github.com/danielefrisanco/data_redactor/compare/v0.13.0...v0.14.0
|
|
280
|
+
[0.13.0]: https://github.com/danielefrisanco/data_redactor/compare/v0.11.0...v0.13.0
|
|
236
281
|
[0.11.0]: https://github.com/danielefrisanco/data_redactor/compare/v0.10.1...v0.11.0
|
|
237
282
|
[0.10.1]: https://github.com/danielefrisanco/data_redactor/compare/v0.10.0...v0.10.1
|
|
238
283
|
[0.10.0]: https://github.com/danielefrisanco/data_redactor/compare/v0.9.0...v0.10.0
|
data/README.md
CHANGED
|
@@ -19,7 +19,7 @@ It ships **88 built-in patterns** across 15+ countries, grouped into tags
|
|
|
19
19
|
(`:credentials`, `:financial`, `:contact`, ...) so you can redact only what you
|
|
20
20
|
care about. Beyond plain strings it can walk nested Hashes, Arrays, and JSON,
|
|
21
21
|
audit a payload without mutating it (`scan`), and plug into Logger, Rails, and
|
|
22
|
-
Rack. You can also register your own patterns at boot.
|
|
22
|
+
Rack. You can also register your own patterns — at boot or at runtime from any thread.
|
|
23
23
|
|
|
24
24
|
### Use cases
|
|
25
25
|
|
|
@@ -46,6 +46,12 @@ DataRedactor.redact(text)
|
|
|
46
46
|
# => "User CF is [REDACTED] and key is [REDACTED]"
|
|
47
47
|
```
|
|
48
48
|
|
|
49
|
+
Prefer runnable code? The [`examples/`](examples/) directory has self-contained,
|
|
50
|
+
copy-pasteable scripts for every feature below — core redaction, scan/dry-run,
|
|
51
|
+
custom patterns, deep/JSON traversal, and the Logger / Rack / Rails / LLM
|
|
52
|
+
integrations. Run any of them with `bundle exec ruby examples/<name>.rb` (see
|
|
53
|
+
[examples/README.md](examples/README.md)).
|
|
54
|
+
|
|
49
55
|
### Filtering by tag or pattern name
|
|
50
56
|
|
|
51
57
|
`only:` and `except:` both accept a single value or an Array, mixing **Symbols** (tag names) and **Strings** (specific pattern names).
|
|
@@ -161,7 +167,7 @@ DataRedactor.redact_json("not json") # => JSON::ParserError
|
|
|
161
167
|
|
|
162
168
|
### Custom patterns
|
|
163
169
|
|
|
164
|
-
Teams often have internal IDs that the gem can't ship. Register them at boot:
|
|
170
|
+
Teams often have internal IDs that the gem can't ship. Register them at boot — or at runtime from any thread (registration is thread-safe, see [Thread safety](#thread-safety)):
|
|
165
171
|
|
|
166
172
|
```ruby
|
|
167
173
|
# String (POSIX ERE) or Regexp — both accepted
|
|
@@ -415,6 +421,16 @@ redactor/
|
|
|
415
421
|
│ └── tags.h # TAG_* bit constants
|
|
416
422
|
├── spec/
|
|
417
423
|
│ └── data_redactor_spec.rb # RSpec tests — at least one example per pattern, plus filter / placeholder / custom-pattern coverage
|
|
424
|
+
├── examples/ # Repo-only runnable usage scripts (not packaged in the gem)
|
|
425
|
+
│ ├── README.md # Index + how to run
|
|
426
|
+
│ ├── basic_redact.rb # redact, tag filters, placeholder modes
|
|
427
|
+
│ ├── scan_report.rb # scan dry-run with byte offsets
|
|
428
|
+
│ ├── custom_pattern.rb # add_pattern + name_pattern
|
|
429
|
+
│ ├── deep_and_json.rb # redact_deep / redact_json
|
|
430
|
+
│ ├── logger.rb # Logger::Formatter integration
|
|
431
|
+
│ ├── rack_middleware.rb # Rack middleware (body + headers)
|
|
432
|
+
│ ├── rails_filter.rb # filter_parameters adapter
|
|
433
|
+
│ └── llm_payload.rb # Claude / OpenAI message + response redaction
|
|
418
434
|
├── benchmark/ # Repo-only perf scripts (not packaged in the gem)
|
|
419
435
|
│ ├── README.md # How to run, what each script measures
|
|
420
436
|
│ ├── support/corpus.rb # Shared payload builders + pure-Ruby baseline redactor
|
|
@@ -571,9 +587,9 @@ All C-side buffers are heap-allocated with `malloc`/`strdup` and freed before th
|
|
|
571
587
|
|
|
572
588
|
## Thread safety
|
|
573
589
|
|
|
574
|
-
`DataRedactor.redact` and `DataRedactor.scan` are safe to call concurrently from multiple threads. The v19 engine
|
|
590
|
+
`DataRedactor.redact` and `DataRedactor.scan` are safe to call concurrently from multiple threads. The v19 engine keeps its compiled patterns immutable and shared (read-only after `mm_init()` at load time) and all per-scan mutable state — NFA scratch and the lazy DFA cache — in per-thread storage, so concurrent scans never touch each other's state. For inputs above a few KB, `redact` **releases the GVL** (`rb_thread_call_without_gvl`) around the built-in scan, so a large redaction on one thread no longer blocks other Ruby threads from running. Small inputs keep the GVL (the release bookkeeping would cost more than the scan). Each call allocates its own working buffers. A thread's per-thread state is freed automatically when the thread exits, so processes that spawn many short-lived scanning threads do not accumulate memory.
|
|
575
591
|
|
|
576
|
-
`DataRedactor.add_pattern`, `remove_pattern`, and `clear_custom_patterns!`
|
|
592
|
+
`DataRedactor.add_pattern`, `remove_pattern`, and `clear_custom_patterns!` are also thread-safe: the shared custom-pattern array is guarded by a mutex that writers take around the mutation and `redact`/`scan` take around their custom-pattern loop. You can register, remove, or clear custom patterns from any thread at any time — including from request handlers in a running server — without coordinating with in-flight redactions. (Registration is still a rare operation; the lock is uncontended in practice.)
|
|
577
593
|
|
|
578
594
|
## Versioning
|
|
579
595
|
|
|
data/ext/data_redactor/custom_patterns.c
CHANGED
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
#include "redact.h" /* wrap_boundary */
|
|
3
3
|
#include <string.h>
|
|
4
4
|
#include <stdlib.h>
|
|
5
|
+
#include <pthread.h>
|
|
5
6
|
|
|
6
7
|
/* Custom patterns deliberately do NOT use the v19 engine: they keep the glibc
|
|
7
8
|
* regexec path (replace_all_matches), because user regex can contain multibyte
|
|
@@ -12,6 +13,11 @@ custom_pattern_t *custom_patterns = NULL;
|
|
|
12
13
|
int custom_count = 0;
|
|
13
14
|
int custom_cap = 0;
|
|
14
15
|
|
|
16
|
+
static pthread_mutex_t custom_mutex = PTHREAD_MUTEX_INITIALIZER;
|
|
17
|
+
|
|
18
|
+
void custom_patterns_lock(void) { pthread_mutex_lock(&custom_mutex); }
|
|
19
|
+
void custom_patterns_unlock(void) { pthread_mutex_unlock(&custom_mutex); }
|
|
20
|
+
|
|
15
21
|
static int find_custom_by_name(const char *name) {
|
|
16
22
|
for (int i = 0; i < custom_count; i++) {
|
|
17
23
|
if (strcmp(custom_patterns[i].name, name) == 0) return i;
|
|
@@ -58,6 +64,13 @@ VALUE rb_add_pattern(VALUE self, VALUE rb_name, VALUE rb_source,
|
|
|
58
64
|
rb_raise(eClass, "%s", errbuf);
|
|
59
65
|
}
|
|
60
66
|
|
|
67
|
+
/* regcomp succeeded above (no array access yet); now mutate the shared array
|
|
68
|
+
* under the lock. Keep the critical section rb_raise-free: on failure, record
|
|
69
|
+
* it, unlock, then raise outside the lock so the mutex can't leak via longjmp. */
|
|
70
|
+
custom_patterns_lock();
|
|
71
|
+
|
|
72
|
+
const char *err = NULL;
|
|
73
|
+
int stored = 0; /* 1 once `compiled` is owned by a slot (don't regfree it) */
|
|
61
74
|
int idx = find_custom_by_name(name);
|
|
62
75
|
if (idx >= 0) {
|
|
63
76
|
free_custom_at(idx);
|
|
@@ -67,8 +80,8 @@ VALUE rb_add_pattern(VALUE self, VALUE rb_name, VALUE rb_source,
|
|
|
67
80
|
custom_pattern_t *tmp = (custom_pattern_t *)realloc(
|
|
68
81
|
custom_patterns, sizeof(custom_pattern_t) * new_cap);
|
|
69
82
|
if (!tmp) {
|
|
70
|
-
|
|
71
|
-
|
|
83
|
+
err = "custom_patterns realloc failed";
|
|
84
|
+
goto unlock;
|
|
72
85
|
}
|
|
73
86
|
custom_patterns = tmp;
|
|
74
87
|
custom_cap = new_cap;
|
|
@@ -81,9 +94,17 @@ VALUE rb_add_pattern(VALUE self, VALUE rb_name, VALUE rb_source,
|
|
|
81
94
|
custom_patterns[idx].compiled = compiled;
|
|
82
95
|
custom_patterns[idx].tag = tag_bit;
|
|
83
96
|
custom_patterns[idx].boundary = boundary;
|
|
97
|
+
stored = 1;
|
|
84
98
|
|
|
85
99
|
if (!custom_patterns[idx].name || !custom_patterns[idx].source) {
|
|
86
|
-
|
|
100
|
+
err = "strdup failed";
|
|
101
|
+
}
|
|
102
|
+
|
|
103
|
+
unlock:
|
|
104
|
+
custom_patterns_unlock();
|
|
105
|
+
if (err) {
|
|
106
|
+
if (!stored) regfree(&compiled);
|
|
107
|
+
rb_raise(rb_eNoMemError, "%s", err);
|
|
87
108
|
}
|
|
88
109
|
|
|
89
110
|
return Qnil;
|
|
@@ -93,8 +114,12 @@ VALUE rb_remove_pattern(VALUE self, VALUE rb_name) {
|
|
|
93
114
|
Check_Type(rb_name, T_STRING);
|
|
94
115
|
const char *name = StringValueCStr(rb_name);
|
|
95
116
|
|
|
117
|
+
custom_patterns_lock();
|
|
96
118
|
int idx = find_custom_by_name(name);
|
|
97
|
-
if (idx < 0)
|
|
119
|
+
if (idx < 0) {
|
|
120
|
+
custom_patterns_unlock();
|
|
121
|
+
return Qfalse;
|
|
122
|
+
}
|
|
98
123
|
|
|
99
124
|
free_custom_at(idx);
|
|
100
125
|
|
|
@@ -102,19 +127,23 @@ VALUE rb_remove_pattern(VALUE self, VALUE rb_name) {
|
|
|
102
127
|
custom_patterns[i] = custom_patterns[i + 1];
|
|
103
128
|
}
|
|
104
129
|
custom_count--;
|
|
130
|
+
custom_patterns_unlock();
|
|
105
131
|
|
|
106
132
|
return Qtrue;
|
|
107
133
|
}
|
|
108
134
|
|
|
109
135
|
VALUE rb_clear_custom_patterns(VALUE self) {
|
|
136
|
+
custom_patterns_lock();
|
|
110
137
|
for (int i = 0; i < custom_count; i++) {
|
|
111
138
|
free_custom_at(i);
|
|
112
139
|
}
|
|
113
140
|
custom_count = 0;
|
|
141
|
+
custom_patterns_unlock();
|
|
114
142
|
return Qnil;
|
|
115
143
|
}
|
|
116
144
|
|
|
117
145
|
VALUE rb_custom_patterns(VALUE self) {
|
|
146
|
+
custom_patterns_lock();
|
|
118
147
|
VALUE arr = rb_ary_new_capa(custom_count);
|
|
119
148
|
for (int i = 0; i < custom_count; i++) {
|
|
120
149
|
VALUE h = rb_hash_new();
|
|
@@ -124,5 +153,6 @@ VALUE rb_custom_patterns(VALUE self) {
|
|
|
124
153
|
rb_hash_aset(h, ID2SYM(rb_intern("boundary")), custom_patterns[i].boundary ? Qtrue : Qfalse);
|
|
125
154
|
rb_ary_push(arr, h);
|
|
126
155
|
}
|
|
156
|
+
custom_patterns_unlock();
|
|
127
157
|
return arr;
|
|
128
158
|
}
|
|
data/ext/data_redactor/custom_patterns.h
CHANGED
|
@@ -16,6 +16,16 @@ extern custom_pattern_t *custom_patterns;
|
|
|
16
16
|
extern int custom_count;
|
|
17
17
|
extern int custom_cap;
|
|
18
18
|
|
|
19
|
+
/* Guards the custom_patterns array against concurrent mutation. redact/scan
|
|
20
|
+
* take it for the duration of their custom-pattern loop (readers); add/remove/
|
|
21
|
+
* clear take it around the array mutation (writers). A plain mutex is enough:
|
|
22
|
+
* contention is low (registration is rare relative to redaction) and the GVL
|
|
23
|
+
* already serialises everything else, so the only race this closes is a writer
|
|
24
|
+
* realloc/shift running against a reader's iteration. Lock is always taken
|
|
25
|
+
* inside the GVL, never the reverse, so there is no lock-ordering hazard. */
|
|
26
|
+
void custom_patterns_lock(void);
|
|
27
|
+
void custom_patterns_unlock(void);
|
|
28
|
+
|
|
19
29
|
VALUE rb_add_pattern(VALUE self, VALUE rb_name, VALUE rb_source,
|
|
20
30
|
VALUE rb_tag_bit, VALUE rb_boundary);
|
|
21
31
|
VALUE rb_remove_pattern(VALUE self, VALUE rb_name);
|
|
data/ext/data_redactor/extconf.rb
CHANGED
|
@@ -4,6 +4,11 @@ abort "Missing C compiler or stdio.h" unless have_header("stdio.h")
|
|
|
4
4
|
abort "Missing regex.h" unless have_header("regex.h")
|
|
5
5
|
abort "Missing stdlib.h" unless have_header("stdlib.h")
|
|
6
6
|
abort "Missing string.h" unless have_header("string.h")
|
|
7
|
+
abort "Missing pthread.h" unless have_header("pthread.h")
|
|
8
|
+
|
|
9
|
+
# pthread_mutex_* needs -lpthread on glibc; on musl and macOS it lives in libc
|
|
10
|
+
# and have_library is a harmless no-op.
|
|
11
|
+
have_library("pthread")
|
|
7
12
|
|
|
8
13
|
# Compile every .c file in this directory. Order doesn't matter; mkmf
|
|
9
14
|
# generates per-object rules.
|
data/ext/data_redactor/matcher.c
CHANGED
|
@@ -44,6 +44,7 @@
|
|
|
44
44
|
#include <stdint.h>
|
|
45
45
|
#include <ctype.h>
|
|
46
46
|
#include <limits.h>
|
|
47
|
+
#include <pthread.h>
|
|
47
48
|
|
|
48
49
|
/* ========================================================================
|
|
49
50
|
* 0. Utilities
|
|
@@ -386,8 +387,13 @@ typedef struct {
|
|
|
386
387
|
int matched;
|
|
387
388
|
} tlist_t;
|
|
388
389
|
|
|
390
|
+
/* engine_t holds ONLY immutable, compiled state — built once at mm_init()/mm_add()
|
|
391
|
+
* and never written during a scan, so it is safe to share read-only across
|
|
392
|
+
* threads. All per-scan mutable state (NFA scratch, merge cursors) and the lazy
|
|
393
|
+
* DFA cache live in scan_state_t, which is per-thread (t_block below). This
|
|
394
|
+
* split is what lets redact/scan release the GVL: with no shared writes during a
|
|
395
|
+
* scan, concurrent scans on distinct threads cannot race. */
|
|
389
396
|
typedef struct {
|
|
390
|
-
/* compiled, immutable after build (safe to share across scans) */
|
|
391
397
|
prog_t prog;
|
|
392
398
|
size_t min_len;
|
|
393
399
|
const char *req_literal; /* points into a heap copy owned by this engine */
|
|
@@ -400,22 +406,27 @@ typedef struct {
|
|
|
400
406
|
int has_first_filter;
|
|
401
407
|
int use_dfa;
|
|
402
408
|
int boundary_wrapped;
|
|
409
|
+
int keyname_anchored;
|
|
403
410
|
int has_eol;
|
|
404
411
|
size_t max_len;
|
|
405
412
|
/* selective-merge membership (built-ins only; customs never join a merge) */
|
|
406
413
|
int digit_member, digit_lo, digit_hi;
|
|
407
414
|
int iban_member;
|
|
408
|
-
|
|
415
|
+
} engine_t;
|
|
416
|
+
|
|
417
|
+
/* Per-engine MUTABLE scan state. One per engine, owned per-thread (t_block).
|
|
418
|
+
* The DFA cache warms lazily across this thread's scans; the rest is reset each
|
|
419
|
+
* scan. seen_cap==0 / dfa.n_states==0 means "not yet warmed" for this thread. */
|
|
420
|
+
typedef struct {
|
|
409
421
|
dfa_t dfa;
|
|
410
422
|
int *seen;
|
|
411
423
|
int seen_cap;
|
|
412
424
|
tlist_t clist, nlist;
|
|
413
425
|
int *estack;
|
|
414
426
|
int gen;
|
|
415
|
-
/*
|
|
416
|
-
int digit_last_end;
|
|
427
|
+
int digit_last_end; /* selective-merge non-overlap cursors */
|
|
417
428
|
size_t iban_last_end;
|
|
418
|
-
}
|
|
429
|
+
} scan_state_t;
|
|
419
430
|
|
|
420
431
|
static engine_t *g_eng = NULL;
|
|
421
432
|
static int g_eng_n = 0; /* engines built (NUM_PATTERNS + custom_n) */
|
|
@@ -423,6 +434,33 @@ static int g_eng_cap= 0;
|
|
|
423
434
|
static int g_custom_n = 0;
|
|
424
435
|
static int g_initialized = 0;
|
|
425
436
|
|
|
437
|
+
/* Bumped whenever the pattern set changes (mm_add/mm_remove/mm_clear_custom).
|
|
438
|
+
* A thread whose cached t_gen lags this value drops its whole scan-state cache
|
|
439
|
+
* and rebuilds — the simplest safe invalidation (slot p may now hold a
|
|
440
|
+
* different pattern after mm_remove compacts g_eng). Registration is rare, so
|
|
441
|
+
* the full rebuild is cheap; a surgical per-slot invalidation is a possible
|
|
442
|
+
* future refinement (see TODO §"Full thread safety"). */
|
|
443
|
+
static unsigned g_pattern_gen = 0;
|
|
444
|
+
|
|
445
|
+
/* Per-thread mutable scan state: one scan_state_t per engine, lazily grown to
|
|
446
|
+
* g_eng_n. Held in a heap block whose header carries the element count, so the
|
|
447
|
+
* pthread_key destructor (which frees the block at thread exit) is fully
|
|
448
|
+
* self-contained — it must NOT read __thread storage, which may already be torn
|
|
449
|
+
* down when key destructors run. The __thread pointer is the fast hot-path
|
|
450
|
+
* handle; the key holds the same pointer purely so it can be reclaimed on exit.
|
|
451
|
+
* This bounds memory for processes that churn many short-lived scanning threads;
|
|
452
|
+
* fixed pools (Puma/Sidekiq) just reuse the block for the thread's lifetime. */
|
|
453
|
+
typedef struct {
|
|
454
|
+
int n; /* number of scan_state_t entries in states[] */
|
|
455
|
+
scan_state_t states[]; /* flexible array member */
|
|
456
|
+
} thread_block_t;
|
|
457
|
+
|
|
458
|
+
static __thread thread_block_t *t_block = NULL;
|
|
459
|
+
static __thread unsigned t_gen = 0;
|
|
460
|
+
|
|
461
|
+
static pthread_key_t t_block_key;
|
|
462
|
+
static pthread_once_t t_block_key_once = PTHREAD_ONCE_INIT;
|
|
463
|
+
|
|
426
464
|
/* IBAN union-pass dispatch (built-ins only): unique 2-byte country prefixes. */
|
|
427
465
|
static int g_iban_first[256];
|
|
428
466
|
static int g_iban_pair[256][256];
|
|
@@ -606,14 +644,64 @@ static void engine_set_literal(engine_t *eng, const char *lit, int at_start) {
|
|
|
606
644
|
static void engine_free(engine_t *eng) {
|
|
607
645
|
free(eng->prog.code);
|
|
608
646
|
free(eng->req_literal_own);
|
|
609
|
-
|
|
610
|
-
|
|
611
|
-
|
|
612
|
-
|
|
613
|
-
|
|
647
|
+
memset(eng, 0, sizeof(*eng));
|
|
648
|
+
}
|
|
649
|
+
|
|
650
|
+
/* Free one thread's mutable scan state for an engine (scratch + DFA cache).
|
|
651
|
+
* Used when a thread drops its cache on a pattern-set generation change. */
|
|
652
|
+
static void free_scan_state(scan_state_t *st) {
|
|
653
|
+
free(st->seen);
|
|
654
|
+
free(st->clist.list);
|
|
655
|
+
free(st->nlist.list);
|
|
656
|
+
free(st->estack);
|
|
657
|
+
dfa_t *d = &st->dfa;
|
|
614
658
|
free(d->set_pool); free(d->set_off); free(d->set_len);
|
|
615
659
|
free(d->matched); free(d->trans); free(d->hash);
|
|
616
|
-
memset(
|
|
660
|
+
memset(st, 0, sizeof(*st));
|
|
661
|
+
}
|
|
662
|
+
|
|
663
|
+
/* pthread_key destructor: free a thread's whole block at thread exit. Reads only
|
|
664
|
+
* the passed-in pointer + its header count — no __thread access (unsafe here). */
|
|
665
|
+
static void free_thread_block(void *p) {
|
|
666
|
+
thread_block_t *b = (thread_block_t *)p;
|
|
667
|
+
if (!b) return;
|
|
668
|
+
for (int i = 0; i < b->n; i++) free_scan_state(&b->states[i]);
|
|
669
|
+
free(b);
|
|
670
|
+
}
|
|
671
|
+
|
|
672
|
+
static void make_t_block_key(void) {
|
|
673
|
+
if (pthread_key_create(&t_block_key, free_thread_block) != 0) {
|
|
674
|
+
perror("pthread_key_create"); exit(1);
|
|
675
|
+
}
|
|
676
|
+
}
|
|
677
|
+
|
|
678
|
+
/* Return this thread's scan_state_t array, synced to the current pattern set.
|
|
679
|
+
* Drops the whole cache if the pattern set changed (generation guard), then
|
|
680
|
+
* lazily grows (zero-initialised) to cover every engine. Called under the
|
|
681
|
+
* custom-pattern mutex during a scan, so g_pattern_gen / g_eng_n are stable.
|
|
682
|
+
* The owning block is registered with t_block_key so it is freed at thread exit;
|
|
683
|
+
* the key value is re-set after any (re)allocation since the block may move. */
|
|
684
|
+
static scan_state_t *thread_state(void) {
|
|
685
|
+
pthread_once(&t_block_key_once, make_t_block_key);
|
|
686
|
+
|
|
687
|
+
if (t_gen != g_pattern_gen) {
|
|
688
|
+
free_thread_block(t_block);
|
|
689
|
+
t_block = NULL;
|
|
690
|
+
pthread_setspecific(t_block_key, NULL);
|
|
691
|
+
t_gen = g_pattern_gen;
|
|
692
|
+
}
|
|
693
|
+
int have = t_block ? t_block->n : 0;
|
|
694
|
+
if (have < g_eng_n) {
|
|
695
|
+
thread_block_t *nb = realloc(t_block,
|
|
696
|
+
sizeof(thread_block_t) + (size_t)g_eng_n * sizeof(scan_state_t));
|
|
697
|
+
if (!nb) { perror("realloc"); exit(1); }
|
|
698
|
+
memset(&nb->states[have], 0,
|
|
699
|
+
(size_t)(g_eng_n - have) * sizeof(scan_state_t));
|
|
700
|
+
nb->n = g_eng_n;
|
|
701
|
+
t_block = nb;
|
|
702
|
+
pthread_setspecific(t_block_key, nb);
|
|
703
|
+
}
|
|
704
|
+
return t_block->states;
|
|
617
705
|
}
|
|
618
706
|
|
|
619
707
|
/* ========================================================================
|
|
@@ -692,7 +780,13 @@ static void dfa_hash_insert(dfa_t *d, int sid);
|
|
|
692
780
|
|
|
693
781
|
static void dfa_grow_states(dfa_t *d) {
|
|
694
782
|
if (d->n_states < d->states_cap) return;
|
|
695
|
-
|
|
783
|
+
/* Start small (8) and double. Each state owns a 1 KB transition row, and the
|
|
784
|
+
* DFA cache is now per-thread, so the initial cap is the per-thread memory
|
|
785
|
+
* floor multiplied across every engine. Most patterns settle at 1-14 states
|
|
786
|
+
* (max 45), so a floor of 8 fits the common case in 8 KB instead of 64 KB
|
|
787
|
+
* (~4x less per-thread memory across 79 DFA engines); the few larger DFAs
|
|
788
|
+
* just do a couple extra doublings during warmup, off the hot path. */
|
|
789
|
+
int newcap = d->states_cap ? d->states_cap * 2 : 8;
|
|
696
790
|
d->set_off = realloc(d->set_off, (size_t)newcap * sizeof(int));
|
|
697
791
|
d->set_len = realloc(d->set_len, (size_t)newcap * sizeof(int));
|
|
698
792
|
d->matched = realloc(d->matched, (size_t)newcap * sizeof(int));
|
|
@@ -748,28 +842,28 @@ static int dfa_intern(dfa_t *d, const int *set, int n, int matched) {
|
|
|
748
842
|
return sid;
|
|
749
843
|
}
|
|
750
844
|
|
|
751
|
-
static void ensure_scratch(engine_t *eng) {
|
|
845
|
+
static void ensure_scratch(engine_t *eng, scan_state_t *st) {
|
|
752
846
|
prog_t *pr = &eng->prog;
|
|
753
|
-
if (
|
|
754
|
-
|
|
755
|
-
|
|
756
|
-
|
|
757
|
-
|
|
758
|
-
if (!
|
|
847
|
+
if (st->seen_cap >= pr->n) return;
|
|
848
|
+
st->seen = realloc(st->seen, pr->n * sizeof(int));
|
|
849
|
+
st->clist.list = realloc(st->clist.list, pr->n * sizeof(int));
|
|
850
|
+
st->nlist.list = realloc(st->nlist.list, pr->n * sizeof(int));
|
|
851
|
+
st->estack = realloc(st->estack, (2 * pr->n + 1) * sizeof(int));
|
|
852
|
+
if (!st->seen || !st->clist.list || !st->nlist.list || !st->estack) {
|
|
759
853
|
perror("realloc"); exit(1);
|
|
760
854
|
}
|
|
761
|
-
memset(
|
|
762
|
-
|
|
855
|
+
memset(st->seen, 0, pr->n * sizeof(int));
|
|
856
|
+
st->seen_cap = pr->n;
|
|
763
857
|
}
|
|
764
858
|
|
|
765
|
-
static int dfa_compute_trans(engine_t *eng, int sid, unsigned char c) {
|
|
859
|
+
static int dfa_compute_trans(engine_t *eng, scan_state_t *st, int sid, unsigned char c) {
|
|
766
860
|
prog_t *pr = &eng->prog;
|
|
767
|
-
dfa_t *d = &
|
|
768
|
-
int *seen =
|
|
769
|
-
int *estk =
|
|
770
|
-
tlist_t *nl = &
|
|
861
|
+
dfa_t *d = &st->dfa;
|
|
862
|
+
int *seen = st->seen;
|
|
863
|
+
int *estk = st->estack;
|
|
864
|
+
tlist_t *nl = &st->nlist;
|
|
771
865
|
|
|
772
|
-
int gen = ++
|
|
866
|
+
int gen = ++st->gen;
|
|
773
867
|
nl->n = 0; nl->matched = 0;
|
|
774
868
|
|
|
775
869
|
const int *set = &d->set_pool[d->set_off[sid]];
|
|
@@ -800,14 +894,14 @@ static int dfa_compute_trans(engine_t *eng, int sid, unsigned char c) {
|
|
|
800
894
|
return next;
|
|
801
895
|
}
|
|
802
896
|
|
|
803
|
-
static void dfa_build_start(engine_t *eng) {
|
|
897
|
+
static void dfa_build_start(engine_t *eng, scan_state_t *st) {
|
|
804
898
|
prog_t *pr = &eng->prog;
|
|
805
|
-
dfa_t *d = &
|
|
806
|
-
int *seen =
|
|
807
|
-
int *estk =
|
|
808
|
-
tlist_t *cl = &
|
|
899
|
+
dfa_t *d = &st->dfa;
|
|
900
|
+
int *seen = st->seen;
|
|
901
|
+
int *estk = st->estack;
|
|
902
|
+
tlist_t *cl = &st->clist;
|
|
809
903
|
|
|
810
|
-
int gen = ++
|
|
904
|
+
int gen = ++st->gen;
|
|
811
905
|
cl->n = 0; cl->matched = 0;
|
|
812
906
|
addthread_dfa(pr, cl, seen, gen, estk, 0);
|
|
813
907
|
qsort(cl->list, (size_t)cl->n, sizeof(int), int_cmp);
|
|
@@ -821,23 +915,24 @@ static void dfa_build_start(engine_t *eng) {
|
|
|
821
915
|
* 9. Per-pattern scan (scan_one) — identical logic to the prototype
|
|
822
916
|
* ======================================================================== */
|
|
823
917
|
|
|
824
|
-
static size_t scan_one(int p, const char *input, size_t len,
|
|
918
|
+
static size_t scan_one(int p, scan_state_t *state, const char *input, size_t len,
|
|
825
919
|
mm_match_t *out, size_t max, size_t count) {
|
|
826
|
-
engine_t
|
|
827
|
-
|
|
920
|
+
engine_t *eng = &g_eng[p];
|
|
921
|
+
scan_state_t *sst = &state[p];
|
|
922
|
+
prog_t *pr = &eng->prog;
|
|
828
923
|
|
|
829
|
-
ensure_scratch(eng);
|
|
830
|
-
int *seen =
|
|
831
|
-
int *estk =
|
|
832
|
-
tlist_t *cl = &
|
|
924
|
+
ensure_scratch(eng, sst);
|
|
925
|
+
int *seen = sst->seen;
|
|
926
|
+
int *estk = sst->estack;
|
|
927
|
+
tlist_t *cl = &sst->clist, *nl = &sst->nlist;
|
|
833
928
|
|
|
834
|
-
if (
|
|
929
|
+
if (sst->gen > INT_MAX - (int)(2 * (len + 2))) {
|
|
835
930
|
memset(seen, 0, pr->n * sizeof(int));
|
|
836
|
-
|
|
931
|
+
sst->gen = 0;
|
|
837
932
|
}
|
|
838
933
|
|
|
839
|
-
dfa_t *d = &
|
|
840
|
-
if (eng->use_dfa && d->n_states == 0) dfa_build_start(eng);
|
|
934
|
+
dfa_t *d = &sst->dfa;
|
|
935
|
+
if (eng->use_dfa && d->n_states == 0) dfa_build_start(eng, sst);
|
|
841
936
|
|
|
842
937
|
size_t pos = 0;
|
|
843
938
|
while (pos <= len) {
|
|
@@ -874,19 +969,19 @@ static size_t scan_one(int p, const char *input, size_t len,
|
|
|
874
969
|
if (sp == len) break;
|
|
875
970
|
int next = d->trans[st * 256 + (unsigned char)input[sp]];
|
|
876
971
|
if (next == TRANS_UNFILLED)
|
|
877
|
-
next = dfa_compute_trans(eng, st, (unsigned char)input[sp]);
|
|
972
|
+
next = dfa_compute_trans(eng, sst, st, (unsigned char)input[sp]);
|
|
878
973
|
st = next;
|
|
879
974
|
sp++;
|
|
880
975
|
}
|
|
881
976
|
} else {
|
|
882
|
-
int gen = ++
|
|
977
|
+
int gen = ++sst->gen;
|
|
883
978
|
cl->n = 0; cl->matched = 0;
|
|
884
979
|
addthread(pr, cl, seen, gen, estk, 0, input, len, pos);
|
|
885
980
|
while (cl->n > 0 || cl->matched) {
|
|
886
981
|
if (cl->matched && sp - pos >= eng->min_len) match_end = sp;
|
|
887
982
|
if (cl->n == 0 || sp == len) break;
|
|
888
983
|
unsigned char c = (unsigned char)input[sp];
|
|
889
|
-
gen = ++
|
|
984
|
+
gen = ++sst->gen;
|
|
890
985
|
nl->n = 0; nl->matched = 0;
|
|
891
986
|
for (int i = 0; i < cl->n; i++) {
|
|
892
987
|
inst_t *in = &pr->code[cl->list[i]];
|
|
@@ -920,6 +1015,24 @@ static size_t scan_one(int p, const char *input, size_t len,
|
|
|
920
1015
|
!isalnum((unsigned char)input[core_so])) core_so++;
|
|
921
1016
|
if (core_eo > core_so &&
|
|
922
1017
|
!isalnum((unsigned char)input[core_eo-1])) core_eo--;
|
|
1018
|
+
} else if (eng->keyname_anchored) {
|
|
1019
|
+
/* The match is KEY<sep>VALUE (e.g. PASSWORD="hunter2"). We redact
|
|
1020
|
+
* only VALUE and keep KEY<sep> so logs stay greppable. The value
|
|
1021
|
+
* grammar forbids '=' and ':' unquoted, so the FIRST separator in
|
|
1022
|
+
* the span unambiguously ends the key. Advance past it, then past
|
|
1023
|
+
* surrounding whitespace and a single opening/closing quote. */
|
|
1024
|
+
size_t s = core_so;
|
|
1025
|
+
while (s < core_eo && input[s] != '=' && input[s] != ':') s++;
|
|
1026
|
+
if (s < core_eo) s++; /* skip the separator */
|
|
1027
|
+
while (s < core_eo &&
|
|
1028
|
+
(input[s] == ' ' || input[s] == '\t')) s++;
|
|
1029
|
+
if (s < core_eo &&
|
|
1030
|
+
(input[s] == '"' || input[s] == '\'')) {
|
|
1031
|
+
char q = input[s];
|
|
1032
|
+
s++;
|
|
1033
|
+
if (core_eo > s && input[core_eo-1] == q) core_eo--;
|
|
1034
|
+
}
|
|
1035
|
+
core_so = s;
|
|
923
1036
|
}
|
|
924
1037
|
if (count < max)
|
|
925
1038
|
out[count++] = (mm_match_t){p, core_so, core_eo - core_so};
|
|
@@ -935,11 +1048,11 @@ static size_t scan_one(int p, const char *input, size_t len,
|
|
|
935
1048
|
* 10. Selective merges (digit run pass + IBAN union pass)
|
|
936
1049
|
* ======================================================================== */
|
|
937
1050
|
|
|
938
|
-
static size_t scan_digit_group(const char *input, size_t len,
|
|
1051
|
+
static size_t scan_digit_group(scan_state_t *state, const char *input, size_t len,
|
|
939
1052
|
const int *enable_bits, size_t n_bits,
|
|
940
1053
|
mm_match_t *out, size_t max, size_t count) {
|
|
941
1054
|
for (int p = 0; p < g_eng_n; p++)
|
|
942
|
-
if (g_eng[p].digit_member)
|
|
1055
|
+
if (g_eng[p].digit_member) state[p].digit_last_end = 0;
|
|
943
1056
|
|
|
944
1057
|
size_t i = 0;
|
|
945
1058
|
while (i < len) {
|
|
@@ -963,7 +1076,7 @@ static size_t scan_digit_group(const char *input, size_t len,
|
|
|
963
1076
|
|
|
964
1077
|
size_t start;
|
|
965
1078
|
if (rs > 0 && !isalnum((unsigned char)input[rs-1]) &&
|
|
966
|
-
rs - 1 >= (size_t)
|
|
1079
|
+
rs - 1 >= (size_t)state[p].digit_last_end) {
|
|
967
1080
|
start = rs - 1;
|
|
968
1081
|
} else if (rs == 0 || input[rs-1] == '\n') {
|
|
969
1082
|
start = rs;
|
|
@@ -978,22 +1091,22 @@ static size_t scan_digit_group(const char *input, size_t len,
|
|
|
978
1091
|
* separator are resolved exactly as gsub would. */
|
|
979
1092
|
(void)start;
|
|
980
1093
|
out[count++] = (mm_match_t){p, rs, re - rs};
|
|
981
|
-
|
|
1094
|
+
state[p].digit_last_end = (int)end;
|
|
982
1095
|
}
|
|
983
1096
|
if (count >= max) break;
|
|
984
1097
|
}
|
|
985
1098
|
return count;
|
|
986
1099
|
}
|
|
987
1100
|
|
|
988
|
-
static size_t scan_iban_group(const char *input, size_t len,
|
|
1101
|
+
static size_t scan_iban_group(scan_state_t *state, const char *input, size_t len,
|
|
989
1102
|
const int *enable_bits, size_t n_bits,
|
|
990
1103
|
mm_match_t *out, size_t max, size_t count) {
|
|
991
1104
|
for (int p = 0; p < g_eng_n; p++)
|
|
992
1105
|
if (g_eng[p].iban_member) {
|
|
993
|
-
|
|
1106
|
+
state[p].iban_last_end = 0;
|
|
994
1107
|
engine_t *eng = &g_eng[p];
|
|
995
|
-
if (eng->use_dfa &&
|
|
996
|
-
ensure_scratch(eng); dfa_build_start(eng);
|
|
1108
|
+
if (eng->use_dfa && state[p].dfa.n_states == 0) {
|
|
1109
|
+
ensure_scratch(eng, &state[p]); dfa_build_start(eng, &state[p]);
|
|
997
1110
|
}
|
|
998
1111
|
}
|
|
999
1112
|
|
|
@@ -1004,10 +1117,11 @@ static size_t scan_iban_group(const char *input, size_t len,
|
|
|
1004
1117
|
int p = g_iban_pair[c0][(unsigned char)input[i + 1]];
|
|
1005
1118
|
if (p < 0) { i++; continue; }
|
|
1006
1119
|
if ((size_t)p < n_bits && !enable_bits[p]) { i++; continue; }
|
|
1007
|
-
if (i <
|
|
1120
|
+
if (i < state[p].iban_last_end) { i++; continue; }
|
|
1008
1121
|
|
|
1009
|
-
engine_t
|
|
1010
|
-
|
|
1122
|
+
engine_t *eng = &g_eng[p];
|
|
1123
|
+
scan_state_t *sst = &state[p];
|
|
1124
|
+
dfa_t *d = &sst->dfa;
|
|
1011
1125
|
size_t match_end = (size_t)-1, sp = i;
|
|
1012
1126
|
int st = 0;
|
|
1013
1127
|
while (st != DFA_DEAD) {
|
|
@@ -1015,7 +1129,7 @@ static size_t scan_iban_group(const char *input, size_t len,
|
|
|
1015
1129
|
if (sp == len) break;
|
|
1016
1130
|
int next = d->trans[st * 256 + (unsigned char)input[sp]];
|
|
1017
1131
|
if (next == TRANS_UNFILLED)
|
|
1018
|
-
next = dfa_compute_trans(eng, st, (unsigned char)input[sp]);
|
|
1132
|
+
next = dfa_compute_trans(eng, sst, st, (unsigned char)input[sp]);
|
|
1019
1133
|
st = next;
|
|
1020
1134
|
sp++;
|
|
1021
1135
|
}
|
|
@@ -1023,7 +1137,7 @@ static size_t scan_iban_group(const char *input, size_t len,
|
|
|
1023
1137
|
if (match_end != (size_t)-1) {
|
|
1024
1138
|
size_t span = match_end - i;
|
|
1025
1139
|
out[count++] = (mm_match_t){p, i, span};
|
|
1026
|
-
|
|
1140
|
+
sst->iban_last_end = match_end;
|
|
1027
1141
|
i = (span == 0) ? i + 1 : match_end;
|
|
1028
1142
|
} else {
|
|
1029
1143
|
i++;
|
|
@@ -1055,6 +1169,7 @@ void mm_init(void) {
|
|
|
1055
1169
|
for (int p = 0; p < NUM_PATTERNS; p++) {
|
|
1056
1170
|
engine_t *eng = eng_grow_one();
|
|
1057
1171
|
engine_build(eng, pattern_strings[p], boundary_wrapped[p], pattern_names[p]);
|
|
1172
|
+
eng->keyname_anchored = keyname_anchored[p];
|
|
1058
1173
|
|
|
1059
1174
|
const char *lit = pattern_required_literal[p];
|
|
1060
1175
|
if (lit) {
|
|
@@ -1089,6 +1204,7 @@ int mm_add(const char *regex, int boundary) {
|
|
|
1089
1204
|
/* Custom patterns never join the selective merges (TODO §1d Gap 4): they keep
|
|
1090
1205
|
* the per-pattern path. No digit/IBAN membership, no literal-skip hint. */
|
|
1091
1206
|
g_custom_n++;
|
|
1207
|
+
g_pattern_gen++; /* invalidate every thread's cached scan state */
|
|
1092
1208
|
return 0;
|
|
1093
1209
|
}
|
|
1094
1210
|
|
|
@@ -1102,12 +1218,16 @@ void mm_remove(int idx) {
|
|
|
1102
1218
|
g_eng[s] = g_eng[s + 1];
|
|
1103
1219
|
g_eng_n--;
|
|
1104
1220
|
g_custom_n--;
|
|
1221
|
+
/* slot p now holds a DIFFERENT pattern (compaction), so every thread's
|
|
1222
|
+
* scan-state cache indexed by p is stale — invalidate. */
|
|
1223
|
+
g_pattern_gen++;
|
|
1105
1224
|
}
|
|
1106
1225
|
|
|
1107
1226
|
void mm_clear_custom(void) {
|
|
1108
1227
|
for (int s = NUM_PATTERNS; s < g_eng_n; s++) engine_free(&g_eng[s]);
|
|
1109
1228
|
g_eng_n = NUM_PATTERNS;
|
|
1110
1229
|
g_custom_n = 0;
|
|
1230
|
+
g_pattern_gen++;
|
|
1111
1231
|
}
|
|
1112
1232
|
|
|
1113
1233
|
/* ========================================================================
|
|
@@ -1124,18 +1244,19 @@ size_t mm_scan(const char *input, size_t len,
|
|
|
1124
1244
|
const int *enable_bits, size_t n_bits,
|
|
1125
1245
|
mm_match_t *out, size_t max) {
|
|
1126
1246
|
if (!g_initialized) mm_init();
|
|
1247
|
+
scan_state_t *state = thread_state();
|
|
1127
1248
|
size_t count = 0;
|
|
1128
1249
|
|
|
1129
1250
|
for (int p = 0; p < g_eng_n && count < max; p++) {
|
|
1130
1251
|
if (g_eng[p].digit_member) continue;
|
|
1131
1252
|
if (g_eng[p].iban_member) continue;
|
|
1132
1253
|
if (!enabled(enable_bits, n_bits, p)) continue;
|
|
1133
|
-
count = scan_one(p, input, len, out, max, count);
|
|
1254
|
+
count = scan_one(p, state, input, len, out, max, count);
|
|
1134
1255
|
}
|
|
1135
1256
|
if (g_have_iban_group && count < max)
|
|
1136
|
-
count = scan_iban_group(input, len, enable_bits, n_bits, out, max, count);
|
|
1257
|
+
count = scan_iban_group(state, input, len, enable_bits, n_bits, out, max, count);
|
|
1137
1258
|
if (g_have_digit_group && count < max)
|
|
1138
|
-
count = scan_digit_group(input, len, enable_bits, n_bits, out, max, count);
|
|
1259
|
+
count = scan_digit_group(state, input, len, enable_bits, n_bits, out, max, count);
|
|
1139
1260
|
return count;
|
|
1140
1261
|
}
|
|
1141
1262
|
|
|
@@ -120,7 +120,17 @@ const int boundary_wrapped[NUM_PATTERNS] = {
|
|
|
120
120
|
1, /* 84: Passport 9 digits */
|
|
121
121
|
1, /* 85: Dutch BSN (8-9 digits) */
|
|
122
122
|
1, /* 86: Austrian Abgabenkontonummer (9 digits) */
|
|
123
|
-
1
|
|
123
|
+
1, /* 87: Polish PESEL duplicate */
|
|
124
|
+
0 /* 88: Key-name-anchored secret (KEY=VALUE / KEY: VALUE) */
|
|
125
|
+
};
|
|
126
|
+
|
|
127
|
+
/*
|
|
128
|
+
* keyname_anchored[i] == 1 marks a KEY<sep>VALUE pattern whose match span has
|
|
129
|
+
* the key + separator (and any quotes) stripped so only VALUE is redacted.
|
|
130
|
+
* Mutually exclusive with boundary_wrapped[] above. See patterns.h.
|
|
131
|
+
*/
|
|
132
|
+
const int keyname_anchored[NUM_PATTERNS] = {
|
|
133
|
+
[88] = 1,
|
|
124
134
|
};
|
|
125
135
|
|
|
126
136
|
/*
|
|
@@ -178,7 +188,8 @@ const int pattern_tags[NUM_PATTERNS] = {
|
|
|
178
188
|
TAG_TRAVEL, /* 84: passport 9 digits */
|
|
179
189
|
TAG_NATIONAL_ID, /* 85: Dutch BSN */
|
|
180
190
|
TAG_TAX_ID, /* 86: Austrian Abgabenkontonummer */
|
|
181
|
-
TAG_NATIONAL_ID
|
|
191
|
+
TAG_NATIONAL_ID, /* 87: Polish PESEL duplicate */
|
|
192
|
+
TAG_CREDENTIALS /* 88: Key-name-anchored secret */
|
|
182
193
|
};
|
|
183
194
|
|
|
184
195
|
const char *pattern_names[NUM_PATTERNS] = {
|
|
@@ -269,7 +280,8 @@ const char *pattern_names[NUM_PATTERNS] = {
|
|
|
269
280
|
"passport_9digits", /* 84 */
|
|
270
281
|
"dutch_bsn", /* 85 */
|
|
271
282
|
"austrian_abgabenkontonummer", /* 86 */
|
|
272
|
-
"polish_pesel_2"
|
|
283
|
+
"polish_pesel_2", /* 87 */
|
|
284
|
+
"keyname_anchored_secret" /* 88 */
|
|
273
285
|
};
|
|
274
286
|
|
|
275
287
|
/*
|
|
@@ -387,7 +399,8 @@ const char *pattern_required_literal[NUM_PATTERNS] = {
|
|
|
387
399
|
NULL, /* 84: passport 9 digits — pure digits */
|
|
388
400
|
NULL, /* 85: Dutch BSN — pure digits */
|
|
389
401
|
NULL, /* 86: Austrian Abgabenkontonummer — pure digits */
|
|
390
|
-
NULL
|
|
402
|
+
NULL, /* 87: Polish PESEL duplicate — pure digits */
|
|
403
|
+
NULL /* 88: Key-name-anchored — key name is an alternation, no single required literal */
|
|
391
404
|
};
|
|
392
405
|
|
|
393
406
|
/*
|
|
@@ -587,5 +600,27 @@ const char *pattern_strings[NUM_PATTERNS] = {
|
|
|
587
600
|
/* 86: Austrian Abgabenkontonummer (9 digits) */
|
|
588
601
|
"[0-9]{9}",
|
|
589
602
|
/* 87: Polish PESEL duplicate */
|
|
590
|
-
"[0-9]{11}"
|
|
603
|
+
"[0-9]{11}",
|
|
604
|
+
/* 88: Key-name-anchored secret (dotenv KEY=VALUE / YAML KEY: VALUE).
|
|
605
|
+
* POSIX ERE has no /i, so each key name is char-class case-folded by hand.
|
|
606
|
+
* Keys ordered longest-first so leftmost-longest picks the full name.
|
|
607
|
+
* The key word may be surrounded by other key-name chars on either side
|
|
608
|
+
* (unanchored left; [A-Za-z0-9_]* right) so compound names match both ways:
|
|
609
|
+
* POSTGRES_DB_PASSWORD= (prefix) and PASSWORD_POSTGRES= (suffix).
|
|
610
|
+
* Separator is = or : with optional surrounding space. Value is either a
|
|
611
|
+
* quoted run ("..."/'...') or an unquoted token of >=6 chars that stops at
|
|
612
|
+
* whitespace, quotes, ; , : =. The matcher strips key+sep (keyname_anchored)
|
|
613
|
+
* so only the value is redacted, the full compound key name is kept. */
|
|
614
|
+
"([Cc][Ll][Ii][Ee][Nn][Tt]_[Ss][Ee][Cc][Rr][Ee][Tt]"
|
|
615
|
+
"|[Aa][Cc][Cc][Ee][Ss][Ss]_[Kk][Ee][Yy]"
|
|
616
|
+
"|[Aa][Pp][Ii]_[Kk][Ee][Yy]"
|
|
617
|
+
"|[Aa][Pp][Ii][Kk][Ee][Yy]"
|
|
618
|
+
"|[Pp][Aa][Ss][Ss][Ww][Oo][Rr][Dd]"
|
|
619
|
+
"|[Pp][Aa][Ss][Ss][Ww][Dd]"
|
|
620
|
+
"|[Ss][Ee][Cc][Rr][Ee][Tt]"
|
|
621
|
+
"|[Tt][Oo][Kk][Ee][Nn]"
|
|
622
|
+
"|[Pp][Ww][Dd])"
|
|
623
|
+
"[A-Za-z0-9_]*"
|
|
624
|
+
"[[:space:]]*[=:][[:space:]]*"
|
|
625
|
+
"(\"[^\"]+\"|'[^']+'|[^[:space:]\"';,:=]{6,})"
|
|
591
626
|
};
|
|
@@ -3,13 +3,22 @@
|
|
|
3
3
|
|
|
4
4
|
#include <regex.h>
|
|
5
5
|
|
|
6
|
-
#define NUM_PATTERNS
|
|
6
|
+
#define NUM_PATTERNS 89
|
|
7
7
|
|
|
8
8
|
extern const char *pattern_strings[NUM_PATTERNS];
|
|
9
9
|
extern const int boundary_wrapped[NUM_PATTERNS];
|
|
10
10
|
extern const int pattern_tags[NUM_PATTERNS];
|
|
11
11
|
extern const char *pattern_names[NUM_PATTERNS];
|
|
12
12
|
|
|
13
|
+
/*
|
|
14
|
+
* Key-name-anchored patterns match KEY<sep>VALUE (e.g. PASSWORD="hunter2") and
|
|
15
|
+
* redact only VALUE, preserving KEY<sep> so logs stay greppable. The matcher
|
|
16
|
+
* strips the key+separator (and surrounding quotes/whitespace) from the match
|
|
17
|
+
* span; see the keyname_anchored branch in matcher.c's match emission. These
|
|
18
|
+
* are mutually exclusive with boundary_wrapped[] (a span has one strip rule).
|
|
19
|
+
*/
|
|
20
|
+
extern const int keyname_anchored[NUM_PATTERNS];
|
|
21
|
+
|
|
13
22
|
/*
|
|
14
23
|
* Optional case-sensitive literal substring that the input must contain for
|
|
15
24
|
* the pattern to have any chance of matching. NULL means no pre-filter — the
|
data/ext/data_redactor/redact.c
CHANGED
|
@@ -4,10 +4,19 @@
|
|
|
4
4
|
#include "custom_patterns.h"
|
|
5
5
|
#include "matcher.h"
|
|
6
6
|
#include "tags.h"
|
|
7
|
+
#include <ruby/thread.h>
|
|
7
8
|
#include <string.h>
|
|
8
9
|
#include <stdlib.h>
|
|
9
10
|
#include <stdio.h>
|
|
10
11
|
|
|
12
|
+
/* Inputs at or above this byte size release the GVL around the built-in v19
|
|
13
|
+
* pass so other Ruby threads can run during the scan. Below it, the
|
|
14
|
+
* rb_thread_call_without_gvl bookkeeping costs more than the scan, so we keep
|
|
15
|
+
* the GVL. The Ruby layer chunks inputs > CHUNK_SIZE (64 KB) before calling
|
|
16
|
+
* _redact, so the practical ceiling per call is one chunk; 4 KB cleanly
|
|
17
|
+
* separates per-leaf/log-line calls (keep GVL) from chunk-sized work (release). */
|
|
18
|
+
#define GVL_RELEASE_THRESHOLD (4 * 1024)
|
|
19
|
+
|
|
11
20
|
char *wrap_boundary(const char *core) {
|
|
12
21
|
const char *prefix = "(^|[^0-9A-Za-z])(";
|
|
13
22
|
const char *suffix = ")([^0-9A-Za-z]|$)";
|
|
@@ -179,6 +188,41 @@ static char *redact_builtins(const char *input, size_t in_len, const int *bits,
|
|
|
179
188
|
return output;
|
|
180
189
|
}
|
|
181
190
|
|
|
191
|
+
/* Trampoline for running redact_builtins() with the GVL released. Everything it
|
|
192
|
+
* touches is plain C (raw char* in/out, the per-thread engine state); no Ruby
|
|
193
|
+
* VALUE or Ruby API call happens inside, which is the contract for
|
|
194
|
+
* rb_thread_call_without_gvl. */
|
|
195
|
+
typedef struct {
|
|
196
|
+
const char *input;
|
|
197
|
+
size_t in_len;
|
|
198
|
+
const int *bits;
|
|
199
|
+
int ph_mode;
|
|
200
|
+
const char *ph_str_plain;
|
|
201
|
+
size_t out_len;
|
|
202
|
+
char *result;
|
|
203
|
+
} builtins_args_t;
|
|
204
|
+
|
|
205
|
+
static void *redact_builtins_nogvl(void *p) {
|
|
206
|
+
builtins_args_t *a = (builtins_args_t *)p;
|
|
207
|
+
a->result = redact_builtins(a->input, a->in_len, a->bits,
|
|
208
|
+
a->ph_mode, a->ph_str_plain, &a->out_len);
|
|
209
|
+
return NULL;
|
|
210
|
+
}
|
|
211
|
+
|
|
212
|
+
/* Run the built-in v19 pass, releasing the GVL for inputs large enough that the
|
|
213
|
+
* scan dominates the release bookkeeping. Small inputs run inline under the GVL. */
|
|
214
|
+
static char *redact_builtins_maybe_nogvl(const char *input, size_t in_len,
|
|
215
|
+
const int *bits, int ph_mode,
|
|
216
|
+
const char *ph_str_plain, size_t *out_len_p) {
|
|
217
|
+
if (in_len < GVL_RELEASE_THRESHOLD)
|
|
218
|
+
return redact_builtins(input, in_len, bits, ph_mode, ph_str_plain, out_len_p);
|
|
219
|
+
|
|
220
|
+
builtins_args_t a = { input, in_len, bits, ph_mode, ph_str_plain, 0, NULL };
|
|
221
|
+
rb_thread_call_without_gvl(redact_builtins_nogvl, &a, RUBY_UBF_IO, NULL);
|
|
222
|
+
*out_len_p = a.out_len;
|
|
223
|
+
return a.result;
|
|
224
|
+
}
|
|
225
|
+
|
|
182
226
|
VALUE rb_data_redactor_redact(VALUE self, VALUE rb_text,
|
|
183
227
|
VALUE rb_ph_mode, VALUE rb_ph_str,
|
|
184
228
|
VALUE rb_enable_bits) {
|
|
@@ -197,7 +241,7 @@ VALUE rb_data_redactor_redact(VALUE self, VALUE rb_text,
|
|
|
197
241
|
int *bits = builtin_enable_bits(rb_enable_bits);
|
|
198
242
|
if (!bits) rb_raise(rb_eNoMemError, "enable_bits allocation failed");
|
|
199
243
|
size_t work_len = 0;
|
|
200
|
-
char *working =
|
|
244
|
+
char *working = redact_builtins_maybe_nogvl(input, in_len, bits, ph_mode, ph_str_plain, &work_len);
|
|
201
245
|
free(bits);
|
|
202
246
|
if (!working) rb_raise(rb_eNoMemError, "built-in redaction allocation failed");
|
|
203
247
|
|
|
@@ -208,6 +252,8 @@ VALUE rb_data_redactor_redact(VALUE self, VALUE rb_text,
|
|
|
208
252
|
* incidentally beyond what today already did. */
|
|
209
253
|
placeholder_t ph;
|
|
210
254
|
ph.mode = ph_mode;
|
|
255
|
+
custom_patterns_lock();
|
|
256
|
+
int oom = 0;
|
|
211
257
|
for (int i = 0; i < custom_count; i++) {
|
|
212
258
|
if (!enable_bit(rb_enable_bits, NUM_PATTERNS + i)) continue;
|
|
213
259
|
ph.str = (ph_mode == PLACEHOLDER_MODE_PLAIN)
|
|
@@ -216,9 +262,11 @@ VALUE rb_data_redactor_redact(VALUE self, VALUE rb_text,
|
|
|
216
262
|
char *result = replace_all_matches(&custom_patterns[i].compiled, working,
|
|
217
263
|
custom_patterns[i].boundary, &ph);
|
|
218
264
|
free(working);
|
|
219
|
-
if (!result)
|
|
265
|
+
if (!result) { working = NULL; oom = 1; break; }
|
|
220
266
|
working = result;
|
|
221
267
|
}
|
|
268
|
+
custom_patterns_unlock();
|
|
269
|
+
if (oom) rb_raise(rb_eNoMemError, "replace_all_matches allocation failed (custom)");
|
|
222
270
|
|
|
223
271
|
VALUE rb_result = rb_str_new_cstr(working);
|
|
224
272
|
free(working);
|
data/ext/data_redactor/scan.c
CHANGED
|
@@ -126,6 +126,8 @@ VALUE rb_data_redactor_scan(VALUE self, VALUE rb_text, VALUE rb_enable_bits) {
|
|
|
126
126
|
/* Stage 2: custom patterns via glibc on the rewritten buffer. */
|
|
127
127
|
/* Original coords recovered via working_to_orig() using ev[]. */
|
|
128
128
|
/* ------------------------------------------------------------------ */
|
|
129
|
+
custom_patterns_lock();
|
|
130
|
+
int oom = 0;
|
|
129
131
|
for (int i = 0; i < custom_count; i++) {
|
|
130
132
|
if (!scan_enable_bit(rb_enable_bits, NUM_PATTERNS + i)) continue;
|
|
131
133
|
|
|
@@ -163,9 +165,11 @@ VALUE rb_data_redactor_scan(VALUE self, VALUE rb_text, VALUE rb_enable_bits) {
|
|
|
163
165
|
char *next = replace_all_matches(&custom_patterns[i].compiled, working,
|
|
164
166
|
custom_patterns[i].boundary, &ph_plain);
|
|
165
167
|
free(working);
|
|
166
|
-
if (!next) {
|
|
168
|
+
if (!next) { working = NULL; oom = 1; break; }
|
|
167
169
|
working = next;
|
|
168
170
|
}
|
|
171
|
+
custom_patterns_unlock();
|
|
172
|
+
if (oom) { free(ev); rb_raise(rb_eNoMemError, "replace_all_matches failed in scan"); }
|
|
169
173
|
|
|
170
174
|
free(ev);
|
|
171
175
|
|