data_redactor 0.11.0 → 0.13.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +25 -1
- data/README.md +4 -4
- data/ext/data_redactor/custom_patterns.c +34 -4
- data/ext/data_redactor/custom_patterns.h +10 -0
- data/ext/data_redactor/extconf.rb +5 -0
- data/ext/data_redactor/matcher.c +164 -63
- data/ext/data_redactor/redact.c +50 -2
- data/ext/data_redactor/scan.c +5 -1
- data/lib/data_redactor/version.rb +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: fa978ec8daa0c8285f48283bc251a512c9263202046100ac03ad39ef4889e070
|
|
4
|
+
data.tar.gz: 6b5ff39107b5948bcf499fa122e2af4c342d46fb520e94c6d0d3d1f128f30781
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 4169cf320312e05e77d5c6fe699f1516f031f6ba097fb9a327e9c7686c462d0c96759b58f03eb2aab024178e01998f6c7607f2ac758eb773e6681b68f8b0717e
|
|
7
|
+
data.tar.gz: 92a9d114b28305d2da038571259ff3d819371e1276aa66aea436d820e4e540084a0e0c2e555013e53fc454abe50029058e9ca7fcdc239591b86685700cf29d62
|
data/CHANGELOG.md
CHANGED
|
@@ -7,6 +7,29 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
|
|
7
7
|
|
|
8
8
|
## [Unreleased]
|
|
9
9
|
|
|
10
|
+
## [0.13.0] - 2026-06-13
|
|
11
|
+
|
|
12
|
+
### Changed
|
|
13
|
+
- **Custom-pattern registration is now thread-safe.** `add_pattern`,
|
|
14
|
+
`remove_pattern`, and `clear_custom_patterns!` are guarded by a mutex shared
|
|
15
|
+
with the `redact`/`scan` custom-pattern loop, so patterns may be registered,
|
|
16
|
+
removed, or cleared from any thread at any time — including at runtime from a
|
|
17
|
+
request handler — without coordinating with in-flight redactions. The previous
|
|
18
|
+
"register custom patterns at boot only" caveat is lifted. (The C extension now
|
|
19
|
+
links `-lpthread` on glibc; no-op on musl and macOS where pthread is in libc.)
|
|
20
|
+
- **`redact` releases the GVL for large inputs.** The v19 engine's per-scan
|
|
21
|
+
mutable state (NFA scratch and the lazy DFA cache) moved into per-thread
|
|
22
|
+
storage, making the engine re-entrant. `redact` now releases the GVL
|
|
23
|
+
(`rb_thread_call_without_gvl`) around the built-in scan for inputs above a few
|
|
24
|
+
KB, so a large redaction on one thread no longer blocks other Ruby threads.
|
|
25
|
+
Small inputs keep the GVL. No public API change; output is byte-for-byte
|
|
26
|
+
identical (verified by a differential gate over ~6000 inputs). The per-thread
|
|
27
|
+
DFA cache's allocation floor was tuned so this adds ~0.86 MB per scanning
|
|
28
|
+
thread (down from a naive ~3.2 MB), with no throughput change. Per-thread scan
|
|
29
|
+
state is freed at thread exit (via a `pthread_key` destructor), so processes
|
|
30
|
+
that churn many short-lived scanning threads do not accumulate dead caches —
|
|
31
|
+
RSS stays flat across thousands of threads.
|
|
32
|
+
|
|
10
33
|
## [0.11.0] - 2026-06-10
|
|
11
34
|
|
|
12
35
|
### Added
|
|
@@ -232,7 +255,8 @@ features as 0.7.1 plus the pipeline fix.
|
|
|
232
255
|
- `DataRedactor.redact(text)` module function returning the input with every match replaced by `[REDACTED]`.
|
|
233
256
|
- RSpec suite with one example per pattern.
|
|
234
257
|
|
|
235
|
-
[Unreleased]: https://github.com/danielefrisanco/data_redactor/compare/v0.11.0...HEAD
|
|
258
|
+
[Unreleased]: https://github.com/danielefrisanco/data_redactor/compare/v0.13.0...HEAD
|
|
259
|
+
[0.13.0]: https://github.com/danielefrisanco/data_redactor/compare/v0.11.0...v0.13.0
|
|
236
260
|
[0.11.0]: https://github.com/danielefrisanco/data_redactor/compare/v0.10.1...v0.11.0
|
|
237
261
|
[0.10.1]: https://github.com/danielefrisanco/data_redactor/compare/v0.10.0...v0.10.1
|
|
238
262
|
[0.10.0]: https://github.com/danielefrisanco/data_redactor/compare/v0.9.0...v0.10.0
|
data/README.md
CHANGED
|
@@ -19,7 +19,7 @@ It ships **88 built-in patterns** across 15+ countries, grouped into tags
|
|
|
19
19
|
(`:credentials`, `:financial`, `:contact`, ...) so you can redact only what you
|
|
20
20
|
care about. Beyond plain strings it can walk nested Hashes, Arrays, and JSON,
|
|
21
21
|
audit a payload without mutating it (`scan`), and plug into Logger, Rails, and
|
|
22
|
-
Rack. You can also register your own patterns at boot.
|
|
22
|
+
Rack. You can also register your own patterns — at boot or at runtime from any thread.
|
|
23
23
|
|
|
24
24
|
### Use cases
|
|
25
25
|
|
|
@@ -161,7 +161,7 @@ DataRedactor.redact_json("not json") # => JSON::ParserError
|
|
|
161
161
|
|
|
162
162
|
### Custom patterns
|
|
163
163
|
|
|
164
|
-
Teams often have internal IDs that the gem can't ship. Register them at boot:
|
|
164
|
+
Teams often have internal IDs that the gem can't ship. Register them at boot — or at runtime from any thread (registration is thread-safe, see [Thread safety](#thread-safety)):
|
|
165
165
|
|
|
166
166
|
```ruby
|
|
167
167
|
# String (POSIX ERE) or Regexp — both accepted
|
|
@@ -571,9 +571,9 @@ All C-side buffers are heap-allocated with `malloc`/`strdup` and freed before th
|
|
|
571
571
|
|
|
572
572
|
## Thread safety
|
|
573
573
|
|
|
574
|
-
`DataRedactor.redact` and `DataRedactor.scan` are safe to call concurrently from multiple threads. The v19 engine
|
|
574
|
+
`DataRedactor.redact` and `DataRedactor.scan` are safe to call concurrently from multiple threads. The v19 engine keeps its compiled patterns immutable and shared (read-only after `mm_init()` at load time) and all per-scan mutable state — NFA scratch and the lazy DFA cache — in per-thread storage, so concurrent scans never touch each other's state. For inputs above a few KB, `redact` **releases the GVL** (`rb_thread_call_without_gvl`) around the built-in scan, so a large redaction on one thread no longer blocks other Ruby threads from running. Small inputs keep the GVL (the release bookkeeping would cost more than the scan). Each call allocates its own working buffers. A thread's per-thread state is freed automatically when the thread exits, so processes that spawn many short-lived scanning threads do not accumulate memory.
|
|
575
575
|
|
|
576
|
-
`DataRedactor.add_pattern`, `remove_pattern`, and `clear_custom_patterns!`
|
|
576
|
+
`DataRedactor.add_pattern`, `remove_pattern`, and `clear_custom_patterns!` are also thread-safe: the shared custom-pattern array is guarded by a mutex that writers take around the mutation and `redact`/`scan` take around their custom-pattern loop. You can register, remove, or clear custom patterns from any thread at any time — including from request handlers in a running server — without coordinating with in-flight redactions. (Registration is still a rare operation; the lock is uncontended in practice.)
|
|
577
577
|
|
|
578
578
|
## Versioning
|
|
579
579
|
|
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
#include "redact.h" /* wrap_boundary */
|
|
3
3
|
#include <string.h>
|
|
4
4
|
#include <stdlib.h>
|
|
5
|
+
#include <pthread.h>
|
|
5
6
|
|
|
6
7
|
/* Custom patterns deliberately do NOT use the v19 engine: they keep the glibc
|
|
7
8
|
* regexec path (replace_all_matches), because user regex can contain multibyte
|
|
@@ -12,6 +13,11 @@ custom_pattern_t *custom_patterns = NULL;
|
|
|
12
13
|
int custom_count = 0;
|
|
13
14
|
int custom_cap = 0;
|
|
14
15
|
|
|
16
|
+
static pthread_mutex_t custom_mutex = PTHREAD_MUTEX_INITIALIZER;
|
|
17
|
+
|
|
18
|
+
void custom_patterns_lock(void) { pthread_mutex_lock(&custom_mutex); }
|
|
19
|
+
void custom_patterns_unlock(void) { pthread_mutex_unlock(&custom_mutex); }
|
|
20
|
+
|
|
15
21
|
static int find_custom_by_name(const char *name) {
|
|
16
22
|
for (int i = 0; i < custom_count; i++) {
|
|
17
23
|
if (strcmp(custom_patterns[i].name, name) == 0) return i;
|
|
@@ -58,6 +64,13 @@ VALUE rb_add_pattern(VALUE self, VALUE rb_name, VALUE rb_source,
|
|
|
58
64
|
rb_raise(eClass, "%s", errbuf);
|
|
59
65
|
}
|
|
60
66
|
|
|
67
|
+
/* regcomp succeeded above (no array access yet); now mutate the shared array
|
|
68
|
+
* under the lock. Keep the critical section rb_raise-free: on failure, record
|
|
69
|
+
* it, unlock, then raise outside the lock so the mutex can't leak via longjmp. */
|
|
70
|
+
custom_patterns_lock();
|
|
71
|
+
|
|
72
|
+
const char *err = NULL;
|
|
73
|
+
int stored = 0; /* 1 once `compiled` is owned by a slot (don't regfree it) */
|
|
61
74
|
int idx = find_custom_by_name(name);
|
|
62
75
|
if (idx >= 0) {
|
|
63
76
|
free_custom_at(idx);
|
|
@@ -67,8 +80,8 @@ VALUE rb_add_pattern(VALUE self, VALUE rb_name, VALUE rb_source,
|
|
|
67
80
|
custom_pattern_t *tmp = (custom_pattern_t *)realloc(
|
|
68
81
|
custom_patterns, sizeof(custom_pattern_t) * new_cap);
|
|
69
82
|
if (!tmp) {
|
|
70
|
-
|
|
71
|
-
|
|
83
|
+
err = "custom_patterns realloc failed";
|
|
84
|
+
goto unlock;
|
|
72
85
|
}
|
|
73
86
|
custom_patterns = tmp;
|
|
74
87
|
custom_cap = new_cap;
|
|
@@ -81,9 +94,17 @@ VALUE rb_add_pattern(VALUE self, VALUE rb_name, VALUE rb_source,
|
|
|
81
94
|
custom_patterns[idx].compiled = compiled;
|
|
82
95
|
custom_patterns[idx].tag = tag_bit;
|
|
83
96
|
custom_patterns[idx].boundary = boundary;
|
|
97
|
+
stored = 1;
|
|
84
98
|
|
|
85
99
|
if (!custom_patterns[idx].name || !custom_patterns[idx].source) {
|
|
86
|
-
|
|
100
|
+
err = "strdup failed";
|
|
101
|
+
}
|
|
102
|
+
|
|
103
|
+
unlock:
|
|
104
|
+
custom_patterns_unlock();
|
|
105
|
+
if (err) {
|
|
106
|
+
if (!stored) regfree(&compiled);
|
|
107
|
+
rb_raise(rb_eNoMemError, "%s", err);
|
|
87
108
|
}
|
|
88
109
|
|
|
89
110
|
return Qnil;
|
|
@@ -93,8 +114,12 @@ VALUE rb_remove_pattern(VALUE self, VALUE rb_name) {
|
|
|
93
114
|
Check_Type(rb_name, T_STRING);
|
|
94
115
|
const char *name = StringValueCStr(rb_name);
|
|
95
116
|
|
|
117
|
+
custom_patterns_lock();
|
|
96
118
|
int idx = find_custom_by_name(name);
|
|
97
|
-
if (idx < 0)
|
|
119
|
+
if (idx < 0) {
|
|
120
|
+
custom_patterns_unlock();
|
|
121
|
+
return Qfalse;
|
|
122
|
+
}
|
|
98
123
|
|
|
99
124
|
free_custom_at(idx);
|
|
100
125
|
|
|
@@ -102,19 +127,23 @@ VALUE rb_remove_pattern(VALUE self, VALUE rb_name) {
|
|
|
102
127
|
custom_patterns[i] = custom_patterns[i + 1];
|
|
103
128
|
}
|
|
104
129
|
custom_count--;
|
|
130
|
+
custom_patterns_unlock();
|
|
105
131
|
|
|
106
132
|
return Qtrue;
|
|
107
133
|
}
|
|
108
134
|
|
|
109
135
|
VALUE rb_clear_custom_patterns(VALUE self) {
|
|
136
|
+
custom_patterns_lock();
|
|
110
137
|
for (int i = 0; i < custom_count; i++) {
|
|
111
138
|
free_custom_at(i);
|
|
112
139
|
}
|
|
113
140
|
custom_count = 0;
|
|
141
|
+
custom_patterns_unlock();
|
|
114
142
|
return Qnil;
|
|
115
143
|
}
|
|
116
144
|
|
|
117
145
|
VALUE rb_custom_patterns(VALUE self) {
|
|
146
|
+
custom_patterns_lock();
|
|
118
147
|
VALUE arr = rb_ary_new_capa(custom_count);
|
|
119
148
|
for (int i = 0; i < custom_count; i++) {
|
|
120
149
|
VALUE h = rb_hash_new();
|
|
@@ -124,5 +153,6 @@ VALUE rb_custom_patterns(VALUE self) {
|
|
|
124
153
|
rb_hash_aset(h, ID2SYM(rb_intern("boundary")), custom_patterns[i].boundary ? Qtrue : Qfalse);
|
|
125
154
|
rb_ary_push(arr, h);
|
|
126
155
|
}
|
|
156
|
+
custom_patterns_unlock();
|
|
127
157
|
return arr;
|
|
128
158
|
}
|
|
@@ -16,6 +16,16 @@ extern custom_pattern_t *custom_patterns;
|
|
|
16
16
|
extern int custom_count;
|
|
17
17
|
extern int custom_cap;
|
|
18
18
|
|
|
19
|
+
/* Guards the custom_patterns array against concurrent mutation. redact/scan
|
|
20
|
+
* take it for the duration of their custom-pattern loop (readers); add/remove/
|
|
21
|
+
* clear take it around the array mutation (writers). A plain mutex is enough:
|
|
22
|
+
* contention is low (registration is rare relative to redaction) and the GVL
|
|
23
|
+
* already serialises everything else, so the only race this closes is a writer
|
|
24
|
+
* realloc/shift running against a reader's iteration. Lock is always taken
|
|
25
|
+
* inside the GVL, never the reverse, so there is no lock-ordering hazard. */
|
|
26
|
+
void custom_patterns_lock(void);
|
|
27
|
+
void custom_patterns_unlock(void);
|
|
28
|
+
|
|
19
29
|
VALUE rb_add_pattern(VALUE self, VALUE rb_name, VALUE rb_source,
|
|
20
30
|
VALUE rb_tag_bit, VALUE rb_boundary);
|
|
21
31
|
VALUE rb_remove_pattern(VALUE self, VALUE rb_name);
|
|
@@ -4,6 +4,11 @@ abort "Missing C compiler or stdio.h" unless have_header("stdio.h")
|
|
|
4
4
|
abort "Missing regex.h" unless have_header("regex.h")
|
|
5
5
|
abort "Missing stdlib.h" unless have_header("stdlib.h")
|
|
6
6
|
abort "Missing string.h" unless have_header("string.h")
|
|
7
|
+
abort "Missing pthread.h" unless have_header("pthread.h")
|
|
8
|
+
|
|
9
|
+
# pthread_mutex_* needs -lpthread on glibc; on musl and macOS it lives in libc
|
|
10
|
+
# and have_library is a harmless no-op.
|
|
11
|
+
have_library("pthread")
|
|
7
12
|
|
|
8
13
|
# Compile every .c file in this directory. Order doesn't matter; mkmf
|
|
9
14
|
# generates per-object rules.
|
data/ext/data_redactor/matcher.c
CHANGED
|
@@ -44,6 +44,7 @@
|
|
|
44
44
|
#include <stdint.h>
|
|
45
45
|
#include <ctype.h>
|
|
46
46
|
#include <limits.h>
|
|
47
|
+
#include <pthread.h>
|
|
47
48
|
|
|
48
49
|
/* ========================================================================
|
|
49
50
|
* 0. Utilities
|
|
@@ -386,8 +387,13 @@ typedef struct {
|
|
|
386
387
|
int matched;
|
|
387
388
|
} tlist_t;
|
|
388
389
|
|
|
390
|
+
/* engine_t holds ONLY immutable, compiled state — built once at mm_init()/mm_add()
|
|
391
|
+
* and never written during a scan, so it is safe to share read-only across
|
|
392
|
+
* threads. All per-scan mutable state (NFA scratch, merge cursors) and the lazy
|
|
393
|
+
* DFA cache live in scan_state_t, which is per-thread (t_block below). This
|
|
394
|
+
* split is what lets redact/scan release the GVL: with no shared writes during a
|
|
395
|
+
* scan, concurrent scans on distinct threads cannot race. */
|
|
389
396
|
typedef struct {
|
|
390
|
-
/* compiled, immutable after build (safe to share across scans) */
|
|
391
397
|
prog_t prog;
|
|
392
398
|
size_t min_len;
|
|
393
399
|
const char *req_literal; /* points into a heap copy owned by this engine */
|
|
@@ -405,17 +411,21 @@ typedef struct {
|
|
|
405
411
|
/* selective-merge membership (built-ins only; customs never join a merge) */
|
|
406
412
|
int digit_member, digit_lo, digit_hi;
|
|
407
413
|
int iban_member;
|
|
408
|
-
|
|
414
|
+
} engine_t;
|
|
415
|
+
|
|
416
|
+
/* Per-engine MUTABLE scan state. One per engine, owned per-thread (t_block).
|
|
417
|
+
* The DFA cache warms lazily across this thread's scans; the rest is reset each
|
|
418
|
+
* scan. seen_cap==0 / dfa.n_states==0 means "not yet warmed" for this thread. */
|
|
419
|
+
typedef struct {
|
|
409
420
|
dfa_t dfa;
|
|
410
421
|
int *seen;
|
|
411
422
|
int seen_cap;
|
|
412
423
|
tlist_t clist, nlist;
|
|
413
424
|
int *estack;
|
|
414
425
|
int gen;
|
|
415
|
-
/*
|
|
416
|
-
int digit_last_end;
|
|
426
|
+
int digit_last_end; /* selective-merge non-overlap cursors */
|
|
417
427
|
size_t iban_last_end;
|
|
418
|
-
}
|
|
428
|
+
} scan_state_t;
|
|
419
429
|
|
|
420
430
|
static engine_t *g_eng = NULL;
|
|
421
431
|
static int g_eng_n = 0; /* engines built (NUM_PATTERNS + custom_n) */
|
|
@@ -423,6 +433,33 @@ static int g_eng_cap= 0;
|
|
|
423
433
|
static int g_custom_n = 0;
|
|
424
434
|
static int g_initialized = 0;
|
|
425
435
|
|
|
436
|
+
/* Bumped whenever the pattern set changes (mm_add/mm_remove/mm_clear_custom).
|
|
437
|
+
* A thread whose cached t_gen lags this value drops its whole scan-state cache
|
|
438
|
+
* and rebuilds — the simplest safe invalidation (slot p may now hold a
|
|
439
|
+
* different pattern after mm_remove compacts g_eng). Registration is rare, so
|
|
440
|
+
* the full rebuild is cheap; a surgical per-slot invalidation is a possible
|
|
441
|
+
* future refinement (see TODO §"Full thread safety"). */
|
|
442
|
+
static unsigned g_pattern_gen = 0;
|
|
443
|
+
|
|
444
|
+
/* Per-thread mutable scan state: one scan_state_t per engine, lazily grown to
|
|
445
|
+
* g_eng_n. Held in a heap block whose header carries the element count, so the
|
|
446
|
+
* pthread_key destructor (which frees the block at thread exit) is fully
|
|
447
|
+
* self-contained — it must NOT read __thread storage, which may already be torn
|
|
448
|
+
* down when key destructors run. The __thread pointer is the fast hot-path
|
|
449
|
+
* handle; the key holds the same pointer purely so it can be reclaimed on exit.
|
|
450
|
+
* This bounds memory for processes that churn many short-lived scanning threads;
|
|
451
|
+
* fixed pools (Puma/Sidekiq) just reuse the block for the thread's lifetime. */
|
|
452
|
+
typedef struct {
|
|
453
|
+
int n; /* number of scan_state_t entries in states[] */
|
|
454
|
+
scan_state_t states[]; /* flexible array member */
|
|
455
|
+
} thread_block_t;
|
|
456
|
+
|
|
457
|
+
static __thread thread_block_t *t_block = NULL;
|
|
458
|
+
static __thread unsigned t_gen = 0;
|
|
459
|
+
|
|
460
|
+
static pthread_key_t t_block_key;
|
|
461
|
+
static pthread_once_t t_block_key_once = PTHREAD_ONCE_INIT;
|
|
462
|
+
|
|
426
463
|
/* IBAN union-pass dispatch (built-ins only): unique 2-byte country prefixes. */
|
|
427
464
|
static int g_iban_first[256];
|
|
428
465
|
static int g_iban_pair[256][256];
|
|
@@ -606,14 +643,64 @@ static void engine_set_literal(engine_t *eng, const char *lit, int at_start) {
|
|
|
606
643
|
static void engine_free(engine_t *eng) {
|
|
607
644
|
free(eng->prog.code);
|
|
608
645
|
free(eng->req_literal_own);
|
|
609
|
-
|
|
610
|
-
|
|
611
|
-
|
|
612
|
-
|
|
613
|
-
|
|
646
|
+
memset(eng, 0, sizeof(*eng));
|
|
647
|
+
}
|
|
648
|
+
|
|
649
|
+
/* Free one thread's mutable scan state for an engine (scratch + DFA cache).
|
|
650
|
+
* Used when a thread drops its cache on a pattern-set generation change. */
|
|
651
|
+
static void free_scan_state(scan_state_t *st) {
|
|
652
|
+
free(st->seen);
|
|
653
|
+
free(st->clist.list);
|
|
654
|
+
free(st->nlist.list);
|
|
655
|
+
free(st->estack);
|
|
656
|
+
dfa_t *d = &st->dfa;
|
|
614
657
|
free(d->set_pool); free(d->set_off); free(d->set_len);
|
|
615
658
|
free(d->matched); free(d->trans); free(d->hash);
|
|
616
|
-
memset(
|
|
659
|
+
memset(st, 0, sizeof(*st));
|
|
660
|
+
}
|
|
661
|
+
|
|
662
|
+
/* pthread_key destructor: free a thread's whole block at thread exit. Reads only
|
|
663
|
+
* the passed-in pointer + its header count — no __thread access (unsafe here). */
|
|
664
|
+
static void free_thread_block(void *p) {
|
|
665
|
+
thread_block_t *b = (thread_block_t *)p;
|
|
666
|
+
if (!b) return;
|
|
667
|
+
for (int i = 0; i < b->n; i++) free_scan_state(&b->states[i]);
|
|
668
|
+
free(b);
|
|
669
|
+
}
|
|
670
|
+
|
|
671
|
+
static void make_t_block_key(void) {
|
|
672
|
+
if (pthread_key_create(&t_block_key, free_thread_block) != 0) {
|
|
673
|
+
perror("pthread_key_create"); exit(1);
|
|
674
|
+
}
|
|
675
|
+
}
|
|
676
|
+
|
|
677
|
+
/* Return this thread's scan_state_t array, synced to the current pattern set.
|
|
678
|
+
* Drops the whole cache if the pattern set changed (generation guard), then
|
|
679
|
+
* lazily grows (zero-initialised) to cover every engine. Called under the
|
|
680
|
+
* custom-pattern mutex during a scan, so g_pattern_gen / g_eng_n are stable.
|
|
681
|
+
* The owning block is registered with t_block_key so it is freed at thread exit;
|
|
682
|
+
* the key value is re-set after any (re)allocation since the block may move. */
|
|
683
|
+
static scan_state_t *thread_state(void) {
|
|
684
|
+
pthread_once(&t_block_key_once, make_t_block_key);
|
|
685
|
+
|
|
686
|
+
if (t_gen != g_pattern_gen) {
|
|
687
|
+
free_thread_block(t_block);
|
|
688
|
+
t_block = NULL;
|
|
689
|
+
pthread_setspecific(t_block_key, NULL);
|
|
690
|
+
t_gen = g_pattern_gen;
|
|
691
|
+
}
|
|
692
|
+
int have = t_block ? t_block->n : 0;
|
|
693
|
+
if (have < g_eng_n) {
|
|
694
|
+
thread_block_t *nb = realloc(t_block,
|
|
695
|
+
sizeof(thread_block_t) + (size_t)g_eng_n * sizeof(scan_state_t));
|
|
696
|
+
if (!nb) { perror("realloc"); exit(1); }
|
|
697
|
+
memset(&nb->states[have], 0,
|
|
698
|
+
(size_t)(g_eng_n - have) * sizeof(scan_state_t));
|
|
699
|
+
nb->n = g_eng_n;
|
|
700
|
+
t_block = nb;
|
|
701
|
+
pthread_setspecific(t_block_key, nb);
|
|
702
|
+
}
|
|
703
|
+
return t_block->states;
|
|
617
704
|
}
|
|
618
705
|
|
|
619
706
|
/* ========================================================================
|
|
@@ -692,7 +779,13 @@ static void dfa_hash_insert(dfa_t *d, int sid);
|
|
|
692
779
|
|
|
693
780
|
static void dfa_grow_states(dfa_t *d) {
|
|
694
781
|
if (d->n_states < d->states_cap) return;
|
|
695
|
-
|
|
782
|
+
/* Start small (8) and double. Each state owns a 1 KB transition row, and the
|
|
783
|
+
* DFA cache is now per-thread, so the initial cap is the per-thread memory
|
|
784
|
+
* floor multiplied across every engine. Most patterns settle at 1-14 states
|
|
785
|
+
* (max 45), so a floor of 8 fits the common case in 8 KB instead of 64 KB
|
|
786
|
+
* (~4x less per-thread memory across 79 DFA engines); the few larger DFAs
|
|
787
|
+
* just do a couple extra doublings during warmup, off the hot path. */
|
|
788
|
+
int newcap = d->states_cap ? d->states_cap * 2 : 8;
|
|
696
789
|
d->set_off = realloc(d->set_off, (size_t)newcap * sizeof(int));
|
|
697
790
|
d->set_len = realloc(d->set_len, (size_t)newcap * sizeof(int));
|
|
698
791
|
d->matched = realloc(d->matched, (size_t)newcap * sizeof(int));
|
|
@@ -748,28 +841,28 @@ static int dfa_intern(dfa_t *d, const int *set, int n, int matched) {
|
|
|
748
841
|
return sid;
|
|
749
842
|
}
|
|
750
843
|
|
|
751
|
-
static void ensure_scratch(engine_t *eng) {
|
|
844
|
+
static void ensure_scratch(engine_t *eng, scan_state_t *st) {
|
|
752
845
|
prog_t *pr = &eng->prog;
|
|
753
|
-
if (
|
|
754
|
-
|
|
755
|
-
|
|
756
|
-
|
|
757
|
-
|
|
758
|
-
if (!
|
|
846
|
+
if (st->seen_cap >= pr->n) return;
|
|
847
|
+
st->seen = realloc(st->seen, pr->n * sizeof(int));
|
|
848
|
+
st->clist.list = realloc(st->clist.list, pr->n * sizeof(int));
|
|
849
|
+
st->nlist.list = realloc(st->nlist.list, pr->n * sizeof(int));
|
|
850
|
+
st->estack = realloc(st->estack, (2 * pr->n + 1) * sizeof(int));
|
|
851
|
+
if (!st->seen || !st->clist.list || !st->nlist.list || !st->estack) {
|
|
759
852
|
perror("realloc"); exit(1);
|
|
760
853
|
}
|
|
761
|
-
memset(
|
|
762
|
-
|
|
854
|
+
memset(st->seen, 0, pr->n * sizeof(int));
|
|
855
|
+
st->seen_cap = pr->n;
|
|
763
856
|
}
|
|
764
857
|
|
|
765
|
-
static int dfa_compute_trans(engine_t *eng, int sid, unsigned char c) {
|
|
858
|
+
static int dfa_compute_trans(engine_t *eng, scan_state_t *st, int sid, unsigned char c) {
|
|
766
859
|
prog_t *pr = &eng->prog;
|
|
767
|
-
dfa_t *d = &
|
|
768
|
-
int *seen =
|
|
769
|
-
int *estk =
|
|
770
|
-
tlist_t *nl = &
|
|
860
|
+
dfa_t *d = &st->dfa;
|
|
861
|
+
int *seen = st->seen;
|
|
862
|
+
int *estk = st->estack;
|
|
863
|
+
tlist_t *nl = &st->nlist;
|
|
771
864
|
|
|
772
|
-
int gen = ++
|
|
865
|
+
int gen = ++st->gen;
|
|
773
866
|
nl->n = 0; nl->matched = 0;
|
|
774
867
|
|
|
775
868
|
const int *set = &d->set_pool[d->set_off[sid]];
|
|
@@ -800,14 +893,14 @@ static int dfa_compute_trans(engine_t *eng, int sid, unsigned char c) {
|
|
|
800
893
|
return next;
|
|
801
894
|
}
|
|
802
895
|
|
|
803
|
-
static void dfa_build_start(engine_t *eng) {
|
|
896
|
+
static void dfa_build_start(engine_t *eng, scan_state_t *st) {
|
|
804
897
|
prog_t *pr = &eng->prog;
|
|
805
|
-
dfa_t *d = &
|
|
806
|
-
int *seen =
|
|
807
|
-
int *estk =
|
|
808
|
-
tlist_t *cl = &
|
|
898
|
+
dfa_t *d = &st->dfa;
|
|
899
|
+
int *seen = st->seen;
|
|
900
|
+
int *estk = st->estack;
|
|
901
|
+
tlist_t *cl = &st->clist;
|
|
809
902
|
|
|
810
|
-
int gen = ++
|
|
903
|
+
int gen = ++st->gen;
|
|
811
904
|
cl->n = 0; cl->matched = 0;
|
|
812
905
|
addthread_dfa(pr, cl, seen, gen, estk, 0);
|
|
813
906
|
qsort(cl->list, (size_t)cl->n, sizeof(int), int_cmp);
|
|
@@ -821,23 +914,24 @@ static void dfa_build_start(engine_t *eng) {
|
|
|
821
914
|
* 9. Per-pattern scan (scan_one) — identical logic to the prototype
|
|
822
915
|
* ======================================================================== */
|
|
823
916
|
|
|
824
|
-
static size_t scan_one(int p, const char *input, size_t len,
|
|
917
|
+
static size_t scan_one(int p, scan_state_t *state, const char *input, size_t len,
|
|
825
918
|
mm_match_t *out, size_t max, size_t count) {
|
|
826
|
-
engine_t
|
|
827
|
-
|
|
919
|
+
engine_t *eng = &g_eng[p];
|
|
920
|
+
scan_state_t *sst = &state[p];
|
|
921
|
+
prog_t *pr = &eng->prog;
|
|
828
922
|
|
|
829
|
-
ensure_scratch(eng);
|
|
830
|
-
int *seen =
|
|
831
|
-
int *estk =
|
|
832
|
-
tlist_t *cl = &
|
|
923
|
+
ensure_scratch(eng, sst);
|
|
924
|
+
int *seen = sst->seen;
|
|
925
|
+
int *estk = sst->estack;
|
|
926
|
+
tlist_t *cl = &sst->clist, *nl = &sst->nlist;
|
|
833
927
|
|
|
834
|
-
if (
|
|
928
|
+
if (sst->gen > INT_MAX - (int)(2 * (len + 2))) {
|
|
835
929
|
memset(seen, 0, pr->n * sizeof(int));
|
|
836
|
-
|
|
930
|
+
sst->gen = 0;
|
|
837
931
|
}
|
|
838
932
|
|
|
839
|
-
dfa_t *d = &
|
|
840
|
-
if (eng->use_dfa && d->n_states == 0) dfa_build_start(eng);
|
|
933
|
+
dfa_t *d = &sst->dfa;
|
|
934
|
+
if (eng->use_dfa && d->n_states == 0) dfa_build_start(eng, sst);
|
|
841
935
|
|
|
842
936
|
size_t pos = 0;
|
|
843
937
|
while (pos <= len) {
|
|
@@ -874,19 +968,19 @@ static size_t scan_one(int p, const char *input, size_t len,
|
|
|
874
968
|
if (sp == len) break;
|
|
875
969
|
int next = d->trans[st * 256 + (unsigned char)input[sp]];
|
|
876
970
|
if (next == TRANS_UNFILLED)
|
|
877
|
-
next = dfa_compute_trans(eng, st, (unsigned char)input[sp]);
|
|
971
|
+
next = dfa_compute_trans(eng, sst, st, (unsigned char)input[sp]);
|
|
878
972
|
st = next;
|
|
879
973
|
sp++;
|
|
880
974
|
}
|
|
881
975
|
} else {
|
|
882
|
-
int gen = ++
|
|
976
|
+
int gen = ++sst->gen;
|
|
883
977
|
cl->n = 0; cl->matched = 0;
|
|
884
978
|
addthread(pr, cl, seen, gen, estk, 0, input, len, pos);
|
|
885
979
|
while (cl->n > 0 || cl->matched) {
|
|
886
980
|
if (cl->matched && sp - pos >= eng->min_len) match_end = sp;
|
|
887
981
|
if (cl->n == 0 || sp == len) break;
|
|
888
982
|
unsigned char c = (unsigned char)input[sp];
|
|
889
|
-
gen = ++
|
|
983
|
+
gen = ++sst->gen;
|
|
890
984
|
nl->n = 0; nl->matched = 0;
|
|
891
985
|
for (int i = 0; i < cl->n; i++) {
|
|
892
986
|
inst_t *in = &pr->code[cl->list[i]];
|
|
@@ -935,11 +1029,11 @@ static size_t scan_one(int p, const char *input, size_t len,
|
|
|
935
1029
|
* 10. Selective merges (digit run pass + IBAN union pass)
|
|
936
1030
|
* ======================================================================== */
|
|
937
1031
|
|
|
938
|
-
static size_t scan_digit_group(const char *input, size_t len,
|
|
1032
|
+
static size_t scan_digit_group(scan_state_t *state, const char *input, size_t len,
|
|
939
1033
|
const int *enable_bits, size_t n_bits,
|
|
940
1034
|
mm_match_t *out, size_t max, size_t count) {
|
|
941
1035
|
for (int p = 0; p < g_eng_n; p++)
|
|
942
|
-
if (g_eng[p].digit_member)
|
|
1036
|
+
if (g_eng[p].digit_member) state[p].digit_last_end = 0;
|
|
943
1037
|
|
|
944
1038
|
size_t i = 0;
|
|
945
1039
|
while (i < len) {
|
|
@@ -963,7 +1057,7 @@ static size_t scan_digit_group(const char *input, size_t len,
|
|
|
963
1057
|
|
|
964
1058
|
size_t start;
|
|
965
1059
|
if (rs > 0 && !isalnum((unsigned char)input[rs-1]) &&
|
|
966
|
-
rs - 1 >= (size_t)
|
|
1060
|
+
rs - 1 >= (size_t)state[p].digit_last_end) {
|
|
967
1061
|
start = rs - 1;
|
|
968
1062
|
} else if (rs == 0 || input[rs-1] == '\n') {
|
|
969
1063
|
start = rs;
|
|
@@ -978,22 +1072,22 @@ static size_t scan_digit_group(const char *input, size_t len,
|
|
|
978
1072
|
* separator are resolved exactly as gsub would. */
|
|
979
1073
|
(void)start;
|
|
980
1074
|
out[count++] = (mm_match_t){p, rs, re - rs};
|
|
981
|
-
|
|
1075
|
+
state[p].digit_last_end = (int)end;
|
|
982
1076
|
}
|
|
983
1077
|
if (count >= max) break;
|
|
984
1078
|
}
|
|
985
1079
|
return count;
|
|
986
1080
|
}
|
|
987
1081
|
|
|
988
|
-
static size_t scan_iban_group(const char *input, size_t len,
|
|
1082
|
+
static size_t scan_iban_group(scan_state_t *state, const char *input, size_t len,
|
|
989
1083
|
const int *enable_bits, size_t n_bits,
|
|
990
1084
|
mm_match_t *out, size_t max, size_t count) {
|
|
991
1085
|
for (int p = 0; p < g_eng_n; p++)
|
|
992
1086
|
if (g_eng[p].iban_member) {
|
|
993
|
-
|
|
1087
|
+
state[p].iban_last_end = 0;
|
|
994
1088
|
engine_t *eng = &g_eng[p];
|
|
995
|
-
if (eng->use_dfa &&
|
|
996
|
-
ensure_scratch(eng); dfa_build_start(eng);
|
|
1089
|
+
if (eng->use_dfa && state[p].dfa.n_states == 0) {
|
|
1090
|
+
ensure_scratch(eng, &state[p]); dfa_build_start(eng, &state[p]);
|
|
997
1091
|
}
|
|
998
1092
|
}
|
|
999
1093
|
|
|
@@ -1004,10 +1098,11 @@ static size_t scan_iban_group(const char *input, size_t len,
|
|
|
1004
1098
|
int p = g_iban_pair[c0][(unsigned char)input[i + 1]];
|
|
1005
1099
|
if (p < 0) { i++; continue; }
|
|
1006
1100
|
if ((size_t)p < n_bits && !enable_bits[p]) { i++; continue; }
|
|
1007
|
-
if (i <
|
|
1101
|
+
if (i < state[p].iban_last_end) { i++; continue; }
|
|
1008
1102
|
|
|
1009
|
-
engine_t
|
|
1010
|
-
|
|
1103
|
+
engine_t *eng = &g_eng[p];
|
|
1104
|
+
scan_state_t *sst = &state[p];
|
|
1105
|
+
dfa_t *d = &sst->dfa;
|
|
1011
1106
|
size_t match_end = (size_t)-1, sp = i;
|
|
1012
1107
|
int st = 0;
|
|
1013
1108
|
while (st != DFA_DEAD) {
|
|
@@ -1015,7 +1110,7 @@ static size_t scan_iban_group(const char *input, size_t len,
|
|
|
1015
1110
|
if (sp == len) break;
|
|
1016
1111
|
int next = d->trans[st * 256 + (unsigned char)input[sp]];
|
|
1017
1112
|
if (next == TRANS_UNFILLED)
|
|
1018
|
-
next = dfa_compute_trans(eng, st, (unsigned char)input[sp]);
|
|
1113
|
+
next = dfa_compute_trans(eng, sst, st, (unsigned char)input[sp]);
|
|
1019
1114
|
st = next;
|
|
1020
1115
|
sp++;
|
|
1021
1116
|
}
|
|
@@ -1023,7 +1118,7 @@ static size_t scan_iban_group(const char *input, size_t len,
|
|
|
1023
1118
|
if (match_end != (size_t)-1) {
|
|
1024
1119
|
size_t span = match_end - i;
|
|
1025
1120
|
out[count++] = (mm_match_t){p, i, span};
|
|
1026
|
-
|
|
1121
|
+
sst->iban_last_end = match_end;
|
|
1027
1122
|
i = (span == 0) ? i + 1 : match_end;
|
|
1028
1123
|
} else {
|
|
1029
1124
|
i++;
|
|
@@ -1089,6 +1184,7 @@ int mm_add(const char *regex, int boundary) {
|
|
|
1089
1184
|
/* Custom patterns never join the selective merges (TODO §1d Gap 4): they keep
|
|
1090
1185
|
* the per-pattern path. No digit/IBAN membership, no literal-skip hint. */
|
|
1091
1186
|
g_custom_n++;
|
|
1187
|
+
g_pattern_gen++; /* invalidate every thread's cached scan state */
|
|
1092
1188
|
return 0;
|
|
1093
1189
|
}
|
|
1094
1190
|
|
|
@@ -1102,12 +1198,16 @@ void mm_remove(int idx) {
|
|
|
1102
1198
|
g_eng[s] = g_eng[s + 1];
|
|
1103
1199
|
g_eng_n--;
|
|
1104
1200
|
g_custom_n--;
|
|
1201
|
+
/* slot p now holds a DIFFERENT pattern (compaction), so every thread's
|
|
1202
|
+
* scan-state cache indexed by p is stale — invalidate. */
|
|
1203
|
+
g_pattern_gen++;
|
|
1105
1204
|
}
|
|
1106
1205
|
|
|
1107
1206
|
void mm_clear_custom(void) {
|
|
1108
1207
|
for (int s = NUM_PATTERNS; s < g_eng_n; s++) engine_free(&g_eng[s]);
|
|
1109
1208
|
g_eng_n = NUM_PATTERNS;
|
|
1110
1209
|
g_custom_n = 0;
|
|
1210
|
+
g_pattern_gen++;
|
|
1111
1211
|
}
|
|
1112
1212
|
|
|
1113
1213
|
/* ========================================================================
|
|
@@ -1124,18 +1224,19 @@ size_t mm_scan(const char *input, size_t len,
|
|
|
1124
1224
|
const int *enable_bits, size_t n_bits,
|
|
1125
1225
|
mm_match_t *out, size_t max) {
|
|
1126
1226
|
if (!g_initialized) mm_init();
|
|
1227
|
+
scan_state_t *state = thread_state();
|
|
1127
1228
|
size_t count = 0;
|
|
1128
1229
|
|
|
1129
1230
|
for (int p = 0; p < g_eng_n && count < max; p++) {
|
|
1130
1231
|
if (g_eng[p].digit_member) continue;
|
|
1131
1232
|
if (g_eng[p].iban_member) continue;
|
|
1132
1233
|
if (!enabled(enable_bits, n_bits, p)) continue;
|
|
1133
|
-
count = scan_one(p, input, len, out, max, count);
|
|
1234
|
+
count = scan_one(p, state, input, len, out, max, count);
|
|
1134
1235
|
}
|
|
1135
1236
|
if (g_have_iban_group && count < max)
|
|
1136
|
-
count = scan_iban_group(input, len, enable_bits, n_bits, out, max, count);
|
|
1237
|
+
count = scan_iban_group(state, input, len, enable_bits, n_bits, out, max, count);
|
|
1137
1238
|
if (g_have_digit_group && count < max)
|
|
1138
|
-
count = scan_digit_group(input, len, enable_bits, n_bits, out, max, count);
|
|
1239
|
+
count = scan_digit_group(state, input, len, enable_bits, n_bits, out, max, count);
|
|
1139
1240
|
return count;
|
|
1140
1241
|
}
|
|
1141
1242
|
|
data/ext/data_redactor/redact.c
CHANGED
|
@@ -4,10 +4,19 @@
|
|
|
4
4
|
#include "custom_patterns.h"
|
|
5
5
|
#include "matcher.h"
|
|
6
6
|
#include "tags.h"
|
|
7
|
+
#include <ruby/thread.h>
|
|
7
8
|
#include <string.h>
|
|
8
9
|
#include <stdlib.h>
|
|
9
10
|
#include <stdio.h>
|
|
10
11
|
|
|
12
|
+
/* Inputs at or above this byte size release the GVL around the built-in v19
|
|
13
|
+
* pass so other Ruby threads can run during the scan. Below it, the
|
|
14
|
+
* rb_thread_call_without_gvl bookkeeping costs more than the scan, so we keep
|
|
15
|
+
* the GVL. The Ruby layer chunks inputs > CHUNK_SIZE (64 KB) before calling
|
|
16
|
+
* _redact, so the practical ceiling per call is one chunk; 4 KB cleanly
|
|
17
|
+
* separates per-leaf/log-line calls (keep GVL) from chunk-sized work (release). */
|
|
18
|
+
#define GVL_RELEASE_THRESHOLD (4 * 1024)
|
|
19
|
+
|
|
11
20
|
char *wrap_boundary(const char *core) {
|
|
12
21
|
const char *prefix = "(^|[^0-9A-Za-z])(";
|
|
13
22
|
const char *suffix = ")([^0-9A-Za-z]|$)";
|
|
@@ -179,6 +188,41 @@ static char *redact_builtins(const char *input, size_t in_len, const int *bits,
|
|
|
179
188
|
return output;
|
|
180
189
|
}
|
|
181
190
|
|
|
191
|
+
/* Trampoline for running redact_builtins() with the GVL released. Everything it
|
|
192
|
+
* touches is plain C (raw char* in/out, the per-thread engine state); no Ruby
|
|
193
|
+
* VALUE or Ruby API call happens inside, which is the contract for
|
|
194
|
+
* rb_thread_call_without_gvl. */
|
|
195
|
+
typedef struct {
|
|
196
|
+
const char *input;
|
|
197
|
+
size_t in_len;
|
|
198
|
+
const int *bits;
|
|
199
|
+
int ph_mode;
|
|
200
|
+
const char *ph_str_plain;
|
|
201
|
+
size_t out_len;
|
|
202
|
+
char *result;
|
|
203
|
+
} builtins_args_t;
|
|
204
|
+
|
|
205
|
+
static void *redact_builtins_nogvl(void *p) {
|
|
206
|
+
builtins_args_t *a = (builtins_args_t *)p;
|
|
207
|
+
a->result = redact_builtins(a->input, a->in_len, a->bits,
|
|
208
|
+
a->ph_mode, a->ph_str_plain, &a->out_len);
|
|
209
|
+
return NULL;
|
|
210
|
+
}
|
|
211
|
+
|
|
212
|
+
/* Run the built-in v19 pass, releasing the GVL for inputs large enough that the
|
|
213
|
+
* scan dominates the release bookkeeping. Small inputs run inline under the GVL. */
|
|
214
|
+
static char *redact_builtins_maybe_nogvl(const char *input, size_t in_len,
|
|
215
|
+
const int *bits, int ph_mode,
|
|
216
|
+
const char *ph_str_plain, size_t *out_len_p) {
|
|
217
|
+
if (in_len < GVL_RELEASE_THRESHOLD)
|
|
218
|
+
return redact_builtins(input, in_len, bits, ph_mode, ph_str_plain, out_len_p);
|
|
219
|
+
|
|
220
|
+
builtins_args_t a = { input, in_len, bits, ph_mode, ph_str_plain, 0, NULL };
|
|
221
|
+
rb_thread_call_without_gvl(redact_builtins_nogvl, &a, RUBY_UBF_IO, NULL);
|
|
222
|
+
*out_len_p = a.out_len;
|
|
223
|
+
return a.result;
|
|
224
|
+
}
|
|
225
|
+
|
|
182
226
|
VALUE rb_data_redactor_redact(VALUE self, VALUE rb_text,
|
|
183
227
|
VALUE rb_ph_mode, VALUE rb_ph_str,
|
|
184
228
|
VALUE rb_enable_bits) {
|
|
@@ -197,7 +241,7 @@ VALUE rb_data_redactor_redact(VALUE self, VALUE rb_text,
|
|
|
197
241
|
int *bits = builtin_enable_bits(rb_enable_bits);
|
|
198
242
|
if (!bits) rb_raise(rb_eNoMemError, "enable_bits allocation failed");
|
|
199
243
|
size_t work_len = 0;
|
|
200
|
-
char *working =
|
|
244
|
+
char *working = redact_builtins_maybe_nogvl(input, in_len, bits, ph_mode, ph_str_plain, &work_len);
|
|
201
245
|
free(bits);
|
|
202
246
|
if (!working) rb_raise(rb_eNoMemError, "built-in redaction allocation failed");
|
|
203
247
|
|
|
@@ -208,6 +252,8 @@ VALUE rb_data_redactor_redact(VALUE self, VALUE rb_text,
|
|
|
208
252
|
* incidentally beyond what today already did. */
|
|
209
253
|
placeholder_t ph;
|
|
210
254
|
ph.mode = ph_mode;
|
|
255
|
+
custom_patterns_lock();
|
|
256
|
+
int oom = 0;
|
|
211
257
|
for (int i = 0; i < custom_count; i++) {
|
|
212
258
|
if (!enable_bit(rb_enable_bits, NUM_PATTERNS + i)) continue;
|
|
213
259
|
ph.str = (ph_mode == PLACEHOLDER_MODE_PLAIN)
|
|
@@ -216,9 +262,11 @@ VALUE rb_data_redactor_redact(VALUE self, VALUE rb_text,
|
|
|
216
262
|
char *result = replace_all_matches(&custom_patterns[i].compiled, working,
|
|
217
263
|
custom_patterns[i].boundary, &ph);
|
|
218
264
|
free(working);
|
|
219
|
-
if (!result)
|
|
265
|
+
if (!result) { working = NULL; oom = 1; break; }
|
|
220
266
|
working = result;
|
|
221
267
|
}
|
|
268
|
+
custom_patterns_unlock();
|
|
269
|
+
if (oom) rb_raise(rb_eNoMemError, "replace_all_matches allocation failed (custom)");
|
|
222
270
|
|
|
223
271
|
VALUE rb_result = rb_str_new_cstr(working);
|
|
224
272
|
free(working);
|
data/ext/data_redactor/scan.c
CHANGED
|
@@ -126,6 +126,8 @@ VALUE rb_data_redactor_scan(VALUE self, VALUE rb_text, VALUE rb_enable_bits) {
|
|
|
126
126
|
/* Stage 2: custom patterns via glibc on the rewritten buffer. */
|
|
127
127
|
/* Original coords recovered via working_to_orig() using ev[]. */
|
|
128
128
|
/* ------------------------------------------------------------------ */
|
|
129
|
+
custom_patterns_lock();
|
|
130
|
+
int oom = 0;
|
|
129
131
|
for (int i = 0; i < custom_count; i++) {
|
|
130
132
|
if (!scan_enable_bit(rb_enable_bits, NUM_PATTERNS + i)) continue;
|
|
131
133
|
|
|
@@ -163,9 +165,11 @@ VALUE rb_data_redactor_scan(VALUE self, VALUE rb_text, VALUE rb_enable_bits) {
|
|
|
163
165
|
char *next = replace_all_matches(&custom_patterns[i].compiled, working,
|
|
164
166
|
custom_patterns[i].boundary, &ph_plain);
|
|
165
167
|
free(working);
|
|
166
|
-
if (!next) {
|
|
168
|
+
if (!next) { working = NULL; oom = 1; break; }
|
|
167
169
|
working = next;
|
|
168
170
|
}
|
|
171
|
+
custom_patterns_unlock();
|
|
172
|
+
if (oom) { free(ev); rb_raise(rb_eNoMemError, "replace_all_matches failed in scan"); }
|
|
169
173
|
|
|
170
174
|
free(ev);
|
|
171
175
|
|