data_redactor 0.9.0 → 0.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +27 -0
- data/ext/data_redactor/custom_patterns.c +5 -0
- data/ext/data_redactor/data_redactor.c +15 -1
- data/ext/data_redactor/matcher.c +1193 -0
- data/ext/data_redactor/matcher.h +78 -0
- data/ext/data_redactor/patterns.c +119 -0
- data/ext/data_redactor/patterns.h +11 -0
- data/ext/data_redactor/redact.c +106 -33
- data/ext/data_redactor/scan.c +141 -92
- data/lib/data_redactor/version.rb +1 -1
- data/lib/data_redactor.rb +75 -1
- data/readme.md +56 -5
- metadata +31 -1
data/ext/data_redactor/scan.c
CHANGED
|
@@ -3,22 +3,12 @@
|
|
|
3
3
|
#include "placeholder.h"
|
|
4
4
|
#include "custom_patterns.h"
|
|
5
5
|
#include "redact.h"
|
|
6
|
+
#include "matcher.h"
|
|
7
|
+
#include "tags.h"
|
|
6
8
|
#include <regex.h>
|
|
7
9
|
#include <string.h>
|
|
8
10
|
#include <stdlib.h>
|
|
9
11
|
|
|
10
|
-
/*
|
|
11
|
-
* To map working-buffer positions back to original-string positions we
|
|
12
|
-
* maintain a log of every replacement already applied. Each entry records
|
|
13
|
-
* where in the *working* buffer the replacement started (after all prior
|
|
14
|
-
* replacements) and how many bytes were removed (orig_len) vs. inserted
|
|
15
|
-
* (always 10, the length of "[REDACTED]").
|
|
16
|
-
*
|
|
17
|
-
* For a new match at working position W:
|
|
18
|
-
* cumulative_shift_before_W = sum of (10 - orig_len) for all prior
|
|
19
|
-
* replacements whose working_pos <= W
|
|
20
|
-
* original_pos = W - cumulative_shift_before_W
|
|
21
|
-
*/
|
|
22
12
|
/* Look up the i-th entry of the enable_bits Array. Out-of-bounds → 0 (skip). */
|
|
23
13
|
static inline int scan_enable_bit(VALUE rb_enable_bits, long i) {
|
|
24
14
|
if (i < 0 || i >= RARRAY_LEN(rb_enable_bits)) return 0;
|
|
@@ -26,105 +16,164 @@ static inline int scan_enable_bit(VALUE rb_enable_bits, long i) {
|
|
|
26
16
|
return RTEST(v) && NUM2INT(v) != 0;
|
|
27
17
|
}
|
|
28
18
|
|
|
19
|
+
/*
|
|
20
|
+
* Map a working-buffer position (after built-in redaction) back to the
|
|
21
|
+
* original-input position.
|
|
22
|
+
*
|
|
23
|
+
* After the built-in pass, the working buffer contains the original input
|
|
24
|
+
* with each matched CORE span replaced by "[REDACTED]" (10 bytes). The
|
|
25
|
+
* ev[] array (sorted by start, non-overlapping) records every replacement
|
|
26
|
+
* in original-frame coordinates. Walking ev[] we can find which verbatim
|
|
27
|
+
* segment or replacement a working position falls in and recover the
|
|
28
|
+
* original position.
|
|
29
|
+
*
|
|
30
|
+
* For a match that lands inside a "[REDACTED]" span we return the start of
|
|
31
|
+
* the corresponding original CORE (can only happen if a custom pattern
|
|
32
|
+
* matches the literal "[REDACTED]" itself, which is a degenerate case).
|
|
33
|
+
*/
|
|
34
|
+
static long working_to_orig(long wpos, const mm_match_t *ev, size_t n,
|
|
35
|
+
size_t ph_len) {
|
|
36
|
+
long cum_orig = 0;
|
|
37
|
+
long cum_work = 0;
|
|
38
|
+
for (size_t i = 0; i < n; i++) {
|
|
39
|
+
long seg = (long)ev[i].start - cum_orig;
|
|
40
|
+
if (wpos < cum_work + seg)
|
|
41
|
+
return cum_orig + (wpos - cum_work);
|
|
42
|
+
cum_orig += seg + (long)ev[i].length;
|
|
43
|
+
cum_work += seg + (long)ph_len;
|
|
44
|
+
}
|
|
45
|
+
return cum_orig + (wpos - cum_work);
|
|
46
|
+
}
|
|
47
|
+
|
|
29
48
|
VALUE rb_data_redactor_scan(VALUE self, VALUE rb_text, VALUE rb_enable_bits) {
|
|
30
49
|
Check_Type(rb_text, T_STRING);
|
|
31
50
|
Check_Type(rb_enable_bits, T_ARRAY);
|
|
32
51
|
|
|
33
|
-
const char *input
|
|
52
|
+
const char *input = RSTRING_PTR(rb_text);
|
|
53
|
+
size_t in_len = (size_t)RSTRING_LEN(rb_text);
|
|
54
|
+
|
|
55
|
+
static const placeholder_t ph_plain = { PLACEHOLDER_MODE_PLAIN, "[REDACTED]" };
|
|
56
|
+
|
|
57
|
+
/* ------------------------------------------------------------------ */
|
|
58
|
+
/* Stage 1: built-ins through v19 (original-frame coords, no rewrite */
|
|
59
|
+
/* coordinate mapping needed). */
|
|
60
|
+
/* ------------------------------------------------------------------ */
|
|
34
61
|
|
|
35
|
-
|
|
62
|
+
/* Build enable-bits array for built-ins. */
|
|
63
|
+
int *bits = (int *)malloc((size_t)NUM_PATTERNS * sizeof(int));
|
|
64
|
+
if (!bits) rb_raise(rb_eNoMemError, "enable_bits allocation failed");
|
|
65
|
+
long alen = RARRAY_LEN(rb_enable_bits);
|
|
66
|
+
for (int i = 0; i < NUM_PATTERNS; i++) {
|
|
67
|
+
if (i < alen) {
|
|
68
|
+
VALUE v = rb_ary_entry(rb_enable_bits, i);
|
|
69
|
+
bits[i] = (RTEST(v) && NUM2INT(v) != 0) ? 1 : 0;
|
|
70
|
+
} else {
|
|
71
|
+
bits[i] = 0;
|
|
72
|
+
}
|
|
73
|
+
}
|
|
36
74
|
|
|
37
|
-
|
|
38
|
-
|
|
75
|
+
/* Scan + resolve, growing buffer if needed. */
|
|
76
|
+
size_t cap = in_len / 4 + 16;
|
|
77
|
+
mm_match_t *ev = NULL;
|
|
78
|
+
size_t n_ev;
|
|
79
|
+
for (;;) {
|
|
80
|
+
mm_match_t *grown = (mm_match_t *)realloc(ev, cap * sizeof(mm_match_t));
|
|
81
|
+
if (!grown) { free(ev); free(bits); rb_raise(rb_eNoMemError, "mm_scan alloc"); }
|
|
82
|
+
ev = grown;
|
|
83
|
+
n_ev = mm_scan(input, in_len, bits, (size_t)NUM_PATTERNS, ev, cap);
|
|
84
|
+
if (n_ev < cap) break;
|
|
85
|
+
cap *= 2;
|
|
86
|
+
}
|
|
87
|
+
free(bits);
|
|
88
|
+
n_ev = mm_resolve(ev, n_ev);
|
|
39
89
|
|
|
90
|
+
/* Collect built-in match hashes. */
|
|
40
91
|
VALUE matches_arr = rb_ary_new();
|
|
92
|
+
for (size_t i = 0; i < n_ev; i++) {
|
|
93
|
+
int pid = ev[i].pattern_id;
|
|
94
|
+
VALUE h = rb_hash_new();
|
|
95
|
+
rb_hash_aset(h, ID2SYM(rb_intern("tag")),
|
|
96
|
+
ID2SYM(rb_intern(tag_name_for_bit(pattern_tags[pid]))));
|
|
97
|
+
rb_hash_aset(h, ID2SYM(rb_intern("name")),
|
|
98
|
+
rb_str_new_cstr(pattern_names[pid]));
|
|
99
|
+
rb_hash_aset(h, ID2SYM(rb_intern("value")),
|
|
100
|
+
rb_str_new(input + ev[i].start, ev[i].length));
|
|
101
|
+
rb_hash_aset(h, ID2SYM(rb_intern("start")),
|
|
102
|
+
LONG2NUM((long)ev[i].start));
|
|
103
|
+
rb_hash_aset(h, ID2SYM(rb_intern("length")),
|
|
104
|
+
LONG2NUM((long)ev[i].length));
|
|
105
|
+
rb_ary_push(matches_arr, h);
|
|
106
|
+
}
|
|
41
107
|
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
#define REPL_LOG_PUSH(_wpos, _olen) do { \
|
|
48
|
-
if (repl_count >= repl_cap) { \
|
|
49
|
-
int _nc = repl_cap == 0 ? 16 : repl_cap * 2; \
|
|
50
|
-
repl_t *_t = (repl_t *)realloc(repl_log, sizeof(repl_t) * _nc); \
|
|
51
|
-
if (!_t) { free(repl_log); free(working); rb_raise(rb_eNoMemError, "repl_log"); } \
|
|
52
|
-
repl_log = _t; repl_cap = _nc; \
|
|
53
|
-
} \
|
|
54
|
-
repl_log[repl_count].wpos = (_wpos); \
|
|
55
|
-
repl_log[repl_count].orig_len = (_olen); \
|
|
56
|
-
repl_count++; \
|
|
57
|
-
} while (0)
|
|
58
|
-
|
|
59
|
-
#define WORKING_TO_ORIG(_wpos) ({ \
|
|
60
|
-
long _shift = 0; \
|
|
61
|
-
for (int _ri = 0; _ri < repl_count; _ri++) { \
|
|
62
|
-
if (repl_log[_ri].wpos <= (_wpos)) \
|
|
63
|
-
_shift += 10 - repl_log[_ri].orig_len; \
|
|
64
|
-
} \
|
|
65
|
-
(_wpos) - _shift; \
|
|
66
|
-
})
|
|
67
|
-
|
|
68
|
-
#define COLLECT_AND_REPLACE(pat, use_bnd, tag_bit, pat_name) do { \
|
|
69
|
-
const char *_cur = working; \
|
|
70
|
-
regmatch_t _m[4]; \
|
|
71
|
-
while (regexec((pat), _cur, 4, _m, 0) == 0) { \
|
|
72
|
-
regoff_t _fso = _m[0].rm_so, _feo = _m[0].rm_eo; \
|
|
73
|
-
if (_fso < 0 || _feo < _fso) break; \
|
|
74
|
-
regoff_t _cso = _fso, _ceo = _feo; \
|
|
75
|
-
if (use_bnd) { \
|
|
76
|
-
if (_m[1].rm_so >= 0 && _m[1].rm_eo > _m[1].rm_so) \
|
|
77
|
-
_cso = _m[1].rm_eo; \
|
|
78
|
-
if (_m[3].rm_so >= 0 && _m[3].rm_eo > _m[3].rm_so) \
|
|
79
|
-
_ceo = _m[3].rm_so; \
|
|
80
|
-
} \
|
|
81
|
-
size_t _vlen = (size_t)(_ceo - _cso); \
|
|
82
|
-
long _wpos = (long)(_cur - working) + (long)_cso; \
|
|
83
|
-
long _orig = WORKING_TO_ORIG(_wpos); \
|
|
84
|
-
VALUE _match = rb_hash_new(); \
|
|
85
|
-
rb_hash_aset(_match, ID2SYM(rb_intern("tag")), \
|
|
86
|
-
ID2SYM(rb_intern(tag_name_for_bit(tag_bit)))); \
|
|
87
|
-
rb_hash_aset(_match, ID2SYM(rb_intern("name")), \
|
|
88
|
-
rb_str_new_cstr(pat_name)); \
|
|
89
|
-
rb_hash_aset(_match, ID2SYM(rb_intern("value")), \
|
|
90
|
-
rb_str_new(_cur + _cso, _vlen)); \
|
|
91
|
-
rb_hash_aset(_match, ID2SYM(rb_intern("start")), \
|
|
92
|
-
LONG2NUM(_orig)); \
|
|
93
|
-
rb_hash_aset(_match, ID2SYM(rb_intern("length")), \
|
|
94
|
-
LONG2NUM((long)_vlen)); \
|
|
95
|
-
rb_ary_push(matches_arr, _match); \
|
|
96
|
-
REPL_LOG_PUSH(_wpos, (long)_vlen); \
|
|
97
|
-
if (_feo == _fso) { if (*_cur) _cur++; else break; } \
|
|
98
|
-
else _cur += _feo; \
|
|
99
|
-
} \
|
|
100
|
-
char *_next = replace_all_matches((pat), working, (use_bnd), &ph_default); \
|
|
101
|
-
free(working); \
|
|
102
|
-
if (!_next) { free(repl_log); rb_raise(rb_eNoMemError, "replace_all_matches failed in scan"); } \
|
|
103
|
-
working = _next; \
|
|
104
|
-
} while (0)
|
|
108
|
+
/* Build the redacted working buffer (same logic as redact_builtins). */
|
|
109
|
+
size_t ph_len = strlen(ph_plain.str); /* "[REDACTED]" = 10 */
|
|
110
|
+
size_t out_cap = in_len + n_ev * ph_len + 1;
|
|
111
|
+
char *working = (char *)malloc(out_cap);
|
|
112
|
+
if (!working) { free(ev); rb_raise(rb_eNoMemError, "scan working buffer alloc"); }
|
|
105
113
|
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
114
|
+
size_t out_len = 0, cur = 0;
|
|
115
|
+
for (size_t i = 0; i < n_ev; i++) {
|
|
116
|
+
size_t s = ev[i].start, l = ev[i].length;
|
|
117
|
+
if (s > cur) { memcpy(working + out_len, input + cur, s - cur); out_len += s - cur; }
|
|
118
|
+
memcpy(working + out_len, ph_plain.str, ph_len);
|
|
119
|
+
out_len += ph_len;
|
|
120
|
+
cur = s + l;
|
|
110
121
|
}
|
|
122
|
+
if (cur < in_len) { memcpy(working + out_len, input + cur, in_len - cur); out_len += in_len - cur; }
|
|
123
|
+
working[out_len] = '\0';
|
|
111
124
|
|
|
125
|
+
/* ------------------------------------------------------------------ */
|
|
126
|
+
/* Stage 2: custom patterns via glibc on the rewritten buffer. */
|
|
127
|
+
/* Original coords recovered via working_to_orig() using ev[]. */
|
|
128
|
+
/* ------------------------------------------------------------------ */
|
|
112
129
|
for (int i = 0; i < custom_count; i++) {
|
|
113
130
|
if (!scan_enable_bit(rb_enable_bits, NUM_PATTERNS + i)) continue;
|
|
114
|
-
COLLECT_AND_REPLACE(&custom_patterns[i].compiled,
|
|
115
|
-
custom_patterns[i].boundary,
|
|
116
|
-
custom_patterns[i].tag, custom_patterns[i].name);
|
|
117
|
-
}
|
|
118
131
|
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
132
|
+
const char *cur_ptr = working;
|
|
133
|
+
regmatch_t m[4];
|
|
134
|
+
while (regexec(&custom_patterns[i].compiled, cur_ptr, 4, m, 0) == 0) {
|
|
135
|
+
regoff_t fso = m[0].rm_so, feo = m[0].rm_eo;
|
|
136
|
+
if (fso < 0 || feo < fso) break;
|
|
137
|
+
|
|
138
|
+
regoff_t cso = fso, ceo = feo;
|
|
139
|
+
if (custom_patterns[i].boundary) {
|
|
140
|
+
if (m[1].rm_so >= 0 && m[1].rm_eo > m[1].rm_so) cso = m[1].rm_eo;
|
|
141
|
+
if (m[3].rm_so >= 0 && m[3].rm_eo > m[3].rm_so) ceo = m[3].rm_so;
|
|
142
|
+
}
|
|
143
|
+
|
|
144
|
+
long wpos_core = (long)(cur_ptr - working) + (long)cso;
|
|
145
|
+
long orig_start = working_to_orig(wpos_core, ev, n_ev, ph_len);
|
|
146
|
+
long core_len = (long)(ceo - cso);
|
|
147
|
+
|
|
148
|
+
VALUE h = rb_hash_new();
|
|
149
|
+
rb_hash_aset(h, ID2SYM(rb_intern("tag")),
|
|
150
|
+
ID2SYM(rb_intern(tag_name_for_bit(custom_patterns[i].tag))));
|
|
151
|
+
rb_hash_aset(h, ID2SYM(rb_intern("name")),
|
|
152
|
+
rb_str_new_cstr(custom_patterns[i].name));
|
|
153
|
+
rb_hash_aset(h, ID2SYM(rb_intern("value")),
|
|
154
|
+
rb_str_new(cur_ptr + cso, (size_t)core_len));
|
|
155
|
+
rb_hash_aset(h, ID2SYM(rb_intern("start")), LONG2NUM(orig_start));
|
|
156
|
+
rb_hash_aset(h, ID2SYM(rb_intern("length")), LONG2NUM(core_len));
|
|
157
|
+
rb_ary_push(matches_arr, h);
|
|
158
|
+
|
|
159
|
+
if (feo == fso) { if (*cur_ptr) cur_ptr++; else break; }
|
|
160
|
+
else cur_ptr += feo;
|
|
161
|
+
}
|
|
162
|
+
|
|
163
|
+
char *next = replace_all_matches(&custom_patterns[i].compiled, working,
|
|
164
|
+
custom_patterns[i].boundary, &ph_plain);
|
|
165
|
+
free(working);
|
|
166
|
+
if (!next) { free(ev); rb_raise(rb_eNoMemError, "replace_all_matches failed in scan"); }
|
|
167
|
+
working = next;
|
|
168
|
+
}
|
|
122
169
|
|
|
123
|
-
free(
|
|
170
|
+
free(ev);
|
|
124
171
|
|
|
125
|
-
VALUE result
|
|
172
|
+
VALUE result = rb_hash_new();
|
|
126
173
|
VALUE rb_redacted = rb_str_new_cstr(working);
|
|
127
174
|
free(working);
|
|
175
|
+
rb_funcall(rb_redacted, rb_intern("force_encoding"), 1,
|
|
176
|
+
rb_funcall(rb_text, rb_intern("encoding"), 0));
|
|
128
177
|
rb_hash_aset(result, ID2SYM(rb_intern("redacted")), rb_redacted);
|
|
129
178
|
rb_hash_aset(result, ID2SYM(rb_intern("matches")), matches_arr);
|
|
130
179
|
return result;
|
data/lib/data_redactor.rb
CHANGED
|
@@ -74,6 +74,15 @@ module DataRedactor
|
|
|
74
74
|
# Default placeholder used when +placeholder:+ is not given to {redact}.
|
|
75
75
|
PLACEHOLDER_DEFAULT = "[REDACTED]"
|
|
76
76
|
|
|
77
|
+
# @api private
|
|
78
|
+
# Inputs larger than this (bytes) are split into newline-bounded chunks before
|
|
79
|
+
# being handed to the C engine. Bounds the per-call O(N) cost glibc regexec
|
|
80
|
+
# pays for state-log allocation, turning total redaction cost from O(N²) (one
|
|
81
|
+
# giant pass) into O(N × CHUNK_SIZE) (many bounded passes). 64 KB is a
|
|
82
|
+
# compromise: small enough to keep per-call cost low, large enough that
|
|
83
|
+
# typical log/JSON inputs use few chunks. See option G in TODO.md.
|
|
84
|
+
CHUNK_SIZE = 64 * 1024
|
|
85
|
+
|
|
77
86
|
module_function
|
|
78
87
|
|
|
79
88
|
# List of supported tag symbols.
|
|
@@ -132,6 +141,11 @@ module DataRedactor
|
|
|
132
141
|
def redact(text, only: nil, except: nil, placeholder: PLACEHOLDER_DEFAULT)
  enable_bits = build_enable_bits(only, except)
  ph_mode, ph_str = resolve_placeholder(placeholder)

  # Non-Strings are passed straight through so the C layer raises its own
  # TypeError; bytesize is only consulted once we know text is a String.
  oversized = text.is_a?(String) && text.bytesize > CHUNK_SIZE
  return _redact(text, ph_mode, ph_str, enable_bits) unless oversized

  # Large input: redact each chunk independently and stitch the results.
  pieces = _chunk_bytes(text)
  pieces.map { |piece| _redact(piece, ph_mode, ph_str, enable_bits) }.join
end
|
|
137
151
|
|
|
@@ -157,7 +171,12 @@ module DataRedactor
|
|
|
157
171
|
# # value: "user@example.com", start: 0, length: 16}] }
|
|
158
172
|
def scan(text, only: nil, except: nil)
|
|
159
173
|
enable_bits = build_enable_bits(only, except)
|
|
160
|
-
result =
|
|
174
|
+
result =
|
|
175
|
+
if text.is_a?(String) && text.bytesize > CHUNK_SIZE
|
|
176
|
+
_chunked_scan(text, enable_bits)
|
|
177
|
+
else
|
|
178
|
+
_scan(text, enable_bits)
|
|
179
|
+
end
|
|
161
180
|
# Normalise: convert tag string from C (uppercase) back to the Symbol used in TAGS
|
|
162
181
|
result[:matches].each { |m| m[:tag] = m[:tag].to_s.downcase.to_sym }
|
|
163
182
|
result
|
|
@@ -419,4 +438,59 @@ module DataRedactor
|
|
|
419
438
|
"placeholder must be a String, :tagged, or :hash — got #{placeholder.inspect}"
|
|
420
439
|
end
|
|
421
440
|
end
|
|
441
|
+
|
|
442
|
+
# @api private
# Split +text+ into byte-bounded chunks for the chunked redact/scan path.
# Chunks end at a +\n+ when possible so no match straddles a boundary; if a
# single line exceeds {CHUNK_SIZE} (rare in real inputs), it becomes one
# oversized chunk and pays the per-pattern O(N) cost — documented limitation.
# Returns an Array of byte-Strings whose concatenation equals +text+ exactly
# (including the original newline separators).
#
# The newline search runs on a binary (ASCII-8BIT) view of the input.
# +String#rindex+ returns a *character* index, so on multibyte text it would
# under-count bytes and end the chunk before the newline (possibly mid-line
# or mid-character). On the binary view, character index == byte index.
#
# @param text [String]
# @return [Array<String>]
def _chunk_bytes(text)
  chunks = []
  pos = 0
  len = text.bytesize
  bytes = text.b # binary view used only for locating "\n" by byte offset
  while pos < len
    remaining = len - pos
    if remaining <= CHUNK_SIZE
      chunks << text.byteslice(pos, remaining)
      break
    end
    # Find the last \n in [pos, pos+CHUNK_SIZE). If none, chunk is one long
    # line — take CHUNK_SIZE bytes as a fallback (boundary-split risk).
    # NOTE(review): the fallback cut can also land inside a multibyte
    # character; confirm the C layer tolerates such chunks.
    nl = bytes.byteslice(pos, CHUNK_SIZE).rindex("\n")
    take = nl ? nl + 1 : CHUNK_SIZE
    # Slice from the original string so each chunk keeps text's encoding.
    chunks << text.byteslice(pos, take)
    pos += take
  end
  chunks
end
|
|
472
|
+
|
|
473
|
+
# @api private
# Scan a large input chunk by chunk. Each chunk is passed to +_scan+; the
# redacted pieces are concatenated in order, and every match's +:start+ is
# shifted by the number of bytes that preceded its chunk in +text+, so the
# reported offsets refer to the original input rather than to the chunk.
#
# @param text [String]
# @param enable_bits [Array<Integer>]
# @return [Hash{Symbol => Object}] +{ redacted: String, matches: Array<Hash> }+
def _chunked_scan(text, enable_bits)
  offset = 0
  all_matches = []
  output = +""
  _chunk_bytes(text).each do |piece|
    scanned = _scan(piece, enable_bits)
    output << scanned[:redacted]
    # Rebase chunk-relative starts in place, then collect the same hashes.
    all_matches.concat(scanned[:matches].each { |hit| hit[:start] += offset })
    offset += piece.bytesize
  end
  { redacted: output, matches: all_matches }
end
|
|
422
496
|
end
|
data/readme.md
CHANGED
|
@@ -10,9 +10,10 @@ A Ruby gem with a C extension for high-performance regex-based redaction of sens
|
|
|
10
10
|
|
|
11
11
|
DataRedactor scans text for sensitive data — API keys and cloud secrets, IBANs,
|
|
12
12
|
credit cards, national IDs, emails, phone numbers, IPs, and more — and replaces
|
|
13
|
-
each match with a placeholder. The scanning runs in a C extension backed by
|
|
14
|
-
|
|
15
|
-
|
|
13
|
+
each match with a placeholder. The scanning runs in a C extension backed by a
|
|
14
|
+
zero-dependency Thompson NFA → lazy-DFA multi-pattern engine (v19) that scans
|
|
15
|
+
all 88 built-in patterns in a single pass — 2–2.5× faster than pure-Ruby `gsub`
|
|
16
|
+
on large payloads, with no external library dependencies.
|
|
16
17
|
|
|
17
18
|
It ships **88 built-in patterns** across 15+ countries, grouped into tags
|
|
18
19
|
(`:credentials`, `:financial`, `:contact`, ...) so you can redact only what you
|
|
@@ -384,8 +385,18 @@ redactor/
|
|
|
384
385
|
│ ├── scan.{c,h} # _scan + working-buffer → original byte-offset mapping
|
|
385
386
|
│ ├── custom_patterns.{c,h} # Dynamic registry: add/remove/clear/list
|
|
386
387
|
│ └── tags.h # TAG_* bit constants
|
|
387
|
-
|
|
388
|
-
|
|
388
|
+
├── spec/
|
|
389
|
+
│ └── data_redactor_spec.rb # RSpec tests — at least one example per pattern, plus filter / placeholder / custom-pattern coverage
|
|
390
|
+
├── benchmark/ # Repo-only perf scripts (not packaged in the gem)
|
|
391
|
+
│ ├── README.md # How to run, what each script measures
|
|
392
|
+
│ ├── support/corpus.rb # Shared payload builders + pure-Ruby baseline redactor
|
|
393
|
+
│ ├── throughput.rb # MB/s on representative payloads
|
|
394
|
+
│ ├── vs_pure_ruby.rb # C extension vs pure-Ruby gsub (same 88 patterns)
|
|
395
|
+
│ ├── scaling.rb # Runtime vs input size 1KB → 50MB
|
|
396
|
+
│ └── per_pattern.rb # Per-pattern scan cost
|
|
397
|
+
└── docs/ # Design and execution docs for future work
|
|
398
|
+
├── standalone_matcher_design.md
|
|
399
|
+
└── combined_matcher_plan.md
|
|
389
400
|
```
|
|
390
401
|
|
|
391
402
|
## Requirements
|
|
@@ -460,6 +471,45 @@ Or compile and test in one step:
|
|
|
460
471
|
bundle exec rake
|
|
461
472
|
```
|
|
462
473
|
|
|
474
|
+
## Benchmarks
|
|
475
|
+
|
|
476
|
+
The `benchmark/` directory holds four scripts that measure the C engine under
|
|
477
|
+
different angles. They are **not** packaged with the gem.
|
|
478
|
+
|
|
479
|
+
```bash
|
|
480
|
+
bundle install # pulls benchmark-ips, benchmark-memory (dev deps)
|
|
481
|
+
bundle exec rake compile
|
|
482
|
+
bundle exec ruby benchmark/vs_pure_ruby.rb # head-to-head vs pure-Ruby gsub, same 88 patterns
|
|
483
|
+
bundle exec ruby benchmark/throughput.rb # MB/s on a log line, JSON, 1MB and 10MB log files
|
|
484
|
+
bundle exec ruby benchmark/scaling.rb # runtime vs input size (1KB → 50MB), confirms linear scaling
|
|
485
|
+
bundle exec ruby benchmark/per_pattern.rb # per-pattern scan cost over a 1MB payload
|
|
486
|
+
```
|
|
487
|
+
|
|
488
|
+
See [`benchmark/README.md`](benchmark/README.md) for what each script measures
|
|
489
|
+
and how the pure-Ruby baseline is kept honest (it reads the same patterns the
|
|
490
|
+
C engine uses, via `DataRedactor::BUILTIN_PATTERN_SOURCES`).
|
|
491
|
+
|
|
492
|
+
### Performance (0.10.0 — v19 multi-pattern engine)
|
|
493
|
+
|
|
494
|
+
As of 0.10.0 the C extension runs a **Thompson NFA → lazy-DFA multi-pattern
|
|
495
|
+
engine** (v19) that scans the input once across all 88 built-in patterns,
|
|
496
|
+
with two selective-merge passes (pure-digit group + IBAN union) that further
|
|
497
|
+
reduce work for the most common pattern classes. Custom patterns (`add_pattern`)
|
|
498
|
+
still use the glibc path (required for correct UTF-8 diacritic matching).
|
|
499
|
+
|
|
500
|
+
| Payload | v19 engine (0.10.0) | Pure-Ruby `gsub` | Ratio |
|
|
501
|
+
|-----------------------|---------------------|------------------|-----------------|
|
|
502
|
+
| log line (168 B) | 41 µs / call | 71 µs / call | **1.7× faster** |
|
|
503
|
+
| JSON blob (~580 B) | 81 µs / call | 132 µs / call | **1.6× faster** |
|
|
504
|
+
| 8 log lines (1.3 KB) | 175 µs / call | 399 µs / call | **2.3× faster** |
|
|
505
|
+
| 100 log lines (17 KB) | 2.0 ms / call | 4.6 ms / call | **2.3× faster** |
|
|
506
|
+
| 1 MB log | 138 ms / call | 294 ms / call | **2.1× faster** |
|
|
507
|
+
| 10 MB log | 1.44 s / call | — | 6.9 MB/s |
|
|
508
|
+
|
|
509
|
+
All payload sizes pass a correctness check (redaction count matches pure-Ruby `gsub`).
|
|
510
|
+
The previous engine (per-pattern `regexec`) was **4.25× slower** than pure Ruby on the
|
|
511
|
+
1 MB payload — a ~9× swing. Old numbers are in git history (`CHANGELOG.md` [0.9.0]).
|
|
512
|
+
|
|
463
513
|
## How it works
|
|
464
514
|
|
|
465
515
|
1. At load time, `Init_data_redactor` compiles all 88 regex patterns once using `regcomp` (POSIX ERE) and stores them as static `regex_t` structs. Patterns marked as boundary-wrapped are expanded with `wrap_boundary()` before compilation.
|
|
@@ -490,3 +540,4 @@ Released under the [MIT License](LICENSE).
|
|
|
490
540
|
- **Pattern ordering matters** — patterns run sequentially. An early broad pattern (e.g. the 9-digit passport) may consume digits that a later pattern (e.g. credit card) depends on. Boundary wrapping mitigates this for pure-digit patterns.
|
|
491
541
|
- **AWS Secret Key (pattern 1)** — 40 consecutive base64 characters is a broad match. It can produce false positives in base64-encoded content such as embedded images or binary blobs.
|
|
492
542
|
- **Duplicate digit patterns** — several national ID formats share the same digit-length (11 digits: PESEL, Norwegian Fødselsnummer, Belgian National Number). They are kept as separate slots for clarity but the practical effect is that any 11-digit boundary-delimited number will be redacted.
|
|
543
|
+
- **Performance is currently slower than pure-Ruby `gsub`.** A May 2026 investigation found the C extension is 3–5× slower than a pure-Ruby `gsub` loop running the same 88 patterns, across input sizes from 168 bytes to 1 MB. The root cause is glibc's POSIX `regexec()`: each call allocates an O(input-length) state buffer before any matching begins, and the gem calls it once per pattern in sequence. Ruby's Onigmo engine wins by using a built-in Boyer-Moore literal pre-filter that this gem can only approximate. Three perf fixes have shipped (buffer-sizing in `replace_all_matches`, a `strstr` literal pre-filter, and input chunking for large payloads), which gave ~25-30% improvement and made scaling linear, but the absolute gap remains. Use the gem on small payloads where the absolute latency is still acceptable (< 1 ms for typical log lines); for high-throughput pipelines, hold off until the next major release. See `docs/standalone_matcher_design.md` for the long-term plan. Note: this entry describes the per-pattern `regexec` engine used before 0.10.0; the "Performance (0.10.0 — v19 multi-pattern engine)" section reports the figures for the current built-in engine.
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: data_redactor
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.
|
|
4
|
+
version: 0.10.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Daniele Frisanco
|
|
@@ -79,6 +79,34 @@ dependencies:
|
|
|
79
79
|
- - ">="
|
|
80
80
|
- !ruby/object:Gem::Version
|
|
81
81
|
version: '2.0'
|
|
82
|
+
- !ruby/object:Gem::Dependency
|
|
83
|
+
name: benchmark-ips
|
|
84
|
+
requirement: !ruby/object:Gem::Requirement
|
|
85
|
+
requirements:
|
|
86
|
+
- - "~>"
|
|
87
|
+
- !ruby/object:Gem::Version
|
|
88
|
+
version: '2.13'
|
|
89
|
+
type: :development
|
|
90
|
+
prerelease: false
|
|
91
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
92
|
+
requirements:
|
|
93
|
+
- - "~>"
|
|
94
|
+
- !ruby/object:Gem::Version
|
|
95
|
+
version: '2.13'
|
|
96
|
+
- !ruby/object:Gem::Dependency
|
|
97
|
+
name: benchmark-memory
|
|
98
|
+
requirement: !ruby/object:Gem::Requirement
|
|
99
|
+
requirements:
|
|
100
|
+
- - "~>"
|
|
101
|
+
- !ruby/object:Gem::Version
|
|
102
|
+
version: '0.2'
|
|
103
|
+
type: :development
|
|
104
|
+
prerelease: false
|
|
105
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
106
|
+
requirements:
|
|
107
|
+
- - "~>"
|
|
108
|
+
- !ruby/object:Gem::Version
|
|
109
|
+
version: '0.2'
|
|
82
110
|
description: A Ruby gem with a C extension for high-performance scanning and redaction
|
|
83
111
|
of 85 sensitive patterns — API keys, tokens, credentials, IBANs, national IDs, emails,
|
|
84
112
|
phone numbers, and PII from 15+ countries. Optional Logger formatter, Rails filter_parameters
|
|
@@ -97,6 +125,8 @@ files:
|
|
|
97
125
|
- ext/data_redactor/custom_patterns.h
|
|
98
126
|
- ext/data_redactor/data_redactor.c
|
|
99
127
|
- ext/data_redactor/extconf.rb
|
|
128
|
+
- ext/data_redactor/matcher.c
|
|
129
|
+
- ext/data_redactor/matcher.h
|
|
100
130
|
- ext/data_redactor/patterns.c
|
|
101
131
|
- ext/data_redactor/patterns.h
|
|
102
132
|
- ext/data_redactor/placeholder.c
|