data_redactor 0.9.0 → 0.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +27 -0
- data/ext/data_redactor/custom_patterns.c +5 -0
- data/ext/data_redactor/data_redactor.c +15 -1
- data/ext/data_redactor/matcher.c +1193 -0
- data/ext/data_redactor/matcher.h +78 -0
- data/ext/data_redactor/patterns.c +119 -0
- data/ext/data_redactor/patterns.h +11 -0
- data/ext/data_redactor/redact.c +106 -33
- data/ext/data_redactor/scan.c +141 -92
- data/lib/data_redactor/version.rb +1 -1
- data/lib/data_redactor.rb +75 -1
- data/readme.md +56 -5
- metadata +31 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 59ae814186478e16f6ba16f66aa1dfa8f3fd63d088cd5c837221d7530c6a0c73
|
|
4
|
+
data.tar.gz: 631c2a6f5198d7c2e741f9a283263ffbadfb49053bf6767ba57cce67b33e381f
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: a5fdcc1bf088c9065f7e0c458fa4cf210917d688cb9b2b17e0824e59e9757f2f9c0491ef53c0e91f7ff29ac971b1ef6cc2d434a1d80505206e2d9f5b36893ca9
|
|
7
|
+
data.tar.gz: 1457805dc7599d1655ebb8bc569607be3380290f99f65cdf15cdc17fc7431932e4ff4250f967ff0d7459350d1d836c4d63857bf0b90ad408c2fcd7a969ab453f
|
data/CHANGELOG.md
CHANGED
|
@@ -7,6 +7,33 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
|
|
7
7
|
|
|
8
8
|
## [Unreleased]
|
|
9
9
|
|
|
10
|
+
## [0.10.0] - 2026-06-09
|
|
11
|
+
|
|
12
|
+
### Changed
|
|
13
|
+
- **Engine rewrite (v19 hybrid)** — `redact` and `scan` now run through a
|
|
14
|
+
Thompson NFA → bytecode → lazy-DFA multi-pattern engine (v19) for all 88
|
|
15
|
+
built-in patterns, replacing the previous per-pattern POSIX `regexec` loop.
|
|
16
|
+
Custom patterns (`add_pattern`) continue to use the glibc path (hybrid split
|
|
17
|
+
— required for correct UTF-8 multibyte character-class matching in user regex).
|
|
18
|
+
- Throughput on a 1 MB log: **~8.4× faster** than the previous C engine
|
|
19
|
+
(0.87 i/s → 7.27 i/s); **2.25× faster** than pure-Ruby `gsub` (was 4×
|
|
20
|
+
slower). Small per-call strings: 1.7–2.3× faster (was 3–4.6× slower).
|
|
21
|
+
- Overlap resolution: built-in matches are now resolved by an index-order
|
|
22
|
+
greedy claim (`mm_resolve`) that reproduces the previous engine's sequential per-pattern
|
|
23
|
+
rewrite semantics exactly. The one accepted divergence (rewrite-created
|
|
24
|
+
boundary when two secrets abut with no separator) is documented in
|
|
25
|
+
`TODO.md §1d` and pinned by `DIVERGENCE` specs.
|
|
26
|
+
- `rb_data_redactor_scan`: coordinate mapping (`repl_log` / `WORKING_TO_ORIG`)
|
|
27
|
+
replaced by direct original-frame offset emission from the v19 engine; custom
|
|
28
|
+
patterns use a lightweight offset-walk over the built-in event list.
|
|
29
|
+
|
|
30
|
+
### Fixed
|
|
31
|
+
- **Swiss AHV false-negative** — boundary-wrapped patterns with a
|
|
32
|
+
start-anchored required literal now correctly set `max_back = 1` (not 0) so
|
|
33
|
+
the literal-skip does not overshoot the boundary byte. `756.1234.5678.90`
|
|
34
|
+
now matches as expected. (Pre-existing bug in the old engine, caught when
|
|
35
|
+
going live.)
|
|
36
|
+
|
|
10
37
|
## [0.9.0] - 2026-05-22
|
|
11
38
|
|
|
12
39
|
### Added
|
|
@@ -3,6 +3,11 @@
|
|
|
3
3
|
#include <string.h>
|
|
4
4
|
#include <stdlib.h>
|
|
5
5
|
|
|
6
|
+
/* Custom patterns deliberately do NOT use the v19 engine: they keep the glibc
|
|
7
|
+
* regexec path (replace_all_matches), because user regex can contain multibyte
|
|
8
|
+
* UTF-8 inside character classes (e.g. name_pattern's [oOòóô…]) that the
|
|
9
|
+
* byte-level v19 engine cannot match the way glibc's locale-aware matcher does.
|
|
10
|
+
* See TODO.md §1d Gap 2 for the hybrid-split rationale. */
|
|
6
11
|
custom_pattern_t *custom_patterns = NULL;
|
|
7
12
|
int custom_count = 0;
|
|
8
13
|
int custom_cap = 0;
|
|
@@ -7,8 +7,12 @@
|
|
|
7
7
|
#include "redact.h"
|
|
8
8
|
#include "scan.h"
|
|
9
9
|
#include "custom_patterns.h"
|
|
10
|
+
#include "matcher.h"
|
|
10
11
|
|
|
11
12
|
void Init_data_redactor(void) {
|
|
13
|
+
/* Build the v19 multi-pattern engines from the gem's pattern arrays. */
|
|
14
|
+
mm_init();
|
|
15
|
+
|
|
12
16
|
/* Compile all built-in regex patterns at load time. */
|
|
13
17
|
for (int i = 0; i < NUM_PATTERNS; i++) {
|
|
14
18
|
const char *pat;
|
|
@@ -43,15 +47,25 @@ void Init_data_redactor(void) {
|
|
|
43
47
|
rb_define_module_function(mDataRedactor, "_clear_custom_patterns", rb_clear_custom_patterns, 0);
|
|
44
48
|
rb_define_module_function(mDataRedactor, "_custom_patterns", rb_custom_patterns, 0);
|
|
45
49
|
|
|
46
|
-
/* Frozen
|
|
50
|
+
/* Frozen introspection arrays, parallel to the pattern table (same index = same
|
|
51
|
+
* pattern). NAMES/TAG_BITS back the `only:`/`except:` validation; SOURCES/BOUNDARY are
|
|
52
|
+
* internal aids (the benchmark suite rebuilds the patterns in pure Ruby from
|
|
53
|
+
* them). SOURCES holds the unwrapped POSIX ERE — boundary wrapping is applied
|
|
54
|
+
* above at compile time for patterns with BOUNDARY[i] == true. */
|
|
47
55
|
VALUE builtin_names = rb_ary_new_capa(NUM_PATTERNS);
|
|
48
56
|
VALUE builtin_tag_bits = rb_ary_new_capa(NUM_PATTERNS);
|
|
57
|
+
VALUE builtin_sources = rb_ary_new_capa(NUM_PATTERNS);
|
|
58
|
+
VALUE builtin_boundary = rb_ary_new_capa(NUM_PATTERNS);
|
|
49
59
|
for (int i = 0; i < NUM_PATTERNS; i++) {
|
|
50
60
|
rb_ary_push(builtin_names, rb_str_new_frozen(rb_str_new_cstr(pattern_names[i])));
|
|
51
61
|
rb_ary_push(builtin_tag_bits, INT2NUM(pattern_tags[i]));
|
|
62
|
+
rb_ary_push(builtin_sources, rb_str_new_frozen(rb_str_new_cstr(pattern_strings[i])));
|
|
63
|
+
rb_ary_push(builtin_boundary, boundary_wrapped[i] ? Qtrue : Qfalse);
|
|
52
64
|
}
|
|
53
65
|
rb_define_const(mDataRedactor, "BUILTIN_PATTERN_NAMES", rb_ary_freeze(builtin_names));
|
|
54
66
|
rb_define_const(mDataRedactor, "BUILTIN_PATTERN_TAG_BITS", rb_ary_freeze(builtin_tag_bits));
|
|
67
|
+
rb_define_const(mDataRedactor, "BUILTIN_PATTERN_SOURCES", rb_ary_freeze(builtin_sources));
|
|
68
|
+
rb_define_const(mDataRedactor, "BUILTIN_PATTERN_BOUNDARY", rb_ary_freeze(builtin_boundary));
|
|
55
69
|
|
|
56
70
|
/* Placeholder mode constants. */
|
|
57
71
|
rb_define_const(mDataRedactor, "PH_MODE_PLAIN", INT2NUM(PLACEHOLDER_MODE_PLAIN));
|