data_redactor 0.9.0 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 007d59e430d1675a13b84670f6c34c300f8b72fd7ee4744aa191f846bb89b072
4
- data.tar.gz: a23f3b99c3ead341d2c9415a1b4b2eb32a45ee002f052a8e58d928eb1ce03919
3
+ metadata.gz: 59ae814186478e16f6ba16f66aa1dfa8f3fd63d088cd5c837221d7530c6a0c73
4
+ data.tar.gz: 631c2a6f5198d7c2e741f9a283263ffbadfb49053bf6767ba57cce67b33e381f
5
5
  SHA512:
6
- metadata.gz: ccd4f6f97a0110585e4f43f9402eac2a1f57b2aef01a3c6870f0e57ea578377291a7367ee924585d9d11e92af98f4178bb0b9488c1a24a2338f6a41936efad30
7
- data.tar.gz: 5281171119b4892167a6b1d55e0996db47408c8a6d334656998f8f2ca50794a3a7b5c987132369ca32965da0943f954eab61f34f5a97c683b8a14851e9beca1e
6
+ metadata.gz: a5fdcc1bf088c9065f7e0c458fa4cf210917d688cb9b2b17e0824e59e9757f2f9c0491ef53c0e91f7ff29ac971b1ef6cc2d434a1d80505206e2d9f5b36893ca9
7
+ data.tar.gz: 1457805dc7599d1655ebb8bc569607be3380290f99f65cdf15cdc17fc7431932e4ff4250f967ff0d7459350d1d836c4d63857bf0b90ad408c2fcd7a969ab453f
data/CHANGELOG.md CHANGED
@@ -7,6 +7,33 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
7
7
 
8
8
  ## [Unreleased]
9
9
 
10
+ ## [0.10.0] - 2026-06-09
11
+
12
+ ### Changed
13
+ - **Engine rewrite (v19 hybrid)** — `redact` and `scan` now run through a
14
+ Thompson NFA → bytecode → lazy-DFA multi-pattern engine (v19) for all 88
15
+ built-in patterns, replacing the previous per-pattern POSIX `regexec` loop.
16
+ Custom patterns (`add_pattern`) continue to use the glibc path (hybrid split
17
+ — required for correct UTF-8 multibyte character-class matching in user regex).
18
+ - Throughput on a 1 MB log: **~8.4× faster** than the previous C engine
19
+ (0.87 i/s → 7.27 i/s); **2.25× faster** than pure-Ruby `gsub` (was 4×
20
+ slower). Small per-call strings: 1.7–2.3× faster than `gsub` (was 3–4.6× slower).
21
+ - Overlap resolution: built-in matches are now resolved by an index-order
22
+ greedy claim (`mm_resolve`) that reproduces 0.9.0's sequential per-pattern
23
+ rewrite semantics exactly. The one accepted divergence (rewrite-created
24
+ boundary when two secrets abut with no separator) is documented in
25
+ `TODO.md §1d` and pinned by `DIVERGENCE` specs.
26
+ - `rb_data_redactor_scan`: coordinate mapping (`repl_log` / `WORKING_TO_ORIG`)
27
+ replaced by direct original-frame offset emission from the v19 engine; custom
28
+ patterns use a lightweight offset-walk over the built-in event list.
29
+
30
+ ### Fixed
31
+ - **Swiss AHV false-negative** — boundary-wrapped patterns with a
32
+ start-anchored required literal now correctly set `max_back = 1` (not 0) so
33
+ the literal-skip does not overshoot the boundary byte. `756.1234.5678.90`
34
+ now matches as expected. (Pre-existing bug in the old engine, caught by
35
+ the v19 engine going live.)
36
+
10
37
  ## [0.9.0] - 2026-05-22
11
38
 
12
39
  ### Added
@@ -3,6 +3,11 @@
3
3
  #include <string.h>
4
4
  #include <stdlib.h>
5
5
 
6
+ /* Custom patterns deliberately do NOT use the v19 engine: they keep the glibc
7
+ * regexec path (replace_all_matches), because user regex can contain multibyte
8
+ * UTF-8 inside character classes (e.g. name_pattern's [oOòóô…]) that the
9
+ * byte-level v19 engine cannot match the way glibc's locale-aware matcher does.
10
+ * See TODO.md §1d Gap 2 for the hybrid-split rationale. */
6
11
  custom_pattern_t *custom_patterns = NULL;
7
12
  int custom_count = 0;
8
13
  int custom_cap = 0;
@@ -7,8 +7,12 @@
7
7
  #include "redact.h"
8
8
  #include "scan.h"
9
9
  #include "custom_patterns.h"
10
+ #include "matcher.h"
10
11
 
11
12
  void Init_data_redactor(void) {
13
+ /* Build the v19 multi-pattern engines from the gem's pattern arrays. */
14
+ mm_init();
15
+
12
16
  /* Compile all built-in regex patterns at load time. */
13
17
  for (int i = 0; i < NUM_PATTERNS; i++) {
14
18
  const char *pat;
@@ -43,15 +47,25 @@ void Init_data_redactor(void) {
43
47
  rb_define_module_function(mDataRedactor, "_clear_custom_patterns", rb_clear_custom_patterns, 0);
44
48
  rb_define_module_function(mDataRedactor, "_custom_patterns", rb_custom_patterns, 0);
45
49
 
46
- /* Frozen array of built-in pattern names, for introspection and only:/except: validation. */
50
+ /* Frozen introspection arrays, parallel to the pattern table (same index = same
51
+ * pattern). NAMES/TAG_BITS back only:/except: validation; SOURCES/BOUNDARY are
52
+ * internal aids (the benchmark suite rebuilds the patterns in pure Ruby from
53
+ * them). SOURCES holds the unwrapped POSIX ERE — boundary wrapping is applied
54
+ * above, when the regexes are compiled at load time, for patterns with BOUNDARY[i] == true. */
47
55
  VALUE builtin_names = rb_ary_new_capa(NUM_PATTERNS);
48
56
  VALUE builtin_tag_bits = rb_ary_new_capa(NUM_PATTERNS);
57
+ VALUE builtin_sources = rb_ary_new_capa(NUM_PATTERNS);
58
+ VALUE builtin_boundary = rb_ary_new_capa(NUM_PATTERNS);
49
59
  for (int i = 0; i < NUM_PATTERNS; i++) {
50
60
  rb_ary_push(builtin_names, rb_str_new_frozen(rb_str_new_cstr(pattern_names[i])));
51
61
  rb_ary_push(builtin_tag_bits, INT2NUM(pattern_tags[i]));
62
+ rb_ary_push(builtin_sources, rb_str_new_frozen(rb_str_new_cstr(pattern_strings[i])));
63
+ rb_ary_push(builtin_boundary, boundary_wrapped[i] ? Qtrue : Qfalse);
52
64
  }
53
65
  rb_define_const(mDataRedactor, "BUILTIN_PATTERN_NAMES", rb_ary_freeze(builtin_names));
54
66
  rb_define_const(mDataRedactor, "BUILTIN_PATTERN_TAG_BITS", rb_ary_freeze(builtin_tag_bits));
67
+ rb_define_const(mDataRedactor, "BUILTIN_PATTERN_SOURCES", rb_ary_freeze(builtin_sources));
68
+ rb_define_const(mDataRedactor, "BUILTIN_PATTERN_BOUNDARY", rb_ary_freeze(builtin_boundary));
55
69
 
56
70
  /* Placeholder mode constants. */
57
71
  rb_define_const(mDataRedactor, "PH_MODE_PLAIN", INT2NUM(PLACEHOLDER_MODE_PLAIN));