data_redactor 0.9.0 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1193 @@
1
+ /* matcher.c — the v19 multi-pattern engine, ported into the gem.
2
+ *
3
+ * Ported from prototypes/multi_matcher_v1/matcher19.c (the standalone prototype
4
+ * proven in docs/research_log.md). The matching core — regex parser -> Thompson
5
+ * bytecode -> per-pattern lazy DFA, the v14 first-byte filter, the v12 literal
6
+ * skip, the v18.1 anchor lowering, the v19 pure-digit and IBAN selective merges,
7
+ * and the v19.1 EOL-at-buffer-end fix — is unchanged. Two things differ from the
8
+ * prototype:
9
+ *
10
+ * 1. Pattern source. The prototype baked in a generated MM88_PATTERNS table.
11
+ * Here the built-in engines are built at mm_init() from the gem's own
12
+ * pattern_strings[]/boundary_wrapped[]/pattern_required_literal[] arrays
13
+ * (patterns.{h,c} is the single source of truth — CLAUDE.md), and custom
14
+ * patterns are appended dynamically via mm_add()/mm_remove(). So engine
15
+ * storage is a growable array, not a fixed [NUM_PATTERNS] one.
16
+ *
17
+ * 2. Output contract. mm_scan() takes an enable_bits gate and emits ORIGINAL-
18
+ * frame (pattern_id, start, span) events for ALL enabled patterns in one
19
+ * pass; it does NOT model the gem's cross-pattern sequential rewrite. The
20
+ * caller applies mm_resolve() (index-order greedy claim) to reproduce
21
+ * today's "earlier-index pattern wins" semantics byte-for-byte. See
22
+ * TODO.md §1d Gap 5 and the AKIA specs in spec/data_redactor_spec.rb.
23
+ *
24
+ * The infix-literal classification and the BM_INFIX hint table below are ported
25
+ * from prototypes/multi_matcher_v1/gen_patterns.rb (which derived them from the
26
+ * same gem arrays at codegen time). They are pure optimisation hints — the
27
+ * first-byte filter computed from the program itself is what guarantees
28
+ * correctness — so a stale hint can only cost speed, never miss a match.
29
+ */
30
+
31
+ /* _GNU_SOURCE must be defined before any system header so memmem(3) is declared
32
+ * with its correct void* prototype (otherwise it is implicitly int-returning and
33
+ * its result is truncated to a garbage pointer). mkmf also passes -D_GNU_SOURCE
34
+ * on the gem build; the guard avoids a redefinition warning there. */
35
+ #ifndef _GNU_SOURCE
36
+ #define _GNU_SOURCE
37
+ #endif
38
+ #include "matcher.h"
39
+ #include "patterns.h"
40
+
41
+ #include <stdio.h>
42
+ #include <stdlib.h>
43
+ #include <string.h>
44
+ #include <stdint.h>
45
+ #include <ctype.h>
46
+ #include <limits.h>
47
+
48
+ /* ========================================================================
49
+ * 0. Utilities
50
+ * ======================================================================== */
51
+
52
+ /* Named mm_x* (not xmalloc/xcalloc) to avoid clashing with Ruby's same-named
53
+ * macros if a future include pulls in ruby.h. Plain libc malloc/calloc — this
54
+ * engine owns its buffers and does not use Ruby's GC-managed allocator. */
55
/* Allocate `n` bytes with malloc(); on failure print the errno text and exit(1).
 * A zero-byte request is rounded up to one byte: malloc(0) is allowed to return
 * NULL, which would otherwise be misreported as out-of-memory. The caller owns
 * the result and releases it with free(). */
static void *mm_xmalloc(size_t n) {
    void *p = malloc(n ? n : 1);
    if (!p) { perror("malloc"); exit(1); }
    return p;
}
58
/* Allocate a zero-filled array of `n` elements of `s` bytes with calloc(); on
 * failure print the errno text and exit(1). A zero count or element size is
 * rounded up to 1 because calloc may return NULL for a zero-sized request,
 * which is not an out-of-memory condition. The caller frees the result. */
static void *mm_xcalloc(size_t n, size_t s) {
    void *p = calloc(n ? n : 1, s ? s : 1);
    if (!p) { perror("calloc"); exit(1); }
    return p;
}
61
+
62
+ /* ========================================================================
63
+ * 1. Character class bitmap (256 bits = 4 × uint64_t)
64
+ * ======================================================================== */
65
+
66
/* 256-bit byte-membership bitmap: byte value ch is stored at bit (ch & 63) of
 * word w[ch >> 6]. The single-byte helpers below require ch < 256. */
typedef struct { uint64_t w[4]; } cclass_t;

/* Mark byte `ch` as a member of the class. */
static void cc_set(cclass_t *c, unsigned ch)   { c->w[ch>>6] |=  (uint64_t)1<<(ch&63); }

/* Remove byte `ch` from the class. */
static void cc_unset(cclass_t *c, unsigned ch) { c->w[ch>>6] &= ~((uint64_t)1<<(ch&63)); }

/* Return 1 when byte `ch` is a member of the class, 0 otherwise. */
static int  cc_test(const cclass_t *c, unsigned ch) { return (c->w[ch>>6]>>(ch&63))&1; }

/* Complement the class in place (members become non-members and vice versa). */
static void cc_negate(cclass_t *c) {
    for (int i = 0; i < 4; i++) c->w[i] = ~c->w[i];
}

/* Add every byte value in the inclusive range [lo, hi]. `hi` is clamped to 255
 * so an out-of-range upper bound can neither index past w[3] nor (for
 * hi == UINT_MAX) loop forever. A range with lo > hi adds nothing. */
static void cc_add_range(cclass_t *c, unsigned lo, unsigned hi) {
    if (hi > 255) hi = 255;
    for (unsigned i = lo; i <= hi; i++) cc_set(c, i);
}
77
+ static void cc_add_posix(cclass_t *c, const char *cls, size_t len) {
78
+ if (len==5 && !memcmp(cls,"alpha",5)) { cc_add_range(c,'a','z'); cc_add_range(c,'A','Z'); }
79
+ else if (len==5 && !memcmp(cls,"digit",5)) { cc_add_range(c,'0','9'); }
80
+ else if (len==5 && !memcmp(cls,"alnum",5)) { cc_add_range(c,'a','z'); cc_add_range(c,'A','Z'); cc_add_range(c,'0','9'); }
81
+ else if (len==5 && !memcmp(cls,"upper",5)) { cc_add_range(c,'A','Z'); }
82
+ else if (len==5 && !memcmp(cls,"lower",5)) { cc_add_range(c,'a','z'); }
83
+ else if (len==5 && !memcmp(cls,"space",5)) { cc_set(c,' '); cc_set(c,'\t'); cc_set(c,'\n'); cc_set(c,'\r'); cc_set(c,'\f'); cc_set(c,'\v'); }
84
+ else if (len==6 && !memcmp(cls,"xdigit",6)) { cc_add_range(c,'0','9'); cc_add_range(c,'a','f'); cc_add_range(c,'A','F'); }
85
+ else if (len==5 && !memcmp(cls,"print",5)) { cc_add_range(c,0x20,0x7e); }
86
+ else if (len==5 && !memcmp(cls,"graph",5)) { cc_add_range(c,0x21,0x7e); }
87
+ else if (len==5 && !memcmp(cls,"punct",5)) {
88
+ for (unsigned i=0x21;i<=0x7e;i++) if (!isalnum(i)) cc_set(c,i);
89
+ } else { fprintf(stderr,"data_redactor matcher: unknown POSIX class [:%.*s:]\n",(int)len,cls); exit(1); }
90
+ }
91
+
92
+ /* ========================================================================
93
+ * 2. Regex parser → AST
94
+ * ======================================================================== */
95
+
96
+ typedef enum {
97
+ AST_LITERAL, AST_CCLASS, AST_DOT, AST_CONCAT, AST_ALT, AST_REPEAT,
98
+ AST_ANCHOR_BOL, AST_ANCHOR_EOL,
99
+ } ast_type_t;
100
+
101
+ typedef struct ast_node ast_node_t;
102
+ struct ast_node {
103
+ ast_type_t type;
104
+ unsigned char ch;
105
+ cclass_t cc;
106
+ ast_node_t *left, *right;
107
+ int lo, hi;
108
+ };
109
+
110
+ static ast_node_t *ast_alloc(ast_type_t t) {
111
+ ast_node_t *n = mm_xcalloc(1, sizeof(*n)); n->type=t; return n;
112
+ }
113
+ static void ast_free(ast_node_t *n) {
114
+ if (!n) return;
115
+ ast_free(n->left); ast_free(n->right); free(n);
116
+ }
117
+
118
+ typedef struct { const char *p, *end; } pctx_t;
119
+ static ast_node_t *parse_alt(pctx_t *ctx);
120
+
121
+ static ast_node_t *parse_cclass(pctx_t *ctx) {
122
+ ast_node_t *n = ast_alloc(AST_CCLASS);
123
+ int negate=0;
124
+ if (ctx->p < ctx->end && *ctx->p=='^') { negate=1; ctx->p++; }
125
+ int first=1;
126
+ while (ctx->p < ctx->end && (*ctx->p!=']' || first)) {
127
+ first=0;
128
+ unsigned char c=(unsigned char)*ctx->p++;
129
+ if (c=='[' && ctx->p < ctx->end && *ctx->p==':') {
130
+ ctx->p++;
131
+ const char *cs=ctx->p;
132
+ while (ctx->p < ctx->end && !(*ctx->p==':' && *(ctx->p+1)==']')) ctx->p++;
133
+ cc_add_posix(&n->cc, cs, (size_t)(ctx->p-cs));
134
+ ctx->p+=2;
135
+ } else if (ctx->p+1 < ctx->end && *ctx->p=='-' && *(ctx->p+1)!=']') {
136
+ ctx->p++;
137
+ cc_add_range(&n->cc, c, (unsigned char)*ctx->p++);
138
+ } else if (c=='\\' && ctx->p < ctx->end) {
139
+ cc_set(&n->cc, (unsigned char)*ctx->p++);
140
+ } else {
141
+ cc_set(&n->cc, c);
142
+ }
143
+ }
144
+ if (ctx->p < ctx->end && *ctx->p==']') ctx->p++;
145
+ if (negate) cc_negate(&n->cc);
146
+ return n;
147
+ }
148
+
149
+ static void parse_quantifier(pctx_t *ctx, ast_node_t **io) {
150
+ if (ctx->p >= ctx->end) return;
151
+ char c=*ctx->p;
152
+ int lo=1, hi=1;
153
+ if (c=='*') { lo=0; hi=-1; ctx->p++; }
154
+ else if (c=='+') { lo=1; hi=-1; ctx->p++; }
155
+ else if (c=='?') { lo=0; hi=1; ctx->p++; }
156
+ else if (c=='{') {
157
+ ctx->p++; lo=0;
158
+ while (ctx->p<ctx->end && isdigit((unsigned char)*ctx->p)) lo=lo*10+(*ctx->p++-'0');
159
+ if (ctx->p<ctx->end && *ctx->p==',') {
160
+ ctx->p++; hi=0;
161
+ if (ctx->p<ctx->end && *ctx->p=='}') { hi=-1; }
162
+ else while (ctx->p<ctx->end && isdigit((unsigned char)*ctx->p)) hi=hi*10+(*ctx->p++-'0');
163
+ } else { hi=lo; }
164
+ if (ctx->p<ctx->end && *ctx->p=='}') ctx->p++;
165
+ } else return;
166
+ if (ctx->p<ctx->end && *ctx->p=='?') ctx->p++;
167
+ ast_node_t *rep=ast_alloc(AST_REPEAT);
168
+ rep->left=*io; rep->lo=lo; rep->hi=hi; *io=rep;
169
+ }
170
+
171
+ static ast_node_t *parse_atom(pctx_t *ctx) {
172
+ if (ctx->p >= ctx->end) return NULL;
173
+ unsigned char c=(unsigned char)*ctx->p;
174
+ if (c=='(') {
175
+ ctx->p++;
176
+ if (ctx->p+1<ctx->end && *ctx->p=='?' && *(ctx->p+1)==':') ctx->p+=2;
177
+ ast_node_t *inner=parse_alt(ctx);
178
+ if (ctx->p<ctx->end && *ctx->p==')') ctx->p++;
179
+ return inner;
180
+ }
181
+ if (c=='[') { ctx->p++; return parse_cclass(ctx); }
182
+ if (c=='.') { ctx->p++; return ast_alloc(AST_DOT); }
183
+ if (c=='^') { ctx->p++; return ast_alloc(AST_ANCHOR_BOL); }
184
+ if (c=='$') { ctx->p++; return ast_alloc(AST_ANCHOR_EOL); }
185
+ if (c=='\\' && ctx->p+1<ctx->end) {
186
+ ctx->p++;
187
+ ast_node_t *n=ast_alloc(AST_LITERAL); n->ch=(unsigned char)*ctx->p++; return n;
188
+ }
189
+ if (c==')' || c=='|') return NULL;
190
+ ctx->p++;
191
+ ast_node_t *n=ast_alloc(AST_LITERAL); n->ch=c; return n;
192
+ }
193
+
194
+ static ast_node_t *parse_concat(pctx_t *ctx) {
195
+ ast_node_t *head=NULL;
196
+ while (ctx->p<ctx->end && *ctx->p!=')' && *ctx->p!='|') {
197
+ ast_node_t *atom=parse_atom(ctx);
198
+ if (!atom) break;
199
+ parse_quantifier(ctx, &atom);
200
+ if (!head) { head=atom; }
201
+ else { ast_node_t *cat=ast_alloc(AST_CONCAT); cat->left=head; cat->right=atom; head=cat; }
202
+ }
203
+ return head;
204
+ }
205
+
206
+ static ast_node_t *parse_alt(pctx_t *ctx) {
207
+ ast_node_t *left=parse_concat(ctx);
208
+ while (ctx->p<ctx->end && *ctx->p=='|') {
209
+ ctx->p++;
210
+ ast_node_t *right=parse_concat(ctx);
211
+ ast_node_t *alt=ast_alloc(AST_ALT);
212
+ alt->left=left; alt->right=right; left=alt;
213
+ }
214
+ return left;
215
+ }
216
+
217
+ static ast_node_t *parse_regex(const char *src) {
218
+ pctx_t ctx={src, src+strlen(src)}; return parse_alt(&ctx);
219
+ }
220
+
221
+ /* ========================================================================
222
+ * 3. Bytecode program
223
+ * ======================================================================== */
224
+
225
+ typedef enum {
226
+ OP_CHAR, OP_CLASS, OP_ANY, OP_SPLIT, OP_JMP, OP_BOL, OP_EOL, OP_MATCH,
227
+ } opcode_t;
228
+
229
+ typedef struct {
230
+ opcode_t op;
231
+ unsigned char ch;
232
+ cclass_t cc;
233
+ int x, y;
234
+ } inst_t;
235
+
236
+ typedef struct {
237
+ inst_t *code;
238
+ int n;
239
+ int cap;
240
+ } prog_t;
241
+
242
+ static int prog_emit(prog_t *pr, opcode_t op) {
243
+ if (pr->n >= pr->cap) {
244
+ pr->cap = pr->cap ? pr->cap * 2 : 64;
245
+ pr->code = realloc(pr->code, pr->cap * sizeof(inst_t));
246
+ if (!pr->code) { perror("realloc"); exit(1); }
247
+ }
248
+ int at = pr->n++;
249
+ memset(&pr->code[at], 0, sizeof(inst_t));
250
+ pr->code[at].op = op;
251
+ return at;
252
+ }
253
+
254
+ /* ========================================================================
255
+ * 4. AST → bytecode
256
+ * ======================================================================== */
257
+
258
+ static void emit_node(prog_t *pr, ast_node_t *node);
259
+
260
+ static void emit_repeat(prog_t *pr, ast_node_t *node) {
261
+ ast_node_t *child = node->left;
262
+ int lo = node->lo, hi = node->hi;
263
+
264
+ for (int i = 0; i < lo; i++) emit_node(pr, child);
265
+
266
+ if (hi == -1) {
267
+ int l = prog_emit(pr, OP_SPLIT);
268
+ emit_node(pr, child);
269
+ int j = prog_emit(pr, OP_JMP);
270
+ pr->code[j].x = l;
271
+ pr->code[l].x = l + 1;
272
+ pr->code[l].y = pr->n;
273
+ } else {
274
+ int n_opt = hi - lo;
275
+ int *splits = n_opt ? mm_xmalloc(n_opt * sizeof(int)) : NULL;
276
+ for (int i = 0; i < n_opt; i++) {
277
+ splits[i] = prog_emit(pr, OP_SPLIT);
278
+ pr->code[splits[i]].x = splits[i] + 1;
279
+ emit_node(pr, child);
280
+ }
281
+ for (int i = 0; i < n_opt; i++) pr->code[splits[i]].y = pr->n;
282
+ free(splits);
283
+ }
284
+ }
285
+
286
+ static void emit_node(prog_t *pr, ast_node_t *node) {
287
+ if (!node) return;
288
+ switch (node->type) {
289
+ case AST_LITERAL: {
290
+ int i = prog_emit(pr, OP_CHAR); pr->code[i].ch = node->ch; break;
291
+ }
292
+ case AST_CCLASS: {
293
+ int i = prog_emit(pr, OP_CLASS); pr->code[i].cc = node->cc; break;
294
+ }
295
+ case AST_DOT: prog_emit(pr, OP_ANY); break;
296
+ case AST_ANCHOR_BOL: prog_emit(pr, OP_BOL); break;
297
+ case AST_ANCHOR_EOL: prog_emit(pr, OP_EOL); break;
298
+ case AST_CONCAT:
299
+ emit_node(pr, node->left);
300
+ emit_node(pr, node->right);
301
+ break;
302
+ case AST_ALT: {
303
+ int s = prog_emit(pr, OP_SPLIT);
304
+ pr->code[s].x = pr->n;
305
+ emit_node(pr, node->left);
306
+ int j = prog_emit(pr, OP_JMP);
307
+ pr->code[s].y = pr->n;
308
+ emit_node(pr, node->right);
309
+ pr->code[j].x = pr->n;
310
+ break;
311
+ }
312
+ case AST_REPEAT: emit_repeat(pr, node); break;
313
+ }
314
+ }
315
+
316
+ /* ========================================================================
317
+ * 5. Min/max match length
318
+ * ======================================================================== */
319
+
320
+ static size_t ast_min_len(const ast_node_t *n) {
321
+ if (!n) return 0;
322
+ switch (n->type) {
323
+ case AST_LITERAL: case AST_CCLASS: case AST_DOT: return 1;
324
+ case AST_ANCHOR_BOL: case AST_ANCHOR_EOL: return 0;
325
+ case AST_CONCAT: return ast_min_len(n->left) + ast_min_len(n->right);
326
+ case AST_ALT: { size_t l=ast_min_len(n->left), r=ast_min_len(n->right); return l<r?l:r; }
327
+ case AST_REPEAT: return n->lo==0 ? 0 : (size_t)n->lo * ast_min_len(n->left);
328
+ }
329
+ return 0;
330
+ }
331
+
332
/* Sentinel for "no finite upper bound"; also the saturation value below. */
#define LEN_UNBOUNDED SIZE_MAX

/* a + b, saturating at LEN_UNBOUNDED; an unbounded operand stays unbounded. */
static size_t add_sat(size_t a, size_t b) {
    if (a == LEN_UNBOUNDED || b == LEN_UNBOUNDED) return LEN_UNBOUNDED;
    if (b > SIZE_MAX - a) return LEN_UNBOUNDED;   /* the sum would wrap */
    return a + b;
}

/* a * b, saturating at LEN_UNBOUNDED. Zero wins over unbounded: 0 * x == 0. */
static size_t mul_sat(size_t a, size_t b) {
    if (a == 0 || b == 0) return 0;
    if (a == LEN_UNBOUNDED || b == LEN_UNBOUNDED) return LEN_UNBOUNDED;
    if (a > SIZE_MAX / b) return LEN_UNBOUNDED;   /* the product would wrap */
    return a * b;
}
342
+ static size_t ast_max_len(const ast_node_t *n) {
343
+ if (!n) return 0;
344
+ switch (n->type) {
345
+ case AST_LITERAL: case AST_CCLASS: case AST_DOT: return 1;
346
+ case AST_ANCHOR_BOL: case AST_ANCHOR_EOL: return 0;
347
+ case AST_CONCAT: return add_sat(ast_max_len(n->left), ast_max_len(n->right));
348
+ case AST_ALT: { size_t l=ast_max_len(n->left), r=ast_max_len(n->right); return l>r?l:r; }
349
+ case AST_REPEAT:
350
+ if (n->hi == -1) return LEN_UNBOUNDED;
351
+ return mul_sat((size_t)n->hi, ast_max_len(n->left));
352
+ }
353
+ return 0;
354
+ }
355
+
356
+ /* ========================================================================
357
+ * 6. Per-pattern engine + dynamic storage
358
+ *
359
+ * The prototype used fixed [MM88_NUM_PATTERNS] arrays. To carry custom patterns
360
+ * the storage is now a single growable array `g_eng` of `engine_t`, each owning
361
+ * its program, lazy-DFA cache, and per-scan scratch (so a custom add/remove only
362
+ * touches one slot). Built-ins occupy slots [0, NUM_PATTERNS); customs follow.
363
+ * ======================================================================== */
364
+
365
+ #define WRAP_PFX "(^|[^0-9A-Za-z])("
366
+ #define WRAP_SFX ")([^0-9A-Za-z]|$)"
367
+
368
+ #define DFA_DEAD (-1)
369
+ #define TRANS_UNFILLED (-2)
370
+
371
/* Lazily built DFA. A state is a sorted set of program counters plus a
 * "matched" bit; the per-state arrays are indexed by state id. */
typedef struct {
    int *set_pool;          /* all states' pc sets, stored back to back */
    size_t set_pool_n;      /* ints used in set_pool */
    size_t set_pool_cap;    /* ints allocated in set_pool */
    int *set_off;           /* per state: offset of its set in set_pool */
    int *set_len;           /* per state: number of pcs in its set */
    int *matched;           /* per state: 1 if the state is accepting */
    int *trans;             /* 256 entries per state: next state id,
                             * DFA_DEAD, or TRANS_UNFILLED */
    int n_states;           /* states interned so far */
    int states_cap;         /* states the per-state arrays can hold */
    int *hash;              /* open-addressing table of state ids, -1 = empty */
    int hash_cap;           /* slots in `hash` (starts at 256 and doubles) */
} dfa_t;

/* Thread list built while following epsilon transitions. */
typedef struct {
    int *list;              /* pcs of consuming instructions reached */
    int n;                  /* entries used in `list` */
    int matched;            /* set to 1 when OP_MATCH was reached */
} tlist_t;
388
+
389
+ typedef struct {
390
+ /* compiled, immutable after build (safe to share across scans) */
391
+ prog_t prog;
392
+ size_t min_len;
393
+ const char *req_literal; /* points into a heap copy owned by this engine */
394
+ char *req_literal_own;
395
+ size_t req_lit_len;
396
+ int req_lit_at_start;
397
+ int can_skip;
398
+ size_t max_back;
399
+ cclass_t first;
400
+ int has_first_filter;
401
+ int use_dfa;
402
+ int boundary_wrapped;
403
+ int has_eol;
404
+ size_t max_len;
405
+ /* selective-merge membership (built-ins only; customs never join a merge) */
406
+ int digit_member, digit_lo, digit_hi;
407
+ int iban_member;
408
+ /* lazy DFA + per-scan scratch (mutable; GVL-guarded in Phase 1) */
409
+ dfa_t dfa;
410
+ int *seen;
411
+ int seen_cap;
412
+ tlist_t clist, nlist;
413
+ int *estack;
414
+ int gen;
415
+ /* per-scan non-overlapping cursors used by the selective-merge passes */
416
+ int digit_last_end;
417
+ size_t iban_last_end;
418
+ } engine_t;
419
+
420
+ static engine_t *g_eng = NULL;
421
+ static int g_eng_n = 0; /* engines built (NUM_PATTERNS + custom_n) */
422
+ static int g_eng_cap= 0;
423
+ static int g_custom_n = 0;
424
+ static int g_initialized = 0;
425
+
426
+ /* IBAN union-pass dispatch (built-ins only): unique 2-byte country prefixes. */
427
+ static int g_iban_first[256];
428
+ static int g_iban_pair[256][256];
429
+ static int g_have_iban_group = 0;
430
+ static int g_have_digit_group = 0;
431
+
432
+ /* BM infix-literal hints for always-candidate patterns, ported verbatim from
433
+ * gen_patterns.rb's BM_INFIX. Matched by pattern NAME so it survives reordering.
434
+ * Pure speed hints; correctness comes from the program's first-byte filter. */
435
static const struct { const char *name, *lit; } BM_INFIX[] = {
    { .name = "aws_s3_presigned_url",          .lit = "X-Amz-Signature="    },
    { .name = "microsoft_teams_webhook",       .lit = ".webhook.office.com" },
    { .name = "slack_webhook_url",             .lit = "hooks.slack.com"     },
    { .name = "sentry_dsn",                    .lit = ".ingest.sentry.io"   },
    { .name = "hashicorp_terraform_api_token", .lit = ".atlasv1."           },
    { .name = "uri_with_password",             .lit = "://"                 },
    { .name = "bearer_token",                  .lit = "earer "              },
    { .name = "email",                         .lit = "@"                   },
    { .name = "uuid_v4",                       .lit = "-4"                  },
    { .name = "phone_e164",                    .lit = "+"                   },
    { .name = "launchdarkly_api_key",          .lit = "-"                   },
};

/* Return the infix-literal hint registered for pattern `name`, or NULL when
 * the pattern has none. `name` must be a NUL-terminated string. */
static const char *bm_infix_for(const char *name) {
    const size_t count = sizeof(BM_INFIX) / sizeof(BM_INFIX[0]);
    size_t i = 0;
    while (i < count) {
        if (strcmp(BM_INFIX[i].name, name) == 0) return BM_INFIX[i].lit;
        i++;
    }
    return NULL;
}
454
+
455
+ /* True iff `literal` is the literal start of `regex` once backslash escapes are
456
+ * collapsed (mirrors gen_patterns.rb's regex_starts_with_literal?). Used to
457
+ * classify a required literal as start-anchored vs infix. */
458
static int regex_starts_with_literal(const char *regex, const char *lit) {
    const char *rp = regex;
    for (const char *lp = lit; *lp != '\0'; lp++) {
        if (rp[0] == '\0') return 0;            /* regex is shorter than lit */

        char got;
        if (rp[0] == '\\' && rp[1] != '\0') {   /* "\x" stands for the byte x */
            got = rp[1];
            rp += 2;
        } else {                                /* plain byte (or a trailing '\') */
            got = rp[0];
            rp += 1;
        }
        if (got != *lp) return 0;
    }
    return 1;                                   /* an empty `lit` always matches */
}
471
+
472
/* Read a run of ASCII digits at *pp as a decimal int. On success store the
 * value in *out, advance *pp past the digits and return 1. With no digit at
 * *pp, return 0 and leave both untouched. */
static int pure_digit_count(const char **pp, int *out) {
    const char *p = *pp;
    int v = 0;
    while (*p >= '0' && *p <= '9') {
        v = v * 10 + (*p - '0');
        p++;
    }
    if (p == *pp) return 0;
    *pp = p;
    *out = v;
    return 1;
}

/* Recognise a regex that is exactly "[0-9]{N}" or "[0-9]{N,M}". On success
 * store the bounds in *lo / *hi (hi == lo for the "{N}" form) and return 1.
 * Otherwise return 0 and leave *lo / *hi unchanged; "{N,}" and "{,M}" are
 * rejected. */
static int parse_pure_digit(const char *re, int *lo, int *hi) {
    static const char head[] = "[0-9]{";
    if (strncmp(re, head, sizeof head - 1) != 0) return 0;

    const char *p = re + (sizeof head - 1);
    int min_digits, max_digits;
    if (!pure_digit_count(&p, &min_digits)) return 0;

    max_digits = min_digits;
    if (*p == ',') {
        p++;
        if (!pure_digit_count(&p, &max_digits)) return 0;
    }

    /* the closing brace must be the last byte of the regex */
    if (p[0] != '}' || p[1] != '\0') return 0;

    *lo = min_digits;
    *hi = max_digits;
    return 1;
}
492
+
493
/* Recognise a regex of the shape "XX[0-9]{2}..." where XX are two uppercase
 * ASCII letters, provided the pattern is not boundary-wrapped. On success
 * store the two letters in *c0 / *c1 and return 1; otherwise return 0 and
 * leave them untouched. */
static int parse_iban_prefix(const char *re, int boundary_wrapped_flag,
                             unsigned char *c0, unsigned char *c1) {
    static const char check_digits[] = "[0-9]{2}";

    if (boundary_wrapped_flag) return 0;

    /* stops at the first non-uppercase byte, so a short string is never overrun */
    for (int i = 0; i < 2; i++) {
        if (re[i] < 'A' || re[i] > 'Z') return 0;
    }
    if (strncmp(re + 2, check_digits, sizeof check_digits - 1) != 0) return 0;

    *c0 = (unsigned char)re[0];
    *c1 = (unsigned char)re[1];
    return 1;
}
503
+
504
+ static void compute_first_set(const prog_t *pr, cclass_t *out, int *full) {
505
+ memset(out, 0, sizeof(*out));
506
+ *full = 0;
507
+ if (pr->n > 2048) { *full = 1; return; }
508
+ uint8_t seen[2048];
509
+ int stack[2048], top = 0;
510
+ memset(seen, 0, (size_t)pr->n);
511
+ stack[top++] = 0;
512
+ while (top > 0) {
513
+ int pc = stack[--top];
514
+ if (pc < 0 || pc >= pr->n || seen[pc]) continue;
515
+ seen[pc] = 1;
516
+ const inst_t *in = &pr->code[pc];
517
+ switch (in->op) {
518
+ case OP_JMP: stack[top++] = in->x; break;
519
+ case OP_SPLIT: stack[top++] = in->x; stack[top++] = in->y; break;
520
+ case OP_BOL: case OP_EOL: stack[top++] = pc + 1; break;
521
+ case OP_CHAR: cc_set(out, in->ch); break;
522
+ case OP_CLASS: out->w[0]|=in->cc.w[0]; out->w[1]|=in->cc.w[1];
523
+ out->w[2]|=in->cc.w[2]; out->w[3]|=in->cc.w[3]; break;
524
+ case OP_ANY: cc_add_range(out, 0, 255); cc_unset(out, '\n'); break;
525
+ case OP_MATCH: *full = 1; break;
526
+ }
527
+ if (*full) return;
528
+ }
529
+ }
530
+
531
+ static int prog_has_eol(const prog_t *pr) {
532
+ for (int i = 0; i < pr->n; i++)
533
+ if (pr->code[i].op == OP_EOL) return 1;
534
+ return 0;
535
+ }
536
+
537
+ /* Build one engine (program + length bounds + literal/first-byte hints) from a
538
+ * CORE regex and a boundary flag. `name` may be NULL (customs) — it is only used
539
+ * to look up the BM infix hint table. Does NOT touch the merge dispatch tables;
540
+ * the caller records merge membership for built-ins. */
541
+ static void engine_build(engine_t *eng, const char *core_regex, int boundary,
542
+ const char *name) {
543
+ memset(eng, 0, sizeof(*eng));
544
+
545
+ const char *src = core_regex;
546
+ char *to_free = NULL;
547
+ if (boundary) {
548
+ size_t len = strlen(WRAP_PFX)+strlen(core_regex)+strlen(WRAP_SFX)+1;
549
+ to_free = mm_xmalloc(len);
550
+ snprintf(to_free, len, "%s%s%s", WRAP_PFX, core_regex, WRAP_SFX);
551
+ src = to_free;
552
+ }
553
+
554
+ ast_node_t *ast = parse_regex(src);
555
+ eng->min_len = ast_min_len(ast);
556
+ size_t max_len = ast_max_len(ast);
557
+ emit_node(&eng->prog, ast);
558
+ prog_emit(&eng->prog, OP_MATCH);
559
+ ast_free(ast);
560
+ free(to_free);
561
+
562
+ /* The literal-skip hint is attached separately by engine_set_literal() after
563
+ * this build: built-ins get pattern_required_literal[] (start/infix
564
+ * reclassified from the regex) or a BM infix hint; customs get none and take
565
+ * the per-pattern full scan — safe, just unoptimised. `name` is consumed
566
+ * there, not here. */
567
+
568
+ /* v14 first-byte filter (correctness-bearing). */
569
+ int full;
570
+ compute_first_set(&eng->prog, &eng->first, &full);
571
+ eng->has_first_filter = !full;
572
+
573
+ eng->use_dfa = !full;
574
+ eng->boundary_wrapped = boundary;
575
+ eng->has_eol = prog_has_eol(&eng->prog);
576
+ eng->max_len = max_len;
577
+ (void)name;
578
+ }
579
+
580
+ /* Attach a literal-skip hint to an already-built engine. `lit` may be NULL.
581
+ * Must run after engine_build (reads eng->boundary_wrapped / eng->max_len). */
582
+ static void engine_set_literal(engine_t *eng, const char *lit, int at_start) {
583
+ if (!lit) return;
584
+ eng->req_literal_own = strdup(lit);
585
+ if (!eng->req_literal_own) { perror("strdup"); exit(1); }
586
+ eng->req_literal = eng->req_literal_own;
587
+ eng->req_lit_len = strlen(lit);
588
+ eng->req_lit_at_start = at_start;
589
+
590
+ /* The literal is classified against the CORE regex, but the compiled program
591
+ * is the boundary-wrapped (^|[^0-9A-Za-z])(CORE)([^...]|$) form, whose leading
592
+ * group consumes up to 1 byte before the CORE. So the match can start 1 byte
593
+ * before the literal even for a "start-anchored" literal. Account for that in
594
+ * max_back, else the literal skip jumps past the boundary byte and misses the
595
+ * match (e.g. swiss_ahv "756." in " 756.1234.5678.90"). */
596
+ size_t bw = eng->boundary_wrapped ? 1 : 0;
597
+ if (at_start) {
598
+ eng->can_skip = 1; eng->max_back = bw;
599
+ } else if (eng->max_len != LEN_UNBOUNDED && eng->max_len >= eng->req_lit_len) {
600
+ eng->can_skip = 1; eng->max_back = eng->max_len - eng->req_lit_len;
601
+ } else {
602
+ eng->can_skip = 0;
603
+ }
604
+ }
605
+
606
+ static void engine_free(engine_t *eng) {
607
+ free(eng->prog.code);
608
+ free(eng->req_literal_own);
609
+ free(eng->seen);
610
+ free(eng->clist.list);
611
+ free(eng->nlist.list);
612
+ free(eng->estack);
613
+ dfa_t *d = &eng->dfa;
614
+ free(d->set_pool); free(d->set_off); free(d->set_len);
615
+ free(d->matched); free(d->trans); free(d->hash);
616
+ memset(eng, 0, sizeof(*eng));
617
+ }
618
+
619
+ /* ========================================================================
620
+ * 7. Thompson VM
621
+ * ======================================================================== */
622
+
623
+ static void addthread(prog_t *pr, tlist_t *tl, int *seen, int gen,
624
+ int *stk, int pc0,
625
+ const char *input, size_t len, size_t pos) {
626
+ int top = 0;
627
+ stk[top++] = pc0;
628
+ while (top > 0) {
629
+ int pc = stk[--top];
630
+ if (seen[pc] == gen) continue;
631
+ seen[pc] = gen;
632
+ inst_t *in = &pr->code[pc];
633
+ switch (in->op) {
634
+ case OP_JMP:
635
+ stk[top++] = in->x;
636
+ break;
637
+ case OP_SPLIT:
638
+ stk[top++] = in->y;
639
+ stk[top++] = in->x;
640
+ break;
641
+ case OP_BOL:
642
+ if (pos == 0 || input[pos-1] == '\n') stk[top++] = pc + 1;
643
+ break;
644
+ case OP_EOL:
645
+ /* addthread_dfa seeds the position-independent closure with
646
+ * pos=1,len=0 (pos>len), so guard the input[pos] read against
647
+ * pos<len to avoid reading past the buffer for that dummy seed. */
648
+ if (pos == len || (pos < len && input[pos] == '\n')) stk[top++] = pc + 1;
649
+ break;
650
+ case OP_MATCH:
651
+ tl->matched = 1;
652
+ break;
653
+ default:
654
+ tl->list[tl->n++] = pc;
655
+ break;
656
+ }
657
+ }
658
+ }
659
+
660
+ static void addthread_dfa(prog_t *pr, tlist_t *tl, int *seen, int gen, int *stk, int pc0) {
661
+ addthread(pr, tl, seen, gen, stk, pc0, "", 0, 1);
662
+ }
663
+
664
+ /* ========================================================================
665
+ * 8. Lazy DFA construction
666
+ * ======================================================================== */
667
+
668
/* qsort() comparator ordering ints ascending; returns -1, 0 or 1 and cannot
 * overflow because it never subtracts. */
static int int_cmp(const void *a, const void *b) {
    const int *lhs = a;
    const int *rhs = b;
    if (*lhs < *rhs) return -1;
    if (*lhs > *rhs) return 1;
    return 0;
}
672
+
673
/* FNV-1a style hash of a DFA state: the `n` ints of `set`, fed low byte first,
 * followed by the low bit of `matched`. The result is reduced with
 * `cap_mask` (table capacity - 1). */
static unsigned dfa_hash(const int *set, int n, int matched, int cap_mask) {
    const uint64_t prime = 1099511628211ULL;
    uint64_t acc = 1469598103934665603ULL;

    for (int i = 0; i < n; i++) {
        uint32_t word = (uint32_t)set[i];
        for (int shift = 0; shift < 32; shift += 8) {
            acc ^= (uint64_t)((word >> shift) & 0xff);
            acc *= prime;
        }
    }
    acc ^= (uint64_t)(matched & 1);
    acc *= prime;

    return (unsigned)(acc & (uint64_t)cap_mask);
}
682
+
683
+ static int dfa_set_eq(const dfa_t *d, int sid, const int *set, int n, int matched) {
684
+ if (d->set_len[sid] != n) return 0;
685
+ if ((d->matched[sid] & 1) != (matched & 1)) return 0;
686
+ const int *s = &d->set_pool[d->set_off[sid]];
687
+ for (int i = 0; i < n; i++) if (s[i] != set[i]) return 0;
688
+ return 1;
689
+ }
690
+
691
+ static void dfa_hash_insert(dfa_t *d, int sid);
692
+
693
+ static void dfa_grow_states(dfa_t *d) {
694
+ if (d->n_states < d->states_cap) return;
695
+ int newcap = d->states_cap ? d->states_cap * 2 : 64;
696
+ d->set_off = realloc(d->set_off, (size_t)newcap * sizeof(int));
697
+ d->set_len = realloc(d->set_len, (size_t)newcap * sizeof(int));
698
+ d->matched = realloc(d->matched, (size_t)newcap * sizeof(int));
699
+ d->trans = realloc(d->trans, (size_t)newcap * 256 * sizeof(int));
700
+ if (!d->set_off || !d->set_len || !d->matched || !d->trans) {
701
+ perror("realloc"); exit(1);
702
+ }
703
+ d->states_cap = newcap;
704
+ }
705
+
706
+ static void dfa_rehash(dfa_t *d) {
707
+ int newcap = d->hash_cap ? d->hash_cap * 2 : 256;
708
+ free(d->hash);
709
+ d->hash = mm_xmalloc((size_t)newcap * sizeof(int));
710
+ for (int i = 0; i < newcap; i++) d->hash[i] = -1;
711
+ d->hash_cap = newcap;
712
+ for (int sid = 0; sid < d->n_states; sid++) dfa_hash_insert(d, sid);
713
+ }
714
+
715
+ static void dfa_hash_insert(dfa_t *d, int sid) {
716
+ int mask = d->hash_cap - 1;
717
+ unsigned h = dfa_hash(&d->set_pool[d->set_off[sid]], d->set_len[sid],
718
+ d->matched[sid], mask);
719
+ while (d->hash[h] != -1) h = (h + 1) & (unsigned)mask;
720
+ d->hash[h] = sid;
721
+ }
722
+
723
+ static int dfa_intern(dfa_t *d, const int *set, int n, int matched) {
724
+ if (d->n_states * 4 >= d->hash_cap * 3) dfa_rehash(d);
725
+ int mask = d->hash_cap - 1;
726
+ unsigned h = dfa_hash(set, n, matched, mask);
727
+ while (d->hash[h] != -1) {
728
+ int sid = d->hash[h];
729
+ if (dfa_set_eq(d, sid, set, n, matched)) return sid;
730
+ h = (h + 1) & (unsigned)mask;
731
+ }
732
+ dfa_grow_states(d);
733
+ int sid = d->n_states++;
734
+ if (d->set_pool_n + (size_t)n > d->set_pool_cap) {
735
+ size_t newcap = d->set_pool_cap ? d->set_pool_cap * 2 : 1024;
736
+ while (newcap < d->set_pool_n + (size_t)n) newcap *= 2;
737
+ d->set_pool = realloc(d->set_pool, newcap * sizeof(int));
738
+ if (!d->set_pool) { perror("realloc"); exit(1); }
739
+ d->set_pool_cap = newcap;
740
+ }
741
+ d->set_off[sid] = (int)d->set_pool_n;
742
+ d->set_len[sid] = n;
743
+ d->matched[sid] = matched & 1;
744
+ memcpy(&d->set_pool[d->set_pool_n], set, (size_t)n * sizeof(int));
745
+ d->set_pool_n += (size_t)n;
746
+ for (int b = 0; b < 256; b++) d->trans[sid * 256 + b] = TRANS_UNFILLED;
747
+ d->hash[h] = sid;
748
+ return sid;
749
+ }
750
+
751
+ static void ensure_scratch(engine_t *eng) {
752
+ prog_t *pr = &eng->prog;
753
+ if (eng->seen_cap >= pr->n) return;
754
+ eng->seen = realloc(eng->seen, pr->n * sizeof(int));
755
+ eng->clist.list = realloc(eng->clist.list, pr->n * sizeof(int));
756
+ eng->nlist.list = realloc(eng->nlist.list, pr->n * sizeof(int));
757
+ eng->estack = realloc(eng->estack, (2 * pr->n + 1) * sizeof(int));
758
+ if (!eng->seen || !eng->clist.list || !eng->nlist.list || !eng->estack) {
759
+ perror("realloc"); exit(1);
760
+ }
761
+ memset(eng->seen, 0, pr->n * sizeof(int));
762
+ eng->seen_cap = pr->n;
763
+ }
764
+
765
+ static int dfa_compute_trans(engine_t *eng, int sid, unsigned char c) {
766
+ prog_t *pr = &eng->prog;
767
+ dfa_t *d = &eng->dfa;
768
+ int *seen = eng->seen;
769
+ int *estk = eng->estack;
770
+ tlist_t *nl = &eng->nlist;
771
+
772
+ int gen = ++eng->gen;
773
+ nl->n = 0; nl->matched = 0;
774
+
775
+ const int *set = &d->set_pool[d->set_off[sid]];
776
+ int sn = d->set_len[sid];
777
+ for (int i = 0; i < sn; i++) {
778
+ inst_t *in = &pr->code[set[i]];
779
+ int matches = 0;
780
+ switch (in->op) {
781
+ case OP_CHAR: matches = (in->ch == c); break;
782
+ case OP_CLASS: matches = cc_test(&in->cc, c); break;
783
+ case OP_ANY: matches = (c != '\n'); break;
784
+ default: break;
785
+ }
786
+ if (matches)
787
+ addthread_dfa(pr, nl, seen, gen, estk, set[i] + 1);
788
+ }
789
+
790
+ if (nl->n == 0 && !nl->matched) {
791
+ d->trans[sid * 256 + c] = DFA_DEAD;
792
+ return DFA_DEAD;
793
+ }
794
+ qsort(nl->list, (size_t)nl->n, sizeof(int), int_cmp);
795
+ int m = 0;
796
+ for (int i = 0; i < nl->n; i++)
797
+ if (i == 0 || nl->list[i] != nl->list[i-1]) nl->list[m++] = nl->list[i];
798
+ int next = dfa_intern(d, nl->list, m, nl->matched);
799
+ d->trans[sid * 256 + c] = next;
800
+ return next;
801
+ }
802
+
803
+ static void dfa_build_start(engine_t *eng) {
804
+ prog_t *pr = &eng->prog;
805
+ dfa_t *d = &eng->dfa;
806
+ int *seen = eng->seen;
807
+ int *estk = eng->estack;
808
+ tlist_t *cl = &eng->clist;
809
+
810
+ int gen = ++eng->gen;
811
+ cl->n = 0; cl->matched = 0;
812
+ addthread_dfa(pr, cl, seen, gen, estk, 0);
813
+ qsort(cl->list, (size_t)cl->n, sizeof(int), int_cmp);
814
+ int m = 0;
815
+ for (int i = 0; i < cl->n; i++)
816
+ if (i == 0 || cl->list[i] != cl->list[i-1]) cl->list[m++] = cl->list[i];
817
+ dfa_intern(d, cl->list, m, cl->matched);
818
+ }
819
+
820
+ /* ========================================================================
821
+ * 9. Per-pattern scan (scan_one) — identical logic to the prototype
822
+ * ======================================================================== */
823
+
824
+ static size_t scan_one(int p, const char *input, size_t len,
825
+ mm_match_t *out, size_t max, size_t count) {
826
+ engine_t *eng = &g_eng[p];
827
+ prog_t *pr = &eng->prog;
828
+
829
+ ensure_scratch(eng);
830
+ int *seen = eng->seen;
831
+ int *estk = eng->estack;
832
+ tlist_t *cl = &eng->clist, *nl = &eng->nlist;
833
+
834
+ if (eng->gen > INT_MAX - (int)(2 * (len + 2))) {
835
+ memset(seen, 0, pr->n * sizeof(int));
836
+ eng->gen = 0;
837
+ }
838
+
839
+ dfa_t *d = &eng->dfa;
840
+ if (eng->use_dfa && d->n_states == 0) dfa_build_start(eng);
841
+
842
+ size_t pos = 0;
843
+ while (pos <= len) {
844
+ if (eng->can_skip) {
845
+ if (len - pos < eng->req_lit_len) break;
846
+ const char *hit = memmem(input + pos, len - pos,
847
+ eng->req_literal, eng->req_lit_len);
848
+ if (!hit) break;
849
+ size_t hpos = (size_t)(hit - input);
850
+ size_t start = hpos > eng->max_back ? hpos - eng->max_back : 0;
851
+ if (start > pos) pos = start;
852
+ }
853
+
854
+ if (eng->has_first_filter && pos < len) {
855
+ while (pos < len && !cc_test(&eng->first, (unsigned char)input[pos]))
856
+ pos++;
857
+ if (pos >= len) break;
858
+ }
859
+
860
+ size_t match_end = (size_t)-1;
861
+ size_t sp = pos;
862
+
863
+ int near_eol = eng->has_eol &&
864
+ (eng->max_len == LEN_UNBOUNDED ||
865
+ pos + eng->max_len >= len);
866
+ int use_dfa_here = eng->use_dfa &&
867
+ !(eng->boundary_wrapped && pos == 0) &&
868
+ !near_eol;
869
+
870
+ if (use_dfa_here) {
871
+ int st = 0;
872
+ while (st != DFA_DEAD) {
873
+ if (d->matched[st] && sp - pos >= eng->min_len) match_end = sp;
874
+ if (sp == len) break;
875
+ int next = d->trans[st * 256 + (unsigned char)input[sp]];
876
+ if (next == TRANS_UNFILLED)
877
+ next = dfa_compute_trans(eng, st, (unsigned char)input[sp]);
878
+ st = next;
879
+ sp++;
880
+ }
881
+ } else {
882
+ int gen = ++eng->gen;
883
+ cl->n = 0; cl->matched = 0;
884
+ addthread(pr, cl, seen, gen, estk, 0, input, len, pos);
885
+ while (cl->n > 0 || cl->matched) {
886
+ if (cl->matched && sp - pos >= eng->min_len) match_end = sp;
887
+ if (cl->n == 0 || sp == len) break;
888
+ unsigned char c = (unsigned char)input[sp];
889
+ gen = ++eng->gen;
890
+ nl->n = 0; nl->matched = 0;
891
+ for (int i = 0; i < cl->n; i++) {
892
+ inst_t *in = &pr->code[cl->list[i]];
893
+ int matches = 0;
894
+ switch (in->op) {
895
+ case OP_CHAR: matches = (in->ch == c); break;
896
+ case OP_CLASS: matches = cc_test(&in->cc, c); break;
897
+ case OP_ANY: matches = (c != '\n'); break;
898
+ default: break;
899
+ }
900
+ if (matches)
901
+ addthread(pr, nl, seen, gen, estk, cl->list[i] + 1,
902
+ input, len, sp + 1);
903
+ }
904
+ tlist_t tmp = *cl; *cl = *nl; *nl = tmp;
905
+ sp++;
906
+ }
907
+ }
908
+
909
+ if (match_end != (size_t)-1) {
910
+ size_t span = match_end - pos;
911
+ size_t core_so = pos, core_eo = match_end;
912
+ if (eng->boundary_wrapped) {
913
+ /* The wrapper is (^|[^0-9A-Za-z])(CORE)([^0-9A-Za-z]|$). The gem
914
+ * redacts only CORE and preserves the boundary bytes. The CORE's
915
+ * outer chars are alphanumeric for every boundary-wrapped pattern,
916
+ * while a consumed boundary byte is [^0-9A-Za-z]; so a non-alnum
917
+ * first/last byte of the span is a consumed boundary (^/$ are
918
+ * zero-width and leave an alnum edge). Strip them to get CORE. */
919
+ if (core_so < core_eo &&
920
+ !isalnum((unsigned char)input[core_so])) core_so++;
921
+ if (core_eo > core_so &&
922
+ !isalnum((unsigned char)input[core_eo-1])) core_eo--;
923
+ }
924
+ if (count < max)
925
+ out[count++] = (mm_match_t){p, core_so, core_eo - core_so};
926
+ pos = (span == 0) ? pos + 1 : match_end;
927
+ } else {
928
+ pos++;
929
+ }
930
+ }
931
+ return count;
932
+ }
933
+
934
+ /* ========================================================================
935
+ * 10. Selective merges (digit run pass + IBAN union pass)
936
+ * ======================================================================== */
937
+
938
+ static size_t scan_digit_group(const char *input, size_t len,
939
+ const int *enable_bits, size_t n_bits,
940
+ mm_match_t *out, size_t max, size_t count) {
941
+ for (int p = 0; p < g_eng_n; p++)
942
+ if (g_eng[p].digit_member) g_eng[p].digit_last_end = 0;
943
+
944
+ size_t i = 0;
945
+ while (i < len) {
946
+ unsigned char c = (unsigned char)input[i];
947
+ if (c < '0' || c > '9') { i++; continue; }
948
+ size_t rs = i;
949
+ while (i < len && input[i] >= '0' && input[i] <= '9') i++;
950
+ size_t re = i;
951
+ size_t L = re - rs;
952
+
953
+ size_t end;
954
+ if (re == len) end = re;
955
+ else if (!isalnum((unsigned char)input[re])) end = re + 1;
956
+ else continue;
957
+
958
+ for (int p = 0; p < g_eng_n && count < max; p++) {
959
+ engine_t *eng = &g_eng[p];
960
+ if (!eng->digit_member) continue;
961
+ if ((size_t)p < n_bits && !enable_bits[p]) continue;
962
+ if ((int)L < eng->digit_lo || (int)L > eng->digit_hi) continue;
963
+
964
+ size_t start;
965
+ if (rs > 0 && !isalnum((unsigned char)input[rs-1]) &&
966
+ rs - 1 >= (size_t)eng->digit_last_end) {
967
+ start = rs - 1;
968
+ } else if (rs == 0 || input[rs-1] == '\n') {
969
+ start = rs;
970
+ } else {
971
+ continue;
972
+ }
973
+
974
+ /* The boundary-wrapped span is [start, end); the gem redacts only the
975
+ * CORE token (the digit run [rs, re)) and copies the boundary bytes
976
+ * back verbatim. Emit CORE coordinates; keep the per-pattern gsub
977
+ * cursor (digit_last_end) on the FULL span so adjacent runs sharing a
978
+ * separator are resolved exactly as gsub would. */
979
+ (void)start;
980
+ out[count++] = (mm_match_t){p, rs, re - rs};
981
+ eng->digit_last_end = (int)end;
982
+ }
983
+ if (count >= max) break;
984
+ }
985
+ return count;
986
+ }
987
+
988
+ static size_t scan_iban_group(const char *input, size_t len,
989
+ const int *enable_bits, size_t n_bits,
990
+ mm_match_t *out, size_t max, size_t count) {
991
+ for (int p = 0; p < g_eng_n; p++)
992
+ if (g_eng[p].iban_member) {
993
+ g_eng[p].iban_last_end = 0;
994
+ engine_t *eng = &g_eng[p];
995
+ if (eng->use_dfa && eng->dfa.n_states == 0) {
996
+ ensure_scratch(eng); dfa_build_start(eng);
997
+ }
998
+ }
999
+
1000
+ size_t i = 0;
1001
+ while (i + 1 < len && count < max) {
1002
+ unsigned char c0 = (unsigned char)input[i];
1003
+ if (!g_iban_first[c0]) { i++; continue; }
1004
+ int p = g_iban_pair[c0][(unsigned char)input[i + 1]];
1005
+ if (p < 0) { i++; continue; }
1006
+ if ((size_t)p < n_bits && !enable_bits[p]) { i++; continue; }
1007
+ if (i < g_eng[p].iban_last_end) { i++; continue; }
1008
+
1009
+ engine_t *eng = &g_eng[p];
1010
+ dfa_t *d = &eng->dfa;
1011
+ size_t match_end = (size_t)-1, sp = i;
1012
+ int st = 0;
1013
+ while (st != DFA_DEAD) {
1014
+ if (d->matched[st] && sp - i >= eng->min_len) match_end = sp;
1015
+ if (sp == len) break;
1016
+ int next = d->trans[st * 256 + (unsigned char)input[sp]];
1017
+ if (next == TRANS_UNFILLED)
1018
+ next = dfa_compute_trans(eng, st, (unsigned char)input[sp]);
1019
+ st = next;
1020
+ sp++;
1021
+ }
1022
+
1023
+ if (match_end != (size_t)-1) {
1024
+ size_t span = match_end - i;
1025
+ out[count++] = (mm_match_t){p, i, span};
1026
+ eng->iban_last_end = match_end;
1027
+ i = (span == 0) ? i + 1 : match_end;
1028
+ } else {
1029
+ i++;
1030
+ }
1031
+ }
1032
+ return count;
1033
+ }
1034
+
1035
+ /* ========================================================================
1036
+ * 11. Init / add / remove
1037
+ * ======================================================================== */
1038
+
1039
+ static engine_t *eng_grow_one(void) {
1040
+ if (g_eng_n >= g_eng_cap) {
1041
+ int newcap = g_eng_cap ? g_eng_cap * 2 : (NUM_PATTERNS + 16);
1042
+ g_eng = realloc(g_eng, (size_t)newcap * sizeof(engine_t));
1043
+ if (!g_eng) { perror("realloc"); exit(1); }
1044
+ g_eng_cap = newcap;
1045
+ }
1046
+ return &g_eng[g_eng_n++];
1047
+ }
1048
+
1049
+ void mm_init(void) {
1050
+ if (g_initialized) return;
1051
+
1052
+ for (int a = 0; a < 256; a++)
1053
+ for (int b = 0; b < 256; b++) g_iban_pair[a][b] = -1;
1054
+
1055
+ for (int p = 0; p < NUM_PATTERNS; p++) {
1056
+ engine_t *eng = eng_grow_one();
1057
+ engine_build(eng, pattern_strings[p], boundary_wrapped[p], pattern_names[p]);
1058
+
1059
+ const char *lit = pattern_required_literal[p];
1060
+ if (lit) {
1061
+ int at_start = regex_starts_with_literal(pattern_strings[p], lit);
1062
+ engine_set_literal(eng, lit, at_start);
1063
+ } else {
1064
+ const char *bm = bm_infix_for(pattern_names[p]);
1065
+ if (bm) engine_set_literal(eng, bm, 0);
1066
+ }
1067
+
1068
+ int lo, hi;
1069
+ if (boundary_wrapped[p] && parse_pure_digit(pattern_strings[p], &lo, &hi)) {
1070
+ eng->digit_member = 1; eng->digit_lo = lo; eng->digit_hi = hi;
1071
+ g_have_digit_group = 1;
1072
+ }
1073
+ unsigned char c0, c1;
1074
+ if (parse_iban_prefix(pattern_strings[p], boundary_wrapped[p], &c0, &c1)) {
1075
+ eng->iban_member = 1;
1076
+ g_iban_first[c0] = 1;
1077
+ g_iban_pair[c0][c1] = p;
1078
+ g_have_iban_group = 1;
1079
+ }
1080
+ }
1081
+ g_custom_n = 0;
1082
+ g_initialized = 1;
1083
+ }
1084
+
1085
+ int mm_add(const char *regex, int boundary) {
1086
+ if (!g_initialized) mm_init();
1087
+ engine_t *eng = eng_grow_one();
1088
+ engine_build(eng, regex, boundary, NULL);
1089
+ /* Custom patterns never join the selective merges (TODO §1d Gap 4): they keep
1090
+ * the per-pattern path. No digit/IBAN membership, no literal-skip hint. */
1091
+ g_custom_n++;
1092
+ return 0;
1093
+ }
1094
+
1095
+ void mm_remove(int idx) {
1096
+ int slot = NUM_PATTERNS + idx;
1097
+ if (idx < 0 || slot >= g_eng_n) return;
1098
+ engine_free(&g_eng[slot]);
1099
+ /* compact: shift the trailing customs down one to preserve registration
1100
+ * order (slot == NUM_PATTERNS + position). */
1101
+ for (int s = slot; s < g_eng_n - 1; s++)
1102
+ g_eng[s] = g_eng[s + 1];
1103
+ g_eng_n--;
1104
+ g_custom_n--;
1105
+ }
1106
+
1107
+ void mm_clear_custom(void) {
1108
+ for (int s = NUM_PATTERNS; s < g_eng_n; s++) engine_free(&g_eng[s]);
1109
+ g_eng_n = NUM_PATTERNS;
1110
+ g_custom_n = 0;
1111
+ }
1112
+
1113
+ /* ========================================================================
1114
+ * 12. Public scan + resolve
1115
+ * ======================================================================== */
1116
+
1117
/* Gate test for pattern `p`.
 *
 * Returns 1 when the pattern should be scanned, 0 otherwise:
 *   - a NULL `enable_bits` enables every pattern;
 *   - otherwise `p` must index inside the n_bits-long array (a negative `p`
 *     converts to a huge size_t and is therefore rejected) and its entry
 *     must be non-zero. */
static inline int enabled(const int *enable_bits, size_t n_bits, int p) {
    if (enable_bits == NULL)
        return 1;

    const size_t idx = (size_t)p;
    return (idx < n_bits && enable_bits[idx] != 0) ? 1 : 0;
}
1122
+
1123
+ size_t mm_scan(const char *input, size_t len,
1124
+ const int *enable_bits, size_t n_bits,
1125
+ mm_match_t *out, size_t max) {
1126
+ if (!g_initialized) mm_init();
1127
+ size_t count = 0;
1128
+
1129
+ for (int p = 0; p < g_eng_n && count < max; p++) {
1130
+ if (g_eng[p].digit_member) continue;
1131
+ if (g_eng[p].iban_member) continue;
1132
+ if (!enabled(enable_bits, n_bits, p)) continue;
1133
+ count = scan_one(p, input, len, out, max, count);
1134
+ }
1135
+ if (g_have_iban_group && count < max)
1136
+ count = scan_iban_group(input, len, enable_bits, n_bits, out, max, count);
1137
+ if (g_have_digit_group && count < max)
1138
+ count = scan_digit_group(input, len, enable_bits, n_bits, out, max, count);
1139
+ return count;
1140
+ }
1141
+
1142
+ /* Order events for the index-order greedy claim: ascending pattern_id, then
1143
+ * ascending start (so a lower-index pattern always gets first claim on a region;
1144
+ * within a pattern, earlier matches are seen first). */
1145
+ static int ev_cmp_resolve(const void *a, const void *b) {
1146
+ const mm_match_t *x = a, *y = b;
1147
+ if (x->pattern_id != y->pattern_id) return x->pattern_id - y->pattern_id;
1148
+ if (x->start != y->start) return x->start < y->start ? -1 : 1;
1149
+ return 0;
1150
+ }
1151
+
1152
+ /* Order kept events for emission: ascending start. */
1153
+ static int ev_cmp_start(const void *a, const void *b) {
1154
+ const mm_match_t *x = a, *y = b;
1155
+ if (x->start != y->start) return x->start < y->start ? -1 : 1;
1156
+ return x->pattern_id - y->pattern_id;
1157
+ }
1158
+
1159
+ size_t mm_resolve(mm_match_t *ev, size_t n) {
1160
+ if (n == 0) return 0;
1161
+ qsort(ev, n, sizeof(mm_match_t), ev_cmp_resolve);
1162
+
1163
+ /* Greedy claim in (pattern_id, start) order. An event is kept iff its span
1164
+ * [start, start+length) does not overlap any already-kept span. Kept spans
1165
+ * are accumulated in `kept`; we check membership against them. n is small
1166
+ * for typical inputs, but to stay linear-ish we keep `kept` sorted by start
1167
+ * and binary-search the neighbourhood. For simplicity and because match
1168
+ * counts are modest, a linear overlap check against the kept set is used. */
1169
+ mm_match_t *kept = mm_xmalloc(n * sizeof(mm_match_t));
1170
+ size_t nk = 0;
1171
+ for (size_t i = 0; i < n; i++) {
1172
+ size_t s = ev[i].start, e = s + ev[i].length;
1173
+ int overlaps = 0;
1174
+ for (size_t j = 0; j < nk; j++) {
1175
+ size_t ks = kept[j].start, ke = ks + kept[j].length;
1176
+ if (s < ke && ks < e) { overlaps = 1; break; }
1177
+ }
1178
+ if (!overlaps) kept[nk++] = ev[i];
1179
+ }
1180
+ qsort(kept, nk, sizeof(mm_match_t), ev_cmp_start);
1181
+ memcpy(ev, kept, nk * sizeof(mm_match_t));
1182
+ free(kept);
1183
+ return nk;
1184
+ }
1185
+
1186
+ const char *mm_pattern_name(int id) {
1187
+ if (id < 0 || id >= NUM_PATTERNS) return NULL;
1188
+ return pattern_names[id];
1189
+ }
1190
+
1191
+ int mm_pattern_count(void) {
1192
+ return g_eng_n;
1193
+ }