data_redactor 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1047 @@
1
+ #include <ruby.h>
2
+ #include <regex.h>
3
+ #include <string.h>
4
+ #include <stdlib.h>
5
+
6
+ #define NUM_PATTERNS 79
7
+
8
/*
 * Placeholder rendering modes. The mode stored in a placeholder_t selects
 * which text write_placeholder() emits in place of a matched token.
 */
#define PLACEHOLDER_MODE_PLAIN 0 /* use ph.str verbatim */
#define PLACEHOLDER_MODE_TAGGED 1 /* "[REDACTED:TAGNAME]" */
#define PLACEHOLDER_MODE_HASH 2 /* "[TAGNAME_xxxx]" (4-hex djb2 suffix) */

/*
 * Describes how one match is rendered. `str` is borrowed, never freed here:
 * it is the caller-supplied replacement text in plain mode and the tag name
 * in tagged/hash modes.
 */
typedef struct {
    int mode; /* one of PLACEHOLDER_MODE_* */
    const char *str; /* plain string (mode 0); tag name (modes 1/2) */
} placeholder_t;
16
+
17
/*
 * XOR flavour of the djb2 string hash: start at 5381, then for every input
 * byte multiply the running value by 33 and XOR the byte in. Bytes are read
 * as unsigned so the result does not depend on the platform's `char`
 * signedness. Cheap and dependency-free; only used for short correlation
 * suffixes, not for anything security sensitive.
 *
 * s   - bytes to hash (need not be NUL-terminated)
 * len - number of bytes to read from s
 *
 * Returns the full unsigned hash value.
 */
static unsigned int djb2(const char *s, size_t len) {
    const unsigned char *byte = (const unsigned char *)s;
    unsigned int hash = 5381;
    size_t remaining = len;

    while (remaining > 0) {
        hash = (hash * 33) ^ *byte;
        byte++;
        remaining--;
    }
    return hash;
}
24
+
25
+ /*
26
+ * Write the placeholder for one match into `buf` (which must be large enough).
27
+ * Returns the number of bytes written.
28
+ *
29
+ * mode 0 (plain): writes ph->str verbatim
30
+ * mode 1 (tagged): writes "[REDACTED:TAGNAME]"
31
+ * mode 2 (hash): writes "[TAGNAME_xxxx]" where xxxx = low 16 bits of djb2(match)
32
+ */
33
+ static size_t write_placeholder(char *buf, const placeholder_t *ph,
34
+ const char *match, size_t match_len) {
35
+ switch (ph->mode) {
36
+ case PLACEHOLDER_MODE_TAGGED:
37
+ return (size_t)sprintf(buf, "[REDACTED:%s]", ph->str);
38
+ case PLACEHOLDER_MODE_HASH: {
39
+ unsigned int h = djb2(match, match_len) & 0xFFFF;
40
+ return (size_t)sprintf(buf, "[%s_%04x]", ph->str, h);
41
+ }
42
+ default: /* PLACEHOLDER_MODE_PLAIN */
43
+ {
44
+ size_t len = strlen(ph->str);
45
+ memcpy(buf, ph->str, len);
46
+ return len;
47
+ }
48
+ }
49
+ }
50
+
51
+ /* Upper bound on placeholder length for a given ph (for buffer sizing). */
52
+ static size_t max_placeholder_len(const placeholder_t *ph) {
53
+ size_t tag_len = strlen(ph->str);
54
+ switch (ph->mode) {
55
+ case PLACEHOLDER_MODE_TAGGED: return 2 + 9 + tag_len + 1; /* "[REDACTED:" + tag + "]" */
56
+ case PLACEHOLDER_MODE_HASH: return 1 + tag_len + 1 + 4 + 1; /* "[" + tag + "_" + 4hex + "]" */
57
+ default: return tag_len;
58
+ }
59
+ }
60
+
61
/*
 * Tag bits. Each pattern belongs to exactly one tag, and each tag occupies
 * its own bit so tags can be OR-ed into a mask. Callers pass such a mask to
 * restrict which patterns run (only / except): a pattern is applied when
 * (its tag & mask) is non-zero. The default mask (TAG_ALL) runs every
 * pattern and matches the historical behaviour of `redact(text)` with no
 * second argument.
 *
 * TAG_BUILTIN_ALL covers the eight built-in categories; TAG_ALL additionally
 * includes TAG_CUSTOM.
 */
#define TAG_CREDENTIALS (1 << 0)
#define TAG_FINANCIAL (1 << 1)
#define TAG_TAX_ID (1 << 2)
#define TAG_NATIONAL_ID (1 << 3)
#define TAG_CONTACT (1 << 4)
#define TAG_NETWORK (1 << 5)
#define TAG_TRAVEL (1 << 6)
#define TAG_OTHER (1 << 7)
#define TAG_CUSTOM (1 << 8)
#define TAG_BUILTIN_ALL (TAG_CREDENTIALS | TAG_FINANCIAL | TAG_TAX_ID | \
                         TAG_NATIONAL_ID | TAG_CONTACT | TAG_NETWORK | \
                         TAG_TRAVEL | TAG_OTHER)
#define TAG_ALL (TAG_BUILTIN_ALL | TAG_CUSTOM)
80
+
81
+ static regex_t compiled_patterns[NUM_PATTERNS];
82
+
83
+ /*
84
+ * Patterns that consist of generic digit/alphanum sequences with no distinctive
85
+ * prefix are wrapped with word-boundary groups:
86
+ * (^|[^0-9A-Za-z])(PATTERN)([^0-9A-Za-z]|$)
87
+ * The boundary_wrapped flag tells replace_all_matches to use sub-match [2]
88
+ * (the actual sensitive token) rather than the full match, so the surrounding
89
+ * non-word characters are preserved and not replaced.
90
+ */
91
+ /*
92
+ * ORDERING: Most specific / longest patterns first, most generic last.
93
+ * This prevents shorter patterns from consuming parts of longer matches.
94
+ *
95
+ * Tiers:
96
+ * 1. Full URLs (longest, most distinctive)
97
+ * 2. Long prefixed tokens (API keys, PATs)
98
+ * 3. IBANs (country prefix + fixed length)
99
+ * 4. Structured formats (dots, dashes, slashes)
100
+ * 5. Short prefixed / letter-anchored patterns
101
+ * 6. Boundary-wrapped structured (dash/dot separated digits)
102
+ * 7. Boundary-wrapped pure digits (longest → shortest)
103
+ */
104
+ static const int boundary_wrapped[NUM_PATTERNS] = {
105
+ /* ---- Tier 1: Full URLs ---- */
106
+ 0, /* 0: AWS S3 Presigned URL */
107
+ 0, /* 1: Microsoft Teams Webhook */
108
+ 0, /* 2: Slack Webhook URL */
109
+ 0, /* 3: MongoDB Connection String */
110
+ 0, /* 4: URI with Embedded Password */
111
+ /* ---- Tier 2: Long prefixed tokens ---- */
112
+ 0, /* 5: GitHub PAT (fine-grained, 93 chars) */
113
+ 0, /* 6: JWT */
114
+ 0, /* 7: Grafana API Token */
115
+ 0, /* 8: SSH Public Key */
116
+ 0, /* 9: Bearer Token */
117
+ 0, /* 10: Google API Key (39 chars) */
118
+ 0, /* 11: AWS Access Key ID (20 chars) */
119
+ 0, /* 12: AWS Secret Access Key (40 base64) */
120
+ 0, /* 13: SendGrid API Key */
121
+ 0, /* 14: Amazon MWS Auth Token */
122
+ 0, /* 15: LaunchDarkly API Key */
123
+ 0, /* 16: GitHub Classic PAT (ghp_) */
124
+ 0, /* 17: GitHub OAuth Token (gho_) */
125
+ 0, /* 18: Stripe Secret Key */
126
+ 0, /* 19: ClickUp API Key */
127
+ 0, /* 20: Scaleway Access Key */
128
+ 0, /* 21: PEM private key header (generic) */
129
+ 0, /* 22: GPG Private Key Block */
130
+ /* ---- Tier 3: IBANs (longest → shortest) ---- */
131
+ 0, /* 23: Hungary IBAN (28 chars) */
132
+ 0, /* 24: Poland IBAN (28 chars) */
133
+ 0, /* 25: France IBAN (27 chars) */
134
+ 0, /* 26: Italy IBAN (27 chars) */
135
+ 0, /* 27: Portugal IBAN (25 chars) */
136
+ 0, /* 28: Spain IBAN (24 chars) */
137
+ 0, /* 29: Czechia IBAN (24 chars) */
138
+ 0, /* 30: Romania IBAN (24 chars) */
139
+ 0, /* 31: Sweden IBAN (24 chars) */
140
+ 0, /* 32: Germany IBAN (22 chars) */
141
+ 0, /* 33: Ireland IBAN (22 chars) */
142
+ 0, /* 34: Switzerland IBAN (21 chars) */
143
+ 0, /* 35: Austria IBAN (20 chars) */
144
+ 0, /* 36: Netherlands IBAN (18 chars) */
145
+ 0, /* 37: Denmark IBAN (18 chars) */
146
+ 0, /* 38: Finland IBAN (18 chars) */
147
+ 0, /* 39: Belgium IBAN (16 chars) */
148
+ 0, /* 40: Norway IBAN (15 chars) */
149
+ /* ---- Tier 4: Structured formats (dots, dashes, slashes, @) ---- */
150
+ 0, /* 41: Email Address */
151
+ 0, /* 42: International Phone Number */
152
+ 0, /* 43: Brazilian CNPJ (XX.XXX.XXX/XXXX-XX) */
153
+ 0, /* 44: Brazilian CPF (XXX.XXX.XXX-XX) */
154
+ 0, /* 45: UUID v4 */
155
+ 0, /* 46: IPv4 address */
156
+ 0, /* 47: Credit card numbers */
157
+ 0, /* 48: Indian Aadhaar (XXXX XXXX XXXX) */
158
+ /* ---- Tier 5: Letter-anchored patterns ---- */
159
+ 0, /* 49: Mexican CURP (18 alphanum, distinctive structure) */
160
+ 0, /* 50: Italian CF with omocodia (16 chars) */
161
+ 0, /* 51: Italian CF basic (16 chars) */
162
+ 0, /* 52: UK National Insurance Number */
163
+ 0, /* 53: Spanish NIE (X/Y/Z prefix) */
164
+ 0, /* 54: Passport letter prefix + digits */
165
+ /* ---- Tier 6: Boundary-wrapped structured (dash/dot/slash separated) ---- */
166
+ 1, /* 55: South Korean RRN (YYMMDD-XXXXXXX, 14 chars) */
167
+ 1, /* 56: Swiss AHV Number (756.XXXX.XXXX.XX) */
168
+ 1, /* 57: Finnish HETU (DDMMYY[+-A]XXXC) */
169
+ 1, /* 58: Swedish Personnummer (YYMMDD[-+]XXXX) */
170
+ 1, /* 59: Danish CPR Number (DDMMYY-XXXX) */
171
+ 1, /* 60: Czech Rodné číslo (YYMMDD/XXXX) */
172
+ 1, /* 61: US Social Security Number (XXX-XX-XXXX) */
173
+ 1, /* 62: US ITIN (9XX-XX-XXXX) */
174
+ 1, /* 63: Canadian SIN (XXX-XXX-XXX) */
175
+ 1, /* 64: Australian TFN (XXX-XXX-XXX) */
176
+ 1, /* 65: Indian PAN (AAAAA0000A) */
177
+ 1, /* 66: Spanish DNI (8 digits + letter) */
178
+ 1, /* 67: Hungarian Tax ID (8XXXXXXXXX, 10 digits) */
179
+ /* ---- Tier 7: Boundary-wrapped pure digits (longest → shortest) ---- */
180
+ 1, /* 68: French NIR (15 digits) */
181
+ 1, /* 69: South African ID (13 digits) */
182
+ 1, /* 70: Romanian CNP (13 digits) */
183
+ 1, /* 71: Japanese My Number (12 digits) */
184
+ 1, /* 72: Polish PESEL (11 digits) */
185
+ 1, /* 73: Belgian National Number (11 digits) */
186
+ 1, /* 74: Norwegian Fødselsnummer (11 digits) */
187
+ 1, /* 75: Passport 9 digits */
188
+ 1, /* 76: Dutch BSN (8-9 digits) */
189
+ 1, /* 77: Austrian Abgabenkontonummer (9 digits) */
190
+ 1 /* 78: Polish PESEL duplicate */
191
+ };
192
+
193
/*
 * Tag for each built-in pattern, index-aligned with pattern_strings[],
 * pattern_names[] and boundary_wrapped[]. Exactly one TAG_* bit per pattern.
 * The redaction loop skips a pattern when (pattern_tags[i] & mask) is zero,
 * which is how the caller's only/except filtering takes effect. In tagged and
 * hash placeholder modes the tag also determines the name shown in the
 * placeholder.
 */
static const int pattern_tags[NUM_PATTERNS] = {
    /* 0-22: secrets, API keys, tokens, private keys, webhooks */
    TAG_CREDENTIALS, TAG_CREDENTIALS, TAG_CREDENTIALS, TAG_CREDENTIALS, TAG_CREDENTIALS,
    TAG_CREDENTIALS, TAG_CREDENTIALS, TAG_CREDENTIALS, TAG_CREDENTIALS, TAG_CREDENTIALS,
    TAG_CREDENTIALS, TAG_CREDENTIALS, TAG_CREDENTIALS, TAG_CREDENTIALS, TAG_CREDENTIALS,
    TAG_CREDENTIALS, TAG_CREDENTIALS, TAG_CREDENTIALS, TAG_CREDENTIALS, TAG_CREDENTIALS,
    TAG_CREDENTIALS, TAG_CREDENTIALS, TAG_CREDENTIALS,
    /* 23-40: IBANs */
    TAG_FINANCIAL, TAG_FINANCIAL, TAG_FINANCIAL, TAG_FINANCIAL, TAG_FINANCIAL,
    TAG_FINANCIAL, TAG_FINANCIAL, TAG_FINANCIAL, TAG_FINANCIAL, TAG_FINANCIAL,
    TAG_FINANCIAL, TAG_FINANCIAL, TAG_FINANCIAL, TAG_FINANCIAL, TAG_FINANCIAL,
    TAG_FINANCIAL, TAG_FINANCIAL, TAG_FINANCIAL,
    TAG_CONTACT, /* 41: email */
    TAG_CONTACT, /* 42: phone */
    TAG_TAX_ID, /* 43: Brazilian CNPJ */
    TAG_TAX_ID, /* 44: Brazilian CPF */
    TAG_OTHER, /* 45: UUID v4 */
    TAG_NETWORK, /* 46: IPv4 */
    TAG_FINANCIAL, /* 47: credit card */
    TAG_NATIONAL_ID, /* 48: Indian Aadhaar */
    TAG_NATIONAL_ID, /* 49: Mexican CURP */
    TAG_TAX_ID, /* 50: Italian CF (omocodia) */
    TAG_TAX_ID, /* 51: Italian CF (basic) */
    TAG_NATIONAL_ID, /* 52: UK NIN */
    TAG_NATIONAL_ID, /* 53: Spanish NIE */
    TAG_TRAVEL, /* 54: passport letter prefix */
    TAG_NATIONAL_ID, /* 55: Korean RRN */
    TAG_NATIONAL_ID, /* 56: Swiss AHV */
    TAG_NATIONAL_ID, /* 57: Finnish HETU */
    TAG_NATIONAL_ID, /* 58: Swedish Personnummer */
    TAG_NATIONAL_ID, /* 59: Danish CPR */
    TAG_NATIONAL_ID, /* 60: Czech Rodné číslo */
    TAG_NATIONAL_ID, /* 61: US SSN */
    TAG_TAX_ID, /* 62: US ITIN */
    TAG_NATIONAL_ID, /* 63: Canadian SIN */
    TAG_TAX_ID, /* 64: Australian TFN */
    TAG_TAX_ID, /* 65: Indian PAN */
    TAG_NATIONAL_ID, /* 66: Spanish DNI */
    TAG_TAX_ID, /* 67: Hungarian Tax ID */
    TAG_NATIONAL_ID, /* 68: French NIR */
    TAG_NATIONAL_ID, /* 69: South African ID */
    TAG_NATIONAL_ID, /* 70: Romanian CNP */
    TAG_TAX_ID, /* 71: Japanese My Number */
    TAG_NATIONAL_ID, /* 72: Polish PESEL */
    TAG_NATIONAL_ID, /* 73: Belgian National Number */
    TAG_NATIONAL_ID, /* 74: Norwegian Fødselsnummer */
    TAG_TRAVEL, /* 75: passport 9 digits */
    TAG_NATIONAL_ID, /* 76: Dutch BSN */
    TAG_TAX_ID, /* 77: Austrian Abgabenkontonummer */
    TAG_NATIONAL_ID /* 78: Polish PESEL duplicate */
};
248
+
249
/*
 * Stable snake_case identifier for each built-in pattern, index-aligned with
 * pattern_strings[], pattern_tags[] and boundary_wrapped[].
 * NOTE(review): no reader of this table is visible in this part of the file;
 * presumably it feeds the `name` field of the _scan match hashes — confirm.
 */
static const char *pattern_names[NUM_PATTERNS] = {
    "aws_s3_presigned_url", /* 0 */
    "microsoft_teams_webhook", /* 1 */
    "slack_webhook_url", /* 2 */
    "mongodb_connection_string", /* 3 */
    "uri_with_password", /* 4 */
    "github_pat_fine_grained", /* 5 */
    "jwt", /* 6 */
    "grafana_api_token", /* 7 */
    "ssh_public_key", /* 8 */
    "bearer_token", /* 9 */
    "google_api_key", /* 10 */
    "aws_access_key_id", /* 11 */
    "aws_secret_access_key", /* 12 */
    "sendgrid_api_key", /* 13 */
    "amazon_mws_auth_token", /* 14 */
    "launchdarkly_api_key", /* 15 */
    "github_classic_pat", /* 16 */
    "github_oauth_token", /* 17 */
    "stripe_secret_key", /* 18 */
    "clickup_api_key", /* 19 */
    "scaleway_access_key", /* 20 */
    "pem_private_key", /* 21 */
    "gpg_private_key", /* 22 */
    "iban_hu", /* 23 */
    "iban_pl", /* 24 */
    "iban_fr", /* 25 */
    "iban_it", /* 26 */
    "iban_pt", /* 27 */
    "iban_es", /* 28 */
    "iban_cz", /* 29 */
    "iban_ro", /* 30 */
    "iban_se", /* 31 */
    "iban_de", /* 32 */
    "iban_ie", /* 33 */
    "iban_ch", /* 34 */
    "iban_at", /* 35 */
    "iban_nl", /* 36 */
    "iban_dk", /* 37 */
    "iban_fi", /* 38 */
    "iban_be", /* 39 */
    "iban_no", /* 40 */
    "email", /* 41 */
    "phone_e164", /* 42 */
    "brazilian_cnpj", /* 43 */
    "brazilian_cpf", /* 44 */
    "uuid_v4", /* 45 */
    "ipv4", /* 46 */
    "credit_card", /* 47 */
    "indian_aadhaar", /* 48 */
    "mexican_curp", /* 49 */
    "italian_cf_omocodia", /* 50 */
    "italian_cf", /* 51 */
    "uk_nin", /* 52 */
    "spanish_nie", /* 53 */
    "passport_letter_prefix", /* 54 */
    "korean_rrn", /* 55 */
    "swiss_ahv", /* 56 */
    "finnish_hetu", /* 57 */
    "swedish_personnummer", /* 58 */
    "danish_cpr", /* 59 */
    "czech_rodne_cislo", /* 60 */
    "us_ssn", /* 61 */
    "us_itin", /* 62 */
    "canadian_sin", /* 63 */
    "australian_tfn", /* 64 */
    "indian_pan", /* 65 */
    "spanish_dni", /* 66 */
    "hungarian_tax_id", /* 67 */
    "french_nir", /* 68 */
    "south_african_id", /* 69 */
    "romanian_cnp", /* 70 */
    "japanese_my_number", /* 71 */
    "polish_pesel", /* 72 */
    "belgian_national_number", /* 73 */
    "norwegian_fodselsnummer", /* 74 */
    "passport_9digits", /* 75 */
    "dutch_bsn", /* 76 */
    "austrian_abgabenkontonummer", /* 77 */
    "polish_pesel_2" /* 78 */
};
330
+
331
/*
 * Raw patterns (POSIX extended regular expressions), index-aligned with
 * pattern_names[], pattern_tags[] and boundary_wrapped[]. Boundary-wrapped
 * patterns are stored unwrapped here; the wrapper is applied in
 * Init_data_redactor at compile time.
 *
 * The redaction loop applies these strictly in index order, each one to the
 * output of the previous one, so an earlier pattern "wins" any text that a
 * later pattern would also match.
 *
 * NOTE(review): because of that ordering, several later entries look
 * unreachable or partly unreachable — confirm and reorder/remove as needed:
 *   - 59 (Danish CPR) is a subset of 58 (Swedish Personnummer).
 *   - 62 (US ITIN) is a subset of 61 (US SSN), which runs first.
 *   - 70 (Romanian CNP) is a subset of 69 (any 13 digits).
 *   - 73, 74 and 78 are the same expression as 72 (any 11 digits).
 *   - 60 (Czech, slash optional) already matches any 9- or 10-digit run,
 *     which covers 67, 75, 77 and the 9-digit case of 76.
 * NOTE(review): 12 (AWS secret) and 47 (credit card) are not boundary
 * wrapped, so they can match inside longer alphanumeric/digit runs.
 */
static const char *pattern_strings[NUM_PATTERNS] = {
    /* ---- Tier 1: Full URLs ---- */
    /* 0: AWS S3 Presigned URL */
    "https://[a-z0-9.-]+\\.s3\\.amazonaws\\.com/[^[:space:]?]+\\?[^[:space:]]*X-Amz-Signature=[^[:space:]]+",
    /* 1: Microsoft Teams Incoming Webhook */
    "https://[a-z0-9-]+\\.webhook\\.office\\.com/webhookb2/[a-fA-F0-9-]{36}@[a-fA-F0-9-]{36}/[^/ ]+/[a-fA-F0-9]{32}/[a-fA-F0-9-]{36}",
    /* 2: Slack Webhook URL */
    "https://hooks\\.slack\\.com/services/T[A-Z0-9]{8}/B[A-Z0-9]{8}/[A-Za-z0-9]{24}",
    /* 3: MongoDB Connection String (with credentials) */
    "mongodb(\\+srv)?://[^[:space:]'\"<>/:@]+:[^[:space:]'\"<>/@]+@[^[:space:]?'\"]+",
    /* 4: URI with Embedded Password (scheme://user:pass@host) */
    "[A-Za-z][A-Za-z0-9+_-]*://[^[:space:]/?#:@]+:[^[:space:]/?#@]+@[A-Za-z0-9.-]+",

    /* ---- Tier 2: Long prefixed tokens ---- */
    /* 5: GitHub PAT fine-grained (github_pat_ + 82 chars) */
    "github_pat_[0-9a-zA-Z_]{82}",
    /* 6: JWT (three base64url segments) */
    "eyJ[A-Za-z0-9_-]{10,}\\.eyJ[A-Za-z0-9_-]{10,}\\.[A-Za-z0-9_-]+",
    /* 7: Grafana API Token (base64 of {"k":") */
    "eyJrIjoi[A-Za-z0-9_=-]{42,}",
    /* 8: SSH Public Key */
    "ssh-(rsa|ed25519|ecdsa) [a-zA-Z0-9/+=]{20,}",
    /* 9: Bearer Token */
    "[Bb]earer [a-zA-Z0-9_.=/+:-]{12,}",
    /* 10: Google API Key (AIza + 35 chars) */
    "AIza[0-9A-Za-z_-]{35}",
    /* 11: AWS Access Key ID (all prefixes + 16 chars) */
    "(A3T[A-Z0-9]|AKIA|ABIA|ACCA|AGPA|AIDA|ANPA|ANVA|APKA|AROA|ASCA|ASIA)[A-Z2-7]{16}",
    /* 12: AWS Secret Access Key (40 base64 chars) */
    "[A-Za-z0-9/+=]{40}",
    /* 13: SendGrid API Key */
    "SG\\.[a-zA-Z0-9_-]{5,}\\.[a-zA-Z0-9_-]{5,}",
    /* 14: Amazon MWS Auth Token */
    "amzn\\.mws\\.[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}",
    /* 15: LaunchDarkly API Key (api-UUID or sdk-UUID) */
    "(api|sdk)-[a-f0-9]{8}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{12}",
    /* 16: GitHub Classic PAT (ghp_ + 36 chars) */
    "ghp_[0-9a-zA-Z]{36}",
    /* 17: GitHub OAuth Token (gho_ + 36 chars) */
    "gho_[0-9a-zA-Z]{36}",
    /* 18: Stripe Secret Key (sk_live_ + 24 chars) */
    "sk_live_[0-9a-zA-Z]{24}",
    /* 19: ClickUp API Key */
    "pk_[0-9]{6,8}_[A-Z0-9]{32}",
    /* 20: Scaleway Access Key (SCW + 17 chars) */
    "SCW[A-Z0-9]{17}",
    /* 21: PEM private key header (generic) */
    "-----BEGIN [A-Z ]*PRIVATE KEY-----",
    /* 22: GPG Private Key Block */
    "-----BEGIN PGP PRIVATE KEY BLOCK-----",

    /* ---- Tier 3: IBANs (longest → shortest) ---- */
    /* 23: Hungary IBAN (HU, 28 chars) */
    "HU[0-9]{2}[0-9]{24}",
    /* 24: Poland IBAN (PL, 28 chars) */
    "PL[0-9]{2}[0-9]{24}",
    /* 25: France IBAN (FR, 27 chars) */
    "FR[0-9]{2}[0-9]{10}[A-Z0-9]{11}[0-9]{2}",
    /* 26: Italy IBAN (IT, 27 chars) */
    "IT[0-9]{2}[A-Z][0-9]{10}[A-Z0-9]{12}",
    /* 27: Portugal IBAN (PT, 25 chars) */
    "PT[0-9]{2}[0-9]{21}",
    /* 28: Spain IBAN (ES, 24 chars) */
    "ES[0-9]{2}[0-9]{20}",
    /* 29: Czechia IBAN (CZ, 24 chars) */
    "CZ[0-9]{2}[0-9]{20}",
    /* 30: Romania IBAN (RO, 24 chars) */
    "RO[0-9]{2}[A-Z]{4}[A-Z0-9]{16}",
    /* 31: Sweden IBAN (SE, 24 chars) */
    "SE[0-9]{2}[0-9]{20}",
    /* 32: Germany IBAN (DE, 22 chars) */
    "DE[0-9]{2}[0-9]{18}",
    /* 33: Ireland IBAN (IE, 22 chars) */
    "IE[0-9]{2}[A-Z]{4}[0-9]{14}",
    /* 34: Switzerland IBAN (CH, 21 chars) */
    "CH[0-9]{2}[0-9]{5}[A-Z0-9]{12}",
    /* 35: Austria IBAN (AT, 20 chars) */
    "AT[0-9]{2}[0-9]{16}",
    /* 36: Netherlands IBAN (NL, 18 chars) */
    "NL[0-9]{2}[A-Z]{4}[0-9]{10}",
    /* 37: Denmark IBAN (DK, 18 chars) */
    "DK[0-9]{2}[0-9]{14}",
    /* 38: Finland IBAN (FI, 18 chars) */
    "FI[0-9]{2}[0-9]{14}",
    /* 39: Belgium IBAN (BE, 16 chars) */
    "BE[0-9]{2}[0-9]{12}",
    /* 40: Norway IBAN (NO, 15 chars) */
    "NO[0-9]{2}[0-9]{11}",

    /* ---- Tier 4: Structured formats (dots, dashes, slashes, @) ---- */
    /* 41: Email Address */
    "[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}",
    /* 42: International Phone Number (E.164) */
    "\\+[0-9]{1,3}[- ]?[0-9][0-9 -]{6,13}[0-9]",
    /* 43: Brazilian CNPJ (XX.XXX.XXX/XXXX-XX) */
    "[0-9]{2}\\.[0-9]{3}\\.[0-9]{3}/[0-9]{4}-[0-9]{2}",
    /* 44: Brazilian CPF (XXX.XXX.XXX-XX) */
    "[0-9]{3}\\.[0-9]{3}\\.[0-9]{3}-[0-9]{2}",
    /* 45: UUID v4 / Scaleway Secret Key */
    "[0-9a-f]{8}-[0-9a-f]{4}-4[0-9a-f]{3}-[89ab][0-9a-f]{3}-[0-9a-f]{12}",
    /* 46: IPv4 address */
    "(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)",
    /* 47: Credit card numbers (Visa, Mastercard, Amex, Discover, JCB) */
    "(4[0-9]{15}|4[0-9]{12}|5[1-5][0-9]{14}|6011[0-9]{12}|65[0-9]{14}|3[47][0-9]{13}|3[068][0-9]{11}|35[0-9]{14})",
    /* 48: Indian Aadhaar (XXXX XXXX XXXX or XXXX-XXXX-XXXX) */
    "[0-9]{4}[- ][0-9]{4}[- ][0-9]{4}",

    /* ---- Tier 5: Letter-anchored patterns ---- */
    /* 49: Mexican CURP (18 alphanum, distinctive structure) */
    "[A-Z]{4}[0-9]{6}[HM][A-Z]{5}[A-Z0-9][0-9]",
    /* 50: Italian CF with omocodia (16 chars) */
    "[A-Z]{6}[0-9LMNPQRSTUV]{2}[ABCDEHLMPRST][0-9LMNPQRSTUV]{2}[A-Z][0-9LMNPQRSTUV]{3}[A-Z]",
    /* 51: Italian CF basic (16 chars) */
    "[A-Z]{6}[0-9]{2}[A-Z][0-9]{2}[A-Z][0-9]{3}[A-Z]",
    /* 52: UK National Insurance Number (AA 99 99 99 A-D) */
    "[A-Z]{2} ?[0-9]{2} ?[0-9]{2} ?[0-9]{2} ?[A-D]",
    /* 53: Spanish NIE (X/Y/Z + 7 digits + letter) */
    "[XYZ][0-9]{7}[A-Z]",
    /* 54: Passport - letter prefix + digits (e.g. AB1234567) */
    "[A-Z]{1,2}[0-9]{6,7}",

    /* ---- Tier 6: Boundary-wrapped structured (dash/dot/slash separated) ---- */
    /* 55: South Korean RRN (YYMMDD-XXXXXXX, 14 chars with dash) */
    "[0-9]{6}-[0-9]{7}",
    /* 56: Swiss AHV Number (756.XXXX.XXXX.XX) */
    "756\\.[0-9]{4}\\.[0-9]{4}\\.[0-9]{2}",
    /* 57: Finnish HETU (DDMMYY[+-A]XXXC) */
    "[0-9]{6}[-+A][0-9]{3}[0-9A-Y]",
    /* 58: Swedish Personnummer (YYMMDD[-+]XXXX) */
    "[0-9]{6}[-+][0-9]{4}",
    /* 59: Danish CPR Number (DDMMYY-XXXX) */
    "[0-9]{6}-[0-9]{4}",
    /* 60: Czech Rodné číslo (YYMMDD/XXXX or YYMMDDXXXX) */
    "[0-9]{6}/?[0-9]{3,4}",
    /* 61: US Social Security Number (XXX-XX-XXXX) */
    "[0-9]{3}-[0-9]{2}-[0-9]{4}",
    /* 62: US ITIN (9XX-XX-XXXX) */
    "9[0-9]{2}-[0-9]{2}-[0-9]{4}",
    /* 63: Canadian SIN (XXX-XXX-XXX) */
    "[0-9]{3}-[0-9]{3}-[0-9]{3}",
    /* 64: Australian TFN (XXX-XXX-XXX or XXX XXX XXX) */
    "[0-9]{3}[- ][0-9]{3}[- ][0-9]{3}",
    /* 65: Indian PAN (5 letters + 4 digits + 1 letter) */
    "[A-Z]{5}[0-9]{4}[A-Z]",
    /* 66: Spanish DNI (8 digits + 1 letter) */
    "[0-9]{8}[A-Z]",
    /* 67: Hungarian Tax ID (starts with 8, 10 digits) */
    "8[0-9]{9}",

    /* ---- Tier 7: Boundary-wrapped pure digits (longest → shortest) ---- */
    /* 68: French NIR / Social Security (15 digits) */
    "[12][0-9]{2}[01][0-9][0-9]{2}[0-9]{3}[0-9]{3}[0-9]{2}",
    /* 69: South African ID (13 digits) */
    "[0-9]{13}",
    /* 70: Romanian CNP (13 digits, first digit 1-8) */
    "[1-8][0-9]{12}",
    /* 71: Japanese My Number (12 digits) */
    "[0-9]{12}",
    /* 72: Polish PESEL (11 digits) */
    "[0-9]{11}",
    /* 73: Belgian National Number (11 digits) */
    "[0-9]{11}",
    /* 74: Norwegian Fødselsnummer (11 digits) */
    "[0-9]{11}",
    /* 75: Passport - 9 consecutive digits */
    "[0-9]{9}",
    /* 76: Dutch BSN (8-9 digits) */
    "[0-9]{8,9}",
    /* 77: Austrian Abgabenkontonummer (9 digits) */
    "[0-9]{9}",
    /* 78: Polish PESEL duplicate */
    "[0-9]{11}"
};
508
+
509
+ static char *wrap_boundary(const char *core); /* forward declaration */
510
+
511
/* ---- Custom pattern registry ---- */

/*
 * One user-registered pattern. `name` and `source` are heap copies owned by
 * the registry entry and released, together with `compiled`, by
 * free_custom_at().
 */
typedef struct {
    char *name; /* unique key used for lookup/replacement */
    char *source; /* original POSIX ERE string, for introspection */
    regex_t compiled; /* compiled form (wrapped when `boundary` is set) */
    int tag; /* TAG_* bit */
    int boundary; /* 1 if compiled with boundary wrapper */
} custom_pattern_t;

/*
 * Growable array of custom patterns: `custom_count` entries are in use out of
 * `custom_cap` allocated slots. Starts empty and is grown by rb_add_pattern.
 */
static custom_pattern_t *custom_patterns = NULL;
static int custom_count = 0;
static int custom_cap = 0;
524
+
525
+ /*
526
+ * Find index of a custom pattern by name, or -1 if not found.
527
+ */
528
+ static int find_custom_by_name(const char *name) {
529
+ for (int i = 0; i < custom_count; i++) {
530
+ if (strcmp(custom_patterns[i].name, name) == 0) return i;
531
+ }
532
+ return -1;
533
+ }
534
+
535
+ static void free_custom_at(int idx) {
536
+ free(custom_patterns[idx].name);
537
+ free(custom_patterns[idx].source);
538
+ regfree(&custom_patterns[idx].compiled);
539
+ }
540
+
541
+ /* ---- Custom pattern Ruby methods ---- */
542
+
543
+ /*
544
+ * DataRedactor._add_pattern(name, source, tag_bit, boundary) -> nil
545
+ *
546
+ * Compile `source` as POSIX ERE (with boundary wrapper when boundary=1),
547
+ * store under `name`. Replaces any existing pattern with the same name.
548
+ * Raises DataRedactor::InvalidPatternError on regcomp failure.
549
+ */
550
+ static VALUE rb_add_pattern(VALUE self, VALUE rb_name, VALUE rb_source,
551
+ VALUE rb_tag_bit, VALUE rb_boundary) {
552
+ Check_Type(rb_name, T_STRING);
553
+ Check_Type(rb_source, T_STRING);
554
+
555
+ const char *name = StringValueCStr(rb_name);
556
+ const char *source = StringValueCStr(rb_source);
557
+ int tag_bit = NUM2INT(rb_tag_bit);
558
+ int boundary = NUM2INT(rb_boundary);
559
+
560
+ /* Build the pattern string (wrap boundary if requested) */
561
+ char *pat_to_compile;
562
+ char *wrapped = NULL;
563
+ if (boundary) {
564
+ wrapped = wrap_boundary(source);
565
+ if (!wrapped) rb_raise(rb_eNoMemError, "wrap_boundary allocation failed");
566
+ pat_to_compile = wrapped;
567
+ } else {
568
+ pat_to_compile = (char *)source;
569
+ }
570
+
571
+ regex_t compiled;
572
+ int ret = regcomp(&compiled, pat_to_compile, REG_EXTENDED);
573
+ free(wrapped);
574
+
575
+ if (ret != 0) {
576
+ char errbuf[256];
577
+ regerror(ret, &compiled, errbuf, sizeof(errbuf));
578
+ regfree(&compiled);
579
+ VALUE eClass = rb_const_get(rb_define_module("DataRedactor"),
580
+ rb_intern("InvalidPatternError"));
581
+ rb_raise(eClass, "%s", errbuf);
582
+ }
583
+
584
+ /* Replace existing or append */
585
+ int idx = find_custom_by_name(name);
586
+ if (idx >= 0) {
587
+ free_custom_at(idx);
588
+ } else {
589
+ if (custom_count >= custom_cap) {
590
+ int new_cap = custom_cap == 0 ? 8 : custom_cap * 2;
591
+ custom_pattern_t *tmp = (custom_pattern_t *)realloc(
592
+ custom_patterns, sizeof(custom_pattern_t) * new_cap);
593
+ if (!tmp) {
594
+ regfree(&compiled);
595
+ rb_raise(rb_eNoMemError, "custom_patterns realloc failed");
596
+ }
597
+ custom_patterns = tmp;
598
+ custom_cap = new_cap;
599
+ }
600
+ idx = custom_count++;
601
+ }
602
+
603
+ custom_patterns[idx].name = strdup(name);
604
+ custom_patterns[idx].source = strdup(source);
605
+ custom_patterns[idx].compiled = compiled;
606
+ custom_patterns[idx].tag = tag_bit;
607
+ custom_patterns[idx].boundary = boundary;
608
+
609
+ if (!custom_patterns[idx].name || !custom_patterns[idx].source) {
610
+ rb_raise(rb_eNoMemError, "strdup failed");
611
+ }
612
+
613
+ return Qnil;
614
+ }
615
+
616
+ /*
617
+ * DataRedactor._remove_pattern(name) -> true/false
618
+ *
619
+ * Remove the named custom pattern. Returns true if found and removed.
620
+ */
621
+ static VALUE rb_remove_pattern(VALUE self, VALUE rb_name) {
622
+ Check_Type(rb_name, T_STRING);
623
+ const char *name = StringValueCStr(rb_name);
624
+
625
+ int idx = find_custom_by_name(name);
626
+ if (idx < 0) return Qfalse;
627
+
628
+ free_custom_at(idx);
629
+
630
+ /* Shift remaining entries left */
631
+ for (int i = idx; i < custom_count - 1; i++) {
632
+ custom_patterns[i] = custom_patterns[i + 1];
633
+ }
634
+ custom_count--;
635
+
636
+ return Qtrue;
637
+ }
638
+
639
+ /*
640
+ * DataRedactor._clear_custom_patterns -> nil
641
+ */
642
+ static VALUE rb_clear_custom_patterns(VALUE self) {
643
+ for (int i = 0; i < custom_count; i++) {
644
+ free_custom_at(i);
645
+ }
646
+ custom_count = 0;
647
+ return Qnil;
648
+ }
649
+
650
+ /*
651
+ * DataRedactor._custom_patterns -> Array<Hash>
652
+ *
653
+ * Returns [{name:, source:, tag_bit:, boundary:}, ...] for each custom pattern.
654
+ */
655
+ static VALUE rb_custom_patterns(VALUE self) {
656
+ VALUE arr = rb_ary_new_capa(custom_count);
657
+ for (int i = 0; i < custom_count; i++) {
658
+ VALUE h = rb_hash_new();
659
+ rb_hash_aset(h, ID2SYM(rb_intern("name")), rb_str_new_cstr(custom_patterns[i].name));
660
+ rb_hash_aset(h, ID2SYM(rb_intern("source")), rb_str_new_cstr(custom_patterns[i].source));
661
+ rb_hash_aset(h, ID2SYM(rb_intern("tag_bit")), INT2NUM(custom_patterns[i].tag));
662
+ rb_hash_aset(h, ID2SYM(rb_intern("boundary")), custom_patterns[i].boundary ? Qtrue : Qfalse);
663
+ rb_ary_push(arr, h);
664
+ }
665
+ return arr;
666
+ }
667
+
668
+ /*
669
+ * Replace all occurrences of a compiled pattern in `input` with PLACEHOLDER.
670
+ *
671
+ * If `use_boundary` is non-zero the pattern was compiled as:
672
+ * (^|[^0-9A-Za-z])(CORE)([^0-9A-Za-z]|$)
673
+ * groups: [0]=full match [1]=left boundary [2]=CORE [3]=right boundary
674
+ * We pass nmatch=4 so the engine fills all four slots, then use matches[1].rm_eo
675
+ * and matches[3].rm_so to locate the exact CORE span. The boundary characters
676
+ * are copied back verbatim so they are not lost.
677
+ *
678
+ * NOTE: CORE must NOT contain additional capture groups — if it does, group
679
+ * indices shift and matches[2]/[3] will be wrong. All boundary-wrapped
680
+ * patterns in pattern_strings[] are written without inner groups for this reason.
681
+ *
682
+ * Returns a newly malloc'd string (caller must free), or NULL on failure.
683
+ */
684
+ static char *replace_all_matches(regex_t *pattern, const char *input,
685
+ int use_boundary, const placeholder_t *ph) {
686
+ size_t ph_max = max_placeholder_len(ph);
687
+ size_t out_cap = strlen(input) * 2 + 512;
688
+ char *output = (char *)malloc(out_cap);
689
+ if (!output) return NULL;
690
+
691
+ /* Scratch buffer for the rendered placeholder (worst-case size). */
692
+ char *ph_buf = (char *)malloc(ph_max + 1);
693
+ if (!ph_buf) { free(output); return NULL; }
694
+
695
+ size_t out_len = 0;
696
+ const char *cursor = input;
697
+ regmatch_t matches[4];
698
+
699
+ while (regexec(pattern, cursor, 4, matches, 0) == 0) {
700
+ regoff_t full_so = matches[0].rm_so;
701
+ regoff_t full_eo = matches[0].rm_eo;
702
+
703
+ if (full_so < 0 || full_eo < full_so) break;
704
+
705
+ regoff_t core_so = full_so;
706
+ regoff_t core_eo = full_eo;
707
+
708
+ if (use_boundary) {
709
+ /* group 1: left boundary char (or empty at ^) */
710
+ if (matches[1].rm_so >= 0 && matches[1].rm_eo > matches[1].rm_so)
711
+ core_so = matches[1].rm_eo;
712
+ /* group 3: right boundary char (or empty at $) */
713
+ if (matches[3].rm_so >= 0 && matches[3].rm_eo > matches[3].rm_so)
714
+ core_eo = matches[3].rm_so;
715
+ }
716
+
717
+ size_t prefix_len = (size_t)core_so;
718
+ size_t suffix_len = (size_t)(full_eo - core_eo);
719
+ size_t match_len = (size_t)(full_eo - full_so);
720
+ size_t core_len = (size_t)(core_eo - core_so);
721
+
722
+ size_t ph_len = write_placeholder(ph_buf, ph, cursor + core_so, core_len);
723
+
724
+ size_t needed = out_len + prefix_len + ph_len + suffix_len + strlen(cursor + full_eo) + 1;
725
+ if (needed > out_cap) {
726
+ out_cap = needed * 2;
727
+ char *tmp = (char *)realloc(output, out_cap);
728
+ if (!tmp) { free(output); free(ph_buf); return NULL; }
729
+ output = tmp;
730
+ }
731
+
732
+ /* Copy prefix (includes left boundary char if present) */
733
+ memcpy(output + out_len, cursor, prefix_len);
734
+ out_len += prefix_len;
735
+
736
+ /* Insert rendered placeholder */
737
+ memcpy(output + out_len, ph_buf, ph_len);
738
+ out_len += ph_len;
739
+
740
+ /* Restore right boundary char */
741
+ if (suffix_len > 0) {
742
+ memcpy(output + out_len, cursor + core_eo, suffix_len);
743
+ out_len += suffix_len;
744
+ }
745
+
746
+ cursor += full_eo;
747
+
748
+ if (match_len == 0) {
749
+ if (*cursor) output[out_len++] = *cursor++;
750
+ else break;
751
+ }
752
+ }
753
+ free(ph_buf);
754
+
755
+ /* Copy the remaining unmatched tail */
756
+ size_t tail_len = strlen(cursor);
757
+ size_t needed = out_len + tail_len + 1;
758
+ if (needed > out_cap) {
759
+ out_cap = needed;
760
+ char *tmp = (char *)realloc(output, out_cap);
761
+ if (!tmp) { free(output); return NULL; }
762
+ output = tmp;
763
+ }
764
+ memcpy(output + out_len, cursor, tail_len);
765
+ out_len += tail_len;
766
+ output[out_len] = '\0';
767
+
768
+ return output;
769
+ }
770
+
771
+ /* Map a TAG_* bit to a short lowercase name used in tagged/hash placeholders. */
772
+ static const char *tag_name_for_bit(int tag_bit) {
773
+ switch (tag_bit) {
774
+ case TAG_CREDENTIALS: return "CREDENTIALS";
775
+ case TAG_FINANCIAL: return "FINANCIAL";
776
+ case TAG_TAX_ID: return "TAX_ID";
777
+ case TAG_NATIONAL_ID: return "NATIONAL_ID";
778
+ case TAG_CONTACT: return "CONTACT";
779
+ case TAG_NETWORK: return "NETWORK";
780
+ case TAG_TRAVEL: return "TRAVEL";
781
+ case TAG_OTHER: return "OTHER";
782
+ case TAG_CUSTOM: return "CUSTOM";
783
+ default: return "REDACTED";
784
+ }
785
+ }
786
+
787
+ /*
788
+ * DataRedactor._redact(text, mask, ph_mode, ph_str) -> String
789
+ *
790
+ * `mask` — integer bitmask of TAG_* values (only / except filtering).
791
+ * `ph_mode` — 0 = plain string, 1 = tagged "[REDACTED:TAG]", 2 = hash "[TAG_xxxx]".
792
+ * `ph_str` — the plain string for mode 0; ignored for modes 1 and 2.
793
+ *
794
+ * The Ruby wrapper builds all four arguments and is the public API.
795
+ */
796
+ static VALUE rb_data_redactor_redact(VALUE self, VALUE rb_text, VALUE rb_mask,
797
+ VALUE rb_ph_mode, VALUE rb_ph_str) {
798
+ Check_Type(rb_text, T_STRING);
799
+ Check_Type(rb_ph_str, T_STRING);
800
+
801
+ int mask = NUM2INT(rb_mask);
802
+ int ph_mode = NUM2INT(rb_ph_mode);
803
+ const char *ph_str_plain = StringValueCStr(rb_ph_str);
804
+
805
+ const char *input = StringValueCStr(rb_text);
806
+ char *working = strdup(input);
807
+ if (!working) rb_raise(rb_eNoMemError, "strdup failed");
808
+
809
+ placeholder_t ph;
810
+ ph.mode = ph_mode;
811
+
812
+ for (int i = 0; i < NUM_PATTERNS; i++) {
813
+ if ((pattern_tags[i] & mask) == 0) continue;
814
+ ph.str = (ph_mode == PLACEHOLDER_MODE_PLAIN)
815
+ ? ph_str_plain
816
+ : tag_name_for_bit(pattern_tags[i]);
817
+ char *result = replace_all_matches(&compiled_patterns[i], working,
818
+ boundary_wrapped[i], &ph);
819
+ free(working);
820
+ if (!result) rb_raise(rb_eNoMemError, "replace_all_matches allocation failed");
821
+ working = result;
822
+ }
823
+
824
+ for (int i = 0; i < custom_count; i++) {
825
+ if ((custom_patterns[i].tag & mask) == 0) continue;
826
+ ph.str = (ph_mode == PLACEHOLDER_MODE_PLAIN)
827
+ ? ph_str_plain
828
+ : tag_name_for_bit(custom_patterns[i].tag);
829
+ char *result = replace_all_matches(&custom_patterns[i].compiled, working,
830
+ custom_patterns[i].boundary, &ph);
831
+ free(working);
832
+ if (!result) rb_raise(rb_eNoMemError, "replace_all_matches allocation failed (custom)");
833
+ working = result;
834
+ }
835
+
836
+ VALUE rb_result = rb_str_new_cstr(working);
837
+ free(working);
838
+ return rb_result;
839
+ }
840
+
841
+ /*
842
+ * DataRedactor._scan(text, mask) -> Hash
843
+ *
844
+ * Returns { redacted: String, matches: Array<Hash> } where each match hash is:
845
+ * { tag: Symbol, name: String, value: String, start: Integer, length: Integer }
846
+ *
847
+ * Matches are reported in the order they are consumed by the sequential redaction
848
+ * loop (built-ins first, most-specific to most-generic; then custom patterns).
849
+ * `start` and `length` refer to byte positions in the *original* input string.
850
+ * Because patterns run sequentially on a shrinking/expanding working buffer,
851
+ * positions are tracked relative to the original by maintaining a running offset.
852
+ */
853
+ static VALUE rb_data_redactor_scan(VALUE self, VALUE rb_text, VALUE rb_mask) {
854
+ Check_Type(rb_text, T_STRING);
855
+ int mask = NUM2INT(rb_mask);
856
+
857
+ const char *input = StringValueCStr(rb_text);
858
+ size_t input_len = strlen(input);
859
+
860
+ /* Working buffer — we redact with the default plain placeholder so the
861
+ * scan result also contains the redacted string. */
862
+ static const placeholder_t ph_default = { PLACEHOLDER_MODE_PLAIN, "[REDACTED]" };
863
+
864
+ char *working = strdup(input);
865
+ if (!working) rb_raise(rb_eNoMemError, "strdup failed");
866
+
867
+ VALUE matches_arr = rb_ary_new();
868
+
869
+ /*
870
+ * To map working-buffer positions back to original-string positions we
871
+ * maintain a log of every replacement already applied. Each entry records
872
+ * where in the *working* buffer the replacement started (after all prior
873
+ * replacements) and how many bytes were removed (orig_len) vs. inserted
874
+ * (always 10, the length of "[REDACTED]").
875
+ *
876
+ * For a new match at working position W:
877
+ * cumulative_shift_before_W = sum of (10 - orig_len) for all prior
878
+ * replacements whose working_pos <= W
879
+ * original_pos = W - cumulative_shift_before_W
880
+ *
881
+ * Replacements are appended in order so the log is already sorted by
882
+ * working_pos; we just walk it linearly per match.
883
+ */
884
+ typedef struct { long wpos; long orig_len; } repl_t;
885
+ repl_t *repl_log = NULL;
886
+ int repl_count = 0;
887
+ int repl_cap = 0;
888
+
889
+ #define REPL_LOG_PUSH(_wpos, _olen) do { \
890
+ if (repl_count >= repl_cap) { \
891
+ int _nc = repl_cap == 0 ? 16 : repl_cap * 2; \
892
+ repl_t *_t = (repl_t *)realloc(repl_log, sizeof(repl_t) * _nc); \
893
+ if (!_t) { free(repl_log); free(working); rb_raise(rb_eNoMemError, "repl_log"); } \
894
+ repl_log = _t; repl_cap = _nc; \
895
+ } \
896
+ repl_log[repl_count].wpos = (_wpos); \
897
+ repl_log[repl_count].orig_len = (_olen); \
898
+ repl_count++; \
899
+ } while (0)
900
+
901
+ /* Map a position in the current working buffer to original-string position. */
902
+ #define WORKING_TO_ORIG(_wpos) ({ \
903
+ long _shift = 0; \
904
+ for (int _ri = 0; _ri < repl_count; _ri++) { \
905
+ if (repl_log[_ri].wpos <= (_wpos)) \
906
+ _shift += 10 - repl_log[_ri].orig_len; \
907
+ } \
908
+ (_wpos) - _shift; \
909
+ })
910
+
911
+ /* Collect matches for one pattern on the current working buffer, translate
912
+ * positions to original coordinates, then do the replacement. */
913
+ #define COLLECT_AND_REPLACE(pat, use_bnd, tag_bit, pat_name) do { \
914
+ const char *_cur = working; \
915
+ regmatch_t _m[4]; \
916
+ while (regexec((pat), _cur, 4, _m, 0) == 0) { \
917
+ regoff_t _fso = _m[0].rm_so, _feo = _m[0].rm_eo; \
918
+ if (_fso < 0 || _feo < _fso) break; \
919
+ regoff_t _cso = _fso, _ceo = _feo; \
920
+ if (use_bnd) { \
921
+ if (_m[1].rm_so >= 0 && _m[1].rm_eo > _m[1].rm_so) \
922
+ _cso = _m[1].rm_eo; \
923
+ if (_m[3].rm_so >= 0 && _m[3].rm_eo > _m[3].rm_so) \
924
+ _ceo = _m[3].rm_so; \
925
+ } \
926
+ size_t _vlen = (size_t)(_ceo - _cso); \
927
+ long _wpos = (long)(_cur - working) + (long)_cso; \
928
+ long _orig = WORKING_TO_ORIG(_wpos); \
929
+ VALUE _match = rb_hash_new(); \
930
+ rb_hash_aset(_match, ID2SYM(rb_intern("tag")), \
931
+ ID2SYM(rb_intern(tag_name_for_bit(tag_bit)))); \
932
+ rb_hash_aset(_match, ID2SYM(rb_intern("name")), \
933
+ rb_str_new_cstr(pat_name)); \
934
+ rb_hash_aset(_match, ID2SYM(rb_intern("value")), \
935
+ rb_str_new(_cur + _cso, _vlen)); \
936
+ rb_hash_aset(_match, ID2SYM(rb_intern("start")), \
937
+ LONG2NUM(_orig)); \
938
+ rb_hash_aset(_match, ID2SYM(rb_intern("length")), \
939
+ LONG2NUM((long)_vlen)); \
940
+ rb_ary_push(matches_arr, _match); \
941
+ /* Log this replacement; wpos advances by 10 for subsequent entries */ \
942
+ REPL_LOG_PUSH(_wpos, (long)_vlen); \
943
+ /* Re-anchor cursor: skip past the full match in working buf */ \
944
+ if (_feo == _fso) { if (*_cur) _cur++; else break; } \
945
+ else _cur += _feo; \
946
+ } \
947
+ char *_next = replace_all_matches((pat), working, (use_bnd), &ph_default); \
948
+ free(working); \
949
+ if (!_next) { free(repl_log); rb_raise(rb_eNoMemError, "replace_all_matches failed in scan"); } \
950
+ working = _next; \
951
+ } while (0)
952
+
953
+ for (int i = 0; i < NUM_PATTERNS; i++) {
954
+ if ((pattern_tags[i] & mask) == 0) continue;
955
+ COLLECT_AND_REPLACE(&compiled_patterns[i], boundary_wrapped[i],
956
+ pattern_tags[i], pattern_names[i]);
957
+ }
958
+
959
+ for (int i = 0; i < custom_count; i++) {
960
+ if ((custom_patterns[i].tag & mask) == 0) continue;
961
+ COLLECT_AND_REPLACE(&custom_patterns[i].compiled,
962
+ custom_patterns[i].boundary,
963
+ custom_patterns[i].tag, custom_patterns[i].name);
964
+ }
965
+
966
+ #undef COLLECT_AND_REPLACE
967
+ #undef WORKING_TO_ORIG
968
+ #undef REPL_LOG_PUSH
969
+
970
+ free(repl_log);
971
+
972
+ VALUE result = rb_hash_new();
973
+ VALUE rb_redacted = rb_str_new_cstr(working);
974
+ free(working);
975
+ rb_hash_aset(result, ID2SYM(rb_intern("redacted")), rb_redacted);
976
+ rb_hash_aset(result, ID2SYM(rb_intern("matches")), matches_arr);
977
+ return result;
978
+
979
+ (void)input_len; /* suppress unused-variable warning */
980
+ }
981
+
982
/*
 * Surround `core` with non-alphanumeric boundary groups, producing
 *   (^|[^0-9A-Za-z])(CORE)([^0-9A-Za-z]|$)
 * so that group 1 is the left boundary, group 2 the pattern itself and
 * group 3 the right boundary.
 *
 * Returns a freshly malloc'ed NUL-terminated string that the caller must
 * free, or NULL when the allocation fails.
 */
static char *wrap_boundary(const char *core) {
    static const char head[] = "(^|[^0-9A-Za-z])(";
    static const char tail[] = ")([^0-9A-Za-z]|$)";
    const size_t head_len = sizeof head - 1;
    const size_t tail_len = sizeof tail - 1;
    const size_t core_len = strlen(core);

    char *wrapped = (char *)malloc(head_len + core_len + tail_len + 1);
    if (wrapped == NULL) {
        return NULL;
    }

    char *out = wrapped;
    memcpy(out, head, head_len);
    out += head_len;
    memcpy(out, core, core_len);
    out += core_len;
    memcpy(out, tail, tail_len + 1); /* +1 copies the terminating NUL */
    return wrapped;
}
996
+
997
+ void Init_data_redactor(void) {
998
+ /* Compile all regex patterns at load time */
999
+ for (int i = 0; i < NUM_PATTERNS; i++) {
1000
+ const char *pat;
1001
+ char *wrapped = NULL;
1002
+
1003
+ if (boundary_wrapped[i]) {
1004
+ wrapped = wrap_boundary(pattern_strings[i]);
1005
+ if (!wrapped) {
1006
+ rb_raise(rb_eNoMemError, "wrap_boundary allocation failed for pattern %d", i);
1007
+ }
1008
+ pat = wrapped;
1009
+ } else {
1010
+ pat = pattern_strings[i];
1011
+ }
1012
+
1013
+ int ret = regcomp(&compiled_patterns[i], pat, REG_EXTENDED);
1014
+ free(wrapped); /* safe to free after regcomp copies the pattern */
1015
+
1016
+ if (ret != 0) {
1017
+ char errbuf[256];
1018
+ regerror(ret, &compiled_patterns[i], errbuf, sizeof(errbuf));
1019
+ rb_raise(rb_eRuntimeError, "Failed to compile pattern %d: %s", i, errbuf);
1020
+ }
1021
+ }
1022
+
1023
+ VALUE mDataRedactor = rb_define_module("DataRedactor");
1024
+ rb_define_module_function(mDataRedactor, "_redact", rb_data_redactor_redact, 4);
1025
+ rb_define_module_function(mDataRedactor, "_scan", rb_data_redactor_scan, 2);
1026
+ rb_define_module_function(mDataRedactor, "_add_pattern", rb_add_pattern, 4);
1027
+ rb_define_module_function(mDataRedactor, "_remove_pattern", rb_remove_pattern, 1);
1028
+ rb_define_module_function(mDataRedactor, "_clear_custom_patterns",rb_clear_custom_patterns, 0);
1029
+ rb_define_module_function(mDataRedactor, "_custom_patterns", rb_custom_patterns, 0);
1030
+
1031
+ /* Placeholder mode constants. */
1032
+ rb_define_const(mDataRedactor, "PH_MODE_PLAIN", INT2NUM(PLACEHOLDER_MODE_PLAIN));
1033
+ rb_define_const(mDataRedactor, "PH_MODE_TAGGED", INT2NUM(PLACEHOLDER_MODE_TAGGED));
1034
+ rb_define_const(mDataRedactor, "PH_MODE_HASH", INT2NUM(PLACEHOLDER_MODE_HASH));
1035
+
1036
+ /* Expose tag bitmask values so the Ruby wrapper can build the mask. */
1037
+ rb_define_const(mDataRedactor, "TAG_CREDENTIALS", INT2NUM(TAG_CREDENTIALS));
1038
+ rb_define_const(mDataRedactor, "TAG_FINANCIAL", INT2NUM(TAG_FINANCIAL));
1039
+ rb_define_const(mDataRedactor, "TAG_TAX_ID", INT2NUM(TAG_TAX_ID));
1040
+ rb_define_const(mDataRedactor, "TAG_NATIONAL_ID", INT2NUM(TAG_NATIONAL_ID));
1041
+ rb_define_const(mDataRedactor, "TAG_CONTACT", INT2NUM(TAG_CONTACT));
1042
+ rb_define_const(mDataRedactor, "TAG_NETWORK", INT2NUM(TAG_NETWORK));
1043
+ rb_define_const(mDataRedactor, "TAG_TRAVEL", INT2NUM(TAG_TRAVEL));
1044
+ rb_define_const(mDataRedactor, "TAG_OTHER", INT2NUM(TAG_OTHER));
1045
+ rb_define_const(mDataRedactor, "TAG_CUSTOM", INT2NUM(TAG_CUSTOM));
1046
+ rb_define_const(mDataRedactor, "TAG_ALL", INT2NUM(TAG_ALL));
1047
+ }