data_redactor 0.5.0 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -5,4 +5,8 @@ abort "Missing regex.h" unless have_header("regex.h")
5
5
  abort "Missing stdlib.h" unless have_header("stdlib.h")
6
6
  abort "Missing string.h" unless have_header("string.h")
7
7
 
8
+ # Compile every .c file in this directory. Order doesn't matter; mkmf
9
+ # generates per-object rules.
10
+ $srcs = Dir.glob("#{__dir__}/*.c").map { |f| File.basename(f) }
11
+
8
12
  create_makefile("data_redactor/data_redactor")
@@ -0,0 +1,430 @@
1
+ #include "patterns.h"
2
+ #include "tags.h"
3
+
4
+ regex_t compiled_patterns[NUM_PATTERNS];
5
+
6
+ /*
7
+ * Patterns that consist of generic digit/alphanum sequences with no distinctive
8
+ * prefix are wrapped with word-boundary groups:
9
+ * (^|[^0-9A-Za-z])(PATTERN)([^0-9A-Za-z]|$)
10
+ * The boundary_wrapped flag tells replace_all_matches to use sub-match [2]
11
+ * (the actual sensitive token) rather than the full match, so the surrounding
12
+ * non-word characters are preserved and not replaced.
13
+ */
14
+ /*
15
+ * ORDERING: Most specific / longest patterns first, most generic last.
16
+ * This prevents shorter patterns from consuming parts of longer matches.
17
+ *
18
+ * Tiers:
19
+ * 1. Full URLs (longest, most distinctive)
20
+ * 2. Long prefixed tokens (API keys, PATs)
21
+ * 3. IBANs (country prefix + fixed length)
22
+ * 4. Structured formats (dots, dashes, slashes)
23
+ * 5. Short prefixed / letter-anchored patterns
24
+ * 6. Boundary-wrapped structured (dash/dot separated digits)
25
+ * 7. Boundary-wrapped pure digits (longest → shortest)
26
+ */
27
+ const int boundary_wrapped[NUM_PATTERNS] = {
28
+ /* ---- Tier 1: Full URLs ---- */
29
+ 0, /* 0: AWS S3 Presigned URL */
30
+ 0, /* 1: Microsoft Teams Webhook */
31
+ 0, /* 2: Slack Webhook URL */
32
+ 0, /* 3: MongoDB Connection String */
33
+ 0, /* 4: URI with Embedded Password */
34
+ /* ---- Tier 2: Long prefixed tokens ---- */
35
+ 0, /* 5: GitHub PAT (fine-grained, 93 chars) */
36
+ 0, /* 6: JWT */
37
+ 0, /* 7: Grafana API Token */
38
+ 0, /* 8: SSH Public Key */
39
+ 0, /* 9: Bearer Token */
40
+ 0, /* 10: Google API Key (39 chars) */
41
+ 0, /* 11: AWS Access Key ID (20 chars) */
42
+ 0, /* 12: AWS Secret Access Key (40 base64) */
43
+ 0, /* 13: SendGrid API Key */
44
+ 0, /* 14: Amazon MWS Auth Token */
45
+ 0, /* 15: LaunchDarkly API Key */
46
+ 0, /* 16: GitHub Classic PAT (ghp_) */
47
+ 0, /* 17: GitHub OAuth Token (gho_) */
48
+ 0, /* 18: Stripe Secret Key */
49
+ 0, /* 19: ClickUp API Key */
50
+ 0, /* 20: Scaleway Access Key */
51
+ 0, /* 21: PEM private key header (generic) */
52
+ 0, /* 22: GPG Private Key Block */
53
+ /* ---- Tier 3: IBANs (longest → shortest) ---- */
54
+ 0, /* 23: Hungary IBAN (28 chars) */
55
+ 0, /* 24: Poland IBAN (28 chars) */
56
+ 0, /* 25: France IBAN (27 chars) */
57
+ 0, /* 26: Italy IBAN (27 chars) */
58
+ 0, /* 27: Portugal IBAN (25 chars) */
59
+ 0, /* 28: Spain IBAN (24 chars) */
60
+ 0, /* 29: Czechia IBAN (24 chars) */
61
+ 0, /* 30: Romania IBAN (24 chars) */
62
+ 0, /* 31: Sweden IBAN (24 chars) */
63
+ 0, /* 32: Germany IBAN (22 chars) */
64
+ 0, /* 33: Ireland IBAN (22 chars) */
65
+ 0, /* 34: Switzerland IBAN (21 chars) */
66
+ 0, /* 35: Austria IBAN (20 chars) */
67
+ 0, /* 36: Netherlands IBAN (18 chars) */
68
+ 0, /* 37: Denmark IBAN (18 chars) */
69
+ 0, /* 38: Finland IBAN (18 chars) */
70
+ 0, /* 39: Belgium IBAN (16 chars) */
71
+ 0, /* 40: Norway IBAN (15 chars) */
72
+ /* ---- Tier 4: Structured formats (dots, dashes, slashes, @) ---- */
73
+ 0, /* 41: Email Address */
74
+ 0, /* 42: International Phone Number */
75
+ 0, /* 43: Brazilian CNPJ (XX.XXX.XXX/XXXX-XX) */
76
+ 0, /* 44: Brazilian CPF (XXX.XXX.XXX-XX) */
77
+ 0, /* 45: UUID v4 */
78
+ 0, /* 46: IPv4 address */
79
+ 0, /* 47: Credit card numbers */
80
+ 0, /* 48: Indian Aadhaar (XXXX XXXX XXXX) */
81
+ /* ---- Tier 5: Letter-anchored patterns ---- */
82
+ 0, /* 49: Mexican CURP (18 alphanum, distinctive structure) */
83
+ 0, /* 50: Italian CF with omocodia (16 chars) */
84
+ 0, /* 51: Italian CF basic (16 chars) */
85
+ 0, /* 52: UK National Insurance Number */
86
+ 0, /* 53: Spanish NIE (X/Y/Z prefix) */
87
+ 0, /* 54: Passport letter prefix + digits */
88
+ /* ---- Tier 6: Boundary-wrapped structured (dash/dot/slash separated) ---- */
89
+ 1, /* 55: South Korean RRN (YYMMDD-XXXXXXX, 14 chars) */
90
+ 1, /* 56: Swiss AHV Number (756.XXXX.XXXX.XX) */
91
+ 1, /* 57: Finnish HETU (DDMMYY[+-A]XXXC) */
92
+ 1, /* 58: Swedish Personnummer (YYMMDD[-+]XXXX) */
93
+ 1, /* 59: Danish CPR Number (DDMMYY-XXXX) */
94
+ 1, /* 60: Czech Rodné číslo (YYMMDD/XXXX) */
95
+ 1, /* 61: US Social Security Number (XXX-XX-XXXX) */
96
+ 1, /* 62: US ITIN (9XX-XX-XXXX) */
97
+ 1, /* 63: Canadian SIN (XXX-XXX-XXX) */
98
+ 1, /* 64: Australian TFN (XXX-XXX-XXX) */
99
+ 1, /* 65: Indian PAN (AAAAA0000A) */
100
+ 1, /* 66: Spanish DNI (8 digits + letter) */
101
+ 1, /* 67: Hungarian Tax ID (8XXXXXXXXX, 10 digits) */
102
+ /* ---- Tier 7: Boundary-wrapped pure digits (longest → shortest) ---- */
103
+ 1, /* 68: French NIR (15 digits) */
104
+ 1, /* 69: South African ID (13 digits) */
105
+ 1, /* 70: Romanian CNP (13 digits) */
106
+ 1, /* 71: Japanese My Number (12 digits) */
107
+ 1, /* 72: Polish PESEL (11 digits) */
108
+ 1, /* 73: Belgian National Number (11 digits) */
109
+ 1, /* 74: Norwegian Fødselsnummer (11 digits) */
110
+ 1, /* 75: Passport 9 digits */
111
+ 1, /* 76: Dutch BSN (8-9 digits) */
112
+ 1, /* 77: Austrian Abgabenkontonummer (9 digits) */
113
+ 1 /* 78: Polish PESEL duplicate */
114
+ };
115
+
116
+ /*
117
+ * Tag for each pattern. Exactly one tag per pattern. Used to filter which
118
+ * patterns run when the caller passes a mask (only/except).
119
+ */
120
+ const int pattern_tags[NUM_PATTERNS] = {
121
+ /* 0-22: secrets, API keys, tokens, private keys, webhooks */
122
+ TAG_CREDENTIALS, TAG_CREDENTIALS, TAG_CREDENTIALS, TAG_CREDENTIALS, TAG_CREDENTIALS,
123
+ TAG_CREDENTIALS, TAG_CREDENTIALS, TAG_CREDENTIALS, TAG_CREDENTIALS, TAG_CREDENTIALS,
124
+ TAG_CREDENTIALS, TAG_CREDENTIALS, TAG_CREDENTIALS, TAG_CREDENTIALS, TAG_CREDENTIALS,
125
+ TAG_CREDENTIALS, TAG_CREDENTIALS, TAG_CREDENTIALS, TAG_CREDENTIALS, TAG_CREDENTIALS,
126
+ TAG_CREDENTIALS, TAG_CREDENTIALS, TAG_CREDENTIALS,
127
+ /* 23-40: IBANs */
128
+ TAG_FINANCIAL, TAG_FINANCIAL, TAG_FINANCIAL, TAG_FINANCIAL, TAG_FINANCIAL,
129
+ TAG_FINANCIAL, TAG_FINANCIAL, TAG_FINANCIAL, TAG_FINANCIAL, TAG_FINANCIAL,
130
+ TAG_FINANCIAL, TAG_FINANCIAL, TAG_FINANCIAL, TAG_FINANCIAL, TAG_FINANCIAL,
131
+ TAG_FINANCIAL, TAG_FINANCIAL, TAG_FINANCIAL,
132
+ TAG_CONTACT, /* 41: email */
133
+ TAG_CONTACT, /* 42: phone */
134
+ TAG_TAX_ID, /* 43: Brazilian CNPJ */
135
+ TAG_TAX_ID, /* 44: Brazilian CPF */
136
+ TAG_OTHER, /* 45: UUID v4 */
137
+ TAG_NETWORK, /* 46: IPv4 */
138
+ TAG_FINANCIAL, /* 47: credit card */
139
+ TAG_NATIONAL_ID, /* 48: Indian Aadhaar */
140
+ TAG_NATIONAL_ID, /* 49: Mexican CURP */
141
+ TAG_TAX_ID, /* 50: Italian CF (omocodia) */
142
+ TAG_TAX_ID, /* 51: Italian CF (basic) */
143
+ TAG_NATIONAL_ID, /* 52: UK NIN */
144
+ TAG_NATIONAL_ID, /* 53: Spanish NIE */
145
+ TAG_TRAVEL, /* 54: passport letter prefix */
146
+ TAG_NATIONAL_ID, /* 55: Korean RRN */
147
+ TAG_NATIONAL_ID, /* 56: Swiss AHV */
148
+ TAG_NATIONAL_ID, /* 57: Finnish HETU */
149
+ TAG_NATIONAL_ID, /* 58: Swedish Personnummer */
150
+ TAG_NATIONAL_ID, /* 59: Danish CPR */
151
+ TAG_NATIONAL_ID, /* 60: Czech Rodné číslo */
152
+ TAG_NATIONAL_ID, /* 61: US SSN */
153
+ TAG_TAX_ID, /* 62: US ITIN */
154
+ TAG_NATIONAL_ID, /* 63: Canadian SIN */
155
+ TAG_TAX_ID, /* 64: Australian TFN */
156
+ TAG_TAX_ID, /* 65: Indian PAN */
157
+ TAG_NATIONAL_ID, /* 66: Spanish DNI */
158
+ TAG_TAX_ID, /* 67: Hungarian Tax ID */
159
+ TAG_NATIONAL_ID, /* 68: French NIR */
160
+ TAG_NATIONAL_ID, /* 69: South African ID */
161
+ TAG_NATIONAL_ID, /* 70: Romanian CNP */
162
+ TAG_TAX_ID, /* 71: Japanese My Number */
163
+ TAG_NATIONAL_ID, /* 72: Polish PESEL */
164
+ TAG_NATIONAL_ID, /* 73: Belgian National Number */
165
+ TAG_NATIONAL_ID, /* 74: Norwegian Fødselsnummer */
166
+ TAG_TRAVEL, /* 75: passport 9 digits */
167
+ TAG_NATIONAL_ID, /* 76: Dutch BSN */
168
+ TAG_TAX_ID, /* 77: Austrian Abgabenkontonummer */
169
+ TAG_NATIONAL_ID /* 78: Polish PESEL duplicate */
170
+ };
171
+
172
+ const char *pattern_names[NUM_PATTERNS] = {
173
+ "aws_s3_presigned_url", /* 0 */
174
+ "microsoft_teams_webhook", /* 1 */
175
+ "slack_webhook_url", /* 2 */
176
+ "mongodb_connection_string", /* 3 */
177
+ "uri_with_password", /* 4 */
178
+ "github_pat_fine_grained", /* 5 */
179
+ "jwt", /* 6 */
180
+ "grafana_api_token", /* 7 */
181
+ "ssh_public_key", /* 8 */
182
+ "bearer_token", /* 9 */
183
+ "google_api_key", /* 10 */
184
+ "aws_access_key_id", /* 11 */
185
+ "aws_secret_access_key", /* 12 */
186
+ "sendgrid_api_key", /* 13 */
187
+ "amazon_mws_auth_token", /* 14 */
188
+ "launchdarkly_api_key", /* 15 */
189
+ "github_classic_pat", /* 16 */
190
+ "github_oauth_token", /* 17 */
191
+ "stripe_secret_key", /* 18 */
192
+ "clickup_api_key", /* 19 */
193
+ "scaleway_access_key", /* 20 */
194
+ "pem_private_key", /* 21 */
195
+ "gpg_private_key", /* 22 */
196
+ "iban_hu", /* 23 */
197
+ "iban_pl", /* 24 */
198
+ "iban_fr", /* 25 */
199
+ "iban_it", /* 26 */
200
+ "iban_pt", /* 27 */
201
+ "iban_es", /* 28 */
202
+ "iban_cz", /* 29 */
203
+ "iban_ro", /* 30 */
204
+ "iban_se", /* 31 */
205
+ "iban_de", /* 32 */
206
+ "iban_ie", /* 33 */
207
+ "iban_ch", /* 34 */
208
+ "iban_at", /* 35 */
209
+ "iban_nl", /* 36 */
210
+ "iban_dk", /* 37 */
211
+ "iban_fi", /* 38 */
212
+ "iban_be", /* 39 */
213
+ "iban_no", /* 40 */
214
+ "email", /* 41 */
215
+ "phone_e164", /* 42 */
216
+ "brazilian_cnpj", /* 43 */
217
+ "brazilian_cpf", /* 44 */
218
+ "uuid_v4", /* 45 */
219
+ "ipv4", /* 46 */
220
+ "credit_card", /* 47 */
221
+ "indian_aadhaar", /* 48 */
222
+ "mexican_curp", /* 49 */
223
+ "italian_cf_omocodia", /* 50 */
224
+ "italian_cf", /* 51 */
225
+ "uk_nin", /* 52 */
226
+ "spanish_nie", /* 53 */
227
+ "passport_letter_prefix", /* 54 */
228
+ "korean_rrn", /* 55 */
229
+ "swiss_ahv", /* 56 */
230
+ "finnish_hetu", /* 57 */
231
+ "swedish_personnummer", /* 58 */
232
+ "danish_cpr", /* 59 */
233
+ "czech_rodne_cislo", /* 60 */
234
+ "us_ssn", /* 61 */
235
+ "us_itin", /* 62 */
236
+ "canadian_sin", /* 63 */
237
+ "australian_tfn", /* 64 */
238
+ "indian_pan", /* 65 */
239
+ "spanish_dni", /* 66 */
240
+ "hungarian_tax_id", /* 67 */
241
+ "french_nir", /* 68 */
242
+ "south_african_id", /* 69 */
243
+ "romanian_cnp", /* 70 */
244
+ "japanese_my_number", /* 71 */
245
+ "polish_pesel", /* 72 */
246
+ "belgian_national_number", /* 73 */
247
+ "norwegian_fodselsnummer", /* 74 */
248
+ "passport_9digits", /* 75 */
249
+ "dutch_bsn", /* 76 */
250
+ "austrian_abgabenkontonummer", /* 77 */
251
+ "polish_pesel_2" /* 78 */
252
+ };
253
+
254
+ /*
255
+ * Raw patterns. Boundary-wrapped patterns are stored unwrapped here;
256
+ * the wrapper is applied in Init_data_redactor, when each regex is compiled at runtime.
257
+ */
258
+ const char *pattern_strings[NUM_PATTERNS] = {
259
+ /* ---- Tier 1: Full URLs ---- */
260
+ /* 0: AWS S3 Presigned URL */
261
+ "https://[a-z0-9.-]+\\.s3\\.amazonaws\\.com/[^[:space:]?]+\\?[^[:space:]]*X-Amz-Signature=[^[:space:]]+",
262
+ /* 1: Microsoft Teams Incoming Webhook */
263
+ "https://[a-z0-9-]+\\.webhook\\.office\\.com/webhookb2/[a-fA-F0-9-]{36}@[a-fA-F0-9-]{36}/[^/ ]+/[a-fA-F0-9]{32}/[a-fA-F0-9-]{36}",
264
+ /* 2: Slack Webhook URL */
265
+ "https://hooks\\.slack\\.com/services/T[A-Z0-9]{8}/B[A-Z0-9]{8}/[A-Za-z0-9]{24}",
266
+ /* 3: MongoDB Connection String (with credentials) */
267
+ "mongodb(\\+srv)?://[^[:space:]'\"<>/:@]+:[^[:space:]'\"<>/@]+@[^[:space:]?'\"]+",
268
+ /* 4: URI with Embedded Password (scheme://user:pass@host) */
269
+ "[A-Za-z][A-Za-z0-9+_-]*://[^[:space:]/?#:@]+:[^[:space:]/?#@]+@[A-Za-z0-9.-]+",
270
+
271
+ /* ---- Tier 2: Long prefixed tokens ---- */
272
+ /* 5: GitHub PAT fine-grained (github_pat_ + 82 chars) */
273
+ "github_pat_[0-9a-zA-Z_]{82}",
274
+ /* 6: JWT (three base64url segments) */
275
+ "eyJ[A-Za-z0-9_-]{10,}\\.eyJ[A-Za-z0-9_-]{10,}\\.[A-Za-z0-9_-]+",
276
+ /* 7: Grafana API Token ("eyJrIjoi" is the base64 encoding of {"k":") */
277
+ "eyJrIjoi[A-Za-z0-9_=-]{42,}",
278
+ /* 8: SSH Public Key */
279
+ "ssh-(rsa|ed25519|ecdsa) [a-zA-Z0-9/+=]{20,}",
280
+ /* 9: Bearer Token */
281
+ "[Bb]earer [a-zA-Z0-9_.=/+:-]{12,}",
282
+ /* 10: Google API Key (AIza + 35 chars) */
283
+ "AIza[0-9A-Za-z_-]{35}",
284
+ /* 11: AWS Access Key ID (all prefixes + 16 chars) */
285
+ "(A3T[A-Z0-9]|AKIA|ABIA|ACCA|AGPA|AIDA|ANPA|ANVA|APKA|AROA|ASCA|ASIA)[A-Z2-7]{16}",
286
+ /* 12: AWS Secret Access Key (40 base64 chars) */
287
+ "[A-Za-z0-9/+=]{40}",
288
+ /* 13: SendGrid API Key */
289
+ "SG\\.[a-zA-Z0-9_-]{5,}\\.[a-zA-Z0-9_-]{5,}",
290
+ /* 14: Amazon MWS Auth Token */
291
+ "amzn\\.mws\\.[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}",
292
+ /* 15: LaunchDarkly API Key (api-UUID or sdk-UUID) */
293
+ "(api|sdk)-[a-f0-9]{8}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{12}",
294
+ /* 16: GitHub Classic PAT (ghp_ + 36 chars) */
295
+ "ghp_[0-9a-zA-Z]{36}",
296
+ /* 17: GitHub OAuth Token (gho_ + 36 chars) */
297
+ "gho_[0-9a-zA-Z]{36}",
298
+ /* 18: Stripe Secret Key (sk_live_ + 24 chars) */
299
+ "sk_live_[0-9a-zA-Z]{24}",
300
+ /* 19: ClickUp API Key */
301
+ "pk_[0-9]{6,8}_[A-Z0-9]{32}",
302
+ /* 20: Scaleway Access Key (SCW + 17 chars) */
303
+ "SCW[A-Z0-9]{17}",
304
+ /* 21: PEM private key header (generic) */
305
+ "-----BEGIN [A-Z ]*PRIVATE KEY-----",
306
+ /* 22: GPG Private Key Block */
307
+ "-----BEGIN PGP PRIVATE KEY BLOCK-----",
308
+
309
+ /* ---- Tier 3: IBANs (longest → shortest) ---- */
310
+ /* 23: Hungary IBAN (HU, 28 chars) */
311
+ "HU[0-9]{2}[0-9]{24}",
312
+ /* 24: Poland IBAN (PL, 28 chars) */
313
+ "PL[0-9]{2}[0-9]{24}",
314
+ /* 25: France IBAN (FR, 27 chars) */
315
+ "FR[0-9]{2}[0-9]{10}[A-Z0-9]{11}[0-9]{2}",
316
+ /* 26: Italy IBAN (IT, 27 chars) */
317
+ "IT[0-9]{2}[A-Z][0-9]{10}[A-Z0-9]{12}",
318
+ /* 27: Portugal IBAN (PT, 25 chars) */
319
+ "PT[0-9]{2}[0-9]{21}",
320
+ /* 28: Spain IBAN (ES, 24 chars) */
321
+ "ES[0-9]{2}[0-9]{20}",
322
+ /* 29: Czechia IBAN (CZ, 24 chars) */
323
+ "CZ[0-9]{2}[0-9]{20}",
324
+ /* 30: Romania IBAN (RO, 24 chars) */
325
+ "RO[0-9]{2}[A-Z]{4}[A-Z0-9]{16}",
326
+ /* 31: Sweden IBAN (SE, 24 chars) */
327
+ "SE[0-9]{2}[0-9]{20}",
328
+ /* 32: Germany IBAN (DE, 22 chars) */
329
+ "DE[0-9]{2}[0-9]{18}",
330
+ /* 33: Ireland IBAN (IE, 22 chars) */
331
+ "IE[0-9]{2}[A-Z]{4}[0-9]{14}",
332
+ /* 34: Switzerland IBAN (CH, 21 chars) */
333
+ "CH[0-9]{2}[0-9]{5}[A-Z0-9]{12}",
334
+ /* 35: Austria IBAN (AT, 20 chars) */
335
+ "AT[0-9]{2}[0-9]{16}",
336
+ /* 36: Netherlands IBAN (NL, 18 chars) */
337
+ "NL[0-9]{2}[A-Z]{4}[0-9]{10}",
338
+ /* 37: Denmark IBAN (DK, 18 chars) */
339
+ "DK[0-9]{2}[0-9]{14}",
340
+ /* 38: Finland IBAN (FI, 18 chars) */
341
+ "FI[0-9]{2}[0-9]{14}",
342
+ /* 39: Belgium IBAN (BE, 16 chars) */
343
+ "BE[0-9]{2}[0-9]{12}",
344
+ /* 40: Norway IBAN (NO, 15 chars) */
345
+ "NO[0-9]{2}[0-9]{11}",
346
+
347
+ /* ---- Tier 4: Structured formats (dots, dashes, slashes, @) ---- */
348
+ /* 41: Email Address */
349
+ "[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}",
350
+ /* 42: International Phone Number (E.164) */
351
+ "\\+[0-9]{1,3}[- ]?[0-9][0-9 -]{6,13}[0-9]",
352
+ /* 43: Brazilian CNPJ (XX.XXX.XXX/XXXX-XX) */
353
+ "[0-9]{2}\\.[0-9]{3}\\.[0-9]{3}/[0-9]{4}-[0-9]{2}",
354
+ /* 44: Brazilian CPF (XXX.XXX.XXX-XX) */
355
+ "[0-9]{3}\\.[0-9]{3}\\.[0-9]{3}-[0-9]{2}",
356
+ /* 45: UUID v4 / Scaleway Secret Key */
357
+ "[0-9a-f]{8}-[0-9a-f]{4}-4[0-9a-f]{3}-[89ab][0-9a-f]{3}-[0-9a-f]{12}",
358
+ /* 46: IPv4 address */
359
+ "(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)",
360
+ /* 47: Credit card numbers (Visa, Mastercard, Amex, Discover, JCB) */
361
+ "(4[0-9]{15}|4[0-9]{12}|5[1-5][0-9]{14}|6011[0-9]{12}|65[0-9]{14}|3[47][0-9]{13}|3[068][0-9]{11}|35[0-9]{14})",
362
+ /* 48: Indian Aadhaar (XXXX XXXX XXXX or XXXX-XXXX-XXXX) */
363
+ "[0-9]{4}[- ][0-9]{4}[- ][0-9]{4}",
364
+
365
+ /* ---- Tier 5: Letter-anchored patterns ---- */
366
+ /* 49: Mexican CURP (18 alphanum, distinctive structure) */
367
+ "[A-Z]{4}[0-9]{6}[HM][A-Z]{5}[A-Z0-9][0-9]",
368
+ /* 50: Italian CF with omocodia (16 chars) */
369
+ "[A-Z]{6}[0-9LMNPQRSTUV]{2}[ABCDEHLMPRST][0-9LMNPQRSTUV]{2}[A-Z][0-9LMNPQRSTUV]{3}[A-Z]",
370
+ /* 51: Italian CF basic (16 chars) */
371
+ "[A-Z]{6}[0-9]{2}[A-Z][0-9]{2}[A-Z][0-9]{3}[A-Z]",
372
+ /* 52: UK National Insurance Number (AA 99 99 99 A-D) */
373
+ "[A-Z]{2} ?[0-9]{2} ?[0-9]{2} ?[0-9]{2} ?[A-D]",
374
+ /* 53: Spanish NIE (X/Y/Z + 7 digits + letter) */
375
+ "[XYZ][0-9]{7}[A-Z]",
376
+ /* 54: Passport - letter prefix + digits (e.g. AB1234567) */
377
+ "[A-Z]{1,2}[0-9]{6,7}",
378
+
379
+ /* ---- Tier 6: Boundary-wrapped structured (dash/dot/slash separated) ---- */
380
+ /* 55: South Korean RRN (YYMMDD-XXXXXXX, 14 chars with dash) */
381
+ "[0-9]{6}-[0-9]{7}",
382
+ /* 56: Swiss AHV Number (756.XXXX.XXXX.XX) */
383
+ "756\\.[0-9]{4}\\.[0-9]{4}\\.[0-9]{2}",
384
+ /* 57: Finnish HETU (DDMMYY[+-A]XXXC) */
385
+ "[0-9]{6}[-+A][0-9]{3}[0-9A-Y]",
386
+ /* 58: Swedish Personnummer (YYMMDD[-+]XXXX) */
387
+ "[0-9]{6}[-+][0-9]{4}",
388
+ /* 59: Danish CPR Number (DDMMYY-XXXX) */
389
+ "[0-9]{6}-[0-9]{4}",
390
+ /* 60: Czech Rodné číslo (YYMMDD/XXXX or YYMMDDXXXX) */
391
+ "[0-9]{6}/?[0-9]{3,4}",
392
+ /* 61: US Social Security Number (XXX-XX-XXXX) */
393
+ "[0-9]{3}-[0-9]{2}-[0-9]{4}",
394
+ /* 62: US ITIN (9XX-XX-XXXX) */
395
+ "9[0-9]{2}-[0-9]{2}-[0-9]{4}",
396
+ /* 63: Canadian SIN (XXX-XXX-XXX) */
397
+ "[0-9]{3}-[0-9]{3}-[0-9]{3}",
398
+ /* 64: Australian TFN (XXX-XXX-XXX or XXX XXX XXX) */
399
+ "[0-9]{3}[- ][0-9]{3}[- ][0-9]{3}",
400
+ /* 65: Indian PAN (5 letters + 4 digits + 1 letter) */
401
+ "[A-Z]{5}[0-9]{4}[A-Z]",
402
+ /* 66: Spanish DNI (8 digits + 1 letter) */
403
+ "[0-9]{8}[A-Z]",
404
+ /* 67: Hungarian Tax ID (starts with 8, 10 digits) */
405
+ "8[0-9]{9}",
406
+
407
+ /* ---- Tier 7: Boundary-wrapped pure digits (longest → shortest) ---- */
408
+ /* 68: French NIR / Social Security (15 digits) */
409
+ "[12][0-9]{2}[01][0-9][0-9]{2}[0-9]{3}[0-9]{3}[0-9]{2}",
410
+ /* 69: South African ID (13 digits) */
411
+ "[0-9]{13}",
412
+ /* 70: Romanian CNP (13 digits, first digit 1-8) */
413
+ "[1-8][0-9]{12}",
414
+ /* 71: Japanese My Number (12 digits) */
415
+ "[0-9]{12}",
416
+ /* 72: Polish PESEL (11 digits) */
417
+ "[0-9]{11}",
418
+ /* 73: Belgian National Number (11 digits) */
419
+ "[0-9]{11}",
420
+ /* 74: Norwegian Fødselsnummer (11 digits) */
421
+ "[0-9]{11}",
422
+ /* 75: Passport - 9 consecutive digits */
423
+ "[0-9]{9}",
424
+ /* 76: Dutch BSN (8-9 digits) */
425
+ "[0-9]{8,9}",
426
+ /* 77: Austrian Abgabenkontonummer (9 digits) */
427
+ "[0-9]{9}",
428
+ /* 78: Polish PESEL duplicate */
429
+ "[0-9]{11}"
430
+ };
@@ -0,0 +1,16 @@
1
+ #ifndef DATA_REDACTOR_PATTERNS_H
2
+ #define DATA_REDACTOR_PATTERNS_H
3
+
4
+ #include <regex.h>
5
+
6
+ #define NUM_PATTERNS 79
7
+
8
+ extern const char *pattern_strings[NUM_PATTERNS];
9
+ extern const int boundary_wrapped[NUM_PATTERNS];
10
+ extern const int pattern_tags[NUM_PATTERNS];
11
+ extern const char *pattern_names[NUM_PATTERNS];
12
+
13
+ /* Compiled at Init_data_redactor time. */
14
+ extern regex_t compiled_patterns[NUM_PATTERNS];
15
+
16
+ #endif
@@ -0,0 +1,54 @@
1
+ #include "placeholder.h"
2
+ #include "tags.h"
3
+ #include <stdio.h>
4
+ #include <string.h>
5
+
6
/*
 * XOR variant of the djb2 string hash.
 *
 * Starts from 5381 and, for each of the `len` bytes of `s`, multiplies the
 * running value by 33 and XORs in the byte (taken as unsigned char).
 * Arithmetic wraps in unsigned int. Fast and dependency-free; good enough
 * for the 4-hex log-correlation suffix used by hash placeholders.
 */
unsigned int djb2(const char *s, size_t len) {
    unsigned int hash = 5381;
    const char *cursor = s;
    size_t remaining = len;

    while (remaining > 0) {
        hash = (hash * 33) ^ (unsigned char)*cursor;
        cursor++;
        remaining--;
    }
    return hash;
}
13
+
14
+ size_t write_placeholder(char *buf, const placeholder_t *ph,
15
+ const char *match, size_t match_len) {
16
+ switch (ph->mode) {
17
+ case PLACEHOLDER_MODE_TAGGED:
18
+ return (size_t)sprintf(buf, "[REDACTED:%s]", ph->str);
19
+ case PLACEHOLDER_MODE_HASH: {
20
+ unsigned int h = djb2(match, match_len) & 0xFFFF;
21
+ return (size_t)sprintf(buf, "[%s_%04x]", ph->str, h);
22
+ }
23
+ default: /* PLACEHOLDER_MODE_PLAIN */
24
+ {
25
+ size_t len = strlen(ph->str);
26
+ memcpy(buf, ph->str, len);
27
+ return len;
28
+ }
29
+ }
30
+ }
31
+
32
+ size_t max_placeholder_len(const placeholder_t *ph) {
33
+ size_t tag_len = strlen(ph->str);
34
+ switch (ph->mode) {
35
+ case PLACEHOLDER_MODE_TAGGED: return 2 + 9 + tag_len + 1; /* "[REDACTED:" + tag + "]" */
36
+ case PLACEHOLDER_MODE_HASH: return 1 + tag_len + 1 + 4 + 1; /* "[" + tag + "_" + 4hex + "]" */
37
+ default: return tag_len;
38
+ }
39
+ }
40
+
41
+ const char *tag_name_for_bit(int tag_bit) {
42
+ switch (tag_bit) {
43
+ case TAG_CREDENTIALS: return "CREDENTIALS";
44
+ case TAG_FINANCIAL: return "FINANCIAL";
45
+ case TAG_TAX_ID: return "TAX_ID";
46
+ case TAG_NATIONAL_ID: return "NATIONAL_ID";
47
+ case TAG_CONTACT: return "CONTACT";
48
+ case TAG_NETWORK: return "NETWORK";
49
+ case TAG_TRAVEL: return "TRAVEL";
50
+ case TAG_OTHER: return "OTHER";
51
+ case TAG_CUSTOM: return "CUSTOM";
52
+ default: return "REDACTED";
53
+ }
54
+ }
@@ -0,0 +1,30 @@
1
+ #ifndef DATA_REDACTOR_PLACEHOLDER_H
2
+ #define DATA_REDACTOR_PLACEHOLDER_H
3
+
4
+ #include <stddef.h>
5
+
6
+ #define PLACEHOLDER_MODE_PLAIN 0 /* use ph.str verbatim */
7
+ #define PLACEHOLDER_MODE_TAGGED 1 /* "[REDACTED:TAGNAME]" */
8
+ #define PLACEHOLDER_MODE_HASH 2 /* "[TAGNAME_xxxx]" (4-hex djb2 suffix) */
9
+
10
+ typedef struct {
11
+ int mode;
12
+ const char *str; /* plain string (mode 0); tag name (modes 1/2) */
13
+ } placeholder_t;
14
+
15
+ unsigned int djb2(const char *s, size_t len);
16
+
17
+ /*
18
+ * Write the placeholder for one match into `buf` (which must be large enough).
19
+ * Returns the number of bytes written.
20
+ */
21
+ size_t write_placeholder(char *buf, const placeholder_t *ph,
22
+ const char *match, size_t match_len);
23
+
24
+ /* Upper bound on placeholder length for a given ph (for buffer sizing). */
25
+ size_t max_placeholder_len(const placeholder_t *ph);
26
+
27
+ /* Map a TAG_* bit to the uppercase tag name used in tagged/hash placeholders. */
28
+ const char *tag_name_for_bit(int tag_bit);
29
+
30
+ #endif