data_redactor 0.6.0 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 0c2837740c12c4424fe8837a023c6043badc61c94c99a4b53a826e57f60c362f
4
- data.tar.gz: fe57411af8c23f54a462a2c71d76e8e3306adb35958001a3918e817e5bceeabc
3
+ metadata.gz: 136d8a6bfac2c1caf2f7628a23d5afb17a7afa8a5edb2346f836d007c3f65625
4
+ data.tar.gz: 3df79f171c3e36c0c69192a69d20d020fbb5d84b3a214a5cd7de15f5302e2141
5
5
  SHA512:
6
- metadata.gz: d882f103607569f259bd8bba0cb10a34c1967564bbe81909c6ba877ccecd020b53efc8df350ab08b19169239b4be276a5bb28c54d8559a17914d10044d2aa4d3
7
- data.tar.gz: b567563554f6f8549c9207b43596a3aba7ef33bb44a6efab4df02a8e3de851c06caf2afe253a7b0f01ac1d37b4808e4c99a5f4b421d395169b7525eabc26ad3f
6
+ metadata.gz: a281c6884c6e748ade3fe9b5c4528fe342105e67963335f02dcda94f5692d39a5d759b6e3a0cc08b88eebae1af00bd19f830accd248329e675a39f7d7c16f4c5
7
+ data.tar.gz: 2f474edbc67f02a0558ceb3bd8c76a250c9e0d86be29c6dd5ebe542388a7c01f593450df77f9f0714c92c52becafb6002fbc27c8c379a2db1fb56d55fcedd3ff
data/CHANGELOG.md CHANGED
@@ -7,6 +7,32 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
7
7
 
8
8
  ## [Unreleased]
9
9
 
10
+ ## [0.7.0] - 2026-05-08
11
+
12
+ ### Added
13
+ - **Rails / Rack / Logger integrations** under `lib/data_redactor/integrations/`. Soft-required — none are loaded by default; the gem still has zero runtime dependencies in the gemspec.
14
+ - `DataRedactor::Integrations::Logger` — drop-in `Logger::Formatter` that scrubs every emitted line, wraps an inner formatter (default `Logger::Formatter`), and preserves exception cause chains.
15
+ - `DataRedactor::Integrations::Rails.filter(...)` — returns a `(key, value)` proc for `Rails.application.config.filter_parameters`. Mutates String values in place via `String#replace`.
16
+ - `DataRedactor::Integrations::Rack` — middleware with selectable surfaces. `scrub:` accepts any subset of `[:body, :headers]` (default both). `:body` buffers the response and drops `Content-Length`; `:headers` scrubs sensitive response headers (`Set-Cookie`, `Authorization`, `X-Api-Key`, ...) and request headers in the env hash. Unknown surfaces raise `ArgumentError`.
17
+ - All three integrations forward `only:`, `except:`, `placeholder:` to `DataRedactor.redact`.
18
+
19
+ ### Changed
20
+ - Gemspec: added `rack` as a development dependency. No new runtime dependencies.
21
+
22
+ ## [0.6.1] - 2026-05-08
23
+
24
+ ### Added
25
+ - Six new distinctive-prefix API key patterns under the `:credentials` tag, exposed via `DataRedactor.pattern_names`:
26
+ - `anthropic_api_key` — `sk-ant-apiNN-...`
27
+ - `openai_project_api_key` — `sk-proj-...`
28
+ - `gitlab_pat` — `glpat-...`
29
+ - `digitalocean_pat` — `dop_v1_...`
30
+ - `databricks_api_token` — `dapi...`
31
+ - `sentry_dsn` — `https://KEY@oNNN.ingest.sentry.io/PID` (also matches the legacy `KEY:SECRET@` form)
32
+
33
+ ### Changed
34
+ - `NUM_PATTERNS` is now 85 (was 79). Built-in pattern indices in C have shifted accordingly; the public Ruby API and pattern names are stable.
35
+
10
36
  ## [0.6.0] - 2026-05-08
11
37
 
12
38
  ### Added
@@ -80,7 +106,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
80
106
  - `DataRedactor.redact(text)` module function returning the input with every match replaced by `[REDACTED]`.
81
107
  - RSpec suite with one example per pattern.
82
108
 
83
- [Unreleased]: https://github.com/danielefrisanco/data_redactor/compare/v0.6.0...HEAD
109
+ [Unreleased]: https://github.com/danielefrisanco/data_redactor/compare/v0.7.0...HEAD
110
+ [0.7.0]: https://github.com/danielefrisanco/data_redactor/compare/v0.6.1...v0.7.0
111
+ [0.6.1]: https://github.com/danielefrisanco/data_redactor/compare/v0.6.0...v0.6.1
84
112
  [0.6.0]: https://github.com/danielefrisanco/data_redactor/compare/v0.5.0...v0.6.0
85
113
  [0.2.0]: https://github.com/danielefrisanco/data_redactor/compare/v0.1.0...v0.2.0
86
114
  [0.1.0]: https://github.com/danielefrisanco/data_redactor/releases/tag/v0.1.0
@@ -30,87 +30,93 @@ const int boundary_wrapped[NUM_PATTERNS] = {
30
30
  0, /* 1: Microsoft Teams Webhook */
31
31
  0, /* 2: Slack Webhook URL */
32
32
  0, /* 3: MongoDB Connection String */
33
- 0, /* 4: URI with Embedded Password */
33
+ 0, /* 4: Sentry DSN */
34
+ 0, /* 5: URI with Embedded Password */
34
35
  /* ---- Tier 2: Long prefixed tokens ---- */
35
- 0, /* 5: GitHub PAT (fine-grained, 93 chars) */
36
- 0, /* 6: JWT */
37
- 0, /* 7: Grafana API Token */
38
- 0, /* 8: SSH Public Key */
39
- 0, /* 9: Bearer Token */
40
- 0, /* 10: Google API Key (39 chars) */
41
- 0, /* 11: AWS Access Key ID (20 chars) */
42
- 0, /* 12: AWS Secret Access Key (40 base64) */
43
- 0, /* 13: SendGrid API Key */
44
- 0, /* 14: Amazon MWS Auth Token */
45
- 0, /* 15: LaunchDarkly API Key */
46
- 0, /* 16: GitHub Classic PAT (ghp_) */
47
- 0, /* 17: GitHub OAuth Token (gho_) */
48
- 0, /* 18: Stripe Secret Key */
49
- 0, /* 19: ClickUp API Key */
50
- 0, /* 20: Scaleway Access Key */
51
- 0, /* 21: PEM private key header (generic) */
52
- 0, /* 22: GPG Private Key Block */
36
+ 0, /* 6: GitHub PAT (fine-grained, 93 chars) */
37
+ 0, /* 7: JWT */
38
+ 0, /* 8: Grafana API Token */
39
+ 0, /* 9: SSH Public Key */
40
+ 0, /* 10: Bearer Token */
41
+ 0, /* 11: Anthropic API Key (sk-ant-api...) */
42
+ 0, /* 12: OpenAI Project API Key (sk-proj-...) */
43
+ 0, /* 13: Google API Key (39 chars) */
44
+ 0, /* 14: AWS Access Key ID (20 chars) */
45
+ 0, /* 15: AWS Secret Access Key (40 base64) */
46
+ 0, /* 16: SendGrid API Key */
47
+ 0, /* 17: Amazon MWS Auth Token */
48
+ 0, /* 18: LaunchDarkly API Key */
49
+ 0, /* 19: GitHub Classic PAT (ghp_) */
50
+ 0, /* 20: GitHub OAuth Token (gho_) */
51
+ 0, /* 21: Stripe Secret Key */
52
+ 0, /* 22: ClickUp API Key */
53
+ 0, /* 23: GitLab Personal Access Token (glpat-) */
54
+ 0, /* 24: DigitalOcean PAT (dop_v1_) */
55
+ 0, /* 25: Databricks API Token (dapi) */
56
+ 0, /* 26: Scaleway Access Key */
57
+ 0, /* 27: PEM private key header (generic) */
58
+ 0, /* 28: GPG Private Key Block */
53
59
  /* ---- Tier 3: IBANs (longest → shortest) ---- */
54
- 0, /* 23: Hungary IBAN (28 chars) */
55
- 0, /* 24: Poland IBAN (28 chars) */
56
- 0, /* 25: France IBAN (27 chars) */
57
- 0, /* 26: Italy IBAN (27 chars) */
58
- 0, /* 27: Portugal IBAN (25 chars) */
59
- 0, /* 28: Spain IBAN (24 chars) */
60
- 0, /* 29: Czechia IBAN (24 chars) */
61
- 0, /* 30: Romania IBAN (24 chars) */
62
- 0, /* 31: Sweden IBAN (24 chars) */
63
- 0, /* 32: Germany IBAN (22 chars) */
64
- 0, /* 33: Ireland IBAN (22 chars) */
65
- 0, /* 34: Switzerland IBAN (21 chars) */
66
- 0, /* 35: Austria IBAN (20 chars) */
67
- 0, /* 36: Netherlands IBAN (18 chars) */
68
- 0, /* 37: Denmark IBAN (18 chars) */
69
- 0, /* 38: Finland IBAN (18 chars) */
70
- 0, /* 39: Belgium IBAN (16 chars) */
71
- 0, /* 40: Norway IBAN (15 chars) */
60
+ 0, /* 29: Hungary IBAN (28 chars) */
61
+ 0, /* 30: Poland IBAN (28 chars) */
62
+ 0, /* 31: France IBAN (27 chars) */
63
+ 0, /* 32: Italy IBAN (27 chars) */
64
+ 0, /* 33: Portugal IBAN (25 chars) */
65
+ 0, /* 34: Spain IBAN (24 chars) */
66
+ 0, /* 35: Czechia IBAN (24 chars) */
67
+ 0, /* 36: Romania IBAN (24 chars) */
68
+ 0, /* 37: Sweden IBAN (24 chars) */
69
+ 0, /* 38: Germany IBAN (22 chars) */
70
+ 0, /* 39: Ireland IBAN (22 chars) */
71
+ 0, /* 40: Switzerland IBAN (21 chars) */
72
+ 0, /* 41: Austria IBAN (20 chars) */
73
+ 0, /* 42: Netherlands IBAN (18 chars) */
74
+ 0, /* 43: Denmark IBAN (18 chars) */
75
+ 0, /* 44: Finland IBAN (18 chars) */
76
+ 0, /* 45: Belgium IBAN (16 chars) */
77
+ 0, /* 46: Norway IBAN (15 chars) */
72
78
  /* ---- Tier 4: Structured formats (dots, dashes, slashes, @) ---- */
73
- 0, /* 41: Email Address */
74
- 0, /* 42: International Phone Number */
75
- 0, /* 43: Brazilian CNPJ (XX.XXX.XXX/XXXX-XX) */
76
- 0, /* 44: Brazilian CPF (XXX.XXX.XXX-XX) */
77
- 0, /* 45: UUID v4 */
78
- 0, /* 46: IPv4 address */
79
- 0, /* 47: Credit card numbers */
80
- 0, /* 48: Indian Aadhaar (XXXX XXXX XXXX) */
79
+ 0, /* 47: Email Address */
80
+ 0, /* 48: International Phone Number */
81
+ 0, /* 49: Brazilian CNPJ (XX.XXX.XXX/XXXX-XX) */
82
+ 0, /* 50: Brazilian CPF (XXX.XXX.XXX-XX) */
83
+ 0, /* 51: UUID v4 */
84
+ 0, /* 52: IPv4 address */
85
+ 0, /* 53: Credit card numbers */
86
+ 0, /* 54: Indian Aadhaar (XXXX XXXX XXXX) */
81
87
  /* ---- Tier 5: Letter-anchored patterns ---- */
82
- 0, /* 49: Mexican CURP (18 alphanum, distinctive structure) */
83
- 0, /* 50: Italian CF with omocodia (16 chars) */
84
- 0, /* 51: Italian CF basic (16 chars) */
85
- 0, /* 52: UK National Insurance Number */
86
- 0, /* 53: Spanish NIE (X/Y/Z prefix) */
87
- 0, /* 54: Passport letter prefix + digits */
88
+ 0, /* 55: Mexican CURP (18 alphanum, distinctive structure) */
89
+ 0, /* 56: Italian CF with omocodia (16 chars) */
90
+ 0, /* 57: Italian CF basic (16 chars) */
91
+ 0, /* 58: UK National Insurance Number */
92
+ 0, /* 59: Spanish NIE (X/Y/Z prefix) */
93
+ 0, /* 60: Passport letter prefix + digits */
88
94
  /* ---- Tier 6: Boundary-wrapped structured (dash/dot/slash separated) ---- */
89
- 1, /* 55: South Korean RRN (YYMMDD-XXXXXXX, 14 chars) */
90
- 1, /* 56: Swiss AHV Number (756.XXXX.XXXX.XX) */
91
- 1, /* 57: Finnish HETU (DDMMYY[+-A]XXXC) */
92
- 1, /* 58: Swedish Personnummer (YYMMDD[-+]XXXX) */
93
- 1, /* 59: Danish CPR Number (DDMMYY-XXXX) */
94
- 1, /* 60: Czech Rodné číslo (YYMMDD/XXXX) */
95
- 1, /* 61: US Social Security Number (XXX-XX-XXXX) */
96
- 1, /* 62: US ITIN (9XX-XX-XXXX) */
97
- 1, /* 63: Canadian SIN (XXX-XXX-XXX) */
98
- 1, /* 64: Australian TFN (XXX-XXX-XXX) */
99
- 1, /* 65: Indian PAN (AAAAA0000A) */
100
- 1, /* 66: Spanish DNI (8 digits + letter) */
101
- 1, /* 67: Hungarian Tax ID (8XXXXXXXXX, 10 digits) */
95
+ 1, /* 61: South Korean RRN (YYMMDD-XXXXXXX, 14 chars) */
96
+ 1, /* 62: Swiss AHV Number (756.XXXX.XXXX.XX) */
97
+ 1, /* 63: Finnish HETU (DDMMYY[+-A]XXXC) */
98
+ 1, /* 64: Swedish Personnummer (YYMMDD[-+]XXXX) */
99
+ 1, /* 65: Danish CPR Number (DDMMYY-XXXX) */
100
+ 1, /* 66: Czech Rodné číslo (YYMMDD/XXXX) */
101
+ 1, /* 67: US Social Security Number (XXX-XX-XXXX) */
102
+ 1, /* 68: US ITIN (9XX-XX-XXXX) */
103
+ 1, /* 69: Canadian SIN (XXX-XXX-XXX) */
104
+ 1, /* 70: Australian TFN (XXX-XXX-XXX) */
105
+ 1, /* 71: Indian PAN (AAAAA0000A) */
106
+ 1, /* 72: Spanish DNI (8 digits + letter) */
107
+ 1, /* 73: Hungarian Tax ID (8XXXXXXXXX, 10 digits) */
102
108
  /* ---- Tier 7: Boundary-wrapped pure digits (longest → shortest) ---- */
103
- 1, /* 68: French NIR (15 digits) */
104
- 1, /* 69: South African ID (13 digits) */
105
- 1, /* 70: Romanian CNP (13 digits) */
106
- 1, /* 71: Japanese My Number (12 digits) */
107
- 1, /* 72: Polish PESEL (11 digits) */
108
- 1, /* 73: Belgian National Number (11 digits) */
109
- 1, /* 74: Norwegian Fødselsnummer (11 digits) */
110
- 1, /* 75: Passport 9 digits */
111
- 1, /* 76: Dutch BSN (8-9 digits) */
112
- 1, /* 77: Austrian Abgabenkontonummer (9 digits) */
113
- 1 /* 78: Polish PESEL duplicate */
109
+ 1, /* 74: French NIR (15 digits) */
110
+ 1, /* 75: South African ID (13 digits) */
111
+ 1, /* 76: Romanian CNP (13 digits) */
112
+ 1, /* 77: Japanese My Number (12 digits) */
113
+ 1, /* 78: Polish PESEL (11 digits) */
114
+ 1, /* 79: Belgian National Number (11 digits) */
115
+ 1, /* 80: Norwegian Fødselsnummer (11 digits) */
116
+ 1, /* 81: Passport 9 digits */
117
+ 1, /* 82: Dutch BSN (8-9 digits) */
118
+ 1, /* 83: Austrian Abgabenkontonummer (9 digits) */
119
+ 1 /* 84: Polish PESEL duplicate */
114
120
  };
115
121
 
116
122
  /*
@@ -118,55 +124,56 @@ const int boundary_wrapped[NUM_PATTERNS] = {
118
124
  * patterns run when the caller passes a mask (only/except).
119
125
  */
120
126
  const int pattern_tags[NUM_PATTERNS] = {
121
- /* 0-22: secrets, API keys, tokens, private keys, webhooks */
127
+ /* 0-28: secrets, API keys, tokens, private keys, webhooks */
122
128
  TAG_CREDENTIALS, TAG_CREDENTIALS, TAG_CREDENTIALS, TAG_CREDENTIALS, TAG_CREDENTIALS,
123
129
  TAG_CREDENTIALS, TAG_CREDENTIALS, TAG_CREDENTIALS, TAG_CREDENTIALS, TAG_CREDENTIALS,
124
130
  TAG_CREDENTIALS, TAG_CREDENTIALS, TAG_CREDENTIALS, TAG_CREDENTIALS, TAG_CREDENTIALS,
125
131
  TAG_CREDENTIALS, TAG_CREDENTIALS, TAG_CREDENTIALS, TAG_CREDENTIALS, TAG_CREDENTIALS,
126
- TAG_CREDENTIALS, TAG_CREDENTIALS, TAG_CREDENTIALS,
127
- /* 23-40: IBANs */
132
+ TAG_CREDENTIALS, TAG_CREDENTIALS, TAG_CREDENTIALS, TAG_CREDENTIALS, TAG_CREDENTIALS,
133
+ TAG_CREDENTIALS, TAG_CREDENTIALS, TAG_CREDENTIALS, TAG_CREDENTIALS,
134
+ /* 29-46: IBANs */
128
135
  TAG_FINANCIAL, TAG_FINANCIAL, TAG_FINANCIAL, TAG_FINANCIAL, TAG_FINANCIAL,
129
136
  TAG_FINANCIAL, TAG_FINANCIAL, TAG_FINANCIAL, TAG_FINANCIAL, TAG_FINANCIAL,
130
137
  TAG_FINANCIAL, TAG_FINANCIAL, TAG_FINANCIAL, TAG_FINANCIAL, TAG_FINANCIAL,
131
138
  TAG_FINANCIAL, TAG_FINANCIAL, TAG_FINANCIAL,
132
- TAG_CONTACT, /* 41: email */
133
- TAG_CONTACT, /* 42: phone */
134
- TAG_TAX_ID, /* 43: Brazilian CNPJ */
135
- TAG_TAX_ID, /* 44: Brazilian CPF */
136
- TAG_OTHER, /* 45: UUID v4 */
137
- TAG_NETWORK, /* 46: IPv4 */
138
- TAG_FINANCIAL, /* 47: credit card */
139
- TAG_NATIONAL_ID, /* 48: Indian Aadhaar */
140
- TAG_NATIONAL_ID, /* 49: Mexican CURP */
141
- TAG_TAX_ID, /* 50: Italian CF (omocodia) */
142
- TAG_TAX_ID, /* 51: Italian CF (basic) */
143
- TAG_NATIONAL_ID, /* 52: UK NIN */
144
- TAG_NATIONAL_ID, /* 53: Spanish NIE */
145
- TAG_TRAVEL, /* 54: passport letter prefix */
146
- TAG_NATIONAL_ID, /* 55: Korean RRN */
147
- TAG_NATIONAL_ID, /* 56: Swiss AHV */
148
- TAG_NATIONAL_ID, /* 57: Finnish HETU */
149
- TAG_NATIONAL_ID, /* 58: Swedish Personnummer */
150
- TAG_NATIONAL_ID, /* 59: Danish CPR */
151
- TAG_NATIONAL_ID, /* 60: Czech Rodné číslo */
152
- TAG_NATIONAL_ID, /* 61: US SSN */
153
- TAG_TAX_ID, /* 62: US ITIN */
154
- TAG_NATIONAL_ID, /* 63: Canadian SIN */
155
- TAG_TAX_ID, /* 64: Australian TFN */
156
- TAG_TAX_ID, /* 65: Indian PAN */
157
- TAG_NATIONAL_ID, /* 66: Spanish DNI */
158
- TAG_TAX_ID, /* 67: Hungarian Tax ID */
159
- TAG_NATIONAL_ID, /* 68: French NIR */
160
- TAG_NATIONAL_ID, /* 69: South African ID */
161
- TAG_NATIONAL_ID, /* 70: Romanian CNP */
162
- TAG_TAX_ID, /* 71: Japanese My Number */
163
- TAG_NATIONAL_ID, /* 72: Polish PESEL */
164
- TAG_NATIONAL_ID, /* 73: Belgian National Number */
165
- TAG_NATIONAL_ID, /* 74: Norwegian Fødselsnummer */
166
- TAG_TRAVEL, /* 75: passport 9 digits */
167
- TAG_NATIONAL_ID, /* 76: Dutch BSN */
168
- TAG_TAX_ID, /* 77: Austrian Abgabenkontonummer */
169
- TAG_NATIONAL_ID /* 78: Polish PESEL duplicate */
139
+ TAG_CONTACT, /* 47: email */
140
+ TAG_CONTACT, /* 48: phone */
141
+ TAG_TAX_ID, /* 49: Brazilian CNPJ */
142
+ TAG_TAX_ID, /* 50: Brazilian CPF */
143
+ TAG_OTHER, /* 51: UUID v4 */
144
+ TAG_NETWORK, /* 52: IPv4 */
145
+ TAG_FINANCIAL, /* 53: credit card */
146
+ TAG_NATIONAL_ID, /* 54: Indian Aadhaar */
147
+ TAG_NATIONAL_ID, /* 55: Mexican CURP */
148
+ TAG_TAX_ID, /* 56: Italian CF (omocodia) */
149
+ TAG_TAX_ID, /* 57: Italian CF (basic) */
150
+ TAG_NATIONAL_ID, /* 58: UK NIN */
151
+ TAG_NATIONAL_ID, /* 59: Spanish NIE */
152
+ TAG_TRAVEL, /* 60: passport letter prefix */
153
+ TAG_NATIONAL_ID, /* 61: Korean RRN */
154
+ TAG_NATIONAL_ID, /* 62: Swiss AHV */
155
+ TAG_NATIONAL_ID, /* 63: Finnish HETU */
156
+ TAG_NATIONAL_ID, /* 64: Swedish Personnummer */
157
+ TAG_NATIONAL_ID, /* 65: Danish CPR */
158
+ TAG_NATIONAL_ID, /* 66: Czech Rodné číslo */
159
+ TAG_NATIONAL_ID, /* 67: US SSN */
160
+ TAG_TAX_ID, /* 68: US ITIN */
161
+ TAG_NATIONAL_ID, /* 69: Canadian SIN */
162
+ TAG_TAX_ID, /* 70: Australian TFN */
163
+ TAG_TAX_ID, /* 71: Indian PAN */
164
+ TAG_NATIONAL_ID, /* 72: Spanish DNI */
165
+ TAG_TAX_ID, /* 73: Hungarian Tax ID */
166
+ TAG_NATIONAL_ID, /* 74: French NIR */
167
+ TAG_NATIONAL_ID, /* 75: South African ID */
168
+ TAG_NATIONAL_ID, /* 76: Romanian CNP */
169
+ TAG_TAX_ID, /* 77: Japanese My Number */
170
+ TAG_NATIONAL_ID, /* 78: Polish PESEL */
171
+ TAG_NATIONAL_ID, /* 79: Belgian National Number */
172
+ TAG_NATIONAL_ID, /* 80: Norwegian Fødselsnummer */
173
+ TAG_TRAVEL, /* 81: passport 9 digits */
174
+ TAG_NATIONAL_ID, /* 82: Dutch BSN */
175
+ TAG_TAX_ID, /* 83: Austrian Abgabenkontonummer */
176
+ TAG_NATIONAL_ID /* 84: Polish PESEL duplicate */
170
177
  };
171
178
 
172
179
  const char *pattern_names[NUM_PATTERNS] = {
@@ -174,81 +181,87 @@ const char *pattern_names[NUM_PATTERNS] = {
174
181
  "microsoft_teams_webhook", /* 1 */
175
182
  "slack_webhook_url", /* 2 */
176
183
  "mongodb_connection_string", /* 3 */
177
- "uri_with_password", /* 4 */
178
- "github_pat_fine_grained", /* 5 */
179
- "jwt", /* 6 */
180
- "grafana_api_token", /* 7 */
181
- "ssh_public_key", /* 8 */
182
- "bearer_token", /* 9 */
183
- "google_api_key", /* 10 */
184
- "aws_access_key_id", /* 11 */
185
- "aws_secret_access_key", /* 12 */
186
- "sendgrid_api_key", /* 13 */
187
- "amazon_mws_auth_token", /* 14 */
188
- "launchdarkly_api_key", /* 15 */
189
- "github_classic_pat", /* 16 */
190
- "github_oauth_token", /* 17 */
191
- "stripe_secret_key", /* 18 */
192
- "clickup_api_key", /* 19 */
193
- "scaleway_access_key", /* 20 */
194
- "pem_private_key", /* 21 */
195
- "gpg_private_key", /* 22 */
196
- "iban_hu", /* 23 */
197
- "iban_pl", /* 24 */
198
- "iban_fr", /* 25 */
199
- "iban_it", /* 26 */
200
- "iban_pt", /* 27 */
201
- "iban_es", /* 28 */
202
- "iban_cz", /* 29 */
203
- "iban_ro", /* 30 */
204
- "iban_se", /* 31 */
205
- "iban_de", /* 32 */
206
- "iban_ie", /* 33 */
207
- "iban_ch", /* 34 */
208
- "iban_at", /* 35 */
209
- "iban_nl", /* 36 */
210
- "iban_dk", /* 37 */
211
- "iban_fi", /* 38 */
212
- "iban_be", /* 39 */
213
- "iban_no", /* 40 */
214
- "email", /* 41 */
215
- "phone_e164", /* 42 */
216
- "brazilian_cnpj", /* 43 */
217
- "brazilian_cpf", /* 44 */
218
- "uuid_v4", /* 45 */
219
- "ipv4", /* 46 */
220
- "credit_card", /* 47 */
221
- "indian_aadhaar", /* 48 */
222
- "mexican_curp", /* 49 */
223
- "italian_cf_omocodia", /* 50 */
224
- "italian_cf", /* 51 */
225
- "uk_nin", /* 52 */
226
- "spanish_nie", /* 53 */
227
- "passport_letter_prefix", /* 54 */
228
- "korean_rrn", /* 55 */
229
- "swiss_ahv", /* 56 */
230
- "finnish_hetu", /* 57 */
231
- "swedish_personnummer", /* 58 */
232
- "danish_cpr", /* 59 */
233
- "czech_rodne_cislo", /* 60 */
234
- "us_ssn", /* 61 */
235
- "us_itin", /* 62 */
236
- "canadian_sin", /* 63 */
237
- "australian_tfn", /* 64 */
238
- "indian_pan", /* 65 */
239
- "spanish_dni", /* 66 */
240
- "hungarian_tax_id", /* 67 */
241
- "french_nir", /* 68 */
242
- "south_african_id", /* 69 */
243
- "romanian_cnp", /* 70 */
244
- "japanese_my_number", /* 71 */
245
- "polish_pesel", /* 72 */
246
- "belgian_national_number", /* 73 */
247
- "norwegian_fodselsnummer", /* 74 */
248
- "passport_9digits", /* 75 */
249
- "dutch_bsn", /* 76 */
250
- "austrian_abgabenkontonummer", /* 77 */
251
- "polish_pesel_2" /* 78 */
184
+ "sentry_dsn", /* 4 */
185
+ "uri_with_password", /* 5 */
186
+ "github_pat_fine_grained", /* 6 */
187
+ "jwt", /* 7 */
188
+ "grafana_api_token", /* 8 */
189
+ "ssh_public_key", /* 9 */
190
+ "bearer_token", /* 10 */
191
+ "anthropic_api_key", /* 11 */
192
+ "openai_project_api_key", /* 12 */
193
+ "google_api_key", /* 13 */
194
+ "aws_access_key_id", /* 14 */
195
+ "aws_secret_access_key", /* 15 */
196
+ "sendgrid_api_key", /* 16 */
197
+ "amazon_mws_auth_token", /* 17 */
198
+ "launchdarkly_api_key", /* 18 */
199
+ "github_classic_pat", /* 19 */
200
+ "github_oauth_token", /* 20 */
201
+ "stripe_secret_key", /* 21 */
202
+ "clickup_api_key", /* 22 */
203
+ "gitlab_pat", /* 23 */
204
+ "digitalocean_pat", /* 24 */
205
+ "databricks_api_token", /* 25 */
206
+ "scaleway_access_key", /* 26 */
207
+ "pem_private_key", /* 27 */
208
+ "gpg_private_key", /* 28 */
209
+ "iban_hu", /* 29 */
210
+ "iban_pl", /* 30 */
211
+ "iban_fr", /* 31 */
212
+ "iban_it", /* 32 */
213
+ "iban_pt", /* 33 */
214
+ "iban_es", /* 34 */
215
+ "iban_cz", /* 35 */
216
+ "iban_ro", /* 36 */
217
+ "iban_se", /* 37 */
218
+ "iban_de", /* 38 */
219
+ "iban_ie", /* 39 */
220
+ "iban_ch", /* 40 */
221
+ "iban_at", /* 41 */
222
+ "iban_nl", /* 42 */
223
+ "iban_dk", /* 43 */
224
+ "iban_fi", /* 44 */
225
+ "iban_be", /* 45 */
226
+ "iban_no", /* 46 */
227
+ "email", /* 47 */
228
+ "phone_e164", /* 48 */
229
+ "brazilian_cnpj", /* 49 */
230
+ "brazilian_cpf", /* 50 */
231
+ "uuid_v4", /* 51 */
232
+ "ipv4", /* 52 */
233
+ "credit_card", /* 53 */
234
+ "indian_aadhaar", /* 54 */
235
+ "mexican_curp", /* 55 */
236
+ "italian_cf_omocodia", /* 56 */
237
+ "italian_cf", /* 57 */
238
+ "uk_nin", /* 58 */
239
+ "spanish_nie", /* 59 */
240
+ "passport_letter_prefix", /* 60 */
241
+ "korean_rrn", /* 61 */
242
+ "swiss_ahv", /* 62 */
243
+ "finnish_hetu", /* 63 */
244
+ "swedish_personnummer", /* 64 */
245
+ "danish_cpr", /* 65 */
246
+ "czech_rodne_cislo", /* 66 */
247
+ "us_ssn", /* 67 */
248
+ "us_itin", /* 68 */
249
+ "canadian_sin", /* 69 */
250
+ "australian_tfn", /* 70 */
251
+ "indian_pan", /* 71 */
252
+ "spanish_dni", /* 72 */
253
+ "hungarian_tax_id", /* 73 */
254
+ "french_nir", /* 74 */
255
+ "south_african_id", /* 75 */
256
+ "romanian_cnp", /* 76 */
257
+ "japanese_my_number", /* 77 */
258
+ "polish_pesel", /* 78 */
259
+ "belgian_national_number", /* 79 */
260
+ "norwegian_fodselsnummer", /* 80 */
261
+ "passport_9digits", /* 81 */
262
+ "dutch_bsn", /* 82 */
263
+ "austrian_abgabenkontonummer", /* 83 */
264
+ "polish_pesel_2" /* 84 */
252
265
  };
253
266
 
254
267
  /*
@@ -265,166 +278,178 @@ const char *pattern_strings[NUM_PATTERNS] = {
265
278
  "https://hooks\\.slack\\.com/services/T[A-Z0-9]{8}/B[A-Z0-9]{8}/[A-Za-z0-9]{24}",
266
279
  /* 3: MongoDB Connection String (with credentials) */
267
280
  "mongodb(\\+srv)?://[^[:space:]'\"<>/:@]+:[^[:space:]'\"<>/@]+@[^[:space:]?'\"]+",
268
- /* 4: URI with Embedded Password (scheme://user:pass@host) */
281
+ /* 4: Sentry DSN (https://KEY@host.ingest.sentry.io/PROJECT_ID) */
282
+ "https://[a-f0-9]{32}(:[a-f0-9]{32})?@[a-zA-Z0-9.-]+\\.ingest\\.sentry\\.io/[0-9]+",
283
+ /* 5: URI with Embedded Password (scheme://user:pass@host) */
269
284
  "[A-Za-z][A-Za-z0-9+_-]*://[^[:space:]/?#:@]+:[^[:space:]/?#@]+@[A-Za-z0-9.-]+",
270
285
 
271
286
  /* ---- Tier 2: Long prefixed tokens ---- */
272
- /* 5: GitHub PAT fine-grained (github_pat_ + 82 chars) */
287
+ /* 6: GitHub PAT fine-grained (github_pat_ + 82 chars) */
273
288
  "github_pat_[0-9a-zA-Z_]{82}",
274
- /* 6: JWT (three base64url segments) */
289
+ /* 7: JWT (three base64url segments) */
275
290
  "eyJ[A-Za-z0-9_-]{10,}\\.eyJ[A-Za-z0-9_-]{10,}\\.[A-Za-z0-9_-]+",
276
- /* 7: Grafana API Token (base64 of {\"k\":\") */
291
+ /* 8: Grafana API Token (base64 of {\"k\":\") */
277
292
  "eyJrIjoi[A-Za-z0-9_=-]{42,}",
278
- /* 8: SSH Public Key */
293
+ /* 9: SSH Public Key */
279
294
  "ssh-(rsa|ed25519|ecdsa) [a-zA-Z0-9/+=]{20,}",
280
- /* 9: Bearer Token */
295
+ /* 10: Bearer Token */
281
296
  "[Bb]earer [a-zA-Z0-9_.=/+:-]{12,}",
282
- /* 10: Google API Key (AIza + 35 chars) */
297
+ /* 11: Anthropic API Key (sk-ant-apiNN-... ~ 95+ chars) */
298
+ "sk-ant-api[0-9]{2}-[A-Za-z0-9_-]{90,}",
299
+ /* 12: OpenAI Project API Key (sk-proj-...) */
300
+ "sk-proj-[A-Za-z0-9_-]{20,}",
301
+ /* 13: Google API Key (AIza + 35 chars) */
283
302
  "AIza[0-9A-Za-z_-]{35}",
284
- /* 11: AWS Access Key ID (all prefixes + 16 chars) */
303
+ /* 14: AWS Access Key ID (all prefixes + 16 chars) */
285
304
  "(A3T[A-Z0-9]|AKIA|ABIA|ACCA|AGPA|AIDA|ANPA|ANVA|APKA|AROA|ASCA|ASIA)[A-Z2-7]{16}",
286
- /* 12: AWS Secret Access Key (40 base64 chars) */
305
+ /* 15: AWS Secret Access Key (40 base64 chars) */
287
306
  "[A-Za-z0-9/+=]{40}",
288
- /* 13: SendGrid API Key */
307
+ /* 16: SendGrid API Key */
289
308
  "SG\\.[a-zA-Z0-9_-]{5,}\\.[a-zA-Z0-9_-]{5,}",
290
- /* 14: Amazon MWS Auth Token */
309
+ /* 17: Amazon MWS Auth Token */
291
310
  "amzn\\.mws\\.[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}",
292
- /* 15: LaunchDarkly API Key (api-UUID or sdk-UUID) */
311
+ /* 18: LaunchDarkly API Key (api-UUID or sdk-UUID) */
293
312
  "(api|sdk)-[a-f0-9]{8}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{12}",
294
- /* 16: GitHub Classic PAT (ghp_ + 36 chars) */
313
+ /* 19: GitHub Classic PAT (ghp_ + 36 chars) */
295
314
  "ghp_[0-9a-zA-Z]{36}",
296
- /* 17: GitHub OAuth Token (gho_ + 36 chars) */
315
+ /* 20: GitHub OAuth Token (gho_ + 36 chars) */
297
316
  "gho_[0-9a-zA-Z]{36}",
298
- /* 18: Stripe Secret Key (sk_live_ + 24 chars) */
317
+ /* 21: Stripe Secret Key (sk_live_ + 24 chars) */
299
318
  "sk_live_[0-9a-zA-Z]{24}",
300
- /* 19: ClickUp API Key */
319
+ /* 22: ClickUp API Key */
301
320
  "pk_[0-9]{6,8}_[A-Z0-9]{32}",
302
- /* 20: Scaleway Access Key (SCW + 17 chars) */
321
+ /* 23: GitLab Personal Access Token (glpat- + 20 chars) */
322
+ "glpat-[0-9a-zA-Z_-]{20}",
323
+ /* 24: DigitalOcean PAT (dop_v1_ + 64 hex chars) */
324
+ "dop_v1_[a-f0-9]{64}",
325
+ /* 25: Databricks API Token (dapi + 32 hex chars) */
326
+ "dapi[a-f0-9]{32}",
327
+ /* 26: Scaleway Access Key (SCW + 17 chars) */
303
328
  "SCW[A-Z0-9]{17}",
304
- /* 21: PEM private key header (generic) */
329
+ /* 27: PEM private key header (generic) */
305
330
  "-----BEGIN [A-Z ]*PRIVATE KEY-----",
306
- /* 22: GPG Private Key Block */
331
+ /* 28: GPG Private Key Block */
307
332
  "-----BEGIN PGP PRIVATE KEY BLOCK-----",
308
333
 
309
334
  /* ---- Tier 3: IBANs (longest → shortest) ---- */
310
- /* 23: Hungary IBAN (HU, 28 chars) */
335
+ /* 29: Hungary IBAN (HU, 28 chars) */
311
336
  "HU[0-9]{2}[0-9]{24}",
312
- /* 24: Poland IBAN (PL, 28 chars) */
337
+ /* 30: Poland IBAN (PL, 28 chars) */
313
338
  "PL[0-9]{2}[0-9]{24}",
314
- /* 25: France IBAN (FR, 27 chars) */
339
+ /* 31: France IBAN (FR, 27 chars) */
315
340
  "FR[0-9]{2}[0-9]{10}[A-Z0-9]{11}[0-9]{2}",
316
- /* 26: Italy IBAN (IT, 27 chars) */
341
+ /* 32: Italy IBAN (IT, 27 chars) */
317
342
  "IT[0-9]{2}[A-Z][0-9]{10}[A-Z0-9]{12}",
318
- /* 27: Portugal IBAN (PT, 25 chars) */
343
+ /* 33: Portugal IBAN (PT, 25 chars) */
319
344
  "PT[0-9]{2}[0-9]{21}",
320
- /* 28: Spain IBAN (ES, 24 chars) */
345
+ /* 34: Spain IBAN (ES, 24 chars) */
321
346
  "ES[0-9]{2}[0-9]{20}",
322
- /* 29: Czechia IBAN (CZ, 24 chars) */
347
+ /* 35: Czechia IBAN (CZ, 24 chars) */
323
348
  "CZ[0-9]{2}[0-9]{20}",
324
- /* 30: Romania IBAN (RO, 24 chars) */
349
+ /* 36: Romania IBAN (RO, 24 chars) */
325
350
  "RO[0-9]{2}[A-Z]{4}[A-Z0-9]{16}",
326
- /* 31: Sweden IBAN (SE, 24 chars) */
351
+ /* 37: Sweden IBAN (SE, 24 chars) */
327
352
  "SE[0-9]{2}[0-9]{20}",
328
- /* 32: Germany IBAN (DE, 22 chars) */
353
+ /* 38: Germany IBAN (DE, 22 chars) */
329
354
  "DE[0-9]{2}[0-9]{18}",
330
- /* 33: Ireland IBAN (IE, 22 chars) */
355
+ /* 39: Ireland IBAN (IE, 22 chars) */
331
356
  "IE[0-9]{2}[A-Z]{4}[0-9]{14}",
332
- /* 34: Switzerland IBAN (CH, 21 chars) */
357
+ /* 40: Switzerland IBAN (CH, 21 chars) */
333
358
  "CH[0-9]{2}[0-9]{5}[A-Z0-9]{12}",
334
- /* 35: Austria IBAN (AT, 20 chars) */
359
+ /* 41: Austria IBAN (AT, 20 chars) */
335
360
  "AT[0-9]{2}[0-9]{16}",
336
- /* 36: Netherlands IBAN (NL, 18 chars) */
361
+ /* 42: Netherlands IBAN (NL, 18 chars) */
337
362
  "NL[0-9]{2}[A-Z]{4}[0-9]{10}",
338
- /* 37: Denmark IBAN (DK, 18 chars) */
363
+ /* 43: Denmark IBAN (DK, 18 chars) */
339
364
  "DK[0-9]{2}[0-9]{14}",
340
- /* 38: Finland IBAN (FI, 18 chars) */
365
+ /* 44: Finland IBAN (FI, 18 chars) */
341
366
  "FI[0-9]{2}[0-9]{14}",
342
- /* 39: Belgium IBAN (BE, 16 chars) */
367
+ /* 45: Belgium IBAN (BE, 16 chars) */
343
368
  "BE[0-9]{2}[0-9]{12}",
344
- /* 40: Norway IBAN (NO, 15 chars) */
369
+ /* 46: Norway IBAN (NO, 15 chars) */
345
370
  "NO[0-9]{2}[0-9]{11}",
346
371
 
347
372
  /* ---- Tier 4: Structured formats (dots, dashes, slashes, @) ---- */
348
- /* 41: Email Address */
373
+ /* 47: Email Address */
349
374
  "[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}",
350
- /* 42: International Phone Number (E.164) */
375
+ /* 48: International Phone Number (E.164) */
351
376
  "\\+[0-9]{1,3}[- ]?[0-9][0-9 -]{6,13}[0-9]",
352
- /* 43: Brazilian CNPJ (XX.XXX.XXX/XXXX-XX) */
377
+ /* 49: Brazilian CNPJ (XX.XXX.XXX/XXXX-XX) */
353
378
  "[0-9]{2}\\.[0-9]{3}\\.[0-9]{3}/[0-9]{4}-[0-9]{2}",
354
- /* 44: Brazilian CPF (XXX.XXX.XXX-XX) */
379
+ /* 50: Brazilian CPF (XXX.XXX.XXX-XX) */
355
380
  "[0-9]{3}\\.[0-9]{3}\\.[0-9]{3}-[0-9]{2}",
356
- /* 45: UUID v4 / Scaleway Secret Key */
381
+ /* 51: UUID v4 / Scaleway Secret Key */
357
382
  "[0-9a-f]{8}-[0-9a-f]{4}-4[0-9a-f]{3}-[89ab][0-9a-f]{3}-[0-9a-f]{12}",
358
- /* 46: IPv4 address */
383
+ /* 52: IPv4 address */
359
384
  "(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)",
360
- /* 47: Credit card numbers (Visa, Mastercard, Amex, Discover, JCB) */
385
+ /* 53: Credit card numbers (Visa, Mastercard, Amex, Discover, JCB) */
361
386
  "(4[0-9]{15}|4[0-9]{12}|5[1-5][0-9]{14}|6011[0-9]{12}|65[0-9]{14}|3[47][0-9]{13}|3[068][0-9]{11}|35[0-9]{14})",
362
- /* 48: Indian Aadhaar (XXXX XXXX XXXX or XXXX-XXXX-XXXX) */
387
+ /* 54: Indian Aadhaar (XXXX XXXX XXXX or XXXX-XXXX-XXXX) */
363
388
  "[0-9]{4}[- ][0-9]{4}[- ][0-9]{4}",
364
389
 
365
390
  /* ---- Tier 5: Letter-anchored patterns ---- */
366
- /* 49: Mexican CURP (18 alphanum, distinctive structure) */
391
+ /* 55: Mexican CURP (18 alphanum, distinctive structure) */
367
392
  "[A-Z]{4}[0-9]{6}[HM][A-Z]{5}[A-Z0-9][0-9]",
368
- /* 50: Italian CF with omocodia (16 chars) */
393
+ /* 56: Italian CF with omocodia (16 chars) */
369
394
  "[A-Z]{6}[0-9LMNPQRSTUV]{2}[ABCDEHLMPRST][0-9LMNPQRSTUV]{2}[A-Z][0-9LMNPQRSTUV]{3}[A-Z]",
370
- /* 51: Italian CF basic (16 chars) */
395
+ /* 57: Italian CF basic (16 chars) */
371
396
  "[A-Z]{6}[0-9]{2}[A-Z][0-9]{2}[A-Z][0-9]{3}[A-Z]",
372
- /* 52: UK National Insurance Number (AA 99 99 99 A-D) */
397
+ /* 58: UK National Insurance Number (AA 99 99 99 A-D) */
373
398
  "[A-Z]{2} ?[0-9]{2} ?[0-9]{2} ?[0-9]{2} ?[A-D]",
374
- /* 53: Spanish NIE (X/Y/Z + 7 digits + letter) */
399
+ /* 59: Spanish NIE (X/Y/Z + 7 digits + letter) */
375
400
  "[XYZ][0-9]{7}[A-Z]",
376
- /* 54: Passport - letter prefix + digits (e.g. AB1234567) */
401
+ /* 60: Passport - letter prefix + digits (e.g. AB1234567) */
377
402
  "[A-Z]{1,2}[0-9]{6,7}",
378
403
 
379
404
  /* ---- Tier 6: Boundary-wrapped structured (dash/dot/slash separated) ---- */
380
- /* 55: South Korean RRN (YYMMDD-XXXXXXX, 14 chars with dash) */
405
+ /* 61: South Korean RRN (YYMMDD-XXXXXXX, 14 chars with dash) */
381
406
  "[0-9]{6}-[0-9]{7}",
382
- /* 56: Swiss AHV Number (756.XXXX.XXXX.XX) */
407
+ /* 62: Swiss AHV Number (756.XXXX.XXXX.XX) */
383
408
  "756\\.[0-9]{4}\\.[0-9]{4}\\.[0-9]{2}",
384
- /* 57: Finnish HETU (DDMMYY[+-A]XXXC) */
409
+ /* 63: Finnish HETU (DDMMYY[+-A]XXXC) */
385
410
  "[0-9]{6}[-+A][0-9]{3}[0-9A-Y]",
386
- /* 58: Swedish Personnummer (YYMMDD[-+]XXXX) */
411
+ /* 64: Swedish Personnummer (YYMMDD[-+]XXXX) */
387
412
  "[0-9]{6}[-+][0-9]{4}",
388
- /* 59: Danish CPR Number (DDMMYY-XXXX) */
413
+ /* 65: Danish CPR Number (DDMMYY-XXXX) */
389
414
  "[0-9]{6}-[0-9]{4}",
390
- /* 60: Czech Rodné číslo (YYMMDD/XXXX or YYMMDDXXXX) */
415
+ /* 66: Czech Rodné číslo (YYMMDD/XXXX or YYMMDDXXXX) */
391
416
  "[0-9]{6}/?[0-9]{3,4}",
392
- /* 61: US Social Security Number (XXX-XX-XXXX) */
417
+ /* 67: US Social Security Number (XXX-XX-XXXX) */
393
418
  "[0-9]{3}-[0-9]{2}-[0-9]{4}",
394
- /* 62: US ITIN (9XX-XX-XXXX) */
419
+ /* 68: US ITIN (9XX-XX-XXXX) */
395
420
  "9[0-9]{2}-[0-9]{2}-[0-9]{4}",
396
- /* 63: Canadian SIN (XXX-XXX-XXX) */
421
+ /* 69: Canadian SIN (XXX-XXX-XXX) */
397
422
  "[0-9]{3}-[0-9]{3}-[0-9]{3}",
398
- /* 64: Australian TFN (XXX-XXX-XXX or XXX XXX XXX) */
423
+ /* 70: Australian TFN (XXX-XXX-XXX or XXX XXX XXX) */
399
424
  "[0-9]{3}[- ][0-9]{3}[- ][0-9]{3}",
400
- /* 65: Indian PAN (5 letters + 4 digits + 1 letter) */
425
+ /* 71: Indian PAN (5 letters + 4 digits + 1 letter) */
401
426
  "[A-Z]{5}[0-9]{4}[A-Z]",
402
- /* 66: Spanish DNI (8 digits + 1 letter) */
427
+ /* 72: Spanish DNI (8 digits + 1 letter) */
403
428
  "[0-9]{8}[A-Z]",
404
- /* 67: Hungarian Tax ID (starts with 8, 10 digits) */
429
+ /* 73: Hungarian Tax ID (starts with 8, 10 digits) */
405
430
  "8[0-9]{9}",
406
431
 
407
432
  /* ---- Tier 7: Boundary-wrapped pure digits (longest → shortest) ---- */
408
- /* 68: French NIR / Social Security (15 digits) */
433
+ /* 74: French NIR / Social Security (15 digits) */
409
434
  "[12][0-9]{2}[01][0-9][0-9]{2}[0-9]{3}[0-9]{3}[0-9]{2}",
410
- /* 69: South African ID (13 digits) */
435
+ /* 75: South African ID (13 digits) */
411
436
  "[0-9]{13}",
412
- /* 70: Romanian CNP (13 digits, first digit 1-8) */
437
+ /* 76: Romanian CNP (13 digits, first digit 1-8) */
413
438
  "[1-8][0-9]{12}",
414
- /* 71: Japanese My Number (12 digits) */
439
+ /* 77: Japanese My Number (12 digits) */
415
440
  "[0-9]{12}",
416
- /* 72: Polish PESEL (11 digits) */
441
+ /* 78: Polish PESEL (11 digits) */
417
442
  "[0-9]{11}",
418
- /* 73: Belgian National Number (11 digits) */
443
+ /* 79: Belgian National Number (11 digits) */
419
444
  "[0-9]{11}",
420
- /* 74: Norwegian Fødselsnummer (11 digits) */
445
+ /* 80: Norwegian Fødselsnummer (11 digits) */
421
446
  "[0-9]{11}",
422
- /* 75: Passport - 9 consecutive digits */
447
+ /* 81: Passport - 9 consecutive digits */
423
448
  "[0-9]{9}",
424
- /* 76: Dutch BSN (8-9 digits) */
449
+ /* 82: Dutch BSN (8-9 digits) */
425
450
  "[0-9]{8,9}",
426
- /* 77: Austrian Abgabenkontonummer (9 digits) */
451
+ /* 83: Austrian Abgabenkontonummer (9 digits) */
427
452
  "[0-9]{9}",
428
- /* 78: Polish PESEL duplicate */
453
+ /* 84: Polish PESEL duplicate */
429
454
  "[0-9]{11}"
430
455
  };
@@ -3,7 +3,7 @@
3
3
 
4
4
  #include <regex.h>
5
5
 
6
- #define NUM_PATTERNS 79
6
+ #define NUM_PATTERNS 85
7
7
 
8
8
  extern const char *pattern_strings[NUM_PATTERNS];
9
9
  extern const int boundary_wrapped[NUM_PATTERNS];
@@ -0,0 +1,42 @@
1
+ require "logger"
2
+ require "data_redactor"
3
+
4
+ module DataRedactor
5
+ module Integrations
6
+ # Logger formatter that runs every log message through {DataRedactor.redact}
7
+ # before delegating to an inner formatter.
8
+ #
9
+ # @example Drop-in replacement for Ruby's default formatter
10
+ # logger = Logger.new($stdout)
11
+ # logger.formatter = DataRedactor::Integrations::Logger.new
12
+ # logger.info("Auth failed for user alice@example.com")
13
+ # # => "I, [...] -- : Auth failed for user [REDACTED]"
14
+ #
15
+ # @example Wrapping an existing formatter (e.g. Rails JSON logger)
16
+ # logger.formatter = DataRedactor::Integrations::Logger.new(
17
+ # inner: Rails.logger.formatter,
18
+ # only: [:credentials, :contact]
19
+ # )
20
+ class Logger
21
+ # @param inner [#call, nil] formatter to wrap. Defaults to {::Logger::Formatter}.
22
+ # @param only [Symbol, String, Array, nil] forwarded to {DataRedactor.redact}.
23
+ # @param except [Symbol, String, Array, nil] forwarded to {DataRedactor.redact}.
24
+ # @param placeholder forwarded to {DataRedactor.redact}.
25
+ def initialize(inner: ::Logger::Formatter.new, only: nil, except: nil, placeholder: DataRedactor::PLACEHOLDER_DEFAULT)
26
+ @inner = inner
27
+ @only = only
28
+ @except = except
29
+ @placeholder = placeholder
30
+ end
31
+
32
+ # Formatter contract — called by Logger for every emitted line.
33
+ # Lets the inner formatter render whatever it likes (string, exception,
34
+ # arbitrary object) and scrubs the resulting line in one pass. Keeps the
35
+ # exception cause chain intact so downstream formatters still see it.
36
+ def call(severity, time, progname, msg)
37
+ line = @inner.call(severity, time, progname, msg)
38
+ DataRedactor.redact(line.to_s, only: @only, except: @except, placeholder: @placeholder)
39
+ end
40
+ end
41
+ end
42
+ end
@@ -0,0 +1,121 @@
1
+ require "data_redactor"
2
+
3
+ module DataRedactor
4
+ module Integrations
5
+ # Rack middleware that scrubs sensitive data from selectable surfaces of
6
+ # the response (and request headers, for downstream loggers to see scrubbed
7
+ # values).
8
+ #
9
+ # @example Both surfaces (default)
10
+ # use DataRedactor::Integrations::Rack, scrub: [:body, :headers]
11
+ #
12
+ # @example Headers only — leave the response body untouched
13
+ # use DataRedactor::Integrations::Rack, scrub: [:headers]
14
+ #
15
+ # ### Surfaces
16
+ #
17
+ # - `:body` — buffers the entire response body in memory, runs it through
18
+ # {DataRedactor.redact}, and returns it as a single chunk. Drops the
19
+ # `Content-Length` header (the redacted body may have a different
20
+ # byte length; downstream servers recompute it or chunk-encode).
21
+ # - `:headers` — scrubs sensitive response headers (`Set-Cookie`, etc.). Sensitive request
22
+ # headers (`Authorization`, `Cookie`, `X-Api-Key`, etc.) are redacted in
23
+ # the env hash so any downstream middleware that logs them sees scrubbed
24
+ # values.
25
+ class Rack
26
+ DEFAULT_SCRUB = [:body, :headers].freeze
27
+
28
+ SENSITIVE_REQUEST_HEADERS = %w[
29
+ HTTP_AUTHORIZATION
30
+ HTTP_PROXY_AUTHORIZATION
31
+ HTTP_COOKIE
32
+ HTTP_X_API_KEY
33
+ HTTP_X_AUTH_TOKEN
34
+ HTTP_X_ACCESS_TOKEN
35
+ ].freeze
36
+
37
+ SENSITIVE_RESPONSE_HEADERS = %w[
38
+ Set-Cookie
39
+ Authorization
40
+ X-Api-Key
41
+ X-Auth-Token
42
+ X-Access-Token
43
+ ].freeze
44
+
45
+ # @param app [#call] the Rack app
46
+ # @param scrub [Array<Symbol>] which surfaces to redact. Subset of
47
+ # `[:body, :headers]`. Defaults to `[:body, :headers]`.
48
+ # @param only forwarded to {DataRedactor.redact}
49
+ # @param except forwarded to {DataRedactor.redact}
50
+ # @param placeholder forwarded to {DataRedactor.redact}
51
+ def initialize(app, scrub: DEFAULT_SCRUB, only: nil, except: nil, placeholder: DataRedactor::PLACEHOLDER_DEFAULT)
52
+ @app = app
53
+ @scrub = Array(scrub).map(&:to_sym)
54
+ unknown = @scrub - [:body, :headers]
55
+ unless unknown.empty?
56
+ raise ArgumentError, "unknown scrub surface(s) #{unknown.inspect}; valid: [:body, :headers]"
57
+ end
58
+ @only = only
59
+ @except = except
60
+ @placeholder = placeholder
61
+ end
62
+
63
+ def call(env)
64
+ scrub_request_headers(env) if @scrub.include?(:headers)
65
+ status, headers, body = @app.call(env)
66
+ headers = scrub_response_headers(headers) if @scrub.include?(:headers)
67
+ if @scrub.include?(:body)
68
+ body, headers = wrap_body(body, headers)
69
+ end
70
+ [status, headers, body]
71
+ end
72
+
73
+ private
74
+
75
+ def redact(s)
76
+ DataRedactor.redact(s, only: @only, except: @except, placeholder: @placeholder)
77
+ end
78
+
79
+ def scrub_request_headers(env)
80
+ SENSITIVE_REQUEST_HEADERS.each do |key|
81
+ value = env[key]
82
+ env[key] = redact(value) if value.is_a?(String) && !value.empty?
83
+ end
84
+ end
85
+
86
+ def scrub_response_headers(headers)
87
+ # Rack 3 uses lower-case header names; Rack 2 uses Capitalized.
88
+ # Match case-insensitively against our known list.
89
+ sensitive_lc = SENSITIVE_RESPONSE_HEADERS.map(&:downcase)
90
+ headers.each_with_object({}) do |(key, value), out|
91
+ if sensitive_lc.include?(key.to_s.downcase)
92
+ out[key] = scrub_header_value(value)
93
+ else
94
+ out[key] = value
95
+ end
96
+ end
97
+ end
98
+
99
+ def scrub_header_value(value)
100
+ case value
101
+ when String then redact(value)
102
+ when Array then value.map { |v| v.is_a?(String) ? redact(v) : v }
103
+ else value
104
+ end
105
+ end
106
+
107
+ def wrap_body(body, headers)
108
+ # Buffer the body, redact, return as a single-element array.
109
+ # Stripping Content-Length because the redacted body may differ in
110
+ # byte length; downstream servers will recompute or chunk-encode.
111
+ buffered = +""
112
+ body.each { |chunk| buffered << chunk.to_s }
113
+ body.close if body.respond_to?(:close)
114
+
115
+ scrubbed = redact(buffered)
116
+ new_headers = headers.reject { |k, _| k.to_s.downcase == "content-length" }
117
+ [[scrubbed], new_headers]
118
+ end
119
+ end
120
+ end
121
+ end
@@ -0,0 +1,38 @@
1
+ require "data_redactor"
2
+
3
+ module DataRedactor
4
+ module Integrations
5
+ # Rails `config.filter_parameters` adapter. Returns a `Proc` that Rails
6
+ # invokes with `(key, value)` for every leaf in the params tree; we redact
7
+ # the value in place when it is a String.
8
+ #
9
+ # @example
10
+ # # config/initializers/filter_parameter_logging.rb
11
+ # require "data_redactor/integrations/rails"
12
+ # Rails.application.config.filter_parameters += [
13
+ # DataRedactor::Integrations::Rails.filter
14
+ # ]
15
+ #
16
+ # @example Restricting to specific tags
17
+ # Rails.application.config.filter_parameters += [
18
+ # DataRedactor::Integrations::Rails.filter(only: [:credentials, :financial])
19
+ # ]
20
+ module Rails
21
+ module_function
22
+
23
+ # @param only forwarded to {DataRedactor.redact}
24
+ # @param except forwarded to {DataRedactor.redact}
25
+ # @param placeholder forwarded to {DataRedactor.redact}
26
+ # @return [Proc] a `(key, value)` proc compatible with `config.filter_parameters`
27
+ def filter(only: nil, except: nil, placeholder: DataRedactor::PLACEHOLDER_DEFAULT)
28
+ lambda do |_key, value|
29
+ next unless value.is_a?(String)
30
+ # Rails' Parameter Filter mutates the value in place. We can't
31
+ # reassign `value` here, so use String#replace.
32
+ redacted = DataRedactor.redact(value, only: only, except: except, placeholder: placeholder)
33
+ value.replace(redacted) if redacted != value
34
+ end
35
+ end
36
+ end
37
+ end
38
+ end
@@ -1,4 +1,4 @@
1
1
  module DataRedactor
2
2
  # Current gem version. Follows {https://semver.org Semantic Versioning 2.0.0}.
3
- VERSION = "0.6.0"
3
+ VERSION = "0.7.0"
4
4
  end
data/readme.md CHANGED
@@ -128,7 +128,58 @@ DataRedactor.clear_custom_patterns! # mostly for test suites
128
128
 
129
129
  **`boundary: true`** — wraps the pattern with `(^|[^0-9A-Za-z])(PATTERN)([^0-9A-Za-z]|$)` so it only fires when the token is not embedded in a longer alphanumeric string. Incompatible with patterns that contain capture groups.
130
130
 
131
- ## Detected patterns (79 total)
131
+ ## Integrations
132
+
133
+ Optional adapters for Logger, Rails, and Rack. None are loaded automatically — `require` only what you use, and the gem adds zero runtime dependencies in the gemspec.
134
+
135
+ ### Logger formatter
136
+
137
+ Drop-in `Logger::Formatter` replacement that scrubs every emitted line:
138
+
139
+ ```ruby
140
+ require "data_redactor/integrations/logger"
141
+
142
+ logger = Logger.new($stdout)
143
+ logger.formatter = DataRedactor::Integrations::Logger.new
144
+ logger.info("Auth failed for alice@example.com")
145
+ # => I, [...] -- : Auth failed for [REDACTED]
146
+ ```
147
+
148
+ Wraps an inner formatter (defaults to `Logger::Formatter`), so it composes with structured loggers. Forwards `only:`, `except:`, `placeholder:` to `DataRedactor.redact`. Exception messages and arbitrary objects are scrubbed too — the wrapped object is passed unchanged to the inner formatter so the exception cause chain is preserved; only the rendered string is redacted.
149
+
150
+ ### Rails `filter_parameters` adapter
151
+
152
+ ```ruby
153
+ # config/initializers/filter_parameter_logging.rb
154
+ require "data_redactor/integrations/rails"
155
+
156
+ Rails.application.config.filter_parameters += [
157
+ DataRedactor::Integrations::Rails.filter
158
+ ]
159
+ ```
160
+
161
+ Returns a `(key, value)` proc compatible with Rails' parameter filter. String values are mutated in place via `String#replace` so Rails sees the redacted value. Non-strings are left alone. Accepts the same `only:`/`except:`/`placeholder:` kwargs.
162
+
163
+ ### Rack middleware
164
+
165
+ ```ruby
166
+ # config.ru
167
+ require "data_redactor/integrations/rack"
168
+
169
+ use DataRedactor::Integrations::Rack, scrub: [:body, :headers]
170
+ run MyApp
171
+ ```
172
+
173
+ `scrub:` selects which surfaces to redact (default `[:body, :headers]`):
174
+
175
+ - **`:body`** — buffers the response body, runs `DataRedactor.redact` over it, returns it as a single chunk. Drops the `Content-Length` header so the server recomputes (the redacted body may differ in byte length).
176
+ - **`:headers`** — scrubs sensitive **response** headers (`Set-Cookie`, `Authorization`, `X-Api-Key`, `X-Auth-Token`, `X-Access-Token`), matched case-insensitively, and sensitive **request** headers (`HTTP_AUTHORIZATION`, `HTTP_PROXY_AUTHORIZATION`, `HTTP_COOKIE`, `HTTP_X_API_KEY`, `HTTP_X_AUTH_TOKEN`, `HTTP_X_ACCESS_TOKEN`) in the env hash so any downstream middleware that logs them sees redacted values.
177
+
178
+ Pass a narrower subset (e.g. `scrub: [:headers]`) to opt out of body wrapping. Forwards `only:`/`except:`/`placeholder:` to `DataRedactor.redact`. Unknown surfaces raise `ArgumentError` when the middleware is constructed.
179
+
180
+ > **Body wrapping is buffering.** The middleware reads the entire response body into memory before scanning. For streaming endpoints (SSE, large file downloads, Rack::Hijack) use `scrub: [:headers]` and rely on the Logger formatter for application logs instead.
181
+
182
+ ## Detected patterns (85 total)
132
183
 
133
184
  The table below is a representative sample. Use `DataRedactor.pattern_names` for the canonical, machine-readable list — it stays in sync with the C extension automatically.
134
185
 
@@ -136,15 +187,22 @@ The table below is a representative sample. Use `DataRedactor.pattern_names` for
136
187
 
137
188
  | # | Pattern | Example |
138
189
  |---|---|---|
139
- | 0 | AWS Access Key ID | `AKIAIOSFODNN7EXAMPLE` |
140
- | 1 | AWS Secret Access Key | 40-character base64 string |
141
- | 5 | Google API Key | `AIzaSyXXXX...` |
142
- | 6 | GitHub Personal Access Token | `github_pat_XXXX...` |
143
- | 7 | Slack Webhook URL | `https://hooks.slack.com/services/T.../B.../...` |
144
- | 8 | Stripe Secret Key | `sk_live_XXXX...` |
145
- | 9 | PEM Private Key header | `-----BEGIN RSA PRIVATE KEY-----` |
146
- | 13 | Scaleway Access Key | `SCW12345ABCDE6789FGHIJ` |
147
- | 14 | UUID v4 / Scaleway Secret Key | `550e8400-e29b-41d4-a716-446655440000` |
190
+ | | AWS Access Key ID | `AKIAIOSFODNN7EXAMPLE` |
191
+ | | AWS Secret Access Key | 40-character base64 string |
192
+ | | Google API Key | `AIzaSyXXXX...` |
193
+ | | GitHub Personal Access Token | `github_pat_XXXX...` |
194
+ | | GitHub Classic PAT / OAuth | `ghp_XXXX...` / `gho_XXXX...` |
195
+ | | Slack Webhook URL | `https://hooks.slack.com/services/T.../B.../...` |
196
+ | | Stripe Secret Key | `sk_live_XXXX...` |
197
+ | | Anthropic API Key | `sk-ant-api03-XXXX...` |
198
+ | | OpenAI Project API Key | `sk-proj-XXXX...` |
199
+ | — | GitLab Personal Access Token | `glpat-XXXX...` |
200
+ | — | DigitalOcean PAT | `dop_v1_XXXX...` |
201
+ | — | Databricks API Token | `dapiXXXX...` |
202
+ | — | Sentry DSN | `https://KEY@oNNN.ingest.sentry.io/PID` |
203
+ | — | PEM Private Key header | `-----BEGIN RSA PRIVATE KEY-----` |
204
+ | — | Scaleway Access Key | `SCW12345ABCDE6789FGHIJ` |
205
+ | — | UUID v4 / Scaleway Secret Key | `550e8400-e29b-41d4-a716-446655440000` |
148
206
 
149
207
  ### Travel documents
150
208
 
@@ -267,7 +325,7 @@ bundle exec rake
267
325
 
268
326
  ## How it works
269
327
 
270
- 1. At load time, `Init_data_redactor` compiles all 79 regex patterns once using `regcomp` (POSIX ERE) and stores them as static `regex_t` structs. Patterns marked as boundary-wrapped are expanded with `wrap_boundary()` before compilation.
328
+ 1. At load time, `Init_data_redactor` compiles all 85 regex patterns once using `regcomp` (POSIX ERE) and stores them as static `regex_t` structs. Patterns marked as boundary-wrapped are expanded with `wrap_boundary()` before compilation.
271
329
  2. `DataRedactor.redact(text)` receives a Ruby `String`, converts it to a C `char*` via `StringValueCStr`, and runs each compiled pattern in sequence on a working buffer.
272
330
  3. For each pattern, `replace_all_matches` iterates using `regexec`, copies non-matching segments to a fresh output buffer, and inserts `[REDACTED]` in place of each match. For boundary-wrapped patterns, `regexec` is called with `nmatch=4` and sub-match groups `[1]`/`[3]` identify the boundary characters so they are preserved verbatim.
273
331
  4. The output buffer is grown with `realloc` as needed. After all patterns are applied the result is returned as a Ruby `String` via `rb_str_new_cstr`. All intermediate `malloc`/`strdup` allocations are explicitly `free`d.
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: data_redactor
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.6.0
4
+ version: 0.7.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Daniele Frisanco
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2026-05-08 00:00:00.000000000 Z
11
+ date: 2026-05-09 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rake-compiler
@@ -52,10 +52,25 @@ dependencies:
52
52
  - - "~>"
53
53
  - !ruby/object:Gem::Version
54
54
  version: '0.9'
55
+ - !ruby/object:Gem::Dependency
56
+ name: rack
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '2.0'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '2.0'
55
69
  description: A Ruby gem with a C extension for high-performance scanning and redaction
56
- of 79 sensitive patterns — API keys, tokens, credentials, IBANs, national IDs, emails,
57
- phone numbers, and PII from 15+ countries. Designed to sanitize text before sending
58
- to LLMs, logging systems, or any public/third-party API.
70
+ of 85 sensitive patterns — API keys, tokens, credentials, IBANs, national IDs, emails,
71
+ phone numbers, and PII from 15+ countries. Optional Logger formatter, Rails filter_parameters
72
+ adapter, and Rack middleware. Designed to sanitize text before sending to LLMs,
73
+ logging systems, or any public/third-party API.
59
74
  email:
60
75
  - daniele.frisanco@gmail.com
61
76
  executables: []
@@ -79,6 +94,9 @@ files:
79
94
  - ext/data_redactor/scan.h
80
95
  - ext/data_redactor/tags.h
81
96
  - lib/data_redactor.rb
97
+ - lib/data_redactor/integrations/logger.rb
98
+ - lib/data_redactor/integrations/rack.rb
99
+ - lib/data_redactor/integrations/rails.rb
82
100
  - lib/data_redactor/version.rb
83
101
  - readme.md
84
102
  homepage: https://github.com/danielefrisanco/data_redactor