dpdp-erasure-cli 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (155)
  1. package/.env.example +55 -0
  2. package/Dockerfile +33 -0
  3. package/compliance.worker.yaml +64 -0
  4. package/package.json +41 -0
  5. package/src/constants/index.ts +1 -0
  6. package/src/errors/fail.ts +110 -0
  7. package/src/errors/index.ts +4 -0
  8. package/src/errors/inferer.ts +166 -0
  9. package/src/errors/registry.ts +122 -0
  10. package/src/errors/types.ts +65 -0
  11. package/src/errors/worker.ts +161 -0
  12. package/src/index.ts +328 -0
  13. package/src/lib/crypto/digest.ts +22 -0
  14. package/src/lib/crypto/encoding.ts +78 -0
  15. package/src/lib/crypto/index.ts +2 -0
  16. package/src/lib/index.ts +1 -0
  17. package/src/modules/bootstrap/index.ts +2 -0
  18. package/src/modules/bootstrap/integrity.ts +38 -0
  19. package/src/modules/bootstrap/preflight.ts +296 -0
  20. package/src/modules/cli/check-integrity.ts +48 -0
  21. package/src/modules/cli/dry-run.ts +90 -0
  22. package/src/modules/cli/graph.ts +87 -0
  23. package/src/modules/cli/index.ts +184 -0
  24. package/src/modules/cli/init.ts +115 -0
  25. package/src/modules/cli/inspect.ts +86 -0
  26. package/src/modules/cli/introspector.ts +117 -0
  27. package/src/modules/cli/keygen.ts +38 -0
  28. package/src/modules/cli/scan.ts +126 -0
  29. package/src/modules/cli/sign.ts +50 -0
  30. package/src/modules/cli/ui.ts +61 -0
  31. package/src/modules/cli/verify-schema.ts +31 -0
  32. package/src/modules/cli/verify.ts +85 -0
  33. package/src/modules/config/compatibility.ts +271 -0
  34. package/src/modules/config/index.ts +4 -0
  35. package/src/modules/config/reader.ts +149 -0
  36. package/src/modules/config/signature.ts +69 -0
  37. package/src/modules/config/validation.ts +658 -0
  38. package/src/modules/crypto/aes.ts +158 -0
  39. package/src/modules/crypto/envelope.ts +48 -0
  40. package/src/modules/crypto/hmac.ts +60 -0
  41. package/src/modules/crypto/index.ts +3 -0
  42. package/src/modules/db/drift.ts +36 -0
  43. package/src/modules/db/graph.ts +203 -0
  44. package/src/modules/db/index.ts +4 -0
  45. package/src/modules/db/migrations.ts +254 -0
  46. package/src/modules/db/sql-debug.ts +61 -0
  47. package/src/modules/engine/blob/index.ts +3 -0
  48. package/src/modules/engine/blob/s3.ts +455 -0
  49. package/src/modules/engine/blob/store.ts +236 -0
  50. package/src/modules/engine/blob/types.ts +44 -0
  51. package/src/modules/engine/helpers/identity.ts +47 -0
  52. package/src/modules/engine/helpers/index.ts +4 -0
  53. package/src/modules/engine/helpers/outbox.ts +118 -0
  54. package/src/modules/engine/helpers/runtime.ts +115 -0
  55. package/src/modules/engine/helpers/types.ts +61 -0
  56. package/src/modules/engine/index.ts +6 -0
  57. package/src/modules/engine/notifier/config.ts +147 -0
  58. package/src/modules/engine/notifier/dispatcher.ts +300 -0
  59. package/src/modules/engine/notifier/index.ts +3 -0
  60. package/src/modules/engine/notifier/payload.ts +51 -0
  61. package/src/modules/engine/notifier/reservation.ts +153 -0
  62. package/src/modules/engine/notifier/types.ts +38 -0
  63. package/src/modules/engine/shredder.ts +254 -0
  64. package/src/modules/engine/types.ts +146 -0
  65. package/src/modules/engine/vault/compiled-targets.ts +562 -0
  66. package/src/modules/engine/vault/context.ts +254 -0
  67. package/src/modules/engine/vault/dry-run.ts +94 -0
  68. package/src/modules/engine/vault/execution.ts +485 -0
  69. package/src/modules/engine/vault/index.ts +3 -0
  70. package/src/modules/engine/vault/purge.ts +82 -0
  71. package/src/modules/engine/vault/retention.ts +124 -0
  72. package/src/modules/engine/vault/satellite-mutation.ts +193 -0
  73. package/src/modules/engine/vault/satellite.ts +103 -0
  74. package/src/modules/engine/vault/shadow.ts +36 -0
  75. package/src/modules/engine/vault/static-plan.ts +116 -0
  76. package/src/modules/engine/vault/store.ts +34 -0
  77. package/src/modules/engine/vault/vault.ts +84 -0
  78. package/src/modules/introspector/classifier.ts +502 -0
  79. package/src/modules/introspector/dag.ts +276 -0
  80. package/src/modules/introspector/index.ts +7 -0
  81. package/src/modules/introspector/naming.ts +75 -0
  82. package/src/modules/introspector/report.ts +153 -0
  83. package/src/modules/introspector/run.ts +123 -0
  84. package/src/modules/introspector/s3-sampler.ts +227 -0
  85. package/src/modules/introspector/types.ts +131 -0
  86. package/src/modules/introspector/yaml.ts +101 -0
  87. package/src/modules/network/api/control-plane.ts +275 -0
  88. package/src/modules/network/api/index.ts +1 -0
  89. package/src/modules/network/api/validation.ts +71 -0
  90. package/src/modules/network/index.ts +4 -0
  91. package/src/modules/network/object-store/aws/client.ts +444 -0
  92. package/src/modules/network/object-store/aws/credentials.ts +271 -0
  93. package/src/modules/network/object-store/aws/index.ts +2 -0
  94. package/src/modules/network/object-store/aws/sigv4.ts +190 -0
  95. package/src/modules/network/object-store/aws/type.ts +6 -0
  96. package/src/modules/network/object-store/index.ts +1 -0
  97. package/src/modules/network/outbox/dispatcher.ts +183 -0
  98. package/src/modules/network/outbox/index.ts +3 -0
  99. package/src/modules/network/outbox/process.ts +133 -0
  100. package/src/modules/network/outbox/shared.ts +56 -0
  101. package/src/modules/network/outbox/store.ts +346 -0
  102. package/src/modules/network/outbox/types.ts +54 -0
  103. package/src/modules/network/request-signing.ts +61 -0
  104. package/src/modules/worker/index.ts +2 -0
  105. package/src/modules/worker/tasks.ts +58 -0
  106. package/src/modules/worker/types.ts +89 -0
  107. package/src/modules/worker/worker.ts +243 -0
  108. package/src/secrets/index.ts +4 -0
  109. package/src/secrets/kms/index.ts +2 -0
  110. package/src/secrets/kms/signature.ts +82 -0
  111. package/src/secrets/kms/validation.ts +64 -0
  112. package/src/secrets/reader.ts +42 -0
  113. package/src/secrets/repository/crypto.ts +89 -0
  114. package/src/secrets/repository/index.ts +2 -0
  115. package/src/secrets/repository/methods.ts +37 -0
  116. package/src/secrets/resolvers.ts +247 -0
  117. package/src/secrets/signature.ts +78 -0
  118. package/src/types/index.ts +1 -0
  119. package/src/types/types.ts +23 -0
  120. package/src/utils/identifiers.ts +48 -0
  121. package/src/utils/index.ts +3 -0
  122. package/src/utils/json.ts +35 -0
  123. package/src/utils/logger.ts +161 -0
  124. package/src/validation/zod.ts +70 -0
  125. package/tests/adversarial.test.ts +464 -0
  126. package/tests/blob-s3.test.ts +216 -0
  127. package/tests/config.test.ts +395 -0
  128. package/tests/control-plane-client.test.ts +108 -0
  129. package/tests/crypto.test.ts +106 -0
  130. package/tests/errors.test.ts +69 -0
  131. package/tests/fetch-dispatcher.test.ts +213 -0
  132. package/tests/graph.test.ts +84 -0
  133. package/tests/helpers/index.ts +101 -0
  134. package/tests/index-preflight.test.ts +168 -0
  135. package/tests/introspector-classifier.test.ts +62 -0
  136. package/tests/introspector-report.test.ts +85 -0
  137. package/tests/introspector.test.ts +394 -0
  138. package/tests/kms.test.ts +124 -0
  139. package/tests/logger.test.ts +61 -0
  140. package/tests/notifier.test.ts +303 -0
  141. package/tests/outbox.test.ts +478 -0
  142. package/tests/purge-policy.test.ts +124 -0
  143. package/tests/retention.test.ts +103 -0
  144. package/tests/s3-client.test.ts +110 -0
  145. package/tests/satellite.test.ts +119 -0
  146. package/tests/schema-compatibility.test.ts +237 -0
  147. package/tests/schema-integrity.test.ts +64 -0
  148. package/tests/shredder.test.ts +163 -0
  149. package/tests/vault.compiled-targets.test.ts +243 -0
  150. package/tests/vault.replica.test.ts +59 -0
  151. package/tests/vault.test.ts +279 -0
  152. package/tests/worker.retry.test.ts +291 -0
  153. package/tests/worker.test.ts +200 -0
  154. package/tsconfig.json +19 -0
  155. package/vitest.config.ts +13 -0
@@ -0,0 +1,502 @@
1
+ import { fail } from "@/errors";
2
+ import type { ClassifierOptions, ColumnTaxonomy, DagTarget, QualifiedTable } from "./types";
3
+
4
// Shared UTF-8 codec instances, created once and reused for every classified value.
const textEncoder = new TextEncoder();
const textDecoder = new TextDecoder();

// JSON flattening ignores any node nested deeper than this many levels.
const MAX_FLATTEN_DEPTH = 10;

// `information_schema.columns.data_type` values that are eligible for sampling.
const SUPPORTED_DATA_TYPES = new Set([
  // Character types
  "text",
  "character varying",
  "varchar",
  "character",
  "char",
  // Document types (flattened into leaves before classification)
  "json",
  "jsonb",
  // Identifier and network types
  "uuid",
  "inet",
  "cidr",
  "macaddr",
  "macaddr8",
]);

// Name-based scores. STRONG and MEDIUM clear DEFAULT_THRESHOLD on their own; WEAK does not.
const STRONG_METADATA_SCORE = 0.92;
const MEDIUM_METADATA_SCORE = 0.82;
const WEAK_METADATA_SCORE = 0.62;

// Defaults applied when the caller leaves the matching option unset.
const DEFAULT_THRESHOLD = 0.75;
const DEFAULT_SAMPLE_PERCENT = 1;
const DEFAULT_SAMPLE_LIMIT = 100;
29
+
30
/**
 * Verhoeff combination table. `validateAadhaar` indexes it as
 * `VERHOEFF_D[runningChecksum][permutedDigit]` to obtain the next running checksum.
 */
const VERHOEFF_D = [
  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
  [1, 2, 3, 4, 0, 6, 7, 8, 9, 5],
  [2, 3, 4, 0, 1, 7, 8, 9, 5, 6],
  [3, 4, 0, 1, 2, 8, 9, 5, 6, 7],
  [4, 0, 1, 2, 3, 9, 5, 6, 7, 8],
  [5, 9, 8, 7, 6, 0, 4, 3, 2, 1],
  [6, 5, 9, 8, 7, 1, 0, 4, 3, 2],
  [7, 6, 5, 9, 8, 2, 1, 0, 4, 3],
  [8, 7, 6, 5, 9, 3, 2, 1, 0, 4],
  [9, 8, 7, 6, 5, 4, 3, 2, 1, 0],
] as const;

/**
 * Verhoeff position permutation table. `validateAadhaar` indexes it as
 * `VERHOEFF_P[positionFromRight % 8][digit]`; the eight rows repeat cyclically.
 */
const VERHOEFF_P = [
  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
  [1, 5, 7, 6, 2, 8, 3, 0, 9, 4],
  [5, 8, 0, 3, 7, 9, 6, 1, 4, 2],
  [8, 9, 1, 6, 0, 4, 3, 5, 2, 7],
  [9, 4, 5, 3, 1, 2, 6, 8, 7, 0],
  [4, 2, 8, 6, 5, 7, 3, 9, 0, 1],
  [2, 7, 9, 3, 8, 0, 6, 4, 1, 5],
  [7, 0, 4, 6, 9, 1, 3, 2, 5, 8],
] as const;
53
+
54
/** One row of the `information_schema.columns` lookup issued by `getColumns`. */
interface ColumnRow {
  table_schema: string;
  table_name: string;
  column_name: string;
  data_type: string;
}

/** One sampled cell; `sample_value` is the alias used by the sampling queries. */
interface SampleRow {
  sample_value: unknown;
}

/** A scalar found inside a JSON document together with the path that leads to it. */
interface JsonLeafEntry {
  /** Dotted / bracketed path such as `a.b[0]`; empty for a top-level scalar. */
  path: string;
  /** The scalar rendered with `String(...)`. */
  value: string;
}

/** A content detector applied to each sampled leaf value. */
interface ContentSignature {
  /** Identifier reported in `matchedSignatures`. */
  name: string;
  /** Shape check run against the trimmed value. */
  pattern: RegExp;
  /** Confidence contributed when this signature matches. */
  weight: number;
  /** Optional second-stage check (checksum or structure) run after `pattern` matches. */
  validate?: (value: string) => boolean;
  /** When true, the signature only applies if the column name matches one of `metadataHints`. */
  requiresMetadata?: boolean;
  /** Column-name patterns consulted when `requiresMetadata` is set. */
  metadataHints?: RegExp[];
}
78
+
79
/**
 * Content detectors evaluated against every sampled leaf value.
 *
 * Entries without `requiresMetadata` apply to any column. Entries with it are shapes too
 * generic to trust on their own, so they only apply when the column name matches a hint.
 */
const CONTENT_SIGNATURES: ContentSignature[] = [
  // Identifiers with a checksum or structural validator.
  { name: "aadhaar", pattern: /^[2-9][0-9]{3}[\s-]?[0-9]{4}[\s-]?[0-9]{4}$/, weight: 0.98, validate: validateAadhaar },
  { name: "pan", pattern: /^[A-Z]{5}[0-9]{4}[A-Z]$/, weight: 0.96, validate: validatePan },
  { name: "gstin", pattern: /^[0-9]{2}[A-Z]{5}[0-9]{4}[A-Z][1-9A-Z]Z[0-9A-Z]$/, weight: 0.94, validate: validateGstin },
  { name: "credit_card", pattern: /^(?:\d[ -]?){13,19}$/, weight: 0.97, validate: validateLuhn },

  // Shape-only identifiers.
  { name: "email", pattern: /^[^\s@]+@[^\s@]+\.[^\s@]+$/, weight: 0.95 },
  { name: "upi", pattern: /^[a-zA-Z0-9.\-_]{2,256}@[a-zA-Z]{2,64}$/, weight: 0.93 },
  { name: "ifsc", pattern: /^[A-Z]{4}0[A-Z0-9]{6}$/, weight: 0.9 },
  { name: "indian_mobile", pattern: /^(?:(?:\+|0{0,2})91(\s*[-]\s*)?|[0]?)?[6789]\d{9}$/, weight: 0.9 },
  { name: "indian_passport", pattern: /^[A-Z][0-9]{7}$/, weight: 0.88 },
  { name: "voter_epic", pattern: /^[A-Z]{3}[0-9]{7}$/, weight: 0.88 },

  // Network identifiers.
  { name: "ipv4", pattern: /^(?:(?:25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)\.){3}(?:25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)$/, weight: 0.82 },
  { name: "ipv6", pattern: /^[0-9A-Fa-f:.]+$/, weight: 0.82, validate: validateIpv6 },
  { name: "mac_address", pattern: /^(?:[0-9A-Fa-f]{2}[:-]){5}[0-9A-Fa-f]{2}$/, weight: 0.82 },

  // Generic shapes that need a supporting column name.
  {
    name: "bank_account", pattern: /^\d{9,18}$/, weight: 0.78,
    requiresMetadata: true, metadataHints: [/bank/i, /account/i, /iban/i],
  },
  {
    name: "date_of_birth", pattern: /^(?:19|20)\d{2}[-/](?:0[1-9]|1[0-2])[-/](?:0[1-9]|[12]\d|3[01])$/, weight: 0.82,
    requiresMetadata: true, metadataHints: [/dob/i, /birth/i, /date_of_birth/i],
  },
  {
    name: "indian_driving_license", pattern: /^[A-Z]{2}[-\s]?[0-9]{2}[-\s]?[0-9]{11}$/, weight: 0.84,
    requiresMetadata: true, metadataHints: [/driving/i, /licen[cs]e/i, /(^|_)dl(_|$)/i],
  },
  {
    name: "indian_pin_code", pattern: /^[1-9][0-9]{5}$/, weight: 0.78,
    requiresMetadata: true, metadataHints: [/pincode/i, /pin_code/i, /postal/i, /zip/i],
  },
];
122
+
123
/**
 * Column-name patterns and the score each one contributes in `metadataScore`.
 *
 * Every pattern is anchored on `_` or the string boundary on both sides, so a keyword only
 * matches as a whole snake_case segment (for example `pan` does not match `japan`).
 */
const METADATA_PATTERNS: Array<{ pattern: RegExp; score: number }> = [
  // Strong: the name alone clears the default threshold.
  { score: STRONG_METADATA_SCORE, pattern: /(^|_)(email|e_mail|email_address|mail_address|contact_email)($|_)/i },
  { score: STRONG_METADATA_SCORE, pattern: /(^|_)(phone|mobile|msisdn|telephone|contact_number|whatsapp)(_number|_no)?($|_)/i },
  { score: STRONG_METADATA_SCORE, pattern: /(^|_)(aadhaar|aadhar|uidai)(_number|_no|_id)?($|_)/i },
  { score: STRONG_METADATA_SCORE, pattern: /(^|_)(pan|pan_number|pan_no)($|_)/i },
  { score: STRONG_METADATA_SCORE, pattern: /(^|_)(gstin|gst_number|gst_no)($|_)/i },
  { score: STRONG_METADATA_SCORE, pattern: /(^|_)(credit_card|debit_card|card_number|card_no|cc_number|cc_no)($|_)/i },
  { score: STRONG_METADATA_SCORE, pattern: /(^|_)(upi|vpa|upi_id|upi_address)($|_)/i },
  { score: STRONG_METADATA_SCORE, pattern: /(^|_)(ifsc|ifsc_code)($|_)/i },
  { score: STRONG_METADATA_SCORE, pattern: /(^|_)(passport|passport_number|passport_no)($|_)/i },
  { score: STRONG_METADATA_SCORE, pattern: /(^|_)(voter|voter_id|epic|epic_number|epic_no)($|_)/i },

  // Medium: also above the default threshold, but lower than the strong tier.
  { score: MEDIUM_METADATA_SCORE, pattern: /(^|_)(dob|date_of_birth|birth_date|birthday)($|_)/i },
  { score: MEDIUM_METADATA_SCORE, pattern: /(^|_)(ip|ip_address|ipv4|ipv6|mac|mac_address)($|_)/i },
  { score: MEDIUM_METADATA_SCORE, pattern: /(^|_)(bank_account|account_number|account_no|iban|swift)($|_)/i },
  { score: MEDIUM_METADATA_SCORE, pattern: /(^|_)(driving_license|driving_licence|license_number|licence_number|dl_number|dl_no)($|_)/i },

  // Weak: below the default threshold, so the name needs supporting content matches.
  { score: WEAK_METADATA_SCORE, pattern: /(^|_)(address|street|postal_code|zip_code|pin_code|pincode)($|_)/i },
  { score: WEAK_METADATA_SCORE, pattern: /(^|_)(device_fingerprint|device_id|advertising_id|gaid|idfa)($|_)/i },
];
141
+
142
/**
 * Builds the `schema.table` key under which a table's findings are grouped.
 *
 * @param table - Schema-qualified table reference.
 * @returns The schema name and table name separated by a dot.
 */
function qualifiedKey(table: QualifiedTable): string {
  const { schema, table: tableName } = table;
  return `${schema}.${tableName}`;
}
145
+
146
/**
 * Scores how strongly a column name (or flattened JSON path) suggests personal data.
 *
 * The metadata patterns only match whole snake_case segments, so the name is checked in two
 * forms: plainly lower-cased, and converted to snake_case first. The conversion splits
 * camelCase / PascalCase boundaries and turns runs of whitespace, dots, and hyphens into
 * underscores, so `emailAddress`, `MACAddress`, and `phone-number` score like their
 * snake_case equivalents. Because the lower-cased form is always checked as well, no name
 * scores lower than it would with lower-casing alone.
 *
 * @param columnName - Column name or JSON path in any casing convention.
 * @returns The highest score among matching patterns, or `0` when none match.
 */
export function metadataScore(columnName: string): number {
  const lowered = columnName.toLowerCase();
  const snakeCased = columnName
    // "MACAddress" -> "MAC_Address": split an acronym from the capitalised word after it.
    .replace(/([A-Z]+)([A-Z][a-z])/g, "$1_$2")
    // "emailAddress" -> "email_Address": split a lower-case or digit to upper-case boundary.
    .replace(/([a-z0-9])([A-Z])/g, "$1_$2")
    .replace(/[\s.-]+/g, "_")
    .toLowerCase();
  const candidates = snakeCased === lowered ? [lowered] : [lowered, snakeCased];

  return METADATA_PATTERNS.reduce(
    (score, candidate) =>
      candidate.score > score && candidates.some((name) => candidate.pattern.test(name))
        ? candidate.score
        : score,
    0
  );
}
153
+
154
/**
 * Removes every character that is not an ASCII digit.
 *
 * @param value - Arbitrary text, typically a number written with separators.
 * @returns Only the digits of `value`, in their original order.
 */
function digitsOnly(value: string): string {
  return value.replace(/[^0-9]/g, "");
}
157
+
158
/**
 * Checks an Aadhaar candidate against the Verhoeff checksum.
 *
 * All non-digit characters are discarded first. The remaining digits must be exactly twelve,
 * start with 2-9, and not consist of one repeated digit.
 *
 * @param value - Aadhaar candidate, with or without separators.
 * @returns `true` when the twelve digits have a Verhoeff checksum of zero.
 */
export function validateAadhaar(value: string): boolean {
  const digits = digitsOnly(value);
  const hasAadhaarShape = /^[2-9]\d{11}$/.test(digits);
  const isSingleRepeatedDigit = /^(\d)\1+$/.test(digits);
  if (!hasAadhaarShape || isSingleRepeatedDigit) {
    return false;
  }

  // Fold the digits right-to-left through the permutation and combination tables.
  const checksum = Array.from(digits)
    .reverse()
    .reduce(
      (state: number, char, position) => VERHOEFF_D[state]![VERHOEFF_P[position % 8]![Number(char)]!]!,
      0
    );

  return checksum === 0;
}
179
+
180
/**
 * Checks a card-number candidate against the Luhn checksum.
 *
 * All non-digit characters are discarded first. The remaining digits must number 13 to 19
 * and must not consist of one repeated digit.
 *
 * @param value - Candidate number, optionally written with separators.
 * @returns `true` when the digits satisfy the Luhn check.
 */
export function validateLuhn(value: string): boolean {
  const digits = digitsOnly(value);
  if (!/^\d{13,19}$/.test(digits)) {
    return false;
  }
  if (/^(\d)\1+$/.test(digits)) {
    return false;
  }

  // Counting from the right, every second digit is doubled, with 9 subtracted if that exceeds 9.
  const total = Array.from(digits)
    .reverse()
    .reduce((sum, char, position) => {
      const digit = Number(char);
      if (position % 2 === 0) {
        return sum + digit;
      }
      const doubled = digit * 2;
      return sum + (doubled > 9 ? doubled - 9 : doubled);
    }, 0);

  return total % 10 === 0;
}
208
+
209
/**
 * Applies structural PAN validation.
 *
 * The candidate is trimmed and upper-cased, then must have the shape of five letters, four
 * digits, and one letter, with the fourth letter being one of `P C H F A T B L J G`. No
 * checksum is computed.
 *
 * @param value - PAN candidate in any letter case; surrounding whitespace is ignored.
 * @returns `true` when the shape and the fourth-character constraint both hold.
 */
export function validatePan(value: string): boolean {
  const candidate = value.trim().toUpperCase();
  // Same shape as five letters + four digits + one letter, with position four restricted.
  return /^[A-Z]{3}[PCHFATBLJG][A-Z][0-9]{4}[A-Z]$/.test(candidate);
}
223
+
224
/**
 * Applies structural GSTIN validation.
 *
 * The candidate is trimmed and upper-cased, must match the 15-character GSTIN shape, and
 * characters 3-12 must pass `validatePan`. The trailing check character is not verified.
 *
 * @param value - GSTIN candidate in any letter case; surrounding whitespace is ignored.
 * @returns `true` when both the overall shape and the embedded PAN are structurally valid.
 */
export function validateGstin(value: string): boolean {
  const candidate = value.trim().toUpperCase();
  if (!/^[0-9]{2}[A-Z]{5}[0-9]{4}[A-Z][1-9A-Z]Z[0-9A-Z]$/.test(candidate)) {
    return false;
  }

  const embeddedPan = candidate.slice(2, 12);
  return validatePan(embeddedPan);
}
239
+
240
/**
 * Checks whether a string is an IPv6 address by letting the URL parser parse it as a
 * bracketed host.
 *
 * @param value - Candidate address without surrounding brackets.
 * @returns `true` when `value` contains a colon and `http://[value]/` parses to a non-empty host.
 */
function validateIpv6(value: string): boolean {
  if (value.indexOf(":") === -1) {
    return false;
  }

  try {
    const { hostname } = new URL(`http://[${value}]/`);
    return hostname !== "";
  } catch {
    // The URL constructor throws for anything that is not a valid bracketed host.
    return false;
  }
}
251
+
252
/**
 * Flattens a JSON value into the string form of its scalar leaves, discarding paths.
 *
 * @param value - Parsed JSON value.
 * @returns Leaf values in the order `flattenJsonLeafEntries` yields them.
 */
function flattenJsonLeaves(value: unknown): string[] {
  const values: string[] = [];
  for (const entry of flattenJsonLeafEntries(value)) {
    values.push(entry.value);
  }
  return values;
}
255
+
256
/**
 * Walks a JSON value depth-first and collects every string, number, and boolean leaf
 * together with its path.
 *
 * Paths use `.key` for object members and `[index]` for array items, with no leading dot
 * at the root. `null`, `undefined`, and anything nested deeper than `MAX_FLATTEN_DEPTH` are
 * skipped, as are values of any other type.
 *
 * @param value - Parsed JSON value (scalar, array, or object).
 * @returns Leaves in document order.
 */
function flattenJsonLeafEntries(value: unknown): JsonLeafEntry[] {
  const leaves: JsonLeafEntry[] = [];

  // Recursion is bounded by MAX_FLATTEN_DEPTH, so the call stack stays shallow.
  const visit = (node: unknown, path: string, depth: number): void => {
    if (node === null || node === undefined || depth > MAX_FLATTEN_DEPTH) {
      return;
    }

    switch (typeof node) {
      case "string":
      case "number":
      case "boolean":
        leaves.push({ path, value: String(node) });
        return;
      case "object":
        break;
      default:
        // bigint, symbol, and function values are not leaves.
        return;
    }

    if (Array.isArray(node)) {
      for (let index = 0; index < node.length; index += 1) {
        visit(node[index], path ? `${path}[${index}]` : `[${index}]`, depth + 1);
      }
      return;
    }

    for (const [key, child] of Object.entries(node as Record<string, unknown>)) {
      visit(child, path ? `${path}.${key}` : key, depth + 1);
    }
  };

  visit(value, "", 0);
  return leaves;
}
301
+
302
/**
 * Turns one sampled cell into the list of leaf values to classify.
 *
 * For `json` / `jsonb` columns the value is flattened; a string value is parsed as JSON
 * first and, if that fails, used as a single leaf as-is. For every other data type the
 * value is converted with `String(...)` into a single leaf with an empty path.
 *
 * @param value - Sampled cell value.
 * @param dataType - The column's `data_type`.
 * @returns Leaf entries, or an empty array for `null` / `undefined`.
 */
function extractLeafEntries(value: unknown, dataType: string): JsonLeafEntry[] {
  if (value === undefined || value === null) {
    return [];
  }

  const isJsonColumn = dataType === "json" || dataType === "jsonb";
  if (!isJsonColumn) {
    return [{ path: "", value: String(value) }];
  }

  if (typeof value !== "string") {
    return flattenJsonLeafEntries(value);
  }

  try {
    const parsed: unknown = JSON.parse(value);
    return flattenJsonLeafEntries(parsed);
  } catch {
    // Not valid JSON text: classify the raw string instead.
    return [{ path: "", value }];
  }
}
321
+
322
/**
 * Returns only the leaf values of a sampled cell, without their paths.
 *
 * @param value - Sampled cell value.
 * @param dataType - The column's `data_type`.
 * @returns Leaf values in the order `extractLeafEntries` produces them.
 */
export function extractLeafValues(value: unknown, dataType: string): string[] {
  const values: string[] = [];
  for (const entry of extractLeafEntries(value, dataType)) {
    values.push(entry.value);
  }
  return values;
}
325
+
326
/**
 * Decides whether a signature may be applied to a column with the given name.
 *
 * @param signature - Signature under consideration.
 * @param columnName - Column name (or column name plus JSON path) to test the hints against.
 * @returns `true` when the signature needs no metadata, or when at least one of its hints
 *   matches the lower-cased name.
 */
function signatureHasMetadataSupport(signature: ContentSignature, columnName: string): boolean {
  if (!signature.requiresMetadata) {
    return true;
  }

  const hints = signature.metadataHints ?? [];
  const loweredName = columnName.toLowerCase();
  for (const hint of hints) {
    if (hint.test(loweredName)) {
      return true;
    }
  }
  return false;
}
334
+
335
// Longest trimmed value that is still run through the signature patterns. Every bounded
// signature shape is far shorter than this; the cap exists so arbitrarily long sampled text
// cannot drive the backtracking patterns (notably `email`) into quadratic time.
const MAX_CLASSIFIABLE_LEAF_LENGTH = 512;

/**
 * Returns every content signature that a single leaf value satisfies.
 *
 * A signature matches when its metadata requirement is met for `columnName`, its pattern
 * matches the trimmed value, and its optional validator accepts it. Values longer than
 * `MAX_CLASSIFIABLE_LEAF_LENGTH` after trimming are not classified at all.
 *
 * @param value - Leaf value to classify.
 * @param columnName - Column name (plus JSON path, if any) used for metadata-gated signatures.
 * @returns Matching signatures in `CONTENT_SIGNATURES` order; empty when nothing matches.
 */
function classifyLeafDetailed(value: string, columnName: string = ""): ContentSignature[] {
  const trimmed = value.trim();
  if (trimmed.length > MAX_CLASSIFIABLE_LEAF_LENGTH) {
    return [];
  }

  const bytes = textEncoder.encode(trimmed);
  try {
    const normalized = textDecoder.decode(bytes).trim();
    return CONTENT_SIGNATURES.filter(
      (signature) =>
        signatureHasMetadataSupport(signature, columnName) &&
        signature.pattern.test(normalized) &&
        (!signature.validate || signature.validate(normalized))
    );
  } finally {
    // Zeroes the temporary byte buffer only; the decoded string is immutable and unaffected.
    bytes.fill(0);
  }
}
349
+
350
/**
 * Returns the names of the content signatures that a single leaf value satisfies.
 *
 * @param value - Leaf value to classify.
 * @param columnName - Column name used for metadata-gated signatures; defaults to none.
 * @returns Signature names in the order `classifyLeafDetailed` returns them.
 */
export function classifyLeaf(value: string, columnName: string = ""): string[] {
  const names: string[] = [];
  for (const signature of classifyLeafDetailed(value, columnName)) {
    names.push(signature.name);
  }
  return names;
}
353
+
354
/**
 * Lists the columns of the target tables whose data type is eligible for sampling.
 *
 * Tables are matched on their `schema.table` key, built with `qualifiedKey` so the format
 * stays identical to the keys under which findings are later grouped.
 *
 * @param sql - SQL handle from the classifier options.
 * @param targets - DAG targets whose tables should be inspected.
 * @returns Matching columns ordered by schema, table, and ordinal position; an empty array
 *   (without querying) when there are no targets.
 */
async function getColumns(sql: ClassifierOptions["sql"], targets: DagTarget[]): Promise<ColumnRow[]> {
  const targetKeys = targets.map((target) => qualifiedKey(target.table));

  if (targetKeys.length === 0) {
    return [];
  }

  return sql<ColumnRow[]>`
    SELECT table_schema, table_name, column_name, data_type
    FROM information_schema.columns
    WHERE table_schema || '.' || table_name = ANY(${targetKeys})
      AND data_type = ANY(${Array.from(SUPPORTED_DATA_TYPES)})
    ORDER BY table_schema, table_name, ordinal_position
  `;
}
369
+
370
/**
 * Reads up to `sampleLimit` non-null values of one column.
 *
 * A `TABLESAMPLE SYSTEM` query is tried first. When it returns no rows, a plain
 * `LIMIT` query over the table is issued instead.
 *
 * @param sql - SQL handle from the classifier options.
 * @param table - Schema-qualified table to read from.
 * @param column - Column to sample.
 * @param samplePercent - Percentage passed to `TABLESAMPLE SYSTEM`.
 * @param sampleLimit - Maximum number of rows to return.
 * @returns Sampled rows, each exposing the value as `sample_value`.
 */
async function sampleColumn(
  sql: ClassifierOptions["sql"],
  table: QualifiedTable,
  column: string,
  samplePercent: number,
  sampleLimit: number
): Promise<SampleRow[]> {
  const blockSample = await sql<SampleRow[]>`
    SELECT ${sql(column)} AS sample_value
    FROM ${sql(table.schema)}.${sql(table.table)} TABLESAMPLE SYSTEM (${samplePercent})
    WHERE ${sql(column)} IS NOT NULL
    LIMIT ${sampleLimit}
  `;

  if (blockSample.length === 0) {
    // The sample came back empty, so fall back to reading the first rows directly.
    const headRows = await sql<SampleRow[]>`
      SELECT ${sql(column)} AS sample_value
      FROM ${sql(table.schema)}.${sql(table.table)}
      WHERE ${sql(column)} IS NOT NULL
      LIMIT ${sampleLimit}
    `;
    return headRows;
  }

  return blockSample;
}
395
+
396
/**
 * Classifies likely PII columns using column-name metadata and bounded content sampling.
 *
 * For each eligible column of the target tables, up to `sampleLimit` values are sampled and
 * every leaf value is checked against the content signatures. A column is reported when its
 * confidence reaches `threshold`, where confidence is the largest of:
 *   - the metadata score, if that score alone reaches the threshold;
 *   - the content confidence (share of matching rows times the highest matched signature weight);
 *   - a 30 % metadata / 70 % content blend.
 *
 * Columns are sampled one at a time, in the order `getColumns` returns them.
 *
 * @param options - SQL handle, DAG targets, sampling controls, and confidence threshold.
 * @returns PII columns grouped by `schema.table` key; tables without findings are absent.
 * @throws Through `fail` when `samplePercent` is not a finite number in (0, 100] or
 *   `sampleLimit` is not an integer in [1, 1000] (this assumes `fail` does not return;
 *   confirm against `@/errors`).
 */
export async function classifyDagTargets(options: ClassifierOptions): Promise<Map<string, ColumnTaxonomy[]>> {
  const samplePercent = options.samplePercent ?? DEFAULT_SAMPLE_PERCENT;
  const sampleLimit = options.sampleLimit ?? DEFAULT_SAMPLE_LIMIT;
  // NOTE(review): threshold is not range-checked. A NaN or > 1 threshold silently yields no
  // findings; consider validating it once a suitable error code exists.
  const threshold = options.threshold ?? DEFAULT_THRESHOLD;

  // NaN compares false against both bounds, so it has to be rejected explicitly; otherwise
  // it would reach the TABLESAMPLE argument.
  if (!Number.isFinite(samplePercent) || samplePercent <= 0 || samplePercent > 100) {
    fail({
      code: "INTROSPECTOR_SAMPLE_INVALID",
      title: "Invalid sample percentage",
      detail: "Introspector samplePercent must be greater than 0 and less than or equal to 100.",
      category: "validation",
      retryable: false,
      context: { samplePercent },
    });
  }

  if (!Number.isInteger(sampleLimit) || sampleLimit < 1 || sampleLimit > 1000) {
    fail({
      code: "INTROSPECTOR_SAMPLE_INVALID",
      title: "Invalid sample limit",
      detail: "Introspector sampleLimit must be an integer between 1 and 1000.",
      category: "validation",
      retryable: false,
      context: { sampleLimit },
    });
  }

  const findings = new Map<string, ColumnTaxonomy[]>();
  const columns = await getColumns(options.sql, options.targets);

  for (const column of columns) {
    const table = { schema: column.table_schema, table: column.table_name };
    const rows = await sampleColumn(options.sql, table, column.column_name, samplePercent, sampleLimit);
    const matchedSignatures = new Set<string>();
    let matchedRows = 0;
    let jsonPathMetadataScore = 0;
    // Highest weight among matched content signatures; `json_path:` markers carry no weight.
    let signatureWeight = 0;

    for (const row of rows) {
      const leaves = extractLeafEntries(row.sample_value, column.data_type);
      let rowMatched = false;
      try {
        for (const leaf of leaves) {
          // JSON keys are scored like column names once path punctuation becomes underscores.
          const leafMetadataScore = leaf.path ? metadataScore(leaf.path.replace(/[.[\]]+/g, "_")) : 0;
          if (leafMetadataScore > 0) {
            jsonPathMetadataScore = Math.max(jsonPathMetadataScore, leafMetadataScore);
            if (leafMetadataScore >= threshold) {
              rowMatched = true;
              matchedSignatures.add(`json_path:${leaf.path}`);
            }
          }

          const matches = classifyLeafDetailed(leaf.value, `${column.column_name}_${leaf.path}`);
          for (const match of matches) {
            rowMatched = true;
            matchedSignatures.add(match.name);
            signatureWeight = Math.max(signatureWeight, match.weight);
          }
        }
      } finally {
        // Release the sampled values as soon as the row has been scored.
        leaves.length = 0;
      }

      if (rowMatched) {
        matchedRows += 1;
      }
    }

    const sampleSize = rows.length;
    const contentMatchRatio = sampleSize === 0 ? 0 : matchedRows / sampleSize;
    const meta = Math.max(metadataScore(column.column_name), jsonPathMetadataScore);
    const contentConfidence = contentMatchRatio * signatureWeight;
    const confidence = Math.max(
      meta >= threshold ? meta : 0,
      contentConfidence,
      0.3 * meta + 0.7 * contentConfidence
    );

    if (confidence >= threshold) {
      const key = qualifiedKey(table);
      const existing = findings.get(key) ?? [];
      existing.push({
        table,
        column: column.column_name,
        dataType: column.data_type,
        metadataScore: meta,
        contentMatchRatio,
        confidence,
        sampleSize,
        matchedSignatures: Array.from(matchedSignatures).sort(),
      });
      findings.set(key, existing);
    }
  }

  return findings;
}