@flexorch/audit 0.6.0 → 0.8.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +85 -6
- package/dist/index.d.cts +26 -2
- package/dist/index.d.ts +26 -2
- package/dist/index.js +83 -6
- package/package.json +1 -1
package/dist/index.cjs
CHANGED
|
@@ -26,10 +26,12 @@ __export(index_exports, {
|
|
|
26
26
|
auditStream: () => auditStream,
|
|
27
27
|
complianceReport: () => complianceReport,
|
|
28
28
|
detectPii: () => detectPii,
|
|
29
|
+
estimateTokens: () => estimateTokens,
|
|
29
30
|
mask: () => mask,
|
|
30
31
|
noiseMetrics: () => noiseMetrics,
|
|
31
32
|
noiseRatio: () => noiseRatio,
|
|
32
33
|
qualityMetrics: () => qualityMetrics,
|
|
34
|
+
redactForLlm: () => redactForLlm,
|
|
33
35
|
version: () => version
|
|
34
36
|
});
|
|
35
37
|
module.exports = __toCommonJS(index_exports);
|
|
@@ -57,6 +59,7 @@ var COMPANY_NAME_TR_RE = new RegExp(
|
|
|
57
59
|
"gu"
|
|
58
60
|
);
|
|
59
61
|
var MERSIS_RE = /\b([1-9]\d{15})\b/g;
|
|
62
|
+
var SGK_RE = /(?:SGK\s*(?:Sicil\s*No(?:su)?|No(?:su)?|Numara(?:s[ıi])?)?|Sigortal[ıi]\s*(?:Sicil\s*)?(?:No|Numara(?:s[ıi])?)|SSK\s*(?:No|Numara(?:s[ıi])?|Sicil))\s*[:#]*\s*(\d{10,11})\b/giu;
|
|
60
63
|
var POSTAL_CODE_TR_RE = /\b((?:0[1-9]|[1-7]\d|80|81)\d{3})\b/g;
|
|
61
64
|
var _TR_PROVINCES_SORTED = [
|
|
62
65
|
"Afyonkarahisar",
|
|
@@ -336,12 +339,24 @@ function validEinUs(s) {
|
|
|
336
339
|
return !_EIN_INVALID_PREFIXES.has(s.slice(0, 2));
|
|
337
340
|
}
|
|
338
341
|
var PESEL_PL_RE = /\b(\d{11})\b/g;
|
|
342
|
+
var NIP_PL_RE = /(?:NIP|Numer\s+NIP|Numer\s+Identyfikacji\s+Podatkowej)\s*[:#]*\s*(\d{10})\b/gi;
|
|
339
343
|
function validPeselPl(s) {
|
|
340
344
|
if (s.length !== 11 || !/^\d+$/.test(s)) return false;
|
|
341
345
|
const weights = [1, 3, 7, 9, 1, 3, 7, 9, 1, 3];
|
|
342
346
|
const total = weights.reduce((sum, w, i) => sum + w * parseInt(s[i]), 0);
|
|
343
347
|
return (10 - total % 10) % 10 === parseInt(s[10]);
|
|
344
348
|
}
|
|
349
|
+
function validNifPt(s) {
|
|
350
|
+
if (s.length !== 9 || !/^\d+$/.test(s) || s[0] === "0") return false;
|
|
351
|
+
let total = 0;
|
|
352
|
+
for (let i = 0; i < 8; i++) total += (9 - i) * parseInt(s[i]);
|
|
353
|
+
const check = (11 - total % 11) % 11;
|
|
354
|
+
return (check >= 10 ? 0 : check) === parseInt(s[8]);
|
|
355
|
+
}
|
|
356
|
+
var NIF_PT_RE = /(?:NIF|N[uú]mero\s+de\s+Contribuinte|Contribuinte)\s*[:#]*\s*(\d{9})\b/gi;
|
|
357
|
+
var PERSONNUMMER_SE_RE = /\b(\d{6,8}[-+]\d{4})\b/g;
|
|
358
|
+
var CPR_DK_RE = /\b(\d{6}-\d{4})\b/g;
|
|
359
|
+
var HETU_FI_RE = /\b(\d{6}[+\-A]\d{3}[0-9A-FHJ-NPR-Y])\b/g;
|
|
345
360
|
var SVNR_AT_RE = /\b(\d{10})\b/g;
|
|
346
361
|
function validSvnrAt(s) {
|
|
347
362
|
if (s.length !== 10 || !/^\d+$/.test(s)) return false;
|
|
@@ -367,17 +382,22 @@ var LOCALE_DETECTORS = {
|
|
|
367
382
|
"company_name_tr",
|
|
368
383
|
"mersis_no",
|
|
369
384
|
"postal_code_tr",
|
|
370
|
-
"province_tr"
|
|
385
|
+
"province_tr",
|
|
386
|
+
"sgk_no"
|
|
371
387
|
]),
|
|
372
388
|
us: /* @__PURE__ */ new Set(["ssn", "tax_id_us", "national_id_us", "phone_intl", "company_name_intl"]),
|
|
373
389
|
eu: /* @__PURE__ */ new Set(["phone_intl", "iban_intl", "company_name_intl"]),
|
|
374
|
-
de: /* @__PURE__ */ new Set(["tax_id_de", "social_id_de"]),
|
|
375
|
-
fr: /* @__PURE__ */ new Set(["siret_fr", "company_id_fr", "social_id_fr"]),
|
|
390
|
+
de: /* @__PURE__ */ new Set(["tax_id_de", "social_id_de", "social_id_at"]),
|
|
391
|
+
fr: /* @__PURE__ */ new Set(["siret_fr", "company_id_fr", "social_id_fr", "national_id_be"]),
|
|
376
392
|
it: /* @__PURE__ */ new Set(["national_id_it", "tax_id_it"]),
|
|
377
|
-
nl: /* @__PURE__ */ new Set(["national_id_nl", "company_id_nl"]),
|
|
393
|
+
nl: /* @__PURE__ */ new Set(["national_id_nl", "company_id_nl", "national_id_be"]),
|
|
378
394
|
es: /* @__PURE__ */ new Set(["national_id_es", "tax_id_es"]),
|
|
379
395
|
uk: /* @__PURE__ */ new Set(["social_id_uk", "tax_id_uk"]),
|
|
380
|
-
pl: /* @__PURE__ */ new Set(["national_id_pl"]),
|
|
396
|
+
pl: /* @__PURE__ */ new Set(["national_id_pl", "tax_id_pl"]),
|
|
397
|
+
pt: /* @__PURE__ */ new Set(["tax_id_pt"]),
|
|
398
|
+
sv: /* @__PURE__ */ new Set(["national_id_se"]),
|
|
399
|
+
da: /* @__PURE__ */ new Set(["national_id_dk"]),
|
|
400
|
+
fi: /* @__PURE__ */ new Set(["national_id_fi"]),
|
|
381
401
|
at: /* @__PURE__ */ new Set(["social_id_at"]),
|
|
382
402
|
be: /* @__PURE__ */ new Set(["national_id_be"])
|
|
383
403
|
};
|
|
@@ -490,6 +510,14 @@ function detectPii(text, locale = "und") {
|
|
|
490
510
|
findings.push({ type: "mersis_no", value: m[1], start: m.index, end: m.index + m[1].length });
|
|
491
511
|
}
|
|
492
512
|
}
|
|
513
|
+
if (active.has("sgk_no")) {
|
|
514
|
+
SGK_RE.lastIndex = 0;
|
|
515
|
+
let m;
|
|
516
|
+
while ((m = SGK_RE.exec(t)) !== null) {
|
|
517
|
+
const start = m.index + m[0].lastIndexOf(m[1]);
|
|
518
|
+
findings.push({ type: "sgk_no", value: m[1], start, end: start + m[1].length });
|
|
519
|
+
}
|
|
520
|
+
}
|
|
493
521
|
if (active.has("postal_code_tr")) {
|
|
494
522
|
POSTAL_CODE_TR_RE.lastIndex = 0;
|
|
495
523
|
let m;
|
|
@@ -638,6 +666,45 @@ function detectPii(text, locale = "und") {
|
|
|
638
666
|
if (validPeselPl(m[1])) findings.push({ type: "national_id_pl", value: m[1], start: m.index, end: m.index + m[1].length });
|
|
639
667
|
}
|
|
640
668
|
}
|
|
669
|
+
if (active.has("tax_id_pl")) {
|
|
670
|
+
NIP_PL_RE.lastIndex = 0;
|
|
671
|
+
let m;
|
|
672
|
+
while ((m = NIP_PL_RE.exec(t)) !== null) {
|
|
673
|
+
const start = m.index + m[0].lastIndexOf(m[1]);
|
|
674
|
+
findings.push({ type: "tax_id_pl", value: m[1], start, end: start + m[1].length });
|
|
675
|
+
}
|
|
676
|
+
}
|
|
677
|
+
if (active.has("tax_id_pt")) {
|
|
678
|
+
NIF_PT_RE.lastIndex = 0;
|
|
679
|
+
let m;
|
|
680
|
+
while ((m = NIF_PT_RE.exec(t)) !== null) {
|
|
681
|
+
if (validNifPt(m[1])) {
|
|
682
|
+
const start = m.index + m[0].lastIndexOf(m[1]);
|
|
683
|
+
findings.push({ type: "tax_id_pt", value: m[1], start, end: start + m[1].length });
|
|
684
|
+
}
|
|
685
|
+
}
|
|
686
|
+
}
|
|
687
|
+
if (active.has("national_id_se")) {
|
|
688
|
+
PERSONNUMMER_SE_RE.lastIndex = 0;
|
|
689
|
+
let m;
|
|
690
|
+
while ((m = PERSONNUMMER_SE_RE.exec(t)) !== null) {
|
|
691
|
+
findings.push({ type: "national_id_se", value: m[1], start: m.index, end: m.index + m[1].length });
|
|
692
|
+
}
|
|
693
|
+
}
|
|
694
|
+
if (active.has("national_id_dk")) {
|
|
695
|
+
CPR_DK_RE.lastIndex = 0;
|
|
696
|
+
let m;
|
|
697
|
+
while ((m = CPR_DK_RE.exec(t)) !== null) {
|
|
698
|
+
findings.push({ type: "national_id_dk", value: m[1], start: m.index, end: m.index + m[1].length });
|
|
699
|
+
}
|
|
700
|
+
}
|
|
701
|
+
if (active.has("national_id_fi")) {
|
|
702
|
+
HETU_FI_RE.lastIndex = 0;
|
|
703
|
+
let m;
|
|
704
|
+
while ((m = HETU_FI_RE.exec(t)) !== null) {
|
|
705
|
+
findings.push({ type: "national_id_fi", value: m[1], start: m.index, end: m.index + m[1].length });
|
|
706
|
+
}
|
|
707
|
+
}
|
|
641
708
|
if (active.has("social_id_at")) {
|
|
642
709
|
SVNR_AT_RE.lastIndex = 0;
|
|
643
710
|
let m;
|
|
@@ -786,7 +853,7 @@ function applyMask(text, findings, strategy = "redact") {
|
|
|
786
853
|
}
|
|
787
854
|
|
|
788
855
|
// src/index.ts
|
|
789
|
-
var version = "0.
|
|
856
|
+
var version = "0.7.0";
|
|
790
857
|
function computeQualityScore(completeness, avgLength, garbageRatio) {
|
|
791
858
|
const lengthScore = Math.min(avgLength / 500, 1);
|
|
792
859
|
const noiseScore = Math.max(0, 1 - garbageRatio * 10);
|
|
@@ -842,6 +909,16 @@ async function* auditStream(texts, options = {}) {
|
|
|
842
909
|
yield audit(text, options);
|
|
843
910
|
}
|
|
844
911
|
}
|
|
912
|
+
function redactForLlm(text, options = {}) {
|
|
913
|
+
const { strategy, ...auditOptions } = options;
|
|
914
|
+
const result = audit(text, auditOptions);
|
|
915
|
+
return mask(text, result.pii, { strategy });
|
|
916
|
+
}
|
|
917
|
+
function estimateTokens(text) {
|
|
918
|
+
if (!text || !text.trim()) return 0;
|
|
919
|
+
const words = text.trim().split(/\s+/).length;
|
|
920
|
+
return Math.max(1, Math.round(words * 4 / 3));
|
|
921
|
+
}
|
|
845
922
|
var HIGH_RISK_TYPES = /* @__PURE__ */ new Set([
|
|
846
923
|
"national_id_tr",
|
|
847
924
|
"ssn",
|
|
@@ -903,9 +980,11 @@ function complianceReport(result) {
|
|
|
903
980
|
auditStream,
|
|
904
981
|
complianceReport,
|
|
905
982
|
detectPii,
|
|
983
|
+
estimateTokens,
|
|
906
984
|
mask,
|
|
907
985
|
noiseMetrics,
|
|
908
986
|
noiseRatio,
|
|
909
987
|
qualityMetrics,
|
|
988
|
+
redactForLlm,
|
|
910
989
|
version
|
|
911
990
|
});
|
package/dist/index.d.cts
CHANGED
|
@@ -54,7 +54,7 @@ declare function applyMask(text: string, findings: PiiFinding[], strategy?: Mask
|
|
|
54
54
|
* // "Contact: [REDACTED_EMAIL]"
|
|
55
55
|
*/
|
|
56
56
|
|
|
57
|
-
declare const version = "0.
|
|
57
|
+
declare const version = "0.7.0";
|
|
58
58
|
type QualityGrade = "A" | "B" | "C" | "D";
|
|
59
59
|
interface PiiSummaryEntry {
|
|
60
60
|
type: string;
|
|
@@ -130,6 +130,30 @@ declare function mask(text: string, findings: PiiFinding[], options?: MaskOption
|
|
|
130
130
|
* }
|
|
131
131
|
*/
|
|
132
132
|
declare function auditStream(texts: AsyncIterable<string>, options?: AuditOptions): AsyncGenerator<AuditResult>;
|
|
133
|
+
/**
|
|
134
|
+
* Audit *text* and return a PII-free version ready for LLM processing.
|
|
135
|
+
*
|
|
136
|
+
* One-shot convenience wrapper around audit() + mask(). Equivalent to:
|
|
137
|
+
* const result = audit(text, auditOptions) // auditOptions = every option except `strategy`
|
|
138
|
+
* return mask(text, result.pii, { strategy })
|
|
139
|
+
*
|
|
140
|
+
* @example
|
|
141
|
+
* const clean = redactForLlm("TCKN: 12345678950, email: ali@example.com", { locale: "tr" })
|
|
142
|
+
* // "TCKN: [REDACTED_NATIONAL_ID_TR], email: [REDACTED_EMAIL]"
|
|
143
|
+
*/
|
|
144
|
+
declare function redactForLlm(text: string, options?: AuditOptions & MaskOptions): string;
|
|
145
|
+
/**
|
|
146
|
+
* Estimate the token count of *text* using a word-based heuristic.
|
|
147
|
+
*
|
|
148
|
+
* Uses the standard approximation: 1 token ≈ 0.75 words (words × 4/3).
|
|
149
|
+
* No external dependencies — accuracy within ~15% of real tokenizers for
|
|
150
|
+
* English and most European languages. Treat as a planning estimate.
|
|
151
|
+
*
|
|
152
|
+
* @example
|
|
153
|
+
* estimateTokens("The quick brown fox") // → 5 (4 words × 4/3 ≈ 5.33, rounded)
|
|
154
|
+
* estimateTokens("") // → 0
|
|
155
|
+
*/
|
|
156
|
+
declare function estimateTokens(text: string): number;
|
|
133
157
|
type RiskLevel = "none" | "low" | "medium" | "high";
|
|
134
158
|
interface ComplianceReport {
|
|
135
159
|
has_pii: boolean;
|
|
@@ -145,4 +169,4 @@ interface ComplianceReport {
|
|
|
145
169
|
*/
|
|
146
170
|
declare function complianceReport(result: AuditResult): ComplianceReport;
|
|
147
171
|
|
|
148
|
-
export { type AuditOptions, type AuditResult, type BatchAuditResult, type ComplianceReport, type MaskOptions, type MaskStrategy, type NoiseMetrics, type PiiFinding, type PiiSummaryEntry, type QualityGrade, type QualityMetrics, type RiskLevel, applyMask, audit, auditBatch, auditStream, complianceReport, detectPii, mask, noiseMetrics, noiseRatio, qualityMetrics, version };
|
|
172
|
+
export { type AuditOptions, type AuditResult, type BatchAuditResult, type ComplianceReport, type MaskOptions, type MaskStrategy, type NoiseMetrics, type PiiFinding, type PiiSummaryEntry, type QualityGrade, type QualityMetrics, type RiskLevel, applyMask, audit, auditBatch, auditStream, complianceReport, detectPii, estimateTokens, mask, noiseMetrics, noiseRatio, qualityMetrics, redactForLlm, version };
|
package/dist/index.d.ts
CHANGED
|
@@ -54,7 +54,7 @@ declare function applyMask(text: string, findings: PiiFinding[], strategy?: Mask
|
|
|
54
54
|
* // "Contact: [REDACTED_EMAIL]"
|
|
55
55
|
*/
|
|
56
56
|
|
|
57
|
-
declare const version = "0.
|
|
57
|
+
declare const version = "0.7.0";
|
|
58
58
|
type QualityGrade = "A" | "B" | "C" | "D";
|
|
59
59
|
interface PiiSummaryEntry {
|
|
60
60
|
type: string;
|
|
@@ -130,6 +130,30 @@ declare function mask(text: string, findings: PiiFinding[], options?: MaskOption
|
|
|
130
130
|
* }
|
|
131
131
|
*/
|
|
132
132
|
declare function auditStream(texts: AsyncIterable<string>, options?: AuditOptions): AsyncGenerator<AuditResult>;
|
|
133
|
+
/**
|
|
134
|
+
* Audit *text* and return a PII-free version ready for LLM processing.
|
|
135
|
+
*
|
|
136
|
+
* One-shot convenience wrapper around audit() + mask(). Equivalent to:
|
|
137
|
+
* const result = audit(text, auditOptions) // auditOptions = every option except `strategy`
|
|
138
|
+
* return mask(text, result.pii, { strategy })
|
|
139
|
+
*
|
|
140
|
+
* @example
|
|
141
|
+
* const clean = redactForLlm("TCKN: 12345678950, email: ali@example.com", { locale: "tr" })
|
|
142
|
+
* // "TCKN: [REDACTED_NATIONAL_ID_TR], email: [REDACTED_EMAIL]"
|
|
143
|
+
*/
|
|
144
|
+
declare function redactForLlm(text: string, options?: AuditOptions & MaskOptions): string;
|
|
145
|
+
/**
|
|
146
|
+
* Estimate the token count of *text* using a word-based heuristic.
|
|
147
|
+
*
|
|
148
|
+
* Uses the standard approximation: 1 token ≈ 0.75 words (words × 4/3).
|
|
149
|
+
* No external dependencies — accuracy within ~15% of real tokenizers for
|
|
150
|
+
* English and most European languages. Treat as a planning estimate.
|
|
151
|
+
*
|
|
152
|
+
* @example
|
|
153
|
+
* estimateTokens("The quick brown fox") // → 5 (4 words × 4/3 ≈ 5.33, rounded)
|
|
154
|
+
* estimateTokens("") // → 0
|
|
155
|
+
*/
|
|
156
|
+
declare function estimateTokens(text: string): number;
|
|
133
157
|
type RiskLevel = "none" | "low" | "medium" | "high";
|
|
134
158
|
interface ComplianceReport {
|
|
135
159
|
has_pii: boolean;
|
|
@@ -145,4 +169,4 @@ interface ComplianceReport {
|
|
|
145
169
|
*/
|
|
146
170
|
declare function complianceReport(result: AuditResult): ComplianceReport;
|
|
147
171
|
|
|
148
|
-
export { type AuditOptions, type AuditResult, type BatchAuditResult, type ComplianceReport, type MaskOptions, type MaskStrategy, type NoiseMetrics, type PiiFinding, type PiiSummaryEntry, type QualityGrade, type QualityMetrics, type RiskLevel, applyMask, audit, auditBatch, auditStream, complianceReport, detectPii, mask, noiseMetrics, noiseRatio, qualityMetrics, version };
|
|
172
|
+
export { type AuditOptions, type AuditResult, type BatchAuditResult, type ComplianceReport, type MaskOptions, type MaskStrategy, type NoiseMetrics, type PiiFinding, type PiiSummaryEntry, type QualityGrade, type QualityMetrics, type RiskLevel, applyMask, audit, auditBatch, auditStream, complianceReport, detectPii, estimateTokens, mask, noiseMetrics, noiseRatio, qualityMetrics, redactForLlm, version };
|
package/dist/index.js
CHANGED
|
@@ -21,6 +21,7 @@ var COMPANY_NAME_TR_RE = new RegExp(
|
|
|
21
21
|
"gu"
|
|
22
22
|
);
|
|
23
23
|
var MERSIS_RE = /\b([1-9]\d{15})\b/g;
|
|
24
|
+
var SGK_RE = /(?:SGK\s*(?:Sicil\s*No(?:su)?|No(?:su)?|Numara(?:s[ıi])?)?|Sigortal[ıi]\s*(?:Sicil\s*)?(?:No|Numara(?:s[ıi])?)|SSK\s*(?:No|Numara(?:s[ıi])?|Sicil))\s*[:#]*\s*(\d{10,11})\b/giu;
|
|
24
25
|
var POSTAL_CODE_TR_RE = /\b((?:0[1-9]|[1-7]\d|80|81)\d{3})\b/g;
|
|
25
26
|
var _TR_PROVINCES_SORTED = [
|
|
26
27
|
"Afyonkarahisar",
|
|
@@ -300,12 +301,24 @@ function validEinUs(s) {
|
|
|
300
301
|
return !_EIN_INVALID_PREFIXES.has(s.slice(0, 2));
|
|
301
302
|
}
|
|
302
303
|
var PESEL_PL_RE = /\b(\d{11})\b/g;
|
|
304
|
+
var NIP_PL_RE = /(?:NIP|Numer\s+NIP|Numer\s+Identyfikacji\s+Podatkowej)\s*[:#]*\s*(\d{10})\b/gi;
|
|
303
305
|
function validPeselPl(s) {
|
|
304
306
|
if (s.length !== 11 || !/^\d+$/.test(s)) return false;
|
|
305
307
|
const weights = [1, 3, 7, 9, 1, 3, 7, 9, 1, 3];
|
|
306
308
|
const total = weights.reduce((sum, w, i) => sum + w * parseInt(s[i]), 0);
|
|
307
309
|
return (10 - total % 10) % 10 === parseInt(s[10]);
|
|
308
310
|
}
|
|
311
|
+
function validNifPt(s) {
|
|
312
|
+
if (s.length !== 9 || !/^\d+$/.test(s) || s[0] === "0") return false;
|
|
313
|
+
let total = 0;
|
|
314
|
+
for (let i = 0; i < 8; i++) total += (9 - i) * parseInt(s[i]);
|
|
315
|
+
const check = (11 - total % 11) % 11;
|
|
316
|
+
return (check >= 10 ? 0 : check) === parseInt(s[8]);
|
|
317
|
+
}
|
|
318
|
+
var NIF_PT_RE = /(?:NIF|N[uú]mero\s+de\s+Contribuinte|Contribuinte)\s*[:#]*\s*(\d{9})\b/gi;
|
|
319
|
+
var PERSONNUMMER_SE_RE = /\b(\d{6,8}[-+]\d{4})\b/g;
|
|
320
|
+
var CPR_DK_RE = /\b(\d{6}-\d{4})\b/g;
|
|
321
|
+
var HETU_FI_RE = /\b(\d{6}[+\-A]\d{3}[0-9A-FHJ-NPR-Y])\b/g;
|
|
309
322
|
var SVNR_AT_RE = /\b(\d{10})\b/g;
|
|
310
323
|
function validSvnrAt(s) {
|
|
311
324
|
if (s.length !== 10 || !/^\d+$/.test(s)) return false;
|
|
@@ -331,17 +344,22 @@ var LOCALE_DETECTORS = {
|
|
|
331
344
|
"company_name_tr",
|
|
332
345
|
"mersis_no",
|
|
333
346
|
"postal_code_tr",
|
|
334
|
-
"province_tr"
|
|
347
|
+
"province_tr",
|
|
348
|
+
"sgk_no"
|
|
335
349
|
]),
|
|
336
350
|
us: /* @__PURE__ */ new Set(["ssn", "tax_id_us", "national_id_us", "phone_intl", "company_name_intl"]),
|
|
337
351
|
eu: /* @__PURE__ */ new Set(["phone_intl", "iban_intl", "company_name_intl"]),
|
|
338
|
-
de: /* @__PURE__ */ new Set(["tax_id_de", "social_id_de"]),
|
|
339
|
-
fr: /* @__PURE__ */ new Set(["siret_fr", "company_id_fr", "social_id_fr"]),
|
|
352
|
+
de: /* @__PURE__ */ new Set(["tax_id_de", "social_id_de", "social_id_at"]),
|
|
353
|
+
fr: /* @__PURE__ */ new Set(["siret_fr", "company_id_fr", "social_id_fr", "national_id_be"]),
|
|
340
354
|
it: /* @__PURE__ */ new Set(["national_id_it", "tax_id_it"]),
|
|
341
|
-
nl: /* @__PURE__ */ new Set(["national_id_nl", "company_id_nl"]),
|
|
355
|
+
nl: /* @__PURE__ */ new Set(["national_id_nl", "company_id_nl", "national_id_be"]),
|
|
342
356
|
es: /* @__PURE__ */ new Set(["national_id_es", "tax_id_es"]),
|
|
343
357
|
uk: /* @__PURE__ */ new Set(["social_id_uk", "tax_id_uk"]),
|
|
344
|
-
pl: /* @__PURE__ */ new Set(["national_id_pl"]),
|
|
358
|
+
pl: /* @__PURE__ */ new Set(["national_id_pl", "tax_id_pl"]),
|
|
359
|
+
pt: /* @__PURE__ */ new Set(["tax_id_pt"]),
|
|
360
|
+
sv: /* @__PURE__ */ new Set(["national_id_se"]),
|
|
361
|
+
da: /* @__PURE__ */ new Set(["national_id_dk"]),
|
|
362
|
+
fi: /* @__PURE__ */ new Set(["national_id_fi"]),
|
|
345
363
|
at: /* @__PURE__ */ new Set(["social_id_at"]),
|
|
346
364
|
be: /* @__PURE__ */ new Set(["national_id_be"])
|
|
347
365
|
};
|
|
@@ -454,6 +472,14 @@ function detectPii(text, locale = "und") {
|
|
|
454
472
|
findings.push({ type: "mersis_no", value: m[1], start: m.index, end: m.index + m[1].length });
|
|
455
473
|
}
|
|
456
474
|
}
|
|
475
|
+
if (active.has("sgk_no")) {
|
|
476
|
+
SGK_RE.lastIndex = 0;
|
|
477
|
+
let m;
|
|
478
|
+
while ((m = SGK_RE.exec(t)) !== null) {
|
|
479
|
+
const start = m.index + m[0].lastIndexOf(m[1]);
|
|
480
|
+
findings.push({ type: "sgk_no", value: m[1], start, end: start + m[1].length });
|
|
481
|
+
}
|
|
482
|
+
}
|
|
457
483
|
if (active.has("postal_code_tr")) {
|
|
458
484
|
POSTAL_CODE_TR_RE.lastIndex = 0;
|
|
459
485
|
let m;
|
|
@@ -602,6 +628,45 @@ function detectPii(text, locale = "und") {
|
|
|
602
628
|
if (validPeselPl(m[1])) findings.push({ type: "national_id_pl", value: m[1], start: m.index, end: m.index + m[1].length });
|
|
603
629
|
}
|
|
604
630
|
}
|
|
631
|
+
if (active.has("tax_id_pl")) {
|
|
632
|
+
NIP_PL_RE.lastIndex = 0;
|
|
633
|
+
let m;
|
|
634
|
+
while ((m = NIP_PL_RE.exec(t)) !== null) {
|
|
635
|
+
const start = m.index + m[0].lastIndexOf(m[1]);
|
|
636
|
+
findings.push({ type: "tax_id_pl", value: m[1], start, end: start + m[1].length });
|
|
637
|
+
}
|
|
638
|
+
}
|
|
639
|
+
if (active.has("tax_id_pt")) {
|
|
640
|
+
NIF_PT_RE.lastIndex = 0;
|
|
641
|
+
let m;
|
|
642
|
+
while ((m = NIF_PT_RE.exec(t)) !== null) {
|
|
643
|
+
if (validNifPt(m[1])) {
|
|
644
|
+
const start = m.index + m[0].lastIndexOf(m[1]);
|
|
645
|
+
findings.push({ type: "tax_id_pt", value: m[1], start, end: start + m[1].length });
|
|
646
|
+
}
|
|
647
|
+
}
|
|
648
|
+
}
|
|
649
|
+
if (active.has("national_id_se")) {
|
|
650
|
+
PERSONNUMMER_SE_RE.lastIndex = 0;
|
|
651
|
+
let m;
|
|
652
|
+
while ((m = PERSONNUMMER_SE_RE.exec(t)) !== null) {
|
|
653
|
+
findings.push({ type: "national_id_se", value: m[1], start: m.index, end: m.index + m[1].length });
|
|
654
|
+
}
|
|
655
|
+
}
|
|
656
|
+
if (active.has("national_id_dk")) {
|
|
657
|
+
CPR_DK_RE.lastIndex = 0;
|
|
658
|
+
let m;
|
|
659
|
+
while ((m = CPR_DK_RE.exec(t)) !== null) {
|
|
660
|
+
findings.push({ type: "national_id_dk", value: m[1], start: m.index, end: m.index + m[1].length });
|
|
661
|
+
}
|
|
662
|
+
}
|
|
663
|
+
if (active.has("national_id_fi")) {
|
|
664
|
+
HETU_FI_RE.lastIndex = 0;
|
|
665
|
+
let m;
|
|
666
|
+
while ((m = HETU_FI_RE.exec(t)) !== null) {
|
|
667
|
+
findings.push({ type: "national_id_fi", value: m[1], start: m.index, end: m.index + m[1].length });
|
|
668
|
+
}
|
|
669
|
+
}
|
|
605
670
|
if (active.has("social_id_at")) {
|
|
606
671
|
SVNR_AT_RE.lastIndex = 0;
|
|
607
672
|
let m;
|
|
@@ -750,7 +815,7 @@ function applyMask(text, findings, strategy = "redact") {
|
|
|
750
815
|
}
|
|
751
816
|
|
|
752
817
|
// src/index.ts
|
|
753
|
-
var version = "0.
|
|
818
|
+
var version = "0.7.0";
|
|
754
819
|
function computeQualityScore(completeness, avgLength, garbageRatio) {
|
|
755
820
|
const lengthScore = Math.min(avgLength / 500, 1);
|
|
756
821
|
const noiseScore = Math.max(0, 1 - garbageRatio * 10);
|
|
@@ -806,6 +871,16 @@ async function* auditStream(texts, options = {}) {
|
|
|
806
871
|
yield audit(text, options);
|
|
807
872
|
}
|
|
808
873
|
}
|
|
874
|
+
function redactForLlm(text, options = {}) {
|
|
875
|
+
const { strategy, ...auditOptions } = options;
|
|
876
|
+
const result = audit(text, auditOptions);
|
|
877
|
+
return mask(text, result.pii, { strategy });
|
|
878
|
+
}
|
|
879
|
+
function estimateTokens(text) {
|
|
880
|
+
if (!text || !text.trim()) return 0;
|
|
881
|
+
const words = text.trim().split(/\s+/).length;
|
|
882
|
+
return Math.max(1, Math.round(words * 4 / 3));
|
|
883
|
+
}
|
|
809
884
|
var HIGH_RISK_TYPES = /* @__PURE__ */ new Set([
|
|
810
885
|
"national_id_tr",
|
|
811
886
|
"ssn",
|
|
@@ -866,9 +941,11 @@ export {
|
|
|
866
941
|
auditStream,
|
|
867
942
|
complianceReport,
|
|
868
943
|
detectPii,
|
|
944
|
+
estimateTokens,
|
|
869
945
|
mask,
|
|
870
946
|
noiseMetrics,
|
|
871
947
|
noiseRatio,
|
|
872
948
|
qualityMetrics,
|
|
949
|
+
redactForLlm,
|
|
873
950
|
version
|
|
874
951
|
};
|