@flexorch/audit 0.5.1 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +166 -6
- package/dist/index.d.cts +54 -2
- package/dist/index.d.ts +54 -2
- package/dist/index.js +162 -6
- package/package.json +2 -2
package/dist/index.cjs
CHANGED
|
@@ -23,11 +23,15 @@ __export(index_exports, {
|
|
|
23
23
|
applyMask: () => applyMask,
|
|
24
24
|
audit: () => audit,
|
|
25
25
|
auditBatch: () => auditBatch,
|
|
26
|
+
auditStream: () => auditStream,
|
|
27
|
+
complianceReport: () => complianceReport,
|
|
26
28
|
detectPii: () => detectPii,
|
|
29
|
+
estimateTokens: () => estimateTokens,
|
|
27
30
|
mask: () => mask,
|
|
28
31
|
noiseMetrics: () => noiseMetrics,
|
|
29
32
|
noiseRatio: () => noiseRatio,
|
|
30
33
|
qualityMetrics: () => qualityMetrics,
|
|
34
|
+
redactForLlm: () => redactForLlm,
|
|
31
35
|
version: () => version
|
|
32
36
|
});
|
|
33
37
|
module.exports = __toCommonJS(index_exports);
|
|
@@ -333,6 +337,28 @@ var _EIN_INVALID_PREFIXES = /* @__PURE__ */ new Set([
|
|
|
333
337
|
function validEinUs(s) {
|
|
334
338
|
return !_EIN_INVALID_PREFIXES.has(s.slice(0, 2));
|
|
335
339
|
}
|
|
340
|
+
var PESEL_PL_RE = /\b(\d{11})\b/g;
|
|
341
|
+
function validPeselPl(s) {
|
|
342
|
+
if (s.length !== 11 || !/^\d+$/.test(s)) return false;
|
|
343
|
+
const weights = [1, 3, 7, 9, 1, 3, 7, 9, 1, 3];
|
|
344
|
+
const total = weights.reduce((sum, w, i) => sum + w * parseInt(s[i]), 0);
|
|
345
|
+
return (10 - total % 10) % 10 === parseInt(s[10]);
|
|
346
|
+
}
|
|
347
|
+
var SVNR_AT_RE = /\b(\d{10})\b/g;
|
|
348
|
+
/**
 * Check whether a 10-digit string carries a valid Austrian social insurance
 * number (SVNR) check digit.
 *
 * The check digit is the 4th digit (index 3). It must equal the weighted sum
 * of the other nine digits taken modulo 11.
 *
 * @param {string} s - Candidate number, digits only.
 * @returns {boolean} true when `s` is exactly 10 digits and the check digit matches.
 */
function validSvnrAt(s) {
  if (s.length !== 10 || !/^\d+$/.test(s)) return false;
  // Index 3 is the check digit itself, so it contributes with weight 0.
  const weights = [3, 7, 9, 0, 5, 8, 4, 2, 1, 6];
  const total = weights.reduce((sum, w, i) => sum + w * Number(s[i]), 0);
  // The scheme is modulo 11 (not 10). A remainder of 10 can never equal a
  // single digit, so such serial numbers are rejected, as the scheme requires.
  return total % 11 === Number(s[3]);
}
|
|
354
|
+
var NRRNISS_BE_RE = /\b(\d{11})\b/g;
|
|
355
|
+
function validNrrnissBe(s) {
|
|
356
|
+
if (s.length !== 11 || !/^\d+$/.test(s)) return false;
|
|
357
|
+
const body = parseInt(s.slice(0, 9));
|
|
358
|
+
const check = parseInt(s.slice(9));
|
|
359
|
+
if (97 - body % 97 === check) return true;
|
|
360
|
+
return 97 - (2e9 + body) % 97 === check;
|
|
361
|
+
}
|
|
336
362
|
var LOCALE_DETECTORS = {
|
|
337
363
|
tr: /* @__PURE__ */ new Set([
|
|
338
364
|
"national_id_tr",
|
|
@@ -352,7 +378,10 @@ var LOCALE_DETECTORS = {
|
|
|
352
378
|
it: /* @__PURE__ */ new Set(["national_id_it", "tax_id_it"]),
|
|
353
379
|
nl: /* @__PURE__ */ new Set(["national_id_nl", "company_id_nl"]),
|
|
354
380
|
es: /* @__PURE__ */ new Set(["national_id_es", "tax_id_es"]),
|
|
355
|
-
uk: /* @__PURE__ */ new Set(["social_id_uk", "tax_id_uk"])
|
|
381
|
+
uk: /* @__PURE__ */ new Set(["social_id_uk", "tax_id_uk"]),
|
|
382
|
+
pl: /* @__PURE__ */ new Set(["national_id_pl"]),
|
|
383
|
+
at: /* @__PURE__ */ new Set(["social_id_at"]),
|
|
384
|
+
be: /* @__PURE__ */ new Set(["national_id_be"])
|
|
356
385
|
};
|
|
357
386
|
var UNIVERSAL = /* @__PURE__ */ new Set(["email", "iban", "credit_card", "ip", "ip_v6"]);
|
|
358
387
|
function activeDetectors(locale) {
|
|
@@ -604,6 +633,27 @@ function detectPii(text, locale = "und") {
|
|
|
604
633
|
findings.push({ type: "company_name_intl", value: m[1], start: m.index, end: m.index + m[1].length });
|
|
605
634
|
}
|
|
606
635
|
}
|
|
636
|
+
if (active.has("national_id_pl")) {
|
|
637
|
+
PESEL_PL_RE.lastIndex = 0;
|
|
638
|
+
let m;
|
|
639
|
+
while ((m = PESEL_PL_RE.exec(t)) !== null) {
|
|
640
|
+
if (validPeselPl(m[1])) findings.push({ type: "national_id_pl", value: m[1], start: m.index, end: m.index + m[1].length });
|
|
641
|
+
}
|
|
642
|
+
}
|
|
643
|
+
if (active.has("social_id_at")) {
|
|
644
|
+
SVNR_AT_RE.lastIndex = 0;
|
|
645
|
+
let m;
|
|
646
|
+
while ((m = SVNR_AT_RE.exec(t)) !== null) {
|
|
647
|
+
if (validSvnrAt(m[1])) findings.push({ type: "social_id_at", value: m[1], start: m.index, end: m.index + m[1].length });
|
|
648
|
+
}
|
|
649
|
+
}
|
|
650
|
+
if (active.has("national_id_be")) {
|
|
651
|
+
NRRNISS_BE_RE.lastIndex = 0;
|
|
652
|
+
let m;
|
|
653
|
+
while ((m = NRRNISS_BE_RE.exec(t)) !== null) {
|
|
654
|
+
if (validNrrnissBe(m[1])) findings.push({ type: "national_id_be", value: m[1], start: m.index, end: m.index + m[1].length });
|
|
655
|
+
}
|
|
656
|
+
}
|
|
607
657
|
findings.sort((a, b) => a.start - b.start);
|
|
608
658
|
const specificIbanSpans = new Set(
|
|
609
659
|
findings.filter((f) => f.type === "iban_tr" || f.type === "iban_intl").map((f) => `${f.start}:${f.end}`)
|
|
@@ -659,18 +709,56 @@ function noiseMetrics(text) {
|
|
|
659
709
|
|
|
660
710
|
// src/mask.ts
|
|
661
711
|
var import_crypto = require("crypto");
|
|
662
|
-
var
|
|
712
|
+
var TCKN_POOL = ["12345678950", "10000000146", "23456789060"];
|
|
713
|
+
var IBAN_TR_POOL = ["TR330006100519786457841326", "TR390006199999888888888813"];
|
|
714
|
+
var NAME_POOL = [
|
|
715
|
+
"Ahmet Yilmaz",
|
|
716
|
+
"Mehmet Demir",
|
|
717
|
+
"Ayse Kaya",
|
|
718
|
+
"Fatma Celik",
|
|
719
|
+
"Ali Sahin",
|
|
720
|
+
"Zeynep Arslan",
|
|
721
|
+
"Mustafa Ozturk",
|
|
722
|
+
"Emine Dogan",
|
|
723
|
+
"Ibrahim Kurt",
|
|
724
|
+
"Hatice Aydin",
|
|
725
|
+
"Hasan Yildiz",
|
|
726
|
+
"Elif Gunes",
|
|
727
|
+
"Huseyin Cetin",
|
|
728
|
+
"Meryem Polat",
|
|
729
|
+
"Omer Koc",
|
|
730
|
+
"Busra Tekin",
|
|
731
|
+
"Yusuf Erdogan",
|
|
732
|
+
"Selin Bozkurt",
|
|
733
|
+
"Kemal Akin",
|
|
734
|
+
"Derya Ucar"
|
|
735
|
+
];
|
|
736
|
+
var STATIC_SYNTHETIC = {
|
|
663
737
|
email: "user@example.com",
|
|
664
738
|
phone: "+1 000 000 0000",
|
|
665
739
|
phone_tr: "0500 000 00 00",
|
|
666
|
-
|
|
740
|
+
phone_intl: "+1 000 000 0000",
|
|
667
741
|
ssn: "000-00-0000",
|
|
668
742
|
iban: "XX00 0000 0000 0000 0000 00",
|
|
669
743
|
credit_card: "0000 0000 0000 0000",
|
|
670
744
|
ip: "0.0.0.0",
|
|
671
|
-
|
|
745
|
+
ip_v6: "2001:db8::1",
|
|
746
|
+
national_id_pl: "00000000000",
|
|
747
|
+
social_id_at: "0000000000",
|
|
748
|
+
national_id_be: "00000000000"
|
|
672
749
|
};
|
|
673
750
|
var VALID_STRATEGIES = /* @__PURE__ */ new Set(["redact", "replace", "token", "hash"]);
|
|
751
|
+
function pick(pool, seed) {
|
|
752
|
+
const h = (0, import_crypto.createHash)("sha256").update(seed).digest("hex");
|
|
753
|
+
const idx = parseInt(h.slice(0, 8), 16) % pool.length;
|
|
754
|
+
return pool[idx];
|
|
755
|
+
}
|
|
756
|
+
function synthetic(ptype, original) {
|
|
757
|
+
if (ptype === "national_id_tr") return pick(TCKN_POOL, original);
|
|
758
|
+
if (ptype === "iban_tr" || ptype === "iban_intl") return pick(IBAN_TR_POOL, original);
|
|
759
|
+
if (ptype === "name") return pick(NAME_POOL, original);
|
|
760
|
+
return STATIC_SYNTHETIC[ptype] ?? `[${ptype.toUpperCase()}]`;
|
|
761
|
+
}
|
|
674
762
|
function applyMask(text, findings, strategy = "redact") {
|
|
675
763
|
if (!VALID_STRATEGIES.has(strategy)) {
|
|
676
764
|
throw new Error(`Unknown strategy "${strategy}". Use: redact, replace, token, hash`);
|
|
@@ -687,7 +775,7 @@ function applyMask(text, findings, strategy = "redact") {
|
|
|
687
775
|
if (strategy === "redact") {
|
|
688
776
|
replacement = `[REDACTED_${tag}]`;
|
|
689
777
|
} else if (strategy === "replace") {
|
|
690
|
-
replacement =
|
|
778
|
+
replacement = synthetic(type, value);
|
|
691
779
|
} else if (strategy === "token") {
|
|
692
780
|
replacement = `<PII_${tag}_${counter[type]}>`;
|
|
693
781
|
} else {
|
|
@@ -700,7 +788,7 @@ function applyMask(text, findings, strategy = "redact") {
|
|
|
700
788
|
}
|
|
701
789
|
|
|
702
790
|
// src/index.ts
|
|
703
|
-
var version = "0.
|
|
791
|
+
var version = "0.7.0";
|
|
704
792
|
function computeQualityScore(completeness, avgLength, garbageRatio) {
|
|
705
793
|
const lengthScore = Math.min(avgLength / 500, 1);
|
|
706
794
|
const noiseScore = Math.max(0, 1 - garbageRatio * 10);
|
|
@@ -751,15 +839,87 @@ function auditBatch(texts, options = {}) {
|
|
|
751
839
|
function mask(text, findings, options = {}) {
|
|
752
840
|
return applyMask(text, findings, options.strategy ?? "redact");
|
|
753
841
|
}
|
|
842
|
+
async function* auditStream(texts, options = {}) {
|
|
843
|
+
for await (const text of texts) {
|
|
844
|
+
yield audit(text, options);
|
|
845
|
+
}
|
|
846
|
+
}
|
|
847
|
+
function redactForLlm(text, options = {}) {
|
|
848
|
+
const { strategy, ...auditOptions } = options;
|
|
849
|
+
const result = audit(text, auditOptions);
|
|
850
|
+
return mask(text, result.pii, { strategy });
|
|
851
|
+
}
|
|
852
|
+
function estimateTokens(text) {
|
|
853
|
+
if (!text || !text.trim()) return 0;
|
|
854
|
+
const words = text.trim().split(/\s+/).length;
|
|
855
|
+
return Math.max(1, Math.round(words * 4 / 3));
|
|
856
|
+
}
|
|
857
|
+
var HIGH_RISK_TYPES = /* @__PURE__ */ new Set([
|
|
858
|
+
"national_id_tr",
|
|
859
|
+
"ssn",
|
|
860
|
+
"credit_card",
|
|
861
|
+
"national_id_pl",
|
|
862
|
+
"national_id_be",
|
|
863
|
+
"social_id_at",
|
|
864
|
+
"social_id_de",
|
|
865
|
+
"social_id_uk",
|
|
866
|
+
"national_id_it",
|
|
867
|
+
"national_id_nl",
|
|
868
|
+
"national_id_es",
|
|
869
|
+
"national_id_us",
|
|
870
|
+
"tax_id_tr",
|
|
871
|
+
"tax_id_de"
|
|
872
|
+
]);
|
|
873
|
+
var MEDIUM_RISK_TYPES = /* @__PURE__ */ new Set([
|
|
874
|
+
"email",
|
|
875
|
+
"phone_tr",
|
|
876
|
+
"phone_intl",
|
|
877
|
+
"iban",
|
|
878
|
+
"iban_tr",
|
|
879
|
+
"iban_intl",
|
|
880
|
+
"name"
|
|
881
|
+
]);
|
|
882
|
+
/**
 * Build a compliance summary from an audit result.
 *
 * Risk is "high" if any detected type is in HIGH_RISK_TYPES, otherwise
 * "medium" if any is in MEDIUM_RISK_TYPES, otherwise "low" when some PII was
 * found, and "none" when no PII was found.
 *
 * @param {{ pii: Array<{ type: string }> }} result - Audit result whose `pii` findings are summarised.
 * @returns {{ has_pii: boolean, pii_types: string[], risk_level: string, masking_required: boolean, recommendations: string[] }}
 */
function complianceReport(result) {
  const types = [...new Set(result.pii.map((f) => f.type))].sort();
  const hasPii = types.length > 0;
  let risk_level = "none";
  if (hasPii) {
    if (types.some((t) => HIGH_RISK_TYPES.has(t))) risk_level = "high";
    else if (types.some((t) => MEDIUM_RISK_TYPES.has(t))) risk_level = "medium";
    else risk_level = "low";
  }
  const recommendations = [];
  // Recommend masking whenever PII is present, including "low" risk, so the
  // advice agrees with masking_required below.
  if (hasPii) {
    recommendations.push("Apply mask({ strategy: 'redact' }) before storing or sharing this text.");
  }
  if (risk_level === "high") {
    recommendations.push(
      "Review applicable regulations (KVKK Art. 6, GDPR Art. 9) for special category data handling."
    );
  }
  // Only claim the text is clean when nothing was detected at all.
  if (!hasPii) {
    recommendations.push("No PII detected \u2014 text is safe for LLM processing.");
  }
  return {
    has_pii: hasPii,
    pii_types: types,
    risk_level,
    masking_required: hasPii,
    recommendations
  };
}
|
|
754
910
|
// Annotate the CommonJS export names for ESM import in node:
|
|
755
911
|
0 && (module.exports = {
|
|
756
912
|
applyMask,
|
|
757
913
|
audit,
|
|
758
914
|
auditBatch,
|
|
915
|
+
auditStream,
|
|
916
|
+
complianceReport,
|
|
759
917
|
detectPii,
|
|
918
|
+
estimateTokens,
|
|
760
919
|
mask,
|
|
761
920
|
noiseMetrics,
|
|
762
921
|
noiseRatio,
|
|
763
922
|
qualityMetrics,
|
|
923
|
+
redactForLlm,
|
|
764
924
|
version
|
|
765
925
|
});
|
package/dist/index.d.cts
CHANGED
|
@@ -54,7 +54,7 @@ declare function applyMask(text: string, findings: PiiFinding[], strategy?: Mask
|
|
|
54
54
|
* // "Contact: [REDACTED_EMAIL]"
|
|
55
55
|
*/
|
|
56
56
|
|
|
57
|
-
declare const version = "0.
|
|
57
|
+
declare const version = "0.7.0";
|
|
58
58
|
type QualityGrade = "A" | "B" | "C" | "D";
|
|
59
59
|
interface PiiSummaryEntry {
|
|
60
60
|
type: string;
|
|
@@ -116,5 +116,57 @@ declare function auditBatch(texts: string[], options?: AuditOptions): BatchAudit
|
|
|
116
116
|
* Apply masking to PII findings in *text*.
|
|
117
117
|
*/
|
|
118
118
|
declare function mask(text: string, findings: PiiFinding[], options?: MaskOptions): string;
|
|
119
|
+
/**
|
|
120
|
+
* Async generator that audits texts one at a time from an async iterable.
|
|
121
|
+
*
|
|
122
|
+
* Yields one AuditResult per input text. Processing is sequential.
|
|
123
|
+
*
|
|
124
|
+
* @example
|
|
125
|
+
* async function* lines() {
|
|
126
|
+
* for (const line of data) yield line;
|
|
127
|
+
* }
|
|
128
|
+
* for await (const result of auditStream(lines())) {
|
|
129
|
+
* console.log(result.quality_grade, result.pii_summary);
|
|
130
|
+
* }
|
|
131
|
+
*/
|
|
132
|
+
declare function auditStream(texts: AsyncIterable<string>, options?: AuditOptions): AsyncGenerator<AuditResult>;
|
|
133
|
+
/**
|
|
134
|
+
* Audit *text* and return a PII-free version ready for LLM processing.
|
|
135
|
+
*
|
|
136
|
+
* One-shot convenience wrapper around audit() + mask(). Equivalent to:
|
|
137
|
+
* const result = audit(text, { locale })
|
|
138
|
+
* return mask(text, result.pii, { strategy })
|
|
139
|
+
*
|
|
140
|
+
* @example
|
|
141
|
+
* const clean = redactForLlm("TCKN: 12345678950, email: ali@example.com", { locale: "tr" })
|
|
142
|
+
* // "TCKN: [REDACTED_NATIONAL_ID_TR], email: [REDACTED_EMAIL]"
|
|
143
|
+
*/
|
|
144
|
+
declare function redactForLlm(text: string, options?: AuditOptions & MaskOptions): string;
|
|
145
|
+
/**
|
|
146
|
+
* Estimate the token count of *text* using a word-based heuristic.
|
|
147
|
+
*
|
|
148
|
+
* Uses the standard approximation: 1 token ≈ 0.75 words (words × 4/3).
|
|
149
|
+
* No external dependencies — accuracy within ~15% of real tokenizers for
|
|
150
|
+
* English and most European languages. Treat as a planning estimate.
|
|
151
|
+
*
|
|
152
|
+
* @example
|
|
153
|
+
* estimateTokens("The quick brown fox") // → 5
|
|
154
|
+
* estimateTokens("") // → 0
|
|
155
|
+
*/
|
|
156
|
+
declare function estimateTokens(text: string): number;
|
|
157
|
+
type RiskLevel = "none" | "low" | "medium" | "high";
|
|
158
|
+
interface ComplianceReport {
|
|
159
|
+
has_pii: boolean;
|
|
160
|
+
pii_types: string[];
|
|
161
|
+
risk_level: RiskLevel;
|
|
162
|
+
masking_required: boolean;
|
|
163
|
+
recommendations: string[];
|
|
164
|
+
}
|
|
165
|
+
/**
|
|
166
|
+
* Generate a KVKK/GDPR compliance summary for an AuditResult.
|
|
167
|
+
*
|
|
168
|
+
* This is a technical summary only — not a legal document or regulatory opinion.
|
|
169
|
+
*/
|
|
170
|
+
declare function complianceReport(result: AuditResult): ComplianceReport;
|
|
119
171
|
|
|
120
|
-
export { type AuditOptions, type AuditResult, type BatchAuditResult, type MaskOptions, type MaskStrategy, type NoiseMetrics, type PiiFinding, type PiiSummaryEntry, type QualityGrade, type QualityMetrics, applyMask, audit, auditBatch, detectPii, mask, noiseMetrics, noiseRatio, qualityMetrics, version };
|
|
172
|
+
export { type AuditOptions, type AuditResult, type BatchAuditResult, type ComplianceReport, type MaskOptions, type MaskStrategy, type NoiseMetrics, type PiiFinding, type PiiSummaryEntry, type QualityGrade, type QualityMetrics, type RiskLevel, applyMask, audit, auditBatch, auditStream, complianceReport, detectPii, estimateTokens, mask, noiseMetrics, noiseRatio, qualityMetrics, redactForLlm, version };
|
package/dist/index.d.ts
CHANGED
|
@@ -54,7 +54,7 @@ declare function applyMask(text: string, findings: PiiFinding[], strategy?: Mask
|
|
|
54
54
|
* // "Contact: [REDACTED_EMAIL]"
|
|
55
55
|
*/
|
|
56
56
|
|
|
57
|
-
declare const version = "0.
|
|
57
|
+
declare const version = "0.7.0";
|
|
58
58
|
type QualityGrade = "A" | "B" | "C" | "D";
|
|
59
59
|
interface PiiSummaryEntry {
|
|
60
60
|
type: string;
|
|
@@ -116,5 +116,57 @@ declare function auditBatch(texts: string[], options?: AuditOptions): BatchAudit
|
|
|
116
116
|
* Apply masking to PII findings in *text*.
|
|
117
117
|
*/
|
|
118
118
|
declare function mask(text: string, findings: PiiFinding[], options?: MaskOptions): string;
|
|
119
|
+
/**
|
|
120
|
+
* Async generator that audits texts one at a time from an async iterable.
|
|
121
|
+
*
|
|
122
|
+
* Yields one AuditResult per input text. Processing is sequential.
|
|
123
|
+
*
|
|
124
|
+
* @example
|
|
125
|
+
* async function* lines() {
|
|
126
|
+
* for (const line of data) yield line;
|
|
127
|
+
* }
|
|
128
|
+
* for await (const result of auditStream(lines())) {
|
|
129
|
+
* console.log(result.quality_grade, result.pii_summary);
|
|
130
|
+
* }
|
|
131
|
+
*/
|
|
132
|
+
declare function auditStream(texts: AsyncIterable<string>, options?: AuditOptions): AsyncGenerator<AuditResult>;
|
|
133
|
+
/**
|
|
134
|
+
* Audit *text* and return a PII-free version ready for LLM processing.
|
|
135
|
+
*
|
|
136
|
+
* One-shot convenience wrapper around audit() + mask(). Equivalent to:
|
|
137
|
+
* const result = audit(text, { locale })
|
|
138
|
+
* return mask(text, result.pii, { strategy })
|
|
139
|
+
*
|
|
140
|
+
* @example
|
|
141
|
+
* const clean = redactForLlm("TCKN: 12345678950, email: ali@example.com", { locale: "tr" })
|
|
142
|
+
* // "TCKN: [REDACTED_NATIONAL_ID_TR], email: [REDACTED_EMAIL]"
|
|
143
|
+
*/
|
|
144
|
+
declare function redactForLlm(text: string, options?: AuditOptions & MaskOptions): string;
|
|
145
|
+
/**
|
|
146
|
+
* Estimate the token count of *text* using a word-based heuristic.
|
|
147
|
+
*
|
|
148
|
+
* Uses the standard approximation: 1 token ≈ 0.75 words (words × 4/3).
|
|
149
|
+
* No external dependencies — accuracy within ~15% of real tokenizers for
|
|
150
|
+
* English and most European languages. Treat as a planning estimate.
|
|
151
|
+
*
|
|
152
|
+
* @example
|
|
153
|
+
* estimateTokens("The quick brown fox") // → 5
|
|
154
|
+
* estimateTokens("") // → 0
|
|
155
|
+
*/
|
|
156
|
+
declare function estimateTokens(text: string): number;
|
|
157
|
+
type RiskLevel = "none" | "low" | "medium" | "high";
|
|
158
|
+
interface ComplianceReport {
|
|
159
|
+
has_pii: boolean;
|
|
160
|
+
pii_types: string[];
|
|
161
|
+
risk_level: RiskLevel;
|
|
162
|
+
masking_required: boolean;
|
|
163
|
+
recommendations: string[];
|
|
164
|
+
}
|
|
165
|
+
/**
|
|
166
|
+
* Generate a KVKK/GDPR compliance summary for an AuditResult.
|
|
167
|
+
*
|
|
168
|
+
* This is a technical summary only — not a legal document or regulatory opinion.
|
|
169
|
+
*/
|
|
170
|
+
declare function complianceReport(result: AuditResult): ComplianceReport;
|
|
119
171
|
|
|
120
|
-
export { type AuditOptions, type AuditResult, type BatchAuditResult, type MaskOptions, type MaskStrategy, type NoiseMetrics, type PiiFinding, type PiiSummaryEntry, type QualityGrade, type QualityMetrics, applyMask, audit, auditBatch, detectPii, mask, noiseMetrics, noiseRatio, qualityMetrics, version };
|
|
172
|
+
export { type AuditOptions, type AuditResult, type BatchAuditResult, type ComplianceReport, type MaskOptions, type MaskStrategy, type NoiseMetrics, type PiiFinding, type PiiSummaryEntry, type QualityGrade, type QualityMetrics, type RiskLevel, applyMask, audit, auditBatch, auditStream, complianceReport, detectPii, estimateTokens, mask, noiseMetrics, noiseRatio, qualityMetrics, redactForLlm, version };
|
package/dist/index.js
CHANGED
|
@@ -299,6 +299,28 @@ var _EIN_INVALID_PREFIXES = /* @__PURE__ */ new Set([
|
|
|
299
299
|
function validEinUs(s) {
|
|
300
300
|
return !_EIN_INVALID_PREFIXES.has(s.slice(0, 2));
|
|
301
301
|
}
|
|
302
|
+
var PESEL_PL_RE = /\b(\d{11})\b/g;
|
|
303
|
+
function validPeselPl(s) {
|
|
304
|
+
if (s.length !== 11 || !/^\d+$/.test(s)) return false;
|
|
305
|
+
const weights = [1, 3, 7, 9, 1, 3, 7, 9, 1, 3];
|
|
306
|
+
const total = weights.reduce((sum, w, i) => sum + w * parseInt(s[i]), 0);
|
|
307
|
+
return (10 - total % 10) % 10 === parseInt(s[10]);
|
|
308
|
+
}
|
|
309
|
+
var SVNR_AT_RE = /\b(\d{10})\b/g;
|
|
310
|
+
/**
 * Check whether a 10-digit string carries a valid Austrian social insurance
 * number (SVNR) check digit.
 *
 * The check digit is the 4th digit (index 3). It must equal the weighted sum
 * of the other nine digits taken modulo 11.
 *
 * @param {string} s - Candidate number, digits only.
 * @returns {boolean} true when `s` is exactly 10 digits and the check digit matches.
 */
function validSvnrAt(s) {
  if (s.length !== 10 || !/^\d+$/.test(s)) return false;
  // Index 3 is the check digit itself, so it contributes with weight 0.
  const weights = [3, 7, 9, 0, 5, 8, 4, 2, 1, 6];
  const total = weights.reduce((sum, w, i) => sum + w * Number(s[i]), 0);
  // The scheme is modulo 11 (not 10). A remainder of 10 can never equal a
  // single digit, so such serial numbers are rejected, as the scheme requires.
  return total % 11 === Number(s[3]);
}
|
|
316
|
+
var NRRNISS_BE_RE = /\b(\d{11})\b/g;
|
|
317
|
+
function validNrrnissBe(s) {
|
|
318
|
+
if (s.length !== 11 || !/^\d+$/.test(s)) return false;
|
|
319
|
+
const body = parseInt(s.slice(0, 9));
|
|
320
|
+
const check = parseInt(s.slice(9));
|
|
321
|
+
if (97 - body % 97 === check) return true;
|
|
322
|
+
return 97 - (2e9 + body) % 97 === check;
|
|
323
|
+
}
|
|
302
324
|
var LOCALE_DETECTORS = {
|
|
303
325
|
tr: /* @__PURE__ */ new Set([
|
|
304
326
|
"national_id_tr",
|
|
@@ -318,7 +340,10 @@ var LOCALE_DETECTORS = {
|
|
|
318
340
|
it: /* @__PURE__ */ new Set(["national_id_it", "tax_id_it"]),
|
|
319
341
|
nl: /* @__PURE__ */ new Set(["national_id_nl", "company_id_nl"]),
|
|
320
342
|
es: /* @__PURE__ */ new Set(["national_id_es", "tax_id_es"]),
|
|
321
|
-
uk: /* @__PURE__ */ new Set(["social_id_uk", "tax_id_uk"])
|
|
343
|
+
uk: /* @__PURE__ */ new Set(["social_id_uk", "tax_id_uk"]),
|
|
344
|
+
pl: /* @__PURE__ */ new Set(["national_id_pl"]),
|
|
345
|
+
at: /* @__PURE__ */ new Set(["social_id_at"]),
|
|
346
|
+
be: /* @__PURE__ */ new Set(["national_id_be"])
|
|
322
347
|
};
|
|
323
348
|
var UNIVERSAL = /* @__PURE__ */ new Set(["email", "iban", "credit_card", "ip", "ip_v6"]);
|
|
324
349
|
function activeDetectors(locale) {
|
|
@@ -570,6 +595,27 @@ function detectPii(text, locale = "und") {
|
|
|
570
595
|
findings.push({ type: "company_name_intl", value: m[1], start: m.index, end: m.index + m[1].length });
|
|
571
596
|
}
|
|
572
597
|
}
|
|
598
|
+
if (active.has("national_id_pl")) {
|
|
599
|
+
PESEL_PL_RE.lastIndex = 0;
|
|
600
|
+
let m;
|
|
601
|
+
while ((m = PESEL_PL_RE.exec(t)) !== null) {
|
|
602
|
+
if (validPeselPl(m[1])) findings.push({ type: "national_id_pl", value: m[1], start: m.index, end: m.index + m[1].length });
|
|
603
|
+
}
|
|
604
|
+
}
|
|
605
|
+
if (active.has("social_id_at")) {
|
|
606
|
+
SVNR_AT_RE.lastIndex = 0;
|
|
607
|
+
let m;
|
|
608
|
+
while ((m = SVNR_AT_RE.exec(t)) !== null) {
|
|
609
|
+
if (validSvnrAt(m[1])) findings.push({ type: "social_id_at", value: m[1], start: m.index, end: m.index + m[1].length });
|
|
610
|
+
}
|
|
611
|
+
}
|
|
612
|
+
if (active.has("national_id_be")) {
|
|
613
|
+
NRRNISS_BE_RE.lastIndex = 0;
|
|
614
|
+
let m;
|
|
615
|
+
while ((m = NRRNISS_BE_RE.exec(t)) !== null) {
|
|
616
|
+
if (validNrrnissBe(m[1])) findings.push({ type: "national_id_be", value: m[1], start: m.index, end: m.index + m[1].length });
|
|
617
|
+
}
|
|
618
|
+
}
|
|
573
619
|
findings.sort((a, b) => a.start - b.start);
|
|
574
620
|
const specificIbanSpans = new Set(
|
|
575
621
|
findings.filter((f) => f.type === "iban_tr" || f.type === "iban_intl").map((f) => `${f.start}:${f.end}`)
|
|
@@ -625,18 +671,56 @@ function noiseMetrics(text) {
|
|
|
625
671
|
|
|
626
672
|
// src/mask.ts
|
|
627
673
|
import { createHash } from "crypto";
|
|
628
|
-
var
|
|
674
|
+
var TCKN_POOL = ["12345678950", "10000000146", "23456789060"];
|
|
675
|
+
var IBAN_TR_POOL = ["TR330006100519786457841326", "TR390006199999888888888813"];
|
|
676
|
+
var NAME_POOL = [
|
|
677
|
+
"Ahmet Yilmaz",
|
|
678
|
+
"Mehmet Demir",
|
|
679
|
+
"Ayse Kaya",
|
|
680
|
+
"Fatma Celik",
|
|
681
|
+
"Ali Sahin",
|
|
682
|
+
"Zeynep Arslan",
|
|
683
|
+
"Mustafa Ozturk",
|
|
684
|
+
"Emine Dogan",
|
|
685
|
+
"Ibrahim Kurt",
|
|
686
|
+
"Hatice Aydin",
|
|
687
|
+
"Hasan Yildiz",
|
|
688
|
+
"Elif Gunes",
|
|
689
|
+
"Huseyin Cetin",
|
|
690
|
+
"Meryem Polat",
|
|
691
|
+
"Omer Koc",
|
|
692
|
+
"Busra Tekin",
|
|
693
|
+
"Yusuf Erdogan",
|
|
694
|
+
"Selin Bozkurt",
|
|
695
|
+
"Kemal Akin",
|
|
696
|
+
"Derya Ucar"
|
|
697
|
+
];
|
|
698
|
+
var STATIC_SYNTHETIC = {
|
|
629
699
|
email: "user@example.com",
|
|
630
700
|
phone: "+1 000 000 0000",
|
|
631
701
|
phone_tr: "0500 000 00 00",
|
|
632
|
-
|
|
702
|
+
phone_intl: "+1 000 000 0000",
|
|
633
703
|
ssn: "000-00-0000",
|
|
634
704
|
iban: "XX00 0000 0000 0000 0000 00",
|
|
635
705
|
credit_card: "0000 0000 0000 0000",
|
|
636
706
|
ip: "0.0.0.0",
|
|
637
|
-
|
|
707
|
+
ip_v6: "2001:db8::1",
|
|
708
|
+
national_id_pl: "00000000000",
|
|
709
|
+
social_id_at: "0000000000",
|
|
710
|
+
national_id_be: "00000000000"
|
|
638
711
|
};
|
|
639
712
|
var VALID_STRATEGIES = /* @__PURE__ */ new Set(["redact", "replace", "token", "hash"]);
|
|
713
|
+
function pick(pool, seed) {
|
|
714
|
+
const h = createHash("sha256").update(seed).digest("hex");
|
|
715
|
+
const idx = parseInt(h.slice(0, 8), 16) % pool.length;
|
|
716
|
+
return pool[idx];
|
|
717
|
+
}
|
|
718
|
+
function synthetic(ptype, original) {
|
|
719
|
+
if (ptype === "national_id_tr") return pick(TCKN_POOL, original);
|
|
720
|
+
if (ptype === "iban_tr" || ptype === "iban_intl") return pick(IBAN_TR_POOL, original);
|
|
721
|
+
if (ptype === "name") return pick(NAME_POOL, original);
|
|
722
|
+
return STATIC_SYNTHETIC[ptype] ?? `[${ptype.toUpperCase()}]`;
|
|
723
|
+
}
|
|
640
724
|
function applyMask(text, findings, strategy = "redact") {
|
|
641
725
|
if (!VALID_STRATEGIES.has(strategy)) {
|
|
642
726
|
throw new Error(`Unknown strategy "${strategy}". Use: redact, replace, token, hash`);
|
|
@@ -653,7 +737,7 @@ function applyMask(text, findings, strategy = "redact") {
|
|
|
653
737
|
if (strategy === "redact") {
|
|
654
738
|
replacement = `[REDACTED_${tag}]`;
|
|
655
739
|
} else if (strategy === "replace") {
|
|
656
|
-
replacement =
|
|
740
|
+
replacement = synthetic(type, value);
|
|
657
741
|
} else if (strategy === "token") {
|
|
658
742
|
replacement = `<PII_${tag}_${counter[type]}>`;
|
|
659
743
|
} else {
|
|
@@ -666,7 +750,7 @@ function applyMask(text, findings, strategy = "redact") {
|
|
|
666
750
|
}
|
|
667
751
|
|
|
668
752
|
// src/index.ts
|
|
669
|
-
var version = "0.
|
|
753
|
+
var version = "0.7.0";
|
|
670
754
|
function computeQualityScore(completeness, avgLength, garbageRatio) {
|
|
671
755
|
const lengthScore = Math.min(avgLength / 500, 1);
|
|
672
756
|
const noiseScore = Math.max(0, 1 - garbageRatio * 10);
|
|
@@ -717,14 +801,86 @@ function auditBatch(texts, options = {}) {
|
|
|
717
801
|
function mask(text, findings, options = {}) {
|
|
718
802
|
return applyMask(text, findings, options.strategy ?? "redact");
|
|
719
803
|
}
|
|
804
|
+
async function* auditStream(texts, options = {}) {
|
|
805
|
+
for await (const text of texts) {
|
|
806
|
+
yield audit(text, options);
|
|
807
|
+
}
|
|
808
|
+
}
|
|
809
|
+
function redactForLlm(text, options = {}) {
|
|
810
|
+
const { strategy, ...auditOptions } = options;
|
|
811
|
+
const result = audit(text, auditOptions);
|
|
812
|
+
return mask(text, result.pii, { strategy });
|
|
813
|
+
}
|
|
814
|
+
function estimateTokens(text) {
|
|
815
|
+
if (!text || !text.trim()) return 0;
|
|
816
|
+
const words = text.trim().split(/\s+/).length;
|
|
817
|
+
return Math.max(1, Math.round(words * 4 / 3));
|
|
818
|
+
}
|
|
819
|
+
var HIGH_RISK_TYPES = /* @__PURE__ */ new Set([
|
|
820
|
+
"national_id_tr",
|
|
821
|
+
"ssn",
|
|
822
|
+
"credit_card",
|
|
823
|
+
"national_id_pl",
|
|
824
|
+
"national_id_be",
|
|
825
|
+
"social_id_at",
|
|
826
|
+
"social_id_de",
|
|
827
|
+
"social_id_uk",
|
|
828
|
+
"national_id_it",
|
|
829
|
+
"national_id_nl",
|
|
830
|
+
"national_id_es",
|
|
831
|
+
"national_id_us",
|
|
832
|
+
"tax_id_tr",
|
|
833
|
+
"tax_id_de"
|
|
834
|
+
]);
|
|
835
|
+
var MEDIUM_RISK_TYPES = /* @__PURE__ */ new Set([
|
|
836
|
+
"email",
|
|
837
|
+
"phone_tr",
|
|
838
|
+
"phone_intl",
|
|
839
|
+
"iban",
|
|
840
|
+
"iban_tr",
|
|
841
|
+
"iban_intl",
|
|
842
|
+
"name"
|
|
843
|
+
]);
|
|
844
|
+
/**
 * Build a compliance summary from an audit result.
 *
 * Risk is "high" if any detected type is in HIGH_RISK_TYPES, otherwise
 * "medium" if any is in MEDIUM_RISK_TYPES, otherwise "low" when some PII was
 * found, and "none" when no PII was found.
 *
 * @param {{ pii: Array<{ type: string }> }} result - Audit result whose `pii` findings are summarised.
 * @returns {{ has_pii: boolean, pii_types: string[], risk_level: string, masking_required: boolean, recommendations: string[] }}
 */
function complianceReport(result) {
  const types = [...new Set(result.pii.map((f) => f.type))].sort();
  const hasPii = types.length > 0;
  let risk_level = "none";
  if (hasPii) {
    if (types.some((t) => HIGH_RISK_TYPES.has(t))) risk_level = "high";
    else if (types.some((t) => MEDIUM_RISK_TYPES.has(t))) risk_level = "medium";
    else risk_level = "low";
  }
  const recommendations = [];
  // Recommend masking whenever PII is present, including "low" risk, so the
  // advice agrees with masking_required below.
  if (hasPii) {
    recommendations.push("Apply mask({ strategy: 'redact' }) before storing or sharing this text.");
  }
  if (risk_level === "high") {
    recommendations.push(
      "Review applicable regulations (KVKK Art. 6, GDPR Art. 9) for special category data handling."
    );
  }
  // Only claim the text is clean when nothing was detected at all.
  if (!hasPii) {
    recommendations.push("No PII detected \u2014 text is safe for LLM processing.");
  }
  return {
    has_pii: hasPii,
    pii_types: types,
    risk_level,
    masking_required: hasPii,
    recommendations
  };
}
|
|
720
872
|
export {
|
|
721
873
|
applyMask,
|
|
722
874
|
audit,
|
|
723
875
|
auditBatch,
|
|
876
|
+
auditStream,
|
|
877
|
+
complianceReport,
|
|
724
878
|
detectPii,
|
|
879
|
+
estimateTokens,
|
|
725
880
|
mask,
|
|
726
881
|
noiseMetrics,
|
|
727
882
|
noiseRatio,
|
|
728
883
|
qualityMetrics,
|
|
884
|
+
redactForLlm,
|
|
729
885
|
version
|
|
730
886
|
};
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@flexorch/audit",
|
|
3
|
-
"version": "0.
|
|
3
|
+
"version": "0.7.0",
|
|
4
4
|
"description": "Zero-dependency PII + quality + noise audit for LLM datasets (TR/EU/US)",
|
|
5
5
|
"keywords": [
|
|
6
6
|
"pii",
|
|
@@ -14,7 +14,7 @@
|
|
|
14
14
|
],
|
|
15
15
|
"license": "MIT",
|
|
16
16
|
"author": "FlexOrch",
|
|
17
|
-
"homepage": "https://
|
|
17
|
+
"homepage": "https://flexorch.com",
|
|
18
18
|
"repository": {
|
|
19
19
|
"type": "git",
|
|
20
20
|
"url": "git+https://github.com/flexorch/flexorch-audit-js.git"
|