@flexorch/audit 0.5.1 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -23,11 +23,15 @@ __export(index_exports, {
23
23
  applyMask: () => applyMask,
24
24
  audit: () => audit,
25
25
  auditBatch: () => auditBatch,
26
+ auditStream: () => auditStream,
27
+ complianceReport: () => complianceReport,
26
28
  detectPii: () => detectPii,
29
+ estimateTokens: () => estimateTokens,
27
30
  mask: () => mask,
28
31
  noiseMetrics: () => noiseMetrics,
29
32
  noiseRatio: () => noiseRatio,
30
33
  qualityMetrics: () => qualityMetrics,
34
+ redactForLlm: () => redactForLlm,
31
35
  version: () => version
32
36
  });
33
37
  module.exports = __toCommonJS(index_exports);
@@ -333,6 +337,28 @@ var _EIN_INVALID_PREFIXES = /* @__PURE__ */ new Set([
333
337
  function validEinUs(s) {
334
338
  return !_EIN_INVALID_PREFIXES.has(s.slice(0, 2));
335
339
  }
340
+ var PESEL_PL_RE = /\b(\d{11})\b/g;
341
/**
 * Check the control digit of a Polish PESEL candidate.
 *
 * Only the checksum is verified: the first ten digits are weighted
 * 1-3-7-9-1-3-7-9-1-3 and the eleventh digit must equal
 * (10 - sum mod 10) mod 10. The embedded birth date is NOT validated, so
 * any 11-digit string with a matching control digit (including all zeros)
 * is accepted.
 *
 * @param {string} s - Candidate value; must be exactly 11 ASCII digits.
 * @returns {boolean} True when the control digit matches.
 */
function validPeselPl(s) {
  if (s.length !== 11 || !/^\d+$/.test(s)) return false;
  const weights = [1, 3, 7, 9, 1, 3, 7, 9, 1, 3];
  // Explicit radix: each character is a single decimal digit.
  const total = weights.reduce((sum, w, i) => sum + w * Number.parseInt(s[i], 10), 0);
  return (10 - (total % 10)) % 10 === Number.parseInt(s[10], 10);
}
347
+ var SVNR_AT_RE = /\b(\d{10})\b/g;
348
/**
 * Check the control digit of an Austrian social insurance number (SVNR).
 *
 * Layout: three serial digits, one check digit (4th position), then six
 * birth-date digits. The check digit is the weighted sum of the other nine
 * digits (weights 3,7,9 for the serial and 5,8,4,2,1,6 for the date part)
 * taken modulo 11. A remainder of 10 can never match a single digit, so such
 * combinations are rejected automatically.
 *
 * @param {string} s - Candidate value; must be exactly 10 ASCII digits.
 * @returns {boolean} True when the 4th digit equals the mod-11 checksum.
 */
function validSvnrAt(s) {
  if (s.length !== 10 || !/^\d+$/.test(s)) return false;
  // Weight 0 at index 3 keeps the check digit itself out of the sum.
  const weights = [3, 7, 9, 0, 5, 8, 4, 2, 1, 6];
  const total = weights.reduce((sum, w, i) => sum + w * Number.parseInt(s[i], 10), 0);
  // The SVNR checksum is modulo 11 (not 10).
  return total % 11 === Number.parseInt(s[3], 10);
}
354
+ var NRRNISS_BE_RE = /\b(\d{11})\b/g;
355
/**
 * Check the control number of a Belgian national register number (NRN/NISS).
 *
 * The last two digits must equal 97 minus the first nine digits modulo 97.
 * A second attempt prefixes the nine digits with "2" (adds 2,000,000,000)
 * before taking the modulus; the number is accepted if either variant matches.
 *
 * @param {string} s - Candidate value; must be exactly 11 ASCII digits.
 * @returns {boolean} True when either checksum variant matches.
 */
function validNrrnissBe(s) {
  if (s.length !== 11 || !/^\d+$/.test(s)) return false;
  // Explicit radix so leading zeros are always read as decimal.
  const body = Number.parseInt(s.slice(0, 9), 10);
  const check = Number.parseInt(s.slice(9), 10);
  if (97 - (body % 97) === check) return true;
  return 97 - ((2e9 + body) % 97) === check;
}
336
362
  var LOCALE_DETECTORS = {
337
363
  tr: /* @__PURE__ */ new Set([
338
364
  "national_id_tr",
@@ -352,7 +378,10 @@ var LOCALE_DETECTORS = {
352
378
  it: /* @__PURE__ */ new Set(["national_id_it", "tax_id_it"]),
353
379
  nl: /* @__PURE__ */ new Set(["national_id_nl", "company_id_nl"]),
354
380
  es: /* @__PURE__ */ new Set(["national_id_es", "tax_id_es"]),
355
- uk: /* @__PURE__ */ new Set(["social_id_uk", "tax_id_uk"])
381
+ uk: /* @__PURE__ */ new Set(["social_id_uk", "tax_id_uk"]),
382
+ pl: /* @__PURE__ */ new Set(["national_id_pl"]),
383
+ at: /* @__PURE__ */ new Set(["social_id_at"]),
384
+ be: /* @__PURE__ */ new Set(["national_id_be"])
356
385
  };
357
386
  var UNIVERSAL = /* @__PURE__ */ new Set(["email", "iban", "credit_card", "ip", "ip_v6"]);
358
387
  function activeDetectors(locale) {
@@ -604,6 +633,27 @@ function detectPii(text, locale = "und") {
604
633
  findings.push({ type: "company_name_intl", value: m[1], start: m.index, end: m.index + m[1].length });
605
634
  }
606
635
  }
636
+ if (active.has("national_id_pl")) {
637
+ PESEL_PL_RE.lastIndex = 0;
638
+ let m;
639
+ while ((m = PESEL_PL_RE.exec(t)) !== null) {
640
+ if (validPeselPl(m[1])) findings.push({ type: "national_id_pl", value: m[1], start: m.index, end: m.index + m[1].length });
641
+ }
642
+ }
643
+ if (active.has("social_id_at")) {
644
+ SVNR_AT_RE.lastIndex = 0;
645
+ let m;
646
+ while ((m = SVNR_AT_RE.exec(t)) !== null) {
647
+ if (validSvnrAt(m[1])) findings.push({ type: "social_id_at", value: m[1], start: m.index, end: m.index + m[1].length });
648
+ }
649
+ }
650
+ if (active.has("national_id_be")) {
651
+ NRRNISS_BE_RE.lastIndex = 0;
652
+ let m;
653
+ while ((m = NRRNISS_BE_RE.exec(t)) !== null) {
654
+ if (validNrrnissBe(m[1])) findings.push({ type: "national_id_be", value: m[1], start: m.index, end: m.index + m[1].length });
655
+ }
656
+ }
607
657
  findings.sort((a, b) => a.start - b.start);
608
658
  const specificIbanSpans = new Set(
609
659
  findings.filter((f) => f.type === "iban_tr" || f.type === "iban_intl").map((f) => `${f.start}:${f.end}`)
@@ -659,18 +709,56 @@ function noiseMetrics(text) {
659
709
 
660
710
  // src/mask.ts
661
711
  var import_crypto = require("crypto");
662
- var SYNTHETIC = {
712
+ var TCKN_POOL = ["12345678950", "10000000146", "23456789060"];
713
+ var IBAN_TR_POOL = ["TR330006100519786457841326", "TR390006199999888888888813"];
714
+ var NAME_POOL = [
715
+ "Ahmet Yilmaz",
716
+ "Mehmet Demir",
717
+ "Ayse Kaya",
718
+ "Fatma Celik",
719
+ "Ali Sahin",
720
+ "Zeynep Arslan",
721
+ "Mustafa Ozturk",
722
+ "Emine Dogan",
723
+ "Ibrahim Kurt",
724
+ "Hatice Aydin",
725
+ "Hasan Yildiz",
726
+ "Elif Gunes",
727
+ "Huseyin Cetin",
728
+ "Meryem Polat",
729
+ "Omer Koc",
730
+ "Busra Tekin",
731
+ "Yusuf Erdogan",
732
+ "Selin Bozkurt",
733
+ "Kemal Akin",
734
+ "Derya Ucar"
735
+ ];
736
+ var STATIC_SYNTHETIC = {
663
737
  email: "user@example.com",
664
738
  phone: "+1 000 000 0000",
665
739
  phone_tr: "0500 000 00 00",
666
- national_id_tr: "00000000000",
740
+ phone_intl: "+1 000 000 0000",
667
741
  ssn: "000-00-0000",
668
742
  iban: "XX00 0000 0000 0000 0000 00",
669
743
  credit_card: "0000 0000 0000 0000",
670
744
  ip: "0.0.0.0",
671
- name: "AD SOYAD"
745
+ ip_v6: "2001:db8::1",
746
+ national_id_pl: "00000000000",
747
+ social_id_at: "0000000000",
748
+ national_id_be: "00000000000"
672
749
  };
673
750
  var VALID_STRATEGIES = /* @__PURE__ */ new Set(["redact", "replace", "token", "hash"]);
751
+ function pick(pool, seed) {
752
+ const h = (0, import_crypto.createHash)("sha256").update(seed).digest("hex");
753
+ const idx = parseInt(h.slice(0, 8), 16) % pool.length;
754
+ return pool[idx];
755
+ }
756
+ function synthetic(ptype, original) {
757
+ if (ptype === "national_id_tr") return pick(TCKN_POOL, original);
758
+ if (ptype === "iban_tr" || ptype === "iban_intl") return pick(IBAN_TR_POOL, original);
759
+ if (ptype === "name") return pick(NAME_POOL, original);
760
+ return STATIC_SYNTHETIC[ptype] ?? `[${ptype.toUpperCase()}]`;
761
+ }
674
762
  function applyMask(text, findings, strategy = "redact") {
675
763
  if (!VALID_STRATEGIES.has(strategy)) {
676
764
  throw new Error(`Unknown strategy "${strategy}". Use: redact, replace, token, hash`);
@@ -687,7 +775,7 @@ function applyMask(text, findings, strategy = "redact") {
687
775
  if (strategy === "redact") {
688
776
  replacement = `[REDACTED_${tag}]`;
689
777
  } else if (strategy === "replace") {
690
- replacement = SYNTHETIC[type] ?? `[${tag}]`;
778
+ replacement = synthetic(type, value);
691
779
  } else if (strategy === "token") {
692
780
  replacement = `<PII_${tag}_${counter[type]}>`;
693
781
  } else {
@@ -700,7 +788,7 @@ function applyMask(text, findings, strategy = "redact") {
700
788
  }
701
789
 
702
790
  // src/index.ts
703
- var version = "0.5.1";
791
+ var version = "0.7.0";
704
792
  function computeQualityScore(completeness, avgLength, garbageRatio) {
705
793
  const lengthScore = Math.min(avgLength / 500, 1);
706
794
  const noiseScore = Math.max(0, 1 - garbageRatio * 10);
@@ -751,15 +839,87 @@ function auditBatch(texts, options = {}) {
751
839
  function mask(text, findings, options = {}) {
752
840
  return applyMask(text, findings, options.strategy ?? "redact");
753
841
  }
842
+ async function* auditStream(texts, options = {}) {
843
+ for await (const text of texts) {
844
+ yield audit(text, options);
845
+ }
846
+ }
847
+ function redactForLlm(text, options = {}) {
848
+ const { strategy, ...auditOptions } = options;
849
+ const result = audit(text, auditOptions);
850
+ return mask(text, result.pii, { strategy });
851
+ }
852
+ function estimateTokens(text) {
853
+ if (!text || !text.trim()) return 0;
854
+ const words = text.trim().split(/\s+/).length;
855
+ return Math.max(1, Math.round(words * 4 / 3));
856
+ }
857
// PII types that put a text into the "high" risk bucket.
var HIGH_RISK_TYPES = /* @__PURE__ */ new Set([
  "national_id_tr",
  "ssn",
  "credit_card",
  "national_id_pl",
  "national_id_be",
  "social_id_at",
  "social_id_de",
  "social_id_uk",
  "national_id_it",
  "national_id_nl",
  "national_id_es",
  "national_id_us",
  "tax_id_tr",
  "tax_id_de"
]);
// PII types that put a text into the "medium" risk bucket when no high-risk type is present.
var MEDIUM_RISK_TYPES = /* @__PURE__ */ new Set([
  "email",
  "phone_tr",
  "phone_intl",
  "iban",
  "iban_tr",
  "iban_intl",
  "name"
]);
/**
 * Summarise the PII findings of an audit result as a risk report.
 *
 * Risk is "high" if any finding type is in HIGH_RISK_TYPES, otherwise
 * "medium" if any is in MEDIUM_RISK_TYPES, otherwise "low" when findings
 * exist, and "none" when there are no findings.
 *
 * @param {{ pii: Array<{ type: string }> }} result - Audit result; only `pii[].type` is read.
 * @returns {{ has_pii: boolean, pii_types: string[], risk_level: string,
 *   masking_required: boolean, recommendations: string[] }} Report object.
 */
function complianceReport(result) {
  const types = [...new Set(result.pii.map((f) => f.type))].sort();
  const has_pii = types.length > 0;
  let risk_level = "none";
  if (has_pii) {
    if (types.some((t) => HIGH_RISK_TYPES.has(t))) risk_level = "high";
    else if (types.some((t) => MEDIUM_RISK_TYPES.has(t))) risk_level = "medium";
    else risk_level = "low";
  }
  const recommendations = [];
  // Masking advice follows has_pii so it always agrees with masking_required;
  // "low" risk previously fell through to the "No PII detected" message.
  if (has_pii) {
    recommendations.push("Apply mask({ strategy: 'redact' }) before storing or sharing this text.");
  }
  if (risk_level === "high") {
    recommendations.push(
      "Review applicable regulations (KVKK Art. 6, GDPR Art. 9) for special category data handling."
    );
  }
  if (!has_pii) {
    recommendations.push("No PII detected \u2014 text is safe for LLM processing.");
  }
  return {
    has_pii,
    pii_types: types,
    risk_level,
    masking_required: has_pii,
    recommendations
  };
}
754
910
  // Annotate the CommonJS export names for ESM import in node:
755
911
  0 && (module.exports = {
756
912
  applyMask,
757
913
  audit,
758
914
  auditBatch,
915
+ auditStream,
916
+ complianceReport,
759
917
  detectPii,
918
+ estimateTokens,
760
919
  mask,
761
920
  noiseMetrics,
762
921
  noiseRatio,
763
922
  qualityMetrics,
923
+ redactForLlm,
764
924
  version
765
925
  });
package/dist/index.d.cts CHANGED
@@ -54,7 +54,7 @@ declare function applyMask(text: string, findings: PiiFinding[], strategy?: Mask
54
54
  * // "Contact: [REDACTED_EMAIL]"
55
55
  */
56
56
 
57
- declare const version = "0.5.1";
57
+ declare const version = "0.7.0";
58
58
  type QualityGrade = "A" | "B" | "C" | "D";
59
59
  interface PiiSummaryEntry {
60
60
  type: string;
@@ -116,5 +116,57 @@ declare function auditBatch(texts: string[], options?: AuditOptions): BatchAudit
116
116
  * Apply masking to PII findings in *text*.
117
117
  */
118
118
  declare function mask(text: string, findings: PiiFinding[], options?: MaskOptions): string;
119
+ /**
120
+ * Async generator that audits texts one at a time from an async iterable.
121
+ *
122
+ * Yields one AuditResult per input text. Processing is sequential.
123
+ *
124
+ * @example
125
+ * async function* lines() {
126
+ * for (const line of data) yield line;
127
+ * }
128
+ * for await (const result of auditStream(lines())) {
129
+ * console.log(result.quality_grade, result.pii_summary);
130
+ * }
131
+ */
132
+ declare function auditStream(texts: AsyncIterable<string>, options?: AuditOptions): AsyncGenerator<AuditResult>;
133
+ /**
134
+ * Audit *text* and return a PII-free version ready for LLM processing.
135
+ *
136
+ * One-shot convenience wrapper around audit() + mask(). Equivalent to:
137
+ * const result = audit(text, { locale })
138
+ * return mask(text, result.pii, { strategy })
139
+ *
140
+ * @example
141
+ * const clean = redactForLlm("TCKN: 12345678950, email: ali@example.com", { locale: "tr" })
142
+ * // "TCKN: [REDACTED_NATIONAL_ID_TR], email: [REDACTED_EMAIL]"
143
+ */
144
+ declare function redactForLlm(text: string, options?: AuditOptions & MaskOptions): string;
145
+ /**
146
+ * Estimate the token count of *text* using a word-based heuristic.
147
+ *
148
+ * Uses the standard approximation: 1 token ≈ 0.75 words (words × 4/3).
149
+ * No external dependencies — accuracy within ~15% of real tokenizers for
150
+ * English and most European languages. Treat as a planning estimate.
151
+ *
152
+ * @example
153
+ * estimateTokens("The quick brown fox") // → 5
154
+ * estimateTokens("") // → 0
155
+ */
156
+ declare function estimateTokens(text: string): number;
157
+ type RiskLevel = "none" | "low" | "medium" | "high";
158
+ interface ComplianceReport {
159
+ has_pii: boolean;
160
+ pii_types: string[];
161
+ risk_level: RiskLevel;
162
+ masking_required: boolean;
163
+ recommendations: string[];
164
+ }
165
+ /**
166
+ * Generate a KVKK/GDPR compliance summary for an AuditResult.
167
+ *
168
+ * This is a technical summary only — not a legal document or regulatory opinion.
169
+ */
170
+ declare function complianceReport(result: AuditResult): ComplianceReport;
119
171
 
120
- export { type AuditOptions, type AuditResult, type BatchAuditResult, type MaskOptions, type MaskStrategy, type NoiseMetrics, type PiiFinding, type PiiSummaryEntry, type QualityGrade, type QualityMetrics, applyMask, audit, auditBatch, detectPii, mask, noiseMetrics, noiseRatio, qualityMetrics, version };
172
+ export { type AuditOptions, type AuditResult, type BatchAuditResult, type ComplianceReport, type MaskOptions, type MaskStrategy, type NoiseMetrics, type PiiFinding, type PiiSummaryEntry, type QualityGrade, type QualityMetrics, type RiskLevel, applyMask, audit, auditBatch, auditStream, complianceReport, detectPii, estimateTokens, mask, noiseMetrics, noiseRatio, qualityMetrics, redactForLlm, version };
package/dist/index.d.ts CHANGED
@@ -54,7 +54,7 @@ declare function applyMask(text: string, findings: PiiFinding[], strategy?: Mask
54
54
  * // "Contact: [REDACTED_EMAIL]"
55
55
  */
56
56
 
57
- declare const version = "0.5.1";
57
+ declare const version = "0.7.0";
58
58
  type QualityGrade = "A" | "B" | "C" | "D";
59
59
  interface PiiSummaryEntry {
60
60
  type: string;
@@ -116,5 +116,57 @@ declare function auditBatch(texts: string[], options?: AuditOptions): BatchAudit
116
116
  * Apply masking to PII findings in *text*.
117
117
  */
118
118
  declare function mask(text: string, findings: PiiFinding[], options?: MaskOptions): string;
119
+ /**
120
+ * Async generator that audits texts one at a time from an async iterable.
121
+ *
122
+ * Yields one AuditResult per input text. Processing is sequential.
123
+ *
124
+ * @example
125
+ * async function* lines() {
126
+ * for (const line of data) yield line;
127
+ * }
128
+ * for await (const result of auditStream(lines())) {
129
+ * console.log(result.quality_grade, result.pii_summary);
130
+ * }
131
+ */
132
+ declare function auditStream(texts: AsyncIterable<string>, options?: AuditOptions): AsyncGenerator<AuditResult>;
133
+ /**
134
+ * Audit *text* and return a PII-free version ready for LLM processing.
135
+ *
136
+ * One-shot convenience wrapper around audit() + mask(). Equivalent to:
137
+ * const result = audit(text, { locale })
138
+ * return mask(text, result.pii, { strategy })
139
+ *
140
+ * @example
141
+ * const clean = redactForLlm("TCKN: 12345678950, email: ali@example.com", { locale: "tr" })
142
+ * // "TCKN: [REDACTED_NATIONAL_ID_TR], email: [REDACTED_EMAIL]"
143
+ */
144
+ declare function redactForLlm(text: string, options?: AuditOptions & MaskOptions): string;
145
+ /**
146
+ * Estimate the token count of *text* using a word-based heuristic.
147
+ *
148
+ * Uses the standard approximation: 1 token ≈ 0.75 words (words × 4/3).
149
+ * No external dependencies — accuracy within ~15% of real tokenizers for
150
+ * English and most European languages. Treat as a planning estimate.
151
+ *
152
+ * @example
153
+ * estimateTokens("The quick brown fox") // → 5
154
+ * estimateTokens("") // → 0
155
+ */
156
+ declare function estimateTokens(text: string): number;
157
+ type RiskLevel = "none" | "low" | "medium" | "high";
158
+ interface ComplianceReport {
159
+ has_pii: boolean;
160
+ pii_types: string[];
161
+ risk_level: RiskLevel;
162
+ masking_required: boolean;
163
+ recommendations: string[];
164
+ }
165
+ /**
166
+ * Generate a KVKK/GDPR compliance summary for an AuditResult.
167
+ *
168
+ * This is a technical summary only — not a legal document or regulatory opinion.
169
+ */
170
+ declare function complianceReport(result: AuditResult): ComplianceReport;
119
171
 
120
- export { type AuditOptions, type AuditResult, type BatchAuditResult, type MaskOptions, type MaskStrategy, type NoiseMetrics, type PiiFinding, type PiiSummaryEntry, type QualityGrade, type QualityMetrics, applyMask, audit, auditBatch, detectPii, mask, noiseMetrics, noiseRatio, qualityMetrics, version };
172
+ export { type AuditOptions, type AuditResult, type BatchAuditResult, type ComplianceReport, type MaskOptions, type MaskStrategy, type NoiseMetrics, type PiiFinding, type PiiSummaryEntry, type QualityGrade, type QualityMetrics, type RiskLevel, applyMask, audit, auditBatch, auditStream, complianceReport, detectPii, estimateTokens, mask, noiseMetrics, noiseRatio, qualityMetrics, redactForLlm, version };
package/dist/index.js CHANGED
@@ -299,6 +299,28 @@ var _EIN_INVALID_PREFIXES = /* @__PURE__ */ new Set([
299
299
  function validEinUs(s) {
300
300
  return !_EIN_INVALID_PREFIXES.has(s.slice(0, 2));
301
301
  }
302
+ var PESEL_PL_RE = /\b(\d{11})\b/g;
303
/**
 * Check the control digit of a Polish PESEL candidate.
 *
 * Only the checksum is verified: the first ten digits are weighted
 * 1-3-7-9-1-3-7-9-1-3 and the eleventh digit must equal
 * (10 - sum mod 10) mod 10. The embedded birth date is NOT validated, so
 * any 11-digit string with a matching control digit (including all zeros)
 * is accepted.
 *
 * @param {string} s - Candidate value; must be exactly 11 ASCII digits.
 * @returns {boolean} True when the control digit matches.
 */
function validPeselPl(s) {
  if (s.length !== 11 || !/^\d+$/.test(s)) return false;
  const weights = [1, 3, 7, 9, 1, 3, 7, 9, 1, 3];
  // Explicit radix: each character is a single decimal digit.
  const total = weights.reduce((sum, w, i) => sum + w * Number.parseInt(s[i], 10), 0);
  return (10 - (total % 10)) % 10 === Number.parseInt(s[10], 10);
}
309
+ var SVNR_AT_RE = /\b(\d{10})\b/g;
310
/**
 * Check the control digit of an Austrian social insurance number (SVNR).
 *
 * Layout: three serial digits, one check digit (4th position), then six
 * birth-date digits. The check digit is the weighted sum of the other nine
 * digits (weights 3,7,9 for the serial and 5,8,4,2,1,6 for the date part)
 * taken modulo 11. A remainder of 10 can never match a single digit, so such
 * combinations are rejected automatically.
 *
 * @param {string} s - Candidate value; must be exactly 10 ASCII digits.
 * @returns {boolean} True when the 4th digit equals the mod-11 checksum.
 */
function validSvnrAt(s) {
  if (s.length !== 10 || !/^\d+$/.test(s)) return false;
  // Weight 0 at index 3 keeps the check digit itself out of the sum.
  const weights = [3, 7, 9, 0, 5, 8, 4, 2, 1, 6];
  const total = weights.reduce((sum, w, i) => sum + w * Number.parseInt(s[i], 10), 0);
  // The SVNR checksum is modulo 11 (not 10).
  return total % 11 === Number.parseInt(s[3], 10);
}
316
+ var NRRNISS_BE_RE = /\b(\d{11})\b/g;
317
/**
 * Check the control number of a Belgian national register number (NRN/NISS).
 *
 * The last two digits must equal 97 minus the first nine digits modulo 97.
 * A second attempt prefixes the nine digits with "2" (adds 2,000,000,000)
 * before taking the modulus; the number is accepted if either variant matches.
 *
 * @param {string} s - Candidate value; must be exactly 11 ASCII digits.
 * @returns {boolean} True when either checksum variant matches.
 */
function validNrrnissBe(s) {
  if (s.length !== 11 || !/^\d+$/.test(s)) return false;
  // Explicit radix so leading zeros are always read as decimal.
  const body = Number.parseInt(s.slice(0, 9), 10);
  const check = Number.parseInt(s.slice(9), 10);
  if (97 - (body % 97) === check) return true;
  return 97 - ((2e9 + body) % 97) === check;
}
302
324
  var LOCALE_DETECTORS = {
303
325
  tr: /* @__PURE__ */ new Set([
304
326
  "national_id_tr",
@@ -318,7 +340,10 @@ var LOCALE_DETECTORS = {
318
340
  it: /* @__PURE__ */ new Set(["national_id_it", "tax_id_it"]),
319
341
  nl: /* @__PURE__ */ new Set(["national_id_nl", "company_id_nl"]),
320
342
  es: /* @__PURE__ */ new Set(["national_id_es", "tax_id_es"]),
321
- uk: /* @__PURE__ */ new Set(["social_id_uk", "tax_id_uk"])
343
+ uk: /* @__PURE__ */ new Set(["social_id_uk", "tax_id_uk"]),
344
+ pl: /* @__PURE__ */ new Set(["national_id_pl"]),
345
+ at: /* @__PURE__ */ new Set(["social_id_at"]),
346
+ be: /* @__PURE__ */ new Set(["national_id_be"])
322
347
  };
323
348
  var UNIVERSAL = /* @__PURE__ */ new Set(["email", "iban", "credit_card", "ip", "ip_v6"]);
324
349
  function activeDetectors(locale) {
@@ -570,6 +595,27 @@ function detectPii(text, locale = "und") {
570
595
  findings.push({ type: "company_name_intl", value: m[1], start: m.index, end: m.index + m[1].length });
571
596
  }
572
597
  }
598
+ if (active.has("national_id_pl")) {
599
+ PESEL_PL_RE.lastIndex = 0;
600
+ let m;
601
+ while ((m = PESEL_PL_RE.exec(t)) !== null) {
602
+ if (validPeselPl(m[1])) findings.push({ type: "national_id_pl", value: m[1], start: m.index, end: m.index + m[1].length });
603
+ }
604
+ }
605
+ if (active.has("social_id_at")) {
606
+ SVNR_AT_RE.lastIndex = 0;
607
+ let m;
608
+ while ((m = SVNR_AT_RE.exec(t)) !== null) {
609
+ if (validSvnrAt(m[1])) findings.push({ type: "social_id_at", value: m[1], start: m.index, end: m.index + m[1].length });
610
+ }
611
+ }
612
+ if (active.has("national_id_be")) {
613
+ NRRNISS_BE_RE.lastIndex = 0;
614
+ let m;
615
+ while ((m = NRRNISS_BE_RE.exec(t)) !== null) {
616
+ if (validNrrnissBe(m[1])) findings.push({ type: "national_id_be", value: m[1], start: m.index, end: m.index + m[1].length });
617
+ }
618
+ }
573
619
  findings.sort((a, b) => a.start - b.start);
574
620
  const specificIbanSpans = new Set(
575
621
  findings.filter((f) => f.type === "iban_tr" || f.type === "iban_intl").map((f) => `${f.start}:${f.end}`)
@@ -625,18 +671,56 @@ function noiseMetrics(text) {
625
671
 
626
672
  // src/mask.ts
627
673
  import { createHash } from "crypto";
628
- var SYNTHETIC = {
674
+ var TCKN_POOL = ["12345678950", "10000000146", "23456789060"];
675
+ var IBAN_TR_POOL = ["TR330006100519786457841326", "TR390006199999888888888813"];
676
+ var NAME_POOL = [
677
+ "Ahmet Yilmaz",
678
+ "Mehmet Demir",
679
+ "Ayse Kaya",
680
+ "Fatma Celik",
681
+ "Ali Sahin",
682
+ "Zeynep Arslan",
683
+ "Mustafa Ozturk",
684
+ "Emine Dogan",
685
+ "Ibrahim Kurt",
686
+ "Hatice Aydin",
687
+ "Hasan Yildiz",
688
+ "Elif Gunes",
689
+ "Huseyin Cetin",
690
+ "Meryem Polat",
691
+ "Omer Koc",
692
+ "Busra Tekin",
693
+ "Yusuf Erdogan",
694
+ "Selin Bozkurt",
695
+ "Kemal Akin",
696
+ "Derya Ucar"
697
+ ];
698
+ var STATIC_SYNTHETIC = {
629
699
  email: "user@example.com",
630
700
  phone: "+1 000 000 0000",
631
701
  phone_tr: "0500 000 00 00",
632
- national_id_tr: "00000000000",
702
+ phone_intl: "+1 000 000 0000",
633
703
  ssn: "000-00-0000",
634
704
  iban: "XX00 0000 0000 0000 0000 00",
635
705
  credit_card: "0000 0000 0000 0000",
636
706
  ip: "0.0.0.0",
637
- name: "AD SOYAD"
707
+ ip_v6: "2001:db8::1",
708
+ national_id_pl: "00000000000",
709
+ social_id_at: "0000000000",
710
+ national_id_be: "00000000000"
638
711
  };
639
712
  var VALID_STRATEGIES = /* @__PURE__ */ new Set(["redact", "replace", "token", "hash"]);
713
+ function pick(pool, seed) {
714
+ const h = createHash("sha256").update(seed).digest("hex");
715
+ const idx = parseInt(h.slice(0, 8), 16) % pool.length;
716
+ return pool[idx];
717
+ }
718
+ function synthetic(ptype, original) {
719
+ if (ptype === "national_id_tr") return pick(TCKN_POOL, original);
720
+ if (ptype === "iban_tr" || ptype === "iban_intl") return pick(IBAN_TR_POOL, original);
721
+ if (ptype === "name") return pick(NAME_POOL, original);
722
+ return STATIC_SYNTHETIC[ptype] ?? `[${ptype.toUpperCase()}]`;
723
+ }
640
724
  function applyMask(text, findings, strategy = "redact") {
641
725
  if (!VALID_STRATEGIES.has(strategy)) {
642
726
  throw new Error(`Unknown strategy "${strategy}". Use: redact, replace, token, hash`);
@@ -653,7 +737,7 @@ function applyMask(text, findings, strategy = "redact") {
653
737
  if (strategy === "redact") {
654
738
  replacement = `[REDACTED_${tag}]`;
655
739
  } else if (strategy === "replace") {
656
- replacement = SYNTHETIC[type] ?? `[${tag}]`;
740
+ replacement = synthetic(type, value);
657
741
  } else if (strategy === "token") {
658
742
  replacement = `<PII_${tag}_${counter[type]}>`;
659
743
  } else {
@@ -666,7 +750,7 @@ function applyMask(text, findings, strategy = "redact") {
666
750
  }
667
751
 
668
752
  // src/index.ts
669
- var version = "0.5.1";
753
+ var version = "0.7.0";
670
754
  function computeQualityScore(completeness, avgLength, garbageRatio) {
671
755
  const lengthScore = Math.min(avgLength / 500, 1);
672
756
  const noiseScore = Math.max(0, 1 - garbageRatio * 10);
@@ -717,14 +801,86 @@ function auditBatch(texts, options = {}) {
717
801
  function mask(text, findings, options = {}) {
718
802
  return applyMask(text, findings, options.strategy ?? "redact");
719
803
  }
804
+ async function* auditStream(texts, options = {}) {
805
+ for await (const text of texts) {
806
+ yield audit(text, options);
807
+ }
808
+ }
809
+ function redactForLlm(text, options = {}) {
810
+ const { strategy, ...auditOptions } = options;
811
+ const result = audit(text, auditOptions);
812
+ return mask(text, result.pii, { strategy });
813
+ }
814
+ function estimateTokens(text) {
815
+ if (!text || !text.trim()) return 0;
816
+ const words = text.trim().split(/\s+/).length;
817
+ return Math.max(1, Math.round(words * 4 / 3));
818
+ }
819
// PII types that put a text into the "high" risk bucket.
var HIGH_RISK_TYPES = /* @__PURE__ */ new Set([
  "national_id_tr",
  "ssn",
  "credit_card",
  "national_id_pl",
  "national_id_be",
  "social_id_at",
  "social_id_de",
  "social_id_uk",
  "national_id_it",
  "national_id_nl",
  "national_id_es",
  "national_id_us",
  "tax_id_tr",
  "tax_id_de"
]);
// PII types that put a text into the "medium" risk bucket when no high-risk type is present.
var MEDIUM_RISK_TYPES = /* @__PURE__ */ new Set([
  "email",
  "phone_tr",
  "phone_intl",
  "iban",
  "iban_tr",
  "iban_intl",
  "name"
]);
/**
 * Summarise the PII findings of an audit result as a risk report.
 *
 * Risk is "high" if any finding type is in HIGH_RISK_TYPES, otherwise
 * "medium" if any is in MEDIUM_RISK_TYPES, otherwise "low" when findings
 * exist, and "none" when there are no findings.
 *
 * @param {{ pii: Array<{ type: string }> }} result - Audit result; only `pii[].type` is read.
 * @returns {{ has_pii: boolean, pii_types: string[], risk_level: string,
 *   masking_required: boolean, recommendations: string[] }} Report object.
 */
function complianceReport(result) {
  const types = [...new Set(result.pii.map((f) => f.type))].sort();
  const has_pii = types.length > 0;
  let risk_level = "none";
  if (has_pii) {
    if (types.some((t) => HIGH_RISK_TYPES.has(t))) risk_level = "high";
    else if (types.some((t) => MEDIUM_RISK_TYPES.has(t))) risk_level = "medium";
    else risk_level = "low";
  }
  const recommendations = [];
  // Masking advice follows has_pii so it always agrees with masking_required;
  // "low" risk previously fell through to the "No PII detected" message.
  if (has_pii) {
    recommendations.push("Apply mask({ strategy: 'redact' }) before storing or sharing this text.");
  }
  if (risk_level === "high") {
    recommendations.push(
      "Review applicable regulations (KVKK Art. 6, GDPR Art. 9) for special category data handling."
    );
  }
  if (!has_pii) {
    recommendations.push("No PII detected \u2014 text is safe for LLM processing.");
  }
  return {
    has_pii,
    pii_types: types,
    risk_level,
    masking_required: has_pii,
    recommendations
  };
}
720
872
  export {
721
873
  applyMask,
722
874
  audit,
723
875
  auditBatch,
876
+ auditStream,
877
+ complianceReport,
724
878
  detectPii,
879
+ estimateTokens,
725
880
  mask,
726
881
  noiseMetrics,
727
882
  noiseRatio,
728
883
  qualityMetrics,
884
+ redactForLlm,
729
885
  version
730
886
  };
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@flexorch/audit",
3
- "version": "0.5.1",
3
+ "version": "0.7.0",
4
4
  "description": "Zero-dependency PII + quality + noise audit for LLM datasets (TR/EU/US)",
5
5
  "keywords": [
6
6
  "pii",
@@ -14,7 +14,7 @@
14
14
  ],
15
15
  "license": "MIT",
16
16
  "author": "FlexOrch",
17
- "homepage": "https://github.com/flexorch/flexorch-audit-js",
17
+ "homepage": "https://flexorch.com",
18
18
  "repository": {
19
19
  "type": "git",
20
20
  "url": "git+https://github.com/flexorch/flexorch-audit-js.git"