@flexorch/audit 0.7.0 → 0.8.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -59,6 +59,7 @@ var COMPANY_NAME_TR_RE = new RegExp(
59
59
  "gu"
60
60
  );
61
61
  var MERSIS_RE = /\b([1-9]\d{15})\b/g;
62
+ var SGK_RE = /(?:SGK\s*(?:Sicil\s*No(?:su)?|No(?:su)?|Numara(?:s[ıi])?)?|Sigortal[ıi]\s*(?:Sicil\s*)?(?:No|Numara(?:s[ıi])?)|SSK\s*(?:No|Numara(?:s[ıi])?|Sicil))\s*[:#]*\s*(\d{10,11})\b/giu;
62
63
  var POSTAL_CODE_TR_RE = /\b((?:0[1-9]|[1-7]\d|80|81)\d{3})\b/g;
63
64
  var _TR_PROVINCES_SORTED = [
64
65
  "Afyonkarahisar",
@@ -338,12 +339,24 @@ function validEinUs(s) {
338
339
  return !_EIN_INVALID_PREFIXES.has(s.slice(0, 2));
339
340
  }
340
341
  var PESEL_PL_RE = /\b(\d{11})\b/g;
342
+ var NIP_PL_RE = /(?:NIP|Numer\s+NIP|Numer\s+Identyfikacji\s+Podatkowej)\s*[:#]*\s*(\d{10})\b/gi;
341
343
  function validPeselPl(s) {
342
344
  if (s.length !== 11 || !/^\d+$/.test(s)) return false;
343
345
  const weights = [1, 3, 7, 9, 1, 3, 7, 9, 1, 3];
344
346
  const total = weights.reduce((sum, w, i) => sum + w * parseInt(s[i]), 0);
345
347
  return (10 - total % 10) % 10 === parseInt(s[10]);
346
348
  }
349
+ function validNifPt(s) {
350
+ if (s.length !== 9 || !/^\d+$/.test(s) || s[0] === "0") return false;
351
+ let total = 0;
352
+ for (let i = 0; i < 8; i++) total += (9 - i) * parseInt(s[i]);
353
+ const check = (11 - total % 11) % 11;
354
+ return (check >= 10 ? 0 : check) === parseInt(s[8]);
355
+ }
356
+ var NIF_PT_RE = /(?:NIF|N[uú]mero\s+de\s+Contribuinte|Contribuinte)\s*[:#]*\s*(\d{9})\b/gi;
357
+ var PERSONNUMMER_SE_RE = /\b(\d{6,8}[-+]\d{4})\b/g;
358
+ var CPR_DK_RE = /\b(\d{6}-\d{4})\b/g;
359
+ var HETU_FI_RE = /\b(\d{6}[+\-A]\d{3}[0-9A-FHJ-NPR-Y])\b/g;
347
360
  var SVNR_AT_RE = /\b(\d{10})\b/g;
348
361
  function validSvnrAt(s) {
349
362
  if (s.length !== 10 || !/^\d+$/.test(s)) return false;
@@ -369,17 +382,22 @@ var LOCALE_DETECTORS = {
369
382
  "company_name_tr",
370
383
  "mersis_no",
371
384
  "postal_code_tr",
372
- "province_tr"
385
+ "province_tr",
386
+ "sgk_no"
373
387
  ]),
374
388
  us: /* @__PURE__ */ new Set(["ssn", "tax_id_us", "national_id_us", "phone_intl", "company_name_intl"]),
375
389
  eu: /* @__PURE__ */ new Set(["phone_intl", "iban_intl", "company_name_intl"]),
376
- de: /* @__PURE__ */ new Set(["tax_id_de", "social_id_de"]),
377
- fr: /* @__PURE__ */ new Set(["siret_fr", "company_id_fr", "social_id_fr"]),
390
+ de: /* @__PURE__ */ new Set(["tax_id_de", "social_id_de", "social_id_at"]),
391
+ fr: /* @__PURE__ */ new Set(["siret_fr", "company_id_fr", "social_id_fr", "national_id_be"]),
378
392
  it: /* @__PURE__ */ new Set(["national_id_it", "tax_id_it"]),
379
- nl: /* @__PURE__ */ new Set(["national_id_nl", "company_id_nl"]),
393
+ nl: /* @__PURE__ */ new Set(["national_id_nl", "company_id_nl", "national_id_be"]),
380
394
  es: /* @__PURE__ */ new Set(["national_id_es", "tax_id_es"]),
381
395
  uk: /* @__PURE__ */ new Set(["social_id_uk", "tax_id_uk"]),
382
- pl: /* @__PURE__ */ new Set(["national_id_pl"]),
396
+ pl: /* @__PURE__ */ new Set(["national_id_pl", "tax_id_pl"]),
397
+ pt: /* @__PURE__ */ new Set(["tax_id_pt"]),
398
+ sv: /* @__PURE__ */ new Set(["national_id_se"]),
399
+ da: /* @__PURE__ */ new Set(["national_id_dk"]),
400
+ fi: /* @__PURE__ */ new Set(["national_id_fi"]),
383
401
  at: /* @__PURE__ */ new Set(["social_id_at"]),
384
402
  be: /* @__PURE__ */ new Set(["national_id_be"])
385
403
  };
@@ -492,6 +510,14 @@ function detectPii(text, locale = "und") {
492
510
  findings.push({ type: "mersis_no", value: m[1], start: m.index, end: m.index + m[1].length });
493
511
  }
494
512
  }
513
+ if (active.has("sgk_no")) {
514
+ SGK_RE.lastIndex = 0;
515
+ let m;
516
+ while ((m = SGK_RE.exec(t)) !== null) {
517
+ const start = m.index + m[0].lastIndexOf(m[1]);
518
+ findings.push({ type: "sgk_no", value: m[1], start, end: start + m[1].length });
519
+ }
520
+ }
495
521
  if (active.has("postal_code_tr")) {
496
522
  POSTAL_CODE_TR_RE.lastIndex = 0;
497
523
  let m;
@@ -640,6 +666,45 @@ function detectPii(text, locale = "und") {
640
666
  if (validPeselPl(m[1])) findings.push({ type: "national_id_pl", value: m[1], start: m.index, end: m.index + m[1].length });
641
667
  }
642
668
  }
669
+ if (active.has("tax_id_pl")) {
670
+ NIP_PL_RE.lastIndex = 0;
671
+ let m;
672
+ while ((m = NIP_PL_RE.exec(t)) !== null) {
673
+ const start = m.index + m[0].lastIndexOf(m[1]);
674
+ findings.push({ type: "tax_id_pl", value: m[1], start, end: start + m[1].length });
675
+ }
676
+ }
677
+ if (active.has("tax_id_pt")) {
678
+ NIF_PT_RE.lastIndex = 0;
679
+ let m;
680
+ while ((m = NIF_PT_RE.exec(t)) !== null) {
681
+ if (validNifPt(m[1])) {
682
+ const start = m.index + m[0].lastIndexOf(m[1]);
683
+ findings.push({ type: "tax_id_pt", value: m[1], start, end: start + m[1].length });
684
+ }
685
+ }
686
+ }
687
+ if (active.has("national_id_se")) {
688
+ PERSONNUMMER_SE_RE.lastIndex = 0;
689
+ let m;
690
+ while ((m = PERSONNUMMER_SE_RE.exec(t)) !== null) {
691
+ findings.push({ type: "national_id_se", value: m[1], start: m.index, end: m.index + m[1].length });
692
+ }
693
+ }
694
+ if (active.has("national_id_dk")) {
695
+ CPR_DK_RE.lastIndex = 0;
696
+ let m;
697
+ while ((m = CPR_DK_RE.exec(t)) !== null) {
698
+ findings.push({ type: "national_id_dk", value: m[1], start: m.index, end: m.index + m[1].length });
699
+ }
700
+ }
701
+ if (active.has("national_id_fi")) {
702
+ HETU_FI_RE.lastIndex = 0;
703
+ let m;
704
+ while ((m = HETU_FI_RE.exec(t)) !== null) {
705
+ findings.push({ type: "national_id_fi", value: m[1], start: m.index, end: m.index + m[1].length });
706
+ }
707
+ }
643
708
  if (active.has("social_id_at")) {
644
709
  SVNR_AT_RE.lastIndex = 0;
645
710
  let m;
@@ -773,7 +838,7 @@ function applyMask(text, findings, strategy = "redact") {
773
838
  const tag = type.toUpperCase();
774
839
  let replacement;
775
840
  if (strategy === "redact") {
776
- replacement = `[REDACTED_${tag}]`;
841
+ replacement = `[MASKED_${tag}]`;
777
842
  } else if (strategy === "replace") {
778
843
  replacement = synthetic(type, value);
779
844
  } else if (strategy === "token") {
@@ -788,7 +853,7 @@ function applyMask(text, findings, strategy = "redact") {
788
853
  }
789
854
 
790
855
  // src/index.ts
791
- var version = "0.7.0";
856
+ var version = "0.8.2";
792
857
  function computeQualityScore(completeness, avgLength, garbageRatio) {
793
858
  const lengthScore = Math.min(avgLength / 500, 1);
794
859
  const noiseScore = Math.max(0, 1 - garbageRatio * 10);
package/dist/index.d.cts CHANGED
@@ -51,10 +51,10 @@ declare function applyMask(text: string, findings: PiiFinding[], strategy?: Mask
51
51
  * result.noise // { garbage_ratio, encoding_ok }
52
52
  *
53
53
  * const clean = mask(text, result.pii, { strategy: "redact" })
54
- * // "Contact: [REDACTED_EMAIL]"
54
+ * // "Contact: [MASKED_EMAIL]"
55
55
  */
56
56
 
57
- declare const version = "0.7.0";
57
+ declare const version = "0.8.2";
58
58
  type QualityGrade = "A" | "B" | "C" | "D";
59
59
  interface PiiSummaryEntry {
60
60
  type: string;
@@ -139,7 +139,7 @@ declare function auditStream(texts: AsyncIterable<string>, options?: AuditOption
139
139
  *
140
140
  * @example
141
141
  * const clean = redactForLlm("TCKN: 12345678950, email: ali@example.com", { locale: "tr" })
142
- * // "TCKN: [REDACTED_NATIONAL_ID_TR], email: [REDACTED_EMAIL]"
142
+ * // "TCKN: [MASKED_NATIONAL_ID_TR], email: [MASKED_EMAIL]"
143
143
  */
144
144
  declare function redactForLlm(text: string, options?: AuditOptions & MaskOptions): string;
145
145
  /**
package/dist/index.d.ts CHANGED
@@ -51,10 +51,10 @@ declare function applyMask(text: string, findings: PiiFinding[], strategy?: Mask
51
51
  * result.noise // { garbage_ratio, encoding_ok }
52
52
  *
53
53
  * const clean = mask(text, result.pii, { strategy: "redact" })
54
- * // "Contact: [REDACTED_EMAIL]"
54
+ * // "Contact: [MASKED_EMAIL]"
55
55
  */
56
56
 
57
- declare const version = "0.7.0";
57
+ declare const version = "0.8.2";
58
58
  type QualityGrade = "A" | "B" | "C" | "D";
59
59
  interface PiiSummaryEntry {
60
60
  type: string;
@@ -139,7 +139,7 @@ declare function auditStream(texts: AsyncIterable<string>, options?: AuditOption
139
139
  *
140
140
  * @example
141
141
  * const clean = redactForLlm("TCKN: 12345678950, email: ali@example.com", { locale: "tr" })
142
- * // "TCKN: [REDACTED_NATIONAL_ID_TR], email: [REDACTED_EMAIL]"
142
+ * // "TCKN: [MASKED_NATIONAL_ID_TR], email: [MASKED_EMAIL]"
143
143
  */
144
144
  declare function redactForLlm(text: string, options?: AuditOptions & MaskOptions): string;
145
145
  /**
package/dist/index.js CHANGED
@@ -21,6 +21,7 @@ var COMPANY_NAME_TR_RE = new RegExp(
21
21
  "gu"
22
22
  );
23
23
  var MERSIS_RE = /\b([1-9]\d{15})\b/g;
24
+ var SGK_RE = /(?:SGK\s*(?:Sicil\s*No(?:su)?|No(?:su)?|Numara(?:s[ıi])?)?|Sigortal[ıi]\s*(?:Sicil\s*)?(?:No|Numara(?:s[ıi])?)|SSK\s*(?:No|Numara(?:s[ıi])?|Sicil))\s*[:#]*\s*(\d{10,11})\b/giu;
24
25
  var POSTAL_CODE_TR_RE = /\b((?:0[1-9]|[1-7]\d|80|81)\d{3})\b/g;
25
26
  var _TR_PROVINCES_SORTED = [
26
27
  "Afyonkarahisar",
@@ -300,12 +301,24 @@ function validEinUs(s) {
300
301
  return !_EIN_INVALID_PREFIXES.has(s.slice(0, 2));
301
302
  }
302
303
  var PESEL_PL_RE = /\b(\d{11})\b/g;
304
+ var NIP_PL_RE = /(?:NIP|Numer\s+NIP|Numer\s+Identyfikacji\s+Podatkowej)\s*[:#]*\s*(\d{10})\b/gi;
303
305
  function validPeselPl(s) {
304
306
  if (s.length !== 11 || !/^\d+$/.test(s)) return false;
305
307
  const weights = [1, 3, 7, 9, 1, 3, 7, 9, 1, 3];
306
308
  const total = weights.reduce((sum, w, i) => sum + w * parseInt(s[i]), 0);
307
309
  return (10 - total % 10) % 10 === parseInt(s[10]);
308
310
  }
311
+ function validNifPt(s) {
312
+ if (s.length !== 9 || !/^\d+$/.test(s) || s[0] === "0") return false;
313
+ let total = 0;
314
+ for (let i = 0; i < 8; i++) total += (9 - i) * parseInt(s[i]);
315
+ const check = (11 - total % 11) % 11;
316
+ return (check >= 10 ? 0 : check) === parseInt(s[8]);
317
+ }
318
+ var NIF_PT_RE = /(?:NIF|N[uú]mero\s+de\s+Contribuinte|Contribuinte)\s*[:#]*\s*(\d{9})\b/gi;
319
+ var PERSONNUMMER_SE_RE = /\b(\d{6,8}[-+]\d{4})\b/g;
320
+ var CPR_DK_RE = /\b(\d{6}-\d{4})\b/g;
321
+ var HETU_FI_RE = /\b(\d{6}[+\-A]\d{3}[0-9A-FHJ-NPR-Y])\b/g;
309
322
  var SVNR_AT_RE = /\b(\d{10})\b/g;
310
323
  function validSvnrAt(s) {
311
324
  if (s.length !== 10 || !/^\d+$/.test(s)) return false;
@@ -331,17 +344,22 @@ var LOCALE_DETECTORS = {
331
344
  "company_name_tr",
332
345
  "mersis_no",
333
346
  "postal_code_tr",
334
- "province_tr"
347
+ "province_tr",
348
+ "sgk_no"
335
349
  ]),
336
350
  us: /* @__PURE__ */ new Set(["ssn", "tax_id_us", "national_id_us", "phone_intl", "company_name_intl"]),
337
351
  eu: /* @__PURE__ */ new Set(["phone_intl", "iban_intl", "company_name_intl"]),
338
- de: /* @__PURE__ */ new Set(["tax_id_de", "social_id_de"]),
339
- fr: /* @__PURE__ */ new Set(["siret_fr", "company_id_fr", "social_id_fr"]),
352
+ de: /* @__PURE__ */ new Set(["tax_id_de", "social_id_de", "social_id_at"]),
353
+ fr: /* @__PURE__ */ new Set(["siret_fr", "company_id_fr", "social_id_fr", "national_id_be"]),
340
354
  it: /* @__PURE__ */ new Set(["national_id_it", "tax_id_it"]),
341
- nl: /* @__PURE__ */ new Set(["national_id_nl", "company_id_nl"]),
355
+ nl: /* @__PURE__ */ new Set(["national_id_nl", "company_id_nl", "national_id_be"]),
342
356
  es: /* @__PURE__ */ new Set(["national_id_es", "tax_id_es"]),
343
357
  uk: /* @__PURE__ */ new Set(["social_id_uk", "tax_id_uk"]),
344
- pl: /* @__PURE__ */ new Set(["national_id_pl"]),
358
+ pl: /* @__PURE__ */ new Set(["national_id_pl", "tax_id_pl"]),
359
+ pt: /* @__PURE__ */ new Set(["tax_id_pt"]),
360
+ sv: /* @__PURE__ */ new Set(["national_id_se"]),
361
+ da: /* @__PURE__ */ new Set(["national_id_dk"]),
362
+ fi: /* @__PURE__ */ new Set(["national_id_fi"]),
345
363
  at: /* @__PURE__ */ new Set(["social_id_at"]),
346
364
  be: /* @__PURE__ */ new Set(["national_id_be"])
347
365
  };
@@ -454,6 +472,14 @@ function detectPii(text, locale = "und") {
454
472
  findings.push({ type: "mersis_no", value: m[1], start: m.index, end: m.index + m[1].length });
455
473
  }
456
474
  }
475
+ if (active.has("sgk_no")) {
476
+ SGK_RE.lastIndex = 0;
477
+ let m;
478
+ while ((m = SGK_RE.exec(t)) !== null) {
479
+ const start = m.index + m[0].lastIndexOf(m[1]);
480
+ findings.push({ type: "sgk_no", value: m[1], start, end: start + m[1].length });
481
+ }
482
+ }
457
483
  if (active.has("postal_code_tr")) {
458
484
  POSTAL_CODE_TR_RE.lastIndex = 0;
459
485
  let m;
@@ -602,6 +628,45 @@ function detectPii(text, locale = "und") {
602
628
  if (validPeselPl(m[1])) findings.push({ type: "national_id_pl", value: m[1], start: m.index, end: m.index + m[1].length });
603
629
  }
604
630
  }
631
+ if (active.has("tax_id_pl")) {
632
+ NIP_PL_RE.lastIndex = 0;
633
+ let m;
634
+ while ((m = NIP_PL_RE.exec(t)) !== null) {
635
+ const start = m.index + m[0].lastIndexOf(m[1]);
636
+ findings.push({ type: "tax_id_pl", value: m[1], start, end: start + m[1].length });
637
+ }
638
+ }
639
+ if (active.has("tax_id_pt")) {
640
+ NIF_PT_RE.lastIndex = 0;
641
+ let m;
642
+ while ((m = NIF_PT_RE.exec(t)) !== null) {
643
+ if (validNifPt(m[1])) {
644
+ const start = m.index + m[0].lastIndexOf(m[1]);
645
+ findings.push({ type: "tax_id_pt", value: m[1], start, end: start + m[1].length });
646
+ }
647
+ }
648
+ }
649
+ if (active.has("national_id_se")) {
650
+ PERSONNUMMER_SE_RE.lastIndex = 0;
651
+ let m;
652
+ while ((m = PERSONNUMMER_SE_RE.exec(t)) !== null) {
653
+ findings.push({ type: "national_id_se", value: m[1], start: m.index, end: m.index + m[1].length });
654
+ }
655
+ }
656
+ if (active.has("national_id_dk")) {
657
+ CPR_DK_RE.lastIndex = 0;
658
+ let m;
659
+ while ((m = CPR_DK_RE.exec(t)) !== null) {
660
+ findings.push({ type: "national_id_dk", value: m[1], start: m.index, end: m.index + m[1].length });
661
+ }
662
+ }
663
+ if (active.has("national_id_fi")) {
664
+ HETU_FI_RE.lastIndex = 0;
665
+ let m;
666
+ while ((m = HETU_FI_RE.exec(t)) !== null) {
667
+ findings.push({ type: "national_id_fi", value: m[1], start: m.index, end: m.index + m[1].length });
668
+ }
669
+ }
605
670
  if (active.has("social_id_at")) {
606
671
  SVNR_AT_RE.lastIndex = 0;
607
672
  let m;
@@ -735,7 +800,7 @@ function applyMask(text, findings, strategy = "redact") {
735
800
  const tag = type.toUpperCase();
736
801
  let replacement;
737
802
  if (strategy === "redact") {
738
- replacement = `[REDACTED_${tag}]`;
803
+ replacement = `[MASKED_${tag}]`;
739
804
  } else if (strategy === "replace") {
740
805
  replacement = synthetic(type, value);
741
806
  } else if (strategy === "token") {
@@ -750,7 +815,7 @@ function applyMask(text, findings, strategy = "redact") {
750
815
  }
751
816
 
752
817
  // src/index.ts
753
- var version = "0.7.0";
818
+ var version = "0.8.2";
754
819
  function computeQualityScore(completeness, avgLength, garbageRatio) {
755
820
  const lengthScore = Math.min(avgLength / 500, 1);
756
821
  const noiseScore = Math.max(0, 1 - garbageRatio * 10);
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@flexorch/audit",
3
- "version": "0.7.0",
3
+ "version": "0.8.2",
4
4
  "description": "Zero-dependency PII + quality + noise audit for LLM datasets (TR/EU/US)",
5
5
  "keywords": [
6
6
  "pii",