flexorch-audit 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,64 @@
1
+ """
2
+ flexorch-audit — zero-dependency PII + quality + noise audit for LLM datasets.
3
+
4
+ from flexorch_audit import audit, mask
5
+
6
+ result = audit(text, locale="tr")
7
+ # {
8
+ # "pii": [{"type": "email", "value": "...", "start": 5, "end": 22}, ...],
9
+ # "quality": {"completeness": 1.0, "avg_length": 342, "duplicate_ratio": None},
10
+ # "noise": {"garbage_ratio": 0.0, "encoding_ok": True},
11
+ # }
12
+
13
+ clean = mask(text, result["pii"], strategy="redact")
14
+ """
15
+
16
+ from ._pii import detect_pii
17
+ from ._quality import quality_metrics
18
+ from ._noise import noise_metrics
19
+ from ._mask import apply_mask
20
+
21
+ __version__ = "0.1.0"
22
+ __all__ = ["audit", "mask", "__version__"]
23
+
24
+
25
def audit(text: str, locale: str = "tr") -> dict:
    """
    Run the PII, quality and noise checks on *text* and bundle the results.

    Args:
        text:   Raw text to analyse.
        locale: Selects the locale-specific PII detectors.
                  "tr"  — Turkish: TCKN, phone_tr, name (default)
                  "us"  — US: SSN, E.164 phone
                  "eu"  — EU: E.164 phone
                  "all" — All detectors (phone_tr takes precedence over generic phone)
                The universal detectors (email, iban, credit_card, ip) run
                whatever the locale.

    Returns:
        A dict with exactly three keys:
          "pii"     — list of {type, value, start, end} sorted by position
          "quality" — {completeness, avg_length, duplicate_ratio}
          "noise"   — {garbage_ratio, encoding_ok}
    """
    pii_findings = detect_pii(text, locale=locale)
    quality = quality_metrics(text)
    noise = noise_metrics(text)
    return {"pii": pii_findings, "quality": quality, "noise": noise}
50
+
51
+
52
def mask(text: str, findings: list[dict], strategy: str = "redact") -> str:
    """
    Mask the PII spans listed in *findings* inside *text*.

    This is the public entry point; the work is delegated to apply_mask().

    Args:
        text:     Original text.
        findings: PII findings, as returned under audit()["pii"].
        strategy: One of "redact" (default), "replace", "token" or "hash".

    Returns:
        The text with every finding replaced according to *strategy*.
    """
    masked = apply_mask(text, findings, strategy)
    return masked
@@ -0,0 +1,57 @@
1
+ import hashlib
2
+
3
+ # Realistic-looking synthetic replacements for strategy="replace"
4
_SYNTHETIC: dict[str, str] = {
    "email": "user@example.com",
    "phone": "+1 000 000 0000",
    "phone_tr": "0500 000 00 00",
    "national_id_tr": "00000000000",
    "ssn": "000-00-0000",
    "iban": "XX00 0000 0000 0000 0000 00",
    "credit_card": "0000 0000 0000 0000",
    "ip": "0.0.0.0",
    "name": "AD SOYAD",
}

_VALID_STRATEGIES = frozenset({"redact", "replace", "token", "hash"})


def apply_mask(text: str, findings: list[dict], strategy: str = "redact") -> str:
    """
    Replace PII spans in *text* according to *strategy*.

    Strategies:
        redact  — [REDACTED_EMAIL], [REDACTED_PHONE_TR], … (default)
        replace — synthetic value from _SYNTHETIC (e.g. user@example.com);
                  falls back to [TYPE] for types without a synthetic value
        token   — <PII_EMAIL_1>, <PII_EMAIL_2>, … numbered per type in the
                  order the findings appear in the text
        hash    — first 16 hex chars of SHA-256(original value), in brackets

    Args:
        text:     Original text. None or "" yields "".
        findings: Dicts with "type", "value", "start" and "end" keys, where
                  start/end index into *text*. Order does not matter.
        strategy: One of the strategy names above.

    Returns:
        The masked text.

    Raises:
        ValueError: If *strategy* is not a known strategy.

    The output is assembled in one left-to-right pass over the original
    text, so every start/end index is used against the unmodified string.
    When findings overlap (for example an IP address inside an email
    address), the earliest-starting, longest finding supplies the
    replacement and the whole overlapping region is removed, so no part of
    an overlapping finding is left unmasked.
    """
    if strategy not in _VALID_STRATEGIES:
        raise ValueError(f"Unknown strategy {strategy!r}. Use: {', '.join(sorted(_VALID_STRATEGIES))}")
    if not text or not findings:
        return text or ""

    # Earliest start first; on equal starts the longest span wins.
    ordered = sorted(findings, key=lambda f: (f["start"], -f["end"]))

    pieces: list[str] = []
    counter: dict[str, int] = {}
    cursor = 0  # index in *text* up to which output has been produced

    for finding in ordered:
        start, end = finding["start"], finding["end"]

        if start < cursor:
            # Overlaps a span that is already masked: swallow any tail that
            # sticks out instead of emitting a second replacement.
            cursor = max(cursor, end)
            continue

        ptype = finding["type"]
        tag = ptype.upper()

        if strategy == "redact":
            replacement = f"[REDACTED_{tag}]"
        elif strategy == "replace":
            replacement = _SYNTHETIC.get(ptype, f"[{tag}]")
        elif strategy == "token":
            counter[ptype] = counter.get(ptype, 0) + 1
            replacement = f"<PII_{tag}_{counter[ptype]}>"
        else:  # hash
            h = hashlib.sha256(finding["value"].encode()).hexdigest()[:16]
            replacement = f"[{h}]"

        pieces.append(text[cursor:start])
        pieces.append(replacement)
        cursor = end

    pieces.append(text[cursor:])
    return "".join(pieces)
@@ -0,0 +1,35 @@
1
+ import unicodedata
2
+
3
# Unicode general categories counted as non-printable / garbage:
# Cc = control, Cs = surrogate, Co = private-use, Cn = unassigned.
_GARBAGE_CATS = frozenset({"Cc", "Cs", "Co", "Cn"})

# Ordinary whitespace is never garbage, even though tab, newline, carriage
# return, vertical tab and form feed are control characters (Cc).
_SAFE_WHITESPACE = frozenset(" \t\n\r\x0b\x0c")


def _is_garbage(ch: str) -> bool:
    """
    Return True when the single character *ch* counts as garbage.

    Safe whitespace is never garbage. Otherwise a character is garbage when
    it is the Unicode replacement character (U+FFFD) or its general category
    is one of _GARBAGE_CATS.
    """
    if ch in _SAFE_WHITESPACE:
        return False
    if ch == "\ufffd":  # U+FFFD REPLACEMENT CHARACTER
        return True
    return unicodedata.category(ch) in _GARBAGE_CATS
15
+
16
+
17
def noise_metrics(text: str) -> dict:
    """
    Compute noise metrics for a single text record.

    Returns:
        A dict with:
          garbage_ratio — share of characters flagged by _is_garbage()
                          (control / surrogate / private-use / unassigned
                          characters and U+FFFD), rounded to 4 decimals
          encoding_ok   — False when the text contains a U+FFFD replacement
                          character, which typically indicates a
                          transcoding error

        Empty or None input yields garbage_ratio 0.0 and encoding_ok True.
    """
    if not text:
        return {"garbage_ratio": 0.0, "encoding_ok": True}

    garbage_count = 0
    for ch in text:
        if _is_garbage(ch):
            garbage_count += 1

    ratio = round(garbage_count / len(text), 4)
    has_replacement_char = "\ufffd" in text  # U+FFFD REPLACEMENT CHARACTER
    return {"garbage_ratio": ratio, "encoding_ok": not has_replacement_char}
flexorch_audit/_pii.py ADDED
@@ -0,0 +1,163 @@
1
+ import re
2
+
3
+ # ── Universal detectors ──────────────────────────────────────────────────────
4
+
5
# Email — local part, "@", domain labels, and an alphabetic TLD of 2+ letters.
EMAIL_RE = re.compile(
    r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b"
)

# International phone in E.164 style — a leading "+" is mandatory.
# Used for locale=us/eu; Turkish mobiles are handled by PHONE_TR_RE.
# The pattern alone accepts as few as 9 digits; detect_pii() additionally
# requires at least 10 digits in the matched text.
PHONE_INTL_RE = re.compile(
    r"\+\d{1,3}[\s\-\.]?\(?\d{1,4}\)?[\s\-\.]?\d{3,4}[\s\-\.]?\d{4}\b"
)

# IBAN — ISO 13616 shape (any country, including TR): two letters, two check
# digits, then 11-30 alphanumerics with no spaces. Format check only.
IBAN_RE = re.compile(
    r"\b[A-Z]{2}\d{2}[0-9A-Z]{11,30}\b"
)

# Credit card — four groups of four digits, each separated by a space or a
# hyphen. The Luhn checksum is verified separately by the caller.
CC_RE = re.compile(
    r"\b\d{4}[ \-]\d{4}[ \-]\d{4}[ \-]\d{4}\b"
)

# IPv4 — four dot-separated octets, each limited to 0-255.
IPV4_RE = re.compile(
    r"\b(?:(?:25[0-5]|2[0-4]\d|[01]?\d\d?)\.){3}(?:25[0-5]|2[0-4]\d|[01]?\d\d?)\b"
)
23
+
24
+ # ── Turkish detectors ────────────────────────────────────────────────────────
25
+
26
# Turkish mobile: "+90 5xx…", "0 5xx…" or a bare "5xx…" (10 digits from the 5).
#
# The prefix is split into two alternatives on purpose:
#   * "\+90" needs no boundary check. A "\b" in front of "+" only matches when
#     the "+" follows a word character, so "\b\+90" missed the usual
#     "Tel: +90 5xx …" and "+905xx…" spellings.
#   * "0" / bare numbers must not follow a word character, which keeps the
#     pattern from firing in the middle of a longer digit run or identifier.
# Optional whitespace is only consumed after a prefix, so a match never starts
# with whitespace.
PHONE_TR_RE = re.compile(
    r"(?:\+90\s*|(?<!\w)(?:0\s*)?)5\d{2}\s*\d{3}\s*\d{2}\s*\d{2}\b"
)

# TCKN — 11 digits, first digit non-zero. This is a shape check only; the
# checksum is verified separately by _valid_tckn().
TCKN_RE = re.compile(r"\b([1-9]\d{10})\b")

# Turkish field labels that introduce a person's name.
_NAME_PREFIX_TR = (
    r"(?:Ad[ıi]\s*(?:Soyad[ıi])?|Soyad[ıi]|İsim|"
    r"Müşteri\s+Ad[ıi]|Yetkili(?:\s+Kişi)?|Çalışan\s+Ad[ıi]|"
    r"Personel\s+Ad[ıi]|Kişi\s+Ad[ıi]|Satıcı\s+Ad[ıi]|"
    r"Alıcı\s+Ad[ıi]|İlgili\s+Kişi|Hesap\s+Sahibi)"
)
# English field labels. A bare "Name" is accepted unless it is directly
# preceded by "User " (so "User Name" is not treated as a person's name).
_NAME_PREFIX_EN = (
    r"(?:Full\s+Name|Customer\s+Name|Employee\s+Name|"
    r"Contact\s+Name|Authorized\s+(?:By|Person)|Account\s+Holder|"
    r"(?<!\bUser\s)Name)"
)
# One to three capitalised words (Turkish letters included). This is the only
# capturing group in NAME_RE.
_NAME_VALUE = r"([A-ZÇĞİÖŞÜ][a-zçğışöşü]+(?:\s+[A-ZÇĞİÖŞÜ][a-zçğışöşü]+){0,2})"

# Label-prefixed name detection (TR and EN labels): label, then ":" or "-",
# then the name. NLP-based free-standing name detection is out of scope for
# v0.1 — requires NER.
NAME_RE = re.compile(
    rf"(?:{_NAME_PREFIX_TR}|{_NAME_PREFIX_EN})\s*[:\-]\s*{_NAME_VALUE}",
    re.UNICODE,
)
51
+
52
+ # ── US detectors ─────────────────────────────────────────────────────────────
53
+
54
# SSN — ###-##-#### with mandatory hyphens, to minimise false positives.
# The lookaheads reject area numbers 000, 666 and 9xx, group number 00 and
# serial number 0000.
SSN_RE = re.compile(
    r"\b(?!000|666|9\d{2})\d{3}-(?!00)\d{2}-(?!0000)\d{4}\b"
)
56
+
57
+ # ── Validation helpers ────────────────────────────────────────────────────────
58
+
59
+
60
def _valid_tckn(s: str) -> bool:
    """
    Return True when *s* is an 11-digit string passing both TCKN check digits.

    Rules applied, with d1..d11 the digits from left to right:
      * exactly 11 decimal digits, and d1 is not zero
      * d10 == ((d1 + d3 + d5 + d7 + d9) * 7 - (d2 + d4 + d6 + d8)) mod 10
      * d11 == (d1 + d2 + … + d10) mod 10

    Anything that is not an 11-character string of decimal digits yields
    False rather than raising.
    """
    if len(s) != 11 or not s.isdecimal():
        return False
    d = [int(c) for c in s]
    if d[0] == 0:
        return False

    sum_odd = d[0] + d[2] + d[4] + d[6] + d[8]   # 1st, 3rd, … 9th digit
    sum_even = d[1] + d[3] + d[5] + d[7]         # 2nd, 4th, … 8th digit
    # Python's % never returns a negative value, so the subtraction is safe.
    if (sum_odd * 7 - sum_even) % 10 != d[9]:
        return False
    return sum(d[:10]) % 10 == d[10]
70
+
71
+
72
def _luhn(number: str) -> bool:
    """
    Return True when the digits in *number* pass the Luhn checksum
    (ISO/IEC 7812).

    Non-digit characters such as spaces and hyphens are ignored. Inputs with
    fewer than 13 or more than 19 digits are rejected.
    """
    digits = [int(ch) for ch in number if ch.isdigit()]
    if len(digits) < 13 or len(digits) > 19:
        return False

    checksum = 0
    double = False  # every second digit, counted from the right, is doubled
    for value in reversed(digits):
        if double:
            value *= 2
            if value > 9:
                value -= 9
        checksum += value
        double = not double
    return checksum % 10 == 0
85
+
86
+
87
+ # ── Locale registry ───────────────────────────────────────────────────────────
88
+
89
# Locale-specific detectors, keyed by lower-case locale code.
_LOCALE_DETECTORS: dict[str, set[str]] = {
    "tr": {"national_id_tr", "phone_tr", "name"},
    "us": {"ssn", "phone"},
    "eu": {"phone"},
}
# Detectors that run for every locale.
_UNIVERSAL: set[str] = {"email", "iban", "credit_card", "ip"}


def _active(locale: str) -> set[str]:
    """
    Return the names of the detectors enabled for *locale*.

    The locale is matched case-insensitively and surrounding whitespace is
    ignored, so "TR" and " tr " select the Turkish detectors instead of
    silently falling back to the universal set.

    * "all" enables the universal detectors plus every locale's detectors,
      minus the generic "phone" detector, which is dropped in favour of the
      more specific "phone_tr".
    * A known locale enables the universal detectors plus its own.
    * Any other value enables the universal detectors only.

    A fresh set is returned on every call, so callers may mutate it.
    """
    key = locale.strip().lower() if isinstance(locale, str) else locale

    if key == "all":
        active: set[str] = set(_UNIVERSAL)
        for detectors in _LOCALE_DETECTORS.values():
            active |= detectors
        # phone_tr is more specific than generic phone; skip generic when both active
        if "phone_tr" in active:
            active.discard("phone")
        return active

    return _UNIVERSAL | _LOCALE_DETECTORS.get(key, set())
107
+
108
+
109
+ # ── Public detector ───────────────────────────────────────────────────────────
110
+
111
+
112
def detect_pii(text: str, locale: str = "tr") -> list[dict]:
    """
    Scan *text* with every detector enabled for *locale*.

    Detectors with an extra acceptance check:
      * phone          — the match must contain at least 10 digits
      * credit_card    — the match must pass _luhn()
      * national_id_tr — the match must pass _valid_tckn()
    For "name", only the captured name is reported, not the label before it.

    Returns:
        A list of {"type": str, "value": str, "start": int, "end": int}
        dicts sorted by "start". None or empty text yields an empty list.
    """
    enabled = _active(locale)
    haystack = text or ""
    hits: list[dict] = []

    def record(kind, match, value_group=0, span_group=0):
        # Append one finding; the value and the span may come from different groups.
        hits.append({
            "type": kind,
            "value": match.group(value_group),
            "start": match.start(span_group),
            "end": match.end(span_group),
        })

    if "email" in enabled:
        for match in EMAIL_RE.finditer(haystack):
            record("email", match)

    if "phone" in enabled:
        for match in PHONE_INTL_RE.finditer(haystack):
            digit_count = sum(ch.isdigit() for ch in match.group())
            if digit_count >= 10:
                record("phone", match)

    if "iban" in enabled:
        for match in IBAN_RE.finditer(haystack):
            record("iban", match)

    if "credit_card" in enabled:
        for match in CC_RE.finditer(haystack):
            if _luhn(match.group()):
                record("credit_card", match)

    if "ip" in enabled:
        for match in IPV4_RE.finditer(haystack):
            record("ip", match)

    if "phone_tr" in enabled:
        for match in PHONE_TR_RE.finditer(haystack):
            record("phone_tr", match)

    if "national_id_tr" in enabled:
        for match in TCKN_RE.finditer(haystack):
            if _valid_tckn(match.group(1)):
                # Value comes from group 1, span from the whole match.
                record("national_id_tr", match, value_group=1)

    if "name" in enabled:
        for match in NAME_RE.finditer(haystack):
            name_group = match.lastindex
            record("name", match, value_group=name_group, span_group=name_group)

    if "ssn" in enabled:
        for match in SSN_RE.finditer(haystack):
            record("ssn", match)

    # Stable sort: findings with the same start keep detector order.
    hits.sort(key=lambda hit: hit["start"])
    return hits
@@ -0,0 +1,16 @@
1
def quality_metrics(text: str) -> dict:
    """
    Compute quality metrics for a single text record.

    Returns:
        A dict with:
          completeness    — 1.0 when the text still has content after
                            stripping surrounding whitespace, otherwise 0.0
          avg_length      — number of characters in the stripped text
          duplicate_ratio — always None here; a single record cannot be
                            compared with anything, so compute it across
                            your full dataset

        None is treated like an empty string.
    """
    content = text.strip() if text else ""
    length = len(content)
    completeness = 1.0 if length else 0.0
    return {
        "completeness": completeness,
        "avg_length": length,
        "duplicate_ratio": None,
    }
@@ -0,0 +1,107 @@
1
+ Metadata-Version: 2.4
2
+ Name: flexorch-audit
3
+ Version: 0.1.0
4
+ Summary: Zero-dependency PII + quality + noise audit for LLM datasets (TR/EU/US)
5
+ Project-URL: Homepage, https://github.com/flexorch/flexorch-audit
6
+ Project-URL: Issues, https://github.com/flexorch/flexorch-audit/issues
7
+ License: MIT
8
+ License-File: LICENSE
9
+ Keywords: audit,dataset,gdpr,kvkk,llm,pii,privacy,tckn
10
+ Classifier: Development Status :: 4 - Beta
11
+ Classifier: Intended Audience :: Developers
12
+ Classifier: License :: OSI Approved :: MIT License
13
+ Classifier: Programming Language :: Python :: 3
14
+ Classifier: Programming Language :: Python :: 3.10
15
+ Classifier: Programming Language :: Python :: 3.11
16
+ Classifier: Programming Language :: Python :: 3.12
17
+ Classifier: Programming Language :: Python :: 3.13
18
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
19
+ Classifier: Topic :: Text Processing
20
+ Requires-Python: >=3.10
21
+ Description-Content-Type: text/markdown
22
+
23
+ # flexorch-audit
24
+
25
+ Zero-dependency PII + quality + noise audit for LLM datasets. Answers one question: **is this dataset ready for LLM training?**
26
+
27
+ - **PII detection** — email, phone (TR + E.164), credit card (Luhn), IP, TCKN, IBAN, SSN, label-prefixed names
28
+ - **Quality metrics** — completeness, average length, duplicate ratio
29
+ - **Noise metrics** — garbage character ratio, encoding health
30
+ - **Masking** — redact / replace / token / hash strategies
31
+ - **Zero runtime dependencies** — pure Python stdlib, Python 3.10+
32
+
33
+ ```python
34
+ from flexorch_audit import audit, mask
35
+
36
+ result = audit(text, locale="tr")
37
+ # {
38
+ # "pii": [{"type": "email", "value": "ali@example.com", "start": 8, "end": 23}],
39
+ # "quality": {"completeness": 1.0, "avg_length": 342, "duplicate_ratio": None},
40
+ # "noise": {"garbage_ratio": 0.0, "encoding_ok": True},
41
+ # }
42
+
43
+ clean = mask(text, result["pii"], strategy="redact")
44
+ # "Contact: [REDACTED_EMAIL]"
45
+ ```
46
+
47
+ ## Install
48
+
49
+ ```bash
50
+ pip install flexorch-audit
51
+ ```
52
+
53
+ ## Locale support
54
+
55
+ | `locale` | Active detectors |
56
+ |----------|-----------------|
57
+ | `"tr"` (default) | email, iban, credit_card, ip + TCKN, phone_tr, name |
58
+ | `"us"` | email, iban, credit_card, ip + SSN, E.164 phone |
59
+ | `"eu"` | email, iban, credit_card, ip + E.164 phone |
60
+ | `"all"` | All of the above (phone_tr takes precedence over generic phone) |
61
+
62
+ ## PII types
63
+
64
+ | Type | Description | Locale |
65
+ |------|-------------|--------|
66
+ | `email` | Email address (simplified pattern, not a full RFC 5321 parser) | all |
67
+ | `iban` | ISO 13616 IBAN (any country) | all |
68
+ | `credit_card` | 16-digit groups, Luhn-validated | all |
69
+ | `ip` | IPv4 address | all |
70
+ | `phone_tr` | Turkish mobile (optional +90/0 prefix, then 10 digits starting with 5) | tr |
71
+ | `national_id_tr` | TCKN — 11-digit modular arithmetic checksum | tr |
72
+ | `name` | Label-prefixed name (e.g. "Adı: Ali Yıldız", "Full Name: Jane Doe") | tr |
73
+ | `phone` | E.164 international phone | us, eu |
74
+ | `ssn` | US Social Security Number (###-##-####) | us |
75
+
76
+ ## Masking strategies
77
+
78
+ | Strategy | Example output |
79
+ |----------|----------------|
80
+ | `redact` (default) | `[REDACTED_EMAIL]` |
81
+ | `replace` | `user@example.com` (realistic synthetic) |
82
+ | `token` | `<PII_EMAIL_1>` (unique per type) |
83
+ | `hash` | `[3d4f9a1b2c8e7f0a]` (SHA-256 first 16 hex chars) |
84
+
85
+ ## Quality & noise
86
+
87
+ `duplicate_ratio` is always `None` (`null` in JSON) for single-string input. To compute it across a dataset:
88
+
89
+ ```python
90
+ texts = [record["text"] for record in dataset]
91
+ results = [audit(t) for t in texts]
92
+
93
+ seen = set()
94
+ duplicates = sum(1 for t in texts if t in seen or seen.add(t))
95
+ duplicate_ratio = duplicates / len(texts)
96
+ ```
97
+
98
+ ## Limitations (v0.1)
99
+
100
+ - Free-standing name detection (without a label prefix) requires NLP/NER — not included.
101
+ - `duplicate_ratio` is always `None` for a single `audit()` call; compute it across your dataset manually (see above).
102
+ - IPv6 not detected.
103
+ - IBAN format-only check; mod-97 validation not performed.
104
+
105
+ ## License
106
+
107
+ MIT
@@ -0,0 +1,9 @@
1
+ flexorch_audit/__init__.py,sha256=-Q5t_QWsIEXvb0ejeaT54DZhheQO3PGpxs1sovQFJ9A,2026
2
+ flexorch_audit/_mask.py,sha256=HoiSPPs3qVjyXtb6Nvp9uaR1PcsEKm0vx0mQQ9spsvI,2015
3
+ flexorch_audit/_noise.py,sha256=OLEuzWSzLghzx1H8ZgkFBhvPXirgxUrYrKmrEdwyNyc,1159
4
+ flexorch_audit/_pii.py,sha256=l4kslkZJOZ9kRCt8b7sZQFGqLoA9Gwmz1TdvtEtOnN4,6569
5
+ flexorch_audit/_quality.py,sha256=pRcYNn5a_Zb3VBYNObJ8aGD7-qxP0qMbe2RSfV5c3p4,614
6
+ flexorch_audit-0.1.0.dist-info/METADATA,sha256=3OT1IKrIq0qYP6oYZMHLRyU_L9MLNh38tI2lKs5HJAE,3768
7
+ flexorch_audit-0.1.0.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
8
+ flexorch_audit-0.1.0.dist-info/licenses/LICENSE,sha256=KWRC6Lpbo-eKH92uX2ZbYVZkqIzy3wGItkgxOa7bjGs,1065
9
+ flexorch_audit-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.29.0
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 FlexOrch
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.