PyPI - flexorch-audit - Versions diffs - 0.1.0__py3-none-any.whl - Mend

flexorch-audit 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9) hide show

flexorch_audit/__init__.py +64 -0
flexorch_audit/_mask.py +57 -0
flexorch_audit/_noise.py +35 -0
flexorch_audit/_pii.py +163 -0
flexorch_audit/_quality.py +16 -0
flexorch_audit-0.1.0.dist-info/METADATA +107 -0
flexorch_audit-0.1.0.dist-info/RECORD +9 -0
flexorch_audit-0.1.0.dist-info/WHEEL +4 -0
flexorch_audit-0.1.0.dist-info/licenses/LICENSE +21 -0

flexorch_audit/__init__.py ADDED Viewed

@@ -0,0 +1,64 @@
+"""
+flexorch-audit — zero-dependency PII + quality + noise audit for LLM datasets.
+    from flexorch_audit import audit, mask
+    result = audit(text, locale="tr")
+    # {
+    #   "pii": [{"type": "email", "value": "...", "start": 5, "end": 22}, ...],
+    #   "quality": {"completeness": 1.0, "avg_length": 342, "duplicate_ratio": None},
+    #   "noise": {"garbage_ratio": 0.0, "encoding_ok": True},
+    # }
+    clean = mask(text, result["pii"], strategy="redact")
+"""
+from ._pii import detect_pii
+from ._quality import quality_metrics
+from ._noise import noise_metrics
+from ._mask import apply_mask
+# Package version string; the wheel metadata for this release carries the same value.
+__version__ = "0.1.0"
+# Public API of the package: the two entry points defined in this module plus the version.
+__all__ = ["audit", "mask", "__version__"]
+def audit(text: str, locale: str = "tr") -> dict:
+    """
+    Run the PII, quality and noise checks on *text* and bundle the results.
+    Args:
+        text:   Raw text of a single record.
+        locale: Selects the locale-specific PII detectors.
+                "tr"  — Turkish: TCKN, phone_tr, name  (default)
+                "us"  — US: SSN, E.164 phone
+                "eu"  — EU: E.164 phone
+                "all" — All detectors (phone_tr takes precedence over generic phone)
+                The universal detectors (email, iban, credit_card, ip) run for
+                every locale; an unrecognised locale gets only those.
+    Returns:
+        A dict with three keys:
+            "pii"     — findings {type, value, start, end}, sorted by position
+            "quality" — {completeness, avg_length, duplicate_ratio}
+            "noise"   — {garbage_ratio, encoding_ok}
+    """
+    pii_findings = detect_pii(text, locale=locale)
+    quality = quality_metrics(text)
+    noise = noise_metrics(text)
+    return {"pii": pii_findings, "quality": quality, "noise": noise}
+def mask(text: str, findings: list[dict], strategy: str = "redact") -> str:
+    """
+    Return *text* with the given PII findings masked.
+    Thin public wrapper around apply_mask().
+    Args:
+        text:     Original text.
+        findings: Findings as produced by audit()["pii"].
+        strategy: One of "redact" (default), "replace", "token" or "hash".
+    Returns:
+        The masked text.
+    Raises:
+        ValueError: If *strategy* is not one of the four names above.
+    """
+    masked = apply_mask(text, findings, strategy)
+    return masked

flexorch_audit/_mask.py ADDED Viewed

@@ -0,0 +1,57 @@
+import hashlib
+# Realistic-looking synthetic replacements for strategy="replace"
+_SYNTHETIC: dict[str, str] = {
+    "email": "user@example.com",
+    "phone": "+1 000 000 0000",
+    "phone_tr": "0500 000 00 00",
+    "national_id_tr": "00000000000",
+    "ssn": "000-00-0000",
+    "iban": "XX00 0000 0000 0000 0000 00",
+    "credit_card": "0000 0000 0000 0000",
+    "ip": "0.0.0.0",
+    "name": "AD SOYAD",
+}
+_VALID_STRATEGIES = frozenset({"redact", "replace", "token", "hash"})
+def _merge_spans(text: str, findings: list[dict]) -> list[dict]:
+    """Return copies of *findings* in text order with overlapping spans merged into one."""
+    merged: list[dict] = []
+    # Among findings that start at the same offset the widest one comes first,
+    # so it supplies the type label of the merged span.
+    for finding in sorted(findings, key=lambda f: (f["start"], -f["end"])):
+        last = merged[-1] if merged else None
+        if last is not None and finding["start"] < last["end"]:
+            if finding["end"] > last["end"]:
+                # Widen the previous span so no part of either finding stays visible,
+                # and refresh its value so strategy="hash" digests the whole masked text.
+                last["end"] = finding["end"]
+                last["value"] = text[last["start"] : last["end"]]
+            continue
+        # Copy so the caller's finding dicts are never mutated.
+        merged.append(dict(finding))
+    return merged
+def apply_mask(text: str, findings: list[dict], strategy: str = "redact") -> str:
+    """
+    Replace PII spans in *text* according to *strategy*.
+    Strategies:
+        redact  — [REDACTED_EMAIL], [REDACTED_PHONE_TR], …  (default)
+        replace — realistic synthetic value (e.g. user@example.com); types without
+                  a synthetic value fall back to [TYPE]
+        token   — <PII_EMAIL_1>, <PII_EMAIL_2>, …  numbered per type in order of
+                  appearance in the text
+        hash    — first 16 hex chars of SHA-256(original_value), in brackets
+    Args:
+        text:     Original text; None or "" yields "".
+        findings: Dicts with "type", "value", "start" and "end" keys. They may be
+                  in any order; overlapping or duplicate spans are merged and
+                  masked once, labelled with the type of the earliest (widest) one.
+        strategy: One of the four names above.
+    Returns:
+        The masked text. With no findings the text is returned unchanged.
+    Raises:
+        ValueError: If *strategy* is not a known strategy.
+    """
+    if strategy not in _VALID_STRATEGIES:
+        raise ValueError(f"Unknown strategy {strategy!r}. Use: {', '.join(sorted(_VALID_STRATEGIES))}")
+    if not text or not findings:
+        return text or ""
+    pieces: list[str] = []
+    counter: dict[str, int] = {}
+    cursor = 0
+    # Single forward pass: untouched text and replacements are collected in order,
+    # so offsets always refer to the original text and never shift.
+    for span in _merge_spans(text, findings):
+        ptype = span["type"]
+        counter[ptype] = counter.get(ptype, 0) + 1
+        tag = ptype.upper()
+        if strategy == "redact":
+            replacement = f"[REDACTED_{tag}]"
+        elif strategy == "replace":
+            replacement = _SYNTHETIC.get(ptype, f"[{tag}]")
+        elif strategy == "token":
+            replacement = f"<PII_{tag}_{counter[ptype]}>"
+        else:  # hash
+            h = hashlib.sha256(span["value"].encode()).hexdigest()[:16]
+            replacement = f"[{h}]"
+        pieces.append(text[cursor : span["start"]])
+        pieces.append(replacement)
+        cursor = span["end"]
+    pieces.append(text[cursor:])
+    return "".join(pieces)

flexorch_audit/_noise.py ADDED Viewed

@@ -0,0 +1,35 @@
+import unicodedata
+# Unicode general categories that indicate non-printable / garbage characters.
+# Cc=control, Cs=surrogate, Co=private-use, Cn=unassigned
+# (Cf "format" characters are not in this set, so they are not counted as garbage.)
+_GARBAGE_CATS = frozenset({"Cc", "Cs", "Co", "Cn"})
+# Ordinary whitespace is never garbage. Tab, LF, CR, VT and FF are category Cc
+# and would otherwise be counted; the plain space is category Zs and is listed
+# only for completeness.
+_SAFE_WHITESPACE = frozenset(" \t\n\r\x0b\x0c")
+def _is_garbage(ch: str) -> bool:
+    """
+    Return True when the single character *ch* counts as noise.
+    Noise means the Unicode replacement character (U+FFFD) or any character
+    whose general category is in _GARBAGE_CATS, except the whitespace listed
+    in _SAFE_WHITESPACE.
+    """
+    # U+FFFD has general category "So", so the category test below would miss it.
+    # It is written as an escape so the source file itself never contains a raw
+    # replacement character that could be mistaken for (or mangled by) a
+    # transcoding error.
+    if ch == "\ufffd":
+        return True
+    if ch in _SAFE_WHITESPACE:
+        return False
+    return unicodedata.category(ch) in _GARBAGE_CATS
+def noise_metrics(text: str) -> dict:
+    """
+    Compute noise metrics for a single text record.
+    Args:
+        text: The record to inspect; None or "" is treated as clean.
+    Returns:
+        garbage_ratio — fraction of characters (rounded to 4 decimals) that are
+                        control/surrogate/private-use/unassigned characters or
+                        Unicode replacement characters (U+FFFD)
+        encoding_ok   — False when U+FFFD replacement characters are present,
+                        which typically indicates a transcoding error
+    """
+    if not text:
+        return {"garbage_ratio": 0.0, "encoding_ok": True}
+    garbage = sum(1 for ch in text if _is_garbage(ch))
+    return {
+        "garbage_ratio": round(garbage / len(text), 4),
+        # Escape instead of a raw U+FFFD literal: a raw replacement character in
+        # source is indistinguishable from a decoding artefact.
+        "encoding_ok": "\ufffd" not in text,
+    }

flexorch_audit/_pii.py ADDED Viewed

@@ -0,0 +1,163 @@
+import re
+# ── Universal detectors ──────────────────────────────────────────────────────
+# Simplified e-mail pattern: local part, "@", dotted domain, alphabetic TLD of 2+ letters.
+EMAIL_RE = re.compile(r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b")
+# E.164-style international phone — requires a leading "+"; groups may be separated
+# by one space, hyphen or dot, and the second group may be parenthesised.
+# The pattern alone admits as few as 9 digits; detect_pii enforces the 10-digit minimum.
+# Used for locale=us/eu. TR phones covered by PHONE_TR_RE.
+PHONE_INTL_RE = re.compile(
+    r"\+\d{1,3}[\s\-\.]?\(?\d{1,4}\)?[\s\-\.]?\d{3,4}[\s\-\.]?\d{4}\b"
+)
+# IBAN — ISO 13616 layout (all countries, including TR): two uppercase letters,
+# two digits, then 11-30 uppercase alphanumerics written without spaces.
+# Format check only; no mod-97 validation is performed.
+IBAN_RE = re.compile(r"\b[A-Z]{2}\d{2}[0-9A-Z]{11,30}\b")
+# Credit card — four groups of four digits, each separated by a space or hyphen;
+# unseparated 16-digit runs are not matched. detect_pii keeps only Luhn-valid matches.
+CC_RE = re.compile(r"\b\d{4}[ \-]\d{4}[ \-]\d{4}[ \-]\d{4}\b")
+# IPv4 — four dot-separated octets, each limited to the range 0-255
+IPV4_RE = re.compile(
+    r"\b(?:(?:25[0-5]|2[0-4]\d|[01]?\d\d?)\.){3}(?:25[0-5]|2[0-4]\d|[01]?\d\d?)\b"
+)
+# ── Turkish detectors ────────────────────────────────────────────────────────
+# Turkish mobile: 10 digits starting with 5, optionally preceded by +90 or 0,
+# e.g. "+90 532 123 45 67", "+905321234567", "0532 123 45 67", "5321234567".
+# The left edge is a lookbehind rather than \b: "+" is not a word character, so
+# \b can never match in front of "+90" at the start of the text or after a
+# space, which left "+905321234567" undetected and "+90 532 ..." reported
+# without its prefix. Whitespace is only allowed after a prefix, so a match
+# never begins with a stray space.
+PHONE_TR_RE = re.compile(r"(?<!\w)(?:(?:\+90|0)\s*)?5\d{2}\s*\d{3}\s*\d{2}\s*\d{2}\b")
+# TCKN — first digit non-zero, 11 digits, checksum-validated below
+TCKN_RE = re.compile(r"\b([1-9]\d{10})\b")
+# Turkish field labels that introduce a person's name
+# (e.g. "Adı Soyadı", "Müşteri Adı", "Hesap Sahibi"). Non-capturing.
+_NAME_PREFIX_TR = (
+    r"(?:Ad[ıi]\s*(?:Soyad[ıi])?|Soyad[ıi]|İsim|"
+    r"Müşteri\s+Ad[ıi]|Yetkili(?:\s+Kişi)?|Çalışan\s+Ad[ıi]|"
+    r"Personel\s+Ad[ıi]|Kişi\s+Ad[ıi]|Satıcı\s+Ad[ıi]|"
+    r"Alıcı\s+Ad[ıi]|İlgili\s+Kişi|Hesap\s+Sahibi)"
+)
+# English field labels. The lookbehind stops the bare "Name" alternative from
+# matching when it is preceded by "User " (i.e. the label "User Name"). Non-capturing.
+_NAME_PREFIX_EN = (
+    r"(?:Full\s+Name|Customer\s+Name|Employee\s+Name|"
+    r"Contact\s+Name|Authorized\s+(?:By|Person)|Account\s+Holder|"
+    r"(?<!\bUser\s)Name)"
+)
+# One to three capitalised words, Turkish letters included. This is the only
+# capturing group in NAME_RE, so it is what detect_pii reads via m.lastindex.
+_NAME_VALUE = r"([A-ZÇĞİÖŞÜ][a-zçğışöşü]+(?:\s+[A-ZÇĞİÖŞÜ][a-zçğışöşü]+){0,2})"
+# Label-prefixed name detection (TR and EN labels): a label, then ":" or "-",
+# then the name. NLP-based free-standing name detection is out of scope for
+# v0.1 — requires NER.
+NAME_RE = re.compile(
+    rf"(?:{_NAME_PREFIX_TR}|{_NAME_PREFIX_EN})\s*[:\-]\s*{_NAME_VALUE}",
+    re.UNICODE,
+)
+# ── US detectors ─────────────────────────────────────────────────────────────
+# SSN — hyphens required to minimise false positives.
+# The lookaheads reject area numbers 000, 666 and 900-999, group number 00
+# and serial number 0000.
+SSN_RE = re.compile(r"\b(?!000|666|9\d{2})\d{3}-(?!00)\d{2}-(?!0000)\d{4}\b")
+# ── Validation helpers ────────────────────────────────────────────────────────
+def _valid_tckn(s: str) -> bool:
+    """
+    Check the two trailing check digits of an 11-digit TCKN candidate.
+    Counting digits from 1, the 10th digit must equal
+    (7 * (d1 + d3 + d5 + d7 + d9) - (d2 + d4 + d6 + d8)) mod 10 and the 11th
+    digit must equal (d1 + ... + d10) mod 10. Strings that are not exactly
+    11 characters long or that start with "0" are rejected outright.
+    """
+    if len(s) != 11:
+        return False
+    if s[0] == "0":
+        return False
+    digits = list(map(int, s))
+    odd_total = sum(digits[0:9:2])
+    even_total = sum(digits[1:8:2])
+    tenth_ok = (odd_total * 7 - even_total) % 10 == digits[9]
+    return tenth_ok and sum(digits[:10]) % 10 == digits[10]
+def _luhn(number: str) -> bool:
+    """
+    Return True when the digits in *number* pass the Luhn checksum (ISO/IEC 7812).
+    Non-digit characters such as spaces and hyphens are ignored. Inputs with
+    fewer than 13 or more than 19 digits are rejected.
+    """
+    reversed_digits = [int(ch) for ch in number if ch.isdigit()][::-1]
+    if len(reversed_digits) < 13 or len(reversed_digits) > 19:
+        return False
+    # Counting from the right: every second digit is doubled, and a two-digit
+    # product is reduced by 9 (same as adding its digits).
+    untouched = sum(reversed_digits[0::2])
+    doubled = sum(d * 2 - 9 if d > 4 else d * 2 for d in reversed_digits[1::2])
+    return (untouched + doubled) % 10 == 0
+# ── Locale registry ───────────────────────────────────────────────────────────
+_LOCALE_DETECTORS: dict[str, set[str]] = {
+    "tr": {"national_id_tr", "phone_tr", "name"},
+    "us": {"ssn", "phone"},
+    "eu": {"phone"},
+}
+_UNIVERSAL: set[str] = {"email", "iban", "credit_card", "ip"}
+def _active(locale: str) -> set[str]:
+    """
+    Return the names of the detectors to run for *locale*.
+    "all" enables the universal detectors plus every locale-specific one,
+    including both "phone_tr" and the generic "phone"; detect_pii resolves
+    overlaps between those two in favour of "phone_tr". Any other value gets
+    the universal detectors plus that locale's own; an unrecognised locale
+    gets the universal detectors only. A fresh set is returned on every call.
+    """
+    if locale == "all":
+        return set().union(_UNIVERSAL, *_LOCALE_DETECTORS.values())
+    return _UNIVERSAL | _LOCALE_DETECTORS.get(locale, set())
+def _finding(ptype: str, m: re.Match, group: int = 0) -> dict:
+    """Build one finding dict from capture *group* of regex match *m*."""
+    return {"type": ptype, "value": m.group(group), "start": m.start(group), "end": m.end(group)}
+# ── Public detector ───────────────────────────────────────────────────────────
+def detect_pii(text: str, locale: str = "tr") -> list[dict]:
+    """
+    Detect PII in *text* and return a list of findings sorted by position.
+    Each finding: {"type": str, "value": str, "start": int, "end": int}
+    Args:
+        text:   Text to scan; None is treated as "".
+        locale: "tr", "us", "eu" or "all"; selects the detectors via _active().
+    Returns:
+        Findings sorted by start offset. Credit cards are reported only when
+        Luhn-valid, TCKNs only when their checksum is valid, and generic phones
+        only when they contain at least 10 digits. When both phone detectors
+        run (locale="all"), a generic phone that overlaps a phone_tr match is
+        dropped so the number is reported once, as phone_tr.
+    """
+    active = _active(locale)
+    findings: list[dict] = []
+    t = text or ""
+    # Turkish mobiles are matched up front so the generic phone detector can
+    # skip spans they already cover; they are appended further down so that
+    # findings sharing a start offset keep their established relative order.
+    tr_phones = list(PHONE_TR_RE.finditer(t)) if "phone_tr" in active else []
+    if "email" in active:
+        findings.extend(_finding("email", m) for m in EMAIL_RE.finditer(t))
+    if "phone" in active:
+        tr_spans = [m.span() for m in tr_phones]
+        for m in PHONE_INTL_RE.finditer(t):
+            # The pattern alone admits 9 digits; require at least 10.
+            if sum(c.isdigit() for c in m.group()) < 10:
+                continue
+            # phone_tr is the more specific label and wins on overlap.
+            if any(m.start() < end and start < m.end() for start, end in tr_spans):
+                continue
+            findings.append(_finding("phone", m))
+    if "iban" in active:
+        findings.extend(_finding("iban", m) for m in IBAN_RE.finditer(t))
+    if "credit_card" in active:
+        findings.extend(_finding("credit_card", m) for m in CC_RE.finditer(t) if _luhn(m.group()))
+    if "ip" in active:
+        findings.extend(_finding("ip", m) for m in IPV4_RE.finditer(t))
+    findings.extend(_finding("phone_tr", m) for m in tr_phones)
+    if "national_id_tr" in active:
+        findings.extend(
+            _finding("national_id_tr", m, 1) for m in TCKN_RE.finditer(t) if _valid_tckn(m.group(1))
+        )
+    if "name" in active:
+        # Only the name itself (the last capture group) is reported, not its label.
+        findings.extend(_finding("name", m, m.lastindex) for m in NAME_RE.finditer(t))
+    if "ssn" in active:
+        findings.extend(_finding("ssn", m) for m in SSN_RE.finditer(t))
+    findings.sort(key=lambda x: x["start"])
+    return findings

flexorch_audit/_quality.py ADDED Viewed

@@ -0,0 +1,16 @@
+def quality_metrics(text: str) -> dict:
+    """
+    Compute quality metrics for a single text record.
+    Args:
+        text: The record to measure; None is treated as "".
+    Returns:
+        completeness    — 1.0 when the text still has content after stripping
+                          surrounding whitespace, otherwise 0.0
+        avg_length      — number of characters in the stripped text
+        duplicate_ratio — always None here, because a single record has nothing
+                          to be compared against; compute it across the dataset
+    """
+    content = text.strip() if text else ""
+    size = len(content)
+    return {
+        "completeness": float(size > 0),
+        "avg_length": size,
+        "duplicate_ratio": None,
+    }

flexorch_audit-0.1.0.dist-info/METADATA ADDED Viewed

@@ -0,0 +1,107 @@
+Metadata-Version: 2.4
+Name: flexorch-audit
+Version: 0.1.0
+Summary: Zero-dependency PII + quality + noise audit for LLM datasets (TR/EU/US)
+Project-URL: Homepage, https://github.com/flexorch/flexorch-audit
+Project-URL: Issues, https://github.com/flexorch/flexorch-audit/issues
+License: MIT
+License-File: LICENSE
+Keywords: audit,dataset,gdpr,kvkk,llm,pii,privacy,tckn
+Classifier: Development Status :: 4 - Beta
+Classifier: Intended Audience :: Developers
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Programming Language :: Python :: 3.13
+Classifier: Topic :: Software Development :: Libraries :: Python Modules
+Classifier: Topic :: Text Processing
+Requires-Python: >=3.10
+Description-Content-Type: text/markdown
+# flexorch-audit
+Zero-dependency PII + quality + noise audit for LLM datasets. Answers one question: **is this dataset ready for LLM training?**
+- **PII detection** — email, phone (TR + E.164), credit card (Luhn), IP, TCKN, IBAN, SSN, label-prefixed names
+- **Quality metrics** — completeness, average length, duplicate ratio
+- **Noise metrics** — garbage character ratio, encoding health
+- **Masking** — redact / replace / token / hash strategies
+- **Zero runtime dependencies** — pure Python stdlib, Python 3.10+
+```python
+from flexorch_audit import audit, mask
+result = audit(text, locale="tr")
+# {
+#   "pii": [{"type": "email", "value": "ali@example.com", "start": 8, "end": 23}],
+#   "quality": {"completeness": 1.0, "avg_length": 342, "duplicate_ratio": None},
+#   "noise": {"garbage_ratio": 0.0, "encoding_ok": True},
+# }
+clean = mask(text, result["pii"], strategy="redact")
+# "Contact: [REDACTED_EMAIL]"
+```
+## Install
+```bash
+pip install flexorch-audit
+```
+## Locale support
+| `locale` | Active detectors |
+|----------|-----------------|
+| `"tr"` (default) | email, iban, credit_card, ip + TCKN, phone_tr, name |
+| `"us"` | email, iban, credit_card, ip + SSN, E.164 phone |
+| `"eu"` | email, iban, credit_card, ip + E.164 phone |
+| `"all"` | All of the above (phone_tr takes precedence over generic phone) |
+## PII types
+| Type | Description | Locale |
+|------|-------------|--------|
+| `email` | RFC-5321 address | all |
+| `iban` | ISO 13616 IBAN (any country) | all |
+| `credit_card` | 16 digits in four space- or hyphen-separated groups, Luhn-validated | all |
+| `ip` | IPv4 address | all |
+| `phone_tr` | Turkish mobile (optional +90 or 0 prefix, then 10 digits starting with 5) | tr |
+| `national_id_tr` | TCKN — 11-digit modular arithmetic checksum | tr |
+| `name` | Label-prefixed name (e.g. "Adı: Ali Yıldız", "Full Name: Jane Doe") | tr |
+| `phone` | E.164 international phone | us, eu |
+| `ssn` | US Social Security Number (###-##-####) | us |
+## Masking strategies
+| Strategy | Example output |
+|----------|----------------|
+| `redact` (default) | `[REDACTED_EMAIL]` |
+| `replace` | `user@example.com` (realistic synthetic) |
+| `token` | `<PII_EMAIL_1>` (unique per type) |
+| `hash` | `[3d4f9a1b2c8e7f0a]` (SHA-256 first 16 hex chars) |
+## Quality & noise
+`duplicate_ratio` is `None` for single-string input. To compute it across a dataset:
+```python
+texts = [record["text"] for record in dataset]
+results = [audit(t) for t in texts]
+seen = set()
+duplicates = sum(1 for t in texts if t in seen or seen.add(t))
+duplicate_ratio = duplicates / len(texts)
+```
+## Limitations (v0.1)
+- Free-standing name detection (without a label prefix) requires NLP/NER — not included.
+- `duplicate_ratio` is always `None` for a single call; compute it across your dataset manually (see above).
+- IPv6 not detected.
+- IBAN format-only check; mod-97 validation not performed.
+## License
+MIT

flexorch_audit-0.1.0.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,9 @@
+flexorch_audit/__init__.py,sha256=-Q5t_QWsIEXvb0ejeaT54DZhheQO3PGpxs1sovQFJ9A,2026
+flexorch_audit/_mask.py,sha256=HoiSPPs3qVjyXtb6Nvp9uaR1PcsEKm0vx0mQQ9spsvI,2015
+flexorch_audit/_noise.py,sha256=OLEuzWSzLghzx1H8ZgkFBhvPXirgxUrYrKmrEdwyNyc,1159
+flexorch_audit/_pii.py,sha256=l4kslkZJOZ9kRCt8b7sZQFGqLoA9Gwmz1TdvtEtOnN4,6569
+flexorch_audit/_quality.py,sha256=pRcYNn5a_Zb3VBYNObJ8aGD7-qxP0qMbe2RSfV5c3p4,614
+flexorch_audit-0.1.0.dist-info/METADATA,sha256=3OT1IKrIq0qYP6oYZMHLRyU_L9MLNh38tI2lKs5HJAE,3768
+flexorch_audit-0.1.0.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
+flexorch_audit-0.1.0.dist-info/licenses/LICENSE,sha256=KWRC6Lpbo-eKH92uX2ZbYVZkqIzy3wGItkgxOa7bjGs,1065
+flexorch_audit-0.1.0.dist-info/RECORD,,

flexorch_audit-0.1.0.dist-info/WHEEL ADDED Viewed

@@ -0,0 +1,4 @@
+Wheel-Version: 1.0
+Generator: hatchling 1.29.0
+Root-Is-Purelib: true
+Tag: py3-none-any

flexorch_audit-0.1.0.dist-info/licenses/LICENSE ADDED Viewed

@@ -0,0 +1,21 @@
+MIT License
+Copyright (c) 2026 FlexOrch
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.