PyPI - flexorch-audit - Versions diffs - 0.1.0__tar.gz - Mend

flexorch-audit 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (16) hide show

flexorch_audit-0.1.0/.gitignore +13 -0
flexorch_audit-0.1.0/LICENSE +21 -0
flexorch_audit-0.1.0/PKG-INFO +107 -0
flexorch_audit-0.1.0/README.md +85 -0
flexorch_audit-0.1.0/pyproject.toml +38 -0
flexorch_audit-0.1.0/src/flexorch_audit/__init__.py +64 -0
flexorch_audit-0.1.0/src/flexorch_audit/_mask.py +57 -0
flexorch_audit-0.1.0/src/flexorch_audit/_noise.py +35 -0
flexorch_audit-0.1.0/src/flexorch_audit/_pii.py +163 -0
flexorch_audit-0.1.0/src/flexorch_audit/_quality.py +16 -0
flexorch_audit-0.1.0/tests/__init__.py +0 -0
flexorch_audit-0.1.0/tests/test_api.py +75 -0
flexorch_audit-0.1.0/tests/test_mask.py +87 -0
flexorch_audit-0.1.0/tests/test_noise.py +51 -0
flexorch_audit-0.1.0/tests/test_pii.py +191 -0
flexorch_audit-0.1.0/tests/test_quality.py +38 -0

flexorch_audit-0.1.0/.gitignore ADDED Viewed

@@ -0,0 +1,13 @@
+__pycache__/
+*.pyc
+*.pyo
+*.egg-info/
+dist/
+build/
+.venv/
+venv/
+.env
+*.pth
+.pytest_cache/
+.ruff_cache/
+*.egg

flexorch_audit-0.1.0/LICENSE ADDED Viewed

@@ -0,0 +1,21 @@
+MIT License
+Copyright (c) 2026 FlexOrch
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

flexorch_audit-0.1.0/PKG-INFO ADDED Viewed

@@ -0,0 +1,107 @@
+Metadata-Version: 2.4
+Name: flexorch-audit
+Version: 0.1.0
+Summary: Zero-dependency PII + quality + noise audit for LLM datasets (TR/EU/US)
+Project-URL: Homepage, https://github.com/flexorch/flexorch-audit
+Project-URL: Issues, https://github.com/flexorch/flexorch-audit/issues
+License: MIT
+License-File: LICENSE
+Keywords: audit,dataset,gdpr,kvkk,llm,pii,privacy,tckn
+Classifier: Development Status :: 4 - Beta
+Classifier: Intended Audience :: Developers
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Programming Language :: Python :: 3.13
+Classifier: Topic :: Software Development :: Libraries :: Python Modules
+Classifier: Topic :: Text Processing
+Requires-Python: >=3.10
+Description-Content-Type: text/markdown
+# flexorch-audit
+Zero-dependency PII + quality + noise audit for LLM datasets. Answers one question: **is this dataset ready for LLM training?**
+- **PII detection** — email, phone (TR + E.164), credit card (Luhn), IP, TCKN, IBAN, SSN, label-prefixed names
+- **Quality metrics** — completeness, average length, duplicate ratio
+- **Noise metrics** — garbage character ratio, encoding health
+- **Masking** — redact / replace / token / hash strategies
+- **Zero runtime dependencies** — pure Python stdlib, Python 3.10+
+```python
+from flexorch_audit import audit, mask
+result = audit(text, locale="tr")
+# {
+#   "pii": [{"type": "email", "value": "ali@example.com", "start": 9, "end": 24}],
+#   "quality": {"completeness": 1.0, "avg_length": 24, "duplicate_ratio": None},
+#   "noise": {"garbage_ratio": 0.0, "encoding_ok": True},
+# }
+clean = mask(text, result["pii"], strategy="redact")
+# "Contact: [REDACTED_EMAIL]"
+```
+## Install
+```bash
+pip install flexorch-audit
+```
+## Locale support
+| `locale` | Active detectors |
+|----------|-----------------|
+| `"tr"` (default) | email, iban, credit_card, ip + TCKN, phone_tr, name |
+| `"us"` | email, iban, credit_card, ip + SSN, E.164 phone |
+| `"eu"` | email, iban, credit_card, ip + E.164 phone |
+| `"all"` | All of the above (phone_tr takes precedence over generic phone) |
+## PII types
+| Type | Description | Locale |
+|------|-------------|--------|
+| `email` | RFC-5321 address | all |
+| `iban` | ISO 13616 IBAN (any country) | all |
+| `credit_card` | 16 digits in four groups of four separated by spaces or hyphens, Luhn-validated | all |
+| `ip` | IPv4 address | all |
+| `phone_tr` | Turkish mobile (optional +90/0 prefix, then 10 digits starting with 5) | tr |
+| `national_id_tr` | TCKN — 11-digit modular arithmetic checksum | tr |
+| `name` | Label-prefixed name (e.g. "Adı: Ali Yıldız", "Full Name: Jane Doe") | tr |
+| `phone` | E.164 international phone | us, eu |
+| `ssn` | US Social Security Number (###-##-####) | us |
+## Masking strategies
+| Strategy | Example output |
+|----------|----------------|
+| `redact` (default) | `[REDACTED_EMAIL]` |
+| `replace` | `user@example.com` (realistic synthetic) |
+| `token` | `<PII_EMAIL_1>` (unique per type) |
+| `hash` | `[3d4f9a1b2c8e7f0a]` (SHA-256 first 16 hex chars) |
+## Quality & noise
+`duplicate_ratio` is always `None` in the result of a single `audit()` call. To compute it across a dataset:
+```python
+texts = [record["text"] for record in dataset]
+results = [audit(t) for t in texts]
+seen = set()
+duplicates = sum(1 for t in texts if t in seen or seen.add(t))
+duplicate_ratio = duplicates / len(texts)
+```
+## Limitations (v0.1)
+- Free-standing name detection (without a label prefix) requires NLP/NER — not included.
+- `duplicate_ratio` is never computed by `audit()` (it is always `None`); aggregate across your dataset manually (see above).
+- IPv6 not detected.
+- IBAN format-only check; mod-97 validation not performed.
+## License
+MIT

flexorch_audit-0.1.0/README.md ADDED Viewed

@@ -0,0 +1,85 @@
+# flexorch-audit
+Zero-dependency PII + quality + noise audit for LLM datasets. Answers one question: **is this dataset ready for LLM training?**
+- **PII detection** — email, phone (TR + E.164), credit card (Luhn), IP, TCKN, IBAN, SSN, label-prefixed names
+- **Quality metrics** — completeness, average length, duplicate ratio
+- **Noise metrics** — garbage character ratio, encoding health
+- **Masking** — redact / replace / token / hash strategies
+- **Zero runtime dependencies** — pure Python stdlib, Python 3.10+
+```python
+from flexorch_audit import audit, mask
+result = audit(text, locale="tr")
+# {
+#   "pii": [{"type": "email", "value": "ali@example.com", "start": 9, "end": 24}],
+#   "quality": {"completeness": 1.0, "avg_length": 24, "duplicate_ratio": None},
+#   "noise": {"garbage_ratio": 0.0, "encoding_ok": True},
+# }
+clean = mask(text, result["pii"], strategy="redact")
+# "Contact: [REDACTED_EMAIL]"
+```
+## Install
+```bash
+pip install flexorch-audit
+```
+## Locale support
+| `locale` | Active detectors |
+|----------|-----------------|
+| `"tr"` (default) | email, iban, credit_card, ip + TCKN, phone_tr, name |
+| `"us"` | email, iban, credit_card, ip + SSN, E.164 phone |
+| `"eu"` | email, iban, credit_card, ip + E.164 phone |
+| `"all"` | All of the above (phone_tr takes precedence over generic phone) |
+## PII types
+| Type | Description | Locale |
+|------|-------------|--------|
+| `email` | RFC-5321 address | all |
+| `iban` | ISO 13616 IBAN (any country) | all |
+| `credit_card` | 16 digits in four groups of four separated by spaces or hyphens, Luhn-validated | all |
+| `ip` | IPv4 address | all |
+| `phone_tr` | Turkish mobile (optional +90/0 prefix, then 10 digits starting with 5) | tr |
+| `national_id_tr` | TCKN — 11-digit modular arithmetic checksum | tr |
+| `name` | Label-prefixed name (e.g. "Adı: Ali Yıldız", "Full Name: Jane Doe") | tr |
+| `phone` | E.164 international phone | us, eu |
+| `ssn` | US Social Security Number (###-##-####) | us |
+## Masking strategies
+| Strategy | Example output |
+|----------|----------------|
+| `redact` (default) | `[REDACTED_EMAIL]` |
+| `replace` | `user@example.com` (realistic synthetic) |
+| `token` | `<PII_EMAIL_1>` (unique per type) |
+| `hash` | `[3d4f9a1b2c8e7f0a]` (SHA-256 first 16 hex chars) |
+## Quality & noise
+`duplicate_ratio` is always `None` in the result of a single `audit()` call. To compute it across a dataset:
+```python
+texts = [record["text"] for record in dataset]
+results = [audit(t) for t in texts]
+seen = set()
+duplicates = sum(1 for t in texts if t in seen or seen.add(t))
+duplicate_ratio = duplicates / len(texts)
+```
+## Limitations (v0.1)
+- Free-standing name detection (without a label prefix) requires NLP/NER — not included.
+- `duplicate_ratio` is never computed by `audit()` (it is always `None`); aggregate across your dataset manually (see above).
+- IPv6 not detected.
+- IBAN format-only check; mod-97 validation not performed.
+## License
+MIT

flexorch_audit-0.1.0/pyproject.toml ADDED Viewed

@@ -0,0 +1,38 @@
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+[project]
+name = "flexorch-audit"
+version = "0.1.0"
+description = "Zero-dependency PII + quality + noise audit for LLM datasets (TR/EU/US)"
+readme = "README.md"
+license = { text = "MIT" }
+requires-python = ">=3.10"
+dependencies = []
+keywords = ["pii", "privacy", "llm", "dataset", "audit", "tckn", "kvkk", "gdpr"]
+classifiers = [
+    "Development Status :: 4 - Beta",
+    "Intended Audience :: Developers",
+    "License :: OSI Approved :: MIT License",
+    "Programming Language :: Python :: 3",
+    "Programming Language :: Python :: 3.10",
+    "Programming Language :: Python :: 3.11",
+    "Programming Language :: Python :: 3.12",
+    "Programming Language :: Python :: 3.13",
+    "Topic :: Text Processing",
+    "Topic :: Software Development :: Libraries :: Python Modules",
+]
+[project.urls]
+Homepage = "https://github.com/flexorch/flexorch-audit"
+Issues = "https://github.com/flexorch/flexorch-audit/issues"
+[tool.hatch.build.targets.wheel]
+packages = ["src/flexorch_audit"]
+[tool.pytest.ini_options]
+testpaths = ["tests"]
+[tool.ruff]
+line-length = 100

flexorch_audit-0.1.0/src/flexorch_audit/__init__.py ADDED Viewed

@@ -0,0 +1,64 @@
+"""
+flexorch-audit — zero-dependency PII + quality + noise audit for LLM datasets.
+    from flexorch_audit import audit, mask
+    result = audit(text, locale="tr")
+    # {
+    #   "pii": [{"type": "email", "value": "...", "start": 5, "end": 22}, ...],
+    #   "quality": {"completeness": 1.0, "avg_length": 342, "duplicate_ratio": None},
+    #   "noise": {"garbage_ratio": 0.0, "encoding_ok": True},
+    # }
+    clean = mask(text, result["pii"], strategy="redact")
+"""
+from ._pii import detect_pii
+from ._quality import quality_metrics
+from ._noise import noise_metrics
+from ._mask import apply_mask
+__version__ = "0.1.0"
+__all__ = ["audit", "mask", "__version__"]
+def audit(text: str, locale: str = "tr") -> dict:
+    """
+    Run the three audit pillars (PII, quality, noise) over *text*.
+    Args:
+        text:   Raw text to analyse.
+        locale: Selects the locale-specific PII detectors.
+                "tr"  — Turkish: TCKN, phone_tr, name  (default)
+                "us"  — US: SSN, E.164 phone
+                "eu"  — EU: E.164 phone
+                "all" — All detectors (phone_tr takes precedence over generic phone)
+                The universal detectors (email, iban, credit_card, ip) run for
+                every locale.
+    Returns:
+        A dict with exactly three keys:
+            "pii":     list of {type, value, start, end} sorted by position,
+            "quality": {completeness, avg_length, duplicate_ratio},
+            "noise":   {garbage_ratio, encoding_ok}.
+    """
+    pii_findings = detect_pii(text, locale=locale)
+    quality = quality_metrics(text)
+    noise = noise_metrics(text)
+    return {"pii": pii_findings, "quality": quality, "noise": noise}
+def mask(text: str, findings: list[dict], strategy: str = "redact") -> str:
+    """
+    Apply masking to PII findings in *text*.
+    Public wrapper: forwards all three arguments unchanged to apply_mask().
+    Args:
+        text:     Original text.
+        findings: List of findings from audit()["pii"].
+        strategy: "redact" (default) | "replace" | "token" | "hash"
+    Returns:
+        Text with PII replaced according to *strategy*.
+    Raises:
+        ValueError: If *strategy* is not one of the four names listed above
+                    (raised by apply_mask()).
+    """
+    return apply_mask(text, findings, strategy)

flexorch_audit-0.1.0/src/flexorch_audit/_mask.py ADDED Viewed

@@ -0,0 +1,57 @@
+import hashlib
+# Realistic-looking synthetic replacements for strategy="replace"
+_SYNTHETIC: dict[str, str] = {
+    "email": "user@example.com",
+    "phone": "+1 000 000 0000",
+    "phone_tr": "0500 000 00 00",
+    "national_id_tr": "00000000000",
+    "ssn": "000-00-0000",
+    "iban": "XX00 0000 0000 0000 0000 00",
+    "credit_card": "0000 0000 0000 0000",
+    "ip": "0.0.0.0",
+    "name": "AD SOYAD",
+}
+_VALID_STRATEGIES = frozenset({"redact", "replace", "token", "hash"})
+def apply_mask(text: str, findings: list[dict], strategy: str = "redact") -> str:
+    """
+    Replace PII spans in *text* according to *strategy*.
+    Strategies:
+        redact  — [REDACTED_EMAIL], [REDACTED_PHONE_TR], …  (default)
+        replace — realistic synthetic value (e.g. user@example.com)
+        token   — <PII_EMAIL_1>, <PII_EMAIL_2>, …  (numbered per type, left to right)
+        hash    — first 16 hex chars of SHA-256(original_value)
+    Args:
+        text:     Original text; None or "" yields "".
+        findings: Dicts with "type", "start" and "end" keys ("value" is also
+                  read by the hash strategy). Order does not matter.
+        strategy: One of the four strategy names above.
+    Returns:
+        The masked text. *text* is returned unchanged when *findings* is empty.
+    Raises:
+        ValueError: If *strategy* is not a known strategy name.
+    The output is assembled in one left-to-right pass over the findings, so
+    every offset is read against the untouched input. A finding that starts
+    inside a span that has already been replaced is skipped: the overlapping
+    text is masked already, and splicing it again would corrupt the result.
+    """
+    if strategy not in _VALID_STRATEGIES:
+        raise ValueError(f"Unknown strategy {strategy!r}. Use: {', '.join(sorted(_VALID_STRATEGIES))}")
+    if not text or not findings:
+        return text or ""
+    pieces: list[str] = []
+    counter: dict[str, int] = {}
+    cursor = 0  # end offset of the last span that was replaced
+    # For equal starts the longest span sorts first, so it wins over nested spans.
+    for finding in sorted(findings, key=lambda f: (f["start"], -f["end"])):
+        start, end = finding["start"], finding["end"]
+        if start < cursor:
+            continue  # overlaps an already-masked span
+        ptype = finding["type"]
+        tag = ptype.upper()
+        if strategy == "redact":
+            replacement = f"[REDACTED_{tag}]"
+        elif strategy == "replace":
+            replacement = _SYNTHETIC.get(ptype, f"[{tag}]")
+        elif strategy == "token":
+            # Counted only for findings that are actually emitted, in text order.
+            counter[ptype] = counter.get(ptype, 0) + 1
+            replacement = f"<PII_{tag}_{counter[ptype]}>"
+        else:  # hash
+            digest = hashlib.sha256(finding["value"].encode()).hexdigest()[:16]
+            replacement = f"[{digest}]"
+        pieces.append(text[cursor:start])
+        pieces.append(replacement)
+        cursor = end
+    pieces.append(text[cursor:])
+    return "".join(pieces)

flexorch_audit-0.1.0/src/flexorch_audit/_noise.py ADDED Viewed

@@ -0,0 +1,35 @@
+import unicodedata
+# Unicode general categories that indicate non-printable / garbage characters.
+# Cc=control, Cs=surrogate, Co=private-use, Cn=unassigned
+_GARBAGE_CATS = frozenset({"Cc", "Cs", "Co", "Cn"})
+# Layout whitespace is never garbage. Tab, LF, CR, VT and FF are category Cc
+# and would otherwise be counted; the plain space is listed for completeness.
+_SAFE_WHITESPACE = frozenset(" \t\n\r\x0b\x0c")
+# U+FFFD REPLACEMENT CHARACTER, spelled as an escape so that re-encoding this
+# source file cannot silently change the character being searched for.
+_REPLACEMENT_CHAR = "\ufffd"
+def _is_garbage(ch: str) -> bool:
+    """Return True when the single character *ch* counts as noise."""
+    if ch in _SAFE_WHITESPACE:
+        return False
+    # U+FFFD is not in any of the garbage categories, so it is tested explicitly.
+    return ch == _REPLACEMENT_CHAR or unicodedata.category(ch) in _GARBAGE_CATS
+def noise_metrics(text: str) -> dict:
+    """
+    Compute noise metrics for a single text record.
+    Args:
+        text: The record to inspect; None or "" is treated as clean.
+    Returns:
+        garbage_ratio — fraction of characters that are control/surrogate/
+                        private-use/unassigned or U+FFFD, rounded to 4 places
+        encoding_ok   — False when U+FFFD replacement characters are present,
+                        which typically indicates a transcoding error
+    """
+    if not text:
+        return {"garbage_ratio": 0.0, "encoding_ok": True}
+    garbage = sum(1 for ch in text if _is_garbage(ch))
+    return {
+        "garbage_ratio": round(garbage / len(text), 4),
+        "encoding_ok": _REPLACEMENT_CHAR not in text,
+    }

flexorch_audit-0.1.0/src/flexorch_audit/_pii.py ADDED Viewed

@@ -0,0 +1,163 @@
+import re
+# ── Universal detectors ──────────────────────────────────────────────────────
+# Email — pragmatic pattern: local part, "@", dotted domain, alphabetic TLD of
+# two or more letters.
+EMAIL_RE = re.compile(r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b")
+# E.164 international phone — requires + prefix, 10+ total digits
+# Used for locale=us/eu. TR phones covered by PHONE_TR_RE.
+# The pattern alone accepts as few as 9 digits (1 + 1 + 3 + 4); the 10-digit
+# minimum is enforced by the digit count in detect_pii().
+PHONE_INTL_RE = re.compile(
+    r"\+\d{1,3}[\s\-\.]?\(?\d{1,4}\)?[\s\-\.]?\d{3,4}[\s\-\.]?\d{4}\b"
+)
+# IBAN — ISO 13616 (all countries, including TR)
+# Matches the contiguous upper-case form only (no embedded spaces).
+IBAN_RE = re.compile(r"\b[A-Z]{2}\d{2}[0-9A-Z]{11,30}\b")
+# Credit card — 16 digits with separator groups (Luhn-validated separately)
+# A space or hyphen between the four groups is mandatory, so an unbroken run of
+# 16 digits is not matched.
+CC_RE = re.compile(r"\b\d{4}[ \-]\d{4}[ \-]\d{4}[ \-]\d{4}\b")
+# IPv4 — four dot-separated octets, each limited to 0-255
+IPV4_RE = re.compile(
+    r"\b(?:(?:25[0-5]|2[0-4]\d|[01]?\d\d?)\.){3}(?:25[0-5]|2[0-4]\d|[01]?\d\d?)\b"
+)
+# ── Turkish detectors ────────────────────────────────────────────────────────
+# Turkish mobile: +90 5xx... or 0 5xx... or bare 5xx (10 digits)
+# "+90" has its own branch because \b cannot match between a space and "+".
+# The other branch uses (?<!\w) so a number cannot start inside a longer word
+# or digit run. Whitespace is consumed only after a prefix, so a match never
+# begins with a space and the reported span covers exactly the phone number.
+PHONE_TR_RE = re.compile(
+    r"(?:\+90\s*|(?<!\w)(?:0\s*)?)5\d{2}\s*\d{3}\s*\d{2}\s*\d{2}\b"
+)
+# TCKN — first digit non-zero, 11 digits, checksum-validated below
+TCKN_RE = re.compile(r"\b([1-9]\d{10})\b")
+# Turkish labels that introduce a personal name ("Adı", "Adı Soyadı", "İsim",
+# "Müşteri Adı", "Hesap Sahibi", …). Both dotted and dotless i are accepted in
+# the "Adı"/"Soyadı" endings.
+_NAME_PREFIX_TR = (
+    r"(?:Ad[ıi]\s*(?:Soyad[ıi])?|Soyad[ıi]|İsim|"
+    r"Müşteri\s+Ad[ıi]|Yetkili(?:\s+Kişi)?|Çalışan\s+Ad[ıi]|"
+    r"Personel\s+Ad[ıi]|Kişi\s+Ad[ıi]|Satıcı\s+Ad[ıi]|"
+    r"Alıcı\s+Ad[ıi]|İlgili\s+Kişi|Hesap\s+Sahibi)"
+)
+# English labels. The bare "Name" alternative is rejected when it is directly
+# preceded by the word "User" plus one whitespace character ("User Name").
+_NAME_PREFIX_EN = (
+    r"(?:Full\s+Name|Customer\s+Name|Employee\s+Name|"
+    r"Contact\s+Name|Authorized\s+(?:By|Person)|Account\s+Holder|"
+    r"(?<!\bUser\s)Name)"
+)
+# The captured name: one to three capitalised words (Turkish letters included).
+# This is the only capturing group in NAME_RE.
+_NAME_VALUE = r"([A-ZÇĞİÖŞÜ][a-zçğışöşü]+(?:\s+[A-ZÇĞİÖŞÜ][a-zçğışöşü]+){0,2})"
+# Label-prefixed name detection (TR and EN labels). NLP-based free-standing name
+# detection is out of scope for v0.1 — requires NER.
+# Shape: <label> <optional spaces> ":" or "-" <optional spaces> <name>.
+# NOTE(review): no word boundary precedes the label, so a label at the end of a
+# longer word (e.g. "FileName: Foo") also matches — confirm this is intended.
+NAME_RE = re.compile(
+    rf"(?:{_NAME_PREFIX_TR}|{_NAME_PREFIX_EN})\s*[:\-]\s*{_NAME_VALUE}",
+    re.UNICODE,
+)
+# ── US detectors ─────────────────────────────────────────────────────────────
+# SSN — hyphens required to minimise false positives
+# The negative lookaheads reject an area of 000, 666 or 9xx, a group of 00 and
+# a serial of 0000.
+SSN_RE = re.compile(r"\b(?!000|666|9\d{2})\d{3}-(?!00)\d{2}-(?!0000)\d{4}\b")
+# ── Validation helpers ────────────────────────────────────────────────────────
+def _valid_tckn(s: str) -> bool:
+    """
+    Return True when *s* is an 11-digit TCKN with consistent check digits.
+    Rules applied (digits numbered d1..d11):
+        * exactly 11 decimal digits, d1 is not "0";
+        * d10 == (7 * (d1 + d3 + d5 + d7 + d9) - (d2 + d4 + d6 + d8)) mod 10;
+        * d11 == (d1 + d2 + … + d10) mod 10.
+    A string containing anything other than decimal digits returns False
+    instead of raising ValueError from int().
+    """
+    if len(s) != 11 or not s.isdecimal() or s[0] == "0":
+        return False
+    d = [int(c) for c in s]
+    sum_odd = sum(d[0:9:2])   # d1, d3, d5, d7, d9
+    sum_even = sum(d[1:8:2])  # d2, d4, d6, d8
+    # Python's % never returns a negative value, so a negative difference is fine.
+    if (sum_odd * 7 - sum_even) % 10 != d[9]:
+        return False
+    return sum(d[:10]) % 10 == d[10]
+def _luhn(number: str) -> bool:
+    """
+    Return True when the digits in *number* pass the Luhn checksum.
+    Non-digit characters (spaces, hyphens) are ignored. Inputs with fewer than
+    13 or more than 19 digits are rejected outright.
+    """
+    digits = [int(ch) for ch in number if ch.isdigit()]
+    if len(digits) < 13 or len(digits) > 19:
+        return False
+    from_right = digits[::-1]
+    # Positions 0, 2, 4, … counted from the right are added unchanged.
+    checksum = sum(from_right[0::2])
+    # Every second digit is doubled; a two-digit product contributes its digit sum.
+    for digit in from_right[1::2]:
+        doubled = digit * 2
+        checksum += doubled - 9 if doubled > 9 else doubled
+    return checksum % 10 == 0
+# ── Locale registry ───────────────────────────────────────────────────────────
+# Detector names that are switched on only for a specific locale.
+_LOCALE_DETECTORS: dict[str, set[str]] = {
+    "tr": {"national_id_tr", "phone_tr", "name"},
+    "us": {"ssn", "phone"},
+    "eu": {"phone"},
+}
+# Detector names that run for every locale.
+_UNIVERSAL: set[str] = {"email", "iban", "credit_card", "ip"}
+def _active(locale: str) -> set[str]:
+    """
+    Return a fresh set of detector names enabled for *locale*.
+    An unrecognised locale gets the universal detectors only. "all" enables
+    every registered detector, except that the generic "phone" detector is
+    dropped when "phone_tr" is present because phone_tr is more specific.
+    """
+    if locale != "all":
+        return _UNIVERSAL.union(_LOCALE_DETECTORS.get(locale, set()))
+    enabled = _UNIVERSAL.union(*_LOCALE_DETECTORS.values())
+    if "phone_tr" in enabled:
+        enabled.discard("phone")
+    return enabled
+# ── Public detector ───────────────────────────────────────────────────────────
+def detect_pii(text: str, locale: str = "tr") -> list[dict]:
+    """
+    Scan *text* with every detector enabled for *locale*.
+    Returns the findings ordered by start offset. Each finding is
+    {"type": str, "value": str, "start": int, "end": int}. None or empty text
+    produces an empty list.
+    """
+    enabled = _active(locale)
+    haystack = text or ""
+    hits: list[dict] = []
+    def record(kind, match, group=0):
+        # Value and offsets all refer to *group* of the match.
+        hits.append(
+            {
+                "type": kind,
+                "value": match.group(group),
+                "start": match.start(group),
+                "end": match.end(group),
+            }
+        )
+    # The detector order below is significant: the final sort is stable, so
+    # findings that share a start offset keep this order.
+    if "email" in enabled:
+        for match in EMAIL_RE.finditer(haystack):
+            record("email", match)
+    if "phone" in enabled:
+        for match in PHONE_INTL_RE.finditer(haystack):
+            digit_count = sum(ch.isdigit() for ch in match.group())
+            if digit_count >= 10:
+                record("phone", match)
+    if "iban" in enabled:
+        for match in IBAN_RE.finditer(haystack):
+            record("iban", match)
+    if "credit_card" in enabled:
+        for match in CC_RE.finditer(haystack):
+            if _luhn(match.group()):
+                record("credit_card", match)
+    if "ip" in enabled:
+        for match in IPV4_RE.finditer(haystack):
+            record("ip", match)
+    if "phone_tr" in enabled:
+        for match in PHONE_TR_RE.finditer(haystack):
+            record("phone_tr", match)
+    if "national_id_tr" in enabled:
+        for match in TCKN_RE.finditer(haystack):
+            # Group 1 spans the whole match, so its offsets equal the match offsets.
+            if _valid_tckn(match.group(1)):
+                record("national_id_tr", match, 1)
+    if "name" in enabled:
+        for match in NAME_RE.finditer(haystack):
+            # Report only the captured name, not the label in front of it.
+            record("name", match, match.lastindex)
+    if "ssn" in enabled:
+        for match in SSN_RE.finditer(haystack):
+            record("ssn", match)
+    return sorted(hits, key=lambda hit: hit["start"])

flexorch_audit-0.1.0/src/flexorch_audit/_quality.py ADDED Viewed

@@ -0,0 +1,16 @@
+def quality_metrics(text: str) -> dict:
+    """
+    Compute quality metrics for a single text record.
+    Leading and trailing whitespace is ignored; None is treated as "".
+    Returns:
+        completeness    — 1.0 when any non-whitespace content remains, else 0.0
+        avg_length      — number of characters in the stripped text
+        duplicate_ratio — always None here; it can only be computed across a
+                          whole dataset, which the caller must do
+    """
+    body = text.strip() if text else ""
+    completeness = 1.0 if body else 0.0
+    return {"completeness": completeness, "avg_length": len(body), "duplicate_ratio": None}

flexorch_audit-0.1.0/tests/__init__.py ADDED Viewed

File without changes

flexorch_audit-0.1.0/tests/test_api.py ADDED Viewed

@@ -0,0 +1,75 @@
+"""Integration tests for the public audit() and mask() API."""
+import flexorch_audit
+from flexorch_audit import audit, mask
+def test_version_present():
+    """The package exposes a string __version__ that starts with "0."."""
+    assert isinstance(flexorch_audit.__version__, str)
+    assert flexorch_audit.__version__.startswith("0.")
+def test_audit_returns_all_pillars():
+    """audit() returns the three top-level keys pii, quality and noise."""
+    result = audit("Hello world", locale="tr")
+    assert "pii" in result
+    assert "quality" in result
+    assert "noise" in result
+def test_audit_clean_text():
+    """Plain prose yields no findings, full completeness and no garbage."""
+    result = audit("The quick brown fox jumps over the lazy dog.", locale="tr")
+    assert result["pii"] == []
+    assert result["quality"]["completeness"] == 1.0
+    assert result["noise"]["garbage_ratio"] == 0.0
+def test_audit_email_found():
+    """An email address is reported under the "tr" locale."""
+    result = audit("Contact us: hello@flexorch.com", locale="tr")
+    assert any(f["type"] == "email" for f in result["pii"])
+def test_audit_tckn_found():
+    """A checksum-valid TCKN is reported as national_id_tr."""
+    result = audit("TC kimlik: 12345678950", locale="tr")
+    assert any(f["type"] == "national_id_tr" for f in result["pii"])
+def test_mask_redact_round_trip():
+    """Findings from audit() can be fed straight into mask()."""
+    text = "Email: test@example.com"
+    result = audit(text, locale="tr")
+    clean = mask(text, result["pii"], strategy="redact")
+    assert "test@example.com" not in clean
+    assert "[REDACTED_EMAIL]" in clean
+def test_mask_no_pii_unchanged():
+    """Text without findings comes back from mask() unchanged."""
+    text = "Clean text with no personal data."
+    result = audit(text, locale="tr")
+    assert mask(text, result["pii"]) == text
+def test_audit_empty_string():
+    """An empty string is audited without error and reported as incomplete."""
+    result = audit("", locale="tr")
+    assert result["pii"] == []
+    assert result["quality"]["completeness"] == 0.0
+    assert result["noise"]["encoding_ok"] is True
+def test_audit_locale_us_ssn():
+    """The "us" locale enables SSN detection."""
+    result = audit("SSN: 123-45-6789", locale="us")
+    assert any(f["type"] == "ssn" for f in result["pii"])
+def test_audit_locale_all():
+    """The "all" locale reports Turkish, US and universal findings together."""
+    text = "TC: 12345678950, SSN: 123-45-6789, email: x@y.com"
+    result = audit(text, locale="all")
+    types = {f["type"] for f in result["pii"]}
+    assert "national_id_tr" in types
+    assert "ssn" in types
+    assert "email" in types
+def test_mask_strategies_all_remove_pii():
+    """Every masking strategy removes the original email from the output."""
+    text = "Contact: ali@example.com"
+    result = audit(text, locale="tr")
+    for strategy in ("redact", "replace", "token", "hash"):
+        clean = mask(text, result["pii"], strategy=strategy)
+        assert "ali@example.com" not in clean, f"PII still present with strategy={strategy}"

flexorch_audit-0.1.0/tests/test_mask.py ADDED Viewed

@@ -0,0 +1,87 @@
+import pytest
+from flexorch_audit._mask import apply_mask
+# Shared fixture: one email finding whose offsets point at "a@b.com" in _TEXT.
+_FINDINGS = [{"type": "email", "value": "a@b.com", "start": 7, "end": 14}]
+_TEXT = "Email: a@b.com end"
+def test_redact_strategy():
+    """redact replaces the span with [REDACTED_<TYPE>]."""
+    result = apply_mask(_TEXT, _FINDINGS, strategy="redact")
+    assert "a@b.com" not in result
+    assert "[REDACTED_EMAIL]" in result
+def test_replace_strategy():
+    """replace substitutes the synthetic example.com address."""
+    result = apply_mask(_TEXT, _FINDINGS, strategy="replace")
+    assert "a@b.com" not in result
+    assert "example.com" in result
+def test_token_strategy():
+    """token numbers the first finding of a type with 1."""
+    result = apply_mask(_TEXT, _FINDINGS, strategy="token")
+    assert "a@b.com" not in result
+    assert "<PII_EMAIL_1>" in result
+def test_hash_strategy():
+    """hash replaces the span with a bracketed 16-character hex digest."""
+    result = apply_mask(_TEXT, _FINDINGS, strategy="hash")
+    assert "a@b.com" not in result
+    # Hash replacement is 16 hex chars wrapped in []
+    import re
+    assert re.search(r"\[[0-9a-f]{16}\]", result)
+def test_default_strategy_is_redact():
+    """Omitting the strategy argument behaves like strategy="redact"."""
+    result = apply_mask(_TEXT, _FINDINGS)
+    assert "[REDACTED_EMAIL]" in result
+def test_invalid_strategy_raises():
+    """An unknown strategy name raises ValueError."""
+    with pytest.raises(ValueError, match="Unknown strategy"):
+        apply_mask(_TEXT, _FINDINGS, strategy="invalid")
+def test_empty_findings_returns_original():
+    """With no findings the text is returned unchanged."""
+    assert apply_mask(_TEXT, []) == _TEXT
+def test_empty_text_returns_empty():
+    """Empty text yields an empty string even when findings are supplied."""
+    assert apply_mask("", _FINDINGS) == ""
+def test_multiple_findings_correct_order():
+    """Two findings in one text are both replaced at the right offsets."""
+    text = "a@b.com and c@d.com"
+    findings = [
+        {"type": "email", "value": "a@b.com", "start": 0, "end": 7},
+        {"type": "email", "value": "c@d.com", "start": 12, "end": 19},
+    ]
+    result = apply_mask(text, findings, strategy="redact")
+    assert "a@b.com" not in result
+    assert "c@d.com" not in result
+    assert result.count("[REDACTED_EMAIL]") == 2
+def test_token_counter_per_type():
+    """Two findings of the same type receive two distinct numbered tokens."""
+    text = "a@b.com c@d.com"
+    findings = [
+        {"type": "email", "value": "a@b.com", "start": 0, "end": 7},
+        {"type": "email", "value": "c@d.com", "start": 8, "end": 15},
+    ]
+    result = apply_mask(text, findings, strategy="token")
+    # Tokens count up per type: both numbers must appear, which a bare prefix
+    # check would not prove, and neither original address may survive.
+    assert "<PII_EMAIL_1>" in result
+    assert "<PII_EMAIL_2>" in result
+    assert "a@b.com" not in result
+    assert "c@d.com" not in result
+def test_phone_tr_replace_synthetic():
+    """replace uses the phone_tr-specific synthetic number."""
+    text = "Tel: 0532 123 45 67"
+    findings = [{"type": "phone_tr", "value": "0532 123 45 67", "start": 5, "end": 19}]
+    result = apply_mask(text, findings, strategy="replace")
+    assert "0532 123 45 67" not in result
+    assert "0500 000 00 00" in result
+def test_hash_is_deterministic():
+    """Hashing the same input twice gives the same output."""
+    r1 = apply_mask(_TEXT, _FINDINGS, strategy="hash")
+    r2 = apply_mask(_TEXT, _FINDINGS, strategy="hash")
+    assert r1 == r2

flexorch_audit-0.1.0/tests/test_noise.py ADDED Viewed

@@ -0,0 +1,51 @@
+from flexorch_audit._noise import noise_metrics
+def test_clean_text():
+    """Ordinary ASCII text has no garbage and a healthy encoding."""
+    result = noise_metrics("Hello, world!")
+    assert result["garbage_ratio"] == 0.0
+    assert result["encoding_ok"] is True
+def test_empty_string():
+    """An empty string is reported as clean."""
+    result = noise_metrics("")
+    assert result["garbage_ratio"] == 0.0
+    assert result["encoding_ok"] is True
+def test_none_treated_as_empty():
+    """None is accepted and treated like an empty string."""
+    result = noise_metrics(None)  # type: ignore[arg-type]
+    assert result["garbage_ratio"] == 0.0
+    assert result["encoding_ok"] is True
+def test_encoding_error_detected():
+    """A U+FFFD replacement character flags the encoding and counts as garbage."""
+    text = "Normal text � with replacement char"
+    result = noise_metrics(text)
+    assert result["encoding_ok"] is False
+    assert result["garbage_ratio"] > 0.0
+def test_control_characters_counted():
+    """A non-whitespace control character raises the garbage ratio."""
+    # \x01 is a control character (Cc category), not normal whitespace
+    text = "abc\x01def"
+    result = noise_metrics(text)
+    assert result["garbage_ratio"] > 0.0
+def test_normal_whitespace_not_garbage():
+    """Newlines and tabs are not counted as garbage."""
+    text = "line one\nline two\ttabbed"
+    result = noise_metrics(text)
+    assert result["garbage_ratio"] == 0.0
+def test_high_garbage_text():
+    """Text made only of control characters has a garbage ratio of 1.0."""
+    text = "\x00\x01\x02\x03\x04\x05"
+    result = noise_metrics(text)
+    assert result["garbage_ratio"] == 1.0
+def test_unicode_text_no_garbage():
+    """Turkish letters are ordinary text, not garbage."""
+    result = noise_metrics("Türkçe metin: Çiğdem, Şükrü, İstanbul")
+    assert result["garbage_ratio"] == 0.0
+    assert result["encoding_ok"] is True

flexorch_audit-0.1.0/tests/test_pii.py ADDED Viewed

@@ -0,0 +1,191 @@
+import pytest
+from flexorch_audit._pii import detect_pii, _valid_tckn, _luhn
+# ── TCKN checksum ─────────────────────────────────────────────────────────────
+def test_valid_tckn():
+    """A number whose two trailing check digits are correct is accepted."""
+    # Worked example, digits d = [1, 2, 3, 4, 5, 6, 7, 8, 9, 5, 0] (zero-based):
+    #   sum_odd = 25, sum_even = 20
+    #   d[9]  = (175 - 20) % 10 = 5
+    #   d[10] = 50 % 10 = 0
+    candidate = "12345678950"
+    assert _valid_tckn(candidate) is True
+def test_invalid_tckn_wrong_checksum():
+    """An eleven-digit number with wrong check digits is rejected."""
+    candidate = "12345678900"
+    assert _valid_tckn(candidate) is False
+def test_invalid_tckn_starts_with_zero():
+    """A candidate with a leading zero is rejected."""
+    candidate = "01234567890"
+    assert _valid_tckn(candidate) is False
+def test_invalid_tckn_wrong_length():
+    """A ten-digit candidate is rejected."""
+    candidate = "1234567890"
+    assert _valid_tckn(candidate) is False
+# ── Luhn ─────────────────────────────────────────────────────────────────────
+def test_luhn_valid_visa():
+    """A sixteen-digit Visa-style number with a correct checksum passes."""
+    card_number = "4532015112830366"
+    assert _luhn(card_number) is True
+def test_luhn_invalid():
+    """A sixteen-digit sequence with a bad checksum fails."""
+    card_number = "1234567890123456"
+    assert _luhn(card_number) is False
+def test_luhn_too_short():
+    """A six-digit string is rejected."""
+    short_number = "123456"
+    assert _luhn(short_number) is False
+# ── Email ─────────────────────────────────────────────────────────────────────
+def test_email_detected():
+    """An address inside a sentence is reported with its exact value."""
+    sample = "Contact: test@example.com today"
+    hits = detect_pii(sample, locale="tr")
+    assert any(
+        hit["type"] == "email" and hit["value"] == "test@example.com"
+        for hit in hits
+    )
+def test_email_subdomain():
+    """An address with a multi-label domain still yields an email finding."""
+    hits = detect_pii("Send to ali@mail.co.uk", locale="tr")
+    found_types = [hit["type"] for hit in hits]
+    assert "email" in found_types
+def test_no_email_in_clean_text():
+    """Text without an address must not produce any email finding."""
+    hits = detect_pii("Hello world, no PII here.", locale="tr")
+    assert all(hit["type"] != "email" for hit in hits)
+# ── Turkish phone ─────────────────────────────────────────────────────────────
+def test_phone_tr_with_prefix():
+    """A +90-prefixed number yields a phone_tr finding under locale tr."""
+    sample = "Ara: +90 532 123 45 67"
+    found_types = [hit["type"] for hit in detect_pii(sample, locale="tr")]
+    assert "phone_tr" in found_types
+def test_phone_tr_domestic():
+    """A number written with a leading 0 yields a phone_tr finding."""
+    sample = "GSM: 0532 123 45 67"
+    hits = detect_pii(sample, locale="tr")
+    assert any(hit["type"] == "phone_tr" for hit in hits)
+def test_phone_tr_not_in_us_locale():
+    """Under locale us the same number must not be reported as phone_tr."""
+    sample = "GSM: 0532 123 45 67"
+    hits = detect_pii(sample, locale="us")
+    assert all(hit["type"] != "phone_tr" for hit in hits)
+# ── TCKN ──────────────────────────────────────────────────────────────────────
+def test_tckn_detected():
+    """A checksum-valid number is reported as national_id_tr with its value."""
+    expected = "12345678950"
+    hits = detect_pii("TC: 12345678950", locale="tr")
+    assert any(
+        hit["type"] == "national_id_tr" and hit["value"] == expected
+        for hit in hits
+    )
+def test_invalid_tckn_not_detected():
+    """An eleven-digit number failing the checksum is not reported as an ID."""
+    hits = detect_pii("TC: 12345678900", locale="tr")
+    found_types = [hit["type"] for hit in hits]
+    assert "national_id_tr" not in found_types
+def test_tckn_not_in_us_locale():
+    """Under locale us no national_id_tr finding may be produced."""
+    hits = detect_pii("TC: 12345678950", locale="us")
+    assert all(hit["type"] != "national_id_tr" for hit in hits)
+# ── IBAN ──────────────────────────────────────────────────────────────────────
+def test_iban_tr_detected():
+    """A TR-prefixed account number yields an iban finding."""
+    sample = "IBAN: TR330006100519786457841326"
+    found_types = [hit["type"] for hit in detect_pii(sample, locale="tr")]
+    assert "iban" in found_types
+def test_iban_de_detected():
+    """A DE-prefixed account number yields an iban finding under locale tr."""
+    sample = "Bank: DE89370400440532013000"
+    hits = detect_pii(sample, locale="tr")
+    assert any(hit["type"] == "iban" for hit in hits)
+# ── Credit card ───────────────────────────────────────────────────────────────
+def test_credit_card_detected():
+    """A space-grouped, Luhn-valid card number yields a credit_card finding."""
+    # Well-known Visa test number that satisfies the Luhn check.
+    sample = "Card: 4532 0151 1283 0366"
+    found_types = [hit["type"] for hit in detect_pii(sample, locale="tr")]
+    assert "credit_card" in found_types
+def test_invalid_cc_not_detected():
+    """Sixteen grouped digits that are not a valid card give no credit_card finding."""
+    sample = "Ref: 1234 5678 9012 3456"
+    hits = detect_pii(sample, locale="tr")
+    assert all(hit["type"] != "credit_card" for hit in hits)
+# ── IP ────────────────────────────────────────────────────────────────────────
+def test_ip_detected():
+    """A dotted-quad address is reported as ip with its exact value."""
+    address = "192.168.1.100"
+    hits = detect_pii("Server: 192.168.1.100", locale="tr")
+    assert any(hit["type"] == "ip" and hit["value"] == address for hit in hits)
+def test_invalid_ip_not_detected():
+    """A dotted quad of 999s must not be reported as ip."""
+    hits = detect_pii("Bad IP: 999.999.999.999", locale="tr")
+    found_types = [hit["type"] for hit in hits]
+    assert "ip" not in found_types
+# ── SSN ───────────────────────────────────────────────────────────────────────
+def test_ssn_detected_us_locale():
+    """Under locale us a dashed number is reported as ssn with its exact value."""
+    expected = "123-45-6789"
+    hits = detect_pii("SSN: 123-45-6789", locale="us")
+    assert any(hit["type"] == "ssn" and hit["value"] == expected for hit in hits)
+def test_ssn_not_detected_tr_locale():
+    """Under locale tr the same dashed number must not be reported as ssn."""
+    hits = detect_pii("SSN: 123-45-6789", locale="tr")
+    assert all(hit["type"] != "ssn" for hit in hits)
+def test_ssn_invalid_000_not_detected():
+    """A number whose first group is 000 must not be reported as ssn."""
+    sample = "SSN: 000-45-6789"
+    found_types = [hit["type"] for hit in detect_pii(sample, locale="us")]
+    assert "ssn" not in found_types
+# ── Name ──────────────────────────────────────────────────────────────────────
+def test_name_tr_label():
+    """A Turkish label followed by a name yields a name finding containing it."""
+    sample = "Adı Soyadı: Ahmet Yıldız"
+    hits = detect_pii(sample, locale="tr")
+    assert any(hit["type"] == "name" and "Ahmet" in hit["value"] for hit in hits)
+def test_name_en_label():
+    """An English label followed by a name yields a name finding under locale tr."""
+    sample = "Full Name: John Smith"
+    hits = detect_pii(sample, locale="tr")
+    assert any(hit["type"] == "name" and "John" in hit["value"] for hit in hits)
+def test_name_not_detected_us_locale():
+    """Under locale us a labelled name must not produce a name finding."""
+    hits = detect_pii("Adı: Ahmet Yıldız", locale="us")
+    assert all(hit["type"] != "name" for hit in hits)
+# ── Locale: all ───────────────────────────────────────────────────────────────
+def test_locale_all_includes_tckn_and_ssn():
+    """Locale all reports both the Turkish national ID and the SSN."""
+    sample = "TC: 12345678950 and SSN: 123-45-6789"
+    found_types = set()
+    for hit in detect_pii(sample, locale="all"):
+        found_types.add(hit["type"])
+    for expected in ("national_id_tr", "ssn"):
+        assert expected in found_types
+def test_findings_sorted_by_position():
+    """Findings come back ordered by their ``start`` offset."""
+    text = "Email: a@b.com phone: 0532 123 45 67"
+    findings = detect_pii(text, locale="tr")
+    starts = [f["start"] for f in findings]
+    # Ordering is only meaningful with several findings; without this guard
+    # an empty or single-item result would pass the sort check vacuously.
+    assert len(starts) >= 2
+    assert starts == sorted(starts)
+def test_empty_string_returns_empty():
+    """Empty and whitespace-only input both give an empty findings list."""
+    for blank in ("", "   "):
+        assert detect_pii(blank, locale="tr") == []

flexorch_audit-0.1.0/tests/test_quality.py ADDED Viewed

@@ -0,0 +1,38 @@
+from flexorch_audit._quality import quality_metrics
+def test_non_empty_text():
+    """Ordinary text is fully complete, 13 long, with no duplicate ratio."""
+    sample = "Hello, world!"
+    metrics = quality_metrics(sample)
+    completeness, avg_length = metrics["completeness"], metrics["avg_length"]
+    assert completeness == 1.0
+    assert avg_length == 13
+    assert metrics["duplicate_ratio"] is None
+def test_empty_string():
+    """The empty string has zero completeness and zero average length."""
+    empty = ""
+    metrics = quality_metrics(empty)
+    assert metrics["completeness"] == 0.0
+    assert metrics["avg_length"] == 0
+def test_whitespace_only():
+    """Input made only of spaces, tabs and newlines has zero completeness and length."""
+    blank = "   \t\n  "
+    metrics = quality_metrics(blank)
+    completeness, avg_length = metrics["completeness"], metrics["avg_length"]
+    assert completeness == 0.0
+    assert avg_length == 0
+def test_strips_leading_trailing_whitespace():
+    """Surrounding spaces are excluded from the reported length."""
+    padded = "  hello  "
+    avg_length = quality_metrics(padded)["avg_length"]
+    assert avg_length == 5
+def test_none_treated_as_empty():
+    """Passing ``None`` gives zero completeness and zero average length."""
+    metrics = quality_metrics(None)  # type: ignore[arg-type]
+    completeness = metrics["completeness"]
+    assert completeness == 0.0
+    avg_length = metrics["avg_length"]
+    assert avg_length == 0
+def test_long_text():
+    """A 10,000-character string is complete and reports its full length."""
+    length = 10_000
+    metrics = quality_metrics("a" * length)
+    assert metrics["completeness"] == 1.0
+    assert metrics["avg_length"] == length