flexorch-audit 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,13 @@
1
+ __pycache__/
2
+ *.pyc
3
+ *.pyo
4
+ *.egg-info/
5
+ dist/
6
+ build/
7
+ .venv/
8
+ venv/
9
+ .env
10
+ *.pth
11
+ .pytest_cache/
12
+ .ruff_cache/
13
+ *.egg
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 FlexOrch
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,107 @@
1
+ Metadata-Version: 2.4
2
+ Name: flexorch-audit
3
+ Version: 0.1.0
4
+ Summary: Zero-dependency PII + quality + noise audit for LLM datasets (TR/EU/US)
5
+ Project-URL: Homepage, https://github.com/flexorch/flexorch-audit
6
+ Project-URL: Issues, https://github.com/flexorch/flexorch-audit/issues
7
+ License: MIT
8
+ License-File: LICENSE
9
+ Keywords: audit,dataset,gdpr,kvkk,llm,pii,privacy,tckn
10
+ Classifier: Development Status :: 4 - Beta
11
+ Classifier: Intended Audience :: Developers
12
+ Classifier: License :: OSI Approved :: MIT License
13
+ Classifier: Programming Language :: Python :: 3
14
+ Classifier: Programming Language :: Python :: 3.10
15
+ Classifier: Programming Language :: Python :: 3.11
16
+ Classifier: Programming Language :: Python :: 3.12
17
+ Classifier: Programming Language :: Python :: 3.13
18
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
19
+ Classifier: Topic :: Text Processing
20
+ Requires-Python: >=3.10
21
+ Description-Content-Type: text/markdown
22
+
23
+ # flexorch-audit
24
+
25
+ Zero-dependency PII + quality + noise audit for LLM datasets. Answers one question: **is this dataset ready for LLM training?**
26
+
27
+ - **PII detection** — email, phone (TR + E.164), credit card (Luhn), IP, TCKN, IBAN, SSN, label-prefixed names
28
+ - **Quality metrics** — completeness, average length, duplicate ratio
29
+ - **Noise metrics** — garbage character ratio, encoding health
30
+ - **Masking** — redact / replace / token / hash strategies
31
+ - **Zero runtime dependencies** — pure Python stdlib, Python 3.10+
32
+
33
+ ```python
34
+ from flexorch_audit import audit, mask
35
+
36
+ result = audit(text, locale="tr")
37
+ # {
38
+ # "pii": [{"type": "email", "value": "ali@example.com", "start": 8, "end": 23}],
39
+ # "quality": {"completeness": 1.0, "avg_length": 342, "duplicate_ratio": None},
40
+ # "noise": {"garbage_ratio": 0.0, "encoding_ok": True},
41
+ # }
42
+
43
+ clean = mask(text, result["pii"], strategy="redact")
44
+ # "Contact: [REDACTED_EMAIL]"
45
+ ```
46
+
47
+ ## Install
48
+
49
+ ```bash
50
+ pip install flexorch-audit
51
+ ```
52
+
53
+ ## Locale support
54
+
55
+ | `locale` | Active detectors |
56
+ |----------|-----------------|
57
+ | `"tr"` (default) | email, iban, credit_card, ip + TCKN, phone_tr, name |
58
+ | `"us"` | email, iban, credit_card, ip + SSN, E.164 phone |
59
+ | `"eu"` | email, iban, credit_card, ip + E.164 phone |
60
+ | `"all"` | All of the above (phone_tr takes precedence over generic phone) |
61
+
62
+ ## PII types
63
+
64
+ | Type | Description | Locale |
65
+ |------|-------------|--------|
66
+ | `email` | RFC-5321 address | all |
67
+ | `iban` | ISO 13616 IBAN (any country) | all |
68
+ | `credit_card` | 16 digits in four groups separated by spaces or hyphens, Luhn-validated | all |
69
+ | `ip` | IPv4 address | all |
70
+ | `phone_tr` | Turkish mobile (+90/0 prefix + 10 digits) | tr |
71
+ | `national_id_tr` | TCKN — 11-digit modular arithmetic checksum | tr |
72
+ | `name` | Label-prefixed name (e.g. "Adı: Ali Yıldız", "Full Name: Jane Doe") | tr |
73
+ | `phone` | E.164 international phone | us, eu |
74
+ | `ssn` | US Social Security Number (###-##-####) | us |
75
+
76
+ ## Masking strategies
77
+
78
+ | Strategy | Example output |
79
+ |----------|----------------|
80
+ | `redact` (default) | `[REDACTED_EMAIL]` |
81
+ | `replace` | `user@example.com` (realistic synthetic) |
82
+ | `token` | `<PII_EMAIL_1>` (unique per type) |
83
+ | `hash` | `[3d4f9a1b2c8e7f0a]` (SHA-256 first 16 hex chars) |
84
+
85
+ ## Quality & noise
86
+
87
+ `duplicate_ratio` is always `None` in the result of `audit()`, which only sees a single string. To compute it across a dataset:
88
+
89
+ ```python
90
+ texts = [record["text"] for record in dataset]
91
+ results = [audit(t) for t in texts]
92
+
93
+ seen = set()
94
+ duplicates = sum(1 for t in texts if t in seen or seen.add(t))
95
+ duplicate_ratio = duplicates / len(texts)
96
+ ```
97
+
98
+ ## Limitations (v0.1)
99
+
100
+ - Free-standing name detection (without a label prefix) requires NLP/NER — not included.
101
+ - `duplicate_ratio` is always `None` for a single `audit()` call; compute it across your dataset manually (see above).
102
+ - IPv6 not detected.
103
+ - IBAN format-only check; mod-97 validation not performed.
104
+
105
+ ## License
106
+
107
+ MIT
@@ -0,0 +1,85 @@
1
+ # flexorch-audit
2
+
3
+ Zero-dependency PII + quality + noise audit for LLM datasets. Answers one question: **is this dataset ready for LLM training?**
4
+
5
+ - **PII detection** — email, phone (TR + E.164), credit card (Luhn), IP, TCKN, IBAN, SSN, label-prefixed names
6
+ - **Quality metrics** — completeness, average length, duplicate ratio
7
+ - **Noise metrics** — garbage character ratio, encoding health
8
+ - **Masking** — redact / replace / token / hash strategies
9
+ - **Zero runtime dependencies** — pure Python stdlib, Python 3.10+
10
+
11
+ ```python
12
+ from flexorch_audit import audit, mask
13
+
14
+ result = audit(text, locale="tr")
15
+ # {
16
+ # "pii": [{"type": "email", "value": "ali@example.com", "start": 8, "end": 23}],
17
+ # "quality": {"completeness": 1.0, "avg_length": 342, "duplicate_ratio": None},
18
+ # "noise": {"garbage_ratio": 0.0, "encoding_ok": True},
19
+ # }
20
+
21
+ clean = mask(text, result["pii"], strategy="redact")
22
+ # "Contact: [REDACTED_EMAIL]"
23
+ ```
24
+
25
+ ## Install
26
+
27
+ ```bash
28
+ pip install flexorch-audit
29
+ ```
30
+
31
+ ## Locale support
32
+
33
+ | `locale` | Active detectors |
34
+ |----------|-----------------|
35
+ | `"tr"` (default) | email, iban, credit_card, ip + TCKN, phone_tr, name |
36
+ | `"us"` | email, iban, credit_card, ip + SSN, E.164 phone |
37
+ | `"eu"` | email, iban, credit_card, ip + E.164 phone |
38
+ | `"all"` | All of the above (phone_tr takes precedence over generic phone) |
39
+
40
+ ## PII types
41
+
42
+ | Type | Description | Locale |
43
+ |------|-------------|--------|
44
+ | `email` | RFC-5321 address | all |
45
+ | `iban` | ISO 13616 IBAN (any country) | all |
46
+ | `credit_card` | 16 digits in four groups separated by spaces or hyphens, Luhn-validated | all |
47
+ | `ip` | IPv4 address | all |
48
+ | `phone_tr` | Turkish mobile (+90/0 prefix + 10 digits) | tr |
49
+ | `national_id_tr` | TCKN — 11-digit modular arithmetic checksum | tr |
50
+ | `name` | Label-prefixed name (e.g. "Adı: Ali Yıldız", "Full Name: Jane Doe") | tr |
51
+ | `phone` | E.164 international phone | us, eu |
52
+ | `ssn` | US Social Security Number (###-##-####) | us |
53
+
54
+ ## Masking strategies
55
+
56
+ | Strategy | Example output |
57
+ |----------|----------------|
58
+ | `redact` (default) | `[REDACTED_EMAIL]` |
59
+ | `replace` | `user@example.com` (realistic synthetic) |
60
+ | `token` | `<PII_EMAIL_1>` (unique per type) |
61
+ | `hash` | `[3d4f9a1b2c8e7f0a]` (SHA-256 first 16 hex chars) |
62
+
63
+ ## Quality & noise
64
+
65
+ `duplicate_ratio` is always `None` in the result of `audit()`, which only sees a single string. To compute it across a dataset:
66
+
67
+ ```python
68
+ texts = [record["text"] for record in dataset]
69
+ results = [audit(t) for t in texts]
70
+
71
+ seen = set()
72
+ duplicates = sum(1 for t in texts if t in seen or seen.add(t))
73
+ duplicate_ratio = duplicates / len(texts)
74
+ ```
75
+
76
+ ## Limitations (v0.1)
77
+
78
+ - Free-standing name detection (without a label prefix) requires NLP/NER — not included.
79
+ - `duplicate_ratio` is always `None` for a single `audit()` call; compute it across your dataset manually (see above).
80
+ - IPv6 not detected.
81
+ - IBAN format-only check; mod-97 validation not performed.
82
+
83
+ ## License
84
+
85
+ MIT
@@ -0,0 +1,38 @@
1
+ [build-system]
2
+ requires = ["hatchling"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "flexorch-audit"
7
+ version = "0.1.0"
8
+ description = "Zero-dependency PII + quality + noise audit for LLM datasets (TR/EU/US)"
9
+ readme = "README.md"
10
+ license = { text = "MIT" }
11
+ requires-python = ">=3.10"
12
+ dependencies = []
13
+ keywords = ["pii", "privacy", "llm", "dataset", "audit", "tckn", "kvkk", "gdpr"]
14
+ classifiers = [
15
+ "Development Status :: 4 - Beta",
16
+ "Intended Audience :: Developers",
17
+ "License :: OSI Approved :: MIT License",
18
+ "Programming Language :: Python :: 3",
19
+ "Programming Language :: Python :: 3.10",
20
+ "Programming Language :: Python :: 3.11",
21
+ "Programming Language :: Python :: 3.12",
22
+ "Programming Language :: Python :: 3.13",
23
+ "Topic :: Text Processing",
24
+ "Topic :: Software Development :: Libraries :: Python Modules",
25
+ ]
26
+
27
+ [project.urls]
28
+ Homepage = "https://github.com/flexorch/flexorch-audit"
29
+ Issues = "https://github.com/flexorch/flexorch-audit/issues"
30
+
31
+ [tool.hatch.build.targets.wheel]
32
+ packages = ["src/flexorch_audit"]
33
+
34
+ [tool.pytest.ini_options]
35
+ testpaths = ["tests"]
36
+
37
+ [tool.ruff]
38
+ line-length = 100
@@ -0,0 +1,64 @@
1
+ """
2
+ flexorch-audit — zero-dependency PII + quality + noise audit for LLM datasets.
3
+
4
+ from flexorch_audit import audit, mask
5
+
6
+ result = audit(text, locale="tr")
7
+ # {
8
+ # "pii": [{"type": "email", "value": "...", "start": 5, "end": 22}, ...],
9
+ # "quality": {"completeness": 1.0, "avg_length": 342, "duplicate_ratio": None},
10
+ # "noise": {"garbage_ratio": 0.0, "encoding_ok": True},
11
+ # }
12
+
13
+ clean = mask(text, result["pii"], strategy="redact")
14
+ """
15
+
16
+ from ._pii import detect_pii
17
+ from ._quality import quality_metrics
18
+ from ._noise import noise_metrics
19
+ from ._mask import apply_mask
20
+
21
+ __version__ = "0.1.0"
22
+ __all__ = ["audit", "mask", "__version__"]
23
+
24
+
25
def audit(text: str, locale: str = "tr") -> dict:
    """
    Run the three audit pillars (PII, quality, noise) over *text*.

    Args:
        text:   Raw text to analyse.
        locale: Selects the locale-specific PII detectors; it is forwarded
                unchanged to the PII detector.
                  "tr"  — Turkish: TCKN, phone_tr, name (default)
                  "us"  — US: SSN, E.164 phone
                  "eu"  — EU: E.164 phone
                  "all" — All detectors (phone_tr takes precedence over generic phone)
                Universal detectors (email, iban, credit_card, ip) are always active.

    Returns:
        A dict with exactly three keys:
          "pii"     — list of {type, value, start, end} findings sorted by position
          "quality" — {completeness, avg_length, duplicate_ratio}
          "noise"   — {garbage_ratio, encoding_ok}
    """
    pii_findings = detect_pii(text, locale=locale)
    quality = quality_metrics(text)
    noise = noise_metrics(text)
    return {"pii": pii_findings, "quality": quality, "noise": noise}
50
+
51
+
52
def mask(text: str, findings: list[dict], strategy: str = "redact") -> str:
    """
    Mask the PII spans described by *findings* inside *text*.

    Args:
        text:     Original text.
        findings: Findings as returned under the "pii" key of audit().
        strategy: One of "redact" (default), "replace", "token" or "hash".

    Returns:
        A copy of *text* in which each finding has been substituted
        according to *strategy*.
    """
    masked = apply_mask(text, findings, strategy)
    return masked
@@ -0,0 +1,57 @@
1
+ import hashlib
2
+
3
# Realistic-looking synthetic replacements for strategy="replace"
_SYNTHETIC: dict[str, str] = {
    "email": "user@example.com",
    "phone": "+1 000 000 0000",
    "phone_tr": "0500 000 00 00",
    "national_id_tr": "00000000000",
    "ssn": "000-00-0000",
    "iban": "XX00 0000 0000 0000 0000 00",
    "credit_card": "0000 0000 0000 0000",
    "ip": "0.0.0.0",
    "name": "AD SOYAD",
}

_VALID_STRATEGIES = frozenset({"redact", "replace", "token", "hash"})


def apply_mask(text: str, findings: list[dict], strategy: str = "redact") -> str:
    """
    Replace PII spans in *text* according to *strategy*.

    Strategies:
        redact  — [REDACTED_EMAIL], [REDACTED_PHONE_TR], … (default)
        replace — realistic synthetic value (e.g. user@example.com); types
                  without a synthetic value fall back to [TYPE]
        token   — <PII_EMAIL_1>, <PII_EMAIL_2>, … numbered per type in
                  reading order (left to right) within one call
        hash    — first 16 hex chars of SHA-256(original_value), in brackets

    Args:
        text:     Original text; empty or None yields "".
        findings: Dicts with "type", "value", "start" and "end" keys.
        strategy: One of the strategy names above.

    Returns:
        The masked text, or *text* unchanged when there are no findings.

    Raises:
        ValueError: If *strategy* is not a known strategy name.
    """
    if strategy not in _VALID_STRATEGIES:
        raise ValueError(f"Unknown strategy {strategy!r}. Use: {', '.join(sorted(_VALID_STRATEGIES))}")
    if not text or not findings:
        return text or ""

    # Pass 1 (reading order): pick the replacement for every finding. Numbering
    # here, rather than while splicing right to left, is what makes the first
    # occurrence in the text receive token number 1.
    counter: dict[str, int] = {}
    planned: list[tuple[int, int, str]] = []
    for finding in sorted(findings, key=lambda f: f["start"]):
        ptype = finding["type"]
        counter[ptype] = counter.get(ptype, 0) + 1
        tag = ptype.upper()

        if strategy == "redact":
            replacement = f"[REDACTED_{tag}]"
        elif strategy == "replace":
            replacement = _SYNTHETIC.get(ptype, f"[{tag}]")
        elif strategy == "token":
            replacement = f"<PII_{tag}_{counter[ptype]}>"
        else:  # hash
            h = hashlib.sha256(finding["value"].encode()).hexdigest()[:16]
            replacement = f"[{h}]"

        planned.append((finding["start"], finding["end"], replacement))

    # Pass 2 (right to left): splice replacements in so that earlier indices
    # are not shifted by replacements made further right.
    result = text
    for start, end, replacement in reversed(planned):
        result = result[:start] + replacement + result[end:]

    return result
@@ -0,0 +1,35 @@
1
+ import unicodedata
2
+
3
+ # Unicode general categories that indicate non-printable / garbage characters.
4
+ # Cc=control, Cs=surrogate, Co=private-use, Cn=unassigned
5
+ _GARBAGE_CATS = frozenset({"Cc", "Cs", "Co", "Cn"})
6
+
7
+ # Normal whitespace is not garbage even though it falls in Cc
8
+ _SAFE_WHITESPACE = frozenset(" \t\n\r\x0b\x0c")
9
+
10
+
11
+ def _is_garbage(ch: str) -> bool:
12
+ if ch in _SAFE_WHITESPACE:
13
+ return False
14
+ return unicodedata.category(ch) in _GARBAGE_CATS or ch == "�"
15
+
16
+
17
+ def noise_metrics(text: str) -> dict:
18
+ """
19
+ Compute noise metrics for a single text record.
20
+
21
+ Returns:
22
+ garbage_ratio — fraction of characters that are control/private/unassigned
23
+ or Unicode replacement characters (U+FFFD)
24
+ encoding_ok — False when U+FFFD replacement characters are present,
25
+ which typically indicates a transcoding error
26
+ """
27
+ if not text:
28
+ return {"garbage_ratio": 0.0, "encoding_ok": True}
29
+
30
+ n = len(text)
31
+ garbage = sum(1 for ch in text if _is_garbage(ch))
32
+ return {
33
+ "garbage_ratio": round(garbage / n, 4),
34
+ "encoding_ok": "�" not in text,
35
+ }
@@ -0,0 +1,163 @@
1
+ import re
2
+
3
+ # ── Universal detectors ──────────────────────────────────────────────────────
4
+
5
# Email — simplified pattern: local part, "@", dotted domain, alphabetic TLD of
# at least two letters.
EMAIL_RE = re.compile(r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b")

# E.164 international phone — requires + prefix, 10+ total digits
# Used for locale=us/eu. TR phones covered by PHONE_TR_RE.
# The pattern alone admits as few as 9 digits (1+1+3+4); the 10-digit floor is
# enforced by the caller in detect_pii.
PHONE_INTL_RE = re.compile(
    r"\+\d{1,3}[\s\-\.]?\(?\d{1,4}\)?[\s\-\.]?\d{3,4}[\s\-\.]?\d{4}\b"
)

# IBAN — ISO 13616 (all countries, including TR)
# Format check only: two uppercase letters, two digits, then 11–30 uppercase
# alphanumerics with no separators. No checksum is computed here.
IBAN_RE = re.compile(r"\b[A-Z]{2}\d{2}[0-9A-Z]{11,30}\b")

# Credit card — 16 digits with separator groups (Luhn-validated separately)
# A space or hyphen between the four groups is mandatory; an unseparated
# 16-digit run does not match.
CC_RE = re.compile(r"\b\d{4}[ \-]\d{4}[ \-]\d{4}[ \-]\d{4}\b")

# IPv4 — four dot-separated octets, each restricted to 0–255
IPV4_RE = re.compile(
    r"\b(?:(?:25[0-5]|2[0-4]\d|[01]?\d\d?)\.){3}(?:25[0-5]|2[0-4]\d|[01]?\d\d?)\b"
)
23
+
24
+ # ── Turkish detectors ────────────────────────────────────────────────────────
25
+
26
# Turkish mobile: +90 5xx..., 0 5xx... or bare 5xx (10 national digits).
# A leading \b cannot be used: there is no word boundary between a space (or
# start of text) and "+", so the "+90" prefix would be left out of the match.
# Instead each alternative carries its own left-hand guard:
#   "+90"  — accepted anywhere ("+" is itself a delimiter)
#   "0"    — must not directly follow a word character
#   bare 5 — must not directly follow a word character
# Whitespace is only consumed after a prefix, so matches never start with it.
PHONE_TR_RE = re.compile(
    r"(?:\+90\s*|(?<!\w)0\s*|(?<!\w))5\d{2}\s*\d{3}\s*\d{2}\s*\d{2}\b"
)
28
+
29
# TCKN — first digit non-zero, 11 digits, checksum-validated below
# Group 1 holds the 11-digit candidate.
TCKN_RE = re.compile(r"\b([1-9]\d{10})\b")

# Turkish field labels that introduce a person's name (non-capturing group).
# NOTE(review): the labels carry no leading \b, so a label embedded at the end
# of a longer word can still match — confirm this is intended.
_NAME_PREFIX_TR = (
    r"(?:Ad[ıi]\s*(?:Soyad[ıi])?|Soyad[ıi]|İsim|"
    r"Müşteri\s+Ad[ıi]|Yetkili(?:\s+Kişi)?|Çalışan\s+Ad[ıi]|"
    r"Personel\s+Ad[ıi]|Kişi\s+Ad[ıi]|Satıcı\s+Ad[ıi]|"
    r"Alıcı\s+Ad[ıi]|İlgili\s+Kişi|Hesap\s+Sahibi)"
)
# English field labels (non-capturing group). The bare "Name" alternative is
# rejected when directly preceded by the word "User" and one whitespace char.
_NAME_PREFIX_EN = (
    r"(?:Full\s+Name|Customer\s+Name|Employee\s+Name|"
    r"Contact\s+Name|Authorized\s+(?:By|Person)|Account\s+Holder|"
    r"(?<!\bUser\s)Name)"
)
# The name itself: one to three capitalised words (Turkish letters included).
# This is the only capturing group in the assembled pattern.
_NAME_VALUE = r"([A-ZÇĞİÖŞÜ][a-zçğışöşü]+(?:\s+[A-ZÇĞİÖŞÜ][a-zçğışöşü]+){0,2})"

# Label-prefixed name detection (TR and EN labels). NLP-based free-standing name
# detection is out of scope for v0.1 — requires NER.
# Shape: <label> optional-space (":" or "-") optional-space <name>.
NAME_RE = re.compile(
    rf"(?:{_NAME_PREFIX_TR}|{_NAME_PREFIX_EN})\s*[:\-]\s*{_NAME_VALUE}",
    re.UNICODE,
)
51
+
52
+ # ── US detectors ─────────────────────────────────────────────────────────────
53
+
54
# SSN — hyphens required to minimise false positives
# The negative lookaheads reject first groups 000, 666 and 900–999, a middle
# group of 00, and a last group of 0000.
SSN_RE = re.compile(r"\b(?!000|666|9\d{2})\d{3}-(?!00)\d{2}-(?!0000)\d{4}\b")
56
+
57
+ # ── Validation helpers ────────────────────────────────────────────────────────
58
+
59
+
60
+ def _valid_tckn(s: str) -> bool:
61
+ # TR Nüfus Müdürlüğü modular arithmetic — same as Luhn-family checksums
62
+ if len(s) != 11 or s[0] == "0":
63
+ return False
64
+ d = [int(c) for c in s]
65
+ sum_odd = d[0] + d[2] + d[4] + d[6] + d[8]
66
+ sum_even = d[1] + d[3] + d[5] + d[7]
67
+ if (sum_odd * 7 - sum_even) % 10 != d[9]:
68
+ return False
69
+ return sum(d[:10]) % 10 == d[10]
70
+
71
+
72
+ def _luhn(number: str) -> bool:
73
+ # ISO/IEC 7812 Luhn checksum
74
+ digits = [int(c) for c in number if c.isdigit()]
75
+ if not 13 <= len(digits) <= 19:
76
+ return False
77
+ total = 0
78
+ for i, d in enumerate(reversed(digits)):
79
+ if i % 2 == 1:
80
+ d *= 2
81
+ if d > 9:
82
+ d -= 9
83
+ total += d
84
+ return total % 10 == 0
85
+
86
+
87
+ # ── Locale registry ───────────────────────────────────────────────────────────
88
+
89
+ _LOCALE_DETECTORS: dict[str, set[str]] = {
90
+ "tr": {"national_id_tr", "phone_tr", "name"},
91
+ "us": {"ssn", "phone"},
92
+ "eu": {"phone"},
93
+ }
94
+ _UNIVERSAL: set[str] = {"email", "iban", "credit_card", "ip"}
95
+
96
+
97
+ def _active(locale: str) -> set[str]:
98
+ if locale == "all":
99
+ active: set[str] = set(_UNIVERSAL)
100
+ for detectors in _LOCALE_DETECTORS.values():
101
+ active |= detectors
102
+ # phone_tr is more specific than generic phone; skip generic when both active
103
+ if "phone_tr" in active:
104
+ active.discard("phone")
105
+ return active
106
+ return _UNIVERSAL | _LOCALE_DETECTORS.get(locale, set())
107
+
108
+
109
+ # ── Public detector ───────────────────────────────────────────────────────────
110
+
111
+
112
def detect_pii(text: str, locale: str = "tr") -> list[dict]:
    """
    Scan *text* with every detector enabled for *locale*.

    Returns a list of findings ordered by start offset. Each finding is
    {"type": str, "value": str, "start": int, "end": int}. None or empty
    text yields an empty list.
    """
    enabled = _active(locale)
    haystack = text or ""

    def _hit(kind: str, m, group=0) -> dict:
        # Build one finding from match *m*, using the span of *group*.
        return {"type": kind, "value": m.group(group), "start": m.start(group), "end": m.end(group)}

    hits: list[dict] = []

    if "email" in enabled:
        hits.extend(_hit("email", m) for m in EMAIL_RE.finditer(haystack))

    if "phone" in enabled:
        # The pattern alone can match 9 digits; require at least 10.
        hits.extend(
            _hit("phone", m)
            for m in PHONE_INTL_RE.finditer(haystack)
            if sum(ch.isdigit() for ch in m.group()) >= 10
        )

    if "iban" in enabled:
        hits.extend(_hit("iban", m) for m in IBAN_RE.finditer(haystack))

    if "credit_card" in enabled:
        hits.extend(_hit("credit_card", m) for m in CC_RE.finditer(haystack) if _luhn(m.group()))

    if "ip" in enabled:
        hits.extend(_hit("ip", m) for m in IPV4_RE.finditer(haystack))

    if "phone_tr" in enabled:
        hits.extend(_hit("phone_tr", m) for m in PHONE_TR_RE.finditer(haystack))

    if "national_id_tr" in enabled:
        # Group 1 is the 11-digit candidate; it spans the whole match.
        hits.extend(
            _hit("national_id_tr", m, 1)
            for m in TCKN_RE.finditer(haystack)
            if _valid_tckn(m.group(1))
        )

    if "name" in enabled:
        # Report only the captured name, not the label that precedes it.
        hits.extend(_hit("name", m, m.lastindex) for m in NAME_RE.finditer(haystack))

    if "ssn" in enabled:
        hits.extend(_hit("ssn", m) for m in SSN_RE.finditer(haystack))

    return sorted(hits, key=lambda f: f["start"])
@@ -0,0 +1,16 @@
1
def quality_metrics(text: str) -> dict:
    """
    Report quality metrics for one text record.

    Returns:
        completeness    — 1.0 when the text still has content after
                          surrounding whitespace is stripped, otherwise 0.0
        avg_length      — number of characters in the stripped text
        duplicate_ratio — always None: a single record cannot be a duplicate
                          of itself, so the ratio has to be computed by the
                          caller across the texts of the whole dataset
    """
    content = text.strip() if text else ""
    length = len(content)
    return {
        "completeness": 1.0 if length else 0.0,
        "avg_length": length,
        "duplicate_ratio": None,
    }
File without changes
@@ -0,0 +1,75 @@
1
+ """Integration tests for the public audit() and mask() API."""
2
+ import flexorch_audit
3
+ from flexorch_audit import audit, mask
4
+
5
+
6
def test_version_present():
    """The package exposes a string __version__ in the 0.x series."""
    version = flexorch_audit.__version__
    assert isinstance(version, str)
    assert version.startswith("0.")
9
+
10
+
11
def test_audit_returns_all_pillars():
    """audit() reports the pii, quality and noise sections."""
    report = audit("Hello world", locale="tr")
    for pillar in ("pii", "quality", "noise"):
        assert pillar in report
16
+
17
+
18
def test_audit_clean_text():
    """Plain prose produces no findings, full completeness and no garbage."""
    report = audit("The quick brown fox jumps over the lazy dog.", locale="tr")
    assert report["pii"] == []
    assert report["quality"]["completeness"] == 1.0
    assert report["noise"]["garbage_ratio"] == 0.0
23
+
24
+
25
def test_audit_email_found():
    """An email address in the text is reported as an "email" finding."""
    report = audit("Contact us: hello@flexorch.com", locale="tr")
    found_types = [finding["type"] for finding in report["pii"]]
    assert "email" in found_types
28
+
29
+
30
def test_audit_tckn_found():
    """A checksum-valid TCKN is reported as "national_id_tr"."""
    report = audit("TC kimlik: 12345678950", locale="tr")
    found_types = [finding["type"] for finding in report["pii"]]
    assert "national_id_tr" in found_types
33
+
34
+
35
def test_mask_redact_round_trip():
    """Findings from audit() can be fed straight into mask() with "redact"."""
    original = "Email: test@example.com"
    findings = audit(original, locale="tr")["pii"]
    masked = mask(original, findings, strategy="redact")
    assert "test@example.com" not in masked
    assert "[REDACTED_EMAIL]" in masked
41
+
42
+
43
def test_mask_no_pii_unchanged():
    """Text without findings comes back from mask() untouched."""
    original = "Clean text with no personal data."
    findings = audit(original, locale="tr")["pii"]
    masked = mask(original, findings)
    assert masked == original
47
+
48
+
49
def test_audit_empty_string():
    """An empty string yields no findings, zero completeness, encoding_ok True."""
    report = audit("", locale="tr")
    assert report["pii"] == []
    assert report["quality"]["completeness"] == 0.0
    assert report["noise"]["encoding_ok"] is True
54
+
55
+
56
def test_audit_locale_us_ssn():
    """The "us" locale enables SSN detection."""
    report = audit("SSN: 123-45-6789", locale="us")
    found_types = [finding["type"] for finding in report["pii"]]
    assert "ssn" in found_types
59
+
60
+
61
def test_audit_locale_all():
    """The "all" locale finds Turkish, US and universal PII in one pass."""
    sample = "TC: 12345678950, SSN: 123-45-6789, email: x@y.com"
    report = audit(sample, locale="all")
    found_types = {finding["type"] for finding in report["pii"]}
    for expected in ("national_id_tr", "ssn", "email"):
        assert expected in found_types
68
+
69
+
70
def test_mask_strategies_all_remove_pii():
    """Every masking strategy removes the original email from the text."""
    original = "Contact: ali@example.com"
    findings = audit(original, locale="tr")["pii"]
    for strategy in ("redact", "replace", "token", "hash"):
        masked = mask(original, findings, strategy=strategy)
        assert "ali@example.com" not in masked, f"PII still present with strategy={strategy}"
@@ -0,0 +1,87 @@
1
+ import pytest
2
+ from flexorch_audit._mask import apply_mask
3
+
4
+
5
# Shared fixture: a single email finding whose span [7, 14) covers "a@b.com"
# inside _TEXT.
_FINDINGS = [{"type": "email", "value": "a@b.com", "start": 7, "end": 14}]
_TEXT = "Email: a@b.com end"
7
+
8
+
9
def test_redact_strategy():
    """"redact" swaps the email for a [REDACTED_EMAIL] marker."""
    masked = apply_mask(_TEXT, _FINDINGS, strategy="redact")
    assert "a@b.com" not in masked
    assert "[REDACTED_EMAIL]" in masked
13
+
14
+
15
def test_replace_strategy():
    """"replace" substitutes a synthetic example.com address."""
    masked = apply_mask(_TEXT, _FINDINGS, strategy="replace")
    assert "a@b.com" not in masked
    assert "example.com" in masked
19
+
20
+
21
def test_token_strategy():
    """"token" replaces the only email with <PII_EMAIL_1>."""
    masked = apply_mask(_TEXT, _FINDINGS, strategy="token")
    assert "a@b.com" not in masked
    assert "<PII_EMAIL_1>" in masked
25
+
26
+
27
def test_hash_strategy():
    """"hash" replaces the email with 16 hex characters wrapped in []."""
    import re

    masked = apply_mask(_TEXT, _FINDINGS, strategy="hash")
    assert "a@b.com" not in masked
    hash_marker = re.search(r"\[[0-9a-f]{16}\]", masked)
    assert hash_marker
33
+
34
+
35
def test_default_strategy_is_redact():
    """Omitting the strategy argument behaves like "redact"."""
    masked = apply_mask(_TEXT, _FINDINGS)
    assert "[REDACTED_EMAIL]" in masked
38
+
39
+
40
def test_invalid_strategy_raises():
    """An unknown strategy name is rejected with ValueError."""
    expect_error = pytest.raises(ValueError, match="Unknown strategy")
    with expect_error:
        apply_mask(_TEXT, _FINDINGS, strategy="invalid")
43
+
44
+
45
def test_empty_findings_returns_original():
    """With no findings the text is returned unchanged."""
    untouched = apply_mask(_TEXT, [])
    assert untouched == _TEXT
47
+
48
+
49
def test_empty_text_returns_empty():
    """Empty text stays empty even when findings are supplied."""
    masked = apply_mask("", _FINDINGS)
    assert masked == ""
51
+
52
+
53
def test_multiple_findings_correct_order():
    """Two findings in one text are both replaced without corrupting offsets."""
    sample = "a@b.com and c@d.com"
    first = {"type": "email", "value": "a@b.com", "start": 0, "end": 7}
    second = {"type": "email", "value": "c@d.com", "start": 12, "end": 19}
    masked = apply_mask(sample, [first, second], strategy="redact")
    for address in ("a@b.com", "c@d.com"):
        assert address not in masked
    assert masked.count("[REDACTED_EMAIL]") == 2
63
+
64
+
65
def test_token_counter_per_type():
    """Two findings of the same type receive two distinct, numbered tokens."""
    text = "a@b.com c@d.com"
    findings = [
        {"type": "email", "value": "a@b.com", "start": 0, "end": 7},
        {"type": "email", "value": "c@d.com", "start": 8, "end": 15},
    ]
    result = apply_mask(text, findings, strategy="token")
    # Checking only the "<PII_EMAIL_" prefix would also pass if the counter
    # never advanced, so require both numbered tokens and no leftover PII.
    assert "<PII_EMAIL_1>" in result
    assert "<PII_EMAIL_2>" in result
    assert "a@b.com" not in result
    assert "c@d.com" not in result
74
+
75
+
76
def test_phone_tr_replace_synthetic():
    """"replace" uses the synthetic Turkish number for phone_tr findings."""
    sample = "Tel: 0532 123 45 67"
    phone = {"type": "phone_tr", "value": "0532 123 45 67", "start": 5, "end": 19}
    masked = apply_mask(sample, [phone], strategy="replace")
    assert "0532 123 45 67" not in masked
    assert "0500 000 00 00" in masked
82
+
83
+
84
def test_hash_is_deterministic():
    """Hashing the same input twice gives the same masked text."""
    first_run, second_run = (
        apply_mask(_TEXT, _FINDINGS, strategy="hash"),
        apply_mask(_TEXT, _FINDINGS, strategy="hash"),
    )
    assert first_run == second_run
@@ -0,0 +1,51 @@
1
+ from flexorch_audit._noise import noise_metrics
2
+
3
+
4
def test_clean_text():
    """Ordinary ASCII text has no garbage and a healthy encoding."""
    metrics = noise_metrics("Hello, world!")
    assert metrics["garbage_ratio"] == 0.0
    assert metrics["encoding_ok"] is True
8
+
9
+
10
def test_empty_string():
    """An empty string reports zero garbage and encoding_ok True."""
    metrics = noise_metrics("")
    assert metrics["garbage_ratio"] == 0.0
    assert metrics["encoding_ok"] is True
14
+
15
+
16
def test_none_treated_as_empty():
    """None is handled the same way as an empty string."""
    metrics = noise_metrics(None)  # type: ignore[arg-type]
    assert metrics["garbage_ratio"] == 0.0
    assert metrics["encoding_ok"] is True
20
+
21
+
22
def test_encoding_error_detected():
    """A U+FFFD replacement character flags the encoding and counts as garbage."""
    sample = "Normal text � with replacement char"
    metrics = noise_metrics(sample)
    assert metrics["encoding_ok"] is False
    assert metrics["garbage_ratio"] > 0.0
27
+
28
+
29
def test_control_characters_counted():
    """A Cc control character that is not normal whitespace counts as garbage."""
    sample = "abc\x01def"  # \x01 is a control character
    metrics = noise_metrics(sample)
    assert metrics["garbage_ratio"] > 0.0
34
+
35
+
36
def test_normal_whitespace_not_garbage():
    """Newlines and tabs are not counted as garbage."""
    sample = "line one\nline two\ttabbed"
    metrics = noise_metrics(sample)
    assert metrics["garbage_ratio"] == 0.0
40
+
41
+
42
def test_high_garbage_text():
    """Text made only of control characters has a garbage ratio of 1.0."""
    sample = "\x00\x01\x02\x03\x04\x05"
    metrics = noise_metrics(sample)
    assert metrics["garbage_ratio"] == 1.0
46
+
47
+
48
def test_unicode_text_no_garbage():
    """Turkish letters are legitimate text, not garbage."""
    metrics = noise_metrics("Türkçe metin: Çiğdem, Şükrü, İstanbul")
    assert metrics["garbage_ratio"] == 0.0
    assert metrics["encoding_ok"] is True
@@ -0,0 +1,191 @@
1
+ import pytest
2
+ from flexorch_audit._pii import detect_pii, _valid_tckn, _luhn
3
+
4
+
5
+ # ── TCKN checksum ─────────────────────────────────────────────────────────────
6
+
7
+
8
def test_valid_tckn():
    """A hand-computed, checksum-correct 11-digit number is accepted."""
    # Worked example for digits d = [1,2,3,4,5,6,7,8,9,5,0] (zero-based index):
    #   sum_odd = 1+3+5+7+9 = 25, sum_even = 2+4+6+8 = 20
    #   d[9]  = (25*7 - 20) % 10 = (175 - 20) % 10 = 5
    #   d[10] = (sum of d[0..9]) % 10 = 50 % 10 = 0
    verdict = _valid_tckn("12345678950")
    assert verdict is True
12
+
13
+
14
def test_invalid_tckn_wrong_checksum():
    """An 11-digit number whose check digits do not match is rejected."""
    verdict = _valid_tckn("12345678900")
    assert verdict is False
16
+
17
+
18
def test_invalid_tckn_starts_with_zero():
    """An 11-digit candidate with a leading zero is rejected."""
    verdict = _valid_tckn("01234567890")
    assert verdict is False
20
+
21
+
22
def test_invalid_tckn_wrong_length():
    """A 10-digit candidate is rejected."""
    verdict = _valid_tckn("1234567890")
    assert verdict is False
24
+
25
+
26
+ # ── Luhn ─────────────────────────────────────────────────────────────────────
27
+
28
+
29
def test_luhn_valid_visa():
    """A 16-digit Visa-style number with a correct Luhn check digit passes."""
    verdict = _luhn("4532015112830366")
    assert verdict is True
31
+
32
+
33
def test_luhn_invalid():
    """A 16-digit number that fails the Luhn checksum is rejected."""
    verdict = _luhn("1234567890123456")
    assert verdict is False
35
+
36
+
37
def test_luhn_too_short():
    """A six-digit input is rejected by the Luhn helper."""
    verdict = _luhn("123456")
    assert verdict is False
39
+
40
+
41
+ # ── Email ─────────────────────────────────────────────────────────────────────
42
+
43
+
44
def test_email_detected():
    """An address embedded in a sentence is reported as an ``email`` finding."""
    hits = detect_pii("Contact: test@example.com today", locale="tr")
    assert any(
        hit["value"] == "test@example.com"
        for hit in hits
        if hit["type"] == "email"
    )
47
+
48
+
49
def test_email_subdomain():
    """An address with a multi-label domain (``mail.co.uk``) is still found."""
    hits = detect_pii("Send to ali@mail.co.uk", locale="tr")
    assert any(hit["type"] == "email" for hit in hits)
52
+
53
+
54
def test_no_email_in_clean_text():
    """Text without any address produces no ``email`` finding."""
    hits = detect_pii("Hello world, no PII here.", locale="tr")
    assert all(hit["type"] != "email" for hit in hits)
57
+
58
+
59
+ # ── Turkish phone ─────────────────────────────────────────────────────────────
60
+
61
+
62
def test_phone_tr_with_prefix():
    """A number written with the ``+90`` prefix is reported as ``phone_tr``."""
    hits = detect_pii("Ara: +90 532 123 45 67", locale="tr")
    assert any(hit["type"] == "phone_tr" for hit in hits)
65
+
66
+
67
def test_phone_tr_domestic():
    """A number in the leading-zero domestic form is reported as ``phone_tr``."""
    hits = detect_pii("GSM: 0532 123 45 67", locale="tr")
    assert any(hit["type"] == "phone_tr" for hit in hits)
70
+
71
+
72
def test_phone_tr_not_in_us_locale():
    """With ``locale="us"`` the same number yields no ``phone_tr`` finding."""
    hits = detect_pii("GSM: 0532 123 45 67", locale="us")
    assert all(hit["type"] != "phone_tr" for hit in hits)
75
+
76
+
77
+ # ── TCKN ──────────────────────────────────────────────────────────────────────
78
+
79
+
80
def test_tckn_detected():
    """A checksum-valid number is reported as ``national_id_tr`` with its exact value."""
    hits = detect_pii("TC: 12345678950", locale="tr")
    assert any(
        hit["value"] == "12345678950"
        for hit in hits
        if hit["type"] == "national_id_tr"
    )
83
+
84
+
85
def test_invalid_tckn_not_detected():
    """An 11-digit number with a bad checksum yields no ``national_id_tr`` finding."""
    hits = detect_pii("TC: 12345678900", locale="tr")
    assert all(hit["type"] != "national_id_tr" for hit in hits)
88
+
89
+
90
def test_tckn_not_in_us_locale():
    """With ``locale="us"`` a valid number yields no ``national_id_tr`` finding."""
    hits = detect_pii("TC: 12345678950", locale="us")
    assert all(hit["type"] != "national_id_tr" for hit in hits)
93
+
94
+
95
+ # ── IBAN ──────────────────────────────────────────────────────────────────────
96
+
97
+
98
def test_iban_tr_detected():
    """An IBAN starting with ``TR`` is reported as an ``iban`` finding."""
    hits = detect_pii("IBAN: TR330006100519786457841326", locale="tr")
    assert any(hit["type"] == "iban" for hit in hits)
101
+
102
+
103
def test_iban_de_detected():
    """An IBAN starting with ``DE`` is reported even when the locale is ``tr``."""
    hits = detect_pii("Bank: DE89370400440532013000", locale="tr")
    assert any(hit["type"] == "iban" for hit in hits)
106
+
107
+
108
+ # ── Credit card ───────────────────────────────────────────────────────────────
109
+
110
+
111
def test_credit_card_detected():
    """A space-grouped, Luhn-valid card number is reported as ``credit_card``."""
    # Known Luhn-valid Visa test number, written in groups of four.
    hits = detect_pii("Card: 4532 0151 1283 0366", locale="tr")
    assert any(hit["type"] == "credit_card" for hit in hits)
115
+
116
+
117
def test_invalid_cc_not_detected():
    """A 16-digit grouped number that fails Luhn yields no ``credit_card`` finding."""
    hits = detect_pii("Ref: 1234 5678 9012 3456", locale="tr")
    assert all(hit["type"] != "credit_card" for hit in hits)
120
+
121
+
122
+ # ── IP ────────────────────────────────────────────────────────────────────────
123
+
124
+
125
def test_ip_detected():
    """A dotted-quad address is reported as ``ip`` with its exact value."""
    hits = detect_pii("Server: 192.168.1.100", locale="tr")
    assert any(
        hit["value"] == "192.168.1.100"
        for hit in hits
        if hit["type"] == "ip"
    )
128
+
129
+
130
def test_invalid_ip_not_detected():
    """A dotted quad with out-of-range octets yields no ``ip`` finding."""
    hits = detect_pii("Bad IP: 999.999.999.999", locale="tr")
    assert all(hit["type"] != "ip" for hit in hits)
133
+
134
+
135
+ # ── SSN ───────────────────────────────────────────────────────────────────────
136
+
137
+
138
def test_ssn_detected_us_locale():
    """With ``locale="us"`` a dashed number is reported as ``ssn`` with its exact value."""
    hits = detect_pii("SSN: 123-45-6789", locale="us")
    assert any(
        hit["value"] == "123-45-6789"
        for hit in hits
        if hit["type"] == "ssn"
    )
141
+
142
+
143
def test_ssn_not_detected_tr_locale():
    """With ``locale="tr"`` the same dashed number yields no ``ssn`` finding."""
    hits = detect_pii("SSN: 123-45-6789", locale="tr")
    assert all(hit["type"] != "ssn" for hit in hits)
146
+
147
+
148
def test_ssn_invalid_000_not_detected():
    """A dashed number whose first group is ``000`` yields no ``ssn`` finding."""
    hits = detect_pii("SSN: 000-45-6789", locale="us")
    assert all(hit["type"] != "ssn" for hit in hits)
151
+
152
+
153
+ # ── Name ──────────────────────────────────────────────────────────────────────
154
+
155
+
156
def test_name_tr_label():
    """A value following the Turkish label ``Adı Soyadı:`` is reported as ``name``."""
    hits = detect_pii("Adı Soyadı: Ahmet Yıldız", locale="tr")
    assert any(
        "Ahmet" in hit["value"]
        for hit in hits
        if hit["type"] == "name"
    )
159
+
160
+
161
def test_name_en_label():
    """A value following the English label ``Full Name:`` is reported as ``name``."""
    hits = detect_pii("Full Name: John Smith", locale="tr")
    assert any(
        "John" in hit["value"]
        for hit in hits
        if hit["type"] == "name"
    )
164
+
165
+
166
def test_name_not_detected_us_locale():
    """With ``locale="us"`` a Turkish-labelled name yields no ``name`` finding."""
    hits = detect_pii("Adı: Ahmet Yıldız", locale="us")
    assert all(hit["type"] != "name" for hit in hits)
169
+
170
+
171
+ # ── Locale: all ───────────────────────────────────────────────────────────────
172
+
173
+
174
def test_locale_all_includes_tckn_and_ssn():
    """With ``locale="all"`` both the Turkish ID and the SSN are reported."""
    sample = "TC: 12345678950 and SSN: 123-45-6789"
    seen_types = {hit["type"] for hit in detect_pii(sample, locale="all")}
    assert "national_id_tr" in seen_types
    assert "ssn" in seen_types
180
+
181
+
182
def test_findings_sorted_by_position():
    """Findings come back ordered by ascending ``start`` offset.

    The sample holds an e-mail address followed by a Turkish mobile number, so
    at least two findings are expected. Without that guard an empty or
    single-item result would satisfy the ordering check vacuously and the test
    could never fail.
    """
    text = "Email: a@b.com phone: 0532 123 45 67"
    findings = detect_pii(text, locale="tr")
    starts = [f["start"] for f in findings]
    # Ordering is only meaningful when there is more than one finding to order.
    assert len(starts) >= 2, "expected both the email and the phone to be detected"
    assert starts == sorted(starts)
187
+
188
+
189
def test_empty_string_returns_empty():
    """Empty and whitespace-only inputs both produce an empty findings list."""
    empty_result = detect_pii("", locale="tr")
    assert empty_result == []
    blank_result = detect_pii(" ", locale="tr")
    assert blank_result == []
@@ -0,0 +1,38 @@
1
+ from flexorch_audit._quality import quality_metrics
2
+
3
+
4
def test_non_empty_text():
    """A 13-character sentence is fully complete and has no duplicate ratio."""
    report = quality_metrics("Hello, world!")
    assert report["completeness"] == 1.0
    assert report["avg_length"] == 13
    assert report["duplicate_ratio"] is None
9
+
10
+
11
def test_empty_string():
    """The empty string has zero completeness and zero length."""
    report = quality_metrics("")
    assert report["completeness"] == 0.0
    assert report["avg_length"] == 0
15
+
16
+
17
def test_whitespace_only():
    """Text consisting only of spaces, tabs and newlines counts as empty."""
    report = quality_metrics(" \t\n ")
    assert report["completeness"] == 0.0
    assert report["avg_length"] == 0
21
+
22
+
23
def test_strips_leading_trailing_whitespace():
    """Surrounding spaces are excluded from the reported length."""
    padded = " hello "
    assert quality_metrics(padded)["avg_length"] == 5
26
+
27
+
28
def test_none_treated_as_empty():
    """Passing ``None`` yields zero completeness and zero length."""
    # None is deliberately outside the annotated type, hence the ignore marker.
    report = quality_metrics(None)  # type: ignore[arg-type]
    assert report["completeness"] == 0.0
    assert report["avg_length"] == 0
32
+
33
+
34
def test_long_text():
    """A 10,000-character string is fully complete and its length is reported exactly."""
    report = quality_metrics("a" * 10_000)
    assert report["completeness"] == 1.0
    assert report["avg_length"] == 10_000