lang-natural 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,6 @@
1
+ __pycache__/
2
+ *.pyc
3
+ dist/
4
+ build/
5
+ *.egg-info/
6
+ .venv/
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 diwakarbhatt1983
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,71 @@
1
+ Metadata-Version: 2.4
2
+ Name: lang-natural
3
+ Version: 0.1.0
4
+ Summary: Detect unnatural / broken translations in locale files with ZERO LLM. Placeholders, untranslated strings, wrong script, repeats and more. Optional AI layer for fix suggestions. Works in any Python backend.
5
+ Project-URL: Homepage, https://github.com/diwakarbhatt1983/lang-natural
6
+ Author: diwakarbhatt1983
7
+ License: MIT
8
+ License-File: LICENSE
9
+ Keywords: detector,django,fastapi,i18n,internationalization,linter,localization,naturalness,translation
10
+ Classifier: License :: OSI Approved :: MIT License
11
+ Classifier: Operating System :: OS Independent
12
+ Classifier: Programming Language :: Python :: 3
13
+ Classifier: Topic :: Software Development :: Localization
14
+ Requires-Python: >=3.9
15
+ Description-Content-Type: text/markdown
16
+
17
+ # lang-natural (Python)
18
+
19
+ Detect **unnatural / broken translations** in your locale files — with **zero LLM**.
20
+ Catches dropped placeholders, untranslated strings, wrong script, repeated words,
21
+ markup drift and length anomalies. Runs fully offline, no API key. An optional AI
22
+ layer (`--fix`) suggests natural rewrites only when you ask.
23
+
24
+ For Python backends (Django, FastAPI, Flask) that keep user-facing strings in JSON.
25
+
26
+ ## Install
27
+ ```bash
28
+ pip install lang-natural
29
+ ```
30
+
31
+ ## Configure — `lang-natural.config.json`
32
+ ```json
33
+ {
34
+ "localesDir": "./locales",
35
+ "sourceLocale": "en",
36
+ "targetLocales": ["ar", "es", "hi"],
37
+ "failOn": "error"
38
+ }
39
+ ```
40
+
41
+ ## Run
42
+ ```bash
43
+ lang-natural # scan and print the report
44
+ lang-natural --json # machine-readable output
45
+ lang-natural --fix # LLM fix suggestions (needs GEMINI_API_KEY)
46
+ ```
47
+ Exit code is `1` when blocking issues are found — so it fails CI.
48
+
49
+ ## Use in code
50
+ ```python
51
+ from lang_natural import detect
52
+
53
+ issues = detect("cart.itemCount", "ar",
54
+ "You have {count} items", "لديك عناصر")
55
+ for i in issues:
56
+ print(i.severity, i.message)
57
+ ```
58
+
59
+ ## What the local (no-LLM) detector catches
60
+ - Empty translations
61
+ - Missing / extra placeholders (`{count}`, `{{name}}`, `%s`)
62
+ - Strings identical to the source (untranslated)
63
+ - Wrong script (Latin text in an Arabic/Hindi/… locale)
64
+ - Repeated words
65
+ - HTML/markup that differs from the source
66
+ - Length outliers vs. the locale's typical ratio
67
+
68
+ Subtle word-order naturalness and rewrite suggestions require the optional LLM
69
+ layer (`--fix`).
70
+
71
+ MIT
@@ -0,0 +1,55 @@
1
+ # lang-natural (Python)
2
+
3
+ Detect **unnatural / broken translations** in your locale files — with **zero LLM**.
4
+ Catches dropped placeholders, untranslated strings, wrong script, repeated words,
5
+ markup drift and length anomalies. Runs fully offline, no API key. An optional AI
6
+ layer (`--fix`) suggests natural rewrites only when you ask.
7
+
8
+ For Python backends (Django, FastAPI, Flask) that keep user-facing strings in JSON.
9
+
10
+ ## Install
11
+ ```bash
12
+ pip install lang-natural
13
+ ```
14
+
15
+ ## Configure — `lang-natural.config.json`
16
+ ```json
17
+ {
18
+ "localesDir": "./locales",
19
+ "sourceLocale": "en",
20
+ "targetLocales": ["ar", "es", "hi"],
21
+ "failOn": "error"
22
+ }
23
+ ```
24
+
25
+ ## Run
26
+ ```bash
27
+ lang-natural # scan and print the report
28
+ lang-natural --json # machine-readable output
29
+ lang-natural --fix # LLM fix suggestions (needs GEMINI_API_KEY)
30
+ ```
31
+ Exit code is `1` when blocking issues are found — so it fails CI.
32
+
33
+ ## Use in code
34
+ ```python
35
+ from lang_natural import detect
36
+
37
+ issues = detect("cart.itemCount", "ar",
38
+ "You have {count} items", "لديك عناصر")
39
+ for i in issues:
40
+ print(i.severity, i.message)
41
+ ```
42
+
43
+ ## What the local (no-LLM) detector catches
44
+ - Empty translations
45
+ - Missing / extra placeholders (`{count}`, `{{name}}`, `%s`)
46
+ - Strings identical to the source (untranslated)
47
+ - Wrong script (Latin text in an Arabic/Hindi/… locale)
48
+ - Repeated words
49
+ - HTML/markup that differs from the source
50
+ - Length outliers vs. the locale's typical ratio
51
+
52
+ Subtle word-order naturalness and rewrite suggestions require the optional LLM
53
+ layer (`--fix`).
54
+
55
+ MIT
@@ -0,0 +1,6 @@
1
+ {
2
+ "localesDir": "../../examples/messages",
3
+ "sourceLocale": "en",
4
+ "targetLocales": ["ar", "es"],
5
+ "failOn": "error"
6
+ }
@@ -0,0 +1,29 @@
1
+ [build-system]
2
+ requires = ["hatchling"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "lang-natural"
7
+ version = "0.1.0"
8
+ description = "Detect unnatural / broken translations in locale files with ZERO LLM. Placeholders, untranslated strings, wrong script, repeats and more. Optional AI layer for fix suggestions. Works in any Python backend."
9
+ readme = "README.md"
10
+ requires-python = ">=3.9"
11
+ license = { text = "MIT" }
12
+ authors = [{ name = "diwakarbhatt1983" }]
13
+ keywords = ["i18n", "internationalization", "translation", "localization", "linter", "naturalness", "detector", "django", "fastapi"]
14
+ classifiers = [
15
+ "Programming Language :: Python :: 3",
16
+ "License :: OSI Approved :: MIT License",
17
+ "Operating System :: OS Independent",
18
+ "Topic :: Software Development :: Localization",
19
+ ]
20
+ dependencies = []
21
+
22
+ [project.urls]
23
+ Homepage = "https://github.com/diwakarbhatt1983/lang-natural"
24
+
25
+ [project.scripts]
26
+ lang-natural = "lang_natural.cli:main"
27
+
28
+ [tool.hatch.build.targets.wheel]
29
+ packages = ["src/lang_natural"]
@@ -0,0 +1,19 @@
1
+ """lang-natural — local (no-LLM) translation naturalness/quality detector."""
2
+ from .detector import detect, Issue
3
+ from .placeholders import check_placeholders, extract_placeholders
4
+ from .script import expected_script, script_ratio, is_non_latin_locale
5
+ from .files import load_locale, pair_locale, flatten
6
+ from .config import Config, load_config_file, find_config, resolve_config
7
+ from .report import run, render_report, RunSummary, Finding
8
+
9
+ __version__ = "0.1.0"
10
+
11
+ __all__ = [
12
+ "detect", "Issue",
13
+ "check_placeholders", "extract_placeholders",
14
+ "expected_script", "script_ratio", "is_non_latin_locale",
15
+ "load_locale", "pair_locale", "flatten",
16
+ "Config", "load_config_file", "find_config", "resolve_config",
17
+ "run", "render_report", "RunSummary", "Finding",
18
+ "__version__",
19
+ ]
@@ -0,0 +1,6 @@
1
+ import sys
2
+
3
+ from .cli import main
4
+
5
+ if __name__ == "__main__":
6
+ sys.exit(main())
@@ -0,0 +1,84 @@
1
+ """Command-line entry point: `lang-natural`."""
2
+ import argparse
3
+ import json
4
+ import sys
5
+
6
+ from .config import find_config, load_config_file, resolve_config
7
+ from .report import run, render_report
8
+
9
+
10
+ def _summary_to_dict(summary) -> dict:
11
+ return {
12
+ "ok": summary.ok,
13
+ "stats": summary.stats,
14
+ "reports": [
15
+ {
16
+ "locale": r.locale,
17
+ "file": r.file,
18
+ "missingKeys": r.missing_keys,
19
+ "extraKeys": r.extra_keys,
20
+ "findings": [
21
+ {
22
+ "key": f.key,
23
+ "source": f.source,
24
+ "candidate": f.candidate,
25
+ "issues": [
26
+ {"rule": i.rule, "severity": i.severity, "message": i.message}
27
+ for i in f.issues
28
+ ],
29
+ "suggestion": f.suggestion,
30
+ }
31
+ for f in r.findings
32
+ ],
33
+ }
34
+ for r in summary.reports
35
+ ],
36
+ }
37
+
38
+
39
def main(argv: "list[str] | None" = None) -> int:
    """Run the ``lang-natural`` command line and return the process exit code.

    Args:
        argv: Arguments to parse. ``None`` lets argparse fall back to
            ``sys.argv[1:]``.

    Returns:
        ``0`` when the run reports no blocking issue (or for ``--version``),
        ``1`` when the summary is not ok, ``2`` when the config cannot be
        found/loaded or the run itself raises.
    """
    # NOTE: the annotation above is quoted on purpose. The package declares
    # support for Python 3.9, where evaluating ``list[str] | None`` at
    # definition time raises TypeError.
    parser = argparse.ArgumentParser(
        prog="lang-natural",
        description="Detect unnatural / broken translations locally (no LLM required).",
    )
    parser.add_argument("-c", "--config", help="Path to config JSON (default: auto-detect)")
    parser.add_argument("--json", action="store_true", help="Machine-readable JSON output")
    parser.add_argument("--fix", action="store_true", help="LLM fix suggestions (needs GEMINI_API_KEY)")
    parser.add_argument("-v", "--version", action="store_true", help="Show version")
    args = parser.parse_args(argv)

    if args.version:
        from . import __version__
        print(__version__)
        return 0

    # Stage 1: locate and load the configuration.
    try:
        if args.config:
            config = load_config_file(args.config)
        else:
            path = find_config()
            if not path:
                sys.stderr.write("No lang-natural.config.json found in this directory.\n")
                return 2
            config = load_config_file(path)
    except Exception as err:  # noqa: BLE001 - top-level boundary: report and exit
        sys.stderr.write(f"Config error: {err}\n")
        return 2

    # Stage 2: run the detector, optionally ask the LLM layer, then print.
    try:
        summary = run(config)
        if args.fix:
            # Imported lazily so the LLM module is only loaded for --fix.
            from .llm import suggest_fixes
            suggest_fixes(summary)
        if args.json:
            print(json.dumps(_summary_to_dict(summary), ensure_ascii=False, indent=2))
        else:
            print(render_report(summary))
        return 0 if summary.ok else 1
    except Exception as err:  # noqa: BLE001 - top-level boundary: report and exit
        sys.stderr.write(f"\nError: {err}\n")
        return 2
81
+
82
+
83
+ if __name__ == "__main__":
84
+ sys.exit(main())
@@ -0,0 +1,50 @@
1
+ """Config loading. Uses a JSON config so it stays language-agnostic."""
2
+ import json
3
+ import os
4
+ from dataclasses import dataclass, field
5
+
6
+ _NAMES = ["lang-natural.config.json", "langnatural.config.json"]
7
+
8
+
9
@dataclass
class Config:
    """Resolved settings for one lang-natural run."""

    # Directory that holds the locale JSON files.
    locales_dir: str
    # Locale code of the reference (source) strings.
    source_locale: str
    # Locale codes whose translations are checked against the source.
    target_locales: list[str]
    # Rule ids whose issues are suppressed.
    disable_rules: list[str] = field(default_factory=list)
    # Lowest severity ("info" | "warning" | "error") that makes the run fail.
    fail_on: str = "error"
16
+
17
+
18
def find_config(cwd: "str | None" = None) -> "str | None":
    """Return the path of the first known config file found in *cwd*.

    Args:
        cwd: Directory to look in; defaults to the current working directory.

    Returns:
        The joined path of the first name in ``_NAMES`` that exists, or
        ``None`` when none of them does.
    """
    # The annotations are quoted because ``str | None`` cannot be evaluated
    # on Python 3.9, which the package metadata declares as supported.
    base = cwd or os.getcwd()
    candidates = (os.path.join(base, name) for name in _NAMES)
    return next((path for path in candidates if os.path.exists(path)), None)
25
+
26
+
27
def load_config_file(path: str) -> Config:
    """Read the UTF-8 JSON config at *path* and resolve it into a ``Config``."""
    with open(path, encoding="utf-8") as handle:
        text = handle.read()
    return resolve_config(json.loads(text))
31
+
32
+
33
def resolve_config(raw: dict) -> "Config":
    """Validate a raw config mapping and build a ``Config`` from it.

    Both camelCase keys (shared with the JS config) and snake_case keys are
    accepted; the camelCase value wins when it is truthy.

    Args:
        raw: The decoded JSON object.

    Returns:
        The populated ``Config``.

    Raises:
        ValueError: If *raw* is not a mapping, a required key is missing,
            a list-valued option is not a list, or ``failOn`` is not one of
            ``info`` / ``warning`` / ``error``.
    """
    if not isinstance(raw, dict):
        raise ValueError("config must be a JSON object.")

    def pick(camel: str, snake: str):
        return raw.get(camel) or raw.get(snake)

    list_types = (list, tuple, set, frozenset)

    locales_dir = pick("localesDir", "locales_dir")
    source = pick("sourceLocale", "source_locale")
    targets = pick("targetLocales", "target_locales")
    if not locales_dir:
        raise ValueError("config.localesDir is required.")
    if not source:
        raise ValueError("config.sourceLocale is required.")
    # A bare string such as "ar" would otherwise be split into ["a", "r"].
    if not targets or not isinstance(targets, list_types):
        raise ValueError("config.targetLocales must be a non-empty list.")

    disable_rules = pick("disableRules", "disable_rules") or []
    if not isinstance(disable_rules, list_types):
        raise ValueError("config.disableRules must be a list.")

    # Reject unknown thresholds here with a clear message instead of letting
    # the run fail later on a severity-rank lookup.
    fail_on = pick("failOn", "fail_on") or "error"
    if fail_on not in ("info", "warning", "error"):
        raise ValueError(
            f'config.failOn must be "info", "warning" or "error" (got {fail_on!r}).'
        )

    return Config(
        locales_dir=locales_dir,
        source_locale=source,
        target_locales=list(targets),
        disable_rules=list(disable_rules),
        fail_on=fail_on,
    )
@@ -0,0 +1,88 @@
1
+ """The LOCAL detector — zero LLM, zero network, deterministic.
2
+
3
+ Identical checks to the npm version so both packages behave the same.
4
+ """
5
+ import re
6
+ from dataclasses import dataclass
7
+
8
+ from .placeholders import check_placeholders
9
+ from .script import expected_script, script_ratio, is_non_latin_locale
10
+
11
+ _HTML_TAG = re.compile(r"</?[a-zA-Z][^>]*>")
12
+
13
+
14
@dataclass
class Issue:
    """A single problem reported by the detector for one translation."""

    # Identifier of the rule that fired (e.g. "empty", "placeholder").
    rule: str
    severity: str  # "error" | "warning" | "info"
    # Human-readable description of the problem.
    message: str
19
+
20
+
21
def detect(
    key: str,
    locale: str,
    source: str,
    candidate: str,
    median_len_ratio: float = 0.0,
    disabled: "set[str] | None" = None,
) -> list[Issue]:
    """Run every local check on one source/translation pair.

    Args:
        key: Message key of the pair (not consulted by any rule).
        locale: Target locale code; selects the expected script.
        source: Source-language string.
        candidate: Translated string to check.
        median_len_ratio: Typical candidate/source length ratio for the
            locale; ``0`` disables the length-outlier rule.
        disabled: Rule ids whose issues are dropped.

    Returns:
        The issues found, in rule order. An empty translation yields only
        the ``empty`` issue (when that rule is enabled) and skips the rest.
    """
    # The ``disabled`` annotation is quoted because ``set[str] | None``
    # cannot be evaluated on Python 3.9, which the package still supports.
    disabled = disabled or set()
    issues: list[Issue] = []

    def add(rule: str, severity: str, message: str) -> None:
        if rule not in disabled:
            issues.append(Issue(rule, severity, message))

    trimmed = candidate.strip()
    trimmed_source = source.strip()

    # 1. Empty translation: nothing else is worth checking.
    if trimmed == "":
        add("empty", "error", "Translation is empty")
        return issues

    # 2. Placeholders must survive.
    ph = check_placeholders(source, candidate)
    if ph["missing"]:
        add("placeholder", "error",
            f"Missing placeholder(s): {', '.join(ph['missing'])} — this can crash the UI")
    if ph["extra"]:
        add("placeholder", "warning", f"Unexpected placeholder(s): {', '.join(ph['extra'])}")

    # 3. Identical to source → likely untranslated (ignored when the source
    #    has no letters at all, e.g. pure numbers or symbols).
    if trimmed == trimmed_source and any(c.isalpha() for c in source):
        add("untranslated", "warning", "Identical to the source string (untranslated?)")

    # 4. Wrong script for non-Latin locales.
    if is_non_latin_locale(locale):
        script = expected_script(locale)
        if script_ratio(candidate, script) < 0.4:
            add("wrong-script", "error",
                f"Mostly non-{script} characters — looks untranslated or wrong script")

    # 5. Repeated word (works for any script); only the first repeat is reported.
    words = [w for w in re.split(r"\s+", candidate) if w]
    for previous, word in zip(words, words[1:]):
        if len(word) >= 2 and word == previous and any(c.isalpha() for c in word):
            add("repeated-word", "warning", f'Repeated word: "{word}"')
            break

    # 6. HTML/markup tags must match (compared as sorted lists, so order is free).
    if sorted(_HTML_TAG.findall(source)) != sorted(_HTML_TAG.findall(candidate)):
        add("markup", "error", "HTML tags differ from the source string")

    # 7. Length outlier vs the locale's typical translation length.
    if median_len_ratio and len(trimmed_source) >= 8:
        ratio = len(trimmed) / max(1, len(trimmed_source))
        m = median_len_ratio
        if m > 0 and (ratio > m * 3 or ratio < m / 3):
            add("length-outlier", "info",
                f"Unusual length vs source ({ratio:.2f}x, locale typical ~{m:.2f}x) — possible mistranslation")

    # 8. Doubled / stray whitespace.
    if re.search(r"\s{2,}", candidate) or candidate != trimmed:
        add("whitespace", "info", "Extra or trailing whitespace")

    return issues
@@ -0,0 +1,79 @@
1
+ """Locale file loading, flattening and pairing."""
2
+ import json
3
+ import os
4
+ from dataclasses import dataclass, field
5
+ from statistics import median
6
+
7
+
8
@dataclass
class LoadedLocale:
    """One locale file after loading and flattening."""

    # Locale code the file was loaded for.
    locale: str
    # Path of the file that was read.
    file: str
    # Flattened mapping of dotted key -> string value.
    strings: dict[str, str]
13
+
14
+
15
def flatten(obj, prefix: str = "", out: "dict[str, str] | None" = None) -> dict[str, str]:
    """Flatten nested dicts of strings into a single dotted-key mapping.

    Args:
        obj: Decoded JSON value; anything that is not a dict contributes nothing.
        prefix: Dotted key prefix for the entries of *obj*.
        out: Mapping to fill in place; a new dict is created when omitted.

    Returns:
        *out* (or the new dict). Only ``str`` leaves are kept; values of any
        other type (numbers, lists, ``None``...) are skipped.
    """
    # The ``out`` annotation is quoted because ``dict[str, str] | None``
    # cannot be evaluated on Python 3.9, which the package still supports.
    if out is None:
        out = {}
    if not isinstance(obj, dict):
        return out
    for name, value in obj.items():
        dotted = f"{prefix}.{name}" if prefix else name
        if isinstance(value, str):
            out[dotted] = value
        elif isinstance(value, dict):
            flatten(value, dotted, out)
    return out
27
+
28
+
29
def locale_file_path(locales_dir: str, locale: str) -> str:
    """Return the path of the JSON file for *locale* under *locales_dir*.

    Three layouts are tried in order: ``<locale>.json``,
    ``<locale>/index.json`` and ``<locale>/<locale>.json``. When none of
    them exists the first (flat) path is returned anyway.
    """
    root = os.path.abspath(locales_dir)
    flat = os.path.join(root, f"{locale}.json")
    layouts = (
        flat,
        os.path.join(root, locale, "index.json"),
        os.path.join(root, locale, f"{locale}.json"),
    )
    return next((path for path in layouts if os.path.exists(path)), flat)
40
+
41
+
42
def load_locale(locales_dir: str, locale: str) -> LoadedLocale:
    """Load and flatten the JSON file for *locale*.

    Args:
        locales_dir: Directory that holds the locale files.
        locale: Locale code to load.

    Returns:
        A ``LoadedLocale`` with the resolved file path and flattened strings.

    Raises:
        FileNotFoundError: If no file exists for the locale.
        json.JSONDecodeError: If the file is not valid JSON; the message
            names the offending file.
    """
    path = locale_file_path(locales_dir, locale)
    if not os.path.exists(path):
        raise FileNotFoundError(f'Locale file not found for "{locale}": {path}')
    with open(path, encoding="utf-8") as fh:
        try:
            parsed = json.load(fh)
        except json.JSONDecodeError as err:
            # Several locale files are loaded per run; without the path the
            # parser message alone does not say which one is broken.
            raise json.JSONDecodeError(
                f"{err.msg} (in locale file {path})", err.doc, err.pos
            ) from err
    return LoadedLocale(locale=locale, file=path, strings=flatten(parsed))
49
+
50
+
51
@dataclass
class PairingResult:
    """Outcome of matching a target locale's keys against the source locale."""

    # One {"key", "source", "candidate"} dict per key present in both locales.
    pairs: list[dict] = field(default_factory=list)
    # Source keys that the target does not have.
    missing_keys: list[str] = field(default_factory=list)
    # Target keys that the source does not have.
    extra_keys: list[str] = field(default_factory=list)
    # Median candidate/source length ratio; 0.0 when no pair qualified.
    median_len_ratio: float = 0.0
57
+
58
+
59
def pair_locale(source: LoadedLocale, target: LoadedLocale) -> PairingResult:
    """Match every source key with its translation in *target*.

    Keys absent from the target are collected as missing, keys only in the
    target as extra. The median length ratio is computed over pairs whose
    stripped source has at least 8 characters and whose stripped candidate
    is non-empty.
    """
    matched: list[dict] = []
    absent: list[str] = []
    length_ratios: list[float] = []

    for key, src_text in source.strings.items():
        translated = target.strings.get(key)
        if translated is not None:
            matched.append({"key": key, "source": src_text, "candidate": translated})
            src_len = len(src_text.strip())
            cand_len = len(translated.strip())
            if src_len >= 8 and cand_len > 0:
                length_ratios.append(cand_len / src_len)
        else:
            absent.append(key)

    typical_ratio = median(length_ratios) if length_ratios else 0.0
    only_in_target = [k for k in target.strings if k not in source.strings]
    return PairingResult(
        pairs=matched,
        missing_keys=absent,
        extra_keys=only_in_target,
        median_len_ratio=typical_ratio,
    )
@@ -0,0 +1,69 @@
1
+ """OPTIONAL LLM layer for --fix. Uses stdlib urllib (no extra deps).
2
+
3
+ The detector never needs this; it only runs on --fix with GEMINI_API_KEY set.
4
+ """
5
+ import json
6
+ import os
7
+ import urllib.request
8
+
9
+
10
def suggest_fixes(summary) -> None:
    """Ask the Gemini API for rewrites and store them on each finding.

    Every finding in ``summary.reports`` gets its ``suggestion`` attribute set
    to the model's rewrite, or to ``None`` when the model returned nothing
    usable for it.

    Args:
        summary: Run summary whose findings should receive suggestions.

    Raises:
        RuntimeError: If ``GEMINI_API_KEY`` is not set.
        OSError: On network failures or when the request times out
            (``urllib.error.URLError`` is a subclass).
    """
    key = os.environ.get("GEMINI_API_KEY")
    if not key:
        raise RuntimeError("--fix needs an LLM key. Set GEMINI_API_KEY in your environment.")

    findings = [f for r in summary.reports for f in r.findings]
    if not findings:
        return

    model = os.environ.get("LANG_NATURAL_MODEL", "gemini-2.5-flash")
    url = f"https://generativelanguage.googleapis.com/v1beta/models/{model}:generateContent"

    items = [
        {
            "i": i,
            "locale": f.locale,
            "source": f.source,
            "current": f.candidate,
            "problems": [x.message for x in f.issues],
        }
        for i, f in enumerate(findings)
    ]
    prompt = (
        "You are a native-speaker localization editor. For each item, rewrite the "
        '"current" translation so it reads naturally to a native speaker of the given '
        "locale, keeping the meaning of the source and preserving every placeholder "
        '(like {name}, {count}). Return STRICT JSON: '
        '{"fixes":[{"i":number,"suggestion":string}]}.\n\n'
        + json.dumps(items, ensure_ascii=False, indent=2)
    )
    body = json.dumps({
        "contents": [{"role": "user", "parts": [{"text": prompt}]}],
        "generationConfig": {"temperature": 0, "responseMimeType": "application/json"},
    }).encode("utf-8")

    req = urllib.request.Request(
        url, data=body,
        headers={"content-type": "application/json", "x-goog-api-key": key},
        method="POST",
    )
    # A timeout keeps a stalled connection from hanging the CLI (and CI) forever.
    with urllib.request.urlopen(req, timeout=60) as resp:  # noqa: S310
        data = json.loads(resp.read().decode("utf-8"))

    # The API may answer with no candidates (or an empty list); treat every
    # missing level as "no text" instead of raising.
    candidates = data.get("candidates") if isinstance(data, dict) else None
    first = candidates[0] if isinstance(candidates, list) and candidates else {}
    content = first.get("content") if isinstance(first, dict) else None
    parts = content.get("parts") if isinstance(content, dict) else None
    text = "".join(
        p.get("text", "") for p in (parts or []) if isinstance(p, dict)
    )

    # Best effort: parse the whole reply, then the outermost {...} span, and
    # fall back to "no fixes" when neither is valid JSON.
    try:
        parsed = json.loads(text)
    except json.JSONDecodeError:
        start, end = text.find("{"), text.rfind("}")
        try:
            parsed = json.loads(text[start:end + 1]) if start != -1 and end > start else {}
        except json.JSONDecodeError:
            parsed = {}

    fixes = parsed.get("fixes") if isinstance(parsed, dict) else None
    by_index = {
        fix["i"]: fix["suggestion"]
        for fix in (fixes if isinstance(fixes, list) else [])
        if isinstance(fix, dict)
        and isinstance(fix.get("i"), int)
        and isinstance(fix.get("suggestion"), str)
    }
    for i, f in enumerate(findings):
        f.suggestion = by_index.get(i)
@@ -0,0 +1,32 @@
1
+ """Placeholder verification — deterministic, no LLM."""
2
+ import re
3
+ from collections import Counter
4
+
5
# Tried in this order; a span claimed by an earlier pattern is not offered to
# later ones, so the most specific form ({{name}}) must stay first.
_PATTERNS = [
    re.compile(r"\{\{\s*[\w.$-]+\s*\}\}"),  # {{name}}
    re.compile(r"\{\s*[\w.$-]+\s*\}"),      # {name}
    re.compile(r"%\d*\$?[sdifege@%]"),      # %s %1$s
    re.compile(r":[a-zA-Z_]\w*"),           # :name
]


def extract_placeholders(text: str) -> list[str]:
    """Return every placeholder token found in *text*.

    Tokens are grouped by pattern (in ``_PATTERNS`` order) and, within a
    pattern, listed left to right. Whitespace inside a token is removed so
    ``{ name }`` and ``{name}`` compare equal.

    Each character of *text* belongs to at most one token: once ``{{name}}``
    has been matched, the ``{name}`` inside it is not reported a second time.
    """
    found: list[str] = []

    def consume(match: "re.Match[str]") -> str:
        token = match.group(0)
        found.append(re.sub(r"\s+", "", token))
        # Mask the span with NULs: they keep offsets stable and cannot be
        # matched by any of the patterns (neither \w nor \s).
        return "\0" * len(token)

    remaining = text
    for pat in _PATTERNS:
        remaining = pat.sub(consume, remaining)
    return found
19
+
20
+
21
def check_placeholders(source: str, candidate: str) -> dict:
    """Compare the placeholders of *source* and *candidate* as multisets.

    Returns a dict with ``missing`` (tokens the candidate lacks, one entry per
    lacking occurrence), ``extra`` (tokens the candidate has in excess) and
    ``ok`` (``True`` when both lists are empty).
    """
    wanted = Counter(extract_placeholders(source))
    present = Counter(extract_placeholders(candidate))
    # Counter subtraction keeps only positive counts; elements() repeats
    # each token once per surplus occurrence.
    missing = list((wanted - present).elements())
    extra = list((present - wanted).elements())
    return {"missing": missing, "extra": extra, "ok": not missing and not extra}
@@ -0,0 +1,152 @@
1
+ """Run the local detector and render the terminal report."""
2
+ import os
3
+ import sys
4
+ from dataclasses import dataclass, field
5
+
6
+ from .config import Config
7
+ from .detector import detect, Issue
8
+ from .files import load_locale, pair_locale
9
+
10
# Numeric rank per severity so severities can be compared (higher = worse).
_RANK = {"info": 1, "warning": 2, "error": 3}
# Decided once at import: colour only when stdout is a terminal, NO_COLOR is
# unset or empty, and TERM is not "dumb".
_USE_COLOR = sys.stdout.isatty() and not os.environ.get("NO_COLOR") and os.environ.get("TERM") != "dumb"
12
+
13
+
14
+ def _c(code):
15
+ def fn(s):
16
+ return f"\x1b[{code}m{s}\x1b[0m" if _USE_COLOR else str(s)
17
+ return fn
18
+
19
+
20
# One styling function per ANSI SGR code used by the report.
BOLD, DIM, RED, GREEN, YELLOW, BLUE, CYAN, GRAY = (
    _c(1), _c(2), _c(31), _c(32), _c(33), _c(34), _c(36), _c(90)
)
# Colour function and label used when printing each severity.
_SEV_COLOR = {"error": RED, "warning": YELLOW, "info": BLUE}
_SEV_LABEL = {"error": "error", "warning": "warn ", "info": "info "}
25
+
26
+
27
@dataclass
class Finding:
    """One translated string for which the detector reported issues."""

    # Dotted message key of the string.
    key: str
    # Target locale the string belongs to.
    locale: str
    # Path of the target locale file.
    file: str
    # Source-language text.
    source: str
    # Current translation.
    candidate: str
    # Issues the detector reported for this pair.
    issues: list[Issue]
    # Rewrite proposed by the optional LLM layer, if any. The annotation is
    # quoted because ``str | None`` cannot be evaluated on Python 3.9, which
    # the package metadata declares as supported.
    suggestion: "str | None" = None
36
+
37
+
38
@dataclass
class LocaleReport:
    """Results of checking one target locale."""

    # Target locale code.
    locale: str
    # Path of the target locale file.
    file: str
    # Strings that triggered at least one issue.
    findings: list[Finding] = field(default_factory=list)
    # Source keys absent from the target.
    missing_keys: list[str] = field(default_factory=list)
    # Target keys absent from the source.
    extra_keys: list[str] = field(default_factory=list)
    # Number of source/target pairs that were checked.
    checked: int = 0
46
+
47
+
48
@dataclass
class RunSummary:
    """Aggregated result of one run over all target locales."""

    # One report per target locale, in config order.
    reports: list[LocaleReport]
    # False when any issue reaches the configured fail_on severity.
    ok: bool
    # Counters: "checked", "flagged", "errors", "warnings".
    stats: dict
53
+
54
+
55
def run(config: Config) -> RunSummary:
    """Check every target locale against the source locale.

    Each finding is counted once: as an error when any of its issues is an
    error, otherwise as a warning when any of its issues is a warning. The
    run is ok when no issue reaches the ``config.fail_on`` severity.
    """
    disabled = set(config.disable_rules)
    source = load_locale(config.locales_dir, config.source_locale)
    reports: list[LocaleReport] = []

    for locale in config.target_locales:
        target = load_locale(config.locales_dir, locale)
        pairing = pair_locale(source, target)
        findings: list[Finding] = []
        for pair in pairing.pairs:
            issues = detect(
                pair["key"], locale, pair["source"], pair["candidate"],
                median_len_ratio=pairing.median_len_ratio, disabled=disabled,
            )
            if not issues:
                continue
            findings.append(Finding(
                key=pair["key"], locale=locale, file=target.file,
                source=pair["source"], candidate=pair["candidate"], issues=issues,
            ))
        reports.append(LocaleReport(
            locale=locale, file=target.file, findings=findings,
            missing_keys=pairing.missing_keys, extra_keys=pairing.extra_keys,
            checked=len(pairing.pairs),
        ))

    all_findings = [finding for report in reports for finding in report.findings]

    errors = 0
    warnings = 0
    for finding in all_findings:
        severities = {issue.severity for issue in finding.issues}
        if "error" in severities:
            errors += 1
        elif "warning" in severities:
            warnings += 1

    threshold = _RANK[config.fail_on]
    ok = all(
        _RANK[issue.severity] < threshold
        for finding in all_findings
        for issue in finding.issues
    )

    stats = {
        "checked": sum(report.checked for report in reports),
        "flagged": len(all_findings),
        "errors": errors,
        "warnings": warnings,
    }
    return RunSummary(reports=reports, ok=ok, stats=stats)
99
+
100
+
101
+ def _rel(path: str) -> str:
102
+ try:
103
+ return os.path.relpath(path)
104
+ except ValueError:
105
+ return path
106
+
107
+
108
def render_report(summary: RunSummary) -> str:
    """Render the whole run as the multi-line terminal report.

    The output is a header, one section per locale, a separator, the
    checked/errors/warnings totals and a final verdict line.
    """
    out = ["", BOLD(" lang-natural ") + DIM("— local translation check (no LLM)"), ""]
    for report in summary.reports:
        _render_locale(report, out)

    stats = summary.stats
    error_count = stats["errors"]
    warning_count = stats["warnings"]

    checked_part = BOLD(str(stats["checked"])) + " checked"
    if error_count:
        error_part = RED(f"{error_count} errors")
    else:
        error_part = DIM("0 errors")
    if warning_count:
        warning_part = YELLOW(f"{warning_count} warnings")
    else:
        warning_part = DIM("0 warnings")

    if summary.ok:
        verdict = GREEN(BOLD("✓ no blocking issues"))
    else:
        verdict = RED(BOLD("✗ issues found — see above"))

    out.extend([
        " " + DIM("─" * 50),
        " " + DIM(" · ").join([checked_part, error_part, warning_part]),
        " " + verdict,
        "",
    ])
    return "\n".join(out)
125
+
126
+
127
def _render_locale(r: LocaleReport, lines: list[str]) -> None:
    """Append the report section for one locale to *lines*.

    The section is a status line, an optional missing-keys preview, then one
    entry per finding labelled with its worst severity, followed by a blank
    line.
    """
    rel = _rel(r.file)
    if r.findings:
        status = RED(f"✗ {len(r.findings)}/{r.checked} flagged")
    else:
        status = GREEN(f"✓ {r.checked} clean")
    lines.append(" " + BOLD(CYAN(r.locale.upper())) + " " + DIM(rel) + " " + status)
    if r.missing_keys:
        lines.append(" " + YELLOW("missing keys: ") + DIM(_preview(r.missing_keys)))

    for finding in r.findings:
        # Highest-ranked severity among the issues; "info" when there are none.
        worst = max(
            (issue.severity for issue in finding.issues),
            key=_RANK.__getitem__,
            default="info",
        )
        shown = DIM('(empty)') if finding.candidate == '' else finding.candidate
        entry = [
            "",
            f" {_SEV_COLOR[worst]('[' + _SEV_LABEL[worst] + ']')} {BOLD(finding.key)} {DIM('— ' + rel)}",
            f" {GRAY('source: ')}{finding.source}",
            f" {GRAY('current: ')}{shown}",
        ]
        entry.extend(
            f" {_SEV_COLOR[issue.severity]('• ' + issue.message)}"
            for issue in finding.issues
        )
        if finding.suggestion:
            entry.append(f" {GRAY('fix: ')}{GREEN(finding.suggestion)}")
        lines.extend(entry)
    lines.append("")
149
+
150
+
151
+ def _preview(keys: list[str], n: int = 8) -> str:
152
+ return ", ".join(keys) if len(keys) <= n else ", ".join(keys[:n]) + f", +{len(keys) - n} more"
@@ -0,0 +1,48 @@
1
+ """Script / writing-system detection — no LLM.
2
+
3
+ Mirrors the JS version: if a non-Latin locale's translation is mostly Latin
4
+ letters, it's almost certainly untranslated or wrong.
5
+ """
6
+
7
+ _LOCALE_SCRIPT = {
8
+ "ar": "arabic", "fa": "arabic", "ur": "arabic",
9
+ "hi": "devanagari", "mr": "devanagari", "ne": "devanagari",
10
+ "ru": "cyrillic", "uk": "cyrillic", "bg": "cyrillic", "sr": "cyrillic",
11
+ "zh": "han", "ja": "han", "ko": "hangul",
12
+ "th": "thai", "he": "hebrew", "el": "greek",
13
+ }
14
+
15
+ # Inclusive codepoint ranges per script.
16
+ _RANGES = {
17
+ "latin": [(0x0041, 0x005A), (0x0061, 0x007A), (0x00C0, 0x024F)],
18
+ "arabic": [(0x0600, 0x06FF), (0x0750, 0x077F), (0x08A0, 0x08FF), (0xFB50, 0xFDFF), (0xFE70, 0xFEFF)],
19
+ "devanagari": [(0x0900, 0x097F)],
20
+ "cyrillic": [(0x0400, 0x04FF)],
21
+ "han": [(0x4E00, 0x9FFF), (0x3400, 0x4DBF), (0x3040, 0x30FF)],
22
+ "hangul": [(0xAC00, 0xD7AF), (0x1100, 0x11FF)],
23
+ "thai": [(0x0E00, 0x0E7F)],
24
+ "hebrew": [(0x0590, 0x05FF)],
25
+ "greek": [(0x0370, 0x03FF)],
26
+ }
27
+
28
+
29
+ def expected_script(locale: str) -> str:
30
+ base = locale.replace("_", "-").split("-")[0].lower()
31
+ return _LOCALE_SCRIPT.get(base, "latin")
32
+
33
+
34
def _in_script(ch: str, script: str) -> bool:
    """Tell whether the code point of *ch* lies in one of *script*'s ranges."""
    codepoint = ord(ch)
    for low, high in _RANGES[script]:
        if low <= codepoint <= high:
            return True
    return False
37
+
38
+
39
def script_ratio(text: str, script: str) -> float:
    """Return the share of alphabetic characters in *text* that belong to *script*.

    Non-alphabetic characters are ignored; text without any letter counts as
    fully in script (``1.0``).
    """
    total = 0
    matching = 0
    for ch in text:
        if not ch.isalpha():
            continue
        total += 1
        if _in_script(ch, script):
            matching += 1
    if total == 0:
        return 1.0
    return matching / total
45
+
46
+
47
def is_non_latin_locale(locale: str) -> bool:
    """Return ``True`` when the script expected for *locale* is not Latin."""
    script = expected_script(locale)
    return script != "latin"