lang-natural 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lang_natural-0.1.0/.gitignore +6 -0
- lang_natural-0.1.0/LICENSE +21 -0
- lang_natural-0.1.0/PKG-INFO +71 -0
- lang_natural-0.1.0/README.md +55 -0
- lang_natural-0.1.0/lang-natural.config.json +6 -0
- lang_natural-0.1.0/pyproject.toml +29 -0
- lang_natural-0.1.0/src/lang_natural/__init__.py +19 -0
- lang_natural-0.1.0/src/lang_natural/__main__.py +6 -0
- lang_natural-0.1.0/src/lang_natural/cli.py +84 -0
- lang_natural-0.1.0/src/lang_natural/config.py +50 -0
- lang_natural-0.1.0/src/lang_natural/detector.py +88 -0
- lang_natural-0.1.0/src/lang_natural/files.py +79 -0
- lang_natural-0.1.0/src/lang_natural/llm.py +69 -0
- lang_natural-0.1.0/src/lang_natural/placeholders.py +32 -0
- lang_natural-0.1.0/src/lang_natural/report.py +152 -0
- lang_natural-0.1.0/src/lang_natural/script.py +48 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 diwakarbhatt1983
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: lang-natural
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Detect unnatural / broken translations in locale files with ZERO LLM. Placeholders, untranslated strings, wrong script, repeats and more. Optional AI layer for fix suggestions. Works in any Python backend.
|
|
5
|
+
Project-URL: Homepage, https://github.com/diwakarbhatt1983/lang-natural
|
|
6
|
+
Author: diwakarbhatt1983
|
|
7
|
+
License: MIT
|
|
8
|
+
License-File: LICENSE
|
|
9
|
+
Keywords: detector,django,fastapi,i18n,internationalization,linter,localization,naturalness,translation
|
|
10
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
11
|
+
Classifier: Operating System :: OS Independent
|
|
12
|
+
Classifier: Programming Language :: Python :: 3
|
|
13
|
+
Classifier: Topic :: Software Development :: Localization
|
|
14
|
+
Requires-Python: >=3.9
|
|
15
|
+
Description-Content-Type: text/markdown
|
|
16
|
+
|
|
17
|
+
# lang-natural (Python)
|
|
18
|
+
|
|
19
|
+
Detect **unnatural / broken translations** in your locale files — with **zero LLM**.
|
|
20
|
+
Catches dropped placeholders, untranslated strings, wrong script, repeated words,
|
|
21
|
+
markup drift and length anomalies. Runs fully offline, no API key. An optional AI
|
|
22
|
+
layer (`--fix`) suggests natural rewrites only when you ask.
|
|
23
|
+
|
|
24
|
+
For Python backends (Django, FastAPI, Flask) that keep user-facing strings in JSON.
|
|
25
|
+
|
|
26
|
+
## Install
|
|
27
|
+
```bash
|
|
28
|
+
pip install lang-natural
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
## Configure — `lang-natural.config.json`
|
|
32
|
+
```json
|
|
33
|
+
{
|
|
34
|
+
"localesDir": "./locales",
|
|
35
|
+
"sourceLocale": "en",
|
|
36
|
+
"targetLocales": ["ar", "es", "hi"],
|
|
37
|
+
"failOn": "error"
|
|
38
|
+
}
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
## Run
|
|
42
|
+
```bash
|
|
43
|
+
lang-natural # scan and print the report
|
|
44
|
+
lang-natural --json # machine-readable output
|
|
45
|
+
lang-natural --fix # LLM fix suggestions (needs GEMINI_API_KEY)
|
|
46
|
+
```
|
|
47
|
+
Exit code is `1` when blocking issues are found — so it fails CI. Exit code `2` means the config could not be loaded or the run itself failed.
|
|
48
|
+
|
|
49
|
+
## Use in code
|
|
50
|
+
```python
|
|
51
|
+
from lang_natural import detect
|
|
52
|
+
|
|
53
|
+
issues = detect("cart.itemCount", "ar",
|
|
54
|
+
"You have {count} items", "لديك عناصر")
|
|
55
|
+
for i in issues:
|
|
56
|
+
print(i.severity, i.message)
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
## What the local (no-LLM) detector catches
|
|
60
|
+
- Empty translations
|
|
61
|
+
- Missing / extra placeholders (`{count}`, `{{name}}`, `%s`)
|
|
62
|
+
- Strings identical to the source (untranslated)
|
|
63
|
+
- Wrong script (Latin text in an Arabic/Hindi/… locale)
|
|
64
|
+
- Repeated words
|
|
65
|
+
- HTML/markup that differs from the source
|
|
66
|
+
- Length outliers vs. the locale's typical ratio
|
|
67
|
+
|
|
68
|
+
Subtle word-order naturalness and rewrite suggestions require the optional LLM
|
|
69
|
+
layer (`--fix`).
|
|
70
|
+
|
|
71
|
+
MIT
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
# lang-natural (Python)
|
|
2
|
+
|
|
3
|
+
Detect **unnatural / broken translations** in your locale files — with **zero LLM**.
|
|
4
|
+
Catches dropped placeholders, untranslated strings, wrong script, repeated words,
|
|
5
|
+
markup drift and length anomalies. Runs fully offline, no API key. An optional AI
|
|
6
|
+
layer (`--fix`) suggests natural rewrites only when you ask.
|
|
7
|
+
|
|
8
|
+
For Python backends (Django, FastAPI, Flask) that keep user-facing strings in JSON.
|
|
9
|
+
|
|
10
|
+
## Install
|
|
11
|
+
```bash
|
|
12
|
+
pip install lang-natural
|
|
13
|
+
```
|
|
14
|
+
|
|
15
|
+
## Configure — `lang-natural.config.json`
|
|
16
|
+
```json
|
|
17
|
+
{
|
|
18
|
+
"localesDir": "./locales",
|
|
19
|
+
"sourceLocale": "en",
|
|
20
|
+
"targetLocales": ["ar", "es", "hi"],
|
|
21
|
+
"failOn": "error"
|
|
22
|
+
}
|
|
23
|
+
```
|
|
24
|
+
|
|
25
|
+
## Run
|
|
26
|
+
```bash
|
|
27
|
+
lang-natural # scan and print the report
|
|
28
|
+
lang-natural --json # machine-readable output
|
|
29
|
+
lang-natural --fix # LLM fix suggestions (needs GEMINI_API_KEY)
|
|
30
|
+
```
|
|
31
|
+
Exit code is `1` when blocking issues are found — so it fails CI. Exit code `2` means the config could not be loaded or the run itself failed.
|
|
32
|
+
|
|
33
|
+
## Use in code
|
|
34
|
+
```python
|
|
35
|
+
from lang_natural import detect
|
|
36
|
+
|
|
37
|
+
issues = detect("cart.itemCount", "ar",
|
|
38
|
+
"You have {count} items", "لديك عناصر")
|
|
39
|
+
for i in issues:
|
|
40
|
+
print(i.severity, i.message)
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
## What the local (no-LLM) detector catches
|
|
44
|
+
- Empty translations
|
|
45
|
+
- Missing / extra placeholders (`{count}`, `{{name}}`, `%s`)
|
|
46
|
+
- Strings identical to the source (untranslated)
|
|
47
|
+
- Wrong script (Latin text in an Arabic/Hindi/… locale)
|
|
48
|
+
- Repeated words
|
|
49
|
+
- HTML/markup that differs from the source
|
|
50
|
+
- Length outliers vs. the locale's typical ratio
|
|
51
|
+
|
|
52
|
+
Subtle word-order naturalness and rewrite suggestions require the optional LLM
|
|
53
|
+
layer (`--fix`).
|
|
54
|
+
|
|
55
|
+
MIT
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["hatchling"]
|
|
3
|
+
build-backend = "hatchling.build"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "lang-natural"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Detect unnatural / broken translations in locale files with ZERO LLM. Placeholders, untranslated strings, wrong script, repeats and more. Optional AI layer for fix suggestions. Works in any Python backend."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.9"
|
|
11
|
+
license = { text = "MIT" }
|
|
12
|
+
authors = [{ name = "diwakarbhatt1983" }]
|
|
13
|
+
keywords = ["i18n", "internationalization", "translation", "localization", "linter", "naturalness", "detector", "django", "fastapi"]
|
|
14
|
+
classifiers = [
|
|
15
|
+
"Programming Language :: Python :: 3",
|
|
16
|
+
"License :: OSI Approved :: MIT License",
|
|
17
|
+
"Operating System :: OS Independent",
|
|
18
|
+
"Topic :: Software Development :: Localization",
|
|
19
|
+
]
|
|
20
|
+
dependencies = []
|
|
21
|
+
|
|
22
|
+
[project.urls]
|
|
23
|
+
Homepage = "https://github.com/diwakarbhatt1983/lang-natural"
|
|
24
|
+
|
|
25
|
+
[project.scripts]
|
|
26
|
+
lang-natural = "lang_natural.cli:main"
|
|
27
|
+
|
|
28
|
+
[tool.hatch.build.targets.wheel]
|
|
29
|
+
packages = ["src/lang_natural"]
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
"""lang-natural — local (no-LLM) translation naturalness/quality detector."""
|
|
2
|
+
from .detector import detect, Issue
|
|
3
|
+
from .placeholders import check_placeholders, extract_placeholders
|
|
4
|
+
from .script import expected_script, script_ratio, is_non_latin_locale
|
|
5
|
+
from .files import load_locale, pair_locale, flatten
|
|
6
|
+
from .config import Config, load_config_file, find_config, resolve_config
|
|
7
|
+
from .report import run, render_report, RunSummary, Finding
|
|
8
|
+
|
|
9
|
+
__version__ = "0.1.0"
|
|
10
|
+
|
|
11
|
+
__all__ = [
|
|
12
|
+
"detect", "Issue",
|
|
13
|
+
"check_placeholders", "extract_placeholders",
|
|
14
|
+
"expected_script", "script_ratio", "is_non_latin_locale",
|
|
15
|
+
"load_locale", "pair_locale", "flatten",
|
|
16
|
+
"Config", "load_config_file", "find_config", "resolve_config",
|
|
17
|
+
"run", "render_report", "RunSummary", "Finding",
|
|
18
|
+
"__version__",
|
|
19
|
+
]
|
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
"""Command-line entry point: `lang-natural`."""
|
|
2
|
+
import argparse
|
|
3
|
+
import json
|
|
4
|
+
import sys
|
|
5
|
+
|
|
6
|
+
from .config import find_config, load_config_file, resolve_config
|
|
7
|
+
from .report import run, render_report
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def _summary_to_dict(summary) -> dict:
    """Convert a run summary into plain dicts and lists for ``json.dumps``.

    Reads ``summary.ok``, ``summary.stats`` and ``summary.reports``. Report
    attributes ``missing_keys`` / ``extra_keys`` are emitted under the
    camelCase keys ``missingKeys`` / ``extraKeys``.
    """

    def issue_dict(issue) -> dict:
        return {"rule": issue.rule, "severity": issue.severity, "message": issue.message}

    def finding_dict(finding) -> dict:
        return {
            "key": finding.key,
            "source": finding.source,
            "candidate": finding.candidate,
            "issues": [issue_dict(issue) for issue in finding.issues],
            "suggestion": finding.suggestion,
        }

    def report_dict(report) -> dict:
        return {
            "locale": report.locale,
            "file": report.file,
            "missingKeys": report.missing_keys,
            "extraKeys": report.extra_keys,
            "findings": [finding_dict(finding) for finding in report.findings],
        }

    return {
        "ok": summary.ok,
        "stats": summary.stats,
        "reports": [report_dict(report) for report in summary.reports],
    }
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def main(argv: "list[str] | None" = None) -> int:
    """Run the ``lang-natural`` command line and return its exit status.

    Args:
        argv: Arguments to parse; ``None`` lets argparse fall back to
            ``sys.argv``.

    Returns:
        ``0`` when the run is ok (or only ``--version`` was requested), ``1``
        when the summary is not ok, and ``2`` when no config is found, the
        config cannot be loaded, or the scan raises.
    """
    # NOTE: the ``argv`` annotation is quoted because ``X | None`` is evaluated
    # when the function is defined and raises TypeError on Python 3.9, which
    # the package metadata declares as supported.
    parser = argparse.ArgumentParser(
        prog="lang-natural",
        description="Detect unnatural / broken translations locally (no LLM required).",
    )
    parser.add_argument("-c", "--config", help="Path to config JSON (default: auto-detect)")
    parser.add_argument("--json", action="store_true", help="Machine-readable JSON output")
    parser.add_argument("--fix", action="store_true", help="LLM fix suggestions (needs GEMINI_API_KEY)")
    parser.add_argument("-v", "--version", action="store_true", help="Show version")
    args = parser.parse_args(argv)

    if args.version:
        from . import __version__
        print(__version__)
        return 0

    # Top-level boundary: any config problem becomes a message plus exit status 2.
    try:
        if args.config:
            config = load_config_file(args.config)
        else:
            path = find_config()
            if not path:
                sys.stderr.write("No lang-natural.config.json found in this directory.\n")
                return 2
            config = load_config_file(path)
    except Exception as err:  # noqa: BLE001
        sys.stderr.write(f"Config error: {err}\n")
        return 2

    try:
        summary = run(config)
        if args.fix:
            # Imported lazily so the LLM layer is only loaded when --fix is used.
            from .llm import suggest_fixes
            suggest_fixes(summary)
        if args.json:
            print(json.dumps(_summary_to_dict(summary), ensure_ascii=False, indent=2))
        else:
            print(render_report(summary))
        return 0 if summary.ok else 1
    except Exception as err:  # noqa: BLE001
        sys.stderr.write(f"\nError: {err}\n")
        return 2
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
"""Config loading. Uses a JSON config so it stays language-agnostic."""
|
|
2
|
+
import json
|
|
3
|
+
import os
|
|
4
|
+
from dataclasses import dataclass, field
|
|
5
|
+
|
|
6
|
+
_NAMES = ["lang-natural.config.json", "langnatural.config.json"]
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
@dataclass
class Config:
    """Resolved lang-natural settings."""

    # Directory that holds the locale JSON files.
    locales_dir: str
    # Locale whose strings are treated as the source text.
    source_locale: str
    # Locales that are checked against the source locale.
    target_locales: list[str]
    # Names of detector rules whose issues are suppressed.
    disable_rules: list[str] = field(default_factory=list)
    # Severity threshold for a failing run; defaults to "error".
    fail_on: str = "error"
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def find_config(cwd: "str | None" = None) -> "str | None":
    """Return the path of the first known config file in *cwd*, or ``None``.

    Args:
        cwd: Directory to search; a falsy value means the current working
            directory.

    Returns:
        The joined path of the first name in ``_NAMES`` that exists in the
        directory, or ``None`` when none of them exists.
    """
    # NOTE: annotations are quoted because ``str | None`` is evaluated at
    # definition time and raises TypeError on Python 3.9.
    base = cwd or os.getcwd()
    for name in _NAMES:
        candidate = os.path.join(base, name)
        if os.path.exists(candidate):
            return candidate
    return None
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def load_config_file(path: str) -> Config:
    """Read the UTF-8 JSON file at *path* and resolve it into a ``Config``."""
    with open(path, encoding="utf-8") as handle:
        payload = json.load(handle)
    config = resolve_config(payload)
    return config
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def _as_list(value, message: str) -> list:
    """Return *value* as a list, raising ``ValueError(message)`` if it is not list-like."""
    # list("ar") would silently become ["a", "r"], so bare strings are rejected.
    if isinstance(value, (str, bytes)):
        raise ValueError(message)
    try:
        return list(value)
    except TypeError:
        raise ValueError(message) from None


def resolve_config(raw: dict) -> Config:
    """Validate a raw config mapping and build a ``Config`` from it.

    Every option is read under its camelCase name (shared with the JS config)
    first and its snake_case name second; the first truthy value is used.

    Args:
        raw: Parsed config, e.g. the result of ``json.load``.

    Returns:
        The populated ``Config``; ``disable_rules`` defaults to ``[]`` and
        ``fail_on`` to ``"error"``.

    Raises:
        ValueError: if *raw* is not a dict, a required option is missing or
            empty, a list option is not list-like, or ``failOn`` is not one of
            ``"info"``, ``"warning"``, ``"error"``.
    """
    if not isinstance(raw, dict):
        raise ValueError("config must be a JSON object.")

    def option(camel: str, snake: str):
        return raw.get(camel) or raw.get(snake)

    locales_dir = option("localesDir", "locales_dir")
    source = option("sourceLocale", "source_locale")
    targets = option("targetLocales", "target_locales")
    if not locales_dir:
        raise ValueError("config.localesDir is required.")
    if not source:
        raise ValueError("config.sourceLocale is required.")
    if not targets:
        raise ValueError("config.targetLocales must be a non-empty list.")

    target_locales = _as_list(targets, "config.targetLocales must be a non-empty list.")
    disable_rules = _as_list(
        option("disableRules", "disable_rules") or [],
        "config.disableRules must be a list.",
    )

    # Reject unknown severities here so the failure is reported as a config
    # error instead of surfacing later as a bare KeyError during the run.
    fail_on = option("failOn", "fail_on") or "error"
    if fail_on not in ("info", "warning", "error"):
        raise ValueError('config.failOn must be one of "info", "warning", "error".')

    return Config(
        locales_dir=locales_dir,
        source_locale=source,
        target_locales=target_locales,
        disable_rules=disable_rules,
        fail_on=fail_on,
    )
|
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
"""The LOCAL detector — zero LLM, zero network, deterministic.
|
|
2
|
+
|
|
3
|
+
Identical checks to the npm version so both packages behave the same.
|
|
4
|
+
"""
|
|
5
|
+
import re
|
|
6
|
+
from dataclasses import dataclass
|
|
7
|
+
|
|
8
|
+
from .placeholders import check_placeholders
|
|
9
|
+
from .script import expected_script, script_ratio, is_non_latin_locale
|
|
10
|
+
|
|
11
|
+
_HTML_TAG = re.compile(r"</?[a-zA-Z][^>]*>")
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
@dataclass
class Issue:
    """A single problem reported by the detector for one translation."""

    # Name of the rule that produced the issue (the name checked against the disabled set).
    rule: str
    # One of "error" | "warning" | "info".
    severity: str
    # Human-readable description of the problem.
    message: str
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def _visible_text(text: str) -> str:
    """Return *text* with HTML tags and placeholder tokens blanked out."""
    # Local import: only this helper needs the raw placeholder patterns.
    from .placeholders import _PATTERNS

    text = _HTML_TAG.sub(" ", text)
    for pattern in _PATTERNS:
        text = pattern.sub(" ", text)
    return text


def detect(
    key: str,
    locale: str,
    source: str,
    candidate: str,
    median_len_ratio: float = 0.0,
    disabled: "set[str] | None" = None,
) -> list[Issue]:
    """Run every local check on one source/translation pair.

    Args:
        key: Translation key. Not read by any check in this function.
        locale: Target locale code; selects the expected script.
        source: Source-language string.
        candidate: Translated string to check.
        median_len_ratio: Typical candidate/source length ratio for the
            locale; a falsy value disables the length-outlier check.
        disabled: Rule names whose issues are dropped.

    Returns:
        The issues found, in check order. An empty translation yields only the
        ``empty`` issue (if that rule is enabled) and skips all other checks.
    """
    # NOTE: the ``disabled`` annotation is quoted because ``X | None`` is
    # evaluated at definition time and raises TypeError on Python 3.9.
    disabled = disabled or set()
    issues: list[Issue] = []

    def add(rule: str, severity: str, message: str) -> None:
        if rule not in disabled:
            issues.append(Issue(rule, severity, message))

    trimmed = candidate.strip()
    source_trimmed = source.strip()

    # 1. Empty translation.
    if trimmed == "":
        add("empty", "error", "Translation is empty")
        return issues

    # 2. Placeholders must survive.
    ph = check_placeholders(source, candidate)
    if ph["missing"]:
        add("placeholder", "error",
            f"Missing placeholder(s): {', '.join(ph['missing'])} — this can crash the UI")
    if ph["extra"]:
        add("placeholder", "warning", f"Unexpected placeholder(s): {', '.join(ph['extra'])}")

    # 3. Identical to source → likely untranslated.
    if trimmed == source_trimmed and any(c.isalpha() for c in source):
        add("untranslated", "warning", "Identical to the source string (untranslated?)")

    # 4. Wrong script for non-Latin locales.
    if is_non_latin_locale(locale):
        script = expected_script(locale)
        # Placeholder names and HTML tags are Latin by nature; measuring them
        # would flag a correct translation such as "مرحبا {username}".
        ratio = script_ratio(_visible_text(candidate), script)
        if ratio < 0.4:
            add("wrong-script", "error",
                f"Mostly non-{script} characters — looks untranslated or wrong script")

    # 5. Repeated word (works for any script); only the first repeat is reported.
    words = [w for w in re.split(r"\s+", candidate) if w]
    for previous, word in zip(words, words[1:]):
        if len(word) >= 2 and word == previous and any(c.isalpha() for c in word):
            add("repeated-word", "warning", f'Repeated word: "{word}"')
            break

    # 6. HTML/markup tags must match (compared as sorted lists, so order is ignored).
    src_tags = sorted(_HTML_TAG.findall(source))
    cand_tags = sorted(_HTML_TAG.findall(candidate))
    if src_tags != cand_tags:
        add("markup", "error", "HTML tags differ from the source string")

    # 7. Length outlier vs the locale's typical translation length.
    if median_len_ratio and len(source_trimmed) >= 8:
        ratio = len(trimmed) / max(1, len(source_trimmed))
        m = median_len_ratio
        if m > 0 and (ratio > m * 3 or ratio < m / 3):
            add("length-outlier", "info",
                f"Unusual length vs source ({ratio:.2f}x, locale typical ~{m:.2f}x) — possible mistranslation")

    # 8. Doubled / stray whitespace.
    if re.search(r"\s{2,}", candidate) or candidate != trimmed:
        add("whitespace", "info", "Extra or trailing whitespace")

    return issues
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
"""Locale file loading, flattening and pairing."""
|
|
2
|
+
import json
|
|
3
|
+
import os
|
|
4
|
+
from dataclasses import dataclass, field
|
|
5
|
+
from statistics import median
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
@dataclass
class LoadedLocale:
    """A locale file after loading and flattening."""

    # Locale code the file was loaded for.
    locale: str
    # Path of the file that was read.
    file: str
    # Flattened translations: dotted key -> string value.
    strings: dict[str, str]
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def flatten(obj, prefix: str = "", out: "dict[str, str] | None" = None) -> dict[str, str]:
    """Flatten nested dicts into a single ``dotted.key -> string`` mapping.

    Args:
        obj: Parsed JSON value. Anything that is not a dict contributes nothing.
        prefix: Dotted key prefix for the entries of *obj*.
        out: Mapping to fill in place; a new dict is created when ``None``.

    Returns:
        *out* (or the new dict). Only string leaves are kept; values that are
        neither ``str`` nor ``dict`` (numbers, lists, ...) are skipped.
    """
    # NOTE: the ``out`` annotation is quoted because ``X | None`` is evaluated
    # at definition time and raises TypeError on Python 3.9.
    result = {} if out is None else out
    if not isinstance(obj, dict):
        return result
    for name, value in obj.items():
        dotted = f"{prefix}.{name}" if prefix else name
        if isinstance(value, str):
            result[dotted] = value
        elif isinstance(value, dict):
            flatten(value, dotted, result)
    return result
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def locale_file_path(locales_dir: str, locale: str) -> str:
    """Return the path of the locale's JSON file inside *locales_dir*.

    Three layouts are probed in order: ``<locale>.json``,
    ``<locale>/index.json`` and ``<locale>/<locale>.json``. The first existing
    path wins; when none exists the first layout is returned anyway.
    """
    root = os.path.abspath(locales_dir)
    layouts = (
        (f"{locale}.json",),
        (locale, "index.json"),
        (locale, f"{locale}.json"),
    )
    paths = [os.path.join(root, *parts) for parts in layouts]
    return next((path for path in paths if os.path.exists(path)), paths[0])
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def load_locale(locales_dir: str, locale: str) -> LoadedLocale:
    """Load and flatten the JSON file for *locale*.

    Raises:
        FileNotFoundError: if the resolved locale file does not exist.
    """
    resolved = locale_file_path(locales_dir, locale)
    if os.path.exists(resolved):
        with open(resolved, encoding="utf-8") as handle:
            document = json.load(handle)
        return LoadedLocale(locale=locale, file=resolved, strings=flatten(document))
    raise FileNotFoundError(f'Locale file not found for "{locale}": {resolved}')
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
@dataclass
class PairingResult:
    """Outcome of matching a target locale's keys against the source locale."""

    # One {"key", "source", "candidate"} dict per key present in both locales.
    pairs: list[dict] = field(default_factory=list)
    # Source keys that the target locale does not have.
    missing_keys: list[str] = field(default_factory=list)
    # Target keys that the source locale does not have.
    extra_keys: list[str] = field(default_factory=list)
    # Median candidate/source length ratio; 0.0 when no ratio was collected.
    median_len_ratio: float = 0.0
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def pair_locale(source: LoadedLocale, target: LoadedLocale) -> PairingResult:
    """Match every source key with its translation in *target*.

    Keys absent from the target are collected as missing, keys only present in
    the target as extra. The median length ratio is taken over pairs whose
    stripped source has at least 8 characters and whose stripped candidate is
    not empty.
    """
    translations = target.strings
    matched: list[dict] = []
    absent: list[str] = []
    length_ratios: list[float] = []

    for name, original in source.strings.items():
        translation = translations.get(name)
        if translation is None:
            absent.append(name)
            continue
        matched.append({"key": name, "source": original, "candidate": translation})
        source_len = len(original.strip())
        candidate_len = len(translation.strip())
        if source_len >= 8 and candidate_len > 0:
            length_ratios.append(candidate_len / source_len)

    unexpected = [name for name in translations if name not in source.strings]
    typical = median(length_ratios) if length_ratios else 0.0
    return PairingResult(
        pairs=matched,
        missing_keys=absent,
        extra_keys=unexpected,
        median_len_ratio=typical,
    )
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
"""OPTIONAL LLM layer for --fix. Uses stdlib urllib (no extra deps).
|
|
2
|
+
|
|
3
|
+
The detector never needs this; it only runs on --fix with GEMINI_API_KEY set.
|
|
4
|
+
"""
|
|
5
|
+
import json
|
|
6
|
+
import os
|
|
7
|
+
import urllib.request
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
# Upper bound for the single HTTP request so --fix cannot hang indefinitely.
_REQUEST_TIMEOUT_SECONDS = 60


def _response_text(data) -> str:
    """Join the text parts of the first candidate in a response; "" if there is none."""
    candidates = data.get("candidates") if isinstance(data, dict) else None
    if not candidates or not isinstance(candidates[0], dict):
        return ""
    content = candidates[0].get("content") or {}
    parts = content.get("parts") or [] if isinstance(content, dict) else []
    return "".join(
        part["text"]
        for part in parts
        if isinstance(part, dict) and isinstance(part.get("text"), str)
    )


def _parse_fixes(text: str) -> dict:
    """Map item index -> suggestion from the reply text; ``{}`` when it is unusable."""
    try:
        parsed = json.loads(text)
    except json.JSONDecodeError:
        # Fallback: retry on the outermost {...} span in case the JSON is
        # surrounded by other text.
        start, end = text.find("{"), text.rfind("}")
        if start == -1 or end < start:
            return {}
        try:
            parsed = json.loads(text[start:end + 1])
        except json.JSONDecodeError:
            return {}
    fixes = parsed.get("fixes") if isinstance(parsed, dict) else None
    if not isinstance(fixes, list):
        return {}
    return {
        fix["i"]: fix["suggestion"]
        for fix in fixes
        if isinstance(fix, dict)
        and isinstance(fix.get("i"), int)
        and not isinstance(fix.get("i"), bool)
        and isinstance(fix.get("suggestion"), str)
    }


def suggest_fixes(summary) -> None:
    """Request rewrite suggestions for every finding and store them in place.

    Reads the API key from ``GEMINI_API_KEY`` and the model name from
    ``LANG_NATURAL_MODEL`` (default ``gemini-2.5-flash``), sends all findings
    in one request, and sets ``finding.suggestion`` to the returned text, or
    to ``None`` for findings without a usable suggestion.

    Args:
        summary: Run summary whose ``reports[*].findings`` are updated.

    Raises:
        RuntimeError: if ``GEMINI_API_KEY`` is not set.

    Network errors, a timeout, and a non-JSON HTTP body propagate from
    ``urllib`` / ``json``. An unparsable model reply leaves all suggestions
    as ``None``.
    """
    key = os.environ.get("GEMINI_API_KEY")
    if not key:
        raise RuntimeError("--fix needs an LLM key. Set GEMINI_API_KEY in your environment.")

    findings = [f for r in summary.reports for f in r.findings]
    if not findings:
        return

    model = os.environ.get("LANG_NATURAL_MODEL", "gemini-2.5-flash")
    url = f"https://generativelanguage.googleapis.com/v1beta/models/{model}:generateContent"

    # "i" is the finding's position; the reply is matched back through it.
    items = [
        {
            "i": i,
            "locale": f.locale,
            "source": f.source,
            "current": f.candidate,
            "problems": [x.message for x in f.issues],
        }
        for i, f in enumerate(findings)
    ]
    prompt = (
        "You are a native-speaker localization editor. For each item, rewrite the "
        '"current" translation so it reads naturally to a native speaker of the given '
        "locale, keeping the meaning of the source and preserving every placeholder "
        '(like {name}, {count}). Return STRICT JSON: '
        '{"fixes":[{"i":number,"suggestion":string}]}.\n\n'
        + json.dumps(items, ensure_ascii=False, indent=2)
    )
    body = json.dumps({
        "contents": [{"role": "user", "parts": [{"text": prompt}]}],
        "generationConfig": {"temperature": 0, "responseMimeType": "application/json"},
    }).encode("utf-8")

    req = urllib.request.Request(
        url, data=body,
        headers={"content-type": "application/json", "x-goog-api-key": key},
        method="POST",
    )
    with urllib.request.urlopen(req, timeout=_REQUEST_TIMEOUT_SECONDS) as resp:  # noqa: S310
        data = json.loads(resp.read().decode("utf-8"))

    by_index = _parse_fixes(_response_text(data))
    for i, f in enumerate(findings):
        f.suggestion = by_index.get(i)
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
"""Placeholder verification — deterministic, no LLM."""
|
|
2
|
+
import re
|
|
3
|
+
from collections import Counter
|
|
4
|
+
|
|
5
|
+
# Ordered from most to least specific: extract_placeholders lets an earlier
# pattern claim its text before the later patterns are tried.
_PATTERNS = [
    re.compile(r"\{\{\s*[\w.$-]+\s*\}\}"),  # {{name}}
    re.compile(r"\{\s*[\w.$-]+\s*\}"),  # {name}
    re.compile(r"%\d*\$?[sdifege@%]"),  # %s %1$s
    re.compile(r":[a-zA-Z_]\w*"),  # :name
]


def extract_placeholders(text: str) -> list[str]:
    """Return every placeholder token in *text*, with inner whitespace removed.

    Tokens are grouped by pattern in ``_PATTERNS`` order and, within a pattern,
    listed in order of appearance. Each stretch of text is claimed by at most
    one pattern, so ``{{name}}`` is reported once rather than additionally as
    the nested ``{name}``.
    """
    found: list[str] = []
    remaining = text
    for pattern in _PATTERNS:
        tokens = pattern.findall(remaining)
        if not tokens:
            continue
        found.extend(re.sub(r"\s+", "", token) for token in tokens)
        # Overwrite claimed text with NUL filler of the same length: no pattern
        # matches NUL, and keeping the length stops the neighbouring text from
        # joining up into a new token (as deleting "{a}" from "%{a}s" would).
        remaining = pattern.sub(lambda m: "\x00" * len(m.group(0)), remaining)
    return found
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def check_placeholders(source: str, candidate: str) -> dict:
    """Compare the placeholder tokens of *source* and *candidate* as multisets.

    Returns:
        A dict with ``"missing"`` (tokens the candidate lacks, once per lacking
        occurrence), ``"extra"`` (tokens the candidate has in excess) and
        ``"ok"`` (``True`` when both lists are empty).
    """
    expected = Counter(extract_placeholders(source))
    actual = Counter(extract_placeholders(candidate))
    # Counter subtraction keeps only positive counts, in first-seen order.
    missing = list((expected - actual).elements())
    extra = list((actual - expected).elements())
    return {"missing": missing, "extra": extra, "ok": not (missing or extra)}
|
|
@@ -0,0 +1,152 @@
|
|
|
1
|
+
"""Run the local detector and render the terminal report."""
|
|
2
|
+
import os
|
|
3
|
+
import sys
|
|
4
|
+
from dataclasses import dataclass, field
|
|
5
|
+
|
|
6
|
+
from .config import Config
|
|
7
|
+
from .detector import detect, Issue
|
|
8
|
+
from .files import load_locale, pair_locale
|
|
9
|
+
|
|
10
|
+
_RANK = {"info": 1, "warning": 2, "error": 3}
|
|
11
|
+
# Colour is used only on a TTY, when NO_COLOR is unset/empty and TERM is not "dumb".
_USE_COLOR = (
    sys.stdout.isatty()
    and not os.environ.get("NO_COLOR")
    and os.environ.get("TERM") != "dumb"
)


def _c(code):
    """Build a formatter that wraps its argument in the ANSI SGR sequence *code*.

    The returned callable gives back ``str(value)`` unchanged when colour is
    disabled.
    """

    def paint(value):
        if _USE_COLOR:
            return f"\x1b[{code}m{value}\x1b[0m"
        return str(value)

    return paint
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
BOLD, DIM, RED, GREEN, YELLOW, BLUE, CYAN, GRAY = (
|
|
21
|
+
_c(1), _c(2), _c(31), _c(32), _c(33), _c(34), _c(36), _c(90)
|
|
22
|
+
)
|
|
23
|
+
_SEV_COLOR = {"error": RED, "warning": YELLOW, "info": BLUE}
|
|
24
|
+
_SEV_LABEL = {"error": "error", "warning": "warn ", "info": "info "}
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
@dataclass
class Finding:
    """One translated string for which the detector reported issues."""

    # Dotted translation key.
    key: str
    # Target locale the candidate belongs to.
    locale: str
    # Path of the target locale file.
    file: str
    # Source-language string.
    source: str
    # Translated string that was checked.
    candidate: str
    # Issues reported for this pair.
    issues: list[Issue]
    # Optional rewrite suggestion; stays None unless one is assigned later.
    # The annotation is quoted because ``str | None`` is evaluated when the
    # class body runs and raises TypeError on Python 3.9.
    suggestion: "str | None" = None
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
@dataclass
class LocaleReport:
    """Results for one target locale."""

    # Target locale code.
    locale: str
    # Path of the target locale file.
    file: str
    # Flagged translations for this locale.
    findings: list[Finding] = field(default_factory=list)
    # Source keys missing from this locale.
    missing_keys: list[str] = field(default_factory=list)
    # Keys present in this locale but not in the source.
    extra_keys: list[str] = field(default_factory=list)
    # Count of checked translations for this locale.
    checked: int = 0
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
@dataclass
class RunSummary:
    """Aggregate result of a run over all target locales."""

    # One report per target locale.
    reports: list[LocaleReport]
    # Overall pass/fail flag for the run.
    ok: bool
    # Summary statistics for the run.
    stats: dict
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def run(config: Config) -> RunSummary:
    """Check every target locale against the source locale.

    Args:
        config: Resolved settings (locales directory, locales, disabled rules,
            failure threshold).

    Returns:
        A ``RunSummary`` with one report per target locale. ``ok`` is ``False``
        when any issue has a severity at or above ``config.fail_on``. In
        ``stats``, ``errors`` counts findings with at least one error and
        ``warnings`` counts findings with a warning but no error.

    Raises:
        ValueError: if ``config.fail_on`` is not a known severity.
        FileNotFoundError: propagated from ``load_locale`` when a locale file
            is missing.
    """
    # Validate before any file is read: an unknown level used to surface only
    # after the whole scan, as a bare KeyError.
    if config.fail_on not in _RANK:
        known = ", ".join(_RANK)
        raise ValueError(f"Unknown fail_on level {config.fail_on!r}; expected one of: {known}")
    fail_rank = _RANK[config.fail_on]

    disabled = set(config.disable_rules)
    source = load_locale(config.locales_dir, config.source_locale)
    reports: list[LocaleReport] = []

    for locale in config.target_locales:
        target = load_locale(config.locales_dir, locale)
        pr = pair_locale(source, target)
        findings: list[Finding] = []
        for p in pr.pairs:
            issues = detect(
                p["key"], locale, p["source"], p["candidate"],
                median_len_ratio=pr.median_len_ratio, disabled=disabled,
            )
            if issues:
                findings.append(Finding(
                    key=p["key"], locale=locale, file=target.file,
                    source=p["source"], candidate=p["candidate"], issues=issues,
                ))
        reports.append(LocaleReport(
            locale=locale, file=target.file, findings=findings,
            missing_keys=pr.missing_keys, extra_keys=pr.extra_keys, checked=len(pr.pairs),
        ))

    def has(finding: Finding, severity: str) -> bool:
        return any(i.severity == severity for i in finding.issues)

    all_findings = [f for r in reports for f in r.findings]
    errors = sum(1 for f in all_findings if has(f, "error"))
    warnings = sum(1 for f in all_findings if not has(f, "error") and has(f, "warning"))
    ok = not any(_RANK[i.severity] >= fail_rank for f in all_findings for i in f.issues)

    return RunSummary(
        reports=reports,
        ok=ok,
        stats={
            "checked": sum(r.checked for r in reports),
            "flagged": len(all_findings),
            "errors": errors,
            "warnings": warnings,
        },
    )
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
def _rel(path: str) -> str:
    """Return *path* relative to the current directory, or *path* itself if ``relpath`` raises ValueError."""
    try:
        shortened = os.path.relpath(path)
    except ValueError:
        shortened = path
    return shortened
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
def render_report(summary: RunSummary) -> str:
    """Render the whole run as terminal text: header, one section per locale, totals."""
    out = ["", BOLD(" lang-natural ") + DIM("— local translation check (no LLM)"), ""]
    for report in summary.reports:
        _render_locale(report, out)

    stats = summary.stats
    out.append(" " + DIM("─" * 50))

    checked_part = BOLD(str(stats["checked"])) + " checked"
    if stats["errors"]:
        error_part = RED(f"{stats['errors']} errors")
    else:
        error_part = DIM("0 errors")
    if stats["warnings"]:
        warning_part = YELLOW(f"{stats['warnings']} warnings")
    else:
        warning_part = DIM("0 warnings")
    out.append(" " + DIM(" · ").join([checked_part, error_part, warning_part]))

    if summary.ok:
        verdict = GREEN(BOLD("✓ no blocking issues"))
    else:
        verdict = RED(BOLD("✗ issues found — see above"))
    out.append(" " + verdict)
    out.append("")
    return "\n".join(out)
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
def _render_locale(r: LocaleReport, lines: list[str]) -> None:
    """Append the terminal lines for one locale report to *lines* (mutated in place).

    Emits a heading with the locale, file and status, an optional missing-keys
    line, one section per finding labelled with its worst severity, and a
    trailing blank line.
    """
    rel = _rel(r.file)
    if r.findings:
        status = RED(f"✗ {len(r.findings)}/{r.checked} flagged")
    else:
        status = GREEN(f"✓ {r.checked} clean")
    lines.append(" " + BOLD(CYAN(r.locale.upper())) + " " + DIM(rel) + " " + status)
    if r.missing_keys:
        lines.append(" " + YELLOW("missing keys: ") + DIM(_preview(r.missing_keys)))

    for finding in r.findings:
        # Highest-ranked severity among the issues; "info" when there are none.
        worst = max(
            (issue.severity for issue in finding.issues),
            key=_RANK.__getitem__,
            default="info",
        )
        tag = _SEV_COLOR[worst]("[" + _SEV_LABEL[worst] + "]")
        current = DIM("(empty)") if finding.candidate == "" else finding.candidate
        lines.append("")
        lines.append(f" {tag} {BOLD(finding.key)} {DIM('— ' + rel)}")
        lines.append(f" {GRAY('source: ')}{finding.source}")
        lines.append(f" {GRAY('current: ')}{current}")
        lines.extend(
            f" {_SEV_COLOR[issue.severity]('• ' + issue.message)}"
            for issue in finding.issues
        )
        if finding.suggestion:
            lines.append(f" {GRAY('fix: ')}{GREEN(finding.suggestion)}")
    lines.append("")
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
def _preview(keys: list[str], n: int = 8) -> str:
    """Join *keys* with commas, showing at most *n* and summarising the rest as ``+K more``."""
    overflow = len(keys) - n
    if overflow <= 0:
        return ", ".join(keys)
    return ", ".join(keys[:n]) + f", +{overflow} more"
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
"""Script / writing-system detection — no LLM.
|
|
2
|
+
|
|
3
|
+
Mirrors the JS version: if a non-Latin locale's translation is mostly Latin
|
|
4
|
+
letters, it's almost certainly untranslated or wrong.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
# Default writing system per base language code; anything else counts as Latin.
_LOCALE_SCRIPT = {
    "ar": "arabic", "fa": "arabic", "ur": "arabic",
    "hi": "devanagari", "mr": "devanagari", "ne": "devanagari",
    "ru": "cyrillic", "uk": "cyrillic", "bg": "cyrillic", "sr": "cyrillic",
    "zh": "han", "ja": "han", "ko": "hangul",
    "th": "thai", "he": "hebrew", "el": "greek",
}

# Script subtags (lower-cased) that override the language default, e.g.
# "sr-Latn" is Serbian written in Latin letters. Kana maps to "han" because
# that range set below also covers 0x3040-0x30FF.
_SUBTAG_SCRIPT = {
    "latn": "latin", "arab": "arabic", "deva": "devanagari", "cyrl": "cyrillic",
    "hans": "han", "hant": "han", "hani": "han",
    "jpan": "han", "hira": "han", "kana": "han",
    "kore": "hangul", "hang": "hangul",
    "thai": "thai", "hebr": "hebrew", "grek": "greek",
}

# Inclusive codepoint ranges per script.
_RANGES = {
    "latin": [(0x0041, 0x005A), (0x0061, 0x007A), (0x00C0, 0x024F)],
    "arabic": [(0x0600, 0x06FF), (0x0750, 0x077F), (0x08A0, 0x08FF), (0xFB50, 0xFDFF), (0xFE70, 0xFEFF)],
    "devanagari": [(0x0900, 0x097F)],
    "cyrillic": [(0x0400, 0x04FF)],
    "han": [(0x4E00, 0x9FFF), (0x3400, 0x4DBF), (0x3040, 0x30FF)],
    "hangul": [(0xAC00, 0xD7AF), (0x1100, 0x11FF)],
    "thai": [(0x0E00, 0x0E7F)],
    "hebrew": [(0x0590, 0x05FF)],
    "greek": [(0x0370, 0x03FF)],
}


def expected_script(locale: str) -> str:
    """Return the script name a translation for *locale* is expected to use.

    ``_`` and ``-`` are both accepted as separators and case is ignored. A
    known script subtag (``sr-Latn``, ``uz_Cyrl``) takes precedence; otherwise
    the base language decides, defaulting to ``"latin"``.
    """
    subtags = locale.replace("_", "-").lower().split("-")
    for subtag in subtags[1:]:
        if subtag in _SUBTAG_SCRIPT:
            return _SUBTAG_SCRIPT[subtag]
    return _LOCALE_SCRIPT.get(subtags[0], "latin")
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def _in_script(ch: str, script: str) -> bool:
    """Return True when the codepoint of *ch* lies in one of *script*'s ranges."""
    codepoint = ord(ch)
    for low, high in _RANGES[script]:
        if low <= codepoint <= high:
            return True
    return False
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def script_ratio(text: str, script: str) -> float:
    """Return the fraction of alphabetic characters in *text* that belong to *script*.

    Text without any alphabetic character yields ``1.0``.
    """
    total = 0
    matching = 0
    for ch in text:
        if not ch.isalpha():
            continue
        total += 1
        if _in_script(ch, script):
            matching += 1
    if total == 0:
        return 1.0
    return matching / total
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def is_non_latin_locale(locale: str) -> bool:
    """Return True when the script expected for *locale* is anything but Latin."""
    script = expected_script(locale)
    return script != "latin"
|