python-po-lint 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- po_lint/__init__.py +3 -0
- po_lint/checks.py +291 -0
- po_lint/cli.py +208 -0
- po_lint/config.py +94 -0
- po_lint/detector.py +314 -0
- po_lint/linter.py +277 -0
- python_po_lint-0.1.0.dist-info/METADATA +67 -0
- python_po_lint-0.1.0.dist-info/RECORD +11 -0
- python_po_lint-0.1.0.dist-info/WHEEL +4 -0
- python_po_lint-0.1.0.dist-info/entry_points.txt +2 -0
- python_po_lint-0.1.0.dist-info/licenses/LICENSE +201 -0
po_lint/detector.py
ADDED
|
@@ -0,0 +1,314 @@
|
|
|
1
|
+
"""Language detection using fastText for .po file linting."""
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
import re
|
|
5
|
+
import urllib.request
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
import fasttext
|
|
9
|
+
|
|
10
|
+
# Silence fastText's eprint() helper, which prints a warning to stderr on every load_model() call
|
|
11
|
+
fasttext.FastText.eprint = lambda x: None
|
|
12
|
+
|
|
13
|
+
# Default minimum cleaned text length to attempt language detection.
|
|
14
|
+
# Short strings are unreliable — loan words, cognates, and brand names
|
|
15
|
+
# make detection impossible for anything under ~30 characters.
|
|
16
|
+
DEFAULT_MIN_DETECTION_LENGTH = 30
|
|
17
|
+
|
|
18
|
+
FULL_MODEL_URL = "https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin"
|
|
19
|
+
COMPACT_MODEL_URL = "https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.ftz"
|
|
20
|
+
MODEL_DIR = Path(os.environ.get("PO_LINT_MODEL_DIR", Path.home() / ".cache" / "po-lint"))
|
|
21
|
+
|
|
22
|
+
# Common aliases: locale directory names that don't match the ISO 639-1 code
|
|
23
|
+
# used by fastText. Only edge cases go here — most codes work as-is.
|
|
24
|
+
LOCALE_ALIASES = {
    # Chinese script and region variants
    **dict.fromkeys(("zh_Hans", "zh_Hant", "zh_CN", "zh_TW"), "zh"),
    # Norwegian Bokmål / Nynorsk → fastText uses "no"
    **dict.fromkeys(("nb", "nn"), "no"),
    # Regional variants that collapse to their base language
    **dict.fromkeys(("pt_BR", "pt_PT"), "pt"),
    **dict.fromkeys(("es_AR", "es_MX"), "es"),
    **dict.fromkeys(("en_US", "en_GB"), "en"),
    **dict.fromkeys(("fr_CA", "fr_FR"), "fr"),
    # Serbian script variants
    **dict.fromkeys(("sr_Latn", "sr_Cyrl"), "sr"),
}
|
|
42
|
+
|
|
43
|
+
# Carrier phrases per language — used for second-pass confirmation.
|
|
44
|
+
# When fastText flags a wrong language, we re-test with a carrier phrase
|
|
45
|
+
# prepended. If the originally detected language drops significantly (>60%)
|
|
46
|
+
# and the expected language rises significantly (>20%), the original detection
|
|
47
|
+
# was likely a false positive on ambiguous text.
|
|
48
|
+
# Uses short "context" style phrases ("In X one says") to give just enough
|
|
49
|
+
# signal without overpowering real contamination in related languages.
|
|
50
|
+
CARRIER_PHRASES = {
|
|
51
|
+
"af": "In Afrikaans sê mens",
|
|
52
|
+
"ar": "بالعربية يقال",
|
|
53
|
+
"bg": "На български се казва",
|
|
54
|
+
"bn": "বাংলায় বলা হয়",
|
|
55
|
+
"bs": "Na bosanskom se kaže",
|
|
56
|
+
"ca": "En català es diu",
|
|
57
|
+
"cs": "Česky se říká",
|
|
58
|
+
"da": "På dansk siger man",
|
|
59
|
+
"de": "Auf Deutsch sagt man",
|
|
60
|
+
"el": "Στα ελληνικά λέμε",
|
|
61
|
+
"en": "In English one says",
|
|
62
|
+
"es": "En español se dice",
|
|
63
|
+
"et": "Eesti keeles öeldakse",
|
|
64
|
+
"fa": "به فارسی میگویند",
|
|
65
|
+
"fi": "Suomeksi sanotaan",
|
|
66
|
+
"fr": "En français on dit",
|
|
67
|
+
"he": "בעברית אומרים",
|
|
68
|
+
"hi": "हिंदी में कहते हैं",
|
|
69
|
+
"hr": "Na hrvatskom se kaže",
|
|
70
|
+
"hu": "Magyarul azt mondják",
|
|
71
|
+
"id": "Dalam bahasa Indonesia dikatakan",
|
|
72
|
+
"it": "In italiano si dice",
|
|
73
|
+
"ja": "日本語では",
|
|
74
|
+
"ko": "한국어로는",
|
|
75
|
+
"lt": "Lietuviškai sakoma",
|
|
76
|
+
"lv": "Latviski saka",
|
|
77
|
+
"mk": "На македонски се вели",
|
|
78
|
+
"ms": "Dalam bahasa Melayu dikatakan",
|
|
79
|
+
"nl": "In het Nederlands zegt men",
|
|
80
|
+
"no": "På norsk sier man",
|
|
81
|
+
"pl": "Po polsku mówi się",
|
|
82
|
+
"pt": "Em português diz-se",
|
|
83
|
+
"ro": "În română se spune",
|
|
84
|
+
"ru": "По-русски говорят",
|
|
85
|
+
"sk": "Po slovensky sa hovorí",
|
|
86
|
+
"sl": "V slovenščini se reče",
|
|
87
|
+
"sr": "На српском се каже",
|
|
88
|
+
"sv": "På svenska säger man",
|
|
89
|
+
"sw": "Kwa Kiswahili tunasema",
|
|
90
|
+
"th": "ในภาษาไทยพูดว่า",
|
|
91
|
+
"tr": "Türkçede denir ki",
|
|
92
|
+
"uk": "Українською кажуть",
|
|
93
|
+
"vi": "Trong tiếng Việt người ta nói",
|
|
94
|
+
"zh": "用中文来说",
|
|
95
|
+
}
|
|
96
|
+
|
|
97
|
+
# Confused language merges: when fastText detects a language that is commonly
|
|
98
|
+
# confused with the expected language, merge its score into the expected language's
|
|
99
|
+
# score. This is directional — e.g. Swedish text can be confused as German (sv merges
|
|
100
|
+
# de), but German text is rarely confused as Swedish (de does NOT merge sv).
|
|
101
|
+
# This replaces blanket skipping with score redistribution, so genuinely wrong
|
|
102
|
+
# translations at very high confidence are still caught.
|
|
103
|
+
def _mutual_merges(*codes: str, extra: frozenset[str] = frozenset()) -> dict[str, set[str]]:
    """Map each code to all the other codes in the group, plus any one-way extras."""
    return {code: (set(codes) - {code}) | extra for code in codes}


CONFUSED_MERGES: dict[str, set[str]] = {
    # Scandinavian languages — very similar vocabulary and grammar.
    # German is merged one-way only: there is deliberately no "de" key.
    **_mutual_merges("no", "da", "sv", "nb", "nn", extra=frozenset({"de"})),
    # Romance languages
    **_mutual_merges("pt", "es", "gl"),
    # Germanic
    **_mutual_merges("nl", "af"),
    # Turkic languages
    **_mutual_merges("tr", "az"),
    # Cyrillic languages — shared script and vocabulary
    **_mutual_merges("uk", "ru"),
    **_mutual_merges("bg", "mk"),
    # Indic languages — shared Devanagari script
    **_mutual_merges("hi", "mr"),
    # Arabic script languages — shared script and vocabulary roots
    **_mutual_merges("ar", "fa", "ur"),
}
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
def _normalize_locale(locale: str) -> str:
    """Normalize a locale directory name to a fastText-compatible language code.

    Resolution order:

    1. An exact entry in ``LOCALE_ALIASES`` (e.g. ``"zh_Hans"`` -> ``"zh"``).
    2. Otherwise the base language subtag — everything before the first
       ``_``, ``-``, ``@`` or ``.`` — lower-cased and itself passed through
       ``LOCALE_ALIASES`` (e.g. ``"de_DE"`` -> ``"de"``, ``"pt-br"`` -> ``"pt"``,
       ``"nb_NO"`` -> ``"nb"`` -> ``"no"``).

    Without step 2 a regional locale that is not listed as an alias would be
    compared verbatim against fastText's bare language labels and could never
    match, so every entry in that locale would look like the wrong language.

    Args:
        locale: Locale name as found in the directory layout or configuration.

    Returns:
        The language code to compare against fastText labels. If no base
        subtag can be extracted, *locale* is returned unchanged.
    """
    alias = LOCALE_ALIASES.get(locale)
    if alias is not None:
        return alias

    # Drop region / script / modifier / encoding suffixes.
    base = re.split(r"[_\-@.]", locale, maxsplit=1)[0].lower()
    if not base:
        return locale
    return LOCALE_ALIASES.get(base, base)
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
def _use_compact_model() -> bool:
|
|
141
|
+
"""Check if compact model is requested via environment variable."""
|
|
142
|
+
return os.environ.get("PO_LINT_COMPACT_MODEL", "").lower() in ("1", "true", "yes")
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
def ensure_model(compact: bool = False) -> Path:
    """Return the path of the cached fastText language-ID model, downloading it if needed.

    The compact (``.ftz``) model is used when *compact* is true or when the
    ``PO_LINT_COMPACT_MODEL`` environment variable enables it; otherwise the
    full (``.bin``) model is used. Models are stored under ``MODEL_DIR``.

    The download is written to a temporary sibling file and only moved into
    place once complete. An interrupted download therefore never leaves a
    truncated file at the final path, where the ``exists()`` check would
    otherwise accept it as a valid cached model on every later run.

    Args:
        compact: Request the compact model regardless of the environment.

    Returns:
        Path to the model file on disk.

    Raises:
        OSError: If the cache directory cannot be created or the download
            fails (``urllib.error.URLError`` is an ``OSError`` subclass).
    """
    if compact or _use_compact_model():
        url = COMPACT_MODEL_URL
        path = MODEL_DIR / "lid.176.ftz"
    else:
        url = FULL_MODEL_URL
        path = MODEL_DIR / "lid.176.bin"

    if path.exists():
        return path

    MODEL_DIR.mkdir(parents=True, exist_ok=True)
    model_type = "compact" if "ftz" in path.name else "full"
    print(f"Downloading fastText language model ({model_type}) to {path}...")

    # Per-process temp name so concurrent first runs do not clobber each other.
    partial = path.with_name(f"{path.name}.{os.getpid()}.part")
    try:
        urllib.request.urlretrieve(url, partial)
        os.replace(partial, path)
    finally:
        # No-op after a successful replace; removes the leftover on failure.
        partial.unlink(missing_ok=True)
    return path
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
_ft_model = None


def init_model(compact: bool = False) -> None:
    """Load a fastText model into the module-level cache.

    Call this before linting to choose the model variant explicitly; the
    model file is obtained through ``ensure_model``. Calling it again
    replaces the previously cached model.
    """
    global _ft_model
    _ft_model = fasttext.load_model(str(ensure_model(compact)))
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
def get_ft_model() -> fasttext.FastText._FastText:
    """Return the cached fastText model, loading the default variant on first use."""
    if _ft_model is not None:
        return _ft_model
    # Nothing cached yet: init_model() fills the module-level slot read below.
    init_model()
    return _ft_model
|
|
179
|
+
|
|
180
|
+
|
|
181
|
+
def clean_text(text: str) -> str:
    """Strip markup and placeholders so only natural-language text remains.

    Each of the following is replaced by a space, after which whitespace is
    collapsed and the result trimmed:

    * HTML tags (``<b>``, ``</a>``, ...)
    * template tags (``{{ var }}``, ``{% tag %}``)
    * named format specifiers (``%(name)s``, ``%(count)d``)
    * positional format specifiers (``%s``, ``%d``)
    * brace placeholders (``{name}``, ``{0}``)
    * http/https URLs

    Positional and brace placeholders are removed as well so that this
    function agrees with what the linter already treats as non-linguistic;
    leaving them in inflates the cleaned length and feeds ASCII identifier
    names to the language detector.

    Args:
        text: Raw msgstr (or any text) to clean.

    Returns:
        The cleaned text, possibly empty.
    """
    text = re.sub(r"<[^>]+>", " ", text)  # HTML tags
    text = re.sub(r"\{[%{].*?[%}]\}", " ", text)  # {{ ... }} / {% ... %}
    text = re.sub(r"%\([^)]+\)[sd]", " ", text)  # %(name)s / %(name)d
    text = re.sub(r"%[sd]", " ", text)  # %s / %d
    text = re.sub(r"\{[^}]*\}", " ", text)  # {name} / {0}
    text = re.sub(r"https?://\S+", " ", text)  # URLs
    return re.sub(r"\s+", " ", text).strip()
|
|
189
|
+
|
|
190
|
+
|
|
191
|
+
def detect_language(text: str, min_detection_length: int = DEFAULT_MIN_DETECTION_LENGTH) -> tuple[str, float]:
    """Identify the language of *text* with fastText.

    The text is cleaned first. If fewer than *min_detection_length*
    characters remain, detection is not attempted and ``("unknown", 0.0)``
    is returned.

    Returns:
        ``(lang_code, confidence)`` for the top fastText prediction.
    """
    stripped = clean_text(text)
    long_enough = len(stripped) >= min_detection_length
    return _detect_fasttext(stripped) if long_enough else ("unknown", 0.0)
|
|
202
|
+
|
|
203
|
+
|
|
204
|
+
def _detect_fasttext(text: str, k: int = 1) -> tuple[str, float] | dict[str, float]:
    """Run fastText language identification on *text*.

    Args:
        text: Text to classify. Newlines are replaced by spaces before the
            text is handed to the model.
        k: Number of predictions to request from the model.

    Returns:
        With ``k == 1``: ``(lang, confidence)`` for the top prediction, or
        ``("unknown", 0.0)`` when the model returns no prediction at all
        (for example for empty input) instead of failing with an IndexError.
        With any other ``k``: a dict ``{lang: confidence}`` of the returned
        predictions, which may be empty.

        Confidences are converted to plain Python floats.
    """
    model = get_ft_model()
    labels, probabilities = model.predict(text.replace("\n", " "), k=k)
    ranked = {
        label.replace("__label__", ""): float(probability)
        for label, probability in zip(labels, probabilities)
    }
    if k != 1:
        return ranked
    if not ranked:
        return ("unknown", 0.0)
    # k == 1 yields at most one entry; return it as a (lang, confidence) pair.
    return next(iter(ranked.items()))
|
|
220
|
+
|
|
221
|
+
|
|
222
|
+
def _merge_confused_scores(
    scores: dict[str, float], expected_code: str,
) -> dict[str, float]:
    """Fold the scores of look-alike languages into the expected language.

    ``CONFUSED_MERGES`` lists, per expected language, the languages fastText
    tends to report in its place. Their scores are added to the expected
    language's score and their own entries are dropped. For example, Swedish
    text scored as de:63% + sv:12% becomes sv:75% when "sv" merges "de".

    If the expected language has no merge set, *scores* is returned as-is
    (the same object); otherwise a new dict is returned and *scores* is left
    untouched.
    """
    confusable = CONFUSED_MERGES.get(expected_code)
    if not confusable:
        return scores

    absorbed = sum(scores.get(lang, 0.0) for lang in confusable)
    # Overriding the key inside the literal keeps its original position.
    boosted = {**scores, expected_code: scores.get(expected_code, 0.0) + absorbed}
    return {lang: value for lang, value in boosted.items() if lang not in confusable}
|
|
241
|
+
|
|
242
|
+
|
|
243
|
+
def is_wrong_language(
    msgstr: str,
    expected_lang: str,
    confidence_threshold: float = 0.5,
    source_language: str = "en",
    msgid: str = "",
    min_detection_length: int = DEFAULT_MIN_DETECTION_LENGTH,
) -> tuple[bool, str, float]:
    """Check if a translation is in the wrong language.

    Only checks strings >= min_detection_length characters after cleaning.
    Shorter strings are too ambiguous for reliable detection.

    Decision steps, in order:

    1. Too-short text, or no prediction from the model -> not wrong ("unknown").
    2. Top language after confused-language merging equals the expected
       language -> not wrong.
    3. Top language equals the source language -> not wrong.
    4. Raw confidence of the top language below the threshold -> not wrong.
    5. Carrier-phrase second pass (only when a phrase exists for the expected
       language): if prepending the phrase makes the detected language's
       score drop by more than 60% (relative) while the expected language's
       score rises by more than 0.20 (absolute), the first detection is
       treated as a false positive -> not wrong.
    6. Otherwise -> wrong.

    Args:
        msgstr: The translated text to check.
        expected_lang: The locale code this translation should be in.
        confidence_threshold: Minimum confidence to flag a wrong language.
        source_language: The source language of the .po file (default: "en").
            Detections matching the source language are allowed, since borrowed
            words from the source language are common in translations.
        msgid: The source text (currently unused, reserved for future use).
        min_detection_length: Minimum cleaned text length to attempt detection.

    Returns:
        ``(is_wrong, detected_lang, confidence)``. When the detected language
        is the expected one, the confidence is the merged score; in all other
        cases it is the raw fastText score of the detected language.
    """
    cleaned = clean_text(msgstr)
    if len(cleaned) < min_detection_length:
        return (False, "unknown", 0.0)

    expected_code = _normalize_locale(expected_lang)

    # Get top-5 scores and merge confused language scores
    scores = _detect_fasttext(cleaned, k=5)
    if not scores:
        # No prediction (e.g. empty text with min_detection_length <= 0):
        # nothing to judge, and max() below would raise on an empty dict.
        return (False, "unknown", 0.0)

    adjusted = _merge_confused_scores(scores, expected_code)
    detected_lang = max(adjusted, key=adjusted.get)
    confidence = scores.get(detected_lang, adjusted[detected_lang])

    if detected_lang == "unknown":
        return (False, detected_lang, confidence)

    if detected_lang == expected_code:
        return (False, detected_lang, adjusted[detected_lang])

    # Allow source language — borrowed words are common
    source_code = _normalize_locale(source_language)
    if detected_lang == source_code:
        return (False, detected_lang, confidence)

    # Below confidence threshold — not certain enough to flag
    if confidence < confidence_threshold:
        return (False, detected_lang, confidence)

    # Second-pass confirmation with carrier phrase.
    # Re-test with a short phrase in the expected language prepended.
    # Compare how the detected language's confidence changes:
    # - Real contamination holds strong (detected lang barely drops)
    # - False positives crumble (detected lang drops >60%, expected rises >20%)
    carrier = CARRIER_PHRASES.get(expected_code)
    if carrier:
        bare_det_conf = scores.get(detected_lang, 0.0)
        bare_exp_conf = scores.get(expected_code, 0.0)
        boosted_scores = _detect_fasttext(f"{carrier} {cleaned}", k=5)
        # A language missing from the boosted top-5 counts as score 0.
        boosted_det_conf = boosted_scores.get(detected_lang, 0.0)
        boosted_exp_conf = boosted_scores.get(expected_code, 0.0)

        if bare_det_conf > 0:  # guards the relative-drop division
            det_drop = (bare_det_conf - boosted_det_conf) / bare_det_conf
            exp_rise = boosted_exp_conf - bare_exp_conf
            if det_drop > 0.60 and exp_rise > 0.20:
                return (False, detected_lang, confidence)

    return (True, detected_lang, confidence)
|
po_lint/linter.py
ADDED
|
@@ -0,0 +1,277 @@
|
|
|
1
|
+
"""Main linter that ties all checks together and walks locale directories."""
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
from dataclasses import dataclass
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
import polib
|
|
8
|
+
|
|
9
|
+
from po_lint.checks import (
|
|
10
|
+
Issue,
|
|
11
|
+
IssueType,
|
|
12
|
+
Severity,
|
|
13
|
+
check_garbled_text,
|
|
14
|
+
check_shifted_entry,
|
|
15
|
+
check_wrong_script,
|
|
16
|
+
)
|
|
17
|
+
from po_lint.detector import DEFAULT_MIN_DETECTION_LENGTH, is_wrong_language
|
|
18
|
+
|
|
19
|
+
IGNORE_FILE = ".po-lint-ignore"
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
@dataclass
class IgnoreRule:
    """One parsed entry of a ``.po-lint-ignore`` file."""

    # Source string (msgid) the rule applies to.
    msgid: str
    # Message context the rule is limited to; "" matches any context.
    msgctxt: str
    # Locale codes the rule is limited to; an empty set matches all languages.
    languages: set[str]
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def load_ignore_rules(locale_dir: Path) -> list[IgnoreRule]:
    """Load ignore rules from a .po-lint-ignore file in the locale directory.

    Format (one rule per line, blank lines and ``#`` comments are skipped):
        # Comment
        Some msgid                              → ignore for all languages, any context
        [ar,hi] Some msgid                      → ignore only for Arabic and Hindi
        screening status::Some msgid            → ignore with specific msgctxt
        [ar] screening status::Some msgid       → both language scope and context

    Parsing details:

    * The file is always read as UTF-8, so non-ASCII msgids parse the same
      on every platform instead of depending on the locale's default encoding.
    * A line starting with ``[`` but containing no ``]`` is not a language
      scope; the whole line is taken as the msgid instead of raising.
    * Empty language tokens (``[ar,]``, ``[]``) are discarded, so a scope
      with no usable codes behaves like no scope at all.

    Args:
        locale_dir: Directory expected to contain the ignore file.

    Returns:
        The parsed rules in file order; an empty list if the file is missing.
    """
    ignore_file = locale_dir / IGNORE_FILE
    if not ignore_file.exists():
        return []

    rules = []
    for raw_line in ignore_file.read_text(encoding="utf-8").splitlines():
        line = raw_line.strip()
        if not line or line.startswith("#"):
            continue

        languages: set[str] = set()
        # Parse optional language scope: [ar,hi,zh_Hans]
        if line.startswith("["):
            scope, closing, remainder = line[1:].partition("]")
            if closing:
                languages = {lang.strip() for lang in scope.split(",") if lang.strip()}
                line = remainder.strip()

        # Parse optional context: msgctxt::msgid (split on the first "::" only)
        if "::" in line:
            msgctxt, msgid = line.split("::", 1)
        else:
            msgctxt = ""
            msgid = line

        rules.append(IgnoreRule(msgid=msgid, msgctxt=msgctxt, languages=languages))

    return rules
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def _is_ignored(msgid: str, msgctxt: str | None, locale: str, ignore_rules: list[IgnoreRule]) -> bool:
|
|
72
|
+
"""Check if an entry matches any ignore rule."""
|
|
73
|
+
for rule in ignore_rules:
|
|
74
|
+
# Check language scope
|
|
75
|
+
if rule.languages and locale not in rule.languages:
|
|
76
|
+
continue
|
|
77
|
+
# Check msgid
|
|
78
|
+
if rule.msgid != msgid:
|
|
79
|
+
continue
|
|
80
|
+
# Check context
|
|
81
|
+
if rule.msgctxt and rule.msgctxt != (msgctxt or ""):
|
|
82
|
+
continue
|
|
83
|
+
return True
|
|
84
|
+
return False
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def extract_locale_from_path(po_file: Path) -> str | None:
|
|
88
|
+
"""Extract the locale code from a .po file path.
|
|
89
|
+
|
|
90
|
+
Expects paths like: .../locale/<lang>/LC_MESSAGES/django.po
|
|
91
|
+
"""
|
|
92
|
+
parts = po_file.parts
|
|
93
|
+
for i, part in enumerate(parts):
|
|
94
|
+
if part == "LC_MESSAGES" and i >= 1:
|
|
95
|
+
return parts[i - 1]
|
|
96
|
+
if part == "locale" and i + 1 < len(parts):
|
|
97
|
+
return parts[i + 1]
|
|
98
|
+
return None
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
def _find_locale_root(po_file: Path) -> Path | None:
|
|
102
|
+
"""Find the locale/ directory that contains this .po file."""
|
|
103
|
+
for parent in po_file.parents:
|
|
104
|
+
if parent.name == "locale":
|
|
105
|
+
return parent
|
|
106
|
+
return None
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
def lint_po_file(
    po_file: Path,
    locale: str | None = None,
    source_language: str = "en",
    confidence_threshold: float = 0.5,
    min_text_length: int = 3,
    min_detection_length: int = DEFAULT_MIN_DETECTION_LENGTH,
    ignore_patterns: list[str] | None = None,
    ignore_rules: list[IgnoreRule] | None = None,
) -> list[Issue]:
    """Lint a single .po file and return all issues found.

    Per translated entry the checks run in this order: wrong script, garbled
    text, shifted entry, wrong language. A wrong-script or garbled-text hit
    ends the checks for that entry; a shifted-entry hit does not, so the same
    entry may also receive a wrong-language issue.

    Entries are skipped when the msgstr is empty or shorter than
    *min_text_length*, when an ignore rule or ignore pattern matches, when
    the msgstr is mostly placeholders/markup, or when it equals the msgid.

    Args:
        po_file: Path of the .po file.
        locale: Locale the file should be in; derived from the path when omitted.
        source_language: Source language of the catalog (default: "en").
        confidence_threshold: Minimum confidence to flag a wrong language.
        min_text_length: Minimum stripped msgstr length to check at all.
        min_detection_length: Minimum cleaned length for language detection.
        ignore_patterns: Regex patterns; an entry is skipped if any of them
            matches its msgid or msgstr.
        ignore_rules: Parsed ``.po-lint-ignore`` rules.

    Returns:
        The issues found. An empty list if no locale could be determined.
        If the file cannot be read, parsed or decoded, a single error issue
        describing the failure is returned instead of raising.

    Raises:
        re.error: If an entry of *ignore_patterns* is not a valid regex.
    """
    if locale is None:
        locale = extract_locale_from_path(po_file)
    if locale is None:
        return []

    compiled_ignores = [re.compile(p) for p in (ignore_patterns or [])]
    ignore_rules = ignore_rules or []

    try:
        catalog = polib.pofile(str(po_file))
    except (OSError, SyntaxError, UnicodeDecodeError) as e:
        # UnicodeDecodeError: the file's bytes do not match its encoding.
        # A linter for corrupted catalogs should report that, not crash on it.
        return [
            Issue(
                file=str(po_file),
                line=0,
                msgid="",
                msgstr="",
                issue_type=IssueType.GARBLED_TEXT,
                severity=Severity.ERROR,
                message=f"Failed to parse .po file: {e}",
            )
        ]

    issues = []

    # NOTE(review): only entry.msgstr is inspected. Plural entries presumably
    # keep their translations in msgstr_plural and so look skipped by the
    # emptiness check below — confirm whether that is intended.
    for entry in catalog.translated_entries():
        msgid = entry.msgid
        msgstr = entry.msgstr

        if not msgstr or len(msgstr.strip()) < min_text_length:
            continue

        # Skip entries in the ignore file
        if _is_ignored(msgid, entry.msgctxt, locale, ignore_rules):
            continue

        # Skip entries matching ignore patterns
        if any(p.search(msgid) or p.search(msgstr) for p in compiled_ignores):
            continue

        # Skip entries that are mostly format strings / placeholders / URLs
        if _is_non_linguistic(msgstr):
            continue

        # Skip entries where the translation is identical to the source
        # (intentionally untranslated — common for brand names, acronyms, technical terms)
        if msgid == msgstr:
            continue

        # 1. Wrong script check (fast, no model needed)
        issue = check_wrong_script(msgstr, locale)
        if issue:
            issue.file = str(po_file)
            issue.line = entry.linenum
            issue.msgid = msgid
            issues.append(issue)
            continue  # If wrong script, skip language detection (it would also flag)

        # 2. Garbled text check
        issue = check_garbled_text(msgstr)
        if issue:
            issue.file = str(po_file)
            issue.line = entry.linenum
            issue.msgid = msgid
            issues.append(issue)
            continue

        # 3. Shifted entry check (no `continue`: language detection still runs)
        issue = check_shifted_entry(msgid, msgstr)
        if issue:
            issue.file = str(po_file)
            issue.line = entry.linenum
            issues.append(issue)

        # 4. Wrong language check (uses fastText)
        is_wrong, detected_lang, confidence = is_wrong_language(
            msgstr, locale, confidence_threshold, source_language, msgid=msgid,
            min_detection_length=min_detection_length,
        )
        if is_wrong:
            issues.append(
                Issue(
                    file=str(po_file),
                    line=entry.linenum,
                    msgid=msgid,
                    msgstr=msgstr,
                    issue_type=IssueType.WRONG_LANGUAGE,
                    severity=Severity.ERROR,
                    message=f"Expected {locale}, detected {detected_lang} (confidence: {confidence:.0%})",
                    detected_lang=detected_lang,
                    confidence=confidence,
                )
            )

    return issues
|
|
215
|
+
|
|
216
|
+
|
|
217
|
+
def _is_non_linguistic(text: str) -> bool:
|
|
218
|
+
"""Check if text is mostly non-linguistic (URLs, format strings, numbers, etc.)."""
|
|
219
|
+
cleaned = text
|
|
220
|
+
# Strip Django/Python format strings
|
|
221
|
+
cleaned = re.sub(r"%\([^)]+\)[sd]", "", cleaned)
|
|
222
|
+
cleaned = re.sub(r"%[sd]", "", cleaned)
|
|
223
|
+
cleaned = re.sub(r"\{[^}]*\}", "", cleaned)
|
|
224
|
+
# Strip HTML tags
|
|
225
|
+
cleaned = re.sub(r"<[^>]+>", "", cleaned)
|
|
226
|
+
# Strip URLs
|
|
227
|
+
cleaned = re.sub(r"https?://\S+", "", cleaned)
|
|
228
|
+
# Strip numbers and punctuation
|
|
229
|
+
cleaned = re.sub(r"[0-9.,;:!?/\\@#$%^&*()_+=\[\]{}<>\"'\-\s]", "", cleaned)
|
|
230
|
+
# If very little text remains, it's non-linguistic
|
|
231
|
+
return len(cleaned) < 3
|
|
232
|
+
|
|
233
|
+
|
|
234
|
+
def lint_locale_dir(
    locale_dir: Path,
    languages: list[str] | None = None,
    source_language: str = "en",
    confidence_threshold: float = 0.5,
    min_text_length: int = 3,
    min_detection_length: int = DEFAULT_MIN_DETECTION_LENGTH,
    ignore_patterns: list[str] | None = None,
) -> list[Issue]:
    """Lint every .po file found below a locale directory.

    Rules from a .po-lint-ignore file in *locale_dir* are loaded once and
    applied to all files. Files are visited in sorted path order; files whose
    locale cannot be derived from their path are skipped.

    Args:
        locale_dir: Path to a locale/ directory containing <lang>/LC_MESSAGES/*.po
        languages: If set, only lint these language codes. If empty or None, lint all.
        source_language: The source language of the .po files (default: "en").
        confidence_threshold: Minimum confidence to flag a wrong language.
        min_text_length: Minimum msgstr length to check.
        min_detection_length: Minimum cleaned text length for language detection.
        ignore_patterns: Regex patterns for msgid/msgstr to skip.

    Returns:
        The issues of all linted files, concatenated in visiting order.
    """
    rules = load_ignore_rules(locale_dir)
    collected = []

    for path in sorted(locale_dir.rglob("*.po")):
        lang = extract_locale_from_path(path)
        wanted = lang is not None and (not languages or lang in languages)
        if not wanted:
            continue

        collected += lint_po_file(
            path,
            locale=lang,
            source_language=source_language,
            confidence_threshold=confidence_threshold,
            min_text_length=min_text_length,
            min_detection_length=min_detection_length,
            ignore_patterns=ignore_patterns,
            ignore_rules=rules,
        )

    return collected
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: python-po-lint
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Lint .po translation files for contamination, wrong languages, shifts, and garbled text
|
|
5
|
+
Author: PesCheck
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
License-File: LICENSE
|
|
8
|
+
Requires-Python: >=3.10
|
|
9
|
+
Requires-Dist: fasttext-wheel>=0.9.2
|
|
10
|
+
Requires-Dist: numpy<2
|
|
11
|
+
Requires-Dist: polib>=1.2
|
|
12
|
+
Description-Content-Type: text/markdown
|
|
13
|
+
|
|
14
|
+
# python-po-lint
|
|
15
|
+
|
|
16
|
+
Lint `.po` translation files for contamination, wrong languages, shifts, and garbled text.
|
|
17
|
+
|
|
18
|
+
## Features
|
|
19
|
+
|
|
20
|
+
- **Wrong script detection** — catches Cyrillic in a Dutch file, Arabic in French, etc.
|
|
21
|
+
- **Wrong language detection** — fastText language identification with confused-language score merging and a carrier-phrase second pass (strings too short for reliable detection are skipped)
|
|
22
|
+
- **Shifted entry detection** — finds translations that got shifted to the wrong msgid
|
|
23
|
+
- **Garbled text detection** — catches corrupted/broken unicode
|
|
24
|
+
|
|
25
|
+
## Installation
|
|
26
|
+
|
|
27
|
+
```bash
|
|
28
|
+
pip install python-po-lint
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
## Usage
|
|
32
|
+
|
|
33
|
+
```bash
|
|
34
|
+
# Lint a locale directory
|
|
35
|
+
po-lint locale/
|
|
36
|
+
|
|
37
|
+
# Lint with config from pyproject.toml
|
|
38
|
+
po-lint
|
|
39
|
+
|
|
40
|
+
# Only check specific languages
|
|
41
|
+
po-lint locale/ --languages fr de nl
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
## Configuration
|
|
45
|
+
|
|
46
|
+
Add to your `pyproject.toml`:
|
|
47
|
+
|
|
48
|
+
```toml
|
|
49
|
+
[tool.po-lint]
|
|
50
|
+
# Explicit locale directories (relative to project root)
|
|
51
|
+
paths = ["locale"]
|
|
52
|
+
|
|
53
|
+
# Auto-discover locale dirs from installed Python packages
|
|
54
|
+
packages = ["myapp", "myotherapp"]
|
|
55
|
+
|
|
56
|
+
# Only check these languages (empty = all)
|
|
57
|
+
languages = []
|
|
58
|
+
|
|
59
|
+
# Minimum confidence to flag wrong language (0.0 - 1.0)
|
|
60
|
+
confidence_threshold = 0.5
|
|
61
|
+
|
|
62
|
+
# Skip entries with msgstr shorter than this
|
|
63
|
+
min_text_length = 3
|
|
64
|
+
|
|
65
|
+
# Regex patterns to ignore (matched against msgid and msgstr)
|
|
66
|
+
ignore_patterns = []
|
|
67
|
+
```
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
po_lint/__init__.py,sha256=2VjrIp2SLGcJKCEhgQ_Ss706P6o6RD86B9-wSPKJ-94,134
|
|
2
|
+
po_lint/checks.py,sha256=cpySTm9QJzcy31rjOKgBsNeXeeHFmzfdCHjhUa2nHLQ,10556
|
|
3
|
+
po_lint/cli.py,sha256=3bc5_ZaZ4-WK29JsyqwgI677p4F6WxpaAnR0_-_9g2Y,6866
|
|
4
|
+
po_lint/config.py,sha256=Ya3Q78_LZ6x4RrmWpexH4qFrHgKBxyUlX103XOXSQSI,3053
|
|
5
|
+
po_lint/detector.py,sha256=fA1tDP3Lf05bFLh-ybvpAgHTAJPC7cut8dpZc7REssg,11735
|
|
6
|
+
po_lint/linter.py,sha256=inT8gSP75nosm9S_uCEKURo4MICVUHqdIFmElQhLAgg,9038
|
|
7
|
+
python_po_lint-0.1.0.dist-info/METADATA,sha256=0cw6AlhCs3UBVJRrkV3YYgAfz79NvmDnoOCctxMezyk,1638
|
|
8
|
+
python_po_lint-0.1.0.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
|
|
9
|
+
python_po_lint-0.1.0.dist-info/entry_points.txt,sha256=W9_2iuo5yQtYgY5U8qVH7h6Xq8XSj0KsF71TR_9_xDE,45
|
|
10
|
+
python_po_lint-0.1.0.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
|
11
|
+
python_po_lint-0.1.0.dist-info/RECORD,,
|