python-po-lint 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
po_lint/detector.py ADDED
@@ -0,0 +1,314 @@
1
+ """Language detection using fastText for .po file linting."""
2
+
3
+ import os
4
+ import re
5
+ import urllib.request
6
+ from pathlib import Path
7
+
8
+ import fasttext
9
+
10
# fastText writes a warning to stderr when the input contains "\n".
# Swap its printer for a no-op so linting output stays clean.
def _discard_fasttext_message(x):
    """Accept and drop a message that fastText would otherwise print."""
    return None


fasttext.FastText.eprint = _discard_fasttext_message
12
+
13
# Language detection is only attempted on cleaned text of at least this many
# characters. Below roughly 30 characters, loan words, cognates, and brand
# names make the result unreliable.
DEFAULT_MIN_DETECTION_LENGTH = 30

# Download locations of the two fastText language-identification models.
FULL_MODEL_URL = "https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin"
COMPACT_MODEL_URL = "https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.ftz"

# Directory where downloaded models are cached. PO_LINT_MODEL_DIR overrides
# the per-user default under ~/.cache/po-lint.
_DEFAULT_MODEL_DIR = Path.home() / ".cache" / "po-lint"
MODEL_DIR = Path(os.environ.get("PO_LINT_MODEL_DIR", _DEFAULT_MODEL_DIR))
21
+
22
# Locale directory names that differ from the ISO 639-1 code fastText reports.
# Only the edge cases are listed; most locale codes can be used unchanged.
LOCALE_ALIASES = {
    **dict.fromkeys(("zh_Hans", "zh_Hant", "zh_CN", "zh_TW"), "zh"),
    # Norwegian Bokmål and Nynorsk: fastText uses "no" for both.
    **dict.fromkeys(("nb", "nn"), "no"),
    **dict.fromkeys(("pt_BR", "pt_PT"), "pt"),
    **dict.fromkeys(("es_AR", "es_MX"), "es"),
    **dict.fromkeys(("en_US", "en_GB"), "en"),
    **dict.fromkeys(("fr_CA", "fr_FR"), "fr"),
    **dict.fromkeys(("sr_Latn", "sr_Cyrl"), "sr"),
}
42
+
43
+ # Carrier phrases per language — used for second-pass confirmation.
44
+ # When fastText flags a wrong language, we re-test with a carrier phrase
45
+ # prepended. If the originally detected language drops significantly (>60%, relative)
46
+ # and the expected language rises significantly (>0.20, absolute), the original detection
47
+ # was likely a false positive on ambiguous text.
48
+ # Uses short "context" style phrases ("In X one says") to give just enough
49
+ # signal without overpowering real contamination in related languages.
50
+ CARRIER_PHRASES = {
51
+ "af": "In Afrikaans sê mens",
52
+ "ar": "بالعربية يقال",
53
+ "bg": "На български се казва",
54
+ "bn": "বাংলায় বলা হয়",
55
+ "bs": "Na bosanskom se kaže",
56
+ "ca": "En català es diu",
57
+ "cs": "Česky se říká",
58
+ "da": "På dansk siger man",
59
+ "de": "Auf Deutsch sagt man",
60
+ "el": "Στα ελληνικά λέμε",
61
+ "en": "In English one says",
62
+ "es": "En español se dice",
63
+ "et": "Eesti keeles öeldakse",
64
+ "fa": "به فارسی می‌گویند",
65
+ "fi": "Suomeksi sanotaan",
66
+ "fr": "En français on dit",
67
+ "he": "בעברית אומרים",
68
+ "hi": "हिंदी में कहते हैं",
69
+ "hr": "Na hrvatskom se kaže",
70
+ "hu": "Magyarul azt mondják",
71
+ "id": "Dalam bahasa Indonesia dikatakan",
72
+ "it": "In italiano si dice",
73
+ "ja": "日本語では",
74
+ "ko": "한국어로는",
75
+ "lt": "Lietuviškai sakoma",
76
+ "lv": "Latviski saka",
77
+ "mk": "На македонски се вели",
78
+ "ms": "Dalam bahasa Melayu dikatakan",
79
+ "nl": "In het Nederlands zegt men",
80
+ "no": "På norsk sier man",
81
+ "pl": "Po polsku mówi się",
82
+ "pt": "Em português diz-se",
83
+ "ro": "În română se spune",
84
+ "ru": "По-русски говорят",
85
+ "sk": "Po slovensky sa hovorí",
86
+ "sl": "V slovenščini se reče",
87
+ "sr": "На српском се каже",
88
+ "sv": "På svenska säger man",
89
+ "sw": "Kwa Kiswahili tunasema",
90
+ "th": "ในภาษาไทยพูดว่า",
91
+ "tr": "Türkçede denir ki",
92
+ "uk": "Українською кажуть",
93
+ "vi": "Trong tiếng Việt người ta nói",
94
+ "zh": "用中文来说",
95
+ }
96
+
97
def _confusable_group(*codes: str, also: tuple[str, ...] = ()) -> dict[str, set[str]]:
    """Map each code to every other code in the group, plus the one-way ``also`` codes."""
    return {
        code: {other for other in codes if other != code} | set(also)
        for code in codes
    }


# Confused language merges. When fastText reports a language that is commonly
# mistaken for the expected one, that language's score is added to the expected
# language's score. The relation is directional: Swedish text can be mistaken
# for German (so "sv" merges "de"), but German text is rarely mistaken for
# Swedish (so "de" has no entry merging "sv").
# Redistributing scores, rather than skipping these pairs outright, means a
# genuinely wrong translation detected at very high confidence is still caught.
CONFUSED_MERGES: dict[str, set[str]] = {
    # Scandinavian languages: very similar vocabulary and grammar.
    # German is merged into each of them, one way only.
    **_confusable_group("no", "da", "sv", "nb", "nn", also=("de",)),
    # Romance languages
    **_confusable_group("pt", "es", "gl"),
    # Germanic
    **_confusable_group("nl", "af"),
    # Turkic languages
    **_confusable_group("tr", "az"),
    # Cyrillic languages: shared script and vocabulary
    **_confusable_group("uk", "ru"),
    **_confusable_group("bg", "mk"),
    # Indic languages: shared Devanagari script
    **_confusable_group("hi", "mr"),
    # Arabic script languages: shared script and vocabulary roots
    **_confusable_group("ar", "fa", "ur"),
}
133
+
134
+
135
def _normalize_locale(locale: str) -> str:
    """Normalize a locale directory name to a fastText-compatible ISO code.

    Resolution order:

    1. An exact entry in ``LOCALE_ALIASES`` (e.g. ``zh_Hans`` -> ``zh``).
    2. Otherwise the base language before the first ``_``, ``-``, ``@`` or
       ``.`` separator, lowercased and itself resolved through
       ``LOCALE_ALIASES`` (e.g. ``de_DE`` -> ``de``, ``nb_NO`` -> ``no``).

    Without step 2, a region-qualified locale missing from the alias table
    would never equal the bare language code that fastText reports, so every
    entry in that locale would look like the wrong language.
    """
    alias = LOCALE_ALIASES.get(locale)
    if alias is not None:
        return alias

    base = re.split(r"[_\-@.]", locale, maxsplit=1)[0].lower()
    if not base:
        # Nothing usable before the separator; hand the input back untouched.
        return locale
    return LOCALE_ALIASES.get(base, base)
138
+
139
+
140
+ def _use_compact_model() -> bool:
141
+ """Check if compact model is requested via environment variable."""
142
+ return os.environ.get("PO_LINT_COMPACT_MODEL", "").lower() in ("1", "true", "yes")
143
+
144
+
145
def ensure_model(compact: bool = False) -> Path:
    """Download the fastText language ID model if not already cached.

    Args:
        compact: Use the compact ``lid.176.ftz`` model instead of the full
            ``lid.176.bin``. The PO_LINT_COMPACT_MODEL environment variable
            can also select the compact model.

    Returns:
        Path of the cached model file inside ``MODEL_DIR``.

    The download goes to a temporary sibling file that is renamed into place
    only once it has finished. An interrupted download therefore never leaves
    a truncated file at the final path, where a later call would mistake it
    for a valid cached model.
    """
    if compact or _use_compact_model():
        url = COMPACT_MODEL_URL
        path = MODEL_DIR / "lid.176.ftz"
    else:
        url = FULL_MODEL_URL
        path = MODEL_DIR / "lid.176.bin"

    if path.exists():
        return path

    MODEL_DIR.mkdir(parents=True, exist_ok=True)
    model_type = "compact" if "ftz" in path.name else "full"
    print(f"Downloading fastText language model ({model_type}) to {path}...")

    # The PID suffix keeps concurrent processes from sharing a partial file.
    partial = path.with_name(f"{path.name}.{os.getpid()}.part")
    try:
        urllib.request.urlretrieve(url, partial)
        os.replace(partial, path)
    finally:
        # No-op after a successful rename; removes leftovers after a failure.
        partial.unlink(missing_ok=True)
    return path
161
+
162
+
163
# Module-wide fastText model instance; stays None until a model is loaded.
_ft_model = None


def init_model(compact: bool = False) -> None:
    """Load the fastText model into the module-level singleton.

    Call this before linting to choose the model variant; ``compact`` is
    forwarded to ``ensure_model``, which downloads the file if needed.
    """
    global _ft_model
    _ft_model = fasttext.load_model(str(ensure_model(compact)))
171
+
172
+
173
def get_ft_model() -> fasttext.FastText._FastText:
    """Return the shared fastText model, loading the default variant on first use."""
    if _ft_model is not None:
        return _ft_model
    init_model()
    return _ft_model
179
+
180
+
181
def clean_text(text: str) -> str:
    """Remove markup and placeholders so only natural-language text remains.

    HTML tags, ``{% ... %}`` / ``{{ ... }}`` template tags, ``%(name)s`` /
    ``%(name)d`` format strings and http(s) URLs are each replaced by a space.
    Runs of whitespace are then collapsed and the result is trimmed.
    """
    noise_patterns = (
        r"<[^>]+>",            # HTML tags
        r"\{[%{].*?[%}]\}",    # template tags
        r"%\([^)]+\)[sd]",     # named format strings
        r"https?://\S+",       # URLs
    )
    for pattern in noise_patterns:
        text = re.sub(pattern, " ", text)
    return re.sub(r"\s+", " ", text).strip()
189
+
190
+
191
def detect_language(text: str, min_detection_length: int = DEFAULT_MIN_DETECTION_LENGTH) -> tuple[str, float]:
    """Detect the language of ``text`` with fastText.

    The text is cleaned first. If fewer than ``min_detection_length``
    characters remain, no detection is attempted and ("unknown", 0.0) is
    returned. Otherwise the result is (lang_code, confidence).
    """
    cleaned = clean_text(text)
    long_enough = len(cleaned) >= min_detection_length
    return _detect_fasttext(cleaned) if long_enough else ("unknown", 0.0)
202
+
203
+
204
def _detect_fasttext(text: str, k: int = 1) -> tuple[str, float] | dict[str, float]:
    """Run the fastText model on ``text``.

    Newlines are replaced by spaces before prediction.

    Returns:
        For ``k == 1``, a ``(lang, confidence)`` tuple for the top prediction.
        For ``k > 1``, a ``{lang: confidence}`` dict of the top-k predictions.
    """
    predictions = get_ft_model().predict(text.replace("\n", " "), k=k)
    labels = predictions[0]
    confidences = predictions[1]

    if k != 1:
        return {
            raw_label.replace("__label__", ""): score
            for raw_label, score in zip(labels, confidences)
        }

    top_label = labels[0].replace("__label__", "")
    return (top_label, confidences[0])
220
+
221
+
222
def _merge_confused_scores(
    scores: dict[str, float], expected_code: str,
) -> dict[str, float]:
    """Fold the scores of commonly confused languages into the expected language.

    fastText may split its confidence between the expected language and
    languages listed for it in ``CONFUSED_MERGES``. Those scores are added to
    the expected language and the confused languages are dropped. For example,
    Swedish text scored de:63% + sv:12% becomes sv:75% when "sv" merges "de".

    ``scores`` itself is returned, unmodified, when the expected language has
    no merge set. Otherwise a new dict is returned.
    """
    confusable = CONFUSED_MERGES.get(expected_code)
    if not confusable:
        return scores

    bonus = sum(scores.get(lang, 0.0) for lang in confusable)
    merged = dict(scores)
    merged[expected_code] = merged.get(expected_code, 0.0) + bonus
    return {lang: conf for lang, conf in merged.items() if lang not in confusable}
241
+
242
+
243
def is_wrong_language(
    msgstr: str,
    expected_lang: str,
    confidence_threshold: float = 0.5,
    source_language: str = "en",
    msgid: str = "",
    min_detection_length: int = DEFAULT_MIN_DETECTION_LENGTH,
) -> tuple[bool, str, float]:
    """Check if a translation is in the wrong language.

    Only checks strings >= min_detection_length characters after cleaning.
    Shorter strings are too ambiguous for reliable detection.

    Args:
        msgstr: The translated text to check.
        expected_lang: The locale code this translation should be in.
        confidence_threshold: Minimum confidence to flag a wrong language.
        source_language: The source language of the .po file (default: "en").
            Detections matching the source language are allowed, since borrowed
            words from the source language are common in translations.
        msgid: The source text (currently unused, reserved for future use).
        min_detection_length: Minimum cleaned text length to attempt detection.

    Returns:
        (is_wrong, detected_lang, confidence). When nothing could be detected,
        the result is (False, "unknown", 0.0).
    """
    cleaned = clean_text(msgstr)
    if len(cleaned) < min_detection_length:
        return (False, "unknown", 0.0)

    expected_code = _normalize_locale(expected_lang)

    # Get top-5 scores and merge confused language scores
    scores = _detect_fasttext(cleaned, k=5)
    if not scores:
        # No predictions came back, which can happen for empty text when
        # min_detection_length <= 0. Without this guard max() below would
        # raise ValueError on the empty dict.
        return (False, "unknown", 0.0)

    adjusted = _merge_confused_scores(scores, expected_code)
    detected_lang = max(adjusted, key=adjusted.get)
    # Report the raw model score when there is one. The merged score is only
    # used for a language that exists solely in the adjusted dict.
    confidence = scores.get(detected_lang, adjusted[detected_lang])

    if detected_lang == "unknown":
        return (False, detected_lang, confidence)

    if detected_lang == expected_code:
        return (False, detected_lang, adjusted[detected_lang])

    # Allow source language — borrowed words are common
    source_code = _normalize_locale(source_language)
    if detected_lang == source_code:
        return (False, detected_lang, confidence)

    # Below confidence threshold — not certain enough to flag
    if confidence < confidence_threshold:
        return (False, detected_lang, confidence)

    # Second-pass confirmation with carrier phrase.
    # Re-test with a short phrase in the expected language prepended.
    # Compare how the detected language's confidence changes:
    #   - Real contamination holds strong (detected lang barely drops)
    #   - False positives crumble: the detected language loses more than 60%
    #     of its bare score and the expected language gains more than 0.20
    carrier = CARRIER_PHRASES.get(expected_code)
    if carrier:
        bare_det_conf = scores.get(detected_lang, 0.0)
        bare_exp_conf = scores.get(expected_code, 0.0)
        boosted_scores = _detect_fasttext(f"{carrier} {cleaned}", k=5)
        boosted_det_conf = boosted_scores.get(detected_lang, 0.0)
        boosted_exp_conf = boosted_scores.get(expected_code, 0.0)

        if bare_det_conf > 0:
            det_drop = (bare_det_conf - boosted_det_conf) / bare_det_conf
            exp_rise = boosted_exp_conf - bare_exp_conf
            if det_drop > 0.60 and exp_rise > 0.20:
                return (False, detected_lang, confidence)

    return (True, detected_lang, confidence)
po_lint/linter.py ADDED
@@ -0,0 +1,277 @@
1
+ """Main linter that ties all checks together and walks locale directories."""
2
+
3
+ import re
4
+ from dataclasses import dataclass
5
+ from pathlib import Path
6
+
7
+ import polib
8
+
9
+ from po_lint.checks import (
10
+ Issue,
11
+ IssueType,
12
+ Severity,
13
+ check_garbled_text,
14
+ check_shifted_entry,
15
+ check_wrong_script,
16
+ )
17
+ from po_lint.detector import DEFAULT_MIN_DETECTION_LENGTH, is_wrong_language
18
+
19
# Name of the per-locale-directory file that lists entries to skip.
IGNORE_FILE = ".po-lint-ignore"


@dataclass
class IgnoreRule:
    """One parsed line of a .po-lint-ignore file."""

    # Source string the rule applies to.
    msgid: str
    # Required message context; an empty string matches any context.
    msgctxt: str
    # Locales the rule is limited to; an empty set matches all languages.
    languages: set[str]
29
+
30
+
31
def load_ignore_rules(locale_dir: Path) -> list[IgnoreRule]:
    """Load ignore rules from a .po-lint-ignore file in the locale directory.

    Format:
        # Comment
        Some msgid                          → ignore for all languages, any context
        [ar,hi] Some msgid                  → ignore only for Arabic and Hindi
        screening status::Some msgid        → ignore with specific msgctxt
        [ar] screening status::Some msgid   → both language scope and context

    The file is always read as UTF-8, so non-ASCII msgids parse the same way
    on every platform. A line that starts with "[" but has no closing "]" is
    treated as a plain msgid rather than raising an error.

    Returns:
        The parsed rules in file order, or an empty list when the file does
        not exist.
    """
    ignore_file = locale_dir / IGNORE_FILE
    if not ignore_file.exists():
        return []

    rules = []
    for raw_line in ignore_file.read_text(encoding="utf-8").splitlines():
        line = raw_line.strip()
        if not line or line.startswith("#"):
            continue

        languages: set[str] = set()
        # Parse optional language scope: [ar,hi,zh_Hans]
        if line.startswith("["):
            scope, closing, remainder = line[1:].partition("]")
            if closing:
                languages = {lang.strip() for lang in scope.split(",")}
                line = remainder.strip()
            # Without a closing bracket the line is kept whole as the msgid.

        # Parse optional context: msgctxt::msgid
        if "::" in line:
            msgctxt, msgid = line.split("::", 1)
        else:
            msgctxt = ""
            msgid = line

        rules.append(IgnoreRule(msgid=msgid, msgctxt=msgctxt, languages=languages))

    return rules
69
+
70
+
71
+ def _is_ignored(msgid: str, msgctxt: str | None, locale: str, ignore_rules: list[IgnoreRule]) -> bool:
72
+ """Check if an entry matches any ignore rule."""
73
+ for rule in ignore_rules:
74
+ # Check language scope
75
+ if rule.languages and locale not in rule.languages:
76
+ continue
77
+ # Check msgid
78
+ if rule.msgid != msgid:
79
+ continue
80
+ # Check context
81
+ if rule.msgctxt and rule.msgctxt != (msgctxt or ""):
82
+ continue
83
+ return True
84
+ return False
85
+
86
+
87
+ def extract_locale_from_path(po_file: Path) -> str | None:
88
+ """Extract the locale code from a .po file path.
89
+
90
+ Expects paths like: .../locale/<lang>/LC_MESSAGES/django.po
91
+ """
92
+ parts = po_file.parts
93
+ for i, part in enumerate(parts):
94
+ if part == "LC_MESSAGES" and i >= 1:
95
+ return parts[i - 1]
96
+ if part == "locale" and i + 1 < len(parts):
97
+ return parts[i + 1]
98
+ return None
99
+
100
+
101
+ def _find_locale_root(po_file: Path) -> Path | None:
102
+ """Find the locale/ directory that contains this .po file."""
103
+ for parent in po_file.parents:
104
+ if parent.name == "locale":
105
+ return parent
106
+ return None
107
+
108
+
109
def lint_po_file(
    po_file: Path,
    locale: str | None = None,
    source_language: str = "en",
    confidence_threshold: float = 0.5,
    min_text_length: int = 3,
    min_detection_length: int = DEFAULT_MIN_DETECTION_LENGTH,
    ignore_patterns: list[str] | None = None,
    ignore_rules: list[IgnoreRule] | None = None,
) -> list[Issue]:
    """Lint a single .po file and return all issues found.

    Args:
        po_file: Path of the .po file to lint.
        locale: Locale the file should be in. Derived from the path when
            omitted; if it cannot be derived, no issues are returned.
        source_language: Source language of the catalog (default: "en").
        confidence_threshold: Minimum confidence to flag a wrong language.
        min_text_length: Entries whose stripped msgstr is shorter are skipped.
        min_detection_length: Minimum cleaned text length to attempt
            language detection.
        ignore_patterns: Regex patterns; an entry is skipped when any of them
            matches its msgid or msgstr.
        ignore_rules: Parsed .po-lint-ignore rules.

    Returns:
        The issues found. A file that cannot be read, parsed, or decoded
        yields a single error issue instead of raising.
    """
    if locale is None:
        locale = extract_locale_from_path(po_file)
    if locale is None:
        return []

    compiled_ignores = [re.compile(p) for p in (ignore_patterns or [])]
    ignore_rules = ignore_rules or []

    try:
        catalog = polib.pofile(str(po_file))
    except (OSError, SyntaxError, UnicodeDecodeError) as e:
        # UnicodeDecodeError is a ValueError, not an OSError. Catching it here
        # keeps one badly encoded file from aborting the whole lint run.
        return [
            Issue(
                file=str(po_file),
                line=0,
                msgid="",
                msgstr="",
                issue_type=IssueType.GARBLED_TEXT,
                severity=Severity.ERROR,
                message=f"Failed to parse .po file: {e}",
            )
        ]

    issues = []

    for entry in catalog.translated_entries():
        msgid = entry.msgid
        msgstr = entry.msgstr

        if not msgstr or len(msgstr.strip()) < min_text_length:
            continue

        # Skip entries in the ignore file
        if _is_ignored(msgid, entry.msgctxt, locale, ignore_rules):
            continue

        # Skip entries matching ignore patterns
        if any(p.search(msgid) or p.search(msgstr) for p in compiled_ignores):
            continue

        # Skip entries that are mostly format strings / placeholders / URLs
        if _is_non_linguistic(msgstr):
            continue

        # Skip entries where the translation is identical to the source
        # (intentionally untranslated — common for brand names, acronyms, technical terms)
        if msgid == msgstr:
            continue

        # 1. Wrong script check (fast, no model needed)
        issue = check_wrong_script(msgstr, locale)
        if issue:
            issue.file = str(po_file)
            issue.line = entry.linenum
            issue.msgid = msgid
            issues.append(issue)
            continue  # If wrong script, skip language detection (it would also flag)

        # 2. Garbled text check
        issue = check_garbled_text(msgstr)
        if issue:
            issue.file = str(po_file)
            issue.line = entry.linenum
            issue.msgid = msgid
            issues.append(issue)
            continue

        # 3. Shifted entry check. There is no `continue` here, so the language
        # check below still runs for shifted entries.
        # NOTE(review): msgid is not assigned here; presumably
        # check_shifted_entry fills it in from its msgid argument — confirm.
        issue = check_shifted_entry(msgid, msgstr)
        if issue:
            issue.file = str(po_file)
            issue.line = entry.linenum
            issues.append(issue)

        # 4. Wrong language check (uses fastText)
        is_wrong, detected_lang, confidence = is_wrong_language(
            msgstr, locale, confidence_threshold, source_language, msgid=msgid,
            min_detection_length=min_detection_length,
        )
        if is_wrong:
            issues.append(
                Issue(
                    file=str(po_file),
                    line=entry.linenum,
                    msgid=msgid,
                    msgstr=msgstr,
                    issue_type=IssueType.WRONG_LANGUAGE,
                    severity=Severity.ERROR,
                    message=f"Expected {locale}, detected {detected_lang} (confidence: {confidence:.0%})",
                    detected_lang=detected_lang,
                    confidence=confidence,
                )
            )

    return issues
215
+
216
+
217
+ def _is_non_linguistic(text: str) -> bool:
218
+ """Check if text is mostly non-linguistic (URLs, format strings, numbers, etc.)."""
219
+ cleaned = text
220
+ # Strip Django/Python format strings
221
+ cleaned = re.sub(r"%\([^)]+\)[sd]", "", cleaned)
222
+ cleaned = re.sub(r"%[sd]", "", cleaned)
223
+ cleaned = re.sub(r"\{[^}]*\}", "", cleaned)
224
+ # Strip HTML tags
225
+ cleaned = re.sub(r"<[^>]+>", "", cleaned)
226
+ # Strip URLs
227
+ cleaned = re.sub(r"https?://\S+", "", cleaned)
228
+ # Strip numbers and punctuation
229
+ cleaned = re.sub(r"[0-9.,;:!?/\\@#$%^&*()_+=\[\]{}<>\"'\-\s]", "", cleaned)
230
+ # If very little text remains, it's non-linguistic
231
+ return len(cleaned) < 3
232
+
233
+
234
def lint_locale_dir(
    locale_dir: Path,
    languages: list[str] | None = None,
    source_language: str = "en",
    confidence_threshold: float = 0.5,
    min_text_length: int = 3,
    min_detection_length: int = DEFAULT_MIN_DETECTION_LENGTH,
    ignore_patterns: list[str] | None = None,
) -> list[Issue]:
    """Lint every .po file found under a locale directory.

    Rules from a .po-lint-ignore file in ``locale_dir`` are applied when the
    file exists. Files are visited in sorted path order.

    Args:
        locale_dir: Path to a locale/ directory containing <lang>/LC_MESSAGES/*.po
        languages: If set, only lint these language codes. If empty, lint all.
        source_language: The source language of the .po files (default: "en").
        confidence_threshold: Minimum confidence to flag a wrong language.
        min_text_length: Minimum msgstr length to check.
        min_detection_length: Minimum cleaned text length to attempt
            language detection.
        ignore_patterns: Regex patterns for msgid/msgstr to skip.

    Returns:
        The issues of all linted files, concatenated.
    """
    rules = load_ignore_rules(locale_dir)
    collected = []

    for po_path in sorted(locale_dir.rglob("*.po")):
        code = extract_locale_from_path(po_path)
        # Skip files with no recognizable locale or outside the requested set.
        if code is None or (languages and code not in languages):
            continue

        collected += lint_po_file(
            po_path,
            locale=code,
            source_language=source_language,
            confidence_threshold=confidence_threshold,
            min_text_length=min_text_length,
            min_detection_length=min_detection_length,
            ignore_patterns=ignore_patterns,
            ignore_rules=rules,
        )

    return collected
@@ -0,0 +1,67 @@
1
+ Metadata-Version: 2.4
2
+ Name: python-po-lint
3
+ Version: 0.1.0
4
+ Summary: Lint .po translation files for contamination, wrong languages, shifts, and garbled text
5
+ Author: PesCheck
6
+ License-Expression: MIT
7
+ License-File: LICENSE
8
+ Requires-Python: >=3.10
9
+ Requires-Dist: fasttext-wheel>=0.9.2
10
+ Requires-Dist: numpy<2
11
+ Requires-Dist: polib>=1.2
12
+ Description-Content-Type: text/markdown
13
+
14
+ # python-po-lint
15
+
16
+ Lint `.po` translation files for contamination, wrong languages, shifts, and garbled text.
17
+
18
+ ## Features
19
+
20
+ - **Wrong script detection** — catches Cyrillic in a Dutch file, Arabic in French, etc.
21
+ - **Wrong language detection** — fastText language identification with confused-language score merging and carrier-phrase confirmation; strings too short for reliable detection are skipped
22
+ - **Shifted entry detection** — finds translations that got shifted to the wrong msgid
23
+ - **Garbled text detection** — catches corrupted/broken unicode
24
+
25
+ ## Installation
26
+
27
+ ```bash
28
+ pip install python-po-lint
29
+ ```
30
+
31
+ ## Usage
32
+
33
+ ```bash
34
+ # Lint a locale directory
35
+ po-lint locale/
36
+
37
+ # Lint with config from pyproject.toml
38
+ po-lint
39
+
40
+ # Only check specific languages
41
+ po-lint locale/ --languages fr de nl
42
+ ```
43
+
44
+ ## Configuration
45
+
46
+ Add to your `pyproject.toml`:
47
+
48
+ ```toml
49
+ [tool.po-lint]
50
+ # Explicit locale directories (relative to project root)
51
+ paths = ["locale"]
52
+
53
+ # Auto-discover locale dirs from installed Python packages
54
+ packages = ["myapp", "myotherapp"]
55
+
56
+ # Only check these languages (empty = all)
57
+ languages = []
58
+
59
+ # Minimum confidence to flag wrong language (0.0 - 1.0)
60
+ confidence_threshold = 0.5
61
+
62
+ # Skip entries with msgstr shorter than this
63
+ min_text_length = 3
64
+
65
+ # Regex patterns to ignore (matched against msgid and msgstr)
66
+ ignore_patterns = []
67
+ ```
@@ -0,0 +1,11 @@
1
+ po_lint/__init__.py,sha256=2VjrIp2SLGcJKCEhgQ_Ss706P6o6RD86B9-wSPKJ-94,134
2
+ po_lint/checks.py,sha256=cpySTm9QJzcy31rjOKgBsNeXeeHFmzfdCHjhUa2nHLQ,10556
3
+ po_lint/cli.py,sha256=3bc5_ZaZ4-WK29JsyqwgI677p4F6WxpaAnR0_-_9g2Y,6866
4
+ po_lint/config.py,sha256=Ya3Q78_LZ6x4RrmWpexH4qFrHgKBxyUlX103XOXSQSI,3053
5
+ po_lint/detector.py,sha256=fA1tDP3Lf05bFLh-ybvpAgHTAJPC7cut8dpZc7REssg,11735
6
+ po_lint/linter.py,sha256=inT8gSP75nosm9S_uCEKURo4MICVUHqdIFmElQhLAgg,9038
7
+ python_po_lint-0.1.0.dist-info/METADATA,sha256=0cw6AlhCs3UBVJRrkV3YYgAfz79NvmDnoOCctxMezyk,1638
8
+ python_po_lint-0.1.0.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
9
+ python_po_lint-0.1.0.dist-info/entry_points.txt,sha256=W9_2iuo5yQtYgY5U8qVH7h6Xq8XSj0KsF71TR_9_xDE,45
10
+ python_po_lint-0.1.0.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
11
+ python_po_lint-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.29.0
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ po-lint = po_lint.cli:main