PyPI - python-po-lint - Versions diffs - 0.1.0__py3-none-any.whl - Mend

python-po-lint 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (11) hide show

po_lint/__init__.py +3 -0
po_lint/checks.py +291 -0
po_lint/cli.py +208 -0
po_lint/config.py +94 -0
po_lint/detector.py +314 -0
po_lint/linter.py +277 -0
python_po_lint-0.1.0.dist-info/METADATA +67 -0
python_po_lint-0.1.0.dist-info/RECORD +11 -0
python_po_lint-0.1.0.dist-info/WHEEL +4 -0
python_po_lint-0.1.0.dist-info/entry_points.txt +2 -0
python_po_lint-0.1.0.dist-info/licenses/LICENSE +201 -0

po_lint/__init__.py ADDED Viewed

@@ -0,0 +1,3 @@
+"""python-po-lint: Lint .po translation files for contamination, wrong languages, shifts, and garbled text."""
+__version__ = "0.1.0"

po_lint/checks.py ADDED Viewed

@@ -0,0 +1,291 @@
+"""Individual lint checks for .po file entries."""
+import re
+import unicodedata
+from dataclasses import dataclass
+from enum import Enum
+class Severity(Enum):
+    """How serious a lint finding is.
+    The CLI output functions return exit status 1 as soon as one ERROR exists;
+    a WARNING only does so when ``--warnings-as-errors`` is passed.
+    """
+    # Assigned by the wrong-script and garbled-text checks in this module.
+    ERROR = "error"
+    # Assigned by the shifted-entry heuristic in this module.
+    WARNING = "warning"
+class IssueType(Enum):
+    """Category of a lint finding; the value is the tag shown in text and JSON output."""
+    # Not produced in this module — presumably set by the language detector; verify in linter.py.
+    WRONG_LANGUAGE = "wrong_language"
+    # Produced by check_wrong_script() and _check_distinctive_chars().
+    WRONG_SCRIPT = "wrong_script"
+    # Produced by check_shifted_entry().
+    SHIFTED_ENTRY = "shifted_entry"
+    # Produced by check_garbled_text().
+    GARBLED_TEXT = "garbled_text"
+    # Not produced in this module — presumably msgstr left equal to msgid; verify in linter.py.
+    UNTRANSLATED = "untranslated"
+@dataclass
+class Issue:
+    """A single lint issue found in a .po file."""
+    file: str
+    line: int
+    msgid: str
+    msgstr: str
+    issue_type: IssueType
+    severity: Severity
+    message: str
+    detected_lang: str = ""
+    confidence: float = 0.0
+    def __str__(self) -> str:
+        msgid_short = self.msgid[:60] + "..." if len(self.msgid) > 60 else self.msgid
+        return f"  {self.severity.value.upper()}: [{self.issue_type.value}] {self.message}\n    msgid: {msgid_short!r}"
+# Script detection patterns — covers all major writing systems
+SCRIPT_PATTERNS = {
+    "latin": re.compile(r"[a-zA-Z\u00C0-\u024F\u1E00-\u1EFF\u0100-\u017F\u0180-\u024F]"),
+    "arabic": re.compile(r"[\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF\uFB50-\uFDFF\uFE70-\uFEFF]"),
+    "cyrillic": re.compile(r"[\u0400-\u04FF\u0500-\u052F\u2DE0-\u2DFF\uA640-\uA69F]"),
+    "devanagari": re.compile(r"[\u0900-\u097F\uA8E0-\uA8FF]"),
+    "bengali": re.compile(r"[\u0980-\u09FF]"),
+    "gurmukhi": re.compile(r"[\u0A00-\u0A7F]"),
+    "gujarati": re.compile(r"[\u0A80-\u0AFF]"),
+    "oriya": re.compile(r"[\u0B00-\u0B7F]"),
+    "tamil": re.compile(r"[\u0B80-\u0BFF]"),
+    "telugu": re.compile(r"[\u0C00-\u0C7F]"),
+    "kannada": re.compile(r"[\u0C80-\u0CFF]"),
+    "malayalam": re.compile(r"[\u0D00-\u0D7F]"),
+    "sinhala": re.compile(r"[\u0D80-\u0DFF]"),
+    "thai": re.compile(r"[\u0E00-\u0E7F]"),
+    "lao": re.compile(r"[\u0E80-\u0EFF]"),
+    "tibetan": re.compile(r"[\u0F00-\u0FFF]"),
+    "myanmar": re.compile(r"[\u1000-\u109F\uAA60-\uAA7F]"),
+    "georgian": re.compile(r"[\u10A0-\u10FF\u2D00-\u2D2F]"),
+    "hangul": re.compile(r"[\uAC00-\uD7AF\u1100-\u11FF\u3130-\u318F]"),
+    "cjk": re.compile(r"[\u4E00-\u9FFF\u3400-\u4DBF\uF900-\uFAFF]"),
+    "kana": re.compile(r"[\u3040-\u309F\u30A0-\u30FF\u31F0-\u31FF]"),
+    "hebrew": re.compile(r"[\u0590-\u05FF\uFB1D-\uFB4F]"),
+    "armenian": re.compile(r"[\u0530-\u058F\uFB00-\uFB17]"),
+    "ethiopic": re.compile(r"[\u1200-\u137F\u1380-\u139F\u2D80-\u2DDF]"),
+    "khmer": re.compile(r"[\u1780-\u17FF\u19E0-\u19FF]"),
+}
+# Expected scripts per locale — comprehensive mapping covering all fastText languages.
+# Languages may accept multiple scripts (e.g. Serbian uses both Cyrillic and Latin).
+# fmt: off
+LOCALE_SCRIPTS = {
+    # Latin script
+    "af": {"latin"}, "az": {"latin"}, "br": {"latin"}, "bs": {"latin"}, "ca": {"latin"},
+    "ceb": {"latin"}, "co": {"latin"}, "cs": {"latin"}, "cy": {"latin"}, "da": {"latin"},
+    "de": {"latin"}, "en": {"latin"}, "eo": {"latin"}, "es": {"latin"}, "et": {"latin"},
+    "eu": {"latin"}, "fi": {"latin"}, "fil": {"latin"}, "fr": {"latin"}, "fy": {"latin"},
+    "ga": {"latin"}, "gd": {"latin"}, "gl": {"latin"}, "ha": {"latin"}, "haw": {"latin"},
+    "hr": {"latin"}, "ht": {"latin"}, "hu": {"latin"}, "id": {"latin"}, "ig": {"latin"},
+    "is": {"latin"}, "it": {"latin"}, "jv": {"latin"}, "la": {"latin"}, "lb": {"latin"},
+    "lt": {"latin"}, "lv": {"latin"}, "mg": {"latin"}, "mi": {"latin"}, "ms": {"latin"},
+    "mt": {"latin"}, "nl": {"latin"}, "no": {"latin"}, "nb": {"latin"}, "nn": {"latin"},
+    "ny": {"latin"}, "oc": {"latin"}, "pl": {"latin"}, "pt": {"latin"}, "ro": {"latin"},
+    "rw": {"latin"}, "sk": {"latin"}, "sl": {"latin"}, "sm": {"latin"}, "sn": {"latin"},
+    "so": {"latin"}, "sq": {"latin"}, "st": {"latin"}, "su": {"latin"}, "sv": {"latin"},
+    "sw": {"latin"}, "tl": {"latin"}, "tr": {"latin"}, "uz": {"latin"}, "vi": {"latin"},
+    "war": {"latin"}, "xh": {"latin"}, "yo": {"latin"}, "zu": {"latin"},
+    # Cyrillic script
+    "be": {"cyrillic"}, "bg": {"cyrillic"}, "ky": {"cyrillic"}, "kk": {"cyrillic"},
+    "mk": {"cyrillic"}, "mn": {"cyrillic"}, "ru": {"cyrillic"}, "tg": {"cyrillic"},
+    "tt": {"cyrillic"}, "uk": {"cyrillic"},
+    # Multi-script languages
+    "sr": {"cyrillic", "latin"}, "sh": {"cyrillic", "latin"},
+    # Arabic script
+    "ar": {"arabic"}, "fa": {"arabic"}, "ku": {"arabic"}, "ps": {"arabic"},
+    "sd": {"arabic"}, "ug": {"arabic"}, "ur": {"arabic"},
+    # Devanagari
+    "hi": {"devanagari"}, "mr": {"devanagari"}, "ne": {"devanagari"}, "sa": {"devanagari"},
+    # Other Indic scripts
+    "as": {"bengali"}, "bn": {"bengali"},
+    "gu": {"gujarati"},
+    "kn": {"kannada"},
+    "ml": {"malayalam"},
+    "or": {"oriya"},
+    "pa": {"gurmukhi"},
+    "si": {"sinhala"},
+    "ta": {"tamil"},
+    "te": {"telugu"},
+    # East Asian
+    "zh": {"cjk"}, "zh_Hans": {"cjk"}, "zh_Hant": {"cjk"},
+    "ja": {"cjk", "kana"},
+    "ko": {"hangul"},
+    # Other scripts
+    "am": {"ethiopic"}, "ti": {"ethiopic"},
+    "el": {"latin"},  # Greek — could add greek script pattern if needed
+    "he": {"hebrew"}, "yi": {"hebrew"},
+    "hy": {"armenian"},
+    "ka": {"georgian"},
+    "km": {"khmer"},
+    "lo": {"lao"},
+    "my": {"myanmar"},
+    "th": {"thai"},
+}
+# fmt: on
+def detect_scripts(text: str) -> dict[str, int]:
+    """Count characters belonging to each script in the text."""
+    counts = {}
+    for name, pattern in SCRIPT_PATTERNS.items():
+        count = len(pattern.findall(text))
+        if count > 0:
+            counts[name] = count
+    return counts
+def check_wrong_script(msgstr: str, locale: str) -> Issue | None:
+    """Check if the translation uses the wrong writing script or wrong language within the same script.
+    1. Entirely wrong script (e.g. Latin in Arabic file) → ERROR
+    2. Distinctive character check for same-script languages (e.g. Russian chars in Ukrainian file) → ERROR
+    """
+    expected = LOCALE_SCRIPTS.get(locale)
+    if not expected:
+        return None
+    scripts = detect_scripts(msgstr)
+    if not scripts:
+        return None
+    # Check if any expected script is present
+    expected_present = any(s in scripts for s in expected)
+    # Find dominant script
+    dominant = max(scripts, key=scripts.get)
+    dominant_count = scripts[dominant]
+    total = sum(scripts.values())
+    if dominant in expected:
+        # Script is correct — check distinctive characters for same-script languages
+        return _check_distinctive_chars(msgstr, locale)
+    # Wrong script is dominant
+    if dominant_count / total < 0.5:
+        return None
+    # Expected script is present but not dominant — likely technical terms mixed in, ignore
+    if expected_present:
+        return None
+    # Expected script is completely absent — clear contamination
+    return Issue(
+        file="",
+        line=0,
+        msgid="",
+        msgstr=msgstr,
+        issue_type=IssueType.WRONG_SCRIPT,
+        severity=Severity.ERROR,
+        message=f"Expected {'/'.join(expected)} script, found entirely {dominant} ({dominant_count}/{total} chars)",
+    )
+def _check_distinctive_chars(msgstr: str, locale: str) -> Issue | None:
+    """Check for foreign distinctive characters within the same script.
+    If foreign-only characters are found, it's contamination — regardless of
+    whether the locale's own distinctive characters are also present.
+    If no distinctive characters from either side are found, we can't tell — skip.
+    """
+    config = DISTINCTIVE_CHARS.get(locale)
+    if not config:
+        return None
+    chars = set(msgstr)
+    has_own = bool(chars & config["own"])
+    for foreign_lang, foreign_chars in config.items():
+        if foreign_lang == "own":
+            continue
+        has_foreign = bool(chars & foreign_chars)
+        if not has_foreign:
+            continue
+        if has_own:
+            message = f"Mixed {locale}/{foreign_lang} characters — possible contamination"
+        else:
+            message = f"Found {foreign_lang}-only characters, no {locale}-specific characters"
+        return Issue(
+            file="",
+            line=0,
+            msgid="",
+            msgstr=msgstr,
+            issue_type=IssueType.WRONG_SCRIPT,
+            severity=Severity.ERROR,
+            message=message,
+        )
+    return None
+# Distinctive characters per locale — used to distinguish languages that share
+# a script but have unique alphabet characters. Each entry maps a locale to its
+# own unique characters and the foreign characters that indicate contamination.
+# Add new entries as needed for other language pairs (e.g. Serbian/Bulgarian).
+DISTINCTIVE_CHARS: dict[str, dict[str, set[str]]] = {
+    "uk": {"own": set("ґєіїҐЄІЇ"), "ru": set("ёыэъЁЫЭЪ")},
+    "ru": {"own": set("ёыэъЁЫЭЪ"), "uk": set("ґєіїҐЄІЇ")},
+}
+def check_shifted_entry(msgid: str, msgstr: str) -> Issue | None:
+    """Detect entries where msgstr appears to be shifted (belongs to a different msgid).
+    Heuristic: if msgid is long (>100 chars) but msgstr is very short (<15% of msgid length),
+    the translation is likely shifted from a different entry.
+    """
+    if not msgstr or not msgid:
+        return None
+    msgid_len = len(msgid)
+    msgstr_len = len(msgstr)
+    if msgid_len < 100:
+        return None
+    ratio = msgstr_len / msgid_len
+    if ratio >= 0.15:
+        return None
+    return Issue(
+        file="",
+        line=0,
+        msgid=msgid,
+        msgstr=msgstr,
+        issue_type=IssueType.SHIFTED_ENTRY,
+        severity=Severity.WARNING,
+        message=f"Possible shifted entry: msgstr is {ratio:.0%} the length of msgid ({msgstr_len} vs {msgid_len} chars)",
+    )
+def check_garbled_text(msgstr: str) -> Issue | None:
+    """Detect garbled/corrupted text patterns."""
+    if len(msgstr) < 5:
+        return None
+    # Check for high ratio of replacement characters or unusual unicode categories
+    suspicious = 0
+    total = 0
+    for char in msgstr:
+        cat = unicodedata.category(char)
+        total += 1
+        if cat.startswith("C") and cat != "Cf":  # Control chars (except format)
+            suspicious += 1
+        elif char == "\ufffd":  # Replacement character
+            suspicious += 1
+    if total > 0 and suspicious / total > 0.1:
+        return Issue(
+            file="",
+            line=0,
+            msgid="",
+            msgstr=msgstr,
+            issue_type=IssueType.GARBLED_TEXT,
+            severity=Severity.ERROR,
+            message=f"Garbled text detected: {suspicious}/{total} suspicious characters",
+        )
+    return None

po_lint/cli.py ADDED Viewed

@@ -0,0 +1,208 @@
+"""CLI entry point for po-lint."""
+import argparse
+import json
+import sys
+from pathlib import Path
+from po_lint.checks import Severity
+from po_lint.config import load_config
+from po_lint.detector import init_model
+from po_lint.linter import lint_locale_dir
+def main(argv: list[str] | None = None) -> int:
+    parser = argparse.ArgumentParser(
+        prog="po-lint",
+        description="Lint .po translation files for contamination, wrong languages, shifts, and garbled text.",
+    )
+    parser.add_argument(
+        "paths",
+        nargs="*",
+        type=Path,
+        help="Locale directories to lint. If omitted, reads from pyproject.toml [tool.po-lint] config.",
+    )
+    parser.add_argument(
+        "--config-dir",
+        type=Path,
+        default=None,
+        help="Directory containing pyproject.toml (default: current directory).",
+    )
+    parser.add_argument(
+        "--confidence",
+        type=float,
+        default=None,
+        help="Minimum confidence threshold for wrong language detection (default: 0.5).",
+    )
+    parser.add_argument(
+        "--languages",
+        nargs="*",
+        default=None,
+        help="Only check these language codes (e.g. --languages fr de nl).",
+    )
+    parser.add_argument(
+        "--source-language",
+        default=None,
+        help="Source language of the .po files (default: 'en'). Detections matching this language are allowed.",
+    )
+    parser.add_argument(
+        "--min-detection-length",
+        type=int,
+        default=None,
+        help="Minimum cleaned text length for language detection (default: 30).",
+    )
+    parser.add_argument(
+        "--compact-model",
+        action="store_true",
+        help="Use the compact fastText model (917KB, less accurate) instead of the full model (126MB).",
+    )
+    parser.add_argument(
+        "--format",
+        choices=["text", "json"],
+        default="text",
+        help="Output format (default: text).",
+    )
+    parser.add_argument(
+        "--warnings-as-errors",
+        action="store_true",
+        help="Treat warnings as errors (exit code 1).",
+    )
+    parser.add_argument(
+        "--no-color",
+        action="store_true",
+        help="Disable colored output.",
+    )
+    args = parser.parse_args(argv)
+    # Load config from pyproject.toml
+    config_dir = args.config_dir or Path.cwd()
+    config = load_config(config_dir)
+    # CLI args override config
+    confidence = args.confidence if args.confidence is not None else config.confidence_threshold
+    languages = args.languages if args.languages is not None else (config.languages or None)
+    source_language = args.source_language if args.source_language is not None else config.source_language
+    min_detection_length = (
+        args.min_detection_length if args.min_detection_length is not None
+        else config.min_detection_length
+    )
+    # Resolve locale directories
+    if args.paths:
+        locale_dirs = [p for p in args.paths if p.is_dir()]
+        if not locale_dirs:
+            print(f"Error: No valid directories found in {args.paths}", file=sys.stderr)
+            return 2
+    else:
+        locale_dirs = config.resolve_locale_dirs(config_dir)
+        if not locale_dirs:
+            print("Error: No locale directories found. Specify paths or configure [tool.po-lint] in pyproject.toml.",
+                  file=sys.stderr)
+            return 2
+    # Initialize model
+    compact = args.compact_model or config.compact_model
+    init_model(compact=compact)
+    # Run linting
+    all_issues = []
+    for locale_dir in locale_dirs:
+        if args.format == "text":
+            print(f"Linting {locale_dir}...")
+        issues = lint_locale_dir(
+            locale_dir,
+            languages=languages,
+            source_language=source_language,
+            confidence_threshold=confidence,
+            min_text_length=config.min_text_length,
+            min_detection_length=min_detection_length,
+            ignore_patterns=config.ignore_patterns,
+        )
+        all_issues.extend(issues)
+    if args.format == "json":
+        return _output_json(all_issues, args.warnings_as_errors)
+    return _output_text(all_issues, args)
+def _output_json(issues, warnings_as_errors: bool) -> int:
+    errors = [i for i in issues if i.severity == Severity.ERROR]
+    warnings = [i for i in issues if i.severity == Severity.WARNING]
+    output = {
+        "summary": {
+            "errors": len(errors),
+            "warnings": len(warnings),
+            "files": len({i.file for i in issues}),
+        },
+        "issues": [
+            {
+                "file": i.file,
+                "line": i.line,
+                "severity": i.severity.value,
+                "type": i.issue_type.value,
+                "message": i.message,
+                "msgid": i.msgid,
+                "msgstr": i.msgstr,
+                "detected_lang": i.detected_lang or None,
+                "confidence": round(i.confidence, 4) if i.confidence else None,
+            }
+            for i in sorted(issues, key=lambda x: (x.file, x.line))
+        ],
+    }
+    print(json.dumps(output, ensure_ascii=False, indent=2))
+    if errors:
+        return 1
+    if warnings and warnings_as_errors:
+        return 1
+    return 0
+def _output_text(issues, args) -> int:
+    if not issues:
+        return 0
+    errors = [i for i in issues if i.severity == Severity.ERROR]
+    warnings = [i for i in issues if i.severity == Severity.WARNING]
+    by_file: dict[str, list] = {}
+    for issue in issues:
+        by_file.setdefault(issue.file, []).append(issue)
+    use_color = not args.no_color and sys.stdout.isatty()
+    for file_path, file_issues in sorted(by_file.items()):
+        if use_color:
+            print(f"\n\033[1m{file_path}\033[0m")
+        else:
+            print(f"\n{file_path}")
+        for issue in sorted(file_issues, key=lambda i: i.line):
+            prefix = _severity_prefix(issue.severity, use_color)
+            msgid_short = issue.msgid[:60] + "..." if len(issue.msgid) > 60 else issue.msgid
+            msgstr_short = issue.msgstr[:60] + "..." if len(issue.msgstr) > 60 else issue.msgstr
+            print(f"  line {issue.line}: {prefix} [{issue.issue_type.value}] {issue.message}")
+            if msgid_short:
+                print(f"    msgid:  {msgid_short!r}")
+            if msgstr_short:
+                print(f"    msgstr: {msgstr_short!r}")
+    print(f"\nFound {len(errors)} error(s) and {len(warnings)} warning(s) in {len(by_file)} file(s).")
+    if errors:
+        return 1
+    if warnings and args.warnings_as_errors:
+        return 1
+    return 0
+def _severity_prefix(severity: Severity, color: bool) -> str:
+    if severity == Severity.ERROR:
+        return "\033[31mERROR\033[0m" if color else "ERROR"
+    return "\033[33mWARNING\033[0m" if color else "WARNING"
+if __name__ == "__main__":
+    sys.exit(main())

po_lint/config.py ADDED Viewed

@@ -0,0 +1,94 @@
+"""Configuration loading from pyproject.toml."""
+import importlib.util
+import sys
+from dataclasses import dataclass, field
+from pathlib import Path
+if sys.version_info >= (3, 11):
+    import tomllib
+else:
+    try:
+        import tomllib
+    except ImportError:
+        import tomli as tomllib
+@dataclass
+class Config:
+    """po-lint configuration."""
+    paths: list[Path] = field(default_factory=lambda: [Path("locale")])
+    packages: list[str] = field(default_factory=list)
+    languages: list[str] = field(default_factory=list)
+    source_language: str = "en"
+    confidence_threshold: float = 0.5
+    min_text_length: int = 3
+    min_detection_length: int = 30
+    ignore_patterns: list[str] = field(default_factory=list)
+    compact_model: bool = False
+    def resolve_locale_dirs(self, base_dir: Path) -> list[Path]:
+        """Resolve all locale directories from paths and packages.
+        Returns a list of existing locale directories.
+        """
+        locale_dirs = []
+        # Explicit paths (relative to base_dir)
+        for p in self.paths:
+            resolved = base_dir / p if not p.is_absolute() else p
+            if resolved.is_dir():
+                locale_dirs.append(resolved)
+        # Auto-discover from installed packages
+        for package_name in self.packages:
+            locale_dir = find_package_locale(package_name)
+            if locale_dir and locale_dir.is_dir():
+                locale_dirs.append(locale_dir)
+        return locale_dirs
+def find_package_locale(package_name: str) -> Path | None:
+    """Find the locale directory for an installed Python package."""
+    spec = importlib.util.find_spec(package_name)
+    if spec is None or spec.origin is None:
+        return None
+    package_dir = Path(spec.origin).parent
+    locale_dir = package_dir / "locale"
+    if locale_dir.is_dir():
+        return locale_dir
+    return None
+def load_config(project_dir: Path | None = None) -> Config:
+    """Load configuration from pyproject.toml in the given directory.
+    Falls back to defaults if no config file or no [tool.po-lint] section found.
+    """
+    if project_dir is None:
+        project_dir = Path.cwd()
+    pyproject = project_dir / "pyproject.toml"
+    if not pyproject.exists():
+        return Config()
+    with open(pyproject, "rb") as f:
+        data = tomllib.load(f)
+    tool_config = data.get("tool", {}).get("po-lint", {})
+    if not tool_config:
+        return Config()
+    return Config(
+        paths=[Path(p) for p in tool_config.get("paths", ["locale"])],
+        packages=tool_config.get("packages", []),
+        languages=tool_config.get("languages", []),
+        source_language=tool_config.get("source_language", "en"),
+        confidence_threshold=tool_config.get("confidence_threshold", 0.5),
+        min_text_length=tool_config.get("min_text_length", 3),
+        min_detection_length=tool_config.get("min_detection_length", 30),
+        ignore_patterns=tool_config.get("ignore_patterns", []),
+        compact_model=tool_config.get("compact_model", False),
+    )