PyPI - paraencoder - Versions diffs - 0.1.0__py3-none-any.whl - Mend

paraencoder 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (12) hide show

para/__init__.py +14 -0
para/cli.py +101 -0
para/convert.py +46 -0
para/detect.py +65 -0
para/io.py +42 -0
para/normalize.py +20 -0
para/rules.py +361 -0
paraencoder-0.1.0.dist-info/METADATA +133 -0
paraencoder-0.1.0.dist-info/RECORD +12 -0
paraencoder-0.1.0.dist-info/WHEEL +4 -0
paraencoder-0.1.0.dist-info/entry_points.txt +2 -0
paraencoder-0.1.0.dist-info/licenses/LICENSE +21 -0

para/__init__.py ADDED Viewed

@@ -0,0 +1,14 @@
+"""Para: Burmese text detection and conversion toolkit."""
+__all__ = [
+    "is_zawgyi",
+    "detect_encoding",
+    "zg_to_unicode",
+    "normalize_unicode",
+]
+from para.detect import detect_encoding, is_zawgyi
+from para.convert import zg_to_unicode
+from para.normalize import normalize_unicode
+__version__ = "0.1.0"

para/cli.py ADDED Viewed

@@ -0,0 +1,101 @@
+"""Command line entrypoint for Para."""
+from __future__ import annotations
+import argparse
+import sys
+from typing import Optional
+from para.convert import zg_to_unicode
+from para.detect import detect_encoding, is_zawgyi
+from para.io import convert_file, read_text, write_text
+from para.normalize import normalize_unicode
+def _read_input(input_path: Optional[str]) -> str:
+    if input_path:
+        return read_text(input_path)
+    return sys.stdin.read()
+def _write_output(data: str, output_path: Optional[str]) -> None:
+    if output_path:
+        write_text(output_path, data)
+    else:
+        sys.stdout.write(data)
+def _cmd_detect(args: argparse.Namespace) -> int:
+    data = _read_input(args.input)
+    encoding = detect_encoding(data)
+    sys.stdout.write(f"{encoding}\n")
+    return 0
+def _cmd_convert(args: argparse.Namespace) -> int:
+    if args.input:
+        converted = convert_file(
+            input_path=args.input,
+            output_path=args.output,
+            assume_zawgyi=args.force,
+            normalize=not args.no_normalize,
+        )
+        if not args.output:
+            sys.stdout.write(converted)
+    else:
+        data = sys.stdin.read()
+        converted = zg_to_unicode(
+            data,
+            normalize=not args.no_normalize,
+            force=args.force,
+        )
+        _write_output(converted, args.output)
+    return 0
+def _cmd_normalize(args: argparse.Namespace) -> int:
+    data = _read_input(args.input)
+    normalized = normalize_unicode(data)
+    _write_output(normalized, args.output)
+    return 0
+def build_parser() -> argparse.ArgumentParser:
+    parser = argparse.ArgumentParser(description="Para: Zawgyi ↔ Unicode tooling")
+    sub = parser.add_subparsers(dest="command", required=True)
+    detect_parser = sub.add_parser("detect", help="Detect encoding of input text")
+    detect_parser.add_argument("--input", help="Input file path; defaults to stdin")
+    detect_parser.set_defaults(func=_cmd_detect)
+    convert_parser = sub.add_parser("convert", help="Convert Zawgyi text to Unicode")
+    convert_parser.add_argument("--input", help="Input file path; defaults to stdin")
+    convert_parser.add_argument("--output", help="Output file path; defaults to stdout")
+    convert_parser.add_argument(
+        "--force",
+        action="store_true",
+        help="Force conversion even if detection is uncertain",
+    )
+    convert_parser.add_argument(
+        "--no-normalize",
+        action="store_true",
+        help="Skip Unicode normalization step",
+    )
+    convert_parser.set_defaults(func=_cmd_convert)
+    normalize_parser = sub.add_parser("normalize", help="Normalize Unicode Burmese text")
+    normalize_parser.add_argument("--input", help="Input file path; defaults to stdin")
+    normalize_parser.add_argument("--output", help="Output file path; defaults to stdout")
+    normalize_parser.set_defaults(func=_cmd_normalize)
+    return parser
+def main(argv: Optional[list[str]] = None) -> int:
+    parser = build_parser()
+    args = parser.parse_args(argv)
+    return args.func(args)
+if __name__ == "__main__":  # pragma: no cover
+    raise SystemExit(main())

para/convert.py ADDED Viewed

@@ -0,0 +1,46 @@
+"""Rule-based Zawgyi to Unicode conversion."""
+from __future__ import annotations
+import re
+from typing import Iterable
+from para.detect import detect_encoding, is_zawgyi
+from para.normalize import normalize_unicode
+from para.rules import ZAWGYI_TO_UNICODE_RULES
+def _compile_rules(rules: Iterable[tuple[str, str]]) -> list[tuple[re.Pattern[str], str]]:
+    compiled: list[tuple[re.Pattern[str], str]] = []
+    for pattern, replacement in rules:
+        compiled.append((re.compile(pattern), replacement))
+    return compiled
+_COMPILED_RULES = _compile_rules(ZAWGYI_TO_UNICODE_RULES)
+def zg_to_unicode(text: str, *, normalize: bool = True, force: bool = False) -> str:
+    """
+    Convert Zawgyi text to Unicode using ordered regex rules.
+    Args:
+        text: Input text that may be Zawgyi.
+        normalize: Whether to pass the converted text through normalize_unicode (currently a no-op in v0.1.0).
+        force: When False, conversion only runs if the detector believes the text is Zawgyi.
+    """
+    if not text:
+        return ""
+    # Hard guard: never modify non-Zawgyi input (contract guarantee).
+    if not force and detect_encoding(text) != "zawgyi":
+        return text
+    converted = text
+    for pattern, repl in _COMPILED_RULES:
+        converted = pattern.sub(repl, converted)
+    if normalize:
+        converted = normalize_unicode(converted)
+    return converted

para/detect.py ADDED Viewed

@@ -0,0 +1,65 @@
+"""Deterministic detection for Zawgyi vs Unicode Burmese text."""
+from __future__ import annotations
+import re
+from typing import Literal
+Encoding = Literal["zawgyi", "unicode", "unknown"]
+_MYANMAR_RANGE = re.compile(r"[\u1000-\u109F]")
+# If the scores differ by at most this margin (0 = exact ties only), the result is "unknown".
+SCORE_TIE_MARGIN = 0
+# Patterns that strongly suggest Zawgyi encoding.
+_ZG_PATTERNS = [
+    (re.compile(r"[\u105A\u1060-\u1097]"), 4),
+    (re.compile(r"\u1031[\u103B-\u103E]"), 3),
+    (re.compile(r"\u1039[\u1000-\u1021]?\u1031"), 3),
+    (re.compile(r"\u103A\u103A"), 2),
+    (re.compile(r"\u1039[\u1000-\u109F]"), 2),
+    (re.compile(r"\u1031\u108A"), 3),
+]
+# Patterns that indicate proper Unicode ordering or characters.
+_UNI_PATTERNS = [
+    (re.compile(r"\u1031[\u1000-\u1021]"), 3),
+    (re.compile(r"\u102B\u103A"), 2),
+    (re.compile(r"\u103B[\u103C\u103D]"), 2),
+    (re.compile(r"\u103C[\u103E]"), 2),
+    (re.compile(r"\u1037[\u103A]"), 2),
+    (re.compile(r"\u1004\u103A\u1039"), 3),
+    (re.compile(r"[\u1000-\u1021]\u103C"), 2),
+]
+def _score(text: str, patterns: list[tuple[re.Pattern[str], int]]) -> int:
+    score = 0
+    for pattern, weight in patterns:
+        matches = pattern.findall(text)
+        if matches:
+            score += len(matches) * weight
+    return score
+def detect_encoding(text: str) -> Encoding:
+    """Return "zawgyi", "unicode", or "unknown" based on heuristic scoring."""
+    if not text:
+        return "unknown"
+    if not _MYANMAR_RANGE.search(text):
+        return "unknown"
+    zg_score = _score(text, _ZG_PATTERNS)
+    uni_score = _score(text, _UNI_PATTERNS)
+    if abs(zg_score - uni_score) <= SCORE_TIE_MARGIN:
+        return "unknown"
+    return "zawgyi" if zg_score > uni_score else "unicode"
+def is_zawgyi(text: str) -> bool:
+    """Convenience boolean: True when the detector prefers Zawgyi."""
+    return detect_encoding(text) == "zawgyi"

para/io.py ADDED Viewed

@@ -0,0 +1,42 @@
+"""Batch-friendly I/O helpers."""
+from __future__ import annotations
+from pathlib import Path
+from typing import Optional
+from para.convert import zg_to_unicode
+DEFAULT_ENCODING = "utf-8"
+def read_text(path: str, *, encoding: str = DEFAULT_ENCODING) -> str:
+    return Path(path).read_text(encoding=encoding)
+def write_text(path: str, data: str, *, encoding: str = DEFAULT_ENCODING) -> None:
+    Path(path).write_text(data, encoding=encoding)
+def convert_file(
+    *,
+    input_path: str,
+    output_path: Optional[str] = None,
+    assume_zawgyi: bool = False,
+    normalize: bool = True,
+    encoding: str = DEFAULT_ENCODING,
+) -> str:
+    """
+    Convert a file from Zawgyi to Unicode and write the result.
+    Returns the converted text. When ``output_path`` is None, the caller can
+    capture the returned string.
+    """
+    data = read_text(input_path, encoding=encoding)
+    converted = zg_to_unicode(data, normalize=normalize, force=assume_zawgyi)
+    if output_path:
+        write_text(output_path, converted, encoding=encoding)
+    return converted

para/normalize.py ADDED Viewed

@@ -0,0 +1,20 @@
+"""Unicode-focused normalization helpers."""
+from __future__ import annotations
+# NOTE (v0.1.0): Normalization is intentionally disabled for safety.
+# Previous reordering logic corrupted valid canonical Unicode text.
+# Until a provably-safe implementation is available, this function is a
+# strict no-op.  Unicode safety > clever normalization.
+def normalize_unicode(text: str) -> str:
+    """Return text unchanged (normalization disabled in v0.1.0 for safety).
+    ParaEncoder must never modify valid Unicode text unless explicitly and
+    provably necessary.  Reordering / NFC logic has been removed because it
+    corrupted canonical input such as "မင်္ဂလာပါ".
+    Future versions may re-introduce opt-in, test-backed normalization.
+    """
+    return text

para/rules.py ADDED Viewed

@@ -0,0 +1,361 @@
+"""Zawgyi-to-Unicode conversion rules ported from ParaEncoder.
+Rules are applied in order. Each rule is a (pattern, replacement) tuple.
+Ported from: https://github.com/Laitei40/ParaEncoder
+"""
+ZAWGYI_TO_UNICODE_RULES = [
+    # Remove duplicate diacritics
+    (r"([\u102D\u102E\u103D\u102F\u1037\u1095])\1+", r"\1"),
+    # Remove zero-width space
+    ("\u200B", ""),
+    # Medial combinations
+    ("\u103d\u103c", "\u108a"),
+    # Medial HA variants -> U+103E
+    ("(\u103d|\u1087)", "\u103e"),
+    # Medial WA: U+103C -> U+103D
+    ("\u103c", "\u103d"),
+    # Medial RA variants -> U+103C
+    ("(\u103b|\u107e|\u107f|\u1080|\u1081|\u1082|\u1083|\u1084)", "\u103c"),
+    # Medial YA variants -> U+103B
+    ("(\u103a|\u107d)", "\u103b"),
+    # Asat: U+1039 -> U+103A
+    ("\u1039", "\u103a"),
+    # Stacked SA variants
+    ("(\u1066|\u1067)", "\u1039\u1006"),
+    # NYA variant (U+106A -> U+1009)
+    ("\u106a", "\u1009"),
+    # NNYA variant (U+106B -> U+100A)
+    ("\u106b", "\u100a"),
+    # Stacked TTA
+    ("\u106c", "\u1039\u100b"),
+    # Stacked TTHA
+    ("\u106d", "\u1039\u100c"),
+    # Stacked DDA + DDA
+    ("\u106e", "\u100d\u1039\u100d"),
+    # Stacked DDA + DDHA
+    ("\u106f", "\u100d\u1039\u100e"),
+    # Stacked NNA
+    ("\u1070", "\u1039\u100f"),
+    # Stacked TA variants
+    ("(\u1071|\u1072)", "\u1039\u1010"),
+    # Stacked KA
+    ("\u1060", "\u1039\u1000"),
+    # Stacked KHA
+    ("\u1061", "\u1039\u1001"),
+    # Stacked GA
+    ("\u1062", "\u1039\u1002"),
+    # Stacked GHA
+    ("\u1063", "\u1039\u1003"),
+    # Stacked CA
+    ("\u1065", "\u1039\u1005"),
+    # Stacked JA
+    ("\u1068", "\u1039\u1007"),
+    # Stacked JHA
+    ("\u1069", "\u1039\u1008"),
+    # Stacked THA variants
+    ("(\u1073|\u1074)", "\u1039\u1011"),
+    # Stacked DA
+    ("\u1075", "\u1039\u1012"),
+    # Stacked DHA
+    ("\u1076", "\u1039\u1013"),
+    # Stacked NA
+    ("\u1077", "\u1039\u1014"),
+    # Stacked PA
+    ("\u1078", "\u1039\u1015"),
+    # Stacked PHA
+    ("\u1079", "\u1039\u1016"),
+    # Stacked BA
+    ("\u107a", "\u1039\u1017"),
+    # Stacked MA
+    ("\u107c", "\u1039\u1019"),
+    # Stacked LA
+    ("\u1085", "\u1039\u101c"),
+    # U+1033 -> U+102F (vowel sign U; tall AA is U+102B)
+    ("\u1033", "\u102f"),
+    # U+1034 -> U+1030 (vowel sign UU; tall AA is U+102B)
+    ("\u1034", "\u1030"),
+    # Another U variant -> U+1030
+    ("\u103f", "\u1030"),
+    # Great SA -> U+103F
+    ("\u1086", "\u103f"),
+    # Reorder anusvara and medial HA+U
+    ("\u1036\u1088", "\u1088\u1036"),
+    # Medial HA + U combination
+    ("\u1088", "\u103e\u102f"),
+    # Medial HA + UU combination
+    ("\u1089", "\u103e\u1030"),
+    # Medial WA + HA combination
+    ("\u108a", "\u103d\u103e"),
+    # Reorder kinzi and medial YA
+    ("\u103B\u1064", "\u1064\u103B"),
+    # Reorder medial RA + consonant + kinzi
+    ("\u103c([\u1000-\u1021])([\u1064\u108b\u108d])", "\\1\u103c\\2"),
+    # Kinzi basic form
+    ("(\u1031)?([\u1000-\u1021\u1040-\u1049])(\u103c)?\u1064", "\u1004\u103a\u1039\\1\\2\\3"),
+    # Kinzi + vowel I
+    ("(\u1031)?([\u1000-\u1021])(\u103b|\u103c)?\u108b", "\u1004\u103a\u1039\\1\\2\\3\u102d"),
+    # Kinzi + vowel II
+    ("(\u1031)?([\u1000-\u1021])(\u103b)?\u108c", "\u1004\u103a\u1039\\1\\2\\3\u102e"),
+    # Kinzi + anusvara
+    ("(\u1031)?([\u1000-\u1021])([\u103b\u103c])?\u108d", "\u1004\u103a\u1039\\1\\2\\3\u1036"),
+    # Vowel I + anusvara combination
+    ("\u108e", "\u102d\u1036"),
+    # NA variant
+    ("\u108f", "\u1014"),
+    # RA variant
+    ("\u1090", "\u101b"),
+    # NNA + DDA stacked
+    ("\u1091", "\u100f\u1039\u100d"),
+    # TTA + TTHA stacked
+    ("\u1092", "\u100b\u1039\u100c"),
+    # Special MA + BBA combination
+    ("\u1019\u102c(\u107b|\u1093)", "\u1019\u1039\u1018\u102c"),
+    # Stacked BHA variants
+    ("(\u107b|\u1093)", "\u1039\u1018"),
+    # Dot below variants -> U+1037
+    ("(\u1094|\u1095)", "\u1037"),
+    # Reorder consonant + dot + AI
+    ("([\u1000-\u1021])\u1037\u1032", "\\1\u1032\u1037"),
+    # Stacked TA + medial WA combination
+    ("\u1096", "\u1039\u1010\u103d"),
+    # Stacked TTA + TTA
+    ("\u1097", "\u100b\u1039\u100b"),
+    # Reorder medial RA + consonant
+    ("\u103c([\u1000-\u1021])([\u1000-\u1021])?", "\\1\u103c\\2"),
+    # Reorder consonant + medial RA + medial YA
+    ("([\u1000-\u1021])\u103c\u103a", "\u103c\\1\u103a"),
+    # Digit 7 -> RA in certain contexts
+    ("\u1047(?=[\u102c-\u1030\u1032\u1036-\u1038\u103d\u103e])", "\u101b"),
+    # E vowel + digit 7 -> E vowel + RA
+    ("\u1031\u1047", "\u1031\u101b"),
+    # Digit 0 -> WA in certain contexts
+    ("\u1040(\u102e|\u102f|\u102d\u102f|\u1030|\u1036|\u103d|\u103e)", "\u101d\\1"),
+    # Digit 0 + AA -> WA + AA (not after digits)
+    ("([^\u1040\u1041\u1042\u1043\u1044\u1045\u1046\u1047\u1048\u1049])\u1040\u102b", "\\1\u101d\u102b"),
+    # Digit 0 + AA -> WA + AA (after digits, not followed by visarga)
+    ("([\u1040\u1041\u1042\u1043\u1044\u1045\u1046\u1047\u1048\u1049])\u1040\u102b(?!\u1038)", "\\1\u101d\u102b"),
+    # Digit 0 at start + AA -> WA
+    ("^\u1040(?=\u102b)", "\u101d"),
+    # Digit 0 + vowel I -> WA + vowel I (not before space+slash)
+    ("\u1040\u102d(?!\u0020?/)", "\u101d\u102d"),
+    # Digit 0 -> WA (between non-digits)
+    ("([^\u1040-\u1049])\u1040([^\u1040-\u1049\u0020]|[\u104a\u104b])", "\\1\u101d\\2"),
+    # Digit 0 -> WA (before newline, not after digit)
+    ("([^\u1040-\u1049])\u1040(?=[\\f\\n\\r])", "\\1\u101d"),
+    # Digit 0 -> WA (at end, not after digit)
+    ("([^\u1040-\u1049])\u1040$", "\\1\u101d"),
+    # Reorder E vowel after consonant and medials
+    ("\u1031([\u1000-\u1021\u103f])(\u103e)?(\u103b)?", "\\1\\2\\3\u1031"),
+    # Reorder E vowel after consonant and remaining medials
+    ("([\u1000-\u1021])\u1031([\u103b\u103c\u103d\u103e]+)", "\\1\\2\u1031"),
+    # Reorder AI and medial WA
+    ("\u1032\u103d", "\u103d\u1032"),
+    # Reorder vowel I/II and medial YA
+    ("([\u102d\u102e])\u103b", "\u103b\\1"),
+    # Reorder medial WA and YA
+    ("\u103d\u103b", "\u103b\u103d"),
+    # Reorder asat and dot below
+    ("\u103a\u1037", "\u1037\u103a"),
+    # Remove duplicate U after vowel
+    ("\u102f(\u102d|\u102e|\u1036|\u1037)\u102f", "\u102f\\1"),
+    # Reorder U/UU and vowel I/II
+    ("(\u102f|\u1030)(\u102d|\u102e)", "\\2\\1"),
+    # Reorder medial HA and YA/RA
+    ("(\u103e)(\u103b|\u103c)", "\\2\\1"),
+    # U+1025 -> U+1009 before asat/AA
+    ("\u1025(?=[\u1037]?[\u103a\u102c])", "\u1009"),
+    # U+1025 + vowel II -> U+1026
+    ("\u1025\u102e", "\u1026"),
+    # CA + medial YA -> JHA
+    ("\u1005\u103b", "\u1008"),
+    # Reorder anusvara and U/UU
+    ("\u1036(\u102f|\u1030)", "\\1\u1036"),
+    # Reorder E + dot + medial HA
+    ("\u1031\u1037\u103e", "\u103e\u1031\u1037"),
+    # Reorder E + medial HA + AA
+    ("\u1031\u103e\u102c", "\u103e\u1031\u102c"),
+    # Tall AA + asat combination
+    ("\u105a", "\u102b\u103a"),
+    # Reorder E + medial YA + medial HA
+    ("\u1031\u103b\u103e", "\u103b\u103e\u1031"),
+    # Reorder vowel I/II and medial WA/HA
+    ("(\u102d|\u102e)(\u103d|\u103e)", "\\2\\1"),
+    # Reorder AA and stacked consonant
+    ("\u102c\u1039([\u1000-\u1021])", "\u1039\\1\u102c"),
+    # Complex reordering with medial RA + asat + stacked
+    ("\u1039\u103c\u103a\u1039([\u1000-\u1021])", "\u103a\u1039\\1\u103c"),
+    # Reorder medial RA and stacked consonant
+    ("\u103c\u1039([\u1000-\u1021])", "\u1039\\1\u103c"),
+    # Reorder anusvara and stacked consonant
+    ("\u1036\u1039([\u1000-\u1021])", "\u1039\\1\u1036"),
+    # Expand abbreviated form
+    ("\u104e", "\u104e\u1004\u103a\u1038"),
+    # Digit 0 + AA/AI -> WA + AA/AI
+    ("\u1040(\u102b|\u102c|\u1036)", "\u101d\\1"),
+    # U+1025 + asat -> U+1009 + asat
+    ("\u1025\u1039", "\u1009\u1039"),
+    # Reorder consonant + medial RA + E + medial WA
+    ("([\u1000-\u1021])\u103c\u1031\u103d", "\\1\u103c\u103d\u1031"),
+    # Reorder consonant + medial YA + E + medial WA + optional HA
+    ("([\u1000-\u1021])\u103b\u1031\u103d(\u103e)?", "\\1\u103b\u103d\\2\u1031"),
+    # Reorder consonant + medial WA + E + medial YA
+    ("([\u1000-\u1021])\u103d\u1031\u103b", "\\1\u103b\u103d\u1031"),
+    # Reorder consonant + E + stacked consonant
+    ("([\u1000-\u1021])\u1031(\u1039[\u1000-\u1021]\u103d?)", "\\1\\2\u1031"),
+    # Reorder visarga and asat
+    ("\u1038\u103a", "\u103a\u1038"),
+    # Remove redundant vowel I + asat combinations
+    ("\u102d\u103a|\u103a\u102d", "\u102d"),
+    # Remove asat after vowel I + U
+    ("\u102d\u102f\u103a", "\u102d\u102f"),
+    # Remove space before dot below
+    ("\u0020\u1037", "\u1037"),
+    # Reorder dot below and anusvara
+    ("\u1037\u1036", "\u1036\u1037"),
+    # Remove duplicate vowel I
+    ("[\u102d]+", "\u102d"),
+    # Remove duplicate asat
+    ("[\u103a]+", "\u103a"),
+    # Remove duplicate medial WA
+    ("[\u103d]+", "\u103d"),
+    # Remove duplicate dot below
+    ("[\u1037]+", "\u1037"),
+    # Remove duplicate vowel II
+    ("[\u102e]+", "\u102e"),
+    # Normalize vowel I + II -> II
+    ("\u102d\u102e|\u102e\u102d", "\u102e"),
+    # Reorder U + vowel I
+    ("\u102f\u102d", "\u102d\u102f"),
+    # Remove double dot below
+    ("\u1037\u1037", "\u1037"),
+    # Remove double AI
+    ("\u1032\u1032", "\u1032"),
+    # Digit 4 + NGA + asat + visarga -> abbreviated form
+    ("\u1044\u1004\u103a\u1038", "\u104E\u1004\u103a\u1038"),
+    # Reorder vowel I/II + stacked consonant
+    ("([\u102d\u102e])\u1039([\u1000-\u1021])", "\u1039\\2\\1"),
+    # Reorder medial RA + E + stacked consonant
+    ("(\u103c\u1031)\u1039([\u1000-\u1021])", "\u1039\\2\\1"),
+    # Reorder anusvara and medial WA
+    ("\u1036\u103d", "\u103d\u1036"),
+    # Digit 7 -> RA in certain contexts (final)
+    ("\u1047((?=[\u1000-\u1021]\u103a)|(?=[\u102c-\u1030\u1032\u1036-\u1038\u103d\u103e]))", "\u101b"),
+]

paraencoder-0.1.0.dist-info/METADATA ADDED Viewed

@@ -0,0 +1,133 @@
+Metadata-Version: 2.4
+Name: paraencoder
+Version: 0.1.0
+Summary: Burmese text detection and conversion toolkit for Zawgyi and Unicode
+Project-URL: Homepage, https://github.com/Laitei40/ParaEncoder
+Project-URL: Repository, https://github.com/Laitei40/ParaEncoder
+Project-URL: Issues, https://github.com/Laitei40/ParaEncoder/issues
+Author: Para Maintainers
+License: MIT
+License-File: LICENSE
+Keywords: burmese,conversion,myanmar,text,unicode,zawgyi
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Operating System :: OS Independent
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3 :: Only
+Classifier: Topic :: Text Processing :: Linguistic
+Requires-Python: >=3.9
+Provides-Extra: test
+Requires-Dist: pytest>=7; extra == 'test'
+Description-Content-Type: text/markdown
+# Para
+Para is a small, boring, and transparent toolkit for working with Burmese text. It detects whether text is encoded in Zawgyi or Unicode and converts Zawgyi to Unicode using a rule-based approach. Para never invents a new encoding and keeps its APIs explicit.
+## Goals
+- Be Unicode-first and never invent a new encoding.
+- Offer stable, explicit APIs without side effects or magic imports.
+- Provide deterministic Zawgyi vs Unicode detection.
+- Convert Zawgyi to Unicode with maintainable, rule-based logic (Parabaik-style), not machine learning.
+- Stay batch-friendly for spreadsheets, CSVs, and plain text.
+- Avoid heavy native dependencies.
+- Be honest about limitations and edge cases.
+## Installation
+```bash
+pip install paraencoder
+```
+## Usage
+```python
+from para.detect import is_zawgyi, detect_encoding
+from para.convert import zg_to_unicode
+from para.normalize import normalize_unicode
+text = "\u1031\u1010\u1004\u103a"  # Zawgyi-encoded string
+if is_zawgyi(text):
+    cleaned = zg_to_unicode(text)
+    cleaned = normalize_unicode(cleaned)
+```
+### CLI
+Detect encoding:
+```bash
+echo "\u1031\u1010\u1004\u103a" | para detect
+```
+Convert Zawgyi to Unicode:
+```bash
+echo "\u1031\u1010\u1004\u103a" | para convert > output.txt
+```
+Convert a file and write the result to a separate output file (omit `--output` to write to stdout instead):
+```bash
+para convert --input input.txt --output output.txt
+```
+#### Windows / PowerShell note
+PowerShell's default encoding corrupts Myanmar text in pipes. Before piping Burmese text, set UTF-8 encoding:
+```powershell
+$OutputEncoding = [System.Text.Encoding]::UTF8
+[Console]::InputEncoding = [System.Text.Encoding]::UTF8
+[Console]::OutputEncoding = [System.Text.Encoding]::UTF8
+echo "ျမန္မာ" | para convert
+```
+Or use file-based input/output to avoid pipe issues:
+```powershell
+para convert --input input.txt --output output.txt
+```
+## API surface
+- `para.detect.is_zawgyi(text: str) -> bool`
+    - Input: `text` string.
+    - Output: `True` only when the detector score prefers Zawgyi; otherwise `False`.
+    - Guarantee: Never raises on empty/ASCII-only input; returns `False` for those.
+- `para.detect.detect_encoding(text: str) -> Literal["zawgyi", "unicode", "unknown"]`
+    - Input: `text` string.
+    - Output: One of the three labels. Ties or insufficient evidence → `"unknown"` (no auto-conversion).
+    - Guarantee: Deterministic, no network/ML, explicit tie handling.
+- `para.convert.zg_to_unicode(text: str, *, normalize: bool = True, force: bool = False) -> str`
+    - Input: `text` string.
+    - Output: Converted Unicode string when detection prefers Zawgyi (or when `force=True`). Otherwise the input is returned unchanged (no conversion, no normalization).
+    - Guarantee: Ordered, test-backed regex rules; no Unicode→Zawgyi path; `force=False` avoids silent conversion on ambiguous text.
+- `para.normalize.normalize_unicode(text: str) -> str`
+    - Input: `text` string.
+    - Output: The input string, unchanged. Normalization is disabled in v0.1.0 (no NFC, no reordering) because earlier reordering logic corrupted valid Unicode text.
+    - Guarantee: Idempotent on already-normalized Unicode Burmese.
+- `para.io.read_text(path: str, *, encoding: str = "utf-8") -> str`
+- `para.io.write_text(path: str, data: str, *, encoding: str = "utf-8") -> None`
+- `para.io.convert_file(...) -> str`
+    - Batch helpers for files; never guess encodings beyond the provided `encoding` argument.
+## Detection approach
+Detection is deterministic and rule-based. Para scores the input with Zawgyi-specific patterns (e.g., `U+1031` prefix order, `U+105A`, stacked medials) and Unicode-only patterns (e.g., valid ordering of medials, `U+103A` usage). The side with the higher score wins; ties produce `"unknown"`. No machine learning, no network calls.
+## Conversion approach
+Conversion uses an ordered list of regex replacements derived from Parabaik-style mappings. The rules are explicit, unit-tested, and live in `para.rules`. The converter does not attempt Unicode-to-Zawgyi; it only supports Zawgyi-to-Unicode because Unicode is the target canonical encoding.
+## Limitations
+- Ambiguous short strings (e.g., ASCII-only) return `"unknown"` and pass through unchanged.
+- Extremely malformed Zawgyi text may require manual cleanup.
+- The converter focuses on common Zawgyi usage; rare legacy ligatures may need additional rules.
+## Non-goals
+- Creating or endorsing any new Burmese encoding.
+- Unicode-to-Zawgyi conversion.
+- ML-based detection or probabilistic auto-conversion.
+- Silent mutation of text when detection confidence is low; ties stay `"unknown"`.
+## Contributing
+Issues and pull requests are welcome. Keep changes readable and testable.
+## Packaging
+- Build a wheel/sdist locally: `python -m pip install build` then `python -m build`.
+- Publish to PyPI (once ready): `python -m pip install twine` then `twine upload dist/*`.
+- The package metadata in `pyproject.toml` is PyPI-ready (MIT license, explicit packages, CLI entrypoint).
+## License
+MIT

paraencoder-0.1.0.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,12 @@
+para/__init__.py,sha256=XgvX7tM1z4fLz6yEjcJJU4jW1OzR5SAUaXYuKwZ352s,319
+para/cli.py,sha256=_hZsUTXKAS_X1zO7GDM1zbNjexnJ8dfPsFpMUz9BJIg,3154
+para/convert.py,sha256=hpsqjjt8kgEnOfryw1sDYE6RsTX-INNm8hGuL1pqZeA,1370
+para/detect.py,sha256=rGask21S1ST1KwZnvPT-SFpODGXJ6-VAAkLfaalsKKk,1929
+para/io.py,sha256=jG-vB7y_x7dn-nHjMrygn3e9jz-FDsxRRtjJylCHDeA,1074
+para/normalize.py,sha256=k4a8-OtYh-bbPAwGytpP92CwiX_R9QNZSDjdccSgYEM,784
+para/rules.py,sha256=U1uIxYW2Ag-Y8ZNa0DY5KsdLKNw5fwHztpYhjRo9GfA,9753
+paraencoder-0.1.0.dist-info/METADATA,sha256=iU0fpxoFo9RW2omwMxjJ-CrYDk3a8PP-WMQktZEuG-I,5700
+paraencoder-0.1.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+paraencoder-0.1.0.dist-info/entry_points.txt,sha256=Dn1jwtUjVRTWNPcpkWvVzPcCZMJTvDcOmi6DT1F_A2E,39
+paraencoder-0.1.0.dist-info/licenses/LICENSE,sha256=ykJYlrfnN4vfXeFv-XrRR5Yzftp-F9TlSYiXDcNTfTY,1073
+paraencoder-0.1.0.dist-info/RECORD,,

paraencoder-0.1.0.dist-info/WHEEL ADDED Viewed

@@ -0,0 +1,4 @@
+Wheel-Version: 1.0
+Generator: hatchling 1.28.0
+Root-Is-Purelib: true
+Tag: py3-none-any

paraencoder-0.1.0.dist-info/entry_points.txt ADDED Viewed

	@@ -0,0 +1,2 @@
1	+ [console_scripts]
2	+ para = para.cli:main

paraencoder-0.1.0.dist-info/licenses/LICENSE ADDED Viewed

@@ -0,0 +1,21 @@
+MIT License
+Copyright (c) 2026 Para Maintainers
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.