paraencoder 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
para/__init__.py ADDED
@@ -0,0 +1,14 @@
1
+ """Para: Burmese text detection and conversion toolkit."""
2
+
3
+ __all__ = [
4
+ "is_zawgyi",
5
+ "detect_encoding",
6
+ "zg_to_unicode",
7
+ "normalize_unicode",
8
+ ]
9
+
10
+ from para.detect import detect_encoding, is_zawgyi
11
+ from para.convert import zg_to_unicode
12
+ from para.normalize import normalize_unicode
13
+
14
+ __version__ = "0.1.0"
para/cli.py ADDED
@@ -0,0 +1,101 @@
1
+ """Command line entrypoint for Para."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import argparse
6
+ import sys
7
+ from typing import Optional
8
+
9
+ from para.convert import zg_to_unicode
10
+ from para.detect import detect_encoding, is_zawgyi
11
+ from para.io import convert_file, read_text, write_text
12
+ from para.normalize import normalize_unicode
13
+
14
+
15
def _read_input(input_path: Optional[str]) -> str:
    """Return the text to process.

    Args:
        input_path: Path of the file to read, or a falsy value to read
            everything from standard input instead.

    Returns:
        The file contents as returned by ``read_text``, or the full contents
        of ``sys.stdin``.
    """
    if not input_path:
        return sys.stdin.read()
    return read_text(input_path)
19
+
20
+
21
def _write_output(data: str, output_path: Optional[str]) -> None:
    """Emit ``data`` to a file or to standard output.

    Args:
        data: Text to emit.
        output_path: Destination file path; when falsy the text is written to
            ``sys.stdout`` instead.
    """
    if not output_path:
        sys.stdout.write(data)
        return
    write_text(output_path, data)
26
+
27
+
28
def _cmd_detect(args: argparse.Namespace) -> int:
    """Handle ``para detect``: print the detected encoding label.

    Reads the input selected by ``args.input`` (file or stdin), runs
    ``detect_encoding`` on it and writes the label followed by a newline to
    standard output.

    Returns:
        Always ``0``.
    """
    label = detect_encoding(_read_input(args.input))
    sys.stdout.write(f"{label}\n")
    return 0
33
+
34
+
35
def _cmd_convert(args: argparse.Namespace) -> int:
    """Handle ``para convert``.

    Without ``--input`` the text is read from stdin, converted with
    ``zg_to_unicode`` and handed to ``_write_output``. With ``--input`` the
    file is converted through ``convert_file`` (which also writes ``--output``
    when one is given); the result is echoed to stdout only when no output
    path was supplied.

    Returns:
        Always ``0``.
    """
    if not args.input:
        # Stdin path: convert in memory, then route to file or stdout.
        result = zg_to_unicode(
            sys.stdin.read(),
            normalize=not args.no_normalize,
            force=args.force,
        )
        _write_output(result, args.output)
        return 0

    # File path: convert_file handles reading and (optionally) writing.
    result = convert_file(
        input_path=args.input,
        output_path=args.output,
        assume_zawgyi=args.force,
        normalize=not args.no_normalize,
    )
    if not args.output:
        sys.stdout.write(result)
    return 0
54
+
55
+
56
def _cmd_normalize(args: argparse.Namespace) -> int:
    """Handle ``para normalize``.

    Reads the input selected by ``args.input``, passes it through
    ``normalize_unicode`` and writes the result to ``args.output`` (or stdout
    when no output path is given).

    Returns:
        Always ``0``.
    """
    result = normalize_unicode(_read_input(args.input))
    _write_output(result, args.output)
    return 0
61
+
62
+
63
def build_parser() -> argparse.ArgumentParser:
    """Construct the ``para`` command-line parser.

    The parser requires one of three sub-commands and stores the matching
    handler in the ``func`` default of each:

    * ``detect``    -- ``--input``
    * ``convert``   -- ``--input``, ``--output``, ``--force``, ``--no-normalize``
    * ``normalize`` -- ``--input``, ``--output``

    Returns:
        The fully configured ``argparse.ArgumentParser``.
    """
    root = argparse.ArgumentParser(description="Para: Zawgyi ↔ Unicode tooling")
    commands = root.add_subparsers(dest="command", required=True)

    # --- detect -----------------------------------------------------------
    detect_cmd = commands.add_parser("detect", help="Detect encoding of input text")
    detect_cmd.add_argument("--input", help="Input file path; defaults to stdin")
    detect_cmd.set_defaults(func=_cmd_detect)

    # --- convert ----------------------------------------------------------
    convert_cmd = commands.add_parser("convert", help="Convert Zawgyi text to Unicode")
    convert_cmd.add_argument("--input", help="Input file path; defaults to stdin")
    convert_cmd.add_argument("--output", help="Output file path; defaults to stdout")
    # Boolean switches, registered in the same order as they appear in --help.
    switches = (
        ("--force", "Force conversion even if detection is uncertain"),
        ("--no-normalize", "Skip Unicode normalization step"),
    )
    for flag, description in switches:
        convert_cmd.add_argument(flag, action="store_true", help=description)
    convert_cmd.set_defaults(func=_cmd_convert)

    # --- normalize --------------------------------------------------------
    normalize_cmd = commands.add_parser("normalize", help="Normalize Unicode Burmese text")
    normalize_cmd.add_argument("--input", help="Input file path; defaults to stdin")
    normalize_cmd.add_argument("--output", help="Output file path; defaults to stdout")
    normalize_cmd.set_defaults(func=_cmd_normalize)

    return root
92
+
93
+
94
def main(argv: Optional[list[str]] = None) -> int:
    """Parse ``argv`` and dispatch to the selected sub-command handler.

    Args:
        argv: Argument list to parse; ``None`` lets ``argparse`` fall back to
            ``sys.argv[1:]``.

    Returns:
        The integer status returned by the sub-command handler.
    """
    namespace = build_parser().parse_args(argv)
    handler = namespace.func
    return handler(namespace)
98
+
99
+
100
+ if __name__ == "__main__": # pragma: no cover
101
+ raise SystemExit(main())
para/convert.py ADDED
@@ -0,0 +1,46 @@
1
+ """Rule-based Zawgyi to Unicode conversion."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import re
6
+ from typing import Iterable
7
+
8
+ from para.detect import detect_encoding, is_zawgyi
9
+ from para.normalize import normalize_unicode
10
+ from para.rules import ZAWGYI_TO_UNICODE_RULES
11
+
12
+
13
+ def _compile_rules(rules: Iterable[tuple[str, str]]) -> list[tuple[re.Pattern[str], str]]:
14
+ compiled: list[tuple[re.Pattern[str], str]] = []
15
+ for pattern, replacement in rules:
16
+ compiled.append((re.compile(pattern), replacement))
17
+ return compiled
18
+
19
+
20
+ _COMPILED_RULES = _compile_rules(ZAWGYI_TO_UNICODE_RULES)
21
+
22
+
23
def zg_to_unicode(text: str, *, normalize: bool = True, force: bool = False) -> str:
    """Convert Zawgyi-encoded text to Unicode with the ordered regex rules.

    Args:
        text: Input text that may be Zawgyi.
        normalize: When true, the converted text is additionally passed
            through ``normalize_unicode``.
        force: When true, skip detection and always apply the rules. When
            false, the rules run only if ``detect_encoding`` reports
            ``"zawgyi"``.

    Returns:
        ``""`` for empty input; the input unchanged when conversion is not
        forced and the detector does not report Zawgyi; otherwise the
        converted text.
    """
    if not text:
        return ""

    # Hard guard: input not detected as Zawgyi is returned untouched.
    if not (force or detect_encoding(text) == "zawgyi"):
        return text

    # Rules are order-sensitive, so apply them strictly in sequence.
    result = text
    for regex, replacement in _COMPILED_RULES:
        result = regex.sub(replacement, result)

    return normalize_unicode(result) if normalize else result
para/detect.py ADDED
@@ -0,0 +1,65 @@
1
+ """Deterministic detection for Zawgyi vs Unicode Burmese text."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import re
6
+ from typing import Literal
7
+
8
+ Encoding = Literal["zawgyi", "unicode", "unknown"]
9
+
10
+ _MYANMAR_RANGE = re.compile(r"[\u1000-\u109F]")
11
+
12
+ # If the scores differ by at most this margin (0 = exact tie only), the result is "unknown".
13
+ SCORE_TIE_MARGIN = 0
14
+
15
+ # Patterns that strongly suggest Zawgyi encoding.
16
+ _ZG_PATTERNS = [
17
+ (re.compile(r"[\u105A\u1060-\u1097]"), 4),
18
+ (re.compile(r"\u1031[\u103B-\u103E]"), 3),
19
+ (re.compile(r"\u1039[\u1000-\u1021]?\u1031"), 3),
20
+ (re.compile(r"\u103A\u103A"), 2),
21
+ (re.compile(r"\u1039[\u1000-\u109F]"), 2),
22
+ (re.compile(r"\u1031\u108A"), 3),
23
+ ]
24
+
25
+ # Patterns that indicate proper Unicode ordering or characters.
26
+ _UNI_PATTERNS = [
27
+ (re.compile(r"\u1031[\u1000-\u1021]"), 3),
28
+ (re.compile(r"\u102B\u103A"), 2),
29
+ (re.compile(r"\u103B[\u103C\u103D]"), 2),
30
+ (re.compile(r"\u103C[\u103E]"), 2),
31
+ (re.compile(r"\u1037[\u103A]"), 2),
32
+ (re.compile(r"\u1004\u103A\u1039"), 3),
33
+ (re.compile(r"[\u1000-\u1021]\u103C"), 2),
34
+ ]
35
+
36
+
37
+ def _score(text: str, patterns: list[tuple[re.Pattern[str], int]]) -> int:
38
+ score = 0
39
+ for pattern, weight in patterns:
40
+ matches = pattern.findall(text)
41
+ if matches:
42
+ score += len(matches) * weight
43
+ return score
44
+
45
+
46
def detect_encoding(text: str) -> Encoding:
    """Classify ``text`` as ``"zawgyi"``, ``"unicode"`` or ``"unknown"``.

    The text is scored against the Zawgyi and Unicode pattern tables and the
    higher score wins.

    Returns:
        ``"unknown"`` when the text is empty, contains no character in the
        U+1000-U+109F range, or the two scores differ by no more than
        ``SCORE_TIE_MARGIN``; otherwise the label of the higher-scoring side.
    """
    if not text or _MYANMAR_RANGE.search(text) is None:
        return "unknown"

    zawgyi_points = _score(text, _ZG_PATTERNS)
    unicode_points = _score(text, _UNI_PATTERNS)
    gap = zawgyi_points - unicode_points

    # Too close to call: refuse to guess.
    if abs(gap) <= SCORE_TIE_MARGIN:
        return "unknown"

    return "zawgyi" if gap > 0 else "unicode"
61
+
62
+
63
def is_zawgyi(text: str) -> bool:
    """Return True only when ``detect_encoding`` labels ``text`` as Zawgyi."""
    verdict = detect_encoding(text)
    return verdict == "zawgyi"
para/io.py ADDED
@@ -0,0 +1,42 @@
1
+ """Batch-friendly I/O helpers."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from pathlib import Path
6
+ from typing import Optional
7
+
8
+ from para.convert import zg_to_unicode
9
+
10
+
11
+ DEFAULT_ENCODING = "utf-8"
12
+
13
+
14
def read_text(path: str, *, encoding: str = DEFAULT_ENCODING) -> str:
    """Read the whole file at ``path`` and return it decoded with ``encoding``."""
    source = Path(path)
    return source.read_text(encoding=encoding)
16
+
17
+
18
def write_text(path: str, data: str, *, encoding: str = DEFAULT_ENCODING) -> None:
    """Write ``data`` to the file at ``path`` using ``encoding``."""
    target = Path(path)
    target.write_text(data, encoding=encoding)
20
+
21
+
22
def convert_file(
    *,
    input_path: str,
    output_path: Optional[str] = None,
    assume_zawgyi: bool = False,
    normalize: bool = True,
    encoding: str = DEFAULT_ENCODING,
) -> str:
    """Convert a file from Zawgyi to Unicode, optionally writing the result.

    Args:
        input_path: File to read.
        output_path: File to write the converted text to; nothing is written
            when this is falsy.
        assume_zawgyi: Forwarded to ``zg_to_unicode`` as ``force``.
        normalize: Forwarded to ``zg_to_unicode`` as ``normalize``.
        encoding: Text encoding used for both reading and writing.

    Returns:
        The converted text, whether or not it was written to a file.
    """
    source_text = read_text(input_path, encoding=encoding)
    result = zg_to_unicode(source_text, normalize=normalize, force=assume_zawgyi)

    if not output_path:
        return result

    write_text(output_path, result, encoding=encoding)
    return result
para/normalize.py ADDED
@@ -0,0 +1,20 @@
1
+ """Unicode-focused normalization helpers."""
2
+
3
+ from __future__ import annotations
4
+
5
+ # NOTE (v0.1.0): Normalization is intentionally disabled for safety.
6
+ # Previous reordering logic corrupted valid canonical Unicode text.
7
+ # Until a provably-safe implementation is available, this function is a
8
+ # strict no-op. Unicode safety > clever normalization.
9
+
10
+
11
def normalize_unicode(text: str) -> str:
    """Return ``text`` exactly as given (normalization is disabled in v0.1.0).

    The earlier reordering / NFC logic was removed because it corrupted
    canonical input such as "မင်္ဂလာပါ". Until a test-backed, opt-in
    implementation exists, this function is deliberately a no-op so that
    valid Unicode text is never modified.

    Args:
        text: Any string.

    Returns:
        The same string, unchanged.
    """
    return text
para/rules.py ADDED
@@ -0,0 +1,361 @@
1
+ """Zawgyi-to-Unicode conversion rules ported from ParaEncoder.
2
+
3
+ Rules are applied in order. Each rule is a (pattern, replacement) tuple.
4
+ Ported from: https://github.com/Laitei40/ParaEncoder
5
+ """
6
+
7
+ ZAWGYI_TO_UNICODE_RULES = [
8
+ # Remove duplicate diacritics
9
+ (r"([\u102D\u102E\u103D\u102F\u1037\u1095])\1+", r"\1"),
10
+
11
+ # Remove zero-width space
12
+ ("\u200B", ""),
13
+
14
+ # Medial combinations
15
+ ("\u103d\u103c", "\u108a"),
16
+
17
+ # Medial HA variants -> U+103E
18
+ ("(\u103d|\u1087)", "\u103e"),
19
+
20
+ # Medial WA: U+103C -> U+103D
21
+ ("\u103c", "\u103d"),
22
+
23
+ # Medial RA variants -> U+103C
24
+ ("(\u103b|\u107e|\u107f|\u1080|\u1081|\u1082|\u1083|\u1084)", "\u103c"),
25
+
26
+ # Medial YA variants -> U+103B
27
+ ("(\u103a|\u107d)", "\u103b"),
28
+
29
+ # Asat: U+1039 -> U+103A
30
+ ("\u1039", "\u103a"),
31
+
32
+ # Stacked CHA variants (-> virama + U+1006)
33
+ ("(\u1066|\u1067)", "\u1039\u1006"),
34
+
35
+ # NYA variant (U+106A -> U+1009)
36
+ ("\u106a", "\u1009"),
37
+
38
+ # NNYA variant (U+106B -> U+100A)
39
+ ("\u106b", "\u100a"),
40
+
41
+ # Stacked TTA
42
+ ("\u106c", "\u1039\u100b"),
43
+
44
+ # Stacked TTHA
45
+ ("\u106d", "\u1039\u100c"),
46
+
47
+ # Stacked DDA + DDA
48
+ ("\u106e", "\u100d\u1039\u100d"),
49
+
50
+ # Stacked DDA + DDHA
51
+ ("\u106f", "\u100d\u1039\u100e"),
52
+
53
+ # Stacked NNA
54
+ ("\u1070", "\u1039\u100f"),
55
+
56
+ # Stacked TA variants
57
+ ("(\u1071|\u1072)", "\u1039\u1010"),
58
+
59
+ # Stacked KA
60
+ ("\u1060", "\u1039\u1000"),
61
+
62
+ # Stacked KHA
63
+ ("\u1061", "\u1039\u1001"),
64
+
65
+ # Stacked GA
66
+ ("\u1062", "\u1039\u1002"),
67
+
68
+ # Stacked GHA
69
+ ("\u1063", "\u1039\u1003"),
70
+
71
+ # Stacked CA
72
+ ("\u1065", "\u1039\u1005"),
73
+
74
+ # Stacked JA
75
+ ("\u1068", "\u1039\u1007"),
76
+
77
+ # Stacked JHA
78
+ ("\u1069", "\u1039\u1008"),
79
+
80
+ # Stacked THA variants
81
+ ("(\u1073|\u1074)", "\u1039\u1011"),
82
+
83
+ # Stacked DA
84
+ ("\u1075", "\u1039\u1012"),
85
+
86
+ # Stacked DHA
87
+ ("\u1076", "\u1039\u1013"),
88
+
89
+ # Stacked NA
90
+ ("\u1077", "\u1039\u1014"),
91
+
92
+ # Stacked PA
93
+ ("\u1078", "\u1039\u1015"),
94
+
95
+ # Stacked PHA
96
+ ("\u1079", "\u1039\u1016"),
97
+
98
+ # Stacked BA
99
+ ("\u107a", "\u1039\u1017"),
100
+
101
+ # Stacked MA
102
+ ("\u107c", "\u1039\u1019"),
103
+
104
+ # Stacked LA
105
+ ("\u1085", "\u1039\u101c"),
106
+
107
+ # U vowel variant: U+1033 -> U+102F
108
+ ("\u1033", "\u102f"),
109
+
110
+ # UU vowel variant: U+1034 -> U+1030
111
+ ("\u1034", "\u1030"),
112
+
113
+ # Another U variant -> U+1030
114
+ ("\u103f", "\u1030"),
115
+
116
+ # Great SA -> U+103F
117
+ ("\u1086", "\u103f"),
118
+
119
+ # Reorder anusvara and medial HA+U
120
+ ("\u1036\u1088", "\u1088\u1036"),
121
+
122
+ # Medial HA + U combination
123
+ ("\u1088", "\u103e\u102f"),
124
+
125
+ # Medial HA + UU combination
126
+ ("\u1089", "\u103e\u1030"),
127
+
128
+ # Medial WA + HA combination
129
+ ("\u108a", "\u103d\u103e"),
130
+
131
+ # Reorder kinzi and medial YA
132
+ ("\u103B\u1064", "\u1064\u103B"),
133
+
134
+ # Reorder medial RA + consonant + kinzi
135
+ ("\u103c([\u1000-\u1021])([\u1064\u108b\u108d])", "\\1\u103c\\2"),
136
+
137
+ # Kinzi basic form
138
+ ("(\u1031)?([\u1000-\u1021\u1040-\u1049])(\u103c)?\u1064", "\u1004\u103a\u1039\\1\\2\\3"),
139
+
140
+ # Kinzi + vowel I
141
+ ("(\u1031)?([\u1000-\u1021])(\u103b|\u103c)?\u108b", "\u1004\u103a\u1039\\1\\2\\3\u102d"),
142
+
143
+ # Kinzi + vowel II
144
+ ("(\u1031)?([\u1000-\u1021])(\u103b)?\u108c", "\u1004\u103a\u1039\\1\\2\\3\u102e"),
145
+
146
+ # Kinzi + anusvara
147
+ ("(\u1031)?([\u1000-\u1021])([\u103b\u103c])?\u108d", "\u1004\u103a\u1039\\1\\2\\3\u1036"),
148
+
149
+ # Vowel I + anusvara combination
150
+ ("\u108e", "\u102d\u1036"),
151
+
152
+ # NA variant
153
+ ("\u108f", "\u1014"),
154
+
155
+ # RA variant
156
+ ("\u1090", "\u101b"),
157
+
158
+ # NNA + DDA stacked
159
+ ("\u1091", "\u100f\u1039\u100d"),
160
+
161
+ # TTA + TTHA stacked
162
+ ("\u1092", "\u100b\u1039\u100c"),
163
+
164
+ # MA + AA + stacked BHA -> MA + stacked BHA + AA
165
+ ("\u1019\u102c(\u107b|\u1093)", "\u1019\u1039\u1018\u102c"),
166
+
167
+ # Stacked BHA variants
168
+ ("(\u107b|\u1093)", "\u1039\u1018"),
169
+
170
+ # Dot below variants -> U+1037
171
+ ("(\u1094|\u1095)", "\u1037"),
172
+
173
+ # Reorder consonant + dot + AI
174
+ ("([\u1000-\u1021])\u1037\u1032", "\\1\u1032\u1037"),
175
+
176
+ # Stacked TA + medial WA combination
177
+ ("\u1096", "\u1039\u1010\u103d"),
178
+
179
+ # Stacked TTA + TTA
180
+ ("\u1097", "\u100b\u1039\u100b"),
181
+
182
+ # Reorder medial RA + consonant
183
+ ("\u103c([\u1000-\u1021])([\u1000-\u1021])?", "\\1\u103c\\2"),
184
+
185
+ # Reorder consonant + medial RA + medial YA
186
+ ("([\u1000-\u1021])\u103c\u103a", "\u103c\\1\u103a"),
187
+
188
+ # Digit 7 -> RA in certain contexts
189
+ ("\u1047(?=[\u102c-\u1030\u1032\u1036-\u1038\u103d\u103e])", "\u101b"),
190
+
191
+ # E vowel + digit 7 -> E vowel + RA
192
+ ("\u1031\u1047", "\u1031\u101b"),
193
+
194
+ # Digit 0 -> WA in certain contexts
195
+ ("\u1040(\u102e|\u102f|\u102d\u102f|\u1030|\u1036|\u103d|\u103e)", "\u101d\\1"),
196
+
197
+ # Digit 0 + AA -> WA + AA (not after digits)
198
+ ("([^\u1040\u1041\u1042\u1043\u1044\u1045\u1046\u1047\u1048\u1049])\u1040\u102b", "\\1\u101d\u102b"),
199
+
200
+ # Digit 0 + AA -> WA + AA (after digits, not followed by visarga)
201
+ ("([\u1040\u1041\u1042\u1043\u1044\u1045\u1046\u1047\u1048\u1049])\u1040\u102b(?!\u1038)", "\\1\u101d\u102b"),
202
+
203
+ # Digit 0 at start + AA -> WA
204
+ ("^\u1040(?=\u102b)", "\u101d"),
205
+
206
+ # Digit 0 + vowel I -> WA + vowel I (not before space+slash)
207
+ ("\u1040\u102d(?!\u0020?/)", "\u101d\u102d"),
208
+
209
+ # Digit 0 -> WA (between non-digits)
210
+ ("([^\u1040-\u1049])\u1040([^\u1040-\u1049\u0020]|[\u104a\u104b])", "\\1\u101d\\2"),
211
+
212
+ # Digit 0 -> WA (before newline, not after digit)
213
+ ("([^\u1040-\u1049])\u1040(?=[\\f\\n\\r])", "\\1\u101d"),
214
+
215
+ # Digit 0 -> WA (at end, not after digit)
216
+ ("([^\u1040-\u1049])\u1040$", "\\1\u101d"),
217
+
218
+ # Reorder E vowel after consonant and medials
219
+ ("\u1031([\u1000-\u1021\u103f])(\u103e)?(\u103b)?", "\\1\\2\\3\u1031"),
220
+
221
+ # Reorder E vowel after consonant and remaining medials
222
+ ("([\u1000-\u1021])\u1031([\u103b\u103c\u103d\u103e]+)", "\\1\\2\u1031"),
223
+
224
+ # Reorder AI and medial WA
225
+ ("\u1032\u103d", "\u103d\u1032"),
226
+
227
+ # Reorder vowel I/II and medial YA
228
+ ("([\u102d\u102e])\u103b", "\u103b\\1"),
229
+
230
+ # Reorder medial WA and YA
231
+ ("\u103d\u103b", "\u103b\u103d"),
232
+
233
+ # Reorder asat and dot below
234
+ ("\u103a\u1037", "\u1037\u103a"),
235
+
236
+ # Remove duplicate U after vowel
237
+ ("\u102f(\u102d|\u102e|\u1036|\u1037)\u102f", "\u102f\\1"),
238
+
239
+ # Reorder U/UU and vowel I/II
240
+ ("(\u102f|\u1030)(\u102d|\u102e)", "\\2\\1"),
241
+
242
+ # Reorder medial HA and YA/RA
243
+ ("(\u103e)(\u103b|\u103c)", "\\2\\1"),
244
+
245
+ # U+1025 -> U+1009 before asat/AA
246
+ ("\u1025(?=[\u1037]?[\u103a\u102c])", "\u1009"),
247
+
248
+ # U+1025 + vowel II -> U+1026
249
+ ("\u1025\u102e", "\u1026"),
250
+
251
+ # CA + medial YA -> JHA
252
+ ("\u1005\u103b", "\u1008"),
253
+
254
+ # Reorder anusvara and U/UU
255
+ ("\u1036(\u102f|\u1030)", "\\1\u1036"),
256
+
257
+ # Reorder E + dot + medial HA
258
+ ("\u1031\u1037\u103e", "\u103e\u1031\u1037"),
259
+
260
+ # Reorder E + medial HA + AA
261
+ ("\u1031\u103e\u102c", "\u103e\u1031\u102c"),
262
+
263
+ # Tall AA + asat combination
264
+ ("\u105a", "\u102b\u103a"),
265
+
266
+ # Reorder E + medial YA + medial HA
267
+ ("\u1031\u103b\u103e", "\u103b\u103e\u1031"),
268
+
269
+ # Reorder vowel I/II and medial WA/HA
270
+ ("(\u102d|\u102e)(\u103d|\u103e)", "\\2\\1"),
271
+
272
+ # Reorder AA and stacked consonant
273
+ ("\u102c\u1039([\u1000-\u1021])", "\u1039\\1\u102c"),
274
+
275
+ # Complex reordering with medial RA + asat + stacked
276
+ ("\u1039\u103c\u103a\u1039([\u1000-\u1021])", "\u103a\u1039\\1\u103c"),
277
+
278
+ # Reorder medial RA and stacked consonant
279
+ ("\u103c\u1039([\u1000-\u1021])", "\u1039\\1\u103c"),
280
+
281
+ # Reorder anusvara and stacked consonant
282
+ ("\u1036\u1039([\u1000-\u1021])", "\u1039\\1\u1036"),
283
+
284
+ # Expand abbreviated form
285
+ ("\u104e", "\u104e\u1004\u103a\u1038"),
286
+
287
+ # Digit 0 + tall AA / AA / anusvara -> WA + tall AA / AA / anusvara
288
+ ("\u1040(\u102b|\u102c|\u1036)", "\u101d\\1"),
289
+
290
+ # U+1025 + asat -> U+1009 + asat
291
+ ("\u1025\u1039", "\u1009\u1039"),
292
+
293
+ # Reorder consonant + medial RA + E + medial WA
294
+ ("([\u1000-\u1021])\u103c\u1031\u103d", "\\1\u103c\u103d\u1031"),
295
+
296
+ # Reorder consonant + medial YA + E + medial WA + optional HA
297
+ ("([\u1000-\u1021])\u103b\u1031\u103d(\u103e)?", "\\1\u103b\u103d\\2\u1031"),
298
+
299
+ # Reorder consonant + medial WA + E + medial YA
300
+ ("([\u1000-\u1021])\u103d\u1031\u103b", "\\1\u103b\u103d\u1031"),
301
+
302
+ # Reorder consonant + E + stacked consonant
303
+ ("([\u1000-\u1021])\u1031(\u1039[\u1000-\u1021]\u103d?)", "\\1\\2\u1031"),
304
+
305
+ # Reorder visarga and asat
306
+ ("\u1038\u103a", "\u103a\u1038"),
307
+
308
+ # Remove redundant vowel I + asat combinations
309
+ ("\u102d\u103a|\u103a\u102d", "\u102d"),
310
+
311
+ # Remove asat after vowel I + U
312
+ ("\u102d\u102f\u103a", "\u102d\u102f"),
313
+
314
+ # Remove space before dot below
315
+ ("\u0020\u1037", "\u1037"),
316
+
317
+ # Reorder dot below and anusvara
318
+ ("\u1037\u1036", "\u1036\u1037"),
319
+
320
+ # Remove duplicate vowel I
321
+ ("[\u102d]+", "\u102d"),
322
+
323
+ # Remove duplicate asat
324
+ ("[\u103a]+", "\u103a"),
325
+
326
+ # Remove duplicate medial WA
327
+ ("[\u103d]+", "\u103d"),
328
+
329
+ # Remove duplicate dot below
330
+ ("[\u1037]+", "\u1037"),
331
+
332
+ # Remove duplicate vowel II
333
+ ("[\u102e]+", "\u102e"),
334
+
335
+ # Normalize vowel I + II -> II
336
+ ("\u102d\u102e|\u102e\u102d", "\u102e"),
337
+
338
+ # Reorder U + vowel I
339
+ ("\u102f\u102d", "\u102d\u102f"),
340
+
341
+ # Remove double dot below
342
+ ("\u1037\u1037", "\u1037"),
343
+
344
+ # Remove double AI
345
+ ("\u1032\u1032", "\u1032"),
346
+
347
+ # Digit 4 + NGA + asat + visarga -> abbreviated form
348
+ ("\u1044\u1004\u103a\u1038", "\u104E\u1004\u103a\u1038"),
349
+
350
+ # Reorder vowel I/II + stacked consonant
351
+ ("([\u102d\u102e])\u1039([\u1000-\u1021])", "\u1039\\2\\1"),
352
+
353
+ # Reorder medial RA + E + stacked consonant
354
+ ("(\u103c\u1031)\u1039([\u1000-\u1021])", "\u1039\\2\\1"),
355
+
356
+ # Reorder anusvara and medial WA
357
+ ("\u1036\u103d", "\u103d\u1036"),
358
+
359
+ # Digit 7 -> RA in certain contexts (final)
360
+ ("\u1047((?=[\u1000-\u1021]\u103a)|(?=[\u102c-\u1030\u1032\u1036-\u1038\u103d\u103e]))", "\u101b"),
361
+ ]
@@ -0,0 +1,133 @@
1
+ Metadata-Version: 2.4
2
+ Name: paraencoder
3
+ Version: 0.1.0
4
+ Summary: Burmese text detection and conversion toolkit for Zawgyi and Unicode
5
+ Project-URL: Homepage, https://github.com/Laitei40/ParaEncoder
6
+ Project-URL: Repository, https://github.com/Laitei40/ParaEncoder
7
+ Project-URL: Issues, https://github.com/Laitei40/ParaEncoder/issues
8
+ Author: Para Maintainers
9
+ License: MIT
10
+ License-File: LICENSE
11
+ Keywords: burmese,conversion,myanmar,text,unicode,zawgyi
12
+ Classifier: License :: OSI Approved :: MIT License
13
+ Classifier: Operating System :: OS Independent
14
+ Classifier: Programming Language :: Python :: 3
15
+ Classifier: Programming Language :: Python :: 3 :: Only
16
+ Classifier: Topic :: Text Processing :: Linguistic
17
+ Requires-Python: >=3.9
18
+ Provides-Extra: test
19
+ Requires-Dist: pytest>=7; extra == 'test'
20
+ Description-Content-Type: text/markdown
21
+
22
+ # Para
23
+
24
+ Para is a small, boring, and transparent toolkit for working with Burmese text. It detects whether text is encoded in Zawgyi or Unicode and converts Zawgyi to Unicode using a rule-based approach. Para never invents a new encoding and keeps its APIs explicit.
25
+
26
+ ## Goals
27
+ - Be Unicode-first and never invent a new encoding.
28
+ - Offer stable, explicit APIs without side effects or magic imports.
29
+ - Provide deterministic Zawgyi vs Unicode detection.
30
+ - Convert Zawgyi to Unicode with maintainable, rule-based logic (Parabaik-style), not machine learning.
31
+ - Stay batch-friendly for spreadsheets, CSVs, and plain text.
32
+ - Avoid heavy native dependencies.
33
+ - Be honest about limitations and edge cases.
34
+
35
+ ## Installation
36
+ ```bash
37
+ pip install paraencoder
38
+ ```
39
+
40
+ ## Usage
41
+ ```python
42
+ from para.detect import is_zawgyi, detect_encoding
43
+ from para.convert import zg_to_unicode
44
+ from para.normalize import normalize_unicode
45
+
46
+ text = "\u1031\u1010\u1004\u103a" # Zawgyi-encoded string
47
+ if is_zawgyi(text):
48
+ cleaned = zg_to_unicode(text)
49
+ cleaned = normalize_unicode(cleaned)
50
+ ```
51
+
52
+ ### CLI
53
+ Detect encoding:
54
+ ```bash
55
+ echo "\u1031\u1010\u1004\u103a" | para detect
56
+ ```
57
+
58
+ Convert Zawgyi to Unicode:
59
+ ```bash
60
+ echo "\u1031\u1010\u1004\u103a" | para convert > output.txt
61
+ ```
62
+
63
+ Convert a file (the result goes to stdout unless `--output` is given):
64
+ ```bash
65
+ para convert --input input.txt --output output.txt
66
+ ```
67
+
68
+ #### Windows / PowerShell note
69
+ PowerShell's default encoding corrupts Myanmar text in pipes. Before piping Burmese text, set UTF-8 encoding:
70
+ ```powershell
71
+ $OutputEncoding = [System.Text.Encoding]::UTF8
72
+ [Console]::InputEncoding = [System.Text.Encoding]::UTF8
73
+ [Console]::OutputEncoding = [System.Text.Encoding]::UTF8
74
+ echo "ျမန္မာ" | para convert
75
+ ```
76
+ Or use file-based input/output to avoid pipe issues:
77
+ ```powershell
78
+ para convert --input input.txt --output output.txt
79
+ ```
80
+
81
+ ## API surface
82
+ - `para.detect.is_zawgyi(text: str) -> bool`
83
+ - Input: `text` string.
84
+ - Output: `True` only when the detector score prefers Zawgyi; otherwise `False`.
85
+ - Guarantee: Never raises on empty/ASCII-only input; returns `False` for those.
86
+
87
+ - `para.detect.detect_encoding(text: str) -> Literal["zawgyi", "unicode", "unknown"]`
88
+ - Input: `text` string.
89
+ - Output: One of the three labels. Ties or insufficient evidence → `"unknown"` (no auto-conversion).
90
+ - Guarantee: Deterministic, no network/ML, explicit tie handling.
91
+
92
+ - `para.convert.zg_to_unicode(text: str, *, normalize: bool = True, force: bool = False) -> str`
93
+ - Input: `text` string.
94
+ - Output: Converted Unicode string when detection prefers Zawgyi (or when `force=True`). Otherwise the input is returned unchanged.
95
+ - Guarantee: Ordered, test-backed regex rules; no Unicode→Zawgyi path; `force=False` avoids silent conversion on ambiguous text.
96
+
97
+ - `para.normalize.normalize_unicode(text: str) -> str`
98
+ - Input: `text` string.
99
+ - Output: The input text, unchanged. Normalization is disabled in v0.1.0 (the function is a no-op) because the earlier reordering logic corrupted valid Unicode text.
100
+ - Guarantee: Idempotent on already-normalized Unicode Burmese.
101
+
102
+ - `para.io.read_text(path: str, *, encoding: str = "utf-8") -> str`
103
+ - `para.io.write_text(path: str, data: str, *, encoding: str = "utf-8") -> None`
104
+ - `para.io.convert_file(...) -> str`
105
+ - Batch helpers for files; never guess encodings beyond the provided `encoding` argument.
106
+
107
+ ## Detection approach
108
+ Detection is deterministic and rule-based. Para scores the input with Zawgyi-specific patterns (e.g., `U+1031` prefix order, `U+105A`, stacked medials) and Unicode-only patterns (e.g., valid ordering of medials, `U+103A` usage). The side with the higher score wins; ties produce `"unknown"`. No machine learning, no network calls.
109
+
110
+ ## Conversion approach
111
+ Conversion uses an ordered list of regex replacements derived from Parabaik-style mappings. The rules are explicit, unit-tested, and live in `para.rules`. The converter does not attempt Unicode-to-Zawgyi; it only supports Zawgyi-to-Unicode because Unicode is the target canonical encoding.
112
+
113
+ ## Limitations
114
+ - Ambiguous short strings (e.g., ASCII-only) return `"unknown"` and pass through unchanged.
115
+ - Extremely malformed Zawgyi text may require manual cleanup.
116
+ - The converter focuses on common Zawgyi usage; rare legacy ligatures may need additional rules.
117
+
118
+ ## Non-goals
119
+ - Creating or endorsing any new Burmese encoding.
120
+ - Unicode-to-Zawgyi conversion.
121
+ - ML-based detection or probabilistic auto-conversion.
122
+ - Silent mutation of text when detection confidence is low; ties stay `"unknown"`.
123
+
124
+ ## Contributing
125
+ Issues and pull requests are welcome. Keep changes readable and testable.
126
+
127
+ ## Packaging
128
+ - Build a wheel/sdist locally: `python -m pip install build` then `python -m build`.
129
+ - Publish to PyPI (once ready): `python -m pip install twine` then `twine upload dist/*`.
130
+ - The package metadata in `pyproject.toml` is PyPI-ready (MIT license, explicit packages, CLI entrypoint).
131
+
132
+ ## License
133
+ MIT
@@ -0,0 +1,12 @@
1
+ para/__init__.py,sha256=XgvX7tM1z4fLz6yEjcJJU4jW1OzR5SAUaXYuKwZ352s,319
2
+ para/cli.py,sha256=_hZsUTXKAS_X1zO7GDM1zbNjexnJ8dfPsFpMUz9BJIg,3154
3
+ para/convert.py,sha256=hpsqjjt8kgEnOfryw1sDYE6RsTX-INNm8hGuL1pqZeA,1370
4
+ para/detect.py,sha256=rGask21S1ST1KwZnvPT-SFpODGXJ6-VAAkLfaalsKKk,1929
5
+ para/io.py,sha256=jG-vB7y_x7dn-nHjMrygn3e9jz-FDsxRRtjJylCHDeA,1074
6
+ para/normalize.py,sha256=k4a8-OtYh-bbPAwGytpP92CwiX_R9QNZSDjdccSgYEM,784
7
+ para/rules.py,sha256=U1uIxYW2Ag-Y8ZNa0DY5KsdLKNw5fwHztpYhjRo9GfA,9753
8
+ paraencoder-0.1.0.dist-info/METADATA,sha256=iU0fpxoFo9RW2omwMxjJ-CrYDk3a8PP-WMQktZEuG-I,5700
9
+ paraencoder-0.1.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
10
+ paraencoder-0.1.0.dist-info/entry_points.txt,sha256=Dn1jwtUjVRTWNPcpkWvVzPcCZMJTvDcOmi6DT1F_A2E,39
11
+ paraencoder-0.1.0.dist-info/licenses/LICENSE,sha256=ykJYlrfnN4vfXeFv-XrRR5Yzftp-F9TlSYiXDcNTfTY,1073
12
+ paraencoder-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.28.0
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ para = para.cli:main
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Para Maintainers
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.