paraencoder-0.1.0.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- paraencoder-0.1.0/.gitignore +10 -0
- paraencoder-0.1.0/LICENSE +21 -0
- paraencoder-0.1.0/PKG-INFO +133 -0
- paraencoder-0.1.0/README.md +112 -0
- paraencoder-0.1.0/output.txt +1 -0
- paraencoder-0.1.0/para/__init__.py +14 -0
- paraencoder-0.1.0/para/cli.py +101 -0
- paraencoder-0.1.0/para/convert.py +46 -0
- paraencoder-0.1.0/para/detect.py +65 -0
- paraencoder-0.1.0/para/io.py +42 -0
- paraencoder-0.1.0/para/normalize.py +20 -0
- paraencoder-0.1.0/para/rules.py +361 -0
- paraencoder-0.1.0/pyproject.toml +41 -0
- paraencoder-0.1.0/test.py +46 -0
- paraencoder-0.1.0/tests/test_cli.py +28 -0
- paraencoder-0.1.0/tests/test_convert.py +200 -0
- paraencoder-0.1.0/tests/test_detect.py +32 -0
- paraencoder-0.1.0/tests/test_normalize.py +14 -0
paraencoder-0.1.0/LICENSE
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2026 Para Maintainers

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
paraencoder-0.1.0/PKG-INFO
@@ -0,0 +1,133 @@
Metadata-Version: 2.4
Name: paraencoder
Version: 0.1.0
Summary: Burmese text detection and conversion toolkit for Zawgyi and Unicode
Project-URL: Homepage, https://github.com/Laitei40/ParaEncoder
Project-URL: Repository, https://github.com/Laitei40/ParaEncoder
Project-URL: Issues, https://github.com/Laitei40/ParaEncoder/issues
Author: Para Maintainers
License: MIT
License-File: LICENSE
Keywords: burmese,conversion,myanmar,text,unicode,zawgyi
Classifier: License :: OSI Approved :: MIT License
Classifier: Operating System :: OS Independent
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3 :: Only
Classifier: Topic :: Text Processing :: Linguistic
Requires-Python: >=3.9
Provides-Extra: test
Requires-Dist: pytest>=7; extra == 'test'
Description-Content-Type: text/markdown

# Para

Para is a small, boring, and transparent toolkit for working with Burmese text. It detects whether text is encoded in Zawgyi or Unicode and converts Zawgyi to Unicode using a rule-based approach. Para never invents a new encoding and keeps its APIs explicit.

## Goals
- Be Unicode-first and never invent a new encoding.
- Offer stable, explicit APIs without side effects or magic imports.
- Provide deterministic Zawgyi vs Unicode detection.
- Convert Zawgyi to Unicode with maintainable, rule-based logic (Parabaik-style), not machine learning.
- Stay batch-friendly for spreadsheets, CSVs, and plain text.
- Avoid heavy native dependencies.
- Be honest about limitations and edge cases.

## Installation
```bash
pip install paraencoder
```

## Usage
```python
from para.detect import is_zawgyi, detect_encoding
from para.convert import zg_to_unicode
from para.normalize import normalize_unicode

text = "\u103b\u1019\u1014\u1039\u1019\u102c"  # "ျမန္မာ" typed in Zawgyi
if is_zawgyi(text):
    cleaned = zg_to_unicode(text)
    cleaned = normalize_unicode(cleaned)
```

### CLI
Detect encoding:
```bash
echo "ျမန္မာ" | para detect   # prints "zawgyi"
```

Convert Zawgyi to Unicode:
```bash
echo "ျမန္မာ" | para convert > output.txt
```

Process a file (writes to stdout unless `--output` is given):
```bash
para convert --input input.txt --output output.txt
```

#### Windows / PowerShell note
PowerShell's default encoding corrupts Myanmar text in pipes. Before piping Burmese text, set UTF-8 encoding:
```powershell
$OutputEncoding = [System.Text.Encoding]::UTF8
[Console]::InputEncoding = [System.Text.Encoding]::UTF8
[Console]::OutputEncoding = [System.Text.Encoding]::UTF8
echo "ျမန္မာ" | para convert
```
Or use file-based input/output to avoid pipe issues:
```powershell
para convert --input input.txt --output output.txt
```

## API surface
- `para.detect.is_zawgyi(text: str) -> bool`
  - Input: `text` string.
  - Output: `True` only when the detector score prefers Zawgyi; otherwise `False`.
  - Guarantee: Never raises on empty/ASCII-only input; returns `False` for those.

- `para.detect.detect_encoding(text: str) -> Literal["zawgyi", "unicode", "unknown"]`
  - Input: `text` string.
  - Output: One of the three labels. Ties or insufficient evidence → `"unknown"` (no auto-conversion).
  - Guarantee: Deterministic, no network/ML, explicit tie handling.

- `para.convert.zg_to_unicode(text: str, *, normalize: bool = True, force: bool = False) -> str`
  - Input: `text` string.
  - Output: Converted Unicode string when detection prefers Zawgyi (or when `force=True`). Otherwise passes through (optionally normalized).
  - Guarantee: Ordered, test-backed regex rules; no Unicode→Zawgyi path; `force=False` avoids silent conversion on ambiguous text.

- `para.normalize.normalize_unicode(text: str) -> str`
  - Input: `text` string.
  - Output: The input unchanged in v0.1.0; normalization is intentionally a no-op until a provably safe implementation lands.
  - Guarantee: Never modifies valid Unicode Burmese; trivially idempotent.

- `para.io.read_text(path: str, *, encoding: str = "utf-8") -> str`
- `para.io.write_text(path: str, data: str, *, encoding: str = "utf-8") -> None`
- `para.io.convert_file(...) -> str`
  - Batch helpers for files; never guess encodings beyond the provided `encoding` argument (see the sketch below).
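
A minimal batch sketch built on these helpers; the `legacy/` and `out/` directory names are hypothetical, not part of the package:

```python
from pathlib import Path

from para.detect import detect_encoding
from para.io import convert_file

Path("out").mkdir(exist_ok=True)

# Convert every .txt file under legacy/ whose content the detector labels
# as Zawgyi; everything else is left untouched.
for path in Path("legacy").glob("*.txt"):
    if detect_encoding(path.read_text(encoding="utf-8")) == "zawgyi":
        convert_file(input_path=str(path), output_path=str(Path("out") / path.name))
```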

## Detection approach
Detection is deterministic and rule-based. Para scores the input with Zawgyi-specific patterns (e.g., `U+1031` prefix order, `U+105A`, stacked medials) and Unicode-only patterns (e.g., valid ordering of medials, `U+103A` usage). The side with the higher score wins; ties produce `"unknown"`. No machine learning, no network calls.
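
For illustration, a short sketch using strings taken from the package's own test fixtures; the scores are internal, only the returned labels are part of the API:

```python
from para.detect import detect_encoding, is_zawgyi

# Zawgyi ordering: E vowel (U+1031) and medial written before the base consonant.
print(detect_encoding("\u1031\u103B\u1000\u103A\u102C"))  # "zawgyi"

# Unicode ordering: base consonant first, then medial and signs.
print(detect_encoding("\u1019\u103C\u1014\u103A\u1038"))  # "unicode"

# Insufficient evidence stays "unknown" and is never auto-converted.
print(detect_encoding("hello"))  # "unknown"
print(is_zawgyi(""))             # False
```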

## Conversion approach
Conversion uses an ordered list of regex replacements derived from Parabaik-style mappings. The rules are explicit, unit-tested, and live in `para.rules`. The converter does not attempt Unicode-to-Zawgyi; it only supports Zawgyi-to-Unicode because Unicode is the target canonical encoding.
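
A minimal sketch of the call pattern, mirroring the package's unit tests; `force=True` skips detection for text you already know is Zawgyi:

```python
from para.convert import zg_to_unicode

# Known-Zawgyi input: convert unconditionally.
print(zg_to_unicode("ျမန္မာျပည္ကိုခ်စ္တယ္", force=True))  # မြန်မာပြည်ကိုချစ်တယ်

# With force=False (the default), non-Zawgyi input passes through unchanged.
print(zg_to_unicode("မင်္ဂလာပါ"))  # မင်္ဂလာပါ
```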

## Limitations
- Ambiguous short strings (e.g., ASCII-only) return `"unknown"` and pass through unchanged.
- Extremely malformed Zawgyi text may require manual cleanup.
- The converter focuses on common Zawgyi usage; rare legacy ligatures may need additional rules.

## Non-goals
- Creating or endorsing any new Burmese encoding.
- Unicode-to-Zawgyi conversion.
- ML-based detection or probabilistic auto-conversion.
- Silent mutation of text when detection confidence is low; ties stay `"unknown"`.

## Contributing
Issues and pull requests are welcome. Keep changes readable and testable.

## Packaging
- Build a wheel/sdist locally: `python -m pip install build` then `python -m build`.
- Publish to PyPI (once ready): `python -m pip install twine` then `twine upload dist/*`.
- The package metadata in `pyproject.toml` is PyPI-ready (MIT license, explicit packages, CLI entrypoint).

## License
MIT
paraencoder-0.1.0/README.md
@@ -0,0 +1,112 @@
# Para

Para is a small, boring, and transparent toolkit for working with Burmese text. It detects whether text is encoded in Zawgyi or Unicode and converts Zawgyi to Unicode using a rule-based approach. Para never invents a new encoding and keeps its APIs explicit.

## Goals
- Be Unicode-first and never invent a new encoding.
- Offer stable, explicit APIs without side effects or magic imports.
- Provide deterministic Zawgyi vs Unicode detection.
- Convert Zawgyi to Unicode with maintainable, rule-based logic (Parabaik-style), not machine learning.
- Stay batch-friendly for spreadsheets, CSVs, and plain text.
- Avoid heavy native dependencies.
- Be honest about limitations and edge cases.

## Installation
```bash
pip install paraencoder
```

## Usage
```python
from para.detect import is_zawgyi, detect_encoding
from para.convert import zg_to_unicode
from para.normalize import normalize_unicode

text = "\u103b\u1019\u1014\u1039\u1019\u102c"  # "ျမန္မာ" typed in Zawgyi
if is_zawgyi(text):
    cleaned = zg_to_unicode(text)
    cleaned = normalize_unicode(cleaned)
```

### CLI
Detect encoding:
```bash
echo "ျမန္မာ" | para detect   # prints "zawgyi"
```

Convert Zawgyi to Unicode:
```bash
echo "ျမန္မာ" | para convert > output.txt
```

Process a file (writes to stdout unless `--output` is given):
```bash
para convert --input input.txt --output output.txt
```

#### Windows / PowerShell note
PowerShell's default encoding corrupts Myanmar text in pipes. Before piping Burmese text, set UTF-8 encoding:
```powershell
$OutputEncoding = [System.Text.Encoding]::UTF8
[Console]::InputEncoding = [System.Text.Encoding]::UTF8
[Console]::OutputEncoding = [System.Text.Encoding]::UTF8
echo "ျမန္မာ" | para convert
```
Or use file-based input/output to avoid pipe issues:
```powershell
para convert --input input.txt --output output.txt
```

## API surface
- `para.detect.is_zawgyi(text: str) -> bool`
  - Input: `text` string.
  - Output: `True` only when the detector score prefers Zawgyi; otherwise `False`.
  - Guarantee: Never raises on empty/ASCII-only input; returns `False` for those.

- `para.detect.detect_encoding(text: str) -> Literal["zawgyi", "unicode", "unknown"]`
  - Input: `text` string.
  - Output: One of the three labels. Ties or insufficient evidence → `"unknown"` (no auto-conversion).
  - Guarantee: Deterministic, no network/ML, explicit tie handling.

- `para.convert.zg_to_unicode(text: str, *, normalize: bool = True, force: bool = False) -> str`
  - Input: `text` string.
  - Output: Converted Unicode string when detection prefers Zawgyi (or when `force=True`). Otherwise passes through (optionally normalized).
  - Guarantee: Ordered, test-backed regex rules; no Unicode→Zawgyi path; `force=False` avoids silent conversion on ambiguous text.

- `para.normalize.normalize_unicode(text: str) -> str`
  - Input: `text` string.
  - Output: The input unchanged in v0.1.0; normalization is intentionally a no-op until a provably safe implementation lands.
  - Guarantee: Never modifies valid Unicode Burmese; trivially idempotent.

- `para.io.read_text(path: str, *, encoding: str = "utf-8") -> str`
- `para.io.write_text(path: str, data: str, *, encoding: str = "utf-8") -> None`
- `para.io.convert_file(...) -> str`
  - Batch helpers for files; never guess encodings beyond the provided `encoding` argument (see the sketch below).
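
A minimal batch sketch built on these helpers; the `legacy/` and `out/` directory names are hypothetical, not part of the package:

```python
from pathlib import Path

from para.detect import detect_encoding
from para.io import convert_file

Path("out").mkdir(exist_ok=True)

# Convert every .txt file under legacy/ whose content the detector labels
# as Zawgyi; everything else is left untouched.
for path in Path("legacy").glob("*.txt"):
    if detect_encoding(path.read_text(encoding="utf-8")) == "zawgyi":
        convert_file(input_path=str(path), output_path=str(Path("out") / path.name))
```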

## Detection approach
Detection is deterministic and rule-based. Para scores the input with Zawgyi-specific patterns (e.g., `U+1031` prefix order, `U+105A`, stacked medials) and Unicode-only patterns (e.g., valid ordering of medials, `U+103A` usage). The side with the higher score wins; ties produce `"unknown"`. No machine learning, no network calls.
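
For illustration, a short sketch using strings taken from the package's own test fixtures; the scores are internal, only the returned labels are part of the API:

```python
from para.detect import detect_encoding, is_zawgyi

# Zawgyi ordering: E vowel (U+1031) and medial written before the base consonant.
print(detect_encoding("\u1031\u103B\u1000\u103A\u102C"))  # "zawgyi"

# Unicode ordering: base consonant first, then medial and signs.
print(detect_encoding("\u1019\u103C\u1014\u103A\u1038"))  # "unicode"

# Insufficient evidence stays "unknown" and is never auto-converted.
print(detect_encoding("hello"))  # "unknown"
print(is_zawgyi(""))             # False
```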

## Conversion approach
Conversion uses an ordered list of regex replacements derived from Parabaik-style mappings. The rules are explicit, unit-tested, and live in `para.rules`. The converter does not attempt Unicode-to-Zawgyi; it only supports Zawgyi-to-Unicode because Unicode is the target canonical encoding.
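
A minimal sketch of the call pattern, mirroring the package's unit tests; `force=True` skips detection for text you already know is Zawgyi:

```python
from para.convert import zg_to_unicode

# Known-Zawgyi input: convert unconditionally.
print(zg_to_unicode("ျမန္မာျပည္ကိုခ်စ္တယ္", force=True))  # မြန်မာပြည်ကိုချစ်တယ်

# With force=False (the default), non-Zawgyi input passes through unchanged.
print(zg_to_unicode("မင်္ဂလာပါ"))  # မင်္ဂလာပါ
```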

## Limitations
- Ambiguous short strings (e.g., ASCII-only) return `"unknown"` and pass through unchanged.
- Extremely malformed Zawgyi text may require manual cleanup.
- The converter focuses on common Zawgyi usage; rare legacy ligatures may need additional rules.

## Non-goals
- Creating or endorsing any new Burmese encoding.
- Unicode-to-Zawgyi conversion.
- ML-based detection or probabilistic auto-conversion.
- Silent mutation of text when detection confidence is low; ties stay `"unknown"`.

## Contributing
Issues and pull requests are welcome. Keep changes readable and testable.

## Packaging
- Build a wheel/sdist locally: `python -m pip install build` then `python -m build`.
- Publish to PyPI (once ready): `python -m pip install twine` then `twine upload dist/*`.
- The package metadata in `pyproject.toml` is PyPI-ready (MIT license, explicit packages, CLI entrypoint).

## License
MIT
paraencoder-0.1.0/output.txt
@@ -0,0 +1 @@
မျန္မာပျည္ကိုခ်စ္တယ္
paraencoder-0.1.0/para/__init__.py
@@ -0,0 +1,14 @@
"""Para: Burmese text detection and conversion toolkit."""

__all__ = [
    "is_zawgyi",
    "detect_encoding",
    "zg_to_unicode",
    "normalize_unicode",
]

from para.detect import detect_encoding, is_zawgyi
from para.convert import zg_to_unicode
from para.normalize import normalize_unicode

__version__ = "0.1.0"
paraencoder-0.1.0/para/cli.py
@@ -0,0 +1,101 @@
"""Command line entrypoint for Para."""

from __future__ import annotations

import argparse
import sys
from typing import Optional

from para.convert import zg_to_unicode
from para.detect import detect_encoding, is_zawgyi
from para.io import convert_file, read_text, write_text
from para.normalize import normalize_unicode


def _read_input(input_path: Optional[str]) -> str:
    if input_path:
        return read_text(input_path)
    return sys.stdin.read()


def _write_output(data: str, output_path: Optional[str]) -> None:
    if output_path:
        write_text(output_path, data)
    else:
        sys.stdout.write(data)


def _cmd_detect(args: argparse.Namespace) -> int:
    data = _read_input(args.input)
    encoding = detect_encoding(data)
    sys.stdout.write(f"{encoding}\n")
    return 0


def _cmd_convert(args: argparse.Namespace) -> int:
    if args.input:
        converted = convert_file(
            input_path=args.input,
            output_path=args.output,
            assume_zawgyi=args.force,
            normalize=not args.no_normalize,
        )
        if not args.output:
            sys.stdout.write(converted)
    else:
        data = sys.stdin.read()
        converted = zg_to_unicode(
            data,
            normalize=not args.no_normalize,
            force=args.force,
        )
        _write_output(converted, args.output)
    return 0


def _cmd_normalize(args: argparse.Namespace) -> int:
    data = _read_input(args.input)
    normalized = normalize_unicode(data)
    _write_output(normalized, args.output)
    return 0


def build_parser() -> argparse.ArgumentParser:
    parser = argparse.ArgumentParser(description="Para: Zawgyi ↔ Unicode tooling")
    sub = parser.add_subparsers(dest="command", required=True)

    detect_parser = sub.add_parser("detect", help="Detect encoding of input text")
    detect_parser.add_argument("--input", help="Input file path; defaults to stdin")
    detect_parser.set_defaults(func=_cmd_detect)

    convert_parser = sub.add_parser("convert", help="Convert Zawgyi text to Unicode")
    convert_parser.add_argument("--input", help="Input file path; defaults to stdin")
    convert_parser.add_argument("--output", help="Output file path; defaults to stdout")
    convert_parser.add_argument(
        "--force",
        action="store_true",
        help="Force conversion even if detection is uncertain",
    )
    convert_parser.add_argument(
        "--no-normalize",
        action="store_true",
        help="Skip Unicode normalization step",
    )
    convert_parser.set_defaults(func=_cmd_convert)

    normalize_parser = sub.add_parser("normalize", help="Normalize Unicode Burmese text")
    normalize_parser.add_argument("--input", help="Input file path; defaults to stdin")
    normalize_parser.add_argument("--output", help="Output file path; defaults to stdout")
    normalize_parser.set_defaults(func=_cmd_normalize)

    return parser


def main(argv: Optional[list[str]] = None) -> int:
    parser = build_parser()
    args = parser.parse_args(argv)
    return args.func(args)


if __name__ == "__main__":  # pragma: no cover
    raise SystemExit(main())
paraencoder-0.1.0/para/convert.py
@@ -0,0 +1,46 @@
"""Rule-based Zawgyi to Unicode conversion."""

from __future__ import annotations

import re
from typing import Iterable

from para.detect import detect_encoding, is_zawgyi
from para.normalize import normalize_unicode
from para.rules import ZAWGYI_TO_UNICODE_RULES


def _compile_rules(rules: Iterable[tuple[str, str]]) -> list[tuple[re.Pattern[str], str]]:
    compiled: list[tuple[re.Pattern[str], str]] = []
    for pattern, replacement in rules:
        compiled.append((re.compile(pattern), replacement))
    return compiled


_COMPILED_RULES = _compile_rules(ZAWGYI_TO_UNICODE_RULES)


def zg_to_unicode(text: str, *, normalize: bool = True, force: bool = False) -> str:
    """
    Convert Zawgyi text to Unicode using ordered regex rules.

    Args:
        text: Input text that may be Zawgyi.
        normalize: Whether to apply Unicode normalization and basic reordering.
        force: When False, conversion only runs if the detector believes the text is Zawgyi.
    """
    if not text:
        return ""

    # Hard guard: never modify non-Zawgyi input (contract guarantee).
    if not force and detect_encoding(text) != "zawgyi":
        return text

    converted = text
    for pattern, repl in _COMPILED_RULES:
        converted = pattern.sub(repl, converted)

    if normalize:
        converted = normalize_unicode(converted)

    return converted
paraencoder-0.1.0/para/detect.py
@@ -0,0 +1,65 @@
"""Deterministic detection for Zawgyi vs Unicode Burmese text."""

from __future__ import annotations

import re
from typing import Literal

Encoding = Literal["zawgyi", "unicode", "unknown"]

_MYANMAR_RANGE = re.compile(r"[\u1000-\u109F]")

# If the scores differ by no more than this margin, the result is "unknown".
SCORE_TIE_MARGIN = 0

# Patterns that strongly suggest Zawgyi encoding.
_ZG_PATTERNS = [
    (re.compile(r"[\u105A\u1060-\u1097]"), 4),
    (re.compile(r"\u1031[\u103B-\u103E]"), 3),
    (re.compile(r"\u1039[\u1000-\u1021]?\u1031"), 3),
    (re.compile(r"\u103A\u103A"), 2),
    (re.compile(r"\u1039[\u1000-\u109F]"), 2),
    (re.compile(r"\u1031\u108A"), 3),
]

# Patterns that indicate proper Unicode ordering or characters.
_UNI_PATTERNS = [
    (re.compile(r"\u1031[\u1000-\u1021]"), 3),
    (re.compile(r"\u102B\u103A"), 2),
    (re.compile(r"\u103B[\u103C\u103D]"), 2),
    (re.compile(r"\u103C[\u103E]"), 2),
    (re.compile(r"\u1037[\u103A]"), 2),
    (re.compile(r"\u1004\u103A\u1039"), 3),
    (re.compile(r"[\u1000-\u1021]\u103C"), 2),
]


def _score(text: str, patterns: list[tuple[re.Pattern[str], int]]) -> int:
    score = 0
    for pattern, weight in patterns:
        matches = pattern.findall(text)
        if matches:
            score += len(matches) * weight
    return score


def detect_encoding(text: str) -> Encoding:
    """Return "zawgyi", "unicode", or "unknown" based on heuristic scoring."""
    if not text:
        return "unknown"

    if not _MYANMAR_RANGE.search(text):
        return "unknown"

    zg_score = _score(text, _ZG_PATTERNS)
    uni_score = _score(text, _UNI_PATTERNS)

    if abs(zg_score - uni_score) <= SCORE_TIE_MARGIN:
        return "unknown"

    return "zawgyi" if zg_score > uni_score else "unicode"


def is_zawgyi(text: str) -> bool:
    """Convenience boolean: True when the detector prefers Zawgyi."""
    return detect_encoding(text) == "zawgyi"
paraencoder-0.1.0/para/io.py
@@ -0,0 +1,42 @@
"""Batch-friendly I/O helpers."""

from __future__ import annotations

from pathlib import Path
from typing import Optional

from para.convert import zg_to_unicode


DEFAULT_ENCODING = "utf-8"


def read_text(path: str, *, encoding: str = DEFAULT_ENCODING) -> str:
    return Path(path).read_text(encoding=encoding)


def write_text(path: str, data: str, *, encoding: str = DEFAULT_ENCODING) -> None:
    Path(path).write_text(data, encoding=encoding)


def convert_file(
    *,
    input_path: str,
    output_path: Optional[str] = None,
    assume_zawgyi: bool = False,
    normalize: bool = True,
    encoding: str = DEFAULT_ENCODING,
) -> str:
    """
    Convert a file from Zawgyi to Unicode and write the result.

    Returns the converted text. When ``output_path`` is None, the caller can
    capture the returned string.
    """
    data = read_text(input_path, encoding=encoding)
    converted = zg_to_unicode(data, normalize=normalize, force=assume_zawgyi)

    if output_path:
        write_text(output_path, converted, encoding=encoding)

    return converted
paraencoder-0.1.0/para/normalize.py
@@ -0,0 +1,20 @@
"""Unicode-focused normalization helpers."""

from __future__ import annotations

# NOTE (v0.1.0): Normalization is intentionally disabled for safety.
# Previous reordering logic corrupted valid canonical Unicode text.
# Until a provably-safe implementation is available, this function is a
# strict no-op. Unicode safety > clever normalization.


def normalize_unicode(text: str) -> str:
    """Return text unchanged (normalization disabled in v0.1.0 for safety).

    ParaEncoder must never modify valid Unicode text unless explicitly and
    provably necessary. Reordering / NFC logic has been removed because it
    corrupted canonical input such as "မင်္ဂလာပါ".

    Future versions may re-introduce opt-in, test-backed normalization.
    """
    return text
paraencoder-0.1.0/para/rules.py
@@ -0,0 +1,361 @@
"""Zawgyi-to-Unicode conversion rules ported from ParaEncoder.

Rules are applied in order. Each rule is a (pattern, replacement) tuple.
Ported from: https://github.com/Laitei40/ParaEncoder/issues/new
"""

ZAWGYI_TO_UNICODE_RULES = [
    # Remove duplicate diacritics
    (r"([\u102D\u102E\u103D\u102F\u1037\u1095])\1+", r"\1"),

    # Remove zero-width space
    ("\u200B", ""),

    # Medial combinations
    ("\u103d\u103c", "\u108a"),

    # Medial HA variants -> U+103E
    ("(\u103d|\u1087)", "\u103e"),

    # Medial WA: U+103C -> U+103D
    ("\u103c", "\u103d"),

    # Medial RA variants -> U+103C
    ("(\u103b|\u107e|\u107f|\u1080|\u1081|\u1082|\u1083|\u1084)", "\u103c"),

    # Medial YA variants -> U+103B
    ("(\u103a|\u107d)", "\u103b"),

    # Asat: U+1039 -> U+103A
    ("\u1039", "\u103a"),

    # Stacked SA variants
    ("(\u1066|\u1067)", "\u1039\u1006"),

    # NGA variant
    ("\u106a", "\u1009"),

    # NYA variant
    ("\u106b", "\u100a"),

    # Stacked TTA
    ("\u106c", "\u1039\u100b"),

    # Stacked TTHA
    ("\u106d", "\u1039\u100c"),

    # Stacked DDA + DDA
    ("\u106e", "\u100d\u1039\u100d"),

    # Stacked DDA + DDHA
    ("\u106f", "\u100d\u1039\u100e"),

    # Stacked NNA
    ("\u1070", "\u1039\u100f"),

    # Stacked TA variants
    ("(\u1071|\u1072)", "\u1039\u1010"),

    # Stacked KA
    ("\u1060", "\u1039\u1000"),

    # Stacked KHA
    ("\u1061", "\u1039\u1001"),

    # Stacked GA
    ("\u1062", "\u1039\u1002"),

    # Stacked GHA
    ("\u1063", "\u1039\u1003"),

    # Stacked CA
    ("\u1065", "\u1039\u1005"),

    # Stacked JA
    ("\u1068", "\u1039\u1007"),

    # Stacked JHA
    ("\u1069", "\u1039\u1008"),

    # Stacked THA variants
    ("(\u1073|\u1074)", "\u1039\u1011"),

    # Stacked DA
    ("\u1075", "\u1039\u1012"),

    # Stacked DHA
    ("\u1076", "\u1039\u1013"),

    # Stacked NA
    ("\u1077", "\u1039\u1014"),

    # Stacked PA
    ("\u1078", "\u1039\u1015"),

    # Stacked PHA
    ("\u1079", "\u1039\u1016"),

    # Stacked BA
    ("\u107a", "\u1039\u1017"),

    # Stacked MA
    ("\u107c", "\u1039\u1019"),

    # Stacked LA
    ("\u1085", "\u1039\u101c"),

    # Tall AA -> U+102F
    ("\u1033", "\u102f"),

    # Tall AA variant -> U+1030
    ("\u1034", "\u1030"),

    # Another U variant -> U+1030
    ("\u103f", "\u1030"),

    # Great SA -> U+103F
    ("\u1086", "\u103f"),

    # Reorder anusvara and medial HA+U
    ("\u1036\u1088", "\u1088\u1036"),

    # Medial HA + U combination
    ("\u1088", "\u103e\u102f"),

    # Medial HA + UU combination
    ("\u1089", "\u103e\u1030"),

    # Medial WA + HA combination
    ("\u108a", "\u103d\u103e"),

    # Reorder kinzi and medial YA
    ("\u103B\u1064", "\u1064\u103B"),

    # Reorder medial RA + consonant + kinzi
    ("\u103c([\u1000-\u1021])([\u1064\u108b\u108d])", "\\1\u103c\\2"),

    # Kinzi basic form
    ("(\u1031)?([\u1000-\u1021\u1040-\u1049])(\u103c)?\u1064", "\u1004\u103a\u1039\\1\\2\\3"),

    # Kinzi + vowel I
    ("(\u1031)?([\u1000-\u1021])(\u103b|\u103c)?\u108b", "\u1004\u103a\u1039\\1\\2\\3\u102d"),

    # Kinzi + vowel II
    ("(\u1031)?([\u1000-\u1021])(\u103b)?\u108c", "\u1004\u103a\u1039\\1\\2\\3\u102e"),

    # Kinzi + anusvara
    ("(\u1031)?([\u1000-\u1021])([\u103b\u103c])?\u108d", "\u1004\u103a\u1039\\1\\2\\3\u1036"),

    # Vowel I + anusvara combination
    ("\u108e", "\u102d\u1036"),

    # NA variant
    ("\u108f", "\u1014"),

    # RA variant
    ("\u1090", "\u101b"),

    # NNA + DDA stacked
    ("\u1091", "\u100f\u1039\u100d"),

    # TTA + TTHA stacked
    ("\u1092", "\u100b\u1039\u100c"),

    # Special MA + BBA combination
    ("\u1019\u102c(\u107b|\u1093)", "\u1019\u1039\u1018\u102c"),

    # Stacked BHA variants
    ("(\u107b|\u1093)", "\u1039\u1018"),

    # Dot below variants -> U+1037
    ("(\u1094|\u1095)", "\u1037"),

    # Reorder consonant + dot + AI
    ("([\u1000-\u1021])\u1037\u1032", "\\1\u1032\u1037"),

    # Stacked TA + medial WA combination
    ("\u1096", "\u1039\u1010\u103d"),

    # Stacked TTA + TTA
    ("\u1097", "\u100b\u1039\u100b"),

    # Reorder medial RA + consonant
    ("\u103c([\u1000-\u1021])([\u1000-\u1021])?", "\\1\u103c\\2"),

    # Reorder consonant + medial RA + medial YA
    ("([\u1000-\u1021])\u103c\u103a", "\u103c\\1\u103a"),

    # Digit 7 -> RA in certain contexts
    ("\u1047(?=[\u102c-\u1030\u1032\u1036-\u1038\u103d\u103e])", "\u101b"),

    # E vowel + digit 7 -> E vowel + RA
    ("\u1031\u1047", "\u1031\u101b"),

    # Digit 0 -> WA in certain contexts
    ("\u1040(\u102e|\u102f|\u102d\u102f|\u1030|\u1036|\u103d|\u103e)", "\u101d\\1"),

    # Digit 0 + AA -> WA + AA (not after digits)
    ("([^\u1040\u1041\u1042\u1043\u1044\u1045\u1046\u1047\u1048\u1049])\u1040\u102b", "\\1\u101d\u102b"),

    # Digit 0 + AA -> WA + AA (after digits, not followed by visarga)
    ("([\u1040\u1041\u1042\u1043\u1044\u1045\u1046\u1047\u1048\u1049])\u1040\u102b(?!\u1038)", "\\1\u101d\u102b"),

    # Digit 0 at start + AA -> WA
    ("^\u1040(?=\u102b)", "\u101d"),

    # Digit 0 + vowel I -> WA + vowel I (not before space+slash)
    ("\u1040\u102d(?!\u0020?/)", "\u101d\u102d"),

    # Digit 0 -> WA (between non-digits)
    ("([^\u1040-\u1049])\u1040([^\u1040-\u1049\u0020]|[\u104a\u104b])", "\\1\u101d\\2"),

    # Digit 0 -> WA (before newline, not after digit)
    ("([^\u1040-\u1049])\u1040(?=[\\f\\n\\r])", "\\1\u101d"),

    # Digit 0 -> WA (at end, not after digit)
    ("([^\u1040-\u1049])\u1040$", "\\1\u101d"),

    # Reorder E vowel after consonant and medials
    ("\u1031([\u1000-\u1021\u103f])(\u103e)?(\u103b)?", "\\1\\2\\3\u1031"),

    # Reorder E vowel after consonant and remaining medials
    ("([\u1000-\u1021])\u1031([\u103b\u103c\u103d\u103e]+)", "\\1\\2\u1031"),

    # Reorder AI and medial WA
    ("\u1032\u103d", "\u103d\u1032"),

    # Reorder vowel I/II and medial YA
    ("([\u102d\u102e])\u103b", "\u103b\\1"),

    # Reorder medial WA and YA
    ("\u103d\u103b", "\u103b\u103d"),

    # Reorder asat and dot below
    ("\u103a\u1037", "\u1037\u103a"),

    # Remove duplicate U after vowel
    ("\u102f(\u102d|\u102e|\u1036|\u1037)\u102f", "\u102f\\1"),

    # Reorder U/UU and vowel I/II
    ("(\u102f|\u1030)(\u102d|\u102e)", "\\2\\1"),

    # Reorder medial HA and YA/RA
    ("(\u103e)(\u103b|\u103c)", "\\2\\1"),

    # U+1025 -> U+1009 before asat/AA
    ("\u1025(?=[\u1037]?[\u103a\u102c])", "\u1009"),

    # U+1025 + vowel II -> U+1026
    ("\u1025\u102e", "\u1026"),

    # CA + medial YA -> JHA
    ("\u1005\u103b", "\u1008"),

    # Reorder anusvara and U/UU
    ("\u1036(\u102f|\u1030)", "\\1\u1036"),

    # Reorder E + dot + medial HA
    ("\u1031\u1037\u103e", "\u103e\u1031\u1037"),

    # Reorder E + medial HA + AA
    ("\u1031\u103e\u102c", "\u103e\u1031\u102c"),

    # Tall AA + asat combination
    ("\u105a", "\u102b\u103a"),

    # Reorder E + medial YA + medial HA
    ("\u1031\u103b\u103e", "\u103b\u103e\u1031"),

    # Reorder vowel I/II and medial WA/HA
    ("(\u102d|\u102e)(\u103d|\u103e)", "\\2\\1"),

    # Reorder AA and stacked consonant
    ("\u102c\u1039([\u1000-\u1021])", "\u1039\\1\u102c"),

    # Complex reordering with medial RA + asat + stacked
    ("\u1039\u103c\u103a\u1039([\u1000-\u1021])", "\u103a\u1039\\1\u103c"),

    # Reorder medial RA and stacked consonant
    ("\u103c\u1039([\u1000-\u1021])", "\u1039\\1\u103c"),

    # Reorder anusvara and stacked consonant
    ("\u1036\u1039([\u1000-\u1021])", "\u1039\\1\u1036"),

    # Expand abbreviated form
    ("\u104e", "\u104e\u1004\u103a\u1038"),

    # Digit 0 + AA/AI -> WA + AA/AI
    ("\u1040(\u102b|\u102c|\u1036)", "\u101d\\1"),

    # U+1025 + asat -> U+1009 + asat
    ("\u1025\u1039", "\u1009\u1039"),

    # Reorder consonant + medial RA + E + medial WA
    ("([\u1000-\u1021])\u103c\u1031\u103d", "\\1\u103c\u103d\u1031"),

    # Reorder consonant + medial YA + E + medial WA + optional HA
    ("([\u1000-\u1021])\u103b\u1031\u103d(\u103e)?", "\\1\u103b\u103d\\2\u1031"),

    # Reorder consonant + medial WA + E + medial YA
    ("([\u1000-\u1021])\u103d\u1031\u103b", "\\1\u103b\u103d\u1031"),

    # Reorder consonant + E + stacked consonant
    ("([\u1000-\u1021])\u1031(\u1039[\u1000-\u1021]\u103d?)", "\\1\\2\u1031"),

    # Reorder visarga and asat
    ("\u1038\u103a", "\u103a\u1038"),

    # Remove redundant vowel I + asat combinations
    ("\u102d\u103a|\u103a\u102d", "\u102d"),

    # Remove asat after vowel I + U
    ("\u102d\u102f\u103a", "\u102d\u102f"),

    # Remove space before dot below
    ("\u0020\u1037", "\u1037"),

    # Reorder dot below and anusvara
    ("\u1037\u1036", "\u1036\u1037"),

    # Remove duplicate vowel I
    ("[\u102d]+", "\u102d"),

    # Remove duplicate asat
    ("[\u103a]+", "\u103a"),

    # Remove duplicate medial WA
    ("[\u103d]+", "\u103d"),

    # Remove duplicate dot below
    ("[\u1037]+", "\u1037"),

    # Remove duplicate vowel II
    ("[\u102e]+", "\u102e"),

    # Normalize vowel I + II -> II
    ("\u102d\u102e|\u102e\u102d", "\u102e"),

    # Reorder U + vowel I
    ("\u102f\u102d", "\u102d\u102f"),

    # Remove double dot below
    ("\u1037\u1037", "\u1037"),

    # Remove double AI
    ("\u1032\u1032", "\u1032"),

    # Digit 4 + NGA + asat + visarga -> abbreviated form
    ("\u1044\u1004\u103a\u1038", "\u104E\u1004\u103a\u1038"),

    # Reorder vowel I/II + stacked consonant
    ("([\u102d\u102e])\u1039([\u1000-\u1021])", "\u1039\\2\\1"),

    # Reorder medial RA + E + stacked consonant
    ("(\u103c\u1031)\u1039([\u1000-\u1021])", "\u1039\\2\\1"),

    # Reorder anusvara and medial WA
    ("\u1036\u103d", "\u103d\u1036"),

    # Digit 7 -> RA in certain contexts (final)
    ("\u1047((?=[\u1000-\u1021]\u103a)|(?=[\u102c-\u1030\u1032\u1036-\u1038\u103d\u103e]))", "\u101b"),
]
paraencoder-0.1.0/pyproject.toml
@@ -0,0 +1,41 @@
[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

[project]
name = "paraencoder"
version = "0.1.0"
description = "Burmese text detection and conversion toolkit for Zawgyi and Unicode"
readme = "README.md"
requires-python = ">=3.9"
authors = [
    { name = "Para Maintainers" }
]
license = { text = "MIT" }
keywords = ["myanmar", "burmese", "zawgyi", "unicode", "text", "conversion"]
classifiers = [
    "Programming Language :: Python :: 3",
    "Programming Language :: Python :: 3 :: Only",
    "License :: OSI Approved :: MIT License",
    "Operating System :: OS Independent",
    "Topic :: Text Processing :: Linguistic",
]
dependencies = []

[project.urls]
Homepage = "https://github.com/Laitei40/ParaEncoder"
Repository = "https://github.com/Laitei40/ParaEncoder"
Issues = "https://github.com/Laitei40/ParaEncoder/issues"

[project.optional-dependencies]
test = ["pytest>=7"]

[project.scripts]
para = "para.cli:main"

[tool.hatch.build.targets.wheel]
packages = ["para"]

[tool.pytest.ini_options]
testpaths = ["tests"]
addopts = "-q"
paraencoder-0.1.0/test.py
@@ -0,0 +1,46 @@
#!/usr/bin/env python3
"""Manual test runner for the `para` package.

Run this script to perform a quick smoke check that imports
the main functions and exercises them on a couple of sample inputs.
"""
from para import detect_encoding, is_zawgyi, zg_to_unicode, normalize_unicode, __version__


def run_sample(s):
    print("input:", repr(s))
    try:
        enc = detect_encoding(s)
    except Exception as e:
        enc = f"error: {e}"
    print(" detect_encoding:", enc)
    try:
        zg = is_zawgyi(s)
    except Exception as e:
        zg = f"error: {e}"
    print(" is_zawgyi:", zg)
    try:
        conv = zg_to_unicode(s)
    except Exception as e:
        conv = f"error: {e}"
    print(" zg_to_unicode:", conv)
    try:
        norm = normalize_unicode(s)
    except Exception as e:
        norm = f"error: {e}"
    print(" normalize_unicode:", norm)
    print("-")


def main():
    print("para version:", __version__)
    samples = [
        "မင်္ဂလာပါ",  # common Myanmar greeting (Unicode)
        "",  # empty string
    ]
    for s in samples:
        run_sample(s)


if __name__ == "__main__":
    main()
paraencoder-0.1.0/tests/test_cli.py
@@ -0,0 +1,28 @@
import io
from typing import List

import para.cli as cli


def run_cli(args: List[str], input_text: str) -> str:
    stdin = io.StringIO(input_text)
    stdout = io.StringIO()
    original_stdin, original_stdout = cli.sys.stdin, cli.sys.stdout
    try:
        cli.sys.stdin = stdin
        cli.sys.stdout = stdout
        cli.main(args)
        return stdout.getvalue()
    finally:
        cli.sys.stdin = original_stdin
        cli.sys.stdout = original_stdout


def test_cli_detect_reports_zawgyi():
    output = run_cli(["detect"], "\u106A")
    assert "zawgyi" in output


def test_cli_convert_stdin():
    output = run_cli(["convert", "--force"], "\u106A")
    assert "\u1009" in output
paraencoder-0.1.0/tests/test_convert.py
@@ -0,0 +1,200 @@
from para.convert import zg_to_unicode


def test_unicode_mingalaba_is_preserved():
    """Unicode input must NEVER be modified by zg_to_unicode (contract guarantee)."""
    assert zg_to_unicode("မင်္ဂလာပါ") == "မင်္ဂလာပါ"


def test_zawgyi_myanmarjpyay_conversion():
    """Full Zawgyi sentence 'I love Myanmar' converts correctly."""
    zg = "ျမန္မာျပည္ကိုခ်စ္တယ္"
    expected = "မြန်မာပြည်ကိုချစ်တယ်"
    assert zg_to_unicode(zg, force=True) == expected


def test_simple_replacement():
    """Basic NGA + tall AA conversion."""
    zg = "\u106A\u1033"  # NGA variant + tall AA
    expected = "\u1009\u102F"  # NGA + U
    assert zg_to_unicode(zg, force=True) == expected


def test_kinzi_basic_conversion():
    """Kinzi character converts to NGA + asat + virama sequence."""
    # Kinzi typically appears with a consonant, e.g., ကၤ -> ကင်္
    zg = "\u1000\u1064"  # KA + kinzi
    converted = zg_to_unicode(zg, force=True)
    assert "\u1004\u103a\u1039" in converted


def test_kinzi_with_vowel_i():
    """Kinzi with vowel I marker."""
    # Kinzi+I typically appears with a consonant
    zg = "\u1000\u108b"  # KA + kinzi+I
    converted = zg_to_unicode(zg, force=True)
    assert "\u1004\u103a\u1039" in converted
    assert "\u102d" in converted


def test_tall_aa_mapping():
    """Tall AA (U+1033) -> U+102F."""
    zg = "\u1033"
    assert zg_to_unicode(zg, force=True) == "\u102F"


def test_tall_uu_mapping():
    """Tall UU (U+1034) -> U+1030."""
    zg = "\u1034"
    assert zg_to_unicode(zg, force=True) == "\u1030"


def test_stacked_consonant_ka():
    """Stacked KA (U+1060) -> virama + KA."""
    zg = "\u1060"
    assert zg_to_unicode(zg, force=True) == "\u1039\u1000"


def test_stacked_consonant_kha():
    """Stacked KHA (U+1061) -> virama + KHA."""
    zg = "\u1061"
    assert zg_to_unicode(zg, force=True) == "\u1039\u1001"


def test_stacked_consonant_ga():
    """Stacked GA (U+1062) -> virama + GA."""
    zg = "\u1062"
    assert zg_to_unicode(zg, force=True) == "\u1039\u1002"


def test_stacked_consonant_gha():
    """Stacked GHA (U+1063) -> virama + GHA."""
    zg = "\u1063"
    assert zg_to_unicode(zg, force=True) == "\u1039\u1003"


def test_stacked_consonant_ca():
    """Stacked CA (U+1065) -> virama + CA."""
    zg = "\u1065"
    assert zg_to_unicode(zg, force=True) == "\u1039\u1005"


def test_stacked_consonant_ja():
    """Stacked JA (U+1068) -> virama + JA."""
    zg = "\u1068"
    assert zg_to_unicode(zg, force=True) == "\u1039\u1007"


def test_stacked_consonant_ta():
    """Stacked TA variants (U+1071/U+1072) -> virama + TA."""
    assert zg_to_unicode("\u1071", force=True) == "\u1039\u1010"
    assert zg_to_unicode("\u1072", force=True) == "\u1039\u1010"


def test_stacked_consonant_tha():
    """Stacked THA variants (U+1073/U+1074) -> virama + THA."""
    assert zg_to_unicode("\u1073", force=True) == "\u1039\u1011"
    assert zg_to_unicode("\u1074", force=True) == "\u1039\u1011"


def test_stacked_consonant_pa():
    """Stacked PA (U+1078) -> virama + PA."""
    zg = "\u1078"
    assert zg_to_unicode(zg, force=True) == "\u1039\u1015"


def test_stacked_consonant_ma():
    """Stacked MA (U+107C) -> virama + MA."""
    zg = "\u107C"
    assert zg_to_unicode(zg, force=True) == "\u1039\u1019"


def test_stacked_consonant_la():
    """Stacked LA (U+1085) -> virama + LA."""
    zg = "\u1085"
    assert zg_to_unicode(zg, force=True) == "\u1039\u101C"


def test_medial_ya_conversion():
    """Medial YA variant (U+103A in Zawgyi) -> U+103B."""
    # After asat conversion, U+1039 becomes U+103A (asat)
    # Then U+103A becomes U+103B (medial YA)
    zg = "\u103a"  # Zawgyi medial YA
    converted = zg_to_unicode(zg, force=True)
    assert converted == "\u103b"


def test_medial_ra_conversion():
    """Medial RA variants -> U+103C."""
    # U+103B in Zawgyi is medial RA
    zg = "\u103b"
    converted = zg_to_unicode(zg, force=True)
    assert converted == "\u103c"


def test_medial_wa_conversion():
    """Medial WA (U+103C in Zawgyi) -> U+103D."""
    zg = "\u103c"
    converted = zg_to_unicode(zg, force=True)
    assert converted == "\u103d"


def test_medial_ha_conversion():
    """Medial HA (U+103D in Zawgyi) -> U+103E."""
    zg = "\u103d"
    converted = zg_to_unicode(zg, force=True)
    assert converted == "\u103e"


def test_asat_conversion():
    """Asat/virama (U+1039 in Zawgyi) -> U+103A."""
    zg = "\u1039"
    converted = zg_to_unicode(zg, force=True)
    assert converted == "\u103a"


def test_dot_below_variants():
    """Dot below variants (U+1094/U+1095) -> U+1037."""
    assert zg_to_unicode("\u1094", force=True) == "\u1037"
    assert zg_to_unicode("\u1095", force=True) == "\u1037"


def test_na_variant():
    """NA variant (U+108F) -> U+1014."""
    zg = "\u108F"
    assert zg_to_unicode(zg, force=True) == "\u1014"


def test_ra_variant():
    """RA variant (U+1090) -> U+101B."""
    zg = "\u1090"
    assert zg_to_unicode(zg, force=True) == "\u101B"


def test_nga_variant():
    """NGA variant (U+106A) -> U+1009."""
    zg = "\u106A"
    assert zg_to_unicode(zg, force=True) == "\u1009"


def test_great_sa():
    """Great SA (U+1086) -> U+103F."""
    zg = "\u1086"
    assert zg_to_unicode(zg, force=True) == "\u103F"


def test_tall_aa_asat():
    """Tall AA + asat combination (U+105A) -> U+102B U+103A."""
    zg = "\u105A"
    assert zg_to_unicode(zg, force=True) == "\u102B\u103A"


def test_empty_string():
    """Empty string returns empty."""
    assert zg_to_unicode("") == ""


def test_ascii_passthrough():
    """ASCII text passes through unchanged."""
    assert zg_to_unicode("hello world", force=True) == "hello world"
paraencoder-0.1.0/tests/test_detect.py
@@ -0,0 +1,32 @@
import para.detect as detect


def test_is_zawgyi_true():
    assert detect.is_zawgyi("\u106A\u1031\u1000") is True


def test_detect_unknown_on_ascii():
    assert detect.detect_encoding("hello") == "unknown"


ZAWGYI_FIXTURES = [
    "\u1031\u103B\u1000\u103A\u102C",  # prefixed E + medial order
    "\u1064\u102D\u1031\u1000",  # kinzi with i-vowel and E before base
]


UNICODE_FIXTURES = [
    "\u1019\u103C\u1014\u103A\u1038",  # base consonant + medial RA in Unicode order
    "\u1019\u103C\u1014\u103A\u1038\u1005\u102C",  # same, with a following syllable
]


def test_detect_on_corpus_samples():
    for sample in ZAWGYI_FIXTURES:
        assert detect.detect_encoding(sample) == "zawgyi"
    for sample in UNICODE_FIXTURES:
        assert detect.detect_encoding(sample) == "unicode"


def test_detect_unknown_on_short_myanmar():
    assert detect.detect_encoding("\u1010\u1014") == "unknown"
paraencoder-0.1.0/tests/test_normalize.py
@@ -0,0 +1,14 @@
from para.normalize import normalize_unicode


def test_unicode_mingalaba_normalization_is_noop():
    """normalize_unicode must NEVER modify valid Unicode (contract guarantee)."""
    assert normalize_unicode("မင်္ဂလာပါ") == "မင်္ဂလာပါ"


def test_empty_string():
    assert normalize_unicode("") == ""


def test_ascii_passthrough():
    assert normalize_unicode("hello") == "hello"