pysinrom 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pysinrom-0.1.0/LICENSE +21 -0
- pysinrom-0.1.0/PKG-INFO +84 -0
- pysinrom-0.1.0/README.md +63 -0
- pysinrom-0.1.0/pyproject.toml +34 -0
- pysinrom-0.1.0/setup.cfg +4 -0
- pysinrom-0.1.0/src/pysinrom/__init__.py +28 -0
- pysinrom-0.1.0/src/pysinrom/cantromzj1_maps.py +100 -0
- pysinrom-0.1.0/src/pysinrom/cantromzj1_parse.py +121 -0
- pysinrom-0.1.0/src/pysinrom/cantromzj1_to_jyutping.py +71 -0
- pysinrom-0.1.0/src/pysinrom/jyutping_to_cantromzj1.py +97 -0
- pysinrom-0.1.0/src/pysinrom.egg-info/PKG-INFO +84 -0
- pysinrom-0.1.0/src/pysinrom.egg-info/SOURCES.txt +14 -0
- pysinrom-0.1.0/src/pysinrom.egg-info/dependency_links.txt +1 -0
- pysinrom-0.1.0/src/pysinrom.egg-info/requires.txt +4 -0
- pysinrom-0.1.0/src/pysinrom.egg-info/top_level.txt +1 -0
- pysinrom-0.1.0/tests/test_conversion.py +37 -0
pysinrom-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Anonymous Authors
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
pysinrom-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: pysinrom
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Python tools for Sinitic romanization, currently only as research utilities for conversion between Jyutping and CantRomZJ1
|
|
5
|
+
License-Expression: MIT
|
|
6
|
+
Classifier: Development Status :: 3 - Alpha
|
|
7
|
+
Classifier: Operating System :: OS Independent
|
|
8
|
+
Classifier: Programming Language :: Python :: 3
|
|
9
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
10
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
13
|
+
Classifier: Topic :: Text Processing :: Linguistic
|
|
14
|
+
Requires-Python: >=3.10
|
|
15
|
+
Description-Content-Type: text/markdown
|
|
16
|
+
License-File: LICENSE
|
|
17
|
+
Requires-Dist: pycantonese<6,>=5.0
|
|
18
|
+
Provides-Extra: test
|
|
19
|
+
Requires-Dist: pytest>=8; extra == "test"
|
|
20
|
+
Dynamic: license-file
|
|
21
|
+
|
|
22
|
+
# PySinRom
|
|
23
|
+
|
|
24
|
+
## Installation
|
|
25
|
+
|
|
26
|
+
After publication on PyPI:
|
|
27
|
+
|
|
28
|
+
```bash
|
|
29
|
+
pip install pysinrom
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
For local installation from the project directory:
|
|
33
|
+
|
|
34
|
+
```bash
|
|
35
|
+
python -m pip install .
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
## Usage
|
|
39
|
+
|
|
40
|
+
```python
|
|
41
|
+
from pysinrom import (
|
|
42
|
+
jyutping_to_cantromzj1,
|
|
43
|
+
cantromzj1_to_jyutping,
|
|
44
|
+
)
|
|
45
|
+
|
|
46
|
+
cantromzj1, syllables = jyutping_to_cantromzj1("hoeng1gong2")
|
|
47
|
+
print(cantromzj1)
|
|
48
|
+
# heong1A|55gong2A|35
|
|
49
|
+
|
|
50
|
+
jyutping, syllables = cantromzj1_to_jyutping("heong1A|55gong2A|35")
|
|
51
|
+
print(jyutping)
|
|
52
|
+
# hoeng1gong2
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
Space-separated input and output are also supported:
|
|
56
|
+
|
|
57
|
+
```python
|
|
58
|
+
result = jyutping_to_cantromzj1(
|
|
59
|
+
"hoeng1 gong2",
|
|
60
|
+
output_separator=" ",
|
|
61
|
+
return_mode="string",
|
|
62
|
+
)
|
|
63
|
+
print(result)
|
|
64
|
+
# heong1A|55 gong2A|35
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
## Main functions
|
|
68
|
+
|
|
69
|
+
- `jyutping_to_cantromzj1()`
|
|
70
|
+
- `jyutping_syllable_to_cantromzj1()`
|
|
71
|
+
- `cantromzj1_to_jyutping()`
|
|
72
|
+
- `cantromzj1_syllable_to_jyutping()`
|
|
73
|
+
- `parse_cantromzj1()`
|
|
74
|
+
- `parse_cantromzj1_syllable()`
|
|
75
|
+
|
|
76
|
+
The default return mode for full-string conversion is a tuple containing the joined output string and a list of converted syllables. Set `return_mode="string"` or `return_mode="list"` to request only one representation.
|
|
77
|
+
|
|
78
|
+
## Development status
|
|
79
|
+
|
|
80
|
+
This is research software released as an alpha version. The current implementation focuses on the romanization correspondences described in the accompanying anonymous manuscript.
|
|
81
|
+
|
|
82
|
+
## License
|
|
83
|
+
|
|
84
|
+
MIT License.
|
pysinrom-0.1.0/README.md
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
# PySinRom
|
|
2
|
+
|
|
3
|
+
## Installation
|
|
4
|
+
|
|
5
|
+
After publication on PyPI:
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
pip install pysinrom
|
|
9
|
+
```
|
|
10
|
+
|
|
11
|
+
For local installation from the project directory:
|
|
12
|
+
|
|
13
|
+
```bash
|
|
14
|
+
python -m pip install .
|
|
15
|
+
```
|
|
16
|
+
|
|
17
|
+
## Usage
|
|
18
|
+
|
|
19
|
+
```python
|
|
20
|
+
from pysinrom import (
|
|
21
|
+
jyutping_to_cantromzj1,
|
|
22
|
+
cantromzj1_to_jyutping,
|
|
23
|
+
)
|
|
24
|
+
|
|
25
|
+
cantromzj1, syllables = jyutping_to_cantromzj1("hoeng1gong2")
|
|
26
|
+
print(cantromzj1)
|
|
27
|
+
# heong1A|55gong2A|35
|
|
28
|
+
|
|
29
|
+
jyutping, syllables = cantromzj1_to_jyutping("heong1A|55gong2A|35")
|
|
30
|
+
print(jyutping)
|
|
31
|
+
# hoeng1gong2
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
Space-separated input and output are also supported:
|
|
35
|
+
|
|
36
|
+
```python
|
|
37
|
+
result = jyutping_to_cantromzj1(
|
|
38
|
+
"hoeng1 gong2",
|
|
39
|
+
output_separator=" ",
|
|
40
|
+
return_mode="string",
|
|
41
|
+
)
|
|
42
|
+
print(result)
|
|
43
|
+
# heong1A|55 gong2A|35
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
## Main functions
|
|
47
|
+
|
|
48
|
+
- `jyutping_to_cantromzj1()`
|
|
49
|
+
- `jyutping_syllable_to_cantromzj1()`
|
|
50
|
+
- `cantromzj1_to_jyutping()`
|
|
51
|
+
- `cantromzj1_syllable_to_jyutping()`
|
|
52
|
+
- `parse_cantromzj1()`
|
|
53
|
+
- `parse_cantromzj1_syllable()`
|
|
54
|
+
|
|
55
|
+
The default return mode for full-string conversion is a tuple containing the joined output string and a list of converted syllables. Set `return_mode="string"` or `return_mode="list"` to request only one representation.
|
|
56
|
+
|
|
57
|
+
## Development status
|
|
58
|
+
|
|
59
|
+
This is research software released as an alpha version. The current implementation focuses on the romanization correspondences described in the accompanying anonymous manuscript.
|
|
60
|
+
|
|
61
|
+
## License
|
|
62
|
+
|
|
63
|
+
MIT License.
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=77.0.3"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "pysinrom"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Python tools for Sinitic romanization, currently only as research utilities for conversion between Jyutping and CantRomZJ1"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.10"
|
|
11
|
+
license = "MIT"
|
|
12
|
+
license-files = ["LICENSE"]
|
|
13
|
+
dependencies = [
|
|
14
|
+
"pycantonese>=5.0,<6",
|
|
15
|
+
]
|
|
16
|
+
classifiers = [
|
|
17
|
+
"Development Status :: 3 - Alpha",
|
|
18
|
+
"Operating System :: OS Independent",
|
|
19
|
+
"Programming Language :: Python :: 3",
|
|
20
|
+
"Programming Language :: Python :: 3.10",
|
|
21
|
+
"Programming Language :: Python :: 3.11",
|
|
22
|
+
"Programming Language :: Python :: 3.12",
|
|
23
|
+
"Programming Language :: Python :: 3.13",
|
|
24
|
+
"Topic :: Text Processing :: Linguistic",
|
|
25
|
+
]
|
|
26
|
+
|
|
27
|
+
[project.optional-dependencies]
|
|
28
|
+
test = ["pytest>=8"]
|
|
29
|
+
|
|
30
|
+
[tool.setuptools.packages.find]
|
|
31
|
+
where = ["src"]
|
|
32
|
+
|
|
33
|
+
[tool.pytest.ini_options]
|
|
34
|
+
testpaths = ["tests"]
|
pysinrom-0.1.0/setup.cfg
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
"""Bidirectional conversion between Jyutping and CantRomZJ1."""
|
|
2
|
+
|
|
3
|
+
from .jyutping_to_cantromzj1 import (
|
|
4
|
+
jyutping_syllable_to_cantromzj1,
|
|
5
|
+
jyutping_to_cantromzj1,
|
|
6
|
+
parse_jyutping_to_cantromzj1_objects,
|
|
7
|
+
)
|
|
8
|
+
from .cantromzj1_parse import (
|
|
9
|
+
parse_cantromzj1,
|
|
10
|
+
parse_cantromzj1_syllable,
|
|
11
|
+
)
|
|
12
|
+
from .cantromzj1_to_jyutping import (
|
|
13
|
+
cantromzj1_syllable_to_jyutping,
|
|
14
|
+
cantromzj1_to_jyutping,
|
|
15
|
+
)
|
|
16
|
+
|
|
17
|
+
__version__ = "0.1.0"
|
|
18
|
+
|
|
19
|
+
__all__ = [
|
|
20
|
+
"__version__",
|
|
21
|
+
"jyutping_syllable_to_cantromzj1",
|
|
22
|
+
"jyutping_to_cantromzj1",
|
|
23
|
+
"parse_jyutping_to_cantromzj1_objects",
|
|
24
|
+
"parse_cantromzj1",
|
|
25
|
+
"parse_cantromzj1_syllable",
|
|
26
|
+
"cantromzj1_syllable_to_jyutping",
|
|
27
|
+
"cantromzj1_to_jyutping",
|
|
28
|
+
]
|
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
"""Shared maps for Jyutping <-> CantRomZJ1 conversion."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
# Stop codas. The tone converter checks membership here to decide whether the
# entering-tone (checked) table or the open-syllable table applies.
CHECKED_CODAS = {"p", "t", "k"}

# Jyutping nucleus -> CantRomZJ1 nucleus.
# Nuclei that are not listed are passed through unchanged by the converters.
JYUTPING_NUCLEUS_TO_CANTROMZJ1 = {
    "aa": "a",
    "a": "e",
    "e": "ea",
    "oe": "eo",
    "eo": "oe",
}

# Inverse of the nucleus table (the forward values are pairwise distinct).
CANTROMZJ1_NUCLEUS_TO_JYUTPING = {
    cantromzj1: jyutping
    for jyutping, cantromzj1 in JYUTPING_NUCLEUS_TO_CANTROMZJ1.items()
}

# Non-entering tones
JYUTPING_TONE_TO_CANTROMZJ1_OPEN = {
    "1": "1A|55",
    "4": "1B|21",
    "2": "2A|35",
    "5": "2B|13",
    "3": "3A|33",
    "6": "3B|22",
}

# Entering tones, identified by checked codas -p, -t, -k
JYUTPING_TONE_TO_CANTROMZJ1_CHECKED = {
    "1": "4Aa|5",
    "3": "4Ab|3",
    "6": "4B|2",
}

# CantRomZJ1 tone notation -> Jyutping tone number.
# Derived from the two forward tables so the directions cannot drift apart.
# Open tones come first, then checked tones, each in forward-table order.
CANTROMZJ1_TONE_TO_JYUTPING = {
    notation: number
    for forward_table in (
        JYUTPING_TONE_TO_CANTROMZJ1_OPEN,
        JYUTPING_TONE_TO_CANTROMZJ1_CHECKED,
    )
    for number, notation in forward_table.items()
}

# All tone notations, longest first, so suffix/prefix scans try the longest
# candidate before any shorter one. sorted() is stable, so notations of equal
# length keep their table order.
CANTROMZJ1_TONES = tuple(
    sorted(CANTROMZJ1_TONE_TO_JYUTPING, key=len, reverse=True)
)

# PyCantonese/Jyutping onset inventory.
# Keep "onset" in the returned PyCantonese Jyutping object because that is the library field name.
# Order matters for the body splitter: two-letter onsets are tried before
# single letters, and the empty onset is the last resort.
JYUTPING_ONSETS = (
    "gw", "kw", "ng",
    "b", "p", "m", "f",
    "d", "t", "n", "l",
    "g", "k", "h",
    "w", "z", "c", "s", "j",
    "",
)

# Coda inventory, tried in this order by the body splitter; the empty coda is last.
JYUTPING_CODAS = ("ng", "p", "t", "k", "m", "n", "i", "u", "")

# Nuclei after Jyutping -> CantRomZJ1 conversion.
VALID_CANTROMZJ1_NUCLEI = {
    "a", "e", "ea", "i", "o", "u", "eo", "oe", "yu", "m", "ng"
}
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def convert_nucleus_jyutping_to_cantromzj1(nucleus: str) -> str:
    """Return the CantRomZJ1 spelling of a Jyutping nucleus.

    A nucleus without an entry in ``JYUTPING_NUCLEUS_TO_CANTROMZJ1`` is
    returned unchanged.
    """
    table = JYUTPING_NUCLEUS_TO_CANTROMZJ1
    return table[nucleus] if nucleus in table else nucleus
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def convert_nucleus_cantromzj1_to_jyutping(nucleus: str) -> str:
    """Return the Jyutping spelling of a CantRomZJ1 nucleus.

    A nucleus without an entry in ``CANTROMZJ1_NUCLEUS_TO_JYUTPING`` is
    returned unchanged.
    """
    table = CANTROMZJ1_NUCLEUS_TO_JYUTPING
    return table[nucleus] if nucleus in table else nucleus
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def convert_tone_jyutping_to_cantromzj1(tone: str, coda: str) -> str:
    """Translate a Jyutping tone number into CantRomZJ1 tone notation.

    Args:
        tone: Jyutping tone number as a string key of the tone tables.
        coda: Coda of the same syllable. A coda in ``CHECKED_CODAS`` selects
            the entering-tone table; any other coda selects the open table.

    Returns:
        The CantRomZJ1 tone notation, e.g. ``"1A|55"``.

    Raises:
        ValueError: If the selected table has no entry for ``tone``.
    """
    is_checked = coda in CHECKED_CODAS
    table = (
        JYUTPING_TONE_TO_CANTROMZJ1_CHECKED
        if is_checked
        else JYUTPING_TONE_TO_CANTROMZJ1_OPEN
    )

    try:
        notation = table[tone]
    except KeyError as exc:
        raise ValueError(f"Unsupported Jyutping tone {tone!r} for coda {coda!r}") from exc
    return notation
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def convert_tone_cantromzj1_to_jyutping(tone: str) -> str:
    """Translate CantRomZJ1 tone notation into a Jyutping tone number.

    Args:
        tone: A key of ``CANTROMZJ1_TONE_TO_JYUTPING``, e.g. ``"1A|55"``.

    Returns:
        The Jyutping tone number as a string.

    Raises:
        ValueError: If ``tone`` is not a known CantRomZJ1 tone notation.
    """
    try:
        number = CANTROMZJ1_TONE_TO_JYUTPING[tone]
    except KeyError as exc:
        raise ValueError(f"Unsupported CantRomZJ1 tone {tone!r}") from exc
    return number
|
|
@@ -0,0 +1,121 @@
|
|
|
1
|
+
"""Parse CantRomZJ1 strings into PyCantonese-style Jyutping objects."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from pycantonese.jyutping import Jyutping
|
|
6
|
+
|
|
7
|
+
from .cantromzj1_maps import (
|
|
8
|
+
CANTROMZJ1_TONES,
|
|
9
|
+
JYUTPING_CODAS,
|
|
10
|
+
JYUTPING_ONSETS,
|
|
11
|
+
VALID_CANTROMZJ1_NUCLEI,
|
|
12
|
+
)
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def _split_cantromzj1_tone(syllable: str) -> tuple[str, str]:
    """Separate a CantRomZJ1 syllable into (segmental body, tone notation).

    The first entry of ``CANTROMZJ1_TONES`` that the syllable ends with is
    taken as the tone; everything before it is the body.

    Raises:
        ValueError: If no known tone notation ends the syllable, or if
            nothing precedes the tone.
    """
    matched = next(
        (candidate for candidate in CANTROMZJ1_TONES if syllable.endswith(candidate)),
        None,
    )
    if matched is None:
        raise ValueError(f"Cannot find a valid CantRomZJ1 tone suffix in {syllable!r}")

    body = syllable[: len(syllable) - len(matched)]
    if body == "":
        raise ValueError(f"Missing segmental body before tone in {syllable!r}")
    return body, matched
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def _split_body(body: str) -> tuple[str, str, str]:
    """Break a CantRomZJ1 segmental body into (onset, nucleus, coda).

    Onsets are tried in ``JYUTPING_ONSETS`` order and, for each onset that
    prefixes the body, codas are tried in ``JYUTPING_CODAS`` order. The first
    combination whose remaining middle part is in ``VALID_CANTROMZJ1_NUCLEI``
    is returned.

    The first element is called "onset" to match the PyCantonese field name
    used in Jyutping(onset=..., nucleus=..., coda=..., tone=...).

    Raises:
        ValueError: If no onset/coda combination leaves a valid nucleus.
    """
    # Syllabic nasals are a bare nucleus with neither onset nor coda.
    if body == "m" or body == "ng":
        return "", body, ""

    for onset in JYUTPING_ONSETS:
        # The empty onset prefixes every body, so it needs no special case.
        if not body.startswith(onset):
            continue

        remainder = body[len(onset):]
        if remainder == "":
            continue

        for coda in JYUTPING_CODAS:
            if not remainder.endswith(coda):
                continue

            nucleus = remainder[: len(remainder) - len(coda)]
            if nucleus in VALID_CANTROMZJ1_NUCLEI:
                return onset, nucleus, coda

    raise ValueError(f"Cannot split CantRomZJ1 syllable body {body!r}")
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def parse_cantromzj1_syllable(syllable: str) -> Jyutping:
    """Parse a single CantRomZJ1 syllable into a PyCantonese-style Jyutping object.

    Surrounding whitespace is stripped first. The returned object carries the
    CantRomZJ1 nucleus and tone notation as they appear in the input.

    Example:
        parse_cantromzj1_syllable("heong1A|55")
        -> Jyutping(onset='h', nucleus='eo', coda='ng', tone='1A|55')

    Raises:
        ValueError: If the tone suffix or the segmental body cannot be parsed.
    """
    segments, tone = _split_cantromzj1_tone(syllable.strip())
    fields = dict(zip(("onset", "nucleus", "coda"), _split_body(segments)))
    return Jyutping(**fields, tone=tone)
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def _split_cantromzj1_syllables(text: str) -> list[str]:
    """Cut a CantRomZJ1 string into one string per syllable.

    Input containing a space is split on whitespace. Otherwise the text is
    scanned left to right and a syllable ends wherever a known tone notation
    ends.

    Example:
        "heong1A|55gong2A|35" -> ["heong1A|55", "gong2A|35"]

    Raises:
        ValueError: If concatenated input has content after the last tone.
    """
    text = text.strip()
    if not text:
        return []

    if " " in text:
        # str.split() with no argument never yields empty items.
        return text.split()

    pieces: list[str] = []
    start = 0
    cursor = 0
    total = len(text)

    while cursor < total:
        tone_here = next(
            (cand for cand in CANTROMZJ1_TONES if text.startswith(cand, cursor)),
            None,
        )
        if tone_here is not None:
            # The syllable runs from the previous cut up to the end of this tone.
            cursor += len(tone_here)
            pieces.append(text[start:cursor])
            start = cursor
        else:
            cursor += 1

    if start != total:
        raise ValueError(f"Unparsed trailing content in CantRomZJ1 string: {text[start:]!r}")

    return pieces
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def parse_cantromzj1(text: str) -> list[Jyutping]:
    """Parse a CantRomZJ1 string into a list of PyCantonese-style Jyutping objects.

    The text is first cut into syllables (concatenated and space-separated
    input are both accepted), then every syllable is parsed on its own.

    Example:
        parse_cantromzj1("heong1A|55gong2A|35")
        -> [Jyutping(onset='h', nucleus='eo', coda='ng', tone='1A|55'),
            Jyutping(onset='g', nucleus='o', coda='ng', tone='2A|35')]
    """
    chunks = _split_cantromzj1_syllables(text)
    return list(map(parse_cantromzj1_syllable, chunks))
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
"""Convert CantRomZJ1 strings back to Jyutping strings."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Literal
|
|
6
|
+
|
|
7
|
+
from .cantromzj1_maps import (
|
|
8
|
+
convert_nucleus_cantromzj1_to_jyutping,
|
|
9
|
+
convert_tone_cantromzj1_to_jyutping,
|
|
10
|
+
)
|
|
11
|
+
from .cantromzj1_parse import parse_cantromzj1, parse_cantromzj1_syllable
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
ReturnMode = Literal["tuple", "string", "list"]
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def cantromzj1_syllable_to_jyutping(cantromzj1_syllable: str) -> str:
    """Convert a single CantRomZJ1 syllable into its Jyutping spelling.

    Onset and coda are copied as parsed; only the nucleus and the tone are
    mapped back to Jyutping.

    Example:
        cantromzj1_syllable_to_jyutping("heong1A|55") -> "hoeng1"
    """
    item = parse_cantromzj1_syllable(cantromzj1_syllable)
    jp_nucleus = convert_nucleus_cantromzj1_to_jyutping(item.nucleus)
    jp_tone = convert_tone_cantromzj1_to_jyutping(item.tone)
    return f"{item.onset}{jp_nucleus}{item.coda}{jp_tone}"
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def cantromzj1_to_jyutping(
    cantromzj1: str,
    *,
    output_separator: str = "",
    return_mode: ReturnMode = "tuple",
) -> str | list[str] | tuple[str, list[str]]:
    """Convert a CantRomZJ1 string to Jyutping.

    Args:
        cantromzj1:
            A CantRomZJ1 string. Both concatenated and space-separated input are accepted.
        output_separator:
            Separator used when joining converted syllables. Use "" for concatenated
            output or " " for space-separated output.
        return_mode:
            "tuple": return (joined_string, syllable_list).
            "string": return only the joined string.
            "list": return only the list of converted syllables.

    Returns:
        The joined string, the syllable list, or both, depending on ``return_mode``.

    Raises:
        ValueError: If ``return_mode`` is not one of the three supported values
            (checked before any parsing is attempted), or if the input cannot
            be parsed as CantRomZJ1.

    Example:
        cantromzj1_to_jyutping("heong1A|55gong2A|35")
        -> ("hoeng1gong2", ["hoeng1", "gong2"])
    """
    # Fail fast: reject a bad mode before spending work on parsing/conversion.
    if return_mode not in ("tuple", "string", "list"):
        raise ValueError("return_mode must be one of: 'tuple', 'string', 'list'")

    converted = []
    for item in parse_cantromzj1(cantromzj1):
        # Onset and coda are shared by both systems; only nucleus and tone differ.
        nucleus = convert_nucleus_cantromzj1_to_jyutping(item.nucleus)
        tone = convert_tone_cantromzj1_to_jyutping(item.tone)
        converted.append(f"{item.onset}{nucleus}{item.coda}{tone}")

    joined = output_separator.join(converted)

    if return_mode == "string":
        return joined
    if return_mode == "list":
        return converted
    return joined, converted
|
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
"""Convert Jyutping strings to CantRomZJ1 strings or PyCantonese-style objects."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Literal
|
|
6
|
+
|
|
7
|
+
import pycantonese
|
|
8
|
+
from pycantonese.jyutping import Jyutping
|
|
9
|
+
|
|
10
|
+
from .cantromzj1_maps import (
|
|
11
|
+
convert_nucleus_jyutping_to_cantromzj1,
|
|
12
|
+
convert_tone_jyutping_to_cantromzj1,
|
|
13
|
+
)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
ReturnMode = Literal["tuple", "string", "list"]
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def _convert_parsed_jyutping_to_cantromzj1(parsed: Jyutping) -> tuple[str, Jyutping]:
    """Convert one parsed PyCantonese Jyutping object to CantRomZJ1.

    Onset and coda are carried over unchanged; the nucleus is respelled and
    the tone number is replaced by CantRomZJ1 tone notation, which depends on
    the coda.

    Returns:
        A pair of the CantRomZJ1 syllable string and a Jyutping object holding
        the same four converted fields.
    """
    onset, coda = parsed.onset, parsed.coda
    zj_nucleus = convert_nucleus_jyutping_to_cantromzj1(parsed.nucleus)
    zj_tone = convert_tone_jyutping_to_cantromzj1(parsed.tone, coda)

    text = f"{onset}{zj_nucleus}{coda}{zj_tone}"
    converted = Jyutping(onset=onset, nucleus=zj_nucleus, coda=coda, tone=zj_tone)
    return text, converted
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def parse_jyutping_to_cantromzj1_objects(jyutping: str) -> list[Jyutping]:
    """Convert a Jyutping string into PyCantonese-style Jyutping objects
    whose nucleus and tone fields use CantRomZJ1 notation.

    All whitespace (spaces, tabs, newlines) is removed before the string is
    handed to ``pycantonese.parse_jyutping``, so concatenated and
    whitespace-separated input give the same result.

    Example:
        parse_jyutping_to_cantromzj1_objects("hoeng1gong2")
        -> [Jyutping(onset='h', nucleus='eo', coda='ng', tone='1A|55'), ...]
    """
    # str.split() with no argument splits on any whitespace run, so joining
    # the pieces strips tabs/newlines as well as plain spaces.
    compact = "".join(jyutping.split())
    parsed_items = pycantonese.parse_jyutping(compact)
    return [_convert_parsed_jyutping_to_cantromzj1(item)[1] for item in parsed_items]
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def jyutping_syllable_to_cantromzj1(jyutping_syllable: str) -> str:
    """Convert exactly one Jyutping syllable into a CantRomZJ1 syllable string.

    Surrounding whitespace is stripped before parsing.

    Raises:
        ValueError: If the input does not parse to exactly one syllable.
    """
    candidates = pycantonese.parse_jyutping(jyutping_syllable.strip())
    count = len(candidates)
    if count == 1:
        (only,) = candidates
        text, _ = _convert_parsed_jyutping_to_cantromzj1(only)
        return text

    raise ValueError(
        f"Expected exactly one Jyutping syllable, got {count}: "
        f"{jyutping_syllable!r}"
    )
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def jyutping_to_cantromzj1(
    jyutping: str,
    *,
    output_separator: str = "",
    return_mode: ReturnMode = "tuple",
) -> str | list[str] | tuple[str, list[str]]:
    """Convert a Jyutping string to CantRomZJ1.

    Args:
        jyutping:
            A Jyutping string. Both concatenated and whitespace-separated input
            are accepted, e.g. "hoeng1gong2" or "hoeng1 gong2". All whitespace
            (spaces, tabs, newlines) is removed before parsing.
        output_separator:
            Separator used when joining converted syllables. Use "" for concatenated
            output or " " for space-separated output.
        return_mode:
            "tuple": return (joined_string, syllable_list).
            "string": return only the joined string.
            "list": return only the list of converted syllables.

    Returns:
        The joined string, the syllable list, or both, depending on ``return_mode``.

    Raises:
        ValueError: If ``return_mode`` is not one of the three supported values
            (checked before any parsing is attempted), or if a syllable's tone
            has no CantRomZJ1 equivalent for its coda.

    Example:
        jyutping_to_cantromzj1("hoeng1gong2")
        -> ("heong1A|55gong2A|35", ["heong1A|55", "gong2A|35"])

        jyutping_to_cantromzj1("hoeng1gong2", output_separator=" ", return_mode="string")
        -> "heong1A|55 gong2A|35"
    """
    # Fail fast: reject a bad mode before spending work on parsing/conversion.
    if return_mode not in ("tuple", "string", "list"):
        raise ValueError("return_mode must be one of: 'tuple', 'string', 'list'")

    # str.split() with no argument splits on any whitespace run, so joining
    # the pieces strips tabs/newlines as well as plain spaces.
    compact = "".join(jyutping.split())
    parsed_items = pycantonese.parse_jyutping(compact)
    converted = [
        _convert_parsed_jyutping_to_cantromzj1(item)[0]
        for item in parsed_items
    ]
    joined = output_separator.join(converted)

    if return_mode == "string":
        return joined
    if return_mode == "list":
        return converted
    return joined, converted
|
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: pysinrom
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Python tools for Sinitic romanization, currently only as research utilities for conversion between Jyutping and CantRomZJ1
|
|
5
|
+
License-Expression: MIT
|
|
6
|
+
Classifier: Development Status :: 3 - Alpha
|
|
7
|
+
Classifier: Operating System :: OS Independent
|
|
8
|
+
Classifier: Programming Language :: Python :: 3
|
|
9
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
10
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
13
|
+
Classifier: Topic :: Text Processing :: Linguistic
|
|
14
|
+
Requires-Python: >=3.10
|
|
15
|
+
Description-Content-Type: text/markdown
|
|
16
|
+
License-File: LICENSE
|
|
17
|
+
Requires-Dist: pycantonese<6,>=5.0
|
|
18
|
+
Provides-Extra: test
|
|
19
|
+
Requires-Dist: pytest>=8; extra == "test"
|
|
20
|
+
Dynamic: license-file
|
|
21
|
+
|
|
22
|
+
# PySinRom
|
|
23
|
+
|
|
24
|
+
## Installation
|
|
25
|
+
|
|
26
|
+
After publication on PyPI:
|
|
27
|
+
|
|
28
|
+
```bash
|
|
29
|
+
pip install pysinrom
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
For local installation from the project directory:
|
|
33
|
+
|
|
34
|
+
```bash
|
|
35
|
+
python -m pip install .
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
## Usage
|
|
39
|
+
|
|
40
|
+
```python
|
|
41
|
+
from pysinrom import (
|
|
42
|
+
jyutping_to_cantromzj1,
|
|
43
|
+
cantromzj1_to_jyutping,
|
|
44
|
+
)
|
|
45
|
+
|
|
46
|
+
cantromzj1, syllables = jyutping_to_cantromzj1("hoeng1gong2")
|
|
47
|
+
print(cantromzj1)
|
|
48
|
+
# heong1A|55gong2A|35
|
|
49
|
+
|
|
50
|
+
jyutping, syllables = cantromzj1_to_jyutping("heong1A|55gong2A|35")
|
|
51
|
+
print(jyutping)
|
|
52
|
+
# hoeng1gong2
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
Space-separated input and output are also supported:
|
|
56
|
+
|
|
57
|
+
```python
|
|
58
|
+
result = jyutping_to_cantromzj1(
|
|
59
|
+
"hoeng1 gong2",
|
|
60
|
+
output_separator=" ",
|
|
61
|
+
return_mode="string",
|
|
62
|
+
)
|
|
63
|
+
print(result)
|
|
64
|
+
# heong1A|55 gong2A|35
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
## Main functions
|
|
68
|
+
|
|
69
|
+
- `jyutping_to_cantromzj1()`
|
|
70
|
+
- `jyutping_syllable_to_cantromzj1()`
|
|
71
|
+
- `cantromzj1_to_jyutping()`
|
|
72
|
+
- `cantromzj1_syllable_to_jyutping()`
|
|
73
|
+
- `parse_cantromzj1()`
|
|
74
|
+
- `parse_cantromzj1_syllable()`
|
|
75
|
+
|
|
76
|
+
The default return mode for full-string conversion is a tuple containing the joined output string and a list of converted syllables. Set `return_mode="string"` or `return_mode="list"` to request only one representation.
|
|
77
|
+
|
|
78
|
+
## Development status
|
|
79
|
+
|
|
80
|
+
This is research software released as an alpha version. The current implementation focuses on the romanization correspondences described in the accompanying anonymous manuscript.
|
|
81
|
+
|
|
82
|
+
## License
|
|
83
|
+
|
|
84
|
+
MIT License.
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
LICENSE
|
|
2
|
+
README.md
|
|
3
|
+
pyproject.toml
|
|
4
|
+
src/pysinrom/__init__.py
|
|
5
|
+
src/pysinrom/cantromzj1_maps.py
|
|
6
|
+
src/pysinrom/cantromzj1_parse.py
|
|
7
|
+
src/pysinrom/cantromzj1_to_jyutping.py
|
|
8
|
+
src/pysinrom/jyutping_to_cantromzj1.py
|
|
9
|
+
src/pysinrom.egg-info/PKG-INFO
|
|
10
|
+
src/pysinrom.egg-info/SOURCES.txt
|
|
11
|
+
src/pysinrom.egg-info/dependency_links.txt
|
|
12
|
+
src/pysinrom.egg-info/requires.txt
|
|
13
|
+
src/pysinrom.egg-info/top_level.txt
|
|
14
|
+
tests/test_conversion.py
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
pysinrom
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
import pytest
|
|
2
|
+
|
|
3
|
+
from pysinrom import (
|
|
4
|
+
cantromzj1_syllable_to_jyutping,
|
|
5
|
+
cantromzj1_to_jyutping,
|
|
6
|
+
jyutping_syllable_to_cantromzj1,
|
|
7
|
+
jyutping_to_cantromzj1,
|
|
8
|
+
)
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def test_jyutping_to_cantromzj1_default_tuple():
    """The default return mode gives the joined string plus the syllable list."""
    text, parts = jyutping_to_cantromzj1("hoeng1gong2")
    assert (text, parts) == ("heong1A|55gong2A|35", ["heong1A|55", "gong2A|35"])
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def test_cantromzj1_to_jyutping_default_tuple():
    """The reverse conversion also defaults to (joined string, syllable list)."""
    text, parts = cantromzj1_to_jyutping("heong1A|55gong2A|35")
    assert (text, parts) == ("hoeng1gong2", ["hoeng1", "gong2"])
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def test_space_separated_conversion():
    """Space-separated input with a space separator returns a spaced string."""
    options = {"output_separator": " ", "return_mode": "string"}
    converted = jyutping_to_cantromzj1("hoeng1 gong2", **options)
    assert converted == "heong1A|55 gong2A|35"
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def test_single_syllable_conversion():
    """The single-syllable helpers convert one syllable in each direction."""
    jyutping_form, cantromzj1_form = "hoeng1", "heong1A|55"
    assert jyutping_syllable_to_cantromzj1(jyutping_form) == cantromzj1_form
    assert cantromzj1_syllable_to_jyutping(cantromzj1_form) == jyutping_form
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def test_invalid_return_mode():
    """An unsupported return_mode value is rejected with ValueError."""
    bad_mode = "invalid"
    with pytest.raises(ValueError):
        jyutping_to_cantromzj1("hoeng1", return_mode=bad_mode)
|