opencc-purepy 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- opencc_purepy-1.0.0/LICENSE +21 -0
- opencc_purepy-1.0.0/PKG-INFO +118 -0
- opencc_purepy-1.0.0/README.md +100 -0
- opencc_purepy-1.0.0/opencc_purepy/__init__.py +6 -0
- opencc_purepy-1.0.0/opencc_purepy/__main__.py +46 -0
- opencc_purepy-1.0.0/opencc_purepy/convert_cmd.py +28 -0
- opencc_purepy-1.0.0/opencc_purepy/core.py +362 -0
- opencc_purepy-1.0.0/opencc_purepy/dictgen_cmd.py +18 -0
- opencc_purepy-1.0.0/opencc_purepy/dictionary_lib.py +87 -0
- opencc_purepy-1.0.0/opencc_purepy/dicts/HKVariants.txt +63 -0
- opencc_purepy-1.0.0/opencc_purepy/dicts/HKVariantsRev.txt +70 -0
- opencc_purepy-1.0.0/opencc_purepy/dicts/HKVariantsRevPhrases.txt +156 -0
- opencc_purepy-1.0.0/opencc_purepy/dicts/JPShinjitaiCharacters.txt +7 -0
- opencc_purepy-1.0.0/opencc_purepy/dicts/JPShinjitaiPhrases.txt +176 -0
- opencc_purepy-1.0.0/opencc_purepy/dicts/JPVariants.txt +369 -0
- opencc_purepy-1.0.0/opencc_purepy/dicts/JPVariantsRev.txt +371 -0
- opencc_purepy-1.0.0/opencc_purepy/dicts/STCharacters.txt +3980 -0
- opencc_purepy-1.0.0/opencc_purepy/dicts/STPhrases.txt +49097 -0
- opencc_purepy-1.0.0/opencc_purepy/dicts/TSCharacters.txt +4113 -0
- opencc_purepy-1.0.0/opencc_purepy/dicts/TSPhrases.txt +281 -0
- opencc_purepy-1.0.0/opencc_purepy/dicts/TWPhrases.txt +512 -0
- opencc_purepy-1.0.0/opencc_purepy/dicts/TWPhrasesRev.txt +541 -0
- opencc_purepy-1.0.0/opencc_purepy/dicts/TWVariants.txt +39 -0
- opencc_purepy-1.0.0/opencc_purepy/dicts/TWVariantsRev.txt +39 -0
- opencc_purepy-1.0.0/opencc_purepy/dicts/TWVariantsRevPhrases.txt +68 -0
- opencc_purepy-1.0.0/opencc_purepy/dicts/dictionary_maxlength.json +59943 -0
- opencc_purepy-1.0.0/opencc_purepy.egg-info/PKG-INFO +118 -0
- opencc_purepy-1.0.0/opencc_purepy.egg-info/SOURCES.txt +32 -0
- opencc_purepy-1.0.0/opencc_purepy.egg-info/dependency_links.txt +1 -0
- opencc_purepy-1.0.0/opencc_purepy.egg-info/entry_points.txt +2 -0
- opencc_purepy-1.0.0/opencc_purepy.egg-info/top_level.txt +1 -0
- opencc_purepy-1.0.0/pyproject.toml +33 -0
- opencc_purepy-1.0.0/setup.cfg +4 -0
- opencc_purepy-1.0.0/tests/test_basic.py +41 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2018 The Python Packaging Authority
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: opencc-purepy
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: Pure Python implementation of OpenCC for Chinese text conversion
|
|
5
|
+
Author-email: laisuk <laisuk@yahoo.com>
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/laisuk/opencc_purepy
|
|
8
|
+
Project-URL: Issues, https://github.com/laisuk/opencc_purepy/issues
|
|
9
|
+
Keywords: opencc,chinese,text,conversion,pure-python
|
|
10
|
+
Classifier: Programming Language :: Python :: 3
|
|
11
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
12
|
+
Classifier: Operating System :: OS Independent
|
|
13
|
+
Classifier: Topic :: Text Processing :: Linguistic
|
|
14
|
+
Requires-Python: >=3.7
|
|
15
|
+
Description-Content-Type: text/markdown
|
|
16
|
+
License-File: LICENSE
|
|
17
|
+
Dynamic: license-file
|
|
18
|
+
|
|
19
|
+
# opencc_purepy
|
|
20
|
+
|
|
21
|
+
[](https://pypi.org/project/opencc-purepy/)
|
|
22
|
+
[](https://github.com/laisuk/opencc_pyo3/blob/main/LICENSE)
|
|
23
|
+
|
|
24
|
+
**`opencc_purepy`** is a **pure Python implementation** of OpenCC (Open Chinese Convert), enabling conversion between different Chinese text variants such as Simplified, Traditional, Hong Kong, Taiwan, and Japanese Kanji.
|
|
25
|
+
It uses dictionary-based segmentation and mapping logic inspired by [BYVoid/OpenCC](https://github.com/BYVoid/OpenCC).
|
|
26
|
+
|
|
27
|
+
---
|
|
28
|
+
|
|
29
|
+
## 🔧 Features
|
|
30
|
+
|
|
31
|
+
- ✅ Pure Python – no native dependencies
|
|
32
|
+
- π Supports conversion between multiple Chinese locales:
|
|
33
|
+
- Simplified ↔ Traditional
|
|
34
|
+
- Traditional ↔ Hong Kong / Taiwan / Japanese
|
|
35
|
+
- ✨ Optional punctuation style conversion
|
|
36
|
+
- 🧠 Automatic simplified/traditional code detection
|
|
37
|
+
|
|
38
|
+
---
|
|
39
|
+
|
|
40
|
+
## π Supported Conversion Configs
|
|
41
|
+
|
|
42
|
+
| Code | Description |
|
|
43
|
+
|----------|--------------------------------------|
|
|
44
|
+
| `s2t` | Simplified → Traditional |
|
|
45
|
+
| `t2s` | Traditional → Simplified |
|
|
46
|
+
| `s2tw` | Simplified → Traditional (Taiwan) |
|
|
47
|
+
| `tw2s` | Taiwan → Simplified |
|
|
48
|
+
| `s2twp` | Simplified → Traditional → Taiwan |
|
|
49
|
+
| `tw2sp` | Taiwan → Traditional → Simplified |
|
|
50
|
+
| `s2hk` | Simplified → Hong Kong |
|
|
51
|
+
| `hk2s` | Hong Kong → Simplified |
|
|
52
|
+
| `t2tw` | Traditional → Taiwan |
|
|
53
|
+
| `tw2t` | Taiwan → Traditional |
|
|
54
|
+
| `t2twp` | Traditional → Taiwan |
|
|
55
|
+
| `tw2tp` | Taiwan → Traditional |
|
|
56
|
+
| `t2hk` | Traditional → Hong Kong |
|
|
57
|
+
| `hk2t` | Hong Kong → Traditional |
|
|
58
|
+
| `t2jp` | Traditional → Japanese Kanji |
|
|
59
|
+
| `jp2t` | Japanese Kanji → Traditional |
|
|
60
|
+
|
|
61
|
+
---
|
|
62
|
+
|
|
63
|
+
## 📦 Installation
|
|
64
|
+
|
|
65
|
+
```bash
|
|
66
|
+
pip install opencc-purepy
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
## π Usage
|
|
70
|
+
|
|
71
|
+
### π Python
|
|
72
|
+
|
|
73
|
+
```python
|
|
74
|
+
from opencc_purepy import OpenCC
|
|
75
|
+
|
|
76
|
+
text = "“春眠不觉晓，处处闻啼鸟。”"
|
|
77
|
+
opencc = OpenCC("s2t")
|
|
78
|
+
converted = opencc.convert(text, punctuation=True)
|
|
79
|
+
print(converted) # 「春眠不覺曉，處處聞啼鳥。」
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
### 🖥 CLI
|
|
83
|
+
|
|
84
|
+
```sh
|
|
85
|
+
python -m opencc_purepy convert -i input.txt -o output.txt -c s2t -p
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
Or if installed as a script:
|
|
89
|
+
|
|
90
|
+
```bash
|
|
91
|
+
opencc-purepy convert -i input.txt -o output.txt -c s2t -p
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
## 🧩 API Reference
|
|
95
|
+
|
|
96
|
+
### Class: `OpenCC`
|
|
97
|
+
|
|
98
|
+
- `OpenCC(config: str = "s2t")`
|
|
99
|
+
- `config`: Conversion configuration (see above).
|
|
100
|
+
- `convert(input: str, punctuation: bool = False) -> str`
|
|
101
|
+
- Convert text with optional punctuation conversion.
|
|
102
|
+
- `zho_check(input: str) -> int`
|
|
103
|
+
- Detects the code of the input text.
|
|
104
|
+
- 1 - Traditional,
|
|
105
|
+
- 2 - Simplified,
|
|
106
|
+
- 0 - others
|
|
107
|
+
|
|
108
|
+
## π Development
|
|
109
|
+
|
|
110
|
+
- Python bindings: [opencc_purepy/__init__.py](https://github.com/laisuk/opencc_purepy/blob/master/opencc_purepy/__init__.py), [opencc_purepy/opencc_purepy.pyi](https://github.com/laisuk/opencc_purepy/blob/master/opencc_purepy/opencc_purepy.pyi)
|
|
111
|
+
- CLI: [opencc_purepy/__main__.py](https://github.com/laisuk/opencc_purepy/blob/master/opencc_purepy/__main__.py)
|
|
112
|
+
|
|
113
|
+
## π License
|
|
114
|
+
This project is licensed under the [MIT](https://github.com/laisuk/opencc_purepy/blob/master/LICENSE) License.
|
|
115
|
+
|
|
116
|
+
---
|
|
117
|
+
|
|
118
|
+
Powered by Pure Python and OpenCC Lexicons.
|
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
# opencc_purepy
|
|
2
|
+
|
|
3
|
+
[](https://pypi.org/project/opencc-purepy/)
|
|
4
|
+
[](https://github.com/laisuk/opencc_pyo3/blob/main/LICENSE)
|
|
5
|
+
|
|
6
|
+
**`opencc_purepy`** is a **pure Python implementation** of OpenCC (Open Chinese Convert), enabling conversion between different Chinese text variants such as Simplified, Traditional, Hong Kong, Taiwan, and Japanese Kanji.
|
|
7
|
+
It uses dictionary-based segmentation and mapping logic inspired by [BYVoid/OpenCC](https://github.com/BYVoid/OpenCC).
|
|
8
|
+
|
|
9
|
+
---
|
|
10
|
+
|
|
11
|
+
## 🔧 Features
|
|
12
|
+
|
|
13
|
+
- ✅ Pure Python – no native dependencies
|
|
14
|
+
- π Supports conversion between multiple Chinese locales:
|
|
15
|
+
- Simplified ↔ Traditional
|
|
16
|
+
- Traditional ↔ Hong Kong / Taiwan / Japanese
|
|
17
|
+
- ✨ Optional punctuation style conversion
|
|
18
|
+
- 🧠 Automatic simplified/traditional code detection
|
|
19
|
+
|
|
20
|
+
---
|
|
21
|
+
|
|
22
|
+
## π Supported Conversion Configs
|
|
23
|
+
|
|
24
|
+
| Code | Description |
|
|
25
|
+
|----------|--------------------------------------|
|
|
26
|
+
| `s2t` | Simplified → Traditional |
|
|
27
|
+
| `t2s` | Traditional → Simplified |
|
|
28
|
+
| `s2tw` | Simplified → Traditional (Taiwan) |
|
|
29
|
+
| `tw2s` | Taiwan → Simplified |
|
|
30
|
+
| `s2twp` | Simplified → Traditional → Taiwan |
|
|
31
|
+
| `tw2sp` | Taiwan → Traditional → Simplified |
|
|
32
|
+
| `s2hk` | Simplified → Hong Kong |
|
|
33
|
+
| `hk2s` | Hong Kong → Simplified |
|
|
34
|
+
| `t2tw` | Traditional → Taiwan |
|
|
35
|
+
| `tw2t` | Taiwan → Traditional |
|
|
36
|
+
| `t2twp` | Traditional → Taiwan |
|
|
37
|
+
| `tw2tp` | Taiwan → Traditional |
|
|
38
|
+
| `t2hk` | Traditional → Hong Kong |
|
|
39
|
+
| `hk2t` | Hong Kong → Traditional |
|
|
40
|
+
| `t2jp` | Traditional → Japanese Kanji |
|
|
41
|
+
| `jp2t` | Japanese Kanji → Traditional |
|
|
42
|
+
|
|
43
|
+
---
|
|
44
|
+
|
|
45
|
+
## 📦 Installation
|
|
46
|
+
|
|
47
|
+
```bash
|
|
48
|
+
pip install opencc-purepy
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
## π Usage
|
|
52
|
+
|
|
53
|
+
### π Python
|
|
54
|
+
|
|
55
|
+
```python
|
|
56
|
+
from opencc_purepy import OpenCC
|
|
57
|
+
|
|
58
|
+
text = "“春眠不觉晓，处处闻啼鸟。”"
|
|
59
|
+
opencc = OpenCC("s2t")
|
|
60
|
+
converted = opencc.convert(text, punctuation=True)
|
|
61
|
+
print(converted) # 「春眠不覺曉，處處聞啼鳥。」
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
### 🖥 CLI
|
|
65
|
+
|
|
66
|
+
```sh
|
|
67
|
+
python -m opencc_purepy convert -i input.txt -o output.txt -c s2t -p
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
Or if installed as a script:
|
|
71
|
+
|
|
72
|
+
```bash
|
|
73
|
+
opencc-purepy convert -i input.txt -o output.txt -c s2t -p
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
## 🧩 API Reference
|
|
77
|
+
|
|
78
|
+
### Class: `OpenCC`
|
|
79
|
+
|
|
80
|
+
- `OpenCC(config: str = "s2t")`
|
|
81
|
+
- `config`: Conversion configuration (see above).
|
|
82
|
+
- `convert(input: str, punctuation: bool = False) -> str`
|
|
83
|
+
- Convert text with optional punctuation conversion.
|
|
84
|
+
- `zho_check(input: str) -> int`
|
|
85
|
+
- Detects the code of the input text.
|
|
86
|
+
- 1 - Traditional,
|
|
87
|
+
- 2 - Simplified,
|
|
88
|
+
- 0 - others
|
|
89
|
+
|
|
90
|
+
## π Development
|
|
91
|
+
|
|
92
|
+
- Python bindings: [opencc_purepy/__init__.py](https://github.com/laisuk/opencc_purepy/blob/master/opencc_purepy/__init__.py), [opencc_purepy/opencc_purepy.pyi](https://github.com/laisuk/opencc_purepy/blob/master/opencc_purepy/opencc_purepy.pyi)
|
|
93
|
+
- CLI: [opencc_purepy/__main__.py](https://github.com/laisuk/opencc_purepy/blob/master/opencc_purepy/__main__.py)
|
|
94
|
+
|
|
95
|
+
## π License
|
|
96
|
+
This project is licensed under the [MIT](https://github.com/laisuk/opencc_purepy/blob/master/LICENSE) License.
|
|
97
|
+
|
|
98
|
+
---
|
|
99
|
+
|
|
100
|
+
Powered by Pure Python and OpenCC Lexicons.
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
from __future__ import print_function
|
|
2
|
+
|
|
3
|
+
import argparse
|
|
4
|
+
import sys
|
|
5
|
+
from . import dictgen_cmd
|
|
6
|
+
from . import convert_cmd # We'll move your current logic into convert_cmd.py
|
|
7
|
+
|
|
8
|
+
def main(argv=None):
    """Parse the command line and dispatch to the selected sub-command.

    Two sub-commands are registered: ``convert`` (handled by
    ``convert_cmd.main``) and ``dictgen`` (handled by ``dictgen_cmd.main``).

    Args:
        argv: Optional list of argument strings, without the program name.
            ``None`` (the default) makes argparse read ``sys.argv[1:]``, so
            existing zero-argument callers behave exactly as before.

    Returns:
        The value returned by the selected handler; the module entry guard
        passes it to ``sys.exit``.
    """
    parser = argparse.ArgumentParser(
        prog='opencc_purepy',
        description='OpenCC CLI with multiple tools',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )

    # A sub-command is mandatory; argparse exits with a usage error otherwise.
    subparsers = parser.add_subparsers(dest='command', required=True)

    # ---- convert subcommand ----
    parser_convert = subparsers.add_parser('convert', help='Convert text using OpenCC')
    parser_convert.add_argument('-i', '--input', metavar='<file>', help='Input file')
    parser_convert.add_argument('-o', '--output', metavar='<file>', help='Output file')
    parser_convert.add_argument('-c', '--config', metavar='<conversion>', help='Conversion configuration')
    parser_convert.add_argument(
        '-p', '--punct',
        action='store_true',
        default=False,
        help='Punctuation conversion: True/False'
    )
    parser_convert.add_argument('--in-enc', metavar='<encoding>', default='UTF-8', help='Input encoding')
    parser_convert.add_argument('--out-enc', metavar='<encoding>', default='UTF-8', help='Output encoding')
    parser_convert.set_defaults(func=convert_cmd.main)

    # ---- dictgen subcommand ----
    parser_dictgen = subparsers.add_parser('dictgen', help='Generate dictionary')
    parser_dictgen.add_argument(
        "-f", "--format",
        choices=["json"],
        default="json",
        help="Dictionary format: [json]"
    )
    parser_dictgen.add_argument(
        "-o", "--output",
        metavar="<filename>",
        help="Write generated dictionary to <filename>. If not specified, a default filename is used."
    )
    parser_dictgen.set_defaults(func=dictgen_cmd.main)

    # Each sub-parser stored its handler under ``func`` via set_defaults.
    args = parser.parse_args(argv)
    return args.func(args)
|
|
44
|
+
|
|
45
|
+
if __name__ == '__main__':
|
|
46
|
+
sys.exit(main())
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
import io
|
|
2
|
+
import sys
|
|
3
|
+
from opencc_purepy import OpenCC
|
|
4
|
+
|
|
5
|
+
def main(args):
    """Run the ``convert`` sub-command.

    Reads text from ``args.input`` (standard input when not given), converts
    it with ``OpenCC(args.config)`` and writes the result to ``args.output``
    (standard output when not given).

    Args:
        args: Parsed argparse namespace providing ``config``, ``input``,
            ``output``, ``punct``, ``in_enc`` and ``out_enc``.

    Returns:
        ``0`` on success, ``1`` when no conversion configuration was given.
    """
    if args.config is None:
        print("Please specify conversion.", file=sys.stderr)
        return 1

    opencc = OpenCC(args.config)

    # Prompt only if reading from stdin, and it's interactive (i.e., not piped or redirected)
    if args.input is None and sys.stdin.isatty():
        print("Input text to convert, <Ctrl+Z> (Windows) or <Ctrl+D> (Unix) then Enter to submit:", file=sys.stderr)

    # When falling back to descriptor 0/1, wrap it with closefd=False so that
    # leaving the ``with`` block does not close the process's real
    # stdin/stdout (which would break any later I/O when main() is called
    # from other code). closefd=False is only legal for descriptors, hence
    # the two separate open calls.
    if args.input:
        reader = io.open(args.input, encoding=args.in_enc)
    else:
        reader = io.open(0, encoding=args.in_enc, closefd=False)
    with reader as f:
        input_str = f.read()

    output_str = opencc.convert(input_str, args.punct)

    if args.output:
        writer = io.open(args.output, 'w', encoding=args.out_enc)
    else:
        writer = io.open(1, 'w', encoding=args.out_enc, closefd=False)
    with writer as f:
        f.write(output_str)

    in_from = args.input if args.input else "<stdin>"
    out_to = args.output if args.output else "stdout"
    # Keep the status line out of logs and pipelines: only show it on a terminal.
    if sys.stderr.isatty():
        print(f"Conversion completed ({args.config}): {in_from} -> {out_to}", file=sys.stderr)

    return 0
|
|
@@ -0,0 +1,362 @@
|
|
|
1
|
+
import re
|
|
2
|
+
from typing import List, Dict, Tuple
|
|
3
|
+
from .dictionary_lib import DictionaryMaxlength
|
|
4
|
+
|
|
5
|
+
DELIMITERS = set(
|
|
6
|
+
" \t\n\r!\"#$%&'()*+,-./:;<=>?@[\\]^_{}|~οΌγγββββγγγγοΉοΉβοΌοΌοΌγγγγοΌοΌβ¦οΌοΌΌοΈοΈοΈοΈοΈΏοΉοΈΉοΈΊοΈοΈοΌ»οΉοΌ½οΉοΈοΈοΈ°οΈ³οΈ΄οΈ½οΈΎοΈ΅οΈΆο½οΈ·ο½οΈΈοΉοΉγοΈ»γοΈΌγο½οΌοΌοΌοΌ")
|
|
7
|
+
STRIP_REGEX = re.compile(r"[!-/:-@\[-`{-~\t\n\v\f\r 0-9A-Za-z_]")
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class DictRefs:
    """Ordered set of up to three dictionary rounds for one conversion.

    Round 1 is mandatory; rounds 2 and 3 are optional and are attached with
    the chainable ``with_round_2`` / ``with_round_3`` methods.
    """

    def __init__(self, round_1):
        self.round_1, self.round_2, self.round_3 = round_1, None, None

    def with_round_2(self, round_2):
        """Attach the second round and return ``self`` for chaining."""
        self.round_2 = round_2
        return self

    def with_round_3(self, round_3):
        """Attach the third round and return ``self`` for chaining."""
        self.round_3 = round_3
        return self

    def apply_segment_replace(self, input_text, segment_replace):
        """Feed the text through every configured round, in order.

        ``segment_replace`` is called as ``segment_replace(text, round)``;
        the result of one round is the input of the next. Optional rounds
        that are unset (or otherwise falsy) are skipped.
        """
        converted = segment_replace(input_text, self.round_1)
        for optional_round in (self.round_2, self.round_3):
            if optional_round:
                converted = segment_replace(converted, optional_round)
        return converted
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class OpenCC:
    """Dictionary-based converter between Chinese text variants.

    An instance is bound to one conversion configuration (e.g. ``"s2t"``);
    :meth:`convert` dispatches to the method of the same name. A conversion
    consists of one or more rounds. In each round the text is split after
    every delimiter character and each chunk is rewritten by greedy
    longest-match lookup in that round's dictionaries.

    The most recent error message is kept on the instance and can be read
    with :meth:`get_last_error`.
    """

    # Every configuration name accepted by the constructor and by convert().
    # Each name is also the name of the method implementing it.
    _CONFIGS = (
        "s2t", "t2s", "s2tw", "tw2s", "s2twp", "tw2sp", "s2hk", "hk2s",
        "t2tw", "tw2t", "t2twp", "tw2tp", "t2hk", "hk2t", "t2jp", "jp2t",
    )
    # Configurations whose method accepts the ``punctuation`` flag.
    _PUNCT_CONFIGS = frozenset(
        ("s2t", "t2s", "s2tw", "tw2s", "s2twp", "tw2sp", "s2hk", "hk2s")
    )
    # Quote-mark tables for str.translate, written as escapes so the source
    # stays correct whatever encoding the file is viewed in:
    # curly double/single quotes <-> corner brackets.
    _PUNCT_S2T = {
        0x201C: "\u300c",
        0x201D: "\u300d",
        0x2018: "\u300e",
        0x2019: "\u300f",
    }
    _PUNCT_T2S = {
        0x300C: "\u201c",
        0x300D: "\u201d",
        0x300E: "\u2018",
        0x300F: "\u2019",
    }
    # Class-level default so get_last_error() never raises AttributeError.
    _last_error = None

    def __init__(self, config=None):
        """Create a converter.

        Args:
            config: Configuration name, matched case-insensitively. An
                unknown (or missing) name is recorded as an error and the
                converter falls back to ``"s2t"``.
        """
        self._last_error = None
        normalized = config.lower() if isinstance(config, str) else config
        if normalized in self._CONFIGS:
            self.config = normalized
        else:
            self._last_error = f"Invalid config: {config}"
            self.config = "s2t"
        try:
            self.dictionary = DictionaryMaxlength.new()
        except Exception as e:
            # Best effort: remember why loading failed and continue with a
            # default-constructed dictionary object.
            self._last_error = str(e)
            self.dictionary = DictionaryMaxlength()

        self.delimiters = DELIMITERS

    def get_last_error(self):
        """Return the most recent error message, or ``None`` if there is none."""
        return self._last_error

    def segment_replace(self, text: str, dictionaries: List[Tuple[Dict[str, str], int]]) -> str:
        """Convert ``text`` chunk by chunk with one round of dictionaries.

        Args:
            text: Text to convert.
            dictionaries: ``(mapping, max_key_length)`` pairs, in priority order.

        Returns:
            The converted text.
        """
        max_word_length = max((length for _, length in dictionaries), default=1)
        return "".join(
            self.convert_by(list(text[start:end]), dictionaries, max_word_length)
            for start, end in self.get_split_ranges(text)
        )

    def convert_by(self, text_chars: List[str], dictionaries, max_word_length: int) -> str:
        """Rewrite one chunk by greedy longest-match dictionary lookup.

        At each position the longest candidate (at most ``max_word_length``
        characters) is tried first; for a given length the dictionaries are
        consulted in order and the first hit wins. A character with no match
        is copied through unchanged.
        """
        if not text_chars:
            return ""

        delimiters = self.delimiters
        # A chunk that is just one delimiter needs no lookup.
        if len(text_chars) == 1 and text_chars[0] in delimiters:
            return text_chars[0]

        result = []
        i = 0
        text_chars_len = len(text_chars)
        while i < text_chars_len:
            # Fallback when nothing matches: copy the single character.
            best_match = text_chars[i]
            best_length = 1
            found = False
            for length in range(min(max_word_length, text_chars_len - i), 0, -1):
                word = "".join(text_chars[i:i + length])
                for d, _ in dictionaries:
                    match = d.get(word)
                    if match is not None:
                        best_match = match
                        best_length = length
                        found = True
                        break
                if found:
                    break
            result.append(best_match)
            i += best_length
        return "".join(result)

    def get_split_ranges(self, text: str) -> List[Tuple[int, int]]:
        """
        Returns a list of (start, end) index tuples, where each tuple represents
        the start (inclusive) and end (exclusive) indices of a chunk in the text.
        A chunk ends right after a delimiter, so the delimiter is its last character.
        """
        ranges = []
        start = 0
        for i, ch in enumerate(text):
            if ch in self.delimiters:
                ranges.append((start, i + 1))  # include the delimiter
                start = i + 1
        if start < len(text):
            ranges.append((start, len(text)))
        return ranges

    def _apply_rounds(self, input_text: str, *rounds) -> str:
        """Run ``input_text`` through each round of dictionaries in order."""
        output = input_text
        for dictionaries in rounds:
            output = self.segment_replace(output, dictionaries)
        return output

    def s2t(self, input_text: str, punctuation: bool = False) -> str:
        """Convert with st_phrases + st_characters; empty input is recorded as an error."""
        if not input_text:
            self._last_error = "Input text is empty"
            return ""
        d = self.dictionary
        output = self._apply_rounds(input_text, [d.st_phrases, d.st_characters])
        return self.convert_punctuation(output, "s") if punctuation else output

    def t2s(self, input_text: str, punctuation: bool = False) -> str:
        """Convert with ts_phrases + ts_characters."""
        d = self.dictionary
        output = self._apply_rounds(input_text, [d.ts_phrases, d.ts_characters])
        return self.convert_punctuation(output, "t") if punctuation else output

    def s2tw(self, input_text: str, punctuation: bool = False) -> str:
        """Convert with st_phrases + st_characters, then tw_variants."""
        d = self.dictionary
        output = self._apply_rounds(
            input_text,
            [d.st_phrases, d.st_characters],
            [d.tw_variants],
        )
        return self.convert_punctuation(output, "s") if punctuation else output

    def tw2s(self, input_text: str, punctuation: bool = False) -> str:
        """Convert with the reverse Taiwan variant tables, then ts_phrases + ts_characters."""
        d = self.dictionary
        output = self._apply_rounds(
            input_text,
            [d.tw_variants_rev_phrases, d.tw_variants_rev],
            [d.ts_phrases, d.ts_characters],
        )
        return self.convert_punctuation(output, "t") if punctuation else output

    def s2twp(self, input_text: str, punctuation: bool = False) -> str:
        """Convert with st_phrases + st_characters, then tw_phrases, then tw_variants."""
        d = self.dictionary
        output = self._apply_rounds(
            input_text,
            [d.st_phrases, d.st_characters],
            [d.tw_phrases],
            [d.tw_variants],
        )
        return self.convert_punctuation(output, "s") if punctuation else output

    def tw2sp(self, input_text: str, punctuation: bool = False) -> str:
        """Convert with the reverse Taiwan phrase/variant tables, then ts_phrases + ts_characters."""
        d = self.dictionary
        output = self._apply_rounds(
            input_text,
            [d.tw_phrases_rev, d.tw_variants_rev_phrases, d.tw_variants_rev],
            [d.ts_phrases, d.ts_characters],
        )
        return self.convert_punctuation(output, "t") if punctuation else output

    def s2hk(self, input_text: str, punctuation: bool = False) -> str:
        """Convert with st_phrases + st_characters, then hk_variants."""
        d = self.dictionary
        output = self._apply_rounds(
            input_text,
            [d.st_phrases, d.st_characters],
            [d.hk_variants],
        )
        return self.convert_punctuation(output, "s") if punctuation else output

    def hk2s(self, input_text: str, punctuation: bool = False) -> str:
        """Convert with the reverse Hong Kong variant tables, then ts_phrases + ts_characters."""
        d = self.dictionary
        output = self._apply_rounds(
            input_text,
            [d.hk_variants_rev_phrases, d.hk_variants_rev],
            [d.ts_phrases, d.ts_characters],
        )
        return self.convert_punctuation(output, "t") if punctuation else output

    def t2tw(self, input_text: str) -> str:
        """Convert with tw_variants."""
        return self._apply_rounds(input_text, [self.dictionary.tw_variants])

    def t2twp(self, input_text: str) -> str:
        """Convert with tw_phrases, then tw_variants."""
        d = self.dictionary
        return self._apply_rounds(input_text, [d.tw_phrases], [d.tw_variants])

    def tw2t(self, input_text: str) -> str:
        """Convert with tw_variants_rev_phrases + tw_variants_rev."""
        d = self.dictionary
        return self._apply_rounds(
            input_text, [d.tw_variants_rev_phrases, d.tw_variants_rev]
        )

    def tw2tp(self, input_text: str) -> str:
        """Convert with tw_variants_rev_phrases + tw_variants_rev, then tw_phrases_rev."""
        d = self.dictionary
        return self._apply_rounds(
            input_text,
            [d.tw_variants_rev_phrases, d.tw_variants_rev],
            [d.tw_phrases_rev],
        )

    def t2hk(self, input_text: str) -> str:
        """Convert with hk_variants."""
        return self._apply_rounds(input_text, [self.dictionary.hk_variants])

    def hk2t(self, input_text: str) -> str:
        """Convert with hk_variants_rev_phrases + hk_variants_rev."""
        d = self.dictionary
        return self._apply_rounds(
            input_text, [d.hk_variants_rev_phrases, d.hk_variants_rev]
        )

    def t2jp(self, input_text: str) -> str:
        """Convert with jp_variants."""
        return self._apply_rounds(input_text, [self.dictionary.jp_variants])

    def jp2t(self, input_text: str) -> str:
        """Convert with jps_phrases + jps_characters + jp_variants_rev."""
        d = self.dictionary
        return self._apply_rounds(
            input_text, [d.jps_phrases, d.jps_characters, d.jp_variants_rev]
        )

    def convert(self, input_text: str, punctuation: bool = False) -> str:
        """Convert ``input_text`` according to ``self.config``.

        Args:
            input_text: Text to convert.
            punctuation: Also convert quotation marks. Only honoured by the
                configurations that convert to or from Simplified Chinese;
                the others ignore it.

        Returns:
            The converted text. On an unknown configuration or a failure
            during conversion, the error message is stored for
            :meth:`get_last_error` and that message is returned instead.
        """
        config = self.config.lower()
        try:
            if config not in self._CONFIGS:
                self._last_error = f"Invalid config: {config}"
                return self._last_error
            method = getattr(self, config)
            if config in self._PUNCT_CONFIGS:
                return method(input_text, punctuation)
            return method(input_text)
        except Exception as e:
            self._last_error = f"Conversion failed: {e}"
            return self._last_error

    def st(self, input_text: str) -> str:
        """Character-by-character conversion using only ``st_characters``."""
        return self.convert_by(list(input_text), [self.dictionary.st_characters], 1)

    def ts(self, input_text: str) -> str:
        """Character-by-character conversion using only ``ts_characters``."""
        return self.convert_by(list(input_text), [self.dictionary.ts_characters], 1)

    def zho_check(self, input_text: str) -> int:
        """Classify text by which single-character conversion changes it.

        Characters matched by ``STRIP_REGEX`` are removed and a leading
        sample of the remainder is tested.

        Returns:
            ``1`` if :meth:`ts` changes the sample, ``2`` if :meth:`st`
            changes it, ``0`` otherwise (including empty input).
        """
        if not input_text:
            return 0

        stripped = STRIP_REGEX.sub("", input_text)
        # NOTE(review): find_max_utf8_length returns a BYTE count, but it is
        # used here as a CHARACTER index, so the sample is up to 200
        # characters rather than 200 bytes. Kept as-is so detection results
        # do not change; confirm which limit is intended.
        max_chars = find_max_utf8_length(stripped, 200)
        strip_text = stripped[:max_chars]

        if strip_text != self.ts(strip_text):
            return 1
        if strip_text != self.st(strip_text):
            return 2
        return 0

    @staticmethod
    def convert_punctuation(input_text: str, config: str) -> str:
        """Swap quotation-mark styles.

        Args:
            input_text: Text to rewrite.
            config: Source style. A value starting with ``"s"`` turns curly
                quotes into corner brackets; anything else does the reverse.

        Returns:
            ``input_text`` with the quotation marks replaced.
        """
        table = OpenCC._PUNCT_S2T if config.startswith("s") else OpenCC._PUNCT_T2S
        return input_text.translate(table)
|
|
352
|
+
|
|
353
|
+
|
|
354
|
+
def find_max_utf8_length(s: str, max_byte_count: int) -> int:
    """Return how many leading UTF-8 bytes of ``s`` fit in a byte budget.

    The result is the largest byte length not exceeding ``max_byte_count``
    that ends on a character boundary, so cutting the encoded string there
    never splits a multi-byte character.

    Args:
        s: Text to measure.
        max_byte_count: Byte budget. Zero or a negative value yields ``0``.

    Returns:
        A length in bytes of the UTF-8 encoding (not a character count).
    """
    if max_byte_count <= 0:
        return 0

    # Every character encodes to at least one byte, so the first
    # ``max_byte_count + 1`` characters already exceed the budget whenever
    # the full string would. Encoding only that prefix gives the same answer
    # without encoding an arbitrarily long text.
    encoded = s[:max_byte_count + 1].encode('utf-8')
    if len(encoded) <= max_byte_count:
        return len(encoded)

    # Step back while the byte just past the cut is a continuation byte
    # (bit pattern 10xxxxxx), i.e. while the cut is inside a character.
    byte_count = max_byte_count
    while byte_count > 0 and (encoded[byte_count] & 0b11000000) == 0b10000000:
        byte_count -= 1
    return byte_count
|