opencc-purepy 1.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34) hide show
  1. opencc_purepy-1.0.0/LICENSE +21 -0
  2. opencc_purepy-1.0.0/PKG-INFO +118 -0
  3. opencc_purepy-1.0.0/README.md +100 -0
  4. opencc_purepy-1.0.0/opencc_purepy/__init__.py +6 -0
  5. opencc_purepy-1.0.0/opencc_purepy/__main__.py +46 -0
  6. opencc_purepy-1.0.0/opencc_purepy/convert_cmd.py +28 -0
  7. opencc_purepy-1.0.0/opencc_purepy/core.py +362 -0
  8. opencc_purepy-1.0.0/opencc_purepy/dictgen_cmd.py +18 -0
  9. opencc_purepy-1.0.0/opencc_purepy/dictionary_lib.py +87 -0
  10. opencc_purepy-1.0.0/opencc_purepy/dicts/HKVariants.txt +63 -0
  11. opencc_purepy-1.0.0/opencc_purepy/dicts/HKVariantsRev.txt +70 -0
  12. opencc_purepy-1.0.0/opencc_purepy/dicts/HKVariantsRevPhrases.txt +156 -0
  13. opencc_purepy-1.0.0/opencc_purepy/dicts/JPShinjitaiCharacters.txt +7 -0
  14. opencc_purepy-1.0.0/opencc_purepy/dicts/JPShinjitaiPhrases.txt +176 -0
  15. opencc_purepy-1.0.0/opencc_purepy/dicts/JPVariants.txt +369 -0
  16. opencc_purepy-1.0.0/opencc_purepy/dicts/JPVariantsRev.txt +371 -0
  17. opencc_purepy-1.0.0/opencc_purepy/dicts/STCharacters.txt +3980 -0
  18. opencc_purepy-1.0.0/opencc_purepy/dicts/STPhrases.txt +49097 -0
  19. opencc_purepy-1.0.0/opencc_purepy/dicts/TSCharacters.txt +4113 -0
  20. opencc_purepy-1.0.0/opencc_purepy/dicts/TSPhrases.txt +281 -0
  21. opencc_purepy-1.0.0/opencc_purepy/dicts/TWPhrases.txt +512 -0
  22. opencc_purepy-1.0.0/opencc_purepy/dicts/TWPhrasesRev.txt +541 -0
  23. opencc_purepy-1.0.0/opencc_purepy/dicts/TWVariants.txt +39 -0
  24. opencc_purepy-1.0.0/opencc_purepy/dicts/TWVariantsRev.txt +39 -0
  25. opencc_purepy-1.0.0/opencc_purepy/dicts/TWVariantsRevPhrases.txt +68 -0
  26. opencc_purepy-1.0.0/opencc_purepy/dicts/dictionary_maxlength.json +59943 -0
  27. opencc_purepy-1.0.0/opencc_purepy.egg-info/PKG-INFO +118 -0
  28. opencc_purepy-1.0.0/opencc_purepy.egg-info/SOURCES.txt +32 -0
  29. opencc_purepy-1.0.0/opencc_purepy.egg-info/dependency_links.txt +1 -0
  30. opencc_purepy-1.0.0/opencc_purepy.egg-info/entry_points.txt +2 -0
  31. opencc_purepy-1.0.0/opencc_purepy.egg-info/top_level.txt +1 -0
  32. opencc_purepy-1.0.0/pyproject.toml +33 -0
  33. opencc_purepy-1.0.0/setup.cfg +4 -0
  34. opencc_purepy-1.0.0/tests/test_basic.py +41 -0
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2018 The Python Packaging Authority
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,118 @@
1
+ Metadata-Version: 2.4
2
+ Name: opencc-purepy
3
+ Version: 1.0.0
4
+ Summary: Pure Python implementation of OpenCC for Chinese text conversion
5
+ Author-email: laisuk <laisuk@yahoo.com>
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/laisuk/opencc_purepy
8
+ Project-URL: Issues, https://github.com/laisuk/opencc_purepy/issues
9
+ Keywords: opencc,chinese,text,conversion,pure-python
10
+ Classifier: Programming Language :: Python :: 3
11
+ Classifier: License :: OSI Approved :: MIT License
12
+ Classifier: Operating System :: OS Independent
13
+ Classifier: Topic :: Text Processing :: Linguistic
14
+ Requires-Python: >=3.7
15
+ Description-Content-Type: text/markdown
16
+ License-File: LICENSE
17
+ Dynamic: license-file
18
+
19
+ # opencc_purepy
20
+
21
+ [![PyPI version](https://img.shields.io/pypi/v/opencc-purepy.svg)](https://pypi.org/project/opencc-purepy/)
22
+ [![License](https://img.shields.io/github/license/laisuk/opencc_purepy)](https://github.com/laisuk/opencc_purepy/blob/master/LICENSE)
23
+
24
+ **`opencc_purepy`** is a **pure Python implementation** of OpenCC (Open Chinese Convert), enabling conversion between different Chinese text variants such as Simplified, Traditional, Hong Kong, Taiwan, and Japanese Kanji.
25
+ It uses dictionary-based segmentation and mapping logic inspired by [BYVoid/OpenCC](https://github.com/BYVoid/OpenCC).
26
+
27
+ ---
28
+
29
+ ## 🔧 Features
30
+
31
+ - ✅ Pure Python – no native dependencies
32
+ - 🔄 Supports conversion between multiple Chinese locales:
33
+ - Simplified ↔ Traditional
34
+ - Traditional ↔ Hong Kong / Taiwan / Japanese
35
+ - ✨ Optional punctuation style conversion
36
+ - 🧠 Automatic simplified/traditional code detection
37
+
38
+ ---
39
+
40
+ ## 🔁 Supported Conversion Configs
41
+
42
+ | Code | Description |
43
+ |----------|--------------------------------------|
44
+ | `s2t` | Simplified → Traditional |
45
+ | `t2s` | Traditional → Simplified |
46
+ | `s2tw` | Simplified → Traditional (Taiwan) |
47
+ | `tw2s` | Taiwan → Simplified |
48
+ | `s2twp` | Simplified → Traditional → Taiwan |
49
+ | `tw2sp` | Taiwan → Traditional → Simplified |
50
+ | `s2hk` | Simplified → Hong Kong |
51
+ | `hk2s` | Hong Kong → Simplified |
52
+ | `t2tw` | Traditional → Taiwan |
53
+ | `tw2t` | Taiwan → Traditional |
54
+ | `t2twp` | Traditional → Taiwan (with phrases) |
55
+ | `tw2tp` | Taiwan → Traditional (with phrases) |
56
+ | `t2hk` | Traditional → Hong Kong |
57
+ | `hk2t` | Hong Kong → Traditional |
58
+ | `t2jp` | Traditional → Japanese Kanji |
59
+ | `jp2t` | Japanese Kanji → Traditional |
60
+
61
+ ---
62
+
63
+ ## 📦 Installation
64
+
65
+ ```bash
66
+ pip install opencc-purepy
67
+ ```
68
+
69
+ ## 🚀 Usage
70
+
71
+ ### 🐍 Python
72
+
73
+ ```python
74
+ from opencc_purepy import OpenCC
75
+
76
+ text = "“春眠不觉晓,处处闻啼鸟。”"
77
+ opencc = OpenCC("s2t")
78
+ converted = opencc.convert(text, punctuation=True)
79
+ print(converted) # 「春眠不覺曉,處處聞啼鳥。」
80
+ ```
81
+
82
+ ### 🖥 CLI
83
+
84
+ ```sh
85
+ python -m opencc_purepy convert -i input.txt -o output.txt -c s2t -p
86
+ ```
87
+
88
+ Or if installed as a script:
89
+
90
+ ```bash
91
+ opencc-purepy convert -i input.txt -o output.txt -c s2t -p
92
+ ```
93
+
94
+ ## 🧩 API Reference
95
+
96
+ ### Class: `OpenCC`
97
+
98
+ - `OpenCC(config: str = "s2t")`
99
+ - `config`: Conversion configuration (see above).
100
+ - `convert(input: str, punctuation: bool = False) -> str`
101
+ - Convert text with optional punctuation conversion.
102
+ - `zho_check(input: str) -> int`
103
+ - Detects the code of the input text.
104
+ - 1 - Traditional,
105
+ - 2 - Simplified,
106
+ - 0 - others
107
+
108
+ ## 🛠 Development
109
+
110
+ - Python bindings: [opencc_purepy/__init__.py](https://github.com/laisuk/opencc_purepy/blob/master/opencc_purepy/__init__.py), [opencc_purepy/opencc_purepy.pyi](https://github.com/laisuk/opencc_purepy/blob/master/opencc_purepy/opencc_purepy.pyi)
111
+ - CLI: [opencc_purepy/__main__.py](https://github.com/laisuk/opencc_purepy/blob/master/opencc_purepy/__main__.py)
112
+
113
+ ## 📄 License
114
+ This project is licensed under the [MIT](https://github.com/laisuk/opencc_purepy/blob/master/LICENSE) License.
115
+
116
+ ---
117
+
118
+ Powered by Pure Python and OpenCC Lexicons.
@@ -0,0 +1,100 @@
1
+ # opencc_purepy
2
+
3
+ [![PyPI version](https://img.shields.io/pypi/v/opencc-purepy.svg)](https://pypi.org/project/opencc-purepy/)
4
+ [![License](https://img.shields.io/github/license/laisuk/opencc_purepy)](https://github.com/laisuk/opencc_purepy/blob/master/LICENSE)
5
+
6
+ **`opencc_purepy`** is a **pure Python implementation** of OpenCC (Open Chinese Convert), enabling conversion between different Chinese text variants such as Simplified, Traditional, Hong Kong, Taiwan, and Japanese Kanji.
7
+ It uses dictionary-based segmentation and mapping logic inspired by [BYVoid/OpenCC](https://github.com/BYVoid/OpenCC).
8
+
9
+ ---
10
+
11
+ ## 🔧 Features
12
+
13
+ - ✅ Pure Python – no native dependencies
14
+ - 🔄 Supports conversion between multiple Chinese locales:
15
+ - Simplified ↔ Traditional
16
+ - Traditional ↔ Hong Kong / Taiwan / Japanese
17
+ - ✨ Optional punctuation style conversion
18
+ - 🧠 Automatic simplified/traditional code detection
19
+
20
+ ---
21
+
22
+ ## 🔁 Supported Conversion Configs
23
+
24
+ | Code | Description |
25
+ |----------|--------------------------------------|
26
+ | `s2t` | Simplified → Traditional |
27
+ | `t2s` | Traditional → Simplified |
28
+ | `s2tw` | Simplified → Traditional (Taiwan) |
29
+ | `tw2s` | Taiwan → Simplified |
30
+ | `s2twp` | Simplified → Traditional → Taiwan |
31
+ | `tw2sp` | Taiwan → Traditional → Simplified |
32
+ | `s2hk` | Simplified → Hong Kong |
33
+ | `hk2s` | Hong Kong → Simplified |
34
+ | `t2tw` | Traditional → Taiwan |
35
+ | `tw2t` | Taiwan → Traditional |
36
+ | `t2twp` | Traditional → Taiwan (with phrases) |
37
+ | `tw2tp` | Taiwan → Traditional (with phrases) |
38
+ | `t2hk` | Traditional → Hong Kong |
39
+ | `hk2t` | Hong Kong → Traditional |
40
+ | `t2jp` | Traditional → Japanese Kanji |
41
+ | `jp2t` | Japanese Kanji → Traditional |
42
+
43
+ ---
44
+
45
+ ## 📦 Installation
46
+
47
+ ```bash
48
+ pip install opencc-purepy
49
+ ```
50
+
51
+ ## 🚀 Usage
52
+
53
+ ### 🐍 Python
54
+
55
+ ```python
56
+ from opencc_purepy import OpenCC
57
+
58
+ text = "“春眠不觉晓,处处闻啼鸟。”"
59
+ opencc = OpenCC("s2t")
60
+ converted = opencc.convert(text, punctuation=True)
61
+ print(converted) # 「春眠不覺曉,處處聞啼鳥。」
62
+ ```
63
+
64
+ ### 🖥 CLI
65
+
66
+ ```sh
67
+ python -m opencc_purepy convert -i input.txt -o output.txt -c s2t -p
68
+ ```
69
+
70
+ Or if installed as a script:
71
+
72
+ ```bash
73
+ opencc-purepy convert -i input.txt -o output.txt -c s2t -p
74
+ ```
75
+
76
+ ## 🧩 API Reference
77
+
78
+ ### Class: `OpenCC`
79
+
80
+ - `OpenCC(config: str = "s2t")`
81
+ - `config`: Conversion configuration (see above).
82
+ - `convert(input: str, punctuation: bool = False) -> str`
83
+ - Convert text with optional punctuation conversion.
84
+ - `zho_check(input: str) -> int`
85
+ - Detects the code of the input text.
86
+ - 1 - Traditional,
87
+ - 2 - Simplified,
88
+ - 0 - others
89
+
90
+ ## 🛠 Development
91
+
92
+ - Python bindings: [opencc_purepy/__init__.py](https://github.com/laisuk/opencc_purepy/blob/master/opencc_purepy/__init__.py), [opencc_purepy/opencc_purepy.pyi](https://github.com/laisuk/opencc_purepy/blob/master/opencc_purepy/opencc_purepy.pyi)
93
+ - CLI: [opencc_purepy/__main__.py](https://github.com/laisuk/opencc_purepy/blob/master/opencc_purepy/__main__.py)
94
+
95
+ ## 📄 License
96
+ This project is licensed under the [MIT](https://github.com/laisuk/opencc_purepy/blob/master/LICENSE) License.
97
+
98
+ ---
99
+
100
+ Powered by Pure Python and OpenCC Lexicons.
@@ -0,0 +1,6 @@
1
+ ##########################################################
2
+ # Author: Bryan Lai
3
+ # GitHub: laisuk
4
+ # January, 2025
5
+ ##########################################################
6
+ from .core import OpenCC
@@ -0,0 +1,46 @@
1
+ from __future__ import print_function
2
+
3
+ import argparse
4
+ import sys
5
+ from . import dictgen_cmd
6
+ from . import convert_cmd # We'll move your current logic into convert_cmd.py
7
+
8
def _build_parser():
    """Create the top-level argument parser with its two subcommands.

    Each subcommand stores its handler in ``func`` via ``set_defaults`` so
    that ``main`` can dispatch without inspecting the command name.
    """
    root = argparse.ArgumentParser(
        prog='opencc_purepy',
        description='OpenCC CLI with multiple tools',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    # A subcommand is mandatory; argparse reports an error when it is omitted.
    commands = root.add_subparsers(dest='command', required=True)

    # ---- convert subcommand ----
    convert = commands.add_parser('convert', help='Convert text using OpenCC')
    convert_options = (
        (('-i', '--input'), dict(metavar='<file>', help='Input file')),
        (('-o', '--output'), dict(metavar='<file>', help='Output file')),
        (('-c', '--config'), dict(metavar='<conversion>', help='Conversion configuration')),
        (('-p', '--punct'), dict(action='store_true', default=False,
                                 help='Punctuation conversion: True/False')),
        (('--in-enc',), dict(metavar='<encoding>', default='UTF-8', help='Input encoding')),
        (('--out-enc',), dict(metavar='<encoding>', default='UTF-8', help='Output encoding')),
    )
    for flags, settings in convert_options:
        convert.add_argument(*flags, **settings)
    convert.set_defaults(func=convert_cmd.main)

    # ---- dictgen subcommand ----
    dictgen = commands.add_parser('dictgen', help='Generate dictionary')
    dictgen.add_argument(
        "-f", "--format",
        choices=["json"],
        default="json",
        help="Dictionary format: [json]",
    )
    dictgen.add_argument(
        "-o", "--output",
        metavar="<filename>",
        help="Write generated dictionary to <filename>. If not specified, a default filename is used.",
    )
    dictgen.set_defaults(func=dictgen_cmd.main)

    return root


def main():
    """Parse the command line and run the selected subcommand.

    Returns whatever the subcommand handler returns; the module entry point
    passes that value to ``sys.exit``.
    """
    parsed = _build_parser().parse_args()
    handler = parsed.func
    return handler(parsed)


if __name__ == '__main__':
    sys.exit(main())
@@ -0,0 +1,28 @@
1
+ import io
2
+ import sys
3
+ from opencc_purepy import OpenCC
4
+
5
def main(args):
    """Run the ``convert`` subcommand.

    Reads text from ``args.input`` (standard input when no file is given),
    converts it with ``OpenCC(args.config)`` and writes the result to
    ``args.output`` (standard output when no file is given).

    Args:
        args: Parsed arguments providing ``config``, ``input``, ``output``,
            ``punct``, ``in_enc`` and ``out_enc``.

    Returns:
        0 on success, 1 when no conversion config was supplied.
    """
    if args.config is None:
        print("Please specify conversion.", file=sys.stderr)
        return 1

    opencc = OpenCC(args.config)

    use_stdin = not args.input
    use_stdout = not args.output

    # Prompt only if reading from stdin, and it's interactive (i.e., not piped or redirected)
    if args.input is None and sys.stdin.isatty():
        print("Input text to convert, <Ctrl+Z> (Windows) or <Ctrl+D> (Unix) then Enter to submit:", file=sys.stderr)

    # File descriptors 0 and 1 belong to the process, not to this function:
    # closefd=False keeps them open when the wrapper is closed, so callers
    # (and later code) can still use stdin/stdout. Named files are closed
    # normally (closefd must be True when a path is given).
    with io.open(0 if use_stdin else args.input,
                 encoding=args.in_enc, closefd=not use_stdin) as f:
        input_str = f.read()

    output_str = opencc.convert(input_str, args.punct)

    with io.open(1 if use_stdout else args.output, 'w',
                 encoding=args.out_enc, closefd=not use_stdout) as f:
        f.write(output_str)

    in_from = args.input if args.input else "<stdin>"
    out_to = args.output if args.output else "stdout"
    # Keep the status line out of redirected/captured stderr.
    if sys.stderr.isatty():
        print(f"Conversion completed ({args.config}): {in_from} -> {out_to}", file=sys.stderr)

    return 0
@@ -0,0 +1,362 @@
1
+ import re
2
+ from typing import List, Dict, Tuple
3
+ from .dictionary_lib import DictionaryMaxlength
4
+
5
+ DELIMITERS = set(
6
+ " \t\n\r!\"#$%&'()*+,-./:;<=>?@[\\]^_{}|~οΌγ€γ€‚β€œβ€β€˜β€™γ€Žγ€γ€Œγ€οΉοΉ‚β€”οΌοΌˆοΌ‰γ€Šγ€‹γ€ˆγ€‰οΌŸοΌβ€¦οΌοΌΌοΈ’οΈ‘οΈ”οΈ“οΈΏοΉ€οΈΉοΈΊοΈ™οΈοΌ»οΉ‡οΌ½οΉˆοΈ•οΈ–οΈ°οΈ³οΈ΄οΈ½οΈΎοΈ΅οΈΆο½›οΈ·ο½οΈΈοΉƒοΉ„γ€οΈ»γ€‘οΈΌγ€€ο½žοΌŽοΌŒοΌ›οΌš")
7
+ STRIP_REGEX = re.compile(r"[!-/:-@\[-`{-~\t\n\v\f\r 0-9A-Za-z_]")
8
+
9
+
10
class DictRefs:
    """Ordered groups of dictionaries applied in up to three rounds.

    ``round_1`` is always applied; ``round_2`` and ``round_3`` are optional
    and are skipped when unset or empty.
    """

    def __init__(self, round_1):
        self.round_1 = round_1
        self.round_2 = None
        self.round_3 = None

    def with_round_2(self, round_2):
        """Set the second round and return ``self`` for chaining."""
        self.round_2 = round_2
        return self

    def with_round_3(self, round_3):
        """Set the third round and return ``self`` for chaining."""
        self.round_3 = round_3
        return self

    def apply_segment_replace(self, input_text, segment_replace):
        """Feed ``input_text`` through each configured round in order.

        ``segment_replace`` is called as ``segment_replace(text, dictionaries)``
        and its result becomes the input of the next round.
        """
        converted = segment_replace(input_text, self.round_1)
        for optional_round in (self.round_2, self.round_3):
            if optional_round:
                converted = segment_replace(converted, optional_round)
        return converted
31
+
32
+
33
class OpenCC:
    """Dictionary-driven converter between Chinese text variants.

    Each conversion runs one or more rounds of longest-match replacement.
    Every dictionary is used as a ``(mapping, max_key_length)`` pair read
    from ``self.dictionary``.
    """

    # config name -> (rounds, punctuation source style).
    # ``rounds`` is a tuple of dictionary-attribute-name groups applied in
    # order; the punctuation style is "s"/"t" for configs that support
    # punctuation conversion and None for those that do not.
    _CONVERSIONS = {
        "s2t": ((("st_phrases", "st_characters"),), "s"),
        "t2s": ((("ts_phrases", "ts_characters"),), "t"),
        "s2tw": ((("st_phrases", "st_characters"),
                  ("tw_variants",)), "s"),
        "tw2s": ((("tw_variants_rev_phrases", "tw_variants_rev"),
                  ("ts_phrases", "ts_characters")), "t"),
        "s2twp": ((("st_phrases", "st_characters"),
                   ("tw_phrases",),
                   ("tw_variants",)), "s"),
        "tw2sp": ((("tw_phrases_rev", "tw_variants_rev_phrases", "tw_variants_rev"),
                   ("ts_phrases", "ts_characters")), "t"),
        "s2hk": ((("st_phrases", "st_characters"),
                  ("hk_variants",)), "s"),
        "hk2s": ((("hk_variants_rev_phrases", "hk_variants_rev"),
                  ("ts_phrases", "ts_characters")), "t"),
        "t2tw": ((("tw_variants",),), None),
        "t2twp": ((("tw_phrases",),
                   ("tw_variants",)), None),
        "tw2t": ((("tw_variants_rev_phrases", "tw_variants_rev"),), None),
        "tw2tp": ((("tw_variants_rev_phrases", "tw_variants_rev"),
                   ("tw_phrases_rev",)), None),
        "t2hk": ((("hk_variants",),), None),
        "hk2t": ((("hk_variants_rev_phrases", "hk_variants_rev"),), None),
        "t2jp": ((("jp_variants",),), None),
        "jp2t": ((("jps_phrases", "jps_characters", "jp_variants_rev"),), None),
    }
    # Tuple (not the dict) is used for validation so unhashable config
    # values are rejected as invalid instead of raising TypeError.
    _CONFIG_LIST = tuple(_CONVERSIONS)

    # Quote-style translation tables, built once instead of per call.
    _PUNCT_S2T = str.maketrans("“”‘’", "「」『』")
    _PUNCT_T2S = str.maketrans("「」『』", "“”‘’")

    # Default so get_last_error() is safe before any error has been recorded.
    _last_error = None

    def __init__(self, config=None):
        """Create a converter.

        Args:
            config: One of the supported config names (e.g. ``"s2t"``).
                Any other value records an error and falls back to ``"s2t"``.
        """
        self._last_error = None
        if config in self._CONFIG_LIST:
            self.config = config
        else:
            self._last_error = f"Invalid config: {config}"
            self.config = "s2t"
        try:
            self.dictionary = DictionaryMaxlength.new()
        except Exception as e:
            # Best effort: remember why loading failed and continue with a
            # default-constructed dictionary object.
            self._last_error = str(e)
            self.dictionary = DictionaryMaxlength()

        self.delimiters = DELIMITERS

    def get_last_error(self):
        """Return the most recent error message, or None if none was recorded."""
        return self._last_error

    def segment_replace(self, text: str, dictionaries: List[Tuple[Dict[str, str], int]]) -> str:
        """Split ``text`` at delimiters and convert each chunk independently."""
        max_word_length = max((length for _, length in dictionaries), default=1)
        ranges = self.get_split_ranges(text)
        return "".join(
            self.convert_by(list(text[start:end]), dictionaries, max_word_length)
            for start, end in ranges
        )

    def convert_by(self, text_chars: List[str], dictionaries, max_word_length: int) -> str:
        """Greedy longest-match replacement over a list of characters.

        At each position the longest candidate (up to ``max_word_length``)
        is looked up in every dictionary in order; the first hit wins.
        Characters with no match are copied unchanged.
        """
        if not text_chars:
            return ""

        delimiters = self.delimiters  # Local variable for speed
        if len(text_chars) == 1 and text_chars[0] in delimiters:
            return text_chars[0]

        result = []
        i = 0
        text_chars_len = len(text_chars)
        while i < text_chars_len:
            best_match = None
            best_length = 0
            # Try the longest candidate first and shrink until something matches.
            for length in range(min(max_word_length, text_chars_len - i), 0, -1):
                word = "".join(text_chars[i:i + length])
                for d, _ in dictionaries:
                    match = d.get(word)
                    if match is not None:
                        best_match = match
                        best_length = length
                        break
                if best_length:
                    break
            if not best_length:
                best_match = text_chars[i]
                best_length = 1
            result.append(best_match)
            i += best_length
        return "".join(result)

    def get_split_ranges(self, text: str) -> List[Tuple[int, int]]:
        """
        Returns a list of (start, end) index tuples, where each tuple represents
        the start (inclusive) and end (exclusive) indices of a chunk in the text.
        Each chunk ends with (and includes) a delimiter, except possibly the last.
        """
        ranges = []
        start = 0
        for i, ch in enumerate(text):
            if ch in self.delimiters:
                ranges.append((start, i + 1))  # include the delimiter
                start = i + 1
        if start < len(text):
            ranges.append((start, len(text)))
        return ranges

    def _apply_config(self, name: str, input_text: str, punctuation: bool = False) -> str:
        """Run the dictionary rounds registered for ``name`` on ``input_text``.

        Punctuation is converted afterwards only when requested and when the
        config defines a punctuation source style.
        """
        rounds, punct_source = self._CONVERSIONS[name]
        output = input_text
        for attr_names in rounds:
            dictionaries = [getattr(self.dictionary, attr) for attr in attr_names]
            output = self.segment_replace(output, dictionaries)
        if punctuation and punct_source is not None:
            output = self.convert_punctuation(output, punct_source)
        return output

    def s2t(self, input_text: str, punctuation: bool = False) -> str:
        """Simplified -> Traditional. Empty input records an error and returns ""."""
        if not input_text:
            self._last_error = "Input text is empty"
            return ""
        return self._apply_config("s2t", input_text, punctuation)

    def t2s(self, input_text: str, punctuation: bool = False) -> str:
        """Traditional -> Simplified."""
        return self._apply_config("t2s", input_text, punctuation)

    def s2tw(self, input_text: str, punctuation: bool = False) -> str:
        """Simplified -> Traditional, then Taiwan variants."""
        return self._apply_config("s2tw", input_text, punctuation)

    def tw2s(self, input_text: str, punctuation: bool = False) -> str:
        """Taiwan variants -> Traditional -> Simplified."""
        return self._apply_config("tw2s", input_text, punctuation)

    def s2twp(self, input_text: str, punctuation: bool = False) -> str:
        """Simplified -> Traditional, then Taiwan phrases and variants."""
        return self._apply_config("s2twp", input_text, punctuation)

    def tw2sp(self, input_text: str, punctuation: bool = False) -> str:
        """Taiwan phrases and variants -> Traditional -> Simplified."""
        return self._apply_config("tw2sp", input_text, punctuation)

    def s2hk(self, input_text: str, punctuation: bool = False) -> str:
        """Simplified -> Traditional, then Hong Kong variants."""
        return self._apply_config("s2hk", input_text, punctuation)

    def hk2s(self, input_text: str, punctuation: bool = False) -> str:
        """Hong Kong variants -> Traditional -> Simplified."""
        return self._apply_config("hk2s", input_text, punctuation)

    def t2tw(self, input_text: str) -> str:
        """Traditional -> Taiwan variants."""
        return self._apply_config("t2tw", input_text)

    def t2twp(self, input_text: str) -> str:
        """Traditional -> Taiwan phrases, then Taiwan variants."""
        return self._apply_config("t2twp", input_text)

    def tw2t(self, input_text: str) -> str:
        """Taiwan variants -> Traditional."""
        return self._apply_config("tw2t", input_text)

    def tw2tp(self, input_text: str) -> str:
        """Taiwan variants -> Traditional, then reverse Taiwan phrases."""
        return self._apply_config("tw2tp", input_text)

    def t2hk(self, input_text: str) -> str:
        """Traditional -> Hong Kong variants."""
        return self._apply_config("t2hk", input_text)

    def hk2t(self, input_text: str) -> str:
        """Hong Kong variants -> Traditional."""
        return self._apply_config("hk2t", input_text)

    def t2jp(self, input_text: str) -> str:
        """Traditional -> Japanese variants."""
        return self._apply_config("t2jp", input_text)

    def jp2t(self, input_text: str) -> str:
        """Japanese phrases, characters and variants -> Traditional."""
        return self._apply_config("jp2t", input_text)

    def convert(self, input_text: str, punctuation: bool = False) -> str:
        """Convert ``input_text`` according to ``self.config``.

        ``punctuation`` is forwarded only to configs that support it. On an
        unknown config or any conversion failure the error message is
        recorded and returned in place of the converted text.
        """
        config = self.config.lower()
        try:
            plan = self._CONVERSIONS.get(config)
            if plan is None:
                self._last_error = f"Invalid config: {config}"
                return self._last_error
            # Dispatch through the public method so per-config behaviour
            # (and subclass overrides) is preserved.
            converter = getattr(self, config)
            if plan[1] is None:
                return converter(input_text)
            return converter(input_text, punctuation)
        except Exception as e:
            self._last_error = f"Conversion failed: {e}"
            return self._last_error

    def st(self, input_text: str) -> str:
        """Character-level Simplified -> Traditional (no phrase lookup, no segmentation)."""
        dict_refs = [self.dictionary.st_characters]
        chars = list(input_text)  # converts str into list of chars
        return self.convert_by(chars, dict_refs, 1)

    def ts(self, input_text: str) -> str:
        """Character-level Traditional -> Simplified (no phrase lookup, no segmentation)."""
        dict_refs = [self.dictionary.ts_characters]
        chars = list(input_text)  # converts str into list of chars
        return self.convert_by(chars, dict_refs, 1)

    def zho_check(self, input_text: str) -> int:
        """Classify text: 1 if ``ts`` changes it, 2 if ``st`` changes it, else 0.

        Only a leading sample of the text, after removing ASCII characters,
        is examined.
        """
        if not input_text:
            return 0

        stripped = STRIP_REGEX.sub("", input_text)
        # NOTE(review): find_max_utf8_length returns a UTF-8 byte count but it
        # is used here as a character count, so the sample is capped at about
        # 200 characters rather than 200 bytes -- confirm this is intended.
        max_chars = find_max_utf8_length(stripped, 200)
        strip_text = stripped[:max_chars]

        if strip_text != self.ts(strip_text):
            return 1
        elif strip_text != self.st(strip_text):
            return 2
        else:
            return 0

    @staticmethod
    def convert_punctuation(input_text: str, config: str) -> str:
        """Swap quotation-mark styles.

        A ``config`` starting with ``"s"`` maps curly quotes to corner
        brackets; any other value maps corner brackets to curly quotes.
        """
        if config[:1] == 's':
            table = OpenCC._PUNCT_S2T
        else:
            table = OpenCC._PUNCT_T2S
        return input_text.translate(table)
352
+
353
+
354
def find_max_utf8_length(s: str, max_byte_count: int) -> int:
    """Return the largest UTF-8 byte length <= ``max_byte_count`` that ends on a character boundary.

    When the whole encoded string fits, its full byte length is returned.
    """
    data = s.encode('utf-8')
    total = len(data)
    if total <= max_byte_count:
        return total

    cut = max_byte_count
    # Continuation bytes look like 0b10xxxxxx; step back until the byte at
    # the cut position starts a new character (or the start is reached).
    while cut > 0:
        if (data[cut] & 0xC0) != 0x80:
            break
        cut -= 1
    return cut