romanization 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- romanization-1.0.0/LICENSE +21 -0
- romanization-1.0.0/PKG-INFO +47 -0
- romanization-1.0.0/README.md +28 -0
- romanization-1.0.0/pyproject.toml +6 -0
- romanization-1.0.0/romanization.egg-info/PKG-INFO +47 -0
- romanization-1.0.0/romanization.egg-info/SOURCES.txt +22 -0
- romanization-1.0.0/romanization.egg-info/dependency_links.txt +1 -0
- romanization-1.0.0/romanization.egg-info/requires.txt +1 -0
- romanization-1.0.0/romanization.egg-info/top_level.txt +1 -0
- romanization-1.0.0/setup.cfg +4 -0
- romanization-1.0.0/setup.py +46 -0
- romanization-1.0.0/src/romanization/__init__.py +1 -0
- romanization-1.0.0/src/romanization/const.py +57 -0
- romanization-1.0.0/src/romanization/convert/data/latin/JUNGSUNG +71 -0
- romanization-1.0.0/src/romanization/convert/data/provisions +14 -0
- romanization-1.0.0/src/romanization/convert/data/raw/CHOSUNG +95 -0
- romanization-1.0.0/src/romanization/convert/data/raw/JONGSUNG +88 -0
- romanization-1.0.0/src/romanization/convert/data/raw/JUNGSUNG +71 -0
- romanization-1.0.0/src/romanization/convert/output/latin/jungsung.json +1 -0
- romanization-1.0.0/src/romanization/convert/output/raw/chosung.json +1 -0
- romanization-1.0.0/src/romanization/convert/output/raw/jongsung.json +1 -0
- romanization-1.0.0/src/romanization/romanize.py +96 -0
- romanization-1.0.0/src/romanization/utils.py +264 -0
- romanization-1.0.0/tests/test_romanize.py +5 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2024 Joumaico Maulas
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: romanization
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: Revised Romanization of Korean
|
|
5
|
+
Home-page: https://github.com/joumaico/romanization
|
|
6
|
+
Author: Joumaico Maulas
|
|
7
|
+
Classifier: Development Status :: 5 - Production/Stable
|
|
8
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
9
|
+
Classifier: Operating System :: OS Independent
|
|
10
|
+
Classifier: Programming Language :: Python :: 3.7
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
15
|
+
Requires-Python: >=3.7
|
|
16
|
+
Description-Content-Type: text/markdown
|
|
17
|
+
License-File: LICENSE
|
|
18
|
+
Requires-Dist: numpy
|
|
19
|
+
|
|
20
|
+
# Revised Romanization of Korean
|
|
21
|
+
|
|
22
|
+
This software converts Korean Hangul text into the Latin alphabet, following the guidelines of the Revised Romanization of Korean, which ensures accurate and consistent transliteration. It's particularly useful for applications in language learning, text processing, and international communication, making it easier for non-Korean speakers to read and pronounce Korean words.
|
|
23
|
+
|
|
24
|
+
## Installation
|
|
25
|
+
|
|
26
|
+
```
|
|
27
|
+
Dependencies:
|
|
28
|
+
|
|
29
|
+
- Python 3.7, 3.8, 3.9, 3.11, 3.12
|
|
30
|
+
|
|
31
|
+
Installation:
|
|
32
|
+
|
|
33
|
+
$ pip install romanization
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
## Usage
|
|
37
|
+
|
|
38
|
+
```python
|
|
39
|
+
>>> from romanization import romanize
|
|
40
|
+
|
|
41
|
+
>>> romanize("좋아 첫 눈에 반해 버린")
|
|
42
|
+
"joha cheot nune banhae beorin"
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
## References
|
|
46
|
+
|
|
47
|
+
- https://en.wikipedia.org/w/index.php?title=Revised_Romanization_of_Korean&oldid=1064463473
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
# Revised Romanization of Korean
|
|
2
|
+
|
|
3
|
+
This software converts Korean Hangul text into the Latin alphabet, following the guidelines of the Revised Romanization of Korean, which ensures accurate and consistent transliteration. It's particularly useful for applications in language learning, text processing, and international communication, making it easier for non-Korean speakers to read and pronounce Korean words.
|
|
4
|
+
|
|
5
|
+
## Installation
|
|
6
|
+
|
|
7
|
+
```
|
|
8
|
+
Dependencies:
|
|
9
|
+
|
|
10
|
+
- Python 3.7, 3.8, 3.9, 3.11, 3.12
|
|
11
|
+
|
|
12
|
+
Installation:
|
|
13
|
+
|
|
14
|
+
$ pip install romanization
|
|
15
|
+
```
|
|
16
|
+
|
|
17
|
+
## Usage
|
|
18
|
+
|
|
19
|
+
```python
|
|
20
|
+
>>> from romanization import romanize
|
|
21
|
+
|
|
22
|
+
>>> romanize("좋아 첫 눈에 반해 버린")
|
|
23
|
+
"joha cheot nune banhae beorin"
|
|
24
|
+
```
|
|
25
|
+
|
|
26
|
+
## References
|
|
27
|
+
|
|
28
|
+
- https://en.wikipedia.org/w/index.php?title=Revised_Romanization_of_Korean&oldid=1064463473
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: romanization
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: Revised Romanization of Korean
|
|
5
|
+
Home-page: https://github.com/joumaico/romanization
|
|
6
|
+
Author: Joumaico Maulas
|
|
7
|
+
Classifier: Development Status :: 5 - Production/Stable
|
|
8
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
9
|
+
Classifier: Operating System :: OS Independent
|
|
10
|
+
Classifier: Programming Language :: Python :: 3.7
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
15
|
+
Requires-Python: >=3.7
|
|
16
|
+
Description-Content-Type: text/markdown
|
|
17
|
+
License-File: LICENSE
|
|
18
|
+
Requires-Dist: numpy
|
|
19
|
+
|
|
20
|
+
# Revised Romanization of Korean
|
|
21
|
+
|
|
22
|
+
This software converts Korean Hangul text into the Latin alphabet, following the guidelines of the Revised Romanization of Korean, which ensures accurate and consistent transliteration. It's particularly useful for applications in language learning, text processing, and international communication, making it easier for non-Korean speakers to read and pronounce Korean words.
|
|
23
|
+
|
|
24
|
+
## Installation
|
|
25
|
+
|
|
26
|
+
```
|
|
27
|
+
Dependencies:
|
|
28
|
+
|
|
29
|
+
- Python 3.7, 3.8, 3.9, 3.11, 3.12
|
|
30
|
+
|
|
31
|
+
Installation:
|
|
32
|
+
|
|
33
|
+
$ pip install romanization
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
## Usage
|
|
37
|
+
|
|
38
|
+
```python
|
|
39
|
+
>>> from romanization import romanize
|
|
40
|
+
|
|
41
|
+
>>> romanize("좋아 첫 눈에 반해 버린")
|
|
42
|
+
"joha cheot nune banhae beorin"
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
## References
|
|
46
|
+
|
|
47
|
+
- https://en.wikipedia.org/w/index.php?title=Revised_Romanization_of_Korean&oldid=1064463473
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
LICENSE
|
|
2
|
+
README.md
|
|
3
|
+
pyproject.toml
|
|
4
|
+
setup.py
|
|
5
|
+
romanization.egg-info/PKG-INFO
|
|
6
|
+
romanization.egg-info/SOURCES.txt
|
|
7
|
+
romanization.egg-info/dependency_links.txt
|
|
8
|
+
romanization.egg-info/requires.txt
|
|
9
|
+
romanization.egg-info/top_level.txt
|
|
10
|
+
src/romanization/__init__.py
|
|
11
|
+
src/romanization/const.py
|
|
12
|
+
src/romanization/romanize.py
|
|
13
|
+
src/romanization/utils.py
|
|
14
|
+
src/romanization/convert/data/provisions
|
|
15
|
+
src/romanization/convert/data/latin/JUNGSUNG
|
|
16
|
+
src/romanization/convert/data/raw/CHOSUNG
|
|
17
|
+
src/romanization/convert/data/raw/JONGSUNG
|
|
18
|
+
src/romanization/convert/data/raw/JUNGSUNG
|
|
19
|
+
src/romanization/convert/output/latin/jungsung.json
|
|
20
|
+
src/romanization/convert/output/raw/chosung.json
|
|
21
|
+
src/romanization/convert/output/raw/jongsung.json
|
|
22
|
+
tests/test_romanize.py
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
numpy
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
romanization
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
from setuptools import setup
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
def readme():
    """
    Return the text of README.md for use as the package long description.

    The path is relative, so the file is looked up in the current working
    directory (the project root when the build is run from there).

    The file is decoded explicitly as UTF-8: the README contains Hangul
    text, and relying on the platform's default encoding can fail or
    mis-decode on systems whose locale is not UTF-8.

    Returns
    -------
    str
        The full contents of README.md.

    Raises
    ------
    OSError
        If README.md cannot be opened.
    """
    with open("README.md", encoding="utf-8") as f:
        return f.read()
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
# Distribution metadata and build configuration for the "romanization" package.
setup(
    name="romanization",
    version="1.0.0",
    author="Joumaico Maulas",
    description="Revised Romanization of Korean",
    long_description=readme(),
    long_description_content_type="text/markdown",
    url="https://github.com/joumaico/romanization",
    classifiers=[
        "Development Status :: 5 - Production/Stable",
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
        "Programming Language :: Python :: 3.7",
        "Programming Language :: Python :: 3.8",
        "Programming Language :: Python :: 3.9",
        "Programming Language :: Python :: 3.11",
        "Programming Language :: Python :: 3.12",
    ],
    # The importable package lives under src/ rather than at the project root.
    packages=["romanization"],
    package_dir={"romanization": "src/romanization"},
    # Non-Python files that const.py and utils.py open at import time.
    package_data={
        "romanization": [
            "convert/data/latin/*",
            "convert/data/raw/*",
            "convert/data/provisions",
            "convert/output/latin/*",
            "convert/output/raw/*",
        ],
    },
    python_requires=">=3.7",
    install_requires=["numpy"],
)
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
from .romanize import romanize
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import pathlib
|
|
3
|
+
|
|
4
|
+
# Directory that holds this module; the bundled data files are resolved
# against it.
PATH = pathlib.Path(__file__).parent

# Latin spelling of each single consonant, keyed by jamo.
#   "CHOSUNG"  - spelling used when the consonant starts a syllable
#   "JONGSUNG" - spelling used when the consonant ends a syllable
# Key order matters: utils.Locator turns these keys into the column and row
# labels of the provisions table, so entries must not be reordered.
LATIN = {
    "CHOSUNG": {
        "": "",
        "ㅇ": "",
        "ㄱ": "g",
        "ㄴ": "n",
        "ㄷ": "d",
        "ㄹ": "r",
        "ㅁ": "m",
        "ㅂ": "b",
        "ㅅ": "s",
        "ㅈ": "j",
        "ㅊ": "ch",
        "ㅋ": "k",
        "ㅌ": "t",
        "ㅍ": "p",
        "ㅎ": "h",
    },
    "JONGSUNG": {
        "ㄱ": "k",
        "ㄴ": "n",
        "ㄷ": "t",
        "ㄹ": "l",
        "ㅁ": "m",
        "ㅂ": "p",
        "ㅅ": "t",
        "ㅇ": "ng",
        "ㅈ": "t",
        "ㅊ": "t",
        "ㅌ": "t",
        "ㅎ": "t",
        "ㅋ": "k",
        "ㅍ": "p",
    },
}

# Spelling emitted for a doubled ("twin") consonant, keyed by the single
# consonant that is repeated.
TWIN_CASE_PROVISION = {
    "ㅂ": "pp",
    "ㅈ": "jj",
    "ㄷ": "tt",
    "ㄱ": "kk",
    "ㅅ": "ss",
}

# Pre-generated lookup tables shipped with the package. Their keys are the
# repr of a conjoining jamo (the character wrapped in single quotes).

# Initial-consonant jamo -> list of the single consonants it is made of.
with open(PATH / "convert/output/raw/chosung.json") as f:
    CHOSUNG = json.load(f)

# Vowel jamo -> upper-case Latin spelling ("" where none is defined).
with open(PATH / "convert/output/latin/jungsung.json") as f:
    JUNGSUNG = json.load(f)

# Final-consonant jamo -> list of the single consonants it is made of.
with open(PATH / "convert/output/raw/jongsung.json") as f:
    JONGSUNG = json.load(f)
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
4449 'ᅡ' A
|
|
2
|
+
4450 'ᅢ' AE
|
|
3
|
+
4451 'ᅣ' YA
|
|
4
|
+
4452 'ᅤ' YAE
|
|
5
|
+
4453 'ᅥ' EO
|
|
6
|
+
4454 'ᅦ' E
|
|
7
|
+
4455 'ᅧ' YEO
|
|
8
|
+
4456 'ᅨ' YE
|
|
9
|
+
4457 'ᅩ' O
|
|
10
|
+
4458 'ᅪ' WA
|
|
11
|
+
4459 'ᅫ' WAE
|
|
12
|
+
4460 'ᅬ' OE
|
|
13
|
+
4461 'ᅭ' YO
|
|
14
|
+
4462 'ᅮ' U
|
|
15
|
+
4463 'ᅯ' WO
|
|
16
|
+
4464 'ᅰ' WE
|
|
17
|
+
4465 'ᅱ' WI
|
|
18
|
+
4466 'ᅲ' YU
|
|
19
|
+
4467 'ᅳ' EU
|
|
20
|
+
4468 'ᅴ' UI
|
|
21
|
+
4469 'ᅵ' I
|
|
22
|
+
4470 'ᅶ'
|
|
23
|
+
4471 'ᅷ'
|
|
24
|
+
4472 'ᅸ'
|
|
25
|
+
4473 'ᅹ'
|
|
26
|
+
4474 'ᅺ'
|
|
27
|
+
4475 'ᅻ'
|
|
28
|
+
4476 'ᅼ'
|
|
29
|
+
4477 'ᅽ'
|
|
30
|
+
4478 'ᅾ'
|
|
31
|
+
4479 'ᅿ'
|
|
32
|
+
4480 'ᆀ'
|
|
33
|
+
4481 'ᆁ'
|
|
34
|
+
4482 'ᆂ'
|
|
35
|
+
4483 'ᆃ'
|
|
36
|
+
4484 'ᆄ'
|
|
37
|
+
4485 'ᆅ'
|
|
38
|
+
4486 'ᆆ'
|
|
39
|
+
4487 'ᆇ'
|
|
40
|
+
4488 'ᆈ'
|
|
41
|
+
4489 'ᆉ'
|
|
42
|
+
4490 'ᆊ'
|
|
43
|
+
4491 'ᆋ'
|
|
44
|
+
4492 'ᆌ'
|
|
45
|
+
4493 'ᆍ'
|
|
46
|
+
4494 'ᆎ'
|
|
47
|
+
4495 'ᆏ'
|
|
48
|
+
4496 'ᆐ'
|
|
49
|
+
4497 'ᆑ'
|
|
50
|
+
4498 'ᆒ'
|
|
51
|
+
4499 'ᆓ'
|
|
52
|
+
4500 'ᆔ'
|
|
53
|
+
4501 'ᆕ'
|
|
54
|
+
4502 'ᆖ'
|
|
55
|
+
4503 'ᆗ'
|
|
56
|
+
4504 'ᆘ'
|
|
57
|
+
4505 'ᆙ'
|
|
58
|
+
4506 'ᆚ'
|
|
59
|
+
4507 'ᆛ'
|
|
60
|
+
4508 'ᆜ'
|
|
61
|
+
4509 'ᆝ'
|
|
62
|
+
4510 'ᆞ'
|
|
63
|
+
4511 'ᆟ'
|
|
64
|
+
4512 'ᆠ'
|
|
65
|
+
4513 'ᆡ'
|
|
66
|
+
4514 'ᆢ'
|
|
67
|
+
4515 'ᆣ'
|
|
68
|
+
4516 'ᆤ'
|
|
69
|
+
4517 'ᆥ'
|
|
70
|
+
4518 'ᆦ'
|
|
71
|
+
4519 'ᆧ'
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
k g kg ngn kd ngn ngm kb ks kj kch kk kt kp kh
|
|
2
|
+
n n ng nn nd ll nm nb ns nj nch nk nt np nh
|
|
3
|
+
t d tg nn td nn nm tb ts tj tch tk tt tp th
|
|
4
|
+
l r lg ll ld ll lm lb ls lj lch lk lt lp lh
|
|
5
|
+
m m mg mn md mn mm mb ms mj mch mk mt mp mh
|
|
6
|
+
p b pg mn pd mn mm pb ps pj pch pk pt pp ph
|
|
7
|
+
t s tg nn td nn nm tb ts tj tch tk tt tp th
|
|
8
|
+
ng ng ngg ngn ngd ngn ngm ngb ngs ngj ngch ngk ngt ngp ngh
|
|
9
|
+
t j tg nn td nn nm tb ts tj tch tk tt tp th
|
|
10
|
+
t ch tg nn td nn nm tb ts tj tch tk tt tp th
|
|
11
|
+
t t tg nn td nn nm tb ts tj tch tk tt tp th
|
|
12
|
+
t h k nn t nn nm p hs ch tch tk t tp t
|
|
13
|
+
k k kg ngn kd ngn ngm kb ks kj kch kk kt kp kh
|
|
14
|
+
p p pg mn pd mn mm pb ps pj pch pk pt pp ph
|
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
4352 'ᄀ' ㄱ
|
|
2
|
+
4353 'ᄁ' ㄱㄱ
|
|
3
|
+
4354 'ᄂ' ㄴ
|
|
4
|
+
4355 'ᄃ' ㄷ
|
|
5
|
+
4356 'ᄄ' ㄷㄷ
|
|
6
|
+
4357 'ᄅ' ㄹ
|
|
7
|
+
4358 'ᄆ' ㅁ
|
|
8
|
+
4359 'ᄇ' ㅂ
|
|
9
|
+
4360 'ᄈ' ㅂㅂ
|
|
10
|
+
4361 'ᄉ' ㅅ
|
|
11
|
+
4362 'ᄊ' ㅅㅅ
|
|
12
|
+
4363 'ᄋ' ㅇ
|
|
13
|
+
4364 'ᄌ' ㅈ
|
|
14
|
+
4365 'ᄍ' ㅈㅈ
|
|
15
|
+
4366 'ᄎ' ㅊ
|
|
16
|
+
4367 'ᄏ' ㅋ
|
|
17
|
+
4368 'ᄐ' ㅌ
|
|
18
|
+
4369 'ᄑ' ㅍ
|
|
19
|
+
4370 'ᄒ' ㅎ
|
|
20
|
+
4371 'ᄓ' ㄴㄱ
|
|
21
|
+
4372 'ᄔ' ㄴㄴ
|
|
22
|
+
4373 'ᄕ' ㄴㄷ
|
|
23
|
+
4374 'ᄖ' ㄴㅂ
|
|
24
|
+
4375 'ᄗ' ㄷㄱ
|
|
25
|
+
4376 'ᄘ' ㄹㄴ
|
|
26
|
+
4377 'ᄙ' ㄹㄹ
|
|
27
|
+
4378 'ᄚ' ㄹㅎ
|
|
28
|
+
4379 'ᄛ'
|
|
29
|
+
4380 'ᄜ' ㅁㅂ
|
|
30
|
+
4381 'ᄝ'
|
|
31
|
+
4382 'ᄞ' ㅂㄱ
|
|
32
|
+
4383 'ᄟ' ㅂㄴ
|
|
33
|
+
4384 'ᄠ' ㅂㄷ
|
|
34
|
+
4385 'ᄡ' ㅂㅅ
|
|
35
|
+
4386 'ᄢ' ㅂㅅㄱ
|
|
36
|
+
4387 'ᄣ' ㅂㅅㄷ
|
|
37
|
+
4388 'ᄤ' ㅂㅅㅂ
|
|
38
|
+
4389 'ᄥ' ㅂㅅㅅ
|
|
39
|
+
4390 'ᄦ' ㅂㅅㅈ
|
|
40
|
+
4391 'ᄧ' ㅂㅈ
|
|
41
|
+
4392 'ᄨ' ㅂㅊ
|
|
42
|
+
4393 'ᄩ' ㅂㅌ
|
|
43
|
+
4394 'ᄪ' ㅂㅍ
|
|
44
|
+
4395 'ᄫ'
|
|
45
|
+
4396 'ᄬ'
|
|
46
|
+
4397 'ᄭ' ㅅㄱ
|
|
47
|
+
4398 'ᄮ' ㅅㄴ
|
|
48
|
+
4399 'ᄯ' ㅅㄷ
|
|
49
|
+
4400 'ᄰ' ㅅㄹ
|
|
50
|
+
4401 'ᄱ' ㅅㅁ
|
|
51
|
+
4402 'ᄲ' ㅅㅂ
|
|
52
|
+
4403 'ᄳ' ㅅㅂㄱ
|
|
53
|
+
4404 'ᄴ' ㅅㅅㅅ
|
|
54
|
+
4405 'ᄵ'
|
|
55
|
+
4406 'ᄶ' ㅅㅈ
|
|
56
|
+
4407 'ᄷ' ㅅㅊ
|
|
57
|
+
4408 'ᄸ' ㅅㅋ
|
|
58
|
+
4409 'ᄹ' ㅅㅌ
|
|
59
|
+
4410 'ᄺ' ㅅㅍ
|
|
60
|
+
4411 'ᄻ' ㅅㅎ
|
|
61
|
+
4412 'ᄼ'
|
|
62
|
+
4413 'ᄽ'
|
|
63
|
+
4414 'ᄾ'
|
|
64
|
+
4415 'ᄿ'
|
|
65
|
+
4416 'ᅀ'
|
|
66
|
+
4417 'ᅁ'
|
|
67
|
+
4418 'ᅂ'
|
|
68
|
+
4419 'ᅃ'
|
|
69
|
+
4420 'ᅄ'
|
|
70
|
+
4421 'ᅅ'
|
|
71
|
+
4422 'ᅆ'
|
|
72
|
+
4423 'ᅇ'
|
|
73
|
+
4424 'ᅈ'
|
|
74
|
+
4425 'ᅉ'
|
|
75
|
+
4426 'ᅊ'
|
|
76
|
+
4427 'ᅋ'
|
|
77
|
+
4428 'ᅌ'
|
|
78
|
+
4429 'ᅍ'
|
|
79
|
+
4430 'ᅎ'
|
|
80
|
+
4431 'ᅏ'
|
|
81
|
+
4432 'ᅐ'
|
|
82
|
+
4433 'ᅑ'
|
|
83
|
+
4434 'ᅒ' ㅊㅋ
|
|
84
|
+
4435 'ᅓ' ㅊㅎ
|
|
85
|
+
4436 'ᅔ'
|
|
86
|
+
4437 'ᅕ'
|
|
87
|
+
4438 'ᅖ' ㅍㅂ
|
|
88
|
+
4439 'ᅗ'
|
|
89
|
+
4440 'ᅘ' ㅎㅎ
|
|
90
|
+
4441 'ᅙ'
|
|
91
|
+
4442 'ᅚ' ㄱㄷ
|
|
92
|
+
4443 'ᅛ' ㄴㅅ
|
|
93
|
+
4444 'ᅜ' ㄴㅈ
|
|
94
|
+
4445 'ᅝ' ㄴㅎ
|
|
95
|
+
4446 'ᅞ' ㄷㄹ
|
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
4520 'ᆨ' ㄱ
|
|
2
|
+
4521 'ᆩ' ㄱㄱ
|
|
3
|
+
4522 'ᆪ' ㄱㅅ
|
|
4
|
+
4523 'ᆫ' ㄴ
|
|
5
|
+
4524 'ᆬ' ㄴㅈ
|
|
6
|
+
4525 'ᆭ' ㄴㅎ
|
|
7
|
+
4526 'ᆮ' ㄷ
|
|
8
|
+
4527 'ᆯ' ㄹ
|
|
9
|
+
4528 'ᆰ' ㄹㄱ
|
|
10
|
+
4529 'ᆱ' ㄹㅁ
|
|
11
|
+
4530 'ᆲ' ㄹㅂ
|
|
12
|
+
4531 'ᆳ' ㄹㅅ
|
|
13
|
+
4532 'ᆴ' ㄹㅌ
|
|
14
|
+
4533 'ᆵ' ㄹㅍ
|
|
15
|
+
4534 'ᆶ' ㄹㅎ
|
|
16
|
+
4535 'ᆷ' ㅁ
|
|
17
|
+
4536 'ᆸ' ㅂ
|
|
18
|
+
4537 'ᆹ' ㅂㅅ
|
|
19
|
+
4538 'ᆺ' ㅅ
|
|
20
|
+
4539 'ᆻ' ㅅㅅ
|
|
21
|
+
4540 'ᆼ' ㅇ
|
|
22
|
+
4541 'ᆽ' ㅈ
|
|
23
|
+
4542 'ᆾ' ㅊ
|
|
24
|
+
4543 'ᆿ' ㅋ
|
|
25
|
+
4544 'ᇀ' ㅌ
|
|
26
|
+
4545 'ᇁ' ㅍ
|
|
27
|
+
4546 'ᇂ' ㅎ
|
|
28
|
+
4547 'ᇃ' ㄱㄹ
|
|
29
|
+
4548 'ᇄ' ㄱㅅㄱ
|
|
30
|
+
4549 'ᇅ' ㄴㄱ
|
|
31
|
+
4550 'ᇆ' ㄴㄷ
|
|
32
|
+
4551 'ᇇ' ㄴㅅ
|
|
33
|
+
4552 'ᇈ'
|
|
34
|
+
4553 'ᇉ' ㄴㅌ
|
|
35
|
+
4554 'ᇊ' ㄷㄱ
|
|
36
|
+
4555 'ᇋ' ㄷㄹ
|
|
37
|
+
4556 'ᇌ' ㄹㄱㅅ
|
|
38
|
+
4557 'ᇍ' ㄹㄴ
|
|
39
|
+
4558 'ᇎ' ㄹㄷ
|
|
40
|
+
4559 'ᇏ' ㄹㄷㅎ
|
|
41
|
+
4560 'ᇐ' ㄹㄹ
|
|
42
|
+
4561 'ᇑ' ㄹㅁㄱ
|
|
43
|
+
4562 'ᇒ' ㄹㅁㅅ
|
|
44
|
+
4563 'ᇓ' ㄹㅂㅅ
|
|
45
|
+
4564 'ᇔ' ㄹㅂㅎ
|
|
46
|
+
4565 'ᇕ'
|
|
47
|
+
4566 'ᇖ' ㄹㅅㅅ
|
|
48
|
+
4567 'ᇗ'
|
|
49
|
+
4568 'ᇘ' ㄹㅋ
|
|
50
|
+
4569 'ᇙ' ㄹㅎ
|
|
51
|
+
4570 'ᇚ' ㅁㄱ
|
|
52
|
+
4571 'ᇛ' ㅁㄹ
|
|
53
|
+
4572 'ᇜ' ㅁㅂ
|
|
54
|
+
4573 'ᇝ' ㅁㅅ
|
|
55
|
+
4574 'ᇞ' ㅁㅅㅅ
|
|
56
|
+
4575 'ᇟ'
|
|
57
|
+
4576 'ᇠ' ㅁㅊ
|
|
58
|
+
4577 'ᇡ' ㅁㅎ
|
|
59
|
+
4578 'ᇢ'
|
|
60
|
+
4579 'ᇣ' ㅂㄹ
|
|
61
|
+
4580 'ᇤ' ㅂㅍ
|
|
62
|
+
4581 'ᇥ' ㅂㅎ
|
|
63
|
+
4582 'ᇦ'
|
|
64
|
+
4583 'ᇧ' ㅅㄱ
|
|
65
|
+
4584 'ᇨ' ㅅㄷ
|
|
66
|
+
4585 'ᇩ' ㅅㄹ
|
|
67
|
+
4586 'ᇪ' ㅅㅂ
|
|
68
|
+
4587 'ᇫ'
|
|
69
|
+
4588 'ᇬ'
|
|
70
|
+
4589 'ᇭ'
|
|
71
|
+
4590 'ᇮ'
|
|
72
|
+
4591 'ᇯ'
|
|
73
|
+
4592 'ᇰ'
|
|
74
|
+
4593 'ᇱ'
|
|
75
|
+
4594 'ᇲ'
|
|
76
|
+
4595 'ᇳ' ㅍㅂ
|
|
77
|
+
4596 'ᇴ'
|
|
78
|
+
4597 'ᇵ' ㅎㄴ
|
|
79
|
+
4598 'ᇶ' ㅎㄹ
|
|
80
|
+
4599 'ᇷ' ㅎㅁ
|
|
81
|
+
4600 'ᇸ' ㅎㅂ
|
|
82
|
+
4601 'ᇹ'
|
|
83
|
+
4602 'ᇺ' ㄱㄴ
|
|
84
|
+
4603 'ᇻ' ㄱㅂ
|
|
85
|
+
4604 'ᇼ' ㄱㅊ
|
|
86
|
+
4605 'ᇽ' ㄱㅋ
|
|
87
|
+
4606 'ᇾ' ㄱㅎ
|
|
88
|
+
4607 'ᇿ' ㄴㄴ
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
4449 'ᅡ'
|
|
2
|
+
4450 'ᅢ'
|
|
3
|
+
4451 'ᅣ'
|
|
4
|
+
4452 'ᅤ'
|
|
5
|
+
4453 'ᅥ'
|
|
6
|
+
4454 'ᅦ'
|
|
7
|
+
4455 'ᅧ'
|
|
8
|
+
4456 'ᅨ'
|
|
9
|
+
4457 'ᅩ'
|
|
10
|
+
4458 'ᅪ'
|
|
11
|
+
4459 'ᅫ'
|
|
12
|
+
4460 'ᅬ'
|
|
13
|
+
4461 'ᅭ'
|
|
14
|
+
4462 'ᅮ'
|
|
15
|
+
4463 'ᅯ'
|
|
16
|
+
4464 'ᅰ'
|
|
17
|
+
4465 'ᅱ'
|
|
18
|
+
4466 'ᅲ'
|
|
19
|
+
4467 'ᅳ'
|
|
20
|
+
4468 'ᅴ'
|
|
21
|
+
4469 'ᅵ'
|
|
22
|
+
4470 'ᅶ'
|
|
23
|
+
4471 'ᅷ'
|
|
24
|
+
4472 'ᅸ'
|
|
25
|
+
4473 'ᅹ'
|
|
26
|
+
4474 'ᅺ'
|
|
27
|
+
4475 'ᅻ'
|
|
28
|
+
4476 'ᅼ'
|
|
29
|
+
4477 'ᅽ'
|
|
30
|
+
4478 'ᅾ'
|
|
31
|
+
4479 'ᅿ'
|
|
32
|
+
4480 'ᆀ'
|
|
33
|
+
4481 'ᆁ'
|
|
34
|
+
4482 'ᆂ'
|
|
35
|
+
4483 'ᆃ'
|
|
36
|
+
4484 'ᆄ'
|
|
37
|
+
4485 'ᆅ'
|
|
38
|
+
4486 'ᆆ'
|
|
39
|
+
4487 'ᆇ'
|
|
40
|
+
4488 'ᆈ'
|
|
41
|
+
4489 'ᆉ'
|
|
42
|
+
4490 'ᆊ'
|
|
43
|
+
4491 'ᆋ'
|
|
44
|
+
4492 'ᆌ'
|
|
45
|
+
4493 'ᆍ'
|
|
46
|
+
4494 'ᆎ'
|
|
47
|
+
4495 'ᆏ'
|
|
48
|
+
4496 'ᆐ'
|
|
49
|
+
4497 'ᆑ'
|
|
50
|
+
4498 'ᆒ'
|
|
51
|
+
4499 'ᆓ'
|
|
52
|
+
4500 'ᆔ'
|
|
53
|
+
4501 'ᆕ'
|
|
54
|
+
4502 'ᆖ'
|
|
55
|
+
4503 'ᆗ'
|
|
56
|
+
4504 'ᆘ'
|
|
57
|
+
4505 'ᆙ'
|
|
58
|
+
4506 'ᆚ'
|
|
59
|
+
4507 'ᆛ'
|
|
60
|
+
4508 'ᆜ'
|
|
61
|
+
4509 'ᆝ'
|
|
62
|
+
4510 'ᆞ'
|
|
63
|
+
4511 'ᆟ'
|
|
64
|
+
4512 'ᆠ'
|
|
65
|
+
4513 'ᆡ'
|
|
66
|
+
4514 'ᆢ'
|
|
67
|
+
4515 'ᆣ'
|
|
68
|
+
4516 'ᆤ'
|
|
69
|
+
4517 'ᆥ'
|
|
70
|
+
4518 'ᆦ'
|
|
71
|
+
4519 'ᆧ'
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"'\u1161'": "A", "'\u1162'": "AE", "'\u1163'": "YA", "'\u1164'": "YAE", "'\u1165'": "EO", "'\u1166'": "E", "'\u1167'": "YEO", "'\u1168'": "YE", "'\u1169'": "O", "'\u116a'": "WA", "'\u116b'": "WAE", "'\u116c'": "OE", "'\u116d'": "YO", "'\u116e'": "U", "'\u116f'": "WO", "'\u1170'": "WE", "'\u1171'": "WI", "'\u1172'": "YU", "'\u1173'": "EU", "'\u1174'": "UI", "'\u1175'": "I", "'\u1176'": "", "'\u1177'": "", "'\u1178'": "", "'\u1179'": "", "'\u117a'": "", "'\u117b'": "", "'\u117c'": "", "'\u117d'": "", "'\u117e'": "", "'\u117f'": "", "'\u1180'": "", "'\u1181'": "", "'\u1182'": "", "'\u1183'": "", "'\u1184'": "", "'\u1185'": "", "'\u1186'": "", "'\u1187'": "", "'\u1188'": "", "'\u1189'": "", "'\u118a'": "", "'\u118b'": "", "'\u118c'": "", "'\u118d'": "", "'\u118e'": "", "'\u118f'": "", "'\u1190'": "", "'\u1191'": "", "'\u1192'": "", "'\u1193'": "", "'\u1194'": "", "'\u1195'": "", "'\u1196'": "", "'\u1197'": "", "'\u1198'": "", "'\u1199'": "", "'\u119a'": "", "'\u119b'": "", "'\u119c'": "", "'\u119d'": "", "'\u119e'": "", "'\u119f'": "", "'\u11a0'": "", "'\u11a1'": "", "'\u11a2'": "", "'\u11a3'": "", "'\u11a4'": "", "'\u11a5'": "", "'\u11a6'": "", "'\u11a7'": ""}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"'\u1100'": ["\u3131"], "'\u1101'": ["\u3131", "\u3131"], "'\u1102'": ["\u3134"], "'\u1103'": ["\u3137"], "'\u1104'": ["\u3137", "\u3137"], "'\u1105'": ["\u3139"], "'\u1106'": ["\u3141"], "'\u1107'": ["\u3142"], "'\u1108'": ["\u3142", "\u3142"], "'\u1109'": ["\u3145"], "'\u110a'": ["\u3145", "\u3145"], "'\u110b'": ["\u3147"], "'\u110c'": ["\u3148"], "'\u110d'": ["\u3148", "\u3148"], "'\u110e'": ["\u314a"], "'\u110f'": ["\u314b"], "'\u1110'": ["\u314c"], "'\u1111'": ["\u314d"], "'\u1112'": ["\u314e"], "'\u1113'": ["\u3134", "\u3131"], "'\u1114'": ["\u3134", "\u3134"], "'\u1115'": ["\u3134", "\u3137"], "'\u1116'": ["\u3134", "\u3142"], "'\u1117'": ["\u3137", "\u3131"], "'\u1118'": ["\u3139", "\u3134"], "'\u1119'": ["\u3139", "\u3139"], "'\u111a'": ["\u3139", "\u314e"], "'\u111b'": [], "'\u111c'": ["\u3141", "\u3142"], "'\u111d'": [], "'\u111e'": ["\u3142", "\u3131"], "'\u111f'": ["\u3142", "\u3134"], "'\u1120'": ["\u3142", "\u3137"], "'\u1121'": ["\u3142", "\u3145"], "'\u1122'": ["\u3142", "\u3145", "\u3131"], "'\u1123'": ["\u3142", "\u3145", "\u3137"], "'\u1124'": ["\u3142", "\u3145", "\u3142"], "'\u1125'": ["\u3142", "\u3145", "\u3145"], "'\u1126'": ["\u3142", "\u3145", "\u3148"], "'\u1127'": ["\u3142", "\u3148"], "'\u1128'": ["\u3142", "\u314a"], "'\u1129'": ["\u3142", "\u314c"], "'\u112a'": ["\u3142", "\u314d"], "'\u112b'": [], "'\u112c'": [], "'\u112d'": ["\u3145", "\u3131"], "'\u112e'": ["\u3145", "\u3134"], "'\u112f'": ["\u3145", "\u3137"], "'\u1130'": ["\u3145", "\u3139"], "'\u1131'": ["\u3145", "\u3141"], "'\u1132'": ["\u3145", "\u3142"], "'\u1133'": ["\u3145", "\u3142", "\u3131"], "'\u1134'": ["\u3145", "\u3145", "\u3145"], "'\u1135'": [], "'\u1136'": ["\u3145", "\u3148"], "'\u1137'": ["\u3145", "\u314a"], "'\u1138'": ["\u3145", "\u314b"], "'\u1139'": ["\u3145", "\u314c"], "'\u113a'": ["\u3145", "\u314d"], "'\u113b'": ["\u3145", "\u314e"], "'\u113c'": [], "'\u113d'": [], "'\u113e'": [], "'\u113f'": [], "'\u1140'": [], "'\u1141'": [], "'\u1142'": [], 
"'\u1143'": [], "'\u1144'": [], "'\u1145'": [], "'\u1146'": [], "'\u1147'": [], "'\u1148'": [], "'\u1149'": [], "'\u114a'": [], "'\u114b'": [], "'\u114c'": [], "'\u114d'": [], "'\u114e'": [], "'\u114f'": [], "'\u1150'": [], "'\u1151'": [], "'\u1152'": ["\u314a", "\u314b"], "'\u1153'": ["\u314a", "\u314e"], "'\u1154'": [], "'\u1155'": [], "'\u1156'": ["\u314d", "\u3142"], "'\u1157'": [], "'\u1158'": ["\u314e", "\u314e"], "'\u1159'": [], "'\u115a'": ["\u3131", "\u3137"], "'\u115b'": ["\u3134", "\u3145"], "'\u115c'": ["\u3134", "\u3148"], "'\u115d'": ["\u3134", "\u314e"], "'\u115e'": ["\u3137", "\u3139"]}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"'\u11a8'": ["\u3131"], "'\u11a9'": ["\u3131", "\u3131"], "'\u11aa'": ["\u3131", "\u3145"], "'\u11ab'": ["\u3134"], "'\u11ac'": ["\u3134", "\u3148"], "'\u11ad'": ["\u3134", "\u314e"], "'\u11ae'": ["\u3137"], "'\u11af'": ["\u3139"], "'\u11b0'": ["\u3139", "\u3131"], "'\u11b1'": ["\u3139", "\u3141"], "'\u11b2'": ["\u3139", "\u3142"], "'\u11b3'": ["\u3139", "\u3145"], "'\u11b4'": ["\u3139", "\u314c"], "'\u11b5'": ["\u3139", "\u314d"], "'\u11b6'": ["\u3139", "\u314e"], "'\u11b7'": ["\u3141"], "'\u11b8'": ["\u3142"], "'\u11b9'": ["\u3142", "\u3145"], "'\u11ba'": ["\u3145"], "'\u11bb'": ["\u3145", "\u3145"], "'\u11bc'": ["\u3147"], "'\u11bd'": ["\u3148"], "'\u11be'": ["\u314a"], "'\u11bf'": ["\u314b"], "'\u11c0'": ["\u314c"], "'\u11c1'": ["\u314d"], "'\u11c2'": ["\u314e"], "'\u11c3'": ["\u3131", "\u3139"], "'\u11c4'": ["\u3131", "\u3145", "\u3131"], "'\u11c5'": ["\u3134", "\u3131"], "'\u11c6'": ["\u3134", "\u3137"], "'\u11c7'": ["\u3134", "\u3145"], "'\u11c8'": [], "'\u11c9'": ["\u3134", "\u314c"], "'\u11ca'": ["\u3137", "\u3131"], "'\u11cb'": ["\u3137", "\u3139"], "'\u11cc'": ["\u3139", "\u3131", "\u3145"], "'\u11cd'": ["\u3139", "\u3134"], "'\u11ce'": ["\u3139", "\u3137"], "'\u11cf'": ["\u3139", "\u3137", "\u314e"], "'\u11d0'": ["\u3139", "\u3139"], "'\u11d1'": ["\u3139", "\u3141", "\u3131"], "'\u11d2'": ["\u3139", "\u3141", "\u3145"], "'\u11d3'": ["\u3139", "\u3142", "\u3145"], "'\u11d4'": ["\u3139", "\u3142", "\u314e"], "'\u11d5'": [], "'\u11d6'": ["\u3139", "\u3145", "\u3145"], "'\u11d7'": [], "'\u11d8'": ["\u3139", "\u314b"], "'\u11d9'": ["\u3139", "\u314e"], "'\u11da'": ["\u3141", "\u3131"], "'\u11db'": ["\u3141", "\u3139"], "'\u11dc'": ["\u3141", "\u3142"], "'\u11dd'": ["\u3141", "\u3145"], "'\u11de'": ["\u3141", "\u3145", "\u3145"], "'\u11df'": [], "'\u11e0'": ["\u3141", "\u314a"], "'\u11e1'": ["\u3141", "\u314e"], "'\u11e2'": [], "'\u11e3'": ["\u3142", "\u3139"], "'\u11e4'": ["\u3142", "\u314d"], "'\u11e5'": ["\u3142", "\u314e"], "'\u11e6'": [], "'\u11e7'": 
["\u3145", "\u3131"], "'\u11e8'": ["\u3145", "\u3137"], "'\u11e9'": ["\u3145", "\u3139"], "'\u11ea'": ["\u3145", "\u3142"], "'\u11eb'": [], "'\u11ec'": [], "'\u11ed'": [], "'\u11ee'": [], "'\u11ef'": [], "'\u11f0'": [], "'\u11f1'": [], "'\u11f2'": [], "'\u11f3'": ["\u314d", "\u3142"], "'\u11f4'": [], "'\u11f5'": ["\u314e", "\u3134"], "'\u11f6'": ["\u314e", "\u3139"], "'\u11f7'": ["\u314e", "\u3141"], "'\u11f8'": ["\u314e", "\u3142"], "'\u11f9'": [], "'\u11fa'": ["\u3131", "\u3134"], "'\u11fb'": ["\u3131", "\u3142"], "'\u11fc'": ["\u3131", "\u314a"], "'\u11fd'": ["\u3131", "\u314b"], "'\u11fe'": ["\u3131", "\u314e"], "'\u11ff'": ["\u3134", "\u3134"]}
|
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
import unicodedata
|
|
3
|
+
|
|
4
|
+
from .const import LATIN
|
|
5
|
+
from .const import TWIN_CASE_PROVISION
|
|
6
|
+
|
|
7
|
+
from .utils import Locator
|
|
8
|
+
from .utils import custom_split
|
|
9
|
+
from .utils import get_chosung
|
|
10
|
+
from .utils import get_jongsung
|
|
11
|
+
from .utils import get_jungsung
|
|
12
|
+
from .utils import split_into_chunks
|
|
13
|
+
from .utils import split_jamo
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
locator = Locator()


def romanize(text: str) -> str:
    """
    Convert Korean Hangul text to Latin letters (Revised Romanization of Korean).

    The text is tokenised with ``custom_split``. Tokens that contain no
    character of Unicode category "Lo" are copied to the output unchanged;
    every other token is broken into jamo and spelled out block by block.

    Parameters
    ----------
    text : str
        The input string containing Korean Hangul text to be romanized.

    Returns
    -------
    str
        The romanized string.

    Examples
    --------
    >>> romanize("좋아 첫 눈에 반해 버린")
    "joha cheot nune banhae beorin"

    References
    ----------
    https://en.wikipedia.org/w/index.php?title=Revised_Romanization_of_Korean&oldid=1064463473
    """
    pieces = []

    for word in custom_split(text):
        # Nothing to romanize in this token (whitespace, ASCII, digits, ...).
        if not any(unicodedata.category(ch) == "Lo" for ch in word):
            pieces.append(word)
            continue

        dump = []
        jamo_stream = (j for i in split_jamo(word) for j in i)
        for index, block in enumerate(split_into_chunks(jamo_stream, 3)):
            if len(block) == 1:  # for standalone syllable "책"
                block = (block[0], "", "")

            # Each block is read as (final of the previous syllable,
            # initial of this syllable, vowel of this syllable).
            chosung = get_chosung(block[1])
            jungsung = get_jungsung(block[2])
            jongsung = get_jongsung(block[0])

            if not chosung:
                # No following initial: the final keeps its plain spelling.
                if jongsung:
                    dump.append(LATIN["JONGSUNG"][jongsung[0]])
            elif len(chosung) > 1 and len(set(chosung)) == 1:
                # Twin initial consonant.
                if jongsung:  # "올까"
                    for final in jongsung:
                        dump.append(LATIN["JONGSUNG"][final])
                if ord(chosung[0]) == 12593 or index != 0:  # ㄲ: "깐다" / "오빠"
                    dump.append(TWIN_CASE_PROVISION[chosung[0]])
                else:  # "뚜두"
                    dump.append(LATIN["CHOSUNG"][chosung[0]] * 2)
            elif not jongsung:
                dump.append(LATIN["CHOSUNG"][chosung[0]])
            elif len(jongsung) > 1 and len(set(jongsung)) == 1 and ord(chosung[0]) == 12615:  # ㅇ: "있을까"
                dump.append(TWIN_CASE_PROVISION[jongsung[0]])
            else:
                if len(jongsung) > 1 and len(set(jongsung)) >= 2:  # "없어요"
                    for final in jongsung[:-1]:
                        dump.append(LATIN["JONGSUNG"][final])
                # The last final and the following initial are spelled
                # together via the provisions table.
                col_index = np.where(locator.COL_LABELS == chosung[0])[0][0]
                row_index = np.where(locator.ROW_LABELS == jongsung[-1])[0][0]
                dump.append(locator.TABLE[row_index, col_index])

            # Vowel, or the raw character when no vowel spelling is known.
            if jungsung:
                dump.append(jungsung.lower())
            else:
                dump.append(block[2])

        pieces.append("".join(dump))

    return "".join(pieces)
|
|
@@ -0,0 +1,264 @@
|
|
|
1
|
+
import itertools
|
|
2
|
+
import numpy as np
|
|
3
|
+
import re
|
|
4
|
+
import typing as t
|
|
5
|
+
|
|
6
|
+
from .const import CHOSUNG
|
|
7
|
+
from .const import JONGSUNG
|
|
8
|
+
from .const import JUNGSUNG
|
|
9
|
+
from .const import LATIN
|
|
10
|
+
from .const import PATH
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class Locator:
    """
    Holder for the final/initial consonant provisions table and its labels.

    Everything is loaded once, when the class body is executed.

    Attributes
    ----------
    TABLE : np.ndarray
        The contents of "convert/data/provisions": one row per line of the
        file, each line split on single spaces.
    COL_LABELS : np.ndarray
        The keys of ``LATIN["CHOSUNG"]``, in dictionary order.
    ROW_LABELS : np.ndarray
        The keys of ``LATIN["JONGSUNG"]``, in dictionary order.

    Methods
    -------
    None
    """

    with open(PATH / "convert/data/provisions", "r") as f:
        TABLE = np.array([line.strip().split(" ") for line in f])

    COL_LABELS = np.array(list(LATIN["CHOSUNG"]))
    ROW_LABELS = np.array(list(LATIN["JONGSUNG"]))
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def custom_split(text: str) -> t.List[str]:
    r"""
    Split text into non-whitespace runs and individual whitespace characters.

    Every whitespace character becomes its own list item, so consecutive
    whitespace is never merged and ``"".join(custom_split(text)) == text``.

    Parameters
    ----------
    text : str
        The input text to be split.

    Returns
    -------
    List[str]
        The substrings in their original order; each one is either a maximal
        run of non-whitespace characters or a single whitespace character.
        An empty input gives an empty list.

    Examples
    --------
    >>> custom_split("Hello   world")
    ['Hello', ' ', ' ', ' ', 'world']

    >>> custom_split("a\nb\tc")
    ['a', '\n', 'b', '\t', 'c']
    """
    # "\s" matches exactly one whitespace character and "\S+" a whole run of
    # anything else, so no post-processing of whitespace runs is needed.
    return re.findall(r"\s|\S+", text)
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def decompose_hangul(char: str) -> t.Tuple[str, str, str]:
    """
    Decompose a precomposed Hangul syllable into its conjoining Jamo.

    Characters outside the Hangul Syllables block (U+AC00..U+D7A3) are not
    rejected; they are passed through unchanged in the middle slot of the
    returned tuple.

    Parameters
    ----------
    char : str
        A single character.

    Returns
    -------
    Tuple[str, str, str]
        For a Hangul syllable:
            - the leading consonant (choseong),
            - the vowel (jungseong),
            - the trailing consonant (jongseong), or "" when there is none.
        For any other character: ``("", char, "")``.

    Raises
    ------
    TypeError
        Raised by ``ord`` when ``char`` is not a string of length one.

    Examples
    --------
    >>> decompose_hangul("가")
    ('ᄀ', 'ᅡ', '')

    >>> decompose_hangul("각")
    ('ᄀ', 'ᅡ', 'ᆨ')

    >>> decompose_hangul("a")
    ('', 'a', '')
    """
    code_point = ord(char)
    if not 0xAC00 <= code_point <= 0xD7A3:
        return ("", char, "")

    # Syllables are ordered lead-major: each lead owns 588 (= 21 * 28)
    # consecutive code points, and each vowel within a lead owns 28, the
    # first of which is the syllable without a trailing consonant.
    lead_index, remainder = divmod(code_point - 0xAC00, 588)
    vowel_index, tail_index = divmod(remainder, 28)

    lead = chr(0x1100 + lead_index)
    vowel = chr(0x1161 + vowel_index)
    # Tail index 0 means "no tail"; index 1 maps to U+11A8, hence base 0x11A7.
    tail = chr(0x11A7 + tail_index) if tail_index else ""
    return (lead, vowel, tail)
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
def get_chosung(char: str) -> t.List[str]:
    """
    Look up the Chosung (initial consonant) candidates for a character.

    The lookup key is ``char`` wrapped in single quotes, so ``x`` is looked
    up in the CHOSUNG dictionary as ``'x'``.

    Parameters
    ----------
    char : str
        The character to look up.

    Returns
    -------
    List[str]
        The value stored in CHOSUNG for the quoted key, or an empty list
        when CHOSUNG has no such key.
    """
    key = f"'{char}'"
    try:
        candidates = CHOSUNG[key]
    except KeyError:
        candidates = []
    return candidates
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
def get_jongsung(char: str) -> t.List[str]:
    """
    Look up the Jongsung (final consonant) candidates for a character.

    The lookup key is ``char`` wrapped in single quotes, so ``x`` is looked
    up in the JONGSUNG dictionary as ``'x'``.

    Parameters
    ----------
    char : str
        The character to look up.

    Returns
    -------
    List[str]
        The value stored in JONGSUNG for the quoted key, or an empty list
        when JONGSUNG has no such key.
    """
    key = f"'{char}'"
    try:
        candidates = JONGSUNG[key]
    except KeyError:
        candidates = []
    return candidates
|
|
173
|
+
|
|
174
|
+
|
|
175
|
+
def get_jungsung(char: str) -> str:
    """
    Look up the Jungsung (medial vowel) for a character.

    The lookup key is ``char`` wrapped in single quotes, so ``x`` is looked
    up in the JUNGSUNG dictionary as ``'x'``.

    Parameters
    ----------
    char : str
        The character to look up.

    Returns
    -------
    str
        The value stored in JUNGSUNG for the quoted key, or an empty string
        when JUNGSUNG has no such key.
    """
    key = f"'{char}'"
    try:
        vowel = JUNGSUNG[key]
    except KeyError:
        vowel = ""
    return vowel
|
|
202
|
+
|
|
203
|
+
|
|
204
|
+
def split_into_chunks(
    data: t.Iterable[t.Any], size: int
) -> t.Iterator[t.Tuple[t.Any, ...]]:
    """
    Lazily split an iterable into consecutive tuples of at most ``size`` items.

    Parameters
    ----------
    data : iterable
        The iterable to be split into chunks. It is consumed lazily, one
        chunk at a time, as the returned iterator is advanced.
    size : int
        The maximum number of items per chunk. A size of 0 produces no
        chunks at all.

    Returns
    -------
    Iterator[Tuple[Any, ...]]
        An iterator of tuples. Every tuple holds ``size`` items except
        possibly the last one, which holds whatever is left over.

    Examples
    --------
    >>> list(split_into_chunks([1, 2, 3, 4, 5], 2))
    [(1, 2), (3, 4), (5,)]

    >>> list(split_into_chunks("abcdef", 3))
    [('a', 'b', 'c'), ('d', 'e', 'f')]
    """
    source = iter(data)
    # Two-argument iter(): call the lambda repeatedly and stop as soon as it
    # returns the sentinel (), i.e. when the source has nothing left to give.
    return iter(lambda: tuple(itertools.islice(source, size)), ())
|
|
231
|
+
|
|
232
|
+
|
|
233
|
+
def split_jamo(text: str) -> t.List[t.Tuple[str, ...]]:
    """
    Decompose each character of ``text`` and pad the first and last entries.

    Every character is passed through ``decompose_hangul``. The resulting
    3-tuples are then adjusted by position:

    - the first character's tuple gets one leading "" (length 4);
    - the last character's tuple gets two trailing "" (length 5);
    - every other tuple is left as is (length 3).

    The first-position rule wins, so a one-character ``text`` yields a single
    4-tuple with no trailing padding.

    Parameters
    ----------
    text : str
        The string of characters to be split.

    Returns
    -------
    List[Tuple[str, ...]]
        One tuple per input character, in order. An empty ``text`` yields an
        empty list.

    Examples
    --------
    >>> split_jamo("한글")
    [('', 'ᄒ', 'ᅡ', 'ᆫ'), ('ᄀ', 'ᅳ', 'ᆯ', '', '')]

    >>> split_jamo("가")
    [('', 'ᄀ', 'ᅡ', '')]
    """
    # NOTE(review): the padding presumably lets the flattened stream be
    # regrouped in threes as (previous tail, lead, vowel) -- confirm against
    # the caller, and check whether the one-character case (4 items, no
    # trailing pad) is intended.
    last_index = len(text) - 1
    result = []
    for index, char in enumerate(text):
        components = decompose_hangul(char)
        if index == 0:
            result.append(("", *components))
        elif index == last_index:
            result.append((*components, "", ""))
        else:
            result.append(components)
    return result
|