koroman 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- koroman-1.0.0/LICENSE +21 -0
- koroman-1.0.0/PKG-INFO +93 -0
- koroman-1.0.0/README.md +57 -0
- koroman-1.0.0/koroman/__init__.py +4 -0
- koroman-1.0.0/koroman/core.py +217 -0
- koroman-1.0.0/koroman.egg-info/PKG-INFO +93 -0
- koroman-1.0.0/koroman.egg-info/SOURCES.txt +10 -0
- koroman-1.0.0/koroman.egg-info/dependency_links.txt +1 -0
- koroman-1.0.0/koroman.egg-info/top_level.txt +1 -0
- koroman-1.0.0/setup.cfg +4 -0
- koroman-1.0.0/setup.py +36 -0
- koroman-1.0.0/tests/test_koroman.py +45 -0
koroman-1.0.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 Donghe Youn (Daissue)
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
koroman-1.0.0/PKG-INFO
ADDED
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: koroman
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: Korean Romanizer with pronunciation rules based on 국립국어원 표기법
|
|
5
|
+
Home-page: https://github.com/gerosyab/koroman
|
|
6
|
+
Author: Donghe Youn (Daissue)
|
|
7
|
+
Author-email: gerosyab@gmail.com
|
|
8
|
+
Project-URL: Bug Reports, https://github.com/gerosyab/koroman/issues
|
|
9
|
+
Project-URL: Source, https://github.com/gerosyab/koroman
|
|
10
|
+
Keywords: korean,romanizer,romanization,hangul,transliteration,linguistics,korean romanizer,korean romanization,korean transliteration,korean linguistics,koroman
|
|
11
|
+
Classifier: Development Status :: 5 - Production/Stable
|
|
12
|
+
Classifier: Intended Audience :: Developers
|
|
13
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
14
|
+
Classifier: Topic :: Text Processing :: Linguistic
|
|
15
|
+
Classifier: Programming Language :: Python :: 3
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.6
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.7
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
20
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
21
|
+
Classifier: Operating System :: OS Independent
|
|
22
|
+
Requires-Python: >=3.6
|
|
23
|
+
Description-Content-Type: text/markdown
|
|
24
|
+
License-File: LICENSE
|
|
25
|
+
Dynamic: author
|
|
26
|
+
Dynamic: author-email
|
|
27
|
+
Dynamic: classifier
|
|
28
|
+
Dynamic: description
|
|
29
|
+
Dynamic: description-content-type
|
|
30
|
+
Dynamic: home-page
|
|
31
|
+
Dynamic: keywords
|
|
32
|
+
Dynamic: license-file
|
|
33
|
+
Dynamic: project-url
|
|
34
|
+
Dynamic: requires-python
|
|
35
|
+
Dynamic: summary
|
|
36
|
+
|
|
37
|
+
> 🇰🇷 [한국어로 보기](./README.ko.md)
|
|
38
|
+
|
|
39
|
+
# KOROMAN - Korean Romanizer
|
|
40
|
+
|
|
41
|
+
**KOROMAN** is a multilingual Romanizer for Korean text, based on the Revised Romanization system (국립국어원 표기법) with additional pronunciation rules. It converts Hangul syllables into Romanized Latin script across multiple languages: **JavaScript, Python, and Java**.
|
|
42
|
+
|
|
43
|
+
## 🌐 Live Demo
|
|
44
|
+
- [한국어 버전](https://daissue.app/romanizer)
|
|
45
|
+
- [English version](https://daissue.app/en/romanizer)
|
|
46
|
+
|
|
47
|
+
---
|
|
48
|
+
|
|
49
|
+
## 📦 Features
|
|
50
|
+
- Supports Revised Romanization of Korean
|
|
51
|
+
- Applies key Korean phonological rules:
|
|
52
|
+
- Liaison (연음화)
|
|
53
|
+
- Nasal assimilation (비음화)
|
|
54
|
+
- Lateralization (유음화)
|
|
55
|
+
- Fortis/tense consonants (경음화)
|
|
56
|
+
- Provides casing options (lower, upper, capitalized)
|
|
57
|
+
- Fully tested in each language
|
|
58
|
+
|
|
59
|
+
---
|
|
60
|
+
|
|
61
|
+
## 🚀 Getting Started
|
|
62
|
+
|
|
63
|
+
### Python
|
|
64
|
+
```bash
|
|
65
|
+
pip install koroman
|
|
66
|
+
```
|
|
67
|
+
```python
|
|
68
|
+
from koroman import romanize
|
|
69
|
+
|
|
70
|
+
# Basic usage
|
|
71
|
+
romanize("한글") # → "hangeul"
|
|
72
|
+
|
|
73
|
+
# With pronunciation rules disabled
|
|
74
|
+
romanize("해돋이", use_pronunciation_rules=False) # → "haedodi"
|
|
75
|
+
|
|
76
|
+
# With pronunciation rules enabled (default)
|
|
77
|
+
romanize("해돋이") # → "haedoji"
|
|
78
|
+
|
|
79
|
+
# With different casing options
|
|
80
|
+
romanize("한글", casing_option="uppercase") # → "HANGEUL"
|
|
81
|
+
romanize("안녕 한글", casing_option="capitalize-word") # → "Annyeong Hangeul"
|
|
82
|
+
romanize("안녕\n한글 로마자 변환", casing_option="capitalize-line") # → "Annyeong\nHangeul romaja byeonhwan"
|
|
83
|
+
|
|
84
|
+
# Combining options
|
|
85
|
+
romanize("해돋이", use_pronunciation_rules=False, casing_option="uppercase") # → "HAEDODI"
|
|
86
|
+
```
|
|
87
|
+
---
|
|
88
|
+
|
|
89
|
+
## 📜 LICENSE
|
|
90
|
+
[MIT License](LICENSE)
|
|
91
|
+
|
|
92
|
+
2025 ⓒ Donghe Youn (Daissue)
|
|
93
|
+
|
koroman-1.0.0/README.md
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
> 🇰🇷 [한국어로 보기](./README.ko.md)
|
|
2
|
+
|
|
3
|
+
# KOROMAN - Korean Romanizer
|
|
4
|
+
|
|
5
|
+
**KOROMAN** is a multilingual Romanizer for Korean text, based on the Revised Romanization system (국립국어원 표기법) with additional pronunciation rules. It converts Hangul syllables into Romanized Latin script across multiple languages: **JavaScript, Python, and Java**.
|
|
6
|
+
|
|
7
|
+
## 🌐 Live Demo
|
|
8
|
+
- [한국어 버전](https://daissue.app/romanizer)
|
|
9
|
+
- [English version](https://daissue.app/en/romanizer)
|
|
10
|
+
|
|
11
|
+
---
|
|
12
|
+
|
|
13
|
+
## 📦 Features
|
|
14
|
+
- Supports Revised Romanization of Korean
|
|
15
|
+
- Applies key Korean phonological rules:
|
|
16
|
+
- Liaison (연음화)
|
|
17
|
+
- Nasal assimilation (비음화)
|
|
18
|
+
- Lateralization (유음화)
|
|
19
|
+
- Fortis/tense consonants (경음화)
|
|
20
|
+
- Provides casing options (lower, upper, capitalized)
|
|
21
|
+
- Fully tested in each language
|
|
22
|
+
|
|
23
|
+
---
|
|
24
|
+
|
|
25
|
+
## 🚀 Getting Started
|
|
26
|
+
|
|
27
|
+
### Python
|
|
28
|
+
```bash
|
|
29
|
+
pip install koroman
|
|
30
|
+
```
|
|
31
|
+
```python
|
|
32
|
+
from koroman import romanize
|
|
33
|
+
|
|
34
|
+
# Basic usage
|
|
35
|
+
romanize("한글") # → "hangeul"
|
|
36
|
+
|
|
37
|
+
# With pronunciation rules disabled
|
|
38
|
+
romanize("해돋이", use_pronunciation_rules=False) # → "haedodi"
|
|
39
|
+
|
|
40
|
+
# With pronunciation rules enabled (default)
|
|
41
|
+
romanize("해돋이") # → "haedoji"
|
|
42
|
+
|
|
43
|
+
# With different casing options
|
|
44
|
+
romanize("한글", casing_option="uppercase") # → "HANGEUL"
|
|
45
|
+
romanize("안녕 한글", casing_option="capitalize-word") # → "Annyeong Hangeul"
|
|
46
|
+
romanize("안녕\n한글 로마자 변환", casing_option="capitalize-line") # → "Annyeong\nHangeul romaja byeonhwan"
|
|
47
|
+
|
|
48
|
+
# Combining options
|
|
49
|
+
romanize("해돋이", use_pronunciation_rules=False, casing_option="uppercase") # → "HAEDODI"
|
|
50
|
+
```
|
|
51
|
+
---
|
|
52
|
+
|
|
53
|
+
## 📜 LICENSE
|
|
54
|
+
[MIT License](LICENSE)
|
|
55
|
+
|
|
56
|
+
2025 ⓒ Donghe Youn (Daissue)
|
|
57
|
+
|
|
@@ -0,0 +1,217 @@
|
|
|
1
|
+
import re
|
|
2
|
+
|
|
3
|
+
# Inclusive code point range of the precomposed Hangul syllables that
# split_hangul_to_jamos decomposes; anything outside it is passed through.
HANGUL_BASE = 0xAC00
HANGUL_END = 0xD7A3

# Initial consonants (choseong), indexed by syllable_index // (21 * 28).
CHO = [
    "ᄀ", "ᄁ", "ᄂ", "ᄃ", "ᄄ", "ᄅ", "ᄆ", "ᄇ", "ᄈ", "ᄉ",
    "ᄊ", "ᄋ", "ᄌ", "ᄍ", "ᄎ", "ᄏ", "ᄐ", "ᄑ", "ᄒ"
]

# Medial vowels (jungseong), indexed by (syllable_index % (21 * 28)) // 28.
JUNG = [
    "ᅡ", "ᅢ", "ᅣ", "ᅤ", "ᅥ", "ᅦ", "ᅧ", "ᅨ", "ᅩ", "ᅪ", "ᅫ",
    "ᅬ", "ᅭ", "ᅮ", "ᅯ", "ᅰ", "ᅱ", "ᅲ", "ᅳ", "ᅴ", "ᅵ"
]

# Final consonants (jongseong), indexed by syllable_index % 28.
# Index 0 is the empty string: the syllable has no final consonant.
JONG = [
    "", "ᆨ", "ᆩ", "ᆪ", "ᆫ", "ᆬ", "ᆭ", "ᆮ", "ᆯ", "ᆰ", "ᆱ", "ᆲ",
    "ᆳ", "ᆴ", "ᆵ", "ᆶ", "ᆷ", "ᆸ", "ᆹ", "ᆺ", "ᆻ", "ᆼ", "ᆽ", "ᆾ",
    "ᆿ", "ᇀ", "ᇁ", "ᇂ"
]

# Jamo -> Latin letters. romanize() looks every character up here and keeps
# characters that have no entry unchanged.
ROMAN_MAP = {
    # Initial consonants; the placeholder initial "ᄋ" contributes nothing.
    "ᄀ": "g", "ᄁ": "kk", "ᄂ": "n", "ᄃ": "d", "ᄄ": "tt",
    "ᄅ": "r", "ᄆ": "m", "ᄇ": "b", "ᄈ": "pp", "ᄉ": "s", "ᄊ": "ss",
    "ᄋ": "", "ᄌ": "j", "ᄍ": "jj", "ᄎ": "ch", "ᄏ": "k",
    "ᄐ": "t", "ᄑ": "p", "ᄒ": "h",

    # Medial vowels.
    "ᅡ": "a", "ᅢ": "ae", "ᅣ": "ya", "ᅤ": "yae", "ᅥ": "eo", "ᅦ": "e",
    "ᅧ": "yeo", "ᅨ": "ye", "ᅩ": "o", "ᅪ": "wa", "ᅫ": "wae",
    "ᅬ": "oe", "ᅭ": "yo", "ᅮ": "u", "ᅯ": "wo", "ᅰ": "we",
    "ᅱ": "wi", "ᅲ": "yu", "ᅳ": "eu", "ᅴ": "ui", "ᅵ": "i",

    # Final consonants.
    # NOTE(review): final "ᆮ" maps to "d" while the other stop finals map to
    # voiceless letters ("k", "p", "t"); the package tests expect "haedodi"
    # with pronunciation rules disabled, so confirm before changing it.
    "ᆨ": "k", "ᆩ": "k", "ᆪ": "k", "ᆫ": "n", "ᆬ": "n", "ᆭ": "n", "ᆮ": "d",
    "ᆯ": "l", "ᆰ": "k", "ᆱ": "m", "ᆲ": "p", "ᆳ": "t", "ᆴ": "t", "ᆵ": "p", "ᆶ": "h",
    "ᆷ": "m", "ᆸ": "p", "ᆹ": "p", "ᆺ": "t", "ᆻ": "t", "ᆼ": "ng",
    "ᆽ": "t", "ᆾ": "t", "ᆿ": "k", "ᇀ": "t", "ᇁ": "p", "ᇂ": "h"
}
|
|
38
|
+
|
|
39
|
+
# Ordered (compiled pattern, replacement) pairs used by
# apply_pronunciation_rules. The table is built once at import time instead of
# on every call. ORDER MATTERS: each rule sees the output of the rules above it.
_PRONUNCIATION_RULES = tuple(
    (re.compile(pattern), replacement)
    for pattern, replacement in (
        # ------------------------------------------------------------
        # 1. Strip U+11A7, a code point that is not a usable final.
        # ------------------------------------------------------------
        (r"\u11a7", ""),

        # ------------------------------------------------------------
        # 2. Nasalization: a stop final before initial n (U+1102) or
        #    m (U+1106) becomes the matching nasal final.
        # ------------------------------------------------------------
        # finals b, p, bs, lb, lp -> final m
        (r"[\u11b8\u11c1\u11b9\u11b2\u11b5](?=[\u1102\u1106])", "\u11b7"),
        # finals d, t, j, ch, s, ss, h -> final n
        (r"[\u11ae\u11c0\u11bd\u11be\u11ba\u11bb\u11c2](?=[\u1102\u1106])", "\u11ab"),
        # finals g, kk, k, gs, lg -> final ng
        (r"[\u11a8\u11a9\u11bf\u11aa\u11b0](?=[\u1102\u1106])", "\u11bc"),

        # ------------------------------------------------------------
        # 3. Linking and n/l assimilation across the syllable boundary.
        # ------------------------------------------------------------
        # final g + placeholder initial + y-vowel (ya, yae, yeo, ye, yo, yu)
        # -> final ng + initial n
        (r"\u11a8\u110b(?=[\u1163\u1164\u1167\u1168\u116d\u1172])", "\u11bc\u1102"),
        # final l + placeholder initial + the same y-vowels -> final l + initial r
        (r"\u11af\u110b(?=[\u1163\u1164\u1167\u1168\u116d\u1172])", "\u11af\u1105"),
        # final g or ng + initial r -> final ng + initial n
        (r"[\u11a8\u11bc]\u1105", "\u11bc\u1102"),
        # final n + initial r + vowel o -> final n + initial n
        (r"\u11ab\u1105(?=\u1169)", "\u11ab\u1102"),
        # final l + initial n, or final n + initial r -> final l + initial r
        (r"\u11af\u1102|\u11ab\u1105", "\u11af\u1105"),
        # final m or b + initial r -> final m + initial n
        (r"[\u11b7\u11b8]\u1105", "\u11b7\u1102"),
        # final lg + initial r -> final g + initial r
        (r"\u11b0\u1105", "\u11a8\u1105"),

        # ------------------------------------------------------------
        # 4. Insert a hyphen between a plain stop final and the
        #    aspirated initial of the same series.
        # ------------------------------------------------------------
        (r"\u11a8\u110f", "\u11a8-\u110f"),  # final g + initial k
        (r"\u11b8\u1111", "\u11b8-\u1111"),  # final b + initial p
        (r"\u11ae\u1110", "\u11ae-\u1110"),  # final d + initial t

        # ------------------------------------------------------------
        # 5. Decompose compound finals into two single finals.
        # ------------------------------------------------------------
        (r"\u11aa", "\u11a8\u11ba"),  # gs -> g + s
        (r"\u11ac", "\u11ab\u11bd"),  # nj -> n + j
        (r"\u11ad", "\u11ab\u11c2"),  # nh -> n + h
        (r"\u11b0", "\u11af\u11a8"),  # lg -> l + g
        (r"\u11b1", "\u11af\u11b7"),  # lm -> l + m
        (r"\u11b2", "\u11af\u11b8"),  # lb -> l + b
        (r"\u11b3", "\u11af\u11ba"),  # ls -> l + s
        (r"\u11b4", "\u11af\u11c0"),  # lt -> l + t
        (r"\u11b5", "\u11af\u11c1"),  # lp -> l + p
        (r"\u11b6", "\u11af\u11c2"),  # lh -> l + h
        (r"\u11b9", "\u11b8\u11ba"),  # bs -> b + s

        # ------------------------------------------------------------
        # 6. Final d / t + placeholder initial + vowel i -> ji / chi.
        # ------------------------------------------------------------
        (r"\u11ae\u110b\u1175", "\u110c\u1175"),
        (r"\u11c0\u110b\u1175", "\u110e\u1175"),

        # ------------------------------------------------------------
        # 7. Liaison: a final followed by the placeholder initial moves
        #    into the next syllable as its initial consonant.
        # ------------------------------------------------------------
        (r"\u11a8\u110b", "\u1100"),  # g
        (r"\u11a9\u110b", "\u1101"),  # kk
        (r"\u11ae\u110b", "\u1103"),  # d
        (r"\u11af\u110b", "\u1105"),  # l -> r
        (r"\u11b8\u110b", "\u1107"),  # b
        (r"\u11ba\u110b", "\u1109"),  # s
        (r"\u11bb\u110b", "\u110a"),  # ss
        (r"\u11bd\u110b", "\u110c"),  # j
        (r"\u11be\u110b", "\u110e"),  # ch
        (r"\u11c2\u110b", ""),        # final h + placeholder initial -> dropped

        # ------------------------------------------------------------
        # 8. Aspiration: h merging with a neighbouring plain stop.
        # ------------------------------------------------------------
        (r"\u11c2\u1100|\u11a8\u1112", "\u110f"),  # h+g or g+h -> k
        (r"\u11c2\u1103|\u11ae\u1112", "\u1110"),  # h+d or d+h -> t
        (r"\u11c2\u110c|\u11bd\u1112", "\u110e"),  # h+j or j+h -> ch
        (r"\u11c2\u1107", "\u1107"),               # h+b -> b
        (r"\u11b8\u1112", "\u1111"),               # b+h -> p

        # ------------------------------------------------------------
        # 9. Special cases and final clean-up.
        # ------------------------------------------------------------
        # final l + initial r is emitted directly as the Latin letters "ll"
        (r"\u11af\u1105", "ll"),
        # drop a remaining final h unless whitespace or end of text follows
        (r"\u11c2(?!\s|$)", ""),
        # two adjacent finals: keep only the first one
        (r"([\u11a8-\u11c2])([\u11a8-\u11c2])", r"\1"),
    )
)


def apply_pronunciation_rules(jamo_str):
    """Rewrite a conjoining-jamo string according to the pronunciation rules.

    Every rule in ``_PRONUNCIATION_RULES`` is applied once, in table order, to
    the whole string; later rules operate on the output of earlier ones.

    Args:
        jamo_str (str): Text in which Hangul syllables have already been
            decomposed into conjoining jamo (see ``split_hangul_to_jamos``).
            Characters that no rule matches are left untouched.

    Returns:
        str: The rewritten string. It is still mostly jamo, but the rules may
        have inserted plain Latin text ("ll") and hyphens.
    """
    for pattern, replacement in _PRONUNCIATION_RULES:
        jamo_str = pattern.sub(replacement, jamo_str)
    return jamo_str
|
|
144
|
+
|
|
145
|
+
def split_hangul_to_jamos(text):
    """Decompose every precomposed Hangul syllable in ``text`` into jamo.

    Characters whose code point lies outside ``HANGUL_BASE``..``HANGUL_END``
    are copied to the output unchanged.

    Args:
        text (str): Arbitrary text, possibly containing Hangul syllables.

    Returns:
        str: ``text`` with each syllable replaced by its initial, medial and
        (possibly empty) final entry from ``CHO``, ``JUNG`` and ``JONG``.
    """
    vowel_count = len(JUNG)   # 21
    final_count = len(JONG)   # 28, including the "no final" slot
    # Collect the pieces and join once instead of growing a string with "+=".
    pieces = []
    for char in text:
        code = ord(char)
        if code < HANGUL_BASE or code > HANGUL_END:
            pieces.append(char)
            continue
        # A syllable index encodes (initial, medial, final) in mixed radix.
        cho_index, remainder = divmod(code - HANGUL_BASE, vowel_count * final_count)
        jung_index, jong_index = divmod(remainder, final_count)
        pieces.append(CHO[cho_index] + JUNG[jung_index] + JONG[jong_index])
    return "".join(pieces)
|
|
158
|
+
|
|
159
|
+
def capitalize_words(text):
    """Upper-case the first letter of every word and lower-case the rest.

    Words are separated by whitespace (``str.isspace``). Punctuation in front
    of a word, such as an opening quote or bracket, is copied unchanged and
    does not use up the capitalization, so ``'(seoul)'`` becomes ``'(Seoul)'``.
    A word that starts with a digit is left without a capital, as before.

    Args:
        text (str): Text to re-case.

    Returns:
        str: The re-cased text; whitespace is preserved exactly.
    """
    pieces = []
    awaiting_capital = True
    for char in text:
        if char.isspace():
            awaiting_capital = True
            pieces.append(char)
        elif not awaiting_capital:
            pieces.append(char.lower())
        elif char.isalnum():
            # First letter or digit of the word: this is the word start.
            pieces.append(char.upper())
            awaiting_capital = False
        else:
            # Leading punctuation: keep waiting for the real word start.
            pieces.append(char)
    return "".join(pieces)
|
|
173
|
+
|
|
174
|
+
def capitalize_lines(text):
    """Upper-case the first letter of every line and lower-case the rest.

    A line starts at the beginning of ``text`` and after every line-feed
    character. Leading characters that are neither letters nor digits
    (indentation, punctuation, or the carriage return of a LF+CR break) are
    copied unchanged and do not use up the capitalization.

    Args:
        text (str): Text to re-case.

    Returns:
        str: The re-cased text; line breaks are preserved exactly.
    """
    pieces = []
    awaiting_capital = True
    for char in text:
        if char == "\n":
            awaiting_capital = True
            pieces.append(char)
        elif not awaiting_capital:
            pieces.append(char.lower())
        elif char.isalnum():
            # First letter or digit of the line: this is the line start.
            pieces.append(char.upper())
            awaiting_capital = False
        else:
            # Leading whitespace/punctuation: keep waiting for the line start.
            pieces.append(char)
    return "".join(pieces)
|
|
188
|
+
|
|
189
|
+
def romanize(text, **options):
    """
    Romanize Korean text.

    Hangul syllables are split into jamo, optionally rewritten by the
    pronunciation rules, mapped to Latin letters through ROMAN_MAP, and
    finally re-cased. Characters without a ROMAN_MAP entry pass through.

    Args:
        text (str): Korean text to romanize
        **options: Optional keyword settings; unrecognised keys are ignored:
            - use_pronunciation_rules (bool): Apply the pronunciation rules
              before mapping to Latin letters (default: True)
            - casing_option (str): "lowercase", "uppercase",
              "capitalize-word" or "capitalize-line" (default: "lowercase");
              any other value is treated as "lowercase"

    Returns:
        str: Romanized text
    """
    apply_rules = options.get('use_pronunciation_rules', True)
    casing = options.get('casing_option', "lowercase")

    jamos = split_hangul_to_jamos(text)
    if apply_rules:
        jamos = apply_pronunciation_rules(jamos)

    letters = [ROMAN_MAP.get(jamo, jamo) for jamo in jamos]
    romanized = "".join(letters)

    if casing == "uppercase":
        return romanized.upper()
    if casing == "capitalize-word":
        return capitalize_words(romanized)
    if casing == "capitalize-line":
        return capitalize_lines(romanized)
    # Everything else, including the default, is lower-cased.
    return romanized.lower()
|
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: koroman
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: Korean Romanizer with pronunciation rules based on 국립국어원 표기법
|
|
5
|
+
Home-page: https://github.com/gerosyab/koroman
|
|
6
|
+
Author: Donghe Youn (Daissue)
|
|
7
|
+
Author-email: gerosyab@gmail.com
|
|
8
|
+
Project-URL: Bug Reports, https://github.com/gerosyab/koroman/issues
|
|
9
|
+
Project-URL: Source, https://github.com/gerosyab/koroman
|
|
10
|
+
Keywords: korean,romanizer,romanization,hangul,transliteration,linguistics,korean romanizer,korean romanization,korean transliteration,korean linguistics,koroman
|
|
11
|
+
Classifier: Development Status :: 5 - Production/Stable
|
|
12
|
+
Classifier: Intended Audience :: Developers
|
|
13
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
14
|
+
Classifier: Topic :: Text Processing :: Linguistic
|
|
15
|
+
Classifier: Programming Language :: Python :: 3
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.6
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.7
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
20
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
21
|
+
Classifier: Operating System :: OS Independent
|
|
22
|
+
Requires-Python: >=3.6
|
|
23
|
+
Description-Content-Type: text/markdown
|
|
24
|
+
License-File: LICENSE
|
|
25
|
+
Dynamic: author
|
|
26
|
+
Dynamic: author-email
|
|
27
|
+
Dynamic: classifier
|
|
28
|
+
Dynamic: description
|
|
29
|
+
Dynamic: description-content-type
|
|
30
|
+
Dynamic: home-page
|
|
31
|
+
Dynamic: keywords
|
|
32
|
+
Dynamic: license-file
|
|
33
|
+
Dynamic: project-url
|
|
34
|
+
Dynamic: requires-python
|
|
35
|
+
Dynamic: summary
|
|
36
|
+
|
|
37
|
+
> 🇰🇷 [한국어로 보기](./README.ko.md)
|
|
38
|
+
|
|
39
|
+
# KOROMAN - Korean Romanizer
|
|
40
|
+
|
|
41
|
+
**KOROMAN** is a multilingual Romanizer for Korean text, based on the Revised Romanization system (국립국어원 표기법) with additional pronunciation rules. It converts Hangul syllables into Romanized Latin script across multiple languages: **JavaScript, Python, and Java**.
|
|
42
|
+
|
|
43
|
+
## 🌐 Live Demo
|
|
44
|
+
- [한국어 버전](https://daissue.app/romanizer)
|
|
45
|
+
- [English version](https://daissue.app/en/romanizer)
|
|
46
|
+
|
|
47
|
+
---
|
|
48
|
+
|
|
49
|
+
## 📦 Features
|
|
50
|
+
- Supports Revised Romanization of Korean
|
|
51
|
+
- Applies key Korean phonological rules:
|
|
52
|
+
- Liaison (연음화)
|
|
53
|
+
- Nasal assimilation (비음화)
|
|
54
|
+
- Lateralization (유음화)
|
|
55
|
+
- Fortis/tense consonants (경음화)
|
|
56
|
+
- Provides casing options (lower, upper, capitalized)
|
|
57
|
+
- Fully tested in each language
|
|
58
|
+
|
|
59
|
+
---
|
|
60
|
+
|
|
61
|
+
## 🚀 Getting Started
|
|
62
|
+
|
|
63
|
+
### Python
|
|
64
|
+
```bash
|
|
65
|
+
pip install koroman
|
|
66
|
+
```
|
|
67
|
+
```python
|
|
68
|
+
from koroman import romanize
|
|
69
|
+
|
|
70
|
+
# Basic usage
|
|
71
|
+
romanize("한글") # → "hangul"
|
|
72
|
+
|
|
73
|
+
# With pronunciation rules disabled
|
|
74
|
+
romanize("해돋이", use_pronunciation_rules=False) # → "haedodi"
|
|
75
|
+
|
|
76
|
+
# With pronunciation rules enabled (default)
|
|
77
|
+
romanize("해돋이") # → "haedoji"
|
|
78
|
+
|
|
79
|
+
# With different casing options
|
|
80
|
+
romanize("한글", casing_option="uppercase") # → "HANGUL"
|
|
81
|
+
romanize("안녕 한글", casing_option="capitalize-word") # → "Annyeong Hangeul"
|
|
82
|
+
romanize("안녕\n한글 로마자 변환", casing_option="capitalize-line") # → "Annyeong\nHangeul Romaja Byeonhwan"
|
|
83
|
+
|
|
84
|
+
# Combining options
|
|
85
|
+
romanize("해돋이", use_pronunciation_rules=False, casing_option="uppercase") # → "HAEDODI"
|
|
86
|
+
```
|
|
87
|
+
---
|
|
88
|
+
|
|
89
|
+
## 📜 LICENSE
|
|
90
|
+
[MIT License](LICENSE)
|
|
91
|
+
|
|
92
|
+
2025 ⓒ Donghe Youn (Daissue)
|
|
93
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
koroman
|
koroman-1.0.0/setup.cfg
ADDED
koroman-1.0.0/setup.py
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
from setuptools import setup, find_packages
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
|
|
4
|
+
# The README next to this file doubles as the long description on the index.
this_directory = Path(__file__).parent
with open(this_directory / "README.md", encoding="utf-8") as readme_file:
    long_description = readme_file.read()

# Trove classifiers advertised for the distribution.
CLASSIFIERS = [
    'Development Status :: 5 - Production/Stable',
    'Intended Audience :: Developers',
    'Topic :: Software Development :: Libraries :: Python Modules',
    'Topic :: Text Processing :: Linguistic',
    'Programming Language :: Python :: 3',
    'Programming Language :: Python :: 3.6',
    'Programming Language :: Python :: 3.7',
    'Programming Language :: Python :: 3.8',
    'Programming Language :: Python :: 3.9',
    'License :: OSI Approved :: MIT License',
    'Operating System :: OS Independent',
]

# Extra links shown alongside the home page.
PROJECT_URLS = {
    'Bug Reports': 'https://github.com/gerosyab/koroman/issues',
    'Source': 'https://github.com/gerosyab/koroman',
}

setup(
    # Identity
    name='koroman',
    version='1.0.0',
    description='Korean Romanizer with pronunciation rules based on 국립국어원 표기법',
    long_description=long_description,
    long_description_content_type='text/markdown',
    # Authorship and links
    author='Donghe Youn (Daissue)',
    author_email='gerosyab@gmail.com',
    url='https://github.com/gerosyab/koroman',
    project_urls=PROJECT_URLS,
    # Contents and compatibility
    packages=find_packages(),
    python_requires='>=3.6',
    # Index metadata
    classifiers=CLASSIFIERS,
    keywords='korean, romanizer, romanization, hangul, transliteration, linguistics, korean romanizer, korean romanization, korean transliteration, korean linguistics, koroman',
)
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
import unittest
|
|
2
|
+
from koroman import romanize
|
|
3
|
+
|
|
4
|
+
class TestKoroman(unittest.TestCase):
    """Table-driven checks of romanize() against known romanizations."""

    def _check(self, cases):
        """Assert romanize(text, **options) == expected for every case."""
        for text, options, expected in cases:
            with self.subTest(text=text, options=options):
                self.assertEqual(romanize(text, **options), expected)

    def test_basic(self):
        self._check([
            ("한글", {}, "hangeul"),
            ("로마자", {}, "romaja"),
            ("안녕하세요", {}, "annyeonghaseyo"),
            ("테스트", {}, "teseuteu"),
        ])

    def test_casing(self):
        phrase = "한글 로마자 안녕하세요"
        self._check([
            ("한글", {"casing_option": "lowercase"}, "hangeul"),
            ("한글", {"casing_option": "uppercase"}, "HANGEUL"),
            (phrase, {"casing_option": "capitalize-word"}, "Hangeul Romaja Annyeonghaseyo"),
            (phrase, {"casing_option": "capitalize-line"}, "Hangeul romaja annyeonghaseyo"),
        ])

    def test_pronunciation_rules(self):
        rules_off = {"use_pronunciation_rules": False}
        rules_on = {"use_pronunciation_rules": True}
        self._check([
            ("해돋이", {}, "haedoji"),
            ("해돋이", rules_off, "haedodi"),
            ("문래역", {}, "mullaeyeok"),
            ("문래역", rules_off, "munraeyeok"),
            ("선릉역", {}, "seolleungyeok"),
            ("선릉역", rules_off, "seonreungyeok"),
            ("역량", {}, "yeongnyang"),
            ("역량", rules_on, "yeongnyang"),
        ])

    def test_multiline_and_spacing(self):
        # The same two sentences joined by each line-break style; the break
        # must come through the romanizer untouched.
        first_ko, second_ko = "여기는 선릉역 입니다.", "해돋이와 문래역 그리고 역량 개발."
        first_en = "yeogineun seolleungyeok imnida."
        second_en = "haedojiwa mullaeyeok geurigo yeongnyang gaebal."
        self._check([
            (first_ko + line_break + second_ko, {}, first_en + line_break + second_en)
            for line_break in ("\n", "\r\n", "\n\r")
        ])
|
|
40
|
+
|
|
41
|
+
# When run as a script: print two sample romanizations for a quick visual
# check, then hand control to the unittest runner.
if __name__ == '__main__':
    print(romanize("여기는 선릉역 입니다.\n\r해돋이와 문래역 그리고 역량 개발."))
    print(romanize("해돋이"))
    unittest.main()
|
|
45
|
+
|