gimchi 0.1.6 → 0.1.7
Sign up to get free protection for your applications and to get access to all the features.
- data/README.markdown +134 -0
- data/config/default.yml +5 -10
- data/lib/gimchi/char.rb +104 -104
- data/lib/gimchi/korean.rb +291 -277
- data/lib/gimchi/patch_1.8.rb +18 -18
- data/lib/gimchi/pronouncer.rb +488 -488
- data/test/test_gimchi.rb +4 -0
- metadata +14 -14
- data/README.rdoc +0 -120
data/README.markdown
ADDED
@@ -0,0 +1,134 @@
|
|
1
|
+
# gimchi
|
2
|
+
|
3
|
+
Gimchi is a simple Ruby gem which knows how to handle Korean strings. It knows
|
4
|
+
how to dissect Korean characters into their three components, namely chosung,
|
5
|
+
jungsung and optional jongsung. It knows how Korean sentences are pronounced
|
6
|
+
and how they're written in the Roman alphabet.
|
7
|
+
|
8
|
+
Gimchi (only partially) implements the following rules dictated by
|
9
|
+
The National Institute of The Korean Language (http://www.korean.go.kr)
|
10
|
+
* Korean Standard Pronunciation
|
11
|
+
* Korean Romanization
|
12
|
+
|
13
|
+
## Installation
|
14
|
+
```
|
15
|
+
gem install gimchi
|
16
|
+
```
|
17
|
+
|
18
|
+
## Usage
|
19
|
+
|
20
|
+
### Creating Gimchi::Korean instance
|
21
|
+
```ruby
|
22
|
+
require 'gimchi'
|
23
|
+
|
24
|
+
ko = Gimchi::Korean.new
|
25
|
+
```
|
26
|
+
|
27
|
+
### Checking if the given character is in the Korean alphabet
|
28
|
+
```ruby
|
29
|
+
ko.korean_char? 'ㄱ' # true
|
30
|
+
ko.complete_korean_char? 'ㄱ' # false
|
31
|
+
|
32
|
+
ko.korean_char? 'ㅏ' # true
|
33
|
+
ko.complete_korean_char? 'ㅏ' # false
|
34
|
+
|
35
|
+
ko.korean_char? '가' # true
|
36
|
+
ko.complete_korean_char? '가' # true
|
37
|
+
```
|
38
|
+
|
39
|
+
### Usage of Gimchi::Korean::Char
|
40
|
+
```ruby
|
41
|
+
arr = ko.dissect '이것은 한글입니다.'
|
42
|
+
# [이, 것, 은, " ", 한, 글, 입, 니, 다, "."]
|
43
|
+
|
44
|
+
arr[4].class # Gimchi::Korean::Char
|
45
|
+
|
46
|
+
arr[4].chosung # "ㅎ"
|
47
|
+
arr[4].jungsung # "ㅏ"
|
48
|
+
arr[4].jongsung # "ㄴ"
|
49
|
+
arr[4].to_a # ["ㅎ", "ㅏ", "ㄴ"]
|
50
|
+
arr[4].to_s # "한"
|
51
|
+
|
52
|
+
arr[4].chosung = 'ㄷ'
|
53
|
+
arr[4].jongsung = 'ㄹ'
|
54
|
+
arr[4].to_s # "달"
|
55
|
+
arr[4].complete? # true
|
56
|
+
arr[4].partial? # false
|
57
|
+
|
58
|
+
arr[4].chosung = nil
|
59
|
+
arr[4].jongsung = nil
|
60
|
+
arr[4].complete? # false
|
61
|
+
arr[4].partial? # true
|
62
|
+
```
|
63
|
+
|
64
|
+
### Reading numbers in Korean
|
65
|
+
```ruby
|
66
|
+
ko.read_number(1999) # "천 구백 구십 구"
|
67
|
+
ko.read_number(- 100.123) # "마이너스 백점일이삼"
|
68
|
+
ko.read_number("153,191,100,678.3214")
|
69
|
+
# "천 오백 삼십 일억 구천 백 십만 육백 칠십 팔점삼이일사"
|
70
|
+
|
71
|
+
# Age, Time ( -살, -시 )
|
72
|
+
ko.read_number("20살") # "스무살"
|
73
|
+
ko.read_number("13 살") # "열세 살"
|
74
|
+
ko.read_number("7시 30분") # "일곱시 삼십분"
|
75
|
+
```
|
76
|
+
|
77
|
+
### Standard pronunciation (partially implemented)
|
78
|
+
```ruby
|
79
|
+
str = "됐어 됐어 이제 그런 가르침은 됐어 매일 아침 7 시 30 분까지 우릴 조그만 교실로 몰아넣고"
|
80
|
+
ko.pronounce str
|
81
|
+
# "돼써 돼써 이제 그런 가르치믄 돼써 매일 아침 일곱 시 삼십 분까지 우릴 조그만 교실로 모라너코"
|
82
|
+
|
83
|
+
ko.pronounce str, :slur => true
|
84
|
+
# "돼써 돼써 이제 그런 가르치믄 돼써 매이 라치 밀곱 씨 삼십 뿐까지 우릴 조그만 교실로 모라너코"
|
85
|
+
|
86
|
+
ko.pronounce str, :pronounce_each_char => true
|
87
|
+
# "됃어 됃어 이제 그런 가르침은 됃어 매일 아침 일곱 시 삼십 분까지 우릴 조그만 교실로 몰아너고"
|
88
|
+
|
89
|
+
ko.pronounce str, :number => false
|
90
|
+
# "돼써 돼써 이제 그런 가르치믄 돼써 매일 아침 7 시 30 분까지 우릴 조그만 교실로 모라너코"
|
91
|
+
```
|
92
|
+
|
93
|
+
### Romanization (partially implemented)
|
94
|
+
```ruby
|
95
|
+
str = "됐어 됐어 이제 그런 가르침은 됐어 매일 아침 7 시 30 분까지 우릴 조그만 교실로 몰아넣고"
|
96
|
+
|
97
|
+
ko.romanize str
|
98
|
+
# "Dwaesseo dwaesseo ije geureon gareuchimeun dwaesseo mae-il achim ilgop si samsip bunkkaji uril jogeuman gyosillo moraneoko"
|
99
|
+
ko.romanize str, :slur => true
|
100
|
+
# "Dwaesseo dwaesseo ije geureon gareuchimeun dwaesseo mae-i rachi milgop ssi samsip ppunkkaji uril jogeuman gyosillo moraneoko"
|
101
|
+
ko.romanize str, :as_pronounced => false
|
102
|
+
# "Dwaet-eo dwaet-eo ije geureon gareuchim-eun dwaet-eo mae-il achim ilgop si samsip bunkkaji uril jogeuman gyosillo mol-aneogo"
|
103
|
+
ko.romanize str, :number => false
|
104
|
+
# "Dwaesseo dwaesseo ije geureon gareuchimeun dwaesseo mae-il achim 7 si 30 bunkkaji uril jogeuman gyosillo moraneoko"
|
105
|
+
```
|
106
|
+
|
107
|
+
## Limitation of the implementation
|
108
|
+
|
109
|
+
Unfortunately in order to implement the complete specification of Korean
|
110
|
+
pronunciation and romanization, we need NLP, huge Korean dictionaries and even
|
111
|
+
semantic analysis of the given string. And even with all that complex
|
112
|
+
processing, we cannot guarantee 100% accuracy of the output. So yes, that is
|
113
|
+
definitely not what this gem tries to achieve. Gimchi tries to achieve "some"
|
114
|
+
level of accuracy with relatively simple code.
|
115
|
+
|
116
|
+
Currently, Gimchi code contains a lot of ad-hoc (possibly invalid) patches
|
117
|
+
that try to improve the quality of the output, which should be
|
118
|
+
refactored soon.
|
119
|
+
|
120
|
+
## Contributing to gimchi
|
121
|
+
|
122
|
+
* Check out the latest master to make sure the feature hasn't been implemented or the bug hasn't been fixed yet
|
123
|
+
* Check out the issue tracker to make sure someone hasn't already requested it and/or contributed it
|
124
|
+
* Fork the project
|
125
|
+
* Start a feature/bugfix branch
|
126
|
+
* Commit and push until you are happy with your contribution
|
127
|
+
* Make sure to add tests for it. This is important so I don't break it in a future version unintentionally.
|
128
|
+
* Please try not to mess with the Rakefile, version, or history. If you want to have your own version, or if it is otherwise necessary, that is fine, but please isolate it to its own commit so I can cherry-pick around it.
|
129
|
+
|
130
|
+
## Copyright
|
131
|
+
|
132
|
+
Copyright (c) 2011 Junegunn Choi. See LICENSE.txt for
|
133
|
+
further details.
|
134
|
+
|
data/config/default.yml
CHANGED
@@ -90,8 +90,7 @@ number:
|
|
90
90
|
units: ["", 만, 억, 조, 경, 해, 자, 양, 구, 간, 정, 재, 극, 항하사, 아승기, 나유타, 불가사의, 무량대수]
|
91
91
|
digits: [영, 일, 이, 삼, 사, 오, 육, 칠, 팔, 구]
|
92
92
|
post substitution:
|
93
|
-
|
94
|
-
: 만
|
93
|
+
"^일만": 만
|
95
94
|
|
96
95
|
# 정수형일 때 또다른 표현법 (나이, 시간)
|
97
96
|
alt notation:
|
@@ -107,8 +106,7 @@ number:
|
|
107
106
|
tenfolds: [열, 스물, 서른, 마흔, 쉰, 예순, 일흔, 여든, 아흔, 백]
|
108
107
|
digits: ["", 한, 두, 세, 네, 다섯, 여섯, 일곱, 여덟, 아홉]
|
109
108
|
post substitution:
|
110
|
-
|
111
|
-
: 스무
|
109
|
+
"스물$": 스무
|
112
110
|
|
113
111
|
romanization:
|
114
112
|
chosung:
|
@@ -171,9 +169,6 @@ romanization:
|
|
171
169
|
ttt: t-tt
|
172
170
|
ppp: p-pp
|
173
171
|
"--": "-"
|
174
|
-
|
175
|
-
: "
|
176
|
-
|
177
|
-
: "\\1"
|
178
|
-
? !ruby/regexp /-$/
|
179
|
-
: ""
|
172
|
+
"n-([^gaeiou])": "n\\1"
|
173
|
+
"-(\\s)": "\\1"
|
174
|
+
"-$": ""
|
data/lib/gimchi/char.rb
CHANGED
@@ -2,122 +2,122 @@
|
|
2
2
|
|
3
3
|
module Gimchi
|
4
4
|
class Korean
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
5
|
+
# Class representing a single Korean character. Its three components,
# `chosung', `jungsung' and `jongsung', can be read and set individually.
#
# `to_s' merges the components back into a String. `to_a' returns the three
# components in chosung, jungsung, jongsung order.
class Char
  # @return [String, nil] Chosung component of this character.
  attr_reader :chosung
  # @return [String, nil] Jungsung component of this character.
  attr_reader :jungsung
  # @return [String, nil] Jongsung component of this character.
  attr_reader :jongsung

  # @param [Gimchi::Korean] kor Gimchi::Korean instance; supplies the
  #   component tables and the character predicates used here
  # @param [String] kchar Korean character string
  # @raise [ArgumentError] if kor.korean_char? rejects kchar
  def initialize kor, kchar
    raise ArgumentError.new('Not a korean character') unless kor.korean_char? kchar

    @kor = kor
    # NOTE(review): @cur is not read anywhere in this class; kept in case
    # code outside this file relies on it -- confirm before removing.
    @cur = []
    if @kor.complete_korean_char? kchar
      # Complete characters are numbered from 0xAC00: each chosung owns a
      # run of 21 * 28 code points and each jungsung a run of 28, with the
      # remainder selecting the jongsung (0 meaning "no jongsung").
      offset = kchar.unpack('U').first - 0xAC00
      self.chosung  = @kor.chosungs[offset / (21 * 28)]
      self.jungsung = @kor.jungsungs[(offset % (21 * 28)) / 28]
      self.jongsung = ([nil] + @kor.jongsungs)[offset % 28]
    elsif @kor.chosungs.include? kchar
      self.chosung = kchar
    elsif @kor.jungsungs.include? kchar
      self.jungsung = kchar
    elsif @kor.jongsungs.include? kchar
      self.jongsung = kchar
    end
  end

  # Recombines components into a korean character.
  #
  # Returns "" when both chosung and jungsung are missing (so a Char holding
  # only a jongsung also yields ""), and the lone component when exactly one
  # of chosung / jungsung is present.
  # @return [String] Combined korean character
  def to_s
    if chosung.nil? && jungsung.nil?
      ""
    elsif chosung && jungsung
      # Unknown components fall back to index 0 rather than raising.
      n1 = @kor.chosungs.index(chosung) || 0
      n2 = @kor.jungsungs.index(jungsung) || 0
      n3 = ([nil] + @kor.jongsungs).index(jongsung) || 0
      [ 0xAC00 + n1 * (21 * 28) + n2 * 28 + n3 ].pack('U')
    else
      chosung || jungsung
    end
  end

  # Sets the chosung component. The stored value is a copy of the argument
  # extended with Component.
  # @param [String, nil] c New chosung, or nil to clear it
  # @raise [ArgumentError] if c is not one of the known chosungs
  def chosung= c
    raise ArgumentError.new('Invalid chosung component') if
      c && @kor.chosungs.include?(c) == false
    @chosung = c && c.dup.extend(Component).tap { |e| e.kor = @kor }
  end

  # Sets the jungsung component. The stored value is a copy of the argument
  # extended with Component.
  # @param [String, nil] c New jungsung, or nil to clear it
  # @raise [ArgumentError] if c is not one of the known jungsungs
  def jungsung= c
    raise ArgumentError.new('Invalid jungsung component') if
      c && @kor.jungsungs.include?(c) == false
    @jungsung = c && c.dup.extend(Component).tap { |e| e.kor = @kor }
  end

  # Sets the jongsung component. The stored value is a copy of the argument
  # extended with Component.
  # @param [String, nil] c New jongsung, or nil to clear it
  # @raise [ArgumentError] if c is not one of the known jongsungs
  def jongsung= c
    raise ArgumentError.new('Invalid jongsung component') if
      c && @kor.jongsungs.include?(c) == false
    @jongsung = c && c.dup.extend(Component).tap { |e| e.kor = @kor }
  end

  # Returns Array of three components.
  #
  # @return [Array] chosung, jungsung and jongsung, in that order
  def to_a
    [chosung, jungsung, jongsung]
  end

  # Checks if this is a complete Korean character, i.e. it has both a
  # chosung and a jungsung.
  def complete?
    chosung.nil? == false && jungsung.nil? == false
  end

  # Checks if this is a non-complete Korean character.
  # e.g. ㅇ, ㅏ
  def partial?
    chosung.nil? || jungsung.nil?
  end

  private
  # Three components of Korean::Char are extended to support #vowel? and #consonant? method.
  module Component
    # @return [Korean] Hosting Korean instance
    attr_accessor :kor

    # Is this component a vowel?
    def vowel?
      kor.jungsungs.include? self
    end

    # Is this component a consonant? 'ㅇ' is explicitly excluded.
    def consonant?
      self != 'ㅇ' && kor.chosungs.include?(self)
    end
  end#Component
end#Char
|
121
121
|
end#Korean
|
122
122
|
end#Gimchi
|
123
123
|
|
data/lib/gimchi/korean.rb
CHANGED
@@ -2,285 +2,299 @@
|
|
2
2
|
|
3
3
|
module Gimchi
|
4
4
|
class Korean
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
5
|
+
DEFAULT_CONFIG_FILE_PATH =
|
6
|
+
File.dirname(__FILE__) + '/../../config/default.yml'
|
7
|
+
|
8
|
+
# Returns the YAML configuration used by this Korean instance.
|
9
|
+
# @return [Hash]
|
10
|
+
attr_reader :config
|
11
|
+
|
12
|
+
# Builds a Gimchi::Korean from a YAML configuration file.
#
# The keys of the three "post substitution" tables are given as strings in
# the file and are compiled into Regexp objects here. The top-level config
# Hash is then frozen and a Pronouncer bound to this instance is created.
#
# @param [String] config_file Path of the YAML file; you can override many
#   parts of the implementation by customizing the config file
def initialize config_file = DEFAULT_CONFIG_FILE_PATH
  require 'yaml'
  @config = YAML.load(File.read(config_file))

  substitution_tables = [
    @config['romanization']['post substitution'],
    @config['number']['post substitution'],
    @config['number']['alt notation']['post substitution']
  ]
  substitution_tables.each do |table|
    # Iterate over a snapshot of the keys because the table is modified
    # while walking it.
    table.keys.each do |pattern|
      compiled = Regexp.compile(pattern)
      table[compiled] = table.delete(pattern)
    end
  end
  @config.freeze

  @pronouncer = Korean::Pronouncer.send(:new, self)
end
|
31
|
+
|
32
|
+
# Chosung list, read from the 'structure' section of the configuration.
#
# @return [Array] Array of chosung strings
def chosungs
  structure = config['structure']
  structure['chosung']
end
|
38
|
+
|
39
|
+
# Jungsung list, read from the 'structure' section of the configuration.
#
# @return [Array] Array of jungsung strings
def jungsungs
  structure = config['structure']
  structure['jungsung']
end
|
44
|
+
|
45
|
+
# Jongsung list, read from the 'structure' section of the configuration.
#
# @return [Array] Array of jongsung strings
def jongsungs
  structure = config['structure']
  structure['jongsung']
end
|
50
|
+
|
51
|
+
# Tells whether the given character is Korean: either a complete character
# or one of the configured chosung / jungsung / jongsung components.
#
# @param [String] ch A string of size 1
# @return [Boolean]
# @raise [ArgumentError] if ch is longer than one character
def korean_char? ch
  raise ArgumentError.new('Lengthy input') if ch.length > 1

  return true if complete_korean_char?(ch)

  components = chosungs + jungsungs + jongsungs
  components.include?(ch)
end
|
59
|
+
|
60
|
+
# Checks if the given character is a "complete" korean character.
# "Complete" Korean character must have chosung and jungsung, with optional jongsung.
#
# An empty string is not a complete character.
#
# @param [String] ch A string of size 1
# @return [Boolean]
# @raise [ArgumentError] if ch is longer than one character
def complete_korean_char? ch
  raise ArgumentError.new('Lengthy input') if ch.length > 1

  # Range of Korean characters in Unicode 2.0: AC00(가) ~ D7A3(힣)
  # unpack yields no code point for "", which must not count as a match.
  code = ch.unpack('U').first
  !code.nil? && code >= 0xAC00 && code <= 0xD7A3
end
|
69
|
+
|
70
|
+
# Walks the string character by character, wrapping every Korean character
# in a Korean::Char and passing every other character through untouched.
#
# @param [String] str Input string.
# @return [Array] Mixed array of Korean::Char instances and Strings of length 1 (for non-korean characters)
def dissect str
  pieces = []
  str.each_char do |ch|
    pieces << (korean_char?(ch) ? Korean::Char.new(self, ch) : ch)
  end
  pieces
end
|
78
|
+
|
79
|
+
# Reads numeric expressions in the Korean way.
#
# Every numeric literal found in the input (optional sign, digits with
# optional commas, optional fraction and exponent) is replaced, together
# with the optional whitespace and single character that follow it, by the
# result of #read_number_sub.
#
# @param [String, Number] str Numeric type or String containing numeric expressions
# @return [String] Output string
def read_number str
  # $1 is the numeric literal, $5 the trailing "whitespace + one character"
  # (nil when the number ends the string).
  str.to_s.gsub(/(([+-]\s*)?[0-9,]*,*[0-9]+(\.[0-9]+(e[+-][0-9]+)?)?)(\s*.)?/) {
    read_number_sub($1, $5)
  }
end
|
89
|
+
|
90
|
+
# Returns the pronunciation of the given string containing Korean characters.
# Takes an optional options hash; unspecified options take the defaults
# listed below.
#
# @param [String] str Input string
# @param [Boolean] options[:pronounce_each_char] Each character of the string is pronounced respectively. (default: false)
# @param [Boolean] options[:slur] Strings separated by whitespaces are processed again as if they were contiguous. (default: false)
# @param [Boolean] options[:number] Numeric parts of the string are also pronounced in Korean. (default: true)
# @param [Array] options[:except] Allows you to skip certain transformations. (default: [])
# @param [Boolean] options[:debug] When true, the transforms reported by the pronouncer are returned as well. (default: false)
# @return [String] Output string
# @return [Array] Output string and transforms, when options[:debug] is set
def pronounce str, options = {}
  defaults = {
    :pronounce_each_char => false,
    :slur                => false,
    :number              => true,
    :except              => [],
    :debug               => false
  }
  options = defaults.merge(options)

  str = read_number(str) if options[:number]

  result, transforms = @pronouncer.send(:pronounce!, str, options)

  options[:debug] ? [result, transforms] : result
end
|
118
|
+
|
119
|
+
# Returns the romanization (alphabetical notation) of the given Korean string.
# http://en.wikipedia.org/wiki/Korean_romanization
#
# The string is first passed through #pronounce, then every run of Korean
# characters is dissected and mapped component by component through the
# 'romanization' tables of the configuration. Non-Korean characters are
# copied to the output unchanged.
#
# @param [String] str Input Korean string
# @param [Boolean] options[:as_pronounced] If true, #pronounce is internally called before romanize
# @param [Boolean] options[:number] Whether to read numeric expressions in the string
# @param [Boolean] options[:slur] Same as :slur in #pronounce
# @return [String] Output string in Roman Alphabet
# @see Korean#pronounce
def romanize str, options = {}
  options = {
    :as_pronounced => true,
    :number        => true,
    :slur          => false
  }.merge options

  rdata     = config['romanization']
  post_subs = rdata["post substitution"]
  # Indexed like Char#to_a: 0 = chosung, 1 = jungsung, 2 = jongsung.
  rdata     = [rdata["chosung"], rdata["jungsung"], rdata["jongsung"]]

  # rule_5_3 is skipped because of Clause 1, Addendum 1 of the romanization
  # rules: 'ㅢ' is written 'ui' even when it is pronounced as 'ㅣ'.
  str = pronounce str,
    :pronounce_each_char => !options[:as_pronounced],
    :number              => options[:number],
    :slur                => options[:slur],
    :except              => %w[rule_5_3]
  dash = rdata[0]["ㅇ"]
  romanization = ""

  # Appends the romanization of one run of Korean characters to the output
  # built so far and returns the whole output with the post substitutions
  # applied.
  romanize_chunk = lambda do | chunk |
    dissect(chunk).each do | kc |
      kc.to_a.each_with_index do | comp, idx |
        next if comp.nil?
        comp = rdata[idx][comp] || comp
        # Drop the leading separator of a word-initial 'ㅇ'.
        comp = comp[1..-1] if comp[0, 1] == dash &&
          (romanization.empty? || romanization[-1, 1] =~ /\s/)
        romanization += comp
      end
    end

    post_subs.keys.inject(romanization) { | output, pattern |
      output.gsub(pattern, post_subs[pattern])
    }
  end

  k_chunk = ""
  str.each_char do | c |
    if korean_char? c
      k_chunk += c
    else
      # A non-Korean character closes the pending Korean run.
      unless k_chunk.empty?
        romanization = romanize_chunk.call k_chunk
        k_chunk = ""
      end
      romanization += c
    end
  end
  romanization = romanize_chunk.call k_chunk unless k_chunk.empty?
  romanization
end
|
169
179
|
|
170
180
|
private
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
|
206
|
-
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
|
221
|
-
|
222
|
-
|
223
|
-
|
224
|
-
|
225
|
-
|
226
|
-
|
227
|
-
|
228
|
-
|
229
|
-
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
|
234
|
-
|
235
|
-
|
236
|
-
|
237
|
-
|
238
|
-
|
239
|
-
|
240
|
-
|
241
|
-
|
242
|
-
|
243
|
-
|
244
|
-
|
245
|
-
|
246
|
-
|
247
|
-
|
248
|
-
|
249
|
-
|
250
|
-
|
251
|
-
|
252
|
-
|
253
|
-
|
254
|
-
|
255
|
-
|
256
|
-
|
257
|
-
|
258
|
-
|
259
|
-
|
260
|
-
|
261
|
-
|
262
|
-
|
263
|
-
|
264
|
-
|
265
|
-
|
266
|
-
|
267
|
-
|
268
|
-
|
269
|
-
|
270
|
-
|
271
|
-
|
272
|
-
|
273
|
-
|
274
|
-
|
275
|
-
|
276
|
-
|
277
|
-
|
278
|
-
|
279
|
-
|
280
|
-
|
281
|
-
|
282
|
-
|
283
|
-
|
181
|
+
# Converts one numeric literal matched by #read_number into Korean words.
#
# @param [String] num Numeric literal; may carry a sign, commas, a
#   fractional part and an exponent
# @param [String, nil] next_char Text matched right after the number
#   (optional whitespace plus one character); it selects the alternative
#   notation when it is a configured suffix and is appended to the result
# @return [String] Korean reading of num followed by next_char
def read_number_sub num, next_char
  nconfig   = config['number']
  next_char = next_char.to_s

  # A bare zero produces no tokens below, so it is answered directly. The
  # trailing text must be kept: the caller replaces the number AND that text
  # with this return value.
  return nconfig['digits'].first + next_char if num == '0'

  num      = num.gsub(',', '')
  is_float = num.match(/[\.e]/) != nil

  # Alternative notation for integers with proper suffix
  alt = false
  if is_float == false &&
      nconfig['alt notation']['when suffix'].keys.include?(next_char.strip)
    max = nconfig['alt notation']['when suffix'][next_char.strip]['max']
    alt = true if max.nil? || num.to_i <= max
  end

  # Sign
  sign = []
  if num =~ /^-/
    num = num.sub(/^-\s*/, '')
    sign << nconfig['negative']
  elsif num =~ /^\+/
    num = num.sub(/^\+\s*/, '')
    sign << nconfig['positive']
  end

  if is_float
    below = nconfig['decimal point']
    below = nconfig['digits'][0] + below if num.to_f < 1

    md = num.match(/(.*)e(.*)/)
    if md
      # Expand the exponent by moving the decimal point inside the
      # mantissa digits.
      dp  = md[1].index('.')
      num = md[1].tr '.', ''
      exp = md[2].to_i

      dp += exp
      if dp >= num.length
        # The point lands at or past the last digit: the value is an
        # integer and there is nothing to read after the point.
        num = num.ljust(dp, '0')
        num = num.sub(/^0+([1-9])/, "\\1")

        below = ""
      elsif dp < 0
        num = '0.' + '0' * (-dp) + num
      else
        num[dp, 1] = '.' + num[dp, 1]
      end
    end
    # Digits after the point are read one by one.
    num.sub(/.*\./, '').each_char do | char |
      below += nconfig['digits'][char.to_i]
    end if num.include? '.'
    num = num.sub(/\..*/, '')
  else
    below = ""
  end

  # The integer part is read in groups of four digits, least significant
  # group first; each non-zero group gets the unit for its position.
  tokens   = []
  unit_idx = -1
  num      = num.to_i
  while num > 0
    v = num % 10000

    unit_idx += 1
    if v > 0
      str = ""
      if alt == false || unit_idx >= 1
        # Cannot use hash as they're unordered in 1.8
        [[1000, '천'],
         [100,  '백'],
         [10,   '십']].each do | u, sub_unit |
          # A leading "one" is not read aloud before 천 / 백 / 십.
          str += (nconfig['digits'][v/u] if v/u != 1).to_s + sub_unit + ' ' if v / u > 0
          v %= u
        end
        str += nconfig['digits'][v] if v > 0
      else
        # Alternative notation; only reached for the lowest group.
        tenfolds      = nconfig['alt notation']['tenfolds']
        digits        = nconfig['alt notation']['digits']
        alt_post_subs = nconfig['alt notation']['post substitution']

        # Likewise.
        [[1000, '천'],
         [100,  '백']].each do | u, sub_unit |
          str += (nconfig['digits'][v/u] if v/u != 1).to_s + sub_unit + ' ' if v / u > 0
          v %= u
        end

        str += tenfolds[(v / 10) - 1] if v / 10 > 0
        v %= 10
        str += digits[v] if v > 0

        alt_post_subs.each do | pattern, replacement |
          str.gsub!(pattern, replacement)
        end
      end
      tokens << str.sub(/ $/, '') + nconfig['units'][unit_idx]
    end
    num /= 10000
  end

  tokens += sign unless sign.empty?
  ret = tokens.reverse.join(' ') + below + next_char
  nconfig['post substitution'].each do | pattern, replacement |
    ret.gsub!(pattern, replacement)
  end
  ret
end
|
284
298
|
end#Korean
|
285
299
|
end#Gimchi
|
286
300
|
|