gimchi 0.1.9 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.document +5 -0
- data/.gitignore +17 -0
- data/CHANGELOG.rdoc +42 -0
- data/Gemfile +2 -0
- data/LICENSE.txt +3 -1
- data/{README.ko.markdown → README.ko.md} +68 -66
- data/README.md +162 -0
- data/Rakefile +7 -0
- data/config/default.yml +162 -162
- data/crawler/crawler.rb +49 -0
- data/gimchi.gemspec +21 -0
- data/lib/gimchi.rb +374 -4
- data/lib/gimchi/char.rb +26 -38
- data/lib/gimchi/patch_1.8.rb +9 -9
- data/lib/gimchi/pronouncer.rb +26 -27
- data/test/helper.rb +1 -0
- data/test/test_gimchi.rb +114 -86
- metadata +23 -51
- data/README.markdown +0 -155
- data/lib/gimchi/korean.rb +0 -323
data/config/default.yml
CHANGED
@@ -1,174 +1,174 @@
|
|
1
1
|
---
|
2
2
|
structure:
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
3
|
+
chosung: [ㄱ, ㄲ, ㄴ, ㄷ, ㄸ, ㄹ, ㅁ, ㅂ, ㅃ, ㅅ, ㅆ, ㅇ, ㅈ, ㅉ, ㅊ, ㅋ, ㅌ, ㅍ, ㅎ]
|
4
|
+
jungsung: [ㅏ, ㅐ, ㅑ, ㅒ, ㅓ, ㅔ, ㅕ, ㅖ, ㅗ, ㅘ, ㅙ, ㅚ, ㅛ, ㅜ, ㅝ, ㅞ, ㅟ, ㅠ, ㅡ, ㅢ, ㅣ]
|
5
|
+
jongsung: [ㄱ, ㄲ, ㄳ, ㄴ, ㄵ, ㄶ, ㄷ, ㄹ, ㄺ, ㄻ, ㄼ, ㄽ, ㄾ, ㄿ, ㅀ, ㅁ, ㅂ, ㅄ, ㅅ,
|
6
|
+
ㅆ, ㅇ, ㅈ, ㅊ, ㅋ, ㅌ, ㅍ, ㅎ]
|
7
7
|
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
8
|
+
fortis map:
|
9
|
+
ㄱ: ㄲ
|
10
|
+
ㄷ: ㄸ
|
11
|
+
ㅂ: ㅃ
|
12
|
+
ㅅ: ㅆ
|
13
|
+
ㅈ: ㅉ
|
14
14
|
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
15
|
+
double consonant map:
|
16
|
+
ㄳ: [ㄱ, ㅅ]
|
17
|
+
ㄵ: [ㄴ, ㅈ]
|
18
|
+
ㄶ: [ㄴ, ㅎ]
|
19
|
+
ㄺ: [ㄹ, ㄱ]
|
20
|
+
ㄻ: [ㄹ, ㅁ]
|
21
|
+
ㄼ: [ㄹ, ㅂ]
|
22
|
+
ㄽ: [ㄹ, ㅅ]
|
23
|
+
ㄾ: [ㄹ, ㅌ]
|
24
|
+
ㄿ: [ㄹ, ㅍ]
|
25
|
+
ㅀ: [ㄹ, ㅎ]
|
26
|
+
ㅄ: [ㅂ, ㅅ]
|
27
27
|
|
28
28
|
pronouncer:
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
29
|
+
jongsung sound:
|
30
|
+
ㄱ: ㄱ
|
31
|
+
ㄲ: ㄱ
|
32
|
+
ㄳ: ㄱ
|
33
|
+
ㄴ: ㄴ
|
34
|
+
ㄵ: ㄴ
|
35
|
+
ㄶ: ㄴ
|
36
|
+
ㄷ: ㄷ
|
37
|
+
ㄹ: ㄹ
|
38
|
+
ㄺ: ㄱ
|
39
|
+
ㄻ: ㅁ
|
40
|
+
ㄼ: ㄹ
|
41
|
+
ㄽ: ㄹ
|
42
|
+
ㄾ: ㅌ
|
43
|
+
ㄿ: ㅂ
|
44
|
+
ㅀ: ㄹ
|
45
|
+
ㅁ: ㅁ
|
46
|
+
ㅂ: ㅂ
|
47
|
+
ㅄ: ㅂ
|
48
|
+
ㅅ: ㄷ
|
49
|
+
ㅆ: ㄷ
|
50
|
+
ㅇ: ㅇ
|
51
|
+
ㅈ: ㄷ
|
52
|
+
ㅊ: ㄷ
|
53
|
+
ㅋ: ㄱ
|
54
|
+
ㅌ: ㄷ
|
55
|
+
ㅍ: ㅂ
|
56
|
+
ㅎ:
|
57
|
+
transformation:
|
58
|
+
# changing the order affects the quality of the transformation
|
59
|
+
sequence for 1:
|
60
|
+
- rule_5_1
|
61
|
+
- rule_5_3
|
62
62
|
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
63
|
+
sequence for 2:
|
64
|
+
- rule_16
|
65
|
+
- rule_17
|
66
|
+
- rule_18
|
67
|
+
- rule_19
|
68
|
+
- rule_5_1
|
69
|
+
- rule_5_3
|
70
|
+
- rule_30
|
71
|
+
- rule_23
|
72
|
+
- rule_24
|
73
|
+
- rule_25
|
74
|
+
- rule_12
|
75
|
+
- rule_20
|
76
|
+
- rule_10
|
77
|
+
- rule_27
|
78
|
+
- rule_9
|
79
|
+
- rule_11
|
80
|
+
- rule_14
|
81
|
+
- rule_13
|
82
|
+
- rule_15
|
83
|
+
blocking rule:
|
84
|
+
rule_16: [rule_30]
|
85
85
|
|
86
86
|
number:
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
87
|
+
positive: 플러스
|
88
|
+
negative: 마이너스
|
89
|
+
decimal point: 점
|
90
|
+
units: ["", 만, 억, 조, 경, 해, 자, 양, 구, 간, 정, 재, 극, 항하사, 아승기, 나유타, 불가사의, 무량대수]
|
91
|
+
digits: [영, 일, 이, 삼, 사, 오, 육, 칠, 팔, 구]
|
92
|
+
post substitution:
|
93
|
+
"^일만": 만
|
94
94
|
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
95
|
+
# 정수형일 때 또다른 표현법 (나이, 시간)
|
96
|
+
alt notation:
|
97
|
+
when suffix:
|
98
|
+
개:
|
99
|
+
max:
|
100
|
+
명:
|
101
|
+
max:
|
102
|
+
살:
|
103
|
+
max:
|
104
|
+
시:
|
105
|
+
max: 12
|
106
|
+
tenfolds: [열, 스물, 서른, 마흔, 쉰, 예순, 일흔, 여든, 아흔, 백]
|
107
|
+
digits: ["", 한, 두, 세, 네, 다섯, 여섯, 일곱, 여덟, 아홉]
|
108
|
+
post substitution:
|
109
|
+
"스물$": 스무
|
110
110
|
|
111
111
|
romanization:
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
|
112
|
+
chosung:
|
113
|
+
ㄱ: g
|
114
|
+
ㄲ: kk
|
115
|
+
ㅋ: k
|
116
|
+
ㄷ: d
|
117
|
+
ㄸ: tt
|
118
|
+
ㅌ: t
|
119
|
+
ㅂ: b
|
120
|
+
ㅃ: pp
|
121
|
+
ㅍ: p
|
122
|
+
ㅈ: j
|
123
|
+
ㅉ: jj
|
124
|
+
ㅊ: ch
|
125
|
+
ㅅ: s
|
126
|
+
ㅆ: ss
|
127
|
+
ㅎ: h
|
128
|
+
ㄴ: n
|
129
|
+
ㅁ: m
|
130
|
+
ㄹ: r
|
131
|
+
ㅇ: "-"
|
132
|
+
jungsung:
|
133
|
+
ㅏ: a
|
134
|
+
ㅓ: eo
|
135
|
+
ㅗ: o
|
136
|
+
ㅜ: u
|
137
|
+
ㅡ: eu
|
138
|
+
ㅣ: i
|
139
|
+
ㅐ: ae
|
140
|
+
ㅔ: e
|
141
|
+
ㅚ: oe
|
142
|
+
ㅟ: wi
|
143
|
+
ㅑ: ya
|
144
|
+
ㅕ: yeo
|
145
|
+
ㅛ: yo
|
146
|
+
ㅠ: yu
|
147
|
+
ㅒ: yae
|
148
|
+
ㅖ: ye
|
149
|
+
ㅘ: wa
|
150
|
+
ㅙ: wae
|
151
|
+
ㅝ: wo
|
152
|
+
ㅞ: we
|
153
|
+
ㅢ: ui
|
154
|
+
jongsung:
|
155
|
+
ㄱ: k
|
156
|
+
ㄴ: n-
|
157
|
+
ㄷ: t
|
158
|
+
ㄹ: l
|
159
|
+
ㅁ: m
|
160
|
+
ㅂ: p
|
161
|
+
ㅇ: ng
|
162
|
+
post substitution:
|
163
|
+
# 제2항 [붙임 2]‘ㄹ’은 모음 앞에서는 ‘r’로, 자음 앞이나 어말에서는
|
164
|
+
# ‘l’로 적는다. 단, ‘ㄹㄹ’은 ‘ll’로 적는다.
|
165
|
+
lr: ll
|
166
|
+
"-w": w
|
167
|
+
"-y": y
|
168
|
+
kkk: k-kk
|
169
|
+
ttt: t-tt
|
170
|
+
ppp: p-pp
|
171
|
+
"--": "-"
|
172
|
+
"n-([^gaeiou])": "n\\1"
|
173
|
+
"-(\\s)": "\\1"
|
174
|
+
"-$": ""
|
data/crawler/crawler.rb
ADDED
@@ -0,0 +1,49 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# encoding: UTF-8
|
3
|
+
# Junegunn Choi (junegunn.c@gmail.com)
|
4
|
+
# 2011/04/02-
|
5
|
+
|
6
|
+
# A dirty little script to fetch test sets from http://www.korean.go.kr
|
7
|
+
|
8
|
+
require 'open-uri'
|
9
|
+
require 'yaml'
|
10
|
+
|
11
|
+
# Crawl romanization test set
|
12
|
+
rdata = open('http://www.korean.go.kr/09_new/dic/rule/rule_roman_0101.jsp').read.
|
13
|
+
scan(%r{th>(.*?)</td}m).flatten.map { |e| e.split %r{<.*>}m }.
|
14
|
+
select { |e| e.length == 2 }
|
15
|
+
|
16
|
+
File.open(File.dirname(__FILE__) + '/../test/romanization.yml', 'w') do | f |
|
17
|
+
f.puts "---"
|
18
|
+
|
19
|
+
rdata.each do | arr |
|
20
|
+
f.puts "\"#{arr.first}\": \"#{arr.last}\""
|
21
|
+
end
|
22
|
+
end
|
23
|
+
|
24
|
+
exit
|
25
|
+
|
26
|
+
# Crawl pronunciation test set
|
27
|
+
m = {}
|
28
|
+
%w[
|
29
|
+
http://www.korean.go.kr/09_new/dic/rule/rule02_0202.jsp
|
30
|
+
http://www.korean.go.kr/09_new/dic/rule/rule02_0204.jsp
|
31
|
+
http://www.korean.go.kr/09_new/dic/rule/rule02_0205.jsp
|
32
|
+
http://www.korean.go.kr/09_new/dic/rule/rule02_0206.jsp
|
33
|
+
http://www.korean.go.kr/09_new/dic/rule/rule02_0207.jsp
|
34
|
+
].each do | url |
|
35
|
+
open(url).read.scan(/>([^0-9<>);?]+?)\[(.*?)\]</).each do | match |
|
36
|
+
puts match[0, 2].join(' => ')
|
37
|
+
m[match[0]] = match[1]
|
38
|
+
end
|
39
|
+
end
|
40
|
+
|
41
|
+
File.open(File.dirname(__FILE__) + '/../test/pronunciation.yml', 'w') do | f |
|
42
|
+
f.puts "---"
|
43
|
+
m.each do | k, v |
|
44
|
+
k = k.sub(/.*→/, '').gsub(/-/, '')
|
45
|
+
v = v.sub(/.*→/, '').gsub(/[\(:ː\)]/, '').split(%r{[/∼]})
|
46
|
+
f.puts "\"#{k}\": [#{v.join(', ')}]"
|
47
|
+
end
|
48
|
+
end
|
49
|
+
|
data/gimchi.gemspec
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
lib = File.expand_path('../lib', __FILE__)
|
3
|
+
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
|
+
|
5
|
+
Gem::Specification.new do |gem|
|
6
|
+
gem.name = %q{gimchi}
|
7
|
+
gem.version = "0.2.0"
|
8
|
+
gem.authors = ["Junegunn Choi"]
|
9
|
+
gem.email = ["junegunn.c@gmail.com"]
|
10
|
+
gem.description = %q{A Ruby gem for Korean characters}
|
11
|
+
gem.summary = %q{A Ruby gem for Korean characters}
|
12
|
+
gem.homepage = "https://github.com/junegunn/gimchi"
|
13
|
+
|
14
|
+
gem.files = `git ls-files`.split($/).reject { |f| f =~ %r[^viz/] }
|
15
|
+
gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
|
16
|
+
gem.test_files = gem.files.grep(%r{^(test|spec|features)/})
|
17
|
+
gem.require_paths = ["lib"]
|
18
|
+
gem.license = "MIT"
|
19
|
+
|
20
|
+
gem.add_development_dependency 'ansi'
|
21
|
+
end
|
data/lib/gimchi.rb
CHANGED
@@ -2,10 +2,380 @@
|
|
2
2
|
# encoding: UTF-8
|
3
3
|
# Junegunn Choi (junegunn.c@gmail.com)
|
4
4
|
|
5
|
-
require '
|
5
|
+
require 'yaml'
|
6
|
+
require 'set'
|
6
7
|
require 'gimchi/char'
|
7
8
|
require 'gimchi/pronouncer'
|
8
9
|
|
9
|
-
|
10
|
-
|
11
|
-
|
10
|
+
class Gimchi
|
11
|
+
class << self
|
12
|
+
def setup
|
13
|
+
@@default ||= Gimchi.new
|
14
|
+
end
|
15
|
+
|
16
|
+
def Char ch
|
17
|
+
@@default.kchar ch
|
18
|
+
end
|
19
|
+
|
20
|
+
[
|
21
|
+
:decompose,
|
22
|
+
:compose,
|
23
|
+
:korean_char?,
|
24
|
+
:complete_korean_char?,
|
25
|
+
:kchar,
|
26
|
+
:kchar?,
|
27
|
+
:chosung?,
|
28
|
+
:jungsung?,
|
29
|
+
:jongsung?,
|
30
|
+
:read_number,
|
31
|
+
:pronounce,
|
32
|
+
:romanize
|
33
|
+
].each do |sym|
|
34
|
+
define_method(sym) do |*arg, &b|
|
35
|
+
@@default.send sym, *arg, &b
|
36
|
+
end
|
37
|
+
end
|
38
|
+
end
|
39
|
+
|
40
|
+
CONFIG_FILE_PATH = File.expand_path('../../config/default.yml', __FILE__)
|
41
|
+
attr_reader :config, :chosungs, :jungsungs, :jongsungs
|
42
|
+
|
43
|
+
# Initialize Gimchi.
|
44
|
+
def initialize
|
45
|
+
symbolize_keys = lambda do |val|
|
46
|
+
case val
|
47
|
+
when Hash
|
48
|
+
{}.tap do |h|
|
49
|
+
val.each do |k, v|
|
50
|
+
k = k.gsub(' ', '_').to_sym if k =~ /[a-z0-9 ]/
|
51
|
+
h[k] = symbolize_keys.call v
|
52
|
+
end
|
53
|
+
end
|
54
|
+
when Array
|
55
|
+
val.map { |v| symbolize_keys.call v }
|
56
|
+
else
|
57
|
+
val
|
58
|
+
end
|
59
|
+
end
|
60
|
+
@config = symbolize_keys.call YAML.load(File.read CONFIG_FILE_PATH)
|
61
|
+
|
62
|
+
[
|
63
|
+
@config[:romanization][:post_substitution],
|
64
|
+
@config[:number][:post_substitution],
|
65
|
+
@config[:number][:alt_notation][:post_substitution]
|
66
|
+
].each do |r|
|
67
|
+
r.keys.each do |k|
|
68
|
+
r[Regexp.compile k.to_s] = r.delete k
|
69
|
+
end
|
70
|
+
end
|
71
|
+
@config.freeze
|
72
|
+
|
73
|
+
@pronouncer = Gimchi::Pronouncer.send :new, self
|
74
|
+
|
75
|
+
@chosungs = config[:structure][:chosung]
|
76
|
+
@jungsungs = config[:structure][:jungsung]
|
77
|
+
@jongsungs = config[:structure][:jongsung]
|
78
|
+
@chosung_set = Set[*@chosungs]
|
79
|
+
@jungsung_set = Set[*@jungsungs]
|
80
|
+
@jongsung_set = Set[*@jongsungs]
|
81
|
+
@all = @chosung_set + @jungsung_set + @jongsung_set
|
82
|
+
end
|
83
|
+
|
84
|
+
# Decompose a Korean character into 3 components
|
85
|
+
# @param [String] ch Korean character
|
86
|
+
# @return [Array]
|
87
|
+
def decompose ch
|
88
|
+
kchar(ch).to_a
|
89
|
+
end
|
90
|
+
|
91
|
+
# Compose 3 elements into a Korean character String
|
92
|
+
# @param [String] chosung
|
93
|
+
# @param [String] jungsung
|
94
|
+
# @param [String] jongsung
|
95
|
+
# @return [String]
|
96
|
+
# Builds one Hangul syllable from its components.
#
# With both chosung and jungsung given, the precomposed syllable is computed
# from the component indexes (jongsung is optional). With only one of the two
# given, that component is returned unchanged; with neither, "" is returned.
def compose chosung, jungsung = nil, jongsung = nil
  return "" if chosung.nil? && jungsung.nil?
  # Only one of the two mandatory parts is present: hand it back as is.
  return chosung || jungsung unless chosung && jungsung

  # Components not found in the tables fall back to index 0.
  n1 = chosungs.index(chosung) || 0
  n2 = jungsungs.index(jungsung) || 0
  # Slot 0 of the final-consonant axis means "no jongsung", so table entries
  # are shifted by one; nil or unknown jongsung maps to slot 0.
  n3 = (jongsungs.index(jongsung) || -1) + 1

  # Syllables start at U+AC00, laid out as chosung * (21 * 28) + jungsung * 28 + jongsung.
  [0xAC00 + n1 * (21 * 28) + n2 * 28 + n3].pack('U')
end
|
109
|
+
|
110
|
+
# @param [String] ch
|
111
|
+
# @return [Boolean]
|
112
|
+
def chosung? ch
|
113
|
+
@chosung_set.include? ch
|
114
|
+
end
|
115
|
+
|
116
|
+
# @param [String] ch
|
117
|
+
# @return [Boolean]
|
118
|
+
def jungsung? ch
|
119
|
+
@jungsung_set.include? ch
|
120
|
+
end
|
121
|
+
|
122
|
+
# @param [String] ch
|
123
|
+
# @return [Boolean]
|
124
|
+
def jongsung? ch
|
125
|
+
@jongsung_set.include? ch
|
126
|
+
end
|
127
|
+
|
128
|
+
# Checks if the given character is a korean character.
|
129
|
+
# @param [String] ch A string of size 1
|
130
|
+
def korean_char? ch
|
131
|
+
raise ArgumentError.new('Lengthy input') if str_length(ch) > 1
|
132
|
+
|
133
|
+
complete_korean_char?(ch) || @all.include?(ch)
|
134
|
+
end
|
135
|
+
alias kchar? korean_char?
|
136
|
+
|
137
|
+
# Checks if the given character is a "complete" korean character.
|
138
|
+
# "Complete" Korean character must have chosung and jungsung, with optional jongsung.
|
139
|
+
# @param [String] ch A string of size 1
|
140
|
+
def complete_korean_char? ch
|
141
|
+
raise ArgumentError.new('Lengthy input') if str_length(ch) > 1
|
142
|
+
|
143
|
+
# Range of Korean characters in Unicode 2.0: AC00(가) ~ D7A3(힣)
|
144
|
+
ch.unpack('U').all? { | c | c >= 0xAC00 && c <= 0xD7A3 }
|
145
|
+
end
|
146
|
+
|
147
|
+
# Returns a Gimchi::Char object for the given Korean character.
|
148
|
+
# @param [String] ch Korean character in String
|
149
|
+
# @return [Gimchi::Char] Gimchi::Char instance
|
150
|
+
def kchar ch
|
151
|
+
Gimchi::Char.new(self, ch)
|
152
|
+
end
|
153
|
+
|
154
|
+
# Reads numeric expressions in Korean way.
|
155
|
+
# @param [String, Number] str Numeric type or String containing numeric expressions
|
156
|
+
# @return [String] Output string
|
157
|
+
def read_number str
|
158
|
+
str.to_s.gsub(/(([+-]\s*)?[0-9,]*,*[0-9]+(\.[0-9]+(e[+-][0-9]+)?)?)(\s*.)?/) {
|
159
|
+
read_number_sub($1, $5)
|
160
|
+
}
|
161
|
+
end
|
162
|
+
|
163
|
+
# Returns the pronunciation of the given string containing Korean characters.
|
164
|
+
# Takes optional options hash.
|
165
|
+
#
|
166
|
+
# @param [String] str Input string
|
167
|
+
# @param [Hash] options Options
|
168
|
+
# @option options [Boolean] each_char Each character of the string is pronounced respectively.
|
169
|
+
# @option options [Boolean] slur Strings separated by whitespaces are processed again as if they were contiguous.
|
170
|
+
# @option options [Boolean] number Numeric parts of the string are also pronounced in Korean.
|
171
|
+
# @option options [Array] except Allows you to skip certain transformations.
|
172
|
+
# @return [String] Output string
|
173
|
+
def pronounce str, options = {}
|
174
|
+
options = {
|
175
|
+
:each_char => false,
|
176
|
+
:slur => false,
|
177
|
+
:number => true,
|
178
|
+
:except => [],
|
179
|
+
:debug => false
|
180
|
+
}.merge options
|
181
|
+
|
182
|
+
str = read_number(str) if options[:number]
|
183
|
+
|
184
|
+
result, transforms = @pronouncer.send :pronounce!, str, options
|
185
|
+
|
186
|
+
if options[:debug]
|
187
|
+
return result, transforms
|
188
|
+
else
|
189
|
+
return result
|
190
|
+
end
|
191
|
+
end
|
192
|
+
|
193
|
+
# Returns the romanization (alphabetical notation) of the given Korean string.
|
194
|
+
# http://en.wikipedia.org/wiki/Korean_romanization
|
195
|
+
# @param [String] str Input Korean string
|
196
|
+
# @param [Hash] options Options
|
197
|
+
# @option options [Boolean] as_pronounced If true, #pronounce is internally called before romanize
|
198
|
+
# @option options [Boolean] number Whether to read numeric expressions in the string
|
199
|
+
# @option options [Boolean] slur Same as :slur in #pronounce
|
200
|
+
# @return [String] Output string in Roman Alphabet
|
201
|
+
# @see Gimchi#pronounce
|
202
|
+
def romanize str, options = {}
|
203
|
+
options = {
|
204
|
+
:as_pronounced => true,
|
205
|
+
:number => true,
|
206
|
+
:slur => false
|
207
|
+
}.merge options
|
208
|
+
|
209
|
+
rdata = config[:romanization]
|
210
|
+
post_subs = rdata[:post_substitution]
|
211
|
+
rdata = [rdata[:chosung], rdata[:jungsung], rdata[:jongsung]]
|
212
|
+
|
213
|
+
str = pronounce str,
|
214
|
+
:each_char => !options[:as_pronounced],
|
215
|
+
:number => options[:number],
|
216
|
+
:slur => options[:slur],
|
217
|
+
# 제1항 [붙임 1] ‘ㅢ’는 ‘ㅣ’로 소리 나더라도 ‘ui’로 적는다.
|
218
|
+
:except => %w[rule_5_3]
|
219
|
+
dash = rdata[0]["ㅇ"]
|
220
|
+
romanization = ""
|
221
|
+
|
222
|
+
romanize_chunk = lambda do |chunk|
|
223
|
+
chunk.each_char.map { |ch| kchar(ch) rescue ch }.each do |kc|
|
224
|
+
kc.to_a.each_with_index do |comp, idx|
|
225
|
+
next if comp.nil?
|
226
|
+
comp = rdata[idx][comp] || comp
|
227
|
+
comp = comp[1..-1] if comp[0, 1] == dash &&
|
228
|
+
(romanization.empty? || romanization[-1, 1] =~ /\s/)
|
229
|
+
romanization += comp
|
230
|
+
end
|
231
|
+
end
|
232
|
+
|
233
|
+
return post_subs.keys.inject(romanization) { | output, pattern |
|
234
|
+
output.gsub(pattern, post_subs[pattern])
|
235
|
+
}
|
236
|
+
end
|
237
|
+
|
238
|
+
k_chunk = ""
|
239
|
+
str.each_char do | c |
|
240
|
+
if korean_char? c
|
241
|
+
k_chunk += c
|
242
|
+
else
|
243
|
+
unless k_chunk.empty?
|
244
|
+
romanization = romanize_chunk.call k_chunk
|
245
|
+
k_chunk = ""
|
246
|
+
end
|
247
|
+
romanization += c
|
248
|
+
end
|
249
|
+
end
|
250
|
+
romanization = romanize_chunk.call k_chunk unless k_chunk.empty?
|
251
|
+
romanization
|
252
|
+
end
|
253
|
+
|
254
|
+
private
|
255
|
+
def str_length str
|
256
|
+
str.length
|
257
|
+
end
|
258
|
+
|
259
|
+
# Reads one numeric token the Korean way.
#
# @param [String] num Numeric token as captured by #read_number: optional
#   sign, digits with optional commas, optional fraction and exponent
# @param [String, nil] next_char Text captured right after the number
#   (optional whitespace plus one character). It is appended to the result
#   because #read_number substitutes the whole match, suffix included.
# @return [String] Korean reading followed by next_char
# @raise [RangeError] if the number needs a unit beyond config[:number][:units]
def read_number_sub num, next_char
  nconfig   = config[:number]
  next_char = next_char.to_s

  # A lone zero is simply the zero digit. The trailing character must be
  # echoed here too, otherwise it is swallowed by the caller's substitution.
  return nconfig[:digits].first + next_char if num == '0'

  num      = num.gsub(',', '')
  is_float = !num.match(/[\.e]/).nil?

  # Alternative notation for integers followed by a configured suffix,
  # optionally limited by that suffix's :max value.
  alt      = false
  suffixes = nconfig[:alt_notation][:when_suffix]
  if !is_float && suffixes.key?(next_char.strip)
    max = suffixes[next_char.strip][:max]
    alt = max.nil? || num.to_i <= max
  end

  # Sign
  sign = []
  if num =~ /^-/
    num = num.sub(/^-\s*/, '')
    sign << nconfig[:negative]
  elsif num =~ /^\+/
    num = num.sub(/^\+\s*/, '')
    sign << nconfig[:positive]
  end

  # Fractional part (digits after the decimal point are read one by one)
  below = ""
  if is_float
    below = nconfig[:decimal_point]
    below = nconfig[:digits][0] + below if num.to_f < 1

    md = num.match(/(.*)e(.*)/)
    if md
      # Exponent notation: shift the decimal point within the mantissa digits.
      dp  = md[1].index('.')
      num = md[1].tr '.', ''
      dp += md[2].to_i

      if dp >= num.length
        # The point lands at or past the last digit: the value is an integer,
        # so there is no fractional reading (>= avoids a dangling decimal point).
        num = num.ljust(dp, '0')
        num = num.sub(/^0+([1-9])/, "\\1")
        below = ""
      elsif dp < 0
        num = '0.' + '0' * (-dp) + num
      else
        num[dp, 1] = '.' + num[dp, 1]
      end
    end

    if num.include? '.'
      num.sub(/.*\./, '').each_char do |char|
        below += nconfig[:digits][char.to_i]
      end
    end
    num = num.sub(/\..*/, '')
  end

  # Integer part, processed in groups of four digits (10000-based units)
  tenfolds      = nconfig[:alt_notation][:tenfolds]
  alt_digits    = nconfig[:alt_notation][:digits]
  alt_post_subs = nconfig[:alt_notation][:post_substitution]

  tokens   = []
  unit_idx = -1
  num      = num.to_i
  while num > 0
    v = num % 10000
    unit_idx += 1

    if v > 0
      raise RangeError, "number too large" unless nconfig[:units][unit_idx]

      # Alternative notation only applies to the lowest four-digit group.
      alt_group = alt && unit_idx == 0

      # Cannot use hash as they're unordered in 1.8
      sub_units = [[1000, '천'], [100, '백']]
      sub_units << [10, '십'] unless alt_group

      str = ""
      sub_units.each do |u, sub_unit|
        if v / u > 0
          # A leading "one" is left unspoken before a sub-unit.
          str += (v / u == 1 ? "" : nconfig[:digits][v / u]) + sub_unit + ' '
        end
        v %= u
      end

      if alt_group
        str += tenfolds[(v / 10) - 1] if v / 10 > 0
        v %= 10
        str += alt_digits[v] if v > 0
        alt_post_subs.each do |p, s|
          str = str.gsub(p, s)
        end
      else
        str += nconfig[:digits][v] if v > 0
      end

      tokens << str.sub(/ $/, '') + nconfig[:units][unit_idx]
    end
    num /= 10000
  end

  # An all-zero integer part without a fraction (e.g. "00", "-0") would
  # otherwise produce no reading at all.
  tokens << nconfig[:digits].first if tokens.empty? && below.empty?

  tokens += sign
  ret = tokens.reverse.join(' ') + below + next_char
  nconfig[:post_substitution].each do |p, s|
    ret = ret.gsub(p, s)
  end
  ret
end
|
377
|
+
end#Gimchi
|
378
|
+
|
379
|
+
require 'gimchi/patch_1.8'
|
380
|
+
|
381
|
+
Gimchi.setup
|