gimchi 0.1.9 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.document +5 -0
- data/.gitignore +17 -0
- data/CHANGELOG.rdoc +42 -0
- data/Gemfile +2 -0
- data/LICENSE.txt +3 -1
- data/{README.ko.markdown → README.ko.md} +68 -66
- data/README.md +162 -0
- data/Rakefile +7 -0
- data/config/default.yml +162 -162
- data/crawler/crawler.rb +49 -0
- data/gimchi.gemspec +21 -0
- data/lib/gimchi.rb +374 -4
- data/lib/gimchi/char.rb +26 -38
- data/lib/gimchi/patch_1.8.rb +9 -9
- data/lib/gimchi/pronouncer.rb +26 -27
- data/test/helper.rb +1 -0
- data/test/test_gimchi.rb +114 -86
- metadata +23 -51
- data/README.markdown +0 -155
- data/lib/gimchi/korean.rb +0 -323
data/config/default.yml
CHANGED
@@ -1,174 +1,174 @@
|
|
1
1
|
---
|
2
2
|
structure:
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
3
|
+
chosung: [ㄱ, ㄲ, ㄴ, ㄷ, ㄸ, ㄹ, ㅁ, ㅂ, ㅃ, ㅅ, ㅆ, ㅇ, ㅈ, ㅉ, ㅊ, ㅋ, ㅌ, ㅍ, ㅎ]
|
4
|
+
jungsung: [ㅏ, ㅐ, ㅑ, ㅒ, ㅓ, ㅔ, ㅕ, ㅖ, ㅗ, ㅘ, ㅙ, ㅚ, ㅛ, ㅜ, ㅝ, ㅞ, ㅟ, ㅠ, ㅡ, ㅢ, ㅣ]
|
5
|
+
jongsung: [ㄱ, ㄲ, ㄳ, ㄴ, ㄵ, ㄶ, ㄷ, ㄹ, ㄺ, ㄻ, ㄼ, ㄽ, ㄾ, ㄿ, ㅀ, ㅁ, ㅂ, ㅄ, ㅅ,
|
6
|
+
ㅆ, ㅇ, ㅈ, ㅊ, ㅋ, ㅌ, ㅍ, ㅎ]
|
7
7
|
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
8
|
+
fortis map:
|
9
|
+
ㄱ: ㄲ
|
10
|
+
ㄷ: ㄸ
|
11
|
+
ㅂ: ㅃ
|
12
|
+
ㅅ: ㅆ
|
13
|
+
ㅈ: ㅉ
|
14
14
|
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
15
|
+
double consonant map:
|
16
|
+
ㄳ: [ㄱ, ㅅ]
|
17
|
+
ㄵ: [ㄴ, ㅈ]
|
18
|
+
ㄶ: [ㄴ, ㅎ]
|
19
|
+
ㄺ: [ㄹ, ㄱ]
|
20
|
+
ㄻ: [ㄹ, ㅁ]
|
21
|
+
ㄼ: [ㄹ, ㅂ]
|
22
|
+
ㄽ: [ㄹ, ㅅ]
|
23
|
+
ㄾ: [ㄹ, ㅌ]
|
24
|
+
ㄿ: [ㄹ, ㅍ]
|
25
|
+
ㅀ: [ㄹ, ㅎ]
|
26
|
+
ㅄ: [ㅂ, ㅅ]
|
27
27
|
|
28
28
|
pronouncer:
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
29
|
+
jongsung sound:
|
30
|
+
ㄱ: ㄱ
|
31
|
+
ㄲ: ㄱ
|
32
|
+
ㄳ: ㄱ
|
33
|
+
ㄴ: ㄴ
|
34
|
+
ㄵ: ㄴ
|
35
|
+
ㄶ: ㄴ
|
36
|
+
ㄷ: ㄷ
|
37
|
+
ㄹ: ㄹ
|
38
|
+
ㄺ: ㄱ
|
39
|
+
ㄻ: ㅁ
|
40
|
+
ㄼ: ㄹ
|
41
|
+
ㄽ: ㄹ
|
42
|
+
ㄾ: ㅌ
|
43
|
+
ㄿ: ㅂ
|
44
|
+
ㅀ: ㄹ
|
45
|
+
ㅁ: ㅁ
|
46
|
+
ㅂ: ㅂ
|
47
|
+
ㅄ: ㅂ
|
48
|
+
ㅅ: ㄷ
|
49
|
+
ㅆ: ㄷ
|
50
|
+
ㅇ: ㅇ
|
51
|
+
ㅈ: ㄷ
|
52
|
+
ㅊ: ㄷ
|
53
|
+
ㅋ: ㄱ
|
54
|
+
ㅌ: ㄷ
|
55
|
+
ㅍ: ㅂ
|
56
|
+
ㅎ:
|
57
|
+
transformation:
|
58
|
+
# changing the order affects the quality of the transformation
|
59
|
+
sequence for 1:
|
60
|
+
- rule_5_1
|
61
|
+
- rule_5_3
|
62
62
|
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
63
|
+
sequence for 2:
|
64
|
+
- rule_16
|
65
|
+
- rule_17
|
66
|
+
- rule_18
|
67
|
+
- rule_19
|
68
|
+
- rule_5_1
|
69
|
+
- rule_5_3
|
70
|
+
- rule_30
|
71
|
+
- rule_23
|
72
|
+
- rule_24
|
73
|
+
- rule_25
|
74
|
+
- rule_12
|
75
|
+
- rule_20
|
76
|
+
- rule_10
|
77
|
+
- rule_27
|
78
|
+
- rule_9
|
79
|
+
- rule_11
|
80
|
+
- rule_14
|
81
|
+
- rule_13
|
82
|
+
- rule_15
|
83
|
+
blocking rule:
|
84
|
+
rule_16: [rule_30]
|
85
85
|
|
86
86
|
number:
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
87
|
+
positive: 플러스
|
88
|
+
negative: 마이너스
|
89
|
+
decimal point: 점
|
90
|
+
units: ["", 만, 억, 조, 경, 해, 자, 양, 구, 간, 정, 재, 극, 항하사, 아승기, 나유타, 불가사의, 무량대수]
|
91
|
+
digits: [영, 일, 이, 삼, 사, 오, 육, 칠, 팔, 구]
|
92
|
+
post substitution:
|
93
|
+
"^일만": 만
|
94
94
|
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
95
|
+
# 정수형일 때 또다른 표현법 (나이, 시간)
|
96
|
+
alt notation:
|
97
|
+
when suffix:
|
98
|
+
개:
|
99
|
+
max:
|
100
|
+
명:
|
101
|
+
max:
|
102
|
+
살:
|
103
|
+
max:
|
104
|
+
시:
|
105
|
+
max: 12
|
106
|
+
tenfolds: [열, 스물, 서른, 마흔, 쉰, 예순, 일흔, 여든, 아흔, 백]
|
107
|
+
digits: ["", 한, 두, 세, 네, 다섯, 여섯, 일곱, 여덟, 아홉]
|
108
|
+
post substitution:
|
109
|
+
"스물$": 스무
|
110
110
|
|
111
111
|
romanization:
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
|
112
|
+
chosung:
|
113
|
+
ㄱ: g
|
114
|
+
ㄲ: kk
|
115
|
+
ㅋ: k
|
116
|
+
ㄷ: d
|
117
|
+
ㄸ: tt
|
118
|
+
ㅌ: t
|
119
|
+
ㅂ: b
|
120
|
+
ㅃ: pp
|
121
|
+
ㅍ: p
|
122
|
+
ㅈ: j
|
123
|
+
ㅉ: jj
|
124
|
+
ㅊ: ch
|
125
|
+
ㅅ: s
|
126
|
+
ㅆ: ss
|
127
|
+
ㅎ: h
|
128
|
+
ㄴ: n
|
129
|
+
ㅁ: m
|
130
|
+
ㄹ: r
|
131
|
+
ㅇ: "-"
|
132
|
+
jungsung:
|
133
|
+
ㅏ: a
|
134
|
+
ㅓ: eo
|
135
|
+
ㅗ: o
|
136
|
+
ㅜ: u
|
137
|
+
ㅡ: eu
|
138
|
+
ㅣ: i
|
139
|
+
ㅐ: ae
|
140
|
+
ㅔ: e
|
141
|
+
ㅚ: oe
|
142
|
+
ㅟ: wi
|
143
|
+
ㅑ: ya
|
144
|
+
ㅕ: yeo
|
145
|
+
ㅛ: yo
|
146
|
+
ㅠ: yu
|
147
|
+
ㅒ: yae
|
148
|
+
ㅖ: ye
|
149
|
+
ㅘ: wa
|
150
|
+
ㅙ: wae
|
151
|
+
ㅝ: wo
|
152
|
+
ㅞ: we
|
153
|
+
ㅢ: ui
|
154
|
+
jongsung:
|
155
|
+
ㄱ: k
|
156
|
+
ㄴ: n-
|
157
|
+
ㄷ: t
|
158
|
+
ㄹ: l
|
159
|
+
ㅁ: m
|
160
|
+
ㅂ: p
|
161
|
+
ㅇ: ng
|
162
|
+
post substitution:
|
163
|
+
# 제2항 [붙임 2]‘ㄹ’은 모음 앞에서는 ‘r’로, 자음 앞이나 어말에서는
|
164
|
+
# ‘l’로 적는다. 단, ‘ㄹㄹ’은 ‘ll’로 적는다.
|
165
|
+
lr: ll
|
166
|
+
"-w": w
|
167
|
+
"-y": y
|
168
|
+
kkk: k-kk
|
169
|
+
ttt: t-tt
|
170
|
+
ppp: p-pp
|
171
|
+
"--": "-"
|
172
|
+
"n-([^gaeiou])": "n\\1"
|
173
|
+
"-(\\s)": "\\1"
|
174
|
+
"-$": ""
|
data/crawler/crawler.rb
ADDED
@@ -0,0 +1,49 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# encoding: UTF-8
|
3
|
+
# Junegunn Choi (junegunn.c@gmail.com)
|
4
|
+
# 2011/04/02-
|
5
|
+
|
6
|
+
# A dirty little script to fetch test sets from http://www.korean.go.kr
|
7
|
+
|
8
|
+
require 'open-uri'
|
9
|
+
require 'yaml'
|
10
|
+
|
11
|
+
# Crawl romanization test set
|
12
|
+
rdata = open('http://www.korean.go.kr/09_new/dic/rule/rule_roman_0101.jsp').read.
|
13
|
+
scan(%r{th>(.*?)</td}m).flatten.map { |e| e.split %r{<.*>}m }.
|
14
|
+
select { |e| e.length == 2 }
|
15
|
+
|
16
|
+
File.open(File.dirname(__FILE__) + '/../test/romanization.yml', 'w') do | f |
|
17
|
+
f.puts "---"
|
18
|
+
|
19
|
+
rdata.each do | arr |
|
20
|
+
f.puts "\"#{arr.first}\": \"#{arr.last}\""
|
21
|
+
end
|
22
|
+
end
|
23
|
+
|
24
|
+
exit
|
25
|
+
|
26
|
+
# Crawl pronunciation test set
|
27
|
+
m = {}
|
28
|
+
%w[
|
29
|
+
http://www.korean.go.kr/09_new/dic/rule/rule02_0202.jsp
|
30
|
+
http://www.korean.go.kr/09_new/dic/rule/rule02_0204.jsp
|
31
|
+
http://www.korean.go.kr/09_new/dic/rule/rule02_0205.jsp
|
32
|
+
http://www.korean.go.kr/09_new/dic/rule/rule02_0206.jsp
|
33
|
+
http://www.korean.go.kr/09_new/dic/rule/rule02_0207.jsp
|
34
|
+
].each do | url |
|
35
|
+
open(url).read.scan(/>([^0-9<>);?]+?)\[(.*?)\]</).each do | match |
|
36
|
+
puts match[0, 2].join(' => ')
|
37
|
+
m[match[0]] = match[1]
|
38
|
+
end
|
39
|
+
end
|
40
|
+
|
41
|
+
File.open(File.dirname(__FILE__) + '/../test/pronunciation.yml', 'w') do | f |
|
42
|
+
f.puts "---"
|
43
|
+
m.each do | k, v |
|
44
|
+
k = k.sub(/.*→/, '').gsub(/-/, '')
|
45
|
+
v = v.sub(/.*→/, '').gsub(/[\(:ː\)]/, '').split(%r{[/∼]})
|
46
|
+
f.puts "\"#{k}\": [#{v.join(', ')}]"
|
47
|
+
end
|
48
|
+
end
|
49
|
+
|
data/gimchi.gemspec
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
lib = File.expand_path('../lib', __FILE__)
|
3
|
+
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
|
+
|
5
|
+
Gem::Specification.new do |gem|
|
6
|
+
gem.name = %q{gimchi}
|
7
|
+
gem.version = "0.2.0"
|
8
|
+
gem.authors = ["Junegunn Choi"]
|
9
|
+
gem.email = ["junegunn.c@gmail.com"]
|
10
|
+
gem.description = %q{A Ruby gem for Korean characters}
|
11
|
+
gem.summary = %q{A Ruby gem for Korean characters}
|
12
|
+
gem.homepage = "https://github.com/junegunn/gimchi"
|
13
|
+
|
14
|
+
gem.files = `git ls-files`.split($/).reject { |f| f =~ %r[^viz/] }
|
15
|
+
gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
|
16
|
+
gem.test_files = gem.files.grep(%r{^(test|spec|features)/})
|
17
|
+
gem.require_paths = ["lib"]
|
18
|
+
gem.license = "MIT"
|
19
|
+
|
20
|
+
gem.add_development_dependency 'ansi'
|
21
|
+
end
|
data/lib/gimchi.rb
CHANGED
@@ -2,10 +2,380 @@
|
|
2
2
|
# encoding: UTF-8
|
3
3
|
# Junegunn Choi (junegunn.c@gmail.com)
|
4
4
|
|
5
|
-
require '
|
5
|
+
require 'yaml'
|
6
|
+
require 'set'
|
6
7
|
require 'gimchi/char'
|
7
8
|
require 'gimchi/pronouncer'
|
8
9
|
|
9
|
-
|
10
|
-
|
11
|
-
|
10
|
+
class Gimchi
|
11
|
+
class << self
|
12
|
+
def setup
|
13
|
+
@@default ||= Gimchi.new
|
14
|
+
end
|
15
|
+
|
16
|
+
def Char ch
|
17
|
+
@@default.kchar ch
|
18
|
+
end
|
19
|
+
|
20
|
+
[
|
21
|
+
:decompose,
|
22
|
+
:compose,
|
23
|
+
:korean_char?,
|
24
|
+
:complete_korean_char?,
|
25
|
+
:kchar,
|
26
|
+
:kchar?,
|
27
|
+
:chosung?,
|
28
|
+
:jungsung?,
|
29
|
+
:jongsung?,
|
30
|
+
:read_number,
|
31
|
+
:pronounce,
|
32
|
+
:romanize
|
33
|
+
].each do |sym|
|
34
|
+
define_method(sym) do |*arg, &b|
|
35
|
+
@@default.send sym, *arg, &b
|
36
|
+
end
|
37
|
+
end
|
38
|
+
end
|
39
|
+
|
40
|
+
CONFIG_FILE_PATH = File.expand_path('../../config/default.yml', __FILE__)
|
41
|
+
attr_reader :config, :chosungs, :jungsungs, :jongsungs
|
42
|
+
|
43
|
+
# Initialize a Gimchi instance from the default configuration file.
|
44
|
+
def initialize
|
45
|
+
symbolize_keys = lambda do |val|
|
46
|
+
case val
|
47
|
+
when Hash
|
48
|
+
{}.tap do |h|
|
49
|
+
val.each do |k, v|
|
50
|
+
k = k.gsub(' ', '_').to_sym if k =~ /[a-z0-9 ]/
|
51
|
+
h[k] = symbolize_keys.call v
|
52
|
+
end
|
53
|
+
end
|
54
|
+
when Array
|
55
|
+
val.map { |v| symbolize_keys.call v }
|
56
|
+
else
|
57
|
+
val
|
58
|
+
end
|
59
|
+
end
|
60
|
+
@config = symbolize_keys.call YAML.load(File.read CONFIG_FILE_PATH)
|
61
|
+
|
62
|
+
[
|
63
|
+
@config[:romanization][:post_substitution],
|
64
|
+
@config[:number][:post_substitution],
|
65
|
+
@config[:number][:alt_notation][:post_substitution]
|
66
|
+
].each do |r|
|
67
|
+
r.keys.each do |k|
|
68
|
+
r[Regexp.compile k.to_s] = r.delete k
|
69
|
+
end
|
70
|
+
end
|
71
|
+
@config.freeze
|
72
|
+
|
73
|
+
@pronouncer = Gimchi::Pronouncer.send :new, self
|
74
|
+
|
75
|
+
@chosungs = config[:structure][:chosung]
|
76
|
+
@jungsungs = config[:structure][:jungsung]
|
77
|
+
@jongsungs = config[:structure][:jongsung]
|
78
|
+
@chosung_set = Set[*@chosungs]
|
79
|
+
@jungsung_set = Set[*@jungsungs]
|
80
|
+
@jongsung_set = Set[*@jongsungs]
|
81
|
+
@all = @chosung_set + @jungsung_set + @jongsung_set
|
82
|
+
end
|
83
|
+
|
84
|
+
# Decompose a Korean character into 3 components
|
85
|
+
# @param [String] ch Korean character
|
86
|
+
# @return [Array]
|
87
|
+
def decompose ch
|
88
|
+
kchar(ch).to_a
|
89
|
+
end
|
90
|
+
|
91
|
+
# Compose 3 elements into a Korean character String
|
92
|
+
# @param [String] chosung
|
93
|
+
# @param [String] jungsung
|
94
|
+
# @param [String] jongsung
|
95
|
+
# @return [String]
|
96
|
+
def compose chosung, jungsung = nil, jongsung = nil
  if chosung.nil? && jungsung.nil?
    # Nothing to compose.
    ""
  elsif chosung && jungsung
    # Position of each component in its table; an unknown component falls
    # back to index 0. The jongsung table is shifted by one so that index 0
    # stands for "no final consonant".
    n1 = chosungs.index(chosung) || 0
    n2 = jungsungs.index(jungsung) || 0
    n3 = ([nil] + jongsungs).index(jongsung) || 0

    # Code point = 0xAC00 + chosung * (21 * 28) + jungsung * 28 + jongsung
    [ 0xAC00 + n1 * (21 * 28) + n2 * 28 + n3 ].pack('U')
  else
    # Only one of chosung / jungsung was given: return it unchanged.
    chosung || jungsung
  end
end
|
109
|
+
|
110
|
+
# @param [String] ch
|
111
|
+
# @return [Boolean]
|
112
|
+
def chosung? ch
|
113
|
+
@chosung_set.include? ch
|
114
|
+
end
|
115
|
+
|
116
|
+
# @param [String] ch
|
117
|
+
# @return [Boolean]
|
118
|
+
def jungsung? ch
|
119
|
+
@jungsung_set.include? ch
|
120
|
+
end
|
121
|
+
|
122
|
+
# @param [String] ch
|
123
|
+
# @return [Boolean]
|
124
|
+
def jongsung? ch
|
125
|
+
@jongsung_set.include? ch
|
126
|
+
end
|
127
|
+
|
128
|
+
# Checks if the given character is a korean character.
|
129
|
+
# @param [String] ch A string of size 1
|
130
|
+
def korean_char? ch
|
131
|
+
raise ArgumentError.new('Lengthy input') if str_length(ch) > 1
|
132
|
+
|
133
|
+
complete_korean_char?(ch) || @all.include?(ch)
|
134
|
+
end
|
135
|
+
alias kchar? korean_char?
|
136
|
+
|
137
|
+
# Checks if the given character is a "complete" korean character.
|
138
|
+
# "Complete" Korean character must have chosung and jungsung, with optional jongsung.
|
139
|
+
# @param [String] ch A string of size 1
|
140
|
+
def complete_korean_char? ch
|
141
|
+
raise ArgumentError.new('Lengthy input') if str_length(ch) > 1
|
142
|
+
|
143
|
+
# Range of Korean characters in Unicode 2.0: AC00(가) ~ D7A3(힣)
|
144
|
+
ch.unpack('U').all? { | c | c >= 0xAC00 && c <= 0xD7A3 }
|
145
|
+
end
|
146
|
+
|
147
|
+
# Returns a Gimchi::Char object for the given Korean character.
|
148
|
+
# @param [String] ch Korean character in String
|
149
|
+
# @return [Gimchi::Char] Gimchi::Char instance
|
150
|
+
def kchar ch
|
151
|
+
Gimchi::Char.new(self, ch)
|
152
|
+
end
|
153
|
+
|
154
|
+
# Reads numeric expressions in Korean way.
|
155
|
+
# @param [String, Numeric] str Numeric value or String containing numeric expressions
|
156
|
+
# @return [String] Output string
|
157
|
+
def read_number str
|
158
|
+
str.to_s.gsub(/(([+-]\s*)?[0-9,]*,*[0-9]+(\.[0-9]+(e[+-][0-9]+)?)?)(\s*.)?/) {
|
159
|
+
read_number_sub($1, $5)
|
160
|
+
}
|
161
|
+
end
|
162
|
+
|
163
|
+
# Returns the pronunciation of the given string containing Korean characters.
|
164
|
+
# Takes optional options hash.
|
165
|
+
#
|
166
|
+
# @param [String] str Input string
|
167
|
+
# @param [Hash] options Options
|
168
|
+
# @option options [Boolean] each_char Each character of the string is pronounced individually.
|
169
|
+
# @option options [Boolean] slur Strings separated by whitespaces are processed again as if they were contiguous.
|
170
|
+
# @option options [Boolean] number Numeric parts of the string are also pronounced in Korean.
|
171
|
+
# @option options [Array] except Allows you to skip certain transformations.
|
172
|
+
# @return [String] Output string
|
173
|
+
def pronounce str, options = {}
|
174
|
+
options = {
|
175
|
+
:each_char => false,
|
176
|
+
:slur => false,
|
177
|
+
:number => true,
|
178
|
+
:except => [],
|
179
|
+
:debug => false
|
180
|
+
}.merge options
|
181
|
+
|
182
|
+
str = read_number(str) if options[:number]
|
183
|
+
|
184
|
+
result, transforms = @pronouncer.send :pronounce!, str, options
|
185
|
+
|
186
|
+
if options[:debug]
|
187
|
+
return result, transforms
|
188
|
+
else
|
189
|
+
return result
|
190
|
+
end
|
191
|
+
end
|
192
|
+
|
193
|
+
# Returns the romanization (alphabetical notation) of the given Korean string.
|
194
|
+
# http://en.wikipedia.org/wiki/Korean_romanization
|
195
|
+
# @param [String] str Input Korean string
|
196
|
+
# @param [Hash] options Options
|
197
|
+
# @option options [Boolean] as_pronounced If true, #pronounce is internally called before romanize
|
198
|
+
# @option options [Boolean] number Whether to read numeric expressions in the string
|
199
|
+
# @option options [Boolean] slur Same as :slur in #pronounce
|
200
|
+
# @return [String] Output string in Roman Alphabet
|
201
|
+
# @see Gimchi#pronounce
|
202
|
+
def romanize str, options = {}
|
203
|
+
options = {
|
204
|
+
:as_pronounced => true,
|
205
|
+
:number => true,
|
206
|
+
:slur => false
|
207
|
+
}.merge options
|
208
|
+
|
209
|
+
rdata = config[:romanization]
|
210
|
+
post_subs = rdata[:post_substitution]
|
211
|
+
rdata = [rdata[:chosung], rdata[:jungsung], rdata[:jongsung]]
|
212
|
+
|
213
|
+
str = pronounce str,
|
214
|
+
:each_char => !options[:as_pronounced],
|
215
|
+
:number => options[:number],
|
216
|
+
:slur => options[:slur],
|
217
|
+
# 제1항 [붙임 1] ‘ㅢ’는 ‘ㅣ’로 소리 나더라도 ‘ui’로 적는다.
|
218
|
+
:except => %w[rule_5_3]
|
219
|
+
dash = rdata[0]["ㅇ"]
|
220
|
+
romanization = ""
|
221
|
+
|
222
|
+
romanize_chunk = lambda do |chunk|
|
223
|
+
chunk.each_char.map { |ch| kchar(ch) rescue ch }.each do |kc|
|
224
|
+
kc.to_a.each_with_index do |comp, idx|
|
225
|
+
next if comp.nil?
|
226
|
+
comp = rdata[idx][comp] || comp
|
227
|
+
comp = comp[1..-1] if comp[0, 1] == dash &&
|
228
|
+
(romanization.empty? || romanization[-1, 1] =~ /\s/)
|
229
|
+
romanization += comp
|
230
|
+
end
|
231
|
+
end
|
232
|
+
|
233
|
+
return post_subs.keys.inject(romanization) { | output, pattern |
|
234
|
+
output.gsub(pattern, post_subs[pattern])
|
235
|
+
}
|
236
|
+
end
|
237
|
+
|
238
|
+
k_chunk = ""
|
239
|
+
str.each_char do | c |
|
240
|
+
if korean_char? c
|
241
|
+
k_chunk += c
|
242
|
+
else
|
243
|
+
unless k_chunk.empty?
|
244
|
+
romanization = romanize_chunk.call k_chunk
|
245
|
+
k_chunk = ""
|
246
|
+
end
|
247
|
+
romanization += c
|
248
|
+
end
|
249
|
+
end
|
250
|
+
romanization = romanize_chunk.call k_chunk unless k_chunk.empty?
|
251
|
+
romanization
|
252
|
+
end
|
253
|
+
|
254
|
+
private
|
255
|
+
def str_length str
|
256
|
+
str.length
|
257
|
+
end
|
258
|
+
|
259
|
+
# Converts one numeric token into its Korean reading.
#
# @param [String] num Numeric token: optional sign, digits with optional
#   thousands separators, optional fraction and exponent
# @param [String, nil] next_char Text captured right after the number
#   (optional whitespace plus one character); it is appended to the result
#   and may select the alternative notation when it is a configured suffix
# @return [String] Korean reading followed by next_char
# @raise [RangeError] if the number needs a unit beyond the configured ones
def read_number_sub num, next_char
  nconfig = config[:number]

  # Keep the trailing text for a bare zero too; the caller replaces the
  # whole match (number + next_char) with the value returned here.
  if num == '0'
    return nconfig[:digits].first + next_char.to_s
  end

  num = num.gsub(',', '')
  next_char = next_char.to_s
  is_float = num.match(/[\.e]/) != nil

  # Alternative notation for integers followed by a configured suffix,
  # optionally capped by that suffix's :max value.
  alt = false
  suffix = next_char.strip
  when_suffix = nconfig[:alt_notation][:when_suffix]
  if is_float == false && when_suffix.key?(suffix)
    max = when_suffix[suffix][:max]
    alt = true if max.nil? || num.to_i <= max
  end

  # Sign
  sign = []
  if num =~ /^-/
    num = num.sub(/^-\s*/, '')
    sign << nconfig[:negative]
  elsif num =~ /^\+/
    num = num.sub(/^\+\s*/, '')
    sign << nconfig[:positive]
  end

  if is_float
    below = nconfig[:decimal_point]
    # Values below 1 are read with an explicit leading zero.
    below = nconfig[:digits][0] + below if num.to_f < 1

    # Expand exponent notation into plain decimal form.
    if md = num.match(/(.*)e(.*)/)
      dp = md[1].index('.')
      num = md[1].tr '.', ''
      exp = md[2].to_i

      dp += exp
      if dp > num.length
        # Decimal point moves past the last digit: pad with zeros; no
        # fractional part remains.
        num = num.ljust(dp, '0')
        num = num.sub(/^0+([1-9])/, "\\1")

        below = ""
      elsif dp < 0
        num = '0.' + '0' * (-dp) + num
      else
        num[dp, 1] = '.' + num[dp, 1]
      end
    end

    # Fractional digits are read one by one.
    if num.include? '.'
      num.sub(/.*\./, '').each_char do |char|
        below += nconfig[:digits][char.to_i]
      end
    end
    num = num.sub(/\..*/, '')
  else
    below = ""
  end

  # Integer part, read in groups of four digits from the lowest group up.
  tokens = []
  unit_idx = -1
  num = num.to_i
  while num > 0
    v = num % 10000

    unit_idx += 1
    if v > 0
      if alt == false || unit_idx >= 1
        str = ""
        # Ordered pairs rather than a hash: hashes are unordered in 1.8
        [[1000, '천'],
         [100,  '백'],
         [10,   '십']].each do |u, sub_unit|
          # A leading "one" is omitted before the sub-unit.
          str += (nconfig[:digits][v/u] if v/u != 1).to_s + sub_unit + ' ' if v / u > 0
          v %= u
        end
        str += nconfig[:digits][v] if v > 0

        raise RangeError, "number too large" unless nconfig[:units][unit_idx]
        tokens << str.sub(/ $/, '') + nconfig[:units][unit_idx]
      else
        # Alternative notation; only reached for the lowest group.
        str = ""
        tenfolds = nconfig[:alt_notation][:tenfolds]
        digits = nconfig[:alt_notation][:digits]
        alt_post_subs = nconfig[:alt_notation][:post_substitution]

        [[1000, '천'],
         [100,  '백']].each do |u, sub_unit|
          str += (nconfig[:digits][v/u] if v/u != 1).to_s + sub_unit + ' ' if v / u > 0
          v %= u
        end

        str += tenfolds[(v / 10) - 1] if v / 10 > 0
        v %= 10
        str += digits[v] if v > 0

        alt_post_subs.each do |p, s|
          str = str.gsub(p, s)
        end
        tokens << str.sub(/ $/, '') + nconfig[:units][unit_idx]
      end
    end
    num /= 10000
  end

  # Tokens were collected lowest group first; the sign goes last so that it
  # ends up in front after reversing.
  tokens += sign unless sign.empty?
  ret = tokens.reverse.join(' ') + below + next_char
  nconfig[:post_substitution].each do |p, s|
    ret = ret.gsub(p, s)
  end
  ret
end
|
377
|
+
end#Gimchi
|
378
|
+
|
379
|
+
require 'gimchi/patch_1.8'
|
380
|
+
|
381
|
+
Gimchi.setup
|