gimchi 0.1.6 → 0.1.7

Sign up to get free protection for your applications and to get access to all the features.
data/README.markdown ADDED
@@ -0,0 +1,134 @@
1
+ # gimchi
2
+
3
+ Gimchi is a simple Ruby gem which knows how to handle Korean strings. It knows
4
+ how to dissect Korean characters into their 3 components, namely chosung,
5
+ jungsung and optional jongsung. It knows how Korean sentences are pronounced
6
+ and how they're written in the Roman alphabet.
7
+
8
+ Gimchi (only partially) implements the following rules dictated by
9
+ The National Institute of The Korean Language (http://www.korean.go.kr)
10
+ * Korean Standard Pronunciation
11
+ * Korean Romanization
12
+
13
+ ## Installation
14
+ ```
15
+ gem install gimchi
16
+ ```
17
+
18
+ ## Usage
19
+
20
+ ### Creating Gimchi::Korean instance
21
+ ```ruby
22
+ require 'gimchi'
23
+
24
+ ko = Gimchi::Korean.new
25
+ ```
26
+
27
+ ### Checking if the given character is in the Korean alphabet
28
+ ```ruby
29
+ ko.korean_char? 'ㄱ' # true
30
+ ko.complete_korean_char? 'ㄱ' # false
31
+
32
+ ko.korean_char? 'ㅏ' # true
33
+ ko.complete_korean_char? 'ㅏ' # false
34
+
35
+ ko.korean_char? '가' # true
36
+ ko.complete_korean_char? '가' # true
37
+ ```
38
+
39
+ ### Usage of Gimchi::Korean::Char
40
+ ```ruby
41
+ arr = ko.dissect '이것은 한글입니다.'
42
+ # [이, 것, 은, " ", 한, 글, 입, 니, 다, "."]
43
+
44
+ arr[4].class # Gimchi::Korean::Char
45
+
46
+ arr[4].chosung # "ㅎ"
47
+ arr[4].jungsung # "ㅏ"
48
+ arr[4].jongsung # "ㄴ"
49
+ arr[4].to_a # ["ㅎ", "ㅏ", "ㄴ"]
50
+ arr[4].to_s # "한"
51
+
52
+ arr[4].chosung = 'ㄷ'
53
+ arr[4].jongsung = 'ㄹ'
54
+ arr[4].to_s # "달"
55
+ arr[4].complete? # true
56
+ arr[4].partial? # false
57
+
58
+ arr[4].chosung = nil
59
+ arr[4].jongsung = nil
60
+ arr[4].complete? # false
61
+ arr[4].partial? # true
62
+ ```
63
+
64
+ ### Reading numbers in Korean
65
+ ```ruby
66
+ ko.read_number(1999) # "천 구백 구십 구"
67
+ ko.read_number(- 100.123) # "마이너스 백점일이삼"
68
+ ko.read_number("153,191,100,678.3214")
69
+ # "천 오백 삼십 일억 구천 백 십만 육백 칠십 팔점삼이일사"
70
+
71
+ # Age, Time ( -살, -시 )
72
+ ko.read_number("20살") # "스무살"
73
+ ko.read_number("13 살") # "열세 살"
74
+ ko.read_number("7시 30분") # "일곱시 삼십분"
75
+ ```
76
+
77
+ ### Standard pronunciation (partially implemented)
78
+ ```ruby
79
+ str = "됐어 됐어 이제 그런 가르침은 됐어 매일 아침 7 시 30 분까지 우릴 조그만 교실로 몰아넣고"
80
+ ko.pronounce str
81
+ # "돼써 돼써 이제 그런 가르치믄 돼써 매일 아침 일곱 시 삼십 분까지 우릴 조그만 교실로 모라너코"
82
+
83
+ ko.pronounce str, :slur => true
84
+ # "돼써 돼써 이제 그런 가르치믄 돼써 매이 라치 밀곱 씨 삼십 뿐까지 우릴 조그만 교실로 모라너코"
85
+
86
+ ko.pronounce str, :pronounce_each_char => true
87
+ # "됃어 됃어 이제 그런 가르침은 됃어 매일 아침 일곱 시 삼십 분까지 우릴 조그만 교실로 몰아너고"
88
+
89
+ ko.pronounce str, :number => false
90
+ # "돼써 돼써 이제 그런 가르치믄 돼써 매일 아침 7 시 30 분까지 우릴 조그만 교실로 모라너코"
91
+ ```
92
+
93
+ ### Romanization (partially implemented)
94
+ ```ruby
95
+ str = "됐어 됐어 이제 그런 가르침은 됐어 매일 아침 7 시 30 분까지 우릴 조그만 교실로 몰아넣고"
96
+
97
+ ko.romanize str
98
+ # "Dwaesseo dwaesseo ije geureon gareuchimeun dwaesseo mae-il achim ilgop si samsip bunkkaji uril jogeuman gyosillo moraneoko"
99
+ ko.romanize str, :slur => true
100
+ # "Dwaesseo dwaesseo ije geureon gareuchimeun dwaesseo mae-i rachi milgop ssi samsip ppunkkaji uril jogeuman gyosillo moraneoko"
101
+ ko.romanize str, :as_pronounced => false
102
+ # "Dwaet-eo dwaet-eo ije geureon gareuchim-eun dwaet-eo mae-il achim ilgop si samsip bunkkaji uril jogeuman gyosillo mol-aneogo"
103
+ ko.romanize str, :number => false
104
+ # "Dwaesseo dwaesseo ije geureon gareuchimeun dwaesseo mae-il achim 7 si 30 bunkkaji uril jogeuman gyosillo moraneoko"
105
+ ```
106
+
107
+ ## Limitation of the implementation
108
+
109
+ Unfortunately in order to implement the complete specification of Korean
110
+ pronunciation and romanization, we need NLP, huge Korean dictionaries and even
111
+ semantic analysis of the given string. And even with all that complex
112
+ processing, we cannot guarantee 100% accuracy of the output. So yes, that is
113
+ definitely not what this gem tries to achieve. Gimchi tries to achieve "some"
114
+ level of accuracy with relatively simple code.
115
+
116
+ Currently, Gimchi code contains a lot of ad-hoc (possibly invalid) patches
117
+ that try to improve the quality of the output, which should be
118
+ refactored sometime soon.
119
+
120
+ ## Contributing to gimchi
121
+
122
+ * Check out the latest master to make sure the feature hasn't been implemented or the bug hasn't been fixed yet
123
+ * Check out the issue tracker to make sure someone hasn't already requested it and/or contributed it
124
+ * Fork the project
125
+ * Start a feature/bugfix branch
126
+ * Commit and push until you are happy with your contribution
127
+ * Make sure to add tests for it. This is important so I don't break it in a future version unintentionally.
128
+ * Please try not to mess with the Rakefile, version, or history. If you want to have your own version, or it is otherwise necessary, that is fine, but please isolate it to its own commit so I can cherry-pick around it.
129
+
130
+ ## Copyright
131
+
132
+ Copyright (c) 2011 Junegunn Choi. See LICENSE.txt for
133
+ further details.
134
+
data/config/default.yml CHANGED
@@ -90,8 +90,7 @@ number:
90
90
  units: ["", 만, 억, 조, 경, 해, 자, 양, 구, 간, 정, 재, 극, 항하사, 아승기, 나유타, 불가사의, 무량대수]
91
91
  digits: [영, 일, 이, 삼, 사, 오, 육, 칠, 팔, 구]
92
92
  post substitution:
93
- ? !ruby/regexp /^일만/
94
- : 만
93
+ "^일만": 만
95
94
 
96
95
  # 정수형일 때 또다른 표현법 (나이, 시간)
97
96
  alt notation:
@@ -107,8 +106,7 @@ number:
107
106
  tenfolds: [열, 스물, 서른, 마흔, 쉰, 예순, 일흔, 여든, 아흔, 백]
108
107
  digits: ["", 한, 두, 세, 네, 다섯, 여섯, 일곱, 여덟, 아홉]
109
108
  post substitution:
110
- ? !ruby/regexp /스물$/
111
- : 스무
109
+ "스물$": 스무
112
110
 
113
111
  romanization:
114
112
  chosung:
@@ -171,9 +169,6 @@ romanization:
171
169
  ttt: t-tt
172
170
  ppp: p-pp
173
171
  "--": "-"
174
- ? !ruby/regexp /n-([^gaeiou])/
175
- : "n\\1"
176
- ? !ruby/regexp /-(\s)/
177
- : "\\1"
178
- ? !ruby/regexp /-$/
179
- : ""
172
+ "n-([^gaeiou])": "n\\1"
173
+ "-(\\s)": "\\1"
174
+ "-$": ""
data/lib/gimchi/char.rb CHANGED
@@ -2,122 +2,122 @@
2
2
 
3
3
  module Gimchi
4
4
  class Korean
5
- # Class representing each Korean character. Its three components,
6
- # `chosung', `jungsung' and `jongsung' can be get and set.
7
- #
8
- # `to_s' merges components into a String. `to_a' returns the three components.
9
- class Char
10
- # @return [String] Chosung component of this character.
11
- attr_reader :chosung
12
- # @return [String] Jungsung component of this character.
13
- attr_reader :jungsung
14
- # @return [String] Jongsung component of this character.
15
- attr_reader :jongsung
5
+ # Class representing each Korean character. Its three components,
6
+ # `chosung', `jungsung' and `jongsung' can be get and set.
7
+ #
8
+ # `to_s' merges components into a String. `to_a' returns the three components.
9
+ class Char
10
+ # @return [String] Chosung component of this character.
11
+ attr_reader :chosung
12
+ # @return [String] Jungsung component of this character.
13
+ attr_reader :jungsung
14
+ # @return [String] Jongsung component of this character.
15
+ attr_reader :jongsung
16
16
 
17
- # @param [Gimchi::Korean] kor Gimchi::Korean instance
18
- # @param [String] kchar Korean character string
19
- def initialize kor, kchar
20
- raise ArgumentError('Not a korean character') unless kor.korean_char? kchar
17
+ # @param [Gimchi::Korean] kor Gimchi::Korean instance
18
+ # @param [String] kchar Korean character string
19
+ def initialize kor, kchar
20
+ raise ArgumentError('Not a korean character') unless kor.korean_char? kchar
21
21
 
22
- @kor = kor
23
- @cur = []
24
- if @kor.complete_korean_char? kchar
25
- c = kchar.unpack('U').first
26
- n = c - 0xAC00
27
- # '가' ~ '깋' -> 'ㄱ'
28
- n1 = n / (21 * 28)
29
- # '가' ~ '깋'에서의 순서
30
- n = n % (21 * 28)
31
- n2 = n / 28;
32
- n3 = n % 28;
33
- self.chosung = @kor.chosungs[n1]
34
- self.jungsung = @kor.jungsungs[n2]
35
- self.jongsung = ([nil] + @kor.jongsungs)[n3]
36
- elsif @kor.chosungs.include? kchar
37
- self.chosung = kchar
38
- elsif @kor.jungsungs.include? kchar
39
- self.jungsung = kchar
40
- elsif @kor.jongsungs.include? kchar
41
- self.jongsung = kchar
42
- end
43
- end
22
+ @kor = kor
23
+ @cur = []
24
+ if @kor.complete_korean_char? kchar
25
+ c = kchar.unpack('U').first
26
+ n = c - 0xAC00
27
+ # '가' ~ '깋' -> 'ㄱ'
28
+ n1 = n / (21 * 28)
29
+ # '가' ~ '깋'에서의 순서
30
+ n = n % (21 * 28)
31
+ n2 = n / 28;
32
+ n3 = n % 28;
33
+ self.chosung = @kor.chosungs[n1]
34
+ self.jungsung = @kor.jungsungs[n2]
35
+ self.jongsung = ([nil] + @kor.jongsungs)[n3]
36
+ elsif @kor.chosungs.include? kchar
37
+ self.chosung = kchar
38
+ elsif @kor.jungsungs.include? kchar
39
+ self.jungsung = kchar
40
+ elsif @kor.jongsungs.include? kchar
41
+ self.jongsung = kchar
42
+ end
43
+ end
44
44
 
45
- # Recombines components into a korean character.
46
- # @return [String] Combined korean character
47
- def to_s
48
- if chosung.nil? && jungsung.nil?
49
- ""
50
- elsif chosung && jungsung
51
- n1, n2, n3 =
52
- n1 = @kor.chosungs.index(chosung) || 0
53
- n2 = @kor.jungsungs.index(jungsung) || 0
54
- n3 = ([nil] + @kor.jongsungs).index(jongsung) || 0
55
- [ 0xAC00 + n1 * (21 * 28) + n2 * 28 + n3 ].pack('U')
56
- else
57
- chosung || jungsung
58
- end
59
- end
45
+ # Recombines components into a korean character.
46
+ # @return [String] Combined korean character
47
+ def to_s
48
+ if chosung.nil? && jungsung.nil?
49
+ ""
50
+ elsif chosung && jungsung
51
+ n1, n2, n3 =
52
+ n1 = @kor.chosungs.index(chosung) || 0
53
+ n2 = @kor.jungsungs.index(jungsung) || 0
54
+ n3 = ([nil] + @kor.jongsungs).index(jongsung) || 0
55
+ [ 0xAC00 + n1 * (21 * 28) + n2 * 28 + n3 ].pack('U')
56
+ else
57
+ chosung || jungsung
58
+ end
59
+ end
60
60
 
61
- # Sets the chosung component.
62
- # @param [String]
63
- def chosung= c
64
- raise ArgumentError.new('Invalid chosung component') if
65
- c && @kor.chosungs.include?(c) == false
66
- @chosung = c && c.dup.extend(Component).tap { |e| e.kor = @kor }
67
- end
61
+ # Sets the chosung component.
62
+ # @param [String]
63
+ def chosung= c
64
+ raise ArgumentError.new('Invalid chosung component') if
65
+ c && @kor.chosungs.include?(c) == false
66
+ @chosung = c && c.dup.extend(Component).tap { |e| e.kor = @kor }
67
+ end
68
68
 
69
- # Sets the jungsung component
70
- # @param [String]
71
- def jungsung= c
72
- raise ArgumentError.new('Invalid jungsung component') if
73
- c && @kor.jungsungs.include?(c) == false
74
- @jungsung = c && c.dup.extend(Component).tap { |e| e.kor = @kor }
75
- end
69
+ # Sets the jungsung component
70
+ # @param [String]
71
+ def jungsung= c
72
+ raise ArgumentError.new('Invalid jungsung component') if
73
+ c && @kor.jungsungs.include?(c) == false
74
+ @jungsung = c && c.dup.extend(Component).tap { |e| e.kor = @kor }
75
+ end
76
76
 
77
- # Sets the jongsung component
78
- #
79
- # @param [String]
80
- def jongsung= c
81
- raise ArgumentError.new('Invalid jongsung component') if
82
- c && @kor.jongsungs.include?(c) == false
83
- @jongsung = c && c.dup.extend(Component).tap { |e| e.kor = @kor }
84
- end
77
+ # Sets the jongsung component
78
+ #
79
+ # @param [String]
80
+ def jongsung= c
81
+ raise ArgumentError.new('Invalid jongsung component') if
82
+ c && @kor.jongsungs.include?(c) == false
83
+ @jongsung = c && c.dup.extend(Component).tap { |e| e.kor = @kor }
84
+ end
85
85
 
86
- # Returns Array of three components.
87
- #
88
- # @return [Array] Array of three components
89
- def to_a
90
- [chosung, jungsung, jongsung]
91
- end
86
+ # Returns Array of three components.
87
+ #
88
+ # @return [Array] Array of three components
89
+ def to_a
90
+ [chosung, jungsung, jongsung]
91
+ end
92
92
 
93
- # Checks if this is a complete Korean character.
94
- def complete?
95
- chosung.nil? == false && jungsung.nil? == false
96
- end
93
+ # Checks if this is a complete Korean character.
94
+ def complete?
95
+ chosung.nil? == false && jungsung.nil? == false
96
+ end
97
97
 
98
- # Checks if this is a non-complete Korean character.
99
- # e.g. ㅇ, ㅏ
100
- def partial?
101
- chosung.nil? || jungsung.nil?
102
- end
98
+ # Checks if this is a non-complete Korean character.
99
+ # e.g. ㅇ, ㅏ
100
+ def partial?
101
+ chosung.nil? || jungsung.nil?
102
+ end
103
103
 
104
- private
105
- # Three components of Korean::Char are extended to support #vowel? and #consonant? method.
106
- module Component
107
- # @return [Korean] Hosting Korean instance
108
- attr_accessor :kor
104
+ private
105
+ # Three components of Korean::Char are extended to support #vowel? and #consonant? method.
106
+ module Component
107
+ # @return [Korean] Hosting Korean instance
108
+ attr_accessor :kor
109
109
 
110
- # Is this component a vowel?
111
- def vowel?
112
- kor.jungsungs.include? self
113
- end
110
+ # Is this component a vowel?
111
+ def vowel?
112
+ kor.jungsungs.include? self
113
+ end
114
114
 
115
- # Is this component a consonant?
116
- def consonant?
117
- self != 'ㅇ' && kor.chosungs.include?(self)
118
- end
119
- end#Component
120
- end#Char
115
+ # Is this component a consonant?
116
+ def consonant?
117
+ self != 'ㅇ' && kor.chosungs.include?(self)
118
+ end
119
+ end#Component
120
+ end#Char
121
121
  end#Korean
122
122
  end#Gimchi
123
123
 
data/lib/gimchi/korean.rb CHANGED
@@ -2,285 +2,299 @@
2
2
 
3
3
  module Gimchi
4
4
  class Korean
5
- DEFAULT_CONFIG_FILE_PATH =
6
- File.dirname(__FILE__) + '/../../config/default.yml'
7
-
8
- # Returns the YAML configuration used by this Korean instance.
9
- # @return [String]
10
- attr_reader :config
11
-
12
- # Initialize Gimchi::Korean.
13
- # @param [String] config_file You can override many parts of the implementation by customizing config file
14
- def initialize config_file = DEFAULT_CONFIG_FILE_PATH
15
- require 'yaml'
16
- @config = YAML.load(File.read config_file)
17
- @config.freeze
18
-
19
- @pronouncer = Korean::Pronouncer.send :new, self
20
- end
21
-
22
- # Array of chosung's.
23
- #
24
- # @return [Array] Array of chosung strings
25
- def chosungs
26
- config['structure']['chosung']
27
- end
28
-
29
- # Array of jungsung's.
30
- # @return [Array] Array of jungsung strings
31
- def jungsungs
32
- config['structure']['jungsung']
33
- end
34
-
35
- # Array of jongsung's.
36
- # @return [Array] Array of jongsung strings
37
- def jongsungs
38
- config['structure']['jongsung']
39
- end
40
-
41
- # Checks if the given character is a korean character.
42
- # @param [String] ch A string of size 1
43
- def korean_char? ch
44
- raise ArgumentError.new('Lengthy input') if ch.length > 1
45
-
46
- complete_korean_char?(ch) ||
47
- (chosungs + jungsungs + jongsungs).include?(ch)
48
- end
49
-
50
- # Checks if the given character is a "complete" korean character.
51
- # "Complete" Korean character must have chosung and jungsung, with optional jongsung.
52
- # @param [String] ch A string of size 1
53
- def complete_korean_char? ch
54
- raise ArgumentError.new('Lengthy input') if ch.length > 1
55
-
56
- # Range of Korean chracters in Unicode 2.0: AC00(가) ~ D7A3(힣)
57
- ch.unpack('U').all? { | c | c >= 0xAC00 && c <= 0xD7A3 }
58
- end
59
-
60
- # Splits the given string into an array of Korean::Char's and Strings of length 1.
61
- # @param [String] str Input string.
62
- # @return [Array] Mixed array of Korean::Char instances and Strings of length 1 (for non-korean characters)
63
- def dissect str
64
- str.each_char.map { |c|
65
- korean_char?(c) ? Korean::Char.new(self, c) : c
66
- }
67
- end
68
-
69
- # Reads numeric expressions in Korean way.
70
- # @param [String, Number] str Numeric type or String containing numeric expressions
71
- # @return [String] Output string
72
- def read_number str
73
- nconfig = config['number']
74
-
75
- str.to_s.gsub(/(([+-]\s*)?[0-9,]*,*[0-9]+(\.[0-9]+(e[+-][0-9]+)?)?)(\s*.)?/) {
76
- read_number_sub($1, $5)
77
- }
78
- end
79
-
80
- # Returns the pronunciation of the given string containing Korean characters.
81
- # Takes optional options hash.
82
- #
83
- # @param [String] Input string
84
- # @param [Boolean] options[:pronounce_each_char] Each character of the string is pronounced respectively.
85
- # @param [Boolean] options[:slur] Strings separated by whitespaces are processed again as if they were contiguous.
86
- # @param [Boolean] options[:number] Numberic parts of the string is also pronounced in Korean.
87
- # @param [Array] options[:except] Allows you to skip certain transformations.
88
- # @return [String] Output string
89
- def pronounce str, options = {}
90
- options = {
91
- :pronounce_each_char => false,
92
- :slur => false,
93
- :number => true,
94
- :except => [],
95
- :debug => false
96
- }.merge options
97
-
98
- str = read_number(str) if options[:number]
99
-
100
- result, transforms = @pronouncer.send :pronounce!, str, options
101
-
102
- if options[:debug]
103
- return result, transforms
104
- else
105
- return result
106
- end
107
- end
108
-
109
- # Returns the romanization (alphabetical notation) of the given Korean string.
110
- # http://en.wikipedia.org/wiki/Korean_romanization
111
- # @param [String] str Input Korean string
112
- # @param [Boolean] options[:as_pronounced] If true, #pronounce is internally called before romanize
113
- # @param [Boolean] options[:number] Whether to read numeric expressions in the string
114
- # @param [Boolean] options[:slur] Same as :slur in #pronounce
115
- # @return [String] Output string in Roman Alphabet
116
- # @see Korean#pronounce
117
- def romanize str, options = {}
118
- options = {
119
- :as_pronounced => true,
120
- :number => true,
121
- :slur => false
122
- }.merge options
123
-
124
- require 'yaml'
125
- rdata = config['romanization']
126
- post_subs = rdata["post substitution"]
127
- rdata = [rdata["chosung"], rdata["jungsung"], rdata["jongsung"]]
128
-
129
- str = pronounce str,
130
- :pronounce_each_char => !options[:as_pronounced],
131
- :number => options[:number],
132
- :slur => options[:slur],
133
- # 제1항 [붙임 1] ‘ㅢ’는 ‘ㅣ’로 소리 나더라도 ‘ui’로 적는다.
134
- :except => %w[rule_5_3]
135
- dash = rdata[0]["ㅇ"]
136
- romanization = ""
137
-
138
- romanize_chunk = lambda do | chunk |
139
- dissect(chunk).each do | kc |
140
- kc.to_a.each_with_index do | comp, idx |
141
- next if comp.nil?
142
- comp = rdata[idx][comp] || comp
143
- comp = comp[1..-1] if comp[0, 1] == dash &&
144
- (romanization.empty? || romanization[-1, 1] =~ /\s/)
145
- romanization += comp
146
- end
147
- end
148
-
149
- return post_subs.keys.inject(romanization) { | output, pattern |
150
- output.gsub(pattern, post_subs[pattern])
151
- }
152
- end
153
-
154
- k_chunk = ""
155
- str.each_char do | c |
156
- if korean_char? c
157
- k_chunk += c
158
- else
159
- unless k_chunk.empty?
160
- romanization = romanize_chunk.call k_chunk
161
- k_chunk = ""
162
- end
163
- romanization += c
164
- end
165
- end
166
- romanization = romanize_chunk.call k_chunk unless k_chunk.empty?
167
- romanization
168
- end
5
+ DEFAULT_CONFIG_FILE_PATH =
6
+ File.dirname(__FILE__) + '/../../config/default.yml'
7
+
8
+ # Returns the YAML configuration used by this Korean instance.
9
+ # @return [String]
10
+ attr_reader :config
11
+
12
+ # Initialize Gimchi::Korean.
13
+ # @param [String] config_file You can override many parts of the implementation by customizing config file
14
+ def initialize config_file = DEFAULT_CONFIG_FILE_PATH
15
+ require 'yaml'
16
+ @config = YAML.load(File.read config_file)
17
+
18
+ [
19
+ @config['romanization']['post substitution'],
20
+ @config['number']['post substitution'],
21
+ @config['number']['alt notation']['post substitution']
22
+ ].each do |r|
23
+ r.keys.each do |k|
24
+ r[Regexp.compile k] = r.delete k
25
+ end
26
+ end
27
+ @config.freeze
28
+
29
+ @pronouncer = Korean::Pronouncer.send :new, self
30
+ end
31
+
32
+ # Array of chosung's.
33
+ #
34
+ # @return [Array] Array of chosung strings
35
+ def chosungs
36
+ config['structure']['chosung']
37
+ end
38
+
39
+ # Array of jungsung's.
40
+ # @return [Array] Array of jungsung strings
41
+ def jungsungs
42
+ config['structure']['jungsung']
43
+ end
44
+
45
+ # Array of jongsung's.
46
+ # @return [Array] Array of jongsung strings
47
+ def jongsungs
48
+ config['structure']['jongsung']
49
+ end
50
+
51
+ # Checks if the given character is a korean character.
52
+ # @param [String] ch A string of size 1
53
+ def korean_char? ch
54
+ raise ArgumentError.new('Lengthy input') if ch.length > 1
55
+
56
+ complete_korean_char?(ch) ||
57
+ (chosungs + jungsungs + jongsungs).include?(ch)
58
+ end
59
+
60
+ # Checks if the given character is a "complete" korean character.
61
+ # "Complete" Korean character must have chosung and jungsung, with optional jongsung.
62
+ # @param [String] ch A string of size 1
63
+ def complete_korean_char? ch
64
+ raise ArgumentError.new('Lengthy input') if ch.length > 1
65
+
66
+ # Range of Korean chracters in Unicode 2.0: AC00(가) ~ D7A3(힣)
67
+ ch.unpack('U').all? { | c | c >= 0xAC00 && c <= 0xD7A3 }
68
+ end
69
+
70
+ # Splits the given string into an array of Korean::Char's and Strings of length 1.
71
+ # @param [String] str Input string.
72
+ # @return [Array] Mixed array of Korean::Char instances and Strings of length 1 (for non-korean characters)
73
+ def dissect str
74
+ str.each_char.map { |c|
75
+ korean_char?(c) ? Korean::Char.new(self, c) : c
76
+ }
77
+ end
78
+
79
+ # Reads numeric expressions in Korean way.
80
+ # @param [String, Number] str Numeric type or String containing numeric expressions
81
+ # @return [String] Output string
82
+ def read_number str
83
+ nconfig = config['number']
84
+
85
+ str.to_s.gsub(/(([+-]\s*)?[0-9,]*,*[0-9]+(\.[0-9]+(e[+-][0-9]+)?)?)(\s*.)?/) {
86
+ read_number_sub($1, $5)
87
+ }
88
+ end
89
+
90
+ # Returns the pronunciation of the given string containing Korean characters.
91
+ # Takes optional options hash.
92
+ #
93
+ # @param [String] Input string
94
+ # @param [Boolean] options[:pronounce_each_char] Each character of the string is pronounced respectively.
95
+ # @param [Boolean] options[:slur] Strings separated by whitespaces are processed again as if they were contiguous.
96
+ # @param [Boolean] options[:number] Numberic parts of the string is also pronounced in Korean.
97
+ # @param [Array] options[:except] Allows you to skip certain transformations.
98
+ # @return [String] Output string
99
+ def pronounce str, options = {}
100
+ options = {
101
+ :pronounce_each_char => false,
102
+ :slur => false,
103
+ :number => true,
104
+ :except => [],
105
+ :debug => false
106
+ }.merge options
107
+
108
+ str = read_number(str) if options[:number]
109
+
110
+ result, transforms = @pronouncer.send :pronounce!, str, options
111
+
112
+ if options[:debug]
113
+ return result, transforms
114
+ else
115
+ return result
116
+ end
117
+ end
118
+
119
+ # Returns the romanization (alphabetical notation) of the given Korean string.
120
+ # http://en.wikipedia.org/wiki/Korean_romanization
121
+ # @param [String] str Input Korean string
122
+ # @param [Boolean] options[:as_pronounced] If true, #pronounce is internally called before romanize
123
+ # @param [Boolean] options[:number] Whether to read numeric expressions in the string
124
+ # @param [Boolean] options[:slur] Same as :slur in #pronounce
125
+ # @return [String] Output string in Roman Alphabet
126
+ # @see Korean#pronounce
127
+ def romanize str, options = {}
128
+ options = {
129
+ :as_pronounced => true,
130
+ :number => true,
131
+ :slur => false
132
+ }.merge options
133
+
134
+ require 'yaml'
135
+ rdata = config['romanization']
136
+ post_subs = rdata["post substitution"]
137
+ rdata = [rdata["chosung"], rdata["jungsung"], rdata["jongsung"]]
138
+
139
+ str = pronounce str,
140
+ :pronounce_each_char => !options[:as_pronounced],
141
+ :number => options[:number],
142
+ :slur => options[:slur],
143
+ # 제1항 [붙임 1] ‘ㅢ’는 ‘ㅣ’로 소리 나더라도 ‘ui’로 적는다.
144
+ :except => %w[rule_5_3]
145
+ dash = rdata[0]["ㅇ"]
146
+ romanization = ""
147
+
148
+ romanize_chunk = lambda do | chunk |
149
+ dissect(chunk).each do | kc |
150
+ kc.to_a.each_with_index do | comp, idx |
151
+ next if comp.nil?
152
+ comp = rdata[idx][comp] || comp
153
+ comp = comp[1..-1] if comp[0, 1] == dash &&
154
+ (romanization.empty? || romanization[-1, 1] =~ /\s/)
155
+ romanization += comp
156
+ end
157
+ end
158
+
159
+ return post_subs.keys.inject(romanization) { | output, pattern |
160
+ output.gsub(pattern, post_subs[pattern])
161
+ }
162
+ end
163
+
164
+ k_chunk = ""
165
+ str.each_char do | c |
166
+ if korean_char? c
167
+ k_chunk += c
168
+ else
169
+ unless k_chunk.empty?
170
+ romanization = romanize_chunk.call k_chunk
171
+ k_chunk = ""
172
+ end
173
+ romanization += c
174
+ end
175
+ end
176
+ romanization = romanize_chunk.call k_chunk unless k_chunk.empty?
177
+ romanization
178
+ end
169
179
 
170
180
  private
171
- def read_number_sub num, next_char
172
- nconfig = config['number']
173
-
174
- num = num.gsub(',', '')
175
- next_char = next_char.to_s
176
- is_float = num.match(/[\.e]/) != nil
177
-
178
- # Alternative notation for integers with proper suffix
179
- alt = false
180
- if is_float == false &&
181
- nconfig['alt notation']['when suffix'].keys.include?(next_char.strip)
182
- max = nconfig['alt notation']['when suffix'][next_char.strip]['max']
183
-
184
- if max.nil? || num.to_i <= max
185
- alt = true
186
- end
187
- end
188
-
189
- # Sign
190
- sign = []
191
- negative = false
192
- if num =~ /^-/
193
- num = num.sub(/^-\s*/, '')
194
- sign << nconfig['negative']
195
- negative = true
196
- elsif num =~ /^\+/
197
- num = num.sub(/^\+\s*/, '')
198
- sign << nconfig['positive']
199
- end
200
-
201
- if is_float
202
- below = nconfig['decimal point']
203
- below = nconfig['digits'][0] + below if num.to_f < 1
204
-
205
- if md = num.match(/(.*)e(.*)/)
206
- dp = md[1].index('.')
207
- num = md[1].tr '.', ''
208
- exp = md[2].to_i
209
-
210
- dp += exp
211
- if dp > num.length
212
- num = num.ljust(dp, '0')
213
- num = num.sub(/^0+([1-9])/, "\\1")
214
-
215
- below = ""
216
- elsif dp < 0
217
- num = '0.' + '0' * (-dp) + num
218
- else
219
- num[dp] = '.' + num[dp]
220
- end
221
- end
222
- num.sub(/.*\./, '').each_char do | char |
223
- below += nconfig['digits'][char.to_i]
224
- end if num.include? '.'
225
- num = num.sub(/\..*/, '')
226
- else
227
- below = ""
228
- end
229
-
230
- tokens = []
231
- unit_idx = -1
232
- num = num.to_i
233
- while num > 0
234
- v = num % 10000
235
-
236
- unit_idx += 1
237
- if v > 0
238
- if alt == false || unit_idx >= 1
239
- str = ""
240
- # Cannot use hash as they're unordered in 1.8
241
- [[1000, '천'],
242
- [100, '백'],
243
- [10, '십']].each do | arr |
244
- u, sub_unit = arr
245
- str += (nconfig['digits'][v/u] if v/u != 1).to_s + sub_unit + ' ' if v / u > 0
246
- v %= u
247
- end
248
- str += nconfig['digits'][v] if v > 0
249
-
250
- tokens << str.sub(/ $/, '') + nconfig['units'][unit_idx]
251
- else
252
- str = ""
253
- tenfolds = nconfig['alt notation']['tenfolds']
254
- digits = nconfig['alt notation']['digits']
255
- alt_post_subs = nconfig['alt notation']['post substitution']
256
-
257
- # Likewise.
258
- [[1000, '천'],
259
- [100, '백']].each do | u, sub_unit |
260
- str += (nconfig['digits'][v/u] if v/u != 1).to_s + sub_unit + ' ' if v / u > 0
261
- v %= u
262
- end
263
-
264
- str += tenfolds[(v / 10) - 1] if v / 10 > 0
265
- v %= 10
266
- str += digits[v] if v > 0
267
-
268
- alt_post_subs.each do | k, v |
269
- str.gsub!(k, v)
270
- end if alt
271
- tokens << str.sub(/ $/, '') + nconfig['units'][unit_idx]
272
- end
273
- end
274
- num /= 10000
275
- end
276
-
277
- tokens += sign unless sign.empty?
278
- ret = tokens.reverse.join(' ') + below + next_char
279
- nconfig['post substitution'].each do | k, v |
280
- ret.gsub!(k, v)
281
- end
282
- ret
283
- end
181
+ def read_number_sub num, next_char
182
+ nconfig = config['number']
183
+
184
+ if num == '0'
185
+ return nconfig['digits'].first
186
+ end
187
+
188
+ num = num.gsub(',', '')
189
+ next_char = next_char.to_s
190
+ is_float = num.match(/[\.e]/) != nil
191
+
192
+ # Alternative notation for integers with proper suffix
193
+ alt = false
194
+ if is_float == false &&
195
+ nconfig['alt notation']['when suffix'].keys.include?(next_char.strip)
196
+ max = nconfig['alt notation']['when suffix'][next_char.strip]['max']
197
+
198
+ if max.nil? || num.to_i <= max
199
+ alt = true
200
+ end
201
+ end
202
+
203
+ # Sign
204
+ sign = []
205
+ negative = false
206
+ if num =~ /^-/
207
+ num = num.sub(/^-\s*/, '')
208
+ sign << nconfig['negative']
209
+ negative = true
210
+ elsif num =~ /^\+/
211
+ num = num.sub(/^\+\s*/, '')
212
+ sign << nconfig['positive']
213
+ end
214
+
215
+ if is_float
216
+ below = nconfig['decimal point']
217
+ below = nconfig['digits'][0] + below if num.to_f < 1
218
+
219
+ if md = num.match(/(.*)e(.*)/)
220
+ dp = md[1].index('.')
221
+ num = md[1].tr '.', ''
222
+ exp = md[2].to_i
223
+
224
+ dp += exp
225
+ if dp > num.length
226
+ num = num.ljust(dp, '0')
227
+ num = num.sub(/^0+([1-9])/, "\\1")
228
+
229
+ below = ""
230
+ elsif dp < 0
231
+ num = '0.' + '0' * (-dp) + num
232
+ else
233
+ num[dp, 1] = '.' + num[dp, 1]
234
+ end
235
+ end
236
+ num.sub(/.*\./, '').each_char do | char |
237
+ below += nconfig['digits'][char.to_i]
238
+ end if num.include? '.'
239
+ num = num.sub(/\..*/, '')
240
+ else
241
+ below = ""
242
+ end
243
+
244
+ tokens = []
245
+ unit_idx = -1
246
+ num = num.to_i
247
+ while num > 0
248
+ v = num % 10000
249
+
250
+ unit_idx += 1
251
+ if v > 0
252
+ if alt == false || unit_idx >= 1
253
+ str = ""
254
+ # Cannot use hash as they're unordered in 1.8
255
+ [[1000, '천'],
256
+ [100, '백'],
257
+ [10, '십']].each do | arr |
258
+ u, sub_unit = arr
259
+ str += (nconfig['digits'][v/u] if v/u != 1).to_s + sub_unit + ' ' if v / u > 0
260
+ v %= u
261
+ end
262
+ str += nconfig['digits'][v] if v > 0
263
+
264
+ tokens << str.sub(/ $/, '') + nconfig['units'][unit_idx]
265
+ else
266
+ str = ""
267
+ tenfolds = nconfig['alt notation']['tenfolds']
268
+ digits = nconfig['alt notation']['digits']
269
+ alt_post_subs = nconfig['alt notation']['post substitution']
270
+
271
+ # Likewise.
272
+ [[1000, '천'],
273
+ [100, '백']].each do | u, sub_unit |
274
+ str += (nconfig['digits'][v/u] if v/u != 1).to_s + sub_unit + ' ' if v / u > 0
275
+ v %= u
276
+ end
277
+
278
+ str += tenfolds[(v / 10) - 1] if v / 10 > 0
279
+ v %= 10
280
+ str += digits[v] if v > 0
281
+
282
+ alt_post_subs.each do | k, v |
283
+ str.gsub!(k, v)
284
+ end if alt
285
+ tokens << str.sub(/ $/, '') + nconfig['units'][unit_idx]
286
+ end
287
+ end
288
+ num /= 10000
289
+ end
290
+
291
+ tokens += sign unless sign.empty?
292
+ ret = tokens.reverse.join(' ') + below + next_char
293
+ nconfig['post substitution'].each do | k, v |
294
+ ret.gsub!(k, v)
295
+ end
296
+ ret
297
+ end
284
298
  end#Korean
285
299
  end#Gimchi
286
300