gimchi 0.1.9 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: gimchi
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.9
4
+ version: 0.2.0
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,33 +9,11 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-02-20 00:00:00.000000000 Z
12
+ date: 2013-03-28 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
- name: bundler
16
- requirement: &2158357040 !ruby/object:Gem::Requirement
17
- none: false
18
- requirements:
19
- - - ~>
20
- - !ruby/object:Gem::Version
21
- version: 1.0.0
22
- type: :development
23
- prerelease: false
24
- version_requirements: *2158357040
25
- - !ruby/object:Gem::Dependency
26
- name: jeweler
27
- requirement: &2158358840 !ruby/object:Gem::Requirement
28
- none: false
29
- requirements:
30
- - - ~>
31
- - !ruby/object:Gem::Version
32
- version: 1.5.2
33
- type: :development
34
- prerelease: false
35
- version_requirements: *2158358840
36
- - !ruby/object:Gem::Dependency
37
- name: rcov
38
- requirement: &2158315180 !ruby/object:Gem::Requirement
15
+ name: ansi
16
+ requirement: !ruby/object:Gem::Requirement
39
17
  none: false
40
18
  requirements:
41
19
  - - ! '>='
@@ -43,42 +21,39 @@ dependencies:
43
21
  version: '0'
44
22
  type: :development
45
23
  prerelease: false
46
- version_requirements: *2158315180
47
- - !ruby/object:Gem::Dependency
48
- name: ansi
49
- requirement: &2154666280 !ruby/object:Gem::Requirement
24
+ version_requirements: !ruby/object:Gem::Requirement
50
25
  none: false
51
26
  requirements:
52
27
  - - ! '>='
53
28
  - !ruby/object:Gem::Version
54
- version: 1.2.2
55
- type: :development
56
- prerelease: false
57
- version_requirements: *2154666280
58
- description: Gimchi knows how to pronounce Korean strings and how to write them in
59
- roman alphabet.
60
- email: junegunn.c@gmail.com
29
+ version: '0'
30
+ description: A Ruby gem for Korean characters
31
+ email:
32
+ - junegunn.c@gmail.com
61
33
  executables: []
62
34
  extensions: []
63
- extra_rdoc_files:
64
- - LICENSE.txt
65
- - README.ko.markdown
66
- - README.markdown
35
+ extra_rdoc_files: []
67
36
  files:
37
+ - .document
38
+ - .gitignore
39
+ - CHANGELOG.rdoc
40
+ - Gemfile
41
+ - LICENSE.txt
42
+ - README.ko.md
43
+ - README.md
44
+ - Rakefile
68
45
  - config/default.yml
46
+ - crawler/crawler.rb
47
+ - gimchi.gemspec
69
48
  - lib/gimchi.rb
70
49
  - lib/gimchi/char.rb
71
- - lib/gimchi/korean.rb
72
50
  - lib/gimchi/patch_1.8.rb
73
51
  - lib/gimchi/pronouncer.rb
74
- - LICENSE.txt
75
- - README.ko.markdown
76
- - README.markdown
77
52
  - test/helper.rb
78
53
  - test/pronunciation.yml
79
54
  - test/romanization.yml
80
55
  - test/test_gimchi.rb
81
- homepage: http://github.com/junegunn/gimchi
56
+ homepage: https://github.com/junegunn/gimchi
82
57
  licenses:
83
58
  - MIT
84
59
  post_install_message:
@@ -91,9 +66,6 @@ required_ruby_version: !ruby/object:Gem::Requirement
91
66
  - - ! '>='
92
67
  - !ruby/object:Gem::Version
93
68
  version: '0'
94
- segments:
95
- - 0
96
- hash: -4061568131035276090
97
69
  required_rubygems_version: !ruby/object:Gem::Requirement
98
70
  none: false
99
71
  requirements:
@@ -102,10 +74,10 @@ required_rubygems_version: !ruby/object:Gem::Requirement
102
74
  version: '0'
103
75
  requirements: []
104
76
  rubyforge_project:
105
- rubygems_version: 1.8.15
77
+ rubygems_version: 1.8.25
106
78
  signing_key:
107
79
  specification_version: 3
108
- summary: Gimchi reads Korean.
80
+ summary: A Ruby gem for Korean characters
109
81
  test_files:
110
82
  - test/helper.rb
111
83
  - test/pronunciation.yml
@@ -1,155 +0,0 @@
1
- # gimchi
2
-
3
- Gimchi is a simple Ruby gem which knows how to handle Korean strings. It knows
4
- how to dissect Korean characters into their 3 components, namely chosung,
5
- jungsung and optional jongsung. It knows how Korean sentences are pronounced
6
- and how they're written in the Roman alphabet.
7
-
8
- Gimchi (only partially) implements the following rules dictated by
9
- The National Institute of The Korean Language (http://www.korean.go.kr)
10
- * Korean Standard Pronunciation
11
- * Korean Romanization
12
-
13
- ## Installation
14
- ```
15
- gem install gimchi
16
- ```
17
-
18
- ## Usage
19
-
20
- ### Creating Gimchi::Korean instance
21
- ```ruby
22
- require 'gimchi'
23
-
24
- ko = Gimchi::Korean.new
25
- ```
26
-
27
- ### Checks if the given character is in Korean alphabet
28
- ```ruby
29
- ko.korean_char? 'ㄱ' # true
30
- ko.complete_korean_char? 'ㄱ' # false
31
-
32
- ko.korean_char? 'ㅏ' # true
33
- ko.complete_korean_char? 'ㅏ' # false
34
-
35
- ko.korean_char? '가' # true
36
- ko.complete_korean_char? '가' # true
37
-
38
- # Shorthand of korean_char?
39
- ko.kchar? '가' # true
40
- ```
41
-
42
- ### Usage of Gimchi::Korean::Char
43
- ```ruby
44
- kc = ko.kchar "한"
45
- kc.class # Gimchi::Korean::Char
46
-
47
- kc.chosung # "ㅎ"
48
- kc.jungsung # "ㅏ"
49
- kc.jongsung # "ㄴ"
50
- kc.to_a # ["ㅎ", "ㅏ", "ㄴ"]
51
- kc.to_s # "한"
52
-
53
- kc.complete? # true
54
- kc.partial? # false
55
- ko.kchar("ㅏ").partial? # true
56
-
57
- # Modifying its elements
58
- kc.chosung = 'ㄷ'
59
- kc.jongsung = 'ㄹ'
60
- kc.to_s # "달"
61
- kc.complete? # true
62
- kc.partial? # false
63
-
64
- kc.chosung = nil
65
- kc.jongsung = nil
66
- kc.complete? # false
67
- kc.partial? # true
68
-
69
- # Alias of kchar
70
- kc = ko.korean_char "한"
71
-
72
- # Array of Gimchi::Korean::Char's
73
- arr = ko.convert '이것은 한글입니다.'
74
- # [이, 것, 은, " ", 한, 글, 입, 니, 다, "."]
75
-
76
- arr[0].class # Gimchi::Korean::Char
77
-
78
- # Dissects given String
79
- arr = ko.dissect '이것은 한글입니다.'
80
- # ["ㅇ", "ㅣ", "ㄱ", "ㅓ", "ㅅ", "ㅇ", "ㅡ", "ㄴ", " ",
81
- # "H", "a", "n", "g", "u", "l", " ", "ㅇ", "ㅣ", "ㅂ",
82
- # "ㄴ", "ㅣ", "ㄷ", "ㅏ", "."]
83
- ```
84
-
85
- ### Reading numbers in Korean
86
- ```ruby
87
- ko.read_number(1999) # "천 구백 구십 구"
88
- ko.read_number(- 100.123) # "마이너스 백점일이삼"
89
- ko.read_number("153,191,100,678.3214")
90
- # "천 오백 삼십 일억 구천 백 십만 육백 칠십 팔점삼이일사"
91
-
92
- # Age, Time ( -살, -시 )
93
- ko.read_number("20살") # "스무살"
94
- ko.read_number("13 살") # "열세 살"
95
- ko.read_number("7시 30분") # "일곱시 삼십분"
96
- ```
97
-
98
- ### Standard pronunciation (partially implemented)
99
- ```ruby
100
- str = "됐어 됐어 이제 그런 가르침은 됐어 매일 아침 7 시 30 분까지 우릴 조그만 교실로 몰아넣고"
101
- ko.pronounce str
102
- # "돼써 돼써 이제 그런 가르치믄 돼써 매일 아침 일곱 시 삼십 분까지 우릴 조그만 교실로 모라너코"
103
-
104
- ko.pronounce str, :slur => true
105
- # "돼써 돼써 이제 그런 가르치믄 돼써 매이 라치 밀곱 씨 삼십 뿐까지 우릴 조그만 교실로 모라너코"
106
-
107
- ko.pronounce str, :pronounce_each_char => true
108
- # "됃어 됃어 이제 그런 가르침은 됃어 매일 아침 일곱 시 삼십 분까지 우릴 조그만 교실로 몰아너고"
109
-
110
- ko.pronounce str, :number => false
111
- # "돼써 돼써 이제 그런 가르치믄 돼써 매일 아침 7 시 30 분까지 우릴 조그만 교실로 모라너코"
112
- ```
113
-
114
- ### Romanization (partially implemented)
115
- ```ruby
116
- str = "됐어 됐어 이제 그런 가르침은 됐어 매일 아침 7 시 30 분까지 우릴 조그만 교실로 몰아넣고"
117
-
118
- ko.romanize str
119
- # "Dwaesseo dwaesseo ije geureon gareuchimeun dwaesseo mae-il achim ilgop si samsip bunkkaji uril jogeuman gyosillo moraneoko"
120
- ko.romanize str, :slur => true
121
- # "Dwaesseo dwaesseo ije geureon gareuchimeun dwaesseo mae-i rachi milgop ssi samsip ppunkkaji uril jogeuman gyosillo moraneoko"
122
- ko.romanize str, :as_pronounced => false
123
- # "Dwaet-eo dwaet-eo ije geureon gareuchim-eun dwaet-eo mae-il achim ilgop si samsip bunkkaji uril jogeuman gyosillo mol-aneogo"
124
- ko.romanize str, :number => false
125
- # "Dwaesseo dwaesseo ije geureon gareuchimeun dwaesseo mae-il achim 7 si 30 bunkkaji uril jogeuman gyosillo moraneoko"
126
- ```
127
-
128
- ## Limitation of the implementation
129
-
130
- Unfortunately in order to implement the complete specification of Korean
131
- pronunciation and romanization, we need NLP, huge Korean dictionaries and even
132
- semantic analysis of the given string. And even with all that complex
133
- processing, we cannot guarantee 100% accuracy of the output. So yes, that is
134
- definitely not what this gem tries to achieve. Gimchi tries to achieve "some"
135
- level of accuracy with relatively simple code.
136
-
137
- Currently, Gimchi code contains a lot of ad-hoc (possibly invalid) patches
138
- that try to improve the quality of the output, which should better be
139
- refactored anytime soon.
140
-
141
- ## Contributing to gimchi
142
-
143
- * Check out the latest master to make sure the feature hasn't been implemented or the bug hasn't been fixed yet
144
- * Check out the issue tracker to make sure no one has already requested it and/or contributed it
145
- * Fork the project
146
- * Start a feature/bugfix branch
147
- * Commit and push until you are happy with your contribution
148
- * Make sure to add tests for it. This is important so I don't break it in a future version unintentionally.
149
- * Please try not to mess with the Rakefile, version, or history. If you want to have your own version, or it is otherwise necessary, that is fine, but please isolate to its own commit so I can cherry-pick around it.
150
-
151
- ## Copyright
152
-
153
- Copyright (c) 2011 Junegunn Choi. See LICENSE.txt for
154
- further details.
155
-
@@ -1,323 +0,0 @@
1
- # encoding: UTF-8
2
-
3
- module Gimchi
4
- class Korean
5
- DEFAULT_CONFIG_FILE_PATH =
6
- File.dirname(__FILE__) + '/../../config/default.yml'
7
-
8
- # Returns the YAML configuration used by this Korean instance.
9
- # @return [String]
10
- attr_reader :config
11
-
12
- # Initialize Gimchi::Korean.
13
- # @param [String] config_file You can override many parts of the implementation by customizing config file
14
- def initialize config_file = DEFAULT_CONFIG_FILE_PATH
15
- require 'yaml'
16
- @config = YAML.load(File.read config_file)
17
-
18
- [
19
- @config['romanization']['post substitution'],
20
- @config['number']['post substitution'],
21
- @config['number']['alt notation']['post substitution']
22
- ].each do |r|
23
- r.keys.each do |k|
24
- r[Regexp.compile k] = r.delete k
25
- end
26
- end
27
- @config.freeze
28
-
29
- @pronouncer = Korean::Pronouncer.send :new, self
30
- end
31
-
32
- # Array of chosung's.
33
- #
34
- # @return [Array] Array of chosung strings
35
- def chosungs
36
- config['structure']['chosung']
37
- end
38
-
39
- # Array of jungsung's.
40
- # @return [Array] Array of jungsung strings
41
- def jungsungs
42
- config['structure']['jungsung']
43
- end
44
-
45
- # Array of jongsung's.
46
- # @return [Array] Array of jongsung strings
47
- def jongsungs
48
- config['structure']['jongsung']
49
- end
50
-
51
- # Checks if the given character is a korean character.
52
- # @param [String] ch A string of size 1
53
- def korean_char? ch
54
- raise ArgumentError.new('Lengthy input') if str_length(ch) > 1
55
-
56
- complete_korean_char?(ch) ||
57
- (chosungs + jungsungs + jongsungs).include?(ch)
58
- end
59
- alias kchar? korean_char?
60
-
61
- # Checks if the given character is a "complete" korean character.
62
- # "Complete" Korean character must have chosung and jungsung, with optional jongsung.
63
- # @param [String] ch A string of size 1
64
- def complete_korean_char? ch
65
- raise ArgumentError.new('Lengthy input') if str_length(ch) > 1
66
-
67
- # Range of Korean characters in Unicode 2.0: AC00(가) ~ D7A3(힣)
68
- ch.unpack('U').all? { | c | c >= 0xAC00 && c <= 0xD7A3 }
69
- end
70
-
71
- # Splits the given string into an array of Korean::Char's and Strings of length 1.
72
- # @param [String] str Input string.
73
- # @return [Array] Mixed array of Korean::Char instances and Strings of length 1 (for non-korean characters)
74
- def convert str
75
- str.each_char.map { |c|
76
- korean_char?(c) ? kchar(c) : c
77
- }
78
- end
79
-
80
- # Splits the given string into an array of Korean character components
81
- # @param [String] str Input string.
82
- # @return [Array] Array of Korean character components
83
- def dissect str
84
- str.each_char.inject([]) { |arr, c|
85
- arr += korean_char?(c) ? kchar(c).to_a.compact : [c]
86
- }
87
- end
88
-
89
- # Returns a Korean::Char object for the given Korean character.
90
- # @param [String] ch Korean character in String
91
- # @return [Korean::Char] Korean::Char instance
92
- def kchar ch
93
- Korean::Char.new(self, ch)
94
- end
95
- alias korean_char kchar
96
-
97
- # Reads numeric expressions in Korean way.
98
- # @param [String, Number] str Numeric type or String containing numeric expressions
99
- # @return [String] Output string
100
- def read_number str
101
- nconfig = config['number']
102
-
103
- str.to_s.gsub(/(([+-]\s*)?[0-9,]*,*[0-9]+(\.[0-9]+(e[+-][0-9]+)?)?)(\s*.)?/) {
104
- read_number_sub($1, $5)
105
- }
106
- end
107
-
108
- # Returns the pronunciation of the given string containing Korean characters.
109
- # Takes optional options hash.
110
- #
111
- # @param [String] Input string
112
- # @param [Boolean] options[:pronounce_each_char] Each character of the string is pronounced respectively.
113
- # @param [Boolean] options[:slur] Strings separated by whitespaces are processed again as if they were contiguous.
114
- # @param [Boolean] options[:number] Numeric parts of the string are also pronounced in Korean.
115
- # @param [Array] options[:except] Allows you to skip certain transformations.
116
- # @return [String] Output string
117
- def pronounce str, options = {}
118
- options = {
119
- :pronounce_each_char => false,
120
- :slur => false,
121
- :number => true,
122
- :except => [],
123
- :debug => false
124
- }.merge options
125
-
126
- str = read_number(str) if options[:number]
127
-
128
- result, transforms = @pronouncer.send :pronounce!, str, options
129
-
130
- if options[:debug]
131
- return result, transforms
132
- else
133
- return result
134
- end
135
- end
136
-
137
- # Returns the romanization (alphabetical notation) of the given Korean string.
138
- # http://en.wikipedia.org/wiki/Korean_romanization
139
- # @param [String] str Input Korean string
140
- # @param [Boolean] options[:as_pronounced] If true, #pronounce is internally called before romanize
141
- # @param [Boolean] options[:number] Whether to read numeric expressions in the string
142
- # @param [Boolean] options[:slur] Same as :slur in #pronounce
143
- # @return [String] Output string in Roman Alphabet
144
- # @see Korean#pronounce
145
- def romanize str, options = {}
146
- options = {
147
- :as_pronounced => true,
148
- :number => true,
149
- :slur => false
150
- }.merge options
151
-
152
- require 'yaml'
153
- rdata = config['romanization']
154
- post_subs = rdata["post substitution"]
155
- rdata = [rdata["chosung"], rdata["jungsung"], rdata["jongsung"]]
156
-
157
- str = pronounce str,
158
- :pronounce_each_char => !options[:as_pronounced],
159
- :number => options[:number],
160
- :slur => options[:slur],
161
- # 제1항 [붙임 1] ‘ㅢ’는 ‘ㅣ’로 소리 나더라도 ‘ui’로 적는다.
162
- :except => %w[rule_5_3]
163
- dash = rdata[0]["ㅇ"]
164
- romanization = ""
165
-
166
- romanize_chunk = lambda do | chunk |
167
- convert(chunk).each do | kc |
168
- kc.to_a.each_with_index do | comp, idx |
169
- next if comp.nil?
170
- comp = rdata[idx][comp] || comp
171
- comp = comp[1..-1] if comp[0, 1] == dash &&
172
- (romanization.empty? || romanization[-1, 1] =~ /\s/)
173
- romanization += comp
174
- end
175
- end
176
-
177
- return post_subs.keys.inject(romanization) { | output, pattern |
178
- output.gsub(pattern, post_subs[pattern])
179
- }
180
- end
181
-
182
- k_chunk = ""
183
- str.each_char do | c |
184
- if korean_char? c
185
- k_chunk += c
186
- else
187
- unless k_chunk.empty?
188
- romanization = romanize_chunk.call k_chunk
189
- k_chunk = ""
190
- end
191
- romanization += c
192
- end
193
- end
194
- romanization = romanize_chunk.call k_chunk unless k_chunk.empty?
195
- romanization
196
- end
197
-
198
- private
199
- def str_length str
200
- str.length
201
- end
202
-
203
- def read_number_sub num, next_char
204
- nconfig = config['number']
205
-
206
- if num == '0'
207
- return nconfig['digits'].first
208
- end
209
-
210
- num = num.gsub(',', '')
211
- next_char = next_char.to_s
212
- is_float = num.match(/[\.e]/) != nil
213
-
214
- # Alternative notation for integers with proper suffix
215
- alt = false
216
- if is_float == false &&
217
- nconfig['alt notation']['when suffix'].keys.include?(next_char.strip)
218
- max = nconfig['alt notation']['when suffix'][next_char.strip]['max']
219
-
220
- if max.nil? || num.to_i <= max
221
- alt = true
222
- end
223
- end
224
-
225
- # Sign
226
- sign = []
227
- negative = false
228
- if num =~ /^-/
229
- num = num.sub(/^-\s*/, '')
230
- sign << nconfig['negative']
231
- negative = true
232
- elsif num =~ /^\+/
233
- num = num.sub(/^\+\s*/, '')
234
- sign << nconfig['positive']
235
- end
236
-
237
- if is_float
238
- below = nconfig['decimal point']
239
- below = nconfig['digits'][0] + below if num.to_f < 1
240
-
241
- if md = num.match(/(.*)e(.*)/)
242
- dp = md[1].index('.')
243
- num = md[1].tr '.', ''
244
- exp = md[2].to_i
245
-
246
- dp += exp
247
- if dp > num.length
248
- num = num.ljust(dp, '0')
249
- num = num.sub(/^0+([1-9])/, "\\1")
250
-
251
- below = ""
252
- elsif dp < 0
253
- num = '0.' + '0' * (-dp) + num
254
- else
255
- num[dp, 1] = '.' + num[dp, 1]
256
- end
257
- end
258
- num.sub(/.*\./, '').each_char do | char |
259
- below += nconfig['digits'][char.to_i]
260
- end if num.include? '.'
261
- num = num.sub(/\..*/, '')
262
- else
263
- below = ""
264
- end
265
-
266
- tokens = []
267
- unit_idx = -1
268
- num = num.to_i
269
- while num > 0
270
- v = num % 10000
271
-
272
- unit_idx += 1
273
- if v > 0
274
- if alt == false || unit_idx >= 1
275
- str = ""
276
- # Cannot use hash as they're unordered in 1.8
277
- [[1000, '천'],
278
- [100, '백'],
279
- [10, '십']].each do | arr |
280
- u, sub_unit = arr
281
- str += (nconfig['digits'][v/u] if v/u != 1).to_s + sub_unit + ' ' if v / u > 0
282
- v %= u
283
- end
284
- str += nconfig['digits'][v] if v > 0
285
-
286
- tokens << str.sub(/ $/, '') + nconfig['units'][unit_idx]
287
- else
288
- str = ""
289
- tenfolds = nconfig['alt notation']['tenfolds']
290
- digits = nconfig['alt notation']['digits']
291
- alt_post_subs = nconfig['alt notation']['post substitution']
292
-
293
- # Likewise.
294
- [[1000, '천'],
295
- [100, '백']].each do | u, sub_unit |
296
- str += (nconfig['digits'][v/u] if v/u != 1).to_s + sub_unit + ' ' if v / u > 0
297
- v %= u
298
- end
299
-
300
- str += tenfolds[(v / 10) - 1] if v / 10 > 0
301
- v %= 10
302
- str += digits[v] if v > 0
303
-
304
- alt_post_subs.each do | k, v |
305
- str.gsub!(k, v)
306
- end if alt
307
- tokens << str.sub(/ $/, '') + nconfig['units'][unit_idx]
308
- end
309
- end
310
- num /= 10000
311
- end
312
-
313
- tokens += sign unless sign.empty?
314
- ret = tokens.reverse.join(' ') + below + next_char
315
- nconfig['post substitution'].each do | k, v |
316
- ret.gsub!(k, v)
317
- end
318
- ret
319
- end
320
- end#Korean
321
- end#Gimchi
322
-
323
-