ve 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,293 @@
+ # Encoding: UTF-8
+
+ class Ve
+   class Provider
+     class JapaneseTransliterators < Ve::Provider
+
+       def initialize(config = {})
+       end
+
+       def works?
+         true
+       end
+
+       def parse(text, options = {})
+         Ve::Parse::JapaneseTransliterators.new(text)
+       end
+
+     end
+   end
+ end
+
+ class Ve
+   class Parse
+     class JapaneseTransliterators < Ve::Parse
+
+       H_SYLLABIC_N = 'ん'
+       H_SMALL_TSU = 'っ'
+
+       HIRA_TO_LATN = {
+         "あ"=>"a", "い"=>"i", "う"=>"u", "え"=>"e", "お"=>"o",
+         "か"=>"ka", "き"=>"ki", "く"=>"ku", "け"=>"ke", "こ"=>"ko",
+         "が"=>"ga", "ぎ"=>"gi", "ぐ"=>"gu", "げ"=>"ge", "ご"=>"go",
+         "さ"=>"sa", "し"=>"shi", "す"=>"su", "せ"=>"se", "そ"=>"so",
+         "ざ"=>"za", "じ"=>"ji", "ず"=>"zu", "ぜ"=>"ze", "ぞ"=>"zo",
+         "た"=>"ta", "ち"=>"chi", "つ"=>"tsu", "て"=>"te", "と"=>"to",
+         "だ"=>"da", "ぢ"=>"ji", "づ"=>"zu", "で"=>"de", "ど"=>"do",
+         "な"=>"na", "に"=>"ni", "ぬ"=>"nu", "ね"=>"ne", "の"=>"no",
+         "は"=>"ha", "ひ"=>"hi", "ふ"=>"fu", "へ"=>"he", "ほ"=>"ho",
+         "ば"=>"ba", "び"=>"bi", "ぶ"=>"bu", "べ"=>"be", "ぼ"=>"bo",
+         "ぱ"=>"pa", "ぴ"=>"pi", "ぷ"=>"pu", "ぺ"=>"pe", "ぽ"=>"po",
+         "ま"=>"ma", "み"=>"mi", "む"=>"mu", "め"=>"me", "も"=>"mo",
+         "や"=>"ya", "ゆ"=>"yu", "よ"=>"yo",
+         "ら"=>"ra", "り"=>"ri", "る"=>"ru", "れ"=>"re", "ろ"=>"ro",
+         "わ"=>"wa", "うぃ"=>"whi", "うぇ"=>"whe", "を"=>"wo",
+         "ゑ"=>"wye", "ゐ"=>"wyi", "ー"=>"-", "ん"=>"n",
+
+         "きゃ"=>"kya", "きゅ"=>"kyu", "きょ"=>"kyo", "きぇ"=>"kye", "きぃ"=>"kyi",
+         "ぎゃ"=>"gya", "ぎゅ"=>"gyu", "ぎょ"=>"gyo", "ぎぇ"=>"gye", "ぎぃ"=>"gyi",
+         "くぁ"=>"kwa", "くぃ"=>"kwi", "くぅ"=>"kwu", "くぇ"=>"kwe", "くぉ"=>"kwo",
+         "ぐぁ"=>"qwa", "ぐぃ"=>"gwi", "ぐぅ"=>"gwu", "ぐぇ"=>"gwe", "ぐぉ"=>"gwo",
+         "しゃ"=>"sha", "しぃ"=>"syi", "しゅ"=>"shu", "しぇ"=>"she", "しょ"=>"sho",
+         "じゃ"=>"jya", "じゅ"=>"zyu", "じぇ"=>"zye", "じょ"=>"zyo", "じぃ"=>"zyi",
+         "すぁ"=>"swa", "すぃ"=>"swi", "すぅ"=>"swu", "すぇ"=>"swe", "すぉ"=>"swo",
+         "ちゃ"=>"tya", "ちゅ"=>"tyu", "ちぇ"=>"tye", "ちょ"=>"tyo", "ちぃ"=>"tyi",
+         "ぢゃ"=>"dya", "ぢぃ"=>"dyi", "ぢゅ"=>"dyu", "ぢぇ"=>"dye", "ぢょ"=>"dyo",
+         "つぁ"=>"tsa", "つぃ"=>"tsi", "つぇ"=>"tse", "つぉ"=>"tso", "てゃ"=>"tha",
+         "てぃ"=>"thi", "てゅ"=>"thu", "てぇ"=>"the", "てょ"=>"tho", "とぁ"=>"twa",
+         "とぃ"=>"twi", "とぅ"=>"twu", "とぇ"=>"twe", "とぉ"=>"two", "でゃ"=>"dha",
+         "でぃ"=>"dhi", "でゅ"=>"dhu", "でぇ"=>"dhe", "でょ"=>"dho", "どぁ"=>"dwa",
+         "どぃ"=>"dwi", "どぅ"=>"dwu", "どぇ"=>"dwe", "どぉ"=>"dwo", "にゃ"=>"nya",
+         "にゅ"=>"nyu", "にょ"=>"nyo", "にぇ"=>"nye", "にぃ"=>"nyi", "ひゃ"=>"hya",
+         "ひぃ"=>"hyi", "ひゅ"=>"hyu", "ひぇ"=>"hye", "ひょ"=>"hyo", "びゃ"=>"bya",
+         "びぃ"=>"byi", "びゅ"=>"byu", "びぇ"=>"bye", "びょ"=>"byo", "ぴゃ"=>"pya",
+         "ぴぃ"=>"pyi", "ぴゅ"=>"pyu", "ぴぇ"=>"pye", "ぴょ"=>"pyo", "ふぁ"=>"fwa",
+         "ふぃ"=>"fyi", "ふぇ"=>"fye", "ふぉ"=>"fwo", "ふぅ"=>"fwu", "ふゃ"=>"fya",
+         "ふゅ"=>"fyu", "ふょ"=>"fyo", "みゃ"=>"mya", "みぃ"=>"myi", "みゅ"=>"myu",
+         "みぇ"=>"mye", "みょ"=>"myo", "りゃ"=>"rya", "りぃ"=>"ryi", "りゅ"=>"ryu",
+         "りぇ"=>"rye", "りょ"=>"ryo",
+         "ゔぁ"=>"va", "ゔぃ"=>"vyi", "ゔ"=>"vu", "ゔぇ"=>"vye", "ゔぉ"=>"vo",
+         "ゔゃ"=>"vya", "ゔゅ"=>"vyu", "ゔょ"=>"vyo",
+         "うぁ"=>"wha", "いぇ"=>"ye", "うぉ"=>"who",
+         "ぁ"=>"xa", "ぃ"=>"xi", "ぅ"=>"xu", "ぇ"=>"xe", "ぉ"=>"xo",
+         "ゕ"=>"xka", "ゖ"=>"xke", "ゎ"=>"xwa"
+       }
+
+       LATN_TO_HIRA = {
+         'a' => 'あ', 'i' => 'い', 'u' => 'う', 'e' => 'え', 'o' => 'お',
+         'ka' => 'か', 'ki' => 'き', 'ku' => 'く', 'ke' => 'け', 'ko' => 'こ',
+         'ga' => 'が', 'gi' => 'ぎ', 'gu' => 'ぐ', 'ge' => 'げ', 'go' => 'ご',
+         'sa' => 'さ', 'si' => 'し', 'shi' => 'し', 'su' => 'す', 'se' => 'せ', 'so' => 'そ',
+         'za' => 'ざ', 'zi' => 'じ', 'ji' => 'じ', 'zu' => 'ず', 'ze' => 'ぜ', 'zo' => 'ぞ',
+         'ta' => 'た', 'ti' => 'ち', 'chi' => 'ち', 'tu' => 'つ', 'tsu'=> 'つ', 'te' => 'て', 'to' => 'と',
+         'da' => 'だ', 'di' => 'ぢ', 'du' => 'づ', 'dzu'=> 'づ', 'de' => 'で', 'do' => 'ど',
+         'na' => 'な', 'ni' => 'に', 'nu' => 'ぬ', 'ne' => 'ね', 'no' => 'の',
+         'ha' => 'は', 'hi' => 'ひ', 'hu' => 'ふ', 'fu' => 'ふ', 'he' => 'へ', 'ho' => 'ほ',
+         'ba' => 'ば', 'bi' => 'び', 'bu' => 'ぶ', 'be' => 'べ', 'bo' => 'ぼ',
+         'pa' => 'ぱ', 'pi' => 'ぴ', 'pu' => 'ぷ', 'pe' => 'ぺ', 'po' => 'ぽ',
+         'ma' => 'ま', 'mi' => 'み', 'mu' => 'む', 'me' => 'め', 'mo' => 'も',
+         'ya' => 'や', 'yu' => 'ゆ', 'yo' => 'よ',
+         'ra' => 'ら', 'ri' => 'り', 'ru' => 'る', 're' => 'れ', 'ro' => 'ろ',
+         'la' => 'ら', 'li' => 'り', 'lu' => 'る', 'le' => 'れ', 'lo' => 'ろ',
+         'wa' => 'わ', 'wi' => 'うぃ', 'we' => 'うぇ', 'wo' => 'を',
+         'wye' => 'ゑ', 'wyi' => 'ゐ', '-' => 'ー',
+
+         'n' => 'ん', 'nn' => 'ん', "n'"=> 'ん',
+
+         'kya' => 'きゃ', 'kyu' => 'きゅ', 'kyo' => 'きょ', 'kye' => 'きぇ', 'kyi' => 'きぃ',
+         'gya' => 'ぎゃ', 'gyu' => 'ぎゅ', 'gyo' => 'ぎょ', 'gye' => 'ぎぇ', 'gyi' => 'ぎぃ',
+         'kwa' => 'くぁ', 'kwi' => 'くぃ', 'kwu' => 'くぅ', 'kwe' => 'くぇ', 'kwo' => 'くぉ',
+         'gwa' => 'ぐぁ', 'gwi' => 'ぐぃ', 'gwu' => 'ぐぅ', 'gwe' => 'ぐぇ', 'gwo' => 'ぐぉ',
+         'qwa' => 'ぐぁ', 'gwi' => 'ぐぃ', 'gwu' => 'ぐぅ', 'gwe' => 'ぐぇ', 'gwo' => 'ぐぉ',
+
+         'sya' => 'しゃ', 'syi' => 'しぃ', 'syu' => 'しゅ', 'sye' => 'しぇ', 'syo' => 'しょ',
+         'sha' => 'しゃ', 'shu' => 'しゅ', 'she' => 'しぇ', 'sho' => 'しょ',
+         'ja' => 'じゃ', 'ju' => 'じゅ', 'je' => 'じぇ', 'jo' => 'じょ',
+         'jya' => 'じゃ', 'jyi' => 'じぃ', 'jyu' => 'じゅ', 'jye' => 'じぇ', 'jyo' => 'じょ',
+         'zya' => 'じゃ', 'zyu' => 'じゅ', 'zyo' => 'じょ', 'zye' => 'じぇ', 'zyi' => 'じぃ',
+         'swa' => 'すぁ', 'swi' => 'すぃ', 'swu' => 'すぅ', 'swe' => 'すぇ', 'swo' => 'すぉ',
+
+         'cha' => 'ちゃ', 'chu' => 'ちゅ', 'che' => 'ちぇ', 'cho' => 'ちょ',
+         'cya' => 'ちゃ', 'cyi' => 'ちぃ', 'cyu' => 'ちゅ', 'cye' => 'ちぇ', 'cyo' => 'ちょ',
+         'tya' => 'ちゃ', 'tyi' => 'ちぃ', 'tyu' => 'ちゅ', 'tye' => 'ちぇ', 'tyo' => 'ちょ',
+         'dya' => 'ぢゃ', 'dyi' => 'ぢぃ', 'dyu' => 'ぢゅ', 'dye' => 'ぢぇ', 'dyo' => 'ぢょ',
+         'tsa' => 'つぁ', 'tsi' => 'つぃ', 'tse' => 'つぇ', 'tso' => 'つぉ',
+         'tha' => 'てゃ', 'thi' => 'てぃ', 'thu' => 'てゅ', 'the' => 'てぇ', 'tho' => 'てょ',
+         'twa' => 'とぁ', 'twi' => 'とぃ', 'twu' => 'とぅ', 'twe' => 'とぇ', 'two' => 'とぉ',
+         'dha' => 'でゃ', 'dhi' => 'でぃ', 'dhu' => 'でゅ', 'dhe' => 'でぇ', 'dho' => 'でょ',
+         'dwa' => 'どぁ', 'dwi' => 'どぃ', 'dwu' => 'どぅ', 'dwe' => 'どぇ', 'dwo' => 'どぉ',
+
+         'nya' => 'にゃ', 'nyu' => 'にゅ', 'nyo' => 'にょ', 'nye' => 'にぇ', 'nyi' => 'にぃ',
+
+         'hya' => 'ひゃ', 'hyi' => 'ひぃ', 'hyu' => 'ひゅ', 'hye' => 'ひぇ', 'hyo' => 'ひょ',
+         'bya' => 'びゃ', 'byi' => 'びぃ', 'byu' => 'びゅ', 'bye' => 'びぇ', 'byo' => 'びょ',
+         'pya' => 'ぴゃ', 'pyi' => 'ぴぃ', 'pyu' => 'ぴゅ', 'pye' => 'ぴぇ', 'pyo' => 'ぴょ',
+         'fa' => 'ふぁ', 'fi' => 'ふぃ', 'fe' => 'ふぇ', 'fo' => 'ふぉ',
+         'fwa' => 'ふぁ', 'fwi' => 'ふぃ', 'fwu' => 'ふぅ', 'fwe' => 'ふぇ', 'fwo' => 'ふぉ',
+         'fya' => 'ふゃ', 'fyi' => 'ふぃ', 'fyu' => 'ふゅ', 'fye' => 'ふぇ', 'fyo' => 'ふょ',
+
+         'mya' => 'みゃ', 'myi' => 'みぃ', 'myu' => 'みゅ', 'mye' => 'みぇ', 'myo' => 'みょ',
+
+         'rya' => 'りゃ', 'ryi' => 'りぃ', 'ryu' => 'りゅ', 'rye' => 'りぇ', 'ryo' => 'りょ',
+         'lya' => 'りゃ', 'lyu' => 'りゅ', 'lyo' => 'りょ', 'lye' => 'りぇ', 'lyi' => 'りぃ',
+
+         'va' => 'ゔぁ', 'vi' => 'ゔぃ', 'vu' => 'ゔ', 've' => 'ゔぇ', 'vo' => 'ゔぉ',
+         'vya' => 'ゔゃ', 'vyi' => 'ゔぃ', 'vyu' => 'ゔゅ', 'vye' => 'ゔぇ', 'vyo' => 'ゔょ',
+         'wha' => 'うぁ', 'whi' => 'うぃ', 'ye' => 'いぇ', 'whe' => 'うぇ', 'who' => 'うぉ',
+
+         'xa' => 'ぁ', 'xi' => 'ぃ', 'xu' => 'ぅ', 'xe' => 'ぇ', 'xo' => 'ぉ',
+         'xya' => 'ゃ', 'xyu' => 'ゅ', 'xyo' => 'ょ',
+         'xtu' => 'っ', 'xtsu' => 'っ',
+         'xka' => 'ゕ', 'xke' => 'ゖ', 'xwa' => 'ゎ',
+
+         '@@' => ' ', '#[' => '「', '#]' => '」', '#,' => '、', '#.' => '。', '#/' => '・',
+       }
+
+       attr_reader :tokens, :text
+
+       def initialize(text)
+         @tokens = []
+         @text = text
+       end
+
+       def transliterate_from_hrkt_to_latn
+         @text = transliterate_from_kana_to_hira
+         transliterate_from_hira_to_latn
+       end
+
+       def transliterate_from_hira_to_latn
+         # Hepburn style romaji
+         kana = @text.dup
+         romaji = ''
+         geminate = false
+
+         while kana.length > 0
+           [2, 1].each do |length|
+             mora = ''
+             for_conversion = kana[0, length]
+
+             if for_conversion == H_SMALL_TSU
+               geminate = true
+               kana[0, length] = ''
+               break
+             elsif for_conversion == H_SYLLABIC_N && kana[1, 1].match(/[やゆよ]/)
+               # Syllabic N before ya, yu or yo
+               mora = "n'"
+             elsif HIRA_TO_LATN[for_conversion]
+               # Generic cases
+               mora = HIRA_TO_LATN[for_conversion]
+             end
+
+             if mora.length > 0
+               if geminate
+                 geminate = false
+                 romaji << mora[0, 1]
+               end
+               romaji << mora
+               kana[0, length] = ''
+               break
+             elsif length == 1
+               # Nothing found
+               romaji << for_conversion
+               kana[0, length] = ''
+             end
+           end
+         end
+
+         return romaji
+       end
+
+       def transliterate_from_latn_to_hrkt
+         romaji = @text.dup
+         kana = ''
+
+         romaji.gsub!(/m([BbPp])/, 'n\1')
+         romaji.gsub!(/M([BbPp])/, 'N\1')
+
+         while romaji.length > 0
+           [3, 2, 1].each do |length|
+             mora = ''
+             for_removal = length
+             for_conversion = romaji[0, length]
+             is_upper = !!(for_conversion.match(/^\p{Upper}/))
+             for_conversion.downcase!
+
+             if for_conversion.match(/nn[aiueo]/)
+               # nna should kanafy to んな instead of んあ
+               # This is what people expect for words like konna, anna, zannen
+               mora = H_SYLLABIC_N
+               for_removal = 1
+             elsif LATN_TO_HIRA[for_conversion]
+               # Generic cases
+               mora = LATN_TO_HIRA[for_conversion]
+             elsif for_conversion == 'tch' || ( length == 2 && for_conversion.match(/([kgsztdnbpmyrlwc])\1/))
+               # tch and double-consonants for small tsu
+               mora = H_SMALL_TSU
+               for_removal = 1
+             end
+
+             if mora.length > 0
+               if is_upper
+                 # Dance so we can call transliterate_from_hira_to_kana on internal data
+                 # TODO: Need a better way for this
+                 temp_text = @text
+                 @text = mora.dup
+                 kana << transliterate_from_hira_to_kana
+                 @text = temp_text
+               else
+                 kana << mora
+               end
+
+               romaji[0, for_removal] = ''
+               break
+             elsif length == 1
+               # Nothing found
+               kana << for_conversion
+               romaji[0, 1] = ''
+             end
+           end
+         end
+
+         return kana
+       end
+
+       def transliterate_from_kana_to_hira
+         transpose_codepoints_in_range(@text, -96, 12449..12534)
+       end
+
+       def transliterate_from_hira_to_kana
+         transpose_codepoints_in_range(@text, 96, 12353..12438)
+       end
+
+       def transliterate_from_fullwidth_to_halfwidth
+         res = transpose_codepoints_in_range(@text, -65248, 65281..65374)
+         transpose_codepoints_in_range(res, -12256, 12288..12288)
+       end
+
+       def transliterate_from_halfwidth_to_fullwidth
+         res = transpose_codepoints_in_range(@text, 65248, 33..126)
+         transpose_codepoints_in_range(res, 12256, 32..32)
+       end
+
+       private
+
+       def transpose_codepoints_in_range(text, distance, range)
+         result = ''
+
+         text.each_codepoint do |c|
+           if c >= range.first and c <= range.last
+             result << (c + distance).chr(Encoding::UTF_8)
+           else
+             result << c.chr(Encoding::UTF_8)
+           end
+         end
+
+         return result
+       end
+
+     end
+   end
+ end
+
+ Ve::Manager.register(Ve::Provider::JapaneseTransliterators, :ja)
+
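For orientation, a minimal usage sketch of the parse class defined above (a hypothetical example, not part of the packaged file; `require 've'` assumes the gem is installed, and the expected outputs follow from the conversion tables and the small-tsu / syllabic-n handling shown in this hunk):

    require 've'  # assumed: loads the classes defined above

    parse = Ve::Parse::JapaneseTransliterators.new('きって')
    parse.transliterate_from_hira_to_latn   # => "kitte"  (small っ doubles the following consonant)

    parse = Ve::Parse::JapaneseTransliterators.new('kippu')
    parse.transliterate_from_latn_to_hrkt   # => "きっぷ"  (a doubled consonant becomes small っ)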
@@ -0,0 +1,362 @@
+ # Encoding: UTF-8
+
+ require 'open3'
+
+ class Ve
+   class Provider
+     class MecabIpadic < Ve::Provider
+
+       BIT_STOP = 'VeEnd'
+
+       def initialize(config = {})
+         # TODO: Make config handling better
+         @config = {:app => 'mecab',
+                    :path => '',
+                    :flags => ''}.merge(config)
+
+         @config[:app] = `which #{@config[:app]}`
+
+         start!
+       end
+
+       def works?
+         (["だっ\t助動詞,*,*,*,特殊・ダ,連用タ接続,だ,ダッ,ダッ",
+           "た\t助動詞,*,*,*,特殊・タ,基本形,た,タ,タ",
+           "EOS"] == parse('だった').tokens.collect { |t| t[:raw] } )
+       end
+
+       # Talks to the app and returns a parse object
+       def parse(text, options = {})
+         start! if @stdin.nil? # Restart if the provider crashed
+
+         @stdin.puts "#{text} #{BIT_STOP}"
+         output = []
+
+         while line = @stdout.readline.force_encoding('UTF-8')
+           if line =~ /#{BIT_STOP}/x
+             output << @stdout.readline # Catch the EOS
+             break
+           end
+           output << line
+         end
+
+         Ve::Parse::MecabIpadic.new(text, output)
+       rescue
+         # TODO: No good to catch all errors like this
+         # I need a backtrace when something unexpected fails
+         Ve::Parse::MecabIpadic.new(text, [])
+       end
+
+       private
+
+       # TODO: Use Process.spawn/kill for process control?
+       def start!
+         @stdin, @stdout, @stderr = Open3.popen3(@config[:app])
+         @stdin.set_encoding('UTF-8')
+         @stdout.set_encoding('UTF-8')
+       rescue Errno::ENOENT
+         # The parser couldn't be started. Probably not installed on this system
+       end
+
+     end
+   end
+ end
+
+ class Ve
+   class Parse
+     class MecabIpadic < Ve::Parse
+
+       PARSER = %r{^ (.+?) \t (.+) }x
+       attr_reader :tokens, :text
+
+       def initialize(text, output)
+         @tokens = []
+         @text = text
+         position = 0
+
+         output.each_with_index do |line, index|
+           line.rstrip!
+           token = {:raw => line}
+           # Anything unparsed at the end of the text
+           # This must happen before sentence splits are detected to avoid funny ordering
+           if output.length > 1 && output.length == index + 1
+             unparsed_md = %r{(.*? \Z\n?)}mx.match(text, position)
+             if unparsed_md[1].length > 0
+               unparsed_token = {:type => :unparsed, :literal => unparsed_md[1], :raw => ''}
+               unparsed_token[:characters] = (position..(position+unparsed_token[:literal].length-1))
+               @tokens << unparsed_token
+             end
+           end
+
+           if line =~ %r{^ EOS $}x
+             token[:type] = :sentence_split
+             token[:literal] = ''
+           elsif md = PARSER.match(line)
+             # The parsed token
+             token[:type] = :parsed
+             token[:literal] = md[1]
+             info = md[2].split(',')
+             [:pos, :pos2, :pos3, :pos4, :inflection_type, :inflection_form, :lemma, :reading, :hatsuon].each_with_index do |attr, i|
+               token[attr] = info[i]
+             end
+
+             # Anything unparsed preceding this token
+             unparsed_md = %r{(.*?) #{Regexp.quote(token[:literal])}}mx.match(text, position)
+             if unparsed_md[1].length > 0
+               unparsed_token = {:type => :unparsed, :literal => unparsed_md[1]}
+               unparsed_token[:characters] = (position..(position+unparsed_token[:literal].length-1))
+               @tokens << unparsed_token
+               position += unparsed_token[:literal].length
+             end
+
+             token[:characters] = (position..(position+token[:literal].length-1))
+             position += token[:literal].length
+           else
+             # This is a catastrophe
+           end
+
+           @tokens << token
+         end
+       end
+
+       # PoS
+       MEISHI = '名詞'
+       KOYUUMEISHI = '固有名詞'
+       DAIMEISHI = '代名詞'
+       JODOUSHI = '助動詞'
+       KAZU = '数'
+       JOSHI = '助詞'
+       SETTOUSHI = '接頭詞'
+       DOUSHI = '動詞'
+       KIGOU = '記号'
+       FIRAA = 'フィラー'
+       SONOTA = 'その他'
+       KANDOUSHI = '感動詞'
+       RENTAISHI = '連体詞'
+       SETSUZOKUSHI = '接続詞'
+       FUKUSHI = '副詞'
+       SETSUZOKUJOSHI = '接続助詞'
+       KEIYOUSHI = '形容詞'
+
+       # Pos2 and Inflection types
+       HIJIRITSU = '非自立'
+       FUKUSHIKANOU = '副詞可能'
+       SAHENSETSUZOKU = 'サ変接続'
+       KEIYOUDOUSHIGOKAN = '形容動詞語幹'
+       NAIKEIYOUSHIGOKAN = 'ナイ形容詞語幹'
+       JODOUSHIGOKAN = '助動詞語幹'
+       FUKUSHIKA = '副詞化'
+       TAIGENSETSUZOKU = '体言接続'
+       RENTAIKA = '連体化'
+       TOKUSHU = '特殊'
+       SETSUBI = '接尾'
+       SETSUZOKUSHITEKI = '接続詞的'
+       DOUSHIHIJIRITSUTEKI = '動詞非自立的'
+       SAHEN_SURU = 'サ変・スル'
+       TOKUSHU_TA = '特殊・タ'
+       TOKUSHU_NAI = '特殊・ナイ'
+       TOKUSHU_TAI = '特殊・タイ'
+       TOKUSHU_DESU = '特殊・デス'
+       TOKUSHU_DA = '特殊・ダ'
+       TOKUSHU_MASU = '特殊・マス'
+
+       # Etc
+       NA = 'な'
+       NI = 'に'
+       TE = 'て'
+       DE = 'で'
+       BA = 'ば'
+
+       def words
+         words = []
+         tokens = @tokens.find_all { |t| t[:type] == :parsed }
+         tokens = tokens.to_enum
+
+         # This is becoming very big
+         begin
+           while token = tokens.next
+             pos = nil
+             grammar = nil
+             eat_next = false
+             eat_lemma = true
+             attach_to_previous = false
+             also_attach_to_lemma = false
+
+             case token[:pos]
+             when MEISHI
+               pos = Ve::PartOfSpeech::Noun
+
+               case token[:pos2]
+               when KOYUUMEISHI
+                 pos = Ve::PartOfSpeech::ProperNoun
+               when DAIMEISHI
+                 pos = Ve::PartOfSpeech::Pronoun
+               when FUKUSHIKANOU, SAHENSETSUZOKU, KEIYOUDOUSHIGOKAN, NAIKEIYOUSHIGOKAN
+                 if tokens.more?
+                   following = tokens.peek
+                   if following[:inflection_type] == SAHEN_SURU
+                     pos = Ve::PartOfSpeech::Verb
+                     eat_next = true
+                   elsif following[:inflection_type] == TOKUSHU_DA
+                     pos = Ve::PartOfSpeech::Adjective
+                     if following[:inflection_form] == TAIGENSETSUZOKU
+                       eat_next = true
+                       eat_lemma = false
+                     end
+                   elsif following[:inflection_type] == TOKUSHU_NAI
+                     pos = Ve::PartOfSpeech::Adjective
+                     eat_next = true
+                   elsif following[:pos] == JOSHI && following[:literal] == NI
+                     pos = Ve::PartOfSpeech::Adverb
+                     eat_next = true
+                   end
+                 end
+               when HIJIRITSU, TOKUSHU
+                 if tokens.more?
+                   following = tokens.peek
+                   case token[:pos3]
+                   when FUKUSHIKANOU
+                     if following[:pos] == JOSHI && following[:literal] == NI
+                       pos = Ve::PartOfSpeech::Adverb
+                       eat_next = true
+                     end
+                   when JODOUSHIGOKAN
+                     if following[:inflection_type] == TOKUSHU_DA
+                       pos = Ve::PartOfSpeech::Verb
+                       grammar = :auxillary
+                       if following[:inflection_form] == TAIGENSETSUZOKU
+                         eat_next = true
+                       end
+                     elsif following[:pos] == JOSHI && following[:pos2] == FUKUSHIKA
+                       pos = Ve::PartOfSpeech::Adverb
+                       eat_next = true
+                     end
+                   when KEIYOUDOUSHIGOKAN
+                     pos = Ve::PartOfSpeech::Adjective
+                     if (following[:inflection_type] == TOKUSHU_DA && following[:inflection_form] == TAIGENSETSUZOKU) || following[:pos2] == RENTAIKA
+                       eat_next = true
+                     end
+                   end
+                 end
+               when KAZU
+                 # TODO: recurse and find following numbers and add to this word. Except non-numbers like 幾
+                 pos = Ve::PartOfSpeech::Number
+                 if words.length > 0 && words[-1].part_of_speech == Ve::PartOfSpeech::Number
+                   attach_to_previous = true
+                   also_attach_to_lemma = true
+                 end
+               when SETSUBI
+                 # TODO: elaborate a bit?
+                 pos = Ve::PartOfSpeech::Suffix
+               when SETSUZOKUSHITEKI
+                 pos = Ve::PartOfSpeech::Conjunction
+               when DOUSHIHIJIRITSUTEKI
+                 pos = Ve::PartOfSpeech::Verb
+                 grammar = :nominal
+               end
+             when SETTOUSHI
+               # TODO: elaborate this when we have the "main part" feature for words?
+               pos = Ve::PartOfSpeech::Prefix
+             when JODOUSHI
+               pos = Ve::PartOfSpeech::Postposition
+
+               if [TOKUSHU_TA, TOKUSHU_NAI, TOKUSHU_TAI, TOKUSHU_MASU].include?(token[:inflection_type])
+                 attach_to_previous = true
+               elsif (token[:inflection_type] == TOKUSHU_DA || token[:inflection_type] == TOKUSHU_DESU) && token[:literal] != NA
+                 pos = Ve::PartOfSpeech::Verb
+               end
+             when DOUSHI
+               pos = Ve::PartOfSpeech::Verb
+               if token[:pos2] == SETSUBI
+                 attach_to_previous = true
+               elsif token[:pos2] == HIJIRITSU
+                 grammar = :auxillary
+               end
+             when KEIYOUSHI
+               pos = Ve::PartOfSpeech::Adjective
+             when JOSHI
+               pos = Ve::PartOfSpeech::Postposition
+               if token[:pos2] == SETSUZOKUJOSHI && [TE, DE, BA].include?(token[:literal])
+                 attach_to_previous = true
+               end
+             when RENTAISHI
+               pos = Ve::PartOfSpeech::Determiner
+             when SETSUZOKUSHI
+               pos = Ve::PartOfSpeech::Conjunction
+             when FUKUSHI
+               pos = Ve::PartOfSpeech::Adverb
+             when KIGOU
+               pos = Ve::PartOfSpeech::Symbol
+             when FIRAA, KANDOUSHI
+               pos = Ve::PartOfSpeech::Interjection
+             when SONOTA
+               pos = Ve::PartOfSpeech::Other
+             else
+               # This is a catastrophe
+             end
+
+             if attach_to_previous && words.length > 0
+               words[-1].tokens << token
+               words[-1].word << token[:literal]
+               words[-1].extra[:reading] << (token[:reading] || '')
+               words[-1].extra[:transcription] << (token[:hatsuon] || '')
+               words[-1].lemma << token[:lemma] if also_attach_to_lemma
+             else
+               pos = Ve::PartOfSpeech::TBD if pos.nil?
+               word = Ve::Word.new(token[:literal], token[:lemma], pos, [token], {
+                 :reading => token[:reading] || '',
+                 :transcription => token[:hatsuon] || '',
+                 :grammar => grammar
+               }, {
+                 :reading_script => :kata,
+                 :transcription_script => :kata
+               })
+
+               if eat_next
+                 following = tokens.next
+                 word.tokens << following
+                 word.word << following[:literal]
+                 word.extra[:reading] << following[:reading]
+                 word.extra[:transcription] << following[:hatsuon]
+                 word.lemma << following[:lemma] if eat_lemma
+               end
+
+               words << word
+             end
+           end
+         rescue StopIteration
+         end
+
+         return words
+       end
+
+       def sentences
+         # TODO: Sentence objects that keep track of the sentence's tokens
+         sentences = []
+         current = ''
+
+         @tokens.each do |token|
+           if token[:type] == :sentence_split
+             sentences << current
+             current = ''
+           elsif token[:literal] == '。'
+             current << token[:literal]
+             sentences << current
+             current = ''
+           else
+             current << token[:literal]
+           end
+         end
+
+         # In case there is no :sentence_split at the end
+         sentences << current if current.length > 0
+
+         sentences
+       end
+
+     end
+   end
+ end
+
+ Ve::Manager.register(Ve::Provider::MecabIpadic, :ja)
+
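Similarly, a hedged sketch of how the MeCab provider above could be exercised on a system where the `mecab` binary is on the PATH (a hypothetical example, not part of the package contents; the exact segmentation depends on the locally installed IPADIC dictionary, and `Ve::Word`'s `word`/`part_of_speech` accessors are assumed from the rest of the gem, as referenced in the code above):

    require 've'  # assumed: loads the classes defined above

    provider = Ve::Provider::MecabIpadic.new
    if provider.works?                       # false if mecab could not be started
      parse = provider.parse('これはペンです')
      parse.sentences                                     # => ["これはペンです"]
      parse.words.map { |w| [w.word, w.part_of_speech] }  # one entry per word, e.g. これ / は / ペン / です
    end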