ve 0.0.2
This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- data/.gitignore +4 -0
- data/Gemfile +8 -0
- data/Gemfile.lock +22 -0
- data/Rakefile +9 -0
- data/Readme.md +60 -0
- data/js/test.html +32 -0
- data/js/ve.js +57 -0
- data/lib/language.rb +2 -0
- data/lib/languages/english.rb +6 -0
- data/lib/languages/japanese.rb +9 -0
- data/lib/misc.rb +10 -0
- data/lib/part_of_speech.rb +30 -0
- data/lib/provider.rb +29 -0
- data/lib/providers/fallbacks.rb +0 -0
- data/lib/providers/freeling_en.rb +229 -0
- data/lib/providers/japanese_transliterators.rb +293 -0
- data/lib/providers/mecab_ipadic.rb +362 -0
- data/lib/ve.rb +111 -0
- data/lib/word.rb +43 -0
- data/sinatra/server.rb +46 -0
- data/tests/freeling_en_test.rb +135 -0
- data/tests/japanese_transliterators_test.rb +79 -0
- data/tests/mecab_ipadic_test.rb +452 -0
- data/tests/test_helper.rb +26 -0
- data/tests/ve_test.rb +20 -0
- data/ve.gemspec +20 -0
- metadata +80 -0

data/lib/providers/japanese_transliterators.rb
@@ -0,0 +1,293 @@
+# Encoding: UTF-8
+
+class Ve
+  class Provider
+    class JapaneseTransliterators < Ve::Provider
+
+      def initialize(config = {})
+      end
+
+      def works?
+        true
+      end
+
+      def parse(text, options = {})
+        Ve::Parse::JapaneseTransliterators.new(text)
+      end
+
+    end
+  end
+end
+
+class Ve
+  class Parse
+    class JapaneseTransliterators < Ve::Parse
+
+      H_SYLLABIC_N = 'ん'
+      H_SMALL_TSU = 'っ'
+
+      HIRA_TO_LATN = {
+        "あ"=>"a", "い"=>"i", "う"=>"u", "え"=>"e", "お"=>"o",
+        "か"=>"ka", "き"=>"ki", "く"=>"ku", "け"=>"ke", "こ"=>"ko",
+        "が"=>"ga", "ぎ"=>"gi", "ぐ"=>"gu", "げ"=>"ge", "ご"=>"go",
+        "さ"=>"sa", "し"=>"shi", "す"=>"su", "せ"=>"se", "そ"=>"so",
+        "ざ"=>"za", "じ"=>"ji", "ず"=>"zu", "ぜ"=>"ze", "ぞ"=>"zo",
+        "た"=>"ta", "ち"=>"chi", "つ"=>"tsu", "て"=>"te", "と"=>"to",
+        "だ"=>"da", "ぢ"=>"ji", "づ"=>"zu", "で"=>"de", "ど"=>"do",
+        "な"=>"na", "に"=>"ni", "ぬ"=>"nu", "ね"=>"ne", "の"=>"no",
+        "は"=>"ha", "ひ"=>"hi", "ふ"=>"fu", "へ"=>"he", "ほ"=>"ho",
+        "ば"=>"ba", "び"=>"bi", "ぶ"=>"bu", "べ"=>"be", "ぼ"=>"bo",
+        "ぱ"=>"pa", "ぴ"=>"pi", "ぷ"=>"pu", "ぺ"=>"pe", "ぽ"=>"po",
+        "ま"=>"ma", "み"=>"mi", "む"=>"mu", "め"=>"me", "も"=>"mo",
+        "や"=>"ya", "ゆ"=>"yu", "よ"=>"yo",
+        "ら"=>"ra", "り"=>"ri", "る"=>"ru", "れ"=>"re", "ろ"=>"ro",
+        "わ"=>"wa", "うぃ"=>"whi", "うぇ"=>"whe", "を"=>"wo",
+        "ゑ"=>"wye", "ゐ"=>"wyi", "ー"=>"-", "ん"=>"n",
+
+        "きゃ"=>"kya", "きゅ"=>"kyu", "きょ"=>"kyo", "きぇ"=>"kye", "きぃ"=>"kyi",
+        "ぎゃ"=>"gya", "ぎゅ"=>"gyu", "ぎょ"=>"gyo", "ぎぇ"=>"gye", "ぎぃ"=>"gyi",
+        "くぁ"=>"kwa", "くぃ"=>"kwi", "くぅ"=>"kwu", "くぇ"=>"kwe", "くぉ"=>"kwo",
+        "ぐぁ"=>"qwa", "ぐぃ"=>"gwi", "ぐぅ"=>"gwu", "ぐぇ"=>"gwe", "ぐぉ"=>"gwo",
+        "しゃ"=>"sha", "しぃ"=>"syi", "しゅ"=>"shu", "しぇ"=>"she", "しょ"=>"sho",
+        "じゃ"=>"jya", "じゅ"=>"zyu", "じぇ"=>"zye", "じょ"=>"zyo", "じぃ"=>"zyi",
+        "すぁ"=>"swa", "すぃ"=>"swi", "すぅ"=>"swu", "すぇ"=>"swe", "すぉ"=>"swo",
+        "ちゃ"=>"tya", "ちゅ"=>"tyu", "ちぇ"=>"tye", "ちょ"=>"tyo", "ちぃ"=>"tyi",
+        "ぢゃ"=>"dya", "ぢぃ"=>"dyi", "ぢゅ"=>"dyu", "ぢぇ"=>"dye", "ぢょ"=>"dyo",
+        "つぁ"=>"tsa", "つぃ"=>"tsi", "つぇ"=>"tse", "つぉ"=>"tso", "てゃ"=>"tha",
+        "てぃ"=>"thi", "てゅ"=>"thu", "てぇ"=>"the", "てょ"=>"tho", "とぁ"=>"twa",
+        "とぃ"=>"twi", "とぅ"=>"twu", "とぇ"=>"twe", "とぉ"=>"two", "でゃ"=>"dha",
+        "でぃ"=>"dhi", "でゅ"=>"dhu", "でぇ"=>"dhe", "でょ"=>"dho", "どぁ"=>"dwa",
+        "どぃ"=>"dwi", "どぅ"=>"dwu", "どぇ"=>"dwe", "どぉ"=>"dwo", "にゃ"=>"nya",
+        "にゅ"=>"nyu", "にょ"=>"nyo", "にぇ"=>"nye", "にぃ"=>"nyi", "ひゃ"=>"hya",
+        "ひぃ"=>"hyi", "ひゅ"=>"hyu", "ひぇ"=>"hye", "ひょ"=>"hyo", "びゃ"=>"bya",
+        "びぃ"=>"byi", "びゅ"=>"byu", "びぇ"=>"bye", "びょ"=>"byo", "ぴゃ"=>"pya",
+        "ぴぃ"=>"pyi", "ぴゅ"=>"pyu", "ぴぇ"=>"pye", "ぴょ"=>"pyo", "ふぁ"=>"fwa",
+        "ふぃ"=>"fyi", "ふぇ"=>"fye", "ふぉ"=>"fwo", "ふぅ"=>"fwu", "ふゃ"=>"fya",
+        "ふゅ"=>"fyu", "ふょ"=>"fyo", "みゃ"=>"mya", "みぃ"=>"myi", "みゅ"=>"myu",
+        "みぇ"=>"mye", "みょ"=>"myo", "りゃ"=>"rya", "りぃ"=>"ryi", "りゅ"=>"ryu",
+        "りぇ"=>"rye", "りょ"=>"ryo",
+        "ゔぁ"=>"va", "ゔぃ"=>"vyi", "ゔ"=>"vu", "ゔぇ"=>"vye", "ゔぉ"=>"vo",
+        "ゔゃ"=>"vya", "ゔゅ"=>"vyu", "ゔょ"=>"vyo",
+        "うぁ"=>"wha", "いぇ"=>"ye", "うぉ"=>"who",
+        "ぁ"=>"xa", "ぃ"=>"xi", "ぅ"=>"xu", "ぇ"=>"xe", "ぉ"=>"xo",
+        "ゕ"=>"xka", "ゖ"=>"xke", "ゎ"=>"xwa"
+      }
+
+      LATN_TO_HIRA = {
+        'a' => 'あ', 'i' => 'い', 'u' => 'う', 'e' => 'え', 'o' => 'お',
+        'ka' => 'か', 'ki' => 'き', 'ku' => 'く', 'ke' => 'け', 'ko' => 'こ',
+        'ga' => 'が', 'gi' => 'ぎ', 'gu' => 'ぐ', 'ge' => 'げ', 'go' => 'ご',
+        'sa' => 'さ', 'si' => 'し', 'shi' => 'し', 'su' => 'す', 'se' => 'せ', 'so' => 'そ',
+        'za' => 'ざ', 'zi' => 'じ', 'ji' => 'じ', 'zu' => 'ず', 'ze' => 'ぜ', 'zo' => 'ぞ',
+        'ta' => 'た', 'ti' => 'ち', 'chi' => 'ち', 'tu' => 'つ', 'tsu'=> 'つ', 'te' => 'て', 'to' => 'と',
+        'da' => 'だ', 'di' => 'ぢ', 'du' => 'づ', 'dzu'=> 'づ', 'de' => 'で', 'do' => 'ど',
+        'na' => 'な', 'ni' => 'に', 'nu' => 'ぬ', 'ne' => 'ね', 'no' => 'の',
+        'ha' => 'は', 'hi' => 'ひ', 'hu' => 'ふ', 'fu' => 'ふ', 'he' => 'へ', 'ho' => 'ほ',
+        'ba' => 'ば', 'bi' => 'び', 'bu' => 'ぶ', 'be' => 'べ', 'bo' => 'ぼ',
+        'pa' => 'ぱ', 'pi' => 'ぴ', 'pu' => 'ぷ', 'pe' => 'ぺ', 'po' => 'ぽ',
+        'ma' => 'ま', 'mi' => 'み', 'mu' => 'む', 'me' => 'め', 'mo' => 'も',
+        'ya' => 'や', 'yu' => 'ゆ', 'yo' => 'よ',
+        'ra' => 'ら', 'ri' => 'り', 'ru' => 'る', 're' => 'れ', 'ro' => 'ろ',
+        'la' => 'ら', 'li' => 'り', 'lu' => 'る', 'le' => 'れ', 'lo' => 'ろ',
+        'wa' => 'わ', 'wi' => 'うぃ', 'we' => 'うぇ', 'wo' => 'を',
+        'wye' => 'ゑ', 'wyi' => 'ゐ', '-' => 'ー',
+
+        'n' => 'ん', 'nn' => 'ん', "n'"=> 'ん',
+
+        'kya' => 'きゃ', 'kyu' => 'きゅ', 'kyo' => 'きょ', 'kye' => 'きぇ', 'kyi' => 'きぃ',
+        'gya' => 'ぎゃ', 'gyu' => 'ぎゅ', 'gyo' => 'ぎょ', 'gye' => 'ぎぇ', 'gyi' => 'ぎぃ',
+        'kwa' => 'くぁ', 'kwi' => 'くぃ', 'kwu' => 'くぅ', 'kwe' => 'くぇ', 'kwo' => 'くぉ',
+        'gwa' => 'ぐぁ', 'gwi' => 'ぐぃ', 'gwu' => 'ぐぅ', 'gwe' => 'ぐぇ', 'gwo' => 'ぐぉ',
+        'qwa' => 'ぐぁ', 'gwi' => 'ぐぃ', 'gwu' => 'ぐぅ', 'gwe' => 'ぐぇ', 'gwo' => 'ぐぉ',
+
+        'sya' => 'しゃ', 'syi' => 'しぃ', 'syu' => 'しゅ', 'sye' => 'しぇ', 'syo' => 'しょ',
+        'sha' => 'しゃ', 'shu' => 'しゅ', 'she' => 'しぇ', 'sho' => 'しょ',
+        'ja' => 'じゃ', 'ju' => 'じゅ', 'je' => 'じぇ', 'jo' => 'じょ',
+        'jya' => 'じゃ', 'jyi' => 'じぃ', 'jyu' => 'じゅ', 'jye' => 'じぇ', 'jyo' => 'じょ',
+        'zya' => 'じゃ', 'zyu' => 'じゅ', 'zyo' => 'じょ', 'zye' => 'じぇ', 'zyi' => 'じぃ',
+        'swa' => 'すぁ', 'swi' => 'すぃ', 'swu' => 'すぅ', 'swe' => 'すぇ', 'swo' => 'すぉ',
+
+        'cha' => 'ちゃ', 'chu' => 'ちゅ', 'che' => 'ちぇ', 'cho' => 'ちょ',
+        'cya' => 'ちゃ', 'cyi' => 'ちぃ', 'cyu' => 'ちゅ', 'cye' => 'ちぇ', 'cyo' => 'ちょ',
+        'tya' => 'ちゃ', 'tyi' => 'ちぃ', 'tyu' => 'ちゅ', 'tye' => 'ちぇ', 'tyo' => 'ちょ',
+        'dya' => 'ぢゃ', 'dyi' => 'ぢぃ', 'dyu' => 'ぢゅ', 'dye' => 'ぢぇ', 'dyo' => 'ぢょ',
+        'tsa' => 'つぁ', 'tsi' => 'つぃ', 'tse' => 'つぇ', 'tso' => 'つぉ',
+        'tha' => 'てゃ', 'thi' => 'てぃ', 'thu' => 'てゅ', 'the' => 'てぇ', 'tho' => 'てょ',
+        'twa' => 'とぁ', 'twi' => 'とぃ', 'twu' => 'とぅ', 'twe' => 'とぇ', 'two' => 'とぉ',
+        'dha' => 'でゃ', 'dhi' => 'でぃ', 'dhu' => 'でゅ', 'dhe' => 'でぇ', 'dho' => 'でょ',
+        'dwa' => 'どぁ', 'dwi' => 'どぃ', 'dwu' => 'どぅ', 'dwe' => 'どぇ', 'dwo' => 'どぉ',
+
+        'nya' => 'にゃ', 'nyu' => 'にゅ', 'nyo' => 'にょ', 'nye' => 'にぇ', 'nyi' => 'にぃ',
+
+        'hya' => 'ひゃ', 'hyi' => 'ひぃ', 'hyu' => 'ひゅ', 'hye' => 'ひぇ', 'hyo' => 'ひょ',
+        'bya' => 'びゃ', 'byi' => 'びぃ', 'byu' => 'びゅ', 'bye' => 'びぇ', 'byo' => 'びょ',
+        'pya' => 'ぴゃ', 'pyi' => 'ぴぃ', 'pyu' => 'ぴゅ', 'pye' => 'ぴぇ', 'pyo' => 'ぴょ',
+        'fa' => 'ふぁ', 'fi' => 'ふぃ', 'fe' => 'ふぇ', 'fo' => 'ふぉ',
+        'fwa' => 'ふぁ', 'fwi' => 'ふぃ', 'fwu' => 'ふぅ', 'fwe' => 'ふぇ', 'fwo' => 'ふぉ',
+        'fya' => 'ふゃ', 'fyi' => 'ふぃ', 'fyu' => 'ふゅ', 'fye' => 'ふぇ', 'fyo' => 'ふょ',
+
+        'mya' => 'みゃ', 'myi' => 'みぃ', 'myu' => 'みゅ', 'mye' => 'みぇ', 'myo' => 'みょ',
+
+        'rya' => 'りゃ', 'ryi' => 'りぃ', 'ryu' => 'りゅ', 'rye' => 'りぇ', 'ryo' => 'りょ',
+        'lya' => 'りゃ', 'lyu' => 'りゅ', 'lyo' => 'りょ', 'lye' => 'りぇ', 'lyi' => 'りぃ',
+
+        'va' => 'ゔぁ', 'vi' => 'ゔぃ', 'vu' => 'ゔ', 've' => 'ゔぇ', 'vo' => 'ゔぉ',
+        'vya' => 'ゔゃ', 'vyi' => 'ゔぃ', 'vyu' => 'ゔゅ', 'vye' => 'ゔぇ', 'vyo' => 'ゔょ',
+        'wha' => 'うぁ', 'whi' => 'うぃ', 'ye' => 'いぇ', 'whe' => 'うぇ', 'who' => 'うぉ',
+
+        'xa' => 'ぁ', 'xi' => 'ぃ', 'xu' => 'ぅ', 'xe' => 'ぇ', 'xo' => 'ぉ',
+        'xya' => 'ゃ', 'xyu' => 'ゅ', 'xyo' => 'ょ',
+        'xtu' => 'っ', 'xtsu' => 'っ',
+        'xka' => 'ゕ', 'xke' => 'ゖ', 'xwa' => 'ゎ',
+
+        '@@' => ' ', '#[' => '「', '#]' => '」', '#,' => '、', '#.' => '。', '#/' => '・',
+      }
+
+      attr_reader :tokens, :text
+
+      def initialize(text)
+        @tokens = []
+        @text = text
+      end
+
+      def transliterate_from_hrkt_to_latn
+        @text = transliterate_from_kana_to_hira
+        transliterate_from_hira_to_latn
+      end
+
+      def transliterate_from_hira_to_latn
+        # Hepburn style romaji
+        kana = @text.dup
+        romaji = ''
+        geminate = false
+
+        while kana.length > 0
+          [2, 1].each do |length|
+            mora = ''
+            for_conversion = kana[0, length]
+
+            if for_conversion == H_SMALL_TSU
+              geminate = true
+              kana[0, length] = ''
+              break
+            elsif for_conversion == H_SYLLABIC_N && kana[1, 1].match(/[やゆよ]/)
+              # Syllabic N before ya, yu or yo
+              mora = "n'"
+            elsif HIRA_TO_LATN[for_conversion]
+              # Generic cases
+              mora = HIRA_TO_LATN[for_conversion]
+            end
+
+            if mora.length > 0
+              if geminate
+                geminate = false
+                romaji << mora[0, 1]
+              end
+              romaji << mora
+              kana[0, length] = ''
+              break
+            elsif length == 1
+              # Nothing found
+              romaji << for_conversion
+              kana[0, length] = ''
+            end
+          end
+        end
+
+        return romaji
+      end
+
+      def transliterate_from_latn_to_hrkt
+        romaji = @text.dup
+        kana = ''
+
+        romaji.gsub!(/m([BbPp])/, 'n\1')
+        romaji.gsub!(/M([BbPp])/, 'N\1')
+
+        while romaji.length > 0
+          [3, 2, 1].each do |length|
+            mora = ''
+            for_removal = length
+            for_conversion = romaji[0, length]
+            is_upper = !!(for_conversion.match(/^\p{Upper}/))
+            for_conversion.downcase!
+
+            if for_conversion.match(/nn[aiueo]/)
+              # nna should kanafy to んな instead of んあ
+              # This is what people expect for words like konna, anna, zannen
+              mora = H_SYLLABIC_N
+              for_removal = 1
+            elsif LATN_TO_HIRA[for_conversion]
+              # Generic cases
+              mora = LATN_TO_HIRA[for_conversion]
+            elsif for_conversion == 'tch' || ( length == 2 && for_conversion.match(/([kgsztdnbpmyrlwc])\1/))
+              # tch and double-consonants for small tsu
+              mora = H_SMALL_TSU
+              for_removal = 1
+            end
+
+            if mora.length > 0
+              if is_upper
+                # Dance so we can call transliterate_from_hira_to_kana on internal data
+                # TODO: Need a better way for this
+                temp_text = @text
+                @text = mora.dup
+                kana << transliterate_from_hira_to_kana
+                @text = temp_text
+              else
+                kana << mora
+              end
+
+              romaji[0, for_removal] = ''
+              break
+            elsif length == 1
+              # Nothing found
+              kana << for_conversion
+              romaji[0, 1] = ''
+            end
+          end
+        end
+
+        return kana
+      end
+
+      def transliterate_from_kana_to_hira
+        transpose_codepoints_in_range(@text, -96, 12449..12534)
+      end
+
+      def transliterate_from_hira_to_kana
+        transpose_codepoints_in_range(@text, 96, 12353..12438)
+      end
+
+      def transliterate_from_fullwidth_to_halfwidth
+        res = transpose_codepoints_in_range(@text, -65248, 65281..65374)
+        transpose_codepoints_in_range(res, -12256, 12288..12288)
+      end
+
+      def transliterate_from_halfwidth_to_fullwidth
+        res = transpose_codepoints_in_range(@text, 65248, 33..126)
+        transpose_codepoints_in_range(res, 12256, 32..32)
+      end
+
+      private
+
+      def transpose_codepoints_in_range(text, distance, range)
+        result = ''
+
+        text.each_codepoint do |c|
+          if c >= range.first and c <= range.last
+            result << (c + distance).chr(Encoding::UTF_8)
+          else
+            result << c.chr(Encoding::UTF_8)
+          end
+        end
+
+        return result
+      end
+
+    end
+  end
+end
+
+Ve::Manager.register(Ve::Provider::JapaneseTransliterators, :ja)
+
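
The two classes above are plain string transliterators, so they can be exercised directly. A minimal usage sketch, not part of the diff; it assumes the Ve::Parse base class from data/lib/provider.rb is loaded:

  # Hiragana to Hepburn-style romaji; the small っ is folded into the following mora
  Ve::Parse::JapaneseTransliterators.new('きっぷ').transliterate_from_hira_to_latn     # => "kippu"

  # Romaji back to kana; 'nn' maps to the syllabic ん and doubled consonants to っ
  Ve::Parse::JapaneseTransliterators.new('kippu').transliterate_from_latn_to_hrkt      # => "きっぷ"
  Ve::Parse::JapaneseTransliterators.new('shinnbunn').transliterate_from_latn_to_hrkt  # => "しんぶん"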

data/lib/providers/mecab_ipadic.rb
@@ -0,0 +1,362 @@
+# Encoding: UTF-8
+
+require 'open3'
+
+class Ve
+  class Provider
+    class MecabIpadic < Ve::Provider
+
+      BIT_STOP = 'VeEnd'
+
+      def initialize(config = {})
+        # TODO: Make config handling better
+        @config = {:app => 'mecab',
+                   :path => '',
+                   :flags => ''}.merge(config)
+
+        @config[:app] = `which #{@config[:app]}`
+
+        start!
+      end
+
+      def works?
+        (["だっ\t助動詞,*,*,*,特殊・ダ,連用タ接続,だ,ダッ,ダッ",
+          "た\t助動詞,*,*,*,特殊・タ,基本形,た,タ,タ",
+          "EOS"] == parse('だった').tokens.collect { |t| t[:raw] } )
+      end
+
+      # Talks to the app and returns a parse object
+      def parse(text, options = {})
+        start! if @stdin.nil? # Restart if the provider crashed
+
+        @stdin.puts "#{text} #{BIT_STOP}"
+        output = []
+
+        while line = @stdout.readline.force_encoding('UTF-8')
+          if line =~ /#{BIT_STOP}/x
+            output << @stdout.readline # Catch the EOS
+            break
+          end
+          output << line
+        end
+
+        Ve::Parse::MecabIpadic.new(text, output)
+      rescue
+        # TODO: No good to catch all errors like this
+        # I need a backtrace when something unexpected fails
+        Ve::Parse::MecabIpadic.new(text, [])
+      end
+
+      private
+
+      # TODO: Use Process.spawn/kill for process control?
+      def start!
+        @stdin, @stdout, @stderr = Open3.popen3(@config[:app])
+        @stdin.set_encoding('UTF-8')
+        @stdout.set_encoding('UTF-8')
+      rescue Errno::ENOENT
+        # The parser couldn't be started. Probably not installed on this system
+      end
+
+    end
+  end
+end
+
+class Ve
+  class Parse
+    class MecabIpadic < Ve::Parse
+
+      PARSER = %r{^ (.+?) \t (.+) }x
+      attr_reader :tokens, :text
+
+      def initialize(text, output)
+        @tokens = []
+        @text = text
+        position = 0
+
+        output.each_with_index do |line, index|
+          line.rstrip!
+          token = {:raw => line}
+          # Anything unparsed at the end of the text
+          # This must happen before sentence splits are detected to avoid funny ordering
+          if output.length > 1 && output.length == index + 1
+            unparsed_md = %r{(.*? \Z\n?)}mx.match(text, position)
+            if unparsed_md[1].length > 0
+              unparsed_token = {:type => :unparsed, :literal => unparsed_md[1], :raw => ''}
+              unparsed_token[:characters] = (position..(position+unparsed_token[:literal].length-1))
+              @tokens << unparsed_token
+            end
+          end
+
+          if line =~ %r{^ EOS $}x
+            token[:type] = :sentence_split
+            token[:literal] = ''
+          elsif md = PARSER.match(line)
+            # The parsed token
+            token[:type] = :parsed
+            token[:literal] = md[1]
+            info = md[2].split(',')
+            [:pos, :pos2, :pos3, :pos4, :inflection_type, :inflection_form, :lemma, :reading, :hatsuon].each_with_index do |attr, i|
+              token[attr] = info[i]
+            end
+
+            # Anything unparsed preceding this token
+            unparsed_md = %r{(.*?) #{Regexp.quote(token[:literal])}}mx.match(text, position)
+            if unparsed_md[1].length > 0
+              unparsed_token = {:type => :unparsed, :literal => unparsed_md[1]}
+              unparsed_token[:characters] = (position..(position+unparsed_token[:literal].length-1))
+              @tokens << unparsed_token
+              position += unparsed_token[:literal].length
+            end
+
+            token[:characters] = (position..(position+token[:literal].length-1))
+            position += token[:literal].length
+          else
+            # C'est une catastrophe
+          end
+
+          @tokens << token
+        end
+      end
+
+      # PoS
+      MEISHI = '名詞'
+      KOYUUMEISHI = '固有名詞'
+      DAIMEISHI = '代名詞'
+      JODOUSHI = '助動詞'
+      KAZU = '数'
+      JOSHI = '助詞'
+      SETTOUSHI = '接頭詞'
+      DOUSHI = '動詞'
+      KIGOU = '記号'
+      FIRAA = 'フィラー'
+      SONOTA = 'その他'
+      KANDOUSHI = '感動詞'
+      RENTAISHI = '連体詞'
+      SETSUZOKUSHI = '接続詞'
+      FUKUSHI = '副詞'
+      SETSUZOKUJOSHI = '接続助詞'
+      KEIYOUSHI = '形容詞'
+
+      # Pos2 and Inflection types
+      HIJIRITSU = '非自立'
+      FUKUSHIKANOU = '副詞可能'
+      SAHENSETSUZOKU = 'サ変接続'
+      KEIYOUDOUSHIGOKAN = '形容動詞語幹'
+      NAIKEIYOUSHIGOKAN = 'ナイ形容詞語幹'
+      JODOUSHIGOKAN = '助動詞語幹'
+      FUKUSHIKA = '副詞化'
+      TAIGENSETSUZOKU = '体言接続'
+      RENTAIKA = '連体化'
+      TOKUSHU = '特殊'
+      SETSUBI = '接尾'
+      SETSUZOKUSHITEKI = '接続詞的'
+      DOUSHIHIJIRITSUTEKI = '動詞非自立的'
+      SAHEN_SURU = 'サ変・スル'
+      TOKUSHU_TA = '特殊・タ'
+      TOKUSHU_NAI = '特殊・ナイ'
+      TOKUSHU_TAI = '特殊・タイ'
+      TOKUSHU_DESU = '特殊・デス'
+      TOKUSHU_DA = '特殊・ダ'
+      TOKUSHU_MASU = '特殊・マス'
+
+      # Etc
+      NA = 'な'
+      NI = 'に'
+      TE = 'て'
+      DE = 'で'
+      BA = 'ば'
+
+      def words
+        words = []
+        tokens = @tokens.find_all { |t| t[:type] == :parsed }
+        tokens = tokens.to_enum
+
+        # This is becoming very big
+        begin
+          while token = tokens.next
+            pos = nil
+            grammar = nil
+            eat_next = false
+            eat_lemma = true
+            attach_to_previous = false
+            also_attach_to_lemma = false
+
+            case token[:pos]
+            when MEISHI
+              pos = Ve::PartOfSpeech::Noun
+
+              case token[:pos2]
+              when KOYUUMEISHI
+                pos = Ve::PartOfSpeech::ProperNoun
+              when DAIMEISHI
+                pos = Ve::PartOfSpeech::Pronoun
+              when FUKUSHIKANOU, SAHENSETSUZOKU, KEIYOUDOUSHIGOKAN, NAIKEIYOUSHIGOKAN
+                if tokens.more?
+                  following = tokens.peek
+                  if following[:inflection_type] == SAHEN_SURU
+                    pos = Ve::PartOfSpeech::Verb
+                    eat_next = true
+                  elsif following[:inflection_type] == TOKUSHU_DA
+                    pos = Ve::PartOfSpeech::Adjective
+                    if following[:inflection_form] == TAIGENSETSUZOKU
+                      eat_next = true
+                      eat_lemma = false
+                    end
+                  elsif following[:inflection_type] == TOKUSHU_NAI
+                    pos = Ve::PartOfSpeech::Adjective
+                    eat_next = true
+                  elsif following[:pos] == JOSHI && following[:literal] == NI
+                    pos = Ve::PartOfSpeech::Adverb
+                    eat_next = true
+                  end
+                end
+              when HIJIRITSU, TOKUSHU
+                if tokens.more?
+                  following = tokens.peek
+                  case token[:pos3]
+                  when FUKUSHIKANOU
+                    if following[:pos] == JOSHI && following[:literal] == NI
+                      pos = Ve::PartOfSpeech::Adverb
+                      eat_next = true
+                    end
+                  when JODOUSHIGOKAN
+                    if following[:inflection_type] == TOKUSHU_DA
+                      pos = Ve::PartOfSpeech::Verb
+                      grammar = :auxillary
+                      if following[:inflection_form] == TAIGENSETSUZOKU
+                        eat_next = true
+                      end
+                    elsif following[:pos] == JOSHI && following[:pos2] == FUKUSHIKA
+                      pos = Ve::PartOfSpeech::Adverb
+                      eat_next = true
+                    end
+                  when KEIYOUDOUSHIGOKAN
+                    pos = Ve::PartOfSpeech::Adjective
+                    if (following[:inflection_type] == TOKUSHU_DA && following[:inflection_form] == TAIGENSETSUZOKU) || following[:pos2] == RENTAIKA
+                      eat_next = true
+                    end
+                  end
+                end
+              when KAZU
+                # TODO: recurse and find following numbers and add to this word. Except non-numbers like 幾
+                pos = Ve::PartOfSpeech::Number
+                if words.length > 0 && words[-1].part_of_speech == Ve::PartOfSpeech::Number
+                  attach_to_previous = true
+                  also_attach_to_lemma = true
+                end
+              when SETSUBI
+                # TODO: elaborate a bit?
+                pos = Ve::PartOfSpeech::Suffix
+              when SETSUZOKUSHITEKI
+                pos = Ve::PartOfSpeech::Conjunction
+              when DOUSHIHIJIRITSUTEKI
+                pos = Ve::PartOfSpeech::Verb
+                grammar = :nominal
+              end
+            when SETTOUSHI
+              # TODO: elaborate this when we have the "main part" feature for words?
+              pos = Ve::PartOfSpeech::Prefix
+            when JODOUSHI
+              pos = Ve::PartOfSpeech::Postposition
+
+              if [TOKUSHU_TA, TOKUSHU_NAI, TOKUSHU_TAI, TOKUSHU_MASU].include?(token[:inflection_type])
+                attach_to_previous = true
+              elsif (token[:inflection_type] == TOKUSHU_DA || token[:inflection_type] == TOKUSHU_DESU) && token[:literal] != NA
+                pos = Ve::PartOfSpeech::Verb
+              end
+            when DOUSHI
+              pos = Ve::PartOfSpeech::Verb
+              if token[:pos2] == SETSUBI
+                attach_to_previous = true
+              elsif token[:pos2] == HIJIRITSU
+                grammar = :auxillary
+              end
+            when KEIYOUSHI
+              pos = Ve::PartOfSpeech::Adjective
+            when JOSHI
+              pos = Ve::PartOfSpeech::Postposition
+              if token[:pos2] == SETSUZOKUJOSHI && [TE, DE, BA].include?(token[:literal])
+                attach_to_previous = true
+              end
+            when RENTAISHI
+              pos = Ve::PartOfSpeech::Determiner
+            when SETSUZOKUSHI
+              pos = Ve::PartOfSpeech::Conjunction
+            when FUKUSHI
+              pos = Ve::PartOfSpeech::Adverb
+            when KIGOU
+              pos = Ve::PartOfSpeech::Symbol
+            when FIRAA, KANDOUSHI
+              pos = Ve::PartOfSpeech::Interjection
+            when SONOTA
+              pos = Ve::PartOfSpeech::Other
+            else
+              # C'est une catastrophe
+            end
+
+            if attach_to_previous && words.length > 0
+              words[-1].tokens << token
+              words[-1].word << token[:literal]
+              words[-1].extra[:reading] << (token[:reading] || '')
+              words[-1].extra[:transcription] << (token[:hatsuon] || '')
+              words[-1].lemma << token[:lemma] if also_attach_to_lemma
+            else
+              pos = Ve::PartOfSpeech::TBD if pos.nil?
+              word = Ve::Word.new(token[:literal], token[:lemma], pos, [token], {
+                :reading => token[:reading] || '',
+                :transcription => token[:hatsuon] || '',
+                :grammar => grammar
+              }, {
+                :reading_script => :kata,
+                :transcription_script => :kata
+              })
+
+              if eat_next
+                following = tokens.next
+                word.tokens << following
+                word.word << following[:literal]
+                word.extra[:reading] << following[:reading]
+                word.extra[:transcription] << following[:hatsuon]
+                word.lemma << following[:lemma] if eat_lemma
+              end
+
+              words << word
+            end
+          end
+        rescue StopIteration
+        end
+
+        return words
+      end
+
+      def sentences
+        # TODO: Sentence objects that keep track of the sentence's tokens
+        sentences = []
+        current = ''
+
+        @tokens.each do |token|
+          if token[:type] == :sentence_split
+            sentences << current
+            current = ''
+          elsif token[:literal] == '。'
+            current << token[:literal]
+            sentences << current
+            current = ''
+          else
+            current << token[:literal]
+          end
+        end
+
+        # In case there is no :sentence_split at the end
+        sentences << current if current.length > 0
+
+        sentences
+      end
+
+    end
+  end
+end
+
+Ve::Manager.register(Ve::Provider::MecabIpadic, :ja)
+