indonesian_stemmer 0.1.1 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/indonesian_stemmer/irregular_words.rb +51 -0
- data/lib/indonesian_stemmer/irregular_words/akhiran-i.txt +3508 -0
- data/lib/indonesian_stemmer/irregular_words/k.txt +734 -0
- data/lib/indonesian_stemmer/irregular_words/kah.txt +40 -0
- data/lib/indonesian_stemmer/irregular_words/ku.txt +28 -0
- data/lib/indonesian_stemmer/irregular_words/lah.txt +41 -0
- data/lib/indonesian_stemmer/irregular_words/mu.txt +8 -0
- data/lib/indonesian_stemmer/irregular_words/n.txt +96 -0
- data/lib/indonesian_stemmer/irregular_words/nya.txt +1 -0
- data/lib/indonesian_stemmer/irregular_words/p.txt +499 -0
- data/lib/indonesian_stemmer/irregular_words/pun.txt +6 -0
- data/lib/indonesian_stemmer/irregular_words/r.txt +527 -0
- data/lib/indonesian_stemmer/morphological_utility.rb +49 -79
- data/lib/indonesian_stemmer/version.rb +1 -1
- data/spec/indonesian_stemmer_spec.rb +2 -0
- data/spec/lib/indonesian_stemmer/morphological_utility_spec.rb +14 -0
- metadata +16 -4
@@ -1,4 +1,5 @@
|
|
1
1
|
require "indonesian_stemmer/stemmer_utility"
|
2
|
+
require "indonesian_stemmer/irregular_words"
|
2
3
|
|
3
4
|
module IndonesianStemmer
|
4
5
|
|
@@ -15,57 +16,6 @@ module IndonesianStemmer
|
|
15
16
|
SUFFIX_CHARACTERS = %w( kan an i )
|
16
17
|
WITH_VOWEL_SUBSTITUTION_PREFIX_CHARACTERS = %w( meny peny men pen )
|
17
18
|
|
18
|
-
IRREGULARS_FOR_WORDS_BEGINS_WITH_K = %w(
|
19
|
-
aget alah andung ata ejar eluar embali empis emuka ena enal encang endali ering
|
20
|
-
erja erut etahui etik ibar irim uasai uliti umpul unci unjung unyah upas urang )
|
21
|
-
|
22
|
-
IRREGULARS_FOR_WORDS_BEGINS_WITH_P = %w(
|
23
|
-
adam ahat akai amer anas ancang anggang anggil anjat antul asang asti atuhi
|
24
|
-
ecah ecat elihara eluk ercik eriksa erintah esan ikir ilah ilih injam inta
|
25
|
-
isah otong otret uja uji ukul usat utar-balik utus )
|
26
|
-
|
27
|
-
IRREGULARS_FOR_WORDS_BEGINS_WITH_N = %w( aas ada adi afi afsu aif aik akal akoda
|
28
|
-
alar ama anti arasi asab asib asional atif asehat asihat atural etral ikah )
|
29
|
-
|
30
|
-
IRREGULARS_FOR_WORDS_BEGINS_WITH_R = %w( aba abak aban abas abat abet abit
|
31
|
-
abuk abun abung abut acak acau acik acuh acun adah adai adak adang adiasi
|
32
|
-
adikal adio adu aga agam agas agi agu aguk ahap ahasia ahat ahim ahmat aih
|
33
|
-
aja ajah ajalela ajam ajang ajin ajuk ajut akap akat akit aksi akuk akus
|
34
|
-
akut akyat alat alip amah amahtamah amah-tamah amai amal ambah ambai ambak
|
35
|
-
amban ambang ambat ambeh ambu ambut amin ampai ampak ampang ampas ampat
|
36
|
-
amping ampok ampung ampus amu amus anap anca ancah ancak ancang ancap
|
37
|
-
ancu ancung anda andai andak andat andau andek anduk andung angah angai
|
38
|
-
angak anggah asa usak )
|
39
|
-
|
40
|
-
IRREGULAR_PREFIX_CHARACTERS_ON_WORDS = {
|
41
|
-
'meng' => IRREGULARS_FOR_WORDS_BEGINS_WITH_K,
|
42
|
-
'peng' => IRREGULARS_FOR_WORDS_BEGINS_WITH_K,
|
43
|
-
'mem' => IRREGULARS_FOR_WORDS_BEGINS_WITH_P,
|
44
|
-
'pem' => IRREGULARS_FOR_WORDS_BEGINS_WITH_P, }
|
45
|
-
|
46
|
-
IRREGULAR_WORDS_ENDS_WITH_COMMON_CHARACTERS = {
|
47
|
-
'kah' => %w( bengkah berkah bingkah bongkah cekah firkah halakah halkah
|
48
|
-
harakah ingkah jangkah jerkah kalah kekah kelakah kerakah kerkah
|
49
|
-
khalikah langkah lukah markah mukah musyarakah nafkah naskah
|
50
|
-
nikah pangkah rakah rekah rengkah sedekah sekah serakah serkah
|
51
|
-
sungkah takah tekah telingkah tingkah tongkah ),
|
52
|
-
|
53
|
-
'lah' => %w( balah belah beslah bilah celah galah islah istilah jumlah
|
54
|
-
kalah kelah kilah lalah lelah makalah malah masalah
|
55
|
-
muamalah mujadalah mukabalah olah onslah oplah pecahbelah
|
56
|
-
pecah-belah pilah milah sekolah rihlah risalah salah serlah
|
57
|
-
silsilah sudah sulalah telah tulah ulah uzlah walah wasilah ),
|
58
|
-
|
59
|
-
'pun' => %w( ampun depun himpun lapun rapun rumpun ),
|
60
|
-
|
61
|
-
'ku' => %w( awabeku baku bangku beku beluku biku buku ceku ciku cuku deku
|
62
|
-
jibaku kaku laku leku liku luku paku pangku peku perilaku saku
|
63
|
-
siku suku teleku terungku tungku waluku ),
|
64
|
-
|
65
|
-
'mu' => %w( ilmu jamu jemu kemu ramu selumu tamu temu ),
|
66
|
-
|
67
|
-
'nya' => %w( tanya ),
|
68
|
-
}
|
69
19
|
|
70
20
|
REMOVED_KE = 1
|
71
21
|
REMOVED_PENG = 2
|
@@ -143,6 +93,8 @@ module IndonesianStemmer
|
|
143
93
|
end
|
144
94
|
|
145
95
|
def remove_suffix(word)
|
96
|
+
return word if ambiguous_with_suffices_ending_words?(word)
|
97
|
+
|
146
98
|
@number_of_syllables ||= total_syllables(word)
|
147
99
|
|
148
100
|
SUFFIX_CHARACTERS.each do |character|
|
@@ -192,13 +144,12 @@ module IndonesianStemmer
|
|
192
144
|
|
193
145
|
def remove_characters_matching_collection(word, collection, position)
|
194
146
|
collection.each do |characters|
|
195
|
-
if
|
196
|
-
|
197
|
-
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
end
|
147
|
+
if match_position_and_not_ambiguous_with_characters?(word, characters, position)
|
148
|
+
next if characters == 'mem' && is_vowel?(word[characters.size])
|
149
|
+
@flags ||= collection_for(characters, 'removed')
|
150
|
+
reduce_syllable
|
151
|
+
slice_word_at_position(word, characters.size, position)
|
152
|
+
return word
|
202
153
|
end
|
203
154
|
end
|
204
155
|
|
@@ -211,32 +162,24 @@ module IndonesianStemmer
|
|
211
162
|
end
|
212
163
|
|
213
164
|
def remove_and_substitute_characters_matching_collection(word, collection, position)
|
214
|
-
word_size = word.size
|
215
165
|
collection.each do |characters|
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
|
221
|
-
|
222
|
-
|
223
|
-
|
224
|
-
reduce_syllable
|
225
|
-
word = substitute_word_character(word, characters)
|
226
|
-
slice_word_at_position( word,
|
227
|
-
characters_size-1,
|
228
|
-
:start )
|
229
|
-
return word
|
230
|
-
end
|
166
|
+
if matching_characters_requires_substitution?(word, characters, position)
|
167
|
+
@flags ||= collection_for(characters, 'removed')
|
168
|
+
reduce_syllable
|
169
|
+
word = substitute_word_character(word, characters)
|
170
|
+
slice_word_at_position( word,
|
171
|
+
characters.size-1,
|
172
|
+
:start )
|
173
|
+
return word
|
231
174
|
end
|
232
175
|
end
|
233
176
|
end
|
234
177
|
|
235
178
|
def contains_irregular_prefix?(word, characters)
|
236
|
-
if
|
179
|
+
if IrregularWords::ON_PREFIX_CHARACTERS.keys.include?(characters)
|
237
180
|
chopped_word_match_words_collection?(
|
238
181
|
word[characters.size, word.size],
|
239
|
-
|
182
|
+
IrregularWords::ON_PREFIX_CHARACTERS[characters] )
|
240
183
|
end
|
241
184
|
end
|
242
185
|
|
@@ -250,7 +193,7 @@ module IndonesianStemmer
|
|
250
193
|
's'
|
251
194
|
when %w(men pen).include?(characters)
|
252
195
|
(chopped_word_match_words_collection?(
|
253
|
-
word[characters.size, word.size],
|
196
|
+
word[characters.size, word.size], IrregularWords::BEGINS_WITH_N
|
254
197
|
)
|
255
198
|
)? 'n' : 't'
|
256
199
|
when %w(meng peng).include?(characters)
|
@@ -266,12 +209,12 @@ module IndonesianStemmer
|
|
266
209
|
if position == :start
|
267
210
|
if characters == 'per'
|
268
211
|
chopped_word_match_words_collection?(word[3..-1],
|
269
|
-
|
212
|
+
IrregularWords::BEGINS_WITH_R )
|
270
213
|
else
|
271
214
|
return false
|
272
215
|
end
|
273
216
|
else
|
274
|
-
|
217
|
+
IrregularWords::ENDS_WITH_COMMON_CHARACTERS[characters].any? do |ambiguous_word|
|
275
218
|
# To differentiate 'mobilmu' with 'berilmu'
|
276
219
|
return false unless %w(me be pe).include?(word[0,2])
|
277
220
|
# The rest is ok
|
@@ -280,6 +223,33 @@ module IndonesianStemmer
|
|
280
223
|
end
|
281
224
|
end
|
282
225
|
|
226
|
+
def ambiguous_with_suffices_ending_words?(word)
|
227
|
+
IrregularWords::ENDS_WITH_SUFFIX_CHARACTERS.include?(word)
|
228
|
+
end
|
229
|
+
|
230
|
+
def match_position_and_not_ambiguous_with_characters?(word, characters, position)
|
231
|
+
send("#{position}s_with?", word, word.size, characters) &&
|
232
|
+
!ambiguous_with_characters?(word, characters, position)
|
233
|
+
end
|
234
|
+
|
235
|
+
def match_characters_position_followed_by_vowel?(word, characters, position)
|
236
|
+
word_size = word.size
|
237
|
+
characters_size = characters.size
|
238
|
+
|
239
|
+
send("#{position}s_with?", word, word_size, characters) &&
|
240
|
+
word_size > characters_size && is_vowel?(word[characters_size])
|
241
|
+
end
|
242
|
+
|
243
|
+
def substitution_required?(word, characters)
|
244
|
+
WITH_VOWEL_SUBSTITUTION_PREFIX_CHARACTERS.include?(characters) ||
|
245
|
+
contains_irregular_prefix?(word, characters)
|
246
|
+
end
|
247
|
+
|
248
|
+
def matching_characters_requires_substitution?(word, characters, position)
|
249
|
+
match_characters_position_followed_by_vowel?(word, characters, position) &&
|
250
|
+
substitution_required?(word, characters)
|
251
|
+
end
|
252
|
+
|
283
253
|
def reduce_syllable
|
284
254
|
@number_of_syllables -= 1
|
285
255
|
end
|
@@ -263,6 +263,10 @@ describe IndonesianStemmer::MorphologicalUtility do
|
|
263
263
|
should_transform(:remove_first_order_prefix, 'membangun', 'bangun')
|
264
264
|
end
|
265
265
|
|
266
|
+
it "'mem' followed by vowel" do
|
267
|
+
should_transform(:remove_first_order_prefix, 'memilih', 'pilih')
|
268
|
+
end
|
269
|
+
|
266
270
|
it "'me'" do
|
267
271
|
should_transform(:remove_first_order_prefix, 'melukis', 'lukis')
|
268
272
|
end
|
@@ -630,5 +634,15 @@ describe IndonesianStemmer::MorphologicalUtility do
|
|
630
634
|
end
|
631
635
|
end
|
632
636
|
end
|
637
|
+
|
638
|
+
describe 'should not remove suffix characters for words ending with them' do
|
639
|
+
it "'kan'" do
|
640
|
+
should_not_transform(:remove_suffix, 'majikan')
|
641
|
+
end
|
642
|
+
|
643
|
+
it "'i'" do
|
644
|
+
should_not_transform(:remove_suffix, 'pandai')
|
645
|
+
end
|
646
|
+
end
|
633
647
|
end
|
634
648
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: indonesian_stemmer
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.1
|
4
|
+
version: 0.2.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2013-04-
|
12
|
+
date: 2013-04-24 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: rake
|
@@ -61,6 +61,18 @@ files:
|
|
61
61
|
- Rakefile
|
62
62
|
- indonesian_stemmer.gemspec
|
63
63
|
- lib/indonesian_stemmer.rb
|
64
|
+
- lib/indonesian_stemmer/irregular_words.rb
|
65
|
+
- lib/indonesian_stemmer/irregular_words/akhiran-i.txt
|
66
|
+
- lib/indonesian_stemmer/irregular_words/k.txt
|
67
|
+
- lib/indonesian_stemmer/irregular_words/kah.txt
|
68
|
+
- lib/indonesian_stemmer/irregular_words/ku.txt
|
69
|
+
- lib/indonesian_stemmer/irregular_words/lah.txt
|
70
|
+
- lib/indonesian_stemmer/irregular_words/mu.txt
|
71
|
+
- lib/indonesian_stemmer/irregular_words/n.txt
|
72
|
+
- lib/indonesian_stemmer/irregular_words/nya.txt
|
73
|
+
- lib/indonesian_stemmer/irregular_words/p.txt
|
74
|
+
- lib/indonesian_stemmer/irregular_words/pun.txt
|
75
|
+
- lib/indonesian_stemmer/irregular_words/r.txt
|
64
76
|
- lib/indonesian_stemmer/morphological_utility.rb
|
65
77
|
- lib/indonesian_stemmer/stemmer_utility.rb
|
66
78
|
- lib/indonesian_stemmer/version.rb
|
@@ -82,7 +94,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
82
94
|
version: '0'
|
83
95
|
segments:
|
84
96
|
- 0
|
85
|
-
hash:
|
97
|
+
hash: 550012699463393318
|
86
98
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
87
99
|
none: false
|
88
100
|
requirements:
|
@@ -91,7 +103,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
91
103
|
version: '0'
|
92
104
|
segments:
|
93
105
|
- 0
|
94
|
-
hash:
|
106
|
+
hash: 550012699463393318
|
95
107
|
requirements: []
|
96
108
|
rubyforge_project:
|
97
109
|
rubygems_version: 1.8.25
|