indonesian_stemmer 0.1.1 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/indonesian_stemmer/irregular_words.rb +51 -0
- data/lib/indonesian_stemmer/irregular_words/akhiran-i.txt +3508 -0
- data/lib/indonesian_stemmer/irregular_words/k.txt +734 -0
- data/lib/indonesian_stemmer/irregular_words/kah.txt +40 -0
- data/lib/indonesian_stemmer/irregular_words/ku.txt +28 -0
- data/lib/indonesian_stemmer/irregular_words/lah.txt +41 -0
- data/lib/indonesian_stemmer/irregular_words/mu.txt +8 -0
- data/lib/indonesian_stemmer/irregular_words/n.txt +96 -0
- data/lib/indonesian_stemmer/irregular_words/nya.txt +1 -0
- data/lib/indonesian_stemmer/irregular_words/p.txt +499 -0
- data/lib/indonesian_stemmer/irregular_words/pun.txt +6 -0
- data/lib/indonesian_stemmer/irregular_words/r.txt +527 -0
- data/lib/indonesian_stemmer/morphological_utility.rb +49 -79
- data/lib/indonesian_stemmer/version.rb +1 -1
- data/spec/indonesian_stemmer_spec.rb +2 -0
- data/spec/lib/indonesian_stemmer/morphological_utility_spec.rb +14 -0
- metadata +16 -4
@@ -1,4 +1,5 @@
|
|
1
1
|
require "indonesian_stemmer/stemmer_utility"
|
2
|
+
require "indonesian_stemmer/irregular_words"
|
2
3
|
|
3
4
|
module IndonesianStemmer
|
4
5
|
|
@@ -15,57 +16,6 @@ module IndonesianStemmer
|
|
15
16
|
SUFFIX_CHARACTERS = %w( kan an i )
|
16
17
|
WITH_VOWEL_SUBSTITUTION_PREFIX_CHARACTERS = %w( meny peny men pen )
|
17
18
|
|
18
|
-
IRREGULARS_FOR_WORDS_BEGINS_WITH_K = %w(
|
19
|
-
aget alah andung ata ejar eluar embali empis emuka ena enal encang endali ering
|
20
|
-
erja erut etahui etik ibar irim uasai uliti umpul unci unjung unyah upas urang )
|
21
|
-
|
22
|
-
IRREGULARS_FOR_WORDS_BEGINS_WITH_P = %w(
|
23
|
-
adam ahat akai amer anas ancang anggang anggil anjat antul asang asti atuhi
|
24
|
-
ecah ecat elihara eluk ercik eriksa erintah esan ikir ilah ilih injam inta
|
25
|
-
isah otong otret uja uji ukul usat utar-balik utus )
|
26
|
-
|
27
|
-
IRREGULARS_FOR_WORDS_BEGINS_WITH_N = %w( aas ada adi afi afsu aif aik akal akoda
|
28
|
-
alar ama anti arasi asab asib asional atif asehat asihat atural etral ikah )
|
29
|
-
|
30
|
-
IRREGULARS_FOR_WORDS_BEGINS_WITH_R = %w( aba abak aban abas abat abet abit
|
31
|
-
abuk abun abung abut acak acau acik acuh acun adah adai adak adang adiasi
|
32
|
-
adikal adio adu aga agam agas agi agu aguk ahap ahasia ahat ahim ahmat aih
|
33
|
-
aja ajah ajalela ajam ajang ajin ajuk ajut akap akat akit aksi akuk akus
|
34
|
-
akut akyat alat alip amah amahtamah amah-tamah amai amal ambah ambai ambak
|
35
|
-
amban ambang ambat ambeh ambu ambut amin ampai ampak ampang ampas ampat
|
36
|
-
amping ampok ampung ampus amu amus anap anca ancah ancak ancang ancap
|
37
|
-
ancu ancung anda andai andak andat andau andek anduk andung angah angai
|
38
|
-
angak anggah asa usak )
|
39
|
-
|
40
|
-
IRREGULAR_PREFIX_CHARACTERS_ON_WORDS = {
|
41
|
-
'meng' => IRREGULARS_FOR_WORDS_BEGINS_WITH_K,
|
42
|
-
'peng' => IRREGULARS_FOR_WORDS_BEGINS_WITH_K,
|
43
|
-
'mem' => IRREGULARS_FOR_WORDS_BEGINS_WITH_P,
|
44
|
-
'pem' => IRREGULARS_FOR_WORDS_BEGINS_WITH_P, }
|
45
|
-
|
46
|
-
IRREGULAR_WORDS_ENDS_WITH_COMMON_CHARACTERS = {
|
47
|
-
'kah' => %w( bengkah berkah bingkah bongkah cekah firkah halakah halkah
|
48
|
-
harakah ingkah jangkah jerkah kalah kekah kelakah kerakah kerkah
|
49
|
-
khalikah langkah lukah markah mukah musyarakah nafkah naskah
|
50
|
-
nikah pangkah rakah rekah rengkah sedekah sekah serakah serkah
|
51
|
-
sungkah takah tekah telingkah tingkah tongkah ),
|
52
|
-
|
53
|
-
'lah' => %w( balah belah beslah bilah celah galah islah istilah jumlah
|
54
|
-
kalah kelah kilah lalah lelah makalah malah masalah
|
55
|
-
muamalah mujadalah mukabalah olah onslah oplah pecahbelah
|
56
|
-
pecah-belah pilah milah sekolah rihlah risalah salah serlah
|
57
|
-
silsilah sudah sulalah telah tulah ulah uzlah walah wasilah ),
|
58
|
-
|
59
|
-
'pun' => %w( ampun depun himpun lapun rapun rumpun ),
|
60
|
-
|
61
|
-
'ku' => %w( awabeku baku bangku beku beluku biku buku ceku ciku cuku deku
|
62
|
-
jibaku kaku laku leku liku luku paku pangku peku perilaku saku
|
63
|
-
siku suku teleku terungku tungku waluku ),
|
64
|
-
|
65
|
-
'mu' => %w( ilmu jamu jemu kemu ramu selumu tamu temu ),
|
66
|
-
|
67
|
-
'nya' => %w( tanya ),
|
68
|
-
}
|
69
19
|
|
70
20
|
REMOVED_KE = 1
|
71
21
|
REMOVED_PENG = 2
|
@@ -143,6 +93,8 @@ module IndonesianStemmer
|
|
143
93
|
end
|
144
94
|
|
145
95
|
def remove_suffix(word)
|
96
|
+
return word if ambiguous_with_suffices_ending_words?(word)
|
97
|
+
|
146
98
|
@number_of_syllables ||= total_syllables(word)
|
147
99
|
|
148
100
|
SUFFIX_CHARACTERS.each do |character|
|
@@ -192,13 +144,12 @@ module IndonesianStemmer
|
|
192
144
|
|
193
145
|
def remove_characters_matching_collection(word, collection, position)
|
194
146
|
collection.each do |characters|
|
195
|
-
if
|
196
|
-
|
197
|
-
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
end
|
147
|
+
if match_position_and_not_ambiguous_with_characters?(word, characters, position)
|
148
|
+
next if characters == 'mem' && is_vowel?(word[characters.size])
|
149
|
+
@flags ||= collection_for(characters, 'removed')
|
150
|
+
reduce_syllable
|
151
|
+
slice_word_at_position(word, characters.size, position)
|
152
|
+
return word
|
202
153
|
end
|
203
154
|
end
|
204
155
|
|
@@ -211,32 +162,24 @@ module IndonesianStemmer
|
|
211
162
|
end
|
212
163
|
|
213
164
|
def remove_and_substitute_characters_matching_collection(word, collection, position)
|
214
|
-
word_size = word.size
|
215
165
|
collection.each do |characters|
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
|
221
|
-
|
222
|
-
|
223
|
-
|
224
|
-
reduce_syllable
|
225
|
-
word = substitute_word_character(word, characters)
|
226
|
-
slice_word_at_position( word,
|
227
|
-
characters_size-1,
|
228
|
-
:start )
|
229
|
-
return word
|
230
|
-
end
|
166
|
+
if matching_characters_requires_substitution?(word, characters, position)
|
167
|
+
@flags ||= collection_for(characters, 'removed')
|
168
|
+
reduce_syllable
|
169
|
+
word = substitute_word_character(word, characters)
|
170
|
+
slice_word_at_position( word,
|
171
|
+
characters.size-1,
|
172
|
+
:start )
|
173
|
+
return word
|
231
174
|
end
|
232
175
|
end
|
233
176
|
end
|
234
177
|
|
235
178
|
def contains_irregular_prefix?(word, characters)
|
236
|
-
if
|
179
|
+
if IrregularWords::ON_PREFIX_CHARACTERS.keys.include?(characters)
|
237
180
|
chopped_word_match_words_collection?(
|
238
181
|
word[characters.size, word.size],
|
239
|
-
|
182
|
+
IrregularWords::ON_PREFIX_CHARACTERS[characters] )
|
240
183
|
end
|
241
184
|
end
|
242
185
|
|
@@ -250,7 +193,7 @@ module IndonesianStemmer
|
|
250
193
|
's'
|
251
194
|
when %w(men pen).include?(characters)
|
252
195
|
(chopped_word_match_words_collection?(
|
253
|
-
word[characters.size, word.size],
|
196
|
+
word[characters.size, word.size], IrregularWords::BEGINS_WITH_N
|
254
197
|
)
|
255
198
|
)? 'n' : 't'
|
256
199
|
when %w(meng peng).include?(characters)
|
@@ -266,12 +209,12 @@ module IndonesianStemmer
|
|
266
209
|
if position == :start
|
267
210
|
if characters == 'per'
|
268
211
|
chopped_word_match_words_collection?(word[3..-1],
|
269
|
-
|
212
|
+
IrregularWords::BEGINS_WITH_R )
|
270
213
|
else
|
271
214
|
return false
|
272
215
|
end
|
273
216
|
else
|
274
|
-
|
217
|
+
IrregularWords::ENDS_WITH_COMMON_CHARACTERS[characters].any? do |ambiguous_word|
|
275
218
|
# To differentiate 'mobilmu' with 'berilmu'
|
276
219
|
return false unless %w(me be pe).include?(word[0,2])
|
277
220
|
# The rest is ok
|
@@ -280,6 +223,33 @@ module IndonesianStemmer
|
|
280
223
|
end
|
281
224
|
end
|
282
225
|
|
226
|
+
def ambiguous_with_suffices_ending_words?(word)
|
227
|
+
IrregularWords::ENDS_WITH_SUFFIX_CHARACTERS.include?(word)
|
228
|
+
end
|
229
|
+
|
230
|
+
def match_position_and_not_ambiguous_with_characters?(word, characters, position)
|
231
|
+
send("#{position}s_with?", word, word.size, characters) &&
|
232
|
+
!ambiguous_with_characters?(word, characters, position)
|
233
|
+
end
|
234
|
+
|
235
|
+
def match_characters_position_followed_by_vowel?(word, characters, position)
|
236
|
+
word_size = word.size
|
237
|
+
characters_size = characters.size
|
238
|
+
|
239
|
+
send("#{position}s_with?", word, word_size, characters) &&
|
240
|
+
word_size > characters_size && is_vowel?(word[characters_size])
|
241
|
+
end
|
242
|
+
|
243
|
+
def substitution_required?(word, characters)
|
244
|
+
WITH_VOWEL_SUBSTITUTION_PREFIX_CHARACTERS.include?(characters) ||
|
245
|
+
contains_irregular_prefix?(word, characters)
|
246
|
+
end
|
247
|
+
|
248
|
+
def matching_characters_requires_substitution?(word, characters, position)
|
249
|
+
match_characters_position_followed_by_vowel?(word, characters, position) &&
|
250
|
+
substitution_required?(word, characters)
|
251
|
+
end
|
252
|
+
|
283
253
|
def reduce_syllable
|
284
254
|
@number_of_syllables -= 1
|
285
255
|
end
|
@@ -263,6 +263,10 @@ describe IndonesianStemmer::MorphologicalUtility do
|
|
263
263
|
should_transform(:remove_first_order_prefix, 'membangun', 'bangun')
|
264
264
|
end
|
265
265
|
|
266
|
+
it "'mem' followed by vowel" do
|
267
|
+
should_transform(:remove_first_order_prefix, 'memilih', 'pilih')
|
268
|
+
end
|
269
|
+
|
266
270
|
it "'me'" do
|
267
271
|
should_transform(:remove_first_order_prefix, 'melukis', 'lukis')
|
268
272
|
end
|
@@ -630,5 +634,15 @@ describe IndonesianStemmer::MorphologicalUtility do
|
|
630
634
|
end
|
631
635
|
end
|
632
636
|
end
|
637
|
+
|
638
|
+
describe 'should not remove suffix characters for words ending with them' do
|
639
|
+
it "'kan'" do
|
640
|
+
should_not_transform(:remove_suffix, 'majikan')
|
641
|
+
end
|
642
|
+
|
643
|
+
it "'i'" do
|
644
|
+
should_not_transform(:remove_suffix, 'pandai')
|
645
|
+
end
|
646
|
+
end
|
633
647
|
end
|
634
648
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: indonesian_stemmer
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.1
|
4
|
+
version: 0.2.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2013-04-
|
12
|
+
date: 2013-04-24 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: rake
|
@@ -61,6 +61,18 @@ files:
|
|
61
61
|
- Rakefile
|
62
62
|
- indonesian_stemmer.gemspec
|
63
63
|
- lib/indonesian_stemmer.rb
|
64
|
+
- lib/indonesian_stemmer/irregular_words.rb
|
65
|
+
- lib/indonesian_stemmer/irregular_words/akhiran-i.txt
|
66
|
+
- lib/indonesian_stemmer/irregular_words/k.txt
|
67
|
+
- lib/indonesian_stemmer/irregular_words/kah.txt
|
68
|
+
- lib/indonesian_stemmer/irregular_words/ku.txt
|
69
|
+
- lib/indonesian_stemmer/irregular_words/lah.txt
|
70
|
+
- lib/indonesian_stemmer/irregular_words/mu.txt
|
71
|
+
- lib/indonesian_stemmer/irregular_words/n.txt
|
72
|
+
- lib/indonesian_stemmer/irregular_words/nya.txt
|
73
|
+
- lib/indonesian_stemmer/irregular_words/p.txt
|
74
|
+
- lib/indonesian_stemmer/irregular_words/pun.txt
|
75
|
+
- lib/indonesian_stemmer/irregular_words/r.txt
|
64
76
|
- lib/indonesian_stemmer/morphological_utility.rb
|
65
77
|
- lib/indonesian_stemmer/stemmer_utility.rb
|
66
78
|
- lib/indonesian_stemmer/version.rb
|
@@ -82,7 +94,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
82
94
|
version: '0'
|
83
95
|
segments:
|
84
96
|
- 0
|
85
|
-
hash:
|
97
|
+
hash: 550012699463393318
|
86
98
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
87
99
|
none: false
|
88
100
|
requirements:
|
@@ -91,7 +103,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
91
103
|
version: '0'
|
92
104
|
segments:
|
93
105
|
- 0
|
94
|
-
hash:
|
106
|
+
hash: 550012699463393318
|
95
107
|
requirements: []
|
96
108
|
rubyforge_project:
|
97
109
|
rubygems_version: 1.8.25
|