indonesian_stemmer 0.0.1 → 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
data/README.md
CHANGED
@@ -1,6 +1,13 @@
|
|
1
1
|
# IndonesianStemmer
|
2
2
|
|
3
|
-
|
3
|
+
[![Gem Version](https://badge.fury.io/rb/indonesian_stemmer.png)](http://badge.fury.io/rb/indonesian_stemmer)
|
4
|
+
[![Build Status](https://secure.travis-ci.org/apraditya/indonesian_stemmer.png)](http://travis-ci.org/apraditya/indonesian_stemmer)
|
5
|
+
[![Dependency Status](https://gemnasium.com/apraditya/indonesian_stemmer.png)](https://gemnasium.com/apraditya/indonesian_stemmer)
|
6
|
+
[![Code Climate](https://codeclimate.com/github/apraditya/indonesian_stemmer.png)](https://codeclimate.com/github/apraditya/indonesian_stemmer)
|
7
|
+
|
8
|
+
|
9
|
+
|
10
|
+
Stems Indonesian words based on the Porter Stemmer, using the algorithm presented in [**A Study of Stemming Effects on Information Retrieval in Bahasa Indonesia**](http://www.illc.uva.nl/Publications/ResearchReports/MoL-2003-02.text.pdf) by Fadillah Z Tala.
|
4
11
|
|
5
12
|
## Installation
|
6
13
|
|
@@ -18,9 +25,28 @@ Or install it yourself as:
|
|
18
25
|
|
19
26
|
## Usage
|
20
27
|
|
21
|
-
|
28
|
+
require 'rubygems'
|
29
|
+
require 'indonesian_stemmer'
|
30
|
+
|
31
|
+
IndonesianStemmer.stem('mendengarkan') # => "dengar"
|
32
|
+
'beriman'.stem # => "iman"
|
33
|
+
|
34
|
+
## Known Problems
|
35
|
+
This gem is in active development, so don't rely on it yet for your analysis or data-mining projects. Currently there are no known problems stemming Indonesian words. Please [submit a ticket](https://github.com/apraditya/indonesian_stemmer/issues/new) if you find one.
|
36
|
+
|
22
37
|
|
23
38
|
## Contributing
|
39
|
+
Initially, this gem was based on [Apache Lucene](http://lucene.apache.org/). Currently it's just a Ruby port of Lucene's analyzer for Indonesian. Lucene's stemmer library only analyzes the word length, so some modifications were added in order to get the actual stemmed word. Feel free to download Lucene's source code; the relevant files are under `analysis/common/src/java/org/apache/lucene/analysis/id/`.
|
40
|
+
|
41
|
+
### References
|
42
|
+
Some references to help your contribution:
|
43
|
+
|
44
|
+
1. [The Official Kamus Bahasa Indonesia](http://bahasa.kemdiknas.go.id/kbbi/index.php)
|
45
|
+
2. To search for Indonesian words and their roots, use the [Unofficial Kamus Besar Bahasa Indonesia](http://www.kamusbesar.com/)
|
46
|
+
3. Wikipedia's [Prefiks dalam Bahasa Indonesia](http://id.wikipedia.org/wiki/Prefiks_dalam_bahasa_Indonesia)
|
47
|
+
|
48
|
+
|
49
|
+
### Steps
|
24
50
|
|
25
51
|
1. Fork it
|
26
52
|
2. Create your feature branch (`git checkout -b my-new-feature`)
|
@@ -7,12 +7,65 @@ module IndonesianStemmer
|
|
7
7
|
POSSESSIVE_PRONOUN_CHARACTERS = %w( ku mu nya )
|
8
8
|
FIRST_ORDER_PREFIX_CHARACTERS = %w( meng meny men mem me
|
9
9
|
peng peny pen pem di ter ke )
|
10
|
-
SPECIAL_FIRST_ORDER_PREFIX_CHARACTERS = %w( meny peny pen
|
10
|
+
SPECIAL_FIRST_ORDER_PREFIX_CHARACTERS = %w( meng peng meny peny men pen
|
11
|
+
mem pem )
|
11
12
|
SECOND_ORDER_PREFIX_CHARACTERS = %w( ber be per pe )
|
12
|
-
SPECIAL_SECOND_ORDER_PREFIX_CHARACTERS = %w( be )
|
13
13
|
NON_SPECIAL_SECOND_ORDER_PREFIX_CHARACTERS = %w( ber per pe )
|
14
14
|
SPECIAL_SECOND_ORDER_PREFIX_WORDS = %w( belajar pelajar belunjur )
|
15
15
|
SUFFIX_CHARACTERS = %w( kan an i )
|
16
|
+
WITH_VOWEL_SUBSTITUTION_PREFIX_CHARACTERS = %w( meny peny men pen )
|
17
|
+
|
18
|
+
IRREGULARS_FOR_WORDS_BEGINS_WITH_K = %w(
|
19
|
+
aget alah andung ata ejar eluar embali empis emuka ena enal encang endali ering
|
20
|
+
erja erut etahui etik ibar irim uasai uliti umpul unci unjung unyah upas urang )
|
21
|
+
|
22
|
+
IRREGULARS_FOR_WORDS_BEGINS_WITH_P = %w(
|
23
|
+
adam ahat akai amer anas ancang anggang anggil anjat antul asang asti atuhi
|
24
|
+
ecah ecat elihara eluk ercik eriksa erintah esan ikir ilah ilih injam inta
|
25
|
+
isah otong otret uja uji ukul usat utar-balik utus )
|
26
|
+
|
27
|
+
IRREGULARS_FOR_WORDS_BEGINS_WITH_N = %w( aas ada adi afi afsu aif aik akal akoda
|
28
|
+
alar ama anti arasi asab asib asional atif asehat asihat atural etral ikah )
|
29
|
+
|
30
|
+
IRREGULARS_FOR_WORDS_BEGINS_WITH_R = %w( aba abak aban abas abat abet abit
|
31
|
+
abuk abun abung abut acak acau acik acuh acun adah adai adak adang adiasi
|
32
|
+
adikal adio adu aga agam agas agi agu aguk ahap ahasia ahat ahim ahmat aih
|
33
|
+
aja ajah ajalela ajam ajang ajin ajuk ajut akap akat akit aksi akuk akus
|
34
|
+
akut akyat alat alip amah amahtamah amah-tamah amai amal ambah ambai ambak
|
35
|
+
amban ambang ambat ambeh ambu ambut amin ampai ampak ampang ampas ampat
|
36
|
+
amping ampok ampung ampus amu amus anap anca ancah ancak ancang ancap
|
37
|
+
ancu ancung anda andai andak andat andau andek anduk andung angah angai
|
38
|
+
angak anggah asa usak )
|
39
|
+
|
40
|
+
IRREGULAR_PREFIX_CHARACTERS_ON_WORDS = {
|
41
|
+
'meng' => IRREGULARS_FOR_WORDS_BEGINS_WITH_K,
|
42
|
+
'peng' => IRREGULARS_FOR_WORDS_BEGINS_WITH_K,
|
43
|
+
'mem' => IRREGULARS_FOR_WORDS_BEGINS_WITH_P,
|
44
|
+
'pem' => IRREGULARS_FOR_WORDS_BEGINS_WITH_P, }
|
45
|
+
|
46
|
+
IRREGULAR_WORDS_ENDS_WITH_COMMON_CHARACTERS = {
|
47
|
+
'kah' => %w( bengkah berkah bingkah bongkah cekah firkah halakah halkah
|
48
|
+
harakah ingkah jangkah jerkah kalah kekah kelakah kerakah kerkah
|
49
|
+
khalikah langkah lukah markah mukah musyarakah nafkah naskah
|
50
|
+
nikah pangkah rakah rekah rengkah sedekah sekah serakah serkah
|
51
|
+
sungkah takah tekah telingkah tingkah tongkah ),
|
52
|
+
|
53
|
+
'lah' => %w( balah belah beslah bilah celah galah islah istilah jumlah
|
54
|
+
kalah kelah kilah lalah lelah makalah malah masalah
|
55
|
+
muamalah mujadalah mukabalah olah onslah oplah pecahbelah
|
56
|
+
pecah-belah pilah milah sekolah rihlah risalah salah serlah
|
57
|
+
silsilah sudah sulalah telah tulah ulah uzlah walah wasilah ),
|
58
|
+
|
59
|
+
'pun' => %w( ampun depun himpun lapun rapun rumpun ),
|
60
|
+
|
61
|
+
'ku' => %w( awabeku baku bangku beku beluku biku buku ceku ciku cuku deku
|
62
|
+
jibaku kaku laku leku liku luku paku pangku peku perilaku saku
|
63
|
+
siku suku teleku terungku tungku waluku ),
|
64
|
+
|
65
|
+
'mu' => %w( ilmu jamu jemu kemu ramu selumu tamu temu ),
|
66
|
+
|
67
|
+
'nya' => %w( tanya ),
|
68
|
+
}
|
16
69
|
|
17
70
|
REMOVED_KE = 1
|
18
71
|
REMOVED_PENG = 2
|
@@ -56,19 +109,10 @@ module IndonesianStemmer
|
|
56
109
|
def remove_first_order_prefix(word)
|
57
110
|
@number_of_syllables ||= total_syllables(word)
|
58
111
|
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
@flags ||= collection_for(characters, 'removed')
|
64
|
-
reduce_syllable
|
65
|
-
word = substitute_word_character(word, characters)
|
66
|
-
slice_word_at_position( word,
|
67
|
-
characters_size-1,
|
68
|
-
:start )
|
69
|
-
return word
|
70
|
-
end
|
71
|
-
end
|
112
|
+
previous_word = word.dup
|
113
|
+
remove_and_substitute_characters_matching_collection(
|
114
|
+
word, collection_for(:special_first_order_prefix), :start )
|
115
|
+
return word if previous_word != word
|
72
116
|
|
73
117
|
remove_characters_matching_collection( word,
|
74
118
|
collection_for(:first_order_prefix),
|
@@ -149,10 +193,12 @@ module IndonesianStemmer
|
|
149
193
|
def remove_characters_matching_collection(word, collection, position)
|
150
194
|
collection.each do |characters|
|
151
195
|
if send("#{position}s_with?", word, word.size, characters)
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
196
|
+
unless ambiguous_with_characters?(word, characters, position)
|
197
|
+
@flags ||= collection_for(characters, 'removed')
|
198
|
+
reduce_syllable
|
199
|
+
slice_word_at_position(word, characters.size, position)
|
200
|
+
return word
|
201
|
+
end
|
156
202
|
end
|
157
203
|
end
|
158
204
|
|
@@ -164,17 +210,76 @@ module IndonesianStemmer
|
|
164
210
|
word.slice!( multiplier*characters_size, characters_size)
|
165
211
|
end
|
166
212
|
|
213
|
+
def remove_and_substitute_characters_matching_collection(word, collection, position)
|
214
|
+
word_size = word.size
|
215
|
+
collection.each do |characters|
|
216
|
+
characters_size = characters.size
|
217
|
+
if send("#{position}s_with?", word, word_size, characters) &&
|
218
|
+
word_size > characters_size && is_vowel?(word[characters_size])
|
219
|
+
|
220
|
+
if WITH_VOWEL_SUBSTITUTION_PREFIX_CHARACTERS.include?(characters) ||
|
221
|
+
contains_irregular_prefix?(word, characters)
|
222
|
+
|
223
|
+
@flags ||= collection_for(characters, 'removed')
|
224
|
+
reduce_syllable
|
225
|
+
word = substitute_word_character(word, characters)
|
226
|
+
slice_word_at_position( word,
|
227
|
+
characters_size-1,
|
228
|
+
:start )
|
229
|
+
return word
|
230
|
+
end
|
231
|
+
end
|
232
|
+
end
|
233
|
+
end
|
234
|
+
|
235
|
+
def contains_irregular_prefix?(word, characters)
|
236
|
+
if IRREGULAR_PREFIX_CHARACTERS_ON_WORDS.keys.include?(characters)
|
237
|
+
chopped_word_match_words_collection?(
|
238
|
+
word[characters.size, word.size],
|
239
|
+
IRREGULAR_PREFIX_CHARACTERS_ON_WORDS[characters] )
|
240
|
+
end
|
241
|
+
end
|
242
|
+
|
243
|
+
def chopped_word_match_words_collection?(chopped_word, collection)
|
244
|
+
collection.any? { |w| starts_with?(chopped_word, chopped_word.size, w) }
|
245
|
+
end
|
246
|
+
|
167
247
|
def substitute_word_character(word, characters)
|
168
248
|
substitute_char = case
|
169
249
|
when %w(meny peny).include?(characters)
|
170
250
|
's'
|
171
|
-
when
|
172
|
-
|
251
|
+
when %w(men pen).include?(characters)
|
252
|
+
(chopped_word_match_words_collection?(
|
253
|
+
word[characters.size, word.size], IRREGULARS_FOR_WORDS_BEGINS_WITH_N
|
254
|
+
)
|
255
|
+
)? 'n' : 't'
|
256
|
+
when %w(meng peng).include?(characters)
|
257
|
+
'k'
|
258
|
+
when %w(mem pem).include?(characters)
|
259
|
+
'p'
|
173
260
|
end
|
174
261
|
word[characters.size-1] = substitute_char if substitute_char
|
175
262
|
word
|
176
263
|
end
|
177
264
|
|
265
|
+
def ambiguous_with_characters?(word, characters, position)
|
266
|
+
if position == :start
|
267
|
+
if characters == 'per'
|
268
|
+
chopped_word_match_words_collection?(word[3..-1],
|
269
|
+
IRREGULARS_FOR_WORDS_BEGINS_WITH_R )
|
270
|
+
else
|
271
|
+
return false
|
272
|
+
end
|
273
|
+
else
|
274
|
+
IRREGULAR_WORDS_ENDS_WITH_COMMON_CHARACTERS[characters].any? do |ambiguous_word|
|
275
|
+
# To differentiate 'mobilmu' with 'berilmu'
|
276
|
+
return false unless %w(me be pe).include?(word[0,2])
|
277
|
+
# The rest is ok
|
278
|
+
ends_with?(word, word.size, ambiguous_word)
|
279
|
+
end
|
280
|
+
end
|
281
|
+
end
|
282
|
+
|
178
283
|
def reduce_syllable
|
179
284
|
@number_of_syllables -= 1
|
180
285
|
end
|
@@ -581,6 +581,10 @@ describe IndonesianStemmer::MorphologicalUtility do
|
|
581
581
|
|
582
582
|
describe '#remove_suffix' do
|
583
583
|
describe "words with these suffix characters" do
|
584
|
+
before do
|
585
|
+
IndonesianStemmer.instance_variable_set("@flags", 0)
|
586
|
+
end
|
587
|
+
|
584
588
|
describe "at the end of the word, should remove the suffix characters" do
|
585
589
|
it "'kan'" do
|
586
590
|
should_transform(:remove_suffix, 'katakan', 'kata')
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: indonesian_stemmer
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0
|
4
|
+
version: 0.1.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2013-
|
12
|
+
date: 2013-04-04 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: rake
|
@@ -82,7 +82,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
82
82
|
version: '0'
|
83
83
|
segments:
|
84
84
|
- 0
|
85
|
-
hash:
|
85
|
+
hash: -3033082141051403298
|
86
86
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
87
87
|
none: false
|
88
88
|
requirements:
|
@@ -91,7 +91,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
91
91
|
version: '0'
|
92
92
|
segments:
|
93
93
|
- 0
|
94
|
-
hash:
|
94
|
+
hash: -3033082141051403298
|
95
95
|
requirements: []
|
96
96
|
rubyforge_project:
|
97
97
|
rubygems_version: 1.8.25
|