indonesian_stemmer 0.1.1 → 0.2.0

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
@@ -1,4 +1,5 @@
1
1
  require "indonesian_stemmer/stemmer_utility"
2
+ require "indonesian_stemmer/irregular_words"
2
3
 
3
4
  module IndonesianStemmer
4
5
 
@@ -15,57 +16,6 @@ module IndonesianStemmer
15
16
  SUFFIX_CHARACTERS = %w( kan an i )
16
17
  WITH_VOWEL_SUBSTITUTION_PREFIX_CHARACTERS = %w( meny peny men pen )
17
18
 
18
- IRREGULARS_FOR_WORDS_BEGINS_WITH_K = %w(
19
- aget alah andung ata ejar eluar embali empis emuka ena enal encang endali ering
20
- erja erut etahui etik ibar irim uasai uliti umpul unci unjung unyah upas urang )
21
-
22
- IRREGULARS_FOR_WORDS_BEGINS_WITH_P = %w(
23
- adam ahat akai amer anas ancang anggang anggil anjat antul asang asti atuhi
24
- ecah ecat elihara eluk ercik eriksa erintah esan ikir ilah ilih injam inta
25
- isah otong otret uja uji ukul usat utar-balik utus )
26
-
27
- IRREGULARS_FOR_WORDS_BEGINS_WITH_N = %w( aas ada adi afi afsu aif aik akal akoda
28
- alar ama anti arasi asab asib asional atif asehat asihat atural etral ikah )
29
-
30
- IRREGULARS_FOR_WORDS_BEGINS_WITH_R = %w( aba abak aban abas abat abet abit
31
- abuk abun abung abut acak acau acik acuh acun adah adai adak adang adiasi
32
- adikal adio adu aga agam agas agi agu aguk ahap ahasia ahat ahim ahmat aih
33
- aja ajah ajalela ajam ajang ajin ajuk ajut akap akat akit aksi akuk akus
34
- akut akyat alat alip amah amahtamah amah-tamah amai amal ambah ambai ambak
35
- amban ambang ambat ambeh ambu ambut amin ampai ampak ampang ampas ampat
36
- amping ampok ampung ampus amu amus anap anca ancah ancak ancang ancap
37
- ancu ancung anda andai andak andat andau andek anduk andung angah angai
38
- angak anggah asa usak )
39
-
40
- IRREGULAR_PREFIX_CHARACTERS_ON_WORDS = {
41
- 'meng' => IRREGULARS_FOR_WORDS_BEGINS_WITH_K,
42
- 'peng' => IRREGULARS_FOR_WORDS_BEGINS_WITH_K,
43
- 'mem' => IRREGULARS_FOR_WORDS_BEGINS_WITH_P,
44
- 'pem' => IRREGULARS_FOR_WORDS_BEGINS_WITH_P, }
45
-
46
- IRREGULAR_WORDS_ENDS_WITH_COMMON_CHARACTERS = {
47
- 'kah' => %w( bengkah berkah bingkah bongkah cekah firkah halakah halkah
48
- harakah ingkah jangkah jerkah kalah kekah kelakah kerakah kerkah
49
- khalikah langkah lukah markah mukah musyarakah nafkah naskah
50
- nikah pangkah rakah rekah rengkah sedekah sekah serakah serkah
51
- sungkah takah tekah telingkah tingkah tongkah ),
52
-
53
- 'lah' => %w( balah belah beslah bilah celah galah islah istilah jumlah
54
- kalah kelah kilah lalah lelah makalah malah masalah
55
- muamalah mujadalah mukabalah olah onslah oplah pecahbelah
56
- pecah-belah pilah milah sekolah rihlah risalah salah serlah
57
- silsilah sudah sulalah telah tulah ulah uzlah walah wasilah ),
58
-
59
- 'pun' => %w( ampun depun himpun lapun rapun rumpun ),
60
-
61
- 'ku' => %w( awabeku baku bangku beku beluku biku buku ceku ciku cuku deku
62
- jibaku kaku laku leku liku luku paku pangku peku perilaku saku
63
- siku suku teleku terungku tungku waluku ),
64
-
65
- 'mu' => %w( ilmu jamu jemu kemu ramu selumu tamu temu ),
66
-
67
- 'nya' => %w( tanya ),
68
- }
69
19
 
70
20
  REMOVED_KE = 1
71
21
  REMOVED_PENG = 2
@@ -143,6 +93,8 @@ module IndonesianStemmer
143
93
  end
144
94
 
145
95
  def remove_suffix(word)
96
+ return word if ambiguous_with_suffices_ending_words?(word)
97
+
146
98
  @number_of_syllables ||= total_syllables(word)
147
99
 
148
100
  SUFFIX_CHARACTERS.each do |character|
@@ -192,13 +144,12 @@ module IndonesianStemmer
192
144
 
193
145
  def remove_characters_matching_collection(word, collection, position)
194
146
  collection.each do |characters|
195
- if send("#{position}s_with?", word, word.size, characters)
196
- unless ambiguous_with_characters?(word, characters, position)
197
- @flags ||= collection_for(characters, 'removed')
198
- reduce_syllable
199
- slice_word_at_position(word, characters.size, position)
200
- return word
201
- end
147
+ if match_position_and_not_ambiguous_with_characters?(word, characters, position)
148
+ next if characters == 'mem' && is_vowel?(word[characters.size])
149
+ @flags ||= collection_for(characters, 'removed')
150
+ reduce_syllable
151
+ slice_word_at_position(word, characters.size, position)
152
+ return word
202
153
  end
203
154
  end
204
155
 
@@ -211,32 +162,24 @@ module IndonesianStemmer
211
162
  end
212
163
 
213
164
  def remove_and_substitute_characters_matching_collection(word, collection, position)
214
- word_size = word.size
215
165
  collection.each do |characters|
216
- characters_size = characters.size
217
- if send("#{position}s_with?", word, word_size, characters) &&
218
- word_size > characters_size && is_vowel?(word[characters_size])
219
-
220
- if WITH_VOWEL_SUBSTITUTION_PREFIX_CHARACTERS.include?(characters) ||
221
- contains_irregular_prefix?(word, characters)
222
-
223
- @flags ||= collection_for(characters, 'removed')
224
- reduce_syllable
225
- word = substitute_word_character(word, characters)
226
- slice_word_at_position( word,
227
- characters_size-1,
228
- :start )
229
- return word
230
- end
166
+ if matching_characters_requires_substitution?(word, characters, position)
167
+ @flags ||= collection_for(characters, 'removed')
168
+ reduce_syllable
169
+ word = substitute_word_character(word, characters)
170
+ slice_word_at_position( word,
171
+ characters.size-1,
172
+ :start )
173
+ return word
231
174
  end
232
175
  end
233
176
  end
234
177
 
235
178
  def contains_irregular_prefix?(word, characters)
236
- if IRREGULAR_PREFIX_CHARACTERS_ON_WORDS.keys.include?(characters)
179
+ if IrregularWords::ON_PREFIX_CHARACTERS.keys.include?(characters)
237
180
  chopped_word_match_words_collection?(
238
181
  word[characters.size, word.size],
239
- IRREGULAR_PREFIX_CHARACTERS_ON_WORDS[characters] )
182
+ IrregularWords::ON_PREFIX_CHARACTERS[characters] )
240
183
  end
241
184
  end
242
185
 
@@ -250,7 +193,7 @@ module IndonesianStemmer
250
193
  's'
251
194
  when %w(men pen).include?(characters)
252
195
  (chopped_word_match_words_collection?(
253
- word[characters.size, word.size], IRREGULARS_FOR_WORDS_BEGINS_WITH_N
196
+ word[characters.size, word.size], IrregularWords::BEGINS_WITH_N
254
197
  )
255
198
  )? 'n' : 't'
256
199
  when %w(meng peng).include?(characters)
@@ -266,12 +209,12 @@ module IndonesianStemmer
266
209
  if position == :start
267
210
  if characters == 'per'
268
211
  chopped_word_match_words_collection?(word[3..-1],
269
- IRREGULARS_FOR_WORDS_BEGINS_WITH_R )
212
+ IrregularWords::BEGINS_WITH_R )
270
213
  else
271
214
  return false
272
215
  end
273
216
  else
274
- IRREGULAR_WORDS_ENDS_WITH_COMMON_CHARACTERS[characters].any? do |ambiguous_word|
217
+ IrregularWords::ENDS_WITH_COMMON_CHARACTERS[characters].any? do |ambiguous_word|
275
218
  # To differentiate 'mobilmu' with 'berilmu'
276
219
  return false unless %w(me be pe).include?(word[0,2])
277
220
  # The rest is ok
@@ -280,6 +223,33 @@ module IndonesianStemmer
280
223
  end
281
224
  end
282
225
 
226
+ def ambiguous_with_suffices_ending_words?(word)
227
+ IrregularWords::ENDS_WITH_SUFFIX_CHARACTERS.include?(word)
228
+ end
229
+
230
+ def match_position_and_not_ambiguous_with_characters?(word, characters, position)
231
+ send("#{position}s_with?", word, word.size, characters) &&
232
+ !ambiguous_with_characters?(word, characters, position)
233
+ end
234
+
235
+ def match_characters_position_followed_by_vowel?(word, characters, position)
236
+ word_size = word.size
237
+ characters_size = characters.size
238
+
239
+ send("#{position}s_with?", word, word_size, characters) &&
240
+ word_size > characters_size && is_vowel?(word[characters_size])
241
+ end
242
+
243
+ def substitution_required?(word, characters)
244
+ WITH_VOWEL_SUBSTITUTION_PREFIX_CHARACTERS.include?(characters) ||
245
+ contains_irregular_prefix?(word, characters)
246
+ end
247
+
248
+ def matching_characters_requires_substitution?(word, characters, position)
249
+ match_characters_position_followed_by_vowel?(word, characters, position) &&
250
+ substitution_required?(word, characters)
251
+ end
252
+
283
253
  def reduce_syllable
284
254
  @number_of_syllables -= 1
285
255
  end
@@ -1,3 +1,3 @@
1
1
  module IndonesianStemmer
2
- VERSION = "0.1.1"
2
+ VERSION = "0.2.0"
3
3
  end
@@ -68,6 +68,8 @@ describe IndonesianStemmer do
68
68
  describe "'me'" do
69
69
  it { should_stem 'merusak', 'rusak'}
70
70
  it { should_stem 'melayang', 'layang'}
71
+ it { should_stem 'memasak', 'masak'}
72
+ it { should_stem 'memandikan', 'mandi'}
71
73
  end
72
74
 
73
75
  describe "'peng'" do
@@ -263,6 +263,10 @@ describe IndonesianStemmer::MorphologicalUtility do
263
263
  should_transform(:remove_first_order_prefix, 'membangun', 'bangun')
264
264
  end
265
265
 
266
+ it "'mem' followed by vowel" do
267
+ should_transform(:remove_first_order_prefix, 'memilih', 'pilih')
268
+ end
269
+
266
270
  it "'me'" do
267
271
  should_transform(:remove_first_order_prefix, 'melukis', 'lukis')
268
272
  end
@@ -630,5 +634,15 @@ describe IndonesianStemmer::MorphologicalUtility do
630
634
  end
631
635
  end
632
636
  end
637
+
638
+ describe 'should not remove suffix characters for words ending with them' do
639
+ it "'kan'" do
640
+ should_not_transform(:remove_suffix, 'majikan')
641
+ end
642
+
643
+ it "'i'" do
644
+ should_not_transform(:remove_suffix, 'pandai')
645
+ end
646
+ end
633
647
  end
634
648
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: indonesian_stemmer
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 0.2.0
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2013-04-07 00:00:00.000000000 Z
12
+ date: 2013-04-24 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: rake
@@ -61,6 +61,18 @@ files:
61
61
  - Rakefile
62
62
  - indonesian_stemmer.gemspec
63
63
  - lib/indonesian_stemmer.rb
64
+ - lib/indonesian_stemmer/irregular_words.rb
65
+ - lib/indonesian_stemmer/irregular_words/akhiran-i.txt
66
+ - lib/indonesian_stemmer/irregular_words/k.txt
67
+ - lib/indonesian_stemmer/irregular_words/kah.txt
68
+ - lib/indonesian_stemmer/irregular_words/ku.txt
69
+ - lib/indonesian_stemmer/irregular_words/lah.txt
70
+ - lib/indonesian_stemmer/irregular_words/mu.txt
71
+ - lib/indonesian_stemmer/irregular_words/n.txt
72
+ - lib/indonesian_stemmer/irregular_words/nya.txt
73
+ - lib/indonesian_stemmer/irregular_words/p.txt
74
+ - lib/indonesian_stemmer/irregular_words/pun.txt
75
+ - lib/indonesian_stemmer/irregular_words/r.txt
64
76
  - lib/indonesian_stemmer/morphological_utility.rb
65
77
  - lib/indonesian_stemmer/stemmer_utility.rb
66
78
  - lib/indonesian_stemmer/version.rb
@@ -82,7 +94,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
82
94
  version: '0'
83
95
  segments:
84
96
  - 0
85
- hash: 2800268474079069831
97
+ hash: 550012699463393318
86
98
  required_rubygems_version: !ruby/object:Gem::Requirement
87
99
  none: false
88
100
  requirements:
@@ -91,7 +103,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
91
103
  version: '0'
92
104
  segments:
93
105
  - 0
94
- hash: 2800268474079069831
106
+ hash: 550012699463393318
95
107
  requirements: []
96
108
  rubyforge_project:
97
109
  rubygems_version: 1.8.25