indonesian_stemmer 0.1.1 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,4 +1,5 @@
1
1
  require "indonesian_stemmer/stemmer_utility"
2
+ require "indonesian_stemmer/irregular_words"
2
3
 
3
4
  module IndonesianStemmer
4
5
 
@@ -15,57 +16,6 @@ module IndonesianStemmer
15
16
  SUFFIX_CHARACTERS = %w( kan an i )
16
17
  WITH_VOWEL_SUBSTITUTION_PREFIX_CHARACTERS = %w( meny peny men pen )
17
18
 
18
- IRREGULARS_FOR_WORDS_BEGINS_WITH_K = %w(
19
- aget alah andung ata ejar eluar embali empis emuka ena enal encang endali ering
20
- erja erut etahui etik ibar irim uasai uliti umpul unci unjung unyah upas urang )
21
-
22
- IRREGULARS_FOR_WORDS_BEGINS_WITH_P = %w(
23
- adam ahat akai amer anas ancang anggang anggil anjat antul asang asti atuhi
24
- ecah ecat elihara eluk ercik eriksa erintah esan ikir ilah ilih injam inta
25
- isah otong otret uja uji ukul usat utar-balik utus )
26
-
27
- IRREGULARS_FOR_WORDS_BEGINS_WITH_N = %w( aas ada adi afi afsu aif aik akal akoda
28
- alar ama anti arasi asab asib asional atif asehat asihat atural etral ikah )
29
-
30
- IRREGULARS_FOR_WORDS_BEGINS_WITH_R = %w( aba abak aban abas abat abet abit
31
- abuk abun abung abut acak acau acik acuh acun adah adai adak adang adiasi
32
- adikal adio adu aga agam agas agi agu aguk ahap ahasia ahat ahim ahmat aih
33
- aja ajah ajalela ajam ajang ajin ajuk ajut akap akat akit aksi akuk akus
34
- akut akyat alat alip amah amahtamah amah-tamah amai amal ambah ambai ambak
35
- amban ambang ambat ambeh ambu ambut amin ampai ampak ampang ampas ampat
36
- amping ampok ampung ampus amu amus anap anca ancah ancak ancang ancap
37
- ancu ancung anda andai andak andat andau andek anduk andung angah angai
38
- angak anggah asa usak )
39
-
40
- IRREGULAR_PREFIX_CHARACTERS_ON_WORDS = {
41
- 'meng' => IRREGULARS_FOR_WORDS_BEGINS_WITH_K,
42
- 'peng' => IRREGULARS_FOR_WORDS_BEGINS_WITH_K,
43
- 'mem' => IRREGULARS_FOR_WORDS_BEGINS_WITH_P,
44
- 'pem' => IRREGULARS_FOR_WORDS_BEGINS_WITH_P, }
45
-
46
- IRREGULAR_WORDS_ENDS_WITH_COMMON_CHARACTERS = {
47
- 'kah' => %w( bengkah berkah bingkah bongkah cekah firkah halakah halkah
48
- harakah ingkah jangkah jerkah kalah kekah kelakah kerakah kerkah
49
- khalikah langkah lukah markah mukah musyarakah nafkah naskah
50
- nikah pangkah rakah rekah rengkah sedekah sekah serakah serkah
51
- sungkah takah tekah telingkah tingkah tongkah ),
52
-
53
- 'lah' => %w( balah belah beslah bilah celah galah islah istilah jumlah
54
- kalah kelah kilah lalah lelah makalah malah masalah
55
- muamalah mujadalah mukabalah olah onslah oplah pecahbelah
56
- pecah-belah pilah milah sekolah rihlah risalah salah serlah
57
- silsilah sudah sulalah telah tulah ulah uzlah walah wasilah ),
58
-
59
- 'pun' => %w( ampun depun himpun lapun rapun rumpun ),
60
-
61
- 'ku' => %w( awabeku baku bangku beku beluku biku buku ceku ciku cuku deku
62
- jibaku kaku laku leku liku luku paku pangku peku perilaku saku
63
- siku suku teleku terungku tungku waluku ),
64
-
65
- 'mu' => %w( ilmu jamu jemu kemu ramu selumu tamu temu ),
66
-
67
- 'nya' => %w( tanya ),
68
- }
69
19
 
70
20
  REMOVED_KE = 1
71
21
  REMOVED_PENG = 2
@@ -143,6 +93,8 @@ module IndonesianStemmer
143
93
  end
144
94
 
145
95
  def remove_suffix(word)
96
+ return word if ambiguous_with_suffices_ending_words?(word)
97
+
146
98
  @number_of_syllables ||= total_syllables(word)
147
99
 
148
100
  SUFFIX_CHARACTERS.each do |character|
@@ -192,13 +144,12 @@ module IndonesianStemmer
192
144
 
193
145
  def remove_characters_matching_collection(word, collection, position)
194
146
  collection.each do |characters|
195
- if send("#{position}s_with?", word, word.size, characters)
196
- unless ambiguous_with_characters?(word, characters, position)
197
- @flags ||= collection_for(characters, 'removed')
198
- reduce_syllable
199
- slice_word_at_position(word, characters.size, position)
200
- return word
201
- end
147
+ if match_position_and_not_ambiguous_with_characters?(word, characters, position)
148
+ next if characters == 'mem' && is_vowel?(word[characters.size])
149
+ @flags ||= collection_for(characters, 'removed')
150
+ reduce_syllable
151
+ slice_word_at_position(word, characters.size, position)
152
+ return word
202
153
  end
203
154
  end
204
155
 
@@ -211,32 +162,24 @@ module IndonesianStemmer
211
162
  end
212
163
 
213
164
  def remove_and_substitute_characters_matching_collection(word, collection, position)
214
- word_size = word.size
215
165
  collection.each do |characters|
216
- characters_size = characters.size
217
- if send("#{position}s_with?", word, word_size, characters) &&
218
- word_size > characters_size && is_vowel?(word[characters_size])
219
-
220
- if WITH_VOWEL_SUBSTITUTION_PREFIX_CHARACTERS.include?(characters) ||
221
- contains_irregular_prefix?(word, characters)
222
-
223
- @flags ||= collection_for(characters, 'removed')
224
- reduce_syllable
225
- word = substitute_word_character(word, characters)
226
- slice_word_at_position( word,
227
- characters_size-1,
228
- :start )
229
- return word
230
- end
166
+ if matching_characters_requires_substitution?(word, characters, position)
167
+ @flags ||= collection_for(characters, 'removed')
168
+ reduce_syllable
169
+ word = substitute_word_character(word, characters)
170
+ slice_word_at_position( word,
171
+ characters.size-1,
172
+ :start )
173
+ return word
231
174
  end
232
175
  end
233
176
  end
234
177
 
235
178
  def contains_irregular_prefix?(word, characters)
236
- if IRREGULAR_PREFIX_CHARACTERS_ON_WORDS.keys.include?(characters)
179
+ if IrregularWords::ON_PREFIX_CHARACTERS.keys.include?(characters)
237
180
  chopped_word_match_words_collection?(
238
181
  word[characters.size, word.size],
239
- IRREGULAR_PREFIX_CHARACTERS_ON_WORDS[characters] )
182
+ IrregularWords::ON_PREFIX_CHARACTERS[characters] )
240
183
  end
241
184
  end
242
185
 
@@ -250,7 +193,7 @@ module IndonesianStemmer
250
193
  's'
251
194
  when %w(men pen).include?(characters)
252
195
  (chopped_word_match_words_collection?(
253
- word[characters.size, word.size], IRREGULARS_FOR_WORDS_BEGINS_WITH_N
196
+ word[characters.size, word.size], IrregularWords::BEGINS_WITH_N
254
197
  )
255
198
  )? 'n' : 't'
256
199
  when %w(meng peng).include?(characters)
@@ -266,12 +209,12 @@ module IndonesianStemmer
266
209
  if position == :start
267
210
  if characters == 'per'
268
211
  chopped_word_match_words_collection?(word[3..-1],
269
- IRREGULARS_FOR_WORDS_BEGINS_WITH_R )
212
+ IrregularWords::BEGINS_WITH_R )
270
213
  else
271
214
  return false
272
215
  end
273
216
  else
274
- IRREGULAR_WORDS_ENDS_WITH_COMMON_CHARACTERS[characters].any? do |ambiguous_word|
217
+ IrregularWords::ENDS_WITH_COMMON_CHARACTERS[characters].any? do |ambiguous_word|
275
218
  # To differentiate 'mobilmu' from 'berilmu'
276
219
  return false unless %w(me be pe).include?(word[0,2])
277
220
  # The rest is ok
@@ -280,6 +223,33 @@ module IndonesianStemmer
280
223
  end
281
224
  end
282
225
 
226
+ def ambiguous_with_suffices_ending_words?(word)
227
+ IrregularWords::ENDS_WITH_SUFFIX_CHARACTERS.include?(word)
228
+ end
229
+
230
+ def match_position_and_not_ambiguous_with_characters?(word, characters, position)
231
+ send("#{position}s_with?", word, word.size, characters) &&
232
+ !ambiguous_with_characters?(word, characters, position)
233
+ end
234
+
235
+ def match_characters_position_followed_by_vowel?(word, characters, position)
236
+ word_size = word.size
237
+ characters_size = characters.size
238
+
239
+ send("#{position}s_with?", word, word_size, characters) &&
240
+ word_size > characters_size && is_vowel?(word[characters_size])
241
+ end
242
+
243
+ def substitution_required?(word, characters)
244
+ WITH_VOWEL_SUBSTITUTION_PREFIX_CHARACTERS.include?(characters) ||
245
+ contains_irregular_prefix?(word, characters)
246
+ end
247
+
248
+ def matching_characters_requires_substitution?(word, characters, position)
249
+ match_characters_position_followed_by_vowel?(word, characters, position) &&
250
+ substitution_required?(word, characters)
251
+ end
252
+
283
253
  def reduce_syllable
284
254
  @number_of_syllables -= 1
285
255
  end
@@ -1,3 +1,3 @@
1
1
  module IndonesianStemmer
2
- VERSION = "0.1.1"
2
+ VERSION = "0.2.0"
3
3
  end
@@ -68,6 +68,8 @@ describe IndonesianStemmer do
68
68
  describe "'me'" do
69
69
  it { should_stem 'merusak', 'rusak'}
70
70
  it { should_stem 'melayang', 'layang'}
71
+ it { should_stem 'memasak', 'masak'}
72
+ it { should_stem 'memandikan', 'mandi'}
71
73
  end
72
74
 
73
75
  describe "'peng'" do
@@ -263,6 +263,10 @@ describe IndonesianStemmer::MorphologicalUtility do
263
263
  should_transform(:remove_first_order_prefix, 'membangun', 'bangun')
264
264
  end
265
265
 
266
+ it "'mem' followed by vowel" do
267
+ should_transform(:remove_first_order_prefix, 'memilih', 'pilih')
268
+ end
269
+
266
270
  it "'me'" do
267
271
  should_transform(:remove_first_order_prefix, 'melukis', 'lukis')
268
272
  end
@@ -630,5 +634,15 @@ describe IndonesianStemmer::MorphologicalUtility do
630
634
  end
631
635
  end
632
636
  end
637
+
638
+ describe 'should not remove suffix characters for words ending with them' do
639
+ it "'kan'" do
640
+ should_not_transform(:remove_suffix, 'majikan')
641
+ end
642
+
643
+ it "'i'" do
644
+ should_not_transform(:remove_suffix, 'pandai')
645
+ end
646
+ end
633
647
  end
634
648
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: indonesian_stemmer
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 0.2.0
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2013-04-07 00:00:00.000000000 Z
12
+ date: 2013-04-24 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: rake
@@ -61,6 +61,18 @@ files:
61
61
  - Rakefile
62
62
  - indonesian_stemmer.gemspec
63
63
  - lib/indonesian_stemmer.rb
64
+ - lib/indonesian_stemmer/irregular_words.rb
65
+ - lib/indonesian_stemmer/irregular_words/akhiran-i.txt
66
+ - lib/indonesian_stemmer/irregular_words/k.txt
67
+ - lib/indonesian_stemmer/irregular_words/kah.txt
68
+ - lib/indonesian_stemmer/irregular_words/ku.txt
69
+ - lib/indonesian_stemmer/irregular_words/lah.txt
70
+ - lib/indonesian_stemmer/irregular_words/mu.txt
71
+ - lib/indonesian_stemmer/irregular_words/n.txt
72
+ - lib/indonesian_stemmer/irregular_words/nya.txt
73
+ - lib/indonesian_stemmer/irregular_words/p.txt
74
+ - lib/indonesian_stemmer/irregular_words/pun.txt
75
+ - lib/indonesian_stemmer/irregular_words/r.txt
64
76
  - lib/indonesian_stemmer/morphological_utility.rb
65
77
  - lib/indonesian_stemmer/stemmer_utility.rb
66
78
  - lib/indonesian_stemmer/version.rb
@@ -82,7 +94,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
82
94
  version: '0'
83
95
  segments:
84
96
  - 0
85
- hash: 2800268474079069831
97
+ hash: 550012699463393318
86
98
  required_rubygems_version: !ruby/object:Gem::Requirement
87
99
  none: false
88
100
  requirements:
@@ -91,7 +103,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
91
103
  version: '0'
92
104
  segments:
93
105
  - 0
94
- hash: 2800268474079069831
106
+ hash: 550012699463393318
95
107
  requirements: []
96
108
  rubyforge_project:
97
109
  rubygems_version: 1.8.25