indonesian_stemmer 0.0.1 → 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
data/README.md CHANGED
@@ -1,6 +1,13 @@
1
1
  # IndonesianStemmer
2
2
 
3
- TODO: Write a gem description
3
+ [![Gem Version](https://badge.fury.io/rb/indonesian_stemmer.png)](http://badge.fury.io/rb/indonesian_stemmer)
4
+ [![Build Status](https://secure.travis-ci.org/apraditya/indonesian_stemmer.png)](http://travis-ci.org/apraditya/indonesian_stemmer)
5
+ [![Dependency Status](https://gemnasium.com/apraditya/indonesian_stemmer.png)](https://gemnasium.com/apraditya/indonesian_stemmer)
6
+ [![Code Climate](https://codeclimate.com/github/apraditya/indonesian_stemmer.png)](https://codeclimate.com/github/apraditya/indonesian_stemmer)
7
+
8
+
9
+
10
+ Stems Indonesian words based on the Porter Stemmer, using the algorithm presented in [**A Study of Stemming Effects on Information Retrieval in Bahasa Indonesia**](http://www.illc.uva.nl/Publications/ResearchReports/MoL-2003-02.text.pdf), by Fadillah Z Tala.
4
11
 
5
12
  ## Installation
6
13
 
@@ -18,9 +25,28 @@ Or install it yourself as:
18
25
 
19
26
  ## Usage
20
27
 
21
- TODO: Write usage instructions here
28
+ require 'rubygems'
29
+ require 'indonesian_stemmer'
30
+
31
+ IndonesianStemmer.stem('mendengarkan') # => "dengar"
32
+ 'beriman'.stem # => "iman"
33
+
34
+ ## Known Problems
35
+ This gem is in active development; don't rely on it for your analysis or data-mining projects. Currently there are no known problems stemming Indonesian words. Please [submit a ticket](https://github.com/apraditya/indonesian_stemmer/issues/new) if you find one.
36
+
22
37
 
23
38
  ## Contributing
39
+ Initially, this gem is based on [Apache Lucene](http://lucene.apache.org/). Currently it's just a Ruby port of its analyzer for Indonesian. Its stemmer library only analyzes the word length, so some modifications were added in order to get the actual stemmed word. Feel free to download Lucene's source code under `analysis/common/src/java/org/apache/lucene/analysis/id/`.
40
+
41
+ ### References
42
+ Some references to help your contribution:
43
+
44
+ 1. [The Official Kamus Bahasa Indonesia](http://bahasa.kemdiknas.go.id/kbbi/index.php)
45
+ 2. To search Indonesian words and their roots, use the [Unofficial Kamus Besar Bahasa Indonesia](http://www.kamusbesar.com/)
46
+ 3. Wikipedia's [Prefiks dalam Bahasa Indonesia](http://id.wikipedia.org/wiki/Prefiks_dalam_bahasa_Indonesia)
47
+
48
+
49
+ ### Steps
24
50
 
25
51
  1. Fork it
26
52
  2. Create your feature branch (`git checkout -b my-new-feature`)
@@ -7,12 +7,65 @@ module IndonesianStemmer
7
7
  POSSESSIVE_PRONOUN_CHARACTERS = %w( ku mu nya )
8
8
  FIRST_ORDER_PREFIX_CHARACTERS = %w( meng meny men mem me
9
9
  peng peny pen pem di ter ke )
10
- SPECIAL_FIRST_ORDER_PREFIX_CHARACTERS = %w( meny peny pen )
10
+ SPECIAL_FIRST_ORDER_PREFIX_CHARACTERS = %w( meng peng meny peny men pen
11
+ mem pem )
11
12
  SECOND_ORDER_PREFIX_CHARACTERS = %w( ber be per pe )
12
- SPECIAL_SECOND_ORDER_PREFIX_CHARACTERS = %w( be )
13
13
  NON_SPECIAL_SECOND_ORDER_PREFIX_CHARACTERS = %w( ber per pe )
14
14
  SPECIAL_SECOND_ORDER_PREFIX_WORDS = %w( belajar pelajar belunjur )
15
15
  SUFFIX_CHARACTERS = %w( kan an i )
16
+ WITH_VOWEL_SUBSTITUTION_PREFIX_CHARACTERS = %w( meny peny men pen )
17
+
18
+ IRREGULARS_FOR_WORDS_BEGINS_WITH_K = %w(
19
+ aget alah andung ata ejar eluar embali empis emuka ena enal encang endali ering
20
+ erja erut etahui etik ibar irim uasai uliti umpul unci unjung unyah upas urang )
21
+
22
+ IRREGULARS_FOR_WORDS_BEGINS_WITH_P = %w(
23
+ adam ahat akai amer anas ancang anggang anggil anjat antul asang asti atuhi
24
+ ecah ecat elihara eluk ercik eriksa erintah esan ikir ilah ilih injam inta
25
+ isah otong otret uja uji ukul usat utar-balik utus )
26
+
27
+ IRREGULARS_FOR_WORDS_BEGINS_WITH_N = %w( aas ada adi afi afsu aif aik akal akoda
28
+ alar ama anti arasi asab asib asional atif asehat asihat atural etral ikah )
29
+
30
+ IRREGULARS_FOR_WORDS_BEGINS_WITH_R = %w( aba abak aban abas abat abet abit
31
+ abuk abun abung abut acak acau acik acuh acun adah adai adak adang adiasi
32
+ adikal adio adu aga agam agas agi agu aguk ahap ahasia ahat ahim ahmat aih
33
+ aja ajah ajalela ajam ajang ajin ajuk ajut akap akat akit aksi akuk akus
34
+ akut akyat alat alip amah amahtamah amah-tamah amai amal ambah ambai ambak
35
+ amban ambang ambat ambeh ambu ambut amin ampai ampak ampang ampas ampat
36
+ amping ampok ampung ampus amu amus anap anca ancah ancak ancang ancap
37
+ ancu ancung anda andai andak andat andau andek anduk andung angah angai
38
+ angak anggah asa usak )
39
+
40
+ IRREGULAR_PREFIX_CHARACTERS_ON_WORDS = {
41
+ 'meng' => IRREGULARS_FOR_WORDS_BEGINS_WITH_K,
42
+ 'peng' => IRREGULARS_FOR_WORDS_BEGINS_WITH_K,
43
+ 'mem' => IRREGULARS_FOR_WORDS_BEGINS_WITH_P,
44
+ 'pem' => IRREGULARS_FOR_WORDS_BEGINS_WITH_P, }
45
+
46
+ IRREGULAR_WORDS_ENDS_WITH_COMMON_CHARACTERS = {
47
+ 'kah' => %w( bengkah berkah bingkah bongkah cekah firkah halakah halkah
48
+ harakah ingkah jangkah jerkah kalah kekah kelakah kerakah kerkah
49
+ khalikah langkah lukah markah mukah musyarakah nafkah naskah
50
+ nikah pangkah rakah rekah rengkah sedekah sekah serakah serkah
51
+ sungkah takah tekah telingkah tingkah tongkah ),
52
+
53
+ 'lah' => %w( balah belah beslah bilah celah galah islah istilah jumlah
54
+ kalah kelah kilah lalah lelah makalah malah masalah
55
+ muamalah mujadalah mukabalah olah onslah oplah pecahbelah
56
+ pecah-belah pilah milah sekolah rihlah risalah salah serlah
57
+ silsilah sudah sulalah telah tulah ulah uzlah walah wasilah ),
58
+
59
+ 'pun' => %w( ampun depun himpun lapun rapun rumpun ),
60
+
61
+ 'ku' => %w( awabeku baku bangku beku beluku biku buku ceku ciku cuku deku
62
+ jibaku kaku laku leku liku luku paku pangku peku perilaku saku
63
+ siku suku teleku terungku tungku waluku ),
64
+
65
+ 'mu' => %w( ilmu jamu jemu kemu ramu selumu tamu temu ),
66
+
67
+ 'nya' => %w( tanya ),
68
+ }
16
69
 
17
70
  REMOVED_KE = 1
18
71
  REMOVED_PENG = 2
@@ -56,19 +109,10 @@ module IndonesianStemmer
56
109
  def remove_first_order_prefix(word)
57
110
  @number_of_syllables ||= total_syllables(word)
58
111
 
59
- word_size = word.size
60
- SPECIAL_FIRST_ORDER_PREFIX_CHARACTERS.each do |characters|
61
- characters_size = characters.size
62
- if starts_with?(word, word_size, characters) && word_size > characters_size && is_vowel?(word[characters_size])
63
- @flags ||= collection_for(characters, 'removed')
64
- reduce_syllable
65
- word = substitute_word_character(word, characters)
66
- slice_word_at_position( word,
67
- characters_size-1,
68
- :start )
69
- return word
70
- end
71
- end
112
+ previous_word = word.dup
113
+ remove_and_substitute_characters_matching_collection(
114
+ word, collection_for(:special_first_order_prefix), :start )
115
+ return word if previous_word != word
72
116
 
73
117
  remove_characters_matching_collection( word,
74
118
  collection_for(:first_order_prefix),
@@ -149,10 +193,12 @@ module IndonesianStemmer
149
193
  def remove_characters_matching_collection(word, collection, position)
150
194
  collection.each do |characters|
151
195
  if send("#{position}s_with?", word, word.size, characters)
152
- @flags ||= collection_for(characters, 'removed')
153
- reduce_syllable
154
- slice_word_at_position(word, characters.size, position)
155
- return word
196
+ unless ambiguous_with_characters?(word, characters, position)
197
+ @flags ||= collection_for(characters, 'removed')
198
+ reduce_syllable
199
+ slice_word_at_position(word, characters.size, position)
200
+ return word
201
+ end
156
202
  end
157
203
  end
158
204
 
@@ -164,17 +210,76 @@ module IndonesianStemmer
164
210
  word.slice!( multiplier*characters_size, characters_size)
165
211
  end
166
212
 
213
+ def remove_and_substitute_characters_matching_collection(word, collection, position)
214
+ word_size = word.size
215
+ collection.each do |characters|
216
+ characters_size = characters.size
217
+ if send("#{position}s_with?", word, word_size, characters) &&
218
+ word_size > characters_size && is_vowel?(word[characters_size])
219
+
220
+ if WITH_VOWEL_SUBSTITUTION_PREFIX_CHARACTERS.include?(characters) ||
221
+ contains_irregular_prefix?(word, characters)
222
+
223
+ @flags ||= collection_for(characters, 'removed')
224
+ reduce_syllable
225
+ word = substitute_word_character(word, characters)
226
+ slice_word_at_position( word,
227
+ characters_size-1,
228
+ :start )
229
+ return word
230
+ end
231
+ end
232
+ end
233
+ end
234
+
235
+ def contains_irregular_prefix?(word, characters)
236
+ if IRREGULAR_PREFIX_CHARACTERS_ON_WORDS.keys.include?(characters)
237
+ chopped_word_match_words_collection?(
238
+ word[characters.size, word.size],
239
+ IRREGULAR_PREFIX_CHARACTERS_ON_WORDS[characters] )
240
+ end
241
+ end
242
+
243
+ def chopped_word_match_words_collection?(chopped_word, collection)
244
+ collection.any? { |w| starts_with?(chopped_word, chopped_word.size, w) }
245
+ end
246
+
167
247
  def substitute_word_character(word, characters)
168
248
  substitute_char = case
169
249
  when %w(meny peny).include?(characters)
170
250
  's'
171
- when characters == 'pen'
172
- 't'
251
+ when %w(men pen).include?(characters)
252
+ (chopped_word_match_words_collection?(
253
+ word[characters.size, word.size], IRREGULARS_FOR_WORDS_BEGINS_WITH_N
254
+ )
255
+ )? 'n' : 't'
256
+ when %w(meng peng).include?(characters)
257
+ 'k'
258
+ when %w(mem pem).include?(characters)
259
+ 'p'
173
260
  end
174
261
  word[characters.size-1] = substitute_char if substitute_char
175
262
  word
176
263
  end
177
264
 
265
+ def ambiguous_with_characters?(word, characters, position)
266
+ if position == :start
267
+ if characters == 'per'
268
+ chopped_word_match_words_collection?(word[3..-1],
269
+ IRREGULARS_FOR_WORDS_BEGINS_WITH_R )
270
+ else
271
+ return false
272
+ end
273
+ else
274
+ IRREGULAR_WORDS_ENDS_WITH_COMMON_CHARACTERS[characters].any? do |ambiguous_word|
275
+ # To differentiate 'mobilmu' with 'berilmu'
276
+ return false unless %w(me be pe).include?(word[0,2])
277
+ # The rest is ok
278
+ ends_with?(word, word.size, ambiguous_word)
279
+ end
280
+ end
281
+ end
282
+
178
283
  def reduce_syllable
179
284
  @number_of_syllables -= 1
180
285
  end
@@ -1,3 +1,3 @@
1
1
  module IndonesianStemmer
2
- VERSION = "0.0.1"
2
+ VERSION = "0.1.0"
3
3
  end
@@ -581,6 +581,10 @@ describe IndonesianStemmer::MorphologicalUtility do
581
581
 
582
582
  describe '#remove_suffix' do
583
583
  describe "words with these suffix characters" do
584
+ before do
585
+ IndonesianStemmer.instance_variable_set("@flags", 0)
586
+ end
587
+
584
588
  describe "at the end of the word, should remove the suffix characters" do
585
589
  it "'kan'" do
586
590
  should_transform(:remove_suffix, 'katakan', 'kata')
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: indonesian_stemmer
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.1.0
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2013-03-30 00:00:00.000000000 Z
12
+ date: 2013-04-04 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: rake
@@ -82,7 +82,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
82
82
  version: '0'
83
83
  segments:
84
84
  - 0
85
- hash: 1838075541569491639
85
+ hash: -3033082141051403298
86
86
  required_rubygems_version: !ruby/object:Gem::Requirement
87
87
  none: false
88
88
  requirements:
@@ -91,7 +91,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
91
91
  version: '0'
92
92
  segments:
93
93
  - 0
94
- hash: 1838075541569491639
94
+ hash: -3033082141051403298
95
95
  requirements: []
96
96
  rubyforge_project:
97
97
  rubygems_version: 1.8.25