indonesian_stemmer 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,17 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
data/.rvmrc ADDED
@@ -0,0 +1,48 @@
1
#!/usr/bin/env bash

# RVM project file (.rvmrc). It is sourced when the shell enters this
# directory and selects the Ruby environment named in environment_id.

# Desired <ruby>[@<gemset>]; the @gemset part is optional.
# Only the full ruby name is supported here. For short names use:
#     echo "rvm use 1.9.3" > .rvmrc
environment_id="ruby-1.9.3-p392@indonesian_stemmer"

# Uncomment the following lines if you want to verify rvm version per project
# rvmrc_rvm_version="1.18.19 (stable)" # 1.10.1 seems as a safe start
# eval "$(echo ${rvm_version}.${rvmrc_rvm_version} | awk -F. '{print "[[ "$1*65536+$2*256+$3" -ge "$4*65536+$5*256+$6" ]]"}' )" || {
#   echo "This .rvmrc file requires at least RVM ${rvmrc_rvm_version}, aborting loading."
#   return 1
# }

# Fast path: when the environment file already exists, source it directly
# instead of going through the whole RVM CLI and selector. If you want
# feedback on which environment was used, insert the word 'use' after
# --create below, as this triggers verbose mode.
if [[ ! -d "${rvm_path:-$HOME/.rvm}/environments" ]] ||
   [[ ! -s "${rvm_path:-$HOME/.rvm}/environments/$environment_id" ]]
then
  # The environment file has not been created yet: let the RVM CLI select it.
  if ! rvm --create "$environment_id"
  then
    echo "Failed to create RVM environment '${environment_id}'."
    return 1
  fi
else
  \. "${rvm_path:-$HOME/.rvm}/environments/$environment_id"
  # Run the optional after_use hook; a failing hook must not abort loading.
  if [[ -s "${rvm_path:-$HOME/.rvm}/hooks/after_use" ]]
  then
    \. "${rvm_path:-$HOME/.rvm}/hooks/after_use" || true
  fi
fi

# If you use bundler, this might be useful to you:
# if [[ -s Gemfile ]] && {
#   ! builtin command -v bundle >/dev/null ||
#   builtin command -v bundle | GREP_OPTIONS= \grep $rvm_path/bin/bundle >/dev/null
# }
# then
#   printf "%b" "The rubygem 'bundler' is not installed. Installing it now.\n"
#   gem install bundler
# fi
# if [[ -s Gemfile ]] && builtin command -v bundle >/dev/null
# then
#   bundle install | GREP_OPTIONS= \grep -vE '^Using|Your bundle is complete'
# fi
data/Gemfile ADDED
@@ -0,0 +1,21 @@
1
source 'https://rubygems.org'

# Runtime and development dependencies declared in
# indonesian_stemmer.gemspec are pulled in from there.
gemspec

# Dependencies that are still in development go here rather than in the
# gemspec. Move them to the gemspec before releasing to rubygems.org.

# Tooling used only while developing the gem.
group :development do
  %w(guard guard-rspec).each { |tool| gem tool }
  gem 'pry', require: true
  gem 'pry-debugger'

  # Notification and file-watching helpers that are only declared when the
  # host OS string contains "darwin".
  on_darwin = RbConfig::CONFIG['host_os'] =~ /darwin/
  if on_darwin
    gem 'growl'
    gem 'rb-fsevent', require: false
  end
end
@@ -0,0 +1,9 @@
1
+ # A sample Guardfile
2
+ # More info at https://github.com/guard/guard#readme
3
+
4
guard 'rspec' do
  # A changed spec file maps to itself.
  watch(%r{^spec/.+_spec\.rb$})

  # A changed library file maps to the spec with the same path under spec/lib.
  watch(%r{^lib/(.+)\.rb$}) do |match|
    "spec/lib/#{match[1]}_spec.rb"
  end

  # A changed spec helper maps to the whole spec directory.
  watch('spec/spec_helper.rb') do
    "spec"
  end
end
9
+
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2013 Adinda Praditya
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,29 @@
1
+ # IndonesianStemmer
2
+
3
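+ Porter Stemmer for Bahasa Indonesia. Stems Indonesian words based on Porter Stemmer, with the algorithm presented in "A Study of Stemming Effects on Information Retrieval in Bahasa Indonesia", Fadillah Z Tala.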
4
+
5
+ ## Installation
6
+
7
+ Add this line to your application's Gemfile:
8
+
9
+ gem 'indonesian_stemmer'
10
+
11
+ And then execute:
12
+
13
+ $ bundle
14
+
15
+ Or install it yourself as:
16
+
17
+ $ gem install indonesian_stemmer
18
+
19
+ ## Usage
20
+
21
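+ Require the gem with `require 'indonesian_stemmer'`, then call `IndonesianStemmer.stem(word)` or the `String#stem` shortcut; for example, `'mobilmu'.stem` returns `"mobil"`. Pass `false` as the second argument of `IndonesianStemmer.stem` to skip derivational (prefix and suffix) stemming.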
22
+
23
+ ## Contributing
24
+
25
+ 1. Fork it
26
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
27
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
28
+ 4. Push to the branch (`git push origin my-new-feature`)
29
+ 5. Create new Pull Request
@@ -0,0 +1,7 @@
1
+ require "bundler/gem_tasks"
2
+ require "rspec/core/rake_task"
3
+
4
# Defines the RSpec rake task with its default settings.
RSpec::Core::RakeTask.new

# Both `rake` (the default task) and `rake test` depend on the :spec task.
[:default, :test].each do |task_name|
  task task_name => :spec
end
@@ -0,0 +1,23 @@
1
+ # -*- encoding: utf-8 -*-
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'indonesian_stemmer/version'
5
+
6
# Gem metadata for indonesian_stemmer.
Gem::Specification.new do |gem|
  gem.name          = "indonesian_stemmer"
  gem.version       = IndonesianStemmer::VERSION
  gem.authors       = ["Adinda Praditya"]
  gem.email         = ["apraditya@gmail.com"]
  gem.description   = %q{Stems Indonesian words based on Porter Stemmer, with the algorithm presented in "A Study of Stemming Effects on Information Retrieval in Bahasa Indonesia", Fadillah Z Tala.}
  gem.summary       = %q{Porter Stemmer for Bahasa Indonesia.}
  gem.homepage      = "https://github.com/apraditya/indonesian_stemmer"
  # The package ships an MIT LICENSE file; declare it so the license is
  # visible in the gem metadata as well.
  gem.license       = "MIT"

  # Package every file tracked by git; executables and test files are
  # derived from that list by path prefix.
  gem.files         = `git ls-files`.split($/)
  gem.executables   = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
  gem.test_files    = gem.files.grep(%r{^(test|spec)/})
  gem.require_paths = ["lib"]

  gem.add_development_dependency 'rake'
  gem.add_development_dependency 'rspec'
end
@@ -0,0 +1,51 @@
1
+ require "indonesian_stemmer/version"
2
+ require "indonesian_stemmer/morphological_utility"
3
+
4
# Entry point of the stemmer. All methods live on the module itself
# (IndonesianStemmer.stem); stemming state is kept in module-level instance
# variables that are reset on every call to +stem+.
module IndonesianStemmer

  class << self
    include MorphologicalUtility

    # Syllable count of the word currently being stemmed; the affix-removal
    # helpers decrement it as affixes are stripped.
    attr_accessor :number_of_syllables

    # Returns the stem of +word+.
    #
    # word                  - the String to stem. It is not modified: the
    #                         affix-removal helpers edit their argument in
    #                         place, so a private copy is stemmed instead.
    #                         This also makes frozen strings acceptable.
    # derivational_stemming - when true (the default), prefixes and suffixes
    #                         are stripped after particles and possessive
    #                         pronouns; when false only the latter two are
    #                         removed.
    #
    # Each removal step is attempted only while more than two syllables
    # remain.
    def stem(word, derivational_stemming = true)
      word = word.dup
      @flags = 0
      @number_of_syllables = total_syllables word

      remove_particle(word) if still_has_many_syllables?
      remove_possessive_pronoun(word) if still_has_many_syllables?

      stem_derivational(word) if derivational_stemming

      word
    end


    private
      # Strips prefixes and suffixes from +word+ in place.
      #
      # The order depends on what was actually removed, detected by comparing
      # the word size before and after each step:
      # * first-order prefix removed -> try the suffix, and only if a suffix
      #   was removed as well, try a second-order prefix;
      # * no first-order prefix      -> try a second-order prefix, then the
      #   suffix.
      def stem_derivational(word)
        previous_size = word.size
        remove_first_order_prefix(word) if still_has_many_syllables?
        if previous_size != word.size
          previous_size = word.size
          remove_suffix(word) if still_has_many_syllables?

          if previous_size != word.size
            remove_second_order_prefix(word) if still_has_many_syllables?
          end
        else
          remove_second_order_prefix(word) if still_has_many_syllables?
          remove_suffix(word) if still_has_many_syllables?
        end
      end

      # True while the word being stemmed still has more than two syllables.
      def still_has_many_syllables?
        @number_of_syllables > 2
      end
  end
end
46
+
47
# Core extension that exposes the stemmer directly on strings.
class String
  # Delegates to IndonesianStemmer.stem, passing the receiver as the word,
  # and returns whatever that call returns.
  def stem
    IndonesianStemmer.stem self
  end
end
@@ -0,0 +1,183 @@
1
+ require "indonesian_stemmer/stemmer_utility"
2
+
3
module IndonesianStemmer

  # Affix tables. Within each table the order is the order in which the
  # entries are tried. Frozen so the shared tables cannot be mutated by
  # accident.
  VOWEL_CHARACTERS = %w( a e i o u ).freeze
  PARTICLE_CHARACTERS = %w( kah lah pun ).freeze
  POSSESSIVE_PRONOUN_CHARACTERS = %w( ku mu nya ).freeze
  FIRST_ORDER_PREFIX_CHARACTERS = %w( meng meny men mem me
                                      peng peny pen pem di ter ke ).freeze
  SPECIAL_FIRST_ORDER_PREFIX_CHARACTERS = %w( meny peny pen ).freeze
  SECOND_ORDER_PREFIX_CHARACTERS = %w( ber be per pe ).freeze
  SPECIAL_SECOND_ORDER_PREFIX_CHARACTERS = %w( be ).freeze
  NON_SPECIAL_SECOND_ORDER_PREFIX_CHARACTERS = %w( ber per pe ).freeze
  SPECIAL_SECOND_ORDER_PREFIX_WORDS = %w( belajar pelajar belunjur ).freeze
  SUFFIX_CHARACTERS = %w( kan an i ).freeze

  # Bit flags remembering which prefix family has been removed from the
  # current word. They are OR-ed into @flags and consulted by remove_suffix.
  REMOVED_KE    = 1
  REMOVED_PENG  = 2
  REMOVED_DI    = 4
  REMOVED_MENG  = 8
  REMOVED_TER   = 16
  REMOVED_BER   = 32
  REMOVED_PE    = 64


  # Affix-removal steps of the stemmer. Every remove_* method edits the given
  # word in place and returns it. The methods share two instance variables on
  # the including object: @number_of_syllables (remaining syllable count) and
  # @flags (bitmask of REMOVED_* constants).
  module MorphologicalUtility
    include StemmerUtility

    # Including MorphologicalUtility also mixes InstanceMethods into the
    # receiver.
    def self.included(receiver)
      receiver.send :include, InstanceMethods
    end

    module InstanceMethods
      # Returns the number of vowel characters in +word+; the stemmer uses
      # this count as the number of syllables.
      def total_syllables(word)
        word.each_char.count { |character| is_vowel?(character) }
      end

      # Removes a trailing particle (kah, lah, pun) from +word+, if present.
      def remove_particle(word)
        @number_of_syllables ||= total_syllables(word)
        remove_characters_matching_collection(word,
                                              collection_for(:particle),
                                              :end )
      end

      # Removes a trailing possessive pronoun (ku, mu, nya) from +word+, if
      # present.
      def remove_possessive_pronoun(word)
        @number_of_syllables ||= total_syllables(word)
        remove_characters_matching_collection(word,
                                              collection_for(:possessive_pronoun),
                                              :end )
      end

      # Removes a first-order prefix from +word+, if present.
      #
      # The prefixes meny, peny and pen followed by a vowel are handled
      # first: their last character is replaced (by "s", "s" and "t"
      # respectively) and only the characters before it are cut off, so that
      # e.g. "menyapu" becomes "sapu". All other cases fall through to a plain
      # removal of the first matching entry of FIRST_ORDER_PREFIX_CHARACTERS.
      def remove_first_order_prefix(word)
        @number_of_syllables ||= total_syllables(word)

        word_size = word.size
        SPECIAL_FIRST_ORDER_PREFIX_CHARACTERS.each do |characters|
          characters_size = characters.size
          next unless starts_with?(word, word_size, characters) &&
                      word_size > characters_size &&
                      is_vowel?(word[characters_size])

          record_removal(characters)
          reduce_syllable
          substitute_word_character(word, characters)
          slice_word_at_position(word, characters_size - 1, :start)
          return word
        end

        remove_characters_matching_collection( word,
                                               collection_for(:first_order_prefix),
                                               :start )
      end

      # Removes a second-order prefix from +word+, if present.
      #
      # Tried in this order:
      # 1. the whole-word exceptions in SPECIAL_SECOND_ORDER_PREFIX_WORDS,
      #    which lose their first three characters;
      # 2. "be" followed by a consonant and "er", which loses "be";
      # 3. a plain ber / per / pe prefix.
      def remove_second_order_prefix(word)
        @number_of_syllables ||= total_syllables(word)
        word_size = word.size

        if SPECIAL_SECOND_ORDER_PREFIX_WORDS.include?(word)
          add_flag(REMOVED_BER) if word[0..1] == 'be'
          reduce_syllable
          slice_word_at_position(word, 3, :start)
          return word
        end

        if starts_with?(word, word_size, 'be') && word_size > 4 &&
           !is_vowel?(word[2]) && word[3..4] == 'er'
          add_flag(REMOVED_BER)
          reduce_syllable
          slice_word_at_position(word, 2, :start)
          return word
        end

        remove_characters_matching_collection(word,
                                              collection_for(:non_special_second_order_prefix),
                                              :start)
      end

      # Removes the first applicable suffix (kan, an, i) from +word+.
      #
      # A suffix is skipped when one of the prefix families listed for it
      # below has been recorded in @flags.
      def remove_suffix(word)
        @number_of_syllables ||= total_syllables(word)
        # Without a prior prefix step no flag is set; treat that as "nothing
        # removed" rather than letting a nil mask block every suffix.
        @flags ||= 0

        SUFFIX_CHARACTERS.each do |character|
          constants_to_check = case character
          when 'kan'
            [REMOVED_KE, REMOVED_PENG, REMOVED_PE]
          when 'an'
            [REMOVED_DI, REMOVED_MENG, REMOVED_TER]
          when 'i'
            [REMOVED_BER, REMOVED_KE, REMOVED_PENG]
          end

          next unless ends_with?(word, word.size, character)
          next unless constants_to_check.all? { |c| (@flags & c) == 0 }

          reduce_syllable
          slice_word_at_position(word, character.size, :end)
          return word
        end

        word
      end


      private
        # True when +character+ is one of VOWEL_CHARACTERS.
        def is_vowel?(character)
          VOWEL_CHARACTERS.include? character
        end

        # Looks up a constant of IndonesianStemmer by convention.
        #
        # With the default type, returns <NAME>_CHARACTERS (an affix table).
        # With any other type (in practice 'removed'), returns <TYPE>_<NAME>
        # after folding the meng- and peng- prefix variants onto 'meng' and
        # 'peng', e.g. ('mem', 'removed') -> REMOVED_MENG.
        #
        # Returns nil when no such constant exists. The lookup goes through
        # IndonesianStemmer explicitly so that it works for any object that
        # mixes these methods in, not only for Module receivers.
        def collection_for(name, type = 'characters')
          constant_name = if type == 'characters'
            "#{name}_#{type}"
          else
            name = case
            when %w(meny men mem me).include?(name)
              'meng'
            when %w(peny pen pem).include?(name)
              'peng'
            else
              name
            end
            "#{type}_#{name}"
          end
          IndonesianStemmer.const_get(constant_name.upcase.to_sym)
        rescue NameError
          nil
        end

        # Removes the first entry of +collection+ found at +position+
        # (:start or :end) of +word+, records its flag (if it has one) and
        # decrements the syllable count. Returns +word+ either way.
        def remove_characters_matching_collection(word, collection, position)
          collection.each do |characters|
            # Dispatches to starts_with? or ends_with? based on +position+.
            next unless send("#{position}s_with?", word, word.size, characters)

            record_removal(characters)
            reduce_syllable
            slice_word_at_position(word, characters.size, position)
            return word
          end

          word
        end

        # Deletes +characters_size+ characters from the start or the end of
        # +word+ in place.
        def slice_word_at_position(word, characters_size, position)
          first_index = (position == :start) ? 0 : -characters_size
          word.slice!(first_index, characters_size)
        end

        # Overwrites, in place, the character of +word+ at the last index of
        # the prefix +characters+: "s" for meny/peny, "t" for pen. Other
        # prefixes leave +word+ unchanged. Returns +word+.
        def substitute_word_character(word, characters)
          substitute_char = case
          when %w(meny peny).include?(characters)
            's'
          when characters == 'pen'
            't'
          end
          word[characters.size-1] = substitute_char if substitute_char
          word
        end

        # Accounts for one removed affix in the remaining syllable count.
        def reduce_syllable
          @number_of_syllables -= 1
        end

        # Records the REMOVED_* flag that belongs to the affix +characters+;
        # affixes without a flag (e.g. particles) record nothing.
        def record_removal(characters)
          add_flag(collection_for(characters, 'removed'))
        end

        # ORs +flag+ into the @flags bitmask. A nil mask or nil flag counts
        # as 0. A bitwise OR is required here: `||=` would never assign once
        # @flags holds 0, because 0 is truthy in Ruby.
        def add_flag(flag)
          @flags = (@flags || 0) | (flag || 0)
        end
    end
  end
end
@@ -0,0 +1,27 @@
1
module IndonesianStemmer
  # Character-by-character prefix and suffix checks.
  module StemmerUtility

    # Including StemmerUtility also mixes InstanceMethods into the receiver.
    def self.included(receiver)
      receiver.send :include, InstanceMethods
    end

    module InstanceMethods
      # True when +text+ begins with +prefix+. Always false when +prefix+ is
      # longer than +by_text_size+.
      def starts_with?(text, by_text_size, prefix)
        return false unless prefix.size <= by_text_size

        prefix.each_char.with_index.all? do |character, index|
          text[index] == character
        end
      end

      # True when +text+ ends with +suffix+. Always false when +suffix+ is
      # longer than +by_text_size+.
      def ends_with?(text, by_text_size, suffix)
        length = suffix.size
        return false unless length <= by_text_size

        # Negative indices walk the last +length+ characters of +text+.
        suffix.each_char.with_index.all? do |character, index|
          text[index - length] == character
        end
      end
    end
  end
end