indonesian_stemmer 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +17 -0
- data/.rvmrc +48 -0
- data/Gemfile +21 -0
- data/Guardfile +9 -0
- data/LICENSE.txt +22 -0
- data/README.md +29 -0
- data/Rakefile +7 -0
- data/indonesian_stemmer.gemspec +23 -0
- data/lib/indonesian_stemmer.rb +51 -0
- data/lib/indonesian_stemmer/morphological_utility.rb +183 -0
- data/lib/indonesian_stemmer/stemmer_utility.rb +27 -0
- data/lib/indonesian_stemmer/version.rb +3 -0
- data/spec/indonesian_stemmer/indonesian_stemmer_spec.rb +145 -0
- data/spec/indonesian_stemmer/morphological_utility_spec.rb +630 -0
- data/spec/indonesian_stemmer/stemmer_utility_spec.rb +59 -0
- data/spec/spec_helper.rb +47 -0
- metadata +105 -0
data/.gitignore
ADDED
data/.rvmrc
ADDED
@@ -0,0 +1,48 @@
|
|
1
|
+
#!/usr/bin/env bash
|
2
|
+
|
3
|
+
# This is an RVM Project .rvmrc file, used to automatically load the ruby
|
4
|
+
# development environment upon cd'ing into the directory
|
5
|
+
|
6
|
+
# First we specify our desired <ruby>[@<gemset>], the @gemset name is optional,
|
7
|
+
# Only full ruby name is supported here, for short names use:
|
8
|
+
# echo "rvm use 1.9.3" > .rvmrc
|
9
|
+
environment_id="ruby-1.9.3-p392@indonesian_stemmer"
|
10
|
+
|
11
|
+
# Uncomment the following lines if you want to verify rvm version per project
|
12
|
+
# rvmrc_rvm_version="1.18.19 (stable)" # 1.10.1 seems as a safe start
|
13
|
+
# eval "$(echo ${rvm_version}.${rvmrc_rvm_version} | awk -F. '{print "[[ "$1*65536+$2*256+$3" -ge "$4*65536+$5*256+$6" ]]"}' )" || {
|
14
|
+
# echo "This .rvmrc file requires at least RVM ${rvmrc_rvm_version}, aborting loading."
|
15
|
+
# return 1
|
16
|
+
# }
|
17
|
+
|
18
|
+
# First we attempt to load the desired environment directly from the environment
|
19
|
+
# file. This is very fast and efficient compared to running through the entire
|
20
|
+
# CLI and selector. If you want feedback on which environment was used then
|
21
|
+
# insert the word 'use' after --create as this triggers verbose mode.
|
22
|
+
if [[ -d "${rvm_path:-$HOME/.rvm}/environments"
|
23
|
+
&& -s "${rvm_path:-$HOME/.rvm}/environments/$environment_id" ]]
|
24
|
+
then
|
25
|
+
\. "${rvm_path:-$HOME/.rvm}/environments/$environment_id"
|
26
|
+
[[ -s "${rvm_path:-$HOME/.rvm}/hooks/after_use" ]] &&
|
27
|
+
\. "${rvm_path:-$HOME/.rvm}/hooks/after_use" || true
|
28
|
+
else
|
29
|
+
# If the environment file has not yet been created, use the RVM CLI to select.
|
30
|
+
rvm --create "$environment_id" || {
|
31
|
+
echo "Failed to create RVM environment '${environment_id}'."
|
32
|
+
return 1
|
33
|
+
}
|
34
|
+
fi
|
35
|
+
|
36
|
+
# If you use bundler, this might be useful to you:
|
37
|
+
# if [[ -s Gemfile ]] && {
|
38
|
+
# ! builtin command -v bundle >/dev/null ||
|
39
|
+
# builtin command -v bundle | GREP_OPTIONS= \grep $rvm_path/bin/bundle >/dev/null
|
40
|
+
# }
|
41
|
+
# then
|
42
|
+
# printf "%b" "The rubygem 'bundler' is not installed. Installing it now.\n"
|
43
|
+
# gem install bundler
|
44
|
+
# fi
|
45
|
+
# if [[ -s Gemfile ]] && builtin command -v bundle >/dev/null
|
46
|
+
# then
|
47
|
+
# bundle install | GREP_OPTIONS= \grep -vE '^Using|Your bundle is complete'
|
48
|
+
# fi
|
data/Gemfile
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
source 'https://rubygems.org'
|
2
|
+
|
3
|
+
# Specify your gem's dependencies in indonesian_stemmer.gemspec
|
4
|
+
gemspec
|
5
|
+
|
6
|
+
# Declare any dependencies that are still in development here instead of in
|
7
|
+
# your gemspec. Remember to move these dependencies to your gemspec before
|
8
|
+
# releasing your gem to rubygems.org.
|
9
|
+
|
10
|
+
# To use in development
|
11
|
+
group :development do
|
12
|
+
gem 'guard'
|
13
|
+
gem 'guard-rspec'
|
14
|
+
gem 'pry', require: true
|
15
|
+
gem 'pry-debugger'
|
16
|
+
|
17
|
+
if RbConfig::CONFIG['host_os'] =~ /darwin/
|
18
|
+
gem 'growl'
|
19
|
+
gem 'rb-fsevent', require: false
|
20
|
+
end
|
21
|
+
end
|
data/Guardfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2013 Adinda Praditya
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
# IndonesianStemmer
|
2
|
+
|
3
|
+
TODO: Write a gem description
|
4
|
+
|
5
|
+
## Installation
|
6
|
+
|
7
|
+
Add this line to your application's Gemfile:
|
8
|
+
|
9
|
+
gem 'indonesian_stemmer'
|
10
|
+
|
11
|
+
And then execute:
|
12
|
+
|
13
|
+
$ bundle
|
14
|
+
|
15
|
+
Or install it yourself as:
|
16
|
+
|
17
|
+
$ gem install indonesian_stemmer
|
18
|
+
|
19
|
+
## Usage
|
20
|
+
|
21
|
+
TODO: Write usage instructions here
|
22
|
+
|
23
|
+
## Contributing
|
24
|
+
|
25
|
+
1. Fork it
|
26
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
27
|
+
3. Commit your changes (`git commit -am 'Add some feature'`)
|
28
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
29
|
+
5. Create new Pull Request
|
data/Rakefile
ADDED
data/indonesian_stemmer.gemspec
ADDED
@@ -0,0 +1,23 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
lib = File.expand_path('../lib', __FILE__)
|
3
|
+
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
|
+
require 'indonesian_stemmer/version'
|
5
|
+
|
6
|
+
Gem::Specification.new do |gem|
|
7
|
+
gem.name = "indonesian_stemmer"
|
8
|
+
gem.version = IndonesianStemmer::VERSION
|
9
|
+
gem.authors = ["Adinda Praditya"]
|
10
|
+
gem.email = ["apraditya@gmail.com"]
|
11
|
+
gem.description = %q{Stems Indonesian words based on Porter Stemmer, with the algorithm presented in "A Study of Stemming Effects on Information Retrieval in Bahasa Indonesia", Fadillah Z Tala.}
|
12
|
+
gem.summary = %q{Porter Stemmer for Bahasa Indonesia.}
|
13
|
+
gem.homepage = "https://github.com/apraditya/indonesian_stemmer"
|
14
|
+
|
15
|
+
gem.files = `git ls-files`.split($/)
|
16
|
+
gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
|
17
|
+
gem.test_files = gem.files.grep(%r{^(test|spec)/})
|
18
|
+
gem.require_paths = ["lib"]
|
19
|
+
|
20
|
+
gem.add_development_dependency 'rake'
|
21
|
+
gem.add_development_dependency 'rspec'
|
22
|
+
|
23
|
+
end
|
data/lib/indonesian_stemmer.rb
ADDED
@@ -0,0 +1,51 @@
|
|
1
|
+
require "indonesian_stemmer/version"
|
2
|
+
require "indonesian_stemmer/morphological_utility"
|
3
|
+
|
4
|
+
# Porter-style stemmer for Bahasa Indonesia.
#
# The public entry point is IndonesianStemmer.stem; the affix-removal helpers
# come from MorphologicalUtility, which is mixed into the module's singleton
# class. Working state (@flags, @number_of_syllables) is kept in module-level
# instance variables and is reset at the start of every #stem call.
module IndonesianStemmer

  class << self
    include MorphologicalUtility

    # Vowel count of the word currently being stemmed; the removal helpers
    # decrement it each time they strip an affix.
    attr_accessor :number_of_syllables

    # Stems +word+ and returns the result as a new String.
    #
    # word                  - the String to stem; it is left untouched.
    # derivational_stemming - when true (the default) prefixes and suffixes
    #                         are stripped after particles and possessive
    #                         pronouns; when false only those two are removed.
    #
    # Each removal step runs only while more than two syllables remain.
    def stem(word, derivational_stemming = true)
      # The removal helpers slice their argument in place. Operating on a
      # copy keeps the caller's string intact and lets frozen strings be
      # stemmed instead of raising.
      word = word.dup

      @flags = 0
      @number_of_syllables = total_syllables word

      remove_particle(word) if still_has_many_syllables?
      remove_possessive_pronoun(word) if still_has_many_syllables?

      stem_derivational(word) if derivational_stemming

      word
    end


    private

    # Strips derivational affixes from +word+ in place.
    #
    # When a first-order prefix was removed, a suffix is tried next and the
    # second-order prefix only if that suffix was removed as well. When no
    # first-order prefix matched, the second-order prefix is tried first and
    # the suffix afterwards.
    def stem_derivational(word)
      previous_size = word.size
      remove_first_order_prefix(word) if still_has_many_syllables?

      if previous_size != word.size
        previous_size = word.size
        remove_suffix(word) if still_has_many_syllables?

        if previous_size != word.size
          remove_second_order_prefix(word) if still_has_many_syllables?
        end
      else
        remove_second_order_prefix(word) if still_has_many_syllables?
        remove_suffix(word) if still_has_many_syllables?
      end
    end

    # True while the word being stemmed still has more than two syllables.
    def still_has_many_syllables?
      @number_of_syllables > 2
    end
  end
end
|
46
|
+
|
47
|
+
# Core extension that lets any String be stemmed directly: "bukukah".stem
class String
  # Returns the Indonesian stem of this string as a new String.
  #
  # A copy is handed to the stemmer because its helpers slice their argument
  # in place; passing self would let this non-bang method modify the receiver
  # and raise on frozen strings.
  def stem
    IndonesianStemmer.stem(dup)
  end
end
|
data/lib/indonesian_stemmer/morphological_utility.rb
ADDED
@@ -0,0 +1,183 @@
|
|
1
|
+
require "indonesian_stemmer/stemmer_utility"
|
2
|
+
|
3
|
+
module IndonesianStemmer

  # Affix tables. Order matters: each table is scanned front to back and the
  # first matching entry wins.
  VOWEL_CHARACTERS = %w( a e i o u )
  PARTICLE_CHARACTERS = %w( kah lah pun )
  POSSESSIVE_PRONOUN_CHARACTERS = %w( ku mu nya )
  FIRST_ORDER_PREFIX_CHARACTERS = %w( meng meny men mem me
                                      peng peny pen pem di ter ke )
  SPECIAL_FIRST_ORDER_PREFIX_CHARACTERS = %w( meny peny pen )
  SECOND_ORDER_PREFIX_CHARACTERS = %w( ber be per pe )
  SPECIAL_SECOND_ORDER_PREFIX_CHARACTERS = %w( be )
  NON_SPECIAL_SECOND_ORDER_PREFIX_CHARACTERS = %w( ber per pe )
  SPECIAL_SECOND_ORDER_PREFIX_WORDS = %w( belajar pelajar belunjur )
  SUFFIX_CHARACTERS = %w( kan an i )

  # Bit flags recording which prefix has been removed from the current word.
  # They are OR-ed into @flags and consulted by #remove_suffix.
  REMOVED_KE    = 1
  REMOVED_PENG  = 2
  REMOVED_DI    = 4
  REMOVED_MENG  = 8
  REMOVED_TER   = 16
  REMOVED_BER   = 32
  REMOVED_PE    = 64


  # Affix-removal steps of the stemmer. Every public method slices +word+ in
  # place, returns it, and decrements @number_of_syllables when it removes
  # something.
  module MorphologicalUtility
    include StemmerUtility

    def self.included(receiver)
      receiver.send :include, InstanceMethods
    end

    module InstanceMethods
      # Counts the vowels in +word+; the stemmer uses this as its syllable count.
      def total_syllables(word)
        word.each_char.count { |character| is_vowel?(character) }
      end

      # Removes a trailing particle (kah, lah, pun) from +word+.
      def remove_particle(word)
        @number_of_syllables ||= total_syllables(word)
        remove_characters_matching_collection(word,
                                              collection_for(:particle),
                                              :end)
      end

      # Removes a trailing possessive pronoun (ku, mu, nya) from +word+.
      def remove_possessive_pronoun(word)
        @number_of_syllables ||= total_syllables(word)
        remove_characters_matching_collection(word,
                                              collection_for(:possessive_pronoun),
                                              :end)
      end

      # Removes a first-order prefix from +word+.
      #
      # The special prefixes (meny, peny, pen) are handled first when they are
      # followed by a vowel: their last letter is replaced ('s' for meny/peny,
      # 't' for pen) and only the letters before it are dropped, e.g.
      # "menyapu" -> "sapu". Anything else goes through the plain prefix table.
      def remove_first_order_prefix(word)
        @number_of_syllables ||= total_syllables(word)

        word_size = word.size
        SPECIAL_FIRST_ORDER_PREFIX_CHARACTERS.each do |characters|
          characters_size = characters.size
          next unless starts_with?(word, word_size, characters) &&
                      word_size > characters_size &&
                      is_vowel?(word[characters_size])

          record_removed_flag(collection_for(characters, 'removed'))
          reduce_syllable
          substitute_word_character(word, characters)
          slice_word_at_position(word, characters_size - 1, :start)
          return word
        end

        remove_characters_matching_collection(word,
                                              collection_for(:first_order_prefix),
                                              :start)
      end

      # Removes a second-order prefix from +word+.
      #
      # Handles, in order: the listed special words (first three letters
      # dropped), "be" + consonant + "er" (only "be" dropped), and finally the
      # plain prefixes ber/per/pe.
      def remove_second_order_prefix(word)
        @number_of_syllables ||= total_syllables(word)
        word_size = word.size

        if SPECIAL_SECOND_ORDER_PREFIX_WORDS.include?(word)
          record_removed_flag(REMOVED_BER) if word[0..1] == 'be'
          reduce_syllable
          slice_word_at_position(word, 3, :start)
          return word
        end

        if starts_with?(word, word_size, 'be') && word_size > 4 &&
           !is_vowel?(word[2]) && word[3..4] == 'er'
          record_removed_flag(REMOVED_BER)
          reduce_syllable
          slice_word_at_position(word, 2, :start)
          return word
        end

        remove_characters_matching_collection(word,
                                              collection_for(:non_special_second_order_prefix),
                                              :start)
      end

      # Removes the first matching suffix (kan, an, i) from +word+, unless a
      # previously removed prefix forbids that suffix.
      def remove_suffix(word)
        @number_of_syllables ||= total_syllables(word)

        # Unset flags mean "no prefix removed". (nil & flag) is false, which
        # would otherwise block every suffix.
        flags = @flags || 0

        SUFFIX_CHARACTERS.each do |character|
          blocking_flags = case character
                           when 'kan'
                             [REMOVED_KE, REMOVED_PENG, REMOVED_PE]
                           when 'an'
                             [REMOVED_DI, REMOVED_MENG, REMOVED_TER]
                           when 'i'
                             [REMOVED_BER, REMOVED_KE, REMOVED_PENG]
                           end

          if ends_with?(word, word.size, character) &&
             blocking_flags.all? { |flag| (flags & flag) == 0 }
            reduce_syllable
            slice_word_at_position(word, character.size, :end)
            return word
          end
        end

        word
      end


      private

      def is_vowel?(character)
        VOWEL_CHARACTERS.include? character
      end

      # Looks up one of this gem's constants by convention.
      #
      # With the default type, +name+ selects an affix table
      # (:particle -> PARTICLE_CHARACTERS). With type 'removed', +name+ is a
      # prefix and the matching REMOVED_* flag is returned, the meN-/peN-
      # variants being folded into REMOVED_MENG / REMOVED_PENG.
      #
      # Returns nil when no such constant exists (e.g. for particles).
      def collection_for(name, type = 'characters')
        constant_name = if type == 'characters'
                          "#{name}_#{type}"
                        else
                          canonical = case
                                      when %w(meny men mem me).include?(name)
                                        'meng'
                                      when %w(peny pen pem).include?(name)
                                        'peng'
                                      else
                                        name
                                      end
                          "#{type}_#{canonical}"
                        end

        # Explicit receiver: on an implicit self that is not a Module,
        # const_get raises NoMethodError, which the NameError rescue below
        # would silently turn into nil.
        IndonesianStemmer.const_get(constant_name.upcase.to_sym)
      rescue NameError
        nil
      end

      # Removes the first entry of +collection+ found at +position+ (:start or
      # :end) of +word+, recording the matching REMOVED_* flag if there is one.
      def remove_characters_matching_collection(word, collection, position)
        collection.each do |characters|
          if send("#{position}s_with?", word, word.size, characters)
            record_removed_flag(collection_for(characters, 'removed'))
            reduce_syllable
            slice_word_at_position(word, characters.size, position)
            return word
          end
        end

        word
      end

      # Cuts +characters_size+ characters off the start or the end of +word+.
      def slice_word_at_position(word, characters_size, position)
        multiplier = (position == :start) ? 0 : -1
        word.slice!(multiplier * characters_size, characters_size)
      end

      # Overwrites the last letter of a special prefix inside +word+ with the
      # letter the stem begins with ('s' for meny/peny, 't' for pen).
      def substitute_word_character(word, characters)
        substitute_char = case
                          when %w(meny peny).include?(characters)
                            's'
                          when characters == 'pen'
                            't'
                          end
        word[characters.size - 1] = substitute_char if substitute_char
        word
      end

      def reduce_syllable
        @number_of_syllables -= 1
      end

      # ORs +flag+ into @flags. `||=` cannot be used here: once @flags has
      # been initialised to 0 it is truthy, so the flag would never be stored.
      # A nil +flag+ (prefix without a REMOVED_* constant) changes nothing.
      def record_removed_flag(flag)
        @flags = (@flags || 0) | flag if flag
      end
    end
  end
end
|
data/lib/indonesian_stemmer/stemmer_utility.rb
ADDED
@@ -0,0 +1,27 @@
|
|
1
|
+
module IndonesianStemmer
  # Low-level string predicates used by the stemming steps.
  module StemmerUtility

    def self.included(receiver)
      receiver.send :include, InstanceMethods
    end

    module InstanceMethods
      # Tells whether +text+ begins with +prefix+.
      #
      # text         - the String to inspect.
      # by_text_size - the length to assume for +text+; a prefix longer than
      #                this never matches.
      # prefix       - the String looked for at the start of +text+.
      #
      # Returns true or false.
      def starts_with?(text, by_text_size, prefix)
        prefix.size <= by_text_size && text.start_with?(prefix)
      end

      # Tells whether +text+ ends with +suffix+.
      #
      # text         - the String to inspect.
      # by_text_size - the length to assume for +text+; a suffix longer than
      #                this never matches.
      # suffix       - the String looked for at the end of +text+.
      #
      # Returns true or false.
      def ends_with?(text, by_text_size, suffix)
        suffix.size <= by_text_size && text.end_with?(suffix)
      end
    end
  end
end
|