indonesian_stemmer 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +17 -0
- data/.rvmrc +48 -0
- data/Gemfile +21 -0
- data/Guardfile +9 -0
- data/LICENSE.txt +22 -0
- data/README.md +29 -0
- data/Rakefile +7 -0
- data/indonesian_stemmer.gemspec +23 -0
- data/lib/indonesian_stemmer.rb +51 -0
- data/lib/indonesian_stemmer/morphological_utility.rb +183 -0
- data/lib/indonesian_stemmer/stemmer_utility.rb +27 -0
- data/lib/indonesian_stemmer/version.rb +3 -0
- data/spec/indonesian_stemmer/indonesian_stemmer_spec.rb +145 -0
- data/spec/indonesian_stemmer/morphological_utility_spec.rb +630 -0
- data/spec/indonesian_stemmer/stemmer_utility_spec.rb +59 -0
- data/spec/spec_helper.rb +47 -0
- metadata +105 -0
data/.gitignore
ADDED
data/.rvmrc
ADDED
@@ -0,0 +1,48 @@
|
|
1
|
+
#!/usr/bin/env bash
|
2
|
+
|
3
|
+
# This is an RVM Project .rvmrc file, used to automatically load the ruby
|
4
|
+
# development environment upon cd'ing into the directory
|
5
|
+
|
6
|
+
# First we specify our desired <ruby>[@<gemset>], the @gemset name is optional,
|
7
|
+
# Only full ruby name is supported here, for short names use:
|
8
|
+
# echo "rvm use 1.9.3" > .rvmrc
|
9
|
+
environment_id="ruby-1.9.3-p392@indonesian_stemmer"
|
10
|
+
|
11
|
+
# Uncomment the following lines if you want to verify rvm version per project
|
12
|
+
# rvmrc_rvm_version="1.18.19 (stable)" # 1.10.1 seems as a safe start
|
13
|
+
# eval "$(echo ${rvm_version}.${rvmrc_rvm_version} | awk -F. '{print "[[ "$1*65536+$2*256+$3" -ge "$4*65536+$5*256+$6" ]]"}' )" || {
|
14
|
+
# echo "This .rvmrc file requires at least RVM ${rvmrc_rvm_version}, aborting loading."
|
15
|
+
# return 1
|
16
|
+
# }
|
17
|
+
|
18
|
+
# First we attempt to load the desired environment directly from the environment
|
19
|
+
# file. This is very fast and efficient compared to running through the entire
|
20
|
+
# CLI and selector. If you want feedback on which environment was used then
|
21
|
+
# insert the word 'use' after --create as this triggers verbose mode.
|
22
|
+
if [[ -d "${rvm_path:-$HOME/.rvm}/environments"
|
23
|
+
&& -s "${rvm_path:-$HOME/.rvm}/environments/$environment_id" ]]
|
24
|
+
then
|
25
|
+
\. "${rvm_path:-$HOME/.rvm}/environments/$environment_id"
|
26
|
+
[[ -s "${rvm_path:-$HOME/.rvm}/hooks/after_use" ]] &&
|
27
|
+
\. "${rvm_path:-$HOME/.rvm}/hooks/after_use" || true
|
28
|
+
else
|
29
|
+
# If the environment file has not yet been created, use the RVM CLI to select.
|
30
|
+
rvm --create "$environment_id" || {
|
31
|
+
echo "Failed to create RVM environment '${environment_id}'."
|
32
|
+
return 1
|
33
|
+
}
|
34
|
+
fi
|
35
|
+
|
36
|
+
# If you use bundler, this might be useful to you:
|
37
|
+
# if [[ -s Gemfile ]] && {
|
38
|
+
# ! builtin command -v bundle >/dev/null ||
|
39
|
+
# builtin command -v bundle | GREP_OPTIONS= \grep $rvm_path/bin/bundle >/dev/null
|
40
|
+
# }
|
41
|
+
# then
|
42
|
+
# printf "%b" "The rubygem 'bundler' is not installed. Installing it now.\n"
|
43
|
+
# gem install bundler
|
44
|
+
# fi
|
45
|
+
# if [[ -s Gemfile ]] && builtin command -v bundle >/dev/null
|
46
|
+
# then
|
47
|
+
# bundle install | GREP_OPTIONS= \grep -vE '^Using|Your bundle is complete'
|
48
|
+
# fi
|
data/Gemfile
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
source 'https://rubygems.org'

# The gem's own dependencies are declared in indonesian_stemmer.gemspec.
gemspec

# Dependencies that are still in development can be declared here instead of
# in the gemspec; move them to the gemspec before releasing to rubygems.org.

# Tooling that is only needed while developing the gem.
group :development do
  %w[guard guard-rspec].each { |tool| gem tool }
  gem 'pry', :require => true
  gem 'pry-debugger'

  # Extra helpers, only declared when the host OS name matches /darwin/.
  darwin_host = RbConfig::CONFIG['host_os'] =~ /darwin/
  if darwin_host
    gem 'growl'
    gem 'rb-fsevent', :require => false
  end
end
|
data/Guardfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2013 Adinda Praditya
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
# IndonesianStemmer
|
2
|
+
|
3
|
+
TODO: Write a gem description
|
4
|
+
|
5
|
+
## Installation
|
6
|
+
|
7
|
+
Add this line to your application's Gemfile:
|
8
|
+
|
9
|
+
gem 'indonesian_stemmer'
|
10
|
+
|
11
|
+
And then execute:
|
12
|
+
|
13
|
+
$ bundle
|
14
|
+
|
15
|
+
Or install it yourself as:
|
16
|
+
|
17
|
+
$ gem install indonesian_stemmer
|
18
|
+
|
19
|
+
## Usage
|
20
|
+
|
21
|
+
TODO: Write usage instructions here
|
22
|
+
|
23
|
+
## Contributing
|
24
|
+
|
25
|
+
1. Fork it
|
26
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
27
|
+
3. Commit your changes (`git commit -am 'Add some feature'`)
|
28
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
29
|
+
5. Create new Pull Request
|
data/Rakefile
ADDED
@@ -0,0 +1,23 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
# Put the gem's lib/ directory on the load path so the version file can be
# required straight from the working tree.
lib_dir = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib_dir) unless $LOAD_PATH.include?(lib_dir)
require 'indonesian_stemmer/version'

Gem::Specification.new do |spec|
  # Identity and metadata.
  spec.name        = 'indonesian_stemmer'
  spec.version     = IndonesianStemmer::VERSION
  spec.authors     = ['Adinda Praditya']
  spec.email       = ['apraditya@gmail.com']
  spec.description = 'Stems Indonesian words based on Porter Stemmer, with the algorithm presented in "A Study of Stemming Effects on Information Retrieval in Bahasa Indonesia", Fadillah Z Tala.'
  spec.summary     = 'Porter Stemmer for Bahasa Indonesia.'
  spec.homepage    = 'https://github.com/apraditya/indonesian_stemmer'

  # Packaged files are whatever git tracks; executables and test files are
  # picked out of that list by path prefix.
  spec.files         = `git ls-files`.split($/)
  spec.executables   = spec.files.grep(%r{^bin/}).map { |path| File.basename(path) }
  spec.test_files    = spec.files.grep(%r{^(test|spec)/})
  spec.require_paths = ['lib']

  # Development-only dependencies.
  %w[rake rspec].each { |dependency| spec.add_development_dependency dependency }
end
|
@@ -0,0 +1,51 @@
|
|
1
|
+
require "indonesian_stemmer/version"
|
2
|
+
require "indonesian_stemmer/morphological_utility"
|
3
|
+
|
4
|
+
module IndonesianStemmer
|
5
|
+
|
6
|
+
class << self
|
7
|
+
include MorphologicalUtility
|
8
|
+
|
9
|
+
attr_accessor :number_of_syllables
|
10
|
+
|
11
|
+
# Stems an Indonesian word and returns the result.
#
# The removal helpers shorten their argument in place (they use String#slice!),
# so the stemming is done on a copy: the caller's string is left untouched and
# frozen strings are accepted.
#
# word                  - the String to stem.
# derivational_stemming - when true (the default) prefixes and suffixes are
#                         stripped as well; when false only the particle and
#                         possessive pronoun steps run.
#
# Each removal step runs only while more than two syllables remain.
# Returns the stemmed String (a new object).
def stem(word, derivational_stemming = true)
  word = word.dup
  @flags = 0
  @number_of_syllables = total_syllables(word)

  remove_particle(word) if still_has_many_syllables?
  remove_possessive_pronoun(word) if still_has_many_syllables?

  stem_derivational(word) if derivational_stemming

  word
end
|
22
|
+
|
23
|
+
|
24
|
+
private
|
25
|
+
# Strips derivational affixes from +word+ in place.
#
# A first-order prefix is tried first. Whether that changed the word's length
# decides the order of the remaining steps:
# * prefix removed: try the suffix, and only if that also shortened the word
#   try a second-order prefix;
# * nothing removed: try a second-order prefix, then the suffix.
# Every step is skipped once two or fewer syllables remain.
def stem_derivational(word)
  length_before = word.size
  remove_first_order_prefix(word) if still_has_many_syllables?

  if length_before == word.size
    # No first-order prefix rule fired.
    remove_second_order_prefix(word) if still_has_many_syllables?
    remove_suffix(word) if still_has_many_syllables?
  else
    length_before = word.size
    remove_suffix(word) if still_has_many_syllables?

    if length_before != word.size && still_has_many_syllables?
      remove_second_order_prefix(word)
    end
  end
end
|
40
|
+
|
41
|
+
# True while the running syllable count is above two, i.e. while the word is
# still long enough for another affix to be stripped.
def still_has_many_syllables?
  remaining = @number_of_syllables
  remaining > 2
end
|
44
|
+
end
|
45
|
+
end
|
46
|
+
|
47
|
+
class String
  # Returns the Indonesian stem of this string.
  #
  # IndonesianStemmer's removal helpers shorten their argument in place, so a
  # copy is handed over: the receiver is never modified and frozen strings
  # can be stemmed.
  def stem
    IndonesianStemmer.stem(dup)
  end
end
|
@@ -0,0 +1,183 @@
|
|
1
|
+
require "indonesian_stemmer/stemmer_utility"
|
2
|
+
|
3
|
+
module IndonesianStemmer
|
4
|
+
|
5
|
+
# Letters treated as vowels; total_syllables counts one syllable per vowel.
VOWEL_CHARACTERS = %w( a e i o u ).freeze
# Word endings stripped by remove_particle.
PARTICLE_CHARACTERS = %w( kah lah pun ).freeze
# Word endings stripped by remove_possessive_pronoun.
POSSESSIVE_PRONOUN_CHARACTERS = %w( ku mu nya ).freeze
# Prefixes stripped by remove_first_order_prefix. Order matters: the first
# matching entry wins, so longer prefixes are listed before shorter ones.
FIRST_ORDER_PREFIX_CHARACTERS = %w( meng meny men mem me
                                    peng peny pen pem di ter ke ).freeze
# First-order prefixes whose last letter is replaced (instead of dropped)
# when the prefix is followed by a vowel.
SPECIAL_FIRST_ORDER_PREFIX_CHARACTERS = %w( meny peny pen ).freeze
# All second-order prefixes.
SECOND_ORDER_PREFIX_CHARACTERS = %w( ber be per pe ).freeze
# Second-order prefix that needs extra conditions before it is removed.
SPECIAL_SECOND_ORDER_PREFIX_CHARACTERS = %w( be ).freeze
# Second-order prefixes removed unconditionally on a match.
NON_SPECIAL_SECOND_ORDER_PREFIX_CHARACTERS = %w( ber per pe ).freeze
# Whole words whose first three letters are removed by
# remove_second_order_prefix.
SPECIAL_SECOND_ORDER_PREFIX_WORDS = %w( belajar pelajar belunjur ).freeze
# Suffixes tried, in this order, by remove_suffix.
SUFFIX_CHARACTERS = %w( kan an i ).freeze

# Bit flags kept in @flags to remember which prefix has been removed;
# remove_suffix tests them with & to rule out certain suffixes.
REMOVED_KE    = 1
REMOVED_PENG  = 2
REMOVED_DI    = 4
REMOVED_MENG  = 8
REMOVED_TER   = 16
REMOVED_BER   = 32
REMOVED_PE    = 64
|
24
|
+
|
25
|
+
|
26
|
+
module MorphologicalUtility
|
27
|
+
include StemmerUtility
|
28
|
+
|
29
|
+
# Mixin hook: whenever this module is included somewhere, also include its
# InstanceMethods into the including class or module.
def self.included(receiver)
  receiver.send(:include, InstanceMethods)
end
|
32
|
+
|
33
|
+
module InstanceMethods
|
34
|
+
# Counts the syllables of +word+, approximated as the number of positions
# holding a vowel (see is_vowel?).
def total_syllables(word)
  (0...word.size).count { |index| is_vowel?(word[index]) }
end
|
41
|
+
|
42
|
+
# Strips a trailing particle (an entry of the :particle collection) from
# +word+ in place and returns the word. The syllable counter is initialised
# from the word if it has not been set yet.
def remove_particle(word)
  @number_of_syllables ||= total_syllables(word)

  particles = collection_for(:particle)
  remove_characters_matching_collection(word, particles, :end)
end
|
48
|
+
|
49
|
+
# Strips a trailing possessive pronoun (an entry of the :possessive_pronoun
# collection) from +word+ in place and returns the word. The syllable counter
# is initialised from the word if it has not been set yet.
def remove_possessive_pronoun(word)
  @number_of_syllables ||= total_syllables(word)

  pronouns = collection_for(:possessive_pronoun)
  remove_characters_matching_collection(word, pronouns, :end)
end
|
55
|
+
|
56
|
+
# Strips a first-order prefix from +word+ in place and returns the word.
#
# The special prefixes (meny, peny, pen) are handled first: when one of them
# is followed by a vowel, its last letter is replaced (see
# substitute_word_character) and only the letters before it are removed.
# Otherwise the word is matched against the regular first-order prefix list.
#
# The flag for the removed prefix is OR-ed into @flags. A plain `||=` would
# never store anything once @flags holds 0, because 0 is truthy in Ruby.
def remove_first_order_prefix(word)
  @number_of_syllables ||= total_syllables(word)

  SPECIAL_FIRST_ORDER_PREFIX_CHARACTERS.each do |characters|
    characters_size = characters.size
    next unless starts_with?(word, word.size, characters) &&
                word.size > characters_size &&
                is_vowel?(word[characters_size])

    # Unknown prefixes have no flag (collection_for returns nil): add nothing.
    @flags = (@flags || 0) | (collection_for(characters, 'removed') || 0)
    reduce_syllable
    substitute_word_character(word, characters)
    slice_word_at_position(word, characters_size - 1, :start)
    return word
  end

  remove_characters_matching_collection(word,
                                        collection_for(:first_order_prefix),
                                        :start)
end
|
77
|
+
|
78
|
+
# Strips a second-order prefix from +word+ in place and returns the word.
#
# Three cases are tried in order:
# 1. the word is one of SPECIAL_SECOND_ORDER_PREFIX_WORDS: its first three
#    letters are removed (REMOVED_BER is recorded only for the "be" words);
# 2. the word starts with "be", is longer than four letters, has a consonant
#    as third letter and "er" as fourth and fifth: "be" is removed;
# 3. otherwise the non-special second-order prefix list is matched.
#
# Flags are OR-ed into @flags. A plain `||=` would never store anything once
# @flags holds 0, because 0 is truthy in Ruby.
def remove_second_order_prefix(word)
  @number_of_syllables ||= total_syllables(word)
  word_size = word.size

  if SPECIAL_SECOND_ORDER_PREFIX_WORDS.include?(word)
    @flags = (@flags || 0) | REMOVED_BER if word[0..1] == 'be'
    reduce_syllable
    slice_word_at_position(word, 3, :start)
    return word
  end

  if starts_with?(word, word_size, 'be') && word_size > 4 &&
     !is_vowel?(word[2]) && word[3..4] == 'er'
    @flags = (@flags || 0) | REMOVED_BER
    reduce_syllable
    slice_word_at_position(word, 2, :start)
    return word
  end

  remove_characters_matching_collection(word,
                                        collection_for(:non_special_second_order_prefix),
                                        :start)
end
|
100
|
+
|
101
|
+
# Strips the first matching suffix (kan, an, i - tried in that order) from
# +word+ in place and returns the word.
#
# A suffix is only removed when none of the prefix flags that rule it out
# is set in @flags:
#   kan - REMOVED_KE, REMOVED_PENG, REMOVED_PE
#   an  - REMOVED_DI, REMOVED_MENG, REMOVED_TER
#   i   - REMOVED_BER, REMOVED_KE, REMOVED_PENG
#
# An unset (nil) @flags is treated as "no prefix removed". Without that,
# `nil & flag` yields false, which never equals 0, so no suffix could be
# removed.
def remove_suffix(word)
  @number_of_syllables ||= total_syllables(word)
  flags = @flags || 0

  SUFFIX_CHARACTERS.each do |character|
    blocking_flags = case character
                     when 'kan' then [REMOVED_KE, REMOVED_PENG, REMOVED_PE]
                     when 'an'  then [REMOVED_DI, REMOVED_MENG, REMOVED_TER]
                     when 'i'   then [REMOVED_BER, REMOVED_KE, REMOVED_PENG]
                     end

    next unless ends_with?(word, word.size, character)
    next unless blocking_flags.all? { |flag| (flags & flag) == 0 }

    reduce_syllable
    slice_word_at_position(word, character.size, :end)
    return word
  end

  word
end
|
124
|
+
|
125
|
+
|
126
|
+
private
|
127
|
+
# True when +character+ is one of VOWEL_CHARACTERS.
def is_vowel?(character)
  VOWEL_CHARACTERS.any? { |vowel| vowel == character }
end
|
130
|
+
|
131
|
+
# Looks up one of the IndonesianStemmer constants by name.
#
# name - a collection name such as :particle, or a literal affix such as 'mem'.
# type - 'characters' (default) resolves "<NAME>_CHARACTERS"; any other value
#        resolves "<TYPE>_<NAME>", e.g. type 'removed' and name 'ke' resolve
#        REMOVED_KE. For those lookups the variants meny/men/mem/me are
#        folded to 'meng' and peny/pen/pem to 'peng' first.
#
# The constant is resolved on the IndonesianStemmer namespace explicitly.
# Calling const_get on the implicit receiver only works when the mixin's host
# is itself a Module; on any other host the NoMethodError (a NameError
# subclass) would be swallowed by the rescue below and every lookup would
# silently return nil.
#
# Returns the constant's value, or nil when no such constant exists.
def collection_for(name, type = 'characters')
  constant_name =
    if type == 'characters'
      "#{name}_#{type}"
    else
      canonical_name =
        if %w(meny men mem me).include?(name)
          'meng'
        elsif %w(peny pen pem).include?(name)
          'peng'
        else
          name
        end
      "#{type}_#{canonical_name}"
    end

  IndonesianStemmer.const_get(constant_name.upcase.to_sym)
rescue NameError
  nil
end
|
148
|
+
|
149
|
+
# Removes the first entry of +collection+ found at the given end of +word+.
#
# word       - the String to shorten; it is modified in place.
# collection - affixes to try, in order; the first match wins.
# position   - :start or :end; selects starts_with? / ends_with? and the side
#              the characters are sliced from.
#
# On a match the flag belonging to the affix (if any) is OR-ed into @flags
# and the syllable counter is decremented. A plain `||=` would never store a
# flag once @flags holds 0, because 0 is truthy in Ruby.
#
# Returns the word, whether or not it was changed.
def remove_characters_matching_collection(word, collection, position)
  matcher = "#{position}s_with?"

  collection.each do |characters|
    next unless send(matcher, word, word.size, characters)

    # Affixes without a REMOVED_* constant yield nil: add nothing for them.
    @flags = (@flags || 0) | (collection_for(characters, 'removed') || 0)
    reduce_syllable
    slice_word_at_position(word, characters.size, position)
    return word
  end

  word
end
|
161
|
+
|
162
|
+
# Cuts +characters_size+ characters out of +word+ in place: from the front
# when +position+ is :start, otherwise from the back. Returns whatever
# String#slice! returns (the removed characters, or nil if out of range).
def slice_word_at_position(word, characters_size, position)
  offset = position == :start ? 0 : -characters_size
  word.slice!(offset, characters_size)
end
|
166
|
+
|
167
|
+
# Overwrites the letter of +word+ that sits at the last position of the given
# prefix: 's' for the prefixes meny/peny, 't' for pen. Any other prefix leaves
# the word as it is. The word is changed in place and returned.
def substitute_word_character(word, characters)
  replacement = { 'meny' => 's', 'peny' => 's', 'pen' => 't' }[characters]
  word[characters.size - 1] = replacement if replacement
  word
end
|
177
|
+
|
178
|
+
# Lowers the running syllable count by one and returns the new count.
def reduce_syllable
  @number_of_syllables = @number_of_syllables - 1
end
|
181
|
+
end
|
182
|
+
end
|
183
|
+
end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
module IndonesianStemmer
  # Low-level string predicates shared by the stemming steps.
  module StemmerUtility

    # Mixin hook: including StemmerUtility also includes its InstanceMethods
    # into the including class or module.
    def self.included(receiver)
      receiver.send :include, InstanceMethods
    end

    module InstanceMethods
      # True when +text+ begins with +prefix+.
      #
      # by_text_size - the length the caller considers +text+ to have; a
      #                prefix longer than this never matches.
      def starts_with?(text, by_text_size, prefix)
        return false if prefix.size > by_text_size

        text.start_with?(prefix)
      end

      # True when +text+ ends with +suffix+.
      #
      # by_text_size - the length the caller considers +text+ to have; a
      #                suffix longer than this never matches.
      def ends_with?(text, by_text_size, suffix)
        return false if suffix.size > by_text_size

        text.end_with?(suffix)
      end
    end
  end
end
|