indonesian_stemmer 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,17 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
data/.rvmrc ADDED
@@ -0,0 +1,48 @@
1
#!/usr/bin/env bash

# RVM project file: it is sourced when you cd into this directory so that the
# ruby development environment named below gets loaded.

# The desired environment, written as <ruby>[@<gemset>]; the @gemset part is
# optional. Only the full ruby name is supported here. For short names use:
#     echo "rvm use 1.9.3" > .rvmrc
environment_id="ruby-1.9.3-p392@indonesian_stemmer"

# Uncomment the following lines if you want to verify rvm version per project
# rvmrc_rvm_version="1.18.19 (stable)" # 1.10.1 seems as a safe start
# eval "$(echo ${rvm_version}.${rvmrc_rvm_version} | awk -F. '{print "[[ "$1*65536+$2*256+$3" -ge "$4*65536+$5*256+$6" ]]"}' )" || {
#   echo "This .rvmrc file requires at least RVM ${rvmrc_rvm_version}, aborting loading."
#   return 1
# }

# Fast path: source the environment file directly when it exists and is not
# empty. This avoids running through the entire RVM CLI and selector. If you
# want feedback on which environment was used, insert the word 'use' after
# --create below, as this triggers verbose mode.
if [[ -d "${rvm_path:-$HOME/.rvm}/environments" ]] &&
   [[ -s "${rvm_path:-$HOME/.rvm}/environments/$environment_id" ]]
then
  \. "${rvm_path:-$HOME/.rvm}/environments/$environment_id"
  # The after_use hook is optional, and a failure inside it is ignored.
  if [[ -s "${rvm_path:-$HOME/.rvm}/hooks/after_use" ]]
  then
    \. "${rvm_path:-$HOME/.rvm}/hooks/after_use" || true
  fi
else
  # The environment file has not been created yet: let the RVM CLI select it.
  if ! rvm --create "$environment_id"
  then
    echo "Failed to create RVM environment '${environment_id}'."
    return 1
  fi
fi

# If you use bundler, this might be useful to you:
# if [[ -s Gemfile ]] && {
#   ! builtin command -v bundle >/dev/null ||
#   builtin command -v bundle | GREP_OPTIONS= \grep $rvm_path/bin/bundle >/dev/null
# }
# then
#   printf "%b" "The rubygem 'bundler' is not installed. Installing it now.\n"
#   gem install bundler
# fi
# if [[ -s Gemfile ]] && builtin command -v bundle >/dev/null
# then
#   bundle install | GREP_OPTIONS= \grep -vE '^Using|Your bundle is complete'
# fi
data/Gemfile ADDED
@@ -0,0 +1,21 @@
1
source 'https://rubygems.org'

# The gem's own dependencies are declared in indonesian_stemmer.gemspec.
gemspec

# Dependencies that are still in development go here instead of in the
# gemspec. Remember to move them into the gemspec before releasing the gem
# to rubygems.org.

# Gems used only in development.
group :development do
  gem 'guard'
  gem 'guard-rspec'
  gem 'pry', :require => true
  gem 'pry-debugger'

  # These two are only declared when the host OS name contains "darwin".
  if /darwin/ =~ RbConfig::CONFIG['host_os']
    gem 'growl'
    gem 'rb-fsevent', :require => false
  end
end
@@ -0,0 +1,9 @@
1
# Guardfile for the rspec guard.
# More info at https://github.com/guard/guard#readme

guard 'rspec' do
  # A spec file is watched as itself.
  watch(%r{^spec/.+_spec\.rb$})

  # A file under lib/ maps to the spec with the same relative path under
  # spec/lib/.
  watch(%r{^lib/(.+)\.rb$}) do |match|
    "spec/lib/#{match[1]}_spec.rb"
  end

  # The spec helper maps to the whole spec directory.
  watch('spec/spec_helper.rb') do
    "spec"
  end
end
9
+
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2013 Adinda Praditya
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,29 @@
1
+ # IndonesianStemmer
2
+
3
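+ Stems Indonesian (Bahasa Indonesia) words with a Porter-style stemmer, following the algorithm presented in "A Study of Stemming Effects on Information Retrieval in Bahasa Indonesia" by Fadillah Z Tala.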
4
+
5
+ ## Installation
6
+
7
+ Add this line to your application's Gemfile:
8
+
9
+ gem 'indonesian_stemmer'
10
+
11
+ And then execute:
12
+
13
+ $ bundle
14
+
15
+ Or install it yourself as:
16
+
17
+ $ gem install indonesian_stemmer
18
+
19
+ ## Usage
20
+
21
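+ Call `IndonesianStemmer.stem(word)` to stem a word, or call `stem` directly on a String. Pass `false` as the second argument of `IndonesianStemmer.stem` to skip derivational stemming.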
22
+
23
+ ## Contributing
24
+
25
+ 1. Fork it
26
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
27
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
28
+ 4. Push to the branch (`git push origin my-new-feature`)
29
+ 5. Create new Pull Request
@@ -0,0 +1,7 @@
1
require "bundler/gem_tasks"
require "rspec/core/rake_task"

# Register the RSpec rake task.
RSpec::Core::RakeTask.new

# Both `rake` (the default task) and `rake test` depend on the spec task.
[:default, :test].each do |task_name|
  task task_name => :spec
end
@@ -0,0 +1,23 @@
1
# -*- encoding: utf-8 -*-

# Put the gem's lib directory on the load path so the version file can be
# required before the gem is installed.
lib_dir = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib_dir) unless $LOAD_PATH.include?(lib_dir)
require 'indonesian_stemmer/version'

Gem::Specification.new do |spec|
  spec.name          = "indonesian_stemmer"
  spec.version       = IndonesianStemmer::VERSION
  spec.authors       = ["Adinda Praditya"]
  spec.email         = ["apraditya@gmail.com"]
  spec.description   = %q{Stems Indonesian words based on Porter Stemmer, with the algorithm presented in "A Study of Stemming Effects on Information Retrieval in Bahasa Indonesia", Fadillah Z Tala.}
  spec.summary       = %q{Porter Stemmer for Bahasa Indonesia.}
  spec.homepage      = "https://github.com/apraditya/indonesian_stemmer"

  # Package every file git tracks; executables and test files are derived
  # from that list by path prefix.
  tracked_files      = `git ls-files`.split($/)
  spec.files         = tracked_files
  spec.executables   = spec.files.grep(%r{^bin/}).map { |path| File.basename(path) }
  spec.test_files    = spec.files.grep(%r{^(test|spec)/})
  spec.require_paths = ["lib"]

  spec.add_development_dependency 'rake'
  spec.add_development_dependency 'rspec'
end
@@ -0,0 +1,51 @@
1
+ require "indonesian_stemmer/version"
2
+ require "indonesian_stemmer/morphological_utility"
3
+
4
# Module-level entry point for stemming Indonesian words.
#
# State (@flags and @number_of_syllables) is kept on the module itself and is
# reset at the start of every call to +stem+.
module IndonesianStemmer
  class << self
    include MorphologicalUtility

    attr_accessor :number_of_syllables

    # Stems +word+ and returns it.
    #
    # The removal steps edit +word+ in place, so the returned object is the
    # same String that was passed in.
    #
    # word                  - the String to stem.
    # derivational_stemming - when true (the default), prefixes and suffixes
    #                         are stripped after particles and possessive
    #                         pronouns.
    def stem(word, derivational_stemming = true)
      @flags = 0
      @number_of_syllables = total_syllables(word)

      # Each step only runs while more than two syllables remain.
      [:remove_particle, :remove_possessive_pronoun].each do |step|
        send(step, word) if still_has_many_syllables?
      end

      stem_derivational(word) if derivational_stemming

      word
    end

    private

    # Strips prefixes and a suffix from +word+. Which step runs next depends
    # on whether the previous step shortened the word.
    def stem_derivational(word)
      size_before = word.size
      remove_first_order_prefix(word) if still_has_many_syllables?

      if word.size == size_before
        # No first-order prefix was removed: try a second-order prefix, then
        # a suffix.
        remove_second_order_prefix(word) if still_has_many_syllables?
        remove_suffix(word) if still_has_many_syllables?
      else
        # A first-order prefix was removed: try a suffix, and only if one was
        # removed go on to a second-order prefix.
        size_before = word.size
        remove_suffix(word) if still_has_many_syllables?

        if word.size != size_before
          remove_second_order_prefix(word) if still_has_many_syllables?
        end
      end
    end

    # True while the tracked syllable count is above two.
    def still_has_many_syllables?
      @number_of_syllables > 2
    end
  end
end
46
+
47
class String
  # Returns the Indonesian stem of this string.
  #
  # IndonesianStemmer.stem edits the string it is given in place, so a copy
  # is stemmed here. The receiver is left untouched, and frozen strings can be
  # stemmed too.
  def stem
    IndonesianStemmer.stem(dup)
  end
end
@@ -0,0 +1,183 @@
1
+ require "indonesian_stemmer/stemmer_utility"
2
+
3
module IndonesianStemmer

  # Letters counted as vowels; total_syllables counts one syllable per vowel.
  VOWEL_CHARACTERS = %w( a e i o u )
  PARTICLE_CHARACTERS = %w( kah lah pun )
  POSSESSIVE_PRONOUN_CHARACTERS = %w( ku mu nya )
  # Tried in this order and the first match wins, so longer prefixes are
  # listed before the shorter ones they contain.
  FIRST_ORDER_PREFIX_CHARACTERS = %w( meng meny men mem me
                                      peng peny pen pem di ter ke )
  # First-order prefixes that, when followed by a vowel, replace their last
  # letter instead of being removed whole (see substitute_word_character).
  SPECIAL_FIRST_ORDER_PREFIX_CHARACTERS = %w( meny peny pen )
  SECOND_ORDER_PREFIX_CHARACTERS = %w( ber be per pe )
  SPECIAL_SECOND_ORDER_PREFIX_CHARACTERS = %w( be )
  NON_SPECIAL_SECOND_ORDER_PREFIX_CHARACTERS = %w( ber per pe )
  # Whole words whose first three letters are removed as a prefix.
  SPECIAL_SECOND_ORDER_PREFIX_WORDS = %w( belajar pelajar belunjur )
  SUFFIX_CHARACTERS = %w( kan an i )

  # Bit flags collected in @flags as prefixes are removed. remove_suffix
  # reads them to decide whether a suffix may still be stripped.
  REMOVED_KE    = 1
  REMOVED_PENG  = 2
  REMOVED_DI    = 4
  REMOVED_MENG  = 8
  REMOVED_TER   = 16
  REMOVED_BER   = 32
  REMOVED_PE    = 64


  # Affix-removal steps of the stemmer.
  #
  # Every remove_* method edits +word+ in place and returns it. The methods
  # share two instance variables on the including object:
  # @number_of_syllables (decremented on each removal) and @flags (a bit mask
  # of REMOVED_* constants).
  module MorphologicalUtility
    include StemmerUtility

    def self.included(receiver)
      receiver.send :include, InstanceMethods
    end

    module InstanceMethods
      # Counts the vowels in +word+; each vowel is treated as one syllable.
      def total_syllables(word)
        word.each_char.count { |character| is_vowel?(character) }
      end

      # Removes a trailing particle (kah, lah, pun) from +word+.
      def remove_particle(word)
        @number_of_syllables ||= total_syllables(word)
        remove_characters_matching_collection(word,
                                              collection_for(:particle),
                                              :end)
      end

      # Removes a trailing possessive pronoun (ku, mu, nya) from +word+.
      def remove_possessive_pronoun(word)
        @number_of_syllables ||= total_syllables(word)
        remove_characters_matching_collection(word,
                                              collection_for(:possessive_pronoun),
                                              :end)
      end

      # Removes a first-order prefix from +word+ and records the matching
      # REMOVED_* flag.
      #
      # The special prefixes (meny, peny, pen) are checked first. When one of
      # them is followed by a vowel, its last letter is replaced by the
      # consonant from substitute_word_character and only the letters before
      # it are removed. Otherwise the plain first-order prefix list is tried.
      def remove_first_order_prefix(word)
        @number_of_syllables ||= total_syllables(word)

        SPECIAL_FIRST_ORDER_PREFIX_CHARACTERS.each do |characters|
          characters_size = characters.size
          next unless starts_with?(word, word.size, characters) &&
                      word.size > characters_size &&
                      is_vowel?(word[characters_size])

          record_removed_flag(collection_for(characters, 'removed'))
          reduce_syllable
          substitute_word_character(word, characters)
          slice_word_at_position(word, characters_size - 1, :start)
          return word
        end

        remove_characters_matching_collection(word,
                                              collection_for(:first_order_prefix),
                                              :start)
      end

      # Removes a second-order prefix from +word+ and records the matching
      # REMOVED_* flag when one exists for that prefix.
      def remove_second_order_prefix(word)
        @number_of_syllables ||= total_syllables(word)

        # Whole-word exceptions: drop the first three letters.
        if SPECIAL_SECOND_ORDER_PREFIX_WORDS.include?(word)
          record_removed_flag(REMOVED_BER) if word[0..1] == 'be'
          reduce_syllable
          slice_word_at_position(word, 3, :start)
          return word
        end

        # "be" + consonant + "er": only the "be" is removed.
        if starts_with?(word, word.size, 'be') && word.size > 4 &&
           !is_vowel?(word[2]) && word[3..4] == 'er'
          record_removed_flag(REMOVED_BER)
          reduce_syllable
          slice_word_at_position(word, 2, :start)
          return word
        end

        remove_characters_matching_collection(word,
                                              collection_for(:non_special_second_order_prefix),
                                              :start)
      end

      # Removes the first suffix in SUFFIX_CHARACTERS that +word+ ends with
      # and that is not vetoed by a previously removed prefix.
      def remove_suffix(word)
        @number_of_syllables ||= total_syllables(word)
        # Treat "no flags recorded yet" as an empty mask.
        removed_flags = @flags || 0

        SUFFIX_CHARACTERS.each do |character|
          # Prefix flags that forbid stripping this particular suffix.
          constants_to_check = case character
          when 'kan'
            [REMOVED_KE, REMOVED_PENG, REMOVED_PE]
          when 'an'
            [REMOVED_DI, REMOVED_MENG, REMOVED_TER]
          when 'i'
            [REMOVED_BER, REMOVED_KE, REMOVED_PENG]
          end

          if ends_with?(word, word.size, character) &&
             constants_to_check.all? { |c| (removed_flags & c) == 0 }
            reduce_syllable
            slice_word_at_position(word, character.size, :end)
            return word
          end
        end

        word
      end


      private

      def is_vowel?(character)
        VOWEL_CHARACTERS.include? character
      end

      # Looks up a constant by name.
      #
      # With the default type, +name+ is a collection name and the constant
      # is NAME_CHARACTERS. With any other type, +name+ is an affix and the
      # constant is TYPE_NAME, after the variants of meng and peng are folded
      # into their base form.
      #
      # Returns nil when no such constant exists.
      def collection_for(name, type = 'characters')
        constant_name = if type == 'characters'
          "#{name}_#{type}"
        else
          name = case
          when %w(meny men mem me).include?(name)
            'meng'
          when %w(peny pen pem).include?(name)
            'peng'
          else
            name
          end
          "#{type}_#{name}"
        end
        const_get("#{constant_name}".upcase.to_sym)
      rescue NameError
        # Many affixes (particles, pronouns, "per") have no REMOVED_* flag.
        nil
      end

      # ORs +flag+ into the @flags bit mask. A nil +flag+ (an affix with no
      # REMOVED_* constant) leaves the mask unchanged.
      #
      # A bitwise OR is required here: `@flags ||= flag` never assigns once
      # @flags is 0, because 0 is truthy in Ruby.
      def record_removed_flag(flag)
        @flags = (@flags || 0) | (flag || 0)
      end

      # Removes the first entry of +collection+ found at the :start or :end
      # of +word+, recording its flag and reducing the syllable count.
      # Returns +word+ whether or not anything was removed.
      def remove_characters_matching_collection(word, collection, position)
        collection.each do |characters|
          # position is :start or :end, giving starts_with? / ends_with?.
          next unless send("#{position}s_with?", word, word.size, characters)

          record_removed_flag(collection_for(characters, 'removed'))
          reduce_syllable
          slice_word_at_position(word, characters.size, position)
          return word
        end

        word
      end

      # Deletes +characters_size+ characters from the start of +word+ when
      # position is :start, otherwise from its end. Edits +word+ in place.
      def slice_word_at_position(word, characters_size, position)
        start_index = position == :start ? 0 : -characters_size
        word.slice!(start_index, characters_size)
      end

      # Overwrites the last letter of the matched special prefix in +word+
      # with 's' (meny, peny) or 't' (pen). Edits +word+ in place.
      def substitute_word_character(word, characters)
        substitute_char = case
        when %w(meny peny).include?(characters)
          's'
        when characters == 'pen'
          't'
        end
        word[characters.size - 1] = substitute_char if substitute_char
        word
      end

      def reduce_syllable
        @number_of_syllables -= 1
      end
    end
  end
end
@@ -0,0 +1,27 @@
1
module IndonesianStemmer
  # String predicates used by the affix-removal steps.
  module StemmerUtility

    def self.included(receiver)
      receiver.send :include, InstanceMethods
    end

    module InstanceMethods
      # Returns true when +text+ begins with +prefix+.
      #
      # by_text_size - the length to treat +text+ as having; a prefix longer
      #                than this never matches.
      def starts_with?(text, by_text_size, prefix)
        prefix.size <= by_text_size && text.start_with?(prefix)
      end

      # Returns true when +text+ ends with +suffix+.
      #
      # by_text_size - the length to treat +text+ as having; a suffix longer
      #                than this never matches.
      def ends_with?(text, by_text_size, suffix)
        suffix.size <= by_text_size && text.end_with?(suffix)
      end
    end
  end
end