sastrawi-ruby 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.github/workflows/ci.yml +23 -0
- data/.gitignore +51 -0
- data/.travis.yml +10 -0
- data/CONTRIBUTING.md +22 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +21 -0
- data/README.md +104 -0
- data/Rakefile +6 -0
- data/_config.yml +1 -0
- data/bin/sastrawi +24 -0
- data/data/base-word.txt +29933 -0
- data/lib/sastrawi/dictionary/array_dictionary.rb +67 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule10.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule11.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule12.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule13a.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule13b.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule14.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule15a.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule15b.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule16.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule17a.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule17b.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule17c.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule17d.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule18a.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule18b.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule19.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule1a.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule1b.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule2.rb +19 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule20.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule21a.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule21b.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule23.rb +19 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule24.rb +19 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule25.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule26a.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule26b.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule27.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule28a.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule28b.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule29.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule3.rb +19 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule30a.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule30b.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule30c.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule31a.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule31b.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule32.rb +19 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule34.rb +19 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule35.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule36.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule37a.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule37b.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule38a.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule38b.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule39a.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule39b.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule4.rb +11 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule40a.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule40b.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule41.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule42.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule5.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule6a.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule6b.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule7.rb +19 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule8.rb +19 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule9.rb +19 -0
- data/lib/sastrawi/morphology/invalid_affix_pair_specification.rb +28 -0
- data/lib/sastrawi/stemmer/cache/array_cache.rb +25 -0
- data/lib/sastrawi/stemmer/cached_stemmer.rb +33 -0
- data/lib/sastrawi/stemmer/confix_stripping/precedence_adjustment_specification.rb +25 -0
- data/lib/sastrawi/stemmer/context/context.rb +217 -0
- data/lib/sastrawi/stemmer/context/removal.rb +17 -0
- data/lib/sastrawi/stemmer/context/visitor/dont_stem_short_word.rb +17 -0
- data/lib/sastrawi/stemmer/context/visitor/prefix_disambiguator.rb +54 -0
- data/lib/sastrawi/stemmer/context/visitor/remove_derivational_suffix.rb +37 -0
- data/lib/sastrawi/stemmer/context/visitor/remove_inflectional_particle.rb +34 -0
- data/lib/sastrawi/stemmer/context/visitor/remove_inflectional_possessive_pronoun.rb +34 -0
- data/lib/sastrawi/stemmer/context/visitor/remove_plain_prefix.rb +34 -0
- data/lib/sastrawi/stemmer/context/visitor/visitor_provider.rb +157 -0
- data/lib/sastrawi/stemmer/filter/text_normalizer.rb +15 -0
- data/lib/sastrawi/stemmer/stemmer.rb +101 -0
- data/lib/sastrawi/stemmer/stemmer_factory.rb +49 -0
- data/lib/sastrawi/stop_word_remover/stop_word_remover.rb +27 -0
- data/lib/sastrawi/stop_word_remover/stop_word_remover_factory.rb +124 -0
- data/lib/sastrawi/version.rb +5 -0
- data/lib/sastrawi.rb +4 -0
- data/sastrawi.gemspec +34 -0
- metadata +179 -0
|
@@ -0,0 +1,217 @@
|
|
|
1
|
+
require 'sastrawi/stemmer/confix_stripping/precedence_adjustment_specification'
|
|
2
|
+
|
|
3
|
+
##
|
|
4
|
+
# Stemming context using Nazief and Adriani, Confix Stripping (CS),
|
|
5
|
+
# Enhanced Confix Stripping (ECS), and Improved ECS
|
|
6
|
+
|
|
7
|
+
module Sastrawi
  module Stemmer
    module Context
      ##
      # Holds the state of one stemming run (the word being reduced, the
      # affix removals recorded so far, the final result) and drives the
      # visitors supplied by the visitor provider over that state.
      class Context
        attr_reader :original_word, :dictionary, :visitor_provider, :visitors, :suffix_visitors, :prefix_visitors
        attr_accessor :current_word, :process_is_stopped, :removals, :result

        ##
        # original_word::   the word to stem
        # dictionary::      object responding to +contains?(word)+
        # visitor_provider:: object exposing +visitors+, +suffix_visitors+
        #                    and +prefix_visitors+
        def initialize(original_word, dictionary, visitor_provider)
          @original_word = original_word
          @current_word = original_word
          @dictionary = dictionary
          @visitor_provider = visitor_provider

          @process_is_stopped = false
          @removals = []
          @visitors = nil
          @suffix_visitors = nil
          @prefix_visitors = nil
          @result = nil

          init_visitors
        end

        ##
        # Copy the three visitor lists from the visitor provider.
        def init_visitors
          @visitors = @visitor_provider.visitors
          @suffix_visitors = @visitor_provider.suffix_visitors
          @prefix_visitors = @visitor_provider.prefix_visitors
        end

        ##
        # Flag the process as stopped; the visitor loops check this flag
        # after every visitor and return early once it is set.
        def stop_process
          @process_is_stopped = true
        end

        ##
        # Record an affix removal performed by a visitor.
        def add_removal(removal)
          @removals.push(removal)
        end

        ##
        # Execute stemming process.
        #
        # The result is the current word when the dictionary contains it
        # after processing, otherwise the untouched original word.
        def execute
          start_stemming_process

          @result = @dictionary.contains?(@current_word) ? @current_word : @original_word
        end

        ##
        # Run the stemming steps in order, returning as soon as the current
        # word is found in the dictionary.
        def start_stemming_process
          return if @dictionary.contains?(@current_word)

          accept_visitors(@visitors)

          return if @dictionary.contains?(@current_word)

          precedence_specification = Sastrawi::Stemmer::ConfixStripping::PrecedenceAdjustmentSpecification.new

          ##
          # Confix stripping
          # try to remove prefix before suffix if the specification is met
          if precedence_specification.satisfied_by?(@original_word)
            return if strip_prefixes_before_suffixes

            # The prefix-first attempt failed: start over from the original word.
            @current_word = @original_word
            @removals = []
          end

          remove_suffixes
          return if @dictionary.contains?(@current_word)

          remove_prefixes
          return if @dictionary.contains?(@current_word)

          loop_last_return
        end

        ##
        # ECS loop last return.
        #
        # Restores the prefix, then walks the recorded suffix removals from
        # the most recent one backwards. For each it puts the suffix back
        # (for 'kan' it first tries putting back only the 'k') and retries
        # prefix removal, stopping as soon as the dictionary contains the
        # current word.
        def loop_last_return
          restore_prefix

          # Work on copies: remove_prefixes pushes onto @removals, and the
          # snapshot must stay untouched so it can be restored after every
          # failed attempt.
          removals = @removals.dup
          current_word = @current_word

          removals.reverse_each do |removal|
            next unless suffix_removal?(removal)

            if removal.removed_part == 'kan'
              @current_word = "#{removal.result}k"

              remove_prefixes
              return if @dictionary.contains?(@current_word)

              @current_word = "#{removal.result}kan"
            else
              @current_word = removal.subject
            end

            remove_prefixes
            return if @dictionary.contains?(@current_word)

            # Attempt failed: roll back to the snapshot taken above.
            @removals = removals.dup
            @current_word = current_word
          end
        end

        ##
        # Apply the prefix visitors up to three times, returning as soon as
        # the dictionary contains the current word.
        def remove_prefixes
          3.times do
            accept_prefix_visitors(@prefix_visitors)

            return if @dictionary.contains?(@current_word)
          end
        end

        ##
        # Apply the suffix visitors once.
        def remove_suffixes
          accept_visitors(@suffix_visitors)
        end

        ##
        # Let a single visitor operate on this context.
        def accept(visitor)
          visitor.visit(self)
        end

        ##
        # Apply each visitor in order. Returns the current word early when
        # the dictionary contains it or the process has been stopped.
        def accept_visitors(visitors)
          visitors.each do |visitor|
            accept(visitor)

            return @current_word if @dictionary.contains?(@current_word)

            return @current_word if @process_is_stopped
          end
        end

        ##
        # Same as accept_visitors, but additionally returns as soon as one
        # visitor has recorded a removal, so at most one prefix is removed
        # per call.
        def accept_prefix_visitors(visitors)
          removal_length = @removals.length

          visitors.each do |visitor|
            accept(visitor)

            return @current_word if @dictionary.contains?(@current_word)

            return @current_word if @process_is_stopped

            return if @removals.length > removal_length
          end
        end

        ##
        # Check whether the removed part is a suffix
        # (affix type 'DS', 'PP' or 'P').
        def suffix_removal?(removal)
          %w[DS PP P].include?(removal.affix_type)
        end

        ##
        # Restore prefix to proceed with ECS loop last return.
        #
        # Sets the current word back to the subject of the first recorded
        # 'DP' removal (if any) and drops every 'DP' removal from the list.
        def restore_prefix
          first_prefix_removal = @removals.find { |removal| removal.affix_type == 'DP' }
          @current_word = first_prefix_removal.subject if first_prefix_removal

          # delete_if removes every match in one pass; deleting inside #each
          # skips the element that follows each deleted one.
          @removals.delete_if { |removal| removal.affix_type == 'DP' }
        end

        private

        ##
        # Confix-stripping path: remove a prefix first, then try the
        # suffixes from that intermediate state. Repeats up to three times
        # and returns true once the dictionary contains the current word,
        # false otherwise.
        def strip_prefixes_before_suffixes
          3.times do
            removals_before = @removals.length
            accept_prefix_visitors(@prefix_visitors)

            break if @removals.length == removals_before # no prefix removed

            return true if @dictionary.contains?(@current_word)

            # Try suffix removal from this intermediate state
            saved_word = @current_word
            saved_removals = @removals.dup
            remove_suffixes

            return true if @dictionary.contains?(@current_word)

            # Suffix didn't help, restore and try more prefix iterations
            @current_word = saved_word
            @removals = saved_removals
          end

          false
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
module Sastrawi
  module Stemmer
    module Context
      ##
      # Read-only record of one affix removal: which visitor performed it,
      # the word before (subject) and after (result), the part that was
      # removed, and the affix type tag given by the visitor.
      class Removal
        attr_reader :visitor
        attr_reader :subject
        attr_reader :result
        attr_reader :removed_part
        attr_reader :affix_type

        def initialize(visitor, subject, result, removed_part, affix_type)
          @visitor, @subject, @result = visitor, subject, result
          @removed_part, @affix_type = removed_part, affix_type
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
module Sastrawi
  module Stemmer
    module Context
      module Visitor
        ##
        # Visitor that stops the stemming process when the context's
        # current word is three characters long or shorter.
        class DontStemShortWord
          def visit(context)
            return unless short_word?(context.current_word)

            context.stop_process
          end

          ##
          # True when the word has at most three characters.
          def short_word?(word)
            word.length < 4
          end
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
module Sastrawi
  module Stemmer
    module Context
      module Visitor
        ##
        # Visitor that asks a list of disambiguation rules for a candidate
        # stem of the current word. A candidate found in the context's
        # dictionary wins; otherwise the first non-nil candidate is used.
        # The chosen candidate is recorded as a 'DP' removal and becomes
        # the context's current word.
        class PrefixDisambiguator
          attr_reader :disambiguators

          def initialize(disambiguators = [])
            @disambiguators = []

            add_disambiguators(disambiguators)
          end

          def visit(context)
            stem = choose_candidate(context)
            return if stem.nil?

            word = context.current_word
            removed_part = word.sub(/#{Regexp.quote(stem)}/, '')

            context.add_removal(Removal.new(self, word, stem, removed_part, 'DP'))
            context.current_word = stem
          end

          ##
          # Register every rule of the given list, in order.
          def add_disambiguators(disambiguators)
            disambiguators.each { |rule| add_disambiguator(rule) }
          end

          ##
          # Register a single rule at the end of the list.
          def add_disambiguator(disambiguator)
            @disambiguators.push(disambiguator)
          end

          private

          ##
          # Walk the rules in order: return the first candidate the
          # dictionary contains, else the first non-nil candidate, else nil.
          def choose_candidate(context)
            fallback = nil

            @disambiguators.each do |rule|
              candidate = rule.disambiguate(context.current_word)
              next if candidate.nil?

              fallback ||= candidate
              return candidate if context.dictionary.contains?(candidate)
            end

            fallback
          end
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
require 'sastrawi/stemmer/context/removal'
|
|
2
|
+
|
|
3
|
+
##
|
|
4
|
+
# Remove derivational suffix
|
|
5
|
+
# Asian J. (2007) "Effective Techniques for Indonesian Text Retrieval" page 61
|
|
6
|
+
# http://researchbank.rmit.edu.au/eserv/rmit:6312/Asian.pdf
|
|
7
|
+
|
|
8
|
+
module Sastrawi
  module Stemmer
    module Context
      module Visitor
        ##
        # Visitor that strips a derivational suffix from the current word
        # and records the change as a 'DS' removal.
        class RemoveDerivationalSuffix
          def visit(context)
            word = context.current_word
            stem = remove_suffix(word)
            return if stem == word

            suffix = word.sub(/#{Regexp.quote(stem)}/, '')

            context.add_removal(
              Sastrawi::Stemmer::Context::Removal.new(self, word, stem, suffix, 'DS')
            )
            context.current_word = stem
          end

          ##
          # Original rule: i|kan|an
          # Added the adopted foreign suffix rule: is|isme|isasi
          #
          # Returns the word without its trailing suffix, or the word
          # unchanged when no suffix matches.
          def remove_suffix(word)
            word.sub(/(is|isme|isasi|i|kan|an)$/, '')
          end
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
##
|
|
2
|
+
# Remove inflectional particle
|
|
3
|
+
# Asian J. (2007) "Effective Techniques for Indonesian Text Retrieval" page 60
|
|
4
|
+
# http://researchbank.rmit.edu.au/eserv/rmit:6312/Asian.pdf
|
|
5
|
+
|
|
6
|
+
module Sastrawi
  module Stemmer
    module Context
      module Visitor
        ##
        # Visitor that strips a trailing inflectional particle from the
        # current word and records the change as a 'P' removal.
        class RemoveInflectionalParticle
          def visit(context)
            word = context.current_word
            stem = remove(word)
            return if stem == word

            particle = word.sub(/#{Regexp.quote(stem)}/, '')

            context.add_removal(Removal.new(self, word, stem, particle, 'P'))
            context.current_word = stem
          end

          ##
          # Remove inflectional particle: lah|kah|tah|pun
          # (together with any hyphens directly in front of it).
          def remove(word)
            word.sub(/-*(lah|kah|tah|pun)$/, '')
          end
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
##
|
|
2
|
+
# Remove inflectional possessive pronoun
|
|
3
|
+
# Asian J. (2007) "Effective Techniques for Indonesian Text Retrieval" page 60
|
|
4
|
+
# http://researchbank.rmit.edu.au/eserv/rmit:6312/Asian.pdf
|
|
5
|
+
|
|
6
|
+
module Sastrawi
  module Stemmer
    module Context
      module Visitor
        ##
        # Visitor that strips a trailing possessive pronoun from the
        # current word and records the change as a 'PP' removal.
        class RemoveInflectionalPossessivePronoun
          def visit(context)
            word = context.current_word
            stem = remove(word)
            return if stem == word

            pronoun = word.sub(/#{Regexp.quote(stem)}/, '')

            context.add_removal(Removal.new(self, word, stem, pronoun, 'PP'))
            context.current_word = stem
          end

          ##
          # Remove inflectional possessive pronoun: ku|mu|nya
          # (together with any hyphens directly in front of it).
          def remove(word)
            word.sub(/-*(ku|mu|nya)$/, '')
          end
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
##
|
|
2
|
+
# Remove plain prefix
|
|
3
|
+
# Asian J. (2007) "Effective Techniques for Indonesian Text Retrieval" page 61
|
|
4
|
+
# http://researchbank.rmit.edu.au/eserv/rmit:6312/Asian.pdf
|
|
5
|
+
|
|
6
|
+
module Sastrawi
  module Stemmer
    module Context
      module Visitor
        ##
        # Visitor that strips a leading plain prefix from the current word
        # and records the change as a 'DP' removal.
        class RemovePlainPrefix
          def visit(context)
            word = context.current_word
            stem = remove(word)
            return if stem == word

            prefix = word.sub(/#{Regexp.quote(stem)}/, '')

            context.add_removal(Removal.new(self, word, stem, prefix, 'DP'))
            context.current_word = stem
          end

          ##
          # Remove plain prefix: di|ke|se
          def remove(word)
            word.sub(/^(di|ke|se)/, '')
          end
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,157 @@
|
|
|
1
|
+
require 'sastrawi/stemmer/context/visitor/dont_stem_short_word'
|
|
2
|
+
require 'sastrawi/stemmer/context/visitor/remove_inflectional_particle'
|
|
3
|
+
require 'sastrawi/stemmer/context/visitor/remove_inflectional_possessive_pronoun'
|
|
4
|
+
require 'sastrawi/stemmer/context/visitor/remove_derivational_suffix'
|
|
5
|
+
require 'sastrawi/stemmer/context/visitor/remove_plain_prefix'
|
|
6
|
+
require 'sastrawi/stemmer/context/visitor/prefix_disambiguator'
|
|
7
|
+
|
|
8
|
+
require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule1a'
|
|
9
|
+
require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule1b'
|
|
10
|
+
require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule2'
|
|
11
|
+
require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule3'
|
|
12
|
+
require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule4'
|
|
13
|
+
require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule5'
|
|
14
|
+
require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule6a'
|
|
15
|
+
require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule6b'
|
|
16
|
+
require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule7'
|
|
17
|
+
require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule8'
|
|
18
|
+
require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule9'
|
|
19
|
+
require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule10'
|
|
20
|
+
require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule11'
|
|
21
|
+
require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule12'
|
|
22
|
+
require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule13a'
|
|
23
|
+
require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule13b'
|
|
24
|
+
require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule14'
|
|
25
|
+
require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule15a'
|
|
26
|
+
require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule15b'
|
|
27
|
+
require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule16'
|
|
28
|
+
require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule17a'
|
|
29
|
+
require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule17b'
|
|
30
|
+
require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule17c'
|
|
31
|
+
require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule17d'
|
|
32
|
+
require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule18a'
|
|
33
|
+
require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule18b'
|
|
34
|
+
require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule19'
|
|
35
|
+
require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule20'
|
|
36
|
+
require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule21a'
|
|
37
|
+
require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule21b'
|
|
38
|
+
require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule23'
|
|
39
|
+
require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule24'
|
|
40
|
+
require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule25'
|
|
41
|
+
require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule26a'
|
|
42
|
+
require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule26b'
|
|
43
|
+
require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule27'
|
|
44
|
+
require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule28a'
|
|
45
|
+
require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule28b'
|
|
46
|
+
require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule29'
|
|
47
|
+
require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule30a'
|
|
48
|
+
require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule30b'
|
|
49
|
+
require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule30c'
|
|
50
|
+
require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule31a'
|
|
51
|
+
require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule31b'
|
|
52
|
+
require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule32'
|
|
53
|
+
require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule34'
|
|
54
|
+
require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule35'
|
|
55
|
+
require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule36'
|
|
56
|
+
require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule37a'
|
|
57
|
+
require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule37b'
|
|
58
|
+
require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule38a'
|
|
59
|
+
require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule38b'
|
|
60
|
+
require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule39a'
|
|
61
|
+
require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule39b'
|
|
62
|
+
require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule40a'
|
|
63
|
+
require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule40b'
|
|
64
|
+
require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule41'
|
|
65
|
+
require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule42'
|
|
66
|
+
|
|
67
|
+
module Sastrawi
  module Stemmer
    module Context
      module Visitor
        ##
        # Builds the three visitor lists consumed by the stemming context:
        # general visitors, suffix visitors and prefix visitors.
        class VisitorProvider
          # Rule-name suffixes of the DisambiguatorPrefixRule* classes.
          # Each inner list becomes one PrefixDisambiguator, in this order.
          PREFIX_RULE_GROUPS = [
            %w[1a 1b], %w[2], %w[3], %w[4], %w[5], %w[6a 6b],
            %w[7], %w[8], %w[9], %w[10], %w[11], %w[12], %w[13a 13b],
            %w[14], %w[15a 15b], %w[16],
            %w[17a 17b 17c 17d],
            %w[18a 18b], %w[19], %w[20], %w[21a 21b], %w[23], %w[24], %w[25],
            %w[26a 26b],
            %w[27],
            %w[28a 28b],
            %w[29],
            %w[30a 30b 30c],
            %w[31a 31b], %w[32], %w[34],
            %w[35], %w[36],
            %w[37a 37b],
            %w[38a 38b],
            %w[39a 39b],
            %w[40a 40b],
            %w[41], %w[42]
          ].freeze

          attr_reader :visitors, :suffix_visitors, :prefix_visitors

          def initialize
            @visitors = []
            @suffix_visitors = []
            @prefix_visitors = []

            init_visitors
          end

          ##
          # Append the default visitors to the three lists: the short-word
          # guard, the three suffix removers, then the plain-prefix remover
          # followed by one PrefixDisambiguator per rule group.
          def init_visitors
            @visitors << DontStemShortWord.new

            @suffix_visitors << RemoveInflectionalParticle.new
            @suffix_visitors << RemoveInflectionalPossessivePronoun.new
            @suffix_visitors << RemoveDerivationalSuffix.new

            @prefix_visitors << RemovePlainPrefix.new

            PREFIX_RULE_GROUPS.each do |rule_names|
              rules = rule_names.map do |rule_name|
                Sastrawi::Morphology::Disambiguator.const_get("DisambiguatorPrefixRule#{rule_name}").new
              end

              @prefix_visitors << PrefixDisambiguator.new(rules)
            end
          end
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
module Sastrawi
  module Stemmer
    module Filter
      ##
      # Text clean-up applied before stemming.
      class TextNormalizer
        ##
        # Lowercase the text, replace every character other than a-z, 0-9,
        # space and hyphen with a space, collapse runs of spaces into one,
        # and strip leading/trailing whitespace.
        def self.normalize_text(text)
          text.downcase
              .gsub(/[^a-z0-9 -]/im, ' ')
              .gsub(/( +)/im, ' ')
              .strip
        end
      end
    end
  end
end
|