sastrawi 0.1.0.pre
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +50 -0
- data/.travis.yml +8 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +21 -0
- data/README.md +70 -0
- data/Rakefile +6 -0
- data/data/kata-dasar.txt +29932 -0
- data/lib/sastrawi/dictionary/array_dictionary.rb +33 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule10.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule11.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule12.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule13a.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule13b.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule14.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule15a.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule15b.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule16.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule17a.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule17b.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule17c.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule17d.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule18a.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule18b.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule19.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule1a.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule1b.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule2.rb +19 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule20.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule21a.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule21b.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule23.rb +19 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule24.rb +19 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule25.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule26a.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule26b.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule27.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule28a.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule28b.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule29.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule3.rb +19 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule30a.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule30b.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule30c.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule31a.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule31b.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule32.rb +19 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule34.rb +19 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule35.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule36.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule37a.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule37b.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule38a.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule38b.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule39a.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule39b.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule4.rb +11 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule40a.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule40b.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule41.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule42.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule5.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule6a.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule6b.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule7.rb +19 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule8.rb +19 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule9.rb +19 -0
- data/lib/sastrawi/morphology/invalid_affix_pair_specification.rb +24 -0
- data/lib/sastrawi/stemmer/cache/array_cache.rb +25 -0
- data/lib/sastrawi/stemmer/cached_stemmer.rb +33 -0
- data/lib/sastrawi/stemmer/confix_stripping/precedence_adjustment_specification.rb +20 -0
- data/lib/sastrawi/stemmer/context/context.rb +170 -0
- data/lib/sastrawi/stemmer/context/removal.rb +17 -0
- data/lib/sastrawi/stemmer/context/visitor/dont_stem_short_word.rb +17 -0
- data/lib/sastrawi/stemmer/context/visitor/prefix_disambiguator.rb +46 -0
- data/lib/sastrawi/stemmer/context/visitor/remove_derivational_suffix.rb +28 -0
- data/lib/sastrawi/stemmer/context/visitor/remove_inflectional_particle.rb +26 -0
- data/lib/sastrawi/stemmer/context/visitor/remove_inflectional_possessive_pronoun.rb +26 -0
- data/lib/sastrawi/stemmer/context/visitor/remove_plain_prefix.rb +26 -0
- data/lib/sastrawi/stemmer/context/visitor/visitor_provider.rb +157 -0
- data/lib/sastrawi/stemmer/filter/text_normalizer.rb +15 -0
- data/lib/sastrawi/stemmer/stemmer.rb +85 -0
- data/lib/sastrawi/stemmer/stemmer_factory.rb +45 -0
- data/lib/sastrawi/stop_word_remover/stop_word_remover.rb +24 -0
- data/lib/sastrawi/stop_word_remover/stop_word_remover_factory.rb +152 -0
- data/lib/sastrawi/version.rb +3 -0
- data/lib/sastrawi.rb +12 -0
- data/sastrawi.gemspec +25 -0
- metadata +173 -0
@@ -0,0 +1,170 @@
|
|
1
|
+
require 'sastrawi/stemmer/confix_stripping/precedence_adjustment_specification'
|
2
|
+
|
3
|
+
module Sastrawi
  module Stemmer
    module Context
      # State of a single word while it is being stemmed.
      #
      # Holds the word as given, the word as currently reduced and the list of
      # removals applied so far, and runs the visitors obtained from the
      # visitor provider until the current word is found in the dictionary.
      # After #execute, #result is the dictionary word that was reached, or the
      # original word when none was.
      #
      # Removals are tagged by the visitors with an affix type string: 'DP' for
      # prefix removals, and 'DS', 'PP' or 'P' for suffix removals.
      class Context
        attr_accessor :original_word, :current_word, :dictionary, :visitor_provider,
                      :process_is_stopped, :removals, :visitors, :suffix_visitors,
                      :prefix_visitors, :result

        # @param original_word [String] the word to stem
        # @param dictionary [#contains?] lookup used to recognise root words
        # @param visitor_provider [#visitors, #suffix_visitors, #prefix_visitors]
        def initialize(original_word, dictionary, visitor_provider)
          @original_word = original_word
          @current_word = original_word
          @dictionary = dictionary
          @visitor_provider = visitor_provider

          @process_is_stopped = false
          @removals = []
          @visitors = []
          @suffix_visitors = []
          @prefix_visitors = []
          @result = ''

          init_visitors
        end

        # Copies the three visitor lists from the visitor provider.
        def init_visitors
          @visitors = @visitor_provider.visitors
          @suffix_visitors = @visitor_provider.suffix_visitors
          @prefix_visitors = @visitor_provider.prefix_visitors
        end

        # Flags the process as stopped; the accept_* loops return as soon as
        # they see the flag.
        def stop_process
          @process_is_stopped = true
        end

        # Records a removal performed by a visitor.
        def add_removal(removal)
          @removals.push(removal)
        end

        # Runs the stemming process and stores the outcome in #result: the
        # current word when the dictionary contains it, otherwise the original
        # word unchanged.
        def execute
          start_stemming_process

          @result = @dictionary.contains?(@current_word) ? @current_word : @original_word
        end

        # Tries, in order, until the current word is in the dictionary:
        # the general visitors; prefixes-then-suffixes when the precedence
        # adjustment specification is satisfied by the original word;
        # suffixes-then-prefixes; and finally the suffix restoration loop.
        def start_stemming_process
          return if @dictionary.contains?(@current_word)

          accept_visitors(@visitors)

          return if @dictionary.contains?(@current_word)

          cs_precedence_adjustment_specification = Sastrawi::Stemmer::ConfixStripping::PrecedenceAdjustmentSpecification.new

          if cs_precedence_adjustment_specification.satisfied_by?(@original_word)
            remove_prefixes
            return if @dictionary.contains?(@current_word)

            remove_suffixes
            return if @dictionary.contains?(@current_word)

            # The prefix-first trial failed: start over from the original word
            # and fall through to the normal suffix-first order.
            @current_word = @original_word
            @removals = []
          end

          remove_suffixes
          return if @dictionary.contains?(@current_word)

          remove_prefixes
          return if @dictionary.contains?(@current_word)

          loop_last_return
        end

        # Walks the suffix removals from the most recent to the oldest, puts
        # each one back and retries prefix removal. For a removed 'kan' it
        # first retries with only 'k' restored, then with the whole 'kan'.
        # Returns as soon as the current word is in the dictionary; otherwise
        # the removals and current word are reset before the next attempt.
        def loop_last_return
          restore_prefix

          # Snapshot by value: remove_prefixes appends to @removals, so holding
          # a reference to the same array would make the reset below a no-op.
          saved_removals = @removals.dup
          saved_word = @current_word

          saved_removals.reverse_each do |removal|
            next unless suffix_removal?(removal)

            if removal.removed_part == 'kan'
              # `+` builds a new string; `<<` would mutate removal.result and
              # corrupt the 'kan' retry below.
              @current_word = removal.result + 'k'

              remove_prefixes
              return if @dictionary.contains?(@current_word)

              @current_word = removal.result + 'kan'
            else
              @current_word = removal.subject
            end

            remove_prefixes
            return if @dictionary.contains?(@current_word)

            @removals = saved_removals.dup
            @current_word = saved_word
          end
        end

        # Runs the prefix visitors up to three times, stopping early once the
        # current word is in the dictionary.
        def remove_prefixes
          3.times do
            accept_prefix_visitors(@prefix_visitors)

            return if @dictionary.contains?(@current_word)
          end
        end

        # Runs the suffix visitors once.
        def remove_suffixes
          accept_visitors(@suffix_visitors)
        end

        # Lets a single visitor act on this context.
        def accept(visitor)
          visitor.visit(self)
        end

        # Applies the visitors in order, returning the current word as soon as
        # it is in the dictionary or the process has been stopped.
        def accept_visitors(visitors)
          visitors.each do |visitor|
            accept(visitor)

            return @current_word if @dictionary.contains?(@current_word)

            return @current_word if @process_is_stopped
          end
        end

        # Like #accept_visitors, but also returns after the first visitor that
        # recorded a removal, so one call strips at most one prefix.
        def accept_prefix_visitors(visitors)
          removal_length = @removals.length

          visitors.each do |visitor|
            accept(visitor)

            return @current_word if @dictionary.contains?(@current_word)

            return @current_word if @process_is_stopped

            return if @removals.length > removal_length
          end
        end

        # True when the removal carries one of the suffix affix types.
        def suffix_removal?(removal)
          %w[DS PP P].include?(removal.affix_type)
        end

        # Rewinds the current word to the subject of the first 'DP' removal
        # (the word as it was before any prefix was stripped) and drops every
        # 'DP' removal from the list.
        def restore_prefix
          first_prefix_removal = @removals.find { |removal| removal.affix_type == 'DP' }
          @current_word = first_prefix_removal.subject if first_prefix_removal

          # reject! visits every element; deleting inside #each skips the
          # element that follows each deletion.
          @removals.reject! { |removal| removal.affix_type == 'DP' }
        end
      end
    end
  end
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
module Sastrawi
  module Stemmer
    module Context
      # Record of one affix stripped from a word.
      #
      # visitor      - the visitor that performed the removal
      # subject      - the word before the removal
      # result       - the word after the removal
      # removed_part - the text that was stripped
      # affix_type   - tag given by the visitor (e.g. 'DP', 'DS', 'PP', 'P')
      class Removal
        attr_accessor :visitor, :subject, :result, :removed_part, :affix_type

        # Stores the five values as given; nothing is validated or copied.
        def initialize(visitor, subject, result, removed_part, affix_type)
          @visitor, @subject, @result = visitor, subject, result
          @removed_part, @affix_type = removed_part, affix_type
        end
      end
    end
  end
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
module Sastrawi
  module Stemmer
    module Context
      module Visitor
        # Visitor that stops the stemming process for short words.
        class DontStemShortWord
          # Calls context.stop_process when the context's current word is
          # short; does nothing otherwise.
          def visit(context)
            return unless short_word?(context.current_word)

            context.stop_process
          end

          # A word counts as short when it has at most three characters.
          def short_word?(word)
            word.length < 4
          end
        end
      end
    end
  end
end
|
@@ -0,0 +1,46 @@
|
|
1
|
+
module Sastrawi
  module Stemmer
    module Context
      module Visitor
        # Visitor that applies a group of prefix disambiguation rules to the
        # context's current word and records the outcome as a 'DP' removal.
        class PrefixDisambiguator
          attr_accessor :disambiguators

          # @param disambiguators [Array<#disambiguate>] rules to try, in order
          def initialize(disambiguators = [])
            @disambiguators = []

            add_disambiguators(disambiguators)
          end

          # Asks each rule in turn to disambiguate the current word, stopping
          # at the first output the dictionary contains. The output of the last
          # rule consulted is then used: when it is nil nothing happens,
          # otherwise a removal is added and it becomes the current word.
          def visit(context)
            candidate = nil

            @disambiguators.each do |rule|
              candidate = rule.disambiguate(context.current_word)

              break if context.dictionary.contains?(candidate)
            end

            return if candidate.nil?

            stripped = context.current_word.sub(candidate, '')

            context.add_removal(Removal.new(self, context.current_word, candidate, stripped, 'DP'))
            context.current_word = candidate
          end

          # Appends every rule of the given collection.
          def add_disambiguators(disambiguators)
            disambiguators.each { |rule| add_disambiguator(rule) }
          end

          # Appends a single rule.
          def add_disambiguator(disambiguator)
            @disambiguators << disambiguator
          end
        end
      end
    end
  end
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
require 'sastrawi/stemmer/context/removal'
|
2
|
+
|
3
|
+
module Sastrawi
  module Stemmer
    module Context
      module Visitor
        # Visitor that strips one trailing derivational suffix and records it
        # as a 'DS' removal.
        class RemoveDerivationalSuffix
          # Strips the suffix from the context's current word. When that
          # changes the word, a removal is added to the context and the
          # stripped word becomes the current word.
          def visit(context)
            stripped = remove_suffix(context.current_word)
            return if stripped == context.current_word

            suffix = context.current_word.sub(stripped, '')

            context.add_removal(
              Sastrawi::Stemmer::Context::Removal.new(self, context.current_word, stripped, suffix, 'DS')
            )
            context.current_word = stripped
          end

          # Returns the word without its trailing is/isme/isasi/i/kan/an, or
          # an equal string when none of them ends the word.
          def remove_suffix(word)
            word.sub(/(is|isme|isasi|i|kan|an)$/, '')
          end
        end
      end
    end
  end
end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
module Sastrawi
  module Stemmer
    module Context
      module Visitor
        # Visitor that strips one trailing particle (lah, kah, tah, pun) and
        # records it as a 'P' removal.
        class RemoveInflectionalParticle
          # Strips the particle from the context's current word. When that
          # changes the word, a removal is added to the context and the
          # stripped word becomes the current word.
          def visit(context)
            stripped = remove(context.current_word)
            return if stripped == context.current_word

            particle = context.current_word.sub(stripped, '')

            context.add_removal(Removal.new(self, context.current_word, stripped, particle, 'P'))
            context.current_word = stripped
          end

          # Returns the word without its trailing particle; any hyphens
          # directly before the particle are dropped with it.
          def remove(word)
            word.sub(/-*(lah|kah|tah|pun)$/, '')
          end
        end
      end
    end
  end
end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
module Sastrawi
  module Stemmer
    module Context
      module Visitor
        # Visitor that strips one trailing possessive pronoun (ku, mu, nya)
        # and records it as a 'PP' removal.
        class RemoveInflectionalPossessivePronoun
          # Strips the pronoun from the context's current word. When that
          # changes the word, a removal is added to the context and the
          # stripped word becomes the current word.
          def visit(context)
            stripped = remove(context.current_word)
            return if stripped == context.current_word

            pronoun = context.current_word.sub(stripped, '')

            context.add_removal(Removal.new(self, context.current_word, stripped, pronoun, 'PP'))
            context.current_word = stripped
          end

          # Returns the word without its trailing pronoun; any hyphens
          # directly before the pronoun are dropped with it.
          def remove(word)
            word.sub(/-*(ku|mu|nya)$/, '')
          end
        end
      end
    end
  end
end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
module Sastrawi
  module Stemmer
    module Context
      module Visitor
        # Visitor that strips one leading plain prefix (di, ke, se) and
        # records it as a 'DP' removal.
        class RemovePlainPrefix
          # Strips the prefix from the context's current word. When that
          # changes the word, a removal is added to the context and the
          # stripped word becomes the current word.
          def visit(context)
            stripped = remove(context.current_word)
            return if stripped == context.current_word

            prefix = context.current_word.sub(stripped, '')

            context.add_removal(Removal.new(self, context.current_word, stripped, prefix, 'DP'))
            context.current_word = stripped
          end

          # Returns the word without its leading di/ke/se, or an equal string
          # when the word starts with none of them.
          def remove(word)
            word.sub(/^(di|ke|se)/, '')
          end
        end
      end
    end
  end
end
|
@@ -0,0 +1,157 @@
|
|
1
|
+
require 'sastrawi/stemmer/context/visitor/dont_stem_short_word'
|
2
|
+
require 'sastrawi/stemmer/context/visitor/remove_inflectional_particle'
|
3
|
+
require 'sastrawi/stemmer/context/visitor/remove_inflectional_possessive_pronoun'
|
4
|
+
require 'sastrawi/stemmer/context/visitor/remove_derivational_suffix'
|
5
|
+
require 'sastrawi/stemmer/context/visitor/remove_plain_prefix'
|
6
|
+
require 'sastrawi/stemmer/context/visitor/prefix_disambiguator'
|
7
|
+
|
8
|
+
require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule1a'
|
9
|
+
require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule1b'
|
10
|
+
require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule2'
|
11
|
+
require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule3'
|
12
|
+
require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule4'
|
13
|
+
require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule5'
|
14
|
+
require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule6a'
|
15
|
+
require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule6b'
|
16
|
+
require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule7'
|
17
|
+
require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule8'
|
18
|
+
require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule9'
|
19
|
+
require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule10'
|
20
|
+
require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule11'
|
21
|
+
require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule12'
|
22
|
+
require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule13a'
|
23
|
+
require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule13b'
|
24
|
+
require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule14'
|
25
|
+
require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule15a'
|
26
|
+
require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule15b'
|
27
|
+
require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule16'
|
28
|
+
require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule17a'
|
29
|
+
require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule17b'
|
30
|
+
require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule17c'
|
31
|
+
require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule17d'
|
32
|
+
require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule18a'
|
33
|
+
require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule18b'
|
34
|
+
require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule19'
|
35
|
+
require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule20'
|
36
|
+
require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule21a'
|
37
|
+
require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule21b'
|
38
|
+
require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule23'
|
39
|
+
require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule24'
|
40
|
+
require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule25'
|
41
|
+
require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule26a'
|
42
|
+
require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule26b'
|
43
|
+
require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule27'
|
44
|
+
require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule28a'
|
45
|
+
require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule28b'
|
46
|
+
require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule29'
|
47
|
+
require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule30a'
|
48
|
+
require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule30b'
|
49
|
+
require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule30c'
|
50
|
+
require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule31a'
|
51
|
+
require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule31b'
|
52
|
+
require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule32'
|
53
|
+
require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule34'
|
54
|
+
require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule35'
|
55
|
+
require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule36'
|
56
|
+
require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule37a'
|
57
|
+
require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule37b'
|
58
|
+
require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule38a'
|
59
|
+
require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule38b'
|
60
|
+
require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule39a'
|
61
|
+
require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule39b'
|
62
|
+
require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule40a'
|
63
|
+
require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule40b'
|
64
|
+
require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule41'
|
65
|
+
require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule42'
|
66
|
+
|
67
|
+
module Sastrawi
  module Stemmer
    module Context
      module Visitor
        # Builds and exposes the three visitor lists used while stemming:
        # general visitors, suffix visitors and prefix visitors.
        class VisitorProvider
          attr_accessor :visitors, :suffix_visitors, :prefix_visitors

          # Starts with empty lists and fills them through #init_visitors.
          def initialize
            @visitors = []
            @suffix_visitors = []
            @prefix_visitors = []

            init_visitors
          end

          # Appends the default visitors to the three lists. The prefix list
          # gets the plain-prefix remover followed by one PrefixDisambiguator
          # per rule group below, in the listed order.
          def init_visitors
            @visitors << DontStemShortWord.new

            @suffix_visitors << RemoveInflectionalParticle.new
            @suffix_visitors << RemoveInflectionalPossessivePronoun.new
            @suffix_visitors << RemoveDerivationalSuffix.new

            @prefix_visitors << RemovePlainPrefix.new

            rules = Sastrawi::Morphology::Disambiguator

            # Each inner array is the rule group handed to one
            # PrefixDisambiguator.
            [
              [rules::DisambiguatorPrefixRule1a, rules::DisambiguatorPrefixRule1b],
              [rules::DisambiguatorPrefixRule2],
              [rules::DisambiguatorPrefixRule3],
              [rules::DisambiguatorPrefixRule4],
              [rules::DisambiguatorPrefixRule5],
              [rules::DisambiguatorPrefixRule6a, rules::DisambiguatorPrefixRule6b],
              [rules::DisambiguatorPrefixRule7],
              [rules::DisambiguatorPrefixRule8],
              [rules::DisambiguatorPrefixRule9],
              [rules::DisambiguatorPrefixRule10],
              [rules::DisambiguatorPrefixRule11],
              [rules::DisambiguatorPrefixRule12],
              [rules::DisambiguatorPrefixRule13a, rules::DisambiguatorPrefixRule13b],
              [rules::DisambiguatorPrefixRule14],
              [rules::DisambiguatorPrefixRule15a, rules::DisambiguatorPrefixRule15b],
              [rules::DisambiguatorPrefixRule16],
              [
                rules::DisambiguatorPrefixRule17a, rules::DisambiguatorPrefixRule17b,
                rules::DisambiguatorPrefixRule17c, rules::DisambiguatorPrefixRule17d
              ],
              [rules::DisambiguatorPrefixRule18a, rules::DisambiguatorPrefixRule18b],
              [rules::DisambiguatorPrefixRule19],
              [rules::DisambiguatorPrefixRule20],
              [rules::DisambiguatorPrefixRule21a, rules::DisambiguatorPrefixRule21b],
              [rules::DisambiguatorPrefixRule23],
              [rules::DisambiguatorPrefixRule24],
              [rules::DisambiguatorPrefixRule25],
              [rules::DisambiguatorPrefixRule26a, rules::DisambiguatorPrefixRule26b],
              [rules::DisambiguatorPrefixRule27],
              [rules::DisambiguatorPrefixRule28a, rules::DisambiguatorPrefixRule28b],
              [rules::DisambiguatorPrefixRule29],
              [
                rules::DisambiguatorPrefixRule30a, rules::DisambiguatorPrefixRule30b,
                rules::DisambiguatorPrefixRule30c
              ],
              [rules::DisambiguatorPrefixRule31a, rules::DisambiguatorPrefixRule31b],
              [rules::DisambiguatorPrefixRule32],
              [rules::DisambiguatorPrefixRule34],
              [rules::DisambiguatorPrefixRule35],
              [rules::DisambiguatorPrefixRule36],
              [rules::DisambiguatorPrefixRule37a, rules::DisambiguatorPrefixRule37b],
              [rules::DisambiguatorPrefixRule38a, rules::DisambiguatorPrefixRule38b],
              [rules::DisambiguatorPrefixRule39a, rules::DisambiguatorPrefixRule39b],
              [rules::DisambiguatorPrefixRule40a, rules::DisambiguatorPrefixRule40b],
              [rules::DisambiguatorPrefixRule41],
              [rules::DisambiguatorPrefixRule42]
            ].each do |rule_classes|
              @prefix_visitors << PrefixDisambiguator.new(rule_classes.map(&:new))
            end

            @prefix_visitors
          end
        end
      end
    end
  end
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
module Sastrawi
  module Stemmer
    module Filter
      # Text clean-up applied before stemming.
      class TextNormalizer
        # Lower-cases the text, replaces every character other than a-z, 0-9,
        # space and hyphen with a space, collapses runs of spaces into one and
        # strips leading and trailing whitespace.
        #
        # @param text [String]
        # @return [String] the normalized text
        def self.normalize_text(text)
          text.downcase
              .gsub(/[^a-z0-9 -]/im, ' ')
              .gsub(/( +)/im, ' ')
              .strip
        end
      end
    end
  end
end
|
@@ -0,0 +1,85 @@
|
|
1
|
+
require 'sastrawi/stemmer/context/context'
|
2
|
+
require 'sastrawi/stemmer/context/visitor/visitor_provider'
|
3
|
+
require 'sastrawi/stemmer/filter/text_normalizer'
|
4
|
+
|
5
|
+
module Sastrawi
  module Stemmer
    # Stems text word by word. Hyphenated (reduplicated) words are split and
    # each half is stemmed on its own; every other word is stemmed through a
    # Context built with this stemmer's dictionary and visitor provider.
    class Stemmer
      attr_accessor :dictionary, :visitor_provider

      # @param dictionary [#contains?] root-word lookup
      def initialize(dictionary)
        @dictionary = dictionary
        @visitor_provider = Sastrawi::Stemmer::Context::Visitor::VisitorProvider.new
      end

      # Normalizes the text, stems each space-separated word and joins the
      # stems with single spaces.
      #
      # @param text [String]
      # @return [String]
      def stem(text)
        normalized_text = Sastrawi::Stemmer::Filter::TextNormalizer.normalize_text(text)

        normalized_text.split(' ').map { |word| stem_word(word) }.join(' ')
      end

      # Stems one word, dispatching on whether it is a reduplicated form.
      def stem_word(word)
        plural?(word) ? stem_plural_word(word) : stem_singular_word(word)
      end

      # True when the word is a hyphenated reduplication.
      #
      # A trailing "-ku", "-mu", "-nya", "-lah", "-kah", "-tah" or "-pun" is
      # an attached pronoun/particle, not a second half, so such a word is
      # plural only when the part before that suffix still contains a hyphen
      # ("malaikat-malaikat-nya" is plural, "nikmat-ku" is not).
      def plural?(word)
        matches = /^(.*)-(ku|mu|nya|lah|kah|tah|pun)$/.match(word)

        return matches[1].include?('-') if matches

        word.include?('-')
      end

      # Stems a reduplicated word. Returns the shared root when both halves
      # stem to the same word, otherwise the word unchanged.
      def stem_plural_word(word)
        first_match = /^(.*)-(.*)$/.match(word)

        return word unless first_match

        words = first_match.captures

        suffix = words[1]
        suffixes = %w[ku mu nya lah kah tah pun]
        second_match = /^(.*)-(.*)$/.match(words[0])

        # "malaikat-malaikat-nya": the greedy first split gives
        # ["malaikat-malaikat", "nya"]; re-split the first part and hand the
        # suffix to the second half => ["malaikat", "malaikat-nya"].
        if suffixes.include?(suffix) && second_match
          words = second_match.captures
          words[1] = "#{words[1]}-#{suffix}"
        end

        root_first_word = stem_singular_word(words[0])
        root_second_word = stem_singular_word(words[1])

        # Retry with a "me" prefix only when the second half is not a root
        # word and stemming left it untouched ("meniru-nirukan": "nirukan"
        # is retried as "menirukan").
        if !@dictionary.contains?(words[1]) && root_second_word == words[1]
          root_second_word = stem_singular_word('me' + words[1])
        end

        root_first_word == root_second_word ? root_first_word : word
      end

      # Stems a single, non-reduplicated word through a fresh Context.
      def stem_singular_word(word)
        context = Sastrawi::Stemmer::Context::Context.new(word, @dictionary, @visitor_provider)
        context.execute

        context.result
      end
    end
  end
end
|
@@ -0,0 +1,45 @@
|
|
1
|
+
require 'sastrawi/dictionary/array_dictionary'
|
2
|
+
require 'sastrawi/stemmer/cached_stemmer'
|
3
|
+
require 'sastrawi/stemmer/stemmer'
|
4
|
+
require 'sastrawi/stemmer/cache/array_cache'
|
5
|
+
|
6
|
+
|
7
|
+
module Sastrawi
  module Stemmer
    # Assembles a ready-to-use stemmer backed by the bundled word list.
    class StemmerFactory
      # Builds a Stemmer over the default dictionary and wraps it, together
      # with a new ArrayCache, in a CachedStemmer.
      #
      # @param is_dev [Boolean] passed through to #create_default_dictionary
      # @return [Sastrawi::Stemmer::CachedStemmer]
      def create_stemmer(is_dev = false)
        stemmer = Sastrawi::Stemmer::Stemmer.new(create_default_dictionary(is_dev))

        Sastrawi::Stemmer::CachedStemmer.new(Sastrawi::Stemmer::Cache::ArrayCache.new, stemmer)
      end

      # Builds an ArrayDictionary from the words returned by #get_words.
      #
      # @param is_dev [Boolean] passed through to #get_words
      def create_default_dictionary(is_dev = false)
        Sastrawi::Dictionary::ArrayDictionary.new(get_words(is_dev))
      end

      # Returns the dictionary words. The is_dev argument is accepted but not
      # used; the words always come from the bundled file.
      def get_words(is_dev = false)
        get_words_from_file
      end

      # Reads data/kata-dasar.txt, located four path levels above this file,
      # and returns its lines with the line endings removed.
      #
      # @return [Array<String>]
      def get_words_from_file
        gem_root = File.expand_path('../../../..', __FILE__)
        word_list_path = File.join(gem_root, 'data/kata-dasar.txt')

        File.open(word_list_path, 'r') { |file| file.map(&:chomp) }
      end
    end
  end
end
|