sastrawi-ruby 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (93) hide show
  1. checksums.yaml +7 -0
  2. data/.github/workflows/ci.yml +23 -0
  3. data/.gitignore +51 -0
  4. data/.travis.yml +10 -0
  5. data/CONTRIBUTING.md +22 -0
  6. data/Gemfile +4 -0
  7. data/LICENSE.txt +21 -0
  8. data/README.md +104 -0
  9. data/Rakefile +6 -0
  10. data/_config.yml +1 -0
  11. data/bin/sastrawi +24 -0
  12. data/data/base-word.txt +29933 -0
  13. data/lib/sastrawi/dictionary/array_dictionary.rb +67 -0
  14. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule10.rb +17 -0
  15. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule11.rb +17 -0
  16. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule12.rb +17 -0
  17. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule13a.rb +17 -0
  18. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule13b.rb +17 -0
  19. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule14.rb +17 -0
  20. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule15a.rb +17 -0
  21. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule15b.rb +17 -0
  22. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule16.rb +17 -0
  23. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule17a.rb +17 -0
  24. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule17b.rb +17 -0
  25. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule17c.rb +17 -0
  26. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule17d.rb +17 -0
  27. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule18a.rb +17 -0
  28. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule18b.rb +17 -0
  29. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule19.rb +17 -0
  30. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule1a.rb +17 -0
  31. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule1b.rb +17 -0
  32. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule2.rb +19 -0
  33. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule20.rb +17 -0
  34. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule21a.rb +17 -0
  35. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule21b.rb +17 -0
  36. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule23.rb +19 -0
  37. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule24.rb +19 -0
  38. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule25.rb +17 -0
  39. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule26a.rb +17 -0
  40. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule26b.rb +17 -0
  41. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule27.rb +17 -0
  42. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule28a.rb +17 -0
  43. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule28b.rb +17 -0
  44. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule29.rb +17 -0
  45. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule3.rb +19 -0
  46. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule30a.rb +17 -0
  47. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule30b.rb +17 -0
  48. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule30c.rb +17 -0
  49. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule31a.rb +17 -0
  50. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule31b.rb +17 -0
  51. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule32.rb +19 -0
  52. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule34.rb +19 -0
  53. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule35.rb +17 -0
  54. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule36.rb +17 -0
  55. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule37a.rb +17 -0
  56. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule37b.rb +17 -0
  57. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule38a.rb +17 -0
  58. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule38b.rb +17 -0
  59. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule39a.rb +17 -0
  60. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule39b.rb +17 -0
  61. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule4.rb +11 -0
  62. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule40a.rb +17 -0
  63. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule40b.rb +17 -0
  64. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule41.rb +17 -0
  65. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule42.rb +17 -0
  66. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule5.rb +17 -0
  67. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule6a.rb +17 -0
  68. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule6b.rb +17 -0
  69. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule7.rb +19 -0
  70. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule8.rb +19 -0
  71. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule9.rb +19 -0
  72. data/lib/sastrawi/morphology/invalid_affix_pair_specification.rb +28 -0
  73. data/lib/sastrawi/stemmer/cache/array_cache.rb +25 -0
  74. data/lib/sastrawi/stemmer/cached_stemmer.rb +33 -0
  75. data/lib/sastrawi/stemmer/confix_stripping/precedence_adjustment_specification.rb +25 -0
  76. data/lib/sastrawi/stemmer/context/context.rb +217 -0
  77. data/lib/sastrawi/stemmer/context/removal.rb +17 -0
  78. data/lib/sastrawi/stemmer/context/visitor/dont_stem_short_word.rb +17 -0
  79. data/lib/sastrawi/stemmer/context/visitor/prefix_disambiguator.rb +54 -0
  80. data/lib/sastrawi/stemmer/context/visitor/remove_derivational_suffix.rb +37 -0
  81. data/lib/sastrawi/stemmer/context/visitor/remove_inflectional_particle.rb +34 -0
  82. data/lib/sastrawi/stemmer/context/visitor/remove_inflectional_possessive_pronoun.rb +34 -0
  83. data/lib/sastrawi/stemmer/context/visitor/remove_plain_prefix.rb +34 -0
  84. data/lib/sastrawi/stemmer/context/visitor/visitor_provider.rb +157 -0
  85. data/lib/sastrawi/stemmer/filter/text_normalizer.rb +15 -0
  86. data/lib/sastrawi/stemmer/stemmer.rb +101 -0
  87. data/lib/sastrawi/stemmer/stemmer_factory.rb +49 -0
  88. data/lib/sastrawi/stop_word_remover/stop_word_remover.rb +27 -0
  89. data/lib/sastrawi/stop_word_remover/stop_word_remover_factory.rb +124 -0
  90. data/lib/sastrawi/version.rb +5 -0
  91. data/lib/sastrawi.rb +4 -0
  92. data/sastrawi.gemspec +34 -0
  93. metadata +179 -0
@@ -0,0 +1,217 @@
1
+ require 'sastrawi/stemmer/confix_stripping/precedence_adjustment_specification'
2
+
3
+ ##
4
+ # Stemming context using Nazief and Adriani, Confix Stripping (CS),
5
+ # Enhanced Confix Stripping (ECS), and Improved ECS
6
+
7
module Sastrawi
  module Stemmer
    module Context
      ##
      # State and control flow for stemming a single word.
      #
      # The context tracks the word as affixes are stripped from it
      # (+current_word+), the list of removals performed so far (+removals+),
      # and the final +result+. Affix stripping itself is delegated to visitor
      # objects obtained from the visitor provider; each visitor receives this
      # context through +visit+.
      #
      # Affix type codes found on removals, as set by the visitors:
      # 'DP' (prefix), 'DS' (derivational suffix), 'PP' (possessive pronoun)
      # and 'P' (particle).
      class Context
        attr_reader :original_word, :dictionary, :visitor_provider, :visitors, :suffix_visitors, :prefix_visitors
        attr_accessor :current_word, :process_is_stopped, :removals, :result

        ##
        # original_word::   the word to stem
        # dictionary::      object responding to +contains?(word)+
        # visitor_provider:: object responding to +visitors+,
        #                    +suffix_visitors+ and +prefix_visitors+
        def initialize(original_word, dictionary, visitor_provider)
          @original_word = original_word
          @current_word = original_word
          @dictionary = dictionary
          @visitor_provider = visitor_provider

          @process_is_stopped = false
          @removals = []
          @visitors = nil
          @suffix_visitors = nil
          @prefix_visitors = nil
          @result = nil

          init_visitors
        end

        ##
        # Copy the three visitor lists from the provider.
        def init_visitors
          @visitors = @visitor_provider.visitors
          @suffix_visitors = @visitor_provider.suffix_visitors
          @prefix_visitors = @visitor_provider.prefix_visitors
        end

        ##
        # Flag the process as stopped; the accept_* loops return as soon as
        # they see this flag.
        def stop_process
          @process_is_stopped = true
        end

        ##
        # Record one affix removal.
        def add_removal(removal)
          @removals.push(removal)
        end

        ##
        # Execute stemming process.
        #
        # The result is the stripped word when it is a dictionary word,
        # otherwise the untouched original word.
        def execute
          start_stemming_process

          @result = @dictionary.contains?(@current_word) ? @current_word : @original_word
        end

        ##
        # Run the stripping steps in order, returning as soon as
        # +current_word+ is found in the dictionary:
        #
        # 1. general visitors
        # 2. confix stripping (prefix before suffix) when the precedence
        #    specification is satisfied by the original word
        # 3. suffix removal, then prefix removal
        # 4. ECS "loop last return"
        def start_stemming_process
          return if @dictionary.contains?(@current_word)

          accept_visitors(@visitors)

          return if @dictionary.contains?(@current_word)

          precedence_spec = Sastrawi::Stemmer::ConfixStripping::PrecedenceAdjustmentSpecification.new

          ##
          # Confix stripping
          # try to remove prefix before suffix if the specification is met

          if precedence_spec.satisfied_by?(@original_word)
            # Interleave prefix and suffix removal in CS path:
            # After each prefix removal round, try suffix removal to check
            # if the combination yields a dictionary word.
            cs_found = false
            3.times do
              prev_removals_len = @removals.length
              accept_prefix_visitors(@prefix_visitors)

              break if @removals.length == prev_removals_len # no prefix removed

              if @dictionary.contains?(@current_word)
                cs_found = true
                break
              end

              # Try suffix removal from this intermediate state
              saved_word = @current_word
              saved_removals = @removals.dup
              remove_suffixes

              if @dictionary.contains?(@current_word)
                cs_found = true
                break
              end

              # Suffix didn't help, restore and try more prefix iterations
              @current_word = saved_word
              @removals = saved_removals
            end

            return if cs_found

            # The CS path failed: start over from the original word.
            @current_word = @original_word
            @removals = []
          end

          remove_suffixes
          return if @dictionary.contains?(@current_word)

          remove_prefixes
          return if @dictionary.contains?(@current_word)

          loop_last_return
        end

        ##
        # ECS loop last return.
        #
        # Undo the prefix removals, then walk the suffix removals from last to
        # first. For each one, put that suffix back and retry prefix removal.
        # A removed 'kan' is first retried as a bare 'k' (the 'an' stays
        # removed) before the full 'kan' is restored.
        #
        # When an attempt fails, +removals+ and +current_word+ are reset to the
        # state they had right after the prefixes were restored.
        def loop_last_return
          restore_prefix

          # Snapshot by value: remove_prefixes pushes onto @removals, so an
          # alias of the same array could not be used to roll back.
          removals = @removals.dup
          current_word = @current_word

          removals.reverse_each do |removal|
            next unless suffix_removal?(removal)

            if removal.removed_part == 'kan'
              @current_word = "#{removal.result}k"

              remove_prefixes
              return if @dictionary.contains?(@current_word)

              @current_word = "#{removal.result}kan"
            else
              @current_word = removal.subject
            end

            remove_prefixes
            return if @dictionary.contains?(@current_word)

            @removals = removals.dup
            @current_word = current_word
          end
        end

        ##
        # Run up to three prefix-removal rounds, stopping early on a
        # dictionary hit.
        def remove_prefixes
          3.times do
            accept_prefix_visitors(@prefix_visitors)

            return if @dictionary.contains?(@current_word)
          end
        end

        ##
        # Run the suffix visitors once.
        def remove_suffixes
          accept_visitors(@suffix_visitors)
        end

        ##
        # Let a single visitor operate on this context.
        def accept(visitor)
          visitor.visit(self)
        end

        ##
        # Apply visitors in order. Returns +current_word+ early once it is a
        # dictionary word or the process has been stopped.
        def accept_visitors(visitors)
          visitors.each do |visitor|
            accept(visitor)

            return @current_word if @dictionary.contains?(@current_word)

            return @current_word if @process_is_stopped
          end
        end

        ##
        # Like accept_visitors, but additionally stops after the first visitor
        # that records a removal, so one call strips at most one prefix.
        def accept_prefix_visitors(visitors)
          removal_length = @removals.length

          visitors.each do |visitor|
            accept(visitor)

            return @current_word if @dictionary.contains?(@current_word)

            return @current_word if @process_is_stopped

            return if @removals.length > removal_length
          end
        end

        ##
        # Check whether the removed part is a suffix
        # (derivational suffix, possessive pronoun or particle).
        def suffix_removal?(removal)
          %w[DS PP P].include?(removal.affix_type)
        end

        ##
        # Restore prefix to proceed with ECS loop last return.
        #
        # +current_word+ goes back to the word as it was before the first
        # prefix removal, and every prefix ('DP') removal is dropped from
        # +removals+.
        def restore_prefix
          first_prefix_removal = @removals.find { |removal| removal.affix_type == 'DP' }
          @current_word = first_prefix_removal.subject if first_prefix_removal

          # delete_if filters in one pass; deleting from the array inside
          # Array#each skips the element that follows each deleted one.
          @removals.delete_if { |removal| removal.affix_type == 'DP' }
        end
      end
    end
  end
end
@@ -0,0 +1,17 @@
1
module Sastrawi
  module Stemmer
    module Context
      ##
      # Read-only record describing one affix removal.
      class Removal
        # The visitor object that performed the removal.
        attr_reader :visitor
        # The word before the removal.
        attr_reader :subject
        # The word after the removal.
        attr_reader :result
        # The text that was taken off the subject.
        attr_reader :removed_part
        # Affix type code supplied by the visitor (e.g. 'DP', 'DS', 'PP', 'P').
        attr_reader :affix_type

        def initialize(visitor, subject, result, removed_part, affix_type)
          @affix_type = affix_type
          @removed_part = removed_part
          @result = result
          @subject = subject
          @visitor = visitor
        end
      end
    end
  end
end
@@ -0,0 +1,17 @@
1
module Sastrawi
  module Stemmer
    module Context
      module Visitor
        ##
        # Visitor that stops the stemming process for short words.
        class DontStemShortWord
          ##
          # Ask the context to stop when its current word is short.
          def visit(context)
            return unless short_word?(context.current_word)

            context.stop_process
          end

          ##
          # A word counts as short when it has three characters or fewer.
          def short_word?(word)
            word.length < 4
          end
        end
      end
    end
  end
end
@@ -0,0 +1,54 @@
1
module Sastrawi
  module Stemmer
    module Context
      module Visitor
        ##
        # Prefix visitor backed by an ordered list of disambiguation rules.
        #
        # Each rule responds to +disambiguate(word)+ and returns either a
        # candidate word or nil.
        class PrefixDisambiguator
          attr_reader :disambiguators

          def initialize(disambiguators = [])
            @disambiguators = []

            add_disambiguators(disambiguators)
          end

          ##
          # Try the rules in order against the context's current word.
          #
          # The first candidate found in the dictionary wins; if none is, the
          # first non-nil candidate is used. When every rule returns nil the
          # context is left untouched. Otherwise a 'DP' removal is recorded and
          # the candidate becomes the context's current word.
          def visit(context)
            fallback = nil
            dictionary_hit = nil

            @disambiguators.each do |rule|
              candidate = rule.disambiguate(context.current_word)
              next if candidate.nil?

              fallback ||= candidate
              next unless context.dictionary.contains?(candidate)

              dictionary_hit = candidate
              break
            end

            chosen = dictionary_hit || fallback
            return if chosen.nil?

            word = context.current_word
            # What remains of the word once the first occurrence of the chosen
            # candidate is cut out of it.
            removed_part = word.sub(/#{Regexp.quote(chosen)}/, '')

            context.add_removal(Removal.new(self, word, chosen, removed_part, 'DP'))
            context.current_word = chosen
          end

          ##
          # Append several rules, one by one, preserving their order.
          def add_disambiguators(disambiguators)
            disambiguators.each { |rule| add_disambiguator(rule) }
          end

          ##
          # Append a single rule.
          def add_disambiguator(disambiguator)
            @disambiguators.push(disambiguator)
          end
        end
      end
    end
  end
end
@@ -0,0 +1,37 @@
1
+ require 'sastrawi/stemmer/context/removal'
2
+
3
+ ##
4
+ # Remove derivational suffix
5
+ # Asian J. (2007) "Effective Techniques for Indonesian Text Retrieval" page 61
6
+ # http://researchbank.rmit.edu.au/eserv/rmit:6312/Asian.pdf
7
+
8
module Sastrawi
  module Stemmer
    module Context
      module Visitor
        ##
        # Suffix visitor that strips one derivational suffix and records the
        # removal with affix type 'DS'.
        class RemoveDerivationalSuffix
          ##
          # Original rule: i|kan|an
          # Added the adopted foreign suffix rule: is|isme|isasi
          SUFFIX_PATTERN = /(is|isme|isasi|i|kan|an)$/.freeze

          ##
          # Strip the suffix from the context's current word. Does nothing
          # when the word carries none of the suffixes.
          def visit(context)
            word = context.current_word
            stem = remove_suffix(word)
            return if stem == word

            # Cutting the stem out of the word leaves the suffix that was removed.
            suffix = word.sub(/#{Regexp.quote(stem)}/, '')

            context.add_removal(Sastrawi::Stemmer::Context::Removal.new(self, word, stem, suffix, 'DS'))
            context.current_word = stem
          end

          ##
          # Return +word+ without its derivational suffix (or unchanged).
          def remove_suffix(word)
            word.sub(SUFFIX_PATTERN, '')
          end
        end
      end
    end
  end
end
@@ -0,0 +1,34 @@
1
+ ##
2
+ # Remove inflectional particle
3
+ # Asian J. (2007) "Effective Techniques for Indonesian Text Retrieval" page 60
4
+ # http://researchbank.rmit.edu.au/eserv/rmit:6312/Asian.pdf
5
+
6
module Sastrawi
  module Stemmer
    module Context
      module Visitor
        ##
        # Suffix visitor that strips one inflectional particle and records the
        # removal with affix type 'P'.
        class RemoveInflectionalParticle
          ##
          # Inflectional particles lah|kah|tah|pun, optionally preceded by
          # hyphens.
          PARTICLE_PATTERN = /-*(lah|kah|tah|pun)$/.freeze

          ##
          # Strip the particle from the context's current word. Does nothing
          # when the word carries no particle.
          def visit(context)
            word = context.current_word
            stripped = remove(word)
            return if stripped == word

            # Cutting the stripped word out of the original leaves the particle.
            particle = word.sub(/#{Regexp.quote(stripped)}/, '')

            context.add_removal(Removal.new(self, word, stripped, particle, 'P'))
            context.current_word = stripped
          end

          ##
          # Return +word+ without its inflectional particle (or unchanged).
          def remove(word)
            word.sub(PARTICLE_PATTERN, '')
          end
        end
      end
    end
  end
end
@@ -0,0 +1,34 @@
1
+ ##
2
+ # Remove inflectional possessive pronoun
3
+ # Asian J. (2007) "Effective Techniques for Indonesian Text Retrieval" page 60
4
+ # http://researchbank.rmit.edu.au/eserv/rmit:6312/Asian.pdf
5
+
6
module Sastrawi
  module Stemmer
    module Context
      module Visitor
        ##
        # Suffix visitor that strips one inflectional possessive pronoun and
        # records the removal with affix type 'PP'.
        class RemoveInflectionalPossessivePronoun
          ##
          # Inflectional possessive pronouns ku|mu|nya, optionally preceded by
          # hyphens.
          PRONOUN_PATTERN = /-*(ku|mu|nya)$/.freeze

          ##
          # Strip the pronoun from the context's current word. Does nothing
          # when the word carries no pronoun.
          def visit(context)
            word = context.current_word
            stripped = remove(word)
            return if stripped == word

            # Cutting the stripped word out of the original leaves the pronoun.
            pronoun = word.sub(/#{Regexp.quote(stripped)}/, '')

            context.add_removal(Removal.new(self, word, stripped, pronoun, 'PP'))
            context.current_word = stripped
          end

          ##
          # Return +word+ without its possessive pronoun (or unchanged).
          def remove(word)
            word.sub(PRONOUN_PATTERN, '')
          end
        end
      end
    end
  end
end
@@ -0,0 +1,34 @@
1
+ ##
2
+ # Remove plain prefix
3
+ # Asian J. (2007) "Effective Techniques for Indonesian Text Retrieval" page 61
4
+ # http://researchbank.rmit.edu.au/eserv/rmit:6312/Asian.pdf
5
+
6
module Sastrawi
  module Stemmer
    module Context
      module Visitor
        ##
        # Prefix visitor that strips one plain prefix and records the removal
        # with affix type 'DP'.
        class RemovePlainPrefix
          ##
          # Plain prefixes: di|ke|se
          PREFIX_PATTERN = /^(di|ke|se)/.freeze

          ##
          # Strip the prefix from the context's current word. Does nothing
          # when the word carries none of the plain prefixes.
          def visit(context)
            word = context.current_word
            stripped = remove(word)
            return if stripped == word

            # Cutting the first occurrence of the stripped word out of the
            # original leaves the prefix.
            prefix = word.sub(/#{Regexp.quote(stripped)}/, '')

            context.add_removal(Removal.new(self, word, stripped, prefix, 'DP'))
            context.current_word = stripped
          end

          ##
          # Return +word+ without its plain prefix (or unchanged).
          def remove(word)
            word.sub(PREFIX_PATTERN, '')
          end
        end
      end
    end
  end
end
@@ -0,0 +1,157 @@
1
+ require 'sastrawi/stemmer/context/visitor/dont_stem_short_word'
2
+ require 'sastrawi/stemmer/context/visitor/remove_inflectional_particle'
3
+ require 'sastrawi/stemmer/context/visitor/remove_inflectional_possessive_pronoun'
4
+ require 'sastrawi/stemmer/context/visitor/remove_derivational_suffix'
5
+ require 'sastrawi/stemmer/context/visitor/remove_plain_prefix'
6
+ require 'sastrawi/stemmer/context/visitor/prefix_disambiguator'
7
+
8
+ require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule1a'
9
+ require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule1b'
10
+ require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule2'
11
+ require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule3'
12
+ require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule4'
13
+ require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule5'
14
+ require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule6a'
15
+ require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule6b'
16
+ require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule7'
17
+ require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule8'
18
+ require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule9'
19
+ require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule10'
20
+ require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule11'
21
+ require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule12'
22
+ require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule13a'
23
+ require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule13b'
24
+ require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule14'
25
+ require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule15a'
26
+ require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule15b'
27
+ require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule16'
28
+ require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule17a'
29
+ require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule17b'
30
+ require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule17c'
31
+ require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule17d'
32
+ require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule18a'
33
+ require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule18b'
34
+ require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule19'
35
+ require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule20'
36
+ require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule21a'
37
+ require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule21b'
38
+ require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule23'
39
+ require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule24'
40
+ require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule25'
41
+ require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule26a'
42
+ require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule26b'
43
+ require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule27'
44
+ require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule28a'
45
+ require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule28b'
46
+ require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule29'
47
+ require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule30a'
48
+ require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule30b'
49
+ require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule30c'
50
+ require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule31a'
51
+ require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule31b'
52
+ require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule32'
53
+ require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule34'
54
+ require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule35'
55
+ require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule36'
56
+ require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule37a'
57
+ require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule37b'
58
+ require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule38a'
59
+ require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule38b'
60
+ require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule39a'
61
+ require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule39b'
62
+ require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule40a'
63
+ require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule40b'
64
+ require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule41'
65
+ require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule42'
66
+
67
module Sastrawi
  module Stemmer
    module Context
      module Visitor
        ##
        # Builds the three ordered visitor lists used by the stemming context:
        # general visitors, suffix visitors and prefix visitors.
        class VisitorProvider
          attr_reader :visitors, :suffix_visitors, :prefix_visitors

          def initialize
            @visitors = []
            @suffix_visitors = []
            @prefix_visitors = []

            init_visitors
          end

          ##
          # Populate the visitor lists. Order matters: visitors are applied in
          # the order they are added here.
          def init_visitors
            @visitors << DontStemShortWord.new

            @suffix_visitors << RemoveInflectionalParticle.new
            @suffix_visitors << RemoveInflectionalPossessivePronoun.new
            @suffix_visitors << RemoveDerivationalSuffix.new

            @prefix_visitors << RemovePlainPrefix.new
            disambiguator_rule_groups.each do |rule_classes|
              @prefix_visitors << PrefixDisambiguator.new(rule_classes.map(&:new))
            end

            @prefix_visitors
          end

          private

          ##
          # Rule classes grouped per PrefixDisambiguator, in application order.
          # Rules sharing a number (e.g. 1a/1b) are alternatives handled by one
          # disambiguator.
          def disambiguator_rule_groups
            rules = Sastrawi::Morphology::Disambiguator

            [
              [rules::DisambiguatorPrefixRule1a, rules::DisambiguatorPrefixRule1b],
              [rules::DisambiguatorPrefixRule2],
              [rules::DisambiguatorPrefixRule3],
              [rules::DisambiguatorPrefixRule4],
              [rules::DisambiguatorPrefixRule5],
              [rules::DisambiguatorPrefixRule6a, rules::DisambiguatorPrefixRule6b],
              [rules::DisambiguatorPrefixRule7],
              [rules::DisambiguatorPrefixRule8],
              [rules::DisambiguatorPrefixRule9],
              [rules::DisambiguatorPrefixRule10],
              [rules::DisambiguatorPrefixRule11],
              [rules::DisambiguatorPrefixRule12],
              [rules::DisambiguatorPrefixRule13a, rules::DisambiguatorPrefixRule13b],
              [rules::DisambiguatorPrefixRule14],
              [rules::DisambiguatorPrefixRule15a, rules::DisambiguatorPrefixRule15b],
              [rules::DisambiguatorPrefixRule16],
              [
                rules::DisambiguatorPrefixRule17a, rules::DisambiguatorPrefixRule17b,
                rules::DisambiguatorPrefixRule17c, rules::DisambiguatorPrefixRule17d
              ],
              [rules::DisambiguatorPrefixRule18a, rules::DisambiguatorPrefixRule18b],
              [rules::DisambiguatorPrefixRule19],
              [rules::DisambiguatorPrefixRule20],
              [rules::DisambiguatorPrefixRule21a, rules::DisambiguatorPrefixRule21b],
              [rules::DisambiguatorPrefixRule23],
              [rules::DisambiguatorPrefixRule24],
              [rules::DisambiguatorPrefixRule25],
              [rules::DisambiguatorPrefixRule26a, rules::DisambiguatorPrefixRule26b],
              [rules::DisambiguatorPrefixRule27],
              [rules::DisambiguatorPrefixRule28a, rules::DisambiguatorPrefixRule28b],
              [rules::DisambiguatorPrefixRule29],
              [
                rules::DisambiguatorPrefixRule30a, rules::DisambiguatorPrefixRule30b,
                rules::DisambiguatorPrefixRule30c
              ],
              [rules::DisambiguatorPrefixRule31a, rules::DisambiguatorPrefixRule31b],
              [rules::DisambiguatorPrefixRule32],
              [rules::DisambiguatorPrefixRule34],
              [rules::DisambiguatorPrefixRule35],
              [rules::DisambiguatorPrefixRule36],
              [rules::DisambiguatorPrefixRule37a, rules::DisambiguatorPrefixRule37b],
              [rules::DisambiguatorPrefixRule38a, rules::DisambiguatorPrefixRule38b],
              [rules::DisambiguatorPrefixRule39a, rules::DisambiguatorPrefixRule39b],
              [rules::DisambiguatorPrefixRule40a, rules::DisambiguatorPrefixRule40b],
              [rules::DisambiguatorPrefixRule41],
              [rules::DisambiguatorPrefixRule42]
            ]
          end
        end
      end
    end
  end
end
@@ -0,0 +1,15 @@
1
module Sastrawi
  module Stemmer
    module Filter
      ##
      # Normalizes raw text before stemming.
      class TextNormalizer
        ##
        # Lowercase +text+, replace every character other than ASCII
        # lowercase letters, digits, space and hyphen with a space, collapse
        # runs of spaces into one, and trim leading/trailing whitespace.
        #
        # Returns a new String; +text+ itself is not modified.
        def self.normalize_text(text)
          text.downcase
              # No /i flag: the text is already lowercased, and
              # case-insensitive matching would let Unicode case-fold
              # equivalents of ASCII letters slip through the filter.
              .gsub(/[^a-z0-9 -]/, ' ')
              .squeeze(' ')
              .strip
        end
      end
    end
  end
end