sastrawi 0.1.0.pre

Sign up to get free protection for your applications and to get access to all the features.
Files changed (89) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +50 -0
  3. data/.travis.yml +8 -0
  4. data/Gemfile +4 -0
  5. data/LICENSE.txt +21 -0
  6. data/README.md +70 -0
  7. data/Rakefile +6 -0
  8. data/data/kata-dasar.txt +29932 -0
  9. data/lib/sastrawi/dictionary/array_dictionary.rb +33 -0
  10. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule10.rb +17 -0
  11. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule11.rb +17 -0
  12. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule12.rb +17 -0
  13. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule13a.rb +17 -0
  14. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule13b.rb +17 -0
  15. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule14.rb +17 -0
  16. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule15a.rb +17 -0
  17. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule15b.rb +17 -0
  18. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule16.rb +17 -0
  19. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule17a.rb +17 -0
  20. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule17b.rb +17 -0
  21. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule17c.rb +17 -0
  22. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule17d.rb +17 -0
  23. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule18a.rb +17 -0
  24. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule18b.rb +17 -0
  25. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule19.rb +17 -0
  26. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule1a.rb +17 -0
  27. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule1b.rb +17 -0
  28. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule2.rb +19 -0
  29. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule20.rb +17 -0
  30. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule21a.rb +17 -0
  31. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule21b.rb +17 -0
  32. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule23.rb +19 -0
  33. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule24.rb +19 -0
  34. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule25.rb +17 -0
  35. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule26a.rb +17 -0
  36. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule26b.rb +17 -0
  37. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule27.rb +17 -0
  38. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule28a.rb +17 -0
  39. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule28b.rb +17 -0
  40. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule29.rb +17 -0
  41. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule3.rb +19 -0
  42. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule30a.rb +17 -0
  43. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule30b.rb +17 -0
  44. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule30c.rb +17 -0
  45. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule31a.rb +17 -0
  46. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule31b.rb +17 -0
  47. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule32.rb +19 -0
  48. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule34.rb +19 -0
  49. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule35.rb +17 -0
  50. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule36.rb +17 -0
  51. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule37a.rb +17 -0
  52. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule37b.rb +17 -0
  53. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule38a.rb +17 -0
  54. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule38b.rb +17 -0
  55. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule39a.rb +17 -0
  56. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule39b.rb +17 -0
  57. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule4.rb +11 -0
  58. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule40a.rb +17 -0
  59. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule40b.rb +17 -0
  60. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule41.rb +17 -0
  61. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule42.rb +17 -0
  62. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule5.rb +17 -0
  63. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule6a.rb +17 -0
  64. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule6b.rb +17 -0
  65. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule7.rb +19 -0
  66. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule8.rb +19 -0
  67. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule9.rb +19 -0
  68. data/lib/sastrawi/morphology/invalid_affix_pair_specification.rb +24 -0
  69. data/lib/sastrawi/stemmer/cache/array_cache.rb +25 -0
  70. data/lib/sastrawi/stemmer/cached_stemmer.rb +33 -0
  71. data/lib/sastrawi/stemmer/confix_stripping/precedence_adjustment_specification.rb +20 -0
  72. data/lib/sastrawi/stemmer/context/context.rb +170 -0
  73. data/lib/sastrawi/stemmer/context/removal.rb +17 -0
  74. data/lib/sastrawi/stemmer/context/visitor/dont_stem_short_word.rb +17 -0
  75. data/lib/sastrawi/stemmer/context/visitor/prefix_disambiguator.rb +46 -0
  76. data/lib/sastrawi/stemmer/context/visitor/remove_derivational_suffix.rb +28 -0
  77. data/lib/sastrawi/stemmer/context/visitor/remove_inflectional_particle.rb +26 -0
  78. data/lib/sastrawi/stemmer/context/visitor/remove_inflectional_possessive_pronoun.rb +26 -0
  79. data/lib/sastrawi/stemmer/context/visitor/remove_plain_prefix.rb +26 -0
  80. data/lib/sastrawi/stemmer/context/visitor/visitor_provider.rb +157 -0
  81. data/lib/sastrawi/stemmer/filter/text_normalizer.rb +15 -0
  82. data/lib/sastrawi/stemmer/stemmer.rb +85 -0
  83. data/lib/sastrawi/stemmer/stemmer_factory.rb +45 -0
  84. data/lib/sastrawi/stop_word_remover/stop_word_remover.rb +24 -0
  85. data/lib/sastrawi/stop_word_remover/stop_word_remover_factory.rb +152 -0
  86. data/lib/sastrawi/version.rb +3 -0
  87. data/lib/sastrawi.rb +12 -0
  88. data/sastrawi.gemspec +25 -0
  89. metadata +173 -0
@@ -0,0 +1,170 @@
1
+ require 'sastrawi/stemmer/confix_stripping/precedence_adjustment_specification'
2
+
3
module Sastrawi
  module Stemmer
    module Context
      # Mutable state for stemming a single word.
      #
      # A Context starts with +current_word+ equal to +original_word+ and lets
      # visitors (objects responding to +visit(context)+) rewrite
      # +current_word+, recording every stripped affix through +add_removal+.
      # After +execute+, +result+ is +current_word+ when the dictionary
      # contains it, otherwise the untouched +original_word+.
      #
      # Collaborators:
      # * +dictionary+ must respond to +contains?(word)+.
      # * +visitor_provider+ must respond to +visitors+, +suffix_visitors+
      #   and +prefix_visitors+.
      class Context
        attr_accessor :original_word, :current_word, :dictionary, :visitor_provider, :process_is_stopped, :removals, :visitors, :suffix_visitors, :prefix_visitors, :result

        # @param original_word [String] the word to stem
        # @param dictionary [#contains?] lookup of known root words
        # @param visitor_provider [#visitors, #suffix_visitors, #prefix_visitors]
        def initialize(original_word, dictionary, visitor_provider)
          @original_word = original_word
          @current_word = original_word
          @dictionary = dictionary
          @visitor_provider = visitor_provider

          @process_is_stopped = false
          @removals = []
          @visitors = []
          @suffix_visitors = []
          @prefix_visitors = []
          @result = ''

          init_visitors
        end

        # Copies the three visitor lists from the provider.
        def init_visitors
          @visitors = @visitor_provider.visitors
          @suffix_visitors = @visitor_provider.suffix_visitors
          @prefix_visitors = @visitor_provider.prefix_visitors
        end

        # Flags the process as stopped; the visitor loops check this flag
        # after each visitor and return early.
        def stop_process
          @process_is_stopped = true
        end

        # Appends a removal record to the history.
        def add_removal(removal)
          @removals.push(removal)
        end

        # Runs the stemming process and stores the outcome in +result+.
        def execute
          start_stemming_process

          @result = @dictionary.contains?(@current_word) ? @current_word : @original_word
        end

        # Tries the stripping strategies in order, returning as soon as the
        # current word is found in the dictionary.
        def start_stemming_process
          return if @dictionary.contains?(@current_word)

          accept_visitors(@visitors)

          return if @dictionary.contains?(@current_word)

          precedence_adjustment = Sastrawi::Stemmer::ConfixStripping::PrecedenceAdjustmentSpecification.new

          if precedence_adjustment.satisfied_by?(@original_word)
            # For these words prefixes are stripped before suffixes.
            remove_prefixes
            return if @dictionary.contains?(@current_word)

            remove_suffixes
            return if @dictionary.contains?(@current_word)

            # The adjusted order failed: start over with the default order.
            @current_word = @original_word
            @removals = []
          end

          remove_suffixes
          return if @dictionary.contains?(@current_word)

          remove_prefixes
          return if @dictionary.contains?(@current_word)

          loop_last_return
        end

        # Last resort: undo prefix removals, then walk the suffix removals
        # from newest to oldest, restoring each one and retrying prefix
        # removal. A removed 'kan' is first retried with only the 'k' put
        # back, then with the whole 'kan'.
        def loop_last_return
          restore_prefix

          # Snapshot (not alias) the history so a failed attempt can be rolled
          # back even though remove_prefixes pushes onto @removals.
          removals = @removals.dup
          current_word = @current_word

          removals.reverse_each do |removal|
            next unless suffix_removal?(removal)

            if removal.removed_part == 'kan'
              # Build new strings; appending in place would corrupt
              # removal.result and yield "...kkan" on the second attempt.
              @current_word = "#{removal.result}k"

              remove_prefixes
              return if @dictionary.contains?(@current_word)

              @current_word = "#{removal.result}kan"
            else
              @current_word = removal.subject
            end

            remove_prefixes
            return if @dictionary.contains?(@current_word)

            @removals = removals.dup
            @current_word = current_word
          end
        end

        # Applies the prefix visitors up to three times, stopping as soon as
        # the current word is in the dictionary.
        def remove_prefixes
          3.times do
            accept_prefix_visitors(@prefix_visitors)

            return if @dictionary.contains?(@current_word)
          end
        end

        # Applies the suffix visitors once.
        def remove_suffixes
          accept_visitors(@suffix_visitors)
        end

        # Lets a single visitor operate on this context.
        def accept(visitor)
          visitor.visit(self)
        end

        # Applies each visitor in order; returns the current word early when it
        # is in the dictionary or the process has been stopped.
        def accept_visitors(visitors)
          visitors.each do |visitor|
            accept(visitor)

            return @current_word if @dictionary.contains?(@current_word)

            return @current_word if @process_is_stopped
          end
        end

        # Like +accept_visitors+, but additionally stops after the first
        # visitor that records a new removal.
        def accept_prefix_visitors(visitors)
          removal_length = @removals.length

          visitors.each do |visitor|
            accept(visitor)

            return @current_word if @dictionary.contains?(@current_word)

            return @current_word if @process_is_stopped

            return if @removals.length > removal_length
          end
        end

        # True for removals typed 'DS', 'PP' or 'P'.
        def suffix_removal?(removal)
          removal.affix_type == 'DS' || removal.affix_type == 'PP' || removal.affix_type == 'P'
        end

        # Resets +current_word+ to the subject of the first 'DP' removal (the
        # word as it was before any prefix was stripped) and drops every 'DP'
        # removal from the history.
        def restore_prefix
          first_prefix_removal = @removals.find { |removal| removal.affix_type == 'DP' }
          @current_word = first_prefix_removal.subject if first_prefix_removal

          # reject! filters in one pass; deleting inside #each skips the
          # element that follows each deleted one.
          @removals.reject! { |removal| removal.affix_type == 'DP' }
        end
      end
    end
  end
end
@@ -0,0 +1,17 @@
1
module Sastrawi
  module Stemmer
    module Context
      # Record of one affix stripped from a word.
      #
      # * +visitor+      - the visitor object that performed the removal
      # * +subject+      - the word before the removal
      # * +result+       - the word after the removal
      # * +removed_part+ - the text that was taken off
      # * +affix_type+   - type tag; the visitors in this gem use 'DP', 'DS',
      #   'PP' and 'P'
      class Removal
        attr_accessor :visitor, :subject, :result, :removed_part, :affix_type

        def initialize(visitor, subject, result, removed_part, affix_type)
          @visitor, @subject, @result = visitor, subject, result
          @removed_part, @affix_type = removed_part, affix_type
        end
      end
    end
  end
end
@@ -0,0 +1,17 @@
1
module Sastrawi
  module Stemmer
    module Context
      module Visitor
        # Visitor that halts the stemming process for very short words.
        class DontStemShortWord
          # Calls +context.stop_process+ when the context's current word is
          # short; otherwise does nothing.
          def visit(context)
            return unless short_word?(context.current_word)

            context.stop_process
          end

          # A word counts as short when it has at most three characters.
          def short_word?(word)
            word.length < 4
          end
        end
      end
    end
  end
end
@@ -0,0 +1,46 @@
1
module Sastrawi
  module Stemmer
    module Context
      module Visitor
        # Visitor that strips a prefix using an ordered list of
        # disambiguation rules, each responding to +disambiguate(word)+.
        class PrefixDisambiguator
          attr_accessor :disambiguators

          # @param disambiguators [Array<#disambiguate>] rules, tried in order
          def initialize(disambiguators = [])
            @disambiguators = []

            add_disambiguators(disambiguators)
          end

          # Tries each rule on the current word and stops at the first whose
          # output is in the dictionary. If none is, the output of the last
          # rule tried is used. When that output is nil nothing happens;
          # otherwise a 'DP' removal is recorded and the current word replaced.
          def visit(context)
            stem = nil

            @disambiguators.each do |rule|
              stem = rule.disambiguate(context.current_word)

              break if context.dictionary.contains?(stem)
            end

            return if stem.nil?

            word = context.current_word
            stripped = word.sub(stem, '')

            context.add_removal(Removal.new(self, word, stem, stripped, 'DP'))
            context.current_word = stem
          end

          # Appends every rule in +disambiguators+, preserving order.
          def add_disambiguators(disambiguators)
            disambiguators.each { |rule| add_disambiguator(rule) }
          end

          # Appends a single rule.
          def add_disambiguator(disambiguator)
            @disambiguators << disambiguator
          end
        end
      end
    end
  end
end
@@ -0,0 +1,28 @@
1
+ require 'sastrawi/stemmer/context/removal'
2
+
3
module Sastrawi
  module Stemmer
    module Context
      module Visitor
        # Visitor that strips one derivational suffix from the current word
        # and records the removal with affix type 'DS'.
        class RemoveDerivationalSuffix
          # Leaves the context untouched when no suffix matches.
          def visit(context)
            word = context.current_word
            stem = remove_suffix(word)
            return if stem == word

            stripped = word.sub(stem, '')
            removal = Sastrawi::Stemmer::Context::Removal.new(self, word, stem, stripped, 'DS')

            context.add_removal(removal)
            context.current_word = stem
          end

          # Returns +word+ without a trailing is/isme/isasi/i/kan/an.
          def remove_suffix(word)
            word.sub(/(is|isme|isasi|i|kan|an)$/, '')
          end
        end
      end
    end
  end
end
@@ -0,0 +1,26 @@
1
module Sastrawi
  module Stemmer
    module Context
      module Visitor
        # Visitor that strips a trailing particle (lah, kah, tah, pun),
        # together with any hyphens directly before it, and records the
        # removal with affix type 'P'.
        class RemoveInflectionalParticle
          # Leaves the context untouched when no particle matches.
          def visit(context)
            before = context.current_word
            after = remove(before)
            return if after == before

            particle = before.sub(after, '')

            context.add_removal(Removal.new(self, before, after, particle, 'P'))
            context.current_word = after
          end

          # Returns +word+ without the trailing particle.
          def remove(word)
            word.sub(/-*(lah|kah|tah|pun)$/, '')
          end
        end
      end
    end
  end
end
@@ -0,0 +1,26 @@
1
module Sastrawi
  module Stemmer
    module Context
      module Visitor
        # Visitor that strips a trailing possessive pronoun (ku, mu, nya),
        # together with any hyphens directly before it, and records the
        # removal with affix type 'PP'.
        class RemoveInflectionalPossessivePronoun
          # Leaves the context untouched when no pronoun matches.
          def visit(context)
            original = context.current_word
            stripped_word = remove(original)

            unless stripped_word == original
              pronoun = original.sub(stripped_word, '')
              record = Removal.new(self, original, stripped_word, pronoun, 'PP')

              context.add_removal(record)
              context.current_word = stripped_word
            end
          end

          # Returns +word+ without the trailing pronoun.
          def remove(word)
            word.sub(/-*(ku|mu|nya)$/, '')
          end
        end
      end
    end
  end
end
@@ -0,0 +1,26 @@
1
module Sastrawi
  module Stemmer
    module Context
      module Visitor
        # Visitor that strips a leading di/ke/se from the current word and
        # records the removal with affix type 'DP'.
        class RemovePlainPrefix
          # Leaves the context untouched when no prefix matches.
          def visit(context)
            full_word = context.current_word
            remainder = remove(full_word)
            return if remainder == full_word

            prefix = full_word.sub(remainder, '')

            context.add_removal(Removal.new(self, full_word, remainder, prefix, 'DP'))
            context.current_word = remainder
          end

          # Returns +word+ without the leading prefix.
          def remove(word)
            word.sub(/^(di|ke|se)/, '')
          end
        end
      end
    end
  end
end
@@ -0,0 +1,157 @@
1
+ require 'sastrawi/stemmer/context/visitor/dont_stem_short_word'
2
+ require 'sastrawi/stemmer/context/visitor/remove_inflectional_particle'
3
+ require 'sastrawi/stemmer/context/visitor/remove_inflectional_possessive_pronoun'
4
+ require 'sastrawi/stemmer/context/visitor/remove_derivational_suffix'
5
+ require 'sastrawi/stemmer/context/visitor/remove_plain_prefix'
6
+ require 'sastrawi/stemmer/context/visitor/prefix_disambiguator'
7
+
8
+ require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule1a'
9
+ require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule1b'
10
+ require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule2'
11
+ require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule3'
12
+ require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule4'
13
+ require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule5'
14
+ require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule6a'
15
+ require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule6b'
16
+ require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule7'
17
+ require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule8'
18
+ require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule9'
19
+ require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule10'
20
+ require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule11'
21
+ require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule12'
22
+ require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule13a'
23
+ require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule13b'
24
+ require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule14'
25
+ require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule15a'
26
+ require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule15b'
27
+ require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule16'
28
+ require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule17a'
29
+ require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule17b'
30
+ require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule17c'
31
+ require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule17d'
32
+ require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule18a'
33
+ require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule18b'
34
+ require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule19'
35
+ require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule20'
36
+ require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule21a'
37
+ require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule21b'
38
+ require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule23'
39
+ require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule24'
40
+ require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule25'
41
+ require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule26a'
42
+ require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule26b'
43
+ require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule27'
44
+ require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule28a'
45
+ require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule28b'
46
+ require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule29'
47
+ require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule30a'
48
+ require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule30b'
49
+ require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule30c'
50
+ require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule31a'
51
+ require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule31b'
52
+ require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule32'
53
+ require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule34'
54
+ require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule35'
55
+ require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule36'
56
+ require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule37a'
57
+ require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule37b'
58
+ require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule38a'
59
+ require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule38b'
60
+ require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule39a'
61
+ require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule39b'
62
+ require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule40a'
63
+ require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule40b'
64
+ require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule41'
65
+ require 'sastrawi/morphology/disambiguator/disambiguator_prefix_rule42'
66
+
67
module Sastrawi
  module Stemmer
    module Context
      module Visitor
        # Builds and exposes the three ordered visitor lists a stemming
        # context works with: general visitors, suffix visitors and prefix
        # visitors.
        class VisitorProvider
          # Name suffixes of the DisambiguatorPrefixRule* classes. Each inner
          # list becomes one PrefixDisambiguator, in this exact order.
          PREFIX_RULE_GROUPS = [
            %w[1a 1b], %w[2], %w[3], %w[4], %w[5], %w[6a 6b],
            %w[7], %w[8], %w[9], %w[10], %w[11], %w[12], %w[13a 13b],
            %w[14], %w[15a 15b], %w[16],
            %w[17a 17b 17c 17d],
            %w[18a 18b], %w[19], %w[20], %w[21a 21b], %w[23], %w[24], %w[25],
            %w[26a 26b],
            %w[27],
            %w[28a 28b],
            %w[29],
            %w[30a 30b 30c],
            %w[31a 31b], %w[32], %w[34],
            %w[35], %w[36],
            %w[37a 37b],
            %w[38a 38b],
            %w[39a 39b],
            %w[40a 40b],
            %w[41], %w[42]
          ].freeze

          attr_accessor :visitors, :suffix_visitors, :prefix_visitors

          def initialize
            @visitors = []
            @suffix_visitors = []
            @prefix_visitors = []

            init_visitors
          end

          # Fills the three lists: one general visitor, three suffix visitors,
          # and the plain-prefix visitor followed by one PrefixDisambiguator
          # per entry of PREFIX_RULE_GROUPS.
          def init_visitors
            @visitors << DontStemShortWord.new

            @suffix_visitors << RemoveInflectionalParticle.new
            @suffix_visitors << RemoveInflectionalPossessivePronoun.new
            @suffix_visitors << RemoveDerivationalSuffix.new

            @prefix_visitors << RemovePlainPrefix.new

            rule_namespace = Sastrawi::Morphology::Disambiguator
            PREFIX_RULE_GROUPS.each do |rule_suffixes|
              rules = rule_suffixes.map do |rule_suffix|
                rule_namespace.const_get("DisambiguatorPrefixRule#{rule_suffix}").new
              end

              @prefix_visitors << PrefixDisambiguator.new(rules)
            end

            @prefix_visitors
          end
        end
      end
    end
  end
end
@@ -0,0 +1,15 @@
1
module Sastrawi
  module Stemmer
    module Filter
      # Prepares raw text for word-by-word stemming.
      class TextNormalizer
        class << self
          # Lowercases +text+, turns every character that is not a letter
          # a-z, a digit, a space or a hyphen into a space, collapses runs of
          # spaces into one, and trims both ends.
          def normalize_text(text)
            text
              .downcase
              .gsub(/[^a-z0-9 -]/im, ' ')
              .gsub(/( +)/im, ' ')
              .strip
          end
        end
      end
    end
  end
end
@@ -0,0 +1,85 @@
1
+ require 'sastrawi/stemmer/context/context'
2
+ require 'sastrawi/stemmer/context/visitor/visitor_provider'
3
+ require 'sastrawi/stemmer/filter/text_normalizer'
4
+
5
module Sastrawi
  module Stemmer
    # Stems free text word by word against a root-word dictionary.
    #
    # +dictionary+ must respond to +contains?(word)+. Each word is stemmed in
    # its own Context, all of which share this stemmer's visitor provider.
    class Stemmer
      attr_accessor :dictionary, :visitor_provider

      # @param dictionary [#contains?] lookup of known root words
      def initialize(dictionary)
        @dictionary = dictionary
        @visitor_provider = Sastrawi::Stemmer::Context::Visitor::VisitorProvider.new
      end

      # Normalizes +text+, stems every space-separated word and joins the
      # stems with single spaces.
      #
      # @param text [String]
      # @return [String]
      def stem(text)
        normalized_text = Sastrawi::Stemmer::Filter::TextNormalizer.normalize_text(text)

        normalized_text.split(' ').map { |word| stem_word(word) }.join(' ')
      end

      # Stems one normalized word, treating hyphenated reduplications
      # separately from plain words.
      def stem_word(word)
        if plural?(word)
          stem_plural_word(word)
        else
          stem_singular_word(word)
        end
      end

      # True when +word+ is a hyphenated compound. A trailing "-ku", "-mu",
      # "-nya", "-lah", "-kah", "-tah" or "-pun" does not count on its own:
      # the part before it must itself contain a hyphen
      # ("word-word-nya" is plural, "word-nya" is not).
      def plural?(word)
        matches = /^(.*)-(ku|mu|nya|lah|kah|tah|pun)$/.match(word)

        return matches[1].include?('-') if matches

        word.include?('-')
      end

      # Stems a hyphenated word. Both halves are stemmed separately; when they
      # share a root, that root is returned, otherwise +word+ is returned
      # unchanged.
      def stem_plural_word(word)
        first_match = /^(.*)-(.*)$/.match(word)

        return word unless first_match

        words = first_match.captures

        suffix = words[1]
        suffixes = %w[ku mu nya lah kah tah pun]
        second_match = /^(.*)-(.*)$/.match(words[0])

        # "a-b-nya" was split as ["a-b", "nya"]; regroup it as ["a", "b-nya"]
        # so the suffix stays attached to the second half.
        if suffixes.include?(suffix) && second_match
          words = [second_match.captures[0], "#{second_match.captures[1]}-#{suffix}"]
        end

        root_first_word = stem_singular_word(words[0])
        root_second_word = stem_singular_word(words[1])

        # The second half could not be stemmed and is not a root word itself:
        # retry with a "me" prefix, which the first half may have carried.
        if !@dictionary.contains?(words[1]) && root_second_word == words[1]
          root_second_word = stem_singular_word("me#{words[1]}")
        end

        if root_first_word == root_second_word
          root_first_word
        else
          word
        end
      end

      # Stems a single non-hyphenated word through a fresh Context.
      def stem_singular_word(word)
        context = Sastrawi::Stemmer::Context::Context.new(word, @dictionary, @visitor_provider)
        context.execute

        context.result
      end
    end
  end
end
@@ -0,0 +1,45 @@
1
+ require 'sastrawi/dictionary/array_dictionary'
2
+ require 'sastrawi/stemmer/cached_stemmer'
3
+ require 'sastrawi/stemmer/stemmer'
4
+ require 'sastrawi/stemmer/cache/array_cache'
5
+
6
+
7
module Sastrawi
  module Stemmer
    # Assembles a ready-to-use stemmer backed by the word list in
    # data/kata-dasar.txt.
    class StemmerFactory
      # Builds a Stemmer over the default dictionary and wraps it, together
      # with a new ArrayCache, in a CachedStemmer.
      #
      # @param is_dev [Boolean] forwarded to +create_default_dictionary+
      def create_stemmer(is_dev = false)
        default_dictionary = create_default_dictionary(is_dev)
        plain_stemmer = Sastrawi::Stemmer::Stemmer.new(default_dictionary)
        result_cache = Sastrawi::Stemmer::Cache::ArrayCache.new

        Sastrawi::Stemmer::CachedStemmer.new(result_cache, plain_stemmer)
      end

      # Wraps the default word list in an ArrayDictionary.
      #
      # @param is_dev [Boolean] forwarded to +get_words+
      def create_default_dictionary(is_dev = false)
        Sastrawi::Dictionary::ArrayDictionary.new(get_words(is_dev))
      end

      # Returns the default word list. +is_dev+ is accepted but has no
      # effect: the words always come from the data file.
      def get_words(is_dev = false)
        get_words_from_file
      end

      # Reads data/kata-dasar.txt, located four path segments up from this
      # file's path, and returns its lines without line terminators.
      #
      # @return [Array<String>]
      def get_words_from_file
        gem_root = File.expand_path('../../../..', __FILE__)
        word_list_path = File.join(gem_root, 'data/kata-dasar.txt')

        File.foreach(word_list_path).map(&:chomp)
      end
    end
  end
end