sastrawi 0.1.0.pre → 0.1.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/.gitignore +1 -1
- data/.travis.yml +7 -5
- data/CONTRIBUTING.md +22 -0
- data/Gemfile +0 -0
- data/LICENSE.txt +1 -1
- data/README.md +53 -19
- data/Rakefile +2 -2
- data/_config.yml +1 -0
- data/bin/sastrawi +24 -0
- data/data/{kata-dasar.txt → base-word.txt} +0 -0
- data/lib/sastrawi.rb +1 -9
- data/lib/sastrawi/dictionary/array_dictionary.rb +36 -2
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule10.rb +1 -1
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule11.rb +1 -1
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule12.rb +1 -1
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule13a.rb +1 -1
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule13b.rb +1 -1
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule14.rb +1 -1
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule15a.rb +1 -1
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule15b.rb +1 -1
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule16.rb +1 -1
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule17a.rb +1 -1
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule17b.rb +1 -1
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule17c.rb +0 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule17d.rb +1 -1
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule18a.rb +1 -1
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule18b.rb +1 -1
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule19.rb +1 -1
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule1a.rb +0 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule1b.rb +1 -1
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule2.rb +1 -1
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule20.rb +1 -1
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule21a.rb +1 -1
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule21b.rb +1 -1
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule23.rb +1 -1
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule24.rb +1 -1
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule25.rb +1 -1
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule26a.rb +1 -1
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule26b.rb +1 -1
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule27.rb +1 -1
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule28a.rb +1 -1
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule28b.rb +1 -1
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule29.rb +2 -2
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule3.rb +1 -1
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule30a.rb +1 -1
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule30b.rb +1 -1
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule30c.rb +0 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule31a.rb +1 -1
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule31b.rb +1 -1
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule32.rb +1 -1
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule34.rb +1 -1
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule35.rb +1 -1
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule36.rb +1 -1
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule37a.rb +2 -2
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule37b.rb +2 -2
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule38a.rb +2 -2
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule38b.rb +2 -2
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule39a.rb +2 -2
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule39b.rb +2 -2
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule4.rb +0 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule40a.rb +2 -2
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule40b.rb +2 -2
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule41.rb +0 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule42.rb +0 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule5.rb +2 -2
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule6a.rb +0 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule6b.rb +1 -1
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule7.rb +1 -1
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule8.rb +1 -1
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule9.rb +2 -2
- data/lib/sastrawi/morphology/invalid_affix_pair_specification.rb +4 -0
- data/lib/sastrawi/stemmer/cache/array_cache.rb +2 -2
- data/lib/sastrawi/stemmer/cached_stemmer.rb +1 -1
- data/lib/sastrawi/stemmer/confix_stripping/precedence_adjustment_specification.rb +5 -0
- data/lib/sastrawi/stemmer/context/context.rb +28 -7
- data/lib/sastrawi/stemmer/context/removal.rb +1 -1
- data/lib/sastrawi/stemmer/context/visitor/dont_stem_short_word.rb +0 -0
- data/lib/sastrawi/stemmer/context/visitor/prefix_disambiguator.rb +2 -2
- data/lib/sastrawi/stemmer/context/visitor/remove_derivational_suffix.rb +10 -1
- data/lib/sastrawi/stemmer/context/visitor/remove_inflectional_particle.rb +9 -1
- data/lib/sastrawi/stemmer/context/visitor/remove_inflectional_possessive_pronoun.rb +9 -1
- data/lib/sastrawi/stemmer/context/visitor/remove_plain_prefix.rb +9 -1
- data/lib/sastrawi/stemmer/context/visitor/visitor_provider.rb +1 -1
- data/lib/sastrawi/stemmer/filter/text_normalizer.rb +0 -0
- data/lib/sastrawi/stemmer/stemmer.rb +31 -15
- data/lib/sastrawi/stemmer/stemmer_factory.rb +5 -1
- data/lib/sastrawi/stop_word_remover/stop_word_remover.rb +5 -2
- data/lib/sastrawi/stop_word_remover/stop_word_remover_factory.rb +102 -130
- data/lib/sastrawi/version.rb +1 -1
- data/sastrawi.gemspec +6 -5
- metadata +22 -19
@@ -3,14 +3,14 @@ module Sastrawi
|
|
3
3
|
module Disambiguator
|
4
4
|
class DisambiguatorPrefixRule9
|
5
5
|
def disambiguate(word)
|
6
|
-
contains = /^te([bcdfghjklmnpqrstvwxyz])er(
|
6
|
+
contains = /^te([bcdfghjklmnpqrstvwxyz])er([bcdfghjklmnpqrstvwxyz])(.*)$/.match(word)
|
7
7
|
|
8
8
|
if contains
|
9
9
|
matches = contains.captures
|
10
10
|
|
11
11
|
return if matches[0] == 'r'
|
12
12
|
|
13
|
-
return matches[0]
|
13
|
+
return "#{matches[0]}er#{matches[1]}#{matches[2]}"
|
14
14
|
end
|
15
15
|
end
|
16
16
|
end
|
@@ -2,7 +2,7 @@ module Sastrawi
|
|
2
2
|
module Stemmer
|
3
3
|
module Cache
|
4
4
|
class ArrayCache
|
5
|
-
|
5
|
+
attr_reader :data
|
6
6
|
|
7
7
|
def initialize
|
8
8
|
@data = {}
|
@@ -13,7 +13,7 @@ module Sastrawi
|
|
13
13
|
end
|
14
14
|
|
15
15
|
def get(key)
|
16
|
-
|
16
|
+
@data[key.to_sym] if @data.key?(key.to_sym)
|
17
17
|
end
|
18
18
|
|
19
19
|
def has?(key)
|
@@ -1,3 +1,8 @@
|
|
1
|
+
##
|
2
|
+
# Confix Stripping Rule Precedence Adjustment Specification
|
3
|
+
# Asian J. (2007) "Effective Techniques for Indonesian Text Retrieval" page 78-79
|
4
|
+
# http://researchbank.rmit.edu.au/eserv/rmit:6312/Asian.pdf
|
5
|
+
|
1
6
|
module Sastrawi
|
2
7
|
module Stemmer
|
3
8
|
module ConfixStripping
|
@@ -1,10 +1,15 @@
|
|
1
1
|
require 'sastrawi/stemmer/confix_stripping/precedence_adjustment_specification'
|
2
2
|
|
3
|
+
##
|
4
|
+
# Stemming context using Nazief and Adriani, Confix Stripping (CS),
|
5
|
+
# Enhanced Confix Stripping (ECS), and Improved ECS
|
6
|
+
|
3
7
|
module Sastrawi
|
4
8
|
module Stemmer
|
5
9
|
module Context
|
6
10
|
class Context
|
7
|
-
|
11
|
+
attr_reader :original_word, :dictionary, :visitor_provider, :visitors, :suffix_visitors, :prefix_visitors
|
12
|
+
attr_accessor :current_word, :process_is_stopped, :removals, :result
|
8
13
|
|
9
14
|
def initialize(original_word, dictionary, visitor_provider)
|
10
15
|
@original_word = original_word
|
@@ -14,10 +19,10 @@ module Sastrawi
|
|
14
19
|
|
15
20
|
@process_is_stopped = false
|
16
21
|
@removals = []
|
17
|
-
@visitors =
|
18
|
-
@suffix_visitors =
|
19
|
-
@prefix_visitors =
|
20
|
-
@result =
|
22
|
+
@visitors = nil
|
23
|
+
@suffix_visitors = nil
|
24
|
+
@prefix_visitors = nil
|
25
|
+
@result = nil
|
21
26
|
|
22
27
|
init_visitors
|
23
28
|
end
|
@@ -36,6 +41,9 @@ module Sastrawi
|
|
36
41
|
@removals.push(removal)
|
37
42
|
end
|
38
43
|
|
44
|
+
##
|
45
|
+
# Execute stemming process
|
46
|
+
|
39
47
|
def execute
|
40
48
|
start_stemming_process
|
41
49
|
|
@@ -55,6 +63,10 @@ module Sastrawi
|
|
55
63
|
|
56
64
|
cs_precendence_adjustment_specification = Sastrawi::Stemmer::ConfixStripping::PrecedenceAdjustmentSpecification.new
|
57
65
|
|
66
|
+
##
|
67
|
+
# Confix stripping
|
68
|
+
# try to remove prefix before suffix if the specification is met
|
69
|
+
|
58
70
|
if cs_precendence_adjustment_specification.satisfied_by?(@original_word)
|
59
71
|
remove_prefixes
|
60
72
|
return if @dictionary.contains?(@current_word)
|
@@ -77,6 +89,9 @@ module Sastrawi
|
|
77
89
|
loop_last_return
|
78
90
|
end
|
79
91
|
|
92
|
+
##
|
93
|
+
# ECS loop last return
|
94
|
+
|
80
95
|
def loop_last_return
|
81
96
|
restore_prefix
|
82
97
|
|
@@ -88,12 +103,12 @@ module Sastrawi
|
|
88
103
|
next unless suffix_removal?(reverse_removal)
|
89
104
|
|
90
105
|
if reverse_removal.removed_part == 'kan'
|
91
|
-
@current_word = reverse_removal.result
|
106
|
+
@current_word = "#{reverse_removal.result}k"
|
92
107
|
|
93
108
|
remove_prefixes
|
94
109
|
return if @dictionary.contains?(@current_word)
|
95
110
|
|
96
|
-
@current_word = reverse_removal.result
|
111
|
+
@current_word = "#{reverse_removal.result}kan"
|
97
112
|
else
|
98
113
|
@current_word = reverse_removal.subject
|
99
114
|
end
|
@@ -146,10 +161,16 @@ module Sastrawi
|
|
146
161
|
end
|
147
162
|
end
|
148
163
|
|
164
|
+
##
|
165
|
+
# Check whether the removed part is a suffix
|
166
|
+
|
149
167
|
def suffix_removal?(removal)
|
150
168
|
removal.affix_type == 'DS' || removal.affix_type == 'PP' || removal.affix_type == 'P'
|
151
169
|
end
|
152
170
|
|
171
|
+
##
|
172
|
+
# Restore prefix to proceed with ECS loop last return
|
173
|
+
|
153
174
|
def restore_prefix
|
154
175
|
@removals.each do |removal|
|
155
176
|
if removal.affix_type == 'DP'
|
@@ -2,7 +2,7 @@ module Sastrawi
|
|
2
2
|
module Stemmer
|
3
3
|
module Context
|
4
4
|
class Removal
|
5
|
-
|
5
|
+
attr_reader :visitor, :subject, :result, :removed_part, :affix_type
|
6
6
|
|
7
7
|
def initialize(visitor, subject, result, removed_part, affix_type)
|
8
8
|
@visitor = visitor
|
File without changes
|
@@ -3,7 +3,7 @@ module Sastrawi
|
|
3
3
|
module Context
|
4
4
|
module Visitor
|
5
5
|
class PrefixDisambiguator
|
6
|
-
|
6
|
+
attr_reader :disambiguators
|
7
7
|
|
8
8
|
def initialize(disambiguators = [])
|
9
9
|
@disambiguators = []
|
@@ -22,7 +22,7 @@ module Sastrawi
|
|
22
22
|
|
23
23
|
return if result.nil?
|
24
24
|
|
25
|
-
removed_part = context.current_word.sub(result
|
25
|
+
removed_part = context.current_word.sub(/#{Regexp.quote(result)}/, '')
|
26
26
|
|
27
27
|
removal = Removal.new(self, context.current_word, result, removed_part, 'DP')
|
28
28
|
|
@@ -1,5 +1,10 @@
|
|
1
1
|
require 'sastrawi/stemmer/context/removal'
|
2
2
|
|
3
|
+
##
|
4
|
+
# Remove derivational suffix
|
5
|
+
# Asian J. (2007) "Effective Techniques for Indonesian Text Retrieval" page 61
|
6
|
+
# http://researchbank.rmit.edu.au/eserv/rmit:6312/Asian.pdf
|
7
|
+
|
3
8
|
module Sastrawi
|
4
9
|
module Stemmer
|
5
10
|
module Context
|
@@ -9,7 +14,7 @@ module Sastrawi
|
|
9
14
|
result = remove_suffix(context.current_word)
|
10
15
|
|
11
16
|
if result != context.current_word
|
12
|
-
removed_part = context.current_word.sub(result
|
17
|
+
removed_part = context.current_word.sub(/#{Regexp.quote(result)}/, '')
|
13
18
|
|
14
19
|
removal = Sastrawi::Stemmer::Context::Removal.new(self, context.current_word, result, removed_part, 'DS')
|
15
20
|
|
@@ -18,6 +23,10 @@ module Sastrawi
|
|
18
23
|
end
|
19
24
|
end
|
20
25
|
|
26
|
+
##
|
27
|
+
# Original rule: i|kan|an
|
28
|
+
# Added the adopted foreign suffix rule: is|isme|isasi
|
29
|
+
|
21
30
|
def remove_suffix(word)
|
22
31
|
word.sub(/(is|isme|isasi|i|kan|an)$/, '')
|
23
32
|
end
|
@@ -1,3 +1,8 @@
|
|
1
|
+
##
|
2
|
+
# Remove inflectional particle
|
3
|
+
# Asian J. (2007) "Effective Techniques for Indonesian Text Retrieval" page 60
|
4
|
+
# http://researchbank.rmit.edu.au/eserv/rmit:6312/Asian.pdf
|
5
|
+
|
1
6
|
module Sastrawi
|
2
7
|
module Stemmer
|
3
8
|
module Context
|
@@ -7,7 +12,7 @@ module Sastrawi
|
|
7
12
|
result = remove(context.current_word)
|
8
13
|
|
9
14
|
if result != context.current_word
|
10
|
-
removed_part = context.current_word.sub(result
|
15
|
+
removed_part = context.current_word.sub(/#{Regexp.quote(result)}/, '')
|
11
16
|
|
12
17
|
removal = Removal.new(self, context.current_word, result, removed_part, 'P')
|
13
18
|
|
@@ -16,6 +21,9 @@ module Sastrawi
|
|
16
21
|
end
|
17
22
|
end
|
18
23
|
|
24
|
+
##
|
25
|
+
# Remove inflectional particle: lah|kah|tah|pun
|
26
|
+
|
19
27
|
def remove(word)
|
20
28
|
word.sub(/-*(lah|kah|tah|pun)$/, '')
|
21
29
|
end
|
@@ -1,3 +1,8 @@
|
|
1
|
+
##
|
2
|
+
# Remove inflectional possessive pronoun
|
3
|
+
# Asian J. (2007) "Effective Techniques for Indonesian Text Retrieval" page 60
|
4
|
+
# http://researchbank.rmit.edu.au/eserv/rmit:6312/Asian.pdf
|
5
|
+
|
1
6
|
module Sastrawi
|
2
7
|
module Stemmer
|
3
8
|
module Context
|
@@ -7,7 +12,7 @@ module Sastrawi
|
|
7
12
|
result = remove(context.current_word)
|
8
13
|
|
9
14
|
if result != context.current_word
|
10
|
-
removed_part = context.current_word.sub(result
|
15
|
+
removed_part = context.current_word.sub(/#{Regexp.quote(result)}/, '')
|
11
16
|
|
12
17
|
removal = Removal.new(self, context.current_word, result, removed_part, 'PP')
|
13
18
|
|
@@ -16,6 +21,9 @@ module Sastrawi
|
|
16
21
|
end
|
17
22
|
end
|
18
23
|
|
24
|
+
##
|
25
|
+
# Remove inflectional possessive pronoun: ku|mu|nya
|
26
|
+
|
19
27
|
def remove(word)
|
20
28
|
word.sub(/-*(ku|mu|nya)$/, '')
|
21
29
|
end
|
@@ -1,3 +1,8 @@
|
|
1
|
+
##
|
2
|
+
# Remove plain prefix
|
3
|
+
# Asian J. (2007) "Effective Techniques for Indonesian Text Retrieval" page 61
|
4
|
+
# http://researchbank.rmit.edu.au/eserv/rmit:6312/Asian.pdf
|
5
|
+
|
1
6
|
module Sastrawi
|
2
7
|
module Stemmer
|
3
8
|
module Context
|
@@ -7,7 +12,7 @@ module Sastrawi
|
|
7
12
|
result = remove(context.current_word)
|
8
13
|
|
9
14
|
if result != context.current_word
|
10
|
-
removed_part = context.current_word.sub(result
|
15
|
+
removed_part = context.current_word.sub(/#{Regexp.quote(result)}/, '')
|
11
16
|
|
12
17
|
removal = Removal.new(self, context.current_word, result, removed_part, 'DP')
|
13
18
|
|
@@ -16,6 +21,9 @@ module Sastrawi
|
|
16
21
|
end
|
17
22
|
end
|
18
23
|
|
24
|
+
##
|
25
|
+
# Remove plain prefix: di|ke|se
|
26
|
+
|
19
27
|
def remove(word)
|
20
28
|
word.sub(/^(di|ke|se)/, '')
|
21
29
|
end
|
File without changes
|
@@ -1,17 +1,26 @@
|
|
1
1
|
require 'sastrawi/stemmer/context/context'
|
2
|
+
|
2
3
|
require 'sastrawi/stemmer/context/visitor/visitor_provider'
|
4
|
+
|
3
5
|
require 'sastrawi/stemmer/filter/text_normalizer'
|
4
6
|
|
7
|
+
##
|
8
|
+
# Indonesian Stemmer
|
9
|
+
# Nazief & Adriani, CS Stemmer, ECS Stemmer, Improved ECS
|
10
|
+
|
5
11
|
module Sastrawi
|
6
12
|
module Stemmer
|
7
13
|
class Stemmer
|
8
|
-
|
14
|
+
attr_reader :dictionary, :visitor_provider
|
9
15
|
|
10
16
|
def initialize(dictionary)
|
11
17
|
@dictionary = dictionary
|
12
18
|
@visitor_provider = Sastrawi::Stemmer::Context::Visitor::VisitorProvider.new
|
13
19
|
end
|
14
20
|
|
21
|
+
##
|
22
|
+
# Stem a string to its base form
|
23
|
+
|
15
24
|
def stem(text)
|
16
25
|
normalized_text = Sastrawi::Stemmer::Filter::TextNormalizer.normalize_text(text)
|
17
26
|
|
@@ -25,6 +34,9 @@ module Sastrawi
|
|
25
34
|
stems.join(' ')
|
26
35
|
end
|
27
36
|
|
37
|
+
##
|
38
|
+
# Stem a word to its base form
|
39
|
+
|
28
40
|
def stem_word(word)
|
29
41
|
if plural?(word)
|
30
42
|
stem_plural_word(word)
|
@@ -36,35 +48,36 @@ module Sastrawi
|
|
36
48
|
def plural?(word)
|
37
49
|
matches = /^(.*)-(ku|mu|nya|lah|kah|tah|pun)$/.match(word)
|
38
50
|
|
39
|
-
if matches
|
40
|
-
|
41
|
-
|
42
|
-
false
|
43
|
-
end
|
51
|
+
return matches[1].include?('-') if matches
|
52
|
+
|
53
|
+
return word.include?('-')
|
44
54
|
end
|
45
55
|
|
56
|
+
##
|
57
|
+
# Stem a plural word to its base form
|
58
|
+
# Asian J. (2007) "Effective Techniques for Indonesian Text Retrieval"
|
59
|
+
# page 76-77
|
60
|
+
|
46
61
|
def stem_plural_word(word)
|
47
62
|
first_match = /^(.*)-(.*)$/.match(word)
|
48
63
|
|
49
|
-
unless first_match
|
50
|
-
return word
|
51
|
-
end
|
52
|
-
|
53
|
-
words = [first_match.captures[0], first_match.captures[1]]
|
64
|
+
return word unless first_match
|
54
65
|
|
66
|
+
words = [first_match[1], first_match[2]]
|
55
67
|
suffix = words[1]
|
56
|
-
suffixes = [
|
68
|
+
suffixes = %w[ku mu nya lah kah tah pun]
|
57
69
|
second_match = /^(.*)-(.*)$/.match(words[0])
|
58
70
|
|
59
71
|
if suffixes.include?(suffix) && second_match
|
60
|
-
words[
|
72
|
+
words[0] = second_match[1]
|
73
|
+
words[1] = "#{second_match[2]}-#{suffix}"
|
61
74
|
end
|
62
75
|
|
63
76
|
root_first_word = stem_singular_word(words[0])
|
64
77
|
root_second_word = stem_singular_word(words[1])
|
65
78
|
|
66
|
-
|
67
|
-
root_second_word = stem_singular_word(
|
79
|
+
if !@dictionary.contains?(words[1]) && root_second_word == words[1]
|
80
|
+
root_second_word = stem_singular_word("me#{words[1]}")
|
68
81
|
end
|
69
82
|
|
70
83
|
if root_first_word == root_second_word
|
@@ -74,6 +87,9 @@ module Sastrawi
|
|
74
87
|
end
|
75
88
|
end
|
76
89
|
|
90
|
+
##
|
91
|
+
# Stem a singular word to its base form
|
92
|
+
|
77
93
|
def stem_singular_word(word)
|
78
94
|
context = Sastrawi::Stemmer::Context::Context.new(word, @dictionary, @visitor_provider)
|
79
95
|
context.execute
|
@@ -1,8 +1,12 @@
|
|
1
1
|
require 'sastrawi/dictionary/array_dictionary'
|
2
|
+
|
2
3
|
require 'sastrawi/stemmer/cached_stemmer'
|
3
4
|
require 'sastrawi/stemmer/stemmer'
|
5
|
+
|
4
6
|
require 'sastrawi/stemmer/cache/array_cache'
|
5
7
|
|
8
|
+
##
|
9
|
+
# Stemmer factory helps creating a pre-configured stemmer
|
6
10
|
|
7
11
|
module Sastrawi
|
8
12
|
module Stemmer
|
@@ -29,7 +33,7 @@ module Sastrawi
|
|
29
33
|
|
30
34
|
def get_words_from_file
|
31
35
|
root_directory = File.expand_path('../../../..', __FILE__)
|
32
|
-
dictionary_file_path = File.join(root_directory, 'data/
|
36
|
+
dictionary_file_path = File.join(root_directory, 'data/base-word.txt')
|
33
37
|
|
34
38
|
dictionary_content = []
|
35
39
|
File.open(dictionary_file_path, 'r') do |file|
|