sastrawi 0.1.0.pre → 0.1.4

Sign up to get free protection for your applications and to get access to all the features.
Files changed (92) hide show
  1. checksums.yaml +5 -5
  2. data/.gitignore +1 -1
  3. data/.travis.yml +7 -5
  4. data/CONTRIBUTING.md +22 -0
  5. data/Gemfile +0 -0
  6. data/LICENSE.txt +1 -1
  7. data/README.md +53 -19
  8. data/Rakefile +2 -2
  9. data/_config.yml +1 -0
  10. data/bin/sastrawi +24 -0
  11. data/data/{kata-dasar.txt → base-word.txt} +0 -0
  12. data/lib/sastrawi.rb +1 -9
  13. data/lib/sastrawi/dictionary/array_dictionary.rb +36 -2
  14. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule10.rb +1 -1
  15. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule11.rb +1 -1
  16. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule12.rb +1 -1
  17. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule13a.rb +1 -1
  18. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule13b.rb +1 -1
  19. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule14.rb +1 -1
  20. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule15a.rb +1 -1
  21. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule15b.rb +1 -1
  22. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule16.rb +1 -1
  23. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule17a.rb +1 -1
  24. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule17b.rb +1 -1
  25. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule17c.rb +0 -0
  26. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule17d.rb +1 -1
  27. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule18a.rb +1 -1
  28. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule18b.rb +1 -1
  29. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule19.rb +1 -1
  30. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule1a.rb +0 -0
  31. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule1b.rb +1 -1
  32. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule2.rb +1 -1
  33. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule20.rb +1 -1
  34. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule21a.rb +1 -1
  35. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule21b.rb +1 -1
  36. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule23.rb +1 -1
  37. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule24.rb +1 -1
  38. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule25.rb +1 -1
  39. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule26a.rb +1 -1
  40. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule26b.rb +1 -1
  41. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule27.rb +1 -1
  42. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule28a.rb +1 -1
  43. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule28b.rb +1 -1
  44. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule29.rb +2 -2
  45. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule3.rb +1 -1
  46. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule30a.rb +1 -1
  47. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule30b.rb +1 -1
  48. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule30c.rb +0 -0
  49. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule31a.rb +1 -1
  50. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule31b.rb +1 -1
  51. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule32.rb +1 -1
  52. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule34.rb +1 -1
  53. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule35.rb +1 -1
  54. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule36.rb +1 -1
  55. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule37a.rb +2 -2
  56. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule37b.rb +2 -2
  57. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule38a.rb +2 -2
  58. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule38b.rb +2 -2
  59. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule39a.rb +2 -2
  60. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule39b.rb +2 -2
  61. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule4.rb +0 -0
  62. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule40a.rb +2 -2
  63. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule40b.rb +2 -2
  64. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule41.rb +0 -0
  65. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule42.rb +0 -0
  66. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule5.rb +2 -2
  67. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule6a.rb +0 -0
  68. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule6b.rb +1 -1
  69. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule7.rb +1 -1
  70. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule8.rb +1 -1
  71. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule9.rb +2 -2
  72. data/lib/sastrawi/morphology/invalid_affix_pair_specification.rb +4 -0
  73. data/lib/sastrawi/stemmer/cache/array_cache.rb +2 -2
  74. data/lib/sastrawi/stemmer/cached_stemmer.rb +1 -1
  75. data/lib/sastrawi/stemmer/confix_stripping/precedence_adjustment_specification.rb +5 -0
  76. data/lib/sastrawi/stemmer/context/context.rb +28 -7
  77. data/lib/sastrawi/stemmer/context/removal.rb +1 -1
  78. data/lib/sastrawi/stemmer/context/visitor/dont_stem_short_word.rb +0 -0
  79. data/lib/sastrawi/stemmer/context/visitor/prefix_disambiguator.rb +2 -2
  80. data/lib/sastrawi/stemmer/context/visitor/remove_derivational_suffix.rb +10 -1
  81. data/lib/sastrawi/stemmer/context/visitor/remove_inflectional_particle.rb +9 -1
  82. data/lib/sastrawi/stemmer/context/visitor/remove_inflectional_possessive_pronoun.rb +9 -1
  83. data/lib/sastrawi/stemmer/context/visitor/remove_plain_prefix.rb +9 -1
  84. data/lib/sastrawi/stemmer/context/visitor/visitor_provider.rb +1 -1
  85. data/lib/sastrawi/stemmer/filter/text_normalizer.rb +0 -0
  86. data/lib/sastrawi/stemmer/stemmer.rb +31 -15
  87. data/lib/sastrawi/stemmer/stemmer_factory.rb +5 -1
  88. data/lib/sastrawi/stop_word_remover/stop_word_remover.rb +5 -2
  89. data/lib/sastrawi/stop_word_remover/stop_word_remover_factory.rb +102 -130
  90. data/lib/sastrawi/version.rb +1 -1
  91. data/sastrawi.gemspec +6 -5
  92. metadata +22 -19
@@ -10,7 +10,7 @@ module Sastrawi
10
10
 
11
11
  return if matches[0] == 'r' || /^er(.*)$/.match(matches[1])
12
12
 
13
- return matches[0] << matches[1]
13
+ return "#{matches[0]}#{matches[1]}"
14
14
  end
15
15
  end
16
16
  end
@@ -3,14 +3,14 @@ module Sastrawi
3
3
  module Disambiguator
4
4
  class DisambiguatorPrefixRule9
5
5
  def disambiguate(word)
6
- contains = /^te([bcdfghjklmnpqrstvwxyz])er(([bcdfghjklmnpqrstvwxyz]).*)$/.match(word)
6
+ contains = /^te([bcdfghjklmnpqrstvwxyz])er([bcdfghjklmnpqrstvwxyz])(.*)$/.match(word)
7
7
 
8
8
  if contains
9
9
  matches = contains.captures
10
10
 
11
11
  return if matches[0] == 'r'
12
12
 
13
- return matches[0] << 'er' << matches[1] << matches[2]
13
+ return "#{matches[0]}er#{matches[1]}#{matches[2]}"
14
14
  end
15
15
  end
16
16
  end
@@ -1,3 +1,7 @@
1
+ ##
2
+ # Asian J. (2007) "Effective Techniques for Indonesian Text Retrieval". page 26
3
+ # http://researchbank.rmit.edu.au/eserv/rmit:6312/Asian.pdf
4
+
1
5
  module Sastrawi
2
6
  module Morphology
3
7
  class InvalidAffixPairSpecification
@@ -2,7 +2,7 @@ module Sastrawi
2
2
  module Stemmer
3
3
  module Cache
4
4
  class ArrayCache
5
- attr_accessor :data
5
+ attr_reader :data
6
6
 
7
7
  def initialize
8
8
  @data = {}
@@ -13,7 +13,7 @@ module Sastrawi
13
13
  end
14
14
 
15
15
  def get(key)
16
- return @data[key.to_sym] if @data.key?(key.to_sym)
16
+ @data[key.to_sym] if @data.key?(key.to_sym)
17
17
  end
18
18
 
19
19
  def has?(key)
@@ -3,7 +3,7 @@ require 'sastrawi/stemmer/filter/text_normalizer'
3
3
  module Sastrawi
4
4
  module Stemmer
5
5
  class CachedStemmer
6
- attr_accessor :cache, :delegated_stemmer
6
+ attr_reader :cache, :delegated_stemmer
7
7
 
8
8
  def initialize(cache, delegated_stemmer)
9
9
  @cache = cache
@@ -1,3 +1,8 @@
1
+ ##
2
+ # Confix Stripping Rule Precedence Adjustment Specification
3
+ # Asian J. (2007) "Effective Techniques for Indonesian Text Retrieval" page 78-79
4
+ # http://researchbank.rmit.edu.au/eserv/rmit:6312/Asian.pdf
5
+
1
6
  module Sastrawi
2
7
  module Stemmer
3
8
  module ConfixStripping
@@ -1,10 +1,15 @@
1
1
  require 'sastrawi/stemmer/confix_stripping/precedence_adjustment_specification'
2
2
 
3
+ ##
4
+ # Stemming context using Nazief and Adriani, Confix Stripping (CS),
5
+ # Enhanced Confix Stripping (ECS), and Improved ECS
6
+
3
7
  module Sastrawi
4
8
  module Stemmer
5
9
  module Context
6
10
  class Context
7
- attr_accessor :original_word, :current_word, :dictionary, :visitor_provider, :process_is_stopped, :removals, :visitors, :suffix_visitors, :prefix_visitors, :result
11
+ attr_reader :original_word, :dictionary, :visitor_provider, :visitors, :suffix_visitors, :prefix_visitors
12
+ attr_accessor :current_word, :process_is_stopped, :removals, :result
8
13
 
9
14
  def initialize(original_word, dictionary, visitor_provider)
10
15
  @original_word = original_word
@@ -14,10 +19,10 @@ module Sastrawi
14
19
 
15
20
  @process_is_stopped = false
16
21
  @removals = []
17
- @visitors = []
18
- @suffix_visitors = []
19
- @prefix_visitors = []
20
- @result = ''
22
+ @visitors = nil
23
+ @suffix_visitors = nil
24
+ @prefix_visitors = nil
25
+ @result = nil
21
26
 
22
27
  init_visitors
23
28
  end
@@ -36,6 +41,9 @@ module Sastrawi
36
41
  @removals.push(removal)
37
42
  end
38
43
 
44
+ ##
45
+ # Execute stemming process
46
+
39
47
  def execute
40
48
  start_stemming_process
41
49
 
@@ -55,6 +63,10 @@ module Sastrawi
55
63
 
56
64
  cs_precendence_adjustment_specification = Sastrawi::Stemmer::ConfixStripping::PrecedenceAdjustmentSpecification.new
57
65
 
66
+ ##
67
+ # Confix stripping
68
+ # try to remove prefix before suffix if the specification is met
69
+
58
70
  if cs_precendence_adjustment_specification.satisfied_by?(@original_word)
59
71
  remove_prefixes
60
72
  return if @dictionary.contains?(@current_word)
@@ -77,6 +89,9 @@ module Sastrawi
77
89
  loop_last_return
78
90
  end
79
91
 
92
+ ##
93
+ # ECS loop last return
94
+
80
95
  def loop_last_return
81
96
  restore_prefix
82
97
 
@@ -88,12 +103,12 @@ module Sastrawi
88
103
  next unless suffix_removal?(reverse_removal)
89
104
 
90
105
  if reverse_removal.removed_part == 'kan'
91
- @current_word = reverse_removal.result << 'k'
106
+ @current_word = "#{reverse_removal.result}k"
92
107
 
93
108
  remove_prefixes
94
109
  return if @dictionary.contains?(@current_word)
95
110
 
96
- @current_word = reverse_removal.result << 'kan'
111
+ @current_word = "#{reverse_removal.result}kan"
97
112
  else
98
113
  @current_word = reverse_removal.subject
99
114
  end
@@ -146,10 +161,16 @@ module Sastrawi
146
161
  end
147
162
  end
148
163
 
164
+ ##
165
+ # Check whether the removed part is a suffix
166
+
149
167
  def suffix_removal?(removal)
150
168
  removal.affix_type == 'DS' || removal.affix_type == 'PP' || removal.affix_type == 'P'
151
169
  end
152
170
 
171
+ ##
172
+ # Restore prefix to proceed with ECS loop last return
173
+
153
174
  def restore_prefix
154
175
  @removals.each do |removal|
155
176
  if removal.affix_type == 'DP'
@@ -2,7 +2,7 @@ module Sastrawi
2
2
  module Stemmer
3
3
  module Context
4
4
  class Removal
5
- attr_accessor :visitor, :subject, :result, :removed_part, :affix_type
5
+ attr_reader :visitor, :subject, :result, :removed_part, :affix_type
6
6
 
7
7
  def initialize(visitor, subject, result, removed_part, affix_type)
8
8
  @visitor = visitor
@@ -3,7 +3,7 @@ module Sastrawi
3
3
  module Context
4
4
  module Visitor
5
5
  class PrefixDisambiguator
6
- attr_accessor :disambiguators
6
+ attr_reader :disambiguators
7
7
 
8
8
  def initialize(disambiguators = [])
9
9
  @disambiguators = []
@@ -22,7 +22,7 @@ module Sastrawi
22
22
 
23
23
  return if result.nil?
24
24
 
25
- removed_part = context.current_word.sub(result, '')
25
+ removed_part = context.current_word.sub(/#{Regexp.quote(result)}/, '')
26
26
 
27
27
  removal = Removal.new(self, context.current_word, result, removed_part, 'DP')
28
28
 
@@ -1,5 +1,10 @@
1
1
  require 'sastrawi/stemmer/context/removal'
2
2
 
3
+ ##
4
+ # Remove derivational suffix
5
+ # Asian J. (2007) "Effective Techniques for Indonesian Text Retrieval" page 61
6
+ # http://researchbank.rmit.edu.au/eserv/rmit:6312/Asian.pdf
7
+
3
8
  module Sastrawi
4
9
  module Stemmer
5
10
  module Context
@@ -9,7 +14,7 @@ module Sastrawi
9
14
  result = remove_suffix(context.current_word)
10
15
 
11
16
  if result != context.current_word
12
- removed_part = context.current_word.sub(result, '')
17
+ removed_part = context.current_word.sub(/#{Regexp.quote(result)}/, '')
13
18
 
14
19
  removal = Sastrawi::Stemmer::Context::Removal.new(self, context.current_word, result, removed_part, 'DS')
15
20
 
@@ -18,6 +23,10 @@ module Sastrawi
18
23
  end
19
24
  end
20
25
 
26
+ ##
27
+ # Original rule: i|kan|an
28
+ # Added the adopted foreign suffix rule: is|isme|isasi
29
+
21
30
  def remove_suffix(word)
22
31
  word.sub(/(is|isme|isasi|i|kan|an)$/, '')
23
32
  end
@@ -1,3 +1,8 @@
1
+ ##
2
+ # Remove inflectional particle
3
+ # Asian J. (2007) "Effective Techniques for Indonesian Text Retrieval" page 60
4
+ # http://researchbank.rmit.edu.au/eserv/rmit:6312/Asian.pdf
5
+
1
6
  module Sastrawi
2
7
  module Stemmer
3
8
  module Context
@@ -7,7 +12,7 @@ module Sastrawi
7
12
  result = remove(context.current_word)
8
13
 
9
14
  if result != context.current_word
10
- removed_part = context.current_word.sub(result, '')
15
+ removed_part = context.current_word.sub(/#{Regexp.quote(result)}/, '')
11
16
 
12
17
  removal = Removal.new(self, context.current_word, result, removed_part, 'P')
13
18
 
@@ -16,6 +21,9 @@ module Sastrawi
16
21
  end
17
22
  end
18
23
 
24
+ ##
25
+ # Remove inflectional particle: lah|kah|tah|pun
26
+
19
27
  def remove(word)
20
28
  word.sub(/-*(lah|kah|tah|pun)$/, '')
21
29
  end
@@ -1,3 +1,8 @@
1
+ ##
2
+ # Remove inflectional possessive pronoun
3
+ # Asian J. (2007) "Effective Techniques for Indonesian Text Retrieval" page 60
4
+ # http://researchbank.rmit.edu.au/eserv/rmit:6312/Asian.pdf
5
+
1
6
  module Sastrawi
2
7
  module Stemmer
3
8
  module Context
@@ -7,7 +12,7 @@ module Sastrawi
7
12
  result = remove(context.current_word)
8
13
 
9
14
  if result != context.current_word
10
- removed_part = context.current_word.sub(result, '')
15
+ removed_part = context.current_word.sub(/#{Regexp.quote(result)}/, '')
11
16
 
12
17
  removal = Removal.new(self, context.current_word, result, removed_part, 'PP')
13
18
 
@@ -16,6 +21,9 @@ module Sastrawi
16
21
  end
17
22
  end
18
23
 
24
+ ##
25
+ # Remove inflectional possessive pronoun: ku|mu|nya
26
+
19
27
  def remove(word)
20
28
  word.sub(/-*(ku|mu|nya)$/, '')
21
29
  end
@@ -1,3 +1,8 @@
1
+ ##
2
+ # Remove plain prefix
3
+ # Asian J. (2007) "Effective Techniques for Indonesian Text Retrieval" page 61
4
+ # http://researchbank.rmit.edu.au/eserv/rmit:6312/Asian.pdf
5
+
1
6
  module Sastrawi
2
7
  module Stemmer
3
8
  module Context
@@ -7,7 +12,7 @@ module Sastrawi
7
12
  result = remove(context.current_word)
8
13
 
9
14
  if result != context.current_word
10
- removed_part = context.current_word.sub(result, '')
15
+ removed_part = context.current_word.sub(/#{Regexp.quote(result)}/, '')
11
16
 
12
17
  removal = Removal.new(self, context.current_word, result, removed_part, 'DP')
13
18
 
@@ -16,6 +21,9 @@ module Sastrawi
16
21
  end
17
22
  end
18
23
 
24
+ ##
25
+ # Remove plain prefix: di|ke|se
26
+
19
27
  def remove(word)
20
28
  word.sub(/^(di|ke|se)/, '')
21
29
  end
@@ -69,7 +69,7 @@ module Sastrawi
69
69
  module Context
70
70
  module Visitor
71
71
  class VisitorProvider
72
- attr_accessor :visitors, :suffix_visitors, :prefix_visitors
72
+ attr_reader :visitors, :suffix_visitors, :prefix_visitors
73
73
 
74
74
  def initialize
75
75
  @visitors = []
File without changes
@@ -1,17 +1,26 @@
1
1
  require 'sastrawi/stemmer/context/context'
2
+
2
3
  require 'sastrawi/stemmer/context/visitor/visitor_provider'
4
+
3
5
  require 'sastrawi/stemmer/filter/text_normalizer'
4
6
 
7
+ ##
8
+ # Indonesian Stemmer
9
+ # Nazief & Adriani, CS Stemmer, ECS Stemmer, Improved ECS
10
+
5
11
  module Sastrawi
6
12
  module Stemmer
7
13
  class Stemmer
8
- attr_accessor :dictionary, :visitor_provider
14
+ attr_reader :dictionary, :visitor_provider
9
15
 
10
16
  def initialize(dictionary)
11
17
  @dictionary = dictionary
12
18
  @visitor_provider = Sastrawi::Stemmer::Context::Visitor::VisitorProvider.new
13
19
  end
14
20
 
21
+ ##
22
+ # Stem a string to its base form
23
+
15
24
  def stem(text)
16
25
  normalized_text = Sastrawi::Stemmer::Filter::TextNormalizer.normalize_text(text)
17
26
 
@@ -25,6 +34,9 @@ module Sastrawi
25
34
  stems.join(' ')
26
35
  end
27
36
 
37
+ ##
38
+ # Stem a word to its base form
39
+
28
40
  def stem_word(word)
29
41
  if plural?(word)
30
42
  stem_plural_word(word)
@@ -36,35 +48,36 @@ module Sastrawi
36
48
  def plural?(word)
37
49
  matches = /^(.*)-(ku|mu|nya|lah|kah|tah|pun)$/.match(word)
38
50
 
39
- if matches
40
- true
41
- else
42
- false
43
- end
51
+ return matches[1].include?('-') if matches
52
+
53
+ return word.include?('-')
44
54
  end
45
55
 
56
+ ##
57
+ # Stem a plural word to its base form
58
+ # Asian J. (2007) "Effective Techniques for Indonesian Text Retrieval"
59
+ # page 76-77
60
+
46
61
  def stem_plural_word(word)
47
62
  first_match = /^(.*)-(.*)$/.match(word)
48
63
 
49
- unless first_match
50
- return word
51
- end
52
-
53
- words = [first_match.captures[0], first_match.captures[1]]
64
+ return word unless first_match
54
65
 
66
+ words = [first_match[1], first_match[2]]
55
67
  suffix = words[1]
56
- suffixes = ['ku', 'mu', 'nya', 'lah', 'kah', 'tah', 'pun']
68
+ suffixes = %w[ku mu nya lah kah tah pun]
57
69
  second_match = /^(.*)-(.*)$/.match(words[0])
58
70
 
59
71
  if suffixes.include?(suffix) && second_match
60
- words[1] = words[1] + '-' + suffix
72
+ words[0] = second_match[1]
73
+ words[1] = "#{second_match[2]}-#{suffix}"
61
74
  end
62
75
 
63
76
  root_first_word = stem_singular_word(words[0])
64
77
  root_second_word = stem_singular_word(words[1])
65
78
 
66
- unless @dictionary.contains?(words[1]) && root_second_word == words[1]
67
- root_second_word = stem_singular_word('me' + words[1])
79
+ if !@dictionary.contains?(words[1]) && root_second_word == words[1]
80
+ root_second_word = stem_singular_word("me#{words[1]}")
68
81
  end
69
82
 
70
83
  if root_first_word == root_second_word
@@ -74,6 +87,9 @@ module Sastrawi
74
87
  end
75
88
  end
76
89
 
90
+ ##
91
+ # Stem a singular word to its base form
92
+
77
93
  def stem_singular_word(word)
78
94
  context = Sastrawi::Stemmer::Context::Context.new(word, @dictionary, @visitor_provider)
79
95
  context.execute
@@ -1,8 +1,12 @@
1
1
  require 'sastrawi/dictionary/array_dictionary'
2
+
2
3
  require 'sastrawi/stemmer/cached_stemmer'
3
4
  require 'sastrawi/stemmer/stemmer'
5
+
4
6
  require 'sastrawi/stemmer/cache/array_cache'
5
7
 
8
+ ##
9
+ # Stemmer factory helps create a pre-configured stemmer
6
10
 
7
11
  module Sastrawi
8
12
  module Stemmer
@@ -29,7 +33,7 @@ module Sastrawi
29
33
 
30
34
  def get_words_from_file
31
35
  root_directory = File.expand_path('../../../..', __FILE__)
32
- dictionary_file_path = File.join(root_directory, 'data/kata-dasar.txt')
36
+ dictionary_file_path = File.join(root_directory, 'data/base-word.txt')
33
37
 
34
38
  dictionary_content = []
35
39
  File.open(dictionary_file_path, 'r') do |file|