sastrawi-ruby 0.2.0 → 0.3.0
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between those package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/MILESTONES.md +12 -0
- data/data/base-word.txt +17 -1
- data/data/stop-words.txt +842 -0
- data/lib/sastrawi/dictionary/array_dictionary.rb +19 -6
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule10.rb +2 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule11.rb +2 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule12.rb +2 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule13a.rb +2 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule13b.rb +2 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule14.rb +2 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule15a.rb +2 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule15b.rb +2 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule16.rb +2 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule17a.rb +2 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule17b.rb +2 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule17c.rb +2 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule17d.rb +2 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule18a.rb +2 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule18b.rb +2 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule19.rb +2 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule1a.rb +2 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule1b.rb +2 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule2.rb +2 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule20.rb +2 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule21a.rb +2 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule21b.rb +2 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule23.rb +2 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule24.rb +2 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule25.rb +2 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule26a.rb +2 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule26b.rb +2 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule27.rb +2 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule28a.rb +2 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule28b.rb +2 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule29.rb +2 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule3.rb +2 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule30a.rb +2 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule30b.rb +2 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule30c.rb +2 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule31a.rb +2 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule31b.rb +2 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule32.rb +2 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule34.rb +2 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule35.rb +2 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule36.rb +2 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule37a.rb +2 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule37b.rb +2 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule38a.rb +2 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule38b.rb +2 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule39a.rb +2 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule39b.rb +2 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule4.rb +2 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule40a.rb +2 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule40b.rb +2 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule41.rb +2 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule42.rb +2 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule5.rb +2 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule6a.rb +2 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule6b.rb +2 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule7.rb +2 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule8.rb +2 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule9.rb +2 -0
- data/lib/sastrawi/morphology/invalid_affix_pair_specification.rb +2 -0
- data/lib/sastrawi/stemmer/cache/array_cache.rb +40 -5
- data/lib/sastrawi/stemmer/cached_stemmer.rb +8 -0
- data/lib/sastrawi/stemmer/confix_stripping/precedence_adjustment_specification.rb +2 -0
- data/lib/sastrawi/stemmer/context/context.rb +3 -5
- data/lib/sastrawi/stemmer/context/removal.rb +2 -0
- data/lib/sastrawi/stemmer/context/visitor/dont_stem_short_word.rb +2 -0
- data/lib/sastrawi/stemmer/context/visitor/prefix_disambiguator.rb +2 -0
- data/lib/sastrawi/stemmer/context/visitor/remove_derivational_suffix.rb +2 -0
- data/lib/sastrawi/stemmer/context/visitor/remove_inflectional_particle.rb +2 -0
- data/lib/sastrawi/stemmer/context/visitor/remove_inflectional_possessive_pronoun.rb +2 -0
- data/lib/sastrawi/stemmer/context/visitor/remove_plain_prefix.rb +2 -0
- data/lib/sastrawi/stemmer/context/visitor/visitor_provider.rb +2 -0
- data/lib/sastrawi/stemmer/filter/text_normalizer.rb +10 -0
- data/lib/sastrawi/stemmer/stemmer.rb +10 -0
- data/lib/sastrawi/stemmer/stemmer_factory.rb +2 -0
- data/lib/sastrawi/stop_word_remover/stop_word_remover.rb +4 -0
- data/lib/sastrawi/stop_word_remover/stop_word_remover_factory.rb +19 -107
- data/lib/sastrawi/version.rb +1 -1
- data/lib/sastrawi.rb +2 -0
- metadata +3 -1
|
@@ -1,14 +1,23 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'set'
|
|
4
|
+
|
|
1
5
|
module Sastrawi
|
|
2
6
|
module Dictionary
|
|
3
7
|
class ArrayDictionary
|
|
4
|
-
attr_reader :words
|
|
5
|
-
|
|
6
8
|
def initialize(words = [])
|
|
7
|
-
@words =
|
|
9
|
+
@words = Set.new
|
|
8
10
|
|
|
9
11
|
add_words(words)
|
|
10
12
|
end
|
|
11
13
|
|
|
14
|
+
##
|
|
15
|
+
# Return the words as an Array (for backward compatibility)
|
|
16
|
+
|
|
17
|
+
def words
|
|
18
|
+
@words.to_a
|
|
19
|
+
end
|
|
20
|
+
|
|
12
21
|
##
|
|
13
22
|
# Check whether a word is contained in the dictionary
|
|
14
23
|
|
|
@@ -20,7 +29,7 @@ module Sastrawi
|
|
|
20
29
|
# Count how many words in the dictionary
|
|
21
30
|
|
|
22
31
|
def count
|
|
23
|
-
@words.
|
|
32
|
+
@words.size
|
|
24
33
|
end
|
|
25
34
|
|
|
26
35
|
##
|
|
@@ -36,9 +45,13 @@ module Sastrawi
|
|
|
36
45
|
# Add a word to the dictionary
|
|
37
46
|
|
|
38
47
|
def add(word)
|
|
39
|
-
|
|
48
|
+
unless word.is_a?(String)
|
|
49
|
+
raise ArgumentError, "dictionary entries must be strings, got #{word.class}"
|
|
50
|
+
end
|
|
51
|
+
|
|
52
|
+
return if word.strip == ''
|
|
40
53
|
|
|
41
|
-
@words.
|
|
54
|
+
@words.add(word)
|
|
42
55
|
end
|
|
43
56
|
|
|
44
57
|
##
|
|
@@ -1,23 +1,58 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
1
3
|
module Sastrawi
|
|
2
4
|
module Stemmer
|
|
3
5
|
module Cache
|
|
4
6
|
class ArrayCache
|
|
5
|
-
|
|
7
|
+
DEFAULT_MAX_SIZE = 10_000
|
|
8
|
+
|
|
9
|
+
attr_reader :max_size
|
|
6
10
|
|
|
7
|
-
def initialize
|
|
11
|
+
def initialize(max_size: DEFAULT_MAX_SIZE)
|
|
8
12
|
@data = {}
|
|
13
|
+
@mutex = Mutex.new
|
|
14
|
+
@max_size = max_size
|
|
15
|
+
end
|
|
16
|
+
|
|
17
|
+
def data
|
|
18
|
+
@mutex.synchronize { @data.dup }
|
|
9
19
|
end
|
|
10
20
|
|
|
11
21
|
def set(key, value)
|
|
12
|
-
@
|
|
22
|
+
@mutex.synchronize do
|
|
23
|
+
evict_if_full
|
|
24
|
+
@data[key.to_sym] = value
|
|
25
|
+
end
|
|
13
26
|
end
|
|
14
27
|
|
|
15
28
|
def get(key)
|
|
16
|
-
@
|
|
29
|
+
@mutex.synchronize do
|
|
30
|
+
@data[key.to_sym] if @data.key?(key.to_sym)
|
|
31
|
+
end
|
|
17
32
|
end
|
|
18
33
|
|
|
19
34
|
def has?(key)
|
|
20
|
-
@
|
|
35
|
+
@mutex.synchronize do
|
|
36
|
+
@data.key?(key.to_sym)
|
|
37
|
+
end
|
|
38
|
+
end
|
|
39
|
+
|
|
40
|
+
def size
|
|
41
|
+
@mutex.synchronize { @data.size }
|
|
42
|
+
end
|
|
43
|
+
|
|
44
|
+
def clear!
|
|
45
|
+
@mutex.synchronize { @data.clear }
|
|
46
|
+
end
|
|
47
|
+
|
|
48
|
+
private
|
|
49
|
+
|
|
50
|
+
def evict_if_full
|
|
51
|
+
return if @data.size < @max_size
|
|
52
|
+
|
|
53
|
+
# Remove the oldest entry (first inserted key)
|
|
54
|
+
oldest_key = @data.keys.first
|
|
55
|
+
@data.delete(oldest_key)
|
|
21
56
|
end
|
|
22
57
|
end
|
|
23
58
|
end
|
|
@@ -1,3 +1,5 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
1
3
|
require 'sastrawi/stemmer/filter/text_normalizer'
|
|
2
4
|
|
|
3
5
|
module Sastrawi
|
|
@@ -10,9 +12,15 @@ module Sastrawi
|
|
|
10
12
|
@delegated_stemmer = delegated_stemmer
|
|
11
13
|
end
|
|
12
14
|
|
|
15
|
+
def clear_cache!
|
|
16
|
+
@cache.clear!
|
|
17
|
+
end
|
|
18
|
+
|
|
13
19
|
def stem(text)
|
|
14
20
|
normalized_text = Sastrawi::Stemmer::Filter::TextNormalizer.normalize_text(text)
|
|
15
21
|
|
|
22
|
+
return "" if normalized_text.empty?
|
|
23
|
+
|
|
16
24
|
words = normalized_text.split(' ')
|
|
17
25
|
stems = []
|
|
18
26
|
|
|
@@ -1,3 +1,5 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
1
3
|
require 'sastrawi/stemmer/confix_stripping/precedence_adjustment_specification'
|
|
2
4
|
|
|
3
5
|
##
|
|
@@ -205,11 +207,7 @@ module Sastrawi
|
|
|
205
207
|
end
|
|
206
208
|
end
|
|
207
209
|
|
|
208
|
-
@removals.
|
|
209
|
-
if removal.affix_type == 'DP'
|
|
210
|
-
@removals.delete(removal)
|
|
211
|
-
end
|
|
212
|
-
end
|
|
210
|
+
@removals = @removals.reject { |removal| removal.affix_type == 'DP' }
|
|
213
211
|
end
|
|
214
212
|
end
|
|
215
213
|
end
|