sastrawi-ruby 0.2.0 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/sastrawi/dictionary/array_dictionary.rb +19 -6
- data/lib/sastrawi/stemmer/cache/array_cache.rb +40 -5
- data/lib/sastrawi/stemmer/cached_stemmer.rb +6 -0
- data/lib/sastrawi/stemmer/context/context.rb +1 -5
- data/lib/sastrawi/stemmer/filter/text_normalizer.rb +10 -0
- data/lib/sastrawi/stemmer/stemmer.rb +2 -0
- data/lib/sastrawi/stop_word_remover/stop_word_remover.rb +2 -0
- data/lib/sastrawi/version.rb +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: b33ab69d9a6a019e376bb620152f76de882282d279fe141b93a0086b03165ac5
|
|
4
|
+
data.tar.gz: 6fbe94a6ee7443ce97c5a83042a8bd3bccf473a1b9bf578da9b6a3ad04250eca
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: c58d1702bb5ec1d2fa5a964c8aa15420e32946cb31a0c3227a8d8f27af737d9720ae1fd77b1a9d117d08fe969cad270f2a5e5ee7d54e3903a2a39c7f54ef793d
|
|
7
|
+
data.tar.gz: 5db74112b60b6c74fde9e1ad94488a998aa762f606b71a395ff99521ecea4db1e341e1bf1d6bafe8cf13a4d21ce55f7740085ad8f6546daedfdb3d97f2647e76
|
|
@@ -1,14 +1,23 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'set'
|
|
4
|
+
|
|
1
5
|
module Sastrawi
|
|
2
6
|
module Dictionary
|
|
3
7
|
class ArrayDictionary
|
|
4
|
-
attr_reader :words
|
|
5
|
-
|
|
6
8
|
def initialize(words = [])
|
|
7
|
-
@words =
|
|
9
|
+
@words = Set.new
|
|
8
10
|
|
|
9
11
|
add_words(words)
|
|
10
12
|
end
|
|
11
13
|
|
|
14
|
+
##
|
|
15
|
+
# Return the words as an Array (for backward compatibility)
|
|
16
|
+
|
|
17
|
+
def words
|
|
18
|
+
@words.to_a
|
|
19
|
+
end
|
|
20
|
+
|
|
12
21
|
##
|
|
13
22
|
# Check whether a word is contained in the dictionary
|
|
14
23
|
|
|
@@ -20,7 +29,7 @@ module Sastrawi
|
|
|
20
29
|
# Count how many words in the dictionary
|
|
21
30
|
|
|
22
31
|
def count
|
|
23
|
-
@words.
|
|
32
|
+
@words.size
|
|
24
33
|
end
|
|
25
34
|
|
|
26
35
|
##
|
|
@@ -36,9 +45,13 @@ module Sastrawi
|
|
|
36
45
|
# Add a word to the dictionary
|
|
37
46
|
|
|
38
47
|
def add(word)
|
|
39
|
-
|
|
48
|
+
unless word.is_a?(String)
|
|
49
|
+
raise ArgumentError, "dictionary entries must be strings, got #{word.class}"
|
|
50
|
+
end
|
|
51
|
+
|
|
52
|
+
return if word.strip == ''
|
|
40
53
|
|
|
41
|
-
@words.
|
|
54
|
+
@words.add(word)
|
|
42
55
|
end
|
|
43
56
|
|
|
44
57
|
##
|
|
@@ -1,23 +1,58 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
1
3
|
module Sastrawi
|
|
2
4
|
module Stemmer
|
|
3
5
|
module Cache
|
|
4
6
|
class ArrayCache
|
|
5
|
-
|
|
7
|
+
DEFAULT_MAX_SIZE = 10_000
|
|
8
|
+
|
|
9
|
+
attr_reader :max_size
|
|
6
10
|
|
|
7
|
-
def initialize
|
|
11
|
+
def initialize(max_size: DEFAULT_MAX_SIZE)
|
|
8
12
|
@data = {}
|
|
13
|
+
@mutex = Mutex.new
|
|
14
|
+
@max_size = max_size
|
|
15
|
+
end
|
|
16
|
+
|
|
17
|
+
def data
|
|
18
|
+
@mutex.synchronize { @data.dup }
|
|
9
19
|
end
|
|
10
20
|
|
|
11
21
|
def set(key, value)
|
|
12
|
-
@
|
|
22
|
+
@mutex.synchronize do
|
|
23
|
+
evict_if_full
|
|
24
|
+
@data[key.to_sym] = value
|
|
25
|
+
end
|
|
13
26
|
end
|
|
14
27
|
|
|
15
28
|
def get(key)
|
|
16
|
-
@
|
|
29
|
+
@mutex.synchronize do
|
|
30
|
+
@data[key.to_sym] if @data.key?(key.to_sym)
|
|
31
|
+
end
|
|
17
32
|
end
|
|
18
33
|
|
|
19
34
|
def has?(key)
|
|
20
|
-
@
|
|
35
|
+
@mutex.synchronize do
|
|
36
|
+
@data.key?(key.to_sym)
|
|
37
|
+
end
|
|
38
|
+
end
|
|
39
|
+
|
|
40
|
+
def size
|
|
41
|
+
@mutex.synchronize { @data.size }
|
|
42
|
+
end
|
|
43
|
+
|
|
44
|
+
def clear!
|
|
45
|
+
@mutex.synchronize { @data.clear }
|
|
46
|
+
end
|
|
47
|
+
|
|
48
|
+
private
|
|
49
|
+
|
|
50
|
+
def evict_if_full
|
|
51
|
+
return if @data.size < @max_size
|
|
52
|
+
|
|
53
|
+
# Remove the oldest entry (first inserted key)
|
|
54
|
+
oldest_key = @data.keys.first
|
|
55
|
+
@data.delete(oldest_key)
|
|
21
56
|
end
|
|
22
57
|
end
|
|
23
58
|
end
|
|
@@ -10,9 +10,15 @@ module Sastrawi
|
|
|
10
10
|
@delegated_stemmer = delegated_stemmer
|
|
11
11
|
end
|
|
12
12
|
|
|
13
|
+
def clear_cache!
|
|
14
|
+
@cache.clear!
|
|
15
|
+
end
|
|
16
|
+
|
|
13
17
|
def stem(text)
|
|
14
18
|
normalized_text = Sastrawi::Stemmer::Filter::TextNormalizer.normalize_text(text)
|
|
15
19
|
|
|
20
|
+
return "" if normalized_text.empty?
|
|
21
|
+
|
|
16
22
|
words = normalized_text.split(' ')
|
|
17
23
|
stems = []
|
|
18
24
|
|
|
@@ -205,11 +205,7 @@ module Sastrawi
|
|
|
205
205
|
end
|
|
206
206
|
end
|
|
207
207
|
|
|
208
|
-
@removals.
|
|
209
|
-
if removal.affix_type == 'DP'
|
|
210
|
-
@removals.delete(removal)
|
|
211
|
-
end
|
|
212
|
-
end
|
|
208
|
+
@removals = @removals.reject { |removal| removal.affix_type == 'DP' }
|
|
213
209
|
end
|
|
214
210
|
end
|
|
215
211
|
end
|
|
@@ -1,8 +1,18 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
1
3
|
module Sastrawi
|
|
2
4
|
module Stemmer
|
|
3
5
|
module Filter
|
|
4
6
|
class TextNormalizer
|
|
5
7
|
def self.normalize_text(text)
|
|
8
|
+
return "" if text.nil?
|
|
9
|
+
|
|
10
|
+
unless text.is_a?(String)
|
|
11
|
+
raise ArgumentError, "expected a String, got #{text.class}"
|
|
12
|
+
end
|
|
13
|
+
|
|
14
|
+
return "" if text.empty?
|
|
15
|
+
|
|
6
16
|
lowercase_text = text.downcase
|
|
7
17
|
replaced_text = lowercase_text.gsub(/[^a-z0-9 -]/im, ' ')
|
|
8
18
|
replaced_text = replaced_text.gsub(/( +)/im, ' ')
|
data/lib/sastrawi/version.rb
CHANGED