sastrawi 0.1.0.pre
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +50 -0
- data/.travis.yml +8 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +21 -0
- data/README.md +70 -0
- data/Rakefile +6 -0
- data/data/kata-dasar.txt +29932 -0
- data/lib/sastrawi/dictionary/array_dictionary.rb +33 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule10.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule11.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule12.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule13a.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule13b.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule14.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule15a.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule15b.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule16.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule17a.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule17b.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule17c.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule17d.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule18a.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule18b.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule19.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule1a.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule1b.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule2.rb +19 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule20.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule21a.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule21b.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule23.rb +19 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule24.rb +19 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule25.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule26a.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule26b.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule27.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule28a.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule28b.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule29.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule3.rb +19 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule30a.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule30b.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule30c.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule31a.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule31b.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule32.rb +19 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule34.rb +19 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule35.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule36.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule37a.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule37b.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule38a.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule38b.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule39a.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule39b.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule4.rb +11 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule40a.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule40b.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule41.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule42.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule5.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule6a.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule6b.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule7.rb +19 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule8.rb +19 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule9.rb +19 -0
- data/lib/sastrawi/morphology/invalid_affix_pair_specification.rb +24 -0
- data/lib/sastrawi/stemmer/cache/array_cache.rb +25 -0
- data/lib/sastrawi/stemmer/cached_stemmer.rb +33 -0
- data/lib/sastrawi/stemmer/confix_stripping/precedence_adjustment_specification.rb +20 -0
- data/lib/sastrawi/stemmer/context/context.rb +170 -0
- data/lib/sastrawi/stemmer/context/removal.rb +17 -0
- data/lib/sastrawi/stemmer/context/visitor/dont_stem_short_word.rb +17 -0
- data/lib/sastrawi/stemmer/context/visitor/prefix_disambiguator.rb +46 -0
- data/lib/sastrawi/stemmer/context/visitor/remove_derivational_suffix.rb +28 -0
- data/lib/sastrawi/stemmer/context/visitor/remove_inflectional_particle.rb +26 -0
- data/lib/sastrawi/stemmer/context/visitor/remove_inflectional_possessive_pronoun.rb +26 -0
- data/lib/sastrawi/stemmer/context/visitor/remove_plain_prefix.rb +26 -0
- data/lib/sastrawi/stemmer/context/visitor/visitor_provider.rb +157 -0
- data/lib/sastrawi/stemmer/filter/text_normalizer.rb +15 -0
- data/lib/sastrawi/stemmer/stemmer.rb +85 -0
- data/lib/sastrawi/stemmer/stemmer_factory.rb +45 -0
- data/lib/sastrawi/stop_word_remover/stop_word_remover.rb +24 -0
- data/lib/sastrawi/stop_word_remover/stop_word_remover_factory.rb +152 -0
- data/lib/sastrawi/version.rb +3 -0
- data/lib/sastrawi.rb +12 -0
- data/sastrawi.gemspec +25 -0
- metadata +173 -0
@@ -0,0 +1,17 @@
|
|
1
|
+
module Sastrawi
  module Morphology
    module Disambiguator
      # Prefix rule 30c: removes a leading "penge" from a word.
      class DisambiguatorPrefixRule30c
        # @param word [String] the word to analyse
        # @return [String, nil] everything after "penge", or nil when the
        #   word does not start with "penge"
        def disambiguate(word)
          # \A and \z anchor to the whole string; ^ and $ would also match at
          # any line break inside +word+.
          found = /\Apenge(.*)\z/.match(word)

          found[1] if found
        end
      end
    end
  end
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
module Sastrawi
  module Morphology
    module Disambiguator
      # Prefix rule 31a: for a word shaped "peny" + vowel + rest, drops the
      # leading "pe" and keeps "ny" + vowel + rest.
      class DisambiguatorPrefixRule31a
        # @param word [String] the word to analyse
        # @return [String, nil] "ny" followed by the vowel and the remainder,
        #   or nil when the word does not match
        def disambiguate(word)
          # \A and \z anchor to the whole string; ^ and $ would also match at
          # any line break inside +word+.
          found = /\Apeny([aiueo])(.*)\z/.match(word)
          return unless found

          # Interpolation builds a fresh string instead of mutating the 'ny'
          # literal, which would raise under frozen_string_literal.
          "ny#{found[1]}#{found[2]}"
        end
      end
    end
  end
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
module Sastrawi
  module Morphology
    module Disambiguator
      # Prefix rule 31b: for a word shaped "peny" + vowel + rest, replaces
      # the leading "peny" with "s".
      class DisambiguatorPrefixRule31b
        # @param word [String] the word to analyse
        # @return [String, nil] "s" followed by the vowel and the remainder,
        #   or nil when the word does not match
        def disambiguate(word)
          # \A and \z anchor to the whole string; ^ and $ would also match at
          # any line break inside +word+.
          found = /\Apeny([aiueo])(.*)\z/.match(word)
          return unless found

          # Interpolation builds a fresh string instead of mutating the 's'
          # literal, which would raise under frozen_string_literal.
          "s#{found[1]}#{found[2]}"
        end
      end
    end
  end
end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
module Sastrawi
  module Morphology
    module Disambiguator
      # Prefix rule 32: for a word shaped "pe" + "l" + vowel + rest, drops
      # the leading "pe". The word "pelajar" is special-cased to "ajar".
      class DisambiguatorPrefixRule32
        # @param word [String] the word to analyse
        # @return [String, nil] the word without its "pe" prefix ("ajar" for
        #   "pelajar"), or nil when the word does not match
        def disambiguate(word)
          # Hard-coded exception: the general pattern would yield "lajar".
          return 'ajar' if word == 'pelajar'

          # \A and \z anchor to the whole string; ^ and $ would also match at
          # any line break inside +word+.
          found = /\Ape(l[aiueo])(.*)\z/.match(word)
          return unless found

          "#{found[1]}#{found[2]}"
        end
      end
    end
  end
end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
module Sastrawi
  module Morphology
    module Disambiguator
      # Prefix rule 34: for a word shaped "pe" + consonant + rest, drops the
      # leading "pe" unless the rest begins with "er".
      class DisambiguatorPrefixRule34
        # @param word [String] the word to analyse
        # @return [String, nil] the consonant plus the remainder, or nil when
        #   the word does not match or the text after the consonant starts
        #   with "er"
        def disambiguate(word)
          # \A and \z anchor to the whole string; ^ and $ would also match at
          # any line break inside +word+.
          found = /\Ape([bcdfghjklmnpqrstvwxyz])(.*)\z/.match(word)
          return unless found

          consonant, rest = found.captures
          # Words of the form pe + C + "er..." are rejected by this rule.
          return if rest.start_with?('er')

          "#{consonant}#{rest}"
        end
      end
    end
  end
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
module Sastrawi
  module Morphology
    module Disambiguator
      # Prefix rule 35: for a word shaped "ter" + C1 + "er" + C2 + rest,
      # drops the leading "ter". C1 is limited to the letters
      # b c d f g h j k p q s t v x z; C2 is any consonant.
      class DisambiguatorPrefixRule35
        # @param word [String] the word to analyse
        # @return [String, nil] the word without its "ter" prefix, or nil
        #   when the word does not match
        def disambiguate(word)
          # \A and \z anchor to the whole string; ^ and $ would also match at
          # any line break inside +word+.
          found = /\Ater([bcdfghjkpqstvxz])(er[bcdfghjklmnpqrstvwxyz])(.*)\z/.match(word)

          found.captures.join if found
        end
      end
    end
  end
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
module Sastrawi
  module Morphology
    module Disambiguator
      # Prefix rule 36: for a word shaped "pe" + C1 + "er" + C2 + rest,
      # drops the leading "pe". C1 is limited to the letters
      # b c d f g h j k p q s t v x z; C2 is any consonant.
      class DisambiguatorPrefixRule36
        # @param word [String] the word to analyse
        # @return [String, nil] the word without its "pe" prefix, or nil
        #   when the word does not match
        def disambiguate(word)
          # \A and \z anchor to the whole string; ^ and $ would also match at
          # any line break inside +word+.
          found = /\Ape([bcdfghjkpqstvxz])(er[bcdfghjklmnpqrstvwxyz])(.*)\z/.match(word)

          found.captures.join if found
        end
      end
    end
  end
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
module Sastrawi
  module Morphology
    module Disambiguator
      # Rule 37a: recognises a word shaped consonant + "er" + vowel + rest
      # and returns it reassembled unchanged.
      class DisambiguatorPrefixRule37a
        # @param word [String] the word to analyse
        # @return [String, nil] a new string equal to the matched word, or
        #   nil when the word does not match
        def disambiguate(word)
          # \A and \z anchor to the whole string; ^ and $ would also match at
          # any line break inside +word+.
          # NOTE(review): the leading consonant class omits 'r' — confirm
          # this is intentional.
          found = /\A([bcdfghjklmnpqstvwxyz])(er[aiueo])(.*)\z/.match(word)

          found.captures.join if found
        end
      end
    end
  end
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
module Sastrawi
  module Morphology
    module Disambiguator
      # Rule 37b: for a word shaped consonant + "er" + vowel + rest, removes
      # the "er" that follows the first consonant.
      class DisambiguatorPrefixRule37b
        # @param word [String] the word to analyse
        # @return [String, nil] consonant + vowel + rest (the "er" dropped),
        #   or nil when the word does not match
        def disambiguate(word)
          # \A and \z anchor to the whole string; ^ and $ would also match at
          # any line break inside +word+. "er" sits outside the capture
          # groups, so joining the captures omits it.
          # NOTE(review): the leading consonant class omits 'r' — confirm
          # this is intentional.
          found = /\A([bcdfghjklmnpqstvwxyz])er([aiueo])(.*)\z/.match(word)

          found.captures.join if found
        end
      end
    end
  end
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
module Sastrawi
  module Morphology
    module Disambiguator
      # Rule 38a: recognises a word shaped consonant + "el" + vowel + rest
      # and returns it reassembled unchanged.
      class DisambiguatorPrefixRule38a
        # @param word [String] the word to analyse
        # @return [String, nil] a new string equal to the matched word, or
        #   nil when the word does not match
        def disambiguate(word)
          # \A and \z anchor to the whole string; ^ and $ would also match at
          # any line break inside +word+.
          # NOTE(review): the leading consonant class omits 'r' — confirm
          # this is intentional.
          found = /\A([bcdfghjklmnpqstvwxyz])(el[aiueo])(.*)\z/.match(word)

          found.captures.join if found
        end
      end
    end
  end
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
module Sastrawi
  module Morphology
    module Disambiguator
      # Rule 38b: for a word shaped consonant + "el" + vowel + rest, removes
      # the "el" that follows the first consonant.
      class DisambiguatorPrefixRule38b
        # @param word [String] the word to analyse
        # @return [String, nil] consonant + vowel + rest (the "el" dropped),
        #   or nil when the word does not match
        def disambiguate(word)
          # \A and \z anchor to the whole string; ^ and $ would also match at
          # any line break inside +word+. "el" sits outside the capture
          # groups, so joining the captures omits it.
          # NOTE(review): the leading consonant class omits 'r' — confirm
          # this is intentional.
          found = /\A([bcdfghjklmnpqstvwxyz])el([aiueo])(.*)\z/.match(word)

          found.captures.join if found
        end
      end
    end
  end
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
module Sastrawi
  module Morphology
    module Disambiguator
      # Rule 39a: recognises a word shaped consonant + "em" + vowel + rest
      # and returns it reassembled unchanged.
      class DisambiguatorPrefixRule39a
        # @param word [String] the word to analyse
        # @return [String, nil] a new string equal to the matched word, or
        #   nil when the word does not match
        def disambiguate(word)
          # \A and \z anchor to the whole string; ^ and $ would also match at
          # any line break inside +word+.
          # NOTE(review): the leading consonant class omits 'r' — confirm
          # this is intentional.
          found = /\A([bcdfghjklmnpqstvwxyz])(em[aiueo])(.*)\z/.match(word)

          found.captures.join if found
        end
      end
    end
  end
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
module Sastrawi
  module Morphology
    module Disambiguator
      # Rule 39b: for a word shaped consonant + "em" + vowel + rest, removes
      # the "em" that follows the first consonant.
      class DisambiguatorPrefixRule39b
        # @param word [String] the word to analyse
        # @return [String, nil] consonant + vowel + rest (the "em" dropped),
        #   or nil when the word does not match
        def disambiguate(word)
          # \A and \z anchor to the whole string; ^ and $ would also match at
          # any line break inside +word+. "em" sits outside the capture
          # groups, so joining the captures omits it.
          # NOTE(review): the leading consonant class omits 'r' — confirm
          # this is intentional.
          found = /\A([bcdfghjklmnpqstvwxyz])em([aiueo])(.*)\z/.match(word)

          found.captures.join if found
        end
      end
    end
  end
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
module Sastrawi
  module Morphology
    module Disambiguator
      # Rule 40a: recognises a word shaped consonant + "in" + vowel + rest
      # and returns it reassembled unchanged.
      class DisambiguatorPrefixRule40a
        # @param word [String] the word to analyse
        # @return [String, nil] a new string equal to the matched word, or
        #   nil when the word does not match
        def disambiguate(word)
          # \A and \z anchor to the whole string; ^ and $ would also match at
          # any line break inside +word+.
          # NOTE(review): the leading consonant class omits 'r' — confirm
          # this is intentional.
          found = /\A([bcdfghjklmnpqstvwxyz])(in[aiueo])(.*)\z/.match(word)

          found.captures.join if found
        end
      end
    end
  end
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
module Sastrawi
  module Morphology
    module Disambiguator
      # Rule 40b: for a word shaped consonant + "in" + vowel + rest, removes
      # the "in" that follows the first consonant.
      class DisambiguatorPrefixRule40b
        # @param word [String] the word to analyse
        # @return [String, nil] consonant + vowel + rest (the "in" dropped),
        #   or nil when the word does not match
        def disambiguate(word)
          # \A and \z anchor to the whole string; ^ and $ would also match at
          # any line break inside +word+. "in" sits outside the capture
          # groups, so joining the captures omits it.
          # NOTE(review): the leading consonant class omits 'r' — confirm
          # this is intentional.
          found = /\A([bcdfghjklmnpqstvwxyz])in([aiueo])(.*)\z/.match(word)

          found.captures.join if found
        end
      end
    end
  end
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
module Sastrawi
  module Morphology
    module Disambiguator
      # Prefix rule 41: removes a leading "ku" from a word.
      class DisambiguatorPrefixRule41
        # @param word [String] the word to analyse
        # @return [String, nil] everything after "ku", or nil when the word
        #   does not start with "ku"
        def disambiguate(word)
          # \A and \z anchor to the whole string; ^ and $ would also match at
          # any line break inside +word+.
          found = /\Aku(.*)\z/.match(word)

          found[1] if found
        end
      end
    end
  end
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
module Sastrawi
  module Morphology
    module Disambiguator
      # Prefix rule 42: removes a leading "kau" from a word.
      class DisambiguatorPrefixRule42
        # @param word [String] the word to analyse
        # @return [String, nil] everything after "kau", or nil when the word
        #   does not start with "kau"
        def disambiguate(word)
          # \A and \z anchor to the whole string; ^ and $ would also match at
          # any line break inside +word+.
          found = /\Akau(.*)\z/.match(word)

          found[1] if found
        end
      end
    end
  end
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
module Sastrawi
  module Morphology
    module Disambiguator
      # Prefix rule 5: for a word shaped "be" + C1 + "er" + C2 + rest (C1
      # and C2 any consonant), drops the leading "be".
      class DisambiguatorPrefixRule5
        # @param word [String] the word to analyse
        # @return [String, nil] the word without its "be" prefix, or nil
        #   when the word does not match
        def disambiguate(word)
          # \A and \z anchor to the whole string; ^ and $ would also match at
          # any line break inside +word+.
          found = /\Abe([bcdfghjklmnpqrstvwxyz])(er[bcdfghjklmnpqrstvwxyz])(.*)\z/.match(word)

          found.captures.join if found
        end
      end
    end
  end
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
module Sastrawi
  module Morphology
    module Disambiguator
      # Prefix rule 6a: for a word shaped "ter" + vowel + rest, drops the
      # leading "ter".
      class DisambiguatorPrefixRule6a
        # @param word [String] the word to analyse
        # @return [String, nil] the vowel plus the remainder, or nil when
        #   the word does not match
        def disambiguate(word)
          # \A and \z anchor to the whole string; ^ and $ would also match at
          # any line break inside +word+.
          found = /\Ater([aiueo].*)\z/.match(word)

          found[1] if found
        end
      end
    end
  end
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
module Sastrawi
  module Morphology
    module Disambiguator
      # Prefix rule 6b: for a word shaped "ter" + vowel + rest, drops only
      # the leading "te", so the result starts with "r".
      class DisambiguatorPrefixRule6b
        # @param word [String] the word to analyse
        # @return [String, nil] "r" followed by the vowel and the remainder,
        #   or nil when the word does not match
        def disambiguate(word)
          # \A and \z anchor to the whole string; ^ and $ would also match at
          # any line break inside +word+.
          found = /\Ater([aiueo].*)\z/.match(word)
          return unless found

          # Interpolation builds a fresh string instead of mutating the 'r'
          # literal, which would raise under frozen_string_literal.
          "r#{found[1]}"
        end
      end
    end
  end
end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
module Sastrawi
  module Morphology
    module Disambiguator
      # Prefix rule 7: for a word shaped "ter" + consonant + "er" + vowel +
      # rest, drops the leading "ter" unless the consonant is "r".
      class DisambiguatorPrefixRule7
        # @param word [String] the word to analyse
        # @return [String, nil] consonant + "er" + vowel + rest, or nil when
        #   the word does not match or the consonant is "r"
        def disambiguate(word)
          # \A and \z anchor to the whole string; ^ and $ would also match at
          # any line break inside +word+.
          found = /\Ater([bcdfghjklmnpqrstvwxyz])er([aiueo].*)\z/.match(word)
          return unless found

          consonant, tail = found.captures
          # The character class admits "r", but this rule rejects it.
          return if consonant == 'r'

          "#{consonant}er#{tail}"
        end
      end
    end
  end
end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
module Sastrawi
  module Morphology
    module Disambiguator
      # Prefix rule 8: for a word shaped "ter" + consonant + rest, drops the
      # leading "ter" unless the consonant is "r" or the rest begins with
      # "er".
      class DisambiguatorPrefixRule8
        # @param word [String] the word to analyse
        # @return [String, nil] the consonant plus the remainder, or nil when
        #   the word does not match, the consonant is "r", or the text after
        #   the consonant starts with "er"
        def disambiguate(word)
          # \A and \z anchor to the whole string; ^ and $ would also match at
          # any line break inside +word+.
          found = /\Ater([bcdfghjklmnpqrstvwxyz])(.*)\z/.match(word)
          return unless found

          consonant, rest = found.captures
          return if consonant == 'r' || rest.start_with?('er')

          "#{consonant}#{rest}"
        end
      end
    end
  end
end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
module Sastrawi
  module Morphology
    module Disambiguator
      # Prefix rule 9: for a word shaped "te" + C1 + "er" + C2 + rest (C1
      # and C2 consonants), drops the leading "te" unless C1 is "r".
      class DisambiguatorPrefixRule9
        # @param word [String] the word to analyse
        # @return [String, nil] C1 + "er" + C2 + rest, or nil when the word
        #   does not match or C1 is "r"
        def disambiguate(word)
          # The three groups are deliberately side by side: nesting the C2
          # group inside the remainder group would make the remainder already
          # contain C2, so appending both would repeat the consonant at the
          # end of the result.
          # \A and \z anchor to the whole string; ^ and $ would also match at
          # any line break inside +word+.
          found = /\Ate([bcdfghjklmnpqrstvwxyz])er([bcdfghjklmnpqrstvwxyz])(.*)\z/.match(word)
          return unless found

          first, second, rest = found.captures
          # The character class admits "r", but this rule rejects it.
          return if first == 'r'

          "#{first}er#{second}#{rest}"
        end
      end
    end
  end
end
|
@@ -0,0 +1,24 @@
|
|
1
|
+
module Sastrawi
  module Morphology
    # Tells whether a word carries one of a fixed list of prefix/suffix
    # combinations that this class treats as invalid.
    class InvalidAffixPairSpecification
      # Words shaped "me...kan" are never reported, even though they also
      # fit the "me...an" pattern below.
      EXEMPT_PAIR = /\Ame(.*)kan\z/

      # Prefix/suffix combinations that are reported. Built once rather than
      # on every call. \A and \z anchor to the whole string; ^ and $ would
      # also match at any line break inside the word.
      INVALID_PAIRS = [
        /\Aber(.*)i\z/, /\Adi(.*)an\z/, /\Ake(.*)i\z/, /\Ake(.*)an\z/,
        /\Ame(.*)an\z/, /\Ater(.*)an\z/, /\Aper(.*)an\z/
      ].freeze

      # @param word [String] the word to check
      # @return [Boolean] true when the word matches one of the invalid
      #   pairs; false for "me...kan" words, for the word "ketahui", and for
      #   everything else
      def satisfied_by?(word)
        # "ketahui" fits "ke...i" but is explicitly excluded.
        return false if word == 'ketahui' || EXEMPT_PAIR.match(word)

        INVALID_PAIRS.any? { |pair| pair.match(word) }
      end
    end
  end
end
|
@@ -0,0 +1,25 @@
|
|
1
|
+
module Sastrawi
  module Stemmer
    module Cache
      # In-memory cache backed by a Hash. Every key is converted with
      # +to_sym+ before it is used.
      class ArrayCache
        attr_accessor :data

        # Starts with an empty store.
        def initialize
          @data = {}
        end

        # Stores +value+ under +key+.
        #
        # @param key [#to_sym]
        # @param value [Object]
        # @return [Object] the stored value
        def set(key, value)
          @data.store(slot(key), value)
        end

        # Looks up the value stored under +key+.
        #
        # @param key [#to_sym]
        # @return [Object, nil] the cached value, or nil when the key is absent
        def get(key)
          @data.fetch(slot(key), nil)
        end

        # @param key [#to_sym]
        # @return [Boolean] whether a value is stored under +key+
        def has?(key)
          @data.key?(slot(key))
        end

        private

        # Converts a caller-supplied key into the Symbol used by the store.
        def slot(key)
          key.to_sym
        end
      end
    end
  end
end
|
@@ -0,0 +1,33 @@
|
|
1
|
+
require 'sastrawi/stemmer/filter/text_normalizer'
|
2
|
+
|
3
|
+
module Sastrawi
  module Stemmer
    # Stemmer wrapper that consults a cache before delegating each word to
    # the wrapped stemmer.
    class CachedStemmer
      attr_accessor :cache, :delegated_stemmer

      # @param cache [#has?, #get, #set] store for already-computed stems
      # @param delegated_stemmer [#stem] stemmer used on a cache miss
      def initialize(cache, delegated_stemmer)
        @cache = cache
        @delegated_stemmer = delegated_stemmer
      end

      # Normalizes +text+, stems it word by word and joins the stems with
      # single spaces.
      #
      # @param text [String]
      # @return [String] the stems in input order, separated by spaces
      def stem(text)
        tokens = Sastrawi::Stemmer::Filter::TextNormalizer.normalize_text(text).split(' ')

        tokens.map { |token| cached_or_delegated(token) }.join(' ')
      end

      private

      # Returns the cached stem for +token+; on a miss, asks the delegated
      # stemmer and records its answer in the cache.
      def cached_or_delegated(token)
        return @cache.get(token) if @cache.has?(token)

        @delegated_stemmer.stem(token).tap { |result| @cache.set(token, result) }
      end
    end
  end
end
|
@@ -0,0 +1,20 @@
|
|
1
|
+
module Sastrawi
  module Stemmer
    module ConfixStripping
      # Tells whether a word matches one of a fixed list of prefix/suffix
      # patterns.
      class PrecedenceAdjustmentSpecification
        # Patterns that satisfy the specification. Built once rather than on
        # every call. \A and \z anchor to the whole string; ^ and $ would
        # also match at any line break inside the value.
        RULES = [
          /\Abe(.*)lah\z/, /\Abe(.*)an\z/, /\Ame(.*)i\z/,
          /\Adi(.*)i\z/, /\Ape(.*)i\z/, /\Ater(.*)i\z/
        ].freeze

        # @param value [String] the word to check
        # @return [Boolean] true when +value+ matches any of the patterns
        def satisfied_by?(value)
          RULES.any? { |rule| rule.match(value) }
        end
      end
    end
  end
end
|