sastrawi-ruby 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.github/workflows/ci.yml +23 -0
- data/.gitignore +51 -0
- data/.travis.yml +10 -0
- data/CONTRIBUTING.md +22 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +21 -0
- data/README.md +104 -0
- data/Rakefile +6 -0
- data/_config.yml +1 -0
- data/bin/sastrawi +24 -0
- data/data/base-word.txt +29933 -0
- data/lib/sastrawi/dictionary/array_dictionary.rb +67 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule10.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule11.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule12.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule13a.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule13b.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule14.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule15a.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule15b.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule16.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule17a.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule17b.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule17c.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule17d.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule18a.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule18b.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule19.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule1a.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule1b.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule2.rb +19 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule20.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule21a.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule21b.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule23.rb +19 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule24.rb +19 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule25.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule26a.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule26b.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule27.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule28a.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule28b.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule29.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule3.rb +19 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule30a.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule30b.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule30c.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule31a.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule31b.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule32.rb +19 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule34.rb +19 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule35.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule36.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule37a.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule37b.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule38a.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule38b.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule39a.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule39b.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule4.rb +11 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule40a.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule40b.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule41.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule42.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule5.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule6a.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule6b.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule7.rb +19 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule8.rb +19 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule9.rb +19 -0
- data/lib/sastrawi/morphology/invalid_affix_pair_specification.rb +28 -0
- data/lib/sastrawi/stemmer/cache/array_cache.rb +25 -0
- data/lib/sastrawi/stemmer/cached_stemmer.rb +33 -0
- data/lib/sastrawi/stemmer/confix_stripping/precedence_adjustment_specification.rb +25 -0
- data/lib/sastrawi/stemmer/context/context.rb +217 -0
- data/lib/sastrawi/stemmer/context/removal.rb +17 -0
- data/lib/sastrawi/stemmer/context/visitor/dont_stem_short_word.rb +17 -0
- data/lib/sastrawi/stemmer/context/visitor/prefix_disambiguator.rb +54 -0
- data/lib/sastrawi/stemmer/context/visitor/remove_derivational_suffix.rb +37 -0
- data/lib/sastrawi/stemmer/context/visitor/remove_inflectional_particle.rb +34 -0
- data/lib/sastrawi/stemmer/context/visitor/remove_inflectional_possessive_pronoun.rb +34 -0
- data/lib/sastrawi/stemmer/context/visitor/remove_plain_prefix.rb +34 -0
- data/lib/sastrawi/stemmer/context/visitor/visitor_provider.rb +157 -0
- data/lib/sastrawi/stemmer/filter/text_normalizer.rb +15 -0
- data/lib/sastrawi/stemmer/stemmer.rb +101 -0
- data/lib/sastrawi/stemmer/stemmer_factory.rb +49 -0
- data/lib/sastrawi/stop_word_remover/stop_word_remover.rb +27 -0
- data/lib/sastrawi/stop_word_remover/stop_word_remover_factory.rb +124 -0
- data/lib/sastrawi/version.rb +5 -0
- data/lib/sastrawi.rb +4 -0
- data/sastrawi.gemspec +34 -0
- metadata +179 -0
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
module Sastrawi
  module Morphology
    module Disambiguator
      # Prefix rule 30b: for a word shaped "peng" + vowel + rest, proposes
      # "k" + vowel + rest as the candidate.
      class DisambiguatorPrefixRule30b
        # @param word [String] the word to inspect
        # @return [String, nil] the candidate, or nil when the word does not
        #   have the "peng" + vowel shape
        def disambiguate(word)
          # \A/\Z anchor on the whole string; ^/$ would also match an interior
          # line of a multi-line string and silently drop the text before it.
          found = /\Apeng([aiueo])(.*)\Z/.match(word)
          return unless found

          vowel, rest = found.captures
          "k#{vowel}#{rest}"
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
module Sastrawi
  module Morphology
    module Disambiguator
      # Prefix rule 30c: for a word starting with "penge", proposes the text
      # that follows "penge" as the candidate.
      class DisambiguatorPrefixRule30c
        # @param word [String] the word to inspect
        # @return [String, nil] everything after the leading "penge", or nil
        #   when the word does not start with "penge"
        def disambiguate(word)
          # \A/\Z anchor on the whole string; ^/$ would also match an interior
          # line of a multi-line string and silently drop the text before it.
          found = /\Apenge(.*)\Z/.match(word)
          return unless found

          found.captures.first
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
module Sastrawi
  module Morphology
    module Disambiguator
      # Prefix rule 31a: for a word shaped "peny" + vowel + rest, proposes
      # "ny" + vowel + rest as the candidate.
      class DisambiguatorPrefixRule31a
        # @param word [String] the word to inspect
        # @return [String, nil] the candidate, or nil when the word does not
        #   have the "peny" + vowel shape
        def disambiguate(word)
          # \A/\Z anchor on the whole string; ^/$ would also match an interior
          # line of a multi-line string and silently drop the text before it.
          found = /\Apeny([aiueo])(.*)\Z/.match(word)
          return unless found

          vowel, rest = found.captures
          "ny#{vowel}#{rest}"
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
module Sastrawi
  module Morphology
    module Disambiguator
      # Prefix rule 31b: for a word shaped "peny" + vowel + rest, proposes
      # "s" + vowel + rest as the candidate.
      class DisambiguatorPrefixRule31b
        # @param word [String] the word to inspect
        # @return [String, nil] the candidate, or nil when the word does not
        #   have the "peny" + vowel shape
        def disambiguate(word)
          # \A/\Z anchor on the whole string; ^/$ would also match an interior
          # line of a multi-line string and silently drop the text before it.
          found = /\Apeny([aiueo])(.*)\Z/.match(word)
          return unless found

          vowel, rest = found.captures
          "s#{vowel}#{rest}"
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
module Sastrawi
  module Morphology
    module Disambiguator
      # Prefix rule 32: for a word shaped "pe" + "l" + vowel + rest, proposes
      # the word without its leading "pe". The literal word "pelajar" is a
      # special case and yields "ajar".
      class DisambiguatorPrefixRule32
        # @param word [String] the word to inspect
        # @return [String, nil] the candidate, or nil when the word does not
        #   have the "pel" + vowel shape
        def disambiguate(word)
          return 'ajar' if word == 'pelajar'

          # \A/\Z anchor on the whole string; ^/$ would also match an interior
          # line of a multi-line string and silently drop the text before it.
          found = /\Ape(l[aiueo])(.*)\Z/.match(word)
          return unless found

          found.captures.join
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
module Sastrawi
  module Morphology
    module Disambiguator
      # Prefix rule 34: for a word shaped "pe" + consonant + rest, proposes the
      # word without its leading "pe", unless the rest begins with "er".
      class DisambiguatorPrefixRule34
        # @param word [String] the word to inspect
        # @return [String, nil] consonant + rest, or nil when the word does not
        #   match or when the text after the consonant starts with "er"
        def disambiguate(word)
          # \A/\Z anchor on the whole string; ^/$ would also match an interior
          # line of a multi-line string and silently drop the text before it.
          found = /\Ape([bcdfghjklmnpqrstvwxyz])(.*)\Z/.match(word)
          return unless found

          consonant, rest = found.captures
          # "pe" + consonant + "er..." is excluded by this rule.
          return if rest.start_with?('er')

          "#{consonant}#{rest}"
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
module Sastrawi
  module Morphology
    module Disambiguator
      # Prefix rule 35: for a word shaped "ter" + one of [bcdfghjkpqstvxz] +
      # "er" + consonant + rest, proposes the word without its leading "ter".
      class DisambiguatorPrefixRule35
        # @param word [String] the word to inspect
        # @return [String, nil] the text after "ter", or nil when the word does
        #   not have the required shape
        def disambiguate(word)
          # \A/\Z anchor on the whole string; ^/$ would also match an interior
          # line of a multi-line string and silently drop the text before it.
          found = /\Ater([bcdfghjkpqstvxz])(er[bcdfghjklmnpqrstvwxyz])(.*)\Z/.match(word)
          return unless found

          found.captures.join
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
module Sastrawi
  module Morphology
    module Disambiguator
      # Prefix rule 36: for a word shaped "pe" + one of [bcdfghjkpqstvxz] +
      # "er" + consonant + rest, proposes the word without its leading "pe".
      class DisambiguatorPrefixRule36
        # @param word [String] the word to inspect
        # @return [String, nil] the text after "pe", or nil when the word does
        #   not have the required shape
        def disambiguate(word)
          # \A/\Z anchor on the whole string; ^/$ would also match an interior
          # line of a multi-line string and silently drop the text before it.
          found = /\Ape([bcdfghjkpqstvxz])(er[bcdfghjklmnpqrstvwxyz])(.*)\Z/.match(word)
          return unless found

          found.captures.join
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
module Sastrawi
  module Morphology
    module Disambiguator
      # Prefix rule 37a: recognises a word shaped consonant + "er" + vowel +
      # rest and returns it reassembled from its parts, i.e. unchanged.
      class DisambiguatorPrefixRule37a
        # @param word [String] the word to inspect
        # @return [String, nil] the matched word, or nil when the word does not
        #   have the required shape
        def disambiguate(word)
          # \A/\Z anchor on the whole string; ^/$ would also match an interior
          # line of a multi-line string and silently drop the text before it.
          found = /\A([bcdfghjklmnpqrstvwxyz])(er[aiueo])(.*)\Z/.match(word)
          return unless found

          found.captures.join
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
module Sastrawi
  module Morphology
    module Disambiguator
      # Prefix rule 37b: for a word shaped consonant + "er" + vowel + rest,
      # proposes the word with that "er" removed.
      class DisambiguatorPrefixRule37b
        # @param word [String] the word to inspect
        # @return [String, nil] consonant + vowel + rest, or nil when the word
        #   does not have the required shape
        def disambiguate(word)
          # \A/\Z anchor on the whole string; ^/$ would also match an interior
          # line of a multi-line string and silently drop the text before it.
          found = /\A([bcdfghjklmnpqrstvwxyz])er([aiueo])(.*)\Z/.match(word)
          return unless found

          # "er" sits outside the capture groups, so joining drops it.
          found.captures.join
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
module Sastrawi
  module Morphology
    module Disambiguator
      # Prefix rule 38a: recognises a word shaped consonant + "el" + vowel +
      # rest and returns it reassembled from its parts, i.e. unchanged.
      class DisambiguatorPrefixRule38a
        # @param word [String] the word to inspect
        # @return [String, nil] the matched word, or nil when the word does not
        #   have the required shape
        def disambiguate(word)
          # \A/\Z anchor on the whole string; ^/$ would also match an interior
          # line of a multi-line string and silently drop the text before it.
          found = /\A([bcdfghjklmnpqrstvwxyz])(el[aiueo])(.*)\Z/.match(word)
          return unless found

          found.captures.join
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
module Sastrawi
  module Morphology
    module Disambiguator
      # Prefix rule 38b: for a word shaped consonant + "el" + vowel + rest,
      # proposes the word with that "el" removed.
      class DisambiguatorPrefixRule38b
        # @param word [String] the word to inspect
        # @return [String, nil] consonant + vowel + rest, or nil when the word
        #   does not have the required shape
        def disambiguate(word)
          # \A/\Z anchor on the whole string; ^/$ would also match an interior
          # line of a multi-line string and silently drop the text before it.
          found = /\A([bcdfghjklmnpqrstvwxyz])el([aiueo])(.*)\Z/.match(word)
          return unless found

          # "el" sits outside the capture groups, so joining drops it.
          found.captures.join
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
module Sastrawi
  module Morphology
    module Disambiguator
      # Prefix rule 39a: recognises a word shaped consonant + "em" + vowel +
      # rest and returns it reassembled from its parts, i.e. unchanged.
      class DisambiguatorPrefixRule39a
        # @param word [String] the word to inspect
        # @return [String, nil] the matched word, or nil when the word does not
        #   have the required shape
        def disambiguate(word)
          # \A/\Z anchor on the whole string; ^/$ would also match an interior
          # line of a multi-line string and silently drop the text before it.
          found = /\A([bcdfghjklmnpqrstvwxyz])(em[aiueo])(.*)\Z/.match(word)
          return unless found

          found.captures.join
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
module Sastrawi
  module Morphology
    module Disambiguator
      # Prefix rule 39b: for a word shaped consonant + "em" + vowel + rest,
      # proposes the word with that "em" removed.
      class DisambiguatorPrefixRule39b
        # @param word [String] the word to inspect
        # @return [String, nil] consonant + vowel + rest, or nil when the word
        #   does not have the required shape
        def disambiguate(word)
          # \A/\Z anchor on the whole string; ^/$ would also match an interior
          # line of a multi-line string and silently drop the text before it.
          found = /\A([bcdfghjklmnpqrstvwxyz])em([aiueo])(.*)\Z/.match(word)
          return unless found

          # "em" sits outside the capture groups, so joining drops it.
          found.captures.join
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
module Sastrawi
  module Morphology
    module Disambiguator
      # Prefix rule 40a: recognises a word shaped consonant + "in" + vowel +
      # rest and returns it reassembled from its parts, i.e. unchanged.
      class DisambiguatorPrefixRule40a
        # @param word [String] the word to inspect
        # @return [String, nil] the matched word, or nil when the word does not
        #   have the required shape
        def disambiguate(word)
          # \A/\Z anchor on the whole string; ^/$ would also match an interior
          # line of a multi-line string and silently drop the text before it.
          found = /\A([bcdfghjklmnpqrstvwxyz])(in[aiueo])(.*)\Z/.match(word)
          return unless found

          found.captures.join
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
module Sastrawi
  module Morphology
    module Disambiguator
      # Prefix rule 40b: for a word shaped consonant + "in" + vowel + rest,
      # proposes the word with that "in" removed.
      class DisambiguatorPrefixRule40b
        # @param word [String] the word to inspect
        # @return [String, nil] consonant + vowel + rest, or nil when the word
        #   does not have the required shape
        def disambiguate(word)
          # \A/\Z anchor on the whole string; ^/$ would also match an interior
          # line of a multi-line string and silently drop the text before it.
          found = /\A([bcdfghjklmnpqrstvwxyz])in([aiueo])(.*)\Z/.match(word)
          return unless found

          # "in" sits outside the capture groups, so joining drops it.
          found.captures.join
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
module Sastrawi
  module Morphology
    module Disambiguator
      # Prefix rule 41: for a word starting with "ku", proposes the text that
      # follows "ku" as the candidate.
      class DisambiguatorPrefixRule41
        # @param word [String] the word to inspect
        # @return [String, nil] everything after the leading "ku", or nil when
        #   the word does not start with "ku"
        def disambiguate(word)
          # \A/\Z anchor on the whole string; ^/$ would also match an interior
          # line of a multi-line string and silently drop the text before it.
          found = /\Aku(.*)\Z/.match(word)
          return unless found

          found.captures.first
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
module Sastrawi
  module Morphology
    module Disambiguator
      # Prefix rule 42: for a word starting with "kau", proposes the text that
      # follows "kau" as the candidate.
      class DisambiguatorPrefixRule42
        # @param word [String] the word to inspect
        # @return [String, nil] everything after the leading "kau", or nil when
        #   the word does not start with "kau"
        def disambiguate(word)
          # \A/\Z anchor on the whole string; ^/$ would also match an interior
          # line of a multi-line string and silently drop the text before it.
          found = /\Akau(.*)\Z/.match(word)
          return unless found

          found.captures.first
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
module Sastrawi
  module Morphology
    module Disambiguator
      # Prefix rule 5: for a word shaped "be" + consonant other than "r" +
      # "er" + consonant + rest, proposes the word without its leading "be".
      class DisambiguatorPrefixRule5
        # @param word [String] the word to inspect
        # @return [String, nil] the text after "be", or nil when the word does
        #   not have the required shape
        def disambiguate(word)
          # \A/\Z anchor on the whole string; ^/$ would also match an interior
          # line of a multi-line string and silently drop the text before it.
          # The first character class deliberately omits "r".
          found = /\Abe([bcdfghjklmnpqstvwxyz])(er[bcdfghjklmnpqrstvwxyz])(.*)\Z/.match(word)
          return unless found

          found.captures.join
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
module Sastrawi
  module Morphology
    module Disambiguator
      # Prefix rule 6a: for a word shaped "ter" + vowel + rest, proposes the
      # word without its leading "ter".
      class DisambiguatorPrefixRule6a
        # @param word [String] the word to inspect
        # @return [String, nil] vowel + rest, or nil when the word does not
        #   have the "ter" + vowel shape
        def disambiguate(word)
          # \A/\Z anchor on the whole string; ^/$ would also match an interior
          # line of a multi-line string and silently drop the text before it.
          found = /\Ater([aiueo].*)\Z/.match(word)
          return unless found

          found.captures.first
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
module Sastrawi
  module Morphology
    module Disambiguator
      # Prefix rule 6b: for a word shaped "ter" + vowel + rest, proposes
      # "r" + vowel + rest, i.e. the word without its leading "te".
      class DisambiguatorPrefixRule6b
        # @param word [String] the word to inspect
        # @return [String, nil] "r" + vowel + rest, or nil when the word does
        #   not have the "ter" + vowel shape
        def disambiguate(word)
          # \A/\Z anchor on the whole string; ^/$ would also match an interior
          # line of a multi-line string and silently drop the text before it.
          found = /\Ater([aiueo].*)\Z/.match(word)
          return unless found

          "r#{found.captures.first}"
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
module Sastrawi
  module Morphology
    module Disambiguator
      # Prefix rule 7: for a word shaped "ter" + consonant + "er" + vowel +
      # rest, proposes the word without its leading "ter", unless that
      # consonant is "r".
      class DisambiguatorPrefixRule7
        # @param word [String] the word to inspect
        # @return [String, nil] consonant + "er" + vowel + rest, or nil when
        #   the word does not match or the consonant is "r"
        def disambiguate(word)
          # \A/\Z anchor on the whole string; ^/$ would also match an interior
          # line of a multi-line string and silently drop the text before it.
          found = /\Ater([bcdfghjklmnpqrstvwxyz])er([aiueo].*)\Z/.match(word)
          return unless found

          consonant, tail = found.captures
          # The character class admits "r", but the rule rejects it.
          return if consonant == 'r'

          "#{consonant}er#{tail}"
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
module Sastrawi
  module Morphology
    module Disambiguator
      # Prefix rule 8: for a word shaped "ter" + consonant + rest, proposes the
      # word without its leading "ter", unless the consonant is "r" or the
      # rest begins with "er".
      class DisambiguatorPrefixRule8
        # @param word [String] the word to inspect
        # @return [String, nil] consonant + rest, or nil when the word does not
        #   match, the consonant is "r", or the rest starts with "er"
        def disambiguate(word)
          # \A/\Z anchor on the whole string; ^/$ would also match an interior
          # line of a multi-line string and silently drop the text before it.
          found = /\Ater([bcdfghjklmnpqrstvwxyz])(.*)\Z/.match(word)
          return unless found

          consonant, rest = found.captures
          # Both exclusions are part of the rule, not of the pattern.
          return if consonant == 'r' || rest.start_with?('er')

          "#{consonant}#{rest}"
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
module Sastrawi
  module Morphology
    module Disambiguator
      # Prefix rule 9: for a word shaped "te" + consonant + "er" + consonant +
      # rest, proposes the word without its leading "te", unless the first
      # consonant is "r".
      class DisambiguatorPrefixRule9
        # @param word [String] the word to inspect
        # @return [String, nil] the text after "te", or nil when the word does
        #   not match or the first consonant is "r"
        def disambiguate(word)
          # \A/\Z anchor on the whole string; ^/$ would also match an interior
          # line of a multi-line string and silently drop the text before it.
          found = /\Ate([bcdfghjklmnpqrstvwxyz])er([bcdfghjklmnpqrstvwxyz])(.*)\Z/.match(word)
          return unless found

          first, second, rest = found.captures
          # The character class admits "r", but the rule rejects it.
          return if first == 'r'

          "#{first}er#{second}#{rest}"
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
##
|
|
2
|
+
# Asian J. (2007) "Effective Techniques for Indonesian Text Retrieval". page 26
|
|
3
|
+
# http://researchbank.rmit.edu.au/eserv/rmit:6312/Asian.pdf
|
|
4
|
+
|
|
5
|
+
module Sastrawi
|
|
6
|
+
module Morphology
|
|
7
|
+
class InvalidAffixPairSpecification
|
|
8
|
+
def satisfied_by?(word)
|
|
9
|
+
return false if /^me(.*)kan$/.match(word)
|
|
10
|
+
|
|
11
|
+
return false if word == 'ketahui'
|
|
12
|
+
|
|
13
|
+
invalid_affixes = [
|
|
14
|
+
/^ber(.*)i$/, /^di(.*)an$/, /^ke(.*)i$/, /^ke(.*)an$/,
|
|
15
|
+
/^me(.*)an$/, /^me(.*)an$/, /^ter(.*)an$/, /^per(.*)an$/
|
|
16
|
+
]
|
|
17
|
+
|
|
18
|
+
matches = false
|
|
19
|
+
|
|
20
|
+
invalid_affixes.each do |invalid_affix|
|
|
21
|
+
matches = matches || !!(word =~ invalid_affix)
|
|
22
|
+
end
|
|
23
|
+
|
|
24
|
+
matches
|
|
25
|
+
end
|
|
26
|
+
end
|
|
27
|
+
end
|
|
28
|
+
end
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
module Sastrawi
  module Stemmer
    module Cache
      # In-memory key/value cache backed by a Hash. Keys are converted with
      # to_sym before every read and write, so "foo" and :foo address the
      # same entry.
      class ArrayCache
        # @return [Hash] the backing store, keyed by symbol
        attr_reader :data

        def initialize
          @data = {}
        end

        # Stores value under key.
        #
        # @param key [#to_sym]
        # @param value [Object]
        # @return [Object] the stored value
        def set(key, value)
          @data[key.to_sym] = value
        end

        # Looks up the value stored under key.
        #
        # @param key [#to_sym]
        # @return [Object, nil] the stored value, or nil when key is absent
        def get(key)
          # The backing Hash has no default, so a missing key already yields
          # nil; no separate key? check (and second to_sym) is needed.
          @data[key.to_sym]
        end

        # @param key [#to_sym]
        # @return [Boolean] whether an entry exists for key (even a nil one)
        def has?(key)
          @data.key?(key.to_sym)
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
require 'sastrawi/stemmer/filter/text_normalizer'
|
|
2
|
+
|
|
3
|
+
module Sastrawi
  module Stemmer
    # Stemmer decorator that remembers the stem of every word it has seen in
    # a cache and only calls the delegated stemmer for words not yet cached.
    class CachedStemmer
      attr_reader :cache, :delegated_stemmer

      # @param cache [#has?, #get, #set] store for word => stem results
      # @param delegated_stemmer [#stem] stemmer used on a cache miss
      def initialize(cache, delegated_stemmer)
        @cache = cache
        @delegated_stemmer = delegated_stemmer
      end

      # Passes text through TextNormalizer.normalize_text, splits the result
      # on whitespace, stems each word and joins the stems with single spaces.
      #
      # @param text [String] the text to stem
      # @return [String] the stems, space-separated, in input order
      def stem(text)
        normalized_text = Sastrawi::Stemmer::Filter::TextNormalizer.normalize_text(text)

        normalized_text.split(' ').map { |word| stem_word(word) }.join(' ')
      end

      private

      # Returns the cached stem of word, computing and caching it on a miss.
      def stem_word(word)
        return @cache.get(word) if @cache.has?(word)

        @delegated_stemmer.stem(word).tap { |stem| @cache.set(word, stem) }
      end
    end
  end
end
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
##
|
|
2
|
+
# Confix Stripping Rule Precedence Adjustment Specification
|
|
3
|
+
# Asian J. (2007) "Effective Techniques for Indonesian Text Retrieval" page 78-79
|
|
4
|
+
# http://researchbank.rmit.edu.au/eserv/rmit:6312/Asian.pdf
|
|
5
|
+
|
|
6
|
+
module Sastrawi
  module Stemmer
    module ConfixStripping
      # Tells whether a value has one of the prefix/suffix shapes listed in
      # PRECEDENCE_PATTERNS.
      class PrecedenceAdjustmentSpecification
        # Prefix ... suffix shapes this specification accepts. Anchored with
        # \A/\Z so that only the whole value is tested; ^/$ would also accept
        # a match on an interior line of a multi-line string.
        PRECEDENCE_PATTERNS = [
          /\Abe(.*)lah\Z/, /\Abe(.*)an\Z/, /\Ame(.*)i\Z/,
          /\Adi(.*)i\Z/, /\Ape(.*)i\Z/, /\Ater(.*)i\Z/
        ].freeze

        # @param value [String] the word to test
        # @return [Boolean] true when value matches at least one pattern
        def satisfied_by?(value)
          PRECEDENCE_PATTERNS.any? { |pattern| pattern.match(value) }
        end
      end
    end
  end
end
|