sastrawi 0.1.0.pre

Sign up to get free protection for your applications and to get access to all the features.
Files changed (89) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +50 -0
  3. data/.travis.yml +8 -0
  4. data/Gemfile +4 -0
  5. data/LICENSE.txt +21 -0
  6. data/README.md +70 -0
  7. data/Rakefile +6 -0
  8. data/data/kata-dasar.txt +29932 -0
  9. data/lib/sastrawi/dictionary/array_dictionary.rb +33 -0
  10. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule10.rb +17 -0
  11. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule11.rb +17 -0
  12. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule12.rb +17 -0
  13. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule13a.rb +17 -0
  14. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule13b.rb +17 -0
  15. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule14.rb +17 -0
  16. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule15a.rb +17 -0
  17. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule15b.rb +17 -0
  18. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule16.rb +17 -0
  19. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule17a.rb +17 -0
  20. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule17b.rb +17 -0
  21. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule17c.rb +17 -0
  22. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule17d.rb +17 -0
  23. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule18a.rb +17 -0
  24. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule18b.rb +17 -0
  25. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule19.rb +17 -0
  26. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule1a.rb +17 -0
  27. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule1b.rb +17 -0
  28. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule2.rb +19 -0
  29. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule20.rb +17 -0
  30. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule21a.rb +17 -0
  31. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule21b.rb +17 -0
  32. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule23.rb +19 -0
  33. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule24.rb +19 -0
  34. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule25.rb +17 -0
  35. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule26a.rb +17 -0
  36. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule26b.rb +17 -0
  37. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule27.rb +17 -0
  38. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule28a.rb +17 -0
  39. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule28b.rb +17 -0
  40. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule29.rb +17 -0
  41. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule3.rb +19 -0
  42. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule30a.rb +17 -0
  43. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule30b.rb +17 -0
  44. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule30c.rb +17 -0
  45. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule31a.rb +17 -0
  46. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule31b.rb +17 -0
  47. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule32.rb +19 -0
  48. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule34.rb +19 -0
  49. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule35.rb +17 -0
  50. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule36.rb +17 -0
  51. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule37a.rb +17 -0
  52. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule37b.rb +17 -0
  53. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule38a.rb +17 -0
  54. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule38b.rb +17 -0
  55. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule39a.rb +17 -0
  56. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule39b.rb +17 -0
  57. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule4.rb +11 -0
  58. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule40a.rb +17 -0
  59. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule40b.rb +17 -0
  60. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule41.rb +17 -0
  61. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule42.rb +17 -0
  62. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule5.rb +17 -0
  63. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule6a.rb +17 -0
  64. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule6b.rb +17 -0
  65. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule7.rb +19 -0
  66. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule8.rb +19 -0
  67. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule9.rb +19 -0
  68. data/lib/sastrawi/morphology/invalid_affix_pair_specification.rb +24 -0
  69. data/lib/sastrawi/stemmer/cache/array_cache.rb +25 -0
  70. data/lib/sastrawi/stemmer/cached_stemmer.rb +33 -0
  71. data/lib/sastrawi/stemmer/confix_stripping/precedence_adjustment_specification.rb +20 -0
  72. data/lib/sastrawi/stemmer/context/context.rb +170 -0
  73. data/lib/sastrawi/stemmer/context/removal.rb +17 -0
  74. data/lib/sastrawi/stemmer/context/visitor/dont_stem_short_word.rb +17 -0
  75. data/lib/sastrawi/stemmer/context/visitor/prefix_disambiguator.rb +46 -0
  76. data/lib/sastrawi/stemmer/context/visitor/remove_derivational_suffix.rb +28 -0
  77. data/lib/sastrawi/stemmer/context/visitor/remove_inflectional_particle.rb +26 -0
  78. data/lib/sastrawi/stemmer/context/visitor/remove_inflectional_possessive_pronoun.rb +26 -0
  79. data/lib/sastrawi/stemmer/context/visitor/remove_plain_prefix.rb +26 -0
  80. data/lib/sastrawi/stemmer/context/visitor/visitor_provider.rb +157 -0
  81. data/lib/sastrawi/stemmer/filter/text_normalizer.rb +15 -0
  82. data/lib/sastrawi/stemmer/stemmer.rb +85 -0
  83. data/lib/sastrawi/stemmer/stemmer_factory.rb +45 -0
  84. data/lib/sastrawi/stop_word_remover/stop_word_remover.rb +24 -0
  85. data/lib/sastrawi/stop_word_remover/stop_word_remover_factory.rb +152 -0
  86. data/lib/sastrawi/version.rb +3 -0
  87. data/lib/sastrawi.rb +12 -0
  88. data/sastrawi.gemspec +25 -0
  89. metadata +173 -0
@@ -0,0 +1,17 @@
1
module Sastrawi
  module Morphology
    module Disambiguator
      # Prefix disambiguation rule 30c: removes a leading "penge".
      class DisambiguatorPrefixRule30c
        # @param word [String] word to inspect
        # @return [String, nil] the text that follows "penge", or nil when
        #   the word does not match /^penge(.*)$/
        def disambiguate(word)
          found = /^penge(.*)$/.match(word)
          return unless found

          found[1]
        end
      end
    end
  end
end
@@ -0,0 +1,17 @@
1
module Sastrawi
  module Morphology
    module Disambiguator
      # Prefix disambiguation rule 31a: for a word of the form
      # "peny" + vowel + rest, drops the leading "pe" and keeps "ny".
      class DisambiguatorPrefixRule31a
        # @param word [String] word to inspect
        # @return [String, nil] "ny" + vowel + rest, or nil when the word
        #   does not match /^peny([aiueo])(.*)$/
        def disambiguate(word)
          found = /^peny([aiueo])(.*)$/.match(word)
          return unless found

          # Build a new string instead of appending to the 'ny' literal, so
          # the method also works when string literals are frozen.
          "ny#{found[1]}#{found[2]}"
        end
      end
    end
  end
end
@@ -0,0 +1,17 @@
1
module Sastrawi
  module Morphology
    module Disambiguator
      # Prefix disambiguation rule 31b: for a word of the form
      # "peny" + vowel + rest, replaces the leading "peny" with "s".
      class DisambiguatorPrefixRule31b
        # @param word [String] word to inspect
        # @return [String, nil] "s" + vowel + rest, or nil when the word
        #   does not match /^peny([aiueo])(.*)$/
        def disambiguate(word)
          found = /^peny([aiueo])(.*)$/.match(word)
          return unless found

          # Build a new string instead of appending to the 's' literal, so
          # the method also works when string literals are frozen.
          "s#{found[1]}#{found[2]}"
        end
      end
    end
  end
end
@@ -0,0 +1,19 @@
1
module Sastrawi
  module Morphology
    module Disambiguator
      # Prefix disambiguation rule 32: strips "pe" from words shaped like
      # "pel" + vowel + rest, with "pelajar" special-cased to "ajar".
      class DisambiguatorPrefixRule32
        # @param word [String] word to inspect
        # @return [String, nil] "ajar" for "pelajar"; otherwise "l" + vowel +
        #   rest when the word matches /^pe(l[aiueo])(.*)$/; nil otherwise
        def disambiguate(word)
          if word == 'pelajar'
            'ajar'
          elsif (found = /^pe(l[aiueo])(.*)$/.match(word))
            found.captures.join
          end
        end
      end
    end
  end
end
@@ -0,0 +1,19 @@
1
module Sastrawi
  module Morphology
    module Disambiguator
      # Prefix disambiguation rule 34: strips "pe" from words shaped like
      # "pe" + consonant + rest, unless rest begins with "er".
      class DisambiguatorPrefixRule34
        # @param word [String] word to inspect
        # @return [String, nil] consonant + rest, or nil when the word does
        #   not match or when the part after the consonant starts with "er"
        def disambiguate(word)
          found = /^pe([bcdfghjklmnpqrstvwxyz])(.*)$/.match(word)
          return unless found

          consonant, rest = found.captures
          # A following "er" is rejected here and yields nil.
          return if rest =~ /^er(.*)$/

          consonant + rest
        end
      end
    end
  end
end
@@ -0,0 +1,17 @@
1
module Sastrawi
  module Morphology
    module Disambiguator
      # Prefix disambiguation rule 35: strips "ter" from words shaped like
      # "ter" + C1 + "er" + C2 + rest, where C1 is one of "bcdfghjkpqstvxz"
      # and C2 is any consonant.
      class DisambiguatorPrefixRule35
        # @param word [String] word to inspect
        # @return [String, nil] the word without its leading "ter", or nil
        #   when the pattern does not match
        def disambiguate(word)
          found = /^ter([bcdfghjkpqstvxz])(er[bcdfghjklmnpqrstvwxyz])(.*)$/.match(word)

          found && found.captures.join
        end
      end
    end
  end
end
@@ -0,0 +1,17 @@
1
module Sastrawi
  module Morphology
    module Disambiguator
      # Prefix disambiguation rule 36: strips "pe" from words shaped like
      # "pe" + C1 + "er" + C2 + rest, where C1 is one of "bcdfghjkpqstvxz"
      # and C2 is any consonant.
      class DisambiguatorPrefixRule36
        # @param word [String] word to inspect
        # @return [String, nil] the word without its leading "pe", or nil
        #   when the pattern does not match
        def disambiguate(word)
          found = /^pe([bcdfghjkpqstvxz])(er[bcdfghjklmnpqrstvwxyz])(.*)$/.match(word)
          return unless found

          first, middle, tail = found.captures
          first + middle + tail
        end
      end
    end
  end
end
@@ -0,0 +1,17 @@
1
module Sastrawi
  module Morphology
    module Disambiguator
      # Disambiguation rule 37a: recognises words shaped like
      # consonant + "er" + vowel + rest and returns them unchanged.
      # The leading consonant class does not include "r".
      class DisambiguatorPrefixRule37a
        # @param word [String] word to inspect
        # @return [String, nil] the three captured pieces joined back
        #   together, or nil when the pattern does not match
        def disambiguate(word)
          found = /^([bcdfghjklmnpqstvwxyz])(er[aiueo])(.*)$/.match(word)
          return unless found

          found.captures.join
        end
      end
    end
  end
end
@@ -0,0 +1,17 @@
1
module Sastrawi
  module Morphology
    module Disambiguator
      # Disambiguation rule 37b: for words shaped like
      # consonant + "er" + vowel + rest, removes the "er".
      # The leading consonant class does not include "r".
      class DisambiguatorPrefixRule37b
        # @param word [String] word to inspect
        # @return [String, nil] consonant + vowel + rest, or nil when the
        #   pattern does not match
        def disambiguate(word)
          found = /^([bcdfghjklmnpqstvwxyz])er([aiueo])(.*)$/.match(word)

          found && found.captures.join
        end
      end
    end
  end
end
@@ -0,0 +1,17 @@
1
module Sastrawi
  module Morphology
    module Disambiguator
      # Disambiguation rule 38a: recognises words shaped like
      # consonant + "el" + vowel + rest and returns them unchanged.
      # The leading consonant class does not include "r".
      class DisambiguatorPrefixRule38a
        # @param word [String] word to inspect
        # @return [String, nil] the three captured pieces joined back
        #   together, or nil when the pattern does not match
        def disambiguate(word)
          found = /^([bcdfghjklmnpqstvwxyz])(el[aiueo])(.*)$/.match(word)
          return unless found

          head, infix, tail = found.captures
          head + infix + tail
        end
      end
    end
  end
end
@@ -0,0 +1,17 @@
1
module Sastrawi
  module Morphology
    module Disambiguator
      # Disambiguation rule 38b: for words shaped like
      # consonant + "el" + vowel + rest, removes the "el".
      # The leading consonant class does not include "r".
      class DisambiguatorPrefixRule38b
        # @param word [String] word to inspect
        # @return [String, nil] consonant + vowel + rest, or nil when the
        #   pattern does not match
        def disambiguate(word)
          found = /^([bcdfghjklmnpqstvwxyz])el([aiueo])(.*)$/.match(word)
          return unless found

          found.captures.join
        end
      end
    end
  end
end
@@ -0,0 +1,17 @@
1
module Sastrawi
  module Morphology
    module Disambiguator
      # Disambiguation rule 39a: recognises words shaped like
      # consonant + "em" + vowel + rest and returns them unchanged.
      # The leading consonant class does not include "r".
      class DisambiguatorPrefixRule39a
        # @param word [String] word to inspect
        # @return [String, nil] the three captured pieces joined back
        #   together, or nil when the pattern does not match
        def disambiguate(word)
          found = /^([bcdfghjklmnpqstvwxyz])(em[aiueo])(.*)$/.match(word)

          found && found.captures.join
        end
      end
    end
  end
end
@@ -0,0 +1,17 @@
1
module Sastrawi
  module Morphology
    module Disambiguator
      # Disambiguation rule 39b: for words shaped like
      # consonant + "em" + vowel + rest, removes the "em".
      # The leading consonant class does not include "r".
      class DisambiguatorPrefixRule39b
        # @param word [String] word to inspect
        # @return [String, nil] consonant + vowel + rest, or nil when the
        #   pattern does not match
        def disambiguate(word)
          found = /^([bcdfghjklmnpqstvwxyz])em([aiueo])(.*)$/.match(word)
          return unless found

          head, vowel, tail = found.captures
          head + vowel + tail
        end
      end
    end
  end
end
@@ -0,0 +1,11 @@
1
module Sastrawi
  module Morphology
    module Disambiguator
      # Prefix disambiguation rule 4: handles only the word "belajar".
      class DisambiguatorPrefixRule4
        # @param word [String] word to inspect
        # @return [String, nil] "ajar" when the word equals "belajar",
        #   nil for every other input
        def disambiguate(word)
          word == 'belajar' ? 'ajar' : nil
        end
      end
    end
  end
end
@@ -0,0 +1,17 @@
1
module Sastrawi
  module Morphology
    module Disambiguator
      # Disambiguation rule 40a: recognises words shaped like
      # consonant + "in" + vowel + rest and returns them unchanged.
      # The leading consonant class does not include "r".
      class DisambiguatorPrefixRule40a
        # @param word [String] word to inspect
        # @return [String, nil] the three captured pieces joined back
        #   together, or nil when the pattern does not match
        def disambiguate(word)
          found = /^([bcdfghjklmnpqstvwxyz])(in[aiueo])(.*)$/.match(word)
          return unless found

          found.captures.join
        end
      end
    end
  end
end
@@ -0,0 +1,17 @@
1
module Sastrawi
  module Morphology
    module Disambiguator
      # Disambiguation rule 40b: for words shaped like
      # consonant + "in" + vowel + rest, removes the "in".
      # The leading consonant class does not include "r".
      class DisambiguatorPrefixRule40b
        # @param word [String] word to inspect
        # @return [String, nil] consonant + vowel + rest, or nil when the
        #   pattern does not match
        def disambiguate(word)
          found = /^([bcdfghjklmnpqstvwxyz])in([aiueo])(.*)$/.match(word)

          found && found.captures.join
        end
      end
    end
  end
end
@@ -0,0 +1,17 @@
1
module Sastrawi
  module Morphology
    module Disambiguator
      # Prefix disambiguation rule 41: removes a leading "ku".
      class DisambiguatorPrefixRule41
        # @param word [String] word to inspect
        # @return [String, nil] the text that follows "ku", or nil when the
        #   word does not match /^ku(.*)$/
        def disambiguate(word)
          found = /^ku(.*)$/.match(word)

          found ? found[1] : nil
        end
      end
    end
  end
end
@@ -0,0 +1,17 @@
1
module Sastrawi
  module Morphology
    module Disambiguator
      # Prefix disambiguation rule 42: removes a leading "kau".
      class DisambiguatorPrefixRule42
        # @param word [String] word to inspect
        # @return [String, nil] the text that follows "kau", or nil when the
        #   word does not match /^kau(.*)$/
        def disambiguate(word)
          found = /^kau(.*)$/.match(word)
          return unless found

          found.captures.first
        end
      end
    end
  end
end
@@ -0,0 +1,17 @@
1
module Sastrawi
  module Morphology
    module Disambiguator
      # Prefix disambiguation rule 5: strips "be" from words shaped like
      # "be" + C1 + "er" + C2 + rest, where C1 and C2 are consonants.
      class DisambiguatorPrefixRule5
        # @param word [String] word to inspect
        # @return [String, nil] the word without its leading "be", or nil
        #   when the pattern does not match
        def disambiguate(word)
          found = /^be([bcdfghjklmnpqrstvwxyz])(er[bcdfghjklmnpqrstvwxyz])(.*)$/.match(word)
          return unless found

          found.captures.join
        end
      end
    end
  end
end
@@ -0,0 +1,17 @@
1
module Sastrawi
  module Morphology
    module Disambiguator
      # Prefix disambiguation rule 6a: strips "ter" from words where "ter"
      # is followed by a vowel.
      class DisambiguatorPrefixRule6a
        # @param word [String] word to inspect
        # @return [String, nil] the text after "ter" (starting with the
        #   vowel), or nil when the word does not match /^ter([aiueo].*)$/
        def disambiguate(word)
          found = /^ter([aiueo].*)$/.match(word)

          found && found[1]
        end
      end
    end
  end
end
@@ -0,0 +1,17 @@
1
module Sastrawi
  module Morphology
    module Disambiguator
      # Prefix disambiguation rule 6b: for words where "ter" is followed by
      # a vowel, strips only "te" so the result keeps the "r".
      class DisambiguatorPrefixRule6b
        # @param word [String] word to inspect
        # @return [String, nil] "r" + the text after "ter", or nil when the
        #   word does not match /^ter([aiueo].*)$/
        def disambiguate(word)
          found = /^ter([aiueo].*)$/.match(word)
          return unless found

          # Build a new string instead of appending to the 'r' literal, so
          # the method also works when string literals are frozen.
          "r#{found[1]}"
        end
      end
    end
  end
end
@@ -0,0 +1,19 @@
1
module Sastrawi
  module Morphology
    module Disambiguator
      # Prefix disambiguation rule 7: strips "ter" from words shaped like
      # "ter" + consonant + "er" + vowel + rest, unless the consonant is "r".
      class DisambiguatorPrefixRule7
        # @param word [String] word to inspect
        # @return [String, nil] consonant + "er" + vowel + rest; nil when the
        #   pattern does not match or the consonant is "r"
        def disambiguate(word)
          found = /^ter([bcdfghjklmnpqrstvwxyz])er([aiueo].*)$/.match(word)
          return unless found

          consonant, tail = found.captures
          return if consonant == 'r'

          [consonant, 'er', tail].join
        end
      end
    end
  end
end
@@ -0,0 +1,19 @@
1
module Sastrawi
  module Morphology
    module Disambiguator
      # Prefix disambiguation rule 8: strips "ter" from words shaped like
      # "ter" + consonant + rest, unless the consonant is "r" or rest
      # begins with "er".
      class DisambiguatorPrefixRule8
        # @param word [String] word to inspect
        # @return [String, nil] consonant + rest; nil when the pattern does
        #   not match, the consonant is "r", or rest starts with "er"
        def disambiguate(word)
          found = /^ter([bcdfghjklmnpqrstvwxyz])(.*)$/.match(word)
          return unless found

          consonant, rest = found.captures
          rejected = consonant == 'r' || rest =~ /^er(.*)$/

          rejected ? nil : consonant + rest
        end
      end
    end
  end
end
@@ -0,0 +1,19 @@
1
module Sastrawi
  module Morphology
    module Disambiguator
      # Prefix disambiguation rule 9: strips "te" from words shaped like
      # "te" + C1 + "er" + C2 + rest, where C1 and C2 are consonants and
      # C1 is not "r".
      class DisambiguatorPrefixRule9
        # @param word [String] word to inspect
        # @return [String, nil] C1 + "er" + C2 + rest; nil when the pattern
        #   does not match or C1 is "r"
        def disambiguate(word)
          # C2 and the remainder are captured as two sibling groups. Nesting
          # the C2 group inside the remainder group would capture C2 twice
          # and append it again at the end of the result.
          found = /^te([bcdfghjklmnpqrstvwxyz])er([bcdfghjklmnpqrstvwxyz])(.*)$/.match(word)
          return unless found

          first, second, rest = found.captures
          return if first == 'r'

          "#{first}er#{second}#{rest}"
        end
      end
    end
  end
end
@@ -0,0 +1,24 @@
1
module Sastrawi
  module Morphology
    # Specification that reports whether a word carries one of the listed
    # prefix/suffix combinations.
    class InvalidAffixPairSpecification
      # Prefix/suffix shapes that make the specification satisfied.
      # Built once and frozen instead of being re-created on every call.
      # NOTE(review): /^me(.*)an$/ used to be listed twice; the duplicate
      # may have been meant as a different pair — confirm against the
      # algorithm specification.
      INVALID_AFFIX_PATTERNS = [
        /^ber(.*)i$/, /^di(.*)an$/, /^ke(.*)i$/, /^ke(.*)an$/,
        /^me(.*)an$/, /^ter(.*)an$/, /^per(.*)an$/
      ].freeze

      # @param word [String] word to test
      # @return [Boolean] false for words matching /^me(.*)kan$/ and for
      #   "ketahui"; otherwise true when the word matches any pattern in
      #   INVALID_AFFIX_PATTERNS, false when it matches none
      def satisfied_by?(word)
        return false if /^me(.*)kan$/.match(word)
        return false if word == 'ketahui'

        # any? stops at the first matching pattern.
        INVALID_AFFIX_PATTERNS.any? { |pattern| word =~ pattern }
      end
    end
  end
end
@@ -0,0 +1,25 @@
1
module Sastrawi
  module Stemmer
    module Cache
      # In-memory cache backed by a Hash. Every key is converted with
      # #to_sym before it is used.
      class ArrayCache
        attr_accessor :data

        # Starts with an empty backing Hash.
        def initialize
          @data = {}
        end

        # Stores +value+ under +key+.
        #
        # @param key [#to_sym] cache key
        # @param value [Object] value to remember
        # @return [Object] the stored value
        def set(key, value)
          @data.store(key.to_sym, value)
        end

        # Looks up the value stored under +key+.
        #
        # @param key [#to_sym] cache key
        # @return [Object, nil] the stored value, or nil when the key is absent
        def get(key)
          @data.fetch(key.to_sym, nil)
        end

        # @param key [#to_sym] cache key
        # @return [Boolean] whether a value has been stored under +key+
        def has?(key)
          @data.key?(key.to_sym)
        end
      end
    end
  end
end
@@ -0,0 +1,33 @@
1
+ require 'sastrawi/stemmer/filter/text_normalizer'
2
+
3
module Sastrawi
  module Stemmer
    # Stemmer wrapper that looks each word up in a cache before handing it
    # to the wrapped stemmer, and records freshly computed stems.
    class CachedStemmer
      attr_accessor :cache, :delegated_stemmer

      # @param cache [#has?, #get, #set] store for already-computed stems
      # @param delegated_stemmer [#stem] stemmer used on cache misses
      def initialize(cache, delegated_stemmer)
        @cache = cache
        @delegated_stemmer = delegated_stemmer
      end

      # Normalizes +text+ with TextNormalizer, splits it on spaces and stems
      # each word, consulting the cache first.
      #
      # @param text [String] text to stem
      # @return [String] the stems joined with single spaces
      def stem(text)
        normalized = Sastrawi::Stemmer::Filter::TextNormalizer.normalize_text(text)

        results = normalized.split(' ').map do |word|
          next @cache.get(word) if @cache.has?(word)

          # Cache miss: compute the stem, remember it, then use it.
          @delegated_stemmer.stem(word).tap { |fresh| @cache.set(word, fresh) }
        end

        results.join(' ')
      end
    end
  end
end
@@ -0,0 +1,20 @@
1
module Sastrawi
  module Stemmer
    module ConfixStripping
      # Specification that is satisfied when a value matches one of a fixed
      # set of prefix/suffix patterns.
      class PrecedenceAdjustmentSpecification
        # @param value [String] word to test
        # @return [Boolean] true when +value+ matches at least one of the
        #   listed patterns, false otherwise
        def satisfied_by?(value)
          [
            /^be(.*)lah$/, /^be(.*)an$/, /^me(.*)i$/,
            /^di(.*)i$/, /^pe(.*)i$/, /^ter(.*)i$/
          ].any? { |pattern| pattern.match(value) }
        end
      end
    end
  end
end