sastrawi-ruby 0.2.1 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (82)
  1. checksums.yaml +4 -4
  2. data/.gitignore +1 -0
  3. data/MILESTONES.md +12 -0
  4. data/data/base-word.txt +17 -1
  5. data/data/stop-words.txt +842 -0
  6. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule10.rb +2 -0
  7. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule11.rb +2 -0
  8. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule12.rb +2 -0
  9. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule13a.rb +2 -0
  10. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule13b.rb +2 -0
  11. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule14.rb +2 -0
  12. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule15a.rb +2 -0
  13. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule15b.rb +2 -0
  14. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule16.rb +2 -0
  15. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule17a.rb +2 -0
  16. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule17b.rb +2 -0
  17. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule17c.rb +2 -0
  18. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule17d.rb +2 -0
  19. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule18a.rb +2 -0
  20. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule18b.rb +2 -0
  21. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule19.rb +2 -0
  22. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule1a.rb +2 -0
  23. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule1b.rb +2 -0
  24. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule2.rb +2 -0
  25. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule20.rb +2 -0
  26. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule21a.rb +2 -0
  27. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule21b.rb +2 -0
  28. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule23.rb +2 -0
  29. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule24.rb +2 -0
  30. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule25.rb +2 -0
  31. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule26a.rb +2 -0
  32. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule26b.rb +2 -0
  33. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule27.rb +2 -0
  34. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule28a.rb +2 -0
  35. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule28b.rb +2 -0
  36. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule29.rb +2 -0
  37. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule3.rb +2 -0
  38. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule30a.rb +2 -0
  39. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule30b.rb +2 -0
  40. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule30c.rb +2 -0
  41. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule31a.rb +2 -0
  42. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule31b.rb +2 -0
  43. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule32.rb +2 -0
  44. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule34.rb +2 -0
  45. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule35.rb +2 -0
  46. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule36.rb +2 -0
  47. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule37a.rb +2 -0
  48. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule37b.rb +2 -0
  49. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule38a.rb +2 -0
  50. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule38b.rb +2 -0
  51. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule39a.rb +2 -0
  52. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule39b.rb +2 -0
  53. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule4.rb +2 -0
  54. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule40a.rb +2 -0
  55. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule40b.rb +2 -0
  56. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule41.rb +2 -0
  57. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule42.rb +2 -0
  58. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule5.rb +2 -0
  59. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule6a.rb +2 -0
  60. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule6b.rb +2 -0
  61. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule7.rb +2 -0
  62. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule8.rb +2 -0
  63. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule9.rb +2 -0
  64. data/lib/sastrawi/morphology/invalid_affix_pair_specification.rb +2 -0
  65. data/lib/sastrawi/stemmer/cached_stemmer.rb +2 -0
  66. data/lib/sastrawi/stemmer/confix_stripping/precedence_adjustment_specification.rb +2 -0
  67. data/lib/sastrawi/stemmer/context/context.rb +2 -0
  68. data/lib/sastrawi/stemmer/context/removal.rb +2 -0
  69. data/lib/sastrawi/stemmer/context/visitor/dont_stem_short_word.rb +2 -0
  70. data/lib/sastrawi/stemmer/context/visitor/prefix_disambiguator.rb +2 -0
  71. data/lib/sastrawi/stemmer/context/visitor/remove_derivational_suffix.rb +2 -0
  72. data/lib/sastrawi/stemmer/context/visitor/remove_inflectional_particle.rb +2 -0
  73. data/lib/sastrawi/stemmer/context/visitor/remove_inflectional_possessive_pronoun.rb +2 -0
  74. data/lib/sastrawi/stemmer/context/visitor/remove_plain_prefix.rb +2 -0
  75. data/lib/sastrawi/stemmer/context/visitor/visitor_provider.rb +2 -0
  76. data/lib/sastrawi/stemmer/stemmer.rb +8 -0
  77. data/lib/sastrawi/stemmer/stemmer_factory.rb +2 -0
  78. data/lib/sastrawi/stop_word_remover/stop_word_remover.rb +2 -0
  79. data/lib/sastrawi/stop_word_remover/stop_word_remover_factory.rb +19 -107
  80. data/lib/sastrawi/version.rb +1 -1
  81. data/lib/sastrawi.rb +2 -0
  82. metadata +3 -1
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module Sastrawi
2
4
  module Morphology
3
5
  module Disambiguator
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module Sastrawi
2
4
  module Morphology
3
5
  module Disambiguator
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module Sastrawi
2
4
  module Morphology
3
5
  module Disambiguator
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module Sastrawi
2
4
  module Morphology
3
5
  module Disambiguator
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module Sastrawi
2
4
  module Morphology
3
5
  module Disambiguator
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module Sastrawi
2
4
  module Morphology
3
5
  module Disambiguator
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module Sastrawi
2
4
  module Morphology
3
5
  module Disambiguator
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module Sastrawi
2
4
  module Morphology
3
5
  module Disambiguator
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module Sastrawi
2
4
  module Morphology
3
5
  module Disambiguator
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module Sastrawi
2
4
  module Morphology
3
5
  module Disambiguator
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module Sastrawi
2
4
  module Morphology
3
5
  module Disambiguator
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module Sastrawi
2
4
  module Morphology
3
5
  module Disambiguator
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module Sastrawi
2
4
  module Morphology
3
5
  module Disambiguator
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module Sastrawi
2
4
  module Morphology
3
5
  module Disambiguator
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module Sastrawi
2
4
  module Morphology
3
5
  module Disambiguator
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module Sastrawi
2
4
  module Morphology
3
5
  module Disambiguator
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module Sastrawi
2
4
  module Morphology
3
5
  module Disambiguator
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module Sastrawi
2
4
  module Morphology
3
5
  module Disambiguator
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module Sastrawi
2
4
  module Morphology
3
5
  module Disambiguator
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module Sastrawi
2
4
  module Morphology
3
5
  module Disambiguator
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module Sastrawi
2
4
  module Morphology
3
5
  module Disambiguator
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module Sastrawi
2
4
  module Morphology
3
5
  module Disambiguator
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module Sastrawi
2
4
  module Morphology
3
5
  module Disambiguator
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module Sastrawi
2
4
  module Morphology
3
5
  module Disambiguator
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module Sastrawi
2
4
  module Morphology
3
5
  module Disambiguator
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module Sastrawi
2
4
  module Morphology
3
5
  module Disambiguator
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module Sastrawi
2
4
  module Morphology
3
5
  module Disambiguator
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module Sastrawi
2
4
  module Morphology
3
5
  module Disambiguator
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module Sastrawi
2
4
  module Morphology
3
5
  module Disambiguator
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module Sastrawi
2
4
  module Morphology
3
5
  module Disambiguator
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module Sastrawi
2
4
  module Morphology
3
5
  module Disambiguator
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module Sastrawi
2
4
  module Morphology
3
5
  module Disambiguator
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module Sastrawi
2
4
  module Morphology
3
5
  module Disambiguator
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module Sastrawi
2
4
  module Morphology
3
5
  module Disambiguator
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module Sastrawi
2
4
  module Morphology
3
5
  module Disambiguator
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module Sastrawi
2
4
  module Morphology
3
5
  module Disambiguator
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module Sastrawi
2
4
  module Morphology
3
5
  module Disambiguator
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module Sastrawi
2
4
  module Morphology
3
5
  module Disambiguator
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module Sastrawi
2
4
  module Morphology
3
5
  module Disambiguator
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module Sastrawi
2
4
  module Morphology
3
5
  module Disambiguator
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module Sastrawi
2
4
  module Morphology
3
5
  module Disambiguator
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module Sastrawi
2
4
  module Morphology
3
5
  module Disambiguator
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module Sastrawi
2
4
  module Morphology
3
5
  module Disambiguator
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module Sastrawi
2
4
  module Morphology
3
5
  module Disambiguator
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module Sastrawi
2
4
  module Morphology
3
5
  module Disambiguator
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module Sastrawi
2
4
  module Morphology
3
5
  module Disambiguator
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module Sastrawi
2
4
  module Morphology
3
5
  module Disambiguator
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module Sastrawi
2
4
  module Morphology
3
5
  module Disambiguator
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module Sastrawi
2
4
  module Morphology
3
5
  module Disambiguator
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module Sastrawi
2
4
  module Morphology
3
5
  module Disambiguator
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module Sastrawi
2
4
  module Morphology
3
5
  module Disambiguator
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module Sastrawi
2
4
  module Morphology
3
5
  module Disambiguator
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module Sastrawi
2
4
  module Morphology
3
5
  module Disambiguator
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module Sastrawi
2
4
  module Morphology
3
5
  module Disambiguator
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module Sastrawi
2
4
  module Morphology
3
5
  module Disambiguator
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module Sastrawi
2
4
  module Morphology
3
5
  module Disambiguator
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module Sastrawi
2
4
  module Morphology
3
5
  module Disambiguator
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module Sastrawi
2
4
  module Morphology
3
5
  module Disambiguator
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  ##
2
4
  # Asian J. (2007) "Effective Techniques for Indonesian Text Retrieval". page 26
3
5
  # http://researchbank.rmit.edu.au/eserv/rmit:6312/Asian.pdf
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require 'sastrawi/stemmer/filter/text_normalizer'
2
4
 
3
5
  module Sastrawi
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  ##
2
4
  # Confix Stripping Rule Precedence Adjustment Specification
3
5
  # Asian J. (2007) "Effective Techniques for Indonesian Text Retrieval" page 78-79
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require 'sastrawi/stemmer/confix_stripping/precedence_adjustment_specification'
2
4
 
3
5
  ##
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module Sastrawi
2
4
  module Stemmer
3
5
  module Context
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module Sastrawi
2
4
  module Stemmer
3
5
  module Context
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module Sastrawi
2
4
  module Stemmer
3
5
  module Context
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require 'sastrawi/stemmer/context/removal'
2
4
 
3
5
  ##
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  ##
2
4
  # Remove inflectional particle
3
5
  # Asian J. (2007) "Effective Techniques for Indonesian Text Retrieval" page 60
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  ##
2
4
  # Remove inflectional possessive pronoun
3
5
  # Asian J. (2007) "Effective Techniques for Indonesian Text Retrieval" page 60
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  ##
2
4
  # Remove plain prefix
3
5
  # Asian J. (2007) "Effective Techniques for Indonesian Text Retrieval" page 61
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require 'sastrawi/stemmer/context/visitor/dont_stem_short_word'
2
4
  require 'sastrawi/stemmer/context/visitor/remove_inflectional_particle'
3
5
  require 'sastrawi/stemmer/context/visitor/remove_inflectional_possessive_pronoun'
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require 'sastrawi/stemmer/context/context'
2
4
 
3
5
  require 'sastrawi/stemmer/context/visitor/visitor_provider'
@@ -84,6 +86,12 @@ module Sastrawi
84
86
 
85
87
  if root_first_word == root_second_word
86
88
  root_first_word
89
+ elsif @dictionary.contains?(root_second_word)
90
+ # Handle partial/rhyming reduplication (bolak-balik, sayur-mayur, lauk-pauk)
91
+ # Prefer the second word's stem when it's a dictionary word
92
+ root_second_word
93
+ elsif @dictionary.contains?(root_first_word)
94
+ root_first_word
87
95
  else
88
96
  word
89
97
  end
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require 'sastrawi/dictionary/array_dictionary'
2
4
 
3
5
  require 'sastrawi/stemmer/cached_stemmer'
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module Sastrawi
2
4
  module StopWordRemover
3
5
  class StopWordRemover