sastrawi-ruby 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (93) hide show
  1. checksums.yaml +7 -0
  2. data/.github/workflows/ci.yml +23 -0
  3. data/.gitignore +51 -0
  4. data/.travis.yml +10 -0
  5. data/CONTRIBUTING.md +22 -0
  6. data/Gemfile +4 -0
  7. data/LICENSE.txt +21 -0
  8. data/README.md +104 -0
  9. data/Rakefile +6 -0
  10. data/_config.yml +1 -0
  11. data/bin/sastrawi +24 -0
  12. data/data/base-word.txt +29933 -0
  13. data/lib/sastrawi/dictionary/array_dictionary.rb +67 -0
  14. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule10.rb +17 -0
  15. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule11.rb +17 -0
  16. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule12.rb +17 -0
  17. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule13a.rb +17 -0
  18. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule13b.rb +17 -0
  19. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule14.rb +17 -0
  20. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule15a.rb +17 -0
  21. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule15b.rb +17 -0
  22. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule16.rb +17 -0
  23. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule17a.rb +17 -0
  24. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule17b.rb +17 -0
  25. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule17c.rb +17 -0
  26. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule17d.rb +17 -0
  27. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule18a.rb +17 -0
  28. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule18b.rb +17 -0
  29. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule19.rb +17 -0
  30. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule1a.rb +17 -0
  31. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule1b.rb +17 -0
  32. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule2.rb +19 -0
  33. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule20.rb +17 -0
  34. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule21a.rb +17 -0
  35. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule21b.rb +17 -0
  36. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule23.rb +19 -0
  37. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule24.rb +19 -0
  38. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule25.rb +17 -0
  39. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule26a.rb +17 -0
  40. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule26b.rb +17 -0
  41. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule27.rb +17 -0
  42. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule28a.rb +17 -0
  43. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule28b.rb +17 -0
  44. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule29.rb +17 -0
  45. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule3.rb +19 -0
  46. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule30a.rb +17 -0
  47. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule30b.rb +17 -0
  48. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule30c.rb +17 -0
  49. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule31a.rb +17 -0
  50. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule31b.rb +17 -0
  51. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule32.rb +19 -0
  52. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule34.rb +19 -0
  53. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule35.rb +17 -0
  54. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule36.rb +17 -0
  55. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule37a.rb +17 -0
  56. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule37b.rb +17 -0
  57. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule38a.rb +17 -0
  58. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule38b.rb +17 -0
  59. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule39a.rb +17 -0
  60. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule39b.rb +17 -0
  61. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule4.rb +11 -0
  62. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule40a.rb +17 -0
  63. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule40b.rb +17 -0
  64. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule41.rb +17 -0
  65. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule42.rb +17 -0
  66. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule5.rb +17 -0
  67. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule6a.rb +17 -0
  68. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule6b.rb +17 -0
  69. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule7.rb +19 -0
  70. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule8.rb +19 -0
  71. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule9.rb +19 -0
  72. data/lib/sastrawi/morphology/invalid_affix_pair_specification.rb +28 -0
  73. data/lib/sastrawi/stemmer/cache/array_cache.rb +25 -0
  74. data/lib/sastrawi/stemmer/cached_stemmer.rb +33 -0
  75. data/lib/sastrawi/stemmer/confix_stripping/precedence_adjustment_specification.rb +25 -0
  76. data/lib/sastrawi/stemmer/context/context.rb +217 -0
  77. data/lib/sastrawi/stemmer/context/removal.rb +17 -0
  78. data/lib/sastrawi/stemmer/context/visitor/dont_stem_short_word.rb +17 -0
  79. data/lib/sastrawi/stemmer/context/visitor/prefix_disambiguator.rb +54 -0
  80. data/lib/sastrawi/stemmer/context/visitor/remove_derivational_suffix.rb +37 -0
  81. data/lib/sastrawi/stemmer/context/visitor/remove_inflectional_particle.rb +34 -0
  82. data/lib/sastrawi/stemmer/context/visitor/remove_inflectional_possessive_pronoun.rb +34 -0
  83. data/lib/sastrawi/stemmer/context/visitor/remove_plain_prefix.rb +34 -0
  84. data/lib/sastrawi/stemmer/context/visitor/visitor_provider.rb +157 -0
  85. data/lib/sastrawi/stemmer/filter/text_normalizer.rb +15 -0
  86. data/lib/sastrawi/stemmer/stemmer.rb +101 -0
  87. data/lib/sastrawi/stemmer/stemmer_factory.rb +49 -0
  88. data/lib/sastrawi/stop_word_remover/stop_word_remover.rb +27 -0
  89. data/lib/sastrawi/stop_word_remover/stop_word_remover_factory.rb +124 -0
  90. data/lib/sastrawi/version.rb +5 -0
  91. data/lib/sastrawi.rb +4 -0
  92. data/sastrawi.gemspec +34 -0
  93. metadata +179 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 293b0a6274835ba56e5df1e6c0a96b725e026ef703af178b7f791c3763b723c3
4
+ data.tar.gz: 2b74b53603072b2e4171abfc8117308dae795a78b2e0071fd73a7961e732808b
5
+ SHA512:
6
+ metadata.gz: dc74bcdbcb213b19c26e0dd6480077e69ae8dec455911b5004af662547fe0e7bd5af72943615025025d244b9f7532e1f6b062bf213b99ae8f0e96b7f8c8db73b
7
+ data.tar.gz: eb5807126530f67a26935fd85130206ee08da1a4817d0e1d1f7151b011dc6533f6ef16b4a41673a5c2af5ec85a776d0be84e77ee024da15a730d7b090786dab9
@@ -0,0 +1,23 @@
1
+ name: CI
2
+
3
+ on:
4
+ push:
5
+ branches: [master]
6
+ pull_request:
7
+ branches: [master]
8
+
9
+ jobs:
10
+ test:
11
+ runs-on: ubuntu-latest
12
+ strategy:
13
+ matrix:
14
+ ruby-version: ['3.0', '3.1', '3.2', '3.3', '3.4']
15
+ steps:
16
+ - uses: actions/checkout@v4
17
+ - name: Set up Ruby ${{ matrix.ruby-version }}
18
+ uses: ruby/setup-ruby@v1
19
+ with:
20
+ ruby-version: ${{ matrix.ruby-version }}
21
+ bundler-cache: true
22
+ - name: Run tests
23
+ run: bundle exec rspec
data/.gitignore ADDED
@@ -0,0 +1,51 @@
1
+ *.gem
2
+ *.rbc
3
+ /.config
4
+ /coverage/
5
+ /InstalledFiles
6
+ /pkg/
7
+ /spec/reports/
8
+ /spec/examples.txt
9
+ /test/tmp/
10
+ /test/version_tmp/
11
+ /tmp/
12
+
13
+ # Used by dotenv library to load environment variables.
14
+ # .env
15
+
16
+ ## Specific to RubyMotion:
17
+ .dat*
18
+ .repl_history
19
+ build/
20
+ *.bridgesupport
21
+ build-iPhoneOS/
22
+ build-iPhoneSimulator/
23
+
24
+ ## Specific to RubyMotion (use of CocoaPods):
25
+ #
26
+ # We recommend against adding the Pods directory to your .gitignore. However
27
+ # you should judge for yourself, the pros and cons are mentioned at:
28
+ # https://guides.cocoapods.org/using/using-cocoapods.html#should-i-check-the-pods-directory-into-source-control
29
+ #
30
+ # vendor/Pods/
31
+
32
+ ## Documentation cache and generated files:
33
+ /.yardoc/
34
+ /_yardoc/
35
+ /doc/
36
+ /rdoc/
37
+
38
+ ## Environment normalization:
39
+ /.bundle/
40
+ /vendor/bundle
41
+ /lib/bundler/man/
42
+
43
+ # for a library or gem, you might want to ignore these files since the code is
44
+ # intended to run in multiple environments; otherwise, check them in:
45
+ Gemfile.lock
46
+ # .ruby-version
47
+ # .ruby-gemset
48
+
49
+ # unless supporting rvm < 1.11.0 or doing something fancy, ignore this:
50
+ .rvmrc
51
+ CLAUDE.md
data/.travis.yml ADDED
@@ -0,0 +1,10 @@
1
+ sudo: false
2
+ language: ruby
3
+ rvm:
4
+ - 2.3.8
5
+ - 2.4.5
6
+ - 2.5.3
7
+ before_install: gem install bundler -v 2.2.14 --no-document
8
+ notifications:
9
+ email:
10
+ on_success: never
data/CONTRIBUTING.md ADDED
@@ -0,0 +1,22 @@
1
+ # Contributing to sastrawi-ruby
2
+
3
+ If you find a bug, please report it to the [issue tracker][issue]. You can also
4
+ contribute by writing code.
5
+
6
+ ## How to contribute
7
+
8
+ There are some steps you must follow:
9
+
10
+ 1. Fork this repository and clone it to your local environment
11
+ 2. Create a named branch that contains your change
12
+ 3. Install the development dependencies by running `bundle install`
13
+ 4. Code
14
+ 5. Add or adjust unit tests and make sure everything passes by running `bundle
15
+ exec rake`
16
+ 6. Push your branch to GitHub
17
+ 7. Send a pull request for your branch
18
+
19
+ Use the `dev` branch as the target of your pull request. Both issue and pull
20
+ request details must be written in English.
21
+
22
+ [issue]: https://github.com/meisyal/sastrawi-ruby/issues
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in sastrawi.gemspec
4
+ gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2016-2021 Andrias Meisyal
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,104 @@
1
+ # sastrawi-ruby
2
+
3
+ Indonesian language stemmer for Ruby. Stems words in Bahasa Indonesia using the Nazief & Adriani algorithm with Enhanced Confix Stripping (ECS).
4
+
5
+ This is an actively maintained fork of [meisyal/sastrawi-ruby](https://github.com/meisyal/sastrawi-ruby).
6
+
7
+ ## What's New in v0.2.0
8
+
9
+ - **Bug fixes**: Fixed 3 stemming bugs (`menerangi`, `berimanlah`, `kesepersepuluhnya`)
10
+ - **Dictionary**: Added missing words (`sepuluh`)
11
+ - **Modernized**: Ruby 3.0+ required, updated dependencies, GitHub Actions CI
12
+ - **Fixed regex warning** in disambiguator prefix rule 16
13
+
14
+ ## Installation
15
+
16
+ ```ruby
17
+ # Gemfile
18
+ gem "sastrawi"
19
+ ```
20
+
21
+ ```bash
22
+ gem install sastrawi
23
+ ```
24
+
25
+ Requires Ruby 3.0+.
26
+
27
+ ## Usage
28
+
29
+ ### Stemming
30
+
31
+ ```ruby
32
+ require "sastrawi"
33
+
34
+ factory = Sastrawi::Stemmer::StemmerFactory.new
35
+ stemmer = factory.create_stemmer
36
+
37
+ stemmer.stem("Perekonomian Indonesia sedang dalam pertumbuhan yang membanggakan")
38
+ # => "ekonomi indonesia sedang dalam tumbuh yang bangga"
39
+
40
+ stemmer.stem("membangunkan") # => "bangun"
41
+ stemmer.stem("bersembunyi") # => "sembunyi"
42
+ stemmer.stem("menerangi") # => "terang"
43
+ stemmer.stem("kesepersepuluhnya") # => "sepuluh"
44
+ ```
45
+
46
+ ### Stop Word Removal
47
+
48
+ ```ruby
49
+ require "sastrawi"
50
+
51
+ factory = Sastrawi::StopWordRemover::StopWordRemoverFactory.new
52
+ stop_words = factory.get_stop_word
53
+ # => ["a", "ada", "adalah", "agar", "akan", ...]
54
+ ```
55
+
56
+ ### Custom Dictionary
57
+
58
+ ```ruby
59
+ require "sastrawi"
60
+
61
+ factory = Sastrawi::Stemmer::StemmerFactory.new
62
+ dictionary = factory.create_default_dictionary
63
+
64
+ # Add words from file
65
+ dictionary.add_words_from_text_file("my-dictionary.txt")
66
+
67
+ # Add/remove individual words
68
+ dictionary.add("internet")
69
+ dictionary.remove("desa")
70
+
71
+ stemmer = Sastrawi::Stemmer::Stemmer.new(dictionary)
72
+ stemmer.stem("internetan") # => "internet"
73
+ ```
74
+
75
+ ## How It Works
76
+
77
+ Indonesian stemming removes affixes (prefixes, suffixes, infixes) to find base words:
78
+
79
+ | Affix Type | Examples | Algorithm Step |
80
+ |---|---|---|
81
+ | Inflectional Particle | -lah, -kah, -pun | Removed first |
82
+ | Possessive Pronoun | -ku, -mu, -nya | Removed second |
83
+ | Derivational Suffix | -i, -kan, -an | Removed third |
84
+ | Derivational Prefix | me-, ber-, ter-, pe-, di-, ke-, se- | Removed last (up to 3 layers) |
85
+
86
+ The algorithm uses Confix Stripping (CS) and Enhanced Confix Stripping (ECS) for handling complex prefix-suffix combinations, plus a dictionary lookup at each step to validate results.
87
+
88
+ ## Known Limitations
89
+
90
+ - `memuaskan` stems to `muas` instead of `puas` — both are valid dictionary words and the algorithm picks the first match (Rule13a). This is an inherent ambiguity in the Nazief-Adriani algorithm.
91
+
92
+ ## Contributing
93
+
94
+ Bug reports and pull requests are welcome on [GitHub](https://github.com/johannesdwicahyo/sastrawi-ruby).
95
+
96
+ ## License
97
+
98
+ MIT License. Contains base words from [Kateglo](https://kateglo.com) licensed under [CC BY-NC-SA 3.0](https://creativecommons.org/licenses/by-nc-sa/3.0/).
99
+
100
+ ## Credits
101
+
102
+ - Original PHP implementation: [sastrawi/sastrawi](https://github.com/sastrawi/sastrawi)
103
+ - Ruby port: [Andrias Meisyal](https://github.com/meisyal)
104
+ - Fork maintainer: [Johannes Dwi Cahyo](https://github.com/johannesdwicahyo)
data/Rakefile ADDED
@@ -0,0 +1,6 @@
1
+ require 'bundler/gem_tasks'
2
+ require 'rspec/core/rake_task'
3
+
4
+ RSpec::Core::RakeTask.new(:spec)
5
+
6
+ task :default => :spec
data/_config.yml ADDED
@@ -0,0 +1 @@
1
+ theme: jekyll-theme-cayman
data/bin/sastrawi ADDED
@@ -0,0 +1,24 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'sastrawi'
4
+
5
+ # usage: $ sastrawi word/sentence
6
+ if ARGV.empty?
7
+ puts '--------------- sastrawi: ERROR ---------------'
8
+ puts 'Usage: Please specify a word or sentence to be stemmed.'
9
+ puts ' sastrawi word/sentence'
10
+ puts 'Example:'
11
+ puts ' sastrawi mengundang'
12
+ puts ' sastrawi Perekonomian Indonesia sedang dalam pertumbuhan yang membanggakan.'
13
+ else
14
+ stemmer_factory = Sastrawi::Stemmer::StemmerFactory.new
15
+ stemmer = stemmer_factory.create_stemmer
16
+
17
+ words = []
18
+
19
+ ARGV.each { |arg| words << arg }
20
+
21
+ sentence = words.join(' ')
22
+
23
+ puts stemmer.stem(sentence)
24
+ end