sastrawi 0.1.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (28) hide show
  1. checksums.yaml +4 -4
  2. data/CONTRIBUTING.md +22 -0
  3. data/README.md +3 -4
  4. data/Rakefile +2 -2
  5. data/_config.yml +1 -0
  6. data/lib/sastrawi/dictionary/array_dictionary.rb +18 -0
  7. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule37a.rb +1 -1
  8. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule37b.rb +1 -1
  9. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule38a.rb +1 -1
  10. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule38b.rb +1 -1
  11. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule39a.rb +1 -1
  12. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule39b.rb +1 -1
  13. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule40a.rb +1 -1
  14. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule40b.rb +1 -1
  15. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule5.rb +1 -1
  16. data/lib/sastrawi/morphology/invalid_affix_pair_specification.rb +4 -0
  17. data/lib/sastrawi/stemmer/confix_stripping/precedence_adjustment_specification.rb +5 -0
  18. data/lib/sastrawi/stemmer/context/context.rb +20 -0
  19. data/lib/sastrawi/stemmer/context/visitor/remove_derivational_suffix.rb +9 -0
  20. data/lib/sastrawi/stemmer/context/visitor/remove_inflectional_particle.rb +8 -0
  21. data/lib/sastrawi/stemmer/context/visitor/remove_inflectional_possessive_pronoun.rb +8 -0
  22. data/lib/sastrawi/stemmer/context/visitor/remove_plain_prefix.rb +8 -0
  23. data/lib/sastrawi/stemmer/stemmer.rb +18 -0
  24. data/lib/sastrawi/stemmer/stemmer_factory.rb +3 -0
  25. data/lib/sastrawi/stop_word_remover/stop_word_remover.rb +3 -0
  26. data/lib/sastrawi/stop_word_remover/stop_word_remover_factory.rb +1 -1
  27. data/lib/sastrawi/version.rb +1 -1
  28. metadata +4 -2
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 2c5b1727acdc71972e767e95a444969a59982660
4
- data.tar.gz: 0cba68fa48adbfc2004b47b0050bd6bbfad921dd
3
+ metadata.gz: 18d883e3d11d60b35c08f9161c338da411e0b19f
4
+ data.tar.gz: b9b9c4aab58ca6c23d03f991cfe69851850a556f
5
5
  SHA512:
6
- metadata.gz: 1aabe63a7cc2d94eb34e7445fd2912e3a415248d62e871578ee9e9d0c3790e822eb1d76b2bcd092fb4137a42f18734ca78f82d0c24455244103b436f2b46406a
7
- data.tar.gz: 70114d83ab5d39081490308ed2437244445136282e439edfe19a76a556c3e4cfbbeda8c30e22546c780f4d8f9f1adbddcf91aedb9974f8bc98763b80ec42db0d
6
+ metadata.gz: 3e0838a8500a8225083c4504ead591ab769536c62cc93202da4bee6bd24cb4cde4f49121914346106f98f9344a326466ee10cedfed546abce8f33e4d6ba4a6c6
7
+ data.tar.gz: 8f5cf8a627c7a474dc64de6729843c4fd5f3bbf6e4e70f9d3ef6065efe7f53592185cee8894d3f5dacffb6ab13b145863a5762e24af781818aee2bdc6d29d3b9
data/CONTRIBUTING.md ADDED
@@ -0,0 +1,22 @@
1
+ # Contributing to sastrawi-ruby
2
+
3
+ If you find a bug, please report it to [issue tracker][issue]. You can also
4
+ contribute by writing code.
5
+
6
+ ## How to contribute
7
+
8
+ There are some steps you must follow:
9
+
10
+ 1. Fork this repository and clone it to your local environment
11
+ 2. Create a named branch that contains your change
12
+ 3. Install the development dependencies by running `bundle install`
13
+ 4. Code
14
+ 5. Add or adjust unit tests and make sure everything passes by running `bundle
15
+ exec rake`
16
+ 6. Push your branch to GitHub
17
+ 7. Send a pull request for your branch
18
+
19
+ Use the `dev` branch as the target of your pull request. Both issue and pull
20
+ request details must be written in English.
21
+
22
+ [issue]: https://github.com/meisyal/sastrawi-ruby/issues
data/README.md CHANGED
@@ -77,9 +77,8 @@ puts stemmer.stem('internetan')
77
77
 
78
78
  ## Contributing
79
79
 
80
- Contributions are welcome. If you find a bug, please report it to [issue
81
- tracker][issue]. Use `dev` branch as a target of your feature branch for pull
82
- request. Both issue and pull request details must be written in English.
80
+ Contributions are welcome. Please read the [CONTRIBUTING][contributing]
81
+ guidelines.
83
82
 
84
83
  ## License
85
84
 
@@ -92,7 +91,7 @@ Attribution-NonCommercial-ShareAlike 3.0 Unported License][kateglolicense].
92
91
  [stemmingwiki]: https://en.wikipedia.org/wiki/Stemming
93
92
  [howstemmingworks]: https://github.com/sastrawi/sastrawi/wiki/Stemming-Bahasa-Indonesia
94
93
  [documentation]: https://github.com/meisyal/sastrawi-ruby/wiki
95
- [issue]: https://github.com/meisyal/sastrawi-ruby/issues
94
+ [contributing]: https://github.com/meisyal/sastrawi-ruby/blob/master/CONTRIBUTING.md
96
95
  [license]: https://github.com/meisyal/sastrawi-ruby/blob/master/LICENSE.txt
97
96
  [kateglo]: http://kateglo.com
98
97
  [kateglolicense]: https://creativecommons.org/licenses/by-nc-sa/3.0/
data/Rakefile CHANGED
@@ -1,5 +1,5 @@
1
- require "bundler/gem_tasks"
2
- require "rspec/core/rake_task"
1
+ require 'bundler/gem_tasks'
2
+ require 'rspec/core/rake_task'
3
3
 
4
4
  RSpec::Core::RakeTask.new(:spec)
5
5
 
data/_config.yml ADDED
@@ -0,0 +1 @@
1
+ theme: jekyll-theme-cayman
@@ -9,26 +9,41 @@ module Sastrawi
9
9
  add_words(words)
10
10
  end
11
11
 
12
+ ##
13
+ # Check whether a word is contained in the dictionary
14
+
12
15
  def contains?(word)
13
16
  @words.include?(word)
14
17
  end
15
18
 
19
+ ##
20
+ # Count how many words are in the dictionary
21
+
16
22
  def count
17
23
  @words.length
18
24
  end
19
25
 
26
+ ##
27
+ # Add multiple words to the dictionary
28
+
20
29
  def add_words(new_words)
21
30
  new_words.each do |word|
22
31
  add(word)
23
32
  end
24
33
  end
25
34
 
35
+ ##
36
+ # Add a word to the dictionary
37
+
26
38
  def add(word)
27
39
  return if word == ''
28
40
 
29
41
  @words.push(word)
30
42
  end
31
43
 
44
+ ##
45
+ # Add words from a text file to the dictionary
46
+
32
47
  def add_words_from_text_file(file_path)
33
48
  words = []
34
49
 
@@ -41,6 +56,9 @@ module Sastrawi
41
56
  add_words(words)
42
57
  end
43
58
 
59
+ ##
60
+ # Remove a word from the dictionary
61
+
44
62
  def remove(word)
45
63
  @words.delete(word)
46
64
  end
@@ -3,7 +3,7 @@ module Sastrawi
3
3
  module Disambiguator
4
4
  class DisambiguatorPrefixRule37a
5
5
  def disambiguate(word)
6
- contains = /^([bcdfghjklmnpqstvwxyz])(er[aiueo])(.*)$/.match(word)
6
+ contains = /^([bcdfghjklmnpqrstvwxyz])(er[aiueo])(.*)$/.match(word)
7
7
 
8
8
  if contains
9
9
  matches = contains.captures
@@ -3,7 +3,7 @@ module Sastrawi
3
3
  module Disambiguator
4
4
  class DisambiguatorPrefixRule37b
5
5
  def disambiguate(word)
6
- contains = /^([bcdfghjklmnpqstvwxyz])er([aiueo])(.*)$/.match(word)
6
+ contains = /^([bcdfghjklmnpqrstvwxyz])er([aiueo])(.*)$/.match(word)
7
7
 
8
8
  if contains
9
9
  matches = contains.captures
@@ -3,7 +3,7 @@ module Sastrawi
3
3
  module Disambiguator
4
4
  class DisambiguatorPrefixRule38a
5
5
  def disambiguate(word)
6
- contains = /^([bcdfghjklmnpqstvwxyz])(el[aiueo])(.*)$/.match(word)
6
+ contains = /^([bcdfghjklmnpqrstvwxyz])(el[aiueo])(.*)$/.match(word)
7
7
 
8
8
  if contains
9
9
  matches = contains.captures
@@ -3,7 +3,7 @@ module Sastrawi
3
3
  module Disambiguator
4
4
  class DisambiguatorPrefixRule38b
5
5
  def disambiguate(word)
6
- contains = /^([bcdfghjklmnpqstvwxyz])el([aiueo])(.*)$/.match(word)
6
+ contains = /^([bcdfghjklmnpqrstvwxyz])el([aiueo])(.*)$/.match(word)
7
7
 
8
8
  if contains
9
9
  matches = contains.captures
@@ -3,7 +3,7 @@ module Sastrawi
3
3
  module Disambiguator
4
4
  class DisambiguatorPrefixRule39a
5
5
  def disambiguate(word)
6
- contains = /^([bcdfghjklmnpqstvwxyz])(em[aiueo])(.*)$/.match(word)
6
+ contains = /^([bcdfghjklmnpqrstvwxyz])(em[aiueo])(.*)$/.match(word)
7
7
 
8
8
  if contains
9
9
  matches = contains.captures
@@ -3,7 +3,7 @@ module Sastrawi
3
3
  module Disambiguator
4
4
  class DisambiguatorPrefixRule39b
5
5
  def disambiguate(word)
6
- contains = /^([bcdfghjklmnpqstvwxyz])em([aiueo])(.*)$/.match(word)
6
+ contains = /^([bcdfghjklmnpqrstvwxyz])em([aiueo])(.*)$/.match(word)
7
7
 
8
8
  if contains
9
9
  matches = contains.captures
@@ -3,7 +3,7 @@ module Sastrawi
3
3
  module Disambiguator
4
4
  class DisambiguatorPrefixRule40a
5
5
  def disambiguate(word)
6
- contains = /^([bcdfghjklmnpqstvwxyz])(in[aiueo])(.*)$/.match(word)
6
+ contains = /^([bcdfghjklmnpqrstvwxyz])(in[aiueo])(.*)$/.match(word)
7
7
 
8
8
  if contains
9
9
  matches = contains.captures
@@ -3,7 +3,7 @@ module Sastrawi
3
3
  module Disambiguator
4
4
  class DisambiguatorPrefixRule40b
5
5
  def disambiguate(word)
6
- contains = /^([bcdfghjklmnpqstvwxyz])in([aiueo])(.*)$/.match(word)
6
+ contains = /^([bcdfghjklmnpqrstvwxyz])in([aiueo])(.*)$/.match(word)
7
7
 
8
8
  if contains
9
9
  matches = contains.captures
@@ -3,7 +3,7 @@ module Sastrawi
3
3
  module Disambiguator
4
4
  class DisambiguatorPrefixRule5
5
5
  def disambiguate(word)
6
- contains = /^be([bcdfghjklmnpqrstvwxyz])(er[bcdfghjklmnpqrstvwxyz])(.*)$/.match(word)
6
+ contains = /^be([bcdfghjklmnpqstvwxyz])(er[bcdfghjklmnpqrstvwxyz])(.*)$/.match(word)
7
7
 
8
8
  if contains
9
9
  matches = contains.captures
@@ -1,3 +1,7 @@
1
+ ##
2
+ # Asian J. (2007) "Effective Techniques for Indonesian Text Retrieval". page 26
3
+ # http://researchbank.rmit.edu.au/eserv/rmit:6312/Asian.pdf
4
+
1
5
  module Sastrawi
2
6
  module Morphology
3
7
  class InvalidAffixPairSpecification
@@ -1,3 +1,8 @@
1
+ ##
2
+ # Confix Stripping Rule Precedence Adjustment Specification
3
+ # Asian J. (2007) "Effective Techniques for Indonesian Text Retrieval" page 78-79
4
+ # http://researchbank.rmit.edu.au/eserv/rmit:6312/Asian.pdf
5
+
1
6
  module Sastrawi
2
7
  module Stemmer
3
8
  module ConfixStripping
@@ -1,5 +1,9 @@
1
1
  require 'sastrawi/stemmer/confix_stripping/precedence_adjustment_specification'
2
2
 
3
+ ##
4
+ # Stemming context using Nazief and Adriani, Confix Stripping (CS),
5
+ # Enhanced Confix Stripping (ECS), and Improved ECS
6
+
3
7
  module Sastrawi
4
8
  module Stemmer
5
9
  module Context
@@ -37,6 +41,9 @@ module Sastrawi
37
41
  @removals.push(removal)
38
42
  end
39
43
 
44
+ ##
45
+ # Execute stemming process
46
+
40
47
  def execute
41
48
  start_stemming_process
42
49
 
@@ -56,6 +63,10 @@ module Sastrawi
56
63
 
57
64
  cs_precendence_adjustment_specification = Sastrawi::Stemmer::ConfixStripping::PrecedenceAdjustmentSpecification.new
58
65
 
66
+ ##
67
+ # Confix stripping
68
+ # try to remove prefix before suffix if the specification is met
69
+
59
70
  if cs_precendence_adjustment_specification.satisfied_by?(@original_word)
60
71
  remove_prefixes
61
72
  return if @dictionary.contains?(@current_word)
@@ -78,6 +89,9 @@ module Sastrawi
78
89
  loop_last_return
79
90
  end
80
91
 
92
+ ##
93
+ # ECS loop last return
94
+
81
95
  def loop_last_return
82
96
  restore_prefix
83
97
 
@@ -147,10 +161,16 @@ module Sastrawi
147
161
  end
148
162
  end
149
163
 
164
+ ##
165
+ # Check whether the removed part is a suffix
166
+
150
167
  def suffix_removal?(removal)
151
168
  removal.affix_type == 'DS' || removal.affix_type == 'PP' || removal.affix_type == 'P'
152
169
  end
153
170
 
171
+ ##
172
+ # Restore prefix to proceed with ECS loop last return
173
+
154
174
  def restore_prefix
155
175
  @removals.each do |removal|
156
176
  if removal.affix_type == 'DP'
@@ -1,5 +1,10 @@
1
1
  require 'sastrawi/stemmer/context/removal'
2
2
 
3
+ ##
4
+ # Remove derivational suffix
5
+ # Asian J. (2007) "Effective Techniques for Indonesian Text Retrieval" page 61
6
+ # http://researchbank.rmit.edu.au/eserv/rmit:6312/Asian.pdf
7
+
3
8
  module Sastrawi
4
9
  module Stemmer
5
10
  module Context
@@ -18,6 +23,10 @@ module Sastrawi
18
23
  end
19
24
  end
20
25
 
26
+ ##
27
+ # Original rule: i|kan|an
28
+ # Added the adopted foreign suffix rule: is|isme|isasi
29
+
21
30
  def remove_suffix(word)
22
31
  word.sub(/(is|isme|isasi|i|kan|an)$/, '')
23
32
  end
@@ -1,3 +1,8 @@
1
+ ##
2
+ # Remove inflectional particle
3
+ # Asian J. (2007) "Effective Techniques for Indonesian Text Retrieval" page 60
4
+ # http://researchbank.rmit.edu.au/eserv/rmit:6312/Asian.pdf
5
+
1
6
  module Sastrawi
2
7
  module Stemmer
3
8
  module Context
@@ -16,6 +21,9 @@ module Sastrawi
16
21
  end
17
22
  end
18
23
 
24
+ ##
25
+ # Remove inflectional particle: lah|kah|tah|pun
26
+
19
27
  def remove(word)
20
28
  word.sub(/-*(lah|kah|tah|pun)$/, '')
21
29
  end
@@ -1,3 +1,8 @@
1
+ ##
2
+ # Remove inflectional possessive pronoun
3
+ # Asian J. (2007) "Effective Techniques for Indonesian Text Retrieval" page 60
4
+ # http://researchbank.rmit.edu.au/eserv/rmit:6312/Asian.pdf
5
+
1
6
  module Sastrawi
2
7
  module Stemmer
3
8
  module Context
@@ -16,6 +21,9 @@ module Sastrawi
16
21
  end
17
22
  end
18
23
 
24
+ ##
25
+ # Remove inflectional possessive pronoun: ku|mu|nya
26
+
19
27
  def remove(word)
20
28
  word.sub(/-*(ku|mu|nya)$/, '')
21
29
  end
@@ -1,3 +1,8 @@
1
+ ##
2
+ # Remove plain prefix
3
+ # Asian J. (2007) "Effective Techniques for Indonesian Text Retrieval" page 61
4
+ # http://researchbank.rmit.edu.au/eserv/rmit:6312/Asian.pdf
5
+
1
6
  module Sastrawi
2
7
  module Stemmer
3
8
  module Context
@@ -16,6 +21,9 @@ module Sastrawi
16
21
  end
17
22
  end
18
23
 
24
+ ##
25
+ # Remove plain prefix: di|ke|se
26
+
19
27
  def remove(word)
20
28
  word.sub(/^(di|ke|se)/, '')
21
29
  end
@@ -4,6 +4,10 @@ require 'sastrawi/stemmer/context/visitor/visitor_provider'
4
4
 
5
5
  require 'sastrawi/stemmer/filter/text_normalizer'
6
6
 
7
+ ##
8
+ # Indonesian Stemmer
9
+ # Nazief & Adriani, CS Stemmer, ECS Stemmer, Improved ECS
10
+
7
11
  module Sastrawi
8
12
  module Stemmer
9
13
  class Stemmer
@@ -14,6 +18,9 @@ module Sastrawi
14
18
  @visitor_provider = Sastrawi::Stemmer::Context::Visitor::VisitorProvider.new
15
19
  end
16
20
 
21
+ ##
22
+ # Stem a string to its base form
23
+
17
24
  def stem(text)
18
25
  normalized_text = Sastrawi::Stemmer::Filter::TextNormalizer.normalize_text(text)
19
26
 
@@ -27,6 +34,9 @@ module Sastrawi
27
34
  stems.join(' ')
28
35
  end
29
36
 
37
+ ##
38
+ # Stem a word to its base form
39
+
30
40
  def stem_word(word)
31
41
  if plural?(word)
32
42
  stem_plural_word(word)
@@ -43,6 +53,11 @@ module Sastrawi
43
53
  return word.include?('-')
44
54
  end
45
55
 
56
+ ##
57
+ # Stem a plural word to its base form
58
+ # Asian J. (2007) "Effective Techniques for Indonesian Text Retrieval"
59
+ # page 76-77
60
+
46
61
  def stem_plural_word(word)
47
62
  first_match = /^(.*)-(.*)$/.match(word)
48
63
 
@@ -72,6 +87,9 @@ module Sastrawi
72
87
  end
73
88
  end
74
89
 
90
+ ##
91
+ # Stem a singular word to its base form
92
+
75
93
  def stem_singular_word(word)
76
94
  context = Sastrawi::Stemmer::Context::Context.new(word, @dictionary, @visitor_provider)
77
95
  context.execute
@@ -5,6 +5,9 @@ require 'sastrawi/stemmer/stemmer'
5
5
 
6
6
  require 'sastrawi/stemmer/cache/array_cache'
7
7
 
8
+ ##
9
+ # Stemmer factory helps create a pre-configured stemmer
10
+
8
11
  module Sastrawi
9
12
  module Stemmer
10
13
  class StemmerFactory
@@ -7,6 +7,9 @@ module Sastrawi
7
7
  @dictionary = dictionary
8
8
  end
9
9
 
10
+ ##
11
+ # Remove stop words
12
+
10
13
  def remove(text)
11
14
  words = text.split(' ')
12
15
  stop_words = []
@@ -4,7 +4,7 @@ require 'sastrawi/stop_word_remover/stop_word_remover'
4
4
 
5
5
  module Sastrawi
6
6
  module StopWordRemover
7
- module StopWordRemoverFactory
7
+ class StopWordRemoverFactory
8
8
  def create_stop_word_remover
9
9
  stop_words = get_stop_word
10
10
 
@@ -1,3 +1,3 @@
1
1
  module Sastrawi
2
- VERSION = "0.1.0"
2
+ VERSION = "0.1.1"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: sastrawi
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.1.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Andrias Meisyal
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2017-05-21 00:00:00.000000000 Z
11
+ date: 2017-09-09 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -61,10 +61,12 @@ extra_rdoc_files: []
61
61
  files:
62
62
  - ".gitignore"
63
63
  - ".travis.yml"
64
+ - CONTRIBUTING.md
64
65
  - Gemfile
65
66
  - LICENSE.txt
66
67
  - README.md
67
68
  - Rakefile
69
+ - _config.yml
68
70
  - data/base-word.txt
69
71
  - lib/sastrawi.rb
70
72
  - lib/sastrawi/dictionary/array_dictionary.rb