sastrawi 0.1.0 → 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (28) hide show
  1. checksums.yaml +4 -4
  2. data/CONTRIBUTING.md +22 -0
  3. data/README.md +3 -4
  4. data/Rakefile +2 -2
  5. data/_config.yml +1 -0
  6. data/lib/sastrawi/dictionary/array_dictionary.rb +18 -0
  7. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule37a.rb +1 -1
  8. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule37b.rb +1 -1
  9. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule38a.rb +1 -1
  10. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule38b.rb +1 -1
  11. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule39a.rb +1 -1
  12. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule39b.rb +1 -1
  13. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule40a.rb +1 -1
  14. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule40b.rb +1 -1
  15. data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule5.rb +1 -1
  16. data/lib/sastrawi/morphology/invalid_affix_pair_specification.rb +4 -0
  17. data/lib/sastrawi/stemmer/confix_stripping/precedence_adjustment_specification.rb +5 -0
  18. data/lib/sastrawi/stemmer/context/context.rb +20 -0
  19. data/lib/sastrawi/stemmer/context/visitor/remove_derivational_suffix.rb +9 -0
  20. data/lib/sastrawi/stemmer/context/visitor/remove_inflectional_particle.rb +8 -0
  21. data/lib/sastrawi/stemmer/context/visitor/remove_inflectional_possessive_pronoun.rb +8 -0
  22. data/lib/sastrawi/stemmer/context/visitor/remove_plain_prefix.rb +8 -0
  23. data/lib/sastrawi/stemmer/stemmer.rb +18 -0
  24. data/lib/sastrawi/stemmer/stemmer_factory.rb +3 -0
  25. data/lib/sastrawi/stop_word_remover/stop_word_remover.rb +3 -0
  26. data/lib/sastrawi/stop_word_remover/stop_word_remover_factory.rb +1 -1
  27. data/lib/sastrawi/version.rb +1 -1
  28. metadata +4 -2
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 2c5b1727acdc71972e767e95a444969a59982660
4
- data.tar.gz: 0cba68fa48adbfc2004b47b0050bd6bbfad921dd
3
+ metadata.gz: 18d883e3d11d60b35c08f9161c338da411e0b19f
4
+ data.tar.gz: b9b9c4aab58ca6c23d03f991cfe69851850a556f
5
5
  SHA512:
6
- metadata.gz: 1aabe63a7cc2d94eb34e7445fd2912e3a415248d62e871578ee9e9d0c3790e822eb1d76b2bcd092fb4137a42f18734ca78f82d0c24455244103b436f2b46406a
7
- data.tar.gz: 70114d83ab5d39081490308ed2437244445136282e439edfe19a76a556c3e4cfbbeda8c30e22546c780f4d8f9f1adbddcf91aedb9974f8bc98763b80ec42db0d
6
+ metadata.gz: 3e0838a8500a8225083c4504ead591ab769536c62cc93202da4bee6bd24cb4cde4f49121914346106f98f9344a326466ee10cedfed546abce8f33e4d6ba4a6c6
7
+ data.tar.gz: 8f5cf8a627c7a474dc64de6729843c4fd5f3bbf6e4e70f9d3ef6065efe7f53592185cee8894d3f5dacffb6ab13b145863a5762e24af781818aee2bdc6d29d3b9
data/CONTRIBUTING.md ADDED
@@ -0,0 +1,22 @@
1
+ # Contributing to sastrawi-ruby
2
+
3
+ If you find a bug, please report it to [issue tracker][issue]. You can also
4
+ contribute by writing code.
5
+
6
+ ## How to contribute
7
+
8
+ There are some steps you must follow:
9
+
10
+ 1. Fork this repository and clone it to your local environment
11
+ 2. Create a named branch that contains your change
12
+ 3. Install the development dependencies by running `bundle install`
13
+ 4. Code
14
+ 5. Add or adjust unit tests and make sure everything passes by running `bundle
15
+ exec rake`
16
+ 6. Push your branch to GitHub
17
+ 7. Send a pull request for your branch
18
+
19
+ Use the `dev` branch as the target of your branch for the pull request. Both issue and pull
20
+ request details must be written in English.
21
+
22
+ [issue]: https://github.com/meisyal/sastrawi-ruby/issues
data/README.md CHANGED
@@ -77,9 +77,8 @@ puts stemmer.stem('internetan')
77
77
 
78
78
  ## Contributing
79
79
 
80
- Contributions are welcome. If you find a bug, please report it to [issue
81
- tracker][issue]. Use `dev` branch as a target of your feature branch for pull
82
- request. Both issue and pull request details must be written in English.
80
+ Contributions are welcome. Please read the [CONTRIBUTING][contributing]
81
+ guidelines.
83
82
 
84
83
  ## License
85
84
 
@@ -92,7 +91,7 @@ Attribution-NonCommercial-ShareAlike 3.0 Unported License][kateglolicense].
92
91
  [stemmingwiki]: https://en.wikipedia.org/wiki/Stemming
93
92
  [howstemmingworks]: https://github.com/sastrawi/sastrawi/wiki/Stemming-Bahasa-Indonesia
94
93
  [documentation]: https://github.com/meisyal/sastrawi-ruby/wiki
95
- [issue]: https://github.com/meisyal/sastrawi-ruby/issues
94
+ [contributing]: https://github.com/meisyal/sastrawi-ruby/blob/master/CONTRIBUTING.md
96
95
  [license]: https://github.com/meisyal/sastrawi-ruby/blob/master/LICENSE.txt
97
96
  [kateglo]: http://kateglo.com
98
97
  [kateglolicense]: https://creativecommons.org/licenses/by-nc-sa/3.0/
data/Rakefile CHANGED
@@ -1,5 +1,5 @@
1
- require "bundler/gem_tasks"
2
- require "rspec/core/rake_task"
1
+ require 'bundler/gem_tasks'
2
+ require 'rspec/core/rake_task'
3
3
 
4
4
  RSpec::Core::RakeTask.new(:spec)
5
5
 
data/_config.yml ADDED
@@ -0,0 +1 @@
1
+ theme: jekyll-theme-cayman
@@ -9,26 +9,41 @@ module Sastrawi
9
9
  add_words(words)
10
10
  end
11
11
 
12
+ ##
13
+ # Check whether a word is contained in the dictionary
14
+
12
15
  def contains?(word)
13
16
  @words.include?(word)
14
17
  end
15
18
 
19
+ ##
20
+ # Count how many words are in the dictionary
21
+
16
22
  def count
17
23
  @words.length
18
24
  end
19
25
 
26
+ ##
27
+ # Add multiple words to the dictionary
28
+
20
29
  def add_words(new_words)
21
30
  new_words.each do |word|
22
31
  add(word)
23
32
  end
24
33
  end
25
34
 
35
+ ##
36
+ # Add a word to the dictionary
37
+
26
38
  def add(word)
27
39
  return if word == ''
28
40
 
29
41
  @words.push(word)
30
42
  end
31
43
 
44
+ ##
45
+ # Add words from a text file to the dictionary
46
+
32
47
  def add_words_from_text_file(file_path)
33
48
  words = []
34
49
 
@@ -41,6 +56,9 @@ module Sastrawi
41
56
  add_words(words)
42
57
  end
43
58
 
59
+ ##
60
+ # Remove a word from the dictionary
61
+
44
62
  def remove(word)
45
63
  @words.delete(word)
46
64
  end
@@ -3,7 +3,7 @@ module Sastrawi
3
3
  module Disambiguator
4
4
  class DisambiguatorPrefixRule37a
5
5
  def disambiguate(word)
6
- contains = /^([bcdfghjklmnpqstvwxyz])(er[aiueo])(.*)$/.match(word)
6
+ contains = /^([bcdfghjklmnpqrstvwxyz])(er[aiueo])(.*)$/.match(word)
7
7
 
8
8
  if contains
9
9
  matches = contains.captures
@@ -3,7 +3,7 @@ module Sastrawi
3
3
  module Disambiguator
4
4
  class DisambiguatorPrefixRule37b
5
5
  def disambiguate(word)
6
- contains = /^([bcdfghjklmnpqstvwxyz])er([aiueo])(.*)$/.match(word)
6
+ contains = /^([bcdfghjklmnpqrstvwxyz])er([aiueo])(.*)$/.match(word)
7
7
 
8
8
  if contains
9
9
  matches = contains.captures
@@ -3,7 +3,7 @@ module Sastrawi
3
3
  module Disambiguator
4
4
  class DisambiguatorPrefixRule38a
5
5
  def disambiguate(word)
6
- contains = /^([bcdfghjklmnpqstvwxyz])(el[aiueo])(.*)$/.match(word)
6
+ contains = /^([bcdfghjklmnpqrstvwxyz])(el[aiueo])(.*)$/.match(word)
7
7
 
8
8
  if contains
9
9
  matches = contains.captures
@@ -3,7 +3,7 @@ module Sastrawi
3
3
  module Disambiguator
4
4
  class DisambiguatorPrefixRule38b
5
5
  def disambiguate(word)
6
- contains = /^([bcdfghjklmnpqstvwxyz])el([aiueo])(.*)$/.match(word)
6
+ contains = /^([bcdfghjklmnpqrstvwxyz])el([aiueo])(.*)$/.match(word)
7
7
 
8
8
  if contains
9
9
  matches = contains.captures
@@ -3,7 +3,7 @@ module Sastrawi
3
3
  module Disambiguator
4
4
  class DisambiguatorPrefixRule39a
5
5
  def disambiguate(word)
6
- contains = /^([bcdfghjklmnpqstvwxyz])(em[aiueo])(.*)$/.match(word)
6
+ contains = /^([bcdfghjklmnpqrstvwxyz])(em[aiueo])(.*)$/.match(word)
7
7
 
8
8
  if contains
9
9
  matches = contains.captures
@@ -3,7 +3,7 @@ module Sastrawi
3
3
  module Disambiguator
4
4
  class DisambiguatorPrefixRule39b
5
5
  def disambiguate(word)
6
- contains = /^([bcdfghjklmnpqstvwxyz])em([aiueo])(.*)$/.match(word)
6
+ contains = /^([bcdfghjklmnpqrstvwxyz])em([aiueo])(.*)$/.match(word)
7
7
 
8
8
  if contains
9
9
  matches = contains.captures
@@ -3,7 +3,7 @@ module Sastrawi
3
3
  module Disambiguator
4
4
  class DisambiguatorPrefixRule40a
5
5
  def disambiguate(word)
6
- contains = /^([bcdfghjklmnpqstvwxyz])(in[aiueo])(.*)$/.match(word)
6
+ contains = /^([bcdfghjklmnpqrstvwxyz])(in[aiueo])(.*)$/.match(word)
7
7
 
8
8
  if contains
9
9
  matches = contains.captures
@@ -3,7 +3,7 @@ module Sastrawi
3
3
  module Disambiguator
4
4
  class DisambiguatorPrefixRule40b
5
5
  def disambiguate(word)
6
- contains = /^([bcdfghjklmnpqstvwxyz])in([aiueo])(.*)$/.match(word)
6
+ contains = /^([bcdfghjklmnpqrstvwxyz])in([aiueo])(.*)$/.match(word)
7
7
 
8
8
  if contains
9
9
  matches = contains.captures
@@ -3,7 +3,7 @@ module Sastrawi
3
3
  module Disambiguator
4
4
  class DisambiguatorPrefixRule5
5
5
  def disambiguate(word)
6
- contains = /^be([bcdfghjklmnpqrstvwxyz])(er[bcdfghjklmnpqrstvwxyz])(.*)$/.match(word)
6
+ contains = /^be([bcdfghjklmnpqstvwxyz])(er[bcdfghjklmnpqrstvwxyz])(.*)$/.match(word)
7
7
 
8
8
  if contains
9
9
  matches = contains.captures
@@ -1,3 +1,7 @@
1
+ ##
2
+ # Asian J. (2007) "Effective Techniques for Indonesian Text Retrieval". page 26
3
+ # http://researchbank.rmit.edu.au/eserv/rmit:6312/Asian.pdf
4
+
1
5
  module Sastrawi
2
6
  module Morphology
3
7
  class InvalidAffixPairSpecification
@@ -1,3 +1,8 @@
1
+ ##
2
+ # Confix Stripping Rule Precedence Adjustment Specification
3
+ # Asian J. (2007) "Effective Techniques for Indonesian Text Retrieval" page 78-79
4
+ # http://researchbank.rmit.edu.au/eserv/rmit:6312/Asian.pdf
5
+
1
6
  module Sastrawi
2
7
  module Stemmer
3
8
  module ConfixStripping
@@ -1,5 +1,9 @@
1
1
  require 'sastrawi/stemmer/confix_stripping/precedence_adjustment_specification'
2
2
 
3
+ ##
4
+ # Stemming context using Nazief and Adriani, Confix Stripping (CS),
5
+ # Enhanced Confix Stripping (ECS), and Improved ECS
6
+
3
7
  module Sastrawi
4
8
  module Stemmer
5
9
  module Context
@@ -37,6 +41,9 @@ module Sastrawi
37
41
  @removals.push(removal)
38
42
  end
39
43
 
44
+ ##
45
+ # Execute stemming process
46
+
40
47
  def execute
41
48
  start_stemming_process
42
49
 
@@ -56,6 +63,10 @@ module Sastrawi
56
63
 
57
64
  cs_precendence_adjustment_specification = Sastrawi::Stemmer::ConfixStripping::PrecedenceAdjustmentSpecification.new
58
65
 
66
+ ##
67
+ # Confix stripping
68
+ # try to remove prefix before suffix if the specification is met
69
+
59
70
  if cs_precendence_adjustment_specification.satisfied_by?(@original_word)
60
71
  remove_prefixes
61
72
  return if @dictionary.contains?(@current_word)
@@ -78,6 +89,9 @@ module Sastrawi
78
89
  loop_last_return
79
90
  end
80
91
 
92
+ ##
93
+ # ECS loop last return
94
+
81
95
  def loop_last_return
82
96
  restore_prefix
83
97
 
@@ -147,10 +161,16 @@ module Sastrawi
147
161
  end
148
162
  end
149
163
 
164
+ ##
165
+ # Check whether the removed part is a suffix
166
+
150
167
  def suffix_removal?(removal)
151
168
  removal.affix_type == 'DS' || removal.affix_type == 'PP' || removal.affix_type == 'P'
152
169
  end
153
170
 
171
+ ##
172
+ # Restore prefix to proceed with ECS loop last return
173
+
154
174
  def restore_prefix
155
175
  @removals.each do |removal|
156
176
  if removal.affix_type == 'DP'
@@ -1,5 +1,10 @@
1
1
  require 'sastrawi/stemmer/context/removal'
2
2
 
3
+ ##
4
+ # Remove derivational suffix
5
+ # Asian J. (2007) "Effective Techniques for Indonesian Text Retrieval" page 61
6
+ # http://researchbank.rmit.edu.au/eserv/rmit:6312/Asian.pdf
7
+
3
8
  module Sastrawi
4
9
  module Stemmer
5
10
  module Context
@@ -18,6 +23,10 @@ module Sastrawi
18
23
  end
19
24
  end
20
25
 
26
+ ##
27
+ # Original rule: i|kan|an
28
+ # Added the adopted foreign suffix rule: is|isme|isasi
29
+
21
30
  def remove_suffix(word)
22
31
  word.sub(/(is|isme|isasi|i|kan|an)$/, '')
23
32
  end
@@ -1,3 +1,8 @@
1
+ ##
2
+ # Remove inflectional particle
3
+ # Asian J. (2007) "Effective Techniques for Indonesian Text Retrieval" page 60
4
+ # http://researchbank.rmit.edu.au/eserv/rmit:6312/Asian.pdf
5
+
1
6
  module Sastrawi
2
7
  module Stemmer
3
8
  module Context
@@ -16,6 +21,9 @@ module Sastrawi
16
21
  end
17
22
  end
18
23
 
24
+ ##
25
+ # Remove inflectional particle: lah|kah|tah|pun
26
+
19
27
  def remove(word)
20
28
  word.sub(/-*(lah|kah|tah|pun)$/, '')
21
29
  end
@@ -1,3 +1,8 @@
1
+ ##
2
+ # Remove inflectional possessive pronoun
3
+ # Asian J. (2007) "Effective Techniques for Indonesian Text Retrieval" page 60
4
+ # http://researchbank.rmit.edu.au/eserv/rmit:6312/Asian.pdf
5
+
1
6
  module Sastrawi
2
7
  module Stemmer
3
8
  module Context
@@ -16,6 +21,9 @@ module Sastrawi
16
21
  end
17
22
  end
18
23
 
24
+ ##
25
+ # Remove inflectional possessive pronoun: ku|mu|nya
26
+
19
27
  def remove(word)
20
28
  word.sub(/-*(ku|mu|nya)$/, '')
21
29
  end
@@ -1,3 +1,8 @@
1
+ ##
2
+ # Remove plain prefix
3
+ # Asian J. (2007) "Effective Techniques for Indonesian Text Retrieval" page 61
4
+ # http://researchbank.rmit.edu.au/eserv/rmit:6312/Asian.pdf
5
+
1
6
  module Sastrawi
2
7
  module Stemmer
3
8
  module Context
@@ -16,6 +21,9 @@ module Sastrawi
16
21
  end
17
22
  end
18
23
 
24
+ ##
25
+ # Remove plain prefix: di|ke|se
26
+
19
27
  def remove(word)
20
28
  word.sub(/^(di|ke|se)/, '')
21
29
  end
@@ -4,6 +4,10 @@ require 'sastrawi/stemmer/context/visitor/visitor_provider'
4
4
 
5
5
  require 'sastrawi/stemmer/filter/text_normalizer'
6
6
 
7
+ ##
8
+ # Indonesian Stemmer
9
+ # Nazief & Adriani, CS Stemmer, ECS Stemmer, Improved ECS
10
+
7
11
  module Sastrawi
8
12
  module Stemmer
9
13
  class Stemmer
@@ -14,6 +18,9 @@ module Sastrawi
14
18
  @visitor_provider = Sastrawi::Stemmer::Context::Visitor::VisitorProvider.new
15
19
  end
16
20
 
21
+ ##
22
+ # Stem a string to its base form
23
+
17
24
  def stem(text)
18
25
  normalized_text = Sastrawi::Stemmer::Filter::TextNormalizer.normalize_text(text)
19
26
 
@@ -27,6 +34,9 @@ module Sastrawi
27
34
  stems.join(' ')
28
35
  end
29
36
 
37
+ ##
38
+ # Stem a word to its base form
39
+
30
40
  def stem_word(word)
31
41
  if plural?(word)
32
42
  stem_plural_word(word)
@@ -43,6 +53,11 @@ module Sastrawi
43
53
  return word.include?('-')
44
54
  end
45
55
 
56
+ ##
57
+ # Stem a plural word to its base form
58
+ # Asian J. (2007) "Effective Techniques for Indonesian Text Retrieval"
59
+ # page 76-77
60
+
46
61
  def stem_plural_word(word)
47
62
  first_match = /^(.*)-(.*)$/.match(word)
48
63
 
@@ -72,6 +87,9 @@ module Sastrawi
72
87
  end
73
88
  end
74
89
 
90
+ ##
91
+ # Stem a singular word to its base form
92
+
75
93
  def stem_singular_word(word)
76
94
  context = Sastrawi::Stemmer::Context::Context.new(word, @dictionary, @visitor_provider)
77
95
  context.execute
@@ -5,6 +5,9 @@ require 'sastrawi/stemmer/stemmer'
5
5
 
6
6
  require 'sastrawi/stemmer/cache/array_cache'
7
7
 
8
+ ##
9
+ # Stemmer factory helps create a pre-configured stemmer
10
+
8
11
  module Sastrawi
9
12
  module Stemmer
10
13
  class StemmerFactory
@@ -7,6 +7,9 @@ module Sastrawi
7
7
  @dictionary = dictionary
8
8
  end
9
9
 
10
+ ##
11
+ # Remove stop words
12
+
10
13
  def remove(text)
11
14
  words = text.split(' ')
12
15
  stop_words = []
@@ -4,7 +4,7 @@ require 'sastrawi/stop_word_remover/stop_word_remover'
4
4
 
5
5
  module Sastrawi
6
6
  module StopWordRemover
7
- module StopWordRemoverFactory
7
+ class StopWordRemoverFactory
8
8
  def create_stop_word_remover
9
9
  stop_words = get_stop_word
10
10
 
@@ -1,3 +1,3 @@
1
1
  module Sastrawi
2
- VERSION = "0.1.0"
2
+ VERSION = "0.1.1"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: sastrawi
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.1.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Andrias Meisyal
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2017-05-21 00:00:00.000000000 Z
11
+ date: 2017-09-09 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -61,10 +61,12 @@ extra_rdoc_files: []
61
61
  files:
62
62
  - ".gitignore"
63
63
  - ".travis.yml"
64
+ - CONTRIBUTING.md
64
65
  - Gemfile
65
66
  - LICENSE.txt
66
67
  - README.md
67
68
  - Rakefile
69
+ - _config.yml
68
70
  - data/base-word.txt
69
71
  - lib/sastrawi.rb
70
72
  - lib/sastrawi/dictionary/array_dictionary.rb