sastrawi 0.1.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CONTRIBUTING.md +22 -0
- data/README.md +3 -4
- data/Rakefile +2 -2
- data/_config.yml +1 -0
- data/lib/sastrawi/dictionary/array_dictionary.rb +18 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule37a.rb +1 -1
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule37b.rb +1 -1
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule38a.rb +1 -1
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule38b.rb +1 -1
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule39a.rb +1 -1
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule39b.rb +1 -1
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule40a.rb +1 -1
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule40b.rb +1 -1
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule5.rb +1 -1
- data/lib/sastrawi/morphology/invalid_affix_pair_specification.rb +4 -0
- data/lib/sastrawi/stemmer/confix_stripping/precedence_adjustment_specification.rb +5 -0
- data/lib/sastrawi/stemmer/context/context.rb +20 -0
- data/lib/sastrawi/stemmer/context/visitor/remove_derivational_suffix.rb +9 -0
- data/lib/sastrawi/stemmer/context/visitor/remove_inflectional_particle.rb +8 -0
- data/lib/sastrawi/stemmer/context/visitor/remove_inflectional_possessive_pronoun.rb +8 -0
- data/lib/sastrawi/stemmer/context/visitor/remove_plain_prefix.rb +8 -0
- data/lib/sastrawi/stemmer/stemmer.rb +18 -0
- data/lib/sastrawi/stemmer/stemmer_factory.rb +3 -0
- data/lib/sastrawi/stop_word_remover/stop_word_remover.rb +3 -0
- data/lib/sastrawi/stop_word_remover/stop_word_remover_factory.rb +1 -1
- data/lib/sastrawi/version.rb +1 -1
- metadata +4 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 18d883e3d11d60b35c08f9161c338da411e0b19f
|
4
|
+
data.tar.gz: b9b9c4aab58ca6c23d03f991cfe69851850a556f
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 3e0838a8500a8225083c4504ead591ab769536c62cc93202da4bee6bd24cb4cde4f49121914346106f98f9344a326466ee10cedfed546abce8f33e4d6ba4a6c6
|
7
|
+
data.tar.gz: 8f5cf8a627c7a474dc64de6729843c4fd5f3bbf6e4e70f9d3ef6065efe7f53592185cee8894d3f5dacffb6ab13b145863a5762e24af781818aee2bdc6d29d3b9
|
data/CONTRIBUTING.md
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
# Contributing to sastrawi-ruby
|
2
|
+
|
3
|
+
If you find a bug, please report it to the [issue tracker][issue]. You can also
|
4
|
+
contribute by writing code.
|
5
|
+
|
6
|
+
## How to contribute
|
7
|
+
|
8
|
+
There are some steps you must follow:
|
9
|
+
|
10
|
+
1. Fork this repository and clone it to your local environment
|
11
|
+
2. Create a named branch that contains your change
|
12
|
+
3. Install the development dependencies by running `bundle install`
|
13
|
+
4. Code
|
14
|
+
5. Add or adjust unit tests and make sure everything passes by running `bundle
|
15
|
+
exec rake`
|
16
|
+
6. Push your branch to GitHub
|
17
|
+
7. Send a pull request for your branch
|
18
|
+
|
19
|
+
Use the `dev` branch as the target of your pull request. Both issue and pull
|
20
|
+
request details must be written in English.
|
21
|
+
|
22
|
+
[issue]: https://github.com/meisyal/sastrawi-ruby/issues
|
data/README.md
CHANGED
@@ -77,9 +77,8 @@ puts stemmer.stem('internetan')
|
|
77
77
|
|
78
78
|
## Contributing
|
79
79
|
|
80
|
-
Contributions are welcome.
|
81
|
-
|
82
|
-
request. Both issue and pull request details must be written in English.
|
80
|
+
Contributions are welcome. Please read the [CONTRIBUTING][contributing]
|
81
|
+
guidelines.
|
83
82
|
|
84
83
|
## License
|
85
84
|
|
@@ -92,7 +91,7 @@ Attribution-NonCommercial-ShareAlike 3.0 Unported License][kateglolicense].
|
|
92
91
|
[stemmingwiki]: https://en.wikipedia.org/wiki/Stemming
|
93
92
|
[howstemmingworks]: https://github.com/sastrawi/sastrawi/wiki/Stemming-Bahasa-Indonesia
|
94
93
|
[documentation]: https://github.com/meisyal/sastrawi-ruby/wiki
|
95
|
-
[
|
94
|
+
[contributing]: https://github.com/meisyal/sastrawi-ruby/blob/master/CONTRIBUTING.md
|
96
95
|
[license]: https://github.com/meisyal/sastrawi-ruby/blob/master/LICENSE.txt
|
97
96
|
[kateglo]: http://kateglo.com
|
98
97
|
[kateglolicense]: https://creativecommons.org/licenses/by-nc-sa/3.0/
|
data/Rakefile
CHANGED
data/_config.yml
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
theme: jekyll-theme-cayman
|
@@ -9,26 +9,41 @@ module Sastrawi
|
|
9
9
|
add_words(words)
|
10
10
|
end
|
11
11
|
|
12
|
+
##
|
13
|
+
# Check whether a word is contained in the dictionary
|
14
|
+
|
12
15
|
def contains?(word)
|
13
16
|
@words.include?(word)
|
14
17
|
end
|
15
18
|
|
19
|
+
##
|
20
|
+
# Count how many words in the dictionary
|
21
|
+
|
16
22
|
def count
|
17
23
|
@words.length
|
18
24
|
end
|
19
25
|
|
26
|
+
##
|
27
|
+
# Add multiple words to the dictionary
|
28
|
+
|
20
29
|
def add_words(new_words)
|
21
30
|
new_words.each do |word|
|
22
31
|
add(word)
|
23
32
|
end
|
24
33
|
end
|
25
34
|
|
35
|
+
##
|
36
|
+
# Add a word to the dictionary
|
37
|
+
|
26
38
|
def add(word)
|
27
39
|
return if word == ''
|
28
40
|
|
29
41
|
@words.push(word)
|
30
42
|
end
|
31
43
|
|
44
|
+
##
|
45
|
+
# Add words from a text file to the dictionary
|
46
|
+
|
32
47
|
def add_words_from_text_file(file_path)
|
33
48
|
words = []
|
34
49
|
|
@@ -41,6 +56,9 @@ module Sastrawi
|
|
41
56
|
add_words(words)
|
42
57
|
end
|
43
58
|
|
59
|
+
##
|
60
|
+
# Remove a word from the dictionary
|
61
|
+
|
44
62
|
def remove(word)
|
45
63
|
@words.delete(word)
|
46
64
|
end
|
@@ -3,7 +3,7 @@ module Sastrawi
|
|
3
3
|
module Disambiguator
|
4
4
|
class DisambiguatorPrefixRule37a
|
5
5
|
def disambiguate(word)
|
6
|
-
contains = /^([
|
6
|
+
contains = /^([bcdfghjklmnpqrstvwxyz])(er[aiueo])(.*)$/.match(word)
|
7
7
|
|
8
8
|
if contains
|
9
9
|
matches = contains.captures
|
@@ -3,7 +3,7 @@ module Sastrawi
|
|
3
3
|
module Disambiguator
|
4
4
|
class DisambiguatorPrefixRule37b
|
5
5
|
def disambiguate(word)
|
6
|
-
contains = /^([
|
6
|
+
contains = /^([bcdfghjklmnpqrstvwxyz])er([aiueo])(.*)$/.match(word)
|
7
7
|
|
8
8
|
if contains
|
9
9
|
matches = contains.captures
|
@@ -3,7 +3,7 @@ module Sastrawi
|
|
3
3
|
module Disambiguator
|
4
4
|
class DisambiguatorPrefixRule38a
|
5
5
|
def disambiguate(word)
|
6
|
-
contains = /^([
|
6
|
+
contains = /^([bcdfghjklmnpqrstvwxyz])(el[aiueo])(.*)$/.match(word)
|
7
7
|
|
8
8
|
if contains
|
9
9
|
matches = contains.captures
|
@@ -3,7 +3,7 @@ module Sastrawi
|
|
3
3
|
module Disambiguator
|
4
4
|
class DisambiguatorPrefixRule38b
|
5
5
|
def disambiguate(word)
|
6
|
-
contains = /^([
|
6
|
+
contains = /^([bcdfghjklmnpqrstvwxyz])el([aiueo])(.*)$/.match(word)
|
7
7
|
|
8
8
|
if contains
|
9
9
|
matches = contains.captures
|
@@ -3,7 +3,7 @@ module Sastrawi
|
|
3
3
|
module Disambiguator
|
4
4
|
class DisambiguatorPrefixRule39a
|
5
5
|
def disambiguate(word)
|
6
|
-
contains = /^([
|
6
|
+
contains = /^([bcdfghjklmnpqrstvwxyz])(em[aiueo])(.*)$/.match(word)
|
7
7
|
|
8
8
|
if contains
|
9
9
|
matches = contains.captures
|
@@ -3,7 +3,7 @@ module Sastrawi
|
|
3
3
|
module Disambiguator
|
4
4
|
class DisambiguatorPrefixRule39b
|
5
5
|
def disambiguate(word)
|
6
|
-
contains = /^([
|
6
|
+
contains = /^([bcdfghjklmnpqrstvwxyz])em([aiueo])(.*)$/.match(word)
|
7
7
|
|
8
8
|
if contains
|
9
9
|
matches = contains.captures
|
@@ -3,7 +3,7 @@ module Sastrawi
|
|
3
3
|
module Disambiguator
|
4
4
|
class DisambiguatorPrefixRule40a
|
5
5
|
def disambiguate(word)
|
6
|
-
contains = /^([
|
6
|
+
contains = /^([bcdfghjklmnpqrstvwxyz])(in[aiueo])(.*)$/.match(word)
|
7
7
|
|
8
8
|
if contains
|
9
9
|
matches = contains.captures
|
@@ -3,7 +3,7 @@ module Sastrawi
|
|
3
3
|
module Disambiguator
|
4
4
|
class DisambiguatorPrefixRule40b
|
5
5
|
def disambiguate(word)
|
6
|
-
contains = /^([
|
6
|
+
contains = /^([bcdfghjklmnpqrstvwxyz])in([aiueo])(.*)$/.match(word)
|
7
7
|
|
8
8
|
if contains
|
9
9
|
matches = contains.captures
|
@@ -3,7 +3,7 @@ module Sastrawi
|
|
3
3
|
module Disambiguator
|
4
4
|
class DisambiguatorPrefixRule5
|
5
5
|
def disambiguate(word)
|
6
|
-
contains = /^be([
|
6
|
+
contains = /^be([bcdfghjklmnpqstvwxyz])(er[bcdfghjklmnpqrstvwxyz])(.*)$/.match(word)
|
7
7
|
|
8
8
|
if contains
|
9
9
|
matches = contains.captures
|
@@ -1,3 +1,8 @@
|
|
1
|
+
##
|
2
|
+
# Confix Stripping Rule Precedence Adjustment Specification
|
3
|
+
# Asian J. (2007) "Effective Techniques for Indonesian Text Retrieval" page 78-79
|
4
|
+
# http://researchbank.rmit.edu.au/eserv/rmit:6312/Asian.pdf
|
5
|
+
|
1
6
|
module Sastrawi
|
2
7
|
module Stemmer
|
3
8
|
module ConfixStripping
|
@@ -1,5 +1,9 @@
|
|
1
1
|
require 'sastrawi/stemmer/confix_stripping/precedence_adjustment_specification'
|
2
2
|
|
3
|
+
##
|
4
|
+
# Stemming context using Nazief and Adriani, Confix Stripping (CS),
|
5
|
+
# Enhanced Confix Stripping (ECS), and Improved ECS
|
6
|
+
|
3
7
|
module Sastrawi
|
4
8
|
module Stemmer
|
5
9
|
module Context
|
@@ -37,6 +41,9 @@ module Sastrawi
|
|
37
41
|
@removals.push(removal)
|
38
42
|
end
|
39
43
|
|
44
|
+
##
|
45
|
+
# Execute stemming process
|
46
|
+
|
40
47
|
def execute
|
41
48
|
start_stemming_process
|
42
49
|
|
@@ -56,6 +63,10 @@ module Sastrawi
|
|
56
63
|
|
57
64
|
cs_precendence_adjustment_specification = Sastrawi::Stemmer::ConfixStripping::PrecedenceAdjustmentSpecification.new
|
58
65
|
|
66
|
+
##
|
67
|
+
# Confix stripping
|
68
|
+
# try to remove prefix before suffix if the specification is met
|
69
|
+
|
59
70
|
if cs_precendence_adjustment_specification.satisfied_by?(@original_word)
|
60
71
|
remove_prefixes
|
61
72
|
return if @dictionary.contains?(@current_word)
|
@@ -78,6 +89,9 @@ module Sastrawi
|
|
78
89
|
loop_last_return
|
79
90
|
end
|
80
91
|
|
92
|
+
##
|
93
|
+
# ECS loop last return
|
94
|
+
|
81
95
|
def loop_last_return
|
82
96
|
restore_prefix
|
83
97
|
|
@@ -147,10 +161,16 @@ module Sastrawi
|
|
147
161
|
end
|
148
162
|
end
|
149
163
|
|
164
|
+
##
|
165
|
+
# Check whether the removed part is a suffix
|
166
|
+
|
150
167
|
def suffix_removal?(removal)
|
151
168
|
removal.affix_type == 'DS' || removal.affix_type == 'PP' || removal.affix_type == 'P'
|
152
169
|
end
|
153
170
|
|
171
|
+
##
|
172
|
+
# Restore prefix to proceed with ECS loop last return
|
173
|
+
|
154
174
|
def restore_prefix
|
155
175
|
@removals.each do |removal|
|
156
176
|
if removal.affix_type == 'DP'
|
@@ -1,5 +1,10 @@
|
|
1
1
|
require 'sastrawi/stemmer/context/removal'
|
2
2
|
|
3
|
+
##
|
4
|
+
# Remove derivational suffix
|
5
|
+
# Asian J. (2007) "Effective Techniques for Indonesian Text Retrieval" page 61
|
6
|
+
# http://researchbank.rmit.edu.au/eserv/rmit:6312/Asian.pdf
|
7
|
+
|
3
8
|
module Sastrawi
|
4
9
|
module Stemmer
|
5
10
|
module Context
|
@@ -18,6 +23,10 @@ module Sastrawi
|
|
18
23
|
end
|
19
24
|
end
|
20
25
|
|
26
|
+
##
|
27
|
+
# Original rule: i|kan|an
|
28
|
+
# Added the adopted foreign suffix rule: is|isme|isasi
|
29
|
+
|
21
30
|
def remove_suffix(word)
|
22
31
|
word.sub(/(is|isme|isasi|i|kan|an)$/, '')
|
23
32
|
end
|
@@ -1,3 +1,8 @@
|
|
1
|
+
##
|
2
|
+
# Remove inflectional particle
|
3
|
+
# Asian J. (2007) "Effective Techniques for Indonesian Text Retrieval" page 60
|
4
|
+
# http://researchbank.rmit.edu.au/eserv/rmit:6312/Asian.pdf
|
5
|
+
|
1
6
|
module Sastrawi
|
2
7
|
module Stemmer
|
3
8
|
module Context
|
@@ -16,6 +21,9 @@ module Sastrawi
|
|
16
21
|
end
|
17
22
|
end
|
18
23
|
|
24
|
+
##
|
25
|
+
# Remove inflectional particle: lah|kah|tah|pun
|
26
|
+
|
19
27
|
def remove(word)
|
20
28
|
word.sub(/-*(lah|kah|tah|pun)$/, '')
|
21
29
|
end
|
@@ -1,3 +1,8 @@
|
|
1
|
+
##
|
2
|
+
# Remove inflectional possessive pronoun
|
3
|
+
# Asian J. (2007) "Effective Techniques for Indonesian Text Retrieval" page 60
|
4
|
+
# http://researchbank.rmit.edu.au/eserv/rmit:6312/Asian.pdf
|
5
|
+
|
1
6
|
module Sastrawi
|
2
7
|
module Stemmer
|
3
8
|
module Context
|
@@ -16,6 +21,9 @@ module Sastrawi
|
|
16
21
|
end
|
17
22
|
end
|
18
23
|
|
24
|
+
##
|
25
|
+
# Remove inflectional possessive pronoun: ku|mu|nya
|
26
|
+
|
19
27
|
def remove(word)
|
20
28
|
word.sub(/-*(ku|mu|nya)$/, '')
|
21
29
|
end
|
@@ -1,3 +1,8 @@
|
|
1
|
+
##
|
2
|
+
# Remove plain prefix
|
3
|
+
# Asian J. (2007) "Effective Techniques for Indonesian Text Retrieval" page 61
|
4
|
+
# http://researchbank.rmit.edu.au/eserv/rmit:6312/Asian.pdf
|
5
|
+
|
1
6
|
module Sastrawi
|
2
7
|
module Stemmer
|
3
8
|
module Context
|
@@ -16,6 +21,9 @@ module Sastrawi
|
|
16
21
|
end
|
17
22
|
end
|
18
23
|
|
24
|
+
##
|
25
|
+
# Remove plain prefix: di|ke|se
|
26
|
+
|
19
27
|
def remove(word)
|
20
28
|
word.sub(/^(di|ke|se)/, '')
|
21
29
|
end
|
@@ -4,6 +4,10 @@ require 'sastrawi/stemmer/context/visitor/visitor_provider'
|
|
4
4
|
|
5
5
|
require 'sastrawi/stemmer/filter/text_normalizer'
|
6
6
|
|
7
|
+
##
|
8
|
+
# Indonesian Stemmer
|
9
|
+
# Nazief & Adriani, CS Stemmer, ECS Stemmer, Improved ECS
|
10
|
+
|
7
11
|
module Sastrawi
|
8
12
|
module Stemmer
|
9
13
|
class Stemmer
|
@@ -14,6 +18,9 @@ module Sastrawi
|
|
14
18
|
@visitor_provider = Sastrawi::Stemmer::Context::Visitor::VisitorProvider.new
|
15
19
|
end
|
16
20
|
|
21
|
+
##
|
22
|
+
# Stem a string to its base form
|
23
|
+
|
17
24
|
def stem(text)
|
18
25
|
normalized_text = Sastrawi::Stemmer::Filter::TextNormalizer.normalize_text(text)
|
19
26
|
|
@@ -27,6 +34,9 @@ module Sastrawi
|
|
27
34
|
stems.join(' ')
|
28
35
|
end
|
29
36
|
|
37
|
+
##
|
38
|
+
# Stem a word to its base form
|
39
|
+
|
30
40
|
def stem_word(word)
|
31
41
|
if plural?(word)
|
32
42
|
stem_plural_word(word)
|
@@ -43,6 +53,11 @@ module Sastrawi
|
|
43
53
|
return word.include?('-')
|
44
54
|
end
|
45
55
|
|
56
|
+
##
|
57
|
+
# Stem a plural word to its base form
|
58
|
+
# Asian J. (2007) "Effective Techniques for Indonesian Text Retrieval"
|
59
|
+
# page 76-77
|
60
|
+
|
46
61
|
def stem_plural_word(word)
|
47
62
|
first_match = /^(.*)-(.*)$/.match(word)
|
48
63
|
|
@@ -72,6 +87,9 @@ module Sastrawi
|
|
72
87
|
end
|
73
88
|
end
|
74
89
|
|
90
|
+
##
|
91
|
+
# Stem a singular word to its base form
|
92
|
+
|
75
93
|
def stem_singular_word(word)
|
76
94
|
context = Sastrawi::Stemmer::Context::Context.new(word, @dictionary, @visitor_provider)
|
77
95
|
context.execute
|
data/lib/sastrawi/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: sastrawi
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.0
|
4
|
+
version: 0.1.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Andrias Meisyal
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2017-
|
11
|
+
date: 2017-09-09 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -61,10 +61,12 @@ extra_rdoc_files: []
|
|
61
61
|
files:
|
62
62
|
- ".gitignore"
|
63
63
|
- ".travis.yml"
|
64
|
+
- CONTRIBUTING.md
|
64
65
|
- Gemfile
|
65
66
|
- LICENSE.txt
|
66
67
|
- README.md
|
67
68
|
- Rakefile
|
69
|
+
- _config.yml
|
68
70
|
- data/base-word.txt
|
69
71
|
- lib/sastrawi.rb
|
70
72
|
- lib/sastrawi/dictionary/array_dictionary.rb
|