sastrawi 0.1.0 → 0.1.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CONTRIBUTING.md +22 -0
- data/README.md +3 -4
- data/Rakefile +2 -2
- data/_config.yml +1 -0
- data/lib/sastrawi/dictionary/array_dictionary.rb +18 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule37a.rb +1 -1
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule37b.rb +1 -1
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule38a.rb +1 -1
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule38b.rb +1 -1
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule39a.rb +1 -1
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule39b.rb +1 -1
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule40a.rb +1 -1
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule40b.rb +1 -1
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule5.rb +1 -1
- data/lib/sastrawi/morphology/invalid_affix_pair_specification.rb +4 -0
- data/lib/sastrawi/stemmer/confix_stripping/precedence_adjustment_specification.rb +5 -0
- data/lib/sastrawi/stemmer/context/context.rb +20 -0
- data/lib/sastrawi/stemmer/context/visitor/remove_derivational_suffix.rb +9 -0
- data/lib/sastrawi/stemmer/context/visitor/remove_inflectional_particle.rb +8 -0
- data/lib/sastrawi/stemmer/context/visitor/remove_inflectional_possessive_pronoun.rb +8 -0
- data/lib/sastrawi/stemmer/context/visitor/remove_plain_prefix.rb +8 -0
- data/lib/sastrawi/stemmer/stemmer.rb +18 -0
- data/lib/sastrawi/stemmer/stemmer_factory.rb +3 -0
- data/lib/sastrawi/stop_word_remover/stop_word_remover.rb +3 -0
- data/lib/sastrawi/stop_word_remover/stop_word_remover_factory.rb +1 -1
- data/lib/sastrawi/version.rb +1 -1
- metadata +4 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 18d883e3d11d60b35c08f9161c338da411e0b19f
|
4
|
+
data.tar.gz: b9b9c4aab58ca6c23d03f991cfe69851850a556f
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 3e0838a8500a8225083c4504ead591ab769536c62cc93202da4bee6bd24cb4cde4f49121914346106f98f9344a326466ee10cedfed546abce8f33e4d6ba4a6c6
|
7
|
+
data.tar.gz: 8f5cf8a627c7a474dc64de6729843c4fd5f3bbf6e4e70f9d3ef6065efe7f53592185cee8894d3f5dacffb6ab13b145863a5762e24af781818aee2bdc6d29d3b9
|
data/CONTRIBUTING.md
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
# Contributing to sastrawi-ruby
|
2
|
+
|
3
|
+
If you find a bug, please report it to the [issue tracker][issue]. You can also
|
4
|
+
contribute by writing code.
|
5
|
+
|
6
|
+
## How to contribute
|
7
|
+
|
8
|
+
There are some steps you must follow:
|
9
|
+
|
10
|
+
1. Fork this repository and clone it to your local environment
|
11
|
+
2. Create a named branch that contains your change
|
12
|
+
3. Install the development dependencies by running `bundle install`
|
13
|
+
4. Code
|
14
|
+
5. Add or adjust unit tests and make sure everything passes by running `bundle
|
15
|
+
exec rake`
|
16
|
+
6. Push your branch to GitHub
|
17
|
+
7. Send a pull request for your branch
|
18
|
+
|
19
|
+
Use the `dev` branch as the target of your pull request. Both issue and pull
|
20
|
+
request details must be written in English.
|
21
|
+
|
22
|
+
[issue]: https://github.com/meisyal/sastrawi-ruby/issues
|
data/README.md
CHANGED
@@ -77,9 +77,8 @@ puts stemmer.stem('internetan')
|
|
77
77
|
|
78
78
|
## Contributing
|
79
79
|
|
80
|
-
Contributions are welcome.
|
81
|
-
|
82
|
-
request. Both issue and pull request details must be written in English.
|
80
|
+
Contributions are welcome. Please read the [CONTRIBUTING][contributing]
|
81
|
+
guidelines.
|
83
82
|
|
84
83
|
## License
|
85
84
|
|
@@ -92,7 +91,7 @@ Attribution-NonCommercial-ShareAlike 3.0 Unported License][kateglolicense].
|
|
92
91
|
[stemmingwiki]: https://en.wikipedia.org/wiki/Stemming
|
93
92
|
[howstemmingworks]: https://github.com/sastrawi/sastrawi/wiki/Stemming-Bahasa-Indonesia
|
94
93
|
[documentation]: https://github.com/meisyal/sastrawi-ruby/wiki
|
95
|
-
[
|
94
|
+
[contributing]: https://github.com/meisyal/sastrawi-ruby/blob/master/CONTRIBUTING.md
|
96
95
|
[license]: https://github.com/meisyal/sastrawi-ruby/blob/master/LICENSE.txt
|
97
96
|
[kateglo]: http://kateglo.com
|
98
97
|
[kateglolicense]: https://creativecommons.org/licenses/by-nc-sa/3.0/
|
data/Rakefile
CHANGED
data/_config.yml
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
theme: jekyll-theme-cayman
|
@@ -9,26 +9,41 @@ module Sastrawi
|
|
9
9
|
add_words(words)
|
10
10
|
end
|
11
11
|
|
12
|
+
##
|
13
|
+
# Check whether a word is contained in the dictionary
|
14
|
+
|
12
15
|
def contains?(word)
|
13
16
|
@words.include?(word)
|
14
17
|
end
|
15
18
|
|
19
|
+
##
|
20
|
+
# Count how many words are in the dictionary
|
21
|
+
|
16
22
|
def count
|
17
23
|
@words.length
|
18
24
|
end
|
19
25
|
|
26
|
+
##
|
27
|
+
# Add multiple words to the dictionary
|
28
|
+
|
20
29
|
def add_words(new_words)
|
21
30
|
new_words.each do |word|
|
22
31
|
add(word)
|
23
32
|
end
|
24
33
|
end
|
25
34
|
|
35
|
+
##
|
36
|
+
# Add a word to the dictionary
|
37
|
+
|
26
38
|
def add(word)
|
27
39
|
return if word == ''
|
28
40
|
|
29
41
|
@words.push(word)
|
30
42
|
end
|
31
43
|
|
44
|
+
##
|
45
|
+
# Add words from a text file to the dictionary
|
46
|
+
|
32
47
|
def add_words_from_text_file(file_path)
|
33
48
|
words = []
|
34
49
|
|
@@ -41,6 +56,9 @@ module Sastrawi
|
|
41
56
|
add_words(words)
|
42
57
|
end
|
43
58
|
|
59
|
+
##
|
60
|
+
# Remove a word from the dictionary
|
61
|
+
|
44
62
|
def remove(word)
|
45
63
|
@words.delete(word)
|
46
64
|
end
|
@@ -3,7 +3,7 @@ module Sastrawi
|
|
3
3
|
module Disambiguator
|
4
4
|
class DisambiguatorPrefixRule37a
|
5
5
|
def disambiguate(word)
|
6
|
-
contains = /^([
|
6
|
+
contains = /^([bcdfghjklmnpqrstvwxyz])(er[aiueo])(.*)$/.match(word)
|
7
7
|
|
8
8
|
if contains
|
9
9
|
matches = contains.captures
|
@@ -3,7 +3,7 @@ module Sastrawi
|
|
3
3
|
module Disambiguator
|
4
4
|
class DisambiguatorPrefixRule37b
|
5
5
|
def disambiguate(word)
|
6
|
-
contains = /^([
|
6
|
+
contains = /^([bcdfghjklmnpqrstvwxyz])er([aiueo])(.*)$/.match(word)
|
7
7
|
|
8
8
|
if contains
|
9
9
|
matches = contains.captures
|
@@ -3,7 +3,7 @@ module Sastrawi
|
|
3
3
|
module Disambiguator
|
4
4
|
class DisambiguatorPrefixRule38a
|
5
5
|
def disambiguate(word)
|
6
|
-
contains = /^([
|
6
|
+
contains = /^([bcdfghjklmnpqrstvwxyz])(el[aiueo])(.*)$/.match(word)
|
7
7
|
|
8
8
|
if contains
|
9
9
|
matches = contains.captures
|
@@ -3,7 +3,7 @@ module Sastrawi
|
|
3
3
|
module Disambiguator
|
4
4
|
class DisambiguatorPrefixRule38b
|
5
5
|
def disambiguate(word)
|
6
|
-
contains = /^([
|
6
|
+
contains = /^([bcdfghjklmnpqrstvwxyz])el([aiueo])(.*)$/.match(word)
|
7
7
|
|
8
8
|
if contains
|
9
9
|
matches = contains.captures
|
@@ -3,7 +3,7 @@ module Sastrawi
|
|
3
3
|
module Disambiguator
|
4
4
|
class DisambiguatorPrefixRule39a
|
5
5
|
def disambiguate(word)
|
6
|
-
contains = /^([
|
6
|
+
contains = /^([bcdfghjklmnpqrstvwxyz])(em[aiueo])(.*)$/.match(word)
|
7
7
|
|
8
8
|
if contains
|
9
9
|
matches = contains.captures
|
@@ -3,7 +3,7 @@ module Sastrawi
|
|
3
3
|
module Disambiguator
|
4
4
|
class DisambiguatorPrefixRule39b
|
5
5
|
def disambiguate(word)
|
6
|
-
contains = /^([
|
6
|
+
contains = /^([bcdfghjklmnpqrstvwxyz])em([aiueo])(.*)$/.match(word)
|
7
7
|
|
8
8
|
if contains
|
9
9
|
matches = contains.captures
|
@@ -3,7 +3,7 @@ module Sastrawi
|
|
3
3
|
module Disambiguator
|
4
4
|
class DisambiguatorPrefixRule40a
|
5
5
|
def disambiguate(word)
|
6
|
-
contains = /^([
|
6
|
+
contains = /^([bcdfghjklmnpqrstvwxyz])(in[aiueo])(.*)$/.match(word)
|
7
7
|
|
8
8
|
if contains
|
9
9
|
matches = contains.captures
|
@@ -3,7 +3,7 @@ module Sastrawi
|
|
3
3
|
module Disambiguator
|
4
4
|
class DisambiguatorPrefixRule40b
|
5
5
|
def disambiguate(word)
|
6
|
-
contains = /^([
|
6
|
+
contains = /^([bcdfghjklmnpqrstvwxyz])in([aiueo])(.*)$/.match(word)
|
7
7
|
|
8
8
|
if contains
|
9
9
|
matches = contains.captures
|
@@ -3,7 +3,7 @@ module Sastrawi
|
|
3
3
|
module Disambiguator
|
4
4
|
class DisambiguatorPrefixRule5
|
5
5
|
def disambiguate(word)
|
6
|
-
contains = /^be([
|
6
|
+
contains = /^be([bcdfghjklmnpqstvwxyz])(er[bcdfghjklmnpqrstvwxyz])(.*)$/.match(word)
|
7
7
|
|
8
8
|
if contains
|
9
9
|
matches = contains.captures
|
@@ -1,3 +1,8 @@
|
|
1
|
+
##
|
2
|
+
# Confix Stripping Rule Precedence Adjustment Specification
|
3
|
+
# Asian J. (2007) "Effective Techniques for Indonesian Text Retrieval" page 78-79
|
4
|
+
# http://researchbank.rmit.edu.au/eserv/rmit:6312/Asian.pdf
|
5
|
+
|
1
6
|
module Sastrawi
|
2
7
|
module Stemmer
|
3
8
|
module ConfixStripping
|
@@ -1,5 +1,9 @@
|
|
1
1
|
require 'sastrawi/stemmer/confix_stripping/precedence_adjustment_specification'
|
2
2
|
|
3
|
+
##
|
4
|
+
# Stemming context using Nazief and Adriani, Confix Stripping (CS),
|
5
|
+
# Enhanced Confix Stripping (ECS), and Improved ECS
|
6
|
+
|
3
7
|
module Sastrawi
|
4
8
|
module Stemmer
|
5
9
|
module Context
|
@@ -37,6 +41,9 @@ module Sastrawi
|
|
37
41
|
@removals.push(removal)
|
38
42
|
end
|
39
43
|
|
44
|
+
##
|
45
|
+
# Execute stemming process
|
46
|
+
|
40
47
|
def execute
|
41
48
|
start_stemming_process
|
42
49
|
|
@@ -56,6 +63,10 @@ module Sastrawi
|
|
56
63
|
|
57
64
|
cs_precendence_adjustment_specification = Sastrawi::Stemmer::ConfixStripping::PrecedenceAdjustmentSpecification.new
|
58
65
|
|
66
|
+
##
|
67
|
+
# Confix stripping
|
68
|
+
# try to remove prefix before suffix if the specification is met
|
69
|
+
|
59
70
|
if cs_precendence_adjustment_specification.satisfied_by?(@original_word)
|
60
71
|
remove_prefixes
|
61
72
|
return if @dictionary.contains?(@current_word)
|
@@ -78,6 +89,9 @@ module Sastrawi
|
|
78
89
|
loop_last_return
|
79
90
|
end
|
80
91
|
|
92
|
+
##
|
93
|
+
# ECS loop last return
|
94
|
+
|
81
95
|
def loop_last_return
|
82
96
|
restore_prefix
|
83
97
|
|
@@ -147,10 +161,16 @@ module Sastrawi
|
|
147
161
|
end
|
148
162
|
end
|
149
163
|
|
164
|
+
##
|
165
|
+
# Check whether the removed part is a suffix
|
166
|
+
|
150
167
|
def suffix_removal?(removal)
|
151
168
|
removal.affix_type == 'DS' || removal.affix_type == 'PP' || removal.affix_type == 'P'
|
152
169
|
end
|
153
170
|
|
171
|
+
##
|
172
|
+
# Restore prefix to proceed with ECS loop last return
|
173
|
+
|
154
174
|
def restore_prefix
|
155
175
|
@removals.each do |removal|
|
156
176
|
if removal.affix_type == 'DP'
|
@@ -1,5 +1,10 @@
|
|
1
1
|
require 'sastrawi/stemmer/context/removal'
|
2
2
|
|
3
|
+
##
|
4
|
+
# Remove derivational suffix
|
5
|
+
# Asian J. (2007) "Effective Techniques for Indonesian Text Retrieval" page 61
|
6
|
+
# http://researchbank.rmit.edu.au/eserv/rmit:6312/Asian.pdf
|
7
|
+
|
3
8
|
module Sastrawi
|
4
9
|
module Stemmer
|
5
10
|
module Context
|
@@ -18,6 +23,10 @@ module Sastrawi
|
|
18
23
|
end
|
19
24
|
end
|
20
25
|
|
26
|
+
##
|
27
|
+
# Original rule: i|kan|an
|
28
|
+
# Added the adopted foreign suffix rule: is|isme|isasi
|
29
|
+
|
21
30
|
def remove_suffix(word)
|
22
31
|
word.sub(/(is|isme|isasi|i|kan|an)$/, '')
|
23
32
|
end
|
@@ -1,3 +1,8 @@
|
|
1
|
+
##
|
2
|
+
# Remove inflectional particle
|
3
|
+
# Asian J. (2007) "Effective Techniques for Indonesian Text Retrieval" page 60
|
4
|
+
# http://researchbank.rmit.edu.au/eserv/rmit:6312/Asian.pdf
|
5
|
+
|
1
6
|
module Sastrawi
|
2
7
|
module Stemmer
|
3
8
|
module Context
|
@@ -16,6 +21,9 @@ module Sastrawi
|
|
16
21
|
end
|
17
22
|
end
|
18
23
|
|
24
|
+
##
|
25
|
+
# Remove inflectional particle: lah|kah|tah|pun
|
26
|
+
|
19
27
|
def remove(word)
|
20
28
|
word.sub(/-*(lah|kah|tah|pun)$/, '')
|
21
29
|
end
|
@@ -1,3 +1,8 @@
|
|
1
|
+
##
|
2
|
+
# Remove inflectional possessive pronoun
|
3
|
+
# Asian J. (2007) "Effective Techniques for Indonesian Text Retrieval" page 60
|
4
|
+
# http://researchbank.rmit.edu.au/eserv/rmit:6312/Asian.pdf
|
5
|
+
|
1
6
|
module Sastrawi
|
2
7
|
module Stemmer
|
3
8
|
module Context
|
@@ -16,6 +21,9 @@ module Sastrawi
|
|
16
21
|
end
|
17
22
|
end
|
18
23
|
|
24
|
+
##
|
25
|
+
# Remove inflectional possessive pronoun: ku|mu|nya
|
26
|
+
|
19
27
|
def remove(word)
|
20
28
|
word.sub(/-*(ku|mu|nya)$/, '')
|
21
29
|
end
|
@@ -1,3 +1,8 @@
|
|
1
|
+
##
|
2
|
+
# Remove plain prefix
|
3
|
+
# Asian J. (2007) "Effective Techniques for Indonesian Text Retrieval" page 61
|
4
|
+
# http://researchbank.rmit.edu.au/eserv/rmit:6312/Asian.pdf
|
5
|
+
|
1
6
|
module Sastrawi
|
2
7
|
module Stemmer
|
3
8
|
module Context
|
@@ -16,6 +21,9 @@ module Sastrawi
|
|
16
21
|
end
|
17
22
|
end
|
18
23
|
|
24
|
+
##
|
25
|
+
# Remove plain prefix: di|ke|se
|
26
|
+
|
19
27
|
def remove(word)
|
20
28
|
word.sub(/^(di|ke|se)/, '')
|
21
29
|
end
|
@@ -4,6 +4,10 @@ require 'sastrawi/stemmer/context/visitor/visitor_provider'
|
|
4
4
|
|
5
5
|
require 'sastrawi/stemmer/filter/text_normalizer'
|
6
6
|
|
7
|
+
##
|
8
|
+
# Indonesian Stemmer
|
9
|
+
# Nazief & Adriani, CS Stemmer, ECS Stemmer, Improved ECS
|
10
|
+
|
7
11
|
module Sastrawi
|
8
12
|
module Stemmer
|
9
13
|
class Stemmer
|
@@ -14,6 +18,9 @@ module Sastrawi
|
|
14
18
|
@visitor_provider = Sastrawi::Stemmer::Context::Visitor::VisitorProvider.new
|
15
19
|
end
|
16
20
|
|
21
|
+
##
|
22
|
+
# Stem a string to its base form
|
23
|
+
|
17
24
|
def stem(text)
|
18
25
|
normalized_text = Sastrawi::Stemmer::Filter::TextNormalizer.normalize_text(text)
|
19
26
|
|
@@ -27,6 +34,9 @@ module Sastrawi
|
|
27
34
|
stems.join(' ')
|
28
35
|
end
|
29
36
|
|
37
|
+
##
|
38
|
+
# Stem a word to its base form
|
39
|
+
|
30
40
|
def stem_word(word)
|
31
41
|
if plural?(word)
|
32
42
|
stem_plural_word(word)
|
@@ -43,6 +53,11 @@ module Sastrawi
|
|
43
53
|
return word.include?('-')
|
44
54
|
end
|
45
55
|
|
56
|
+
##
|
57
|
+
# Stem a plural word to its base form
|
58
|
+
# Asian J. (2007) "Effective Techniques for Indonesian Text Retrieval"
|
59
|
+
# page 76-77
|
60
|
+
|
46
61
|
def stem_plural_word(word)
|
47
62
|
first_match = /^(.*)-(.*)$/.match(word)
|
48
63
|
|
@@ -72,6 +87,9 @@ module Sastrawi
|
|
72
87
|
end
|
73
88
|
end
|
74
89
|
|
90
|
+
##
|
91
|
+
# Stem a singular word to its base form
|
92
|
+
|
75
93
|
def stem_singular_word(word)
|
76
94
|
context = Sastrawi::Stemmer::Context::Context.new(word, @dictionary, @visitor_provider)
|
77
95
|
context.execute
|
data/lib/sastrawi/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: sastrawi
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Andrias Meisyal
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2017-
|
11
|
+
date: 2017-09-09 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -61,10 +61,12 @@ extra_rdoc_files: []
|
|
61
61
|
files:
|
62
62
|
- ".gitignore"
|
63
63
|
- ".travis.yml"
|
64
|
+
- CONTRIBUTING.md
|
64
65
|
- Gemfile
|
65
66
|
- LICENSE.txt
|
66
67
|
- README.md
|
67
68
|
- Rakefile
|
69
|
+
- _config.yml
|
68
70
|
- data/base-word.txt
|
69
71
|
- lib/sastrawi.rb
|
70
72
|
- lib/sastrawi/dictionary/array_dictionary.rb
|