sastrawi 0.1.0.pre → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.travis.yml +6 -4
- data/README.md +38 -10
- data/data/{kata-dasar.txt → base-word.txt} +0 -0
- data/lib/sastrawi.rb +0 -9
- data/lib/sastrawi/dictionary/array_dictionary.rb +17 -1
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule29.rb +1 -1
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule9.rb +1 -1
- data/lib/sastrawi/stemmer/cache/array_cache.rb +1 -1
- data/lib/sastrawi/stemmer/cached_stemmer.rb +1 -1
- data/lib/sastrawi/stemmer/context/context.rb +6 -5
- data/lib/sastrawi/stemmer/context/removal.rb +1 -1
- data/lib/sastrawi/stemmer/context/visitor/prefix_disambiguator.rb +2 -2
- data/lib/sastrawi/stemmer/context/visitor/remove_derivational_suffix.rb +1 -1
- data/lib/sastrawi/stemmer/context/visitor/remove_inflectional_particle.rb +1 -1
- data/lib/sastrawi/stemmer/context/visitor/remove_inflectional_possessive_pronoun.rb +1 -1
- data/lib/sastrawi/stemmer/context/visitor/remove_plain_prefix.rb +1 -1
- data/lib/sastrawi/stemmer/context/visitor/visitor_provider.rb +1 -1
- data/lib/sastrawi/stemmer/stemmer.rb +13 -15
- data/lib/sastrawi/stemmer/stemmer_factory.rb +3 -2
- data/lib/sastrawi/stop_word_remover/stop_word_remover.rb +2 -2
- data/lib/sastrawi/stop_word_remover/stop_word_remover_factory.rb +1 -0
- data/lib/sastrawi/version.rb +1 -1
- data/sastrawi.gemspec +1 -0
- metadata +6 -6
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 2c5b1727acdc71972e767e95a444969a59982660
|
4
|
+
data.tar.gz: 0cba68fa48adbfc2004b47b0050bd6bbfad921dd
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 1aabe63a7cc2d94eb34e7445fd2912e3a415248d62e871578ee9e9d0c3790e822eb1d76b2bcd092fb4137a42f18734ca78f82d0c24455244103b436f2b46406a
|
7
|
+
data.tar.gz: 70114d83ab5d39081490308ed2437244445136282e439edfe19a76a556c3e4cfbbeda8c30e22546c780f4d8f9f1adbddcf91aedb9974f8bc98763b80ec42db0d
|
data/.travis.yml
CHANGED
data/README.md
CHANGED
@@ -6,7 +6,9 @@ written in PHP and this library is written in Ruby language.
|
|
6
6
|
|
7
7
|
Taken from [Wikipedia][stemmingwiki], stemming is the process of reducing
|
8
8
|
inflected (or sometimes derived) words to their word stem, base or root form.
|
9
|
-
For instance, "menahan" has "tahan" as its base form.
|
9
|
+
For instance, "menahan" has "tahan" as its base form. If you want to know how
|
10
|
+
stemming works, please read this [page][howstemmingworks] (in Bahasa Indonesia)
|
11
|
+
for further details.
|
10
12
|
|
11
13
|
## Documentation
|
12
14
|
|
@@ -33,27 +35,51 @@ on your system. I would recommend to choose the stable versions.
|
|
33
35
|
|
34
36
|
## Usage
|
35
37
|
|
36
|
-
|
37
|
-
can't add or remove any base form. This feature will be implemented for next
|
38
|
-
release.
|
38
|
+
This library supports stemming words with provided base forms.
|
39
39
|
|
40
40
|
```ruby
|
41
41
|
require 'sastrawi'
|
42
42
|
|
43
|
+
# create stemmer
|
44
|
+
stemmer_factory = Sastrawi::Stemmer::StemmerFactory.new
|
45
|
+
stemmer = stemmer_factory.create_stemmer
|
46
|
+
|
43
47
|
# prepare a sentence or words to be stemmed and call the stem API
|
44
48
|
sentence = 'Perekonomian Indonesia sedang dalam pertumbuhan yang membanggakan.'
|
45
|
-
stemming_result =
|
49
|
+
stemming_result = stemmer.stem(sentence)
|
46
50
|
|
47
|
-
# the stemming result should be "ekonomi indonesia sedang dalam tumbuh yang
|
48
|
-
bangga"
|
51
|
+
# the stemming result should be "ekonomi indonesia sedang dalam tumbuh yang bangga"
|
49
52
|
puts stemming_result
|
50
53
|
```
|
51
54
|
|
55
|
+
Besides that, you can add or remove any base form.
|
56
|
+
|
57
|
+
```ruby
|
58
|
+
require 'sastrawi'
|
59
|
+
|
60
|
+
# create stemmer
|
61
|
+
stemmer_factory = Sastrawi::Stemmer::StemmerFactory.new
|
62
|
+
|
63
|
+
# create default dictionary and add a text file that contains words into it
|
64
|
+
dictionary = stemmer_factory.create_default_dictionary
|
65
|
+
dictionary.add_words_from_text_file('my-dictionary.txt')
|
66
|
+
|
67
|
+
# add or remove words
|
68
|
+
dictionary.add('internet')
|
69
|
+
dictionary.remove('desa')
|
70
|
+
|
71
|
+
# stem a word, "internetan", for example
|
72
|
+
stemmer = Sastrawi::Stemmer::Stemmer.new(dictionary)
|
73
|
+
|
74
|
+
# the stemming result should be "internet"
|
75
|
+
puts stemmer.stem('internetan')
|
76
|
+
```
|
77
|
+
|
52
78
|
## Contributing
|
53
79
|
|
54
|
-
Contributions are welcome. If you find a bug, please report it to issue
|
55
|
-
tracker. Use `dev` branch as a target of your feature branch for pull
|
56
|
-
Both issue and pull request details
|
80
|
+
Contributions are welcome. If you find a bug, please report it to the [issue
|
81
|
+
tracker][issue]. Use the `dev` branch as the target of your feature branch for pull
|
82
|
+
request. Both issue and pull request details must be written in English.
|
57
83
|
|
58
84
|
## License
|
59
85
|
|
@@ -64,7 +90,9 @@ Attribution-NonCommercial-ShareAlike 3.0 Unported License][kateglolicense].
|
|
64
90
|
|
65
91
|
[sastrawi]: https://github.com/sastrawi/sastrawi
|
66
92
|
[stemmingwiki]: https://en.wikipedia.org/wiki/Stemming
|
93
|
+
[howstemmingworks]: https://github.com/sastrawi/sastrawi/wiki/Stemming-Bahasa-Indonesia
|
67
94
|
[documentation]: https://github.com/meisyal/sastrawi-ruby/wiki
|
95
|
+
[issue]: https://github.com/meisyal/sastrawi-ruby/issues
|
68
96
|
[license]: https://github.com/meisyal/sastrawi-ruby/blob/master/LICENSE.txt
|
69
97
|
[kateglo]: http://kateglo.com
|
70
98
|
[kateglolicense]: https://creativecommons.org/licenses/by-nc-sa/3.0/
|
File without changes
|
data/lib/sastrawi.rb
CHANGED
@@ -1,12 +1,3 @@
|
|
1
1
|
require 'sastrawi/version'
|
2
2
|
|
3
3
|
require 'sastrawi/stemmer/stemmer_factory'
|
4
|
-
|
5
|
-
module Sastrawi
|
6
|
-
def self.stem(sentence)
|
7
|
-
stemmer_factory = Sastrawi::Stemmer::StemmerFactory.new
|
8
|
-
stemmer = stemmer_factory.create_stemmer
|
9
|
-
|
10
|
-
stemmer.stem(sentence)
|
11
|
-
end
|
12
|
-
end
|
@@ -1,7 +1,7 @@
|
|
1
1
|
module Sastrawi
|
2
2
|
module Dictionary
|
3
3
|
class ArrayDictionary
|
4
|
-
|
4
|
+
attr_reader :words
|
5
5
|
|
6
6
|
def initialize(words = [])
|
7
7
|
@words = []
|
@@ -28,6 +28,22 @@ module Sastrawi
|
|
28
28
|
|
29
29
|
@words.push(word)
|
30
30
|
end
|
31
|
+
|
32
|
+
def add_words_from_text_file(file_path)
|
33
|
+
words = []
|
34
|
+
|
35
|
+
File.open(file_path, 'r') do |file|
|
36
|
+
file.each do |line|
|
37
|
+
words.push(line.chomp)
|
38
|
+
end
|
39
|
+
end
|
40
|
+
|
41
|
+
add_words(words)
|
42
|
+
end
|
43
|
+
|
44
|
+
def remove(word)
|
45
|
+
@words.delete(word)
|
46
|
+
end
|
31
47
|
end
|
32
48
|
end
|
33
49
|
end
|
@@ -3,7 +3,7 @@ module Sastrawi
|
|
3
3
|
module Disambiguator
|
4
4
|
class DisambiguatorPrefixRule29
|
5
5
|
def disambiguate(word)
|
6
|
-
contains = /^
|
6
|
+
contains = /^peng([bcdfghjklmnpqrstvwxyz])(.*)$/.match(word)
|
7
7
|
|
8
8
|
if contains
|
9
9
|
matches = contains.captures
|
@@ -3,7 +3,7 @@ module Sastrawi
|
|
3
3
|
module Disambiguator
|
4
4
|
class DisambiguatorPrefixRule9
|
5
5
|
def disambiguate(word)
|
6
|
-
contains = /^te([bcdfghjklmnpqrstvwxyz])er(
|
6
|
+
contains = /^te([bcdfghjklmnpqrstvwxyz])er([bcdfghjklmnpqrstvwxyz])(.*)$/.match(word)
|
7
7
|
|
8
8
|
if contains
|
9
9
|
matches = contains.captures
|
@@ -4,7 +4,8 @@ module Sastrawi
|
|
4
4
|
module Stemmer
|
5
5
|
module Context
|
6
6
|
class Context
|
7
|
-
|
7
|
+
attr_reader :original_word, :dictionary, :visitor_provider, :visitors, :suffix_visitors, :prefix_visitors
|
8
|
+
attr_accessor :current_word, :process_is_stopped, :removals, :result
|
8
9
|
|
9
10
|
def initialize(original_word, dictionary, visitor_provider)
|
10
11
|
@original_word = original_word
|
@@ -14,10 +15,10 @@ module Sastrawi
|
|
14
15
|
|
15
16
|
@process_is_stopped = false
|
16
17
|
@removals = []
|
17
|
-
@visitors =
|
18
|
-
@suffix_visitors =
|
19
|
-
@prefix_visitors =
|
20
|
-
@result =
|
18
|
+
@visitors = nil
|
19
|
+
@suffix_visitors = nil
|
20
|
+
@prefix_visitors = nil
|
21
|
+
@result = nil
|
21
22
|
|
22
23
|
init_visitors
|
23
24
|
end
|
@@ -2,7 +2,7 @@ module Sastrawi
|
|
2
2
|
module Stemmer
|
3
3
|
module Context
|
4
4
|
class Removal
|
5
|
-
|
5
|
+
attr_reader :visitor, :subject, :result, :removed_part, :affix_type
|
6
6
|
|
7
7
|
def initialize(visitor, subject, result, removed_part, affix_type)
|
8
8
|
@visitor = visitor
|
@@ -3,7 +3,7 @@ module Sastrawi
|
|
3
3
|
module Context
|
4
4
|
module Visitor
|
5
5
|
class PrefixDisambiguator
|
6
|
-
|
6
|
+
attr_reader :disambiguators
|
7
7
|
|
8
8
|
def initialize(disambiguators = [])
|
9
9
|
@disambiguators = []
|
@@ -22,7 +22,7 @@ module Sastrawi
|
|
22
22
|
|
23
23
|
return if result.nil?
|
24
24
|
|
25
|
-
removed_part = context.current_word.sub(result
|
25
|
+
removed_part = context.current_word.sub(/#{Regexp.quote(result)}/, '')
|
26
26
|
|
27
27
|
removal = Removal.new(self, context.current_word, result, removed_part, 'DP')
|
28
28
|
|
@@ -9,7 +9,7 @@ module Sastrawi
|
|
9
9
|
result = remove_suffix(context.current_word)
|
10
10
|
|
11
11
|
if result != context.current_word
|
12
|
-
removed_part = context.current_word.sub(result
|
12
|
+
removed_part = context.current_word.sub(/#{Regexp.quote(result)}/, '')
|
13
13
|
|
14
14
|
removal = Sastrawi::Stemmer::Context::Removal.new(self, context.current_word, result, removed_part, 'DS')
|
15
15
|
|
@@ -7,7 +7,7 @@ module Sastrawi
|
|
7
7
|
result = remove(context.current_word)
|
8
8
|
|
9
9
|
if result != context.current_word
|
10
|
-
removed_part = context.current_word.sub(result
|
10
|
+
removed_part = context.current_word.sub(/#{Regexp.quote(result)}/, '')
|
11
11
|
|
12
12
|
removal = Removal.new(self, context.current_word, result, removed_part, 'P')
|
13
13
|
|
@@ -7,7 +7,7 @@ module Sastrawi
|
|
7
7
|
result = remove(context.current_word)
|
8
8
|
|
9
9
|
if result != context.current_word
|
10
|
-
removed_part = context.current_word.sub(result
|
10
|
+
removed_part = context.current_word.sub(/#{Regexp.quote(result)}/, '')
|
11
11
|
|
12
12
|
removal = Removal.new(self, context.current_word, result, removed_part, 'PP')
|
13
13
|
|
@@ -7,7 +7,7 @@ module Sastrawi
|
|
7
7
|
result = remove(context.current_word)
|
8
8
|
|
9
9
|
if result != context.current_word
|
10
|
-
removed_part = context.current_word.sub(result
|
10
|
+
removed_part = context.current_word.sub(/#{Regexp.quote(result)}/, '')
|
11
11
|
|
12
12
|
removal = Removal.new(self, context.current_word, result, removed_part, 'DP')
|
13
13
|
|
@@ -1,11 +1,13 @@
|
|
1
1
|
require 'sastrawi/stemmer/context/context'
|
2
|
+
|
2
3
|
require 'sastrawi/stemmer/context/visitor/visitor_provider'
|
4
|
+
|
3
5
|
require 'sastrawi/stemmer/filter/text_normalizer'
|
4
6
|
|
5
7
|
module Sastrawi
|
6
8
|
module Stemmer
|
7
9
|
class Stemmer
|
8
|
-
|
10
|
+
attr_reader :dictionary, :visitor_provider
|
9
11
|
|
10
12
|
def initialize(dictionary)
|
11
13
|
@dictionary = dictionary
|
@@ -36,35 +38,31 @@ module Sastrawi
|
|
36
38
|
def plural?(word)
|
37
39
|
matches = /^(.*)-(ku|mu|nya|lah|kah|tah|pun)$/.match(word)
|
38
40
|
|
39
|
-
if matches
|
40
|
-
|
41
|
-
|
42
|
-
false
|
43
|
-
end
|
41
|
+
return matches[1].include?('-') if matches
|
42
|
+
|
43
|
+
return word.include?('-')
|
44
44
|
end
|
45
45
|
|
46
46
|
def stem_plural_word(word)
|
47
47
|
first_match = /^(.*)-(.*)$/.match(word)
|
48
48
|
|
49
|
-
unless first_match
|
50
|
-
return word
|
51
|
-
end
|
52
|
-
|
53
|
-
words = [first_match.captures[0], first_match.captures[1]]
|
49
|
+
return word unless first_match
|
54
50
|
|
51
|
+
words = [first_match[1], first_match[2]]
|
55
52
|
suffix = words[1]
|
56
|
-
suffixes = [
|
53
|
+
suffixes = %w[ku mu nya lah kah tah pun]
|
57
54
|
second_match = /^(.*)-(.*)$/.match(words[0])
|
58
55
|
|
59
56
|
if suffixes.include?(suffix) && second_match
|
60
|
-
words[
|
57
|
+
words[0] = second_match[1]
|
58
|
+
words[1] = second_match[2] << '-' << suffix
|
61
59
|
end
|
62
60
|
|
63
61
|
root_first_word = stem_singular_word(words[0])
|
64
62
|
root_second_word = stem_singular_word(words[1])
|
65
63
|
|
66
|
-
|
67
|
-
root_second_word = stem_singular_word('me'
|
64
|
+
if !@dictionary.contains?(words[1]) && root_second_word == words[1]
|
65
|
+
root_second_word = stem_singular_word('me' << words[1])
|
68
66
|
end
|
69
67
|
|
70
68
|
if root_first_word == root_second_word
|
@@ -1,8 +1,9 @@
|
|
1
1
|
require 'sastrawi/dictionary/array_dictionary'
|
2
|
+
|
2
3
|
require 'sastrawi/stemmer/cached_stemmer'
|
3
4
|
require 'sastrawi/stemmer/stemmer'
|
4
|
-
require 'sastrawi/stemmer/cache/array_cache'
|
5
5
|
|
6
|
+
require 'sastrawi/stemmer/cache/array_cache'
|
6
7
|
|
7
8
|
module Sastrawi
|
8
9
|
module Stemmer
|
@@ -29,7 +30,7 @@ module Sastrawi
|
|
29
30
|
|
30
31
|
def get_words_from_file
|
31
32
|
root_directory = File.expand_path('../../../..', __FILE__)
|
32
|
-
dictionary_file_path = File.join(root_directory, 'data/
|
33
|
+
dictionary_file_path = File.join(root_directory, 'data/base-word.txt')
|
33
34
|
|
34
35
|
dictionary_content = []
|
35
36
|
File.open(dictionary_file_path, 'r') do |file|
|
@@ -1,7 +1,7 @@
|
|
1
1
|
module Sastrawi
|
2
2
|
module StopWordRemover
|
3
3
|
class StopWordRemover
|
4
|
-
|
4
|
+
attr_reader :dictionary
|
5
5
|
|
6
6
|
def initialize(dictionary)
|
7
7
|
@dictionary = dictionary
|
@@ -12,7 +12,7 @@ module Sastrawi
|
|
12
12
|
stop_words = []
|
13
13
|
|
14
14
|
words.each do |word|
|
15
|
-
unless @dictionary.
|
15
|
+
unless @dictionary.contains?(word)
|
16
16
|
stop_words.push(word)
|
17
17
|
end
|
18
18
|
end
|
data/lib/sastrawi/version.rb
CHANGED
data/sastrawi.gemspec
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: sastrawi
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.0
|
4
|
+
version: 0.1.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Andrias Meisyal
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2017-
|
11
|
+
date: 2017-05-21 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -65,7 +65,7 @@ files:
|
|
65
65
|
- LICENSE.txt
|
66
66
|
- README.md
|
67
67
|
- Rakefile
|
68
|
-
- data/
|
68
|
+
- data/base-word.txt
|
69
69
|
- lib/sastrawi.rb
|
70
70
|
- lib/sastrawi/dictionary/array_dictionary.rb
|
71
71
|
- lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule10.rb
|
@@ -158,12 +158,12 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
158
158
|
requirements:
|
159
159
|
- - ">="
|
160
160
|
- !ruby/object:Gem::Version
|
161
|
-
version:
|
161
|
+
version: 1.9.3
|
162
162
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
163
163
|
requirements:
|
164
|
-
- - "
|
164
|
+
- - ">="
|
165
165
|
- !ruby/object:Gem::Version
|
166
|
-
version:
|
166
|
+
version: '0'
|
167
167
|
requirements: []
|
168
168
|
rubyforge_project:
|
169
169
|
rubygems_version: 2.5.1
|