sastrawi 0.1.0.pre → 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.travis.yml +6 -4
- data/README.md +38 -10
- data/data/{kata-dasar.txt → base-word.txt} +0 -0
- data/lib/sastrawi.rb +0 -9
- data/lib/sastrawi/dictionary/array_dictionary.rb +17 -1
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule29.rb +1 -1
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule9.rb +1 -1
- data/lib/sastrawi/stemmer/cache/array_cache.rb +1 -1
- data/lib/sastrawi/stemmer/cached_stemmer.rb +1 -1
- data/lib/sastrawi/stemmer/context/context.rb +6 -5
- data/lib/sastrawi/stemmer/context/removal.rb +1 -1
- data/lib/sastrawi/stemmer/context/visitor/prefix_disambiguator.rb +2 -2
- data/lib/sastrawi/stemmer/context/visitor/remove_derivational_suffix.rb +1 -1
- data/lib/sastrawi/stemmer/context/visitor/remove_inflectional_particle.rb +1 -1
- data/lib/sastrawi/stemmer/context/visitor/remove_inflectional_possessive_pronoun.rb +1 -1
- data/lib/sastrawi/stemmer/context/visitor/remove_plain_prefix.rb +1 -1
- data/lib/sastrawi/stemmer/context/visitor/visitor_provider.rb +1 -1
- data/lib/sastrawi/stemmer/stemmer.rb +13 -15
- data/lib/sastrawi/stemmer/stemmer_factory.rb +3 -2
- data/lib/sastrawi/stop_word_remover/stop_word_remover.rb +2 -2
- data/lib/sastrawi/stop_word_remover/stop_word_remover_factory.rb +1 -0
- data/lib/sastrawi/version.rb +1 -1
- data/sastrawi.gemspec +1 -0
- metadata +6 -6
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 2c5b1727acdc71972e767e95a444969a59982660
|
4
|
+
data.tar.gz: 0cba68fa48adbfc2004b47b0050bd6bbfad921dd
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 1aabe63a7cc2d94eb34e7445fd2912e3a415248d62e871578ee9e9d0c3790e822eb1d76b2bcd092fb4137a42f18734ca78f82d0c24455244103b436f2b46406a
|
7
|
+
data.tar.gz: 70114d83ab5d39081490308ed2437244445136282e439edfe19a76a556c3e4cfbbeda8c30e22546c780f4d8f9f1adbddcf91aedb9974f8bc98763b80ec42db0d
|
data/.travis.yml
CHANGED
data/README.md
CHANGED
@@ -6,7 +6,9 @@ written in PHP and this library is written in Ruby language.
|
|
6
6
|
|
7
7
|
Taken from [Wikipedia][stemmingwiki], stemming is the process of reducing
|
8
8
|
inflected (or sometimes derived) words to their word stem, base or root form.
|
9
|
-
For instance, "menahan" has "tahan" as its base form.
|
9
|
+
For instance, "menahan" has "tahan" as its base form. If you want to know how
|
10
|
+
stemming works, please read this [page][howstemmingworks] (in Bahasa Indonesia)
|
11
|
+
for further details.
|
10
12
|
|
11
13
|
## Documentation
|
12
14
|
|
@@ -33,27 +35,51 @@ on your system. I would recommend to choose the stable versions.
|
|
33
35
|
|
34
36
|
## Usage
|
35
37
|
|
36
|
-
|
37
|
-
can't add or remove any base form. This feature will be implemented for next
|
38
|
-
release.
|
38
|
+
This library supports stemming words with provided base forms.
|
39
39
|
|
40
40
|
```ruby
|
41
41
|
require 'sastrawi'
|
42
42
|
|
43
|
+
# create stemmer
|
44
|
+
stemmer_factory = Sastrawi::Stemmer::StemmerFactory.new
|
45
|
+
stemmer = stemmer_factory.create_stemmer
|
46
|
+
|
43
47
|
# prepare a sentence or words to be stemmed and call the stem API
|
44
48
|
sentence = 'Perekonomian Indonesia sedang dalam pertumbuhan yang membanggakan.'
|
45
|
-
stemming_result =
|
49
|
+
stemming_result = stemmer.stem(sentence)
|
46
50
|
|
47
|
-
# the stemming result should be "ekonomi indonesia sedang dalam tumbuh yang
|
48
|
-
bangga"
|
51
|
+
# the stemming result should be "ekonomi indonesia sedang dalam tumbuh yang bangga"
|
49
52
|
puts stemming_result
|
50
53
|
```
|
51
54
|
|
55
|
+
Beside that, you can add or remove any base form.
|
56
|
+
|
57
|
+
```ruby
|
58
|
+
require 'sastrawi'
|
59
|
+
|
60
|
+
# create stemmer
|
61
|
+
stemmer_factory = Sastrawi::Stemmer::StemmerFactory.new
|
62
|
+
|
63
|
+
# create default dictionary and add a text file that contains words into it
|
64
|
+
dictionary = stemmer_factory.create_default_dictionary
|
65
|
+
dictionary.add_words_from_text_file('my-dictionary.txt')
|
66
|
+
|
67
|
+
# add or remove words
|
68
|
+
dictionary.add('internet')
|
69
|
+
dictionary.remove('desa')
|
70
|
+
|
71
|
+
# stem a word, "internetan", for example
|
72
|
+
stemmer = Sastrawi::Stemmer::Stemmer.new(dictionary)
|
73
|
+
|
74
|
+
# the stemming result should be "internet"
|
75
|
+
puts stemmer.stem('internetan')
|
76
|
+
```
|
77
|
+
|
52
78
|
## Contributing
|
53
79
|
|
54
|
-
Contributions are welcome. If you find a bug, please report it to issue
|
55
|
-
tracker. Use `dev` branch as a target of your feature branch for pull
|
56
|
-
Both issue and pull request details
|
80
|
+
Contributions are welcome. If you find a bug, please report it to [issue
|
81
|
+
tracker][issue]. Use `dev` branch as a target of your feature branch for pull
|
82
|
+
request. Both issue and pull request details must be written in English.
|
57
83
|
|
58
84
|
## License
|
59
85
|
|
@@ -64,7 +90,9 @@ Attribution-NonCommercial-ShareAlike 3.0 Unported License][kateglolicense].
|
|
64
90
|
|
65
91
|
[sastrawi]: https://github.com/sastrawi/sastrawi
|
66
92
|
[stemmingwiki]: https://en.wikipedia.org/wiki/Stemming
|
93
|
+
[howstemmingworks]: https://github.com/sastrawi/sastrawi/wiki/Stemming-Bahasa-Indonesia
|
67
94
|
[documentation]: https://github.com/meisyal/sastrawi-ruby/wiki
|
95
|
+
[issue]: https://github.com/meisyal/sastrawi-ruby/issues
|
68
96
|
[license]: https://github.com/meisyal/sastrawi-ruby/blob/master/LICENSE.txt
|
69
97
|
[kateglo]: http://kateglo.com
|
70
98
|
[kateglolicense]: https://creativecommons.org/licenses/by-nc-sa/3.0/
|
File without changes
|
data/lib/sastrawi.rb
CHANGED
@@ -1,12 +1,3 @@
|
|
1
1
|
require 'sastrawi/version'
|
2
2
|
|
3
3
|
require 'sastrawi/stemmer/stemmer_factory'
|
4
|
-
|
5
|
-
module Sastrawi
|
6
|
-
def self.stem(sentence)
|
7
|
-
stemmer_factory = Sastrawi::Stemmer::StemmerFactory.new
|
8
|
-
stemmer = stemmer_factory.create_stemmer
|
9
|
-
|
10
|
-
stemmer.stem(sentence)
|
11
|
-
end
|
12
|
-
end
|
@@ -1,7 +1,7 @@
|
|
1
1
|
module Sastrawi
|
2
2
|
module Dictionary
|
3
3
|
class ArrayDictionary
|
4
|
-
|
4
|
+
attr_reader :words
|
5
5
|
|
6
6
|
def initialize(words = [])
|
7
7
|
@words = []
|
@@ -28,6 +28,22 @@ module Sastrawi
|
|
28
28
|
|
29
29
|
@words.push(word)
|
30
30
|
end
|
31
|
+
|
32
|
+
def add_words_from_text_file(file_path)
|
33
|
+
words = []
|
34
|
+
|
35
|
+
File.open(file_path, 'r') do |file|
|
36
|
+
file.each do |line|
|
37
|
+
words.push(line.chomp)
|
38
|
+
end
|
39
|
+
end
|
40
|
+
|
41
|
+
add_words(words)
|
42
|
+
end
|
43
|
+
|
44
|
+
def remove(word)
|
45
|
+
@words.delete(word)
|
46
|
+
end
|
31
47
|
end
|
32
48
|
end
|
33
49
|
end
|
@@ -3,7 +3,7 @@ module Sastrawi
|
|
3
3
|
module Disambiguator
|
4
4
|
class DisambiguatorPrefixRule29
|
5
5
|
def disambiguate(word)
|
6
|
-
contains = /^
|
6
|
+
contains = /^peng([bcdfghjklmnpqrstvwxyz])(.*)$/.match(word)
|
7
7
|
|
8
8
|
if contains
|
9
9
|
matches = contains.captures
|
@@ -3,7 +3,7 @@ module Sastrawi
|
|
3
3
|
module Disambiguator
|
4
4
|
class DisambiguatorPrefixRule9
|
5
5
|
def disambiguate(word)
|
6
|
-
contains = /^te([bcdfghjklmnpqrstvwxyz])er(
|
6
|
+
contains = /^te([bcdfghjklmnpqrstvwxyz])er([bcdfghjklmnpqrstvwxyz])(.*)$/.match(word)
|
7
7
|
|
8
8
|
if contains
|
9
9
|
matches = contains.captures
|
@@ -4,7 +4,8 @@ module Sastrawi
|
|
4
4
|
module Stemmer
|
5
5
|
module Context
|
6
6
|
class Context
|
7
|
-
|
7
|
+
attr_reader :original_word, :dictionary, :visitor_provider, :visitors, :suffix_visitors, :prefix_visitors
|
8
|
+
attr_accessor :current_word, :process_is_stopped, :removals, :result
|
8
9
|
|
9
10
|
def initialize(original_word, dictionary, visitor_provider)
|
10
11
|
@original_word = original_word
|
@@ -14,10 +15,10 @@ module Sastrawi
|
|
14
15
|
|
15
16
|
@process_is_stopped = false
|
16
17
|
@removals = []
|
17
|
-
@visitors =
|
18
|
-
@suffix_visitors =
|
19
|
-
@prefix_visitors =
|
20
|
-
@result =
|
18
|
+
@visitors = nil
|
19
|
+
@suffix_visitors = nil
|
20
|
+
@prefix_visitors = nil
|
21
|
+
@result = nil
|
21
22
|
|
22
23
|
init_visitors
|
23
24
|
end
|
@@ -2,7 +2,7 @@ module Sastrawi
|
|
2
2
|
module Stemmer
|
3
3
|
module Context
|
4
4
|
class Removal
|
5
|
-
|
5
|
+
attr_reader :visitor, :subject, :result, :removed_part, :affix_type
|
6
6
|
|
7
7
|
def initialize(visitor, subject, result, removed_part, affix_type)
|
8
8
|
@visitor = visitor
|
@@ -3,7 +3,7 @@ module Sastrawi
|
|
3
3
|
module Context
|
4
4
|
module Visitor
|
5
5
|
class PrefixDisambiguator
|
6
|
-
|
6
|
+
attr_reader :disambiguators
|
7
7
|
|
8
8
|
def initialize(disambiguators = [])
|
9
9
|
@disambiguators = []
|
@@ -22,7 +22,7 @@ module Sastrawi
|
|
22
22
|
|
23
23
|
return if result.nil?
|
24
24
|
|
25
|
-
removed_part = context.current_word.sub(result
|
25
|
+
removed_part = context.current_word.sub(/#{Regexp.quote(result)}/, '')
|
26
26
|
|
27
27
|
removal = Removal.new(self, context.current_word, result, removed_part, 'DP')
|
28
28
|
|
@@ -9,7 +9,7 @@ module Sastrawi
|
|
9
9
|
result = remove_suffix(context.current_word)
|
10
10
|
|
11
11
|
if result != context.current_word
|
12
|
-
removed_part = context.current_word.sub(result
|
12
|
+
removed_part = context.current_word.sub(/#{Regexp.quote(result)}/, '')
|
13
13
|
|
14
14
|
removal = Sastrawi::Stemmer::Context::Removal.new(self, context.current_word, result, removed_part, 'DS')
|
15
15
|
|
@@ -7,7 +7,7 @@ module Sastrawi
|
|
7
7
|
result = remove(context.current_word)
|
8
8
|
|
9
9
|
if result != context.current_word
|
10
|
-
removed_part = context.current_word.sub(result
|
10
|
+
removed_part = context.current_word.sub(/#{Regexp.quote(result)}/, '')
|
11
11
|
|
12
12
|
removal = Removal.new(self, context.current_word, result, removed_part, 'P')
|
13
13
|
|
@@ -7,7 +7,7 @@ module Sastrawi
|
|
7
7
|
result = remove(context.current_word)
|
8
8
|
|
9
9
|
if result != context.current_word
|
10
|
-
removed_part = context.current_word.sub(result
|
10
|
+
removed_part = context.current_word.sub(/#{Regexp.quote(result)}/, '')
|
11
11
|
|
12
12
|
removal = Removal.new(self, context.current_word, result, removed_part, 'PP')
|
13
13
|
|
@@ -7,7 +7,7 @@ module Sastrawi
|
|
7
7
|
result = remove(context.current_word)
|
8
8
|
|
9
9
|
if result != context.current_word
|
10
|
-
removed_part = context.current_word.sub(result
|
10
|
+
removed_part = context.current_word.sub(/#{Regexp.quote(result)}/, '')
|
11
11
|
|
12
12
|
removal = Removal.new(self, context.current_word, result, removed_part, 'DP')
|
13
13
|
|
@@ -1,11 +1,13 @@
|
|
1
1
|
require 'sastrawi/stemmer/context/context'
|
2
|
+
|
2
3
|
require 'sastrawi/stemmer/context/visitor/visitor_provider'
|
4
|
+
|
3
5
|
require 'sastrawi/stemmer/filter/text_normalizer'
|
4
6
|
|
5
7
|
module Sastrawi
|
6
8
|
module Stemmer
|
7
9
|
class Stemmer
|
8
|
-
|
10
|
+
attr_reader :dictionary, :visitor_provider
|
9
11
|
|
10
12
|
def initialize(dictionary)
|
11
13
|
@dictionary = dictionary
|
@@ -36,35 +38,31 @@ module Sastrawi
|
|
36
38
|
def plural?(word)
|
37
39
|
matches = /^(.*)-(ku|mu|nya|lah|kah|tah|pun)$/.match(word)
|
38
40
|
|
39
|
-
if matches
|
40
|
-
|
41
|
-
|
42
|
-
false
|
43
|
-
end
|
41
|
+
return matches[1].include?('-') if matches
|
42
|
+
|
43
|
+
return word.include?('-')
|
44
44
|
end
|
45
45
|
|
46
46
|
def stem_plural_word(word)
|
47
47
|
first_match = /^(.*)-(.*)$/.match(word)
|
48
48
|
|
49
|
-
unless first_match
|
50
|
-
return word
|
51
|
-
end
|
52
|
-
|
53
|
-
words = [first_match.captures[0], first_match.captures[1]]
|
49
|
+
return word unless first_match
|
54
50
|
|
51
|
+
words = [first_match[1], first_match[2]]
|
55
52
|
suffix = words[1]
|
56
|
-
suffixes = [
|
53
|
+
suffixes = %w[ku mu nya lah kah tah pun]
|
57
54
|
second_match = /^(.*)-(.*)$/.match(words[0])
|
58
55
|
|
59
56
|
if suffixes.include?(suffix) && second_match
|
60
|
-
words[
|
57
|
+
words[0] = second_match[1]
|
58
|
+
words[1] = second_match[2] << '-' << suffix
|
61
59
|
end
|
62
60
|
|
63
61
|
root_first_word = stem_singular_word(words[0])
|
64
62
|
root_second_word = stem_singular_word(words[1])
|
65
63
|
|
66
|
-
|
67
|
-
root_second_word = stem_singular_word('me'
|
64
|
+
if !@dictionary.contains?(words[1]) && root_second_word == words[1]
|
65
|
+
root_second_word = stem_singular_word('me' << words[1])
|
68
66
|
end
|
69
67
|
|
70
68
|
if root_first_word == root_second_word
|
@@ -1,8 +1,9 @@
|
|
1
1
|
require 'sastrawi/dictionary/array_dictionary'
|
2
|
+
|
2
3
|
require 'sastrawi/stemmer/cached_stemmer'
|
3
4
|
require 'sastrawi/stemmer/stemmer'
|
4
|
-
require 'sastrawi/stemmer/cache/array_cache'
|
5
5
|
|
6
|
+
require 'sastrawi/stemmer/cache/array_cache'
|
6
7
|
|
7
8
|
module Sastrawi
|
8
9
|
module Stemmer
|
@@ -29,7 +30,7 @@ module Sastrawi
|
|
29
30
|
|
30
31
|
def get_words_from_file
|
31
32
|
root_directory = File.expand_path('../../../..', __FILE__)
|
32
|
-
dictionary_file_path = File.join(root_directory, 'data/
|
33
|
+
dictionary_file_path = File.join(root_directory, 'data/base-word.txt')
|
33
34
|
|
34
35
|
dictionary_content = []
|
35
36
|
File.open(dictionary_file_path, 'r') do |file|
|
@@ -1,7 +1,7 @@
|
|
1
1
|
module Sastrawi
|
2
2
|
module StopWordRemover
|
3
3
|
class StopWordRemover
|
4
|
-
|
4
|
+
attr_reader :dictionary
|
5
5
|
|
6
6
|
def initialize(dictionary)
|
7
7
|
@dictionary = dictionary
|
@@ -12,7 +12,7 @@ module Sastrawi
|
|
12
12
|
stop_words = []
|
13
13
|
|
14
14
|
words.each do |word|
|
15
|
-
unless @dictionary.
|
15
|
+
unless @dictionary.contains?(word)
|
16
16
|
stop_words.push(word)
|
17
17
|
end
|
18
18
|
end
|
data/lib/sastrawi/version.rb
CHANGED
data/sastrawi.gemspec
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: sastrawi
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.0
|
4
|
+
version: 0.1.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Andrias Meisyal
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2017-
|
11
|
+
date: 2017-05-21 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -65,7 +65,7 @@ files:
|
|
65
65
|
- LICENSE.txt
|
66
66
|
- README.md
|
67
67
|
- Rakefile
|
68
|
-
- data/
|
68
|
+
- data/base-word.txt
|
69
69
|
- lib/sastrawi.rb
|
70
70
|
- lib/sastrawi/dictionary/array_dictionary.rb
|
71
71
|
- lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule10.rb
|
@@ -158,12 +158,12 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
158
158
|
requirements:
|
159
159
|
- - ">="
|
160
160
|
- !ruby/object:Gem::Version
|
161
|
-
version:
|
161
|
+
version: 1.9.3
|
162
162
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
163
163
|
requirements:
|
164
|
-
- - "
|
164
|
+
- - ">="
|
165
165
|
- !ruby/object:Gem::Version
|
166
|
-
version:
|
166
|
+
version: '0'
|
167
167
|
requirements: []
|
168
168
|
rubyforge_project:
|
169
169
|
rubygems_version: 2.5.1
|