sastrawi-ruby 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.github/workflows/ci.yml +23 -0
- data/.gitignore +51 -0
- data/.travis.yml +10 -0
- data/CONTRIBUTING.md +22 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +21 -0
- data/README.md +104 -0
- data/Rakefile +6 -0
- data/_config.yml +1 -0
- data/bin/sastrawi +24 -0
- data/data/base-word.txt +29933 -0
- data/lib/sastrawi/dictionary/array_dictionary.rb +67 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule10.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule11.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule12.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule13a.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule13b.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule14.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule15a.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule15b.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule16.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule17a.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule17b.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule17c.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule17d.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule18a.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule18b.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule19.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule1a.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule1b.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule2.rb +19 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule20.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule21a.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule21b.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule23.rb +19 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule24.rb +19 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule25.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule26a.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule26b.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule27.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule28a.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule28b.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule29.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule3.rb +19 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule30a.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule30b.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule30c.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule31a.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule31b.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule32.rb +19 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule34.rb +19 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule35.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule36.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule37a.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule37b.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule38a.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule38b.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule39a.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule39b.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule4.rb +11 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule40a.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule40b.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule41.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule42.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule5.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule6a.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule6b.rb +17 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule7.rb +19 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule8.rb +19 -0
- data/lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule9.rb +19 -0
- data/lib/sastrawi/morphology/invalid_affix_pair_specification.rb +28 -0
- data/lib/sastrawi/stemmer/cache/array_cache.rb +25 -0
- data/lib/sastrawi/stemmer/cached_stemmer.rb +33 -0
- data/lib/sastrawi/stemmer/confix_stripping/precedence_adjustment_specification.rb +25 -0
- data/lib/sastrawi/stemmer/context/context.rb +217 -0
- data/lib/sastrawi/stemmer/context/removal.rb +17 -0
- data/lib/sastrawi/stemmer/context/visitor/dont_stem_short_word.rb +17 -0
- data/lib/sastrawi/stemmer/context/visitor/prefix_disambiguator.rb +54 -0
- data/lib/sastrawi/stemmer/context/visitor/remove_derivational_suffix.rb +37 -0
- data/lib/sastrawi/stemmer/context/visitor/remove_inflectional_particle.rb +34 -0
- data/lib/sastrawi/stemmer/context/visitor/remove_inflectional_possessive_pronoun.rb +34 -0
- data/lib/sastrawi/stemmer/context/visitor/remove_plain_prefix.rb +34 -0
- data/lib/sastrawi/stemmer/context/visitor/visitor_provider.rb +157 -0
- data/lib/sastrawi/stemmer/filter/text_normalizer.rb +15 -0
- data/lib/sastrawi/stemmer/stemmer.rb +101 -0
- data/lib/sastrawi/stemmer/stemmer_factory.rb +49 -0
- data/lib/sastrawi/stop_word_remover/stop_word_remover.rb +27 -0
- data/lib/sastrawi/stop_word_remover/stop_word_remover_factory.rb +124 -0
- data/lib/sastrawi/version.rb +5 -0
- data/lib/sastrawi.rb +4 -0
- data/sastrawi.gemspec +34 -0
- metadata +179 -0
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
require 'sastrawi/stemmer/context/context'
|
|
2
|
+
|
|
3
|
+
require 'sastrawi/stemmer/context/visitor/visitor_provider'
|
|
4
|
+
|
|
5
|
+
require 'sastrawi/stemmer/filter/text_normalizer'
|
|
6
|
+
|
|
7
|
+
##
# Indonesian Stemmer
# Nazief & Adriani, CS Stemmer, ECS Stemmer, Improved ECS

module Sastrawi
  module Stemmer
    class Stemmer
      attr_reader :dictionary, :visitor_provider

      ##
      # Build a stemmer around +dictionary+. The dictionary is queried through
      # +contains?+ and is handed to every stemming context together with a
      # freshly created visitor provider.

      def initialize(dictionary)
        @dictionary = dictionary
        @visitor_provider = Sastrawi::Stemmer::Context::Visitor::VisitorProvider.new
      end

      ##
      # Stem every word of +text+: the text is normalized, split on
      # whitespace, each token is stemmed on its own, and the stems are
      # joined back together with single spaces.

      def stem(text)
        Sastrawi::Stemmer::Filter::TextNormalizer
          .normalize_text(text)
          .split(' ')
          .map { |token| stem_word(token) }
          .join(' ')
      end

      ##
      # Stem one word, choosing the plural or the singular strategy.

      def stem_word(word)
        plural?(word) ? stem_plural_word(word) : stem_singular_word(word)
      end

      ##
      # A word counts as plural when it contains a hyphen. A trailing
      # "-ku", "-mu", "-nya", "-lah", "-kah", "-tah" or "-pun" is ignored
      # first, so a hyphen that only attaches such a suffix does not count.

      def plural?(word)
        trailing = /^(.*)-(ku|mu|nya|lah|kah|tah|pun)$/.match(word)
        candidate = trailing ? trailing[1] : word

        candidate.include?('-')
      end

      ##
      # Stem a plural (hyphenated) word to its base form
      # Asian J. (2007) "Effective Techniques for Indonesian Text Retrieval"
      # page 76-77
      #
      # Returns the shared root when both halves stem to the same root,
      # otherwise returns +word+ unchanged.

      def stem_plural_word(word)
        outer = /^(.*)-(.*)$/.match(word)
        return word if outer.nil?

        head = outer[1]
        tail = outer[2]

        # When the last segment is only an attached suffix, split the head
        # once more and keep the suffix glued to the second half.
        inner = /^(.*)-(.*)$/.match(head)
        if %w[ku mu nya lah kah tah pun].include?(tail) && inner
          head = inner[1]
          tail = "#{inner[2]}-#{tail}"
        end

        head_root = stem_singular_word(head)
        tail_root = stem_singular_word(tail)

        # The second half is unknown to the dictionary and could not be
        # reduced: retry it with a "me" prefix prepended.
        if !@dictionary.contains?(tail) && tail_root == tail
          tail_root = stem_singular_word("me#{tail}")
        end

        head_root == tail_root ? head_root : word
      end

      ##
      # Stem a singular word by running a stemming context over it and
      # returning that context's result.

      def stem_singular_word(word)
        Sastrawi::Stemmer::Context::Context
          .new(word, @dictionary, @visitor_provider)
          .tap(&:execute)
          .result
      end
    end
  end
end
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
require 'sastrawi/dictionary/array_dictionary'
|
|
2
|
+
|
|
3
|
+
require 'sastrawi/stemmer/cached_stemmer'
|
|
4
|
+
require 'sastrawi/stemmer/stemmer'
|
|
5
|
+
|
|
6
|
+
require 'sastrawi/stemmer/cache/array_cache'
|
|
7
|
+
|
|
8
|
+
##
# Stemmer factory helps creating a pre-configured stemmer

module Sastrawi
  module Stemmer
    class StemmerFactory
      ##
      # Build a stemmer backed by the default dictionary and wrap it in a
      # CachedStemmer that uses a new ArrayCache.

      def create_stemmer(is_dev = false)
        base_stemmer = Sastrawi::Stemmer::Stemmer.new(create_default_dictionary(is_dev))

        Sastrawi::Stemmer::CachedStemmer.new(Sastrawi::Stemmer::Cache::ArrayCache.new, base_stemmer)
      end

      ##
      # Wrap the default word list in an ArrayDictionary.

      def create_default_dictionary(is_dev = false)
        Sastrawi::Dictionary::ArrayDictionary.new(get_words(is_dev))
      end

      ##
      # Return the base words. +is_dev+ is accepted but not used: the words
      # always come from the bundled file.

      def get_words(is_dev = false)
        get_words_from_file
      end

      ##
      # Read data/base-word.txt, located four directory levels above this
      # file's path, and return its lines with the line endings removed.

      def get_words_from_file
        gem_root = File.expand_path('../../../..', __FILE__)

        File.foreach(File.join(gem_root, 'data/base-word.txt')).map(&:chomp)
      end
    end
  end
end
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
module Sastrawi
  module StopWordRemover
    ##
    # Drops from a text every word that the given dictionary contains.

    class StopWordRemover
      attr_reader :dictionary

      ##
      # +dictionary+ must respond to +contains?(word)+; words for which it
      # answers truthy are treated as stop words.

      def initialize(dictionary)
        @dictionary = dictionary
      end

      ##
      # Remove stop words
      #
      # Splits +text+ on whitespace, discards the words found in the
      # dictionary, and joins the remaining words with single spaces.

      def remove(text)
        text
          .split(' ')
          .reject { |token| @dictionary.contains?(token) }
          .join(' ')
      end
    end
  end
end
|
|
@@ -0,0 +1,124 @@
|
|
|
1
|
+
require 'sastrawi/dictionary/array_dictionary'
|
|
2
|
+
|
|
3
|
+
require 'sastrawi/stop_word_remover/stop_word_remover'
|
|
4
|
+
|
|
5
|
+
module Sastrawi
  module StopWordRemover
    ##
    # Builds a StopWordRemover pre-loaded with the built-in stop word list.

    class StopWordRemoverFactory
      ##
      # Wrap the built-in list in an ArrayDictionary and hand it to a new
      # StopWordRemover.

      def create_stop_word_remover
        Sastrawi::StopWordRemover::StopWordRemover.new(
          Sastrawi::Dictionary::ArrayDictionary.new(get_stop_word)
        )
      end

      ##
      # Return the built-in stop words as a new array of strings.
      # The entries are data and are reproduced exactly as shipped.

      def get_stop_word
        %w[
          a ada adalah adanya adapun agak agaknya agar akan
          akankah akhir akhiri akhirnya aku akulah amat amatlah anda andalah
          antar antara antaranya apa apaan apabila apakah apalagi apatah arti
          artinya asal asalkan atas atau ataukah ataupun awal awalnya b bagai
          bagaikan bagaimana bagaimanakah bagaimanapun bagainamakah bagi bagian
          bahkan bahwa bahwasannya bahwasanya baik baiklah bakal bakalan balik
          banyak bapak baru bawah beberapa begini beginian beginikah beginilah
          begitu begitukah begitulah begitupun bekerja belakang belakangan
          belum belumlah benar benarkah benarlah berada berakhir berakhirlah
          berakhirnya berapa berapakah berapalah berapapun berarti berawal
          berbagai berdatangan beri berikan berikut berikutnya berjumlah
          berkali-kali berkata berkehendak berkeinginan berkenaan berlainan
          berlalu berlangsung berlebihan bermacam bermacam-macam bermaksud
          bermula bersama bersama-sama bersiap bersiap-siap bertanya
          bertanya-tanya berturut berturut-turut bertutur berujar berupa besar
          betul betulkah biasa biasanya bila bilakah bisa bisakah boleh bolehkah
          bolehlah buat bukan bukankah bukanlah bukannya bulan bung c cara
          caranya cukup cukupkah cukuplah cuma d dahulu dalam dan dapat dari
          daripada datang dekat demi demikian demikianlah dengan depan di dia
          diakhiri diakhirinya dialah diantara diantaranya diberi diberikan
          diberikannya dibuat dibuatnya didapat didatangkan digunakan
          diibaratkan diibaratkannya diingat diingatkan diinginkan dijawab
          dijelaskan dijelaskannya dikarenakan dikatakan dikatakannya dikerjakan
          diketahui diketahuinya dikira dilakukan dilalui dilihat dimaksud
          dimaksudkan dimaksudkannya dimaksudnya diminta dimintai dimisalkan
          dimulai dimulailah dimulainya dimungkinkan dini dipastikan diperbuat
          diperbuatnya dipergunakan diperkirakan diperlihatkan diperlukan
          diperlukannya dipersoalkan dipertanyakan dipunyai diri dirinya
          disampaikan disebut disebutkan disebutkannya disini disinilah
          ditambahkan ditandaskan ditanya ditanyai ditanyakan ditegaskan
          ditujukan ditunjuk ditunjuki ditunjukkan ditunjukkannya ditunjuknya
          dituturkan dituturkannya diucapkan diucapkannya diungkapkan dong dua
          dulu e empat enak enggak enggaknya entah entahlah f g guna gunakan h
          hadap hai hal halo hallo hampir hanya hanyalah hari harus haruslah
          harusnya helo hello hendak hendaklah hendaknya hingga i ia ialah
          ibarat ibaratkan ibaratnya ibu ikut ingat ingat-ingat ingin inginkah
          inginkan ini inikah inilah itu itukah itulah j jadi jadilah jadinya
          jangan jangankan janganlah jauh jawab jawaban jawabnya jelas jelaskan
          jelaslah jelasnya jika jikalau juga jumlah jumlahnya justru k kadar
          kala kalau kalaulah kalaupun kali kalian kami kamilah kamu kamulah kan
          kapan kapankah kapanpun karena karenanya kasus kata katakan katakanlah
          katanya ke keadaan kebetulan kecil kedua keduanya keinginan kelamaan
          kelihatan kelihatannya kelima keluar kembali kemudian kemungkinan
          kemungkinannya kena kenapa kepada kepadanya kerja kesampaian
          keseluruhan keseluruhannya keterlaluan ketika khusus khususnya kini
          kinilah kira kira-kira kiranya kita kitalah kok kurang l lagi lagian
          lah lain lainnya laku lalu lama lamanya langsung lanjut lanjutnya
          lebih lewat lihat lima luar m macam maka makanya makin maksud malah
          malahan mampu mampukah mana manakala manalagi masa masalah masalahnya
          masih masihkah masing masing-masing masuk mata mau maupun melainkan
          melakukan melalui melihat melihatnya memang memastikan memberi
          memberikan membuat memerlukan memihak meminta memintakan memisalkan
          memperbuat mempergunakan memperkirakan memperlihatkan mempersiapkan
          mempersoalkan mempertanyakan mempunyai memulai memungkinkan menaiki
          menambahkan menandaskan menanti menanti-nanti menantikan menanya
          menanyai menanyakan mendapat mendapatkan mendatang mendatangi
          mendatangkan menegaskan mengakhiri mengapa mengatakan mengatakannya
          mengenai mengerjakan mengetahui menggunakan menghendaki mengibaratkan
          mengibaratkannya mengingat mengingatkan menginginkan mengira
          mengucapkan mengucapkannya mengungkapkan menjadi menjawab menjelaskan
          menuju menunjuk menunjuki menunjukkan menunjuknya menurut menuturkan
          menyampaikan menyangkut menyatakan menyebutkan menyeluruh menyiapkan
          merasa mereka merekalah merupakan meski meskipun meyakini meyakinkan
          minta mirip misal misalkan misalnya mohon mula mulai mulailah mulanya
          mungkin mungkinkah n nah naik namun nanti nantinya nya nyaris nyata
          nyatanya o oleh olehnya orang p pada padahal padanya pak paling
          panjang pantas para pasti pastilah penting pentingnya per percuma
          perlu perlukah perlunya pernah persoalan pertama pertama-tama
          pertanyaan pertanyakan pihak pihaknya pukul pula pun punya q r rasa
          rasanya rupa rupanya s saat saatnya saja sajalah salam saling sama
          sama-sama sambil sampai sampai-sampai sampaikan sana sangat sangatlah
          sangkut satu saya sayalah se sebab sebabnya sebagai sebagaimana
          sebagainya sebagian sebaik sebaik-baiknya sebaiknya sebaliknya
          sebanyak sebegini sebegitu sebelum sebelumnya sebenarnya seberapa
          sebesar sebetulnya sebisanya sebuah sebut sebutlah sebutnya secara
          secukupnya sedang sedangkan sedemikian sedikit sedikitnya seenaknya
          segala segalanya segera seharusnya sehingga seingat sejak sejauh
          sejenak sejumlah sekadar sekadarnya sekali sekali-kali sekalian
          sekaligus sekalipun sekarang sekaranglah sekecil seketika sekiranya
          sekitar sekitarnya sekurang-kurangnya sekurangnya sela selain selaku
          selalu selama selama-lamanya selamanya selanjutnya seluruh seluruhnya
          semacam semakin semampu semampunya semasa semasih semata semata-mata
          semaunya sementara semisal semisalnya sempat semua semuanya semula
          sendiri sendirian sendirinya seolah seolah-olah seorang sepanjang
          sepantasnya sepantasnyalah seperlunya seperti sepertinya sepihak
          sering seringnya serta serupa sesaat sesama sesampai sesegera sesekali
          seseorang sesuatu sesuatunya sesudah sesudahnya setelah setempat
          setengah seterusnya setiap setiba setibanya setidak-tidaknya
          setidaknya setinggi seusai sewaktu siap siapa siapakah siapapun sini
          sinilah soal soalnya suatu sudah sudahkah sudahlah supaya t tadi
          tadinya tahu tak tambah tambahnya tampak tampaknya tandas tandasnya
          tanpa tanya tanyakan tanyanya tapi tegas tegasnya telah tempat tentang
          tentu tentulah tentunya tepat terakhir terasa terbanyak terdahulu
          terdapat terdiri terhadap terhadapnya teringat teringat-ingat terjadi
          terjadilah terjadinya terkira terlalu terlebih terlihat termasuk
          ternyata tersampaikan tersebut tersebutlah tertentu tertuju terus
          terutama tetap tetapi tiap tiba tiba-tiba tidak tidakkah tidaklah tiga
          toh tuju tunjuk turut tutur tuturnya u ucap ucapnya ujar ujarnya
          umumnya ungkap ungkapnya untuk usah usai v w waduh wah wahai waktunya
          walau walaupun wong x y ya yaitu yakin yakni yang z
        ]
      end
    end
  end
end
|
data/lib/sastrawi.rb
ADDED
data/sastrawi.gemspec
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative "lib/sastrawi/version"
|
|
4
|
+
|
|
5
|
+
Gem::Specification.new do |spec|
  # Repository of this fork; reused for the homepage and the metadata links.
  repo_url = "https://github.com/johannesdwicahyo/sastrawi-ruby"

  # Identity and runtime requirement.
  spec.name = "sastrawi-ruby"
  spec.version = Sastrawi::VERSION
  spec.required_ruby_version = ">= 3.0.0"
  spec.authors = ["Johannes Dwi Cahyo"]
  spec.email = ["csk.rage@gmail.com"]

  # Human-readable description.
  spec.summary = "Indonesian language stemmer for Ruby"
  spec.description = "A maintained fork of the sastrawi gem. Stems words in Bahasa Indonesia " \
                     "using the Nazief & Adriani algorithm with Enhanced Confix Stripping. " \
                     "Based on the original work by Andrias Meisyal (sastrawi gem) and the " \
                     "PHP Sastrawi project (github.com/sastrawi/sastrawi)."
  spec.homepage = repo_url
  spec.license = "MIT"

  spec.metadata = {
    "source_code_uri" => repo_url,
    "changelog_uri" => "#{repo_url}/blob/master/README.md",
    "upstream_uri" => "https://github.com/meisyal/sastrawi-ruby"
  }

  # Package every git-tracked file except those under test/, spec/ or
  # features/; anything under bin/ is exposed as an executable.
  tracked_files = `git ls-files -z`.split("\x0")
  spec.files = tracked_files.reject { |path| path.match(%r{^(test|spec|features)/}) }
  spec.bindir = "bin"
  spec.executables = spec.files.grep(%r{^bin/}).map { |path| File.basename(path) }
  spec.require_paths = ["lib"]

  # Development-only tooling, declared in the original order.
  {
    "bundler" => ">= 2.0",
    "rake" => "~> 13.0",
    "rspec" => "~> 3.10"
  }.each do |gem_name, constraint|
    spec.add_development_dependency gem_name, constraint
  end
end
|
metadata
ADDED
|
@@ -0,0 +1,179 @@
|
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
|
2
|
+
name: sastrawi-ruby
|
|
3
|
+
version: !ruby/object:Gem::Version
|
|
4
|
+
version: 0.2.0
|
|
5
|
+
platform: ruby
|
|
6
|
+
authors:
|
|
7
|
+
- Johannes Dwi Cahyo
|
|
8
|
+
bindir: bin
|
|
9
|
+
cert_chain: []
|
|
10
|
+
date: 1980-01-02 00:00:00.000000000 Z
|
|
11
|
+
dependencies:
|
|
12
|
+
- !ruby/object:Gem::Dependency
|
|
13
|
+
name: bundler
|
|
14
|
+
requirement: !ruby/object:Gem::Requirement
|
|
15
|
+
requirements:
|
|
16
|
+
- - ">="
|
|
17
|
+
- !ruby/object:Gem::Version
|
|
18
|
+
version: '2.0'
|
|
19
|
+
type: :development
|
|
20
|
+
prerelease: false
|
|
21
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
22
|
+
requirements:
|
|
23
|
+
- - ">="
|
|
24
|
+
- !ruby/object:Gem::Version
|
|
25
|
+
version: '2.0'
|
|
26
|
+
- !ruby/object:Gem::Dependency
|
|
27
|
+
name: rake
|
|
28
|
+
requirement: !ruby/object:Gem::Requirement
|
|
29
|
+
requirements:
|
|
30
|
+
- - "~>"
|
|
31
|
+
- !ruby/object:Gem::Version
|
|
32
|
+
version: '13.0'
|
|
33
|
+
type: :development
|
|
34
|
+
prerelease: false
|
|
35
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
36
|
+
requirements:
|
|
37
|
+
- - "~>"
|
|
38
|
+
- !ruby/object:Gem::Version
|
|
39
|
+
version: '13.0'
|
|
40
|
+
- !ruby/object:Gem::Dependency
|
|
41
|
+
name: rspec
|
|
42
|
+
requirement: !ruby/object:Gem::Requirement
|
|
43
|
+
requirements:
|
|
44
|
+
- - "~>"
|
|
45
|
+
- !ruby/object:Gem::Version
|
|
46
|
+
version: '3.10'
|
|
47
|
+
type: :development
|
|
48
|
+
prerelease: false
|
|
49
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
50
|
+
requirements:
|
|
51
|
+
- - "~>"
|
|
52
|
+
- !ruby/object:Gem::Version
|
|
53
|
+
version: '3.10'
|
|
54
|
+
description: A maintained fork of the sastrawi gem. Stems words in Bahasa Indonesia
|
|
55
|
+
using the Nazief & Adriani algorithm with Enhanced Confix Stripping. Based on the
|
|
56
|
+
original work by Andrias Meisyal (sastrawi gem) and the PHP Sastrawi project (github.com/sastrawi/sastrawi).
|
|
57
|
+
email:
|
|
58
|
+
- csk.rage@gmail.com
|
|
59
|
+
executables:
|
|
60
|
+
- sastrawi
|
|
61
|
+
extensions: []
|
|
62
|
+
extra_rdoc_files: []
|
|
63
|
+
files:
|
|
64
|
+
- ".github/workflows/ci.yml"
|
|
65
|
+
- ".gitignore"
|
|
66
|
+
- ".travis.yml"
|
|
67
|
+
- CONTRIBUTING.md
|
|
68
|
+
- Gemfile
|
|
69
|
+
- LICENSE.txt
|
|
70
|
+
- README.md
|
|
71
|
+
- Rakefile
|
|
72
|
+
- _config.yml
|
|
73
|
+
- bin/sastrawi
|
|
74
|
+
- data/base-word.txt
|
|
75
|
+
- lib/sastrawi.rb
|
|
76
|
+
- lib/sastrawi/dictionary/array_dictionary.rb
|
|
77
|
+
- lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule10.rb
|
|
78
|
+
- lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule11.rb
|
|
79
|
+
- lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule12.rb
|
|
80
|
+
- lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule13a.rb
|
|
81
|
+
- lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule13b.rb
|
|
82
|
+
- lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule14.rb
|
|
83
|
+
- lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule15a.rb
|
|
84
|
+
- lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule15b.rb
|
|
85
|
+
- lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule16.rb
|
|
86
|
+
- lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule17a.rb
|
|
87
|
+
- lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule17b.rb
|
|
88
|
+
- lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule17c.rb
|
|
89
|
+
- lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule17d.rb
|
|
90
|
+
- lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule18a.rb
|
|
91
|
+
- lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule18b.rb
|
|
92
|
+
- lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule19.rb
|
|
93
|
+
- lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule1a.rb
|
|
94
|
+
- lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule1b.rb
|
|
95
|
+
- lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule2.rb
|
|
96
|
+
- lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule20.rb
|
|
97
|
+
- lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule21a.rb
|
|
98
|
+
- lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule21b.rb
|
|
99
|
+
- lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule23.rb
|
|
100
|
+
- lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule24.rb
|
|
101
|
+
- lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule25.rb
|
|
102
|
+
- lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule26a.rb
|
|
103
|
+
- lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule26b.rb
|
|
104
|
+
- lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule27.rb
|
|
105
|
+
- lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule28a.rb
|
|
106
|
+
- lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule28b.rb
|
|
107
|
+
- lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule29.rb
|
|
108
|
+
- lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule3.rb
|
|
109
|
+
- lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule30a.rb
|
|
110
|
+
- lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule30b.rb
|
|
111
|
+
- lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule30c.rb
|
|
112
|
+
- lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule31a.rb
|
|
113
|
+
- lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule31b.rb
|
|
114
|
+
- lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule32.rb
|
|
115
|
+
- lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule34.rb
|
|
116
|
+
- lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule35.rb
|
|
117
|
+
- lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule36.rb
|
|
118
|
+
- lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule37a.rb
|
|
119
|
+
- lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule37b.rb
|
|
120
|
+
- lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule38a.rb
|
|
121
|
+
- lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule38b.rb
|
|
122
|
+
- lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule39a.rb
|
|
123
|
+
- lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule39b.rb
|
|
124
|
+
- lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule4.rb
|
|
125
|
+
- lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule40a.rb
|
|
126
|
+
- lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule40b.rb
|
|
127
|
+
- lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule41.rb
|
|
128
|
+
- lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule42.rb
|
|
129
|
+
- lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule5.rb
|
|
130
|
+
- lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule6a.rb
|
|
131
|
+
- lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule6b.rb
|
|
132
|
+
- lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule7.rb
|
|
133
|
+
- lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule8.rb
|
|
134
|
+
- lib/sastrawi/morphology/disambiguator/disambiguator_prefix_rule9.rb
|
|
135
|
+
- lib/sastrawi/morphology/invalid_affix_pair_specification.rb
|
|
136
|
+
- lib/sastrawi/stemmer/cache/array_cache.rb
|
|
137
|
+
- lib/sastrawi/stemmer/cached_stemmer.rb
|
|
138
|
+
- lib/sastrawi/stemmer/confix_stripping/precedence_adjustment_specification.rb
|
|
139
|
+
- lib/sastrawi/stemmer/context/context.rb
|
|
140
|
+
- lib/sastrawi/stemmer/context/removal.rb
|
|
141
|
+
- lib/sastrawi/stemmer/context/visitor/dont_stem_short_word.rb
|
|
142
|
+
- lib/sastrawi/stemmer/context/visitor/prefix_disambiguator.rb
|
|
143
|
+
- lib/sastrawi/stemmer/context/visitor/remove_derivational_suffix.rb
|
|
144
|
+
- lib/sastrawi/stemmer/context/visitor/remove_inflectional_particle.rb
|
|
145
|
+
- lib/sastrawi/stemmer/context/visitor/remove_inflectional_possessive_pronoun.rb
|
|
146
|
+
- lib/sastrawi/stemmer/context/visitor/remove_plain_prefix.rb
|
|
147
|
+
- lib/sastrawi/stemmer/context/visitor/visitor_provider.rb
|
|
148
|
+
- lib/sastrawi/stemmer/filter/text_normalizer.rb
|
|
149
|
+
- lib/sastrawi/stemmer/stemmer.rb
|
|
150
|
+
- lib/sastrawi/stemmer/stemmer_factory.rb
|
|
151
|
+
- lib/sastrawi/stop_word_remover/stop_word_remover.rb
|
|
152
|
+
- lib/sastrawi/stop_word_remover/stop_word_remover_factory.rb
|
|
153
|
+
- lib/sastrawi/version.rb
|
|
154
|
+
- sastrawi.gemspec
|
|
155
|
+
homepage: https://github.com/johannesdwicahyo/sastrawi-ruby
|
|
156
|
+
licenses:
|
|
157
|
+
- MIT
|
|
158
|
+
metadata:
|
|
159
|
+
source_code_uri: https://github.com/johannesdwicahyo/sastrawi-ruby
|
|
160
|
+
changelog_uri: https://github.com/johannesdwicahyo/sastrawi-ruby/blob/master/README.md
|
|
161
|
+
upstream_uri: https://github.com/meisyal/sastrawi-ruby
|
|
162
|
+
rdoc_options: []
|
|
163
|
+
require_paths:
|
|
164
|
+
- lib
|
|
165
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
|
166
|
+
requirements:
|
|
167
|
+
- - ">="
|
|
168
|
+
- !ruby/object:Gem::Version
|
|
169
|
+
version: 3.0.0
|
|
170
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
171
|
+
requirements:
|
|
172
|
+
- - ">="
|
|
173
|
+
- !ruby/object:Gem::Version
|
|
174
|
+
version: '0'
|
|
175
|
+
requirements: []
|
|
176
|
+
rubygems_version: 3.6.9
|
|
177
|
+
specification_version: 4
|
|
178
|
+
summary: Indonesian language stemmer for Ruby
|
|
179
|
+
test_files: []
|