turkish_stemmer 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +18 -0
- data/.rspec +2 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +22 -0
- data/README.md +282 -0
- data/Rakefile +21 -0
- data/benchmarks/stemmers_comparison.rb +16 -0
- data/benchmarks/stemming_samples.txt +17916 -0
- data/benchmarks/turkish_word_recognition.rb +26 -0
- data/config/derivational_states.yml +10 -0
- data/config/derivational_suffixes.yml +6 -0
- data/config/nominal_verb_states.yml +121 -0
- data/config/nominal_verb_suffixes.yml +90 -0
- data/config/noun_states.yml +177 -0
- data/config/noun_suffixes.yml +113 -0
- data/config/stemmer.yml +206 -0
- data/lib/hash_extension.rb +5 -0
- data/lib/turkish_stemmer/version.rb +3 -0
- data/lib/turkish_stemmer.rb +455 -0
- data/spec/fixtures/simple_state.yml +14 -0
- data/spec/fixtures/simple_state_02.yml +21 -0
- data/spec/fixtures/simple_suffix.yml +7 -0
- data/spec/fixtures/simple_transition.yml +7 -0
- data/spec/spec_helper.rb +19 -0
- data/spec/support/fixtures.csv +101 -0
- data/spec/turkish_stemmer_spec.rb +522 -0
- data/turkish_stemmer.gemspec +35 -0
- metadata +164 -0
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,19 @@
|
|
1
|
+
# This file was generated by the `rspec --init` command. Conventionally, all
|
2
|
+
# specs live under a `spec` directory, which RSpec adds to the `$LOAD_PATH`.
|
3
|
+
# Require this file using `require "spec_helper"` to ensure that it is only
|
4
|
+
# loaded once.
|
5
|
+
#
|
6
|
+
# See http://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration
|
7
|
+
require 'turkish_stemmer'
|
8
|
+
|
9
|
+
RSpec.configure do |config|
|
10
|
+
config.treat_symbols_as_metadata_keys_with_true_values = true
|
11
|
+
config.run_all_when_everything_filtered = true
|
12
|
+
config.filter_run :focus
|
13
|
+
|
14
|
+
# Run specs in random order to surface order dependencies. If you find an
|
15
|
+
# order dependency and want to debug it, you can fix the order by providing
|
16
|
+
# the seed, which is printed after each run.
|
17
|
+
# --seed 1234
|
18
|
+
config.order = 'random'
|
19
|
+
end
|
@@ -0,0 +1,101 @@
|
|
1
|
+
evimden,ev,from my house,ev-(i)m-den,
|
2
|
+
göz,göz,eye,--,
|
3
|
+
güzelmişsin,güzel,you were beautiful,güzel-miş-sin,rumor
|
4
|
+
etkilerden,etki,from the effects,etki-ler-den,
|
5
|
+
çocukmuş,çocuk,it was child,çocuk-miş,rumor
|
6
|
+
kediymiş,kedi,it was cat,kedi-(y)miş,rumor
|
7
|
+
balığım,balık,my fish,balık-(i)m,
|
8
|
+
doktoruymuşsunuz,doktor,you were his/her/its doctor,doktor-i-(y)miş-siniz,rumor
|
9
|
+
kalelerimizdekilerden,kale,the ones that are from our castle,kale-ler-(i)miz-de-ki-ler-den,
|
10
|
+
çocuğuymuşumçasına,çocuk,as if i was his/her child ,çocuk-i-(y)miş-im-cesine,
|
11
|
+
kedileriyle,kedi,with his/her/its cats,kedi-ler-i-(y)le,kedileri+ile
|
12
|
+
çocuklarımmış,çocuk,they were my children,çocuk-ler-(i)m-miş,rumor
|
13
|
+
kitabımızdı,kitap,it was our book,kitap-(i)miz-di,
|
14
|
+
kelimelerin,kelime,"""your"" -or- ""of"" words",kelime-ler-(i)n -or- kelime-ler-in,both ways
|
15
|
+
kayısısı,kayısı,his/her/its apricot,kayısı-(s)ı,
|
16
|
+
eriğinin,erik,"of ""your"" -or- ""his/her/its"" plum",erik-(i)n-in -or- erik-i-(n)in,both ways
|
17
|
+
eriğindeki,erik,"the one that is at ""your"" -or- ""his/her/its"" plum ",erik-(i)n-de-ki -or- erik-i-(n)de-ki,both ways
|
18
|
+
eriğinden,erik,"from ""your"" -or- ""his/her/its"" plum",erik-(i)n-den -or- erik-i-(n)den,both ways
|
19
|
+
eriğine,erik,"to ""your"" -or- ""his/her/its"" plum",erik-(i)n-e -or- erik-i-(n)e,both ways
|
20
|
+
eriğinde,erik,"at ""your"" -or- ""his/her/its"" plum",erik-(i)n-de -or- erik-i-(n)de,both ways
|
21
|
+
kayısısına,kayısı,his/her/its apricot,kayısı-(s)ı-(n)a,
|
22
|
+
kayısısında,kayısı,at his/her/its apricot,kayısı-(s)ı-(n)da,
|
23
|
+
saatlerimiz,saat,our watches/hours,saat-ler-(i)miz,
|
24
|
+
kalemimin,kalem,of my pencil,kalem-(i)m-in,
|
25
|
+
ucu,uç,nib of...,uç-i,
|
26
|
+
kalelerdekilerden,kale,from the ones that are at (the) castle,kale-ler-de-ki-ler-den,
|
27
|
+
kalelerdekilerin,kale,of the ones that are at (the) castle,kale-ler-de-ki-ler-in,
|
28
|
+
kalelerimizdekilerde,kale,at the ones that are at (the) castle,kale-ler-(i)miz-de-ki-ler-de,
|
29
|
+
kaleninkinin,kale,of the one that belongs to (the) castle,kale-(n)in-ki-nin,
|
30
|
+
kalemizinkinin,kale,of the one that belongs to our castle,kale-miz-(i)n-ki-(n)in,
|
31
|
+
kalelerindeki,kale,"the one that is at ""their castle"" -or- ""his/her/its castles""",kale-leri-(n)de-ki -or- kale-ler-i-(n)de-ki,both ways
|
32
|
+
erikleri,erik,"""their plum"" -or- ""his/her/its plums""",erik-leri -or- erik-ler-i,both ways
|
33
|
+
erikler,erik,(the) plums,erik-ler,
|
34
|
+
eriğim,erik,my plum,erik-(i)m,
|
35
|
+
eriğimiz,erik,our plum,erik-(i)miz,
|
36
|
+
eriğin,erik,your plum,erik-(i)n,
|
37
|
+
eriğiniz,erik,your plum,erik-(i)niz,2nd person in plural
|
38
|
+
eriği,erik,his/her/its plum,erik-i,
|
39
|
+
eriğini,erik,"""your"" -or- ""his/her/its"" plum",erik-(i)n-i -or- erik-i-(n)i,both ways
|
40
|
+
eriğinin,erik,"of ""your"" -or- ""his/her/its"" plum",erik-(i)n-in -or- erik-i-(n)in,both ways
|
41
|
+
eriğe,erik,to (the) plum,erik-e,
|
42
|
+
eriğine,erik,"to ""your"" -or- ""his/her/its"" plum",erik-(i)n-e -or- erik-i-(n)e,both ways
|
43
|
+
eriklerine,erik,"to ""their plum"" -or- ""his/her/its plums""",erik-leri-(n)e -or- erik-ler-i-(n)e,both ways
|
44
|
+
erikte,erik,at (the) plum,erik-de,
|
45
|
+
eriğinde,erik,"at ""your"" -or- ""his/her/its"" plum",erik-(i)n-de -or- erik-i-(n)de,both ways
|
46
|
+
erikten,erik,from (the) plum,erik-den,
|
47
|
+
eriğinden,erik,"from ""your"" -or- ""his/her/its"" plum",erik-(i)n-den -or- erik-i-(n)den,both ways
|
48
|
+
eriğindeki,erik,"the one that is at ""your"" -or- ""his/her/its"" plum",erik-(i)n-de-ki -or- erik-i-(n)de-ki,both ways
|
49
|
+
eriğiyle,erik,with his/her/its plum,erik-i-(y)le,
|
50
|
+
eriğinin,erik,"of ""your"" -or- ""his/her/its"" plum",erik-(i)n-in -or- erik-i-(n)in,both ways
|
51
|
+
eriğindeki,erik,"the one that is at ""your"" -or- ""his/her/its"" plum",erik-(i)n-de-ki -or- erik-i-(n)de-ki,both ways
|
52
|
+
eriğince,erik,"after ""your"" -or- ""his/her/its"" plum",erik-(i)n-ce -or- erik-i-(n)ce,both ways
|
53
|
+
gülüm,gül,my rose,gül-(i)m,
|
54
|
+
erikteki,erik,the one that is at (the) plum,erik-de-ki,
|
55
|
+
eriktekilerden,erik,the ones that are from (the) plum,erik-de-ki-ler-den,
|
56
|
+
eriklerdeki,erik,the ones that are at (the) plum,erik-ler-de-ki,
|
57
|
+
kitabı,kitap,(the) book,kitap-i,
|
58
|
+
ağacı,ağaç,(the) tree,ağaç-i,
|
59
|
+
eriğim,erik,my plum / i am plum,erik-(i)m / erik-im,
|
60
|
+
kayısıyım,kayısı,i am apricot,kayısı-(y)ım,
|
61
|
+
eriksem,erik,if i am plum,erik-se-m,
|
62
|
+
eriksen,erik,if you are plum,erik-se-n,
|
63
|
+
erikse,erik,if he/she/it is plum,erik-se,
|
64
|
+
erikseniz,erik,if you are plum,erik-se-niz,2nd person in plural
|
65
|
+
erikseler,erik,if they are plum,erik-se-ler,
|
66
|
+
erikti,erik,he/she/it was plum,erik-di,
|
67
|
+
eriktiniz,erik,you were plum,erik-di-niz,2nd person in plural
|
68
|
+
eriktiler,erik,they were plum,erik-di-ler,
|
69
|
+
erikmiş,erik,it was plum,erik-miş,rumor
|
70
|
+
erikmişçesine,erik,as if it was plum,erik-miş-cesine,
|
71
|
+
erikmiştir,erik,it was plum,erik-miş-dir,rumor
|
72
|
+
erikmişim,erik,i was plum,erik-miş-(i)m,rumor
|
73
|
+
erikmişsin,erik,you were plum,erik-miş-sin,rumor
|
74
|
+
erikmişsindir,erik,you happened to be plum,erik-miş-sin-dir,rumor
|
75
|
+
erikmişimdir,erik,i happened to be plum,erik-miş-im-dir,rumor
|
76
|
+
erikmişiz,erik,we were plum,erik-miş-iz,rumor
|
77
|
+
erikmişizdir,erik,we happened to be plum,erik-miş-iz-dir,rumor
|
78
|
+
erikmişsiniz,erik,you were plum,erik-miş-siniz,2nd person in plural + rumor
|
79
|
+
erikmişsinizdir,erik,you happened to be plum,erik-miş-siniz-dir,2nd person in plural + rumor
|
80
|
+
erikmişler,erik,they were plum,erik-miş-ler,rumor
|
81
|
+
erikmişlerdir,erik,they happened to be plum,erik-miş-ler-dir,rumor
|
82
|
+
erikmişimcesine,erik,as if i was plum,erik-miş-im-cesine,
|
83
|
+
erikmişsincesine,erik,as if you were plum,erik-miş-sin-cesine,
|
84
|
+
erikmişizcesine,erik,as if we were plum,erik-miş-iz-cesine,
|
85
|
+
erikmişsinizcesine,erik,as if you were plum,erik-miş-siniz-cesine,2nd person in plural
|
86
|
+
erikmişlercesine,erik,as if they were plum,erik-miş-ler-cesine,
|
87
|
+
erikler,erik,plums,erik-ler,
|
88
|
+
eriğim,erik,my plum / i am plum,erik-(i)m / erik-im,
|
89
|
+
eriksin,erik,you are plum,erik-sin,
|
90
|
+
erik,erik,plum / he/she/it is plum,erik / erik,
|
91
|
+
eriğiz,erik,we are plum,erik-iz,
|
92
|
+
eriksiniz,erik,you are plum,erik-siniz,2nd person in plural
|
93
|
+
erikler,erik,they are plum,erik-ler,
|
94
|
+
eriktir,erik,it is plum,erik-dir,assumption
|
95
|
+
eriktirler,erik,they are plum,erik-dir-ler,assumption
|
96
|
+
erikken,erik,while he/she/it was plum,erik-(i)ken,
|
97
|
+
kötüymüş,kötü,he/she/it is bad,kötü-(y)miş,rumor
|
98
|
+
yüz,yüz,face / hundred,yüz,double meaning
|
99
|
+
muş,muş,--,--,this is a suffix
|
100
|
+
ad,ad,name,ad,
|
101
|
+
soyad,soyad,surname,soyad,soy+ad = lineage+name
|
@@ -0,0 +1,522 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
require "spec_helper"
|
3
|
+
require "pry"
|
4
|
+
require "csv"
|
5
|
+
|
6
|
+
describe TurkishStemmer do
|
7
|
+
|
8
|
+
describe ".count_syllables" do
|
9
|
+
it "counts syllables correctly" do
|
10
|
+
expect(described_class.count_syllables("erikler")).to eq 3
|
11
|
+
expect(described_class.count_syllables("çocuklarımmış")).to eq 5
|
12
|
+
end
|
13
|
+
end
|
14
|
+
|
15
|
+
describe ".vowels" do
|
16
|
+
it "returns all vowels of a word" do
|
17
|
+
expect(described_class.vowels("kötüymüş")).to eq(%w(ö ü ü))
|
18
|
+
end
|
19
|
+
end
|
20
|
+
|
21
|
+
describe ".has_roundness?" do
|
22
|
+
context "when vowel is empty" do
|
23
|
+
it "has roundness" do
|
24
|
+
expect(described_class).to have_roundness(nil, "a")
|
25
|
+
end
|
26
|
+
end
|
27
|
+
|
28
|
+
context "when candidate is empty" do
|
29
|
+
it "has roundness" do
|
30
|
+
expect(described_class).to have_roundness("a", nil)
|
31
|
+
end
|
32
|
+
end
|
33
|
+
|
34
|
+
context "when an unrounded vowel is passed" do
|
35
|
+
let(:vowel) { described_class::UNROUNDED_VOWELS.chars.to_a.sample }
|
36
|
+
|
37
|
+
context "and candidate is an unrounded vowel too" do
|
38
|
+
let(:candidate) { described_class::UNROUNDED_VOWELS.chars.to_a.sample }
|
39
|
+
|
40
|
+
it "has roundness" do
|
41
|
+
expect(described_class).to have_roundness(vowel, candidate)
|
42
|
+
end
|
43
|
+
end
|
44
|
+
|
45
|
+
context "and candidate is not an unrounded vowel" do
|
46
|
+
let(:candidate) { described_class::ROUNDED_VOWELS.chars.to_a.sample }
|
47
|
+
|
48
|
+
it "does not have roundness" do
|
49
|
+
expect(described_class).not_to have_roundness(vowel, candidate)
|
50
|
+
end
|
51
|
+
end
|
52
|
+
end
|
53
|
+
|
54
|
+
context "when a rounded vowel is passed" do
|
55
|
+
let(:vowel) { described_class::ROUNDED_VOWELS.chars.to_a.sample }
|
56
|
+
|
57
|
+
context "and one of 'a', 'e', 'u' or 'ü' is a candidate" do
|
58
|
+
let(:candidate) { described_class::FOLLOWING_ROUNDED_VOWELS.chars.to_a.sample }
|
59
|
+
|
60
|
+
it "has roundness" do
|
61
|
+
expect(described_class).to have_roundness(vowel, candidate)
|
62
|
+
end
|
63
|
+
end
|
64
|
+
|
65
|
+
context "and candidate is 'o'" do
|
66
|
+
let(:candidate) { 'o' }
|
67
|
+
|
68
|
+
it "does not have roundness" do
|
69
|
+
expect(described_class).not_to have_roundness(vowel, candidate)
|
70
|
+
end
|
71
|
+
end
|
72
|
+
end
|
73
|
+
end
|
74
|
+
|
75
|
+
describe ".has_frontness?" do
|
76
|
+
context "when vowel is empty" do
|
77
|
+
it "has frontness" do
|
78
|
+
expect(described_class).to have_frontness(nil, "a")
|
79
|
+
end
|
80
|
+
end
|
81
|
+
|
82
|
+
context "when candidate is empty" do
|
83
|
+
it "has frontness" do
|
84
|
+
expect(described_class).to have_frontness("a", nil)
|
85
|
+
end
|
86
|
+
end
|
87
|
+
|
88
|
+
context "when a front vowel is passed" do
|
89
|
+
let(:vowel) { described_class::FRONT_VOWELS.chars.to_a.sample }
|
90
|
+
|
91
|
+
context "and candidate is a front vowel" do
|
92
|
+
let(:candidate) { described_class::FRONT_VOWELS.chars.to_a.sample }
|
93
|
+
|
94
|
+
it "has frontness" do
|
95
|
+
expect(described_class).to have_frontness(vowel, candidate)
|
96
|
+
end
|
97
|
+
end
|
98
|
+
|
99
|
+
context "and candidate is a back vowel" do
|
100
|
+
let(:candidate) { described_class::BACK_VOWELS.chars.to_a.sample }
|
101
|
+
|
102
|
+
it "does not have frontness" do
|
103
|
+
expect(described_class).not_to have_frontness(vowel, candidate)
|
104
|
+
end
|
105
|
+
end
|
106
|
+
end
|
107
|
+
|
108
|
+
context "when a back vowel is passed" do
|
109
|
+
let(:vowel) { described_class::BACK_VOWELS.chars.to_a.sample }
|
110
|
+
|
111
|
+
context "and candidate is a front vowel" do
|
112
|
+
let(:candidate) { described_class::FRONT_VOWELS.chars.to_a.sample }
|
113
|
+
|
114
|
+
it "does not have frontness" do
|
115
|
+
expect(described_class).not_to have_frontness(vowel, candidate)
|
116
|
+
end
|
117
|
+
end
|
118
|
+
|
119
|
+
context "and candidate is a back vowel" do
|
120
|
+
let(:candidate) { described_class::BACK_VOWELS.chars.to_a.sample }
|
121
|
+
|
122
|
+
it "has frontness" do
|
123
|
+
expect(described_class).to have_frontness(vowel, candidate)
|
124
|
+
end
|
125
|
+
end
|
126
|
+
end
|
127
|
+
end
|
128
|
+
|
129
|
+
describe ".has_vowel_harmony?" do
|
130
|
+
it "has vowel harmony for valid Turkish words" do
|
131
|
+
expect(described_class).to have_vowel_harmony("Türkiyedir")
|
132
|
+
expect(described_class).to have_vowel_harmony("kapıdır")
|
133
|
+
expect(described_class).to have_vowel_harmony("gündür")
|
134
|
+
expect(described_class).to have_vowel_harmony("paltodur")
|
135
|
+
end
|
136
|
+
|
137
|
+
it "does not have vowel harmony for loanwords" do
|
138
|
+
expect(described_class).not_to have_vowel_harmony("kürdan")
|
139
|
+
end
|
140
|
+
|
141
|
+
it "does not have vowel harmony for exceptions" do
|
142
|
+
expect(described_class).not_to have_vowel_harmony("anne")
|
143
|
+
expect(described_class).not_to have_vowel_harmony("kardeş")
|
144
|
+
end
|
145
|
+
end
|
146
|
+
|
147
|
+
describe ".affix_morphological_stripper" do
|
148
|
+
context "when states are empty" do
|
149
|
+
it "returns the word" do
|
150
|
+
expect(
|
151
|
+
described_class.
|
152
|
+
affix_morphological_stripper("kapıdır", suffixes: :test)).
|
153
|
+
to eq(["kapıdır"])
|
154
|
+
end
|
155
|
+
end
|
156
|
+
|
157
|
+
context "when suffixes are empty" do
|
158
|
+
it "returns the word" do
|
159
|
+
expect(
|
160
|
+
described_class.
|
161
|
+
affix_morphological_stripper("kapıdır", states: :test)).
|
162
|
+
to eq(["kapıdır"])
|
163
|
+
end
|
164
|
+
end
|
165
|
+
|
166
|
+
context "when there exist states and suffixes" do
|
167
|
+
let(:states) {
|
168
|
+
described_class.
|
169
|
+
load_states_or_suffixes("spec/fixtures/simple_state.yml")
|
170
|
+
}
|
171
|
+
|
172
|
+
let(:suffixes) {
|
173
|
+
described_class.
|
174
|
+
load_states_or_suffixes("spec/fixtures/simple_suffix.yml")
|
175
|
+
}
|
176
|
+
|
177
|
+
it "generates pendings for the initial state" do
|
178
|
+
described_class.should_receive(:generate_pendings).with(:a,
|
179
|
+
"word", states, suffixes).and_call_original
|
180
|
+
|
181
|
+
described_class.affix_morphological_stripper("word",
|
182
|
+
states: states, suffixes: suffixes)
|
183
|
+
end
|
184
|
+
end
|
185
|
+
|
186
|
+
context "when a transition is valid" do
|
187
|
+
let(:states) {
|
188
|
+
described_class.
|
189
|
+
load_states_or_suffixes("spec/fixtures/simple_state.yml")
|
190
|
+
}
|
191
|
+
|
192
|
+
let(:suffixes) {
|
193
|
+
described_class.
|
194
|
+
load_states_or_suffixes("spec/fixtures/simple_suffix.yml")
|
195
|
+
}
|
196
|
+
|
197
|
+
context "and the transit state is a final state" do
|
198
|
+
it "removes similar pending transitions" do
|
199
|
+
described_class.should_receive(:mark_stem).with(
|
200
|
+
"guzelim", suffixes[:s1]).and_call_original
|
201
|
+
|
202
|
+
described_class.affix_morphological_stripper(
|
203
|
+
"guzelim", states: states, suffixes: suffixes)
|
204
|
+
end
|
205
|
+
|
206
|
+
context "with no other transitions" do
|
207
|
+
it "stems the word" do
|
208
|
+
expect(
|
209
|
+
described_class.
|
210
|
+
affix_morphological_stripper("guzelim",
|
211
|
+
states: states, suffixes: suffixes)).
|
212
|
+
to eq ["guzel"]
|
213
|
+
end
|
214
|
+
end
|
215
|
+
|
216
|
+
context "with other transitions" do
|
217
|
+
let(:states) {
|
218
|
+
described_class.load_states_or_suffixes("spec/fixtures/simple_state_02.yml")
|
219
|
+
}
|
220
|
+
|
221
|
+
it "adds more pendings to check" do
|
222
|
+
described_class.should_receive(:mark_stem).with("guzelim",
|
223
|
+
suffixes[:s1]).and_call_original
|
224
|
+
|
225
|
+
described_class.affix_morphological_stripper("guzelim",
|
226
|
+
states: states, suffixes: suffixes)
|
227
|
+
end
|
228
|
+
end
|
229
|
+
end
|
230
|
+
end
|
231
|
+
|
232
|
+
context "when one suffix matches correctly with a given word" do
|
233
|
+
it "does not compare other suffixes in the same transition" do
|
234
|
+
described_class.
|
235
|
+
should_receive(:mark_stem).
|
236
|
+
with(anything, anything).
|
237
|
+
# only for suffixes [sUnUz, nUz]
|
238
|
+
exactly(2).times.
|
239
|
+
and_call_original
|
240
|
+
|
241
|
+
puts described_class.
|
242
|
+
affix_morphological_stripper("taksicisiniz",
|
243
|
+
states: described_class::NOMINAL_VERB_STATES,
|
244
|
+
suffixes: described_class::NOMINAL_VERB_SUFFIXES)
|
245
|
+
end
|
246
|
+
end
|
247
|
+
end
|
248
|
+
|
249
|
+
describe ".stem" do
|
250
|
+
context "when input is single syllable" do
|
251
|
+
it "returns the input as is" do
|
252
|
+
expect(described_class.stem("ev")).to eq "ev"
|
253
|
+
end
|
254
|
+
end
|
255
|
+
|
256
|
+
context "when input has zero syllables - one consonant" do
|
257
|
+
it "returns the input as is" do
|
258
|
+
expect(described_class.stem("p")).to eq "p"
|
259
|
+
end
|
260
|
+
end
|
261
|
+
end
|
262
|
+
|
263
|
+
describe ".last_consonant!" do
|
264
|
+
context "when last consonant is among 'b', 'c', 'd' or 'ğ'" do
|
265
|
+
it "is replaced by 'p', 'ç', 't' or 'k'" do
|
266
|
+
expect(described_class.last_consonant!('kebab')).to eq('kebap')
|
267
|
+
expect(described_class.last_consonant!('kebac')).to eq('kebaç')
|
268
|
+
expect(described_class.last_consonant!('kebad')).to eq('kebat')
|
269
|
+
expect(described_class.last_consonant!('kebağ')).to eq('kebak')
|
270
|
+
end
|
271
|
+
end
|
272
|
+
|
273
|
+
context "when word belongs to protected words" do
|
274
|
+
it "does not replace last consonant" do
|
275
|
+
expect(described_class.last_consonant!('ad')).to eq('ad')
|
276
|
+
end
|
277
|
+
end
|
278
|
+
end
|
279
|
+
|
280
|
+
describe ".mark_stem" do
|
281
|
+
let(:suffix) do
|
282
|
+
{
|
283
|
+
name: "-dir",
|
284
|
+
regex: "dir",
|
285
|
+
optional_letter: false,
|
286
|
+
check_harmony: true
|
287
|
+
}
|
288
|
+
end
|
289
|
+
|
290
|
+
context "when suffix has harmony check on" do
|
291
|
+
before do
|
292
|
+
suffix[:regex] = "dan"
|
293
|
+
end
|
294
|
+
|
295
|
+
context "and word does not obey harmony rules" do
|
296
|
+
it "does not stem a word that does not obey harmony rules" do
|
297
|
+
expect(described_class.mark_stem("kürdan", suffix)).to eq(
|
298
|
+
{ stem: false, word: "kürdan", suffix_applied: nil })
|
299
|
+
end
|
300
|
+
|
301
|
+
context "and word belongs to exceptions" do
|
302
|
+
before do
|
303
|
+
suffix[:regex] = "ler"
|
304
|
+
end
|
305
|
+
it "stems the word" do
|
306
|
+
expect(described_class.mark_stem("saatler", suffix)).to eq(
|
307
|
+
{ stem: true, word: "saat", suffix_applied: "ler" })
|
308
|
+
end
|
309
|
+
end
|
310
|
+
end
|
311
|
+
|
312
|
+
end
|
313
|
+
|
314
|
+
context "when suffix has harmony check off" do
|
315
|
+
before do
|
316
|
+
suffix[:regex] = "dan"
|
317
|
+
suffix[:check_harmony] = false
|
318
|
+
end
|
319
|
+
|
320
|
+
it "stems a word that does not obey harmony rules" do
|
321
|
+
expect(
|
322
|
+
described_class.
|
323
|
+
mark_stem("kürdan", suffix)).
|
324
|
+
to eq({ stem: true, word: "kür", suffix_applied: "dan" })
|
325
|
+
end
|
326
|
+
end
|
327
|
+
|
328
|
+
context "when word matches suffix" do
|
329
|
+
it "partially stems a word" do
|
330
|
+
expect(
|
331
|
+
described_class.
|
332
|
+
mark_stem("Türkiyedir", suffix)).
|
333
|
+
to eq({ stem: true, word: "Türkiye", suffix_applied: "dir" })
|
334
|
+
end
|
335
|
+
|
336
|
+
|
337
|
+
context "when suffix has (y) as optional letter" do
|
338
|
+
before do
|
339
|
+
suffix[:optional_letter] = "y|y"
|
340
|
+
suffix[:regex] = "um"
|
341
|
+
end
|
342
|
+
|
343
|
+
context "and new word has valid last 'y' symbol" do
|
344
|
+
it "stems correctly and increases the suffix" do
|
345
|
+
expect(
|
346
|
+
described_class.
|
347
|
+
mark_stem("loyum", suffix)).
|
348
|
+
to eq({ stem: true, word: "lo", suffix_applied: "yum" })
|
349
|
+
end
|
350
|
+
end
|
351
|
+
|
352
|
+
context "and new word does not have valid last 'y' symbol" do
|
353
|
+
it "does not stem the word" do
|
354
|
+
expect(
|
355
|
+
described_class.
|
356
|
+
mark_stem("lotyum", suffix)).
|
357
|
+
to eq({ stem: false, word: "lotyum", suffix_applied: nil })
|
358
|
+
end
|
359
|
+
end
|
360
|
+
end
|
361
|
+
end
|
362
|
+
end
|
363
|
+
|
364
|
+
describe ".generate_pendings" do
|
365
|
+
let(:states) { described_class::NOMINAL_VERB_STATES }
|
366
|
+
let(:suffixes) { described_class::NOMINAL_VERB_SUFFIXES }
|
367
|
+
|
368
|
+
it "raises an error if state does not exist" do
|
369
|
+
expect {
|
370
|
+
described_class.
|
371
|
+
generate_pendings(1, "satıyorsunuz", states, suffixes)
|
372
|
+
}.to raise_error(ArgumentError, "State #{1} does not exist")
|
373
|
+
end
|
374
|
+
|
375
|
+
context "when state key does not have transitions" do
|
376
|
+
it "returns an empty array" do
|
377
|
+
expect(
|
378
|
+
described_class.
|
379
|
+
# :f state does not have transitions
|
380
|
+
generate_pendings(:f, "satıyorsunuz", states, suffixes)).
|
381
|
+
to eq []
|
382
|
+
end
|
383
|
+
end
|
384
|
+
|
385
|
+
context "when state key has transitions" do
|
386
|
+
it "returns an array of hashes for each transition" do
|
387
|
+
expect(
|
388
|
+
described_class.
|
389
|
+
generate_pendings(:a, "satıyorsunuz", states, suffixes).first.keys).
|
390
|
+
to eq [:suffix, :to_state, :from_state, :word, :mark]
|
391
|
+
end
|
392
|
+
|
393
|
+
it "sets :from_state key to current key state" do
|
394
|
+
expect(
|
395
|
+
described_class.
|
396
|
+
generate_pendings(:a, "satıyorsunuz", states, suffixes).first[:from_state]).
|
397
|
+
to eq :a
|
398
|
+
end
|
399
|
+
end
|
400
|
+
end
|
401
|
+
|
402
|
+
describe ".valid_optional_letter?" do
|
403
|
+
context "when last letter of the word is not equal to candidate" do
|
404
|
+
it "responds with [true,nil] - indicating that there was no match" do
|
405
|
+
expect(
|
406
|
+
described_class.valid_optional_letter?("test", "r")).
|
407
|
+
to eq([true, nil])
|
408
|
+
end
|
409
|
+
end
|
410
|
+
|
411
|
+
context "when there is a vowel match" do
|
412
|
+
context "and the previous char is a vowel" do
|
413
|
+
it "responds with false" do
|
414
|
+
expect(
|
415
|
+
described_class.
|
416
|
+
valid_optional_letter?("takcicii", "i")).
|
417
|
+
to eq([false, "i"])
|
418
|
+
end
|
419
|
+
end
|
420
|
+
|
421
|
+
context "and the previous char is a consonant" do
|
422
|
+
it "responds with true" do
|
423
|
+
expect(
|
424
|
+
described_class.
|
425
|
+
valid_optional_letter?("okula", "a")).
|
426
|
+
to eq([true, "a"])
|
427
|
+
end
|
428
|
+
end
|
429
|
+
end
|
430
|
+
|
431
|
+
context "when there is a consonant match" do
|
432
|
+
context "and the previous char is a vowel" do
|
433
|
+
it "responds with true" do
|
434
|
+
expect(
|
435
|
+
described_class.
|
436
|
+
valid_optional_letter?("litiy", "y")).
|
437
|
+
to eq([true, "y"])
|
438
|
+
end
|
439
|
+
end
|
440
|
+
|
441
|
+
context "and the previous char is a consonant" do
|
442
|
+
it "responds with false" do
|
443
|
+
expect(
|
444
|
+
described_class.
|
445
|
+
valid_optional_letter?("lity", "y")).
|
446
|
+
to eq([false, "y"])
|
447
|
+
end
|
448
|
+
end
|
449
|
+
end
|
450
|
+
end
|
451
|
+
|
452
|
+
describe ".stem_post_process" do
|
453
|
+
context "when input stream has words with last consonant replacements" do
|
454
|
+
it "replaces last consonant" do
|
455
|
+
expect(described_class.stem_post_process(["kebab"], "word")).to eq("kebap")
|
456
|
+
end
|
457
|
+
end
|
458
|
+
|
459
|
+
it "flattens and deduplicates results" do
|
460
|
+
expect(described_class.stem_post_process(["kitap",["kitap"]], "word")).to eq("kitap")
|
461
|
+
end
|
462
|
+
|
463
|
+
it "removes words with no syllables" do
|
464
|
+
expect(described_class.stem_post_process(["kitap", "k"], "word")).to eq("kitap")
|
465
|
+
end
|
466
|
+
|
467
|
+
context "when multiple stem candidates exist" do
|
468
|
+
it "returns the shortest" do
|
469
|
+
pending("fix this")
|
470
|
+
expect(described_class.stem_post_process(["kitap", "kita", "kit"], "word")).to eq "kit"
|
471
|
+
end
|
472
|
+
|
473
|
+
context "and word belongs to selection list" do
|
474
|
+
it "returns this word" do
|
475
|
+
expect(described_class.stem_post_process(
|
476
|
+
["su", "suy", "suyu"], "suyu")).to eq "su"
|
477
|
+
end
|
478
|
+
end
|
479
|
+
end
|
480
|
+
end
|
481
|
+
|
482
|
+
describe ".proceed_to_stem?" do
|
483
|
+
context "when word has 1 or less syllables" do
|
484
|
+
it "returns false" do
|
485
|
+
expect(described_class.proceed_to_stem?("kit")).not_to be
|
486
|
+
end
|
487
|
+
end
|
488
|
+
|
489
|
+
context "when word is nil" do
|
490
|
+
it "returns false" do
|
491
|
+
expect(described_class.proceed_to_stem?(nil)).not_to be
|
492
|
+
end
|
493
|
+
end
|
494
|
+
|
495
|
+
context "when word is empty" do
|
496
|
+
it "returns false" do
|
497
|
+
expect(described_class.proceed_to_stem?("")).not_to be
|
498
|
+
end
|
499
|
+
end
|
500
|
+
|
501
|
+
context "when word is among protected words" do
|
502
|
+
it "returns false" do
|
503
|
+
expect(described_class.proceed_to_stem?("soyad")).not_to be
|
504
|
+
end
|
505
|
+
end
|
506
|
+
|
507
|
+
context "when word contains non Turkish letters" do
|
508
|
+
it "returns false" do
|
509
|
+
expect(described_class.proceed_to_stem?("τελειο")).not_to be
|
510
|
+
expect(described_class.proceed_to_stem?("&aa")).not_to be
|
511
|
+
end
|
512
|
+
end
|
513
|
+
end
|
514
|
+
|
515
|
+
context "1:1 testing with paper" do
|
516
|
+
CSV.read("spec/support/fixtures.csv").each do |row|
|
517
|
+
it "stems #{row[0]} correctly" do
|
518
|
+
expect(described_class.stem(row[0].downcase)).to eq row[1].downcase
|
519
|
+
end
|
520
|
+
end
|
521
|
+
end
|
522
|
+
end
|
@@ -0,0 +1,35 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
lib = File.expand_path('../lib', __FILE__)
|
3
|
+
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
|
+
require 'turkish_stemmer/version'
|
5
|
+
|
6
|
+
Gem::Specification.new do |spec|
|
7
|
+
spec.name = "turkish_stemmer"
|
8
|
+
spec.version = TurkishStemmer::VERSION
|
9
|
+
spec.authors = ["Tasos Stathopoulos", "Giorgos Tsiftsis"]
|
10
|
+
spec.email = ["stathopa@skroutz.gr", "giorgos.tsiftsis@skroutz.gr"]
|
11
|
+
spec.summary = %q{A simple Turkish stemmer}
|
12
|
+
spec.description = %q{A simple Turkish stemmer}
|
13
|
+
spec.homepage = "https://gitlab.skroutz.gr/turkish_stemmer"
|
14
|
+
spec.license = "MIT"
|
15
|
+
|
16
|
+
spec.files = `git ls-files`.split($/)
|
17
|
+
spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
|
18
|
+
spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
|
19
|
+
spec.require_paths = ["lib"]
|
20
|
+
|
21
|
+
spec.add_dependency "hashie"
|
22
|
+
|
23
|
+
spec.add_development_dependency "bundler", "~> 1.5"
|
24
|
+
spec.add_development_dependency "rake"
|
25
|
+
spec.add_development_dependency "rspec"
|
26
|
+
spec.add_development_dependency "ruby-stemmer"
|
27
|
+
|
28
|
+
if RUBY_ENGINE == "ruby"
|
29
|
+
if RUBY_VERSION >= "2.0.0"
|
30
|
+
spec.add_development_dependency "pry-byebug"
|
31
|
+
else
|
32
|
+
spec.add_development_dependency "pry"
|
33
|
+
end
|
34
|
+
end
|
35
|
+
end
|