turkish_stemmer 0.1.2 → 0.1.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- metadata +6 -38
- data/.gitignore +0 -18
- data/.rspec +0 -2
- data/Gemfile +0 -4
- data/LICENSE.txt +0 -22
- data/README.md +0 -282
- data/Rakefile +0 -21
- data/benchmarks/stemmers_comparison.rb +0 -16
- data/benchmarks/stemming_samples.txt +0 -17916
- data/benchmarks/turkish_word_recognition.rb +0 -26
- data/config/derivational_states.yml +0 -10
- data/config/derivational_suffixes.yml +0 -6
- data/config/nominal_verb_states.yml +0 -121
- data/config/nominal_verb_suffixes.yml +0 -90
- data/config/noun_states.yml +0 -177
- data/config/noun_suffixes.yml +0 -113
- data/config/stemmer.yml +0 -206
- data/lib/turkish_stemmer.rb +0 -455
- data/lib/turkish_stemmer/version.rb +0 -3
- data/spec/fixtures/simple_state.yml +0 -14
- data/spec/fixtures/simple_state_02.yml +0 -21
- data/spec/fixtures/simple_suffix.yml +0 -7
- data/spec/fixtures/simple_transition.yml +0 -7
- data/spec/spec_helper.rb +0 -19
- data/spec/support/fixtures.csv +0 -101
- data/spec/turkish_stemmer_spec.rb +0 -522
- data/turkish_stemmer.gemspec +0 -35
data/spec/spec_helper.rb
DELETED
@@ -1,19 +0,0 @@
|
|
1
|
-
# This file was generated by the `rspec --init` command. Conventionally, all
|
2
|
-
# specs live under a `spec` directory, which RSpec adds to the `$LOAD_PATH`.
|
3
|
-
# Require this file using `require "spec_helper"` to ensure that it is only
|
4
|
-
# loaded once.
|
5
|
-
#
|
6
|
-
# See http://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration
|
7
|
-
require 'turkish_stemmer'
|
8
|
-
|
9
|
-
RSpec.configure do |config|
|
10
|
-
config.treat_symbols_as_metadata_keys_with_true_values = true
|
11
|
-
config.run_all_when_everything_filtered = true
|
12
|
-
config.filter_run :focus
|
13
|
-
|
14
|
-
# Run specs in random order to surface order dependencies. If you find an
|
15
|
-
# order dependency and want to debug it, you can fix the order by providing
|
16
|
-
# the seed, which is printed after each run.
|
17
|
-
# --seed 1234
|
18
|
-
config.order = 'random'
|
19
|
-
end
|
data/spec/support/fixtures.csv
DELETED
@@ -1,101 +0,0 @@
|
|
1
|
-
evimden,ev,from my house,ev-(i)m-den,
|
2
|
-
göz,göz,eye,--,
|
3
|
-
güzelmişsin,güzel,you were beautiful,güzel-miş-sin,rumor
|
4
|
-
etkilerden,etki,from the effects,etki-ler-den,
|
5
|
-
çocukmuş,çocuk,it was child,çocuk-miş,rumor
|
6
|
-
kediymiş,kedi,it was cat,kedi-(y)miş,rumor
|
7
|
-
balığım,balık,my fish,balık-(i)m,
|
8
|
-
doktoruymuşsunuz,doktor,you were his/her/its doctor,doktor-i-(y)miş-siniz,rumor
|
9
|
-
kalelerimizdekilerden,kale,the ones that are from our castle,kale-ler-(i)miz-de-ki-ler-den,
|
10
|
-
çocuğuymuşumçasına,çocuk,as if i was his/her child ,çocuk-i-(y)miş-im-cesine,
|
11
|
-
kedileriyle,kedi,with his/her/its cats,kedi-ler-i-(y)le,kedileri+ile
|
12
|
-
çocuklarımmış,çocuk,they were my children,çocuk-ler-(i)m-miş,rumor
|
13
|
-
kitabımızdı,kitap,it was our book,kitap-(i)miz-di,
|
14
|
-
kelimelerin,kelime,"""your"" -or- ""of"" words",kelime-ler-(i)n -or- kelime-ler-in,both ways
|
15
|
-
kayısısı,kayısı,his/her/its apricot,kayısı-(s)ı,
|
16
|
-
eriğinin,erik,"of ""your"" -or- ""his/her/its"" plum",erik-(i)n-in -or- erik-i-(n)in,both ways
|
17
|
-
eriğindeki,erik,"the one that is at ""your"" -or- ""his/her/its"" plum ",erik-(i)n-de-ki - or- erik-i-(n)de-ki,both ways
|
18
|
-
eriğinden,erik,"from ""your"" -or- ""his/her/its"" plum",erik-(i)n-den -or- erik-i-(n)den,both ways
|
19
|
-
eriğine,erik,"to ""your"" -or- ""his/her/its"" plum",erik-(i)n-e -or- erik-i-(n)e,both ways
|
20
|
-
eriğinde,erik,"at ""your"" -or- ""his/her/its"" plum",erik-(i)n-de -or- erik-i-(n)de,both ways
|
21
|
-
kayısısına,kayısı,his/her/its apricot,kayısı-(s)ı-(n)a,
|
22
|
-
kayısısında,kayısı,at his/her/its apricot,kayısı-(s)ı-(n)da,
|
23
|
-
saatlerimiz,saat,our watches/hours,saat-ler-(i)miz,
|
24
|
-
kalemimin,kalem,of my pencil,kalem-(i)m-in,
|
25
|
-
ucu,uç,nib of...,uç-i,
|
26
|
-
kalelerdekilerden,kale,from the ones that are at (the) castle,kale-ler-de-ki-ler-den,
|
27
|
-
kalelerdekilerin,kale,of the ones that are at (the) castle,kale-ler-de-ki-ler-in,
|
28
|
-
kalelerimizdekilerde,kale,at the ones that are at (the) castle,kale-ler-(i)miz-de-ki-ler-de,
|
29
|
-
kaleninkinin,kale,of the one that belongs to (the) castle,kale-(n)in-ki-nin,
|
30
|
-
kalemizinkinin,kale,of the one that belongs to our castle,kale-miz-(i)n-ki-(n)in,
|
31
|
-
kalelerindeki,kale,"the one that is at ""their castle"" -or- ""his/her/its castles""",kale-leri-(n)de-ki -or- kale-ler-i-(n)de-ki,both ways
|
32
|
-
erikleri,erik,"""their plum"" -or- ""his/her/its plums""",erik-leri -or- erik-ler-i,both ways
|
33
|
-
erikler,erik,(the) plums,erik-ler,
|
34
|
-
eriğim,erik,my plum,erik-(i)m,
|
35
|
-
eriğimiz,erik,our plum,erik-(i)miz,
|
36
|
-
eriğin,erik,your plum,erik-(i)n,
|
37
|
-
eriğiniz,erik,your plum,erik-(i)niz,2nd person in plural
|
38
|
-
eriği,erik,his/her/its plum,erik-i,
|
39
|
-
eriğini,erik,"""your"" -or- ""his/her/its"" plum",erik-(i)n-i -or- erik-i-(n)i,both ways
|
40
|
-
eriğinin,erik,"of ""your"" -or- ""his/her/its"" plum",erik-(i)n-in -or- erik-i-(n)in,both ways
|
41
|
-
eriğe,erik,to (the) plum,erik-e,
|
42
|
-
eriğine,erik,"to ""your"" -or- ""his/her/its"" plum",erik-(i)n-e -or- erik-i-(n)e,both ways
|
43
|
-
eriklerine,erik,"to ""their plum"" -or- ""his/her/its plums""",erik-leri-(n)e -or- erik-ler-i-(n)e,both ways
|
44
|
-
erikte,erik,at (the) plum,erik-de,
|
45
|
-
eriğinde,erik,"at ""your"" -or- ""his/her/its"" plum",erik-(i)n-de -or- erik-i-(n)de,both ways
|
46
|
-
erikten,erik,from (the) plum,erik-den,
|
47
|
-
eriğinden,erik,"from ""your"" -or- ""his/her/its"" plum",erik-(i)n-den -or- erik-i-(n)den,both ways
|
48
|
-
eriğindeki,erik,"the one that is at ""your"" -or- ""his/her/its"" plum",erik-(i)n-de-ki - or- erik-i-(n)de-ki,both ways
|
49
|
-
eriğiyle,erik,with his/her/its plum,erik-i-(y)le,
|
50
|
-
eriğinin,erik,"of ""your"" -or- ""his/her/its"" plum",erik-(i)n-in -or- erik-i-(n)in,both ways
|
51
|
-
eriğindeki,erik,"the one that is at ""your"" -or- ""his/her/its"" plum",erik-(i)n-de-ki - or- erik-i-(n)de-ki,both ways
|
52
|
-
eriğince,erik,"after ""your"" -or- ""his/her/its"" plum",erik-(i)n-ce -or- erik-i-(n)ce,both ways
|
53
|
-
gülüm,gül,my rose,gül-(i)m,
|
54
|
-
erikteki,erik,the one that is at (the) plum,erik-de-ki,
|
55
|
-
eriktekilerden,erik,the ones that are from (the) plum,erik-de-ki-ler-den,
|
56
|
-
eriklerdeki,erik,the ones that are at (the) plum,erik-ler-de-ki,
|
57
|
-
kitabı,kitap,(the) book,kitap-i,
|
58
|
-
ağacı,ağaç,(the) tree,ağaç-i,
|
59
|
-
eriğim,erik,my plum / i am plum,erik-(i)m / erik-im,
|
60
|
-
kayısıyım,kayısı,i am apricot,kayısı-(y)ım,
|
61
|
-
eriksem,erik,if i am plum,erik-se-m,
|
62
|
-
eriksen,erik,if you are plum,erik-se-n,
|
63
|
-
erikse,erik,if he/she/it is plum,erik-se,
|
64
|
-
erikseniz,erik,if you are plum,erik-se-niz,2nd person in plural
|
65
|
-
erikseler,erik,if they are plum,erik-se-ler,
|
66
|
-
erikti,erik,he/she/it was plum,erik-di,
|
67
|
-
eriktiniz,erik,you were plum,erik-di-niz,2nd person in plural
|
68
|
-
eriktiler,erik,they were plum,erik-di-ler,
|
69
|
-
erikmiş,erik,it was plum,erik-miş,rumor
|
70
|
-
erikmişçesine,erik,as if it was plum,erik-miş-cesine,
|
71
|
-
erikmiştir,erik,it was plum,erik-miş-dir,rumor
|
72
|
-
erikmişim,erik,i was plum,erik-miş-(i)m,rumor
|
73
|
-
erikmişsin,erik,you were plum,erik-miş-sin,rumor
|
74
|
-
erikmişsindir,erik,you happened to be plum,erik-miş-sin-dir,rumor
|
75
|
-
erikmişimdir,erik,i happened to be plum,erik-miş-im-dir,rumor
|
76
|
-
erikmişiz,erik,we were plum,erik-miş-iz,rumor
|
77
|
-
erikmişizdir,erik,we happened to be plum,erik-miş-iz-dir,rumor
|
78
|
-
erikmişsiniz,erik,you were plum,erik-miş-siniz,2nd person in plural + rumor
|
79
|
-
erikmişsinizdir,erik,you happened to be plum,erik-miş-siniz-dir,2nd person in plural + rumor
|
80
|
-
erikmişler,erik,they were plum,erik-miş-ler,rumor
|
81
|
-
erikmişlerdir,erik,they happened to be plum,erik-miş-ler-dir,rumor
|
82
|
-
erikmişimcesine,erik,as if i was plum,erik-miş-im-cesine,
|
83
|
-
erikmişsincesine,erik,as if you were plum,erik-miş-sin-cesine,
|
84
|
-
erikmişizcesine,erik,as if we were plum,erik-miş-iz-cesine,
|
85
|
-
erikmişsinizcesine,erik,as if you were plum,erik-miş-siniz-cesine,2nd person in plural
|
86
|
-
erikmişlercesine,erik,as if they were plum,erik-miş-ler-cesine,
|
87
|
-
erikler,erik,plums,erik-ler,
|
88
|
-
eriğim,erik,my plum / i am plum,erik-(i)m / erik-im,
|
89
|
-
eriksin,erik,you are plum,erik-sin,
|
90
|
-
erik,erik,plum / he/she/it is plum,erik / erik,
|
91
|
-
eriğiz,erik,we are plum,erik-iz,
|
92
|
-
eriksiniz,erik,you are plum,erik-siniz,2nd person in plural
|
93
|
-
erikler,erik,they are plum,erik-ler,
|
94
|
-
eriktir,erik,it is plum,erik-dir,assumption
|
95
|
-
eriktirler,erik,they are plum,erik-dir-ler,assumption
|
96
|
-
erikken,erik,while he/she/it was plum,erik-(i)ken,
|
97
|
-
kötüymüş,kötü,he/she/it is bad,kötü-(y)miş,rumor
|
98
|
-
yüz,yüz,face / hundred,yüz,double meaning
|
99
|
-
muş,muş,--,--,this is a suffix
|
100
|
-
ad,ad,name,ad,
|
101
|
-
soyad,soyad,surname,soyad,soy+ad = lineage+name
|
@@ -1,522 +0,0 @@
|
|
1
|
-
# coding: utf-8
|
2
|
-
require "spec_helper"
|
3
|
-
require "pry"
|
4
|
-
require "csv"
|
5
|
-
|
6
|
-
describe TurkishStemmer do
|
7
|
-
|
8
|
-
describe ".count_syllables" do
|
9
|
-
it "counts syllables correctly" do
|
10
|
-
expect(described_class.count_syllables("erikler")).to eq 3
|
11
|
-
expect(described_class.count_syllables("çocuklarımmış")).to eq 5
|
12
|
-
end
|
13
|
-
end
|
14
|
-
|
15
|
-
describe ".vowels" do
|
16
|
-
it "returns all vowels of a word" do
|
17
|
-
expect(described_class.vowels("kötüymüş")).to eq(%w(ö ü ü))
|
18
|
-
end
|
19
|
-
end
|
20
|
-
|
21
|
-
describe ".has_roundness?" do
|
22
|
-
context "when vowel is empty" do
|
23
|
-
it "has roundness" do
|
24
|
-
expect(described_class).to have_roundness(nil, "a")
|
25
|
-
end
|
26
|
-
end
|
27
|
-
|
28
|
-
context "when candidate is empty" do
|
29
|
-
it "has roundness" do
|
30
|
-
expect(described_class).to have_roundness("a", nil)
|
31
|
-
end
|
32
|
-
end
|
33
|
-
|
34
|
-
context "when an unrounded vowel is passed" do
|
35
|
-
let(:vowel) { described_class::UNROUNDED_VOWELS.chars.to_a.sample }
|
36
|
-
|
37
|
-
context "and candidate is an unrounded vowel too" do
|
38
|
-
let(:candidate) { described_class::UNROUNDED_VOWELS.chars.to_a.sample }
|
39
|
-
|
40
|
-
it "has roundness" do
|
41
|
-
expect(described_class).to have_roundness(vowel, candidate)
|
42
|
-
end
|
43
|
-
end
|
44
|
-
|
45
|
-
context "and candidate is not an unrounded vowel" do
|
46
|
-
let(:candidate) { described_class::ROUNDED_VOWELS.chars.to_a.sample }
|
47
|
-
|
48
|
-
it "does not have roundness" do
|
49
|
-
expect(described_class).not_to have_roundness(vowel, candidate)
|
50
|
-
end
|
51
|
-
end
|
52
|
-
end
|
53
|
-
|
54
|
-
context "when a rounded vowel is passed" do
|
55
|
-
let(:vowel) { described_class::ROUNDED_VOWELS.chars.to_a.sample }
|
56
|
-
|
57
|
-
context "and one of 'a', 'e', 'u' or 'ü' is a candidate" do
|
58
|
-
let(:candidate) { described_class::FOLLOWING_ROUNDED_VOWELS.chars.to_a.sample }
|
59
|
-
|
60
|
-
it "has roundness" do
|
61
|
-
expect(described_class).to have_roundness(vowel, candidate)
|
62
|
-
end
|
63
|
-
end
|
64
|
-
|
65
|
-
context "and candidate is 'o'" do
|
66
|
-
let(:candidate) { 'o' }
|
67
|
-
|
68
|
-
it "does not have roundness" do
|
69
|
-
expect(described_class).not_to have_roundness(vowel, candidate)
|
70
|
-
end
|
71
|
-
end
|
72
|
-
end
|
73
|
-
end
|
74
|
-
|
75
|
-
describe ".has_frontness?" do
|
76
|
-
context "when vowel is empty" do
|
77
|
-
it "has frontness" do
|
78
|
-
expect(described_class).to have_frontness(nil, "a")
|
79
|
-
end
|
80
|
-
end
|
81
|
-
|
82
|
-
context "when candidate is empty" do
|
83
|
-
it "has frontness" do
|
84
|
-
expect(described_class).to have_frontness("a", nil)
|
85
|
-
end
|
86
|
-
end
|
87
|
-
|
88
|
-
context "when a front vowel is passed" do
|
89
|
-
let(:vowel) { described_class::FRONT_VOWELS.chars.to_a.sample }
|
90
|
-
|
91
|
-
context "and candidate is a front vowel" do
|
92
|
-
let(:candidate) { described_class::FRONT_VOWELS.chars.to_a.sample }
|
93
|
-
|
94
|
-
it "has frontness" do
|
95
|
-
expect(described_class).to have_frontness(vowel, candidate)
|
96
|
-
end
|
97
|
-
end
|
98
|
-
|
99
|
-
context "and candidate is a back vowel" do
|
100
|
-
let(:candidate) { described_class::BACK_VOWELS.chars.to_a.sample }
|
101
|
-
|
102
|
-
it "does not have frontness" do
|
103
|
-
expect(described_class).not_to have_frontness(vowel, candidate)
|
104
|
-
end
|
105
|
-
end
|
106
|
-
end
|
107
|
-
|
108
|
-
context "when a back vowel is passed" do
|
109
|
-
let(:vowel) { described_class::BACK_VOWELS.chars.to_a.sample }
|
110
|
-
|
111
|
-
context "and candidate is a front vowel" do
|
112
|
-
let(:candidate) { described_class::FRONT_VOWELS.chars.to_a.sample }
|
113
|
-
|
114
|
-
it "does not have frontness" do
|
115
|
-
expect(described_class).not_to have_frontness(vowel, candidate)
|
116
|
-
end
|
117
|
-
end
|
118
|
-
|
119
|
-
context "and candidate is a back vowel" do
|
120
|
-
let(:candidate) { described_class::BACK_VOWELS.chars.to_a.sample }
|
121
|
-
|
122
|
-
it "has frontness" do
|
123
|
-
expect(described_class).to have_frontness(vowel, candidate)
|
124
|
-
end
|
125
|
-
end
|
126
|
-
end
|
127
|
-
end
|
128
|
-
|
129
|
-
describe ".has_vowel_harmony?" do
|
130
|
-
it "has vowel harmony for valid Turkish words" do
|
131
|
-
expect(described_class).to have_vowel_harmony("Türkiyedir")
|
132
|
-
expect(described_class).to have_vowel_harmony("kapıdır")
|
133
|
-
expect(described_class).to have_vowel_harmony("gündür")
|
134
|
-
expect(described_class).to have_vowel_harmony("paltodur")
|
135
|
-
end
|
136
|
-
|
137
|
-
it "does not have vowel harmony for loanwords" do
|
138
|
-
expect(described_class).not_to have_vowel_harmony("kürdan")
|
139
|
-
end
|
140
|
-
|
141
|
-
it "does not have vowel harmony for exceptions" do
|
142
|
-
expect(described_class).not_to have_vowel_harmony("anne")
|
143
|
-
expect(described_class).not_to have_vowel_harmony("kardeş")
|
144
|
-
end
|
145
|
-
end
|
146
|
-
|
147
|
-
describe ".affix_morphological_stripper" do
|
148
|
-
context "when states are empty" do
|
149
|
-
it "returns the word" do
|
150
|
-
expect(
|
151
|
-
described_class.
|
152
|
-
affix_morphological_stripper("kapıdır", suffixes: :test)).
|
153
|
-
to eq(["kapıdır"])
|
154
|
-
end
|
155
|
-
end
|
156
|
-
|
157
|
-
context "when suffixes are empty" do
|
158
|
-
it "return the word" do
|
159
|
-
expect(
|
160
|
-
described_class.
|
161
|
-
affix_morphological_stripper("kapıdır", states: :test)).
|
162
|
-
to eq(["kapıdır"])
|
163
|
-
end
|
164
|
-
end
|
165
|
-
|
166
|
-
context "when there exist states and suffixes" do
|
167
|
-
let(:states) {
|
168
|
-
described_class.
|
169
|
-
load_states_or_suffixes("spec/fixtures/simple_state.yml")
|
170
|
-
}
|
171
|
-
|
172
|
-
let(:suffixes) {
|
173
|
-
described_class.
|
174
|
-
load_states_or_suffixes("spec/fixtures/simple_suffix.yml")
|
175
|
-
}
|
176
|
-
|
177
|
-
it "generates pendings for the initial state" do
|
178
|
-
described_class.should_receive(:generate_pendings).with(:a,
|
179
|
-
"word", states, suffixes).and_call_original
|
180
|
-
|
181
|
-
described_class.affix_morphological_stripper("word",
|
182
|
-
states: states, suffixes: suffixes)
|
183
|
-
end
|
184
|
-
end
|
185
|
-
|
186
|
-
context "when a transition is valid" do
|
187
|
-
let(:states) {
|
188
|
-
described_class.
|
189
|
-
load_states_or_suffixes("spec/fixtures/simple_state.yml")
|
190
|
-
}
|
191
|
-
|
192
|
-
let(:suffixes) {
|
193
|
-
described_class.
|
194
|
-
load_states_or_suffixes("spec/fixtures/simple_suffix.yml")
|
195
|
-
}
|
196
|
-
|
197
|
-
context "and the transit state is a final state" do
|
198
|
-
it "removes similar pending transitions" do
|
199
|
-
described_class.should_receive(:mark_stem).with(
|
200
|
-
"guzelim", suffixes[:s1]).and_call_original
|
201
|
-
|
202
|
-
described_class.affix_morphological_stripper(
|
203
|
-
"guzelim", states: states, suffixes: suffixes)
|
204
|
-
end
|
205
|
-
|
206
|
-
context "with no other transitions" do
|
207
|
-
it "stems the word" do
|
208
|
-
expect(
|
209
|
-
described_class.
|
210
|
-
affix_morphological_stripper("guzelim",
|
211
|
-
states: states, suffixes: suffixes)).
|
212
|
-
to eq ["guzel"]
|
213
|
-
end
|
214
|
-
end
|
215
|
-
|
216
|
-
context "with other transitions" do
|
217
|
-
let(:states) {
|
218
|
-
described_class.load_states_or_suffixes("spec/fixtures/simple_state_02.yml")
|
219
|
-
}
|
220
|
-
|
221
|
-
it "adds more pendings to check" do
|
222
|
-
described_class.should_receive(:mark_stem).with("guzelim",
|
223
|
-
suffixes[:s1]).and_call_original
|
224
|
-
|
225
|
-
described_class.affix_morphological_stripper("guzelim",
|
226
|
-
states: states, suffixes: suffixes)
|
227
|
-
end
|
228
|
-
end
|
229
|
-
end
|
230
|
-
end
|
231
|
-
|
232
|
-
context "when one suffix matches correctly with a given word" do
|
233
|
-
it "does not compare other suffixes in the same transition" do
|
234
|
-
described_class.
|
235
|
-
should_receive(:mark_stem).
|
236
|
-
with(anything, anything).
|
237
|
-
# only for suffixes [sUnUz, nUz]
|
238
|
-
exactly(2).times.
|
239
|
-
and_call_original
|
240
|
-
|
241
|
-
puts described_class.
|
242
|
-
affix_morphological_stripper("taksicisiniz",
|
243
|
-
states: described_class::NOMINAL_VERB_STATES,
|
244
|
-
suffixes: described_class::NOMINAL_VERB_SUFFIXES)
|
245
|
-
end
|
246
|
-
end
|
247
|
-
end
|
248
|
-
|
249
|
-
describe ".stem" do
|
250
|
-
context "when input is single syllable" do
|
251
|
-
it "returns the input as is" do
|
252
|
-
expect(described_class.stem("ev")).to eq "ev"
|
253
|
-
end
|
254
|
-
end
|
255
|
-
|
256
|
-
context "when input has zero syllables - one consonant" do
|
257
|
-
it "returns the input as is" do
|
258
|
-
expect(described_class.stem("p")).to eq "p"
|
259
|
-
end
|
260
|
-
end
|
261
|
-
end
|
262
|
-
|
263
|
-
describe ".last_consonant!" do
|
264
|
-
context "when last consonant is among 'b', 'c', 'd' or 'ğ'" do
|
265
|
-
it "is replaced by 'p', 'ç', 't' or 'k'" do
|
266
|
-
expect(described_class.last_consonant!('kebab')).to eq('kebap')
|
267
|
-
expect(described_class.last_consonant!('kebac')).to eq('kebaç')
|
268
|
-
expect(described_class.last_consonant!('kebad')).to eq('kebat')
|
269
|
-
expect(described_class.last_consonant!('kebağ')).to eq('kebak')
|
270
|
-
end
|
271
|
-
end
|
272
|
-
|
273
|
-
context "when word belongs to protected words" do
|
274
|
-
it "does not replace last consonant" do
|
275
|
-
expect(described_class.last_consonant!('ad')).to eq('ad')
|
276
|
-
end
|
277
|
-
end
|
278
|
-
end
|
279
|
-
|
280
|
-
describe ".mark_stem" do
|
281
|
-
let(:suffix) do
|
282
|
-
{
|
283
|
-
"name" => "-dir",
|
284
|
-
"regex" => "dir",
|
285
|
-
"optional_letter" => false,
|
286
|
-
"check_harmony" => true
|
287
|
-
}
|
288
|
-
end
|
289
|
-
|
290
|
-
context "when suffix has harmony check on" do
|
291
|
-
before do
|
292
|
-
suffix["regex"] = "dan"
|
293
|
-
end
|
294
|
-
|
295
|
-
context "and word does not obey harmony rules" do
|
296
|
-
it "does not stem a word that does not obey harmony rules" do
|
297
|
-
expect(described_class.mark_stem("kürdan", suffix)).to eq(
|
298
|
-
{ stem: false, word: "kürdan", suffix_applied: nil })
|
299
|
-
end
|
300
|
-
|
301
|
-
context "and word belongs to exceptions" do
|
302
|
-
before do
|
303
|
-
suffix["regex"] = "ler"
|
304
|
-
end
|
305
|
-
it "stems the word" do
|
306
|
-
expect(described_class.mark_stem("saatler", suffix)).to eq(
|
307
|
-
{ stem: true, word: "saat", suffix_applied: "ler" })
|
308
|
-
end
|
309
|
-
end
|
310
|
-
end
|
311
|
-
|
312
|
-
end
|
313
|
-
|
314
|
-
context "when suffix has harmony check off" do
|
315
|
-
before do
|
316
|
-
suffix["regex"] = "dan"
|
317
|
-
suffix["check_harmony"] = false
|
318
|
-
end
|
319
|
-
|
320
|
-
it "stems a word that does not obey harmony rules" do
|
321
|
-
expect(
|
322
|
-
described_class.
|
323
|
-
mark_stem("kürdan", suffix)).
|
324
|
-
to eq({ stem: true, word: "kür", suffix_applied: "dan" })
|
325
|
-
end
|
326
|
-
end
|
327
|
-
|
328
|
-
context "when word matches suffix" do
|
329
|
-
it "partially stems a word" do
|
330
|
-
expect(
|
331
|
-
described_class.
|
332
|
-
mark_stem("Türkiyedir", suffix)).
|
333
|
-
to eq({ stem: true, word: "Türkiye", suffix_applied: "dir" })
|
334
|
-
end
|
335
|
-
|
336
|
-
|
337
|
-
context "when suffix has (y) as optional letter" do
|
338
|
-
before do
|
339
|
-
suffix["optional_letter"] = "y|y"
|
340
|
-
suffix["regex"] = "um"
|
341
|
-
end
|
342
|
-
|
343
|
-
context "and new word has valid last 'y' symbol" do
|
344
|
-
it "stems correctly and increases the suffix" do
|
345
|
-
expect(
|
346
|
-
described_class.
|
347
|
-
mark_stem("loyum", suffix)).
|
348
|
-
to eq({ stem: true, word: "lo", suffix_applied: "yum" })
|
349
|
-
end
|
350
|
-
end
|
351
|
-
|
352
|
-
context "and new word does not have valid last 'y' symbol" do
|
353
|
-
it "does not stem the word" do
|
354
|
-
expect(
|
355
|
-
described_class.
|
356
|
-
mark_stem("lotyum", suffix)).
|
357
|
-
to eq({ stem: false, word: "lotyum", suffix_applied: nil })
|
358
|
-
end
|
359
|
-
end
|
360
|
-
end
|
361
|
-
end
|
362
|
-
end
|
363
|
-
|
364
|
-
describe ".generate_pendings" do
|
365
|
-
let(:states) { described_class::NOMINAL_VERB_STATES }
|
366
|
-
let(:suffixes) { described_class::NOMINAL_VERB_SUFFIXES }
|
367
|
-
|
368
|
-
it "raises an error if state does not exist" do
|
369
|
-
expect {
|
370
|
-
described_class.
|
371
|
-
generate_pendings(1, "satıyorsunuz", states, suffixes)
|
372
|
-
}.to raise_error(ArgumentError, "State #{1} does not exist")
|
373
|
-
end
|
374
|
-
|
375
|
-
context "when state key does not have transitions" do
|
376
|
-
it "returns an empty array" do
|
377
|
-
expect(
|
378
|
-
described_class.
|
379
|
-
# :f state does not have transitions
|
380
|
-
generate_pendings(:f, "satıyorsunuz", states, suffixes)).
|
381
|
-
to eq []
|
382
|
-
end
|
383
|
-
end
|
384
|
-
|
385
|
-
context "when state key has transitions" do
|
386
|
-
it "returns an array of hashes for each transition" do
|
387
|
-
expect(
|
388
|
-
described_class.
|
389
|
-
generate_pendings(:a, "satıyorsunuz", states, suffixes).first.keys).
|
390
|
-
to eq [:suffix, :to_state, :from_state, :word, :mark]
|
391
|
-
end
|
392
|
-
|
393
|
-
it "sets :from_state key to current key state" do
|
394
|
-
expect(
|
395
|
-
described_class.
|
396
|
-
generate_pendings(:a, "satıyorsunuz", states, suffixes).first[:from_state]).
|
397
|
-
to eq :a
|
398
|
-
end
|
399
|
-
end
|
400
|
-
end
|
401
|
-
|
402
|
-
describe ".valid_optional_letter?" do
|
403
|
-
context "when last letter of the word is not equal to candidate" do
|
404
|
-
it "responds with [true,nil] - indicating that there was not match" do
|
405
|
-
expect(
|
406
|
-
described_class.valid_optional_letter?("test", "r")).
|
407
|
-
to eq([true, nil])
|
408
|
-
end
|
409
|
-
end
|
410
|
-
|
411
|
-
context "when there is a vowel match" do
|
412
|
-
context "and the previous char is a vowel" do
|
413
|
-
it "responds with false" do
|
414
|
-
expect(
|
415
|
-
described_class.
|
416
|
-
valid_optional_letter?("takcicii", "i")).
|
417
|
-
to eq([false, "i"])
|
418
|
-
end
|
419
|
-
end
|
420
|
-
|
421
|
-
context "and the previous char is a consonant" do
|
422
|
-
it "responds with true" do
|
423
|
-
expect(
|
424
|
-
described_class.
|
425
|
-
valid_optional_letter?("okula", "a")).
|
426
|
-
to eq([true, "a"])
|
427
|
-
end
|
428
|
-
end
|
429
|
-
end
|
430
|
-
|
431
|
-
context "when there is a consonant match" do
|
432
|
-
context "and the previous char is a vowel" do
|
433
|
-
it "responds with true" do
|
434
|
-
expect(
|
435
|
-
described_class.
|
436
|
-
valid_optional_letter?("litiy", "y")).
|
437
|
-
to eq([true, "y"])
|
438
|
-
end
|
439
|
-
end
|
440
|
-
|
441
|
-
context "and the previous char is a consonant" do
|
442
|
-
it "responds with true" do
|
443
|
-
expect(
|
444
|
-
described_class.
|
445
|
-
valid_optional_letter?("lity", "y")).
|
446
|
-
to eq([false, "y"])
|
447
|
-
end
|
448
|
-
end
|
449
|
-
end
|
450
|
-
end
|
451
|
-
|
452
|
-
describe ".stem_post_process" do
|
453
|
-
context "when input stream has words with last consonant replacements" do
|
454
|
-
it "replaces last consonant" do
|
455
|
-
expect(described_class.stem_post_process(["kebab"], "word")).to eq("kebap")
|
456
|
-
end
|
457
|
-
end
|
458
|
-
|
459
|
-
it "flattens and uniq results" do
|
460
|
-
expect(described_class.stem_post_process(["kitap",["kitap"]], "word")).to eq("kitap")
|
461
|
-
end
|
462
|
-
|
463
|
-
it "removes no syllables words" do
|
464
|
-
expect(described_class.stem_post_process(["kitap", "k"], "word")).to eq("kitap")
|
465
|
-
end
|
466
|
-
|
467
|
-
context "when multiple stem candidates exist" do
|
468
|
-
it "returns the shortest" do
|
469
|
-
pending("fix this")
|
470
|
-
expect(described_class.stem_post_process(["kitap", "kita", "kit"], "word")).to eq "kit"
|
471
|
-
end
|
472
|
-
|
473
|
-
context "and word belongs to selection list" do
|
474
|
-
it "returns this word" do
|
475
|
-
expect(described_class.stem_post_process(
|
476
|
-
["su", "suy", "suyu"], "suyu")).to eq "su"
|
477
|
-
end
|
478
|
-
end
|
479
|
-
end
|
480
|
-
end
|
481
|
-
|
482
|
-
describe ".proceed_to_stem?" do
|
483
|
-
context "when word has 1 or less syllables" do
|
484
|
-
it "returns false" do
|
485
|
-
expect(described_class.proceed_to_stem?("kit")).not_to be
|
486
|
-
end
|
487
|
-
end
|
488
|
-
|
489
|
-
context "when word is nil" do
|
490
|
-
it "returns false" do
|
491
|
-
expect(described_class.proceed_to_stem?(nil)).not_to be
|
492
|
-
end
|
493
|
-
end
|
494
|
-
|
495
|
-
context "when word is empty" do
|
496
|
-
it "returns false" do
|
497
|
-
expect(described_class.proceed_to_stem?("")).not_to be
|
498
|
-
end
|
499
|
-
end
|
500
|
-
|
501
|
-
context "when word is among protected words" do
|
502
|
-
it "returns false" do
|
503
|
-
expect(described_class.proceed_to_stem?("soyad")).not_to be
|
504
|
-
end
|
505
|
-
end
|
506
|
-
|
507
|
-
context "when word contains non Turkish letters" do
|
508
|
-
it "returns false" do
|
509
|
-
expect(described_class.proceed_to_stem?("τελειο")).not_to be
|
510
|
-
expect(described_class.proceed_to_stem?("&aa")).not_to be
|
511
|
-
end
|
512
|
-
end
|
513
|
-
end
|
514
|
-
|
515
|
-
context "1:1 testing with paper" do
|
516
|
-
CSV.read("spec/support/fixtures.csv").each do |row|
|
517
|
-
it "stems #{row[0]} correct" do
|
518
|
-
expect(described_class.stem(row[0].downcase)).to eq row[1].downcase
|
519
|
-
end
|
520
|
-
end
|
521
|
-
end
|
522
|
-
end
|