turkish_stemmer 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +18 -0
- data/.rspec +2 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +22 -0
- data/README.md +282 -0
- data/Rakefile +21 -0
- data/benchmarks/stemmers_comparison.rb +16 -0
- data/benchmarks/stemming_samples.txt +17916 -0
- data/benchmarks/turkish_word_recognition.rb +26 -0
- data/config/derivational_states.yml +10 -0
- data/config/derivational_suffixes.yml +6 -0
- data/config/nominal_verb_states.yml +121 -0
- data/config/nominal_verb_suffixes.yml +90 -0
- data/config/noun_states.yml +177 -0
- data/config/noun_suffixes.yml +113 -0
- data/config/stemmer.yml +206 -0
- data/lib/hash_extension.rb +5 -0
- data/lib/turkish_stemmer/version.rb +3 -0
- data/lib/turkish_stemmer.rb +455 -0
- data/spec/fixtures/simple_state.yml +14 -0
- data/spec/fixtures/simple_state_02.yml +21 -0
- data/spec/fixtures/simple_suffix.yml +7 -0
- data/spec/fixtures/simple_transition.yml +7 -0
- data/spec/spec_helper.rb +19 -0
- data/spec/support/fixtures.csv +101 -0
- data/spec/turkish_stemmer_spec.rb +522 -0
- data/turkish_stemmer.gemspec +35 -0
- metadata +164 -0
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,19 @@
|
|
1
|
+
# This file was generated by the `rspec --init` command. Conventionally, all
|
2
|
+
# specs live under a `spec` directory, which RSpec adds to the `$LOAD_PATH`.
|
3
|
+
# Require this file using `require "spec_helper"` to ensure that it is only
|
4
|
+
# loaded once.
|
5
|
+
#
|
6
|
+
# See http://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration
|
7
|
+
require 'turkish_stemmer'
|
8
|
+
|
9
|
+
# Global RSpec settings shared by every spec file that requires this helper.
RSpec.configure do |config|
  # RSpec 3 always treats bare symbols as `symbol: true` metadata and
  # deprecates this setter, so only opt in on RSpec 2.x, where it is still
  # an explicit setting.
  if RSpec::Core::Version::STRING.start_with?("2.")
    config.treat_symbols_as_metadata_keys_with_true_values = true
  end

  # Run only the examples tagged with :focus; when nothing is tagged, run
  # the whole suite instead of running nothing.
  config.run_all_when_everything_filtered = true
  config.filter_run :focus

  # Run specs in random order to surface order dependencies. If you find an
  # order dependency and want to debug it, you can fix the order by providing
  # the seed, which is printed after each run.
  #     --seed 1234
  config.order = 'random'
end
|
@@ -0,0 +1,101 @@
|
|
1
|
+
evimden,ev,from my house,ev-(i)m-den,
|
2
|
+
göz,göz,eye,--,
|
3
|
+
güzelmişsin,güzel,you were beautiful,güzel-miş-sin,rumor
|
4
|
+
etkilerden,etki,from the effects,etki-ler-den,
|
5
|
+
çocukmuş,çocuk,it was child,çocuk-miş,rumor
|
6
|
+
kediymiş,kedi,it was cat,kedi-(y)miş,rumor
|
7
|
+
balığım,balık,my fish,balık-(i)m,
|
8
|
+
doktoruymuşsunuz,doktor,you were his/her/its doctor,doktor-i-(y)miş-siniz,rumor
|
9
|
+
kalelerimizdekilerden,kale,the ones that are from our castle,kale-ler-(i)miz-de-ki-ler-den,
|
10
|
+
çocuğuymuşumçasına,çocuk,as if i was his/her child ,çocuk-i-(y)miş-im-cesine,
|
11
|
+
kedileriyle,kedi,with his/her/its cats,kedi-ler-i-(y)le,kedileri+ile
|
12
|
+
çocuklarımmış,çocuk,they were my children,çocuk-ler-(i)m-miş,rumor
|
13
|
+
kitabımızdı,kitap,it was our book,kitap-(i)miz-di,
|
14
|
+
kelimelerin,kelime,"""your"" -or- ""of"" words",kelime-ler-(i)n -or- kelime-ler-in,both ways
|
15
|
+
kayısısı,kayısı,his/her/its apricot,kayısı-(s)ı,
|
16
|
+
eriğinin,erik,"of ""your"" -or- ""his/her/its"" plum",erik-(i)n-in -or- erik-i-(n)in,both ways
|
17
|
+
eriğindeki,erik,"the one that is at ""your"" -or- ""his/her/its"" plum ",erik-(i)n-de-ki - or- erik-i-(n)de-ki,both ways
|
18
|
+
eriğinden,erik,"from ""your"" -or- ""his/her/its"" plum",erik-(i)n-den -or- erik-i-(n)den,both ways
|
19
|
+
eriğine,erik,"to ""your"" -or- ""his/her/its"" plum",erik-(i)n-e -or- erik-i-(n)e,both ways
|
20
|
+
eriğinde,erik,"at ""your"" -or- ""his/her/its"" plum",erik-(i)n-de -or- erik-i-(n)de,both ways
|
21
|
+
kayısısına,kayısı,his/her/its apricot,kayısı-(s)ı-(n)a,
|
22
|
+
kayısısında,kayısı,at his/her/its apricot,kayısı-(s)ı-(n)da,
|
23
|
+
saatlerimiz,saat,our watches/hours,saat-ler-(i)miz,
|
24
|
+
kalemimin,kalem,of my pencil,kalem-(i)m-in,
|
25
|
+
ucu,uç,nib of...,uç-i,
|
26
|
+
kalelerdekilerden,kale,from the ones that are at (the) castle,kale-ler-de-ki-ler-den,
|
27
|
+
kalelerdekilerin,kale,of the ones that are at (the) castle,kale-ler-de-ki-ler-in,
|
28
|
+
kalelerimizdekilerde,kale,at the ones that are at (the) castle,kale-ler-(i)miz-de-ki-ler-de,
|
29
|
+
kaleninkinin,kale,of the one that belongs to (the) castle,kale-(n)in-ki-nin,
|
30
|
+
kalemizinkinin,kale,of the one that belongs to our castle,kale-miz-(i)n-ki-(n)in,
|
31
|
+
kalelerindeki,kale,"the one that is at ""their castle"" -or- ""his/her/its castles""",kale-leri-(n)de-ki -or- kale-ler-i-(n)de-ki,both ways
|
32
|
+
erikleri,erik,"""their plum"" -or- ""his/her/its plums""",erik-leri -or- erik-ler-i,both ways
|
33
|
+
erikler,erik,(the) plums,erik-ler,
|
34
|
+
eriğim,erik,my plum,erik-(i)m,
|
35
|
+
eriğimiz,erik,our plum,erik-(i)miz,
|
36
|
+
eriğin,erik,your plum,erik-(i)n,
|
37
|
+
eriğiniz,erik,your plum,erik-(i)niz,2nd person in plural
|
38
|
+
eriği,erik,his/her/its plum,erik-i,
|
39
|
+
eriğini,erik,"""your"" -or- ""his/her/its"" plum",erik-(i)n-i -or- erik-i-(n)i,both ways
|
40
|
+
eriğinin,erik,"of ""your"" -or- ""his/her/its"" plum",erik-(i)n-in -or- erik-i-(n)in,both ways
|
41
|
+
eriğe,erik,to (the) plum,erik-e,
|
42
|
+
eriğine,erik,"to ""your"" -or- ""his/her/its"" plum",erik-(i)n-e -or- erik-i-(n)e,both ways
|
43
|
+
eriklerine,erik,"to ""their plum"" -or- ""his/her/its plums""",erik-leri-(n)e -or- erik-ler-i-(n)e,both ways
|
44
|
+
erikte,erik,at (the) plum,erik-de,
|
45
|
+
eriğinde,erik,"at ""your"" -or- ""his/her/its"" plum",erik-(i)n-de -or- erik-i-(n)de,both ways
|
46
|
+
erikten,erik,from (the) plum,erik-den,
|
47
|
+
eriğinden,erik,"from ""your"" -or- ""his/her/its"" plum",erik-(i)n-den -or- erik-i-(n)den,both ways
|
48
|
+
eriğindeki,erik,"the one that is at ""your"" -or- ""his/her/its"" plum",erik-(i)n-de-ki - or- erik-i-(n)de-ki,both ways
|
49
|
+
eriğiyle,erik,with his/her/its plum,erik-i-(y)le,
|
50
|
+
eriğinin,erik,"of ""your"" -or- ""his/her/its"" plum",erik-(i)n-in -or- erik-i-(n)in,both ways
|
51
|
+
eriğindeki,erik,"the one that is at ""your"" -or- ""his/her/its"" plum",erik-(i)n-de-ki - or- erik-i-(n)de-ki,both ways
|
52
|
+
eriğince,erik,"after ""your"" -or- ""his/her/its"" plum",erik-(i)n-ce -or- erik-i-(n)ce,both ways
|
53
|
+
gülüm,gül,my rose,gül-(i)m,
|
54
|
+
erikteki,erik,the one that is at (the) plum,erik-de-ki,
|
55
|
+
eriktekilerden,erik,the ones that are from (the) plum,erik-de-ki-ler-den,
|
56
|
+
eriklerdeki,erik,the ones that are at (the) plum,erik-ler-de-ki,
|
57
|
+
kitabı,kitap,(the) book,kitap-i,
|
58
|
+
ağacı,ağaç,(the) tree,ağaç-i,
|
59
|
+
eriğim,erik,my plum / i am plum,erik-(i)m / erik-im,
|
60
|
+
kayısıyım,kayısı,i am apricot,kayısı-(y)ım,
|
61
|
+
eriksem,erik,if i am plum,erik-se-m,
|
62
|
+
eriksen,erik,if you are plum,erik-se-n,
|
63
|
+
erikse,erik,if he/she/it is plum,erik-se,
|
64
|
+
erikseniz,erik,if you are plum,erik-se-niz,2nd person in plural
|
65
|
+
erikseler,erik,if they are plum,erik-se-ler,
|
66
|
+
erikti,erik,he/she/it was plum,erik-di,
|
67
|
+
eriktiniz,erik,you were plum,erik-di-niz,2nd person in plural
|
68
|
+
eriktiler,erik,they were plum,erik-di-ler,
|
69
|
+
erikmiş,erik,it was plum,erik-miş,rumor
|
70
|
+
erikmişçesine,erik,as if it was plum,erik-miş-cesine,
|
71
|
+
erikmiştir,erik,it was plum,erik-miş-dir,rumor
|
72
|
+
erikmişim,erik,i was plum,erik-miş-(i)m,rumor
|
73
|
+
erikmişsin,erik,you were plum,erik-miş-sin,rumor
|
74
|
+
erikmişsindir,erik,you happened to be plum,erik-miş-sin-dir,rumor
|
75
|
+
erikmişimdir,erik,i happened to be plum,erik-miş-im-dir,rumor
|
76
|
+
erikmişiz,erik,we were plum,erik-miş-iz,rumor
|
77
|
+
erikmişizdir,erik,we happened to be plum,erik-miş-iz-dir,rumor
|
78
|
+
erikmişsiniz,erik,you were plum,erik-miş-siniz,2nd person in plural + rumor
|
79
|
+
erikmişsinizdir,erik,you happened to be plum,erik-miş-siniz-dir,2nd person in plural + rumor
|
80
|
+
erikmişler,erik,they were plum,erik-miş-ler,rumor
|
81
|
+
erikmişlerdir,erik,they happened to be plum,erik-miş-ler-dir,rumor
|
82
|
+
erikmişimcesine,erik,as if i was plum,erik-miş-im-cesine,
|
83
|
+
erikmişsincesine,erik,as if you were plum,erik-miş-sin-cesine,
|
84
|
+
erikmişizcesine,erik,as if we were plum,erik-miş-iz-cesine,
|
85
|
+
erikmişsinizcesine,erik,as if you were plum,erik-miş-siniz-cesine,2nd person in plural
|
86
|
+
erikmişlercesine,erik,as if they were plum,erik-miş-ler-cesine,
|
87
|
+
erikler,erik,plums,erik-ler,
|
88
|
+
eriğim,erik,my plum / i am plum,erik-(i)m / erik-im,
|
89
|
+
eriksin,erik,you are plum,erik-sin,
|
90
|
+
erik,erik,plum / he/she/it is plum,erik / erik,
|
91
|
+
eriğiz,erik,we are plum,erik-iz,
|
92
|
+
eriksiniz,erik,you are plum,erik-siniz,2nd person in plural
|
93
|
+
erikler,erik,they are plum,erik-ler,
|
94
|
+
eriktir,erik,it is plum,erik-dir,assumption
|
95
|
+
eriktirler,erik,they are plum,erik-dir-ler,assumption
|
96
|
+
erikken,erik,while he/she/it was plum,erik-(i)ken,
|
97
|
+
kötüymüş,kötü,he/she/it is bad,kötü-(y)miş,rumor
|
98
|
+
yüz,yüz,face / hundred,yüz,double meaning
|
99
|
+
muş,muş,--,--,this is a suffix
|
100
|
+
ad,ad,name,ad,
|
101
|
+
soyad,soyad,surname,soyad,soy+ad = lineage+name
|
@@ -0,0 +1,522 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
require "spec_helper"
|
3
|
+
require "pry"
|
4
|
+
require "csv"
|
5
|
+
|
6
|
+
describe TurkishStemmer do
|
7
|
+
|
8
|
+
describe ".count_syllables" do
  # Each pair is a word and the syllable count expected for it.
  it "counts syllables correctly" do
    [["erikler", 3], ["çocuklarımmış", 5]].each do |word, expected|
      expect(described_class.count_syllables(word)).to eq(expected)
    end
  end
end
|
14
|
+
|
15
|
+
describe ".vowels" do
  # The vowels are expected in the order in which they occur in the word.
  it "returns all vowels of a word" do
    found = described_class.vowels("kötüymüş")

    expect(found).to eq(["ö", "ü", "ü"])
  end
end
|
20
|
+
|
21
|
+
describe ".has_roundness?" do
  # Returns one randomly chosen letter from the given string of letters.
  def random_letter(letters)
    letters.chars.to_a.sample
  end

  # A missing vowel or candidate is expected to count as round-compatible.
  context "when vowel is empty" do
    it "has roundness" do
      expect(described_class).to have_roundness(nil, "a")
    end
  end

  context "when candidate is empty" do
    it "has roundness" do
      expect(described_class).to have_roundness("a", nil)
    end
  end

  context "when an unrounded vowel is passed" do
    let(:vowel) { random_letter(described_class::UNROUNDED_VOWELS) }

    context "and candidate is an unrounded vowel too" do
      let(:candidate) { random_letter(described_class::UNROUNDED_VOWELS) }

      it "has roundness" do
        expect(described_class).to have_roundness(vowel, candidate)
      end
    end

    context "and candidate is not an unrounded vowel" do
      let(:candidate) { random_letter(described_class::ROUNDED_VOWELS) }

      it "does not have roundness" do
        expect(described_class).not_to have_roundness(vowel, candidate)
      end
    end
  end

  context "when a rounded vowel is passed" do
    let(:vowel) { random_letter(described_class::ROUNDED_VOWELS) }

    context "and one of 'a', 'e', 'u' or 'ü' is a candidate" do
      let(:candidate) do
        random_letter(described_class::FOLLOWING_ROUNDED_VOWELS)
      end

      it "has roundness" do
        expect(described_class).to have_roundness(vowel, candidate)
      end
    end

    context "and candidate is 'o'" do
      let(:candidate) { 'o' }

      it "does not have roundness" do
        expect(described_class).not_to have_roundness(vowel, candidate)
      end
    end
  end
end
|
74
|
+
|
75
|
+
describe ".has_frontness?" do
  # Returns one randomly chosen letter from the given string of letters.
  def random_letter(letters)
    letters.chars.to_a.sample
  end

  # A missing vowel or candidate is expected to count as front-compatible.
  context "when vowel is empty" do
    it "has frontness" do
      expect(described_class).to have_frontness(nil, "a")
    end
  end

  context "when candidate is empty" do
    it "has frontness" do
      expect(described_class).to have_frontness("a", nil)
    end
  end

  context "when a front vowel is passed" do
    let(:vowel) { random_letter(described_class::FRONT_VOWELS) }

    context "and candidate is a front vowel" do
      let(:candidate) { random_letter(described_class::FRONT_VOWELS) }

      it "has frontness" do
        expect(described_class).to have_frontness(vowel, candidate)
      end
    end

    context "and candidate is a back vowel" do
      let(:candidate) { random_letter(described_class::BACK_VOWELS) }

      it "does not have frontness" do
        expect(described_class).not_to have_frontness(vowel, candidate)
      end
    end
  end

  context "when a back vowel is passed" do
    let(:vowel) { random_letter(described_class::BACK_VOWELS) }

    context "and candidate is a front vowel" do
      let(:candidate) { random_letter(described_class::FRONT_VOWELS) }

      it "does not have frontness" do
        expect(described_class).not_to have_frontness(vowel, candidate)
      end
    end

    context "and candidate is a back vowel" do
      let(:candidate) { random_letter(described_class::BACK_VOWELS) }

      it "has frontness" do
        expect(described_class).to have_frontness(vowel, candidate)
      end
    end
  end
end
|
128
|
+
|
129
|
+
describe ".has_vowel_harmony?" do
  it "has vowel harmony for valid Turkish words" do
    %w(Türkiyedir kapıdır gündür paltodur).each do |word|
      expect(described_class).to have_vowel_harmony(word)
    end
  end

  it "does not have vowel harmony for loanwords" do
    loanword = "kürdan"

    expect(described_class).not_to have_vowel_harmony(loanword)
  end

  it "does not have vowel harmony for exceptions" do
    %w(anne kardeş).each do |word|
      expect(described_class).not_to have_vowel_harmony(word)
    end
  end
end
|
146
|
+
|
147
|
+
describe ".affix_morphological_stripper" do
  # Small state machine and suffix table loaded from the spec fixtures.
  # Both are lazy, so the contexts that pass their own arguments (or the
  # real NOMINAL_VERB_* constants) never load them.
  let(:states) do
    described_class.load_states_or_suffixes("spec/fixtures/simple_state.yml")
  end

  let(:suffixes) do
    described_class.load_states_or_suffixes("spec/fixtures/simple_suffix.yml")
  end

  context "when states are empty" do
    it "returns the word" do
      expect(
        described_class.
          affix_morphological_stripper("kapıdır", suffixes: :test)).
        to eq(["kapıdır"])
    end
  end

  context "when suffixes are empty" do
    it "returns the word" do
      expect(
        described_class.
          affix_morphological_stripper("kapıdır", states: :test)).
        to eq(["kapıdır"])
    end
  end

  context "when there exist states and suffixes" do
    it "generates pendings for the initial state" do
      described_class.should_receive(:generate_pendings).with(:a,
        "word", states, suffixes).and_call_original

      described_class.affix_morphological_stripper("word",
        states: states, suffixes: suffixes)
    end
  end

  context "when a transition is valid" do
    context "and the transit state is a final state" do
      it "removes similar pending transitions" do
        described_class.should_receive(:mark_stem).with(
          "guzelim", suffixes[:s1]).and_call_original

        described_class.affix_morphological_stripper(
          "guzelim", states: states, suffixes: suffixes)
      end

      context "with no other transitions" do
        it "stems the word" do
          expect(
            described_class.
              affix_morphological_stripper("guzelim",
                states: states, suffixes: suffixes)).
            to eq ["guzel"]
        end
      end

      context "with other transitions" do
        # Swap in the second state fixture for this context only.
        let(:states) {
          described_class.load_states_or_suffixes("spec/fixtures/simple_state_02.yml")
        }

        it "adds more pendings to check" do
          described_class.should_receive(:mark_stem).with("guzelim",
            suffixes[:s1]).and_call_original

          described_class.affix_morphological_stripper("guzelim",
            states: states, suffixes: suffixes)
        end
      end
    end
  end

  context "when one suffix matches correctly with a given word" do
    it "does not compare other suffixes in the same transition" do
      # only for suffixes [sUnUz, nUz]
      described_class.
        should_receive(:mark_stem).
        with(anything, anything).
        exactly(2).times.
        and_call_original

      # The return value is irrelevant here; only the number of mark_stem
      # calls is checked, so nothing is printed to the test output.
      described_class.
        affix_morphological_stripper("taksicisiniz",
          states: described_class::NOMINAL_VERB_STATES,
          suffixes: described_class::NOMINAL_VERB_SUFFIXES)
    end
  end
end
|
248
|
+
|
249
|
+
describe ".stem" do
  # Words with at most one syllable are expected to come back unchanged.
  context "when input is single syllable" do
    it "returns the input as is" do
      stemmed = described_class.stem("ev")

      expect(stemmed).to eq "ev"
    end
  end

  context "when input has zero syllables - one consonant" do
    it "returns the input as is" do
      stemmed = described_class.stem("p")

      expect(stemmed).to eq "p"
    end
  end
end
|
262
|
+
|
263
|
+
describe ".last_consonant!" do
  context "when last consonant is among 'b', 'c', 'd' or 'ğ'" do
    it "is replaced by 'p', 'ç', 't' or 'k'" do
      # Array pairs (not hash keys) keep every input an ordinary, unfrozen
      # string, exactly like the literals passed one by one.
      [
        ['kebab', 'kebap'],
        ['kebac', 'kebaç'],
        ['kebad', 'kebat'],
        ['kebağ', 'kebak']
      ].each do |input, expected|
        expect(described_class.last_consonant!(input)).to eq(expected)
      end
    end
  end

  context "when word belongs to protected words" do
    it "does not replace last consonant" do
      result = described_class.last_consonant!('ad')

      expect(result).to eq('ad')
    end
  end
end
|
279
|
+
|
280
|
+
describe ".mark_stem" do
  # The suffix definition is assembled from overridable pieces; nested
  # contexts redefine only the piece they care about.
  let(:regex)           { "dir" }
  let(:optional_letter) { false }
  let(:check_harmony)   { true }

  let(:suffix) do
    {
      name: "-dir",
      regex: regex,
      optional_letter: optional_letter,
      check_harmony: check_harmony
    }
  end

  context "when suffix has harmony check on" do
    let(:regex) { "dan" }

    context "and word does not obey harmony rules" do
      it "does not stem a word that does not obey harmony rules" do
        outcome = described_class.mark_stem("kürdan", suffix)

        expect(outcome).to eq(
          { stem: false, word: "kürdan", suffix_applied: nil })
      end

      context "and word belongs to exceptions" do
        let(:regex) { "ler" }

        it "stems the word" do
          outcome = described_class.mark_stem("saatler", suffix)

          expect(outcome).to eq(
            { stem: true, word: "saat", suffix_applied: "ler" })
        end
      end
    end
  end

  context "when suffix has harmony check off" do
    let(:regex)         { "dan" }
    let(:check_harmony) { false }

    it "stems a word that does not obey harmony rules" do
      outcome = described_class.mark_stem("kürdan", suffix)

      expect(outcome).to eq({ stem: true, word: "kür", suffix_applied: "dan" })
    end
  end

  context "when word matches suffix" do
    it "partially stems a word" do
      outcome = described_class.mark_stem("Türkiyedir", suffix)

      expect(outcome).to eq(
        { stem: true, word: "Türkiye", suffix_applied: "dir" })
    end

    context "when suffix has (y) as optional letter" do
      let(:optional_letter) { "y|y" }
      let(:regex)           { "um" }

      context "and new word has valid last 'y' symbol" do
        it "stems correctly and increases the suffix" do
          outcome = described_class.mark_stem("loyum", suffix)

          expect(outcome).to eq(
            { stem: true, word: "lo", suffix_applied: "yum" })
        end
      end

      context "and new word does not have valid last 'y' symbol" do
        it "does not stem the word" do
          outcome = described_class.mark_stem("lotyum", suffix)

          expect(outcome).to eq(
            { stem: false, word: "lotyum", suffix_applied: nil })
        end
      end
    end
  end
end
|
363
|
+
|
364
|
+
describe ".generate_pendings" do
  let(:states)   { described_class::NOMINAL_VERB_STATES }
  let(:suffixes) { described_class::NOMINAL_VERB_SUFFIXES }
  let(:word)     { "satıyorsunuz" }

  # Calls generate_pendings for the shared word, starting from +state+.
  def pendings_from(state)
    described_class.generate_pendings(state, word, states, suffixes)
  end

  it "raises an error if state does not exist" do
    missing_state = 1

    expect { pendings_from(missing_state) }.
      to raise_error(ArgumentError, "State #{missing_state} does not exist")
  end

  context "when state key does not have transitions" do
    it "returns an empty array" do
      # :f state does not have transitions
      expect(pendings_from(:f)).to eq []
    end
  end

  context "when state key has transitions" do
    it "returns an array of hashes for each transition" do
      first_pending = pendings_from(:a).first

      expect(first_pending.keys).
        to eq [:suffix, :to_state, :from_state, :word, :mark]
    end

    it "sets :from_state key to current key state" do
      first_pending = pendings_from(:a).first

      expect(first_pending[:from_state]).to eq :a
    end
  end
end
|
401
|
+
|
402
|
+
describe ".valid_optional_letter?" do
  # Every example expects a two-element array: a validity flag followed by
  # the matched letter (nil when the word does not end with the candidate).
  context "when last letter of the word is not equal to candidate" do
    it "responds with [true,nil] - indicating that there was no match" do
      expect(
        described_class.valid_optional_letter?("test", "r")).
        to eq([true, nil])
    end
  end

  context "when there is a vowel match" do
    context "and the previous char is a vowel" do
      it "responds with false" do
        expect(
          described_class.
            valid_optional_letter?("takcicii", "i")).
          to eq([false, "i"])
      end
    end

    context "and the previous char is a consonant" do
      it "responds with true" do
        expect(
          described_class.
            valid_optional_letter?("okula", "a")).
          to eq([true, "a"])
      end
    end
  end

  context "when there is a consonant match" do
    context "and the previous char is a vowel" do
      it "responds with true" do
        expect(
          described_class.
            valid_optional_letter?("litiy", "y")).
          to eq([true, "y"])
      end
    end

    context "and the previous char is a consonant" do
      # The description used to say "true" although the flag asserted
      # below is false.
      it "responds with false" do
        expect(
          described_class.
            valid_optional_letter?("lity", "y")).
          to eq([false, "y"])
      end
    end
  end
end
|
451
|
+
|
452
|
+
describe ".stem_post_process" do
  # Shorthand for the call under test: a list of candidate stems plus a
  # second word argument.
  def post_process(candidates, word)
    described_class.stem_post_process(candidates, word)
  end

  context "when input stream has words with last consonant replacements" do
    it "replaces last consonant" do
      expect(post_process(["kebab"], "word")).to eq("kebap")
    end
  end

  it "flattens and uniq results" do
    nested = ["kitap", ["kitap"]]

    expect(post_process(nested, "word")).to eq("kitap")
  end

  it "removes no syllables words" do
    expect(post_process(["kitap", "k"], "word")).to eq("kitap")
  end

  context "when multiple stem candidates exist" do
    it "returns the shortest" do
      pending("fix this")
      expect(post_process(["kitap", "kita", "kit"], "word")).to eq "kit"
    end

    context "and word belongs to selection list" do
      it "returns this word" do
        candidates = ["su", "suy", "suyu"]

        expect(post_process(candidates, "suyu")).to eq "su"
      end
    end
  end
end
|
481
|
+
|
482
|
+
describe ".proceed_to_stem?" do
  # Every example asserts a falsy answer via the bare `be` matcher.
  context "when word has 1 or less syllables" do
    it "returns false" do
      answer = described_class.proceed_to_stem?("kit")

      expect(answer).not_to be
    end
  end

  context "when word is nil" do
    it "returns false" do
      answer = described_class.proceed_to_stem?(nil)

      expect(answer).not_to be
    end
  end

  context "when word is empty" do
    it "returns false" do
      answer = described_class.proceed_to_stem?("")

      expect(answer).not_to be
    end
  end

  context "when word is among protected words" do
    it "returns false" do
      answer = described_class.proceed_to_stem?("soyad")

      expect(answer).not_to be
    end
  end

  context "when word contains non Turkish letters" do
    it "returns false" do
      ["τελειο", "&aa"].each do |word|
        expect(described_class.proceed_to_stem?(word)).not_to be
      end
    end
  end
end
|
514
|
+
|
515
|
+
context "1:1 testing with paper" do
  # Resolve the fixture relative to this spec file so the examples are
  # generated no matter which directory rspec is started from, and read it
  # explicitly as UTF-8 so the Turkish letters do not depend on the
  # process's default external encoding.
  fixtures_path = File.expand_path("../support/fixtures.csv", __FILE__)

  # One example is generated per CSV row: the first column is the word to
  # stem and the second column is the stem expected for it.
  CSV.read(fixtures_path, encoding: "UTF-8").each do |row|
    it "stems #{row[0]} correctly" do
      expect(described_class.stem(row[0].downcase)).to eq row[1].downcase
    end
  end
end
|
522
|
+
end
|
@@ -0,0 +1,35 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
lib = File.expand_path('../lib', __FILE__)
|
3
|
+
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
|
+
require 'turkish_stemmer/version'
|
5
|
+
|
6
|
+
# Gem metadata, packaged files and dependencies for turkish_stemmer.
Gem::Specification.new do |spec|
  spec.name          = "turkish_stemmer"
  spec.version       = TurkishStemmer::VERSION
  spec.authors       = ["Tasos Stathopoulos", "Giorgos Tsiftsis"]
  spec.email         = ["stathopa@skroutz.gr", "giorgos.tsiftsis@skroutz.gr"]
  spec.summary       = %q{A simple Turkish stemmer}
  spec.description   = %q{A simple Turkish stemmer}
  spec.homepage      = "https://gitlab.skroutz.gr/turkish_stemmer"
  spec.license       = "MIT"

  # Package every file tracked by git; executables and test files are
  # derived from that list by path prefix.
  spec.files         = `git ls-files`.split($/)
  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ["lib"]

  spec.add_dependency "hashie"

  spec.add_development_dependency "bundler", "~> 1.5"
  spec.add_development_dependency "rake"
  spec.add_development_dependency "rspec"
  spec.add_development_dependency "ruby-stemmer"

  # The spec suite requires "pry" unconditionally, so a pry flavour is
  # declared on every engine: pry-byebug on MRI 2.0+ (as before), plain pry
  # everywhere else. Versions are compared as Gem::Version objects because
  # a plain string comparison is lexicographic ("10.0.0" < "2.0.0").
  if RUBY_ENGINE == "ruby" &&
     Gem::Version.new(RUBY_VERSION) >= Gem::Version.new("2.0.0")
    spec.add_development_dependency "pry-byebug"
  else
    spec.add_development_dependency "pry"
  end
end
|