turkish_stemmer 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,206 @@
1
+ protected_words:
2
+ - abiye
3
+ - adın
4
+ - adana
5
+ - akılsız
6
+ - alaska
7
+ - alet
8
+ - ağda
9
+ - ağız
10
+ - alarm
11
+ - altınbaşak
12
+ - altınyıldız
13
+ - anakucağı
14
+ - anasayfa
15
+ - anime
16
+ - antifriz
17
+ - araba
18
+ - ardeşen
19
+ - armanı
20
+ - aroma
21
+ - arma
22
+ - arsız
23
+ - asa
24
+ - askı
25
+ - astra
26
+ - asus
27
+ - atkı
28
+ - ayakkabı
29
+ - aynı
30
+ - ayı
31
+ - banka
32
+ - başka
33
+ - batık
34
+ - bayı
35
+ - belge
36
+ - bellona
37
+ - benten
38
+ - benzin
39
+ - beşinci
40
+ - bilgi
41
+ - bitki
42
+ - boyut
43
+ - branda
44
+ - bütün
45
+ - buzlu
46
+ - çağrı
47
+ - camsız
48
+ - çanta
49
+ - çarşı
50
+ - ceyiz
51
+ - çıkış
52
+ - cımbiz
53
+ - dalga
54
+ - damla
55
+ - derece
56
+ - dişli
57
+ - düğün
58
+ - ege
59
+ - elbise
60
+ - fendi
61
+ - filtre
62
+ - fiyat
63
+ - forma
64
+ - gazete
65
+ - gemi
66
+ - görüntü
67
+ - igne
68
+ - ince
69
+ - internet
70
+ - iyi
71
+ - kayısı
72
+ - kama
73
+ - katı
74
+ - kötü
75
+ - kumanda
76
+ - lamba
77
+ - lazım
78
+ - litre
79
+ - mağaza
80
+ - magaza
81
+ - makara
82
+ - makine
83
+ - malzeme
84
+ - mana
85
+ - marka
86
+ - masa
87
+ - maskara
88
+ - mine
89
+ - mini
90
+ - nine
91
+ - numara
92
+ - odun
93
+ - oyun
94
+ - ölçü
95
+ - örgü
96
+ - öykü
97
+ - özen
98
+ - parça
99
+ - perde
100
+ - pompa
101
+ - pırlanta
102
+ - raket
103
+ - ranza
104
+ - şamdan
105
+ - şapka
106
+ - şifre
107
+ - sunu
108
+ - soyad
109
+ - tabaka
110
+ - takım
111
+ - talımat
112
+ - tarla
113
+ - tasma
114
+ - törpü
115
+ - tozlu
116
+ - tüplü
117
+ - uçurtma
118
+ - üfleme
119
+ - ürün
120
+ - ütü
121
+ - uygun
122
+ - uzatma
123
+ - uzun
124
+ - vana
125
+ - yağlı
126
+ - yapma
127
+ - yardım
128
+ - yasa
129
+ - yıldız
130
+ - zayıflama
131
+ - zemin
132
+
133
+ last_consonant_exceptions:
134
+ - ad
135
+ - at
136
+ - ked
137
+
138
+ vowel_harmony_exceptions:
139
+ - alkoller
140
+ - değerın
141
+ - generali
142
+ - generale
143
+ - projektörlar
144
+ - saatler
145
+ - tabletlar
146
+ - tersyüz
147
+ - yaninda
148
+ - yani
149
+
150
+ selection_list_exceptions:
151
+ - al
152
+ - am
153
+ - aparat
154
+ - ara
155
+ - bilet
156
+ - bisiklet
157
+ - bulut
158
+ - diyet
159
+ - ev
160
+ - es
161
+ - fiyat
162
+ - fırsat
163
+ - general
164
+ - git
165
+ - gıt
166
+ - iç
167
+ - ip
168
+ - internet
169
+ - iyi
170
+ - kağıt
171
+ - kartuş
172
+ - katı
173
+ - kot
174
+ - kötü
175
+ - kumanda
176
+ - lamba
177
+ - mağaza
178
+ - magaza
179
+ - makara
180
+ - makine
181
+ - marka
182
+ - maskara
183
+ - ne
184
+ - otomat
185
+ - palet
186
+ - perde
187
+ - raket
188
+ - ranza
189
+ - robot
190
+ - sepet
191
+ - servis
192
+ - soyad
193
+ - su
194
+ - tabaka
195
+ - tablet
196
+ - takım
197
+ - talımat
198
+ - tanıt
199
+ - tarla
200
+ - tasma
201
+ - tenis
202
+ - törpü
203
+ - uç
204
+ - uygun
205
+ - var
206
+ - yasa
@@ -0,0 +1,5 @@
1
+ require "hashie/extensions/key_conversion"
2
+
3
# Mixes Hashie's key-conversion helpers into every Hash so that hashes loaded
# from YAML respond to `symbolize_keys`.
Hash.send(:include, Hashie::Extensions::SymbolizeKeys)
@@ -0,0 +1,3 @@
1
module TurkishStemmer
  # Version of the gem. Frozen so the constant cannot be mutated in place.
  VERSION = "0.1.0".freeze
end
@@ -0,0 +1,455 @@
1
+ # coding: utf-8
2
+ require "turkish_stemmer/version"
3
+ require "yaml"
4
+ require "hash_extension"
5
+
6
+ # Please note that we use only lowercase letters for all methods. One should
7
+ # normalize input streams before using the `stem` method.
8
+ module TurkishStemmer
9
+ extend self
10
+
11
+ VOWELS = "üiıueöao"
12
+ CONSONANTS = "bcçdfgğhjklmnprsştvyz"
13
+ ROUNDED_VOWELS = "oöuü"
14
+ UNROUNDED_VOWELS = "iıea"
15
+ FOLLOWING_ROUNDED_VOWELS = "aeuü"
16
+ FRONT_VOWELS = "eiöü"
17
+ BACK_VOWELS = "ıuao"
18
+
19
+ # Heuristic size for average Turkish stemmed word size
20
+ AVG_STEMMED_SIZE = 4
21
+
22
+ # Regular expression that checks if the word contains only turkish characters
23
+ ALPHABET = Regexp.new("^[abcçdefgğhıijklmnoöprsştuüvyz]+$").freeze
24
+
25
+ # Stems a Turkish word.
26
+ #
27
+ # Algorithm consists of 3 parts: pre-process, process and post-process. The
28
+ # pre-process phase is a quick lookup for words that should not be stemmed
29
+ # based on length, protected words list and vowel harmony. The process phase
30
+ # includes a nominal verb suffix and a noun suffix stripper machine. The last
31
+ # phase includes some additional checks and a simple stem selection decision.
32
+ #
33
+   # @param original_word [String] the word to stem
34
+ # @return [String] the stemmed word
35
+ def stem(original_word)
36
+ # Preprocess
37
+ return original_word if !proceed_to_stem?(original_word)
38
+
39
+ word = original_word.dup
40
+
41
+ # Process
42
+ stems = []
43
+ stems << nominal_verbs_suffix_machine { word }
44
+ stems << original_word
45
+ stems.flatten!.uniq!
46
+ stems << stems.map { |word| noun_suffix_machine { word }}
47
+ stems << original_word
48
+ stems.flatten!.uniq!
49
+ stems << stems.map { |word| derivational_suffix_machine { word }}
50
+
51
+ # Postprocess
52
+ stem_post_process(stems, original_word)
53
+ end
54
+
55
+ # Loads yaml file and symbolizes keys
56
+ #
57
+ # @param file [String] path to yaml file
58
+ # @return [Hash] the hash with symbols as keys
59
+ def load_states_or_suffixes(file)
60
+ config_path = File.expand_path("../../#{file}", __FILE__)
61
+
62
+ YAML.load_file(config_path).symbolize_keys
63
+ rescue => e
64
+ raise "An error occured loading #{file}, #{e}"
65
+ end
66
+
67
+ # Helper method for loading settings
68
+ #
69
+ # @param key [String] the key
70
+ def load_settings(key)
71
+ config_path = File.expand_path("../../config/stemmer.yml", __FILE__)
72
+
73
+ begin
74
+ YAML.load_file(config_path)[key]
75
+ rescue => e
76
+ raise "Please provide a valid config/stemmer.yml file, #{e}"
77
+ end
78
+ end
79
+
80
+ NOMINAL_VERB_STATES = load_states_or_suffixes("config/nominal_verb_states.yml")
81
+ NOMINAL_VERB_SUFFIXES = load_states_or_suffixes("config/nominal_verb_suffixes.yml")
82
+
83
+ NOUN_STATES = load_states_or_suffixes("config/noun_states.yml")
84
+ NOUN_SUFFIXES = load_states_or_suffixes("config/noun_suffixes.yml")
85
+
86
+ DERIVATIONAL_STATES = load_states_or_suffixes("config/derivational_states.yml")
87
+ DERIVATIONAL_SUFFIXES = load_states_or_suffixes("config/derivational_suffixes.yml")
88
+
89
+ ##
90
+ # Load settings
91
+ #
92
+ # Protected words
93
+ PROTECTED_WORDS = load_settings("protected_words")
94
+
95
+ # Last consonant exceptions
96
+ LAST_CONSONANT_EXCEPTIONS = load_settings("last_consonant_exceptions")
97
+
98
+   # Vowel harmony exceptions
99
+ VOWEL_HARMONY_EXCEPTIONS = load_settings("vowel_harmony_exceptions")
100
+
101
+ # Selection list exceptions
102
+ SELECTION_LIST_EXCEPTIONS = load_settings("selection_list_exceptions")
103
+
104
+ # Counts syllables of a Turkish word. In Turkish the number of syllables is
105
+   # equal to the number of vowels.
106
+ #
107
+ # @param word [String] the word to count its syllables
108
+   # @return [Integer] the number of syllables
109
+ def count_syllables(word)
110
+ vowels(word).size
111
+ end
112
+
113
+ # Gets the vowels of a word
114
+ #
115
+ # @param word [String] the word to get its vowels
116
+ # @return [Array] array of vowels
117
+ def vowels(word)
118
+ word.gsub(/#{CONSONANTS.chars.to_a.join('|')}/,"").chars.to_a
119
+ end
120
+
121
+ # Checks vowel harmony of a word according to Turkish vowel harmony.
122
+ #
123
+ # @param word [String] the word to be checked against Turkish vowel harmony
124
+ # @return [Boolean]
125
+ # @see https://en.wikipedia.org/wiki/Vowel_harmony#Turkish
126
+ def has_vowel_harmony?(word)
127
+ word_vowels = vowels(word)
128
+ vowel = word_vowels[-2]
129
+ candidate = word_vowels[-1]
130
+
131
+ vowel_harmony?(vowel, candidate)
132
+ end
133
+
134
+ # Checks vowel harmony between two vowels
135
+ #
136
+ # @param vowel [String] the first vowel
137
+ # @param candidate [String] the second vowel
138
+ # @return [Boolean]
139
+ # @see https://en.wikipedia.org/wiki/Vowel_harmony#Turkish
140
+ def vowel_harmony?(vowel, candidate)
141
+ has_roundness?(vowel, candidate) && has_frontness?(vowel, candidate)
142
+ end
143
+
144
+ # Checks roundness vowel harmony of two vowels according to Turkish vowel
145
+ # harmony.
146
+ #
147
+ # @param vowel [String] the first vowel
148
+ # @param candidate [String] the second vowel
149
+ # @return [Boolean]
150
+ # @see https://en.wikipedia.org/wiki/Vowel_harmony#Turkish
151
+ def has_roundness?(vowel, candidate)
152
+ return true if vowel.nil? || vowel.empty?
153
+ return true if candidate.nil? || candidate.empty?
154
+
155
+ if (UNROUNDED_VOWELS.include?(vowel) && UNROUNDED_VOWELS.include?(candidate)) ||
156
+ (ROUNDED_VOWELS.include?(vowel) && FOLLOWING_ROUNDED_VOWELS.include?(candidate))
157
+ return true
158
+ end
159
+
160
+ false
161
+ end
162
+
163
+ # Checks frontness vowel harmony of two vowels according to Turkish vowel
164
+ # harmony.
165
+ #
166
+ # @param vowel [String] the first vowel
167
+ # @param candidate [String] the second vowel
168
+ # @return [Boolean]
169
+ # @see https://en.wikipedia.org/wiki/Vowel_harmony#Turkish
170
+ def has_frontness?(vowel, candidate)
171
+ return true if vowel.nil? || vowel.empty?
172
+ return true if candidate.nil? || candidate.empty?
173
+
174
+ if (FRONT_VOWELS.include?(vowel) && FRONT_VOWELS.include?(candidate)) ||
175
+ (BACK_VOWELS.include?(vowel) && BACK_VOWELS.include?(candidate))
176
+ return true
177
+ end
178
+
179
+ false
180
+ end
181
+
182
+   # Checks whether a word can be stemmed or not. This method rejects words that
183
+   # are nil, not written in the Turkish alphabet, protected or of at most one syllable.
184
+ #
185
+ # @param word [String] the candidate word for stemming
186
+ # @return [Boolean] whether should proceed to stem or not
187
+ def proceed_to_stem?(word)
188
+ if word.nil? || !turkish?(word) ||
189
+ PROTECTED_WORDS.include?(word) ||
190
+ count_syllables(word) <= 1
191
+
192
+ return false
193
+ end
194
+
195
+ true
196
+ end
197
+
198
+ # Post stemming process
199
+ #
200
+ # @param stems [Array] array of candidate stems
201
+ # @param original_word [String] the original word
202
+ # @return [String] the stemmed or the original word
203
+ def stem_post_process(stems, original_word)
204
+ if ENV['DEBUG']
205
+ puts "post process for #{original_word}: #{stems}"
206
+ end
207
+
208
+ stems = stems.flatten.uniq
209
+
210
+ # Reject original word
211
+ stems.reject! { |w| w == original_word }
212
+
213
+ # Reject all non-syllable words
214
+ stems.reject! { |w| count_syllables(w) == 0 }
215
+
216
+ # Transform last consonant
217
+ stems.map! { |word| last_consonant!(word) }
218
+
219
+ # Sort stems by size
220
+ stems.sort! do |x,y|
221
+ if (x.size - AVG_STEMMED_SIZE).abs == (y.size - AVG_STEMMED_SIZE).abs
222
+ x.size <=> y.size
223
+ else
224
+ (x.size - AVG_STEMMED_SIZE).abs <=> (y.size - AVG_STEMMED_SIZE).abs
225
+ end
226
+ end
227
+
228
+ # Check selection list exceptions
229
+ if !(exception = (stems & SELECTION_LIST_EXCEPTIONS)).empty?
230
+ return exception.first
231
+ end
232
+
233
+ # Keep first or original word
234
+ stems.empty? ? original_word : stems.first
235
+ end
236
+
237
+ # Given a state key and a word, scans through given states and generate valid
238
+ # pending transitions.
239
+ #
240
+ # @param key [String] the key for states hash
241
+ # @param word [String] the word to check
242
+ # @param states [Hash] the states hash
243
+ # @param suffixes [Hash] the suffixes hash
244
+ # @param options [Hash] options for pendings
245
+ # @option options [Boolean] :mark Whether this pending is marked for deletion
246
+ # @return [Array] array of pendings
247
+ def generate_pendings(key, word, states, suffixes, options = {})
248
+ raise ArgumentError, "State #{key} does not exist" if (state = states[key]).nil?
249
+ mark = options[:mark] || false
250
+
251
+ matched_transitions = state[:transitions].select do |transition|
252
+ word.match(/(#{suffixes[transition[:suffix]][:regex]})$/)
253
+ end
254
+
255
+ matched_transitions.map do |transition|
256
+ {
257
+ suffix: transition[:suffix],
258
+ to_state: transition[:state],
259
+ from_state: key,
260
+ word: word,
261
+ mark: mark
262
+ }
263
+ end
264
+ end
265
+
266
+ # Given a suffix it stems a word according to Turkish orthographic rules
267
+ #
268
+ # @param word [String] the word to stem
269
+ # @param suffix [Hash] a suffix record
270
+ # @return [Hash] a stem answer record
271
+ def mark_stem(word, suffix)
272
+ stem = !PROTECTED_WORDS.include?(word) &&
273
+ (suffix[:check_harmony] &&
274
+ (has_vowel_harmony?(word) || VOWEL_HARMONY_EXCEPTIONS.include?(word))) ||
275
+ !suffix[:check_harmony]
276
+
277
+ suffix_applied = suffix[:regex]
278
+
279
+ if stem && (match = word.match(/(#{suffix_applied})$/))
280
+ new_word = word.gsub(/(#{match.to_s})$/, '')
281
+ suffix_applied = match.to_s
282
+
283
+ if suffix[:optional_letter]
284
+ answer, match = valid_optional_letter?(new_word, suffix[:optional_letter])
285
+
286
+ if answer && match
287
+ new_word = new_word.chop
288
+ suffix_applied = match + suffix_applied
289
+ elsif !answer
290
+ new_word = word
291
+ suffix_applied = nil
292
+ stem = false
293
+ end
294
+ end
295
+ else
296
+ stem = false
297
+ suffix_applied = nil
298
+ new_word = word
299
+ end
300
+
301
+ { stem: stem, word: new_word, suffix_applied: suffix_applied }
302
+ end
303
+
304
+ # Given a word and a letter it checks if the optional letter can be part of
305
+ # the stem or not.
306
+ #
307
+ # @param word [String] the examined word
308
+ # @param letter [String] a single letter or a string armed with a regular
309
+ # expression
310
+ # @return [Array] the answer is returned as an array. First element is a
311
+   #   Boolean value and second element is the matched character.
312
+ # @example
313
+   #   self.valid_optional_letter?("elmay", "y")
314
+   #   # => [true, 'y']
315
+ def valid_optional_letter?(word, letter)
316
+ match = word.match(/(#{letter})$/)
317
+ answer = true
318
+ matched_char = nil
319
+
320
+ if match
321
+ matched_char = match.to_s
322
+ previous_char = word[-2]
323
+
324
+ answer = if VOWELS.include?(matched_char)
325
+ (previous_char && CONSONANTS.include?(previous_char))
326
+ else
327
+ (previous_char && VOWELS.include?(previous_char))
328
+ end
329
+ end
330
+
331
+ [answer, matched_char]
332
+ end
333
+
334
+   # Transforms a word taking into account the last consonant rule.
335
+ #
336
+ # @param word [String] the word to check for last consonant change
337
+ # @return [String] the changed word
338
+ def last_consonant!(word)
339
+ return word if LAST_CONSONANT_EXCEPTIONS.include?(word)
340
+
341
+ consonants = { 'b' => 'p', 'c' => 'ç', 'd' => 't', 'ğ' => 'k' }
342
+ last_char = word[-1]
343
+
344
+ if consonants.keys.include?(last_char)
345
+ word[-1] = consonants[last_char]
346
+ end
347
+
348
+ word
349
+ end
350
+
351
+ # Helper method. This is just a shortcut.
352
+ def nominal_verbs_suffix_machine
353
+ affix_morphological_stripper(yield, states: self::NOMINAL_VERB_STATES,
354
+ suffixes: self::NOMINAL_VERB_SUFFIXES)
355
+ end
356
+
357
+ # Helper method. This is just a shortcut.
358
+ def noun_suffix_machine
359
+ affix_morphological_stripper(yield, states: self::NOUN_STATES,
360
+ suffixes: self::NOUN_SUFFIXES)
361
+ end
362
+
363
+ # Helper method
364
+ def derivational_suffix_machine
365
+ affix_morphological_stripper(yield, states: self::DERIVATIONAL_STATES,
366
+ suffixes: self::DERIVATIONAL_SUFFIXES)
367
+ end
368
+
369
+ # A simple algorithm to strip suffixes from a word based on states and
370
+ # transitions.
371
+ #
372
+ # @param word [String] the word to strip affixes from
373
+ # @param options [Hash] options for the algorithm
374
+ # @option options [Hash] :states The states and valid transitions
375
+ # @option options [Hash] :suffixes The suffixes with their rules
376
+ # @return [Array] all possible stem versions
377
+ def affix_morphological_stripper(word, options = {})
378
+ states = options[:states] || {}
379
+ suffixes = options[:suffixes] || {}
380
+
381
+ return [word] if states.nil? || states.empty?
382
+ return [word] if suffixes.nil? || suffixes.empty?
383
+
384
+ stems = []
385
+ # Init first state pending transitions
386
+ pendings = generate_pendings(:a, word, states, suffixes)
387
+
388
+ while !pendings.empty? do
389
+ transition = pendings.shift
390
+ word = transition[:word]
391
+ suffix = suffixes[transition[:suffix]]
392
+ to_state = states[transition[:to_state]]
393
+ answer = mark_stem(word, suffix)
394
+
395
+ if answer[:stem] == true
396
+ if ENV['DEBUG']
397
+ puts "Word: #{word} \nAnswer: #{answer} \nInfo: #{transition} \nSuffix: #{suffix}"
398
+ end
399
+
400
+ if to_state[:final_state] == true
401
+ # We have a valid transition here. It is safe to remove any pendings
402
+ # with the same signature current pending
403
+ remove_pendings_like!(transition, pendings)
404
+ remove_mark_pendings!(pendings)
405
+
406
+ stems.push answer[:word]
407
+
408
+ unless to_state[:transitions].empty?
409
+ pendings.unshift(*generate_pendings(transition[:to_state], answer[:word], states, suffixes))
410
+ end
411
+
412
+ else
413
+ mark_pendings!(transition, pendings)
414
+ pendings.unshift(*generate_pendings(transition[:to_state], answer[:word],
415
+ states, suffixes, mark: true))
416
+ end
417
+ end
418
+ end
419
+
420
+ return [word] if pendings.empty? && stems.empty?
421
+
422
+ stems.uniq
423
+ end
424
+
425
+ private
426
+
427
+ def remove_pendings_like!(pending, array)
428
+ array.reject! do |candidate|
429
+ candidate[:to_state] == pending[:to_state] &&
430
+ candidate[:from_state] == pending[:from_state]
431
+ end
432
+ end
433
+
434
+ def mark_pendings!(pending, array)
435
+ similar_pendings(pending, array).each do |candidate|
436
+ candidate[:mark] = true
437
+ end
438
+ end
439
+
440
+ def remove_mark_pendings!(array)
441
+ array.reject! { |candidate| candidate[:mark] == true }
442
+ end
443
+
444
+ def similar_pendings(pending, array)
445
+ array.select do |candidate|
446
+ candidate[:to_state] == pending[:to_state] &&
447
+ candidate[:from_state] == pending[:from_state]
448
+ end
449
+ end
450
+
451
+ def turkish?(word)
452
+ !! word.match(ALPHABET)
453
+ end
454
+
455
+ end
@@ -0,0 +1,14 @@
1
+ a:
2
+ transitions:
3
+ - suffix: :s1
4
+ state: :b
5
+
6
+ - suffix: :s2
7
+ state: :b
8
+
9
+ final_state: true
10
+
11
+ b:
12
+ transitions: []
13
+
14
+ final_state: true
@@ -0,0 +1,21 @@
1
+ a:
2
+ transitions:
3
+ - suffix: :s1
4
+ state: :b
5
+
6
+ - suffix: :s2
7
+ state: :b
8
+
9
+ final_state: true
10
+
11
+ b:
12
+ transitions:
13
+ - suffix: :s1
14
+ state: :c
15
+
16
+ final_state: true
17
+
18
+ c:
19
+ transitions: []
20
+
21
+ final_state: true
@@ -0,0 +1,7 @@
1
+ s1:
2
+ name: "test"
3
+ regex: "im"
4
+
5
+ s2:
6
+ name: "another"
7
+ regex: "siniz"
@@ -0,0 +1,7 @@
1
+ s1:
2
+ name: "TEST"
3
+ regex: "test"
4
+
5
+ s2:
6
+ name: "another"
7
+ regex: "another"