turkish_stemmer 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,206 @@
1
+ protected_words:
2
+ - abiye
3
+ - adın
4
+ - adana
5
+ - akılsız
6
+ - alaska
7
+ - alet
8
+ - ağda
9
+ - ağız
10
+ - alarm
11
+ - altınbaşak
12
+ - altınyıldız
13
+ - anakucağı
14
+ - anasayfa
15
+ - anime
16
+ - antifriz
17
+ - araba
18
+ - ardeşen
19
+ - armanı
20
+ - aroma
21
+ - arma
22
+ - arsız
23
+ - asa
24
+ - askı
25
+ - astra
26
+ - asus
27
+ - atkı
28
+ - ayakkabı
29
+ - aynı
30
+ - ayı
31
+ - banka
32
+ - başka
33
+ - batık
34
+ - bayı
35
+ - belge
36
+ - bellona
37
+ - benten
38
+ - benzin
39
+ - beşinci
40
+ - bilgi
41
+ - bitki
42
+ - boyut
43
+ - branda
44
+ - bütün
45
+ - buzlu
46
+ - çağrı
47
+ - camsız
48
+ - çanta
49
+ - çarşı
50
+ - ceyiz
51
+ - çıkış
52
+ - cımbiz
53
+ - dalga
54
+ - damla
55
+ - derece
56
+ - dişli
57
+ - düğün
58
+ - ege
59
+ - elbise
60
+ - fendi
61
+ - filtre
62
+ - fiyat
63
+ - forma
64
+ - gazete
65
+ - gemi
66
+ - görüntü
67
+ - igne
68
+ - ince
69
+ - internet
70
+ - iyi
71
+ - kayısı
72
+ - kama
73
+ - katı
74
+ - kötü
75
+ - kumanda
76
+ - lamba
77
+ - lazım
78
+ - litre
79
+ - mağaza
80
+ - magaza
81
+ - makara
82
+ - makine
83
+ - malzeme
84
+ - mana
85
+ - marka
86
+ - masa
87
+ - maskara
88
+ - mine
89
+ - mini
90
+ - nine
91
+ - numara
92
+ - odun
93
+ - oyun
94
+ - ölçü
95
+ - örgü
96
+ - öykü
97
+ - özen
98
+ - parça
99
+ - perde
100
+ - pompa
101
+ - pırlanta
102
+ - raket
103
+ - ranza
104
+ - şamdan
105
+ - şapka
106
+ - şifre
107
+ - sunu
108
+ - soyad
109
+ - tabaka
110
+ - takım
111
+ - talımat
112
+ - tarla
113
+ - tasma
114
+ - törpü
115
+ - tozlu
116
+ - tüplü
117
+ - uçurtma
118
+ - üfleme
119
+ - ürün
120
+ - ütü
121
+ - uygun
122
+ - uzatma
123
+ - uzun
124
+ - vana
125
+ - yağlı
126
+ - yapma
127
+ - yardım
128
+ - yasa
129
+ - yıldız
130
+ - zayıflama
131
+ - zemin
132
+
133
+ last_consonant_exceptions:
134
+ - ad
135
+ - at
136
+ - ked
137
+
138
+ vowel_harmony_exceptions:
139
+ - alkoller
140
+ - değerın
141
+ - generali
142
+ - generale
143
+ - projektörlar
144
+ - saatler
145
+ - tabletlar
146
+ - tersyüz
147
+ - yaninda
148
+ - yani
149
+
150
+ selection_list_exceptions:
151
+ - al
152
+ - am
153
+ - aparat
154
+ - ara
155
+ - bilet
156
+ - bisiklet
157
+ - bulut
158
+ - diyet
159
+ - ev
160
+ - es
161
+ - fiyat
162
+ - fırsat
163
+ - general
164
+ - git
165
+ - gıt
166
+ - iç
167
+ - ip
168
+ - internet
169
+ - iyi
170
+ - kağıt
171
+ - kartuş
172
+ - katı
173
+ - kot
174
+ - kötü
175
+ - kumanda
176
+ - lamba
177
+ - mağaza
178
+ - magaza
179
+ - makara
180
+ - makine
181
+ - marka
182
+ - maskara
183
+ - ne
184
+ - otomat
185
+ - palet
186
+ - perde
187
+ - raket
188
+ - ranza
189
+ - robot
190
+ - sepet
191
+ - servis
192
+ - soyad
193
+ - su
194
+ - tabaka
195
+ - tablet
196
+ - takım
197
+ - talımat
198
+ - tanıt
199
+ - tarla
200
+ - tasma
201
+ - tenis
202
+ - törpü
203
+ - uç
204
+ - uygun
205
+ - var
206
+ - yasa
@@ -0,0 +1,5 @@
1
+ require "hashie/extensions/key_conversion"
2
+
3
# Mixes Hashie's key-symbolizing extension into the core Hash class, which is
# what lets the stemmer call `symbolize_keys` on hashes loaded from YAML.
Hash.send(:include, Hashie::Extensions::SymbolizeKeys)
@@ -0,0 +1,3 @@
1
module TurkishStemmer
  # Gem version. Frozen so the shared string cannot be mutated at runtime.
  VERSION = "0.1.0".freeze
end
@@ -0,0 +1,455 @@
1
+ # coding: utf-8
2
+ require "turkish_stemmer/version"
3
+ require "yaml"
4
+ require "hash_extension"
5
+
6
+ # Please note that we use only lowercase letters for all methods. One should
7
+ # normalize input streams before using the `stem` method.
8
+ module TurkishStemmer
9
+ extend self
10
+
11
# Letter classes used by the syllable and vowel harmony helpers. They are
# frozen because they are shared, read-only lookup strings.
VOWELS = "üiıueöao".freeze
CONSONANTS = "bcçdfgğhjklmnprsştvyz".freeze
ROUNDED_VOWELS = "oöuü".freeze
UNROUNDED_VOWELS = "iıea".freeze
# Vowels accepted after a rounded vowel by the roundness harmony check.
FOLLOWING_ROUNDED_VOWELS = "aeuü".freeze
FRONT_VOWELS = "eiöü".freeze
BACK_VOWELS = "ıuao".freeze

# Heuristic size for average Turkish stemmed word size
AVG_STEMMED_SIZE = 4

# Regular expression that checks if the word contains only lowercase Turkish
# characters. It is anchored with \A and \z (whole string) rather than ^ and $
# (per line), so input such as "kitap\n123" is not accepted just because one
# of its lines is a valid word.
ALPHABET = /\A[abcçdefgğhıijklmnoöprsştuüvyz]+\z/.freeze
24
+
25
# Stems a Turkish word.
#
# The algorithm has three phases:
#
# 1. Pre-process: words rejected by `proceed_to_stem?` are returned untouched.
# 2. Process: the word goes through the nominal verb suffix machine; every
#    candidate produced so far (plus the original word) then goes through the
#    noun suffix machine, and every candidate after that through the
#    derivational suffix machine.
# 3. Post-process: `stem_post_process` picks the final stem among all the
#    collected candidates.
#
# @param original_word [String] the word to stem
# @return [String] the stemmed word, or the original word when it is not
#   stemmed
def stem(original_word)
  # Preprocess
  return original_word unless proceed_to_stem?(original_word)

  # Work on a copy so the caller's string is never handed to the machines.
  word = original_word.dup

  # Process. Non-destructive flatten/uniq are used on purpose: the bang
  # variants return nil when they change nothing, which makes chaining them
  # unsafe.
  stems = [nominal_verbs_suffix_machine { word }, original_word].flatten.uniq

  noun_stems = stems.map { |candidate| noun_suffix_machine { candidate } }
  stems = [stems, noun_stems, original_word].flatten.uniq

  derivational_stems = stems.map { |candidate| derivational_suffix_machine { candidate } }
  stems += [derivational_stems]

  # Postprocess
  stem_post_process(stems, original_word)
end
54
+
55
# Reads a YAML file located two directories above this file and returns its
# content with symbolized keys (via `Hash#symbolize_keys`).
#
# @param file [String] path to the yaml file, relative to that directory
# @return [Hash] the hash with symbols as keys
# @raise [RuntimeError] when the file cannot be loaded or converted
def load_states_or_suffixes(file)
  begin
    absolute_path = File.expand_path("../../#{file}", __FILE__)
    content = YAML.load_file(absolute_path)
    content.symbolize_keys
  rescue => error
    raise "An error occured loading #{file}, #{error}"
  end
end
66
+
67
+ # Helper method for loading settings
68
+ #
69
+ # @param key [String] the key
70
+ def load_settings(key)
71
+ config_path = File.expand_path("../../config/stemmer.yml", __FILE__)
72
+
73
+ begin
74
+ YAML.load_file(config_path)[key]
75
+ rescue => e
76
+ raise "Please provide a valid config/stemmer.yml file, #{e}"
77
+ end
78
+ end
79
+
80
# State machines and suffix tables. They are loaded once, when the module is
# defined, and frozen (shallowly) so the shared configuration cannot be
# altered by accident at runtime.
NOMINAL_VERB_STATES   = load_states_or_suffixes("config/nominal_verb_states.yml").freeze
NOMINAL_VERB_SUFFIXES = load_states_or_suffixes("config/nominal_verb_suffixes.yml").freeze

NOUN_STATES   = load_states_or_suffixes("config/noun_states.yml").freeze
NOUN_SUFFIXES = load_states_or_suffixes("config/noun_suffixes.yml").freeze

DERIVATIONAL_STATES   = load_states_or_suffixes("config/derivational_states.yml").freeze
DERIVATIONAL_SUFFIXES = load_states_or_suffixes("config/derivational_suffixes.yml").freeze

##
# Settings read from config/stemmer.yml
#
# Protected words
PROTECTED_WORDS = load_settings("protected_words").freeze

# Last consonant exceptions
LAST_CONSONANT_EXCEPTIONS = load_settings("last_consonant_exceptions").freeze

# Vowel harmony exceptions
VOWEL_HARMONY_EXCEPTIONS = load_settings("vowel_harmony_exceptions").freeze

# Selection list exceptions
SELECTION_LIST_EXCEPTIONS = load_settings("selection_list_exceptions").freeze
103
+
104
# Counts the syllables of a Turkish word. In Turkish the number of syllables
# equals the number of vowels, so this is the size of `vowels(word)`.
#
# @param word [String] the word to count its syllables
# @return [Integer] the number of syllables
def count_syllables(word)
  word_vowels = vowels(word)
  word_vowels.length
end
112
+
113
# Gets the vowels of a word by dropping every character listed in CONSONANTS.
# Characters that are neither consonants nor vowels are kept as they are.
#
# @param word [String] the word to get its vowels
# @return [Array<String>] the remaining characters, in their original order
def vowels(word)
  without_consonants = word.delete(CONSONANTS)
  without_consonants.each_char.to_a
end
120
+
121
# Checks vowel harmony of a word according to Turkish vowel harmony. Only the
# last two vowels of the word are compared; a missing vowel is passed on as
# nil.
#
# @param word [String] the word to be checked against Turkish vowel harmony
# @return [Boolean]
# @see https://en.wikipedia.org/wiki/Vowel_harmony#Turkish
def has_vowel_harmony?(word)
  vowel, candidate = vowels(word).values_at(-2, -1)

  vowel_harmony?(vowel, candidate)
end
133
+
134
# Checks vowel harmony between two vowels: the pair must satisfy both the
# roundness and the frontness rule.
#
# @param vowel [String] the first vowel
# @param candidate [String] the second vowel
# @return [Boolean]
# @see https://en.wikipedia.org/wiki/Vowel_harmony#Turkish
def vowel_harmony?(vowel, candidate)
  # Frontness is only evaluated once roundness holds.
  return has_frontness?(vowel, candidate) if has_roundness?(vowel, candidate)

  false
end
143
+
144
# Checks roundness vowel harmony of two vowels according to Turkish vowel
# harmony. The pair passes when both vowels are unrounded, or when a rounded
# vowel is followed by one of FOLLOWING_ROUNDED_VOWELS. A nil or empty vowel
# on either side always passes.
#
# @param vowel [String] the first vowel
# @param candidate [String] the second vowel
# @return [Boolean]
# @see https://en.wikipedia.org/wiki/Vowel_harmony#Turkish
def has_roundness?(vowel, candidate)
  return true if [vowel, candidate].any? { |letter| letter.nil? || letter.empty? }

  # Maps the class of the first vowel to the vowels allowed after it.
  allowed_followers = {
    UNROUNDED_VOWELS => UNROUNDED_VOWELS,
    ROUNDED_VOWELS   => FOLLOWING_ROUNDED_VOWELS
  }

  allowed_followers.any? do |leading, following|
    leading.include?(vowel) && following.include?(candidate)
  end
end
162
+
163
# Checks frontness vowel harmony of two vowels according to Turkish vowel
# harmony. The pair passes when both vowels are front vowels or both are back
# vowels. A nil or empty vowel on either side always passes.
#
# @param vowel [String] the first vowel
# @param candidate [String] the second vowel
# @return [Boolean]
# @see https://en.wikipedia.org/wiki/Vowel_harmony#Turkish
def has_frontness?(vowel, candidate)
  return true if vowel.nil? || vowel.empty? || candidate.nil? || candidate.empty?

  [FRONT_VOWELS, BACK_VOWELS].any? do |group|
    group.include?(vowel) && group.include?(candidate)
  end
end
181
+
182
# Checks whether a word can be stemmed or not. A word is rejected when it is
# nil, is not made of Turkish letters only, is a protected word, or has at
# most one syllable.
#
# @param word [String] the candidate word for stemming
# @return [Boolean] whether should proceed to stem or not
def proceed_to_stem?(word)
  return false if word.nil?
  return false unless turkish?(word)
  return false if PROTECTED_WORDS.include?(word)

  count_syllables(word) > 1
end
197
+
198
# Post stemming process: selects the final stem among the candidates.
#
# Candidates equal to the original word or without any syllable are dropped,
# the last consonant rule is applied to the rest, and the survivors are
# ranked by how close their size is to AVG_STEMMED_SIZE (shorter first on a
# tie). A candidate listed in SELECTION_LIST_EXCEPTIONS wins over the ranking.
#
# @param stems [Array] array of candidate stems (may be nested)
# @param original_word [String] the original word
# @return [String] the stemmed or the original word
def stem_post_process(stems, original_word)
  puts "post process for #{original_word}: #{stems}" if ENV['DEBUG']

  candidates = stems.flatten.uniq
  candidates = candidates.reject do |candidate|
    candidate == original_word || count_syllables(candidate) == 0
  end

  # Transform last consonant
  candidates = candidates.map { |candidate| last_consonant!(candidate) }

  # Rank by distance from the average stem size, then by size
  ranked = candidates.sort do |left, right|
    left_key  = [(left.size - AVG_STEMMED_SIZE).abs, left.size]
    right_key = [(right.size - AVG_STEMMED_SIZE).abs, right.size]
    left_key <=> right_key
  end

  # Selection list exceptions take precedence over the ranking
  preferred = ranked & SELECTION_LIST_EXCEPTIONS
  return preferred.first unless preferred.empty?

  # Keep first or original word
  ranked.first || original_word
end
236
+
237
# Given a state key and a word, scans the transitions of that state and
# builds one pending record for every transition whose suffix regex matches
# the end of the word.
#
# @param key [Symbol] the key for states hash
# @param word [String] the word to check
# @param states [Hash] the states hash
# @param suffixes [Hash] the suffixes hash
# @param options [Hash] options for pendings
# @option options [Boolean] :mark Whether this pending is marked for deletion
# @return [Array<Hash>] array of pendings, each with the keys :suffix,
#   :to_state, :from_state, :word and :mark
# @raise [ArgumentError] when `key` is not a known state
def generate_pendings(key, word, states, suffixes, options = {})
  state = states[key]
  raise ArgumentError, "State #{key} does not exist" if state.nil?

  marked = options[:mark] || false

  state[:transitions].each_with_object([]) do |transition, pendings|
    suffix_key = transition[:suffix]
    next unless word.match(/(#{suffixes[suffix_key][:regex]})$/)

    pendings << {
      suffix: suffix_key,
      to_state: transition[:state],
      from_state: key,
      word: word,
      mark: marked
    }
  end
end
265
+
266
# Given a suffix record, tries to strip that suffix from the end of a word.
#
# The suffix is only stripped when the word is allowed to be stemmed (see the
# note in the body) and the suffix regex matches the end of the word. When
# the suffix declares an optional letter, `valid_optional_letter?` decides
# whether that letter is removed too, kept, or whether stemming is refused.
#
# @param word [String] the word to stem
# @param suffix [Hash] a suffix record (:regex, and optionally
#   :check_harmony and :optional_letter)
# @return [Hash] a stem answer record with the keys :stem (Boolean), :word
#   (the resulting word) and :suffix_applied (the removed text, or nil)
def mark_stem(word, suffix)
  unchanged = { stem: false, word: word, suffix_applied: nil }
  harmony_required = suffix[:check_harmony]

  # NOTE(review): because && binds tighter than ||, a suffix without
  # :check_harmony is applied even to protected words. This mirrors the
  # existing behaviour; confirm whether protected words should always be
  # refused.
  stemmable = (!PROTECTED_WORDS.include?(word) &&
               harmony_required &&
               (has_vowel_harmony?(word) || VOWEL_HARMONY_EXCEPTIONS.include?(word))) ||
              !harmony_required
  return unchanged unless stemmable

  matched = word.match(/(#{suffix[:regex]})$/)
  return unchanged unless matched

  removed = matched.to_s
  remainder = word.gsub(/(#{removed})$/, '')
  stripped = { stem: stemmable, word: remainder, suffix_applied: removed }

  optional_letter = suffix[:optional_letter]
  return stripped unless optional_letter

  letter_valid, letter = valid_optional_letter?(remainder, optional_letter)

  if letter_valid && letter
    # The optional letter belongs to the suffix: drop it from the stem.
    { stem: stemmable, word: remainder.chop, suffix_applied: letter + removed }
  elsif !letter_valid
    unchanged
  else
    stripped
  end
end
303
+
304
# Given a word and a letter it checks if the optional letter can be part of
# the stem or not.
#
# When the word does not end with the letter the answer is true and no
# character is reported. Otherwise the character before the last one is
# inspected: a matched vowel must follow a consonant, and a matched consonant
# must follow a vowel.
#
# @param word [String] the examined word
# @param letter [String] a single letter or a string armed with a regular
#   expression
# @return [Array] the answer is returned as an array. First element is the
#   answer (true, false, or nil when the word has no previous character) and
#   second element is the matched character, or nil when nothing matched.
# @example
#   valid_optional_letter?("suy", "y")
#   # => [true, "y"]
#   valid_optional_letter?("test", "t")
#   # => [false, "t"]
def valid_optional_letter?(word, letter)
  found = word.match(/(#{letter})$/)
  return [true, nil] unless found

  matched_char = found.to_s
  previous_char = word[-2]

  # A vowel needs a consonant before it, anything else needs a vowel.
  required_neighbours = VOWELS.include?(matched_char) ? CONSONANTS : VOWELS

  [previous_char && required_neighbours.include?(previous_char), matched_char]
end
333
+
334
# Transforms a word taking into account the last consonant rule: a final b,
# c, d or ğ is replaced by p, ç, t or k respectively. Words listed in
# LAST_CONSONANT_EXCEPTIONS are left alone. The given string is modified in
# place.
#
# @param word [String] the word to check for last consonant change
# @return [String] the same string object, possibly changed
def last_consonant!(word)
  return word if LAST_CONSONANT_EXCEPTIONS.include?(word)

  replacements = { 'b' => 'p', 'c' => 'ç', 'd' => 't', 'ğ' => 'k' }
  replacement = replacements[word[-1]]
  word[-1] = replacement if replacement

  word
end
350
+
351
# Shortcut that runs the word returned by the given block through
# `affix_morphological_stripper` with the nominal verb states and suffixes.
#
# @yieldreturn [String] the word to strip
# @return [Array] whatever `affix_morphological_stripper` returns
def nominal_verbs_suffix_machine
  word = yield
  machine = {
    states: self::NOMINAL_VERB_STATES,
    suffixes: self::NOMINAL_VERB_SUFFIXES
  }

  affix_morphological_stripper(word, machine)
end
356
+
357
# Shortcut that runs the word returned by the given block through
# `affix_morphological_stripper` with the noun states and suffixes.
#
# @yieldreturn [String] the word to strip
# @return [Array] whatever `affix_morphological_stripper` returns
def noun_suffix_machine
  word = yield
  machine = {
    states: self::NOUN_STATES,
    suffixes: self::NOUN_SUFFIXES
  }

  affix_morphological_stripper(word, machine)
end
362
+
363
# Shortcut that runs the word returned by the given block through
# `affix_morphological_stripper` with the derivational states and suffixes.
#
# @yieldreturn [String] the word to strip
# @return [Array] whatever `affix_morphological_stripper` returns
def derivational_suffix_machine
  word = yield
  machine = {
    states: self::DERIVATIONAL_STATES,
    suffixes: self::DERIVATIONAL_SUFFIXES
  }

  affix_morphological_stripper(word, machine)
end
368
+
369
# A simple algorithm to strip suffixes from a word based on states and
# transitions.
#
# Starting from state :a, every transition whose suffix matches the word is
# queued as a pending. Pendings are processed one at a time: when the suffix
# can be stripped and the target state is final, the stripped word is
# recorded as a stem and the transitions of the target state are queued for
# it; when the target state is not final, the follow-up pendings are only
# queued as marked ones.
#
# @param word [String] the word to strip affixes from
# @param options [Hash] options for the algorithm
# @option options [Hash] :states The states and valid transitions
# @option options [Hash] :suffixes The suffixes with their rules
# @return [Array] all possible stem versions; a one-element array when no
#   stem was found or when states or suffixes are missing
def affix_morphological_stripper(word, options = {})
  states = options[:states] || {}
  suffixes = options[:suffixes] || {}

  return [word] if states.empty? || suffixes.empty?

  found = []
  # Init first state pending transitions
  queue = generate_pendings(:a, word, states, suffixes)

  until queue.empty?
    current = queue.shift
    # `word` is deliberately reassigned: the fallback at the end of the
    # method returns the word of the last processed pending.
    word = current[:word]
    rule = suffixes[current[:suffix]]
    target = states[current[:to_state]]
    outcome = mark_stem(word, rule)

    next unless outcome[:stem] == true

    if ENV['DEBUG']
      puts "Word: #{word} \nAnswer: #{outcome} \nInfo: #{current} \nSuffix: #{rule}"
    end

    if target[:final_state] == true
      # We have a valid transition here. It is safe to remove any pendings
      # with the same signature as the current pending, and the marked ones.
      remove_pendings_like!(current, queue)
      remove_mark_pendings!(queue)

      found.push(outcome[:word])

      next if target[:transitions].empty?

      queue.unshift(*generate_pendings(current[:to_state], outcome[:word], states, suffixes))
    else
      mark_pendings!(current, queue)
      queue.unshift(*generate_pendings(current[:to_state], outcome[:word],
                                       states, suffixes, mark: true))
    end
  end

  found.empty? ? [word] : found.uniq
end
424
+
425
+ private
426
+
427
# Removes, in place, every pending that shares both the :to_state and the
# :from_state of the given pending.
#
# @param pending [Hash] the reference pending
# @param array [Array<Hash>] the pendings to clean up
# @return [Array, nil] the array, or nil when nothing was removed
def remove_pendings_like!(pending, array)
  signature = pending.values_at(:to_state, :from_state)

  array.reject! do |candidate|
    candidate.values_at(:to_state, :from_state) == signature
  end
end
433
+
434
# Flags every pending returned by `similar_pendings` for the given pending by
# setting its :mark to true.
#
# @param pending [Hash] the reference pending
# @param array [Array<Hash>] the pendings to inspect
# @return [Array<Hash>] the pendings that were flagged
def mark_pendings!(pending, array)
  matches = similar_pendings(pending, array)
  matches.each { |candidate| candidate.store(:mark, true) }
end
439
+
440
# Removes, in place, every pending whose :mark is true.
#
# @param array [Array<Hash>] the pendings to clean up
# @return [Array, nil] the array, or nil when nothing was removed
def remove_mark_pendings!(array)
  array.reject! do |candidate|
    flag = candidate[:mark]
    flag == true
  end
end
443
+
444
# Selects the pendings that share both the :to_state and the :from_state of
# the given pending.
#
# @param pending [Hash] the reference pending
# @param array [Array<Hash>] the pendings to search
# @return [Array<Hash>] the matching pendings (same objects as in `array`)
def similar_pendings(pending, array)
  signature = pending.values_at(:to_state, :from_state)

  array.select do |candidate|
    candidate.values_at(:to_state, :from_state) == signature
  end
end
450
+
451
# Tells whether the word matches the ALPHABET regular expression.
#
# @param word [String] the word to check
# @return [Boolean]
def turkish?(word)
  word.match(ALPHABET) ? true : false
end
454
+
455
+ end
@@ -0,0 +1,14 @@
1
+ a:
2
+ transitions:
3
+ - suffix: :s1
4
+ state: :b
5
+
6
+ - suffix: :s2
7
+ state: :b
8
+
9
+ final_state: true
10
+
11
+ b:
12
+ transitions: []
13
+
14
+ final_state: true
@@ -0,0 +1,21 @@
1
+ a:
2
+ transitions:
3
+ - suffix: :s1
4
+ state: :b
5
+
6
+ - suffix: :s2
7
+ state: :b
8
+
9
+ final_state: true
10
+
11
+ b:
12
+ transitions:
13
+ - suffix: :s1
14
+ state: :c
15
+
16
+ final_state: true
17
+
18
+ c:
19
+ transitions: []
20
+
21
+ final_state: true
@@ -0,0 +1,7 @@
1
+ s1:
2
+ name: "test"
3
+ regex: "im"
4
+
5
+ s2:
6
+ name: "another"
7
+ regex: "siniz"
@@ -0,0 +1,7 @@
1
+ s1:
2
+ name: "TEST"
3
+ regex: "test"
4
+
5
+ s2:
6
+ name: "another"
7
+ regex: "another"