turkish_stemmer 0.1.2 → 0.1.4

Sign up to get free protection for your applications and to get access to all the features.
data/config/stemmer.yml DELETED
@@ -1,206 +0,0 @@
1
- protected_words:
2
- - abiye
3
- - adın
4
- - adana
5
- - akılsız
6
- - alaska
7
- - alet
8
- - ağda
9
- - ağız
10
- - alarm
11
- - altınbaşak
12
- - altınyıldız
13
- - anakucağı
14
- - anasayfa
15
- - anime
16
- - antifriz
17
- - araba
18
- - ardeşen
19
- - armanı
20
- - aroma
21
- - arma
22
- - arsız
23
- - asa
24
- - askı
25
- - astra
26
- - asus
27
- - atkı
28
- - ayakkabı
29
- - aynı
30
- - ayı
31
- - banka
32
- - başka
33
- - batık
34
- - bayı
35
- - belge
36
- - bellona
37
- - benten
38
- - benzin
39
- - beşinci
40
- - bilgi
41
- - bitki
42
- - boyut
43
- - branda
44
- - bütün
45
- - buzlu
46
- - çağrı
47
- - camsız
48
- - çanta
49
- - çarşı
50
- - ceyiz
51
- - çıkış
52
- - cımbiz
53
- - dalga
54
- - damla
55
- - derece
56
- - dişli
57
- - düğün
58
- - ege
59
- - elbise
60
- - fendi
61
- - filtre
62
- - fiyat
63
- - forma
64
- - gazete
65
- - gemi
66
- - görüntü
67
- - igne
68
- - ince
69
- - internet
70
- - iyi
71
- - kayısı
72
- - kama
73
- - katı
74
- - kötü
75
- - kumanda
76
- - lamba
77
- - lazım
78
- - litre
79
- - mağaza
80
- - magaza
81
- - makara
82
- - makine
83
- - malzeme
84
- - mana
85
- - marka
86
- - masa
87
- - maskara
88
- - mine
89
- - mini
90
- - nine
91
- - numara
92
- - odun
93
- - oyun
94
- - ölçü
95
- - örgü
96
- - öykü
97
- - özen
98
- - parça
99
- - perde
100
- - pompa
101
- - pırlanta
102
- - raket
103
- - ranza
104
- - şamdan
105
- - şapka
106
- - şifre
107
- - sunu
108
- - soyad
109
- - tabaka
110
- - takım
111
- - talımat
112
- - tarla
113
- - tasma
114
- - törpü
115
- - tozlu
116
- - tüplü
117
- - uçurtma
118
- - üfleme
119
- - ürün
120
- - ütü
121
- - uygun
122
- - uzatma
123
- - uzun
124
- - vana
125
- - yağlı
126
- - yapma
127
- - yardım
128
- - yasa
129
- - yıldız
130
- - zayıflama
131
- - zemin
132
-
133
- last_consonant_exceptions:
134
- - ad
135
- - at
136
- - ked
137
-
138
- vowel_harmony_exceptions:
139
- - alkoller
140
- - değerın
141
- - generali
142
- - generale
143
- - projektörlar
144
- - saatler
145
- - tabletlar
146
- - tersyüz
147
- - yaninda
148
- - yani
149
-
150
- selection_list_exceptions:
151
- - al
152
- - am
153
- - aparat
154
- - ara
155
- - bilet
156
- - bisiklet
157
- - bulut
158
- - diyet
159
- - ev
160
- - es
161
- - fiyat
162
- - fırsat
163
- - general
164
- - git
165
- - gıt
166
- - iç
167
- - ip
168
- - internet
169
- - iyi
170
- - kağıt
171
- - kartuş
172
- - katı
173
- - kot
174
- - kötü
175
- - kumanda
176
- - lamba
177
- - mağaza
178
- - magaza
179
- - makara
180
- - makine
181
- - marka
182
- - maskara
183
- - ne
184
- - otomat
185
- - palet
186
- - perde
187
- - raket
188
- - ranza
189
- - robot
190
- - sepet
191
- - servis
192
- - soyad
193
- - su
194
- - tabaka
195
- - tablet
196
- - takım
197
- - talımat
198
- - tanıt
199
- - tarla
200
- - tasma
201
- - tenis
202
- - törpü
203
- - uç
204
- - uygun
205
- - var
206
- - yasa
@@ -1,455 +0,0 @@
1
- # coding: utf-8
2
- require "turkish_stemmer/version"
3
- require "yaml"
4
- require "active_support/core_ext/hash"
5
-
6
- # Please note that we use only lowercase letters for all methods. One should
7
- # normalize input streams before using the `stem` method.
8
- module TurkishStemmer
9
- extend self
10
-
11
# Turkish vowels and consonants (lowercase only).
VOWELS = "üiıueöao".freeze
CONSONANTS = "bcçdfgğhjklmnprsştvyz".freeze

# Vowel classes used by the roundness and frontness harmony checks.
ROUNDED_VOWELS = "oöuü".freeze
UNROUNDED_VOWELS = "iıea".freeze
# Vowels accepted after a rounded vowel by the roundness check.
FOLLOWING_ROUNDED_VOWELS = "aeuü".freeze
FRONT_VOWELS = "eiöü".freeze
BACK_VOWELS = "ıuao".freeze

# Heuristic size for average Turkish stemmed word size
AVG_STEMMED_SIZE = 4

# Regular expression that checks if the word contains only turkish characters.
# Anchored with \A and \z (whole string) rather than ^ and $ (per line), so a
# multi-line string cannot pass on the strength of a single matching line.
ALPHABET = /\A[abcçdefgğhıijklmnoöprsştuüvyz]+\z/.freeze
24
-
25
# Stems a Turkish word.
#
# The algorithm has three phases. Pre-process: decide via `proceed_to_stem?`
# whether the word should be stemmed at all. Process: run the nominal verb,
# noun and derivational suffix machines, each one over every candidate
# produced so far. Post-process: hand all candidates to `stem_post_process`,
# which picks the final stem.
#
# @param original_word [String] the word to stem
# @return [String] the stemmed word, or the original word when it is not
#   stemmed
def stem(original_word)
  # Preprocess
  return original_word unless proceed_to_stem?(original_word)

  # Process. Non-bang flat_map/uniq are used on purpose: the bang variants
  # return nil when they change nothing, which makes chaining them unsafe.
  candidates = nominal_verbs_suffix_machine { original_word.dup }
  candidates = (candidates + [original_word]).uniq

  noun_stems = candidates.flat_map { |candidate| noun_suffix_machine { candidate } }
  candidates = (candidates + noun_stems + [original_word]).uniq

  candidates += candidates.flat_map { |candidate| derivational_suffix_machine { candidate } }

  # Postprocess
  stem_post_process(candidates, original_word)
end
54
-
55
# Reads a YAML file located relative to two directories above this file and
# returns its top-level hash with symbol keys.
#
# @param file [String] path to the yaml file, relative to that directory
# @return [Hash] the parsed hash with symbols as keys
# @raise [RuntimeError] when the file cannot be resolved, read or parsed
def load_states_or_suffixes(file)
  begin
    absolute_path = File.expand_path("../../#{file}", __FILE__)
    definitions = YAML.load_file(absolute_path)
    definitions.symbolize_keys
  rescue => error
    raise "An error occured loading #{file}, #{error}"
  end
end
66
-
67
# Looks up a single entry of config/stemmer.yml.
#
# @param key [String] the top-level key to read
# @return [Object] whatever value the YAML file holds under that key
# @raise [RuntimeError] when the file cannot be read or parsed
def load_settings(key)
  config_path = File.expand_path("../../config/stemmer.yml", __FILE__)

  begin
    settings = YAML.load_file(config_path)
    settings[key]
  rescue => error
    raise "Please provide a valid config/stemmer.yml file, #{error}"
  end
end
79
-
80
- NOMINAL_VERB_STATES = load_states_or_suffixes("config/nominal_verb_states.yml")
81
- NOMINAL_VERB_SUFFIXES = load_states_or_suffixes("config/nominal_verb_suffixes.yml")
82
-
83
- NOUN_STATES = load_states_or_suffixes("config/noun_states.yml")
84
- NOUN_SUFFIXES = load_states_or_suffixes("config/noun_suffixes.yml")
85
-
86
- DERIVATIONAL_STATES = load_states_or_suffixes("config/derivational_states.yml")
87
- DERIVATIONAL_SUFFIXES = load_states_or_suffixes("config/derivational_suffixes.yml")
88
-
89
- ##
90
- # Load settings from config/stemmer.yml (read through load_settings)
91
- #
92
- # Protected words: rejected by proceed_to_stem? and consulted by mark_stem
93
- PROTECTED_WORDS = load_settings("protected_words")
94
-
95
- # Last consonant exceptions: words that last_consonant! returns unchanged
96
- LAST_CONSONANT_EXCEPTIONS = load_settings("last_consonant_exceptions")
97
-
98
- # Vowel harmony exceptions: words mark_stem accepts even without vowel harmony
99
- VOWEL_HARMONY_EXCEPTIONS = load_settings("vowel_harmony_exceptions")
100
-
101
- # Selection list exceptions: stems that stem_post_process returns first when present
102
- SELECTION_LIST_EXCEPTIONS = load_settings("selection_list_exceptions")
103
-
104
# Returns how many syllables a Turkish word has, taken to be the number of
# vowels reported by `vowels`.
#
# @param word [String] the word to measure
# @return [Integer] the number of syllables
def count_syllables(word)
  syllable_nuclei = vowels(word)
  syllable_nuclei.length
end
112
-
113
# Gets the vowels of a word by dropping every character listed in CONSONANTS.
# Any other character (not only vowels) is kept, so callers are expected to
# pass lowercase Turkish words.
#
# Uses String#delete instead of building an alternation regex from CONSONANTS
# on every call; CONSONANTS holds no `-`, `^` or `\`, so it is read as a plain
# character set.
#
# @param word [String] the word to get its vowels
# @return [Array<String>] the remaining characters, in order
def vowels(word)
  word.delete(CONSONANTS).chars
end
120
-
121
# Tells whether the last two vowels of a word respect Turkish vowel harmony.
# A missing vowel is passed on as nil.
#
# @param word [String] the word to be checked against Turkish vowel harmony
# @return [Boolean]
# @see https://en.wikipedia.org/wiki/Vowel_harmony#Turkish
def has_vowel_harmony?(word)
  second_to_last, last = vowels(word).values_at(-2, -1)

  vowel_harmony?(second_to_last, last)
end
133
-
134
# Tells whether two consecutive vowels satisfy both the roundness and the
# frontness rule of Turkish vowel harmony.
#
# @param vowel [String] the first vowel
# @param candidate [String] the second vowel
# @return [Boolean]
# @see https://en.wikipedia.org/wiki/Vowel_harmony#Turkish
def vowel_harmony?(vowel, candidate)
  roundness_ok = has_roundness?(vowel, candidate)

  roundness_ok && has_frontness?(vowel, candidate)
end
143
-
144
# Checks the roundness rule of Turkish vowel harmony for two vowels: an
# unrounded vowel must be followed by an unrounded one, and a rounded vowel by
# one of FOLLOWING_ROUNDED_VOWELS. A nil or empty vowel on either side counts
# as harmonious.
#
# @param vowel [String] the first vowel
# @param candidate [String] the second vowel
# @return [Boolean]
# @see https://en.wikipedia.org/wiki/Vowel_harmony#Turkish
def has_roundness?(vowel, candidate)
  return true if [vowel, candidate].any? { |v| v.nil? || v.empty? }

  both_unrounded = UNROUNDED_VOWELS.include?(vowel) &&
                   UNROUNDED_VOWELS.include?(candidate)
  rounded_then_allowed = ROUNDED_VOWELS.include?(vowel) &&
                         FOLLOWING_ROUNDED_VOWELS.include?(candidate)

  both_unrounded || rounded_then_allowed
end
162
-
163
# Checks the frontness rule of Turkish vowel harmony for two vowels: both must
# be front vowels or both must be back vowels. A nil or empty vowel on either
# side counts as harmonious.
#
# @param vowel [String] the first vowel
# @param candidate [String] the second vowel
# @return [Boolean]
# @see https://en.wikipedia.org/wiki/Vowel_harmony#Turkish
def has_frontness?(vowel, candidate)
  return true if [vowel, candidate].any? { |v| v.nil? || v.empty? }

  both_front = FRONT_VOWELS.include?(vowel) && FRONT_VOWELS.include?(candidate)
  both_back = BACK_VOWELS.include?(vowel) && BACK_VOWELS.include?(candidate)

  both_front || both_back
end
181
-
182
# Decides whether a word is a candidate for stemming. A word is skipped when
# it is nil, contains non-Turkish characters, is a protected word, or has at
# most one syllable.
#
# @param word [String] the candidate word for stemming
# @return [Boolean] whether should proceed to stem or not
def proceed_to_stem?(word)
  return false if word.nil?
  return false unless turkish?(word)
  return false if PROTECTED_WORDS.include?(word)

  count_syllables(word) > 1
end
197
-
198
# Picks the final stem out of the candidates produced by the suffix machines.
#
# Candidates equal to the original word or without any syllable are dropped,
# the last-consonant rule is applied to the rest, and they are ordered by how
# close their size is to AVG_STEMMED_SIZE (shorter first on a tie). A candidate
# listed in SELECTION_LIST_EXCEPTIONS wins over that ordering.
#
# @param stems [Array] array of candidate stems, possibly nested
# @param original_word [String] the original word
# @return [String] the stemmed or the original word
def stem_post_process(stems, original_word)
  puts "post process for #{original_word}: #{stems}" if ENV['DEBUG']

  candidates = stems.flatten.uniq

  # Drop the original word and every candidate without a syllable
  candidates.reject! do |candidate|
    candidate == original_word || count_syllables(candidate) == 0
  end

  # Transform last consonant
  candidates.map! { |candidate| last_consonant!(candidate) }

  # Order by distance from the average stem size, then by size
  candidates.sort! do |left, right|
    left_key = [(left.size - AVG_STEMMED_SIZE).abs, left.size]
    right_key = [(right.size - AVG_STEMMED_SIZE).abs, right.size]
    left_key <=> right_key
  end

  # Selection list exceptions take precedence
  preferred = candidates & SELECTION_LIST_EXCEPTIONS
  return preferred.first unless preferred.empty?

  # Keep first or original word
  candidates.fetch(0, original_word)
end
236
-
237
# Builds the pending transitions of one state: every transition of the state
# whose suffix regex matches the end of the word becomes a pending record.
#
# @param key [String] the key for states hash
# @param word [String] the word to check
# @param states [Hash] the states hash
# @param suffixes [Hash] the suffixes hash
# @param options [Hash] options for pendings
# @option options [Boolean] :mark Whether this pending is marked for deletion
# @return [Array<Hash>] pendings with :suffix, :to_state, :from_state, :word
#   and :mark keys
# @raise [ArgumentError] when no state is registered under key
def generate_pendings(key, word, states, suffixes, options = {})
  state = states[key]
  raise ArgumentError, "State #{key} does not exist" if state.nil?

  marked = options[:mark] || false

  state["transitions"].each_with_object([]) do |transition, pendings|
    suffix_pattern = /(#{suffixes[transition["suffix"]]["regex"]})$/
    next unless word.match(suffix_pattern)

    pendings << {
      suffix: transition["suffix"],
      to_state: transition["state"],
      from_state: key,
      word: word,
      mark: marked
    }
  end
end
265
-
266
# Tries to strip one suffix from a word.
#
# The suffix is removed when the word passes the harmony/protection check and
# ends with the suffix regex. When the suffix declares an optional letter, the
# letter preceding the suffix is validated with `valid_optional_letter?` and
# stripped as well if it matched.
#
# @param word [String] the word to stem
# @param suffix [Hash] a suffix record ("regex", "check_harmony" and
#   "optional_letter" keys are read)
# @return [Hash] a stem answer record with :stem (Boolean), :word (the
#   stripped word, or the input when nothing was stripped) and
#   :suffix_applied (the removed text or nil)
def mark_stem(word, suffix)
  harmony_required = suffix["check_harmony"]

  # `&&` binds tighter than `||`: when the suffix does not require harmony the
  # expression is true regardless of the protected-word check.
  stemmable = (!PROTECTED_WORDS.include?(word) &&
               (harmony_required &&
                (has_vowel_harmony?(word) || VOWEL_HARMONY_EXCEPTIONS.include?(word)))) ||
              !harmony_required

  unchanged = { stem: false, word: word, suffix_applied: nil }

  matched = stemmable && word.match(/(#{suffix["regex"]})$/)
  return unchanged unless matched

  removed = matched.to_s
  stripped = word.gsub(/(#{removed})$/, '')
  optional = suffix["optional_letter"]

  if optional
    valid, letter = valid_optional_letter?(stripped, optional)
    return unchanged unless valid

    # The optional letter is present and valid: it belongs to the suffix
    if letter
      stripped = stripped.chop
      removed = letter + removed
    end
  end

  { stem: true, word: stripped, suffix_applied: removed }
end
303
-
304
# Checks the optional letter that may sit between a stem and a suffix.
#
# When the word does not end with the letter there is nothing to validate and
# the answer is true. When it does, a matched vowel must be preceded by a
# consonant and a matched consonant by a vowel; the answer is nil when there
# is no preceding character.
#
# @param word [String] the examined word
# @param letter [String] a single letter or a string armed with a regular
#   expression
# @return [Array] the answer is returned as an array. First element is the
#   (truthy or falsy) answer and second element is the matched character, or
#   nil when nothing matched.
# @example
#   self.valid_optional_letter?("suy", "y")
#   # => [true, 'y']
#   self.valid_optional_letter?("test", "t")
#   # => [false, 't']
def valid_optional_letter?(word, letter)
  found = word.match(/(#{letter})$/)
  return [true, nil] unless found

  matched_char = found.to_s
  previous_char = word[-2]

  # A vowel needs a consonant before it, anything else needs a vowel
  required_before = VOWELS.include?(matched_char) ? CONSONANTS : VOWELS

  [(previous_char && required_before.include?(previous_char)), matched_char]
end
333
-
334
# Applies the last consonant rule in place: a final b, c, d or ğ is replaced
# by p, ç, t or k respectively. Words listed in LAST_CONSONANT_EXCEPTIONS are
# left untouched.
#
# @param word [String] the word to check for last consonant change; it is
#   modified in place
# @return [String] the same string object, possibly changed
def last_consonant!(word)
  return word if LAST_CONSONANT_EXCEPTIONS.include?(word)

  replacement = { 'b' => 'p', 'c' => 'ç', 'd' => 't', 'ğ' => 'k' }[word[-1]]
  word[-1] = replacement if replacement

  word
end
350
-
351
# Shortcut: runs the affix stripper over the word returned by the block, using
# the nominal verb states and suffixes.
#
# @yieldreturn [String] the word to strip
# @return [Array] all possible stem versions
def nominal_verbs_suffix_machine
  word = yield

  affix_morphological_stripper(word,
                               states: self::NOMINAL_VERB_STATES,
                               suffixes: self::NOMINAL_VERB_SUFFIXES)
end
356
-
357
# Shortcut: runs the affix stripper over the word returned by the block, using
# the noun states and suffixes.
#
# @yieldreturn [String] the word to strip
# @return [Array] all possible stem versions
def noun_suffix_machine
  word = yield

  affix_morphological_stripper(word,
                               states: self::NOUN_STATES,
                               suffixes: self::NOUN_SUFFIXES)
end
362
-
363
# Shortcut: runs the affix stripper over the word returned by the block, using
# the derivational states and suffixes.
#
# @yieldreturn [String] the word to strip
# @return [Array] all possible stem versions
def derivational_suffix_machine
  word = yield

  affix_morphological_stripper(word,
                               states: self::DERIVATIONAL_STATES,
                               suffixes: self::DERIVATIONAL_SUFFIXES)
end
368
-
369
# Strips suffixes from a word by walking a state machine.
#
# Starting from state :a, pending transitions are processed from the front of
# a queue. A transition whose suffix can be stripped (see `mark_stem`) either
# reaches a final state, in which case the stripped word is recorded as a stem,
# or a non-final state, in which case the follow-up transitions are queued as
# marked. Reaching a final state discards queued transitions with the same
# from/to states as well as all marked ones.
#
# NOTE: when no stem is found the method returns the word of the last
# transition examined, which is not necessarily the word passed in.
#
# @param word [String] the word to strip affixes from
# @param options [Hash] options for the algorithm
# @option options [Hash] :states The states and valid transitions
# @option options [Hash] :suffixes The suffixes with their rules
# @return [Array] all possible stem versions
def affix_morphological_stripper(word, options = {})
  states = options[:states] || {}
  suffixes = options[:suffixes] || {}

  return [word] if states.empty? || suffixes.empty?

  found = []
  # Init first state pending transitions
  queue = generate_pendings(:a, word, states, suffixes)

  until queue.empty?
    current = queue.shift
    word = current[:word]
    suffix = suffixes[current[:suffix]]
    target = states[current[:to_state]]
    result = mark_stem(word, suffix)

    next unless result[:stem] == true

    if ENV['DEBUG']
      puts "Word: #{word} \nAnswer: #{result} \nInfo: #{current} \nSuffix: #{suffix}"
    end

    if target["final_state"] == true
      # A valid transition: pendings sharing its signature, and any marked
      # pendings, are no longer needed
      remove_pendings_like!(current, queue)
      remove_mark_pendings!(queue)

      found.push result[:word]

      unless target["transitions"].empty?
        follow_ups = generate_pendings(current[:to_state], result[:word], states, suffixes)
        queue.unshift(*follow_ups)
      end
    else
      mark_pendings!(current, queue)
      follow_ups = generate_pendings(current[:to_state], result[:word],
                                     states, suffixes, mark: true)
      queue.unshift(*follow_ups)
    end
  end

  # The queue is always empty here, so only the stems decide
  return [word] if found.empty?

  found.uniq
end
424
-
425
- private
426
-
427
# Removes, in place, every pending that shares the :to_state and :from_state
# of the given pending.
#
# @param pending [Hash] the reference pending
# @param array [Array<Hash>] the pendings to filter
# @return [Array, nil] the array, or nil when nothing was removed
def remove_pendings_like!(pending, array)
  signature = pending.values_at(:to_state, :from_state)

  array.reject! do |candidate|
    candidate.values_at(:to_state, :from_state) == signature
  end
end
433
-
434
# Sets :mark to true on every pending that shares the :to_state and
# :from_state of the given pending.
#
# @param pending [Hash] the reference pending
# @param array [Array<Hash>] the pendings to inspect
# @return [Array<Hash>] the pendings that were marked
def mark_pendings!(pending, array)
  matching = array.select do |candidate|
    candidate[:to_state] == pending[:to_state] &&
      candidate[:from_state] == pending[:from_state]
  end

  matching.each { |candidate| candidate[:mark] = true }
end
439
-
440
# Removes, in place, every pending whose :mark is true.
#
# @param array [Array<Hash>] the pendings to filter
# @return [Array, nil] the array, or nil when nothing was removed
def remove_mark_pendings!(array)
  array.reject! do |candidate|
    flag = candidate[:mark]
    flag == true
  end
end
443
-
444
# Selects the pendings that share the :to_state and :from_state of the given
# pending.
#
# @param pending [Hash] the reference pending
# @param array [Array<Hash>] the pendings to search
# @return [Array<Hash>] the matching pendings
def similar_pendings(pending, array)
  signature = pending.values_at(:to_state, :from_state)

  array.select do |candidate|
    candidate.values_at(:to_state, :from_state) == signature
  end
end
450
-
451
# Tells whether the word matches the ALPHABET pattern.
#
# @param word [String] the word to check
# @return [Boolean]
def turkish?(word)
  word.match(ALPHABET) ? true : false
end
454
-
455
- end