turkish_stemmer 0.1.2 → 0.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
 - metadata +6 -38
 - data/.gitignore +0 -18
 - data/.rspec +0 -2
 - data/Gemfile +0 -4
 - data/LICENSE.txt +0 -22
 - data/README.md +0 -282
 - data/Rakefile +0 -21
 - data/benchmarks/stemmers_comparison.rb +0 -16
 - data/benchmarks/stemming_samples.txt +0 -17916
 - data/benchmarks/turkish_word_recognition.rb +0 -26
 - data/config/derivational_states.yml +0 -10
 - data/config/derivational_suffixes.yml +0 -6
 - data/config/nominal_verb_states.yml +0 -121
 - data/config/nominal_verb_suffixes.yml +0 -90
 - data/config/noun_states.yml +0 -177
 - data/config/noun_suffixes.yml +0 -113
 - data/config/stemmer.yml +0 -206
 - data/lib/turkish_stemmer.rb +0 -455
 - data/lib/turkish_stemmer/version.rb +0 -3
 - data/spec/fixtures/simple_state.yml +0 -14
 - data/spec/fixtures/simple_state_02.yml +0 -21
 - data/spec/fixtures/simple_suffix.yml +0 -7
 - data/spec/fixtures/simple_transition.yml +0 -7
 - data/spec/spec_helper.rb +0 -19
 - data/spec/support/fixtures.csv +0 -101
 - data/spec/turkish_stemmer_spec.rb +0 -522
 - data/turkish_stemmer.gemspec +0 -35
 
    
        data/config/stemmer.yml
    DELETED
    
    | 
         @@ -1,206 +0,0 @@ 
     | 
|
| 
       1 
     | 
    
         
            -
            protected_words:
         
     | 
| 
       2 
     | 
    
         
            -
              - abiye
         
     | 
| 
       3 
     | 
    
         
            -
              - adın
         
     | 
| 
       4 
     | 
    
         
            -
              - adana
         
     | 
| 
       5 
     | 
    
         
            -
              - akılsız
         
     | 
| 
       6 
     | 
    
         
            -
              - alaska
         
     | 
| 
       7 
     | 
    
         
            -
              - alet
         
     | 
| 
       8 
     | 
    
         
            -
              - ağda
         
     | 
| 
       9 
     | 
    
         
            -
              - ağız
         
     | 
| 
       10 
     | 
    
         
            -
              - alarm
         
     | 
| 
       11 
     | 
    
         
            -
              - altınbaşak
         
     | 
| 
       12 
     | 
    
         
            -
              - altınyıldız
         
     | 
| 
       13 
     | 
    
         
            -
              - anakucağı
         
     | 
| 
       14 
     | 
    
         
            -
              - anasayfa
         
     | 
| 
       15 
     | 
    
         
            -
              - anime
         
     | 
| 
       16 
     | 
    
         
            -
              - antifriz
         
     | 
| 
       17 
     | 
    
         
            -
              - araba
         
     | 
| 
       18 
     | 
    
         
            -
              - ardeşen
         
     | 
| 
       19 
     | 
    
         
            -
              - armanı
         
     | 
| 
       20 
     | 
    
         
            -
              - aroma
         
     | 
| 
       21 
     | 
    
         
            -
              - arma
         
     | 
| 
       22 
     | 
    
         
            -
              - arsız
         
     | 
| 
       23 
     | 
    
         
            -
              - asa
         
     | 
| 
       24 
     | 
    
         
            -
              - askı
         
     | 
| 
       25 
     | 
    
         
            -
              - astra
         
     | 
| 
       26 
     | 
    
         
            -
              - asus
         
     | 
| 
       27 
     | 
    
         
            -
              - atkı
         
     | 
| 
       28 
     | 
    
         
            -
              - ayakkabı
         
     | 
| 
       29 
     | 
    
         
            -
              - aynı
         
     | 
| 
       30 
     | 
    
         
            -
              - ayı
         
     | 
| 
       31 
     | 
    
         
            -
              - banka
         
     | 
| 
       32 
     | 
    
         
            -
              - başka
         
     | 
| 
       33 
     | 
    
         
            -
              - batık
         
     | 
| 
       34 
     | 
    
         
            -
              - bayı
         
     | 
| 
       35 
     | 
    
         
            -
              - belge
         
     | 
| 
       36 
     | 
    
         
            -
              - bellona
         
     | 
| 
       37 
     | 
    
         
            -
              - benten
         
     | 
| 
       38 
     | 
    
         
            -
              - benzin
         
     | 
| 
       39 
     | 
    
         
            -
              - beşinci
         
     | 
| 
       40 
     | 
    
         
            -
              - bilgi
         
     | 
| 
       41 
     | 
    
         
            -
              - bitki
         
     | 
| 
       42 
     | 
    
         
            -
              - boyut
         
     | 
| 
       43 
     | 
    
         
            -
              - branda
         
     | 
| 
       44 
     | 
    
         
            -
              - bütün
         
     | 
| 
       45 
     | 
    
         
            -
              - buzlu
         
     | 
| 
       46 
     | 
    
         
            -
              - çağrı
         
     | 
| 
       47 
     | 
    
         
            -
              - camsız
         
     | 
| 
       48 
     | 
    
         
            -
              - çanta
         
     | 
| 
       49 
     | 
    
         
            -
              - çarşı
         
     | 
| 
       50 
     | 
    
         
            -
              - ceyiz
         
     | 
| 
       51 
     | 
    
         
            -
              - çıkış
         
     | 
| 
       52 
     | 
    
         
            -
              - cımbiz
         
     | 
| 
       53 
     | 
    
         
            -
              - dalga
         
     | 
| 
       54 
     | 
    
         
            -
              - damla
         
     | 
| 
       55 
     | 
    
         
            -
              - derece
         
     | 
| 
       56 
     | 
    
         
            -
              - dişli
         
     | 
| 
       57 
     | 
    
         
            -
              - düğün
         
     | 
| 
       58 
     | 
    
         
            -
              - ege
         
     | 
| 
       59 
     | 
    
         
            -
              - elbise
         
     | 
| 
       60 
     | 
    
         
            -
              - fendi
         
     | 
| 
       61 
     | 
    
         
            -
              - filtre
         
     | 
| 
       62 
     | 
    
         
            -
              - fiyat
         
     | 
| 
       63 
     | 
    
         
            -
              - forma
         
     | 
| 
       64 
     | 
    
         
            -
              - gazete
         
     | 
| 
       65 
     | 
    
         
            -
              - gemi
         
     | 
| 
       66 
     | 
    
         
            -
              - görüntü
         
     | 
| 
       67 
     | 
    
         
            -
              - igne
         
     | 
| 
       68 
     | 
    
         
            -
              - ince
         
     | 
| 
       69 
     | 
    
         
            -
              - internet
         
     | 
| 
       70 
     | 
    
         
            -
              - iyi
         
     | 
| 
       71 
     | 
    
         
            -
              - kayısı
         
     | 
| 
       72 
     | 
    
         
            -
              - kama
         
     | 
| 
       73 
     | 
    
         
            -
              - katı
         
     | 
| 
       74 
     | 
    
         
            -
              - kötü
         
     | 
| 
       75 
     | 
    
         
            -
              - kumanda
         
     | 
| 
       76 
     | 
    
         
            -
              - lamba
         
     | 
| 
       77 
     | 
    
         
            -
              - lazım
         
     | 
| 
       78 
     | 
    
         
            -
              - litre
         
     | 
| 
       79 
     | 
    
         
            -
              - mağaza
         
     | 
| 
       80 
     | 
    
         
            -
              - magaza
         
     | 
| 
       81 
     | 
    
         
            -
              - makara
         
     | 
| 
       82 
     | 
    
         
            -
              - makine
         
     | 
| 
       83 
     | 
    
         
            -
              - malzeme
         
     | 
| 
       84 
     | 
    
         
            -
              - mana
         
     | 
| 
       85 
     | 
    
         
            -
              - marka
         
     | 
| 
       86 
     | 
    
         
            -
              - masa
         
     | 
| 
       87 
     | 
    
         
            -
              - maskara
         
     | 
| 
       88 
     | 
    
         
            -
              - mine
         
     | 
| 
       89 
     | 
    
         
            -
              - mini
         
     | 
| 
       90 
     | 
    
         
            -
              - nine
         
     | 
| 
       91 
     | 
    
         
            -
              - numara
         
     | 
| 
       92 
     | 
    
         
            -
              - odun
         
     | 
| 
       93 
     | 
    
         
            -
              - oyun
         
     | 
| 
       94 
     | 
    
         
            -
              - ölçü
         
     | 
| 
       95 
     | 
    
         
            -
              - örgü
         
     | 
| 
       96 
     | 
    
         
            -
              - öykü
         
     | 
| 
       97 
     | 
    
         
            -
              - özen
         
     | 
| 
       98 
     | 
    
         
            -
              - parça
         
     | 
| 
       99 
     | 
    
         
            -
              - perde
         
     | 
| 
       100 
     | 
    
         
            -
              - pompa
         
     | 
| 
       101 
     | 
    
         
            -
              - pırlanta
         
     | 
| 
       102 
     | 
    
         
            -
              - raket
         
     | 
| 
       103 
     | 
    
         
            -
              - ranza
         
     | 
| 
       104 
     | 
    
         
            -
              - şamdan
         
     | 
| 
       105 
     | 
    
         
            -
              - şapka
         
     | 
| 
       106 
     | 
    
         
            -
              - şifre
         
     | 
| 
       107 
     | 
    
         
            -
              - sunu
         
     | 
| 
       108 
     | 
    
         
            -
              - soyad
         
     | 
| 
       109 
     | 
    
         
            -
              - tabaka
         
     | 
| 
       110 
     | 
    
         
            -
              - takım
         
     | 
| 
       111 
     | 
    
         
            -
              - talımat
         
     | 
| 
       112 
     | 
    
         
            -
              - tarla
         
     | 
| 
       113 
     | 
    
         
            -
              - tasma
         
     | 
| 
       114 
     | 
    
         
            -
              - törpü
         
     | 
| 
       115 
     | 
    
         
            -
              - tozlu
         
     | 
| 
       116 
     | 
    
         
            -
              - tüplü
         
     | 
| 
       117 
     | 
    
         
            -
              - uçurtma
         
     | 
| 
       118 
     | 
    
         
            -
              - üfleme
         
     | 
| 
       119 
     | 
    
         
            -
              - ürün
         
     | 
| 
       120 
     | 
    
         
            -
              - ütü
         
     | 
| 
       121 
     | 
    
         
            -
              - uygun
         
     | 
| 
       122 
     | 
    
         
            -
              - uzatma
         
     | 
| 
       123 
     | 
    
         
            -
              - uzun
         
     | 
| 
       124 
     | 
    
         
            -
              - vana
         
     | 
| 
       125 
     | 
    
         
            -
              - yağlı
         
     | 
| 
       126 
     | 
    
         
            -
              - yapma
         
     | 
| 
       127 
     | 
    
         
            -
              - yardım
         
     | 
| 
       128 
     | 
    
         
            -
              - yasa
         
     | 
| 
       129 
     | 
    
         
            -
              - yıldız
         
     | 
| 
       130 
     | 
    
         
            -
              - zayıflama
         
     | 
| 
       131 
     | 
    
         
            -
              - zemin
         
     | 
| 
       132 
     | 
    
         
            -
             
     | 
| 
       133 
     | 
    
         
            -
            last_consonant_exceptions:
         
     | 
| 
       134 
     | 
    
         
            -
              - ad
         
     | 
| 
       135 
     | 
    
         
            -
              - at
         
     | 
| 
       136 
     | 
    
         
            -
              - ked
         
     | 
| 
       137 
     | 
    
         
            -
             
     | 
| 
       138 
     | 
    
         
            -
            vowel_harmony_exceptions:
         
     | 
| 
       139 
     | 
    
         
            -
              - alkoller
         
     | 
| 
       140 
     | 
    
         
            -
              - değerın
         
     | 
| 
       141 
     | 
    
         
            -
              - generali
         
     | 
| 
       142 
     | 
    
         
            -
              - generale
         
     | 
| 
       143 
     | 
    
         
            -
              - projektörlar
         
     | 
| 
       144 
     | 
    
         
            -
              - saatler
         
     | 
| 
       145 
     | 
    
         
            -
              - tabletlar
         
     | 
| 
       146 
     | 
    
         
            -
              - tersyüz
         
     | 
| 
       147 
     | 
    
         
            -
              - yaninda
         
     | 
| 
       148 
     | 
    
         
            -
              - yani
         
     | 
| 
       149 
     | 
    
         
            -
             
     | 
| 
       150 
     | 
    
         
            -
            selection_list_exceptions:
         
     | 
| 
       151 
     | 
    
         
            -
              - al
         
     | 
| 
       152 
     | 
    
         
            -
              - am
         
     | 
| 
       153 
     | 
    
         
            -
              - aparat
         
     | 
| 
       154 
     | 
    
         
            -
              - ara
         
     | 
| 
       155 
     | 
    
         
            -
              - bilet
         
     | 
| 
       156 
     | 
    
         
            -
              - bisiklet
         
     | 
| 
       157 
     | 
    
         
            -
              - bulut
         
     | 
| 
       158 
     | 
    
         
            -
              - diyet
         
     | 
| 
       159 
     | 
    
         
            -
              - ev
         
     | 
| 
       160 
     | 
    
         
            -
              - es
         
     | 
| 
       161 
     | 
    
         
            -
              - fiyat
         
     | 
| 
       162 
     | 
    
         
            -
              - fırsat
         
     | 
| 
       163 
     | 
    
         
            -
              - general
         
     | 
| 
       164 
     | 
    
         
            -
              - git
         
     | 
| 
       165 
     | 
    
         
            -
              - gıt
         
     | 
| 
       166 
     | 
    
         
            -
              - iç
         
     | 
| 
       167 
     | 
    
         
            -
              - ip
         
     | 
| 
       168 
     | 
    
         
            -
              - internet
         
     | 
| 
       169 
     | 
    
         
            -
              - iyi
         
     | 
| 
       170 
     | 
    
         
            -
              - kağıt
         
     | 
| 
       171 
     | 
    
         
            -
              - kartuş
         
     | 
| 
       172 
     | 
    
         
            -
              - katı
         
     | 
| 
       173 
     | 
    
         
            -
              - kot
         
     | 
| 
       174 
     | 
    
         
            -
              - kötü
         
     | 
| 
       175 
     | 
    
         
            -
              - kumanda
         
     | 
| 
       176 
     | 
    
         
            -
              - lamba
         
     | 
| 
       177 
     | 
    
         
            -
              - mağaza
         
     | 
| 
       178 
     | 
    
         
            -
              - magaza
         
     | 
| 
       179 
     | 
    
         
            -
              - makara
         
     | 
| 
       180 
     | 
    
         
            -
              - makine
         
     | 
| 
       181 
     | 
    
         
            -
              - marka
         
     | 
| 
       182 
     | 
    
         
            -
              - maskara
         
     | 
| 
       183 
     | 
    
         
            -
              - ne
         
     | 
| 
       184 
     | 
    
         
            -
              - otomat
         
     | 
| 
       185 
     | 
    
         
            -
              - palet
         
     | 
| 
       186 
     | 
    
         
            -
              - perde
         
     | 
| 
       187 
     | 
    
         
            -
              - raket
         
     | 
| 
       188 
     | 
    
         
            -
              - ranza
         
     | 
| 
       189 
     | 
    
         
            -
              - robot
         
     | 
| 
       190 
     | 
    
         
            -
              - sepet
         
     | 
| 
       191 
     | 
    
         
            -
              - servis
         
     | 
| 
       192 
     | 
    
         
            -
              - soyad
         
     | 
| 
       193 
     | 
    
         
            -
              - su
         
     | 
| 
       194 
     | 
    
         
            -
              - tabaka
         
     | 
| 
       195 
     | 
    
         
            -
              - tablet
         
     | 
| 
       196 
     | 
    
         
            -
              - takım
         
     | 
| 
       197 
     | 
    
         
            -
              - talımat
         
     | 
| 
       198 
     | 
    
         
            -
              - tanıt
         
     | 
| 
       199 
     | 
    
         
            -
              - tarla
         
     | 
| 
       200 
     | 
    
         
            -
              - tasma
         
     | 
| 
       201 
     | 
    
         
            -
              - tenis
         
     | 
| 
       202 
     | 
    
         
            -
              - törpü
         
     | 
| 
       203 
     | 
    
         
            -
              - uç
         
     | 
| 
       204 
     | 
    
         
            -
              - uygun
         
     | 
| 
       205 
     | 
    
         
            -
              - var
         
     | 
| 
       206 
     | 
    
         
            -
              - yasa
         
     | 
    
        data/lib/turkish_stemmer.rb
    DELETED
    
    | 
         @@ -1,455 +0,0 @@ 
     | 
|
| 
       1 
     | 
    
         
            -
            # coding: utf-8
         
     | 
| 
       2 
     | 
    
         
            -
            require "turkish_stemmer/version"
         
     | 
| 
       3 
     | 
    
         
            -
            require "yaml"
         
     | 
| 
       4 
     | 
    
         
            -
            require "active_support/core_ext/hash"
         
     | 
| 
       5 
     | 
    
         
            -
             
     | 
| 
       6 
     | 
    
         
            -
            # Please note that we use only lowercase letters for all methods. One should
         
     | 
| 
       7 
     | 
    
         
            -
            # normalize input streams before using the `stem` method.
         
     | 
| 
       8 
     | 
    
         
            -
            module TurkishStemmer
         
     | 
| 
       9 
     | 
    
         
            -
              extend self
         
     | 
| 
       10 
     | 
    
         
            -
             
     | 
| 
       11 
     | 
    
         
            -
              VOWELS                    = "üiıueöao"
         
     | 
| 
       12 
     | 
    
         
            -
              CONSONANTS                = "bcçdfgğhjklmnprsştvyz"
         
     | 
| 
       13 
     | 
    
         
            -
              ROUNDED_VOWELS            = "oöuü"
         
     | 
| 
       14 
     | 
    
         
            -
              UNROUNDED_VOWELS          = "iıea"
         
     | 
| 
       15 
     | 
    
         
            -
              FOLLOWING_ROUNDED_VOWELS  = "aeuü"
         
     | 
| 
       16 
     | 
    
         
            -
              FRONT_VOWELS              = "eiöü"
         
     | 
| 
       17 
     | 
    
         
            -
              BACK_VOWELS               = "ıuao"
         
     | 
| 
       18 
     | 
    
         
            -
             
     | 
| 
       19 
     | 
    
         
            -
              # Heuristic for the average length of a stemmed Turkish word
         
     | 
| 
       20 
     | 
    
         
            -
              AVG_STEMMED_SIZE          = 4
         
     | 
| 
       21 
     | 
    
         
            -
             
     | 
| 
       22 
     | 
    
         
            -
              # Regular expression that checks if the word contains only Turkish characters
         
     | 
| 
       23 
     | 
    
         
            -
              ALPHABET = Regexp.new("^[abcçdefgğhıijklmnoöprsştuüvyz]+$").freeze
         
     | 
| 
       24 
     | 
    
         
            -
             
     | 
| 
       25 
     | 
    
         
            -
              # Stems a Turkish word.
         
     | 
| 
       26 
     | 
    
         
            -
              #
         
     | 
| 
       27 
     | 
    
         
            -
              # Algorithm consists of 3 parts: pre-process, process and post-process. The
         
     | 
| 
       28 
     | 
    
         
            -
              # pre-process phase is a quick lookup for words that should not be stemmed
         
     | 
| 
       29 
     | 
    
         
            -
              # based on length, protected words list and vowel harmony. The process phase
         
     | 
| 
       30 
     | 
    
         
            -
              # includes a nominal verb suffix and a noun suffix stripper machine. The last
         
     | 
| 
       31 
     | 
    
         
            -
              # phase includes some additional checks and a simple stem selection decision.
         
     | 
| 
       32 
     | 
    
         
            -
              #
         
     | 
| 
       33 
     | 
    
         
            -
              # @param original_word [String] the word to stem
         
     | 
| 
       34 
     | 
    
         
            -
              # @return [String] the stemmed word
         
     | 
| 
       35 
     | 
    
         
            -
              def stem(original_word)
         
     | 
| 
       36 
     | 
    
         
            -
                # Preprocess
         
     | 
| 
       37 
     | 
    
         
            -
                return original_word if !proceed_to_stem?(original_word)
         
     | 
| 
       38 
     | 
    
         
            -
             
     | 
| 
       39 
     | 
    
         
            -
                word = original_word.dup
         
     | 
| 
       40 
     | 
    
         
            -
             
     | 
| 
       41 
     | 
    
         
            -
                # Process
         
     | 
| 
       42 
     | 
    
         
            -
                stems = []
         
     | 
| 
       43 
     | 
    
         
            -
                stems << nominal_verbs_suffix_machine { word }
         
     | 
| 
       44 
     | 
    
         
            -
                stems << original_word
         
     | 
| 
       45 
     | 
    
         
            -
                stems.flatten!.uniq!
         
     | 
| 
       46 
     | 
    
         
            -
                stems << stems.map { |word| noun_suffix_machine { word }}
         
     | 
| 
       47 
     | 
    
         
            -
                stems << original_word
         
     | 
| 
       48 
     | 
    
         
            -
                stems.flatten!.uniq!
         
     | 
| 
       49 
     | 
    
         
            -
                stems << stems.map { |word| derivational_suffix_machine { word }}
         
     | 
| 
       50 
     | 
    
         
            -
             
     | 
| 
       51 
     | 
    
         
            -
                # Postprocess
         
     | 
| 
       52 
     | 
    
         
            -
                stem_post_process(stems, original_word)
         
     | 
| 
       53 
     | 
    
         
            -
              end
         
     | 
| 
       54 
     | 
    
         
            -
             
     | 
| 
       55 
     | 
    
         
            -
              # Loads yaml file and symbolizes keys
         
     | 
| 
       56 
     | 
    
         
            -
              #
         
     | 
| 
       57 
     | 
    
         
            -
              # @param file [String] path to yaml file
         
     | 
| 
       58 
     | 
    
         
            -
              # @return [Hash] the hash with symbols as keys
         
     | 
| 
       59 
     | 
    
         
            -
              def load_states_or_suffixes(file)
         
     | 
| 
       60 
     | 
    
         
            -
                config_path = File.expand_path("../../#{file}", __FILE__)
         
     | 
| 
       61 
     | 
    
         
            -
             
     | 
| 
       62 
     | 
    
         
            -
                YAML.load_file(config_path).symbolize_keys
         
     | 
| 
       63 
     | 
    
         
            -
              rescue => e
         
     | 
| 
       64 
     | 
    
         
            -
                raise "An error occured loading #{file}, #{e}"
         
     | 
| 
       65 
     | 
    
         
            -
              end
         
     | 
| 
       66 
     | 
    
         
            -
             
     | 
| 
       67 
     | 
    
         
            -
              # Helper method for loading settings
         
     | 
| 
       68 
     | 
    
         
            -
              #
         
     | 
| 
       69 
     | 
    
         
            -
              # @param key [String] the key
         
     | 
| 
       70 
     | 
    
         
            -
              def load_settings(key)
         
     | 
| 
       71 
     | 
    
         
            -
                config_path = File.expand_path("../../config/stemmer.yml", __FILE__)
         
     | 
| 
       72 
     | 
    
         
            -
             
     | 
| 
       73 
     | 
    
         
            -
                begin
         
     | 
| 
       74 
     | 
    
         
            -
                  YAML.load_file(config_path)[key]
         
     | 
| 
       75 
     | 
    
         
            -
                rescue => e
         
     | 
| 
       76 
     | 
    
         
            -
                  raise "Please provide a valid config/stemmer.yml file, #{e}"
         
     | 
| 
       77 
     | 
    
         
            -
                end
         
     | 
| 
       78 
     | 
    
         
            -
              end
         
     | 
| 
       79 
     | 
    
         
            -
             
     | 
| 
       80 
     | 
    
         
            -
              NOMINAL_VERB_STATES   = load_states_or_suffixes("config/nominal_verb_states.yml")
         
     | 
| 
       81 
     | 
    
         
            -
              NOMINAL_VERB_SUFFIXES = load_states_or_suffixes("config/nominal_verb_suffixes.yml")
         
     | 
| 
       82 
     | 
    
         
            -
             
     | 
| 
       83 
     | 
    
         
            -
              NOUN_STATES   = load_states_or_suffixes("config/noun_states.yml")
         
     | 
| 
       84 
     | 
    
         
            -
              NOUN_SUFFIXES = load_states_or_suffixes("config/noun_suffixes.yml")
         
     | 
| 
       85 
     | 
    
         
            -
             
     | 
| 
       86 
     | 
    
         
            -
              DERIVATIONAL_STATES = load_states_or_suffixes("config/derivational_states.yml")
         
     | 
| 
       87 
     | 
    
         
            -
              DERIVATIONAL_SUFFIXES = load_states_or_suffixes("config/derivational_suffixes.yml")
         
     | 
| 
       88 
     | 
    
         
            -
             
     | 
| 
       89 
     | 
    
         
            -
              ##
         
     | 
| 
       90 
     | 
    
         
            -
              # Load settings
         
     | 
| 
       91 
     | 
    
         
            -
              #
         
     | 
| 
       92 
     | 
    
         
            -
              # Protected words
         
     | 
| 
       93 
     | 
    
         
            -
              PROTECTED_WORDS = load_settings("protected_words")
         
     | 
| 
       94 
     | 
    
         
            -
             
     | 
| 
       95 
     | 
    
         
            -
              # Last consonant exceptions
         
     | 
| 
       96 
     | 
    
         
            -
              LAST_CONSONANT_EXCEPTIONS = load_settings("last_consonant_exceptions")
         
     | 
| 
       97 
     | 
    
         
            -
             
     | 
| 
       98 
     | 
    
         
            -
              # Vowel harmony exceptions
         
     | 
| 
       99 
     | 
    
         
            -
              VOWEL_HARMONY_EXCEPTIONS  = load_settings("vowel_harmony_exceptions")
         
     | 
| 
       100 
     | 
    
         
            -
             
     | 
| 
       101 
     | 
    
         
            -
              # Selection list exceptions
         
     | 
| 
       102 
     | 
    
         
            -
              SELECTION_LIST_EXCEPTIONS = load_settings("selection_list_exceptions")
         
     | 
| 
       103 
     | 
    
         
            -
             
     | 
| 
       104 
     | 
    
         
            -
              # Counts syllables of a Turkish word. In Turkish the number of syllables is
         
     | 
| 
       105 
     | 
    
         
            -
              # equal to the number of vowels.
         
     | 
| 
       106 
     | 
    
         
            -
              #
         
     | 
| 
       107 
     | 
    
         
            -
              # @param word [String] the word to count its syllables
         
     | 
| 
       108 
     | 
    
         
            -
              # @return [Fixnum] the number of syllables
         
     | 
| 
       109 
     | 
    
         
            -
              def count_syllables(word)
                # One syllable per vowel, so the syllable count is the vowel count.
                vowels(word).length
              end
         
     | 
| 
       112 
     | 
    
         
            -
             
     | 
| 
       113 
     | 
    
         
            -
              # Gets the vowels of a word
         
     | 
| 
       114 
     | 
    
         
            -
              #
         
     | 
| 
       115 
     | 
    
         
            -
              # @param word [String] the word to get its vowels
         
     | 
| 
       116 
     | 
    
         
            -
              # @return [Array] array of vowels
         
     | 
| 
       117 
     | 
    
         
            -
              def vowels(word)
                # Keep, in order, every character that is not listed in CONSONANTS.
                word.each_char.reject { |char| CONSONANTS.include?(char) }
              end
         
     | 
| 
       120 
     | 
    
         
            -
             
     | 
| 
       121 
     | 
    
         
            -
              # Checks vowel harmony of a word according to Turkish vowel harmony.
         
     | 
| 
       122 
     | 
    
         
            -
              #
         
     | 
| 
       123 
     | 
    
         
            -
              # @param word [String] the word to be checked against Turkish vowel harmony
         
     | 
| 
       124 
     | 
    
         
            -
              # @return [Boolean]
         
     | 
| 
       125 
     | 
    
         
            -
              # @see https://en.wikipedia.org/wiki/Vowel_harmony#Turkish
         
     | 
| 
       126 
     | 
    
         
            -
              def has_vowel_harmony?(word)
                # Only the last two vowels are compared; when the word has fewer
                # than two vowels the missing positions are nil.
                previous_vowel, last_vowel = vowels(word).values_at(-2, -1)

                vowel_harmony?(previous_vowel, last_vowel)
              end
         
     | 
| 
       133 
     | 
    
         
            -
             
     | 
| 
       134 
     | 
    
         
            -
              # Checks vowel harmony between two vowels
         
     | 
| 
       135 
     | 
    
         
            -
              #
         
     | 
| 
       136 
     | 
    
         
            -
              # @param vowel [String] the first vowel
         
     | 
| 
       137 
     | 
    
         
            -
              # @param candidate [String] the second vowel
         
     | 
| 
       138 
     | 
    
         
            -
              # @return [Boolean]
         
     | 
| 
       139 
     | 
    
         
            -
              # @see https://en.wikipedia.org/wiki/Vowel_harmony#Turkish
         
     | 
| 
       140 
     | 
    
         
            -
              def vowel_harmony?(vowel, candidate)
                # Harmony requires the roundness rule and the frontness rule to hold;
                # frontness is only consulted when roundness passes.
                roundness_ok = has_roundness?(vowel, candidate)
                roundness_ok && has_frontness?(vowel, candidate)
              end
         
     | 
| 
       143 
     | 
    
         
            -
             
     | 
| 
       144 
     | 
    
         
            -
              # Checks roundness vowel harmony of two vowels according to Turkish vowel
         
     | 
| 
       145 
     | 
    
         
            -
              # harmony.
         
     | 
| 
       146 
     | 
    
         
            -
              #
         
     | 
| 
       147 
     | 
    
         
            -
              # @param vowel [String] the first vowel
         
     | 
| 
       148 
     | 
    
         
            -
              # @param candidate [String] the second vowel
         
     | 
| 
       149 
     | 
    
         
            -
              # @return [Boolean]
         
     | 
| 
       150 
     | 
    
         
            -
              # @see https://en.wikipedia.org/wiki/Vowel_harmony#Turkish
         
     | 
| 
       151 
     | 
    
         
            -
              def has_roundness?(vowel, candidate)
                # A missing or empty vowel on either side is treated as harmonious.
                return true if [vowel, candidate].any? { |v| v.nil? || v.empty? }

                # Unrounded vowel followed by another unrounded vowel.
                both_unrounded = UNROUNDED_VOWELS.include?(vowel) &&
                                 UNROUNDED_VOWELS.include?(candidate)
                return true if both_unrounded

                # Rounded vowel followed by one of the vowels allowed after it.
                ROUNDED_VOWELS.include?(vowel) && FOLLOWING_ROUNDED_VOWELS.include?(candidate)
              end
         
     | 
| 
       162 
     | 
    
         
            -
             
     | 
| 
       163 
     | 
    
         
            -
              # Checks frontness vowel harmony of two vowels according to Turkish vowel
         
     | 
| 
       164 
     | 
    
         
            -
              # harmony.
         
     | 
| 
       165 
     | 
    
         
            -
              #
         
     | 
| 
       166 
     | 
    
         
            -
              # @param vowel [String] the first vowel
         
     | 
| 
       167 
     | 
    
         
            -
              # @param candidate [String] the second vowel
         
     | 
| 
       168 
     | 
    
         
            -
              # @return [Boolean]
         
     | 
| 
       169 
     | 
    
         
            -
              # @see https://en.wikipedia.org/wiki/Vowel_harmony#Turkish
         
     | 
| 
       170 
     | 
    
         
            -
              def has_frontness?(vowel, candidate)
                # A missing or empty vowel on either side is treated as harmonious.
                return true if [vowel, candidate].any? { |v| v.nil? || v.empty? }

                # Both vowels must come from the same group: front or back.
                both_front = FRONT_VOWELS.include?(vowel) && FRONT_VOWELS.include?(candidate)
                return true if both_front

                BACK_VOWELS.include?(vowel) && BACK_VOWELS.include?(candidate)
              end
         
     | 
| 
       181 
     | 
    
         
            -
             
     | 
| 
       182 
     | 
    
         
            -
              # Checks whether a word can be stemmed or not. This method checks candidate
         
     | 
| 
       183 
     | 
    
         
            -
              # word against nil, protected, length and vowel harmony.
         
     | 
| 
       184 
     | 
    
         
            -
              #
         
     | 
| 
       185 
     | 
    
         
            -
              # @param word [String] the candidate word for stemming
         
     | 
| 
       186 
     | 
    
         
            -
              # @return [Boolean] whether should proceed to stem or not
         
     | 
| 
       187 
     | 
    
         
            -
              def proceed_to_stem?(word)
                # Each guard rules out one kind of word that is never stemmed.
                return false if word.nil?
                return false unless turkish?(word)
                return false if PROTECTED_WORDS.include?(word)

                # Words with at most one syllable are left untouched.
                count_syllables(word) > 1
              end
         
     | 
| 
       197 
     | 
    
         
            -
             
     | 
| 
       198 
     | 
    
         
            -
              # Post stemming process
         
     | 
| 
       199 
     | 
    
         
            -
              #
         
     | 
| 
       200 
     | 
    
         
            -
              # @param   stems          [Array]   array of candidate stems
         
     | 
| 
       201 
     | 
    
         
            -
              # @param   original_word  [String]  the original word
         
     | 
| 
       202 
     | 
    
         
            -
              # @return                 [String]  the stemmed or the original word
         
     | 
| 
       203 
     | 
    
         
            -
              def stem_post_process(stems, original_word)
                puts "post process for #{original_word}: #{stems}" if ENV['DEBUG']

                # Flatten nested candidate lists and drop duplicates, the unchanged
                # word and every candidate without a syllable.
                candidates = stems.flatten.uniq
                candidates.reject! do |candidate|
                  candidate == original_word || count_syllables(candidate) == 0
                end

                # Apply the last consonant rule to each remaining candidate.
                candidates.map! { |candidate| last_consonant!(candidate) }

                # Order by distance from AVG_STEMMED_SIZE; on equal distance the
                # shorter candidate comes first.
                candidates.sort! do |left, right|
                  [(left.size - AVG_STEMMED_SIZE).abs, left.size] <=>
                    [(right.size - AVG_STEMMED_SIZE).abs, right.size]
                end

                # A candidate listed in SELECTION_LIST_EXCEPTIONS wins outright.
                exceptions = candidates & SELECTION_LIST_EXCEPTIONS
                return exceptions.first unless exceptions.empty?

                # Otherwise take the best-ranked candidate, or fall back to the input.
                return original_word if candidates.empty?

                candidates.first
              end
         
     | 
| 
       236 
     | 
    
         
            -
             
     | 
| 
       237 
     | 
    
         
            -
              # Given a state key and a word, scans through given states and generates valid
         
     | 
| 
       238 
     | 
    
         
            -
              # pending transitions.
         
     | 
| 
       239 
     | 
    
         
            -
              #
         
     | 
| 
       240 
     | 
    
         
            -
              # @param key [String] the key for states hash
         
     | 
| 
       241 
     | 
    
         
            -
              # @param word [String] the word to check
         
     | 
| 
       242 
     | 
    
         
            -
              # @param states [Hash] the states hash
         
     | 
| 
       243 
     | 
    
         
            -
              # @param suffixes [Hash] the suffixes hash
         
     | 
| 
       244 
     | 
    
         
            -
              # @param options [Hash] options for pendings
         
     | 
| 
       245 
     | 
    
         
            -
              # @option options [Boolean] :mark Whether this pending is marked for deletion
         
     | 
| 
       246 
     | 
    
         
            -
              # @return [Array] array of pendings
         
     | 
| 
       247 
     | 
    
         
            -
              def generate_pendings(key, word, states, suffixes, options = {})
                state = states[key]
                raise ArgumentError, "State #{key} does not exist" if state.nil?

                marked = options[:mark] || false

                # Emit one pending record for every transition whose suffix regex
                # matches the end of the word.
                state["transitions"].each_with_object([]) do |transition, pendings|
                  suffix_key = transition["suffix"]
                  next unless word.match(/(#{suffixes[suffix_key]["regex"]})$/)

                  pendings << {
                    suffix: suffix_key,
                    to_state: transition["state"],
                    from_state: key,
                    word: word,
                    mark: marked
                  }
                end
              end
         
     | 
| 
       265 
     | 
    
         
            -
             
     | 
| 
       266 
     | 
    
         
            -
              # Given a suffix it stems a word according to Turkish orthographic rules
         
     | 
| 
       267 
     | 
    
         
            -
              #
         
     | 
| 
       268 
     | 
    
         
            -
              # @param word [String] the word to stem
         
     | 
| 
       269 
     | 
    
         
            -
              # @param suffix [Hash] a suffix record
         
     | 
| 
       270 
     | 
    
         
            -
              # @return [Hash] a stem answer record
         
     | 
| 
       271 
     | 
    
         
            -
              def mark_stem(word, suffix)
                # NOTE(review): protected words are only excluded when the suffix asks
                # for a harmony check; without that flag every word is eligible.
                # Confirm this is intended.
                stem = if suffix["check_harmony"]
                         !PROTECTED_WORDS.include?(word) &&
                           (has_vowel_harmony?(word) || VOWEL_HARMONY_EXCEPTIONS.include?(word))
                       else
                         true
                       end

                unchanged = { stem: false, word: word, suffix_applied: nil }

                match = stem && word.match(/(#{suffix["regex"]})$/)
                return unchanged unless match

                # Strip the concrete text that matched the suffix regex.
                suffix_applied = match.to_s
                new_word       = word.gsub(/(#{suffix_applied})$/, '')

                optional_letter = suffix["optional_letter"]
                if optional_letter
                  answer, letter = valid_optional_letter?(new_word, optional_letter)

                  # An invalid optional letter cancels the whole stemming step.
                  return unchanged unless answer

                  if letter
                    # The optional letter belongs to the suffix, not to the stem.
                    new_word       = new_word.chop
                    suffix_applied = letter + suffix_applied
                  end
                end

                { stem: stem, word: new_word, suffix_applied: suffix_applied }
              end
         
     | 
| 
       303 
     | 
    
         
            -
             
     | 
| 
       304 
     | 
    
         
            -
              # Given a word and a letter it checks if the optional letter can be part of
         
     | 
| 
       305 
     | 
    
         
            -
              # the stem or not.
         
     | 
| 
       306 
     | 
    
         
            -
              #
         
     | 
| 
       307 
     | 
    
         
            -
              # @param word [String] the examined word
         
     | 
| 
       308 
     | 
    
         
            -
              # @param letter [String] a single letter or a string armed with a regular
         
     | 
| 
       309 
     | 
    
         
            -
              #   expression
         
     | 
| 
       310 
     | 
    
         
            -
              # @return [Array] the answer is returned as an array. First element is a
         
     | 
| 
       311 
     | 
    
         
            -
              #   Boolean value and second element is the matched character.
         
     | 
| 
       312 
     | 
    
         
            -
              # @example
         
     | 
| 
       313 
     | 
    
         
            -
              #   self.valid_optional_letter?("test", "t")
         
     | 
| 
       314 
     | 
    
         
            -
              #   # => [false, 't']
         
     | 
| 
       315 
     | 
    
         
            -
              def valid_optional_letter?(word, letter)
                # Returns [answer, matched_char]: answer is always true or false and
                # matched_char is the text matched at the end of the word, or nil.
                match = word.match(/(#{letter})$/)

                # The optional letter is not present at the end of the word, so
                # there is nothing to validate.
                return [true, nil] unless match

                matched_char  = match.to_s
                previous_char = word[-2]

                # A matched vowel must be preceded by a consonant and a matched
                # consonant by a vowel.
                required_before = VOWELS.include?(matched_char) ? CONSONANTS : VOWELS

                # When the word has no character before the match the check fails
                # with an explicit false rather than nil, keeping the first element
                # a real Boolean as documented.
                answer = !previous_char.nil? && required_before.include?(previous_char)

                [answer, matched_char]
              end
         
     | 
| 
       333 
     | 
    
         
            -
             
     | 
| 
       334 
     | 
    
         
            -
              # Transforms a word taking into account the last consonant rule.
         
     | 
| 
       335 
     | 
    
         
            -
              #
         
     | 
| 
       336 
     | 
    
         
            -
              # @param word [String] the word to check for last consonant change
         
     | 
| 
       337 
     | 
    
         
            -
              # @return [String] the changed word
         
     | 
| 
       338 
     | 
    
         
            -
              def last_consonant!(word)
                return word if LAST_CONSONANT_EXCEPTIONS.include?(word)

                # Final consonants that get rewritten, mapped to their replacement.
                replacements = { 'b' => 'p', 'c' => 'ç', 'd' => 't', 'ğ' => 'k' }
                replacement  = replacements[word[-1]]

                # The given string itself is modified; no copy is made.
                word[-1] = replacement if replacement

                word
              end
         
     | 
| 
       350 
     | 
    
         
            -
             
     | 
| 
       351 
     | 
    
         
            -
              # Helper method. This is just a shortcut.
         
     | 
| 
       352 
     | 
    
         
            -
              def nominal_verbs_suffix_machine
                # The block supplies the word; the nominal verb tables drive the stripper.
                word    = yield
                options = {
                  states: self::NOMINAL_VERB_STATES,
                  suffixes: self::NOMINAL_VERB_SUFFIXES
                }

                affix_morphological_stripper(word, options)
              end
         
     | 
| 
       356 
     | 
    
         
            -
             
     | 
| 
       357 
     | 
    
         
            -
              # Helper method. This is just a shortcut.
         
     | 
| 
       358 
     | 
    
         
            -
              def noun_suffix_machine
                # The block supplies the word; the noun tables drive the stripper.
                word    = yield
                options = {
                  states: self::NOUN_STATES,
                  suffixes: self::NOUN_SUFFIXES
                }

                affix_morphological_stripper(word, options)
              end
         
     | 
| 
       362 
     | 
    
         
            -
             
     | 
| 
       363 
     | 
    
         
            -
              # Helper method
         
     | 
| 
       364 
     | 
    
         
            -
              def derivational_suffix_machine
                # The block supplies the word; the derivational tables drive the stripper.
                word    = yield
                options = {
                  states: self::DERIVATIONAL_STATES,
                  suffixes: self::DERIVATIONAL_SUFFIXES
                }

                affix_morphological_stripper(word, options)
              end
         
     | 
| 
       368 
     | 
    
         
            -
             
     | 
| 
       369 
     | 
    
         
            -
              # A simple algorithm to strip suffixes from a word based on states and
         
     | 
| 
       370 
     | 
    
         
            -
              # transitions.
         
     | 
| 
       371 
     | 
    
         
            -
              #
         
     | 
| 
       372 
     | 
    
         
            -
              # @param  word    [String]  the word to strip affixes from
         
     | 
| 
       373 
     | 
    
         
            -
              # @param  options [Hash]    options for the algorithm
         
     | 
| 
       374 
     | 
    
         
            -
              # @option options [Hash]    :states The states and valid transitions
         
     | 
| 
       375 
     | 
    
         
            -
              # @option options [Hash]    :suffixes The suffixes with their rules
         
     | 
| 
       376 
     | 
    
         
            -
              # @return         [Array]   all possible stem versions
         
     | 
| 
       377 
     | 
    
         
            -
              def affix_morphological_stripper(word, options = {})
                states   = options[:states]   || {}
                suffixes = options[:suffixes] || {}

                # Without states or suffixes there is nothing to strip, so the input
                # is handed back untouched.
                return [word] if states.empty? || suffixes.empty?

                stems = []
                # Seed the work list with the transitions generated for state :a.
                pendings = generate_pendings(:a, word, states, suffixes)

                until pendings.empty?
                  transition = pendings.shift
                  # Keep the word under inspection in its own local: reusing +word+
                  # here would clobber the caller's input, and the fallback at the
                  # bottom would then return a partially stripped word that never
                  # reached a final state.
                  current  = transition[:word]
                  suffix   = suffixes[transition[:suffix]]
                  to_state = states[transition[:to_state]]
                  answer   = mark_stem(current, suffix)

                  # Only transitions whose suffix was actually stripped are followed.
                  next unless answer[:stem] == true

                  if ENV['DEBUG']
                    puts "Word: #{current} \nAnswer: #{answer} \nInfo: #{transition} \nSuffix: #{suffix}"
                  end

                  if to_state["final_state"] == true
                    # A valid transition was found, so pendings sharing its
                    # from/to signature and every marked pending are discarded.
                    remove_pendings_like!(transition, pendings)
                    remove_mark_pendings!(pendings)

                    stems.push answer[:word]

                    # Keep stripping from the reached state, depth first.
                    unless to_state["transitions"].empty?
                      pendings.unshift(*generate_pendings(transition[:to_state], answer[:word], states, suffixes))
                    end
                  else
                    # Not a final state: mark the pendings that share this
                    # transition's signature and queue the follow-up transitions as
                    # marked, so they are dropped once any final state is reached.
                    mark_pendings!(transition, pendings)
                    pendings.unshift(*generate_pendings(transition[:to_state], answer[:word],
                      states, suffixes, mark: true))
                  end
                end

                # No final state was ever reached: fall back to the original input.
                return [word] if stems.empty?

                stems.uniq
              end
         
     | 
| 
       424 
     | 
    
         
            -
             
     | 
| 
       425 
     | 
    
         
            -
              private
         
     | 
| 
       426 
     | 
    
         
            -
             
     | 
| 
       427 
     | 
    
         
            -
              # Deletes from +array+, in place, every pending whose :to_state and
              # :from_state both equal those of +pending+.
              #
              # @param  pending [Hash]  the reference pending transition
              # @param  array   [Array] the pendings to prune (mutated)
              # @return         [Array, nil] +array+, or nil when nothing was removed
              def remove_pendings_like!(pending, array)
                array.reject! do |other|
                  [:to_state, :from_state].all? { |key| other[key] == pending[key] }
                end
              end
         
     | 
| 
       433 
     | 
    
         
            -
             
     | 
| 
       434 
     | 
    
         
            -
              # Sets the :mark flag to true on every pending in +array+ that
              # similar_pendings selects for +pending+.
              #
              # @param  pending [Hash]  the reference pending transition
              # @param  array   [Array] the pendings whose entries get marked
              def mark_pendings!(pending, array)
                similar_pendings(pending, array).each { |match| match[:mark] = true }
              end
         
     | 
| 
       439 
     | 
    
         
            -
             
     | 
| 
       440 
     | 
    
         
            -
              # Deletes from +array+, in place, every pending whose :mark flag
              # equals true.
              #
              # @param  array [Array] the pendings to prune (mutated)
              # @return       [Array, nil] +array+, or nil when nothing was removed
              def remove_mark_pendings!(array)
                array.reject! do |pending|
                  pending[:mark] == true
                end
              end
         
     | 
| 
       443 
     | 
    
         
            -
             
     | 
| 
       444 
     | 
    
         
            -
              # Collects the pendings in +array+ whose :to_state and :from_state
              # both equal those of +pending+. +array+ itself is not modified.
              #
              # @param  pending [Hash]  the reference pending transition
              # @param  array   [Array] the pendings to search
              # @return         [Array] a new array holding the matching pendings
              def similar_pendings(pending, array)
                array.select do |other|
                  next false unless other[:to_state] == pending[:to_state]

                  other[:from_state] == pending[:from_state]
                end
              end
         
     | 
| 
       450 
     | 
    
         
            -
             
     | 
| 
       451 
     | 
    
         
            -
              # Tells whether +word+ matches the ALPHABET pattern.
              # NOTE(review): ALPHABET is defined outside this section; presumably
              # it describes the Turkish alphabet - confirm against its definition.
              #
              # @param  word [String] the word to check
              # @return      [Boolean] true when +word+ matches, false otherwise
              def turkish?(word)
                word.match(ALPHABET) ? true : false
              end
         
     | 
| 
       454 
     | 
    
         
            -
             
     | 
| 
       455 
     | 
    
         
            -
            end
         
     |