turkish_stemmer 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +18 -0
- data/.rspec +2 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +22 -0
- data/README.md +282 -0
- data/Rakefile +21 -0
- data/benchmarks/stemmers_comparison.rb +16 -0
- data/benchmarks/stemming_samples.txt +17916 -0
- data/benchmarks/turkish_word_recognition.rb +26 -0
- data/config/derivational_states.yml +10 -0
- data/config/derivational_suffixes.yml +6 -0
- data/config/nominal_verb_states.yml +121 -0
- data/config/nominal_verb_suffixes.yml +90 -0
- data/config/noun_states.yml +177 -0
- data/config/noun_suffixes.yml +113 -0
- data/config/stemmer.yml +206 -0
- data/lib/hash_extension.rb +5 -0
- data/lib/turkish_stemmer/version.rb +3 -0
- data/lib/turkish_stemmer.rb +455 -0
- data/spec/fixtures/simple_state.yml +14 -0
- data/spec/fixtures/simple_state_02.yml +21 -0
- data/spec/fixtures/simple_suffix.yml +7 -0
- data/spec/fixtures/simple_transition.yml +7 -0
- data/spec/spec_helper.rb +19 -0
- data/spec/support/fixtures.csv +101 -0
- data/spec/turkish_stemmer_spec.rb +522 -0
- data/turkish_stemmer.gemspec +35 -0
- metadata +164 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: b12eb0732e274df49f05ff4158522ccfd89f4a7e
|
4
|
+
data.tar.gz: 9b0723baf1eb0fb864c82bd17cec6e8966e4fde5
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: de27efe76f051fd747cf0ee9b59fb47ed61e3664d8b20c234adc00d194b67419a16de3a2eb5c6d5f8135a4cb659f7b59d5114a6e719cb507bac574a2bca08a3d
|
7
|
+
data.tar.gz: 3004837f529b305bad6e309a4989603b146aec667059e6e1fd0a281b0ed3e8ee38fe6d6ee13d96c25e9d2343dffdd2802b11bb52ba5940fa1d8b7062969c08ea
|
data/.gitignore
ADDED
data/.rspec
ADDED
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2014 Skroutz SA
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,282 @@
|
|
1
|
+
# TurkishStemmer
|
2
|
+
|
3
|
+
Stemmer algorithm for the Turkish language.
|
4
|
+
|
5
|
+
## Introduction to Turkish language morphology
|
6
|
+
|
7
|
+
> Turkish is an agglutinative language and has a very rich morphological
|
8
|
+
structure. In Turkish, you can form many different words from a single stem by
|
9
|
+
appending a sequence of suffixes. For example, the word "doktoruymuşsunuz"
|
10
|
+
means "You had been the doctor of him". The stem of the word is "doktor" and
|
11
|
+
it takes three different suffixes -sU, -ymUş, and -sUnUz.
|
12
|
+
|
13
|
+
From "Snowball Description":
|
14
|
+
|
15
|
+
> Words are usually composed of a stem and of at least two or three affixes
|
16
|
+
appended to it.
|
17
|
+
|
18
|
+
> We can analyze noun suffixes in Turkish in two groups. Noun suffixes (eg.
|
19
|
+
"doktor-um" meaning "my doctor") and nominal verb suffixes (eg. "doktor-dur"
|
20
|
+
meaning ‘is a doctor’). The words ending with nominal verb suffixes can be
|
21
|
+
used as verbs in sentences. There are over thirty different suffixes
|
22
|
+
classified in these two general groups of suffixes.
|
23
|
+
|
24
|
+
> In Turkish, the suffixes are affixed to the stem according to definite
|
25
|
+
ordering rules.
|
26
|
+
|
27
|
+
From "An affix stripping morphological analyzer for Turkish" paper:
|
28
|
+
|
29
|
+
> Turkish has a special place within the natural languages not only being a
|
30
|
+
fully concatenative language but also having the suffixes as the only affix
|
31
|
+
type. Another feature of the language is that, someone who knows Turkish can
|
32
|
+
easily analyze a word even if he/she does not know its stem.
|
33
|
+
|
34
|
+
> The phonological rules of Turkish are significant factors that influence
|
35
|
+
this feature.
|
36
|
+
Ex: (any word)lerim => (any word)-ler-im
|
37
|
+
"ler" plural suffix, "im" 1st singular person possessive.
|
38
|
+
|
39
|
+
### Rules
|
40
|
+
|
41
|
+
1. The only affix type in Turkish is the suffix.
|
42
|
+
|
43
|
+
2. A plural suffix cannot follow a possessive suffix.
|
44
|
+
|
45
|
+
3. A suffix in Turkish can have multiple allomorphs in order to provide sound
|
46
|
+
harmony in the word to which it is affixed.
|
47
|
+
|
48
|
+
4. In Turkish each vowel indicates a distinct syllable.
|
49
|
+
|
50
|
+
5. In Turkish, single syllable words are mostly the stem itself
|
51
|
+
|
52
|
+
6. If a word has nominal __verb__ suffixes, they always appear at the end of
|
53
|
+
the word. They follow __noun__ suffixes or the stem itself in the absence
|
54
|
+
of noun suffixes
|
55
|
+
|
56
|
+
7. In Turkish, “-lAr” suffix can be used both as a nominal verb suffix (third
|
57
|
+
person plural present tense) and as a noun suffix (plural inflection).
|
58
|
+
|
59
|
+
8. In Turkish, words do not end with consonants 'b', 'c', 'd', and 'ğ'.
|
60
|
+
However, when a suffix starting with a vowel is affixed to a word ending
|
61
|
+
with 'p', 'ç', 't' or 'k', the last consonant is transformed into 'b', 'c',
|
62
|
+
'd', or 'ğ' respectively. The postlude routine transforms last consonants
|
63
|
+
'b', 'c', 'd', or 'ğ' back to 'p', 'ç', 't' or 'k', respectively, after
|
64
|
+
stemming is complete.
|
65
|
+
|
66
|
+
### Suffix Classes
|
67
|
+
|
68
|
+
Class | Type
|
69
|
+
-----------------------------|----------------
|
70
|
+
Nominal verb suffixes | Inflectional
|
71
|
+
Derivational suffixes | Derivational
|
72
|
+
Noun suffixes | Inflectional
|
73
|
+
Tense & person verb suffixes | Inflectional
|
74
|
+
Verb suffixes | Inflectional
|
75
|
+
|
76
|
+
### Suffix allomorphs
|
77
|
+
|
78
|
+
Suffix allomorphs are used to create a good sound harmony. They do not change
|
79
|
+
the meaning of the word. If a suffix has a capital letter then it has an
|
80
|
+
allomorph. If a suffix has a letter in parentheses then it can be omitted.
|
81
|
+
Possible allomorphs are given below:
|
82
|
+
|
83
|
+
Letter | Allomorph
|
84
|
+
-------|------------
|
85
|
+
U | ı,i,u,ü
|
86
|
+
C | c,ç
|
87
|
+
A | a,e
|
88
|
+
D | d,t
|
89
|
+
I | ı,i
|
90
|
+
|
91
|
+
### Nominal Verb Suffixes
|
92
|
+
|
93
|
+
a/a | Suffix
|
94
|
+
----|------------------
|
95
|
+
1 | –(y)Um
|
96
|
+
2 | –sUn
|
97
|
+
3 | –(y)Uz
|
98
|
+
4 | –sUnUz
|
99
|
+
5 | –lAr
|
100
|
+
6 | –m
|
101
|
+
7 | –n
|
102
|
+
8 | –k
|
103
|
+
9 | –nUz
|
104
|
+
10 | –DUr
|
105
|
+
11 | –cAsInA
|
106
|
+
12 | –(y)DU
|
107
|
+
13 | –(y)sA
|
108
|
+
14 | –(y)mUş
|
109
|
+
15 | –(y)ken
|
110
|
+
|
111
|
+
Suffix transition ordering for nominal verbs can be seen in References[5]
|
112
|
+
|
113
|
+
### Noun Suffixes
|
114
|
+
|
115
|
+
a/a | Suffixes
|
116
|
+
----|-------------
|
117
|
+
1 | –lAr
|
118
|
+
2 | –(U)m
|
119
|
+
3 | –(U)mUz
|
120
|
+
4 | –(U)n
|
121
|
+
5 | –(U)nUz
|
122
|
+
6 | –(s)U
|
123
|
+
7 | –lArI
|
124
|
+
8 | –(y)U
|
125
|
+
9 | –nU
|
126
|
+
10 | –(n)Un
|
127
|
+
11 | –(y)A
|
128
|
+
12 | –nA
|
129
|
+
13 | –DA
|
130
|
+
14 | –nDA
|
131
|
+
15 | –DAn
|
132
|
+
16 | –nDAn
|
133
|
+
17 | –(y)lA
|
134
|
+
18 | –ki
|
135
|
+
19 | –(n)cA
|
136
|
+
|
137
|
+
Suffix transition ordering for nouns can be seen in References[5]
|
138
|
+
|
139
|
+
### Derivational Suffixes
|
140
|
+
|
141
|
+
a/a | Suffixes
|
142
|
+
----|----------
|
143
|
+
1 | –lUk
|
144
|
+
2 | –CU
|
145
|
+
3 | –CUk
|
146
|
+
4 | –lAş
|
147
|
+
5 | –lA
|
148
|
+
6 | –lAn
|
149
|
+
7 | –CA
|
150
|
+
8 | –lU
|
151
|
+
9 | –sUz
|
152
|
+
|
153
|
+
Initially, we will handle only a small subset of the above suffixes which are
|
154
|
+
more common in our domain.
|
155
|
+
|
156
|
+
### Vowel Harmony
|
157
|
+
|
158
|
+
This routine checks whether __the last two__ vowels of the word obey vowel
|
159
|
+
harmony rules. A brief description of Turkish vowel harmony follows.
|
160
|
+
|
161
|
+
Turkish vowel harmony is a two dimensional vowel harmony system, where vowels
|
162
|
+
are characterised by two features named frontness and roundness. There are
|
163
|
+
vowel harmony rules for each feature.
|
164
|
+
|
165
|
+
1. Vowel harmony rule for frontness: Vowels in Turkish are grouped into two
|
166
|
+
according to where they are produced. Front produced vowels are formed at
|
167
|
+
the front of the mouth ('e', 'i', 'ö', 'ü') and back produced vowels are
|
168
|
+
produced nearer to throat ('a', 'ı', 'o', 'u'). According to the vowel
|
169
|
+
harmony rule, words cannot contain both front and back vowels. This is one
|
170
|
+
of the reasons why suffixes containing vowels can take different forms to
|
171
|
+
obey vowel harmony.
|
172
|
+
|
173
|
+
2. Vowel harmony rule for roundness: Vowels in Turkish are grouped into two
|
174
|
+
according to whether lips are rounded while producing it. 'o', 'ö', 'u' and
|
175
|
+
'ü' are rounded vowels whereas 'a', 'e', 'ı' and 'i' are unrounded.
|
176
|
+
According to the vowel harmony rules, if the vowel of a syllable is
|
177
|
+
unrounded, the following vowel is unrounded as well. If the vowel of a
|
178
|
+
syllable is rounded, the following vowels are 'a', 'e', 'u' or 'ü'.
|
179
|
+
|
180
|
+
### Last consonant
|
181
|
+
|
182
|
+
Another interesting case in detecting suffixes in Turkish is that, for some
|
183
|
+
suffixes, if the word ends with a vowel, a consonant is inserted between the
|
184
|
+
rest of the word and the suffix. These merging consonants can be 'y', 'n' or
|
185
|
+
's'. When a merging consonant can be inserted before the suffix, the
|
186
|
+
representation of the suffix starts with the optional consonant surrounded by
|
187
|
+
parentheses (eg. –(y)Um, -(n)cA). For these kinds of suffixes, if existence of
|
188
|
+
a merging consonant is considered, the candidate stem is checked whether it
|
189
|
+
ends with a vowel.
|
190
|
+
|
191
|
+
If there is no 'y' consonant before the suffix, only the real part of the
|
192
|
+
suffix (eg. -Um) is marked for stemming. If there is a 'y' consonant and it is
|
193
|
+
preceded by a vowel, 'y' is treated as a merging consonant and both 'y' and
|
194
|
+
the candidate suffix (eg. -Um) are marked for stemming. If there is a consonant
|
195
|
+
just before 'y', the decision is that the consonant 'y' and the candidate
|
196
|
+
suffix are really a part of the stem. In such a case, cursor is not advanced
|
197
|
+
to prevent over-stemming. The last case can occur especially when the stem
|
198
|
+
originates from another language like in 'lityum' (meaning the element
|
199
|
+
Lithium). If the check for vowel harmony was not made, the word would be
|
200
|
+
stemmed to 'lit', for '–(y)Um' would be treated as a suffix affixed to it. But
|
201
|
+
according to morphological rules of Turkish, the final word would be 'litim',
|
202
|
+
not 'lityum' if 'lit' were really the stem of the word and the suffix '–(y)Um'
|
203
|
+
were affixed to it. So detecting 'lit' as the stem of the word would be
|
204
|
+
over-stemming.
|
205
|
+
|
206
|
+
### Merging Vowel
|
207
|
+
|
208
|
+
Similar to merging consonants, there are merging vowels for some suffixes
|
209
|
+
starting with consonants. They can be preceded by merging vowels like in the
|
210
|
+
'-(U)mUz' suffix when they are affixed to a stem ending with a consonant. In such a
|
211
|
+
case, a U vowel ('ı', 'i', 'u' or 'ü' depending on vowel harmony) is inserted
|
212
|
+
between the stem and real suffix (e.g. '-mUz') for ease of pronunciation.
|
213
|
+
|
214
|
+
### Some examples
|
215
|
+
|
216
|
+
Word / Analysis | Meaning / Stem
|
217
|
+
------------------------------ |--------------------------------
|
218
|
+
Kalelerimizdekilerden | From the ones at one of our castles
|
219
|
+
Kale-lAr-UmUz-DA-ki-lAr-DAn | Kale
|
220
|
+
Çocuğuymuşumcasına | As if I were her child
|
221
|
+
Çocuk-(s)U-(y)mUş-(y)Um-cAsInA | Çocuk
|
222
|
+
Kedileriyle | With their cats
|
223
|
+
Kedi-lAr-(s)U-(y)lA | Kedi
|
224
|
+
Çocuklarımmış | Someone told me that they were my children
|
225
|
+
çocuk-lAr-(U)m-(y)mUş | Çocuk
|
226
|
+
Kitabımızdı | It was our book
|
227
|
+
kitap-UmUz-(y)DU | Kitap
|
228
|
+
|
229
|
+
## Future Work
|
230
|
+
|
231
|
+
* Add more verb suffixes.
|
232
|
+
* Add more derivational suffixes.
|
233
|
+
|
234
|
+
## References
|
235
|
+
|
236
|
+
1. [Turkish Stemmer used in Lucene](http://snowball.tartarus.org/algorithms/turkish/stemmer.html)
|
237
|
+
2. [Java Implementation](http://snowball.tartarus.org/archives/snowball-discuss/att-0875/02-TurkishStemmer.java)
|
238
|
+
3. [Snowball Implementation](http://snowball.tartarus.org/algorithms/turkish/stem_Unicode.sbl)
|
239
|
+
4. [Snowball Description](http://snowball.tartarus.org/algorithms/turkish/accompanying_paper.doc)
|
240
|
+
5. [An affix stripping morphological analyzer for Turkish](http://web.itu.edu.tr/~gulsenc/papers/iasted.pdf)
|
241
|
+
6. [Lead Generation](https://en.wikipedia.org/wiki/Lead_generation)
|
242
|
+
7. [Vowel Harmony](https://en.wikipedia.org/wiki/Vowel_harmony#Turkish)
|
243
|
+
8. [Turkish Suffixes](https://en.wiktionary.org/wiki/Appendix:Turkish_suffixes)
|
244
|
+
9. [Turkish Grammar](https://en.wikipedia.org/wiki/Turkish_grammar)
|
245
|
+
10. [Turkish Language](https://en.wikipedia.org/wiki/Turkish_language)
|
246
|
+
11. [Tartarus](http://tartarus.org/)
|
247
|
+
12. [Information Retrieval on Turkish Texts](http://www.users.muohio.edu/canf/papers/JASIST2008offPrint.pdf)
|
248
|
+
|
249
|
+
## Installation
|
250
|
+
|
251
|
+
Add this line to your application's Gemfile:
|
252
|
+
|
253
|
+
gem 'turkish_stemmer'
|
254
|
+
|
255
|
+
And then execute:
|
256
|
+
|
257
|
+
$ bundle
|
258
|
+
|
259
|
+
Or install it yourself as:
|
260
|
+
|
261
|
+
$ gem install turkish_stemmer
|
262
|
+
|
263
|
+
## Usage
|
264
|
+
|
265
|
+
```ruby
|
266
|
+
require 'turkish_stemmer'
|
267
|
+
|
268
|
+
TurkishStemmer.stem("gözlükler") # => "gözlük"
|
269
|
+
```
|
270
|
+
|
271
|
+
## Contributing
|
272
|
+
|
273
|
+
1. Fork it ( http://github.com/<my-github-username>/turkish_stemmer/fork )
|
274
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
275
|
+
3. Commit your changes (`git commit -am 'Add some feature'`)
|
276
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
277
|
+
5. Create new Pull Request
|
278
|
+
|
279
|
+
## License
|
280
|
+
|
281
|
+
turkish_stemmer is licensed under MIT. See [LICENSE](LICENSE.txt).
|
282
|
+
|
data/Rakefile
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
require "bundler/gem_tasks"
|
3
|
+
|
4
|
+
desc "Update the stems of the sample words"
|
5
|
+
task :update_stemming_samples do
|
6
|
+
require 'turkish_stemmer'
|
7
|
+
words = []
|
8
|
+
filename = "benchmarks/stemming_samples.txt"
|
9
|
+
File.open(filename, "r") do |sample|
|
10
|
+
while(line = sample.gets)
|
11
|
+
word, _ = line.split(",")
|
12
|
+
words << word
|
13
|
+
end
|
14
|
+
end
|
15
|
+
|
16
|
+
File.open(filename, "w") do |sample|
|
17
|
+
words.each do |word|
|
18
|
+
sample.puts "#{word},#{TurkishStemmer.stem(word)}"
|
19
|
+
end
|
20
|
+
end
|
21
|
+
end
|
@@ -0,0 +1,16 @@
|
|
1
|
+
require 'benchmark'
|
2
|
+
require 'turkish_stemmer'
|
3
|
+
require 'lingua/stemmer'
|
4
|
+
|
5
|
+
Benchmark.bmbm(7) do |x|
|
6
|
+
|
7
|
+
lingua_stemmer = Lingua::Stemmer.new(:language => "tr")
|
8
|
+
|
9
|
+
x.report('Stem using turkish_stemmer gem') do
|
10
|
+
1_000.times { TurkishStemmer.stem("telephonlar") }
|
11
|
+
end
|
12
|
+
|
13
|
+
x.report('Stem using ruby-stemmer gem') do
|
14
|
+
1_000.times { lingua_stemmer.stem("telephonlar") }
|
15
|
+
end
|
16
|
+
end
|