turkish_stemmer 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +18 -0
- data/.rspec +2 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +22 -0
- data/README.md +282 -0
- data/Rakefile +21 -0
- data/benchmarks/stemmers_comparison.rb +16 -0
- data/benchmarks/stemming_samples.txt +17916 -0
- data/benchmarks/turkish_word_recognition.rb +26 -0
- data/config/derivational_states.yml +10 -0
- data/config/derivational_suffixes.yml +6 -0
- data/config/nominal_verb_states.yml +121 -0
- data/config/nominal_verb_suffixes.yml +90 -0
- data/config/noun_states.yml +177 -0
- data/config/noun_suffixes.yml +113 -0
- data/config/stemmer.yml +206 -0
- data/lib/hash_extension.rb +5 -0
- data/lib/turkish_stemmer/version.rb +3 -0
- data/lib/turkish_stemmer.rb +455 -0
- data/spec/fixtures/simple_state.yml +14 -0
- data/spec/fixtures/simple_state_02.yml +21 -0
- data/spec/fixtures/simple_suffix.yml +7 -0
- data/spec/fixtures/simple_transition.yml +7 -0
- data/spec/spec_helper.rb +19 -0
- data/spec/support/fixtures.csv +101 -0
- data/spec/turkish_stemmer_spec.rb +522 -0
- data/turkish_stemmer.gemspec +35 -0
- metadata +164 -0
@@ -0,0 +1,26 @@
|
|
1
|
+
# coding : utf-8
|
2
|
+
require 'benchmark'
|
3
|
+
require 'turkish_stemmer'
|
4
|
+
|
5
|
+
Benchmark.bmbm(7) do |x|
|
6
|
+
|
7
|
+
x.report('regex') do
|
8
|
+
TurkishStemmer.class_eval do
|
9
|
+
def self.turkish?(word)
|
10
|
+
!! word.match(TurkishStemmer::ALPHABET)
|
11
|
+
end
|
12
|
+
end
|
13
|
+
|
14
|
+
100_000.times { TurkishStemmer.turkish?("aaa") }
|
15
|
+
end
|
16
|
+
|
17
|
+
x.report('loop') do
|
18
|
+
TurkishStemmer.class_eval do
|
19
|
+
def self.turkish?(word)
|
20
|
+
!! word.chars.to_a.all? { |c| "abcçdefgğhıijklmnoöprsştuüvyz".include?(c) }
|
21
|
+
end
|
22
|
+
end
|
23
|
+
|
24
|
+
100_000.times { TurkishStemmer.turkish?("aaaa") }
|
25
|
+
end
|
26
|
+
end
|
@@ -0,0 +1,121 @@
|
|
1
|
+
# The order of the transitions is very crucial.
|
2
|
+
a:
|
3
|
+
transitions:
|
4
|
+
# Transitions to state B
|
5
|
+
- suffix: :s1
|
6
|
+
state: :b
|
7
|
+
- suffix: :s2
|
8
|
+
state: :b
|
9
|
+
- suffix: :s4
|
10
|
+
state: :b
|
11
|
+
- suffix: :s3
|
12
|
+
state: :b
|
13
|
+
# Transitions to state C
|
14
|
+
- suffix: :s5
|
15
|
+
state: :c
|
16
|
+
# Transitions to state D
|
17
|
+
- suffix: :s6
|
18
|
+
state: :d
|
19
|
+
- suffix: :s7
|
20
|
+
state: :d
|
21
|
+
- suffix: :s8
|
22
|
+
state: :d
|
23
|
+
- suffix: :s9
|
24
|
+
state: :d
|
25
|
+
# Transitions to state E
|
26
|
+
- suffix: :s10
|
27
|
+
state: :e
|
28
|
+
# Transitions to state F
|
29
|
+
- suffix: :s12
|
30
|
+
state: :f
|
31
|
+
- suffix: :s13
|
32
|
+
state: :f
|
33
|
+
- suffix: :s14
|
34
|
+
state: :f
|
35
|
+
- suffix: :s15
|
36
|
+
state: :f
|
37
|
+
# Transitions to state H
|
38
|
+
- suffix: :s11
|
39
|
+
state: :h
|
40
|
+
|
41
|
+
final_state: false
|
42
|
+
|
43
|
+
b:
|
44
|
+
transitions:
|
45
|
+
- suffix: :s14
|
46
|
+
state: :f
|
47
|
+
|
48
|
+
final_state: true
|
49
|
+
|
50
|
+
c:
|
51
|
+
transitions:
|
52
|
+
- suffix: :s10
|
53
|
+
state: :f
|
54
|
+
- suffix: :s12
|
55
|
+
state: :f
|
56
|
+
- suffix: :s13
|
57
|
+
state: :f
|
58
|
+
- suffix: :s14
|
59
|
+
state: :f
|
60
|
+
|
61
|
+
final_state: true
|
62
|
+
|
63
|
+
d:
|
64
|
+
transitions:
|
65
|
+
- suffix: :s12
|
66
|
+
state: :f
|
67
|
+
- suffix: :s13
|
68
|
+
state: :f
|
69
|
+
|
70
|
+
final_state: false
|
71
|
+
|
72
|
+
e:
|
73
|
+
transitions:
|
74
|
+
# Transitions to state G
|
75
|
+
- suffix: :s1
|
76
|
+
state: :g
|
77
|
+
- suffix: :s2
|
78
|
+
state: :g
|
79
|
+
- suffix: :s3
|
80
|
+
state: :g
|
81
|
+
- suffix: :s4
|
82
|
+
state: :g
|
83
|
+
- suffix: :s5
|
84
|
+
state: :g
|
85
|
+
# Transitions to state F
|
86
|
+
- suffix: :s14
|
87
|
+
state: :f
|
88
|
+
final_state: true
|
89
|
+
|
90
|
+
f:
|
91
|
+
transitions: []
|
92
|
+
|
93
|
+
final_state: true
|
94
|
+
|
95
|
+
|
96
|
+
g:
|
97
|
+
transitions:
|
98
|
+
- suffix: :s14
|
99
|
+
state: :f
|
100
|
+
|
101
|
+
final_state: false
|
102
|
+
|
103
|
+
h:
|
104
|
+
transitions:
|
105
|
+
# Transitions to state F
|
106
|
+
- suffix: :s14
|
107
|
+
state: :f
|
108
|
+
# Transitions to state G
|
109
|
+
- suffix: :s1
|
110
|
+
state: :g
|
111
|
+
- suffix: :s2
|
112
|
+
state: :g
|
113
|
+
- suffix: :s3
|
114
|
+
state: :g
|
115
|
+
- suffix: :s4
|
116
|
+
state: :g
|
117
|
+
- suffix: :s5
|
118
|
+
state: :g
|
119
|
+
|
120
|
+
final_state: false
|
121
|
+
|
@@ -0,0 +1,90 @@
|
|
1
|
+
s1:
|
2
|
+
name: "-(y)Um"
|
3
|
+
regex: "ım|im|um|üm"
|
4
|
+
optional_letter: "y"
|
5
|
+
check_harmony: true
|
6
|
+
|
7
|
+
s2:
|
8
|
+
name: "-sUn"
|
9
|
+
regex: "sın|sin|sun|sün"
|
10
|
+
optional_letter: false
|
11
|
+
check_harmony: true
|
12
|
+
|
13
|
+
s3:
|
14
|
+
name: "-(y)Uz"
|
15
|
+
regex: "ız|iz|uz|üz"
|
16
|
+
optional_letter: "y"
|
17
|
+
check_harmony: true
|
18
|
+
|
19
|
+
s4:
|
20
|
+
name: "-sUnUz"
|
21
|
+
regex: "sınız|siniz|sunuz|sünüz"
|
22
|
+
optional_letter: false
|
23
|
+
check_harmony: true
|
24
|
+
|
25
|
+
s5:
|
26
|
+
name: "-lAr"
|
27
|
+
regex: "lar|ler"
|
28
|
+
optional_letter: false
|
29
|
+
check_harmony: true
|
30
|
+
|
31
|
+
s6:
|
32
|
+
name: "-m"
|
33
|
+
regex: "m"
|
34
|
+
optional_letter: false
|
35
|
+
check_harmony: true
|
36
|
+
|
37
|
+
s7:
|
38
|
+
name: "-n"
|
39
|
+
regex: "n"
|
40
|
+
optional_letter: false
|
41
|
+
check_harmony: true
|
42
|
+
|
43
|
+
s8:
|
44
|
+
name: "-k"
|
45
|
+
regex: "k"
|
46
|
+
optional_letter: false
|
47
|
+
check_harmony: true
|
48
|
+
|
49
|
+
s9:
|
50
|
+
name: "-nUz"
|
51
|
+
regex: "nız|niz|nuz|nüz"
|
52
|
+
optional_letter: false
|
53
|
+
check_harmony: true
|
54
|
+
|
55
|
+
s10:
|
56
|
+
name: "-DUr"
|
57
|
+
regex: "tır|tir|tur|tür|dır|dir|dur|dür"
|
58
|
+
optional_letter: false
|
59
|
+
check_harmony: true
|
60
|
+
|
61
|
+
s11:
|
62
|
+
name: "-cAsInA"
|
63
|
+
regex: "casına|çasına|cesine|çesine"
|
64
|
+
optional_letter: false
|
65
|
+
check_harmony: true
|
66
|
+
|
67
|
+
s12:
|
68
|
+
name: "-(y)DU"
|
69
|
+
regex: "dı|di|du|dü|tı|ti|tu|tü"
|
70
|
+
optional_letter: "y"
|
71
|
+
check_harmony: true
|
72
|
+
|
73
|
+
s13:
|
74
|
+
name: "-(y)sA"
|
75
|
+
regex: "sa|se"
|
76
|
+
optional_letter: "y"
|
77
|
+
check_harmony: true
|
78
|
+
|
79
|
+
s14:
|
80
|
+
name: "-(y)mUş"
|
81
|
+
regex: "muş|miş|müş|mış"
|
82
|
+
optional_letter: "y"
|
83
|
+
check_harmony: true
|
84
|
+
|
85
|
+
s15:
|
86
|
+
name: "-(y)ken"
|
87
|
+
regex: "ken"
|
88
|
+
optional_letter: "y"
|
89
|
+
check_harmony: true
|
90
|
+
|
@@ -0,0 +1,177 @@
|
|
1
|
+
# The order of the transitions is very crucial.
|
2
|
+
a:
|
3
|
+
transitions:
|
4
|
+
- suffix: :s16
|
5
|
+
state: :c
|
6
|
+
- suffix: :s7
|
7
|
+
state: :k
|
8
|
+
- suffix: :s3
|
9
|
+
state: :h
|
10
|
+
- suffix: :s5
|
11
|
+
state: :h
|
12
|
+
- suffix: :s1
|
13
|
+
state: :l
|
14
|
+
- suffix: :s14
|
15
|
+
state: :f
|
16
|
+
- suffix: :s15
|
17
|
+
state: :g
|
18
|
+
- suffix: :s17
|
19
|
+
state: :e
|
20
|
+
- suffix: :s10
|
21
|
+
state: :e
|
22
|
+
- suffix: :s19
|
23
|
+
state: :m
|
24
|
+
- suffix: :s4
|
25
|
+
state: :h
|
26
|
+
- suffix: :s9
|
27
|
+
state: :c
|
28
|
+
- suffix: :s12
|
29
|
+
state: :f
|
30
|
+
- suffix: :s13
|
31
|
+
state: :b
|
32
|
+
- suffix: :s18
|
33
|
+
state: :d
|
34
|
+
- suffix: :s2
|
35
|
+
state: :h
|
36
|
+
- suffix: :s6
|
37
|
+
state: :h
|
38
|
+
- suffix: :s8
|
39
|
+
state: :b
|
40
|
+
- suffix: :s11
|
41
|
+
state: :b
|
42
|
+
|
43
|
+
final_state: true
|
44
|
+
|
45
|
+
b:
|
46
|
+
transitions:
|
47
|
+
- suffix: :s3
|
48
|
+
state: :h
|
49
|
+
- suffix: :s5
|
50
|
+
state: :h
|
51
|
+
- suffix: :s1
|
52
|
+
state: :l
|
53
|
+
- suffix: :s4
|
54
|
+
state: :h
|
55
|
+
- suffix: :s2
|
56
|
+
state: :h
|
57
|
+
|
58
|
+
final_state: true
|
59
|
+
|
60
|
+
c:
|
61
|
+
transitions:
|
62
|
+
# Transitions to state K
|
63
|
+
- suffix: :s7
|
64
|
+
state: :k
|
65
|
+
# Transitions to state H
|
66
|
+
- suffix: :s6
|
67
|
+
state: :h
|
68
|
+
|
69
|
+
final_state: false
|
70
|
+
|
71
|
+
d:
|
72
|
+
transitions:
|
73
|
+
# Transitions to state F
|
74
|
+
- suffix: :s14
|
75
|
+
state: :f
|
76
|
+
# Transitions to state E
|
77
|
+
- suffix: :s10
|
78
|
+
state: :e
|
79
|
+
# Transitions to state B
|
80
|
+
- suffix: :s13
|
81
|
+
state: :b
|
82
|
+
|
83
|
+
final_state: false
|
84
|
+
|
85
|
+
e:
|
86
|
+
transitions:
|
87
|
+
- suffix: :s7
|
88
|
+
state: :k
|
89
|
+
- suffix: :s3
|
90
|
+
state: :h
|
91
|
+
- suffix: :s5
|
92
|
+
state: :h
|
93
|
+
- suffix: :s1
|
94
|
+
state: :l
|
95
|
+
- suffix: :s4
|
96
|
+
state: :h
|
97
|
+
- suffix: :s18
|
98
|
+
state: :d
|
99
|
+
- suffix: :s2
|
100
|
+
state: :h
|
101
|
+
- suffix: :s6
|
102
|
+
state: :h
|
103
|
+
|
104
|
+
final_state: true
|
105
|
+
|
106
|
+
f:
|
107
|
+
transitions:
|
108
|
+
# Transitions to state K
|
109
|
+
- suffix: :s7
|
110
|
+
state: :k
|
111
|
+
# Transitions to state D
|
112
|
+
- suffix: :s18
|
113
|
+
state: :d
|
114
|
+
# Transitions to state H
|
115
|
+
- suffix: :s6
|
116
|
+
state: :h
|
117
|
+
|
118
|
+
final_state: false
|
119
|
+
|
120
|
+
|
121
|
+
g:
|
122
|
+
transitions:
|
123
|
+
- suffix: :s5
|
124
|
+
state: :h
|
125
|
+
- suffix: :s3
|
126
|
+
state: :h
|
127
|
+
- suffix: :s1
|
128
|
+
state: :l
|
129
|
+
- suffix: :s4
|
130
|
+
state: :h
|
131
|
+
- suffix: :s18
|
132
|
+
state: :d
|
133
|
+
- suffix: :s2
|
134
|
+
state: :h
|
135
|
+
|
136
|
+
final_state: true
|
137
|
+
|
138
|
+
h:
|
139
|
+
transitions:
|
140
|
+
# Transitions to state L
|
141
|
+
- suffix: :s1
|
142
|
+
state: :l
|
143
|
+
|
144
|
+
final_state: true
|
145
|
+
|
146
|
+
k:
|
147
|
+
transitions: []
|
148
|
+
|
149
|
+
final_state: true
|
150
|
+
|
151
|
+
l:
|
152
|
+
transitions:
|
153
|
+
# Transitions to state D
|
154
|
+
- suffix: :s18
|
155
|
+
state: :d
|
156
|
+
|
157
|
+
final_state: true
|
158
|
+
|
159
|
+
m:
|
160
|
+
transitions:
|
161
|
+
- suffix: :s7
|
162
|
+
state: :k
|
163
|
+
- suffix: :s3
|
164
|
+
state: :h
|
165
|
+
- suffix: :s5
|
166
|
+
state: :h
|
167
|
+
- suffix: :s1
|
168
|
+
state: :l
|
169
|
+
- suffix: :s4
|
170
|
+
state: :h
|
171
|
+
- suffix: :s2
|
172
|
+
state: :h
|
173
|
+
- suffix: :s6
|
174
|
+
state: :h
|
175
|
+
|
176
|
+
final_state: true
|
177
|
+
|
@@ -0,0 +1,113 @@
|
|
1
|
+
s1:
|
2
|
+
name: "-lAr"
|
3
|
+
regex: "lar|ler"
|
4
|
+
optional_letter: false
|
5
|
+
check_harmony: true
|
6
|
+
|
7
|
+
s2:
|
8
|
+
name: "-(U)m"
|
9
|
+
regex: "m"
|
10
|
+
optional_letter: "ı|i|u|ü"
|
11
|
+
check_harmony: true
|
12
|
+
|
13
|
+
s3:
|
14
|
+
name: "-(U)mUz"
|
15
|
+
regex: "mız|miz|muz|müz"
|
16
|
+
optional_letter: "ı|i|u|ü"
|
17
|
+
check_harmony: true
|
18
|
+
|
19
|
+
s4:
|
20
|
+
name: "-Un"
|
21
|
+
regex: "ın|in|un|ün"
|
22
|
+
optional_letter: false
|
23
|
+
check_harmony: true
|
24
|
+
|
25
|
+
s5:
|
26
|
+
name: "-(U)nUz"
|
27
|
+
regex: "nız|niz|nuz|nüz"
|
28
|
+
optional_letter: "ı|i|u|ü"
|
29
|
+
check_harmony: true
|
30
|
+
|
31
|
+
s6:
|
32
|
+
name: "-(s)U"
|
33
|
+
regex: "ı|i|u|ü"
|
34
|
+
optional_letter: "s"
|
35
|
+
check_harmony: true
|
36
|
+
|
37
|
+
s7:
|
38
|
+
name: "-lArI"
|
39
|
+
regex: "ları|leri"
|
40
|
+
optional_letter: false
|
41
|
+
check_harmony: true
|
42
|
+
|
43
|
+
s8:
|
44
|
+
name: "-(y)U"
|
45
|
+
regex: "ı|i|u|ü"
|
46
|
+
optional_letter: "y"
|
47
|
+
check_harmony: true
|
48
|
+
|
49
|
+
s9:
|
50
|
+
name: "-nU"
|
51
|
+
regex: "nı|ni|nu|nü"
|
52
|
+
optional_letter: false
|
53
|
+
check_harmony: true
|
54
|
+
|
55
|
+
s10:
|
56
|
+
name: "-(n)Un"
|
57
|
+
regex: "ın|in|un|ün"
|
58
|
+
optional_letter: "n"
|
59
|
+
check_harmony: true
|
60
|
+
|
61
|
+
s11:
|
62
|
+
name: "-(y)A"
|
63
|
+
regex: "a|e"
|
64
|
+
optional_letter: "y"
|
65
|
+
check_harmony: true
|
66
|
+
|
67
|
+
s12:
|
68
|
+
name: "-nA"
|
69
|
+
regex: "na|ne"
|
70
|
+
optional_letter: false
|
71
|
+
check_harmony: true
|
72
|
+
|
73
|
+
s13:
|
74
|
+
name: "-DA"
|
75
|
+
regex: "da|de|ta|te"
|
76
|
+
optional_letter: false
|
77
|
+
check_harmony: true
|
78
|
+
|
79
|
+
s14:
|
80
|
+
name: "-nDA"
|
81
|
+
regex: "nta|nte|nda|nde"
|
82
|
+
optional_letter: false
|
83
|
+
check_harmony: true
|
84
|
+
|
85
|
+
s15:
|
86
|
+
name: "-DAn"
|
87
|
+
regex: "dan|tan|den|ten"
|
88
|
+
optional_letter: false
|
89
|
+
check_harmony: true
|
90
|
+
|
91
|
+
s16:
|
92
|
+
name: "-nDAn"
|
93
|
+
regex: "ndan|ntan|nden|nten"
|
94
|
+
optional_letter: false
|
95
|
+
check_harmony: true
|
96
|
+
|
97
|
+
s17:
|
98
|
+
name: "-(y)lA"
|
99
|
+
regex: "la|le"
|
100
|
+
optional_letter: "y"
|
101
|
+
check_harmony: true
|
102
|
+
|
103
|
+
s18:
|
104
|
+
name: "-ki"
|
105
|
+
regex: "ki"
|
106
|
+
optional_letter: false
|
107
|
+
check_harmony: false
|
108
|
+
|
109
|
+
s19:
|
110
|
+
name: "-(n)cA"
|
111
|
+
regex: "ca|ce"
|
112
|
+
optional_letter: "n"
|
113
|
+
check_harmony: true
|