turkish_stemmer 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,26 @@
# coding: utf-8
#
# Micro-benchmark comparing two candidate implementations of
# TurkishStemmer.turkish?:
#
#   * 'regex' - a single match against TurkishStemmer::ALPHABET
#   * 'loop'  - a per-character membership test against a literal alphabet
#
# Each report reopens TurkishStemmer and installs the implementation under
# test before running the timed loop. The redefinition happens once per
# report, inside the timed block, just as the two candidates share one
# method name and therefore cannot both be installed up front.
#
# Both reports use the same word and the same iteration count so that the
# resulting timings are directly comparable.
require 'benchmark'
require 'turkish_stemmer'

# Number of predicate calls timed per report.
iterations = 100_000

# Word passed to both implementations.
sample_word = 'aaa'

# The argument to bmbm is the label column width used in the printed table.
Benchmark.bmbm(7) do |x|
  x.report('regex') do
    TurkishStemmer.class_eval do
      def self.turkish?(word)
        !!word.match(TurkishStemmer::ALPHABET)
      end
    end

    iterations.times { TurkishStemmer.turkish?(sample_word) }
  end

  x.report('loop') do
    TurkishStemmer.class_eval do
      def self.turkish?(word)
        # The alphabet literal is deliberately left inline: it is part of the
        # candidate implementation being measured.
        !!word.chars.to_a.all? { |c| "abcçdefgğhıijklmnoöprsştuüvyz".include?(c) }
      end
    end

    iterations.times { TurkishStemmer.turkish?(sample_word) }
  end
end
@@ -0,0 +1,10 @@
1
+ a:
2
+ transitions:
3
+ - suffix: :s1
4
+ state: :b
5
+
6
+ final_state: false
7
+
8
+ b:
9
+ transitions: []
10
+ final_state: true
@@ -0,0 +1,6 @@
1
+ s1:
2
+ name: "-lU"
3
+ regex: "lı|li|lu|lü"
4
+ optional_letter: false
5
+ check_harmony: true
6
+
@@ -0,0 +1,121 @@
1
+ # The order of the transitions is crucial.
2
+ a:
3
+ transitions:
4
+ # Transitions to state B
5
+ - suffix: :s1
6
+ state: :b
7
+ - suffix: :s2
8
+ state: :b
9
+ - suffix: :s4
10
+ state: :b
11
+ - suffix: :s3
12
+ state: :b
13
+ # Transitions to state C
14
+ - suffix: :s5
15
+ state: :c
16
+ # Transitions to state D
17
+ - suffix: :s6
18
+ state: :d
19
+ - suffix: :s7
20
+ state: :d
21
+ - suffix: :s8
22
+ state: :d
23
+ - suffix: :s9
24
+ state: :d
25
+ # Transitions to state E
26
+ - suffix: :s10
27
+ state: :e
28
+ # Transitions to state F
29
+ - suffix: :s12
30
+ state: :f
31
+ - suffix: :s13
32
+ state: :f
33
+ - suffix: :s14
34
+ state: :f
35
+ - suffix: :s15
36
+ state: :f
37
+ # Transitions to state H
38
+ - suffix: :s11
39
+ state: :h
40
+
41
+ final_state: false
42
+
43
+ b:
44
+ transitions:
45
+ - suffix: :s14
46
+ state: :f
47
+
48
+ final_state: true
49
+
50
+ c:
51
+ transitions:
52
+ - suffix: :s10
53
+ state: :f
54
+ - suffix: :s12
55
+ state: :f
56
+ - suffix: :s13
57
+ state: :f
58
+ - suffix: :s14
59
+ state: :f
60
+
61
+ final_state: true
62
+
63
+ d:
64
+ transitions:
65
+ - suffix: :s12
66
+ state: :f
67
+ - suffix: :s13
68
+ state: :f
69
+
70
+ final_state: false
71
+
72
+ e:
73
+ transitions:
74
+ # Transitions to state G
75
+ - suffix: :s1
76
+ state: :g
77
+ - suffix: :s2
78
+ state: :g
79
+ - suffix: :s3
80
+ state: :g
81
+ - suffix: :s4
82
+ state: :g
83
+ - suffix: :s5
84
+ state: :g
85
+ # Transitions to state F
86
+ - suffix: :s14
87
+ state: :f
88
+ final_state: true
89
+
90
+ f:
91
+ transitions: []
92
+
93
+ final_state: true
94
+
95
+
96
+ g:
97
+ transitions:
98
+ - suffix: :s14
99
+ state: :f
100
+
101
+ final_state: false
102
+
103
+ h:
104
+ transitions:
105
+ # Transitions to state F
106
+ - suffix: :s14
107
+ state: :f
108
+ # Transitions to state G
109
+ - suffix: :s1
110
+ state: :g
111
+ - suffix: :s2
112
+ state: :g
113
+ - suffix: :s3
114
+ state: :g
115
+ - suffix: :s4
116
+ state: :g
117
+ - suffix: :s5
118
+ state: :g
119
+
120
+ final_state: false
121
+
@@ -0,0 +1,90 @@
1
+ s1:
2
+ name: "-(y)Um"
3
+ regex: "ım|im|um|üm"
4
+ optional_letter: "y"
5
+ check_harmony: true
6
+
7
+ s2:
8
+ name: "-sUn"
9
+ regex: "sın|sin|sun|sün"
10
+ optional_letter: false
11
+ check_harmony: true
12
+
13
+ s3:
14
+ name: "-(y)Uz"
15
+ regex: "ız|iz|uz|üz"
16
+ optional_letter: "y"
17
+ check_harmony: true
18
+
19
+ s4:
20
+ name: "-sUnUz"
21
+ regex: "sınız|siniz|sunuz|sünüz"
22
+ optional_letter: false
23
+ check_harmony: true
24
+
25
+ s5:
26
+ name: "-lAr"
27
+ regex: "lar|ler"
28
+ optional_letter: false
29
+ check_harmony: true
30
+
31
+ s6:
32
+ name: "-m"
33
+ regex: "m"
34
+ optional_letter: false
35
+ check_harmony: true
36
+
37
+ s7:
38
+ name: "-n"
39
+ regex: "n"
40
+ optional_letter: false
41
+ check_harmony: true
42
+
43
+ s8:
44
+ name: "-k"
45
+ regex: "k"
46
+ optional_letter: false
47
+ check_harmony: true
48
+
49
+ s9:
50
+ name: "-nUz"
51
+ regex: "nız|niz|nuz|nüz"
52
+ optional_letter: false
53
+ check_harmony: true
54
+
55
+ s10:
56
+ name: "-DUr"
57
+ regex: "tır|tir|tur|tür|dır|dir|dur|dür"
58
+ optional_letter: false
59
+ check_harmony: true
60
+
61
+ s11:
62
+ name: "-cAsInA"
63
+ regex: "casına|çasına|cesine|çesine"
64
+ optional_letter: false
65
+ check_harmony: true
66
+
67
+ s12:
68
+ name: "-(y)DU"
69
+ regex: "dı|di|du|dü|tı|ti|tu|tü"
70
+ optional_letter: "y"
71
+ check_harmony: true
72
+
73
+ s13:
74
+ name: "-(y)sA"
75
+ regex: "sa|se"
76
+ optional_letter: "y"
77
+ check_harmony: true
78
+
79
+ s14:
80
+ name: "-(y)mUş"
81
+ regex: "muş|miş|müş|mış"
82
+ optional_letter: "y"
83
+ check_harmony: true
84
+
85
+ s15:
86
+ name: "-(y)ken"
87
+ regex: "ken"
88
+ optional_letter: "y"
89
+ check_harmony: true
90
+
@@ -0,0 +1,177 @@
1
+ # The order of the transitions is crucial.
2
+ a:
3
+ transitions:
4
+ - suffix: :s16
5
+ state: :c
6
+ - suffix: :s7
7
+ state: :k
8
+ - suffix: :s3
9
+ state: :h
10
+ - suffix: :s5
11
+ state: :h
12
+ - suffix: :s1
13
+ state: :l
14
+ - suffix: :s14
15
+ state: :f
16
+ - suffix: :s15
17
+ state: :g
18
+ - suffix: :s17
19
+ state: :e
20
+ - suffix: :s10
21
+ state: :e
22
+ - suffix: :s19
23
+ state: :m
24
+ - suffix: :s4
25
+ state: :h
26
+ - suffix: :s9
27
+ state: :c
28
+ - suffix: :s12
29
+ state: :f
30
+ - suffix: :s13
31
+ state: :b
32
+ - suffix: :s18
33
+ state: :d
34
+ - suffix: :s2
35
+ state: :h
36
+ - suffix: :s6
37
+ state: :h
38
+ - suffix: :s8
39
+ state: :b
40
+ - suffix: :s11
41
+ state: :b
42
+
43
+ final_state: true
44
+
45
+ b:
46
+ transitions:
47
+ - suffix: :s3
48
+ state: :h
49
+ - suffix: :s5
50
+ state: :h
51
+ - suffix: :s1
52
+ state: :l
53
+ - suffix: :s4
54
+ state: :h
55
+ - suffix: :s2
56
+ state: :h
57
+
58
+ final_state: true
59
+
60
+ c:
61
+ transitions:
62
+ # Transitions to state K
63
+ - suffix: :s7
64
+ state: :k
65
+ # Transitions to state H
66
+ - suffix: :s6
67
+ state: :h
68
+
69
+ final_state: false
70
+
71
+ d:
72
+ transitions:
73
+ # Transitions to state F
74
+ - suffix: :s14
75
+ state: :f
76
+ # Transitions to state E
77
+ - suffix: :s10
78
+ state: :e
79
+ # Transitions to state B
80
+ - suffix: :s13
81
+ state: :b
82
+
83
+ final_state: false
84
+
85
+ e:
86
+ transitions:
87
+ - suffix: :s7
88
+ state: :k
89
+ - suffix: :s3
90
+ state: :h
91
+ - suffix: :s5
92
+ state: :h
93
+ - suffix: :s1
94
+ state: :l
95
+ - suffix: :s4
96
+ state: :h
97
+ - suffix: :s18
98
+ state: :d
99
+ - suffix: :s2
100
+ state: :h
101
+ - suffix: :s6
102
+ state: :h
103
+
104
+ final_state: true
105
+
106
+ f:
107
+ transitions:
108
+ # Transitions to state K
109
+ - suffix: :s7
110
+ state: :k
111
+ # Transitions to state D
112
+ - suffix: :s18
113
+ state: :d
114
+ # Transitions to state H
115
+ - suffix: :s6
116
+ state: :h
117
+
118
+ final_state: false
119
+
120
+
121
+ g:
122
+ transitions:
123
+ - suffix: :s5
124
+ state: :h
125
+ - suffix: :s3
126
+ state: :h
127
+ - suffix: :s1
128
+ state: :l
129
+ - suffix: :s4
130
+ state: :h
131
+ - suffix: :s18
132
+ state: :d
133
+ - suffix: :s2
134
+ state: :h
135
+
136
+ final_state: true
137
+
138
+ h:
139
+ transitions:
140
+ # Transitions to state L
141
+ - suffix: :s1
142
+ state: :l
143
+
144
+ final_state: true
145
+
146
+ k:
147
+ transitions: []
148
+
149
+ final_state: true
150
+
151
+ l:
152
+ transitions:
153
+ # Transitions to state D
154
+ - suffix: :s18
155
+ state: :d
156
+
157
+ final_state: true
158
+
159
+ m:
160
+ transitions:
161
+ - suffix: :s7
162
+ state: :k
163
+ - suffix: :s3
164
+ state: :h
165
+ - suffix: :s5
166
+ state: :h
167
+ - suffix: :s1
168
+ state: :l
169
+ - suffix: :s4
170
+ state: :h
171
+ - suffix: :s2
172
+ state: :h
173
+ - suffix: :s6
174
+ state: :h
175
+
176
+ final_state: true
177
+
@@ -0,0 +1,113 @@
1
+ s1:
2
+ name: "-lAr"
3
+ regex: "lar|ler"
4
+ optional_letter: false
5
+ check_harmony: true
6
+
7
+ s2:
8
+ name: "-(U)m"
9
+ regex: "m"
10
+ optional_letter: "ı|i|u|ü"
11
+ check_harmony: true
12
+
13
+ s3:
14
+ name: "-(U)mUz"
15
+ regex: "mız|miz|muz|müz"
16
+ optional_letter: "ı|i|u|ü"
17
+ check_harmony: true
18
+
19
+ s4:
20
+ name: "-Un"
21
+ regex: "ın|in|un|ün"
22
+ optional_letter: false
23
+ check_harmony: true
24
+
25
+ s5:
26
+ name: "-(U)nUz"
27
+ regex: "nız|niz|nuz|nüz"
28
+ optional_letter: "ı|i|u|ü"
29
+ check_harmony: true
30
+
31
+ s6:
32
+ name: "-(s)U"
33
+ regex: "ı|i|u|ü"
34
+ optional_letter: "s"
35
+ check_harmony: true
36
+
37
+ s7:
38
+ name: "-lArI"
39
+ regex: "ları|leri"
40
+ optional_letter: false
41
+ check_harmony: true
42
+
43
+ s8:
44
+ name: "-(y)U"
45
+ regex: "ı|i|u|ü"
46
+ optional_letter: "y"
47
+ check_harmony: true
48
+
49
+ s9:
50
+ name: "-nU"
51
+ regex: "nı|ni|nu|nü"
52
+ optional_letter: false
53
+ check_harmony: true
54
+
55
+ s10:
56
+ name: "-(n)Un"
57
+ regex: "ın|in|un|ün"
58
+ optional_letter: "n"
59
+ check_harmony: true
60
+
61
+ s11:
62
+ name: "-(y)A"
63
+ regex: "a|e"
64
+ optional_letter: "y"
65
+ check_harmony: true
66
+
67
+ s12:
68
+ name: "-nA"
69
+ regex: "na|ne"
70
+ optional_letter: false
71
+ check_harmony: true
72
+
73
+ s13:
74
+ name: "-DA"
75
+ regex: "da|de|ta|te"
76
+ optional_letter: false
77
+ check_harmony: true
78
+
79
+ s14:
80
+ name: "-nDA"
81
+ regex: "nta|nte|nda|nde"
82
+ optional_letter: false
83
+ check_harmony: true
84
+
85
+ s15:
86
+ name: "-DAn"
87
+ regex: "dan|tan|den|ten"
88
+ optional_letter: false
89
+ check_harmony: true
90
+
91
+ s16:
92
+ name: "-nDAn"
93
+ regex: "ndan|ntan|nden|nten"
94
+ optional_letter: false
95
+ check_harmony: true
96
+
97
+ s17:
98
+ name: "-(y)lA"
99
+ regex: "la|le"
100
+ optional_letter: "y"
101
+ check_harmony: true
102
+
103
+ s18:
104
+ name: "-ki"
105
+ regex: "ki"
106
+ optional_letter: false
107
+ check_harmony: false
108
+
109
+ s19:
110
+ name: "-(n)cA"
111
+ regex: "ca|ce"
112
+ optional_letter: "n"
113
+ check_harmony: true