turkish_stemmer 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,26 @@
1
+ # coding : utf-8
2
+ require 'benchmark'
3
+ require 'turkish_stemmer'
4
+
5
+ Benchmark.bmbm(7) do |x|
6
+
7
+ x.report('regex') do
8
+ TurkishStemmer.class_eval do
9
+ def self.turkish?(word)
10
+ !! word.match(TurkishStemmer::ALPHABET)
11
+ end
12
+ end
13
+
14
+ 100_000.times { TurkishStemmer.turkish?("aaa") }
15
+ end
16
+
17
+ x.report('loop') do
18
+ TurkishStemmer.class_eval do
19
+ def self.turkish?(word)
20
+ !! word.chars.to_a.all? { |c| "abcçdefgğhıijklmnoöprsştuüvyz".include?(c) }
21
+ end
22
+ end
23
+
24
+ 100_000.times { TurkishStemmer.turkish?("aaaa") }
25
+ end
26
+ end
@@ -0,0 +1,10 @@
1
+ a:
2
+ transitions:
3
+ - suffix: :s1
4
+ state: :b
5
+
6
+ final_state: false
7
+
8
+ b:
9
+ transitions: []
10
+ final_state: true
@@ -0,0 +1,6 @@
1
+ s1:
2
+ name: "-lU"
3
+ regex: "lı|li|lu|lü"
4
+ optional_letter: false
5
+ check_harmony: true
6
+
@@ -0,0 +1,121 @@
1
+ # The order of the transitions is very crucial.
2
+ a:
3
+ transitions:
4
+ # Transitions to state B
5
+ - suffix: :s1
6
+ state: :b
7
+ - suffix: :s2
8
+ state: :b
9
+ - suffix: :s4
10
+ state: :b
11
+ - suffix: :s3
12
+ state: :b
13
+ # Transitions to state C
14
+ - suffix: :s5
15
+ state: :c
16
+ # Transitions to state D
17
+ - suffix: :s6
18
+ state: :d
19
+ - suffix: :s7
20
+ state: :d
21
+ - suffix: :s8
22
+ state: :d
23
+ - suffix: :s9
24
+ state: :d
25
+ # Transitions to state E
26
+ - suffix: :s10
27
+ state: :e
28
+ # Transitions to state F
29
+ - suffix: :s12
30
+ state: :f
31
+ - suffix: :s13
32
+ state: :f
33
+ - suffix: :s14
34
+ state: :f
35
+ - suffix: :s15
36
+ state: :f
37
+ # Transitions to state H
38
+ - suffix: :s11
39
+ state: :h
40
+
41
+ final_state: false
42
+
43
+ b:
44
+ transitions:
45
+ - suffix: :s14
46
+ state: :f
47
+
48
+ final_state: true
49
+
50
+ c:
51
+ transitions:
52
+ - suffix: :s10
53
+ state: :f
54
+ - suffix: :s12
55
+ state: :f
56
+ - suffix: :s13
57
+ state: :f
58
+ - suffix: :s14
59
+ state: :f
60
+
61
+ final_state: true
62
+
63
+ d:
64
+ transitions:
65
+ - suffix: :s12
66
+ state: :f
67
+ - suffix: :s13
68
+ state: :f
69
+
70
+ final_state: false
71
+
72
+ e:
73
+ transitions:
74
+ # Transitions to state G
75
+ - suffix: :s1
76
+ state: :g
77
+ - suffix: :s2
78
+ state: :g
79
+ - suffix: :s3
80
+ state: :g
81
+ - suffix: :s4
82
+ state: :g
83
+ - suffix: :s5
84
+ state: :g
85
+ # Transitions to state F
86
+ - suffix: :s14
87
+ state: :f
88
+ final_state: true
89
+
90
+ f:
91
+ transitions: []
92
+
93
+ final_state: true
94
+
95
+
96
+ g:
97
+ transitions:
98
+ - suffix: :s14
99
+ state: :f
100
+
101
+ final_state: false
102
+
103
+ h:
104
+ transitions:
105
+ # Transitions to state F
106
+ - suffix: :s14
107
+ state: :f
108
+ # Transitions to state G
109
+ - suffix: :s1
110
+ state: :g
111
+ - suffix: :s2
112
+ state: :g
113
+ - suffix: :s3
114
+ state: :g
115
+ - suffix: :s4
116
+ state: :g
117
+ - suffix: :s5
118
+ state: :g
119
+
120
+ final_state: false
121
+
@@ -0,0 +1,90 @@
1
+ s1:
2
+ name: "-(y)Um"
3
+ regex: "ım|im|um|üm"
4
+ optional_letter: "y"
5
+ check_harmony: true
6
+
7
+ s2:
8
+ name: "-sUn"
9
+ regex: "sın|sin|sun|sün"
10
+ optional_letter: false
11
+ check_harmony: true
12
+
13
+ s3:
14
+ name: "-(y)Uz"
15
+ regex: "ız|iz|uz|üz"
16
+ optional_letter: "y"
17
+ check_harmony: true
18
+
19
+ s4:
20
+ name: "-sUnUz"
21
+ regex: "sınız|siniz|sunuz|sünüz"
22
+ optional_letter: false
23
+ check_harmony: true
24
+
25
+ s5:
26
+ name: "-lAr"
27
+ regex: "lar|ler"
28
+ optional_letter: false
29
+ check_harmony: true
30
+
31
+ s6:
32
+ name: "-m"
33
+ regex: "m"
34
+ optional_letter: false
35
+ check_harmony: true
36
+
37
+ s7:
38
+ name: "-n"
39
+ regex: "n"
40
+ optional_letter: false
41
+ check_harmony: true
42
+
43
+ s8:
44
+ name: "-k"
45
+ regex: "k"
46
+ optional_letter: false
47
+ check_harmony: true
48
+
49
+ s9:
50
+ name: "-nUz"
51
+ regex: "nız|niz|nuz|nüz"
52
+ optional_letter: false
53
+ check_harmony: true
54
+
55
+ s10:
56
+ name: "-DUr"
57
+ regex: "tır|tir|tur|tür|dır|dir|dur|dür"
58
+ optional_letter: false
59
+ check_harmony: true
60
+
61
+ s11:
62
+ name: "-cAsInA"
63
+ regex: "casına|çasına|cesine|çesine"
64
+ optional_letter: false
65
+ check_harmony: true
66
+
67
+ s12:
68
+ name: "-(y)DU"
69
+ regex: "dı|di|du|dü|tı|ti|tu|tü"
70
+ optional_letter: "y"
71
+ check_harmony: true
72
+
73
+ s13:
74
+ name: "-(y)sA"
75
+ regex: "sa|se"
76
+ optional_letter: "y"
77
+ check_harmony: true
78
+
79
+ s14:
80
+ name: "-(y)mUş"
81
+ regex: "muş|miş|müş|mış"
82
+ optional_letter: "y"
83
+ check_harmony: true
84
+
85
+ s15:
86
+ name: "-(y)ken"
87
+ regex: "ken"
88
+ optional_letter: "y"
89
+ check_harmony: true
90
+
@@ -0,0 +1,177 @@
1
+ # The order of the transitions is very crucial.
2
+ a:
3
+ transitions:
4
+ - suffix: :s16
5
+ state: :c
6
+ - suffix: :s7
7
+ state: :k
8
+ - suffix: :s3
9
+ state: :h
10
+ - suffix: :s5
11
+ state: :h
12
+ - suffix: :s1
13
+ state: :l
14
+ - suffix: :s14
15
+ state: :f
16
+ - suffix: :s15
17
+ state: :g
18
+ - suffix: :s17
19
+ state: :e
20
+ - suffix: :s10
21
+ state: :e
22
+ - suffix: :s19
23
+ state: :m
24
+ - suffix: :s4
25
+ state: :h
26
+ - suffix: :s9
27
+ state: :c
28
+ - suffix: :s12
29
+ state: :f
30
+ - suffix: :s13
31
+ state: :b
32
+ - suffix: :s18
33
+ state: :d
34
+ - suffix: :s2
35
+ state: :h
36
+ - suffix: :s6
37
+ state: :h
38
+ - suffix: :s8
39
+ state: :b
40
+ - suffix: :s11
41
+ state: :b
42
+
43
+ final_state: true
44
+
45
+ b:
46
+ transitions:
47
+ - suffix: :s3
48
+ state: :h
49
+ - suffix: :s5
50
+ state: :h
51
+ - suffix: :s1
52
+ state: :l
53
+ - suffix: :s4
54
+ state: :h
55
+ - suffix: :s2
56
+ state: :h
57
+
58
+ final_state: true
59
+
60
+ c:
61
+ transitions:
62
+ # Transitions to state K
63
+ - suffix: :s7
64
+ state: :k
65
+ # Transitions to state H
66
+ - suffix: :s6
67
+ state: :h
68
+
69
+ final_state: false
70
+
71
+ d:
72
+ transitions:
73
+ # Transitions to state F
74
+ - suffix: :s14
75
+ state: :f
76
+ # Transitions to state E
77
+ - suffix: :s10
78
+ state: :e
79
+ # Transitions to state B
80
+ - suffix: :s13
81
+ state: :b
82
+
83
+ final_state: false
84
+
85
+ e:
86
+ transitions:
87
+ - suffix: :s7
88
+ state: :k
89
+ - suffix: :s3
90
+ state: :h
91
+ - suffix: :s5
92
+ state: :h
93
+ - suffix: :s1
94
+ state: :l
95
+ - suffix: :s4
96
+ state: :h
97
+ - suffix: :s18
98
+ state: :d
99
+ - suffix: :s2
100
+ state: :h
101
+ - suffix: :s6
102
+ state: :h
103
+
104
+ final_state: true
105
+
106
+ f:
107
+ transitions:
108
+ # Transitions to state K
109
+ - suffix: :s7
110
+ state: :k
111
+ # Transitions to state D
112
+ - suffix: :s18
113
+ state: :d
114
+ # Transitions to state H
115
+ - suffix: :s6
116
+ state: :h
117
+
118
+ final_state: false
119
+
120
+
121
+ g:
122
+ transitions:
123
+ - suffix: :s5
124
+ state: :h
125
+ - suffix: :s3
126
+ state: :h
127
+ - suffix: :s1
128
+ state: :l
129
+ - suffix: :s4
130
+ state: :h
131
+ - suffix: :s18
132
+ state: :d
133
+ - suffix: :s2
134
+ state: :h
135
+
136
+ final_state: true
137
+
138
+ h:
139
+ transitions:
140
+ # Transitions to state L
141
+ - suffix: :s1
142
+ state: :l
143
+
144
+ final_state: true
145
+
146
+ k:
147
+ transitions: []
148
+
149
+ final_state: true
150
+
151
+ l:
152
+ transitions:
153
+ # Transitions to state D
154
+ - suffix: :s18
155
+ state: :d
156
+
157
+ final_state: true
158
+
159
+ m:
160
+ transitions:
161
+ - suffix: :s7
162
+ state: :k
163
+ - suffix: :s3
164
+ state: :h
165
+ - suffix: :s5
166
+ state: :h
167
+ - suffix: :s1
168
+ state: :l
169
+ - suffix: :s4
170
+ state: :h
171
+ - suffix: :s2
172
+ state: :h
173
+ - suffix: :s6
174
+ state: :h
175
+
176
+ final_state: true
177
+
@@ -0,0 +1,113 @@
1
+ s1:
2
+ name: "-lAr"
3
+ regex: "lar|ler"
4
+ optional_letter: false
5
+ check_harmony: true
6
+
7
+ s2:
8
+ name: "-(U)m"
9
+ regex: "m"
10
+ optional_letter: "ı|i|u|ü"
11
+ check_harmony: true
12
+
13
+ s3:
14
+ name: "-(U)mUz"
15
+ regex: "mız|miz|muz|müz"
16
+ optional_letter: "ı|i|u|ü"
17
+ check_harmony: true
18
+
19
+ s4:
20
+ name: "-Un"
21
+ regex: "ın|in|un|ün"
22
+ optional_letter: false
23
+ check_harmony: true
24
+
25
+ s5:
26
+ name: "-(U)nUz"
27
+ regex: "nız|niz|nuz|nüz"
28
+ optional_letter: "ı|i|u|ü"
29
+ check_harmony: true
30
+
31
+ s6:
32
+ name: "-(s)U"
33
+ regex: "ı|i|u|ü"
34
+ optional_letter: "s"
35
+ check_harmony: true
36
+
37
+ s7:
38
+ name: "-lArI"
39
+ regex: "ları|leri"
40
+ optional_letter: false
41
+ check_harmony: true
42
+
43
+ s8:
44
+ name: "-(y)U"
45
+ regex: "ı|i|u|ü"
46
+ optional_letter: "y"
47
+ check_harmony: true
48
+
49
+ s9:
50
+ name: "-nU"
51
+ regex: "nı|ni|nu|nü"
52
+ optional_letter: false
53
+ check_harmony: true
54
+
55
+ s10:
56
+ name: "-(n)Un"
57
+ regex: "ın|in|un|ün"
58
+ optional_letter: "n"
59
+ check_harmony: true
60
+
61
+ s11:
62
+ name: "-(y)A"
63
+ regex: "a|e"
64
+ optional_letter: "y"
65
+ check_harmony: true
66
+
67
+ s12:
68
+ name: "-nA"
69
+ regex: "na|ne"
70
+ optional_letter: false
71
+ check_harmony: true
72
+
73
+ s13:
74
+ name: "-DA"
75
+ regex: "da|de|ta|te"
76
+ optional_letter: false
77
+ check_harmony: true
78
+
79
+ s14:
80
+ name: "-nDA"
81
+ regex: "nta|nte|nda|nde"
82
+ optional_letter: false
83
+ check_harmony: true
84
+
85
+ s15:
86
+ name: "-DAn"
87
+ regex: "dan|tan|den|ten"
88
+ optional_letter: false
89
+ check_harmony: true
90
+
91
+ s16:
92
+ name: "-nDAn"
93
+ regex: "ndan|ntan|nden|nten"
94
+ optional_letter: false
95
+ check_harmony: true
96
+
97
+ s17:
98
+ name: "-(y)lA"
99
+ regex: "la|le"
100
+ optional_letter: "y"
101
+ check_harmony: true
102
+
103
+ s18:
104
+ name: "-ki"
105
+ regex: "ki"
106
+ optional_letter: false
107
+ check_harmony: false
108
+
109
+ s19:
110
+ name: "-(n)cA"
111
+ regex: "ca|ce"
112
+ optional_letter: "n"
113
+ check_harmony: true