turkish_stemmer 0.1.2 → 0.1.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- metadata +6 -38
- data/.gitignore +0 -18
- data/.rspec +0 -2
- data/Gemfile +0 -4
- data/LICENSE.txt +0 -22
- data/README.md +0 -282
- data/Rakefile +0 -21
- data/benchmarks/stemmers_comparison.rb +0 -16
- data/benchmarks/stemming_samples.txt +0 -17916
- data/benchmarks/turkish_word_recognition.rb +0 -26
- data/config/derivational_states.yml +0 -10
- data/config/derivational_suffixes.yml +0 -6
- data/config/nominal_verb_states.yml +0 -121
- data/config/nominal_verb_suffixes.yml +0 -90
- data/config/noun_states.yml +0 -177
- data/config/noun_suffixes.yml +0 -113
- data/config/stemmer.yml +0 -206
- data/lib/turkish_stemmer.rb +0 -455
- data/lib/turkish_stemmer/version.rb +0 -3
- data/spec/fixtures/simple_state.yml +0 -14
- data/spec/fixtures/simple_state_02.yml +0 -21
- data/spec/fixtures/simple_suffix.yml +0 -7
- data/spec/fixtures/simple_transition.yml +0 -7
- data/spec/spec_helper.rb +0 -19
- data/spec/support/fixtures.csv +0 -101
- data/spec/turkish_stemmer_spec.rb +0 -522
- data/turkish_stemmer.gemspec +0 -35
@@ -1,26 +0,0 @@
|
|
1
|
-
# coding : utf-8
|
2
|
-
require 'benchmark'
|
3
|
-
require 'turkish_stemmer'
|
4
|
-
|
5
|
-
Benchmark.bmbm(7) do |x|
|
6
|
-
|
7
|
-
x.report('regex') do
|
8
|
-
TurkishStemmer.class_eval do
|
9
|
-
def self.turkish?(word)
|
10
|
-
!! word.match(TurkishStemmer::ALPHABET)
|
11
|
-
end
|
12
|
-
end
|
13
|
-
|
14
|
-
100_000.times { TurkishStemmer.turkish?("aaa") }
|
15
|
-
end
|
16
|
-
|
17
|
-
x.report('loop') do
|
18
|
-
TurkishStemmer.class_eval do
|
19
|
-
def self.turkish?(word)
|
20
|
-
!! word.chars.to_a.all? { |c| "abcçdefgğhıijklmnoöprsştuüvyz".include?(c) }
|
21
|
-
end
|
22
|
-
end
|
23
|
-
|
24
|
-
100_000.times { TurkishStemmer.turkish?("aaaa") }
|
25
|
-
end
|
26
|
-
end
|
@@ -1,121 +0,0 @@
|
|
1
|
-
# The order of the transitions is very crucial.
|
2
|
-
a:
|
3
|
-
transitions:
|
4
|
-
# Transitions to state B
|
5
|
-
- suffix: :s1
|
6
|
-
state: :b
|
7
|
-
- suffix: :s2
|
8
|
-
state: :b
|
9
|
-
- suffix: :s4
|
10
|
-
state: :b
|
11
|
-
- suffix: :s3
|
12
|
-
state: :b
|
13
|
-
# Transitions to state C
|
14
|
-
- suffix: :s5
|
15
|
-
state: :c
|
16
|
-
# Transitions to state D
|
17
|
-
- suffix: :s6
|
18
|
-
state: :d
|
19
|
-
- suffix: :s7
|
20
|
-
state: :d
|
21
|
-
- suffix: :s8
|
22
|
-
state: :d
|
23
|
-
- suffix: :s9
|
24
|
-
state: :d
|
25
|
-
# Transitions to state E
|
26
|
-
- suffix: :s10
|
27
|
-
state: :e
|
28
|
-
# Transitions to state F
|
29
|
-
- suffix: :s12
|
30
|
-
state: :f
|
31
|
-
- suffix: :s13
|
32
|
-
state: :f
|
33
|
-
- suffix: :s14
|
34
|
-
state: :f
|
35
|
-
- suffix: :s15
|
36
|
-
state: :f
|
37
|
-
# Transitions to state H
|
38
|
-
- suffix: :s11
|
39
|
-
state: :h
|
40
|
-
|
41
|
-
final_state: false
|
42
|
-
|
43
|
-
b:
|
44
|
-
transitions:
|
45
|
-
- suffix: :s14
|
46
|
-
state: :f
|
47
|
-
|
48
|
-
final_state: true
|
49
|
-
|
50
|
-
c:
|
51
|
-
transitions:
|
52
|
-
- suffix: :s10
|
53
|
-
state: :f
|
54
|
-
- suffix: :s12
|
55
|
-
state: :f
|
56
|
-
- suffix: :s13
|
57
|
-
state: :f
|
58
|
-
- suffix: :s14
|
59
|
-
state: :f
|
60
|
-
|
61
|
-
final_state: true
|
62
|
-
|
63
|
-
d:
|
64
|
-
transitions:
|
65
|
-
- suffix: :s12
|
66
|
-
state: :f
|
67
|
-
- suffix: :s13
|
68
|
-
state: :f
|
69
|
-
|
70
|
-
final_state: false
|
71
|
-
|
72
|
-
e:
|
73
|
-
transitions:
|
74
|
-
# Transitions to state G
|
75
|
-
- suffix: :s1
|
76
|
-
state: :g
|
77
|
-
- suffix: :s2
|
78
|
-
state: :g
|
79
|
-
- suffix: :s3
|
80
|
-
state: :g
|
81
|
-
- suffix: :s4
|
82
|
-
state: :g
|
83
|
-
- suffix: :s5
|
84
|
-
state: :g
|
85
|
-
# Transitions to state F
|
86
|
-
- suffix: :s14
|
87
|
-
state: :f
|
88
|
-
final_state: true
|
89
|
-
|
90
|
-
f:
|
91
|
-
transitions: []
|
92
|
-
|
93
|
-
final_state: true
|
94
|
-
|
95
|
-
|
96
|
-
g:
|
97
|
-
transitions:
|
98
|
-
- suffix: :s14
|
99
|
-
state: :f
|
100
|
-
|
101
|
-
final_state: false
|
102
|
-
|
103
|
-
h:
|
104
|
-
transitions:
|
105
|
-
# Transitions to state F
|
106
|
-
- suffix: :s14
|
107
|
-
state: :f
|
108
|
-
# Transitions to state G
|
109
|
-
- suffix: :s1
|
110
|
-
state: :g
|
111
|
-
- suffix: :s2
|
112
|
-
state: :g
|
113
|
-
- suffix: :s3
|
114
|
-
state: :g
|
115
|
-
- suffix: :s4
|
116
|
-
state: :g
|
117
|
-
- suffix: :s5
|
118
|
-
state: :g
|
119
|
-
|
120
|
-
final_state: false
|
121
|
-
|
@@ -1,90 +0,0 @@
|
|
1
|
-
s1:
|
2
|
-
name: "-(y)Um"
|
3
|
-
regex: "ım|im|um|üm"
|
4
|
-
optional_letter: "y"
|
5
|
-
check_harmony: true
|
6
|
-
|
7
|
-
s2:
|
8
|
-
name: "-sUn"
|
9
|
-
regex: "sın|sin|sun|sün"
|
10
|
-
optional_letter: false
|
11
|
-
check_harmony: true
|
12
|
-
|
13
|
-
s3:
|
14
|
-
name: "-(y)Uz"
|
15
|
-
regex: "ız|iz|uz|üz"
|
16
|
-
optional_letter: "y"
|
17
|
-
check_harmony: true
|
18
|
-
|
19
|
-
s4:
|
20
|
-
name: "-sUnUz"
|
21
|
-
regex: "sınız|siniz|sunuz|sünüz"
|
22
|
-
optional_letter: false
|
23
|
-
check_harmony: true
|
24
|
-
|
25
|
-
s5:
|
26
|
-
name: "-lAr"
|
27
|
-
regex: "lar|ler"
|
28
|
-
optional_letter: false
|
29
|
-
check_harmony: true
|
30
|
-
|
31
|
-
s6:
|
32
|
-
name: "-m"
|
33
|
-
regex: "m"
|
34
|
-
optional_letter: false
|
35
|
-
check_harmony: true
|
36
|
-
|
37
|
-
s7:
|
38
|
-
name: "-n"
|
39
|
-
regex: "n"
|
40
|
-
optional_letter: false
|
41
|
-
check_harmony: true
|
42
|
-
|
43
|
-
s8:
|
44
|
-
name: "-k"
|
45
|
-
regex: "k"
|
46
|
-
optional_letter: false
|
47
|
-
check_harmony: true
|
48
|
-
|
49
|
-
s9:
|
50
|
-
name: "-nUz"
|
51
|
-
regex: "nız|niz|nuz|nüz"
|
52
|
-
optional_letter: false
|
53
|
-
check_harmony: true
|
54
|
-
|
55
|
-
s10:
|
56
|
-
name: "-DUr"
|
57
|
-
regex: "tır|tir|tur|tür|dır|dir|dur|dür"
|
58
|
-
optional_letter: false
|
59
|
-
check_harmony: true
|
60
|
-
|
61
|
-
s11:
|
62
|
-
name: "-cAsInA"
|
63
|
-
regex: "casına|çasına|cesine|çesine"
|
64
|
-
optional_letter: false
|
65
|
-
check_harmony: true
|
66
|
-
|
67
|
-
s12:
|
68
|
-
name: "-(y)DU"
|
69
|
-
regex: "dı|di|du|dü|tı|ti|tu|tü"
|
70
|
-
optional_letter: "y"
|
71
|
-
check_harmony: true
|
72
|
-
|
73
|
-
s13:
|
74
|
-
name: "-(y)sA"
|
75
|
-
regex: "sa|se"
|
76
|
-
optional_letter: "y"
|
77
|
-
check_harmony: true
|
78
|
-
|
79
|
-
s14:
|
80
|
-
name: "-(y)mUş"
|
81
|
-
regex: "muş|miş|müş|mış"
|
82
|
-
optional_letter: "y"
|
83
|
-
check_harmony: true
|
84
|
-
|
85
|
-
s15:
|
86
|
-
name: "-(y)ken"
|
87
|
-
regex: "ken"
|
88
|
-
optional_letter: "y"
|
89
|
-
check_harmony: true
|
90
|
-
|
data/config/noun_states.yml
DELETED
@@ -1,177 +0,0 @@
|
|
1
|
-
# The order of the transitions is very crucial.
|
2
|
-
a:
|
3
|
-
transitions:
|
4
|
-
- suffix: :s16
|
5
|
-
state: :c
|
6
|
-
- suffix: :s7
|
7
|
-
state: :k
|
8
|
-
- suffix: :s3
|
9
|
-
state: :h
|
10
|
-
- suffix: :s5
|
11
|
-
state: :h
|
12
|
-
- suffix: :s1
|
13
|
-
state: :l
|
14
|
-
- suffix: :s14
|
15
|
-
state: :f
|
16
|
-
- suffix: :s15
|
17
|
-
state: :g
|
18
|
-
- suffix: :s17
|
19
|
-
state: :e
|
20
|
-
- suffix: :s10
|
21
|
-
state: :e
|
22
|
-
- suffix: :s19
|
23
|
-
state: :m
|
24
|
-
- suffix: :s4
|
25
|
-
state: :h
|
26
|
-
- suffix: :s9
|
27
|
-
state: :c
|
28
|
-
- suffix: :s12
|
29
|
-
state: :f
|
30
|
-
- suffix: :s13
|
31
|
-
state: :b
|
32
|
-
- suffix: :s18
|
33
|
-
state: :d
|
34
|
-
- suffix: :s2
|
35
|
-
state: :h
|
36
|
-
- suffix: :s6
|
37
|
-
state: :h
|
38
|
-
- suffix: :s8
|
39
|
-
state: :b
|
40
|
-
- suffix: :s11
|
41
|
-
state: :b
|
42
|
-
|
43
|
-
final_state: true
|
44
|
-
|
45
|
-
b:
|
46
|
-
transitions:
|
47
|
-
- suffix: :s3
|
48
|
-
state: :h
|
49
|
-
- suffix: :s5
|
50
|
-
state: :h
|
51
|
-
- suffix: :s1
|
52
|
-
state: :l
|
53
|
-
- suffix: :s4
|
54
|
-
state: :h
|
55
|
-
- suffix: :s2
|
56
|
-
state: :h
|
57
|
-
|
58
|
-
final_state: true
|
59
|
-
|
60
|
-
c:
|
61
|
-
transitions:
|
62
|
-
# Transitions to state K
|
63
|
-
- suffix: :s7
|
64
|
-
state: :k
|
65
|
-
# Transitions to state H
|
66
|
-
- suffix: :s6
|
67
|
-
state: :h
|
68
|
-
|
69
|
-
final_state: false
|
70
|
-
|
71
|
-
d:
|
72
|
-
transitions:
|
73
|
-
# Transitions to state F
|
74
|
-
- suffix: :s14
|
75
|
-
state: :f
|
76
|
-
# Transitions to state E
|
77
|
-
- suffix: :s10
|
78
|
-
state: :e
|
79
|
-
# Transitions to state B
|
80
|
-
- suffix: :s13
|
81
|
-
state: :b
|
82
|
-
|
83
|
-
final_state: false
|
84
|
-
|
85
|
-
e:
|
86
|
-
transitions:
|
87
|
-
- suffix: :s7
|
88
|
-
state: :k
|
89
|
-
- suffix: :s3
|
90
|
-
state: :h
|
91
|
-
- suffix: :s5
|
92
|
-
state: :h
|
93
|
-
- suffix: :s1
|
94
|
-
state: :l
|
95
|
-
- suffix: :s4
|
96
|
-
state: :h
|
97
|
-
- suffix: :s18
|
98
|
-
state: :d
|
99
|
-
- suffix: :s2
|
100
|
-
state: :h
|
101
|
-
- suffix: :s6
|
102
|
-
state: :h
|
103
|
-
|
104
|
-
final_state: true
|
105
|
-
|
106
|
-
f:
|
107
|
-
transitions:
|
108
|
-
# Transitions to state K
|
109
|
-
- suffix: :s7
|
110
|
-
state: :k
|
111
|
-
# Transitions to state D
|
112
|
-
- suffix: :s18
|
113
|
-
state: :d
|
114
|
-
# Transitions to state H
|
115
|
-
- suffix: :s6
|
116
|
-
state: :h
|
117
|
-
|
118
|
-
final_state: false
|
119
|
-
|
120
|
-
|
121
|
-
g:
|
122
|
-
transitions:
|
123
|
-
- suffix: :s5
|
124
|
-
state: :h
|
125
|
-
- suffix: :s3
|
126
|
-
state: :h
|
127
|
-
- suffix: :s1
|
128
|
-
state: :l
|
129
|
-
- suffix: :s4
|
130
|
-
state: :h
|
131
|
-
- suffix: :s18
|
132
|
-
state: :d
|
133
|
-
- suffix: :s2
|
134
|
-
state: :h
|
135
|
-
|
136
|
-
final_state: true
|
137
|
-
|
138
|
-
h:
|
139
|
-
transitions:
|
140
|
-
# Transitions to state L
|
141
|
-
- suffix: :s1
|
142
|
-
state: :l
|
143
|
-
|
144
|
-
final_state: true
|
145
|
-
|
146
|
-
k:
|
147
|
-
transitions: []
|
148
|
-
|
149
|
-
final_state: true
|
150
|
-
|
151
|
-
l:
|
152
|
-
transitions:
|
153
|
-
# Transitions to state D
|
154
|
-
- suffix: :s18
|
155
|
-
state: :d
|
156
|
-
|
157
|
-
final_state: true
|
158
|
-
|
159
|
-
m:
|
160
|
-
transitions:
|
161
|
-
- suffix: :s7
|
162
|
-
state: :k
|
163
|
-
- suffix: :s3
|
164
|
-
state: :h
|
165
|
-
- suffix: :s5
|
166
|
-
state: :h
|
167
|
-
- suffix: :s1
|
168
|
-
state: :l
|
169
|
-
- suffix: :s4
|
170
|
-
state: :h
|
171
|
-
- suffix: :s2
|
172
|
-
state: :h
|
173
|
-
- suffix: :s6
|
174
|
-
state: :h
|
175
|
-
|
176
|
-
final_state: true
|
177
|
-
|
data/config/noun_suffixes.yml
DELETED
@@ -1,113 +0,0 @@
|
|
1
|
-
s1:
|
2
|
-
name: "-lAr"
|
3
|
-
regex: "lar|ler"
|
4
|
-
optional_letter: false
|
5
|
-
check_harmony: true
|
6
|
-
|
7
|
-
s2:
|
8
|
-
name: "-(U)m"
|
9
|
-
regex: "m"
|
10
|
-
optional_letter: "ı|i|u|ü"
|
11
|
-
check_harmony: true
|
12
|
-
|
13
|
-
s3:
|
14
|
-
name: "-(U)mUz"
|
15
|
-
regex: "mız|miz|muz|müz"
|
16
|
-
optional_letter: "ı|i|u|ü"
|
17
|
-
check_harmony: true
|
18
|
-
|
19
|
-
s4:
|
20
|
-
name: "-Un"
|
21
|
-
regex: "ın|in|un|ün"
|
22
|
-
optional_letter: false
|
23
|
-
check_harmony: true
|
24
|
-
|
25
|
-
s5:
|
26
|
-
name: "-(U)nUz"
|
27
|
-
regex: "nız|niz|nuz|nüz"
|
28
|
-
optional_letter: "ı|i|u|ü"
|
29
|
-
check_harmony: true
|
30
|
-
|
31
|
-
s6:
|
32
|
-
name: "-(s)U"
|
33
|
-
regex: "ı|i|u|ü"
|
34
|
-
optional_letter: "s"
|
35
|
-
check_harmony: true
|
36
|
-
|
37
|
-
s7:
|
38
|
-
name: "-lArI"
|
39
|
-
regex: "ları|leri"
|
40
|
-
optional_letter: false
|
41
|
-
check_harmony: true
|
42
|
-
|
43
|
-
s8:
|
44
|
-
name: "-(y)U"
|
45
|
-
regex: "ı|i|u|ü"
|
46
|
-
optional_letter: "y"
|
47
|
-
check_harmony: true
|
48
|
-
|
49
|
-
s9:
|
50
|
-
name: "-nU"
|
51
|
-
regex: "nı|ni|nu|nü"
|
52
|
-
optional_letter: false
|
53
|
-
check_harmony: true
|
54
|
-
|
55
|
-
s10:
|
56
|
-
name: "-(n)Un"
|
57
|
-
regex: "ın|in|un|ün"
|
58
|
-
optional_letter: "n"
|
59
|
-
check_harmony: true
|
60
|
-
|
61
|
-
s11:
|
62
|
-
name: "-(y)A"
|
63
|
-
regex: "a|e"
|
64
|
-
optional_letter: "y"
|
65
|
-
check_harmony: true
|
66
|
-
|
67
|
-
s12:
|
68
|
-
name: "-nA"
|
69
|
-
regex: "na|ne"
|
70
|
-
optional_letter: false
|
71
|
-
check_harmony: true
|
72
|
-
|
73
|
-
s13:
|
74
|
-
name: "-DA"
|
75
|
-
regex: "da|de|ta|te"
|
76
|
-
optional_letter: false
|
77
|
-
check_harmony: true
|
78
|
-
|
79
|
-
s14:
|
80
|
-
name: "-nDA"
|
81
|
-
regex: "nta|nte|nda|nde"
|
82
|
-
optional_letter: false
|
83
|
-
check_harmony: true
|
84
|
-
|
85
|
-
s15:
|
86
|
-
name: "-DAn"
|
87
|
-
regex: "dan|tan|den|ten"
|
88
|
-
optional_letter: false
|
89
|
-
check_harmony: true
|
90
|
-
|
91
|
-
s16:
|
92
|
-
name: "-nDAn"
|
93
|
-
regex: "ndan|ntan|nden|nten"
|
94
|
-
optional_letter: false
|
95
|
-
check_harmony: true
|
96
|
-
|
97
|
-
s17:
|
98
|
-
name: "-(y)lA"
|
99
|
-
regex: "la|le"
|
100
|
-
optional_letter: "y"
|
101
|
-
check_harmony: true
|
102
|
-
|
103
|
-
s18:
|
104
|
-
name: "-ki"
|
105
|
-
regex: "ki"
|
106
|
-
optional_letter: false
|
107
|
-
check_harmony: false
|
108
|
-
|
109
|
-
s19:
|
110
|
-
name: "-(n)cA"
|
111
|
-
regex: "ca|ce"
|
112
|
-
optional_letter: "n"
|
113
|
-
check_harmony: true
|