feedbackmine-language_detector 0.1.1 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/language_detector.rb +99 -98
- data/lib/model.yml +35 -35
- data/test/language_detector_test.rb +13 -9
- metadata +3 -2
data/lib/language_detector.rb
CHANGED
@@ -6,7 +6,7 @@ class LanguageDetector
|
|
6
6
|
def detect text
|
7
7
|
@profiles ||= load_model
|
8
8
|
|
9
|
-
p = Profile.new("")
|
9
|
+
p = LanguageDetector::Profile.new("")
|
10
10
|
p.init_with_string text
|
11
11
|
best_profile = nil
|
12
12
|
best_distance = nil
|
@@ -97,7 +97,7 @@ class LanguageDetector
|
|
97
97
|
|
98
98
|
profiles = []
|
99
99
|
training_data.each {|data|
|
100
|
-
p = Profile.new data[0]
|
100
|
+
p = LanguageDetector::Profile.new data[0]
|
101
101
|
p.init_with_file data[1]
|
102
102
|
profiles << p
|
103
103
|
}
|
@@ -112,118 +112,119 @@ class LanguageDetector
|
|
112
112
|
filename = File.expand_path(File.join(File.dirname(__FILE__), "model.yml"))
|
113
113
|
@profiles = YAML.load_file(filename)
|
114
114
|
end
|
115
|
-
end
|
116
|
-
|
117
|
-
class Profile
|
118
|
-
|
119
|
-
PUNCTUATIONS = [?\n, ?\r, ?\t, ?\s, ?!, ?", ?#, ?$, ?%, ?&, ?', ?(, ?), ?*, ?+, ?,, ?-, ?., ?/,
|
120
|
-
?0, ?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9,
|
121
|
-
?:, ?;, ?<, ?=, ?>, ??, ?@, ?[, ?\\, ?], ?^, ?_, ?`, ?{, ?|, ?}, ?~]
|
122
|
-
|
123
|
-
LIMIT = 2000
|
124
115
|
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
116
|
+
class LanguageDetector::Profile
|
117
|
+
|
118
|
+
PUNCTUATIONS = [?\n, ?\r, ?\t, ?\s, ?!, ?", ?#, ?$, ?%, ?&, ?', ?(, ?), ?*, ?+, ?,, ?-, ?., ?/,
|
119
|
+
?0, ?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9,
|
120
|
+
?:, ?;, ?<, ?=, ?>, ??, ?@, ?[, ?\\, ?], ?^, ?_, ?`, ?{, ?|, ?}, ?~]
|
121
|
+
|
122
|
+
LIMIT = 2000
|
123
|
+
|
124
|
+
def compute_distance other_profile
|
125
|
+
distance = 0
|
126
|
+
other_profile.ngrams.each {|k, v|
|
127
|
+
n = @ngrams[k]
|
128
|
+
if n
|
129
|
+
distance += (v - n).abs
|
130
|
+
else
|
131
|
+
distance += LanguageDetector::Profile::LIMIT
|
132
|
+
end
|
133
|
+
}
|
134
|
+
return distance
|
135
|
+
end
|
137
136
|
|
138
|
-
|
137
|
+
attr_reader :ngrams, :name
|
139
138
|
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
139
|
+
def initialize(name)
|
140
|
+
@name = name
|
141
|
+
@puctuations = {}
|
142
|
+
PUNCTUATIONS.each {|p| @puctuations[p] = 1}
|
143
|
+
@ngrams = {}
|
144
|
+
end
|
146
145
|
|
147
|
-
|
148
|
-
|
146
|
+
def init_with_file filename
|
147
|
+
ngram_count = {}
|
148
|
+
|
149
|
+
path = File.expand_path(File.join(File.dirname(__FILE__), "training_data/" + filename))
|
150
|
+
puts "training with " + path
|
151
|
+
File.open(path).each_line{ |line|
|
152
|
+
_init_with_string line, ngram_count
|
153
|
+
}
|
154
|
+
|
155
|
+
a = ngram_count.sort {|a,b| b[1] <=> a[1]}
|
156
|
+
i = 1
|
157
|
+
a.each {|t|
|
158
|
+
@ngrams[t[0]] = i
|
159
|
+
i += 1
|
160
|
+
break if i > LIMIT
|
161
|
+
}
|
162
|
+
end
|
149
163
|
|
150
|
-
|
151
|
-
|
152
|
-
File.open(path).each_line{ |line|
|
153
|
-
_init_with_string line, ngram_count
|
154
|
-
}
|
164
|
+
def init_with_string str
|
165
|
+
ngram_count = {}
|
155
166
|
|
156
|
-
|
157
|
-
i = 1
|
158
|
-
a.each {|t|
|
159
|
-
@ngrams[t[0]] = i
|
160
|
-
i += 1
|
161
|
-
break if i > LIMIT
|
162
|
-
}
|
163
|
-
end
|
167
|
+
_init_with_string str, ngram_count
|
164
168
|
|
165
|
-
|
166
|
-
|
169
|
+
a = ngram_count.sort {|a,b| b[1] <=> a[1]}
|
170
|
+
i = 1
|
171
|
+
a.each {|t|
|
172
|
+
@ngrams[t[0]] = i
|
173
|
+
i += 1
|
174
|
+
break if i > LIMIT
|
175
|
+
}
|
176
|
+
end
|
167
177
|
|
168
|
-
_init_with_string str, ngram_count
|
178
|
+
def _init_with_string str, ngram_count
|
179
|
+
tokens = tokenize(str)
|
180
|
+
tokens.each {|token|
|
181
|
+
count_ngram token, 2, ngram_count
|
182
|
+
count_ngram token, 3, ngram_count
|
183
|
+
count_ngram token, 4, ngram_count
|
184
|
+
count_ngram token, 5, ngram_count
|
185
|
+
}
|
186
|
+
end
|
169
187
|
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
188
|
+
def tokenize str
|
189
|
+
tokens = []
|
190
|
+
s = ''
|
191
|
+
str.each_byte {|b|
|
192
|
+
if is_puctuation?(b)
|
193
|
+
tokens << s unless s.empty?
|
194
|
+
s = ''
|
195
|
+
else
|
196
|
+
s << b
|
197
|
+
end
|
198
|
+
}
|
199
|
+
tokens << s unless s.empty?
|
200
|
+
return tokens
|
201
|
+
end
|
178
202
|
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
count_ngram token, 2, ngram_count
|
183
|
-
count_ngram token, 3, ngram_count
|
184
|
-
count_ngram token, 4, ngram_count
|
185
|
-
count_ngram token, 5, ngram_count
|
186
|
-
}
|
187
|
-
end
|
203
|
+
def is_puctuation? b
|
204
|
+
@puctuations[b]
|
205
|
+
end
|
188
206
|
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
if is_puctuation?(b)
|
194
|
-
tokens << s unless s.empty?
|
207
|
+
def count_ngram token, n, counts
|
208
|
+
token = "_#{token}#{'_' * (n-1)}" if n > 1 && token.jlength >= n
|
209
|
+
i = 0
|
210
|
+
while i + n <= token.length
|
195
211
|
s = ''
|
196
|
-
|
197
|
-
|
212
|
+
j = 0
|
213
|
+
while j < n
|
214
|
+
s << token[i+j]
|
215
|
+
j += 1
|
216
|
+
end
|
217
|
+
if counts[s]
|
218
|
+
counts[s] = counts[s] + 1
|
219
|
+
else
|
220
|
+
counts[s] = 1
|
221
|
+
end
|
222
|
+
i += 1
|
198
223
|
end
|
199
|
-
}
|
200
|
-
tokens << s unless s.empty?
|
201
|
-
return tokens
|
202
|
-
end
|
203
|
-
|
204
|
-
def is_puctuation? b
|
205
|
-
@puctuations[b]
|
206
|
-
end
|
207
224
|
|
208
|
-
|
209
|
-
token = "_#{token}#{'_' * (n-1)}" if n > 1 && token.jlength >= n
|
210
|
-
i = 0
|
211
|
-
while i + n <= token.length
|
212
|
-
s = ''
|
213
|
-
j = 0
|
214
|
-
while j < n
|
215
|
-
s << token[i+j]
|
216
|
-
j += 1
|
217
|
-
end
|
218
|
-
if counts[s]
|
219
|
-
counts[s] = counts[s] + 1
|
220
|
-
else
|
221
|
-
counts[s] = 1
|
222
|
-
end
|
223
|
-
i += 1
|
225
|
+
return counts
|
224
226
|
end
|
225
227
|
|
226
|
-
return counts
|
227
228
|
end
|
228
229
|
|
229
230
|
end
|
data/lib/model.yml
CHANGED
@@ -1,5 +1,5 @@
|
|
1
1
|
---
|
2
|
-
- !ruby/object:Profile
|
2
|
+
- !ruby/object:LanguageDetector::Profile
|
3
3
|
name: ar
|
4
4
|
ngrams:
|
5
5
|
? !binary "qtmH\n"
|
@@ -5149,7 +5149,7 @@
|
|
5149
5149
|
43: 1
|
5150
5150
|
32: 1
|
5151
5151
|
10: 1
|
5152
|
-
- !ruby/object:Profile
|
5152
|
+
- !ruby/object:LanguageDetector::Profile
|
5153
5153
|
name: bg
|
5154
5154
|
ngrams:
|
5155
5155
|
? !binary "0L7QtNA=\n"
|
@@ -10519,7 +10519,7 @@
|
|
10519
10519
|
43: 1
|
10520
10520
|
32: 1
|
10521
10521
|
10: 1
|
10522
|
-
- !ruby/object:Profile
|
10522
|
+
- !ruby/object:LanguageDetector::Profile
|
10523
10523
|
name: cs
|
10524
10524
|
ngrams:
|
10525
10525
|
He: 765
|
@@ -12641,7 +12641,7 @@
|
|
12641
12641
|
43: 1
|
12642
12642
|
32: 1
|
12643
12643
|
10: 1
|
12644
|
-
- !ruby/object:Profile
|
12644
|
+
- !ruby/object:LanguageDetector::Profile
|
12645
12645
|
name: da
|
12646
12646
|
ngrams:
|
12647
12647
|
erede: 347
|
@@ -14691,7 +14691,7 @@
|
|
14691
14691
|
43: 1
|
14692
14692
|
32: 1
|
14693
14693
|
10: 1
|
14694
|
-
- !ruby/object:Profile
|
14694
|
+
- !ruby/object:LanguageDetector::Profile
|
14695
14695
|
name: de
|
14696
14696
|
ngrams:
|
14697
14697
|
Ope: 1204
|
@@ -16751,7 +16751,7 @@
|
|
16751
16751
|
43: 1
|
16752
16752
|
32: 1
|
16753
16753
|
10: 1
|
16754
|
-
- !ruby/object:Profile
|
16754
|
+
- !ruby/object:LanguageDetector::Profile
|
16755
16755
|
name: el
|
16756
16756
|
ngrams:
|
16757
16757
|
? !binary "zpTPic8=\n"
|
@@ -22079,7 +22079,7 @@
|
|
22079
22079
|
43: 1
|
22080
22080
|
32: 1
|
22081
22081
|
10: 1
|
22082
|
-
- !ruby/object:Profile
|
22082
|
+
- !ruby/object:LanguageDetector::Profile
|
22083
22083
|
name: en
|
22084
22084
|
ngrams:
|
22085
22085
|
my_: 833
|
@@ -24129,7 +24129,7 @@
|
|
24129
24129
|
43: 1
|
24130
24130
|
32: 1
|
24131
24131
|
10: 1
|
24132
|
-
- !ruby/object:Profile
|
24132
|
+
- !ruby/object:LanguageDetector::Profile
|
24133
24133
|
name: et
|
24134
24134
|
ngrams:
|
24135
24135
|
_saav: 1637
|
@@ -26215,7 +26215,7 @@
|
|
26215
26215
|
43: 1
|
26216
26216
|
32: 1
|
26217
26217
|
10: 1
|
26218
|
-
- !ruby/object:Profile
|
26218
|
+
- !ruby/object:LanguageDetector::Profile
|
26219
26219
|
name: es
|
26220
26220
|
ngrams:
|
26221
26221
|
nco: 1791
|
@@ -28277,7 +28277,7 @@
|
|
28277
28277
|
43: 1
|
28278
28278
|
32: 1
|
28279
28279
|
10: 1
|
28280
|
-
- !ruby/object:Profile
|
28280
|
+
- !ruby/object:LanguageDetector::Profile
|
28281
28281
|
name: fa
|
28282
28282
|
ngrams:
|
28283
28283
|
? !binary "2YPYp9g=\n"
|
@@ -33419,7 +33419,7 @@
|
|
33419
33419
|
43: 1
|
33420
33420
|
32: 1
|
33421
33421
|
10: 1
|
33422
|
-
- !ruby/object:Profile
|
33422
|
+
- !ruby/object:LanguageDetector::Profile
|
33423
33423
|
name: fi
|
33424
33424
|
ngrams:
|
33425
33425
|
valla: 637
|
@@ -35483,7 +35483,7 @@
|
|
35483
35483
|
43: 1
|
35484
35484
|
32: 1
|
35485
35485
|
10: 1
|
35486
|
-
- !ruby/object:Profile
|
35486
|
+
- !ruby/object:LanguageDetector::Profile
|
35487
35487
|
name: fr
|
35488
35488
|
ngrams:
|
35489
35489
|
hine: 1617
|
@@ -37549,7 +37549,7 @@
|
|
37549
37549
|
43: 1
|
37550
37550
|
32: 1
|
37551
37551
|
10: 1
|
37552
|
-
- !ruby/object:Profile
|
37552
|
+
- !ruby/object:LanguageDetector::Profile
|
37553
37553
|
name: fy
|
37554
37554
|
ngrams:
|
37555
37555
|
nfoel: 1606
|
@@ -39607,7 +39607,7 @@
|
|
39607
39607
|
43: 1
|
39608
39608
|
32: 1
|
39609
39609
|
10: 1
|
39610
|
-
- !ruby/object:Profile
|
39610
|
+
- !ruby/object:LanguageDetector::Profile
|
39611
39611
|
name: ga
|
39612
39612
|
ngrams:
|
39613
39613
|
ud: 1880
|
@@ -41671,7 +41671,7 @@
|
|
41671
41671
|
43: 1
|
41672
41672
|
32: 1
|
41673
41673
|
10: 1
|
41674
|
-
- !ruby/object:Profile
|
41674
|
+
- !ruby/object:LanguageDetector::Profile
|
41675
41675
|
name: he
|
41676
41676
|
ngrams:
|
41677
41677
|
? !binary "ldep15k=\n"
|
@@ -46859,7 +46859,7 @@
|
|
46859
46859
|
43: 1
|
46860
46860
|
32: 1
|
46861
46861
|
10: 1
|
46862
|
-
- !ruby/object:Profile
|
46862
|
+
- !ruby/object:LanguageDetector::Profile
|
46863
46863
|
name: hi
|
46864
46864
|
ngrams:
|
46865
46865
|
? !binary "ruCkuA==\n"
|
@@ -51907,7 +51907,7 @@
|
|
51907
51907
|
43: 1
|
51908
51908
|
32: 1
|
51909
51909
|
10: 1
|
51910
|
-
- !ruby/object:Profile
|
51910
|
+
- !ruby/object:LanguageDetector::Profile
|
51911
51911
|
name: hr
|
51912
51912
|
ngrams:
|
51913
51913
|
vine: 1902
|
@@ -53967,7 +53967,7 @@
|
|
53967
53967
|
43: 1
|
53968
53968
|
32: 1
|
53969
53969
|
10: 1
|
53970
|
-
- !ruby/object:Profile
|
53970
|
+
- !ruby/object:LanguageDetector::Profile
|
53971
53971
|
name: io
|
53972
53972
|
ngrams:
|
53973
53973
|
_Kere: 1912
|
@@ -56027,7 +56027,7 @@
|
|
56027
56027
|
43: 1
|
56028
56028
|
32: 1
|
56029
56029
|
10: 1
|
56030
|
-
- !ruby/object:Profile
|
56030
|
+
- !ruby/object:LanguageDetector::Profile
|
56031
56031
|
name: is
|
56032
56032
|
ngrams:
|
56033
56033
|
kaga_: 1774
|
@@ -58175,7 +58175,7 @@
|
|
58175
58175
|
43: 1
|
58176
58176
|
32: 1
|
58177
58177
|
10: 1
|
58178
|
-
- !ruby/object:Profile
|
58178
|
+
- !ruby/object:LanguageDetector::Profile
|
58179
58179
|
name: it
|
58180
58180
|
ngrams:
|
58181
58181
|
Ope: 1795
|
@@ -60235,7 +60235,7 @@
|
|
60235
60235
|
43: 1
|
60236
60236
|
32: 1
|
60237
60237
|
10: 1
|
60238
|
-
- !ruby/object:Profile
|
60238
|
+
- !ruby/object:LanguageDetector::Profile
|
60239
60239
|
name: ja
|
60240
60240
|
ngrams:
|
60241
60241
|
? !binary "g6vj\n"
|
@@ -65965,7 +65965,7 @@
|
|
65965
65965
|
43: 1
|
65966
65966
|
32: 1
|
65967
65967
|
10: 1
|
65968
|
-
- !ruby/object:Profile
|
65968
|
+
- !ruby/object:LanguageDetector::Profile
|
65969
65969
|
name: ko
|
65970
65970
|
ngrams:
|
65971
65971
|
? !binary "mIDqsw==\n"
|
@@ -71445,7 +71445,7 @@
|
|
71445
71445
|
43: 1
|
71446
71446
|
32: 1
|
71447
71447
|
10: 1
|
71448
|
-
- !ruby/object:Profile
|
71448
|
+
- !ruby/object:LanguageDetector::Profile
|
71449
71449
|
name: hu
|
71450
71450
|
ngrams:
|
71451
71451
|
"ens\xC3\xA9": 1414
|
@@ -73515,7 +73515,7 @@
|
|
73515
73515
|
43: 1
|
73516
73516
|
32: 1
|
73517
73517
|
10: 1
|
73518
|
-
- !ruby/object:Profile
|
73518
|
+
- !ruby/object:LanguageDetector::Profile
|
73519
73519
|
name: nl
|
73520
73520
|
ngrams:
|
73521
73521
|
He: 619
|
@@ -75565,7 +75565,7 @@
|
|
75565
75565
|
43: 1
|
75566
75566
|
32: 1
|
75567
75567
|
10: 1
|
75568
|
-
- !ruby/object:Profile
|
75568
|
+
- !ruby/object:LanguageDetector::Profile
|
75569
75569
|
name: "no"
|
75570
75570
|
ngrams:
|
75571
75571
|
Ope: 1032
|
@@ -77625,7 +77625,7 @@
|
|
77625
77625
|
43: 1
|
77626
77626
|
32: 1
|
77627
77627
|
10: 1
|
77628
|
-
- !ruby/object:Profile
|
77628
|
+
- !ruby/object:LanguageDetector::Profile
|
77629
77629
|
name: pl
|
77630
77630
|
ngrams:
|
77631
77631
|
"k\xC3\xB3w": 1379
|
@@ -79717,7 +79717,7 @@
|
|
79717
79717
|
43: 1
|
79718
79718
|
32: 1
|
79719
79719
|
10: 1
|
79720
|
-
- !ruby/object:Profile
|
79720
|
+
- !ruby/object:LanguageDetector::Profile
|
79721
79721
|
name: pt
|
79722
79722
|
ngrams:
|
79723
79723
|
nco: 1274
|
@@ -81815,7 +81815,7 @@
|
|
81815
81815
|
43: 1
|
81816
81816
|
32: 1
|
81817
81817
|
10: 1
|
81818
|
-
- !ruby/object:Profile
|
81818
|
+
- !ruby/object:LanguageDetector::Profile
|
81819
81819
|
name: ro
|
81820
81820
|
ngrams:
|
81821
81821
|
_ur: 1996
|
@@ -83893,7 +83893,7 @@
|
|
83893
83893
|
43: 1
|
83894
83894
|
32: 1
|
83895
83895
|
10: 1
|
83896
|
-
- !ruby/object:Profile
|
83896
|
+
- !ruby/object:LanguageDetector::Profile
|
83897
83897
|
name: ru
|
83898
83898
|
ngrams:
|
83899
83899
|
? !binary "v9C/\n"
|
@@ -89077,7 +89077,7 @@
|
|
89077
89077
|
43: 1
|
89078
89078
|
32: 1
|
89079
89079
|
10: 1
|
89080
|
-
- !ruby/object:Profile
|
89080
|
+
- !ruby/object:LanguageDetector::Profile
|
89081
89081
|
name: sl
|
89082
89082
|
ngrams:
|
89083
89083
|
preds: 594
|
@@ -91147,7 +91147,7 @@
|
|
91147
91147
|
43: 1
|
91148
91148
|
32: 1
|
91149
91149
|
10: 1
|
91150
|
-
- !ruby/object:Profile
|
91150
|
+
- !ruby/object:LanguageDetector::Profile
|
91151
91151
|
name: sv
|
91152
91152
|
ngrams:
|
91153
91153
|
karna: 1187
|
@@ -93197,7 +93197,7 @@
|
|
93197
93197
|
43: 1
|
93198
93198
|
32: 1
|
93199
93199
|
10: 1
|
93200
|
-
- !ruby/object:Profile
|
93200
|
+
- !ruby/object:LanguageDetector::Profile
|
93201
93201
|
name: th
|
93202
93202
|
ngrams:
|
93203
93203
|
? !binary "uYPguKs=\n"
|
@@ -98785,7 +98785,7 @@
|
|
98785
98785
|
43: 1
|
98786
98786
|
32: 1
|
98787
98787
|
10: 1
|
98788
|
-
- !ruby/object:Profile
|
98788
|
+
- !ruby/object:LanguageDetector::Profile
|
98789
98789
|
name: uk
|
98790
98790
|
ngrams:
|
98791
98791
|
"\xBA\xD0\xBE__": 1806
|
@@ -103973,7 +103973,7 @@
|
|
103973
103973
|
43: 1
|
103974
103974
|
32: 1
|
103975
103975
|
10: 1
|
103976
|
-
- !ruby/object:Profile
|
103976
|
+
- !ruby/object:LanguageDetector::Profile
|
103977
103977
|
name: vi
|
103978
103978
|
ngrams:
|
103979
103979
|
? !binary "xJHhu5E=\n"
|
@@ -106335,7 +106335,7 @@
|
|
106335
106335
|
43: 1
|
106336
106336
|
32: 1
|
106337
106337
|
10: 1
|
106338
|
-
- !ruby/object:Profile
|
106338
|
+
- !ruby/object:LanguageDetector::Profile
|
106339
106339
|
name: zh
|
106340
106340
|
ngrams:
|
106341
106341
|
? !binary "6L+Z\n"
|
@@ -4,7 +4,7 @@ require File.dirname(__FILE__) + '/../lib/language_detector'
|
|
4
4
|
|
5
5
|
class ProfileTest < Test::Unit::TestCase
|
6
6
|
def test_is_puctuation
|
7
|
-
p = Profile.new("test")
|
7
|
+
p = LanguageDetector::Profile.new("test")
|
8
8
|
assert p.is_puctuation?(?,)
|
9
9
|
assert p.is_puctuation?(?.)
|
10
10
|
assert !p.is_puctuation?(?A)
|
@@ -12,12 +12,12 @@ class ProfileTest < Test::Unit::TestCase
|
|
12
12
|
end
|
13
13
|
|
14
14
|
def test_tokenize
|
15
|
-
p = Profile.new("test")
|
15
|
+
p = LanguageDetector::Profile.new("test")
|
16
16
|
assert_equal ["this", "is", "A", "test"], p.tokenize("this is ,+_ A \t 123 test")
|
17
17
|
end
|
18
18
|
|
19
19
|
def test_count_ngram
|
20
|
-
p = Profile.new("test")
|
20
|
+
p = LanguageDetector::Profile.new("test")
|
21
21
|
assert_equal({"w"=>1, "o"=>1, "r"=>1, "d"=>1, "s"=>1}, p.count_ngram('words', 1, {}))
|
22
22
|
assert_equal({"wo"=>1, "or"=>1, "rd"=>1, "ds"=>1, "_w" => 1, "s_" => 1}, p.count_ngram('words', 2, {}))
|
23
23
|
assert_equal({"wor"=>1, "ord"=>1, "rds"=>1, "_wo" => 1, "ds_" => 1, "s__" => 1}, p.count_ngram('words', 3, {}))
|
@@ -27,25 +27,29 @@ class ProfileTest < Test::Unit::TestCase
|
|
27
27
|
end
|
28
28
|
|
29
29
|
def test_init_with_string
|
30
|
-
p = Profile.new("test")
|
30
|
+
p = LanguageDetector::Profile.new("test")
|
31
31
|
p.init_with_string("this is ,+_ A \t 123 test")
|
32
|
-
assert_equal(
|
32
|
+
assert_equal(
|
33
|
+
[["t_", 30], ["st__", 29], ["st", 16], ["hi", 8], ["_tes", 7], ["is__", 6], ["s___", 5], ["s_", 3], ["his_", 11], ["tes", 10], ["t___", 9], ["es", 12], ["_te", 14], ["est_", 13], ["est", 15], ["te", 4], ["his", 17], ["_th", 20], ["s__", 19], ["st_", 18], ["th", 24], ["_thi", 23], ["t__", 22], ["test", 21], ["thi", 28], ["is_", 27], ["this", 26], ["_i", 25], ["is", 2], ["_t", 1]],
|
34
|
+
p.ngrams.sort_by { |a,b| a[1] <=> b[1] },
|
35
|
+
"This test does not pass in the original repository either: http://github.com/feedbackmine/language_detector"
|
36
|
+
)
|
33
37
|
end
|
34
38
|
|
35
39
|
def test_init_with_file
|
36
|
-
p = Profile.new("test")
|
40
|
+
p = LanguageDetector::Profile.new("test")
|
37
41
|
p.init_with_file("bg-utf8.txt")
|
38
42
|
assert !p.ngrams.empty?
|
39
43
|
end
|
40
44
|
|
41
45
|
def test_compute_distance
|
42
|
-
p1 = Profile.new("test")
|
46
|
+
p1 = LanguageDetector::Profile.new("test")
|
43
47
|
p1.init_with_string("this is ,+_ A \t 123 test")
|
44
|
-
p2 = Profile.new("test")
|
48
|
+
p2 = LanguageDetector::Profile.new("test")
|
45
49
|
p2.init_with_string("this is ,+_ A \t 123 test")
|
46
50
|
assert_equal 0, p1.compute_distance(p2)
|
47
51
|
|
48
|
-
p3 = Profile.new("test")
|
52
|
+
p3 = LanguageDetector::Profile.new("test")
|
49
53
|
p3.init_with_string("xxxx")
|
50
54
|
assert_equal 24000, p1.compute_distance(p3)
|
51
55
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: feedbackmine-language_detector
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- feedbackmine
|
@@ -29,6 +29,7 @@ files:
|
|
29
29
|
- test/language_detector_test.rb
|
30
30
|
has_rdoc: false
|
31
31
|
homepage: http://www.tweetjobsearch.com
|
32
|
+
licenses:
|
32
33
|
post_install_message:
|
33
34
|
rdoc_options: []
|
34
35
|
|
@@ -49,7 +50,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
49
50
|
requirements: []
|
50
51
|
|
51
52
|
rubyforge_project:
|
52
|
-
rubygems_version: 1.
|
53
|
+
rubygems_version: 1.3.5
|
53
54
|
signing_key:
|
54
55
|
specification_version: 2
|
55
56
|
summary: n-gram based language detector, written in ruby
|