feedbackmine-language_detector 0.1.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README +7 -4
- data/lib/language_detector.rb +37 -32
- data/lib/model.yml +73838 -80934
- data/test/language_detector_test.rb +18 -27
- metadata +2 -2
data/README
CHANGED
@@ -1,8 +1,8 @@
-feedbackmine
-http://twitter.com/feedbackmine

 == Welcome
-This is a n-gram based language detector, written in ruby
+This is a n-gram based language detector, written in ruby.
+
+For technical details about language detection, I recommend this paper: http://www.sfs.uni-tuebingen.de/iscl/Theses/kranig.pdf

 == Installation
 sudo gem sources -a http://gems.github.com (you only have to do this once)
@@ -14,5 +14,8 @@ d = LanguageDetector.new
 p d.detect('this text is in English')

 == Thanks
-Kevin Burton (http://feedblog.org/2005/08/19/ngram-language-categorization-source/). I used all the training data in his ngramcat project.
+Kevin Burton (http://feedblog.org/2005/08/19/ngram-language-categorization-source/ ). I used all the training data in his ngramcat project.

+== Follow us on twitter
+
+http://twitter.com/feedbackmine
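For reference, the README's own usage boils down to the sketch below. The require name 'language_detector' is an assumption (the README only gives the gem name feedbackmine-language_detector); the detector lines are taken from the README hunk above.

  require 'rubygems'
  require 'language_detector'   # assumed require name for the feedbackmine-language_detector gem

  d = LanguageDetector.new
  p d.detect('this text is in English')   # expected to print a language code such as "en"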
data/lib/language_detector.rb
CHANGED
@@ -1,3 +1,7 @@
+require 'yaml'
+require 'jcode'
+$KCODE = 'u' if RUBY_VERSION < '1.9'
+
 class LanguageDetector
   def detect text
     @profiles ||= load_model
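These new requires are the main functional change in this file: 'yaml' backs YAML.dump/YAML.load_file below, and 'jcode' plus $KCODE = 'u' make Ruby 1.8 treat strings as UTF-8 so that jlength (used in count_ngram further down) counts characters rather than bytes. A small Ruby 1.8-only illustration of the difference; on 1.9+ jcode does not exist and String#length is already character-aware, which is why the $KCODE line is guarded:

  # Ruby 1.8 only: without jcode/$KCODE, multibyte text is counted in bytes.
  $KCODE = 'u'
  require 'jcode'

  s = "voilà"      # 5 characters, 6 bytes in UTF-8
  puts s.length    # => 6  (bytes, on Ruby 1.8)
  puts s.jlength   # => 5  (characters, provided by jcode)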
@@ -8,7 +12,7 @@ class LanguageDetector
     best_distance = nil
     @profiles.each {|profile|
       distance = profile.compute_distance(p)
-
+
       if !best_distance || distance < best_distance
         best_distance = distance
         best_profile = profile
@@ -16,11 +20,11 @@ class LanguageDetector
     }
     return best_profile.name
   end
-
+
   def self.train
-
+
     # For a full list of ISO 639 language tags visit:
-
+
     # http:#www.loc.gov/standards/iso639-2/englangn.html

     #LARGE profiles follow:
@@ -31,7 +35,7 @@ class LanguageDetector
     #always a good source of data.
     #
     # http:#en.wikipedia.org/wiki/World_War_II
-
+
     training_data = [
       # af (afrikaans)
       [ "ar", "ar-utf8.txt", "utf8", "arabic" ],
@@ -92,7 +96,7 @@ class LanguageDetector
     ]

     profiles = []
-    training_data.each {|data|
+    training_data.each {|data|
       p = Profile.new data[0]
       p.init_with_file data[1]
       profiles << p
@@ -103,7 +107,7 @@ class LanguageDetector
       YAML.dump(profiles, f)
     }
   end
-
+
   def load_model
     filename = File.expand_path(File.join(File.dirname(__FILE__), "model.yml"))
     @profiles = YAML.load_file(filename)
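The two hunks above show the model round trip: self.train builds one Profile per training file and serializes the array with YAML.dump, and load_model reads it back with YAML.load_file from model.yml next to the library file. A hedged sketch of that workflow, assuming it is run from a checkout of the gem source so the training_data files and the lib/model.yml path resolve; the require name is an assumption as before:

  require 'language_detector'   # assumed require name

  LanguageDetector.train                        # reads the training files, rebuilds the profiles, dumps YAML
  profiles = YAML.load_file('lib/model.yml')    # path assumed relative to the gem checkout
  puts profiles.map {|profile| profile.name }.sort.inspect   # language codes such as "ar", ...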
@@ -112,12 +116,12 @@ end

 class Profile

-  PUNCTUATIONS = [?\n, ?\r, ?\t, ?\s, ?!, ?", ?#, ?$, ?%, ?&, ?', ?(, ?), ?*, ?+, ?,, ?-, ?., ?/,
-    ?0, ?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9,
+  PUNCTUATIONS = [?\n, ?\r, ?\t, ?\s, ?!, ?", ?#, ?$, ?%, ?&, ?', ?(, ?), ?*, ?+, ?,, ?-, ?., ?/,
+    ?0, ?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9,
     ?:, ?;, ?<, ?=, ?>, ??, ?@, ?[, ?\\, ?], ?^, ?_, ?`, ?{, ?|, ?}, ?~]
-
+
   LIMIT = 2000
-
+
   def compute_distance other_profile
     distance = 0
     other_profile.ngrams.each {|k, v|
@@ -130,48 +134,48 @@ class Profile
     }
     return distance
   end
-
+
   attr_reader :ngrams, :name
-
+
   def initialize(name)
     @name = name
     @puctuations = {}
     PUNCTUATIONS.each {|p| @puctuations[p] = 1}
     @ngrams = {}
   end
-
+
   def init_with_file filename
     ngram_count = {}
-
+
     path = File.expand_path(File.join(File.dirname(__FILE__), "training_data/" + filename))
     puts "training with " + path
-    File.open(path).each_line{ |line|
-      _init_with_string line, ngram_count
+    File.open(path).each_line{ |line|
+      _init_with_string line, ngram_count
     }
-
+
     a = ngram_count.sort {|a,b| b[1] <=> a[1]}
     i = 1
-    a.each {|t|
+    a.each {|t|
       @ngrams[t[0]] = i
       i += 1
       break if i > LIMIT
     }
   end
-
+
   def init_with_string str
     ngram_count = {}
-
+
     _init_with_string str, ngram_count
-
+
     a = ngram_count.sort {|a,b| b[1] <=> a[1]}
     i = 1
-    a.each {|t|
+    a.each {|t|
       @ngrams[t[0]] = i
       i += 1
       break if i > LIMIT
     }
   end
-
+
   def _init_with_string str, ngram_count
     tokens = tokenize(str)
     tokens.each {|token|
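The init_with_* methods shown above build a rank profile: n-gram counts are sorted by frequency (b[1] <=> a[1]) and the top LIMIT (2000) n-grams are stored in @ngrams with ranks 1, 2, 3, and so on. The body of compute_distance is not part of the changed lines in this diff, so the following is only a sketch of the standard "out-of-place" measure described in the n-gram categorization literature the README links to, not necessarily this gem's exact formula:

  # Sketch: rank-based profile comparison (out-of-place measure).
  # For each n-gram in the text profile, add the rank difference against the
  # reference profile, or a fixed penalty when the n-gram is unseen there.
  def out_of_place_distance(text_ngrams, reference_ngrams, penalty = 2000)
    distance = 0
    text_ngrams.each do |ngram, rank|
      ref_rank = reference_ngrams[ngram]
      distance += ref_rank ? (ref_rank - rank).abs : penalty
    end
    distance
  end

  # text_ngrams / reference_ngrams are hashes shaped like Profile#ngrams,
  # e.g. {"the" => 1, "he_" => 2, ...}; the smallest total distance wins in detect.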
@@ -179,13 +183,13 @@ class Profile
       count_ngram token, 3, ngram_count
       count_ngram token, 4, ngram_count
       count_ngram token, 5, ngram_count
-    }
+    }
   end
-
+
   def tokenize str
     tokens = []
     s = ''
-    str.each_byte {|b|
+    str.each_byte {|b|
       if is_puctuation?(b)
         tokens << s unless s.empty?
         s = ''
@@ -196,12 +200,13 @@ class Profile
     tokens << s unless s.empty?
     return tokens
   end
-
+
   def is_puctuation? b
     @puctuations[b]
   end
-
+
   def count_ngram token, n, counts
+    token = "_#{token}#{'_' * (n-1)}" if n > 1 && token.jlength >= n
     i = 0
     while i + n <= token.length
       s = ''
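The one functional addition in this hunk is the underscore padding: for n > 1, a token that is at least n characters long (measured with jlength from the new jcode require) is wrapped with a leading "_" and n-1 trailing underscores before the sliding window runs, so word-boundary n-grams such as "_wa" and "r__" get counted as well. A small illustration of the padded windows; the helper name is made up for the example:

  # Illustration only: the n-grams produced for one padded token.
  def padded_ngrams(token, n)
    padded = "_#{token}#{'_' * (n - 1)}"
    (0..padded.length - n).map {|i| padded[i, n] }
  end

  p padded_ngrams("war", 3)   # => ["_wa", "war", "ar_", "r__"]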
@@ -210,14 +215,14 @@ class Profile
         s << token[i+j]
         j += 1
       end
-      if counts[s]
+      if counts[s]
         counts[s] = counts[s] + 1
-      else
+      else
         counts[s] = 1
       end
       i += 1
     end
-
+
     return counts
   end

|