feedbackmine-language_detector 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Manifest.txt +5 -0
- data/README +18 -0
- data/lib/language_detector.rb +234 -0
- data/lib/model.yml +119191 -0
- data/test/language_detector_test.rb +85 -0
- metadata +57 -0
data/Manifest.txt
ADDED
data/README
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
feedbackmine
|
2
|
+
http://twitter.com/feedbackmine
|
3
|
+
|
4
|
+
== Welcome
|
5
|
+
This is an n-gram based language detector, written in Ruby
|
6
|
+
|
7
|
+
== Installation
|
8
|
+
sudo gem sources -a http://gems.github.com (you only have to do this once)
|
9
|
+
sudo gem install feedbackmine-language_detector
|
10
|
+
|
11
|
+
== Usage
|
12
|
+
require 'language_detector'
|
13
|
+
d = LanguageDetector.new
|
14
|
+
p d.detect('this text is in English')
|
15
|
+
|
16
|
+
== Thanks
|
17
|
+
Kevin Burton (http://feedblog.org/2005/08/19/ngram-language-categorization-source/). I used all the training data in his ngramcat project.
|
18
|
+
|
@@ -0,0 +1,234 @@
|
|
1
|
+
# N-gram based language detector. Profiles are trained from sample texts
# (see LanguageDetector.train) and serialized to model.yml; detection picks
# the trained profile with the smallest out-of-place n-gram distance.
class LanguageDetector
  # Serialized model shipped next to this file (was duplicated inline in
  # train and load_model).
  MODEL_FILE = File.expand_path(File.join(File.dirname(__FILE__), "model.yml"))

  # Returns the name of the trained profile closest to +text+ (the language
  # tag the profile was trained with, e.g. "en"), or nil if no profiles are
  # loaded. The model is loaded lazily on first call and memoized.
  def detect text
    @profiles ||= load_model

    sample = Profile.new("")
    sample.init_with_string text

    best_profile = nil
    best_distance = nil
    @profiles.each do |profile|
      distance = profile.compute_distance(sample)
      if best_distance.nil? || distance < best_distance
        best_distance = distance
        best_profile = profile
      end
    end
    # Guard: original raised NoMethodError on an empty model.
    best_profile && best_profile.name
  end

  # Rebuilds model.yml from the bundled training-data files.
  #
  # For a full list of ISO 639 language tags visit:
  # http://www.loc.gov/standards/iso639-2/englangn.html
  #
  # LARGE profiles follow.
  # NOTE: These profiles taken from the "World War II" node on wikipedia
  # with the 'lang' and ?action=raw URI which results in a UTF8 encoded
  # file. If we need to get more profile data for a language this is
  # always a good source of data.
  #
  # http://en.wikipedia.org/wiki/World_War_II
  def self.train
    # Each entry: [ language tag, training file, encoding, english name ].
    training_data = [
      # af (afrikaans)
      [ "ar", "ar-utf8.txt", "utf8", "arabic" ],
      [ "bg", "bg-utf8.txt", "utf8", "bulgarian" ],
      # bs (bosnian)
      # ca (catalan)
      [ "cs", "cs-utf8.txt", "utf8", "czech" ],
      # cy (welsh)
      [ "da", "da-iso-8859-1.txt", "iso-8859-1", "danish" ],
      [ "de", "de-utf8.txt", "utf8", "german" ],
      [ "el", "el-utf8.txt", "utf8", "greek" ],
      [ "en", "en-iso-8859-1.txt", "iso-8859-1", "english" ],
      [ "et", "et-utf8.txt", "utf8", "estonian" ],
      [ "es", "es-utf8.txt", "utf8", "spanish" ],
      [ "fa", "fa-utf8.txt", "utf8", "farsi" ],
      [ "fi", "fi-utf8.txt", "utf8", "finnish" ],
      [ "fr", "fr-utf8.txt", "utf8", "french" ],
      [ "fy", "fy-utf8.txt", "utf8", "frisian" ],
      [ "ga", "ga-utf8.txt", "utf8", "irish" ],
      # gd (gaelic)
      # haw (hawaiian)
      [ "he", "he-utf8.txt", "utf8", "hebrew" ],
      [ "hi", "hi-utf8.txt", "utf8", "hindi" ],
      [ "hr", "hr-utf8.txt", "utf8", "croatian" ],
      # id (indonesian)
      [ "io", "io-utf8.txt", "utf8", "ido" ],
      [ "is", "is-utf8.txt", "utf8", "icelandic" ],
      [ "it", "it-utf8.txt", "utf8", "italian" ],
      [ "ja", "ja-utf8.txt", "utf8", "japanese" ],
      [ "ko", "ko-utf8.txt", "utf8", "korean" ],
      # ku (kurdish)
      # la ?
      # lb ?
      # lt (lithuanian)
      # lv (latvian)
      [ "hu", "hu-utf8.txt", "utf8", "hungarian" ],
      # mk (macedonian)
      # ms (malay)
      # my (burmese)
      [ "nl", "nl-iso-8859-1.txt", "iso-8859-1", "dutch" ],
      [ "no", "no-utf8.txt", "utf8", "norwegian" ],
      [ "pl", "pl-utf8.txt", "utf8", "polish" ],
      [ "pt", "pt-utf8.txt", "utf8", "portuguese" ],
      [ "ro", "ro-utf8.txt", "utf8", "romanian" ],
      [ "ru", "ru-utf8.txt", "utf8", "russian" ],
      [ "sl", "sl-utf8.txt", "utf8", "slovenian" ],
      # sr (serbian)
      [ "sv", "sv-iso-8859-1.txt", "iso-8859-1", "swedish" ],
      #[ "sv", "sv-utf8.txt", "utf8", "swedish" ],
      [ "th", "th-utf8.txt", "utf8", "thai" ],
      # tl (tagalog)
      # ty (tahitian)
      # BUG FIX: language name was misspelled "ukraninan".
      [ "uk", "uk-utf8.txt", "utf8", "ukrainian" ],
      [ "vi", "vi-utf8.txt", "utf8", "vietnamese" ],
      # wa (walloon)
      # yi (yiddish)
      [ "zh", "zh-utf8.txt", "utf8", "chinese" ]
    ]

    profiles = []
    training_data.each do |tag, file, _encoding, _english_name|
      profile = Profile.new(tag)
      profile.init_with_file file
      profiles << profile
    end

    puts 'saving model...'
    require 'yaml' # BUG FIX: YAML was used without ever being required
    File.open(MODEL_FILE, 'w') do |f|
      YAML.dump(profiles, f)
    end
  end

  # Loads and returns the array of trained Profile objects from MODEL_FILE,
  # memoizing it in @profiles.
  def load_model
    require 'yaml' # BUG FIX: YAML was used without ever being required
    @profiles = YAML.load_file(MODEL_FILE)
  end
end
|
112
|
+
|
113
|
+
# An n-gram frequency profile for one language: maps each of the most
# frequent 2- to 5-character n-grams to its frequency rank (1 = most common).
class Profile

  # Characters that end a token: whitespace, digits, and ASCII punctuation.
  # NOTE: on Ruby 1.9+ these ?x literals are one-character Strings.
  PUNCTUATIONS = [?\n, ?\r, ?\t, ?\s, ?!, ?", ?#, ?$, ?%, ?&, ?', ?(, ?), ?*, ?+, ?,, ?-, ?., ?/,
                  ?0, ?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9,
                  ?:, ?;, ?<, ?=, ?>, ??, ?@, ?[, ?\\, ?], ?^, ?_, ?`, ?{, ?|, ?}, ?~].freeze

  # Maximum number of ranked n-grams kept per profile; also the penalty used
  # in compute_distance for an n-gram absent from this profile.
  LIMIT = 2000

  attr_reader :ngrams, :name

  # name: the language tag this profile represents (e.g. "en").
  def initialize(name)
    @name = name
    # Hash for O(1) separator lookups (renamed from misspelled @puctuations).
    @punctuations = {}
    PUNCTUATIONS.each { |ch| @punctuations[ch] = 1 }
    # n-gram String => frequency rank (1-based).
    @ngrams = {}
  end

  # "Out-of-place" distance: for every n-gram of +other_profile+, the absolute
  # difference of its ranks in the two profiles, or LIMIT when this profile
  # does not contain it. Smaller means more similar.
  def compute_distance other_profile
    distance = 0
    other_profile.ngrams.each do |ngram, rank|
      own_rank = @ngrams[ngram]
      distance += own_rank ? (rank - own_rank).abs : LIMIT
    end
    distance
  end

  # Trains this profile from a file under ./training_data (relative to this
  # source file). Uses the block form of File.open so the handle is closed
  # (original leaked it via File.open(path).each_line).
  def init_with_file filename
    ngram_count = {}
    path = File.expand_path(File.join(File.dirname(__FILE__), "training_data/" + filename))
    puts "training with " + path
    File.open(path) do |f|
      f.each_line { |line| _init_with_string line, ngram_count }
    end
    rank_ngrams ngram_count
  end

  # Trains this profile from a single string.
  def init_with_string str
    ngram_count = {}
    _init_with_string str, ngram_count
    rank_ngrams ngram_count
  end

  # Accumulates counts of all 2- to 5-grams of every token of +str+ into
  # +ngram_count+.
  def _init_with_string str, ngram_count
    tokenize(str).each do |token|
      (2..5).each { |n| count_ngram token, n, ngram_count }
    end
  end

  # Splits +str+ into tokens at punctuation/digit/whitespace characters.
  # BUG FIX: the original iterated each_byte (Integers) while the separator
  # lookup is keyed by one-character Strings, so on Ruby 1.9+ separators were
  # never detected; iterate characters instead.
  def tokenize str
    tokens = []
    current = ''
    str.each_char do |ch|
      if is_puctuation?(ch)
        tokens << current unless current.empty?
        current = ''
      else
        current << ch
      end
    end
    tokens << current unless current.empty?
    tokens
  end

  # True-ish when +ch+ is a separator character. (Misspelled name kept for
  # backward compatibility with any external caller.)
  def is_puctuation? ch
    @punctuations[ch]
  end

  # Adds every +n+-character substring of +token+ to +counts+ and returns
  # +counts+.
  def count_ngram token, n, counts
    i = 0
    while i + n <= token.length
      gram = token[i, n]
      counts[gram] = (counts[gram] || 0) + 1
      i += 1
    end
    counts
  end

  private

  # Stores the LIMIT most frequent n-grams of +ngram_count+ into @ngrams,
  # mapping each to its 1-based rank. (Was duplicated in init_with_file and
  # init_with_string.)
  def rank_ngrams(ngram_count)
    sorted = ngram_count.sort { |a, b| b[1] <=> a[1] }
    sorted.first(LIMIT).each_with_index do |(gram, _count), index|
      @ngrams[gram] = index + 1
    end
  end

end
|
225
|
+
|
226
|
+
# Command-line entry point: `ruby language_detector.rb train` rebuilds the
# serialized model; any other invocation runs a quick detection demo.
if $0 == __FILE__
  if ARGV == ['train']
    LanguageDetector.train
  else
    detector = LanguageDetector.new
    p detector.detect("what language is it is?")
  end
end
|
234
|
+
|