feedbackmine-language_detector 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/Manifest.txt ADDED
@@ -0,0 +1,5 @@
+ Manifest.txt
+ README
+ lib/language_detector.rb
+ lib/model.yml
+ test/language_detector_test.rb
data/README ADDED
@@ -0,0 +1,18 @@
+ feedbackmine
+ http://twitter.com/feedbackmine
+
+ == Welcome
+ This is an n-gram based language detector, written in Ruby.
+
+ == Installation
+ sudo gem sources -a http://gems.github.com (you only have to do this once)
+ sudo gem install feedbackmine-language_detector
+
+ == Usage
+ require 'language_detector'
+ d = LanguageDetector.new
+ p d.detect('this text is in English')
+
+ == Thanks
+ Kevin Burton (http://feedblog.org/2005/08/19/ngram-language-categorization-source/). I used all the training data in his ngramcat project.
+
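Note that detect returns the two-letter code of the best-matching profile (the first column of the training table in lib/language_detector.rb below), so the Usage snippet should print "en". An illustrative irb session, assuming the gem and its bundled model.yml are installed:

  >> require 'language_detector'
  >> d = LanguageDetector.new
  >> d.detect('this text is in English')
  => "en"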
data/lib/language_detector.rb ADDED
@@ -0,0 +1,234 @@
+ require 'yaml'
+
+ class LanguageDetector
+   def detect text
+     @profiles ||= load_model
+
+     p = Profile.new("")
+     p.init_with_string text
+     best_profile = nil
+     best_distance = nil
+     @profiles.each {|profile|
+       distance = profile.compute_distance(p)
+
+       if !best_distance || distance < best_distance
+         best_distance = distance
+         best_profile = profile
+       end
+     }
+     return best_profile.name
+   end
+
+   def self.train
+     # For a full list of ISO 639 language tags visit:
+     # http://www.loc.gov/standards/iso639-2/englangn.html
+
+     # LARGE profiles follow.
+
+     # NOTE: These profiles were taken from the "World War II" node on Wikipedia
+     # with the 'lang' prefix and the ?action=raw URI, which yields a UTF-8
+     # encoded file. If we need more profile data for a language, this is
+     # always a good source of data:
+     #
+     # http://en.wikipedia.org/wiki/World_War_II
+
+     training_data = [
+       # af (afrikaans)
+       [ "ar", "ar-utf8.txt", "utf8", "arabic" ],
+       [ "bg", "bg-utf8.txt", "utf8", "bulgarian" ],
+       # bs (bosnian)
+       # ca (catalan)
+       [ "cs", "cs-utf8.txt", "utf8", "czech" ],
+       # cy (welsh)
+       [ "da", "da-iso-8859-1.txt", "iso-8859-1", "danish" ],
+       [ "de", "de-utf8.txt", "utf8", "german" ],
+       [ "el", "el-utf8.txt", "utf8", "greek" ],
+       [ "en", "en-iso-8859-1.txt", "iso-8859-1", "english" ],
+       [ "et", "et-utf8.txt", "utf8", "estonian" ],
+       [ "es", "es-utf8.txt", "utf8", "spanish" ],
+       [ "fa", "fa-utf8.txt", "utf8", "farsi" ],
+       [ "fi", "fi-utf8.txt", "utf8", "finnish" ],
+       [ "fr", "fr-utf8.txt", "utf8", "french" ],
+       [ "fy", "fy-utf8.txt", "utf8", "frisian" ],
+       [ "ga", "ga-utf8.txt", "utf8", "irish" ],
+       # gd (gaelic)
+       # haw (hawaiian)
+       [ "he", "he-utf8.txt", "utf8", "hebrew" ],
+       [ "hi", "hi-utf8.txt", "utf8", "hindi" ],
+       [ "hr", "hr-utf8.txt", "utf8", "croatian" ],
+       # id (indonesian)
+       [ "io", "io-utf8.txt", "utf8", "ido" ],
+       [ "is", "is-utf8.txt", "utf8", "icelandic" ],
+       [ "it", "it-utf8.txt", "utf8", "italian" ],
+       [ "ja", "ja-utf8.txt", "utf8", "japanese" ],
+       [ "ko", "ko-utf8.txt", "utf8", "korean" ],
+       # ku (kurdish)
+       # la ?
+       # lb ?
+       # lt (lithuanian)
+       # lv (latvian)
+       [ "hu", "hu-utf8.txt", "utf8", "hungarian" ],
+       # mk (macedonian)
+       # ms (malay)
+       # my (burmese)
+       [ "nl", "nl-iso-8859-1.txt", "iso-8859-1", "dutch" ],
+       [ "no", "no-utf8.txt", "utf8", "norwegian" ],
+       [ "pl", "pl-utf8.txt", "utf8", "polish" ],
+       [ "pt", "pt-utf8.txt", "utf8", "portuguese" ],
+       [ "ro", "ro-utf8.txt", "utf8", "romanian" ],
+       [ "ru", "ru-utf8.txt", "utf8", "russian" ],
+       [ "sl", "sl-utf8.txt", "utf8", "slovenian" ],
+       # sr (serbian)
+       [ "sv", "sv-iso-8859-1.txt", "iso-8859-1", "swedish" ],
+       #[ "sv", "sv-utf8.txt", "utf8", "swedish" ],
+       [ "th", "th-utf8.txt", "utf8", "thai" ],
+       # tl (tagalog)
+       # ty (tahitian)
+       [ "uk", "uk-utf8.txt", "utf8", "ukrainian" ],
+       [ "vi", "vi-utf8.txt", "utf8", "vietnamese" ],
+       # wa (walloon)
+       # yi (yiddish)
+       [ "zh", "zh-utf8.txt", "utf8", "chinese" ]
+     ]
+
+     profiles = []
+     training_data.each {|data|
+       p = Profile.new data[0]
+       p.init_with_file data[1]
+       profiles << p
+     }
+     puts 'saving model...'
+     filename = File.expand_path(File.join(File.dirname(__FILE__), "model.yml"))
+     File.open(filename, 'w') {|f|
+       YAML.dump(profiles, f)
+     }
+   end
+
+   def load_model
+     filename = File.expand_path(File.join(File.dirname(__FILE__), "model.yml"))
+     @profiles = YAML.load_file(filename)
+   end
+ end
+
+ class Profile
+
+   PUNCTUATIONS = [?\n, ?\r, ?\t, ?\s, ?!, ?", ?#, ?$, ?%, ?&, ?', ?(, ?), ?*, ?+, ?,, ?-, ?., ?/,
+                   ?0, ?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9,
+                   ?:, ?;, ?<, ?=, ?>, ??, ?@, ?[, ?\\, ?], ?^, ?_, ?`, ?{, ?|, ?}, ?~]
+
+   # Profiles keep only their LIMIT most frequent n-grams; LIMIT is also the
+   # penalty charged for an n-gram a profile has never seen.
+   LIMIT = 2000
+
+   # "Out-of-place" distance between two ranked n-gram lists: the sum of the
+   # rank differences, with LIMIT charged for each unknown n-gram.
+   def compute_distance other_profile
+     distance = 0
+     other_profile.ngrams.each {|k, v|
+       n = @ngrams[k]
+       if n
+         distance += (v - n).abs
+       else
+         distance += Profile::LIMIT
+       end
+     }
+     return distance
+   end
+
+   attr_reader :ngrams, :name
+
+   def initialize(name)
+     @name = name
+     @punctuations = {}
+     PUNCTUATIONS.each {|p| @punctuations[p] = 1}
+     @ngrams = {}
+   end
+
+   def init_with_file filename
+     ngram_count = {}
+
+     path = File.expand_path(File.join(File.dirname(__FILE__), "training_data/" + filename))
+     puts "training with " + path
+     File.open(path) {|f|
+       f.each_line {|line| _init_with_string line, ngram_count }
+     }
+
+     # Keep the LIMIT most frequent n-grams, mapped to rank (1 = most frequent).
+     sorted = ngram_count.sort {|x, y| y[1] <=> x[1]}
+     i = 1
+     sorted.each {|t|
+       @ngrams[t[0]] = i
+       i += 1
+       break if i > LIMIT
+     }
+   end
+
+   def init_with_string str
+     ngram_count = {}
+
+     _init_with_string str, ngram_count
+
+     sorted = ngram_count.sort {|x, y| y[1] <=> x[1]}
+     i = 1
+     sorted.each {|t|
+       @ngrams[t[0]] = i
+       i += 1
+       break if i > LIMIT
+     }
+   end
+
+   # Count all 2- to 5-grams of every token in str into ngram_count.
+   def _init_with_string str, ngram_count
+     tokens = tokenize(str)
+     tokens.each {|token|
+       count_ngram token, 2, ngram_count
+       count_ngram token, 3, ngram_count
+       count_ngram token, 4, ngram_count
+       count_ngram token, 5, ngram_count
+     }
+   end
+
+   # Split str into runs of characters delimited by whitespace, digits and
+   # punctuation. Iterates characters rather than bytes so that multi-byte
+   # UTF-8 text is not split mid-character.
+   def tokenize str
+     tokens = []
+     s = ''
+     str.each_char {|c|
+       if is_punctuation?(c)
+         tokens << s unless s.empty?
+         s = ''
+       else
+         s << c
+       end
+     }
+     tokens << s unless s.empty?
+     return tokens
+   end
+
+   def is_punctuation? c
+     @punctuations[c]
+   end
+
+   # Slide a window of length n over token, tallying each n-gram in counts.
+   def count_ngram token, n, counts
+     i = 0
+     while i + n <= token.length
+       s = token[i, n]
+       counts[s] = (counts[s] || 0) + 1
+       i += 1
+     end
+     return counts
+   end
+
+ end
+
+ if $0 == __FILE__
+   if ARGV.length == 1 && 'train' == ARGV[0]
+     LanguageDetector.train
+   else
+     d = LanguageDetector.new
+     p d.detect("what language is this?")
+   end
+ end
+
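The heart of lib/language_detector.rb is the "out-of-place" measure in Profile#compute_distance: each profile ranks its LIMIT most frequent 2- to 5-grams, and the distance between two texts is the sum of their rank differences, with LIMIT charged for every n-gram the known profile has never seen. A minimal self-contained sketch of that measure (rank_ngrams and out_of_place_distance are illustrative helpers, not part of the gem's API):

  LIMIT = 2000  # same penalty the gem charges for unseen n-grams

  # Map each n-gram to its frequency rank, 1 = most frequent.
  def rank_ngrams(counts)
    ranks = {}
    counts.sort_by {|_, c| -c }.each_with_index {|(gram, _), idx| ranks[gram] = idx + 1 }
    ranks
  end

  # Sum of rank differences; n-grams absent from the known profile cost LIMIT.
  def out_of_place_distance(known, unknown)
    unknown.sum {|gram, rank| known[gram] ? (rank - known[gram]).abs : LIMIT }
  end

  english = rank_ngrams("th" => 10, "he" => 8, "in" => 5)
  sample  = rank_ngrams("th" => 3,  "he" => 2, "xq" => 1)
  p out_of_place_distance(english, sample)  # => 2000: ranks agree except "xq", which costs LIMIT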