feedbackmine-language_detector 0.1.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README CHANGED
@@ -1,8 +1,8 @@
1
- feedbackmine
2
- http://twitter.com/feedbackmine
3
1
 
4
2
  == Welcome
5
- This is a n-gram based language detector, written in ruby
3
+ This is an n-gram based language detector, written in Ruby.
4
+
5
+ For technical details about language detection, I recommend this paper: http://www.sfs.uni-tuebingen.de/iscl/Theses/kranig.pdf
6
6
 
7
7
  == Installation
8
8
  sudo gem sources -a http://gems.github.com (you only have to do this once)
@@ -14,5 +14,8 @@ d = LanguageDetector.new
14
14
  p d.detect('this text is in English')
15
15
 
16
16
  == Thanks
17
- Kevin Burton (http://feedblog.org/2005/08/19/ngram-language-categorization-source/). I used all the training data in his ngramcat project.
17
+ Kevin Burton (http://feedblog.org/2005/08/19/ngram-language-categorization-source/ ). I used all the training data in his ngramcat project.
18
18
 
19
+ == Follow us on twitter
20
+
21
+ http://twitter.com/feedbackmine
@@ -1,3 +1,7 @@
1
+ require 'yaml'
2
+ require 'jcode'
3
+ $KCODE = 'u' if RUBY_VERSION < '1.9'
4
+
1
5
  class LanguageDetector
2
6
  def detect text
3
7
  @profiles ||= load_model
@@ -8,7 +12,7 @@ class LanguageDetector
8
12
  best_distance = nil
9
13
  @profiles.each {|profile|
10
14
  distance = profile.compute_distance(p)
11
-
15
+
12
16
  if !best_distance || distance < best_distance
13
17
  best_distance = distance
14
18
  best_profile = profile
@@ -16,11 +20,11 @@ class LanguageDetector
16
20
  }
17
21
  return best_profile.name
18
22
  end
19
-
23
+
20
24
  def self.train
21
-
25
+
22
26
  # For a full list of ISO 639 language tags visit:
23
-
27
+
24
28
  # http://www.loc.gov/standards/iso639-2/englangn.html
25
29
 
26
30
  #LARGE profiles follow:
@@ -31,7 +35,7 @@ class LanguageDetector
31
35
  #always a good source of data.
32
36
  #
33
37
  # http://en.wikipedia.org/wiki/World_War_II
34
-
38
+
35
39
  training_data = [
36
40
  # af (afrikaans)
37
41
  [ "ar", "ar-utf8.txt", "utf8", "arabic" ],
@@ -92,7 +96,7 @@ class LanguageDetector
92
96
  ]
93
97
 
94
98
  profiles = []
95
- training_data.each {|data|
99
+ training_data.each {|data|
96
100
  p = Profile.new data[0]
97
101
  p.init_with_file data[1]
98
102
  profiles << p
@@ -103,7 +107,7 @@ class LanguageDetector
103
107
  YAML.dump(profiles, f)
104
108
  }
105
109
  end
106
-
110
+
107
111
  def load_model
108
112
  filename = File.expand_path(File.join(File.dirname(__FILE__), "model.yml"))
109
113
  @profiles = YAML.load_file(filename)
@@ -112,12 +116,12 @@ end
112
116
 
113
117
  class Profile
114
118
 
115
- PUNCTUATIONS = [?\n, ?\r, ?\t, ?\s, ?!, ?", ?#, ?$, ?%, ?&, ?', ?(, ?), ?*, ?+, ?,, ?-, ?., ?/,
116
- ?0, ?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9,
119
+ PUNCTUATIONS = [?\n, ?\r, ?\t, ?\s, ?!, ?", ?#, ?$, ?%, ?&, ?', ?(, ?), ?*, ?+, ?,, ?-, ?., ?/,
120
+ ?0, ?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9,
117
121
  ?:, ?;, ?<, ?=, ?>, ??, ?@, ?[, ?\\, ?], ?^, ?_, ?`, ?{, ?|, ?}, ?~]
118
-
122
+
119
123
  LIMIT = 2000
120
-
124
+
121
125
  def compute_distance other_profile
122
126
  distance = 0
123
127
  other_profile.ngrams.each {|k, v|
@@ -130,48 +134,48 @@ class Profile
130
134
  }
131
135
  return distance
132
136
  end
133
-
137
+
134
138
  attr_reader :ngrams, :name
135
-
139
+
136
140
  def initialize(name)
137
141
  @name = name
138
142
  @puctuations = {}
139
143
  PUNCTUATIONS.each {|p| @puctuations[p] = 1}
140
144
  @ngrams = {}
141
145
  end
142
-
146
+
143
147
  def init_with_file filename
144
148
  ngram_count = {}
145
-
149
+
146
150
  path = File.expand_path(File.join(File.dirname(__FILE__), "training_data/" + filename))
147
151
  puts "training with " + path
148
- File.open(path).each_line{ |line|
149
- _init_with_string line, ngram_count
152
+ File.open(path).each_line{ |line|
153
+ _init_with_string line, ngram_count
150
154
  }
151
-
155
+
152
156
  a = ngram_count.sort {|a,b| b[1] <=> a[1]}
153
157
  i = 1
154
- a.each {|t|
158
+ a.each {|t|
155
159
  @ngrams[t[0]] = i
156
160
  i += 1
157
161
  break if i > LIMIT
158
162
  }
159
163
  end
160
-
164
+
161
165
  def init_with_string str
162
166
  ngram_count = {}
163
-
167
+
164
168
  _init_with_string str, ngram_count
165
-
169
+
166
170
  a = ngram_count.sort {|a,b| b[1] <=> a[1]}
167
171
  i = 1
168
- a.each {|t|
172
+ a.each {|t|
169
173
  @ngrams[t[0]] = i
170
174
  i += 1
171
175
  break if i > LIMIT
172
176
  }
173
177
  end
174
-
178
+
175
179
  def _init_with_string str, ngram_count
176
180
  tokens = tokenize(str)
177
181
  tokens.each {|token|
@@ -179,13 +183,13 @@ class Profile
179
183
  count_ngram token, 3, ngram_count
180
184
  count_ngram token, 4, ngram_count
181
185
  count_ngram token, 5, ngram_count
182
- }
186
+ }
183
187
  end
184
-
188
+
185
189
  def tokenize str
186
190
  tokens = []
187
191
  s = ''
188
- str.each_byte {|b|
192
+ str.each_byte {|b|
189
193
  if is_puctuation?(b)
190
194
  tokens << s unless s.empty?
191
195
  s = ''
@@ -196,12 +200,13 @@ class Profile
196
200
  tokens << s unless s.empty?
197
201
  return tokens
198
202
  end
199
-
203
+
200
204
  def is_puctuation? b
201
205
  @puctuations[b]
202
206
  end
203
-
207
+
204
208
  def count_ngram token, n, counts
209
+ token = "_#{token}#{'_' * (n-1)}" if n > 1 && token.jlength >= n
205
210
  i = 0
206
211
  while i + n <= token.length
207
212
  s = ''
@@ -210,14 +215,14 @@ class Profile
210
215
  s << token[i+j]
211
216
  j += 1
212
217
  end
213
- if counts[s]
218
+ if counts[s]
214
219
  counts[s] = counts[s] + 1
215
- else
220
+ else
216
221
  counts[s] = 1
217
222
  end
218
223
  i += 1
219
224
  end
220
-
225
+
221
226
  return counts
222
227
  end
223
228