feedbackmine-language_detector 0.1.1 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -6,7 +6,7 @@ class LanguageDetector
6
6
  def detect text
7
7
  @profiles ||= load_model
8
8
 
9
- p = Profile.new("")
9
+ p = LanguageDetector::Profile.new("")
10
10
  p.init_with_string text
11
11
  best_profile = nil
12
12
  best_distance = nil
@@ -97,7 +97,7 @@ class LanguageDetector
97
97
 
98
98
  profiles = []
99
99
  training_data.each {|data|
100
- p = Profile.new data[0]
100
+ p = LanguageDetector::Profile.new data[0]
101
101
  p.init_with_file data[1]
102
102
  profiles << p
103
103
  }
@@ -112,118 +112,119 @@ class LanguageDetector
112
112
  filename = File.expand_path(File.join(File.dirname(__FILE__), "model.yml"))
113
113
  @profiles = YAML.load_file(filename)
114
114
  end
115
- end
116
-
117
- class Profile
118
-
119
- PUNCTUATIONS = [?\n, ?\r, ?\t, ?\s, ?!, ?", ?#, ?$, ?%, ?&, ?', ?(, ?), ?*, ?+, ?,, ?-, ?., ?/,
120
- ?0, ?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9,
121
- ?:, ?;, ?<, ?=, ?>, ??, ?@, ?[, ?\\, ?], ?^, ?_, ?`, ?{, ?|, ?}, ?~]
122
-
123
- LIMIT = 2000
124
115
 
125
- def compute_distance other_profile
126
- distance = 0
127
- other_profile.ngrams.each {|k, v|
128
- n = @ngrams[k]
129
- if n
130
- distance += (v - n).abs
131
- else
132
- distance += Profile::LIMIT
133
- end
134
- }
135
- return distance
136
- end
116
+ class LanguageDetector::Profile
117
+
118
+ PUNCTUATIONS = [?\n, ?\r, ?\t, ?\s, ?!, ?", ?#, ?$, ?%, ?&, ?', ?(, ?), ?*, ?+, ?,, ?-, ?., ?/,
119
+ ?0, ?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9,
120
+ ?:, ?;, ?<, ?=, ?>, ??, ?@, ?[, ?\\, ?], ?^, ?_, ?`, ?{, ?|, ?}, ?~]
121
+
122
+ LIMIT = 2000
123
+
124
+ def compute_distance other_profile
125
+ distance = 0
126
+ other_profile.ngrams.each {|k, v|
127
+ n = @ngrams[k]
128
+ if n
129
+ distance += (v - n).abs
130
+ else
131
+ distance += LanguageDetector::Profile::LIMIT
132
+ end
133
+ }
134
+ return distance
135
+ end
137
136
 
138
- attr_reader :ngrams, :name
137
+ attr_reader :ngrams, :name
139
138
 
140
- def initialize(name)
141
- @name = name
142
- @puctuations = {}
143
- PUNCTUATIONS.each {|p| @puctuations[p] = 1}
144
- @ngrams = {}
145
- end
139
+ def initialize(name)
140
+ @name = name
141
+ @puctuations = {}
142
+ PUNCTUATIONS.each {|p| @puctuations[p] = 1}
143
+ @ngrams = {}
144
+ end
146
145
 
147
- def init_with_file filename
148
- ngram_count = {}
146
+ def init_with_file filename
147
+ ngram_count = {}
148
+
149
+ path = File.expand_path(File.join(File.dirname(__FILE__), "training_data/" + filename))
150
+ puts "training with " + path
151
+ File.open(path).each_line{ |line|
152
+ _init_with_string line, ngram_count
153
+ }
154
+
155
+ a = ngram_count.sort {|a,b| b[1] <=> a[1]}
156
+ i = 1
157
+ a.each {|t|
158
+ @ngrams[t[0]] = i
159
+ i += 1
160
+ break if i > LIMIT
161
+ }
162
+ end
149
163
 
150
- path = File.expand_path(File.join(File.dirname(__FILE__), "training_data/" + filename))
151
- puts "training with " + path
152
- File.open(path).each_line{ |line|
153
- _init_with_string line, ngram_count
154
- }
164
+ def init_with_string str
165
+ ngram_count = {}
155
166
 
156
- a = ngram_count.sort {|a,b| b[1] <=> a[1]}
157
- i = 1
158
- a.each {|t|
159
- @ngrams[t[0]] = i
160
- i += 1
161
- break if i > LIMIT
162
- }
163
- end
167
+ _init_with_string str, ngram_count
164
168
 
165
- def init_with_string str
166
- ngram_count = {}
169
+ a = ngram_count.sort {|a,b| b[1] <=> a[1]}
170
+ i = 1
171
+ a.each {|t|
172
+ @ngrams[t[0]] = i
173
+ i += 1
174
+ break if i > LIMIT
175
+ }
176
+ end
167
177
 
168
- _init_with_string str, ngram_count
178
+ def _init_with_string str, ngram_count
179
+ tokens = tokenize(str)
180
+ tokens.each {|token|
181
+ count_ngram token, 2, ngram_count
182
+ count_ngram token, 3, ngram_count
183
+ count_ngram token, 4, ngram_count
184
+ count_ngram token, 5, ngram_count
185
+ }
186
+ end
169
187
 
170
- a = ngram_count.sort {|a,b| b[1] <=> a[1]}
171
- i = 1
172
- a.each {|t|
173
- @ngrams[t[0]] = i
174
- i += 1
175
- break if i > LIMIT
176
- }
177
- end
188
+ def tokenize str
189
+ tokens = []
190
+ s = ''
191
+ str.each_byte {|b|
192
+ if is_puctuation?(b)
193
+ tokens << s unless s.empty?
194
+ s = ''
195
+ else
196
+ s << b
197
+ end
198
+ }
199
+ tokens << s unless s.empty?
200
+ return tokens
201
+ end
178
202
 
179
- def _init_with_string str, ngram_count
180
- tokens = tokenize(str)
181
- tokens.each {|token|
182
- count_ngram token, 2, ngram_count
183
- count_ngram token, 3, ngram_count
184
- count_ngram token, 4, ngram_count
185
- count_ngram token, 5, ngram_count
186
- }
187
- end
203
+ def is_puctuation? b
204
+ @puctuations[b]
205
+ end
188
206
 
189
- def tokenize str
190
- tokens = []
191
- s = ''
192
- str.each_byte {|b|
193
- if is_puctuation?(b)
194
- tokens << s unless s.empty?
207
+ def count_ngram token, n, counts
208
+ token = "_#{token}#{'_' * (n-1)}" if n > 1 && token.jlength >= n
209
+ i = 0
210
+ while i + n <= token.length
195
211
  s = ''
196
- else
197
- s << b
212
+ j = 0
213
+ while j < n
214
+ s << token[i+j]
215
+ j += 1
216
+ end
217
+ if counts[s]
218
+ counts[s] = counts[s] + 1
219
+ else
220
+ counts[s] = 1
221
+ end
222
+ i += 1
198
223
  end
199
- }
200
- tokens << s unless s.empty?
201
- return tokens
202
- end
203
-
204
- def is_puctuation? b
205
- @puctuations[b]
206
- end
207
224
 
208
- def count_ngram token, n, counts
209
- token = "_#{token}#{'_' * (n-1)}" if n > 1 && token.jlength >= n
210
- i = 0
211
- while i + n <= token.length
212
- s = ''
213
- j = 0
214
- while j < n
215
- s << token[i+j]
216
- j += 1
217
- end
218
- if counts[s]
219
- counts[s] = counts[s] + 1
220
- else
221
- counts[s] = 1
222
- end
223
- i += 1
225
+ return counts
224
226
  end
225
227
 
226
- return counts
227
228
  end
228
229
 
229
230
  end
data/lib/model.yml CHANGED
@@ -1,5 +1,5 @@
1
1
  ---
2
- - !ruby/object:Profile
2
+ - !ruby/object:LanguageDetector::Profile
3
3
  name: ar
4
4
  ngrams:
5
5
  ? !binary "qtmH\n"
@@ -5149,7 +5149,7 @@
5149
5149
  43: 1
5150
5150
  32: 1
5151
5151
  10: 1
5152
- - !ruby/object:Profile
5152
+ - !ruby/object:LanguageDetector::Profile
5153
5153
  name: bg
5154
5154
  ngrams:
5155
5155
  ? !binary "0L7QtNA=\n"
@@ -10519,7 +10519,7 @@
10519
10519
  43: 1
10520
10520
  32: 1
10521
10521
  10: 1
10522
- - !ruby/object:Profile
10522
+ - !ruby/object:LanguageDetector::Profile
10523
10523
  name: cs
10524
10524
  ngrams:
10525
10525
  He: 765
@@ -12641,7 +12641,7 @@
12641
12641
  43: 1
12642
12642
  32: 1
12643
12643
  10: 1
12644
- - !ruby/object:Profile
12644
+ - !ruby/object:LanguageDetector::Profile
12645
12645
  name: da
12646
12646
  ngrams:
12647
12647
  erede: 347
@@ -14691,7 +14691,7 @@
14691
14691
  43: 1
14692
14692
  32: 1
14693
14693
  10: 1
14694
- - !ruby/object:Profile
14694
+ - !ruby/object:LanguageDetector::Profile
14695
14695
  name: de
14696
14696
  ngrams:
14697
14697
  Ope: 1204
@@ -16751,7 +16751,7 @@
16751
16751
  43: 1
16752
16752
  32: 1
16753
16753
  10: 1
16754
- - !ruby/object:Profile
16754
+ - !ruby/object:LanguageDetector::Profile
16755
16755
  name: el
16756
16756
  ngrams:
16757
16757
  ? !binary "zpTPic8=\n"
@@ -22079,7 +22079,7 @@
22079
22079
  43: 1
22080
22080
  32: 1
22081
22081
  10: 1
22082
- - !ruby/object:Profile
22082
+ - !ruby/object:LanguageDetector::Profile
22083
22083
  name: en
22084
22084
  ngrams:
22085
22085
  my_: 833
@@ -24129,7 +24129,7 @@
24129
24129
  43: 1
24130
24130
  32: 1
24131
24131
  10: 1
24132
- - !ruby/object:Profile
24132
+ - !ruby/object:LanguageDetector::Profile
24133
24133
  name: et
24134
24134
  ngrams:
24135
24135
  _saav: 1637
@@ -26215,7 +26215,7 @@
26215
26215
  43: 1
26216
26216
  32: 1
26217
26217
  10: 1
26218
- - !ruby/object:Profile
26218
+ - !ruby/object:LanguageDetector::Profile
26219
26219
  name: es
26220
26220
  ngrams:
26221
26221
  nco: 1791
@@ -28277,7 +28277,7 @@
28277
28277
  43: 1
28278
28278
  32: 1
28279
28279
  10: 1
28280
- - !ruby/object:Profile
28280
+ - !ruby/object:LanguageDetector::Profile
28281
28281
  name: fa
28282
28282
  ngrams:
28283
28283
  ? !binary "2YPYp9g=\n"
@@ -33419,7 +33419,7 @@
33419
33419
  43: 1
33420
33420
  32: 1
33421
33421
  10: 1
33422
- - !ruby/object:Profile
33422
+ - !ruby/object:LanguageDetector::Profile
33423
33423
  name: fi
33424
33424
  ngrams:
33425
33425
  valla: 637
@@ -35483,7 +35483,7 @@
35483
35483
  43: 1
35484
35484
  32: 1
35485
35485
  10: 1
35486
- - !ruby/object:Profile
35486
+ - !ruby/object:LanguageDetector::Profile
35487
35487
  name: fr
35488
35488
  ngrams:
35489
35489
  hine: 1617
@@ -37549,7 +37549,7 @@
37549
37549
  43: 1
37550
37550
  32: 1
37551
37551
  10: 1
37552
- - !ruby/object:Profile
37552
+ - !ruby/object:LanguageDetector::Profile
37553
37553
  name: fy
37554
37554
  ngrams:
37555
37555
  nfoel: 1606
@@ -39607,7 +39607,7 @@
39607
39607
  43: 1
39608
39608
  32: 1
39609
39609
  10: 1
39610
- - !ruby/object:Profile
39610
+ - !ruby/object:LanguageDetector::Profile
39611
39611
  name: ga
39612
39612
  ngrams:
39613
39613
  ud: 1880
@@ -41671,7 +41671,7 @@
41671
41671
  43: 1
41672
41672
  32: 1
41673
41673
  10: 1
41674
- - !ruby/object:Profile
41674
+ - !ruby/object:LanguageDetector::Profile
41675
41675
  name: he
41676
41676
  ngrams:
41677
41677
  ? !binary "ldep15k=\n"
@@ -46859,7 +46859,7 @@
46859
46859
  43: 1
46860
46860
  32: 1
46861
46861
  10: 1
46862
- - !ruby/object:Profile
46862
+ - !ruby/object:LanguageDetector::Profile
46863
46863
  name: hi
46864
46864
  ngrams:
46865
46865
  ? !binary "ruCkuA==\n"
@@ -51907,7 +51907,7 @@
51907
51907
  43: 1
51908
51908
  32: 1
51909
51909
  10: 1
51910
- - !ruby/object:Profile
51910
+ - !ruby/object:LanguageDetector::Profile
51911
51911
  name: hr
51912
51912
  ngrams:
51913
51913
  vine: 1902
@@ -53967,7 +53967,7 @@
53967
53967
  43: 1
53968
53968
  32: 1
53969
53969
  10: 1
53970
- - !ruby/object:Profile
53970
+ - !ruby/object:LanguageDetector::Profile
53971
53971
  name: io
53972
53972
  ngrams:
53973
53973
  _Kere: 1912
@@ -56027,7 +56027,7 @@
56027
56027
  43: 1
56028
56028
  32: 1
56029
56029
  10: 1
56030
- - !ruby/object:Profile
56030
+ - !ruby/object:LanguageDetector::Profile
56031
56031
  name: is
56032
56032
  ngrams:
56033
56033
  kaga_: 1774
@@ -58175,7 +58175,7 @@
58175
58175
  43: 1
58176
58176
  32: 1
58177
58177
  10: 1
58178
- - !ruby/object:Profile
58178
+ - !ruby/object:LanguageDetector::Profile
58179
58179
  name: it
58180
58180
  ngrams:
58181
58181
  Ope: 1795
@@ -60235,7 +60235,7 @@
60235
60235
  43: 1
60236
60236
  32: 1
60237
60237
  10: 1
60238
- - !ruby/object:Profile
60238
+ - !ruby/object:LanguageDetector::Profile
60239
60239
  name: ja
60240
60240
  ngrams:
60241
60241
  ? !binary "g6vj\n"
@@ -65965,7 +65965,7 @@
65965
65965
  43: 1
65966
65966
  32: 1
65967
65967
  10: 1
65968
- - !ruby/object:Profile
65968
+ - !ruby/object:LanguageDetector::Profile
65969
65969
  name: ko
65970
65970
  ngrams:
65971
65971
  ? !binary "mIDqsw==\n"
@@ -71445,7 +71445,7 @@
71445
71445
  43: 1
71446
71446
  32: 1
71447
71447
  10: 1
71448
- - !ruby/object:Profile
71448
+ - !ruby/object:LanguageDetector::Profile
71449
71449
  name: hu
71450
71450
  ngrams:
71451
71451
  "ens\xC3\xA9": 1414
@@ -73515,7 +73515,7 @@
73515
73515
  43: 1
73516
73516
  32: 1
73517
73517
  10: 1
73518
- - !ruby/object:Profile
73518
+ - !ruby/object:LanguageDetector::Profile
73519
73519
  name: nl
73520
73520
  ngrams:
73521
73521
  He: 619
@@ -75565,7 +75565,7 @@
75565
75565
  43: 1
75566
75566
  32: 1
75567
75567
  10: 1
75568
- - !ruby/object:Profile
75568
+ - !ruby/object:LanguageDetector::Profile
75569
75569
  name: "no"
75570
75570
  ngrams:
75571
75571
  Ope: 1032
@@ -77625,7 +77625,7 @@
77625
77625
  43: 1
77626
77626
  32: 1
77627
77627
  10: 1
77628
- - !ruby/object:Profile
77628
+ - !ruby/object:LanguageDetector::Profile
77629
77629
  name: pl
77630
77630
  ngrams:
77631
77631
  "k\xC3\xB3w": 1379
@@ -79717,7 +79717,7 @@
79717
79717
  43: 1
79718
79718
  32: 1
79719
79719
  10: 1
79720
- - !ruby/object:Profile
79720
+ - !ruby/object:LanguageDetector::Profile
79721
79721
  name: pt
79722
79722
  ngrams:
79723
79723
  nco: 1274
@@ -81815,7 +81815,7 @@
81815
81815
  43: 1
81816
81816
  32: 1
81817
81817
  10: 1
81818
- - !ruby/object:Profile
81818
+ - !ruby/object:LanguageDetector::Profile
81819
81819
  name: ro
81820
81820
  ngrams:
81821
81821
  _ur: 1996
@@ -83893,7 +83893,7 @@
83893
83893
  43: 1
83894
83894
  32: 1
83895
83895
  10: 1
83896
- - !ruby/object:Profile
83896
+ - !ruby/object:LanguageDetector::Profile
83897
83897
  name: ru
83898
83898
  ngrams:
83899
83899
  ? !binary "v9C/\n"
@@ -89077,7 +89077,7 @@
89077
89077
  43: 1
89078
89078
  32: 1
89079
89079
  10: 1
89080
- - !ruby/object:Profile
89080
+ - !ruby/object:LanguageDetector::Profile
89081
89081
  name: sl
89082
89082
  ngrams:
89083
89083
  preds: 594
@@ -91147,7 +91147,7 @@
91147
91147
  43: 1
91148
91148
  32: 1
91149
91149
  10: 1
91150
- - !ruby/object:Profile
91150
+ - !ruby/object:LanguageDetector::Profile
91151
91151
  name: sv
91152
91152
  ngrams:
91153
91153
  karna: 1187
@@ -93197,7 +93197,7 @@
93197
93197
  43: 1
93198
93198
  32: 1
93199
93199
  10: 1
93200
- - !ruby/object:Profile
93200
+ - !ruby/object:LanguageDetector::Profile
93201
93201
  name: th
93202
93202
  ngrams:
93203
93203
  ? !binary "uYPguKs=\n"
@@ -98785,7 +98785,7 @@
98785
98785
  43: 1
98786
98786
  32: 1
98787
98787
  10: 1
98788
- - !ruby/object:Profile
98788
+ - !ruby/object:LanguageDetector::Profile
98789
98789
  name: uk
98790
98790
  ngrams:
98791
98791
  "\xBA\xD0\xBE__": 1806
@@ -103973,7 +103973,7 @@
103973
103973
  43: 1
103974
103974
  32: 1
103975
103975
  10: 1
103976
- - !ruby/object:Profile
103976
+ - !ruby/object:LanguageDetector::Profile
103977
103977
  name: vi
103978
103978
  ngrams:
103979
103979
  ? !binary "xJHhu5E=\n"
@@ -106335,7 +106335,7 @@
106335
106335
  43: 1
106336
106336
  32: 1
106337
106337
  10: 1
106338
- - !ruby/object:Profile
106338
+ - !ruby/object:LanguageDetector::Profile
106339
106339
  name: zh
106340
106340
  ngrams:
106341
106341
  ? !binary "6L+Z\n"
@@ -4,7 +4,7 @@ require File.dirname(__FILE__) + '/../lib/language_detector'
4
4
 
5
5
  class ProfileTest < Test::Unit::TestCase
6
6
  def test_is_puctuation
7
- p = Profile.new("test")
7
+ p = LanguageDetector::Profile.new("test")
8
8
  assert p.is_puctuation?(?,)
9
9
  assert p.is_puctuation?(?.)
10
10
  assert !p.is_puctuation?(?A)
@@ -12,12 +12,12 @@ class ProfileTest < Test::Unit::TestCase
12
12
  end
13
13
 
14
14
  def test_tokenize
15
- p = Profile.new("test")
15
+ p = LanguageDetector::Profile.new("test")
16
16
  assert_equal ["this", "is", "A", "test"], p.tokenize("this is ,+_ A \t 123 test")
17
17
  end
18
18
 
19
19
  def test_count_ngram
20
- p = Profile.new("test")
20
+ p = LanguageDetector::Profile.new("test")
21
21
  assert_equal({"w"=>1, "o"=>1, "r"=>1, "d"=>1, "s"=>1}, p.count_ngram('words', 1, {}))
22
22
  assert_equal({"wo"=>1, "or"=>1, "rd"=>1, "ds"=>1, "_w" => 1, "s_" => 1}, p.count_ngram('words', 2, {}))
23
23
  assert_equal({"wor"=>1, "ord"=>1, "rds"=>1, "_wo" => 1, "ds_" => 1, "s__" => 1}, p.count_ngram('words', 3, {}))
@@ -27,25 +27,29 @@ class ProfileTest < Test::Unit::TestCase
27
27
  end
28
28
 
29
29
  def test_init_with_string
30
- p = Profile.new("test")
30
+ p = LanguageDetector::Profile.new("test")
31
31
  p.init_with_string("this is ,+_ A \t 123 test")
32
- assert_equal([["t_", 30], ["st__", 29], ["st", 16], ["hi", 8], ["_tes", 7], ["is__", 6], ["s___", 5], ["s_", 3], ["his_", 11], ["tes", 10], ["t___", 9], ["es", 12], ["_te", 14], ["est_", 13], ["est", 15], ["te", 4], ["his", 17], ["_th", 20], ["s__", 19], ["st_", 18], ["th", 24], ["_thi", 23], ["t__", 22], ["test", 21], ["thi", 28], ["is_", 27], ["this", 26], ["_i", 25], ["is", 2], ["_t", 1]], p.ngrams.sort_by { |a,b| a[1] <=> b[1] })
32
+ assert_equal(
33
+ [["t_", 30], ["st__", 29], ["st", 16], ["hi", 8], ["_tes", 7], ["is__", 6], ["s___", 5], ["s_", 3], ["his_", 11], ["tes", 10], ["t___", 9], ["es", 12], ["_te", 14], ["est_", 13], ["est", 15], ["te", 4], ["his", 17], ["_th", 20], ["s__", 19], ["st_", 18], ["th", 24], ["_thi", 23], ["t__", 22], ["test", 21], ["thi", 28], ["is_", 27], ["this", 26], ["_i", 25], ["is", 2], ["_t", 1]],
34
+ p.ngrams.sort_by { |a,b| a[1] <=> b[1] },
35
+ "This test does not pass in the original repository either: http://github.com/feedbackmine/language_detector"
36
+ )
33
37
  end
34
38
 
35
39
  def test_init_with_file
36
- p = Profile.new("test")
40
+ p = LanguageDetector::Profile.new("test")
37
41
  p.init_with_file("bg-utf8.txt")
38
42
  assert !p.ngrams.empty?
39
43
  end
40
44
 
41
45
  def test_compute_distance
42
- p1 = Profile.new("test")
46
+ p1 = LanguageDetector::Profile.new("test")
43
47
  p1.init_with_string("this is ,+_ A \t 123 test")
44
- p2 = Profile.new("test")
48
+ p2 = LanguageDetector::Profile.new("test")
45
49
  p2.init_with_string("this is ,+_ A \t 123 test")
46
50
  assert_equal 0, p1.compute_distance(p2)
47
51
 
48
- p3 = Profile.new("test")
52
+ p3 = LanguageDetector::Profile.new("test")
49
53
  p3.init_with_string("xxxx")
50
54
  assert_equal 24000, p1.compute_distance(p3)
51
55
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: feedbackmine-language_detector
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 0.1.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - feedbackmine
@@ -29,6 +29,7 @@ files:
29
29
  - test/language_detector_test.rb
30
30
  has_rdoc: false
31
31
  homepage: http://www.tweetjobsearch.com
32
+ licenses:
32
33
  post_install_message:
33
34
  rdoc_options: []
34
35
 
@@ -49,7 +50,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
49
50
  requirements: []
50
51
 
51
52
  rubyforge_project:
52
- rubygems_version: 1.2.0
53
+ rubygems_version: 1.3.5
53
54
  signing_key:
54
55
  specification_version: 2
55
56
  summary: n-gram based language detector, written in ruby