rlid 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/data/naive_bayes_models +127059 -1
- data/lib/interactive_guesser.rb +12 -0
- data/lib/rlid.rb +8 -0
- data/lib/rlid/common.rb +144 -0
- data/lib/rlid/language_guesser/language_guesser.rb +15 -0
- data/lib/rlid/language_guesser/model_distance_guesser.rb +43 -0
- data/lib/rlid/language_guesser/naive_bayes_guesser.rb +74 -0
- data/lib/rlid/models/cosine_distance_model.rb +69 -0
- data/lib/rlid/models/generate_models.rb +22 -0
- data/lib/rlid/models/generate_naive_bayes_models.rb +9 -0
- data/lib/rlid/models/model.rb +55 -0
- data/lib/rlid/models/naive_bayes_models.rb +157 -0
- data/lib/rlid/models/ordered_ngrams.rb +100 -0
- data/lib/rlid/probabilities/language_probabilities.rb +199 -0
- data/lib/rlid/tmp.rb +57 -0
- data/lib/rlid/web.rb +16 -0
- metadata +71 -0
@@ -0,0 +1,157 @@
|
|
1
|
+
#!/usr/bin/env ruby1.9.1
|
2
|
+
|
3
|
+
module Rlid
|
4
|
+
|
5
|
+
require 'set'
|
6
|
+
|
7
|
+
require 'rlid/common'
|
8
|
+
|
9
|
+
# Naive Bayes language models built on character n-gram counts.
#
# For every language the model keeps the raw count of each "stored" n-gram
# (the union of the CUTOFF most frequent n-grams of every language); every
# other n-gram is accumulated under the single OTHER feature. A stored n-gram
# that was never seen for a language counts as +default_count+ occurrences.
class NaiveBayesModels
  # count used for stored ngrams that a language never showed in training
  attr_accessor :default_count

  # ngram length
  N = 3
  # top ngrams kept for every language
  CUTOFF = 3000
  # special feature: hash key under which all non-stored ngrams are summed
  OTHER = nil

  # maximum number of leading characters of the input that get scored
  MAX_STRING_LENGTH = 75

  FILEPATH = "#{DATA_DIRECTORY}/naive_bayes_models"

  def initialize(default_count=1)
    @default_count = default_count
  end

  # Trains the models and marshals them to FILEPATH.
  # The models are created with a nil default_count, so a count has to be
  # assigned (see default_count=) before frequencies can be computed.
  def self.generate_models
    models = NaiveBayesModels.new(nil)
    puts "Training started.."
    models.train
    # Marshal output is binary data: "wb" prevents newline/encoding
    # translation from corrupting it on platforms that distinguish text mode.
    File.open(FILEPATH, "wb") do |file|
      file.write Marshal.dump(models)
      puts "Models saved to #{FILEPATH}"
    end
  end

  # Reads back the models written by generate_models.
  # NOTE(review): Marshal.load must only be fed trusted data; FILEPATH is
  # expected to be the data file generated by this class.
  def self.load
    File.open(FILEPATH, "rb") { |file| Marshal.load(file) }
  end

  # Yields (lang, prob) once per known language, where prob is the product
  # of frequency_of(lang, ngram) over the ngrams of the first
  # MAX_STRING_LENGTH characters of +string+.
  # Raises InvalidArgument when +string+ is not a String.
  def probabilities(string)
    raise InvalidArgument unless string.is_a? String

    # exclusive range: exactly MAX_STRING_LENGTH characters (an inclusive
    # range would let one extra character through)
    sample = string[0...MAX_STRING_LENGTH]
    @ngram_frequency.keys.each do |lang|
      prob = 1
      # same ngram length as the one used in training
      sample.each_ngram(N) do |ngram|
        prob *= frequency_of(lang, ngram)
      end
      yield lang, prob
    end
  end

  # Builds the per-language tables from the corpus counts.
  def train
    ngram_counts = get_ngram_counts
    # ngrams for which we want to store information (all languages)
    @stored_ngrams = top_ngrams(ngram_counts)

    puts "- processing ngrams"
    # content: ngram_frequency[lang][ngram] = count
    @ngram_frequency = Hash.new
    # content: total_ngrams_found[lang] = total count of ngrams encountered
    @total_ngrams_found = Hash.new
    # content: total_ngrams_not_found[lang] = n of stored ngrams never seen
    @total_ngrams_not_found = Hash.new

    ngram_counts.each do |lang, counts|
      frequencies = Hash.new(0)
      @ngram_frequency[lang] = frequencies
      @total_ngrams_found[lang] = 0
      counts.each do |ngram, count|
        if @stored_ngrams.include?(ngram)
          frequencies[ngram] = count
        else
          frequencies[OTHER] += count
        end
        @total_ngrams_found[lang] += count
      end

      not_found = (@stored_ngrams - frequencies.keys).size
      @total_ngrams_not_found[lang] = not_found

      puts_info(lang)
    end

    # Pseudo-model for "no language": it only knows the OTHER feature, with
    # 1.5 times the largest OTHER count seen among the real languages.
    # All three tables use the same key, because frequency_of/total_ngrams
    # look the language up in each of them.
    no_language = Language::NO_LANGUAGE_CODE
    n = @ngram_frequency.values.map { |x| x[OTHER] }.max * 3 / 2 # (* 1.5)
    @total_ngrams_found[no_language] = n
    @ngram_frequency[no_language] = {OTHER => n}
    @total_ngrams_not_found[no_language] = @stored_ngrams.size
  end

  protected

  # Denominator used for the frequencies of +lang+: the ngrams seen in
  # training plus default_count for every stored ngram that was not seen.
  def total_ngrams(lang)
    @total_ngrams_found[lang] + @total_ngrams_not_found[lang] * @default_count
  end

  # Relative frequency of +ngram+ in +lang+. Non-stored ngrams are looked up
  # as OTHER; features missing from the language use default_count.
  def frequency_of(lang, ngram)
    ngram = OTHER unless @stored_ngrams.include?(ngram)
    frequencies = @ngram_frequency[lang]
    count = frequencies.include?(ngram) ? frequencies[ngram] : @default_count
    count.to_f / total_ngrams(lang)
  end

  private

  # Prints a one-line training summary for +lang+.
  def puts_info(lang)
    # a default count of 1 is assumed here
    tot = @total_ngrams_found[lang] + @total_ngrams_not_found[lang]
    d = (100.0 * @total_ngrams_not_found[lang] / tot).round(1)
    o = (100.0 * @ngram_frequency[lang][OTHER] / tot).round(1)
    puts " #{lang} processed tot:#{tot}, default:#{d}%, other:#{o}%"
  end

  # auxiliary functions

  # Reads every corpus file and returns a hash having:
  # ngram_counts[lang][ngram] = count
  # (the stored ngram set is derived from this hash by top_ngrams, so it is
  # not computed a second time here)
  def get_ngram_counts
    ngram_counts = Hash.new
    Language.each_file("corpus") do |file, lang|
      puts "- I'm learning #{lang}"
      counts = Hash.new(0) # unseen ngrams start from a count of 0
      file.read.each_ngram(N) do |ngram|
        counts[ngram] += 1
      end
      ngram_counts[lang] = counts
    end
    ngram_counts
  end

  # Returns the Set made of the CUTOFF most frequent ngrams of every language.
  def top_ngrams(ngram_counts)
    res = Set.new
    ngram_counts.values.each do |hash|
      # ngrams as [ngram, count] pairs, highest count first
      ranked = hash.to_a.sort { |x, y| y[1] <=> x[1] }
      res.merge(ranked[0...CUTOFF].map { |x| x[0] })
    end
    res
  end
end
|
155
|
+
|
156
|
+
|
157
|
+
end # module Rlid
|
@@ -0,0 +1,100 @@
|
|
1
|
+
module Rlid
|
2
|
+
|
3
|
+
require 'rlid/models/model'
|
4
|
+
require 'rlid/common'
|
5
|
+
|
6
|
+
# a subclass should define the filename
|
7
|
+
# Model that remembers, for its most frequent ngrams, the rank (position)
# each one holds when ordered by decreasing count. Two models are compared
# through the rank differences of their ngrams (see #-).
class OrderedNGrams < NGramModel
  N = 3

  def initialize(string, cutoff=300)
    super(string, N, cutoff)
  end

  # Writes one "<ngram> <position>" line to +file+ for every stored ngram.
  def save(file)
    @ngram_pos.each { |ngram, pos| file.write "#{ngram} #{pos}\n" }
  end

  # Rebuilds the ngram => position table from +file+: the position is the
  # index of the line, the ngram is the first N characters of the line.
  def load(file)
    @ngram_pos = Hash.new
    line_index = 0
    file.each_line do |line|
      # keep only the first N characters of the line
      @ngram_pos[line.gsub(/^(.{#{N}}).*\n?/, '\1')] = line_index
      line_index += 1
    end
  end

  # Fills the ngram => position table with the @cutoff highest counts of
  # +ngram_count+ (pairs of ngram and count).
  def generate_model(ngram_count)
    # pairs ordered by decreasing count, truncated to the cutoff
    ranked = ngram_count.to_a.sort { |a, b| b[1] <=> a[1] }[0...@cutoff]
    @ngram_pos = Hash.new # key is ngram, value is position
    ranked.each_with_index { |(ngram, _count), rank| @ngram_pos[ngram] = rank }
  end

  # Distance from +other+: for every ngram of +other+, the absolute rank
  # difference, or @cutoff (max distance) when this model lacks the ngram.
  # Raises InvalidArgument unless +other+ is an OrderedNGrams.
  def -(other)
    raise InvalidArgument unless other.is_a?(OrderedNGrams)

    other.ngram_pos.inject(0) do |dist, (ngram, pos_other)|
      pos_self = ngram_pos[ngram]
      dist + (pos_self.nil? ? @cutoff : (pos_self - pos_other).abs)
    end
  end

  protected

  attr_reader :ngram_pos
end
|
57
|
+
|
58
|
+
|
59
|
+
# OrderedNGrams with the inherited defaults, identified by "3grams300kde".
class NGramsKDE < OrderedNGrams
  class << self
    # name of the file associated with this model class
    def filename
      "3grams300kde"
    end
  end
end
|
64
|
+
|
65
|
+
# OrderedNGrams with the inherited defaults, identified by "3grams300".
class NGrams300 < OrderedNGrams
  class << self
    # name of the file associated with this model class
    def filename
      "3grams300"
    end
  end
end
|
70
|
+
|
71
|
+
# OrderedNGrams built with a cutoff of 800, identified by "3grams800".
class NGrams800 < OrderedNGrams
  class << self
    # name of the file associated with this model class
    def filename
      "3grams800"
    end
  end

  def initialize(string)
    super(string, 800)
  end
end
|
79
|
+
|
80
|
+
# OrderedNGrams built with a cutoff of 3000, identified by "3grams3000".
class NGrams3000 < OrderedNGrams
  class << self
    # name of the file associated with this model class
    def filename
      "3grams3000"
    end
  end

  def initialize(string)
    super(string, 3000)
  end
end
|
88
|
+
|
89
|
+
|
90
|
+
# OrderedNGrams built with a cutoff of 4000, identified by "3grams4000".
class NGrams4000 < OrderedNGrams
  class << self
    # name of the file associated with this model class
    def filename
      "3grams4000"
    end
  end

  def initialize(string)
    super(string, 4000)
  end
end
|
98
|
+
|
99
|
+
|
100
|
+
end # module Rlid
|
@@ -0,0 +1,199 @@
|
|
1
|
+
module Rlid
|
2
|
+
|
3
|
+
require 'rlid/common'
|
4
|
+
|
5
|
+
|
6
|
+
|
7
|
+
# Wraps a fraction (1.0 meaning 100%) and renders it as a percentage string
# without the "%" sign.
class Percentage
  def initialize(value)
    @value = value
  end

  # Percentage as a string.
  # * up to 0.98 (included): two significant digits ("%.2g")
  # * above 0.98: as many decimals as needed to show how far the value is
  #   from 100, extended while the formatted string still ends with a 9
  # * 1 or more: a plain integer percentage, e.g. "100"
  def to_s
    return "%.2g" % (@value * 100) if @value <= 0.98

    complement = 1.0 - @value
    # Nothing is left below 100%: the logarithm of the complement would be
    # -Infinity (or undefined for a negative complement) and could not be
    # turned into a digit count.
    return "%.0f" % (@value * 100) if complement <= 0

    # order of magnitude of the complement gives the decimals to start from
    digits = -(Math.log10(complement).ceil) - 1
    res = "%.#{digits}f" % (@value * 100)
    while res[-1] == ?9
      digits += 1
      res = "%.#{digits}f" % (@value * 100)
    end
    res
  end

  # The wrapped value, unchanged.
  def to_f
    @value
  end
end
|
34
|
+
|
35
|
+
|
36
|
+
|
37
|
+
|
38
|
+
|
39
|
+
|
40
|
+
|
41
|
+
# Table associating languages with probabilities (language => fraction).
class LanguageProbabilities
  # number of languages shown by to_s
  MAX_OUTPUT = 3

  # +args+ maps a language, or an Array of languages, to a probability;
  # the probability of an Array is split evenly among its languages.
  def initialize(args={})
    @percentage = Hash.new(0)
    args.each do |languages, percentage|
      add(languages, percentage)
    end
  end

  # Picks a language at random, each one weighted by its probability.
  # When the cumulative sum never exceeds the random draw, a warning is
  # emitted and the first language of the table is returned.
  def random_language
    r = rand
    cumulative = 0
    @percentage.each do |language, perc|
      cumulative += perc
      return language if cumulative > r
    end
    warn "rounding error!! (sum is not 1!!)"
    @percentage.keys.first
  end

  # The MAX_OUTPUT most probable languages as "lang(perc) : lang(perc)".
  def to_s
    sorted[0...MAX_OUTPUT].map do |x|
      # Percentage takes care of choosing the digits
      formatted_perc = Percentage.new(x[PERC]).to_s
      "#{x[LANG]}(#{formatted_perc})"
    end.join(" : ")
  end

  # The most probable language, or nil when the table is empty.
  def first
    top = sorted.first
    top && top[LANG]
  end

  # Multiplies, language by language, the probabilities of this table by
  # those of +other+ and returns the normalized result as a new table.
  # Only the languages of the receiver appear in the result.
  # Raises InvalidArgument (carrying +other+) unless +other+ is a
  # LanguageProbabilities.
  def *(other)
    raise InvalidArgument.new(other) unless other.is_a? LanguageProbabilities

    res = LanguageProbabilities.new
    @percentage.each_key do |lang|
      res.percentage[lang] = percentage[lang] * other.percentage[lang]
    end
    res.normalize
    res
  end

  private

  # Stores +perc+ for +languages+ (one language or an Array sharing it).
  def add(languages, perc)
    # values outside 0..1 are rounded to the nearest integer
    # NOTE(review): presumably meant to absorb float rounding errors
    if perc < 0 or perc > 1
      perc = perc.round
    end
    languages = [languages] if not languages.is_a? Array
    perc = perc.to_f / languages.size
    languages.each { |l| @percentage[l] = perc }
  end

  # def add_remainder
  #   languages = LANGUAGES - @percentage.keys
  #   perc = 1.0 - sum
  #   add(languages, perc)
  # end

  protected

  # indexes of the [language, probability] pairs returned by sorted
  LANG = 0
  PERC = 1

  # Draws a random language, removes it and normalizes what is left.
  def random_language_and_delete
    l = random_language
    @percentage.delete(l)
    normalize
    l
  end

  # Rescales the probabilities so that they sum to 1 (no-op when the sum is 0).
  def normalize
    tot = sum
    return if tot == 0
    @percentage.each_key do |key|
      @percentage[key] /= tot
    end
  end

  # Sum of all the probabilities.
  def sum
    @percentage.values.inject(0) { |s, v| s + v }
  end

  # [language, probability] pairs, most probable first.
  def sorted
    @percentage.to_a.sort { |x, y| y[PERC] <=> x[PERC] }
  end

  attr_accessor :percentage
end
|
134
|
+
|
135
|
+
|
136
|
+
|
137
|
+
|
138
|
+
|
139
|
+
# LanguageProbabilities centered on one language: +lang+ gets perc_lang,
# the remaining COMMON_LANGUAGES share (perc_lang_and_common - perc_lang)
# and every other entry of LANGUAGES shares what is left up to 1.
class TestProbabilities < LanguageProbabilities
  def initialize(lang, perc_lang=0.8, perc_lang_and_common=0.98)
    @lang = lang
    @perc_lang = perc_lang
    @perc_common = perc_lang_and_common - perc_lang
    @perc_other = 1 - perc_lang_and_common

    # the main language is counted neither as common nor as other
    excluded = [lang]
    common_langs = COMMON_LANGUAGES - excluded
    other_langs = (LANGUAGES - COMMON_LANGUAGES) - excluded
    @common_size = common_langs.size
    @other_size = other_langs.size

    super(lang => @perc_lang, common => @perc_common, other => @perc_other) if false
    super(lang => @perc_lang,
          common_langs => @perc_common,
          other_langs => @perc_other)
  end

  # Draws a random language: when it is the main language, self is returned;
  # otherwise returns a copy of the table in which the drawn language and
  # the most probable one have exchanged their probabilities.
  def random_permutation
    drawn = random_language
    return self if drawn == @lang

    permuted = probabilities
    leader = permuted.first
    table = permuted.percentage
    table[drawn], table[leader] = table[leader], table[drawn]
    permuted
  end

  # Earlier approach, kept disabled:
  #    common = []
  #    @common_size.times do
  #      common << probs.random_language_and_delete
  #    end
  #    other = []
  #    @other_size.times do
  #      other << probs.random_language_and_delete
  #    end
  #
  #    LanguageProbabilities.new(
  #      lang => @perc_lang,
  #      common => @perc_common,
  #      other => @perc_other)

  # A plain LanguageProbabilities holding the same language => probability
  # entries as this object.
  def probabilities
    LanguageProbabilities.new(percentage)
  end
end
|
184
|
+
|
185
|
+
#class Array
|
186
|
+
# def sum
|
187
|
+
# inject(0) {|s,v| s+v}
|
188
|
+
# end
|
189
|
+
#end
|
190
|
+
#x = TestProbabilities.new(:eng)
|
191
|
+
#res = Hash.new(0)
|
192
|
+
#100000.times{res[x.random_permutation.random_language] += 1}
|
193
|
+
#res.each{|k,v| puts "#{k}: #{v/1000.0}"}
|
194
|
+
#
|
195
|
+
#res.values.select{|x| x > 1000 and x < 10000}.sum / 1000.0
|
196
|
+
#res.values.select{|x| x > 10000}.sum / 1000.0
|
197
|
+
#res.values.select{|x| x < 1000}.sum / 1000.0
|
198
|
+
|
199
|
+
end # module Rlid
|