rlid 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/data/naive_bayes_models +127059 -1
- data/lib/interactive_guesser.rb +12 -0
- data/lib/rlid.rb +8 -0
- data/lib/rlid/common.rb +144 -0
- data/lib/rlid/language_guesser/language_guesser.rb +15 -0
- data/lib/rlid/language_guesser/model_distance_guesser.rb +43 -0
- data/lib/rlid/language_guesser/naive_bayes_guesser.rb +74 -0
- data/lib/rlid/models/cosine_distance_model.rb +69 -0
- data/lib/rlid/models/generate_models.rb +22 -0
- data/lib/rlid/models/generate_naive_bayes_models.rb +9 -0
- data/lib/rlid/models/model.rb +55 -0
- data/lib/rlid/models/naive_bayes_models.rb +157 -0
- data/lib/rlid/models/ordered_ngrams.rb +100 -0
- data/lib/rlid/probabilities/language_probabilities.rb +199 -0
- data/lib/rlid/tmp.rb +57 -0
- data/lib/rlid/web.rb +16 -0
- metadata +71 -0
@@ -0,0 +1,157 @@
|
|
1
|
+
#!/usr/bin/env ruby1.9.1
|
2
|
+
|
3
|
+
module Rlid
|
4
|
+
|
5
|
+
require 'set'
|
6
|
+
|
7
|
+
require 'rlid/common'
|
8
|
+
|
9
|
+
# Naive Bayes language models built from character n-gram counts.
#
# For every language the model keeps the count of each "stored" n-gram (the
# union of the CUTOFF most frequent n-grams of every language); every other
# n-gram is pooled under the special OTHER feature.  After training, an extra
# entry keyed by Language::NO_LANGUAGE_CODE is added that only has a count for
# OTHER.
class NaiveBayesModels
  # Count used by #frequency_of when a language has no entry for an n-gram;
  # it also weights the "not found" total in #total_ngrams.
  attr_accessor :default_count

  # n-gram length
  N = 3
  # top n-grams kept for every language
  CUTOFF = 3000
  # special feature: key under which all non-stored n-grams are pooled
  OTHER = nil

  # Last index of the (inclusive) slice of the input examined by
  # #probabilities.
  MAX_STRING_LENGTH = 75

  # Location of the serialized models.
  FILEPATH = "#{DATA_DIRECTORY}/naive_bayes_models"

  # default_count:: value for #default_count (1 unless given).
  def initialize(default_count=1)
    @default_count = default_count
  end

  # Trains a new model (created with a nil default_count) and serializes it
  # to FILEPATH with Marshal.
  def self.generate_models
    models = NaiveBayesModels.new(nil)
    puts "Training started.."
    models.train
    # Marshal output is binary data: write it in binary mode so that no
    # newline or encoding translation can alter it.
    File.open(FILEPATH, "wb") do |file|
      file.write Marshal.dump(models)
      puts "Models saved to #{FILEPATH}"
    end
  end

  # Returns the object deserialized from FILEPATH.
  def self.load
    # NOTE(review): Marshal.load must only be fed trusted data; FILEPATH is
    # presumably always the file produced by generate_models -- confirm.
    Marshal.load(File.binread(FILEPATH))
  end

  # Yields +lang, prob+ for every language of the model, where prob is the
  # product of #frequency_of over the n-grams of the beginning of +string+.
  #
  # Raises InvalidArgument unless +string+ is a String.
  def probabilities(string)
    raise InvalidArgument unless string.is_a?(String)

    @ngram_frequency.keys.each do |lang|
      prob = 1
      # NOTE(review): the range is inclusive, so MAX_STRING_LENGTH + 1
      # characters are examined.
      # The n-gram length is passed explicitly so that it always matches the
      # length used while training.
      string[0..MAX_STRING_LENGTH].each_ngram(N) do |ngram|
        prob *= frequency_of(lang, ngram)
      end
      yield lang, prob
    end
  end

  # Builds the model from the counts returned by get_ngram_counts.
  def train
    ngram_counts = get_ngram_counts
    # ngrams for which we want to store information (all languages)
    @stored_ngrams = top_ngrams(ngram_counts)

    puts "- processing ngrams"
    # content: ngram_frequency[lang][ngram] = freq
    @ngram_frequency = {}
    # content: total_ngrams_found[lang] = total count of ngrams encountered
    @total_ngrams_found = {}
    # content: total_ngrams_not_found[lang] = n of stored ngrams not found
    @total_ngrams_not_found = {}

    ngram_counts.each do |lang, counts|
      frequencies = Hash.new(0)
      total = 0
      counts.each do |ngram, count|
        if @stored_ngrams.include?(ngram)
          frequencies[ngram] = count
        else
          frequencies[OTHER] += count
        end
        total += count
      end

      @ngram_frequency[lang] = frequencies
      @total_ngrams_found[lang] = total
      @total_ngrams_not_found[lang] = (@stored_ngrams - frequencies.keys).size

      puts_info(lang)
    end

    # Entry for "no language": it only knows OTHER, with 1.5 times (integer
    # arithmetic) the largest OTHER count of the trained languages.  The same
    # key is used in all three hashes so that #total_ngrams can find it.
    no_language = Language::NO_LANGUAGE_CODE
    n = @ngram_frequency.values.map { |freq| freq[OTHER] }.max * 3 / 2 # (* 1.5)
    @total_ngrams_found[no_language] = n
    @ngram_frequency[no_language] = {OTHER => n}
    @total_ngrams_not_found[no_language] = @stored_ngrams.size
  end

  protected

  # Denominator used by #frequency_of: n-grams found for +lang+ plus
  # default_count for each stored n-gram that was not found.
  def total_ngrams(lang)
    @total_ngrams_found[lang] + @total_ngrams_not_found[lang] * @default_count
  end

  # Relative frequency of +ngram+ for +lang+.  N-grams outside the stored set
  # are looked up as OTHER; a missing entry counts as default_count.
  def frequency_of(lang, ngram)
    ngram = OTHER unless @stored_ngrams.include?(ngram)
    counts = @ngram_frequency[lang]
    count = counts.include?(ngram) ? counts[ngram] : @default_count
    count.to_f / total_ngrams(lang)
  end

  private

  # Prints a summary line for +lang+ (a default count of 1 is assumed).
  def puts_info(lang)
    tot = @total_ngrams_found[lang] + @total_ngrams_not_found[lang]
    d = (100.0 * @total_ngrams_not_found[lang] / tot).round(1)
    o = (100.0 * @ngram_frequency[lang][OTHER] / tot).round(1)
    puts " #{lang} processed tot:#{tot}, default:#{d}%, other:#{o}%"
  end

  # auxiliary functions

  # Reads every file yielded by Language.each_file("corpus") and returns a
  # hash such that ngram_counts[lang][ngram] = count (missing n-grams read
  # as 0).
  def get_ngram_counts
    ngram_counts = {}
    Language.each_file("corpus") do |file, lang|
      puts "- I'm learning #{lang}"
      counts = Hash.new(0)
      file.read.each_ngram(N) { |ngram| counts[ngram] += 1 }
      ngram_counts[lang] = counts
    end
    ngram_counts
  end

  # Returns the Set made of the CUTOFF most frequent n-grams of every
  # language in +ngram_counts+.
  def top_ngrams(ngram_counts)
    res = Set.new
    ngram_counts.each_value do |counts|
      # [ngram, count] pairs, most frequent first
      ranked = counts.to_a.sort { |x, y| y[1] <=> x[1] }
      res.merge(ranked[0...CUTOFF].map { |pair| pair[0] })
    end
    res
  end
end
|
155
|
+
|
156
|
+
|
157
|
+
end # module Rlid
|
@@ -0,0 +1,100 @@
|
|
1
|
+
module Rlid
|
2
|
+
|
3
|
+
require 'rlid/models/model'
|
4
|
+
require 'rlid/common'
|
5
|
+
|
6
|
+
# a subclass should define the filename
|
7
|
+
# Model that ranks n-grams of length N: @ngram_pos maps each n-gram to its
# position (0 is the most frequent).  Two models are compared with #-.
# NOTE(review): @cutoff is presumably set by NGramModel#initialize -- confirm.
class OrderedNGrams < NGramModel
  N = 3

  def initialize(string, cutoff=300)
    super(string, N, cutoff)
  end

  # Writes one "ngram position" line per stored n-gram to +file+.
  def save(file)
    @ngram_pos.each { |ngram, pos| file.write "#{ngram} #{pos}\n" }
  end

  # Rebuilds @ngram_pos from +file+: the position of an n-gram is the index
  # of its line, the number written on the line is not read.
  def load(file)
    rank = 0
    @ngram_pos = {}
    file.each_line do |line|
      # only the first N characters of the line are kept
      @ngram_pos[line.gsub(/^(.{#{N}}).*\n?/, '\1')] = rank
      rank += 1
    end
  end

  # Keeps the @cutoff most frequent n-grams of +ngram_count+
  # (ngram => count) and records their rank.
  def generate_model(ngram_count)
    by_frequency = ngram_count.to_a.sort { |a, b| b[1] <=> a[1] }
    @ngram_pos = {} # key is the ngram, value is its position
    by_frequency[0...@cutoff].each_with_index do |pair, index|
      @ngram_pos[pair[0]] = index
    end
  end

  # Distance between two models: for every n-gram of +other+, the absolute
  # difference of the two positions, or @cutoff when this model does not
  # contain the n-gram.
  #
  # Raises InvalidArgument unless +other+ is an OrderedNGrams.
  def -(other)
    raise InvalidArgument unless other.is_a?(OrderedNGrams)

    other.ngram_pos.inject(0) do |dist, (ngram, pos_other)|
      pos_self = ngram_pos[ngram]
      # a missing n-gram costs the max distance
      dist + (pos_self.nil? ? @cutoff : (pos_self - pos_other).abs)
    end
  end

  protected

  attr_reader :ngram_pos
end
|
57
|
+
|
58
|
+
|
59
|
+
# OrderedNGrams variant identified by the "3grams300kde" filename; it does
# not override the constructor, so the inherited default cutoff applies.
class NGramsKDE < OrderedNGrams
  class << self
    def filename
      "3grams300kde"
    end
  end
end
|
64
|
+
|
65
|
+
# OrderedNGrams variant identified by the "3grams300" filename; it does not
# override the constructor, so the inherited default cutoff applies.
class NGrams300 < OrderedNGrams
  class << self
    def filename
      "3grams300"
    end
  end
end
|
70
|
+
|
71
|
+
# OrderedNGrams variant with a cutoff of 800, identified by the "3grams800"
# filename.
class NGrams800 < OrderedNGrams
  class << self
    def filename
      "3grams800"
    end
  end

  def initialize(string)
    super(string, 800)
  end
end
|
79
|
+
|
80
|
+
# OrderedNGrams variant with a cutoff of 3000, identified by the "3grams3000"
# filename.
class NGrams3000 < OrderedNGrams
  class << self
    def filename
      "3grams3000"
    end
  end

  def initialize(string)
    super(string, 3000)
  end
end
|
88
|
+
|
89
|
+
|
90
|
+
# OrderedNGrams variant with a cutoff of 4000, identified by the "3grams4000"
# filename.
class NGrams4000 < OrderedNGrams
  class << self
    def filename
      "3grams4000"
    end
  end

  def initialize(string)
    super(string, 4000)
  end
end
|
98
|
+
|
99
|
+
|
100
|
+
end # module Rlid
|
@@ -0,0 +1,199 @@
|
|
1
|
+
module Rlid
|
2
|
+
|
3
|
+
require 'rlid/common'
|
4
|
+
|
5
|
+
|
6
|
+
|
7
|
+
# Wraps a fraction (1.0 meaning 100%) and formats it as a percentage figure
# without the "%" sign.
class Percentage
  # value:: the fraction to format (e.g. 0.25 for 25%).
  def initialize(value)
    @value = value
  end

  # Returns the value multiplied by 100 as a String.
  #
  # Up to 0.98 the figure is printed with two significant digits.  Above it,
  # the number of decimals is derived from the distance from 1 and then
  # increased for as long as the printed figure ends with a 9.
  def to_s
    return "%.2g" % (@value * 100) if @value <= 0.98

    complement = 1.0 - @value
    # With no positive distance from 1 (value of 1 or more) the logarithm
    # below would be -Infinity or undefined and raise: print the figure
    # without decimals instead.
    return "%.0f" % (@value * 100) unless complement > 0

    digits = -Math.log10(complement).ceil - 1
    res = "%.#{digits}f" % (@value * 100)
    while res[-1] == ?9
      digits += 1
      res = "%.#{digits}f" % (@value * 100)
    end
    res
  end

  # Returns the wrapped value unchanged.
  def to_f
    @value
  end
end
|
34
|
+
|
35
|
+
|
36
|
+
|
37
|
+
|
38
|
+
|
39
|
+
|
40
|
+
|
41
|
+
# A probability distribution over languages, stored as a hash
# language => probability (languages without an entry read as 0).
class LanguageProbabilities
  # number of languages printed by #to_s
  MAX_OUTPUT = 3

  # indexes into the [language, probability] pairs returned by #sorted
  LANG = 0
  PERC = 1

  # args:: hash whose keys are a language or an Array of languages and whose
  #        values are probabilities; the probability of an Array key is split
  #        evenly among its languages.
  def initialize(args={})
    @percentage = Hash.new(0)
    args.each { |languages, percentage| add(languages, percentage) }
  end

  # Draws a language at random, each with its own probability.  If the
  # probabilities add up to less than the drawn number a warning is emitted
  # and the first language is returned.
  def random_language
    r = rand
    cumulative = 0
    @percentage.each do |language, perc|
      cumulative += perc
      return language if cumulative > r
    end
    warn "rounding error!! (sum is not 1!!)"
    @percentage.keys.first
  end

  # The MAX_OUTPUT most probable languages formatted as "lang(percentage)"
  # and joined by " : ".
  def to_s
    sorted[0...MAX_OUTPUT].map do |pair|
      formatted_perc = Percentage.new(pair[PERC]).to_s
      "#{pair[LANG]}(#{formatted_perc})"
    end.join(" : ")
  end

  # The most probable language, or nil when there are no languages.
  def first
    best = sorted.first
    best && best[LANG]
  end

  # Product of two distributions: the probabilities of the languages of the
  # receiver are multiplied by those of +other+ and the result is normalized.
  #
  # Raises InvalidArgument unless +other+ is a LanguageProbabilities.
  def *(other)
    raise InvalidArgument.new(other) unless other.is_a?(LanguageProbabilities)

    res = LanguageProbabilities.new
    @percentage.each_key do |lang|
      res.percentage[lang] = percentage[lang] * other.percentage[lang]
    end
    res.normalize
    res
  end

  private

  # Assigns to each of +languages+ (one language or an Array) an equal share
  # of +perc+; a +perc+ outside 0..1 is rounded to the nearest integer first.
  def add(languages, perc)
    perc = perc.round if perc < 0 || perc > 1
    languages = [languages] unless languages.is_a?(Array)
    share = perc.to_f / languages.size
    languages.each { |l| @percentage[l] = share }
  end

  # def add_remainder
  #   languages = LANGUAGES - @percentage.keys
  #   perc = 1.0 - sum
  #   add(languages, perc)
  # end

  protected

  # Draws a random language, removes it and normalizes what is left.
  def random_language_and_delete
    l = random_language
    @percentage.delete(l)
    normalize
    l
  end

  # Scales the probabilities so that they add up to 1 (no-op when the sum
  # is 0).
  def normalize
    tot = sum
    return if tot == 0
    @percentage.each_key { |key| @percentage[key] /= tot }
  end

  # Sum of all the probabilities.
  def sum
    @percentage.values.inject(0) { |s, v| s + v }
  end

  # [language, probability] pairs, most probable first.
  def sorted
    @percentage.to_a.sort { |x, y| y[PERC] <=> x[PERC] }
  end

  attr_accessor :percentage
end
|
134
|
+
|
135
|
+
|
136
|
+
|
137
|
+
|
138
|
+
|
139
|
+
# Distribution centred on one language: +lang+ gets perc_lang, the other
# COMMON_LANGUAGES share (perc_lang_and_common - perc_lang) and the remaining
# LANGUAGES share what is left up to 1.
class TestProbabilities < LanguageProbabilities
  def initialize(lang, perc_lang=0.8, perc_lang_and_common=0.98)
    @lang = lang
    @perc_lang = perc_lang
    @perc_common = perc_lang_and_common - perc_lang
    @perc_other = 1 - perc_lang_and_common

    common_langs = COMMON_LANGUAGES - [lang]
    other_langs = LANGUAGES - COMMON_LANGUAGES - [lang]
    @common_size = common_langs.size
    @other_size = other_langs.size

    super(lang => @perc_lang,
          common_langs => @perc_common,
          other_langs => @perc_other)
  end

  # Draws a random language: when it is the central language the receiver is
  # returned, otherwise a copy of the distribution in which the drawn
  # language and the most probable one have swapped their probabilities.
  def random_permutation
    drawn = random_language
    return self if drawn == @lang

    copy = probabilities
    leader = copy.first
    table = copy.percentage
    table[drawn], table[leader] = table[leader], table[drawn]
    copy
  end

  # Disabled code, kept for reference:
  #    common = []
  #    @common_size.times do
  #      common << probs.random_language_and_delete
  #    end
  #    other = []
  #    @other_size.times do
  #      other << probs.random_language_and_delete
  #    end
  #
  #    LanguageProbabilities.new(
  #      lang => @perc_lang,
  #      common => @perc_common,
  #      other => @perc_other)

  # A plain LanguageProbabilities built from the same language => probability
  # pairs.
  def probabilities
    LanguageProbabilities.new(percentage)
  end
end
|
184
|
+
|
185
|
+
#class Array
|
186
|
+
# def sum
|
187
|
+
# inject(0) {|s,v| s+v}
|
188
|
+
# end
|
189
|
+
#end
|
190
|
+
#x = TestProbabilities.new(:eng)
|
191
|
+
#res = Hash.new(0)
|
192
|
+
#100000.times{res[x.random_permutation.random_language] += 1}
|
193
|
+
#res.each{|k,v| puts "#{k}: #{v/1000.0}"}
|
194
|
+
#
|
195
|
+
#res.values.select{|x| x > 1000 and x < 10000}.sum / 1000.0
|
196
|
+
#res.values.select{|x| x > 10000}.sum / 1000.0
|
197
|
+
#res.values.select{|x| x < 1000}.sum / 1000.0
|
198
|
+
|
199
|
+
end # module Rlid
|