rlid 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,157 @@
1
+ #!/usr/bin/env ruby1.9.1
2
+
3
+ module Rlid
4
+
5
+ require 'set'
6
+
7
+ require 'rlid/common'
8
+
9
# Naive Bayes language models built from character ngram counts.
#
# #train counts the ngrams of every corpus file, keeps the CUTOFF most
# frequent ngrams of each language and lumps every other ngram into the OTHER
# bucket.  #probabilities then scores a string against every trained language
# by multiplying the relative frequencies of its ngrams.
class NaiveBayesModels
  # count used when a language has no entry for the ngram being looked up
  attr_accessor :default_count

  # ngram length
  N = 3
  # top ngrams kept for every language
  CUTOFF = 3000
  # special feature: key under which all non-stored ngrams are accumulated
  OTHER = nil

  # maximum number of leading characters of a string that are scored
  MAX_STRING_LENGTH = 75

  FILEPATH = "#{DATA_DIRECTORY}/naive_bayes_models"

  # default_count:: count used for ngrams a language has no entry for.
  def initialize(default_count=1)
    @default_count = default_count
  end

  # Trains the models on the corpus and serializes them to FILEPATH.
  #
  # The models are created with a nil default_count.
  # NOTE(review): presumably the caller assigns default_count after
  # NaiveBayesModels.load -- confirm against the callers.
  def self.generate_models
    models = NaiveBayesModels.new(nil)
    puts "Training started.."
    models.train
    # Marshal produces binary data: write it in binary mode so that no
    # newline/encoding translation can corrupt the dump.
    File.open(FILEPATH, "wb") do |file|
      file.write Marshal.dump(models)
      puts "Models saved to #{FILEPATH}"
    end
  end

  # Loads the models previously saved by generate_models.
  #
  # Marshal.load must only ever be fed trusted data; FILEPATH is expected to
  # be a file generated by this class.
  def self.load
    Marshal.load(File.binread(FILEPATH))
  end

  # Scores +string+ against every trained language.
  #
  # Yields |lang, prob| once per language, where prob is the product of the
  # relative frequencies (see #frequency_of) of the ngrams found in the first
  # MAX_STRING_LENGTH characters of +string+.  Without a block an Enumerator
  # over the same pairs is returned.
  #
  # Raises InvalidArgument when +string+ is not a String.
  def probabilities(string)
    raise InvalidArgument unless string.is_a?(String)
    return enum_for(:probabilities, string) unless block_given?

    # exclusive range: exactly MAX_STRING_LENGTH characters at most
    text = string[0...MAX_STRING_LENGTH]
    @ngram_frequency.keys.each do |lang|
      prob = 1
      # same ngram length that was used for training
      text.each_ngram(N) do |ngram|
        prob *= frequency_of(lang, ngram)
      end
      yield lang, prob
    end
  end

  # Builds the per-language frequency tables from the corpus.
  def train
    ngram_counts = get_ngram_counts
    # ngrams for which we want to store information (all languages)
    @stored_ngrams = top_ngrams(ngram_counts)

    puts "- processing ngrams"
    # content: ngram_frequency[lang][ngram] = freq
    @ngram_frequency = {}
    # content: total_ngrams_found[lang] = total count of ngrams encountered
    @total_ngrams_found = {}
    # content: total_ngrams_not_found[lang] = n of stored ngrams not found
    @total_ngrams_not_found = {}

    ngram_counts.each do |lang, counts|
      frequencies = Hash.new(0)
      found = 0
      counts.each do |ngram, count|
        if @stored_ngrams.include?(ngram)
          frequencies[ngram] = count
        else
          frequencies[OTHER] += count
        end
        found += count
      end
      @ngram_frequency[lang] = frequencies
      @total_ngrams_found[lang] = found
      @total_ngrams_not_found[lang] = (@stored_ngrams - frequencies.keys).size

      puts_info(lang)
    end

    # Pseudo-language standing for "no language": it only has the OTHER
    # bucket, sized at 1.5 times the largest OTHER bucket of the real
    # languages.  All three tables are keyed by the same constant so that
    # #total_ngrams can always resolve the key used in @ngram_frequency.
    no_language = Language::NO_LANGUAGE_CODE
    n = @ngram_frequency.values.map { |freqs| freqs[OTHER] }.max * 3 / 2 # (* 1.5)
    @total_ngrams_found[no_language] = n
    @ngram_frequency[no_language] = {OTHER => n}
    @total_ngrams_not_found[no_language] = @stored_ngrams.size
  end

  protected

  # Denominator used by #frequency_of: the ngrams counted for +lang+ plus
  # default_count for every stored ngram the language has no entry for.
  def total_ngrams(lang)
    @total_ngrams_found[lang] + @total_ngrams_not_found[lang] * @default_count
  end

  # Relative frequency of +ngram+ in +lang+.  Ngrams that are not stored are
  # looked up as OTHER; a missing entry counts as default_count.
  def frequency_of(lang, ngram)
    ngram = OTHER unless @stored_ngrams.include?(ngram)
    # fetch ignores the Hash default (0), exactly like an include? check
    count = @ngram_frequency[lang].fetch(ngram, @default_count)
    count.to_f / total_ngrams(lang)
  end

  private

  # Prints a summary line for +lang+ (percentages of default and OTHER).
  def puts_info(lang)
    # default count of 1 is supposed
    tot = @total_ngrams_found[lang] + @total_ngrams_not_found[lang]
    d = (100.0 * @total_ngrams_not_found[lang] / tot).round(1)
    o = (100.0 * @ngram_frequency[lang][OTHER] / tot).round(1)
    puts " #{lang} processed tot:#{tot}, default:#{d}%, other:#{o}%"
  end

  # auxiliary functions

  # Reads every corpus file and returns a hash having:
  # ngram_counts[lang][ngram] = count
  #
  # The selection of the stored ngrams is left to #top_ngrams, which #train
  # calls right after this method.
  def get_ngram_counts
    ngram_counts = {}
    Language.each_file("corpus") do |file, lang|
      puts "- I'm learning #{lang}"
      counts = Hash.new(0) # unseen ngrams start from a count of 0
      file.read.each_ngram(N) { |ngram| counts[ngram] += 1 }
      ngram_counts[lang] = counts
    end
    ngram_counts
  end

  # Returns the Set made of the CUTOFF most frequent ngrams of every language.
  def top_ngrams(ngram_counts)
    res = Set.new
    ngram_counts.values.each do |hash|
      # pairs [ngram, count] sorted by descending count
      ranked = hash.to_a.sort { |x, y| y[1] <=> x[1] }
      res.merge(ranked[0...CUTOFF].map { |pair| pair[0] })
    end
    res
  end
end
155
+
156
+
157
+ end # module Rlid
@@ -0,0 +1,100 @@
1
+ module Rlid
2
+
3
+ require 'rlid/models/model'
4
+ require 'rlid/common'
5
+
6
+ # a subclass should define the filename
7
# Model that ranks ngrams by frequency: @ngram_pos maps every kept ngram to
# its position in the ranking (0 is the most frequent).
class OrderedNGrams < NGramModel
  N = 3

  # Delegates to the superclass with the ngram length fixed to N.
  def initialize(string, cutoff=300)
    super(string, N, cutoff)
  end

  # Writes one "ngram position" line per stored ngram to +file+.
  def save(file)
    @ngram_pos.each do |ngram, rank|
      file.write "#{ngram} #{rank}\n"
    end
  end

  # Rebuilds @ngram_pos from +file+: the ngram of each line is ranked by the
  # index of the line it appears on.
  def load(file)
    @ngram_pos = {}
    # keeps only the first N characters of a line
    leading_ngram = /^(.{#{N}}).*\n?/
    line_index = 0
    file.each_line do |line|
      @ngram_pos[line.gsub(leading_ngram, '\1')] = line_index
      line_index += 1
    end
  end

  # Keeps the @cutoff most frequent ngrams of +ngram_count+ (ngram => count)
  # and stores the position of each one in the ranking.
  def generate_model(ngram_count)
    # [ngram, count] pairs by descending count, truncated to the cutoff
    ranking = ngram_count.to_a.sort { |a, b| b[1] <=> a[1] }[0...@cutoff]
    @ngram_pos = {} # key is ngram, value is position
    ranking.each_with_index { |(ngram, _count), rank| @ngram_pos[ngram] = rank }
  end

  # Distance from +other+: for every ngram of +other+ adds the difference
  # between the two positions, or @cutoff (max distance) when the ngram is
  # not stored in this model.
  #
  # Raises InvalidArgument when +other+ is not an OrderedNGrams.
  def -(other)
    raise InvalidArgument unless other.is_a?(OrderedNGrams)

    other.ngram_pos.inject(0) do |dist, (ngram, pos_other)|
      pos_self = ngram_pos[ngram]
      dist + (pos_self.nil? ? @cutoff : (pos_self - pos_other).abs)
    end
  end

  protected

  attr_reader :ngram_pos
end
57
+
58
+
59
# OrderedNGrams model (default cutoff) identified by the "3grams300kde" file.
class NGramsKDE < OrderedNGrams
  # Name of the file associated with this model.
  def self.filename
    "3grams300kde"
  end
end
64
+
65
# OrderedNGrams model (default cutoff) identified by the "3grams300" file.
class NGrams300 < OrderedNGrams
  # Name of the file associated with this model.
  def self.filename
    "3grams300"
  end
end
70
+
71
# OrderedNGrams model with a cutoff of 800 ngrams.
class NGrams800 < OrderedNGrams
  # Name of the file associated with this model.
  def self.filename
    "3grams800"
  end

  def initialize(string)
    super(string, 800)
  end
end
79
+
80
# OrderedNGrams model with a cutoff of 3000 ngrams.
class NGrams3000 < OrderedNGrams
  # Name of the file associated with this model.
  def self.filename
    "3grams3000"
  end

  def initialize(string)
    super(string, 3000)
  end
end
88
+
89
+
90
# OrderedNGrams model with a cutoff of 4000 ngrams.
class NGrams4000 < OrderedNGrams
  # Name of the file associated with this model.
  def self.filename
    "3grams4000"
  end

  def initialize(string)
    super(string, 4000)
  end
end
98
+
99
+
100
+ end # module Rlid
@@ -0,0 +1,199 @@
1
+ module Rlid
2
+
3
+ require 'rlid/common'
4
+
5
+
6
+
7
# Wraps a probability and formats it as a percentage whose number of
# decimals grows as the value approaches 100%.
class Percentage
  # value:: the probability to wrap (a number; 1.0 stands for 100%).
  def initialize(value)
    @value = value
  end

  # Returns the value as a percentage string (without the % sign).
  #
  # Values up to 0.98 are printed with two significant digits.  Above that,
  # enough decimals are printed to expose the distance from 100%, and more
  # are added while the last printed digit is a 9.  Values of 1.0 or more are
  # printed as "100".
  def to_s
    return "%.2g" % (@value * 100) if @value <= 0.98
    # complement would be <= 0 here and log10 would give -Infinity (or raise
    # Math::DomainError), so the number of decimals cannot be computed
    return "100" if @value >= 1.0

    complement = 1.0 - @value
    # number of decimals: one less than the magnitude of the complement
    digits = -(Math.log10(complement).ceil) - 1
    res = "%.#{digits}f" % (@value * 100)
    while res.end_with?("9")
      digits += 1
      res = "%.#{digits}f" % (@value * 100)
    end
    res
  end

  # Returns the wrapped value unchanged.
  def to_f
    @value
  end
end
34
+
35
+
36
+
37
+
38
+
39
+
40
+
41
# Table associating languages with their probability.
class LanguageProbabilities
  # maximum number of languages printed by #to_s
  MAX_OUTPUT = 3

  # indexes of the [language, percentage] pairs returned by #sorted
  LANG = 0
  PERC = 1

  # args:: hash whose keys are a language or an Array of languages and whose
  #        values are probabilities; the probability of an Array key is
  #        split evenly among its languages (see #add).
  def initialize(args={})
    @percentage = Hash.new(0)
    args.each { |languages, percentage| add(languages, percentage) }
  end

  # Draws a language at random, each one with its stored probability.
  # When the cumulative sum never exceeds the drawn number a warning is
  # emitted and the first language of the table is returned.
  def random_language
    r = rand
    cumulative = 0
    @percentage.each do |language, perc|
      cumulative += perc
      return language if cumulative > r
    end
    warn "rounding error!! (sum is not 1!!)"
    @percentage.keys.first
  end

  # Returns the MAX_OUTPUT most probable languages as "lang(perc)" items
  # joined by " : ".
  def to_s
    sorted[0...MAX_OUTPUT].map do |pair|
      # Percentage calculates the digits
      "#{pair[LANG]}(#{Percentage.new(pair[PERC])})"
    end.join(" : ")
  end

  # Returns the most probable language, or nil when the table is empty.
  def first
    top = sorted.first
    top && top[LANG]
  end

  # Returns a new, normalized LanguageProbabilities where the probability of
  # every language of this table is multiplied by the one found in +other+.
  #
  # Raises InvalidArgument when +other+ is not a LanguageProbabilities.
  def *(other)
    raise InvalidArgument.new(other) unless other.is_a?(LanguageProbabilities)

    res = LanguageProbabilities.new
    @percentage.each_key do |lang|
      res.percentage[lang] = percentage[lang] * other.percentage[lang]
    end
    res.normalize
    res
  end

  private

  # Stores +perc+ for +languages+ (a language or an Array of languages,
  # among which +perc+ is split evenly).  A value outside 0..1 is rounded
  # to the nearest integer first.
  # NOTE(review): the rounding presumably absorbs floating point errors
  # around 0 and 1 -- confirm.
  def add(languages, perc)
    perc = perc.round if perc < 0 || perc > 1
    languages = [languages] unless languages.is_a?(Array)
    share = perc.to_f / languages.size
    languages.each { |l| @percentage[l] = share }
  end

  # def add_remainder
  #   languages = LANGUAGES - @percentage.keys
  #   perc = 1.0 - sum
  #   add(languages, perc)
  # end

  protected

  # Draws a language (see #random_language), removes it from the table,
  # normalizes what is left and returns the drawn language.
  def random_language_and_delete
    l = random_language
    @percentage.delete(l)
    normalize
    l
  end

  # Rescales the probabilities so that they sum to 1; nothing is done when
  # the sum is 0.
  def normalize
    tot = sum
    return if tot == 0
    @percentage.each_key { |key| @percentage[key] /= tot }
  end

  # Sum of all the stored probabilities.
  def sum
    @percentage.values.inject(0) { |s, v| s + v }
  end

  # Array of [language, percentage] pairs by descending percentage.
  def sorted
    @percentage.to_a.sort! { |x, y| y[PERC] <=> x[PERC] }
  end

  attr_accessor :percentage
end
134
+
135
+
136
+
137
+
138
+
139
# LanguageProbabilities giving perc_lang to +lang+, sharing
# (perc_lang_and_common - perc_lang) among the other common languages and
# the remainder among all the other languages.
class TestProbabilities < LanguageProbabilities
  def initialize(lang, perc_lang=0.8, perc_lang_and_common=0.98)
    common_langs = COMMON_LANGUAGES - [lang]
    other_langs = LANGUAGES - COMMON_LANGUAGES - [lang]

    @lang = lang
    @perc_lang = perc_lang
    @perc_common = perc_lang_and_common - perc_lang
    @perc_other = 1 - perc_lang_and_common
    @common_size = common_langs.size
    @other_size = other_langs.size

    super(lang => @perc_lang,
          common_langs => @perc_common,
          other_langs => @perc_other)
  end

  # Draws a random language: when it is the language given at construction,
  # self is returned; otherwise returns a copy of the table where the drawn
  # language and the most probable one have their probabilities swapped.
  def random_permutation
    drawn = random_language
    return self if drawn == @lang

    copy = probabilities
    leader = copy.first
    table = copy.percentage
    table[drawn], table[leader] = table[leader], table[drawn]
    copy
  end

  # Previous approach, kept for reference:
  #
  # common = []
  # @common_size.times do
  #   common << probs.random_language_and_delete
  # end
  # other = []
  # @other_size.times do
  #   other << probs.random_language_and_delete
  # end
  #
  # LanguageProbabilities.new(
  #   lang => @perc_lang,
  #   common => @perc_common,
  #   other => @perc_other)

  # Returns a plain LanguageProbabilities copy of this table.
  def probabilities
    LanguageProbabilities.new(percentage)
  end
end
184
+
185
+ #class Array
186
+ # def sum
187
+ # inject(0) {|s,v| s+v}
188
+ # end
189
+ #end
190
+ #x = TestProbabilities.new(:eng)
191
+ #res = Hash.new(0)
192
+ #100000.times{res[x.random_permutation.random_language] += 1}
193
+ #res.each{|k,v| puts "#{k}: #{v/1000.0}"}
194
+ #
195
+ #res.values.select{|x| x > 1000 and x < 10000}.sum / 1000.0
196
+ #res.values.select{|x| x > 10000}.sum / 1000.0
197
+ #res.values.select{|x| x < 1000}.sum / 1000.0
198
+
199
+ end # module Rlid