rlid 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,157 @@
1
+ #!/usr/bin/env ruby1.9.1
2
+
3
+ module Rlid
4
+
5
+ require 'set'
6
+
7
+ require 'rlid/common'
8
+
9
# Naive Bayes language models built on character n-gram counts.
#
# Training (#train) counts the n-grams of every corpus file, keeps the
# CUTOFF most frequent n-grams of each language and lumps everything else
# into the OTHER bucket. Scoring (#probabilities) multiplies the relative
# frequencies of the n-grams found in the input string.
class NaiveBayesModels
  # Count used for a stored n-gram that was never seen in a given language.
  attr_accessor :default_count

  # n-gram length
  N = 3
  # top n-grams kept for every language
  CUTOFF = 3000
  # special feature: bucket for every n-gram that is not stored
  OTHER = nil

  # Maximum number of characters of the input that are scored.
  MAX_STRING_LENGTH = 75

  FILEPATH = "#{DATA_DIRECTORY}/naive_bayes_models"

  def initialize(default_count=1)
    @default_count = default_count
  end

  # Trains a fresh set of models and serializes it to FILEPATH.
  #
  # The instance is created with a nil default_count.
  # NOTE(review): presumably callers assign default_count after loading,
  # before scoring -- confirm against the callers.
  def self.generate_models
    models = NaiveBayesModels.new(nil)
    puts "Training started.."
    models.train
    # Marshal output is binary data, so the file is opened in binary mode.
    File.open(FILEPATH, "wb") do |file|
      file.write Marshal.dump(models)
    end
    puts "Models saved to #{FILEPATH}"
  end

  # Reads back the models written by generate_models.
  #
  # NOTE(review): Marshal.load can instantiate arbitrary objects; FILEPATH
  # must only ever point at a trusted file.
  def self.load
    File.open(FILEPATH, "rb") { |file| Marshal.load(file) }
  end

  # Yields (lang, prob) for every known language, where prob is the product
  # of the relative frequencies of the n-grams of +string+.
  #
  # Only the first MAX_STRING_LENGTH characters are scored. The result is a
  # plain product of small numbers, so longer inputs would drift towards
  # floating point underflow.
  #
  # Raises InvalidArgument when +string+ is not a String.
  def probabilities(string)
    raise InvalidArgument unless string.is_a? String

    # Exclusive range: exactly MAX_STRING_LENGTH characters are kept.
    sample = string[0...MAX_STRING_LENGTH]
    @ngram_frequency.keys.each do |lang|
      prob = 1
      # Score with the same n-gram length that was used for training.
      sample.each_ngram(N) do |ngram|
        prob *= frequency_of(lang, ngram)
      end
      yield lang, prob
    end
  end

  # Builds the per-language frequency tables from the corpus files.
  def train
    ngram_counts = get_ngram_counts
    # ngrams for which we want to store information (all languages)
    @stored_ngrams = top_ngrams(ngram_counts)

    puts "- processing ngrams"
    # content: ngram_frequency[lang][ngram] = freq
    @ngram_frequency = Hash.new
    # content: total_ngrams_found[lang] = total count of ngrams encountered
    @total_ngrams_found = Hash.new
    # content: total_ngrams_not_found[lang] = n of stored ngrams not found
    @total_ngrams_not_found = Hash.new

    ngram_counts.each do |lang, counts|
      frequency = Hash.new(0)
      found = 0
      counts.each do |ngram, count|
        if @stored_ngrams.include?(ngram)
          frequency[ngram] = count
        else
          frequency[OTHER] += count
        end
        found += count
      end
      @ngram_frequency[lang] = frequency
      @total_ngrams_found[lang] = found
      @total_ngrams_not_found[lang] = (@stored_ngrams - frequency.keys).size

      puts_info(lang)
    end

    # Pseudo model for "no language": it only knows the OTHER bucket, sized
    # 1.5 times the largest OTHER count of the real languages. The same key
    # is used for all three tables so that total_ngrams can resolve it.
    no_language = Language::NO_LANGUAGE_CODE
    n = @ngram_frequency.values.map{|x| x[OTHER]}.max * 3 / 2 # (* 1.5)
    @total_ngrams_found[no_language] = n
    @ngram_frequency[no_language] = {OTHER => n}
    @total_ngrams_not_found[no_language] = @stored_ngrams.size
  end

  protected

  # Denominator of the relative frequencies of +lang+: the n-grams seen plus
  # default_count for every stored n-gram that was never seen.
  def total_ngrams(lang)
    @total_ngrams_found[lang] + @total_ngrams_not_found[lang] * @default_count
  end

  # Relative frequency of +ngram+ in +lang+. N-grams that are not stored are
  # looked up as OTHER; entries missing for the language count as
  # default_count.
  def frequency_of(lang, ngram)
    ngram = OTHER unless @stored_ngrams.include?(ngram)
    count = @ngram_frequency[lang].fetch(ngram, @default_count)
    count.to_f / total_ngrams(lang)
  end

  private

  # Prints a one-line training summary for +lang+.
  def puts_info(lang)
    # default count of 1 is supposed
    tot = @total_ngrams_found[lang] + @total_ngrams_not_found[lang]
    d = (100.0 * @total_ngrams_not_found[lang] / tot).round(1)
    o = (100.0 * @ngram_frequency[lang][OTHER] / tot).round(1)
    puts " #{lang} processed tot:#{tot}, default:#{d}%, other:#{o}%"
  end

  # auxiliary functions

  # gets all ngram counts and returns a hash having:
  # ngram_counts[lang][ngram] = count
  #
  # The stored n-gram set is computed once by #train through top_ngrams, so
  # it is not built here as well.
  def get_ngram_counts
    ngram_counts = Hash.new
    Language.each_file("corpus") do |file, lang|
      puts "- I'm learning #{lang}"
      counts = Hash.new(0) # unseen ngrams start from 0
      file.read.each_ngram(N) do |ngram|
        counts[ngram] += 1
      end
      ngram_counts[lang] = counts
    end
    ngram_counts
  end

  # extract the top ngrams for every language
  # Returns a Set with the CUTOFF most frequent n-grams of each language.
  def top_ngrams(ngram_counts)
    res = Set.new
    ngram_counts.each_value do |hash|
      # top ngrams (transformed into arrays), most frequent first
      ranked = hash.to_a.sort{|x, y| y[1] <=> x[1]}
      res.merge(ranked[0...CUTOFF].map{|x| x[0]})
    end
    res
  end
end
155
+
156
+
157
+ end # module Rlid
@@ -0,0 +1,100 @@
1
+ module Rlid
2
+
3
+ require 'rlid/models/model'
4
+ require 'rlid/common'
5
+
6
# Model that ranks the most frequent n-grams of a text; two models are
# compared by how far apart the shared n-grams sit in the two rankings.
#
# a subclass should define the filename
class OrderedNGrams < NGramModel
  N = 3

  def initialize(string, cutoff=300)
    super(string, N, cutoff)
  end

  # Writes one "<ngram> <position>" line per ranked n-gram to +file+.
  def save(file)
    @ngram_pos.each do |ngram, pos|
      file.write "#{ngram} #{pos}\n"
    end
  end

  # Rebuilds the ranking from +file+: the position of an n-gram is the index
  # of its line, the position written on the line is not read back.
  def load(file)
    @ngram_pos = Hash.new
    file.each_line.with_index do |line, pos|
      # keep only the first N characters of the line
      @ngram_pos[line[0, N]] = pos
    end
  end

  # Builds the ranking from +ngram_count+ (ngram => count), keeping the
  # @cutoff most frequent n-grams.
  def generate_model(ngram_count)
    # top ngrams (transformed into arrays)
    top = ngram_count.to_a.sort{|x, y| y[1] <=> x[1]}[0...@cutoff]
    @ngram_pos = Hash.new # key is ngram, value is position
    top.each_with_index { |(ngram, _count), i| @ngram_pos[ngram] = i }
  end

  # Distance between two rankings: for every n-gram of +other+, the absolute
  # difference of positions, or @cutoff when this model does not have it.
  #
  # Raises InvalidArgument when +other+ is not an OrderedNGrams.
  def -(other)
    raise InvalidArgument unless other.is_a?(OrderedNGrams)

    other.ngram_pos.inject(0) do |dist, (ngram, pos_other)|
      pos_self = ngram_pos[ngram]
      # a missing ngram costs the max distance
      dist + (pos_self ? (pos_self - pos_other).abs : @cutoff)
    end
  end

  protected

  attr_reader :ngram_pos
end
57
+
58
+
59
# OrderedNGrams with the default cutoff, stored as "3grams300kde".
class NGramsKDE < OrderedNGrams
  class << self
    # Name of the file this model is stored under.
    def filename
      "3grams300kde"
    end
  end
end
64
+
65
# OrderedNGrams with the default cutoff, stored as "3grams300".
class NGrams300 < OrderedNGrams
  class << self
    # Name of the file this model is stored under.
    def filename
      "3grams300"
    end
  end
end
70
+
71
# OrderedNGrams keeping the top 800 n-grams, stored as "3grams800".
class NGrams800 < OrderedNGrams
  class << self
    # Name of the file this model is stored under.
    def filename
      "3grams800"
    end
  end

  def initialize(string)
    super(string, 800)
  end
end
79
+
80
# OrderedNGrams keeping the top 3000 n-grams, stored as "3grams3000".
class NGrams3000 < OrderedNGrams
  class << self
    # Name of the file this model is stored under.
    def filename
      "3grams3000"
    end
  end

  def initialize(string)
    super(string, 3000)
  end
end
88
+
89
+
90
# OrderedNGrams keeping the top 4000 n-grams, stored as "3grams4000".
class NGrams4000 < OrderedNGrams
  class << self
    # Name of the file this model is stored under.
    def filename
      "3grams4000"
    end
  end

  def initialize(string)
    super(string, 4000)
  end
end
98
+
99
+
100
+ end # module Rlid
@@ -0,0 +1,199 @@
1
+ module Rlid
2
+
3
+ require 'rlid/common'
4
+
5
+
6
+
7
# Wraps a probability (expected in 0..1) and formats it as a percentage.
class Percentage
  def initialize(value)
    @value = value
  end

  # Formats the value as a percentage string (without the "%" sign).
  #
  # Values up to 0.98 are printed with two significant digits. Above that,
  # enough decimals are printed to tell the value apart from 100: the number
  # of decimals grows until the last printed digit is not a 9.
  # A value of 1.0 or more is printed as "100".
  def to_s
    return "%.2g" % (@value * 100) if @value <= 0.98

    complement = 1.0 - @value
    # log10 is -Infinity for 0 and undefined below it, so there is no
    # number of decimals to derive: the value simply is 100%.
    return "100" if complement <= 0

    digits = -(Math.log10(complement).ceil) - 1
    res = "%.#{digits}f" % (@value * 100)
    while res[-1] == ?9
      digits += 1
      res = "%.#{digits}f" % (@value * 100)
    end
    res
  end

  # The raw value given to the constructor.
  def to_f
    @value
  end
end
34
+
35
+
36
+
37
+
38
+
39
+
40
+
41
# Probability distribution over languages (language => share).
class LanguageProbabilities
  # Number of languages printed by #to_s.
  MAX_OUTPUT = 3

  # indexes into the [language, share] pairs returned by #sorted
  LANG = 0
  PERC = 1

  # +args+ maps a language, or an Array of languages, to a share. The share
  # of an Array is split evenly among its languages.
  def initialize(args={})
    @percentage = Hash.new(0)
    args.each do |languages, percentage|
      add(languages, percentage)
    end
  end

  # Draws a language at random, weighted by its share.
  #
  # When the shares add up to less than the drawn number a warning is
  # emitted and the first language is returned.
  def random_language
    r = rand
    cumulative = 0
    @percentage.each do |language, perc|
      cumulative += perc
      return language if cumulative > r
    end
    warn "rounding error!! (sum is not 1!!)"
    @percentage.keys.first
  end

  # The MAX_OUTPUT most probable languages as "lang(perc) : lang(perc)".
  def to_s
    sorted[0...MAX_OUTPUT].map do |x|
      formatted_perc = Percentage.new(x[PERC]).to_s
      "#{x[LANG]}(#{formatted_perc})"
    end.join(" : ")
  end

  # The most probable language, or nil when the distribution is empty.
  def first
    top = sorted.first
    top && top[LANG]
  end

  # Multiplies the shares language by language (over the languages of the
  # receiver) and normalizes the result.
  #
  # Raises InvalidArgument when +other+ is not a LanguageProbabilities.
  def *(other)
    raise InvalidArgument.new(other) unless other.is_a? LanguageProbabilities

    res = LanguageProbabilities.new
    @percentage.each_key do |lang|
      res.percentage[lang] = percentage[lang] * other.percentage[lang]
    end
    res.normalize
    res
  end

  private

  # Stores +perc+ for +languages+ (a language or an Array of languages,
  # which share +perc+ evenly). A share outside 0..1 is rounded to the
  # nearest integer first.
  # NOTE(review): the rounding looks meant to absorb floating point drift
  # around 0 and 1 -- confirm.
  def add(languages, perc)
    perc = perc.round if perc < 0 || perc > 1
    languages = [languages] unless languages.is_a? Array
    perc = perc.to_f / languages.size
    languages.each { |l| @percentage[l] = perc }
  end

  protected

  # Draws a language, removes it from the distribution and renormalizes.
  def random_language_and_delete
    l = random_language
    @percentage.delete(l)
    normalize
    l
  end

  # Rescales the shares so that they add up to 1; a distribution whose
  # shares add up to 0 is left untouched.
  def normalize
    tot = sum
    return if tot == 0
    @percentage.each_key do |key|
      @percentage[key] /= tot
    end
  end

  # Sum of all the shares.
  def sum
    @percentage.values.inject(0) { |s, v| s + v }
  end

  # [language, share] pairs, most probable first.
  def sorted
    @percentage.to_a.sort! { |x, y| y[PERC] <=> x[PERC] }
  end

  attr_accessor :percentage
end
134
+
135
+
136
+
137
+
138
+
139
# Distribution used for testing: +lang+ gets perc_lang, the other common
# languages share (perc_lang_and_common - perc_lang) and all the remaining
# languages share what is left up to 1.
class TestProbabilities < LanguageProbabilities
  def initialize(lang, perc_lang=0.8, perc_lang_and_common=0.98)
    @lang = lang
    @perc_lang = perc_lang
    @perc_common = perc_lang_and_common - perc_lang
    @perc_other = 1 - perc_lang_and_common

    common = COMMON_LANGUAGES - [lang]
    other = LANGUAGES - COMMON_LANGUAGES - [lang]
    @common_size = common.size
    @other_size = other.size

    super(lang => @perc_lang, common => @perc_common, other => @perc_other)
  end

  # Draws a language at random. When it is the test language the receiver
  # itself is returned; otherwise a copy of the distribution is returned in
  # which the drawn language and the top language have swapped their shares.
  def random_permutation
    drawn = random_language
    return self if drawn == @lang

    shuffled = probabilities
    leader = shuffled.first

    # read both shares before writing either of them
    table = shuffled.percentage
    leader_share = table[leader]
    drawn_share = table[drawn]
    table[drawn] = leader_share
    table[leader] = drawn_share

    shuffled
  end

  # Earlier experiment, kept for reference:
  # common = []
  # @common_size.times do
  #   common << probs.random_language_and_delete
  # end
  # other = []
  # @other_size.times do
  #   other << probs.random_language_and_delete
  # end
  #
  # LanguageProbabilities.new(
  #   lang => @perc_lang,
  #   common => @perc_common,
  #   other => @perc_other)

  # A plain LanguageProbabilities built from the current shares.
  def probabilities
    LanguageProbabilities.new(percentage)
  end
end
184
+
185
+ #class Array
186
+ # def sum
187
+ # inject(0) {|s,v| s+v}
188
+ # end
189
+ #end
190
+ #x = TestProbabilities.new(:eng)
191
+ #res = Hash.new(0)
192
+ #100000.times{res[x.random_permutation.random_language] += 1}
193
+ #res.each{|k,v| puts "#{k}: #{v/1000.0}"}
194
+ #
195
+ #res.values.select{|x| x > 1000 and x < 10000}.sum / 1000.0
196
+ #res.values.select{|x| x > 10000}.sum / 1000.0
197
+ #res.values.select{|x| x < 1000}.sum / 1000.0
198
+
199
+ end # module Rlid