rlid 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,12 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ $: << File.dirname( __FILE__)
4
+
5
+ require 'rlid'
6
+
7
+
8
# Interactive prompt: read one line at a time from standard input and print
# the result of Rlid.guess_language for it.
loop do
  print "> "
  line = gets
  # gets returns nil at end of input (e.g. Ctrl-D or a closed pipe); stop
  # instead of handing nil to the guesser and looping forever.
  break if line.nil?
  puts Rlid.guess_language(line).to_s
end
12
+
@@ -0,0 +1,8 @@
1
+ require 'rlid/language_guesser/naive_bayes_guesser'
2
+
3
module Rlid
  # The guesser is built once, when this file is loaded, and shared by
  # every call to Rlid.guess_language.
  @guesser = NaiveBayesProbabilityGuesser.new

  class << self
    # Delegates to the shared guesser and returns whatever its
    # guess_language returns for +string+.
    def guess_language(string)
      @guesser.guess_language(string)
    end
  end
end
@@ -0,0 +1,144 @@
1
module Rlid

  # Directory holding the per-language data, resolved relative to this file
  # (<this file>/../../../data).
  DATA_DIRECTORY = File.expand_path("#{__FILE__}/../../../data")

  # Table of supported languages plus helpers to look codes up and to iterate
  # over the per-language files stored under DATA_DIRECTORY.
  class Language
    # Each entry is [two-letter code, three-letter code, display name].
    # NOTE(review): 'Portoguese' and 'Norvegian' look misspelled; they are kept
    # as-is because name_of returns them verbatim - confirm before changing.
    CODES = [
      [:en, :eng, 'English'],
      [:es, :spa, 'Spanish'],
      [:de, :ger, 'German'],
      [:fr, :fre, 'French'],
      [:it, :ita, 'Italian'],
      [:pt, :por, 'Portoguese'],
      [:nl, :dut, 'Dutch'],
      [:pl, :pol, 'Polish'],
      [:id, :ind, 'Malay/Indonesian'],
      [:sv, :swe, 'Swedish'],
      [:tr, :tur, 'Turkish'],
      [:vi, :vie, 'Vietnamese'],
      [:ro, :rum, 'Romanian'],
      [:cs, :cze, 'Czech'],
      [:da, :dan, 'Danish'],
      [:fi, :fin, 'Finnish'],
      [:hu, :hun, 'Hungarian'],
      [:el, :ell, 'Greek'],
      [:ca, :cat, 'Catalan'],
      [:no, :nor, 'Norvegian'],
      [:sk, :slo, 'Slovak'],
      [:is, :ice, 'Icelandic'],
      #[:nn, :nnn, 'No Language']
      #[:ff, :fff], # fdakjlfdaj;
    ]

    # indexes into a CODES entry
    CODE2 = 0
    CODE3 = 1
    NAME = 2
    NO_LANGUAGE_CODE = :nnn

    # Returns the two-letter codes, in CODES order.
    def self.all_codes2
      CODES.map { |c| c[CODE2] }
    end

    # Returns the three-letter codes, in CODES order.
    def self.all_codes3
      CODES.map { |c| c[CODE3] }
    end


    # enters each directory and passes the directory name to the block
    # def Language.each_dir
    #   all_codes2.each do |lang_code|
    #     Dir.chdir("#{DATA_DIRECTORY}/#{lang_code}") do |dir|
    #       yield dir
    #     end
    #   end
    # end

    # Converts a two-letter code (symbol or string) to its three-letter
    # symbol. Returns nil when the code is not in CODES.
    def self.code2to3(code2)
      wanted = code2.to_s
      entry = CODES.find { |c| c[CODE2].to_s == wanted }
      entry && entry[CODE3]
    end

    # Returns the display name for a three- or two-letter code (symbol or
    # string), "No Language" for :nn / :nnn, and nil for an unknown code.
    def self.name_of(code)
      wanted = code.to_s
      return "No Language" if wanted == 'nn' || wanted == NO_LANGUAGE_CODE.to_s

      # Three-letter codes are tried first, then two-letter codes. `||` is
      # required here: with `or`, the assignment binds tighter and the
      # two-letter fallback result is silently discarded.
      entry = CODES.find { |c| c[CODE3].to_s == wanted } ||
              CODES.find { |c| c[CODE2].to_s == wanted }
      entry && entry[NAME]
    end

    # For every three-letter code, opens DATA_DIRECTORY/<code>/<filename>
    # with +mode+ and yields the open file and the code. The file is closed
    # when the block returns.
    def self.each_file(filename, mode="r")
      all_codes3.each do |lang_code|
        filepath = "#{DATA_DIRECTORY}/#{lang_code}/#{filename}"
        File.open(filepath, mode) do |file|
          yield file, lang_code
        end
      end
    end

    # Same as each_file but opens two files of the same language directory at
    # once and yields both, followed by the three-letter code.
    def self.each_2files(filename1, filename2, mode1="r", mode2="r")
      all_codes3.each do |lang_code|
        filepath1 = "#{DATA_DIRECTORY}/#{lang_code}/#{filename1}"
        filepath2 = "#{DATA_DIRECTORY}/#{lang_code}/#{filename2}"
        File.open(filepath1, mode1) do |file1|
          File.open(filepath2, mode2) do |file2|
            yield file1, file2, lang_code
          end
        end
      end
    end
  end

  # Three-letter codes of every language in Language::CODES.
  LANGUAGES = Language.all_codes3
  COMMON_LANGUAGES = [:dut, :eng, :ita, :por, :fre, :ger]


  # for ngrams

end # module Rlid
102
+
103
+
104
+ # add methods to String
105
class String
  # Yields every n-gram (as a String) of the preprocessed receiver, sliding
  # one character at a time. Without a block, returns an Enumerator over the
  # same n-grams.
  def each_ngram(n=3)
    return enum_for(:each_ngram, n) unless block_given?

    preprocess(n).chars.each_cons(n) do |chars|
      yield chars.join
    end
  end

  #private
  # Returns a normalized copy of the receiver ready for n-gram extraction;
  # the receiver itself is not modified.
  def preprocess(n)
    string = self.dup

    # remove spaces at the start and end
    string.gsub!(/\A\s+/, '')
    string.gsub!(/\s+\Z/, '')

    # replace everything that is not a letter, an apostrophe or a newline
    # with a space
    string.gsub!(/[^[:alpha:]'\n]/, ' ')
    # substitute each newline (and the whitespace around it) with n-1 bars
    string.gsub!(/\s*\n\s*/, '|'*(n-1))
    # collapse runs of whitespace
    string.gsub!(/\s+/, ' ')
    # remove spaces at the start and end
    string.gsub!(/\A\s+/, '')
    string.gsub!(/\s+\Z/, '')

    string.downcase!

    padding = "|" * (n-1)

    if string.size == 1
      # NOTE(review): a one-character string always gets a single bar and a
      # trailing space, whatever n is - presumably tuned for trigrams; confirm
      # before relying on other values of n.
      "|" + string + " "
    else
      padding + string + padding
    end
  end
end
144
+
@@ -0,0 +1,15 @@
1
+ module Rlid
2
+
3
# Base class for language guessers: carries a display name and defines the
# guess_language entry point that concrete guessers override.
class LanguageGuesser
  attr_accessor :name

  def initialize
    @name = "unknown"
  end

  # Always raises a RuntimeError here; subclasses provide the real
  # implementation.
  def guess_language(string)
    raise RuntimeError, "#{self.class} should be subclassed"
  end
end
14
+
15
+ end # module Rlid
@@ -0,0 +1,43 @@
1
+ module Rlid
2
+
3
+ require 'rlid/language_guesser/language_guesser'
4
+
5
# Guesses a language by building a model of the input text and picking the
# language whose stored model is at the smallest distance from it.
class ModelDistanceGuesser < LanguageGuesser
  # model_class must respond to language_models (language => model) and to
  # new(string); its instances must support `-` to compute a distance.
  def initialize(model_class)
    @model_class = model_class
    print "Loading models.. "
    @language_models = model_class.language_models
    @name = "Model Distance"
    puts "Done!"
  end

  # Returns the language key with the smallest distance to +string+, or nil
  # when there are no language models. On ties the first language wins.
  def guess_language(string)
    candidate = @model_class.new(string)
    closest_language = nil
    smallest_distance = nil
    @language_models.each do |lang, lang_model|
      distance = lang_model - candidate
      next unless smallest_distance.nil? || distance < smallest_distance

      smallest_distance = distance
      closest_language = lang
    end
    closest_language
  end
end
27
+
28
+
29
+ #class OnlineGuesser < ModelDistanceGuesser
30
+ # def guess_language(string)
31
+ # min_language = min_distance = nil
32
+ # @language_models.each do |lang, lang_model|
33
+ # dist = lang_model.distance_from(string)
34
+ # if min_distance == nil or dist < min_distance
35
+ # min_distance = dist
36
+ # min_language = lang
37
+ # end
38
+ # end
39
+ # min_language
40
+ # end
41
+ #end
42
+
43
+ end # module Rlid
@@ -0,0 +1,74 @@
1
+ module Rlid
2
+
3
+ require 'rlid/language_guesser/language_guesser'
4
+ require 'rlid/models/naive_bayes_models'
5
+ require 'rlid/probabilities/language_probabilities'
6
+
7
# Guesser backed by NaiveBayesModels: returns the single language with the
# highest probability.
class NaiveBayesGuesser < LanguageGuesser
  # Loads the models (printing progress to stdout) and sets their
  # default_count to +default+.
  def initialize(default=1)
    print "Naive Bayes: loading models.."
    @models = NaiveBayesModels.load
    @models.default_count = default
    @name = "Naive Bayes"
    puts " Done!"
  end

  # Returns the language yielded with the highest probability, or nil when
  # no yielded probability is greater than 0.0.
  def guess_language(string)
    winner = nil
    highest = 0.0
    @models.probabilities(string) do |lang, prob|
      next unless prob > highest

      highest = prob
      winner = lang
    end
    winner
  end
end
28
+
29
# Variant of NaiveBayesGuesser that returns a normalized probability for
# every language instead of only the best one.
class NaiveBayesProbabilityGuesser < NaiveBayesGuesser
  MAX = 3

  # Returns a LanguageProbabilities built from a hash language => probability.
  # Each raw probability is raised to 1 / log(1 + length of the preprocessed
  # text), so the exponent shrinks as the text gets longer; the results are
  # then divided by their sum unless that sum is zero.
  def guess_language(string)
    # The exponent depends only on the input text, so it is computed once
    # rather than once per language.
    exponent = 1 / Math.log(1 + string.preprocess(3).size)

    results = {}
    tot = 0.0 # for normalization
    @models.probabilities(string) do |lang, p|
      prob = p**exponent
      results[lang] = prob
      tot += prob
    end

    # normalize
    results.each_key { |k| results[k] /= tot } unless tot.zero?

    LanguageProbabilities.new(results)
  end
end
52
+
53
+
54
# Probability guesser that combines the per-text probabilities with a prior
# before picking a result.
class NaiveBayesPriorGuesser < NaiveBayesProbabilityGuesser
  # +prior+ must be a LanguageProbabilities.
  # NOTE(review): InvalidArgument is not defined in the visible code; confirm
  # it exists elsewhere in the project.
  def initialize(prior=TestProbabilities.new(:eng))
    raise InvalidArgument unless prior.is_a?(LanguageProbabilities)

    @prior = prior
    super()
  end

  # Replaces the prior used by guess_language.
  def set_prior(prior)
    @prior = prior
  end

  # Keep the inherited implementation reachable under another name before
  # overriding it below.
  alias :super_guess_language :guess_language

  # Multiplies the inherited result by the prior and returns `first` of the
  # product.
  def guess_language(string)
    conditional = super_guess_language(string)
    weighted = conditional * @prior
    weighted.first
  end
end
73
+
74
+ end # module Rlid
@@ -0,0 +1,69 @@
1
+ module Rlid
2
+
3
+
4
+ require 'rlid/models/model'
5
+ require 'rlid/common'
6
+
7
+
8
# Trigram model that stores, for the most frequent n-grams, their relative
# frequency among the n-grams that were kept.
class FrequencyModel < NGramModel
  N = 3 # trigrams

  def initialize(string, cutoff=3000)
    super(string, N, cutoff)
  end

  # Writes the marshalled frequency table to +file+.
  def save(file)
    file.write(Marshal.dump(@ngram_frequency))
  end

  # Replaces the frequency table with the one marshalled in +file+.
  # NOTE(review): Marshal.load must only be given trusted data.
  def load(file)
    @ngram_frequency = Marshal.load(file.read)
  end

  # Builds @ngram_frequency (ngram => relative frequency) from a hash
  # ngram => count, keeping only the @cutoff most frequent n-grams.
  def generate_model(ngram_count)
    # most frequent first
    ranked = ngram_count.to_a.sort { |a, b| b[1] <=> a[1] }
    kept = ranked[0...@cutoff]

    total = 0.0 # for normalization
    @ngram_frequency = {}
    kept.each do |ngram, count|
      @ngram_frequency[ngram] = count
      total += count
    end

    # normalization: divide every kept count by the sum of the kept counts
    @ngram_frequency.each_key do |ngram|
      @ngram_frequency[ngram] /= total
    end
  end

  # Name of the per-language file holding the saved model.
  def self.filename
    # FIXME should be frequency3000
    "cosine_distance3000"
  end

  protected

  attr_reader :ngram_frequency
end
49
+
50
# FrequencyModel with a distance operator between two models.
class CosineDistanceModel < FrequencyModel
  # Returns 1 minus the sum, over the n-grams present in both models, of
  # (frequency here * frequency in other) ** 0.2.
  # +other+ must be a CosineDistanceModel.
  # NOTE(review): InvalidArgument is not defined in the visible code; confirm
  # it exists elsewhere in the project.
  def -(other)
    raise InvalidArgument unless other.is_a?(CosineDistanceModel)

    overlap = 0
    other.ngram_frequency.each do |ngram, freq_other|
      freq_self = ngram_frequency[ngram]
      next if freq_self.nil? # n-gram missing from this model

      overlap += (freq_self * freq_other)**0.2
    end
    1 - overlap
  end
end
65
+
66
+
67
+
68
+
69
+ end # module Rlid
@@ -0,0 +1,22 @@
1
+ #!/usr/bin/env ruby1.9.1
2
+ module Rlid
3
+
4
+ require 'rlid/common'
5
+ require 'rlid/models/ordered_ngrams'
6
+ require 'rlid/models/cosine_distance_model'
7
+
8
+ # models to train
9
MODELS = [CosineDistanceModel]

# For every model class and every language, read the language's 'corpus'
# file, build a model from its text and save it to the file named by the
# model class.
MODELS.each do |model_class|
  puts "training #{model_class}"
  Language.each_2files('corpus', model_class.filename, 'r', 'w') do |corpus_file, model_file, lang|
    puts "learning #{lang}.."
    trained = model_class.new(corpus_file.read)
    trained.save(model_file)
  end
end

puts ">> Successfully trained!! <<"
20
+
21
+
22
+ end # module Rlid
@@ -0,0 +1,9 @@
1
+ #!/usr/bin/env ruby1.9.1
2
+
3
module Rlid
  require 'rlid/models/naive_bayes_models'

  # Script body: runs NaiveBayesModels.generate_models when this file is
  # executed.
  NaiveBayesModels.generate_models
end # module Rlid
@@ -0,0 +1,55 @@
1
+ module Rlid
2
+
3
+
4
+ # abstract class
5
# Root of the model hierarchy. It cannot be instantiated directly: the
# constructor always raises a RuntimeError.
class Model
  def initialize(string)
    raise RuntimeError, "#{self.class} should be subclassed"
  end
end
11
+
12
+
13
+
14
+ # in subclasses generate_model filename, load and save should be implemented
15
# Base class for models built from n-gram counts. Subclasses implement
# generate_model, load, save and the class-level filename.
class NGramModel < Model
  # Counts the n-grams of +string+ and hands the counts to generate_model.
  # When +string+ is nil only @n and @cutoff are set, leaving the model to be
  # filled in later (language_models does this through #load).
  def initialize(string=nil, n=3, cutoff=300)
    @n = n
    @cutoff = cutoff

    return if string.nil?

    # ngrams and count of each
    ngram_count = Hash.new(0)
    string.each_ngram(@n) { |ngram| ngram_count[ngram] += 1 }

    generate_model(ngram_count)
  end

  # Loads the saved model of every language and returns a hash
  # language code => model. Raises a RuntimeError when the class does not
  # provide `filename`.
  def self.language_models
    unless defined?(filename)
      # self is already the class here; self.class would always print "Class"
      raise "#{self} should implement 'filename' accessor!"
    end

    res = {}
    Language.each_file(filename) do |file, lang|
      model = new(nil)
      model.load(file)
      res[lang] = model
    end
    res
  end

  protected

  # should be implemented in the subclass
  # ngram_count is a hash: ngram => count
  def generate_model(ngram_count)
    raise "#{self.class} should be subclassed"
  end
end
54
+
55
+ end # module Rlid