rlid 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,12 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ $: << File.dirname( __FILE__)
4
+
5
+ require 'rlid'
6
+
7
+
8
# Interactive prompt: read one line at a time and print the guess for it.
# The loop ends at end of input (e.g. Ctrl-D); previously `gets` returned nil
# there and that nil was handed to the guesser on every further iteration.
loop do
  print "> "
  line = gets
  break if line.nil?
  puts Rlid.guess_language(line).to_s
end
12
+
@@ -0,0 +1,8 @@
1
+ require 'rlid/language_guesser/naive_bayes_guesser'
2
+
3
# Facade of the library: a single module-level guesser shared by all callers.
module Rlid
  # Built once, when this file is loaded.
  @guesser = NaiveBayesProbabilityGuesser.new

  class << self
    # Delegates to the shared guesser and returns whatever it returns.
    def guess_language(string)
      @guesser.guess_language(string)
    end
  end
end
@@ -0,0 +1,144 @@
1
module Rlid

  # Directory holding one sub-directory of data files per language
  # (resolved relative to this source file).
  DATA_DIRECTORY = File.expand_path("#{__FILE__}/../../../data")

  # Table of the supported languages plus helpers to convert between codes
  # and to iterate over the per-language data files.
  class Language
    # Each entry is [two-letter code, three-letter code, English name].
    CODES = [
      [:en, :eng, 'English'],
      [:es, :spa, 'Spanish'],
      [:de, :ger, 'German'],
      [:fr, :fre, 'French'],
      [:it, :ita, 'Italian'],
      [:pt, :por, 'Portuguese'],
      [:nl, :dut, 'Dutch'],
      [:pl, :pol, 'Polish'],
      [:id, :ind, 'Malay/Indonesian'],
      [:sv, :swe, 'Swedish'],
      [:tr, :tur, 'Turkish'],
      [:vi, :vie, 'Vietnamese'],
      [:ro, :rum, 'Romanian'],
      [:cs, :cze, 'Czech'],
      [:da, :dan, 'Danish'],
      [:fi, :fin, 'Finnish'],
      [:hu, :hun, 'Hungarian'],
      [:el, :ell, 'Greek'],
      [:ca, :cat, 'Catalan'],
      [:no, :nor, 'Norwegian'],
      [:sk, :slo, 'Slovak'],
      [:is, :ice, 'Icelandic'],
      # [:nn, :nnn, 'No Language'] is deliberately not listed; name_of
      # handles those two codes itself.
    ]

    # indexes into a CODES entry
    CODE2 = 0
    CODE3 = 1
    NAME = 2
    NO_LANGUAGE_CODE = :nnn

    # Returns every two-letter code, in table order.
    def self.all_codes2
      CODES.map { |entry| entry[CODE2] }
    end

    # Returns every three-letter code, in table order.
    def self.all_codes3
      CODES.map { |entry| entry[CODE3] }
    end


    # enters each directory and passes the directory name to the block
    # def Language.each_dir
    #   all_codes2.each do |lang_code|
    #     Dir.chdir("#{DATA_DIRECTORY}/#{lang_code}") do |dir|
    #       yield dir
    #     end
    #   end
    # end

    # Converts a two-letter code (symbol or string) to its three-letter
    # symbol. Returns nil when the code is not in the table.
    def self.code2to3(code2)
      wanted = code2.to_s
      entry = CODES.find { |e| e[CODE2].to_s == wanted }
      entry && entry[CODE3]
    end

    # Returns the English name for a two- or three-letter code (symbol or
    # string), "No Language" for :nn / :nnn, and nil for an unknown code.
    #
    # The previous `index = a or b` bound as `(index = a) or b`, so the
    # two-letter lookup was thrown away and every two-letter code raised.
    def self.name_of(code)
      wanted = code.to_s
      return "No Language" if wanted == 'nn' || wanted == NO_LANGUAGE_CODE.to_s

      entry = CODES.find { |e| e[CODE3].to_s == wanted } ||
              CODES.find { |e| e[CODE2].to_s == wanted }
      entry && entry[NAME]
    end

    # For every three-letter code, opens DATA_DIRECTORY/<code>/<filename>
    # with the given mode and yields (file, code). The file is closed when
    # the block returns; errors from File.open are not rescued.
    def self.each_file(filename, mode="r")
      all_codes3.each do |lang_code|
        filepath = "#{DATA_DIRECTORY}/#{lang_code}/#{filename}"
        File.open(filepath, mode) do |file|
          yield file, lang_code
        end
      end
    end

    # Same as each_file but opens two files of the same language directory
    # at once and yields (file1, file2, code).
    def self.each_2files(filename1, filename2, mode1="r", mode2="r")
      all_codes3.each do |lang_code|
        filepath1 = "#{DATA_DIRECTORY}/#{lang_code}/#{filename1}"
        filepath2 = "#{DATA_DIRECTORY}/#{lang_code}/#{filename2}"
        File.open(filepath1, mode1) do |file1|
          File.open(filepath2, mode2) do |file2|
            yield file1, file2, lang_code
          end
        end
      end
    end
  end

  # Three-letter codes of all supported languages.
  LANGUAGES = Language.all_codes3
  COMMON_LANGUAGES = [:dut, :eng, :ita, :por, :fre, :ger]

end # module Rlid
102
+
103
+
104
+ # add methods to String
105
class String
  # Yields every n-gram (as a String of n characters) of the preprocessed
  # receiver, in order. Without a block an Enumerator over the same n-grams
  # is returned.
  def each_ngram(n=3)
    return enum_for(:each_ngram, n) unless block_given?

    preprocess(n).chars.each_cons(n) do |chars|
      yield chars.join
    end
  end

  #private
  # Returns a normalised copy of the receiver ready for n-gram extraction:
  # * leading/trailing whitespace removed,
  # * every character that is not a letter, an apostrophe or a newline
  #   replaced by a space,
  # * each newline (with surrounding whitespace) replaced by n-1 "|",
  # * whitespace runs collapsed to a single space,
  # * downcased,
  # * padded with n-1 "|" on both sides.
  # A result of exactly one character is wrapped as "|<char> " regardless
  # of n. The receiver itself is not modified.
  def preprocess(n)
    padding = "|" * (n-1)

    text = sub(/\A\s+/, '').sub(/\s+\Z/, '')
    text = text.gsub(/[^[:alpha:]'\n]/, ' ')
    # newlines become the same marker that is used for padding
    text = text.gsub(/\s*\n\s*/, padding)
    text = text.gsub(/\s+/, ' ')
    # the substitutions above may have produced new outer spaces
    text = text.sub(/\A\s+/, '').sub(/\s+\Z/, '')
    text = text.downcase

    # The original had a second, unreachable `elsif string.size == 1`
    # branch here; it is removed, the behaviour is unchanged.
    if text.size == 1
      "|" + text + " "
    else
      padding + text + padding
    end
  end
end
144
+
@@ -0,0 +1,15 @@
1
module Rlid

  # Abstract base of every guesser: carries a display name and defines the
  # guess_language entry point that subclasses must override.
  class LanguageGuesser
    attr_reader :name
    attr_writer :name

    def initialize
      self.name = "unknown"
    end

    # Always raises RuntimeError here; concrete guessers override it.
    def guess_language(string)
      raise RuntimeError, "#{self.class} should be subclassed"
    end
  end

end # module Rlid
@@ -0,0 +1,43 @@
1
+ module Rlid
2
+
3
+ require 'rlid/language_guesser/language_guesser'
4
+
5
# Guesser that builds a model of the input and returns the language whose
# stored model is closest to it (distance is computed with the models' `-`).
class ModelDistanceGuesser < LanguageGuesser
  # model_class must respond to `language_models` (language => model) and
  # to `new(string)`.
  def initialize(model_class)
    @model_class = model_class
    print "Loading models.. "
    @language_models = model_class.language_models
    @name = "Model Distance"
    puts "Done!"
  end

  # Returns the key of the nearest language model, or nil when no models
  # are loaded. On ties the first model encountered wins.
  def guess_language(string)
    candidate = @model_class.new(string)
    nearest = nil # [language, distance] of the best model so far
    @language_models.each_pair do |lang, lang_model|
      distance = lang_model - candidate
      nearest = [lang, distance] if nearest.nil? || distance < nearest.last
    end
    nearest && nearest.first
  end
end
27
+
28
+
29
+ #class OnlineGuesser < ModelDistanceGuesser
30
+ # def guess_language(string)
31
+ # min_language = min_distance = nil
32
+ # @language_models.each do |lang, lang_model|
33
+ # dist = lang_model.distance_from(string)
34
+ # if min_distance == nil or dist < min_distance
35
+ # min_distance = dist
36
+ # min_language = lang
37
+ # end
38
+ # end
39
+ # min_language
40
+ # end
41
+ #end
42
+
43
+ end # module Rlid
@@ -0,0 +1,74 @@
1
+ module Rlid
2
+
3
+ require 'rlid/language_guesser/language_guesser'
4
+ require 'rlid/models/naive_bayes_models'
5
+ require 'rlid/probabilities/language_probabilities'
6
+
7
# Guesser that returns the single most probable language according to the
# naive Bayes models.
class NaiveBayesGuesser < LanguageGuesser
  # default is stored as the models' default_count.
  def initialize(default=1)
    print "Naive Bayes: loading models.."
    @models = NaiveBayesModels.load
    @models.default_count = default
    @name = "Naive Bayes"
    puts " Done!"
  end

  # Returns the language with the highest probability, or nil when no
  # probability is greater than zero.
  def guess_language(string)
    winner = nil
    highest = 0.0
    @models.probabilities(string) do |lang, prob|
      next unless prob > highest
      highest = prob
      winner = lang
    end
    winner
  end
end
28
+
29
# Variant that returns a normalised probability for every language instead
# of only the best one.
class NaiveBayesProbabilityGuesser < NaiveBayesGuesser
  MAX = 3

  # Returns a LanguageProbabilities built from language => probability.
  # Each raw probability p is raised to 1 / ln(1 + size), where size is the
  # length of the preprocessed input; the results are then divided by their
  # sum (skipped when the sum is zero).
  def guess_language(string)
    results = {}
    tot = 0.0 # for normalization
    exponent = nil
    @models.probabilities(string) do |lang, p|
      # Depends only on the input, so it is computed once rather than once
      # per language. It stays inside the block so nothing is evaluated
      # when no language is yielded, exactly as before.
      exponent ||= 1 / Math.log(1 + string.preprocess(3).size)
      prob = p**exponent
      results[lang] = prob
      tot += prob
    end

    # normalize
    unless tot == 0
      results.each_key { |lang| results[lang] /= tot }
    end

    LanguageProbabilities.new(results)
  end
end
52
+
53
+
54
# Variant that multiplies the conditional probabilities by a prior and
# returns the first element of the product.
class NaiveBayesPriorGuesser < NaiveBayesProbabilityGuesser
  # prior must be a LanguageProbabilities.
  # NOTE(review): InvalidArgument is not defined anywhere in the code visible
  # here; if it is not defined elsewhere this raise surfaces as a NameError.
  # Confirm, and consider ArgumentError.
  def initialize(prior=TestProbabilities.new(:eng))
    raise InvalidArgument unless prior.is_a?(LanguageProbabilities)
    @prior = prior
    super()
  end

  # Replaces the prior used by later guesses.
  def set_prior(prior)
    @prior = prior
  end

  # Keep the inherited implementation reachable under another name before
  # it is overridden below.
  alias_method :super_guess_language, :guess_language

  def guess_language(string)
    (super_guess_language(string) * @prior).first
  end
end
73
+
74
+ end # module Rlid
@@ -0,0 +1,69 @@
1
+ module Rlid
2
+
3
+
4
+ require 'rlid/models/model'
5
+ require 'rlid/common'
6
+
7
+
8
# N-gram model that stores, for the most frequent trigrams, their relative
# frequency (the kept counts are divided by their sum).
class FrequencyModel < NGramModel
  N = 3 # trigrams
  def initialize(string, cutoff=3000)
    super(string, N, cutoff)
  end

  # Writes the model to an open file handle as Marshal data.
  # Marshal output is binary, so the handle is switched to binary mode
  # first; callers open these files with plain "r"/"w".
  def save(file)
    file.binmode if file.respond_to?(:binmode)
    file.write Marshal.dump(@ngram_frequency)
  end

  # Restores the model from an open file handle written by save.
  # NOTE(review): Marshal.load must only be used on trusted files; it can
  # instantiate arbitrary objects.
  def load(file)
    file.binmode if file.respond_to?(:binmode)
    @ngram_frequency = Marshal.load(file.read)
  end

  # Builds @ngram_frequency (ngram => relative frequency) from a hash of
  # ngram => count, keeping only the @cutoff most frequent n-grams.
  def generate_model(ngram_count)
    # most frequent first
    ranked = ngram_count.to_a.sort { |x, y| y[1] <=> x[1] }
    kept = ranked[0...@cutoff]

    tot = 0.0 # total, for normalization
    @ngram_frequency = {}
    kept.each do |ngram, count|
      @ngram_frequency[ngram] = count
      tot += count
    end

    # normalization
    @ngram_frequency.each_key do |ngram|
      @ngram_frequency[ngram] /= tot
    end
  end

  # Name of the per-language file the model is stored in.
  def self.filename
    # FIXME should be frequency3000
    "cosine_distance3000"
  end

  protected
  attr_reader :ngram_frequency
end
49
+
50
# Frequency model with a distance operator.
class CosineDistanceModel < FrequencyModel
  # Returns 1 minus the sum, over the n-grams present in both models, of
  # (frequency here * frequency in other) ** 0.2.
  # NOTE(review): InvalidArgument is not defined anywhere in the code visible
  # here; confirm it exists elsewhere, otherwise this raise surfaces as a
  # NameError.
  def -(other)
    raise InvalidArgument unless other.is_a?(CosineDistanceModel)

    shared = other.ngram_frequency.inject(0) do |sum, (ngram, freq_other)|
      freq_self = ngram_frequency[ngram]
      freq_self.nil? ? sum : sum + (freq_self * freq_other)**0.2
    end
    1 - shared
  end
end
65
+
66
+
67
+
68
+
69
+ end # module Rlid
@@ -0,0 +1,22 @@
1
+ #!/usr/bin/env ruby1.9.1
2
+ module Rlid
3
+
4
+ require 'rlid/common'
5
+ require 'rlid/models/ordered_ngrams'
6
+ require 'rlid/models/cosine_distance_model'
7
+
8
+ # models to train
9
MODELS = [CosineDistanceModel]

# For each model class: read every language's 'corpus' file, build a model
# from its content and save it to the file named by the class.
MODELS.each do |model_class|
  puts "training #{model_class}"
  Language.each_2files('corpus', model_class.filename, 'r', 'w') do |corpus, output, lang|
    puts "learning #{lang}.."
    trained = model_class.new(corpus.read)
    trained.save(output)
  end
end

puts ">> Successfully trained!! <<"
20
+
21
+
22
+ end # module Rlid
@@ -0,0 +1,9 @@
1
+ #!/usr/bin/env ruby1.9.1
2
+
3
+ module Rlid
4
+
5
+ require 'rlid/models/naive_bayes_models'
6
+
7
+ NaiveBayesModels.generate_models
8
+
9
+ end # module Rlid
@@ -0,0 +1,55 @@
1
module Rlid


  # abstract class
  class Model
    # Always raises RuntimeError: Model itself cannot be instantiated.
    def initialize string
      raise "#{self.class} should be subclassed"
    end
  end



  # Base class of the n-gram models.
  # In subclasses generate_model, filename, load and save should be
  # implemented.
  class NGramModel < Model
    # Counts the n-grams of string (via String#each_ngram) and hands the
    # ngram => count hash to generate_model.
    # When string is nil only @n and @cutoff are set, leaving an empty
    # model to be filled by load.
    def initialize(string=nil, n=3, cutoff=300)
      @n = n
      @cutoff = cutoff

      return if string.nil?

      # ngrams and count of each
      ngram_count = Hash.new(0)
      string.each_ngram(@n) do |ngram|
        ngram_count[ngram] += 1
      end

      generate_model(ngram_count)
    end


    # Loads one model per language from the file named by the subclass'
    # `filename` class method and returns them as language => model.
    # Raises RuntimeError when the class does not define `filename`.
    def self.language_models
      if not defined?(filename)
        # In a class method `self` is the model class; `self.class` would
        # always be Class and hide which model is at fault.
        raise "#{self} should implement 'filename' accessor!"
      end
      res = {}
      Language.each_file(filename) do |file, lang|
        model = self.new(nil)
        model.load(file)
        res[lang] = model
      end
      res
    end

    protected
    # should be implemented in the subclass
    # ngram_count is a hash: ngram => count
    def generate_model(ngram_count)
      raise "#{self.class} should be subclassed"
    end
  end

end # module Rlid