rlid 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/data/naive_bayes_models +127059 -1
- data/lib/interactive_guesser.rb +12 -0
- data/lib/rlid.rb +8 -0
- data/lib/rlid/common.rb +144 -0
- data/lib/rlid/language_guesser/language_guesser.rb +15 -0
- data/lib/rlid/language_guesser/model_distance_guesser.rb +43 -0
- data/lib/rlid/language_guesser/naive_bayes_guesser.rb +74 -0
- data/lib/rlid/models/cosine_distance_model.rb +69 -0
- data/lib/rlid/models/generate_models.rb +22 -0
- data/lib/rlid/models/generate_naive_bayes_models.rb +9 -0
- data/lib/rlid/models/model.rb +55 -0
- data/lib/rlid/models/naive_bayes_models.rb +157 -0
- data/lib/rlid/models/ordered_ngrams.rb +100 -0
- data/lib/rlid/probabilities/language_probabilities.rb +199 -0
- data/lib/rlid/tmp.rb +57 -0
- data/lib/rlid/web.rb +16 -0
- metadata +71 -0
data/lib/rlid.rb
ADDED
data/lib/rlid/common.rb
ADDED
@@ -0,0 +1,144 @@
|
|
1
|
+
module Rlid
|
2
|
+
|
3
|
+
# Absolute path of the `data` directory, resolved three path components up
# from this file.
DATA_DIRECTORY = File.expand_path("../../../data", __FILE__)
|
4
|
+
|
5
|
+
# Catalogue of the languages known to Rlid. Provides lookups between
# two-letter codes, three-letter codes and English names, plus helpers that
# open a per-language data file for every catalogued language.
class Language
  # Each row is [two-letter code, three-letter code, English name].
  # There is deliberately no row for "no language" (:nn / :nnn); name_of
  # handles those codes explicitly.
  CODES = [
    [:en, :eng, 'English'],
    [:es, :spa, 'Spanish'],
    [:de, :ger, 'German'],
    [:fr, :fre, 'French'],
    [:it, :ita, 'Italian'],
    [:pt, :por, 'Portuguese'],
    [:nl, :dut, 'Dutch'],
    [:pl, :pol, 'Polish'],
    [:id, :ind, 'Malay/Indonesian'],
    [:sv, :swe, 'Swedish'],
    [:tr, :tur, 'Turkish'],
    [:vi, :vie, 'Vietnamese'],
    [:ro, :rum, 'Romanian'],
    [:cs, :cze, 'Czech'],
    [:da, :dan, 'Danish'],
    [:fi, :fin, 'Finnish'],
    [:hu, :hun, 'Hungarian'],
    [:el, :ell, 'Greek'],
    [:ca, :cat, 'Catalan'],
    [:no, :nor, 'Norwegian'],
    [:sk, :slo, 'Slovak'],
    [:is, :ice, 'Icelandic'],
  ]

  # Column indexes into a CODES row.
  CODE2 = 0
  CODE3 = 1
  NAME = 2
  # Three-letter placeholder code meaning "no language".
  NO_LANGUAGE_CODE = :nnn

  # Returns the two-letter codes, in CODES order.
  def self.all_codes2
    CODES.map { |row| row[CODE2] }
  end

  # Returns the three-letter codes, in CODES order.
  def self.all_codes3
    CODES.map { |row| row[CODE3] }
  end

  # Converts a two-letter code (symbol or string, compared via to_s) into the
  # matching three-letter symbol. Returns nil when the code is unknown.
  def self.code2to3(code2)
    wanted = code2.to_s
    row = CODES.find { |r| r[CODE2].to_s == wanted }
    row && row[CODE3]
  end

  # Returns the English name for a two- or three-letter code symbol.
  # :nn and :nnn map to "No Language"; unknown codes return nil.
  def self.name_of(code)
    return "No Language" if code == :nn || code == NO_LANGUAGE_CODE

    # `||` (not `or`) so that the two-letter lookup really is the fallback.
    row = CODES.find { |r| r[CODE3] == code } ||
          CODES.find { |r| r[CODE2] == code }
    row && row[NAME]
  end

  # For every language, opens DATA_DIRECTORY/<code3>/<filename> with the given
  # mode and yields the open file together with the three-letter code. The
  # block form of File.open closes each file when the block returns.
  def self.each_file(filename, mode="r")
    all_codes3.each do |lang_code|
      filepath = "#{DATA_DIRECTORY}/#{lang_code}/#{filename}"
      File.open(filepath, mode) do |file|
        yield file, lang_code
      end
    end
  end

  # Like each_file, but opens two files from the same language directory and
  # yields both of them followed by the three-letter code.
  def self.each_2files(filename1, filename2, mode1="r", mode2="r")
    all_codes3.each do |lang_code|
      filepath1 = "#{DATA_DIRECTORY}/#{lang_code}/#{filename1}"
      filepath2 = "#{DATA_DIRECTORY}/#{lang_code}/#{filename2}"
      File.open(filepath1, mode1) do |file1|
        File.open(filepath2, mode2) do |file2|
          yield file1, file2, lang_code
        end
      end
    end
  end
end
|
94
|
+
|
95
|
+
LANGUAGES = Language.all_codes3
|
96
|
+
COMMON_LANGUAGES = [:dut, :eng, :ita, :por, :fre, :ger]
|
97
|
+
|
98
|
+
|
99
|
+
# for ngrams
|
100
|
+
|
101
|
+
end # module Rlid
|
102
|
+
|
103
|
+
|
104
|
+
# add methods to String
|
105
|
+
# Rlid additions to String: n-gram iteration over a normalised copy of the text.
class String
  # Yields each run of n consecutive characters of the normalised text
  # (see #preprocess) as a String.
  def each_ngram(n=3)
    preprocess(n).chars.each_cons(n) do |chars|
      yield chars.join
    end
  end

  # Returns a normalised, padded copy of the receiver (the receiver itself is
  # left untouched):
  #   * leading and trailing whitespace is removed;
  #   * every character other than a letter, an apostrophe or a newline
  #     becomes a space;
  #   * each newline (with surrounding whitespace) becomes n-1 "|" characters;
  #   * whitespace runs collapse to a single space and the text is downcased;
  #   * a one-character result is wrapped as "|x " whatever n is, anything
  #     else is surrounded by n-1 "|" characters on both sides.
  #
  # NOTE(review): the original had a second, unreachable `size == 1` branch
  # (padding + string + " "); the intended condition is not recoverable from
  # the code, so only the reachable behaviour is kept.
  def preprocess(n)
    padding = "|" * (n - 1)

    # Non-bang gsub always returns a new String, so calls can be chained
    # safely (gsub! returns nil when nothing matched).
    text = gsub(/\A\s+/, '').gsub(/\s+\Z/, '')
    text = text.gsub(/[^[:alpha:]'\n]/, ' ')
    text = text.gsub(/\s*\n\s*/, padding)
    text = text.gsub(/\s+/, ' ')
    # Replacing symbols with spaces may have produced new edge whitespace.
    text = text.gsub(/\A\s+/, '').gsub(/\s+\Z/, '')
    text = text.downcase

    return "|" + text + " " if text.size == 1

    padding + text + padding
  end
end
|
144
|
+
|
@@ -0,0 +1,15 @@
|
|
1
|
+
module Rlid
|
2
|
+
|
3
|
+
# Abstract base class for language guessers. Subclasses set a descriptive
# name and override #guess_language.
class LanguageGuesser
  # Human-readable name of the guesser; "unknown" until a subclass sets it.
  attr_accessor :name

  def initialize
    @name = "unknown"
  end

  # Must be overridden by subclasses; this base implementation always raises
  # a RuntimeError naming the receiver's class. The argument is ignored here.
  def guess_language(string)
    raise "#{self.class} should be subclassed"
  end
end
|
14
|
+
|
15
|
+
end # module Rlid
|
@@ -0,0 +1,43 @@
|
|
1
|
+
module Rlid
|
2
|
+
|
3
|
+
require 'rlid/language_guesser/language_guesser'
|
4
|
+
|
5
|
+
# Guesser that builds a model of the input with the given model class and
# returns the language whose stored model is at the smallest distance from it.
class ModelDistanceGuesser < LanguageGuesser
  # model_class must provide `language_models` (iterated as language/model
  # pairs) and `new(string)`; model instances must support `-`, which is used
  # as the distance. Progress is printed to standard output while loading.
  def initialize(model_class)
    @model_class = model_class
    print "Loading models.. "
    @language_models = model_class.language_models
    @name = "Model Distance"
    puts "Done!"
  end

  # Returns the key of the closest language model, or nil when no models are
  # loaded. On equal distances the first language encountered wins.
  def guess_language(string)
    sample = @model_class.new(string)
    closest_language = nil
    closest_distance = nil

    @language_models.each do |lang, lang_model|
      distance = lang_model - sample
      next unless closest_distance.nil? || distance < closest_distance

      closest_distance = distance
      closest_language = lang
    end

    closest_language
  end
end
|
27
|
+
|
28
|
+
|
29
|
+
#class OnlineGuesser < ModelDistanceGuesser
|
30
|
+
# def guess_language(string)
|
31
|
+
# min_language = min_distance = nil
|
32
|
+
# @language_models.each do |lang, lang_model|
|
33
|
+
# dist = lang_model.distance_from(string)
|
34
|
+
# if min_distance == nil or dist < min_distance
|
35
|
+
# min_distance = dist
|
36
|
+
# min_language = lang
|
37
|
+
# end
|
38
|
+
# end
|
39
|
+
# min_language
|
40
|
+
# end
|
41
|
+
#end
|
42
|
+
|
43
|
+
end # module Rlid
|
@@ -0,0 +1,74 @@
|
|
1
|
+
module Rlid
|
2
|
+
|
3
|
+
require 'rlid/language_guesser/language_guesser'
|
4
|
+
require 'rlid/models/naive_bayes_models'
|
5
|
+
require 'rlid/probabilities/language_probabilities'
|
6
|
+
|
7
|
+
# Guesser that returns the single language to which the loaded naive Bayes
# models assign the highest probability.
class NaiveBayesGuesser < LanguageGuesser
  # Loads the models via NaiveBayesModels.load and sets their default_count
  # to `default`. Progress is printed to standard output while loading.
  def initialize(default=1)
    print "Naive Bayes: loading models.."
    @models = NaiveBayesModels.load
    @models.default_count = default
    @name = "Naive Bayes"
    puts " Done!"
  end

  # Returns the language with the largest probability yielded by the models,
  # or nil when no yielded probability is greater than zero.
  def guess_language(string)
    winner = nil
    highest = 0.0

    @models.probabilities(string) do |lang, prob|
      next unless prob > highest

      highest = prob
      winner = lang
    end

    winner
  end
end
|
28
|
+
|
29
|
+
# Variant of NaiveBayesGuesser that returns a normalised probability for
# every language instead of only the best language.
class NaiveBayesProbabilityGuesser < NaiveBayesGuesser
  MAX = 3

  # Returns a LanguageProbabilities built from a { language => probability }
  # hash. Every raw probability is raised to 1 / log(1 + size), where size is
  # the length of the trigram-preprocessed input, and the results are then
  # divided by their sum (skipped when the sum is zero).
  def guess_language(string)
    # The exponent depends only on the input text, so it is computed once
    # rather than once per language.
    exponent = 1 / Math.log(1 + string.preprocess(3).size)

    results = {}
    tot = 0.0 # for normalization
    @models.probabilities(string) do |lang, p|
      prob = p**exponent
      results[lang] = prob
      tot += prob
    end

    # normalize
    unless tot == 0
      results.each_key { |lang| results[lang] /= tot }
    end

    LanguageProbabilities.new(results)
  end
end
|
52
|
+
|
53
|
+
|
54
|
+
# Variant of NaiveBayesProbabilityGuesser that multiplies the per-language
# probabilities by a prior before choosing a language.
class NaiveBayesPriorGuesser < NaiveBayesProbabilityGuesser
  # prior must be a LanguageProbabilities; anything else raises ArgumentError.
  # (The original raised the constant `InvalidArgument`, which is not a Ruby
  # built-in and is not defined in the visible sources.)
  def initialize(prior=TestProbabilities.new(:eng))
    unless prior.is_a?(LanguageProbabilities)
      raise ArgumentError,
            "prior must be a LanguageProbabilities, got #{prior.class}"
    end
    @prior = prior
    super()
  end

  # Replaces the prior used by #guess_language. No type check is performed.
  def set_prior(prior)
    @prior = prior
  end

  # Keeps the inherited implementation reachable under its own name.
  alias :super_guess_language :guess_language

  # Multiplies the inherited result by the prior and returns `first` of the
  # product.
  def guess_language(string)
    conditional = super_guess_language(string)
    (conditional * @prior).first
  end
end
|
73
|
+
|
74
|
+
end # module Rlid
|
@@ -0,0 +1,69 @@
|
|
1
|
+
module Rlid
|
2
|
+
|
3
|
+
|
4
|
+
require 'rlid/models/model'
|
5
|
+
require 'rlid/common'
|
6
|
+
|
7
|
+
|
8
|
+
# Trigram model that stores, for its most frequent ngrams, each ngram's share
# of the kept total count.
class FrequencyModel < NGramModel
  N = 3 # trigrams

  # string is the text to model; cutoff is the maximum number of ngrams kept.
  def initialize(string, cutoff=3000)
    super(string, N, cutoff)
  end

  # Writes the Marshal dump of the frequency table to the given IO.
  def save(file)
    file.write Marshal.dump(@ngram_frequency)
  end

  # Replaces the frequency table with the Marshal data read from the given IO.
  # NOTE(review): Marshal.load can instantiate arbitrary objects; only feed it
  # model files from a trusted source.
  def load(file)
    @ngram_frequency = Marshal.load(file.read)
  end

  # Builds the frequency table from a { ngram => count } hash: keeps the
  # @cutoff highest counts and divides each by the sum of the kept counts.
  def generate_model(ngram_count)
    ranked = ngram_count.to_a.sort { |x, y| y[1] <=> x[1] }
    kept = ranked[0...@cutoff]

    # Float accumulator so the divisions below are floating point.
    total = kept.inject(0.0) { |sum, (_ngram, count)| sum + count }

    @ngram_frequency = {} # ngram => normalised frequency
    kept.each { |ngram, count| @ngram_frequency[ngram] = count / total }
    @ngram_frequency
  end

  # Name of the per-language file in which models of this class are stored.
  def self.filename
    # FIXME should be frequency3000
    "cosine_distance3000"
  end

  protected

  attr_reader :ngram_frequency
end
|
49
|
+
|
50
|
+
# FrequencyModel whose `-` operator measures how far apart two models are.
class CosineDistanceModel < FrequencyModel
  # Returns 1 minus the sum, over the ngrams present in both models, of
  # (frequency here * frequency in other) ** 0.2.
  # NOTE(review): this is not the textbook cosine distance; the 0.2 exponent
  # looks like an empirical choice - confirm before changing it.
  #
  # Raises ArgumentError unless other is a CosineDistanceModel. (The original
  # raised the constant `InvalidArgument`, which is not a Ruby built-in and is
  # not defined in the visible sources.)
  def -(other)
    unless other.is_a?(CosineDistanceModel)
      raise ArgumentError, "expected a CosineDistanceModel, got #{other.class}"
    end

    prod = 0
    other.ngram_frequency.each do |ngram, freq_other|
      freq_self = ngram_frequency[ngram]
      # ngrams missing from this model contribute nothing
      next if freq_self.nil?

      prod += (freq_self * freq_other)**0.2
    end
    1 - prod
  end
end
|
65
|
+
|
66
|
+
|
67
|
+
|
68
|
+
|
69
|
+
end # module Rlid
|
@@ -0,0 +1,22 @@
|
|
1
|
+
#!/usr/bin/env ruby1.9.1
|
2
|
+
module Rlid
|
3
|
+
|
4
|
+
require 'rlid/common'
|
5
|
+
require 'rlid/models/ordered_ngrams'
|
6
|
+
require 'rlid/models/cosine_distance_model'
|
7
|
+
|
8
|
+
# models to train
|
9
|
+
MODELS = [CosineDistanceModel]

# For each model class, read every language's 'corpus' file and write the
# trained model into that language's model file.
MODELS.each do |model_class|
  puts "training #{model_class}"
  Language.each_2files('corpus', model_class.filename, 'r', 'w') do |corpus, output, lang|
    puts "learning #{lang}.."
    trained = model_class.new(corpus.read)
    trained.save(output)
  end
end

puts ">> Successfully trained!! <<"
|
20
|
+
|
21
|
+
|
22
|
+
end # module Rlid
|
@@ -0,0 +1,55 @@
|
|
1
|
+
module Rlid
|
2
|
+
|
3
|
+
|
4
|
+
# abstract class
|
5
|
+
# Abstract base class for models. Instantiating it, or calling this
# initializer through super, always raises a RuntimeError naming the
# receiver's class. The argument is ignored here.
class Model
  def initialize(string)
    raise "#{self.class} should be subclassed"
  end
end
|
11
|
+
|
12
|
+
|
13
|
+
|
14
|
+
# in subclasses generate_model filename, load and save should be implemented
|
15
|
+
# Base class for ngram models. Subclasses are expected to implement
# generate_model, load, save and a class-level `filename`.
class NGramModel < Model
  # Counts the ngrams of length n in string and passes the counts to
  # generate_model. With string == nil only @n and @cutoff are set, which
  # yields an empty model (language_models then fills it via `load`).
  # Model#initialize is deliberately not called, since it always raises.
  def initialize(string=nil, n=3, cutoff=300)
    @n = n
    @cutoff = cutoff

    return if string.nil?

    # ngrams and count of each
    ngram_count = Hash.new(0)
    string.each_ngram(@n) { |ngram| ngram_count[ngram] += 1 }

    generate_model(ngram_count)
  end

  # Returns a hash { language code => model } with one model per language,
  # each created empty and then loaded from that language's `filename` file.
  # Raises a RuntimeError when the receiving class provides no `filename`.
  def self.language_models
    unless respond_to?(:filename, true)
      # `self` is the model class here; `self.class` would only print "Class".
      raise "#{self} should implement 'filename' accessor!"
    end

    res = {}
    Language.each_file(filename) do |file, lang|
      model = new(nil)
      model.load(file)
      res[lang] = model
    end
    res
  end

  protected

  # should be implemented in the subclass
  # ngram_count is a hash: ngram => count
  def generate_model(ngram_count)
    raise "#{self.class} should be subclassed"
  end
end
|
54
|
+
|
55
|
+
end # module Rlid
|