rlid 0.1.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/data/naive_bayes_models +6926 -127052
- data/lib/rlid.rb +1 -1
- data/lib/rlid/common.rb +13 -4
- data/lib/rlid/language_guesser/naive_bayes_guesser.rb +17 -1
- data/lib/rlid/models/generate_naive_bayes_models.rb +2 -0
- data/lib/rlid/models/naive_bayes_models.rb +65 -18
- data/lib/rlid/probabilities/language_probabilities.rb +13 -3
- data/lib/rlid/tmp.rb +1 -1
- metadata +26 -36
data/lib/rlid.rb
CHANGED
data/lib/rlid/common.rb
CHANGED
@@ -96,11 +96,20 @@ LANGUAGES = Language.all_codes3
|
|
96
96
|
COMMON_LANGUAGES = [:dut, :eng, :ita, :por, :fre, :ger]
|
97
97
|
|
98
98
|
|
99
|
-
|
99
|
+
def self.scrollbar(perc, size=80)
|
100
|
+
realsize = size-2
|
101
|
+
pos = (perc.to_f * realsize).round
|
102
|
+
bar = "=" * pos + " " * (realsize-pos)
|
103
|
+
bar[pos-1] = ">" if pos > 0 and pos < realsize
|
104
|
+
print "|", bar, "|", "\r"
|
105
|
+
end
|
106
|
+
|
100
107
|
|
101
108
|
end # module Rlid
|
102
109
|
|
103
110
|
|
111
|
+
|
112
|
+
|
104
113
|
# add methods to String
|
105
114
|
class String
|
106
115
|
def each_ngram(n=3)
|
@@ -131,9 +140,9 @@ class String
|
|
131
140
|
|
132
141
|
padding = "|" * (n-1)
|
133
142
|
|
134
|
-
if string.size == 1
|
135
|
-
|
136
|
-
|
143
|
+
#if string.size == 1
|
144
|
+
# string = "|" + string + " "
|
145
|
+
if string.size < n-1
|
137
146
|
string = padding + string + " "
|
138
147
|
else
|
139
148
|
string = padding + string + padding
|
@@ -27,7 +27,6 @@ class NaiveBayesGuesser < LanguageGuesser
|
|
27
27
|
end
|
28
28
|
|
29
29
|
class NaiveBayesProbabilityGuesser < NaiveBayesGuesser
|
30
|
-
MAX = 3
|
31
30
|
def guess_language(string)
|
32
31
|
results = {}
|
33
32
|
tot = 0.0 # for normalization
|
@@ -50,6 +49,23 @@ class NaiveBayesProbabilityGuesser < NaiveBayesGuesser
|
|
50
49
|
end
|
51
50
|
end
|
52
51
|
|
52
|
+
class SmartBayesGuesser < LanguageGuesser
|
53
|
+
def initialize(default=1)
|
54
|
+
#print "Smart Bayes: loading models.."
|
55
|
+
@models = SmartBayesModels.new
|
56
|
+
#puts " Done!"
|
57
|
+
end
|
58
|
+
def guess_language(string)
|
59
|
+
results = @models.probabilities(string)
|
60
|
+
results.delete :nnn
|
61
|
+
tot = results.values.inject(0.0){|s,x| s+x}
|
62
|
+
results.each_key do |lang|
|
63
|
+
results[lang] /= tot
|
64
|
+
end
|
65
|
+
LanguageProbabilities.new(results)
|
66
|
+
end
|
67
|
+
end
|
68
|
+
|
53
69
|
|
54
70
|
class NaiveBayesPriorGuesser < NaiveBayesProbabilityGuesser
|
55
71
|
def initialize(prior=TestProbabilities.new(:eng))
|
@@ -6,35 +6,45 @@ require 'set'
|
|
6
6
|
|
7
7
|
require 'rlid/common'
|
8
8
|
|
9
|
+
|
10
|
+
# > prova di una stringa molto lunga lunghissima davvero lunga yyyy
|
11
|
+
# default = 10
|
12
|
+
# ita(99.97) : cat(0.026) : spa(0.0023)
|
13
|
+
# default = 1
|
14
|
+
# ita(99.995) : cat(0.0045) : por(0.00019)
|
15
|
+
# default = 0.1
|
16
|
+
# ita(99.9990) : cat(0.00086) : rum(3.7e-05)
|
17
|
+
|
9
18
|
class NaiveBayesModels
|
10
|
-
attr_accessor :default_count
|
11
|
-
# ngram leght
|
12
|
-
N = 3
|
19
|
+
attr_accessor :default_count, :n
|
13
20
|
# top ngrams kept for every language
|
14
21
|
CUTOFF = 3000
|
15
22
|
# special feature
|
16
23
|
OTHER = nil
|
17
24
|
|
18
|
-
MAX_STRING_LENGTH =
|
25
|
+
MAX_STRING_LENGTH = 20
|
19
26
|
|
20
|
-
|
27
|
+
FILENAME = "naive_bayes_models"
|
21
28
|
|
22
|
-
def initialize(
|
23
|
-
@
|
29
|
+
def initialize(n=3)
|
30
|
+
@n=n
|
31
|
+
@default_count=1
|
24
32
|
end
|
25
33
|
|
26
|
-
def self.generate_models
|
27
|
-
|
34
|
+
def self.generate_models(file=nil, n=3)
|
35
|
+
file ||= FILENAME
|
36
|
+
models = NaiveBayesModels.new(n)
|
28
37
|
puts "Training started.."
|
29
38
|
models.train
|
30
|
-
File.open(
|
31
|
-
|
32
|
-
puts "Models saved to #{
|
39
|
+
File.open( "#{DATA_DIRECTORY}/#{file}", "w") do |f|
|
40
|
+
f.write Marshal.dump(models)
|
41
|
+
puts "Models saved to #{DATA_DIRECTORY}/#{file}"
|
33
42
|
end
|
34
43
|
end
|
35
44
|
|
36
|
-
def self.load
|
37
|
-
|
45
|
+
def self.load(file=nil)
|
46
|
+
file ||= FILENAME
|
47
|
+
Marshal.load(File.read("#{DATA_DIRECTORY}/#{file}"))
|
38
48
|
end
|
39
49
|
|
40
50
|
def probabilities(string)
|
@@ -43,13 +53,26 @@ class NaiveBayesModels
|
|
43
53
|
end
|
44
54
|
@ngram_frequency.keys.each do |lang|
|
45
55
|
prob = 1
|
46
|
-
string[0..MAX_STRING_LENGTH].each_ngram do |ngram|
|
56
|
+
string[0..MAX_STRING_LENGTH].each_ngram(@n) do |ngram|
|
57
|
+
if lang == :eng
|
58
|
+
#print ngram, ", "
|
59
|
+
end
|
47
60
|
prob *= frequency_of(lang, ngram)
|
48
61
|
end
|
49
62
|
yield lang, prob
|
50
63
|
end
|
51
64
|
end
|
52
65
|
|
66
|
+
# returns a hash
|
67
|
+
def probabilities_h(string)
|
68
|
+
#puts "#{@n}: #{total_ngrams(:ita)}"
|
69
|
+
res = {}
|
70
|
+
probabilities(string) do |lang, prob|
|
71
|
+
res[lang] = prob
|
72
|
+
end
|
73
|
+
res
|
74
|
+
end
|
75
|
+
|
53
76
|
def train
|
54
77
|
ngram_counts = get_ngram_counts
|
55
78
|
# ngrams for which we want to store information (all languages)
|
@@ -81,13 +104,14 @@ class NaiveBayesModels
|
|
81
104
|
puts_info(lang)
|
82
105
|
end
|
83
106
|
|
107
|
+
# add language :nnn
|
84
108
|
n = @ngram_frequency.values.map{|x| x[OTHER]}.max * 3 / 2 # (* 1.5)
|
85
109
|
@total_ngrams_found[:nnn] = n
|
86
110
|
@ngram_frequency[Language::NO_LANGUAGE_CODE] = {OTHER => n}
|
87
111
|
@total_ngrams_not_found[:nnn] = @stored_ngrams.size
|
88
112
|
|
89
113
|
#puts "total frequencies saved: #{freqs}"
|
90
|
-
#puts "
|
114
|
+
#puts "default values used: #{default_count} (#{100*default_count/freqs}%)"
|
91
115
|
#@ngram_frequency
|
92
116
|
end
|
93
117
|
|
@@ -101,7 +125,6 @@ protected
|
|
101
125
|
#warn " :#{ngram}: is in OTHER!" if lang == :eng
|
102
126
|
ngram = OTHER
|
103
127
|
end
|
104
|
-
count = 0
|
105
128
|
if @ngram_frequency[lang].include?(ngram)
|
106
129
|
count = @ngram_frequency[lang][ngram]
|
107
130
|
else
|
@@ -130,7 +153,7 @@ private
|
|
130
153
|
Language.each_file("corpus") do |file, lang|
|
131
154
|
puts "- I'm learning #{lang}"
|
132
155
|
ngram_counts[lang] = Hash.new(0) # default is 1
|
133
|
-
file.read.each_ngram(
|
156
|
+
file.read.each_ngram(@n) do |ngram|
|
134
157
|
ngram_counts[lang][ngram] += 1
|
135
158
|
end
|
136
159
|
|
@@ -154,4 +177,28 @@ private
|
|
154
177
|
end
|
155
178
|
|
156
179
|
|
180
|
+
class SmartBayesModels
|
181
|
+
def initialize
|
182
|
+
@trigrams = NaiveBayesModels.load
|
183
|
+
@bigrams = NaiveBayesModels.load("bigrams")
|
184
|
+
@trigrams.default_count = 0.1
|
185
|
+
@bigrams.default_count = 0.1
|
186
|
+
end
|
187
|
+
|
188
|
+
def probabilities string
|
189
|
+
probtri = @trigrams.probabilities_h(string)
|
190
|
+
probbi = @bigrams.probabilities_h(string)
|
191
|
+
res = {}
|
192
|
+
probtri.each_key do |lang|
|
193
|
+
if probbi[lang] != 0
|
194
|
+
res[lang] = probtri[lang]/probbi[lang]
|
195
|
+
else
|
196
|
+
res[lang] = probtri[lang]
|
197
|
+
end
|
198
|
+
#puts "#{lang} = #{probtri[lang]}/#{probbi[lang]} (#{res[lang]})"
|
199
|
+
end
|
200
|
+
res
|
201
|
+
end
|
202
|
+
end
|
203
|
+
|
157
204
|
end # module Rlid
|
@@ -11,7 +11,9 @@ class Percentage
|
|
11
11
|
|
12
12
|
def to_s
|
13
13
|
if @value <= 0.98
|
14
|
-
|
14
|
+
"%.2g" % (@value * 100)
|
15
|
+
elsif @value == 1.0
|
16
|
+
"100"
|
15
17
|
else
|
16
18
|
complement = 1.0 - @value
|
17
19
|
# complement =
|
@@ -23,7 +25,7 @@ class Percentage
|
|
23
25
|
digits += 1
|
24
26
|
res = "%.#{digits}f" % (@value * 100)
|
25
27
|
end
|
26
|
-
|
28
|
+
res
|
27
29
|
end
|
28
30
|
end
|
29
31
|
|
@@ -68,6 +70,10 @@ class LanguageProbabilities
|
|
68
70
|
end.join(" : ")
|
69
71
|
end
|
70
72
|
|
73
|
+
def [](lang)
|
74
|
+
@percentage[lang]
|
75
|
+
end
|
76
|
+
|
71
77
|
def first
|
72
78
|
sorted.first[LANG]
|
73
79
|
end
|
@@ -126,7 +132,11 @@ protected
|
|
126
132
|
end
|
127
133
|
|
128
134
|
def sorted
|
129
|
-
|
135
|
+
begin
|
136
|
+
@percentage.to_a.sort!{|x,y| y[PERC] <=> x[PERC]}
|
137
|
+
rescue
|
138
|
+
p @percentage
|
139
|
+
end
|
130
140
|
end
|
131
141
|
|
132
142
|
attr_accessor :percentage
|
data/lib/rlid/tmp.rb
CHANGED
metadata
CHANGED
@@ -1,71 +1,61 @@
|
|
1
|
-
--- !ruby/object:Gem::Specification
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
2
|
name: rlid
|
3
|
-
version: !ruby/object:Gem::Version
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.1
|
4
5
|
prerelease:
|
5
|
-
version: 0.1.0
|
6
6
|
platform: ruby
|
7
|
-
authors:
|
7
|
+
authors:
|
8
8
|
- Fela Winkelmolen
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
|
13
|
-
date: 2011-02-01 00:00:00 +01:00
|
14
|
-
default_executable:
|
12
|
+
date: 2013-07-04 00:00:00.000000000 Z
|
15
13
|
dependencies: []
|
16
|
-
|
17
|
-
|
14
|
+
description: Language identification library specialized in guessing the language
|
15
|
+
of short strings.
|
18
16
|
email: fela.kde@gmail.com
|
19
17
|
executables: []
|
20
|
-
|
21
18
|
extensions: []
|
22
|
-
|
23
19
|
extra_rdoc_files: []
|
24
|
-
|
25
|
-
files:
|
26
|
-
- lib/interactive_guesser.rb
|
20
|
+
files:
|
27
21
|
- lib/rlid.rb
|
28
|
-
- lib/
|
22
|
+
- lib/interactive_guesser.rb
|
29
23
|
- lib/rlid/web.rb
|
24
|
+
- lib/rlid/common.rb
|
30
25
|
- lib/rlid/tmp.rb
|
31
26
|
- lib/rlid/language_guesser/model_distance_guesser.rb
|
32
|
-
- lib/rlid/language_guesser/naive_bayes_guesser.rb
|
33
27
|
- lib/rlid/language_guesser/language_guesser.rb
|
34
|
-
- lib/rlid/
|
35
|
-
- lib/rlid/models/generate_models.rb
|
36
|
-
- lib/rlid/models/cosine_distance_model.rb
|
28
|
+
- lib/rlid/language_guesser/naive_bayes_guesser.rb
|
37
29
|
- lib/rlid/models/model.rb
|
38
|
-
- lib/rlid/models/
|
30
|
+
- lib/rlid/models/cosine_distance_model.rb
|
39
31
|
- lib/rlid/models/ordered_ngrams.rb
|
32
|
+
- lib/rlid/models/naive_bayes_models.rb
|
33
|
+
- lib/rlid/models/generate_models.rb
|
34
|
+
- lib/rlid/models/generate_naive_bayes_models.rb
|
40
35
|
- lib/rlid/probabilities/language_probabilities.rb
|
41
36
|
- data/naive_bayes_models
|
42
|
-
has_rdoc: true
|
43
37
|
homepage: https://github.com/fela/rlid
|
44
38
|
licenses: []
|
45
|
-
|
46
39
|
post_install_message:
|
47
40
|
rdoc_options: []
|
48
|
-
|
49
|
-
require_paths:
|
41
|
+
require_paths:
|
50
42
|
- lib
|
51
|
-
required_ruby_version: !ruby/object:Gem::Requirement
|
43
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
52
44
|
none: false
|
53
|
-
requirements:
|
54
|
-
- -
|
55
|
-
- !ruby/object:Gem::Version
|
45
|
+
requirements:
|
46
|
+
- - ! '>='
|
47
|
+
- !ruby/object:Gem::Version
|
56
48
|
version: 1.9.1
|
57
|
-
required_rubygems_version: !ruby/object:Gem::Requirement
|
49
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
58
50
|
none: false
|
59
|
-
requirements:
|
60
|
-
- -
|
61
|
-
- !ruby/object:Gem::Version
|
62
|
-
version:
|
51
|
+
requirements:
|
52
|
+
- - ! '>='
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
63
55
|
requirements: []
|
64
|
-
|
65
56
|
rubyforge_project:
|
66
|
-
rubygems_version: 1.
|
57
|
+
rubygems_version: 1.8.24
|
67
58
|
signing_key:
|
68
59
|
specification_version: 3
|
69
60
|
summary: Language identification library
|
70
61
|
test_files: []
|
71
|
-
|