rlid 0.1.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,7 +1,7 @@
1
1
  require 'rlid/language_guesser/naive_bayes_guesser'
2
2
 
3
3
  module Rlid
4
- @guesser = NaiveBayesProbabilityGuesser.new
4
+ @guesser = SmartBayesGuesser.new
5
5
  def self.guess_language(string)
6
6
  @guesser.guess_language(string)
7
7
  end
@@ -96,11 +96,20 @@ LANGUAGES = Language.all_codes3
96
96
  COMMON_LANGUAGES = [:dut, :eng, :ita, :por, :fre, :ger]
97
97
 
98
98
 
99
- # for ngrams
99
+ def self.scrollbar(perc, size=80)
100
+ realsize = size-2
101
+ pos = (perc.to_f * realsize).round
102
+ bar = "=" * pos + " " * (realsize-pos)
103
+ bar[pos-1] = ">" if pos > 0 and pos < realsize
104
+ print "|", bar, "|", "\r"
105
+ end
106
+
100
107
 
101
108
  end # module Rlid
102
109
 
103
110
 
111
+
112
+
104
113
  # add methods to String
105
114
  class String
106
115
  def each_ngram(n=3)
@@ -131,9 +140,9 @@ class String
131
140
 
132
141
  padding = "|" * (n-1)
133
142
 
134
- if string.size == 1
135
- string = "|" + string + " "
136
- elsif string.size == 1
143
+ #if string.size == 1
144
+ # string = "|" + string + " "
145
+ if string.size < n-1
137
146
  string = padding + string + " "
138
147
  else
139
148
  string = padding + string + padding
@@ -27,7 +27,6 @@ class NaiveBayesGuesser < LanguageGuesser
27
27
  end
28
28
 
29
29
  class NaiveBayesProbabilityGuesser < NaiveBayesGuesser
30
- MAX = 3
31
30
  def guess_language(string)
32
31
  results = {}
33
32
  tot = 0.0 # for normalization
@@ -50,6 +49,23 @@ class NaiveBayesProbabilityGuesser < NaiveBayesGuesser
50
49
  end
51
50
  end
52
51
 
52
+ class SmartBayesGuesser < LanguageGuesser
53
+ def initialize(default=1)
54
+ #print "Smart Bayes: loading models.."
55
+ @models = SmartBayesModels.new
56
+ #puts " Done!"
57
+ end
58
+ def guess_language(string)
59
+ results = @models.probabilities(string)
60
+ results.delete :nnn
61
+ tot = results.values.inject(0.0){|s,x| s+x}
62
+ results.each_key do |lang|
63
+ results[lang] /= tot
64
+ end
65
+ LanguageProbabilities.new(results)
66
+ end
67
+ end
68
+
53
69
 
54
70
  class NaiveBayesPriorGuesser < NaiveBayesProbabilityGuesser
55
71
  def initialize(prior=TestProbabilities.new(:eng))
@@ -2,8 +2,10 @@
2
2
 
3
3
  module Rlid
4
4
 
5
+ $LOAD_PATH << File.expand_path("#{__FILE__}/../../../")
5
6
  require 'rlid/models/naive_bayes_models'
6
7
 
7
8
  NaiveBayesModels.generate_models
9
+ NaiveBayesModels.generate_models("bigrams", 2)
8
10
 
9
11
  end # module Rlid
@@ -6,35 +6,45 @@ require 'set'
6
6
 
7
7
  require 'rlid/common'
8
8
 
9
+
10
+ # > prova di una stringa molto lunga lunghissima davvero lunga yyyy
11
+ # default = 10
12
+ # ita(99.97) : cat(0.026) : spa(0.0023)
13
+ # default = 1
14
+ # ita(99.995) : cat(0.0045) : por(0.00019)
15
+ # default = 0.1
16
+ # ita(99.9990) : cat(0.00086) : rum(3.7e-05)
17
+
9
18
  class NaiveBayesModels
10
- attr_accessor :default_count
11
- # ngram leght
12
- N = 3
19
+ attr_accessor :default_count, :n
13
20
  # top ngrams kept for every language
14
21
  CUTOFF = 3000
15
22
  # special feature
16
23
  OTHER = nil
17
24
 
18
- MAX_STRING_LENGTH = 75
25
+ MAX_STRING_LENGTH = 20
19
26
 
20
- FILEPATH = "#{DATA_DIRECTORY}/naive_bayes_models"
27
+ FILENAME = "naive_bayes_models"
21
28
 
22
- def initialize(default_count=1)
23
- @default_count=default_count
29
+ def initialize(n=3)
30
+ @n=n
31
+ @default_count=1
24
32
  end
25
33
 
26
- def self.generate_models
27
- models = NaiveBayesModels.new(nil)
34
+ def self.generate_models(file=nil, n=3)
35
+ file ||= FILENAME
36
+ models = NaiveBayesModels.new(n)
28
37
  puts "Training started.."
29
38
  models.train
30
- File.open(FILEPATH, "w") do |file|
31
- file.write Marshal.dump(models)
32
- puts "Models saved to #{FILEPATH}"
39
+ File.open( "#{DATA_DIRECTORY}/#{file}", "w") do |f|
40
+ f.write Marshal.dump(models)
41
+ puts "Models saved to #{DATA_DIRECTORY}/#{file}"
33
42
  end
34
43
  end
35
44
 
36
- def self.load
37
- Marshal.load(File.read(FILEPATH))
45
+ def self.load(file=nil)
46
+ file ||= FILENAME
47
+ Marshal.load(File.read("#{DATA_DIRECTORY}/#{file}"))
38
48
  end
39
49
 
40
50
  def probabilities(string)
@@ -43,13 +53,26 @@ class NaiveBayesModels
43
53
  end
44
54
  @ngram_frequency.keys.each do |lang|
45
55
  prob = 1
46
- string[0..MAX_STRING_LENGTH].each_ngram do |ngram|
56
+ string[0..MAX_STRING_LENGTH].each_ngram(@n) do |ngram|
57
+ if lang == :eng
58
+ #print ngram, ", "
59
+ end
47
60
  prob *= frequency_of(lang, ngram)
48
61
  end
49
62
  yield lang, prob
50
63
  end
51
64
  end
52
65
 
66
+ # returns a hash
67
+ def probabilities_h(string)
68
+ #puts "#{@n}: #{total_ngrams(:ita)}"
69
+ res = {}
70
+ probabilities(string) do |lang, prob|
71
+ res[lang] = prob
72
+ end
73
+ res
74
+ end
75
+
53
76
  def train
54
77
  ngram_counts = get_ngram_counts
55
78
  # ngrams for which we want to store information (all languages)
@@ -81,13 +104,14 @@ class NaiveBayesModels
81
104
  puts_info(lang)
82
105
  end
83
106
 
107
+ # add language :nnn
84
108
  n = @ngram_frequency.values.map{|x| x[OTHER]}.max * 3 / 2 # (* 1.5)
85
109
  @total_ngrams_found[:nnn] = n
86
110
  @ngram_frequency[Language::NO_LANGUAGE_CODE] = {OTHER => n}
87
111
  @total_ngrams_not_found[:nnn] = @stored_ngrams.size
88
112
 
89
113
  #puts "total frequencies saved: #{freqs}"
90
- #puts "defauld values used: #{default_count} (#{100*default_count/freqs}%)"
114
+ #puts "default values used: #{default_count} (#{100*default_count/freqs}%)"
91
115
  #@ngram_frequency
92
116
  end
93
117
 
@@ -101,7 +125,6 @@ protected
101
125
  #warn " :#{ngram}: is in OTHER!" if lang == :eng
102
126
  ngram = OTHER
103
127
  end
104
- count = 0
105
128
  if @ngram_frequency[lang].include?(ngram)
106
129
  count = @ngram_frequency[lang][ngram]
107
130
  else
@@ -130,7 +153,7 @@ private
130
153
  Language.each_file("corpus") do |file, lang|
131
154
  puts "- I'm learning #{lang}"
132
155
  ngram_counts[lang] = Hash.new(0) # default is 1
133
- file.read.each_ngram(N) do |ngram|
156
+ file.read.each_ngram(@n) do |ngram|
134
157
  ngram_counts[lang][ngram] += 1
135
158
  end
136
159
 
@@ -154,4 +177,28 @@ private
154
177
  end
155
178
 
156
179
 
180
+ class SmartBayesModels
181
+ def initialize
182
+ @trigrams = NaiveBayesModels.load
183
+ @bigrams = NaiveBayesModels.load("bigrams")
184
+ @trigrams.default_count = 0.1
185
+ @bigrams.default_count = 0.1
186
+ end
187
+
188
+ def probabilities string
189
+ probtri = @trigrams.probabilities_h(string)
190
+ probbi = @bigrams.probabilities_h(string)
191
+ res = {}
192
+ probtri.each_key do |lang|
193
+ if probbi[lang] != 0
194
+ res[lang] = probtri[lang]/probbi[lang]
195
+ else
196
+ res[lang] = probtri[lang]
197
+ end
198
+ #puts "#{lang} = #{probtri[lang]}/#{probbi[lang]} (#{res[lang]})"
199
+ end
200
+ res
201
+ end
202
+ end
203
+
157
204
  end # module Rlid
@@ -11,7 +11,9 @@ class Percentage
11
11
 
12
12
  def to_s
13
13
  if @value <= 0.98
14
- return "%.2g" % (@value * 100)
14
+ "%.2g" % (@value * 100)
15
+ elsif @value == 1.0
16
+ "100"
15
17
  else
16
18
  complement = 1.0 - @value
17
19
  # complement =
@@ -23,7 +25,7 @@ class Percentage
23
25
  digits += 1
24
26
  res = "%.#{digits}f" % (@value * 100)
25
27
  end
26
- return res
28
+ res
27
29
  end
28
30
  end
29
31
 
@@ -68,6 +70,10 @@ class LanguageProbabilities
68
70
  end.join(" : ")
69
71
  end
70
72
 
73
+ def [](lang)
74
+ @percentage[lang]
75
+ end
76
+
71
77
  def first
72
78
  sorted.first[LANG]
73
79
  end
@@ -126,7 +132,11 @@ protected
126
132
  end
127
133
 
128
134
  def sorted
129
- @percentage.to_a.sort!{|x,y| y[PERC] <=> x[PERC]}
135
+ begin
136
+ @percentage.to_a.sort!{|x,y| y[PERC] <=> x[PERC]}
137
+ rescue
138
+ p @percentage
139
+ end
130
140
  end
131
141
 
132
142
  attr_accessor :percentage
@@ -51,7 +51,7 @@ module Rlid
51
51
  end
52
52
  end
53
53
 
54
- puts require 'irb'
54
+ puts RUBY_VERSION
55
55
 
56
56
 
57
57
  #irb self
metadata CHANGED
@@ -1,71 +1,61 @@
1
- --- !ruby/object:Gem::Specification
1
+ --- !ruby/object:Gem::Specification
2
2
  name: rlid
3
- version: !ruby/object:Gem::Version
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.1
4
5
  prerelease:
5
- version: 0.1.0
6
6
  platform: ruby
7
- authors:
7
+ authors:
8
8
  - Fela Winkelmolen
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
-
13
- date: 2011-02-01 00:00:00 +01:00
14
- default_executable:
12
+ date: 2013-07-04 00:00:00.000000000 Z
15
13
  dependencies: []
16
-
17
- description: Language identification library specialized in guessing the language of short strings.
14
+ description: Language identification library specialized in guessing the language
15
+ of short strings.
18
16
  email: fela.kde@gmail.com
19
17
  executables: []
20
-
21
18
  extensions: []
22
-
23
19
  extra_rdoc_files: []
24
-
25
- files:
26
- - lib/interactive_guesser.rb
20
+ files:
27
21
  - lib/rlid.rb
28
- - lib/rlid/common.rb
22
+ - lib/interactive_guesser.rb
29
23
  - lib/rlid/web.rb
24
+ - lib/rlid/common.rb
30
25
  - lib/rlid/tmp.rb
31
26
  - lib/rlid/language_guesser/model_distance_guesser.rb
32
- - lib/rlid/language_guesser/naive_bayes_guesser.rb
33
27
  - lib/rlid/language_guesser/language_guesser.rb
34
- - lib/rlid/models/generate_naive_bayes_models.rb
35
- - lib/rlid/models/generate_models.rb
36
- - lib/rlid/models/cosine_distance_model.rb
28
+ - lib/rlid/language_guesser/naive_bayes_guesser.rb
37
29
  - lib/rlid/models/model.rb
38
- - lib/rlid/models/naive_bayes_models.rb
30
+ - lib/rlid/models/cosine_distance_model.rb
39
31
  - lib/rlid/models/ordered_ngrams.rb
32
+ - lib/rlid/models/naive_bayes_models.rb
33
+ - lib/rlid/models/generate_models.rb
34
+ - lib/rlid/models/generate_naive_bayes_models.rb
40
35
  - lib/rlid/probabilities/language_probabilities.rb
41
36
  - data/naive_bayes_models
42
- has_rdoc: true
43
37
  homepage: https://github.com/fela/rlid
44
38
  licenses: []
45
-
46
39
  post_install_message:
47
40
  rdoc_options: []
48
-
49
- require_paths:
41
+ require_paths:
50
42
  - lib
51
- required_ruby_version: !ruby/object:Gem::Requirement
43
+ required_ruby_version: !ruby/object:Gem::Requirement
52
44
  none: false
53
- requirements:
54
- - - ">="
55
- - !ruby/object:Gem::Version
45
+ requirements:
46
+ - - ! '>='
47
+ - !ruby/object:Gem::Version
56
48
  version: 1.9.1
57
- required_rubygems_version: !ruby/object:Gem::Requirement
49
+ required_rubygems_version: !ruby/object:Gem::Requirement
58
50
  none: false
59
- requirements:
60
- - - ">="
61
- - !ruby/object:Gem::Version
62
- version: "0"
51
+ requirements:
52
+ - - ! '>='
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
63
55
  requirements: []
64
-
65
56
  rubyforge_project:
66
- rubygems_version: 1.5.0
57
+ rubygems_version: 1.8.24
67
58
  signing_key:
68
59
  specification_version: 3
69
60
  summary: Language identification library
70
61
  test_files: []
71
-