lite 0.0.1 → 0.0.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 25ff1a57bf0429aa06d54f565f690193d820ace4
4
- data.tar.gz: b38a29a2a1cc6322a4d68dcaf0fac4cfa313a844
3
+ metadata.gz: 2fc13836cc1019dd8809a6c48a7a6cdffa8bb088
4
+ data.tar.gz: 69b8a922e062ec32047dfe1926f1a5cb54845d16
5
5
  SHA512:
6
- metadata.gz: 26355fc1113fcfa91eb8fd68c41df77f89e9b40f99a5b0584c089a8e73dc191a42b6c80ff3c9698e8ff737ddf7444e0d24c542e632376dd5a8bfecdb01735c04
7
- data.tar.gz: 4d62a7c6a8d20c29e582fa9b938e9fcfc70265cccebfbb6130acba04b39c32d81d6eb9d1deb89b6f4e64814d987f58a0a5d08c0fdbe7b0423a54bf2a5f4a7171
6
+ metadata.gz: b3289282345c068a6e0524bc4662e0c9764a1b8bf224fcf8ec24e30e122b5093a9cf2a2d8f1e63b31b309bd67691a8b01b152aecb7e4892c1ad6aeb787c6945d
7
+ data.tar.gz: 65a04db0306934823048101fe592acb883003358a38d4c329cf28278618c8c6e79c11bc394d636a950b71aedc2f072b4726f9ffb56edeec00e790aad80250250
@@ -0,0 +1,54 @@
1
+ require "json"
2
+ require "set"
3
+
4
module Classify

  # Naive Bayes classifier over weighted feature hashes, with additive
  # smoothing controlled by the constant @c.
  #
  # Per-label statistics are kept in @labels[label]:
  #   "xs" - feature => smoothed accumulated weight (seeded with @c)
  #   "nX" - smoothed total weight seen for the label (seeded with @c)
  #   "N"  - number of training vectors seen for the label
  #   "sF" - N + @c * (number of distinct features), refreshed by wrapup
  class NB

    def initialize
      @labels = {}
      @features = Set.new
      @nF = 0.0
      @nL = 0.0
      @c = 0.5
    end

    # Adds one training example.
    #
    # @param fvect [Hash] feature => numeric weight
    # @param label [Object] class label of the example
    # @return [Array] the label keys (the value returned by wrapup)
    def update!(fvect, label)
      stats = (@labels[label] ||= { "xs" => {}, "N" => 0 })
      fvect.each do |feature, weight|
        @features << feature
        stats["nX"] ||= @c
        stats["xs"][feature] ||= @c
        stats["xs"][feature] += weight
        stats["nX"] += weight
      end
      stats["N"] += 1
      wrapup
    end

    # Scores a feature vector against every known label.
    #
    # @param fvect [Hash] feature => numeric weight
    # @return [Hash] label => log-score (higher is more likely); empty when
    #   nothing has been learned yet
    def classify(fvect)
      @labels.keys.inject({}) do |scores, label|
        stats = @labels[label]
        # A label trained only with empty vectors has no "nX" yet; fall back
        # to the same seed value update! would have used.
        denominator = (stats["nX"] || @c) + @c * @nF
        likelihood = fvect.keys.inject(0.0) do |sum, feature|
          sum + fvect[feature] * Math.log((stats["xs"][feature] || @c) / denominator)
        end
        prior = Math.log(stats["N"] / @nL.to_f) # here no smoothing
        scores[label] = likelihood + prior
        scores
      end
    end

    # Serializes the model. Accepts (and forwards) the generator arguments so
    # the object can also be nested inside arrays/hashes being serialized.
    #
    # @return [String] JSON document
    def to_json(*args)
      {
        "id" => "#{rand(10000)}#{Time.now.to_i}",
        "labels" => @labels,
        "F" => @features.to_a,
        "nf" => @nF,
        "nl" => @nL,
        "c" => @c
      }.to_json(*args)
    end

    # Rebuilds a model from the output of #to_json. Labels and features come
    # back as whatever JSON.parse yields for them (strings for hash keys).
    #
    # @param json [String] document produced by #to_json
    # @return [NB]
    def self.from_json(json)
      parsed = JSON.parse json
      c = self.new
      c.instance_variable_set("@labels", parsed["labels"])
      c.instance_variable_set("@features", Set.new(parsed["F"]))
      c.instance_variable_set("@nF", parsed["nf"])
      c.instance_variable_set("@nL", parsed["nl"])
      # Older documents may lack "c"; keep the constructor default then.
      c.instance_variable_set("@c", parsed["c"]) if parsed.key?("c")
      c
    end

    private

    # Refreshes the derived totals after a training update.
    def wrapup
      @nF = @features.size
      @nL = @labels.keys.inject(0.0) { |sum, label| sum + @labels[label]["N"] }
      @labels.keys.each { |label| @labels[label]["sF"] = @labels[label]["N"] + @c * @nF }
    end
  end
end
@@ -0,0 +1,76 @@
1
+ require File.dirname(__FILE__)+'/sparsevect.rb'
2
+
3
+ module Cluster
4
# Online agglomerative clustering that keeps at most a fixed number of
# centroids. Instances must respond to #dist; centroids are Centroid objects
# exposing #x, #n, #update! and #merge!.
class AddC
  # @param upperBoundOnNumClusters [Integer] maximum number of centroids kept
  def initialize( upperBoundOnNumClusters )
    @k_max = upperBoundOnNumClusters
    @centroids = []
  end

  # Feeds one instance to the clusterer: the nearest centroid is updated with
  # it, the two closest centroids are merged once the bound is reached, and a
  # new centroid is created at the instance while there is room for one.
  #
  # @param instance [#dist] the observed point
  # @return [AddC] self
  def observe!( instance )
    if @centroids.empty?
      @centroids << Centroid.new( instance )
      return self
    end

    nearest = @centroids.min_by { |c| instance.dist( c.x ) }
    nearest.update!( instance )

    # Merging needs two distinct centroids; with a single one there is no
    # pair, and merging it with itself would delete it.
    merge_closest_pair! if @centroids.size >= @k_max && @centroids.size > 1
    @centroids << Centroid.new( instance ) if @centroids.size < @k_max

    self
  end

  # Drops centroids that absorbed fewer than the requested number of
  # instances, folding each non-empty one into its nearest survivor. The
  # pruning is applied to the clusterer's own state.
  #
  # @param min_num_instances_in_cluster [Integer] minimum count to survive
  # @return [Array<Centroid>] the remaining centroids (possibly empty)
  def getCentroids( min_num_instances_in_cluster = 2 )
    # Iterate over a snapshot: the live list shrinks as we go.
    @centroids.dup.each do |small|
      next if small.n >= min_num_instances_in_cluster
      @centroids = @centroids.reject { |c| c.equal?( small ) }
      # Nothing to fold in for an empty centroid, nowhere to fold when no
      # other centroid is left.
      next if small.n == 0 || @centroids.empty?
      # reverse_each keeps the original tie-break (last of the equally near).
      nearest = @centroids.reverse_each.min_by { |c| c.x.dist( small.x ) }
      nearest.merge! small
    end
    @centroids
  end

  private

  # Merges the two mutually closest centroids into one.
  def merge_closest_pair!
    keeper, absorbed = @centroids.combination( 2 ).min_by { |a, b| a.x.dist( b.x ) }
    keeper.merge!( absorbed )
    @centroids = @centroids.reject { |c| c.equal?( absorbed ) }
  end
end
55
+
56
# A cluster centre: a position vector plus the number of instances it has
# absorbed. The position must support #-, #+ and #mult_scalar.
class Centroid < SparseVector
  attr_accessor :x,:n

  # @param x [#-, #+, #mult_scalar] initial position; the count starts at 0
  def initialize( x )
    @x = x
    @n = 0
  end

  # Moves the centroid towards a new instance by the running-mean step
  # x <- x + (newX - x) / (n + 1), then counts the instance.
  #
  # @param newX [Object] vector of the same kind as #x
  # @return [Centroid] self
  def update!( newX )
    # (x - newX) * -1/(n+1) == (newX - x) / (n+1); keeps @x as the receiver.
    @x += ( @x - newX ).mult_scalar( -1.0/(@n+1) )
    @n += 1
    self
  end

  # Absorbs another centroid: the position becomes the count-weighted mean of
  # both positions and the counts are added.
  #
  # @param centroid [Centroid] centroid to absorb
  # @return [Centroid] self
  def merge!( centroid )
    total = @n + centroid.n
    @x = if total == 0
           # Two zero-count centroids carry no weights; a weighted mean would
           # divide by zero and yield NaN, so take the midpoint instead.
           ( @x + centroid.x ).mult_scalar( 0.5 )
         else
           ( @x.mult_scalar(@n)+centroid.x.mult_scalar(centroid.n) ).mult_scalar( 1.0 / total )
         end
    @n = total
    self
  end
end
75
+
76
+ end
@@ -0,0 +1,154 @@
1
+ module Cluster
2
# Collects unigram/bigram counts from word sequences and grows multi-word
# n-grams out of statistically significant bigrams.
#
# State:
#   @word        - token => count
#   @word_next   - token => number of times it appeared as a successor
#   @word_bigram - token => { successor => count }
#   @perms       - cache of null-distribution scores, see #null_score
class Grammy

  def initialize
    @word = Hash.new
    @word_next = Hash.new
    @word_bigram = Hash.new
    @perms = Hash.new
  end

  # Adds the unigram and bigram counts of one token sequence.
  #
  # @param word_seq_array [Array] tokens in order
  def digest!( word_seq_array )
    last_index = word_seq_array.size - 1
    (0..last_index).each do |i|
      w = word_seq_array[i]
      @word[ w ] = (@word[ w ] || 0) + 1
      next if i == last_index

      next_w = word_seq_array[i + 1]
      followers = (@word_bigram[ w ] ||= {})
      followers[next_w] = (followers[next_w] || 0) + 1
      @word_next[ next_w ] = (@word_next[ next_w ] || 0) + 1
    end
  end

  # @return [Array<Array>] [ngram, count] pairs, highest count first
  def extract
    calculate_ngrams()[ :w ].sort { |x1, x2| x2.last <=> x1.last }
  end

  # Repeatedly extends n-grams by their significant successors. Works on the
  # instance's own count tables and modifies them in place.
  #
  # @param depth [Integer] number of extension passes
  # @param cutoffs [Array<Integer>] minimum counts; the first prunes the
  #   unigrams (count <= cutoff is dropped), entry i is used by pass i. When
  #   there are fewer entries than passes the last entry is reused.
  # @return [Hash] { :w => ngram counts, :wb => successor counts }
  def calculate_ngrams( depth=5, cutoffs=[2,2,1,1,1] )
    counts  = @word.delete_if { |_key, value| value <= cutoffs.first }
    bigrams = @word_bigram

    depth.times do |i|
      cutoff = cutoffs[ i ] || cutoffs.last
      # Snapshot of the keys: the table is extended and pruned while looping.
      counts.keys.each do |uni|
        cs = sig_bigrams(uni, cutoff)
        cs.keys.each do |x|
          new_uni = "#{uni} #{x}"
          pair_count = bigrams[uni][x]
          counts[new_uni] = pair_count

          followers = bigrams[x]
          next if followers.nil? # x never had a successor: nothing to carry over

          # Estimated successor counts of the new n-gram: scale x's successor
          # counts by the share of x occurrences that followed uni.
          share = pair_count / @word_next[x].to_f
          followers.keys.each do |z|
            (bigrams[new_uni] ||= {})[z] = ( share * (followers[z] || 0) ).to_i
          end
        end
        counts.delete(uni) if cs.size > 0 || counts[uni] < cutoff
      end
    end

    { :w => counts, :wb => bigrams }
  end

  # Successors of a word whose association score beats the permutation
  # baseline.
  #
  # @param word [Object] n-gram to extend
  # @param min [Integer] minimum bigram count considered
  # @return [Hash] successor => score
  def sig_bigrams(word, min)
    followers = @word_bigram[ word ]
    return { } if followers.nil? || followers.empty?

    total = @word.values.inject(0, :+)
    count = followers.values.inject(0, :+)
    sig_big = { }
    baseline = nil
    scores = word_scores( count, @word, followers, total, min )
    scores.to_a.sort { |wc, zc| zc[1] <=> wc[1] }.each do |w, c|
      next if followers[w] < min
      # Computed lazily so nothing is sampled when no candidate qualifies.
      baseline ||= null_score( count, @word, total, 0.1, 10 )
      sig_big[w] = c if c > baseline
    end
    sig_big
  end

  # Association score for every key of bigram with at least min_count
  # occurrences.
  #
  # @param count [Numeric] total successor occurrences of the context word
  # @param unigram [Hash] token => overall count
  # @param bigram [Hash] successor => count after the context word
  # @param total [Numeric] total token count
  # @param min_count [Numeric] successors below this count are skipped
  # @return [Hash] successor => score
  def word_scores( count, unigram, bigram, total, min_count )
    val = Hash.new
    bigram.keys.each do |v|
      uni = unigram[v]||0
      big = bigram[v]||0
      next if big < min_count

      log_pi_vu = safelog(big) - safelog(count)
      log_pi_vnu = safelog(uni - big) - safelog(total - big)
      log_pi_v_old = safelog(uni) - safelog(total)
      log_1mp_v = safelog(1 - Math.exp(log_pi_vnu))
      log_1mp_vu = safelog(1 - Math.exp(log_pi_vu))

      val[v] = 2 * (big * log_pi_vu + \
                    (uni - big) * log_pi_vnu - \
                    uni * log_pi_v_old + \
                    (count - big) * (log_1mp_vu - log_1mp_v))
    end
    val
  end

  # Baseline score obtained from random resamples of the unigram table.
  # Results are cached per bucket of count / perm_hash (integer division).
  #
  # @param count [Integer] number of items drawn per resample
  # @param bigram [Hash] table to resample from (token => count)
  # @param total [Numeric] total token count
  # @param pvalue [Float] 1 / pvalue (+1) resamples are drawn
  # @param perm_hash [Integer] cache bucket width
  # @return [Numeric] highest score observed across the resamples
  def null_score( count, bigram, total, pvalue, perm_hash )
    perm_key = count/perm_hash # int div ..

    return @perms[perm_key] if @perms.has_key? perm_key

    max_score = 0
    nperm = (1.0 / pvalue).to_i
    (0..nperm).each do |perm|
      perm_bigram = new_sample_no_replace(total, bigram, count)
      obs_score = word_scores(count, bigram, perm_bigram, total, 1).values.max
      next if obs_score.nil? # empty resample: nothing to compare
      max_score = obs_score if obs_score > max_score || perm == 0
    end
    @perms[perm_key] = max_score

    max_score
  end

  # Logarithm that never raises: negative inputs are returned unchanged and
  # zero maps to a large negative constant.
  def safelog x
    if x < 0
      x
    elsif x == 0
      -1000000
    else
      Math.log( x )
    end
  end

  # Draws nitems entries through CDFast and tallies them.
  #
  # @return [Hash] drawn item => number of times drawn
  def new_sample_no_replace(total, table, nitems)
    cdf = CDFast.new table

    cdf.sample( nitems ).inject( {} ) { |h, x| h[ x ] ||= 0; h[x] += 1; h }
  end

  # Draws nitems distinct positions out of total and maps each to the table
  # row covering it.
  #
  # @param total [Integer] number of positions (sum of the table counts)
  # @param table [Array<Array>] [item, count] rows
  # @param nitems [Integer] number of positions to draw
  # @return [Hash] item => number of times drawn
  def sample_no_replace(total, table, nitems)
    # Valid positions are 0 .. total-1; an extra position would always fall
    # through to the last row and over-represent it.
    sample = (0...total).to_a.sample( nitems )
    count = {}
    sample.each do |n|
      w = nth_item_from_table(table, n)
      count[w] ||= 0
      count[w] += 1
    end
    count
  end

  # @return [Object] the item whose cumulative count range contains position
  #   n, or the last row's item when n is past the end
  def nth_item_from_table(table, n)
    sum = 0
    table.each do |wc|
      sum = sum + wc[1]
      return wc[0] if (n < sum) #table is sorted
    end
    table.last.first
  end
end
138
+
139
+
140
# Weighted sampler over a key => count table: every key is expanded into as
# many slots as its count, so drawing slots uniformly draws keys in
# proportion to their counts.
class CDFast
  # @param table [Hash, Array<Array>] key => integer count
  def initialize table
    @a = table.to_a.inject([]) { |slots, kv| slots.concat( Array.new( kv.last, kv.first ) ) }
  end

  def to_s
    "#{@a}"
  end

  # Draws keys from the table. Up to the number of slots the draw is without
  # replacement; larger requests are served by whole shuffled passes over the
  # slots plus a final partial draw.
  #
  # @param tt [Integer, #size] number of draws (or a collection whose size
  #   is the number of draws)
  # @return [Array] the drawn keys; empty when the table is empty or no
  #   draws are requested
  def sample tt
    wanted = tt.is_a?( Integer ) ? tt : tt.size
    return [] if @a.empty? || wanted <= 0

    full_passes, leftover = wanted.divmod( @a.size )
    drawn = []
    full_passes.times { drawn.concat( @a.shuffle ) }
    drawn.concat( @a.sample( leftover ) )
  end
end
154
+ end
@@ -0,0 +1,25 @@
1
+ require 'set'
2
+
3
# Vector stored as a hash of coordinate => value; coordinates that are not
# present count as 0.
class SparseVector
  attr_accessor :attr

  # @param attr_map [Hash] coordinate => numeric value
  def initialize( attr_map )
    @attr = attr_map
  end

  # Euclidean distance to another sparse vector.
  #
  # @param v [SparseVector]
  # @return [Float]
  def dist( v )
    squared = 0
    (@attr.keys | v.attr.keys).each do |key|
      delta = @attr.fetch(key, 0) - v.attr.fetch(key, 0)
      squared = squared + delta * delta
    end
    Math.sqrt( squared )
  end

  # Coordinate-wise difference (self - v) over the union of coordinates.
  #
  # @param v [SparseVector]
  # @return [SparseVector]
  def -(v)
    combine_with( v ) { |mine, theirs| mine - theirs }
  end

  # Coordinate-wise sum over the union of coordinates.
  #
  # @param v [SparseVector]
  # @return [SparseVector]
  def +(v)
    combine_with( v ) { |mine, theirs| mine + theirs }
  end

  # @param c [Numeric] factor applied to every stored value
  # @return [SparseVector]
  def mult_scalar( c )
    scaled = {}
    @attr.each_pair { |key, value| scaled[key] = value * c }
    SparseVector.new( scaled )
  end

  private

  # Builds a new vector by applying the block to each pair of values, walking
  # the other vector's coordinates first and then the ones only self has.
  def combine_with( v )
    merged = {}
    (v.attr.keys | @attr.keys).each do |key|
      merged[key] = yield( @attr.fetch(key, 0), v.attr.fetch(key, 0) )
    end
    SparseVector.new( merged )
  end
end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: lite
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.0.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - ronbee
@@ -17,6 +17,10 @@ extensions: []
17
17
  extra_rdoc_files: []
18
18
  files:
19
19
  - lib/lite.rb
20
+ - lib/lite/classifier.rb
21
+ - lib/lite/cluster.rb
22
+ - lib/lite/ngrams.rb
23
+ - lib/lite/sparsevect.rb
20
24
  homepage: https://github.com/ronbee/lite
21
25
  licenses:
22
26
  - mit