lite 0.0.1 → 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 25ff1a57bf0429aa06d54f565f690193d820ace4
4
- data.tar.gz: b38a29a2a1cc6322a4d68dcaf0fac4cfa313a844
3
+ metadata.gz: 2fc13836cc1019dd8809a6c48a7a6cdffa8bb088
4
+ data.tar.gz: 69b8a922e062ec32047dfe1926f1a5cb54845d16
5
5
  SHA512:
6
- metadata.gz: 26355fc1113fcfa91eb8fd68c41df77f89e9b40f99a5b0584c089a8e73dc191a42b6c80ff3c9698e8ff737ddf7444e0d24c542e632376dd5a8bfecdb01735c04
7
- data.tar.gz: 4d62a7c6a8d20c29e582fa9b938e9fcfc70265cccebfbb6130acba04b39c32d81d6eb9d1deb89b6f4e64814d987f58a0a5d08c0fdbe7b0423a54bf2a5f4a7171
6
+ metadata.gz: b3289282345c068a6e0524bc4662e0c9764a1b8bf224fcf8ec24e30e122b5093a9cf2a2d8f1e63b31b309bd67691a8b01b152aecb7e4892c1ad6aeb787c6945d
7
+ data.tar.gz: 65a04db0306934823048101fe592acb883003358a38d4c329cf28278618c8c6e79c11bc394d636a950b71aedc2f072b4726f9ffb56edeec00e790aad80250250
@@ -0,0 +1,54 @@
1
+ require "json"
2
+ require "set"
3
+
4
module Classify
  # Naive Bayes classifier over sparse feature vectors.
  #
  # A feature vector is a Hash mapping a feature to a numeric weight. Each
  # (label, feature) cell starts at the smoothing constant @c, and the
  # per-label denominator is padded with @c for every distinct feature seen.
  class NB
    def initialize
      @labels = {}         # label => { "xs" => per-feature totals, "N" => examples, "nX" => feature mass, "sF" => see wrapup }
      @features = Set.new  # every feature seen under any label
      @nF = 0.0            # number of distinct features (refreshed by wrapup)
      @nL = 0.0            # number of training examples (refreshed by wrapup)
      @c = 0.5             # additive smoothing constant
    end

    # Folds one training example into the model.
    #
    # @param fvect [Hash] feature => numeric weight
    # @param label [Object] class label of the example
    def update!(fvect, label)
      stats = (@labels[label] ||= { "xs" => {}, "N" => 0 })
      # Seed the mass even for an empty vector so classify never meets a nil "nX".
      stats["nX"] ||= @c
      fvect.each do |feature, weight|
        @features << feature
        stats["xs"][feature] = (stats["xs"][feature] || @c) + weight
        stats["nX"] += weight
      end
      stats["N"] += 1
      wrapup
    end

    # Scores a feature vector against every known label.
    #
    # @param fvect [Hash] feature => numeric weight
    # @return [Hash] label => log-score (higher is more likely)
    def classify(fvect)
      @labels.each_with_object({}) do |(label, stats), scores|
        # `|| @c` also covers models deserialized from dumps that lack "nX".
        denominator = (stats["nX"] || @c) + @c * @nF
        likelihood = fvect.keys.inject(0.0) do |sum, feature|
          sum + fvect[feature] * Math.log((stats["xs"][feature] || @c) / denominator)
        end
        prior = Math.log(stats["N"] / @nL) # here no smoothing
        scores[label] = likelihood + prior
      end
    end

    # Serializes the model. Accepts (and forwards) the generator state so the
    # object can also be nested inside other structures passed to JSON.generate.
    #
    # @return [String] JSON document; "id" is a fresh random/time-based tag
    def to_json(*args)
      {
        "id" => "#{rand(10000)}#{Time.now.to_i}",
        "labels" => @labels,
        "F" => @features.to_a,
        "nf" => @nF,
        "nl" => @nL,
        "c" => @c
      }.to_json(*args)
    end

    # Rebuilds a model from the output of #to_json.
    #
    # @param json [String]
    # @return [NB]
    def self.from_json(json)
      parsed = JSON.parse(json)
      model = new
      model.instance_variable_set("@labels", parsed["labels"])
      model.instance_variable_set("@features", Set.new(parsed["F"]))
      model.instance_variable_set("@nF", parsed["nf"])
      model.instance_variable_set("@nL", parsed["nl"])
      # The smoothing constant is part of the dump; keep the default only when it is absent.
      model.instance_variable_set("@c", parsed["c"]) unless parsed["c"].nil?
      model
    end

    private

    # Refreshes the aggregate counters after an update.
    def wrapup
      @nF = @features.size
      @nL = @labels.keys.inject(0.0) { |sum, label| sum + @labels[label]["N"] }
      @labels.keys.each { |label| @labels[label]["sF"] = @labels[label]["N"] + @c * @nF }
    end
  end
end
@@ -0,0 +1,76 @@
1
+ require File.dirname(__FILE__)+'/sparsevect.rb'
2
+
3
+ module Cluster
4
# Online clusterer that keeps a bounded list of centroids.
#
# Each observation updates the nearest centroid; once the list has reached
# the bound, the two closest centroids are merged, and the observation then
# seeds a fresh centroid. Instances must respond to `dist`; centroids are
# Centroid objects (x, n, update!, merge!).
class AddC
  # @param upperBoundOnNumClusters [Integer] maximum number of centroids kept
  def initialize(upperBoundOnNumClusters)
    @k_max = upperBoundOnNumClusters
    @centroids = []
  end

  # Feeds one instance to the clusterer.
  #
  # @param instance [#dist] the observation
  # @return [AddC] self, on every call
  def observe!(instance)
    if @centroids.empty?
      @centroids << Centroid.new(instance)
      return self
    end

    # sort_by! evaluates each distance once; nearest centroid ends up first.
    @centroids.sort_by! { |centroid| instance.dist(centroid.x) }
    @centroids.first.update!(instance)

    merge_closest_pair! if @centroids.size >= @k_max
    # Only false when k_max <= 1, where no merge can free a slot.
    @centroids << Centroid.new(instance) if @centroids.size < @k_max

    self
  end

  # Prunes centroids that absorbed fewer than the requested number of
  # instances. A pruned centroid with n > 0 is merged into the nearest
  # remaining centroid; one with n == 0, or with no centroid left to merge
  # into, is dropped.
  #
  # @param min_num_instances_in_cluster [Integer]
  # @return [Array<Centroid>] the surviving centroids
  def getCentroids(min_num_instances_in_cluster = 2)
    # `-=` rebinds @centroids to a new array, so this loop walks a stable snapshot.
    @centroids.each do |small|
      next if small.n >= min_num_instances_in_cluster

      @centroids -= [small]
      next if small.n == 0 || @centroids.empty?

      @centroids.min_by { |other| other.x.dist(small.x) }.merge!(small)
    end
    @centroids
  end

  private

  # Merges the two mutually closest centroids into one; no-op with fewer than two.
  def merge_closest_pair!
    return if @centroids.size < 2

    keep, drop = (0...@centroids.size).to_a.combination(2).min_by do |i, j|
      @centroids[i].x.dist(@centroids[j].x)
    end
    absorbed = @centroids[drop]
    @centroids[keep].merge!(absorbed)
    @centroids -= [absorbed]
  end
end
55
+
56
# A cluster representative: a position vector `x` plus the number `n` of
# instances folded into it via update!/merge!.
#
# Note: initialize does not call SparseVector#initialize, so the inherited
# `attr` stays nil; all vector arithmetic goes through `x`.
class Centroid < SparseVector
  attr_accessor :x, :n

  # @param x [SparseVector] starting position; it is not counted in `n`
  def initialize(x)
    @x = x
    @n = 0
  end

  # Moves the centroid towards newX by 1/(n+1) of the gap (running mean).
  #
  # @param newX [SparseVector]
  # @return [Centroid] self
  def update!(newX)
    # The step must point from the centroid to the new instance.
    @x += (newX - @x).mult_scalar(1.0 / (@n + 1))
    @n += 1
    self
  end

  # Absorbs another centroid: positions are averaged weighted by their counts.
  #
  # @param centroid [Centroid]
  # @return [Centroid] self
  def merge!(centroid)
    total = @n + centroid.n
    @x = if total.zero?
           # Neither side has absorbed anything yet: weights are undefined, use the midpoint.
           (@x + centroid.x).mult_scalar(0.5)
         else
           (@x.mult_scalar(@n) + centroid.x.mult_scalar(centroid.n)).mult_scalar(1.0 / total)
         end
    @n = total
    self
  end
end
75
+
76
+ end
@@ -0,0 +1,154 @@
1
+ module Cluster
2
# Accumulates unigram and bigram counts from word sequences (digest!) and
# grows multi-word n-grams by joining a word with those followers whose score
# exceeds a sampled null threshold (calculate_ngrams / sig_bigrams / null_score).
+ class Grammy
3
+
4
# @word        word (or joined n-gram) => count
# @word_next   word => number of times it was seen as the second word of a pair
# @word_bigram word => { following word => count }
# @perms       cache of null_score results, keyed by count / perm_hash
+ def initialize
5
+ @word = Hash.new
6
+ @word_next = Hash.new
7
+ @word_bigram = Hash.new
8
+ @perms = Hash.new
9
+ end
10
+
11
# Adds the counts of one word sequence: every element bumps @word, every
# adjacent pair bumps @word_bigram[w][next_w] and @word_next[next_w].
+ def digest!( word_seq_array )
12
+ (0..word_seq_array.size-1).each do |i|
13
+ w = word_seq_array[i]
14
+ @word[ w ] ||= 0
15
+ @word[ w ] += 1
16
# the last element has no follower
+ next if i == word_seq_array.size-1
17
+ next_w = word_seq_array[i+1]
18
+ @word_bigram[ w ] ||= {}
19
+ @word_bigram[ w ][next_w] ||= 0
20
+ @word_bigram[ w ][next_w] += 1
21
+ @word_next[ next_w] ||= 0
22
+ @word_next[ next_w ] += 1
23
+ end
24
+ end
25
+
26
# Runs calculate_ngrams with its defaults and returns the surviving :w
# entries as [ngram, count] pairs sorted by descending count.
+ def extract
27
+ calculate_ngrams()[ :w ].sort{|x1,x2| x2.last <=> x1.last}
28
+ end
29
+
30
# Performs `depth` passes; pass i uses cutoffs[i] (nil once i runs past the
# end of cutoffs). In each pass every current key is offered to sig_bigrams
# and joined with each significant follower into "uni x".
# Returns the Hash { :w => counts, :wb => follower tables }.
# NOTE: destructive. delete_if prunes @word in place (entries with count <=
# cutoffs.first), and a[:w] / a[:wb] are the very same Hash objects as
# @word / @word_bigram, so everything added or deleted here stays in the
# instance state for later passes and later calls.
+ def calculate_ngrams( depth=5, cutoffs=[2,2,1,1,1] )
31
+ a = { :w => @word.delete_if{|key, value| value <= cutoffs.first } , :wb => @word_bigram } #{ :w=>{}, :wb=>{} }
32
+ depth.times do |i|
33
+ cutoff = cutoffs[ i ]
34
+ @word = a[:w]
35
+ @word_bigram = a[:wb]
36
# the block parameter `a` shadows the outer `a`; keys is a snapshot, so
# entries added during the fold are first visited in the next pass
+ a = a[:w].keys.inject( a ) do |a, uni|
37
+ cs = sig_bigrams(uni, cutoff)
38
+ cs.keys.each do |x|
39
+ new_uni = "#{uni} #{x}"
40
# the joined n-gram takes the count of the (uni, x) bigram; the trailing
# rescue modifiers on this line and the next swallow any StandardError
+ a[:w][new_uni] = a[:wb][uni][x] rescue 0;
41
# follower table of the joined n-gram: count(x, z) scaled by
# count(uni, x) / @word_next[x], truncated to an integer
+ a[:wb][x].keys.each{|z| a[:wb][new_uni] ||= {}; a[:wb][new_uni][z] ||= {}; a[:wb][new_uni][z] = ( (a[:wb][uni][x]/@word_next[x].to_f)* (a[:wb][x][z]||0) ).to_i } rescue ""
42
+ end
43
# drop uni once it has been extended, or when its count is below the cutoff
+ a[:w].delete(uni) if cs.size > 0 or a[:w][uni] < cutoff
44
+ a
45
+ end
46
+ end
47
+ a
48
+ end
49
+
50
+
51
# Returns { follower => score } for the followers of `word` whose bigram
# count is at least `min` and whose score is above null_score. Empty Hash when
# `word` has no followers. null_score is called with pvalue 0.1 and bucket
# size 10 and is given the unigram table (@word).
+ def sig_bigrams(word, min)
52
+ return { } if @word_bigram[ word ].nil?||@word_bigram[ word ].empty?
53
+
54
+ total = @word.values.inject(:+)
55
+ count = @word_bigram[word].values.inject(:+)
56
+ sig_big = { }
57
+ scores = word_scores( count, @word, @word_bigram[word], total, min )
58
+ scores.to_a.sort{|wc,zc| zc[1] <=> wc[1] }.each do |w,c|
59
+ next if @word_bigram[word][w] < min
60
+ null_score = null_score( count, @word, total, 0.1, 10 )
61
+ sig_big[w] = c if c > null_score
62
+ end
63
+ sig_big
64
+ end
65
+
66
# Scores every key v of `bigram` whose count is at least min_count and returns
# { v => score }. With uni = unigram[v] and big = bigram[v] (both defaulting to 0):
#   log_pi_vu    = log(big / count)
#   log_pi_vnu   = log((uni - big) / (total - big))
#   log_pi_v_old = log(uni / total)
# all taken through safelog.
# NOTE(review): the combination looks like a log-likelihood-ratio style
# statistic -- confirm against the intended formula.
+ def word_scores( count, unigram, bigram, total, min_count )
67
+ val = Hash.new
68
+ bigram.keys.each do |v|
69
+ uni = unigram[v]||0
70
+ big = bigram[v]||0
71
+ next if big < min_count
72
+
73
+ log_pi_vu = safelog(big) - safelog(count)
74
+ log_pi_vnu = safelog(uni - big) - safelog(total - big)
75
+ log_pi_v_old = safelog(uni) - safelog(total)
76
+ log_1mp_v = safelog(1 - Math.exp(log_pi_vnu))
77
+ log_1mp_vu = safelog(1 - Math.exp(log_pi_vu))
78
+
79
+ val[v] = 2 * (big * log_pi_vu + \
80
+ (uni - big) * log_pi_vnu - \
81
+ uni * log_pi_v_old + \
82
+ (count - big) * (log_1mp_vu - log_1mp_v))
83
+ end
84
+ val
85
+ end
86
+
87
# Threshold a score has to beat: the largest per-round maximum of word_scores
# over sampled tables. Results are cached in @perms under count / perm_hash,
# so every count in the same bucket reuses the first value computed for it.
# The parameter named `bigram` is the table handed in by the caller
# (sig_bigrams passes the unigram table). The range 0..nperm is inclusive,
# i.e. nperm + 1 rounds. `table` is only used by the commented-out call.
# Raises NoMethodError if a round produces no scores (values.max is nil).
+ def null_score( count, bigram, total, pvalue, perm_hash )
88
+
89
+ perm_key = count/perm_hash # int div ..
90
+
91
+ return @perms[perm_key] if @perms.has_key? perm_key
92
+
93
+ max_score = 0
94
+ nperm = (1.0 / pvalue).to_i
95
+ table = bigram.to_a.sort{|a,b| b[1]<=>a[1]}
96
+ (0..nperm).each do |perm|
97
+ #perm_bigram = sample_no_replace(total, table, count)
98
+ perm_bigram = new_sample_no_replace(total, bigram, count)
99
+ obs_score = word_scores(count, bigram, perm_bigram, total, 1)
100
+ obs_score = obs_score.values.max
101
# round 0 always overwrites the initial 0, even with a negative score
+ max_score = obs_score if (obs_score > max_score or perm == 0)
102
+ end
103
+ @perms[perm_key] = max_score
104
+
105
+ max_score
106
+ end
107
+
108
# Math.log(x) for positive x, -1000000 for 0; a negative x is returned unchanged.
+ def safelog x
109
+ x< 0 ? x : x==0? -1000000 : Math.log( x )
110
+ end
111
+
112
# Builds a CDFast over `table`, draws with cdf.sample(nitems) and tallies the
# draws into { drawn value => times drawn }. `total` is not used.
+ def new_sample_no_replace(total, table, nitems)
113
+ cdf = CDFast.new table
114
+
115
+ cdf.sample( nitems ).inject( {} ){|h,x| h[ x ] ||= 0; h[x] +=1; h}
116
+ end
117
+
118
# Older sampler, referenced only from the commented-out line in null_score:
# picks nitems distinct positions from 0..total and tallies the table entry
# each position maps to (see nth_item_from_table).
+ def sample_no_replace(total, table, nitems)
119
+ sample = (0..total).to_a.sample( nitems )
120
+ count = {}
121
+ sample.each do |n|
122
+ w = nth_item_from_table(table, n)
123
+ count[w] ||= 0
124
+ count[w] += 1
125
+ end
126
+ count
127
+ end
128
+
129
# Walks `table` ([item, count] pairs) accumulating the counts and returns the
# first item whose cumulative count exceeds n; falls back to the last item.
+ def nth_item_from_table(table, n)
130
+ sum = 0
131
+ table.each do |wc|
132
+ sum = sum + wc[1]
133
+ return wc[0] if (n < sum) #table is sorted
134
+ end
135
+ table.last.first
136
+ end
137
+ end
138
+
139
+
140
# Weighted sampler over a count table.
#
# The table is expanded into a pool in which every key appears as many times
# as its count, so a uniform draw from the pool is a draw proportional to the
# counts. The pool, not the intermediate fold state, is what gets sampled.
class CDFast
  # @param table [Hash, Array] key => non-negative Integer count (or [key, count] pairs)
  def initialize(table)
    @pool = table.to_a.flat_map { |key, count| Array.new(count, key) }
  end

  # @return [String] printable form of the expanded pool
  def to_s
    "#{@pool}"
  end

  # Draws keys from the pool without replacement.
  #
  # @param tt [Integer, #size] number of draws, or a collection whose size is
  #   the number of draws (Integer#size is the byte width, so it must not be
  #   used for an Integer argument)
  # @return [Array] drawn keys; at most as many as the pool holds
  def sample(tt)
    draws = tt.is_a?(Integer) ? tt : tt.size
    @pool.sample(draws)
  end
end
154
+ end
@@ -0,0 +1,25 @@
1
+ require 'set'
2
+
3
# Sparse numeric vector backed by a Hash of component => value.
# Components missing from the Hash are read as 0.
class SparseVector
  attr_accessor :attr

  # @param attr_map [Hash] component => numeric value (stored by reference)
  def initialize(attr_map)
    @attr = attr_map
  end

  # Euclidean distance to another vector.
  #
  # @param v [SparseVector]
  # @return [Float]
  def dist(v)
    squared = 0
    (@attr.keys | v.attr.keys).each do |key|
      gap = @attr.fetch(key, 0) - v.attr.fetch(key, 0)
      squared += gap * gap
    end
    Math.sqrt(squared)
  end

  # Component-wise difference over the union of both key sets.
  #
  # @param v [SparseVector]
  # @return [SparseVector] new vector; v's components come first in key order
  def -(v)
    difference = {}
    (v.attr.keys | @attr.keys).each do |key|
      difference[key] = @attr.fetch(key, 0) - v.attr.fetch(key, 0)
    end
    SparseVector.new(difference)
  end

  # Component-wise sum over the union of both key sets.
  #
  # @param v [SparseVector]
  # @return [SparseVector] new vector; v's components come first in key order
  def +(v)
    total = {}
    (v.attr.keys | @attr.keys).each do |key|
      total[key] = @attr.fetch(key, 0) + v.attr.fetch(key, 0)
    end
    SparseVector.new(total)
  end

  # Multiplies every stored component by a scalar.
  #
  # @param c [Numeric]
  # @return [SparseVector] new vector with the same components
  def mult_scalar(c)
    scaled = {}
    @attr.each { |key, value| scaled[key] = value * c }
    SparseVector.new(scaled)
  end
end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: lite
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.0.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - ronbee
@@ -17,6 +17,10 @@ extensions: []
17
17
  extra_rdoc_files: []
18
18
  files:
19
19
  - lib/lite.rb
20
+ - lib/lite/classifier.rb
21
+ - lib/lite/cluster.rb
22
+ - lib/lite/ngrams.rb
23
+ - lib/lite/sparsevect.rb
20
24
  homepage: https://github.com/ronbee/lite
21
25
  licenses:
22
26
  - mit