lite 0.0.1 → 0.0.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/lite/classifier.rb +54 -0
- data/lib/lite/cluster.rb +76 -0
- data/lib/lite/ngrams.rb +154 -0
- data/lib/lite/sparsevect.rb +25 -0
- metadata +5 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 2fc13836cc1019dd8809a6c48a7a6cdffa8bb088
|
4
|
+
data.tar.gz: 69b8a922e062ec32047dfe1926f1a5cb54845d16
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: b3289282345c068a6e0524bc4662e0c9764a1b8bf224fcf8ec24e30e122b5093a9cf2a2d8f1e63b31b309bd67691a8b01b152aecb7e4892c1ad6aeb787c6945d
|
7
|
+
data.tar.gz: 65a04db0306934823048101fe592acb883003358a38d4c329cf28278618c8c6e79c11bc394d636a950b71aedc2f072b4726f9ffb56edeec00e790aad80250250
|
@@ -0,0 +1,54 @@
|
|
1
|
+
require "json"
require "set"

module Classify

  # Multinomial Naive Bayes classifier with additive (Lidstone) smoothing.
  #
  # Feature vectors are plain Hashes of feature-key => numeric count/weight.
  # Training is incremental via #update!; #classify returns unnormalized
  # log-posterior scores per label (higher is better).
  class NB

    def initialize
      @labels = {}         # label => { "xs" => per-feature smoothed counts, "N" => doc count, "nX" => total feature mass }
      @features = Set.new  # vocabulary: every feature key ever observed
      @nF = 0.0            # vocabulary size, refreshed by wrapup
      @nL = 0.0            # total number of training observations, refreshed by wrapup
      @c = 0.5             # additive smoothing constant
    end

    # Absorb one training instance.
    #
    # fvect - Hash of feature => count
    # label - the class label of this instance
    def update! fvect, label
      @labels[ label ] ||= { "xs" => {}, "N" => 0 }
      fvect.each do |k, v|
        @features << k
        @labels[ label ]["nX"] ||= @c       # start the label's mass at the smoothing floor
        @labels[ label ]["xs"][k] ||= @c    # each feature also starts at the smoothing floor
        @labels[ label ]["xs"][k] += v
        @labels[ label ]["nX"] += v
      end
      @labels[ label ]["N"] += 1
      wrapup
    end

    # Score fvect against every known label.
    #
    # Returns a Hash of label => log P(label) + sum_f count_f * log P(f|label),
    # using @c for unseen-feature smoothing.
    def classify fvect
      @labels.keys.inject({}) do |aux, y|
        sx = fvect.keys.inject(0.0) do |z, fi|
          z + fvect[fi] * Math.log( (@labels[y]["xs"][fi] || @c) / (@labels[y]["nX"] + @c * @nF) )
        end
        sy = Math.log( @labels[y]["N"] / @nL ) # class prior; here no smoothing
        aux[ y ] = sx + sy
        aux
      end
    end

    # Serialize the full model state (including the smoothing constant).
    def to_json
      { "id" => "#{rand(10000)}#{Time.now.to_i}", "labels" => @labels, "F" => @features.to_a, "nf" => @nF, "nl" => @nL, "c" => @c }.to_json
    end

    # Rebuild a classifier from the JSON produced by #to_json.
    def self.from_json json
      parsed = JSON.parse json
      c = self.new
      c.instance_variable_set("@labels", parsed["labels"])
      c.instance_variable_set("@features", Set.new( parsed["F"] ) )
      c.instance_variable_set("@nF", parsed["nf"])
      c.instance_variable_set("@nL", parsed["nl"])
      # BUG FIX: @c is serialized by #to_json but was never restored, silently
      # resetting a customized smoothing constant to the default on round-trip.
      c.instance_variable_set("@c", parsed["c"]) if parsed.key?("c")
      c
    end

    # BUG FIX: this was the bare symbol literal `:private` — a no-op expression,
    # not the `private` visibility modifier — so wrapup was unintentionally public.
    private

    # Refresh the cached aggregates after each training observation.
    def wrapup
      @nF = @features.size
      @nL = @labels.keys.inject(0.0){|s,k| s += @labels[k]["N"]}
      @labels.keys.each{|k| @labels[k]["sF"] = @labels[k]["N"]+@c*@nF}
    end
  end
end
|
data/lib/lite/cluster.rb
ADDED
@@ -0,0 +1,76 @@
|
|
1
|
+
require File.dirname(__FILE__)+'/sparsevect.rb'

module Cluster
  # Online (streaming) clustering with a bounded number of centroids:
  # each observation updates the nearest centroid, spawns a new centroid,
  # and, when over capacity, merges the two closest centroids.
  class AddC
    def initialize( upperBoundOnNumClusters )
      @k_max = upperBoundOnNumClusters
      @centroids = []
    end

    # Feed one instance (anything responding to #dist, e.g. a SparseVector).
    # Returns self on the very first observation, [] afterwards.
    def observe!( instance )
      if @centroids.empty?
        @centroids << Centroid.new( instance )
        return self
      end

      # Pull the nearest centroid toward this instance.
      @centroids.sort! {|c1, c2| instance.dist(c1.x) <=> instance.dist(c2.x) }
      @centroids.first.update!( instance )

      # At capacity: merge the globally closest centroid pair to make room.
      if @centroids.size >= @k_max
        pairs = []
        @centroids.each_index do |i|
          min_d = 10**20
          min_c = 0
          @centroids.each_index do |j|
            next if i == j
            d = @centroids[i].x.dist( @centroids[j].x )
            if d < min_d
              min_c = j
              min_d = d
            end
          end
          pairs[i] = [ min_d, i, min_c ]
        end
        merge_info = pairs.min_by(&:first)
        @centroids[merge_info[1]].merge!( @centroids[merge_info[2]] )
        @centroids -= [ @centroids[merge_info[2]] ]
      end
      @centroids << Centroid.new( instance )

      []
    end

    # Drop sparse clusters (fewer than min_num_instances_in_cluster updates),
    # folding their mass into the nearest surviving centroid, and return the rest.
    def getCentroids( min_num_instances_in_cluster = 2 )
      # NOTE: reassigning @centroids does not affect the `each` already in
      # flight (it iterates the original array object).
      @centroids.each do |c|
        next if c.n >= min_num_instances_in_cluster
        # BUG FIX: removed leftover debug output (`p c`) that printed every
        # dropped centroid to stdout.
        @centroids = @centroids - [ c ]
        next if c.n == 0
        # Find the nearest surviving centroid and merge this one into it.
        # NOTE(review): assumes at least one centroid survives; @centroids.first
        # would be nil if every cluster were dropped — confirm callers' usage.
        aux = @centroids.inject( {:min_c => @centroids.first, :d => @centroids.first.x.dist( c.x )} ) {|a,cc| cc.nil? || cc.x.dist(c.x) > a[:d] ? a : { :min_c=>cc, :d=>cc.x.dist(c.x)} }
        aux[:min_c].merge! c
      end
      @centroids
    end
  end

  # A cluster centroid: position @x (a SparseVector) plus the number of
  # instances @n folded into it via #update!.
  # NOTE(review): inherits SparseVector but never initializes @attr and only
  # delegates to @x — the inheritance looks vestigial; kept for compatibility.
  class Centroid < SparseVector
    attr_accessor :x,:n
    def initialize( x )
      @x = x
      @n = 0
    end

    # Incremental running mean: x <- x + (newX - x)/(n+1).
    def update!( newX )
      # BUG FIX: previous code added (@x - newX)/(n+1), which moves the
      # centroid AWAY from the observed instance; the running mean must move
      # toward it.
      @x += ( newX - @x ).mult_scalar( 1.0/(@n+1) )
      @n += 1
      self
    end

    # Weighted average of two centroids; counts accumulate.
    def merge!( centroid )
      @x = ( @x.mult_scalar(@n)+centroid.x.mult_scalar(centroid.n) ).mult_scalar( 1.0 / (@n + centroid.n) )
      @n += centroid.n
      self
    end
  end

end
|
data/lib/lite/ngrams.rb
ADDED
@@ -0,0 +1,154 @@
|
|
1
|
+
module Cluster
  # N-gram phrase extractor: counts unigrams/bigrams over word sequences and
  # iteratively promotes statistically significant bigrams into longer n-grams,
  # using a likelihood-ratio-style score tested against a permutation null.
  class Grammy

    def initialize
      @word = Hash.new         # unigram (and later n-gram) => count
      @word_next = Hash.new    # word => how often it appears as a bigram's second element
      @word_bigram = Hash.new  # word => { following word => count }
      @perms = Hash.new        # cache of permutation null scores, keyed by bucketed count
    end

    # Count unigrams and adjacent-pair bigrams from an array of words.
    # Call repeatedly to accumulate over a corpus.
    def digest!( word_seq_array )
      (0..word_seq_array.size-1).each do |i|
        w = word_seq_array[i]
        @word[ w ] ||= 0
        @word[ w ] += 1
        next if i == word_seq_array.size-1  # last word has no successor
        next_w = word_seq_array[i+1]
        @word_bigram[ w ] ||= {}
        @word_bigram[ w ][next_w] ||= 0
        @word_bigram[ w ][next_w] += 1
        @word_next[ next_w] ||= 0
        @word_next[ next_w ] += 1
      end
    end

    # Convenience: run extraction with defaults and return [ngram, count]
    # pairs sorted by descending count.
    def extract
      calculate_ngrams()[ :w ].sort{|x1,x2| x2.last <=> x1.last}
    end

    # Grow n-grams up to `depth` words, applying per-level minimum-count
    # cutoffs. Returns { :w => ngram counts, :wb => ngram successor counts }.
    # NOTE: destructive — mutates (and reassigns) @word / @word_bigram.
    def calculate_ngrams( depth=5, cutoffs=[2,2,1,1,1] )
      a = { :w => @word.delete_if{|key, value| value <= cutoffs.first } , :wb => @word_bigram } #{ :w=>{}, :wb=>{} }
      depth.times do |i|
        cutoff = cutoffs[ i ]
        @word = a[:w]
        @word_bigram = a[:wb]
        a = a[:w].keys.inject( a ) do |a, uni|
          cs = sig_bigrams(uni, cutoff)  # significant continuations of this n-gram
          cs.keys.each do |x|
            new_uni = "#{uni} #{x}"
            # Promote the bigram to a new (longer) n-gram entry.
            a[:w][new_uni] = a[:wb][uni][x] rescue 0;
            # Estimate the new n-gram's successor counts by scaling x's
            # successors by how often x followed uni.
            a[:wb][x].keys.each{|z| a[:wb][new_uni] ||= {}; a[:wb][new_uni][z] ||= {}; a[:wb][new_uni][z] = ( (a[:wb][uni][x]/@word_next[x].to_f)* (a[:wb][x][z]||0) ).to_i } rescue ""
          end
          # Drop the shorter form once it was extended, or if too rare.
          a[:w].delete(uni) if cs.size > 0 or a[:w][uni] < cutoff
          a
        end
      end
      a
    end


    # Return the continuations of `word` whose association score beats the
    # permutation null (and whose raw count is at least `min`).
    def sig_bigrams(word, min)
      return { } if @word_bigram[ word ].nil?||@word_bigram[ word ].empty?

      total = @word.values.inject(:+)            # corpus mass
      count = @word_bigram[word].values.inject(:+) # occurrences of `word` with a successor
      sig_big = { }
      scores = word_scores( count, @word, @word_bigram[word], total, min )
      scores.to_a.sort{|wc,zc| zc[1] <=> wc[1] }.each do |w,c|
        next if @word_bigram[word][w] < min
        null_score = null_score( count, @word, total, 0.1, 10 )
        sig_big[w] = c if c > null_score
      end
      sig_big
    end

    # Score each candidate successor v: a log-likelihood-ratio-style statistic
    # comparing P(v | word) against the word-independent baseline P(v).
    # (Presumably modeled on Dunning's LLR test — TODO confirm the exact form.)
    def word_scores( count, unigram, bigram, total, min_count )
      val = Hash.new
      bigram.keys.each do |v|
        uni = unigram[v]||0   # overall count of v
        big = bigram[v]||0    # count of v following the focus word
        next if big < min_count

        log_pi_vu = safelog(big) - safelog(count)              # log P(v | word)
        log_pi_vnu = safelog(uni - big) - safelog(total - big) # log P(v | not word)
        log_pi_v_old = safelog(uni) - safelog(total)           # log P(v) baseline
        log_1mp_v = safelog(1 - Math.exp(log_pi_vnu))
        log_1mp_vu = safelog(1 - Math.exp(log_pi_vu))

        val[v] = 2 * (big * log_pi_vu + \
                      (uni - big) * log_pi_vnu - \
                      uni * log_pi_v_old + \
                      (count - big) * (log_1mp_vu - log_1mp_v))
      end
      val
    end

    # Monte-Carlo null: max score obtainable from random successor draws.
    # Cached in @perms, bucketing `count` by `perm_hash` so nearby counts
    # share one estimate. `pvalue` sets the number of permutations (1/pvalue).
    def null_score( count, bigram, total, pvalue, perm_hash )

      perm_key = count/perm_hash # int div ..

      return @perms[perm_key] if @perms.has_key? perm_key

      max_score = 0
      nperm = (1.0 / pvalue).to_i
      table = bigram.to_a.sort{|a,b| b[1]<=>a[1]}
      (0..nperm).each do |perm|
        #perm_bigram = sample_no_replace(total, table, count)
        perm_bigram = new_sample_no_replace(total, bigram, count)
        obs_score = word_scores(count, bigram, perm_bigram, total, 1)
        obs_score = obs_score.values.max
        max_score = obs_score if (obs_score > max_score or perm == 0)
      end
      @perms[perm_key] = max_score

      max_score
    end

    # log(x) that tolerates degenerate inputs: 0 maps to a large negative
    # constant; negative x is returned unchanged (NOTE(review): returning x
    # itself for x < 0 looks odd — confirm intent vs. returning -x or raising).
    def safelog x
      x< 0 ? x : x==0? -1000000 : Math.log( x )
    end

    # Draw `nitems` samples from `table`'s count distribution via CDFast and
    # tally them into a Hash of sample => frequency.
    def new_sample_no_replace(total, table, nitems)
      cdf = CDFast.new table

      cdf.sample( nitems ).inject( {} ){|h,x| h[ x ] ||= 0; h[x] +=1; h}
    end

    # Older sampling path (superseded by new_sample_no_replace; kept, and
    # still referenced from the commented-out line in null_score).
    def sample_no_replace(total, table, nitems)
      sample = (0..total).to_a.sample( nitems )
      count = {}
      sample.each do |n|
        w = nth_item_from_table(table, n)
        count[w] ||= 0
        count[w] += 1
      end
      count
    end

    # Map a position n in the cumulative count space onto the table entry
    # covering it; table must be sorted by descending count.
    def nth_item_from_table(table, n)
      sum = 0
      table.each do |wc|
        sum = sum + wc[1]
        return wc[0] if (n < sum) #table is sorted
      end
      table.last.first  # fallback: n beyond the total mass
    end
  end


  # Pre-expanded sampling table: entry i of the input contributes count_i
  # copies of its index, so uniform sampling approximates the count
  # distribution.
  class CDFast
    def initialize table
      # NOTE(review): the inject result is the PAIR [expanded_array, num_keys],
      # so @a is a 2-element array, not the expanded array — `sample` below
      # then samples from that pair. A trailing `.first` looks intended;
      # preserved as-is pending confirmation.
      @a = table.to_a.inject([[], 0]){|a,kv| a[0] += Array.new( kv.last,a[1]); a[1]+=1 ; a}
    end

    def to_s
      "#{@a}"
    end

    # Draw roughly tt.size values (in batches of s) from @a.
    def sample tt
      s = tt.size/[@a.size, tt.size].min
      (1..s).to_a.inject([]){|a,x| a += @a.sample(s) }
    end
  end
end
|
@@ -0,0 +1,25 @@
|
|
1
|
+
require 'set'

# Sparse real-valued vector backed by a Hash of key => value.
# Missing keys are treated as 0 in every operation.
class SparseVector
  attr_accessor :attr

  # attr_map - Hash of dimension key => numeric value
  def initialize( attr_map )
    @attr = attr_map
  end

  # Euclidean distance to another SparseVector, over the union of keys.
  def dist( v )
    keys = Set.new( @attr.keys + v.attr.keys )
    Math.sqrt( keys.inject(0) { |sum, k| d = @attr.fetch(k, 0) - v.attr.fetch(k, 0); sum + d * d } )
  end

  # Element-wise difference; returns a new SparseVector.
  def -(v)
    combine(v) { |a, b| a - b }
  end

  # Element-wise sum; returns a new SparseVector.
  def +(v)
    combine(v) { |a, b| a + b }
  end

  # Scale every component by c; returns a new SparseVector.
  def mult_scalar( c )
    SparseVector.new( @attr.inject( { } ) { |acc, (k, val)| acc[k] = val * c; acc } )
  end

  private

  # Build a new vector by applying the block to each pair of components
  # over the union of keys (missing components default to 0).
  # Extracted to remove the duplicated merge logic that + and - carried.
  def combine(v)
    keys = Set.new( @attr.keys + v.attr.keys )
    SparseVector.new( keys.inject( { } ) { |acc, k| acc[k] = yield(@attr.fetch(k, 0), v.attr.fetch(k, 0)); acc } )
  end
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: lite
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.1
|
4
|
+
version: 0.0.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- ronbee
|
@@ -17,6 +17,10 @@ extensions: []
|
|
17
17
|
extra_rdoc_files: []
|
18
18
|
files:
|
19
19
|
- lib/lite.rb
|
20
|
+
- lib/lite/classifier.rb
|
21
|
+
- lib/lite/cluster.rb
|
22
|
+
- lib/lite/ngrams.rb
|
23
|
+
- lib/lite/sparsevect.rb
|
20
24
|
homepage: https://github.com/ronbee/lite
|
21
25
|
licenses:
|
22
26
|
- mit
|