lite 0.0.1 → 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/lite/classifier.rb +54 -0
- data/lib/lite/cluster.rb +76 -0
- data/lib/lite/ngrams.rb +154 -0
- data/lib/lite/sparsevect.rb +25 -0
- metadata +5 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 2fc13836cc1019dd8809a6c48a7a6cdffa8bb088
|
4
|
+
data.tar.gz: 69b8a922e062ec32047dfe1926f1a5cb54845d16
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: b3289282345c068a6e0524bc4662e0c9764a1b8bf224fcf8ec24e30e122b5093a9cf2a2d8f1e63b31b309bd67691a8b01b152aecb7e4892c1ad6aeb787c6945d
|
7
|
+
data.tar.gz: 65a04db0306934823048101fe592acb883003358a38d4c329cf28278618c8c6e79c11bc394d636a950b71aedc2f072b4726f9ffb56edeec00e790aad80250250
|
@@ -0,0 +1,54 @@
|
|
1
|
+
require "json"
|
2
|
+
require "set"
|
3
|
+
|
4
|
+
module Classify
|
5
|
+
|
6
|
+
class NB
|
7
|
+
|
8
|
+
def initialize
|
9
|
+
@labels = {}
|
10
|
+
@features = Set.new
|
11
|
+
@nF = 0.0
|
12
|
+
@nL = 0.0
|
13
|
+
@c = 0.5
|
14
|
+
end
|
15
|
+
|
16
|
+
def update! fvect, label
|
17
|
+
@labels[ label ] ||= { "xs" => {}, "N"=>0 }
|
18
|
+
fvect.each{|k,v| @features<<k; @labels[label]["nX"]||=@c ;@labels[ label ]["xs"][k] ||= @c; @labels[ label ]["xs"][k] += v;@labels[label]["nX"]+=v}
|
19
|
+
@labels[ label ]["N"]+=1
|
20
|
+
wrapup
|
21
|
+
end
|
22
|
+
|
23
|
+
def classify fvect
|
24
|
+
@labels.keys.inject({}) do |aux,y|
|
25
|
+
sx = fvect.keys.inject(0.0){|z, fi| z += fvect[fi] * Math.log( (@labels[y]["xs"][fi]||@c) / (@labels[y]["nX"]+@c*@nF))}
|
26
|
+
sy = Math.log( @labels[y]["N"] / @nL ) # here no smoothing
|
27
|
+
aux[ y ] = sx + sy
|
28
|
+
aux
|
29
|
+
end
|
30
|
+
end
|
31
|
+
|
32
|
+
|
33
|
+
def to_json
|
34
|
+
{ "id" => "#{rand(10000)}#{Time.now.to_i}", "labels"=>@labels, "F"=>@features.to_a, "nf"=>@nF, "nl"=>@nL,"c"=>@c }.to_json
|
35
|
+
end
|
36
|
+
|
37
|
+
def self.from_json json
|
38
|
+
parsed = JSON.parse json
|
39
|
+
c = self.new
|
40
|
+
c.instance_variable_set("@labels", parsed["labels"])
|
41
|
+
c.instance_variable_set("@features", Set.new( parsed["F"] ) )
|
42
|
+
c.instance_variable_set("@nF", parsed["nf"])
|
43
|
+
c.instance_variable_set("@nL", parsed["nl"])
|
44
|
+
c
|
45
|
+
end
|
46
|
+
|
47
|
+
:private
|
48
|
+
def wrapup
|
49
|
+
@nF = @features.size
|
50
|
+
@nL = @labels.keys.inject(0.0){|s,k| s += @labels[k]["N"]}
|
51
|
+
@labels.keys.each{|k| @labels[k]["sF"] = @labels[k]["N"]+@c*@nF}
|
52
|
+
end
|
53
|
+
end
|
54
|
+
end
|
data/lib/lite/cluster.rb
ADDED
@@ -0,0 +1,76 @@
|
|
1
|
+
require File.dirname(__FILE__)+'/sparsevect.rb'

module Cluster
  # AddC-style online clustering: every observation nudges its nearest
  # centroid and spawns a candidate centroid; once the population exceeds
  # the upper bound, the two closest centroids are merged.
  class AddC
    def initialize( upperBoundOnNumClusters )
      @k_max = upperBoundOnNumClusters
      @centroids = []
    end

    # Feed one instance (a SparseVector) into the clustering.
    # Returns self on the very first observation, [] afterwards.
    def observe!( instance )
      if @centroids.size == 0
        @centroids << Centroid.new( instance )
        return self
      end

      # pull the nearest centroid toward the new instance
      @centroids.sort! { |c1, c2| instance.dist(c1.x) <=> instance.dist(c2.x) }
      closest_centroid = @centroids.first
      closest_centroid.update!( instance )

      if( @centroids.size >= @k_max )
        # for each centroid find its nearest neighbour, then merge the
        # globally closest pair to make room
        pairs = []
        @centroids.each_index do |i|
          min_d = 10**20
          min_c = 0
          @centroids.each_index do |j|
            next if i == j
            d = @centroids[i].x.dist( @centroids[j].x )
            min_c = j if d < min_d
            min_d = d if d < min_d
          end
          pairs[i] = [ min_d, i, min_c ]
        end
        pairs.sort! { |x, y| x[0] <=> y[0] }
        merge_info = pairs.first
        @centroids[merge_info[1]].merge!( @centroids[merge_info[2]] )
        @centroids = @centroids - [ @centroids[merge_info[2]] ]
      end
      # every observation also becomes a fresh candidate centroid
      @centroids << Centroid.new( instance )

      []
    end

    # Return the centroids, after folding clusters with fewer than
    # min_num_instances_in_cluster updates into their nearest neighbour
    # (empty candidates are simply dropped).
    def getCentroids( min_num_instances_in_cluster = 2 )
      @centroids.each do |c|
        next if c.n >= min_num_instances_in_cluster
        # BUG FIX: removed stray debug `p c` that printed every pruned
        # centroid to stdout.
        @centroids = @centroids - [ c ]
        next if c.n == 0
        aux = @centroids.inject( {:min_c => @centroids.first, :d => @centroids.first.x.dist( c.x )} ) {|a,cc| cc.nil? || cc.x.dist(c.x) > a[:d] ? a : { :min_c=>cc, :d=>cc.x.dist(c.x)} }
        aux[:min_c].merge! c
      end
      @centroids
    end
  end

  # A cluster centre: position `x` (a SparseVector) plus the number of
  # instances `n` that have been folded into it.
  class Centroid < SparseVector
    attr_accessor :x, :n
    def initialize( x )
      @x = x
      @n = 0
    end

    # Online mean update: mean += (x - mean) / (n + 1).
    # BUG FIX: the original used (@x - newX), which moved the centroid
    # AWAY from the new instance instead of toward it.
    def update!( newX )
      @x += ( newX - @x ).mult_scalar( 1.0/(@n+1) )
      @n += 1
      self
    end

    # Weighted average of the two centres.
    # NOTE(review): divides by (@n + centroid.n) — yields Infinity/NaN if
    # both counts are 0; confirm callers never merge two empty candidates.
    def merge!( centroid )
      @x = ( @x.mult_scalar(@n) + centroid.x.mult_scalar(centroid.n) ).mult_scalar( 1.0 / (@n + centroid.n) )
      @n += centroid.n
      self
    end
  end

end
|
data/lib/lite/ngrams.rb
ADDED
@@ -0,0 +1,154 @@
|
|
1
|
+
# NOTE(review): this file (lib/lite/ngrams.rb) reuses the Cluster namespace
# from cluster.rb — presumably intentional, but worth confirming.
module Cluster
  # Grammy digests word sequences into unigram/bigram counts and extracts
  # statistically significant n-grams by repeatedly promoting significant
  # bigrams ("a b" becomes a new compound unigram), up to a fixed depth.
  class Grammy

    def initialize
      @word = Hash.new         # unigram counts: word => count
      @word_next = Hash.new    # times each word occurs as the SECOND half of a bigram
      @word_bigram = Hash.new  # bigram counts: word => { next_word => count }
      @perms = Hash.new        # memo cache for permutation null scores, keyed by count bucket
    end

    # Accumulate unigram and adjacent-pair bigram counts from one sequence.
    # May be called repeatedly before extracting.
    def digest!( word_seq_array )
      (0..word_seq_array.size-1).each do |i|
        w = word_seq_array[i]
        @word[ w ] ||= 0
        @word[ w ] += 1
        next if i == word_seq_array.size-1 # last word has no successor
        next_w = word_seq_array[i+1]
        @word_bigram[ w ] ||= {}
        @word_bigram[ w ][next_w] ||= 0
        @word_bigram[ w ][next_w] += 1
        @word_next[ next_w] ||= 0
        @word_next[ next_w ] += 1
      end
    end

    # Return the extracted n-grams as [ngram, count] pairs, highest count first.
    def extract
      calculate_ngrams()[ :w ].sort{|x1,x2| x2.last <=> x1.last}
    end

    # Grow n-grams for `depth` rounds; round i keeps only bigrams whose count
    # clears cutoffs[i]. DESTRUCTIVE: rebinds @word/@word_bigram each round
    # and delete_if prunes the unigram table in place.
    def calculate_ngrams( depth=5, cutoffs=[2,2,1,1,1] )
      a = { :w => @word.delete_if{|key, value| value <= cutoffs.first } , :wb => @word_bigram } #{ :w=>{}, :wb=>{} }
      depth.times do |i|
        cutoff = cutoffs[ i ]
        @word = a[:w]
        @word_bigram = a[:wb]
        # NOTE(review): the block parameter `a` shadows the outer `a`; the
        # inject threads the same {:w,:wb} pair through every key.
        a = a[:w].keys.inject( a ) do |a, uni|
          cs = sig_bigrams(uni, cutoff)
          cs.keys.each do |x|
            new_uni = "#{uni} #{x}"
            # promote "uni x" to a unigram; rescue 0 covers missing keys
            a[:w][new_uni] = a[:wb][uni][x] rescue 0;
            # estimate bigram counts for the promoted n-gram by scaling x's
            # successors; the bare `rescue ""` swallows any error here
            a[:wb][x].keys.each{|z| a[:wb][new_uni] ||= {}; a[:wb][new_uni][z] ||= {}; a[:wb][new_uni][z] = ( (a[:wb][uni][x]/@word_next[x].to_f)* (a[:wb][x][z]||0) ).to_i } rescue ""
          end
          # drop the shorter form once it has been promoted or falls under cutoff
          a[:w].delete(uni) if cs.size > 0 or a[:w][uni] < cutoff
          a
        end
      end
      a
    end


    # Keep only the successors of `word` whose score beats a permutation-based
    # null score. Returns { next_word => score }.
    def sig_bigrams(word, min)
      return { } if @word_bigram[ word ].nil?||@word_bigram[ word ].empty?

      total = @word.values.inject(:+)
      count = @word_bigram[word].values.inject(:+)
      sig_big = { }
      scores = word_scores( count, @word, @word_bigram[word], total, min )
      scores.to_a.sort{|wc,zc| zc[1] <=> wc[1] }.each do |w,c|
        next if @word_bigram[word][w] < min
        # local var shadows the method name; the parenthesised call still
        # dispatches to the method. pvalue 0.1 => 10 permutations.
        null_score = null_score( count, @word, total, 0.1, 10 )
        sig_big[w] = c if c > null_score
      end
      sig_big
    end

    # Log-likelihood-ratio-style score for each candidate successor v:
    # contrasts P(v | context u) against P(v) overall. Skips candidates whose
    # bigram count is below min_count.
    def word_scores( count, unigram, bigram, total, min_count )
      val = Hash.new
      bigram.keys.each do |v|
        uni = unigram[v]||0
        big = bigram[v]||0
        next if big < min_count

        log_pi_vu = safelog(big) - safelog(count)               # log P(v | u)
        log_pi_vnu = safelog(uni - big) - safelog(total - big)  # log P(v | not u)
        log_pi_v_old = safelog(uni) - safelog(total)            # log P(v)
        log_1mp_v = safelog(1 - Math.exp(log_pi_vnu))
        log_1mp_vu = safelog(1 - Math.exp(log_pi_vu))

        val[v] = 2 * (big * log_pi_vu + \
          (uni - big) * log_pi_vnu - \
          uni * log_pi_v_old + \
          (count - big) * (log_1mp_vu - log_1mp_v))
      end
      val
    end

    # Estimate the best score obtainable by chance: score (1/pvalue)+1 random
    # resamples and keep the maximum. Memoized per bucket of `count`
    # (integer division by perm_hash).
    # NOTE(review): the second parameter is named `bigram` but the caller in
    # sig_bigrams passes the unigram table @word — confirm intent.
    def null_score( count, bigram, total, pvalue, perm_hash )

      perm_key = count/perm_hash # int div ..

      return @perms[perm_key] if @perms.has_key? perm_key

      max_score = 0
      nperm = (1.0 / pvalue).to_i
      table = bigram.to_a.sort{|a,b| b[1]<=>a[1]}
      (0..nperm).each do |perm|
        #perm_bigram = sample_no_replace(total, table, count)
        perm_bigram = new_sample_no_replace(total, bigram, count)
        obs_score = word_scores(count, bigram, perm_bigram, total, 1)
        obs_score = obs_score.values.max
        max_score = obs_score if (obs_score > max_score or perm == 0)
      end
      @perms[perm_key] = max_score

      max_score
    end

    # log(x) tolerant of degenerate inputs: -1000000 stands in for log(0).
    # NOTE(review): negative x is returned unchanged (not an error) — looks
    # deliberate for the 1-exp(...) terms above, but confirm.
    def safelog x
      x< 0 ? x : x==0? -1000000 : Math.log( x )
    end

    # Draw nitems samples via CDFast and tally them into a Hash of
    # sampled-value => occurrence count.
    def new_sample_no_replace(total, table, nitems)
      cdf = CDFast.new table

      cdf.sample( nitems ).inject( {} ){|h,x| h[ x ] ||= 0; h[x] +=1; h}
    end

    # Older sampler (superseded by new_sample_no_replace, kept for reference):
    # picks nitems distinct positions in 0..total and maps each through the
    # cumulative counts of `table`.
    def sample_no_replace(total, table, nitems)
      sample = (0..total).to_a.sample( nitems )
      count = {}
      sample.each do |n|
        w = nth_item_from_table(table, n)
        count[w] ||= 0
        count[w] += 1
      end
      count
    end

    # Walk the cumulative counts of a [word, count] table and return the word
    # covering position n; falls back to the last word when n overshoots.
    def nth_item_from_table(table, n)
      sum = 0
      table.each do |wc|
        sum = sum + wc[1]
        return wc[0] if (n < sum) #table is sorted
      end
      table.last.first
    end
  end


  # Precomputed table for sampling entry indices proportionally to counts:
  # each entry's count is expanded into that many copies of its index.
  class CDFast
    # NOTE(review): the inject returns the [expanded_array, index] pair, so
    # @a is a 2-element array, NOT the expanded list — @a.size and @a.sample
    # in #sample operate on that pair. Looks like a bug; confirm before use.
    def initialize table
      @a = table.to_a.inject([[], 0]){|a,kv| a[0] += Array.new( kv.last,a[1]); a[1]+=1 ; a}
    end

    def to_s
      "#{@a}"
    end

    # NOTE(review): `tt` is the Integer nitems from new_sample_no_replace, so
    # tt.size is the machine byte-size of the integer (typically 8), not a
    # number of items — confirm the intended semantics.
    def sample tt
      s = tt.size/[@a.size, tt.size].min
      (1..s).to_a.inject([]){|a,x| a += @a.sample(s) }
    end
  end
end
|
@@ -0,0 +1,25 @@
|
|
1
|
+
require 'set'

# A sparse numeric vector backed by a Hash from attribute key to value.
# Keys that are absent are treated as zero. Supports Euclidean distance,
# element-wise addition/subtraction, and scalar multiplication; the
# arithmetic operators return new SparseVector instances.
class SparseVector
  attr_accessor :attr

  # attr_map: Hash of key => numeric value.
  def initialize( attr_map )
    @attr = attr_map
  end

  # Euclidean (L2) distance to another SparseVector, taken over the union
  # of both key sets.
  def dist( v )
    squared = (v.attr.keys + @attr.keys).uniq.inject(0) do |acc, key|
      delta = @attr.fetch(key, 0) - v.attr.fetch(key, 0)
      acc + delta * delta
    end
    Math.sqrt(squared)
  end

  # Element-wise difference (self - v) over the union of key sets.
  def -(v)
    combined = {}
    (v.attr.keys + @attr.keys).uniq.each do |key|
      combined[key] = @attr.fetch(key, 0) - v.attr.fetch(key, 0)
    end
    SparseVector.new(combined)
  end

  # Element-wise sum (self + v) over the union of key sets.
  def +(v)
    combined = {}
    (v.attr.keys + @attr.keys).uniq.each do |key|
      combined[key] = @attr.fetch(key, 0) + v.attr.fetch(key, 0)
    end
    SparseVector.new(combined)
  end

  # New vector with every component multiplied by the scalar c.
  def mult_scalar( c )
    scaled = {}
    @attr.each { |key, value| scaled[key] = value * c }
    SparseVector.new(scaled)
  end
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: lite
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.1
|
4
|
+
version: 0.0.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- ronbee
|
@@ -17,6 +17,10 @@ extensions: []
|
|
17
17
|
extra_rdoc_files: []
|
18
18
|
files:
|
19
19
|
- lib/lite.rb
|
20
|
+
- lib/lite/classifier.rb
|
21
|
+
- lib/lite/cluster.rb
|
22
|
+
- lib/lite/ngrams.rb
|
23
|
+
- lib/lite/sparsevect.rb
|
20
24
|
homepage: https://github.com/ronbee/lite
|
21
25
|
licenses:
|
22
26
|
- mit
|