lite 0.0.1 → 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/lite/classifier.rb +54 -0
- data/lib/lite/cluster.rb +76 -0
- data/lib/lite/ngrams.rb +154 -0
- data/lib/lite/sparsevect.rb +25 -0
- metadata +5 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 2fc13836cc1019dd8809a6c48a7a6cdffa8bb088
|
4
|
+
data.tar.gz: 69b8a922e062ec32047dfe1926f1a5cb54845d16
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: b3289282345c068a6e0524bc4662e0c9764a1b8bf224fcf8ec24e30e122b5093a9cf2a2d8f1e63b31b309bd67691a8b01b152aecb7e4892c1ad6aeb787c6945d
|
7
|
+
data.tar.gz: 65a04db0306934823048101fe592acb883003358a38d4c329cf28278618c8c6e79c11bc394d636a950b71aedc2f072b4726f9ffb56edeec00e790aad80250250
|
@@ -0,0 +1,54 @@
|
|
1
|
+
require "json"
|
2
|
+
require "set"
|
3
|
+
|
4
|
+
module Classify
|
5
|
+
|
6
|
+
class NB
|
7
|
+
|
8
|
+
def initialize
|
9
|
+
@labels = {}
|
10
|
+
@features = Set.new
|
11
|
+
@nF = 0.0
|
12
|
+
@nL = 0.0
|
13
|
+
@c = 0.5
|
14
|
+
end
|
15
|
+
|
16
|
+
def update! fvect, label
|
17
|
+
@labels[ label ] ||= { "xs" => {}, "N"=>0 }
|
18
|
+
fvect.each{|k,v| @features<<k; @labels[label]["nX"]||=@c ;@labels[ label ]["xs"][k] ||= @c; @labels[ label ]["xs"][k] += v;@labels[label]["nX"]+=v}
|
19
|
+
@labels[ label ]["N"]+=1
|
20
|
+
wrapup
|
21
|
+
end
|
22
|
+
|
23
|
+
def classify fvect
|
24
|
+
@labels.keys.inject({}) do |aux,y|
|
25
|
+
sx = fvect.keys.inject(0.0){|z, fi| z += fvect[fi] * Math.log( (@labels[y]["xs"][fi]||@c) / (@labels[y]["nX"]+@c*@nF))}
|
26
|
+
sy = Math.log( @labels[y]["N"] / @nL ) # here no smoothing
|
27
|
+
aux[ y ] = sx + sy
|
28
|
+
aux
|
29
|
+
end
|
30
|
+
end
|
31
|
+
|
32
|
+
|
33
|
+
def to_json
|
34
|
+
{ "id" => "#{rand(10000)}#{Time.now.to_i}", "labels"=>@labels, "F"=>@features.to_a, "nf"=>@nF, "nl"=>@nL,"c"=>@c }.to_json
|
35
|
+
end
|
36
|
+
|
37
|
+
def self.from_json json
|
38
|
+
parsed = JSON.parse json
|
39
|
+
c = self.new
|
40
|
+
c.instance_variable_set("@labels", parsed["labels"])
|
41
|
+
c.instance_variable_set("@features", Set.new( parsed["F"] ) )
|
42
|
+
c.instance_variable_set("@nF", parsed["nf"])
|
43
|
+
c.instance_variable_set("@nL", parsed["nl"])
|
44
|
+
c
|
45
|
+
end
|
46
|
+
|
47
|
+
:private
|
48
|
+
def wrapup
|
49
|
+
@nF = @features.size
|
50
|
+
@nL = @labels.keys.inject(0.0){|s,k| s += @labels[k]["N"]}
|
51
|
+
@labels.keys.each{|k| @labels[k]["sF"] = @labels[k]["N"]+@c*@nF}
|
52
|
+
end
|
53
|
+
end
|
54
|
+
end
|
data/lib/lite/cluster.rb
ADDED
@@ -0,0 +1,76 @@
|
|
1
|
+
require File.dirname(__FILE__)+'/sparsevect.rb'

module Cluster
  # AddC-style online clustering: every observation nudges its nearest
  # centroid and spawns a candidate centroid; once the population exceeds
  # the upper bound, the two closest centroids are merged.
  class AddC
    def initialize( upperBoundOnNumClusters )
      @k_max = upperBoundOnNumClusters
      @centroids = []
    end

    # Feed one instance (a SparseVector) into the clustering.
    # Returns self on the very first observation, [] afterwards.
    def observe!( instance )
      if @centroids.size == 0
        @centroids << Centroid.new( instance )
        return self
      end

      # pull the nearest centroid toward the new instance
      @centroids.sort! { |c1, c2| instance.dist(c1.x) <=> instance.dist(c2.x) }
      closest_centroid = @centroids.first
      closest_centroid.update!( instance )

      if( @centroids.size >= @k_max )
        # for each centroid find its nearest neighbour, then merge the
        # globally closest pair to make room
        pairs = []
        @centroids.each_index do |i|
          min_d = 10**20
          min_c = 0
          @centroids.each_index do |j|
            next if i == j
            d = @centroids[i].x.dist( @centroids[j].x )
            min_c = j if d < min_d
            min_d = d if d < min_d
          end
          pairs[i] = [ min_d, i, min_c ]
        end
        pairs.sort! { |x, y| x[0] <=> y[0] }
        merge_info = pairs.first
        @centroids[merge_info[1]].merge!( @centroids[merge_info[2]] )
        @centroids = @centroids - [ @centroids[merge_info[2]] ]
      end
      # every observation also becomes a fresh candidate centroid
      @centroids << Centroid.new( instance )

      []
    end

    # Return the centroids, after folding clusters with fewer than
    # min_num_instances_in_cluster updates into their nearest neighbour
    # (empty candidates are simply dropped).
    def getCentroids( min_num_instances_in_cluster = 2 )
      @centroids.each do |c|
        next if c.n >= min_num_instances_in_cluster
        # BUG FIX: removed stray debug `p c` that printed every pruned
        # centroid to stdout.
        @centroids = @centroids - [ c ]
        next if c.n == 0
        aux = @centroids.inject( {:min_c => @centroids.first, :d => @centroids.first.x.dist( c.x )} ) {|a,cc| cc.nil? || cc.x.dist(c.x) > a[:d] ? a : { :min_c=>cc, :d=>cc.x.dist(c.x)} }
        aux[:min_c].merge! c
      end
      @centroids
    end
  end

  # A cluster centre: position `x` (a SparseVector) plus the number of
  # instances `n` that have been folded into it.
  class Centroid < SparseVector
    attr_accessor :x, :n
    def initialize( x )
      @x = x
      @n = 0
    end

    # Online mean update: mean += (x - mean) / (n + 1).
    # BUG FIX: the original used (@x - newX), which moved the centroid
    # AWAY from the new instance instead of toward it.
    def update!( newX )
      @x += ( newX - @x ).mult_scalar( 1.0/(@n+1) )
      @n += 1
      self
    end

    # Weighted average of the two centres.
    # NOTE(review): divides by (@n + centroid.n) — yields Infinity/NaN if
    # both counts are 0; confirm callers never merge two empty candidates.
    def merge!( centroid )
      @x = ( @x.mult_scalar(@n) + centroid.x.mult_scalar(centroid.n) ).mult_scalar( 1.0 / (@n + centroid.n) )
      @n += centroid.n
      self
    end
  end

end
|
data/lib/lite/ngrams.rb
ADDED
@@ -0,0 +1,154 @@
|
|
1
|
+
# NOTE(review): this file (lib/lite/ngrams.rb) reuses the Cluster namespace
# from cluster.rb — presumably intentional, but worth confirming.
module Cluster
  # Grammy digests word sequences into unigram/bigram counts and extracts
  # statistically significant n-grams by repeatedly promoting significant
  # bigrams ("a b" becomes a new compound unigram), up to a fixed depth.
  class Grammy

    def initialize
      @word = Hash.new         # unigram counts: word => count
      @word_next = Hash.new    # times each word occurs as the SECOND half of a bigram
      @word_bigram = Hash.new  # bigram counts: word => { next_word => count }
      @perms = Hash.new        # memo cache for permutation null scores, keyed by count bucket
    end

    # Accumulate unigram and adjacent-pair bigram counts from one sequence.
    # May be called repeatedly before extracting.
    def digest!( word_seq_array )
      (0..word_seq_array.size-1).each do |i|
        w = word_seq_array[i]
        @word[ w ] ||= 0
        @word[ w ] += 1
        next if i == word_seq_array.size-1 # last word has no successor
        next_w = word_seq_array[i+1]
        @word_bigram[ w ] ||= {}
        @word_bigram[ w ][next_w] ||= 0
        @word_bigram[ w ][next_w] += 1
        @word_next[ next_w] ||= 0
        @word_next[ next_w ] += 1
      end
    end

    # Return the extracted n-grams as [ngram, count] pairs, highest count first.
    def extract
      calculate_ngrams()[ :w ].sort{|x1,x2| x2.last <=> x1.last}
    end

    # Grow n-grams for `depth` rounds; round i keeps only bigrams whose count
    # clears cutoffs[i]. DESTRUCTIVE: rebinds @word/@word_bigram each round
    # and delete_if prunes the unigram table in place.
    def calculate_ngrams( depth=5, cutoffs=[2,2,1,1,1] )
      a = { :w => @word.delete_if{|key, value| value <= cutoffs.first } , :wb => @word_bigram } #{ :w=>{}, :wb=>{} }
      depth.times do |i|
        cutoff = cutoffs[ i ]
        @word = a[:w]
        @word_bigram = a[:wb]
        # NOTE(review): the block parameter `a` shadows the outer `a`; the
        # inject threads the same {:w,:wb} pair through every key.
        a = a[:w].keys.inject( a ) do |a, uni|
          cs = sig_bigrams(uni, cutoff)
          cs.keys.each do |x|
            new_uni = "#{uni} #{x}"
            # promote "uni x" to a unigram; rescue 0 covers missing keys
            a[:w][new_uni] = a[:wb][uni][x] rescue 0;
            # estimate bigram counts for the promoted n-gram by scaling x's
            # successors; the bare `rescue ""` swallows any error here
            a[:wb][x].keys.each{|z| a[:wb][new_uni] ||= {}; a[:wb][new_uni][z] ||= {}; a[:wb][new_uni][z] = ( (a[:wb][uni][x]/@word_next[x].to_f)* (a[:wb][x][z]||0) ).to_i } rescue ""
          end
          # drop the shorter form once it has been promoted or falls under cutoff
          a[:w].delete(uni) if cs.size > 0 or a[:w][uni] < cutoff
          a
        end
      end
      a
    end


    # Keep only the successors of `word` whose score beats a permutation-based
    # null score. Returns { next_word => score }.
    def sig_bigrams(word, min)
      return { } if @word_bigram[ word ].nil?||@word_bigram[ word ].empty?

      total = @word.values.inject(:+)
      count = @word_bigram[word].values.inject(:+)
      sig_big = { }
      scores = word_scores( count, @word, @word_bigram[word], total, min )
      scores.to_a.sort{|wc,zc| zc[1] <=> wc[1] }.each do |w,c|
        next if @word_bigram[word][w] < min
        # local var shadows the method name; the parenthesised call still
        # dispatches to the method. pvalue 0.1 => 10 permutations.
        null_score = null_score( count, @word, total, 0.1, 10 )
        sig_big[w] = c if c > null_score
      end
      sig_big
    end

    # Log-likelihood-ratio-style score for each candidate successor v:
    # contrasts P(v | context u) against P(v) overall. Skips candidates whose
    # bigram count is below min_count.
    def word_scores( count, unigram, bigram, total, min_count )
      val = Hash.new
      bigram.keys.each do |v|
        uni = unigram[v]||0
        big = bigram[v]||0
        next if big < min_count

        log_pi_vu = safelog(big) - safelog(count)               # log P(v | u)
        log_pi_vnu = safelog(uni - big) - safelog(total - big)  # log P(v | not u)
        log_pi_v_old = safelog(uni) - safelog(total)            # log P(v)
        log_1mp_v = safelog(1 - Math.exp(log_pi_vnu))
        log_1mp_vu = safelog(1 - Math.exp(log_pi_vu))

        val[v] = 2 * (big * log_pi_vu + \
          (uni - big) * log_pi_vnu - \
          uni * log_pi_v_old + \
          (count - big) * (log_1mp_vu - log_1mp_v))
      end
      val
    end

    # Estimate the best score obtainable by chance: score (1/pvalue)+1 random
    # resamples and keep the maximum. Memoized per bucket of `count`
    # (integer division by perm_hash).
    # NOTE(review): the second parameter is named `bigram` but the caller in
    # sig_bigrams passes the unigram table @word — confirm intent.
    def null_score( count, bigram, total, pvalue, perm_hash )

      perm_key = count/perm_hash # int div ..

      return @perms[perm_key] if @perms.has_key? perm_key

      max_score = 0
      nperm = (1.0 / pvalue).to_i
      table = bigram.to_a.sort{|a,b| b[1]<=>a[1]}
      (0..nperm).each do |perm|
        #perm_bigram = sample_no_replace(total, table, count)
        perm_bigram = new_sample_no_replace(total, bigram, count)
        obs_score = word_scores(count, bigram, perm_bigram, total, 1)
        obs_score = obs_score.values.max
        max_score = obs_score if (obs_score > max_score or perm == 0)
      end
      @perms[perm_key] = max_score

      max_score
    end

    # log(x) tolerant of degenerate inputs: -1000000 stands in for log(0).
    # NOTE(review): negative x is returned unchanged (not an error) — looks
    # deliberate for the 1-exp(...) terms above, but confirm.
    def safelog x
      x< 0 ? x : x==0? -1000000 : Math.log( x )
    end

    # Draw nitems samples via CDFast and tally them into a Hash of
    # sampled-value => occurrence count.
    def new_sample_no_replace(total, table, nitems)
      cdf = CDFast.new table

      cdf.sample( nitems ).inject( {} ){|h,x| h[ x ] ||= 0; h[x] +=1; h}
    end

    # Older sampler (superseded by new_sample_no_replace, kept for reference):
    # picks nitems distinct positions in 0..total and maps each through the
    # cumulative counts of `table`.
    def sample_no_replace(total, table, nitems)
      sample = (0..total).to_a.sample( nitems )
      count = {}
      sample.each do |n|
        w = nth_item_from_table(table, n)
        count[w] ||= 0
        count[w] += 1
      end
      count
    end

    # Walk the cumulative counts of a [word, count] table and return the word
    # covering position n; falls back to the last word when n overshoots.
    def nth_item_from_table(table, n)
      sum = 0
      table.each do |wc|
        sum = sum + wc[1]
        return wc[0] if (n < sum) #table is sorted
      end
      table.last.first
    end
  end


  # Precomputed table for sampling entry indices proportionally to counts:
  # each entry's count is expanded into that many copies of its index.
  class CDFast
    # NOTE(review): the inject returns the [expanded_array, index] pair, so
    # @a is a 2-element array, NOT the expanded list — @a.size and @a.sample
    # in #sample operate on that pair. Looks like a bug; confirm before use.
    def initialize table
      @a = table.to_a.inject([[], 0]){|a,kv| a[0] += Array.new( kv.last,a[1]); a[1]+=1 ; a}
    end

    def to_s
      "#{@a}"
    end

    # NOTE(review): `tt` is the Integer nitems from new_sample_no_replace, so
    # tt.size is the machine byte-size of the integer (typically 8), not a
    # number of items — confirm the intended semantics.
    def sample tt
      s = tt.size/[@a.size, tt.size].min
      (1..s).to_a.inject([]){|a,x| a += @a.sample(s) }
    end
  end
end
|
@@ -0,0 +1,25 @@
|
|
1
|
+
require 'set'

# A sparse numeric vector backed by a Hash from attribute key to value.
# Keys that are absent are treated as zero. Supports Euclidean distance,
# element-wise addition/subtraction, and scalar multiplication; the
# arithmetic operators return new SparseVector instances.
class SparseVector
  attr_accessor :attr

  # attr_map: Hash of key => numeric value.
  def initialize( attr_map )
    @attr = attr_map
  end

  # Euclidean (L2) distance to another SparseVector, taken over the union
  # of both key sets.
  def dist( v )
    squared = (v.attr.keys + @attr.keys).uniq.inject(0) do |acc, key|
      delta = @attr.fetch(key, 0) - v.attr.fetch(key, 0)
      acc + delta * delta
    end
    Math.sqrt(squared)
  end

  # Element-wise difference (self - v) over the union of key sets.
  def -(v)
    combined = {}
    (v.attr.keys + @attr.keys).uniq.each do |key|
      combined[key] = @attr.fetch(key, 0) - v.attr.fetch(key, 0)
    end
    SparseVector.new(combined)
  end

  # Element-wise sum (self + v) over the union of key sets.
  def +(v)
    combined = {}
    (v.attr.keys + @attr.keys).uniq.each do |key|
      combined[key] = @attr.fetch(key, 0) + v.attr.fetch(key, 0)
    end
    SparseVector.new(combined)
  end

  # New vector with every component multiplied by the scalar c.
  def mult_scalar( c )
    scaled = {}
    @attr.each { |key, value| scaled[key] = value * c }
    SparseVector.new(scaled)
  end
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: lite
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.1
|
4
|
+
version: 0.0.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- ronbee
|
@@ -17,6 +17,10 @@ extensions: []
|
|
17
17
|
extra_rdoc_files: []
|
18
18
|
files:
|
19
19
|
- lib/lite.rb
|
20
|
+
- lib/lite/classifier.rb
|
21
|
+
- lib/lite/cluster.rb
|
22
|
+
- lib/lite/ngrams.rb
|
23
|
+
- lib/lite/sparsevect.rb
|
20
24
|
homepage: https://github.com/ronbee/lite
|
21
25
|
licenses:
|
22
26
|
- mit
|