eluka 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.document +5 -0
- data/DOCUMENTATION_STANDARDS +39 -0
- data/Gemfile +13 -0
- data/Gemfile.lock +20 -0
- data/LICENSE.txt +20 -0
- data/README.rdoc +19 -0
- data/Rakefile +69 -0
- data/VERSION +1 -0
- data/examples/example.rb +59 -0
- data/ext/libsvm/COPYRIGHT +31 -0
- data/ext/libsvm/FAQ.html +1749 -0
- data/ext/libsvm/Makefile +25 -0
- data/ext/libsvm/Makefile.win +33 -0
- data/ext/libsvm/README +733 -0
- data/ext/libsvm/extconf.rb +1 -0
- data/ext/libsvm/heart_scale +270 -0
- data/ext/libsvm/java/Makefile +25 -0
- data/ext/libsvm/java/libsvm.jar +0 -0
- data/ext/libsvm/java/libsvm/svm.java +2776 -0
- data/ext/libsvm/java/libsvm/svm.m4 +2776 -0
- data/ext/libsvm/java/libsvm/svm_model.java +21 -0
- data/ext/libsvm/java/libsvm/svm_node.java +6 -0
- data/ext/libsvm/java/libsvm/svm_parameter.java +47 -0
- data/ext/libsvm/java/libsvm/svm_print_interface.java +5 -0
- data/ext/libsvm/java/libsvm/svm_problem.java +7 -0
- data/ext/libsvm/java/svm_predict.java +163 -0
- data/ext/libsvm/java/svm_scale.java +350 -0
- data/ext/libsvm/java/svm_toy.java +471 -0
- data/ext/libsvm/java/svm_train.java +318 -0
- data/ext/libsvm/java/test_applet.html +1 -0
- data/ext/libsvm/python/Makefile +4 -0
- data/ext/libsvm/python/README +331 -0
- data/ext/libsvm/python/svm.py +259 -0
- data/ext/libsvm/python/svmutil.py +242 -0
- data/ext/libsvm/svm-predict.c +226 -0
- data/ext/libsvm/svm-scale.c +353 -0
- data/ext/libsvm/svm-toy/gtk/Makefile +22 -0
- data/ext/libsvm/svm-toy/gtk/callbacks.cpp +423 -0
- data/ext/libsvm/svm-toy/gtk/callbacks.h +54 -0
- data/ext/libsvm/svm-toy/gtk/interface.c +164 -0
- data/ext/libsvm/svm-toy/gtk/interface.h +14 -0
- data/ext/libsvm/svm-toy/gtk/main.c +23 -0
- data/ext/libsvm/svm-toy/gtk/svm-toy.glade +238 -0
- data/ext/libsvm/svm-toy/qt/Makefile +17 -0
- data/ext/libsvm/svm-toy/qt/svm-toy.cpp +413 -0
- data/ext/libsvm/svm-toy/windows/svm-toy.cpp +456 -0
- data/ext/libsvm/svm-train.c +376 -0
- data/ext/libsvm/svm.cpp +3060 -0
- data/ext/libsvm/svm.def +19 -0
- data/ext/libsvm/svm.h +105 -0
- data/ext/libsvm/svm.o +0 -0
- data/ext/libsvm/tools/README +149 -0
- data/ext/libsvm/tools/checkdata.py +108 -0
- data/ext/libsvm/tools/easy.py +79 -0
- data/ext/libsvm/tools/grid.py +359 -0
- data/ext/libsvm/tools/subset.py +146 -0
- data/ext/libsvm/windows/libsvm.dll +0 -0
- data/ext/libsvm/windows/svm-predict.exe +0 -0
- data/ext/libsvm/windows/svm-scale.exe +0 -0
- data/ext/libsvm/windows/svm-toy.exe +0 -0
- data/ext/libsvm/windows/svm-train.exe +0 -0
- data/lib/eluka.rb +10 -0
- data/lib/eluka/bijection.rb +23 -0
- data/lib/eluka/data_point.rb +36 -0
- data/lib/eluka/document.rb +47 -0
- data/lib/eluka/feature_vector.rb +86 -0
- data/lib/eluka/features.rb +31 -0
- data/lib/eluka/model.rb +129 -0
- data/lib/fselect.rb +321 -0
- data/lib/grid.rb +25 -0
- data/test/helper.rb +18 -0
- data/test/test_eluka.rb +7 -0
- metadata +214 -0
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
data/lib/eluka.rb
ADDED
@@ -0,0 +1,23 @@
|
|
1
|
+
#Creates a two way hash like lookup using two hashes
#Source inspired from a post in some forum
#Author: Unknown
module Eluka
  # A Hash that also maintains the inverse mapping (value -> key) so that
  # lookups can be done in both directions in O(1).
  #
  # Invariant: the mapping stays one-to-one. Inserting a duplicate value
  # evicts the key that previously held it, and overwriting a key removes
  # its old value from the reverse index.
  class Bijection < Hash
    # Builds the hash and the (initially empty) reverse index.
    def initialize(*args)
      super(*args)
      @reverse = self.invert
    end

    # Associates +key+ with +val+, keeping both directions consistent.
    def []=(key, val)
      # Bug fix: when an existing key is re-assigned a new value, drop the
      # stale reverse entry for its old value (previously @reverse kept
      # pointing old_val => key forever).
      @reverse.delete(self[key]) if self.has_key?(key)
      super(key, val)
      if @reverse.has_key?(val)
        # Value already taken by another key: evict that key.
        self.delete(@reverse[val])
      end
      @reverse[val] = key
    end

    # Returns the key mapped to +val+, or nil if none.
    def lookup(val)
      @reverse[val]
    end
  end
end
|
@@ -0,0 +1,36 @@
|
|
1
|
+
module Eluka
  # A single data instance: a Hash of field => value pairs, where each value
  # is either numeric (used directly as a feature value) or a String
  # (analyzed into per-term features via Eluka::Document).
  class DataPoint

    # data     - a String (wrapped as {:text => data}) or a non-empty Hash
    # analyzer - text analyzer handed to Eluka::Document for String fields
    #
    # Raises RuntimeError when data is nil, not a Hash/String, or empty.
    def initialize(data, analyzer)
      raise "Can't find any data" unless (data)

      if data.instance_of?(String)
        data = {:text => data}
      end

      raise "Invalid data added" unless (data.instance_of? Hash)
      raise "Data can't be empty" unless (data.size > 0)

      @data = data
      @analyzer = analyzer
    end

    # Flattens the data into a single feature => value Hash.
    # String fields expand into "field||term" => weight entries; numeric
    # fields are copied through unchanged.
    def vector
      vector = Hash.new

      @data.each do |field, value|
        if value.instance_of?(String) then
          doc_vec = Eluka::Document.new(field, value, @analyzer).vector
          vector.merge!(doc_vec)
        elsif value.is_a?(Integer) or value.is_a?(Float)
          # Bug fix: Integer replaces the legacy Fixnum check (Fixnum was
          # deprecated in 2.4 and removed in Ruby 3.2); arbitrarily large
          # integers are accepted as well.
          vector[field] = value
        else
          raise "A field can contain either an integer or a double or it can be a string"
        end
      end

      vector
    end

  end
end
|
@@ -0,0 +1,47 @@
|
|
1
|
+
|
2
|
+
module Eluka

  # Wraps a single text field and turns it into an L2-normalised
  # term-frequency vector using the supplied analyzer.
  class Document
    # field    - name the terms are namespaced under ("field||term" keys)
    # text     - raw text to tokenise
    # analyzer - object responding to token_stream(field, text)
    def initialize(field, text, analyzer)
      @field    = field
      @text     = text
      @analyzer = analyzer
      @bag_of_words = nil
      bag_of_words
    end

    # Tokenises @text and records, for every distinct term, the list of
    # token positions at which it occurs (stored in @bag_of_words).
    def bag_of_words
      position = 0

      @bag_of_words = {}

      # NOTE(review): the literal symbol :field is passed, not @field —
      # presumably the analyzer ignores the field name; confirm.
      stream = @analyzer.token_stream(:field, @text)
      while (token = stream.next)
        position += token.pos_inc
        (@bag_of_words[token.text] ||= []) << position
      end

    end

    # Builds the feature vector: each key is "field||term", each value the
    # term frequency divided by the Euclidean norm of all term frequencies
    # in this document.
    def vector
      norm_squared = @bag_of_words.values.reduce(0) do |acc, positions|
        acc + positions.size**2
      end

      norm = norm_squared.to_f**0.5

      result = {}
      @bag_of_words.each do |term, positions|
        result[[@field, term].join("||")] = positions.size.to_f / norm
      end

      result
    end
  end

end
|
@@ -0,0 +1,86 @@
|
|
1
|
+
#!/usr/bin/env ruby
#
# Feature Vectors performs two important functions
#
# 1. Maintain a list of <feature, value> pairs for each data point (vector)
#    so that a model can be built whenever needed. (Sparse representation)
#
# 2. Adds every feature to the Features object to maintain
#    a unique list of features
#
# TODO: On disk representation for large training data
module Eluka

  class FeatureVectors

    # features - the global Features registry (term <-> id)
    # train    - true when these vectors are training data; only then are
    #            newly-seen terms registered with the features list
    def initialize (features, train)
      @fvs      = []
      @features = features
      @train    = train
    end

    # Stores a raw (vector, label) pair; conversion to LibSVM form happens
    # lazily in #to_libSVM.
    def add (vector, label = 0)
      @fvs << [vector, label]
    end

    # Registers every term appearing in the stored vectors with the global
    # feature list (used for training data only).
    def define_features
      @fvs.each do |vector, _label|
        vector.each_key { |term| @features.add(term) }
      end
    end

    # Renders the stored data points in LibSVM format: one line per point,
    # "label id:value id:value ...", ids ascending.
    #
    # sel_features - optional whitelist of terms; when given, only those
    #                features are emitted.
    def to_libSVM (sel_features = nil)

      # Whitelist lookup table
      wanted = {}
      if (sel_features)
        sel_features.each { |f| wanted[f] = 1 }
      end

      define_features if (@train) #Registration is only needed for training data

      lines = @fvs.map do |vector, label|
        parts = [label]

        #OPTIMIZE: Change this line to consider sorting in case of terms being features
        (1..@features.f_count).each do |id|
          term  = @features.term(id)
          value = vector[term]
          next unless value
          parts << [id, value].join(":") if wanted[term] or not sel_features
        end

        parts.join(" ")
      end
      lines.join("\n")
    end

  end

end
|
@@ -0,0 +1,31 @@
|
|
1
|
+
|
2
|
+
module Eluka

  # Registry assigning consecutive integer ids (starting at 1) to terms,
  # with constant-time lookup in both directions via a Bijection.
  class Features
    def initialize
      @features = Eluka::Bijection.new
      @f_count  = 0
    end

    # Number of distinct features registered so far.
    attr_reader :f_count

    # Registers +term+ if unseen and returns its id.
    def add (term)
      existing = @features[term]
      return existing if existing
      @f_count += 1
      @features[term] = @f_count
      @f_count
    end

    # Returns the id for +term+, or nil if unknown.
    def id (term)
      @features[term]
    end

    # Returns the term registered under +id+, or nil if unknown.
    def term (id)
      @features.lookup(id)
    end

  end

end
|
31
|
+
|
data/lib/eluka/model.rb
ADDED
@@ -0,0 +1,129 @@
|
|
1
|
+
|
2
|
+
module Eluka

  #A binary classifier classifies data into two classes given a category (the class label)
  # a. Data which is indicative of the category -- positive data
  # b. Data which is not indicative of the category -- negative data
  #
  #== Model
  # A classifier model observes positive and negative data and learns the properties of
  #each set. In the future if given an unlabelled data point it decides whether
  #the data point is a positive or negative instance of the category.
  #
  #=== Internal Data Representation
  # A classifier model internally represents a data instance as a point in a vector space
  # The dimensions of the vector space are termed as features
  #
  #=== Eluka::Model
  # An Eluka model takes a hash of features and their values and internally processes them
  #as points in a vector space. If the input is a string of words like in a document then
  #it relies on Ferret's text analysis modules to convert it into a data point

  class Model

    include Ferret::Analysis

    # Initialize the classifier with sane defaults
    # if customised data is not provided
    #
    # params - optional Hash overriding :directory (scratch dir, default
    #          "/tmp"), the :svm_train_path/:svm_scale_path/:svm_predict_path
    #          binaries, :grid_py_path, :fselect_py_path and :verbose.
    def initialize (params = {})
      #Set the labels: +1 positive, -1 negative, 0 unknown (LibSVM convention)
      @labels = Bijection.new
      @labels[:positive] = 1
      @labels[:negative] = -1
      @labels[:unknown] = 0

      # Paths to the bundled eluka-svm-* executables, relative to the gem root.
      @gem_root = File.expand_path(File.join(File.dirname(__FILE__), '..'))
      @bin_dir = File.expand_path(File.join(File.dirname(@gem_root), 'bin'))

      @analyzer = StandardAnalyzer.new
      @features = Eluka::Features.new
      @fv_train = Eluka::FeatureVectors.new(@features, true)
      @fv_test = nil

      @directory = (params[:directory] or "/tmp")
      @svm_train_path = (params[:svm_train_path] or "#{@bin_dir}/eluka-svm-train")
      @svm_scale_path = (params[:svm_scale_path] or "#{@bin_dir}/eluka-svm-scale")
      @svm_predict_path = (params[:svm_predict_path] or "#{@bin_dir}/eluka-svm-predict")
      @grid_py_path = (params[:grid_py_path] or "python rsvm/tools/grid.py")
      @fselect_py_path = (params[:fselect_py_path] or "python rsvm/tools/fselect.py")
      @verbose = (params[:verbose] or false)

      #Convert directory to absolute path
      Dir.chdir(@directory) do @directory = Dir.pwd end
    end

    # Add a data point to the training data
    #
    # data  - String or Hash accepted by Eluka::DataPoint
    # label - :positive or :negative; anything else raises RuntimeError
    def add (data, label)
      raise "No meaningful label associated with data" unless ([:positive, :negative].include? label)

      data_point = Eluka::DataPoint.new(data, @analyzer)
      @fv_train.add(data_point.vector, @labels[label])
    end

    # Build a model from the training data using LibSVM
    #
    # Writes the training set to "<directory>/train", shells out to the
    # svm-train binary (model lands in "<directory>/model"), and resets the
    # test-vector accumulator. Returns the trainer's stdout.
    def build (features = nil)
      File.open(@directory + "/train", "w") do |f| f.puts @fv_train.to_libSVM(features) end

      output = `#{@svm_train_path} #{@directory}/train #{@directory}/model`

      puts output if (@verbose)

      @fv_test = Eluka::FeatureVectors.new(@features, false)
      return output
    end

    # Classify a data point
    #
    # Returns :positive, :negative or :unknown (via the label bijection).
    # Raises unless #build has been called first.
    #
    # NOTE(review): every call appends the new point to @fv_test and
    # rewrites/re-predicts the whole "classify" file, yet only the leading
    # integer of "result" is read back — confirm this is intended.
    def classify (data, features = nil)
      raise "Untrained model" unless (@fv_test)

      data_point = Eluka::DataPoint.new(data, @analyzer)
      @fv_test.add(data_point.vector)

      File.open(@directory + "/classify", "w") do |f| f.puts @fv_test.to_libSVM(features) end
      output = `#{@svm_predict_path} #{@directory}/classify #{@directory}/model #{@directory}/result`

      puts output if (@verbose)

      return @labels.lookup( File.open( @directory + "/result", "r" ).read.to_i )
    end

    # Suggests the best set of features chosen using fselect.py
    # IMPROVE: Depending on fselect.py (an unnecessary python dependency) is stupid
    # TODO: Finish writing fselect.rb and integrate it
    #
    # Runs fselect.py over the training file from the hard-coded
    # './rsvm/bin/tools' working directory, parses "train.select" (a
    # bracketed, comma-separated id list) back into feature names, and
    # cleans up fselect's temporary files. "field||term" text features are
    # returned as [field_symbol, term] pairs; numeric features as-is.
    def suggest_features
      sel_features = Array.new

      File.open(@directory + "/train", "w") do |f| f.puts @fv_train.to_libSVM end

      Dir.chdir('./rsvm/bin/tools') do
        output = `python fselect.py #{@directory}/train`

        puts output if (@verbose)

        x = File.read("train.select")
        sel_f_ids = x[1..-2].split(", ")
        sel_f_ids.each do |f|
          s_f = @features.term(f.to_i)
          if s_f.instance_of? String then
            s_f = s_f.split("||")
            s_f[0] = s_f[0].to_sym
          end
          sel_features.push(s_f)
        end

        #Remove temporary files
        File.delete("train.select") if File.exist?("train.select")
        File.delete("train.fscore") if File.exist?("train.fscore")
        File.delete("train.tr.out") if File.exist?("train.tr.out")
      end

      return sel_features
    end
  end

end
|
data/lib/fselect.rb
ADDED
@@ -0,0 +1,321 @@
|
|
1
|
+
#trying to convert fselect.py method by method into ruby
require 'rbconfig'

##### Path Setting #####

# Bug fix: the top-level Config constant was deprecated and then removed
# (Ruby 3.2); RbConfig is the supported name.
# NOTE(review): host_os is rarely exactly 'win32' (usually mingw32/mswin) —
# confirm which platforms are meant to take the Windows branch.
is_win32 = (RbConfig::CONFIG["host_os"] == 'win32')
if is_win32
  gridpy_exe     = ".\\grid.py -log2c -2,9,2 -log2g 1,-11,-2"
  svmtrain_exe   = "..\\windows\\svmtrain.exe"
  svmpredict_exe = "..\\windows\\svmpredict.exe"
else
  gridpy_exe     = "./grid.py -log2c -2,9,2 -log2g 1,-11,-2"
  svmtrain_exe   = "../svm-train"
  svmpredict_exe = "../svm-predict"
end

##### Global Variables #####

# Training/testing file paths (set by arg_process)
@train_pathfile = ""
@train_file     = ""
@test_pathfile  = ""
@test_file      = ""
@if_predict_all = 0

@whole_fsc_dict = {}
@whole_imp_v    = []

# Verbosity thresholds used by writelog
VERBOSE_MAX       = 100
VERBOSE_ITER      = 3
VERBOSE_GRID_TIME = 2
VERBOSE_TIME      = 1
|
31
|
+
|
32
|
+
# Parses command-line arguments: training_file [testing_file].
# Sets @train_pathfile/@train_file (and the @test_* pair when given);
# raises when a named file does not exist.
#
# Bug fixes relative to the straight Python port:
#  * Ruby's ARGV excludes the program name, so the expected sizes are
#    1 or 2 (not 2 or 3) and argument indices shift down by one.
#  * The usage string was single-quoted, so #{} was never interpolated.
#  * The testing file was read from the same slot as the training file.
def arg_process
  unless (ARGV.size == 1 or ARGV.size == 2)
    puts "Usage: #{$0} training_file [testing_file]"
    exit
  end

  @train_pathfile = ARGV[0]
  raise "training file not found" unless File.exist? @train_pathfile
  @train_file = File.basename(@train_pathfile)

  if ARGV.size == 2
    @test_pathfile = ARGV[1]
    raise "testing file not found" unless File.exist? @test_pathfile
    @test_file = File.basename(@test_pathfile)
  end
end
|
48
|
+
|
49
|
+
|
50
|
+
##### Decide sizes of selected features #####
|
51
|
+
|
52
|
+
# Returns the halving sequence max_index, max_index/2, ... (integer
# division) down to, but excluding, 1. Empty for max_index <= 1.
def feat_num_try_half(max_index)
  sizes = []
  n = max_index
  while n > 1
    sizes << n
    n /= 2
  end
  sizes
end
|
60
|
+
|
61
|
+
# Given (feature, score) pairs sorted by descending score, decides which
# feature-set sizes to try: halving sizes starting from the count of
# features whose score is at least 1e-20, capped at eight sizes.
#
# Bug fixes: the original depended on the `for`-loop leaking its variable
# (and mutating it just before `break`), and raised NoMethodError
# (nil + 1) when f_tuple was empty.
def feat_num_try(f_tuple)
  # Index of the last feature with a significant (>= 1e-20) score.
  last = f_tuple.size - 1
  f_tuple.each_index do |j|
    if f_tuple[j][1] < 1e-20
      last = j - 1
      break
    end
  end
  #only take first eight numbers (>1%)
  return feat_num_try_half(last + 1)[0...8]
end
|
71
|
+
|
72
|
+
# Shuffles label and sample in place with the SAME permutation (keeping
# label[i] paired with sample[i]), seeded deterministically for
# reproducible runs.
def random_shuffle(label, sample)
  srand 1
  size = label.size
  label.each_index do |i|
    j = rand(size)
    k = size - i - 1
    label[j], label[k] = label[k], label[j]
    sample[j], sample[k] = sample[k], sample[j]
  end
end
|
85
|
+
|
86
|
+
|
87
|
+
### compare function used in list.sort(): sort by element[1]
|
88
|
+
#def value_cmpf(x,y):
|
89
|
+
# if x[1]>y[1]: return -1
|
90
|
+
# if x[1]<y[1]: return 1
|
91
|
+
# return 0
|
92
|
+
|
93
|
+
# Sort key for (feature, score) pairs: the negated score, so an ascending
# sort yields descending scores.
def value_cmpf(x)
  -x[1]
end
|
96
|
+
|
97
|
+
### cal importance of features
|
98
|
+
### return fscore_dict and feat with desc order
|
99
|
+
### cal importance of features
### return fscore_dict and feat with desc order
#
# label/sample are forwarded to cal_Fscore (still to be ported from the
# Python original). Returns [score_dict, feat_v] where feat_v lists the
# feature ids sorted by descending F-score.
#
# Bug fix: the previous body was unconverted Python —
# list(score_dict.items()) and sort(key = value_cmpf) — and raised at
# runtime; replaced with to_a / sort_by.
def cal_feat_imp(label, sample)

  puts("calculating fsc...")

  score_dict = cal_Fscore(label, sample)

  score_tuples = score_dict.to_a.sort_by { |pair| value_cmpf(pair) }

  feat_v = score_tuples.map { |pair| pair[0] }

  puts("fsc done")
  return score_dict, feat_v
end
|
117
|
+
|
118
|
+
|
119
|
+
|
120
|
+
### select features and return new data
|
121
|
+
### select features and return new data
#
# sample - array of {feature_id => value} hashes
# feat_v - feature ids to keep (sorted in place, matching the Python
#          original's in-place list.sort())
#
# Returns a new array of hashes containing only the selected features
# that are present with a truthy value.
#
# Bug fix: the original block destructured each sample Hash into
# |key, s|, leaving s nil and crashing on s[f]; each element is a single
# Hash. Also feat_v.sort() discarded its result (Ruby sort is
# non-mutating) — sort! restores the intended in-place sort.
def select(sample, feat_v)
  new_samp = []

  feat_v.sort!

  #for each sample
  sample.each do |s|
    point = Hash.new
    #for each feature to select
    feat_v.each do |f|
      point[f] = s[f] if s[f]
    end
    new_samp.push(point)
  end

  return new_samp
end
|
139
|
+
|
140
|
+
|
141
|
+
=begin
|
142
|
+
#TODO: Convert the following code
|
143
|
+
|
144
|
+
### Do parameter searching (grid.py)
|
145
|
+
def train_svm(tr_file)
|
146
|
+
cmd = "#{gridpy_exe} #{tr_file}"
|
147
|
+
puts(cmd)
|
148
|
+
puts('Cross validation...')
|
149
|
+
std_out = Popen(cmd, shell = True, stdout = PIPE).stdout
|
150
|
+
|
151
|
+
line = ''
|
152
|
+
while 1:
|
153
|
+
last_line = line
|
154
|
+
line = std_out.readline()
|
155
|
+
if not line: break
|
156
|
+
c,g,rate = map(float,last_line.split())
|
157
|
+
|
158
|
+
print('Best c=%s, g=%s CV rate=%s' % (c,g,rate))
|
159
|
+
|
160
|
+
return c,g,rate
|
161
|
+
|
162
|
+
### Given (C,g) and training/testing data,
|
163
|
+
### return predicted labels
|
164
|
+
def predict(tr_label, tr_sample, c, g, test_label, test_sample, del_model=1, model_name=None):
|
165
|
+
global train_file
|
166
|
+
tr_file = train_file+".tr"
|
167
|
+
te_file = train_file+".te"
|
168
|
+
if model_name: model_file = model_name
|
169
|
+
else: model_file = "%s.model"%tr_file
|
170
|
+
out_file = "%s.o"%te_file
|
171
|
+
|
172
|
+
# train
|
173
|
+
writedata(tr_sample,tr_label,tr_file)
|
174
|
+
cmd = "%s -c %f -g %f %s %s" % (svmtrain_exe,c,g,tr_file,model_file)
|
175
|
+
os.system(cmd)
|
176
|
+
|
177
|
+
# test
|
178
|
+
writedata(test_sample,test_label,te_file)
|
179
|
+
cmd = "%s %s %s %s" % (svmpredict_exe, te_file,model_file,out_file )
|
180
|
+
print(cmd)
|
181
|
+
os.system(cmd)
|
182
|
+
|
183
|
+
# fill in pred_y
|
184
|
+
pred_y=[]
|
185
|
+
fp = open(out_file)
|
186
|
+
line = fp.readline()
|
187
|
+
while line:
|
188
|
+
pred_y.append( float(line) )
|
189
|
+
line = fp.readline()
|
190
|
+
|
191
|
+
rem_file(tr_file)
|
192
|
+
#rem_file("%s.out"%tr_file)
|
193
|
+
#rem_file("%s.png"%tr_file)
|
194
|
+
rem_file(te_file)
|
195
|
+
if del_model: rem_file(model_file)
|
196
|
+
fp.close()
|
197
|
+
rem_file(out_file)
|
198
|
+
|
199
|
+
return pred_y
|
200
|
+
|
201
|
+
|
202
|
+
def cal_acc(pred_y, real_y):
|
203
|
+
right = 0.0
|
204
|
+
|
205
|
+
for i in range(len(pred_y)):
|
206
|
+
if(pred_y[i] == real_y[i]): right += 1
|
207
|
+
|
208
|
+
print("ACC: %d/%d"%(right, len(pred_y)))
|
209
|
+
return right/len(pred_y)
|
210
|
+
|
211
|
+
### balanced accuracy
|
212
|
+
def cal_bacc(pred_y, real_y):
|
213
|
+
p_right = 0.0
|
214
|
+
n_right = 0.0
|
215
|
+
p_num = 0
|
216
|
+
n_num = 0
|
217
|
+
|
218
|
+
size=len(pred_y)
|
219
|
+
for i in range(size):
|
220
|
+
if real_y[i] == 1:
|
221
|
+
p_num+=1
|
222
|
+
if real_y[i]==pred_y[i]: p_right+=1
|
223
|
+
else:
|
224
|
+
n_num+=1
|
225
|
+
if real_y[i]==pred_y[i]: n_right+=1
|
226
|
+
|
227
|
+
print([p_right,p_num,n_right,n_num])
|
228
|
+
writelog(" p_yes/p_num, n_yes/n_num: %d/%d , %d/%d\n"%(p_right,p_num,n_right,n_num))
|
229
|
+
if p_num==0: p_num=1
|
230
|
+
if n_num==0: n_num=1
|
231
|
+
return 0.5*( p_right/p_num + n_right/n_num )
|
232
|
+
=end
|
233
|
+
|
234
|
+
##### Log related #####
|
235
|
+
# Records the log file name in @logname and creates/truncates the file.
def initlog(name)
  @logname = name
  File.open(@logname, "w") { }
end
|
239
|
+
|
240
|
+
# Appends str to the log file when vlevel exceeds VERBOSE_ITER;
# lower-verbosity messages are silently dropped.
def writelog(str, vlevel = VERBOSE_MAX)
  return unless vlevel > VERBOSE_ITER
  File.open(@logname, "a") { |f| f.print(str) }
end
|
247
|
+
|
248
|
+
###### svm data IO ######
|
249
|
+
|
250
|
+
# Reads a LibSVM-format file.
#
# Returns [labels, samples, max_index]: parallel arrays of Float labels
# and {feature_id => value} hashes, plus the largest feature id seen.
# Lines starting with "#" and blank lines are skipped.
#
# Bug fixes relative to the original:
#  * the element loop iterated over a misspelled `elements` variable
#    (NameError; the array is `elems`)
#  * samples.push sat inside the per-element loop, appending the same
#    sample once per feature instead of once per line
def readdata(filename)
  labels = []
  samples = []
  max_index = 0

  File.open(filename) do |f|
    f.each_line do |line|
      line.chomp!
      next if line[0] == "#"

      elems = line.split(" ")
      next if elems.empty? # skip blank lines

      # First token is the label; the rest are feature:value pairs.
      labels.push(elems.first.to_f)

      sample = Hash.new
      elems.drop(1).each do |e|
        feature, value = e.split(":")
        p0 = feature.chomp.to_i
        p1 = value.chomp.to_f
        sample[p0] = p1

        max_index = p0 if p0 > max_index
      end
      samples.push(sample)
    end
  end

  return labels, samples, max_index
end
|
286
|
+
|
287
|
+
# Writes samples (array of {feature_id => value} hashes) in LibSVM format:
# one line per sample, "label id:value ...", feature ids ascending.
# Labels default to "0" when labels is nil. Writes to $stdout when
# filename is nil.
#
# Bug fixes relative to the original:
#  * printed the undefined `label[i]` instead of `labels[i]` (NameError)
#  * closed $stdout when no filename was given
#  * dropped the unused `num` local
def writedata(samples, labels, filename)
  fp = $stdout
  if filename
    fp = File.open(filename, "w")
  end

  samples.each_index do |i|
    if labels
      fp.print labels[i]
    else
      fp.print "0"
    end
    samples[i].keys.sort.each do |k|
      fp.print(" #{k}:#{samples[i][k]}")
    end
    fp.puts ""
  end
  fp.close if filename
end
|
307
|
+
|
308
|
+
###### PROGRAM ENTRY POINT ######

arg_process()

initlog("#{@train_file}.select")
writelog("start: #{Time.now}\n\n")
# NOTE(review): main() and predict_all() have not yet been ported from
# fselect.py, so the script cannot run end-to-end until they exist.
main()

# do testing on all possible feature sets
# Bug fix: the original tested the undefined local `if_predict_all`
# (NameError); the flag lives in @if_predict_all. The explicit != 0 is
# required because 0 is truthy in Ruby (matching the Python intent).
if @if_predict_all != 0
  predict_all()
end

writelog("\nend: \n#{Time.now}\n")