statsample 0.5.1 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51) hide show
  1. data/History.txt +12 -0
  2. data/Manifest.txt +13 -0
  3. data/README.txt +2 -1
  4. data/demo/pca.rb +29 -0
  5. data/demo/umann.rb +8 -0
  6. data/lib/distribution.rb +0 -1
  7. data/lib/matrix_extension.rb +35 -21
  8. data/lib/statsample.rb +31 -28
  9. data/lib/statsample/anova.rb +7 -2
  10. data/lib/statsample/bivariate.rb +17 -11
  11. data/lib/statsample/codification.rb +136 -87
  12. data/lib/statsample/combination.rb +0 -2
  13. data/lib/statsample/converter/csv18.rb +1 -1
  14. data/lib/statsample/converter/csv19.rb +1 -1
  15. data/lib/statsample/converters.rb +176 -171
  16. data/lib/statsample/crosstab.rb +227 -154
  17. data/lib/statsample/dataset.rb +94 -12
  18. data/lib/statsample/dominanceanalysis.rb +69 -62
  19. data/lib/statsample/dominanceanalysis/bootstrap.rb +25 -21
  20. data/lib/statsample/factor.rb +18 -0
  21. data/lib/statsample/factor/pca.rb +128 -0
  22. data/lib/statsample/factor/principalaxis.rb +133 -0
  23. data/lib/statsample/factor/rotation.rb +125 -0
  24. data/lib/statsample/histogram.rb +99 -0
  25. data/lib/statsample/mle.rb +125 -126
  26. data/lib/statsample/mle/logit.rb +91 -91
  27. data/lib/statsample/mle/probit.rb +84 -85
  28. data/lib/statsample/multiset.rb +1 -1
  29. data/lib/statsample/permutation.rb +96 -0
  30. data/lib/statsample/regression.rb +1 -1
  31. data/lib/statsample/regression/binomial.rb +89 -89
  32. data/lib/statsample/regression/binomial/logit.rb +9 -9
  33. data/lib/statsample/regression/binomial/probit.rb +9 -9
  34. data/lib/statsample/regression/multiple.rb +8 -14
  35. data/lib/statsample/regression/multiple/gslengine.rb +1 -1
  36. data/lib/statsample/regression/multiple/rubyengine.rb +55 -55
  37. data/lib/statsample/resample.rb +12 -17
  38. data/lib/statsample/srs.rb +4 -1
  39. data/lib/statsample/test.rb +23 -22
  40. data/lib/statsample/test/umannwhitney.rb +182 -0
  41. data/lib/statsample/vector.rb +854 -815
  42. data/test/test_bivariate.rb +132 -132
  43. data/test/test_codification.rb +71 -50
  44. data/test/test_dataset.rb +19 -1
  45. data/test/test_factor.rb +44 -0
  46. data/test/test_histogram.rb +26 -0
  47. data/test/test_permutation.rb +37 -0
  48. data/test/test_statistics.rb +74 -63
  49. data/test/test_umannwhitney.rb +17 -0
  50. data/test/test_vector.rb +46 -30
  51. metadata +31 -4
data/History.txt CHANGED
@@ -1,3 +1,14 @@
1
+ === 0.6.0 / 2010-02-05
2
+ * New Statsample::Factor module. Include classes for extracting factors (Statsample::Factor::PCA and Statsample::Factor::PrincipalAxis) and rotate component matrix ( Statsample::Factor::Rotation subclasses). For now, only orthogonal rotations
3
+ * New Statsample::Dataset.crosstab_with_asignation, Statsample::Dataset.one_to_many
4
+ * New class Statsample::Permutation to produce permutations of a given array
5
+ * New class Statsample::Histogram, with same interface as GSL one
6
+ * New class Statsample::Test::UMannWhitney, to perform Mann-Whitney's U test. Gives z based and exact calculation of probability
7
+ * Improved support for ReportBuilder
8
+ * Statsample::Codification module reworked
9
+ * Fixed bugs on Dominance Analysis classes
10
+ * Fixed bugs on Statsample::Vector.kurtosis and Statsample::Vector.skew
11
+
1
12
  === 0.5.1 / 2009-10-06
2
13
 
3
14
  * New class Statsample::Bivariate::Tetrachoric, for calculation of tetrachoric correlations. See http://www.john-uebersax.com/stat/tetra.htm for information.
@@ -18,6 +29,7 @@
18
29
  * Logit tests
19
30
  * Bug fix: rescue for requires doesn't specify LoadError
20
31
  * Binomial::BaseEngine new methods: coeffs_se, coeffs, constant and constant_se
32
+
21
33
  === 0.4.0 / 2009-09-10
22
34
  * New Distribution module, based on statistics2.rb by Shin-ichiro HARA. Replaces all instances of GSL distributions pdf and cdf calculations for native calculation.
23
35
  * New Maximum Likehood Estimation for Logit, Probit and Normal Distribution using Von Tessin(2005) algorithm. See MLE class and subclasses for more information.
data/Manifest.txt CHANGED
@@ -18,6 +18,7 @@ demo/graph.rb
18
18
  demo/item_analysis.rb
19
19
  demo/mean.rb
20
20
  demo/nunnally_6.rb
21
+ demo/pca.rb
21
22
  demo/proportion.rb
22
23
  demo/regression.rb
23
24
  demo/sample_test.csv
@@ -25,6 +26,7 @@ demo/spss_matrix.rb
25
26
  demo/strata_proportion.rb
26
27
  demo/stratum.rb
27
28
  demo/t-student.rb
29
+ demo/umann.rb
28
30
  lib/distribution.rb
29
31
  lib/distribution/chisquare.rb
30
32
  lib/distribution/f.rb
@@ -47,17 +49,23 @@ lib/statsample/crosstab.rb
47
49
  lib/statsample/dataset.rb
48
50
  lib/statsample/dominanceanalysis.rb
49
51
  lib/statsample/dominanceanalysis/bootstrap.rb
52
+ lib/statsample/factor.rb
53
+ lib/statsample/factor/pca.rb
54
+ lib/statsample/factor/principalaxis.rb
55
+ lib/statsample/factor/rotation.rb
50
56
  lib/statsample/graph/gdchart.rb
51
57
  lib/statsample/graph/svgboxplot.rb
52
58
  lib/statsample/graph/svggraph.rb
53
59
  lib/statsample/graph/svghistogram.rb
54
60
  lib/statsample/graph/svgscatterplot.rb
61
+ lib/statsample/histogram.rb
55
62
  lib/statsample/htmlreport.rb
56
63
  lib/statsample/mle.rb
57
64
  lib/statsample/mle/logit.rb
58
65
  lib/statsample/mle/normal.rb
59
66
  lib/statsample/mle/probit.rb
60
67
  lib/statsample/multiset.rb
68
+ lib/statsample/permutation.rb
61
69
  lib/statsample/regression.rb
62
70
  lib/statsample/regression/binomial.rb
63
71
  lib/statsample/regression/binomial/logit.rb
@@ -72,6 +80,7 @@ lib/statsample/reliability.rb
72
80
  lib/statsample/resample.rb
73
81
  lib/statsample/srs.rb
74
82
  lib/statsample/test.rb
83
+ lib/statsample/test/umannwhitney.rb
75
84
  lib/statsample/vector.rb
76
85
  po/es/statsample.po
77
86
  po/statsample.pot
@@ -85,11 +94,14 @@ test/test_csv.csv
85
94
  test/test_csv.rb
86
95
  test/test_dataset.rb
87
96
  test/test_distribution.rb
97
+ test/test_factor.rb
88
98
  test/test_ggobi.rb
89
99
  test/test_gsl.rb
100
+ test/test_histogram.rb
90
101
  test/test_logit.rb
91
102
  test/test_mle.rb
92
103
  test/test_multiset.rb
104
+ test/test_permutation.rb
93
105
  test/test_regression.rb
94
106
  test/test_reliability.rb
95
107
  test/test_resample.rb
@@ -97,6 +109,7 @@ test/test_srs.rb
97
109
  test/test_statistics.rb
98
110
  test/test_stratified.rb
99
111
  test/test_svg_graph.rb
112
+ test/test_umannwhitney.rb
100
113
  test/test_vector.rb
101
114
  test/test_xls.rb
102
115
  test/test_xls.xls
data/README.txt CHANGED
@@ -5,10 +5,11 @@ http://ruby-statsample.rubyforge.org/
5
5
 
6
6
  == DESCRIPTION:
7
7
 
8
- A suite for your basic and advanced statistics needs. Descriptive statistics, multiple regression, dominance analysis, scale's reliability analysis, bivariate statistics and others procedures.
8
+ A suite for your basic and advanced statistics needs. Descriptive statistics, multiple regression, factorial analysis, dominance analysis, scale's reliability analysis, bivariate statistics and others procedures.
9
9
 
10
10
  == FEATURES:
11
11
 
12
+ * Factorial Analysis. Principal Component Analysis and Principal Axis extraction, with orthogonal rotations (Varimax, Equimax, Quartimax)
12
13
  * Multiple Regression. Listwise analysis optimized with use of Alglib library. Pairwise analysis is executed on pure ruby and reports same values as SPSS
13
14
  * Dominance Analysis. Based on Budescu and Azen papers, <strong>DominanceAnalysis</strong> class can report dominance analysis for a sample and <strong>DominanceAnalysisBootstrap</strong> can execute bootstrap analysis to determine dominance stability, as recomended by Azen & Budescu (2003) link[http://psycnet.apa.org/journals/met/8/2/129/].
14
15
  * Classes for Vector, Datasets (set of Vectors) and Multisets (multiple datasets with same fields and type of vectors), and multiple methods to manipulate them
data/demo/pca.rb ADDED
@@ -0,0 +1,29 @@
1
+ require File.dirname(__FILE__)+"/../lib/statsample"
2
+ require 'matrix_extension'
3
+ require 'reportbuilder'
4
+ require 'gsl'
5
+ ds=Statsample.load("/home/cdx/trabajo/sepade/pdie/2008_ntic/analisis_c1/tesis.ds")
6
+ ds2=ds['ac_gen'..'ac_db'].dup_only_valid
7
+
8
+ cm=Statsample::Bivariate.correlation_matrix(ds2)
9
+
10
+ pca=Statsample::Factor::PCA.new(cm, :m=>2)
11
+ rb=ReportBuilder.new()
12
+ rb.add(pca)
13
+
14
+ varimax=Statsample::Factor::Quartimax.new(pca.component_matrix.to_matrix)
15
+ varimax.iterate
16
+ rb.add(varimax.rotated)
17
+ rb.add(varimax.iterations)
18
+ rb.add(varimax.component_transformation_matrix)
19
+ rb.add(varimax.h2)
20
+ =begin
21
+ fa=Statsample::Factor::PrincipalAxis.new(cm, :m=>1)
22
+ rb=ReportBuilder.new()
23
+ rb.add(fa)
24
+
25
+ =end
26
+ puts rb.to_text
27
+
28
+
29
+
data/demo/umann.rb ADDED
@@ -0,0 +1,8 @@
1
+ require File.dirname(__FILE__)+'/../lib/statsample'
2
+ v1=[1,2,3,4,7,8,9,10,14,15].to_scale
3
+ v2=[5,6,11,12,13,16,17,18,19].to_scale
4
+ u=Statsample::Test::UMannWhitney.new(v1,v2)
5
+
6
+ puts u.summary
7
+
8
+ #p Statsample::Test::UMannWhitney.exact_probability_as62(100,100)
data/lib/distribution.rb CHANGED
@@ -12,5 +12,4 @@ module Distribution
12
12
  autoload(:T, 'distribution/t')
13
13
  autoload(:F, 'distribution/f')
14
14
  autoload(:Normal, 'distribution/normal')
15
-
16
15
  end
@@ -1,4 +1,18 @@
1
1
  require 'matrix'
2
+
3
+ if RUBY_VERSION<="1.9.0"
4
+ class Vector
5
+ alias_method :old_coerce, :coerce
6
+ def coerce(other)
7
+ case other
8
+ when Numeric
9
+ return Matrix::Scalar.new(other), self
10
+ else
11
+ raise TypeError, "#{self.class} can't be coerced into #{other.class}"
12
+ end
13
+ end
14
+ end
15
+ end
2
16
  class Matrix
3
17
  def rows_sum
4
18
  (0...row_size).collect {|i|
@@ -37,31 +51,31 @@ class Matrix
37
51
  end
38
52
  # Test if a Matrix is a identity one
39
53
  def identity?
40
- if regular?
41
- rows=(0...row_size).each{|i|
42
- (0...column_size).each {|j|
43
- v = self[i,j]
44
- return false if (i==j and v!=1) or (i!=j and v!=0)
45
- }
46
- }
47
- true
48
- else
49
- false
50
- end
54
+ if regular?
55
+ rows=(0...row_size).each{|i|
56
+ (0...column_size).each {|j|
57
+ v = self[i,j]
58
+ return false if (i==j and v!=1) or (i!=j and v!=0)
59
+ }
60
+ }
61
+ true
62
+ else
63
+ false
64
+ end
51
65
  end
52
66
  def to_gsl
53
- out=[]
54
- self.row_size.times{|i|
55
- out[i]=self.row(i).to_a
56
- }
57
- GSL::Matrix[*out]
67
+ out=[]
68
+ self.row_size.times{|i|
69
+ out[i]=self.row(i).to_a
70
+ }
71
+ GSL::Matrix[*out]
58
72
  end
59
73
  def orthogonal?
60
- if regular?
61
- (self * self.t).identity?
62
- else
63
- false
64
- end
74
+ if regular?
75
+ (self * self.t).identity?
76
+ else
77
+ false
78
+ end
65
79
  end
66
80
  end
67
81
 
data/lib/statsample.rb CHANGED
@@ -108,16 +108,18 @@ end
108
108
  # * Dataset: An union of vectors.
109
109
  #
110
110
  module Statsample
111
- VERSION = '0.5.1'
111
+ VERSION = '0.6.0'
112
112
  SPLIT_TOKEN = ","
113
113
  autoload(:Database, 'statsample/converters')
114
114
  autoload(:Anova, 'statsample/anova')
115
115
  autoload(:Combination, 'statsample/combination')
116
+ autoload(:Permutation, 'statsample/permutation')
116
117
  autoload(:CSV, 'statsample/converters')
117
118
  autoload(:PlainText, 'statsample/converters')
118
119
  autoload(:Excel, 'statsample/converters')
119
120
  autoload(:GGobi, 'statsample/converters')
120
121
  autoload(:SPSS, 'statsample/converter/spss')
122
+ autoload(:Histogram, 'statsample/histogram')
121
123
  autoload(:DominanceAnalysis, 'statsample/dominanceanalysis')
122
124
  autoload(:HtmlReport, 'statsample/htmlreport')
123
125
  autoload(:Mx, 'statsample/converters')
@@ -132,6 +134,7 @@ module Statsample
132
134
  autoload(:MLE, 'statsample/mle')
133
135
  autoload(:Regression, 'statsample/regression')
134
136
  autoload(:Test, 'statsample/test')
137
+ autoload(:Factor, 'statsample/factor')
135
138
  def self.load(filename)
136
139
  if File.exists? filename
137
140
  o=false
@@ -157,38 +160,38 @@ module Statsample
157
160
  end
158
161
  module Writable
159
162
  def save(filename)
160
- fp=File.open(filename,"w")
161
- Marshal.dump(self,fp)
162
- fp.close
163
+ fp=File.open(filename,"w")
164
+ Marshal.dump(self,fp)
165
+ fp.close
163
166
  end
164
167
  end
165
168
  module HtmlSummary
166
- def add_line(n=nil)
167
- self << "<hr />"
168
- end
169
- def nl
170
- self << "<br />"
171
- end
172
- def add(text)
173
- self << ("<p>"+text.gsub("\n","<br />")+"</p>")
174
- end
175
- def parse_table(table)
176
- self << table.parse_html
177
- end
169
+ def add_line(n=nil)
170
+ self << "<hr />"
171
+ end
172
+ def nl
173
+ self << "<br />"
174
+ end
175
+ def add(text)
176
+ self << ("<p>"+text.gsub("\n","<br />")+"</p>")
177
+ end
178
+ def parse_table(table)
179
+ self << table.parse_html
180
+ end
178
181
  end
179
182
  module ConsoleSummary
180
- def add_line(n=80)
181
- self << "-"*n+"\n"
182
- end
183
- def nl
184
- self << "\n"
185
- end
186
- def add(text)
187
- self << text
188
- end
189
- def parse_table(table)
190
- self << table.parse_console
191
- end
183
+ def add_line(n=80)
184
+ self << "-"*n+"\n"
185
+ end
186
+ def nl
187
+ self << "\n"
188
+ end
189
+ def add(text)
190
+ self << text
191
+ end
192
+ def parse_table(table)
193
+ self << table.parse_console
194
+ end
192
195
  end
193
196
  class ReportTable
194
197
  attr_reader :header
@@ -6,8 +6,13 @@ module Statsample
6
6
  # v2=[3,3,4,5,6].to_scale
7
7
  # v3=[5,3,1,5,6].to_scale
8
8
  # anova=Statsample::Anova::OneWay.new([v1,v2,v3])
9
- # puts anova.f
10
- # puts anova.significance
9
+ # anova.f
10
+ # => 0.0243902439024391
11
+ # anova.significance
12
+ # => 0.975953044203438
13
+ # anova.sst
14
+ # => 32.9333333333333
15
+ #
11
16
  class OneWay
12
17
  def initialize(vectors)
13
18
  @vectors=vectors
@@ -13,6 +13,7 @@ module Statsample
13
13
  covariance_slow(v1a,v2a)
14
14
  end
15
15
  end
16
+ # Estimate the ML between two dichotomic vectors
16
17
  def maximum_likehood_dichotomic(pred,real)
17
18
  preda,reala=Statsample.only_valid(pred,real)
18
19
  sum=0
@@ -59,13 +60,14 @@ module Statsample
59
60
  end
60
61
  # Retrieves the value for t test for a pearson correlation
61
62
  # giving r and vector size
63
+ # Source : http://faculty.chass.ncsu.edu/garson/PA765/correl.htm
62
64
  def t_r(r,size)
63
65
  r * Math::sqrt(((size)-2).to_f / (1 - r**2))
64
66
  end
65
67
  # Retrieves the probability value (a la SPSS)
66
68
  # for a given t, size and number of tails.
67
69
  # Uses a second parameter
68
- # * :both or 2 : for r!=0
70
+ # * :both or 2 : for r!=0 (default)
69
71
  # * :right, :positive or 1 : for r > 0
70
72
  # * :left, :negative : for r < 0
71
73
 
@@ -112,6 +114,7 @@ module Statsample
112
114
  (rv1v2-(rv1con*rv2con)).quo(Math::sqrt(1-rv1con**2) * Math::sqrt(1-rv2con**2))
113
115
 
114
116
  end
117
+
115
118
  # Covariance matrix.
116
119
  # Order of rows and columns depends on Dataset#fields order
117
120
 
@@ -139,7 +142,8 @@ module Statsample
139
142
  end
140
143
  end
141
144
  end
142
- # Retrieves the n valid pairwise
145
+
146
+ # Retrieves the n valid pairwise.
143
147
  def n_valid_matrix(ds)
144
148
  ds.collect_matrix do |row,col|
145
149
  if row==col
@@ -150,7 +154,8 @@ module Statsample
150
154
  end
151
155
  end
152
156
  end
153
- # Matrix of correlation probability
157
+
158
+ # Matrix of correlation probabilities.
154
159
  # Order of rows and columns depends on Dataset#fields order
155
160
 
156
161
  def correlation_probability_matrix(ds, tails=:both)
@@ -162,6 +167,7 @@ module Statsample
162
167
  end
163
168
  Matrix.rows(rows)
164
169
  end
170
+
165
171
  # Spearman ranked correlation coefficient between 2 vectors
166
172
  def spearman(v1,v2)
167
173
  v1a,v2a=Statsample.only_valid(v1,v2)
@@ -218,16 +224,16 @@ module Statsample
218
224
  rs=matrix.row_size
219
225
  cs=matrix.column_size
220
226
  conc=disc=ties_x=ties_y=0
221
- (0...(rs-1)).each {|x|
222
- (0...(cs-1)).each{|y|
223
- ((x+1)...rs).each{|x2|
224
- ((y+1)...cs).each{|y2|
227
+ (0...(rs-1)).each do |x|
228
+ (0...(cs-1)).each do |y|
229
+ ((x+1)...rs).each do |x2|
230
+ ((y+1)...cs).each do |y2|
225
231
  # #p sprintf("%d:%d,%d:%d",x,y,x2,y2)
226
232
  conc+=matrix[x,y]*matrix[x2,y2]
227
- }
228
- }
229
- }
230
- }
233
+ end
234
+ end
235
+ end
236
+ end
231
237
  (0...(rs-1)).each {|x|
232
238
  (1...(cs)).each{|y|
233
239
  ((x+1)...rs).each{|x2|
@@ -27,94 +27,143 @@ module Statsample
27
27
  # }
28
28
  # end
29
29
  #
30
- module Codification
31
- class << self
32
- # Create a yaml dump for a hash, based on vectors
33
- # The keys will be vectors name on dataset and the values
34
- # will be hashes, with keys = values, for recodification
35
- #
36
- # v1=%w{a,b b,c d}.to_vector
37
- # ds={"v1"=>v1}.to_dataset
38
- # Statsample::Codification.create_yaml(ds,['v1'])
39
- # => "--- \nv1: \n a: a\n b: b\n c: c\n d: d\n"
40
- def create_yaml(dataset,vectors,sep=Statsample::SPLIT_TOKEN,io=nil)
41
- raise ArgumentError,"Array should't be empty" if vectors.size==0
42
- pro_hash=vectors.inject({}){|h,v_name|
43
- raise Exception, "Vector #{v_name} doesn't exists on Dataset" if !dataset.fields.include? v_name
44
- v=dataset[v_name]
45
- split_data=v.splitted(sep)
46
- factors=split_data.flatten.uniq.compact.sort.inject({}) {|ac,val| ac[val]=val;ac}
47
- h[v_name]=factors
48
- h
49
- }
50
- YAML.dump(pro_hash,io)
51
- end
52
- def inverse_hash(h,sep=Statsample::SPLIT_TOKEN)
53
- h.inject({}) {|a,v|
54
- v[1].split(sep).each {|val|
55
- a[val]||=[]
56
- a[val].push(v[0])
57
- }
58
- a
59
- }
60
- end
61
- def dictionary(h,sep=Statsample::SPLIT_TOKEN)
62
- h.inject({}) {|a,v|
63
- a[v[0]]=v[1].split(sep)
64
- a
65
- }
66
- end
67
- def recode_vector(v,h,sep=Statsample::SPLIT_TOKEN)
68
- dict=dictionary(h,sep)
69
- new_data=v.splitted(sep)
70
- recoded=new_data.collect{|c|
71
- if c.nil?
72
- nil
73
- else
74
- c.collect{|value|
75
- dict[value]
76
- }.flatten.uniq
77
- end
78
- }
79
- end
80
- def recode_dataset_simple!(dataset,yaml,sep=Statsample::SPLIT_TOKEN)
81
- _recode_dataset(dataset,yaml,sep,false)
82
- end
83
- def recode_dataset_split!(dataset,yaml,sep=Statsample::SPLIT_TOKEN)
84
- _recode_dataset(dataset,yaml,sep,true)
85
- end
86
-
87
- def _recode_dataset(dataset,yaml,sep=Statsample::SPLIT_TOKEN,split=false)
88
- h=YAML::load(yaml)
89
- v_names||=h.keys
90
- v_names.each do |v_name|
91
- raise Exception, "Vector #{v_name} doesn't exists on Dataset" if !dataset.fields.include? v_name
92
- recoded=recode_vector(dataset[v_name],h[v_name],sep).collect { |c|
93
- if c.nil?
94
- nil
95
- else
96
- c.join(sep)
97
- end
98
- }.to_vector
99
- if(split)
100
- recoded.split_by_separator(sep).each {|k,v|
101
- dataset[v_name+"_"+k]=v
102
- }
103
- else
104
- dataset[v_name+"_recoded"]=recoded
105
- end
106
- end
107
- end
108
- def verify(yaml,v_names=nil,sep=Statsample::SPLIT_TOKEN,io=$>)
109
- require 'pp'
110
- h=YAML::load(yaml)
111
- v_names||=h.keys
112
- v_names.each{|v_name|
113
- inverse=inverse_hash(h[v_name],sep)
114
- io.puts "Vector: #{v_name}"
115
- YAML.dump(inverse.sort,io)
116
- }
30
+ module Codification
31
+ class << self
32
+ # Create a hash, based on vectors, to create the dictionary.
33
+ # The keys will be vectors name on dataset and the values
34
+ # will be hashes, with keys = values, for recodification
35
+ def create_hash(dataset, vectors, sep=Statsample::SPLIT_TOKEN)
36
+ raise ArgumentError,"Array should't be empty" if vectors.size==0
37
+ pro_hash=vectors.inject({}){|h,v_name|
38
+ raise Exception, "Vector #{v_name} doesn't exists on Dataset" if !dataset.fields.include? v_name
39
+ v=dataset[v_name]
40
+ split_data=v.splitted(sep).flatten.collect {|c| c.to_s}.find_all {|c| !c.nil?}
41
+
42
+ factors=split_data.uniq.compact.sort.inject({}) {|ac,val| ac[val]=val;ac }
43
+ h[v_name]=factors
44
+ h
45
+ }
46
+ pro_hash
47
+ end
48
+ # Create a yaml to create a dictionary, based on vectors
49
+ # The keys will be vectors name on dataset and the values
50
+ # will be hashes, with keys = values, for recodification
51
+ #
52
+ # v1=%w{a,b b,c d}.to_vector
53
+ # ds={"v1"=>v1}.to_dataset
54
+ # Statsample::Codification.create_yaml(ds,['v1'])
55
+ # => "--- \nv1: \n a: a\n b: b\n c: c\n d: d\n"
56
+ def create_yaml(dataset, vectors, io=nil, sep=Statsample::SPLIT_TOKEN)
57
+ pro_hash=create_hash(dataset, vectors, sep)
58
+ YAML.dump(pro_hash,io)
59
+ end
60
+ # Create a excel to create a dictionary, based on vectors.
61
+ # Raises an error if filename exists
62
+ # The rows will be:
63
+ # * field: name of vector
64
+ # * original: original name
65
+ # * recoded: new code
66
+
67
+ def create_excel(dataset, vectors, filename, sep=Statsample::SPLIT_TOKEN)
68
+ require 'spreadsheet'
69
+ if File.exists?(filename)
70
 + raise "Exists a file named #{filename}. Delete ir before overwrite."
71
+ end
72
+ book = Spreadsheet::Workbook.new
73
+ sheet = book.create_worksheet
74
+ sheet.row(0).concat(%w{field original recoded})
75
+ i=1
76
+ create_hash(dataset, vectors, sep).sort.each do |field, inner_hash|
77
+ inner_hash.sort.each do |k,v|
78
+ sheet.row(i).concat([field.dup,k.dup,v.dup])
79
+ i+=1
80
+ end
81
+ end
82
+ book.write(filename)
83
+ end
84
+ # From a excel generates a dictionary hash
85
+ # to use on recode_dataset_simple!() or recode_dataset_split!().
86
+ #
87
+ def excel_to_recoded_hash(filename)
88
+ require 'spreadsheet'
89
+ h={}
90
+ book = Spreadsheet.open filename
91
+ sheet= book.worksheet 0
92
+ row_i=0
93
+ sheet.each do |row|
94
+ row_i+=1
95
+ next if row_i==1 or row[0].nil? or row[1].nil? or row[2].nil?
96
+ h[row[0]]={} if h[row[0]].nil?
97
+ h[row[0]][row[1]]=row[2]
98
+ end
99
+ h
100
+ end
101
+
102
+ def inverse_hash(h, sep=Statsample::SPLIT_TOKEN)
103
+ h.inject({}) do |a,v|
104
+ v[1].split(sep).each do |val|
105
+ a[val]||=[]
106
+ a[val].push(v[0])
107
+ end
108
+ a
109
+ end
110
+ end
111
+
112
+ def dictionary(h, sep=Statsample::SPLIT_TOKEN)
113
+ h.inject({}) {|a,v| a[v[0]]=v[1].split(sep); a }
114
+ end
115
+
116
+ def recode_vector(v,h,sep=Statsample::SPLIT_TOKEN)
117
+ dict=dictionary(h,sep)
118
+ new_data=v.splitted(sep)
119
+ recoded=new_data.collect do |c|
120
+ if c.nil?
121
+ nil
122
+ else
123
+ c.collect{|value| dict[value] }.flatten.uniq
124
+ end
125
+ end
126
+ end
127
+ def recode_dataset_simple!(dataset, dictionary_hash ,sep=Statsample::SPLIT_TOKEN)
128
+ _recode_dataset(dataset,dictionary_hash ,sep,false)
129
+ end
130
+ def recode_dataset_split!(dataset, dictionary_hash, sep=Statsample::SPLIT_TOKEN)
131
+ _recode_dataset(dataset, dictionary_hash, sep,true)
132
+ end
133
+
134
+ def _recode_dataset(dataset, h , sep=Statsample::SPLIT_TOKEN, split=false)
135
+ v_names||=h.keys
136
+ v_names.each do |v_name|
137
+ raise Exception, "Vector #{v_name} doesn't exists on Dataset" if !dataset.fields.include? v_name
138
+ recoded=recode_vector(dataset[v_name], h[v_name],sep).collect { |c|
139
+ if c.nil?
140
+ nil
141
+ else
142
+ c.join(sep)
143
+ end
144
+ }.to_vector
145
+ if(split)
146
+ recoded.split_by_separator(sep).each {|k,v|
147
+ dataset[v_name+"_"+k]=v
148
+ }
149
+ else
150
+ dataset[v_name+"_recoded"]=recoded
117
151
  end
152
+ end
153
+ end
154
+
155
+
156
+ def verify(h, v_names=nil,sep=Statsample::SPLIT_TOKEN,io=$>)
157
+ require 'pp'
158
+ v_names||=h.keys
159
+ v_names.each{|v_name|
160
+ inverse=inverse_hash(h[v_name],sep)
161
+ io.puts "- Field: #{v_name}"
162
+ inverse.sort{|a,b| -(a[1].count<=>b[1].count)}.each {|k,v|
163
+ io.puts " - \"#{k}\" (#{v.count}) :\n -'"+v.join("\n -'")+"'"
164
+ }
165
+ }
118
166
  end
119
167
  end
168
+ end
120
169
  end