RubyGems - statsample - Versions diffs - 1.5.0 → 2.0.0 - Mend

statsample 1.5.0 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (114) hide show

checksums.yaml +4 -4
data/.build.sh +15 -0
data/.gitignore +1 -0
data/.travis.yml +19 -7
data/CONTRIBUTING.md +33 -0
data/History.txt +5 -0
data/README.md +41 -53
data/benchmarks/correlation_matrix_15_variables.rb +6 -5
data/benchmarks/correlation_matrix_5_variables.rb +6 -5
data/benchmarks/correlation_matrix_methods/correlation_matrix.rb +23 -26
data/examples/boxplot.rb +17 -5
data/examples/correlation_matrix.rb +36 -7
data/examples/dataset.rb +25 -5
data/examples/dominance_analysis.rb +8 -7
data/examples/dominance_analysis_bootstrap.rb +16 -11
data/examples/histogram.rb +16 -2
data/examples/icc.rb +5 -6
data/examples/levene.rb +17 -3
data/examples/multiple_regression.rb +6 -3
data/examples/parallel_analysis.rb +11 -6
data/examples/polychoric.rb +26 -13
data/examples/principal_axis.rb +8 -4
data/examples/reliability.rb +10 -10
data/examples/scatterplot.rb +8 -0
data/examples/t_test.rb +7 -0
data/examples/u_test.rb +10 -2
data/examples/vector.rb +9 -6
data/examples/velicer_map_test.rb +12 -8
data/lib/statsample.rb +13 -47
data/lib/statsample/analysis/suite.rb +1 -1
data/lib/statsample/anova/oneway.rb +6 -6
data/lib/statsample/anova/twoway.rb +26 -24
data/lib/statsample/bivariate.rb +78 -61
data/lib/statsample/bivariate/pearson.rb +2 -2
data/lib/statsample/codification.rb +45 -32
data/lib/statsample/converter/csv.rb +15 -53
data/lib/statsample/converter/spss.rb +6 -5
data/lib/statsample/converters.rb +50 -211
data/lib/statsample/crosstab.rb +26 -25
data/lib/statsample/daru.rb +117 -0
data/lib/statsample/dataset.rb +70 -942
data/lib/statsample/dominanceanalysis.rb +16 -17
data/lib/statsample/dominanceanalysis/bootstrap.rb +26 -28
data/lib/statsample/factor/parallelanalysis.rb +17 -19
data/lib/statsample/factor/pca.rb +21 -20
data/lib/statsample/factor/principalaxis.rb +3 -3
data/lib/statsample/graph/boxplot.rb +8 -16
data/lib/statsample/graph/histogram.rb +4 -4
data/lib/statsample/graph/scatterplot.rb +8 -7
data/lib/statsample/histogram.rb +128 -119
data/lib/statsample/matrix.rb +20 -16
data/lib/statsample/multiset.rb +39 -38
data/lib/statsample/regression.rb +3 -3
data/lib/statsample/regression/multiple.rb +8 -10
data/lib/statsample/regression/multiple/alglibengine.rb +96 -89
data/lib/statsample/regression/multiple/baseengine.rb +32 -32
data/lib/statsample/regression/multiple/gslengine.rb +33 -36
data/lib/statsample/regression/multiple/matrixengine.rb +7 -9
data/lib/statsample/regression/multiple/rubyengine.rb +39 -41
data/lib/statsample/reliability.rb +23 -25
data/lib/statsample/reliability/icc.rb +8 -7
data/lib/statsample/reliability/multiscaleanalysis.rb +14 -12
data/lib/statsample/reliability/scaleanalysis.rb +58 -60
data/lib/statsample/reliability/skillscaleanalysis.rb +34 -29
data/lib/statsample/resample.rb +1 -1
data/lib/statsample/shorthand.rb +29 -25
data/lib/statsample/test/kolmogorovsmirnov.rb +5 -3
data/lib/statsample/test/levene.rb +28 -27
data/lib/statsample/test/t.rb +7 -9
data/lib/statsample/test/umannwhitney.rb +28 -28
data/lib/statsample/test/wilcoxonsignedrank.rb +45 -43
data/lib/statsample/vector.rb +70 -1013
data/lib/statsample/version.rb +1 -1
data/statsample.gemspec +12 -16
data/test/helpers_tests.rb +1 -1
data/test/test_analysis.rb +17 -17
data/test/test_anova_contrast.rb +6 -6
data/test/test_anovatwowaywithdataset.rb +8 -8
data/test/test_anovawithvectors.rb +8 -8
data/test/test_awesome_print_bug.rb +1 -1
data/test/test_bartlettsphericity.rb +4 -4
data/test/test_bivariate.rb +48 -43
data/test/test_codification.rb +33 -33
data/test/test_crosstab.rb +9 -9
data/test/test_dataset.rb +28 -458
data/test/test_factor.rb +46 -38
data/test/test_factor_pa.rb +22 -13
data/test/test_ggobi.rb +4 -4
data/test/test_gsl.rb +4 -4
data/test/test_histogram.rb +3 -3
data/test/test_matrix.rb +13 -13
data/test/test_multiset.rb +103 -91
data/test/test_regression.rb +57 -52
data/test/test_reliability.rb +55 -45
data/test/test_reliability_icc.rb +8 -8
data/test/test_reliability_skillscale.rb +26 -24
data/test/test_resample.rb +1 -1
data/test/test_statistics.rb +3 -13
data/test/test_stest.rb +9 -9
data/test/test_stratified.rb +3 -3
data/test/test_test_t.rb +12 -12
data/test/test_umannwhitney.rb +2 -2
data/test/test_vector.rb +76 -613
data/test/test_wilcoxonsignedrank.rb +4 -4
metadata +57 -28
data/lib/statsample/rserve_extension.rb +0 -20
data/lib/statsample/vector/gsl.rb +0 -106
data/test/fixtures/repeated_fields.csv +0 -7
data/test/fixtures/scientific_notation.csv +0 -4
data/test/fixtures/test_csv.csv +0 -7
data/test/fixtures/test_xls.xls +0 -0
data/test/test_csv.rb +0 -63
data/test/test_rserve_extension.rb +0 -42
data/test/test_xls.rb +0 -52

data/lib/statsample/crosstab.rb CHANGED

@@ -8,24 +8,25 @@ module Statsample
     attr_reader :v_rows, :v_cols
     attr_accessor :row_label, :column_label, :name, :percentage_row, :percentage_column, :percentage_total
     def initialize(v1, v2, opts=Hash.new)
-      #raise ArgumentError, "Both arguments should be Vectors" unless v1.is_a? Statsample::Vector and v2.is_a? Statsample::Vector
       raise ArgumentError, "Vectors should be the same size" unless v1.size==v2.size
-      @v_rows, @v_cols=Statsample.only_valid_clone(v1.to_vector,v2.to_vector)
-      @cases=@v_rows.size
-      @row_label=v1.name
-      @column_label=v2.name
-      @name=nil
+      @v_rows, @v_cols = Statsample.only_valid_clone(
+        Daru::Vector.new(v1),
+        Daru::Vector.new(v2))
+      @cases          = @v_rows.size
+      @row_label      = v1.name
+      @column_label   = v2.name
+      @name           = nil
       @percentage_row = @percentage_column = @percentage_total=false
-      opts.each{|k,v|
+      opts.each do |k,v|
         self.send("#{k}=",v) if self.respond_to? k
-      }
-      @name||=_("Crosstab %s - %s") % [@row_label, @column_label]
+      end
+      @name ||= _("Crosstab %s - %s") % [@row_label, @column_label]
     end
     def rows_names
-      @v_rows.factors.sort
+      @v_rows.factors.sort.reset_index!
     end
     def cols_names
-      @v_cols.factors.sort
+      @v_cols.factors.sort.reset_index!
     end
     def rows_total
       @v_rows.frequencies
@@ -35,18 +36,18 @@ module Statsample
     end
     def frequencies
-      base=rows_names.inject([]){|s,row|
-        s+=cols_names.collect{|col| [row,col]}
-      }.inject({}) {|s,par|
+      base = rows_names.inject([]) do |s,row|
+        s += cols_names.collect { |col| [row,col] }
+      end.inject({}) do |s,par|
         s[par]=0
         s
-      }
-      base.update(Statsample::vector_cols_matrix(@v_rows,@v_cols).to_a.to_vector.frequencies)
+      end
+      base.update(Daru::Vector.new(Statsample::vector_cols_matrix(@v_rows,@v_cols).to_a).frequencies)
     end
     def to_matrix
-      f=frequencies
-      rn=rows_names
-      cn=cols_names
+      f  = frequencies
+      rn = rows_names
+      cn = cols_names
       Matrix.rows(rn.collect{|row|
           cn.collect{|col| f[[row,col]]}
       })
@@ -67,8 +68,8 @@ module Statsample
     end
     # Chi square, based on expected and real matrix
     def chi_square
-        require 'statsample/test'
-        Statsample::Test.chi_square(self.to_matrix, matrix_expected)
+      require 'statsample/test'
+      Statsample::Test.chi_square(self.to_matrix, matrix_expected)
     end
     # Useful to obtain chi square
     def matrix_expected
@@ -98,10 +99,10 @@ module Statsample
         generator.text(_("Rows: %s") % @row_label) unless @row_label.nil?
         generator.text(_("Columns: %s") % @column_label) unless @column_label.nil?
-        t=ReportBuilder::Table.new(:name=>@name+" - "+_("Raw"), :header=>[""]+cols_names.collect {|c| @v_cols.labeling(c)}+[_("Total")])
+        t=ReportBuilder::Table.new(:name=>@name+" - "+_("Raw"), :header=>[""]+cols_names.collect {|c| @v_cols.index_of(c)}+[_("Total")])
         rn.each do |row|
           total_row=0
-          t_row=[@v_rows.labeling(row)]
+          t_row=[@v_rows.index_of(row)]
           cn.each do |col|
             data=fq[[row,col]]
             total_row+=fq[[row,col]]
@@ -148,9 +149,9 @@ module Statsample
         when :total   then  _("% Total")
       end
-      t=ReportBuilder::Table.new(:name=>@name+" - "+_(type_name), :header=>[""]+cols_names.collect {|c| @v_cols.labeling(c) } + [_("Total")])
+      t=ReportBuilder::Table.new(:name=>@name+" - "+_(type_name), :header=>[""]+cols_names.collect {|c| @v_cols.index_of(c) } + [_("Total")])
         rn.each do |row|
-          t_row=[@v_rows.labeling(row)]
+          t_row=[@v_rows.index_of(row)]
           cn.each do |col|
             total=case type
               when :row     then  rt[row]

data/lib/statsample/daru.rb ADDED

@@ -0,0 +1,117 @@
+# Opening the Daru::DataFrame class for adding methods to convert from
+# data structures to specialized statsample data structues like Multiset.
+module Daru
+  class Vector
+    def histogram(bins=10)
+      type == :numeric or raise TypeError, "Only numeric Vectors can do this operation."
+      if bins.is_a? Array
+        h = Statsample::Histogram.alloc(bins)
+      else
+        # ugly patch. The upper limit for a bin has the form
+        # x < range
+        #h=Statsample::Histogram.new(self, bins)
+        valid = only_valid
+        min,max=Statsample::Util.nice(valid.min,valid.max)
+        # fix last data
+        if max == valid.max
+          max += 1e-10
+        end
+        h = Statsample::Histogram.alloc(bins,[min,max])
+        # Fix last bin
+      end
+      h.increment(valid)
+      h
+    end
+    # Variance of p, according to poblation size
+    def variance_proportion(n_poblation, v=1)
+      Statsample::proportion_variance_sample(self.proportion(v), @valid_data.size, n_poblation)
+    end
+    # Variance of p, according to poblation size
+    def variance_total(n_poblation, v=1)
+      Statsample::total_variance_sample(self.proportion(v), @valid_data.size, n_poblation)
+    end
+    def proportion_confidence_interval_t(n_poblation,margin=0.95,v=1)
+      Statsample::proportion_confidence_interval_t(proportion(v), @valid_data.size, n_poblation, margin)
+    end
+    def proportion_confidence_interval_z(n_poblation,margin=0.95,v=1)
+      Statsample::proportion_confidence_interval_z(proportion(v), @valid_data.size, n_poblation, margin)
+    end
+  end
+  class DataFrame
+    def crosstab(v1,v2,opts={})
+      Statsample::Crosstab.new(self[v1], self[v2],opts)
+    end
+    # Functions for converting to Statsample::Multiset
+    def to_multiset_by_split(*vecs)
+      require 'statsample/multiset'
+      if vecs.size == 1
+        to_multiset_by_split_one_field(vecs[0])
+      else
+        to_multiset_by_split_multiple_fields(*vecs)
+      end
+    end
+    # Creates a Statsample::Multiset, using one field
+    def to_multiset_by_split_one_field(field)
+      raise ArgumentError,"Should use a correct field name" if
+        !@vectors.include? field
+      factors = self[field].factors
+      ms      = Statsample::Multiset.new_empty_vectors(@vectors.to_a, factors)
+      each_row do |row|
+        ms[row[field]].add_row(row)
+      end
+      #puts "Ingreso a los dataset"
+      ms.datasets.each do |k,ds|
+        ds.update
+        ds.rename self[field].index_of(k)
+      end
+      ms
+    end
+    def to_multiset_by_split_multiple_fields(*fields)
+      fields.map!(&:to_sym)
+      factors_total=nil
+      fields.each do |f|
+        if factors_total.nil?
+          factors_total = self[f].factors.collect { |c| [c] }
+        else
+          suma = []
+          factors = self[f].factors
+          factors_total.each do |f1|
+            factors.each do |f2|
+              suma.push(f1+[f2])
+            end
+          end
+          factors_total = suma
+        end
+      end
+      ms = Statsample::Multiset.new_empty_vectors(vectors.to_a, factors_total)
+      p1 = eval "Proc.new {|c| ms[["+fields.collect{|f| "c['#{f}'.to_sym]"}.join(",")+"]].add_row(c) }"
+      each_row { |r| p1.call(r) }
+      ms.datasets.each do |k,ds|
+        ds.update
+        ds.rename(
+          fields.size.times.map do |i|
+            f  = fields[i]
+            sk = k[i]
+            self[f].index_of(sk)
+          end.join("-")
+        )
+      end
+      ms
+    end
+  end
+end

data/lib/statsample/dataset.rb CHANGED

@@ -2,9 +2,11 @@ require 'statsample/vector'
 class Hash
   # Creates a Statsample::Dataset based on a Hash
-  def to_dataset(*args)
+  def to_dataframe(*args)
     Statsample::Dataset.new(self, *args)
   end
+  alias :to_dataset :to_dataframe
 end
 class Array
@@ -17,990 +19,116 @@ class Array
 end
 module Statsample
-  class DatasetException < RuntimeError # :nodoc:
-    attr_reader :ds,:exp
-    def initialize(ds,e)
-      @ds=ds
-      @exp=e
-    end
-    def to_s
-      m="Error on iteration: "+@exp.message+"\n"+@exp.backtrace.join("\n")
-      m+="\nRow ##{@ds.i}:#{@ds.case_as_hash(@ds.i)}" unless @ds.i.nil?
-      m
+  # == Deprecation Warning
+  #
+  # This class will soon be replaced by Daru::DataFrame in the
+  # next release. Please see the daru docs at https://github.com/v0dro/daru
+  # for more details
+  class Dataset < Daru::DataFrame
+    # Ordered ids of vectors
+    def fields
+      $stderr.puts "WARNING: Statsample::Dataset and Statsample::Vector have been deprecated in favor of Daru::DataFrame and Daru::Vector. Please switch to using Daru::DataFrame#vectors.\n"
+      @vectors.to_a
     end
-  end
-  # Set of cases with values for one or more variables,
-  # analog to a dataframe on R or a standard data file of SPSS.
-  # Every vector has <tt>#field</tt> name, which represent it. By default,
-  # the vectors are ordered by it field name, but you can change it
-  # the fields order manually.
-  # The Dataset work as a Hash, with keys are field names
-  # and values are Statsample::Vector
-  #
-  #
-  # ==Usage
-  # Create a empty dataset:
-  #   Dataset.new()
-  # Create a dataset with three empty vectors, called <tt>v1</tt>, <tt>v2</tt> and <tt>v3</tt>:
-  #   Dataset.new(%w{v1 v2 v3})
-  # Create a dataset with two vectors, called <tt>v1</tt>
-  # and <tt>v2</tt>:
-  #   Dataset.new({'v1'=>%w{1 2 3}.to_vector, 'v2'=>%w{4 5 6}.to_vector})
-  # Create a dataset with two given vectors (v1 and v2),
-  # with vectors on inverted order:
-  #   Dataset.new({'v2'=>v2,'v1'=>v1},['v2','v1'])
-  #
-  # The fast way to create a dataset uses Hash#to_dataset, with
-  # field order  as arguments
-  #   v1 = [1,2,3].to_numeric
-  #   v2 = [1,2,3].to_numeric
-  #   ds = {'v1'=>v2, 'v2'=>v2}.to_dataset(%w{v2 v1})
-  class Dataset
-    include Writable
-    include Summarizable
-    # Hash of Statsample::Vector
-    attr_reader :vectors
-    # Ordered ids of vectors
-    attr_reader :fields
-    # Name of dataset
-    attr_accessor :name
-    # Number of cases
-    attr_reader :cases
-    # Location of pointer on enumerations methods (like #each)
-    attr_reader :i
+    def name= new_name
+      $stderr.puts "WARNING: Statsample::Dataset and Statsample::Vector have been deprecated in favor of Daru::DataFrame and Daru::Vector. Please switch to using Daru::DataFrame#rename.\n"
-    # Generates a new dataset, using three vectors
-    # - Rows
-    # - Columns
-    # - Values
-    #
-    # For example, you have these values
-    #
-    #   x   y   v
-    #   a   a   0
-    #   a   b   1
-    #   b   a   1
-    #   b   b   0
-    #
-    # You obtain
-    #   id  a   b
-    #    a  0   1
-    #    b  1   0
-    #
-    # Useful to process outputs from databases
-    def self.crosstab_by_asignation(rows,columns,values)
-      raise "Three vectors should be equal size" if rows.size!=columns.size or rows.size!=values.size
-      cols_values=columns.factors
-      cols_n=cols_values.size
-      h_rows=rows.factors.inject({}){|a,v| a[v]=cols_values.inject({}){
-        |a1,v1| a1[v1]=nil; a1
-        }
-        ;a}
-      values.each_index{|i|
-        h_rows[rows[i]][columns[i]]=values[i]
-      }
-      ds=Dataset.new(["_id"]+cols_values)
-      cols_values.each{|c|
-        ds[c].type=values.type
-      }
-      rows.factors.each {|row|
-        n_row=Array.new(cols_n+1)
-        n_row[0]=row
-          cols_values.each_index {|i|
-            n_row[i+1]=h_rows[row][cols_values[i]]
-        }
-        ds.add_case_array(n_row)
-      }
-      ds.update_valid_data
-      ds
+      rename new_name
     end
-    # Return true if any vector has missing data
-    def has_missing_data?
-      @vectors.any? {|k,v| v.has_missing_data?}
+    # Number of cases
+    def cases
+      $stderr.puts "WARNING: Statsample::Dataset and Statsample::Vector have been deprecated in favor of Daru::DataFrame and Daru::Vector. Please switch to using Daru::DataFrame#nrows.\n"
+      nrows
     end
-    # Return a nested hash using fields as keys and
-    # an array constructed of hashes with other values.
-    # If block provided, is used to provide the
-    # values, with parameters +row+ of dataset,
-    # +current+ last hash on hierarchy and
-    # +name+ of the key to include
-    def nest(*tree_keys,&block)
-      tree_keys=tree_keys[0] if tree_keys[0].is_a? Array
-      out=Hash.new
-      each do |row|
-        current=out
-        # Create tree
-        tree_keys[0,tree_keys.size-1].each do |f|
-          root=row[f]
-          current[root]||=Hash.new
-          current=current[root]
-        end
-        name=row[tree_keys.last]
-        if !block
-          current[name]||=Array.new
-          current[name].push(row.delete_if{|key,value| tree_keys.include? key})
-        else
-          current[name]=block.call(row, current,name)
-        end
-      end
-      out
+    # == Deprecation Warning
+    #
+    # This class will soon be replaced by Daru::DataFrame in the
+    # next release. Use Daru::DataFrame.crosstab_by_assignation
+    # for the same effect. Please see the daru docs at
+    # https://github.com/v0dro/daru for more details.
+    def self.crosstab_by_assignation(rows,columns,values)
+      ds = super(rows, columns, values)
+      Dataset.new ds.to_hash
     end
-    # Creates a new dataset. A dataset is a set of ordered named vectors
-    # of the same size.
-    #
-    # [vectors] With an array, creates a set of empty vectors named as
-    # values on the array. With a hash, each Vector is assigned as
-    # a variable of the Dataset named as its key
-    # [fields]  Array of names for vectors. Is only used for set the
-    # order of variables. If empty, vectors keys on alfabethic order as
-    # used as fields.
+    # == Deprecation Warning
+    #
+    # This class will soon be replaced by Daru::DataFrame in the
+    # next release. Use Daru::DataFrame.new for the same effect.
+    # Please see the daru docs at https://github.com/v0dro/daru for more details.
     def initialize(vectors={}, fields=[])
-      @@n_dataset||=0
-      @@n_dataset+=1
-      @name=_("Dataset %d") % @@n_dataset
-      @cases=0
-      @gsl=nil
-      @i=nil
+      $stderr.puts "WARNING: Statsample::Dataset and Statsample::Vector have been deprecated in favor of Daru::DataFrame and Daru::Vector. Please switch to using that.\n"
       if vectors.instance_of? Array
         @fields=vectors.dup
-        @vectors=vectors.inject({}){|a,x| a[x]=Statsample::Vector.new(); a}
+        super({}, order: @fields.map { |e| e.respond_to?(:to_sym) ? e.to_sym : e })
       else
         # Check vectors
-        @vectors=vectors
-        @fields=fields
-        check_order
-        check_length
-      end
-    end
-    #
-    # Creates a copy of the given dataset, deleting all the cases with
-    # missing data on one of the vectors.
-    #
-    # @param array of fields to include. No value include all fields
-    #
-    def dup_only_valid(*fields_to_include)
-      if fields_to_include.size==1 and fields_to_include[0].is_a? Array
-        fields_to_include=fields_to_include[0]
-      end
-      fields_to_include=@fields if fields_to_include.size==0
-      if fields_to_include.any? {|f| @vectors[f].has_missing_data?}
-        ds=Dataset.new(fields_to_include)
-        fields_to_include.each {|f| ds[f].type=@vectors[f].type}
-        each {|row|
-          unless fields_to_include.any? {|f| @vectors[f].has_missing_data? and !@vectors[f].is_valid? row[f]}
-            row_2=fields_to_include.inject({}) {|ac,v| ac[v]=row[v]; ac}
-            ds.add_case(row_2)
-          end
-        }
-      else
-        ds=dup fields_to_include
-      end
-      ds.name= self.name
-      ds
-    end
-    #
-    # Returns a duplicate of the Dataset.
-    # All vectors are copied, so any modification on new
-    # dataset doesn't affect original dataset's vectors.
-    # If fields given as parameter, only include those vectors.
-    #
-    # @param array of fields to include. No value include all fields
-    # @return {Statsample::Dataset}
-    def dup(*fields_to_include)
-      if fields_to_include.size==1 and fields_to_include[0].is_a? Array
-        fields_to_include=fields_to_include[0]
-      end
-      fields_to_include=@fields if fields_to_include.size==0
-      vectors={}
-      fields=[]
-      fields_to_include.each{|f|
-        raise "Vector #{f} doesn't exists" unless @vectors.has_key? f
-        vectors[f]=@vectors[f].dup
-        fields.push(f)
-      }
-      ds=Dataset.new(vectors,fields)
-      ds.name= self.name
-      ds
-    end
-    # Returns an array with the fields from first argumen to last argument
-    def from_to(from,to)
-      raise ArgumentError, "Field #{from} should be on dataset" if !@fields.include? from
-      raise ArgumentError, "Field #{to} should be on dataset" if !@fields.include? to
-      @fields.slice(@fields.index(from)..@fields.index(to))
-    end
-    # Returns (when possible) a cheap copy of dataset.
-    # If no vector have missing values, returns original vectors.
-    # If missing values presents, uses Dataset.dup_only_valid.
-    #
-    # @param array of fields to include. No value include all fields
-    # @return {Statsample::Dataset}
-    def clone_only_valid(*fields_to_include)
-      if fields_to_include.size==1 and fields_to_include[0].is_a? Array
-        fields_to_include=fields_to_include[0]
-      end
-      fields_to_include=@fields.dup if fields_to_include.size==0
-      if fields_to_include.any? {|v| @vectors[v].has_missing_data?}
-        dup_only_valid(fields_to_include)
-      else
-        clone(fields_to_include)
-      end
-    end
-    # Returns a shallow copy of Dataset.
-    # Object id will be distinct, but @vectors will be the same.
-    # @param array of fields to include. No value include all fields
-    # @return {Statsample::Dataset}
-    def clone(*fields_to_include)
-      if fields_to_include.size==1 and fields_to_include[0].is_a? Array
-        fields_to_include=fields_to_include[0]
-      end
-      fields_to_include=@fields.dup if fields_to_include.size==0
-      ds=Dataset.new
-      fields_to_include.each{|f|
-        raise "Vector #{f} doesn't exists" unless @vectors.has_key? f
-        ds[f]=@vectors[f]
-      }
-      ds.fields=fields_to_include
-      ds.name=@name
-      ds.update_valid_data
-      ds
-    end
-    # Creates a copy of the given dataset, without data on vectors
-    #
-    # @return {Statsample::Dataset}
-    def dup_empty
-      vectors=@vectors.inject({}) {|a,v|
-        a[v[0]]=v[1].dup_empty
-        a
-      }
-      Dataset.new(vectors,@fields.dup)
-    end
-    # Merge vectors from two datasets
-    # In case of name collition, the vectors names are changed to
-    # x_1, x_2 ....
-    #
-    # @return {Statsample::Dataset}
-    def merge(other_ds)
-      raise "Cases should be equal (this:#{@cases}; other:#{other_ds.cases}" unless @cases==other_ds.cases
-      types = @fields.collect{|f| @vectors[f].type} + other_ds.fields.collect{|f| other_ds[f].type}
-      new_fields = (@fields+other_ds.fields).recode_repeated
-      ds_new=Statsample::Dataset.new(new_fields)
-      new_fields.each_index{|i|
-        field=new_fields[i]
-        ds_new[field].type=types[i]
-      }
-      @cases.times {|i|
-        row=case_as_array(i)+other_ds.case_as_array(i)
-        ds_new.add_case_array(row)
-      }
-      ds_new.update_valid_data
-      ds_new
-    end
-    # Join 2 Datasets by given fields
-    # type is one of :left and :inner, default is :left
-    #
-    # @return {Statsample::Dataset}
-    def join(other_ds,fields_1=[],fields_2=[],type=:left)
-      fields_new = other_ds.fields - fields_2
-      fields = self.fields + fields_new
-      other_ds_hash = {}
-      other_ds.each do |row|
-        key = row.select{|k,v| fields_2.include?(k)}.values
-        value = row.select{|k,v| fields_new.include?(k)}
-        if other_ds_hash[key].nil?
-          other_ds_hash[key] = [value]
-        else
-          other_ds_hash[key] << value
+        @vectors = {}
+        vectors.each do |k,v|
+          @vectors[k.respond_to?(:to_sym) ? k.to_sym : k] = v
         end
+        @fields  = fields
+        super @vectors, order: @fields.map { |e| e.respond_to?(:to_sym) ? e.to_sym : e }
       end
-      new_ds = Dataset.new(fields)
-      self.each do |row|
-        key = row.select{|k,v| fields_1.include?(k)}.values
-        new_case = row.dup
-        if other_ds_hash[key].nil?
-          if type == :left
-            fields_new.each{|field| new_case[field] = nil}
-            new_ds.add_case(new_case)
-          end
-        else
-          other_ds_hash[key].each do |new_values|
-            new_ds.add_case new_case.merge(new_values)
-          end
-        end
-      end
-      new_ds
     end
-    # Returns a dataset with standarized data.
-    #
-    # @return {Statsample::Dataset}
-    def standarize
-      ds=dup()
-      ds.fields.each do |f|
-        ds[f]=ds[f].vector_standarized
-      end
-      ds
-    end
-    # Generate a matrix, based on fields of dataset
-    #
-    # @return {::Matrix}
-    def collect_matrix
-      rows=@fields.collect{|row|
-        @fields.collect{|col|
-          yield row,col
-        }
-      }
-      Matrix.rows(rows)
+    def from_to(from,to)
+      raise NoMethodError, "This method is no longer supported. To see the vector index use Daru::DataFrame#vectors"
     end
-    # We have the same datasets if +vectors+ and +fields+ are the same
-    #
-    # @return {Boolean}
-    def ==(d2)
-      @vectors==d2.vectors and @fields==d2.fields
-    end
-    # Returns vector <tt>c</tt>
-    #
-    # @return {Statsample::Vector}
-    def col(c)
-      @vectors[c]
-    end
-    alias_method :vector, :col
-    # Equal to Dataset[<tt>name</tt>]=<tt>vector</tt>
-    #
-    # @return self
     def add_vector(name, vector)
-      raise ArgumentError, "Vector have different size" if vector.size!=@cases
-      @vectors[name]=vector
-      check_order
-      self
-    end
-    # Returns true if dataset have vector <tt>v</tt>.
-    #
-    # @return {Boolean}
-    def has_vector? (v)
-      return @vectors.has_key?(v)
-    end
-    # Creates a dataset with the random data, of a n size
-    # If n not given, uses original number of cases.
-    #
-    # @return {Statsample::Dataset}
-    def bootstrap(n=nil)
-      n||=@cases
-      ds_boot=dup_empty
-      n.times do
-        ds_boot.add_case_array(case_as_array(rand(n)))
-      end
-      ds_boot.update_valid_data
-      ds_boot
+      raise NoMethodError, "Deprecated. Use Daru::DataFrame#[]= directly."
     end
-    # Fast version of #add_case.
-    # Can only add one case and no error check if performed
-    # You SHOULD use #update_valid_data at the end of insertion cycle
-    #
-    #
     def add_case_array(v)
-      v.each_index {|i| d=@vectors[@fields[i]].data; d.push(v[i])}
+      raise NoMethodError, "Deprecated. Use Daru::DataFrame#add_row instead."
     end
-    # Insert a case, using:
-    # * Array: size equal to number of vectors and values in the same order as fields
-    # * Hash: keys equal to fields
-    # If uvd is false, #update_valid_data is not executed after
-    # inserting a case. This is very useful if you want to increase the
-    # performance on inserting many cases,  because #update_valid_data
-    # performs check on vectors and on the dataset
     def add_case(v,uvd=true)
-      case v
-      when Array
-        if (v[0].is_a? Array)
-          v.each{|subv| add_case(subv,false)}
-        else
-          raise ArgumentError, "Input array size (#{v.size}) should be equal to fields number (#{@fields.size})" if @fields.size!=v.size
-          v.each_index {|i| @vectors[@fields[i]].add(v[i],false)}
-        end
-      when Hash
-        raise ArgumentError, "Hash keys should be equal to fields #{(v.keys - @fields).join(",")}" if @fields.sort!=v.keys.sort
-        @fields.each{|f| @vectors[f].add(v[f],false)}
-      else
-        raise TypeError, 'Value must be a Array or a Hash'
-      end
-      if uvd
-        update_valid_data
-      end
-    end
-    # Check vectors and fields after inserting data. Use only
-    # after  #add_case_array or #add_case with second parameter to false
-    def update_valid_data
-      @gsl=nil
-      @fields.each{|f| @vectors[f].set_valid_data}
-      check_length
-    end
-    # Delete vector named +name+. Multiple fields accepted.
-    def delete_vector(*args)
-      if args.size==1 and args[0].is_a? Array
-        names=args[0]
-      else
-        names=args
-      end
-      names.each do |name|
-        @fields.delete(name)
-        @vectors.delete(name)
-      end
-    end
-    def add_vectors_by_split_recode(name_,join='-',sep=Statsample::SPLIT_TOKEN)
-      split=@vectors[name_].split_by_separator(sep)
-      i=1
-      split.each{|k,v|
-        new_field=name_+join+i.to_s
-        v.name=name_+":"+k
-        add_vector(new_field,v)
-        i+=1
-      }
-    end
-    def add_vectors_by_split(name,join='-',sep=Statsample::SPLIT_TOKEN)
-      split=@vectors[name].split_by_separator(sep)
-      split.each{|k,v|
-        add_vector(name+join+k,v)
-      }
-    end
-    def vector_by_calculation(type=:numeric)
-      a=[]
-      each do |row|
-        a.push(yield(row))
-      end
-      a.to_vector(type)
-    end
-    # Returns a vector with sumatory of fields
-    # if fields parameter is empty, sum all fields
-    def vector_sum(fields=nil)
-      fields||=@fields
-      vector=collect_with_index do |row, i|
-        if(fields.find{|f| !@vectors[f].data_with_nils[i]})
-          nil
-        else
-          fields.inject(0) {|ac,v| ac + row[v].to_f}
-        end
-      end
-      vector.name=_("Sum from %s") % @name
-      vector
-    end
-    # Check if #fields attribute is correct, after inserting or deleting vectors
-    def check_fields(fields)
-      fields||=@fields
-      raise "Fields #{(fields-@fields).join(", ")} doesn't exists on dataset" if (fields-@fields).size>0
-      fields
-    end
-    # Returns a vector with the numbers of missing values for a case
-    def vector_missing_values(fields=nil)
-      fields=check_fields(fields)
-      collect_with_index do |row, i|
-        fields.inject(0) {|a,v|
-          a+ ((@vectors[v].data_with_nils[i].nil?) ? 1: 0)
-        }
-      end
-    end
-    def vector_count_characters(fields=nil)
-      fields=check_fields(fields)
-      collect_with_index do |row, i|
-        fields.inject(0){|a,v|
-          a+((@vectors[v].data_with_nils[i].nil?) ? 0: row[v].to_s.size)
-        }
-      end
-    end
-    # Returns a vector with the mean for a set of fields
-    # if fields parameter is empty, return the mean for all fields
-    # if max invalid parameter > 0, returns the mean for all tuples
-    # with 0 to max_invalid invalid fields
-    def vector_mean(fields=nil, max_invalid=0)
-      a=[]
-      fields=check_fields(fields)
-      size=fields.size
-      each_with_index do |row, i |
-        # numero de invalidos
-        sum=0
-        invalids=0
-        fields.each{|f|
-          if !@vectors[f].data_with_nils[i].nil?
-            sum+=row[f].to_f
-          else
-            invalids+=1
-          end
-        }
-        if(invalids>max_invalid)
-          a.push(nil)
-        else
-          a.push(sum.quo(size-invalids))
-        end
-      end
-      a=a.to_vector(:numeric)
-      a.name=_("Means from %s") % @name
-      a
-    end
-    # Check vectors for type and size.
-    def check_length # :nodoc:
-      size=nil
-      @vectors.each do |k,v|
-        raise Exception, "Data #{v.class} is not a vector on key #{k}" if !v.is_a? Statsample::Vector
-        if size.nil?
-          size=v.size
-        else
-          if v.size!=size
-            raise Exception, "Vector #{k} have size #{v.size} and dataset have size #{size}"
-          end
-        end
-      end
-      @cases=size
-    end
-    # Retrieves each vector as [key, vector]
-    def each_vector # :yield: |key, vector|
-      @fields.each{|k| yield k, @vectors[k]}
+      raise NoMethodError, "Deprecated. Use Daru::DataFrame#add_row instead."
     end
-    if Statsample::STATSAMPLE__.respond_to?(:case_as_hash)
-      def case_as_hash(c) # :nodoc:
-        Statsample::STATSAMPLE__.case_as_hash(self,c)
-      end
-    else
-      # Retrieves case i as a hash
-      def case_as_hash(i)
-        _case_as_hash(i)
-      end
+    def update_valid_data
+      raise NoMethodError, "Deprecated. Use Daru::DataFrame#update instead. Also see Daru.lazy_update in the daru docs."
     end
-    if Statsample::STATSAMPLE__.respond_to?(:case_as_array)
-      def case_as_array(c) # :nodoc:
-        Statsample::STATSAMPLE__.case_as_array(self,c)
-      end
-    else
-      # Retrieves case i as a array, ordered on #fields order
-      def case_as_array(i)
-        _case_as_array(i)
-      end
-    end
-    def _case_as_hash(c) # :nodoc:
-      @fields.inject({}) {|a,x| a[x]=@vectors[x][c];a }
-    end
-    def _case_as_array(c) # :nodoc:
-      @fields.collect {|x| @vectors[x][c]}
+    def each_array
+      raise NoMethodError, "Deprecated. Use Daru::DataFrame#each_row instead."
     end
-    # Returns each case as a hash
-    def each
-      begin
-        @i=0
-        @cases.times {|i|
-          @i=i
-          row=case_as_hash(i)
-          yield row
-        }
-        @i=nil
-      rescue =>e
-        raise DatasetException.new(self, e)
-      end
-    end
+    def fields=(f)
+      $stderr.puts "WARNING: Deprecated. Use Daru::DataFrame#reindex_vectors! instead.\n"
-    # Returns each case as hash and index
-    def each_with_index # :yield: |case, i|
-      begin
-        @i=0
-        @cases.times{|i|
-          @i=i
-          row=case_as_hash(i)
-          yield row, i
-        }
-        @i=nil
-      rescue =>e
-        raise DatasetException.new(self, e)
-      end
+      reindex_vectors! f
     end
-    # Returns each case as an array, coding missing values as nils
-    def each_array_with_nils
-      m=fields.size
-      @cases.times {|i|
-        @i=i
-        row=Array.new(m)
-        fields.each_index{|j|
-          f=fields[j]
-          row[j]=@vectors[f].data_with_nils[i]
-        }
-        yield row
-      }
-      @i=nil
-    end
-    # Returns each case as an array
-    def each_array
-      @cases.times {|i|
-        @i=i
-        row=case_as_array(i)
-        yield row
-      }
-      @i=nil
-    end
-    # Set fields order. If you omit one or more vectors, they are
-    # ordered by alphabetic order.
-    def fields=(f)
-      @fields=f
-      check_order
-    end
-    # Check congruence between +fields+ attribute
-    # and keys on +vectors
-    def check_order #:nodoc:
-      if(@vectors.keys.sort!=@fields.sort)
-        @fields=@fields&@vectors.keys
-        @fields+=@vectors.keys.sort-@fields
-      end
-    end
     # Returns the vector named i
-    def[](i)
+    def [](i)
+      $stderr.puts "WARNING: Daru uses symbols instead of strings for naming vectors. Please switch to symbols.\n"
       if i.is_a? Range
-        fields=from_to(i.begin,i.end)
-        clone(*fields)
-      elsif i.is_a? Array
-        clone(i)
+        beg = i.begin.respond_to?(:to_sym) ? i.to_sym : i
+        en = i.end.respond_to?(:to_sym) ? i.to_sym : i
+        super(beg..en)
       else
-        raise Exception,"Vector '#{i}' doesn't exists on dataset" unless @vectors.has_key?(i)
-        @vectors[i]
+        super i.to_sym
       end
     end
-    # Retrieves a Statsample::Vector, based on the result
-    # of calculation performed on each case.
-    def collect(type=:numeric)
-      data=[]
-      each {|row|
-        data.push yield(row)
-      }
-      Statsample::Vector.new(data,type)
-    end
-    # Same as Statsample::Vector.collect, but giving case index as second parameter on yield.
-    def collect_with_index(type=:numeric)
-      data=[]
-      each_with_index {|row, i|
-        data.push(yield(row, i))
-      }
-      Statsample::Vector.new(data,type)
-    end
-    # Recode a vector based on a block
-    def recode!(vector_name)
-      0.upto(@cases-1) {|i|
-        @vectors[vector_name].data[i]=yield case_as_hash(i)
-      }
-      @vectors[vector_name].set_valid_data
-    end
-    def crosstab(v1,v2,opts={})
-      Statsample::Crosstab.new(@vectors[v1], @vectors[v2],opts)
-    end
-    def[]=(i,v)
-      if v.instance_of? Statsample::Vector
-        @vectors[i]=v
-        check_order
-      else
-        raise ArgumentError,"Should pass a Statsample::Vector"
-      end
-    end
-    # Return data as a matrix. Column are ordered by #fields and
-    # rows by orden of insertion
-    def to_matrix
-      rows=[]
-      self.each_array{|c|
-        rows.push(c)
-      }
-      Matrix.rows(rows)
+    def []=(i,v)
+      $stderr.puts "WARNING: Daru uses symbols instead of strings for naming vectors. Please switch to symbols.\n"
+      super i, v
     end
     if Statsample.has_gsl?
       def clear_gsl
-        @gsl=nil
+        raise NoMethodError, "This method is no longer needed/supported."
       end
-      def to_gsl
-        if @gsl.nil?
-          if cases.nil?
-            update_valid_data
-          end
-          @gsl=GSL::Matrix.alloc(cases,fields.size)
-          self.each_array{|c|
-            @gsl.set_row(@i,c)
-          }
-        end
-        @gsl
-      end
-    end
-    # Return a correlation matrix for fields included as parameters.
-    # By default, uses all fields of dataset
-  def correlation_matrix(fields = nil)
-    if fields
-      ds = clone(fields)
-    else
-      ds = self
-    end
-    Statsample::Bivariate.correlation_matrix(ds)
-  end
-  # Return a correlation matrix for fields included as parameters.
-  # By default, uses all fields of dataset
-  def covariance_matrix(fields = nil)
-    if fields
-      ds = clone(fields)
-    else
-      ds = self
-    end
-    Statsample::Bivariate.covariance_matrix(ds)
-  end
-    # Create a new dataset with all cases which the block returns true
-    def filter
-      ds=self.dup_empty
-      each {|c|
-        ds.add_case(c, false) if yield c
-      }
-      ds.update_valid_data
-      ds.name=_("%s(filtered)") % @name
-      ds
-    end
-    # creates a new vector with the data of a given field which the block returns true
-    def filter_field(field)
-      a=[]
-      each do |c|
-        a.push(c[field]) if yield c
-      end
-      a.to_vector(@vectors[field].type)
-    end
-    # Creates a Stastample::Multiset, using one or more fields
-    # to split the dataset.
-    def to_multiset_by_split(*fields)
-			require 'statsample/multiset'
-			if fields.size==1
-				to_multiset_by_split_one_field(fields[0])
-			else
-				to_multiset_by_split_multiple_fields(*fields)
-			end
-    end
-    # Creates a Statsample::Multiset, using one field
-    def to_multiset_by_split_one_field(field)
-      raise ArgumentError,"Should use a correct field name" if !@fields.include? field
-      factors=@vectors[field].factors
-      ms=Multiset.new_empty_vectors(@fields, factors)
-      each {|c|
-        ms[c[field]].add_case(c,false)
-      }
-      #puts "Ingreso a los dataset"
-      ms.datasets.each {|k,ds|
-        ds.update_valid_data
-        ds.name=@vectors[field].labeling(k)
-        ds.vectors.each{|k1,v1|
-          #        puts "Vector #{k1}:"+v1.to_s
-          v1.type=@vectors[k1].type
-          v1.name=@vectors[k1].name
-          v1.labels=@vectors[k1].labels
-        }
-      }
-      ms
-    end
-    def to_multiset_by_split_multiple_fields(*fields)
-      factors_total=nil
-      fields.each do |f|
-        if factors_total.nil?
-          factors_total=@vectors[f].factors.collect{|c|
-            [c]
-          }
-        else
-          suma=[]
-          factors=@vectors[f].factors
-          factors_total.each{|f1| factors.each{|f2| suma.push(f1+[f2]) } }
-          factors_total=suma
-        end
-      end
-      ms=Multiset.new_empty_vectors(@fields,factors_total)
-      p1=eval "Proc.new {|c| ms[["+fields.collect{|f| "c['#{f}']"}.join(",")+"]].add_case(c,false) }"
-      each{|c| p1.call(c)}
-      ms.datasets.each do |k,ds|
-        ds.update_valid_data
-        ds.name=fields.size.times.map {|i|
-          f=fields[i]
-          sk=k[i]
-          @vectors[f].labeling(sk)
-        }.join("-")
-        ds.vectors.each{|k1,v1|
-          v1.type=@vectors[k1].type
-          v1.name=@vectors[k1].name
-          v1.labels=@vectors[k1].labels
-        }
-      end
-      ms
-    end
-    # Returns a vector, based on a string with a calculation based
-    # on vector
-    # The calculation will be eval'ed, so you can put any variable
-    # or expression valid on ruby
-    # For example:
-    #   a=[1,2].to_vector(scale)
-    #   b=[3,4].to_vector(scale)
-    #   ds={'a'=>a,'b'=>b}.to_dataset
-    #   ds.compute("a+b")
-    #   => Vector [4,6]
-    def compute(text)
-      @fields.each{|f|
-        if @vectors[f].type=:numeric
-          text.gsub!(f,"row['#{f}'].to_f")
-        else
-          text.gsub!(f,"row['#{f}']")
-        end
-      }
-      collect_with_index {|row, i|
-        invalid=false
-        @fields.each{|f|
-          if @vectors[f].data_with_nils[i].nil?
-            invalid=true
-          end
-        }
-        if invalid
-          nil
-        else
-          eval(text)
-        end
-      }
-    end
-    # Test each row with one or more tests
-    # each test is a Proc with the form
-    #   Proc.new {|row| row['age']>0}
-    # The function returns an array with all errors
-    def verify(*tests)
-      if(tests[0].is_a? String)
-        id=tests[0]
-        tests.shift
-      else
-        id=@fields[0]
-      end
-      vr=[]
-      i=0
-      each do |row|
-        i+=1
-        tests.each{|test|
-          if ! test[2].call(row)
-            values=""
-            if test[1].size>0
-              values=" ("+test[1].collect{|k| "#{k}=#{row[k]}"}.join(", ")+")"
-            end
-            vr.push("#{i} [#{row[id]}]: #{test[0]}#{values}")
-          end
-        }
-      end
-      vr
-    end
-    def to_s
-      "#<"+self.class.to_s+":"+self.object_id.to_s+" @name=#{@name} @fields=["+@fields.join(",")+"] cases="+@vectors[@fields[0]].size.to_s
-    end
-    def inspect
-      self.to_s
-    end
-    # Creates a new dataset for one to many relations
-    # on a dataset, based on pattern of field names.
-    #
-    # for example, you have a survey for number of children
-    # with this structure:
-    #   id, name, child_name_1, child_age_1, child_name_2, child_age_2
-    # with
-    #   ds.one_to_many(%w{id}, "child_%v_%n"
-    # the field of first parameters will be copied verbatim
-    # to new dataset, and fields which responds to second
-    # pattern will be added one case for each different %n.
-    # For example
-    #   cases=[
-    #     ['1','george','red',10,'blue',20,nil,nil],
-    #     ['2','fred','green',15,'orange',30,'white',20],
-    #     ['3','alfred',nil,nil,nil,nil,nil,nil]
-    #   ]
-    #   ds=Statsample::Dataset.new(%w{id name car_color1 car_value1 car_color2 car_value2 car_color3 car_value3})
-    #   cases.each {|c| ds.add_case_array c }
-    #   ds.one_to_many(['id'],'car_%v%n').to_matrix
-    #   => Matrix[
-    #      ["red", "1", 10],
-    #      ["blue", "1", 20],
-    #      ["green", "2", 15],
-    #      ["orange", "2", 30],
-    #      ["white", "2", 20]
-    #      ]
-    #
-    def one_to_many(parent_fields, pattern)
-      #base_pattern=pattern.gsub(/%v|%n/,"")
-      re=Regexp.new pattern.gsub("%v","(.+?)").gsub("%n","(\\d+?)")
-      ds_vars=parent_fields
-      vars=[]
-      max_n=0
-      h=parent_fields.inject({}) {|a,v| a[v]=Statsample::Vector.new([], @vectors[v].type);a }
-      # Adding _row_id
-      h['_col_id']=[].to_numeric
-      ds_vars.push("_col_id")
-      @fields.each do |f|
-        if f=~re
-          if !vars.include? $1
-            vars.push($1)
-            h[$1]=Statsample::Vector.new([], @vectors[f].type)
-          end
-          max_n=$2.to_i if max_n < $2.to_i
-        end
-      end
-      ds=Dataset.new(h,ds_vars+vars)
-      each do |row|
-        row_out={}
-        parent_fields.each do |f|
-          row_out[f]=row[f]
-        end
-        max_n.times do |n1|
-          n=n1+1
-          any_data=false
-          vars.each do |v|
-            data=row[pattern.gsub("%v",v.to_s).gsub("%n",n.to_s)]
-            row_out[v]=data
-            any_data=true if !data.nil?
-          end
-          if any_data
-            row_out["_col_id"]=n
-            ds.add_case(row_out,false)
-          end
-        end
-      end
-      ds.update_valid_data
-      ds
-    end
-    def report_building(b)
-      b.section(:name=>@name) do |g|
-        g.text _"Cases: %d"  % cases
-        @fields.each do |f|
-          g.text "Element:[#{f}]"
-          g.parse_element(@vectors[f])
-        end
-      end
-    end
+    end
   end
 end