RubyGems - ai4ruby - Versions diffs - 1.11 - Mend

ai4ruby 1.11

Files changed (79) hide show

data/README.rdoc +47 -0
data/examples/classifiers/id3_data.csv +121 -0
data/examples/classifiers/id3_example.rb +29 -0
data/examples/classifiers/naive_bayes_data.csv +11 -0
data/examples/classifiers/naive_bayes_example.rb +16 -0
data/examples/classifiers/results.txt +31 -0
data/examples/genetic_algorithm/genetic_algorithm_example.rb +37 -0
data/examples/genetic_algorithm/travel_cost.csv +16 -0
data/examples/neural_network/backpropagation_example.rb +67 -0
data/examples/neural_network/patterns_with_base_noise.rb +68 -0
data/examples/neural_network/patterns_with_noise.rb +66 -0
data/examples/neural_network/training_patterns.rb +68 -0
data/examples/neural_network/xor_example.rb +35 -0
data/examples/som/som_data.rb +156 -0
data/examples/som/som_multi_node_example.rb +22 -0
data/examples/som/som_single_example.rb +24 -0
data/lib/ai4r.rb +33 -0
data/lib/ai4r/classifiers/classifier.rb +62 -0
data/lib/ai4r/classifiers/hyperpipes.rb +118 -0
data/lib/ai4r/classifiers/ib1.rb +121 -0
data/lib/ai4r/classifiers/id3.rb +326 -0
data/lib/ai4r/classifiers/multilayer_perceptron.rb +135 -0
data/lib/ai4r/classifiers/naive_bayes.rb +259 -0
data/lib/ai4r/classifiers/one_r.rb +110 -0
data/lib/ai4r/classifiers/prism.rb +197 -0
data/lib/ai4r/classifiers/zero_r.rb +73 -0
data/lib/ai4r/clusterers/average_linkage.rb +59 -0
data/lib/ai4r/clusterers/bisecting_k_means.rb +93 -0
data/lib/ai4r/clusterers/centroid_linkage.rb +66 -0
data/lib/ai4r/clusterers/clusterer.rb +61 -0
data/lib/ai4r/clusterers/complete_linkage.rb +67 -0
data/lib/ai4r/clusterers/diana.rb +139 -0
data/lib/ai4r/clusterers/k_means.rb +126 -0
data/lib/ai4r/clusterers/median_linkage.rb +61 -0
data/lib/ai4r/clusterers/single_linkage.rb +194 -0
data/lib/ai4r/clusterers/ward_linkage.rb +64 -0
data/lib/ai4r/clusterers/ward_linkage_hierarchical.rb +31 -0
data/lib/ai4r/clusterers/weighted_average_linkage.rb +61 -0
data/lib/ai4r/data/data_set.rb +266 -0
data/lib/ai4r/data/parameterizable.rb +64 -0
data/lib/ai4r/data/proximity.rb +100 -0
data/lib/ai4r/data/statistics.rb +77 -0
data/lib/ai4r/experiment/classifier_evaluator.rb +95 -0
data/lib/ai4r/genetic_algorithm/genetic_algorithm.rb +270 -0
data/lib/ai4r/neural_network/backpropagation.rb +326 -0
data/lib/ai4r/neural_network/hopfield.rb +149 -0
data/lib/ai4r/som/layer.rb +68 -0
data/lib/ai4r/som/node.rb +96 -0
data/lib/ai4r/som/som.rb +155 -0
data/lib/ai4r/som/two_phase_layer.rb +90 -0
data/test/classifiers/hyperpipes_test.rb +84 -0
data/test/classifiers/ib1_test.rb +78 -0
data/test/classifiers/id3_test.rb +208 -0
data/test/classifiers/multilayer_perceptron_test.rb +79 -0
data/test/classifiers/naive_bayes_test.rb +43 -0
data/test/classifiers/one_r_test.rb +62 -0
data/test/classifiers/prism_test.rb +85 -0
data/test/classifiers/zero_r_test.rb +49 -0
data/test/clusterers/average_linkage_test.rb +51 -0
data/test/clusterers/bisecting_k_means_test.rb +66 -0
data/test/clusterers/centroid_linkage_test.rb +53 -0
data/test/clusterers/complete_linkage_test.rb +57 -0
data/test/clusterers/diana_test.rb +69 -0
data/test/clusterers/k_means_test.rb +100 -0
data/test/clusterers/median_linkage_test.rb +53 -0
data/test/clusterers/single_linkage_test.rb +122 -0
data/test/clusterers/ward_linkage_hierarchical_test.rb +61 -0
data/test/clusterers/ward_linkage_test.rb +53 -0
data/test/clusterers/weighted_average_linkage_test.rb +53 -0
data/test/data/data_set_test.rb +96 -0
data/test/data/proximity_test.rb +81 -0
data/test/data/statistics_test.rb +65 -0
data/test/experiment/classifier_evaluator_test.rb +76 -0
data/test/genetic_algorithm/chromosome_test.rb +58 -0
data/test/genetic_algorithm/genetic_algorithm_test.rb +81 -0
data/test/neural_network/backpropagation_test.rb +82 -0
data/test/neural_network/hopfield_test.rb +72 -0
data/test/som/som_test.rb +97 -0
metadata +168 -0

data/lib/ai4r/clusterers/single_linkage.rb ADDED

@@ -0,0 +1,194 @@
+# Author::    Sergio Fierens (implementation)
+# License::   MPL 1.1
+# Project::   ai4r
+# Url::       http://ai4r.rubyforge.org/
+#
+# You can redistribute it and/or modify it under the terms of
+# the Mozilla Public License version 1.1  as published by the
+# Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
+require File.dirname(__FILE__) + '/../data/data_set'
+require File.dirname(__FILE__) + '/../data/proximity'
+require File.dirname(__FILE__) + '/../clusterers/clusterer'
+module Ai4r
+  module Clusterers
+    # Implementation of a Hierarchical clusterer with single linkage (Everitt et
+    # al., 2001 ; Johnson, 1967 ; Jain and Dubes, 1988 ; Sneath, 1957 )
+    # Hierarchical clusteres create one cluster per element, and then
+    # progressively merge clusters, until the required number of clusters
+    # is reached.
+    # With single linkage, the distance between two clusters is computed as the
+    # distance between the two closest elements in the two clusters.
+    #
+    #   D(cx, (ci U cj) = min(D(cx, ci), D(cx, cj))
+    class SingleLinkage < Clusterer
+      attr_reader :data_set, :number_of_clusters, :clusters
+      parameters_info :distance_function =>
+          "Custom implementation of distance function. " +
+          "It must be a closure receiving two data items and return the " +
+          "distance bewteen them. By default, this algorithm uses " +
+          "ecuclidean distance of numeric attributes to the power of 2."
+      def initialize
+        @distance_function = lambda do |a,b|
+            Ai4r::Data::Proximity.squared_euclidean_distance(
+              a.select {|att_a| att_a.is_a? Numeric} ,
+              b.select {|att_b| att_b.is_a? Numeric})
+          end
+      end
+      # Build a new clusterer, using data examples found in data_set.
+      # Items will be clustered in "number_of_clusters" different
+      # clusters.
+      def build(data_set, number_of_clusters)
+        @data_set = data_set
+        @number_of_clusters = number_of_clusters
+        @index_clusters = create_initial_index_clusters
+        create_distance_matrix(data_set)
+        while @index_clusters.length > @number_of_clusters
+          ci, cj = get_closest_clusters(@index_clusters)
+          update_distance_matrix(ci, cj)
+          merge_clusters(ci, cj, @index_clusters)
+        end
+        @clusters = build_clusters_from_index_clusters @index_clusters
+        return self
+      end
+      # Classifies the given data item, returning the cluster index it belongs
+      # to (0-based).
+      def eval(data_item)
+        get_min_index(@clusters.collect {|cluster|
+            distance_between_item_and_cluster(data_item, cluster)})
+      end
+      protected
+      # returns [ [0], [1], [2], ... , [n-1] ]
+      # where n is the number of data items in the data set
+      def create_initial_index_clusters
+        index_clusters = []
+        @data_set.data_items.length.times {|i| index_clusters << [i]}
+        return index_clusters
+      end
+      # Create a partial distance matrix:
+      #   [
+      #     [d(1,0)],
+      #     [d(2,0)], [d(2,1)],
+      #     [d(3,0)], [d(3,1)], [d(3,2)],
+      #     ...
+      #     [d(n-1,0)], [d(n-1,1)], [d(n-1,2)], ... , [d(n-1,n-2)]
+      #   ]
+      # where n is the number of data items in the data set
+      def create_distance_matrix(data_set)
+        @distance_matrix = Array.new(data_set.data_items.length-1) {|index| Array.new(index+1)}
+        data_set.data_items.each_with_index do |a, i|
+          i.times do |j|
+            b = data_set.data_items[j]
+            @distance_matrix[i-1][j] = @distance_function.call(a, b)
+          end
+        end
+      end
+      # Returns the distance between element data_item[index_a] and
+      # data_item[index_b] using the distance matrix
+      def read_distance_matrix(index_a, index_b)
+        return 0 if index_a == index_b
+        index_a, index_b = index_b, index_a if index_b > index_a
+        return @distance_matrix[index_a-1][index_b]
+      end
+      # ci and cj are the indexes of the clusters that are going to
+      # be merged. We need to remove distances from/to ci and ci,
+      # and add distances from/to new cluster (ci U cj)
+      def update_distance_matrix(ci, cj)
+        ci, cj = cj, ci if cj > ci
+        distances_to_new_cluster = Array.new
+        (@distance_matrix.length+1).times do |cx|
+          if cx!= ci && cx!=cj
+            distances_to_new_cluster << linkage_distance(cx, ci, cj)
+          end
+        end
+        if cj==0 && ci==1
+          @distance_matrix.delete_at(1)
+          @distance_matrix.delete_at(0)
+        elsif cj==0
+          @distance_matrix.delete_at(ci-1)
+          @distance_matrix.delete_at(0)
+        else
+          @distance_matrix.delete_at(ci-1)
+          @distance_matrix.delete_at(cj-1)
+        end
+        @distance_matrix.each do |d|
+          d.delete_at(ci)
+          d.delete_at(cj)
+        end
+        @distance_matrix << distances_to_new_cluster
+      end
+      # return distance between cluster cx and new cluster (ci U cj),
+      # using single linkage
+      def linkage_distance(cx, ci, cj)
+        [read_distance_matrix(cx, ci),
+          read_distance_matrix(cx, cj)].min
+      end
+      # cluster_a and cluster_b are removed from index_cluster,
+      # and a new cluster with all members of cluster_a and cluster_b
+      # is added.
+      # It modifies index clusters array.
+      def merge_clusters(index_a, index_b, index_clusters)
+        index_a, index_b = index_b, index_a if index_b > index_a
+        new_index_cluster = index_clusters[index_a] +
+          index_clusters[index_b]
+        index_clusters.delete_at index_a
+        index_clusters.delete_at index_b
+        index_clusters << new_index_cluster
+        return index_clusters
+      end
+      # Given an array with clusters of data_items indexes,
+      # it returns an array of data_items clusters
+      def build_clusters_from_index_clusters(index_clusters)
+        @distance_matrix = nil
+        return index_clusters.collect do |index_cluster|
+          Ai4r::Data::DataSet.new(:data_labels => @data_set.data_labels,
+            :data_items => index_cluster.collect {|i| @data_set.data_items[i]})
+        end
+      end
+      # Returns ans array with the indexes of the two closest
+      # clusters => [index_cluster_a, index_cluster_b]
+      def get_closest_clusters(index_clusters)
+        min_distance = 1.0/0
+        closest_clusters = [1, 0]
+        index_clusters.each_index do |index_a|
+          index_a.times do |index_b|
+            cluster_distance = read_distance_matrix(index_a, index_b)
+            if cluster_distance < min_distance
+              closest_clusters = [index_a, index_b]
+              min_distance = cluster_distance
+            end
+          end
+        end
+        return closest_clusters
+      end
+      def distance_between_item_and_cluster(data_item, cluster)
+        min_dist = 1.0/0
+        cluster.data_items.each do |another_item|
+          dist = @distance_function.call(data_item, another_item)
+          min_dist = dist if dist < min_dist
+        end
+        return min_dist
+      end
+    end
+  end
+end

data/lib/ai4r/clusterers/ward_linkage.rb ADDED

@@ -0,0 +1,64 @@
+# Author::    Sergio Fierens (implementation)
+# License::   MPL 1.1
+# Project::   ai4r
+# Url::       http://ai4r.rubyforge.org/
+#
+# You can redistribute it and/or modify it under the terms of
+# the Mozilla Public License version 1.1  as published by the
+# Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
+require File.dirname(__FILE__) + '/../data/data_set'
+require File.dirname(__FILE__) + '/../clusterers/single_linkage'
+module Ai4r
+  module Clusterers
+    # Implementation of an Agglomerative Hierarchical clusterer with
+    # Ward's method linkage algorithm, aka the minimum variance method (Everitt
+    # et al., 2001 ; Jain and Dubes, 1988 ; Ward, 1963 ).
+    # Hierarchical clusteres create one cluster per element, and then
+    # progressively merge clusters, until the required number of clusters
+    # is reached.
+    # The objective of this method is to minime the variance.
+    #
+    #   D(cx, (ci U cj)) =  (ni/(ni+nj+nx))*D(cx, ci) +
+    #                       (nj/(ni+nj+nx))*D(cx, cj) -
+    #                       (nx/(ni+nj)^2)*D(ci, cj)
+    class WardLinkage < SingleLinkage
+    parameters_info :distance_function =>
+          "Custom implementation of distance function. " +
+          "It must be a closure receiving two data items and return the " +
+          "distance bewteen them. By default, this algorithm uses " +
+          "ecuclidean distance of numeric attributes to the power of 2."
+      # Build a new clusterer, using data examples found in data_set.
+      # Items will be clustered in "number_of_clusters" different
+      # clusters.
+      def build(data_set, number_of_clusters)
+        super
+      end
+      # This algorithms does not allow classification of new data items
+      # once it has been built. Rebuild the cluster including you data element.
+      def eval(data_item)
+        Raise "Eval of new data is not supported by this algorithm."
+      end
+      protected
+      # return distance between cluster cx and cluster (ci U cj),
+      # using ward's method linkage
+      def linkage_distance(cx, ci, cj)
+        ni = @index_clusters[ci].length
+        nj = @index_clusters[cj].length
+        nx = @index_clusters[cx].length
+        ( ( ( 1.0* (ni+nx) * read_distance_matrix(cx, ci) ) +
+            ( 1.0* (nj+nx) * read_distance_matrix(cx, cj) ) ) / (ni + nj + nx)  -
+            ( 1.0 * nx * read_distance_matrix(ci, cj) / (ni+nj)**2 ) )
+      end
+    end
+  end
+end

data/lib/ai4r/clusterers/ward_linkage_hierarchical.rb ADDED

@@ -0,0 +1,31 @@
+# Author::    Peter Lubell-Doughtie
+# License::   BSD 3 Clause
+# Project::   ai4r
+# Url::       http://peet.ldee.org
+require File.dirname(__FILE__) + '/../clusterers/ward_linkage'
+module Ai4r
+  module Clusterers
+    # Hierarchical version to store classes as merges occur.
+    class WardLinkageHierarchical < WardLinkage
+      attr_reader :cluster_tree
+      def initialize
+        @cluster_tree = []
+        super
+      end
+      protected
+      def merge_clusters(index_a, index_b, index_clusters)
+        # store current index_clusters
+        @cluster_tree << index_clusters.dup
+        super
+      end
+    end
+  end
+end

data/lib/ai4r/clusterers/weighted_average_linkage.rb ADDED

@@ -0,0 +1,61 @@
+# Author::    Sergio Fierens (implementation)
+# License::   MPL 1.1
+# Project::   ai4r
+# Url::       http://ai4r.rubyforge.org/
+#
+# You can redistribute it and/or modify it under the terms of
+# the Mozilla Public License version 1.1  as published by the
+# Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
+require File.dirname(__FILE__) + '/../data/data_set'
+require File.dirname(__FILE__) + '/../clusterers/single_linkage'
+module Ai4r
+  module Clusterers
+    # Implementation of an Agglomerative Hierarchical clusterer with
+    # weighted average linkage algorithm, aka weighted pair group method
+    # average or WPGMA (Jain and Dubes, 1988 ; McQuitty, 1966 )
+    # Hierarchical clusteres create one cluster per element, and then
+    # progressively merge clusters, until the required number of clusters
+    # is reached.
+    # Similar to AverageLinkage, but the distances between clusters are
+    # weighted based on the number of data items in each of them.
+    #
+    #   D(cx, (ci U cj)) =  ( ni * D(cx, ci) + nj * D(cx, cj)) / (ni + nj)
+    class WeightedAverageLinkage < SingleLinkage
+    parameters_info :distance_function =>
+          "Custom implementation of distance function. " +
+          "It must be a closure receiving two data items and return the " +
+          "distance bewteen them. By default, this algorithm uses " +
+          "ecuclidean distance of numeric attributes to the power of 2."
+      # Build a new clusterer, using data examples found in data_set.
+      # Items will be clustered in "number_of_clusters" different
+      # clusters.
+      def build(data_set, number_of_clusters)
+        super
+      end
+      # This algorithms does not allow classification of new data items
+      # once it has been built. Rebuild the cluster including you data element.
+      def eval(data_item)
+        Raise "Eval of new data is not supported by this algorithm."
+      end
+      protected
+      # return distance between cluster cx and cluster (ci U cj),
+      # using weighted average linkage
+      def linkage_distance(cx, ci, cj)
+        ni = @index_clusters[ci].length
+        nj = @index_clusters[cj].length
+        (1.0 * ni * read_distance_matrix(cx, ci)+
+          nj * read_distance_matrix(cx, cj))/(ni+nj)
+      end
+    end
+  end
+end

data/lib/ai4r/data/data_set.rb ADDED

@@ -0,0 +1,266 @@
+# Author::    Sergio Fierens
+# License::   MPL 1.1
+# Project::   ai4r
+# Url::       http://ai4r.rubyforge.org/
+#
+# You can redistribute it and/or modify it under the terms of
+# the Mozilla Public License version 1.1  as published by the
+# Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
+require 'csv'
+require 'set'
+require File.dirname(__FILE__) + '/statistics'
+module Ai4r
+  module Data
+    # A data set is a collection of N data items. Each data item is
+    # described by a set of attributes, represented as an array.
+    # Optionally, you can assign a label to the attributes, using
+    # the data_labels property.
+    class DataSet
+      @@number_regex = /(((\b[0-9]+)?\.)?\b[0-9]+([eE][-+]?[0-9]+)?\b)/
+      attr_reader :data_labels, :data_items
+      # Create a new DataSet. By default, empty.
+      # Optionaly, you can provide the initial data items and data labels.
+      #
+      # e.g. DataSet.new(:data_items => data_items, :data_labels => labels)
+      #
+      # If you provide data items, but no data labels, the data set will
+      # use the default data label values (see set_data_labels)
+      def initialize(options = {})
+        @data_labels = []
+        @data_items = options[:data_items] || []
+        set_data_labels(options[:data_labels]) if options[:data_labels]
+        set_data_items(options[:data_items]) if options[:data_items]
+      end
+      # Retrieve a new DataSet, with the item(s) selected by the provided
+      # index. You can specify an index range, too.
+      def [](index)
+        selected_items = (index.is_a?(Fixnum)) ?
+                [@data_items[index]] : @data_items[index]
+        return DataSet.new(:data_items => selected_items,
+                           :data_labels =>@data_labels)
+      end
+      # Load data items from csv file
+      def load_csv(filepath)
+        items = []
+        open_csv_file(filepath) do |entry|
+          items << entry
+        end
+        set_data_items(items)
+      end
+      # opens a csv-file and reads it line by line
+      # for each line, a block is called and the row is passed to the block
+      # ruby1.8 and 1.9 safe
+      def open_csv_file(filepath, &block)
+        if CSV.const_defined? :Reader
+          CSV::Reader.parse(File.open(filepath, 'r')) do |row|
+            block.call row
+          end
+        else
+          CSV.parse(File.open(filepath, 'r')) do |row|
+            block.call row
+          end
+        end
+      end
+      # Load data items from csv file. The first row is used as data labels.
+      def load_csv_with_labels(filepath)
+        load_csv(filepath)
+        @data_labels = @data_items.shift
+        return self
+      end
+      # Same as load_csv, but it will try to convert cell contents as numbers.
+      def parse_csv(filepath)
+        items = []
+        open_csv_file(filepath) do |row|
+          items << row.collect{|x| (x.match(@@number_regex)) ? x.to_f : x.data }
+        end
+        set_data_items(items)
+      end
+      # Set data labels.
+      # Data labels must have the following format:
+      #     [ 'city', 'age_range', 'gender', 'marketing_target'  ]
+      #
+      # If you do not provide labels for you data, the following labels will
+      # be created by default:
+      #     [ 'attribute_1', 'attribute_2', 'attribute_3', 'class_value'  ]
+      def set_data_labels(labels)
+        check_data_labels(labels)
+        @data_labels = labels
+        return self
+      end
+      # Set the data items.
+      # M data items with  N attributes must have the following
+      # format:
+      #
+      #     [   [ATT1_VAL1, ATT2_VAL1, ATT3_VAL1, ... , ATTN_VAL1,  CLASS_VAL1],
+      #         [ATT1_VAL2, ATT2_VAL2, ATT3_VAL2, ... , ATTN_VAL2,  CLASS_VAL2],
+      #         ...
+      #         [ATTM1_VALM, ATT2_VALM, ATT3_VALM, ... , ATTN_VALM, CLASS_VALM],
+      #     ]
+      #
+      # e.g.
+      #     [   ['New York',  '<30',      'M', 'Y'],
+      #          ['Chicago',     '<30',      'M', 'Y'],
+      #          ['Chicago',     '<30',      'F', 'Y'],
+      #          ['New York',  '<30',      'M', 'Y'],
+      #          ['New York',  '<30',      'M', 'Y'],
+      #          ['Chicago',     '[30-50)',  'M', 'Y'],
+      #          ['New York',  '[30-50)',  'F', 'N'],
+      #          ['Chicago',     '[30-50)',  'F', 'Y'],
+      #          ['New York',  '[30-50)',  'F', 'N'],
+      #          ['Chicago',     '[50-80]', 'M', 'N'],
+      #          ['New York',  '[50-80]', 'F', 'N'],
+      #          ['New York',  '[50-80]', 'M', 'N'],
+      #          ['Chicago',     '[50-80]', 'M', 'N'],
+      #          ['New York',  '[50-80]', 'F', 'N'],
+      #          ['Chicago',     '>80',      'F', 'Y']
+      #        ]
+      #
+      # This method returns the classifier (self), allowing method chaining.
+      def set_data_items(items)
+        check_data_items(items)
+        @data_labels = default_data_labels(items) if @data_labels.empty?
+        @data_items = items
+        return self
+      end
+      # Returns an array with the domain of each attribute:
+      # * Set instance containing all possible values for nominal attributes
+      # * Array with min and max values for numeric attributes (i.e. [min, max])
+      #
+      # Return example:
+      # => [#<Set: {"New York", "Chicago"}>,
+      #     #<Set: {"<30", "[30-50)", "[50-80]", ">80"}>,
+      #     #<Set: {"M", "F"}>,
+      #     [5, 85],
+      #     #<Set: {"Y", "N"}>]
+      def build_domains
+        @data_labels.collect {|attr_label| build_domain(attr_label) }
+      end
+      # Returns a Set instance containing all possible values for an attribute
+      # The parameter can be an attribute label or index (0 based).
+      # * Set instance containing all possible values for nominal attributes
+      # * Array with min and max values for numeric attributes (i.e. [min, max])
+      #
+      #   build_domain("city")
+      #   => #<Set: {"New York", "Chicago"}>
+      #
+      #   build_domain("age")
+      #   => [5, 85]
+      #
+      #   build_domain(2) # In this example, the third attribute is gender
+      #   => #<Set: {"M", "F"}>
+      def build_domain(attr)
+        index = get_index(attr)
+        if @data_items.first[index].is_a?(Numeric)
+          return [Statistics.min(self, index), Statistics.max(self, index)]
+        else
+          return @data_items.inject(Set.new){|domain, x| domain << x[index]}
+        end
+      end
+      # Returns attributes number, including class attribute
+      def num_attributes
+        return (@data_items.empty?) ? 0 : @data_items.first.size
+      end
+      # Returns the index of a given attribute (0-based).
+      # For example, if "gender" is the third attribute, then:
+      #   get_index("gender")
+      #   => 2
+      def get_index(attr)
+        return (attr.is_a?(Fixnum) || attr.is_a?(Range)) ? attr : @data_labels.index(attr)
+      end
+      # Raise an exception if there is no data item.
+      def check_not_empty
+        if @data_items.empty?
+          raise ArgumentError, "Examples data set must not be empty."
+        end
+      end
+      # Add a data item to the data set
+      def << data_item
+        if data_item.nil? || !data_item.is_a?(Enumerable) || data_item.empty?
+          raise ArgumentError, "Data must not be an non empty array."
+        elsif @data_items.empty?
+          set_data_items([data_item])
+        elsif data_item.length != num_attributes
+          raise ArgumentError, "Number of attributes do not match. " +
+                  "#{data_item.length} attributes provided, " +
+                  "#{num_attributes} attributes expected."
+        else
+          @data_items << data_item
+        end
+      end
+      # Returns an array with the mean value of numeric attributes, and
+      # the most frequent value of non numeric attributes
+      def get_mean_or_mode
+        mean = []
+        num_attributes.times do |i|
+          mean[i] =
+                  if @data_items.first[i].is_a?(Numeric)
+                    Statistics.mean(self, i)
+                  else
+                    Statistics.mode(self, i)
+                  end
+        end
+        return mean
+      end
+      protected
+      def check_data_items(data_items)
+        if !data_items || data_items.empty?
+          raise ArgumentError, "Examples data set must not be empty."
+        elsif !data_items.first.is_a?(Enumerable)
+          raise ArgumentError, "Unkown format for example data."
+        end
+        attributes_num = data_items.first.length
+        data_items.each_index do |index|
+          if data_items[index].length != attributes_num
+            raise ArgumentError,
+                  "Quantity of attributes is inconsistent. " +
+                          "The first item has #{attributes_num} attributes "+
+                          "and row #{index} has #{data_items[index].length} attributes"
+          end
+        end
+      end
+      def check_data_labels(labels)
+        if !@data_items.empty?
+          if labels.length != @data_items.first.length
+            raise ArgumentError,
+                  "Number of labels and attributes do not match. " +
+                          "#{labels.length} labels and " +
+                          "#{@data_items.first.length} attributes found."
+          end
+        end
+      end
+      def default_data_labels(data_items)
+        data_labels = []
+        data_items[0][0..-2].each_index do |i|
+          data_labels[i] = "attribute_#{i+1}"
+        end
+        data_labels[data_labels.length]="class_value"
+        return data_labels
+      end
+    end
+  end
+end