RubyGems - ai4r - Versions diffs - 1.12 → 2.0 - Mend

ai4r 1.12 → 2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (128) hide show

checksums.yaml +7 -0
data/README.md +174 -0
data/examples/classifiers/hyperpipes_data.csv +14 -0
data/examples/classifiers/hyperpipes_example.rb +22 -0
data/examples/classifiers/ib1_example.rb +12 -0
data/examples/classifiers/id3_example.rb +15 -10
data/examples/classifiers/id3_graphviz_example.rb +17 -0
data/examples/classifiers/logistic_regression_example.rb +11 -0
data/examples/classifiers/naive_bayes_attributes_example.rb +13 -0
data/examples/classifiers/naive_bayes_example.rb +12 -13
data/examples/classifiers/one_r_example.rb +27 -0
data/examples/classifiers/parameter_tutorial.rb +29 -0
data/examples/classifiers/prism_nominal_example.rb +15 -0
data/examples/classifiers/prism_numeric_example.rb +21 -0
data/examples/classifiers/simple_linear_regression_example.csv +159 -0
data/examples/classifiers/simple_linear_regression_example.rb +18 -0
data/examples/classifiers/zero_and_one_r_example.rb +34 -0
data/examples/classifiers/zero_one_r_data.csv +8 -0
data/examples/clusterers/clusterer_example.rb +62 -0
data/examples/clusterers/dbscan_example.rb +17 -0
data/examples/clusterers/dendrogram_example.rb +17 -0
data/examples/clusterers/hierarchical_dendrogram_example.rb +20 -0
data/examples/clusterers/kmeans_custom_example.rb +26 -0
data/examples/genetic_algorithm/bitstring_example.rb +41 -0
data/examples/genetic_algorithm/genetic_algorithm_example.rb +26 -18
data/examples/genetic_algorithm/kmeans_seed_tuning.rb +45 -0
data/examples/neural_network/backpropagation_example.rb +49 -48
data/examples/neural_network/hopfield_example.rb +45 -0
data/examples/neural_network/patterns_with_base_noise.rb +39 -39
data/examples/neural_network/patterns_with_noise.rb +41 -39
data/examples/neural_network/train_epochs_callback.rb +25 -0
data/examples/neural_network/training_patterns.rb +39 -39
data/examples/neural_network/transformer_text_classification.rb +78 -0
data/examples/neural_network/xor_example.rb +23 -22
data/examples/reinforcement/q_learning_example.rb +10 -0
data/examples/som/som_data.rb +155 -152
data/examples/som/som_multi_node_example.rb +12 -13
data/examples/som/som_single_example.rb +12 -15
data/examples/transformer/decode_classifier_example.rb +68 -0
data/examples/transformer/deterministic_example.rb +10 -0
data/examples/transformer/seq2seq_example.rb +16 -0
data/lib/ai4r/classifiers/classifier.rb +24 -16
data/lib/ai4r/classifiers/gradient_boosting.rb +64 -0
data/lib/ai4r/classifiers/hyperpipes.rb +119 -43
data/lib/ai4r/classifiers/ib1.rb +122 -32
data/lib/ai4r/classifiers/id3.rb +527 -144
data/lib/ai4r/classifiers/logistic_regression.rb +96 -0
data/lib/ai4r/classifiers/multilayer_perceptron.rb +75 -59
data/lib/ai4r/classifiers/naive_bayes.rb +112 -48
data/lib/ai4r/classifiers/one_r.rb +112 -44
data/lib/ai4r/classifiers/prism.rb +167 -76
data/lib/ai4r/classifiers/random_forest.rb +72 -0
data/lib/ai4r/classifiers/simple_linear_regression.rb +143 -0
data/lib/ai4r/classifiers/support_vector_machine.rb +91 -0
data/lib/ai4r/classifiers/votes.rb +57 -0
data/lib/ai4r/classifiers/zero_r.rb +71 -30
data/lib/ai4r/clusterers/average_linkage.rb +46 -27
data/lib/ai4r/clusterers/bisecting_k_means.rb +50 -44
data/lib/ai4r/clusterers/centroid_linkage.rb +52 -36
data/lib/ai4r/clusterers/cluster_tree.rb +50 -0
data/lib/ai4r/clusterers/clusterer.rb +28 -24
data/lib/ai4r/clusterers/complete_linkage.rb +42 -31
data/lib/ai4r/clusterers/dbscan.rb +134 -0
data/lib/ai4r/clusterers/diana.rb +75 -49
data/lib/ai4r/clusterers/k_means.rb +309 -72
data/lib/ai4r/clusterers/median_linkage.rb +49 -33
data/lib/ai4r/clusterers/single_linkage.rb +196 -88
data/lib/ai4r/clusterers/ward_linkage.rb +51 -35
data/lib/ai4r/clusterers/ward_linkage_hierarchical.rb +63 -0
data/lib/ai4r/clusterers/weighted_average_linkage.rb +48 -32
data/lib/ai4r/data/data_set.rb +229 -100
data/lib/ai4r/data/parameterizable.rb +31 -25
data/lib/ai4r/data/proximity.rb +72 -50
data/lib/ai4r/data/statistics.rb +46 -35
data/lib/ai4r/experiment/classifier_evaluator.rb +84 -32
data/lib/ai4r/experiment/split.rb +39 -0
data/lib/ai4r/genetic_algorithm/chromosome_base.rb +43 -0
data/lib/ai4r/genetic_algorithm/genetic_algorithm.rb +92 -170
data/lib/ai4r/genetic_algorithm/tsp_chromosome.rb +83 -0
data/lib/ai4r/hmm/hidden_markov_model.rb +134 -0
data/lib/ai4r/neural_network/activation_functions.rb +37 -0
data/lib/ai4r/neural_network/backpropagation.rb +419 -143
data/lib/ai4r/neural_network/hopfield.rb +175 -58
data/lib/ai4r/neural_network/transformer.rb +194 -0
data/lib/ai4r/neural_network/weight_initializations.rb +40 -0
data/lib/ai4r/reinforcement/policy_iteration.rb +66 -0
data/lib/ai4r/reinforcement/q_learning.rb +51 -0
data/lib/ai4r/search/a_star.rb +76 -0
data/lib/ai4r/search/bfs.rb +50 -0
data/lib/ai4r/search/dfs.rb +50 -0
data/lib/ai4r/search/mcts.rb +118 -0
data/lib/ai4r/search.rb +12 -0
data/lib/ai4r/som/distance_metrics.rb +29 -0
data/lib/ai4r/som/layer.rb +28 -17
data/lib/ai4r/som/node.rb +61 -32
data/lib/ai4r/som/som.rb +158 -41
data/lib/ai4r/som/two_phase_layer.rb +21 -25
data/lib/ai4r/version.rb +3 -0
data/lib/ai4r.rb +58 -27
metadata +117 -106
data/README.rdoc +0 -44
data/test/classifiers/hyperpipes_test.rb +0 -84
data/test/classifiers/ib1_test.rb +0 -78
data/test/classifiers/id3_test.rb +0 -208
data/test/classifiers/multilayer_perceptron_test.rb +0 -79
data/test/classifiers/naive_bayes_test.rb +0 -43
data/test/classifiers/one_r_test.rb +0 -62
data/test/classifiers/prism_test.rb +0 -85
data/test/classifiers/zero_r_test.rb +0 -50
data/test/clusterers/average_linkage_test.rb +0 -51
data/test/clusterers/bisecting_k_means_test.rb +0 -66
data/test/clusterers/centroid_linkage_test.rb +0 -53
data/test/clusterers/complete_linkage_test.rb +0 -57
data/test/clusterers/diana_test.rb +0 -69
data/test/clusterers/k_means_test.rb +0 -100
data/test/clusterers/median_linkage_test.rb +0 -53
data/test/clusterers/single_linkage_test.rb +0 -122
data/test/clusterers/ward_linkage_test.rb +0 -53
data/test/clusterers/weighted_average_linkage_test.rb +0 -53
data/test/data/data_set_test.rb +0 -96
data/test/data/proximity_test.rb +0 -81
data/test/data/statistics_test.rb +0 -65
data/test/experiment/classifier_evaluator_test.rb +0 -76
data/test/genetic_algorithm/chromosome_test.rb +0 -57
data/test/genetic_algorithm/genetic_algorithm_test.rb +0 -81
data/test/neural_network/backpropagation_test.rb +0 -82
data/test/neural_network/hopfield_test.rb +0 -72
data/test/som/som_test.rb +0 -97

data/lib/ai4r/clusterers/ward_linkage_hierarchical.rb ADDED Viewed

@@ -0,0 +1,63 @@
+# frozen_string_literal: true
+# Author::    Peter Lubell-Doughtie
+# License::   BSD 3 Clause
+# Project::   ai4r
+# Url::       http://peet.ldee.org
+require_relative '../clusterers/ward_linkage'
+require_relative '../clusterers/cluster_tree'
+module Ai4r
+  module Clusterers
+    # Hierarchical version to store classes as merges occur.
+    class WardLinkageHierarchical < WardLinkage
+      include ClusterTree
+      # @param depth [Object]
+      # @return [Object]
+      def initialize(depth = nil)
+        @cluster_tree = []
+        @depth = depth
+        @merges_so_far = 0
+        super(depth)
+      end
+      # @param data_set [Object]
+      # @param number_of_clusters [Object]
+      # @param options [Hash] extra keyword options, passed through to super
+      # @return [Object]
+      def build(data_set, number_of_clusters = 1, **options)
+        data_len = data_set.data_items.length
+        @total_merges = data_len - number_of_clusters
+        super
+        @cluster_tree << clusters
+        @cluster_tree.reverse!
+        self
+      end
+      # @return [Object]
+      def supports_eval?
+        false
+      end
+      protected
+      # @param index_a [Object]
+      # @param index_b [Object]
+      # @param index_clusters [Object]
+      # @return [Object]
+      def merge_clusters(index_a, index_b, index_clusters)
+        # only store if no or above depth
+        if @depth.nil? || (@merges_so_far > @total_merges - @depth)
+          # store current clusters
+          stored_distance_matrix = @distance_matrix.dup
+          @cluster_tree << build_clusters_from_index_clusters(index_clusters)
+          @distance_matrix = stored_distance_matrix
+        end
+        @merges_so_far += 1
+        super
+      end
+    end
+  end
+end

data/lib/ai4r/clusterers/weighted_average_linkage.rb CHANGED Viewed

@@ -1,61 +1,77 @@
+# frozen_string_literal: true
 # Author::    Sergio Fierens (implementation)
 # License::   MPL 1.1
 # Project::   ai4r
-# Url::       http://www.ai4r.org/
+# Url::       https://github.com/SergioFierens/ai4r
 #
-# You can redistribute it and/or modify it under the terms of
-# the Mozilla Public License version 1.1  as published by the
+# You can redistribute it and/or modify it under the terms of
+# the Mozilla Public License version 1.1  as published by the
 # Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
-require File.dirname(__FILE__) + '/../data/data_set'
-require File.dirname(__FILE__) + '/../clusterers/single_linkage'
+require_relative '../data/data_set'
+require_relative '../clusterers/single_linkage'
+require_relative '../clusterers/cluster_tree'
 module Ai4r
   module Clusterers
-    # Implementation of an Agglomerative Hierarchical clusterer with
-    # weighted average linkage algorithm, aka weighted pair group method
+    # Implementation of an Agglomerative Hierarchical clusterer with
+    # weighted average linkage algorithm, aka weighted pair group method
     # average or WPGMA (Jain and Dubes, 1988 ; McQuitty, 1966 )
-    # Hierarchical clusteres create one cluster per element, and then
+    # Hierarchical clusterers create one cluster per element, and then
     # progressively merge clusters, until the required number of clusters
     # is reached.
-    # Similar to AverageLinkage, but the distances between clusters are
+    # Similar to AverageLinkage, but the distances between clusters are
     # weighted based on the number of data items in each of them.
-    #
+    #
     #   D(cx, (ci U cj)) =  ( ni * D(cx, ci) + nj * D(cx, cj)) / (ni + nj)
     class WeightedAverageLinkage < SingleLinkage
-    parameters_info :distance_function =>
-          "Custom implementation of distance function. " +
-          "It must be a closure receiving two data items and return the " +
-          "distance bewteen them. By default, this algorithm uses " +
-          "ecuclidean distance of numeric attributes to the power of 2."
+      include ClusterTree
+      parameters_info distance_function:
+            'Custom implementation of distance function. ' \
+            'It must be a closure receiving two data items and return the ' \
+            'distance between them. By default, this algorithm uses ' \
+            'euclidean distance of numeric attributes to the power of 2.'
       # Build a new clusterer, using data examples found in data_set.
       # Items will be clustered in "number_of_clusters" different
       # clusters.
-      def build(data_set, number_of_clusters)
+      # @param data_set [Object]
+      # @param number_of_clusters [Object]
+      # @param options [Hash] extra keyword options, passed through to super
+      # @return [Object]
+      def build(data_set, number_of_clusters = 1, **options)
         super
       end
-      # This algorithms does not allow classification of new data items
+      # This algorithm does not allow classification of new data items
       # once it has been built. Rebuild the cluster including you data element.
-      def eval(data_item)
-        Raise "Eval of new data is not supported by this algorithm."
+      # @param _data_item [Object]
+      # @return [Object]
+      def eval(_data_item)
+        raise NotImplementedError, 'Eval of new data is not supported by this algorithm.'
       end
+      # @return [Object]
+      def supports_eval?
+        false
+      end
       protected
       # return distance between cluster cx and cluster (ci U cj),
       # using weighted average linkage
-      def linkage_distance(cx, ci, cj)
-        ni = @index_clusters[ci].length
-        nj = @index_clusters[cj].length
-        (1.0 * ni * read_distance_matrix(cx, ci)+
-          nj * read_distance_matrix(cx, cj))/(ni+nj)
+      # @param cluster_x [Object]
+      # @param cluster_i [Object]
+      # @param cluster_j [Object]
+      # @return [Object]
+      def linkage_distance(cluster_x, cluster_i, cluster_j)
+        ni = @index_clusters[cluster_i].length
+        nj = @index_clusters[cluster_j].length
+        ((1.0 * ni * read_distance_matrix(cluster_x, cluster_i)) +
+          (nj * read_distance_matrix(cluster_x, cluster_j))) / (ni + nj)
       end
     end
   end
 end

data/lib/ai4r/data/data_set.rb CHANGED Viewed

@@ -1,36 +1,51 @@
+# frozen_string_literal: true
 # Author::    Sergio Fierens
 # License::   MPL 1.1
 # Project::   ai4r
-# Url::       http://ai4r.org/
+# Url::       https://github.com/SergioFierens/ai4r
 #
-# You can redistribute it and/or modify it under the terms of
-# the Mozilla Public License version 1.1  as published by the
+# You can redistribute it and/or modify it under the terms of
+# the Mozilla Public License version 1.1  as published by the
 # Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
 require 'csv'
 require 'set'
-require File.dirname(__FILE__) + '/statistics'
+require_relative 'statistics'
 module Ai4r
   module Data
-    # A data set is a collection of N data items. Each data item is
+    # A data set is a collection of N data items. Each data item is
     # described by a set of attributes, represented as an array.
-    # Optionally, you can assign a label to the attributes, using
+    # Optionally, you can assign a label to the attributes, using
     # the data_labels property.
     class DataSet
-      @@number_regex = /(((\b[0-9]+)?\.)?\b[0-9]+([eE][-+]?[0-9]+)?\b)/
       attr_reader :data_labels, :data_items
+      # Return a new DataSet with numeric attributes normalized.
+      # Available methods are:
+      # * +:zscore+ - subtract the mean and divide by the standard deviation
+      # * +:minmax+ - scale values to the [0,1] range
+      # @param data_set [Object]
+      # @param method [Object]
+      # @return [Object]
+      def self.normalized(data_set, method: :zscore)
+        new_set = DataSet.new(
+          data_items: data_set.data_items.map(&:dup),
+          data_labels: data_set.data_labels.dup
+        )
+        new_set.normalize!(method)
+      end
       # Create a new DataSet. By default, empty.
       # Optionaly, you can provide the initial data items and data labels.
-      #
+      #
       # e.g. DataSet.new(:data_items => data_items, :data_labels => labels)
-      #
+      #
       # If you provide data items, but no data labels, the data set will
       # use the default data label values (see set_data_labels)
+      # @param options [Object]
+      # @return [Object]
       def initialize(options = {})
         @data_labels = []
         @data_items = options[:data_items] || []
@@ -38,78 +53,97 @@ module Ai4r
         set_data_items(options[:data_items]) if options[:data_items]
       end
-      # Retrieve a new DataSet, with the item(s) selected by the provided
+      # Retrieve a new DataSet, with the item(s) selected by the provided
       # index. You can specify an index range, too.
+      # @param index [Object]
+      # @return [Object]
       def [](index)
-        selected_items = (index.is_a?(Fixnum)) ?
-                [@data_items[index]] : @data_items[index]
-        return DataSet.new(:data_items => selected_items,
-                           :data_labels =>@data_labels)
+        selected_items = if index.is_a?(Integer)
+                           [@data_items[index]]
+                         else
+                           @data_items[index]
+                         end
+        DataSet.new(data_items: selected_items,
+                    data_labels: @data_labels)
       end
       # Load data items from csv file
-      def load_csv(filepath)
-        items = []
-        open_csv_file(filepath) do |entry|
-          items << entry
-        end
-        set_data_items(items)
-      end
-      # opens a csv-file and reads it line by line
-      # for each line, a block is called and the row is passed to the block
-      # ruby1.8 and 1.9 safe
-      def open_csv_file(filepath, &block)
-        if CSV.const_defined? :Reader
-          CSV::Reader.parse(File.open(filepath, 'r')) do |row|
-            block.call row
-          end
+      # @param filepath [Object]
+      # @return [Object]
+      def load_csv(filepath, parse_numeric: false)
+        if parse_numeric
+          parse_csv(filepath)
         else
-          CSV.parse(File.open(filepath, 'r')) do |row|
-            block.call row
+          items = []
+          open_csv_file(filepath) do |entry|
+            items << entry
           end
+          set_data_items(items)
         end
       end
+      # Open a CSV file and yield each row to the provided block.
+      # @param filepath [Object]
+      # @param block [Object]
+      # @return [Object]
+      def open_csv_file(filepath, &)
+        CSV.foreach(filepath, &)
+      end
       # Load data items from csv file. The first row is used as data labels.
-      def load_csv_with_labels(filepath)
-        load_csv(filepath)
+      # @param filepath [Object]
+      # @return [Object]
+      def load_csv_with_labels(filepath, parse_numeric: false)
+        load_csv(filepath, parse_numeric: parse_numeric)
         @data_labels = @data_items.shift
-        return self
+        self
       end
       # Same as load_csv, but it will try to convert cell contents as numbers.
+      # @param filepath [Object]
+      # @return [Object]
       def parse_csv(filepath)
         items = []
         open_csv_file(filepath) do |row|
-          items << row.collect{|x| (x.match(@@number_regex)) ? x.to_f : x.data }
+          items << row.collect do |x|
+            number?(x) ? Float(x, exception: false) : x
+          end
         end
         set_data_items(items)
       end
+      # Same as load_csv_with_labels, but it will try to convert cell contents as numbers.
+      # @param filepath [Object]
+      # @return [Object]
+      def parse_csv_with_labels(filepath)
+        load_csv_with_labels(filepath, parse_numeric: true)
+      end
       # Set data labels.
       # Data labels must have the following format:
       #     [ 'city', 'age_range', 'gender', 'marketing_target'  ]
       #
       # If you do not provide labels for you data, the following labels will
       # be created by default:
-      #     [ 'attribute_1', 'attribute_2', 'attribute_3', 'class_value'  ]
+      #     [ 'attribute_1', 'attribute_2', 'attribute_3', 'class_value'  ]
+      # @param labels [Object]
+      # @return [Object]
       def set_data_labels(labels)
         check_data_labels(labels)
         @data_labels = labels
-        return self
+        self
       end
       # Set the data items.
-      # M data items with  N attributes must have the following
+      # M data items with  N attributes must have the following
       # format:
-      #
-      #     [   [ATT1_VAL1, ATT2_VAL1, ATT3_VAL1, ... , ATTN_VAL1,  CLASS_VAL1],
-      #         [ATT1_VAL2, ATT2_VAL2, ATT3_VAL2, ... , ATTN_VAL2,  CLASS_VAL2],
+      #
+      #     [   [ATT1_VAL1, ATT2_VAL1, ATT3_VAL1, ... , ATTN_VAL1,  CLASS_VAL1],
+      #         [ATT1_VAL2, ATT2_VAL2, ATT3_VAL2, ... , ATTN_VAL2,  CLASS_VAL2],
       #         ...
-      #         [ATTM1_VALM, ATT2_VALM, ATT3_VALM, ... , ATTN_VALM, CLASS_VALM],
+      #         [ATTM1_VALM, ATT2_VALM, ATT3_VALM, ... , ATTN_VALM, CLASS_VALM],
       #     ]
-      #
+      #
       # e.g.
       #     [   ['New York',  '<30',      'M', 'Y'],
       #          ['Chicago',     '<30',      'M', 'Y'],
@@ -127,140 +161,235 @@ module Ai4r
       #          ['New York',  '[50-80]', 'F', 'N'],
       #          ['Chicago',     '>80',      'F', 'Y']
       #        ]
-      #
+      #
       # This method returns the classifier (self), allowing method chaining.
+      # @param items [Object]
+      # @return [Object]
       def set_data_items(items)
         check_data_items(items)
         @data_labels = default_data_labels(items) if @data_labels.empty?
         @data_items = items
-        return self
+        self
       end
       # Returns an array with the domain of each attribute:
       # * Set instance containing all possible values for nominal attributes
       # * Array with min and max values for numeric attributes (i.e. [min, max])
-      #
+      #
       # Return example:
-      # => [#<Set: {"New York", "Chicago"}>,
-      #     #<Set: {"<30", "[30-50)", "[50-80]", ">80"}>,
+      # => [#<Set: {"New York", "Chicago"}>,
+      #     #<Set: {"<30", "[30-50)", "[50-80]", ">80"}>,
       #     #<Set: {"M", "F"}>,
-      #     [5, 85],
+      #     [5, 85],
       #     #<Set: {"Y", "N"}>]
+      # @return [Object]
       def build_domains
-        @data_labels.collect {|attr_label| build_domain(attr_label) }
+        @data_labels.collect { |attr_label| build_domain(attr_label) }
       end
       # Returns a Set instance containing all possible values for an attribute
       # The parameter can be an attribute label or index (0 based).
       # * Set instance containing all possible values for nominal attributes
       # * Array with min and max values for numeric attributes (i.e. [min, max])
-      #
+      #
       #   build_domain("city")
       #   => #<Set: {"New York", "Chicago"}>
-      #
+      #
       #   build_domain("age")
       #   => [5, 85]
-      #
+      #
       #   build_domain(2) # In this example, the third attribute is gender
       #   => #<Set: {"M", "F"}>
+      # @param attr [Object]
+      # @return [Object]
       def build_domain(attr)
         index = get_index(attr)
-        if @data_items.first[index].is_a?(Numeric)
-          return [Statistics.min(self, index), Statistics.max(self, index)]
-        else
-          return @data_items.inject(Set.new){|domain, x| domain << x[index]}
-        end
+        return [Statistics.min(self, index), Statistics.max(self, index)] if @data_items.first[index].is_a?(Numeric)
+        @data_items.inject(Set.new) { |domain, x| domain << x[index] }
       end
       # Returns attributes number, including class attribute
+      # @return [Object]
       def num_attributes
-        return (@data_items.empty?) ? 0 : @data_items.first.size
+        @data_items.empty? ? 0 : @data_items.first.size
       end
       # Returns the index of a given attribute (0-based).
       # For example, if "gender" is the third attribute, then:
-      #   get_index("gender")
+      #   get_index("gender")
       #   => 2
+      # @param attr [Object]
+      # @return [Object]
       def get_index(attr)
-        return (attr.is_a?(Fixnum) || attr.is_a?(Range)) ? attr : @data_labels.index(attr)
+        attr.is_a?(Integer) || attr.is_a?(Range) ? attr : @data_labels.index(attr)
       end
       # Raise an exception if there is no data item.
+      # @return [Object]
       def check_not_empty
-        if @data_items.empty?
-          raise ArgumentError, "Examples data set must not be empty."
-        end
+        return unless @data_items.empty?
+        raise ArgumentError, 'Examples data set must not be empty.'
       end
       # Add a data item to the data set
-      def << data_item
+      # @return [Object]
+      def <<(data_item)
         if data_item.nil? || !data_item.is_a?(Enumerable) || data_item.empty?
-          raise ArgumentError, "Data must not be an non empty array."
+          raise ArgumentError, 'Data must not be an non empty array.'
         elsif @data_items.empty?
           set_data_items([data_item])
         elsif data_item.length != num_attributes
-          raise ArgumentError, "Number of attributes do not match. " +
-                  "#{data_item.length} attributes provided, " +
-                  "#{num_attributes} attributes expected."
+          raise ArgumentError, 'Number of attributes do not match. ' \
+                               "#{data_item.length} attributes provided, " \
+                               "#{num_attributes} attributes expected."
         else
           @data_items << data_item
         end
       end
-      # Returns an array with the mean value of numeric attributes, and
+      # Returns an array with the mean value of numeric attributes, and
       # the most frequent value of non numeric attributes
+      # @return [Object]
       def get_mean_or_mode
         mean = []
         num_attributes.times do |i|
           mean[i] =
-                  if @data_items.first[i].is_a?(Numeric)
-                    Statistics.mean(self, i)
-                  else
-                    Statistics.mode(self, i)
-                  end
+            if @data_items.first[i].is_a?(Numeric)
+              Statistics.mean(self, i)
+            else
+              Statistics.mode(self, i)
+            end
         end
-        return mean
+        mean
+      end
+      # Normalize numeric attributes in place. Supported methods are
+      # +:zscore+ (default) and +:minmax+.
+      # @param method [Object]
+      # @return [Object]
+      def normalize!(method = :zscore)
+        numeric_indices = (0...num_attributes).select do |i|
+          @data_items.first[i].is_a?(Numeric)
+        end
+        case method
+        when :zscore
+          means = numeric_indices.map { |i| Statistics.mean(self, i) }
+          sds = numeric_indices.map { |i| Statistics.standard_deviation(self, i) }
+          @data_items.each do |row|
+            numeric_indices.each_with_index do |idx, j|
+              sd = sds[j]
+              row[idx] = sd.zero? ? 0 : (row[idx] - means[j]) / sd
+            end
+          end
+        when :minmax
+          mins = numeric_indices.map { |i| Statistics.min(self, i) }
+          maxs = numeric_indices.map { |i| Statistics.max(self, i) }
+          @data_items.each do |row|
+            numeric_indices.each_with_index do |idx, j|
+              range = maxs[j] - mins[j]
+              row[idx] = range.zero? ? 0 : (row[idx] - mins[j]) / range.to_f
+            end
+          end
+        else
+          raise ArgumentError, "Unknown normalization method #{method}"
+        end
+        self
+      end
+      # Randomizes the order of data items in place.
+      # If a +seed+ is provided, it is used to initialize the random number
+      # generator for deterministic shuffling.
+      #
+      #   data_set.shuffle!(seed: 123)
+      #
+      # @param seed [Integer, nil] Seed for the RNG
+      # @return [DataSet] self
+      def shuffle!(seed: nil)
+        rng = seed ? Random.new(seed) : Random.new
+        @data_items.shuffle!(random: rng)
+        self
+      end
+      # Split the dataset into two new DataSet instances using the given ratio
+      # for the first set.
+      #
+      #   train, test = data_set.split(ratio: 0.8)
+      #
+      # @param ratio [Float] fraction of items to place in the first set
+      # @return [Array<DataSet, DataSet>] the two resulting datasets
+      def split(ratio:)
+        raise ArgumentError, 'ratio must be between 0 and 1' unless ratio.positive? && ratio < 1
+        pivot = (ratio * @data_items.length).round
+        first_items = @data_items[0...pivot].map(&:dup)
+        second_items = @data_items[pivot..].map(&:dup)
+        [
+          DataSet.new(data_items: first_items, data_labels: @data_labels.dup),
+          DataSet.new(data_items: second_items, data_labels: @data_labels.dup)
+        ]
+      end
+      # Returns label of category
+      # @return [Object]
+      def category_label
+        data_labels.last
       end
       protected
+      # @param x [Object]
+      # @return [Object]
+      def number?(x)
+        !Float(x, exception: false).nil?
+      end
+      # @param data_items [Object]
+      # @return [Object]
       def check_data_items(data_items)
         if !data_items || data_items.empty?
-          raise ArgumentError, "Examples data set must not be empty."
+          raise ArgumentError, 'Examples data set must not be empty.'
         elsif !data_items.first.is_a?(Enumerable)
-          raise ArgumentError, "Unkown format for example data."
+          raise ArgumentError, 'Unkown format for example data.'
         end
         attributes_num = data_items.first.length
         data_items.each_index do |index|
-          if data_items[index].length != attributes_num
-            raise ArgumentError,
-                  "Quantity of attributes is inconsistent. " +
-                          "The first item has #{attributes_num} attributes "+
-                          "and row #{index} has #{data_items[index].length} attributes"
-          end
+          next unless data_items[index].length != attributes_num
+          raise ArgumentError,
+                'Quantity of attributes is inconsistent. ' \
+                "The first item has #{attributes_num} attributes " \
+                "and row #{index} has #{data_items[index].length} attributes"
         end
       end
+      # @param labels [Object]
+      # @return [Object]
       def check_data_labels(labels)
-        if !@data_items.empty?
-          if labels.length != @data_items.first.length
-            raise ArgumentError,
-                  "Number of labels and attributes do not match. " +
-                          "#{labels.length} labels and " +
-                          "#{@data_items.first.length} attributes found."
-          end
-        end
+        return if @data_items.empty?
+        return unless labels.length != @data_items.first.length
+        raise ArgumentError,
+              'Number of labels and attributes do not match. ' \
+              "#{labels.length} labels and " \
+              "#{@data_items.first.length} attributes found."
       end
+      # @param data_items [Object]
+      # @return [Object]
       def default_data_labels(data_items)
         data_labels = []
         data_items[0][0..-2].each_index do |i|
-          data_labels[i] = "attribute_#{i+1}"
+          data_labels[i] = "attribute_#{i + 1}"
         end
-        data_labels[data_labels.length]="class_value"
-        return data_labels
+        data_labels[data_labels.length] = 'class_value'
+        data_labels
       end
     end
   end
 end