RubyGems - data_modeler - Versions diffs - 0.2.1 → 0.3.0 - Mend

data_modeler 0.2.1 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (11) hide show

checksums.yaml +4 -4
data/lib/data_modeler.rb +7 -2
data/lib/data_modeler/base.rb +127 -0
data/lib/data_modeler/dataset/dataset.rb +27 -8
data/lib/data_modeler/dataset/dataset_gen.rb +22 -8
data/lib/data_modeler/dataset/dataset_helper.rb +13 -1
data/lib/data_modeler/helpers.rb +17 -0
data/lib/data_modeler/model.rb +17 -0
data/lib/data_modeler/model/fann.rb +18 -14
data/lib/data_modeler/version.rb +1 -1
metadata +5 -2

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 023a9053513981d2058cd23f7b0a9154da588b6b
-  data.tar.gz: 5e7a9c83204a2da747aa273e1ced42b4d1fefe36
+  metadata.gz: 762480cf9239c43cfe81e82634a63b52b2ac1d28
+  data.tar.gz: 7554ba11f59112d0dddc39f80ad1c4b897fb02a0
 SHA512:
-  metadata.gz: 70cbccdd7cd7c9a70142c853f2a68e2a9be28412b05c48d6cd0b0440b703d6f878cdb58b1b0902efab321b4064116968aab37288ae2618634e44fc689ce0c433
-  data.tar.gz: 5d6ddf4e31fe4ae1d18973bf6719378037e9899d51627fdfc607813d8fff7de362d30aff0b4cb97343e13078414fde6a4a52e806e21f96fd572a0dfef740552e
+  metadata.gz: 84547a8cf68c84f42a58aec83961b644315095aac87c33f30d2aa0cacfbd2f6b966b95ef060aae253866bdb2e4dd2f37bf29a662eaa5d2b14429d790ab68a03f
+  data.tar.gz: 2501a06535f433c6a58b45ccbad701c37ebb2be6c062ee9a2d627fa2a9f34235aa20dd88c57738af9f0d5f1f03a6c18fd4aa5336428d2bf0260f6682c57e7127

data/lib/data_modeler.rb CHANGED Viewed

@@ -1,11 +1,16 @@
+# Helpers
 require "data_modeler/version"
 require "data_modeler/exceptions"
+require "data_modeler/helpers"
 # Dataset
 require "data_modeler/dataset/dataset_helper"
 require "data_modeler/dataset/dataset"
 require "data_modeler/dataset/dataset_gen"
-# Models (should be added to this module)
-module DataModeler::Model; end
+# Models
+require "data_modeler/model"
 require "data_modeler/model/fann"
+# Modeler
+require "data_modeler/base"

data/lib/data_modeler/base.rb ADDED Viewed

@@ -0,0 +1,127 @@
+# TODO: use fastestcsv if available
+require 'csv'
+require 'fileutils'
+require 'pathname'
+# Base class, core of the DataModeler framework.
+# - Initializes the system based on the config
+# - Runs over the data training and testing models
+# - Results and models are saved to the file system
+class DataModeler::Base
+  attr_reader :config, :inputs, :targets, :train_size, :test_size,
+              :nruns, :data, :out_dir, :tset_gen, :model
+  # @param config [Hash] configuration hash for the whole experiment setup
+  # @note `config` is modified in place: the series names are converted to
+  #     symbols, `:nruns` gets its default and `:save_models` is removed
+  #     from `config[:results]`
+  def initialize config
+    @config = config
+    @inputs = config[:tset][:input_series].map!(&:to_sym)
+    @targets = config[:tset][:target_series].map!(&:to_sym)
+    @train_size = config[:tset][:train_size]
+    @test_size = config[:tset][:test_size]
+    @nruns = config[:tset][:nruns] ||= Float::INFINITY # terminates with data
+    # removed before `prepare_output`, which only accepts `dir:` and `id:`
+    @save_models = config[:results].delete :save_models
+    # explicit double-splat: Ruby 3 no longer turns a positional Hash into
+    # keyword arguments, and both helpers only accept keywords
+    @data = load_data(**config[:data])
+    @out_dir = prepare_output(**config[:results])
+    @tset_gen = DataModeler::DatasetGen.new data, **opts_for(:datasetgen)
+    @model = DataModeler::Model.from_conf(**opts_for(:learner))
+  end
+  # Main control: up to `nruns` (or until end of data) loop train-test-save
+  # @param report_interval [Integer] interval at which to print to stdout
+  #     (in number of generations) -- will be passed to the `Model`
+  # @return [void]
+  # @note saves predictions and observations (and the model, if
+  #     `save_models?`) to the file system at the end of each run
+  def run report_interval: 1000
+    1.upto(nruns) do |nrun; predictions, observations| # block-local variables
+      begin
+        train_set = tset_gen.train(nrun)
+      rescue DataModeler::DatasetGen::NoDataLeft
+        # will check if there's enough data for both train&test
+        break
+      end
+      model.reset
+      model.train train_set, report_interval: report_interval
+      test_input, observations = tset_gen.test(nrun).values
+      predictions = model.test test_input
+      save_run nrun, model, [predictions, observations]
+    end
+  end
+  # Attribute reader for instance variable `@save_models`, ending in '?' since
+  #    it's a boolean value.
+  # @return [true|false] value of instance variable @save_models
+  #    (false if nil/uninitialized)
+  def save_models?
+    @save_models || false
+  end
+  private
+  # Loads the data in a Hash ready for `DatasetGen` (and `Dataset`)
+  # @param dir [String/path] directory where to find the data (from `config`)
+  # @param file [String/fname] name of the file containing the data (from `config`)
+  # @return [Hash] the data ready for access: one Array per series, keyed by
+  #     series name (`:time` plus every input and target series)
+  # @note aborts the program if `file` does not have a `.csv` extension
+  def load_data dir:, file:
+    filename = Pathname.new(dir).join(file)
+    abort "Only CSV data for now, sorry" unless filename.extname == '.csv'
+    # avoid loading data we won't use
+    series = [:time] + inputs + targets
+    csv_opts = { headers: true, header_converters: :symbol, converters: :float }
+    Hash.new { |h,k| h[k] = [] }.tap do |data|
+      CSV.foreach(filename, **csv_opts) do |row|
+        series.each { |s| data[s] << row[s] }
+      end
+    end
+  end
+  # Prepares a directory to hold the output of each run
+  # @param dir [String/path] directory where to save the results (from `config`)
+  # @param id [String/fname] id of current config/experiment (from `config`)
+  # @return [Pathname] path of the output directory (`dir/id`)
+  # @note side effect: creates directories on file system to hold output
+  def prepare_output dir:, id:
+    Pathname.new(dir).join(id).tap { |path| FileUtils.mkdir_p path }
+  end
+  # Compatibility helper, preparing configuration hashes for different classes
+  # @param who [Symbol] which class are you preparing the config for
+  #     (`:datasetgen`, `:dataset` or `:learner`)
+  # @return [Hash] configuration for the class as required
+  # @note aborts the program if `who` is not recognized
+  def opts_for who
+    case who
+    when :datasetgen
+      { ds_args: opts_for(:dataset),
+        train_size: config[:tset][:train_size],
+        test_size: config[:tset][:test_size]
+      }
+    when :dataset
+      { inputs: inputs,
+        targets:  targets,
+        ntimes: config[:tset][:ntimes],
+        tspread: config[:tset][:tspread],
+        look_ahead: config[:tset][:look_ahead]
+      }
+    when :learner
+      config[:learner].merge({
+        ninputs: (config[:tset][:ntimes] * inputs.size),
+        noutputs: targets.size
+      })
+    else abort "Unrecognized `who`: '#{who}'"
+    end
+  end
+  # Save a run's results on the file system
+  # @param nrun [Integer] the current run number (used as id for naming)
+  # @param model [Model] the model trained in the current run
+  # @param predobs [Array] two-element array `[predictions, observations]`
+  #     (as passed by `#run`); it is transposed so that each CSV row holds
+  #     the predictions followed by the matching observations
+  # @return [void]
+  # @note side effect: saves model and predobs to file system
+  def save_run nrun, model, predobs
+    run_id = format '%02d', nrun
+    model.save out_dir.join("model_#{run_id}.sav") if save_models?
+    CSV.open(out_dir.join("predobs_#{run_id}.csv"), 'wb') do |csv|
+      # header: all prediction columns first, then all observation columns
+      csv << targets.collect { |t| ["p_#{t}", "o_#{t}"] }.transpose.flatten
+      predobs.transpose.each { |po| csv << po.flatten }
+    end
+  end
+end

data/lib/data_modeler/dataset/dataset.rb CHANGED Viewed

@@ -6,8 +6,8 @@
 class DataModeler::Dataset
   attr_reader :data, :input_series, :target_series, :first_idx, :end_idx,
-              :ntimes, :tspread, :look_ahead, :target_idx, :input_idxs,
-              :nrows
+              :ntimes, :tspread, :look_ahead, :first_idx, :target_idx,
+              :input_idxs, :nrows
   # @param data [Hash-like] the data, in an object that can be
   #     accessed by keys and return a time series per each key.
@@ -26,8 +26,7 @@ class DataModeler::Dataset
   #     the target -- i.e., how far ahead the model is trained to predict
   # @note we expect Datasets indices to be used with left inclusion but
   #     right exclusion, i.e. targets are considered in the range `[from,to)`
-  def initialize data, inputs:, targets:, first_idx:, end_idx:,
-      ntimes:, tspread:, look_ahead:
+  def initialize data, inputs:, targets:, first_idx:, end_idx:, ntimes:, tspread:, look_ahead:
     @data = data
     @input_series = inputs
     @target_series = targets
@@ -37,8 +36,8 @@ class DataModeler::Dataset
     @nrows = data[:time].size
     @tspread = tspread
     @look_ahead = look_ahead
-    @target_idx = first_idx
-    @input_idxs = init_inputs
+    @first_idx = first_idx
+    reset_iteration
   end
   # TODO: make sure constructor requirements are unnecessary for static models
@@ -63,8 +62,11 @@ class DataModeler::Dataset
     end
   end
+  ### ITERATION
   # Returns the next pair [inputs, targets]
   # @return [Array]
+  # @raise [StopIteration] when the target index is past the dataset limits
   def peek
     raise StopIteration if target_idx >= end_idx
     [inputs, targets]
@@ -79,7 +81,16 @@ class DataModeler::Dataset
     end
   end
-  include DataModeler::IteratingBasedOnNext # `#each` and `#to_a` based on `#next`
+  # `#each` and `#to_a` based on `#next`
+  include DataModeler::Dataset::IteratingBasedOnNext
+  ### COMPATIBILITY
+  # Compatibility with Hash, which returns a list of series' data arrays
+  # @return [Array<Array>] list of values for each series
+  def values
+    to_a.transpose
+  end
   # Overloaded comparison for easier testing
   # @param other [Dataset] what needs comparing to
@@ -94,7 +105,15 @@ class DataModeler::Dataset
   private
-  include DataModeler::ConvertingTimeAndIndices # `#time` and `#idx`
+  # Resets the indices at the start position -- used for iterations
+  # @return [void]
+  def reset_iteration
+    @target_idx = first_idx
+    @input_idxs = init_inputs
+  end
+  # `#time` and `#idx` for time/index conversion
+  include DataModeler::Dataset::ConvertingTimeAndIndices
   # Initializes input indices vector
   # @return [Array<input_idx>]

data/lib/data_modeler/dataset/dataset_gen.rb CHANGED Viewed

@@ -28,12 +28,14 @@ class DataModeler::DatasetGen
     @first_idx = first_idx
     @train_size = train_size
     @test_size = test_size
-    @local_nrun = 1 # used to iterate over nruns with #next
+    reset_iteration
     @nrows = data[:time].size
     validate_enough_data_for min_nruns
   end
+  ### DATA ACCESS
   # Builds training set for the training
   # @param nrun [Integer] will build different train+test for each run
   # @return [Dataset]
@@ -56,41 +58,53 @@ class DataModeler::DatasetGen
     DataModeler::Dataset.new data, ds_args.merge(first_idx: first, end_idx: last)
   end
+  ### ITERATION
+  # TODO: @local_nrun is an ugly name, refactor it!
   # Returns the next pair [trainset, testset]
   # @return [Array<Dataset, Dataset>]
   def peek
     [self.train(@local_nrun), self.test(@local_nrun)]
   end
-  # TODO: @local_nrun is an ugly hack, refactor it!
   # Returns the next pair [trainset, testset] and increments the counter
   # @return [Array<Dataset, Dataset>]
   def next
     peek.tap { @local_nrun += 1 }
   end
-  include DataModeler::IteratingBasedOnNext # `#each` and `#to_a` based on `#next`
+  # `#each` and `#to_a` based on `#next`
+  include DataModeler::Dataset::IteratingBasedOnNext
   # I want `#to_a` to return an array of arrays rather than an array of dataset
+  # Returns an array of datasets
   # @return [Array<Array[Dataset]>]
   alias_method :to_ds_a, :to_a
+  # Returns an array of arrays (list of inputs-targets pairs)
   # @return [Array<Array<Array<...>>]
   def to_a
-    to_ds_a.collect do |run|
-      run.collect &:to_a
+    to_ds_a.collect do |train_test_for_run|
+      train_test_for_run.collect &:to_a
     end
   end
   private
-  include DataModeler::ConvertingTimeAndIndices # `#time` and `#idx`
+  # Resets the index at the start position -- used for iterations
+  # @return [void]
+  def reset_iteration
+    @local_nrun = 1
+  end
+  # `#time` and `#idx` for time/index conversion
+  include DataModeler::Dataset::ConvertingTimeAndIndices
   # Find the index of the first element in the data eligible as target for training
   # @return [Integer] the index of the first eligible target
   def min_eligible_trg
-    @min_eligible_trg ||= idx(time(0) +
+    @min_eligible_trg ||= idx( time(0) +
       # minimum time span required as input for the first target
       ds_args[:look_ahead] + (ds_args[:ntimes]-1) * ds_args[:tspread]
     )

data/lib/data_modeler/dataset/dataset_helper.rb CHANGED Viewed

@@ -1,4 +1,4 @@
-module DataModeler
+class DataModeler::Dataset
   # Converts between time and indices for referencing data lines
   module ConvertingTimeAndIndices
     # Returns the time for a given index
@@ -30,11 +30,23 @@ module DataModeler
     # Yields on each [inputs, targets] pair.
     # @return [nil, Iterator] `block_given? ? nil : Iterator`
     def each
+      reset_iteration
       return enum_for(:each) unless block_given?
       loop { yield self.next }
       nil
     end
+    # Yields on each [inputs, targets] pair, collecting the block's return values.
+    # @return [Array, Iterator] `block_given? ? Array : Iterator`
+    def map
+      reset_iteration
+      return enum_for(:collect) unless block_given?
+      [].tap { |ret| loop { ret << yield(self.next) } }
+    end
+    # @see #map
+    alias_method :collect, :map
     # @return [Array]
     def to_a
       each.to_a

data/lib/data_modeler/helpers.rb ADDED Viewed

@@ -0,0 +1,17 @@
+# Helper functions go here
+module DataModeler
+  # Returns a standardized String ID from a (sequentially named) file
+  # @param filename [String] file name ending in `_<digits>.rb`
+  # @return [String] the number from the file name, zero-padded to at
+  #     least two digits
+  # @raise [TypeError] if `filename` does not end in `_<digits>.rb`
+  # @note convenient method to have available in the config
+  # @note the default `__FILE__` refers to the file defining this method,
+  #     so callers should pass their own `__FILE__` explicitly
+  def self.id_from_filename filename=__FILE__
+    # escaped dot and `\z`: match a literal `.rb` at the very end of the name
+    digits = filename[/_(\d+)\.rb\z/, 1]
+    # TypeError kept on purpose: it is what `Integer(nil)` used to raise here
+    raise TypeError, "no numeric id in filename: #{filename.inspect}" unless digits
+    # explicit base 10: a bare `Integer("08")` reads the leading zero as octal
+    format "%02d", Integer(digits, 10)
+  end
+  # Returns an instance of the Base class
+  # @param config [Hash] Base class configuration
+  # @return [Base] initialized instance of Base class
+  def self.new config
+    DataModeler::Base.new config
+  end
+end

data/lib/data_modeler/model.rb ADDED Viewed

@@ -0,0 +1,17 @@
+# All models for the framework should belong to this module.
+# Also includes a model selector for initialization from config.
+module DataModeler::Model
+  # Returns a new Model correctly initialized based on the `type` of choice
+  # @param type [Symbol] which type of Model is chosen (only `:fann` for now)
+  # @param opts [splatted Hash params] the rest of the parameters will be passed
+  #     to the model for initialization
+  # @return [Model] a correctly initialized Model of type `type`
+  # @note aborts the program if `type` is not recognized
+  def self.from_conf type:, **opts
+    case type
+    when :fann
+      # explicit double-splat: `FANN#initialize` only takes keyword arguments,
+      # and Ruby 3 no longer converts a positional Hash into keywords
+      FANN.new(**opts)
+    else abort "Unrecognized model: #{type}"
+    end
+  end
+end

data/lib/data_modeler/model/fann.rb CHANGED Viewed

@@ -3,19 +3,22 @@ require 'ruby-fann'
 # Model class based on Fast Artificial Neural Networks (FANN)
 class DataModeler::Model::FANN
-  attr_reader :opts, :fann, :algo, :actfn
+  attr_reader :fann_opts, :ngens, :fann, :algo, :actfn
-  # @param netstruct [Array<ninputs, Array<hidden_layers>, noutputs>] network
-  #     structure
+  # @param ngens [Integer] number of generations allotted for training
+  # @param hidden_layers [Array<Integer>] list of number of hidden neurons
+  #      per each hidden layer in the network
+  # @param ninputs [Integer] number of inputs of the network
+  # @param noutputs [Integer] number of outputs of the network
   # @param algo [:incremental, :batch, :rprop, :quickprop] training algorithm
   # @param actfn [:sigmoid, ...] activation function
-  def initialize netstruct, algo: nil, actfn: nil
-    ninputs, hidden_layers, noutputs = netstruct
-    @opts = {
+  def initialize ngens:, hidden_layers:, ninputs:, noutputs:, algo: nil, actfn: nil
+    @fann_opts = {
       num_inputs: ninputs,
       hidden_neurons: hidden_layers,
       num_outputs: noutputs
     }
+    @ngens = ngens
     @algo = algo
     @actfn = actfn
     reset
@@ -24,7 +27,7 @@ class DataModeler::Model::FANN
   # Resets / initializes the model
   # @return [void]
   def reset
-    @fann = RubyFann::Standard.new opts
+    @fann = RubyFann::Standard.new fann_opts
     fann.set_training_algorithm(algo) if algo
     if actfn
       fann.set_activation_function_hidden(actfn)
@@ -34,16 +37,17 @@ class DataModeler::Model::FANN
   end
   # Trains the model for ngens on the trainset
-  # @param ngens [Integer] number of training generations
   # @param trainset [Hash-like<input: Array, target: Array>] training set
+  # @param ngens [Integer] number of training generations
   # @return [void]
-  def train ngens, trainset
-    tset = RubyFann::TrainData.new(
-      inputs: trainset[:input], desired_outputs: trainset[:target])
+  def train trainset, ngens=@ngens, report_interval: 1000, desired_error: 1e-10
+    # TODO: optimize maybe?
+    inputs, targets = trainset.values
+    tset = RubyFann::TrainData.new inputs: inputs, desired_outputs: targets
     # fann.init_weights tset # test this weights initialization
-    # params: train_data, max_epochs, reports_interval, desired_error
-    fann.train_on_data(tset, ngens, 1000, 1e-10)
+    # params: train_data, max_epochs, report_interval, desired_error
+    fann.train_on_data(tset, ngens, report_interval, desired_error)
   end
   # Tests the model on inputs.
@@ -59,6 +63,6 @@ class DataModeler::Model::FANN
   def save filename
     # can do filename check here...?
     # TODO: I'd like to have a kind of `to_s`, and do all the saving in the modeler...
-    fann.save filename
+    fann.save filename.to_s
   end
 end

data/lib/data_modeler/version.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 # Main gem module
 module DataModeler
   # Version number
-  VERSION = "0.2.1"
+  VERSION = "0.3.0"
 end

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: data_modeler
 version: !ruby/object:Gem::Version
-  version: 0.2.1
+  version: 0.3.0
 platform: ruby
 authors:
 - Giuseppe Cuccu
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2017-05-13 00:00:00.000000000 Z
+date: 2017-05-16 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: ruby-fann
@@ -156,10 +156,13 @@ files:
 - bin/setup
 - data_modeler.gemspec
 - lib/data_modeler.rb
+- lib/data_modeler/base.rb
 - lib/data_modeler/dataset/dataset.rb
 - lib/data_modeler/dataset/dataset_gen.rb
 - lib/data_modeler/dataset/dataset_helper.rb
 - lib/data_modeler/exceptions.rb
+- lib/data_modeler/helpers.rb
+- lib/data_modeler/model.rb
 - lib/data_modeler/model/fann.rb
 - lib/data_modeler/version.rb
 homepage: https://github.com/giuse/data_modeler