data_modeler 0.2.1 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 023a9053513981d2058cd23f7b0a9154da588b6b
4
- data.tar.gz: 5e7a9c83204a2da747aa273e1ced42b4d1fefe36
3
+ metadata.gz: 762480cf9239c43cfe81e82634a63b52b2ac1d28
4
+ data.tar.gz: 7554ba11f59112d0dddc39f80ad1c4b897fb02a0
5
5
  SHA512:
6
- metadata.gz: 70cbccdd7cd7c9a70142c853f2a68e2a9be28412b05c48d6cd0b0440b703d6f878cdb58b1b0902efab321b4064116968aab37288ae2618634e44fc689ce0c433
7
- data.tar.gz: 5d6ddf4e31fe4ae1d18973bf6719378037e9899d51627fdfc607813d8fff7de362d30aff0b4cb97343e13078414fde6a4a52e806e21f96fd572a0dfef740552e
6
+ metadata.gz: 84547a8cf68c84f42a58aec83961b644315095aac87c33f30d2aa0cacfbd2f6b966b95ef060aae253866bdb2e4dd2f37bf29a662eaa5d2b14429d790ab68a03f
7
+ data.tar.gz: 2501a06535f433c6a58b45ccbad701c37ebb2be6c062ee9a2d627fa2a9f34235aa20dd88c57738af9f0d5f1f03a6c18fd4aa5336428d2bf0260f6682c57e7127
data/lib/data_modeler.rb CHANGED
@@ -1,11 +1,16 @@
1
+ # Helpers
1
2
  require "data_modeler/version"
2
3
  require "data_modeler/exceptions"
4
+ require "data_modeler/helpers"
3
5
 
4
6
  # Dataset
5
7
  require "data_modeler/dataset/dataset_helper"
6
8
  require "data_modeler/dataset/dataset"
7
9
  require "data_modeler/dataset/dataset_gen"
8
10
 
9
- # Models (should be added to this module)
10
- module DataModeler::Model; end
11
+ # Models
12
+ require "data_modeler/model"
11
13
  require "data_modeler/model/fann"
14
+
15
+ # Modeler
16
+ require "data_modeler/base"
@@ -0,0 +1,127 @@
1
+ # TODO: use fastestcsv if available
2
+ require 'csv'
3
+
4
+ # Base class, core of the DataModeler framework.
5
+ # - Initializes the system based on the config
6
+ # - Runs over the data training and testing models
7
+ # - Results and models are saved to the file system
8
+ class DataModeler::Base
9
+
10
+ attr_reader :config, :inputs, :targets, :train_size, :test_size,
11
+ :nruns, :data, :out_dir, :tset_gen, :model
12
+
13
+ # @param config [Hash] configuration hash for the whole experiment setup
14
+ def initialize config
15
+ @config = config
16
+ @inputs = config[:tset][:input_series].map! &:to_sym
17
+ @targets = config[:tset][:target_series].map! &:to_sym
18
+ @train_size = config[:tset][:train_size]
19
+ @test_size = config[:tset][:test_size]
20
+ @nruns = config[:tset][:nruns] ||= Float::INFINITY # terminates with data
21
+ @save_models = config[:results].delete :save_models
22
+
23
+ @data = load_data config[:data]
24
+ @out_dir = prepare_output config[:results]
25
+
26
+ @tset_gen = DataModeler::DatasetGen.new data, **opts_for(:datasetgen)
27
+ @model = DataModeler::Model.from_conf **opts_for(:learner)
28
+ end
29
+
30
+ # Main control: up to `nruns` (or until end of data) loop train-test-save
31
+ # @param report_interval [Integer] interval at which to print to stdout
32
+ # (in number of generations) -- will be passed to the `Model`
33
+ # @return [void]
34
+ # @note saves model, preds and obs to the file system at the end of each run
35
+ def run report_interval: 1000
36
+ 1.upto(nruns) do |nrun; predictions, observations| # block-local variables
37
+ begin
38
+ train_set = tset_gen.train(nrun)
39
+ rescue DataModeler::DatasetGen::NoDataLeft
40
+ # will check if there's enough data for both train&test
41
+ break
42
+ end
43
+ model.reset
44
+ model.train train_set, report_interval: report_interval
45
+ test_input, observations = tset_gen.test(nrun).values
46
+ predictions = model.test test_input
47
+ save_run nrun, model, [predictions, observations]
48
+ end
49
+ end
50
+
51
+ # Attribute reader for instance variable `@save_models`, ending in '?' since
52
+ # it's a boolean value.
53
+ # @return [true|false] value of instance variable @save_models
54
+ # (false if nil/uninitialized)
55
+ def save_models?
56
+ @save_models || false
57
+ end
58
+
59
+ private
60
+
61
+ # Loads the data in a Hash ready for `DatasetGen` (and `Dataset`)
62
+ # @param dir [String/path] directory where to find the data (from `config`)
63
+ # @param file [String/fname] name of the file containing the data (from `config`)
64
+ # @return [Hash] the data ready for access
65
+ def load_data dir:, file:
66
+ filename = Pathname.new(dir).join(file)
67
+ abort "Only CSV data for now, sorry" unless filename.extname == '.csv'
68
+ # avoid loading data we won't use
69
+ series = [:time] + inputs + targets
70
+ csv_opts = { headers: true, header_converters: :symbol, converters: :float }
71
+ Hash.new { |h,k| h[k] = [] }.tap do |data|
72
+ CSV.foreach(filename, **csv_opts) do |row|
73
+ series.each { |s| data[s] << row[s] }
74
+ end
75
+ end
76
+ end
77
+
78
+ # Prepares a directory to hold the output of each run
79
+ # @param dir [String/path] directory where to save the results (from `config`)
80
+ # @param id [String/fname] id of current config/experiment (from `config`)
81
+ # @return [Pathname] path of the output directory (created if missing)
82
+ # @note side effect: creates directories on file system to hold output
83
+ def prepare_output dir:, id:
84
+ Pathname.new(dir).join(id).tap { |path| FileUtils.mkdir_p path }
85
+ end
86
+
87
+ # Compatibility helper, preparing configuration hashes for different classes
88
+ # @param who [Symbol] which class are you preparing the config for
89
+ # @return [Hash] configuration for the class as required
90
+ def opts_for who
91
+ case who
92
+ when :datasetgen
93
+ { ds_args: opts_for(:dataset),
94
+ train_size: config[:tset][:train_size],
95
+ test_size: config[:tset][:test_size]
96
+ }
97
+ when :dataset
98
+ { inputs: inputs,
99
+ targets: targets,
100
+ ntimes: config[:tset][:ntimes],
101
+ tspread: config[:tset][:tspread],
102
+ look_ahead: config[:tset][:look_ahead]
103
+ }
104
+ when :learner
105
+ config[:learner].merge({
106
+ ninputs: (config[:tset][:ntimes] * inputs.size),
107
+ noutputs: targets.size
108
+ })
109
+ else abort "Unrecognized `who`: '#{who}'"
110
+ end
111
+ end
112
+
113
+ # Save a run's results on the file system
114
+ # @param nrun [Integer] the current run number (used as id for naming)
115
+ # @param model [Model] the model trained in the current run
116
+ # @param predobs [Array<Array>] pair `[predictions, observations]`; transposed into rows when saving
117
+ # @return [void]
118
+ # @note side effect: saves model and predobs to file system
119
+ def save_run nrun, model, predobs
120
+ run_id = format '%02d', nrun
121
+ model.save out_dir.join("model_#{run_id}.sav") if save_models?
122
+ CSV.open(out_dir.join("predobs_#{run_id}.csv"), 'wb') do |csv|
123
+ csv << targets.collect { |t| ["p_#{t}", "o_#{t}"] }.transpose.flatten
124
+ predobs.transpose.each { |po| csv << po.flatten }
125
+ end
126
+ end
127
+ end
@@ -6,8 +6,8 @@
6
6
  class DataModeler::Dataset
7
7
 
8
8
  attr_reader :data, :input_series, :target_series, :first_idx, :end_idx,
9
- :ntimes, :tspread, :look_ahead, :target_idx, :input_idxs,
10
- :nrows
9
+ :ntimes, :tspread, :look_ahead, :first_idx, :target_idx,
10
+ :input_idxs, :nrows
11
11
 
12
12
  # @param data [Hash-like] the data, in an object that can be
13
13
  # accessed by keys and return a time series per each key.
@@ -26,8 +26,7 @@ class DataModeler::Dataset
26
26
  # the target -- i.e., how far ahead the model is trained to predict
27
27
  # @note we expect Datasets indices to be used with left inclusion but
28
28
  # right exclusion, i.e. targets are considered in the range `[from,to)`
29
- def initialize data, inputs:, targets:, first_idx:, end_idx:,
30
- ntimes:, tspread:, look_ahead:
29
+ def initialize data, inputs:, targets:, first_idx:, end_idx:, ntimes:, tspread:, look_ahead:
31
30
  @data = data
32
31
  @input_series = inputs
33
32
  @target_series = targets
@@ -37,8 +36,8 @@ class DataModeler::Dataset
37
36
  @nrows = data[:time].size
38
37
  @tspread = tspread
39
38
  @look_ahead = look_ahead
40
- @target_idx = first_idx
41
- @input_idxs = init_inputs
39
+ @first_idx = first_idx
40
+ reset_iteration
42
41
  end
43
42
 
44
43
  # TODO: make sure constructor requirements are unnecessary for static models
@@ -63,8 +62,11 @@ class DataModeler::Dataset
63
62
  end
64
63
  end
65
64
 
65
+ ### ITERATION
66
+
66
67
  # Returns the next pair [inputs, targets]
67
68
  # @return [Array]
69
+ # @raise [StopIteration] when the target index is past the dataset limits
68
70
  def peek
69
71
  raise StopIteration if target_idx >= end_idx
70
72
  [inputs, targets]
@@ -79,7 +81,16 @@ class DataModeler::Dataset
79
81
  end
80
82
  end
81
83
 
82
- include DataModeler::IteratingBasedOnNext # `#each` and `#to_a` based on `#next`
84
+ # `#each` and `#to_a` based on `#next`
85
+ include DataModeler::Dataset::IteratingBasedOnNext
86
+
87
+ ### COMPATIBILITY
88
+
89
+ # Compatibility with Hash, which returns a list of series' data arrays
90
+ # @return [Array<Array>] `[inputs, targets]`: the list of all inputs and the list of all targets
91
+ def values
92
+ to_a.transpose
93
+ end
83
94
 
84
95
  # Overloaded comparison for easier testing
85
96
  # @param other [Dataset] what needs comparing to
@@ -94,7 +105,15 @@ class DataModeler::Dataset
94
105
 
95
106
  private
96
107
 
97
- include DataModeler::ConvertingTimeAndIndices # `#time` and `#idx`
108
+ # Resets the indices at the start position -- used for iterations
109
+ # @return [void]
110
+ def reset_iteration
111
+ @target_idx = first_idx
112
+ @input_idxs = init_inputs
113
+ end
114
+
115
+ # `#time` and `#idx` for time/index conversion
116
+ include DataModeler::Dataset::ConvertingTimeAndIndices
98
117
 
99
118
  # Initializes input indices vector
100
119
  # @return [Array<input_idx>]
@@ -28,12 +28,14 @@ class DataModeler::DatasetGen
28
28
  @first_idx = first_idx
29
29
  @train_size = train_size
30
30
  @test_size = test_size
31
- @local_nrun = 1 # used to iterate over nruns with #next
31
+ reset_iteration
32
32
 
33
33
  @nrows = data[:time].size
34
34
  validate_enough_data_for min_nruns
35
35
  end
36
36
 
37
+ ### DATA ACCESS
38
+
37
39
  # Builds training set for the training
38
40
  # @param nrun [Integer] will build different train+test for each run
39
41
  # @return [Dataset]
@@ -56,41 +58,53 @@ class DataModeler::DatasetGen
56
58
  DataModeler::Dataset.new data, ds_args.merge(first_idx: first, end_idx: last)
57
59
  end
58
60
 
61
+ ### ITERATION
62
+
63
+ # TODO: @local_nrun is an ugly name, refactor it!
64
+
59
65
  # Returns the next pair [trainset, testset]
60
66
  # @return [Array<Dataset, Dataset>]
61
67
  def peek
62
68
  [self.train(@local_nrun), self.test(@local_nrun)]
63
69
  end
64
70
 
65
- # TODO: @local_nrun is an ugly hack, refactor it!
66
-
67
71
  # Returns the next pair [trainset, testset] and increments the counter
68
72
  # @return [Array<Dataset, Dataset>]
69
73
  def next
70
74
  peek.tap { @local_nrun += 1 }
71
75
  end
72
76
 
73
- include DataModeler::IteratingBasedOnNext # `#each` and `#to_a` based on `#next`
77
+ # `#each` and `#to_a` based on `#next`
78
+ include DataModeler::Dataset::IteratingBasedOnNext
74
79
 
75
80
  # I want `#to_a` to return an array of arrays rather than an array of dataset
76
81
 
82
+ # Returns an array of datasets
77
83
  # @return [Array<Array[Dataset]>]
78
84
  alias_method :to_ds_a, :to_a
85
+ # Returns an array of arrays (list of inputs-targets pairs)
79
86
  # @return [Array<Array<Array<...>>]
80
87
  def to_a
81
- to_ds_a.collect do |run|
82
- run.collect &:to_a
88
+ to_ds_a.collect do |train_test_for_run|
89
+ train_test_for_run.collect &:to_a
83
90
  end
84
91
  end
85
92
 
86
93
  private
87
94
 
88
- include DataModeler::ConvertingTimeAndIndices # `#time` and `#idx`
95
+ # Resets the index at the start position -- used for iterations
96
+ # @return [void]
97
+ def reset_iteration
98
+ @local_nrun = 1
99
+ end
100
+
101
+ # `#time` and `#idx` for time/index conversion
102
+ include DataModeler::Dataset::ConvertingTimeAndIndices
89
103
 
90
104
  # Find the index of the first element in the data eligible as target for training
91
105
  # @return [Integer] the index of the first eligible target
92
106
  def min_eligible_trg
93
- @min_eligible_trg ||= idx(time(0) +
107
+ @min_eligible_trg ||= idx( time(0) +
94
108
  # minimum time span required as input for the first target
95
109
  ds_args[:look_ahead] + (ds_args[:ntimes]-1) * ds_args[:tspread]
96
110
  )
@@ -1,4 +1,4 @@
1
- module DataModeler
1
+ class DataModeler::Dataset
2
2
  # Converts between time and indices for referencing data lines
3
3
  module ConvertingTimeAndIndices
4
4
  # Returns the time for a given index
@@ -30,11 +30,23 @@ module DataModeler
30
30
  # Yields on each [inputs, targets] pair.
31
31
  # @return [nil, Iterator] `block_given? ? nil : Iterator`
32
32
  def each
33
+ reset_iteration
33
34
  return enum_for(:each) unless block_given?
34
35
  loop { yield self.next }
35
36
  nil
36
37
  end
37
38
 
39
+ # Yields on each [inputs, targets] pair, collecting the block's results.
40
+ # @return [Array, Iterator] `block_given? ? Array : Iterator`
41
+ def map
42
+ reset_iteration
43
+ return enum_for(:collect) unless block_given?
44
+ [].tap { |ret| loop { ret << yield(self.next) } }
45
+ end
46
+
47
+ # @see #collect
48
+ alias_method :collect, :map
49
+
38
50
  # @return [Array]
39
51
  def to_a
40
52
  each.to_a
@@ -0,0 +1,17 @@
1
+
2
+ # Helper functions go here
3
+ module DataModeler
4
+ # Returns a standardized String ID from a (sequentially named) file
5
+ # @return [String]
6
+ # @note convenient method to have available in the config
7
+ def self.id_from_filename filename=__FILE__
8
+ format "%02d", Integer(filename[/_(\d+).rb$/,1])
9
+ end
10
+
11
+ # Returns an instance of the Base class
12
+ # @param config [Hash] Base class configuration
13
+ # @return [Base] initialized instance of Base class
14
+ def self.new config
15
+ DataModeler::Base.new config
16
+ end
17
+ end
@@ -0,0 +1,17 @@
1
+
2
+ # All models for the framework should belong to this module.
3
+ # Also includes a model selector for initialization from config.
4
+ module DataModeler::Model
5
+ # Returns a new Model correctly initialized based on the `type` of choice
6
+ # @param type [Symbol] which type of Model is chosen
7
+ # @param opts [splatted Hash params] the rest of the parameters will be passed
8
+ # to the model for initialization
9
+ # @return [Model] a correctly initialized Model of type `type`
10
+ def self.from_conf type:, **opts
11
+ case type
12
+ when :fann
13
+ FANN.new opts
14
+ else abort "Unrecognized model: #{type}"
15
+ end
16
+ end
17
+ end
@@ -3,19 +3,22 @@ require 'ruby-fann'
3
3
  # Model class based on Fast Artificial Neural Networks (FANN)
4
4
  class DataModeler::Model::FANN
5
5
 
6
- attr_reader :opts, :fann, :algo, :actfn
6
+ attr_reader :fann_opts, :ngens, :fann, :algo, :actfn
7
7
 
8
- # @param netstruct [Array<ninputs, Array<hidden_layers>, noutputs>] network
9
- # structure
8
+ # @param ngens [Integer] number of generations allotted for training
9
+ # @param hidden_layers [Array<Integer>] list of number of hidden neurons
10
+ # per each hidden layer in the network
11
+ # @param ninputs [Integer] number of inputs of the network
12
+ # @param noutputs [Integer] number of outputs of the network
10
13
  # @param algo [:incremental, :batch, :rprop, :quickprop] training algorithm
11
14
  # @param actfn [:sigmoid, ...] activation function
12
- def initialize netstruct, algo: nil, actfn: nil
13
- ninputs, hidden_layers, noutputs = netstruct
14
- @opts = {
15
+ def initialize ngens:, hidden_layers:, ninputs:, noutputs:, algo: nil, actfn: nil
16
+ @fann_opts = {
15
17
  num_inputs: ninputs,
16
18
  hidden_neurons: hidden_layers,
17
19
  num_outputs: noutputs
18
20
  }
21
+ @ngens = ngens
19
22
  @algo = algo
20
23
  @actfn = actfn
21
24
  reset
@@ -24,7 +27,7 @@ class DataModeler::Model::FANN
24
27
  # Resets / initializes the model
25
28
  # @return [void]
26
29
  def reset
27
- @fann = RubyFann::Standard.new opts
30
+ @fann = RubyFann::Standard.new fann_opts
28
31
  fann.set_training_algorithm(algo) if algo
29
32
  if actfn
30
33
  fann.set_activation_function_hidden(actfn)
@@ -34,16 +37,17 @@ class DataModeler::Model::FANN
34
37
  end
35
38
 
36
39
  # Trains the model for ngens on the trainset
37
- # @param ngens [Integer] number of training generations
38
40
  # @param trainset [Hash-like<input: Array, target: Array>] training set
41
+ # @param ngens [Integer] number of training generations
39
42
  # @return [void]
40
- def train ngens, trainset
41
- tset = RubyFann::TrainData.new(
42
- inputs: trainset[:input], desired_outputs: trainset[:target])
43
+ def train trainset, ngens=@ngens, report_interval: 1000, desired_error: 1e-10
44
+ # TODO: optimize maybe?
45
+ inputs, targets = trainset.values
46
+ tset = RubyFann::TrainData.new inputs: inputs, desired_outputs: targets
43
47
  # fann.init_weights tset # test this weights initialization
44
48
 
45
- # params: train_data, max_epochs, reports_interval, desired_error
46
- fann.train_on_data(tset, ngens, 1000, 1e-10)
49
+ # params: train_data, max_epochs, report_interval, desired_error
50
+ fann.train_on_data(tset, ngens, report_interval, desired_error)
47
51
  end
48
52
 
49
53
  # Tests the model on inputs.
@@ -59,6 +63,6 @@ class DataModeler::Model::FANN
59
63
  def save filename
60
64
  # can do filename check here...?
61
65
  # TODO: I'd like to have a kind of `to_s`, and do all the saving in the modeler...
62
- fann.save filename
66
+ fann.save filename.to_s
63
67
  end
64
68
  end
@@ -1,5 +1,5 @@
1
1
  # Main gem module
2
2
  module DataModeler
3
3
  # Version number
4
- VERSION = "0.2.1"
4
+ VERSION = "0.3.0"
5
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: data_modeler
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.1
4
+ version: 0.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Giuseppe Cuccu
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2017-05-13 00:00:00.000000000 Z
11
+ date: 2017-05-16 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: ruby-fann
@@ -156,10 +156,13 @@ files:
156
156
  - bin/setup
157
157
  - data_modeler.gemspec
158
158
  - lib/data_modeler.rb
159
+ - lib/data_modeler/base.rb
159
160
  - lib/data_modeler/dataset/dataset.rb
160
161
  - lib/data_modeler/dataset/dataset_gen.rb
161
162
  - lib/data_modeler/dataset/dataset_helper.rb
162
163
  - lib/data_modeler/exceptions.rb
164
+ - lib/data_modeler/helpers.rb
165
+ - lib/data_modeler/model.rb
163
166
  - lib/data_modeler/model/fann.rb
164
167
  - lib/data_modeler/version.rb
165
168
  homepage: https://github.com/giuse/data_modeler