data_modeler 0.2.1 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 023a9053513981d2058cd23f7b0a9154da588b6b
4
- data.tar.gz: 5e7a9c83204a2da747aa273e1ced42b4d1fefe36
3
+ metadata.gz: 762480cf9239c43cfe81e82634a63b52b2ac1d28
4
+ data.tar.gz: 7554ba11f59112d0dddc39f80ad1c4b897fb02a0
5
5
  SHA512:
6
- metadata.gz: 70cbccdd7cd7c9a70142c853f2a68e2a9be28412b05c48d6cd0b0440b703d6f878cdb58b1b0902efab321b4064116968aab37288ae2618634e44fc689ce0c433
7
- data.tar.gz: 5d6ddf4e31fe4ae1d18973bf6719378037e9899d51627fdfc607813d8fff7de362d30aff0b4cb97343e13078414fde6a4a52e806e21f96fd572a0dfef740552e
6
+ metadata.gz: 84547a8cf68c84f42a58aec83961b644315095aac87c33f30d2aa0cacfbd2f6b966b95ef060aae253866bdb2e4dd2f37bf29a662eaa5d2b14429d790ab68a03f
7
+ data.tar.gz: 2501a06535f433c6a58b45ccbad701c37ebb2be6c062ee9a2d627fa2a9f34235aa20dd88c57738af9f0d5f1f03a6c18fd4aa5336428d2bf0260f6682c57e7127
data/lib/data_modeler.rb CHANGED
@@ -1,11 +1,16 @@
1
+ # Helpers
1
2
  require "data_modeler/version"
2
3
  require "data_modeler/exceptions"
4
+ require "data_modeler/helpers"
3
5
 
4
6
  # Dataset
5
7
  require "data_modeler/dataset/dataset_helper"
6
8
  require "data_modeler/dataset/dataset"
7
9
  require "data_modeler/dataset/dataset_gen"
8
10
 
9
- # Models (should be added to this module)
10
- module DataModeler::Model; end
11
+ # Models
12
+ require "data_modeler/model"
11
13
  require "data_modeler/model/fann"
14
+
15
+ # Modeler
16
+ require "data_modeler/base"
@@ -0,0 +1,127 @@
1
+ # TODO: use fastestcsv if available
2
+ require 'csv'
3
+
4
+ # Base class, core of the DataModeler framework.
5
+ # - Initializes the system based on the config
6
+ # - Runs over the data training and testing models
7
+ # - Results and models are saved to the file system
8
+ class DataModeler::Base
9
+
10
+ attr_reader :config, :inputs, :targets, :train_size, :test_size,
11
+ :nruns, :data, :out_dir, :tset_gen, :model
12
+
13
+ # @param config [Hash] configuration hash for the whole experiment setup
14
+ def initialize config
15
+ @config = config
16
+ @inputs = config[:tset][:input_series].map! &:to_sym
17
+ @targets = config[:tset][:target_series].map! &:to_sym
18
+ @train_size = config[:tset][:train_size]
19
+ @test_size = config[:tset][:test_size]
20
+ @nruns = config[:tset][:nruns] ||= Float::INFINITY # terminates with data
21
+ @save_models = config[:results].delete :save_models
22
+
23
+ @data = load_data config[:data]
24
+ @out_dir = prepare_output config[:results]
25
+
26
+ @tset_gen = DataModeler::DatasetGen.new data, **opts_for(:datasetgen)
27
+ @model = DataModeler::Model.from_conf **opts_for(:learner)
28
+ end
29
+
30
+ # Main control: up to `nruns` (or until end of data) loop train-test-save
31
+ # @param report_interval [Integer] interval at which to print to stdout
32
+ # (in number of generations) -- will be passed to the `Model`
33
+ # @return [void]
34
+ # @note saves model, preds and obs to the file system at the end of each run
35
+ def run report_interval: 1000
36
+ 1.upto(nruns) do |nrun; predictions, observations| # block-local variables
37
+ begin
38
+ train_set = tset_gen.train(nrun)
39
+ rescue DataModeler::DatasetGen::NoDataLeft
40
+ # will check if there's enough data for both train&test
41
+ break
42
+ end
43
+ model.reset
44
+ model.train train_set, report_interval: report_interval
45
+ test_input, observations = tset_gen.test(nrun).values
46
+ predictions = model.test test_input
47
+ save_run nrun, model, [predictions, observations]
48
+ end
49
+ end
50
+
51
+ # Attribute reader for instance variable `@save_models`, ending in '?' since
52
+ # it's a boolean value.
53
+ # @return [true|false] value of instance variable @save_models
54
+ # (false if nil/uninitialized)
55
+ def save_models?
56
+ @save_models || false
57
+ end
58
+
59
+ private
60
+
61
+ # Loads the data in a Hash ready for `DatasetGen` (and `Dataset`)
62
+ # @param dir [String/path] directory where to find the data (from `config`)
63
+ # @param file [String/fname] name of the file containing the data (from `config`)
64
+ # @return [Hash] the data ready for access
65
+ def load_data dir:, file:
66
+ filename = Pathname.new(dir).join(file)
67
+ abort "Only CSV data for now, sorry" unless filename.extname == '.csv'
68
+ # avoid loading data we won't use
69
+ series = [:time] + inputs + targets
70
+ csv_opts = { headers: true, header_converters: :symbol, converters: :float }
71
+ Hash.new { |h,k| h[k] = [] }.tap do |data|
72
+ CSV.foreach(filename, **csv_opts) do |row|
73
+ series.each { |s| data[s] << row[s] }
74
+ end
75
+ end
76
+ end
77
+
78
+ # Prepares a directory to hold the output of each run
79
+ # @param dir [String/path] directory where to save the results (from `config`)
80
+ # @param id [String/fname] id of current config/experiment (from `config`)
81
+ # @return [void]
82
+ # @note side effect: creates directories on file system to hold output
83
+ def prepare_output dir:, id:
84
+ Pathname.new(dir).join(id).tap { |path| FileUtils.mkdir_p path }
85
+ end
86
+
87
+ # Compatibility helper, preparing configuration hashes for different classes
88
+ # @param who [Symbol] which class are you preparing the config for
89
+ # @return [Hash] configuration for the class as required
90
+ def opts_for who
91
+ case who
92
+ when :datasetgen
93
+ { ds_args: opts_for(:dataset),
94
+ train_size: config[:tset][:train_size],
95
+ test_size: config[:tset][:test_size]
96
+ }
97
+ when :dataset
98
+ { inputs: inputs,
99
+ targets: targets,
100
+ ntimes: config[:tset][:ntimes],
101
+ tspread: config[:tset][:tspread],
102
+ look_ahead: config[:tset][:look_ahead]
103
+ }
104
+ when :learner
105
+ config[:learner].merge({
106
+ ninputs: (config[:tset][:ntimes] * inputs.size),
107
+ noutputs: targets.size
108
+ })
109
+ else abort "Unrecognized `who`: '#{who}'"
110
+ end
111
+ end
112
+
113
+ # Save a run's results on the file system
114
+ # @param nrun [Integer] the current run number (used as id for naming)
115
+ # @param model [Model] the model trained in the current run
116
+ # @param predobs [Array<Array<pred, obs>>] list of prediction-observation pairs
117
+ # @return [void]
118
+ # @note side effect: saves model and predobs to file system
119
+ def save_run nrun, model, predobs
120
+ run_id = format '%02d', nrun
121
+ model.save out_dir.join("model_#{run_id}.sav") if save_models?
122
+ CSV.open(out_dir.join("predobs_#{run_id}.csv"), 'wb') do |csv|
123
+ csv << targets.collect { |t| ["p_#{t}", "o_#{t}"] }.transpose.flatten
124
+ predobs.transpose.each { |po| csv << po.flatten }
125
+ end
126
+ end
127
+ end
@@ -6,8 +6,8 @@
6
6
  class DataModeler::Dataset
7
7
 
8
8
  attr_reader :data, :input_series, :target_series, :first_idx, :end_idx,
9
- :ntimes, :tspread, :look_ahead, :target_idx, :input_idxs,
10
- :nrows
9
+ :ntimes, :tspread, :look_ahead, :first_idx, :target_idx,
10
+ :input_idxs, :nrows
11
11
 
12
12
  # @param data [Hash-like] the data, in an object that can be
13
13
  # accessed by keys and return a time series per each key.
@@ -26,8 +26,7 @@ class DataModeler::Dataset
26
26
  # the target -- i.e., how far ahead the model is trained to predict
27
27
  # @note we expect Datasets indices to be used with left inclusion but
28
28
  # right exclusion, i.e. targets are considered in the range `[from,to)`
29
- def initialize data, inputs:, targets:, first_idx:, end_idx:,
30
- ntimes:, tspread:, look_ahead:
29
+ def initialize data, inputs:, targets:, first_idx:, end_idx:, ntimes:, tspread:, look_ahead:
31
30
  @data = data
32
31
  @input_series = inputs
33
32
  @target_series = targets
@@ -37,8 +36,8 @@ class DataModeler::Dataset
37
36
  @nrows = data[:time].size
38
37
  @tspread = tspread
39
38
  @look_ahead = look_ahead
40
- @target_idx = first_idx
41
- @input_idxs = init_inputs
39
+ @first_idx = first_idx
40
+ reset_iteration
42
41
  end
43
42
 
44
43
  # TODO: make sure constructor requirements are unnecessary for static models
@@ -63,8 +62,11 @@ class DataModeler::Dataset
63
62
  end
64
63
  end
65
64
 
65
+ ### ITERATION
66
+
66
67
  # Returns the next pair [inputs, targets]
67
68
  # @return [Array]
69
+ # @raise [StopIteration] when the target index is past the dataset limits
68
70
  def peek
69
71
  raise StopIteration if target_idx >= end_idx
70
72
  [inputs, targets]
@@ -79,7 +81,16 @@ class DataModeler::Dataset
79
81
  end
80
82
  end
81
83
 
82
- include DataModeler::IteratingBasedOnNext # `#each` and `#to_a` based on `#next`
84
+ # `#each` and `#to_a` based on `#next`
85
+ include DataModeler::Dataset::IteratingBasedOnNext
86
+
87
+ ### COMPATIBILITY
88
+
89
+ # Compatibility with Hash, which returns a list of series' data arrays
90
+ # @return [Array<Array>] list of values for each series
91
+ def values
92
+ to_a.transpose
93
+ end
83
94
 
84
95
  # Overloaded comparison for easier testing
85
96
  # @param other [Dataset] what needs comparing to
@@ -94,7 +105,15 @@ class DataModeler::Dataset
94
105
 
95
106
  private
96
107
 
97
- include DataModeler::ConvertingTimeAndIndices # `#time` and `#idx`
108
+ # Resets the indices at the start position -- used for iterations
109
+ # @return [void]
110
+ def reset_iteration
111
+ @target_idx = first_idx
112
+ @input_idxs = init_inputs
113
+ end
114
+
115
+ # `#time` and `#idx` for time/index conversion
116
+ include DataModeler::Dataset::ConvertingTimeAndIndices
98
117
 
99
118
  # Initializes input indices vector
100
119
  # @return [Array<input_idx>]
@@ -28,12 +28,14 @@ class DataModeler::DatasetGen
28
28
  @first_idx = first_idx
29
29
  @train_size = train_size
30
30
  @test_size = test_size
31
- @local_nrun = 1 # used to iterate over nruns with #next
31
+ reset_iteration
32
32
 
33
33
  @nrows = data[:time].size
34
34
  validate_enough_data_for min_nruns
35
35
  end
36
36
 
37
+ ### DATA ACCESS
38
+
37
39
  # Builds training set for the training
38
40
  # @param nrun [Integer] will build different train+test for each run
39
41
  # @return [Dataset]
@@ -56,41 +58,53 @@ class DataModeler::DatasetGen
56
58
  DataModeler::Dataset.new data, ds_args.merge(first_idx: first, end_idx: last)
57
59
  end
58
60
 
61
+ ### ITERATION
62
+
63
+ # TODO: @local_nrun is an ugly name, refactor it!
64
+
59
65
  # Returns the next pair [trainset, testset]
60
66
  # @return [Array<Dataset, Dataset>]
61
67
  def peek
62
68
  [self.train(@local_nrun), self.test(@local_nrun)]
63
69
  end
64
70
 
65
- # TODO: @local_nrun is an ugly hack, refactor it!
66
-
67
71
  # Returns the next pair [trainset, testset] and increments the counter
68
72
  # @return [Array<Dataset, Dataset>]
69
73
  def next
70
74
  peek.tap { @local_nrun += 1 }
71
75
  end
72
76
 
73
- include DataModeler::IteratingBasedOnNext # `#each` and `#to_a` based on `#next`
77
+ # `#each` and `#to_a` based on `#next`
78
+ include DataModeler::Dataset::IteratingBasedOnNext
74
79
 
75
80
  # I want `#to_a` to return an array of arrays rather than an array of dataset
76
81
 
82
+ # Returns an array of datasets
77
83
  # @return [Array<Array[Dataset]>]
78
84
  alias_method :to_ds_a, :to_a
85
+ # Returns an array of arrays (list of inputs-targets pairs)
79
86
  # @return [Array<Array<Array<...>>]
80
87
  def to_a
81
- to_ds_a.collect do |run|
82
- run.collect &:to_a
88
+ to_ds_a.collect do |train_test_for_run|
89
+ train_test_for_run.collect &:to_a
83
90
  end
84
91
  end
85
92
 
86
93
  private
87
94
 
88
- include DataModeler::ConvertingTimeAndIndices # `#time` and `#idx`
95
+ # Resets the index at the start position -- used for iterations
96
+ # @return [void]
97
+ def reset_iteration
98
+ @local_nrun = 1
99
+ end
100
+
101
+ # `#time` and `#idx` for time/index conversion
102
+ include DataModeler::Dataset::ConvertingTimeAndIndices
89
103
 
90
104
  # Find the index of the first element in the data eligible as target for training
91
105
  # @return [Integer] the index of the first eligible target
92
106
  def min_eligible_trg
93
- @min_eligible_trg ||= idx(time(0) +
107
+ @min_eligible_trg ||= idx( time(0) +
94
108
  # minimum time span required as input for the first target
95
109
  ds_args[:look_ahead] + (ds_args[:ntimes]-1) * ds_args[:tspread]
96
110
  )
@@ -1,4 +1,4 @@
1
- module DataModeler
1
+ class DataModeler::Dataset
2
2
  # Converts between time and indices for referencing data lines
3
3
  module ConvertingTimeAndIndices
4
4
  # Returns the time for a given index
@@ -30,11 +30,23 @@ module DataModeler
30
30
  # Yields on each [inputs, targets] pair.
31
31
  # @return [nil, Iterator] `block_given? ? nil : Iterator`
32
32
  def each
33
+ reset_iteration
33
34
  return enum_for(:each) unless block_given?
34
35
  loop { yield self.next }
35
36
  nil
36
37
  end
37
38
 
39
+ # Yields on each [inputs, targets] pair, collecting the input.
40
+ # @return [Array, Iterator] `block_given? ? Array : Iterator`
41
+ def map
42
+ reset_iteration
43
+ return enum_for(:collect) unless block_given?
44
+ [].tap { |ret| loop { ret << yield(self.next) } }
45
+ end
46
+
47
+ # @see #map
48
+ alias_method :collect, :map
49
+
38
50
  # @return [Array]
39
51
  def to_a
40
52
  each.to_a
@@ -0,0 +1,17 @@
1
+
2
+ # Helper functions go here
3
+ module DataModeler
4
+ # Returns a standardized String ID from a (sequentially named) file
5
+ # @return [String]
6
+ # @note convenient method to have available in the config
7
+ def self.id_from_filename filename=__FILE__
8
+ format "%02d", Integer(filename[/_(\d+).rb$/,1])
9
+ end
10
+
11
+ # Returns an instance of the Base class
12
+ # @param config [Hash] Base class configuration
13
+ # @return [Base] initialized instance of Base class
14
+ def self.new config
15
+ DataModeler::Base.new config
16
+ end
17
+ end
@@ -0,0 +1,17 @@
1
+
2
+ # All models for the framework should belong to this module.
3
+ # Also includes a model selector for initialization from config.
4
+ module DataModeler::Model
5
+ # Returns a new Model correctly initialized based on the `type` of choice
6
+ # @param type [Symbol] which type of Model is chosen
7
+ # @param opts [splatted Hash params] the rest of the parameters will be passed
8
+ # to the model for initialization
9
+ # @return [Model] a correctly initialized Model of type `type`
10
+ def self.from_conf type:, **opts
11
+ case type
12
+ when :fann
13
+ FANN.new opts
14
+ else abort "Unrecognized model: #{type}"
15
+ end
16
+ end
17
+ end
@@ -3,19 +3,22 @@ require 'ruby-fann'
3
3
  # Model class based on Fast Artificial Neural Networks (FANN)
4
4
  class DataModeler::Model::FANN
5
5
 
6
- attr_reader :opts, :fann, :algo, :actfn
6
+ attr_reader :fann_opts, :ngens, :fann, :algo, :actfn
7
7
 
8
- # @param netstruct [Array<ninputs, Array<hidden_layers>, noutputs>] network
9
- # structure
8
+ # @param ngens [Integer] number of generations alloted for training
9
+ # @param hidden_layers [Array<Integer>] list of number of hidden neurons
10
+ # per each hidden layer in the network
11
+ # @param ninputs [Integer] number of inputs of the network
12
+ # @param noutputs [Integer] number of outputs of the network
10
13
  # @param algo [:incremental, :batch, :rprop, :quickprop] training algorithm
11
14
  # @param actfn [:sigmoid, ...] activation function
12
- def initialize netstruct, algo: nil, actfn: nil
13
- ninputs, hidden_layers, noutputs = netstruct
14
- @opts = {
15
+ def initialize ngens:, hidden_layers:, ninputs:, noutputs:, algo: nil, actfn: nil
16
+ @fann_opts = {
15
17
  num_inputs: ninputs,
16
18
  hidden_neurons: hidden_layers,
17
19
  num_outputs: noutputs
18
20
  }
21
+ @ngens = ngens
19
22
  @algo = algo
20
23
  @actfn = actfn
21
24
  reset
@@ -24,7 +27,7 @@ class DataModeler::Model::FANN
24
27
  # Resets / initializes the model
25
28
  # @return [void]
26
29
  def reset
27
- @fann = RubyFann::Standard.new opts
30
+ @fann = RubyFann::Standard.new fann_opts
28
31
  fann.set_training_algorithm(algo) if algo
29
32
  if actfn
30
33
  fann.set_activation_function_hidden(actfn)
@@ -34,16 +37,17 @@ class DataModeler::Model::FANN
34
37
  end
35
38
 
36
39
  # Trains the model for ngens on the trainset
37
- # @param ngens [Integer] number of training generations
38
40
  # @param trainset [Hash-like<input: Array, target: Array>] training set
41
+ # @param ngens [Integer] number of training generations
39
42
  # @return [void]
40
- def train ngens, trainset
41
- tset = RubyFann::TrainData.new(
42
- inputs: trainset[:input], desired_outputs: trainset[:target])
43
+ def train trainset, ngens=@ngens, report_interval: 1000, desired_error: 1e-10
44
+ # TODO: optimize maybe?
45
+ inputs, targets = trainset.values
46
+ tset = RubyFann::TrainData.new inputs: inputs, desired_outputs: targets
43
47
  # fann.init_weights tset # test this weights initialization
44
48
 
45
- # params: train_data, max_epochs, reports_interval, desired_error
46
- fann.train_on_data(tset, ngens, 1000, 1e-10)
49
+ # params: train_data, max_epochs, report_interval, desired_error
50
+ fann.train_on_data(tset, ngens, report_interval, desired_error)
47
51
  end
48
52
 
49
53
  # Tests the model on inputs.
@@ -59,6 +63,6 @@ class DataModeler::Model::FANN
59
63
  def save filename
60
64
  # can do filename check here...?
61
65
  # TODO: I'd like to have a kind of `to_s`, and do all the saving in the modeler...
62
- fann.save filename
66
+ fann.save filename.to_s
63
67
  end
64
68
  end
@@ -1,5 +1,5 @@
1
1
  # Main gem module
2
2
  module DataModeler
3
3
  # Version number
4
- VERSION = "0.2.1"
4
+ VERSION = "0.3.0"
5
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: data_modeler
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.1
4
+ version: 0.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Giuseppe Cuccu
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2017-05-13 00:00:00.000000000 Z
11
+ date: 2017-05-16 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: ruby-fann
@@ -156,10 +156,13 @@ files:
156
156
  - bin/setup
157
157
  - data_modeler.gemspec
158
158
  - lib/data_modeler.rb
159
+ - lib/data_modeler/base.rb
159
160
  - lib/data_modeler/dataset/dataset.rb
160
161
  - lib/data_modeler/dataset/dataset_gen.rb
161
162
  - lib/data_modeler/dataset/dataset_helper.rb
162
163
  - lib/data_modeler/exceptions.rb
164
+ - lib/data_modeler/helpers.rb
165
+ - lib/data_modeler/model.rb
163
166
  - lib/data_modeler/model/fann.rb
164
167
  - lib/data_modeler/version.rb
165
168
  homepage: https://github.com/giuse/data_modeler