data_modeler 0.0.0 → 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 9ab07761ab1c6a95194902cb445c9af4f63f13ad
4
- data.tar.gz: '0812c7257cb9b151790b1c95f7c796e4b94b8bbc'
3
+ metadata.gz: 2b665a6495480b44b7d337d0a1acf6a7ad826262
4
+ data.tar.gz: 85ab09bec546d8bb18b5f3c0ac3f0a48b926cd86
5
5
  SHA512:
6
- metadata.gz: 74fab45b2f6293fd7250ae07dddbe688bcd77dc1c2a258388fd63aa300abf824b850075533a676c6b115424d1615a4665ba2b5ba61cb14ba700f40fe1d80fc13
7
- data.tar.gz: 6b8484b312d6a6fc907d00491f3e9db4a921e68a5d07eeca462c9e428f5fddced203be1c168445b24be2742fbdfddefc728e90d1e0af4d9d514cce2f76e346aa
6
+ metadata.gz: 05d5e05210191c51be3b50fe1db36787d8db0ab6659cdf902d6b41f8d3d86155afb39f714b1a843321a4c39fdd967b3dfc050f0caccebe574301fc281c450e96
7
+ data.tar.gz: 5b565eb79f326a0e169a496d12e9b98a8b432bc23caa67df74ac0d03ebeb7772ead78840fb982e0b3bb8e1ffccb503e13a623a6eee9b335f0ffef125d6286091
data/data_modeler.gemspec CHANGED
@@ -22,10 +22,10 @@ Gem::Specification.new do |spec|
22
22
  spec.require_paths = ["lib"]
23
23
 
24
24
  # Debug
25
- spec.add_development_dependency 'pry', '~> 0'
26
- spec.add_development_dependency 'pry-nav', '~> 0'
27
- spec.add_development_dependency 'pry-stack_explorer', '~> 0'
28
- spec.add_development_dependency 'pry-rescue', '~> 0'
25
+ spec.add_development_dependency 'pry', '~> 0.10'
26
+ spec.add_development_dependency 'pry-nav', '~> 0.2'
27
+ spec.add_development_dependency 'pry-stack_explorer', '~> 0.4'
28
+ spec.add_development_dependency 'pry-rescue', '~> 1.4'
29
29
 
30
30
  # Test
31
31
  spec.add_development_dependency 'bundler', '~> 1.14'
data/lib/data_modeler.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  require "data_modeler/version"
2
-
3
- module DataModeler
4
- # Your code goes here...
5
- end
2
+ require "data_modeler/exceptions"
3
+ require "data_modeler/dataset_helper"
4
+ require "data_modeler/dataset"
5
+ require "data_modeler/dataset_gen"
@@ -0,0 +1,107 @@
1
+
2
+ # Create complex inputs and targets for the model from the given data
3
+ # @note checks to validate if enough data is present (given ntimes, tspread
4
+ # and look_ahead) should be done on the caller (typically DatasetGen)
5
+ class DataModeler::Dataset
6
+
7
+ attr_reader :data, :input_series, :target_series, :first_idx, :end_idx,
8
+ :ntimes, :tspread, :look_ahead, :target_idx, :input_idxs,
9
+ :nrows
10
+
11
+ # @param data [Hash-like] the data, in an object that can be
12
+ # accessed by keys and return a time series per each key.
13
+ # It is required to include and be sorted by a series named `time`,
14
+ # and for all series to have equal length.
15
+ # @param inputs [Array] data key accessors for input series
16
+ # @param targets [Array] data key accessors for target series
17
+ # @param first_idx [Integer] index where the dataset starts on data
18
+ # @param end_idx [Integer] index where the dataset ends on data
19
+ # @param ntimes [Integer] number of lines/times/datapoints to be
20
+ # used to construct the input
21
+ # @param tspread [Numeric] distance (in `time`!) between the `ntimes`
22
+ # lines/times/datapoints used to construct the input
23
+ # @param look_ahead [Numeric] distance (in `time`!) between the
24
+ # most recent line/time/datapoint used for the input and
25
+ # the target -- i.e., how far ahead the model is trained to predict
26
+ # @note we expect Datasets indices to be used with left inclusion but
27
+ # right exclusion, i.e. targets are considered in the range `[from,to)`
28
+ def initialize data, inputs:, targets:, first_idx:, end_idx:,
29
+ ntimes:, tspread:, look_ahead:
30
+ @data = data
31
+ @input_series = inputs
32
+ @target_series = targets
33
+ @first_idx = first_idx
34
+ @end_idx = end_idx
35
+ @ntimes = ntimes
36
+ @nrows = data[:time].size
37
+ @tspread = tspread
38
+ @look_ahead = look_ahead
39
+ @target_idx = first_idx
40
+ @input_idxs = init_inputs
41
+ end
42
+
43
+ # TODO: make sure constructor requirements are unnecessary for static models
44
+ # TODO: check if enough data / minimum_target
45
+ # TODO: the check in `#init_target` should go in the `ds_gen`
46
+
47
+ # Builds inputs for the model
48
+ # @return [Array]
49
+ def inputs
50
+ input_idxs.flat_map do |idx|
51
+ input_series.collect do |s|
52
+ data[s][idx]
53
+ end
54
+ end
55
+ end
56
+
57
+ # Builds targets for the model
58
+ # @return [Array]
59
+ def targets
60
+ target_series.collect do |s|
61
+ data[s][target_idx]
62
+ end
63
+ end
64
+
65
+ # Returns the next pair [inputs, targets]
66
+ # @return [Array]
67
+ def peek
68
+ raise StopIteration if target_idx >= end_idx
69
+ [inputs, targets]
70
+ end
71
+
72
+ # Returns the next pair [inputs, targets] and increments the target
73
+ # @return [Array]
74
+ def next
75
+ peek.tap do
76
+ @target_idx += 1
77
+ @input_idxs = init_inputs
78
+ end
79
+ end
80
+
81
+ include DataModeler::IteratingBasedOnNext # `#each` and `#to_a` based on `#next`
82
+
83
+ # Overloaded comparison for easier testing
84
+ def == other
85
+ self.class == other.class &&
86
+ data.object_id == other.data.object_id &&
87
+ (instance_variables - [:@data]).all? do |var|
88
+ self.instance_variable_get(var) == other.instance_variable_get(var)
89
+ end
90
+ end
91
+
92
+ private
93
+
94
+ include DataModeler::ConvertingTimeAndIndices # `#time` and `#idx`
95
+
96
+ def init_inputs
97
+ if target_idx < end_idx
98
+ # build list of incremental time buffers
99
+ bufs = ntimes.times.collect { |n| look_ahead + n * tspread }
100
+ # reverse it and subtract from the target's time
101
+ times = bufs.reverse.collect { |s| time(target_idx) - s }
102
+ # now you have the list of times at which each pointer should point
103
+ times.collect &method(:idx)
104
+ end
105
+ end
106
+
107
+ end
@@ -0,0 +1,105 @@
1
+
2
+ # Create train and test datasets for the training.
3
+ # @note: this diagram should help understanding how it works
4
+ # ----------------------------------------> data (time)
5
+ # v- this is the input+look_ahead window for first training target
6
+ # |win|train1|t1| -> train starts after window, test after training
7
+ # |train2|t2| -> train starts after window + 1 tset
8
+ # |train3|t3| -> train starts after window + 2 tset
9
+ # Note how the test sets line up. This allows the testing results plots
10
+ # to be continuous, no model is tested on data on which *itself* has been
11
+ # trained, and all data is used multiple times
12
+ class DataModeler::DatasetGen
13
+
14
+ attr_reader :data, :ds_args, :first_idx, :train_size, :test_size, :nrows
15
+
16
+ # @param data [Hash-like] the data, in an object that can be
17
+ # accessed by keys and return a time series per each key.
18
+ # It is required to include and be sorted by a series named `time`,
19
+ # and for all series to have equal length.
20
+ # @param ds_args [Hash] parameters for the Datasets: inputs, targets,
21
+ # first_idx, end_idx, ntimes. Check class Dataset for details.
22
+ # @train_size: how many points to predict for each training set
23
+ # @test_size: how many points to predict for each test set
24
+ def initialize data, ds_args:, train_size:, test_size:, min_nruns: 1
25
+ @data = data
26
+ @ds_args = ds_args
27
+ @first_idx = first_idx
28
+ @train_size = train_size
29
+ @test_size = test_size
30
+ @local_nrun = 1 # used to iterate over nruns with #next
31
+
32
+ @nrows = data[:time].size
33
+ validate_enough_data_for min_nruns
34
+ end
35
+
36
+ # Builds training set for the training
37
+ # @param nrun [Integer] will build different train+test for each run
38
+ # @return [Dataset]
39
+ # @raise [NoDataLeft] when there's not enough data left for a full train+test
40
+ def train nrun
41
+ first = min_eligible_trg + (nrun-1) * test_size
42
+ last = first + train_size
43
+ # make sure there's enough data for both train and test
44
+ raise NoDataLeft unless last + test_size < nrows
45
+ DataModeler::Dataset.new data, ds_args.merge(first_idx: first, end_idx: last)
46
+ end
47
+
48
+ # Builds test set for the training
49
+ # @param nrun [Integer] will build different train+test for each run
50
+ # @return [Dataset]
51
+ # @note we already checked pre-training there's enough data for the test too
52
+ def test nrun
53
+ first = min_eligible_trg + (nrun-1) * test_size + train_size
54
+ last = first + test_size
55
+ DataModeler::Dataset.new data, ds_args.merge(first_idx: first, end_idx: last)
56
+ end
57
+
58
+ # Returns the next pair [trainset, testset]
59
+ # @return [Array<Dataset, Dataset>]
60
+ def peek
61
+ [self.train(@local_nrun), self.test(@local_nrun)]
62
+ end
63
+
64
+ # TODO: @local_nrun is an ugly hack, refactor it!
65
+
66
+ # Returns the next pair [trainset, testset] and increments the counter
67
+ # @return [Array<Dataset, Dataset>]
68
+ def next
69
+ peek.tap { @local_nrun += 1 }
70
+ end
71
+
72
+ include DataModeler::IteratingBasedOnNext # `#each` and `#to_a` based on `#next`
73
+
74
+ # I want `#to_a` to return an array of arrays rather than an array of dataset
75
+
76
+ # @return [Array<Array[Dataset]>]
77
+ alias_method :to_ds_a, :to_a
78
+ # @return [Array<Array<Array<...>>]
79
+ def to_a
80
+ to_ds_a.collect do |run|
81
+ run.collect &:to_a
82
+ end
83
+ end
84
+
85
+ private
86
+
87
+ include DataModeler::ConvertingTimeAndIndices # `#time` and `#idx`
88
+
89
+ # Find the index of the first element in the data eligible as target for training
90
+ # @return [Integer] the index of the first eligible target
91
+ def min_eligible_trg
92
+ @min_eligible_trg ||= idx(time(0) +
93
+ # minimum time span required as input for the first target
94
+ ds_args[:look_ahead] + (ds_args[:ntimes]-1) * ds_args[:tspread]
95
+ )
96
+ end
97
+
98
+ # Check if there is enough data to build `min_nruns` train + test sets
99
+ # @raise [NotEnoughDataError] if `not enough minerals` (cit.)
100
+ # @note remember the schema: need to check for `|win|train1|t1|t2|...|tn|`
101
+ def validate_enough_data_for min_nruns
102
+ min_data_size = min_eligible_trg + train_size + min_nruns * test_size
103
+ raise NotEnoughDataError if nrows < min_data_size
104
+ end
105
+ end
@@ -0,0 +1,44 @@
1
+ module DataModeler
2
+ # Converts between time and indices for referencing data lines
3
+ module ConvertingTimeAndIndices
4
+ # Returns the time for a given index
5
+ # @param [Integer] idx row index
6
+ # @return [kind_of_time]
7
+ def time idx
8
+ data[:time][idx]
9
+ end
10
+
11
+ # Returns the index for a given time
12
+ # @param [time] time
13
+ # @return [Integer] row index
14
+ def idx time
15
+ # TODO: optimize with `from:`
16
+ # TODO: test corner case when index not found
17
+ # find index of first above time
18
+ idx = data[:time].index { |t| t > time }
19
+ # if index not found: all data is below time, "first above" is outofbound
20
+ idx ||= nrows
21
+ # if first above time is 0: there is no element with that time
22
+ raise TimeNotFoundError, "Time not found: #{time}" if idx.zero?
23
+ # return index of predecessor (last below time)
24
+ idx-1
25
+ end
26
+ end
27
+
28
+ # Provides each (which can return an `Iterator`) and `to_a` based on `#next`
29
+ module IteratingBasedOnNext
30
+ # Yields on each [inputs, targets] pair.
31
+ # @return [nil, Iterator] `block_given? ? nil : Iterator`
32
+ def each
33
+ return enum_for(:each) unless block_given?
34
+ loop { yield self.next }
35
+ nil
36
+ end
37
+
38
+ # @return [Array]
39
+ def to_a
40
+ each.to_a
41
+ end
42
+
43
+ end
44
+ end
@@ -0,0 +1,12 @@
1
+ class DataModeler::Dataset
2
+ # Exception: the requested `time` is not present in the data
3
+ class TimeNotFoundError < StandardError; end
4
+ end
5
+
6
+ class DataModeler::DatasetGen
7
+ # Exception: the `data` is not sufficient for the training setup
8
+ class NotEnoughDataError < StandardError; end
9
+ # Exception: not enough `data` left to build another train+test
10
+ # @note being subclassed from `StopIteration`, it will break loops
11
+ class NoDataLeft < StopIteration; end
12
+ end
@@ -1,3 +1,5 @@
1
+ # Main gem module
1
2
  module DataModeler
2
- VERSION = "0.0.0"
3
+ # Version number
4
+ VERSION = "0.1.0"
3
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: data_modeler
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.0
4
+ version: 0.1.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Giuseppe Cuccu
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2017-05-10 00:00:00.000000000 Z
11
+ date: 2017-05-12 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: pry
@@ -16,56 +16,56 @@ dependencies:
16
16
  requirements:
17
17
  - - "~>"
18
18
  - !ruby/object:Gem::Version
19
- version: '0'
19
+ version: '0.10'
20
20
  type: :development
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
24
  - - "~>"
25
25
  - !ruby/object:Gem::Version
26
- version: '0'
26
+ version: '0.10'
27
27
  - !ruby/object:Gem::Dependency
28
28
  name: pry-nav
29
29
  requirement: !ruby/object:Gem::Requirement
30
30
  requirements:
31
31
  - - "~>"
32
32
  - !ruby/object:Gem::Version
33
- version: '0'
33
+ version: '0.2'
34
34
  type: :development
35
35
  prerelease: false
36
36
  version_requirements: !ruby/object:Gem::Requirement
37
37
  requirements:
38
38
  - - "~>"
39
39
  - !ruby/object:Gem::Version
40
- version: '0'
40
+ version: '0.2'
41
41
  - !ruby/object:Gem::Dependency
42
42
  name: pry-stack_explorer
43
43
  requirement: !ruby/object:Gem::Requirement
44
44
  requirements:
45
45
  - - "~>"
46
46
  - !ruby/object:Gem::Version
47
- version: '0'
47
+ version: '0.4'
48
48
  type: :development
49
49
  prerelease: false
50
50
  version_requirements: !ruby/object:Gem::Requirement
51
51
  requirements:
52
52
  - - "~>"
53
53
  - !ruby/object:Gem::Version
54
- version: '0'
54
+ version: '0.4'
55
55
  - !ruby/object:Gem::Dependency
56
56
  name: pry-rescue
57
57
  requirement: !ruby/object:Gem::Requirement
58
58
  requirements:
59
59
  - - "~>"
60
60
  - !ruby/object:Gem::Version
61
- version: '0'
61
+ version: '1.4'
62
62
  type: :development
63
63
  prerelease: false
64
64
  version_requirements: !ruby/object:Gem::Requirement
65
65
  requirements:
66
66
  - - "~>"
67
67
  - !ruby/object:Gem::Version
68
- version: '0'
68
+ version: '1.4'
69
69
  - !ruby/object:Gem::Dependency
70
70
  name: bundler
71
71
  requirement: !ruby/object:Gem::Requirement
@@ -128,6 +128,10 @@ files:
128
128
  - bin/setup
129
129
  - data_modeler.gemspec
130
130
  - lib/data_modeler.rb
131
+ - lib/data_modeler/dataset.rb
132
+ - lib/data_modeler/dataset_gen.rb
133
+ - lib/data_modeler/dataset_helper.rb
134
+ - lib/data_modeler/exceptions.rb
131
135
  - lib/data_modeler/version.rb
132
136
  homepage: https://github.com/giuse/data_modeler
133
137
  licenses: