data_modeler 0.0.0 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 9ab07761ab1c6a95194902cb445c9af4f63f13ad
4
- data.tar.gz: '0812c7257cb9b151790b1c95f7c796e4b94b8bbc'
3
+ metadata.gz: 2b665a6495480b44b7d337d0a1acf6a7ad826262
4
+ data.tar.gz: 85ab09bec546d8bb18b5f3c0ac3f0a48b926cd86
5
5
  SHA512:
6
- metadata.gz: 74fab45b2f6293fd7250ae07dddbe688bcd77dc1c2a258388fd63aa300abf824b850075533a676c6b115424d1615a4665ba2b5ba61cb14ba700f40fe1d80fc13
7
- data.tar.gz: 6b8484b312d6a6fc907d00491f3e9db4a921e68a5d07eeca462c9e428f5fddced203be1c168445b24be2742fbdfddefc728e90d1e0af4d9d514cce2f76e346aa
6
+ metadata.gz: 05d5e05210191c51be3b50fe1db36787d8db0ab6659cdf902d6b41f8d3d86155afb39f714b1a843321a4c39fdd967b3dfc050f0caccebe574301fc281c450e96
7
+ data.tar.gz: 5b565eb79f326a0e169a496d12e9b98a8b432bc23caa67df74ac0d03ebeb7772ead78840fb982e0b3bb8e1ffccb503e13a623a6eee9b335f0ffef125d6286091
data/data_modeler.gemspec CHANGED
@@ -22,10 +22,10 @@ Gem::Specification.new do |spec|
22
22
  spec.require_paths = ["lib"]
23
23
 
24
24
  # Debug
25
- spec.add_development_dependency 'pry', '~> 0'
26
- spec.add_development_dependency 'pry-nav', '~> 0'
27
- spec.add_development_dependency 'pry-stack_explorer', '~> 0'
28
- spec.add_development_dependency 'pry-rescue', '~> 0'
25
+ spec.add_development_dependency 'pry', '~> 0.10'
26
+ spec.add_development_dependency 'pry-nav', '~> 0.2'
27
+ spec.add_development_dependency 'pry-stack_explorer', '~> 0.4'
28
+ spec.add_development_dependency 'pry-rescue', '~> 1.4'
29
29
 
30
30
  # Test
31
31
  spec.add_development_dependency 'bundler', '~> 1.14'
data/lib/data_modeler.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  require "data_modeler/version"
2
-
3
- module DataModeler
4
- # Your code goes here...
5
- end
2
+ require "data_modeler/exceptions"
3
+ require "data_modeler/dataset_helper"
4
+ require "data_modeler/dataset"
5
+ require "data_modeler/dataset_gen"
@@ -0,0 +1,107 @@
1
+
2
# Assembles the model's inputs and targets out of the given data, one target
# row at a time.
# @note verifying that enough data is available (given ntimes, tspread
#   and look_ahead) is left to the caller (typically DatasetGen)
class DataModeler::Dataset

  attr_reader :data, :input_series, :target_series, :first_idx, :end_idx,
              :ntimes, :tspread, :look_ahead, :target_idx, :input_idxs,
              :nrows

  # @param data [Hash-like] the data, in an object that can be
  #   accessed by keys and returns a time series for each key.
  #   It must include (and be sorted by) a series named `time`,
  #   and all series must have the same length.
  # @param inputs [Array] data key accessors for input series
  # @param targets [Array] data key accessors for target series
  # @param first_idx [Integer] index where the dataset starts on data
  # @param end_idx [Integer] index where the dataset ends on data
  # @param ntimes [Integer] number of lines/times/datapoints used to
  #   construct the input
  # @param tspread [Numeric] distance (in `time`!) between the `ntimes`
  #   lines/times/datapoints used to construct the input
  # @param look_ahead [Numeric] distance (in `time`!) between the most
  #   recent line/time/datapoint used for the input and the target,
  #   i.e. how far ahead the model is trained to predict
  # @note Dataset indices are meant to be used with left inclusion and
  #   right exclusion, i.e. targets are taken in the range `[from,to)`
  def initialize data, inputs:, targets:, first_idx:, end_idx:,
                 ntimes:, tspread:, look_ahead:
    @data          = data
    @input_series  = inputs
    @target_series = targets
    @first_idx     = first_idx
    @end_idx       = end_idx
    @ntimes        = ntimes
    @nrows         = data[:time].size
    @tspread       = tspread
    @look_ahead    = look_ahead
    # iteration starts from the first target...
    @target_idx    = first_idx
    # ...and the input pointers depend on everything assigned above
    @input_idxs    = init_inputs
  end

  # TODO: make sure constructor requirements are unnecessary for static models
  # TODO: check if enough data / minimum_target
  # TODO: the check in `#init_target` should go in the `ds_gen`

  # Builds inputs for the model: for every input row pointer, the value of
  # each input series at that row, flattened into a single list.
  # @return [Array]
  def inputs
    input_idxs.flat_map do |row|
      input_series.map { |series| data[series][row] }
    end
  end

  # Builds targets for the model: the value of each target series at the
  # current target row.
  # @return [Array]
  def targets
    target_series.map { |series| data[series][target_idx] }
  end

  # Returns the next pair [inputs, targets] without advancing
  # @return [Array]
  # @raise [StopIteration] once the target pointer has reached `end_idx`
  def peek
    raise StopIteration if target_idx >= end_idx
    [inputs, targets]
  end

  # Returns the next pair [inputs, targets] and moves on to the following target
  # @return [Array]
  def next
    pair = peek
    # only reached when `peek` did not raise
    @target_idx += 1
    @input_idxs = init_inputs
    pair
  end

  include DataModeler::IteratingBasedOnNext # `#each` and `#to_a` based on `#next`

  # Overloaded comparison for easier testing: same class, the very same
  # data object, and equal values for every other instance variable.
  def == other
    return false unless self.class == other.class
    return false unless data.object_id == other.data.object_id
    (instance_variables - [:@data]).all? do |ivar|
      instance_variable_get(ivar) == other.instance_variable_get(ivar)
    end
  end

  private

  include DataModeler::ConvertingTimeAndIndices # `#time` and `#idx`

  # Computes the row indices feeding the inputs for the current target.
  # @return [Array<Integer>, nil] nil when the target pointer is past the end
  def init_inputs
    return unless target_idx < end_idx
    # walk the time offsets from the farthest in the past to the closest,
    # subtract each from the target's time, and resolve the time to a row
    (ntimes - 1).downto(0).map do |n|
      offset = look_ahead + n * tspread
      idx(time(target_idx) - offset)
    end
  end

end
@@ -0,0 +1,105 @@
1
+
2
# Create train and test datasets for the training.
# @note this diagram should help understanding how it works
#   ----------------------------------------> data (time)
#     v- this is the input+look_ahead window for first training target
#   |win|train1|t1|        -> train starts after window, test after training
#          |train2|t2|     -> train starts after window + 1 tset
#             |train3|t3|  -> train starts after window + 2 tset
#   Note how the test sets line up. This allows the testing results plots
#   to be continuous, no model is tested on data on which *itself* has been
#   trained, and all data is used multiple times
class DataModeler::DatasetGen

  attr_reader :data, :ds_args, :first_idx, :train_size, :test_size, :nrows

  # @param data [Hash-like] the data, in an object that can be
  #   accessed by keys and return a time series per each key.
  #   It is required to include and be sorted by a series named `time`,
  #   and for all series to have equal length.
  # @param ds_args [Hash] parameters for the Datasets (symbol keys): inputs,
  #   targets, ntimes, tspread, look_ahead. Check class Dataset for details.
  #   `first_idx` and `end_idx` are overridden for each generated Dataset.
  # @param train_size [Integer] how many points to predict for each training set
  # @param test_size [Integer] how many points to predict for each test set
  # @param min_nruns [Integer] minimum number of train+test runs the data
  #   must be able to provide
  # @raise [NotEnoughDataError] if the data cannot provide `min_nruns` runs
  def initialize data, ds_args:, train_size:, test_size:, min_nruns: 1
    @data = data
    @ds_args = ds_args
    # The generator receives no `first_idx` argument, so the reader has
    # nothing to expose; it is kept (as nil) only for interface compatibility.
    # NOTE(review): decide whether this attribute should be dropped or should
    # report the first eligible target index.
    @first_idx = nil
    @train_size = train_size
    @test_size = test_size
    @local_nrun = 1 # used to iterate over nruns with #next

    # `nrows` must be set before validating: index lookups depend on it
    @nrows = data[:time].size
    validate_enough_data_for min_nruns
  end

  # Builds training set for the training
  # @param nrun [Integer] will build different train+test for each run
  # @return [Dataset]
  # @raise [NoDataLeft] when there's not enough data left for a full train+test
  def train nrun
    first = min_eligible_trg + (nrun - 1) * test_size
    last = first + train_size
    # Make sure there's enough data for both train and test. Indices are
    # right-exclusive, so the test set may end exactly at `nrows`; this also
    # matches the amount of data accepted by `#validate_enough_data_for`.
    raise NoDataLeft unless last + test_size <= nrows
    # `**` passes the options as keywords (required since Ruby 3.0)
    DataModeler::Dataset.new data, **ds_args.merge(first_idx: first, end_idx: last)
  end

  # Builds test set for the training
  # @param nrun [Integer] will build different train+test for each run
  # @return [Dataset]
  # @note we already checked pre-training there's enough data for the test too
  def test nrun
    first = min_eligible_trg + (nrun - 1) * test_size + train_size
    last = first + test_size
    # `**` passes the options as keywords (required since Ruby 3.0)
    DataModeler::Dataset.new data, **ds_args.merge(first_idx: first, end_idx: last)
  end

  # Returns the next pair [trainset, testset]
  # @return [Array<Dataset, Dataset>]
  # @raise [NoDataLeft] when the data cannot provide another full run
  def peek
    [train(@local_nrun), test(@local_nrun)]
  end

  # TODO: @local_nrun is an ugly hack, refactor it!

  # Returns the next pair [trainset, testset] and increments the counter
  # @return [Array<Dataset, Dataset>]
  def next
    peek.tap { @local_nrun += 1 }
  end

  include DataModeler::IteratingBasedOnNext # `#each` and `#to_a` based on `#next`

  # I want `#to_a` to return an array of arrays rather than an array of dataset

  # @return [Array<Array<Dataset>>]
  alias_method :to_ds_a, :to_a

  # @return [Array<Array<Array<...>>>]
  def to_a
    to_ds_a.map do |run|
      run.map(&:to_a)
    end
  end

  private

  include DataModeler::ConvertingTimeAndIndices # `#time` and `#idx`

  # Find the index of the first element in the data eligible as target for training
  # @return [Integer] the index of the first eligible target
  def min_eligible_trg
    @min_eligible_trg ||= idx(time(0) +
      # minimum time span required as input for the first target
      ds_args[:look_ahead] + (ds_args[:ntimes] - 1) * ds_args[:tspread]
    )
  end

  # Check if there is enough data to build `min_nruns` train + test sets
  # @param min_nruns [Integer] number of runs that must be available
  # @raise [NotEnoughDataError] if `not enough minerals` (cit.)
  # @note remember the schema: need to check for `|win|train1|t1|t2|...|tn|`
  def validate_enough_data_for min_nruns
    min_data_size = min_eligible_trg + train_size + min_nruns * test_size
    raise NotEnoughDataError if nrows < min_data_size
  end
end
@@ -0,0 +1,44 @@
1
module DataModeler
  # Converts between time and indices for referencing data lines.
  # The including object must respond to `data` (with `data[:time]` being the
  # time series) and to `nrows`.
  module ConvertingTimeAndIndices
    # Returns the time for a given index
    # @param idx [Integer] row index
    # @return [kind_of_time]
    def time idx
      data[:time][idx]
    end

    # Returns the index for a given time: the row just before the first row
    # whose time is strictly above the requested one.
    # @param time [kind_of_time] the time to look up
    # @return [Integer] row index
    # @raise [DataModeler::Dataset::TimeNotFoundError] if already the first
    #   row lies above the requested time
    def idx time
      # TODO: optimize with `from:`
      # TODO: test corner case when index not found
      # find index of first above time
      first_above = data[:time].index { |t| t > time }
      # if index not found: all data is below time, "first above" is outofbound
      first_above ||= nrows
      # if first above time is 0: there is no element with that time.
      # The error class lives under `Dataset`, hence it must be fully
      # qualified: a bare constant cannot be resolved from this module.
      if first_above.zero?
        raise DataModeler::Dataset::TimeNotFoundError, "Time not found: #{time}"
      end
      # return index of predecessor (last below time)
      first_above - 1
    end
  end

  # Provides each (which can return an `Enumerator`) and `to_a` based on `#next`.
  # The including object must implement `#next`, raising `StopIteration`
  # (or a subclass) when exhausted.
  module IteratingBasedOnNext
    # Yields each value returned by `#next` until it raises `StopIteration`.
    # @return [nil, Enumerator] `block_given? ? nil : Enumerator`
    def each
      return enum_for(:each) unless block_given?
      # `loop` rescues StopIteration, ending the iteration cleanly
      loop { yield self.next }
      nil
    end

    # @return [Array] all the remaining values produced by `#next`
    def to_a
      each.to_a
    end

  end
end
@@ -0,0 +1,12 @@
1
class DataModeler::Dataset
  # Exception: the requested `time` is not present in the data
  class TimeNotFoundError < StandardError
  end
end

class DataModeler::DatasetGen
  # Exception: the `data` is not sufficient for the training setup
  class NotEnoughDataError < StandardError
  end

  # Exception: not enough `data` left to build another train+test
  # @note being subclassed from `StopIteration`, it will break loops
  class NoDataLeft < StopIteration
  end
end
@@ -1,3 +1,5 @@
1
+ # Main gem module
1
2
  module DataModeler
2
- VERSION = "0.0.0"
3
+ # Version number
4
+ VERSION = "0.1.0"
3
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: data_modeler
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.0
4
+ version: 0.1.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Giuseppe Cuccu
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2017-05-10 00:00:00.000000000 Z
11
+ date: 2017-05-12 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: pry
@@ -16,56 +16,56 @@ dependencies:
16
16
  requirements:
17
17
  - - "~>"
18
18
  - !ruby/object:Gem::Version
19
- version: '0'
19
+ version: '0.10'
20
20
  type: :development
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
24
  - - "~>"
25
25
  - !ruby/object:Gem::Version
26
- version: '0'
26
+ version: '0.10'
27
27
  - !ruby/object:Gem::Dependency
28
28
  name: pry-nav
29
29
  requirement: !ruby/object:Gem::Requirement
30
30
  requirements:
31
31
  - - "~>"
32
32
  - !ruby/object:Gem::Version
33
- version: '0'
33
+ version: '0.2'
34
34
  type: :development
35
35
  prerelease: false
36
36
  version_requirements: !ruby/object:Gem::Requirement
37
37
  requirements:
38
38
  - - "~>"
39
39
  - !ruby/object:Gem::Version
40
- version: '0'
40
+ version: '0.2'
41
41
  - !ruby/object:Gem::Dependency
42
42
  name: pry-stack_explorer
43
43
  requirement: !ruby/object:Gem::Requirement
44
44
  requirements:
45
45
  - - "~>"
46
46
  - !ruby/object:Gem::Version
47
- version: '0'
47
+ version: '0.4'
48
48
  type: :development
49
49
  prerelease: false
50
50
  version_requirements: !ruby/object:Gem::Requirement
51
51
  requirements:
52
52
  - - "~>"
53
53
  - !ruby/object:Gem::Version
54
- version: '0'
54
+ version: '0.4'
55
55
  - !ruby/object:Gem::Dependency
56
56
  name: pry-rescue
57
57
  requirement: !ruby/object:Gem::Requirement
58
58
  requirements:
59
59
  - - "~>"
60
60
  - !ruby/object:Gem::Version
61
- version: '0'
61
+ version: '1.4'
62
62
  type: :development
63
63
  prerelease: false
64
64
  version_requirements: !ruby/object:Gem::Requirement
65
65
  requirements:
66
66
  - - "~>"
67
67
  - !ruby/object:Gem::Version
68
- version: '0'
68
+ version: '1.4'
69
69
  - !ruby/object:Gem::Dependency
70
70
  name: bundler
71
71
  requirement: !ruby/object:Gem::Requirement
@@ -128,6 +128,10 @@ files:
128
128
  - bin/setup
129
129
  - data_modeler.gemspec
130
130
  - lib/data_modeler.rb
131
+ - lib/data_modeler/dataset.rb
132
+ - lib/data_modeler/dataset_gen.rb
133
+ - lib/data_modeler/dataset_helper.rb
134
+ - lib/data_modeler/exceptions.rb
131
135
  - lib/data_modeler/version.rb
132
136
  homepage: https://github.com/giuse/data_modeler
133
137
  licenses: