data_modeler 0.0.0 → 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/data_modeler.gemspec +4 -4
- data/lib/data_modeler.rb +4 -4
- data/lib/data_modeler/dataset.rb +107 -0
- data/lib/data_modeler/dataset_gen.rb +105 -0
- data/lib/data_modeler/dataset_helper.rb +44 -0
- data/lib/data_modeler/exceptions.rb +12 -0
- data/lib/data_modeler/version.rb +3 -1
- metadata +14 -10
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 2b665a6495480b44b7d337d0a1acf6a7ad826262
|
4
|
+
data.tar.gz: 85ab09bec546d8bb18b5f3c0ac3f0a48b926cd86
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 05d5e05210191c51be3b50fe1db36787d8db0ab6659cdf902d6b41f8d3d86155afb39f714b1a843321a4c39fdd967b3dfc050f0caccebe574301fc281c450e96
|
7
|
+
data.tar.gz: 5b565eb79f326a0e169a496d12e9b98a8b432bc23caa67df74ac0d03ebeb7772ead78840fb982e0b3bb8e1ffccb503e13a623a6eee9b335f0ffef125d6286091
|
data/data_modeler.gemspec
CHANGED
@@ -22,10 +22,10 @@ Gem::Specification.new do |spec|
|
|
22
22
|
spec.require_paths = ["lib"]
|
23
23
|
|
24
24
|
# Debug
|
25
|
-
spec.add_development_dependency 'pry', '~> 0'
|
26
|
-
spec.add_development_dependency 'pry-nav', '~> 0'
|
27
|
-
spec.add_development_dependency 'pry-stack_explorer', '~> 0'
|
28
|
-
spec.add_development_dependency 'pry-rescue', '~>
|
25
|
+
spec.add_development_dependency 'pry', '~> 0.10'
|
26
|
+
spec.add_development_dependency 'pry-nav', '~> 0.2'
|
27
|
+
spec.add_development_dependency 'pry-stack_explorer', '~> 0.4'
|
28
|
+
spec.add_development_dependency 'pry-rescue', '~> 1.4'
|
29
29
|
|
30
30
|
# Test
|
31
31
|
spec.add_development_dependency 'bundler', '~> 1.14'
|
data/lib/data_modeler.rb
CHANGED
@@ -1,5 +1,5 @@
|
|
1
1
|
require "data_modeler/version"
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
2
|
+
require "data_modeler/exceptions"
|
3
|
+
require "data_modeler/dataset_helper"
|
4
|
+
require "data_modeler/dataset"
|
5
|
+
require "data_modeler/dataset_gen"
|
@@ -0,0 +1,107 @@
|
|
1
|
+
|
2
|
+
# Create complex inputs and targets for the model from the given data
|
3
|
+
# @note checks to validate if enough data is present (given ntimes, tspread
|
4
|
+
# and look_ahead) should be done on the caller (typically DatasetGen)
|
5
|
+
class DataModeler::Dataset
  include DataModeler::IteratingBasedOnNext # `#each` and `#to_a` based on `#next`
  include DataModeler::ConvertingTimeAndIndices # `#time` and `#idx`

  attr_reader :data, :input_series, :target_series, :first_idx, :end_idx,
              :ntimes, :tspread, :look_ahead, :target_idx, :input_idxs,
              :nrows

  # Sets up a cursor (`target_idx`) at `first_idx` and precomputes the row
  # indices used to build the inputs for that first target.
  #
  # @param data [Hash-like] object accessed by key, returning one time
  #   series per key. It must include a series named `time`, be sorted by
  #   it, and all series must have the same length.
  # @param inputs [Array] keys of the series used as model inputs
  # @param targets [Array] keys of the series used as model targets
  # @param first_idx [Integer] row index of the first target (inclusive)
  # @param end_idx [Integer] row index where targets stop (exclusive)
  # @param ntimes [Integer] how many rows are sampled to build one input
  # @param tspread [Numeric] distance (in `time` units, not rows) between
  #   consecutive sampled rows
  # @param look_ahead [Numeric] distance (in `time` units) between the most
  #   recent sampled row and the target, i.e. the prediction horizon
  # @note targets are taken from the half-open range `[first_idx, end_idx)`
  def initialize(data, inputs:, targets:, first_idx:, end_idx:,
                 ntimes:, tspread:, look_ahead:)
    @data = data
    @input_series = inputs
    @target_series = targets
    @first_idx = first_idx
    @end_idx = end_idx
    @ntimes = ntimes
    @nrows = data[:time].size
    @tspread = tspread
    @look_ahead = look_ahead
    @target_idx = first_idx
    @input_idxs = init_inputs
  end

  # TODO: make sure constructor requirements are unnecessary for static models
  # TODO: check if enough data / minimum_target
  # TODO: the check in `#init_target` should go in the `ds_gen`

  # Flat list of input values for the current target: for every sampled row
  # (oldest first), one value per input series.
  # @return [Array]
  def inputs
    input_idxs.flat_map do |row|
      input_series.map { |series| data[series][row] }
    end
  end

  # Values of every target series at the current target row.
  # @return [Array]
  def targets
    target_series.map { |series| data[series][target_idx] }
  end

  # Current `[inputs, targets]` pair, without advancing the cursor.
  # @return [Array]
  # @raise [StopIteration] once the cursor has reached `end_idx`
  def peek
    raise StopIteration if target_idx >= end_idx
    [inputs, targets]
  end

  # Current `[inputs, targets]` pair; afterwards the cursor moves to the
  # next row and the input row indices are recomputed.
  # @return [Array]
  # @raise [StopIteration] once the cursor has reached `end_idx`
  def next
    pair = peek
    @target_idx += 1
    @input_idxs = init_inputs
    pair
  end

  # Equality for easier testing: same class, the very same `data` object,
  # and equal values for every other instance variable.
  # @param other [Object]
  # @return [Boolean]
  def ==(other)
    return false unless self.class == other.class
    return false unless data.object_id == other.data.object_id

    state = instance_variables - [:@data]
    state.all? do |ivar|
      instance_variable_get(ivar) == other.instance_variable_get(ivar)
    end
  end

  private

  # Row indices to sample for the current target, oldest first.
  # @return [Array<Integer>, nil] nil once the cursor has reached `end_idx`
  def init_inputs
    return unless target_idx < end_idx

    # time distances from the target: look_ahead, look_ahead + tspread, ...
    offsets = Array.new(ntimes) { |n| look_ahead + n * tspread }
    # walk them from the farthest to the closest and map each resulting
    # point in time to the row it falls on
    offsets.reverse_each.map { |offset| idx(time(target_idx) - offset) }
  end
end
|
@@ -0,0 +1,105 @@
|
|
1
|
+
|
2
|
+
# Create train and test datasets for the training.
|
3
|
+
# @note: this diagram should help understanding how it works
|
4
|
+
# ----------------------------------------> data (time)
|
5
|
+
# v- this is the input+look_ahead window for first training target
|
6
|
+
# |win|train1|t1| -> train starts after window, test after training
|
7
|
+
# |train2|t2| -> train starts after window + 1 tset
|
8
|
+
# |train3|t3| -> train starts after window + 2 tset
|
9
|
+
# Note how the test sets line up. This allows the testing results plots
|
10
|
+
# to be continuous, no model is tested on data on which *itself* has been
|
11
|
+
# trained, and all data is used multiple times
|
12
|
+
class DataModeler::DatasetGen

  attr_reader :data, :ds_args, :first_idx, :train_size, :test_size, :nrows

  # @param data [Hash-like] the data, in an object that can be
  #   accessed by keys and return a time series per each key.
  #   It is required to include and be sorted by a series named `time`,
  #   and for all series to have equal length.
  # @param ds_args [Hash] keyword arguments forwarded to every Dataset:
  #   `inputs`, `targets`, `ntimes`, `tspread`, `look_ahead` (symbol keys).
  #   `first_idx` and `end_idx` are computed here for each run and override
  #   any value given. Check class Dataset for details.
  # @param train_size [Integer] how many points to predict for each training set
  # @param test_size [Integer] how many points to predict for each test set
  # @param min_nruns [Integer] minimum number of train+test runs the data
  #   must be able to provide
  # @raise [NotEnoughDataError] if the data cannot provide `min_nruns` runs
  def initialize data, ds_args:, train_size:, test_size:, min_nruns: 1
    @data = data
    @ds_args = ds_args
    # There is no `first_idx` argument: the former `@first_idx = first_idx`
    # read the attribute reader and therefore always stored nil. Kept as an
    # explicit nil so the reader returns what it always did.
    @first_idx = nil
    @train_size = train_size
    @test_size = test_size
    @local_nrun = 1 # used to iterate over nruns with #next

    @nrows = data[:time].size
    validate_enough_data_for min_nruns
  end

  # Builds training set for the training
  # @param nrun [Integer] will build different train+test for each run
  # @return [Dataset]
  # @raise [NoDataLeft] when there's not enough data left for a full train+test
  def train nrun
    first = min_eligible_trg + (nrun-1) * test_size
    last = first + train_size
    # Make sure there's enough data for both train and test. The test set
    # covers `[last, last + test_size)`, right-exclusive, so it may end
    # exactly at `nrows` -- same boundary as `#validate_enough_data_for`.
    raise NoDataLeft unless last + test_size <= nrows
    build_dataset first, last
  end

  # Builds test set for the training
  # @param nrun [Integer] will build different train+test for each run
  # @return [Dataset]
  # @note we already checked pre-training there's enough data for the test too
  def test nrun
    first = min_eligible_trg + (nrun-1) * test_size + train_size
    last = first + test_size
    build_dataset first, last
  end

  # Returns the next pair [trainset, testset]
  # @return [Array<Dataset, Dataset>]
  # @raise [NoDataLeft] when there's not enough data left for another run
  def peek
    [self.train(@local_nrun), self.test(@local_nrun)]
  end

  # TODO: @local_nrun is an ugly hack, refactor it!

  # Returns the next pair [trainset, testset] and increments the counter
  # @return [Array<Dataset, Dataset>]
  # @raise [NoDataLeft] when there's not enough data left for another run
  def next
    peek.tap { @local_nrun += 1 }
  end

  include DataModeler::IteratingBasedOnNext # `#each` and `#to_a` based on `#next`

  # I want `#to_a` to return an array of arrays rather than an array of dataset

  # @return [Array<Array<Dataset>>]
  alias_method :to_ds_a, :to_a
  # @return [Array<Array<Array>>] each run's datasets expanded with their own `#to_a`
  def to_a
    to_ds_a.collect do |run|
      run.collect(&:to_a)
    end
  end

  private

  include DataModeler::ConvertingTimeAndIndices # `#time` and `#idx`

  # Builds a Dataset over the target range `[first, last)`.
  # `ds_args` is splatted explicitly: since Ruby 3.0 a positional Hash is
  # no longer converted into keyword arguments.
  # @param first [Integer] index of the first target (inclusive)
  # @param last [Integer] index where the targets end (exclusive)
  # @return [Dataset]
  def build_dataset first, last
    DataModeler::Dataset.new(data, **ds_args.merge(first_idx: first, end_idx: last))
  end

  # Find the index of the first element in the data eligible as target for training
  # @return [Integer] the index of the first eligible target
  def min_eligible_trg
    @min_eligible_trg ||= idx(time(0) +
      # minimum time span required as input for the first target
      ds_args[:look_ahead] + (ds_args[:ntimes]-1) * ds_args[:tspread]
    )
  end

  # Check if there is enough data to build `min_nruns` train + test sets
  # @param min_nruns [Integer] number of runs that must be available
  # @raise [NotEnoughDataError] if `not enough minerals` (cit.)
  # @note remember the schema: need to check for `|win|train1|t1|t2|...|tn|`
  def validate_enough_data_for min_nruns
    min_data_size = min_eligible_trg + train_size + min_nruns * test_size
    raise NotEnoughDataError if nrows < min_data_size
  end
end
|
@@ -0,0 +1,44 @@
|
|
1
|
+
module DataModeler
  # Converts between time and indices for referencing data lines.
  # The including object must respond to `data` (with a `:time` series
  # sorted in ascending order) and to `nrows` (length of that series).
  module ConvertingTimeAndIndices
    # Returns the time for a given index
    # @param [Integer] idx row index
    # @return [kind_of_time]
    def time idx
      data[:time][idx]
    end

    # Returns the index of the last row whose time is not above the given time
    # @param [time] time
    # @return [Integer] row index
    # @raise [DataModeler::Dataset::TimeNotFoundError] if the given time
    #   precedes every row in the data
    def idx time
      # TODO: optimize with `from:`
      # TODO: test corner case when index not found
      # find index of first above time; if there is none, all data is below
      # time and "first above" is out of bound
      first_above = data[:time].index { |t| t > time } || nrows
      # if first above time is 0: there is no element with that time.
      # The constant is fully qualified on purpose: it is defined inside
      # `DataModeler::Dataset`, which is not part of this module's constant
      # lookup path, so the bare name would raise NameError here.
      if first_above.zero?
        raise DataModeler::Dataset::TimeNotFoundError, "Time not found: #{time}"
      end
      # return index of predecessor (last below time)
      first_above - 1
    end
  end

  # Provides each (which can return an `Iterator`) and `to_a` based on `#next`.
  # The including object must implement `#next` and signal exhaustion by
  # raising `StopIteration` (or a subclass), which `Kernel#loop` rescues.
  module IteratingBasedOnNext
    # Yields every element returned by `#next` until it raises `StopIteration`.
    # @return [nil, Iterator] `block_given? ? nil : Iterator`
    def each
      return enum_for(:each) unless block_given?
      loop { yield self.next }
      nil
    end

    # Collects everything `#each` yields.
    # @return [Array]
    def to_a
      each.to_a
    end

  end
end
|
@@ -0,0 +1,12 @@
|
|
1
|
+
class DataModeler::Dataset
  # Raised when a requested `time` cannot be located in the data.
  class TimeNotFoundError < StandardError
  end
end
|
5
|
+
|
6
|
+
class DataModeler::DatasetGen
  # Raised when the `data` is too short for the requested training setup.
  class NotEnoughDataError < StandardError
  end

  # Raised when the remaining `data` cannot provide another train+test pair.
  # @note it inherits from `StopIteration`, so `Kernel#loop` treats it as
  #   the end of the iteration and exits quietly
  class NoDataLeft < StopIteration
  end
end
|
data/lib/data_modeler/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: data_modeler
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.0
|
4
|
+
version: 0.1.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Giuseppe Cuccu
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2017-05-
|
11
|
+
date: 2017-05-12 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: pry
|
@@ -16,56 +16,56 @@ dependencies:
|
|
16
16
|
requirements:
|
17
17
|
- - "~>"
|
18
18
|
- !ruby/object:Gem::Version
|
19
|
-
version: '0'
|
19
|
+
version: '0.10'
|
20
20
|
type: :development
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
24
|
- - "~>"
|
25
25
|
- !ruby/object:Gem::Version
|
26
|
-
version: '0'
|
26
|
+
version: '0.10'
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
28
|
name: pry-nav
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
30
30
|
requirements:
|
31
31
|
- - "~>"
|
32
32
|
- !ruby/object:Gem::Version
|
33
|
-
version: '0'
|
33
|
+
version: '0.2'
|
34
34
|
type: :development
|
35
35
|
prerelease: false
|
36
36
|
version_requirements: !ruby/object:Gem::Requirement
|
37
37
|
requirements:
|
38
38
|
- - "~>"
|
39
39
|
- !ruby/object:Gem::Version
|
40
|
-
version: '0'
|
40
|
+
version: '0.2'
|
41
41
|
- !ruby/object:Gem::Dependency
|
42
42
|
name: pry-stack_explorer
|
43
43
|
requirement: !ruby/object:Gem::Requirement
|
44
44
|
requirements:
|
45
45
|
- - "~>"
|
46
46
|
- !ruby/object:Gem::Version
|
47
|
-
version: '0'
|
47
|
+
version: '0.4'
|
48
48
|
type: :development
|
49
49
|
prerelease: false
|
50
50
|
version_requirements: !ruby/object:Gem::Requirement
|
51
51
|
requirements:
|
52
52
|
- - "~>"
|
53
53
|
- !ruby/object:Gem::Version
|
54
|
-
version: '0'
|
54
|
+
version: '0.4'
|
55
55
|
- !ruby/object:Gem::Dependency
|
56
56
|
name: pry-rescue
|
57
57
|
requirement: !ruby/object:Gem::Requirement
|
58
58
|
requirements:
|
59
59
|
- - "~>"
|
60
60
|
- !ruby/object:Gem::Version
|
61
|
-
version: '
|
61
|
+
version: '1.4'
|
62
62
|
type: :development
|
63
63
|
prerelease: false
|
64
64
|
version_requirements: !ruby/object:Gem::Requirement
|
65
65
|
requirements:
|
66
66
|
- - "~>"
|
67
67
|
- !ruby/object:Gem::Version
|
68
|
-
version: '
|
68
|
+
version: '1.4'
|
69
69
|
- !ruby/object:Gem::Dependency
|
70
70
|
name: bundler
|
71
71
|
requirement: !ruby/object:Gem::Requirement
|
@@ -128,6 +128,10 @@ files:
|
|
128
128
|
- bin/setup
|
129
129
|
- data_modeler.gemspec
|
130
130
|
- lib/data_modeler.rb
|
131
|
+
- lib/data_modeler/dataset.rb
|
132
|
+
- lib/data_modeler/dataset_gen.rb
|
133
|
+
- lib/data_modeler/dataset_helper.rb
|
134
|
+
- lib/data_modeler/exceptions.rb
|
131
135
|
- lib/data_modeler/version.rb
|
132
136
|
homepage: https://github.com/giuse/data_modeler
|
133
137
|
licenses:
|