data_modeler 0.0.0 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/data_modeler.gemspec +4 -4
- data/lib/data_modeler.rb +4 -4
- data/lib/data_modeler/dataset.rb +107 -0
- data/lib/data_modeler/dataset_gen.rb +105 -0
- data/lib/data_modeler/dataset_helper.rb +44 -0
- data/lib/data_modeler/exceptions.rb +12 -0
- data/lib/data_modeler/version.rb +3 -1
- metadata +14 -10
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 2b665a6495480b44b7d337d0a1acf6a7ad826262
+  data.tar.gz: 85ab09bec546d8bb18b5f3c0ac3f0a48b926cd86
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 05d5e05210191c51be3b50fe1db36787d8db0ab6659cdf902d6b41f8d3d86155afb39f714b1a843321a4c39fdd967b3dfc050f0caccebe574301fc281c450e96
+  data.tar.gz: 5b565eb79f326a0e169a496d12e9b98a8b432bc23caa67df74ac0d03ebeb7772ead78840fb982e0b3bb8e1ffccb503e13a623a6eee9b335f0ffef125d6286091
data/data_modeler.gemspec
CHANGED
@@ -22,10 +22,10 @@ Gem::Specification.new do |spec|
   spec.require_paths = ["lib"]

   # Debug
-  spec.add_development_dependency 'pry', '~> 0'
-  spec.add_development_dependency 'pry-nav', '~> 0'
-  spec.add_development_dependency 'pry-stack_explorer', '~> 0'
-  spec.add_development_dependency 'pry-rescue', '~>
+  spec.add_development_dependency 'pry', '~> 0.10'
+  spec.add_development_dependency 'pry-nav', '~> 0.2'
+  spec.add_development_dependency 'pry-stack_explorer', '~> 0.4'
+  spec.add_development_dependency 'pry-rescue', '~> 1.4'

   # Test
   spec.add_development_dependency 'bundler', '~> 1.14'
data/lib/data_modeler.rb
CHANGED
@@ -1,5 +1,5 @@
 require "data_modeler/version"
-
-
-
-
+require "data_modeler/exceptions"
+require "data_modeler/dataset_helper"
+require "data_modeler/dataset"
+require "data_modeler/dataset_gen"
data/lib/data_modeler/dataset.rb
ADDED
@@ -0,0 +1,107 @@
+
+# Create complex inputs and targets for the model from the given data
+# @note checks to validate if enough data is present (given ntimes, tspread
+#   and look_ahead) should be done on the caller (typically DatasetGen)
+class DataModeler::Dataset
+
+  attr_reader :data, :input_series, :target_series, :first_idx, :end_idx,
+    :ntimes, :tspread, :look_ahead, :target_idx, :input_idxs,
+    :nrows
+
+  # @param data [Hash-like] the data, in an object that can be
+  #   accessed by keys and return a time series per each key.
+  #   It is required to include and be sorted by a series named `time`,
+  #   and for all series to have equal length.
+  # @param inputs [Array] data key accessors for input series
+  # @param targets [Array] data key accessors for target series
+  # @param first_idx [Integer] index where the dataset starts on data
+  # @param end_idx [Integer] index where the dataset ends on data
+  # @param ntimes [Integer] number of lines/times/datapoints to be
+  #   used to construct the input
+  # @param tspread [Numeric] distance (in `time`!) between the `ntimes`
+  #   lines/times/datapoints used to construct the input
+  # @param look_ahead [Numeric] distance (in `time`!) between the
+  #   most recent line/time/datapoint used for the input and
+  #   the target -- i.e., how far ahead the model is trained to predict
+  # @note we expect Datasets indices to be used with left inclusion but
+  #   right exclusion, i.e. targets are considered in the range `[from,to)`
+  def initialize data, inputs:, targets:, first_idx:, end_idx:,
+      ntimes:, tspread:, look_ahead:
+    @data = data
+    @input_series = inputs
+    @target_series = targets
+    @first_idx = first_idx
+    @end_idx = end_idx
+    @ntimes = ntimes
+    @nrows = data[:time].size
+    @tspread = tspread
+    @look_ahead = look_ahead
+    @target_idx = first_idx
+    @input_idxs = init_inputs
+  end
+
+  # TODO: make sure constructor requirements are unnecessary for static models
+  # TODO: check if enough data / minimum_target
+  # TODO: the check in `#init_target` should go in the `ds_gen`
+
+  # Builds inputs for the model
+  # @return [Array]
+  def inputs
+    input_idxs.flat_map do |idx|
+      input_series.collect do |s|
+        data[s][idx]
+      end
+    end
+  end
+
+  # Builds targets for the model
+  # @return [Array]
+  def targets
+    target_series.collect do |s|
+      data[s][target_idx]
+    end
+  end
+
+  # Returns the next pair [inputs, targets]
+  # @return [Array]
+  def peek
+    raise StopIteration if target_idx >= end_idx
+    [inputs, targets]
+  end
+
+  # Returns the next pair [inputs, targets] and increments the target
+  # @return [Array]
+  def next
+    peek.tap do
+      @target_idx += 1
+      @input_idxs = init_inputs
+    end
+  end
+
+  include DataModeler::IteratingBasedOnNext # `#each` and `#to_a` based on `#next`
+
+  # Overloaded comparison for easier testing
+  def == other
+    self.class == other.class &&
+      data.object_id == other.data.object_id &&
+      (instance_variables - [:@data]).all? do |var|
+        self.instance_variable_get(var) == other.instance_variable_get(var)
+      end
+  end
+
+  private
+
+  include DataModeler::ConvertingTimeAndIndices # `#time` and `#idx`
+
+  def init_inputs
+    if target_idx < end_idx
+      # build list of incremental time buffers
+      bufs = ntimes.times.collect { |n| look_ahead + n * tspread }
+      # reverse it and subtract from the target's time
+      times = bufs.reverse.collect { |s| time(target_idx) - s }
+      # now you have the list of times at which each pointer should point
+      times.collect &method(:idx)
+    end
+  end
+
+end
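For orientation, here is a minimal usage sketch of the new Dataset class. The toy data and parameter values below are illustrative only and are not part of the diff:

require 'data_modeler'

# Toy series: `time` must be sorted and all series equally long.
data = {
  time: [1, 2, 3, 4, 5],
  a:    [10, 20, 30, 40, 50],  # input series
  b:    [ 1,  2,  3,  4,  5]   # target series
}

# Targets are drawn from rows [2,4); each input is built from ntimes=2 rows,
# spaced tspread=1 apart in time and ending look_ahead=1 before the target.
ds = DataModeler::Dataset.new data,
  inputs: [:a], targets: [:b],
  first_idx: 2, end_idx: 4,
  ntimes: 2, tspread: 1, look_ahead: 1

ds.to_a  # => [[[10, 20], [3]], [[20, 30], [4]]]  -- pairs of [inputs, targets]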
data/lib/data_modeler/dataset_gen.rb
ADDED
@@ -0,0 +1,105 @@
+
+# Create train and test datasets for the training.
+# @note: this diagram should help understanding how it works
+# ----------------------------------------> data (time)
+#  v- this is the input+look_ahead window for first training target
+# |win|train1|t1| -> train starts after window, test after training
+#        |train2|t2| -> train starts after window + 1 tset
+#           |train3|t3| -> train starts after window + 2 tset
+# Note how the test sets line up. This allows the testing results plots
+# to be continuous, no model is tested on data on which *itself* has been
+# trained, and all data is used multiple times
+class DataModeler::DatasetGen
+
+  attr_reader :data, :ds_args, :first_idx, :train_size, :test_size, :nrows
+
+  # @param data [Hash-like] the data, in an object that can be
+  #   accessed by keys and return a time series per each key.
+  #   It is required to include and be sorted by a series named `time`,
+  #   and for all series to have equal length.
+  # @param ds_args [Hash] parameters for the Datasets: inputs, targets,
+  #   first_idx, end_idx, ntimes. Check class Dataset for details.
+  # @train_size: how many points to predict for each training set
+  # @test_size: how many points to predict for each test set
+  def initialize data, ds_args:, train_size:, test_size:, min_nruns: 1
+    @data = data
+    @ds_args = ds_args
+    @first_idx = first_idx
+    @train_size = train_size
+    @test_size = test_size
+    @local_nrun = 1 # used to iterate over nruns with #next
+
+    @nrows = data[:time].size
+    validate_enough_data_for min_nruns
+  end
+
+  # Builds training set for the training
+  # @param nrun [Integer] will build different train+test for each run
+  # @return [Dataset]
+  # @raise [NoDataLeft] when there's not enough data left for a full train+test
+  def train nrun
+    first = min_eligible_trg + (nrun-1) * test_size
+    last = first + train_size
+    # make sure there's enough data for both train and test
+    raise NoDataLeft unless last + test_size < nrows
+    DataModeler::Dataset.new data, ds_args.merge(first_idx: first, end_idx: last)
+  end
+
+  # Builds test set for the training
+  # @param nrun [Integer] will build different train+test for each run
+  # @return [Dataset]
+  # @note we already checked pre-training there's enough data for the test too
+  def test nrun
+    first = min_eligible_trg + (nrun-1) * test_size + train_size
+    last = first + test_size
+    DataModeler::Dataset.new data, ds_args.merge(first_idx: first, end_idx: last)
+  end
+
+  # Returns the next pair [trainset, testset]
+  # @return [Array<Dataset, Dataset>]
+  def peek
+    [self.train(@local_nrun), self.test(@local_nrun)]
+  end
+
+  # TODO: @local_nrun is an ugly hack, refactor it!
+
+  # Returns the next pair [trainset, testset] and increments the counter
+  # @return [Array<Dataset, Dataset>]
+  def next
+    peek.tap { @local_nrun += 1 }
+  end
+
+  include DataModeler::IteratingBasedOnNext # `#each` and `#to_a` based on `#next`
+
+  # I want `#to_a` to return an array of arrays rather than an array of dataset
+
+  # @return [Array<Array[Dataset]>]
+  alias_method :to_ds_a, :to_a
+  # @return [Array<Array<Array<...>>]
+  def to_a
+    to_ds_a.collect do |run|
+      run.collect &:to_a
+    end
+  end
+
+  private
+
+  include DataModeler::ConvertingTimeAndIndices # `#time` and `#idx`
+
+  # Find the index of the first element in the data eligible as target for training
+  # @return [Integer] the index of the first eligible target
+  def min_eligible_trg
+    @min_eligible_trg ||= idx(time(0) +
+      # minimum time span required as input for the first target
+      ds_args[:look_ahead] + (ds_args[:ntimes]-1) * ds_args[:tspread]
+    )
+  end
+
+  # Check if there is enough data to build `min_nruns` train + test sets
+  # @raise [NotEnoughDataError] if `not enough minerals` (cit.)
+  # @note remember the schema: need to check for `|win|train1|t1|t2|...|tn|`
+  def validate_enough_data_for min_nruns
+    min_data_size = min_eligible_trg + train_size + min_nruns * test_size
+    raise NotEnoughDataError if nrows < min_data_size
+  end
+end
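A matching sketch for DatasetGen, showing the rolling train/test windows described in the diagram above. The data and sizes are again made up for illustration:

require 'data_modeler'

data = { time: (1..10).to_a, a: (11..20).to_a, b: (21..30).to_a }

gen = DataModeler::DatasetGen.new data,
  ds_args: { inputs: [:a], targets: [:b], ntimes: 2, tspread: 1, look_ahead: 1 },
  train_size: 3, test_size: 2, min_nruns: 2

train1, test1 = gen.next  # run 1: train targets on rows [2,5), test on rows [5,7)
train2, test2 = gen.next  # run 2: both windows shift right by test_size
gen.next                  # raises NoDataLeft (a StopIteration) -- no room for run 3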
data/lib/data_modeler/dataset_helper.rb
ADDED
@@ -0,0 +1,44 @@
+module DataModeler
+  # Converts between time and indices for referencing data lines
+  module ConvertingTimeAndIndices
+    # Returns the time for a given index
+    # @param [Integer] idx row index
+    # @return [kind_of_time]
+    def time idx
+      data[:time][idx]
+    end
+
+    # Returns the index for a given time
+    # @param [time] time
+    # @return [Integer] row index
+    def idx time
+      # TODO: optimize with `from:`
+      # TODO: test corner case when index not found
+      # find index of first above time
+      idx = data[:time].index { |t| t > time }
+      # if index not found: all data is below time, "first above" is outofbound
+      idx ||= nrows
+      # if first above time is 0: there is no element with that time
+      raise TimeNotFoundError, "Time not found: #{time}" if idx.zero?
+      # return index of predecessor (last below time)
+      idx-1
+    end
+  end
+
+  # Provides each (which can return an `Iterator`) and `to_a` based on `#next`
+  module IteratingBasedOnNext
+    # Yields on each [inputs, targets] pair.
+    # @return [nil, Iterator] `block_given? ? nil : Iterator`
+    def each
+      return enum_for(:each) unless block_given?
+      loop { yield self.next }
+      nil
+    end
+
+    # @return [Array]
+    def to_a
+      each.to_a
+    end
+
+  end
+end
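ConvertingTimeAndIndices only assumes its host exposes `data` (with a :time series) and `nrows`, while IteratingBasedOnNext only assumes `#next`. A small sketch of the time/index conversion, using a hypothetical host class that is not part of the gem:

require 'data_modeler'

class TimeTable
  include DataModeler::ConvertingTimeAndIndices
  attr_reader :data, :nrows

  def initialize data
    @data  = data
    @nrows = data[:time].size
  end
end

data = { time: [10, 20, 30] }
tt = TimeTable.new data
tt.time(1)  # => 20  -- index to time
tt.idx(25)  # => 1   -- last row whose time is <= 25
tt.idx(30)  # => 2   -- exact matches map to their own row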
data/lib/data_modeler/exceptions.rb
ADDED
@@ -0,0 +1,12 @@
+class DataModeler::Dataset
+  # Exception: the requested `time` is not present in the data
+  class TimeNotFoundError < StandardError; end
+end
+
+class DataModeler::DatasetGen
+  # Exception: the `data` is not sufficient for the training setup
+  class NotEnoughDataError < StandardError; end
+  # Exception: not enough `data` left to build another train+test
+  # @note being subclassed from `StopIteration`, it will break loops
+  class NoDataLeft < StopIteration; end
+end
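The subclassing note matters because Kernel#loop rescues StopIteration and exits cleanly, which is how DatasetGen's `each`/`to_a` stop when `train` raises NoDataLeft. A minimal illustration of that Ruby behavior (the counter and cutoff are made up):

require 'data_modeler'

runs = 0
loop do
  raise DataModeler::DatasetGen::NoDataLeft if runs == 3  # pretend data ran out
  runs += 1
end
runs  # => 3 -- the loop ended quietly instead of propagating the error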
data/lib/data_modeler/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: data_modeler
 version: !ruby/object:Gem::Version
-  version: 0.
+  version: 0.1.0
 platform: ruby
 authors:
 - Giuseppe Cuccu
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2017-05-
+date: 2017-05-12 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: pry
@@ -16,56 +16,56 @@ dependencies:
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: '0'
+        version: '0.10'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: '0'
+        version: '0.10'
 - !ruby/object:Gem::Dependency
   name: pry-nav
   requirement: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: '0'
+        version: '0.2'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: '0'
+        version: '0.2'
 - !ruby/object:Gem::Dependency
   name: pry-stack_explorer
   requirement: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: '0'
+        version: '0.4'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: '0'
+        version: '0.4'
 - !ruby/object:Gem::Dependency
   name: pry-rescue
   requirement: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: '
+        version: '1.4'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: '
+        version: '1.4'
 - !ruby/object:Gem::Dependency
   name: bundler
   requirement: !ruby/object:Gem::Requirement
@@ -128,6 +128,10 @@ files:
 - bin/setup
 - data_modeler.gemspec
 - lib/data_modeler.rb
+- lib/data_modeler/dataset.rb
+- lib/data_modeler/dataset_gen.rb
+- lib/data_modeler/dataset_helper.rb
+- lib/data_modeler/exceptions.rb
 - lib/data_modeler/version.rb
 homepage: https://github.com/giuse/data_modeler
 licenses: