easy_ml 0.2.0.pre.rc71 → 0.2.0.pre.rc75
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/app/controllers/easy_ml/datasets_controller.rb +33 -0
- data/app/controllers/easy_ml/datasources_controller.rb +7 -0
- data/app/controllers/easy_ml/models_controller.rb +46 -0
- data/app/frontend/components/DatasetCard.tsx +212 -0
- data/app/frontend/components/ModelCard.tsx +114 -29
- data/app/frontend/components/StackTrace.tsx +13 -0
- data/app/frontend/components/dataset/FeatureConfigPopover.tsx +10 -7
- data/app/frontend/components/datasets/UploadDatasetButton.tsx +51 -0
- data/app/frontend/components/models/DownloadModelModal.tsx +90 -0
- data/app/frontend/components/models/UploadModelModal.tsx +212 -0
- data/app/frontend/components/models/index.ts +2 -0
- data/app/frontend/pages/DatasetsPage.tsx +36 -130
- data/app/frontend/pages/DatasourcesPage.tsx +22 -2
- data/app/frontend/pages/ModelsPage.tsx +37 -11
- data/app/frontend/types/dataset.ts +1 -2
- data/app/frontend/types.ts +1 -1
- data/app/jobs/easy_ml/reaper.rb +55 -0
- data/app/jobs/easy_ml/training_job.rb +1 -1
- data/app/models/easy_ml/column/imputers/base.rb +4 -0
- data/app/models/easy_ml/column/imputers/clip.rb +5 -3
- data/app/models/easy_ml/column/imputers/imputer.rb +11 -13
- data/app/models/easy_ml/column/imputers/mean.rb +7 -3
- data/app/models/easy_ml/column/imputers/null_imputer.rb +3 -0
- data/app/models/easy_ml/column/imputers/ordinal_encoder.rb +5 -1
- data/app/models/easy_ml/column/imputers.rb +3 -1
- data/app/models/easy_ml/column/lineage/base.rb +5 -1
- data/app/models/easy_ml/column/lineage/computed_by_feature.rb +1 -1
- data/app/models/easy_ml/column/lineage/preprocessed.rb +1 -1
- data/app/models/easy_ml/column/lineage/raw_dataset.rb +1 -1
- data/app/models/easy_ml/column/selector.rb +4 -0
- data/app/models/easy_ml/column.rb +79 -63
- data/app/models/easy_ml/column_history.rb +28 -28
- data/app/models/easy_ml/column_list/imputer.rb +23 -0
- data/app/models/easy_ml/column_list.rb +39 -26
- data/app/models/easy_ml/dataset/learner/base.rb +34 -0
- data/app/models/easy_ml/dataset/learner/eager/boolean.rb +10 -0
- data/app/models/easy_ml/dataset/learner/eager/categorical.rb +51 -0
- data/app/models/easy_ml/dataset/learner/eager/query.rb +37 -0
- data/app/models/easy_ml/dataset/learner/eager.rb +43 -0
- data/app/models/easy_ml/dataset/learner/lazy/boolean.rb +13 -0
- data/app/models/easy_ml/dataset/learner/lazy/categorical.rb +10 -0
- data/app/models/easy_ml/dataset/learner/lazy/datetime.rb +19 -0
- data/app/models/easy_ml/dataset/learner/lazy/null.rb +17 -0
- data/app/models/easy_ml/dataset/learner/lazy/numeric.rb +19 -0
- data/app/models/easy_ml/dataset/learner/lazy/query.rb +69 -0
- data/app/models/easy_ml/dataset/learner/lazy/string.rb +19 -0
- data/app/models/easy_ml/dataset/learner/lazy.rb +51 -0
- data/app/models/easy_ml/dataset/learner/query.rb +25 -0
- data/app/models/easy_ml/dataset/learner.rb +100 -0
- data/app/models/easy_ml/dataset.rb +150 -36
- data/app/models/easy_ml/dataset_history.rb +1 -0
- data/app/models/easy_ml/datasource.rb +9 -0
- data/app/models/easy_ml/event.rb +5 -7
- data/app/models/easy_ml/export/column.rb +27 -0
- data/app/models/easy_ml/export/dataset.rb +37 -0
- data/app/models/easy_ml/export/datasource.rb +12 -0
- data/app/models/easy_ml/export/feature.rb +24 -0
- data/app/models/easy_ml/export/model.rb +40 -0
- data/app/models/easy_ml/export/retraining_job.rb +20 -0
- data/app/models/easy_ml/export/splitter.rb +14 -0
- data/app/models/easy_ml/feature.rb +21 -0
- data/app/models/easy_ml/import/column.rb +35 -0
- data/app/models/easy_ml/import/dataset.rb +148 -0
- data/app/models/easy_ml/import/feature.rb +36 -0
- data/app/models/easy_ml/import/model.rb +136 -0
- data/app/models/easy_ml/import/retraining_job.rb +29 -0
- data/app/models/easy_ml/import/splitter.rb +34 -0
- data/app/models/easy_ml/lineage.rb +44 -0
- data/app/models/easy_ml/model.rb +101 -37
- data/app/models/easy_ml/model_file.rb +6 -0
- data/app/models/easy_ml/models/xgboost/evals_callback.rb +7 -7
- data/app/models/easy_ml/models/xgboost.rb +33 -9
- data/app/models/easy_ml/retraining_job.rb +8 -1
- data/app/models/easy_ml/retraining_run.rb +7 -5
- data/app/models/easy_ml/splitter.rb +8 -0
- data/app/models/lineage_history.rb +6 -0
- data/app/serializers/easy_ml/column_serializer.rb +7 -1
- data/app/serializers/easy_ml/dataset_serializer.rb +2 -1
- data/app/serializers/easy_ml/lineage_serializer.rb +9 -0
- data/config/routes.rb +14 -1
- data/lib/easy_ml/core/tuner/adapters/base_adapter.rb +3 -3
- data/lib/easy_ml/core/tuner.rb +13 -12
- data/lib/easy_ml/data/polars_column.rb +149 -100
- data/lib/easy_ml/data/polars_reader.rb +8 -5
- data/lib/easy_ml/data/polars_schema.rb +56 -0
- data/lib/easy_ml/data/splits/file_split.rb +20 -2
- data/lib/easy_ml/data/splits/split.rb +10 -1
- data/lib/easy_ml/data.rb +1 -0
- data/lib/easy_ml/deep_compact.rb +19 -0
- data/lib/easy_ml/engine.rb +1 -0
- data/lib/easy_ml/feature_store.rb +2 -6
- data/lib/easy_ml/railtie/generators/migration/migration_generator.rb +6 -0
- data/lib/easy_ml/railtie/templates/migration/add_extra_metadata_to_columns.rb.tt +9 -0
- data/lib/easy_ml/railtie/templates/migration/add_raw_schema_to_datasets.rb.tt +9 -0
- data/lib/easy_ml/railtie/templates/migration/add_unique_constraint_to_easy_ml_model_names.rb.tt +8 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_lineages.rb.tt +24 -0
- data/lib/easy_ml/railtie/templates/migration/remove_evaluator_from_retraining_jobs.rb.tt +7 -0
- data/lib/easy_ml/railtie/templates/migration/update_preprocessing_steps_to_jsonb.rb.tt +18 -0
- data/lib/easy_ml/timing.rb +34 -0
- data/lib/easy_ml/version.rb +1 -1
- data/lib/easy_ml.rb +2 -0
- data/public/easy_ml/assets/.vite/manifest.json +2 -2
- data/public/easy_ml/assets/assets/Application-Q7L6ioxr.css +1 -0
- data/public/easy_ml/assets/assets/entrypoints/Application.tsx-Rrzo4ecT.js +522 -0
- data/public/easy_ml/assets/assets/entrypoints/Application.tsx-Rrzo4ecT.js.map +1 -0
- metadata +53 -12
- data/app/models/easy_ml/column/learners/base.rb +0 -103
- data/app/models/easy_ml/column/learners/boolean.rb +0 -11
- data/app/models/easy_ml/column/learners/categorical.rb +0 -51
- data/app/models/easy_ml/column/learners/datetime.rb +0 -19
- data/app/models/easy_ml/column/learners/null.rb +0 -22
- data/app/models/easy_ml/column/learners/numeric.rb +0 -33
- data/app/models/easy_ml/column/learners/string.rb +0 -15
- data/public/easy_ml/assets/assets/Application-BbFobaXt.css +0 -1
- data/public/easy_ml/assets/assets/entrypoints/Application.tsx-CibZcrBc.js +0 -489
- data/public/easy_ml/assets/assets/entrypoints/Application.tsx-CibZcrBc.js.map +0 -1
@@ -1,6 +1,7 @@
|
|
1
1
|
module EasyML
|
2
2
|
module ColumnList
|
3
3
|
include Historiographer::Relation
|
4
|
+
include EasyML::Timing
|
4
5
|
|
5
6
|
def sync(delete: true)
|
6
7
|
return unless dataset.schema.present?
|
@@ -39,35 +40,28 @@ module EasyML
|
|
39
40
|
df
|
40
41
|
end
|
41
42
|
|
43
|
+
measure_method_timing :transform
|
44
|
+
|
45
|
+
# Applies the :clip imputer to every raw column that has clipping configured.
#
# @param df the frame to clip (must respond to +with_columns+)
# @return the frame with clipped expressions applied, or +df+ itself when
#   no column has clipping configured
def apply_clip(df)
  clippable = has_clip.raw

  if clippable.any?
    imputer = EasyML::ColumnList::Imputer.new(
      dataset,
      df,
      columns: clippable,
      imputers: [:clip],
    )
    df.with_columns(imputer.exprs)
  else
    df
  end
end
|
58
|
+
|
42
59
|
def learn(type: :raw, computed: false)
|
43
|
-
|
44
|
-
cols_to_learn = cols_to_learn.computed if computed
|
45
|
-
cols_to_learn = cols_to_learn.select(&:persisted?).reject(&:empty?)
|
46
|
-
cols_to_learn.each { |col| col.learn(type: type) }
|
47
|
-
EasyML::Column.import(cols_to_learn, on_duplicate_key_update: { columns: %i[
|
48
|
-
statistics
|
49
|
-
learned_at
|
50
|
-
sample_values
|
51
|
-
last_datasource_sha
|
52
|
-
is_learning
|
53
|
-
datatype
|
54
|
-
polars_datatype
|
55
|
-
] })
|
56
|
-
set_feature_lineage
|
60
|
+
EasyML::Dataset::Learner.new(dataset, type: type).learn
|
57
61
|
reload
|
58
62
|
end
|
59
63
|
|
60
|
-
|
61
|
-
names = dataset.features.computed_column_names
|
62
|
-
columns = where(name: names, computed_by: nil).map do |col|
|
63
|
-
col.assign_attributes(
|
64
|
-
is_computed: true,
|
65
|
-
computed_by: col.computing_feature&.name,
|
66
|
-
)
|
67
|
-
col
|
68
|
-
end
|
69
|
-
EasyML::Column.import(columns, on_duplicate_key_update: { columns: %i[ is_computed computed_by ] })
|
70
|
-
end
|
64
|
+
measure_method_timing :learn
|
71
65
|
|
72
66
|
def statistics
|
73
67
|
stats = { raw: {}, processed: {} }
|
@@ -115,6 +109,25 @@ module EasyML
|
|
115
109
|
column_list.sort_by { |col| [col.sort_required, col.name] }
|
116
110
|
end
|
117
111
|
|
112
|
+
# Marks feature-computed columns and records lineage for the given columns.
#
# Step 1: every column whose name is produced by a feature and that has no
# +computed_by+ yet is flagged +is_computed+ and tagged with the name of
# its computing feature, then bulk-upserted.
# Step 2: lineage entries are learned for each column in +cols_to_learn+
# and bulk-upserted.
#
# @param cols_to_learn [Enumerable] columns to learn lineage for
def set_feature_lineage(cols_to_learn)
  computed_names = dataset.features.computed_column_names

  newly_computed = where(name: computed_names, computed_by: nil).map do |col|
    col.tap do |c|
      c.assign_attributes(
        is_computed: true,
        computed_by: c.computing_feature&.name,
      )
    end
  end
  EasyML::Column.import(newly_computed, on_duplicate_key_update: { columns: %i[ is_computed computed_by ] })

  # Lineage.learn may return nil for a column; those entries are dropped.
  lineage_rows = cols_to_learn.flat_map { |col| EasyML::Lineage.learn(col) }.compact
  EasyML::Lineage.import(lineage_rows, on_duplicate_key_update: { columns: %i[ column_id key occurred_at description ] })
end
|
128
|
+
|
129
|
+
measure_method_timing :set_feature_lineage
|
130
|
+
|
118
131
|
private
|
119
132
|
|
120
133
|
def import_new(new_columns, existing_columns)
|
@@ -127,7 +140,7 @@ module EasyML
|
|
127
140
|
col
|
128
141
|
end
|
129
142
|
EasyML::Column.import(cols_to_insert)
|
130
|
-
set_feature_lineage
|
143
|
+
set_feature_lineage(cols_to_insert)
|
131
144
|
column_list.reload
|
132
145
|
end
|
133
146
|
|
@@ -0,0 +1,34 @@
|
|
1
|
+
module EasyML
  class Dataset
    class Learner
      # Shared state and helpers for the learner strategies that subclass it.
      class Base
        # Dataset variants a learner can iterate over. Frozen because
        # #types hands these arrays straight to callers.
        TYPES_ALL = %i[raw clipped processed].freeze
        TYPES_RAW = %i[raw clipped].freeze
        TYPES_PROCESSED = %i[processed].freeze

        attr_reader :dataset, :columns, :type

        # @param dataset the dataset being learned
        # @param columns the columns to learn statistics for
        # @param type [Symbol] which dataset variant was requested (default :raw)
        def initialize(dataset, columns, type: :raw)
          @dataset = dataset
          @columns = columns
          @type = type
        end

        # Whether +column+ should be skipped for the given dataset variant.
        #
        # For the :processed variant, one-hot columns are skipped. For any
        # other variant, columns not present in the raw dataset are skipped.
        #
        # @param column an object responding to +in_raw_dataset?+ and +one_hot?+
        # @param type [Symbol, String] dataset variant being learned
        # @return [Boolean] truthy when the column must be skipped
        def skip_processing?(column, type)
          if type.to_sym == :processed
            column.one_hot?
          else
            !column.in_raw_dataset?
          end
        end

        # Dataset variants to learn for the requested scope.
        #
        # Accepts symbols or strings, matching how #skip_processing?
        # normalises its +type+ argument. Unknown scopes fall back to all
        # variants.
        #
        # @param type [Symbol, String] :all, :raw or :processed
        # @return [Array<Symbol>] frozen list of dataset variants
        def types(type = :all)
          case type.to_s
          when "raw" then TYPES_RAW
          when "processed" then TYPES_PROCESSED
          else TYPES_ALL
          end
        end
      end
    end
  end
end
|
@@ -0,0 +1,51 @@
|
|
1
|
+
module EasyML
  class Dataset
    class Learner
      class Eager
        # Eager statistics for categorical columns: per-value counts, the
        # categories that reach the column's minimum frequency, and a label
        # encoder/decoder derived from those categories.
        class Categorical < Query
          # Statistics learned from the training split.
          #
          # @param df the materialized frame for the split
          # @return [Hash] :counts, :allowed_categories, :label_encoder and
          #   :label_decoder
          def train_query(df)
            category_stats = {
              counts: counts(df).to_hash,
              allowed_categories: allowed_categories(df).to_series.to_a,
            }
            category_stats.merge(learn_encoder_decoder(df))
          end

          # Builds the label encoder (category => index) and its inverse.
          #
          # Keys are cast through the column, nils are dropped, and the
          # remainder is ordered with the column's +sort_by+ before indexes
          # are assigned.
          #
          # @param df the materialized frame for the split
          # @return [Hash] :label_encoder and :label_decoder
          def learn_encoder_decoder(df)
            indexed = allowed_categories(df).lazy.with_row_count.collect.to_hash.invert

            ordered_keys = indexed
              .transform_keys(&column.method(:cast))
              .keys
              .compact
              .sort_by(&column.method(:sort_by))

            label_encoder = ordered_keys.each_with_index.to_h

            {
              label_encoder: label_encoder,
              label_decoder: label_encoder.invert,
            }
          end

          # Per-value occurrence counts for this column.
          # Memoized per instance; the first +df+ seen wins.
          def counts(df)
            @counts ||= df
              .group_by(column.name)
              .agg(Polars.col(column.name).count.alias("count"))
          end

          # Distinct categories whose count is at least
          # +column.categorical_min+, sorted in reverse order.
          # Memoized per instance; the first +df+ seen wins.
          def allowed_categories(df)
            @allowed_categories ||= df
              .join(counts(df), on: column.name)
              .filter(Polars.col("count").ge(column.categorical_min))
              .select(column.name)
              .unique
              .sort(column.name, reverse: true)
          end
        end
      end
    end
  end
end
|
@@ -0,0 +1,37 @@
|
|
1
|
+
module EasyML
  class Dataset
    class Learner
      class Eager
        # Base query for eager learning. Subclasses override #train_query
        # and/or #full_dataset_query; the defaults learn nothing.
        class Query < EasyML::Dataset::Learner::Query
          # Runs the query matching +split+ against +df+.
          #
          # @param split [Symbol, String] :train or :data
          # @param df the materialized frame for that split
          # @return [Hash, nil] learned statistics, or nil for any other split
          def execute(split, df)
            split_name = split.to_sym

            if split_name == :train
              train_query(df)
            elsif split_name == :data
              full_dataset_query(df)
            end
          end

          # Statistics learned from the training split (none by default).
          def train_query(df)
            {}
          end

          # Statistics learned from the full dataset (none by default).
          def full_dataset_query(df)
            {}
          end

          # Picks the eager adapter class for this column. The raw dtype
          # takes precedence over the dtype when it is set.
          #
          # @return [Class, nil] nil when the datatype needs no eager learning
          def adapter
            effective_dtype = raw_dtype&.to_sym || dtype.to_sym

            case effective_dtype
            when :categorical then Eager::Categorical
            when :boolean then Eager::Boolean
            end
          end
        end
      end
    end
  end
end
|
@@ -0,0 +1,43 @@
|
|
1
|
+
module EasyML
  class Dataset
    class Learner
      # Learns statistics through per-column adapters that run against a
      # fetched frame instead of contributing expressions to one lazy query.
      class Eager < Base
        # Learns statistics for every dataset variant returned by #types.
        #
        # @return [Hash] variant => { column name => statistics }, with the
        #   :train and :data results deep-merged
        def learn
          types.each_with_object({}) do |variant, learned|
            train_stats = learn_using_split(:train, variant)
            learned[variant] = train_stats.deep_merge!(learn_using_split(:data, variant))
          end
        end

        private

        # Statistics for one split of one variant; empty when the variant
        # has no data.
        def learn_using_split(split, type)
          if @dataset.send(type).empty?
            {}
          else
            execute_queries(split, type) || {}
          end
        end

        # Loads the requested split of the requested variant with all columns.
        def fetch_df(split, type)
          variant = @dataset.send(type)
          variant.send(split, all_columns: true)
        end

        # Runs the eager adapter of every eligible column.
        #
        # The frame is fetched at most once per call, and only when at
        # least one column actually has an adapter.
        def execute_queries(split, type)
          @fetched = nil

          columns.each_with_object({}) do |column, results|
            next if skip_processing?(column, type)

            adapter_class = Eager::Query.new(@dataset, column).adapter
            next unless adapter_class.present?

            @fetched ||= fetch_df(split, type)
            results[column.name] = adapter_class.new(@dataset, column).execute(split, @fetched)
          end
        end
      end
    end
  end
end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
module EasyML
  class Dataset
    class Learner
      class Lazy
        # Lazy statistics for datetime columns: the base full-dataset
        # expressions plus a distinct-value count.
        class Datetime < Query
          # @return [Array] base expressions followed by the unique-count expression
          def full_dataset_query
            expressions = super
            expressions << unique_count
          end

          # Expression counting distinct values, aliased
          # "<column name>__unique_count".
          def unique_count
            name = column.name
            Polars.col(name).n_unique.alias("#{name}__unique_count")
          end
        end
      end
    end
  end
end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
module EasyML
  class Dataset
    class Learner
      class Lazy
        # Lazy statistics for numeric columns: the base training
        # expressions plus mean, median, min, max and standard deviation.
        class Numeric < Query
          # @return [Array] base expressions followed by the numeric summary
          #   expressions, each aliased "<column name>__<statistic>"
          def train_query
            name = column.name
            summary = [
              Polars.col(name).mean.alias("#{name}__mean"),
              Polars.col(name).median.alias("#{name}__median"),
              Polars.col(name).min.alias("#{name}__min"),
              Polars.col(name).max.alias("#{name}__max"),
              Polars.col(name).std.alias("#{name}__std"),
            ]

            super.concat(summary)
          end
        end
      end
    end
  end
end
|
@@ -0,0 +1,69 @@
|
|
1
|
+
module EasyML
  class Dataset
    class Learner
      class Lazy
        # Base query for lazy learning. Each query returns a list of Polars
        # expressions aliased "<column name>__<statistic>".
        class Query < EasyML::Dataset::Learner::Query
          # Picks the lazy adapter class for this column's datatype.
          #
          # @return [Class]
          # @raise [RuntimeError] when the datatype has no adapter
          def adapter
            case dtype.to_sym
            when :float, :integer then Lazy::Numeric
            when :string, :text then Lazy::String
            when :categorical then Lazy::Categorical
            when :datetime, :date then Lazy::Datetime
            when :boolean then Lazy::Boolean
            when :null then Lazy::Null
            else
              raise "Don't know how to learn from dtype: #{dtype}"
            end
          end

          # Expressions to evaluate for the given split.
          #
          # @param split [Symbol, String] :train or :data
          # @return [Array, nil] expressions, or nil for any other split
          def execute(split)
            split_name = split.to_sym

            if split_name == :train
              train_query
            elsif split_name == :data
              full_dataset_query
            end
          end

          private

          # Expressions computed over the full dataset.
          def full_dataset_query
            [num_rows, null_count].compact
          end

          # Expressions computed over the training split. +last_value+ is
          # nil, and therefore dropped, when the dataset has no date column.
          def train_query
            [last_value, most_frequent_value].compact
          end

          # Number of null entries in the column.
          def null_count
            name = column.name
            Polars.col(name).null_count.alias("#{name}__null_count")
          end

          # Total number of entries in the column.
          def num_rows
            name = column.name
            Polars.col(name).len.alias("#{name}__num_rows")
          end

          # First mode among the non-null values of the column.
          def most_frequent_value
            name = column.name
            non_null = Polars.col(name).filter(Polars.col(name).is_not_null)
            non_null.mode.first.alias("#{name}__most_frequent_value")
          end

          # First non-null value after ordering by the dataset's date
          # column in reverse with nulls last.
          #
          # @return the expression, or nil when the dataset has no date column
          def last_value
            return unless dataset.date_column.present?

            name = column.name
            by_date = Polars.col(name).sort_by(dataset.date_column.name, reverse: true, nulls_last: true)
            by_date
              .filter(Polars.col(name).is_not_null)
              .first
              .alias("#{name}__last_value")
          end
        end
      end
    end
  end
end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
module EasyML
  class Dataset
    class Learner
      class Lazy
        # Lazy statistics for string/text columns: the base full-dataset
        # expressions plus a distinct-value count.
        class String < Query
          # @return [Array] base expressions followed by the unique-count expression
          def full_dataset_query
            expressions = super
            expressions << unique_count
          end

          # Expression counting distinct values after casting the column
          # to :str, aliased "<column name>__unique_count".
          def unique_count
            name = column.name
            Polars.col(name).cast(:str).n_unique.alias("#{name}__unique_count")
          end
        end
      end
    end
  end
end
|
@@ -0,0 +1,51 @@
|
|
1
|
+
module EasyML
  class Dataset
    class Learner
      # Learns statistics by collecting every column's expressions into a
      # single lazy select per split, then unpacking the one-row result by
      # alias ("<column name>__<statistic>").
      class Lazy < Base
        # Separator between column name and statistic name in query aliases.
        ALIAS_SEPARATOR = "__".freeze

        # Learns statistics for every dataset variant returned by #types.
        #
        # @return [Hash] variant => { column name => { statistic => value } },
        #   with the :train and :data results deep-merged
        def learn
          types.each_with_object({}) do |variant, learned|
            train_stats = learn_using_split(:train, variant)
            learned[variant] = train_stats.deep_merge!(learn_using_split(:data, variant))
          end
        end

        private

        # Statistics for one split of one variant; empty when the variant
        # has no data.
        def learn_using_split(split, type)
          return {} if @dataset.send(type).empty?

          get_column_statistics(run_queries(split, type))
        end

        # Evaluates all column expressions for the split in one lazy query.
        def run_queries(split, type)
          queries = build_queries(split, type)
          @dataset.send(type).send(split, all_columns: true, lazy: true).select(queries).collect
        end

        # Regroups the flat query result into per-column statistics.
        #
        # The alias is split on its LAST separator, so a column name that
        # itself contains "__" (or ends in "_") is still attributed to the
        # right column. Splitting on the first separator would file its
        # statistics under a truncated name.
        #
        # @param query_results frame responding to +columns+ and +[]+
        # @return [Hash] column name => { statistic name => value }
        def get_column_statistics(query_results)
          query_results.columns.each_with_object({}) do |aliased, stats|
            column_name, separator, statistic_name = aliased.rpartition(ALIAS_SEPARATOR)
            # With no separator at all, rpartition returns ["", "", aliased].
            # Key by the alias itself, which is what splitting would yield.
            column_name = aliased if separator.empty?

            stats[column_name] ||= {}
            stats[column_name][statistic_name] = query_results[aliased][0]
          end
        end

        # Expressions of every eligible column for the given split.
        # Skipped columns, and splits an adapter does not handle, add nothing.
        def build_queries(split, type)
          columns.flat_map do |column|
            next if skip_processing?(column, type)

            query = Lazy::Query.new(@dataset, column)
            query_adapter = query.adapter.new(@dataset, column)
            query_adapter.execute(split)
          end.compact
        end
      end
    end
  end
end
|
@@ -0,0 +1,25 @@
|
|
1
|
+
module EasyML
  class Dataset
    class Learner
      # Common state for learner queries: the dataset, the column and the
      # column's resolved datatypes.
      class Query
        attr_reader :dataset, :column, :dtype, :raw_dtype

        # Resolves the column's datatype, falling back to inspecting the
        # column's raw data when no datatype is stored on the column.
        #
        # @param dataset the dataset the column belongs to
        # @param column the column to query
        # @raise [RuntimeError] when the datatype cannot be resolved for any reason
        def initialize(dataset, column)
          @dataset = dataset
          @column = column

          # TODO: LAZIFY THIS
          stored = column.datatype
          @dtype = stored || EasyML::Data::PolarsColumn.determine_type(column.raw.data[column.name])
          @raw_dtype = column.raw_dtype
        rescue
          raise "Unable to find column #{column.name}. If this column is computed by a feature, you forgot to declare computes_columns"
        end

        # Delegates to the adapter chosen by the subclass.
        def execute(split)
          adapter.execute(split)
        end
      end
    end
  end
end
|
@@ -0,0 +1,100 @@
|
|
1
|
+
module EasyML
  class Dataset
    # Learns statistics for the dataset's columns that need learning, by
    # combining the Lazy and Eager learners, and persists the results on the
    # columns.
    #
    # +measure_method_timing+ comes from EasyML::Timing (not shown here);
    # presumably it wraps the named method to record its duration, which is
    # why each call follows the method it names.
    class Learner
      include EasyML::Timing
      attr_accessor :dataset, :columns, :type, :computed, :raw_columns, :statistics

      # @param dataset the dataset whose columns are learned
      # @param type [Symbol] dataset variant requested (default :raw)
      # @param computed [Boolean] when true, restrict learning to computed
      #   columns. Previously this flag was read before ever being assigned,
      #   so the filter could never run.
      def initialize(dataset, type: :raw, computed: false)
        @dataset = dataset
        @type = type
        @computed = computed

        candidates = dataset.columns.reload.needs_learn
        # Filter before sort_by: sort_by returns a plain Array, which has no
        # +computed+ scope.
        # NOTE(review): assumes needs_learn returns something responding to
        # +computed+ — confirm.
        candidates = candidates.computed if computed

        @columns = candidates.sort_by(&:name).select(&:persisted?).reject(&:empty?)
      end

      # Prepares column metadata, learns statistics, then persists them.
      def learn
        prepare
        learn_statistics
        save_statistics
      end

      private

      # Writes the learned statistics and bookkeeping attributes onto each
      # column, bulk-upserts them, then refreshes feature lineage.
      def save_statistics
        columns.each do |col|
          col.merge_statistics(statistics.dig(col.name))
          col.set_sample_values
          col.assign_attributes(
            learned_at: EasyML::Support::UTC.now,
            last_datasource_sha: col.dataset.last_datasource_sha,
            last_feature_sha: col.feature&.sha,
            is_learning: type == :raw,
          )
        end

        EasyML::Column.import(columns, on_duplicate_key_update: { columns: %i[
                                         statistics
                                         learned_at
                                         sample_values
                                         last_datasource_sha
                                         is_learning
                                       ] })
        dataset.columns.set_feature_lineage(columns)
      end

      measure_method_timing :save_statistics

      # Merges lazy and eager statistics and pivots them from
      # variant => column => stats into column => variant => stats.
      # Memoized in @statistics.
      #
      # @return [ActiveSupport::HashWithIndifferentAccess]
      def learn_statistics
        return @statistics if @statistics

        by_variant = lazy_statistics.deep_merge!(eager_statistics)
        pivoted = by_variant.each_with_object({}) do |(variant, per_column), acc|
          per_column.each do |column_name, stats|
            acc[column_name] ||= {}
            acc[column_name][variant] = stats
          end
        end
        @statistics = pivoted.with_indifferent_access

        if type != :raw
          # One-hot columns are skipped for the processed variant, so their
          # processed statistics mirror the raw ones.
          columns.select(&:one_hot?).each do |column|
            column_stats = @statistics[column.name]
            # A one-hot column absent from the raw dataset is skipped for
            # every variant and has no entry to mirror.
            next if column_stats.nil?

            column_stats[:processed] = column_stats[:raw]
          end
        end

        @statistics
      end

      measure_method_timing :learn_statistics

      # Records, for every column, whether it exists in the raw schema, and
      # fills in the datatype from the schema when the column has none stored.
      def prepare
        @schema = EasyML::Data::PolarsSchema.simplify(@dataset.raw_schema).symbolize_keys
        @raw_columns = @schema.keys.sort.map(&:to_s)
        columns.each do |column|
          attrs = {
            in_raw_dataset: @raw_columns.include?(column.name),
            # nil is compacted away, so an already-stored datatype is kept.
            datatype: column.read_attribute(:datatype).present? ? nil : @schema[column.name.to_sym],
          }.compact
          column.assign_attributes(attrs)
        end
        EasyML::Column.import(columns, on_duplicate_key_update: { columns: %i[in_raw_dataset datatype] })
      end

      measure_method_timing :prepare

      # Statistics gathered by the Lazy learner.
      def lazy_statistics
        Lazy.new(dataset, columns, type: type).learn
      end

      measure_method_timing :lazy_statistics

      # Statistics gathered by the Eager learner.
      def eager_statistics
        Eager.new(dataset, columns, type: type).learn
      end

      measure_method_timing :eager_statistics
    end
  end
end
|