easy_ml 0.2.0.pre.rc58 → 0.2.0.pre.rc61
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/app/controllers/easy_ml/application_controller.rb +4 -0
- data/app/controllers/easy_ml/datasets_controller.rb +32 -1
- data/app/frontend/components/DatasetPreview.tsx +50 -19
- data/app/frontend/components/dataset/ColumnConfigModal.tsx +7 -1
- data/app/frontend/components/dataset/ColumnFilters.tsx +37 -3
- data/app/frontend/components/dataset/ColumnList.tsx +14 -2
- data/app/frontend/components/dataset/PreprocessingConfig.tsx +81 -20
- data/app/frontend/types/dataset.ts +3 -0
- data/app/jobs/easy_ml/compute_feature_job.rb +0 -3
- data/app/jobs/easy_ml/refresh_dataset_job.rb +0 -6
- data/app/models/easy_ml/column/imputers/base.rb +89 -0
- data/app/models/easy_ml/column/imputers/categorical.rb +35 -0
- data/app/models/easy_ml/column/imputers/clip.rb +30 -0
- data/app/models/easy_ml/column/imputers/constant.rb +27 -0
- data/app/models/easy_ml/column/imputers/ffill.rb +29 -0
- data/app/models/easy_ml/column/imputers/imputer.rb +103 -0
- data/app/models/easy_ml/column/imputers/mean.rb +27 -0
- data/app/models/easy_ml/column/imputers/median.rb +27 -0
- data/app/models/easy_ml/column/imputers/most_frequent.rb +27 -0
- data/app/models/easy_ml/column/imputers/null_imputer.rb +15 -0
- data/app/models/easy_ml/column/imputers/one_hot_encoder.rb +30 -0
- data/app/models/easy_ml/column/imputers/ordinal_encoder.rb +78 -0
- data/app/models/easy_ml/column/imputers/today.rb +20 -0
- data/app/models/easy_ml/column/imputers.rb +126 -0
- data/app/models/easy_ml/column/learner.rb +18 -0
- data/app/models/easy_ml/column/learners/base.rb +103 -0
- data/app/models/easy_ml/column/learners/boolean.rb +11 -0
- data/app/models/easy_ml/column/learners/categorical.rb +51 -0
- data/app/models/easy_ml/column/learners/datetime.rb +19 -0
- data/app/models/easy_ml/column/learners/null.rb +22 -0
- data/app/models/easy_ml/column/learners/numeric.rb +33 -0
- data/app/models/easy_ml/column/learners/string.rb +15 -0
- data/app/models/easy_ml/column/lineage/base.rb +22 -0
- data/app/models/easy_ml/column/lineage/computed_by_feature.rb +23 -0
- data/app/models/easy_ml/column/lineage/preprocessed.rb +23 -0
- data/app/models/easy_ml/column/lineage/raw_dataset.rb +23 -0
- data/app/models/easy_ml/column/lineage.rb +28 -0
- data/app/models/easy_ml/column/selector.rb +96 -0
- data/app/models/easy_ml/column.rb +319 -52
- data/app/models/easy_ml/column_history.rb +29 -22
- data/app/models/easy_ml/column_list.rb +63 -78
- data/app/models/easy_ml/dataset.rb +128 -96
- data/app/models/easy_ml/dataset_history.rb +23 -23
- data/app/models/easy_ml/datasource.rb +3 -0
- data/app/models/easy_ml/datasource_history.rb +1 -0
- data/app/models/easy_ml/datasources/file_datasource.rb +1 -1
- data/app/models/easy_ml/datasources/polars_datasource.rb +6 -12
- data/app/models/easy_ml/datasources/s3_datasource.rb +1 -1
- data/app/models/easy_ml/feature.rb +19 -7
- data/app/models/easy_ml/feature_history.rb +12 -0
- data/app/models/easy_ml/feature_list.rb +15 -0
- data/app/serializers/easy_ml/column_serializer.rb +11 -1
- data/app/serializers/easy_ml/dataset_serializer.rb +23 -2
- data/config/initializers/enumerable.rb +17 -0
- data/lib/easy_ml/data/date_converter.rb +137 -30
- data/lib/easy_ml/data/polars_column.rb +17 -0
- data/lib/easy_ml/data/polars_in_memory.rb +30 -0
- data/lib/easy_ml/data/polars_reader.rb +20 -1
- data/lib/easy_ml/data/splits/in_memory_split.rb +3 -5
- data/lib/easy_ml/data/splits/split.rb +2 -1
- data/lib/easy_ml/data/synced_directory.rb +1 -1
- data/lib/easy_ml/data.rb +1 -2
- data/lib/easy_ml/engine.rb +1 -0
- data/lib/easy_ml/feature_store.rb +33 -22
- data/lib/easy_ml/railtie/generators/migration/migration_generator.rb +4 -0
- data/lib/easy_ml/railtie/templates/migration/add_computed_columns_to_easy_ml_columns.rb.tt +4 -0
- data/lib/easy_ml/railtie/templates/migration/add_last_feature_sha_to_columns.rb.tt +9 -0
- data/lib/easy_ml/railtie/templates/migration/add_learned_at_to_easy_ml_columns.rb.tt +13 -0
- data/lib/easy_ml/railtie/templates/migration/add_sha_to_datasources_datasets_and_columns.rb.tt +21 -0
- data/lib/easy_ml/railtie/templates/migration/remove_preprocessor_statistics_from_easy_ml_datasets.rb.tt +11 -0
- data/lib/easy_ml/version.rb +1 -1
- data/lib/tasks/profile.rake +40 -0
- data/public/easy_ml/assets/.vite/manifest.json +2 -2
- data/public/easy_ml/assets/assets/Application-BbFobaXt.css +1 -0
- data/public/easy_ml/assets/assets/entrypoints/Application.tsx-Dni_GM8r.js +489 -0
- data/public/easy_ml/assets/assets/entrypoints/Application.tsx-Dni_GM8r.js.map +1 -0
- metadata +41 -10
- data/app/models/easy_ml/adapters/base_adapter.rb +0 -45
- data/app/models/easy_ml/adapters/polars_adapter.rb +0 -77
- data/lib/easy_ml/data/preprocessor.rb +0 -340
- data/lib/easy_ml/data/simple_imputer.rb +0 -255
- data/lib/easy_ml/data/statistics_learner.rb +0 -193
- data/public/easy_ml/assets/assets/Application-BUsRR6b6.css +0 -1
- data/public/easy_ml/assets/assets/entrypoints/Application.tsx-DmkdJsDd.js +0 -474
- data/public/easy_ml/assets/assets/entrypoints/Application.tsx-DmkdJsDd.js.map +0 -1
@@ -0,0 +1,22 @@
|
|
1
|
+
module EasyML
  class Column
    module Learners
      # Learner that records only row/null counts for the whole dataset and
      # the last value seen in the training data.
      class Null < Base
        # Statistics gathered over the full dataset.
        #
        # @param df [Object, nil] data frame holding this column
        #   (presumably a Polars::DataFrame — confirm against callers)
        # @return [Hash] empty when +df+ is nil; otherwise +:num_rows+ and
        #   +:null_count+ (falls back to 0 when the count is nil)
        def full_dataset_statistics(df)
          return {} if df.nil?

          row_total = df.size
          nulls = df[column.name].null_count

          { num_rows: row_total, null_count: nulls || 0 }
        end

        # Statistics gathered over the training data.
        #
        # @param df [Object] training data frame
        # @return [Hash] a single +:last_value+ entry; +last_value+ is not
        #   defined in this class (presumably inherited from Base)
        def train_statistics(df)
          latest = last_value(df)

          { last_value: latest }
        end
      end
    end
  end
end
|
@@ -0,0 +1,33 @@
|
|
1
|
+
module EasyML
  class Column
    module Learners
      # Learner for numeric columns: extends the statistics returned by the
      # parent learner with mean/median/min/max/std and the most recent value.
      class Numeric < Base
        # Statistics gathered over the training data.
        #
        # @param df [Polars::DataFrame, nil] training data containing this column
        # @return [Hash] empty when +df+ is nil; otherwise the parent's
        #   statistics merged (in place) with the numeric summary. Entries
        #   whose value is nil are dropped before merging.
        def train_statistics(df)
          return {} if df.nil?

          base_stats = super(df)
          series = df[column.name] # looked up once instead of once per statistic

          base_stats.merge!(
            {
              mean: series.mean,
              median: series.median,
              min: series.min,
              max: series.max,
              std: series.std,
              last_value: last_value(df),
            }.compact
          )
        end

        # Most recent non-null value of this column, ordered by the dataset's
        # date column (newest first).
        #
        # @param df [Polars::DataFrame] data containing this column and the date column
        # @return [Object, nil] the latest non-null value; nil when the dataset
        #   has no date column or when the column holds no non-null values
        def last_value(df)
          return unless dataset.date_column.present?

          latest = df
            .sort(dataset.date_column.name, reverse: true)
            .filter(Polars.col(column.name).is_not_null)
            .select(column.name)
            .head(1)

          # `item` requires a 1x1 frame; an all-null column leaves zero rows
          # after the filter, so report "no value" instead of raising.
          return if latest.height.zero?

          latest.item
        end
      end
    end
  end
end
|
@@ -0,0 +1,22 @@
|
|
1
|
+
module EasyML
  class Column
    class Lineage
      # Common behaviour for a single lineage entry of a column.
      #
      # Subclasses are expected to provide +key+, +description+ and
      # +timestamp+, which #as_json reads.
      class Base
        attr_accessor :dataset, :column

        # @param column [Object] the column this entry describes; must respond
        #   to +dataset+
        def initialize(column)
          @column = column
          @dataset = column.dataset
        end

        # Serializable representation of this lineage entry.
        #
        # Accepts (and ignores) an optional options argument so the method
        # follows the +as_json(options = nil)+ calling convention used when an
        # object is serialized via +to_json+ or nested inside a Hash/Array;
        # calling it with no arguments keeps working as before.
        #
        # @param _options [Hash, nil] ignored
        # @return [Hash] +key+, +description+ and +timestamp+ with indifferent access
        def as_json(_options = nil)
          {
            key: key,
            description: description,
            timestamp: timestamp,
          }.with_indifferent_access
        end
      end
    end
  end
end
|
@@ -0,0 +1,23 @@
|
|
1
|
+
module EasyML
  class Column
    class Lineage
      # Lineage entry describing a column that was computed by a feature.
      class ComputedByFeature < Base
        # @return [Symbol] identifier of this lineage entry
        def key
          :computed_by_feature
        end

        # @return [String] human-readable text naming +column.computed_by+
        def description
          source = column.computed_by
          "Computed by #{source}"
        end

        # @return [Object] the +fit_at+ value of the column's feature
        def timestamp
          feature = column.feature
          feature.fit_at
        end

        # @return [Boolean] whether the column has a +computed_by+ value,
        #   i.e. whether this entry applies to the column
        def check
          computed_by = column.computed_by
          computed_by.present?
        end
      end
    end
  end
end
|
@@ -0,0 +1,23 @@
|
|
1
|
+
module EasyML
  class Column
    class Lineage
      # Lineage entry describing a column that has preprocessing steps.
      class Preprocessed < Base
        # @return [Symbol] identifier of this lineage entry
        def key
          :preprocessed
        end

        # @return [String] text listing the column's preprocessing
        #   descriptions, comma separated
        def description
          steps = column.imputers.preprocessing_descriptions.join(", ")
          "Preprocessed using #{steps}"
        end

        # @return [Object] the +refreshed_at+ value of the column's dataset
        def timestamp
          owner = column.dataset
          owner.refreshed_at
        end

        # @return [Boolean] whether the column has any preprocessing steps,
        #   i.e. whether this entry applies to the column
        def check
          steps = column.preprocessing_steps
          steps.present?
        end
      end
    end
  end
end
|
@@ -0,0 +1,23 @@
|
|
1
|
+
module EasyML
  class Column
    class Lineage
      # Lineage entry describing a column that exists in the raw dataset.
      class RawDataset < Base
        # @return [Symbol] identifier of this lineage entry
        def key
          :raw_dataset
        end

        # @return [String] fixed human-readable description
        def description
          "Present in raw dataset"
        end

        # @return [Object] the +refreshed_at+ value of the datasource behind
        #   the column's dataset
        def timestamp
          datasource = column.dataset.datasource
          datasource.refreshed_at
        end

        # @return [Boolean] result of +column.in_raw_dataset?+, i.e. whether
        #   this entry applies to the column
        def check
          column.in_raw_dataset?
        end
      end
    end
  end
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
module EasyML
  class Column
    # Builds the ordered list of lineage entries that apply to a column.
    class Lineage
      attr_accessor :dataset, :column

      # @param column [Object] the column to describe; must respond to +dataset+
      def initialize(column)
        @column = column
        @dataset = column.dataset
      end

      # @return [Array<Class>] lineage entry classes, in the order their
      #   entries are reported
      def sort_order
        [RawDataset, ComputedByFeature, Preprocessed]
      end

      # Instantiates every entry class for the column, keeps the entries whose
      # +check+ is truthy, and serializes them.
      #
      # @return [Array<Hash>] the +as_json+ output of each applicable entry,
      #   with nil results removed
      def lineage
        candidates = sort_order.map { |entry_class| entry_class.new(column) }
        applicable = candidates.select(&:check)
        serialized = applicable.map(&:as_json)

        serialized.compact
      end
    end
  end
end
|
@@ -0,0 +1,96 @@
|
|
1
|
+
module EasyML
  class Column
    # Fetches the data of a single column from its dataset, optionally scoped
    # to a named source on the dataset (+:raw+ or +:processed+) and optionally
    # post-processed by a transform block.
    class Selector
      attr_accessor :selected, :dataset, :column, :transform

      # @param column [Object] the column to read; must respond to +dataset+
      # @param selected [Symbol, nil] name of the dataset source to read from
      #   (sent to the dataset as a method name); nil reads from the dataset itself
      # @param block [Proc, nil] optional transform applied to the fetched result
      def initialize(column, selected = nil, &block)
        @column = column
        @dataset = column.dataset
        @selected = selected
        @transform = block
      end

      # @return [Object] the column's name
      def name
        column.name
      end

      # Selector for the raw source. A computed column that is not in the raw
      # dataset gets a +:processed+ selector instead.
      #
      # @return [Selector]
      def raw
        source = (column.is_computed? && !column.in_raw_dataset?) ? :processed : :raw
        Selector.new(column, source)
      end

      # Selector for the raw source whose result is passed through the
      # training imputers' +clip+.
      #
      # @return [Selector]
      def clipped
        Selector.new(column, :raw) { |df| column.imputers.training.clip(df) }
      end

      # @return [Selector] selector for the processed source
      def processed
        Selector.new(column, :processed)
      end

      # @param kwargs [Hash] options forwarded to the dataset reader
      # @return [Object, nil] this column's data from the +train+ segment
      def train(**kwargs)
        select(:train, **kwargs)
      end

      # @param kwargs [Hash] options forwarded to the dataset reader
      # @return [Object, nil] this column's data from the +test+ segment
      def test(**kwargs)
        select(:test, **kwargs)
      end

      # @param kwargs [Hash] options forwarded to the dataset reader
      # @return [Object, nil] this column's data from the +valid+ segment
      def valid(**kwargs)
        select(:valid, **kwargs)
      end

      # Data from the +data+ segment. Computed columns are always read through
      # a fresh +:processed+ selector (which carries no transform).
      #
      # @param kwargs [Hash] options forwarded to the dataset reader
      # @return [Object, nil]
      def data(**kwargs)
        target = column.is_computed? ? Selector.new(column, :processed) : self
        target.send(:select, :data, **kwargs)
      end

      private

      # Reads +segment+ for this column.
      #
      # @param segment [Symbol] reader method name (+:train+, +:test+, +:valid+, +:data+)
      # @param orig_kwargs [Hash] caller options; copied, never mutated.
      #   +:select+ may hold extra column name(s) to fetch alongside this column.
      # @return [Object, nil] nil when there is no dataset; an empty
      #   Polars::DataFrame when none of the requested columns are available;
      #   otherwise the reader's result, passed through +transform+ when set
      def select(segment, **orig_kwargs)
        kwargs = orig_kwargs.clone
        return nil if dataset.nil?

        kwargs[:all_columns] = true

        requested = kwargs.key?(:select) ? [kwargs[:select]].flatten : []

        # One-hot columns are requested via their virtual columns when reading
        # processed data (explicitly, or implicitly when no refresh is needed).
        wants_virtual = (selected == :processed || (selected.nil? && !dataset.needs_refresh?)) && column.one_hot?
        requested << (wants_virtual ? column.virtual_columns : column.name)
        requested = requested.flatten.uniq

        # Probe a single row to learn which columns exist, then keep only the
        # requested ones that are actually available.
        available = segment_source.send(segment, limit: 1, all_columns: true)&.columns || []
        kwargs[:select] = available & requested
        return Polars::DataFrame.new if kwargs[:select].empty?

        fetched = segment_source.send(segment, **kwargs)
        transform ? transform.call(fetched) : fetched
      end

      # Object that answers the segment readers: the dataset's named source
      # when one was selected, otherwise the dataset itself. Evaluated on
      # every call (not memoized).
      def segment_source
        @selected.present? ? dataset.send(@selected) : dataset
      end
    end
  end
end
|