easy_ml 0.2.0.pre.rc72 → 0.2.0.pre.rc75
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/app/controllers/easy_ml/datasets_controller.rb +33 -0
- data/app/controllers/easy_ml/datasources_controller.rb +7 -0
- data/app/controllers/easy_ml/models_controller.rb +38 -0
- data/app/frontend/components/DatasetCard.tsx +212 -0
- data/app/frontend/components/ModelCard.tsx +69 -29
- data/app/frontend/components/StackTrace.tsx +13 -0
- data/app/frontend/components/dataset/FeatureConfigPopover.tsx +10 -7
- data/app/frontend/components/datasets/UploadDatasetButton.tsx +51 -0
- data/app/frontend/components/models/DownloadModelModal.tsx +90 -0
- data/app/frontend/components/models/UploadModelModal.tsx +212 -0
- data/app/frontend/components/models/index.ts +2 -0
- data/app/frontend/pages/DatasetsPage.tsx +36 -130
- data/app/frontend/pages/DatasourcesPage.tsx +22 -2
- data/app/frontend/pages/ModelsPage.tsx +37 -11
- data/app/frontend/types/dataset.ts +1 -2
- data/app/frontend/types.ts +1 -1
- data/app/jobs/easy_ml/training_job.rb +2 -2
- data/app/models/easy_ml/column/imputers/base.rb +4 -0
- data/app/models/easy_ml/column/imputers/clip.rb +5 -3
- data/app/models/easy_ml/column/imputers/imputer.rb +11 -13
- data/app/models/easy_ml/column/imputers/mean.rb +7 -3
- data/app/models/easy_ml/column/imputers/null_imputer.rb +3 -0
- data/app/models/easy_ml/column/imputers/ordinal_encoder.rb +5 -1
- data/app/models/easy_ml/column/imputers.rb +3 -1
- data/app/models/easy_ml/column/lineage/base.rb +5 -1
- data/app/models/easy_ml/column/lineage/computed_by_feature.rb +1 -1
- data/app/models/easy_ml/column/lineage/preprocessed.rb +1 -1
- data/app/models/easy_ml/column/lineage/raw_dataset.rb +1 -1
- data/app/models/easy_ml/column/selector.rb +4 -0
- data/app/models/easy_ml/column.rb +79 -63
- data/app/models/easy_ml/column_history.rb +28 -28
- data/app/models/easy_ml/column_list/imputer.rb +23 -0
- data/app/models/easy_ml/column_list.rb +39 -26
- data/app/models/easy_ml/dataset/learner/base.rb +34 -0
- data/app/models/easy_ml/dataset/learner/eager/boolean.rb +10 -0
- data/app/models/easy_ml/dataset/learner/eager/categorical.rb +51 -0
- data/app/models/easy_ml/dataset/learner/eager/query.rb +37 -0
- data/app/models/easy_ml/dataset/learner/eager.rb +43 -0
- data/app/models/easy_ml/dataset/learner/lazy/boolean.rb +13 -0
- data/app/models/easy_ml/dataset/learner/lazy/categorical.rb +10 -0
- data/app/models/easy_ml/dataset/learner/lazy/datetime.rb +19 -0
- data/app/models/easy_ml/dataset/learner/lazy/null.rb +17 -0
- data/app/models/easy_ml/dataset/learner/lazy/numeric.rb +19 -0
- data/app/models/easy_ml/dataset/learner/lazy/query.rb +69 -0
- data/app/models/easy_ml/dataset/learner/lazy/string.rb +19 -0
- data/app/models/easy_ml/dataset/learner/lazy.rb +51 -0
- data/app/models/easy_ml/dataset/learner/query.rb +25 -0
- data/app/models/easy_ml/dataset/learner.rb +100 -0
- data/app/models/easy_ml/dataset.rb +150 -36
- data/app/models/easy_ml/dataset_history.rb +1 -0
- data/app/models/easy_ml/datasource.rb +9 -0
- data/app/models/easy_ml/event.rb +4 -0
- data/app/models/easy_ml/export/column.rb +27 -0
- data/app/models/easy_ml/export/dataset.rb +37 -0
- data/app/models/easy_ml/export/datasource.rb +12 -0
- data/app/models/easy_ml/export/feature.rb +24 -0
- data/app/models/easy_ml/export/model.rb +40 -0
- data/app/models/easy_ml/export/retraining_job.rb +20 -0
- data/app/models/easy_ml/export/splitter.rb +14 -0
- data/app/models/easy_ml/feature.rb +21 -0
- data/app/models/easy_ml/import/column.rb +35 -0
- data/app/models/easy_ml/import/dataset.rb +148 -0
- data/app/models/easy_ml/import/feature.rb +36 -0
- data/app/models/easy_ml/import/model.rb +136 -0
- data/app/models/easy_ml/import/retraining_job.rb +29 -0
- data/app/models/easy_ml/import/splitter.rb +34 -0
- data/app/models/easy_ml/lineage.rb +44 -0
- data/app/models/easy_ml/model.rb +93 -36
- data/app/models/easy_ml/model_file.rb +6 -0
- data/app/models/easy_ml/models/xgboost/evals_callback.rb +7 -7
- data/app/models/easy_ml/models/xgboost.rb +33 -9
- data/app/models/easy_ml/retraining_job.rb +8 -1
- data/app/models/easy_ml/retraining_run.rb +6 -4
- data/app/models/easy_ml/splitter.rb +8 -0
- data/app/models/lineage_history.rb +6 -0
- data/app/serializers/easy_ml/column_serializer.rb +7 -1
- data/app/serializers/easy_ml/dataset_serializer.rb +2 -1
- data/app/serializers/easy_ml/lineage_serializer.rb +9 -0
- data/config/routes.rb +13 -1
- data/lib/easy_ml/core/tuner/adapters/base_adapter.rb +3 -3
- data/lib/easy_ml/core/tuner.rb +12 -11
- data/lib/easy_ml/data/polars_column.rb +149 -100
- data/lib/easy_ml/data/polars_reader.rb +8 -5
- data/lib/easy_ml/data/polars_schema.rb +56 -0
- data/lib/easy_ml/data/splits/file_split.rb +20 -2
- data/lib/easy_ml/data/splits/split.rb +10 -1
- data/lib/easy_ml/data.rb +1 -0
- data/lib/easy_ml/deep_compact.rb +19 -0
- data/lib/easy_ml/feature_store.rb +2 -6
- data/lib/easy_ml/railtie/generators/migration/migration_generator.rb +6 -0
- data/lib/easy_ml/railtie/templates/migration/add_extra_metadata_to_columns.rb.tt +9 -0
- data/lib/easy_ml/railtie/templates/migration/add_raw_schema_to_datasets.rb.tt +9 -0
- data/lib/easy_ml/railtie/templates/migration/add_unique_constraint_to_easy_ml_model_names.rb.tt +8 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_lineages.rb.tt +24 -0
- data/lib/easy_ml/railtie/templates/migration/remove_evaluator_from_retraining_jobs.rb.tt +7 -0
- data/lib/easy_ml/railtie/templates/migration/update_preprocessing_steps_to_jsonb.rb.tt +18 -0
- data/lib/easy_ml/timing.rb +34 -0
- data/lib/easy_ml/version.rb +1 -1
- data/lib/easy_ml.rb +2 -0
- data/public/easy_ml/assets/.vite/manifest.json +2 -2
- data/public/easy_ml/assets/assets/Application-Q7L6ioxr.css +1 -0
- data/public/easy_ml/assets/assets/entrypoints/Application.tsx-Rrzo4ecT.js +522 -0
- data/public/easy_ml/assets/assets/entrypoints/Application.tsx-Rrzo4ecT.js.map +1 -0
- metadata +52 -12
- data/app/models/easy_ml/column/learners/base.rb +0 -103
- data/app/models/easy_ml/column/learners/boolean.rb +0 -11
- data/app/models/easy_ml/column/learners/categorical.rb +0 -51
- data/app/models/easy_ml/column/learners/datetime.rb +0 -19
- data/app/models/easy_ml/column/learners/null.rb +0 -22
- data/app/models/easy_ml/column/learners/numeric.rb +0 -33
- data/app/models/easy_ml/column/learners/string.rb +0 -15
- data/public/easy_ml/assets/assets/Application-B3sRjyMT.css +0 -1
- data/public/easy_ml/assets/assets/entrypoints/Application.tsx-Dfg-nTrB.js +0 -489
- data/public/easy_ml/assets/assets/entrypoints/Application.tsx-Dfg-nTrB.js.map +0 -1
@@ -0,0 +1,37 @@
|
|
1
|
+
module EasyML
  class Dataset
    class Learner
      class Eager
        # Per-column entry point for statistics that are computed against an
        # already-fetched frame. The query methods defined here contribute no
        # statistics; #adapter picks the class that handles a given column type.
        class Query < EasyML::Dataset::Learner::Query
          # Dispatches to the query method for +split+.
          #
          # @param split [#to_sym] :train or :data
          # @param df [Object] frame passed through to the query method
          # @return [Object, nil] the query result; nil for any other split
          def execute(split, df)
            requested = split.to_sym
            if requested == :train
              train_query(df)
            elsif requested == :data
              full_dataset_query(df)
            end
          end

          # Statistics learned from the training split; empty here.
          def train_query(df)
            {}
          end

          # Statistics learned from the full dataset; empty here.
          def full_dataset_query(df)
            {}
          end

          # Chooses the eager adapter class from the raw dtype, falling back to
          # the resolved dtype when no raw dtype is set.
          #
          # @return [Class, nil] nil when the type has no eager statistics
          def adapter
            kind = raw_dtype&.to_sym || dtype.to_sym
            if kind == :categorical
              Eager::Categorical
            elsif kind == :boolean
              Eager::Boolean
            end
          end
        end
      end
    end
  end
end
|
@@ -0,0 +1,43 @@
|
|
1
|
+
module EasyML
|
2
|
+
class Dataset
|
3
|
+
class Learner
|
4
|
+
class Eager < Base
|
5
|
+
# Learns statistics for every entry in +types+.
#
# For each type the train-split statistics are deep-merged (in place) with
# the full-dataset statistics, the latter winning on conflicting keys.
#
# @return [Hash] type => merged statistics
def learn
  types.each_with_object({}) do |type, learned|
    train_stats = learn_using_split(:train, type)
    learned[type] = train_stats.deep_merge!(learn_using_split(:data, type))
  end
end
|
12
|
+
|
13
|
+
private
|
14
|
+
|
15
|
+
# Learns the eager statistics for one split of one dataset segment.
#
# @param split [Symbol] split to read (:train or :data)
# @param type [Symbol] name of the dataset accessor to learn from
# @return [Hash] empty when the segment is empty or nothing was learned
def learn_using_split(split, type)
  segment = @dataset.send(type)
  if segment.empty?
    {}
  else
    execute_queries(split, type) || {}
  end
end
|
20
|
+
|
21
|
+
# Reads +split+ from the dataset segment named by +type+, asking for all
# columns.
#
# @param split [Symbol] method invoked on the segment
# @param type [Symbol] name of the dataset accessor
# @return [Object] whatever the segment returns for that split
def fetch_df(split, type)
  segment = @dataset.send(type)
  segment.send(split, all_columns: true)
end
|
24
|
+
|
25
|
+
# Runs the eager adapter of every eligible column against one split.
#
# The frame is fetched at most once, and only when the first column with an
# adapter is reached; columns that are skipped or have no adapter never
# trigger the fetch.
#
# @param split [Symbol] split to read
# @param type [Symbol] name of the dataset accessor
# @return [Hash] column name => statistics returned by that column's adapter
def execute_queries(split, type)
  @fetched = nil
  results = {}

  columns.each do |column|
    next if skip_processing?(column, type)

    adapter_class = Eager::Query.new(@dataset, column).adapter
    next unless adapter_class.present?

    @fetched ||= fetch_df(split, type)
    results[column.name] = adapter_class.new(@dataset, column).execute(split, @fetched)
  end

  results
end
|
40
|
+
end
|
41
|
+
end
|
42
|
+
end
|
43
|
+
end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
module EasyML
  class Dataset
    class Learner
      class Lazy
        # Lazy statistics for datetime columns: the inherited full-dataset
        # expressions plus a distinct-value count.
        class Datetime < Query
          # @return [Array] inherited expressions followed by the unique count
          def full_dataset_query
            inherited = super
            inherited.push(unique_count)
          end

          # Expression counting distinct values, aliased "<column>__unique_count".
          def unique_count
            name = column.name
            Polars.col(name).n_unique.alias("#{name}__unique_count")
          end
        end
      end
    end
  end
end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
module EasyML
  class Dataset
    class Learner
      class Lazy
        # Lazy statistics for numeric columns: the inherited train-split
        # expressions plus mean, median, min, max and standard deviation.
        class Numeric < Query
          # @return [Array] inherited expressions followed by one expression per
          #   aggregate, each aliased "<column>__<aggregate>"
          def train_query
            inherited = super
            name = column.name
            aggregates = %i[mean median min max std].map do |aggregate|
              Polars.col(name).public_send(aggregate).alias("#{name}__#{aggregate}")
            end
            inherited.concat(aggregates)
          end
        end
      end
    end
  end
end
|
@@ -0,0 +1,69 @@
|
|
1
|
+
module EasyML
  class Dataset
    class Learner
      class Lazy
        # Builds the Polars expressions used to learn one column's statistics.
        # Every expression is aliased "<column>__<statistic>".
        class Query < EasyML::Dataset::Learner::Query
          # Chooses the lazy adapter class for the column's dtype.
          #
          # @return [Class]
          # @raise [RuntimeError] when the dtype has no adapter
          def adapter
            kind = dtype.to_sym
            return Lazy::Numeric if %i[float integer].include?(kind)
            return Lazy::String if %i[string text].include?(kind)
            return Lazy::Categorical if kind == :categorical
            return Lazy::Datetime if %i[datetime date].include?(kind)
            return Lazy::Boolean if kind == :boolean
            return Lazy::Null if kind == :null

            raise "Don't know how to learn from dtype: #{dtype}"
          end

          # Returns the expressions for +split+.
          #
          # @param split [#to_sym] :train or :data
          # @return [Array, nil] nil for any other split
          def execute(split)
            requested = split.to_sym
            if requested == :train
              train_query
            elsif requested == :data
              full_dataset_query
            end
          end

          private

          # Expressions evaluated over the full dataset.
          def full_dataset_query
            expressions = [num_rows, null_count]
            expressions.compact
          end

          # Expressions evaluated over the training split; last_value is
          # dropped when it is nil.
          def train_query
            expressions = [last_value, most_frequent_value]
            expressions.compact
          end

          def null_count
            name = column.name
            Polars.col(name).null_count.alias("#{name}__null_count")
          end

          def num_rows
            name = column.name
            Polars.col(name).len.alias("#{name}__num_rows")
          end

          # Mode of the non-null values (first one when several are returned).
          def most_frequent_value
            name = column.name
            non_null = Polars.col(name).filter(Polars.col(name).is_not_null)
            non_null.mode.first.alias("#{name}__most_frequent_value")
          end

          # First non-null value after sorting by the dataset's date column in
          # reverse order; nil when the dataset has no date column.
          def last_value
            date_column = dataset.date_column
            return unless date_column.present?

            name = column.name
            newest_first = Polars.col(name).sort_by(date_column.name, reverse: true, nulls_last: true)
            newest_first.filter(Polars.col(name).is_not_null).first.alias("#{name}__last_value")
          end
        end
      end
    end
  end
end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
module EasyML
  class Dataset
    class Learner
      class Lazy
        # Lazy statistics for string columns: the inherited full-dataset
        # expressions plus a distinct-value count.
        class String < Query
          # @return [Array] inherited expressions followed by the unique count
          def full_dataset_query
            inherited = super
            inherited.push(unique_count)
          end

          # Expression counting distinct values after casting to :str, aliased
          # "<column>__unique_count".
          def unique_count
            name = column.name
            Polars.col(name).cast(:str).n_unique.alias("#{name}__unique_count")
          end
        end
      end
    end
  end
end
|
@@ -0,0 +1,51 @@
|
|
1
|
+
module EasyML
|
2
|
+
class Dataset
|
3
|
+
class Learner
|
4
|
+
class Lazy < Base
|
5
|
+
# Learns statistics for every entry in +types+.
#
# For each type the train-split statistics are deep-merged (in place) with
# the full-dataset statistics, the latter winning on conflicting keys.
#
# @return [Hash] type => merged statistics
def learn
  types.each_with_object({}) do |type, learned|
    train_stats = learn_using_split(:train, type)
    learned[type] = train_stats.deep_merge!(learn_using_split(:data, type))
  end
end
|
13
|
+
|
14
|
+
private
|
15
|
+
|
16
|
+
# Learns the lazy statistics for one split of one dataset segment.
#
# @param split [Symbol] split to read (:train or :data)
# @param type [Symbol] name of the dataset accessor to learn from
# @return [Hash] column name => statistics; empty when the segment is empty
def learn_using_split(split, type)
  segment = @dataset.send(type)
  if segment.empty?
    {}
  else
    raw_results = run_queries(split, type)
    get_column_statistics(raw_results)
  end
end
|
21
|
+
|
22
|
+
# Evaluates every column expression for +split+ in a single lazy select.
#
# @param split [Symbol] split to read
# @param type [Symbol] name of the dataset accessor
# @return [Object] the collected result of the select
def run_queries(split, type)
  expressions = build_queries(split, type)
  lazy_frame = @dataset.send(type).send(split, all_columns: true, lazy: true)
  lazy_frame.select(expressions).collect
end
|
26
|
+
|
27
|
+
# Reshapes the result of the aggregate queries into a nested hash keyed by
# column name and then by statistic name.
#
# Each result column is expected to be aliased "<column>__<statistic>". The
# name is split on the LAST "__" so that a source column whose own name
# contains a double underscore (e.g. "user__id__null_count") is attributed
# to "user__id" rather than being truncated at the first separator.
#
# @param query_results [#columns, #[]] collected result; the statistic value
#   is read from `query_results[name][0]`
# @return [Hash{String => Hash{String => Object}}]
def get_column_statistics(query_results)
  query_results.columns.each_with_object({}) do |result_column, stats|
    column_name, separator, statistic_name = result_column.rpartition("__")
    # A name without any separator keeps its previous treatment: the whole
    # name is used for both the column and the statistic key.
    column_name = statistic_name if separator.empty?

    (stats[column_name] ||= {})[statistic_name] = query_results[result_column][0]
  end
end
|
38
|
+
|
39
|
+
# Collects the expressions of every eligible column for +split+.
#
# Skipped columns contribute nothing; a nil returned by an adapter is
# dropped from the result.
#
# @param split [Symbol] split the expressions are built for
# @param type [Symbol] passed to skip_processing? for each column
# @return [Array] flat list of expressions
def build_queries(split, type)
  per_column = columns.flat_map do |column|
    if skip_processing?(column, type)
      []
    else
      adapter_class = Lazy::Query.new(@dataset, column).adapter
      adapter_class.new(@dataset, column).execute(split)
    end
  end

  per_column.compact
end
|
48
|
+
end
|
49
|
+
end
|
50
|
+
end
|
51
|
+
end
|
@@ -0,0 +1,25 @@
|
|
1
|
+
module EasyML
  class Dataset
    class Learner
      # Base class for per-column statistic queries. Subclasses supply
      # #adapter, which names the object (or class) that actually runs the
      # query for the column's type.
      class Query
        attr_reader :dataset, :column, :dtype, :raw_dtype

        # @param dataset [Object] dataset the column belongs to
        # @param column [Object] column to learn; must respond to #datatype,
        #   #raw_dtype and #name
        # @raise [RuntimeError] when the column's type cannot be resolved; the
        #   underlying error stays available through Exception#cause
        def initialize(dataset, column)
          @dataset = dataset
          @column = column
          # TODO: LAZIFY THIS
          @dtype = column.datatype || EasyML::Data::PolarsColumn.determine_type(column.raw.data[column.name])
          @raw_dtype = column.raw_dtype
        rescue StandardError
          raise "Unable to find column #{column.name}. If this column is computed by a feature, you forgot to declare computes_columns"
        end

        # Runs the adapter's query for +split+.
        #
        # #adapter may return either a ready-to-use object or a class. A
        # class does not respond to #execute, so it is instantiated with this
        # query's dataset and column before being asked to execute.
        #
        # @param split [Symbol] split to query
        # @return [Object] whatever the adapter's #execute returns
        def execute(split)
          handler = adapter
          handler = handler.new(dataset, column) unless handler.respond_to?(:execute)
          handler.execute(split)
        end
      end
    end
  end
end
|
@@ -0,0 +1,100 @@
|
|
1
|
+
module EasyML
|
2
|
+
class Dataset
|
3
|
+
class Learner
|
4
|
+
include EasyML::Timing
|
5
|
+
attr_accessor :dataset, :columns, :type, :computed, :raw_columns, :statistics
|
6
|
+
|
7
|
+
# Selects the columns that will be learned.
#
# Starts from the dataset's reloaded columns that need learning, sorted by
# name, and keeps only those that are persisted and not empty.
#
# @param dataset [Object] dataset whose columns are learned
# @param type [Symbol] kind of statistics being learned (defaults to :raw)
def initialize(dataset, type: :raw)
  @dataset = dataset
  candidates = dataset.columns.reload.needs_learn.sort_by(&:name)

  # NOTE(review): nothing in this constructor assigns `computed`, so this
  # narrowing looks unreachable from here — confirm whether it is still needed.
  candidates = candidates.computed if computed

  @columns = candidates
  @columns = @columns.select { |column| column.persisted? && !column.empty? }
  @type = type
end
|
18
|
+
|
19
|
+
# Runs the learning pipeline in order: prepare the columns, learn their
# statistics, then save them.
#
# @return [Object] the result of the final (save) step
def learn
  %i[prepare learn_statistics save_statistics].map { |step| send(step) }.last
end
|
24
|
+
|
25
|
+
private
|
26
|
+
|
27
|
+
# Copies the learned statistics onto each column, stamps the learning
# metadata, bulk-upserts the columns and then refreshes feature lineage.
def save_statistics
  columns.each do |col|
    col.merge_statistics(statistics[col.name])
    col.set_sample_values

    learning_metadata = {
      learned_at: EasyML::Support::UTC.now,
      last_datasource_sha: col.dataset.last_datasource_sha,
      last_feature_sha: col.feature&.sha,
      is_learning: type == :raw,
    }
    col.assign_attributes(learning_metadata)
  end

  # NOTE(review): last_feature_sha is assigned above but is not part of this
  # update list — confirm whether it should be persisted here as well.
  updated_fields = %i[
    statistics
    learned_at
    sample_values
    last_datasource_sha
    is_learning
  ]
  EasyML::Column.import(columns, on_duplicate_key_update: { columns: updated_fields })
  dataset.columns.set_feature_lineage(columns)
end
|
48
|
+
|
49
|
+
measure_method_timing :save_statistics
|
50
|
+
|
51
|
+
# Learns (once) and memoizes the statistics for all columns.
#
# The lazy and eager learners both return `type => { column => stats }`;
# the two results are deep-merged and then pivoted into
# `column => { type => stats }` with indifferent access.
#
# When learning anything other than :raw, one-hot columns reuse their :raw
# statistics as their :processed statistics. A one-hot column for which
# nothing was learned is skipped instead of raising on a nil lookup.
#
# @return [Hash] the memoized statistics — the same object on the first
#   call and on every later call
def learn_statistics
  return @statistics if @statistics

  by_type = lazy_statistics.deep_merge!(eager_statistics)
  pivoted = by_type.each_with_object({}) do |(split_type, stat_group), by_column|
    stat_group.each do |column_name, value|
      (by_column[column_name] ||= {})[split_type] = value
    end
  end
  @statistics = pivoted.with_indifferent_access

  if type != :raw
    columns.select(&:one_hot?).each do |column|
      column_stats = @statistics[column.name]
      next if column_stats.nil?

      column_stats[:processed] = column_stats[:raw]
    end
  end

  # Return the hash explicitly so the first call matches the memoized calls.
  @statistics
end
|
69
|
+
|
70
|
+
measure_method_timing :learn_statistics
|
71
|
+
|
72
|
+
# Syncs each column with the dataset's raw schema before learning.
#
# Records whether the column exists in the raw dataset and, only when the
# column has no stored datatype, fills it in from the simplified schema.
# The changes are then bulk-upserted.
def prepare
  @schema = EasyML::Data::PolarsSchema.simplify(@dataset.raw_schema).symbolize_keys
  @raw_columns = @schema.keys.sort.map(&:to_s)

  columns.each do |column|
    updates = { in_raw_dataset: @raw_columns.include?(column.name) }

    unless column.read_attribute(:datatype).present?
      schema_type = @schema[column.name.to_sym]
      # Leave the datatype untouched when the schema has no entry either.
      updates[:datatype] = schema_type unless schema_type.nil?
    end

    column.assign_attributes(updates)
  end

  EasyML::Column.import(columns, on_duplicate_key_update: { columns: %i[in_raw_dataset datatype] })
end
|
84
|
+
|
85
|
+
measure_method_timing :prepare
|
86
|
+
|
87
|
+
# Statistics produced by the Lazy learner for this dataset, column set and type.
#
# @return [Object] the result of Lazy#learn
def lazy_statistics
  learner = Lazy.new(dataset, columns, type: type)
  learner.learn
end
|
90
|
+
|
91
|
+
measure_method_timing :lazy_statistics
|
92
|
+
|
93
|
+
# Statistics produced by the Eager learner for this dataset, column set and type.
#
# @return [Object] the result of Eager#learn
def eager_statistics
  learner = Eager.new(dataset, columns, type: type)
  learner.learn
end
|
96
|
+
|
97
|
+
measure_method_timing :eager_statistics
|
98
|
+
end
|
99
|
+
end
|
100
|
+
end
|