easy_ml 0.2.0.pre.rc85 → 0.2.0.pre.rc88
This diff shows the changes between two publicly available versions of this package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/app/controllers/easy_ml/datasets_controller.rb +18 -2
- data/app/frontend/components/dataset/PreprocessingConfig.tsx +523 -150
- data/app/frontend/pages/DatasetsPage.tsx +0 -1
- data/app/frontend/types/dataset.ts +5 -2
- data/app/models/easy_ml/column/imputers/base.rb +23 -2
- data/app/models/easy_ml/column/imputers/embedding_encoder.rb +18 -0
- data/app/models/easy_ml/column/imputers/imputer.rb +1 -0
- data/app/models/easy_ml/column/imputers/most_frequent.rb +1 -1
- data/app/models/easy_ml/column/imputers/one_hot_encoder.rb +1 -1
- data/app/models/easy_ml/column/imputers/ordinal_encoder.rb +1 -1
- data/app/models/easy_ml/column/imputers.rb +47 -41
- data/app/models/easy_ml/column/selector.rb +2 -2
- data/app/models/easy_ml/column.rb +260 -56
- data/app/models/easy_ml/column_history.rb +6 -0
- data/app/models/easy_ml/column_list.rb +30 -1
- data/app/models/easy_ml/dataset/learner/lazy/embedding.rb +10 -0
- data/app/models/easy_ml/dataset/learner/lazy/query.rb +2 -0
- data/app/models/easy_ml/dataset/learner.rb +11 -0
- data/app/models/easy_ml/dataset.rb +6 -19
- data/app/models/easy_ml/lineage_history.rb +17 -0
- data/app/models/easy_ml/model.rb +11 -1
- data/app/models/easy_ml/models/xgboost.rb +37 -7
- data/app/models/easy_ml/pca_model.rb +21 -0
- data/app/models/easy_ml/prediction.rb +2 -1
- data/app/serializers/easy_ml/column_serializer.rb +13 -1
- data/config/initializers/inflections.rb +1 -0
- data/lib/easy_ml/data/dataset_manager/writer/append_only.rb +6 -8
- data/lib/easy_ml/data/dataset_manager/writer/base.rb +15 -2
- data/lib/easy_ml/data/dataset_manager/writer/partitioned.rb +0 -1
- data/lib/easy_ml/data/dataset_manager/writer.rb +2 -0
- data/lib/easy_ml/data/embeddings/compressor.rb +179 -0
- data/lib/easy_ml/data/embeddings/embedder.rb +226 -0
- data/lib/easy_ml/data/embeddings.rb +61 -0
- data/lib/easy_ml/data/polars_column.rb +3 -0
- data/lib/easy_ml/data/polars_reader.rb +54 -23
- data/lib/easy_ml/data/polars_schema.rb +28 -2
- data/lib/easy_ml/data/splits/file_split.rb +7 -2
- data/lib/easy_ml/data.rb +1 -0
- data/lib/easy_ml/embedding_store.rb +92 -0
- data/lib/easy_ml/engine.rb +4 -2
- data/lib/easy_ml/predict.rb +42 -20
- data/lib/easy_ml/railtie/generators/migration/migration_generator.rb +5 -0
- data/lib/easy_ml/railtie/templates/migration/add_is_primary_key_to_easy_ml_columns.rb.tt +9 -0
- data/lib/easy_ml/railtie/templates/migration/add_metadata_to_easy_ml_predictions.rb.tt +6 -0
- data/lib/easy_ml/railtie/templates/migration/add_pca_model_id_to_easy_ml_columns.rb.tt +9 -0
- data/lib/easy_ml/railtie/templates/migration/add_workflow_status_to_easy_ml_dataset_histories.rb.tt +13 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_pca_models.rb.tt +14 -0
- data/lib/easy_ml/version.rb +1 -1
- data/lib/easy_ml.rb +1 -0
- data/public/easy_ml/assets/.vite/manifest.json +2 -2
- data/public/easy_ml/assets/assets/Application-DfPoyRr8.css +1 -0
- data/public/easy_ml/assets/assets/entrypoints/Application.tsx-KENNRQpC.js +533 -0
- data/public/easy_ml/assets/assets/entrypoints/Application.tsx-KENNRQpC.js.map +1 -0
- metadata +59 -6
- data/lib/tasks/profile.rake +0 -40
- data/public/easy_ml/assets/assets/Application-nnn_XLuL.css +0 -1
- data/public/easy_ml/assets/assets/entrypoints/Application.tsx-CD8voxfL.js +0 -522
- data/public/easy_ml/assets/assets/entrypoints/Application.tsx-CD8voxfL.js.map +0 -1
@@ -15,7 +15,6 @@ interface Props {
|
|
15
15
|
const ITEMS_PER_PAGE = 6;
|
16
16
|
|
17
17
|
export default function DatasetsPage({ datasets }: Props) {
|
18
|
-
console.log(`change`)
|
19
18
|
const { rootPath } = usePage().props;
|
20
19
|
const [searchQuery, setSearchQuery] = useState('');
|
21
20
|
const [currentPage, setCurrentPage] = useState(1);
|
@@ -37,6 +37,7 @@ export type PreprocessingStep = {
|
|
37
37
|
| "categorical"
|
38
38
|
| "constant"
|
39
39
|
| "today";
|
40
|
+
encoding?: "one_hot" | "ordinal" | "embedding" | null;
|
40
41
|
params: {
|
41
42
|
value?: number;
|
42
43
|
constant?: string;
|
@@ -45,8 +46,10 @@ export type PreprocessingStep = {
|
|
45
46
|
min?: number;
|
46
47
|
max?: number;
|
47
48
|
};
|
48
|
-
|
49
|
-
|
49
|
+
llm?: string;
|
50
|
+
model?: string;
|
51
|
+
dimensions?: number;
|
52
|
+
preset?: string;
|
50
53
|
};
|
51
54
|
};
|
52
55
|
|
@@ -15,6 +15,12 @@ module EasyML
|
|
15
15
|
Imputers.methods_by_class[self] << m.to_sym
|
16
16
|
end
|
17
17
|
|
18
|
+
def encoding_applies(e)
|
19
|
+
Imputers.supported_encodings << e.to_sym
|
20
|
+
Imputers.encodings_by_class[self] ||= []
|
21
|
+
Imputers.encodings_by_class[self] << e.to_sym
|
22
|
+
end
|
23
|
+
|
18
24
|
def description
|
19
25
|
"Unknown preprocessing method"
|
20
26
|
end
|
@@ -32,7 +38,7 @@ module EasyML
|
|
32
38
|
end
|
33
39
|
|
34
40
|
def applies?
|
35
|
-
method_applies? || param_applies?
|
41
|
+
method_applies? || param_applies? || encoding_applies?
|
36
42
|
end
|
37
43
|
|
38
44
|
def method_applies?
|
@@ -43,6 +49,12 @@ module EasyML
|
|
43
49
|
params.keys.any? { |p| imputers_own_params.include?(p.to_sym) && params[p] != false }
|
44
50
|
end
|
45
51
|
|
52
|
+
def encoding_applies?
|
53
|
+
return false unless encoding.present?
|
54
|
+
|
55
|
+
imputers_own_encodings.include?(encoding.to_sym)
|
56
|
+
end
|
57
|
+
|
46
58
|
def imputers_own_methods
|
47
59
|
Imputers.methods_by_class[self.class] || []
|
48
60
|
end
|
@@ -51,6 +63,10 @@ module EasyML
|
|
51
63
|
Imputers.params_by_class[self.class] || []
|
52
64
|
end
|
53
65
|
|
66
|
+
def imputers_own_encodings
|
67
|
+
Imputers.encodings_by_class[self.class] || []
|
68
|
+
end
|
69
|
+
|
54
70
|
def params
|
55
71
|
@preprocessing_step.dig(:params)
|
56
72
|
end
|
@@ -59,6 +75,10 @@ module EasyML
|
|
59
75
|
@preprocessing_step.dig(:method)
|
60
76
|
end
|
61
77
|
|
78
|
+
def encoding
|
79
|
+
@preprocessing_step.dig(:encoding)
|
80
|
+
end
|
81
|
+
|
62
82
|
def statistics(*args)
|
63
83
|
if column.is_computed
|
64
84
|
column.statistics.dig(:processed, *args)
|
@@ -74,8 +94,9 @@ module EasyML
|
|
74
94
|
def inspect
|
75
95
|
params_str = params ? params.map { |k, v| "#{k}: #{v}" }.join(", ") : "none"
|
76
96
|
method_str = method ? method : "none"
|
97
|
+
encoding_str = encoding ? encoding : "none"
|
77
98
|
|
78
|
-
"#<#{self.class.name} method=#{method_str.inspect} params={#{params_str}}>"
|
99
|
+
"#<#{self.class.name} method=#{method_str.inspect} encoding=#{encoding_str.inspect} params={#{params_str}}>"
|
79
100
|
end
|
80
101
|
|
81
102
|
alias_method :to_s, :inspect
|
@@ -0,0 +1,18 @@
|
|
1
|
+
module EasyML
|
2
|
+
class Column
|
3
|
+
class Imputers
|
4
|
+
class EmbeddingEncoder < Base
|
5
|
+
encoding_applies :embedding
|
6
|
+
|
7
|
+
def self.description
|
8
|
+
"Generate embeddings"
|
9
|
+
end
|
10
|
+
|
11
|
+
def transform(df)
|
12
|
+
df = column.embed(df)
|
13
|
+
df
|
14
|
+
end
|
15
|
+
end
|
16
|
+
end
|
17
|
+
end
|
18
|
+
end
|
@@ -13,7 +13,7 @@ module EasyML
|
|
13
13
|
|
14
14
|
most_frequent = statistics(:most_frequent_value)
|
15
15
|
df = df.with_column(
|
16
|
-
Polars.col(column.name).fill_null(most_frequent).alias(column.name)
|
16
|
+
Polars.col(column.name).fill_null(Polars.lit(most_frequent).cast(column.polars_datatype)).alias(column.name)
|
17
17
|
)
|
18
18
|
df
|
19
19
|
end
|
@@ -5,56 +5,50 @@ module EasyML
|
|
5
5
|
|
6
6
|
ALLOWED_PARAMS = {
|
7
7
|
constant: [:constant],
|
8
|
-
categorical: %i[categorical_min
|
9
|
-
most_frequent:
|
8
|
+
categorical: %i[categorical_min],
|
9
|
+
most_frequent: [],
|
10
|
+
embedding: %i[llm model preset dimensions],
|
10
11
|
mean: [:clip],
|
11
12
|
median: [:clip],
|
12
13
|
}
|
13
14
|
|
15
|
+
LABELS = {
|
16
|
+
ffill: "Forward Fill",
|
17
|
+
categorical: "Categorical",
|
18
|
+
mean: "Mean",
|
19
|
+
median: "Median",
|
20
|
+
constant: "Constant Value",
|
21
|
+
most_frequent: "Most Frequent",
|
22
|
+
today: "Current Date",
|
23
|
+
}
|
24
|
+
|
14
25
|
PREPROCESSING_STRATEGIES = {
|
15
|
-
float:
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
{
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
],
|
37
|
-
string: [
|
38
|
-
{ value: "ffill", label: "Forward Fill" },
|
39
|
-
{ value: "most_frequent", label: "Most Frequent" },
|
40
|
-
{ value: "constant", label: "Constant Value" },
|
41
|
-
],
|
42
|
-
text: [
|
43
|
-
{ value: "ffill", label: "Forward Fill" },
|
44
|
-
{ value: "most_frequent", label: "Most Frequent" },
|
45
|
-
{ value: "constant", label: "Constant Value" },
|
46
|
-
],
|
47
|
-
categorical: [
|
48
|
-
{ value: "ffill", label: "Forward Fill" },
|
49
|
-
{ value: "categorical", label: "Categorical" },
|
50
|
-
{ value: "most_frequent", label: "Most Frequent" },
|
51
|
-
{ value: "constant", label: "Constant Value" },
|
52
|
-
],
|
53
|
-
}.freeze
|
26
|
+
float: %w(most_frequent ffill mean median constant),
|
27
|
+
integer: %w(most_frequent ffill mean median constant),
|
28
|
+
boolean: %w(ffill most_frequent constant),
|
29
|
+
datetime: %w(ffill today constant),
|
30
|
+
string: %w(ffill most_frequent constant),
|
31
|
+
text: %w(ffill most_frequent constant),
|
32
|
+
categorical: %w(ffill categorical most_frequent constant),
|
33
|
+
}.transform_values do |strategies|
|
34
|
+
strategies.map do |strategy|
|
35
|
+
{
|
36
|
+
value: strategy,
|
37
|
+
label: LABELS[strategy.to_sym],
|
38
|
+
}
|
39
|
+
end
|
40
|
+
end
|
41
|
+
|
42
|
+
ENCODING_STRATEGIES = {
|
43
|
+
categorical: %w(embedding one_hot ordinal),
|
44
|
+
string: %w(embedding),
|
45
|
+
text: %w(embedding),
|
46
|
+
}
|
54
47
|
|
55
48
|
def self.constants
|
56
49
|
{
|
57
50
|
preprocessing_strategies: PREPROCESSING_STRATEGIES,
|
51
|
+
encoding_strategies: ENCODING_STRATEGIES,
|
58
52
|
}
|
59
53
|
end
|
60
54
|
|
@@ -66,6 +60,10 @@ module EasyML
|
|
66
60
|
@methods_by_class ||= {}
|
67
61
|
end
|
68
62
|
|
63
|
+
def self.encodings_by_class
|
64
|
+
@encodings_by_class ||= {}
|
65
|
+
end
|
66
|
+
|
69
67
|
def self.supported_params
|
70
68
|
@supported_params ||= []
|
71
69
|
end
|
@@ -74,6 +72,10 @@ module EasyML
|
|
74
72
|
@supported_methods ||= []
|
75
73
|
end
|
76
74
|
|
75
|
+
def self.supported_encodings
|
76
|
+
@supported_encodings ||= []
|
77
|
+
end
|
78
|
+
|
77
79
|
def initialize(column, imputers: [])
|
78
80
|
@column = column
|
79
81
|
@dataset = column.dataset
|
@@ -88,6 +90,10 @@ module EasyML
|
|
88
90
|
def supported_methods
|
89
91
|
@supported_methods ||= []
|
90
92
|
end
|
93
|
+
|
94
|
+
def supported_encodings
|
95
|
+
@supported_encodings ||= []
|
96
|
+
end
|
91
97
|
end
|
92
98
|
|
93
99
|
def imputers
|
@@ -62,8 +62,8 @@ module EasyML
|
|
62
62
|
kwargs[:select] = []
|
63
63
|
end
|
64
64
|
|
65
|
-
if (selected == :processed || (selected.nil? && !dataset.needs_refresh?)) && column.
|
66
|
-
kwargs[:select] << column.
|
65
|
+
if (selected == :processed || (selected.nil? && !dataset.needs_refresh?)) && column.has_virtual_columns?
|
66
|
+
kwargs[:select] << column.aliases
|
67
67
|
else
|
68
68
|
kwargs[:select] << column.name
|
69
69
|
end
|