easy_ml 0.2.0.pre.rc84 → 0.2.0.pre.rc88

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58)
  1. checksums.yaml +4 -4
  2. data/app/controllers/easy_ml/datasets_controller.rb +19 -3
  3. data/app/frontend/components/dataset/PreprocessingConfig.tsx +523 -150
  4. data/app/frontend/types/dataset.ts +5 -2
  5. data/app/models/easy_ml/column/imputers/base.rb +23 -2
  6. data/app/models/easy_ml/column/imputers/embedding_encoder.rb +18 -0
  7. data/app/models/easy_ml/column/imputers/imputer.rb +1 -0
  8. data/app/models/easy_ml/column/imputers/most_frequent.rb +1 -1
  9. data/app/models/easy_ml/column/imputers/one_hot_encoder.rb +1 -1
  10. data/app/models/easy_ml/column/imputers/ordinal_encoder.rb +1 -1
  11. data/app/models/easy_ml/column/imputers.rb +47 -41
  12. data/app/models/easy_ml/column/selector.rb +2 -2
  13. data/app/models/easy_ml/column.rb +260 -56
  14. data/app/models/easy_ml/column_history.rb +6 -0
  15. data/app/models/easy_ml/column_list.rb +30 -1
  16. data/app/models/easy_ml/dataset/learner/lazy/embedding.rb +10 -0
  17. data/app/models/easy_ml/dataset/learner/lazy/query.rb +2 -0
  18. data/app/models/easy_ml/dataset/learner.rb +11 -0
  19. data/app/models/easy_ml/dataset.rb +6 -19
  20. data/app/models/easy_ml/lineage_history.rb +17 -0
  21. data/app/models/easy_ml/model.rb +11 -1
  22. data/app/models/easy_ml/models/xgboost.rb +37 -7
  23. data/app/models/easy_ml/pca_model.rb +21 -0
  24. data/app/models/easy_ml/prediction.rb +2 -1
  25. data/app/serializers/easy_ml/column_serializer.rb +13 -1
  26. data/config/initializers/inflections.rb +1 -0
  27. data/lib/easy_ml/data/dataset_manager/writer/append_only.rb +6 -8
  28. data/lib/easy_ml/data/dataset_manager/writer/base.rb +15 -2
  29. data/lib/easy_ml/data/dataset_manager/writer/partitioned.rb +0 -1
  30. data/lib/easy_ml/data/dataset_manager/writer.rb +2 -0
  31. data/lib/easy_ml/data/embeddings/compressor.rb +179 -0
  32. data/lib/easy_ml/data/embeddings/embedder.rb +226 -0
  33. data/lib/easy_ml/data/embeddings.rb +61 -0
  34. data/lib/easy_ml/data/polars_column.rb +3 -0
  35. data/lib/easy_ml/data/polars_reader.rb +54 -23
  36. data/lib/easy_ml/data/polars_schema.rb +28 -2
  37. data/lib/easy_ml/data/splits/file_split.rb +7 -2
  38. data/lib/easy_ml/data.rb +1 -0
  39. data/lib/easy_ml/embedding_store.rb +92 -0
  40. data/lib/easy_ml/engine.rb +4 -2
  41. data/lib/easy_ml/predict.rb +42 -20
  42. data/lib/easy_ml/railtie/generators/migration/migration_generator.rb +5 -0
  43. data/lib/easy_ml/railtie/templates/migration/add_is_primary_key_to_easy_ml_columns.rb.tt +9 -0
  44. data/lib/easy_ml/railtie/templates/migration/add_metadata_to_easy_ml_predictions.rb.tt +6 -0
  45. data/lib/easy_ml/railtie/templates/migration/add_pca_model_id_to_easy_ml_columns.rb.tt +9 -0
  46. data/lib/easy_ml/railtie/templates/migration/add_workflow_status_to_easy_ml_dataset_histories.rb.tt +13 -0
  47. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_pca_models.rb.tt +14 -0
  48. data/lib/easy_ml/version.rb +1 -1
  49. data/lib/easy_ml.rb +1 -0
  50. data/public/easy_ml/assets/.vite/manifest.json +2 -2
  51. data/public/easy_ml/assets/assets/Application-DfPoyRr8.css +1 -0
  52. data/public/easy_ml/assets/assets/entrypoints/Application.tsx-KENNRQpC.js +533 -0
  53. data/public/easy_ml/assets/assets/entrypoints/Application.tsx-KENNRQpC.js.map +1 -0
  54. metadata +59 -6
  55. data/lib/tasks/profile.rake +0 -40
  56. data/public/easy_ml/assets/assets/Application-nnn_XLuL.css +0 -1
  57. data/public/easy_ml/assets/assets/entrypoints/Application.tsx-Bbf3mD_b.js +0 -522
  58. data/public/easy_ml/assets/assets/entrypoints/Application.tsx-Bbf3mD_b.js.map +0 -1
@@ -37,6 +37,7 @@ export type PreprocessingStep = {
37
37
  | "categorical"
38
38
  | "constant"
39
39
  | "today";
40
+ encoding?: "one_hot" | "ordinal" | "embedding" | null;
40
41
  params: {
41
42
  value?: number;
42
43
  constant?: string;
@@ -45,8 +46,10 @@ export type PreprocessingStep = {
45
46
  min?: number;
46
47
  max?: number;
47
48
  };
48
- one_hot?: boolean;
49
- ordinal_encoding?: boolean;
49
+ llm?: string;
50
+ model?: string;
51
+ dimensions?: number;
52
+ preset?: string;
50
53
  };
51
54
  };
52
55
 
@@ -15,6 +15,12 @@ module EasyML
15
15
  Imputers.methods_by_class[self] << m.to_sym
16
16
  end
17
17
 
18
+ def encoding_applies(e)
19
+ Imputers.supported_encodings << e.to_sym
20
+ Imputers.encodings_by_class[self] ||= []
21
+ Imputers.encodings_by_class[self] << e.to_sym
22
+ end
23
+
18
24
  def description
19
25
  "Unknown preprocessing method"
20
26
  end
@@ -32,7 +38,7 @@ module EasyML
32
38
  end
33
39
 
34
40
  def applies?
35
- method_applies? || param_applies?
41
+ method_applies? || param_applies? || encoding_applies?
36
42
  end
37
43
 
38
44
  def method_applies?
@@ -43,6 +49,12 @@ module EasyML
43
49
  params.keys.any? { |p| imputers_own_params.include?(p.to_sym) && params[p] != false }
44
50
  end
45
51
 
52
+ def encoding_applies?
53
+ return false unless encoding.present?
54
+
55
+ imputers_own_encodings.include?(encoding.to_sym)
56
+ end
57
+
46
58
  def imputers_own_methods
47
59
  Imputers.methods_by_class[self.class] || []
48
60
  end
@@ -51,6 +63,10 @@ module EasyML
51
63
  Imputers.params_by_class[self.class] || []
52
64
  end
53
65
 
66
+ def imputers_own_encodings
67
+ Imputers.encodings_by_class[self.class] || []
68
+ end
69
+
54
70
  def params
55
71
  @preprocessing_step.dig(:params)
56
72
  end
@@ -59,6 +75,10 @@ module EasyML
59
75
  @preprocessing_step.dig(:method)
60
76
  end
61
77
 
78
+ def encoding
79
+ @preprocessing_step.dig(:encoding)
80
+ end
81
+
62
82
  def statistics(*args)
63
83
  if column.is_computed
64
84
  column.statistics.dig(:processed, *args)
@@ -74,8 +94,9 @@ module EasyML
74
94
  def inspect
75
95
  params_str = params ? params.map { |k, v| "#{k}: #{v}" }.join(", ") : "none"
76
96
  method_str = method ? method : "none"
97
+ encoding_str = encoding ? encoding : "none"
77
98
 
78
- "#<#{self.class.name} method=#{method_str.inspect} params={#{params_str}}>"
99
+ "#<#{self.class.name} method=#{method_str.inspect} encoding=#{encoding_str.inspect} params={#{params_str}}>"
79
100
  end
80
101
 
81
102
  alias_method :to_s, :inspect
@@ -0,0 +1,18 @@
1
+ module EasyML
2
+ class Column
3
+ class Imputers
4
+ class EmbeddingEncoder < Base
5
+ encoding_applies :embedding
6
+
7
+ def self.description
8
+ "Generate embeddings"
9
+ end
10
+
11
+ def transform(df)
12
+ df = column.embed(df)
13
+ df
14
+ end
15
+ end
16
+ end
17
+ end
18
+ end
@@ -31,6 +31,7 @@ module EasyML
31
31
  Today,
32
32
  OneHotEncoder,
33
33
  OrdinalEncoder,
34
+ EmbeddingEncoder,
34
35
  ]
35
36
  end
36
37
 
@@ -13,7 +13,7 @@ module EasyML
13
13
 
14
14
  most_frequent = statistics(:most_frequent_value)
15
15
  df = df.with_column(
16
- Polars.col(column.name).fill_null(most_frequent).alias(column.name)
16
+ Polars.col(column.name).fill_null(Polars.lit(most_frequent).cast(column.polars_datatype)).alias(column.name)
17
17
  )
18
18
  df
19
19
  end
@@ -2,7 +2,7 @@ module EasyML
2
2
  class Column
3
3
  class Imputers
4
4
  class OneHotEncoder < Base
5
- param_applies :one_hot
5
+ encoding_applies :one_hot
6
6
 
7
7
  def self.description
8
8
  "One-hot encoder"
@@ -2,7 +2,7 @@ module EasyML
2
2
  class Column
3
3
  class Imputers
4
4
  class OrdinalEncoder < Base
5
- param_applies :ordinal_encoding
5
+ encoding_applies :ordinal
6
6
 
7
7
  def self.description
8
8
  "Ordinal encoder"
@@ -5,56 +5,50 @@ module EasyML
5
5
 
6
6
  ALLOWED_PARAMS = {
7
7
  constant: [:constant],
8
- categorical: %i[categorical_min one_hot ordinal_encoding],
9
- most_frequent: %i[one_hot ordinal_encoding],
8
+ categorical: %i[categorical_min],
9
+ most_frequent: [],
10
+ embedding: %i[llm model preset dimensions],
10
11
  mean: [:clip],
11
12
  median: [:clip],
12
13
  }
13
14
 
15
+ LABELS = {
16
+ ffill: "Forward Fill",
17
+ categorical: "Categorical",
18
+ mean: "Mean",
19
+ median: "Median",
20
+ constant: "Constant Value",
21
+ most_frequent: "Most Frequent",
22
+ today: "Current Date",
23
+ }
24
+
14
25
  PREPROCESSING_STRATEGIES = {
15
- float: [
16
- { value: "ffill", label: "Forward Fill" },
17
- { value: "mean", label: "Mean" },
18
- { value: "median", label: "Median" },
19
- { value: "constant", label: "Constant Value" },
20
- ],
21
- integer: [
22
- { value: "ffill", label: "Forward Fill" },
23
- { value: "mean", label: "Mean" },
24
- { value: "median", label: "Median" },
25
- { value: "constant", label: "Constant Value" },
26
- ],
27
- boolean: [
28
- { value: "ffill", label: "Forward Fill" },
29
- { value: "most_frequent", label: "Most Frequent" },
30
- { value: "constant", label: "Constant Value" },
31
- ],
32
- datetime: [
33
- { value: "ffill", label: "Forward Fill" },
34
- { value: "constant", label: "Constant Value" },
35
- { value: "today", label: "Current Date" },
36
- ],
37
- string: [
38
- { value: "ffill", label: "Forward Fill" },
39
- { value: "most_frequent", label: "Most Frequent" },
40
- { value: "constant", label: "Constant Value" },
41
- ],
42
- text: [
43
- { value: "ffill", label: "Forward Fill" },
44
- { value: "most_frequent", label: "Most Frequent" },
45
- { value: "constant", label: "Constant Value" },
46
- ],
47
- categorical: [
48
- { value: "ffill", label: "Forward Fill" },
49
- { value: "categorical", label: "Categorical" },
50
- { value: "most_frequent", label: "Most Frequent" },
51
- { value: "constant", label: "Constant Value" },
52
- ],
53
- }.freeze
26
+ float: %w(most_frequent ffill mean median constant),
27
+ integer: %w(most_frequent ffill mean median constant),
28
+ boolean: %w(ffill most_frequent constant),
29
+ datetime: %w(ffill today constant),
30
+ string: %w(ffill most_frequent constant),
31
+ text: %w(ffill most_frequent constant),
32
+ categorical: %w(ffill categorical most_frequent constant),
33
+ }.transform_values do |strategies|
34
+ strategies.map do |strategy|
35
+ {
36
+ value: strategy,
37
+ label: LABELS[strategy.to_sym],
38
+ }
39
+ end
40
+ end
41
+
42
+ ENCODING_STRATEGIES = {
43
+ categorical: %w(embedding one_hot ordinal),
44
+ string: %w(embedding),
45
+ text: %w(embedding),
46
+ }
54
47
 
55
48
  def self.constants
56
49
  {
57
50
  preprocessing_strategies: PREPROCESSING_STRATEGIES,
51
+ encoding_strategies: ENCODING_STRATEGIES,
58
52
  }
59
53
  end
60
54
 
@@ -66,6 +60,10 @@ module EasyML
66
60
  @methods_by_class ||= {}
67
61
  end
68
62
 
63
+ def self.encodings_by_class
64
+ @encodings_by_class ||= {}
65
+ end
66
+
69
67
  def self.supported_params
70
68
  @supported_params ||= []
71
69
  end
@@ -74,6 +72,10 @@ module EasyML
74
72
  @supported_methods ||= []
75
73
  end
76
74
 
75
+ def self.supported_encodings
76
+ @supported_encodings ||= []
77
+ end
78
+
77
79
  def initialize(column, imputers: [])
78
80
  @column = column
79
81
  @dataset = column.dataset
@@ -88,6 +90,10 @@ module EasyML
88
90
  def supported_methods
89
91
  @supported_methods ||= []
90
92
  end
93
+
94
+ def supported_encodings
95
+ @supported_encodings ||= []
96
+ end
91
97
  end
92
98
 
93
99
  def imputers
@@ -62,8 +62,8 @@ module EasyML
62
62
  kwargs[:select] = []
63
63
  end
64
64
 
65
- if (selected == :processed || (selected.nil? && !dataset.needs_refresh?)) && column.one_hot?
66
- kwargs[:select] << column.virtual_columns
65
+ if (selected == :processed || (selected.nil? && !dataset.needs_refresh?)) && column.has_virtual_columns?
66
+ kwargs[:select] << column.aliases
67
67
  else
68
68
  kwargs[:select] << column.name
69
69
  end