easy_ml 0.2.0.pre.rc85 → 0.2.0.pre.rc88

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59)
  1. checksums.yaml +4 -4
  2. data/app/controllers/easy_ml/datasets_controller.rb +18 -2
  3. data/app/frontend/components/dataset/PreprocessingConfig.tsx +523 -150
  4. data/app/frontend/pages/DatasetsPage.tsx +0 -1
  5. data/app/frontend/types/dataset.ts +5 -2
  6. data/app/models/easy_ml/column/imputers/base.rb +23 -2
  7. data/app/models/easy_ml/column/imputers/embedding_encoder.rb +18 -0
  8. data/app/models/easy_ml/column/imputers/imputer.rb +1 -0
  9. data/app/models/easy_ml/column/imputers/most_frequent.rb +1 -1
  10. data/app/models/easy_ml/column/imputers/one_hot_encoder.rb +1 -1
  11. data/app/models/easy_ml/column/imputers/ordinal_encoder.rb +1 -1
  12. data/app/models/easy_ml/column/imputers.rb +47 -41
  13. data/app/models/easy_ml/column/selector.rb +2 -2
  14. data/app/models/easy_ml/column.rb +260 -56
  15. data/app/models/easy_ml/column_history.rb +6 -0
  16. data/app/models/easy_ml/column_list.rb +30 -1
  17. data/app/models/easy_ml/dataset/learner/lazy/embedding.rb +10 -0
  18. data/app/models/easy_ml/dataset/learner/lazy/query.rb +2 -0
  19. data/app/models/easy_ml/dataset/learner.rb +11 -0
  20. data/app/models/easy_ml/dataset.rb +6 -19
  21. data/app/models/easy_ml/lineage_history.rb +17 -0
  22. data/app/models/easy_ml/model.rb +11 -1
  23. data/app/models/easy_ml/models/xgboost.rb +37 -7
  24. data/app/models/easy_ml/pca_model.rb +21 -0
  25. data/app/models/easy_ml/prediction.rb +2 -1
  26. data/app/serializers/easy_ml/column_serializer.rb +13 -1
  27. data/config/initializers/inflections.rb +1 -0
  28. data/lib/easy_ml/data/dataset_manager/writer/append_only.rb +6 -8
  29. data/lib/easy_ml/data/dataset_manager/writer/base.rb +15 -2
  30. data/lib/easy_ml/data/dataset_manager/writer/partitioned.rb +0 -1
  31. data/lib/easy_ml/data/dataset_manager/writer.rb +2 -0
  32. data/lib/easy_ml/data/embeddings/compressor.rb +179 -0
  33. data/lib/easy_ml/data/embeddings/embedder.rb +226 -0
  34. data/lib/easy_ml/data/embeddings.rb +61 -0
  35. data/lib/easy_ml/data/polars_column.rb +3 -0
  36. data/lib/easy_ml/data/polars_reader.rb +54 -23
  37. data/lib/easy_ml/data/polars_schema.rb +28 -2
  38. data/lib/easy_ml/data/splits/file_split.rb +7 -2
  39. data/lib/easy_ml/data.rb +1 -0
  40. data/lib/easy_ml/embedding_store.rb +92 -0
  41. data/lib/easy_ml/engine.rb +4 -2
  42. data/lib/easy_ml/predict.rb +42 -20
  43. data/lib/easy_ml/railtie/generators/migration/migration_generator.rb +5 -0
  44. data/lib/easy_ml/railtie/templates/migration/add_is_primary_key_to_easy_ml_columns.rb.tt +9 -0
  45. data/lib/easy_ml/railtie/templates/migration/add_metadata_to_easy_ml_predictions.rb.tt +6 -0
  46. data/lib/easy_ml/railtie/templates/migration/add_pca_model_id_to_easy_ml_columns.rb.tt +9 -0
  47. data/lib/easy_ml/railtie/templates/migration/add_workflow_status_to_easy_ml_dataset_histories.rb.tt +13 -0
  48. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_pca_models.rb.tt +14 -0
  49. data/lib/easy_ml/version.rb +1 -1
  50. data/lib/easy_ml.rb +1 -0
  51. data/public/easy_ml/assets/.vite/manifest.json +2 -2
  52. data/public/easy_ml/assets/assets/Application-DfPoyRr8.css +1 -0
  53. data/public/easy_ml/assets/assets/entrypoints/Application.tsx-KENNRQpC.js +533 -0
  54. data/public/easy_ml/assets/assets/entrypoints/Application.tsx-KENNRQpC.js.map +1 -0
  55. metadata +59 -6
  56. data/lib/tasks/profile.rake +0 -40
  57. data/public/easy_ml/assets/assets/Application-nnn_XLuL.css +0 -1
  58. data/public/easy_ml/assets/assets/entrypoints/Application.tsx-CD8voxfL.js +0 -522
  59. data/public/easy_ml/assets/assets/entrypoints/Application.tsx-CD8voxfL.js.map +0 -1
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 53104f4ab5d52062d772983a09a1d31aecd524a7780b7104538327a346faab5e
4
- data.tar.gz: 023a0bf486a12c23e6a8724ae329e49849e8168ad647a4557d3d3ef2adec1a47
3
+ metadata.gz: f9a3eb82ce50657d230c9d00a75f84f7f09ce96b68d58fa8a1d615ad4fd31d77
4
+ data.tar.gz: f0ded08a9c73232bcea3eb23b9e678bc392647873e53c425c6b21f62104c983b
5
5
  SHA512:
6
- metadata.gz: fb9e184aea0eff595d296285e5a66a9ad4778a8e48b6a64f39b5628ff92cdd440d06953d6a5dbf63b6b0ca312db6edef717e57fcd2f12fcbea63caaaa586eb67
7
- data.tar.gz: b379bca69fc1817ec29da1f56df6a0c6e121b443849b1d4f6032b14ec86805dd50f629a84989bd0c80bdba691c0d9f51cdf4d80dc69e5bba71502281a840d96c
6
+ metadata.gz: 76b6f960c24f0032d4959f7435770f64245a5699f97150118127fac618e46965267ae996c3be3b4bb21a0734177c039ec2357b690a69ede3117c834e3ff09083
7
+ data.tar.gz: 241ae30e3e2934eef6a74aae16aa05cfb1e10ed9145562045ccb93403c292b46449003badee5a0798a70987394469d6781b6ef8d14d5a552ee9da954efc75f44
data/app/controllers/easy_ml/datasets_controller.rb CHANGED
@@ -73,7 +73,23 @@ module EasyML
73
73
 
74
74
  # Iterate over columns to check and update preprocessing_steps
75
75
  dataset_params[:columns_attributes]&.each do |_, column_attrs|
76
- column_attrs[:preprocessing_steps] = nil if column_attrs.dig(:preprocessing_steps, :training, :method) == "none"
76
+ if column_attrs.dig(:preprocessing_steps, :training, :method) == "none"
77
+ column_attrs[:preprocessing_steps] = nil
78
+ elsif column_attrs.dig(:preprocessing_steps, :training)
79
+ # Ensure encoding is properly set for categorical columns
80
+ training_config = column_attrs.dig(:preprocessing_steps, :training)
81
+ if training_config[:params]
82
+ # Remove old encoding params as they're now part of the encoding field
83
+ training_config[:params].delete(:one_hot)
84
+ training_config[:params].delete(:ordinal_encoding)
85
+ end
86
+
87
+ # Ensure embedding params are present when encoding is embedding
88
+ if training_config[:encoding] == "embedding" && training_config[:params]
89
+ training_config[:params][:llm] ||= "openai"
90
+ training_config[:params][:model] ||= "text-embedding-3-small"
91
+ end
92
+ end
77
93
  end
78
94
 
79
95
  # Handle feature ID assignment for existing features
@@ -165,7 +181,7 @@ module EasyML
165
181
  private
166
182
 
167
183
  def preprocessing_params
168
- [:method, { params: [:constant, :categorical_min, :one_hot, :ordinal_encoding, { clip: %i[min max] }] }]
184
+ [:method, :encoding, { params: [:constant, :categorical_min, :llm, :model, :preset, :dimensions, { clip: %i[min max] }] }]
169
185
  end
170
186
 
171
187
  def dataset_params