easy_ml 0.2.0.pre.rc58 → 0.2.0.pre.rc61

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (86) hide show
  1. checksums.yaml +4 -4
  2. data/app/controllers/easy_ml/application_controller.rb +4 -0
  3. data/app/controllers/easy_ml/datasets_controller.rb +32 -1
  4. data/app/frontend/components/DatasetPreview.tsx +50 -19
  5. data/app/frontend/components/dataset/ColumnConfigModal.tsx +7 -1
  6. data/app/frontend/components/dataset/ColumnFilters.tsx +37 -3
  7. data/app/frontend/components/dataset/ColumnList.tsx +14 -2
  8. data/app/frontend/components/dataset/PreprocessingConfig.tsx +81 -20
  9. data/app/frontend/types/dataset.ts +3 -0
  10. data/app/jobs/easy_ml/compute_feature_job.rb +0 -3
  11. data/app/jobs/easy_ml/refresh_dataset_job.rb +0 -6
  12. data/app/models/easy_ml/column/imputers/base.rb +89 -0
  13. data/app/models/easy_ml/column/imputers/categorical.rb +35 -0
  14. data/app/models/easy_ml/column/imputers/clip.rb +30 -0
  15. data/app/models/easy_ml/column/imputers/constant.rb +27 -0
  16. data/app/models/easy_ml/column/imputers/ffill.rb +29 -0
  17. data/app/models/easy_ml/column/imputers/imputer.rb +103 -0
  18. data/app/models/easy_ml/column/imputers/mean.rb +27 -0
  19. data/app/models/easy_ml/column/imputers/median.rb +27 -0
  20. data/app/models/easy_ml/column/imputers/most_frequent.rb +27 -0
  21. data/app/models/easy_ml/column/imputers/null_imputer.rb +15 -0
  22. data/app/models/easy_ml/column/imputers/one_hot_encoder.rb +30 -0
  23. data/app/models/easy_ml/column/imputers/ordinal_encoder.rb +78 -0
  24. data/app/models/easy_ml/column/imputers/today.rb +20 -0
  25. data/app/models/easy_ml/column/imputers.rb +126 -0
  26. data/app/models/easy_ml/column/learner.rb +18 -0
  27. data/app/models/easy_ml/column/learners/base.rb +103 -0
  28. data/app/models/easy_ml/column/learners/boolean.rb +11 -0
  29. data/app/models/easy_ml/column/learners/categorical.rb +51 -0
  30. data/app/models/easy_ml/column/learners/datetime.rb +19 -0
  31. data/app/models/easy_ml/column/learners/null.rb +22 -0
  32. data/app/models/easy_ml/column/learners/numeric.rb +33 -0
  33. data/app/models/easy_ml/column/learners/string.rb +15 -0
  34. data/app/models/easy_ml/column/lineage/base.rb +22 -0
  35. data/app/models/easy_ml/column/lineage/computed_by_feature.rb +23 -0
  36. data/app/models/easy_ml/column/lineage/preprocessed.rb +23 -0
  37. data/app/models/easy_ml/column/lineage/raw_dataset.rb +23 -0
  38. data/app/models/easy_ml/column/lineage.rb +28 -0
  39. data/app/models/easy_ml/column/selector.rb +96 -0
  40. data/app/models/easy_ml/column.rb +319 -52
  41. data/app/models/easy_ml/column_history.rb +29 -22
  42. data/app/models/easy_ml/column_list.rb +63 -78
  43. data/app/models/easy_ml/dataset.rb +128 -96
  44. data/app/models/easy_ml/dataset_history.rb +23 -23
  45. data/app/models/easy_ml/datasource.rb +3 -0
  46. data/app/models/easy_ml/datasource_history.rb +1 -0
  47. data/app/models/easy_ml/datasources/file_datasource.rb +1 -1
  48. data/app/models/easy_ml/datasources/polars_datasource.rb +6 -12
  49. data/app/models/easy_ml/datasources/s3_datasource.rb +1 -1
  50. data/app/models/easy_ml/feature.rb +19 -7
  51. data/app/models/easy_ml/feature_history.rb +12 -0
  52. data/app/models/easy_ml/feature_list.rb +15 -0
  53. data/app/serializers/easy_ml/column_serializer.rb +11 -1
  54. data/app/serializers/easy_ml/dataset_serializer.rb +23 -2
  55. data/config/initializers/enumerable.rb +17 -0
  56. data/lib/easy_ml/data/date_converter.rb +137 -30
  57. data/lib/easy_ml/data/polars_column.rb +17 -0
  58. data/lib/easy_ml/data/polars_in_memory.rb +30 -0
  59. data/lib/easy_ml/data/polars_reader.rb +20 -1
  60. data/lib/easy_ml/data/splits/in_memory_split.rb +3 -5
  61. data/lib/easy_ml/data/splits/split.rb +2 -1
  62. data/lib/easy_ml/data/synced_directory.rb +1 -1
  63. data/lib/easy_ml/data.rb +1 -2
  64. data/lib/easy_ml/engine.rb +1 -0
  65. data/lib/easy_ml/feature_store.rb +33 -22
  66. data/lib/easy_ml/railtie/generators/migration/migration_generator.rb +4 -0
  67. data/lib/easy_ml/railtie/templates/migration/add_computed_columns_to_easy_ml_columns.rb.tt +4 -0
  68. data/lib/easy_ml/railtie/templates/migration/add_last_feature_sha_to_columns.rb.tt +9 -0
  69. data/lib/easy_ml/railtie/templates/migration/add_learned_at_to_easy_ml_columns.rb.tt +13 -0
  70. data/lib/easy_ml/railtie/templates/migration/add_sha_to_datasources_datasets_and_columns.rb.tt +21 -0
  71. data/lib/easy_ml/railtie/templates/migration/remove_preprocessor_statistics_from_easy_ml_datasets.rb.tt +11 -0
  72. data/lib/easy_ml/version.rb +1 -1
  73. data/lib/tasks/profile.rake +40 -0
  74. data/public/easy_ml/assets/.vite/manifest.json +2 -2
  75. data/public/easy_ml/assets/assets/Application-BbFobaXt.css +1 -0
  76. data/public/easy_ml/assets/assets/entrypoints/Application.tsx-Dni_GM8r.js +489 -0
  77. data/public/easy_ml/assets/assets/entrypoints/Application.tsx-Dni_GM8r.js.map +1 -0
  78. metadata +41 -10
  79. data/app/models/easy_ml/adapters/base_adapter.rb +0 -45
  80. data/app/models/easy_ml/adapters/polars_adapter.rb +0 -77
  81. data/lib/easy_ml/data/preprocessor.rb +0 -340
  82. data/lib/easy_ml/data/simple_imputer.rb +0 -255
  83. data/lib/easy_ml/data/statistics_learner.rb +0 -193
  84. data/public/easy_ml/assets/assets/Application-BUsRR6b6.css +0 -1
  85. data/public/easy_ml/assets/assets/entrypoints/Application.tsx-DmkdJsDd.js +0 -474
  86. data/public/easy_ml/assets/assets/entrypoints/Application.tsx-DmkdJsDd.js.map +0 -1
@@ -0,0 +1,22 @@
module EasyML
  class Column
    module Learners
      # Learner that records only basic counts and the last observed value
      # for its column.
      class Null < Base
        # Statistics computed over the full dataset.
        #
        # @param df [Polars::DataFrame, nil] frame containing this column
        # @return [Hash] empty when +df+ is nil, otherwise :num_rows and :null_count
        def full_dataset_statistics(df)
          return {} if df.nil?

          {
            num_rows: df.size,
            null_count: df[column.name].null_count || 0,
          }
        end

        # Statistics computed over the training data.
        #
        # @param df [Polars::DataFrame, nil] frame containing this column
        # @return [Hash] empty when +df+ is nil, otherwise :last_value
        def train_statistics(df)
          # Same nil guard as full_dataset_statistics, so a missing frame yields
          # an empty result instead of being handed to last_value.
          return {} if df.nil?

          {
            last_value: last_value(df),
          }
        end
      end
    end
  end
end
@@ -0,0 +1,33 @@
module EasyML
  class Column
    module Learners
      # Learner for numeric columns: extends the inherited training statistics
      # with summary values of the column.
      class Numeric < Base
        # Statistics computed over the training data.
        #
        # @param df [Polars::DataFrame, nil] frame containing this column
        # @return [Hash] empty when +df+ is nil; otherwise the inherited statistics
        #   merged with mean/median/min/max/std/last_value (nil entries dropped)
        def train_statistics(df)
          return {} if df.nil?

          super(df).merge!({
            mean: df[column.name].mean,
            median: df[column.name].median,
            min: df[column.name].min,
            max: df[column.name].max,
            std: df[column.name].std,
            last_value: last_value(df),
          }.compact)
        end

        # Most recent non-null value of the column, ordered by the dataset's
        # date column (descending).
        #
        # @param df [Polars::DataFrame] frame containing this column and the date column
        # @return [Object, nil] nil when the dataset has no date column or the
        #   column holds no non-null value
        def last_value(df)
          return unless dataset.date_column.present?

          newest_non_null = df
            .sort(dataset.date_column.name, reverse: true)
            .filter(Polars.col(column.name).is_not_null)
            .select(column.name)
            .head(1)

          # DataFrame#item needs exactly one cell; an all-null column leaves zero
          # rows here, so bail out instead of letting #item raise.
          return if newest_non_null.height.zero?

          newest_non_null.item
        end
      end
    end
  end
end
@@ -0,0 +1,15 @@
module EasyML
  class Column
    module Learners
      # Learner for string columns: adds a distinct-value count to the
      # inherited full-dataset statistics.
      class String < Base
        # @param df [Polars::DataFrame, nil] frame containing this column
        # @return [Hash] empty when +df+ is nil; otherwise the inherited
        #   statistics with :unique_count merged in
        def full_dataset_statistics(df)
          return {} if df.nil?

          stats = super(df)
          extra = { unique_count: df[column.name].cast(:str).n_unique }
          stats.merge!(extra)
        end
      end
    end
  end
end
@@ -0,0 +1,22 @@
module EasyML
  class Column
    class Lineage
      # Common behaviour for a single lineage step of a column. Subclasses are
      # expected to provide #key, #description and #timestamp, which #as_json reads.
      class Base
        attr_accessor :dataset, :column

        # @param column [Object] the column this step describes; must respond to #dataset
        def initialize(column)
          @column = column
          @dataset = column.dataset
        end

        # Hash representation of this lineage step.
        #
        # @param _options [Hash, nil] accepted and ignored, so the method can also be
        #   invoked as +as_json(options)+ (the calling convention used by JSON encoders)
        # @return [Hash] :key, :description and :timestamp with indifferent access
        def as_json(_options = nil)
          {
            key: key,
            description: description,
            timestamp: timestamp,
          }.with_indifferent_access
        end
      end
    end
  end
end
@@ -0,0 +1,23 @@
module EasyML
  class Column
    class Lineage
      # Lineage step for a column whose +computed_by+ attribute is set.
      class ComputedByFeature < Base
        # @return [Symbol] identifier of this lineage step
        def key
          :computed_by_feature
        end

        # @return [String] human-readable description naming +column.computed_by+
        def description
          "Computed by #{column.computed_by}"
        end

        # @return [Object, nil] the feature's +fit_at+, or nil when the column
        #   has no associated feature
        def timestamp
          # #check only verifies computed_by, so the feature itself may be missing;
          # safe navigation avoids a NoMethodError in that case.
          column.feature&.fit_at
        end

        # @return [Boolean] whether this step applies to the column
        def check
          column.computed_by.present?
        end
      end
    end
  end
end
@@ -0,0 +1,23 @@
module EasyML
  class Column
    class Lineage
      # Lineage step for a column that has preprocessing steps configured.
      class Preprocessed < Base
        # @return [Boolean] whether the column has any preprocessing steps
        def check
          column.preprocessing_steps.present?
        end

        # @return [Symbol] identifier of this lineage step
        def key
          :preprocessed
        end

        # @return [Object] the +refreshed_at+ value of the column's dataset
        def timestamp
          column.dataset.refreshed_at
        end

        # @return [String] description listing the imputers' preprocessing descriptions
        def description
          applied = column.imputers.preprocessing_descriptions.join(", ")
          "Preprocessed using #{applied}"
        end
      end
    end
  end
end
@@ -0,0 +1,23 @@
module EasyML
  class Column
    class Lineage
      # Lineage step for a column that is present in the raw dataset.
      class RawDataset < Base
        # @return [Boolean] result of +column.in_raw_dataset?+
        def check
          column.in_raw_dataset?
        end

        # @return [Symbol] identifier of this lineage step
        def key
          :raw_dataset
        end

        # @return [Object] the +refreshed_at+ value of the dataset's datasource
        def timestamp
          source = column.dataset.datasource
          source.refreshed_at
        end

        # @return [String] fixed human-readable description
        def description
          "Present in raw dataset"
        end
      end
    end
  end
end
@@ -0,0 +1,28 @@
module EasyML
  class Column
    # Assembles the list of lineage steps that apply to a column.
    class Lineage
      attr_accessor :dataset, :column

      # @param column [Object] the column to describe; must respond to #dataset
      def initialize(column)
        @column = column
        @dataset = column.dataset
      end

      # @return [Array<Class>] lineage step classes, in the order they are reported
      def sort_order
        [RawDataset, ComputedByFeature, Preprocessed]
      end

      # Instantiates every step class for the column, keeps those whose #check
      # is truthy, and returns their #as_json output in #sort_order order.
      #
      # @return [Array<Hash>]
      def lineage
        steps = sort_order.map { |step_class| step_class.new(column) }
        applicable = steps.select(&:check)
        applicable.map(&:as_json).compact
      end
    end
  end
end
@@ -0,0 +1,96 @@
module EasyML
  class Column
    # Fetches the data of a single column from its dataset, optionally scoped
    # to a named source on the dataset (constructed here with :raw or
    # :processed) and optionally post-processed by a block.
    class Selector
      attr_accessor :selected, :dataset, :column, :transform

      # @param column [Object] column to select; must respond to #dataset
      # @param selected [Symbol, nil] name of the dataset method to read from
      #   (:raw / :processed in this class), or nil to query the dataset directly
      # @param block [Proc] optional transform applied to the fetched result
      def initialize(column, selected = nil, &block)
        @column = column
        @dataset = column.dataset
        @selected = selected
        @transform = block
      end

      # @return [Object] the column's name
      def name
        column.name
      end

      # Selector for raw data; computed columns that are not in the raw dataset
      # are served from :processed instead.
      def raw
        target = (column.is_computed? && !column.in_raw_dataset?) ? :processed : :raw
        Selector.new(column, target)
      end

      # Selector over :raw whose result is passed through the training imputers' clip.
      def clipped
        Selector.new(column, :raw) { |df| column.imputers.training.clip(df) }
      end

      # Selector over :processed.
      def processed
        Selector.new(column, :processed)
      end

      # Fetch the :train segment.
      def train(**kwargs)
        select(:train, **kwargs)
      end

      # Fetch the :test segment.
      def test(**kwargs)
        select(:test, **kwargs)
      end

      # Fetch the :valid segment.
      def valid(**kwargs)
        select(:valid, **kwargs)
      end

      # Fetch the :data segment; computed columns are always read through a
      # fresh :processed selector.
      def data(**kwargs)
        return select(:data, **kwargs) unless column.is_computed?

        Selector.new(column, :processed).send(:select, :data, **kwargs)
      end

      private

      # Fetches +segment+ restricted to this column (or its virtual columns when
      # one-hot encoded and reading processed data), intersected with the
      # columns actually available in the source.
      #
      # @param segment [Symbol] segment method to invoke (:train, :test, :valid, :data)
      # @param orig_kwargs [Hash] options forwarded to the segment call; :select may
      #   name extra columns, :all_columns is always forced to true
      # @return [Object, nil] nil without a dataset; an empty Polars::DataFrame when
      #   none of the requested columns are available; otherwise the (optionally
      #   transformed) result of the segment call
      def select(segment, **orig_kwargs)
        kwargs = orig_kwargs.clone
        return nil if dataset.nil?

        kwargs[:all_columns] = true
        kwargs[:select] = kwargs.key?(:select) ? [kwargs[:select]].flatten : []

        reads_processed = selected == :processed || (selected.nil? && !dataset.needs_refresh?)
        wanted = reads_processed && column.one_hot? ? column.virtual_columns : column.name
        kwargs[:select] << wanted
        kwargs[:select] = kwargs[:select].flatten.uniq

        # Probe one row to learn which columns exist, then keep only those.
        available_columns = selection_source.send(segment, limit: 1, all_columns: true)&.columns || []
        kwargs[:select] = available_columns & kwargs[:select]
        return Polars::DataFrame.new if kwargs[:select].empty?

        result = selection_source.send(segment, **kwargs)
        transform ? transform.call(result) : result
      end

      # Object the segment calls are sent to: the dataset's named source when
      # one was selected, otherwise the dataset itself. Resolved on every call.
      def selection_source
        @selected.present? ? dataset.send(@selected) : dataset
      end
    end
  end
end