easy_ml 0.2.0.pre.rc57 → 0.2.0.pre.rc60

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (100)
  1. checksums.yaml +4 -4
  2. data/app/controllers/easy_ml/apis_controller.rb +8 -0
  3. data/app/controllers/easy_ml/application_controller.rb +4 -0
  4. data/app/controllers/easy_ml/datasets_controller.rb +32 -1
  5. data/app/controllers/easy_ml/models_controller.rb +3 -0
  6. data/app/controllers/easy_ml/predictions_controller.rb +10 -5
  7. data/app/frontend/components/DatasetPreview.tsx +50 -19
  8. data/app/frontend/components/ModelForm.tsx +1 -1
  9. data/app/frontend/components/SearchableSelect.tsx +0 -1
  10. data/app/frontend/components/dataset/ColumnConfigModal.tsx +7 -1
  11. data/app/frontend/components/dataset/ColumnFilters.tsx +37 -3
  12. data/app/frontend/components/dataset/ColumnList.tsx +14 -2
  13. data/app/frontend/components/dataset/PreprocessingConfig.tsx +82 -21
  14. data/app/frontend/pages/DatasourcesPage.tsx +0 -2
  15. data/app/frontend/types/dataset.ts +3 -0
  16. data/app/jobs/easy_ml/compute_feature_job.rb +0 -2
  17. data/app/jobs/easy_ml/refresh_dataset_job.rb +0 -6
  18. data/app/models/easy_ml/column/imputers/base.rb +89 -0
  19. data/app/models/easy_ml/column/imputers/categorical.rb +35 -0
  20. data/app/models/easy_ml/column/imputers/clip.rb +30 -0
  21. data/app/models/easy_ml/column/imputers/constant.rb +27 -0
  22. data/app/models/easy_ml/column/imputers/ffill.rb +29 -0
  23. data/app/models/easy_ml/column/imputers/imputer.rb +103 -0
  24. data/app/models/easy_ml/column/imputers/mean.rb +27 -0
  25. data/app/models/easy_ml/column/imputers/median.rb +27 -0
  26. data/app/models/easy_ml/column/imputers/most_frequent.rb +27 -0
  27. data/app/models/easy_ml/column/imputers/null_imputer.rb +15 -0
  28. data/app/models/easy_ml/column/imputers/one_hot_encoder.rb +30 -0
  29. data/app/models/easy_ml/column/imputers/ordinal_encoder.rb +78 -0
  30. data/app/models/easy_ml/column/imputers/today.rb +20 -0
  31. data/app/models/easy_ml/column/imputers.rb +126 -0
  32. data/app/models/easy_ml/column/learner.rb +18 -0
  33. data/app/models/easy_ml/column/learners/base.rb +103 -0
  34. data/app/models/easy_ml/column/learners/boolean.rb +11 -0
  35. data/app/models/easy_ml/column/learners/categorical.rb +51 -0
  36. data/app/models/easy_ml/column/learners/datetime.rb +19 -0
  37. data/app/models/easy_ml/column/learners/null.rb +22 -0
  38. data/app/models/easy_ml/column/learners/numeric.rb +33 -0
  39. data/app/models/easy_ml/column/learners/string.rb +15 -0
  40. data/app/models/easy_ml/column/lineage/base.rb +22 -0
  41. data/app/models/easy_ml/column/lineage/computed_by_feature.rb +23 -0
  42. data/app/models/easy_ml/column/lineage/preprocessed.rb +23 -0
  43. data/app/models/easy_ml/column/lineage/raw_dataset.rb +23 -0
  44. data/app/models/easy_ml/column/lineage.rb +28 -0
  45. data/app/models/easy_ml/column/selector.rb +96 -0
  46. data/app/models/easy_ml/column.rb +344 -39
  47. data/app/models/easy_ml/column_history.rb +31 -20
  48. data/app/models/easy_ml/column_list.rb +79 -62
  49. data/app/models/easy_ml/dataset.rb +156 -104
  50. data/app/models/easy_ml/dataset_history.rb +23 -23
  51. data/app/models/easy_ml/datasource.rb +4 -0
  52. data/app/models/easy_ml/datasource_history.rb +1 -0
  53. data/app/models/easy_ml/datasources/file_datasource.rb +1 -1
  54. data/app/models/easy_ml/datasources/polars_datasource.rb +6 -12
  55. data/app/models/easy_ml/datasources/s3_datasource.rb +1 -1
  56. data/app/models/easy_ml/feature.rb +29 -10
  57. data/app/models/easy_ml/feature_history.rb +12 -0
  58. data/app/models/easy_ml/feature_list.rb +15 -0
  59. data/app/models/easy_ml/model.rb +25 -4
  60. data/app/models/easy_ml/model_history.rb +1 -0
  61. data/app/models/easy_ml/retraining_run.rb +1 -0
  62. data/app/serializers/easy_ml/column_serializer.rb +11 -1
  63. data/app/serializers/easy_ml/dataset_serializer.rb +23 -2
  64. data/config/initializers/enumerable.rb +17 -0
  65. data/config/initializers/inflections.rb +2 -0
  66. data/config/routes.rb +3 -0
  67. data/lib/easy_ml/core/tuner.rb +1 -1
  68. data/lib/easy_ml/data/date_converter.rb +137 -30
  69. data/lib/easy_ml/data/polars_column.rb +17 -0
  70. data/lib/easy_ml/data/polars_in_memory.rb +30 -0
  71. data/lib/easy_ml/data/polars_reader.rb +20 -1
  72. data/lib/easy_ml/data/splits/in_memory_split.rb +7 -5
  73. data/lib/easy_ml/data/splits/split.rb +2 -1
  74. data/lib/easy_ml/data/synced_directory.rb +5 -3
  75. data/lib/easy_ml/data.rb +1 -2
  76. data/lib/easy_ml/feature_store.rb +33 -22
  77. data/lib/easy_ml/predict.rb +13 -2
  78. data/lib/easy_ml/railtie/generators/migration/migration_generator.rb +7 -0
  79. data/lib/easy_ml/railtie/templates/migration/add_computed_columns_to_easy_ml_columns.rb.tt +18 -0
  80. data/lib/easy_ml/railtie/templates/migration/add_default_to_is_target.rb.tt +6 -0
  81. data/lib/easy_ml/railtie/templates/migration/add_last_feature_sha_to_columns.rb.tt +9 -0
  82. data/lib/easy_ml/railtie/templates/migration/add_learned_at_to_easy_ml_columns.rb.tt +13 -0
  83. data/lib/easy_ml/railtie/templates/migration/add_sha_to_datasources_datasets_and_columns.rb.tt +21 -0
  84. data/lib/easy_ml/railtie/templates/migration/add_slug_to_easy_ml_models.rb.tt +20 -0
  85. data/lib/easy_ml/railtie/templates/migration/remove_preprocessor_statistics_from_easy_ml_datasets.rb.tt +11 -0
  86. data/lib/easy_ml/version.rb +1 -1
  87. data/lib/tasks/profile.rake +40 -0
  88. data/public/easy_ml/assets/.vite/manifest.json +2 -2
  89. data/public/easy_ml/assets/assets/Application-BbFobaXt.css +1 -0
  90. data/public/easy_ml/assets/assets/entrypoints/Application.tsx-Dni_GM8r.js +489 -0
  91. data/public/easy_ml/assets/assets/entrypoints/Application.tsx-Dni_GM8r.js.map +1 -0
  92. metadata +45 -10
  93. data/app/models/easy_ml/adapters/base_adapter.rb +0 -45
  94. data/app/models/easy_ml/adapters/polars_adapter.rb +0 -77
  95. data/lib/easy_ml/data/preprocessor.rb +0 -383
  96. data/lib/easy_ml/data/simple_imputer.rb +0 -255
  97. data/lib/easy_ml/data/statistics_learner.rb +0 -128
  98. data/public/easy_ml/assets/assets/Application-BUsRR6b6.css +0 -1
  99. data/public/easy_ml/assets/assets/entrypoints/Application.tsx-DTZ2348z.js +0 -474
  100. data/public/easy_ml/assets/assets/entrypoints/Application.tsx-DTZ2348z.js.map +0 -1
@@ -0,0 +1,103 @@
module EasyML
  class Column
    module Learners
      # Base class for per-column statistics learners.
      #
      # A learner wraps a column and its dataset and computes summary
      # statistics for each requested view of the column (:raw, :clipped,
      # :processed). Subclasses extend #full_dataset_statistics and
      # #train_statistics with dtype-specific values.
      class Base
        attr_accessor :column, :dataset, :dtype, :select

        # Views of a column that statistics can be learned from. Frozen so a
        # caller cannot accidentally mutate the shared lists returned by #types.
        TYPES_ALL = %i[raw clipped processed].freeze
        TYPES_RAW = %i[raw clipped].freeze
        TYPES_PROCESSED = %i[processed].freeze

        # @param column [Object] the column to learn from; must respond to
        #   #dataset, #name, #in_raw_dataset? and the view readers used by #learn
        def initialize(column)
          @column = column
          @dataset = column.dataset
          # Extra columns requested alongside the column itself: the dataset's
          # date column (when one is configured) so #last_value can sort by it.
          @select = dataset.date_column.present? ? [dataset.date_column.name] : []
        end

        # Picks the learner class matching the column's datatype.
        #
        # Falls back to inferring the type from the raw data when the column
        # has no stored datatype.
        #
        # @param column [Object] the column to inspect
        # @return [Class] one of the Learners::* classes
        # @raise [RuntimeError] when the column cannot be read, or its dtype is
        #   missing or unsupported
        def self.adapter(column)
          begin
            dtype = column.datatype || EasyML::Data::PolarsColumn.determine_type(column.raw.data[column.name])
          rescue StandardError
            # The underlying error is still reachable through Exception#cause.
            raise "Unable to find column #{column.name}. If this column is computed by a feature, you forgot to declare computes_columns"
          end

          # `&.` routes a nil dtype to the descriptive error below instead of
          # failing with NoMethodError on nil.to_sym.
          case dtype&.to_sym
          when :float, :integer
            EasyML::Column::Learners::Numeric
          when :string, :text
            EasyML::Column::Learners::String
          when :categorical
            EasyML::Column::Learners::Categorical
          when :datetime, :date
            EasyML::Column::Learners::Datetime
          when :boolean
            EasyML::Column::Learners::Boolean
          when :null
            EasyML::Column::Learners::Null
          else
            raise "Don't know how to learn from dtype: #{dtype}"
          end
        end

        # Lists the views to learn for the requested type.
        #
        # Columns that are not in the raw dataset only ever have a processed
        # view, whatever was requested. Unknown types fall back to all views.
        #
        # @param type [Symbol] :all, :raw or :processed
        # @return [Array<Symbol>]
        def types(type = :all)
          return TYPES_PROCESSED if !column.in_raw_dataset?

          case type
          when :all then TYPES_ALL
          when :raw then TYPES_RAW
          when :processed then TYPES_PROCESSED
          else
            TYPES_ALL
          end
        end

        # Learns statistics for every view selected by +type+.
        #
        # @param type [Symbol] see #types
        # @return [Hash{Symbol=>Hash}] statistics keyed by view name
        def learn(type: :all)
          types(type).each_with_object({}) do |t, h|
            h[t] = learn_split(column.send(t))
          end
        end

        # Statistics computed over the whole dataset.
        #
        # @param df [Object, nil] data frame containing the column
        # @return [Hash] empty when +df+ is nil
        def full_dataset_statistics(df)
          return {} if df.nil?

          {
            num_rows: df.size,
            null_count: df[column.name].null_count || 0,
          }
        end

        # Statistics computed over the training segment only.
        #
        # @param df [Object, nil] training data frame
        # @return [Hash] empty when +df+ is nil
        def train_statistics(df)
          return {} if df.nil?

          {
            last_value: last_value(df),
            most_frequent_value: df[column.name].mode.sort.to_a.first,
          }
        end

        # Learns one view: full-dataset statistics merged with training
        # statistics (training values win on key collisions).
        #
        # @param split [Object] responds to #data and #train, each taking select:
        # @return [Hash]
        def learn_split(split)
          df = split.data(select: select)
          train_df = split.train(select: select)
          full_dataset_statistics(df).merge!(train_statistics(train_df))
        end

        # Most recent non-null value of the column, ordered by the dataset's
        # date column.
        #
        # @param df [Object] data frame holding the column and the date column
        # @return [Object, nil] nil when the dataset has no date column, the
        #   frame is empty or lacks the date column, or every value is null
        def last_value(df)
          return unless dataset.date_column.present?

          date_column_name = dataset.date_column.name
          return nil if df.empty? || !df.columns.include?(date_column_name)

          # Newest first, nulls dropped, so the first remaining row is the answer.
          latest = df.sort(date_column_name, reverse: true)
                     .filter(Polars.col(column.name).is_not_null)
                     .select(column.name)
                     .head(1)
          # An all-null column leaves no rows; calling `item` on that frame
          # would fail, so report "no last value" instead.
          return nil if latest.empty?

          latest.item
        end
      end
    end
  end
end
@@ -0,0 +1,11 @@
module EasyML
  class Column
    module Learners
      # Learner for boolean columns; inherits all behavior from Categorical.
      class Boolean < Categorical
        # Sort key for a boolean value.
        #
        # @param value [Object]
        # @return [Integer] 1 when +value+ equals true, 0 for anything else
        #   (including nil and other truthy values)
        def sort_by(value)
          return 1 if value == true

          0
        end
      end
    end
  end
end
@@ -0,0 +1,51 @@
module EasyML
  class Column
    module Learners
      # Learner for categorical columns. Adds category counts, the list of
      # allowed categories and a label encoder/decoder to the training
      # statistics.
      #
      # NOTE(review): `String` here is expected to resolve to
      # EasyML::Column::Learners::String, not ::String - confirm load order.
      class Categorical < String
        # Learns statistics for every view selected by the requested type.
        #
        # Accepts the type either as a keyword (matching Base#learn) or as the
        # legacy positional argument. Without the keyword, a call such as
        # `learn(type: :raw)` would arrive as a positional Hash and the
        # requested type would be silently ignored.
        #
        # @param positional_type [Symbol, nil] legacy positional form of +type+
        # @param type [Symbol] :all, :raw or :processed
        # @return [Hash{Symbol=>Hash, nil}] statistics keyed by view; views other
        #   than :raw and :processed (e.g. :clipped) map to nil
        def learn(positional_type = nil, type: :all)
          requested = positional_type.nil? ? type : positional_type

          types(requested).each_with_object({}) do |view, stats|
            stats[view] = case view
                          when :raw then learn_split(column.raw)
                          # Processed statistics are learned from the raw view
                          # with the null count forced to zero.
                          when :processed then learn_split(column.raw).merge!(null_count: 0)
                          end
          end
        end

        # Training statistics plus category-specific values.
        #
        # @param df [Object, nil] training data frame
        # @return [Hash] empty when +df+ is nil
        def train_statistics(df)
          return {} if df.nil?

          categorical_stats = {
            allowed_categories: allowed_categories(df),
            counts: df[column.name].value_counts.to_hash,
          }.merge!(learn_encoder_decoder(df))

          super(df).merge!(categorical_stats)
        end

        # Builds the value => count table and the label encoder/decoder.
        #
        # Labels are assigned in the order given by the column's #sort_by,
        # after casting values with the column's #cast and dropping nil.
        #
        # @param df [Object] training data frame
        # @return [Hash] with :value, :label_encoder and :label_decoder
        def learn_encoder_decoder(df)
          value_counts = df[column.name].value_counts
          # Read the columns by position rather than by name.
          value_column, count_column = value_counts.columns

          as_hash = value_counts.select([value_column, count_column]).rows.to_a.to_h.transform_keys(&column.method(:cast))
          label_encoder = as_hash.keys.compact.sort_by(&column.method(:sort_by)).each_with_index.to_h

          {
            value: as_hash,
            label_encoder: label_encoder,
            label_decoder: label_encoder.invert,
          }
        end

        # Categories occurring at least +column.categorical_min+ times.
        #
        # @param df [Object] training data frame
        # @return [Array] non-nil categories ordered by the column's #sort_by
        def allowed_categories(df)
          val_counts = df[column.name].value_counts
          # Positional lookup keeps this in step with #learn_encoder_decoder
          # instead of hard-coding the name of the count column.
          value_column, count_column = val_counts.columns
          frequent = val_counts[val_counts[count_column] >= column.categorical_min]
          frequent[value_column].to_a.compact.sort_by(&column.method(:sort_by))
        end
      end
    end
  end
end
@@ -0,0 +1,19 @@
module EasyML
  class Column
    module Learners
      # Learner for date and datetime columns.
      class Datetime < Base
        # Full-dataset statistics plus the number of distinct values.
        #
        # @param df [Object, nil] data frame containing the column
        # @return [Hash] empty when +df+ is nil
        def full_dataset_statistics(df)
          return {} if df.nil?

          super(df).merge!(unique_count: df[column.name].n_unique)
        end

        # Last element of the column after sorting it.
        #
        # @param df [Object] data frame containing the column
        # @return [Object, nil] nil when the frame has no rows
        def last_value(df)
          # Same empty-frame guard as Base#last_value; indexing [-1] on a
          # series with no elements has nothing to return.
          return nil if df.empty?

          df[column.name].sort[-1]
        end
      end
    end
  end
end
@@ -0,0 +1,22 @@
module EasyML
  class Column
    module Learners
      # Learner for columns whose dtype is :null. Only row/null counts and the
      # last value are collected.
      class Null < Base
        # Row count and null count over the whole dataset.
        #
        # @param df [Object, nil] data frame containing the column
        # @return [Hash] empty when +df+ is nil
        def full_dataset_statistics(df)
          return {} if df.nil?

          {
            num_rows: df.size,
            null_count: df[column.name].null_count || 0,
          }
        end

        # Training statistics: only the last value.
        #
        # @param df [Object, nil] training data frame
        # @return [Hash] empty when +df+ is nil, matching every other learner
        def train_statistics(df)
          # Without this guard a missing training frame reaches #last_value,
          # which calls methods on +df+.
          return {} if df.nil?

          { last_value: last_value(df) }
        end
      end
    end
  end
end
@@ -0,0 +1,33 @@
module EasyML
  class Column
    module Learners
      # Learner for integer and float columns.
      #
      # The last value is provided by the inherited Base#last_value, which
      # applies the same "most recent non-null value by date column" rule and
      # also guards against empty frames and frames missing the date column.
      class Numeric < Base
        # Training statistics plus mean, median, min, max and standard
        # deviation. Numeric statistics that come back nil are omitted.
        #
        # @param df [Object, nil] training data frame
        # @return [Hash] empty when +df+ is nil
        def train_statistics(df)
          return {} if df.nil?

          # `super` already contributes :last_value, so it is not recomputed
          # here (that would sort the frame a second time).
          base_stats = super(df)
          series = df[column.name]

          base_stats.merge!({
            mean: series.mean,
            median: series.median,
            min: series.min,
            max: series.max,
            std: series.std,
          }.compact)
        end
      end
    end
  end
end
@@ -0,0 +1,15 @@
module EasyML
  class Column
    module Learners
      # Learner for string and text columns.
      class String < Base
        # Full-dataset statistics plus the number of distinct values, counted
        # after casting the column to strings.
        #
        # @param df [Object, nil] data frame containing the column
        # @return [Hash] empty when +df+ is nil
        def full_dataset_statistics(df)
          return {} if df.nil?

          stats = super(df)
          stats[:unique_count] = df[column.name].cast(:str).n_unique
          stats
        end
      end
    end
  end
end
@@ -0,0 +1,22 @@
module EasyML
  class Column
    class Lineage
      # Base class for a single lineage entry of a column.
      #
      # Subclasses are expected to provide #key, #description and #timestamp,
      # which #as_json reads.
      class Base
        attr_accessor :dataset, :column

        # @param column [Object] the column this entry describes; must respond
        #   to #dataset
        def initialize(column)
          @column = column
          @dataset = column.dataset
        end

        # Serializable representation of this lineage entry.
        #
        # Accepts (and ignores) the optional options argument that JSON
        # encoders conventionally pass to #as_json, so an entry can be
        # serialized directly or nested inside another structure.
        #
        # @param _options [Hash, nil] ignored
        # @return [Hash] with key, description and timestamp, readable with
        #   string or symbol keys
        def as_json(_options = nil)
          {
            key: key,
            description: description,
            timestamp: timestamp,
          }.with_indifferent_access
        end
      end
    end
  end
end
@@ -0,0 +1,23 @@
module EasyML
  class Column
    class Lineage
      # Lineage entry for a column that is produced by a feature.
      class ComputedByFeature < Base
        # @return [Symbol] identifier of this lineage entry
        def key
          :computed_by_feature
        end

        # @return [String] human-readable description naming column.computed_by
        def description
          "Computed by #{column.computed_by}"
        end

        # @return [Object, nil] the feature's fit_at value; nil when the column
        #   has no feature object, so a dangling computed_by does not raise
        #   while building the lineage
        def timestamp
          column.feature&.fit_at
        end

        # @return [Boolean] whether the column records what computed it
        def check
          column.computed_by.present?
        end
      end
    end
  end
end
@@ -0,0 +1,23 @@
module EasyML
  class Column
    class Lineage
      # Lineage entry for a column that has preprocessing steps configured.
      class Preprocessed < Base
        # @return [Symbol] identifier of this lineage entry
        def key
          :preprocessed
        end

        # @return [String] the preprocessing descriptions reported by the
        #   column's imputers, comma-separated
        def description
          steps = column.imputers.preprocessing_descriptions.join(", ")
          "Preprocessed using #{steps}"
        end

        # @return [Object] the refreshed_at value of the column's dataset
        def timestamp
          owning_dataset = column.dataset
          owning_dataset.refreshed_at
        end

        # @return [Boolean] whether the column has any preprocessing steps
        def check
          column.preprocessing_steps.present?
        end
      end
    end
  end
end
@@ -0,0 +1,23 @@
module EasyML
  class Column
    class Lineage
      # Lineage entry for a column that exists in the raw dataset.
      class RawDataset < Base
        # @return [Symbol] identifier of this lineage entry
        def key
          :raw_dataset
        end

        # @return [String] human-readable description
        def description
          "Present in raw dataset"
        end

        # @return [Object] the refreshed_at value of the datasource behind the
        #   column's dataset
        def timestamp
          datasource = column.dataset.datasource
          datasource.refreshed_at
        end

        # @return [Boolean] whether the column is in the raw dataset
        def check
          column.in_raw_dataset?
        end
      end
    end
  end
end
@@ -0,0 +1,28 @@
module EasyML
  class Column
    # Builds the lineage of a column: the ordered list of steps (raw dataset,
    # computed by feature, preprocessed) that apply to it.
    class Lineage
      attr_accessor :dataset, :column

      # @param column [Object] the column to describe; must respond to #dataset
      def initialize(column)
        @column = column
        @dataset = column.dataset
      end

      # @return [Array<Class>] lineage entry classes in display order
      def sort_order
        [RawDataset, ComputedByFeature, Preprocessed]
      end

      # Lineage entries that apply to the column, in #sort_order.
      #
      # @return [Array<Hash>] the #as_json of every entry whose #check passes,
      #   with nil results dropped
      def lineage
        candidates = sort_order.map { |entry_class| entry_class.new(column) }
        applicable = candidates.select(&:check)
        applicable.map(&:as_json).compact
      end
    end
  end
end
@@ -0,0 +1,96 @@
module EasyML
  class Column
    # Reads a single column (plus any extra requested columns) out of a
    # dataset, optionally from a specific view (:raw / :processed) and with an
    # optional transform applied to the result.
    class Selector
      attr_accessor :selected, :dataset, :column, :transform

      # @param column [Object] the column to read; must respond to #dataset
      # @param selected [Symbol, nil] dataset view to read from, or nil to
      #   read from the dataset directly
      # @param block [Proc] optional transform applied to every fetched result
      def initialize(column, selected = nil, &block)
        @column = column
        @dataset = column.dataset
        @selected = selected
        @transform = block
      end

      # @return [Object] the column's name
      def name
        column.name
      end

      # Selector for the raw view; a computed column that is absent from the
      # raw dataset is read from the processed view instead.
      #
      # @return [Selector]
      def raw
        computed_only = column.is_computed? && !column.in_raw_dataset?
        Selector.new(column, computed_only ? :processed : :raw)
      end

      # Selector for the raw view with the column's training clip applied to
      # each result.
      #
      # @return [Selector]
      def clipped
        Selector.new(column, :raw) { |df| column.imputers.training.clip(df) }
      end

      # @return [Selector] selector for the processed view
      def processed
        Selector.new(column, :processed)
      end

      # @return [Object, nil] the train segment; see #select
      def train(**kwargs)
        select(:train, **kwargs)
      end

      # @return [Object, nil] the test segment; see #select
      def test(**kwargs)
        select(:test, **kwargs)
      end

      # @return [Object, nil] the valid segment; see #select
      def valid(**kwargs)
        select(:valid, **kwargs)
      end

      # The full data segment. Computed columns are always read from the
      # processed view, through a fresh selector that carries no transform.
      #
      # @return [Object, nil]
      def data(**kwargs)
        return select(:data, **kwargs) unless column.is_computed?

        Selector.new(column, :processed).send(:select, :data, **kwargs)
      end

      private

      # Fetches +segment+ restricted to this column and any extra columns
      # passed as select:.
      #
      # @param segment [Symbol] :train, :test, :valid or :data
      # @return [Object, nil] nil when there is no dataset; an empty
      #   Polars::DataFrame when none of the requested columns are available
      #   (the transform is not applied in that case)
      def select(segment, **orig_kwargs)
        kwargs = orig_kwargs.clone
        return nil if dataset.nil?

        kwargs[:all_columns] = true

        requested = kwargs.key?(:select) ? [kwargs[:select]].flatten : []
        # One-hot columns are read through their virtual columns when reading
        # the processed view, or the plain dataset while it needs no refresh.
        if (selected == :processed || (selected.nil? && !dataset.needs_refresh?)) && column.one_hot?
          requested << column.virtual_columns
        else
          requested << column.name
        end
        kwargs[:select] = requested.flatten.uniq

        # Probe one row to learn which columns exist, then keep only those.
        available_columns = selection_source.send(segment, limit: 1, all_columns: true)&.columns || []
        kwargs[:select] = available_columns & kwargs[:select]
        return Polars::DataFrame.new if kwargs[:select].empty?

        result = selection_source.send(segment, **kwargs)
        transform ? transform.call(result) : result
      end

      # Object the segment is read from: the selected dataset view when one
      # is set, otherwise the dataset itself.
      def selection_source
        @selected.present? ? dataset.send(@selected) : dataset
      end
    end
  end
end