easy_ml 0.2.0.pre.rc72 → 0.2.0.pre.rc75

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (115)
  1. checksums.yaml +4 -4
  2. data/app/controllers/easy_ml/datasets_controller.rb +33 -0
  3. data/app/controllers/easy_ml/datasources_controller.rb +7 -0
  4. data/app/controllers/easy_ml/models_controller.rb +38 -0
  5. data/app/frontend/components/DatasetCard.tsx +212 -0
  6. data/app/frontend/components/ModelCard.tsx +69 -29
  7. data/app/frontend/components/StackTrace.tsx +13 -0
  8. data/app/frontend/components/dataset/FeatureConfigPopover.tsx +10 -7
  9. data/app/frontend/components/datasets/UploadDatasetButton.tsx +51 -0
  10. data/app/frontend/components/models/DownloadModelModal.tsx +90 -0
  11. data/app/frontend/components/models/UploadModelModal.tsx +212 -0
  12. data/app/frontend/components/models/index.ts +2 -0
  13. data/app/frontend/pages/DatasetsPage.tsx +36 -130
  14. data/app/frontend/pages/DatasourcesPage.tsx +22 -2
  15. data/app/frontend/pages/ModelsPage.tsx +37 -11
  16. data/app/frontend/types/dataset.ts +1 -2
  17. data/app/frontend/types.ts +1 -1
  18. data/app/jobs/easy_ml/training_job.rb +2 -2
  19. data/app/models/easy_ml/column/imputers/base.rb +4 -0
  20. data/app/models/easy_ml/column/imputers/clip.rb +5 -3
  21. data/app/models/easy_ml/column/imputers/imputer.rb +11 -13
  22. data/app/models/easy_ml/column/imputers/mean.rb +7 -3
  23. data/app/models/easy_ml/column/imputers/null_imputer.rb +3 -0
  24. data/app/models/easy_ml/column/imputers/ordinal_encoder.rb +5 -1
  25. data/app/models/easy_ml/column/imputers.rb +3 -1
  26. data/app/models/easy_ml/column/lineage/base.rb +5 -1
  27. data/app/models/easy_ml/column/lineage/computed_by_feature.rb +1 -1
  28. data/app/models/easy_ml/column/lineage/preprocessed.rb +1 -1
  29. data/app/models/easy_ml/column/lineage/raw_dataset.rb +1 -1
  30. data/app/models/easy_ml/column/selector.rb +4 -0
  31. data/app/models/easy_ml/column.rb +79 -63
  32. data/app/models/easy_ml/column_history.rb +28 -28
  33. data/app/models/easy_ml/column_list/imputer.rb +23 -0
  34. data/app/models/easy_ml/column_list.rb +39 -26
  35. data/app/models/easy_ml/dataset/learner/base.rb +34 -0
  36. data/app/models/easy_ml/dataset/learner/eager/boolean.rb +10 -0
  37. data/app/models/easy_ml/dataset/learner/eager/categorical.rb +51 -0
  38. data/app/models/easy_ml/dataset/learner/eager/query.rb +37 -0
  39. data/app/models/easy_ml/dataset/learner/eager.rb +43 -0
  40. data/app/models/easy_ml/dataset/learner/lazy/boolean.rb +13 -0
  41. data/app/models/easy_ml/dataset/learner/lazy/categorical.rb +10 -0
  42. data/app/models/easy_ml/dataset/learner/lazy/datetime.rb +19 -0
  43. data/app/models/easy_ml/dataset/learner/lazy/null.rb +17 -0
  44. data/app/models/easy_ml/dataset/learner/lazy/numeric.rb +19 -0
  45. data/app/models/easy_ml/dataset/learner/lazy/query.rb +69 -0
  46. data/app/models/easy_ml/dataset/learner/lazy/string.rb +19 -0
  47. data/app/models/easy_ml/dataset/learner/lazy.rb +51 -0
  48. data/app/models/easy_ml/dataset/learner/query.rb +25 -0
  49. data/app/models/easy_ml/dataset/learner.rb +100 -0
  50. data/app/models/easy_ml/dataset.rb +150 -36
  51. data/app/models/easy_ml/dataset_history.rb +1 -0
  52. data/app/models/easy_ml/datasource.rb +9 -0
  53. data/app/models/easy_ml/event.rb +4 -0
  54. data/app/models/easy_ml/export/column.rb +27 -0
  55. data/app/models/easy_ml/export/dataset.rb +37 -0
  56. data/app/models/easy_ml/export/datasource.rb +12 -0
  57. data/app/models/easy_ml/export/feature.rb +24 -0
  58. data/app/models/easy_ml/export/model.rb +40 -0
  59. data/app/models/easy_ml/export/retraining_job.rb +20 -0
  60. data/app/models/easy_ml/export/splitter.rb +14 -0
  61. data/app/models/easy_ml/feature.rb +21 -0
  62. data/app/models/easy_ml/import/column.rb +35 -0
  63. data/app/models/easy_ml/import/dataset.rb +148 -0
  64. data/app/models/easy_ml/import/feature.rb +36 -0
  65. data/app/models/easy_ml/import/model.rb +136 -0
  66. data/app/models/easy_ml/import/retraining_job.rb +29 -0
  67. data/app/models/easy_ml/import/splitter.rb +34 -0
  68. data/app/models/easy_ml/lineage.rb +44 -0
  69. data/app/models/easy_ml/model.rb +93 -36
  70. data/app/models/easy_ml/model_file.rb +6 -0
  71. data/app/models/easy_ml/models/xgboost/evals_callback.rb +7 -7
  72. data/app/models/easy_ml/models/xgboost.rb +33 -9
  73. data/app/models/easy_ml/retraining_job.rb +8 -1
  74. data/app/models/easy_ml/retraining_run.rb +6 -4
  75. data/app/models/easy_ml/splitter.rb +8 -0
  76. data/app/models/lineage_history.rb +6 -0
  77. data/app/serializers/easy_ml/column_serializer.rb +7 -1
  78. data/app/serializers/easy_ml/dataset_serializer.rb +2 -1
  79. data/app/serializers/easy_ml/lineage_serializer.rb +9 -0
  80. data/config/routes.rb +13 -1
  81. data/lib/easy_ml/core/tuner/adapters/base_adapter.rb +3 -3
  82. data/lib/easy_ml/core/tuner.rb +12 -11
  83. data/lib/easy_ml/data/polars_column.rb +149 -100
  84. data/lib/easy_ml/data/polars_reader.rb +8 -5
  85. data/lib/easy_ml/data/polars_schema.rb +56 -0
  86. data/lib/easy_ml/data/splits/file_split.rb +20 -2
  87. data/lib/easy_ml/data/splits/split.rb +10 -1
  88. data/lib/easy_ml/data.rb +1 -0
  89. data/lib/easy_ml/deep_compact.rb +19 -0
  90. data/lib/easy_ml/feature_store.rb +2 -6
  91. data/lib/easy_ml/railtie/generators/migration/migration_generator.rb +6 -0
  92. data/lib/easy_ml/railtie/templates/migration/add_extra_metadata_to_columns.rb.tt +9 -0
  93. data/lib/easy_ml/railtie/templates/migration/add_raw_schema_to_datasets.rb.tt +9 -0
  94. data/lib/easy_ml/railtie/templates/migration/add_unique_constraint_to_easy_ml_model_names.rb.tt +8 -0
  95. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_lineages.rb.tt +24 -0
  96. data/lib/easy_ml/railtie/templates/migration/remove_evaluator_from_retraining_jobs.rb.tt +7 -0
  97. data/lib/easy_ml/railtie/templates/migration/update_preprocessing_steps_to_jsonb.rb.tt +18 -0
  98. data/lib/easy_ml/timing.rb +34 -0
  99. data/lib/easy_ml/version.rb +1 -1
  100. data/lib/easy_ml.rb +2 -0
  101. data/public/easy_ml/assets/.vite/manifest.json +2 -2
  102. data/public/easy_ml/assets/assets/Application-Q7L6ioxr.css +1 -0
  103. data/public/easy_ml/assets/assets/entrypoints/Application.tsx-Rrzo4ecT.js +522 -0
  104. data/public/easy_ml/assets/assets/entrypoints/Application.tsx-Rrzo4ecT.js.map +1 -0
  105. metadata +52 -12
  106. data/app/models/easy_ml/column/learners/base.rb +0 -103
  107. data/app/models/easy_ml/column/learners/boolean.rb +0 -11
  108. data/app/models/easy_ml/column/learners/categorical.rb +0 -51
  109. data/app/models/easy_ml/column/learners/datetime.rb +0 -19
  110. data/app/models/easy_ml/column/learners/null.rb +0 -22
  111. data/app/models/easy_ml/column/learners/numeric.rb +0 -33
  112. data/app/models/easy_ml/column/learners/string.rb +0 -15
  113. data/public/easy_ml/assets/assets/Application-B3sRjyMT.css +0 -1
  114. data/public/easy_ml/assets/assets/entrypoints/Application.tsx-Dfg-nTrB.js +0 -489
  115. data/public/easy_ml/assets/assets/entrypoints/Application.tsx-Dfg-nTrB.js.map +0 -1
@@ -0,0 +1,37 @@
1
module EasyML
  class Dataset
    class Learner
      class Eager
        # Base query for statistics computed against an already-fetched
        # dataframe. Supplies empty default results for both splits and
        # selects the dtype-specific adapter class.
        class Query < EasyML::Dataset::Learner::Query
          # Dispatches to the query matching +split+.
          #
          # @param split [#to_sym] :train or :data
          # @param df the dataframe passed through to the split query
          # @return [Hash, nil] the split query result; nil for any other split
          def execute(split, df)
            split_name = split.to_sym
            return train_query(df) if split_name == :train
            return full_dataset_query(df) if split_name == :data

            nil
          end

          # Default train-split result: nothing learned.
          def train_query(df)
            {}
          end

          # Default full-dataset result: nothing learned.
          def full_dataset_query(df)
            {}
          end

          # Adapter class for this column, chosen from the raw dtype when
          # one is set and from the dtype otherwise.
          #
          # @return [Class, nil] nil when the dtype has no eager adapter
          def adapter
            effective_dtype = raw_dtype&.to_sym || dtype.to_sym
            return Eager::Categorical if effective_dtype == :categorical
            return Eager::Boolean if effective_dtype == :boolean

            nil
          end
        end
      end
    end
  end
end
@@ -0,0 +1,43 @@
1
module EasyML
  class Dataset
    class Learner
      # Learns statistics that need a fetched dataframe, delegating each
      # column to the adapter chosen by Eager::Query.
      class Eager < Base
        # @return [Hash] one entry per type, holding the train-split results
        #   deep-merged with the full-dataset results
        def learn
          types.each_with_object({}) do |type, results|
            train_stats = learn_using_split(:train, type)
            results[type] = train_stats.deep_merge!(learn_using_split(:data, type))
          end
        end

        private

        # Results for one split, or an empty hash when the dataset has no
        # data for +type+.
        def learn_using_split(split, type)
          return {} if @dataset.send(type).empty?

          execute_queries(split, type) || {}
        end

        # Fetches the dataframe for the given type and split.
        def fetch_df(split, type)
          @dataset.send(type).send(split, all_columns: true)
        end

        # Runs every applicable column adapter against the split's dataframe.
        # The dataframe is fetched at most once, and only when at least one
        # column has an adapter.
        #
        # @return [Hash] adapter results keyed by column name
        def execute_queries(split, type)
          @fetched = nil

          columns.each_with_object({}) do |column, results|
            next if skip_processing?(column, type)

            adapter = Eager::Query.new(@dataset, column).adapter
            next unless adapter.present?

            @fetched ||= fetch_df(split, type)
            results[column.name] = adapter.new(@dataset, column).execute(split, @fetched)
          end
        end
      end
    end
  end
end
@@ -0,0 +1,13 @@
1
module EasyML
  class Dataset
    class Learner
      class Lazy
        # Boolean columns reuse the categorical queries and add a sort key.
        class Boolean < Categorical
          # @param value the value to rank
          # @return [Integer] 1 when +value+ is exactly +true+, otherwise 0
          def sort_by(value)
            return 1 if value == true

            0
          end
        end
      end
    end
  end
end
@@ -0,0 +1,10 @@
1
module EasyML
  class Dataset
    class Learner
      class Lazy
        # Categorical columns are learned with exactly the same queries as
        # Lazy::String; this subclass adds no behaviour of its own.
        class Categorical < String
        end
      end
    end
  end
end
@@ -0,0 +1,19 @@
1
module EasyML
  class Dataset
    class Learner
      class Lazy
        # Adds a unique-value count to the full-dataset statistics.
        class Datetime < Query
          # @return [Array] the inherited expressions followed by the
          #   unique-count expression
          def full_dataset_query
            [*super, unique_count]
          end

          # Expression applying +n_unique+ to the column, aliased as
          # "<column name>__unique_count".
          def unique_count
            result_name = "#{column.name}__unique_count"
            Polars.col(column.name).n_unique.alias(result_name)
          end
        end
      end
    end
  end
end
@@ -0,0 +1,17 @@
1
module EasyML
  class Dataset
    class Learner
      class Lazy
        # Adapter for :null columns: contributes no expressions to either
        # split's query.
        class Null < Query
          # @return [Array] always empty
          def train_query
            []
          end

          # @return [Array] always empty
          def full_dataset_query
            []
          end
        end
      end
    end
  end
end
@@ -0,0 +1,19 @@
1
module EasyML
  class Dataset
    class Learner
      class Lazy
        # Adds mean, median, min, max and std to the train-split statistics.
        class Numeric < Query
          # @return [Array] the inherited expressions followed by one
          #   aggregate expression per statistic, each aliased as
          #   "<column name>__<statistic>"
          def train_query
            inherited = super
            aggregates = %i[mean median min max std].map do |stat|
              Polars.col(column.name).public_send(stat).alias("#{column.name}__#{stat}")
            end

            inherited + aggregates
          end
        end
      end
    end
  end
end
@@ -0,0 +1,69 @@
1
module EasyML
  class Dataset
    class Learner
      class Lazy
        # Base query for statistics expressed as Polars expressions. Picks
        # the dtype-specific adapter class and provides the expressions
        # shared by every adapter that calls +super+.
        class Query < EasyML::Dataset::Learner::Query
          # Adapter class for this column's dtype.
          #
          # @return [Class]
          # @raise [RuntimeError] when the dtype has no adapter
          def adapter
            case dtype.to_sym
            when :float, :integer then Lazy::Numeric
            when :string, :text then Lazy::String
            when :categorical then Lazy::Categorical
            when :datetime, :date then Lazy::Datetime
            when :boolean then Lazy::Boolean
            when :null then Lazy::Null
            else
              raise "Don't know how to learn from dtype: #{dtype}"
            end
          end

          # Expressions for the requested split.
          #
          # @param split [#to_sym] :train or :data
          # @return [Array, nil] nil for any other split
          def execute(split)
            split_name = split.to_sym
            return train_query if split_name == :train
            return full_dataset_query if split_name == :data

            nil
          end

          private

          # Expressions evaluated over the full dataset.
          def full_dataset_query
            [num_rows, null_count].compact
          end

          # Expressions evaluated over the train split. +last_value+ is nil
          # (and dropped) when the dataset has no date column.
          def train_query
            [last_value, most_frequent_value].compact
          end

          # +null_count+ of the column, aliased "<column name>__null_count".
          def null_count
            result_name = "#{column.name}__null_count"
            Polars.col(column.name).null_count.alias(result_name)
          end

          # +len+ of the column, aliased "<column name>__num_rows".
          def num_rows
            result_name = "#{column.name}__num_rows"
            Polars.col(column.name).len.alias(result_name)
          end

          # First +mode+ of the non-null values, aliased
          # "<column name>__most_frequent_value".
          def most_frequent_value
            non_null = Polars.col(column.name).filter(Polars.col(column.name).is_not_null)
            non_null.mode.first.alias("#{column.name}__most_frequent_value")
          end

          # First non-null value after sorting by the dataset's date column
          # in reverse order, aliased "<column name>__last_value".
          #
          # @return [Object, nil] nil when the dataset has no date column
          def last_value
            date_column = dataset.date_column
            return unless date_column.present?

            newest_first = Polars.col(column.name)
                                 .sort_by(date_column.name, reverse: true, nulls_last: true)
            newest_first
              .filter(Polars.col(column.name).is_not_null)
              .first
              .alias("#{column.name}__last_value")
          end
        end
      end
    end
  end
end
@@ -0,0 +1,19 @@
1
module EasyML
  class Dataset
    class Learner
      class Lazy
        # Adds a unique-value count to the full-dataset statistics.
        class String < Query
          # @return [Array] the inherited expressions followed by the
          #   unique-count expression
          def full_dataset_query
            expressions = super
            expressions << unique_count
          end

          # Expression casting the column to :str and applying +n_unique+,
          # aliased as "<column name>__unique_count".
          def unique_count
            as_text = Polars.col(column.name).cast(:str)
            as_text.n_unique.alias("#{column.name}__unique_count")
          end
        end
      end
    end
  end
end
@@ -0,0 +1,51 @@
1
module EasyML
  class Dataset
    class Learner
      # Learns column statistics by collecting every column's aggregate
      # expressions and evaluating them with one +select+ per split.
      class Lazy < Base
        # @return [Hash] one entry per type, holding the train-split
        #   statistics deep-merged with the full-dataset statistics
        def learn
          types.each_with_object({}) do |type, results|
            train_stats = learn_using_split(:train, type)
            results[type] = train_stats.deep_merge!(learn_using_split(:data, type))
          end
        end

        private

        # Statistics for one split, or an empty hash when the dataset has no
        # data for +type+.
        def learn_using_split(split, type)
          return {} if @dataset.send(type).empty?

          get_column_statistics(run_queries(split, type))
        end

        # Evaluates all expressions for the split and collects the result.
        def run_queries(split, type)
          queries = build_queries(split, type)
          @dataset.send(type).send(split, all_columns: true, lazy: true).select(queries).collect
        end

        # Regroups the query result into { column name => { statistic => value } }.
        #
        # Result columns are aliased "<column name>__<statistic>". The name
        # is split on the LAST "__" so that a column whose own name contains
        # "__" (e.g. "user__id") keeps its full name instead of being filed
        # under the part before the first "__".
        #
        # @param query_results the collected result; must respond to
        #   +columns+ and +[]+
        # @return [Hash]
        def get_column_statistics(query_results)
          query_results.columns.each_with_object({}) do |result_column, stats|
            column_name, separator, statistic_name = result_column.rpartition("__")
            # With no separator the whole name serves as both key and statistic.
            column_name = result_column if separator.empty?

            stats[column_name] ||= {}
            stats[column_name][statistic_name] = query_results[result_column][0]
          end
        end

        # Expressions for every column that is not skipped for +type+.
        def build_queries(split, type)
          columns.flat_map do |column|
            next if skip_processing?(column, type)

            query = Lazy::Query.new(@dataset, column)
            query.adapter.new(@dataset, column).execute(split)
          end.compact
        end
      end
    end
  end
end
@@ -0,0 +1,25 @@
1
module EasyML
  class Dataset
    class Learner
      # Common state for learner queries: the dataset, the column, and the
      # column's dtype and raw dtype.
      class Query
        attr_reader :dataset, :column, :dtype, :raw_dtype

        # @param dataset the dataset the column belongs to
        # @param column the column to learn from; must respond to +name+,
        #   +datatype+ and +raw_dtype+
        # @raise [RuntimeError] when reading the column's type information
        #   fails for any reason
        def initialize(dataset, column)
          @dataset = dataset
          @column = column
          # TODO: LAZIFY THIS
          # The dtype is inferred from the raw data only when the column has
          # no datatype of its own.
          @dtype = column.datatype || EasyML::Data::PolarsColumn.determine_type(column.raw.data[column.name])
          @raw_dtype = column.raw_dtype
        rescue
          raise "Unable to find column #{column.name}. If this column is computed by a feature, you forgot to declare computes_columns"
        end

        # Delegates to +adapter+, which this class does not define.
        def execute(split)
          adapter.execute(split)
        end
      end
    end
  end
end
@@ -0,0 +1,100 @@
1
module EasyML
  class Dataset
    # Learns statistics for the dataset's columns that need learning and
    # writes them back to the column records.
    class Learner
      include EasyML::Timing
      attr_accessor :dataset, :columns, :type, :computed, :raw_columns, :statistics

      # @param dataset the dataset whose columns are learned
      # @param type [Symbol] forwarded to the lazy and eager learners;
      #   :raw also marks the columns as still learning when saved
      def initialize(dataset, type: :raw)
        @dataset = dataset
        @columns = dataset.columns.reload.needs_learn.sort_by(&:name)

        # NOTE(review): `computed` is never assigned before this point, so
        # this branch does not run as written. Confirm whether a `computed:`
        # option was intended.
        if computed
          @columns = @columns.computed
        end

        @columns = @columns.select(&:persisted?).reject(&:empty?)
        @type = type
      end

      # Syncs column metadata, computes statistics, then saves them.
      def learn
        prepare
        learn_statistics
        save_statistics
      end

      private

      # Merges the learned statistics into each column, stamps the learn
      # metadata, bulk-upserts the columns and refreshes feature lineage.
      def save_statistics
        columns.each do |col|
          col.merge_statistics(statistics.dig(col.name))
          col.set_sample_values
          col.assign_attributes(
            learned_at: EasyML::Support::UTC.now,
            last_datasource_sha: col.dataset.last_datasource_sha,
            last_feature_sha: col.feature&.sha,
            is_learning: type == :raw,
          )
        end

        # NOTE(review): last_feature_sha is assigned above but is not in the
        # update list below. Confirm whether it is persisted elsewhere.
        EasyML::Column.import(columns, on_duplicate_key_update: { columns: %i[
          statistics
          learned_at
          sample_values
          last_datasource_sha
          is_learning
        ] })
        dataset.columns.set_feature_lineage(columns)
      end

      measure_method_timing :save_statistics

      # Computes (once) the statistics for all columns, re-keyed from
      # { type => { column name => stats } } to
      # { column name => { type => stats } }.
      #
      # @return [Hash] the memoized statistics, on the first call as well
      #   as on later ones
      def learn_statistics
        return @statistics if @statistics

        merged = lazy_statistics.deep_merge!(eager_statistics)
        @statistics = merged.each_with_object({}) do |(stat_type, per_column), by_column|
          per_column.each do |column_name, column_stats|
            by_column[column_name] ||= {}
            by_column[column_name][stat_type] = column_stats
          end
        end.with_indifferent_access

        # Outside the raw pass, one-hot columns reuse their raw statistics
        # as their processed statistics.
        if type != :raw
          columns.select(&:one_hot?).each do |column|
            @statistics[column.name][:processed] = @statistics[column.name][:raw]
          end
        end

        @statistics
      end

      measure_method_timing :learn_statistics

      # Records on each column whether it exists in the raw schema, and
      # fills in its datatype from that schema when none is stored yet.
      def prepare
        @schema = EasyML::Data::PolarsSchema.simplify(@dataset.raw_schema).symbolize_keys
        @raw_columns = @schema.keys.sort.map(&:to_s)
        columns.each do |column|
          attrs = {
            in_raw_dataset: @raw_columns.include?(column.name),
            # nil is dropped by compact, so a stored datatype is never overwritten.
            datatype: column.read_attribute(:datatype).present? ? nil : @schema[column.name.to_sym],
          }.compact
          column.assign_attributes(attrs)
        end
        EasyML::Column.import(columns, on_duplicate_key_update: { columns: %i[in_raw_dataset datatype] })
      end

      measure_method_timing :prepare

      # Statistics produced by the Lazy learner.
      def lazy_statistics
        Lazy.new(dataset, columns, type: type).learn
      end

      measure_method_timing :lazy_statistics

      # Statistics produced by the Eager learner.
      def eager_statistics
        Eager.new(dataset, columns, type: type).learn
      end

      measure_method_timing :eager_statistics
    end
  end
end