easy_ml 0.2.0.pre.rc71 → 0.2.0.pre.rc75

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (117)
  1. checksums.yaml +4 -4
  2. data/app/controllers/easy_ml/datasets_controller.rb +33 -0
  3. data/app/controllers/easy_ml/datasources_controller.rb +7 -0
  4. data/app/controllers/easy_ml/models_controller.rb +46 -0
  5. data/app/frontend/components/DatasetCard.tsx +212 -0
  6. data/app/frontend/components/ModelCard.tsx +114 -29
  7. data/app/frontend/components/StackTrace.tsx +13 -0
  8. data/app/frontend/components/dataset/FeatureConfigPopover.tsx +10 -7
  9. data/app/frontend/components/datasets/UploadDatasetButton.tsx +51 -0
  10. data/app/frontend/components/models/DownloadModelModal.tsx +90 -0
  11. data/app/frontend/components/models/UploadModelModal.tsx +212 -0
  12. data/app/frontend/components/models/index.ts +2 -0
  13. data/app/frontend/pages/DatasetsPage.tsx +36 -130
  14. data/app/frontend/pages/DatasourcesPage.tsx +22 -2
  15. data/app/frontend/pages/ModelsPage.tsx +37 -11
  16. data/app/frontend/types/dataset.ts +1 -2
  17. data/app/frontend/types.ts +1 -1
  18. data/app/jobs/easy_ml/reaper.rb +55 -0
  19. data/app/jobs/easy_ml/training_job.rb +1 -1
  20. data/app/models/easy_ml/column/imputers/base.rb +4 -0
  21. data/app/models/easy_ml/column/imputers/clip.rb +5 -3
  22. data/app/models/easy_ml/column/imputers/imputer.rb +11 -13
  23. data/app/models/easy_ml/column/imputers/mean.rb +7 -3
  24. data/app/models/easy_ml/column/imputers/null_imputer.rb +3 -0
  25. data/app/models/easy_ml/column/imputers/ordinal_encoder.rb +5 -1
  26. data/app/models/easy_ml/column/imputers.rb +3 -1
  27. data/app/models/easy_ml/column/lineage/base.rb +5 -1
  28. data/app/models/easy_ml/column/lineage/computed_by_feature.rb +1 -1
  29. data/app/models/easy_ml/column/lineage/preprocessed.rb +1 -1
  30. data/app/models/easy_ml/column/lineage/raw_dataset.rb +1 -1
  31. data/app/models/easy_ml/column/selector.rb +4 -0
  32. data/app/models/easy_ml/column.rb +79 -63
  33. data/app/models/easy_ml/column_history.rb +28 -28
  34. data/app/models/easy_ml/column_list/imputer.rb +23 -0
  35. data/app/models/easy_ml/column_list.rb +39 -26
  36. data/app/models/easy_ml/dataset/learner/base.rb +34 -0
  37. data/app/models/easy_ml/dataset/learner/eager/boolean.rb +10 -0
  38. data/app/models/easy_ml/dataset/learner/eager/categorical.rb +51 -0
  39. data/app/models/easy_ml/dataset/learner/eager/query.rb +37 -0
  40. data/app/models/easy_ml/dataset/learner/eager.rb +43 -0
  41. data/app/models/easy_ml/dataset/learner/lazy/boolean.rb +13 -0
  42. data/app/models/easy_ml/dataset/learner/lazy/categorical.rb +10 -0
  43. data/app/models/easy_ml/dataset/learner/lazy/datetime.rb +19 -0
  44. data/app/models/easy_ml/dataset/learner/lazy/null.rb +17 -0
  45. data/app/models/easy_ml/dataset/learner/lazy/numeric.rb +19 -0
  46. data/app/models/easy_ml/dataset/learner/lazy/query.rb +69 -0
  47. data/app/models/easy_ml/dataset/learner/lazy/string.rb +19 -0
  48. data/app/models/easy_ml/dataset/learner/lazy.rb +51 -0
  49. data/app/models/easy_ml/dataset/learner/query.rb +25 -0
  50. data/app/models/easy_ml/dataset/learner.rb +100 -0
  51. data/app/models/easy_ml/dataset.rb +150 -36
  52. data/app/models/easy_ml/dataset_history.rb +1 -0
  53. data/app/models/easy_ml/datasource.rb +9 -0
  54. data/app/models/easy_ml/event.rb +5 -7
  55. data/app/models/easy_ml/export/column.rb +27 -0
  56. data/app/models/easy_ml/export/dataset.rb +37 -0
  57. data/app/models/easy_ml/export/datasource.rb +12 -0
  58. data/app/models/easy_ml/export/feature.rb +24 -0
  59. data/app/models/easy_ml/export/model.rb +40 -0
  60. data/app/models/easy_ml/export/retraining_job.rb +20 -0
  61. data/app/models/easy_ml/export/splitter.rb +14 -0
  62. data/app/models/easy_ml/feature.rb +21 -0
  63. data/app/models/easy_ml/import/column.rb +35 -0
  64. data/app/models/easy_ml/import/dataset.rb +148 -0
  65. data/app/models/easy_ml/import/feature.rb +36 -0
  66. data/app/models/easy_ml/import/model.rb +136 -0
  67. data/app/models/easy_ml/import/retraining_job.rb +29 -0
  68. data/app/models/easy_ml/import/splitter.rb +34 -0
  69. data/app/models/easy_ml/lineage.rb +44 -0
  70. data/app/models/easy_ml/model.rb +101 -37
  71. data/app/models/easy_ml/model_file.rb +6 -0
  72. data/app/models/easy_ml/models/xgboost/evals_callback.rb +7 -7
  73. data/app/models/easy_ml/models/xgboost.rb +33 -9
  74. data/app/models/easy_ml/retraining_job.rb +8 -1
  75. data/app/models/easy_ml/retraining_run.rb +7 -5
  76. data/app/models/easy_ml/splitter.rb +8 -0
  77. data/app/models/lineage_history.rb +6 -0
  78. data/app/serializers/easy_ml/column_serializer.rb +7 -1
  79. data/app/serializers/easy_ml/dataset_serializer.rb +2 -1
  80. data/app/serializers/easy_ml/lineage_serializer.rb +9 -0
  81. data/config/routes.rb +14 -1
  82. data/lib/easy_ml/core/tuner/adapters/base_adapter.rb +3 -3
  83. data/lib/easy_ml/core/tuner.rb +13 -12
  84. data/lib/easy_ml/data/polars_column.rb +149 -100
  85. data/lib/easy_ml/data/polars_reader.rb +8 -5
  86. data/lib/easy_ml/data/polars_schema.rb +56 -0
  87. data/lib/easy_ml/data/splits/file_split.rb +20 -2
  88. data/lib/easy_ml/data/splits/split.rb +10 -1
  89. data/lib/easy_ml/data.rb +1 -0
  90. data/lib/easy_ml/deep_compact.rb +19 -0
  91. data/lib/easy_ml/engine.rb +1 -0
  92. data/lib/easy_ml/feature_store.rb +2 -6
  93. data/lib/easy_ml/railtie/generators/migration/migration_generator.rb +6 -0
  94. data/lib/easy_ml/railtie/templates/migration/add_extra_metadata_to_columns.rb.tt +9 -0
  95. data/lib/easy_ml/railtie/templates/migration/add_raw_schema_to_datasets.rb.tt +9 -0
  96. data/lib/easy_ml/railtie/templates/migration/add_unique_constraint_to_easy_ml_model_names.rb.tt +8 -0
  97. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_lineages.rb.tt +24 -0
  98. data/lib/easy_ml/railtie/templates/migration/remove_evaluator_from_retraining_jobs.rb.tt +7 -0
  99. data/lib/easy_ml/railtie/templates/migration/update_preprocessing_steps_to_jsonb.rb.tt +18 -0
  100. data/lib/easy_ml/timing.rb +34 -0
  101. data/lib/easy_ml/version.rb +1 -1
  102. data/lib/easy_ml.rb +2 -0
  103. data/public/easy_ml/assets/.vite/manifest.json +2 -2
  104. data/public/easy_ml/assets/assets/Application-Q7L6ioxr.css +1 -0
  105. data/public/easy_ml/assets/assets/entrypoints/Application.tsx-Rrzo4ecT.js +522 -0
  106. data/public/easy_ml/assets/assets/entrypoints/Application.tsx-Rrzo4ecT.js.map +1 -0
  107. metadata +53 -12
  108. data/app/models/easy_ml/column/learners/base.rb +0 -103
  109. data/app/models/easy_ml/column/learners/boolean.rb +0 -11
  110. data/app/models/easy_ml/column/learners/categorical.rb +0 -51
  111. data/app/models/easy_ml/column/learners/datetime.rb +0 -19
  112. data/app/models/easy_ml/column/learners/null.rb +0 -22
  113. data/app/models/easy_ml/column/learners/numeric.rb +0 -33
  114. data/app/models/easy_ml/column/learners/string.rb +0 -15
  115. data/public/easy_ml/assets/assets/Application-BbFobaXt.css +0 -1
  116. data/public/easy_ml/assets/assets/entrypoints/Application.tsx-CibZcrBc.js +0 -489
  117. data/public/easy_ml/assets/assets/entrypoints/Application.tsx-CibZcrBc.js.map +0 -1
@@ -1,6 +1,7 @@
1
1
  module EasyML
2
2
  module ColumnList
3
3
  include Historiographer::Relation
4
+ include EasyML::Timing
4
5
 
5
6
  def sync(delete: true)
6
7
  return unless dataset.schema.present?
@@ -39,35 +40,28 @@ module EasyML
39
40
  df
40
41
  end
41
42
 
43
+ measure_method_timing :transform
44
+
45
# Adds clip expressions to +df+ for the columns returned by `has_clip.raw`.
#
# The expressions come from EasyML::ColumnList::Imputer restricted to the
# :clip imputer. When there are no such columns, +df+ is returned untouched.
def apply_clip(df)
  clippable = has_clip.raw

  if clippable.any?
    imputer = EasyML::ColumnList::Imputer.new(dataset, df, columns: clippable, imputers: [:clip])
    df.with_columns(imputer.exprs)
  else
    df
  end
end
58
+
42
59
  def learn(type: :raw, computed: false)
43
- cols_to_learn = column_list.reload.needs_learn
44
- cols_to_learn = cols_to_learn.computed if computed
45
- cols_to_learn = cols_to_learn.select(&:persisted?).reject(&:empty?)
46
- cols_to_learn.each { |col| col.learn(type: type) }
47
- EasyML::Column.import(cols_to_learn, on_duplicate_key_update: { columns: %i[
48
- statistics
49
- learned_at
50
- sample_values
51
- last_datasource_sha
52
- is_learning
53
- datatype
54
- polars_datatype
55
- ] })
56
- set_feature_lineage
60
+ EasyML::Dataset::Learner.new(dataset, type: type).learn
57
61
  reload
58
62
  end
59
63
 
60
- def set_feature_lineage
61
- names = dataset.features.computed_column_names
62
- columns = where(name: names, computed_by: nil).map do |col|
63
- col.assign_attributes(
64
- is_computed: true,
65
- computed_by: col.computing_feature&.name,
66
- )
67
- col
68
- end
69
- EasyML::Column.import(columns, on_duplicate_key_update: { columns: %i[ is_computed computed_by ] })
70
- end
64
+ measure_method_timing :learn
71
65
 
72
66
  def statistics
73
67
  stats = { raw: {}, processed: {} }
@@ -115,6 +109,25 @@ module EasyML
115
109
  column_list.sort_by { |col| [col.sort_required, col.name] }
116
110
  end
117
111
 
112
# Marks feature-computed columns and records lineage for +cols_to_learn+.
#
# 1. Columns named in `dataset.features.computed_column_names` that have no
#    `computed_by` yet are flagged `is_computed` and stamped with the name of
#    their computing feature, then upserted.
# 2. `EasyML::Lineage.learn` is called for every column in +cols_to_learn+;
#    the non-nil results are upserted.
#
# Returns whatever `EasyML::Lineage.import` returns.
def set_feature_lineage(cols_to_learn)
  computed_names = dataset.features.computed_column_names

  newly_computed = where(name: computed_names, computed_by: nil).map do |col|
    col.tap do |c|
      c.assign_attributes(is_computed: true, computed_by: c.computing_feature&.name)
    end
  end
  EasyML::Column.import(newly_computed, on_duplicate_key_update: { columns: %i[ is_computed computed_by ] })

  lineage_records = cols_to_learn.flat_map { |col| EasyML::Lineage.learn(col) }.compact
  EasyML::Lineage.import(lineage_records, on_duplicate_key_update: { columns: %i[ column_id key occurred_at description ] })
end
128
+
129
+ measure_method_timing :set_feature_lineage
130
+
118
131
  private
119
132
 
120
133
  def import_new(new_columns, existing_columns)
@@ -127,7 +140,7 @@ module EasyML
127
140
  col
128
141
  end
129
142
  EasyML::Column.import(cols_to_insert)
130
- set_feature_lineage
143
+ set_feature_lineage(cols_to_insert)
131
144
  column_list.reload
132
145
  end
133
146
 
@@ -0,0 +1,34 @@
1
module EasyML
  class Dataset
    class Learner
      # Common state and helpers shared by the learner strategies.
      #
      # Holds the dataset, the columns to learn from and the requested type,
      # and answers two questions for subclasses: which dataset types to
      # iterate (#types) and whether a column should be skipped for a given
      # type (#skip_processing?).
      class Base
        # Dataset types returned by #types. Frozen so a caller that mutates
        # the returned array cannot corrupt the shared constant.
        TYPES_ALL = %i[raw clipped processed].freeze
        TYPES_RAW = %i[raw clipped].freeze
        TYPES_PROCESSED = %i[processed].freeze

        attr_reader :dataset, :columns, :type

        # @param dataset the dataset being learned
        # @param columns the columns to learn statistics for
        # @param type [Symbol] requested dataset type (defaults to :raw)
        def initialize(dataset, columns, type: :raw)
          @dataset = dataset
          @columns = columns
          @type = type
        end

        # True when +column+ must not be learned for +type+:
        # - the column is not in the raw dataset and +type+ is not :processed, or
        # - the column is one-hot and +type+ is :processed.
        #
        # +type+ may be a String or Symbol (it is normalised with #to_sym).
        def skip_processing?(column, type)
          processed = type.to_sym == :processed

          (!column.in_raw_dataset? && !processed) || (column.one_hot? && processed)
        end

        # Returns the list of dataset types for the given selector.
        # Any selector other than :raw or :processed yields every type.
        #
        # @param type [Symbol] :all, :raw or :processed
        # @return [Array<Symbol>] a frozen array of type names
        def types(type = :all)
          case type
          when :raw then TYPES_RAW
          when :processed then TYPES_PROCESSED
          else TYPES_ALL
          end
        end
      end
    end
  end
end
@@ -0,0 +1,10 @@
1
module EasyML
  class Dataset
    class Learner
      class Eager
        # Eager learner for boolean columns. It defines nothing of its own and
        # inherits all behaviour from the eager Categorical learner.
        class Boolean < Categorical; end
      end
    end
  end
end
@@ -0,0 +1,51 @@
1
module EasyML
  class Dataset
    class Learner
      class Eager
        # Eager learner for categorical columns: value counts, the categories
        # that meet the column's `categorical_min` threshold, and label
        # encoder/decoder maps built from those categories.
        class Categorical < Query
          # Statistics learned from the train split.
          #
          # @return [Hash] :counts, :allowed_categories, :label_encoder, :label_decoder
          def train_query(df)
            learned = {
              counts: counts(df).to_hash,
              allowed_categories: allowed_categories(df).to_series.to_a,
            }
            learned.merge(learn_encoder_decoder(df))
          end

          # Builds the category -> index encoder and its inverse.
          # Categories are cast with `column.cast`, nils are dropped, and the
          # remainder is ordered with `column.sort_by` before being numbered.
          def learn_encoder_decoder(df)
            indexed = allowed_categories(df).lazy.with_row_count.collect.to_hash.invert
            categories = indexed.transform_keys(&column.method(:cast)).keys.compact
            ordered = categories.sort_by(&column.method(:sort_by))

            encoder = ordered.each_with_index.to_h

            {
              label_encoder: encoder,
              label_decoder: encoder.invert,
            }
          end

          # Per-value row counts for this column (memoised on the instance).
          def counts(df)
            @counts ||= df.group_by(column.name)
                          .agg(Polars.col(column.name).count.alias("count"))
          end

          # Unique values whose count is at least `column.categorical_min`,
          # sorted in reverse order (memoised on the instance).
          def allowed_categories(df)
            @allowed_categories ||= df.join(counts(df), on: column.name)
                                      .filter(Polars.col("count").ge(column.categorical_min))
                                      .select(column.name)
                                      .unique
                                      .sort(column.name, reverse: true)
          end
        end
      end
    end
  end
end
@@ -0,0 +1,37 @@
1
module EasyML
  class Dataset
    class Learner
      class Eager
        # Base class for eager (materialised-dataframe) column queries.
        # Subclasses override #train_query / #full_dataset_query; the defaults
        # learn nothing.
        class Query < EasyML::Dataset::Learner::Query
          # Dispatches to the query for +split+ (:train or :data).
          # Returns nil for any other split.
          def execute(split, df)
            case split.to_sym
            when :train then train_query(df)
            when :data then full_dataset_query(df)
            end
          end

          # Statistics learned from the train split (none by default).
          def train_query(df)
            {}
          end

          # Statistics learned from the full dataset (none by default).
          def full_dataset_query(df)
            {}
          end

          # Picks the eager learner class for this column, preferring the raw
          # dtype over the current dtype. Returns nil when the dtype has no
          # eager learner.
          def adapter
            effective_dtype = raw_dtype&.to_sym || dtype.to_sym

            case effective_dtype
            when :categorical then Eager::Categorical
            when :boolean then Eager::Boolean
            end
          end
        end
      end
    end
  end
end
@@ -0,0 +1,43 @@
1
module EasyML
  class Dataset
    class Learner
      # Learns statistics that need a materialised dataframe, one column at a
      # time, via the Eager::Query adapters.
      class Eager < Base
        # @return [Hash] type => { column name => statistics }, where each
        #   entry is the train-split result deep-merged with the full-data one
        def learn
          types.each_with_object({}) do |type, learned|
            train_stats = learn_using_split(:train, type)
            learned[type] = train_stats.deep_merge!(learn_using_split(:data, type))
          end
        end

        private

        # Statistics for one split of one dataset type; empty when that
        # dataset type has no data.
        def learn_using_split(split, type)
          return {} if @dataset.send(type).empty?

          execute_queries(split, type) || {}
        end

        # Loads the requested split with every column included.
        def fetch_df(split, type)
          @dataset.send(type).send(split, all_columns: true)
        end

        # Runs the eager adapter for every eligible column. The dataframe is
        # fetched at most once per call, and only if some column needs it.
        def execute_queries(split, type)
          @fetched = nil

          columns.each_with_object({}) do |column, results|
            next if skip_processing?(column, type)

            adapter = Eager::Query.new(@dataset, column).adapter
            next unless adapter.present?

            @fetched ||= fetch_df(split, type)
            results[column.name] = adapter.new(@dataset, column).execute(split, @fetched)
          end
        end
      end
    end
  end
end
@@ -0,0 +1,13 @@
1
module EasyML
  class Dataset
    class Learner
      class Lazy
        # Lazy learner for boolean columns; behaves like the lazy Categorical
        # learner plus a boolean sort key.
        class Boolean < Categorical
          # Sort key for a boolean value: 1 for exactly `true`, 0 otherwise.
          def sort_by(value)
            return 1 if value == true

            0
          end
        end
      end
    end
  end
end
@@ -0,0 +1,10 @@
1
module EasyML
  class Dataset
    class Learner
      class Lazy
        # Lazy learner for categorical columns. It defines nothing of its own.
        # NOTE(review): `String` here is presumably Lazy::String (resolved
        # lexically), not the core ::String — confirm load order guarantees it.
        class Categorical < String; end
      end
    end
  end
end
@@ -0,0 +1,19 @@
1
module EasyML
  class Dataset
    class Learner
      class Lazy
        # Lazy learner for datetime columns: the inherited full-dataset
        # expressions plus a distinct-value count.
        class Datetime < Query
          # Inherited full-dataset expressions followed by #unique_count.
          def full_dataset_query
            super.push(unique_count)
          end

          # Expression counting distinct values, aliased "<column>__unique_count".
          def unique_count
            distinct = Polars.col(column.name).n_unique
            distinct.alias("#{column.name}__unique_count")
          end
        end
      end
    end
  end
end
@@ -0,0 +1,17 @@
1
module EasyML
  class Dataset
    class Learner
      class Lazy
        # Lazy learner for null-typed columns: contributes no expressions for
        # either split.
        class Null < Query
          # No train-split expressions.
          def train_query
            []
          end

          # No full-dataset expressions.
          def full_dataset_query
            []
          end
        end
      end
    end
  end
end
@@ -0,0 +1,19 @@
1
module EasyML
  class Dataset
    class Learner
      class Lazy
        # Lazy learner for numeric columns: the inherited train-split
        # expressions plus mean, median, min, max and standard deviation.
        class Numeric < Query
          # Inherited train expressions followed by one aggregate per
          # statistic, each aliased "<column>__<statistic>".
          def train_query
            exprs = super

            %i[mean median min max std].each do |stat|
              exprs << Polars.col(column.name).public_send(stat).alias("#{column.name}__#{stat}")
            end

            exprs
          end
        end
      end
    end
  end
end
@@ -0,0 +1,69 @@
1
module EasyML
  class Dataset
    class Learner
      class Lazy
        # Base class for lazy column queries. Each query is a list of Polars
        # expressions aliased "<column>__<statistic>".
        class Query < EasyML::Dataset::Learner::Query
          # Picks the lazy learner class for this column's dtype.
          #
          # @raise [RuntimeError] when the dtype has no lazy learner
          def adapter
            case dtype.to_sym
            when :float, :integer then Lazy::Numeric
            when :string, :text then Lazy::String
            when :categorical then Lazy::Categorical
            when :datetime, :date then Lazy::Datetime
            when :boolean then Lazy::Boolean
            when :null then Lazy::Null
            else raise "Don't know how to learn from dtype: #{dtype}"
            end
          end

          # Returns the expressions for +split+ (:train or :data); nil for
          # any other split.
          def execute(split)
            case split.to_sym
            when :train then train_query
            when :data then full_dataset_query
            end
          end

          private

          # Expressions evaluated over the full dataset.
          def full_dataset_query
            [num_rows, null_count].compact
          end

          # Expressions evaluated over the train split. #last_value is
          # omitted when the dataset has no date column.
          def train_query
            [last_value, most_frequent_value].compact
          end

          def null_count
            nulls = Polars.col(column.name).null_count
            nulls.alias("#{column.name}__null_count")
          end

          def num_rows
            rows = Polars.col(column.name).len
            rows.alias("#{column.name}__num_rows")
          end

          # Mode of the non-null values (first one when several are returned).
          def most_frequent_value
            present = Polars.col(column.name).is_not_null

            Polars.col(column.name)
              .filter(present)
              .mode
              .first
              .alias("#{column.name}__most_frequent_value")
          end

          # First non-null value after sorting by the dataset's date column in
          # reverse order; nil when the dataset has no date column.
          def last_value
            date_column = dataset.date_column
            return unless date_column.present?

            present = Polars.col(column.name).is_not_null

            Polars.col(column.name)
              .sort_by(date_column.name, reverse: true, nulls_last: true)
              .filter(present)
              .first
              .alias("#{column.name}__last_value")
          end
        end
      end
    end
  end
end
@@ -0,0 +1,19 @@
1
module EasyML
  class Dataset
    class Learner
      class Lazy
        # Lazy learner for string/text columns: the inherited full-dataset
        # expressions plus a distinct-value count.
        class String < Query
          # Inherited full-dataset expressions followed by #unique_count.
          def full_dataset_query
            super.push(unique_count)
          end

          # Expression counting distinct values after casting to :str,
          # aliased "<column>__unique_count".
          def unique_count
            as_text = Polars.col(column.name).cast(:str)
            as_text.n_unique.alias("#{column.name}__unique_count")
          end
        end
      end
    end
  end
end
@@ -0,0 +1,51 @@
1
module EasyML
  class Dataset
    class Learner
      # Learns statistics that can be expressed as lazy Polars aggregations.
      # All column expressions for a split are collected in a single query.
      class Lazy < Base
        # @return [Hash] type => { column name => { statistic => value } },
        #   where each entry is the train-split result deep-merged with the
        #   full-data one
        def learn
          types.each_with_object({}) do |type, learned|
            learned[type] = learn_using_split(:train, type).deep_merge!(learn_using_split(:data, type))
          end
        end

        private

        # Statistics for one split of one dataset type; empty when that
        # dataset type has no data.
        def learn_using_split(split, type)
          return {} if @dataset.send(type).empty?

          get_column_statistics(run_queries(split, type))
        end

        # Evaluates every column expression for the split in one lazy select.
        def run_queries(split, type)
          queries = build_queries(split, type)
          @dataset.send(type).send(split, all_columns: true, lazy: true).select(queries).collect
        end

        # Regroups the single-row result frame, whose columns are named
        # "<column>__<statistic>", into { column => { statistic => value } }.
        #
        # The name is split on the LAST "__" so that dataset columns whose own
        # name contains "__" (e.g. "user__id__mean") keep their full name
        # instead of being filed under a truncated key.
        def get_column_statistics(query_results)
          query_results.columns.each_with_object({}) do |result_column, stats|
            column_name, separator, statistic_name = result_column.rpartition("__")
            # No separator at all: key the value under the whole name.
            column_name = statistic_name if separator.empty?

            (stats[column_name] ||= {})[statistic_name] = query_results[result_column][0]
          end
        end

        # Collects the expressions of every eligible column for the split.
        def build_queries(split, type)
          columns.flat_map do |column|
            next if skip_processing?(column, type)

            query = Lazy::Query.new(@dataset, column)
            query.adapter.new(@dataset, column).execute(split)
          end.compact
        end
      end
    end
  end
end
@@ -0,0 +1,25 @@
1
module EasyML
  class Dataset
    class Learner
      # Base query object: captures the dataset, the column and the column's
      # current and raw dtypes for the adapter subclasses.
      class Query
        attr_reader :dataset, :column, :dtype, :raw_dtype

        # Resolves the column's dtype, falling back to inferring it from the
        # column's raw data when no datatype is stored.
        #
        # @raise [RuntimeError] when the dtype lookup fails for any reason
        def initialize(dataset, column)
          @dataset = dataset
          @column = column
          # TODO: LAZIFY THIS
          @dtype = column.datatype || EasyML::Data::PolarsColumn.determine_type(column.raw.data[column.name])
          @raw_dtype = column.raw_dtype
        rescue StandardError
          raise "Unable to find column #{column.name}. If this column is computed by a feature, you forgot to declare computes_columns"
        end

        # Delegates to the adapter supplied by a subclass.
        def execute(split)
          adapter.execute(split)
        end
      end
    end
  end
end
@@ -0,0 +1,100 @@
1
module EasyML
  class Dataset
    # Learns and persists column statistics for a dataset by combining the
    # Lazy and Eager learners.
    class Learner
      include EasyML::Timing
      attr_accessor :dataset, :columns, :type, :computed, :raw_columns, :statistics

      # @param dataset the dataset whose columns are learned
      # @param type [Symbol] dataset type being learned (defaults to :raw)
      # @param computed [Boolean] when true, restrict learning to computed
      #   columns. Previously this flag was read before it was ever assigned,
      #   so the restriction could never apply; it is now an explicit,
      #   optional keyword and the scope is applied before the relation is
      #   turned into an Array by sort_by.
      def initialize(dataset, type: :raw, computed: false)
        @dataset = dataset
        @type = type
        @computed = computed

        candidates = dataset.columns.reload.needs_learn
        candidates = candidates.computed if computed

        @columns = candidates.sort_by(&:name).select(&:persisted?).reject(&:empty?)
      end

      # Syncs column metadata with the raw schema, learns statistics, then
      # persists them.
      def learn
        prepare
        learn_statistics
        save_statistics
      end

      private

      # Writes the learned statistics onto each column, upserts the columns
      # and refreshes feature lineage.
      def save_statistics
        columns.each do |col|
          col.merge_statistics(statistics.dig(col.name))
          col.set_sample_values
          col.assign_attributes(
            learned_at: EasyML::Support::UTC.now,
            last_datasource_sha: col.dataset.last_datasource_sha,
            last_feature_sha: col.feature&.sha,
            is_learning: type == :raw,
          )
        end

        # NOTE(review): last_feature_sha is assigned above but is not in the
        # upsert column list below — confirm whether it should be persisted.
        EasyML::Column.import(columns, on_duplicate_key_update: { columns: %i[
                                         statistics
                                         learned_at
                                         sample_values
                                         last_datasource_sha
                                         is_learning
                                       ] })
        dataset.columns.set_feature_lineage(columns)
      end

      measure_method_timing :save_statistics

      # Merges lazy and eager results and pivots them from
      # { type => { column => stats } } to { column => { type => stats } }.
      # The result is memoised in @statistics.
      def learn_statistics
        return @statistics if @statistics

        @statistics = lazy_statistics.deep_merge!(eager_statistics).reduce({}) do |h, (type, stat_group)|
          h.tap do
            stat_group.each do |statistic, value|
              h[statistic] ||= {}
              h[statistic][type] = value
            end
          end
        end.with_indifferent_access

        # One-hot columns are skipped for :processed by the learners, so
        # their processed statistics are copied from the raw ones.
        if type != :raw
          columns.select(&:one_hot?).each do |column|
            @statistics[column.name][:processed] = @statistics[column.name][:raw]
          end
        end
      end

      measure_method_timing :learn_statistics

      # Flags each column as present/absent in the raw schema and fills in a
      # datatype from the schema when the column has none stored.
      def prepare
        @schema = EasyML::Data::PolarsSchema.simplify(@dataset.raw_schema).symbolize_keys
        @raw_columns = @schema.keys.sort.map(&:to_s)
        columns.each do |column|
          attrs = {
            in_raw_dataset: @raw_columns.include?(column.name),
            # nil (dropped by compact) leaves an existing datatype untouched.
            datatype: column.read_attribute(:datatype).present? ? nil : @schema[column.name.to_sym],
          }.compact
          column.assign_attributes(attrs)
        end
        EasyML::Column.import(columns, on_duplicate_key_update: { columns: %i[in_raw_dataset datatype] })
      end

      measure_method_timing :prepare

      def lazy_statistics
        Lazy.new(dataset, columns, type: type).learn
      end

      measure_method_timing :lazy_statistics

      def eager_statistics
        Eager.new(dataset, columns, type: type).learn
      end

      measure_method_timing :eager_statistics
    end
  end
end