RubyGems - easy_ml - Versions diffs - 0.2.0.pre.rc72 → 0.2.0.pre.rc75 - Mend

easy_ml 0.2.0.pre.rc72 → 0.2.0.pre.rc75

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (115)

checksums.yaml +4 -4
data/app/controllers/easy_ml/datasets_controller.rb +33 -0
data/app/controllers/easy_ml/datasources_controller.rb +7 -0
data/app/controllers/easy_ml/models_controller.rb +38 -0
data/app/frontend/components/DatasetCard.tsx +212 -0
data/app/frontend/components/ModelCard.tsx +69 -29
data/app/frontend/components/StackTrace.tsx +13 -0
data/app/frontend/components/dataset/FeatureConfigPopover.tsx +10 -7
data/app/frontend/components/datasets/UploadDatasetButton.tsx +51 -0
data/app/frontend/components/models/DownloadModelModal.tsx +90 -0
data/app/frontend/components/models/UploadModelModal.tsx +212 -0
data/app/frontend/components/models/index.ts +2 -0
data/app/frontend/pages/DatasetsPage.tsx +36 -130
data/app/frontend/pages/DatasourcesPage.tsx +22 -2
data/app/frontend/pages/ModelsPage.tsx +37 -11
data/app/frontend/types/dataset.ts +1 -2
data/app/frontend/types.ts +1 -1
data/app/jobs/easy_ml/training_job.rb +2 -2
data/app/models/easy_ml/column/imputers/base.rb +4 -0
data/app/models/easy_ml/column/imputers/clip.rb +5 -3
data/app/models/easy_ml/column/imputers/imputer.rb +11 -13
data/app/models/easy_ml/column/imputers/mean.rb +7 -3
data/app/models/easy_ml/column/imputers/null_imputer.rb +3 -0
data/app/models/easy_ml/column/imputers/ordinal_encoder.rb +5 -1
data/app/models/easy_ml/column/imputers.rb +3 -1
data/app/models/easy_ml/column/lineage/base.rb +5 -1
data/app/models/easy_ml/column/lineage/computed_by_feature.rb +1 -1
data/app/models/easy_ml/column/lineage/preprocessed.rb +1 -1
data/app/models/easy_ml/column/lineage/raw_dataset.rb +1 -1
data/app/models/easy_ml/column/selector.rb +4 -0
data/app/models/easy_ml/column.rb +79 -63
data/app/models/easy_ml/column_history.rb +28 -28
data/app/models/easy_ml/column_list/imputer.rb +23 -0
data/app/models/easy_ml/column_list.rb +39 -26
data/app/models/easy_ml/dataset/learner/base.rb +34 -0
data/app/models/easy_ml/dataset/learner/eager/boolean.rb +10 -0
data/app/models/easy_ml/dataset/learner/eager/categorical.rb +51 -0
data/app/models/easy_ml/dataset/learner/eager/query.rb +37 -0
data/app/models/easy_ml/dataset/learner/eager.rb +43 -0
data/app/models/easy_ml/dataset/learner/lazy/boolean.rb +13 -0
data/app/models/easy_ml/dataset/learner/lazy/categorical.rb +10 -0
data/app/models/easy_ml/dataset/learner/lazy/datetime.rb +19 -0
data/app/models/easy_ml/dataset/learner/lazy/null.rb +17 -0
data/app/models/easy_ml/dataset/learner/lazy/numeric.rb +19 -0
data/app/models/easy_ml/dataset/learner/lazy/query.rb +69 -0
data/app/models/easy_ml/dataset/learner/lazy/string.rb +19 -0
data/app/models/easy_ml/dataset/learner/lazy.rb +51 -0
data/app/models/easy_ml/dataset/learner/query.rb +25 -0
data/app/models/easy_ml/dataset/learner.rb +100 -0
data/app/models/easy_ml/dataset.rb +150 -36
data/app/models/easy_ml/dataset_history.rb +1 -0
data/app/models/easy_ml/datasource.rb +9 -0
data/app/models/easy_ml/event.rb +4 -0
data/app/models/easy_ml/export/column.rb +27 -0
data/app/models/easy_ml/export/dataset.rb +37 -0
data/app/models/easy_ml/export/datasource.rb +12 -0
data/app/models/easy_ml/export/feature.rb +24 -0
data/app/models/easy_ml/export/model.rb +40 -0
data/app/models/easy_ml/export/retraining_job.rb +20 -0
data/app/models/easy_ml/export/splitter.rb +14 -0
data/app/models/easy_ml/feature.rb +21 -0
data/app/models/easy_ml/import/column.rb +35 -0
data/app/models/easy_ml/import/dataset.rb +148 -0
data/app/models/easy_ml/import/feature.rb +36 -0
data/app/models/easy_ml/import/model.rb +136 -0
data/app/models/easy_ml/import/retraining_job.rb +29 -0
data/app/models/easy_ml/import/splitter.rb +34 -0
data/app/models/easy_ml/lineage.rb +44 -0
data/app/models/easy_ml/model.rb +93 -36
data/app/models/easy_ml/model_file.rb +6 -0
data/app/models/easy_ml/models/xgboost/evals_callback.rb +7 -7
data/app/models/easy_ml/models/xgboost.rb +33 -9
data/app/models/easy_ml/retraining_job.rb +8 -1
data/app/models/easy_ml/retraining_run.rb +6 -4
data/app/models/easy_ml/splitter.rb +8 -0
data/app/models/lineage_history.rb +6 -0
data/app/serializers/easy_ml/column_serializer.rb +7 -1
data/app/serializers/easy_ml/dataset_serializer.rb +2 -1
data/app/serializers/easy_ml/lineage_serializer.rb +9 -0
data/config/routes.rb +13 -1
data/lib/easy_ml/core/tuner/adapters/base_adapter.rb +3 -3
data/lib/easy_ml/core/tuner.rb +12 -11
data/lib/easy_ml/data/polars_column.rb +149 -100
data/lib/easy_ml/data/polars_reader.rb +8 -5
data/lib/easy_ml/data/polars_schema.rb +56 -0
data/lib/easy_ml/data/splits/file_split.rb +20 -2
data/lib/easy_ml/data/splits/split.rb +10 -1
data/lib/easy_ml/data.rb +1 -0
data/lib/easy_ml/deep_compact.rb +19 -0
data/lib/easy_ml/feature_store.rb +2 -6
data/lib/easy_ml/railtie/generators/migration/migration_generator.rb +6 -0
data/lib/easy_ml/railtie/templates/migration/add_extra_metadata_to_columns.rb.tt +9 -0
data/lib/easy_ml/railtie/templates/migration/add_raw_schema_to_datasets.rb.tt +9 -0
data/lib/easy_ml/railtie/templates/migration/add_unique_constraint_to_easy_ml_model_names.rb.tt +8 -0
data/lib/easy_ml/railtie/templates/migration/create_easy_ml_lineages.rb.tt +24 -0
data/lib/easy_ml/railtie/templates/migration/remove_evaluator_from_retraining_jobs.rb.tt +7 -0
data/lib/easy_ml/railtie/templates/migration/update_preprocessing_steps_to_jsonb.rb.tt +18 -0
data/lib/easy_ml/timing.rb +34 -0
data/lib/easy_ml/version.rb +1 -1
data/lib/easy_ml.rb +2 -0
data/public/easy_ml/assets/.vite/manifest.json +2 -2
data/public/easy_ml/assets/assets/Application-Q7L6ioxr.css +1 -0
data/public/easy_ml/assets/assets/entrypoints/Application.tsx-Rrzo4ecT.js +522 -0
data/public/easy_ml/assets/assets/entrypoints/Application.tsx-Rrzo4ecT.js.map +1 -0
metadata +52 -12
data/app/models/easy_ml/column/learners/base.rb +0 -103
data/app/models/easy_ml/column/learners/boolean.rb +0 -11
data/app/models/easy_ml/column/learners/categorical.rb +0 -51
data/app/models/easy_ml/column/learners/datetime.rb +0 -19
data/app/models/easy_ml/column/learners/null.rb +0 -22
data/app/models/easy_ml/column/learners/numeric.rb +0 -33
data/app/models/easy_ml/column/learners/string.rb +0 -15
data/public/easy_ml/assets/assets/Application-B3sRjyMT.css +0 -1
data/public/easy_ml/assets/assets/entrypoints/Application.tsx-Dfg-nTrB.js +0 -489
data/public/easy_ml/assets/assets/entrypoints/Application.tsx-Dfg-nTrB.js.map +0 -1

data/app/models/easy_ml/column/imputers/mean.rb CHANGED Viewed

@@ -8,13 +8,17 @@ module EasyML
           "Mean imputation"
         end
+        def expr
+          return super unless mean.present?
+          Polars.col(column.name).fill_null(mean).alias(column.name)
+        end
         def transform(df)
           return df unless mean.present?
           mean = statistics(:mean)
-          df = df.with_column(
-            Polars.col(column.name).fill_null(mean).alias(column.name)
-          )
+          df = df.with_column(expr)
           df
         end

data/app/models/easy_ml/column/imputers/null_imputer.rb CHANGED Viewed

@@ -6,6 +6,9 @@ module EasyML
           false
         end
+        def exprs
+        end
         def method_missing(_name, df)
           df
         end

data/app/models/easy_ml/column/imputers/ordinal_encoder.rb CHANGED Viewed

@@ -50,7 +50,11 @@ module EasyML
         end
         def cast_encoder(encoder)
-          encoder.transform_keys { |k| column.cast(k) }
+          begin
+            encoder.transform_keys { |k| column.cast(k) }
+          rescue => e
+            binding.pry
+          end
         end
         def cast_decoder(decoder)

data/app/models/easy_ml/column/imputers.rb CHANGED Viewed

@@ -74,9 +74,10 @@ module EasyML
         @supported_methods ||= []
       end
-      def initialize(column)
+      def initialize(column, imputers: [])
         @column = column
         @dataset = column.dataset
+        @_imputers = imputers
       end
       class << self
@@ -97,6 +98,7 @@ module EasyML
             hash[key.to_sym] = Imputer.new(
               column,
               column.preprocessing_steps[key],
+              @_imputers
             )
           end
         end

data/app/models/easy_ml/column/lineage/base.rb CHANGED Viewed

@@ -9,11 +9,15 @@ module EasyML
           @dataset = column.dataset
         end
+        def expr
+          Polars.col(column.name)
+        end
         def as_json
           {
             key: key,
             description: description,
-            timestamp: timestamp,
+            occurred_at: occurred_at,
           }.with_indifferent_access
         end
       end

data/app/models/easy_ml/column/lineage/computed_by_feature.rb CHANGED Viewed

@@ -10,7 +10,7 @@ module EasyML
           "Computed by #{column.computed_by}"
         end
-        def timestamp
+        def occurred_at
           column.feature.fit_at || column.feature.applied_at
         end

data/app/models/easy_ml/column/lineage/preprocessed.rb CHANGED Viewed

@@ -10,7 +10,7 @@ module EasyML
           "Preprocessed using #{column.imputers.preprocessing_descriptions.join(", ")}"
         end
-        def timestamp
+        def occurred_at
           column.dataset.refreshed_at
         end

data/app/models/easy_ml/column/lineage/raw_dataset.rb CHANGED Viewed

@@ -10,7 +10,7 @@ module EasyML
           "Present in raw dataset"
         end
-        def timestamp
+        def occurred_at
           column.dataset.datasource.refreshed_at
         end

data/app/models/easy_ml/column/selector.rb CHANGED Viewed

@@ -1,6 +1,8 @@
 module EasyML
   class Column
     class Selector
+      include EasyML::Timing
       attr_accessor :selected, :dataset, :column, :transform
       def initialize(column, selected = nil, &block)
@@ -28,6 +30,8 @@ module EasyML
         end
       end
+      measure_method_timing :clipped
       def processed
         Selector.new(column, :processed)
       end

data/app/models/easy_ml/column.rb CHANGED Viewed

@@ -2,29 +2,29 @@
 #
 # Table name: easy_ml_columns
 #
-#  id                       :bigint           not null, primary key
-#  dataset_id               :bigint           not null
-#  name                     :string           not null
-#  description              :string
-#  datatype                 :string
-#  polars_datatype          :string
-#  is_target                :boolean          default(FALSE)
-#  hidden                   :boolean          default(FALSE)
-#  drop_if_null             :boolean          default(FALSE)
-#  preprocessing_steps      :json
-#  sample_values            :json
-#  statistics               :json
-#  created_at               :datetime         not null
-#  updated_at               :datetime         not null
-#  is_date_column           :boolean          default(FALSE)
-#  computed_by              :string
-#  is_computed              :boolean          default(FALSE)
-#  feature_id               :bigint
-#  learned_at               :datetime
-#  is_learning              :boolean          default(FALSE)
-#  last_datasource_sha      :string
-#  last_feature_sha         :string
-#  configuration_changed_at :datetime
+#  id                  :bigint           not null, primary key
+#  dataset_id          :bigint           not null
+#  name                :string           not null
+#  description         :string
+#  datatype            :string
+#  polars_datatype     :string
+#  is_target           :boolean          default(FALSE)
+#  hidden              :boolean          default(FALSE)
+#  drop_if_null        :boolean          default(FALSE)
+#  preprocessing_steps :jsonb
+#  sample_values       :json
+#  statistics          :json
+#  created_at          :datetime         not null
+#  updated_at          :datetime         not null
+#  is_date_column      :boolean          default(FALSE)
+#  computed_by         :string
+#  is_computed         :boolean          default(FALSE)
+#  feature_id          :bigint
+#  learned_at          :datetime
+#  is_learning         :boolean          default(FALSE)
+#  last_datasource_sha :string
+#  last_feature_sha    :string
+#  in_raw_dataset      :boolean
 #
 module EasyML
   class Column < ActiveRecord::Base
@@ -32,8 +32,11 @@ module EasyML
     include Historiographer::Silent
     historiographer_mode :snapshot_only
+    include EasyML::Timing
     belongs_to :dataset, class_name: "EasyML::Dataset"
     belongs_to :feature, class_name: "EasyML::Feature", optional: true
+    has_many :lineages, class_name: "EasyML::Lineage"
     validates :name, presence: true
     validates :name, uniqueness: { scope: :dataset_id }
@@ -43,7 +46,7 @@ module EasyML
     before_save :set_defaults
     before_save :set_feature_lineage
     before_save :set_polars_datatype
-    after_find :ensure_feature_exists
+    # after_find :ensure_feature_exists
     # Scopes
     scope :visible, -> { where(hidden: false) }
@@ -60,6 +63,7 @@ module EasyML
     scope :api_inputs, -> { where(is_computed: false, hidden: false, is_target: false) }
     scope :computed, -> { where(is_computed: true) }
     scope :raw, -> { where(is_computed: false) }
+    scope :has_clip, -> { where("preprocessing_steps->'training'->>'params' IS NOT NULL AND preprocessing_steps->'training'->'params' @> jsonb_build_object('clip', jsonb_build_object())") }
     scope :needs_learn, -> {
             datasource_changed
               .or(feature_applied)
@@ -142,26 +146,10 @@ module EasyML
       data.blank?
     end
-    def learn(type: :all)
-      return if (!in_raw_dataset? && type != :processed)
-      if !in_raw_dataset? && read_attribute(:datatype).nil?
-        assign_attributes(datatype: processed.data.to_series.dtype)
-      end
-      set_sample_values
-      new_stats = learner.learn(type: type).symbolize_keys
-      if !in_raw_dataset?
-        new_stats[:raw] = new_stats[:processed]
-      end
+    def merge_statistics(new_stats)
+      return unless new_stats.present?
-      assign_attributes(statistics: (read_attribute(:statistics) || {}).symbolize_keys.merge!(new_stats))
-      assign_attributes(
-        learned_at: UTC.now,
-        last_datasource_sha: dataset.last_datasource_sha,
-        last_feature_sha: feature&.sha,
-        is_learning: type == :raw,
-      )
+      assign_attributes(statistics: (statistics || {}).symbolize_keys.deep_merge!(new_stats))
     end
     def set_configuration_changed_at
@@ -174,7 +162,7 @@ module EasyML
       use_processed = !one_hot? && processed.data(limit: 1).present? && in_raw_dataset?
       base = use_processed ? processed : raw
-      sample_values = base.data(limit: 5, unique: true)
+      sample_values = base.data(limit: 5, unique: true, select: [name])
       if sample_values.columns.include?(name)
         sample_values = sample_values[name].to_a.uniq[0...5]
         assign_attributes(sample_values: sample_values)
@@ -188,8 +176,8 @@ module EasyML
       df
     end
-    def imputers
-      @imputers ||= Column::Imputers.new(self)
+    def imputers(imputers = [])
+      @imputers ||= Column::Imputers.new(self, imputers: imputers)
     end
     def decode_labels(df)
@@ -202,29 +190,29 @@ module EasyML
     def datatype=(dtype)
       if dtype.is_a?(Polars::DataType)
-        dtype = EasyML::Data::PolarsColumn.polars_to_sym(dtype)
+        dtype = polars_to_sym(dtype)
       end
       write_attribute(:datatype, dtype)
       set_polars_datatype
     end
+    def polars_to_sym(dtype)
+      EasyML::Data::PolarsColumn.polars_to_sym(dtype)
+    end
     def datatype
-      read_attribute(:datatype) || write_attribute(:datatype, assumed_datatype)
+      read_attribute(:datatype) || write_attribute(:datatype, polars_to_sym(assumed_datatype))
     end
     def raw_dtype
-      return @raw_dtype if @raw_dtype
-      set_feature_lineage
+      dtype = dataset.raw_schema[name]
+      return nil if dtype.nil?
-      if in_raw_dataset?
-        @raw_dtype = raw&.data&.to_series.try(:dtype)
-      elsif already_computed?
-        @raw_dtype = processed&.data&.to_series&.dtype
-      end
+      polars_to_sym(dtype)
     end
     def set_polars_datatype
-      raw_type = raw_dtype
+      raw_type = datatype
       user_type = get_polars_type(datatype)
       if raw_type == user_type
@@ -267,8 +255,11 @@ module EasyML
       return @assumed_datatype if @assumed_datatype
       if in_raw_dataset?
-        series = (raw.data || datasource_raw).to_series
-        @assumed_datatype = EasyML::Data::PolarsColumn.determine_type(series)
+        @assumed_datatype = dataset.raw_schema[name]
+        # series = (raw.data || datasource_raw).to_series
+        # @assumed_datatype = EasyML::Data::PolarsColumn.determine_type(series)
+      elsif dataset.processed_schema.present?
+        @assumed_datatype = dataset.processed_schema[name]
       elsif already_computed?
         return nil if processed.data.nil?
@@ -277,9 +268,16 @@ module EasyML
     end
     def in_raw_dataset?
+      value = read_attribute(:in_raw_dataset)
+      return value unless value.nil?
+      write_attribute(:in_raw_dataset, check_in_raw_dataset?)
+    end
+    def check_in_raw_dataset?
       return false if dataset&.raw&.data.nil?
-      dataset.raw.data(all_columns: true)&.columns&.include?(name) || false
+      dataset.raw.data(all_columns: true, lazy: true).schema.key?(name) || false
     end
     def computing_feature
@@ -398,10 +396,6 @@ module EasyML
       is_date_column
     end
-    def lineage
-      @lineage ||= EasyML::Column::Lineage.new(self).lineage
-    end
     def required?
       !is_computed && (preprocessing_steps.nil? || preprocessing_steps == {}) && !hidden && !is_target
     end
@@ -420,6 +414,28 @@ module EasyML
       }.compact
     end
+    UNCONFIGURABLE_COLUMNS = %w(
+      id
+      feature_id
+      dataset_id
+      last_datasource_sha
+      last_feature_sha
+      learned_at
+      is_learning
+      configuration_changed_at
+      statistics
+      created_at
+      updated_at
+    )
+    def to_config
+      EasyML::Export::Column.to_config(self)
+    end
+    def self.from_config(config, dataset, action: :create)
+      EasyML::Import::Column.from_config(config, dataset, action: action)
+    end
     def cast(value)
       return value if value.nil?

data/app/models/easy_ml/column_history.rb CHANGED Viewed

@@ -2,34 +2,34 @@
 #
 # Table name: easy_ml_column_histories
 #
-#  id                       :bigint           not null, primary key
-#  column_id                :integer          not null
-#  dataset_id               :integer          not null
-#  name                     :string           not null
-#  description              :string
-#  datatype                 :string
-#  polars_datatype          :string
-#  is_target                :boolean          default(FALSE)
-#  hidden                   :boolean          default(FALSE)
-#  drop_if_null             :boolean          default(FALSE)
-#  preprocessing_steps      :json
-#  sample_values            :json
-#  statistics               :json
-#  created_at               :datetime         not null
-#  updated_at               :datetime         not null
-#  history_started_at       :datetime         not null
-#  history_ended_at         :datetime
-#  history_user_id          :integer
-#  snapshot_id              :string
-#  is_date_column           :boolean          default(FALSE)
-#  computed_by              :string
-#  is_computed              :boolean          default(FALSE)
-#  feature_id               :bigint
-#  learned_at               :datetime
-#  is_learning              :boolean          default(FALSE)
-#  last_datasource_sha      :string
-#  last_feature_sha         :string
-#  configuration_changed_at :datetime
+#  id                  :bigint           not null, primary key
+#  column_id           :integer          not null
+#  dataset_id          :integer          not null
+#  name                :string           not null
+#  description         :string
+#  datatype            :string
+#  polars_datatype     :string
+#  is_target           :boolean          default(FALSE)
+#  hidden              :boolean          default(FALSE)
+#  drop_if_null        :boolean          default(FALSE)
+#  preprocessing_steps :jsonb
+#  sample_values       :json
+#  statistics          :json
+#  created_at          :datetime         not null
+#  updated_at          :datetime         not null
+#  history_started_at  :datetime         not null
+#  history_ended_at    :datetime
+#  history_user_id     :integer
+#  snapshot_id         :string
+#  is_date_column      :boolean          default(FALSE)
+#  computed_by         :string
+#  is_computed         :boolean          default(FALSE)
+#  feature_id          :bigint
+#  learned_at          :datetime
+#  is_learning         :boolean          default(FALSE)
+#  last_datasource_sha :string
+#  last_feature_sha    :string
+#  in_raw_dataset      :boolean
 #
 module EasyML
   class ColumnHistory < ActiveRecord::Base

data/app/models/easy_ml/column_list/imputer.rb ADDED Viewed

@@ -0,0 +1,23 @@
+module EasyML
+  module ColumnList
+    class Imputer
+      attr_accessor :dataset, :df, :inference, :columns
+      def initialize(dataset, df, columns: nil, imputers: [], inference: false)
+        @dataset = dataset
+        @df = df
+        @columns = (columns.nil? || columns.empty?) ? dataset.columns : columns
+        @inference = inference
+        @_imputers = imputers
+      end
+      def imputers
+        @imputers ||= columns.map { |column| inference ? column.imputers(@_imputers).inference : column.imputers(@_imputers).training }
+      end
+      def exprs
+        imputers.flat_map(&:exprs).compact
+      end
+    end
+  end
+end

data/app/models/easy_ml/column_list.rb CHANGED Viewed

@@ -1,6 +1,7 @@
 module EasyML
   module ColumnList
     include Historiographer::Relation
+    include EasyML::Timing
     def sync(delete: true)
       return unless dataset.schema.present?
@@ -39,35 +40,28 @@ module EasyML
       df
     end
+    measure_method_timing :transform
+    def apply_clip(df)
+      clip_cols = has_clip.raw
+      return df unless clip_cols.any?
+      clipped_exprs = EasyML::ColumnList::Imputer.new(
+        dataset,
+        df,
+        columns: clip_cols,
+        imputers: [:clip],
+      ).exprs
+      df.with_columns(clipped_exprs)
+    end
     def learn(type: :raw, computed: false)
-      cols_to_learn = column_list.reload.needs_learn
-      cols_to_learn = cols_to_learn.computed if computed
-      cols_to_learn = cols_to_learn.select(&:persisted?).reject(&:empty?)
-      cols_to_learn.each { |col| col.learn(type: type) }
-      EasyML::Column.import(cols_to_learn, on_duplicate_key_update: { columns: %i[
-                                             statistics
-                                             learned_at
-                                             sample_values
-                                             last_datasource_sha
-                                             is_learning
-                                             datatype
-                                             polars_datatype
-                                           ] })
-      set_feature_lineage
+      EasyML::Dataset::Learner.new(dataset, type: type).learn
       reload
     end
-    def set_feature_lineage
-      names = dataset.features.computed_column_names
-      columns = where(name: names, computed_by: nil).map do |col|
-        col.assign_attributes(
-          is_computed: true,
-          computed_by: col.computing_feature&.name,
-        )
-        col
-      end
-      EasyML::Column.import(columns, on_duplicate_key_update: { columns: %i[ is_computed computed_by ] })
-    end
+    measure_method_timing :learn
     def statistics
       stats = { raw: {}, processed: {} }
@@ -115,6 +109,25 @@ module EasyML
       column_list.sort_by { |col| [col.sort_required, col.name] }
     end
+    def set_feature_lineage(cols_to_learn)
+      names = dataset.features.computed_column_names
+      columns = where(name: names, computed_by: nil).map do |col|
+        col.assign_attributes(
+          is_computed: true,
+          computed_by: col.computing_feature&.name,
+        )
+        col
+      end
+      EasyML::Column.import(columns, on_duplicate_key_update: { columns: %i[ is_computed computed_by ] })
+      lineage = cols_to_learn.flat_map do |col|
+        EasyML::Lineage.learn(col)
+      end.compact
+      EasyML::Lineage.import(lineage, on_duplicate_key_update: { columns: %i[ column_id key occurred_at description ] })
+    end
+    measure_method_timing :set_feature_lineage
     private
     def import_new(new_columns, existing_columns)
@@ -127,7 +140,7 @@ module EasyML
         col
       end
       EasyML::Column.import(cols_to_insert)
-      set_feature_lineage
+      set_feature_lineage(cols_to_insert)
       column_list.reload
     end

data/app/models/easy_ml/dataset/learner/base.rb ADDED Viewed

@@ -0,0 +1,34 @@
+module EasyML
+  class Dataset
+    class Learner
+      class Base
+        attr_reader :dataset, :columns, :type
+        def initialize(dataset, columns, type: :raw)
+          @dataset = dataset
+          @columns = columns
+          @type = type
+        end
+        def skip_processing?(column, type)
+          (!column.in_raw_dataset? && type.to_sym != :processed) ||
+            (column.one_hot? && type.to_sym == :processed)
+        end
+        TYPES_ALL = %i(raw clipped processed)
+        TYPES_RAW = %i(raw clipped)
+        TYPES_PROCESSED = %i(processed)
+        def types(type = :all)
+          case type
+          when :all then TYPES_ALL
+          when :raw then TYPES_RAW
+          when :processed then TYPES_PROCESSED
+          else
+            TYPES_ALL
+          end
+        end
+      end
+    end
+  end
+end

data/app/models/easy_ml/dataset/learner/eager/boolean.rb ADDED Viewed

@@ -0,0 +1,10 @@
+module EasyML
+  class Dataset
+    class Learner
+      class Eager
+        class Boolean < Categorical
+        end
+      end
+    end
+  end
+end

data/app/models/easy_ml/dataset/learner/eager/categorical.rb ADDED Viewed

@@ -0,0 +1,51 @@
+module EasyML
+  class Dataset
+    class Learner
+      class Eager
+        class Categorical < Query
+          def train_query(df)
+            {
+              counts: counts(df).to_hash,
+              allowed_categories: allowed_categories(df).to_series.to_a,
+            }.merge!(
+              learn_encoder_decoder(df)
+            )
+          end
+          def learn_encoder_decoder(df)
+            unsorted = allowed_categories(df).lazy.with_row_count.collect.to_hash.invert
+            label_encoder = unsorted.transform_keys(&column.method(:cast)).keys.compact.sort_by(&column.method(:sort_by)).each.with_index.reduce({}) do |h, (k, i)|
+              h.tap do
+                h[k] = i
+              end
+            end
+            label_decoder = label_encoder.invert
+            {
+              label_encoder: label_encoder,
+              label_decoder: label_decoder,
+            }
+          end
+          def counts(df)
+            return @counts if @counts
+            @counts = df.group_by(column.name)
+                        .agg(Polars.col(column.name).count.alias("count"))
+          end
+          def allowed_categories(df)
+            return @allowed_categories if @allowed_categories
+            @allowed_categories = df.join(counts(df), on: column.name)
+              .filter(Polars.col("count").ge(column.categorical_min))
+              .select(column.name)
+              .unique
+              .sort(column.name, reverse: true)
+          end
+        end
+      end
+    end
+  end
+end