easy_ml 0.2.0.pre.rc58 → 0.2.0.pre.rc60
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/app/controllers/easy_ml/application_controller.rb +4 -0
- data/app/controllers/easy_ml/datasets_controller.rb +32 -1
- data/app/frontend/components/DatasetPreview.tsx +50 -19
- data/app/frontend/components/dataset/ColumnConfigModal.tsx +7 -1
- data/app/frontend/components/dataset/ColumnFilters.tsx +37 -3
- data/app/frontend/components/dataset/ColumnList.tsx +14 -2
- data/app/frontend/components/dataset/PreprocessingConfig.tsx +81 -20
- data/app/frontend/types/dataset.ts +3 -0
- data/app/jobs/easy_ml/compute_feature_job.rb +0 -3
- data/app/jobs/easy_ml/refresh_dataset_job.rb +0 -6
- data/app/models/easy_ml/column/imputers/base.rb +89 -0
- data/app/models/easy_ml/column/imputers/categorical.rb +35 -0
- data/app/models/easy_ml/column/imputers/clip.rb +30 -0
- data/app/models/easy_ml/column/imputers/constant.rb +27 -0
- data/app/models/easy_ml/column/imputers/ffill.rb +29 -0
- data/app/models/easy_ml/column/imputers/imputer.rb +103 -0
- data/app/models/easy_ml/column/imputers/mean.rb +27 -0
- data/app/models/easy_ml/column/imputers/median.rb +27 -0
- data/app/models/easy_ml/column/imputers/most_frequent.rb +27 -0
- data/app/models/easy_ml/column/imputers/null_imputer.rb +15 -0
- data/app/models/easy_ml/column/imputers/one_hot_encoder.rb +30 -0
- data/app/models/easy_ml/column/imputers/ordinal_encoder.rb +78 -0
- data/app/models/easy_ml/column/imputers/today.rb +20 -0
- data/app/models/easy_ml/column/imputers.rb +126 -0
- data/app/models/easy_ml/column/learner.rb +18 -0
- data/app/models/easy_ml/column/learners/base.rb +103 -0
- data/app/models/easy_ml/column/learners/boolean.rb +11 -0
- data/app/models/easy_ml/column/learners/categorical.rb +51 -0
- data/app/models/easy_ml/column/learners/datetime.rb +19 -0
- data/app/models/easy_ml/column/learners/null.rb +22 -0
- data/app/models/easy_ml/column/learners/numeric.rb +33 -0
- data/app/models/easy_ml/column/learners/string.rb +15 -0
- data/app/models/easy_ml/column/lineage/base.rb +22 -0
- data/app/models/easy_ml/column/lineage/computed_by_feature.rb +23 -0
- data/app/models/easy_ml/column/lineage/preprocessed.rb +23 -0
- data/app/models/easy_ml/column/lineage/raw_dataset.rb +23 -0
- data/app/models/easy_ml/column/lineage.rb +28 -0
- data/app/models/easy_ml/column/selector.rb +96 -0
- data/app/models/easy_ml/column.rb +319 -52
- data/app/models/easy_ml/column_history.rb +29 -22
- data/app/models/easy_ml/column_list.rb +63 -78
- data/app/models/easy_ml/dataset.rb +128 -96
- data/app/models/easy_ml/dataset_history.rb +23 -23
- data/app/models/easy_ml/datasource.rb +3 -0
- data/app/models/easy_ml/datasource_history.rb +1 -0
- data/app/models/easy_ml/datasources/file_datasource.rb +1 -1
- data/app/models/easy_ml/datasources/polars_datasource.rb +6 -12
- data/app/models/easy_ml/datasources/s3_datasource.rb +1 -1
- data/app/models/easy_ml/feature.rb +19 -7
- data/app/models/easy_ml/feature_history.rb +12 -0
- data/app/models/easy_ml/feature_list.rb +15 -0
- data/app/serializers/easy_ml/column_serializer.rb +11 -1
- data/app/serializers/easy_ml/dataset_serializer.rb +23 -2
- data/config/initializers/enumerable.rb +17 -0
- data/lib/easy_ml/data/date_converter.rb +137 -30
- data/lib/easy_ml/data/polars_column.rb +17 -0
- data/lib/easy_ml/data/polars_in_memory.rb +30 -0
- data/lib/easy_ml/data/polars_reader.rb +20 -1
- data/lib/easy_ml/data/splits/in_memory_split.rb +3 -5
- data/lib/easy_ml/data/splits/split.rb +2 -1
- data/lib/easy_ml/data/synced_directory.rb +1 -1
- data/lib/easy_ml/data.rb +1 -2
- data/lib/easy_ml/feature_store.rb +33 -22
- data/lib/easy_ml/railtie/generators/migration/migration_generator.rb +4 -0
- data/lib/easy_ml/railtie/templates/migration/add_computed_columns_to_easy_ml_columns.rb.tt +4 -0
- data/lib/easy_ml/railtie/templates/migration/add_last_feature_sha_to_columns.rb.tt +9 -0
- data/lib/easy_ml/railtie/templates/migration/add_learned_at_to_easy_ml_columns.rb.tt +13 -0
- data/lib/easy_ml/railtie/templates/migration/add_sha_to_datasources_datasets_and_columns.rb.tt +21 -0
- data/lib/easy_ml/railtie/templates/migration/remove_preprocessor_statistics_from_easy_ml_datasets.rb.tt +11 -0
- data/lib/easy_ml/version.rb +1 -1
- data/lib/tasks/profile.rake +40 -0
- data/public/easy_ml/assets/.vite/manifest.json +2 -2
- data/public/easy_ml/assets/assets/Application-BbFobaXt.css +1 -0
- data/public/easy_ml/assets/assets/entrypoints/Application.tsx-Dni_GM8r.js +489 -0
- data/public/easy_ml/assets/assets/entrypoints/Application.tsx-Dni_GM8r.js.map +1 -0
- metadata +41 -10
- data/app/models/easy_ml/adapters/base_adapter.rb +0 -45
- data/app/models/easy_ml/adapters/polars_adapter.rb +0 -77
- data/lib/easy_ml/data/preprocessor.rb +0 -340
- data/lib/easy_ml/data/simple_imputer.rb +0 -255
- data/lib/easy_ml/data/statistics_learner.rb +0 -193
- data/public/easy_ml/assets/assets/Application-BUsRR6b6.css +0 -1
- data/public/easy_ml/assets/assets/entrypoints/Application.tsx-DmkdJsDd.js +0 -474
- data/public/easy_ml/assets/assets/entrypoints/Application.tsx-DmkdJsDd.js.map +0 -1
data/app/models/easy_ml/column.rb

@@ -2,23 +2,29 @@
 #
 # Table name: easy_ml_columns
 #
-# id
-# dataset_id
-# name
-# description
-# datatype
-# polars_datatype
-# is_target
-# hidden
-# drop_if_null
-# preprocessing_steps
-# sample_values
-# statistics
-# created_at
-# updated_at
-# is_date_column
-# computed_by
-# is_computed
+# id :bigint not null, primary key
+# dataset_id :bigint not null
+# name :string not null
+# description :string
+# datatype :string
+# polars_datatype :string
+# is_target :boolean default(FALSE)
+# hidden :boolean default(FALSE)
+# drop_if_null :boolean default(FALSE)
+# preprocessing_steps :json
+# sample_values :json
+# statistics :json
+# created_at :datetime not null
+# updated_at :datetime not null
+# is_date_column :boolean default(FALSE)
+# computed_by :string
+# is_computed :boolean default(FALSE)
+# feature_id :bigint
+# learned_at :datetime
+# is_learning :boolean default(FALSE)
+# last_datasource_sha :string
+# last_feature_sha :string
+# configuration_changed_at :datetime
 #
 module EasyML
   class Column < ActiveRecord::Base
@@ -27,6 +33,7 @@ module EasyML
     historiographer_mode :snapshot_only
 
     belongs_to :dataset, class_name: "EasyML::Dataset"
+    belongs_to :feature, class_name: "EasyML::Feature", optional: true
 
     validates :name, presence: true
     validates :name, uniqueness: { scope: :dataset_id }
@@ -34,6 +41,8 @@ module EasyML
     before_save :ensure_valid_datatype
     after_save :handle_date_column_change
     before_save :set_defaults
+    before_save :set_feature_lineage
+    before_save :set_polars_datatype
 
     # Scopes
     scope :visible, -> { where(hidden: false) }
@@ -41,9 +50,71 @@ module EasyML
     scope :categorical, -> { where(datatype: %w[categorical string boolean]) }
     scope :datetime, -> { where(datatype: "datetime") }
     scope :date_column, -> { where(is_date_column: true) }
-    scope :
+    scope :not_preprocessed, -> { where("preprocessing_steps IS NULL OR preprocessing_steps::text = '{}'::text") }
+    scope :preprocessed, -> { where("preprocessing_steps IS NOT NULL AND preprocessing_steps::text != '{}'::text") }
+    scope :required, -> { raw.visible.not_target.not_preprocessed }
+    scope :optional, -> { required.not }
+    scope :target, -> { where(is_target: true) }
+    scope :not_target, -> { where(is_target: false) }
     scope :api_inputs, -> { where(is_computed: false, hidden: false, is_target: false) }
     scope :computed, -> { where(is_computed: true) }
+    scope :raw, -> { where(is_computed: false) }
+    scope :needs_learn, -> {
+      datasource_changed
+        .or(feature_applied)
+        .or(feature_changed)
+        .or(column_changed)
+        .or(never_learned)
+        .or(is_learning)
+    }
+
+    scope :datasource_changed, -> {
+      left_joins(dataset: :datasource)
+        .left_joins(:feature)
+        .where(
+          arel_table[:last_datasource_sha].not_eq(
+            Datasource.arel_table[:sha]
+          )
+        )
+    }
+
+    scope :feature_changed, -> {
+      where(feature_id: Feature.has_changes.map(&:id))
+    }
+
+    scope :feature_applied, -> {
+      left_joins(dataset: :datasource)
+        .left_joins(:feature)
+        .where(
+          Feature.arel_table[:applied_at].gt(
+            Arel.sql("COALESCE(#{arel_table.name}.learned_at, '1970-01-01')")
+          ).and(
+            arel_table[:feature_id].not_eq(nil)
+          )
+        )
+    }
+
+    scope :column_changed, -> {
+      left_joins(dataset: :datasource)
+        .left_joins(:feature)
+        .where(Dataset.arel_table[:refreshed_at].lt(arel_table[:updated_at]))
+    }
+
+    scope :never_learned, -> {
+      left_joins(dataset: :datasource)
+        .left_joins(:feature)
+        .where(arel_table[:learned_at].eq(nil))
+        .where(Datasource.arel_table[:sha].not_eq(nil))
+    }
+    scope :is_learning, -> { where(is_learning: true) }
+
+    def display_attributes
+      attributes.except(:statistics)
+    end
+
+    def inspect
+      "#<#{self.class.name} #{display_attributes.map { |k, v| "#{k}: #{v}" }.join(", ")}>"
+    end
 
     def aliases
       [name].concat(virtual_columns)
@@ -57,12 +128,174 @@ module EasyML
       end
     end
 
+    delegate :raw, :processed, :data, :train, :test, :valid, :clipped, to: :data_selector
+
+    def empty?
+      data.blank?
+    end
+
+    def learn(type: :all)
+      return if (!in_raw_dataset? && type != :processed)
+
+      if !in_raw_dataset? && read_attribute(:datatype).nil?
+        assign_attributes(datatype: processed.data.to_series.dtype)
+      end
+      set_sample_values
+      assign_attributes(statistics: (read_attribute(:statistics) || {}).symbolize_keys.merge!(learner.learn(type: type).symbolize_keys))
+      assign_attributes(
+        learned_at: UTC.now,
+        last_datasource_sha: dataset.last_datasource_sha,
+        last_feature_sha: feature&.sha,
+        is_learning: type == :raw,
+      )
+    end
+
+    def set_configuration_changed_at
+      if preprocessing_steps_changed? || datatype_changed?
+        self.configuration_changed_at = Time.now
+      end
+    end
+
+    def set_sample_values
+      use_processed = !one_hot? && processed.data(limit: 1).present? && in_raw_dataset?
+
+      base = use_processed ? processed : raw
+      sample_values = base.data(limit: 5, unique: true)
+      if sample_values.columns.include?(name)
+        sample_values = sample_values[name].to_a.uniq[0...5]
+        assign_attributes(sample_values: sample_values)
+      end
+    end
+
+    def transform(df, inference: false, computed: false)
+      imputer = inference && imputers.inference.anything? ? imputers.inference : imputers.training
+
+      df = imputer.transform(df)
+      df
+    end
+
+    def imputers
+      @imputers ||= Column::Imputers.new(self)
+    end
+
+    def decode_labels(df)
+      imputers.training.decode_labels(df)
+    end
+
+    def preprocessed?
+      !preprocessing_steps.blank?
+    end
+
     def datatype=(dtype)
+      if dtype.is_a?(Polars::DataType)
+        dtype = EasyML::Data::PolarsColumn.polars_to_sym(dtype)
+      end
       write_attribute(:datatype, dtype)
-
+      set_polars_datatype
+    end
+
+    def datatype
+      read_attribute(:datatype) || write_attribute(:datatype, assumed_datatype)
+    end
+
+    def raw_dtype
+      return @raw_dtype if @raw_dtype
+
+      if in_raw_dataset?
+        @raw_dtype = raw&.data&.to_series&.dtype
+      elsif already_computed?
+        @raw_dtype = processed&.data&.to_series&.dtype
+      end
+    end
+
+    def set_polars_datatype
+      raw_type = raw_dtype
+      user_type = get_polars_type(datatype)
+
+      if raw_type == user_type
+        # A raw type of Polars::Datetime might have extra information like timezone, so prefer the raw type
+        write_attribute(:polars_datatype, raw_type.to_s)
+      else
+        # If a user specified type doesn't match the raw type, use the user type
+        write_attribute(:polars_datatype, user_type.to_s)
+      end
+    end
+
+    def polars_datatype
+      begin
+        raw_attr = read_attribute(:polars_datatype)
+        if raw_attr.nil?
+          get_polars_type(datatype)
+        else
+          EasyML::Data::PolarsColumn.parse_polars_dtype(raw_attr)
+        end
+      rescue => e
+        get_polars_type(datatype)
+      end
+    end
+
+    EasyML::Data::PolarsColumn::TYPE_MAP.keys.each do |dtype|
+      define_method("#{dtype}?") do
+        datatype.to_s == dtype.to_s
+      end
+    end
+
+    def datasource_raw
+      dataset.datasource.query(select: name)
+    end
+
+    def already_computed?
+      is_computed && computing_feature&.fit_at.present? || computing_feature&.applied_at.present?
+    end
+
+    def assumed_datatype
+      return @assumed_datatype if @assumed_datatype
+
+      if in_raw_dataset?
+        series = (raw.data || datasource_raw).to_series
+        @assumed_datatype = EasyML::Data::PolarsColumn.determine_type(series)
+      elsif already_computed?
+        return nil if processed.data.nil?
+
+        @assumed_datatype = EasyML::Data::PolarsColumn.determine_type(processed.data.to_series)
+      end
+    end
+
+    def in_raw_dataset?
+      return false if dataset&.raw&.data.nil?
+
+      dataset.raw.data(all_columns: true)&.columns&.include?(name) || false
+    end
+
+    def computing_feature
+      dataset&.features&.detect { |feature| feature.computes_columns.include?(name) }.tap do |computing_feature|
+        if computing_feature.present? && feature_id != computing_feature.id
+          update(feature_id: computing_feature.id)
+        end
+      end
+    end
+
+    alias_method :feature, :computing_feature
+
+    def set_feature_lineage
+      if dataset.features.computed_column_names.include?(name)
+        if computed_by.nil?
+          assign_attributes(
+            is_computed: true,
+            computed_by: computing_feature&.name,
+          )
+        end
+      elsif computed_by.present?
+        assign_attributes(
+          is_computed: false,
+          computed_by: nil,
+        )
+      end
     end
 
     def get_polars_type(dtype)
+      return nil if dtype.nil?
+
       EasyML::Data::PolarsColumn::TYPE_MAP[dtype.to_sym]
     end
 
@@ -84,7 +317,7 @@ module EasyML
         next config unless config[:params]&.key?(:constant)
 
         config.deep_dup.tap do |c|
-          c[:params][:constant] =
+          c[:params][:constant] = cast(c[:params][:constant])
        end
      end
 
@@ -103,15 +336,47 @@ module EasyML
      preprocessing_steps.deep_symbolize_keys.dig(:training, :params, :ordinal_encoding) == true
    end
 
+    def encoding
+      return nil unless categorical?
+      return :ordinal if ordinal_encoding?
+      return :one_hot
+    end
+
+    def categorical_min
+      return default_categorical_min unless categorical?
+
+      (preprocessing_steps || {}).deep_symbolize_keys.dig(:training, :params, :categorical_min) || default_categorical_min
+    end
+
+    def default_categorical_min
+      1
+    end
+
+    def statistics
+      (read_attribute(:statistics) || {}).with_indifferent_access
+    end
+
    def allowed_categories
-
-      stats = dataset.statistics
+      stats = statistics
      return [] if stats.nil? || stats.blank?
 
      stats = stats.deep_symbolize_keys
-
+      type = is_computed? ? :processed : :raw
+      stats = stats.dig(type)
 
-
+      # Can we LEARN dtype during LEARN phase... for computed columns to deal with this ish man
+      sorted = (stats.dig(:allowed_categories) || []).sort_by(&method(:sort_by))
+      sorted = sorted.concat(["other"]) if categorical?
+      sorted
+    end
+
+    def sort_by(value)
+      case datatype.to_sym
+      when :boolean
+        value == true ? 1 : 0
+      else
+        value
+      end
    end
 
    def date_column?
@@ -119,19 +384,11 @@ module EasyML
    end
 
    def lineage
-
-        present_in_raw_dataset ? "Raw dataset" : nil,
-        computed_by ? "Computed by #{computed_by}" : nil,
-        preprocessing_steps.present? ? "Preprocessed using #{preprocessing_steps.keys.join(", ")}" : nil,
-      ].compact
+      @lineage ||= EasyML::Column::Lineage.new(self).lineage
    end
 
    def required?
-      is_computed && (preprocessing_steps.nil? || preprocessing_steps == {}) && !hidden && !is_target
-    end
-
-    def present_in_raw_dataset
-      dataset.raw.data&.columns&.include?(name) || false
+      !is_computed && (preprocessing_steps.nil? || preprocessing_steps == {}) && !hidden && !is_target
    end
 
    def sort_required
@@ -148,6 +405,28 @@ module EasyML
      }.compact
    end
 
+    def cast(value)
+      return value if value.nil?
+
+      case datatype&.to_sym
+      when :float
+        Float(value)
+      when :integer
+        Integer(value)
+      when :boolean
+        ActiveModel::Type::Boolean.new.cast(value)
+      when :datetime
+        value.is_a?(String) ? Time.parse(value) : value
+      when :categorical
+        value
+      else
+        value.to_s
+      end
+    rescue ArgumentError, TypeError
+      # If conversion fails, return original value
+      value
+    end
+
    private
 
    def set_defaults
@@ -247,26 +526,14 @@ module EasyML
        throw :abort
      end
 
-
-      return value if value.nil?
+    NUMERIC_METHODS = %i[mean median].freeze
 
-
-
-        Float(value)
-      when :integer
-        Integer(value)
-      when :boolean
-        ActiveModel::Type::Boolean.new.cast(value)
-      when :datetime
-        value.is_a?(String) ? Time.parse(value) : value
-      else
-        value.to_s
-      end
-    rescue ArgumentError, TypeError
-      # If conversion fails, return original value
-      value
+    def data_selector
+      @data_selector ||= Column::Selector.new(self)
    end
 
-
+    def learner
+      @learner ||= Column::Learner.new(self)
+    end
  end
 end
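The column.rb hunks above move per-column preprocessing onto the new Column::Imputers, Column::Learner, Column::Selector, and Column::Lineage objects. A minimal usage sketch of the new per-column API, assuming an already-synced EasyML::Dataset that exposes its columns through a `columns` association; the `dataset` and `df` variables and the "state" column name are hypothetical, not part of this diff:

# Sketch only: `dataset` is an assumed EasyML::Dataset, `df` an assumed Polars dataframe.
column = dataset.columns.find_by(name: "state") # "state" is a made-up column name

# Re-learn statistics for columns flagged by the new needs_learn scope
dataset.columns.needs_learn.each { |col| col.learn(type: :raw) }

# Apply the column's configured imputers to the dataframe
df = column.transform(df, inference: false)

# Map encoded categorical labels back to their original values
decoded = column.decode_labels(df)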
data/app/models/easy_ml/column_history.rb

@@ -2,28 +2,34 @@
 #
 # Table name: easy_ml_column_histories
 #
-# id
-# column_id
-# dataset_id
-# name
-# description
-# datatype
-# polars_datatype
-# is_target
-# hidden
-# drop_if_null
-# preprocessing_steps
-# sample_values
-# statistics
-# created_at
-# updated_at
-# history_started_at
-# history_ended_at
-# history_user_id
-# snapshot_id
-# is_date_column
-# computed_by
-# is_computed
+# id :bigint not null, primary key
+# column_id :integer not null
+# dataset_id :integer not null
+# name :string not null
+# description :string
+# datatype :string
+# polars_datatype :string
+# is_target :boolean default(FALSE)
+# hidden :boolean default(FALSE)
+# drop_if_null :boolean default(FALSE)
+# preprocessing_steps :json
+# sample_values :json
+# statistics :json
+# created_at :datetime not null
+# updated_at :datetime not null
+# history_started_at :datetime not null
+# history_ended_at :datetime
+# history_user_id :integer
+# snapshot_id :string
+# is_date_column :boolean default(FALSE)
+# computed_by :string
+# is_computed :boolean default(FALSE)
+# feature_id :bigint
+# learned_at :datetime
+# is_learning :boolean default(FALSE)
+# last_datasource_sha :string
+# last_feature_sha :string
+# configuration_changed_at :datetime
 #
 module EasyML
   class ColumnHistory < ActiveRecord::Base
@@ -31,5 +37,6 @@ module EasyML
     include Historiographer::History
     scope :required, -> { where(is_computed: false, hidden: false, is_target: false).where("preprocessing_steps IS NULL OR preprocessing_steps::text = '{}'::text") }
     scope :computed, -> { where(is_computed: true) }
+    scope :raw, -> { where(is_computed: false) }
   end
 end
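The new learned_at and last_datasource_sha columns annotated above are what the needs_learn sub-scopes compare against. A hedged query sketch using only scopes that appear in this diff (variable names are illustrative):

# Columns the needs_learn scope considers stale, e.g. after a datasource refresh.
stale = EasyML::Column.needs_learn
stale.pluck(:name, :learned_at, :last_datasource_sha)

# History snapshots can be partitioned with the newly added scope as well.
EasyML::ColumnHistory.raw.count      # snapshots of non-computed columns
EasyML::ColumnHistory.computed.count # snapshots of feature-computed columns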
data/app/models/easy_ml/column_list.rb

@@ -9,8 +9,7 @@ module EasyML
      col_names = syncable
      existing_columns = where(name: col_names)
      import_new(col_names, existing_columns)
-      update_existing(existing_columns)
-      set_feature_lineage
+      # update_existing(existing_columns)
 
      if delete
        delete_missing(col_names)
@@ -22,6 +21,64 @@ module EasyML
      end
    end
 
+    def transform(df, inference: false, computed: false)
+      return df if df.nil?
+
+      if computed
+        cols = column_list.computed
+      else
+        cols = column_list.raw
+      end
+
+      by_name = cols.index_by(&:name)
+      df.columns.each do |col|
+        column = by_name[col]
+        df = column.transform(df, inference: inference, computed: computed) if column
+      end
+
+      df
+    end
+
+    def learn(type: :raw, computed: false)
+      cols_to_learn = column_list.reload.needs_learn
+      cols_to_learn = cols_to_learn.computed if computed
+      cols_to_learn = cols_to_learn.select(&:persisted?).reject(&:empty?)
+      cols_to_learn.each { |col| col.learn(type: type) }
+      EasyML::Column.import(cols_to_learn, on_duplicate_key_update: { columns: %i[
+        statistics
+        learned_at
+        sample_values
+        last_datasource_sha
+        is_learning
+        datatype
+        polars_datatype
+      ] })
+      set_feature_lineage
+      reload
+    end
+
+    def set_feature_lineage
+      names = dataset.features.computed_column_names
+      columns = where(name: names, computed_by: nil).map do |col|
+        col.assign_attributes(
+          is_computed: true,
+          computed_by: col.computing_feature&.name,
+        )
+        col
+      end
+      EasyML::Column.import(columns, on_duplicate_key_update: { columns: %i[ is_computed computed_by ] })
+    end
+
+    def statistics
+      stats = { raw: {}, processed: {} }
+      select(&:persisted?).inject(stats) do |h, col|
+        h.tap do
+          h[:raw][col.name] = col.statistics.dig(:raw)
+          h[:processed][col.name] = col.statistics.dig(:processed)
+        end
+      end.with_indifferent_access
+    end
+
    def one_hots
      column_list.select(&:one_hot?)
    end
@@ -60,94 +117,22 @@ module EasyML
 
    private
 
-    def set_feature_lineage
-      # Get all features that compute columns
-      features_computing_columns = dataset.features.all.map do |feature|
-        [feature.name, feature.computes_columns]
-      end.compact.to_h
-
-      updates = column_list.reload.map do |column|
-        # Check if column is computed by any feature
-        computing_feature = features_computing_columns.find { |_, cols| cols.include?(column.name) }&.first
-        is_computed = !computing_feature.nil?
-
-        column.assign_attributes(
-          computed_by: computing_feature,
-          is_computed: is_computed,
-        )
-        next unless column.changed?
-
-        column
-      end.compact
-      EasyML::Column.import(updates.to_a, { on_duplicate_key_update: { columns: %i[computed_by is_computed] } })
-      cols = EasyML::Column.where(id: updates.map(&:id)).to_a
-      column_list.bulk_record_history(cols, { history_user_id: 1 })
-    end
-
    def import_new(new_columns, existing_columns)
      new_columns = new_columns - existing_columns.map(&:name)
      cols_to_insert = new_columns.map do |col_name|
-        EasyML::Column.new(
+        col = EasyML::Column.new(
          name: col_name,
          dataset_id: dataset.id,
        )
+        col
      end
      EasyML::Column.import(cols_to_insert)
+      set_feature_lineage
      column_list.reload
    end
 
-    def update_existing(existing_columns)
-      stats = dataset.statistics
-      use_processed = dataset.processed.data(limit: 1).present?
-      cached_sample = use_processed ? dataset.processed.data(limit: 10, all_columns: true) : dataset.raw.data(limit: 10, all_columns: true)
-      existing_types = existing_columns.map(&:name).zip(existing_columns.map(&:datatype)).to_h
-      polars_types = cached_sample.columns.zip((cached_sample.dtypes.map do |dtype|
-        EasyML::Data::PolarsColumn.polars_to_sym(dtype).to_s
-      end)).to_h
-
-      existing_columns.each do |column|
-        new_polars_type = polars_types[column.name]
-        existing_type = existing_types[column.name]
-        schema_type = dataset.schema[column.name]
-
-        # Keep both datatype and polars_datatype if it's an ordinal encoding case
-        if column.ordinal_encoding?
-          actual_type = existing_type
-          actual_schema_type = existing_type
-        else
-          actual_type = new_polars_type
-          actual_schema_type = schema_type
-        end
-
-        if column.one_hot?
-          base = dataset.raw
-          processed = stats.dig("raw", column.name).dup
-          processed["null_count"] = 0
-          actual_schema_type = "categorical"
-          actual_type = "categorical"
-        else
-          base = use_processed ? dataset.processed : dataset.raw
-          processed = stats.dig("processed", column.name)
-        end
-        sample_values = base.send(:data, unique: true, limit: 5, all_columns: true, select: column.name)[column.name].to_a.uniq[0...5]
-
-        column.assign_attributes(
-          statistics: {
-            raw: stats.dig("raw", column.name),
-            processed: processed,
-          },
-          datatype: actual_schema_type,
-          polars_datatype: actual_type,
-          sample_values: sample_values,
-        )
-      end
-      EasyML::Column.import(existing_columns.to_a,
-                            { on_duplicate_key_update: { columns: %i[statistics datatype polars_datatype
-                                                                     sample_values computed_by is_computed] } })
-    end
-
    def delete_missing(col_names)
-      raw_cols = dataset.best_segment.
+      raw_cols = dataset.best_segment.data(all_columns: true, limit: 1).columns
      raw_cols = where(name: raw_cols)
      columns_to_delete = column_list.select do |col|
        col_names.exclude?(col.name) &&
|