easy_ml 0.2.0.pre.rc58 → 0.2.0.pre.rc61

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (86)
  1. checksums.yaml +4 -4
  2. data/app/controllers/easy_ml/application_controller.rb +4 -0
  3. data/app/controllers/easy_ml/datasets_controller.rb +32 -1
  4. data/app/frontend/components/DatasetPreview.tsx +50 -19
  5. data/app/frontend/components/dataset/ColumnConfigModal.tsx +7 -1
  6. data/app/frontend/components/dataset/ColumnFilters.tsx +37 -3
  7. data/app/frontend/components/dataset/ColumnList.tsx +14 -2
  8. data/app/frontend/components/dataset/PreprocessingConfig.tsx +81 -20
  9. data/app/frontend/types/dataset.ts +3 -0
  10. data/app/jobs/easy_ml/compute_feature_job.rb +0 -3
  11. data/app/jobs/easy_ml/refresh_dataset_job.rb +0 -6
  12. data/app/models/easy_ml/column/imputers/base.rb +89 -0
  13. data/app/models/easy_ml/column/imputers/categorical.rb +35 -0
  14. data/app/models/easy_ml/column/imputers/clip.rb +30 -0
  15. data/app/models/easy_ml/column/imputers/constant.rb +27 -0
  16. data/app/models/easy_ml/column/imputers/ffill.rb +29 -0
  17. data/app/models/easy_ml/column/imputers/imputer.rb +103 -0
  18. data/app/models/easy_ml/column/imputers/mean.rb +27 -0
  19. data/app/models/easy_ml/column/imputers/median.rb +27 -0
  20. data/app/models/easy_ml/column/imputers/most_frequent.rb +27 -0
  21. data/app/models/easy_ml/column/imputers/null_imputer.rb +15 -0
  22. data/app/models/easy_ml/column/imputers/one_hot_encoder.rb +30 -0
  23. data/app/models/easy_ml/column/imputers/ordinal_encoder.rb +78 -0
  24. data/app/models/easy_ml/column/imputers/today.rb +20 -0
  25. data/app/models/easy_ml/column/imputers.rb +126 -0
  26. data/app/models/easy_ml/column/learner.rb +18 -0
  27. data/app/models/easy_ml/column/learners/base.rb +103 -0
  28. data/app/models/easy_ml/column/learners/boolean.rb +11 -0
  29. data/app/models/easy_ml/column/learners/categorical.rb +51 -0
  30. data/app/models/easy_ml/column/learners/datetime.rb +19 -0
  31. data/app/models/easy_ml/column/learners/null.rb +22 -0
  32. data/app/models/easy_ml/column/learners/numeric.rb +33 -0
  33. data/app/models/easy_ml/column/learners/string.rb +15 -0
  34. data/app/models/easy_ml/column/lineage/base.rb +22 -0
  35. data/app/models/easy_ml/column/lineage/computed_by_feature.rb +23 -0
  36. data/app/models/easy_ml/column/lineage/preprocessed.rb +23 -0
  37. data/app/models/easy_ml/column/lineage/raw_dataset.rb +23 -0
  38. data/app/models/easy_ml/column/lineage.rb +28 -0
  39. data/app/models/easy_ml/column/selector.rb +96 -0
  40. data/app/models/easy_ml/column.rb +319 -52
  41. data/app/models/easy_ml/column_history.rb +29 -22
  42. data/app/models/easy_ml/column_list.rb +63 -78
  43. data/app/models/easy_ml/dataset.rb +128 -96
  44. data/app/models/easy_ml/dataset_history.rb +23 -23
  45. data/app/models/easy_ml/datasource.rb +3 -0
  46. data/app/models/easy_ml/datasource_history.rb +1 -0
  47. data/app/models/easy_ml/datasources/file_datasource.rb +1 -1
  48. data/app/models/easy_ml/datasources/polars_datasource.rb +6 -12
  49. data/app/models/easy_ml/datasources/s3_datasource.rb +1 -1
  50. data/app/models/easy_ml/feature.rb +19 -7
  51. data/app/models/easy_ml/feature_history.rb +12 -0
  52. data/app/models/easy_ml/feature_list.rb +15 -0
  53. data/app/serializers/easy_ml/column_serializer.rb +11 -1
  54. data/app/serializers/easy_ml/dataset_serializer.rb +23 -2
  55. data/config/initializers/enumerable.rb +17 -0
  56. data/lib/easy_ml/data/date_converter.rb +137 -30
  57. data/lib/easy_ml/data/polars_column.rb +17 -0
  58. data/lib/easy_ml/data/polars_in_memory.rb +30 -0
  59. data/lib/easy_ml/data/polars_reader.rb +20 -1
  60. data/lib/easy_ml/data/splits/in_memory_split.rb +3 -5
  61. data/lib/easy_ml/data/splits/split.rb +2 -1
  62. data/lib/easy_ml/data/synced_directory.rb +1 -1
  63. data/lib/easy_ml/data.rb +1 -2
  64. data/lib/easy_ml/engine.rb +1 -0
  65. data/lib/easy_ml/feature_store.rb +33 -22
  66. data/lib/easy_ml/railtie/generators/migration/migration_generator.rb +4 -0
  67. data/lib/easy_ml/railtie/templates/migration/add_computed_columns_to_easy_ml_columns.rb.tt +4 -0
  68. data/lib/easy_ml/railtie/templates/migration/add_last_feature_sha_to_columns.rb.tt +9 -0
  69. data/lib/easy_ml/railtie/templates/migration/add_learned_at_to_easy_ml_columns.rb.tt +13 -0
  70. data/lib/easy_ml/railtie/templates/migration/add_sha_to_datasources_datasets_and_columns.rb.tt +21 -0
  71. data/lib/easy_ml/railtie/templates/migration/remove_preprocessor_statistics_from_easy_ml_datasets.rb.tt +11 -0
  72. data/lib/easy_ml/version.rb +1 -1
  73. data/lib/tasks/profile.rake +40 -0
  74. data/public/easy_ml/assets/.vite/manifest.json +2 -2
  75. data/public/easy_ml/assets/assets/Application-BbFobaXt.css +1 -0
  76. data/public/easy_ml/assets/assets/entrypoints/Application.tsx-Dni_GM8r.js +489 -0
  77. data/public/easy_ml/assets/assets/entrypoints/Application.tsx-Dni_GM8r.js.map +1 -0
  78. metadata +41 -10
  79. data/app/models/easy_ml/adapters/base_adapter.rb +0 -45
  80. data/app/models/easy_ml/adapters/polars_adapter.rb +0 -77
  81. data/lib/easy_ml/data/preprocessor.rb +0 -340
  82. data/lib/easy_ml/data/simple_imputer.rb +0 -255
  83. data/lib/easy_ml/data/statistics_learner.rb +0 -193
  84. data/public/easy_ml/assets/assets/Application-BUsRR6b6.css +0 -1
  85. data/public/easy_ml/assets/assets/entrypoints/Application.tsx-DmkdJsDd.js +0 -474
  86. data/public/easy_ml/assets/assets/entrypoints/Application.tsx-DmkdJsDd.js.map +0 -1
@@ -1,24 +1,24 @@
1
- # == Schetuma Information
1
+ # == Schema Information
2
2
  #
3
3
  # Table name: easy_ml_datasets
4
4
  #
5
- # id :bigint not null, primary key
6
- # name :string not null
7
- # description :string
8
- # dataset_type :string
9
- # status :string
10
- # version :string
11
- # datasource_id :bigint
12
- # root_dir :string
13
- # configuration :json
14
- # num_rows :bigint
15
- # workflow_status :string
16
- # statistics :json
17
- # preprocessor_statistics :json
18
- # schema :json
19
- # refreshed_at :datetime
20
- # created_at :datetime not null
21
- # updated_at :datetime not null
5
+ # id :bigint not null, primary key
6
+ # name :string not null
7
+ # description :string
8
+ # dataset_type :string
9
+ # status :string
10
+ # version :string
11
+ # datasource_id :bigint
12
+ # root_dir :string
13
+ # configuration :json
14
+ # num_rows :bigint
15
+ # workflow_status :string
16
+ # statistics :json
17
+ # schema :json
18
+ # refreshed_at :datetime
19
+ # created_at :datetime not null
20
+ # updated_at :datetime not null
21
+ # last_datasource_sha :string
22
22
  #
23
23
  module EasyML
24
24
  class Dataset < ActiveRecord::Base
@@ -45,7 +45,7 @@ module EasyML
45
45
  has_many :columns, class_name: "EasyML::Column", dependent: :destroy, extend: EasyML::ColumnList
46
46
  accepts_nested_attributes_for :columns, allow_destroy: true, update_only: true
47
47
 
48
- has_many :features, dependent: :destroy, class_name: "EasyML::Feature"
48
+ has_many :features, dependent: :destroy, class_name: "EasyML::Feature", extend: EasyML::FeatureList
49
49
  accepts_nested_attributes_for :features, allow_destroy: true
50
50
 
51
51
  has_many :events, as: :eventable, class_name: "EasyML::Event", dependent: :destroy
@@ -80,7 +80,7 @@ module EasyML
80
80
  column_types: EasyML::Data::PolarsColumn::TYPE_MAP.keys.map do |type|
81
81
  { value: type.to_s, label: type.to_s.titleize }
82
82
  end,
83
- preprocessing_strategies: EasyML::Data::Preprocessor.constants[:preprocessing_strategies],
83
+ preprocessing_strategies: EasyML::Column::Imputers.constants[:preprocessing_strategies],
84
84
  feature_options: EasyML::Features::Registry.list_flat,
85
85
  splitter_constants: EasyML::Splitter.constants,
86
86
  }
@@ -119,13 +119,6 @@ module EasyML
119
119
  processed.data(limit: 1)&.schema || raw.data(limit: 1)&.schema
120
120
  end
121
121
 
122
- def refresh_datatypes
123
- return unless columns_need_refresh?
124
-
125
- cleanup
126
- datasource.reread(columns)
127
- end
128
-
129
122
  def num_rows
130
123
  if datasource&.num_rows.nil?
131
124
  datasource.after_sync
@@ -142,7 +135,7 @@ module EasyML
142
135
 
143
136
  def best_segment
144
137
  [processed, raw].detect do |segment|
145
- segment.send(:train, all_columns: true, limit: 1)&.columns
138
+ segment.send(:data, all_columns: true, limit: 1)&.columns
146
139
  end
147
140
  end
148
141
 
@@ -168,15 +161,27 @@ module EasyML
168
161
  save
169
162
  end
170
163
 
164
+ def refreshed_datasource?
165
+ last_datasource_sha_changed?
166
+ end
167
+
168
+ def prepare_features
169
+ features.update_all(workflow_status: "ready")
170
+ end
171
+
171
172
  def prepare!
173
+ prepare_features
172
174
  cleanup
173
175
  refresh_datasource!
174
176
  split_data
177
+ process_data
175
178
  end
176
179
 
177
180
  def prepare
181
+ prepare_features
178
182
  refresh_datasource
179
183
  split_data
184
+ process_data
180
185
  end
181
186
 
182
187
  def actually_refresh
@@ -184,7 +189,8 @@ module EasyML
184
189
  learn(delete: false) # After syncing datasource, learn new statistics + sync columns
185
190
  process_data
186
191
  fully_reload
187
- learn # After processing data, we may have new columns from newly applied features
192
+ learn
193
+ learn_statistics(type: :processed) # After processing data, we learn any new statistics
188
194
  now = UTC.now
189
195
  update(workflow_status: "ready", refreshed_at: now, updated_at: now)
190
196
  fully_reload
@@ -252,19 +258,57 @@ module EasyML
252
258
  features_need_fit.any?
253
259
  end
254
260
 
255
- def refresh_reasons
261
+ # Some of these are expensive to calculate, so we only want to include
262
+ # them in the refresh reasons if they are actually needed.
263
+ #
264
+ # During dataset_serializer for instance, we don't want to check s3,
265
+ # we only do that during background jobs.
266
+ #
267
+ # So yes this is an annoying way to structure a method, but it's helpful for performance
268
+ #
269
+ def refresh_reasons(exclude: [])
256
270
  {
257
- "Not split" => not_split?,
258
- "Refreshed at is nil" => refreshed_at.nil?,
259
- "Columns need refresh" => columns_need_refresh?,
260
- "Features need refresh" => features_need_fit?,
261
- "Datasource needs refresh" => datasource_needs_refresh?,
262
- "Datasource was refreshed" => datasource_was_refreshed?,
263
- }.select { |k, v| v }.map { |k, v| k }
271
+ not_split: {
272
+ name: "Not split",
273
+ check: -> { not_split? },
274
+ },
275
+ refreshed_at_is_nil: {
276
+ name: "Refreshed at is nil",
277
+ check: -> { refreshed_at.nil? },
278
+ },
279
+ columns_need_refresh: {
280
+ name: "Columns need refresh",
281
+ check: -> { columns_need_refresh? },
282
+ },
283
+ features_need_fit: {
284
+ name: "Features need refresh",
285
+ check: -> { features_need_fit? },
286
+ },
287
+ datasource_needs_refresh: {
288
+ name: "Datasource needs refresh",
289
+ check: -> { datasource_needs_refresh? },
290
+ },
291
+ refreshed_datasource: {
292
+ name: "Refreshed datasource",
293
+ check: -> { refreshed_datasource? },
294
+ },
295
+ datasource_was_refreshed: {
296
+ name: "Datasource was refreshed",
297
+ check: -> { datasource_was_refreshed? },
298
+ },
299
+ }.except(*exclude).select do |k, config|
300
+ config[:check].call
301
+ end.map do |k, config|
302
+ config[:name]
303
+ end
304
+ end
305
+
306
+ def needs_refresh?(exclude: [])
307
+ refresh_reasons(exclude: exclude).any?
264
308
  end
265
309
 
266
- def needs_refresh?
267
- refresh_reasons.any?
310
+ def processed?
311
+ !needs_refresh?
268
312
  end
269
313
 
270
314
  def not_split?
@@ -281,7 +325,6 @@ module EasyML
281
325
 
282
326
  def learn(delete: true)
283
327
  learn_schema
284
- learn_statistics
285
328
  columns.sync(delete: delete)
286
329
  end
287
330
 
@@ -333,6 +376,8 @@ module EasyML
333
376
 
334
377
  def learn_schema
335
378
  data = processed.data(limit: 1).to_a.any? ? processed.data : raw.data
379
+ return nil if data.nil?
380
+
336
381
  schema = data.schema.reduce({}) do |h, (k, v)|
337
382
  h.tap do
338
383
  h[k] = EasyML::Data::PolarsColumn.polars_to_sym(v)
@@ -341,19 +386,15 @@ module EasyML
341
386
  write_attribute(:schema, schema)
342
387
  end
343
388
 
344
- def learn_statistics
345
- stats = {
346
- raw: EasyML::Data::StatisticsLearner.learn(raw, self, :raw),
347
- }
348
- stats.merge!(processed: EasyML::Data::StatisticsLearner.learn(processed, self, :processed)) if processed.data.present?
349
-
350
- columns.select(&:is_computed).each do |col|
351
- if stats.dig(:processed, col.name)
352
- stats[:raw][col.name] = stats[:processed][col.name]
353
- end
354
- end
389
+ def learn_statistics(type: :raw, computed: false)
390
+ columns.learn(type: type, computed: computed)
391
+ update(
392
+ statistics: columns.reload.statistics,
393
+ )
394
+ end
355
395
 
356
- update(statistics: stats)
396
+ def statistics
397
+ (read_attribute(:statistics) || {}).with_indifferent_access
357
398
  end
358
399
 
359
400
  def process_data
@@ -410,10 +451,9 @@ module EasyML
410
451
  def normalize(df = nil, split_ys: false, inference: false, all_columns: false, features: self.features)
411
452
  df = apply_missing_features(df, inference: inference)
412
453
  df = drop_nulls(df)
413
- df = preprocessor.postprocess(df, inference: inference)
454
+ df = columns.transform(df, inference: inference)
414
455
  df = apply_features(df, features)
415
- learn unless inference # After applying features, we need to learn new statistics
416
- df = preprocessor.postprocess(df, inference: inference, computed: true)
456
+ df = columns.transform(df, inference: inference, computed: true)
417
457
  df = apply_column_mask(df, inference: inference) unless all_columns
418
458
  df, = processed.split_features_targets(df, true, target) if split_ys
419
459
  df
@@ -494,16 +534,15 @@ module EasyML
494
534
  result.empty? ? nil : result
495
535
  end
496
536
 
497
- def processed?
498
- !should_split?
499
- end
500
-
501
537
  def decode_labels(ys, col: nil)
502
- preprocessor.decode_labels(ys, col: col.nil? ? target : col)
538
+ if col.nil?
539
+ col = target
540
+ end
541
+ preloaded_columns.find_by(name: col).decode_labels(ys)
503
542
  end
504
543
 
505
544
  def preprocessing_steps
506
- return {} if columns.nil? || (columns.respond_to?(:empty?) && columns.empty?)
545
+ return {} if preloaded_columns.nil? || (preloaded_columns.respond_to?(:empty?) && preloaded_columns.empty?)
507
546
  return @preprocessing_steps if @preprocessing_steps.present?
508
547
 
509
548
  training = standardize_preprocessing_steps(:training)
@@ -515,19 +554,12 @@ module EasyML
515
554
  }.compact.deep_symbolize_keys
516
555
  end
517
556
 
518
- def preprocessor
519
- @preprocessor ||= initialize_preprocessor
520
- return @preprocessor if @preprocessor.preprocessing_steps == preprocessing_steps
521
-
522
- @preprocessor = initialize_preprocessor
523
- end
524
-
525
557
  def target
526
558
  @target ||= preloaded_columns.find(&:is_target)&.name
527
559
  end
528
560
 
529
561
  def date_column
530
- @date_column ||= columns.date_column.first
562
+ @date_column ||= preloaded_columns.find(&:is_date_column?)
531
563
  end
532
564
 
533
565
  def drop_cols
@@ -596,7 +628,7 @@ module EasyML
596
628
  end
597
629
 
598
630
  def upload_remote_files
599
- return unless processed?
631
+ return if !needs_refresh?
600
632
 
601
633
  processed.upload.tap do
602
634
  features.each(&:upload_remote_files)
@@ -668,13 +700,16 @@ module EasyML
668
700
 
669
701
  def refresh_datasource
670
702
  datasource.reload.refresh
671
- refresh_datatypes
672
- initialize_splits
703
+ after_refresh_datasource
673
704
  end
674
705
 
675
706
  def refresh_datasource!
676
707
  datasource.reload.refresh!
677
- refresh_datatypes
708
+ after_refresh_datasource
709
+ end
710
+
711
+ def after_refresh_datasource
712
+ update(last_datasource_sha: datasource.sha)
678
713
  initialize_splits
679
714
  end
680
715
 
@@ -683,12 +718,24 @@ module EasyML
683
718
 
684
719
  SPLIT_ORDER.each do |segment|
685
720
  df = raw.read(segment)
721
+ learn_computed_columns(df) if segment == :train
686
722
  processed_df = normalize(df, all_columns: true)
687
723
  processed.save(segment, processed_df)
688
724
  end
689
725
  @normalized = true
690
726
  end
691
727
 
728
+ def learn_computed_columns(df)
729
+ return unless features.ready_to_apply.any?
730
+
731
+ df = df.clone
732
+ df = apply_features(df)
733
+ processed.save(:train, df)
734
+ learn(delete: false)
735
+ learn_statistics(type: :processed, computed: true)
736
+ processed.cleanup
737
+ end
738
+
692
739
  def drop_nulls(df)
693
740
  return df if drop_if_null.nil? || drop_if_null.empty?
694
741
 
@@ -698,8 +745,12 @@ module EasyML
698
745
  df.drop_nulls(subset: drop)
699
746
  end
700
747
 
748
+ # Pass refresh: false for frontend views so we don't query S3 during web requests
701
749
  def load_data(segment, **kwargs, &block)
702
- if processed?
750
+ needs_refresh = kwargs.key?(:refresh) ? kwargs[:refresh] : needs_refresh?
751
+ kwargs.delete(:refresh)
752
+
753
+ if !needs_refresh
703
754
  processed.load_data(segment, **kwargs, &block)
704
755
  else
705
756
  raw.load_data(segment, **kwargs, &block)
@@ -707,9 +758,7 @@ module EasyML
707
758
  end
708
759
 
709
760
  def fit
710
- computed_statistics = columns.where(is_computed: true).reduce({}) { |h, c| h.tap { h[c.name] = c.statistics.dig("processed") } }
711
- preprocessor.fit(raw.train(all_columns: true), computed_statistics)
712
- update(preprocessor_statistics: preprocessor.statistics)
761
+ learn_statistics(type: :raw)
713
762
  end
714
763
 
715
764
  # log_method :fit, "Learning statistics", verbose: true
@@ -719,7 +768,7 @@ module EasyML
719
768
  end
720
769
 
721
770
  def split_data(force: false)
722
- return unless force || should_split?
771
+ return unless force || needs_refresh?
723
772
 
724
773
  cleanup
725
774
  splitter.split(datasource) do |train_df, valid_df, test_df|
@@ -729,10 +778,6 @@ module EasyML
729
778
  end
730
779
  end
731
780
 
732
- def should_split?
733
- needs_refresh?
734
- end
735
-
736
781
  def filter_duplicate_features
737
782
  return unless attributes["features_attributes"].present?
738
783
 
@@ -753,6 +798,7 @@ module EasyML
753
798
  end
754
799
 
755
800
  def apply_features(df, features = self.features)
801
+ features = features.ready_to_apply
756
802
  if features.nil? || features.empty?
757
803
  df
758
804
  else
@@ -774,10 +820,6 @@ module EasyML
774
820
 
775
821
  result = feature.transform_batch(acc_df)
776
822
 
777
- unless result.is_a?(Polars::DataFrame)
778
- raise "Feature '#{feature.name}' must return a Polars::DataFrame, got #{result.class}"
779
- end
780
-
781
823
  result
782
824
  end
783
825
  end
@@ -789,16 +831,6 @@ module EasyML
789
831
  end).to_h.compact.reject { |_k, v| v["method"] == "none" }
790
832
  end
791
833
 
792
- def initialize_preprocessor
793
- EasyML::Data::Preprocessor.new(
794
- directory: Pathname.new(root_dir).append("preprocessor"),
795
- preprocessing_steps: preprocessing_steps,
796
- dataset: self,
797
- ).tap do |preprocessor|
798
- preprocessor.statistics = preprocessor_statistics
799
- end
800
- end
801
-
802
834
  def fully_reload
803
835
  return unless persisted?
804
836
 
@@ -2,28 +2,28 @@
2
2
  #
3
3
  # Table name: easy_ml_dataset_histories
4
4
  #
5
- # id :bigint not null, primary key
6
- # dataset_id :integer not null
7
- # name :string not null
8
- # description :string
9
- # dataset_type :string
10
- # status :string
11
- # version :string
12
- # datasource_id :integer
13
- # root_dir :string
14
- # configuration :json
15
- # num_rows :integer
16
- # workflow_status :string
17
- # statistics :json
18
- # preprocessor_statistics :json
19
- # schema :json
20
- # refreshed_at :datetime
21
- # created_at :datetime not null
22
- # updated_at :datetime not null
23
- # history_started_at :datetime not null
24
- # history_ended_at :datetime
25
- # history_user_id :integer
26
- # snapshot_id :string
5
+ # id :bigint not null, primary key
6
+ # dataset_id :integer not null
7
+ # name :string not null
8
+ # description :string
9
+ # dataset_type :string
10
+ # status :string
11
+ # version :string
12
+ # datasource_id :integer
13
+ # root_dir :string
14
+ # configuration :json
15
+ # num_rows :integer
16
+ # workflow_status :string
17
+ # statistics :json
18
+ # schema :json
19
+ # refreshed_at :datetime
20
+ # created_at :datetime not null
21
+ # updated_at :datetime not null
22
+ # history_started_at :datetime not null
23
+ # history_ended_at :datetime
24
+ # history_user_id :integer
25
+ # snapshot_id :string
26
+ # last_datasource_sha :string
27
27
  #
28
28
  module EasyML
29
29
  class DatasetHistory < ActiveRecord::Base
@@ -44,7 +44,7 @@ module EasyML
44
44
  true
45
45
  end
46
46
 
47
- def should_split?
47
+ def needs_refresh?
48
48
  false
49
49
  end
50
50
  end
@@ -10,6 +10,7 @@
10
10
  # refreshed_at :datetime
11
11
  # created_at :datetime not null
12
12
  # updated_at :datetime not null
13
+ # sha :string
13
14
  #
14
15
  module EasyML
15
16
  class Datasource < ActiveRecord::Base
@@ -119,11 +120,13 @@ module EasyML
119
120
  self.num_rows = data.shape[0]
120
121
  self.is_syncing = false
121
122
  self.refreshed_at = Time.now
123
+ self.sha = adapter.sha
122
124
  save
123
125
  end
124
126
 
125
127
  def refresh
126
128
  unless adapter.needs_refresh?
129
+ update(sha: adapter.sha) if sha.nil?
127
130
  update!(is_syncing: false)
128
131
  return
129
132
  end
@@ -15,6 +15,7 @@
15
15
  # history_ended_at :datetime
16
16
  # history_user_id :integer
17
17
  # snapshot_id :string
18
+ # sha :string
18
19
  #
19
20
  module EasyML
20
21
  class DatasourceHistory < ActiveRecord::Base
@@ -1,7 +1,7 @@
1
1
  module EasyML
2
2
  module Datasources
3
3
  class FileDatasource < BaseDatasource
4
- delegate :query, :convert_to_parquet, to: :reader
4
+ delegate :query, :convert_to_parquet, :sha, to: :reader
5
5
 
6
6
  def after_sync
7
7
  reader.normalize
@@ -6,18 +6,8 @@ module EasyML
6
6
  validates :df, presence: true
7
7
  add_configuration_attributes :df
8
8
 
9
- def query(drop_cols: [], filter: nil, limit: nil, select: nil, unique: nil, sort: nil, descending: false)
10
- return if df.nil?
11
-
12
- df = self.df.clone
13
- df = df.filter(filter) if filter
14
- df = df.select(select) if select.present?
15
- df = df.unique if unique
16
- drop_cols &= df.columns
17
- df = df.drop(drop_cols) unless drop_cols.empty?
18
- df = df.sort(sort, reverse: descending) if sort
19
- df = df.limit(limit) if limit
20
- df
9
+ def query(**kwargs)
10
+ EasyML::Data::PolarsInMemory.query(df, **kwargs)
21
11
  end
22
12
 
23
13
  def in_batches(of: 10_000)
@@ -40,6 +30,10 @@ module EasyML
40
30
  datasource.updated_at
41
31
  end
42
32
 
33
+ def sha
34
+ nil
35
+ end
36
+
43
37
  def data
44
38
  df
45
39
  end
@@ -17,7 +17,7 @@ module EasyML
17
17
  add_configuration_attributes :s3_bucket, :s3_prefix, :s3_region, :cache_for
18
18
 
19
19
  delegate :query, :data, :s3_access_key_id, :s3_secret_access_key, :before_sync, :after_sync, :clean,
20
- to: :synced_directory
20
+ :sha, to: :synced_directory
21
21
 
22
22
  def in_batches(&block)
23
23
  synced_directory.in_batches(&block)
@@ -55,6 +55,7 @@ module EasyML
55
55
  end
56
56
 
57
57
  belongs_to :dataset, class_name: "EasyML::Dataset"
58
+ has_many :columns, class_name: "EasyML::Column", dependent: :destroy
58
59
 
59
60
  validates :feature_class, presence: true
60
61
  validates :feature_position, presence: true, numericality: { only_integer: true, greater_than_or_equal_to: 0 }
@@ -72,7 +73,7 @@ module EasyML
72
73
  end
73
74
 
74
75
  # Combine all conditions with OR
75
- where(id: where(needs_fit: true).or(where(conditions.join(" OR "))).select { |f| f.adapter.respond_to?(:fit) }.map(&:id))
76
+ where(id: where(needs_fit: true).or(where(conditions.join(" OR "))).map(&:id))
76
77
  }
77
78
  scope :never_applied, -> { where(applied_at: nil) }
78
79
  scope :never_fit, -> do
@@ -81,6 +82,7 @@ module EasyML
81
82
  where(id: fittable.map(&:id))
82
83
  end
83
84
  scope :needs_fit, -> { has_changes.or(never_applied).or(never_fit) }
85
+ scope :ready_to_apply, -> { where.not(id: needs_fit.map(&:id)) }
84
86
 
85
87
  before_save :apply_defaults, if: :new_record?
86
88
  before_save :update_sha
@@ -223,8 +225,11 @@ module EasyML
223
225
  def fit(features: [self], async: false)
224
226
  ordered_features = features.sort_by(&:feature_position)
225
227
  jobs = ordered_features.map(&:build_batches)
228
+ job_count = jobs.dup.flatten.size
226
229
 
227
- if async
230
+ # This is very important! For whatever reason, Resque BatchJob does not properly
231
+ # handle batch finished callbacks for batch size = 1
232
+ if async && job_count > 1
228
233
  EasyML::ComputeFeatureJob.enqueue_ordered_batches(jobs)
229
234
  else
230
235
  jobs.flatten.each do |job|
@@ -240,7 +245,8 @@ module EasyML
240
245
  if batch_args.key?(:batch_start)
241
246
  actually_fit_batch(batch_args)
242
247
  else
243
- actually_fit_batch(get_batch_args(**batch_args))
248
+ batch_args = get_batch_args(**batch_args)
249
+ actually_fit_batch(batch_args)
244
250
  end
245
251
  end
246
252
 
@@ -288,12 +294,14 @@ module EasyML
288
294
  batch_args.symbolize_keys!
289
295
 
290
296
  if adapter.respond_to?(:batch)
291
- batch_df = adapter.fit(dataset.raw, self, batch_args)
297
+ df = dataset.raw
292
298
  else
293
299
  df = build_batch(batch_args)
294
- batch_df = adapter.fit(df, self, batch_args)
295
300
  end
296
301
  end
302
+ return if df.blank?
303
+
304
+ batch_df = adapter.fit(df, self, batch_args)
297
305
  if batch_df.present?
298
306
  store(batch_df)
299
307
  else
@@ -306,7 +314,11 @@ module EasyML
306
314
  return nil unless df.is_a?(Polars::DataFrame)
307
315
  return df if !adapter.respond_to?(:transform) && feature_store.empty?
308
316
 
317
+ df_len_was = df.shape[0]
309
318
  result = adapter.transform(df, self)
319
+ raise "Feature '#{name}' must return a Polars::DataFrame, got #{result.class}" unless result.is_a?(Polars::DataFrame)
320
+ df_len_now = result.shape[0]
321
+ raise "Feature #{feature_class}#transform: output size must match input size! Input size: #{df_len_now}, output size: #{df_len_was}." if df_len_now != df_len_was
310
322
  update!(applied_at: Time.current)
311
323
  result
312
324
  end
@@ -384,8 +396,8 @@ module EasyML
384
396
  feature_store.list_partitions
385
397
  end
386
398
 
387
- def query(filter: nil)
388
- feature_store.query(filter: filter)
399
+ def query(**kwargs)
400
+ feature_store.query(**kwargs)
389
401
  end
390
402
 
391
403
  def store(df)
@@ -31,6 +31,18 @@ module EasyML
31
31
 
32
32
  after_find :download_remote_files
33
33
  scope :ordered, -> { order(feature_position: :asc) }
34
+ scope :ready_to_apply, -> { where.not(id: needs_fit.map(&:id)) }
35
+ scope :has_changes, lambda {
36
+ none
37
+ }
38
+ scope :never_applied, -> { where(applied_at: nil) }
39
+ scope :never_fit, -> do
40
+ fittable = where(fit_at: nil)
41
+ fittable = fittable.select { |f| f.adapter.respond_to?(:fit) }
42
+ where(id: fittable.map(&:id))
43
+ end
44
+ scope :needs_fit, -> { has_changes.or(never_applied).or(never_fit) }
45
+ scope :ready_to_apply, -> { where.not(id: needs_fit.map(&:id)) }
34
46
 
35
47
  def download_remote_files
36
48
  feature_store&.download
@@ -0,0 +1,15 @@
1
+ module EasyML
2
+ module FeatureList
3
+ def feature_list
4
+ self
5
+ end
6
+
7
+ def dataset
8
+ proxy_association.owner
9
+ end
10
+
11
+ def computed_column_names
12
+ flat_map(&:computes_columns).uniq
13
+ end
14
+ end
15
+ end