easy_ml 0.2.0.pre.rc89 → 0.2.0.pre.rc91

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35)
  1. checksums.yaml +4 -4
  2. data/app/controllers/easy_ml/predictions_controller.rb +9 -4
  3. data/app/jobs/easy_ml/training_job.rb +2 -2
  4. data/app/models/easy_ml/column/imputers/base.rb +1 -1
  5. data/app/models/easy_ml/column/imputers/categorical.rb +1 -1
  6. data/app/models/easy_ml/column/imputers/embedding_encoder.rb +2 -0
  7. data/app/models/easy_ml/column/imputers/imputer.rb +4 -0
  8. data/app/models/easy_ml/column/imputers/one_hot_encoder.rb +1 -0
  9. data/app/models/easy_ml/column/imputers/ordinal_encoder.rb +1 -0
  10. data/app/models/easy_ml/column.rb +25 -1
  11. data/app/models/easy_ml/column_list/imputer.rb +4 -0
  12. data/app/models/easy_ml/column_list.rb +23 -8
  13. data/app/models/easy_ml/dataset/learner.rb +0 -10
  14. data/app/models/easy_ml/dataset.rb +28 -43
  15. data/app/models/easy_ml/deploy.rb +28 -19
  16. data/app/models/easy_ml/feature.rb +10 -8
  17. data/app/models/easy_ml/feature_history.rb +9 -0
  18. data/app/models/easy_ml/model.rb +12 -7
  19. data/app/models/easy_ml/models/xgboost/evals_callback.rb +2 -2
  20. data/app/models/easy_ml/prediction.rb +2 -2
  21. data/app/serializers/easy_ml/prediction_serializer.rb +2 -0
  22. data/lib/easy_ml/core/model_evaluator.rb +2 -0
  23. data/lib/easy_ml/core/tuner.rb +1 -1
  24. data/lib/easy_ml/data/dataset_manager/writer/base.rb +24 -5
  25. data/lib/easy_ml/data/dataset_manager/writer/partitioned.rb +20 -7
  26. data/lib/easy_ml/data/dataset_manager/writer.rb +4 -0
  27. data/lib/easy_ml/data/dataset_manager.rb +4 -0
  28. data/lib/easy_ml/data/polars_column.rb +0 -6
  29. data/lib/easy_ml/feature_store.rb +9 -13
  30. data/lib/easy_ml/predict.rb +5 -4
  31. data/lib/easy_ml/railtie/generators/migration/migration_generator.rb +1 -0
  32. data/lib/easy_ml/railtie/templates/migration/add_unique_constraint_to_dataset_names.rb.tt +13 -0
  33. data/lib/easy_ml/timing.rb +3 -1
  34. data/lib/easy_ml/version.rb +1 -1
  35. metadata +3 -2
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: a0eb5ce84bdd93da3ea53e97f1b1ceab81a529a9bb076596f4edf7e49349eadf
4
- data.tar.gz: 5262f39ff5a1236729d28a8fa6715d1b12e6dd5b4319225a6df512493872bba0
3
+ metadata.gz: 3a12058c269a91c130f9158e1507c58dc94ad33517aabe568a2f0bc9f78b88eb
4
+ data.tar.gz: 6a37a568b6a8d8c100c21dea96487cb85494ef11ba3b30ac51aad8cab45654d7
5
5
  SHA512:
6
- metadata.gz: 8e8094ed3309b80e0ee70543667e7af982e0805ba6fe81a62ad2f46297eae4487440a4e6ffdca75f32ef045ef8b25440a8396032cf00f9fd6c191d63fc8c0386
7
- data.tar.gz: 97d057eb4ffa2acdb319a52de427cc3f3e7c42db8de349d063b36cd91479c68eca10ec150689d11070f0d20f24b620c0ce7a77f3e335420dd8c73612d706d1a0
6
+ metadata.gz: 4f344c00a9e2b557079943f7f6f2c4d7923dbb5b425423b81d58dbc6d63ac15d5e978d6f7e1ab2c02233fe8cc33b55d168ed1ffb0d1f2e1cfcc59670b812285d
7
+ data.tar.gz: 4831eac6b35b452b300408b37695d5116b6840404cf045e85d98780d3a120ae2d267b07852bc80b6918379b70f9c04a52a712e599751e469af72a0be6e1889c4
@@ -4,22 +4,27 @@ module EasyML
4
4
 
5
5
  def create
6
6
  slug = params[:model]
7
- unless EasyML::Model.find_by(slug: slug).inference_version.present?
7
+ model = EasyML::Model.find_by(slug: slug)
8
+ unless model.present?
9
+ return render json: { error: "Model not found" }, status: :not_found
10
+ end
11
+
12
+ unless model.inference_version.present?
8
13
  return render json: { error: "Model not found" }, status: :not_found
9
14
  end
10
15
 
11
16
  unless params.key?(:input)
12
- return render json: { error: "Must provide key: input" }, status: :not_found
17
+ return render json: { error: "Must provide key: input" }, status: :unprocessable_entity
13
18
  end
14
19
  input = params[:input].permit!.to_h
15
20
 
16
21
  unless input.is_a?(Hash)
17
- return render json: { error: "Input must be a hash" }, status: :not_found
22
+ return render json: { error: "Input must be a hash" }, status: :unprocessable_entity
18
23
  end
19
24
 
20
25
  valid, fields = EasyML::Predict.validate_input(slug, input)
21
26
  unless valid
22
- return render json: { error: "Missing required fields: #{fields}" }, status: :not_found
27
+ return render json: { error: "Missing required fields: #{fields}" }, status: :unprocessable_entity
23
28
  end
24
29
 
25
30
  type = (params[:type] || :predict).to_sym
@@ -10,13 +10,13 @@ module EasyML
10
10
 
11
11
  @last_activity = Time.current
12
12
  setup_signal_traps
13
- @monitor_thread = start_monitor_thread
13
+ # @monitor_thread = start_monitor_thread
14
14
 
15
15
  @model.actually_train do |iteration_info|
16
16
  @last_activity = Time.current
17
17
  end
18
18
  ensure
19
- @monitor_thread&.exit
19
+ # @monitor_thread&.exit
20
20
  @model.unlock!
21
21
  end
22
22
 
@@ -26,7 +26,7 @@ module EasyML
26
26
  end
27
27
  end
28
28
 
29
- attr_accessor :column, :preprocessing_step
29
+ attr_accessor :column, :preprocessing_step, :encode
30
30
 
31
31
  def initialize(column, preprocessing_step)
32
32
  @column = column
@@ -12,7 +12,7 @@ module EasyML
12
12
  def transform(df)
13
13
  return df unless allowed_categories.present?
14
14
 
15
- case column.datatype
15
+ case column.datatype.to_sym
16
16
  when :categorical
17
17
  df = df.with_column(
18
18
  Polars.when(Polars.col(column.name).is_in(allowed_categories))
@@ -9,6 +9,8 @@ module EasyML
9
9
  end
10
10
 
11
11
  def transform(df)
12
+ return df unless encode
13
+
12
14
  df = column.embed(df)
13
15
  df
14
16
  end
@@ -43,6 +43,10 @@ module EasyML
43
43
  @adapters ||= ordered_adapters.map { |klass| klass.new(column, preprocessing_step) }.select { |adapter| allowed?(adapter) && adapter.applies? }
44
44
  end
45
45
 
46
+ def encode=(value)
47
+ adapters.each { |adapter| adapter.encode = value }
48
+ end
49
+
46
50
  def description
47
51
  adapters.map(&:description).compact.join(", ")
48
52
  end
@@ -9,6 +9,7 @@ module EasyML
9
9
  end
10
10
 
11
11
  def transform(df)
12
+ return df unless encode
12
13
  return df unless allowed_categories.present?
13
14
 
14
15
  allowed_categories.each do |value|
@@ -9,6 +9,7 @@ module EasyML
9
9
  end
10
10
 
11
11
  def transform(df)
12
+ return df unless encode
12
13
  return df unless label_encoder.present?
13
14
 
14
15
  case column.datatype
@@ -184,9 +184,10 @@ module EasyML
184
184
  end
185
185
  end
186
186
 
187
- def transform(df, inference: false, computed: false)
187
+ def transform(df, inference: false, encode: true)
188
188
  imputer = inference && imputers.inference.anything? ? imputers.inference : imputers.training
189
189
 
190
+ imputer.encode = encode
190
191
  df = imputer.transform(df)
191
192
  df
192
193
  end
@@ -513,6 +514,29 @@ module EasyML
513
514
  EasyML::Import::Column.from_config(config, dataset, action: action)
514
515
  end
515
516
 
517
+ def cast_statement(df, df_col, expected_dtype)
518
+ expected_dtype = expected_dtype.is_a?(Polars::DataType) ? expected_dtype : expected_dtype.class
519
+ actual_type = df[df_col].dtype
520
+
521
+ cast_statement = case expected_dtype
522
+ when Polars::Boolean
523
+ case actual_type
524
+ when Polars::Boolean
525
+ Polars.col(df_col).cast(expected_dtype)
526
+ when Polars::String, Polars::Categorical
527
+ Polars.col(df_col).eq("true").cast(expected_dtype)
528
+ when Polars::Null
529
+ Polars.col(df_col)
530
+ else
531
+ raise "Unexpected dtype: #{actual_type} for column: #{df_col}"
532
+ end
533
+ else
534
+ Polars.col(df_col).cast(expected_dtype)
535
+ end
536
+
537
+ cast_statement.alias(df_col)
538
+ end
539
+
516
540
  def cast(value)
517
541
  return value if value.nil?
518
542
 
@@ -15,6 +15,10 @@ module EasyML
15
15
  @imputers ||= columns.map { |column| inference ? column.imputers(@_imputers).inference : column.imputers(@_imputers).training }
16
16
  end
17
17
 
18
+ def encode=(encode)
19
+ imputers.each { |imputer| imputer.encode = encode }
20
+ end
21
+
18
22
  def exprs
19
23
  imputers.flat_map(&:exprs).compact
20
24
  end
@@ -22,7 +22,7 @@ module EasyML
22
22
  end
23
23
  end
24
24
 
25
- def transform(df, inference: false, computed: false)
25
+ def transform(df, inference: false, computed: false, encode: true)
26
26
  return df if df.nil?
27
27
 
28
28
  if computed
@@ -33,14 +33,12 @@ module EasyML
33
33
 
34
34
  by_name = cols.index_by(&:name)
35
35
  cols.each do |column|
36
- df = column.transform(df, inference: inference, computed: computed) if column
36
+ df = column.transform(df, inference: inference, encode: encode) if column
37
37
  end
38
38
 
39
39
  df
40
40
  end
41
41
 
42
- measure_method_timing :transform
43
-
44
42
  def apply_clip(df)
45
43
  clip_cols = has_clip.raw
46
44
  return df unless clip_cols.any?
@@ -60,8 +58,6 @@ module EasyML
60
58
  reload
61
59
  end
62
60
 
63
- measure_method_timing :learn
64
-
65
61
  def statistics
66
62
  stats = { raw: {}, processed: {} }
67
63
  select(&:persisted?).inject(stats) do |h, col|
@@ -94,6 +90,27 @@ module EasyML
94
90
  end.sort.map { |arr| arr[1] }.uniq
95
91
  end
96
92
 
93
+ def apply_cast(df)
94
+ schema = dataset.schema
95
+ column_index = reduce({}) do |h, col|
96
+ h.tap do
97
+ col.aliases.each do |alias_name|
98
+ h[alias_name] = col
99
+ end
100
+ end
101
+ end
102
+ cast_statements = (df.columns & schema.keys.map(&:to_s)).map do |df_col|
103
+ db_col = column_index[df_col]
104
+ expected_dtype = schema[df_col.to_sym]
105
+ db_col.cast_statement(df, df_col, expected_dtype)
106
+ end
107
+ begin
108
+ df = df.with_columns(cast_statements)
109
+ rescue => e
110
+ binding.pry
111
+ end
112
+ end
113
+
97
114
  def cast(processed_or_raw)
98
115
  columns = where(is_computed: false)
99
116
  is_processed = processed_or_raw == :processed
@@ -154,8 +171,6 @@ module EasyML
154
171
  EasyML::Lineage.import(lineage, on_duplicate_key_update: { columns: %i[ column_id key occurred_at description ] })
155
172
  end
156
173
 
157
- measure_method_timing :set_feature_lineage
158
-
159
174
  private
160
175
 
161
176
  def import_new(new_columns, existing_columns)
@@ -57,8 +57,6 @@ module EasyML
57
57
  dataset.columns.set_feature_lineage(columns)
58
58
  end
59
59
 
60
- measure_method_timing :save_statistics
61
-
62
60
  def learn_statistics
63
61
  return @statistics if @statistics
64
62
 
@@ -78,8 +76,6 @@ module EasyML
78
76
  end
79
77
  end
80
78
 
81
- measure_method_timing :learn_statistics
82
-
83
79
  def prepare
84
80
  @schema = EasyML::Data::PolarsSchema.simplify(@dataset.raw_schema).symbolize_keys
85
81
  @raw_columns = @schema.keys.sort.map(&:to_s)
@@ -93,19 +89,13 @@ module EasyML
93
89
  EasyML::Column.import(columns, on_duplicate_key_update: { columns: %i[in_raw_dataset datatype] })
94
90
  end
95
91
 
96
- measure_method_timing :prepare
97
-
98
92
  def lazy_statistics
99
93
  Lazy.new(dataset, columns, type: type).learn
100
94
  end
101
95
 
102
- measure_method_timing :lazy_statistics
103
-
104
96
  def eager_statistics
105
97
  Eager.new(dataset, columns, type: type).learn
106
98
  end
107
-
108
- measure_method_timing :eager_statistics
109
99
  end
110
100
  end
111
101
  end
@@ -215,9 +215,10 @@ module EasyML
215
215
 
216
216
  @raw = raw.cp(version)
217
217
  @processed = processed.cp(version)
218
- features.each(&:bump_version)
219
-
220
- save
218
+ save.tap do
219
+ features.each(&:bump_version)
220
+ EasyML::Feature.import(features.to_a, on_duplicate_key_update: [:version])
221
+ end
221
222
  end
222
223
 
223
224
  def refreshed_datasource?
@@ -257,9 +258,6 @@ module EasyML
257
258
  end
258
259
  end
259
260
 
260
- include EasyML::Timing
261
- measure_method_timing :actually_refresh
262
-
263
261
  def refresh!(async: false)
264
262
  refreshing do
265
263
  prepare!
@@ -276,29 +274,22 @@ module EasyML
276
274
  end
277
275
  end
278
276
 
279
- measure_method_timing :refresh
280
-
281
277
  def fit_features!(async: false, features: self.features)
282
278
  fit_features(async: async, features: features, force: true)
283
279
  end
284
280
 
285
281
  def fit_features(async: false, features: self.features, force: false)
286
282
  features_to_compute = force ? features : features.needs_fit
287
- puts "Features to compute.... #{features_to_compute}"
288
283
  return after_fit_features if features_to_compute.empty?
289
284
 
290
285
  features.first.fit(features: features_to_compute, async: async)
291
286
  end
292
287
 
293
- measure_method_timing :fit_features
294
-
295
288
  def after_fit_features
296
- puts "After fit features"
297
289
  unlock!
298
290
  reload
299
291
  return if failed?
300
292
 
301
- puts "Actually refresh..."
302
293
  actually_refresh
303
294
  end
304
295
 
@@ -476,15 +467,24 @@ module EasyML
476
467
 
477
468
  def normalize(df = nil, split_ys: false, inference: false, all_columns: false, features: self.features)
478
469
  df = apply_missing_columns(df, inference: inference)
479
- df = columns.transform(df, inference: inference)
480
- df = apply_features(df, features)
481
- df = columns.transform(df, inference: inference)
470
+ df = transform_columns(df, inference: inference, encode: false)
471
+ df = apply_features(df, features, inference: inference)
472
+ df = apply_cast(df) if inference
473
+ df = transform_columns(df, inference: inference)
482
474
  df = apply_column_mask(df, inference: inference) unless all_columns
483
475
  df = drop_nulls(df) unless inference
484
476
  df, = processed.split_features_targets(df, true, target) if split_ys
485
477
  df
486
478
  end
487
479
 
480
+ def transform_columns(df, inference: false, encode: true)
481
+ columns.transform(df, inference: inference, encode: encode)
482
+ end
483
+
484
+ def apply_cast(df)
485
+ columns.apply_cast(df)
486
+ end
487
+
488
488
  # Massage out one-hot cats to their canonical name
489
489
  #
490
490
  # Takes: ["Sex_male", "Sex_female", "Embarked_c", "PassengerId"]
@@ -503,8 +503,6 @@ module EasyML
503
503
  end.uniq.sort
504
504
  end
505
505
 
506
- measure_method_timing :normalize
507
-
508
506
  def missing_required_fields(df)
509
507
  desc_df = df.describe
510
508
 
@@ -633,22 +631,19 @@ module EasyML
633
631
  df[column_mask(df, inference: inference)]
634
632
  end
635
633
 
636
- measure_method_timing :apply_column_mask
637
-
638
- def apply_missing_columns(df, inference: false, include_one_hots: false)
634
+ def apply_missing_columns(df, inference: false)
639
635
  return df unless inference
640
636
 
641
- missing_columns = (col_order(inference: inference) - df.columns).compact
642
- unless include_one_hots
643
- columns.one_hots.each do |one_hot|
644
- virtual_columns = one_hot.virtual_columns
645
- if virtual_columns.all? { |vc| df.columns.include?(vc) }
646
- missing_columns -= columns.one_hots.flat_map(&:virtual_columns)
647
- else
648
- missing_columns += columns.one_hots.map(&:name) - df.columns
649
- end
637
+ required_cols = col_order(inference: inference).compact.uniq
638
+ columns.one_hots.each do |one_hot|
639
+ virtual_columns = one_hot.virtual_columns
640
+ if virtual_columns.all? { |vc| df.columns.include?(vc) }
641
+ required_cols -= virtual_columns
642
+ else
643
+ required_cols += [one_hot.name]
650
644
  end
651
645
  end
646
+ missing_columns = required_cols - df.columns
652
647
  df.with_columns(missing_columns.map { |f| Polars.lit(nil).alias(f) })
653
648
  end
654
649
 
@@ -771,8 +766,6 @@ module EasyML
771
766
  after_refresh_datasource
772
767
  end
773
768
 
774
- measure_method_timing :refresh_datasource
775
-
776
769
  def refresh_datasource!
777
770
  datasource.reload.refresh!
778
771
  after_refresh_datasource
@@ -798,8 +791,6 @@ module EasyML
798
791
  @normalized = true
799
792
  end
800
793
 
801
- measure_method_timing :normalize_all
802
-
803
794
  def learn_computed_columns(df)
804
795
  return unless features.ready_to_apply.any?
805
796
 
@@ -811,8 +802,6 @@ module EasyML
811
802
  processed.cleanup
812
803
  end
813
804
 
814
- measure_method_timing :learn_computed_columns
815
-
816
805
  def drop_nulls(df)
817
806
  return df if drop_if_null.nil? || drop_if_null.empty?
818
807
 
@@ -822,8 +811,6 @@ module EasyML
822
811
  df.drop_nulls(subset: drop)
823
812
  end
824
813
 
825
- measure_method_timing :drop_nulls
826
-
827
814
  # Pass refresh: false for frontend views so we don't query S3 during web requests
828
815
  def load_data(segment, **kwargs, &block)
829
816
  needs_refresh = kwargs.key?(:refresh) ? kwargs[:refresh] : needs_refresh?
@@ -876,8 +863,8 @@ module EasyML
876
863
  columns.find_by(name: column_name).update(is_date_column: true)
877
864
  end
878
865
 
879
- def apply_features(df, features = self.features)
880
- features = features.ready_to_apply
866
+ def apply_features(df, features = self.features, inference: false)
867
+ features = inference ? preloaded_features : features.ready_to_apply
881
868
  if features.nil? || features.empty?
882
869
  df
883
870
  else
@@ -897,15 +884,13 @@ module EasyML
897
884
  # Set SHA without querying
898
885
  feature.instance_variable_set(:@current_sha, shas[feature.feature_class])
899
886
 
900
- result = feature.transform_batch(acc_df)
887
+ result = feature.transform_batch(acc_df, inference: inference)
901
888
 
902
889
  result
903
890
  end
904
891
  end
905
892
  end
906
893
 
907
- measure_method_timing :apply_features
908
-
909
894
  def standardize_preprocessing_steps(type)
910
895
  columns.map(&:name).zip(columns.map do |col|
911
896
  col.preprocessing_steps&.dig(type)
@@ -48,28 +48,37 @@ module EasyML
48
48
 
49
49
  def actually_deploy
50
50
  lock_deploy do
51
- update(status: "running")
52
- EasyML::Event.create_event(self, "started")
53
-
54
- if identical_deploy.present?
55
- self.model_file = identical_deploy.model_file
56
- self.model_version = identical_deploy.model_version
57
- else
58
- if model_file.present?
59
- model.model_file = model_file
51
+ begin
52
+ update(status: "running")
53
+ EasyML::Event.create_event(self, "started")
54
+
55
+ if identical_deploy.present?
56
+ self.model_file = identical_deploy.model_file
57
+ self.model_version = identical_deploy.model_version
58
+ else
59
+ if model_file.present?
60
+ model.model_file = model_file
61
+ end
62
+ # model.load_model
63
+ self.model_version = model.actually_deploy
60
64
  end
61
- model.load_model
62
- self.model_version = model.actually_deploy
63
- end
64
65
 
65
- EasyML::Deploy.transaction do
66
- update(model_history_id: self.model_version.id, snapshot_id: self.model_version.snapshot_id, status: :success)
67
- model.retraining_runs.where(status: :deployed).update_all(status: :success)
68
- retraining_run.update(model_history_id: self.model_version.id, snapshot_id: self.model_version.snapshot_id, deploy_id: id, status: :deployed, is_deploying: false)
69
- end
66
+ EasyML::Deploy.transaction do
67
+ update(model_history_id: self.model_version.id, snapshot_id: self.model_version.snapshot_id, status: :success)
68
+ model.retraining_runs.where(status: :deployed).update_all(status: :success)
69
+ retraining_run.update(model_history_id: self.model_version.id, snapshot_id: self.model_version.snapshot_id, deploy_id: id, status: :deployed,)
70
+ end
70
71
 
71
- model_version.tap do
72
- EasyML::Event.create_event(self, "success")
72
+ model_version.tap do
73
+ EasyML::Event.create_event(self, "success")
74
+ end
75
+ rescue => e
76
+ update(status: "failed")
77
+ retraining_run.update(is_deploying: false)
78
+ EasyML::Event.create_event(self, "failed")
79
+ raise e
80
+ ensure
81
+ unlock!
73
82
  end
74
83
  end
75
84
  end
@@ -82,7 +82,7 @@ module EasyML
82
82
  fittable = fittable.select(&:fittable?)
83
83
  where(id: fittable.map(&:id))
84
84
  end
85
- scope :needs_fit, -> { has_changes.or(never_applied).or(never_fit).or(datasource_was_refreshed) }
85
+ scope :needs_fit, -> { has_changes.or(never_applied).or(never_fit).or(datasource_was_refreshed).or(where(needs_fit: true)) }
86
86
  scope :datasource_was_refreshed, -> do
87
87
  where(id: all.select(&:datasource_was_refreshed?).map(&:id))
88
88
  end
@@ -310,9 +310,9 @@ module EasyML
310
310
  end
311
311
 
312
312
  # Transform a single batch, used for testing the user's feature implementation
313
- def transform_batch(df = nil, batch_args = {})
313
+ def transform_batch(df = nil, batch_args = {}, inference: false)
314
314
  if df.is_a?(Polars::DataFrame)
315
- actually_transform_batch(df)
315
+ actually_transform_batch(df, inference: inference)
316
316
  else
317
317
  actually_transform_batch(build_batch(get_batch_args(**batch_args)))
318
318
  end
@@ -374,11 +374,12 @@ module EasyML
374
374
  batch_df
375
375
  end
376
376
 
377
- def actually_transform_batch(df)
377
+ def actually_transform_batch(df, inference: false)
378
378
  return nil unless df.is_a?(Polars::DataFrame)
379
379
  return df if !adapter.respond_to?(:transform) && feature_store.empty?
380
380
 
381
381
  df_len_was = df.shape[0]
382
+ orig_df = df.clone
382
383
  begin
383
384
  result = adapter.transform(df, self)
384
385
  rescue => e
@@ -386,8 +387,10 @@ module EasyML
386
387
  end
387
388
  raise "Feature '#{name}' must return a Polars::DataFrame, got #{result.class}" unless result.is_a?(Polars::DataFrame)
388
389
  df_len_now = result.shape[0]
389
- raise "Feature #{feature_class}#transform: output size must match input size! Input size: #{df_len_now}, output size: #{df_len_was}." if df_len_now != df_len_was
390
- update!(applied_at: Time.current)
390
+ missing_columns = orig_df.columns - result.columns
391
+ raise "Feature #{feature_class}#transform: output size must match input size! Input size: #{df_len_now}, output size: #{df_len_was}." if (df_len_now != df_len_was)
392
+ raise "Feature #{feature_class} removed #{missing_columns} columns" if missing_columns.any?
393
+ update!(applied_at: Time.current) unless inference
391
394
  result
392
395
  end
393
396
 
@@ -432,9 +435,8 @@ module EasyML
432
435
  end
433
436
 
434
437
  def bump_version
435
- old_version = version
438
+ feature_store.bump_version(version)
436
439
  write_attribute(:version, version + 1)
437
- feature_store.cp(old_version, version)
438
440
  self
439
441
  end
440
442
 
@@ -44,7 +44,16 @@ module EasyML
44
44
  scope :needs_fit, -> { has_changes.or(never_applied).or(never_fit) }
45
45
  scope :ready_to_apply, -> { where.not(id: needs_fit.map(&:id)) }
46
46
 
47
+ def wipe
48
+ false
49
+ end
50
+
47
51
  def download_remote_files
52
+ return unless snapshot_id # if not finished saving, skip
53
+ return if feature_store.synced?
54
+ return if @downloaded
55
+
56
+ @downloaded = true
48
57
  feature_store&.download
49
58
  end
50
59
  end
@@ -182,6 +182,7 @@ module EasyML
182
182
  lock_model do
183
183
  run = pending_run
184
184
  run.wrap_training do
185
+ dataset.refresh if dataset.needs_refresh?
185
186
  raise untrainable_error unless trainable?
186
187
 
187
188
  best_params = nil
@@ -210,6 +211,10 @@ module EasyML
210
211
  end
211
212
  end
212
213
 
214
+ def locked?
215
+ Support::Lockable.locked?(lock_key)
216
+ end
217
+
213
218
  def with_lock
214
219
  EasyML::Support::Lockable.with_lock(lock_key, stale_timeout: 60, resources: 1) do |client|
215
220
  yield client
@@ -273,7 +278,7 @@ module EasyML
273
278
  end
274
279
 
275
280
  def inference_version
276
- latest_deploy&.model_version
281
+ deploys.where(status: :success).order(id: :desc).limit(1).last&.model_version
277
282
  end
278
283
 
279
284
  alias_method :current_version, :inference_version
@@ -296,21 +301,21 @@ module EasyML
296
301
  )
297
302
  end
298
303
 
299
- def prepare_predict(xs)
304
+ def prepare_predict(xs, normalized: false)
300
305
  load_model!
301
- unless xs.is_a?(XGBoost::DMatrix)
306
+ if !normalized
302
307
  xs = dataset.normalize(xs, inference: true)
303
308
  end
304
309
  xs
305
310
  end
306
311
 
307
- def predict(xs)
308
- xs = prepare_predict(xs)
312
+ def predict(xs, normalized: false)
313
+ xs = prepare_predict(xs, normalized: normalized)
309
314
  adapter.predict(xs)
310
315
  end
311
316
 
312
- def predict_proba(xs)
313
- xs = prepare_predict(xs)
317
+ def predict_proba(xs, normalized: false)
318
+ xs = prepare_predict(xs, normalized: normalized)
314
319
  adapter.predict_proba(xs)
315
320
  end
316
321
 
@@ -49,7 +49,7 @@ module EasyML
49
49
  x_valid, y_valid = valid_dataset
50
50
  x_valid = x_valid.select(model.dataset.col_order(inference: true))
51
51
  @preprocessed ||= model.preprocess(x_valid, y_valid)
52
- y_pred = model.predict(@preprocessed)
52
+ y_pred = model.predict(@preprocessed, normalized: true)
53
53
  dataset = model.dataset.processed.valid(all_columns: true)
54
54
 
55
55
  metrics = model.evaluate(y_pred: y_pred, y_true: y_valid, x_true: x_valid, dataset: dataset)
@@ -78,7 +78,7 @@ module EasyML
78
78
 
79
79
  track_feature_importance(booster)
80
80
  if tuner.nil?
81
- track_cumulative_feature_importance(false)
81
+ track_cumulative_feature_importance
82
82
  end
83
83
 
84
84
  booster
@@ -17,8 +17,8 @@ module EasyML
17
17
  class Prediction < ActiveRecord::Base
18
18
  self.table_name = "easy_ml_predictions"
19
19
 
20
- belongs_to :model
21
- belongs_to :model_history, optional: true
20
+ belongs_to :model, class_name: "EasyML::Model"
21
+ belongs_to :model_history, class_name: "EasyML::ModelHistory", optional: true
22
22
 
23
23
  validates :model_id, presence: true
24
24
  validates :prediction_type, presence: true, inclusion: { in: %w[regression classification] }
@@ -10,6 +10,8 @@ module EasyML
10
10
  object.prediction_value.symbolize_keys.dig(:value)
11
11
  when Numeric
12
12
  object.prediction_value
13
+ when Array
14
+ object.prediction_value
13
15
  end
14
16
  end
15
17
 
@@ -153,6 +153,8 @@ module EasyML
153
153
 
154
154
  def normalize_input(input)
155
155
  case input
156
+ when Polars::LazyFrame
157
+ normalize_input(input.collect)
156
158
  when Array
157
159
  if input.first.class == TrueClass || input.first.class == FalseClass
158
160
  input = input.map { |value| value ? 1.0 : 0.0 }
@@ -147,7 +147,7 @@ module EasyML
147
147
  end
148
148
  end
149
149
 
150
- y_pred = model.predict(x_normalized)
150
+ y_pred = model.predict(x_normalized, normalized: true)
151
151
  model.metrics = metrics
152
152
  metrics = model.evaluate(y_pred: y_pred, y_true: y_valid, x_true: x_valid, dataset: dataset)
153
153
  metric = metrics.symbolize_keys.dig(model.evaluator[:metric].to_sym)
@@ -24,18 +24,31 @@ module EasyML
24
24
 
25
25
  def compact
26
26
  files = self.files
27
+ rows = query(lazy: true).collect
28
+ return unless rows.shape[0] > 0
29
+
30
+ FileUtils.rm(files)
27
31
 
28
32
  clear_unique_id
29
33
  File.join(root_dir, "compacted.parquet").tap do |target_file|
30
- safe_write(
31
- query(lazy: true),
32
- target_file
33
- )
34
- FileUtils.rm(files)
34
+ safe_write(rows, target_file)
35
35
  end
36
36
  clear_unique_id
37
37
  end
38
38
 
39
+ def cp(from,to)
40
+ return if from.nil? || !Dir.exist?(from)
41
+
42
+ FileUtils.mkdir_p(to)
43
+ files_to_cp = Dir.glob(Pathname.new(from).join("**/*")).select { |f| File.file?(f) }
44
+
45
+ files_to_cp.each do |file|
46
+ target_file = file.gsub(from, to)
47
+ FileUtils.mkdir_p(File.dirname(target_file))
48
+ FileUtils.cp(file, target_file)
49
+ end
50
+ end
51
+
39
52
  def unlock!
40
53
  clear_all_keys
41
54
  end
@@ -65,6 +78,8 @@ module EasyML
65
78
  end
66
79
 
67
80
  def safe_write(df, path)
81
+ raise "df must be a Polars::DataFrame or Polars::LazyFrame" unless df.is_a?(Polars::DataFrame) || df.is_a?(Polars::LazyFrame)
82
+
68
83
  FileUtils.mkdir_p(File.dirname(path))
69
84
  if df.is_a?(Polars::LazyFrame)
70
85
  # Depending on the query plan, sometimes sink_parquet will throw an error...
@@ -81,6 +96,10 @@ module EasyML
81
96
  df.write_parquet(path)
82
97
  end
83
98
  path
99
+ ensure
100
+ if Polars.scan_parquet(path).limit(1).schema.keys.empty?
101
+ raise "Failed to store to #{path}"
102
+ end
84
103
  end
85
104
 
86
105
  def clear_all_keys
@@ -17,9 +17,7 @@ module EasyML
17
17
  end
18
18
 
19
19
  def wipe
20
- partitions.each do |partition|
21
- FileUtils.rm_rf(File.join(root_dir, partition))
22
- end
20
+ super
23
21
  clear_all_keys
24
22
  end
25
23
 
@@ -33,22 +31,37 @@ module EasyML
33
31
  end
34
32
 
35
33
  def compact
36
- files = self.files
34
+ return if compacted?
35
+
37
36
  @df = query(lazy: true)
38
37
 
39
38
  clear_unique_id(subdir: "compacted")
40
39
  compact_each_partition.tap do
41
- FileUtils.rm(files)
42
40
  clear_unique_id
43
41
  end
42
+ uncompacted_folders.each do |folder|
43
+ FileUtils.rm_rf(File.join(root_dir, folder))
44
+ end
44
45
  end
45
46
 
46
47
  private
47
48
 
48
- def partitions
49
- Dir.glob(File.join(root_dir, "**/*")).map { |f| f.split("/").last }
49
+ def compacted?
50
+ uncompacted_folders.empty?
50
51
  end
51
52
 
53
+ def uncompacted_folders
54
+ folders - ["compacted"]
55
+ end
56
+
57
+ def folders
58
+ Dir.glob(File.join(root_dir, "**/*")).select { |f| File.directory?(f) }.map { |f| f.split("/").last }
59
+ end
60
+
61
+ # def partitions
62
+ # Dir.glob(File.join(root_dir, "**/*")).map { |f| f.split("/").last }
63
+ # end
64
+
52
65
  def compact_each_partition
53
66
  with_each_partition do |partition_df, _|
54
67
  safe_write(
@@ -31,6 +31,10 @@ module EasyML
31
31
  adapter_class.new(options).unlock!
32
32
  end
33
33
 
34
+ def cp(from, to)
35
+ adapter_class.new(options).cp(from, to)
36
+ end
37
+
34
38
  def store(df, *args)
35
39
  return df if df.is_a?(Polars::LazyFrame) ? df.schema.empty? : df.empty?
36
40
 
@@ -51,6 +51,10 @@ module EasyML
51
51
  def num_rows
52
52
  Reader.num_rows
53
53
  end
54
+
55
+ def cp(from, to)
56
+ Writer.cp(from, to)
57
+ end
54
58
  end
55
59
 
56
60
  def list_nulls(input = nil, **kwargs, &block)
@@ -121,8 +121,6 @@ module EasyML
121
121
  polars_type ? sym_to_polars(type_name) : type_name
122
122
  end
123
123
 
124
- measure_method_timing :determine_type
125
-
126
124
  # Determines if a string field is a date, text, or categorical
127
125
  # @param series [Polars::Series] The string series to analyze
128
126
  # @return [Symbol] One of :datetime, :text, or :categorical
@@ -149,8 +147,6 @@ module EasyML
149
147
  end
150
148
  end
151
149
 
152
- measure_method_timing :determine_string_type
153
-
154
150
  # Determines if a string field is categorical or free text
155
151
  # @param series [Polars::Series] The string series to analyze
156
152
  # @return [Symbol] Either :categorical or :text
@@ -178,8 +174,6 @@ module EasyML
178
174
  avg_percentage < 1.0 ? :text : :categorical
179
175
  end
180
176
 
181
- measure_method_timing :categorical_or_text?
182
-
183
177
  # Returns whether the field type is numeric
184
178
  # @param field_type [Symbol] The field type to check
185
179
  # @return [Boolean]
@@ -23,20 +23,16 @@ module EasyML
23
23
  end
24
24
  end
25
25
 
26
- def cp(old_version, new_version)
27
- old_dir = feature_dir_for_version(old_version)
28
- new_dir = feature_dir_for_version(new_version)
29
-
30
- return if old_dir.nil? || !Dir.exist?(old_dir)
31
-
32
- FileUtils.mkdir_p(new_dir)
33
- files_to_cp = Dir.glob(Pathname.new(old_dir).join("**/*")).select { |f| File.file?(f) }
26
+ def synced?
27
+ files.any?
28
+ end
34
29
 
35
- files_to_cp.each do |file|
36
- target_file = file.gsub(old_dir, new_dir)
37
- FileUtils.mkdir_p(File.dirname(target_file))
38
- FileUtils.cp(file, target_file)
39
- end
30
+ def bump_version(version)
31
+ compact
32
+ cp(
33
+ feature_dir_for_version(version),
34
+ feature_dir_for_version(version + 1),
35
+ )
40
36
  end
41
37
 
42
38
  private
@@ -3,6 +3,7 @@ require "singleton"
3
3
  module EasyML
4
4
  class Predict
5
5
  include Singleton
6
+ include EasyML::Timing
6
7
 
7
8
  attr_reader :models
8
9
 
@@ -20,7 +21,7 @@ module EasyML
20
21
  def self.predict(model_name, df, serialize: false)
21
22
  df = normalize_input(df)
22
23
  output = make_predictions(model_name, df) do |model, normalized_df|
23
- model.predict(normalized_df)
24
+ model.predict(normalized_df, normalized: true)
24
25
  end
25
26
 
26
27
  if serialize
@@ -33,7 +34,7 @@ module EasyML
33
34
  def self.predict_proba(model_name, df, serialize: false)
34
35
  df = normalize_input(df)
35
36
  output = make_predictions(model_name, df) do |model, normalized_df|
36
- probas = model.predict_proba(normalized_df)
37
+ probas = model.predict_proba(normalized_df, normalized: true)
37
38
  probas.map { |proba_array| proba_array.map { |p| p.round(4) } }
38
39
  end
39
40
 
@@ -91,8 +92,8 @@ module EasyML
91
92
 
92
93
  output = predictions.zip(raw_input, normalized_input).map do |pred, raw, norm|
93
94
  EasyML::Prediction.create!(
94
- model: current_version.model,
95
- model_history: current_version,
95
+ model_id: current_version.model.id,
96
+ model_history_id: current_version.id,
96
97
  prediction_type: current_version.model.task,
97
98
  prediction_value: pred,
98
99
  raw_input: raw,
@@ -59,6 +59,7 @@ module EasyML
59
59
  add_pca_model_id_to_easy_ml_columns
60
60
  add_workflow_status_to_easy_ml_dataset_histories
61
61
  add_metadata_to_easy_ml_predictions
62
+ add_unique_constraint_to_dataset_names
62
63
  ].freeze
63
64
 
64
65
  # Specify the next migration number
@@ -0,0 +1,13 @@
1
+ class AddUniqueConstraintToDatasetNames < ActiveRecord::Migration[<%= ActiveRecord::Migration.current_version %>]
2
+ def change
3
+ if index_exists?(:easy_ml_datasets, :name)
4
+ remove_index :easy_ml_datasets, :name
5
+ end
6
+ add_index :easy_ml_datasets, :name, unique: true
7
+
8
+ if index_exists?(:easy_ml_dataset_histories, :name)
9
+ remove_index :easy_ml_dataset_histories, :name
10
+ end
11
+ add_index :easy_ml_dataset_histories, :name, unique: true
12
+ end
13
+ end
@@ -19,7 +19,9 @@ module EasyML
19
19
  result = send(method_alias, *args, **kwargs, &block)
20
20
  ending = Process.clock_gettime(Process::CLOCK_MONOTONIC)
21
21
  elapsed = ending - starting
22
- puts "#{method_name} took #{elapsed.round(2)} seconds"
22
+ 10.times do
23
+ puts "#{method_name} took #{elapsed.round(2)} seconds"
24
+ end
23
25
  # StatsD.measure("#{Rails.env}.#{prefix.present? ? "#{prefix}." : ""}#{method_name}.timing", elapsed)
24
26
  result
25
27
  end
@@ -1,7 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module EasyML
4
- VERSION = "0.2.0-rc89"
4
+ VERSION = "0.2.0-rc91"
5
5
 
6
6
  module Version
7
7
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: easy_ml
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0.pre.rc89
4
+ version: 0.2.0.pre.rc91
5
5
  platform: ruby
6
6
  authors:
7
7
  - Brett Shollenberger
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2025-03-03 00:00:00.000000000 Z
11
+ date: 2025-03-04 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: activerecord
@@ -803,6 +803,7 @@ files:
803
803
  - lib/easy_ml/railtie/templates/migration/add_raw_schema_to_datasets.rb.tt
804
804
  - lib/easy_ml/railtie/templates/migration/add_sha_to_datasources_datasets_and_columns.rb.tt
805
805
  - lib/easy_ml/railtie/templates/migration/add_slug_to_easy_ml_models.rb.tt
806
+ - lib/easy_ml/railtie/templates/migration/add_unique_constraint_to_dataset_names.rb.tt
806
807
  - lib/easy_ml/railtie/templates/migration/add_unique_constraint_to_easy_ml_model_names.rb.tt
807
808
  - lib/easy_ml/railtie/templates/migration/add_workflow_status_to_easy_ml_dataset_histories.rb.tt
808
809
  - lib/easy_ml/railtie/templates/migration/add_workflow_status_to_easy_ml_features.rb.tt