RubyGems - easy_ml - Versions diffs - 0.2.0.pre.rc76 → 0.2.0.pre.rc78 - Mend

easy_ml 0.2.0.pre.rc76 → 0.2.0.pre.rc78

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (63)

checksums.yaml +4 -4
data/app/controllers/easy_ml/models_controller.rb +3 -2
data/app/frontend/components/ModelForm.tsx +16 -0
data/app/frontend/components/ScheduleModal.tsx +0 -2
data/app/frontend/components/dataset/PreprocessingConfig.tsx +7 -6
data/app/jobs/easy_ml/application_job.rb +1 -0
data/app/jobs/easy_ml/batch_job.rb +47 -6
data/app/jobs/easy_ml/compute_feature_job.rb +10 -10
data/app/jobs/easy_ml/reaper.rb +14 -10
data/app/jobs/easy_ml/refresh_dataset_job.rb +2 -0
data/app/jobs/easy_ml/sync_datasource_job.rb +1 -0
data/app/models/concerns/easy_ml/dataframe_serialization.rb +1 -17
data/app/models/easy_ml/column/imputers/base.rb +1 -1
data/app/models/easy_ml/column/imputers/ordinal_encoder.rb +1 -5
data/app/models/easy_ml/column/imputers/today.rb +1 -1
data/app/models/easy_ml/column/selector.rb +0 -8
data/app/models/easy_ml/column.rb +1 -1
data/app/models/easy_ml/dataset/learner/base.rb +2 -2
data/app/models/easy_ml/dataset/learner/eager.rb +3 -1
data/app/models/easy_ml/dataset/learner/lazy.rb +4 -1
data/app/models/easy_ml/dataset/refresh_reasons.rb +12 -0
data/app/models/easy_ml/dataset.rb +29 -76
data/app/models/easy_ml/datasource.rb +0 -6
data/app/models/easy_ml/feature.rb +27 -38
data/app/models/easy_ml/model.rb +20 -2
data/app/models/easy_ml/models/xgboost/evals_callback.rb +3 -2
data/app/models/easy_ml/models/xgboost.rb +52 -36
data/app/models/easy_ml/retraining_run.rb +1 -1
data/app/serializers/easy_ml/dataset_serializer.rb +1 -1
data/app/serializers/easy_ml/model_serializer.rb +1 -0
data/lib/easy_ml/core/tuner.rb +7 -4
data/lib/easy_ml/data/dataset_manager/normalizer.rb +0 -0
data/lib/easy_ml/data/dataset_manager/reader/base.rb +80 -0
data/lib/easy_ml/data/dataset_manager/reader/batch.rb +106 -0
data/lib/easy_ml/data/dataset_manager/reader/data_frame.rb +23 -0
data/lib/easy_ml/data/dataset_manager/reader/file.rb +75 -0
data/lib/easy_ml/data/dataset_manager/reader.rb +58 -0
data/lib/easy_ml/data/dataset_manager/writer/append_only.rb +67 -0
data/lib/easy_ml/data/dataset_manager/writer/base.rb +139 -0
data/lib/easy_ml/data/dataset_manager/writer/named.rb +14 -0
data/lib/easy_ml/data/dataset_manager/writer/partitioned/partition_reasons.rb +15 -0
data/lib/easy_ml/data/dataset_manager/writer/partitioned.rb +150 -0
data/lib/easy_ml/data/dataset_manager/writer.rb +80 -0
data/lib/easy_ml/data/dataset_manager.rb +140 -0
data/lib/easy_ml/data/partition/boundaries.rb +60 -0
data/lib/easy_ml/data/partition.rb +7 -0
data/lib/easy_ml/data/polars_column.rb +19 -5
data/lib/easy_ml/data/synced_directory.rb +1 -2
data/lib/easy_ml/data.rb +2 -0
data/lib/easy_ml/engine.rb +16 -14
data/lib/easy_ml/feature_store.rb +21 -188
data/lib/easy_ml/reasons.rb +41 -0
data/lib/easy_ml/support/lockable.rb +1 -5
data/lib/easy_ml/version.rb +1 -1
data/lib/easy_ml.rb +1 -1
data/public/easy_ml/assets/.vite/manifest.json +1 -1
data/public/easy_ml/assets/assets/entrypoints/Application.tsx-Bbf3mD_b.js +522 -0
data/public/easy_ml/assets/assets/entrypoints/{Application.tsx-B1qLZuyu.js.map → Application.tsx-Bbf3mD_b.js.map} +1 -1
metadata +24 -9
data/app/models/easy_ml/datasources/polars_datasource.rb +0 -69
data/lib/easy_ml/data/filter_extensions.rb +0 -31
data/public/easy_ml/assets/assets/entrypoints/Application.tsx-B1qLZuyu.js +0 -522
/data/app/models/{lineage_history.rb → easy_ml/lineage_history.rb} +0 -0

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 1eebc157e0f33c3da40ef2b1bdb7cc0ed1c2b6f73615cdf26a6898cb60e60d2d
-  data.tar.gz: a12b441fe0736f251de773574858316346ba19c5b3784d73f3db200af0e619e4
+  metadata.gz: 13858267adb9445f665a01214f2109bc23dd63a76d5ab0ae502c60ac94a6d2d4
+  data.tar.gz: bc1b37afabf4757ce1e7e311699d6e8ac0bea2230025d8e696ada4071b0b3563
 SHA512:
-  metadata.gz: 4aabb816a9d02a6f2bd870cde3db3eaaf00a314cf5e0d50a11bf707534b9d93eddee648d62304f48976916ea9d5942269dbeded81d49df23199ffcc13d6ae0eb
-  data.tar.gz: 284973f49424ac622ceb3e44071e88336ea316154dee788b0e7c865441eeb01939192289deea84283b691bf8f5a3b79f708d3d62ab9fcec3d596f67ff4c093a9
+  metadata.gz: ccd5fc9e0b9529da07012a1745f826cf8e88391b24e3df20ba636c9e6ccf853172d18916cccc3087692873971a9dd2b72aa7151e286824df5cb500255610d603
+  data.tar.gz: 6034abbae5e25a00f204a649c62b568a90a76481c6ff91aaadd766fe515fe76dbf6692bebabe905c5a4bc1b9642717c77f4cbfda6b43684624a5e32517f73d99

data/app/controllers/easy_ml/models_controller.rb CHANGED Viewed

@@ -30,7 +30,7 @@ module EasyML
     def new
       render inertia: "pages/NewModelPage", props: {
         datasets: EasyML::Dataset.all.map do |dataset|
-          dataset.slice(:id, :name, :num_rows)
+          dataset_to_json(dataset)
         end,
         constants: EasyML::Model.constants,
       }
@@ -41,7 +41,7 @@ module EasyML
       render inertia: "pages/EditModelPage", props: {
         model: model_to_json(model),
         datasets: EasyML::Dataset.all.map do |dataset|
-          dataset.slice(:id, :name, :num_rows)
+          dataset_to_json(dataset)
         end,
         constants: EasyML::Model.constants,
       }
@@ -177,6 +177,7 @@ module EasyML
         :dataset_id,
         :task,
         :objective,
+        :weights_column,
         metrics: [],
         retraining_job_attributes: [
           :id,

data/app/frontend/components/ModelForm.tsx CHANGED Viewed

@@ -16,6 +16,7 @@ interface ModelFormProps {
     task: string;
     objective?: string;
     metrics?: string[];
+    weights_column?: string;
     retraining_job?: {
       frequency: string;
       at: {
@@ -75,6 +76,7 @@ export function ModelForm({ initialData, datasets, constants, isEditing, errors:
       task: initialData?.task || 'classification',
       objective: initialData?.objective || 'binary:logistic',
       metrics: initialData?.metrics || ['accuracy_score'],
+      weights_column: initialData?.weights_column || '',
       retraining_job_attributes: initialData?.retraining_job ? {
         id: initialData.retraining_job.id,
         frequency: initialData.retraining_job.frequency,
@@ -165,6 +167,7 @@ export function ModelForm({ initialData, datasets, constants, isEditing, errors:
   };
   const selectedDataset = datasets.find(d => d.id === data.model.dataset_id);
+  const columns = selectedDataset?.columns || [];
   const filteredTunerJobConstants = constants.tuner_job_constants[data.model.model_type] || {};
@@ -246,6 +249,19 @@ export function ModelForm({ initialData, datasets, constants, isEditing, errors:
             <ErrorDisplay error={errors.dataset_id} />
           </div>
+          <div>
+            <label className="block text-sm font-medium text-gray-700 mb-1">
+              Weights Column (Optional)
+            </label>
+            <SearchableSelect
+              value={data.model.weights_column}
+              options={columns.map(col => ({ value: col.name, label: col.name }))}
+              onChange={(value) => setData('model.weights_column', value)}
+              isClearable={true}
+            />
+            <ErrorDisplay error={errors.weights_column} />
+          </div>
           <div>
             <label className="block text-sm font-medium text-gray-700 mb-1">
               Task

data/app/frontend/components/ScheduleModal.tsx CHANGED Viewed

@@ -587,8 +587,6 @@ export function ScheduleModal({ isOpen, onClose, onSave, initialData, metrics, t
                           value={formData.retraining_job_attributes.threshold}
                           onChange={(e) => handleEvaluatorChange('threshold', parseFloat(e.target.value))}
                           step={0.01}
-                          min={0}
-                          max={1}
                           className="block w-full rounded-md border-gray-300 shadow-sm focus:border-blue-500 focus:ring-blue-500 py-2 px-4 shadow-sm border-gray-300 border"
                         />
                       </div>

data/app/frontend/components/dataset/PreprocessingConfig.tsx CHANGED Viewed

@@ -250,16 +250,17 @@ export function PreprocessingConfig({
     setIsEditingDescription(true);
   };
-  let nullCount = (column.statistics?.processed.null_count || column.statistics?.raw?.null_count) || 0;
-  const nullPercentage = nullCount && column.statistics?.raw.num_rows
-    ? ((nullCount / column.statistics.raw.num_rows) * 100)
+  let nullCount = (column.statistics?.processed?.null_count || column.statistics?.raw?.null_count) || 0;
+  let numRows = (column.statistics?.processed?.num_rows) || (column.statistics?.raw?.num_rows) || 0;
+  const nullPercentage = nullCount && numRows
+    ? ((nullCount / numRows) * 100)
     : 0;
-  const nullPercentageProcessed = column.statistics?.processed?.null_count && column.statistics?.raw.num_rows
-    ? ((column.statistics.processed.null_count / column.statistics.raw.num_rows) * 100)
+  const nullPercentageProcessed = column.statistics?.processed?.null_count && column.statistics?.processed?.num_rows
+    ? ((column.statistics.processed.null_count / column.statistics.processed.num_rows) * 100)
     : 0;
-  const totalRows = column.statistics?.raw.num_rows ?? 0;
+  const totalRows = numRows;
   const renderStrategySpecificInfo = (type: 'training' | 'inference') => {
     const strategy = type === 'training' ? training : inference;

data/app/jobs/easy_ml/application_job.rb CHANGED Viewed

@@ -1,5 +1,6 @@
 module EasyML
   class ApplicationJob < ActiveJob::Base
+    @queue = :easy_ml
     queue_as :easy_ml
     def create_event(model, status, error = nil)

data/app/jobs/easy_ml/batch_job.rb CHANGED Viewed

@@ -39,15 +39,15 @@ module EasyML
         rest.map do |batch|
           Resque.redis.rpush("batch:#{parent_id}:remaining", batch.to_json)
         end
+        track_batch(parent_id)
         handle_batch(parent_id, batch)
       end
       def handle_batch(parent_id, batch)
         if batch.size > 1
-          enqueue_batch(batch)
+          enqueue_batch(batch, parent_id)
         else
-          run_one_batch(parent_id, batch.first)
+          new.perform(parent_id, batch.first)
           after_batch_hook(parent_id, batch)
         end
       end
@@ -60,7 +60,21 @@ module EasyML
       end
       def next_batch?(parent_id)
-        batches_remaining(parent_id) > 0
+        (batches_remaining(parent_id) > 0)
+      end
+      def list_batches
+        Resque.redis.hkeys("batches:tracking")
+      end
+      def track_batch(parent_id)
+        Resque.redis.hset("batches:tracking", parent_id, 1)
+      end
+      def cleanup_all
+        list_batches.each do |batch_id|
+          cleanup_batch(batch_id)
+        end
       end
       def batches_remaining(parent_id)
@@ -69,12 +83,39 @@ module EasyML
       def cleanup_batch(parent_id)
         Resque.redis.del("batch:#{parent_id}:remaining")
+        Resque.redis.hdel("batches:tracking", parent_id)
       end
-      private
+      def batch_args
+        list_batches.map do |batch_id|
+          fetch_batch_arguments(batch_id)
+        end
+      end
+      def select_batches(&block)
+        list_batches.select do |batch_id|
+          yield fetch_batch_arguments(batch_id)
+        end
+      end
+      def poll
+        while true
+          sleep 2
+          EasyML::BatchJob.list_batches.map do |batch|
+            puts "Batch #{batch} | Remaining : #{EasyML::BatchJob.batches_remaining(batch)}"
+          end
+        end
+      end
       def get_parent_batch_id(args_list)
-        args_list.dup.flatten.first.dig(:parent_batch_id)
+        args_list.dup.flatten.detect { |arg| arg.dig(:parent_batch_id) }.dig(:parent_batch_id)
+      end
+      private
+      def get_args_list(batch_id)
+        redis_key = "#{batch(batch_id)}:original_args"
+        redis.get(redis_key)
       end
       # Store batch arguments in Redis

data/app/jobs/easy_ml/compute_feature_job.rb CHANGED Viewed

@@ -14,31 +14,31 @@ module EasyML
       #
       # https://github.com/drfeelngood/resque-batched-job/blob/master/lib/resque/plugins/batched_job.rb#L86
       batch_args = batch_args.dup
-      run_one_batch(batch_id, batch_args)
+      EasyML::ComputeFeatureJob.new.perform(batch_id, batch_args)
     end
-    def self.run_one_batch(batch_id, batch_args)
+    def perform(batch_id, batch_args = {})
       EasyML::Feature.fit_one_batch(batch_id, batch_args)
     end
     def self.after_batch_hook(batch_id, *args)
-      batch_args = fetch_batch_arguments(batch_id).flatten.map(&:symbolize_keys)
-      feature_ids = batch_args.pluck(:feature_id).uniq
-      parent_id = batch_args.pluck(:parent_batch_id).first
+      args = args.flatten.first.with_indifferent_access
+      feature_id = args.dig(:feature_id)
-      feature = EasyML::Feature.find_by(id: feature_ids.first)
+      feature = EasyML::Feature.find_by(id: feature_id)
       if feature.failed?
         dataset.features.where(workflow_status: :analyzing).update_all(workflow_status: :ready)
-        return BatchJob.cleanup_batch(parent_id)
+        return BatchJob.cleanup_batch(batch_id)
       end
       feature.after_fit
-      if BatchJob.next_batch?(parent_id)
-        BatchJob.enqueue_next_batch(self, parent_id)
+      if BatchJob.next_batch?(batch_id)
+        BatchJob.enqueue_next_batch(self, batch_id)
       else
-        dataset = EasyML::Feature.find_by(id: feature_ids.first).dataset
+        cleanup_batch(batch_id)
+        dataset = feature.dataset
         dataset.after_fit_features
       end
     end

data/app/jobs/easy_ml/reaper.rb CHANGED Viewed

@@ -9,8 +9,8 @@ module EasyML
             {
               worker: worker,
               working: true,
-              class: args.dig("job_class"),
-              args: args.dig("arguments"),
+              class: args.is_a?(Hash) ? args.dig("job_class") : nil,
+              args: args.is_a?(Hash) ? args.dig("arguments") : nil,
               pid: worker.pid,
             }
           else
@@ -19,17 +19,23 @@ module EasyML
         end
       end
-      def find_job(worker_class, *args)
+      def find_job(worker_class, *args, &block)
         list_workers.select do |config|
-          config.dig(:class) == worker_class.to_s && config.dig(:args) == args
+          selected = config.dig(:class) == worker_class.to_s
+          if block_given?
+            selected &&= yield(config)
+          else
+            selected &= config.dig(:args) == args
+          end
+          selected
         end
       end
-      def kill(worker_class, *args)
-        find_job(worker_class, *args).each do |job|
+      def kill(worker_class, *args, &block)
+        find_job(worker_class, *args, &block).each do |job|
           begin
-            # Send TERM signal to the process
-            Process.kill("TERM", job[:pid])
+            # Send HUP signal to the process
+            Process.kill("USR1", job[:pid])
             # Remove the worker from Redis so it doesn't show up as a zombie
             # in the Resque web interface. This is important because:
@@ -37,12 +43,10 @@ module EasyML
             # 2. Prevents confusion about running workers
             # 3. Allows proper worker cleanup in Redis
             job[:worker].done_working
-            job[:worker].unregister_worker
           rescue Errno::ESRCH
             # Process already gone, but still try to clean up Redis
             begin
               job[:worker].done_working
-              job[:worker].unregister_worker
             rescue => e
               # Redis cleanup failed, worker might already be unregistered
               puts "Failed to unregister worker: #{e.message}"

data/app/jobs/easy_ml/refresh_dataset_job.rb CHANGED Viewed

@@ -1,5 +1,7 @@
 module EasyML
   class RefreshDatasetJob < ApplicationJob
+    @queue = :easy_ml
     def perform(id)
       begin
         dataset = EasyML::Dataset.find(id)

data/app/jobs/easy_ml/sync_datasource_job.rb CHANGED Viewed

@@ -8,6 +8,7 @@ module EasyML
       begin
         datasource.refresh
+        datasource.after_sync
       rescue StandardError => e
         datasource.update!(is_syncing: false)
         handle_error(datasource, e)

data/app/models/concerns/easy_ml/dataframe_serialization.rb CHANGED Viewed

@@ -8,23 +8,7 @@ module EasyML
     end
     def deserialize_dataframe(df_data)
-      return unless df_data.present? && df_data.key?("columns")
-      columns = df_data["columns"].map do |col|
-        dtype = case col["datatype"]
-          when Hash
-            if col["datatype"]["Datetime"]
-              Polars::Datetime.new(col["datatype"]["Datetime"][0].downcase.to_sym).class
-            else
-              Polars::Utf8
-            end
-          else
-            Polars.const_get(col["datatype"])
-          end
-        Polars::Series.new(col["name"], col["values"], dtype: dtype)
-      end
-      Polars::DataFrame.new(columns)
+      Polars::DataFrame.new(df_data)
     end
   end
 end

data/app/models/easy_ml/column/imputers/base.rb CHANGED Viewed

@@ -63,7 +63,7 @@ module EasyML
           if column.is_computed
             column.statistics.dig(:processed, *args)
           else
-            column.statistics.dig(:clipped, *args) || column.statistics.dig(:raw, *args)
+            column.statistics.dig(:raw, *args)
           end
         end

data/app/models/easy_ml/column/imputers/ordinal_encoder.rb CHANGED Viewed

@@ -50,11 +50,7 @@ module EasyML
         end
         def cast_encoder(encoder)
-          begin
-            encoder.transform_keys { |k| column.cast(k) }
-          rescue => e
-            binding.pry
-          end
+          encoder.transform_keys { |k| column.cast(k) }
         end
         def cast_decoder(decoder)

data/app/models/easy_ml/column/imputers/today.rb CHANGED Viewed

@@ -10,7 +10,7 @@ module EasyML
         def transform(df)
           df = df.with_column(
-            Polars.col(column.name).fill_null(Polars.lit(UTC.today.beginning_of_day)).alias(column.name)
+            Polars.col(column.name).fill_null(Polars.lit(EasyML::Support::UTC.today.beginning_of_day)).alias(column.name)
           )
           df
         end

data/app/models/easy_ml/column/selector.rb CHANGED Viewed

@@ -24,14 +24,6 @@ module EasyML
         end
       end
-      def clipped
-        Selector.new(column, :raw) do |df|
-          column.imputers.training.clip(df)
-        end
-      end
-      measure_method_timing :clipped
       def processed
         Selector.new(column, :processed)
       end

data/app/models/easy_ml/column.rb CHANGED Viewed

@@ -140,7 +140,7 @@ module EasyML
       end
     end
-    delegate :raw, :processed, :data, :train, :test, :valid, :clipped, to: :data_selector
+    delegate :raw, :processed, :data, :train, :test, :valid, to: :data_selector
     def empty?
       data.blank?

data/app/models/easy_ml/dataset/learner/base.rb CHANGED Viewed

@@ -15,8 +15,8 @@ module EasyML
             (column.one_hot? && type.to_sym == :processed)
         end
-        TYPES_ALL = %i(raw clipped processed)
-        TYPES_RAW = %i(raw clipped)
+        TYPES_ALL = %i(raw processed)
+        TYPES_RAW = %i(raw)
         TYPES_PROCESSED = %i(processed)
         def types(type = :all)

data/app/models/easy_ml/dataset/learner/eager.rb CHANGED Viewed

@@ -19,7 +19,9 @@ module EasyML
         end
         def fetch_df(split, type)
-          @dataset.send(type).send(split, all_columns: true)
+          dataset.columns.apply_clip(
+            @dataset.send(type).send(split, all_columns: true)
+          )
         end
         def execute_queries(split, type)

data/app/models/easy_ml/dataset/learner/lazy.rb CHANGED Viewed

@@ -21,7 +21,10 @@ module EasyML
         def run_queries(split, type)
           queries = build_queries(split, type)
-          @dataset.send(type).send(split, all_columns: true, lazy: true).select(queries).collect
+          dataset.columns.apply_clip(
+            @dataset.send(type).send(split, all_columns: true, lazy: true)
+          ).select(queries).collect
         end
         def get_column_statistics(query_results)

data/app/models/easy_ml/dataset/refresh_reasons.rb ADDED Viewed

@@ -0,0 +1,12 @@
+module EasyML
+  class Dataset
+    class RefreshReasons < EasyML::Reasons
+      add_reason "Not split", -> { not_split? }
+      add_reason "Refreshed at is nil", -> { refreshed_at.nil? }
+      add_reason "Columns need refresh", -> { columns_need_refresh? }
+      add_reason "Features need refresh", -> { features_need_fit? }
+      add_reason "Datasource needs refresh", -> { datasource_needs_refresh? }
+      add_reason "Datasource was refreshed", -> { datasource_was_refreshed? }
+    end
+  end
+end

data/app/models/easy_ml/dataset.rb CHANGED Viewed

@@ -180,6 +180,8 @@ module EasyML
       EasyML::Reaper.kill(EasyML::RefreshDatasetJob, id)
       update(workflow_status: :ready)
       unlock!
+      features.update_all(needs_fit: true, workflow_status: "ready")
+      features.each(&:wipe)
     end
     def refresh_async
@@ -201,12 +203,6 @@ module EasyML
       @raw = initialize_split("raw")
     end
-    def clipped
-      return @clipped if @clipped && @clipped.dataset
-      @clipped = initialize_split("clipped")
-    end
     def processed
       return @processed if @processed && @processed.dataset
@@ -265,9 +261,7 @@ module EasyML
     def refresh!(async: false)
       refreshing do
-        puts "Prepare..."
         prepare!
-        puts "Fit features..."
         fit_features!(async: async)
       end
     end
@@ -276,9 +270,7 @@ module EasyML
       return refresh_async if async
       refreshing do
-        puts "prepare.."
         prepare
-        puts "fit features..."
         fit_features(async: async)
       end
     end
@@ -291,6 +283,7 @@ module EasyML
     def fit_features(async: false, features: self.features, force: false)
       features_to_compute = force ? features : features.needs_fit
+      puts "Features to compute.... #{features_to_compute}"
       return after_fit_features if features_to_compute.empty?
       features.first.fit(features: features_to_compute, async: async)
@@ -299,11 +292,12 @@ module EasyML
     measure_method_timing :fit_features
     def after_fit_features
-      puts "after fit features..."
+      puts "After fit features"
       unlock!
       reload
       return if failed?
+      puts "Actually refresh..."
       actually_refresh
     end
@@ -338,45 +332,12 @@ module EasyML
     #
     # So yes this is an annoying way to structure a method, but it's helpful for performance
     #
-    def refresh_reasons(exclude: [])
-      {
-        not_split: {
-          name: "Not split",
-          check: -> { not_split? },
-        },
-        refreshed_at_is_nil: {
-          name: "Refreshed at is nil",
-          check: -> { refreshed_at.nil? },
-        },
-        columns_need_refresh: {
-          name: "Columns need refresh",
-          check: -> { columns_need_refresh? },
-        },
-        features_need_fit: {
-          name: "Features need refresh",
-          check: -> { features_need_fit? },
-        },
-        datasource_needs_refresh: {
-          name: "Datasource needs refresh",
-          check: -> { datasource_needs_refresh? },
-        },
-        refreshed_datasource: {
-          name: "Refreshed datasource",
-          check: -> { refreshed_datasource? },
-        },
-        datasource_was_refreshed: {
-          name: "Datasource was refreshed",
-          check: -> { datasource_was_refreshed? },
-        },
-      }.except(*exclude).select do |k, config|
-        config[:check].call
-      end.map do |k, config|
-        config[:name]
-      end
+    def refresh_reasons(except: [])
+      RefreshReasons.new(self).check(except: except)
     end
-    def needs_refresh?(exclude: [])
-      refresh_reasons(exclude: exclude).any?
+    def needs_refresh?(except: [])
+      refresh_reasons(except: except).any?
     end
     def processed?
@@ -423,6 +384,7 @@ module EasyML
     def unlock!
       Support::Lockable.unlock!(lock_key)
+      features.each(&:unlock!)
     end
     def locked?
@@ -518,23 +480,34 @@ module EasyML
     end
     def normalize(df = nil, split_ys: false, inference: false, all_columns: false, features: self.features)
-      puts "Apply missing features..."
       df = apply_missing_columns(df, inference: inference)
-      puts "Transform columns..."
       df = columns.transform(df, inference: inference)
-      puts "Apply features..."
       df = apply_features(df, features)
-      puts "Transform columns..."
       df = columns.transform(df, inference: inference, computed: true)
-      puts "Apply column mask..."
       df = apply_column_mask(df, inference: inference) unless all_columns
-      puts "Drop nulls..."
       df = drop_nulls(df) unless inference
-      puts "Split features and targets..."
       df, = processed.split_features_targets(df, true, target) if split_ys
       df
     end
+    # Massage out one-hot cats to their canonical name
+    #
+    # Takes: ["Sex_male", "Sex_female", "Embarked_c", "PassengerId"]
+    # Returns: ["Embarked", "Sex", "PassengerId"]
+    def regular_columns(col_list)
+      one_hot_cats = columns.allowed_categories.invert.reduce({}) do |h, (k, v)|
+        h.tap do
+          k.each do |k2|
+            h["#{v}_#{k2}"] = v
+          end
+        end
+      end
+      col_list.map do |col|
+        one_hot_cats.key?(col) ? one_hot_cats[col] : col
+      end.uniq.sort
+    end
     measure_method_timing :normalize
     def missing_required_fields(df)
@@ -582,7 +555,6 @@ module EasyML
     def cleanup
       raw.cleanup
-      clipped.cleanup
       processed.cleanup
     end
@@ -775,10 +747,8 @@ module EasyML
     def initialize_splits
       @raw = nil
-      @clipped = nil
       @processed = nil
       raw
-      clipped
       processed
     end
@@ -823,7 +793,7 @@ module EasyML
       processed.cleanup
       SPLIT_ORDER.each do |segment|
-        df = clipped.read(segment)
+        df = raw.read(segment)
         learn_computed_columns(df) if segment == :train
         processed_df = normalize(df, all_columns: true)
         processed.save(segment, processed_df)
@@ -870,26 +840,9 @@ module EasyML
     end
     def fit
-      apply_clip
       learn_statistics(type: :raw)
     end
-    def apply_clip
-      clipped.cleanup
-      SPLIT_ORDER.each do |segment|
-        df = raw.send(segment, lazy: true, all_columns: true)
-        clipped.save(
-          segment,
-          columns.apply_clip(df) # Ensuring this returns a LazyFrame means we'll automatically use sink_parquet
-        )
-      end
-    end
-    measure_method_timing :apply_clip
-    # log_method :fit, "Learning statistics", verbose: true
     def split_data!
       split_data(force: true)
     end