RubyGems - easy_ml - Versions diffs - 0.2.0.pre.rc71 → 0.2.0.pre.rc75 - Mend

easy_ml 0.2.0.pre.rc71 → 0.2.0.pre.rc75

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (117)

checksums.yaml +4 -4
data/app/controllers/easy_ml/datasets_controller.rb +33 -0
data/app/controllers/easy_ml/datasources_controller.rb +7 -0
data/app/controllers/easy_ml/models_controller.rb +46 -0
data/app/frontend/components/DatasetCard.tsx +212 -0
data/app/frontend/components/ModelCard.tsx +114 -29
data/app/frontend/components/StackTrace.tsx +13 -0
data/app/frontend/components/dataset/FeatureConfigPopover.tsx +10 -7
data/app/frontend/components/datasets/UploadDatasetButton.tsx +51 -0
data/app/frontend/components/models/DownloadModelModal.tsx +90 -0
data/app/frontend/components/models/UploadModelModal.tsx +212 -0
data/app/frontend/components/models/index.ts +2 -0
data/app/frontend/pages/DatasetsPage.tsx +36 -130
data/app/frontend/pages/DatasourcesPage.tsx +22 -2
data/app/frontend/pages/ModelsPage.tsx +37 -11
data/app/frontend/types/dataset.ts +1 -2
data/app/frontend/types.ts +1 -1
data/app/jobs/easy_ml/reaper.rb +55 -0
data/app/jobs/easy_ml/training_job.rb +1 -1
data/app/models/easy_ml/column/imputers/base.rb +4 -0
data/app/models/easy_ml/column/imputers/clip.rb +5 -3
data/app/models/easy_ml/column/imputers/imputer.rb +11 -13
data/app/models/easy_ml/column/imputers/mean.rb +7 -3
data/app/models/easy_ml/column/imputers/null_imputer.rb +3 -0
data/app/models/easy_ml/column/imputers/ordinal_encoder.rb +5 -1
data/app/models/easy_ml/column/imputers.rb +3 -1
data/app/models/easy_ml/column/lineage/base.rb +5 -1
data/app/models/easy_ml/column/lineage/computed_by_feature.rb +1 -1
data/app/models/easy_ml/column/lineage/preprocessed.rb +1 -1
data/app/models/easy_ml/column/lineage/raw_dataset.rb +1 -1
data/app/models/easy_ml/column/selector.rb +4 -0
data/app/models/easy_ml/column.rb +79 -63
data/app/models/easy_ml/column_history.rb +28 -28
data/app/models/easy_ml/column_list/imputer.rb +23 -0
data/app/models/easy_ml/column_list.rb +39 -26
data/app/models/easy_ml/dataset/learner/base.rb +34 -0
data/app/models/easy_ml/dataset/learner/eager/boolean.rb +10 -0
data/app/models/easy_ml/dataset/learner/eager/categorical.rb +51 -0
data/app/models/easy_ml/dataset/learner/eager/query.rb +37 -0
data/app/models/easy_ml/dataset/learner/eager.rb +43 -0
data/app/models/easy_ml/dataset/learner/lazy/boolean.rb +13 -0
data/app/models/easy_ml/dataset/learner/lazy/categorical.rb +10 -0
data/app/models/easy_ml/dataset/learner/lazy/datetime.rb +19 -0
data/app/models/easy_ml/dataset/learner/lazy/null.rb +17 -0
data/app/models/easy_ml/dataset/learner/lazy/numeric.rb +19 -0
data/app/models/easy_ml/dataset/learner/lazy/query.rb +69 -0
data/app/models/easy_ml/dataset/learner/lazy/string.rb +19 -0
data/app/models/easy_ml/dataset/learner/lazy.rb +51 -0
data/app/models/easy_ml/dataset/learner/query.rb +25 -0
data/app/models/easy_ml/dataset/learner.rb +100 -0
data/app/models/easy_ml/dataset.rb +150 -36
data/app/models/easy_ml/dataset_history.rb +1 -0
data/app/models/easy_ml/datasource.rb +9 -0
data/app/models/easy_ml/event.rb +5 -7
data/app/models/easy_ml/export/column.rb +27 -0
data/app/models/easy_ml/export/dataset.rb +37 -0
data/app/models/easy_ml/export/datasource.rb +12 -0
data/app/models/easy_ml/export/feature.rb +24 -0
data/app/models/easy_ml/export/model.rb +40 -0
data/app/models/easy_ml/export/retraining_job.rb +20 -0
data/app/models/easy_ml/export/splitter.rb +14 -0
data/app/models/easy_ml/feature.rb +21 -0
data/app/models/easy_ml/import/column.rb +35 -0
data/app/models/easy_ml/import/dataset.rb +148 -0
data/app/models/easy_ml/import/feature.rb +36 -0
data/app/models/easy_ml/import/model.rb +136 -0
data/app/models/easy_ml/import/retraining_job.rb +29 -0
data/app/models/easy_ml/import/splitter.rb +34 -0
data/app/models/easy_ml/lineage.rb +44 -0
data/app/models/easy_ml/model.rb +101 -37
data/app/models/easy_ml/model_file.rb +6 -0
data/app/models/easy_ml/models/xgboost/evals_callback.rb +7 -7
data/app/models/easy_ml/models/xgboost.rb +33 -9
data/app/models/easy_ml/retraining_job.rb +8 -1
data/app/models/easy_ml/retraining_run.rb +7 -5
data/app/models/easy_ml/splitter.rb +8 -0
data/app/models/lineage_history.rb +6 -0
data/app/serializers/easy_ml/column_serializer.rb +7 -1
data/app/serializers/easy_ml/dataset_serializer.rb +2 -1
data/app/serializers/easy_ml/lineage_serializer.rb +9 -0
data/config/routes.rb +14 -1
data/lib/easy_ml/core/tuner/adapters/base_adapter.rb +3 -3
data/lib/easy_ml/core/tuner.rb +13 -12
data/lib/easy_ml/data/polars_column.rb +149 -100
data/lib/easy_ml/data/polars_reader.rb +8 -5
data/lib/easy_ml/data/polars_schema.rb +56 -0
data/lib/easy_ml/data/splits/file_split.rb +20 -2
data/lib/easy_ml/data/splits/split.rb +10 -1
data/lib/easy_ml/data.rb +1 -0
data/lib/easy_ml/deep_compact.rb +19 -0
data/lib/easy_ml/engine.rb +1 -0
data/lib/easy_ml/feature_store.rb +2 -6
data/lib/easy_ml/railtie/generators/migration/migration_generator.rb +6 -0
data/lib/easy_ml/railtie/templates/migration/add_extra_metadata_to_columns.rb.tt +9 -0
data/lib/easy_ml/railtie/templates/migration/add_raw_schema_to_datasets.rb.tt +9 -0
data/lib/easy_ml/railtie/templates/migration/add_unique_constraint_to_easy_ml_model_names.rb.tt +8 -0
data/lib/easy_ml/railtie/templates/migration/create_easy_ml_lineages.rb.tt +24 -0
data/lib/easy_ml/railtie/templates/migration/remove_evaluator_from_retraining_jobs.rb.tt +7 -0
data/lib/easy_ml/railtie/templates/migration/update_preprocessing_steps_to_jsonb.rb.tt +18 -0
data/lib/easy_ml/timing.rb +34 -0
data/lib/easy_ml/version.rb +1 -1
data/lib/easy_ml.rb +2 -0
data/public/easy_ml/assets/.vite/manifest.json +2 -2
data/public/easy_ml/assets/assets/Application-Q7L6ioxr.css +1 -0
data/public/easy_ml/assets/assets/entrypoints/Application.tsx-Rrzo4ecT.js +522 -0
data/public/easy_ml/assets/assets/entrypoints/Application.tsx-Rrzo4ecT.js.map +1 -0
metadata +53 -12
data/app/models/easy_ml/column/learners/base.rb +0 -103
data/app/models/easy_ml/column/learners/boolean.rb +0 -11
data/app/models/easy_ml/column/learners/categorical.rb +0 -51
data/app/models/easy_ml/column/learners/datetime.rb +0 -19
data/app/models/easy_ml/column/learners/null.rb +0 -22
data/app/models/easy_ml/column/learners/numeric.rb +0 -33
data/app/models/easy_ml/column/learners/string.rb +0 -15
data/public/easy_ml/assets/assets/Application-BbFobaXt.css +0 -1
data/public/easy_ml/assets/assets/entrypoints/Application.tsx-CibZcrBc.js +0 -489
data/public/easy_ml/assets/assets/entrypoints/Application.tsx-CibZcrBc.js.map +0 -1

data/app/models/easy_ml/models/xgboost/evals_callback.rb CHANGED Viewed

@@ -32,9 +32,9 @@ module EasyML
           false
         end
-        def test_dataset
+        def valid_dataset
           if tuner.present?
-            [tuner.x_true, tuner.y_true]
+            [tuner.x_valid, tuner.y_valid]
           else
             model.dataset.valid(split_ys: true)
           end
@@ -46,12 +46,12 @@ module EasyML
           log_frequency = 10
           if epoch % log_frequency == 0
             model.adapter.external_model = booster
-            x_true, y_true = test_dataset
-            @preprocessed ||= model.preprocess(x_true)
+            x_valid, y_valid = valid_dataset
+            @preprocessed ||= model.preprocess(x_valid)
             y_pred = model.predict(@preprocessed)
-            dataset = model.dataset.test(all_columns: true)
+            dataset = model.dataset.valid(all_columns: true)
-            metrics = model.evaluate(y_pred: y_pred, y_true: y_true, x_true: x_true, dataset: dataset)
+            metrics = model.evaluate(y_pred: y_pred, y_true: y_valid, x_true: x_valid, dataset: dataset)
             Wandb.log(metrics)
           end
@@ -67,7 +67,7 @@ module EasyML
         def after_training(booster)
           return booster unless wandb_enabled?
-          if model.last_run&.wandb_url.nil?
+          if model.last_run.present? && model.last_run&.wandb_url.nil?
             if tuner.present? && !tuner.current_run.wandb_url.present?
               tuner.current_run.wandb_url = Wandb.current_run.url
             end

data/app/models/easy_ml/models/xgboost.rb CHANGED Viewed

@@ -199,7 +199,7 @@ module EasyML
         set_default_wandb_project_name unless tuning
         # Prepare validation data
-        x_valid, y_valid = dataset.valid(split_ys: true)
+        x_valid, y_valid = dataset.valid(split_ys: true, select: dataset.col_order)
         d_valid = preprocess(x_valid, y_valid)
         num_iterations = hyperparameters.to_h[:n_estimators]
@@ -217,7 +217,7 @@ module EasyML
         callbacks << ::XGBoost::EvaluationMonitor.new(period: 1)
         # Generate batches without loading full dataset
-        batches = dataset.train(split_ys: true, batch_size: batch_size, batch_start: batch_start, batch_key: batch_key)
+        batches = dataset.train(split_ys: true, batch_size: batch_size, batch_start: batch_start, batch_key: batch_key, select: dataset.col_order)
         prev_xs = []
         prev_ys = []
@@ -281,9 +281,32 @@ module EasyML
         return @booster
       end
-      def weights
-        @booster.save_model("tmp/xgboost_model.json")
-        @booster.get_dump
+      def weights(model_file)
+        return nil unless model_file.present? && model_file.fit?
+        JSON.parse(model_file.read)
+      end
+      def set_weights(model_file, weights)
+        raise ArgumentError, "Weights must be provided" unless weights.present?
+        # Create a temp file with the weights
+        temp_file = Tempfile.new(["xgboost_weights", ".json"])
+        begin
+          temp_file.write(weights.to_json)
+          temp_file.close
+          # Load the weights into a new booster
+          initialize_model do
+            attrs = {
+              params: hyperparameters.to_h.symbolize_keys.compact,
+              model_file: temp_file.path,
+            }.compact
+            booster_class.new(**attrs)
+          end
+        ensure
+          temp_file.unlink
+        end
       end
       def predict(xs)
@@ -397,11 +420,12 @@ module EasyML
       def prepare_data
         if @d_train.nil?
-          x_sample, y_sample = dataset.train(split_ys: true, limit: 5)
+          col_order = dataset.col_order
+          x_sample, y_sample = dataset.train(split_ys: true, limit: 5, select: col_order)
           preprocess(x_sample, y_sample) # Ensure we fail fast if the dataset is misconfigured
-          x_train, y_train = dataset.train(split_ys: true)
-          x_valid, y_valid = dataset.valid(split_ys: true)
-          x_test, y_test = dataset.test(split_ys: true)
+          x_train, y_train = dataset.train(split_ys: true, select: col_order)
+          x_valid, y_valid = dataset.valid(split_ys: true, select: col_order)
+          x_test, y_test = dataset.test(split_ys: true, select: col_order)
           @d_train = preprocess(x_train, y_train)
           @d_valid = preprocess(x_valid, y_valid)
           @d_test = preprocess(x_test, y_test)

data/app/models/easy_ml/retraining_job.rb CHANGED Viewed

@@ -6,7 +6,6 @@
 #  model_id         :bigint
 #  frequency        :string           not null
 #  at               :json             not null
-#  evaluator        :json
 #  tuning_enabled   :boolean          default(FALSE)
 #  tuner_config     :json
 #  tuning_frequency :string
@@ -160,6 +159,14 @@ module EasyML
       }[frequency.to_sym]
     end
+    def to_config
+      EasyML::Export::RetrainingJob.to_config(self)
+    end
+    def self.from_config(config, model)
+      EasyML::Import::RetrainingJob.from_config(config, model)
+    end
     private
     def metric_class

data/app/models/easy_ml/retraining_run.rb CHANGED Viewed

@@ -37,7 +37,7 @@ module EasyML
     belongs_to :model_file, class_name: "EasyML::ModelFile", optional: true
     has_many :events, as: :eventable, class_name: "EasyML::Event", dependent: :destroy
-    validates :status, presence: true, inclusion: { in: %w[pending running success failed deployed] }
+    validates :status, presence: true, inclusion: { in: %w[pending running success failed deployed aborted] }
     scope :running, -> { where(status: "running") }
@@ -83,7 +83,6 @@ module EasyML
             completed_at: failed_reasons.none? ? Time.current : nil,
             error_message: failed_reasons.any? ? failed_reasons&.first : nil,
             model: training_model,
-            metrics: training_model.evaluate,
             best_params: best_params,
             tuner_job_id: tuner&.id,
             metadata: tuner&.metadata,
@@ -109,6 +108,7 @@ module EasyML
         end
         true
       rescue => e
+        puts EasyML::Event.easy_ml_context(e.backtrace)
         EasyML::Event.handle_error(self, e)
         update!(
           status: "failed",
@@ -150,14 +150,15 @@ module EasyML
       training_model.dataset.refresh
       evaluator = retraining_job.evaluator.symbolize_keys
-      x_true, y_true = training_model.dataset.test(split_ys: true)
-      y_pred = training_model.predict(x_true)
+      x_test, y_test = training_model.dataset.test(split_ys: true)
+      y_pred = training_model.predict(x_test)
       metric = evaluator[:metric].to_sym
       metrics = EasyML::Core::ModelEvaluator.evaluate(
         model: training_model,
         y_pred: y_pred,
-        y_true: y_true,
+        y_true: y_test,
+        x_true: x_test,
         dataset: training_model.dataset.test(all_columns: true),
         evaluator: evaluator,
       )
@@ -176,6 +177,7 @@ module EasyML
       {
         metric_value: metric_value,
+        metrics: metrics,
         threshold: threshold,
         threshold_direction: threshold_direction,
         deployable: deployable,

data/app/models/easy_ml/splitter.rb CHANGED Viewed

@@ -75,6 +75,14 @@ module EasyML
       }
     end
+    def to_config
+      EasyML::Export::Splitter.to_config(self)
+    end
+    def self.from_config(config, dataset)
+      EasyML::Import::Splitter.from_config(config, dataset)
+    end
     def split(df, &block)
       adapter.split(df, &block)
     end

data/app/models/lineage_history.rb ADDED Viewed

@@ -0,0 +1,6 @@
+module EasyML
+  class LineageHistory < ActiveRecord::Base
+    self.table_name = "easy_ml_lineage_histories"
+    include Historiographer::History
+  end
+end

data/app/serializers/easy_ml/column_serializer.rb CHANGED Viewed

@@ -28,10 +28,16 @@ module EasyML
     attributes :id, :name, :description, :dataset_id, :datatype, :polars_datatype, :preprocessing_steps,
                :hidden, :drop_if_null, :sample_values, :statistics, :is_target,
-               :is_computed, :computed_by, :lineage
+               :is_computed, :computed_by
     attribute :required do |object|
       object.required?
     end
+    attribute :lineage do |column|
+      column.lineages.map do |lineage|
+        LineageSerializer.new(lineage).serializable_hash.dig(:data, :attributes)
+      end
+    end
   end
 end

data/app/serializers/easy_ml/dataset_serializer.rb CHANGED Viewed

@@ -59,7 +59,8 @@ module EasyML
     end
     attribute :columns do |dataset|
-      dataset.columns.order(:id).map do |column|
+      col_order = dataset.col_order
+      dataset.columns.sort_by { |c| col_order.index(c.name) || Float::INFINITY }.map do |column|
         ColumnSerializer.new(column).serializable_hash.dig(:data, :attributes)
       end
     end

data/app/serializers/easy_ml/lineage_serializer.rb ADDED Viewed

@@ -0,0 +1,9 @@
+require "jsonapi/serializer"
+module EasyML
+  class LineageSerializer
+    include JSONAPI::Serializer
+    attributes :id, :key, :description, :occurred_at
+  end
+end

data/config/routes.rb CHANGED Viewed

@@ -17,10 +17,16 @@ EasyML::Engine.routes.draw do
   resources :models, as: :easy_ml_models do
     member do
       post :train
+      post :abort
+      get :download
+      post :upload
       get :retraining_runs, to: "retraining_runs#index"
     end
+    collection do
+      get "new", as: "new"
+      post :upload
+    end
     resources :deploys, only: [:create]
-    get "new", on: :collection, as: "new"
   end
   resources :retraining_runs, only: [:show]
@@ -29,6 +35,7 @@ EasyML::Engine.routes.draw do
   resources :datasources, as: :easy_ml_datasources do
     member do
       post :sync
+      post :abort
     end
   end
@@ -36,6 +43,12 @@ EasyML::Engine.routes.draw do
   resources :datasets, as: :easy_ml_datasets do
     member do
       post :refresh
+      post :abort
+      get :download
+      post :upload
+    end
+    collection do
+      post :upload
     end
   end

data/lib/easy_ml/core/tuner/adapters/base_adapter.rb CHANGED Viewed

@@ -4,7 +4,7 @@ module EasyML
       module Adapters
         class BaseAdapter
           attr_accessor :config, :project_name, :tune_started_at, :model,
-                        :x_true, :y_true, :metadata, :model
+                        :x_valid, :y_valid, :metadata, :model
           def initialize(options = {})
             @model = options[:model]
@@ -12,8 +12,8 @@ module EasyML
             @project_name = options[:project_name]
             @tune_started_at = options[:tune_started_at]
             @model = options[:model]
-            @x_true = options[:x_true]
-            @y_true = options[:y_true]
+            @x_valid = options[:x_valid]
+            @y_valid = options[:y_valid]
             @metadata = options[:metadata] || {}
           end

data/lib/easy_ml/core/tuner.rb CHANGED Viewed

@@ -6,7 +6,7 @@ module EasyML
     class Tuner
       attr_accessor :model, :dataset, :project_name, :task, :config,
                     :metrics, :objective, :n_trials, :direction, :evaluator,
-                    :study, :results, :adapter, :tune_started_at, :x_true, :y_true,
+                    :study, :results, :adapter, :tune_started_at, :x_valid, :y_valid,
                     :project_name, :job, :current_run, :trial_enumerator, :progress_block,
                     :tuner_job, :dataset
@@ -34,7 +34,7 @@ module EasyML
             config: config,
             project_name: project_name,
             tune_started_at: nil,  # This will be set during tune
-            y_true: nil, # This will be set during tune
+            y_valid: nil, # This will be set during tune
           )
         end
       end
@@ -70,17 +70,16 @@ module EasyML
         @job = tuner_job
         @study = Optuna::Study.new(direction: direction)
         @results = []
-        model.evaluator = evaluator if evaluator.present?
         model.task = task
-        model.dataset.refresh
-        x_true, y_true = model.dataset.test(split_ys: true)
-        self.x_true = x_true
-        self.y_true = y_true
-        self.dataset = model.dataset.test(all_columns: true)
+        model.dataset.refresh if model.dataset.needs_refresh?
+        x_valid, y_valid = model.dataset.valid(split_ys: true, select: model.dataset.col_order)
+        self.x_valid = x_valid
+        self.y_valid = y_valid
+        self.dataset = model.dataset.valid(all_columns: true)
         adapter.tune_started_at = tune_started_at
-        adapter.y_true = y_true
-        adapter.x_true = x_true
+        adapter.x_valid = x_valid
+        adapter.y_valid = y_valid
         model.prepare_data unless model.batch_mode
         model.prepare_callbacks(self)
@@ -99,6 +98,7 @@ module EasyML
             @results.push(result)
             @study.tell(@current_trial, result)
           rescue StandardError => e
+            puts EasyML::Event.easy_ml_context(e.backtrace)
             @tuner_run.update!(status: :failed, hyperparameters: {})
             puts "Optuna failed with: #{e.message}"
             raise e
@@ -118,6 +118,7 @@ module EasyML
         best_run&.hyperparameters
       rescue StandardError => e
+        puts EasyML::Event.easy_ml_context(e.backtrace)
         tuner_job&.update!(status: :failed, completed_at: Time.current)
         raise e
       end
@@ -137,9 +138,9 @@ module EasyML
           end
         end
-        y_pred = model.predict(x_true)
+        y_pred = model.predict(x_valid)
         model.metrics = metrics
-        metrics = model.evaluate(y_pred: y_pred, y_true: y_true, x_true: x_true, dataset: dataset)
+        metrics = model.evaluate(y_pred: y_pred, y_true: y_valid, x_true: x_valid, dataset: dataset)
         metric = metrics.symbolize_keys.dig(model.evaluator[:metric].to_sym)
         puts metrics

data/lib/easy_ml/data/polars_column.rb CHANGED Viewed

@@ -2,7 +2,7 @@ require_relative "date_converter"
 module EasyML
   module Data
-    module PolarsColumn
+    class PolarsColumn
       TYPE_MAP = {
         float: Polars::Float64,
         integer: Polars::Int64,
@@ -14,132 +14,181 @@ module EasyML
         categorical: Polars::Categorical,
         null: Polars::Null,
       }
-      POLARS_MAP = TYPE_MAP.invert.stringify_keys
+      POLARS_MAP = {
+        Polars::Float64 => :float,
+        Polars::Int64 => :integer,
+        Polars::Float32 => :float,
+        Polars::Int32 => :integer,
+        Polars::Boolean => :boolean,
+        Polars::Datetime => :datetime,
+        Polars::Date => :date,
+        Polars::String => :string,
+        Polars::Categorical => :categorical,
+        Polars::Null => :null,
+      }.stringify_keys
+      include EasyML::Timing
       class << self
         def polars_to_sym(polars_type)
-          POLARS_MAP.dig(polars_type.class.to_s)
+          new.polars_to_sym(polars_type)
+        end
+        def determine_type(series, polars_type = false)
+          new.determine_type(series, polars_type)
         end
         def parse_polars_dtype(dtype_string)
-          case dtype_string
-          when /^Polars::Datetime/
-            time_unit = dtype_string[/time_unit: "(.*?)"/, 1]
-            time_zone = dtype_string[/time_zone: (.*)?\)/, 1]
-            time_zone = time_zone == "nil" ? nil : time_zone&.delete('"')
-            Polars::Datetime.new(time_unit, time_zone)
-          when /^Polars::/
-            Polars.const_get(dtype_string.split("::").last)
-          else
-            raise ArgumentError, "Unknown Polars data type: #{dtype_string}"
-          end
+          new.parse_polars_dtype(dtype_string)
+        end
+        def get_polars_type(dtype)
+          new.get_polars_type(dtype)
+        end
+        def polars_dtype_to_sym(dtype_string)
+          new.polars_dtype_to_sym(dtype_string)
         end
         def sym_to_polars(symbol)
-          TYPE_MAP.dig(symbol)
+          new.sym_to_polars(symbol)
         end
+      end
-        # Determines the semantic type of a field based on its data
-        # @param series [Polars::Series] The series to analyze
-        # @return [Symbol] One of :numeric, :datetime, :categorical, or :text
-        def determine_type(series, polars_type = false)
-          dtype = series.dtype
-          if dtype.is_a?(Polars::Utf8)
-            string_type = determine_string_type(series)
-            if string_type == :datetime
-              date = EasyML::Data::DateConverter.maybe_convert_date(series)
-              return polars_type ? date[date.columns.first].dtype : :datetime
-            end
-          end
+      def polars_to_sym(polars_type)
+        return nil if polars_type.nil?
+        if polars_type.is_a?(Polars::DataType)
+          POLARS_MAP.dig(polars_type.class.to_s)
+        else
+          polars_type.to_sym if TYPE_MAP.keys.include?(polars_type.to_sym)
+        end
+      end
-          type_name = case dtype
-            when Polars::Float64
-              :float
-            when Polars::Int64
-              :integer
-            when Polars::Datetime
-              :datetime
-            when Polars::Date
-              :date
-            when Polars::Boolean
-              :boolean
-            when Polars::Utf8
-              determine_string_type(series)
-            when Polars::Null
-              :null
-            else
-              :categorical
-            end
-          polars_type ? sym_to_polars(type_name) : type_name
+      def parse_polars_dtype(dtype_string)
+        case dtype_string
+        when /^Polars::Datetime/
+          time_unit = dtype_string[/time_unit: "(.*?)"/, 1]
+          time_zone = dtype_string[/time_zone: (.*)?\)/, 1]
+          time_zone = time_zone == "nil" ? nil : time_zone&.delete('"')
+          Polars::Datetime.new(time_unit, time_zone)
+        when /^Polars::/
+          Polars.const_get(dtype_string.split("::").last)
+        else
+          raise ArgumentError, "Unknown Polars data type: #{dtype_string}"
         end
+      end
-        # Determines if a string field is a date, text, or categorical
-        # @param series [Polars::Series] The string series to analyze
-        # @return [Symbol] One of :datetime, :text, or :categorical
-        def determine_string_type(series)
-          if EasyML::Data::DateConverter.maybe_convert_date(Polars::DataFrame.new({ temp: series }),
-                                                            :temp)[:temp].dtype.is_a?(Polars::Datetime)
+      def sym_to_polars(symbol)
+        TYPE_MAP.dig(symbol.to_sym)
+      end
+      # Determines the semantic type of a field based on its data
+      # @param series [Polars::Series] The series to analyze
+      # @return [Symbol] One of :numeric, :datetime, :categorical, or :text
+      def determine_type(series, polars_type = false)
+        dtype = series.dtype
+        if dtype.is_a?(Polars::Utf8)
+          string_type = determine_string_type(series)
+          if string_type == :datetime
+            date = EasyML::Data::DateConverter.maybe_convert_date(series)
+            return polars_type ? date[date.columns.first].dtype : :datetime
+          end
+        end
+        type_name = case dtype
+          when Polars::Float64
+            :float
+          when Polars::Int64
+            :integer
+          when Polars::Datetime
             :datetime
+          when Polars::Date
+            :date
+          when Polars::Boolean
+            :boolean
+          when Polars::Utf8
+            determine_string_type(series)
+          when Polars::Null
+            :null
           else
-            categorical_or_text?(series)
+            :categorical
           end
+        polars_type ? sym_to_polars(type_name) : type_name
+      end
+      measure_method_timing :determine_type
+      # Determines if a string field is a date, text, or categorical
+      # @param series [Polars::Series] The string series to analyze
+      # @return [Symbol] One of :datetime, :text, or :categorical
+      def determine_string_type(series)
+        if EasyML::Data::DateConverter.maybe_convert_date(Polars::DataFrame.new({ temp: series }),
+                                                          :temp)[:temp].dtype.is_a?(Polars::Datetime)
+          :datetime
+        else
+          categorical_or_text?(series)
         end
+      end
-        # Determines if a string field is categorical or free text
-        # @param series [Polars::Series] The string series to analyze
-        # @return [Symbol] Either :categorical or :text
-        def categorical_or_text?(series)
-          return :categorical if series.null_count == series.len
+      measure_method_timing :determine_string_type
-          # Get non-null count for percentage calculations
-          non_null_count = series.len - series.null_count
-          return :categorical if non_null_count == 0
+      # Determines if a string field is categorical or free text
+      # @param series [Polars::Series] The string series to analyze
+      # @return [Symbol] Either :categorical or :text
+      def categorical_or_text?(series)
+        return :categorical if series.null_count == series.len
-          # Get value counts as percentages
-          value_counts = series.value_counts(parallel: true)
-          percentages = value_counts.with_column(
-            (value_counts["count"] / non_null_count.to_f * 100).alias("percentage")
-          )
+        # Get non-null count for percentage calculations
+        non_null_count = series.len - series.null_count
+        return :categorical if non_null_count == 0
-          # Check if any category represents more than 10% of the data
-          max_percentage = percentages["percentage"].max
-          return :text if max_percentage < 10.0
+        # Get value counts as percentages
+        value_counts = series.value_counts(parallel: true)
+        percentages = value_counts.with_column(
+          (value_counts["count"] / non_null_count.to_f * 100).alias("percentage")
+        )
-          # Calculate average percentage per category
-          avg_percentage = 100.0 / series.n_unique
+        # Check if any category represents more than 10% of the data
+        max_percentage = percentages["percentage"].max
+        return :text if max_percentage < 10.0
-          # If average category represents less than 1% of data, it's likely text
-          avg_percentage < 1.0 ? :text : :categorical
-        end
+        # Calculate average percentage per category
+        avg_percentage = 100.0 / series.n_unique
-        # Returns whether the field type is numeric
-        # @param field_type [Symbol] The field type to check
-        # @return [Boolean]
-        def numeric?(field_type)
-          field_type == :numeric
-        end
+        # If average category represents less than 1% of data, it's likely text
+        avg_percentage < 1.0 ? :text : :categorical
+      end
-        # Returns whether the field type is categorical
-        # @param field_type [Symbol] The field type to check
-        # @return [Boolean]
-        def categorical?(field_type)
-          field_type == :categorical
-        end
+      measure_method_timing :categorical_or_text?
-        # Returns whether the field type is datetime
-        # @param field_type [Symbol] The field type to check
-        # @return [Boolean]
-        def datetime?(field_type)
-          field_type == :datetime
-        end
+      # Returns whether the field type is numeric
+      # @param field_type [Symbol] The field type to check
+      # @return [Boolean]
+      def numeric?(field_type)
+        field_type == :numeric
+      end
-        # Returns whether the field type is text
-        # @param field_type [Symbol] The field type to check
-        # @return [Boolean]
-        def text?(field_type)
-          field_type == :text
-        end
+      # Returns whether the field type is categorical
+      # @param field_type [Symbol] The field type to check
+      # @return [Boolean]
+      def categorical?(field_type)
+        field_type == :categorical
+      end
+      # Returns whether the field type is datetime
+      # @param field_type [Symbol] The field type to check
+      # @return [Boolean]
+      def datetime?(field_type)
+        field_type == :datetime
+      end
+      # Returns whether the field type is text
+      # @param field_type [Symbol] The field type to check
+      # @return [Boolean]
+      def text?(field_type)
+        field_type == :text
       end
     end
   end