RubyGems - easy_ml - Versions diffs - 0.2.0.pre.rc78 → 0.2.0.pre.rc82 - Mend

easy_ml 0.2.0.pre.rc78 → 0.2.0.pre.rc82

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (26)

checksums.yaml +4 -4
data/app/controllers/easy_ml/datasets_controller.rb +3 -3
data/app/controllers/easy_ml/models_controller.rb +2 -2
data/app/jobs/easy_ml/training_job.rb +2 -2
data/app/models/easy_ml/column/imputers/imputer.rb +2 -0
data/app/models/easy_ml/column_list.rb +2 -3
data/app/models/easy_ml/dataset.rb +22 -11
data/app/models/easy_ml/feature.rb +27 -7
data/app/models/easy_ml/model.rb +11 -6
data/app/models/easy_ml/models/hyperparameters/xgboost/gbtree.rb +21 -5
data/app/models/easy_ml/models/xgboost/evals_callback.rb +7 -4
data/app/models/easy_ml/models/xgboost.rb +18 -16
data/lib/easy_ml/core/tuner/adapters/base_adapter.rb +16 -3
data/lib/easy_ml/core/tuner/adapters/xgboost_adapter.rb +0 -17
data/lib/easy_ml/core/tuner.rb +9 -3
data/lib/easy_ml/data/dataset_manager/reader/base.rb +12 -0
data/lib/easy_ml/data/dataset_manager/reader/data_frame.rb +8 -3
data/lib/easy_ml/data/dataset_manager/reader/file.rb +5 -0
data/lib/easy_ml/data/dataset_manager/reader.rb +7 -1
data/lib/easy_ml/data/dataset_manager/writer/base.rb +1 -1
data/lib/easy_ml/data/dataset_manager.rb +10 -2
data/lib/easy_ml/data/embeddings/adapters.rb +56 -0
data/lib/easy_ml/data/embeddings/compression.rb +1 -0
data/lib/easy_ml/data/embeddings.rb +43 -0
data/lib/easy_ml/version.rb +1 -1
metadata +5 -2

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 13858267adb9445f665a01214f2109bc23dd63a76d5ab0ae502c60ac94a6d2d4
-  data.tar.gz: bc1b37afabf4757ce1e7e311699d6e8ac0bea2230025d8e696ada4071b0b3563
+  metadata.gz: ce245d6900c4c5c001c0de9982894ccf6b41faef31e8c958dc540ef05fe426e4
+  data.tar.gz: a120f14076a9ff83ca6afb8b0bd651b9ea9ed0185d42f23b82f5de6b2a4de831
 SHA512:
-  metadata.gz: ccd5fc9e0b9529da07012a1745f826cf8e88391b24e3df20ba636c9e6ccf853172d18916cccc3087692873971a9dd2b72aa7151e286824df5cb500255610d603
-  data.tar.gz: 6034abbae5e25a00f204a649c62b568a90a76481c6ff91aaadd766fe515fe76dbf6692bebabe905c5a4bc1b9642717c77f4cbfda6b43684624a5e32517f73d99
+  metadata.gz: db2b7292bf07b5122a7949a111c56a25f8848496eaadfc94f101a271054fd4c21d401b4c941f69d7579c8cdc9887e68c4105f11720dba13aa7fd0fcdafb79b81
+  data.tar.gz: f5dc8ffee52fb67b6cda8e4462e540900d528dd9c15ff49faa8ca45f21f9e4968129b996a5018a91801aae431a65b5274d64e69045e5feeeec6f12f09900cda3

data/app/controllers/easy_ml/datasets_controller.rb CHANGED Viewed

@@ -23,7 +23,7 @@
 module EasyML
   class DatasetsController < ApplicationController
     def index
-      datasets = Dataset.all.order(id: :desc)
+      datasets = Dataset.all.includes(:columns, :datasource).order(id: :desc)
       render inertia: "pages/DatasetsPage", props: {
         datasets: datasets.map { |dataset| dataset_to_json_small(dataset) },
@@ -80,7 +80,7 @@ module EasyML
       if dataset_params[:features_attributes].present?
         # Clean up any feature IDs that don't exist anymore
         feature_ids = dataset_params[:features_attributes].map { |attrs| attrs[:id] }.compact
-        existing_feature_ids = Feature.where(id: feature_ids).pluck(:id)
+        existing_feature_ids = dataset.features.where(id: feature_ids).pluck(:id)
         params[:dataset][:features_attributes].each do |attrs|
           if attrs[:id].present? && !existing_feature_ids.include?(attrs[:id].to_i)
@@ -93,7 +93,7 @@ module EasyML
           attrs[:feature_class] if attrs[:id].blank?
         }.compact
-        existing_features = Feature.where(feature_class: feature_classes)
+        existing_features = dataset.features.where(feature_class: feature_classes)
         # Update params with existing feature IDs
         existing_features.each do |feature|

data/app/controllers/easy_ml/models_controller.rb CHANGED Viewed

@@ -41,7 +41,7 @@ module EasyML
       render inertia: "pages/EditModelPage", props: {
         model: model_to_json(model),
         datasets: EasyML::Dataset.all.map do |dataset|
-          dataset_to_json(dataset)
+          dataset_to_json_small(dataset)
         end,
         constants: EasyML::Model.constants,
       }
@@ -167,7 +167,7 @@ module EasyML
     private
     def includes_list
-      [:retraining_runs, :retraining_job, dataset: [:columns, :features, :splitter]]
+      [:retraining_runs, :retraining_job, dataset: [:features, :splitter, columns: [:lineages]]]
     end
     def model_params

data/app/jobs/easy_ml/training_job.rb CHANGED Viewed

@@ -10,13 +10,13 @@ module EasyML
       @last_activity = Time.current
       setup_signal_traps
-      @monitor_thread = start_monitor_thread
+      # @monitor_thread = start_monitor_thread
       @model.actually_train do |iteration_info|
         @last_activity = Time.current
       end
     ensure
-      @monitor_thread&.exit
+      # @monitor_thread&.exit
       @model.unlock!
     end

data/app/models/easy_ml/column/imputers/imputer.rb CHANGED Viewed

@@ -54,6 +54,8 @@ module EasyML
           return df unless anything?
           adapters.reduce(df) do |df, adapter|
+            next df if df.columns.exclude?(column.name)
             adapter.transform(df)
           end
         end

data/app/models/easy_ml/column_list.rb CHANGED Viewed

@@ -28,12 +28,11 @@ module EasyML
       if computed
         cols = column_list.computed
       else
-        cols = column_list.raw
+        cols = column_list
       end
       by_name = cols.index_by(&:name)
-      df.columns.each do |col|
-        column = by_name[col]
+      cols.each do |column|
         df = column.transform(df, inference: inference, computed: computed) if column
       end

data/app/models/easy_ml/dataset.rb CHANGED Viewed

@@ -232,20 +232,20 @@ module EasyML
       cleanup
       refresh_datasource!
       split_data
-      process_data
+      fit
     end
     def prepare
       prepare_features
       refresh_datasource
       split_data
-      process_data
+      fit
     end
     def actually_refresh
       refreshing do
-        learn(delete: false) # After syncing datasource, learn new statistics + sync columns
-        process_data
+        fit
+        normalize_all
         fully_reload
         learn
         learn_statistics(type: :processed) # After processing data, we learn any new statistics
@@ -385,6 +385,7 @@ module EasyML
     def unlock!
       Support::Lockable.unlock!(lock_key)
       features.each(&:unlock!)
+      true
     end
     def locked?
@@ -427,12 +428,6 @@ module EasyML
       (read_attribute(:statistics) || {}).with_indifferent_access
     end
-    def process_data
-      learn(delete: false)
-      fit
-      normalize_all
-    end
     def needs_learn?
       return true if columns_need_refresh?
@@ -483,7 +478,7 @@ module EasyML
       df = apply_missing_columns(df, inference: inference)
       df = columns.transform(df, inference: inference)
       df = apply_features(df, features)
-      df = columns.transform(df, inference: inference, computed: true)
+      df = columns.transform(df, inference: inference)
       df = apply_column_mask(df, inference: inference) unless all_columns
       df = drop_nulls(df) unless inference
       df, = processed.split_features_targets(df, true, target) if split_ys
@@ -722,6 +717,20 @@ module EasyML
       reload
     end
+    def list_nulls(input = nil, list_raw = false)
+      input = data(lazy: true) if input.nil?
+      case input
+      when Polars::DataFrame
+        input = input.lazy
+      when String, Symbol
+        input = input.to_sym
+        input = send(input).data(lazy: true)
+      end
+      col_list = EasyML::Data::DatasetManager.list_nulls(input)
+      list_raw ? col_list : regular_columns(col_list)
+    end
     private
     def apply_date_splitter_config
@@ -798,6 +807,7 @@ module EasyML
         processed_df = normalize(df, all_columns: true)
         processed.save(segment, processed_df)
       end
+      features.select { |f| !f.fittable? }.each(&:after_transform)
       @normalized = true
     end
@@ -840,6 +850,7 @@ module EasyML
     end
     def fit
+      learn(delete: false)
       learn_statistics(type: :raw)
     end

data/app/models/easy_ml/feature.rb CHANGED Viewed

@@ -78,11 +78,21 @@ module EasyML
     scope :never_applied, -> { where(applied_at: nil) }
     scope :never_fit, -> do
             fittable = where(fit_at: nil)
-            fittable = fittable.select { |f| f.adapter.respond_to?(:fit) }
+            fittable = fittable.select(&:fittable?)
             where(id: fittable.map(&:id))
           end
-    scope :needs_fit, -> { has_changes.or(never_applied).or(never_fit) }
-    scope :ready_to_apply, -> { where(needs_fit: false).where.not(id: has_changes.map(&:id)) }
+    scope :needs_fit, -> { has_changes.or(never_applied).or(never_fit).or(datasource_was_refreshed) }
+    scope :datasource_was_refreshed, -> do
+            where(id: all.select(&:datasource_was_refreshed?).map(&:id))
+          end
+    scope :ready_to_apply, -> do
+            base = where(needs_fit: false).where.not(id: has_changes.map(&:id))
+            doesnt_fit = where_no_fit
+            where(id: base.map(&:id).concat(doesnt_fit.map(&:id)))
+          end
+    scope :fittable, -> { all.select(&:fittable?) }
+    scope :where_no_fit, -> { all.reject(&:fittable?) }
     before_save :apply_defaults, if: :new_record?
     before_save :update_sha
@@ -100,6 +110,10 @@ module EasyML
       feature_klass.present?
     end
+    def fittable?
+      adapter.respond_to?(:fit)
+    end
     def adapter
       @adapter ||= feature_klass.new
     end
@@ -133,6 +147,7 @@ module EasyML
     end
     def datasource_was_refreshed?
+      return false unless fittable?
       return true if fit_at.nil?
       return false if dataset.datasource.refreshed_at.nil?
@@ -213,13 +228,14 @@ module EasyML
     end
     def wipe
+      update(needs_fit: true) if fittable?
       feature_store.wipe
     end
     def fit(features: [self], async: false)
       ordered_features = features.sort_by(&:feature_position)
       parent_batch_id = Random.uuid
-      jobs = ordered_features.map do |feature|
+      jobs = ordered_features.select(&:fittable?).map do |feature|
         feature.build_batches.map do |batch_args|
           batch_args.merge(parent_batch_id: parent_batch_id)
         end
@@ -450,7 +466,7 @@ module EasyML
     def after_fit
       update_sha
-      feature_store.compact
+      feature_store.compact if fittable?
       updates = {
         fit_at: Time.current,
         needs_fit: false,
@@ -459,6 +475,10 @@ module EasyML
       update!(updates)
     end
+    def after_transform
+      feature_store.compact if !fittable?
+    end
     def unlock!
       feature_store.unlock!
     end
@@ -517,14 +537,14 @@ module EasyML
       new_sha = compute_sha
       if new_sha != self.sha
         self.sha = new_sha
-        self.needs_fit = true
+        self.needs_fit = fittable?
       end
     end
     def update_from_feature_class
       if read_attribute(:batch_size) != config.dig(:batch_size)
         write_attribute(:batch_size, config.dig(:batch_size))
-        self.needs_fit = true
+        self.needs_fit = fittable?
       end
       if self.primary_key != config.dig(:primary_key)

data/app/models/easy_ml/model.rb CHANGED Viewed

@@ -179,17 +179,18 @@ module EasyML
     end
     def actually_train(&progress_block)
-      raise untrainable_error unless trainable?
       lock_model do
         run = pending_run
         run.wrap_training do
+          raise untrainable_error unless trainable?
           best_params = nil
           if run.should_tune?
             best_params = hyperparameter_search(&progress_block)
+          else
+            fit(&progress_block)
+            save
           end
-          fit(&progress_block)
-          save
           [self, best_params]
         end
         update(is_training: false)
@@ -393,6 +394,10 @@ module EasyML
       adapter.after_tuning
     end
+    def cleanup
+      adapter.cleanup
+    end
     def fit_in_batches(tuning: false, batch_size: nil, batch_overlap: nil, batch_key: nil, checkpoint_dir: Rails.root.join("tmp", "xgboost_checkpoints"), &progress_block)
       adapter.fit_in_batches(tuning: tuning, batch_size: batch_size, batch_overlap: batch_overlap, batch_key: batch_key, checkpoint_dir: checkpoint_dir, &progress_block)
     end
@@ -619,8 +624,8 @@ module EasyML
     private
     def default_evaluation_inputs
-      x_true, y_true = dataset.test(split_ys: true)
-      ds = dataset.test(all_columns: true)
+      x_true, y_true = dataset.processed.test(split_ys: true, all_columns: true)
+      ds = dataset.processed.test(all_columns: true)
       y_pred = predict(x_true)
       {
         x_true: x_true,

data/app/models/easy_ml/models/hyperparameters/xgboost/gbtree.rb CHANGED Viewed

@@ -37,6 +37,20 @@ module EasyML
                 max: 10,
                 step: 0.1,
               },
+              scale_pos_weight: {
+                label: "Scale Pos Weight",
+                description: "Balance of positive and negative weights",
+                min: 0,
+                max: 200,
+                step: 1,
+              },
+              max_delta_step: {
+                label: "Max Delta Step",
+                description: "Maximum delta step",
+                min: 0,
+                max: 10,
+                step: 1,
+              },
               gamma: {
                 label: "Gamma",
                 description: "Minimum loss reduction required to make a further partition",
@@ -81,11 +95,13 @@ module EasyML
                     label: "Histogram",
                     description: "Fast histogram optimized approximate greedy algorithm",
                   },
-                  {
-                    value: "gpu_hist",
-                    label: "GPU Histogram",
-                    description: "GPU implementation of hist algorithm",
-                  },
+                # Only when compiled with GPU support...
+                # How to make this not a default option
+                # {
+                #   value: "gpu_hist",
+                #   label: "GPU Histogram",
+                #   description: "GPU implementation of hist algorithm",
+                # },
                 ],
               },
             )

data/app/models/easy_ml/models/xgboost/evals_callback.rb CHANGED Viewed

@@ -50,7 +50,7 @@ module EasyML
             x_valid = x_valid.select(model.dataset.col_order(inference: true))
             @preprocessed ||= model.preprocess(x_valid, y_valid)
             y_pred = model.predict(@preprocessed)
-            dataset = model.dataset.valid(all_columns: true)
+            dataset = model.dataset.processed.valid(all_columns: true)
             metrics = model.evaluate(y_pred: y_pred, y_true: y_valid, x_true: x_valid, dataset: dataset)
             Wandb.log(metrics)
@@ -103,7 +103,7 @@ module EasyML
           model.callbacks.detect { |cb| cb.class == Wandb::XGBoostCallback }
         end
-        def track_cumulative_feature_importance(finish = true)
+        def track_cumulative_feature_importance
           return unless @feature_importances
           project_name = model.adapter.get_wandb_project
@@ -127,13 +127,16 @@ module EasyML
             "feature_importance" => bar_plot.__pyptr__,
           }
           Wandb.log(log_data)
-          model.adapter.delete_wandb_project if finish
-          Wandb.finish if finish
         end
         def after_tuning
           track_cumulative_feature_importance
         end
+        def cleanup
+          model.adapter.delete_wandb_project
+          Wandb.finish
+        end
       end
     end
   end

data/app/models/easy_ml/models/xgboost.rb CHANGED Viewed

@@ -135,6 +135,12 @@ module EasyML
         end
       end
+      def cleanup
+        model.callbacks.each do |callback|
+          callback.cleanup if callback.respond_to?(:cleanup)
+        end
+      end
       def prepare_callbacks(tuner)
         set_wandb_project(tuner.project_name)
@@ -421,11 +427,11 @@ module EasyML
       def prepare_data
         if @d_train.nil?
           col_order = dataset.col_order
-          x_sample, y_sample = dataset.train(split_ys: true, limit: 5, select: col_order, lazy: true)
+          x_sample, y_sample = dataset.processed.train(split_ys: true, limit: 5, select: col_order, lazy: true)
           preprocess(x_sample, y_sample) # Ensure we fail fast if the dataset is misconfigured
-          x_train, y_train = dataset.train(split_ys: true, select: col_order, lazy: true)
-          x_valid, y_valid = dataset.valid(split_ys: true, select: col_order, lazy: true)
-          x_test, y_test = dataset.test(split_ys: true, select: col_order, lazy: true)
+          x_train, y_train = dataset.processed.train(split_ys: true, select: col_order, lazy: true)
+          x_valid, y_valid = dataset.processed.valid(split_ys: true, select: col_order, lazy: true)
+          x_test, y_test = dataset.processed.test(split_ys: true, select: col_order, lazy: true)
           @d_train = preprocess(x_train, y_train)
           @d_valid = preprocess(x_valid, y_valid)
           @d_test = preprocess(x_test, y_test)
@@ -439,22 +445,19 @@ module EasyML
       end
       def untrainable_columns
-        df = model.dataset.processed.data(lazy: true)
+        model.dataset.refresh if model.dataset.processed.nil?
-        columns = df.columns
-        selects = columns.map do |col|
-          Polars.col(col).null_count.alias(col)
-        end
-        null_info = df.select(selects).collect
-        null_info.to_hashes.first.compact
-        col_list = null_info.to_hashes.first.transform_values { |v| v > 0 ? v : nil }.compact.keys
-        model.dataset.regular_columns(col_list)
+        model.dataset.list_nulls(
+          model.dataset.processed.data(lazy: true)
+        )
       end
       def preprocess(xs, ys = nil)
         return xs if xs.is_a?(::XGBoost::DMatrix)
-        weights_col = model.weights_column || nil
+        lazy = xs.is_a?(Polars::LazyFrame)
+        return xs if (lazy ? xs.limit(1).collect : xs).shape[0] == 0
+        weights_col = (model.weights_column.nil? || model.weights_column.blank?) ? nil : model.weights_column
         if weights_col == model.dataset.target
           raise ArgumentError, "Weight column cannot be the target column"
@@ -463,7 +466,6 @@ module EasyML
         # Extract feature columns (all columns except label and weight)
         feature_cols = xs.columns
         feature_cols -= [weights_col] if weights_col
-        lazy = xs.is_a?(Polars::LazyFrame)
         # Get features, labels and weights
         features = lazy ? xs.select(feature_cols).collect.to_numo : xs.select(feature_cols).to_numo

data/lib/easy_ml/core/tuner/adapters/base_adapter.rb CHANGED Viewed

@@ -18,12 +18,22 @@ module EasyML
           end
           def defaults
-            {}
+            model.adapter.hyperparameters.class.hyperparameter_constants.transform_values do |constant|
+              values = constant.slice(:min, :max, :step, :options)
+              if values.key?(:options)
+                values[:options] = values[:options].map { |option| option[:value] }
+              end
+              values
+            end
           end
           def run_trial(trial)
             config = deep_merge_defaults(self.config.clone.deep_symbolize_keys)
-            suggest_parameters(trial, config)
+            # For first trial, re-use the original hyperparameters, so they
+            # serve as our starting point/imputers
+            unless trial == 1
+              suggest_parameters(trial, config)
+            end
             yield model
           end
@@ -57,8 +67,11 @@ module EasyML
             min = param_config[:min]
             max = param_config[:max]
             log = param_config[:log]
+            options = param_config[:options]
-            if log
+            if options
+              trial.suggest_categorical(param_name.to_s, options)
+            elsif log
               trial.suggest_loguniform(param_name.to_s, min, max)
             elsif max.is_a?(Integer) && min.is_a?(Integer)
               trial.suggest_int(param_name.to_s, min, max)

data/lib/easy_ml/core/tuner/adapters/xgboost_adapter.rb CHANGED Viewed

@@ -5,23 +5,6 @@ module EasyML
     class Tuner
       module Adapters
         class XGBoostAdapter < BaseAdapter
-          def defaults
-            {
-              learning_rate: {
-                min: 0.001,
-                max: 0.1,
-                log: true,
-              },
-              n_estimators: {
-                min: 100,
-                max: 1_000,
-              },
-              max_depth: {
-                min: 2,
-                max: 20,
-              },
-            }
-          end
         end
       end
     end

data/lib/easy_ml/core/tuner.rb CHANGED Viewed

@@ -73,13 +73,13 @@ module EasyML
         model.task = task
         model.dataset.refresh if model.dataset.needs_refresh?
-        x_valid, y_valid = model.dataset.valid(split_ys: true, all_columns: true)
+        x_valid, y_valid = model.dataset.processed.valid(split_ys: true, all_columns: true)
         x_normalized = model.dataset.normalize(x_valid, inference: true)
         x_normalized = model.preprocess(x_normalized)
         self.x_valid = x_valid
         self.y_valid = y_valid
         self.x_normalized = x_normalized
-        self.dataset = model.dataset.valid(all_columns: true)
+        self.dataset = model.dataset.processed.valid(all_columns: true)
         adapter.tune_started_at = tune_started_at
         adapter.x_valid = x_valid
         adapter.y_valid = y_valid
@@ -108,7 +108,6 @@ module EasyML
           end
         end
-        model.after_tuning
         return nil if tuner_job.tuner_runs.all?(&:failed?)
         best_run = tuner_job.best_run
@@ -118,6 +117,13 @@ module EasyML
           status: :success,
           completed_at: Time.current,
         )
+        model.after_tuning
+        if best_run&.hyperparameters.present?
+          model.hyperparameters = best_run.hyperparameters
+          model.fit
+          model.save
+        end
+        model.cleanup
         best_run&.hyperparameters
       rescue StandardError => e

data/lib/easy_ml/data/dataset_manager/reader/base.rb CHANGED Viewed

@@ -35,6 +35,18 @@ module EasyML
           private
+          def list_df_nulls(df)
+            df = df.lazy
+            columns = df.columns
+            selects = columns.map do |col|
+              Polars.col(col).null_count.alias(col)
+            end
+            null_info = df.select(selects).collect
+            null_info.to_hashes.first.compact
+            null_info.to_hashes.first.transform_values { |v| v > 0 ? v : nil }.compact.keys
+          end
           def apply_defaults(kwargs)
             options = kwargs.dup

data/lib/easy_ml/data/dataset_manager/reader/data_frame.rb CHANGED Viewed

@@ -1,4 +1,3 @@
 module EasyML
   module Data
     class DatasetManager
@@ -8,11 +7,17 @@ module EasyML
             return query_dataframes(lazy_frames, schema)
           end
+          def list_nulls
+            df = lazy_frames
+            list_df_nulls(df)
+          end
           def schema
             input.schema
           end
-        private
+          private
           def lazy_frames
             input.lazy
           end
@@ -20,4 +25,4 @@ module EasyML
       end
     end
   end
-end
+end

data/lib/easy_ml/data/dataset_manager/reader/file.rb CHANGED Viewed

@@ -15,6 +15,11 @@ module EasyML
             return Batch.new(options, &block).query
           end
+          def list_nulls
+            df = dataframe.lazy
+            list_df_nulls(df)
+          end
           def schema
             @schema ||= files.any? ? Polars.read_parquet_schema(files.first) : nil
           end

data/lib/easy_ml/data/dataset_manager/reader.rb CHANGED Viewed

@@ -17,12 +17,18 @@ module EasyML
           ).query
         end
-        def self.schema(input, **kwargs, &block)
+        def self.schema(input = nil, **kwargs, &block)
           adapter(input).new(
             kwargs.merge!(input: input), &block
           ).schema
         end
+        def self.list_nulls(input = nil, **kwargs, &block)
+          adapter(input).new(
+            kwargs.merge!(input: input), &block
+          ).list_nulls
+        end
         def self.files(dir)
           Dir.glob(::File.join(dir, "**/*.{parquet}"))
         end

data/lib/easy_ml/data/dataset_manager/writer/base.rb CHANGED Viewed

@@ -95,7 +95,7 @@ module EasyML
             keylist = unique_id_key(subdir: "keylist")
             acquire_lock(keylist) do |suo|
-              suo.client.sadd(keylist, key)
+              suo.client.sadd?(keylist, key)
             end
           end

data/lib/easy_ml/data/dataset_manager.rb CHANGED Viewed

@@ -44,13 +44,21 @@ module EasyML
           Reader.schema(input, **kwargs, &block)
         end
+        def list_nulls(input = nil, **kwargs, &block)
+          Reader.list_nulls(input, **kwargs, &block)
+        end
         def num_rows
           Reader.num_rows
         end
       end
-      def num_rows
-        Reader.num_rows(root_dir)
+      def list_nulls(input = nil, **kwargs, &block)
+        Reader.list_nulls(input, **kwargs, &block)
+      end
+      def num_rows(input = nil, **kwargs, &block)
+        Reader.num_rows(input, **kwargs, &block)
       end
       def query(input = nil, **kwargs, &block)

data/lib/easy_ml/data/embeddings/adapters.rb ADDED Viewed

@@ -0,0 +1,56 @@
+module EasyML
+  module Data
+    class Embeddings
+      class Adapters
+        attr_accessor :model, :config
+        ADAPTERS = {
+          anthropic: Langchain::LLM::Anthropic,
+          gemini: Langchain::LLM::GoogleGemini,
+          openai: Langchain::LLM::OpenAI,
+          ollama: Langchain::LLM::Ollama,
+        }
+        DEFAULTS = {
+          api_key: {
+            anthropic: ENV["ANTHROPIC_API_KEY"],
+            gemini: ENV["GEMINI_API_KEY"],
+            openai: ENV["OPENAI_API_KEY"],
+            ollama: ENV["OLLAMA_API_KEY"],
+          },
+        }
+        def initialize(model, config = {})
+          @model = model.to_sym
+          @config = config.symbolize_keys
+          apply_defaults
+        end
+        def embed(df, col)
+          pick
+          texts = df[col].to_a
+          df = df.with_column(
+            embeddings: adapter.embed(text: texts),
+          )
+        end
+        private
+        def pick
+          @adapter ||= ADAPTERS[@model].new(config)
+          self
+        end
+        def apply_defaults
+          @config = @config.deep_symbolize_keys
+          DEFAULTS.each do |k, v|
+            unless @config.key?(k)
+              @config[k] = v[@model]
+            end
+          end
+        end
+      end
+    end
+  end
+end

data/lib/easy_ml/data/embeddings/compression.rb ADDED Viewed

@@ -0,0 +1 @@
+

data/lib/easy_ml/data/embeddings.rb ADDED Viewed

@@ -0,0 +1,43 @@
+module EasyML
+  module Data
+    class Embeddings
+      COMPRESSION_DEFAULT = {
+        present: :balanced,
+      }
+      attr_reader :df, :column, :model, :adapter, :compression,
+                  :embeddings, :compressed_embeddings
+      def initialize(options = {})
+        @df = options[:df]
+        @column = options[:column]
+        @model = options[:model]
+        @config = options[:config] || {}
+        @compression = options[:compression] || COMPRESSION_DEFAULT
+      end
+      def create
+        embed
+        compress
+      end
+      def embed
+        @embeddings ||= adapter.embed(df, column)
+      end
+      def compress
+        @compressed_embeddings ||= compression_adapter.compress(embeddings)
+      end
+      private
+      def adapter
+        @adapter ||= EasyML::Data::Embeddings::Adapters.new(model, config)
+      end
+      def compression_adapter
+        @compression_adapter ||= EasyML::Data::Embeddings::Compression.new(compression)
+      end
+    end
+  end
+end

data/lib/easy_ml/version.rb CHANGED Viewed

@@ -1,7 +1,7 @@
 # frozen_string_literal: true
 module EasyML
-  VERSION = "0.2.0-rc78"
+  VERSION = "0.2.0-rc82"
   module Version
   end

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: easy_ml
 version: !ruby/object:Gem::Version
-  version: 0.2.0.pre.rc78
+  version: 0.2.0.pre.rc82
 platform: ruby
 authors:
 - Brett Shollenberger
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2025-02-18 00:00:00.000000000 Z
+date: 2025-02-20 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: activerecord
@@ -719,6 +719,9 @@ files:
 - lib/easy_ml/data/dataset_manager/writer/partitioned.rb
 - lib/easy_ml/data/dataset_manager/writer/partitioned/partition_reasons.rb
 - lib/easy_ml/data/date_converter.rb
+- lib/easy_ml/data/embeddings.rb
+- lib/easy_ml/data/embeddings/adapters.rb
+- lib/easy_ml/data/embeddings/compression.rb
 - lib/easy_ml/data/partition.rb
 - lib/easy_ml/data/partition/boundaries.rb
 - lib/easy_ml/data/polars_column.rb