RubyGems - easy_ml - Versions diffs - 0.2.0.pre.rc57 → 0.2.0.pre.rc58 - Mend

easy_ml 0.2.0.pre.rc57 → 0.2.0.pre.rc58

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (35)

checksums.yaml +4 -4
data/app/controllers/easy_ml/apis_controller.rb +8 -0
data/app/controllers/easy_ml/models_controller.rb +3 -0
data/app/controllers/easy_ml/predictions_controller.rb +10 -5
data/app/frontend/components/ModelForm.tsx +1 -1
data/app/frontend/components/SearchableSelect.tsx +0 -1
data/app/frontend/components/dataset/PreprocessingConfig.tsx +1 -1
data/app/frontend/pages/DatasourcesPage.tsx +0 -2
data/app/jobs/easy_ml/compute_feature_job.rb +1 -0
data/app/models/easy_ml/column.rb +42 -4
data/app/models/easy_ml/column_history.rb +5 -1
data/app/models/easy_ml/column_list.rb +43 -11
data/app/models/easy_ml/dataset.rb +45 -25
data/app/models/easy_ml/datasource.rb +1 -0
data/app/models/easy_ml/feature.rb +10 -3
data/app/models/easy_ml/model.rb +25 -4
data/app/models/easy_ml/model_history.rb +1 -0
data/app/models/easy_ml/retraining_run.rb +1 -0
data/config/initializers/inflections.rb +2 -0
data/config/routes.rb +3 -0
data/lib/easy_ml/core/tuner.rb +1 -1
data/lib/easy_ml/data/preprocessor.rb +10 -53
data/lib/easy_ml/data/splits/in_memory_split.rb +4 -0
data/lib/easy_ml/data/statistics_learner.rb +79 -14
data/lib/easy_ml/data/synced_directory.rb +4 -2
data/lib/easy_ml/predict.rb +13 -2
data/lib/easy_ml/railtie/generators/migration/migration_generator.rb +3 -0
data/lib/easy_ml/railtie/templates/migration/add_computed_columns_to_easy_ml_columns.rb.tt +14 -0
data/lib/easy_ml/railtie/templates/migration/add_default_to_is_target.rb.tt +6 -0
data/lib/easy_ml/railtie/templates/migration/add_slug_to_easy_ml_models.rb.tt +20 -0
data/lib/easy_ml/version.rb +1 -1
data/public/easy_ml/assets/.vite/manifest.json +1 -1
data/public/easy_ml/assets/assets/entrypoints/{Application.tsx-DTZ2348z.js → Application.tsx-DmkdJsDd.js} +34 -34
data/public/easy_ml/assets/assets/entrypoints/{Application.tsx-DTZ2348z.js.map → Application.tsx-DmkdJsDd.js.map} +1 -1
metadata +8 -4

data/lib/easy_ml/core/tuner.rb CHANGED Viewed

@@ -173,7 +173,7 @@ module EasyML
         end
         raise ArgumentError, "Objectives required for EasyML::Core::Tuner" unless objective.present?
-        self.metrics = EasyML::Model.new(task: task).allowed_metrics if metrics.nil? || metrics.empty?
+        self.metrics = EasyML::Model.new(task: task).default_metrics if metrics.nil? || metrics.empty?
       end
     end
   end

data/lib/easy_ml/data/preprocessor.rb CHANGED Viewed

@@ -90,46 +90,19 @@ module EasyML::Data
       df
     end
-    def learn_categorical_min(df, preprocessing_steps)
-      preprocessing_steps ||= {}
-      preprocessing_steps.deep_symbolize_keys!
-      allowed_categories = {}
-      (preprocessing_steps[:training] || {}).each_key do |col|
-        next unless [
-          preprocessing_steps.dig(:training, col, :params, :ordinal_encoding),
-          preprocessing_steps.dig(:training, col, :params, :one_hot),
-          preprocessing_steps.dig(:training, col, :method).to_sym == :categorical,
-        ].any?
-        cat_min = preprocessing_steps.dig(:training, col, :params, :categorical_min) || 1
-        val_counts = df[col].value_counts
-        allowed_categories[col] = val_counts[val_counts["count"] >= cat_min][col].to_a.compact
-      end
-      allowed_categories
-    end
-    def fit(df)
+    def fit(df, precomputed_stats = {})
       return if df.nil?
       return if preprocessing_steps.nil? || preprocessing_steps.keys.none?
       preprocessing_steps.deep_symbolize_keys!
       df = apply_clip(df, preprocessing_steps)
-      allowed_categories = learn_categorical_min(df, preprocessing_steps)
-      self.statistics = StatisticsLearner.learn_df(df, dataset: dataset).deep_symbolize_keys
-      # Merge allowed categories into statistics
-      allowed_categories.each do |col, categories|
-        statistics[col] ||= {}
-        statistics[col][:allowed_categories] = categories
-        statistics[col].merge!(
-          fit_categorical(df[col], preprocessing_steps)
-        )
-      end
+      self.statistics = StatisticsLearner.learn_df(df, dataset: dataset, type: :raw).deep_symbolize_keys.merge!(
+        precomputed_stats
+      ).deep_symbolize_keys
     end
-    def postprocess(df, inference: false)
+    def postprocess(df, inference: false, computed: false)
       puts "Postprocessing..." if verbose
       return df if preprocessing_steps.nil? || preprocessing_steps.keys.none?
@@ -139,6 +112,11 @@ module EasyML::Data
           preprocessing_steps[:training]
         end
+      if computed
+        computed_cols = dataset.columns.computed.map(&:name).map(&:to_sym)
+        steps = steps.deep_dup.slice(*computed_cols)
+      end
       df = apply_transformations(df, steps)
       puts "Postprocessing complete." if @verbose
@@ -260,27 +238,6 @@ module EasyML::Data
       )
     end
-    def fit_categorical(series, _preprocessing_steps)
-      value_counts = series.value_counts
-      column_names = value_counts.columns
-      value_column = column_names[0]
-      count_column = column_names[1]
-      as_hash = value_counts.select([value_column, count_column]).rows.to_a.to_h.transform_keys(&:to_s)
-      label_encoder = as_hash.keys.sort.each.with_index.reduce({}) do |h, (k, i)|
-        h.tap do
-          h[k] = i
-        end
-      end
-      label_decoder = label_encoder.invert
-      {
-        value: as_hash,
-        label_encoder: label_encoder,
-        label_decoder: label_decoder,
-      }
-    end
     def prepare_for_imputation(df, col)
       df = df.with_column(Polars.col(col).cast(Polars::Float64))
       df.with_column(Polars.when(Polars.col(col).is_null).then(Float::NAN).otherwise(Polars.col(col)).alias(col))

data/lib/easy_ml/data/splits/in_memory_split.rb CHANGED Viewed

@@ -41,6 +41,10 @@ module EasyML
           split_features_targets(df, split_ys, target)
         end
+        def query(**kwargs)
+          read("all", **kwargs)
+        end
         def cleanup
           @data.clear
         end

data/lib/easy_ml/data/statistics_learner.rb CHANGED Viewed

@@ -9,15 +9,16 @@ module EasyML::Data
       @verbose = options[:verbose]
     end
-    def self.learn(df, dataset = nil)
-      new(df, dataset).learn
+    def self.learn(df, dataset, type)
+      new(df, dataset, type).learn
     end
-    attr_reader :df, :dataset
+    attr_reader :df, :dataset, :type
-    def initialize(df, dataset)
+    def initialize(df, dataset, type)
       @df = df
       @dataset = dataset
+      @type = type.to_sym
     end
     def learn
@@ -27,18 +28,73 @@ module EasyML::Data
     def learn_split(split)
       df = split.read(:all)
       train_df = split.read(:train)
-      all_stats = learn_df(df, dataset: dataset)
-      train_stats = learn_df(train_df, dataset: dataset)
+      all_stats = learn_df(df)
+      train_stats = learn_df(train_df)
       all_stats.reduce({}) do |output, (k, _)|
         output.tap do
           output[k] = all_stats[k].slice(:num_rows, :null_count, :unique_count, :counts).merge!(
-            train_stats[k].slice(:mean, :median, :min, :max, :std, :last_value, :most_frequent_value, :last_known_value)
+            train_stats[k].slice(:mean, :median, :min, :max, :std,
+                                 :last_value, :most_frequent_value, :last_known_value,
+                                 :allowed_categories, :label_encoder, :label_decoder)
           )
         end
       end
     end
+    def learn_categorical(df)
+      allowed_categories = learn_allowed_categories(df)
+      allowed_categories.reduce({}) do |statistics, (col, categories)|
+        statistics.tap do
+          statistics[col] ||= {}
+          statistics[col][:allowed_categories] = categories
+          statistics[col].merge!(
+            learn_categorical_encoder_decoder(df[col])
+          )
+        end
+      end
+    end
+    def learn_categorical_encoder_decoder(series)
+      value_counts = series.value_counts
+      column_names = value_counts.columns
+      value_column = column_names[0]
+      count_column = column_names[1]
+      as_hash = value_counts.select([value_column, count_column]).rows.to_a.to_h.transform_keys(&:to_s)
+      label_encoder = as_hash.keys.sort.each.with_index.reduce({}) do |h, (k, i)|
+        h.tap do
+          h[k] = i
+        end
+      end
+      label_decoder = label_encoder.invert
+      {
+        value: as_hash,
+        label_encoder: label_encoder,
+        label_decoder: label_decoder,
+      }
+    end
+    def learn_allowed_categories(df)
+      preprocessing_steps = dataset.preprocessing_steps || {}
+      preprocessing_steps.deep_symbolize_keys!
+      allowed_categories = {}
+      (preprocessing_steps[:training] || {}).each_key do |col|
+        next unless [
+          preprocessing_steps.dig(:training, col, :params, :ordinal_encoding),
+          preprocessing_steps.dig(:training, col, :params, :one_hot),
+          preprocessing_steps.dig(:training, col, :method).to_sym == :categorical,
+        ].any?
+        cat_min = preprocessing_steps.dig(:training, col, :params, :categorical_min) || 1
+        val_counts = df[col].value_counts
+        allowed_categories[col] = val_counts[val_counts["count"] >= cat_min][col].to_a.compact
+      end
+      allowed_categories
+    end
     def last_known_value(df, col, date_col)
       return nil if df.empty? || !df.columns.include?(date_col)
@@ -53,13 +109,22 @@ module EasyML::Data
       last_value
     end
-    def learn_df(df, dataset: nil)
-      self.class.learn_df(df, dataset: dataset)
+    def learn_df(df)
+      return if df.nil?
+      stats = learn_base_stats(df, dataset: dataset).stringify_keys
+      if type == :raw
+        categorical = learn_categorical(df).stringify_keys
+        categorical.each { |k, v| stats[k].merge!(v) }
+      end
+      stats
     end
-    def self.learn_df(df, dataset: nil)
-      return if df.nil?
+    def self.learn_df(df, dataset: nil, type: :raw)
+      new(df, dataset, type).learn_df(df)
+    end
+    def learn_base_stats(df, dataset: nil)
       base_stats = describe_to_h(df).deep_symbolize_keys
       # Add basic column statistics first
@@ -103,16 +168,16 @@ module EasyML::Data
       end
     end
-    def self.id_column?(column)
+    def id_column?(column)
       col = column.to_s.downcase
       col.match?(/^id$/) || col.match?(/.*_id/)
     end
-    def self.last_value(df, col, date_col)
+    def last_value(df, col, date_col)
       df.filter(Polars.col(col).is_not_null).sort(date_col)[col][-1]
     end
-    def self.describe_to_h(df)
+    def describe_to_h(df)
       init_h = df.describe.to_h
       rows = init_h.values.map(&:to_a)
       keys = rows.first

data/lib/easy_ml/data/synced_directory.rb CHANGED Viewed

@@ -127,8 +127,10 @@ module EasyML
         )
         Rails.logger.info("Downloaded #{object.key} to #{local_file_path}")
-        ungzipped_file_path = ungzip_file(local_file_path)
-        Rails.logger.info("Ungzipped to #{ungzipped_file_path}")
+        if object.key.end_with?(".gz")
+          ungzipped_file_path = ungzip_file(local_file_path)
+          Rails.logger.info("Ungzipped to #{ungzipped_file_path}")
+        end
       rescue Aws::S3::Errors::ServiceError, Net::OpenTimeout, Net::ReadTimeout, StandardError => e
         Rails.logger.error("Failed to process #{object.key}: #{e.message}")
         raise e

data/lib/easy_ml/predict.rb CHANGED Viewed

@@ -10,11 +10,17 @@ module EasyML
       @models = {}
     end
-    def self.predict(model_name, df, serialize: false)
+    def self.normalize_input(df)
       if df.is_a?(Hash)
         df = Polars::DataFrame.new(df)
       end
+      df
+    end
+    def self.predict(model_name, df, serialize: false)
+      df = normalize_input(df)
       raw_input = df.to_hashes
       df = instance.normalize(model_name, df)
       normalized_input = df.to_hashes
       preds = instance.predict(model_name, df)
@@ -52,6 +58,11 @@ module EasyML
       get_model(model_name).predict(df)
     end
+    def self.validate_input(model_name, df)
+      df = normalize_input(df)
+      instance.get_model(model_name).dataset.validate_input(df)
+    end
     def normalize(model_name, df)
       get_model(model_name).dataset.normalize(df, inference: true)
     end
@@ -72,7 +83,7 @@ module EasyML
     private
     def load_model(model_name)
-      current_model = EasyML::Model.find_by!(name: model_name).inference_version
+      current_model = EasyML::Model.find_by!(slug: model_name).inference_version
       # Load new model if not loaded or different version
       model_not_loaded = models[model_name].nil?

data/lib/easy_ml/railtie/generators/migration/migration_generator.rb CHANGED Viewed

@@ -41,6 +41,9 @@ module EasyML
             add_workflow_status_to_easy_ml_features
             drop_path_from_easy_ml_model_files
             add_is_date_column_to_easy_ml_columns
+            add_computed_columns_to_easy_ml_columns
+            add_slug_to_easy_ml_models
+            add_default_to_is_target
           ].freeze
           # Specify the next migration number

data/lib/easy_ml/railtie/templates/migration/add_computed_columns_to_easy_ml_columns.rb.tt ADDED Viewed

@@ -0,0 +1,14 @@
+class AddComputedColumnsToEasyMLColumns < ActiveRecord::Migration[<%= ActiveRecord::Migration.current_version %>]
+  def change
+    add_column :easy_ml_columns, :computed_by, :string
+    add_column :easy_ml_columns, :is_computed, :boolean, default: false
+    add_index :easy_ml_columns, :computed_by
+    add_index :easy_ml_columns, :is_computed
+    add_column :easy_ml_column_histories, :computed_by, :string
+    add_index :easy_ml_column_histories, :computed_by
+    add_column :easy_ml_column_histories, :is_computed, :boolean, default: false
+    add_index :easy_ml_column_histories, :is_computed
+  end
+end

data/lib/easy_ml/railtie/templates/migration/add_default_to_is_target.rb.tt ADDED Viewed

@@ -0,0 +1,6 @@
+class AddDefaultToIsTarget < ActiveRecord::Migration[<%= ActiveRecord::Migration.current_version %>]
+  def change
+    change_column_default(:easy_ml_columns, :is_target, false)
+    change_column_default(:easy_ml_column_histories, :is_target, false)
+  end
+end

data/lib/easy_ml/railtie/templates/migration/add_slug_to_easy_ml_models.rb.tt ADDED Viewed

@@ -0,0 +1,20 @@
+class AddSlugToEasyMLModels < ActiveRecord::Migration[<%= ActiveRecord::Migration.current_version %>]
+  def change
+    add_column :easy_ml_models, :slug, :string
+    add_index :easy_ml_models, :slug, unique: true
+    reversible do |dir|
+      dir.up do
+        execute <<-SQL
+          UPDATE easy_ml_models
+          SET slug = LOWER(REPLACE(name, ' ', '_'))
+        SQL
+      end
+    end
+    change_column_null :easy_ml_models, :slug, false
+    add_column :easy_ml_model_histories, :slug, :string
+    add_index :easy_ml_model_histories, :slug
+  end
+end

data/lib/easy_ml/version.rb CHANGED Viewed

@@ -1,7 +1,7 @@
 # frozen_string_literal: true
 module EasyML
-  VERSION = "0.2.0-rc57"
+  VERSION = "0.2.0-rc58"
   module Version
   end

data/public/easy_ml/assets/.vite/manifest.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "entrypoints/Application.tsx": {
-    "file": "assets/entrypoints/Application.tsx-DTZ2348z.js",
+    "file": "assets/entrypoints/Application.tsx-DmkdJsDd.js",
     "name": "entrypoints/Application.tsx",
     "src": "entrypoints/Application.tsx",
     "isEntry": true,