RubyGems - easy_ml - Versions diffs - 0.2.0.pre.rc39 → 0.2.0.pre.rc41 - Mend

easy_ml 0.2.0.pre.rc39 → 0.2.0.pre.rc41

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (54) hide show

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: a53f346d534b4333dcc9f8880c9d0fd4d14acf7a596be57caf42ea789490e4e4
-  data.tar.gz: 912b118a2c82f2397afce24d39a5c556d98fc58a310647de7232851f7ee606b4
+  metadata.gz: 03333c45a1103bf7e75446a0c54d6799b62e64646abc1ab2abac123a206d1424
+  data.tar.gz: cb5ba985a5b8e5fd136b92e5ca5f65162d5189f06ba91bdd6e6763a69f5fbe56
 SHA512:
-  metadata.gz: 4d641839d982e782d5921cc7086ef583f3ae0f5901446d0c132deacb5a7101b420428cbbb52f58b0a4f9a94fb3ed6cd7f72cb1705eb2c7c3c4f938c4a7e9702d
-  data.tar.gz: b19d313ece15cb343138f4d6e036947ba764708b80695ef6d7b502c45e3e34662dec959d8e1c1f423109f89ac87101b3383d64b80da3d32cc7e9bee7b30c6f5c
+  metadata.gz: ddd88ca06fecf8366a7e3fa370b2f78e0e73ebc4c29fefcdd6cd0b208286710d503b767d6fabcf9af5eb40105bb0ffe63ddd773278b0cc9379750dfb4763d87f
+  data.tar.gz: 0c3b8dfdb6d293692439a1818dca5fe1974e27039895e54f08e492bb621c563a77cc4e0921f1df58603526a0b95354741684052aa9f1df8928e5b54de6f8caac

data/Rakefile CHANGED Viewed

@@ -1,6 +1,5 @@
 # frozen_string_literal: true
-require "sprockets/railtie"
 require "bundler/gem_tasks"
 require "rspec/core/rake_task"
@@ -20,16 +19,19 @@ require_relative "lib/easy_ml"
 # Load the annotate tasks
 require "annotate/annotate_models"
+require "combustion"
+Combustion.path = "spec/internal"
+Combustion::Application.configure_for_combustion
 task :environment do
-  require "combustion"
-  require "sprockets"
-  Combustion.path = "spec/internal"
-  Combustion.initialize! :active_record do |config|
-    config.assets = ActiveSupport::OrderedOptions.new # Stub to avoid errors
-    config.assets.enabled = false # Set false since assets are handled by Vite
-  end
-  EasyML::Engine.eager_load!
+  Combustion::Application.initialize!
+  # Reset migrations paths so we can keep the migrations in the project root,
+  # not the Rails root
+  migrations_paths = ["spec/internal/db/migrate"]
+  ActiveRecord::Tasks::DatabaseTasks.migrations_paths = migrations_paths
+  ActiveRecord::Migrator.migrations_paths = migrations_paths
 end
+Combustion::Application.load_tasks
 namespace :easy_ml do
   task annotate_models: :environment do

data/app/controllers/easy_ml/application_controller.rb CHANGED Viewed

@@ -12,7 +12,7 @@ module EasyML
     before_action :hot_reload
     def hot_reload
-      return unless Rails.env.development? && ENV["EASY_ML_DEMO_APP"]
+      return unless Rails.env.development? && ENV["EASY_ML_DEV"]
       Dir[EasyML::Engine.root.join("lib/**/*")].select { |f| Pathname.new(f).extname == ".rb" }.each do |file|
         load file

data/app/frontend/components/dataset/splitters/DateSplitter.tsx CHANGED Viewed

@@ -12,14 +12,14 @@ export function DateSplitter({ attributes, columns, onChange }: DateSplitterProp
   return (
     <div className="space-y-4">
       <div>
-        <label htmlFor="date_column" className="block text-sm font-medium text-gray-700">
+        <label htmlFor="date_col" className="block text-sm font-medium text-gray-700">
           Date Column
         </label>
         <SearchableSelect
-          id="date_column"
-          value={attributes.date_column}
+          id="date_col"
+          value={attributes.date_col}
           options={columns.map(col => ({ value: col, label: col }))}
-          onChange={(value) => onChange({ ...attributes, date_column: value })}
+          onChange={(value) => onChange({ ...attributes, date_col: value })}
           placeholder="Select date column"
         />
       </div>

data/app/frontend/components/dataset/splitters/types.ts CHANGED Viewed

@@ -18,7 +18,7 @@ export type SplitterType =
   | 'leave_p_out';
 export interface DateSplitConfig {
-  date_column: string;
+  date_col: string;
   months_test: number;
   months_valid: number;
 }
@@ -81,7 +81,7 @@ export interface ValidationResult {
 // Validation functions for each splitter type
 export const validateDateSplitter = (config: DateSplitConfig): ValidationResult => {
-  if (!config.date_column) {
+  if (!config.date_col) {
     return { isValid: false, error: "Please select a date column" };
   }
   if (!config.months_test || config.months_test <= 0) {
@@ -108,7 +108,7 @@ export const validateRandomSplitter = (config: RandomSplitConfig): ValidationRes
 };
 export const validatePredefinedSplitter = (config: PredefinedSplitConfig): ValidationResult => {
-  if (!config.files || config.files.length === 0) {
+  if (!config.train_files || config.train_files.length === 0) {
     return { isValid: false, error: "Please select at least one file for splitting" };
   }
   return { isValid: true };

data/app/frontend/pages/NewDatasetPage.tsx CHANGED Viewed

@@ -30,7 +30,7 @@ export default function NewDatasetPage({ constants, datasources }: NewDatasetFor
     switch (type) {
       case 'date':
         const dateConfig: DateSplitConfig = {
-          date_column: '',
+          date_col: '',
           months_test: 2,
           months_valid: 2
         };

data/app/helpers/easy_ml/application_helper.rb CHANGED Viewed

@@ -3,8 +3,8 @@
 module EasyML
   module ApplicationHelper
     # Override: Returns the engine assets manifest.
-    def vite_manifest
-      ViteRuby.new(EasyML::Engine.root).manifest
+    def easy_ml_manifest
+      ViteRuby.new(root: EasyML::Engine.root).manifest
     end
     def prod_script_tags

data/app/jobs/easy_ml/compute_feature_job.rb CHANGED Viewed

@@ -1,12 +1,43 @@
 module EasyML
   class ComputeFeatureJob < BatchJob
+    extend EasyML::DataframeSerialization
     @queue = :easy_ml
     def self.perform(batch_id, options = {})
+      puts "processing batch_id #{batch_id}"
       options.symbolize_keys!
       feature_id = options.dig(:feature_id)
       feature = EasyML::Feature.find(feature_id)
-      feature.fit_batch(options)
+      dataset = feature.dataset
+      # Check if any feature has failed before proceeding
+      if dataset.features.any? { |f| f.workflow_status == "failed" }
+        puts "Aborting feature computation due to previous feature failure"
+        return
+      end
+      begin
+        feature.fit_batch(options.merge!(batch_id: batch_id))
+      rescue => e
+        puts "Error computing feature: #{e.message}"
+        EasyML::Feature.transaction do
+          return if dataset.reload.workflow_status == :failed
+          puts "Logging error"
+          feature.update(workflow_status: :failed)
+          dataset.update(workflow_status: :failed)
+          build_error_with_context(dataset, e, batch_id, feature)
+        end
+      end
+    end
+    def self.build_error_with_context(dataset, error, batch_id, feature)
+      error = EasyML::Event.handle_error(dataset, error)
+      batch = feature.build_batch(batch_id: batch_id)
+      # Convert any dataframes in the context to serialized form
+      error.create_context(context: batch)
     end
     def self.after_batch_hook(batch_id, *args)
@@ -15,5 +46,27 @@ module EasyML
       dataset = EasyML::Feature.find_by(id: feature_ids.first).dataset
       dataset.after_fit_features
     end
+    # Placeholder: the body is empty, so this always returns nil (falsy) regardless
+    # of +feature+. NOTE(review): intended semantics are not defined here — implement or remove.
+    def self.feature_fully_processed?(feature)
+    end
+    private
+    # Dequeues jobs of this class from the :easy_ml Resque queue whose first
+    # argument equals +batch_id+.
+    # NOTE(review): the bare `private` above does not apply to methods defined with
+    # `def self.`, so this stays publicly callable; use `private_class_method` if
+    # it is meant to be private.
+    def self.remove_remaining_batch_jobs(batch_id)
+      # Remove all remaining jobs in the batch
+      # Each pass peeks at the queue starting at offset 0 with a count of 1000.
+      while (jobs = Resque.peek(:easy_ml, 0, 1000)).any?
+        jobs.each do |job|
+          if job["args"][0] == batch_id
+            Resque.dequeue(self, *job["args"])
+          end
+        end
+        # Break if we've processed all jobs (no more jobs match our batch_id)
+        # `jobs` is the snapshot taken before dequeuing: if it contained a match,
+        # loop again to re-peek; otherwise stop.
+        break unless jobs.any? { |job| job["args"][0] == batch_id }
+      end
+    end
   end
 end
+# If any feature fails, the entire batch fails
+# If any feature fails, the RELATED batches should fail

data/app/models/concerns/easy_ml/dataframe_serialization.rb ADDED Viewed

@@ -0,0 +1,30 @@
+module EasyML
+  module DataframeSerialization
+    extend ActiveSupport::Concern
+    def serialize_dataframe(df)
+      return unless df
+      JSON.parse(df.write_json)
+    end
+    def deserialize_dataframe(df_data)
+      return unless df_data.present? && df_data.key?("columns")
+      columns = df_data["columns"].map do |col|
+        dtype = case col["datatype"]
+          when Hash
+            if col["datatype"]["Datetime"]
+              Polars::Datetime.new(col["datatype"]["Datetime"][0].downcase.to_sym).class
+            else
+              Polars::Utf8
+            end
+          else
+            Polars.const_get(col["datatype"])
+          end
+        Polars::Series.new(col["name"], col["values"], dtype: dtype)
+      end
+      Polars::DataFrame.new(columns)
+    end
+  end
+end

data/app/models/easy_ml/dataset.rb CHANGED Viewed

@@ -102,13 +102,9 @@ module EasyML
     end
     def root_dir
-      persisted = read_attribute(:root_dir)
+      relative_dir = read_attribute(:root_dir) || default_root_dir
-      if persisted.present? && !persisted.blank?
-        EasyML::Engine.root_dir.join(persisted).to_s
-      else
-        default_root_dir
-      end
+      EasyML::Engine.root_dir.join(relative_dir).to_s
     end
     def destructively_cleanup!
@@ -219,8 +215,11 @@ module EasyML
     end
     def after_fit_features
-      features.update_all(needs_fit: false, fit_at: Time.current)
       unlock!
+      reload
+      return if failed?
+      features.update_all(needs_fit: false, fit_at: Time.current)
       actually_refresh
     end
@@ -281,22 +280,24 @@ module EasyML
     end
     def refreshing
-      return false if is_history_class?
-      unlock! unless analyzing?
-      lock_dataset do
-        update(workflow_status: "analyzing")
-        fully_reload
-        yield
-      ensure
-        unlock!
-      end
-    rescue => e
-      update(workflow_status: "failed")
-      e.backtrace.grep(/easy_ml/).each do |line|
-        puts line
+      begin
+        return false if is_history_class?
+        unlock! unless analyzing?
+        lock_dataset do
+          update(workflow_status: "analyzing")
+          fully_reload
+          yield
+        ensure
+          unlock!
+        end
+      rescue => e
+        update(workflow_status: "failed")
+        e.backtrace.grep(/easy_ml/).each do |line|
+          puts line
+        end
+        raise e
       end
-      raise e
     end
     def unlock!

data/app/models/easy_ml/dataset_history.rb CHANGED Viewed

@@ -30,17 +30,12 @@ module EasyML
     self.table_name = "easy_ml_dataset_histories"
     include Historiographer::History
-    has_many :columns,
-      ->(dataset_history) { where(snapshot_id: dataset_history.snapshot_id) },
+    has_many :columns, ->(dataset_history) { where(snapshot_id: dataset_history.snapshot_id) },
       class_name: "EasyML::ColumnHistory",
       foreign_key: "dataset_id",
       primary_key: "dataset_id",
       extend: EasyML::ColumnList
-    def root_dir
-      read_attribute(:root_dir)
-    end
     def fit
       false
     end

data/app/models/easy_ml/datasources/polars_datasource.rb CHANGED Viewed

@@ -1,6 +1,8 @@
 module EasyML
   module Datasources
     class PolarsDatasource < BaseDatasource
+      include EasyML::DataframeSerialization
       validates :df, presence: true
       add_configuration_attributes :df
@@ -58,7 +60,7 @@ module EasyML
         return unless df
         datasource.configuration = (datasource.configuration || {}).merge(
-          "df" => JSON.parse(df.write_json),
+          "df" => serialize_dataframe(df),
         )
       end
@@ -66,23 +68,7 @@ module EasyML
         return unless datasource.configuration&.key?("df")
         df_data = datasource.configuration["df"]
-        return unless df_data.present? && df_data.key?("columns")
-        columns = df_data["columns"].map do |col|
-          dtype = case col["datatype"]
-            when Hash
-              if col["datatype"]["Datetime"]
-                Polars::Datetime.new(col["datatype"]["Datetime"][0].downcase.to_sym).class
-              else
-                Polars::Utf8
-              end
-            else
-              Polars.const_get(col["datatype"])
-            end
-          Polars::Series.new(col["name"], col["values"], dtype: dtype)
-        end
-        datasource.df = Polars::DataFrame.new(columns)
+        datasource.df = deserialize_dataframe(df_data)
       end
     end
   end

data/app/models/easy_ml/event.rb CHANGED Viewed

@@ -19,6 +19,7 @@ module EasyML
     STATUSES = %w[started success failed].freeze
     belongs_to :eventable, polymorphic: true, optional: true
+    has_one :context, dependent: :destroy, class_name: "EasyML::EventContext"
     validates :name, presence: true
     validates :status, presence: true, inclusion: { in: STATUSES }
@@ -51,8 +52,8 @@ module EasyML
           error = e
         end
       end
-      create_event(model, "failed", error)
       Rails.logger.error("#{self.class.name} failed: #{error.message}")
+      create_event(model, "failed", error)
     end
     def self.format_stacktrace(error)

data/app/models/easy_ml/event_context.rb ADDED Viewed

@@ -0,0 +1,58 @@
+# == Schema Information
+#
+# Table name: easy_ml_event_contexts
+#
+#  id         :bigint           not null, primary key
+#  event_id   :bigint           not null
+#  context    :jsonb            not null
+#  created_at :datetime         not null
+#  updated_at :datetime         not null
+#
+module EasyML
+  class EventContext < ActiveRecord::Base
+    include EasyML::DataframeSerialization
+    self.table_name = "easy_ml_event_contexts"
+    belongs_to :event
+    validates :context, presence: true
+    validates :event, presence: true
+    def context=(new_context)
+      write_attribute(:context, serialize_context(new_context))
+      @context = new_context
+    end
+    def context
+      @context ||= deserialize_context(read_attribute(:context))
+    end
+    private
+    def serialize_context(new_context)
+      case new_context
+      when Hash
+        self.format = :json
+        new_context.to_json
+      when YAML
+        self.format = :yaml
+        new_context.to_yaml
+      when Polars::DataFrame
+        self.format = :dataframe
+        serialize_dataframe(new_context)
+      end
+    end
+    def deserialize_context(context)
+      case format.to_sym
+      when :json
+        JSON.parse(context)
+      when :yaml
+        YAML.safe_load(context)
+      when :dataframe
+        deserialize_dataframe(context)
+      end
+    end
+  end
+end

data/app/models/easy_ml/feature.rb CHANGED Viewed

@@ -17,6 +17,7 @@
 #  refresh_every    :bigint
 #  created_at       :datetime         not null
 #  updated_at       :datetime         not null
+#  workflow_status  :string
 #
 module EasyML
   class Feature < ActiveRecord::Base
@@ -24,6 +25,11 @@ module EasyML
     include Historiographer::Silent
     historiographer_mode :snapshot_only
+    enum workflow_status: {
+      analyzing: "analyzing",
+      ready: "ready",
+      failed: "failed",
+    }
     class << self
       def compute_sha(feature_class)
         require "digest"
@@ -135,13 +141,22 @@ module EasyML
       adapter.respond_to?(:batch) || config.dig(:batch_size).present?
     end
+    def primary_key
+      pkey = config.dig(:primary_key)
+      if pkey.is_a?(Array)
+        pkey
+      else
+        [pkey]
+      end
+    end
     def numeric_primary_key?
       if primary_key.nil?
         return false unless should_be_batchable?
         raise "Couldn't find primary key for feature #{feature_class}, check your feature class"
       end
-      dataset.raw.data(limit: 1, select: primary_key)[primary_key].to_a.flat_map(&:values).all? do |value|
+      dataset.raw.data(limit: 1, select: primary_key)[primary_key].to_a.flat_map { |h| h.respond_to?(:values) ? h.values : h }.all? do |value|
         case value
         when String then value.match?(/\A[-+]?\d+(\.\d+)?\z/)
         else
@@ -171,14 +186,14 @@ module EasyML
           unless primary_key.present?
             raise "Couldn't find primary key for feature #{feature_class}, check your feature class"
           end
-          df = reader.query(select: [primary_key.first])
+          df = reader.query(select: primary_key)
         rescue => e
           raise "Couldn't find primary key #{primary_key.first} for feature #{feature_class}: #{e.message}"
         end
         return [] if df.nil?
         min_id = df[primary_key.first].min
-        max_id = df[primary_key.first].max
+        max_id = df[primary_key.last].max
       end
       (min_id..max_id).step(batch_size).map do |batch_start|
@@ -196,7 +211,11 @@ module EasyML
     end
     def fit(features: [self], async: false)
-      jobs = features.flat_map(&:build_batches)
+      # Sort features by position to ensure they're processed in order
+      features.update_all(workflow_status: :analyzing)
+      ordered_features = features.sort_by(&:feature_position)
+      jobs = ordered_features.flat_map(&:build_batches)
       if async
         EasyML::ComputeFeatureJob.enqueue_batch(jobs)
       else
@@ -266,13 +285,11 @@ module EasyML
           batch_df = adapter.fit(df, self, batch_args)
         end
       end
-      raise "Feature #{feature_class}#fit must return a dataframe" unless batch_df.present?
-      store(batch_df)
-      updates = {
-        applied_at: Time.current,
-        needs_fit: false,
-      }.compact
-      update!(updates)
+      if batch_df.present?
+        store(batch_df)
+      else
+        "Feature #{feature_class}#fit should return a dataframe, received #{batch_df.class}"
+      end
       batch_df
     end
@@ -335,6 +352,7 @@ module EasyML
     def apply_defaults
       self.name ||= self.feature_class.demodulize.titleize
       self.version ||= 1
+      self.workflow_status ||= :ready
     end
     def needs_columns
@@ -371,6 +389,17 @@ module EasyML
         (should_be_batchable? ? 10_000 : nil)
     end
+    def after_fit
+      updates = {
+        applied_at: Time.current,
+        needs_fit: false,
+      }.compact
+      update!(updates)
+    end
+    # Placeholder: the body is empty, so this always returns nil (falsy).
+    # NOTE(review): intended semantics are not defined here — implement or remove.
+    def fully_processed?
+    end
     private
     def bulk_update_positions(features)

data/app/models/easy_ml/model.rb CHANGED Viewed

@@ -544,7 +544,6 @@ module EasyML
     def new_model_file!
       build_model_file(
-        root_dir: root_dir,
         model: self,
         s3_bucket: EasyML::Configuration.s3_bucket,
         s3_region: EasyML::Configuration.s3_region,

data/app/models/easy_ml/model_file.rb CHANGED Viewed

@@ -23,7 +23,7 @@ module EasyML
     belongs_to :model, class_name: "EasyML::Model"
     include EasyML::Concerns::Configurable
-    add_configuration_attributes :s3_bucket, :s3_prefix, :s3_region, :s3_access_key_id, :s3_secret_access_key, :root_dir
+    add_configuration_attributes :s3_bucket, :s3_prefix, :s3_region, :s3_access_key_id, :s3_secret_access_key
     def synced_file
       EasyML::Support::SyncedFile.new(
@@ -33,10 +33,14 @@ module EasyML
         s3_region: s3_region,
         s3_access_key_id: s3_access_key_id,
         s3_secret_access_key: s3_secret_access_key,
-        root_dir: root_dir,
+        root_dir: full_dir,
       )
     end
+    # Delegates to the associated model's root_dir.
+    def root_dir
+      model.root_dir
+    end
     def exist?
       fit?
     end
@@ -103,7 +107,7 @@ module EasyML
     end
     def relative_dir
-      root_dir.to_s.gsub(Regexp.new(Rails.root.to_s), "").gsub!(%r{^/}, "")
+      root_dir.to_s.gsub(Regexp.new(Rails.root.to_s), "").gsub(%r{^/}, "")
     end
     def full_dir

data/app/models/easy_ml/splitter_history.rb CHANGED Viewed

@@ -1,3 +1,19 @@
+# == Schema Information
+#
+# Table name: easy_ml_splitter_histories
+#
+#  id                 :bigint           not null, primary key
+#  splitter_id        :integer          not null
+#  splitter_type      :string           not null
+#  configuration      :json
+#  dataset_id         :integer          not null
+#  created_at         :datetime         not null
+#  updated_at         :datetime         not null
+#  history_started_at :datetime         not null
+#  history_ended_at   :datetime
+#  history_user_id    :integer
+#  snapshot_id        :string
+#
 module EasyML
   class SplitterHistory < ActiveRecord::Base
     self.table_name = "easy_ml_splitter_histories"

data/config/initializers/zhong.rb CHANGED Viewed

@@ -7,5 +7,9 @@ if %w[zhong:start].include?(ARGV.first)
     every 1.hour, "cleanup" do
       EasyML::CleanJob.perform_later
     end
+    every 1.hour, "cleanup" do
+      EasyML::ScheduleRetrainingJob.perform_later
+    end
   end
 end

data/lib/easy_ml/data/date_converter.rb CHANGED Viewed

@@ -3,6 +3,7 @@ module EasyML
     module DateConverter
       COMMON_DATE_FORMATS = [
         "%Y-%m-%dT%H:%M:%S.%6N",   # e.g., "2021-01-01T00:00:00.000000"
+        "%Y-%m-%d %H:%M:%S.%L Z", # e.g., "2025-01-03 23:04:49.492 Z"
         "%Y-%m-%d %H:%M:%S.%L",    # e.g., "2021-01-01 00:01:36.000"
         "%Y-%m-%d %H:%M:%S.%L",   # e.g., "2021-01-01 00:01:36.000"
         "%Y-%m-%d %H:%M:%S",      # e.g., "2021-01-01 00:01:36"