RubyGems - easy_ml - Versions diffs - 0.2.0.pre.rc100 → 0.2.0.pre.rc102 - Mend

easy_ml 0.2.0.pre.rc100 → 0.2.0.pre.rc102

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (35)

checksums.yaml +4 -4
data/app/controllers/easy_ml/datasets_controller.rb +1 -0
data/app/frontend/components/dataset/splitters/types.ts +3 -4
data/app/frontend/pages/NewDatasetPage.tsx +17 -0
data/app/frontend/types/datasource.ts +14 -6
data/app/models/easy_ml/column/imputers/base.rb +3 -1
data/app/models/easy_ml/column.rb +8 -0
data/app/models/easy_ml/dataset/learner/lazy.rb +16 -3
data/app/models/easy_ml/dataset.rb +47 -9
data/app/models/easy_ml/dataset_history.rb +1 -0
data/app/models/easy_ml/feature.rb +5 -13
data/app/models/easy_ml/lineage.rb +2 -1
data/app/models/easy_ml/models/xgboost/evals_callback.rb +1 -0
data/app/models/easy_ml/models/xgboost.rb +7 -2
data/app/models/easy_ml/prediction.rb +1 -1
data/app/models/easy_ml/splitters/base_splitter.rb +4 -8
data/app/models/easy_ml/splitters/date_splitter.rb +2 -1
data/app/models/easy_ml/splitters/predefined_splitter.rb +8 -3
data/config/initializers/zhong.rb +6 -0
data/lib/easy_ml/data/dataset_manager/schema/normalizer.rb +201 -0
data/lib/easy_ml/data/dataset_manager/schema.rb +9 -0
data/lib/easy_ml/data/dataset_manager.rb +5 -0
data/lib/easy_ml/data/date_converter.rb +24 -165
data/lib/easy_ml/data/polars_column.rb +5 -2
data/lib/easy_ml/data/polars_reader.rb +4 -1
data/lib/easy_ml/data/synced_directory.rb +36 -23
data/lib/easy_ml/engine.rb +4 -0
data/lib/easy_ml/railtie/generators/migration/migration_generator.rb +1 -0
data/lib/easy_ml/railtie/templates/migration/add_view_class_to_easy_ml_datasets.rb.tt +9 -0
data/lib/easy_ml/version.rb +1 -1
data/public/easy_ml/assets/.vite/manifest.json +1 -1
data/public/easy_ml/assets/assets/entrypoints/{Application.tsx-BXwsBCuQ.js → Application.tsx-CRS5bRgw.js} +8 -8
data/public/easy_ml/assets/assets/entrypoints/{Application.tsx-BXwsBCuQ.js.map → Application.tsx-CRS5bRgw.js.map} +1 -1
metadata +7 -5
data/lib/easy_ml/data/dataset_manager/normalizer.rb +0 -0

data/lib/easy_ml/data/dataset_manager/schema.rb ADDED Viewed

@@ -0,0 +1,9 @@
+module EasyML
+  module Data
+    class DatasetManager
+      class Schema
+        require_relative "schema/normalizer"
+      end
+    end
+  end
+end

data/lib/easy_ml/data/dataset_manager.rb CHANGED Viewed

@@ -3,6 +3,7 @@ module EasyML
     class DatasetManager
       require_relative "dataset_manager/writer"
       require_relative "dataset_manager/reader"
+      require_relative "dataset_manager/schema"
       attr_accessor :root_dir, :partition, :append_only, :filenames, :primary_key,
                     :partition_size, :s3_bucket, :s3_prefix, :s3_access_key_id,
@@ -55,6 +56,10 @@ module EasyML
         def cp(from, to)
           Writer.cp(from, to)
         end
+        def normalize_schema(files)
+          Schema::Normalizer.new(files).normalize
+        end
       end
       def list_nulls(input = nil, **kwargs, &block)

data/lib/easy_ml/data/date_converter.rb CHANGED Viewed

@@ -2,10 +2,10 @@ module EasyML
   module Data
     module DateConverter
       COMMON_DATE_FORMATS = [
+        "%Y-%m-%d %H:%M:%S.%f %Z",
         "%Y-%m-%dT%H:%M:%S.%6N",   # e.g., "2021-01-01T00:00:00.000000"
         "%Y-%m-%d %H:%M:%S.%L Z",   # e.g., "2025-01-03 23:04:49.492 Z"
         "%Y-%m-%d %H:%M:%S.%L",     # e.g., "2021-01-01 00:01:36.000"
-        "%Y-%m-%d %H:%M:%S.%L",     # duplicate format intentionally
         "%Y-%m-%d %H:%M:%S",        # e.g., "2021-01-01 00:01:36"
         "%Y-%m-%d %H:%M",           # e.g., "2021-01-01 00:01"
         "%Y-%m-%d",                 # e.g., "2021-01-01"
@@ -19,179 +19,38 @@ module EasyML
         "%Y/%m/%d",                # e.g., "2021/01/01"
       ].freeze
-      FORMAT_MAPPINGS = {
-        ruby_to_polars: {
-          "%L" => "%3f",  # milliseconds
-          "%6N" => "%6f",  # microseconds
-          "%N" => "%9f",  # nanoseconds
-        },
-      }.freeze
-      class << self
-        # Infers a strftime format string from the given date string.
-        #
-        # @param date_str [String] The date string to analyze.
-        # @return [String, nil] The corresponding strftime format if recognized, or nil if not.
-        def infer_strftime_format(date_str)
-          return nil if date_str.blank?
-          # YYYY-MM-DD (e.g., "2021-01-01")
-          return "%Y-%m-%d" if date_str =~ /^\d{4}-\d{2}-\d{2}$/
-          # YYYY/MM/DD (e.g., "2021/01/01")
-          return "%Y/%m/%d" if date_str =~ /^\d{4}\/\d{2}\/\d{2}$/
-          # Date & time with T separator (ISO 8601-like)
-          if date_str.include?("T")
-            # Without fractional seconds, e.g., "2021-01-01T12:34:56"
-            return "%Y-%m-%dT%H:%M:%S" if date_str =~ /^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}$/
-            # With fractional seconds, e.g., "2021-01-01T12:34:56.789" or "2021-01-01T12:34:56.123456"
-            if date_str =~ /^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.(\d+)$/
-              fraction = Regexp.last_match(1)
-              case fraction.length
-              when 3 then return "%Y-%m-%dT%H:%M:%S.%L"  # milliseconds
-              when 6 then return "%Y-%m-%dT%H:%M:%S.%6N" # microseconds
-              when 9 then return "%Y-%m-%dT%H:%M:%S.%N"  # nanoseconds
-              else
-                # Fallback if fractional part has unexpected length:
-                return "%Y-%m-%dT%H:%M:%S.%N"
-              end
-            end
-          end
-          # Date & time with space separator
-          if date_str.include?(" ")
-            # Without fractional seconds, e.g., "2021-01-01 12:34:56"
-            return "%Y-%m-%d %H:%M:%S" if date_str =~ /^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}$/
-            # With fractional seconds, e.g., "2021-01-01 12:34:56.789"
-            if date_str =~ /^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\.(\d+)$/
-              fraction = Regexp.last_match(1)
-              case fraction.length
-              when 3 then return "%Y-%m-%d %H:%M:%S.%L"
-              when 6 then return "%Y-%m-%d %H:%M:%S.%6N"
-              when 9 then return "%Y-%m-%d %H:%M:%S.%N"
-              else
-                return "%Y-%m-%d %H:%M:%S.%N"
-              end
-            end
-          end
-          # Common US-style formats
-          # MM/DD/YYYY (e.g., "01/31/2021")
-          return "%m/%d/%Y" if date_str =~ /^\d{2}\/\d{2}\/\d{4}$/
-          # DD-MM-YYYY (e.g., "31-01-2021")
-          return "%d-%m-%Y" if date_str =~ /^\d{2}-\d{2}-\d{4}$/
-          # DD-Mon-YYYY (e.g., "31-Jan-2021")
-          return "%d-%b-%Y" if date_str =~ /^\d{2}-[A-Za-z]{3}-\d{4}$/
-          # Mon DD, YYYY (e.g., "Jan 31, 2021")
-          return "%b %d, %Y" if date_str =~ /^[A-Za-z]{3} \d{2}, \d{4}$/
-          # Could add additional heuristics as needed...
-          nil  # Return nil if no known format matches.
+      def self.maybe_convert_date(df, column = nil)
+        column = column.to_s if column.present?
+        if df.is_a?(Polars::Series)
+          column = "temp" if column.nil?
+          df = Polars::DataFrame.new({ column.to_s => df })
         end
+        return df unless df.columns.include?(column)
+        return df if df[column].dtype.is_a?(Polars::Datetime)
-        # Attempts to convert a string column to datetime if it appears to be a date.
-        # @param df [Polars::DataFrame] The dataframe containing the series.
-        # @param column [String] The name of the column to convert.
-        # @return [Polars::DataFrame] The dataframe with the converted column (if successful).
-        def maybe_convert_date(df, column = nil)
-          if column.nil?
-            series = df
-            column = series.name
-            df = Polars::DataFrame.new(series)
-          else
-            series = df[column]
-          end
-          return df if series.dtype.is_a?(Polars::Datetime)
-          return df unless series.dtype == Polars::Utf8
-          sample = series.filter(series.is_not_null).head(100).to_a
-          ruby_format = detect_date_format(sample)
+        conversions = df.select(queries(column)).to_hashes&.first || []
+        return df unless conversions.any?
-          if ruby_format
-            format = convert_format(:ruby_to_polars, ruby_format)
-            df = try_format(df, column, format)
+        conversions = conversions.select { |k, v| v }
+        return df unless conversions.any?
-            if df.filter(Polars.col("TRY").is_null).count > df.filter(Polars.col(column.to_s).is_null).count
-              df = df.drop("TRY")
-              best_format = df[column.to_s][0..100].to_a.count_by do |date_str|
-                infer_strftime_format(date_str)
-              end.max_by { |_format, count| count }[0]
-              df = try_format(df, column, best_format)
-            end
-            df = df.with_column(df["TRY"].alias(column.to_s)).drop("TRY")
-          end
-          df
-        end
-        private
-        def try_format(df, column, format)
-          df = df.with_column(
-            Polars.col(column.to_s)
-              .str
-              .strptime(Polars::Datetime, format, strict: false)
-              .alias("TRY")
-          )
-        end
-        def detect_polars_format(series)
-          return nil unless series.is_a?(Polars::Series)
-          sample = series.filter(series.is_not_null).head(100).to_a
-          ruby_format = detect_date_format(sample)
-          convert_format(:ruby_to_polars, ruby_format)
+        conversions.map do |k, _|
+          conversion = conversion(k)
+          df = df.with_columns(conversion)
         end
-        def detect_date_format(date_strings)
-          return nil if date_strings.empty?
-          sample = date_strings.compact.sample([100, date_strings.length].min)
-          best_format = nil
-          best_success_rate = 0.0
-          sample_count = sample.length
-          COMMON_DATE_FORMATS.each do |fmt|
-            success_count = sample.count do |date_str|
-              begin
-                DateTime.strptime(date_str, fmt)
-                true
-              rescue StandardError
-                false
-              end
-            end
-            success_rate = success_count.to_f / sample_count
-            if success_rate > best_success_rate
-              best_success_rate = success_rate
-              best_format = fmt
-            end
-            # If every sample string matches this format, return it immediately.
-            return fmt if success_rate == 1.0
-          end
+        df
+      end
-          best_success_rate >= 0.8 ? best_format : nil
+      def self.queries(column)
+        COMMON_DATE_FORMATS.map do |format|
+          Polars.col(column).cast(Polars::String).str.strptime(Polars::Datetime, format, strict: false).is_not_null().all().alias("convert_#{column}_to_#{format}")
         end
+      end
-        def convert_format(conversion, format)
-          return nil if format.nil?
-          result = format.dup
-          FORMAT_MAPPINGS[conversion].each do |from, to|
-            result = result.gsub(from, to)
-          end
-          result
-        end
+      def self.conversion(key)
+        key, ruby_type = key.split("convert_").last.split("_to_")
+        Polars.col(key).cast(Polars::String).str.strptime(Polars::Datetime, ruby_type, strict: false).cast(Polars::Datetime).alias(key)
       end
     end
   end

data/lib/easy_ml/data/polars_column.rb CHANGED Viewed

@@ -16,10 +16,12 @@ module EasyML
         array: Polars::List,
       }
       POLARS_MAP = {
+        Polars::Decimal => :float,
         Polars::Float64 => :float,
         Polars::Int64 => :integer,
-        Polars::Float32 => :float,
         Polars::Int32 => :integer,
+        Polars::Int8 => :integer,
+        Polars::Float32 => :float,
         Polars::Boolean => :boolean,
         Polars::Datetime => :datetime,
         Polars::Date => :date,
@@ -137,8 +139,9 @@ module EasyML
             return :numeric
           rescue StandardError
             # If not numeric, check for datetime or categorical
-            if EasyML::Data::DateConverter.maybe_convert_date(Polars::DataFrame.new({ temp: series }),
+            is_datetime = EasyML::Data::DateConverter.maybe_convert_date(Polars::DataFrame.new({ temp: series }),
                                                               :temp)[:temp].dtype.is_a?(Polars::Datetime)
+            if is_datetime
               :datetime
             else
               categorical_or_text?(series)

data/lib/easy_ml/data/polars_reader.rb CHANGED Viewed

@@ -319,7 +319,10 @@ module EasyML
       def learn_dataset
         return schema if schema.present?
-        existing_schema = existing_parquet_schema
+        if parquet_files.present?
+          EasyML::Data::DatasetManager.normalize_schema(parquet_files)
+          existing_schema = existing_parquet_schema
+        end
         schema = existing_schema || normalize_dataset
         self.schema = schema

data/lib/easy_ml/data/synced_directory.rb CHANGED Viewed

@@ -126,7 +126,7 @@ module EasyML
         )
         Rails.logger.info("Downloaded #{object.key} to #{local_file_path}")
-        if object.key.end_with?(".gz")
+        if object.key.end_with?(".gz") && !object.key.end_with?(".parquet.gz")
           ungzipped_file_path = ungzip_file(local_file_path)
           Rails.logger.info("Ungzipped to #{ungzipped_file_path}")
         end
@@ -284,48 +284,61 @@ module EasyML
         relative_path = Pathname.new(file_path).relative_path_from(Pathname.new(root_dir)).to_s
         s3_key = s3_prefix.present? ? File.join(s3_prefix, File.basename(relative_path)) : relative_path
-        # Create a temporary gzipped version of the file
-        gzipped_file_path = "#{file_path}.gz"
         begin
-          Rails.logger.info("Compressing and uploading #{file_path} to s3://#{s3_bucket}/#{s3_key}")
+          Rails.logger.info("Uploading #{file_path} to s3://#{s3_bucket}/#{s3_key}")
-          # Compress the file
-          Zlib::GzipWriter.open(gzipped_file_path) do |gz|
+          if file_path.end_with?(".parquet")
+            # Upload parquet files directly without compression
             File.open(file_path, "rb") do |file|
-              gz.write(file.read)
+              s3.put_object(
+                bucket: s3_bucket,
+                key: s3_key,
+                body: file
+              )
+            end
+            Rails.logger.info("Successfully uploaded #{file_path} to s3://#{s3_bucket}/#{s3_key}")
+          else
+            # Create a temporary gzipped version of the file
+            gzipped_file_path = "#{file_path}.gz"
+            # Compress the file
+            Zlib::GzipWriter.open(gzipped_file_path) do |gz|
+              File.open(file_path, "rb") do |file|
+                gz.write(file.read)
+              end
             end
-          end
-          # Upload the gzipped file
-          File.open(gzipped_file_path, "rb") do |file|
-            s3.put_object(
-              bucket: s3_bucket,
-              key: "#{s3_key}.gz",
-              body: file,
-              content_encoding: "gzip",
-            )
-          end
+            # Upload the gzipped file
+            File.open(gzipped_file_path, "rb") do |file|
+              s3.put_object(
+                bucket: s3_bucket,
+                key: "#{s3_key}.gz",
+                body: file,
+                content_encoding: "gzip",
+              )
+            end
+            Rails.logger.info("Successfully uploaded #{file_path} to s3://#{s3_bucket}/#{s3_key}.gz")
-          Rails.logger.info("Successfully uploaded #{file_path} to s3://#{s3_bucket}/#{s3_key}.gz")
+            # Clean up temporary gzipped file
+            File.delete(gzipped_file_path) if File.exist?(gzipped_file_path)
+          end
         rescue Aws::S3::Errors::ServiceError, StandardError => e
           Rails.logger.error("Failed to upload #{file_path}: #{e.message}")
           raise e
-        ensure
-          # Clean up temporary gzipped file
-          File.delete(gzipped_file_path) if File.exist?(gzipped_file_path)
         end
       end
       def should_upload?(file_path)
         relative_path = Pathname.new(file_path).relative_path_from(Pathname.new(root_dir)).to_s
         s3_key = s3_prefix.present? ? File.join(s3_prefix, relative_path) : relative_path
+        s3_key = "#{s3_key}.gz" unless file_path.end_with?(".parquet")
         begin
           # Check if file exists in S3
           response = s3.head_object(
             bucket: s3_bucket,
-            key: "#{s3_key}.gz",
+            key: s3_key,
           )
           # Compare modification times

data/lib/easy_ml/engine.rb CHANGED Viewed

@@ -123,6 +123,10 @@ module EasyML
       end
     end
+    config.after_initialize do
+      Dir.glob(Rails.root.join("app/datasets/**/*.rb")).each { |f| require f }
+    end
     if ENV["EASY_ML_DEV"]
       require "vite_ruby"
       require "vite_rails"

data/lib/easy_ml/railtie/generators/migration/migration_generator.rb CHANGED Viewed

@@ -60,6 +60,7 @@ module EasyML
             add_workflow_status_to_easy_ml_dataset_histories
             add_metadata_to_easy_ml_predictions
             add_unique_constraint_to_dataset_names
+            add_view_class_to_easy_ml_datasets
           ].freeze
           # Specify the next migration number

data/lib/easy_ml/railtie/templates/migration/add_view_class_to_easy_ml_datasets.rb.tt ADDED Viewed

@@ -0,0 +1,9 @@
+class AddViewClassToEasyMLDatasets < ActiveRecord::Migration[<%= ActiveRecord::Migration.current_version %>]
+  def change
+    add_column :easy_ml_datasets, :view_class, :string
+    add_index :easy_ml_datasets, :view_class
+    add_column :easy_ml_dataset_histories, :view_class, :string
+    add_index :easy_ml_dataset_histories, :view_class
+  end
+end

data/lib/easy_ml/version.rb CHANGED Viewed

@@ -1,7 +1,7 @@
 # frozen_string_literal: true
 module EasyML
-  VERSION = "0.2.0-rc100"
+  VERSION = "0.2.0-rc102"
   module Version
   end

data/public/easy_ml/assets/.vite/manifest.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "entrypoints/Application.tsx": {
-    "file": "assets/entrypoints/Application.tsx-BXwsBCuQ.js",
+    "file": "assets/entrypoints/Application.tsx-CRS5bRgw.js",
     "name": "entrypoints/Application.tsx",
     "src": "entrypoints/Application.tsx",
     "isEntry": true,