RubyGems - easy_ml - Versions diffs - 0.2.0.pre.rc101 → 0.2.0.pre.rc103 - Mend

easy_ml 0.2.0.pre.rc101 → 0.2.0.pre.rc103

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (39)

data/lib/easy_ml/data/dataset_manager/schema/normalizer.rb ADDED Viewed

@@ -0,0 +1,201 @@
+module EasyML
+  module Data
+    class DatasetManager
+      class Schema
+        class Normalizer
+          attr_accessor :files
+          def initialize(files)
+            @files = files
+          end
+          def normalize
+            shared_schema = find_common_schema(files)
+            if schema_changed?(files, shared_schema)
+              queries = schema_to_queries(shared_schema)
+              rewrite_dataset(files, queries)
+            end
+            queries = improve_schema(files, shared_schema)
+            if queries.any?
+              rewrite_dataset(files, queries)
+            end
+            files
+          end
+        private
+          def schema_changed?(files, schema)
+            Polars.scan_parquet(files.first).schema != schema
+          end
+          def rewrite_dataset(files, queries)
+            files.each do |file|
+              Polars.scan_parquet(file).select(queries).collect.write_parquet("#{file}_normalized.parquet")
+              puts "Rewriting #{file}..."
+              File.delete(file)
+              FileUtils.mv("#{file}_normalized.parquet", file)
+            end
+          end
+          def improve_schema(files, schema)
+            checks = schema_checks(schema)
+            return [] unless checks.any?
+            improvements = Polars.scan_parquet(files).select(checks).collect
+            conversions = improvements.to_hashes&.first || []
+            return [] unless conversions.any?
+            conversions = conversions&.select { |k,v| v }
+            return [] unless conversions.any?
+            conversions = conversions.reduce({}) do |hash, (k, _)|
+              hash.tap do
+                key, ruby_type = k.split("convert_").last.split("_to_")
+                conversion = case ruby_type
+                            when "int"
+                              Polars.col(key).cast(Polars::Int64).alias(key)
+                            else
+                              EasyML::Data::DateConverter.conversion(k)
+                            end
+                hash[key] = conversion
+              end
+            end
+            schema.map do |k, v|
+              conversions[k] || Polars.col(k).cast(v).alias(k)
+            end
+          end
+          def schema_to_queries(schema)
+            schema.map do |k, v|
+              Polars.col(k).cast(v).alias(k)
+            end
+          end
+          def schema_checks(schema)
+            schema.flat_map do |key, value|
+              case value
+              when Polars::FloatType, Polars::Decimal
+                Polars.col(key).cast(Polars::Int64).cast(value).eq(Polars.col(key)).all().alias("convert_#{key}_to_int")
+              when Polars::String
+                EasyML::Data::DateConverter.queries(key)
+              end
+            end.compact
+          end
+          # Function to find a common schema across multiple parquet files
+          def find_common_schema(parquet_files)
+            # Get schema from each file
+            schemas = []
+            parquet_files.each do |file|
+              begin
+                # Read just the schema without loading data
+                schema = Polars.scan_parquet(file).schema
+                schemas << schema
+              rescue => e
+                puts "Warning: Error reading schema from #{file}: #{e.message}"
+              end
+            end
+            # Find common schema - start with first file's schema
+            return {} if schemas.empty?
+            key_count = Hash.new(0)
+            common_schema = schemas.first
+            # Reconcile types across all schemas
+            schemas.each do |schema|
+              schema.each do |name, dtype|
+                key_count[name] += 1
+                if common_schema.key?(name)
+                  # If types don't match, choose the more general type
+                  if common_schema[name] != dtype
+                    common_schema[name] = choose_compatible_type(common_schema[name], dtype)
+                  end
+                end
+              end
+            end
+            # Filter out columns that aren't present in all files
+            common_schema = common_schema.select { |name, _| key_count[name] == schemas.length }
+            return common_schema
+          end
+          # Choose a type that's compatible with both input types
+          def choose_compatible_type(type1, type2)
+            # Integer types - use the larger of the two
+            int_types = [Polars::Int8, Polars::Int16, Polars::Int32, Polars::Int64]
+            # If both are integers, choose the larger one
+            if int_types.include?(type1.class) && int_types.include?(type2.class)
+              return [type1, type2].max_by { |t| int_types.index(t.class) }
+            end
+            # If one is Int64 and one is Decimal with scale 0, use Decimal
+            if (type1.is_a?(Polars::Int64) && type2.is_a?(Polars::Decimal) && type2.scale == 0) ||
+              (type2.is_a?(Polars::Int64) && type1.is_a?(Polars::Decimal) && type1.scale == 0)
+              return type1.is_a?(Polars::Decimal) ? type1 : type2
+            end
+            # If types are drastically different, convert to string as a safe fallback
+            if [Polars::String, Polars::Categorical].include?(type1.class) ||
+              [Polars::String, Polars::Categorical].include?(type2.class)
+              return Polars::String.new
+            end
+            # For float vs decimal, choose decimal if it has scale > 0
+            if (type1.is_a?(Polars::Float64) && type2.is_a?(Polars::Decimal) && type2.scale > 0) ||
+              (type2.is_a?(Polars::Float64) && type1.is_a?(Polars::Decimal) && type1.scale > 0)
+              return type1.is_a?(Polars::Decimal) ? type1 : type2
+            end
+            # Default to Float64 for numeric type conflicts
+            if [Polars::Float32, Polars::Float64, Polars::Decimal, Polars::Int64].any? { |t| type1.is_a?(t) } &&
+              [Polars::Float32, Polars::Float64, Polars::Decimal, Polars::Int64].any? { |t| type2.is_a?(t) }
+              return Polars::Float64.new
+            end
+            # Fallback - use first type
+            return type1
+          end
+          # Apply a common schema to read all parquet files
+          def read_with_common_schema(parquet_files)
+            schema = find_common_schema(parquet_files)
+            return Polars.scan_parquet(parquet_files).with_schema(schema).collect
+          end
+          # Alternative approach using a union scan
+          def union_scan_parquet(parquet_files)
+            if parquet_files.empty?
+              return Polars.DataFrame.new
+            end
+            # Create separate scans with explicit schemas
+            scans = []
+            schema = find_common_schema(parquet_files)
+            parquet_files.each do |file|
+              scans << Polars.scan_parquet(file).with_schema(schema)
+            end
+            # Union all scans
+            if scans.length == 1
+              return scans.first.collect
+            else
+              # Combine using concat (union all)
+              union = scans.first
+              scans[1..-1].each do |scan|
+                union = union.concat(scan)
+              end
+              return union.collect
+            end
+          end
+        end
+      end
+    end
+  end
+end

data/lib/easy_ml/data/dataset_manager/schema.rb ADDED Viewed

@@ -0,0 +1,9 @@
+module EasyML
+  module Data
+    class DatasetManager
+      class Schema
+        require_relative "schema/normalizer"
+      end
+    end
+  end
+end

data/lib/easy_ml/data/dataset_manager.rb CHANGED Viewed

@@ -3,6 +3,7 @@ module EasyML
     class DatasetManager
       require_relative "dataset_manager/writer"
       require_relative "dataset_manager/reader"
+      require_relative "dataset_manager/schema"
       attr_accessor :root_dir, :partition, :append_only, :filenames, :primary_key,
                     :partition_size, :s3_bucket, :s3_prefix, :s3_access_key_id,
@@ -55,6 +56,10 @@ module EasyML
         def cp(from, to)
           Writer.cp(from, to)
         end
+        def normalize_schema(files)
+          Schema::Normalizer.new(files).normalize
+        end
       end
       def list_nulls(input = nil, **kwargs, &block)

data/lib/easy_ml/data/date_converter.rb CHANGED Viewed

@@ -2,10 +2,10 @@ module EasyML
   module Data
     module DateConverter
       COMMON_DATE_FORMATS = [
+        "%Y-%m-%d %H:%M:%S.%f %Z",
         "%Y-%m-%dT%H:%M:%S.%6N",   # e.g., "2021-01-01T00:00:00.000000"
         "%Y-%m-%d %H:%M:%S.%L Z",   # e.g., "2025-01-03 23:04:49.492 Z"
         "%Y-%m-%d %H:%M:%S.%L",     # e.g., "2021-01-01 00:01:36.000"
-        "%Y-%m-%d %H:%M:%S.%L",     # duplicate format intentionally
         "%Y-%m-%d %H:%M:%S",        # e.g., "2021-01-01 00:01:36"
         "%Y-%m-%d %H:%M",           # e.g., "2021-01-01 00:01"
         "%Y-%m-%d",                 # e.g., "2021-01-01"
@@ -19,179 +19,38 @@ module EasyML
         "%Y/%m/%d",                # e.g., "2021/01/01"
       ].freeze
-      FORMAT_MAPPINGS = {
-        ruby_to_polars: {
-          "%L" => "%3f",  # milliseconds
-          "%6N" => "%6f",  # microseconds
-          "%N" => "%9f",  # nanoseconds
-        },
-      }.freeze
-      class << self
-        # Infers a strftime format string from the given date string.
-        #
-        # @param date_str [String] The date string to analyze.
-        # @return [String, nil] The corresponding strftime format if recognized, or nil if not.
-        def infer_strftime_format(date_str)
-          return nil if date_str.blank?
-          # YYYY-MM-DD (e.g., "2021-01-01")
-          return "%Y-%m-%d" if date_str =~ /^\d{4}-\d{2}-\d{2}$/
-          # YYYY/MM/DD (e.g., "2021/01/01")
-          return "%Y/%m/%d" if date_str =~ /^\d{4}\/\d{2}\/\d{2}$/
-          # Date & time with T separator (ISO 8601-like)
-          if date_str.include?("T")
-            # Without fractional seconds, e.g., "2021-01-01T12:34:56"
-            return "%Y-%m-%dT%H:%M:%S" if date_str =~ /^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}$/
-            # With fractional seconds, e.g., "2021-01-01T12:34:56.789" or "2021-01-01T12:34:56.123456"
-            if date_str =~ /^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.(\d+)$/
-              fraction = Regexp.last_match(1)
-              case fraction.length
-              when 3 then return "%Y-%m-%dT%H:%M:%S.%L"  # milliseconds
-              when 6 then return "%Y-%m-%dT%H:%M:%S.%6N" # microseconds
-              when 9 then return "%Y-%m-%dT%H:%M:%S.%N"  # nanoseconds
-              else
-                # Fallback if fractional part has unexpected length:
-                return "%Y-%m-%dT%H:%M:%S.%N"
-              end
-            end
-          end
-          # Date & time with space separator
-          if date_str.include?(" ")
-            # Without fractional seconds, e.g., "2021-01-01 12:34:56"
-            return "%Y-%m-%d %H:%M:%S" if date_str =~ /^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}$/
-            # With fractional seconds, e.g., "2021-01-01 12:34:56.789"
-            if date_str =~ /^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\.(\d+)$/
-              fraction = Regexp.last_match(1)
-              case fraction.length
-              when 3 then return "%Y-%m-%d %H:%M:%S.%L"
-              when 6 then return "%Y-%m-%d %H:%M:%S.%6N"
-              when 9 then return "%Y-%m-%d %H:%M:%S.%N"
-              else
-                return "%Y-%m-%d %H:%M:%S.%N"
-              end
-            end
-          end
-          # Common US-style formats
-          # MM/DD/YYYY (e.g., "01/31/2021")
-          return "%m/%d/%Y" if date_str =~ /^\d{2}\/\d{2}\/\d{4}$/
-          # DD-MM-YYYY (e.g., "31-01-2021")
-          return "%d-%m-%Y" if date_str =~ /^\d{2}-\d{2}-\d{4}$/
-          # DD-Mon-YYYY (e.g., "31-Jan-2021")
-          return "%d-%b-%Y" if date_str =~ /^\d{2}-[A-Za-z]{3}-\d{4}$/
-          # Mon DD, YYYY (e.g., "Jan 31, 2021")
-          return "%b %d, %Y" if date_str =~ /^[A-Za-z]{3} \d{2}, \d{4}$/
-          # Could add additional heuristics as needed...
-          nil  # Return nil if no known format matches.
+      def self.maybe_convert_date(df, column = nil)
+        column = column.to_s if column.present?
+        if df.is_a?(Polars::Series)
+          column = "temp" if column.nil?
+          df = Polars::DataFrame.new({ column.to_s => df })
         end
+        return df unless df.columns.include?(column)
+        return df if df[column].dtype.is_a?(Polars::Datetime)
-        # Attempts to convert a string column to datetime if it appears to be a date.
-        # @param df [Polars::DataFrame] The dataframe containing the series.
-        # @param column [String] The name of the column to convert.
-        # @return [Polars::DataFrame] The dataframe with the converted column (if successful).
-        def maybe_convert_date(df, column = nil)
-          if column.nil?
-            series = df
-            column = series.name
-            df = Polars::DataFrame.new(series)
-          else
-            series = df[column]
-          end
-          return df if series.dtype.is_a?(Polars::Datetime)
-          return df unless series.dtype == Polars::Utf8
-          sample = series.filter(series.is_not_null).head(100).to_a
-          ruby_format = detect_date_format(sample)
+        conversions = df.select(queries(column)).to_hashes&.first || []
+        return df unless conversions.any?
-          if ruby_format
-            format = convert_format(:ruby_to_polars, ruby_format)
-            df = try_format(df, column, format)
+        conversions = conversions.select { |k, v| v }
+        return df unless conversions.any?
-            if df.filter(Polars.col("TRY").is_null).count > df.filter(Polars.col(column.to_s).is_null).count
-              df = df.drop("TRY")
-              best_format = df[column.to_s][0..100].to_a.count_by do |date_str|
-                infer_strftime_format(date_str)
-              end.max_by { |_format, count| count }[0]
-              df = try_format(df, column, best_format)
-            end
-            df = df.with_column(df["TRY"].alias(column.to_s)).drop("TRY")
-          end
-          df
-        end
-        private
-        def try_format(df, column, format)
-          df = df.with_column(
-            Polars.col(column.to_s)
-              .str
-              .strptime(Polars::Datetime, format, strict: false)
-              .alias("TRY")
-          )
-        end
-        def detect_polars_format(series)
-          return nil unless series.is_a?(Polars::Series)
-          sample = series.filter(series.is_not_null).head(100).to_a
-          ruby_format = detect_date_format(sample)
-          convert_format(:ruby_to_polars, ruby_format)
+        conversions.map do |k, _|
+          conversion = conversion(k)
+          df = df.with_columns(conversion)
         end
-        def detect_date_format(date_strings)
-          return nil if date_strings.empty?
-          sample = date_strings.compact.sample([100, date_strings.length].min)
-          best_format = nil
-          best_success_rate = 0.0
-          sample_count = sample.length
-          COMMON_DATE_FORMATS.each do |fmt|
-            success_count = sample.count do |date_str|
-              begin
-                DateTime.strptime(date_str, fmt)
-                true
-              rescue StandardError
-                false
-              end
-            end
-            success_rate = success_count.to_f / sample_count
-            if success_rate > best_success_rate
-              best_success_rate = success_rate
-              best_format = fmt
-            end
-            # If every sample string matches this format, return it immediately.
-            return fmt if success_rate == 1.0
-          end
+        df
+      end
-          best_success_rate >= 0.8 ? best_format : nil
+      def self.queries(column)
+        COMMON_DATE_FORMATS.map do |format|
+          Polars.col(column).cast(Polars::String).str.strptime(Polars::Datetime, format, strict: false).is_not_null().all().alias("convert_#{column}_to_#{format}")
         end
+      end
-        def convert_format(conversion, format)
-          return nil if format.nil?
-          result = format.dup
-          FORMAT_MAPPINGS[conversion].each do |from, to|
-            result = result.gsub(from, to)
-          end
-          result
-        end
+      def self.conversion(key)
+        key, ruby_type = key.split("convert_").last.split("_to_")
+        Polars.col(key).cast(Polars::String).str.strptime(Polars::Datetime, ruby_type, strict: false).cast(Polars::Datetime).alias(key)
       end
     end
   end

data/lib/easy_ml/data/polars_column.rb CHANGED Viewed

@@ -19,8 +19,9 @@ module EasyML
         Polars::Decimal => :float,
         Polars::Float64 => :float,
         Polars::Int64 => :integer,
-        Polars::Float32 => :float,
         Polars::Int32 => :integer,
+        Polars::Int8 => :integer,
+        Polars::Float32 => :float,
         Polars::Boolean => :boolean,
         Polars::Datetime => :datetime,
         Polars::Date => :date,
@@ -138,8 +139,9 @@ module EasyML
             return :numeric
           rescue StandardError
             # If not numeric, check for datetime or categorical
-            if EasyML::Data::DateConverter.maybe_convert_date(Polars::DataFrame.new({ temp: series }),
+            is_datetime = EasyML::Data::DateConverter.maybe_convert_date(Polars::DataFrame.new({ temp: series }),
                                                               :temp)[:temp].dtype.is_a?(Polars::Datetime)
+            if is_datetime
               :datetime
             else
               categorical_or_text?(series)

data/lib/easy_ml/data/polars_reader.rb CHANGED Viewed

@@ -175,7 +175,7 @@ module EasyML
           end
           combined_lazy_df = combined_lazy_df.with_columns(
             cast.map do |col, dtype|
-              Polars.col(col).cast(dtype).alias(col)
+              Polars.col(col).cast(dtype, strict: false).alias(col)
             end
           )
         end
@@ -319,7 +319,10 @@ module EasyML
       def learn_dataset
         return schema if schema.present?
-        existing_schema = existing_parquet_schema
+        if parquet_files.present?
+          EasyML::Data::DatasetManager.normalize_schema(parquet_files)
+          existing_schema = existing_parquet_schema
+        end
         schema = existing_schema || normalize_dataset
         self.schema = schema

data/lib/easy_ml/engine.rb CHANGED Viewed

@@ -123,6 +123,10 @@ module EasyML
       end
     end
+    config.after_initialize do
+      Dir.glob(Rails.root.join("app/datasets/**/*.rb")).each { |f| require f }
+    end
     if ENV["EASY_ML_DEV"]
       require "vite_ruby"
       require "vite_rails"

data/lib/easy_ml/railtie/generators/migration/migration_generator.rb CHANGED Viewed

@@ -60,6 +60,7 @@ module EasyML
             add_workflow_status_to_easy_ml_dataset_histories
             add_metadata_to_easy_ml_predictions
             add_unique_constraint_to_dataset_names
+            add_view_class_to_easy_ml_datasets
           ].freeze
           # Specify the next migration number

data/lib/easy_ml/railtie/templates/migration/add_view_class_to_easy_ml_datasets.rb.tt ADDED Viewed

@@ -0,0 +1,9 @@
+class AddViewClassToEasyMLDatasets < ActiveRecord::Migration[<%= ActiveRecord::Migration.current_version %>]
+  def change
+    add_column :easy_ml_datasets, :view_class, :string
+    add_index :easy_ml_datasets, :view_class
+    add_column :easy_ml_dataset_histories, :view_class, :string
+    add_index :easy_ml_dataset_histories, :view_class
+  end
+end

data/lib/easy_ml/version.rb CHANGED Viewed

@@ -1,7 +1,7 @@
 # frozen_string_literal: true
 module EasyML
-  VERSION = "0.2.0-rc101"
+  VERSION = "0.2.0-rc103"
   module Version
   end

data/public/easy_ml/assets/.vite/manifest.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "entrypoints/Application.tsx": {
-    "file": "assets/entrypoints/Application.tsx-BXwsBCuQ.js",
+    "file": "assets/entrypoints/Application.tsx-gkZ77wo8.js",
     "name": "entrypoints/Application.tsx",
     "src": "entrypoints/Application.tsx",
     "isEntry": true,