easy_ml 0.2.0.pre.rc101 → 0.2.0.pre.rc102

This diff shows the changes between two publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between the package versions as they appear in their respective public registries.
Files changed (33)
  1. checksums.yaml +4 -4
  2. data/app/controllers/easy_ml/datasets_controller.rb +1 -0
  3. data/app/frontend/components/dataset/splitters/types.ts +3 -4
  4. data/app/frontend/pages/NewDatasetPage.tsx +17 -0
  5. data/app/frontend/types/datasource.ts +14 -6
  6. data/app/models/easy_ml/column/imputers/base.rb +3 -1
  7. data/app/models/easy_ml/column.rb +8 -0
  8. data/app/models/easy_ml/dataset/learner/lazy.rb +16 -3
  9. data/app/models/easy_ml/dataset.rb +47 -9
  10. data/app/models/easy_ml/dataset_history.rb +1 -0
  11. data/app/models/easy_ml/feature.rb +5 -13
  12. data/app/models/easy_ml/lineage.rb +2 -1
  13. data/app/models/easy_ml/models/xgboost/evals_callback.rb +1 -0
  14. data/app/models/easy_ml/models/xgboost.rb +7 -2
  15. data/app/models/easy_ml/prediction.rb +1 -1
  16. data/app/models/easy_ml/splitters/base_splitter.rb +4 -8
  17. data/app/models/easy_ml/splitters/date_splitter.rb +2 -1
  18. data/app/models/easy_ml/splitters/predefined_splitter.rb +8 -3
  19. data/lib/easy_ml/data/dataset_manager/schema/normalizer.rb +201 -0
  20. data/lib/easy_ml/data/dataset_manager/schema.rb +9 -0
  21. data/lib/easy_ml/data/dataset_manager.rb +5 -0
  22. data/lib/easy_ml/data/date_converter.rb +24 -165
  23. data/lib/easy_ml/data/polars_column.rb +4 -2
  24. data/lib/easy_ml/data/polars_reader.rb +4 -1
  25. data/lib/easy_ml/engine.rb +4 -0
  26. data/lib/easy_ml/railtie/generators/migration/migration_generator.rb +1 -0
  27. data/lib/easy_ml/railtie/templates/migration/add_view_class_to_easy_ml_datasets.rb.tt +9 -0
  28. data/lib/easy_ml/version.rb +1 -1
  29. data/public/easy_ml/assets/.vite/manifest.json +1 -1
  30. data/public/easy_ml/assets/assets/entrypoints/{Application.tsx-BXwsBCuQ.js → Application.tsx-CRS5bRgw.js} +8 -8
  31. data/public/easy_ml/assets/assets/entrypoints/{Application.tsx-BXwsBCuQ.js.map → Application.tsx-CRS5bRgw.js.map} +1 -1
  32. metadata +7 -5
  33. data/lib/easy_ml/data/dataset_manager/normalizer.rb +0 -0
@@ -3,6 +3,7 @@ module EasyML
3
3
  class DatasetManager
4
4
  require_relative "dataset_manager/writer"
5
5
  require_relative "dataset_manager/reader"
6
+ require_relative "dataset_manager/schema"
6
7
 
7
8
  attr_accessor :root_dir, :partition, :append_only, :filenames, :primary_key,
8
9
  :partition_size, :s3_bucket, :s3_prefix, :s3_access_key_id,
@@ -55,6 +56,10 @@ module EasyML
55
56
  def cp(from, to)
56
57
  Writer.cp(from, to)
57
58
  end
59
+
60
+ def normalize_schema(files)
61
+ Schema::Normalizer.new(files).normalize
62
+ end
58
63
  end
59
64
 
60
65
  def list_nulls(input = nil, **kwargs, &block)
@@ -2,10 +2,10 @@ module EasyML
2
2
  module Data
3
3
  module DateConverter
4
4
  COMMON_DATE_FORMATS = [
5
+ "%Y-%m-%d %H:%M:%S.%f %Z",
5
6
  "%Y-%m-%dT%H:%M:%S.%6N", # e.g., "2021-01-01T00:00:00.000000"
6
7
  "%Y-%m-%d %H:%M:%S.%L Z", # e.g., "2025-01-03 23:04:49.492 Z"
7
8
  "%Y-%m-%d %H:%M:%S.%L", # e.g., "2021-01-01 00:01:36.000"
8
- "%Y-%m-%d %H:%M:%S.%L", # duplicate format intentionally
9
9
  "%Y-%m-%d %H:%M:%S", # e.g., "2021-01-01 00:01:36"
10
10
  "%Y-%m-%d %H:%M", # e.g., "2021-01-01 00:01"
11
11
  "%Y-%m-%d", # e.g., "2021-01-01"
@@ -19,179 +19,38 @@ module EasyML
19
19
  "%Y/%m/%d", # e.g., "2021/01/01"
20
20
  ].freeze
21
21
 
22
- FORMAT_MAPPINGS = {
23
- ruby_to_polars: {
24
- "%L" => "%3f", # milliseconds
25
- "%6N" => "%6f", # microseconds
26
- "%N" => "%9f", # nanoseconds
27
- },
28
- }.freeze
29
-
30
- class << self
31
- # Infers a strftime format string from the given date string.
32
- #
33
- # @param date_str [String] The date string to analyze.
34
- # @return [String, nil] The corresponding strftime format if recognized, or nil if not.
35
- def infer_strftime_format(date_str)
36
- return nil if date_str.blank?
37
-
38
- # YYYY-MM-DD (e.g., "2021-01-01")
39
- return "%Y-%m-%d" if date_str =~ /^\d{4}-\d{2}-\d{2}$/
40
-
41
- # YYYY/MM/DD (e.g., "2021/01/01")
42
- return "%Y/%m/%d" if date_str =~ /^\d{4}\/\d{2}\/\d{2}$/
43
-
44
- # Date & time with T separator (ISO 8601-like)
45
- if date_str.include?("T")
46
- # Without fractional seconds, e.g., "2021-01-01T12:34:56"
47
- return "%Y-%m-%dT%H:%M:%S" if date_str =~ /^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}$/
48
-
49
- # With fractional seconds, e.g., "2021-01-01T12:34:56.789" or "2021-01-01T12:34:56.123456"
50
- if date_str =~ /^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.(\d+)$/
51
- fraction = Regexp.last_match(1)
52
- case fraction.length
53
- when 3 then return "%Y-%m-%dT%H:%M:%S.%L" # milliseconds
54
- when 6 then return "%Y-%m-%dT%H:%M:%S.%6N" # microseconds
55
- when 9 then return "%Y-%m-%dT%H:%M:%S.%N" # nanoseconds
56
- else
57
- # Fallback if fractional part has unexpected length:
58
- return "%Y-%m-%dT%H:%M:%S.%N"
59
- end
60
- end
61
- end
62
-
63
- # Date & time with space separator
64
- if date_str.include?(" ")
65
- # Without fractional seconds, e.g., "2021-01-01 12:34:56"
66
- return "%Y-%m-%d %H:%M:%S" if date_str =~ /^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}$/
67
-
68
- # With fractional seconds, e.g., "2021-01-01 12:34:56.789"
69
- if date_str =~ /^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\.(\d+)$/
70
- fraction = Regexp.last_match(1)
71
- case fraction.length
72
- when 3 then return "%Y-%m-%d %H:%M:%S.%L"
73
- when 6 then return "%Y-%m-%d %H:%M:%S.%6N"
74
- when 9 then return "%Y-%m-%d %H:%M:%S.%N"
75
- else
76
- return "%Y-%m-%d %H:%M:%S.%N"
77
- end
78
- end
79
- end
80
-
81
- # Common US-style formats
82
-
83
- # MM/DD/YYYY (e.g., "01/31/2021")
84
- return "%m/%d/%Y" if date_str =~ /^\d{2}\/\d{2}\/\d{4}$/
85
-
86
- # DD-MM-YYYY (e.g., "31-01-2021")
87
- return "%d-%m-%Y" if date_str =~ /^\d{2}-\d{2}-\d{4}$/
88
-
89
- # DD-Mon-YYYY (e.g., "31-Jan-2021")
90
- return "%d-%b-%Y" if date_str =~ /^\d{2}-[A-Za-z]{3}-\d{4}$/
91
-
92
- # Mon DD, YYYY (e.g., "Jan 31, 2021")
93
- return "%b %d, %Y" if date_str =~ /^[A-Za-z]{3} \d{2}, \d{4}$/
94
-
95
- # Could add additional heuristics as needed...
96
-
97
- nil # Return nil if no known format matches.
22
+ def self.maybe_convert_date(df, column = nil)
23
+ column = column.to_s if column.present?
24
+ if df.is_a?(Polars::Series)
25
+ column = "temp" if column.nil?
26
+ df = Polars::DataFrame.new({ column.to_s => df })
98
27
  end
28
+ return df unless df.columns.include?(column)
29
+ return df if df[column].dtype.is_a?(Polars::Datetime)
99
30
 
100
- # Attempts to convert a string column to datetime if it appears to be a date.
101
- # @param df [Polars::DataFrame] The dataframe containing the series.
102
- # @param column [String] The name of the column to convert.
103
- # @return [Polars::DataFrame] The dataframe with the converted column (if successful).
104
- def maybe_convert_date(df, column = nil)
105
- if column.nil?
106
- series = df
107
- column = series.name
108
- df = Polars::DataFrame.new(series)
109
- else
110
- series = df[column]
111
- end
112
-
113
- return df if series.dtype.is_a?(Polars::Datetime)
114
- return df unless series.dtype == Polars::Utf8
115
-
116
- sample = series.filter(series.is_not_null).head(100).to_a
117
- ruby_format = detect_date_format(sample)
31
+ conversions = df.select(queries(column)).to_hashes&.first || []
32
+ return df unless conversions.any?
118
33
 
119
- if ruby_format
120
- format = convert_format(:ruby_to_polars, ruby_format)
121
- df = try_format(df, column, format)
34
+ conversions = conversions.select { |k, v| v }
35
+ return df unless conversions.any?
122
36
 
123
- if df.filter(Polars.col("TRY").is_null).count > df.filter(Polars.col(column.to_s).is_null).count
124
- df = df.drop("TRY")
125
- best_format = df[column.to_s][0..100].to_a.count_by do |date_str|
126
- infer_strftime_format(date_str)
127
- end.max_by { |_format, count| count }[0]
128
- df = try_format(df, column, best_format)
129
- end
130
-
131
- df = df.with_column(df["TRY"].alias(column.to_s)).drop("TRY")
132
- end
133
-
134
- df
135
- end
136
-
137
- private
138
-
139
- def try_format(df, column, format)
140
- df = df.with_column(
141
- Polars.col(column.to_s)
142
- .str
143
- .strptime(Polars::Datetime, format, strict: false)
144
- .alias("TRY")
145
- )
146
- end
147
-
148
- def detect_polars_format(series)
149
- return nil unless series.is_a?(Polars::Series)
150
-
151
- sample = series.filter(series.is_not_null).head(100).to_a
152
- ruby_format = detect_date_format(sample)
153
- convert_format(:ruby_to_polars, ruby_format)
37
+ conversions.map do |k, _|
38
+ conversion = conversion(k)
39
+ df = df.with_columns(conversion)
154
40
  end
155
41
 
156
- def detect_date_format(date_strings)
157
- return nil if date_strings.empty?
158
-
159
- sample = date_strings.compact.sample([100, date_strings.length].min)
160
-
161
- best_format = nil
162
- best_success_rate = 0.0
163
- sample_count = sample.length
164
-
165
- COMMON_DATE_FORMATS.each do |fmt|
166
- success_count = sample.count do |date_str|
167
- begin
168
- DateTime.strptime(date_str, fmt)
169
- true
170
- rescue StandardError
171
- false
172
- end
173
- end
174
- success_rate = success_count.to_f / sample_count
175
- if success_rate > best_success_rate
176
- best_success_rate = success_rate
177
- best_format = fmt
178
- end
179
- # If every sample string matches this format, return it immediately.
180
- return fmt if success_rate == 1.0
181
- end
42
+ df
43
+ end
182
44
 
183
- best_success_rate >= 0.8 ? best_format : nil
45
+ def self.queries(column)
46
+ COMMON_DATE_FORMATS.map do |format|
47
+ Polars.col(column).cast(Polars::String).str.strptime(Polars::Datetime, format, strict: false).is_not_null().all().alias("convert_#{column}_to_#{format}")
184
48
  end
49
+ end
185
50
 
186
- def convert_format(conversion, format)
187
- return nil if format.nil?
188
-
189
- result = format.dup
190
- FORMAT_MAPPINGS[conversion].each do |from, to|
191
- result = result.gsub(from, to)
192
- end
193
- result
194
- end
51
+ def self.conversion(key)
52
+ key, ruby_type = key.split("convert_").last.split("_to_")
53
+ Polars.col(key).cast(Polars::String).str.strptime(Polars::Datetime, ruby_type, strict: false).cast(Polars::Datetime).alias(key)
195
54
  end
196
55
  end
197
56
  end
@@ -19,8 +19,9 @@ module EasyML
19
19
  Polars::Decimal => :float,
20
20
  Polars::Float64 => :float,
21
21
  Polars::Int64 => :integer,
22
- Polars::Float32 => :float,
23
22
  Polars::Int32 => :integer,
23
+ Polars::Int8 => :integer,
24
+ Polars::Float32 => :float,
24
25
  Polars::Boolean => :boolean,
25
26
  Polars::Datetime => :datetime,
26
27
  Polars::Date => :date,
@@ -138,8 +139,9 @@ module EasyML
138
139
  return :numeric
139
140
  rescue StandardError
140
141
  # If not numeric, check for datetime or categorical
141
- if EasyML::Data::DateConverter.maybe_convert_date(Polars::DataFrame.new({ temp: series }),
142
+ is_datetime = EasyML::Data::DateConverter.maybe_convert_date(Polars::DataFrame.new({ temp: series }),
142
143
  :temp)[:temp].dtype.is_a?(Polars::Datetime)
144
+ if is_datetime
143
145
  :datetime
144
146
  else
145
147
  categorical_or_text?(series)
@@ -319,7 +319,10 @@ module EasyML
319
319
  def learn_dataset
320
320
  return schema if schema.present?
321
321
 
322
- existing_schema = existing_parquet_schema
322
+ if parquet_files.present?
323
+ EasyML::Data::DatasetManager.normalize_schema(parquet_files)
324
+ existing_schema = existing_parquet_schema
325
+ end
323
326
  schema = existing_schema || normalize_dataset
324
327
 
325
328
  self.schema = schema
@@ -123,6 +123,10 @@ module EasyML
123
123
  end
124
124
  end
125
125
 
126
+ config.after_initialize do
127
+ Dir.glob(Rails.root.join("app/datasets/**/*.rb")).each { |f| require f }
128
+ end
129
+
126
130
  if ENV["EASY_ML_DEV"]
127
131
  require "vite_ruby"
128
132
  require "vite_rails"
@@ -60,6 +60,7 @@ module EasyML
60
60
  add_workflow_status_to_easy_ml_dataset_histories
61
61
  add_metadata_to_easy_ml_predictions
62
62
  add_unique_constraint_to_dataset_names
63
+ add_view_class_to_easy_ml_datasets
63
64
  ].freeze
64
65
 
65
66
  # Specify the next migration number
@@ -0,0 +1,9 @@
1
+ class AddViewClassToEasyMLDatasets < ActiveRecord::Migration[<%= ActiveRecord::Migration.current_version %>]
2
+ def change
3
+ add_column :easy_ml_datasets, :view_class, :string
4
+ add_index :easy_ml_datasets, :view_class
5
+
6
+ add_column :easy_ml_dataset_histories, :view_class, :string
7
+ add_index :easy_ml_dataset_histories, :view_class
8
+ end
9
+ end
@@ -1,7 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module EasyML
4
- VERSION = "0.2.0-rc101"
4
+ VERSION = "0.2.0-rc102"
5
5
 
6
6
  module Version
7
7
  end
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "entrypoints/Application.tsx": {
3
- "file": "assets/entrypoints/Application.tsx-BXwsBCuQ.js",
3
+ "file": "assets/entrypoints/Application.tsx-CRS5bRgw.js",
4
4
  "name": "entrypoints/Application.tsx",
5
5
  "src": "entrypoints/Application.tsx",
6
6
  "isEntry": true,