easy_ml 0.2.0.pre.rc100 → 0.2.0.pre.rc102

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35)
  1. checksums.yaml +4 -4
  2. data/app/controllers/easy_ml/datasets_controller.rb +1 -0
  3. data/app/frontend/components/dataset/splitters/types.ts +3 -4
  4. data/app/frontend/pages/NewDatasetPage.tsx +17 -0
  5. data/app/frontend/types/datasource.ts +14 -6
  6. data/app/models/easy_ml/column/imputers/base.rb +3 -1
  7. data/app/models/easy_ml/column.rb +8 -0
  8. data/app/models/easy_ml/dataset/learner/lazy.rb +16 -3
  9. data/app/models/easy_ml/dataset.rb +47 -9
  10. data/app/models/easy_ml/dataset_history.rb +1 -0
  11. data/app/models/easy_ml/feature.rb +5 -13
  12. data/app/models/easy_ml/lineage.rb +2 -1
  13. data/app/models/easy_ml/models/xgboost/evals_callback.rb +1 -0
  14. data/app/models/easy_ml/models/xgboost.rb +7 -2
  15. data/app/models/easy_ml/prediction.rb +1 -1
  16. data/app/models/easy_ml/splitters/base_splitter.rb +4 -8
  17. data/app/models/easy_ml/splitters/date_splitter.rb +2 -1
  18. data/app/models/easy_ml/splitters/predefined_splitter.rb +8 -3
  19. data/config/initializers/zhong.rb +6 -0
  20. data/lib/easy_ml/data/dataset_manager/schema/normalizer.rb +201 -0
  21. data/lib/easy_ml/data/dataset_manager/schema.rb +9 -0
  22. data/lib/easy_ml/data/dataset_manager.rb +5 -0
  23. data/lib/easy_ml/data/date_converter.rb +24 -165
  24. data/lib/easy_ml/data/polars_column.rb +5 -2
  25. data/lib/easy_ml/data/polars_reader.rb +4 -1
  26. data/lib/easy_ml/data/synced_directory.rb +36 -23
  27. data/lib/easy_ml/engine.rb +4 -0
  28. data/lib/easy_ml/railtie/generators/migration/migration_generator.rb +1 -0
  29. data/lib/easy_ml/railtie/templates/migration/add_view_class_to_easy_ml_datasets.rb.tt +9 -0
  30. data/lib/easy_ml/version.rb +1 -1
  31. data/public/easy_ml/assets/.vite/manifest.json +1 -1
  32. data/public/easy_ml/assets/assets/entrypoints/{Application.tsx-BXwsBCuQ.js → Application.tsx-CRS5bRgw.js} +8 -8
  33. data/public/easy_ml/assets/assets/entrypoints/{Application.tsx-BXwsBCuQ.js.map → Application.tsx-CRS5bRgw.js.map} +1 -1
  34. metadata +7 -5
  35. data/lib/easy_ml/data/dataset_manager/normalizer.rb +0 -0
@@ -0,0 +1,9 @@
1
+ module EasyML
2
+ module Data
3
+ class DatasetManager
4
+ class Schema
5
+ require_relative "schema/normalizer"
6
+ end
7
+ end
8
+ end
9
+ end
@@ -3,6 +3,7 @@ module EasyML
3
3
  class DatasetManager
4
4
  require_relative "dataset_manager/writer"
5
5
  require_relative "dataset_manager/reader"
6
+ require_relative "dataset_manager/schema"
6
7
 
7
8
  attr_accessor :root_dir, :partition, :append_only, :filenames, :primary_key,
8
9
  :partition_size, :s3_bucket, :s3_prefix, :s3_access_key_id,
@@ -55,6 +56,10 @@ module EasyML
55
56
  def cp(from, to)
56
57
  Writer.cp(from, to)
57
58
  end
59
+
60
+ def normalize_schema(files)
61
+ Schema::Normalizer.new(files).normalize
62
+ end
58
63
  end
59
64
 
60
65
  def list_nulls(input = nil, **kwargs, &block)
@@ -2,10 +2,10 @@ module EasyML
2
2
  module Data
3
3
  module DateConverter
4
4
  COMMON_DATE_FORMATS = [
5
+ "%Y-%m-%d %H:%M:%S.%f %Z",
5
6
  "%Y-%m-%dT%H:%M:%S.%6N", # e.g., "2021-01-01T00:00:00.000000"
6
7
  "%Y-%m-%d %H:%M:%S.%L Z", # e.g., "2025-01-03 23:04:49.492 Z"
7
8
  "%Y-%m-%d %H:%M:%S.%L", # e.g., "2021-01-01 00:01:36.000"
8
- "%Y-%m-%d %H:%M:%S.%L", # duplicate format intentionally
9
9
  "%Y-%m-%d %H:%M:%S", # e.g., "2021-01-01 00:01:36"
10
10
  "%Y-%m-%d %H:%M", # e.g., "2021-01-01 00:01"
11
11
  "%Y-%m-%d", # e.g., "2021-01-01"
@@ -19,179 +19,38 @@ module EasyML
19
19
  "%Y/%m/%d", # e.g., "2021/01/01"
20
20
  ].freeze
21
21
 
22
- FORMAT_MAPPINGS = {
23
- ruby_to_polars: {
24
- "%L" => "%3f", # milliseconds
25
- "%6N" => "%6f", # microseconds
26
- "%N" => "%9f", # nanoseconds
27
- },
28
- }.freeze
29
-
30
- class << self
31
- # Infers a strftime format string from the given date string.
32
- #
33
- # @param date_str [String] The date string to analyze.
34
- # @return [String, nil] The corresponding strftime format if recognized, or nil if not.
35
- def infer_strftime_format(date_str)
36
- return nil if date_str.blank?
37
-
38
- # YYYY-MM-DD (e.g., "2021-01-01")
39
- return "%Y-%m-%d" if date_str =~ /^\d{4}-\d{2}-\d{2}$/
40
-
41
- # YYYY/MM/DD (e.g., "2021/01/01")
42
- return "%Y/%m/%d" if date_str =~ /^\d{4}\/\d{2}\/\d{2}$/
43
-
44
- # Date & time with T separator (ISO 8601-like)
45
- if date_str.include?("T")
46
- # Without fractional seconds, e.g., "2021-01-01T12:34:56"
47
- return "%Y-%m-%dT%H:%M:%S" if date_str =~ /^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}$/
48
-
49
- # With fractional seconds, e.g., "2021-01-01T12:34:56.789" or "2021-01-01T12:34:56.123456"
50
- if date_str =~ /^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.(\d+)$/
51
- fraction = Regexp.last_match(1)
52
- case fraction.length
53
- when 3 then return "%Y-%m-%dT%H:%M:%S.%L" # milliseconds
54
- when 6 then return "%Y-%m-%dT%H:%M:%S.%6N" # microseconds
55
- when 9 then return "%Y-%m-%dT%H:%M:%S.%N" # nanoseconds
56
- else
57
- # Fallback if fractional part has unexpected length:
58
- return "%Y-%m-%dT%H:%M:%S.%N"
59
- end
60
- end
61
- end
62
-
63
- # Date & time with space separator
64
- if date_str.include?(" ")
65
- # Without fractional seconds, e.g., "2021-01-01 12:34:56"
66
- return "%Y-%m-%d %H:%M:%S" if date_str =~ /^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}$/
67
-
68
- # With fractional seconds, e.g., "2021-01-01 12:34:56.789"
69
- if date_str =~ /^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\.(\d+)$/
70
- fraction = Regexp.last_match(1)
71
- case fraction.length
72
- when 3 then return "%Y-%m-%d %H:%M:%S.%L"
73
- when 6 then return "%Y-%m-%d %H:%M:%S.%6N"
74
- when 9 then return "%Y-%m-%d %H:%M:%S.%N"
75
- else
76
- return "%Y-%m-%d %H:%M:%S.%N"
77
- end
78
- end
79
- end
80
-
81
- # Common US-style formats
82
-
83
- # MM/DD/YYYY (e.g., "01/31/2021")
84
- return "%m/%d/%Y" if date_str =~ /^\d{2}\/\d{2}\/\d{4}$/
85
-
86
- # DD-MM-YYYY (e.g., "31-01-2021")
87
- return "%d-%m-%Y" if date_str =~ /^\d{2}-\d{2}-\d{4}$/
88
-
89
- # DD-Mon-YYYY (e.g., "31-Jan-2021")
90
- return "%d-%b-%Y" if date_str =~ /^\d{2}-[A-Za-z]{3}-\d{4}$/
91
-
92
- # Mon DD, YYYY (e.g., "Jan 31, 2021")
93
- return "%b %d, %Y" if date_str =~ /^[A-Za-z]{3} \d{2}, \d{4}$/
94
-
95
- # Could add additional heuristics as needed...
96
-
97
- nil # Return nil if no known format matches.
22
+ def self.maybe_convert_date(df, column = nil)
23
+ column = column.to_s if column.present?
24
+ if df.is_a?(Polars::Series)
25
+ column = "temp" if column.nil?
26
+ df = Polars::DataFrame.new({ column.to_s => df })
98
27
  end
28
+ return df unless df.columns.include?(column)
29
+ return df if df[column].dtype.is_a?(Polars::Datetime)
99
30
 
100
- # Attempts to convert a string column to datetime if it appears to be a date.
101
- # @param df [Polars::DataFrame] The dataframe containing the series.
102
- # @param column [String] The name of the column to convert.
103
- # @return [Polars::DataFrame] The dataframe with the converted column (if successful).
104
- def maybe_convert_date(df, column = nil)
105
- if column.nil?
106
- series = df
107
- column = series.name
108
- df = Polars::DataFrame.new(series)
109
- else
110
- series = df[column]
111
- end
112
-
113
- return df if series.dtype.is_a?(Polars::Datetime)
114
- return df unless series.dtype == Polars::Utf8
115
-
116
- sample = series.filter(series.is_not_null).head(100).to_a
117
- ruby_format = detect_date_format(sample)
31
+ conversions = df.select(queries(column)).to_hashes&.first || []
32
+ return df unless conversions.any?
118
33
 
119
- if ruby_format
120
- format = convert_format(:ruby_to_polars, ruby_format)
121
- df = try_format(df, column, format)
34
+ conversions = conversions.select { |k, v| v }
35
+ return df unless conversions.any?
122
36
 
123
- if df.filter(Polars.col("TRY").is_null).count > df.filter(Polars.col(column.to_s).is_null).count
124
- df = df.drop("TRY")
125
- best_format = df[column.to_s][0..100].to_a.count_by do |date_str|
126
- infer_strftime_format(date_str)
127
- end.max_by { |_format, count| count }[0]
128
- df = try_format(df, column, best_format)
129
- end
130
-
131
- df = df.with_column(df["TRY"].alias(column.to_s)).drop("TRY")
132
- end
133
-
134
- df
135
- end
136
-
137
- private
138
-
139
- def try_format(df, column, format)
140
- df = df.with_column(
141
- Polars.col(column.to_s)
142
- .str
143
- .strptime(Polars::Datetime, format, strict: false)
144
- .alias("TRY")
145
- )
146
- end
147
-
148
- def detect_polars_format(series)
149
- return nil unless series.is_a?(Polars::Series)
150
-
151
- sample = series.filter(series.is_not_null).head(100).to_a
152
- ruby_format = detect_date_format(sample)
153
- convert_format(:ruby_to_polars, ruby_format)
37
+ conversions.map do |k, _|
38
+ conversion = conversion(k)
39
+ df = df.with_columns(conversion)
154
40
  end
155
41
 
156
- def detect_date_format(date_strings)
157
- return nil if date_strings.empty?
158
-
159
- sample = date_strings.compact.sample([100, date_strings.length].min)
160
-
161
- best_format = nil
162
- best_success_rate = 0.0
163
- sample_count = sample.length
164
-
165
- COMMON_DATE_FORMATS.each do |fmt|
166
- success_count = sample.count do |date_str|
167
- begin
168
- DateTime.strptime(date_str, fmt)
169
- true
170
- rescue StandardError
171
- false
172
- end
173
- end
174
- success_rate = success_count.to_f / sample_count
175
- if success_rate > best_success_rate
176
- best_success_rate = success_rate
177
- best_format = fmt
178
- end
179
- # If every sample string matches this format, return it immediately.
180
- return fmt if success_rate == 1.0
181
- end
42
+ df
43
+ end
182
44
 
183
- best_success_rate >= 0.8 ? best_format : nil
45
+ def self.queries(column)
46
+ COMMON_DATE_FORMATS.map do |format|
47
+ Polars.col(column).cast(Polars::String).str.strptime(Polars::Datetime, format, strict: false).is_not_null().all().alias("convert_#{column}_to_#{format}")
184
48
  end
49
+ end
185
50
 
186
- def convert_format(conversion, format)
187
- return nil if format.nil?
188
-
189
- result = format.dup
190
- FORMAT_MAPPINGS[conversion].each do |from, to|
191
- result = result.gsub(from, to)
192
- end
193
- result
194
- end
51
+ def self.conversion(key)
52
+ key, ruby_type = key.split("convert_").last.split("_to_")
53
+ Polars.col(key).cast(Polars::String).str.strptime(Polars::Datetime, ruby_type, strict: false).cast(Polars::Datetime).alias(key)
195
54
  end
196
55
  end
197
56
  end
@@ -16,10 +16,12 @@ module EasyML
16
16
  array: Polars::List,
17
17
  }
18
18
  POLARS_MAP = {
19
+ Polars::Decimal => :float,
19
20
  Polars::Float64 => :float,
20
21
  Polars::Int64 => :integer,
21
- Polars::Float32 => :float,
22
22
  Polars::Int32 => :integer,
23
+ Polars::Int8 => :integer,
24
+ Polars::Float32 => :float,
23
25
  Polars::Boolean => :boolean,
24
26
  Polars::Datetime => :datetime,
25
27
  Polars::Date => :date,
@@ -137,8 +139,9 @@ module EasyML
137
139
  return :numeric
138
140
  rescue StandardError
139
141
  # If not numeric, check for datetime or categorical
140
- if EasyML::Data::DateConverter.maybe_convert_date(Polars::DataFrame.new({ temp: series }),
142
+ is_datetime = EasyML::Data::DateConverter.maybe_convert_date(Polars::DataFrame.new({ temp: series }),
141
143
  :temp)[:temp].dtype.is_a?(Polars::Datetime)
144
+ if is_datetime
142
145
  :datetime
143
146
  else
144
147
  categorical_or_text?(series)
@@ -319,7 +319,10 @@ module EasyML
319
319
  def learn_dataset
320
320
  return schema if schema.present?
321
321
 
322
- existing_schema = existing_parquet_schema
322
+ if parquet_files.present?
323
+ EasyML::Data::DatasetManager.normalize_schema(parquet_files)
324
+ existing_schema = existing_parquet_schema
325
+ end
323
326
  schema = existing_schema || normalize_dataset
324
327
 
325
328
  self.schema = schema
@@ -126,7 +126,7 @@ module EasyML
126
126
  )
127
127
 
128
128
  Rails.logger.info("Downloaded #{object.key} to #{local_file_path}")
129
- if object.key.end_with?(".gz")
129
+ if object.key.end_with?(".gz") && !object.key.end_with?(".parquet.gz")
130
130
  ungzipped_file_path = ungzip_file(local_file_path)
131
131
  Rails.logger.info("Ungzipped to #{ungzipped_file_path}")
132
132
  end
@@ -284,48 +284,61 @@ module EasyML
284
284
  relative_path = Pathname.new(file_path).relative_path_from(Pathname.new(root_dir)).to_s
285
285
  s3_key = s3_prefix.present? ? File.join(s3_prefix, File.basename(relative_path)) : relative_path
286
286
 
287
- # Create a temporary gzipped version of the file
288
- gzipped_file_path = "#{file_path}.gz"
289
-
290
287
  begin
291
- Rails.logger.info("Compressing and uploading #{file_path} to s3://#{s3_bucket}/#{s3_key}")
288
+ Rails.logger.info("Uploading #{file_path} to s3://#{s3_bucket}/#{s3_key}")
292
289
 
293
- # Compress the file
294
- Zlib::GzipWriter.open(gzipped_file_path) do |gz|
290
+ if file_path.end_with?(".parquet")
291
+ # Upload parquet files directly without compression
295
292
  File.open(file_path, "rb") do |file|
296
- gz.write(file.read)
293
+ s3.put_object(
294
+ bucket: s3_bucket,
295
+ key: s3_key,
296
+ body: file
297
+ )
298
+ end
299
+ Rails.logger.info("Successfully uploaded #{file_path} to s3://#{s3_bucket}/#{s3_key}")
300
+ else
301
+ # Create a temporary gzipped version of the file
302
+ gzipped_file_path = "#{file_path}.gz"
303
+
304
+ # Compress the file
305
+ Zlib::GzipWriter.open(gzipped_file_path) do |gz|
306
+ File.open(file_path, "rb") do |file|
307
+ gz.write(file.read)
308
+ end
297
309
  end
298
- end
299
310
 
300
- # Upload the gzipped file
301
- File.open(gzipped_file_path, "rb") do |file|
302
- s3.put_object(
303
- bucket: s3_bucket,
304
- key: "#{s3_key}.gz",
305
- body: file,
306
- content_encoding: "gzip",
307
- )
308
- end
311
+ # Upload the gzipped file
312
+ File.open(gzipped_file_path, "rb") do |file|
313
+ s3.put_object(
314
+ bucket: s3_bucket,
315
+ key: "#{s3_key}.gz",
316
+ body: file,
317
+ content_encoding: "gzip",
318
+ )
319
+ end
320
+
321
+ Rails.logger.info("Successfully uploaded #{file_path} to s3://#{s3_bucket}/#{s3_key}.gz")
309
322
 
310
- Rails.logger.info("Successfully uploaded #{file_path} to s3://#{s3_bucket}/#{s3_key}.gz")
323
+ # Clean up temporary gzipped file
324
+ File.delete(gzipped_file_path) if File.exist?(gzipped_file_path)
325
+ end
311
326
  rescue Aws::S3::Errors::ServiceError, StandardError => e
312
327
  Rails.logger.error("Failed to upload #{file_path}: #{e.message}")
313
328
  raise e
314
- ensure
315
- # Clean up temporary gzipped file
316
- File.delete(gzipped_file_path) if File.exist?(gzipped_file_path)
317
329
  end
318
330
  end
319
331
 
320
332
  def should_upload?(file_path)
321
333
  relative_path = Pathname.new(file_path).relative_path_from(Pathname.new(root_dir)).to_s
322
334
  s3_key = s3_prefix.present? ? File.join(s3_prefix, relative_path) : relative_path
335
+ s3_key = "#{s3_key}.gz" unless file_path.end_with?(".parquet")
323
336
 
324
337
  begin
325
338
  # Check if file exists in S3
326
339
  response = s3.head_object(
327
340
  bucket: s3_bucket,
328
- key: "#{s3_key}.gz",
341
+ key: s3_key,
329
342
  )
330
343
 
331
344
  # Compare modification times
@@ -123,6 +123,10 @@ module EasyML
123
123
  end
124
124
  end
125
125
 
126
+ config.after_initialize do
127
+ Dir.glob(Rails.root.join("app/datasets/**/*.rb")).each { |f| require f }
128
+ end
129
+
126
130
  if ENV["EASY_ML_DEV"]
127
131
  require "vite_ruby"
128
132
  require "vite_rails"
@@ -60,6 +60,7 @@ module EasyML
60
60
  add_workflow_status_to_easy_ml_dataset_histories
61
61
  add_metadata_to_easy_ml_predictions
62
62
  add_unique_constraint_to_dataset_names
63
+ add_view_class_to_easy_ml_datasets
63
64
  ].freeze
64
65
 
65
66
  # Specify the next migration number
@@ -0,0 +1,9 @@
1
+ class AddViewClassToEasyMLDatasets < ActiveRecord::Migration[<%= ActiveRecord::Migration.current_version %>]
2
+ def change
3
+ add_column :easy_ml_datasets, :view_class, :string
4
+ add_index :easy_ml_datasets, :view_class
5
+
6
+ add_column :easy_ml_dataset_histories, :view_class, :string
7
+ add_index :easy_ml_dataset_histories, :view_class
8
+ end
9
+ end
@@ -1,7 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module EasyML
4
- VERSION = "0.2.0-rc100"
4
+ VERSION = "0.2.0-rc102"
5
5
 
6
6
  module Version
7
7
  end
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "entrypoints/Application.tsx": {
3
- "file": "assets/entrypoints/Application.tsx-BXwsBCuQ.js",
3
+ "file": "assets/entrypoints/Application.tsx-CRS5bRgw.js",
4
4
  "name": "entrypoints/Application.tsx",
5
5
  "src": "entrypoints/Application.tsx",
6
6
  "isEntry": true,