easy_ml 0.2.0.pre.rc58 → 0.2.0.pre.rc60

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85)
  1. checksums.yaml +4 -4
  2. data/app/controllers/easy_ml/application_controller.rb +4 -0
  3. data/app/controllers/easy_ml/datasets_controller.rb +32 -1
  4. data/app/frontend/components/DatasetPreview.tsx +50 -19
  5. data/app/frontend/components/dataset/ColumnConfigModal.tsx +7 -1
  6. data/app/frontend/components/dataset/ColumnFilters.tsx +37 -3
  7. data/app/frontend/components/dataset/ColumnList.tsx +14 -2
  8. data/app/frontend/components/dataset/PreprocessingConfig.tsx +81 -20
  9. data/app/frontend/types/dataset.ts +3 -0
  10. data/app/jobs/easy_ml/compute_feature_job.rb +0 -3
  11. data/app/jobs/easy_ml/refresh_dataset_job.rb +0 -6
  12. data/app/models/easy_ml/column/imputers/base.rb +89 -0
  13. data/app/models/easy_ml/column/imputers/categorical.rb +35 -0
  14. data/app/models/easy_ml/column/imputers/clip.rb +30 -0
  15. data/app/models/easy_ml/column/imputers/constant.rb +27 -0
  16. data/app/models/easy_ml/column/imputers/ffill.rb +29 -0
  17. data/app/models/easy_ml/column/imputers/imputer.rb +103 -0
  18. data/app/models/easy_ml/column/imputers/mean.rb +27 -0
  19. data/app/models/easy_ml/column/imputers/median.rb +27 -0
  20. data/app/models/easy_ml/column/imputers/most_frequent.rb +27 -0
  21. data/app/models/easy_ml/column/imputers/null_imputer.rb +15 -0
  22. data/app/models/easy_ml/column/imputers/one_hot_encoder.rb +30 -0
  23. data/app/models/easy_ml/column/imputers/ordinal_encoder.rb +78 -0
  24. data/app/models/easy_ml/column/imputers/today.rb +20 -0
  25. data/app/models/easy_ml/column/imputers.rb +126 -0
  26. data/app/models/easy_ml/column/learner.rb +18 -0
  27. data/app/models/easy_ml/column/learners/base.rb +103 -0
  28. data/app/models/easy_ml/column/learners/boolean.rb +11 -0
  29. data/app/models/easy_ml/column/learners/categorical.rb +51 -0
  30. data/app/models/easy_ml/column/learners/datetime.rb +19 -0
  31. data/app/models/easy_ml/column/learners/null.rb +22 -0
  32. data/app/models/easy_ml/column/learners/numeric.rb +33 -0
  33. data/app/models/easy_ml/column/learners/string.rb +15 -0
  34. data/app/models/easy_ml/column/lineage/base.rb +22 -0
  35. data/app/models/easy_ml/column/lineage/computed_by_feature.rb +23 -0
  36. data/app/models/easy_ml/column/lineage/preprocessed.rb +23 -0
  37. data/app/models/easy_ml/column/lineage/raw_dataset.rb +23 -0
  38. data/app/models/easy_ml/column/lineage.rb +28 -0
  39. data/app/models/easy_ml/column/selector.rb +96 -0
  40. data/app/models/easy_ml/column.rb +319 -52
  41. data/app/models/easy_ml/column_history.rb +29 -22
  42. data/app/models/easy_ml/column_list.rb +63 -78
  43. data/app/models/easy_ml/dataset.rb +128 -96
  44. data/app/models/easy_ml/dataset_history.rb +23 -23
  45. data/app/models/easy_ml/datasource.rb +3 -0
  46. data/app/models/easy_ml/datasource_history.rb +1 -0
  47. data/app/models/easy_ml/datasources/file_datasource.rb +1 -1
  48. data/app/models/easy_ml/datasources/polars_datasource.rb +6 -12
  49. data/app/models/easy_ml/datasources/s3_datasource.rb +1 -1
  50. data/app/models/easy_ml/feature.rb +19 -7
  51. data/app/models/easy_ml/feature_history.rb +12 -0
  52. data/app/models/easy_ml/feature_list.rb +15 -0
  53. data/app/serializers/easy_ml/column_serializer.rb +11 -1
  54. data/app/serializers/easy_ml/dataset_serializer.rb +23 -2
  55. data/config/initializers/enumerable.rb +17 -0
  56. data/lib/easy_ml/data/date_converter.rb +137 -30
  57. data/lib/easy_ml/data/polars_column.rb +17 -0
  58. data/lib/easy_ml/data/polars_in_memory.rb +30 -0
  59. data/lib/easy_ml/data/polars_reader.rb +20 -1
  60. data/lib/easy_ml/data/splits/in_memory_split.rb +3 -5
  61. data/lib/easy_ml/data/splits/split.rb +2 -1
  62. data/lib/easy_ml/data/synced_directory.rb +1 -1
  63. data/lib/easy_ml/data.rb +1 -2
  64. data/lib/easy_ml/feature_store.rb +33 -22
  65. data/lib/easy_ml/railtie/generators/migration/migration_generator.rb +4 -0
  66. data/lib/easy_ml/railtie/templates/migration/add_computed_columns_to_easy_ml_columns.rb.tt +4 -0
  67. data/lib/easy_ml/railtie/templates/migration/add_last_feature_sha_to_columns.rb.tt +9 -0
  68. data/lib/easy_ml/railtie/templates/migration/add_learned_at_to_easy_ml_columns.rb.tt +13 -0
  69. data/lib/easy_ml/railtie/templates/migration/add_sha_to_datasources_datasets_and_columns.rb.tt +21 -0
  70. data/lib/easy_ml/railtie/templates/migration/remove_preprocessor_statistics_from_easy_ml_datasets.rb.tt +11 -0
  71. data/lib/easy_ml/version.rb +1 -1
  72. data/lib/tasks/profile.rake +40 -0
  73. data/public/easy_ml/assets/.vite/manifest.json +2 -2
  74. data/public/easy_ml/assets/assets/Application-BbFobaXt.css +1 -0
  75. data/public/easy_ml/assets/assets/entrypoints/Application.tsx-Dni_GM8r.js +489 -0
  76. data/public/easy_ml/assets/assets/entrypoints/Application.tsx-Dni_GM8r.js.map +1 -0
  77. metadata +41 -10
  78. data/app/models/easy_ml/adapters/base_adapter.rb +0 -45
  79. data/app/models/easy_ml/adapters/polars_adapter.rb +0 -77
  80. data/lib/easy_ml/data/preprocessor.rb +0 -340
  81. data/lib/easy_ml/data/simple_imputer.rb +0 -255
  82. data/lib/easy_ml/data/statistics_learner.rb +0 -193
  83. data/public/easy_ml/assets/assets/Application-BUsRR6b6.css +0 -1
  84. data/public/easy_ml/assets/assets/entrypoints/Application.tsx-DmkdJsDd.js +0 -474
  85. data/public/easy_ml/assets/assets/entrypoints/Application.tsx-DmkdJsDd.js.map +0 -1
@@ -19,9 +19,19 @@
19
19
  #
20
20
  module EasyML
21
21
  class ColumnSerializer
22
+ class SmallSerializer
23
+ include JSONAPI::Serializer
24
+ attributes :id, :name
25
+ end
26
+
22
27
  include JSONAPI::Serializer
23
28
 
24
29
  attributes :id, :name, :description, :dataset_id, :datatype, :polars_datatype, :preprocessing_steps,
25
- :hidden, :drop_if_null, :sample_values, :statistics, :is_target
30
+ :hidden, :drop_if_null, :sample_values, :statistics, :is_target,
31
+ :is_computed, :computed_by, :lineage
32
+
33
+ attribute :required do |object|
34
+ object.required?
35
+ end
26
36
  end
27
37
  end
@@ -24,6 +24,27 @@ require_relative "./column_serializer"
24
24
  #
25
25
  module EasyML
26
26
  class DatasetSerializer
27
+ class SmallSerializer
28
+ include JSONAPI::Serializer
29
+
30
+ attributes :id, :name, :description, :target, :num_rows, :status,
31
+ :datasource_id, :preprocessing_steps, :workflow_status, :statistics
32
+
33
+ attribute :columns do |dataset|
34
+ dataset.columns.order(:id).map do |column|
35
+ ColumnSerializer::SmallSerializer.new(column).serializable_hash.dig(:data, :attributes)
36
+ end
37
+ end
38
+ attribute :stacktrace do |object|
39
+ if !object.failed? || object.events.empty?
40
+ nil
41
+ else
42
+ last_event = object.events.where(status: :failed).order(id: :desc).limit(1).last
43
+ last_event&.stacktrace
44
+ end
45
+ end
46
+ end
47
+
27
48
  include JSONAPI::Serializer
28
49
 
29
50
  attributes :id, :name, :description, :target, :num_rows, :status,
@@ -47,7 +68,7 @@ module EasyML
47
68
  if dataset.workflow_status.to_sym == :analyzing
48
69
  nil
49
70
  else
50
- dataset.data(limit: 10, all_columns: true)&.to_hashes
71
+ dataset.data(limit: 10, all_columns: true, refresh: false)&.to_hashes || dataset.raw.data(limit: 10, all_columns: true).to_hashes
51
72
  end
52
73
  end
53
74
 
@@ -62,7 +83,7 @@ module EasyML
62
83
  end
63
84
 
64
85
  attribute :needs_refresh do |dataset|
65
- dataset.needs_refresh?
86
+ dataset.needs_refresh?(exclude: [:datasource_needs_refresh])
66
87
  end
67
88
 
68
89
  attribute :stacktrace do |object|
@@ -0,0 +1,17 @@
1
+ module Enumerable
2
+ def count_by(&block)
3
+ self.group_by(&block).inject({}) do |h, (k, v)|
4
+ h.tap do
5
+ h[k] = v.count
6
+ end
7
+ end
8
+ end
9
+
10
+ def key_by(&block)
11
+ self.group_by(&block).inject({}) do |h, (k, v)|
12
+ h.tap do
13
+ h[k] = v.first
14
+ end
15
+ end
16
+ end
17
+ end
@@ -3,35 +3,104 @@ module EasyML
3
3
  module DateConverter
4
4
  COMMON_DATE_FORMATS = [
5
5
  "%Y-%m-%dT%H:%M:%S.%6N", # e.g., "2021-01-01T00:00:00.000000"
6
- "%Y-%m-%d %H:%M:%S.%L Z", # e.g., "2025-01-03 23:04:49.492 Z"
7
- "%Y-%m-%d %H:%M:%S.%L", # e.g., "2021-01-01 00:01:36.000"
8
- "%Y-%m-%d %H:%M:%S.%L", # e.g., "2021-01-01 00:01:36.000"
9
- "%Y-%m-%d %H:%M:%S", # e.g., "2021-01-01 00:01:36"
10
- "%Y-%m-%d %H:%M", # e.g., "2021-01-01 00:01"
11
- "%Y-%m-%d", # e.g., "2021-01-01"
12
- "%m/%d/%Y %H:%M:%S", # e.g., "01/01/2021 00:01:36"
13
- "%m/%d/%Y", # e.g., "01/01/2021"
14
- "%d-%m-%Y", # e.g., "01-01-2021"
15
- "%d-%b-%Y %H:%M:%S", # e.g., "01-Jan-2021 00:01:36"
16
- "%d-%b-%Y", # e.g., "01-Jan-2021"
17
- "%b %d, %Y", # e.g., "Jan 01, 2021"
18
- "%Y/%m/%d %H:%M:%S", # e.g., "2021/01/01 00:01:36"
6
+ "%Y-%m-%d %H:%M:%S.%L Z", # e.g., "2025-01-03 23:04:49.492 Z"
7
+ "%Y-%m-%d %H:%M:%S.%L", # e.g., "2021-01-01 00:01:36.000"
8
+ "%Y-%m-%d %H:%M:%S.%L", # duplicate format intentionally
9
+ "%Y-%m-%d %H:%M:%S", # e.g., "2021-01-01 00:01:36"
10
+ "%Y-%m-%d %H:%M", # e.g., "2021-01-01 00:01"
11
+ "%Y-%m-%d", # e.g., "2021-01-01"
12
+ "%m/%d/%Y %H:%M:%S", # e.g., "01/01/2021 00:01:36"
13
+ "%m/%d/%Y", # e.g., "01/01/2021"
14
+ "%d-%m-%Y", # e.g., "01-01-2021"
15
+ "%d-%b-%Y %H:%M:%S", # e.g., "01-Jan-2021 00:01:36"
16
+ "%d-%b-%Y", # e.g., "01-Jan-2021"
17
+ "%b %d, %Y", # e.g., "Jan 01, 2021"
18
+ "%Y/%m/%d %H:%M:%S", # e.g., "2021/01/01 00:01:36"
19
19
  "%Y/%m/%d", # e.g., "2021/01/01"
20
20
  ].freeze
21
21
 
22
22
  FORMAT_MAPPINGS = {
23
23
  ruby_to_polars: {
24
24
  "%L" => "%3f", # milliseconds
25
- "%6N" => "%6f", # microseconds
26
- "%N" => "%9f", # nanoseconds
25
+ "%6N" => "%6f", # microseconds
26
+ "%N" => "%9f", # nanoseconds
27
27
  },
28
28
  }.freeze
29
29
 
30
30
  class << self
31
- # Attempts to convert a string column to datetime if it appears to be a date
32
- # @param df [Polars::DataFrame] The dataframe containing the series
33
- # @param column [String] The name of the column to convert
34
- # @return [Polars::DataFrame] The dataframe with converted column (if successful)
31
+ # Infers a strftime format string from the given date string.
32
+ #
33
+ # @param date_str [String] The date string to analyze.
34
+ # @return [String, nil] The corresponding strftime format if recognized, or nil if not.
35
+ def infer_strftime_format(date_str)
36
+ return nil if date_str.blank?
37
+
38
+ # YYYY-MM-DD (e.g., "2021-01-01")
39
+ return "%Y-%m-%d" if date_str =~ /^\d{4}-\d{2}-\d{2}$/
40
+
41
+ # YYYY/MM/DD (e.g., "2021/01/01")
42
+ return "%Y/%m/%d" if date_str =~ /^\d{4}\/\d{2}\/\d{2}$/
43
+
44
+ # Date & time with T separator (ISO 8601-like)
45
+ if date_str.include?("T")
46
+ # Without fractional seconds, e.g., "2021-01-01T12:34:56"
47
+ return "%Y-%m-%dT%H:%M:%S" if date_str =~ /^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}$/
48
+
49
+ # With fractional seconds, e.g., "2021-01-01T12:34:56.789" or "2021-01-01T12:34:56.123456"
50
+ if date_str =~ /^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.(\d+)$/
51
+ fraction = Regexp.last_match(1)
52
+ case fraction.length
53
+ when 3 then return "%Y-%m-%dT%H:%M:%S.%L" # milliseconds
54
+ when 6 then return "%Y-%m-%dT%H:%M:%S.%6N" # microseconds
55
+ when 9 then return "%Y-%m-%dT%H:%M:%S.%N" # nanoseconds
56
+ else
57
+ # Fallback if fractional part has unexpected length:
58
+ return "%Y-%m-%dT%H:%M:%S.%N"
59
+ end
60
+ end
61
+ end
62
+
63
+ # Date & time with space separator
64
+ if date_str.include?(" ")
65
+ # Without fractional seconds, e.g., "2021-01-01 12:34:56"
66
+ return "%Y-%m-%d %H:%M:%S" if date_str =~ /^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}$/
67
+
68
+ # With fractional seconds, e.g., "2021-01-01 12:34:56.789"
69
+ if date_str =~ /^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\.(\d+)$/
70
+ fraction = Regexp.last_match(1)
71
+ case fraction.length
72
+ when 3 then return "%Y-%m-%d %H:%M:%S.%L"
73
+ when 6 then return "%Y-%m-%d %H:%M:%S.%6N"
74
+ when 9 then return "%Y-%m-%d %H:%M:%S.%N"
75
+ else
76
+ return "%Y-%m-%d %H:%M:%S.%N"
77
+ end
78
+ end
79
+ end
80
+
81
+ # Common US-style formats
82
+
83
+ # MM/DD/YYYY (e.g., "01/31/2021")
84
+ return "%m/%d/%Y" if date_str =~ /^\d{2}\/\d{2}\/\d{4}$/
85
+
86
+ # DD-MM-YYYY (e.g., "31-01-2021")
87
+ return "%d-%m-%Y" if date_str =~ /^\d{2}-\d{2}-\d{4}$/
88
+
89
+ # DD-Mon-YYYY (e.g., "31-Jan-2021")
90
+ return "%d-%b-%Y" if date_str =~ /^\d{2}-[A-Za-z]{3}-\d{4}$/
91
+
92
+ # Mon DD, YYYY (e.g., "Jan 31, 2021")
93
+ return "%b %d, %Y" if date_str =~ /^[A-Za-z]{3} \d{2}, \d{4}$/
94
+
95
+ # Could add additional heuristics as needed...
96
+
97
+ nil # Return nil if no known format matches.
98
+ end
99
+
100
+ # Attempts to convert a string column to datetime if it appears to be a date.
101
+ # @param df [Polars::DataFrame] The dataframe containing the series.
102
+ # @param column [String] The name of the column to convert.
103
+ # @return [Polars::DataFrame] The dataframe with the converted column (if successful).
35
104
  def maybe_convert_date(df, column = nil)
36
105
  if column.nil?
37
106
  series = df
@@ -40,19 +109,42 @@ module EasyML
40
109
  else
41
110
  series = df[column]
42
111
  end
112
+
43
113
  return df if series.dtype.is_a?(Polars::Datetime)
44
114
  return df unless series.dtype == Polars::Utf8
45
115
 
46
- format = detect_polars_format(series)
47
- return df unless format
116
+ sample = series.filter(series.is_not_null).head(100).to_a
117
+ ruby_format = detect_date_format(sample)
48
118
 
49
- df.with_column(
50
- Polars.col(column.to_s).str.strptime(Polars::Datetime, format).alias(column.to_s)
51
- )
119
+ if ruby_format
120
+ format = convert_format(:ruby_to_polars, ruby_format)
121
+ df = try_format(df, column, format)
122
+
123
+ if df.filter(Polars.col("TRY").is_null).count > df.filter(Polars.col(column.to_s).is_null).count
124
+ df = df.drop("TRY")
125
+ best_format = df[column.to_s][0..100].to_a.count_by do |date_str|
126
+ infer_strftime_format(date_str)
127
+ end.max_by { |_format, count| count }[0]
128
+ df = try_format(df, column, best_format)
129
+ end
130
+
131
+ df = df.with_column(df["TRY"].alias(column.to_s)).drop("TRY")
132
+ end
133
+
134
+ df
52
135
  end
53
136
 
54
137
  private
55
138
 
139
+ def try_format(df, column, format)
140
+ df = df.with_column(
141
+ Polars.col(column.to_s)
142
+ .str
143
+ .strptime(Polars::Datetime, format, strict: false)
144
+ .alias("TRY")
145
+ )
146
+ end
147
+
56
148
  def detect_polars_format(series)
57
149
  return nil unless series.is_a?(Polars::Series)
58
150
 
@@ -66,14 +158,29 @@ module EasyML
66
158
 
67
159
  sample = date_strings.compact.sample([100, date_strings.length].min)
68
160
 
69
- COMMON_DATE_FORMATS.detect do |format|
70
- sample.all? do |date_str|
71
- DateTime.strptime(date_str, format)
72
- true
73
- rescue StandardError
74
- false
161
+ best_format = nil
162
+ best_success_rate = 0.0
163
+ sample_count = sample.length
164
+
165
+ COMMON_DATE_FORMATS.each do |fmt|
166
+ success_count = sample.count do |date_str|
167
+ begin
168
+ DateTime.strptime(date_str, fmt)
169
+ true
170
+ rescue StandardError
171
+ false
172
+ end
75
173
  end
174
+ success_rate = success_count.to_f / sample_count
175
+ if success_rate > best_success_rate
176
+ best_success_rate = success_rate
177
+ best_format = fmt
178
+ end
179
+ # If every sample string matches this format, return it immediately.
180
+ return fmt if success_rate == 1.0
76
181
  end
182
+
183
+ best_success_rate >= 0.8 ? best_format : nil
77
184
  end
78
185
 
79
186
  def convert_format(conversion, format)
@@ -12,6 +12,7 @@ module EasyML
12
12
  string: Polars::String,
13
13
  text: Polars::String,
14
14
  categorical: Polars::Categorical,
15
+ null: Polars::Null,
15
16
  }
16
17
  POLARS_MAP = TYPE_MAP.invert.stringify_keys
17
18
  class << self
@@ -19,6 +20,20 @@ module EasyML
19
20
  POLARS_MAP.dig(polars_type.class.to_s)
20
21
  end
21
22
 
23
+ def parse_polars_dtype(dtype_string)
24
+ case dtype_string
25
+ when /^Polars::Datetime/
26
+ time_unit = dtype_string[/time_unit: "(.*?)"/, 1]
27
+ time_zone = dtype_string[/time_zone: (.*)?\)/, 1]
28
+ time_zone = time_zone == "nil" ? nil : time_zone&.delete('"')
29
+ Polars::Datetime.new(time_unit, time_zone)
30
+ when /^Polars::/
31
+ Polars.const_get(dtype_string.split("::").last)
32
+ else
33
+ raise ArgumentError, "Unknown Polars data type: #{dtype_string}"
34
+ end
35
+ end
36
+
22
37
  def sym_to_polars(symbol)
23
38
  TYPE_MAP.dig(symbol)
24
39
  end
@@ -50,6 +65,8 @@ module EasyML
50
65
  :boolean
51
66
  when Polars::Utf8
52
67
  determine_string_type(series)
68
+ when Polars::Null
69
+ :null
53
70
  else
54
71
  :categorical
55
72
  end
@@ -0,0 +1,30 @@
1
+ module EasyML
2
+ module Data
3
+ class PolarsInMemory
4
+ attr_reader :df
5
+
6
+ def initialize(df)
7
+ @df = df
8
+ end
9
+
10
+ def self.query(df, **kwargs)
11
+ new(df).query(**kwargs)
12
+ end
13
+
14
+ def query(drop_cols: [], filter: nil, limit: nil, select: nil, unique: nil, sort: nil, descending: false)
15
+ return if df.nil?
16
+
17
+ df = self.df.clone
18
+ df = df.filter(filter) if filter
19
+ select = df.columns & ([select] || []).flatten
20
+ df = df.select(select) if select.present?
21
+ df = df.unique if unique
22
+ drop_cols &= df.columns
23
+ df = df.drop(drop_cols) unless drop_cols.empty?
24
+ df = df.sort(sort, reverse: descending) if sort
25
+ df = df.limit(limit) if limit
26
+ df
27
+ end
28
+ end
29
+ end
30
+ end
@@ -12,6 +12,22 @@ module EasyML
12
12
  @schema = options[:schema]
13
13
  end
14
14
 
15
+ def sha
16
+ files = parquet_files.sort
17
+
18
+ file_hashes = files.map do |file|
19
+ meta = Polars.read_parquet_schema(file)
20
+ row_count = Polars.scan_parquet(file).select(Polars.col("*").count).collect[0, 0]
21
+
22
+ Digest::SHA256.hexdigest([
23
+ meta.to_json,
24
+ row_count.to_s,
25
+ ].join("|"))
26
+ end
27
+
28
+ Digest::SHA256.hexdigest(file_hashes.join)
29
+ end
30
+
15
31
  def schema=(value)
16
32
  @schema = value
17
33
  polars_args[:dtypes] = value
@@ -55,7 +71,10 @@ module EasyML
55
71
  return files if any_parquet? && columns.nil?
56
72
 
57
73
  puts "Converting to Parquet..."
58
-
74
+ if columns.nil? || columns.all? { |c| c.datatype.nil? }
75
+ learn_dataset
76
+ columns = nil
77
+ end
59
78
  csv_files.each do |path|
60
79
  df = read_file(path, columns)
61
80
  df = cast(df, columns)
@@ -23,7 +23,7 @@ module EasyML
23
23
  end
24
24
 
25
25
  def read(segment, split_ys: false, target: nil, drop_cols: [], filter: nil, limit: nil, select: nil,
26
- unique: nil)
26
+ unique: nil, sort: nil, descending: false)
27
27
  return nil if @data.keys.none?
28
28
 
29
29
  df = if segment.to_s == "all"
@@ -33,10 +33,8 @@ module EasyML
33
33
  end
34
34
  return nil if df.nil?
35
35
 
36
- df = df.filter(filter) if filter.present?
37
- drop_cols &= df.columns
38
- df = df.drop(drop_cols) unless drop_cols.empty?
39
- df = df.unique if unique
36
+ df = EasyML::Data::PolarsInMemory.query(df, drop_cols: drop_cols, filter: filter, limit: limit, select: select,
37
+ unique: unique, sort: sort, descending: descending)
40
38
 
41
39
  split_features_targets(df, split_ys, target)
42
40
  end
@@ -7,10 +7,11 @@ module EasyML
7
7
  VALID_SEGMENTS = %w[train test valid all].freeze
8
8
 
9
9
  def initialize(options = {})
10
+ # Method kept for compatibility with subclasses
10
11
  end
11
12
 
12
13
  def load_data(segment, **kwargs)
13
- drop_cols = dataset.drop_columns(all_columns: kwargs[:all_columns] || false)
14
+ drop_cols = dataset.drop_columns(all_columns: kwargs.key?(:all_columns) && kwargs[:all_columns])
14
15
  kwargs.delete(:all_columns)
15
16
  kwargs = kwargs.merge!(drop_cols: drop_cols, target: dataset.target)
16
17
  read(segment, **kwargs)
@@ -18,7 +18,7 @@ module EasyML
18
18
  @polars_args = options.dig(:polars_args)
19
19
  end
20
20
 
21
- delegate :query, :data, :all_files, :files, to: :reader
21
+ delegate :query, :data, :all_files, :files, :sha, to: :reader
22
22
 
23
23
  def before_sync
24
24
  return unless should_sync?
data/lib/easy_ml/data.rb CHANGED
@@ -2,11 +2,10 @@ module EasyML
2
2
  module Data
3
3
  require_relative "data/utils"
4
4
  require_relative "data/polars_reader"
5
+ require_relative "data/polars_in_memory"
5
6
  require_relative "data/synced_directory"
6
- require_relative "data/preprocessor"
7
7
  require_relative "data/splits"
8
8
  require_relative "data/polars_column"
9
- require_relative "data/statistics_learner"
10
9
  require_relative "data/date_converter"
11
10
  end
12
11
  end
@@ -40,8 +40,8 @@ module EasyML
40
40
  end
41
41
  end
42
42
 
43
- def query(filter: nil)
44
- query_all_partitions(filter)
43
+ def query(**kwargs)
44
+ query_all_partitions(**kwargs)
45
45
  end
46
46
 
47
47
  def empty?
@@ -82,18 +82,40 @@ module EasyML
82
82
 
83
83
  private
84
84
 
85
+ def cleanup(type: :partitions)
86
+ case type
87
+ when :partitions
88
+ list_partitions.each do |partition|
89
+ FileUtils.rm(partition)
90
+ end
91
+ when :no_partitions
92
+ FileUtils.rm_rf(feature_path)
93
+ when :all
94
+ wipe
95
+ end
96
+ end
97
+
85
98
  def store_without_partitioning(df)
86
99
  lock_file do
100
+ cleanup(type: :partitions)
87
101
  path = feature_path
102
+ safe_write(df, path)
103
+ end
104
+ end
105
+
106
+ def safe_write(df, path)
107
+ begin
88
108
  FileUtils.mkdir_p(File.dirname(path))
89
109
  df.write_parquet(path)
110
+ rescue => e
111
+ binding.pry
90
112
  end
91
113
  end
92
114
 
93
115
  def store_partition(partition_df, primary_key, partition_start)
94
116
  lock_partition(partition_start) do
117
+ cleanup(type: :no_partitions)
95
118
  path = partition_path(partition_start)
96
- FileUtils.mkdir_p(File.dirname(path))
97
119
 
98
120
  if File.exist?(path)
99
121
  reader = EasyML::Data::PolarsReader.new
@@ -101,36 +123,25 @@ module EasyML
101
123
  preserved_records = existing_df.filter(
102
124
  Polars.col(primary_key).is_in(partition_df[primary_key]).is_not
103
125
  )
104
- partition_df = Polars.concat([preserved_records, partition_df], how: "vertical")
126
+ if preserved_records.shape[1] != partition_df.shape[1]
127
+ wipe
128
+ else
129
+ partition_df = Polars.concat([preserved_records, partition_df], how: "vertical")
130
+ end
105
131
  end
106
132
 
107
- partition_df.write_parquet(path)
133
+ safe_write(partition_df, path)
108
134
  end
109
135
  end
110
136
 
111
- def query_partitions(filter)
112
- primary_key_values = filter.extract_primary_key_values
113
- batch_size = feature.batch_size || 10_000
114
-
115
- partition_files = primary_key_values.map do |key|
116
- partition_start = (key / batch_size.to_f).floor * batch_size
117
- partition_path(partition_start)
118
- end.uniq.select { |path| File.exist?(path) }
119
-
120
- return Polars::DataFrame.new if partition_files.empty?
121
-
122
- reader = EasyML::Data::PolarsReader.new
123
- reader.query(partition_files, filter: filter)
124
- end
125
-
126
- def query_all_partitions(filter)
137
+ def query_all_partitions(**kwargs)
127
138
  reader = EasyML::Data::PolarsReader.new
128
139
  pattern = File.join(feature_dir, "feature*.parquet")
129
140
  files = Dir.glob(pattern)
130
141
 
131
142
  return Polars::DataFrame.new if files.empty?
132
143
 
133
- reader.query(files, filter: filter)
144
+ reader.query(files, **kwargs)
134
145
  end
135
146
 
136
147
  def compute_partition_boundaries(min_key, max_key, batch_size)
@@ -44,6 +44,10 @@ module EasyML
44
44
  add_computed_columns_to_easy_ml_columns
45
45
  add_slug_to_easy_ml_models
46
46
  add_default_to_is_target
47
+ remove_preprocessor_statistics_from_easy_ml_datasets
48
+ add_learned_at_to_easy_ml_columns
49
+ add_sha_to_datasources_datasets_and_columns
50
+ add_last_feature_sha_to_columns
47
51
  ].freeze
48
52
 
49
53
  # Specify the next migration number
@@ -2,13 +2,17 @@ class AddComputedColumnsToEasyMLColumns < ActiveRecord::Migration[<%= ActiveReco
2
2
  def change
3
3
  add_column :easy_ml_columns, :computed_by, :string
4
4
  add_column :easy_ml_columns, :is_computed, :boolean, default: false
5
+ add_column :easy_ml_columns, :feature_id, :bigint
5
6
 
6
7
  add_index :easy_ml_columns, :computed_by
7
8
  add_index :easy_ml_columns, :is_computed
9
+ add_index :easy_ml_columns, :feature_id
8
10
 
9
11
  add_column :easy_ml_column_histories, :computed_by, :string
10
12
  add_index :easy_ml_column_histories, :computed_by
11
13
  add_column :easy_ml_column_histories, :is_computed, :boolean, default: false
12
14
  add_index :easy_ml_column_histories, :is_computed
15
+ add_column :easy_ml_column_histories, :feature_id, :bigint
16
+ add_index :easy_ml_column_histories, :feature_id
13
17
  end
14
18
  end
@@ -0,0 +1,9 @@
1
+ class AddLastFeatureShaToColumns < ActiveRecord::Migration[<%= ActiveRecord::Migration.current_version %>]
2
+ def change
3
+ add_column :easy_ml_columns, :last_feature_sha, :string
4
+ add_index :easy_ml_columns, :last_feature_sha
5
+
6
+ add_column :easy_ml_column_histories, :last_feature_sha, :string
7
+ add_index :easy_ml_column_histories, :last_feature_sha
8
+ end
9
+ end
@@ -0,0 +1,13 @@
1
+ class AddLearnedAtToEasyMLColumns < ActiveRecord::Migration[<%= ActiveRecord::Migration.current_version %>]
2
+ def change
3
+ add_column :easy_ml_columns, :learned_at, :timestamp
4
+ add_column :easy_ml_columns, :is_learning, :boolean, default: false
5
+ add_index :easy_ml_columns, :learned_at
6
+ add_index :easy_ml_columns, :is_learning
7
+
8
+ add_column :easy_ml_column_histories, :learned_at, :timestamp
9
+ add_column :easy_ml_column_histories, :is_learning, :boolean, default: false
10
+ add_index :easy_ml_column_histories, :learned_at
11
+ add_index :easy_ml_column_histories, :is_learning
12
+ end
13
+ end
@@ -0,0 +1,21 @@
1
+ class AddShaToDatasourcesDatasetsAndColumns < ActiveRecord::Migration[<%= ActiveRecord::Migration.current_version %>]
2
+ def change
3
+ add_column :easy_ml_datasources, :sha, :string
4
+ add_column :easy_ml_datasets, :last_datasource_sha, :string
5
+
6
+ add_index :easy_ml_datasources, :sha
7
+ add_index :easy_ml_datasets, :last_datasource_sha
8
+
9
+ add_column :easy_ml_datasource_histories, :sha, :string
10
+ add_index :easy_ml_datasource_histories, :sha
11
+
12
+ add_column :easy_ml_dataset_histories, :last_datasource_sha, :string
13
+ add_index :easy_ml_dataset_histories, :last_datasource_sha
14
+
15
+ add_column :easy_ml_columns, :last_datasource_sha, :string
16
+ add_index :easy_ml_columns, :last_datasource_sha
17
+
18
+ add_column :easy_ml_column_histories, :last_datasource_sha, :string
19
+ add_index :easy_ml_column_histories, :last_datasource_sha
20
+ end
21
+ end
@@ -0,0 +1,11 @@
1
+ class RemovePreprocessorStatisticsFromEasyMLDatasets < ActiveRecord::Migration[<%= ActiveRecord::Migration.current_version %>]
2
+ def change
3
+ if column_exists?(:easy_ml_datasets, :preprocessor_statistics)
4
+ remove_column :easy_ml_datasets, :preprocessor_statistics
5
+ end
6
+
7
+ if column_exists?(:easy_ml_dataset_histories, :preprocessor_statistics)
8
+ remove_column :easy_ml_dataset_histories, :preprocessor_statistics
9
+ end
10
+ end
11
+ end
@@ -1,7 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module EasyML
4
- VERSION = "0.2.0-rc58"
4
+ VERSION = "0.2.0-rc60"
5
5
 
6
6
  module Version
7
7
  end