easy_ml 0.2.0.pre.rc85 → 0.2.0.pre.rc89

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60)
  1. checksums.yaml +4 -4
  2. data/app/controllers/easy_ml/datasets_controller.rb +18 -2
  3. data/app/controllers/easy_ml/predictions_controller.rb +9 -1
  4. data/app/frontend/components/dataset/PreprocessingConfig.tsx +523 -150
  5. data/app/frontend/pages/DatasetsPage.tsx +0 -1
  6. data/app/frontend/types/dataset.ts +5 -2
  7. data/app/models/easy_ml/column/imputers/base.rb +23 -2
  8. data/app/models/easy_ml/column/imputers/embedding_encoder.rb +18 -0
  9. data/app/models/easy_ml/column/imputers/imputer.rb +1 -0
  10. data/app/models/easy_ml/column/imputers/most_frequent.rb +1 -1
  11. data/app/models/easy_ml/column/imputers/one_hot_encoder.rb +1 -1
  12. data/app/models/easy_ml/column/imputers/ordinal_encoder.rb +1 -1
  13. data/app/models/easy_ml/column/imputers.rb +47 -41
  14. data/app/models/easy_ml/column/selector.rb +2 -2
  15. data/app/models/easy_ml/column.rb +260 -56
  16. data/app/models/easy_ml/column_history.rb +6 -0
  17. data/app/models/easy_ml/column_list.rb +30 -1
  18. data/app/models/easy_ml/dataset/learner/lazy/embedding.rb +10 -0
  19. data/app/models/easy_ml/dataset/learner/lazy/query.rb +2 -0
  20. data/app/models/easy_ml/dataset/learner.rb +11 -0
  21. data/app/models/easy_ml/dataset.rb +6 -19
  22. data/app/models/easy_ml/lineage_history.rb +17 -0
  23. data/app/models/easy_ml/model.rb +11 -1
  24. data/app/models/easy_ml/models/xgboost.rb +37 -7
  25. data/app/models/easy_ml/pca_model.rb +21 -0
  26. data/app/models/easy_ml/prediction.rb +2 -1
  27. data/app/serializers/easy_ml/column_serializer.rb +13 -1
  28. data/config/initializers/inflections.rb +1 -0
  29. data/lib/easy_ml/data/dataset_manager/writer/append_only.rb +6 -8
  30. data/lib/easy_ml/data/dataset_manager/writer/base.rb +15 -2
  31. data/lib/easy_ml/data/dataset_manager/writer/partitioned.rb +0 -1
  32. data/lib/easy_ml/data/dataset_manager/writer.rb +2 -0
  33. data/lib/easy_ml/data/embeddings/compressor.rb +179 -0
  34. data/lib/easy_ml/data/embeddings/embedder.rb +226 -0
  35. data/lib/easy_ml/data/embeddings.rb +61 -0
  36. data/lib/easy_ml/data/polars_column.rb +3 -0
  37. data/lib/easy_ml/data/polars_reader.rb +54 -23
  38. data/lib/easy_ml/data/polars_schema.rb +28 -2
  39. data/lib/easy_ml/data/splits/file_split.rb +7 -2
  40. data/lib/easy_ml/data.rb +1 -0
  41. data/lib/easy_ml/embedding_store.rb +92 -0
  42. data/lib/easy_ml/engine.rb +4 -2
  43. data/lib/easy_ml/predict.rb +42 -20
  44. data/lib/easy_ml/railtie/generators/migration/migration_generator.rb +5 -0
  45. data/lib/easy_ml/railtie/templates/migration/add_is_primary_key_to_easy_ml_columns.rb.tt +9 -0
  46. data/lib/easy_ml/railtie/templates/migration/add_metadata_to_easy_ml_predictions.rb.tt +6 -0
  47. data/lib/easy_ml/railtie/templates/migration/add_pca_model_id_to_easy_ml_columns.rb.tt +9 -0
  48. data/lib/easy_ml/railtie/templates/migration/add_workflow_status_to_easy_ml_dataset_histories.rb.tt +13 -0
  49. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_pca_models.rb.tt +14 -0
  50. data/lib/easy_ml/version.rb +1 -1
  51. data/lib/easy_ml.rb +1 -0
  52. data/public/easy_ml/assets/.vite/manifest.json +2 -2
  53. data/public/easy_ml/assets/assets/Application-DfPoyRr8.css +1 -0
  54. data/public/easy_ml/assets/assets/entrypoints/Application.tsx-KENNRQpC.js +533 -0
  55. data/public/easy_ml/assets/assets/entrypoints/Application.tsx-KENNRQpC.js.map +1 -0
  56. metadata +59 -6
  57. data/lib/tasks/profile.rake +0 -40
  58. data/public/easy_ml/assets/assets/Application-nnn_XLuL.css +0 -1
  59. data/public/easy_ml/assets/assets/entrypoints/Application.tsx-CD8voxfL.js +0 -522
  60. data/public/easy_ml/assets/assets/entrypoints/Application.tsx-CD8voxfL.js.map +0 -1
data/lib/easy_ml/data/embeddings.rb ADDED
@@ -0,0 +1,61 @@
+ module EasyML
+   module Data
+     class Embeddings
+       require_relative "embeddings/compressor"
+       require_relative "embeddings/embedder"
+
+       attr_reader :df, :column, :model, :adapter, :compression,
+                   :embeddings, :compressed_embeddings, :config,
+                   :llm, :output_column, :preset, :dimensions
+
+       def initialize(options = {})
+         @df = options[:df]
+         @column = options[:column]
+         @output_column = options[:output_column]
+         @llm = options[:llm] || "openai"
+         @config = options[:config] || {}
+         @preset = options.dig(:preset)
+         @dimensions = options.dig(:dimensions)
+         @pca_model = options.dig(:pca_model)
+       end
+
+       def create
+         embed
+         compress(embeddings)
+       end
+
+       def embed
+         @embeddings ||= adapter.embed(df, column, output_column)
+       end
+
+       def compress(embeddings, fit: false)
+         @compressed_embeddings ||= compressor.compress(embeddings, column, output_column, fit: fit)
+       end
+
+       def pca_model
+         return @pca_model if @pca_model.present?
+         return @compressor.pca_model if @compressor
+
+         nil
+       end
+
+       private
+
+       def adapter
+         @adapter ||= EasyML::Data::Embeddings::Embedder.new(llm, config)
+       end
+
+       def compressor_args
+         {
+           preset: preset,
+           dimensions: dimensions,
+           pca_model: pca_model,
+         }.compact
+       end
+
+       def compressor
+         @compressor ||= EasyML::Data::Embeddings::Compressor.new(compressor_args)
+       end
+     end
+   end
+ end
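For orientation, a minimal sketch of how this new class might be driven (the DataFrame and column names are illustrative; `dimensions` is one of the compression knobs forwarded to the Compressor):

    # Hypothetical usage; assumes `df` is a Polars DataFrame with a
    # "description" column.
    embeddings = EasyML::Data::Embeddings.new(
      df: df,
      column: "description",
      output_column: "description_embedding",
      llm: "openai",        # default adapter per initialize above
      dimensions: 256       # forwarded to the Compressor via compressor_args
    )
    embeddings.create       # embeds via the adapter, then compresses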
data/lib/easy_ml/data/polars_column.rb CHANGED
@@ -13,6 +13,7 @@ module EasyML
    text: Polars::String,
    categorical: Polars::Categorical,
    null: Polars::Null,
+   array: Polars::List,
  }
  POLARS_MAP = {
    Polars::Float64 => :float,
@@ -25,6 +26,8 @@ module EasyML
    Polars::String => :string,
    Polars::Categorical => :categorical,
    Polars::Null => :null,
+   Polars::List => :array,
+   Polars::Array => :array,
  }.stringify_keys
  include EasyML::Timing

data/lib/easy_ml/data/polars_reader.rb CHANGED
@@ -88,32 +88,34 @@ module EasyML
    end

    def query(files = nil, drop_cols: [], filter: nil, limit: nil, select: nil, unique: nil, sort: nil, descending: false,
-             batch_size: nil, batch_start: nil, batch_key: nil, lazy: false, &block)
+             batch_size: nil, batch_start: nil, batch_key: nil, lazy: false, cast: nil, &block)
      files ||= self.files
      PolarsReader.query(files, drop_cols: drop_cols, filter: filter, limit: limit,
                         select: select, unique: unique, sort: sort, descending: descending,
-                        batch_size: batch_size, batch_start: batch_start, batch_key: batch_key, lazy: lazy, &block)
+                        batch_size: batch_size, batch_start: batch_start, batch_key: batch_key, lazy: lazy, cast: cast, &block)
    end

    def self.query(files, drop_cols: [], filter: nil, limit: nil, select: nil, unique: nil, sort: nil, descending: false,
-                  batch_size: nil, batch_start: nil, batch_key: nil, lazy: false, &block)
+                  batch_size: nil, batch_start: nil, batch_key: nil, lazy: false, cast: nil, &block)
      unless batch_size.present?
        result = query_files(files, drop_cols: drop_cols, filter: filter, limit: limit, select: select,
-                            unique: unique, sort: sort, descending: descending)
+                            unique: unique, sort: sort, descending: descending, cast: cast)
        return lazy ? result : result.collect
      end

-     return batch_enumerator(files, drop_cols: drop_cols, filter: filter, limit: limit, select: select, unique: unique, sort: sort, descending: descending,
-                             batch_size: batch_size, batch_start: batch_start, batch_key: batch_key) unless block_given?
+     unless block_given?
+       return batch_enumerator(files, drop_cols: drop_cols, filter: filter, limit: limit, select: select, unique: unique, sort: sort, descending: descending,
+                               batch_size: batch_size, batch_start: batch_start, batch_key: batch_key, cast: cast)
+     end

      process_batches(files, drop_cols: drop_cols, filter: filter, limit: limit, select: select, unique: unique, sort: sort, descending: descending,
-                     batch_size: batch_size, batch_start: batch_start, batch_key: batch_key, &block)
+                     batch_size: batch_size, batch_start: batch_start, batch_key: batch_key, cast: cast, &block)
    end

    private

    def self.batch_enumerator(files, drop_cols: [], filter: nil, limit: nil, select: nil, unique: nil, sort: nil, descending: false,
-                             batch_size: nil, batch_start: nil, batch_key: nil, &block)
+                             batch_size: nil, batch_start: nil, batch_key: nil, cast: nil, &block)
      Enumerator.new do |yielder|
        process_batches(files, drop_cols: drop_cols, filter: filter, limit: limit, select: select, unique: unique, sort: sort, descending: descending,
                        batch_size: batch_size, batch_start: batch_start, batch_key: batch_key) do |batch|
@@ -123,27 +125,32 @@ module EasyML
      end

    def self.process_batches(files, drop_cols: [], filter: nil, limit: nil, select: nil, unique: nil, sort: nil, descending: false,
-                            batch_size: nil, batch_start: nil, batch_key: nil, &block)
+                            batch_size: nil, batch_start: nil, batch_key: nil, cast: nil, &block)
      batch_key ||= identify_primary_key(files, select: select)
      raise "When using batch_size, sort must match primary key (#{batch_key})" if sort.present? && batch_key != sort

      sort = batch_key
-     batch_start = query_files(files, sort: sort, descending: descending, select: batch_key, limit: 1).collect[batch_key].to_a.last unless batch_start
-     final_value = query_files(files, sort: sort, descending: !descending, select: batch_key, limit: 1).collect[batch_key].to_a.last
+     batch_start ||= query_files(files, sort: sort, descending: descending, select: batch_key, cast: cast,
+                                 limit: 1).collect[batch_key].to_a.last
+     final_value = query_files(files, sort: sort, descending: !descending, select: batch_key, cast: cast,
+                               limit: 1).collect[batch_key].to_a.last

      is_first_batch = true
      current_start = batch_start

      while current_start < final_value
        filter = is_first_batch ? Polars.col(sort) >= current_start : Polars.col(sort) > current_start
-       batch = query_files(files, drop_cols: drop_cols, filter: filter, limit: batch_size, select: select, unique: unique, sort: sort, descending: descending)
+       batch = query_files(files, drop_cols: drop_cols, filter: filter, limit: batch_size, select: select,
+                           unique: unique, sort: sort, descending: descending, cast: cast)
        yield batch
-       current_start = query_files(files, sort: sort, descending: descending, limit: batch_size, filter: filter).sort(sort, reverse: !descending).limit(1).select(batch_key).collect[batch_key].to_a.last
+       current_start = query_files(files, sort: sort, descending: descending, limit: batch_size, filter: filter, cast: cast).sort(
+         sort, reverse: !descending,
+       ).limit(1).select(batch_key).collect[batch_key].to_a.last
        is_first_batch = false
      end
    end

-   def self.query_files(files, drop_cols: [], filter: nil, limit: nil, select: nil, unique: nil, sort: nil, descending: false)
+   def self.query_files(files, drop_cols: [], filter: nil, limit: nil, select: nil, unique: nil, sort: nil, descending: false, cast: nil)
      lazy_frames = to_lazy_frames(files)
      combined_lazy_df = Polars.concat(lazy_frames)

@@ -160,6 +167,32 @@ module EasyML
      drop_cols &= combined_lazy_df.columns
      combined_lazy_df = combined_lazy_df.drop(drop_cols) unless drop_cols.empty?

+     if cast && cast.keys.any?
+       schema = combined_lazy_df.schema
+       in_schema = schema.keys & cast.keys
+       cast = cast.select do |col, dtype|
+         in_schema.include?(col) && dtype != schema[col]
+       end
+       combined_lazy_df = combined_lazy_df.with_columns(
+         cast.map do |col, dtype|
+           Polars.col(col).cast(dtype).alias(col)
+         end
+       )
+     end
+
+     str_types = [Polars::Utf8, Polars::String, Polars::Categorical]
+     str_keys = combined_lazy_df.schema.select { |k, v| v.class.in?(str_types) }
+     # Cast empty strings to null
+     str_keys.each do |k, v|
+       combined_lazy_df = combined_lazy_df.with_columns(
+         Polars.when(
+           Polars.col(k).eq("")
+         ).then(nil)
+           .otherwise(Polars.col(k))
+           .alias(k)
+       )
+     end
+
      # Collect the DataFrame (execute the lazy operations)
      combined_lazy_df = combined_lazy_df.limit(limit) if limit
      combined_lazy_df
@@ -184,16 +217,12 @@ module EasyML

      if primary_keys.count > 1
        key = primary_keys.detect { |key| key.underscore.split("_").any? { |k| k.match?(/id/) } }
-       if key
-         primary_keys = [key]
-       end
+       primary_keys = [key] if key
      end

-     if primary_keys.count != 1
-       raise "Unable to determine primary key for dataset"
-     end
+     raise "Unable to determine primary key for dataset" if primary_keys.count != 1

-     return primary_keys.first
+     primary_keys.first
    end

    def self.lazy_schema(files)
@@ -249,7 +278,7 @@ module EasyML
      date_cols = (filtered[:dtypes] || {}).select { |k, v| v.class == Polars::Datetime }.keys
      filtered[:dtypes] = (filtered[:dtypes] || {}).reject { |k, v| v.class == Polars::Datetime }.compact.to_h
      filtered = filtered.select { |k, _| supported_params.include?(k) }
-     return filtered, date_cols
+     [filtered, date_cols]
    end

    def csv_files
@@ -261,7 +290,9 @@ module EasyML
    end

    def columns_to_dtypes(columns)
-     columns.reduce({}) { |h, c| h[c.name] = c.polars_type; h }
+     columns.each_with_object({}) do |c, h|
+       h[c.name] = c.polars_type
+     end
    end

    def cast(df, columns = [])
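Taken together, these hunks thread a `cast:` option through the whole query pipeline: dtype overrides are applied lazily before collection, no-op casts are skipped, and empty strings in string columns are normalized to null. A hedged sketch of a call site (file paths and column dtypes are illustrative):

    # Hypothetical call; paths and dtypes are illustrative.
    df = EasyML::Data::PolarsReader.query(
      ["data/part1.parquet", "data/part2.parquet"],
      cast: { "id" => Polars::Int64, "amount" => Polars::Float64 }
    )
    # Columns already matching their requested dtype are left untouched,
    # and "" values in string columns come back as null.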
data/lib/easy_ml/data/polars_schema.rb CHANGED
@@ -22,9 +22,9 @@ module EasyML
      schema.reduce({}) do |h, (key, type_info)|
        h.tap do
          polars_type = PolarsColumn.sym_to_polars(type_info[:type].to_sym)
-         params = type_info[:params]&.transform_keys(&:to_sym) || {}
+         params = deserialize_params(type_info[:params])

-         h[key] = polars_type.new(**params)
+         h[key] = initialize_polars_type(polars_type, params)
        end
      end
    end
@@ -38,6 +38,28 @@ module EasyML

    private

+   def self.initialize_polars_type(polars_type, params)
+     case polars_type.name
+     when "Polars::List"
+       polars_type.new(params[:inner])
+     else
+       polars_type.new(**params)
+     end
+   end
+
+   def self.deserialize_params(params)
+     params.reduce({}) do |h, (k, param)|
+       h.tap do
+         case k.to_sym
+         when :inner
+           h[:inner] = PolarsColumn.sym_to_polars(param.to_sym)
+         else
+           h[k] = param
+         end
+       end
+     end
+   end
+
    def self.dtype_params(dtype)
      case dtype
      when Polars::Categorical
@@ -47,6 +69,10 @@ module EasyML
          time_unit: dtype.time_unit,
          time_zone: dtype.time_zone,
        }
+     when Polars::List, Polars::Array
+       {
+         inner: PolarsColumn.polars_to_sym(dtype.inner),
+       }
      else
        {}
      end
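The `Polars::List` special case exists because its constructor takes the inner dtype positionally rather than as keyword params. A sketch of the round trip (serialized shape inferred from `dtype_params` above):

    # Serialization stores the inner dtype as a symbol:
    #   { type: :array, params: { inner: :float } }
    # Deserialization resolves the symbol, then constructs positionally:
    inner = PolarsColumn.sym_to_polars(:float)  # => Polars::Float64
    Polars::List.new(inner)                     # not Polars::List.new(**params)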
data/lib/easy_ml/data/splits/file_split.rb CHANGED
@@ -22,7 +22,7 @@ module EasyML
    end

    def processed?
-     dir.match?(%r{processed$})
+     dir.match?(/processed$/)
    end

    def raw?
@@ -121,8 +121,12 @@ module EasyML
    end

    def read(segment, split_ys: false, target: nil, drop_cols: [], filter: nil, limit: nil, select: nil,
-            unique: nil, sort: nil, descending: false, batch_size: nil, batch_start: nil, batch_key: nil, lazy: false, &block)
+            unique: nil, sort: nil, descending: false, batch_size: nil, batch_start: nil,
+            batch_key: nil, lazy: false, cast: true, &block)
      files = files_for_segment(segment)
+     if cast == true
+       cast = dataset.columns.cast(processed? ? :processed : :raw)
+     end
      return split_ys ? [nil, nil] : nil if files.empty?

      query_params = {
@@ -136,6 +140,7 @@ module EasyML
        batch_size: batch_size,
        batch_start: batch_start,
        batch_key: batch_key,
+       cast: cast,
        lazy: lazy,
      }.compact

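Note that `cast` defaults to `true` here, so split reads now coerce to the dataset's declared column dtypes unless the caller opts out. A sketch (the segment and target names are assumptions):

    # Reads coerce to the dataset's raw or processed dtypes by default:
    x_train, y_train = split.read(:train, split_ys: true, target: "label")
    # Pass cast: false to keep the files' on-disk dtypes unchanged:
    raw = split.read(:train, cast: false)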
data/lib/easy_ml/data.rb CHANGED
@@ -3,6 +3,7 @@ module EasyML
    require_relative "data/utils"
    require_relative "data/polars_reader"
    require_relative "data/polars_in_memory"
+   require_relative "data/embeddings"
    require_relative "data/synced_directory"
    require_relative "data/splits"
    require_relative "data/polars_column"
data/lib/easy_ml/embedding_store.rb ADDED
@@ -0,0 +1,92 @@
+ module EasyML
+   class EmbeddingStore
+     attr_reader :column, :dataset, :datasource, :full_store, :compressed_store
+
+     def initialize(column)
+       @column = column
+       @dataset = column&.dataset
+       @datasource = dataset&.datasource
+
+       @full_store = EasyML::Data::DatasetManager.new(defaults.merge!(root_dir: embedding_dir(compressed: false)))
+       @compressed_store = EasyML::Data::DatasetManager.new(defaults.merge!(root_dir: embedding_dir(compressed: true)))
+     end
+
+     def cp(old_version, new_version)
+       false
+     end
+
+     def wipe
+       full_store.wipe
+       compressed_store.wipe
+     end
+
+     def files
+       full_store.files + compressed_store.files
+     end
+
+     def empty?(compressed: false)
+       if compressed
+         compressed_store.empty?
+       else
+         full_store.empty?
+       end
+     end
+
+     def compact
+       full_store.compact
+       compressed_store.compact
+     end
+
+     def store(df, compressed: false)
+       df = df.select(column.name, column.embedding_column).filter(Polars.col(column.embedding_column).is_not_null)
+
+       if compressed
+         compressed_store.store(df)
+       else
+         full_store.store(df)
+       end
+     end
+
+     def query(**kwargs)
+       compressed = kwargs.delete(:compressed) || false
+       if compressed
+         compressed_store.query(**kwargs).filter(Polars.col(column.embedding_column).is_not_null)
+       else
+         full_store.query(**kwargs).filter(Polars.col(column.embedding_column).is_not_null)
+       end
+     end
+
+     private
+
+     def defaults
+       datasource_config = column&.dataset&.datasource&.configuration
+       if datasource_config
+         options = {
+           filenames: "embedding",
+           append_only: true,
+           primary_key: column.name,
+           s3_bucket: datasource_config.dig("s3_bucket") || EasyML::Configuration.s3_bucket,
+           s3_prefix: s3_prefix,
+           polars_args: datasource_config.dig("polars_args"),
+         }.compact
+       else
+         {}
+       end
+     end
+
+     def embedding_dir(compressed: false)
+       File.join(
+         Rails.root,
+         "easy_ml/datasets",
+         column&.dataset&.name&.parameterize&.gsub("-", "_"),
+         "embeddings",
+         compressed ? "compressed" : "full",
+         column&.name&.parameterize&.gsub("-", "_")
+       )
+     end
+
+     def s3_prefix
+       File.join("datasets", embedding_dir.split("datasets").last)
+     end
+   end
+ end
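A minimal usage sketch (assumes `column` is an EasyML::Column whose `embedding_column` is defined; the DataFrames are illustrative):

    store = EasyML::EmbeddingStore.new(column)
    store.store(df_with_embeddings)               # full-dimension vectors
    store.store(compressed_df, compressed: true)  # PCA-compressed vectors
    cached = store.query(compressed: true)        # non-null embeddings only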
data/lib/easy_ml/engine.rb CHANGED
@@ -3,14 +3,12 @@ require "awesome_print"
  require "rails/all"
  require "inertia_rails"
  require "jsonapi/serializer"
- require "numo/narray"
  require "numpy"
  require "parallel"
  require "polars-df"
  require "pycall"
  require "optuna"
  require "wandb"
- require "xgb"
  require "rails/engine"
  require "activerecord-import"
  require "historiographer"
@@ -19,6 +17,10 @@ require "rake"
  require "resque/tasks"
  require "zhong"
  require "dotenv"
+ require "langchainrb"
+ require "numo/narray"
+ require "xgb"
+ require "rumale"

  module EasyML
    class Engine < Rails::Engine
data/lib/easy_ml/predict.rb CHANGED
@@ -19,29 +19,23 @@ module EasyML

    def self.predict(model_name, df, serialize: false)
      df = normalize_input(df)
-     raw_input = df.to_hashes
-
-     df = instance.normalize(model_name, df)
-     normalized_input = df.to_hashes
-     preds = instance.predict(model_name, df)
-     current_version = instance.get_model(model_name)
+     output = make_predictions(model_name, df) do |model, normalized_df|
+       model.predict(normalized_df)
+     end

-     output = preds.zip(raw_input, normalized_input).map do |pred, raw, norm|
-       EasyML::Prediction.create!(
-         model: current_version.model,
-         model_history: current_version,
-         prediction_type: current_version.model.task,
-         prediction_value: pred,
-         raw_input: raw,
-         normalized_input: norm,
-       )
+     if serialize
+       EasyML::PredictionSerializer.new(output).serializable_hash
+     else
+       output
      end
+   end

-     output = if output.is_a?(Array) && output.count == 1
-       output.first
-     else
-       output
-     end
+   def self.predict_proba(model_name, df, serialize: false)
+     df = normalize_input(df)
+     output = make_predictions(model_name, df) do |model, normalized_df|
+       probas = model.predict_proba(normalized_df)
+       probas.map { |proba_array| proba_array.map { |p| p.round(4) } }
+     end

      if serialize
        EasyML::PredictionSerializer.new(output).serializable_hash
@@ -58,6 +52,10 @@ module EasyML
      get_model(model_name).predict(df)
    end

+   def predict_proba(model_name, df)
+     get_model(model_name).predict_proba(df)
+   end
+
    def self.validate_input(model_name, df)
      df = normalize_input(df)
      instance.get_model(model_name).dataset.validate_input(df)
@@ -82,6 +80,30 @@ module EasyML

    private

+   def self.make_predictions(model_name, df)
+     raw_input = df.to_hashes
+     normalized_df = instance.normalize(model_name, df)
+     normalized_input = normalized_df.to_hashes
+     current_version = instance.get_model(model_name)
+
+     predictions = yield(current_version, normalized_df)
+     proba = predictions.is_a?(Array) ? predictions : nil
+
+     output = predictions.zip(raw_input, normalized_input).map do |pred, raw, norm|
+       EasyML::Prediction.create!(
+         model: current_version.model,
+         model_history: current_version,
+         prediction_type: current_version.model.task,
+         prediction_value: pred,
+         raw_input: raw,
+         normalized_input: norm,
+         metadata: proba ? { probabilities: pred } : {},
+       )
+     end
+
+     output.count == 1 ? output.first : output
+   end
+
    def load_model(model_name)
      current_model = EasyML::Model.find_by!(slug: model_name).inference_version

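A sketch of the new probability API this refactor exposes (the model slug and input row are illustrative):

    # Hypothetical call; "churn_model" and the input row are illustrative.
    probs = EasyML::Predict.predict_proba("churn_model", { account_age: 12 })
    # Per make_predictions above, each persisted Prediction also carries its
    # class probabilities, rounded to 4 places, in the new metadata column.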
data/lib/easy_ml/railtie/generators/migration/migration_generator.rb CHANGED
@@ -54,6 +54,11 @@ module EasyML
      add_raw_schema_to_datasets
      remove_evaluator_from_retraining_jobs
      add_unique_constraint_to_easy_ml_model_names
+     add_is_primary_key_to_easy_ml_columns
+     create_easy_ml_pca_models
+     add_pca_model_id_to_easy_ml_columns
+     add_workflow_status_to_easy_ml_dataset_histories
+     add_metadata_to_easy_ml_predictions
    ].freeze

    # Specify the next migration number
data/lib/easy_ml/railtie/templates/migration/add_is_primary_key_to_easy_ml_columns.rb.tt ADDED
@@ -0,0 +1,9 @@
+ class AddIsPrimaryKeyToEasyMLColumns < ActiveRecord::Migration[<%= ActiveRecord::Migration.current_version %>]
+   def change
+     add_column :easy_ml_columns, :is_primary_key, :boolean
+     add_index :easy_ml_columns, :is_primary_key
+
+     add_column :easy_ml_column_histories, :is_primary_key, :boolean
+     add_index :easy_ml_column_histories, :is_primary_key
+   end
+ end
data/lib/easy_ml/railtie/templates/migration/add_metadata_to_easy_ml_predictions.rb.tt ADDED
@@ -0,0 +1,6 @@
+ class AddMetadataToEasyMLPredictions < ActiveRecord::Migration[<%= ActiveRecord::Migration.current_version %>]
+   def change
+     add_column :easy_ml_predictions, :metadata, :jsonb, default: {}, null: false
+     add_index :easy_ml_predictions, :metadata, using: :gin
+   end
+ end
data/lib/easy_ml/railtie/templates/migration/add_pca_model_id_to_easy_ml_columns.rb.tt ADDED
@@ -0,0 +1,9 @@
+ class AddPCAModelIdToEasyMLColumns < ActiveRecord::Migration[<%= ActiveRecord::Migration.current_version %>]
+   def change
+     add_column :easy_ml_columns, :pca_model_id, :integer
+     add_index :easy_ml_columns, :pca_model_id
+
+     add_column :easy_ml_column_histories, :pca_model_id, :integer
+     add_index :easy_ml_column_histories, :pca_model_id
+   end
+ end
data/lib/easy_ml/railtie/templates/migration/add_workflow_status_to_easy_ml_dataset_histories.rb.tt ADDED
@@ -0,0 +1,13 @@
+ class AddWorkflowStatusToEasyMLDatasetHistories < ActiveRecord::Migration[<%= ActiveRecord::Migration.current_version %>]
+   def change
+     unless column_exists?(:easy_ml_dataset_histories, :workflow_status)
+       add_column :easy_ml_dataset_histories, :workflow_status, :string
+       add_index :easy_ml_dataset_histories, :workflow_status
+     end
+
+     unless column_exists?(:easy_ml_feature_histories, :workflow_status)
+       add_column :easy_ml_feature_histories, :workflow_status, :string
+       add_index :easy_ml_feature_histories, :workflow_status
+     end
+   end
+ end
data/lib/easy_ml/railtie/templates/migration/create_easy_ml_pca_models.rb.tt ADDED
@@ -0,0 +1,14 @@
+ class CreateEasyMLPCAModels < ActiveRecord::Migration[<%= ActiveRecord::Migration.current_version %>]
+   def change
+     unless table_exists?(:easy_ml_pca_models)
+       create_table :easy_ml_pca_models do |t|
+         t.binary :model, null: false
+         t.datetime :fit_at
+         t.timestamps
+
+         t.index :created_at
+         t.index :fit_at
+       end
+     end
+   end
+ end
data/lib/easy_ml/version.rb CHANGED
@@ -1,7 +1,7 @@
  # frozen_string_literal: true

  module EasyML
-   VERSION = "0.2.0-rc85"
+   VERSION = "0.2.0-rc89"

    module Version
    end
data/lib/easy_ml.rb CHANGED
@@ -25,6 +25,7 @@ module EasyML
    require_relative "easy_ml/evaluators"
    require_relative "easy_ml/features"
    require_relative "easy_ml/feature_store"
+   require_relative "easy_ml/embedding_store"
    require_relative "easy_ml/core"
    require_relative "easy_ml/predict"
    require_relative "easy_ml/pending_migrations"
data/public/easy_ml/assets/.vite/manifest.json CHANGED
@@ -1,11 +1,11 @@
  {
    "entrypoints/Application.tsx": {
-     "file": "assets/entrypoints/Application.tsx-CD8voxfL.js",
+     "file": "assets/entrypoints/Application.tsx-KENNRQpC.js",
      "name": "entrypoints/Application.tsx",
      "src": "entrypoints/Application.tsx",
      "isEntry": true,
      "css": [
-       "assets/Application-nnn_XLuL.css"
+       "assets/Application-DfPoyRr8.css"
      ]
    }
  }