easy_ml 0.2.0.pre.rc91 → 0.2.0.pre.rc93

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 3a12058c269a91c130f9158e1507c58dc94ad33517aabe568a2f0bc9f78b88eb
4
- data.tar.gz: 6a37a568b6a8d8c100c21dea96487cb85494ef11ba3b30ac51aad8cab45654d7
3
+ metadata.gz: f402f4f0d35dd702c8e97c841f22681626cff3c1ac030991cde884788b31833b
4
+ data.tar.gz: 1666649bca8f2ab7bae0ec7a5faa3a4561f85db5b22ffdda92e472f7661fab85
5
5
  SHA512:
6
- metadata.gz: 4f344c00a9e2b557079943f7f6f2c4d7923dbb5b425423b81d58dbc6d63ac15d5e978d6f7e1ab2c02233fe8cc33b55d168ed1ffb0d1f2e1cfcc59670b812285d
7
- data.tar.gz: 4831eac6b35b452b300408b37695d5116b6840404cf045e85d98780d3a120ae2d267b07852bc80b6918379b70f9c04a52a712e599751e469af72a0be6e1889c4
6
+ metadata.gz: fa77810edaf759b75240f57a8f6dcc99a54043326ba92ff95c6df6732bd614bb932a5558722d690544367b21c929755b249ff8ede56581bbb6c7abe37688db63
7
+ data.tar.gz: 7df62d34d6fd6dcd83eef86ab27dea0581bcbbe5c384c9fae00399661a62a882d3f163807171128d0efcc1026cd887789f961d9ce9e99ecaace8f91432f9193b
@@ -14,18 +14,14 @@ module EasyML
14
14
  create_event(dataset, "started")
15
15
 
16
16
  dataset.unlock!
17
- dataset.prepare
18
- if dataset.features.needs_fit.any?
19
- dataset.fit_features(async: true)
20
- else
21
- dataset.after_fit_features
17
+ dataset.refreshing do
18
+ dataset.prepare
19
+ if dataset.features.needs_fit.any?
20
+ dataset.fit_features(async: true)
21
+ else
22
+ dataset.after_fit_features
23
+ end
22
24
  end
23
- rescue StandardError => e
24
- if Rails.env.test?
25
- raise e
26
- end
27
- dataset.update(workflow_status: :failed)
28
- EasyML::Event.handle_error(dataset, e)
29
25
  end
30
26
  end
31
27
  end
@@ -515,17 +515,17 @@ module EasyML
515
515
  end
516
516
 
517
517
  def cast_statement(df, df_col, expected_dtype)
518
- expected_dtype = expected_dtype.is_a?(Polars::DataType) ? expected_dtype : expected_dtype.class
518
+ expected_dtype = expected_dtype.is_a?(Polars::DataType) ? expected_dtype.class : expected_dtype
519
519
  actual_type = df[df_col].dtype
520
520
 
521
- cast_statement = case expected_dtype
522
- when Polars::Boolean
523
- case actual_type
524
- when Polars::Boolean
521
+ cast_statement = case expected_dtype.to_s
522
+ when "Polars::Boolean"
523
+ case actual_type.to_s
524
+ when "Polars::Boolean"
525
525
  Polars.col(df_col).cast(expected_dtype)
526
- when Polars::String, Polars::Categorical
526
+ when "Polars::Utf8", "Polars::Categorical", "Polars::String"
527
527
  Polars.col(df_col).eq("true").cast(expected_dtype)
528
- when Polars::Null
528
+ when "Polars::Null"
529
529
  Polars.col(df_col)
530
530
  else
531
531
  raise "Unexpected dtype: #{actual_type} for column: #{df_col}"
@@ -104,11 +104,7 @@ module EasyML
104
104
  expected_dtype = schema[df_col.to_sym]
105
105
  db_col.cast_statement(df, df_col, expected_dtype)
106
106
  end
107
- begin
108
- df = df.with_columns(cast_statements)
109
- rescue => e
110
- binding.pry
111
- end
107
+ df = df.with_columns(cast_statements)
112
108
  end
113
109
 
114
110
  def cast(processed_or_raw)
@@ -165,7 +161,8 @@ module EasyML
165
161
  end
166
162
  EasyML::Column.import(columns, on_duplicate_key_update: { columns: %i[ is_computed computed_by ] })
167
163
 
168
- lineage = cols_to_learn.flat_map do |col|
164
+ cols = where(id: cols_to_learn.map(&:id)).includes(:lineages, :feature)
165
+ lineage = cols.flat_map do |col|
169
166
  EasyML::Lineage.learn(col)
170
167
  end.compact
171
168
  EasyML::Lineage.import(lineage, on_duplicate_key_update: { columns: %i[ column_id key occurred_at description ] })
@@ -2,18 +2,15 @@ module EasyML
2
2
  class Dataset
3
3
  class Learner
4
4
  include EasyML::Timing
5
- attr_accessor :dataset, :columns, :type, :computed, :raw_columns, :statistics
5
+ attr_accessor :dataset, :columns, :all_columns, :type, :computed, :raw_columns, :statistics
6
6
 
7
7
  def initialize(dataset, type: :raw)
8
8
  @dataset = dataset
9
- @columns = dataset.columns.reload.needs_learn.sort_by(&:name)
10
-
11
- if computed
12
- @columns = @columns.computed
13
- end
14
-
15
- @columns = @columns.select(&:persisted?).reject(&:empty?)
9
+ @columns = dataset.columns.reload.needs_learn.includes(:feature).sort_by(&:name)
16
10
  @type = type
11
+ @all_columns = @columns.dup
12
+ @columns = @columns.select(&:persisted?)
13
+ @columns = @columns.select { |c| available_columns.include?(c.name) }
17
14
  end
18
15
 
19
16
  def learn
@@ -23,6 +20,10 @@ module EasyML
23
20
  save_statistics
24
21
  end
25
22
 
23
+ def available_columns
24
+ @available_columns ||= dataset.send(type).data(lazy: true).schema.keys & columns.map(&:name)
25
+ end
26
+
26
27
  private
27
28
 
28
29
  def fit_models
@@ -35,10 +36,20 @@ module EasyML
35
36
  end
36
37
  end
37
38
 
39
+ def get_sample_values
40
+ needs_sample = EasyML::Column.where(id: columns.map(&:id)).where(sample_values: nil).map(&:name)
41
+ sampleable_cols = available_columns & needs_sample
42
+ selects = sampleable_cols.map do |col|
43
+ Polars.col(col).filter(Polars.col(col).is_not_null).limit(5).alias(col)
44
+ end
45
+ df = dataset.send(type).train(all_columns: true, lazy: true).select(selects).collect.to_h.transform_values(&:to_a)
46
+ end
47
+
38
48
  def save_statistics
39
- columns.each do |col|
49
+ samples = get_sample_values
50
+ all_columns.each do |col|
40
51
  col.merge_statistics(statistics.dig(col.name))
41
- col.set_sample_values
52
+ col.assign_attributes(sample_values: samples[col.name]) if samples[col.name].present?
42
53
  col.assign_attributes(
43
54
  learned_at: EasyML::Support::UTC.now,
44
55
  last_datasource_sha: col.dataset.last_datasource_sha,
@@ -47,7 +58,7 @@ module EasyML
47
58
  )
48
59
  end
49
60
 
50
- EasyML::Column.import(columns, on_duplicate_key_update: { columns: %i[
61
+ EasyML::Column.import(all_columns, on_duplicate_key_update: { columns: %i[
51
62
  statistics
52
63
  learned_at
53
64
  sample_values
@@ -362,15 +362,13 @@ module EasyML
362
362
  update(workflow_status: "analyzing")
363
363
  fully_reload
364
364
  yield
365
- ensure
366
- unlock!
367
365
  end
368
366
  rescue => e
369
367
  update(workflow_status: "failed")
370
- e.backtrace.grep(/easy_ml/).each do |line|
371
- puts line
372
- end
368
+ EasyML::Event.handle_error(self, e)
373
369
  raise e
370
+ ensure
371
+ unlock!
374
372
  end
375
373
  end
376
374
 
@@ -797,8 +795,7 @@ module EasyML
797
795
  df = df.clone
798
796
  df = apply_features(df)
799
797
  processed.save(:train, df)
800
- learn(delete: false)
801
- learn_statistics(type: :processed, computed: true)
798
+ learn_statistics(type: :processed)
802
799
  processed.cleanup
803
800
  end
804
801
 
@@ -900,5 +897,21 @@ module EasyML
900
897
  def underscored_name
901
898
  name.gsub(/\s{2,}/, " ").gsub(/\s/, "_").downcase
902
899
  end
900
+
901
+ TIME_METHODS = %w(
902
+ refresh
903
+ prepare_features
904
+ refresh_datasource
905
+ split_data
906
+ fit
907
+ normalize_all
908
+ learn
909
+ learn_statistics
910
+ fit_features
911
+ )
912
+ include EasyML::Timing
913
+ TIME_METHODS.each do |method|
914
+ measure_method_timing method
915
+ end
903
916
  end
904
917
  end
@@ -57,11 +57,26 @@ module EasyML
57
57
  end
58
58
 
59
59
  def self.easy_ml_context(stacktrace)
60
- stacktrace.select { |loc| loc.match?(/easy_ml/) }
60
+ stacktrace.select(&MATCH_EASY_ML_CONTEXT)
61
+ end
62
+
63
+ MATCH_USER_CONTEXT = proc { |loc| !loc.match?(/features|evaluators/) && !loc.match?(/easy_ml/) }
64
+ MATCH_EASY_ML_CONTEXT = proc { |loc| loc.match?(/easy_ml/) }
65
+
66
+ def self.user_context?(stacktrace)
67
+ stacktrace.any?(&MATCH_USER_CONTEXT)
68
+ end
69
+
70
+ def self.get_context(stacktrace)
71
+ if user_context?(stacktrace)
72
+ stacktrace
73
+ else
74
+ easy_ml_context(stacktrace)
75
+ end
61
76
  end
62
77
 
63
78
  def self.called_by?(matcher)
64
- easy_ml_context(caller).any? { |line| line.match?(matcher) }
79
+ get_context(caller).any? { |line| line.match?(matcher) }
65
80
  end
66
81
 
67
82
  def self.format_stacktrace(error)
@@ -69,7 +84,7 @@ module EasyML
69
84
 
70
85
  topline = error.inspect
71
86
 
72
- stacktrace = easy_ml_context(error.backtrace)
87
+ stacktrace = get_context(error.backtrace)
73
88
 
74
89
  %(#{topline}
75
90
 
@@ -250,12 +250,14 @@ module EasyML
250
250
  if async && job_count > 1
251
251
  EasyML::ComputeFeatureJob.enqueue_ordered_batches(jobs)
252
252
  else
253
- jobs.each do |feature_batch|
253
+ feature_idx = ordered_features.index_by(&:id)
254
+ change_set = jobs.map do |feature_batch|
254
255
  feature_batch.each do |batch_args|
255
256
  EasyML::ComputeFeatureJob.perform(nil, batch_args)
256
257
  end
257
- feature = EasyML::Feature.find(feature_batch.first.dig(:feature_id))
258
+ feature = feature_idx[feature_batch.first.dig(:feature_id)]
258
259
  feature.after_fit
260
+ feature
259
261
  end
260
262
  dataset.after_fit_features
261
263
  end
@@ -380,11 +382,8 @@ module EasyML
380
382
 
381
383
  df_len_was = df.shape[0]
382
384
  orig_df = df.clone
383
- begin
384
- result = adapter.transform(df, self)
385
- rescue => e
386
- raise "Feature #{feature_class}#transform failed: #{e.message}"
387
- end
385
+ result = adapter.transform(df, self)
386
+
388
387
  raise "Feature '#{name}' must return a Polars::DataFrame, got #{result.class}" unless result.is_a?(Polars::DataFrame)
389
388
  df_len_now = result.shape[0]
390
389
  missing_columns = orig_df.columns - result.columns
@@ -18,8 +18,8 @@ module EasyML
18
18
  def learn(column)
19
19
  @lineage = EasyML::Column::Lineage.new(column).lineage
20
20
 
21
- existing_lineage = where(column_id: column.id)
22
- missing_lineage = @lineage.select { |l| !existing_lineage.exists?(key: l[:key]) }
21
+ existing_lineage = column.lineages.index_by(&:key)
22
+ missing_lineage = @lineage.select { |l| !existing_lineage.key?(l[:key].to_s) }
23
23
 
24
24
  missing_lineage = missing_lineage.map { |l|
25
25
  EasyML::Lineage.new(
@@ -29,7 +29,7 @@ module EasyML
29
29
  description: l[:description],
30
30
  )
31
31
  }
32
- existing_lineage = existing_lineage.map do |lineage|
32
+ existing_lineage = existing_lineage.map do |key, lineage|
33
33
  matching_lineage = @lineage.detect { |ll| ll[:key].to_sym == lineage.key.to_sym }
34
34
 
35
35
  lineage&.assign_attributes(
@@ -556,13 +556,15 @@ module EasyML
556
556
  self.sha = model_file.sha
557
557
  save
558
558
  dataset.upload_remote_files
559
- snapshot.tap do
560
- # Prepare the model to be retrained (reset values so they don't conflict with our snapshotted version)
561
- bump_version(force: true)
562
- dataset.bump_versions(version)
563
- self.model_file = new_model_file!
564
- save
565
- end
559
+ model_snapshot = snapshot
560
+
561
+ # Prepare the model to be retrained (reset values so they don't conflict with our snapshotted version)
562
+ bump_version(force: true)
563
+ dataset.bump_versions(version)
564
+ self.model_file = new_model_file!
565
+ save
566
+
567
+ model_snapshot
566
568
  end
567
569
 
568
570
  def cannot_deploy_reasons
@@ -11,7 +11,10 @@
11
11
  module EasyML
12
12
  class PCAModel < ActiveRecord::Base
13
13
  def model
14
- Marshal.load(read_attribute(:model))
14
+ model = read_attribute(:model)
15
+ return nil if model.nil?
16
+
17
+ Marshal.load(model)
15
18
  end
16
19
 
17
20
  def model=(model)
@@ -4,10 +4,5 @@ class AddUniqueConstraintToDatasetNames < ActiveRecord::Migration[<%= ActiveReco
4
4
  remove_index :easy_ml_datasets, :name
5
5
  end
6
6
  add_index :easy_ml_datasets, :name, unique: true
7
-
8
- if index_exists?(:easy_ml_dataset_histories, :name)
9
- remove_index :easy_ml_dataset_histories, :name
10
- end
11
- add_index :easy_ml_dataset_histories, :name, unique: true
12
7
  end
13
8
  end
@@ -19,9 +19,7 @@ module EasyML
19
19
  result = send(method_alias, *args, **kwargs, &block)
20
20
  ending = Process.clock_gettime(Process::CLOCK_MONOTONIC)
21
21
  elapsed = ending - starting
22
- 10.times do
23
- puts "#{method_name} took #{elapsed.round(2)} seconds"
24
- end
22
+ puts "#{method_name} took #{elapsed} seconds"
25
23
  # StatsD.measure("#{Rails.env}.#{prefix.present? ? "#{prefix}." : ""}#{method_name}.timing", elapsed)
26
24
  result
27
25
  end
@@ -1,7 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module EasyML
4
- VERSION = "0.2.0-rc91"
4
+ VERSION = "0.2.0-rc93"
5
5
 
6
6
  module Version
7
7
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: easy_ml
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0.pre.rc91
4
+ version: 0.2.0.pre.rc93
5
5
  platform: ruby
6
6
  authors:
7
7
  - Brett Shollenberger
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2025-03-04 00:00:00.000000000 Z
11
+ date: 2025-03-05 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: activerecord