easy_ml 0.2.0.pre.rc39 → 0.2.0.pre.rc41
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Rakefile +11 -9
- data/app/controllers/easy_ml/application_controller.rb +1 -1
- data/app/frontend/components/dataset/splitters/DateSplitter.tsx +4 -4
- data/app/frontend/components/dataset/splitters/types.ts +3 -3
- data/app/frontend/pages/NewDatasetPage.tsx +1 -1
- data/app/helpers/easy_ml/application_helper.rb +2 -2
- data/app/jobs/easy_ml/compute_feature_job.rb +54 -1
- data/app/models/concerns/easy_ml/dataframe_serialization.rb +30 -0
- data/app/models/easy_ml/dataset.rb +23 -22
- data/app/models/easy_ml/dataset_history.rb +1 -6
- data/app/models/easy_ml/datasources/polars_datasource.rb +4 -18
- data/app/models/easy_ml/event.rb +2 -1
- data/app/models/easy_ml/event_context.rb +58 -0
- data/app/models/easy_ml/feature.rb +40 -11
- data/app/models/easy_ml/model.rb +0 -1
- data/app/models/easy_ml/model_file.rb +7 -3
- data/app/models/easy_ml/splitter_history.rb +16 -0
- data/config/initializers/zhong.rb +4 -0
- data/lib/easy_ml/data/date_converter.rb +1 -0
- data/lib/easy_ml/data/polars_reader.rb +17 -4
- data/lib/easy_ml/data/statistics_learner.rb +1 -1
- data/lib/easy_ml/engine.rb +12 -1
- data/lib/easy_ml/pending_migrations.rb +19 -0
- data/lib/easy_ml/predict.rb +1 -3
- data/lib/easy_ml/railtie/generators/migration/migration_generator.rb +38 -157
- data/lib/easy_ml/railtie/templates/migration/add_workflow_status_to_easy_ml_features.rb.tt +8 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_column_histories.rb.tt +4 -2
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_columns.rb.tt +22 -20
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_dataset_histories.rb.tt +5 -3
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_datasets.rb.tt +26 -24
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_datasource_histories.rb.tt +5 -3
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_datasources.rb.tt +12 -10
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_deploys.rb.tt +21 -19
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_event_contexts.rb.tt +14 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_events.rb.tt +16 -14
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_feature_histories.rb.tt +10 -8
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_features.rb.tt +27 -25
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_model_file_histories.rb.tt +5 -3
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_model_files.rb.tt +13 -11
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_model_histories.rb.tt +5 -3
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_models.rb.tt +28 -26
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_predictions.rb.tt +13 -11
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_retraining_jobs.rb.tt +70 -67
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_settings.rb.tt +6 -4
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_splitter_histories.rb.tt +6 -4
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_splitters.rb.tt +11 -9
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_tuner_jobs.rb.tt +34 -30
- data/lib/easy_ml/version.rb +1 -1
- data/lib/easy_ml.rb +1 -0
- data/public/easy_ml/assets/.vite/manifest.json +1 -1
- data/public/easy_ml/assets/assets/entrypoints/{Application.tsx-BRRjHz4-.js → Application.tsx-DF5SSkYi.js} +2 -2
- data/public/easy_ml/assets/assets/entrypoints/{Application.tsx-BRRjHz4-.js.map → Application.tsx-DF5SSkYi.js.map} +1 -1
- metadata +9 -18
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 03333c45a1103bf7e75446a0c54d6799b62e64646abc1ab2abac123a206d1424
|
4
|
+
data.tar.gz: cb5ba985a5b8e5fd136b92e5ca5f65162d5189f06ba91bdd6e6763a69f5fbe56
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: ddd88ca06fecf8366a7e3fa370b2f78e0e73ebc4c29fefcdd6cd0b208286710d503b767d6fabcf9af5eb40105bb0ffe63ddd773278b0cc9379750dfb4763d87f
|
7
|
+
data.tar.gz: 0c3b8dfdb6d293692439a1818dca5fe1974e27039895e54f08e492bb621c563a77cc4e0921f1df58603526a0b95354741684052aa9f1df8928e5b54de6f8caac
|
data/Rakefile
CHANGED
@@ -1,6 +1,5 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
|
-
require "sprockets/railtie"
|
4
3
|
require "bundler/gem_tasks"
|
5
4
|
require "rspec/core/rake_task"
|
6
5
|
|
@@ -20,16 +19,19 @@ require_relative "lib/easy_ml"
|
|
20
19
|
# Load the annotate tasks
|
21
20
|
require "annotate/annotate_models"
|
22
21
|
|
22
|
+
require "combustion"
|
23
|
+
Combustion.path = "spec/internal"
|
24
|
+
Combustion::Application.configure_for_combustion
|
23
25
|
task :environment do
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
EasyML::Engine.eager_load!
|
26
|
+
Combustion::Application.initialize!
|
27
|
+
|
28
|
+
# Reset migrations paths so we can keep the migrations in the project root,
|
29
|
+
# not the Rails root
|
30
|
+
migrations_paths = ["spec/internal/db/migrate"]
|
31
|
+
ActiveRecord::Tasks::DatabaseTasks.migrations_paths = migrations_paths
|
32
|
+
ActiveRecord::Migrator.migrations_paths = migrations_paths
|
32
33
|
end
|
34
|
+
Combustion::Application.load_tasks
|
33
35
|
|
34
36
|
namespace :easy_ml do
|
35
37
|
task annotate_models: :environment do
|
@@ -12,7 +12,7 @@ module EasyML
|
|
12
12
|
before_action :hot_reload
|
13
13
|
|
14
14
|
def hot_reload
|
15
|
-
return unless Rails.env.development? && ENV["
|
15
|
+
return unless Rails.env.development? && ENV["EASY_ML_DEV"]
|
16
16
|
|
17
17
|
Dir[EasyML::Engine.root.join("lib/**/*")].select { |f| Pathname.new(f).extname == ".rb" }.each do |file|
|
18
18
|
load file
|
@@ -12,14 +12,14 @@ export function DateSplitter({ attributes, columns, onChange }: DateSplitterProp
|
|
12
12
|
return (
|
13
13
|
<div className="space-y-4">
|
14
14
|
<div>
|
15
|
-
<label htmlFor="
|
15
|
+
<label htmlFor="date_col" className="block text-sm font-medium text-gray-700">
|
16
16
|
Date Column
|
17
17
|
</label>
|
18
18
|
<SearchableSelect
|
19
|
-
id="
|
20
|
-
value={attributes.
|
19
|
+
id="date_col"
|
20
|
+
value={attributes.date_col}
|
21
21
|
options={columns.map(col => ({ value: col, label: col }))}
|
22
|
-
onChange={(value) => onChange({ ...attributes,
|
22
|
+
onChange={(value) => onChange({ ...attributes, date_col: value })}
|
23
23
|
placeholder="Select date column"
|
24
24
|
/>
|
25
25
|
</div>
|
@@ -18,7 +18,7 @@ export type SplitterType =
|
|
18
18
|
| 'leave_p_out';
|
19
19
|
|
20
20
|
export interface DateSplitConfig {
|
21
|
-
|
21
|
+
date_col: string;
|
22
22
|
months_test: number;
|
23
23
|
months_valid: number;
|
24
24
|
}
|
@@ -81,7 +81,7 @@ export interface ValidationResult {
|
|
81
81
|
|
82
82
|
// Validation functions for each splitter type
|
83
83
|
export const validateDateSplitter = (config: DateSplitConfig): ValidationResult => {
|
84
|
-
if (!config.
|
84
|
+
if (!config.date_col) {
|
85
85
|
return { isValid: false, error: "Please select a date column" };
|
86
86
|
}
|
87
87
|
if (!config.months_test || config.months_test <= 0) {
|
@@ -108,7 +108,7 @@ export const validateRandomSplitter = (config: RandomSplitConfig): ValidationRes
|
|
108
108
|
};
|
109
109
|
|
110
110
|
export const validatePredefinedSplitter = (config: PredefinedSplitConfig): ValidationResult => {
|
111
|
-
if (!config.
|
111
|
+
if (!config.train_files || config.train_files.length === 0) {
|
112
112
|
return { isValid: false, error: "Please select at least one file for splitting" };
|
113
113
|
}
|
114
114
|
return { isValid: true };
|
@@ -3,8 +3,8 @@
|
|
3
3
|
module EasyML
|
4
4
|
module ApplicationHelper
|
5
5
|
# Override: Returns the engine assets manifest.
|
6
|
-
def
|
7
|
-
ViteRuby.new(EasyML::Engine.root).manifest
|
6
|
+
def easy_ml_manifest
|
7
|
+
ViteRuby.new(root: EasyML::Engine.root).manifest
|
8
8
|
end
|
9
9
|
|
10
10
|
def prod_script_tags
|
@@ -1,12 +1,43 @@
|
|
1
1
|
module EasyML
|
2
2
|
class ComputeFeatureJob < BatchJob
|
3
|
+
extend EasyML::DataframeSerialization
|
4
|
+
|
3
5
|
@queue = :easy_ml
|
4
6
|
|
5
7
|
def self.perform(batch_id, options = {})
|
8
|
+
puts "processing batch_id #{batch_id}"
|
6
9
|
options.symbolize_keys!
|
7
10
|
feature_id = options.dig(:feature_id)
|
8
11
|
feature = EasyML::Feature.find(feature_id)
|
9
|
-
feature.
|
12
|
+
dataset = feature.dataset
|
13
|
+
|
14
|
+
# Check if any feature has failed before proceeding
|
15
|
+
if dataset.features.any? { |f| f.workflow_status == "failed" }
|
16
|
+
puts "Aborting feature computation due to previous feature failure"
|
17
|
+
return
|
18
|
+
end
|
19
|
+
|
20
|
+
begin
|
21
|
+
feature.fit_batch(options.merge!(batch_id: batch_id))
|
22
|
+
rescue => e
|
23
|
+
puts "Error computing feature: #{e.message}"
|
24
|
+
EasyML::Feature.transaction do
|
25
|
+
return if dataset.reload.workflow_status == :failed
|
26
|
+
|
27
|
+
puts "Logging error"
|
28
|
+
feature.update(workflow_status: :failed)
|
29
|
+
dataset.update(workflow_status: :failed)
|
30
|
+
build_error_with_context(dataset, e, batch_id, feature)
|
31
|
+
end
|
32
|
+
end
|
33
|
+
end
|
34
|
+
|
35
|
+
def self.build_error_with_context(dataset, error, batch_id, feature)
|
36
|
+
error = EasyML::Event.handle_error(dataset, error)
|
37
|
+
batch = feature.build_batch(batch_id: batch_id)
|
38
|
+
|
39
|
+
# Convert any dataframes in the context to serialized form
|
40
|
+
error.create_context(context: batch)
|
10
41
|
end
|
11
42
|
|
12
43
|
def self.after_batch_hook(batch_id, *args)
|
@@ -15,5 +46,27 @@ module EasyML
|
|
15
46
|
dataset = EasyML::Feature.find_by(id: feature_ids.first).dataset
|
16
47
|
dataset.after_fit_features
|
17
48
|
end
|
49
|
+
|
50
|
+
def self.feature_fully_processed?(feature)
|
51
|
+
end
|
52
|
+
|
53
|
+
private
|
54
|
+
|
55
|
+
def self.remove_remaining_batch_jobs(batch_id)
|
56
|
+
# Remove all remaining jobs in the batch
|
57
|
+
while (jobs = Resque.peek(:easy_ml, 0, 1000)).any?
|
58
|
+
jobs.each do |job|
|
59
|
+
if job["args"][0] == batch_id
|
60
|
+
Resque.dequeue(self, *job["args"])
|
61
|
+
end
|
62
|
+
end
|
63
|
+
|
64
|
+
# Break if we've processed all jobs (no more jobs match our batch_id)
|
65
|
+
break unless jobs.any? { |job| job["args"][0] == batch_id }
|
66
|
+
end
|
67
|
+
end
|
18
68
|
end
|
19
69
|
end
|
70
|
+
|
71
|
+
# If any feature fails, the entire batch fails
|
72
|
+
# If any feature fails, the RELATED batches should fail
|
@@ -0,0 +1,30 @@
|
|
1
|
+
module EasyML
|
2
|
+
module DataframeSerialization
|
3
|
+
extend ActiveSupport::Concern
|
4
|
+
|
5
|
+
def serialize_dataframe(df)
|
6
|
+
return unless df
|
7
|
+
JSON.parse(df.write_json)
|
8
|
+
end
|
9
|
+
|
10
|
+
def deserialize_dataframe(df_data)
|
11
|
+
return unless df_data.present? && df_data.key?("columns")
|
12
|
+
|
13
|
+
columns = df_data["columns"].map do |col|
|
14
|
+
dtype = case col["datatype"]
|
15
|
+
when Hash
|
16
|
+
if col["datatype"]["Datetime"]
|
17
|
+
Polars::Datetime.new(col["datatype"]["Datetime"][0].downcase.to_sym).class
|
18
|
+
else
|
19
|
+
Polars::Utf8
|
20
|
+
end
|
21
|
+
else
|
22
|
+
Polars.const_get(col["datatype"])
|
23
|
+
end
|
24
|
+
Polars::Series.new(col["name"], col["values"], dtype: dtype)
|
25
|
+
end
|
26
|
+
|
27
|
+
Polars::DataFrame.new(columns)
|
28
|
+
end
|
29
|
+
end
|
30
|
+
end
|
@@ -102,13 +102,9 @@ module EasyML
|
|
102
102
|
end
|
103
103
|
|
104
104
|
def root_dir
|
105
|
-
|
105
|
+
relative_dir = read_attribute(:root_dir) || default_root_dir
|
106
106
|
|
107
|
-
|
108
|
-
EasyML::Engine.root_dir.join(persisted).to_s
|
109
|
-
else
|
110
|
-
default_root_dir
|
111
|
-
end
|
107
|
+
EasyML::Engine.root_dir.join(relative_dir).to_s
|
112
108
|
end
|
113
109
|
|
114
110
|
def destructively_cleanup!
|
@@ -219,8 +215,11 @@ module EasyML
|
|
219
215
|
end
|
220
216
|
|
221
217
|
def after_fit_features
|
222
|
-
features.update_all(needs_fit: false, fit_at: Time.current)
|
223
218
|
unlock!
|
219
|
+
reload
|
220
|
+
return if failed?
|
221
|
+
|
222
|
+
features.update_all(needs_fit: false, fit_at: Time.current)
|
224
223
|
actually_refresh
|
225
224
|
end
|
226
225
|
|
@@ -281,22 +280,24 @@ module EasyML
|
|
281
280
|
end
|
282
281
|
|
283
282
|
def refreshing
|
284
|
-
|
285
|
-
|
286
|
-
|
287
|
-
|
288
|
-
|
289
|
-
|
290
|
-
|
291
|
-
|
292
|
-
|
293
|
-
|
294
|
-
|
295
|
-
|
296
|
-
|
297
|
-
|
283
|
+
begin
|
284
|
+
return false if is_history_class?
|
285
|
+
unlock! unless analyzing?
|
286
|
+
|
287
|
+
lock_dataset do
|
288
|
+
update(workflow_status: "analyzing")
|
289
|
+
fully_reload
|
290
|
+
yield
|
291
|
+
ensure
|
292
|
+
unlock!
|
293
|
+
end
|
294
|
+
rescue => e
|
295
|
+
update(workflow_status: "failed")
|
296
|
+
e.backtrace.grep(/easy_ml/).each do |line|
|
297
|
+
puts line
|
298
|
+
end
|
299
|
+
raise e
|
298
300
|
end
|
299
|
-
raise e
|
300
301
|
end
|
301
302
|
|
302
303
|
def unlock!
|
@@ -30,17 +30,12 @@ module EasyML
|
|
30
30
|
self.table_name = "easy_ml_dataset_histories"
|
31
31
|
include Historiographer::History
|
32
32
|
|
33
|
-
has_many :columns,
|
34
|
-
->(dataset_history) { where(snapshot_id: dataset_history.snapshot_id) },
|
33
|
+
has_many :columns, ->(dataset_history) { where(snapshot_id: dataset_history.snapshot_id) },
|
35
34
|
class_name: "EasyML::ColumnHistory",
|
36
35
|
foreign_key: "dataset_id",
|
37
36
|
primary_key: "dataset_id",
|
38
37
|
extend: EasyML::ColumnList
|
39
38
|
|
40
|
-
def root_dir
|
41
|
-
read_attribute(:root_dir)
|
42
|
-
end
|
43
|
-
|
44
39
|
def fit
|
45
40
|
false
|
46
41
|
end
|
@@ -1,6 +1,8 @@
|
|
1
1
|
module EasyML
|
2
2
|
module Datasources
|
3
3
|
class PolarsDatasource < BaseDatasource
|
4
|
+
include EasyML::DataframeSerialization
|
5
|
+
|
4
6
|
validates :df, presence: true
|
5
7
|
add_configuration_attributes :df
|
6
8
|
|
@@ -58,7 +60,7 @@ module EasyML
|
|
58
60
|
return unless df
|
59
61
|
|
60
62
|
datasource.configuration = (datasource.configuration || {}).merge(
|
61
|
-
"df" =>
|
63
|
+
"df" => serialize_dataframe(df),
|
62
64
|
)
|
63
65
|
end
|
64
66
|
|
@@ -66,23 +68,7 @@ module EasyML
|
|
66
68
|
return unless datasource.configuration&.key?("df")
|
67
69
|
|
68
70
|
df_data = datasource.configuration["df"]
|
69
|
-
|
70
|
-
|
71
|
-
columns = df_data["columns"].map do |col|
|
72
|
-
dtype = case col["datatype"]
|
73
|
-
when Hash
|
74
|
-
if col["datatype"]["Datetime"]
|
75
|
-
Polars::Datetime.new(col["datatype"]["Datetime"][0].downcase.to_sym).class
|
76
|
-
else
|
77
|
-
Polars::Utf8
|
78
|
-
end
|
79
|
-
else
|
80
|
-
Polars.const_get(col["datatype"])
|
81
|
-
end
|
82
|
-
Polars::Series.new(col["name"], col["values"], dtype: dtype)
|
83
|
-
end
|
84
|
-
|
85
|
-
datasource.df = Polars::DataFrame.new(columns)
|
71
|
+
datasource.df = deserialize_dataframe(df_data)
|
86
72
|
end
|
87
73
|
end
|
88
74
|
end
|
data/app/models/easy_ml/event.rb
CHANGED
@@ -19,6 +19,7 @@ module EasyML
|
|
19
19
|
STATUSES = %w[started success failed].freeze
|
20
20
|
|
21
21
|
belongs_to :eventable, polymorphic: true, optional: true
|
22
|
+
has_one :context, dependent: :destroy, class_name: "EasyML::EventContext"
|
22
23
|
|
23
24
|
validates :name, presence: true
|
24
25
|
validates :status, presence: true, inclusion: { in: STATUSES }
|
@@ -51,8 +52,8 @@ module EasyML
|
|
51
52
|
error = e
|
52
53
|
end
|
53
54
|
end
|
54
|
-
create_event(model, "failed", error)
|
55
55
|
Rails.logger.error("#{self.class.name} failed: #{error.message}")
|
56
|
+
create_event(model, "failed", error)
|
56
57
|
end
|
57
58
|
|
58
59
|
def self.format_stacktrace(error)
|
@@ -0,0 +1,58 @@
|
|
1
|
+
# == Schema Information
|
2
|
+
#
|
3
|
+
# Table name: easy_ml_event_contexts
|
4
|
+
#
|
5
|
+
# id :bigint not null, primary key
|
6
|
+
# event_id :bigint not null
|
7
|
+
# context :jsonb not null
|
8
|
+
# created_at :datetime not null
|
9
|
+
# updated_at :datetime not null
|
10
|
+
#
|
11
|
+
module EasyML
|
12
|
+
class EventContext < ActiveRecord::Base
|
13
|
+
include EasyML::DataframeSerialization
|
14
|
+
|
15
|
+
self.table_name = "easy_ml_event_contexts"
|
16
|
+
|
17
|
+
belongs_to :event
|
18
|
+
|
19
|
+
validates :context, presence: true
|
20
|
+
validates :event, presence: true
|
21
|
+
|
22
|
+
def context=(new_context)
|
23
|
+
write_attribute(:context, serialize_context(new_context))
|
24
|
+
@context = new_context
|
25
|
+
end
|
26
|
+
|
27
|
+
def context
|
28
|
+
@context ||= deserialize_context(read_attribute(:context))
|
29
|
+
end
|
30
|
+
|
31
|
+
private
|
32
|
+
|
33
|
+
def serialize_context(new_context)
|
34
|
+
case new_context
|
35
|
+
when Hash
|
36
|
+
self.format = :json
|
37
|
+
new_context.to_json
|
38
|
+
when YAML
|
39
|
+
self.format = :yaml
|
40
|
+
new_context.to_yaml
|
41
|
+
when Polars::DataFrame
|
42
|
+
self.format = :dataframe
|
43
|
+
serialize_dataframe(new_context)
|
44
|
+
end
|
45
|
+
end
|
46
|
+
|
47
|
+
def deserialize_context(context)
|
48
|
+
case format.to_sym
|
49
|
+
when :json
|
50
|
+
JSON.parse(context)
|
51
|
+
when :yaml
|
52
|
+
YAML.safe_load(context)
|
53
|
+
when :dataframe
|
54
|
+
deserialize_dataframe(context)
|
55
|
+
end
|
56
|
+
end
|
57
|
+
end
|
58
|
+
end
|
@@ -17,6 +17,7 @@
|
|
17
17
|
# refresh_every :bigint
|
18
18
|
# created_at :datetime not null
|
19
19
|
# updated_at :datetime not null
|
20
|
+
# workflow_status :string
|
20
21
|
#
|
21
22
|
module EasyML
|
22
23
|
class Feature < ActiveRecord::Base
|
@@ -24,6 +25,11 @@ module EasyML
|
|
24
25
|
include Historiographer::Silent
|
25
26
|
historiographer_mode :snapshot_only
|
26
27
|
|
28
|
+
enum workflow_status: {
|
29
|
+
analyzing: "analyzing",
|
30
|
+
ready: "ready",
|
31
|
+
failed: "failed",
|
32
|
+
}
|
27
33
|
class << self
|
28
34
|
def compute_sha(feature_class)
|
29
35
|
require "digest"
|
@@ -135,13 +141,22 @@ module EasyML
|
|
135
141
|
adapter.respond_to?(:batch) || config.dig(:batch_size).present?
|
136
142
|
end
|
137
143
|
|
144
|
+
def primary_key
|
145
|
+
pkey = config.dig(:primary_key)
|
146
|
+
if pkey.is_a?(Array)
|
147
|
+
pkey
|
148
|
+
else
|
149
|
+
[pkey]
|
150
|
+
end
|
151
|
+
end
|
152
|
+
|
138
153
|
def numeric_primary_key?
|
139
154
|
if primary_key.nil?
|
140
155
|
return false unless should_be_batchable?
|
141
156
|
raise "Couldn't find primary key for feature #{feature_class}, check your feature class"
|
142
157
|
end
|
143
158
|
|
144
|
-
dataset.raw.data(limit: 1, select: primary_key)[primary_key].to_a.flat_map(
|
159
|
+
dataset.raw.data(limit: 1, select: primary_key)[primary_key].to_a.flat_map { |h| h.respond_to?(:values) ? h.values : h }.all? do |value|
|
145
160
|
case value
|
146
161
|
when String then value.match?(/\A[-+]?\d+(\.\d+)?\z/)
|
147
162
|
else
|
@@ -171,14 +186,14 @@ module EasyML
|
|
171
186
|
unless primary_key.present?
|
172
187
|
raise "Couldn't find primary key for feature #{feature_class}, check your feature class"
|
173
188
|
end
|
174
|
-
df = reader.query(select:
|
189
|
+
df = reader.query(select: primary_key)
|
175
190
|
rescue => e
|
176
191
|
raise "Couldn't find primary key #{primary_key.first} for feature #{feature_class}: #{e.message}"
|
177
192
|
end
|
178
193
|
return [] if df.nil?
|
179
194
|
|
180
195
|
min_id = df[primary_key.first].min
|
181
|
-
max_id = df[primary_key.
|
196
|
+
max_id = df[primary_key.last].max
|
182
197
|
end
|
183
198
|
|
184
199
|
(min_id..max_id).step(batch_size).map do |batch_start|
|
@@ -196,7 +211,11 @@ module EasyML
|
|
196
211
|
end
|
197
212
|
|
198
213
|
def fit(features: [self], async: false)
|
199
|
-
|
214
|
+
# Sort features by position to ensure they're processed in order
|
215
|
+
features.update_all(workflow_status: :analyzing)
|
216
|
+
ordered_features = features.sort_by(&:feature_position)
|
217
|
+
jobs = ordered_features.flat_map(&:build_batches)
|
218
|
+
|
200
219
|
if async
|
201
220
|
EasyML::ComputeFeatureJob.enqueue_batch(jobs)
|
202
221
|
else
|
@@ -266,13 +285,11 @@ module EasyML
|
|
266
285
|
batch_df = adapter.fit(df, self, batch_args)
|
267
286
|
end
|
268
287
|
end
|
269
|
-
|
270
|
-
|
271
|
-
|
272
|
-
|
273
|
-
|
274
|
-
}.compact
|
275
|
-
update!(updates)
|
288
|
+
if batch_df.present?
|
289
|
+
store(batch_df)
|
290
|
+
else
|
291
|
+
"Feature #{feature_class}#fit should return a dataframe, received #{batch_df.class}"
|
292
|
+
end
|
276
293
|
batch_df
|
277
294
|
end
|
278
295
|
|
@@ -335,6 +352,7 @@ module EasyML
|
|
335
352
|
def apply_defaults
|
336
353
|
self.name ||= self.feature_class.demodulize.titleize
|
337
354
|
self.version ||= 1
|
355
|
+
self.workflow_status ||= :ready
|
338
356
|
end
|
339
357
|
|
340
358
|
def needs_columns
|
@@ -371,6 +389,17 @@ module EasyML
|
|
371
389
|
(should_be_batchable? ? 10_000 : nil)
|
372
390
|
end
|
373
391
|
|
392
|
+
def after_fit
|
393
|
+
updates = {
|
394
|
+
applied_at: Time.current,
|
395
|
+
needs_fit: false,
|
396
|
+
}.compact
|
397
|
+
update!(updates)
|
398
|
+
end
|
399
|
+
|
400
|
+
def fully_processed?
|
401
|
+
end
|
402
|
+
|
374
403
|
private
|
375
404
|
|
376
405
|
def bulk_update_positions(features)
|
data/app/models/easy_ml/model.rb
CHANGED
@@ -23,7 +23,7 @@ module EasyML
|
|
23
23
|
belongs_to :model, class_name: "EasyML::Model"
|
24
24
|
|
25
25
|
include EasyML::Concerns::Configurable
|
26
|
-
add_configuration_attributes :s3_bucket, :s3_prefix, :s3_region, :s3_access_key_id, :s3_secret_access_key
|
26
|
+
add_configuration_attributes :s3_bucket, :s3_prefix, :s3_region, :s3_access_key_id, :s3_secret_access_key
|
27
27
|
|
28
28
|
def synced_file
|
29
29
|
EasyML::Support::SyncedFile.new(
|
@@ -33,10 +33,14 @@ module EasyML
|
|
33
33
|
s3_region: s3_region,
|
34
34
|
s3_access_key_id: s3_access_key_id,
|
35
35
|
s3_secret_access_key: s3_secret_access_key,
|
36
|
-
root_dir:
|
36
|
+
root_dir: full_dir,
|
37
37
|
)
|
38
38
|
end
|
39
39
|
|
40
|
+
def root_dir
|
41
|
+
model.root_dir
|
42
|
+
end
|
43
|
+
|
40
44
|
def exist?
|
41
45
|
fit?
|
42
46
|
end
|
@@ -103,7 +107,7 @@ module EasyML
|
|
103
107
|
end
|
104
108
|
|
105
109
|
def relative_dir
|
106
|
-
root_dir.to_s.gsub(Regexp.new(Rails.root.to_s), "").gsub
|
110
|
+
root_dir.to_s.gsub(Regexp.new(Rails.root.to_s), "").gsub(%r{^/}, "")
|
107
111
|
end
|
108
112
|
|
109
113
|
def full_dir
|
@@ -1,3 +1,19 @@
|
|
1
|
+
# == Schema Information
|
2
|
+
#
|
3
|
+
# Table name: easy_ml_splitter_histories
|
4
|
+
#
|
5
|
+
# id :bigint not null, primary key
|
6
|
+
# splitter_id :integer not null
|
7
|
+
# splitter_type :string not null
|
8
|
+
# configuration :json
|
9
|
+
# dataset_id :integer not null
|
10
|
+
# created_at :datetime not null
|
11
|
+
# updated_at :datetime not null
|
12
|
+
# history_started_at :datetime not null
|
13
|
+
# history_ended_at :datetime
|
14
|
+
# history_user_id :integer
|
15
|
+
# snapshot_id :string
|
16
|
+
#
|
1
17
|
module EasyML
|
2
18
|
class SplitterHistory < ActiveRecord::Base
|
3
19
|
self.table_name = "easy_ml_splitter_histories"
|
@@ -3,6 +3,7 @@ module EasyML
|
|
3
3
|
module DateConverter
|
4
4
|
COMMON_DATE_FORMATS = [
|
5
5
|
"%Y-%m-%dT%H:%M:%S.%6N", # e.g., "2021-01-01T00:00:00.000000"
|
6
|
+
"%Y-%m-%d %H:%M:%S.%L Z", # e.g., "2025-01-03 23:04:49.492 Z"
|
6
7
|
"%Y-%m-%d %H:%M:%S.%L", # e.g., "2021-01-01 00:01:36.000"
|
7
8
|
"%Y-%m-%d %H:%M:%S.%L", # e.g., "2021-01-01 00:01:36.000"
|
8
9
|
"%Y-%m-%d %H:%M:%S", # e.g., "2021-01-01 00:01:36"
|