easy_ml 0.2.0.pre.rc45 → 0.2.0.pre.rc46
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/app/controllers/easy_ml/datasets_controller.rb +2 -1
- data/app/frontend/components/dataset/ColumnConfigModal.tsx +91 -28
- data/app/frontend/components/dataset/PreprocessingConfig.tsx +7 -2
- data/app/frontend/types/dataset.ts +1 -0
- data/app/helpers/easy_ml/application_helper.rb +3 -3
- data/app/models/easy_ml/column.rb +22 -0
- data/app/models/easy_ml/column_history.rb +1 -0
- data/app/models/easy_ml/column_list.rb +3 -0
- data/app/models/easy_ml/dataset.rb +29 -7
- data/app/models/easy_ml/event_context.rb +1 -0
- data/app/models/easy_ml/feature_history.rb +1 -0
- data/app/models/easy_ml/model_file.rb +0 -1
- data/app/models/easy_ml/model_file_history.rb +0 -1
- data/app/models/easy_ml/splitter.rb +12 -0
- data/app/models/easy_ml/splitters/date_splitter.rb +4 -0
- data/app/serializers/easy_ml/dataset_serializer.rb +4 -0
- data/config/initializers/dataframe.rb +9 -0
- data/lib/easy_ml/data/preprocessor.rb +9 -2
- data/lib/easy_ml/data/statistics_learner.rb +44 -9
- data/lib/easy_ml/engine.rb +9 -1
- data/lib/easy_ml/railtie/generators/migration/migration_generator.rb +1 -0
- data/lib/easy_ml/railtie/templates/migration/add_is_date_column_to_easy_ml_columns.rb.tt +13 -0
- data/lib/easy_ml/version.rb +1 -1
- data/public/easy_ml/assets/.vite/manifest.json +2 -2
- data/public/easy_ml/assets/assets/{Application-zpGA_Q9c.css → Application-D6L0eW4P.css} +1 -1
- data/public/easy_ml/assets/assets/entrypoints/Application.tsx-KJwHDm3F.js +474 -0
- data/public/easy_ml/assets/assets/entrypoints/{Application.tsx-BQL_uYxE.js.map → Application.tsx-KJwHDm3F.js.map} +1 -1
- metadata +21 -5
- data/public/easy_ml/assets/assets/entrypoints/Application.tsx-BQL_uYxE.js +0 -474
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 6b4c7d737a4d6775aa7bc7be7408994b4a68e76f55288563ed56990fe1aa2bb2
|
4
|
+
data.tar.gz: d1c147bbc6087489e908cb2df4f61de0021f553d4673f82955929177b4ddd754
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 9f182c7d0d9cd5f01124d4a3af4cb9c2f99f1450e3f98e77151761b77fd387cd8ee810d38c9797a8af182f44bc6896ab7789d71f2d475d962733fbd8cd8f0e5a
|
7
|
+
data.tar.gz: 1d4d930f6be09ab56838e79ae9b52a464ebbc25905644557aa2911bd1d2dfdffe96bc7563753b02a9d09c1016da050ef7c6e2ddf2124f9b11d1cddc4775f4792
|
@@ -23,7 +23,7 @@
|
|
23
23
|
module EasyML
|
24
24
|
class DatasetsController < ApplicationController
|
25
25
|
def index
|
26
|
-
datasets = Dataset.all
|
26
|
+
datasets = Dataset.all.order(id: :desc)
|
27
27
|
|
28
28
|
render inertia: "pages/DatasetsPage", props: {
|
29
29
|
datasets: datasets.map { |dataset| dataset_to_json(dataset) },
|
@@ -131,6 +131,7 @@ module EasyML
|
|
131
131
|
:datatype,
|
132
132
|
:polars_datatype,
|
133
133
|
:is_target,
|
134
|
+
:is_date_column,
|
134
135
|
:hidden,
|
135
136
|
:drop_if_null,
|
136
137
|
:sample_values,
|
@@ -10,6 +10,7 @@ import {
|
|
10
10
|
Play,
|
11
11
|
Loader2,
|
12
12
|
Sparkles,
|
13
|
+
Calendar,
|
13
14
|
} from "lucide-react";
|
14
15
|
import { PreprocessingConfig } from "./PreprocessingConfig";
|
15
16
|
import { ColumnList } from "./ColumnList";
|
@@ -24,6 +25,7 @@ import { router } from "@inertiajs/react";
|
|
24
25
|
|
25
26
|
interface ColumnConfig {
|
26
27
|
targetColumn?: string;
|
28
|
+
dateColumn?: string;
|
27
29
|
}
|
28
30
|
|
29
31
|
interface ColumnConfigModalProps {
|
@@ -45,9 +47,13 @@ export function ColumnConfigModal({
|
|
45
47
|
const [activeTab, setActiveTab] = useState<"columns" | "features">(
|
46
48
|
"columns"
|
47
49
|
);
|
50
|
+
const [activeColumnSubTab, setActiveColumnSubTab] = useState<
|
51
|
+
"target" | "date"
|
52
|
+
>("target");
|
48
53
|
const [isApplying, setIsApplying] = useState(false);
|
49
54
|
const [config, setConfig] = useState<ColumnConfig>({
|
50
55
|
targetColumn: dataset.target,
|
56
|
+
dateColumn: dataset.date_column,
|
51
57
|
});
|
52
58
|
const [selectedColumn, setSelectedColumn] = useState<string | null>(null);
|
53
59
|
const [searchQuery, setSearchQuery] = useState("");
|
@@ -127,6 +133,15 @@ export function ColumnConfigModal({
|
|
127
133
|
[dataset.columns]
|
128
134
|
);
|
129
135
|
|
136
|
+
const dateColumnOptions = useMemo(() => {
|
137
|
+
return dataset.columns
|
138
|
+
.filter((column) => column.datatype === "datetime")
|
139
|
+
.map((column) => ({
|
140
|
+
value: column.name,
|
141
|
+
label: column.name,
|
142
|
+
}));
|
143
|
+
}, [dataset.columns]);
|
144
|
+
|
130
145
|
const handleColumnSelect = (columnName: string) => {
|
131
146
|
setSelectedColumn(columnName);
|
132
147
|
};
|
@@ -146,7 +161,7 @@ export function ColumnConfigModal({
|
|
146
161
|
|
147
162
|
const setTargetColumn = (columnName: string) => {
|
148
163
|
const name = String(columnName);
|
149
|
-
setConfig({ targetColumn: columnName });
|
164
|
+
setConfig({ ...config, targetColumn: columnName });
|
150
165
|
const updatedColumns = dataset.columns.map((c) => ({
|
151
166
|
...c,
|
152
167
|
is_target: c.name === name,
|
@@ -159,6 +174,21 @@ export function ColumnConfigModal({
|
|
159
174
|
setNeedsRefresh(true);
|
160
175
|
};
|
161
176
|
|
177
|
+
const setDateColumn = (columnName: string) => {
|
178
|
+
const name = String(columnName);
|
179
|
+
setConfig((prev) => ({ ...prev, dateColumn: columnName }));
|
180
|
+
const updatedColumns = dataset.columns.map((c) => ({
|
181
|
+
...c,
|
182
|
+
is_date_column: c.name === name,
|
183
|
+
}));
|
184
|
+
|
185
|
+
setDataset({
|
186
|
+
...dataset,
|
187
|
+
columns: updatedColumns,
|
188
|
+
});
|
189
|
+
setNeedsRefresh(true);
|
190
|
+
};
|
191
|
+
|
162
192
|
const setColumnType = (columnName: string, datatype: string) => {
|
163
193
|
const updatedColumns = dataset.columns.map((c) => ({
|
164
194
|
...c,
|
@@ -285,32 +315,25 @@ export function ColumnConfigModal({
|
|
285
315
|
<div className="flex border-b shrink-0">
|
286
316
|
<button
|
287
317
|
onClick={() => setActiveTab("columns")}
|
288
|
-
className={`
|
318
|
+
className={`flex items-center gap-2 px-4 py-2 border-b-2 ${
|
289
319
|
activeTab === "columns"
|
290
320
|
? "border-blue-500 text-blue-600"
|
291
321
|
: "border-transparent text-gray-500 hover:text-gray-700"
|
292
322
|
}`}
|
293
323
|
>
|
294
|
-
<
|
295
|
-
|
296
|
-
Column Configuration
|
297
|
-
</div>
|
324
|
+
<Settings2 className="w-4 h-4" />
|
325
|
+
Preprocessing
|
298
326
|
</button>
|
299
327
|
<button
|
300
328
|
onClick={() => setActiveTab("features")}
|
301
|
-
className={`
|
329
|
+
className={`flex items-center gap-2 px-4 py-2 border-b-2 ${
|
302
330
|
activeTab === "features"
|
303
331
|
? "border-blue-500 text-blue-600"
|
304
332
|
: "border-transparent text-gray-500 hover:text-gray-700"
|
305
333
|
}`}
|
306
334
|
>
|
307
|
-
<
|
308
|
-
|
309
|
-
Features
|
310
|
-
<span className="px-1.5 py-0.5 text-xs font-medium bg-blue-100 text-blue-600 rounded-full">
|
311
|
-
{constants.feature_options.length}
|
312
|
-
</span>
|
313
|
-
</div>
|
335
|
+
<Wand2 className="w-4 h-4" />
|
336
|
+
Feature Engineering
|
314
337
|
</button>
|
315
338
|
|
316
339
|
{needsRefresh && (
|
@@ -341,20 +364,60 @@ export function ColumnConfigModal({
|
|
341
364
|
<React.Fragment>
|
342
365
|
<div className="grid grid-cols-7 flex-1 min-h-0">
|
343
366
|
<div className="col-span-3 border-r overflow-hidden flex flex-col">
|
344
|
-
<div className="p-4 border-b
|
345
|
-
<
|
346
|
-
|
347
|
-
|
348
|
-
|
349
|
-
|
350
|
-
|
351
|
-
|
352
|
-
|
353
|
-
|
354
|
-
|
355
|
-
|
356
|
-
|
357
|
-
|
367
|
+
<div className="p-4 border-b">
|
368
|
+
<div className="flex border-b">
|
369
|
+
<button
|
370
|
+
onClick={() => setActiveColumnSubTab("target")}
|
371
|
+
className={`flex items-center gap-2 px-4 py-2 border-b-2 ${
|
372
|
+
activeColumnSubTab === "target"
|
373
|
+
? "border-blue-500 text-blue-600"
|
374
|
+
: "border-transparent text-gray-500 hover:text-gray-700"
|
375
|
+
}`}
|
376
|
+
>
|
377
|
+
<Target className="w-4 h-4" />
|
378
|
+
Target Column
|
379
|
+
</button>
|
380
|
+
<button
|
381
|
+
onClick={() => setActiveColumnSubTab("date")}
|
382
|
+
className={`flex items-center gap-2 px-4 py-2 border-b-2 ${
|
383
|
+
activeColumnSubTab === "date"
|
384
|
+
? "border-blue-500 text-blue-600"
|
385
|
+
: "border-transparent text-gray-500 hover:text-gray-700"
|
386
|
+
}`}
|
387
|
+
>
|
388
|
+
<Calendar className="w-4 h-4" />
|
389
|
+
Date Column
|
390
|
+
</button>
|
391
|
+
</div>
|
392
|
+
|
393
|
+
{activeColumnSubTab === "target" ? (
|
394
|
+
<div className="mt-4">
|
395
|
+
<SearchableSelect
|
396
|
+
value={config.targetColumn || ""}
|
397
|
+
onChange={(value) => setTargetColumn(value)}
|
398
|
+
options={dataset.columns.map((column) => ({
|
399
|
+
value: column.name,
|
400
|
+
label: column.name,
|
401
|
+
}))}
|
402
|
+
placeholder="Select target column..."
|
403
|
+
/>
|
404
|
+
</div>
|
405
|
+
) : (
|
406
|
+
<div className="mt-4">
|
407
|
+
{dateColumnOptions.length > 0 ? (
|
408
|
+
<SearchableSelect
|
409
|
+
options={dateColumnOptions}
|
410
|
+
value={config.dateColumn}
|
411
|
+
onChange={setDateColumn}
|
412
|
+
placeholder="Select a date column..."
|
413
|
+
/>
|
414
|
+
) : (
|
415
|
+
<div className="text-center py-4 text-gray-500 bg-gray-50 rounded-md">
|
416
|
+
No date columns available
|
417
|
+
</div>
|
418
|
+
)}
|
419
|
+
</div>
|
420
|
+
)}
|
358
421
|
</div>
|
359
422
|
<div className="shrink-0">
|
360
423
|
<ColumnFilters
|
@@ -266,8 +266,13 @@ export function PreprocessingConfig({
|
|
266
266
|
let content;
|
267
267
|
if (strategy.method === 'most_frequent' && column.statistics?.raw.most_frequent_value) {
|
268
268
|
content = `Most Frequent Value: ${column.statistics.raw.most_frequent_value}`
|
269
|
-
} else if (strategy.method === 'ffill'
|
270
|
-
|
269
|
+
} else if (strategy.method === 'ffill') {
|
270
|
+
const lastValue = column.statistics?.raw.last_value;
|
271
|
+
if (lastValue !== undefined && lastValue !== null) {
|
272
|
+
content = `Forward Fill using Last Value: ${lastValue}`;
|
273
|
+
} else {
|
274
|
+
content = 'Set date column & apply preprocessing to see last value';
|
275
|
+
}
|
271
276
|
} else if (strategy.method === 'median' && column.statistics?.raw?.median) {
|
272
277
|
content = `Median: ${column.statistics.raw.median}`
|
273
278
|
} else if (strategy.method === 'mean' && column.statistics?.raw?.mean) {
|
@@ -2,9 +2,9 @@
|
|
2
2
|
|
3
3
|
module EasyML
|
4
4
|
module ApplicationHelper
|
5
|
-
|
6
|
-
|
7
|
-
|
5
|
+
def vite_manifest
|
6
|
+
# ViteRuby.new(root: EasyML::Engine.root).manifest
|
7
|
+
EasyML::Engine.vite_ruby.manifest
|
8
8
|
end
|
9
9
|
|
10
10
|
def prod_script_tags
|
@@ -16,6 +16,7 @@
|
|
16
16
|
# statistics :json
|
17
17
|
# created_at :datetime not null
|
18
18
|
# updated_at :datetime not null
|
19
|
+
# is_date_column :boolean default(FALSE)
|
19
20
|
#
|
20
21
|
module EasyML
|
21
22
|
class Column < ActiveRecord::Base
|
@@ -29,12 +30,15 @@ module EasyML
|
|
29
30
|
validates :name, uniqueness: { scope: :dataset_id }
|
30
31
|
|
31
32
|
before_save :ensure_valid_datatype
|
33
|
+
after_create :set_date_column_if_date_splitter
|
34
|
+
after_save :handle_date_column_change
|
32
35
|
|
33
36
|
# Scopes
|
34
37
|
scope :visible, -> { where(hidden: false) }
|
35
38
|
scope :numeric, -> { where(datatype: %w[float integer]) }
|
36
39
|
scope :categorical, -> { where(datatype: %w[categorical string boolean]) }
|
37
40
|
scope :datetime, -> { where(datatype: "datetime") }
|
41
|
+
scope :date_column, -> { where(is_date_column: true) }
|
38
42
|
|
39
43
|
def datatype=(dtype)
|
40
44
|
write_attribute(:datatype, dtype)
|
@@ -88,8 +92,26 @@ module EasyML
|
|
88
92
|
dataset.preprocessor.statistics.dup.to_h.dig(name.to_sym, :allowed_categories).sort.concat(["other"])
|
89
93
|
end
|
90
94
|
|
95
|
+
def date_column?
|
96
|
+
is_date_column
|
97
|
+
end
|
98
|
+
|
91
99
|
private
|
92
100
|
|
101
|
+
def set_date_column_if_date_splitter
|
102
|
+
binding.pry
|
103
|
+
end
|
104
|
+
|
105
|
+
def handle_date_column_change
|
106
|
+
return unless saved_change_to_is_date_column? && is_date_column?
|
107
|
+
|
108
|
+
Column.transaction do
|
109
|
+
dataset.columns.where.not(id: id).update_all(is_date_column: false)
|
110
|
+
dataset.learn_statistics
|
111
|
+
dataset.columns.sync
|
112
|
+
end
|
113
|
+
end
|
114
|
+
|
93
115
|
def ensure_valid_datatype
|
94
116
|
return if datatype.blank?
|
95
117
|
|
@@ -336,9 +336,12 @@ module EasyML
|
|
336
336
|
end
|
337
337
|
|
338
338
|
def learn_statistics
|
339
|
-
|
340
|
-
|
341
|
-
|
339
|
+
stats = {
|
340
|
+
raw: EasyML::Data::StatisticsLearner.learn(raw, self),
|
341
|
+
}
|
342
|
+
stats.merge!(processed: EasyML::Data::StatisticsLearner.learn(processed, self)) if processed.data.present?
|
343
|
+
|
344
|
+
update(statistics: stats)
|
342
345
|
end
|
343
346
|
|
344
347
|
def process_data
|
@@ -508,6 +511,10 @@ module EasyML
|
|
508
511
|
@target ||= preloaded_columns.find(&:is_target)&.name
|
509
512
|
end
|
510
513
|
|
514
|
+
def date_column
|
515
|
+
@date_column ||= columns.date_column.first
|
516
|
+
end
|
517
|
+
|
511
518
|
def drop_cols
|
512
519
|
@drop_cols ||= preloaded_columns.select(&:hidden).map(&:name)
|
513
520
|
end
|
@@ -588,8 +595,18 @@ module EasyML
|
|
588
595
|
self
|
589
596
|
end
|
590
597
|
|
598
|
+
def after_create_columns
|
599
|
+
apply_date_splitter_config
|
600
|
+
end
|
601
|
+
|
591
602
|
private
|
592
603
|
|
604
|
+
def apply_date_splitter_config
|
605
|
+
return unless splitter.date_splitter?
|
606
|
+
|
607
|
+
set_date_column(splitter.date_col)
|
608
|
+
end
|
609
|
+
|
593
610
|
def preloaded_features
|
594
611
|
@preloaded_features ||= features.includes(:dataset).load
|
595
612
|
end
|
@@ -670,10 +687,8 @@ module EasyML
|
|
670
687
|
end
|
671
688
|
end
|
672
689
|
|
673
|
-
def fit
|
674
|
-
|
675
|
-
|
676
|
-
preprocessor.fit(xs)
|
690
|
+
def fit
|
691
|
+
preprocessor.fit(raw.train(all_columns: true))
|
677
692
|
self.preprocessor_statistics = preprocessor.statistics
|
678
693
|
end
|
679
694
|
|
@@ -712,6 +727,12 @@ module EasyML
|
|
712
727
|
end
|
713
728
|
end
|
714
729
|
|
730
|
+
def set_date_column(column_name)
|
731
|
+
return unless column_name.present?
|
732
|
+
|
733
|
+
columns.find_by(name: column_name).update(is_date_column: true)
|
734
|
+
end
|
735
|
+
|
715
736
|
def apply_features(df, features = self.features)
|
716
737
|
if features.nil? || features.empty?
|
717
738
|
df
|
@@ -753,6 +774,7 @@ module EasyML
|
|
753
774
|
EasyML::Data::Preprocessor.new(
|
754
775
|
directory: Pathname.new(root_dir).append("preprocessor"),
|
755
776
|
preprocessing_steps: preprocessing_steps,
|
777
|
+
dataset: self,
|
756
778
|
).tap do |preprocessor|
|
757
779
|
preprocessor.statistics = preprocessor_statistics
|
758
780
|
end
|
@@ -52,6 +52,18 @@ module EasyML
|
|
52
52
|
add_configuration_attributes attribute
|
53
53
|
end
|
54
54
|
|
55
|
+
def date_splitter?
|
56
|
+
splitter_type == "date"
|
57
|
+
end
|
58
|
+
|
59
|
+
def random_splitter?
|
60
|
+
splitter_type == "random"
|
61
|
+
end
|
62
|
+
|
63
|
+
def predefined_splitter?
|
64
|
+
splitter_type == "predefined"
|
65
|
+
end
|
66
|
+
|
55
67
|
def self.constants
|
56
68
|
{
|
57
69
|
SPLITTER_TYPES: SPLITTER_TYPES,
|
@@ -29,6 +29,10 @@ module EasyML
|
|
29
29
|
attributes :id, :name, :description, :target, :num_rows, :status,
|
30
30
|
:datasource_id, :preprocessing_steps, :workflow_status, :statistics
|
31
31
|
|
32
|
+
attribute :date_column do |dataset|
|
33
|
+
dataset.date_column&.name
|
34
|
+
end
|
35
|
+
|
32
36
|
attribute :splitter do |dataset|
|
33
37
|
dataset.splitter
|
34
38
|
end
|
@@ -18,16 +18,19 @@ module EasyML::Data
|
|
18
18
|
|
19
19
|
PREPROCESSING_STRATEGIES = {
|
20
20
|
float: [
|
21
|
+
{ value: "ffill", label: "Forward Fill" },
|
21
22
|
{ value: "mean", label: "Mean" },
|
22
23
|
{ value: "median", label: "Median" },
|
23
24
|
{ value: "constant", label: "Constant Value" },
|
24
25
|
],
|
25
26
|
integer: [
|
27
|
+
{ value: "ffill", label: "Forward Fill" },
|
26
28
|
{ value: "mean", label: "Mean" },
|
27
29
|
{ value: "median", label: "Median" },
|
28
30
|
{ value: "constant", label: "Constant Value" },
|
29
31
|
],
|
30
32
|
boolean: [
|
33
|
+
{ value: "ffill", label: "Forward Fill" },
|
31
34
|
{ value: "most_frequent", label: "Most Frequent" },
|
32
35
|
{ value: "constant", label: "Constant Value" },
|
33
36
|
],
|
@@ -37,21 +40,24 @@ module EasyML::Data
|
|
37
40
|
{ value: "today", label: "Current Date" },
|
38
41
|
],
|
39
42
|
string: [
|
43
|
+
{ value: "ffill", label: "Forward Fill" },
|
40
44
|
{ value: "most_frequent", label: "Most Frequent" },
|
41
45
|
{ value: "constant", label: "Constant Value" },
|
42
46
|
],
|
43
47
|
text: [
|
48
|
+
{ value: "ffill", label: "Forward Fill" },
|
44
49
|
{ value: "most_frequent", label: "Most Frequent" },
|
45
50
|
{ value: "constant", label: "Constant Value" },
|
46
51
|
],
|
47
52
|
categorical: [
|
53
|
+
{ value: "ffill", label: "Forward Fill" },
|
48
54
|
{ value: "categorical", label: "Categorical" },
|
49
55
|
{ value: "most_frequent", label: "Most Frequent" },
|
50
56
|
{ value: "constant", label: "Constant Value" },
|
51
57
|
],
|
52
58
|
}.freeze
|
53
59
|
|
54
|
-
attr_accessor :directory, :verbose, :imputers, :preprocessing_steps
|
60
|
+
attr_accessor :directory, :verbose, :imputers, :preprocessing_steps, :dataset
|
55
61
|
attr_reader :statistics
|
56
62
|
|
57
63
|
def initialize(options = {})
|
@@ -59,6 +65,7 @@ module EasyML::Data
|
|
59
65
|
@verbose = options[:verbose]
|
60
66
|
@imputers = options[:imputers]
|
61
67
|
@preprocessing_steps = options[:preprocessing_steps]
|
68
|
+
@dataset = options[:dataset]
|
62
69
|
@statistics = {}
|
63
70
|
end
|
64
71
|
|
@@ -110,7 +117,7 @@ module EasyML::Data
|
|
110
117
|
df = apply_clip(df, preprocessing_steps)
|
111
118
|
allowed_categories = learn_categorical_min(df, preprocessing_steps)
|
112
119
|
|
113
|
-
self.statistics = StatisticsLearner.learn_df(df).deep_symbolize_keys
|
120
|
+
self.statistics = StatisticsLearner.learn_df(df, dataset: dataset).deep_symbolize_keys
|
114
121
|
|
115
122
|
# Merge allowed categories into statistics
|
116
123
|
allowed_categories.each do |col, categories|
|
@@ -9,28 +9,55 @@ module EasyML::Data
|
|
9
9
|
@verbose = options[:verbose]
|
10
10
|
end
|
11
11
|
|
12
|
-
def self.learn(
|
13
|
-
|
14
|
-
output[:processed] = learn_split(processed) if processed.data.present?
|
15
|
-
output
|
12
|
+
def self.learn(df, dataset = nil)
|
13
|
+
new(df, dataset).learn
|
16
14
|
end
|
17
15
|
|
18
|
-
|
16
|
+
attr_reader :df, :dataset
|
17
|
+
|
18
|
+
def initialize(df, dataset)
|
19
|
+
@df = df
|
20
|
+
@dataset = dataset
|
21
|
+
end
|
22
|
+
|
23
|
+
def learn
|
24
|
+
learn_split(df)
|
25
|
+
end
|
26
|
+
|
27
|
+
def learn_split(split)
|
19
28
|
df = split.read(:all)
|
20
29
|
train_df = split.read(:train)
|
21
|
-
all_stats = learn_df(df)
|
22
|
-
train_stats = learn_df(train_df)
|
30
|
+
all_stats = learn_df(df, dataset: dataset)
|
31
|
+
train_stats = learn_df(train_df, dataset: dataset)
|
23
32
|
|
24
33
|
all_stats.reduce({}) do |output, (k, _)|
|
25
34
|
output.tap do
|
26
35
|
output[k] = all_stats[k].slice(:num_rows, :null_count, :unique_count, :counts).merge!(
|
27
|
-
train_stats[k].slice(:mean, :median, :min, :max, :std, :last_value, :most_frequent_value)
|
36
|
+
train_stats[k].slice(:mean, :median, :min, :max, :std, :last_value, :most_frequent_value, :last_known_value)
|
28
37
|
)
|
29
38
|
end
|
30
39
|
end
|
31
40
|
end
|
32
41
|
|
33
|
-
def
|
42
|
+
def last_known_value(df, col, date_col)
|
43
|
+
return nil if df.empty? || !df.columns.include?(date_col)
|
44
|
+
|
45
|
+
# Sort by date and get the last non-null value
|
46
|
+
sorted_df = df.sort(date_col, reverse: true)
|
47
|
+
last_value = sorted_df
|
48
|
+
.filter(Polars.col(col).is_not_null)
|
49
|
+
.select(col)
|
50
|
+
.head(1)
|
51
|
+
.item
|
52
|
+
|
53
|
+
last_value
|
54
|
+
end
|
55
|
+
|
56
|
+
def learn_df(df, dataset: nil)
|
57
|
+
self.class.learn_df(df, dataset: dataset)
|
58
|
+
end
|
59
|
+
|
60
|
+
def self.learn_df(df, dataset: nil)
|
34
61
|
return if df.nil?
|
35
62
|
|
36
63
|
base_stats = describe_to_h(df).deep_symbolize_keys
|
@@ -46,6 +73,10 @@ module EasyML::Data
|
|
46
73
|
null_count: base_stats[col.to_sym][:null_count].to_i,
|
47
74
|
}
|
48
75
|
|
76
|
+
if dataset&.date_column.present?
|
77
|
+
stats[col][:last_value] = last_value(df, col, dataset.date_column.name)
|
78
|
+
end
|
79
|
+
|
49
80
|
# Add type-specific statistics
|
50
81
|
case field_type
|
51
82
|
when :integer, :float
|
@@ -77,6 +108,10 @@ module EasyML::Data
|
|
77
108
|
col.match?(/^id$/) || col.match?(/.*_id/)
|
78
109
|
end
|
79
110
|
|
111
|
+
def self.last_value(df, col, date_col)
|
112
|
+
df.filter(Polars.col(col).is_not_null).sort(date_col)[col][-1]
|
113
|
+
end
|
114
|
+
|
80
115
|
def self.describe_to_h(df)
|
81
116
|
init_h = df.describe.to_h
|
82
117
|
rows = init_h.values.map(&:to_a)
|
data/lib/easy_ml/engine.rb
CHANGED
@@ -18,11 +18,16 @@ require "resque-batched-job"
|
|
18
18
|
require "rake"
|
19
19
|
require "resque/tasks"
|
20
20
|
require "zhong"
|
21
|
+
require "vite_ruby"
|
22
|
+
require "vite_rails"
|
23
|
+
require "dotenv"
|
21
24
|
|
22
25
|
module EasyML
|
23
26
|
class Engine < Rails::Engine
|
24
27
|
isolate_namespace EasyML
|
25
28
|
|
29
|
+
Dotenv.load if File.exist?(".env")
|
30
|
+
|
26
31
|
def root_dir
|
27
32
|
Rails.root.join("easy_ml")
|
28
33
|
end
|
@@ -118,7 +123,10 @@ module EasyML
|
|
118
123
|
end
|
119
124
|
|
120
125
|
puts "Running dev proxy"
|
121
|
-
config.app_middleware.insert_before 0,
|
126
|
+
config.app_middleware.insert_before 0,
|
127
|
+
ViteRuby::DevServerProxy,
|
128
|
+
vite_ruby: vite_ruby,
|
129
|
+
ssl_verify_none: true
|
122
130
|
else
|
123
131
|
config.app_middleware.use(
|
124
132
|
Rack::Static,
|