easy_ml 0.2.0.pre.rc58 → 0.2.0.pre.rc60
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/app/controllers/easy_ml/application_controller.rb +4 -0
- data/app/controllers/easy_ml/datasets_controller.rb +32 -1
- data/app/frontend/components/DatasetPreview.tsx +50 -19
- data/app/frontend/components/dataset/ColumnConfigModal.tsx +7 -1
- data/app/frontend/components/dataset/ColumnFilters.tsx +37 -3
- data/app/frontend/components/dataset/ColumnList.tsx +14 -2
- data/app/frontend/components/dataset/PreprocessingConfig.tsx +81 -20
- data/app/frontend/types/dataset.ts +3 -0
- data/app/jobs/easy_ml/compute_feature_job.rb +0 -3
- data/app/jobs/easy_ml/refresh_dataset_job.rb +0 -6
- data/app/models/easy_ml/column/imputers/base.rb +89 -0
- data/app/models/easy_ml/column/imputers/categorical.rb +35 -0
- data/app/models/easy_ml/column/imputers/clip.rb +30 -0
- data/app/models/easy_ml/column/imputers/constant.rb +27 -0
- data/app/models/easy_ml/column/imputers/ffill.rb +29 -0
- data/app/models/easy_ml/column/imputers/imputer.rb +103 -0
- data/app/models/easy_ml/column/imputers/mean.rb +27 -0
- data/app/models/easy_ml/column/imputers/median.rb +27 -0
- data/app/models/easy_ml/column/imputers/most_frequent.rb +27 -0
- data/app/models/easy_ml/column/imputers/null_imputer.rb +15 -0
- data/app/models/easy_ml/column/imputers/one_hot_encoder.rb +30 -0
- data/app/models/easy_ml/column/imputers/ordinal_encoder.rb +78 -0
- data/app/models/easy_ml/column/imputers/today.rb +20 -0
- data/app/models/easy_ml/column/imputers.rb +126 -0
- data/app/models/easy_ml/column/learner.rb +18 -0
- data/app/models/easy_ml/column/learners/base.rb +103 -0
- data/app/models/easy_ml/column/learners/boolean.rb +11 -0
- data/app/models/easy_ml/column/learners/categorical.rb +51 -0
- data/app/models/easy_ml/column/learners/datetime.rb +19 -0
- data/app/models/easy_ml/column/learners/null.rb +22 -0
- data/app/models/easy_ml/column/learners/numeric.rb +33 -0
- data/app/models/easy_ml/column/learners/string.rb +15 -0
- data/app/models/easy_ml/column/lineage/base.rb +22 -0
- data/app/models/easy_ml/column/lineage/computed_by_feature.rb +23 -0
- data/app/models/easy_ml/column/lineage/preprocessed.rb +23 -0
- data/app/models/easy_ml/column/lineage/raw_dataset.rb +23 -0
- data/app/models/easy_ml/column/lineage.rb +28 -0
- data/app/models/easy_ml/column/selector.rb +96 -0
- data/app/models/easy_ml/column.rb +319 -52
- data/app/models/easy_ml/column_history.rb +29 -22
- data/app/models/easy_ml/column_list.rb +63 -78
- data/app/models/easy_ml/dataset.rb +128 -96
- data/app/models/easy_ml/dataset_history.rb +23 -23
- data/app/models/easy_ml/datasource.rb +3 -0
- data/app/models/easy_ml/datasource_history.rb +1 -0
- data/app/models/easy_ml/datasources/file_datasource.rb +1 -1
- data/app/models/easy_ml/datasources/polars_datasource.rb +6 -12
- data/app/models/easy_ml/datasources/s3_datasource.rb +1 -1
- data/app/models/easy_ml/feature.rb +19 -7
- data/app/models/easy_ml/feature_history.rb +12 -0
- data/app/models/easy_ml/feature_list.rb +15 -0
- data/app/serializers/easy_ml/column_serializer.rb +11 -1
- data/app/serializers/easy_ml/dataset_serializer.rb +23 -2
- data/config/initializers/enumerable.rb +17 -0
- data/lib/easy_ml/data/date_converter.rb +137 -30
- data/lib/easy_ml/data/polars_column.rb +17 -0
- data/lib/easy_ml/data/polars_in_memory.rb +30 -0
- data/lib/easy_ml/data/polars_reader.rb +20 -1
- data/lib/easy_ml/data/splits/in_memory_split.rb +3 -5
- data/lib/easy_ml/data/splits/split.rb +2 -1
- data/lib/easy_ml/data/synced_directory.rb +1 -1
- data/lib/easy_ml/data.rb +1 -2
- data/lib/easy_ml/feature_store.rb +33 -22
- data/lib/easy_ml/railtie/generators/migration/migration_generator.rb +4 -0
- data/lib/easy_ml/railtie/templates/migration/add_computed_columns_to_easy_ml_columns.rb.tt +4 -0
- data/lib/easy_ml/railtie/templates/migration/add_last_feature_sha_to_columns.rb.tt +9 -0
- data/lib/easy_ml/railtie/templates/migration/add_learned_at_to_easy_ml_columns.rb.tt +13 -0
- data/lib/easy_ml/railtie/templates/migration/add_sha_to_datasources_datasets_and_columns.rb.tt +21 -0
- data/lib/easy_ml/railtie/templates/migration/remove_preprocessor_statistics_from_easy_ml_datasets.rb.tt +11 -0
- data/lib/easy_ml/version.rb +1 -1
- data/lib/tasks/profile.rake +40 -0
- data/public/easy_ml/assets/.vite/manifest.json +2 -2
- data/public/easy_ml/assets/assets/Application-BbFobaXt.css +1 -0
- data/public/easy_ml/assets/assets/entrypoints/Application.tsx-Dni_GM8r.js +489 -0
- data/public/easy_ml/assets/assets/entrypoints/Application.tsx-Dni_GM8r.js.map +1 -0
- metadata +41 -10
- data/app/models/easy_ml/adapters/base_adapter.rb +0 -45
- data/app/models/easy_ml/adapters/polars_adapter.rb +0 -77
- data/lib/easy_ml/data/preprocessor.rb +0 -340
- data/lib/easy_ml/data/simple_imputer.rb +0 -255
- data/lib/easy_ml/data/statistics_learner.rb +0 -193
- data/public/easy_ml/assets/assets/Application-BUsRR6b6.css +0 -1
- data/public/easy_ml/assets/assets/entrypoints/Application.tsx-DmkdJsDd.js +0 -474
- data/public/easy_ml/assets/assets/entrypoints/Application.tsx-DmkdJsDd.js.map +0 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 43809758ea028bdeb30b9255c50fd951d7870cd5286d0e3ae9f0e30e09bf22a6
|
4
|
+
data.tar.gz: 5b2f0ae171a1043b8ce76dc438cab2931c2562f481de8c024beaeee8b15accfc
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 012d1c40e4a4efdf330702effe53eef60b84f8ff7fb5277d0b0030192990f8e86ab9ec7e73ec80972328425c1bf53b8ccf9a275c0e083b7b8e13eeb04be5914b
|
7
|
+
data.tar.gz: b2b504db55932b4abcf8acd06ed19ea5eb9f3c79d246ac7a3398494f7220451e886ca35196c94816356542f774c11c12ca599595af2841aa76a14c0e6304e982
|
@@ -23,6 +23,10 @@ module EasyML
|
|
23
23
|
SettingsSerializer.new(settings).serializable_hash.dig(:data, :attributes)
|
24
24
|
end
|
25
25
|
|
26
|
+
# Serializes +dataset+ with DatasetSerializer::SmallSerializer and returns
# only the attributes found under :data / :attributes of the serialized hash.
def dataset_to_json_small(dataset)
  serialized = DatasetSerializer::SmallSerializer.new(dataset).serializable_hash
  serialized.dig(:data, :attributes)
end
|
29
|
+
|
26
30
|
def dataset_to_json(dataset)
|
27
31
|
DatasetSerializer.new(dataset).serializable_hash.dig(:data, :attributes)
|
28
32
|
end
|
@@ -26,7 +26,7 @@ module EasyML
|
|
26
26
|
datasets = Dataset.all.order(id: :desc)
|
27
27
|
|
28
28
|
render inertia: "pages/DatasetsPage", props: {
|
29
|
-
datasets: datasets.map { |dataset|
|
29
|
+
datasets: datasets.map { |dataset| dataset_to_json_small(dataset) },
|
30
30
|
constants: Dataset.constants,
|
31
31
|
}
|
32
32
|
end
|
@@ -76,6 +76,37 @@ module EasyML
|
|
76
76
|
column_attrs[:preprocessing_steps] = nil if column_attrs.dig(:preprocessing_steps, :training, :method) == "none"
|
77
77
|
end
|
78
78
|
|
79
|
+
# Handle feature ID assignment for existing features
|
80
|
+
if dataset_params[:features_attributes].present?
|
81
|
+
# Clean up any feature IDs that don't exist anymore
|
82
|
+
feature_ids = dataset_params[:features_attributes].map { |attrs| attrs[:id] }.compact
|
83
|
+
existing_feature_ids = Feature.where(id: feature_ids).pluck(:id)
|
84
|
+
|
85
|
+
params[:dataset][:features_attributes].each do |attrs|
|
86
|
+
if attrs[:id].present? && !existing_feature_ids.include?(attrs[:id].to_i)
|
87
|
+
attrs.delete(:id)
|
88
|
+
end
|
89
|
+
end
|
90
|
+
|
91
|
+
# Find existing features by feature_class
|
92
|
+
feature_classes = dataset_params[:features_attributes].map { |attrs|
|
93
|
+
attrs[:feature_class] if attrs[:id].blank?
|
94
|
+
}.compact
|
95
|
+
|
96
|
+
existing_features = Feature.where(feature_class: feature_classes)
|
97
|
+
|
98
|
+
# Update params with existing feature IDs
|
99
|
+
existing_features.each do |feature|
|
100
|
+
matching_param_index = params[:dataset][:features_attributes].find_index { |attrs|
|
101
|
+
attrs[:feature_class] == feature.feature_class
|
102
|
+
}
|
103
|
+
|
104
|
+
if matching_param_index
|
105
|
+
params[:dataset][:features_attributes][matching_param_index][:id] = feature.id
|
106
|
+
end
|
107
|
+
end
|
108
|
+
end
|
109
|
+
|
79
110
|
if dataset.update(dataset_params)
|
80
111
|
flash.now[:notice] = "Dataset configuration was successfully updated."
|
81
112
|
render inertia: "pages/DatasetDetailsPage", props: {
|
@@ -58,9 +58,11 @@ export function DatasetPreview({ dataset }: DatasetPreviewProps) {
|
|
58
58
|
key={column.name}
|
59
59
|
className="bg-gray-50 rounded-lg p-4"
|
60
60
|
>
|
61
|
-
<div className="flex items-center justify-between mb-2">
|
62
|
-
<h4 className="font-medium text-gray-900">
|
63
|
-
|
61
|
+
<div className="flex items-center justify-between mb-2 gap-2">
|
62
|
+
<h4 className="font-medium text-gray-900 break-normal max-w-[70%] word-break:break-word overflow-wrap:anywhere whitespace-pre-wrap">
|
63
|
+
{column.name.split('_').join('_\u200B')}
|
64
|
+
</h4>
|
65
|
+
<span className="text-xs font-medium text-gray-500 px-2 py-1 bg-gray-200 rounded-full flex-shrink-0">
|
64
66
|
{column.datatype}
|
65
67
|
</span>
|
66
68
|
</div>
|
@@ -68,23 +70,48 @@ export function DatasetPreview({ dataset }: DatasetPreviewProps) {
|
|
68
70
|
{column.statistics && (
|
69
71
|
<div className="space-y-1">
|
70
72
|
{Object.entries(column.statistics.raw).map(([key, value]) => {
|
71
|
-
|
73
|
+
// Skip internal keys and null/undefined values
|
74
|
+
if (key === "counts" ||
|
75
|
+
key === "allowed_categories" ||
|
76
|
+
key === "value" ||
|
77
|
+
key === "label_encoder" ||
|
78
|
+
key === "label_decoder" ||
|
79
|
+
value === null ||
|
80
|
+
value === undefined) {
|
72
81
|
return null;
|
73
82
|
}
|
83
|
+
|
84
|
+
// Format the value based on its type
|
85
|
+
let displayValue: string;
|
86
|
+
if (typeof value === 'number') {
|
87
|
+
displayValue = value.toLocaleString(undefined, {
|
88
|
+
maximumFractionDigits: 2
|
89
|
+
});
|
90
|
+
} else if (typeof value === 'object') {
|
91
|
+
// Handle arrays or other objects
|
92
|
+
displayValue = JSON.stringify(value);
|
93
|
+
} else if (typeof value === 'boolean') {
|
94
|
+
displayValue = value.toString();
|
95
|
+
} else {
|
96
|
+
displayValue = String(value);
|
97
|
+
}
|
98
|
+
|
99
|
+
// Truncate long strings
|
100
|
+
if (displayValue.length > 50) {
|
101
|
+
displayValue = displayValue.slice(0, 47) + '...';
|
102
|
+
}
|
103
|
+
|
74
104
|
return (
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
</span>
|
86
|
-
</div>
|
87
|
-
)})}
|
105
|
+
<div key={key} className="flex justify-between text-sm gap-2">
|
106
|
+
<span className="text-gray-500 flex-shrink-0">
|
107
|
+
{key.charAt(0).toUpperCase() + key.slice(1)}:
|
108
|
+
</span>
|
109
|
+
<span className="font-medium text-gray-900 text-right break-all">
|
110
|
+
{displayValue}
|
111
|
+
</span>
|
112
|
+
</div>
|
113
|
+
);
|
114
|
+
})}
|
88
115
|
</div>
|
89
116
|
)}
|
90
117
|
</div>
|
@@ -142,10 +169,14 @@ export function DatasetPreview({ dataset }: DatasetPreviewProps) {
|
|
142
169
|
<tr key={i}>
|
143
170
|
{columns.map((column) => (
|
144
171
|
<td
|
145
|
-
key={
|
172
|
+
key={`${i}-${column}`}
|
146
173
|
className="whitespace-nowrap px-3 py-4 text-sm text-gray-500"
|
147
174
|
>
|
148
|
-
{row[column]
|
175
|
+
{row[column] === null || row[column] === undefined
|
176
|
+
? ''
|
177
|
+
: typeof row[column] === 'object'
|
178
|
+
? JSON.stringify(row[column])
|
179
|
+
: String(row[column])}
|
149
180
|
</td>
|
150
181
|
))}
|
151
182
|
</tr>
|
@@ -58,7 +58,7 @@ export function ColumnConfigModal({
|
|
58
58
|
const [selectedColumn, setSelectedColumn] = useState<string | null>(null);
|
59
59
|
const [searchQuery, setSearchQuery] = useState("");
|
60
60
|
const [activeFilters, setActiveFilters] = useState<{
|
61
|
-
view: "all" | "training" | "hidden" | "preprocessed" | "nulls";
|
61
|
+
view: "all" | "training" | "hidden" | "preprocessed" | "nulls" | "computed" | "required";
|
62
62
|
types: string[];
|
63
63
|
}>({
|
64
64
|
view: "all",
|
@@ -103,6 +103,10 @@ export function ColumnConfigModal({
|
|
103
103
|
return colHasPreprocessingSteps(column);
|
104
104
|
case "nulls":
|
105
105
|
return (column.statistics?.processed?.null_count || 0) > 0;
|
106
|
+
case "computed":
|
107
|
+
return column.is_computed;
|
108
|
+
case "required":
|
109
|
+
return column.required;
|
106
110
|
default:
|
107
111
|
return true;
|
108
112
|
}
|
@@ -124,6 +128,8 @@ export function ColumnConfigModal({
|
|
124
128
|
withNulls: dataset.columns.filter(
|
125
129
|
(c) => (c.statistics?.processed?.null_count || 0) > 0
|
126
130
|
).length,
|
131
|
+
computed: dataset.columns.filter((c) => c.is_computed === true).length,
|
132
|
+
required: dataset.columns.filter((c) => c.required === true).length,
|
127
133
|
}),
|
128
134
|
[dataset.columns, filteredColumns]
|
129
135
|
);
|
@@ -1,16 +1,16 @@
|
|
1
1
|
import React, { useState } from 'react';
|
2
|
-
import { Filter, Database, Wrench, Eye, EyeOff, AlertTriangle, ChevronLeft, ChevronRight } from 'lucide-react';
|
2
|
+
import { Filter, Database, Wrench, Eye, EyeOff, AlertTriangle, ChevronLeft, ChevronRight, Calculator, Target } from 'lucide-react';
|
3
3
|
import type { Column } from '../../types';
|
4
4
|
|
5
5
|
const ITEMS_PER_PAGE = 5;
|
6
6
|
interface ColumnFiltersProps {
|
7
7
|
types: string[];
|
8
8
|
activeFilters: {
|
9
|
-
view: 'all' | 'training' | 'hidden' | 'preprocessed' | 'nulls';
|
9
|
+
view: 'all' | 'training' | 'hidden' | 'preprocessed' | 'nulls' | 'computed' | 'required';
|
10
10
|
types: string[];
|
11
11
|
};
|
12
12
|
onFilterChange: (filters: {
|
13
|
-
view: 'all' | 'training' | 'hidden' | 'preprocessed' | 'nulls';
|
13
|
+
view: 'all' | 'training' | 'hidden' | 'preprocessed' | 'nulls' | 'computed' | 'required';
|
14
14
|
types: string[];
|
15
15
|
}) => void;
|
16
16
|
columnStats: {
|
@@ -20,6 +20,8 @@ interface ColumnFiltersProps {
|
|
20
20
|
hidden: number;
|
21
21
|
withPreprocessing: number;
|
22
22
|
withNulls: number;
|
23
|
+
computed: number;
|
24
|
+
required: number;
|
23
25
|
};
|
24
26
|
colHasPreprocessingSteps: (col: Column) => boolean;
|
25
27
|
columns: Column[];
|
@@ -43,6 +45,10 @@ export function ColumnFilters({
|
|
43
45
|
return `${columnStats.withPreprocessing} columns`;
|
44
46
|
case 'nulls':
|
45
47
|
return `${columnStats.withNulls} columns`;
|
48
|
+
case 'computed':
|
49
|
+
return `${columnStats.computed} columns`;
|
50
|
+
case 'required':
|
51
|
+
return `${columnStats.required} columns`;
|
46
52
|
default:
|
47
53
|
return `${columnStats.total} columns`;
|
48
54
|
}
|
@@ -158,6 +164,34 @@ export function ColumnFilters({
|
|
158
164
|
({getViewStats('nulls')})
|
159
165
|
</span>
|
160
166
|
</button>
|
167
|
+
<button
|
168
|
+
onClick={() => onFilterChange({ ...activeFilters, view: 'computed' })}
|
169
|
+
className={`inline-flex items-center gap-1 px-3 py-1.5 rounded-md text-sm font-medium ${
|
170
|
+
activeFilters.view === 'computed'
|
171
|
+
? 'bg-purple-100 text-purple-900'
|
172
|
+
: 'text-gray-600 hover:bg-gray-50'
|
173
|
+
}`}
|
174
|
+
>
|
175
|
+
<Calculator className="w-4 h-4" />
|
176
|
+
Computed
|
177
|
+
<span className="text-xs text-gray-500 ml-1">
|
178
|
+
({getViewStats('computed')})
|
179
|
+
</span>
|
180
|
+
</button>
|
181
|
+
<button
|
182
|
+
onClick={() => onFilterChange({ ...activeFilters, view: 'required' })}
|
183
|
+
className={`inline-flex items-center gap-1 px-3 py-1.5 rounded-md text-sm font-medium ${
|
184
|
+
activeFilters.view === 'required'
|
185
|
+
? 'bg-blue-100 text-blue-900'
|
186
|
+
: 'text-gray-600 hover:bg-gray-50'
|
187
|
+
}`}
|
188
|
+
>
|
189
|
+
<Target className="w-4 h-4" />
|
190
|
+
Required
|
191
|
+
<span className="text-xs text-gray-500 ml-1">
|
192
|
+
({getViewStats('required')})
|
193
|
+
</span>
|
194
|
+
</button>
|
161
195
|
</div>
|
162
196
|
|
163
197
|
{/* Column Types */}
|
@@ -1,5 +1,5 @@
|
|
1
1
|
import React from 'react';
|
2
|
-
import { Settings2, AlertCircle, Target, EyeOff, Eye } from 'lucide-react';
|
2
|
+
import { Settings2, AlertCircle, Target, EyeOff, Eye, Calculator, Star } from 'lucide-react';
|
3
3
|
import type { Column } from '../../types';
|
4
4
|
import { usePage } from "@inertiajs/react";
|
5
5
|
|
@@ -79,11 +79,23 @@ export function ColumnList({
|
|
79
79
|
</p>
|
80
80
|
)}
|
81
81
|
<div className="flex flex-wrap gap-2">
|
82
|
+
{column.required && (
|
83
|
+
<div className="flex items-center gap-1 text-blue-600">
|
84
|
+
<Star className="w-3 h-3" />
|
85
|
+
<span className="text-xs">required</span>
|
86
|
+
</div>
|
87
|
+
)}
|
88
|
+
{column.is_computed && (
|
89
|
+
<div className="flex items-center gap-1 text-purple-600">
|
90
|
+
<Calculator className="w-3 h-3" />
|
91
|
+
<span className="text-xs">computed</span>
|
92
|
+
</div>
|
93
|
+
)}
|
82
94
|
{column.preprocessing_steps && column.preprocessing_steps?.training &&
|
83
95
|
column.preprocessing_steps?.training?.method !== 'none' && (
|
84
96
|
<div className="flex items-center gap-1 text-blue-600">
|
85
97
|
<AlertCircle className="w-3 h-3" />
|
86
|
-
<span className="text-xs">
|
98
|
+
<span className="text-xs">preprocessing configured</span>
|
87
99
|
</div>
|
88
100
|
)}
|
89
101
|
{column.hidden && (
|
@@ -1,5 +1,5 @@
|
|
1
1
|
import React, { useState, useEffect } from 'react';
|
2
|
-
import { Settings2, Wrench, ArrowRight, Pencil, Trash2, Database } from 'lucide-react';
|
2
|
+
import { Settings2, Wrench, ArrowRight, Pencil, Trash2, Database, Calculator, GitBranch } from 'lucide-react';
|
3
3
|
import type { Dataset, Column, ColumnType, PreprocessingConstants, PreprocessingSteps, PreprocessingStep } from '../../types/dataset';
|
4
4
|
import { Badge } from "@/components/ui/badge";
|
5
5
|
|
@@ -332,26 +332,41 @@ export function PreprocessingConfig({
|
|
332
332
|
</div>
|
333
333
|
</div>
|
334
334
|
<div className="flex items-center gap-4 flex-shrink-0">
|
335
|
-
|
336
|
-
<
|
337
|
-
|
338
|
-
|
339
|
-
|
340
|
-
|
341
|
-
|
342
|
-
|
343
|
-
|
344
|
-
|
345
|
-
|
346
|
-
|
347
|
-
|
348
|
-
<span className="flex items-center gap-1 text-gray-700">
|
349
|
-
<Trash2 className="w-4 h-4 text-gray-400" />
|
350
|
-
Drop if null
|
351
|
-
</span>
|
352
|
-
</label>
|
335
|
+
<div className="relative flex items-center gap-2">
|
336
|
+
<div className="absolute right-0 -top-8 flex items-center gap-2">
|
337
|
+
{column.required && (
|
338
|
+
<Badge variant="secondary" className="bg-blue-100 text-blue-800">
|
339
|
+
Required
|
340
|
+
</Badge>
|
341
|
+
)}
|
342
|
+
{column.is_computed && (
|
343
|
+
<Badge variant="secondary" className="bg-purple-100 text-purple-800">
|
344
|
+
<Calculator className="w-3 h-3 mr-1" />
|
345
|
+
Computed
|
346
|
+
</Badge>
|
347
|
+
)}
|
353
348
|
</div>
|
354
|
-
|
349
|
+
{column.is_target ? (
|
350
|
+
<span className="inline-flex items-center px-3 py-1 rounded-full text-sm font-medium bg-purple-100 text-purple-800">
|
351
|
+
Target Column
|
352
|
+
</span>
|
353
|
+
) : (
|
354
|
+
<div className="flex items-center gap-2">
|
355
|
+
<label className="flex items-center gap-2 text-sm">
|
356
|
+
<input
|
357
|
+
type="checkbox"
|
358
|
+
checked={column.drop_if_null}
|
359
|
+
onChange={onToggleDropIfNull}
|
360
|
+
className="rounded border-gray-300 text-red-600 focus:ring-red-500"
|
361
|
+
/>
|
362
|
+
<span className="flex items-center gap-1 text-gray-700">
|
363
|
+
<Trash2 className="w-4 h-4 text-gray-400" />
|
364
|
+
Drop if null
|
365
|
+
</span>
|
366
|
+
</label>
|
367
|
+
</div>
|
368
|
+
)}
|
369
|
+
</div>
|
355
370
|
</div>
|
356
371
|
</div>
|
357
372
|
|
@@ -481,6 +496,52 @@ export function PreprocessingConfig({
|
|
481
496
|
)}
|
482
497
|
</div>
|
483
498
|
|
499
|
+
{/* Column Lineage Section */}
|
500
|
+
{column.lineage && column.lineage.length > 0 && (
|
501
|
+
<div className="bg-white rounded-lg border border-gray-200 p-6">
|
502
|
+
<h3 className="text-lg font-medium text-gray-900 mb-4 flex items-center gap-2">
|
503
|
+
<GitBranch className="w-5 h-5 text-gray-500" />
|
504
|
+
Column Lineage
|
505
|
+
</h3>
|
506
|
+
<div className="space-y-4">
|
507
|
+
{column.lineage.map((step, index) => (
|
508
|
+
<div key={index} className="flex items-start gap-3">
|
509
|
+
<div className={`w-8 h-8 rounded-full flex items-center justify-center flex-shrink-0 ${
|
510
|
+
step.key === 'raw_dataset'
|
511
|
+
? 'bg-gray-100'
|
512
|
+
: step.key === 'computed_by_feature'
|
513
|
+
? 'bg-purple-100'
|
514
|
+
: 'bg-blue-100'
|
515
|
+
}`}>
|
516
|
+
{step.key === 'raw_dataset' ? (
|
517
|
+
<Database className="w-4 h-4 text-gray-600" />
|
518
|
+
) : step.key === 'computed_by_feature' ? (
|
519
|
+
<Calculator className="w-4 h-4 text-purple-600" />
|
520
|
+
) : (
|
521
|
+
<Settings2 className="w-4 h-4 text-blue-600" />
|
522
|
+
)}
|
523
|
+
</div>
|
524
|
+
<div className="flex-1">
|
525
|
+
<div className="flex items-center justify-between">
|
526
|
+
<p className="text-sm font-medium text-gray-900">
|
527
|
+
{step.description}
|
528
|
+
</p>
|
529
|
+
{step.timestamp && (
|
530
|
+
<span className="text-xs text-gray-500">
|
531
|
+
{new Date(step.timestamp).toLocaleString()}
|
532
|
+
</span>
|
533
|
+
)}
|
534
|
+
</div>
|
535
|
+
{index < column.lineage.length - 1 && (
|
536
|
+
<div className="ml-4 mt-2 mb-2 w-0.5 h-4 bg-gray-200" />
|
537
|
+
)}
|
538
|
+
</div>
|
539
|
+
</div>
|
540
|
+
))}
|
541
|
+
</div>
|
542
|
+
</div>
|
543
|
+
)}
|
544
|
+
|
484
545
|
{/* Data Type Section */}
|
485
546
|
<div className="bg-white rounded-lg border border-gray-200 p-6">
|
486
547
|
<h3 className="text-lg font-medium text-gray-900 mb-4 flex items-center gap-2">
|
@@ -5,7 +5,6 @@ module EasyML
|
|
5
5
|
@queue = :easy_ml
|
6
6
|
|
7
7
|
def self.perform(batch_id, options = {})
|
8
|
-
puts "Performing compute feature job with options #{options}"
|
9
8
|
begin
|
10
9
|
options.symbolize_keys!
|
11
10
|
feature_id = options.dig(:feature_id)
|
@@ -14,7 +13,6 @@ module EasyML
|
|
14
13
|
|
15
14
|
# Check if any feature has failed before proceeding
|
16
15
|
if dataset.features.any? { |f| f.workflow_status == "failed" }
|
17
|
-
puts "Aborting feature computation due to previous feature failure"
|
18
16
|
return
|
19
17
|
end
|
20
18
|
|
@@ -40,7 +38,6 @@ module EasyML
|
|
40
38
|
end
|
41
39
|
|
42
40
|
def self.after_batch_hook(batch_id, *args)
|
43
|
-
puts "After batch!"
|
44
41
|
batch_args = fetch_batch_arguments(batch_id).flatten.map(&:symbolize_keys)
|
45
42
|
feature_ids = batch_args.pluck(:feature_id).uniq
|
46
43
|
parent_id = batch_args.pluck(:parent_batch_id).first
|
@@ -3,28 +3,22 @@ module EasyML
|
|
3
3
|
def perform(id)
|
4
4
|
begin
|
5
5
|
dataset = EasyML::Dataset.find(id)
|
6
|
-
return if dataset.workflow_status == :analyzing
|
7
6
|
|
8
7
|
puts "Refreshing dataset #{dataset.name}"
|
9
|
-
puts "Needs refresh? #{dataset.needs_refresh?}"
|
10
8
|
unless dataset.needs_refresh?
|
11
9
|
dataset.update(workflow_status: :ready)
|
12
10
|
end
|
13
11
|
|
14
12
|
create_event(dataset, "started")
|
15
13
|
|
16
|
-
puts "Prepare! #{dataset.name}"
|
17
14
|
dataset.unlock!
|
18
15
|
dataset.prepare
|
19
16
|
if dataset.features.needs_fit.any?
|
20
17
|
dataset.fit_features(async: true)
|
21
|
-
puts "Computing features!"
|
22
18
|
else
|
23
19
|
dataset.actually_refresh
|
24
|
-
puts "Done!"
|
25
20
|
end
|
26
21
|
rescue StandardError => e
|
27
|
-
puts "Error #{e.message}"
|
28
22
|
if Rails.env.test?
|
29
23
|
raise e
|
30
24
|
end
|
@@ -0,0 +1,89 @@
|
|
1
|
+
module EasyML
  class Column
    class Imputers
      # Shared behaviour for column imputers.
      #
      # Subclasses declare which preprocessing methods and params they handle
      # through the class-level +method_applies+ / +param_applies+ macros, and
      # override +transform+ to apply themselves to a data frame.
      class Base
        class << self
          # Registers +p+ as a preprocessing param handled by this imputer class.
          # The name is symbolized before it is stored in either registry so the
          # two registries (and +method_applies+) stay consistent.
          def param_applies(p)
            param_name = p.to_sym
            Imputers.supported_params << param_name
            Imputers.params_by_class[self] ||= []
            Imputers.params_by_class[self] << param_name
          end

          # Registers +m+ as a preprocessing method handled by this imputer class.
          def method_applies(m)
            method_name = m.to_sym
            Imputers.supported_methods << method_name
            Imputers.methods_by_class[self] ||= []
            Imputers.methods_by_class[self] << method_name
          end

          # Human-readable label; subclasses override this.
          def description
            "Unknown preprocessing method"
          end
        end

        attr_accessor :column, :preprocessing_step

        # @param column the column this imputer operates on
        # @param preprocessing_step [Hash] step configuration; read through the
        #   :method and :params keys
        def initialize(column, preprocessing_step)
          @column = column
          @preprocessing_step = preprocessing_step.with_indifferent_access
        end

        # True when either the step's method or one of its params was registered
        # by this imputer's class.
        def applies?
          method_applies? || param_applies?
        end

        # True when the step's :method is one this class registered.
        # A step without a :method never matches (instead of raising on nil).
        def method_applies?
          step_method = method
          return false if step_method.nil?

          imputers_own_methods.include?(step_method.to_sym)
        end

        # True when the step carries at least one param this class registered
        # whose value is not +false+.
        # A step without :params never matches (instead of raising on nil).
        def param_applies?
          step_params = params
          return false if step_params.nil?

          step_params.keys.any? do |key|
            imputers_own_params.include?(key.to_sym) && step_params[key] != false
          end
        end

        # Methods registered by this instance's class (empty when none).
        def imputers_own_methods
          Imputers.methods_by_class[self.class] || []
        end

        # Params registered by this instance's class (empty when none).
        def imputers_own_params
          Imputers.params_by_class[self.class] || []
        end

        # The step's :params entry, or nil when absent.
        def params
          @preprocessing_step.dig(:params)
        end

        # The step's :method entry, or nil when absent.
        # NOTE: this intentionally shadows Object#method for imputer instances.
        def method
          @preprocessing_step.dig(:method)
        end

        # Looks up a value in the column's statistics.
        # Computed columns read from :processed; all other columns prefer
        # :clipped and fall back to :raw.
        def statistics(*args)
          if column.is_computed
            column.statistics.dig(:processed, *args)
          else
            column.statistics.dig(:clipped, *args) || column.statistics.dig(:raw, *args)
          end
        end

        # Always true for this base class.
        def anything?
          true
        end

        # Compact debugging representation: class name, method and params.
        def inspect
          params_str = params ? params.map { |k, v| "#{k}: #{v}" }.join(", ") : "none"
          method_str = method ? method : "none"

          "#<#{self.class.name} method=#{method_str.inspect} params={#{params_str}}>"
        end

        alias_method :to_s, :inspect

        # Applies the imputer to +df+. Subclasses must override.
        # @raise [RuntimeError] always, in the base class
        def transform(df)
          raise "Method not implemented"
        end

        # Instance-level shortcut for the class description.
        def description
          self.class.description
        end
      end
    end
  end
end
|
@@ -0,0 +1,35 @@
|
|
1
|
+
module EasyML
  class Column
    class Imputers
      # Imputer registered for the :categorical method and the
      # :categorical_min param. For categorical columns it replaces every value
      # that is not in the column's allowed categories with the literal "other".
      class Categorical < Base
        method_applies :categorical
        param_applies :categorical_min

        class << self
          def description
            "Categorical imputation"
          end
        end

        # Returns +df+ untouched when the column has no allowed categories or
        # when its datatype is anything other than :categorical (boolean
        # columns included); otherwise returns the frame with the column
        # rewritten.
        def transform(df)
          categories = allowed_categories
          return df unless categories.present?
          return df unless column.datatype == :categorical

          name = column.name
          keep_value = Polars.col(name).is_in(categories)
          replaced = Polars.when(keep_value)
                           .then(Polars.col(name))
                           .otherwise(Polars.lit("other"))
                           .alias(name)
          df.with_column(replaced)
        end

        # Categories permitted for the column, as reported by the column itself.
        def allowed_categories
          column.allowed_categories
        end
      end
    end
  end
end
|
@@ -0,0 +1,30 @@
|
|
1
|
+
module EasyML
  class Column
    class Imputers
      # Imputer registered for the :clip param: bounds the column's values to
      # the configured [min, max] range.
      class Clip < Base
        attr_accessor :column, :dataset, :preprocessing_step

        param_applies :clip

        class << self
          def description
            "Clip"
          end
        end

        # Returns +df+ with the column clipped to +min+..+max+.
        def transform(df)
          name = column.name
          clipped = Polars.col(name).clip(min, max).alias(name)
          df.with_column(clipped)
        end

        # Lower bound from params[:clip][:min]; 0 when not configured.
        # NOTE(review): a default of 0 also clips negative values when only a
        # max is configured - confirm that is intended.
        def min
          lower = params.dig(:clip, :min)
          lower || 0
        end

        # Upper bound from params[:clip][:max]; unbounded when not configured.
        def max
          upper = params.dig(:clip, :max)
          upper || Float::INFINITY
        end
      end
    end
  end
end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
module EasyML
  class Column
    class Imputers
      # Imputer registered for the :constant method and the :constant param:
      # fills the column's nulls with a configured constant value.
      class Constant < Base
        method_applies :constant
        param_applies :constant

        class << self
          def description
            "Constant value imputation"
          end
        end

        # Returns +df+ untouched when no constant is configured; otherwise
        # returns the frame with the column's nulls replaced by the constant.
        # NOTE(review): the blank check also skips a constant of +false+ or an
        # empty string - confirm that is intended for boolean columns.
        def transform(df)
          fill_value = constant
          return df if fill_value.blank?

          name = column.name
          df.with_column(Polars.col(name).fill_null(fill_value).alias(name))
        end

        # The configured constant (params[:constant]), or nil when absent.
        def constant
          params[:constant]
        end
      end
    end
  end
end
|