easy_ml 0.2.0.pre.rc85 → 0.2.0.pre.rc88
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/app/controllers/easy_ml/datasets_controller.rb +18 -2
- data/app/frontend/components/dataset/PreprocessingConfig.tsx +523 -150
- data/app/frontend/pages/DatasetsPage.tsx +0 -1
- data/app/frontend/types/dataset.ts +5 -2
- data/app/models/easy_ml/column/imputers/base.rb +23 -2
- data/app/models/easy_ml/column/imputers/embedding_encoder.rb +18 -0
- data/app/models/easy_ml/column/imputers/imputer.rb +1 -0
- data/app/models/easy_ml/column/imputers/most_frequent.rb +1 -1
- data/app/models/easy_ml/column/imputers/one_hot_encoder.rb +1 -1
- data/app/models/easy_ml/column/imputers/ordinal_encoder.rb +1 -1
- data/app/models/easy_ml/column/imputers.rb +47 -41
- data/app/models/easy_ml/column/selector.rb +2 -2
- data/app/models/easy_ml/column.rb +260 -56
- data/app/models/easy_ml/column_history.rb +6 -0
- data/app/models/easy_ml/column_list.rb +30 -1
- data/app/models/easy_ml/dataset/learner/lazy/embedding.rb +10 -0
- data/app/models/easy_ml/dataset/learner/lazy/query.rb +2 -0
- data/app/models/easy_ml/dataset/learner.rb +11 -0
- data/app/models/easy_ml/dataset.rb +6 -19
- data/app/models/easy_ml/lineage_history.rb +17 -0
- data/app/models/easy_ml/model.rb +11 -1
- data/app/models/easy_ml/models/xgboost.rb +37 -7
- data/app/models/easy_ml/pca_model.rb +21 -0
- data/app/models/easy_ml/prediction.rb +2 -1
- data/app/serializers/easy_ml/column_serializer.rb +13 -1
- data/config/initializers/inflections.rb +1 -0
- data/lib/easy_ml/data/dataset_manager/writer/append_only.rb +6 -8
- data/lib/easy_ml/data/dataset_manager/writer/base.rb +15 -2
- data/lib/easy_ml/data/dataset_manager/writer/partitioned.rb +0 -1
- data/lib/easy_ml/data/dataset_manager/writer.rb +2 -0
- data/lib/easy_ml/data/embeddings/compressor.rb +179 -0
- data/lib/easy_ml/data/embeddings/embedder.rb +226 -0
- data/lib/easy_ml/data/embeddings.rb +61 -0
- data/lib/easy_ml/data/polars_column.rb +3 -0
- data/lib/easy_ml/data/polars_reader.rb +54 -23
- data/lib/easy_ml/data/polars_schema.rb +28 -2
- data/lib/easy_ml/data/splits/file_split.rb +7 -2
- data/lib/easy_ml/data.rb +1 -0
- data/lib/easy_ml/embedding_store.rb +92 -0
- data/lib/easy_ml/engine.rb +4 -2
- data/lib/easy_ml/predict.rb +42 -20
- data/lib/easy_ml/railtie/generators/migration/migration_generator.rb +5 -0
- data/lib/easy_ml/railtie/templates/migration/add_is_primary_key_to_easy_ml_columns.rb.tt +9 -0
- data/lib/easy_ml/railtie/templates/migration/add_metadata_to_easy_ml_predictions.rb.tt +6 -0
- data/lib/easy_ml/railtie/templates/migration/add_pca_model_id_to_easy_ml_columns.rb.tt +9 -0
- data/lib/easy_ml/railtie/templates/migration/add_workflow_status_to_easy_ml_dataset_histories.rb.tt +13 -0
- data/lib/easy_ml/railtie/templates/migration/create_easy_ml_pca_models.rb.tt +14 -0
- data/lib/easy_ml/version.rb +1 -1
- data/lib/easy_ml.rb +1 -0
- data/public/easy_ml/assets/.vite/manifest.json +2 -2
- data/public/easy_ml/assets/assets/Application-DfPoyRr8.css +1 -0
- data/public/easy_ml/assets/assets/entrypoints/Application.tsx-KENNRQpC.js +533 -0
- data/public/easy_ml/assets/assets/entrypoints/Application.tsx-KENNRQpC.js.map +1 -0
- metadata +59 -6
- data/lib/tasks/profile.rake +0 -40
- data/public/easy_ml/assets/assets/Application-nnn_XLuL.css +0 -1
- data/public/easy_ml/assets/assets/entrypoints/Application.tsx-CD8voxfL.js +0 -522
- data/public/easy_ml/assets/assets/entrypoints/Application.tsx-CD8voxfL.js.map +0 -1
@@ -1,7 +1,8 @@
|
|
1
1
|
import React, { useState, useEffect } from 'react';
|
2
|
-
import { Settings2, Wrench, ArrowRight, Pencil, Trash2, Database, Calculator, GitBranch } from 'lucide-react';
|
2
|
+
import { Settings2, Wrench, ArrowRight, Pencil, Trash2, Database, Calculator, GitBranch, Brain, HardDrive, Maximize2, Minimize2 } from 'lucide-react';
|
3
3
|
import type { Dataset, Column, ColumnType, PreprocessingConstants, PreprocessingSteps, PreprocessingStep } from '../../types/dataset';
|
4
4
|
import { Badge } from "@/components/ui/badge";
|
5
|
+
import { SearchableSelect } from '../SearchableSelect';
|
5
6
|
|
6
7
|
interface PreprocessingConfigProps {
|
7
8
|
column: Column;
|
@@ -19,15 +20,21 @@ interface PreprocessingConfigProps {
|
|
19
20
|
const isNumericType = (type: ColumnType): boolean =>
|
20
21
|
type === 'float' || type === 'integer';
|
21
22
|
|
23
|
+
const canUseEmbedding = (type: ColumnType): boolean =>
|
24
|
+
type === 'text' || type === 'string' || type === 'categorical';
|
25
|
+
|
22
26
|
const createPreprocessingStep = (steps?: PreprocessingStep): PreprocessingStep => ({
|
23
27
|
method: steps?.method || 'none',
|
28
|
+
encoding: steps?.encoding,
|
24
29
|
params: {
|
25
|
-
constant: steps?.params?.constant,
|
26
30
|
categorical_min: steps?.params?.categorical_min ?? 100,
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
+
clip: steps?.params?.clip,
|
32
|
+
constant: steps?.params?.constant,
|
33
|
+
llm: steps?.params?.llm,
|
34
|
+
model: steps?.params?.model,
|
35
|
+
dimensions: steps?.params?.dimensions,
|
36
|
+
preset: steps?.params?.preset,
|
37
|
+
},
|
31
38
|
});
|
32
39
|
|
33
40
|
export function PreprocessingConfig({
|
@@ -64,31 +71,33 @@ export function PreprocessingConfig({
|
|
64
71
|
method: PreprocessingStep['method']
|
65
72
|
) => {
|
66
73
|
let defaultParams: PreprocessingStep['params'] = {};
|
67
|
-
|
68
|
-
|
74
|
+
const strategy = type === 'training' ? training : inference;
|
75
|
+
|
76
|
+
// Preserve existing encoding for text/string columns if it's already set
|
77
|
+
let defaultEncoding: string | null = null;
|
78
|
+
|
79
|
+
if (canUseEmbedding(selectedType) && strategy.encoding) {
|
80
|
+
// Keep the existing encoding if it's already set
|
81
|
+
defaultEncoding = strategy.encoding;
|
82
|
+
} else if (selectedType === 'categorical') {
|
69
83
|
if (method === 'categorical') {
|
70
84
|
defaultParams = {
|
71
85
|
...defaultParams,
|
72
86
|
categorical_min: 100,
|
73
|
-
one_hot: true
|
74
|
-
};
|
75
|
-
} else if (method != 'none') {
|
76
|
-
defaultParams = {
|
77
|
-
...defaultParams,
|
78
|
-
one_hot: true
|
79
87
|
};
|
88
|
+
defaultEncoding = 'one_hot';
|
89
|
+
} else if (method !== 'none') {
|
90
|
+
defaultEncoding = 'one_hot';
|
80
91
|
}
|
81
92
|
}
|
82
93
|
|
83
94
|
if (column.is_target) {
|
84
|
-
|
85
|
-
...defaultParams,
|
86
|
-
ordinal_encoding: true
|
87
|
-
};
|
95
|
+
defaultEncoding = 'ordinal';
|
88
96
|
}
|
89
97
|
|
90
98
|
const newStrategy: PreprocessingStep = {
|
91
99
|
method,
|
100
|
+
encoding: defaultEncoding,
|
92
101
|
params: defaultParams
|
93
102
|
};
|
94
103
|
|
@@ -113,8 +122,12 @@ export function PreprocessingConfig({
|
|
113
122
|
...strategy,
|
114
123
|
params: {
|
115
124
|
categorical_min: strategy.params.categorical_min,
|
116
|
-
|
117
|
-
|
125
|
+
clip: strategy.params.clip,
|
126
|
+
constant: strategy.params.constant,
|
127
|
+
llm: strategy.params.llm,
|
128
|
+
model: strategy.params.model,
|
129
|
+
dimensions: strategy.params.dimensions,
|
130
|
+
preset: strategy.params.preset,
|
118
131
|
...updates
|
119
132
|
}
|
120
133
|
};
|
@@ -177,6 +190,70 @@ export function PreprocessingConfig({
|
|
177
190
|
}
|
178
191
|
};
|
179
192
|
|
193
|
+
const handleEmbeddingParamChange = (
|
194
|
+
type: 'training' | 'inference',
|
195
|
+
updates: Partial<PreprocessingStep['params']>
|
196
|
+
) => {
|
197
|
+
const strategy = type === 'training' ? training : inference;
|
198
|
+
const setStrategy = type === 'training' ? setTraining : setInference;
|
199
|
+
|
200
|
+
const newStrategy: PreprocessingStep = {
|
201
|
+
...strategy,
|
202
|
+
params: {
|
203
|
+
...strategy.params,
|
204
|
+
...updates
|
205
|
+
}
|
206
|
+
};
|
207
|
+
|
208
|
+
setStrategy(newStrategy);
|
209
|
+
if (type === 'training') {
|
210
|
+
onUpdate(newStrategy, useDistinctInference ? inference : undefined, useDistinctInference);
|
211
|
+
} else {
|
212
|
+
onUpdate(training, newStrategy, useDistinctInference);
|
213
|
+
}
|
214
|
+
};
|
215
|
+
|
216
|
+
const handleEncodingChange = (
|
217
|
+
type: 'training' | 'inference',
|
218
|
+
encoding: string | null
|
219
|
+
) => {
|
220
|
+
const strategy = type === 'training' ? training : inference;
|
221
|
+
const setStrategy = type === 'training' ? setTraining : setInference;
|
222
|
+
|
223
|
+
let updatedParams = { ...strategy.params };
|
224
|
+
|
225
|
+
// If selecting embedding encoding, ensure we have default llm and model params
|
226
|
+
if (encoding === 'embedding') {
|
227
|
+
const embeddingConstants = constants.embedding_constants;
|
228
|
+
if (embeddingConstants) {
|
229
|
+
const defaultProvider = 'openai';
|
230
|
+
const defaultModel = (embeddingConstants.models[defaultProvider] || [])[0]?.value;
|
231
|
+
const defaultDimensions = (embeddingConstants.models[defaultProvider] || []).find(m => m.value === defaultModel)?.dimensions || 1536;
|
232
|
+
|
233
|
+
updatedParams = {
|
234
|
+
...updatedParams,
|
235
|
+
llm: updatedParams.llm || defaultProvider,
|
236
|
+
model: updatedParams.model || defaultModel,
|
237
|
+
dimensions: updatedParams.dimensions || defaultDimensions,
|
238
|
+
preset: updatedParams.preset || 'high_quality',
|
239
|
+
};
|
240
|
+
}
|
241
|
+
}
|
242
|
+
|
243
|
+
const newStrategy: PreprocessingStep = {
|
244
|
+
...strategy,
|
245
|
+
encoding: encoding === 'none' ? null : encoding,
|
246
|
+
params: updatedParams
|
247
|
+
};
|
248
|
+
|
249
|
+
setStrategy(newStrategy);
|
250
|
+
if (type === 'training') {
|
251
|
+
onUpdate(newStrategy, useDistinctInference ? inference : undefined, useDistinctInference);
|
252
|
+
} else {
|
253
|
+
onUpdate(training, newStrategy, useDistinctInference);
|
254
|
+
}
|
255
|
+
};
|
256
|
+
|
180
257
|
const renderConstantValueInput = (type: 'training' | 'inference') => {
|
181
258
|
const strategy = type === 'training' ? training : inference;
|
182
259
|
if (strategy.method !== 'constant') return null;
|
@@ -207,6 +284,330 @@ export function PreprocessingConfig({
|
|
207
284
|
);
|
208
285
|
};
|
209
286
|
|
287
|
+
const renderEncodingConfig = (type: 'training' | 'inference') => {
|
288
|
+
const strategy = type === 'training' ? training : inference;
|
289
|
+
if (!strategy || !canUseEmbedding(selectedType)) return null;
|
290
|
+
|
291
|
+
return (
|
292
|
+
<div className="mt-4 space-y-4 bg-gray-50 rounded-lg p-4">
|
293
|
+
<h4 className="text-sm font-medium text-gray-900 mb-2">Encoding</h4>
|
294
|
+
<div className="flex items-center gap-2">
|
295
|
+
<input
|
296
|
+
type="radio"
|
297
|
+
id="noneEncode"
|
298
|
+
name="encoding"
|
299
|
+
checked={strategy.encoding === null}
|
300
|
+
onChange={() => handleEncodingChange(type, 'none')}
|
301
|
+
className="rounded border-gray-300 text-blue-600 focus:ring-blue-500"
|
302
|
+
/>
|
303
|
+
<label htmlFor="noneEncode" className="text-sm text-gray-700">
|
304
|
+
No encoding
|
305
|
+
</label>
|
306
|
+
</div>
|
307
|
+
{selectedType === 'categorical' && (
|
308
|
+
<>
|
309
|
+
<div className="flex items-center gap-2">
|
310
|
+
<input
|
311
|
+
type="radio"
|
312
|
+
id="oneHotEncode"
|
313
|
+
name="encoding"
|
314
|
+
checked={strategy.encoding === 'one_hot'}
|
315
|
+
onChange={() => handleEncodingChange(type, 'one_hot')}
|
316
|
+
className="rounded border-gray-300 text-blue-600 focus:ring-blue-500"
|
317
|
+
/>
|
318
|
+
<label htmlFor="oneHotEncode" className="text-sm text-gray-700">
|
319
|
+
One-hot encode categories
|
320
|
+
</label>
|
321
|
+
</div>
|
322
|
+
<div className="flex items-center gap-2">
|
323
|
+
<input
|
324
|
+
type="radio"
|
325
|
+
id="ordinalEncode"
|
326
|
+
name="encoding"
|
327
|
+
checked={strategy.encoding === 'ordinal'}
|
328
|
+
onChange={() => handleEncodingChange(type, 'ordinal')}
|
329
|
+
className="rounded border-gray-300 text-blue-600 focus:ring-blue-500"
|
330
|
+
/>
|
331
|
+
<label htmlFor="ordinalEncode" className="text-sm text-gray-700">
|
332
|
+
Ordinal encode categories
|
333
|
+
</label>
|
334
|
+
</div>
|
335
|
+
</>
|
336
|
+
)}
|
337
|
+
<div className="flex items-center gap-2">
|
338
|
+
<input
|
339
|
+
type="radio"
|
340
|
+
id="embeddingEncode"
|
341
|
+
name="encoding"
|
342
|
+
checked={strategy.encoding === 'embedding'}
|
343
|
+
onChange={() => handleEncodingChange(type, 'embedding')}
|
344
|
+
className="rounded border-gray-300 text-blue-600 focus:ring-blue-500"
|
345
|
+
/>
|
346
|
+
<label htmlFor="embeddingEncode" className="text-sm text-gray-700">
|
347
|
+
Embedding encode
|
348
|
+
</label>
|
349
|
+
</div>
|
350
|
+
</div>
|
351
|
+
);
|
352
|
+
};
|
353
|
+
|
354
|
+
const renderEmbeddingConfig = (type: 'training' | 'inference') => {
|
355
|
+
const strategy = type === 'training' ? training : inference;
|
356
|
+
if (!strategy || strategy.encoding !== 'embedding' || !constants.embedding_constants) return null;
|
357
|
+
|
358
|
+
const embeddingConstants = constants.embedding_constants;
|
359
|
+
const providers = embeddingConstants.providers || [];
|
360
|
+
const models = embeddingConstants.models || {};
|
361
|
+
const compressionPresets = Object.entries(embeddingConstants.compression_presets || {}).map(([key, preset]) => ({
|
362
|
+
value: key,
|
363
|
+
label: key.split('_').map(word => word.charAt(0).toUpperCase() + word.slice(1)).join(' '),
|
364
|
+
description: preset.description,
|
365
|
+
variance_target: preset.variance_target,
|
366
|
+
}));
|
367
|
+
|
368
|
+
const getModelsForProvider = (provider: string) => {
|
369
|
+
return models[provider] || [];
|
370
|
+
};
|
371
|
+
|
372
|
+
const getCurrentModelDimensions = () => {
|
373
|
+
const provider = strategy.params?.llm || 'openai';
|
374
|
+
const modelValue = strategy.params?.model || getModelsForProvider(provider)[0]?.value;
|
375
|
+
const model = getModelsForProvider(provider).find(m => m.value === modelValue);
|
376
|
+
return model?.dimensions || 1536; // Default to 1536 if not found
|
377
|
+
};
|
378
|
+
|
379
|
+
const getPresetForVariance = (variance: number) => {
|
380
|
+
return compressionPresets.find(preset =>
|
381
|
+
Math.abs(preset.variance_target - variance) < 0.05
|
382
|
+
)?.value || null;
|
383
|
+
};
|
384
|
+
|
385
|
+
const getVarianceForPreset = (presetValue: string) => {
|
386
|
+
return compressionPresets.find(preset =>
|
387
|
+
preset.value === presetValue
|
388
|
+
)?.variance_target || 0.85; // Default to balanced
|
389
|
+
};
|
390
|
+
|
391
|
+
const handleDimensionsChange = (dimensions: number) => {
|
392
|
+
const variance = dimensions / getCurrentModelDimensions(); // Normalize to 0-1
|
393
|
+
const matchingPreset = getPresetForVariance(variance);
|
394
|
+
|
395
|
+
handleEmbeddingParamChange(type, {
|
396
|
+
dimensions,
|
397
|
+
preset: matchingPreset,
|
398
|
+
});
|
399
|
+
};
|
400
|
+
|
401
|
+
const handlePresetChange = (presetValue: string) => {
|
402
|
+
const variance = getVarianceForPreset(presetValue);
|
403
|
+
const dimensions = Math.round(variance * getCurrentModelDimensions());
|
404
|
+
|
405
|
+
handleEmbeddingParamChange(type, {
|
406
|
+
dimensions,
|
407
|
+
preset: presetValue,
|
408
|
+
});
|
409
|
+
};
|
410
|
+
|
411
|
+
return (
|
412
|
+
<div className="space-y-6 mt-8">
|
413
|
+
<div className="bg-blue-50 rounded-lg p-4">
|
414
|
+
<div className="flex gap-2">
|
415
|
+
<Brain className="w-5 h-5 text-blue-500 flex-shrink-0" />
|
416
|
+
<div>
|
417
|
+
<h4 className="text-sm font-medium text-blue-900">Text Embeddings</h4>
|
418
|
+
<p className="text-sm text-blue-700 mt-1">
|
419
|
+
Convert text into numerical vectors for machine learning, preserving semantic meaning while optimizing for storage and performance.
|
420
|
+
</p>
|
421
|
+
</div>
|
422
|
+
</div>
|
423
|
+
</div>
|
424
|
+
|
425
|
+
<div className="space-y-4">
|
426
|
+
<div>
|
427
|
+
<label className="block text-sm font-medium text-gray-700 mb-1">
|
428
|
+
Embedding Provider
|
429
|
+
</label>
|
430
|
+
<SearchableSelect
|
431
|
+
value={strategy.params?.llm || 'openai'}
|
432
|
+
onChange={(value) => {
|
433
|
+
const newModels = getModelsForProvider(value);
|
434
|
+
const firstModel = newModels[0]?.value;
|
435
|
+
const dimensions = newModels[0]?.dimensions || 1536;
|
436
|
+
|
437
|
+
handleEmbeddingParamChange(type, {
|
438
|
+
...strategy.params,
|
439
|
+
llm: value,
|
440
|
+
model: firstModel,
|
441
|
+
dimensions: dimensions,
|
442
|
+
preset: 'high_quality',
|
443
|
+
});
|
444
|
+
}}
|
445
|
+
options={providers}
|
446
|
+
placeholder="Select a provider"
|
447
|
+
/>
|
448
|
+
</div>
|
449
|
+
|
450
|
+
<div>
|
451
|
+
<label className="block text-sm font-medium text-gray-700 mb-1">
|
452
|
+
Model
|
453
|
+
</label>
|
454
|
+
<SearchableSelect
|
455
|
+
value={strategy.params?.model || getModelsForProvider(strategy.params?.llm || 'openai')[0]?.value}
|
456
|
+
onChange={(value) => {
|
457
|
+
const model = getModelsForProvider(strategy.params?.llm || 'openai').find(m => m.value === value);
|
458
|
+
const dimensions = model?.dimensions || 1536;
|
459
|
+
|
460
|
+
handleEmbeddingParamChange(type, {
|
461
|
+
...strategy.params,
|
462
|
+
model: value,
|
463
|
+
dimensions: dimensions,
|
464
|
+
preset: 'high_quality',
|
465
|
+
});
|
466
|
+
}}
|
467
|
+
options={getModelsForProvider(strategy.params?.llm || 'openai')}
|
468
|
+
placeholder="Select a model"
|
469
|
+
/>
|
470
|
+
</div>
|
471
|
+
|
472
|
+
<div className="space-y-4">
|
473
|
+
<div className="flex items-center justify-between">
|
474
|
+
<h4 className="text-sm font-medium text-gray-900">
|
475
|
+
Storage & Quality
|
476
|
+
</h4>
|
477
|
+
<div className="flex items-center gap-2 text-sm text-gray-500">
|
478
|
+
<Minimize2 className="w-4 h-4" />
|
479
|
+
<span>Storage</span>
|
480
|
+
<span className="mx-2">•</span>
|
481
|
+
<span>Quality</span>
|
482
|
+
<Maximize2 className="w-4 h-4" />
|
483
|
+
</div>
|
484
|
+
</div>
|
485
|
+
|
486
|
+
<div className="space-y-6">
|
487
|
+
<div>
|
488
|
+
<div className="flex items-center justify-between mb-2">
|
489
|
+
<span className="text-sm text-gray-600">Target Dimensions</span>
|
490
|
+
<span className="text-sm font-medium text-gray-900">{strategy.params?.dimensions || getCurrentModelDimensions()}</span>
|
491
|
+
</div>
|
492
|
+
<input
|
493
|
+
type="range"
|
494
|
+
min="2"
|
495
|
+
max={getCurrentModelDimensions()}
|
496
|
+
value={strategy.params?.dimensions || getCurrentModelDimensions()}
|
497
|
+
onChange={(e) => handleDimensionsChange(parseInt(e.target.value))}
|
498
|
+
className="w-full"
|
499
|
+
/>
|
500
|
+
<div className="flex justify-between text-xs text-gray-500 mt-1">
|
501
|
+
<span>2</span>
|
502
|
+
<span>{getCurrentModelDimensions()}</span>
|
503
|
+
</div>
|
504
|
+
</div>
|
505
|
+
|
506
|
+
<div className="space-y-3">
|
507
|
+
<h5 className="text-sm font-medium text-gray-900">Quality Presets</h5>
|
508
|
+
{compressionPresets.map((preset) => (
|
509
|
+
<div
|
510
|
+
key={preset.value}
|
511
|
+
onClick={() => handlePresetChange(preset.value)}
|
512
|
+
className={`p-4 rounded-lg border transition-colors cursor-pointer
|
513
|
+
${strategy.params?.preset === preset.value
|
514
|
+
? 'border-blue-500 bg-blue-50'
|
515
|
+
: 'border-gray-200 hover:border-gray-300 bg-white'
|
516
|
+
}`}
|
517
|
+
>
|
518
|
+
<div className="flex items-center justify-between">
|
519
|
+
<div className="flex items-center gap-2">
|
520
|
+
<input
|
521
|
+
type="radio"
|
522
|
+
checked={strategy.params?.preset === preset.value}
|
523
|
+
onChange={() => handlePresetChange(preset.value)}
|
524
|
+
className="rounded-full border-gray-300 text-blue-600 focus:ring-blue-500"
|
525
|
+
/>
|
526
|
+
<span className="font-medium text-gray-900">{preset.label}</span>
|
527
|
+
</div>
|
528
|
+
</div>
|
529
|
+
<p className="text-sm text-gray-600 mt-1 ml-6">{preset.description}</p>
|
530
|
+
</div>
|
531
|
+
))}
|
532
|
+
</div>
|
533
|
+
|
534
|
+
<div className="space-y-4">
|
535
|
+
<div className="bg-gray-50 rounded-lg p-4">
|
536
|
+
<div className="flex items-start gap-2">
|
537
|
+
<HardDrive className="w-5 h-5 text-gray-400 flex-shrink-0 mt-0.5" />
|
538
|
+
<div className="flex-1">
|
539
|
+
<h5 className="text-sm font-medium text-gray-900">Storage Efficiency</h5>
|
540
|
+
<div className="mt-2">
|
541
|
+
<div className="w-full bg-gray-200 rounded-full h-2.5">
|
542
|
+
<div
|
543
|
+
className="h-full bg-green-600 rounded-full"
|
544
|
+
style={{ width: `${100 - ((strategy.params?.dimensions || 24) / getCurrentModelDimensions()) * 100}%` }}
|
545
|
+
/>
|
546
|
+
</div>
|
547
|
+
<p className="text-sm text-gray-600 mt-2">
|
548
|
+
{strategy.params?.dimensions && strategy.params.dimensions <= getCurrentModelDimensions() * 0.25
|
549
|
+
? "Optimized for storage. Maintains core meaning while significantly reducing storage requirements."
|
550
|
+
: strategy.params?.dimensions && strategy.params.dimensions <= getCurrentModelDimensions() * 0.5
|
551
|
+
? "Balanced approach. Good compromise between quality and storage efficiency."
|
552
|
+
: "Prioritizes quality. Preserves more nuanced relationships but requires more storage."}
|
553
|
+
</p>
|
554
|
+
</div>
|
555
|
+
</div>
|
556
|
+
</div>
|
557
|
+
</div>
|
558
|
+
|
559
|
+
<div className="bg-gray-50 rounded-lg p-4">
|
560
|
+
<div className="flex items-start gap-2">
|
561
|
+
<Brain className="w-5 h-5 text-gray-400 flex-shrink-0 mt-0.5" />
|
562
|
+
<div className="flex-1">
|
563
|
+
<h5 className="text-sm font-medium text-gray-900">Information Preservation</h5>
|
564
|
+
<div className="mt-2">
|
565
|
+
<div className="w-full bg-gray-200 rounded-full h-2.5">
|
566
|
+
<div
|
567
|
+
className="bg-blue-600 h-2.5 rounded-full transition-all duration-300"
|
568
|
+
style={{ width: `${((strategy.params?.dimensions || 24) / getCurrentModelDimensions()) * 100}%` }}
|
569
|
+
/>
|
570
|
+
</div>
|
571
|
+
<p className="text-sm text-gray-600 mt-2">
|
572
|
+
Preserves approximately {Math.round(((strategy.params?.dimensions || 24) / getCurrentModelDimensions()) * 100)}% of the original information
|
573
|
+
</p>
|
574
|
+
</div>
|
575
|
+
</div>
|
576
|
+
</div>
|
577
|
+
</div>
|
578
|
+
</div>
|
579
|
+
</div>
|
580
|
+
</div>
|
581
|
+
</div>
|
582
|
+
</div>
|
583
|
+
);
|
584
|
+
};
|
585
|
+
|
586
|
+
useEffect(() => {
|
587
|
+
// When component mounts or when column changes, update default dimensions
|
588
|
+
if (training.encoding === 'embedding' && !training.params?.dimensions) {
|
589
|
+
const provider = training.params?.llm || 'openai';
|
590
|
+
const modelValue = training.params?.model || (constants.embedding_constants?.models[provider] || [])[0]?.value;
|
591
|
+
const model = (constants.embedding_constants?.models[provider] || []).find(m => m.value === modelValue);
|
592
|
+
const defaultDimensions = model?.dimensions || 1536;
|
593
|
+
|
594
|
+
handleEmbeddingParamChange('training', {
|
595
|
+
dimensions: defaultDimensions
|
596
|
+
});
|
597
|
+
}
|
598
|
+
|
599
|
+
if (useDistinctInference && inference.encoding === 'embedding' && !inference.params?.dimensions) {
|
600
|
+
const provider = inference.params?.llm || 'openai';
|
601
|
+
const modelValue = inference.params?.model || (constants.embedding_constants?.models[provider] || [])[0]?.value;
|
602
|
+
const model = (constants.embedding_constants?.models[provider] || []).find(m => m.value === modelValue);
|
603
|
+
const defaultDimensions = model?.dimensions || 1536;
|
604
|
+
|
605
|
+
handleEmbeddingParamChange('inference', {
|
606
|
+
dimensions: defaultDimensions
|
607
|
+
});
|
608
|
+
}
|
609
|
+
}, [training.encoding, inference.encoding, column.id]);
|
610
|
+
|
210
611
|
const [isEditingDescription, setIsEditingDescription] = useState(false);
|
211
612
|
|
212
613
|
const onToggleDropIfNull = (e: React.ChangeEvent<HTMLInputElement>) => {
|
@@ -265,7 +666,7 @@ export function PreprocessingConfig({
|
|
265
666
|
const renderStrategySpecificInfo = (type: 'training' | 'inference') => {
|
266
667
|
const strategy = type === 'training' ? training : inference;
|
267
668
|
let content;
|
268
|
-
if (strategy.method === 'most_frequent' && column.statistics?.raw
|
669
|
+
if (strategy.method === 'most_frequent' && column.statistics?.raw?.most_frequent_value !== undefined) {
|
269
670
|
content = `Most Frequent Value: ${column.statistics.raw.most_frequent_value}`
|
270
671
|
} else if (strategy.method === 'ffill') {
|
271
672
|
const lastValue = column.statistics?.raw.last_value;
|
@@ -290,6 +691,47 @@ export function PreprocessingConfig({
|
|
290
691
|
);
|
291
692
|
};
|
292
693
|
|
694
|
+
const renderLineageInfo = () => {
|
695
|
+
return (
|
696
|
+
<div className="space-y-4">
|
697
|
+
{column.lineage.map((step, index) => (
|
698
|
+
<div key={index} className="flex items-start gap-3">
|
699
|
+
<div className={`w-8 h-8 rounded-full flex items-center justify-center flex-shrink-0 ${
|
700
|
+
step.key === 'raw_dataset'
|
701
|
+
? 'bg-gray-100'
|
702
|
+
: step.key === 'computed_by_feature'
|
703
|
+
? 'bg-purple-100'
|
704
|
+
: 'bg-blue-100'
|
705
|
+
}`}>
|
706
|
+
{step.key === 'raw_dataset' ? (
|
707
|
+
<Database className="w-4 h-4 text-gray-600" />
|
708
|
+
) : step.key === 'computed_by_feature' ? (
|
709
|
+
<Calculator className="w-4 h-4 text-purple-600" />
|
710
|
+
) : (
|
711
|
+
<Settings2 className="w-4 h-4 text-blue-600" />
|
712
|
+
)}
|
713
|
+
</div>
|
714
|
+
<div className="flex-1">
|
715
|
+
<div className="flex items-center justify-between">
|
716
|
+
<p className="text-sm font-medium text-gray-900">
|
717
|
+
{step.description}
|
718
|
+
</p>
|
719
|
+
{step.timestamp && (
|
720
|
+
<span className="text-xs text-gray-500">
|
721
|
+
{new Date(step.timestamp).toLocaleString()}
|
722
|
+
</span>
|
723
|
+
)}
|
724
|
+
</div>
|
725
|
+
{index < column.lineage.length - 1 && (
|
726
|
+
<div className="ml-4 mt-2 mb-2 w-0.5 h-4 bg-gray-200" />
|
727
|
+
)}
|
728
|
+
</div>
|
729
|
+
</div>
|
730
|
+
))}
|
731
|
+
</div>
|
732
|
+
);
|
733
|
+
};
|
734
|
+
|
293
735
|
return (
|
294
736
|
<div className="space-y-8">
|
295
737
|
{/* Column Header Section */}
|
@@ -381,7 +823,7 @@ export function PreprocessingConfig({
|
|
381
823
|
<div className="space-y-2">
|
382
824
|
<div className="flex justify-between text-sm">
|
383
825
|
<span className="text-gray-600">Null Values:</span>
|
384
|
-
<span className="font-medium text-gray-900">{column.statistics?.raw?.null_count
|
826
|
+
<span className="font-medium text-gray-900">{column.statistics?.raw?.null_count?.toLocaleString()}</span>
|
385
827
|
</div>
|
386
828
|
<div className="flex justify-between text-sm">
|
387
829
|
<span className="text-gray-600">Total Rows:</span>
|
@@ -392,7 +834,7 @@ export function PreprocessingConfig({
|
|
392
834
|
<span className="font-medium text-gray-900">{nullPercentage.toFixed(2)}%</span>
|
393
835
|
</div>
|
394
836
|
<div className="mt-2">
|
395
|
-
<div className="w-full
|
837
|
+
<div className="w-full bg-gray-200 rounded-full h-2.5">
|
396
838
|
<div
|
397
839
|
className="h-full bg-blue-600 rounded-full"
|
398
840
|
style={{ width: `${nullPercentage}%` }}
|
@@ -422,7 +864,7 @@ export function PreprocessingConfig({
|
|
422
864
|
<span className="font-medium text-gray-900">{nullPercentageProcessed.toFixed(2)}%</span>
|
423
865
|
</div>
|
424
866
|
<div className="mt-2">
|
425
|
-
<div className="w-full
|
867
|
+
<div className="w-full bg-gray-200 rounded-full h-2.5">
|
426
868
|
<div
|
427
869
|
className="h-full bg-blue-600 rounded-full"
|
428
870
|
style={{ width: `${nullPercentageProcessed}%` }}
|
@@ -462,7 +904,7 @@ export function PreprocessingConfig({
|
|
462
904
|
<div className="flex items-center justify-between mb-2">
|
463
905
|
<span className="text-sm font-medium text-gray-700">Null Distribution</span>
|
464
906
|
<span className="text-sm text-gray-500">
|
465
|
-
{nullPercentage}% of values are null
|
907
|
+
{nullPercentage.toFixed(2)}% of values are null
|
466
908
|
</span>
|
467
909
|
</div>
|
468
910
|
<div className="relative h-2 bg-gray-100 rounded-full overflow-hidden">
|
@@ -486,63 +928,21 @@ export function PreprocessingConfig({
|
|
486
928
|
<h4 className="text-sm font-medium text-gray-700 mb-2">Sample Values</h4>
|
487
929
|
<div className="bg-gray-50 rounded-lg p-4">
|
488
930
|
<div className="flex flex-wrap gap-2">
|
489
|
-
{column.
|
490
|
-
<span key={index} className="
|
491
|
-
|
931
|
+
{Array.isArray(column.sample_values) ? column.sample_values.slice(0, 3).map((value: any, index: number) => (
|
932
|
+
<span key={index} className="m-1 flex-items items-center">
|
933
|
+
<Badge>
|
934
|
+
{typeof value === 'string'
|
935
|
+
? value.split(/\s+/).slice(0, 10).join(' ') + (value.split(/\s+/).length > 10 ? '...' : '')
|
936
|
+
: String(value)}
|
937
|
+
</Badge>
|
492
938
|
</span>
|
493
|
-
))}
|
939
|
+
)) : []}
|
494
940
|
</div>
|
495
941
|
</div>
|
496
942
|
</div>
|
497
943
|
)}
|
498
944
|
</div>
|
499
945
|
|
500
|
-
{/* Column Lineage Section */}
|
501
|
-
{column.lineage && column.lineage.length > 0 && (
|
502
|
-
<div className="bg-white rounded-lg border border-gray-200 p-6">
|
503
|
-
<h3 className="text-lg font-medium text-gray-900 mb-4 flex items-center gap-2">
|
504
|
-
<GitBranch className="w-5 h-5 text-gray-500" />
|
505
|
-
Column Lineage
|
506
|
-
</h3>
|
507
|
-
<div className="space-y-4">
|
508
|
-
{column.lineage.map((step, index) => (
|
509
|
-
<div key={index} className="flex items-start gap-3">
|
510
|
-
<div className={`w-8 h-8 rounded-full flex items-center justify-center flex-shrink-0 ${
|
511
|
-
step.key === 'raw_dataset'
|
512
|
-
? 'bg-gray-100'
|
513
|
-
: step.key === 'computed_by_feature'
|
514
|
-
? 'bg-purple-100'
|
515
|
-
: 'bg-blue-100'
|
516
|
-
}`}>
|
517
|
-
{step.key === 'raw_dataset' ? (
|
518
|
-
<Database className="w-4 h-4 text-gray-600" />
|
519
|
-
) : step.key === 'computed_by_feature' ? (
|
520
|
-
<Calculator className="w-4 h-4 text-purple-600" />
|
521
|
-
) : (
|
522
|
-
<Settings2 className="w-4 h-4 text-blue-600" />
|
523
|
-
)}
|
524
|
-
</div>
|
525
|
-
<div className="flex-1">
|
526
|
-
<div className="flex items-center justify-between">
|
527
|
-
<p className="text-sm font-medium text-gray-900">
|
528
|
-
{step.description}
|
529
|
-
</p>
|
530
|
-
{step.timestamp && (
|
531
|
-
<span className="text-xs text-gray-500">
|
532
|
-
{new Date(step.timestamp).toLocaleString()}
|
533
|
-
</span>
|
534
|
-
)}
|
535
|
-
</div>
|
536
|
-
{index < column.lineage.length - 1 && (
|
537
|
-
<div className="ml-4 mt-2 mb-2 w-0.5 h-4 bg-gray-200" />
|
538
|
-
)}
|
539
|
-
</div>
|
540
|
-
</div>
|
541
|
-
))}
|
542
|
-
</div>
|
543
|
-
</div>
|
544
|
-
)}
|
545
|
-
|
546
946
|
{/* Data Type Section */}
|
547
947
|
<div className="bg-white rounded-lg border border-gray-200 p-6">
|
548
948
|
<h3 className="text-lg font-medium text-gray-900 mb-4 flex items-center gap-2">
|
@@ -555,20 +955,14 @@ export function PreprocessingConfig({
|
|
555
955
|
<label className="block text-sm font-medium text-gray-700 mb-1">
|
556
956
|
Column Type
|
557
957
|
</label>
|
558
|
-
<
|
958
|
+
<SearchableSelect
|
559
959
|
value={selectedType}
|
560
|
-
|
561
|
-
|
562
|
-
|
563
|
-
|
564
|
-
|
565
|
-
|
566
|
-
</option>
|
567
|
-
))}
|
568
|
-
</select>
|
569
|
-
<p className="mt-1 text-sm text-gray-500">
|
570
|
-
Column type cannot be changed after creation
|
571
|
-
</p>
|
960
|
+
onChange={(value) => setColumnType(column.name, value)}
|
961
|
+
options={constants.column_types.map(type => ({
|
962
|
+
value: type.value,
|
963
|
+
label: type.label
|
964
|
+
}))}
|
965
|
+
/>
|
572
966
|
</div>
|
573
967
|
|
574
968
|
<div className="bg-gray-50 rounded-md p-4">
|
@@ -577,7 +971,9 @@ export function PreprocessingConfig({
|
|
577
971
|
{Array.isArray(column.sample_values) ? column.sample_values.slice(0, 3).map((value: any, index: number) => (
|
578
972
|
<span key={index} className="m-1 flex-items items-center">
|
579
973
|
<Badge>
|
580
|
-
{
|
974
|
+
{typeof value === 'string'
|
975
|
+
? value.split(/\s+/).slice(0, 10).join(' ') + (value.split(/\s+/).length > 10 ? '...' : '')
|
976
|
+
: String(value)}
|
581
977
|
</Badge>
|
582
978
|
</span>
|
583
979
|
)) : []}
|
@@ -621,23 +1017,25 @@ export function PreprocessingConfig({
|
|
621
1017
|
</div>
|
622
1018
|
|
623
1019
|
<div className={useDistinctInference ? "grid grid-cols-2 gap-6" : ""}>
|
624
|
-
<div>
|
625
|
-
<
|
1020
|
+
<div className="relative z-50">
|
1021
|
+
<SearchableSelect
|
626
1022
|
value={training.method}
|
627
|
-
onChange={(
|
628
|
-
|
629
|
-
|
630
|
-
|
631
|
-
|
632
|
-
|
633
|
-
|
634
|
-
|
635
|
-
|
636
|
-
|
1023
|
+
onChange={(value) => handleStrategyChange('training', value as PreprocessingStep['method'])}
|
1024
|
+
options={[
|
1025
|
+
{ value: 'none', label: 'No preprocessing' },
|
1026
|
+
...(constants.preprocessing_strategies[selectedType]?.map((strategy: { value: string; label: string; }) => ({
|
1027
|
+
value: strategy.value,
|
1028
|
+
label: strategy.label
|
1029
|
+
})) || [])
|
1030
|
+
]}
|
1031
|
+
options={constants.preprocessing_strategies[selectedType]}
|
1032
|
+
/>
|
637
1033
|
|
638
1034
|
{renderStrategySpecificInfo('training')}
|
639
1035
|
{renderConstantValueInput('training')}
|
640
|
-
|
1036
|
+
{renderEncodingConfig('training')}
|
1037
|
+
{renderEmbeddingConfig('training')}
|
1038
|
+
{(column.datatype === 'categorical' && training.method === 'categorical') && (
|
641
1039
|
<div className="mt-4 space-y-4 bg-gray-50 rounded-lg p-4">
|
642
1040
|
<div>
|
643
1041
|
<label className="block text-sm font-medium text-gray-700 mb-1">
|
@@ -658,43 +1056,6 @@ export function PreprocessingConfig({
|
|
658
1056
|
</div>
|
659
1057
|
</div>
|
660
1058
|
)}
|
661
|
-
{(column.datatype === 'categorical' && training.method !== 'none') && (
|
662
|
-
<div className="mt-4 space-y-4 bg-gray-50 rounded-lg p-4">
|
663
|
-
<h4 className="text-sm font-medium text-gray-900 mb-2">Encoding</h4>
|
664
|
-
<div className="flex items-center gap-2">
|
665
|
-
<input
|
666
|
-
type="radio"
|
667
|
-
id="oneHotEncode"
|
668
|
-
name="encoding"
|
669
|
-
checked={training.params.one_hot}
|
670
|
-
onChange={() => handleCategoricalParamChange('training', {
|
671
|
-
one_hot: true,
|
672
|
-
ordinal_encoding: false
|
673
|
-
})}
|
674
|
-
className="rounded border-gray-300 text-blue-600 focus:ring-blue-500"
|
675
|
-
/>
|
676
|
-
<label htmlFor="oneHotEncode" className="text-sm text-gray-700">
|
677
|
-
One-hot encode categories
|
678
|
-
</label>
|
679
|
-
</div>
|
680
|
-
<div className="flex items-center gap-2">
|
681
|
-
<input
|
682
|
-
type="radio"
|
683
|
-
id="ordinalEncode"
|
684
|
-
name="encoding"
|
685
|
-
checked={training.params.ordinal_encoding}
|
686
|
-
onChange={() => handleCategoricalParamChange('training', {
|
687
|
-
one_hot: false,
|
688
|
-
ordinal_encoding: true
|
689
|
-
})}
|
690
|
-
className="rounded border-gray-300 text-blue-600 focus:ring-blue-500"
|
691
|
-
/>
|
692
|
-
<label htmlFor="ordinalEncode" className="text-sm text-gray-700">
|
693
|
-
Ordinal encode categories
|
694
|
-
</label>
|
695
|
-
</div>
|
696
|
-
</div>
|
697
|
-
)}
|
698
1059
|
</div>
|
699
1060
|
|
700
1061
|
{useDistinctInference && (
|
@@ -705,20 +1066,21 @@ export function PreprocessingConfig({
|
|
705
1066
|
Inference Strategy
|
706
1067
|
</span>
|
707
1068
|
</div>
|
708
|
-
<
|
1069
|
+
<SearchableSelect
|
709
1070
|
value={inference.method}
|
710
|
-
onChange={(
|
711
|
-
|
712
|
-
|
713
|
-
|
714
|
-
|
715
|
-
|
716
|
-
|
717
|
-
|
718
|
-
|
719
|
-
</select>
|
1071
|
+
onChange={(value) => handleStrategyChange('inference', value as PreprocessingStep['method'])}
|
1072
|
+
options={[
|
1073
|
+
{ value: 'none', label: 'No preprocessing' },
|
1074
|
+
...(constants.preprocessing_strategies[selectedType]?.map((strategy: { value: string; label: string; }) => ({
|
1075
|
+
value: strategy.value,
|
1076
|
+
label: strategy.label
|
1077
|
+
})) || [])
|
1078
|
+
]}
|
1079
|
+
/>
|
720
1080
|
|
721
1081
|
{renderConstantValueInput('inference')}
|
1082
|
+
{renderEncodingConfig('inference')}
|
1083
|
+
{renderEmbeddingConfig('inference')}
|
722
1084
|
</div>
|
723
1085
|
)}
|
724
1086
|
</div>
|
@@ -766,6 +1128,17 @@ export function PreprocessingConfig({
|
|
766
1128
|
</div>
|
767
1129
|
</div>
|
768
1130
|
|
1131
|
+
{/* Column Lineage Section */}
|
1132
|
+
<div className="bg-white rounded-lg border border-gray-200 p-6">
|
1133
|
+
<h3 className="text-lg font-medium text-gray-900 mb-4 flex items-center gap-2">
|
1134
|
+
<GitBranch className="w-5 h-5 text-gray-500" />
|
1135
|
+
Column Lineage
|
1136
|
+
</h3>
|
1137
|
+
|
1138
|
+
<div className="space-y-4">
|
1139
|
+
{renderLineageInfo()}
|
1140
|
+
</div>
|
1141
|
+
</div>
|
769
1142
|
</div>
|
770
1143
|
);
|
771
1144
|
}
|