easy_ml 0.2.0.pre.rc85 → 0.2.0.pre.rc88

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in the respective public registry.
Files changed (59)
  1. checksums.yaml +4 -4
  2. data/app/controllers/easy_ml/datasets_controller.rb +18 -2
  3. data/app/frontend/components/dataset/PreprocessingConfig.tsx +523 -150
  4. data/app/frontend/pages/DatasetsPage.tsx +0 -1
  5. data/app/frontend/types/dataset.ts +5 -2
  6. data/app/models/easy_ml/column/imputers/base.rb +23 -2
  7. data/app/models/easy_ml/column/imputers/embedding_encoder.rb +18 -0
  8. data/app/models/easy_ml/column/imputers/imputer.rb +1 -0
  9. data/app/models/easy_ml/column/imputers/most_frequent.rb +1 -1
  10. data/app/models/easy_ml/column/imputers/one_hot_encoder.rb +1 -1
  11. data/app/models/easy_ml/column/imputers/ordinal_encoder.rb +1 -1
  12. data/app/models/easy_ml/column/imputers.rb +47 -41
  13. data/app/models/easy_ml/column/selector.rb +2 -2
  14. data/app/models/easy_ml/column.rb +260 -56
  15. data/app/models/easy_ml/column_history.rb +6 -0
  16. data/app/models/easy_ml/column_list.rb +30 -1
  17. data/app/models/easy_ml/dataset/learner/lazy/embedding.rb +10 -0
  18. data/app/models/easy_ml/dataset/learner/lazy/query.rb +2 -0
  19. data/app/models/easy_ml/dataset/learner.rb +11 -0
  20. data/app/models/easy_ml/dataset.rb +6 -19
  21. data/app/models/easy_ml/lineage_history.rb +17 -0
  22. data/app/models/easy_ml/model.rb +11 -1
  23. data/app/models/easy_ml/models/xgboost.rb +37 -7
  24. data/app/models/easy_ml/pca_model.rb +21 -0
  25. data/app/models/easy_ml/prediction.rb +2 -1
  26. data/app/serializers/easy_ml/column_serializer.rb +13 -1
  27. data/config/initializers/inflections.rb +1 -0
  28. data/lib/easy_ml/data/dataset_manager/writer/append_only.rb +6 -8
  29. data/lib/easy_ml/data/dataset_manager/writer/base.rb +15 -2
  30. data/lib/easy_ml/data/dataset_manager/writer/partitioned.rb +0 -1
  31. data/lib/easy_ml/data/dataset_manager/writer.rb +2 -0
  32. data/lib/easy_ml/data/embeddings/compressor.rb +179 -0
  33. data/lib/easy_ml/data/embeddings/embedder.rb +226 -0
  34. data/lib/easy_ml/data/embeddings.rb +61 -0
  35. data/lib/easy_ml/data/polars_column.rb +3 -0
  36. data/lib/easy_ml/data/polars_reader.rb +54 -23
  37. data/lib/easy_ml/data/polars_schema.rb +28 -2
  38. data/lib/easy_ml/data/splits/file_split.rb +7 -2
  39. data/lib/easy_ml/data.rb +1 -0
  40. data/lib/easy_ml/embedding_store.rb +92 -0
  41. data/lib/easy_ml/engine.rb +4 -2
  42. data/lib/easy_ml/predict.rb +42 -20
  43. data/lib/easy_ml/railtie/generators/migration/migration_generator.rb +5 -0
  44. data/lib/easy_ml/railtie/templates/migration/add_is_primary_key_to_easy_ml_columns.rb.tt +9 -0
  45. data/lib/easy_ml/railtie/templates/migration/add_metadata_to_easy_ml_predictions.rb.tt +6 -0
  46. data/lib/easy_ml/railtie/templates/migration/add_pca_model_id_to_easy_ml_columns.rb.tt +9 -0
  47. data/lib/easy_ml/railtie/templates/migration/add_workflow_status_to_easy_ml_dataset_histories.rb.tt +13 -0
  48. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_pca_models.rb.tt +14 -0
  49. data/lib/easy_ml/version.rb +1 -1
  50. data/lib/easy_ml.rb +1 -0
  51. data/public/easy_ml/assets/.vite/manifest.json +2 -2
  52. data/public/easy_ml/assets/assets/Application-DfPoyRr8.css +1 -0
  53. data/public/easy_ml/assets/assets/entrypoints/Application.tsx-KENNRQpC.js +533 -0
  54. data/public/easy_ml/assets/assets/entrypoints/Application.tsx-KENNRQpC.js.map +1 -0
  55. metadata +59 -6
  56. data/lib/tasks/profile.rake +0 -40
  57. data/public/easy_ml/assets/assets/Application-nnn_XLuL.css +0 -1
  58. data/public/easy_ml/assets/assets/entrypoints/Application.tsx-CD8voxfL.js +0 -522
  59. data/public/easy_ml/assets/assets/entrypoints/Application.tsx-CD8voxfL.js.map +0 -1
@@ -1,7 +1,8 @@
1
1
  import React, { useState, useEffect } from 'react';
2
- import { Settings2, Wrench, ArrowRight, Pencil, Trash2, Database, Calculator, GitBranch } from 'lucide-react';
2
+ import { Settings2, Wrench, ArrowRight, Pencil, Trash2, Database, Calculator, GitBranch, Brain, HardDrive, Maximize2, Minimize2 } from 'lucide-react';
3
3
  import type { Dataset, Column, ColumnType, PreprocessingConstants, PreprocessingSteps, PreprocessingStep } from '../../types/dataset';
4
4
  import { Badge } from "@/components/ui/badge";
5
+ import { SearchableSelect } from '../SearchableSelect';
5
6
 
6
7
  interface PreprocessingConfigProps {
7
8
  column: Column;
@@ -19,15 +20,21 @@ interface PreprocessingConfigProps {
19
20
  const isNumericType = (type: ColumnType): boolean =>
20
21
  type === 'float' || type === 'integer';
21
22
 
23
+ const canUseEmbedding = (type: ColumnType): boolean =>
24
+ type === 'text' || type === 'string' || type === 'categorical';
25
+
22
26
  const createPreprocessingStep = (steps?: PreprocessingStep): PreprocessingStep => ({
23
27
  method: steps?.method || 'none',
28
+ encoding: steps?.encoding,
24
29
  params: {
25
- constant: steps?.params?.constant,
26
30
  categorical_min: steps?.params?.categorical_min ?? 100,
27
- one_hot: steps?.params?.one_hot ?? true,
28
- ordinal_encoding: steps?.params?.ordinal_encoding ?? false,
29
- clip: steps?.params?.clip
30
- }
31
+ clip: steps?.params?.clip,
32
+ constant: steps?.params?.constant,
33
+ llm: steps?.params?.llm,
34
+ model: steps?.params?.model,
35
+ dimensions: steps?.params?.dimensions,
36
+ preset: steps?.params?.preset,
37
+ },
31
38
  });
32
39
 
33
40
  export function PreprocessingConfig({
@@ -64,31 +71,33 @@ export function PreprocessingConfig({
64
71
  method: PreprocessingStep['method']
65
72
  ) => {
66
73
  let defaultParams: PreprocessingStep['params'] = {};
67
-
68
- if (selectedType === 'categorical') {
74
+ const strategy = type === 'training' ? training : inference;
75
+
76
+ // Preserve existing encoding for text/string columns if it's already set
77
+ let defaultEncoding: string | null = null;
78
+
79
+ if (canUseEmbedding(selectedType) && strategy.encoding) {
80
+ // Keep the existing encoding if it's already set
81
+ defaultEncoding = strategy.encoding;
82
+ } else if (selectedType === 'categorical') {
69
83
  if (method === 'categorical') {
70
84
  defaultParams = {
71
85
  ...defaultParams,
72
86
  categorical_min: 100,
73
- one_hot: true
74
- };
75
- } else if (method != 'none') {
76
- defaultParams = {
77
- ...defaultParams,
78
- one_hot: true
79
87
  };
88
+ defaultEncoding = 'one_hot';
89
+ } else if (method !== 'none') {
90
+ defaultEncoding = 'one_hot';
80
91
  }
81
92
  }
82
93
 
83
94
  if (column.is_target) {
84
- defaultParams = {
85
- ...defaultParams,
86
- ordinal_encoding: true
87
- };
95
+ defaultEncoding = 'ordinal';
88
96
  }
89
97
 
90
98
  const newStrategy: PreprocessingStep = {
91
99
  method,
100
+ encoding: defaultEncoding,
92
101
  params: defaultParams
93
102
  };
94
103
 
@@ -113,8 +122,12 @@ export function PreprocessingConfig({
113
122
  ...strategy,
114
123
  params: {
115
124
  categorical_min: strategy.params.categorical_min,
116
- one_hot: strategy.params.one_hot,
117
- ordinal_encoding: strategy.params.ordinal_encoding,
125
+ clip: strategy.params.clip,
126
+ constant: strategy.params.constant,
127
+ llm: strategy.params.llm,
128
+ model: strategy.params.model,
129
+ dimensions: strategy.params.dimensions,
130
+ preset: strategy.params.preset,
118
131
  ...updates
119
132
  }
120
133
  };
@@ -177,6 +190,70 @@ export function PreprocessingConfig({
177
190
  }
178
191
  };
179
192
 
193
+ const handleEmbeddingParamChange = (
194
+ type: 'training' | 'inference',
195
+ updates: Partial<PreprocessingStep['params']>
196
+ ) => {
197
+ const strategy = type === 'training' ? training : inference;
198
+ const setStrategy = type === 'training' ? setTraining : setInference;
199
+
200
+ const newStrategy: PreprocessingStep = {
201
+ ...strategy,
202
+ params: {
203
+ ...strategy.params,
204
+ ...updates
205
+ }
206
+ };
207
+
208
+ setStrategy(newStrategy);
209
+ if (type === 'training') {
210
+ onUpdate(newStrategy, useDistinctInference ? inference : undefined, useDistinctInference);
211
+ } else {
212
+ onUpdate(training, newStrategy, useDistinctInference);
213
+ }
214
+ };
215
+
216
+ const handleEncodingChange = (
217
+ type: 'training' | 'inference',
218
+ encoding: string | null
219
+ ) => {
220
+ const strategy = type === 'training' ? training : inference;
221
+ const setStrategy = type === 'training' ? setTraining : setInference;
222
+
223
+ let updatedParams = { ...strategy.params };
224
+
225
+ // If selecting embedding encoding, ensure we have default llm and model params
226
+ if (encoding === 'embedding') {
227
+ const embeddingConstants = constants.embedding_constants;
228
+ if (embeddingConstants) {
229
+ const defaultProvider = 'openai';
230
+ const defaultModel = (embeddingConstants.models[defaultProvider] || [])[0]?.value;
231
+ const defaultDimensions = (embeddingConstants.models[defaultProvider] || []).find(m => m.value === defaultModel)?.dimensions || 1536;
232
+
233
+ updatedParams = {
234
+ ...updatedParams,
235
+ llm: updatedParams.llm || defaultProvider,
236
+ model: updatedParams.model || defaultModel,
237
+ dimensions: updatedParams.dimensions || defaultDimensions,
238
+ preset: updatedParams.preset || 'high_quality',
239
+ };
240
+ }
241
+ }
242
+
243
+ const newStrategy: PreprocessingStep = {
244
+ ...strategy,
245
+ encoding: encoding === 'none' ? null : encoding,
246
+ params: updatedParams
247
+ };
248
+
249
+ setStrategy(newStrategy);
250
+ if (type === 'training') {
251
+ onUpdate(newStrategy, useDistinctInference ? inference : undefined, useDistinctInference);
252
+ } else {
253
+ onUpdate(training, newStrategy, useDistinctInference);
254
+ }
255
+ };
256
+
180
257
  const renderConstantValueInput = (type: 'training' | 'inference') => {
181
258
  const strategy = type === 'training' ? training : inference;
182
259
  if (strategy.method !== 'constant') return null;
@@ -207,6 +284,330 @@ export function PreprocessingConfig({
207
284
  );
208
285
  };
209
286
 
287
+ const renderEncodingConfig = (type: 'training' | 'inference') => {
288
+ const strategy = type === 'training' ? training : inference;
289
+ if (!strategy || !canUseEmbedding(selectedType)) return null;
290
+
291
+ return (
292
+ <div className="mt-4 space-y-4 bg-gray-50 rounded-lg p-4">
293
+ <h4 className="text-sm font-medium text-gray-900 mb-2">Encoding</h4>
294
+ <div className="flex items-center gap-2">
295
+ <input
296
+ type="radio"
297
+ id="noneEncode"
298
+ name="encoding"
299
+ checked={strategy.encoding === null}
300
+ onChange={() => handleEncodingChange(type, 'none')}
301
+ className="rounded border-gray-300 text-blue-600 focus:ring-blue-500"
302
+ />
303
+ <label htmlFor="noneEncode" className="text-sm text-gray-700">
304
+ No encoding
305
+ </label>
306
+ </div>
307
+ {selectedType === 'categorical' && (
308
+ <>
309
+ <div className="flex items-center gap-2">
310
+ <input
311
+ type="radio"
312
+ id="oneHotEncode"
313
+ name="encoding"
314
+ checked={strategy.encoding === 'one_hot'}
315
+ onChange={() => handleEncodingChange(type, 'one_hot')}
316
+ className="rounded border-gray-300 text-blue-600 focus:ring-blue-500"
317
+ />
318
+ <label htmlFor="oneHotEncode" className="text-sm text-gray-700">
319
+ One-hot encode categories
320
+ </label>
321
+ </div>
322
+ <div className="flex items-center gap-2">
323
+ <input
324
+ type="radio"
325
+ id="ordinalEncode"
326
+ name="encoding"
327
+ checked={strategy.encoding === 'ordinal'}
328
+ onChange={() => handleEncodingChange(type, 'ordinal')}
329
+ className="rounded border-gray-300 text-blue-600 focus:ring-blue-500"
330
+ />
331
+ <label htmlFor="ordinalEncode" className="text-sm text-gray-700">
332
+ Ordinal encode categories
333
+ </label>
334
+ </div>
335
+ </>
336
+ )}
337
+ <div className="flex items-center gap-2">
338
+ <input
339
+ type="radio"
340
+ id="embeddingEncode"
341
+ name="encoding"
342
+ checked={strategy.encoding === 'embedding'}
343
+ onChange={() => handleEncodingChange(type, 'embedding')}
344
+ className="rounded border-gray-300 text-blue-600 focus:ring-blue-500"
345
+ />
346
+ <label htmlFor="embeddingEncode" className="text-sm text-gray-700">
347
+ Embedding encode
348
+ </label>
349
+ </div>
350
+ </div>
351
+ );
352
+ };
353
+
354
+ const renderEmbeddingConfig = (type: 'training' | 'inference') => {
355
+ const strategy = type === 'training' ? training : inference;
356
+ if (!strategy || strategy.encoding !== 'embedding' || !constants.embedding_constants) return null;
357
+
358
+ const embeddingConstants = constants.embedding_constants;
359
+ const providers = embeddingConstants.providers || [];
360
+ const models = embeddingConstants.models || {};
361
+ const compressionPresets = Object.entries(embeddingConstants.compression_presets || {}).map(([key, preset]) => ({
362
+ value: key,
363
+ label: key.split('_').map(word => word.charAt(0).toUpperCase() + word.slice(1)).join(' '),
364
+ description: preset.description,
365
+ variance_target: preset.variance_target,
366
+ }));
367
+
368
+ const getModelsForProvider = (provider: string) => {
369
+ return models[provider] || [];
370
+ };
371
+
372
+ const getCurrentModelDimensions = () => {
373
+ const provider = strategy.params?.llm || 'openai';
374
+ const modelValue = strategy.params?.model || getModelsForProvider(provider)[0]?.value;
375
+ const model = getModelsForProvider(provider).find(m => m.value === modelValue);
376
+ return model?.dimensions || 1536; // Default to 1536 if not found
377
+ };
378
+
379
+ const getPresetForVariance = (variance: number) => {
380
+ return compressionPresets.find(preset =>
381
+ Math.abs(preset.variance_target - variance) < 0.05
382
+ )?.value || null;
383
+ };
384
+
385
+ const getVarianceForPreset = (presetValue: string) => {
386
+ return compressionPresets.find(preset =>
387
+ preset.value === presetValue
388
+ )?.variance_target || 0.85; // Default to balanced
389
+ };
390
+
391
+ const handleDimensionsChange = (dimensions: number) => {
392
+ const variance = dimensions / getCurrentModelDimensions(); // Normalize to 0-1
393
+ const matchingPreset = getPresetForVariance(variance);
394
+
395
+ handleEmbeddingParamChange(type, {
396
+ dimensions,
397
+ preset: matchingPreset,
398
+ });
399
+ };
400
+
401
+ const handlePresetChange = (presetValue: string) => {
402
+ const variance = getVarianceForPreset(presetValue);
403
+ const dimensions = Math.round(variance * getCurrentModelDimensions());
404
+
405
+ handleEmbeddingParamChange(type, {
406
+ dimensions,
407
+ preset: presetValue,
408
+ });
409
+ };
410
+
411
+ return (
412
+ <div className="space-y-6 mt-8">
413
+ <div className="bg-blue-50 rounded-lg p-4">
414
+ <div className="flex gap-2">
415
+ <Brain className="w-5 h-5 text-blue-500 flex-shrink-0" />
416
+ <div>
417
+ <h4 className="text-sm font-medium text-blue-900">Text Embeddings</h4>
418
+ <p className="text-sm text-blue-700 mt-1">
419
+ Convert text into numerical vectors for machine learning, preserving semantic meaning while optimizing for storage and performance.
420
+ </p>
421
+ </div>
422
+ </div>
423
+ </div>
424
+
425
+ <div className="space-y-4">
426
+ <div>
427
+ <label className="block text-sm font-medium text-gray-700 mb-1">
428
+ Embedding Provider
429
+ </label>
430
+ <SearchableSelect
431
+ value={strategy.params?.llm || 'openai'}
432
+ onChange={(value) => {
433
+ const newModels = getModelsForProvider(value);
434
+ const firstModel = newModels[0]?.value;
435
+ const dimensions = newModels[0]?.dimensions || 1536;
436
+
437
+ handleEmbeddingParamChange(type, {
438
+ ...strategy.params,
439
+ llm: value,
440
+ model: firstModel,
441
+ dimensions: dimensions,
442
+ preset: 'high_quality',
443
+ });
444
+ }}
445
+ options={providers}
446
+ placeholder="Select a provider"
447
+ />
448
+ </div>
449
+
450
+ <div>
451
+ <label className="block text-sm font-medium text-gray-700 mb-1">
452
+ Model
453
+ </label>
454
+ <SearchableSelect
455
+ value={strategy.params?.model || getModelsForProvider(strategy.params?.llm || 'openai')[0]?.value}
456
+ onChange={(value) => {
457
+ const model = getModelsForProvider(strategy.params?.llm || 'openai').find(m => m.value === value);
458
+ const dimensions = model?.dimensions || 1536;
459
+
460
+ handleEmbeddingParamChange(type, {
461
+ ...strategy.params,
462
+ model: value,
463
+ dimensions: dimensions,
464
+ preset: 'high_quality',
465
+ });
466
+ }}
467
+ options={getModelsForProvider(strategy.params?.llm || 'openai')}
468
+ placeholder="Select a model"
469
+ />
470
+ </div>
471
+
472
+ <div className="space-y-4">
473
+ <div className="flex items-center justify-between">
474
+ <h4 className="text-sm font-medium text-gray-900">
475
+ Storage & Quality
476
+ </h4>
477
+ <div className="flex items-center gap-2 text-sm text-gray-500">
478
+ <Minimize2 className="w-4 h-4" />
479
+ <span>Storage</span>
480
+ <span className="mx-2">•</span>
481
+ <span>Quality</span>
482
+ <Maximize2 className="w-4 h-4" />
483
+ </div>
484
+ </div>
485
+
486
+ <div className="space-y-6">
487
+ <div>
488
+ <div className="flex items-center justify-between mb-2">
489
+ <span className="text-sm text-gray-600">Target Dimensions</span>
490
+ <span className="text-sm font-medium text-gray-900">{strategy.params?.dimensions || getCurrentModelDimensions()}</span>
491
+ </div>
492
+ <input
493
+ type="range"
494
+ min="2"
495
+ max={getCurrentModelDimensions()}
496
+ value={strategy.params?.dimensions || getCurrentModelDimensions()}
497
+ onChange={(e) => handleDimensionsChange(parseInt(e.target.value))}
498
+ className="w-full"
499
+ />
500
+ <div className="flex justify-between text-xs text-gray-500 mt-1">
501
+ <span>2</span>
502
+ <span>{getCurrentModelDimensions()}</span>
503
+ </div>
504
+ </div>
505
+
506
+ <div className="space-y-3">
507
+ <h5 className="text-sm font-medium text-gray-900">Quality Presets</h5>
508
+ {compressionPresets.map((preset) => (
509
+ <div
510
+ key={preset.value}
511
+ onClick={() => handlePresetChange(preset.value)}
512
+ className={`p-4 rounded-lg border transition-colors cursor-pointer
513
+ ${strategy.params?.preset === preset.value
514
+ ? 'border-blue-500 bg-blue-50'
515
+ : 'border-gray-200 hover:border-gray-300 bg-white'
516
+ }`}
517
+ >
518
+ <div className="flex items-center justify-between">
519
+ <div className="flex items-center gap-2">
520
+ <input
521
+ type="radio"
522
+ checked={strategy.params?.preset === preset.value}
523
+ onChange={() => handlePresetChange(preset.value)}
524
+ className="rounded-full border-gray-300 text-blue-600 focus:ring-blue-500"
525
+ />
526
+ <span className="font-medium text-gray-900">{preset.label}</span>
527
+ </div>
528
+ </div>
529
+ <p className="text-sm text-gray-600 mt-1 ml-6">{preset.description}</p>
530
+ </div>
531
+ ))}
532
+ </div>
533
+
534
+ <div className="space-y-4">
535
+ <div className="bg-gray-50 rounded-lg p-4">
536
+ <div className="flex items-start gap-2">
537
+ <HardDrive className="w-5 h-5 text-gray-400 flex-shrink-0 mt-0.5" />
538
+ <div className="flex-1">
539
+ <h5 className="text-sm font-medium text-gray-900">Storage Efficiency</h5>
540
+ <div className="mt-2">
541
+ <div className="w-full bg-gray-200 rounded-full h-2.5">
542
+ <div
543
+ className="h-full bg-green-600 rounded-full"
544
+ style={{ width: `${100 - ((strategy.params?.dimensions || 24) / getCurrentModelDimensions()) * 100}%` }}
545
+ />
546
+ </div>
547
+ <p className="text-sm text-gray-600 mt-2">
548
+ {strategy.params?.dimensions && strategy.params.dimensions <= getCurrentModelDimensions() * 0.25
549
+ ? "Optimized for storage. Maintains core meaning while significantly reducing storage requirements."
550
+ : strategy.params?.dimensions && strategy.params.dimensions <= getCurrentModelDimensions() * 0.5
551
+ ? "Balanced approach. Good compromise between quality and storage efficiency."
552
+ : "Prioritizes quality. Preserves more nuanced relationships but requires more storage."}
553
+ </p>
554
+ </div>
555
+ </div>
556
+ </div>
557
+ </div>
558
+
559
+ <div className="bg-gray-50 rounded-lg p-4">
560
+ <div className="flex items-start gap-2">
561
+ <Brain className="w-5 h-5 text-gray-400 flex-shrink-0 mt-0.5" />
562
+ <div className="flex-1">
563
+ <h5 className="text-sm font-medium text-gray-900">Information Preservation</h5>
564
+ <div className="mt-2">
565
+ <div className="w-full bg-gray-200 rounded-full h-2.5">
566
+ <div
567
+ className="bg-blue-600 h-2.5 rounded-full transition-all duration-300"
568
+ style={{ width: `${((strategy.params?.dimensions || 24) / getCurrentModelDimensions()) * 100}%` }}
569
+ />
570
+ </div>
571
+ <p className="text-sm text-gray-600 mt-2">
572
+ Preserves approximately {Math.round(((strategy.params?.dimensions || 24) / getCurrentModelDimensions()) * 100)}% of the original information
573
+ </p>
574
+ </div>
575
+ </div>
576
+ </div>
577
+ </div>
578
+ </div>
579
+ </div>
580
+ </div>
581
+ </div>
582
+ </div>
583
+ );
584
+ };
585
+
586
+ useEffect(() => {
587
+ // When component mounts or when column changes, update default dimensions
588
+ if (training.encoding === 'embedding' && !training.params?.dimensions) {
589
+ const provider = training.params?.llm || 'openai';
590
+ const modelValue = training.params?.model || (constants.embedding_constants?.models[provider] || [])[0]?.value;
591
+ const model = (constants.embedding_constants?.models[provider] || []).find(m => m.value === modelValue);
592
+ const defaultDimensions = model?.dimensions || 1536;
593
+
594
+ handleEmbeddingParamChange('training', {
595
+ dimensions: defaultDimensions
596
+ });
597
+ }
598
+
599
+ if (useDistinctInference && inference.encoding === 'embedding' && !inference.params?.dimensions) {
600
+ const provider = inference.params?.llm || 'openai';
601
+ const modelValue = inference.params?.model || (constants.embedding_constants?.models[provider] || [])[0]?.value;
602
+ const model = (constants.embedding_constants?.models[provider] || []).find(m => m.value === modelValue);
603
+ const defaultDimensions = model?.dimensions || 1536;
604
+
605
+ handleEmbeddingParamChange('inference', {
606
+ dimensions: defaultDimensions
607
+ });
608
+ }
609
+ }, [training.encoding, inference.encoding, column.id]);
610
+
210
611
  const [isEditingDescription, setIsEditingDescription] = useState(false);
211
612
 
212
613
  const onToggleDropIfNull = (e: React.ChangeEvent<HTMLInputElement>) => {
@@ -265,7 +666,7 @@ export function PreprocessingConfig({
265
666
  const renderStrategySpecificInfo = (type: 'training' | 'inference') => {
266
667
  const strategy = type === 'training' ? training : inference;
267
668
  let content;
268
- if (strategy.method === 'most_frequent' && column.statistics?.raw.most_frequent_value !== undefined) {
669
+ if (strategy.method === 'most_frequent' && column.statistics?.raw?.most_frequent_value !== undefined) {
269
670
  content = `Most Frequent Value: ${column.statistics.raw.most_frequent_value}`
270
671
  } else if (strategy.method === 'ffill') {
271
672
  const lastValue = column.statistics?.raw.last_value;
@@ -290,6 +691,47 @@ export function PreprocessingConfig({
290
691
  );
291
692
  };
292
693
 
694
+ const renderLineageInfo = () => {
695
+ return (
696
+ <div className="space-y-4">
697
+ {column.lineage.map((step, index) => (
698
+ <div key={index} className="flex items-start gap-3">
699
+ <div className={`w-8 h-8 rounded-full flex items-center justify-center flex-shrink-0 ${
700
+ step.key === 'raw_dataset'
701
+ ? 'bg-gray-100'
702
+ : step.key === 'computed_by_feature'
703
+ ? 'bg-purple-100'
704
+ : 'bg-blue-100'
705
+ }`}>
706
+ {step.key === 'raw_dataset' ? (
707
+ <Database className="w-4 h-4 text-gray-600" />
708
+ ) : step.key === 'computed_by_feature' ? (
709
+ <Calculator className="w-4 h-4 text-purple-600" />
710
+ ) : (
711
+ <Settings2 className="w-4 h-4 text-blue-600" />
712
+ )}
713
+ </div>
714
+ <div className="flex-1">
715
+ <div className="flex items-center justify-between">
716
+ <p className="text-sm font-medium text-gray-900">
717
+ {step.description}
718
+ </p>
719
+ {step.timestamp && (
720
+ <span className="text-xs text-gray-500">
721
+ {new Date(step.timestamp).toLocaleString()}
722
+ </span>
723
+ )}
724
+ </div>
725
+ {index < column.lineage.length - 1 && (
726
+ <div className="ml-4 mt-2 mb-2 w-0.5 h-4 bg-gray-200" />
727
+ )}
728
+ </div>
729
+ </div>
730
+ ))}
731
+ </div>
732
+ );
733
+ };
734
+
293
735
  return (
294
736
  <div className="space-y-8">
295
737
  {/* Column Header Section */}
@@ -381,7 +823,7 @@ export function PreprocessingConfig({
381
823
  <div className="space-y-2">
382
824
  <div className="flex justify-between text-sm">
383
825
  <span className="text-gray-600">Null Values:</span>
384
- <span className="font-medium text-gray-900">{column.statistics?.raw?.null_count.toLocaleString()}</span>
826
+ <span className="font-medium text-gray-900">{column.statistics?.raw?.null_count?.toLocaleString()}</span>
385
827
  </div>
386
828
  <div className="flex justify-between text-sm">
387
829
  <span className="text-gray-600">Total Rows:</span>
@@ -392,7 +834,7 @@ export function PreprocessingConfig({
392
834
  <span className="font-medium text-gray-900">{nullPercentage.toFixed(2)}%</span>
393
835
  </div>
394
836
  <div className="mt-2">
395
- <div className="w-full h-2 bg-gray-200 rounded-full overflow-hidden">
837
+ <div className="w-full bg-gray-200 rounded-full h-2.5">
396
838
  <div
397
839
  className="h-full bg-blue-600 rounded-full"
398
840
  style={{ width: `${nullPercentage}%` }}
@@ -422,7 +864,7 @@ export function PreprocessingConfig({
422
864
  <span className="font-medium text-gray-900">{nullPercentageProcessed.toFixed(2)}%</span>
423
865
  </div>
424
866
  <div className="mt-2">
425
- <div className="w-full h-2 bg-gray-200 rounded-full overflow-hidden">
867
+ <div className="w-full bg-gray-200 rounded-full h-2.5">
426
868
  <div
427
869
  className="h-full bg-blue-600 rounded-full"
428
870
  style={{ width: `${nullPercentageProcessed}%` }}
@@ -462,7 +904,7 @@ export function PreprocessingConfig({
462
904
  <div className="flex items-center justify-between mb-2">
463
905
  <span className="text-sm font-medium text-gray-700">Null Distribution</span>
464
906
  <span className="text-sm text-gray-500">
465
- {nullPercentage}% of values are null
907
+ {nullPercentage.toFixed(2)}% of values are null
466
908
  </span>
467
909
  </div>
468
910
  <div className="relative h-2 bg-gray-100 rounded-full overflow-hidden">
@@ -486,63 +928,21 @@ export function PreprocessingConfig({
486
928
  <h4 className="text-sm font-medium text-gray-700 mb-2">Sample Values</h4>
487
929
  <div className="bg-gray-50 rounded-lg p-4">
488
930
  <div className="flex flex-wrap gap-2">
489
- {column.statistics?.raw?.sample_data && column.statistics.raw.sample_data.map((value, index) => (
490
- <span key={index} className="px-2 py-1 bg-gray-100 rounded text-sm text-gray-700">
491
- {String(value)}
931
+ {Array.isArray(column.sample_values) ? column.sample_values.slice(0, 3).map((value: any, index: number) => (
932
+ <span key={index} className="m-1 flex-items items-center">
933
+ <Badge>
934
+ {typeof value === 'string'
935
+ ? value.split(/\s+/).slice(0, 10).join(' ') + (value.split(/\s+/).length > 10 ? '...' : '')
936
+ : String(value)}
937
+ </Badge>
492
938
  </span>
493
- ))}
939
+ )) : []}
494
940
  </div>
495
941
  </div>
496
942
  </div>
497
943
  )}
498
944
  </div>
499
945
 
500
- {/* Column Lineage Section */}
501
- {column.lineage && column.lineage.length > 0 && (
502
- <div className="bg-white rounded-lg border border-gray-200 p-6">
503
- <h3 className="text-lg font-medium text-gray-900 mb-4 flex items-center gap-2">
504
- <GitBranch className="w-5 h-5 text-gray-500" />
505
- Column Lineage
506
- </h3>
507
- <div className="space-y-4">
508
- {column.lineage.map((step, index) => (
509
- <div key={index} className="flex items-start gap-3">
510
- <div className={`w-8 h-8 rounded-full flex items-center justify-center flex-shrink-0 ${
511
- step.key === 'raw_dataset'
512
- ? 'bg-gray-100'
513
- : step.key === 'computed_by_feature'
514
- ? 'bg-purple-100'
515
- : 'bg-blue-100'
516
- }`}>
517
- {step.key === 'raw_dataset' ? (
518
- <Database className="w-4 h-4 text-gray-600" />
519
- ) : step.key === 'computed_by_feature' ? (
520
- <Calculator className="w-4 h-4 text-purple-600" />
521
- ) : (
522
- <Settings2 className="w-4 h-4 text-blue-600" />
523
- )}
524
- </div>
525
- <div className="flex-1">
526
- <div className="flex items-center justify-between">
527
- <p className="text-sm font-medium text-gray-900">
528
- {step.description}
529
- </p>
530
- {step.timestamp && (
531
- <span className="text-xs text-gray-500">
532
- {new Date(step.timestamp).toLocaleString()}
533
- </span>
534
- )}
535
- </div>
536
- {index < column.lineage.length - 1 && (
537
- <div className="ml-4 mt-2 mb-2 w-0.5 h-4 bg-gray-200" />
538
- )}
539
- </div>
540
- </div>
541
- ))}
542
- </div>
543
- </div>
544
- )}
545
-
546
946
  {/* Data Type Section */}
547
947
  <div className="bg-white rounded-lg border border-gray-200 p-6">
548
948
  <h3 className="text-lg font-medium text-gray-900 mb-4 flex items-center gap-2">
@@ -555,20 +955,14 @@ export function PreprocessingConfig({
555
955
  <label className="block text-sm font-medium text-gray-700 mb-1">
556
956
  Column Type
557
957
  </label>
558
- <select
958
+ <SearchableSelect
559
959
  value={selectedType}
560
- disabled
561
- className="w-full rounded-md border-gray-300 bg-gray-50 shadow-sm text-gray-700 cursor-not-allowed"
562
- >
563
- {constants.column_types.map(type => (
564
- <option key={type.value} value={type.value}>
565
- {type.label}
566
- </option>
567
- ))}
568
- </select>
569
- <p className="mt-1 text-sm text-gray-500">
570
- Column type cannot be changed after creation
571
- </p>
960
+ onChange={(value) => setColumnType(column.name, value)}
961
+ options={constants.column_types.map(type => ({
962
+ value: type.value,
963
+ label: type.label
964
+ }))}
965
+ />
572
966
  </div>
573
967
 
574
968
  <div className="bg-gray-50 rounded-md p-4">
@@ -577,7 +971,9 @@ export function PreprocessingConfig({
577
971
  {Array.isArray(column.sample_values) ? column.sample_values.slice(0, 3).map((value: any, index: number) => (
578
972
  <span key={index} className="m-1 flex-items items-center">
579
973
  <Badge>
580
- {String(value)}
974
+ {typeof value === 'string'
975
+ ? value.split(/\s+/).slice(0, 10).join(' ') + (value.split(/\s+/).length > 10 ? '...' : '')
976
+ : String(value)}
581
977
  </Badge>
582
978
  </span>
583
979
  )) : []}
@@ -621,23 +1017,25 @@ export function PreprocessingConfig({
621
1017
  </div>
622
1018
 
623
1019
  <div className={useDistinctInference ? "grid grid-cols-2 gap-6" : ""}>
624
- <div>
625
- <select
1020
+ <div className="relative z-50">
1021
+ <SearchableSelect
626
1022
  value={training.method}
627
- onChange={(e) => handleStrategyChange('training', e.target.value as PreprocessingStep['method'])}
628
- className="w-full rounded-md border-gray-300 shadow-sm focus:border-blue-500 focus:ring-blue-500"
629
- >
630
- <option value="none">No preprocessing</option>
631
- {constants.preprocessing_strategies[selectedType]?.map((strategy: { value: string; label: string; }) => (
632
- <option key={strategy.value} value={strategy.value}>
633
- {strategy.label}
634
- </option>
635
- ))}
636
- </select>
1023
+ onChange={(value) => handleStrategyChange('training', value as PreprocessingStep['method'])}
1024
+ options={[
1025
+ { value: 'none', label: 'No preprocessing' },
1026
+ ...(constants.preprocessing_strategies[selectedType]?.map((strategy: { value: string; label: string; }) => ({
1027
+ value: strategy.value,
1028
+ label: strategy.label
1029
+ })) || [])
1030
+ ]}
1031
+ options={constants.preprocessing_strategies[selectedType]}
1032
+ />
637
1033
 
638
1034
  {renderStrategySpecificInfo('training')}
639
1035
  {renderConstantValueInput('training')}
640
- {(column.datatype === 'categorical' && training.method === 'categorical') && (
1036
+ {renderEncodingConfig('training')}
1037
+ {renderEmbeddingConfig('training')}
1038
+ {(column.datatype === 'categorical' && training.method === 'categorical') && (
641
1039
  <div className="mt-4 space-y-4 bg-gray-50 rounded-lg p-4">
642
1040
  <div>
643
1041
  <label className="block text-sm font-medium text-gray-700 mb-1">
@@ -658,43 +1056,6 @@ export function PreprocessingConfig({
658
1056
  </div>
659
1057
  </div>
660
1058
  )}
661
- {(column.datatype === 'categorical' && training.method !== 'none') && (
662
- <div className="mt-4 space-y-4 bg-gray-50 rounded-lg p-4">
663
- <h4 className="text-sm font-medium text-gray-900 mb-2">Encoding</h4>
664
- <div className="flex items-center gap-2">
665
- <input
666
- type="radio"
667
- id="oneHotEncode"
668
- name="encoding"
669
- checked={training.params.one_hot}
670
- onChange={() => handleCategoricalParamChange('training', {
671
- one_hot: true,
672
- ordinal_encoding: false
673
- })}
674
- className="rounded border-gray-300 text-blue-600 focus:ring-blue-500"
675
- />
676
- <label htmlFor="oneHotEncode" className="text-sm text-gray-700">
677
- One-hot encode categories
678
- </label>
679
- </div>
680
- <div className="flex items-center gap-2">
681
- <input
682
- type="radio"
683
- id="ordinalEncode"
684
- name="encoding"
685
- checked={training.params.ordinal_encoding}
686
- onChange={() => handleCategoricalParamChange('training', {
687
- one_hot: false,
688
- ordinal_encoding: true
689
- })}
690
- className="rounded border-gray-300 text-blue-600 focus:ring-blue-500"
691
- />
692
- <label htmlFor="ordinalEncode" className="text-sm text-gray-700">
693
- Ordinal encode categories
694
- </label>
695
- </div>
696
- </div>
697
- )}
698
1059
  </div>
699
1060
 
700
1061
  {useDistinctInference && (
@@ -705,20 +1066,21 @@ export function PreprocessingConfig({
705
1066
  Inference Strategy
706
1067
  </span>
707
1068
  </div>
708
- <select
1069
+ <SearchableSelect
709
1070
  value={inference.method}
710
- onChange={(e) => handleStrategyChange('inference', e.target.value as PreprocessingStep['method'])}
711
- className="w-full rounded-md border-gray-300 shadow-sm focus:border-blue-500 focus:ring-blue-500"
712
- >
713
- <option value="none">No preprocessing</option>
714
- {constants.preprocessing_strategies[selectedType]?.map((strategy: { value: string; label: string; }) => (
715
- <option key={strategy.value} value={strategy.value}>
716
- {strategy.label}
717
- </option>
718
- ))}
719
- </select>
1071
+ onChange={(value) => handleStrategyChange('inference', value as PreprocessingStep['method'])}
1072
+ options={[
1073
+ { value: 'none', label: 'No preprocessing' },
1074
+ ...(constants.preprocessing_strategies[selectedType]?.map((strategy: { value: string; label: string; }) => ({
1075
+ value: strategy.value,
1076
+ label: strategy.label
1077
+ })) || [])
1078
+ ]}
1079
+ />
720
1080
 
721
1081
  {renderConstantValueInput('inference')}
1082
+ {renderEncodingConfig('inference')}
1083
+ {renderEmbeddingConfig('inference')}
722
1084
  </div>
723
1085
  )}
724
1086
  </div>
@@ -766,6 +1128,17 @@ export function PreprocessingConfig({
766
1128
  </div>
767
1129
  </div>
768
1130
 
1131
+ {/* Column Lineage Section */}
1132
+ <div className="bg-white rounded-lg border border-gray-200 p-6">
1133
+ <h3 className="text-lg font-medium text-gray-900 mb-4 flex items-center gap-2">
1134
+ <GitBranch className="w-5 h-5 text-gray-500" />
1135
+ Column Lineage
1136
+ </h3>
1137
+
1138
+ <div className="space-y-4">
1139
+ {renderLineageInfo()}
1140
+ </div>
1141
+ </div>
769
1142
  </div>
770
1143
  );
771
1144
  }