easy_ml 0.2.0.pre.rc58 → 0.2.0.pre.rc61

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (86) hide show
  1. checksums.yaml +4 -4
  2. data/app/controllers/easy_ml/application_controller.rb +4 -0
  3. data/app/controllers/easy_ml/datasets_controller.rb +32 -1
  4. data/app/frontend/components/DatasetPreview.tsx +50 -19
  5. data/app/frontend/components/dataset/ColumnConfigModal.tsx +7 -1
  6. data/app/frontend/components/dataset/ColumnFilters.tsx +37 -3
  7. data/app/frontend/components/dataset/ColumnList.tsx +14 -2
  8. data/app/frontend/components/dataset/PreprocessingConfig.tsx +81 -20
  9. data/app/frontend/types/dataset.ts +3 -0
  10. data/app/jobs/easy_ml/compute_feature_job.rb +0 -3
  11. data/app/jobs/easy_ml/refresh_dataset_job.rb +0 -6
  12. data/app/models/easy_ml/column/imputers/base.rb +89 -0
  13. data/app/models/easy_ml/column/imputers/categorical.rb +35 -0
  14. data/app/models/easy_ml/column/imputers/clip.rb +30 -0
  15. data/app/models/easy_ml/column/imputers/constant.rb +27 -0
  16. data/app/models/easy_ml/column/imputers/ffill.rb +29 -0
  17. data/app/models/easy_ml/column/imputers/imputer.rb +103 -0
  18. data/app/models/easy_ml/column/imputers/mean.rb +27 -0
  19. data/app/models/easy_ml/column/imputers/median.rb +27 -0
  20. data/app/models/easy_ml/column/imputers/most_frequent.rb +27 -0
  21. data/app/models/easy_ml/column/imputers/null_imputer.rb +15 -0
  22. data/app/models/easy_ml/column/imputers/one_hot_encoder.rb +30 -0
  23. data/app/models/easy_ml/column/imputers/ordinal_encoder.rb +78 -0
  24. data/app/models/easy_ml/column/imputers/today.rb +20 -0
  25. data/app/models/easy_ml/column/imputers.rb +126 -0
  26. data/app/models/easy_ml/column/learner.rb +18 -0
  27. data/app/models/easy_ml/column/learners/base.rb +103 -0
  28. data/app/models/easy_ml/column/learners/boolean.rb +11 -0
  29. data/app/models/easy_ml/column/learners/categorical.rb +51 -0
  30. data/app/models/easy_ml/column/learners/datetime.rb +19 -0
  31. data/app/models/easy_ml/column/learners/null.rb +22 -0
  32. data/app/models/easy_ml/column/learners/numeric.rb +33 -0
  33. data/app/models/easy_ml/column/learners/string.rb +15 -0
  34. data/app/models/easy_ml/column/lineage/base.rb +22 -0
  35. data/app/models/easy_ml/column/lineage/computed_by_feature.rb +23 -0
  36. data/app/models/easy_ml/column/lineage/preprocessed.rb +23 -0
  37. data/app/models/easy_ml/column/lineage/raw_dataset.rb +23 -0
  38. data/app/models/easy_ml/column/lineage.rb +28 -0
  39. data/app/models/easy_ml/column/selector.rb +96 -0
  40. data/app/models/easy_ml/column.rb +319 -52
  41. data/app/models/easy_ml/column_history.rb +29 -22
  42. data/app/models/easy_ml/column_list.rb +63 -78
  43. data/app/models/easy_ml/dataset.rb +128 -96
  44. data/app/models/easy_ml/dataset_history.rb +23 -23
  45. data/app/models/easy_ml/datasource.rb +3 -0
  46. data/app/models/easy_ml/datasource_history.rb +1 -0
  47. data/app/models/easy_ml/datasources/file_datasource.rb +1 -1
  48. data/app/models/easy_ml/datasources/polars_datasource.rb +6 -12
  49. data/app/models/easy_ml/datasources/s3_datasource.rb +1 -1
  50. data/app/models/easy_ml/feature.rb +19 -7
  51. data/app/models/easy_ml/feature_history.rb +12 -0
  52. data/app/models/easy_ml/feature_list.rb +15 -0
  53. data/app/serializers/easy_ml/column_serializer.rb +11 -1
  54. data/app/serializers/easy_ml/dataset_serializer.rb +23 -2
  55. data/config/initializers/enumerable.rb +17 -0
  56. data/lib/easy_ml/data/date_converter.rb +137 -30
  57. data/lib/easy_ml/data/polars_column.rb +17 -0
  58. data/lib/easy_ml/data/polars_in_memory.rb +30 -0
  59. data/lib/easy_ml/data/polars_reader.rb +20 -1
  60. data/lib/easy_ml/data/splits/in_memory_split.rb +3 -5
  61. data/lib/easy_ml/data/splits/split.rb +2 -1
  62. data/lib/easy_ml/data/synced_directory.rb +1 -1
  63. data/lib/easy_ml/data.rb +1 -2
  64. data/lib/easy_ml/engine.rb +1 -0
  65. data/lib/easy_ml/feature_store.rb +33 -22
  66. data/lib/easy_ml/railtie/generators/migration/migration_generator.rb +4 -0
  67. data/lib/easy_ml/railtie/templates/migration/add_computed_columns_to_easy_ml_columns.rb.tt +4 -0
  68. data/lib/easy_ml/railtie/templates/migration/add_last_feature_sha_to_columns.rb.tt +9 -0
  69. data/lib/easy_ml/railtie/templates/migration/add_learned_at_to_easy_ml_columns.rb.tt +13 -0
  70. data/lib/easy_ml/railtie/templates/migration/add_sha_to_datasources_datasets_and_columns.rb.tt +21 -0
  71. data/lib/easy_ml/railtie/templates/migration/remove_preprocessor_statistics_from_easy_ml_datasets.rb.tt +11 -0
  72. data/lib/easy_ml/version.rb +1 -1
  73. data/lib/tasks/profile.rake +40 -0
  74. data/public/easy_ml/assets/.vite/manifest.json +2 -2
  75. data/public/easy_ml/assets/assets/Application-BbFobaXt.css +1 -0
  76. data/public/easy_ml/assets/assets/entrypoints/Application.tsx-Dni_GM8r.js +489 -0
  77. data/public/easy_ml/assets/assets/entrypoints/Application.tsx-Dni_GM8r.js.map +1 -0
  78. metadata +41 -10
  79. data/app/models/easy_ml/adapters/base_adapter.rb +0 -45
  80. data/app/models/easy_ml/adapters/polars_adapter.rb +0 -77
  81. data/lib/easy_ml/data/preprocessor.rb +0 -340
  82. data/lib/easy_ml/data/simple_imputer.rb +0 -255
  83. data/lib/easy_ml/data/statistics_learner.rb +0 -193
  84. data/public/easy_ml/assets/assets/Application-BUsRR6b6.css +0 -1
  85. data/public/easy_ml/assets/assets/entrypoints/Application.tsx-DmkdJsDd.js +0 -474
  86. data/public/easy_ml/assets/assets/entrypoints/Application.tsx-DmkdJsDd.js.map +0 -1
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 1a25c50b89c079e7e62f52d1f5a52ef16f3d7bc9b388fcee9a7b0983148de9cd
4
- data.tar.gz: bfcf0d06fbe498ccc70251c144649d7fd6699c2bdc4a9acbf5866e60ce04c7bd
3
+ metadata.gz: 1b194e446191bb71affc096c0dbc049f2b0dc76878b83d4ff1eb729e9d8a29b9
4
+ data.tar.gz: c33ef37f01db6b965433c8fe90e487905e17ebdfcf6967fa93ef6cf10c8755eb
5
5
  SHA512:
6
- metadata.gz: 77186b1d2d7558db7d128e03c68f8632af6f28be4b1bf2daa71ac804abbcc5a26470fe29a802258d326048155fea46a7f707e46fc88cf8571af5f30cb870d839
7
- data.tar.gz: a22ba3e21ab32e64674033f0c83d023e41c3c5f158117be5fa8b85f1865a4bfc39bda72042e4c1ff80277d4d664fcc694349f7a377933c7c52f23417c299618b
6
+ metadata.gz: 92d0d03c4d258942c01967e6afe1ec2dc956a2a70b5d95d400f44189948b4192c9cd5a599c8d5b48cae2c0b72be18ee7926dfb6b4555d32e3e6292e03070f30e
7
+ data.tar.gz: 57b3e65c77f765ecce66fdcfaa0bce24c2c31a1b5f9b248a25dcd2a5fc587450df3dee496f9cd75a6c7754c6cc5cef173c04c9c04d8da495f0dbfbb95b97d6e1
@@ -23,6 +23,10 @@ module EasyML
23
23
  SettingsSerializer.new(settings).serializable_hash.dig(:data, :attributes)
24
24
  end
25
25
 
26
+ def dataset_to_json_small(dataset)
27
+ DatasetSerializer::SmallSerializer.new(dataset).serializable_hash.dig(:data, :attributes)
28
+ end
29
+
26
30
  def dataset_to_json(dataset)
27
31
  DatasetSerializer.new(dataset).serializable_hash.dig(:data, :attributes)
28
32
  end
@@ -26,7 +26,7 @@ module EasyML
26
26
  datasets = Dataset.all.order(id: :desc)
27
27
 
28
28
  render inertia: "pages/DatasetsPage", props: {
29
- datasets: datasets.map { |dataset| dataset_to_json(dataset) },
29
+ datasets: datasets.map { |dataset| dataset_to_json_small(dataset) },
30
30
  constants: Dataset.constants,
31
31
  }
32
32
  end
@@ -76,6 +76,37 @@ module EasyML
76
76
  column_attrs[:preprocessing_steps] = nil if column_attrs.dig(:preprocessing_steps, :training, :method) == "none"
77
77
  end
78
78
 
79
+ # Handle feature ID assignment for existing features
80
+ if dataset_params[:features_attributes].present?
81
+ # Clean up any feature IDs that don't exist anymore
82
+ feature_ids = dataset_params[:features_attributes].map { |attrs| attrs[:id] }.compact
83
+ existing_feature_ids = Feature.where(id: feature_ids).pluck(:id)
84
+
85
+ params[:dataset][:features_attributes].each do |attrs|
86
+ if attrs[:id].present? && !existing_feature_ids.include?(attrs[:id].to_i)
87
+ attrs.delete(:id)
88
+ end
89
+ end
90
+
91
+ # Find existing features by feature_class
92
+ feature_classes = dataset_params[:features_attributes].map { |attrs|
93
+ attrs[:feature_class] if attrs[:id].blank?
94
+ }.compact
95
+
96
+ existing_features = Feature.where(feature_class: feature_classes)
97
+
98
+ # Update params with existing feature IDs
99
+ existing_features.each do |feature|
100
+ matching_param_index = params[:dataset][:features_attributes].find_index { |attrs|
101
+ attrs[:feature_class] == feature.feature_class
102
+ }
103
+
104
+ if matching_param_index
105
+ params[:dataset][:features_attributes][matching_param_index][:id] = feature.id
106
+ end
107
+ end
108
+ end
109
+
79
110
  if dataset.update(dataset_params)
80
111
  flash.now[:notice] = "Dataset configuration was successfully updated."
81
112
  render inertia: "pages/DatasetDetailsPage", props: {
@@ -58,9 +58,11 @@ export function DatasetPreview({ dataset }: DatasetPreviewProps) {
58
58
  key={column.name}
59
59
  className="bg-gray-50 rounded-lg p-4"
60
60
  >
61
- <div className="flex items-center justify-between mb-2">
62
- <h4 className="font-medium text-gray-900">{column.name}</h4>
63
- <span className="text-xs font-medium text-gray-500 px-2 py-1 bg-gray-200 rounded-full">
61
+ <div className="flex items-center justify-between mb-2 gap-2">
62
+ <h4 className="font-medium text-gray-900 break-normal max-w-[70%] word-break:break-word overflow-wrap:anywhere whitespace-pre-wrap">
63
+ {column.name.split('_').join('_\u200B')}
64
+ </h4>
65
+ <span className="text-xs font-medium text-gray-500 px-2 py-1 bg-gray-200 rounded-full flex-shrink-0">
64
66
  {column.datatype}
65
67
  </span>
66
68
  </div>
@@ -68,23 +70,48 @@ export function DatasetPreview({ dataset }: DatasetPreviewProps) {
68
70
  {column.statistics && (
69
71
  <div className="space-y-1">
70
72
  {Object.entries(column.statistics.raw).map(([key, value]) => {
71
- if (key === "counts") {
73
+ // Skip internal keys and null/undefined values
74
+ if (key === "counts" ||
75
+ key === "allowed_categories" ||
76
+ key === "value" ||
77
+ key === "label_encoder" ||
78
+ key === "label_decoder" ||
79
+ value === null ||
80
+ value === undefined) {
72
81
  return null;
73
82
  }
83
+
84
+ // Format the value based on its type
85
+ let displayValue: string;
86
+ if (typeof value === 'number') {
87
+ displayValue = value.toLocaleString(undefined, {
88
+ maximumFractionDigits: 2
89
+ });
90
+ } else if (typeof value === 'object') {
91
+ // Handle arrays or other objects
92
+ displayValue = JSON.stringify(value);
93
+ } else if (typeof value === 'boolean') {
94
+ displayValue = value.toString();
95
+ } else {
96
+ displayValue = String(value);
97
+ }
98
+
99
+ // Truncate long strings
100
+ if (displayValue.length > 50) {
101
+ displayValue = displayValue.slice(0, 47) + '...';
102
+ }
103
+
74
104
  return (
75
- <div key={key} className="flex justify-between text-sm">
76
- <span className="text-gray-500">
77
- {key.charAt(0).toUpperCase() + key.slice(1)}:
78
- </span>
79
- <span className="font-medium text-gray-900">
80
- {typeof value === 'number' ?
81
- value.toLocaleString(undefined, {
82
- maximumFractionDigits: 2
83
- }) :
84
- value}
85
- </span>
86
- </div>
87
- )})}
105
+ <div key={key} className="flex justify-between text-sm gap-2">
106
+ <span className="text-gray-500 flex-shrink-0">
107
+ {key.charAt(0).toUpperCase() + key.slice(1)}:
108
+ </span>
109
+ <span className="font-medium text-gray-900 text-right break-all">
110
+ {displayValue}
111
+ </span>
112
+ </div>
113
+ );
114
+ })}
88
115
  </div>
89
116
  )}
90
117
  </div>
@@ -142,10 +169,14 @@ export function DatasetPreview({ dataset }: DatasetPreviewProps) {
142
169
  <tr key={i}>
143
170
  {columns.map((column) => (
144
171
  <td
145
- key={row[column]}
172
+ key={`${i}-${column}`}
146
173
  className="whitespace-nowrap px-3 py-4 text-sm text-gray-500"
147
174
  >
148
- {row[column]?.toString()}
175
+ {row[column] === null || row[column] === undefined
176
+ ? ''
177
+ : typeof row[column] === 'object'
178
+ ? JSON.stringify(row[column])
179
+ : String(row[column])}
149
180
  </td>
150
181
  ))}
151
182
  </tr>
@@ -58,7 +58,7 @@ export function ColumnConfigModal({
58
58
  const [selectedColumn, setSelectedColumn] = useState<string | null>(null);
59
59
  const [searchQuery, setSearchQuery] = useState("");
60
60
  const [activeFilters, setActiveFilters] = useState<{
61
- view: "all" | "training" | "hidden" | "preprocessed" | "nulls";
61
+ view: "all" | "training" | "hidden" | "preprocessed" | "nulls" | "computed" | "required";
62
62
  types: string[];
63
63
  }>({
64
64
  view: "all",
@@ -103,6 +103,10 @@ export function ColumnConfigModal({
103
103
  return colHasPreprocessingSteps(column);
104
104
  case "nulls":
105
105
  return (column.statistics?.processed?.null_count || 0) > 0;
106
+ case "computed":
107
+ return column.is_computed;
108
+ case "required":
109
+ return column.required;
106
110
  default:
107
111
  return true;
108
112
  }
@@ -124,6 +128,8 @@ export function ColumnConfigModal({
124
128
  withNulls: dataset.columns.filter(
125
129
  (c) => (c.statistics?.processed?.null_count || 0) > 0
126
130
  ).length,
131
+ computed: dataset.columns.filter((c) => c.is_computed === true).length,
132
+ required: dataset.columns.filter((c) => c.required === true).length,
127
133
  }),
128
134
  [dataset.columns, filteredColumns]
129
135
  );
@@ -1,16 +1,16 @@
1
1
  import React, { useState } from 'react';
2
- import { Filter, Database, Wrench, Eye, EyeOff, AlertTriangle, ChevronLeft, ChevronRight } from 'lucide-react';
2
+ import { Filter, Database, Wrench, Eye, EyeOff, AlertTriangle, ChevronLeft, ChevronRight, Calculator, Target } from 'lucide-react';
3
3
  import type { Column } from '../../types';
4
4
 
5
5
  const ITEMS_PER_PAGE = 5;
6
6
  interface ColumnFiltersProps {
7
7
  types: string[];
8
8
  activeFilters: {
9
- view: 'all' | 'training' | 'hidden' | 'preprocessed' | 'nulls';
9
+ view: 'all' | 'training' | 'hidden' | 'preprocessed' | 'nulls' | 'computed' | 'required';
10
10
  types: string[];
11
11
  };
12
12
  onFilterChange: (filters: {
13
- view: 'all' | 'training' | 'hidden' | 'preprocessed' | 'nulls';
13
+ view: 'all' | 'training' | 'hidden' | 'preprocessed' | 'nulls' | 'computed' | 'required';
14
14
  types: string[];
15
15
  }) => void;
16
16
  columnStats: {
@@ -20,6 +20,8 @@ interface ColumnFiltersProps {
20
20
  hidden: number;
21
21
  withPreprocessing: number;
22
22
  withNulls: number;
23
+ computed: number;
24
+ required: number;
23
25
  };
24
26
  colHasPreprocessingSteps: (col: Column) => boolean;
25
27
  columns: Column[];
@@ -43,6 +45,10 @@ export function ColumnFilters({
43
45
  return `${columnStats.withPreprocessing} columns`;
44
46
  case 'nulls':
45
47
  return `${columnStats.withNulls} columns`;
48
+ case 'computed':
49
+ return `${columnStats.computed} columns`;
50
+ case 'required':
51
+ return `${columnStats.required} columns`;
46
52
  default:
47
53
  return `${columnStats.total} columns`;
48
54
  }
@@ -158,6 +164,34 @@ export function ColumnFilters({
158
164
  ({getViewStats('nulls')})
159
165
  </span>
160
166
  </button>
167
+ <button
168
+ onClick={() => onFilterChange({ ...activeFilters, view: 'computed' })}
169
+ className={`inline-flex items-center gap-1 px-3 py-1.5 rounded-md text-sm font-medium ${
170
+ activeFilters.view === 'computed'
171
+ ? 'bg-purple-100 text-purple-900'
172
+ : 'text-gray-600 hover:bg-gray-50'
173
+ }`}
174
+ >
175
+ <Calculator className="w-4 h-4" />
176
+ Computed
177
+ <span className="text-xs text-gray-500 ml-1">
178
+ ({getViewStats('computed')})
179
+ </span>
180
+ </button>
181
+ <button
182
+ onClick={() => onFilterChange({ ...activeFilters, view: 'required' })}
183
+ className={`inline-flex items-center gap-1 px-3 py-1.5 rounded-md text-sm font-medium ${
184
+ activeFilters.view === 'required'
185
+ ? 'bg-blue-100 text-blue-900'
186
+ : 'text-gray-600 hover:bg-gray-50'
187
+ }`}
188
+ >
189
+ <Target className="w-4 h-4" />
190
+ Required
191
+ <span className="text-xs text-gray-500 ml-1">
192
+ ({getViewStats('required')})
193
+ </span>
194
+ </button>
161
195
  </div>
162
196
 
163
197
  {/* Column Types */}
@@ -1,5 +1,5 @@
1
1
  import React from 'react';
2
- import { Settings2, AlertCircle, Target, EyeOff, Eye } from 'lucide-react';
2
+ import { Settings2, AlertCircle, Target, EyeOff, Eye, Calculator, Star } from 'lucide-react';
3
3
  import type { Column } from '../../types';
4
4
  import { usePage } from "@inertiajs/react";
5
5
 
@@ -79,11 +79,23 @@ export function ColumnList({
79
79
  </p>
80
80
  )}
81
81
  <div className="flex flex-wrap gap-2">
82
+ {column.required && (
83
+ <div className="flex items-center gap-1 text-blue-600">
84
+ <Star className="w-3 h-3" />
85
+ <span className="text-xs">required</span>
86
+ </div>
87
+ )}
88
+ {column.is_computed && (
89
+ <div className="flex items-center gap-1 text-purple-600">
90
+ <Calculator className="w-3 h-3" />
91
+ <span className="text-xs">computed</span>
92
+ </div>
93
+ )}
82
94
  {column.preprocessing_steps && column.preprocessing_steps?.training &&
83
95
  column.preprocessing_steps?.training?.method !== 'none' && (
84
96
  <div className="flex items-center gap-1 text-blue-600">
85
97
  <AlertCircle className="w-3 h-3" />
86
- <span className="text-xs">Preprocessing configured</span>
98
+ <span className="text-xs">preprocessing configured</span>
87
99
  </div>
88
100
  )}
89
101
  {column.hidden && (
@@ -1,5 +1,5 @@
1
1
  import React, { useState, useEffect } from 'react';
2
- import { Settings2, Wrench, ArrowRight, Pencil, Trash2, Database } from 'lucide-react';
2
+ import { Settings2, Wrench, ArrowRight, Pencil, Trash2, Database, Calculator, GitBranch } from 'lucide-react';
3
3
  import type { Dataset, Column, ColumnType, PreprocessingConstants, PreprocessingSteps, PreprocessingStep } from '../../types/dataset';
4
4
  import { Badge } from "@/components/ui/badge";
5
5
 
@@ -332,26 +332,41 @@ export function PreprocessingConfig({
332
332
  </div>
333
333
  </div>
334
334
  <div className="flex items-center gap-4 flex-shrink-0">
335
- {column.is_target ? (
336
- <span className="inline-flex items-center px-3 py-1 rounded-full text-sm font-medium bg-purple-100 text-purple-800">
337
- Target Column
338
- </span>
339
- ) : (
340
- <div className="flex items-center gap-2">
341
- <label className="flex items-center gap-2 text-sm">
342
- <input
343
- type="checkbox"
344
- checked={column.drop_if_null}
345
- onChange={onToggleDropIfNull}
346
- className="rounded border-gray-300 text-red-600 focus:ring-red-500"
347
- />
348
- <span className="flex items-center gap-1 text-gray-700">
349
- <Trash2 className="w-4 h-4 text-gray-400" />
350
- Drop if null
351
- </span>
352
- </label>
335
+ <div className="relative flex items-center gap-2">
336
+ <div className="absolute right-0 -top-8 flex items-center gap-2">
337
+ {column.required && (
338
+ <Badge variant="secondary" className="bg-blue-100 text-blue-800">
339
+ Required
340
+ </Badge>
341
+ )}
342
+ {column.is_computed && (
343
+ <Badge variant="secondary" className="bg-purple-100 text-purple-800">
344
+ <Calculator className="w-3 h-3 mr-1" />
345
+ Computed
346
+ </Badge>
347
+ )}
353
348
  </div>
354
- )}
349
+ {column.is_target ? (
350
+ <span className="inline-flex items-center px-3 py-1 rounded-full text-sm font-medium bg-purple-100 text-purple-800">
351
+ Target Column
352
+ </span>
353
+ ) : (
354
+ <div className="flex items-center gap-2">
355
+ <label className="flex items-center gap-2 text-sm">
356
+ <input
357
+ type="checkbox"
358
+ checked={column.drop_if_null}
359
+ onChange={onToggleDropIfNull}
360
+ className="rounded border-gray-300 text-red-600 focus:ring-red-500"
361
+ />
362
+ <span className="flex items-center gap-1 text-gray-700">
363
+ <Trash2 className="w-4 h-4 text-gray-400" />
364
+ Drop if null
365
+ </span>
366
+ </label>
367
+ </div>
368
+ )}
369
+ </div>
355
370
  </div>
356
371
  </div>
357
372
 
@@ -481,6 +496,52 @@ export function PreprocessingConfig({
481
496
  )}
482
497
  </div>
483
498
 
499
+ {/* Column Lineage Section */}
500
+ {column.lineage && column.lineage.length > 0 && (
501
+ <div className="bg-white rounded-lg border border-gray-200 p-6">
502
+ <h3 className="text-lg font-medium text-gray-900 mb-4 flex items-center gap-2">
503
+ <GitBranch className="w-5 h-5 text-gray-500" />
504
+ Column Lineage
505
+ </h3>
506
+ <div className="space-y-4">
507
+ {column.lineage.map((step, index) => (
508
+ <div key={index} className="flex items-start gap-3">
509
+ <div className={`w-8 h-8 rounded-full flex items-center justify-center flex-shrink-0 ${
510
+ step.key === 'raw_dataset'
511
+ ? 'bg-gray-100'
512
+ : step.key === 'computed_by_feature'
513
+ ? 'bg-purple-100'
514
+ : 'bg-blue-100'
515
+ }`}>
516
+ {step.key === 'raw_dataset' ? (
517
+ <Database className="w-4 h-4 text-gray-600" />
518
+ ) : step.key === 'computed_by_feature' ? (
519
+ <Calculator className="w-4 h-4 text-purple-600" />
520
+ ) : (
521
+ <Settings2 className="w-4 h-4 text-blue-600" />
522
+ )}
523
+ </div>
524
+ <div className="flex-1">
525
+ <div className="flex items-center justify-between">
526
+ <p className="text-sm font-medium text-gray-900">
527
+ {step.description}
528
+ </p>
529
+ {step.timestamp && (
530
+ <span className="text-xs text-gray-500">
531
+ {new Date(step.timestamp).toLocaleString()}
532
+ </span>
533
+ )}
534
+ </div>
535
+ {index < column.lineage.length - 1 && (
536
+ <div className="ml-4 mt-2 mb-2 w-0.5 h-4 bg-gray-200" />
537
+ )}
538
+ </div>
539
+ </div>
540
+ ))}
541
+ </div>
542
+ </div>
543
+ )}
544
+
484
545
  {/* Data Type Section */}
485
546
  <div className="bg-white rounded-lg border border-gray-200 p-6">
486
547
  <h3 className="text-lg font-medium text-gray-900 mb-4 flex items-center gap-2">
@@ -84,6 +84,9 @@ export interface Column {
84
84
  sample_values: {};
85
85
  statistics?: Statistics;
86
86
  preprocessing_steps?: PreprocessingSteps;
87
+ lineage?: Array<{ key: string }>;
88
+ required?: boolean;
89
+ is_computed?: boolean;
87
90
  }
88
91
 
89
92
  export interface Dataset {
@@ -5,7 +5,6 @@ module EasyML
5
5
  @queue = :easy_ml
6
6
 
7
7
  def self.perform(batch_id, options = {})
8
- puts "Performing compute feature job with options #{options}"
9
8
  begin
10
9
  options.symbolize_keys!
11
10
  feature_id = options.dig(:feature_id)
@@ -14,7 +13,6 @@ module EasyML
14
13
 
15
14
  # Check if any feature has failed before proceeding
16
15
  if dataset.features.any? { |f| f.workflow_status == "failed" }
17
- puts "Aborting feature computation due to previous feature failure"
18
16
  return
19
17
  end
20
18
 
@@ -40,7 +38,6 @@ module EasyML
40
38
  end
41
39
 
42
40
  def self.after_batch_hook(batch_id, *args)
43
- puts "After batch!"
44
41
  batch_args = fetch_batch_arguments(batch_id).flatten.map(&:symbolize_keys)
45
42
  feature_ids = batch_args.pluck(:feature_id).uniq
46
43
  parent_id = batch_args.pluck(:parent_batch_id).first
@@ -3,28 +3,22 @@ module EasyML
3
3
  def perform(id)
4
4
  begin
5
5
  dataset = EasyML::Dataset.find(id)
6
- return if dataset.workflow_status == :analyzing
7
6
 
8
7
  puts "Refreshing dataset #{dataset.name}"
9
- puts "Needs refresh? #{dataset.needs_refresh?}"
10
8
  unless dataset.needs_refresh?
11
9
  dataset.update(workflow_status: :ready)
12
10
  end
13
11
 
14
12
  create_event(dataset, "started")
15
13
 
16
- puts "Prepare! #{dataset.name}"
17
14
  dataset.unlock!
18
15
  dataset.prepare
19
16
  if dataset.features.needs_fit.any?
20
17
  dataset.fit_features(async: true)
21
- puts "Computing features!"
22
18
  else
23
19
  dataset.actually_refresh
24
- puts "Done!"
25
20
  end
26
21
  rescue StandardError => e
27
- puts "Error #{e.message}"
28
22
  if Rails.env.test?
29
23
  raise e
30
24
  end
@@ -0,0 +1,89 @@
1
+ module EasyML
2
+ class Column
3
+ class Imputers
4
+ class Base
5
+ class << self
6
+ def param_applies(p)
7
+ Imputers.supported_params << p
8
+ Imputers.params_by_class[self] ||= []
9
+ Imputers.params_by_class[self] << p.to_sym
10
+ end
11
+
12
+ def method_applies(m)
13
+ Imputers.supported_methods << m.to_sym
14
+ Imputers.methods_by_class[self] ||= []
15
+ Imputers.methods_by_class[self] << m.to_sym
16
+ end
17
+
18
+ def description
19
+ "Unknown preprocessing method"
20
+ end
21
+ end
22
+
23
+ attr_accessor :column, :preprocessing_step
24
+
25
+ def initialize(column, preprocessing_step)
26
+ @column = column
27
+ @preprocessing_step = preprocessing_step.with_indifferent_access
28
+ end
29
+
30
+ def applies?
31
+ method_applies? || param_applies?
32
+ end
33
+
34
+ def method_applies?
35
+ imputers_own_methods.include?(method.to_sym)
36
+ end
37
+
38
+ def param_applies?
39
+ params.keys.any? { |p| imputers_own_params.include?(p.to_sym) && params[p] != false }
40
+ end
41
+
42
+ def imputers_own_methods
43
+ Imputers.methods_by_class[self.class] || []
44
+ end
45
+
46
+ def imputers_own_params
47
+ Imputers.params_by_class[self.class] || []
48
+ end
49
+
50
+ def params
51
+ @preprocessing_step.dig(:params)
52
+ end
53
+
54
+ def method
55
+ @preprocessing_step.dig(:method)
56
+ end
57
+
58
+ def statistics(*args)
59
+ if column.is_computed
60
+ column.statistics.dig(:processed, *args)
61
+ else
62
+ column.statistics.dig(:clipped, *args) || column.statistics.dig(:raw, *args)
63
+ end
64
+ end
65
+
66
+ def anything?
67
+ true
68
+ end
69
+
70
+ def inspect
71
+ params_str = params ? params.map { |k, v| "#{k}: #{v}" }.join(", ") : "none"
72
+ method_str = method ? method : "none"
73
+
74
+ "#<#{self.class.name} method=#{method_str.inspect} params={#{params_str}}>"
75
+ end
76
+
77
+ alias_method :to_s, :inspect
78
+
79
+ def transform(df)
80
+ raise "Method not implemented"
81
+ end
82
+
83
+ def description
84
+ self.class.description
85
+ end
86
+ end
87
+ end
88
+ end
89
+ end
@@ -0,0 +1,35 @@
1
+ module EasyML
2
+ class Column
3
+ class Imputers
4
+ class Categorical < Base
5
+ method_applies :categorical
6
+ param_applies :categorical_min
7
+
8
+ def self.description
9
+ "Categorical imputation"
10
+ end
11
+
12
+ def transform(df)
13
+ return df unless allowed_categories.present?
14
+
15
+ case column.datatype
16
+ when :categorical
17
+ df = df.with_column(
18
+ Polars.when(Polars.col(column.name).is_in(allowed_categories))
19
+ .then(Polars.col(column.name))
20
+ .otherwise(Polars.lit("other"))
21
+ .alias(column.name)
22
+ )
23
+ when :boolean
24
+ # no-op
25
+ end
26
+ df
27
+ end
28
+
29
+ def allowed_categories
30
+ column.allowed_categories
31
+ end
32
+ end
33
+ end
34
+ end
35
+ end
@@ -0,0 +1,30 @@
1
+ module EasyML
2
+ class Column
3
+ class Imputers
4
+ class Clip < Base
5
+ attr_accessor :column, :dataset, :preprocessing_step
6
+
7
+ param_applies :clip
8
+
9
+ def self.description
10
+ "Clip"
11
+ end
12
+
13
+ def transform(df)
14
+ df = df.with_column(
15
+ Polars.col(column.name).clip(min, max).alias(column.name)
16
+ )
17
+ df
18
+ end
19
+
20
+ def min
21
+ params.dig(:clip, :min) || 0
22
+ end
23
+
24
+ def max
25
+ params.dig(:clip, :max) || Float::INFINITY
26
+ end
27
+ end
28
+ end
29
+ end
30
+ end
@@ -0,0 +1,27 @@
1
+ module EasyML
2
+ class Column
3
+ class Imputers
4
+ class Constant < Base
5
+ method_applies :constant
6
+ param_applies :constant
7
+
8
+ def self.description
9
+ "Constant value imputation"
10
+ end
11
+
12
+ def transform(df)
13
+ return df unless constant.present?
14
+
15
+ df = df.with_column(
16
+ Polars.col(column.name).fill_null(constant).alias(column.name)
17
+ )
18
+ df
19
+ end
20
+
21
+ def constant
22
+ params.dig(:constant)
23
+ end
24
+ end
25
+ end
26
+ end
27
+ end