easy_ml 0.1.3 → 0.2.0.pre.rc1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (239) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +234 -26
  3. data/Rakefile +45 -0
  4. data/app/controllers/easy_ml/application_controller.rb +67 -0
  5. data/app/controllers/easy_ml/columns_controller.rb +38 -0
  6. data/app/controllers/easy_ml/datasets_controller.rb +156 -0
  7. data/app/controllers/easy_ml/datasources_controller.rb +88 -0
  8. data/app/controllers/easy_ml/deploys_controller.rb +20 -0
  9. data/app/controllers/easy_ml/models_controller.rb +151 -0
  10. data/app/controllers/easy_ml/retraining_runs_controller.rb +19 -0
  11. data/app/controllers/easy_ml/settings_controller.rb +59 -0
  12. data/app/frontend/components/AlertProvider.tsx +108 -0
  13. data/app/frontend/components/DatasetPreview.tsx +161 -0
  14. data/app/frontend/components/EmptyState.tsx +28 -0
  15. data/app/frontend/components/ModelCard.tsx +255 -0
  16. data/app/frontend/components/ModelDetails.tsx +334 -0
  17. data/app/frontend/components/ModelForm.tsx +384 -0
  18. data/app/frontend/components/Navigation.tsx +300 -0
  19. data/app/frontend/components/Pagination.tsx +72 -0
  20. data/app/frontend/components/Popover.tsx +55 -0
  21. data/app/frontend/components/PredictionStream.tsx +105 -0
  22. data/app/frontend/components/ScheduleModal.tsx +726 -0
  23. data/app/frontend/components/SearchInput.tsx +23 -0
  24. data/app/frontend/components/SearchableSelect.tsx +132 -0
  25. data/app/frontend/components/dataset/AutosaveIndicator.tsx +39 -0
  26. data/app/frontend/components/dataset/ColumnConfigModal.tsx +431 -0
  27. data/app/frontend/components/dataset/ColumnFilters.tsx +256 -0
  28. data/app/frontend/components/dataset/ColumnList.tsx +101 -0
  29. data/app/frontend/components/dataset/FeatureConfigPopover.tsx +57 -0
  30. data/app/frontend/components/dataset/FeaturePicker.tsx +205 -0
  31. data/app/frontend/components/dataset/PreprocessingConfig.tsx +704 -0
  32. data/app/frontend/components/dataset/SplitConfigurator.tsx +120 -0
  33. data/app/frontend/components/dataset/splitters/DateSplitter.tsx +58 -0
  34. data/app/frontend/components/dataset/splitters/KFoldSplitter.tsx +68 -0
  35. data/app/frontend/components/dataset/splitters/LeavePOutSplitter.tsx +29 -0
  36. data/app/frontend/components/dataset/splitters/PredefinedSplitter.tsx +146 -0
  37. data/app/frontend/components/dataset/splitters/RandomSplitter.tsx +85 -0
  38. data/app/frontend/components/dataset/splitters/StratifiedSplitter.tsx +79 -0
  39. data/app/frontend/components/dataset/splitters/constants.ts +77 -0
  40. data/app/frontend/components/dataset/splitters/types.ts +168 -0
  41. data/app/frontend/components/dataset/splitters/utils.ts +53 -0
  42. data/app/frontend/components/features/CodeEditor.tsx +46 -0
  43. data/app/frontend/components/features/DataPreview.tsx +150 -0
  44. data/app/frontend/components/features/FeatureCard.tsx +88 -0
  45. data/app/frontend/components/features/FeatureForm.tsx +235 -0
  46. data/app/frontend/components/features/FeatureGroupCard.tsx +54 -0
  47. data/app/frontend/components/settings/PluginSettings.tsx +81 -0
  48. data/app/frontend/components/ui/badge.tsx +44 -0
  49. data/app/frontend/components/ui/collapsible.tsx +9 -0
  50. data/app/frontend/components/ui/scroll-area.tsx +46 -0
  51. data/app/frontend/components/ui/separator.tsx +29 -0
  52. data/app/frontend/entrypoints/App.tsx +40 -0
  53. data/app/frontend/entrypoints/Application.tsx +24 -0
  54. data/app/frontend/hooks/useAutosave.ts +61 -0
  55. data/app/frontend/layouts/Layout.tsx +38 -0
  56. data/app/frontend/lib/utils.ts +6 -0
  57. data/app/frontend/mockData.ts +272 -0
  58. data/app/frontend/pages/DatasetDetailsPage.tsx +103 -0
  59. data/app/frontend/pages/DatasetsPage.tsx +261 -0
  60. data/app/frontend/pages/DatasourceFormPage.tsx +147 -0
  61. data/app/frontend/pages/DatasourcesPage.tsx +261 -0
  62. data/app/frontend/pages/EditModelPage.tsx +45 -0
  63. data/app/frontend/pages/EditTransformationPage.tsx +56 -0
  64. data/app/frontend/pages/ModelsPage.tsx +115 -0
  65. data/app/frontend/pages/NewDatasetPage.tsx +366 -0
  66. data/app/frontend/pages/NewModelPage.tsx +45 -0
  67. data/app/frontend/pages/NewTransformationPage.tsx +43 -0
  68. data/app/frontend/pages/SettingsPage.tsx +272 -0
  69. data/app/frontend/pages/ShowModelPage.tsx +30 -0
  70. data/app/frontend/pages/TransformationsPage.tsx +95 -0
  71. data/app/frontend/styles/application.css +100 -0
  72. data/app/frontend/types/dataset.ts +146 -0
  73. data/app/frontend/types/datasource.ts +33 -0
  74. data/app/frontend/types/preprocessing.ts +1 -0
  75. data/app/frontend/types.ts +113 -0
  76. data/app/helpers/easy_ml/application_helper.rb +10 -0
  77. data/app/jobs/easy_ml/application_job.rb +21 -0
  78. data/app/jobs/easy_ml/batch_job.rb +46 -0
  79. data/app/jobs/easy_ml/compute_feature_job.rb +19 -0
  80. data/app/jobs/easy_ml/deploy_job.rb +13 -0
  81. data/app/jobs/easy_ml/finalize_feature_job.rb +15 -0
  82. data/app/jobs/easy_ml/refresh_dataset_job.rb +32 -0
  83. data/app/jobs/easy_ml/schedule_retraining_job.rb +11 -0
  84. data/app/jobs/easy_ml/sync_datasource_job.rb +17 -0
  85. data/app/jobs/easy_ml/training_job.rb +62 -0
  86. data/app/models/easy_ml/adapters/base_adapter.rb +45 -0
  87. data/app/models/easy_ml/adapters/polars_adapter.rb +77 -0
  88. data/app/models/easy_ml/cleaner.rb +82 -0
  89. data/app/models/easy_ml/column.rb +124 -0
  90. data/app/models/easy_ml/column_history.rb +30 -0
  91. data/app/models/easy_ml/column_list.rb +122 -0
  92. data/app/models/easy_ml/concerns/configurable.rb +61 -0
  93. data/app/models/easy_ml/concerns/versionable.rb +19 -0
  94. data/app/models/easy_ml/dataset.rb +767 -0
  95. data/app/models/easy_ml/dataset_history.rb +56 -0
  96. data/app/models/easy_ml/datasource.rb +182 -0
  97. data/app/models/easy_ml/datasource_history.rb +24 -0
  98. data/app/models/easy_ml/datasources/base_datasource.rb +54 -0
  99. data/app/models/easy_ml/datasources/file_datasource.rb +58 -0
  100. data/app/models/easy_ml/datasources/polars_datasource.rb +89 -0
  101. data/app/models/easy_ml/datasources/s3_datasource.rb +97 -0
  102. data/app/models/easy_ml/deploy.rb +114 -0
  103. data/app/models/easy_ml/event.rb +79 -0
  104. data/app/models/easy_ml/feature.rb +437 -0
  105. data/app/models/easy_ml/feature_history.rb +38 -0
  106. data/app/models/easy_ml/model.rb +575 -41
  107. data/app/models/easy_ml/model_file.rb +133 -0
  108. data/app/models/easy_ml/model_file_history.rb +24 -0
  109. data/app/models/easy_ml/model_history.rb +51 -0
  110. data/app/models/easy_ml/models/base_model.rb +58 -0
  111. data/app/models/easy_ml/models/hyperparameters/base.rb +99 -0
  112. data/app/models/easy_ml/models/hyperparameters/xgboost/dart.rb +82 -0
  113. data/app/models/easy_ml/models/hyperparameters/xgboost/gblinear.rb +82 -0
  114. data/app/models/easy_ml/models/hyperparameters/xgboost/gbtree.rb +97 -0
  115. data/app/models/easy_ml/models/hyperparameters/xgboost.rb +71 -0
  116. data/app/models/easy_ml/models/xgboost/evals_callback.rb +138 -0
  117. data/app/models/easy_ml/models/xgboost/progress_callback.rb +39 -0
  118. data/app/models/easy_ml/models/xgboost.rb +544 -4
  119. data/app/models/easy_ml/prediction.rb +44 -0
  120. data/app/models/easy_ml/retraining_job.rb +278 -0
  121. data/app/models/easy_ml/retraining_run.rb +184 -0
  122. data/app/models/easy_ml/settings.rb +37 -0
  123. data/app/models/easy_ml/splitter.rb +90 -0
  124. data/app/models/easy_ml/splitters/base_splitter.rb +28 -0
  125. data/app/models/easy_ml/splitters/date_splitter.rb +91 -0
  126. data/app/models/easy_ml/splitters/predefined_splitter.rb +74 -0
  127. data/app/models/easy_ml/splitters/random_splitter.rb +82 -0
  128. data/app/models/easy_ml/tuner_job.rb +56 -0
  129. data/app/models/easy_ml/tuner_run.rb +31 -0
  130. data/app/models/splitter_history.rb +6 -0
  131. data/app/serializers/easy_ml/column_serializer.rb +27 -0
  132. data/app/serializers/easy_ml/dataset_serializer.rb +73 -0
  133. data/app/serializers/easy_ml/datasource_serializer.rb +64 -0
  134. data/app/serializers/easy_ml/feature_serializer.rb +27 -0
  135. data/app/serializers/easy_ml/model_serializer.rb +90 -0
  136. data/app/serializers/easy_ml/retraining_job_serializer.rb +22 -0
  137. data/app/serializers/easy_ml/retraining_run_serializer.rb +39 -0
  138. data/app/serializers/easy_ml/settings_serializer.rb +9 -0
  139. data/app/views/layouts/easy_ml/application.html.erb +15 -0
  140. data/config/initializers/resque.rb +3 -0
  141. data/config/resque-pool.yml +6 -0
  142. data/config/routes.rb +39 -0
  143. data/config/spring.rb +1 -0
  144. data/config/vite.json +15 -0
  145. data/lib/easy_ml/configuration.rb +64 -0
  146. data/lib/easy_ml/core/evaluators/base_evaluator.rb +53 -0
  147. data/lib/easy_ml/core/evaluators/classification_evaluators.rb +126 -0
  148. data/lib/easy_ml/core/evaluators/regression_evaluators.rb +66 -0
  149. data/lib/easy_ml/core/model_evaluator.rb +161 -89
  150. data/lib/easy_ml/core/tuner/adapters/base_adapter.rb +28 -18
  151. data/lib/easy_ml/core/tuner/adapters/xgboost_adapter.rb +4 -25
  152. data/lib/easy_ml/core/tuner.rb +123 -62
  153. data/lib/easy_ml/core.rb +0 -3
  154. data/lib/easy_ml/core_ext/hash.rb +24 -0
  155. data/lib/easy_ml/core_ext/pathname.rb +11 -5
  156. data/lib/easy_ml/data/date_converter.rb +90 -0
  157. data/lib/easy_ml/data/filter_extensions.rb +31 -0
  158. data/lib/easy_ml/data/polars_column.rb +126 -0
  159. data/lib/easy_ml/data/polars_reader.rb +297 -0
  160. data/lib/easy_ml/data/preprocessor.rb +280 -142
  161. data/lib/easy_ml/data/simple_imputer.rb +255 -0
  162. data/lib/easy_ml/data/splits/file_split.rb +252 -0
  163. data/lib/easy_ml/data/splits/in_memory_split.rb +54 -0
  164. data/lib/easy_ml/data/splits/split.rb +95 -0
  165. data/lib/easy_ml/data/splits.rb +9 -0
  166. data/lib/easy_ml/data/statistics_learner.rb +93 -0
  167. data/lib/easy_ml/data/synced_directory.rb +341 -0
  168. data/lib/easy_ml/data.rb +6 -2
  169. data/lib/easy_ml/engine.rb +105 -6
  170. data/lib/easy_ml/feature_store.rb +227 -0
  171. data/lib/easy_ml/features.rb +61 -0
  172. data/lib/easy_ml/initializers/inflections.rb +17 -3
  173. data/lib/easy_ml/logging.rb +2 -2
  174. data/lib/easy_ml/predict.rb +74 -0
  175. data/lib/easy_ml/railtie/generators/migration/migration_generator.rb +192 -36
  176. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_column_histories.rb.tt +9 -0
  177. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_columns.rb.tt +25 -0
  178. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_dataset_histories.rb.tt +9 -0
  179. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_datasets.rb.tt +31 -0
  180. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_datasource_histories.rb.tt +9 -0
  181. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_datasources.rb.tt +16 -0
  182. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_deploys.rb.tt +24 -0
  183. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_events.rb.tt +20 -0
  184. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_feature_histories.rb.tt +14 -0
  185. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_features.rb.tt +32 -0
  186. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_model_file_histories.rb.tt +9 -0
  187. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_model_files.rb.tt +17 -0
  188. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_model_histories.rb.tt +9 -0
  189. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_models.rb.tt +20 -9
  190. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_predictions.rb.tt +17 -0
  191. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_retraining_jobs.rb.tt +77 -0
  192. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_settings.rb.tt +9 -0
  193. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_splitter_histories.rb.tt +9 -0
  194. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_splitters.rb.tt +15 -0
  195. data/lib/easy_ml/railtie/templates/migration/create_easy_ml_tuner_jobs.rb.tt +40 -0
  196. data/lib/easy_ml/support/est.rb +5 -1
  197. data/lib/easy_ml/support/file_rotate.rb +79 -15
  198. data/lib/easy_ml/support/file_support.rb +9 -0
  199. data/lib/easy_ml/support/local_file.rb +24 -0
  200. data/lib/easy_ml/support/lockable.rb +62 -0
  201. data/lib/easy_ml/support/synced_file.rb +103 -0
  202. data/lib/easy_ml/support/utc.rb +5 -1
  203. data/lib/easy_ml/support.rb +6 -3
  204. data/lib/easy_ml/version.rb +4 -1
  205. data/lib/easy_ml.rb +7 -2
  206. metadata +355 -72
  207. data/app/models/easy_ml/models.rb +0 -5
  208. data/lib/easy_ml/core/model.rb +0 -30
  209. data/lib/easy_ml/core/model_core.rb +0 -181
  210. data/lib/easy_ml/core/models/hyperparameters/base.rb +0 -34
  211. data/lib/easy_ml/core/models/hyperparameters/xgboost.rb +0 -19
  212. data/lib/easy_ml/core/models/xgboost.rb +0 -10
  213. data/lib/easy_ml/core/models/xgboost_core.rb +0 -220
  214. data/lib/easy_ml/core/models.rb +0 -10
  215. data/lib/easy_ml/core/uploaders/model_uploader.rb +0 -24
  216. data/lib/easy_ml/core/uploaders.rb +0 -7
  217. data/lib/easy_ml/data/dataloader.rb +0 -6
  218. data/lib/easy_ml/data/dataset/data/preprocessor/statistics.json +0 -31
  219. data/lib/easy_ml/data/dataset/data/sample_info.json +0 -1
  220. data/lib/easy_ml/data/dataset/dataset/files/sample_info.json +0 -1
  221. data/lib/easy_ml/data/dataset/splits/file_split.rb +0 -140
  222. data/lib/easy_ml/data/dataset/splits/in_memory_split.rb +0 -49
  223. data/lib/easy_ml/data/dataset/splits/split.rb +0 -98
  224. data/lib/easy_ml/data/dataset/splits.rb +0 -11
  225. data/lib/easy_ml/data/dataset/splitters/date_splitter.rb +0 -43
  226. data/lib/easy_ml/data/dataset/splitters.rb +0 -9
  227. data/lib/easy_ml/data/dataset.rb +0 -430
  228. data/lib/easy_ml/data/datasource/datasource_factory.rb +0 -60
  229. data/lib/easy_ml/data/datasource/file_datasource.rb +0 -40
  230. data/lib/easy_ml/data/datasource/merged_datasource.rb +0 -64
  231. data/lib/easy_ml/data/datasource/polars_datasource.rb +0 -41
  232. data/lib/easy_ml/data/datasource/s3_datasource.rb +0 -89
  233. data/lib/easy_ml/data/datasource.rb +0 -33
  234. data/lib/easy_ml/data/preprocessor/preprocessor.rb +0 -205
  235. data/lib/easy_ml/data/preprocessor/simple_imputer.rb +0 -402
  236. data/lib/easy_ml/deployment.rb +0 -5
  237. data/lib/easy_ml/support/synced_directory.rb +0 -134
  238. data/lib/easy_ml/transforms.rb +0 -29
  239. /data/{lib/easy_ml/core → app/models/easy_ml}/models/hyperparameters.rb +0 -0
@@ -0,0 +1,704 @@
1
+ import React, { useState, useEffect } from 'react';
2
+ import { Settings2, Wrench, ArrowRight, Pencil, Trash2, Database } from 'lucide-react';
3
+ import type { Dataset, Column, ColumnType, PreprocessingConstants, PreprocessingSteps, PreprocessingStep } from '../../types/dataset';
4
+ import { Badge } from "@/components/ui/badge";
5
+
6
/**
 * Props accepted by the `PreprocessingConfig` component.
 */
interface PreprocessingConfigProps {
  /** Column whose statistics are displayed and whose preprocessing is edited. */
  column: Column;
  /** Dataset owning `column`; a modified copy is handed to `setDataset` on metadata edits. */
  dataset: Dataset;
  /** Callback receiving a column name and a column type. */
  setColumnType: (columnName: string, columnType: string) => void;
  /** Replaces the dataset (used for description and drop-if-null edits). */
  setDataset: (dataset: Dataset) => void;
  /** Preprocessing constants; `column_types` feeds the column-type selector. */
  constants: PreprocessingConstants;
  /**
   * Invoked after every strategy edit.
   * `inference` is `undefined` when a training edit happens while distinct
   * inference preprocessing is switched off.
   */
  onUpdate: (
    training: PreprocessingStep,
    inference: PreprocessingStep | undefined,
    useDistinctInference: boolean
  ) => void;
}
18
+
19
/** Tells whether `type` is one of the numeric column types (`'float'` or `'integer'`). */
const isNumericType = (type: ColumnType): boolean => {
  switch (type) {
    case 'float':
    case 'integer':
      return true;
    default:
      return false;
  }
};
21
+
22
/**
 * Builds a fully-populated preprocessing step from a possibly missing or
 * partial one.
 *
 * Defaults: method `'none'` (also used when the method is falsy),
 * `categorical_min` 100, `one_hot` true, `ordinal_encoding` false.
 * `constant` and `clip` are copied through as-is (possibly `undefined`).
 */
const createPreprocessingStep = (steps?: PreprocessingStep): PreprocessingStep => {
  const given = steps?.params;

  return {
    method: steps?.method || 'none',
    params: {
      constant: given?.constant,
      categorical_min: given?.categorical_min ?? 100,
      one_hot: given?.one_hot ?? true,
      ordinal_encoding: given?.ordinal_encoding ?? false,
      clip: given?.clip,
    },
  };
};
32
+
33
+ export function PreprocessingConfig({
34
+ column,
35
+ dataset,
36
+ setColumnType,
37
+ setDataset,
38
+ constants,
39
+ onUpdate
40
+ }: PreprocessingConfigProps) {
41
// Distinct inference preprocessing starts enabled only when the column already
// carries an inference method other than 'none'.
const [useDistinctInference, setUseDistinctInference] = useState(() => {
  const inferenceMethod = column.preprocessing_steps?.inference?.method;
  return Boolean(inferenceMethod) && inferenceMethod !== 'none';
});

// The column's datatype, viewed as a ColumnType for the type-specific branches below.
const selectedType = column.datatype as ColumnType;

// Local working copies of the two strategies, seeded (once) from the column.
const [training, setTraining] = useState<PreprocessingStep>(() => {
  return createPreprocessingStep(column.preprocessing_steps?.training);
});

const [inference, setInference] = useState<PreprocessingStep>(() => {
  return createPreprocessingStep(column.preprocessing_steps?.inference);
});
55
+
56
// Re-derive every piece of column-dependent local state when a different
// column is selected. This includes the "distinct inference" flag: without
// resetting it, the checkbox would keep showing the previous column's value.
useEffect(() => {
  const inferenceMethod = column.preprocessing_steps?.inference?.method;
  setUseDistinctInference(Boolean(inferenceMethod) && inferenceMethod !== 'none');
  setTraining(createPreprocessingStep(column.preprocessing_steps?.training));
  setInference(createPreprocessingStep(column.preprocessing_steps?.inference));
  // Keyed on the column id only, so in-progress edits to the same column are
  // not overwritten when its preprocessing_steps object changes.
}, [column.id]);
61
+
62
/**
 * Switches the training or inference strategy to `method`, resetting its
 * params to that method's defaults, and reports the change via `onUpdate`.
 *
 * Defaults:
 *  - categorical column, method 'categorical': categorical_min 100 + one_hot
 *  - categorical column, any other method except 'none': one_hot
 *  - target column (any type): ordinal_encoding
 */
const handleStrategyChange = (
  type: 'training' | 'inference',
  method: PreprocessingStep['method']
) => {
  const defaultParams: PreprocessingStep['params'] = {};

  if (selectedType === 'categorical' && method !== 'none') {
    if (method === 'categorical') {
      defaultParams.categorical_min = 100;
    }
    defaultParams.one_hot = true;
  }

  if (column.is_target) {
    defaultParams.ordinal_encoding = true;
  }

  const newStrategy: PreprocessingStep = { method, params: defaultParams };

  if (type === 'training') {
    setTraining(newStrategy);
    // Inference is only reported while distinct inference preprocessing is on.
    onUpdate(newStrategy, useDistinctInference ? inference : undefined, useDistinctInference);
  } else {
    setInference(newStrategy);
    onUpdate(training, newStrategy, useDistinctInference);
  }
};
103
+
104
/**
 * Merges `updates` into the params of the training or inference strategy and
 * reports the result via `onUpdate`.
 *
 * All existing params are carried over (not just the categorical ones), so a
 * previously entered `constant` or `clip` survives toggling a categorical
 * option.
 */
const handleCategoricalParamChange = (
  type: 'training' | 'inference',
  updates: Partial<PreprocessingStep['params']>
) => {
  const strategy = type === 'training' ? training : inference;
  const setStrategy = type === 'training' ? setTraining : setInference;

  const newStrategy: PreprocessingStep = {
    ...strategy,
    params: {
      ...strategy.params,
      ...updates
    }
  };

  setStrategy(newStrategy);
  if (type === 'training') {
    onUpdate(newStrategy, useDistinctInference ? inference : undefined, useDistinctInference);
  } else {
    onUpdate(training, newStrategy, useDistinctInference);
  }
};
129
+
130
/**
 * Merges `clipUpdates` into the `clip` param of the training or inference
 * strategy, stores the new strategy and reports it via `onUpdate`.
 */
const handleClipChange = (
  type: 'training' | 'inference',
  clipUpdates: Partial<{ min?: number; max?: number }>
) => {
  const isTraining = type === 'training';
  const current = isTraining ? training : inference;

  const clip = { ...current.params.clip, ...clipUpdates };
  const next: PreprocessingStep = {
    ...current,
    params: { ...current.params, clip },
  };

  if (isTraining) {
    setTraining(next);
    onUpdate(next, useDistinctInference ? inference : undefined, useDistinctInference);
    return;
  }

  setInference(next);
  onUpdate(training, next, useDistinctInference);
};
156
+
157
/**
 * Stores `value` (kept as the raw input string) as the `constant` param of the
 * training or inference strategy and reports the change via `onUpdate`.
 */
const handleConstantValueChange = (
  type: 'training' | 'inference',
  value: string
) => {
  const isTraining = type === 'training';
  const base = isTraining ? training : inference;

  const updated: PreprocessingStep = {
    ...base,
    params: { ...base.params, constant: value },
  };

  if (isTraining) {
    setTraining(updated);
    onUpdate(updated, useDistinctInference ? inference : undefined, useDistinctInference);
    return;
  }

  setInference(updated);
  onUpdate(training, updated, useDistinctInference);
};
179
+
180
/**
 * Renders the "Constant Value" field for the given strategy, or nothing unless
 * that strategy's method is 'constant'. Numeric columns get a number input,
 * every other column type a text input.
 */
const renderConstantValueInput = (type: 'training' | 'inference') => {
  const strategy = type === 'training' ? training : inference;
  if (strategy.method !== 'constant') return null;

  const numeric = isNumericType(selectedType);

  return (
    <div className="mt-4">
      <label className="block text-sm font-medium text-gray-700 mb-1">
        Constant Value
      </label>
      <input
        type={numeric ? 'number' : 'text'}
        value={strategy.params?.constant ?? ''}
        onChange={(e) => handleConstantValueChange(type, e.target.value)}
        className="w-full rounded-md border-gray-300 shadow-sm focus:border-blue-500 focus:ring-blue-500"
        placeholder={numeric ? 'Enter a number...' : 'Enter a value...'}
      />
    </div>
  );
};
209
+
210
// Whether the description is currently shown as an editable textarea.
const [isEditingDescription, setIsEditingDescription] = useState(false);

/**
 * Checkbox handler: writes the checked state into `drop_if_null` of the column
 * matching `column.name` and pushes a copy of the dataset to `setDataset`.
 */
const onToggleDropIfNull = (e: React.ChangeEvent<HTMLInputElement>) => {
  const columns = dataset.columns.map((candidate) => {
    const drop_if_null =
      candidate.name === column.name ? e.target.checked : candidate.drop_if_null;
    return { ...candidate, drop_if_null };
  });

  setDataset({ ...dataset, columns });
};
223
+
224
/**
 * Textarea handler: writes the typed text into `description` of the column
 * matching `column.name` and pushes a copy of the dataset to `setDataset`.
 * Runs on every change event, so edits are applied as the user types.
 */
const handleDescriptionChange = (e: React.ChangeEvent<HTMLTextAreaElement>) => {
  const columns = dataset.columns.map((candidate) => {
    const description =
      candidate.name === column.name ? e.target.value : candidate.description;
    return { ...candidate, description };
  });

  setDataset({ ...dataset, columns });
};
235
+
236
// Description as it was when editing started. Description edits are pushed to
// the dataset on every keystroke, so cancelling needs a snapshot to go back to.
const descriptionBeforeEdit = React.useRef(column.description);

/** Leaves edit mode; the text has already been applied while typing. */
const handleDescriptionSave = () => {
  setIsEditingDescription(false);
};

/**
 * Keyboard handling for the description textarea:
 *  - Enter  keeps the typed text and leaves edit mode (no newline is inserted)
 *  - Escape restores the description captured when editing began, then leaves
 *    edit mode, matching the "Escape to cancel" hint shown under the field
 */
const handleDescriptionKeyDown = (e: React.KeyboardEvent) => {
  if (e.key === 'Enter') {
    e.preventDefault();
    setIsEditingDescription(false);
  } else if (e.key === 'Escape') {
    const original = descriptionBeforeEdit.current;
    setDataset({
      ...dataset,
      columns: dataset.columns.map(c => ({
        ...c,
        description: c.name === column.name ? original : c.description
      }))
    });
    setIsEditingDescription(false);
  }
};

/** Enters edit mode, remembering the current description for a later cancel. */
const handleDescriptionClick = () => {
  descriptionBeforeEdit.current = column.description;
  setIsEditingDescription(true);
};
252
+
253
// Null statistics for the column. Every level is optional-chained so a column
// with missing `statistics`, `raw` or `processed` yields zeros instead of throwing.

// Processed null count when it is non-zero, otherwise the raw one.
// NOTE(review): this value also feeds the percentage shown in the "Raw Data
// Statistics" panel; confirm whether the raw count alone was intended there.
const nullCount =
  (column.statistics?.processed?.null_count || column.statistics?.raw?.null_count) || 0;

const totalRows = column.statistics?.raw?.num_rows ?? 0;

const nullPercentage = nullCount && totalRows
  ? ((nullCount / totalRows) * 100)
  : 0;

// Use the processed row count as the denominator, since that is the "Total
// Rows" figure displayed next to this percentage; fall back to the raw count
// when no processed row count is available.
const nullPercentageProcessed =
  column.statistics?.processed?.null_count &&
  (column.statistics.processed.num_rows ?? totalRows)
    ? ((column.statistics.processed.null_count /
        (column.statistics.processed.num_rows ?? totalRows)) * 100)
    : 0;
263
+
264
/**
 * Renders a highlighted note with the raw statistic that the selected strategy
 * relies on (most frequent value, last value, median or mean).
 *
 * Returns null when the method has no associated statistic or the statistic is
 * absent. Presence is tested with `!= null`, so legitimate values such as a
 * mean of 0 are still displayed.
 */
const renderStrategySpecificInfo = (type: 'training' | 'inference') => {
  const strategy = type === 'training' ? training : inference;
  const raw = column.statistics?.raw;
  let content: string;

  if (strategy.method === 'most_frequent' && raw?.most_frequent_value != null) {
    content = `Most Frequent Value: ${raw.most_frequent_value}`;
  } else if (strategy.method === 'ffill' && raw?.last_value != null) {
    content = `Last Value: ${raw.last_value}`;
  } else if (strategy.method === 'median' && raw?.median != null) {
    content = `Median: ${raw.median}`;
  } else if (strategy.method === 'mean' && raw?.mean != null) {
    content = `Mean: ${raw.mean}`;
  } else {
    return null;
  }

  return (
    <div className="mt-4 bg-yellow-50 rounded-lg p-4">
      <span className="text-sm font-medium text-yellow-700">
        {content}
      </span>
    </div>
  );
};
286
+
287
+ return (
288
+ <div className="space-y-8">
289
+ {/* Column Header Section */}
290
+ <div className="bg-white rounded-lg border border-gray-200 p-6">
291
+ <div className="flex items-center justify-between mb-4">
292
+ <div className="flex-1 max-w-[70%]">
293
+ <h2 className="text-2xl font-semibold text-gray-900">{column.name}</h2>
294
+ <div className="mt-1 flex items-start gap-1">
295
+ {isEditingDescription ? (
296
+ <div className="flex-1">
297
+ <textarea
298
+ value={column.description || ''}
299
+ onChange={handleDescriptionChange}
300
+ onBlur={handleDescriptionSave}
301
+ onKeyDown={handleDescriptionKeyDown}
302
+ className="w-full px-2 py-1 text-sm text-gray-900 border border-blue-500 rounded-md focus:outline-none focus:ring-2 focus:ring-blue-500"
303
+ rows={2}
304
+ autoFocus
305
+ placeholder="Enter column description..."
306
+ />
307
+ <p className="mt-1 text-xs text-gray-500">
308
+ Press Enter to save, Escape to cancel
309
+ </p>
310
+ </div>
311
+ ) : (
312
+ <div className="flex-3/4 flex items-start gap-1">
313
+ <p
314
+ className="text-sm text-gray-500 cursor-pointer flex-grow truncate"
315
+ onClick={handleDescriptionClick}
316
+ >
317
+ {column.description || 'No description provided'}
318
+ </p>
319
+ <button
320
+ onClick={handleDescriptionClick}
321
+ className="p-1 text-gray-400 hover:text-gray-600 rounded-md hover:bg-gray-100 flex-shrink-0"
322
+ >
323
+ <Pencil className="w-4 h-4" />
324
+ </button>
325
+ </div>
326
+ )}
327
+ </div>
328
+ </div>
329
+ <div className="flex items-center gap-4 flex-shrink-0">
330
+ {column.is_target ? (
331
+ <span className="inline-flex items-center px-3 py-1 rounded-full text-sm font-medium bg-purple-100 text-purple-800">
332
+ Target Column
333
+ </span>
334
+ ) : (
335
+ <div className="flex items-center gap-2">
336
+ <label className="flex items-center gap-2 text-sm">
337
+ <input
338
+ type="checkbox"
339
+ checked={column.drop_if_null}
340
+ onChange={onToggleDropIfNull}
341
+ className="rounded border-gray-300 text-red-600 focus:ring-red-500"
342
+ />
343
+ <span className="flex items-center gap-1 text-gray-700">
344
+ <Trash2 className="w-4 h-4 text-gray-400" />
345
+ Drop if null
346
+ </span>
347
+ </label>
348
+ </div>
349
+ )}
350
+ </div>
351
+ </div>
352
+
353
+ {/* Null Value Statistics */}
354
+ <div className="mt-6 grid grid-cols-2 gap-6">
355
+ <div className="bg-gray-50 rounded-lg p-4">
356
+ <div className="flex items-center gap-2 mb-3">
357
+ <Database className="w-4 h-4 text-gray-500" />
358
+ <h3 className="text-sm font-medium text-gray-900">Raw Data Statistics</h3>
359
+ </div>
360
+ <div className="space-y-2">
361
+ <div className="flex justify-between text-sm">
362
+ <span className="text-gray-600">Null Values:</span>
363
+ <span className="font-medium text-gray-900">{column.statistics?.raw?.null_count.toLocaleString()}</span>
364
+ </div>
365
+ <div className="flex justify-between text-sm">
366
+ <span className="text-gray-600">Total Rows:</span>
367
+ <span className="font-medium text-gray-900">{totalRows.toLocaleString()}</span>
368
+ </div>
369
+ <div className="flex justify-between text-sm">
370
+ <span className="text-gray-600">Null Percentage:</span>
371
+ <span className="font-medium text-gray-900">{nullPercentage.toFixed(2)}%</span>
372
+ </div>
373
+ <div className="mt-2">
374
+ <div className="w-full h-2 bg-gray-200 rounded-full overflow-hidden">
375
+ <div
376
+ className="h-full bg-blue-600 rounded-full"
377
+ style={{ width: `${nullPercentage}%` }}
378
+ />
379
+ </div>
380
+ </div>
381
+ </div>
382
+ </div>
383
+
384
+ <div className="bg-gray-50 rounded-lg p-4">
385
+ <div className="flex items-center gap-2 mb-3">
386
+ <Wrench className="w-4 h-4 text-gray-500" />
387
+ <h3 className="text-sm font-medium text-gray-900">Processed Data Statistics</h3>
388
+ </div>
389
+ {dataset?.preprocessing_steps?.training ? (
390
+ <div className="space-y-2">
391
+ <div className="flex justify-between text-sm">
392
+ <span className="text-gray-600">Null Values:</span>
393
+ <span className="font-medium text-gray-900">{column.statistics?.processed?.null_count?.toLocaleString()}</span>
394
+ </div>
395
+ <div className="flex justify-between text-sm">
396
+ <span className="text-gray-600">Total Rows:</span>
397
+ <span className="font-medium text-gray-900">{column.statistics?.processed?.num_rows?.toLocaleString()}</span>
398
+ </div>
399
+ <div className="flex justify-between text-sm">
400
+ <span className="text-gray-600">Null Percentage:</span>
401
+ <span className="font-medium text-gray-900">{nullPercentageProcessed.toFixed(2)}%</span>
402
+ </div>
403
+ <div className="mt-2">
404
+ <div className="w-full h-2 bg-gray-200 rounded-full overflow-hidden">
405
+ <div
406
+ className="h-full bg-blue-600 rounded-full"
407
+ style={{ width: `${nullPercentageProcessed}%` }}
408
+ />
409
+ </div>
410
+ </div>
411
+ </div>
412
+ ) : (
413
+ <div className="text-sm text-gray-500 text-center py-2">
414
+ No preprocessing configured
415
+ </div>
416
+ )}
417
+ </div>
418
+ </div>
419
+
420
+ <div className="grid grid-cols-3 gap-4 mt-6">
421
+ <div className="bg-gray-50 rounded-lg p-4">
422
+ <span className="text-sm text-gray-500">Type</span>
423
+ <p className="text-lg font-medium text-gray-900 mt-1">{column.datatype}</p>
424
+ </div>
425
+ <div className="bg-gray-50 rounded-lg p-4">
426
+ <span className="text-sm text-gray-500">Unique Values</span>
427
+ <p className="text-lg font-medium text-gray-900 mt-1">
428
+ {column.statistics?.processed?.unique_count?.toLocaleString() ?? 'N/A'}
429
+ </p>
430
+ </div>
431
+ <div className="bg-gray-50 rounded-lg p-4">
432
+ <span className="text-sm text-gray-500">Null Values</span>
433
+ <p className="text-lg font-medium text-gray-900 mt-1">
434
+ {column.statistics?.processed?.null_count?.toLocaleString() ?? '0'}
435
+ </p>
436
+ </div>
437
+ </div>
438
+
439
+ {column.statistics?.processed.null_count ? (
440
+ <div className="mt-6">
441
+ <div className="flex items-center justify-between mb-2">
442
+ <span className="text-sm font-medium text-gray-700">Null Distribution</span>
443
+ <span className="text-sm text-gray-500">
444
+ {nullPercentage}% of values are null
445
+ </span>
446
+ </div>
447
+ <div className="relative h-2 bg-gray-100 rounded-full overflow-hidden">
448
+ <div
449
+ className="absolute top-0 left-0 h-full bg-yellow-400 rounded-full"
450
+ style={{ width: `${nullPercentage}%` }}
451
+ />
452
+ </div>
453
+ </div>
454
+ ) : (
455
+ <div className="mt-6 bg-green-50 rounded-lg p-4">
456
+ <div className="flex items-center gap-2">
457
+ <div className="w-2 h-2 bg-green-400 rounded-full" />
458
+ <span className="text-sm text-green-700">This column has no null values</span>
459
+ </div>
460
+ </div>
461
+ )}
462
+
463
+ {column.statistics?.raw?.sample_data && (
464
+ <div className="mt-6">
465
+ <h4 className="text-sm font-medium text-gray-700 mb-2">Sample Values</h4>
466
+ <div className="bg-gray-50 rounded-lg p-4">
467
+ <div className="flex flex-wrap gap-2">
468
+ {column.statistics?.raw?.sample_data && column.statistics.raw.sample_data.map((value, index) => (
469
+ <span key={index} className="px-2 py-1 bg-gray-100 rounded text-sm text-gray-700">
470
+ {String(value)}
471
+ </span>
472
+ ))}
473
+ </div>
474
+ </div>
475
+ </div>
476
+ )}
477
+ </div>
478
+
479
+ {/* Data Type Section */}
480
+ <div className="bg-white rounded-lg border border-gray-200 p-6">
481
+ <h3 className="text-lg font-medium text-gray-900 mb-4 flex items-center gap-2">
482
+ <Settings2 className="w-5 h-5 text-gray-500" />
483
+ Data Type
484
+ </h3>
485
+
486
+ <div className="space-y-4">
487
+ <div>
488
+ <label className="block text-sm font-medium text-gray-700 mb-1">
489
+ Column Type
490
+ </label>
491
+ <select
492
+ value={selectedType}
493
+ disabled
494
+ className="w-full rounded-md border-gray-300 bg-gray-50 shadow-sm text-gray-700 cursor-not-allowed"
495
+ >
496
+ {constants.column_types.map(type => (
497
+ <option key={type.value} value={type.value}>
498
+ {type.label}
499
+ </option>
500
+ ))}
501
+ </select>
502
+ <p className="mt-1 text-sm text-gray-500">
503
+ Column type cannot be changed after creation
504
+ </p>
505
+ </div>
506
+
507
+ <div className="bg-gray-50 rounded-md p-4">
508
+ <h4 className="text-sm font-medium text-gray-900 mb-2">Sample Data</h4>
509
+ <div className="space-y-2">
510
+ {Array.isArray(column.sample_values) ? column.sample_values.slice(0, 3).map((value: any, index: number) => (
511
+ <span key={index} className="m-1 flex-items items-center">
512
+ <Badge>
513
+ {String(value)}
514
+ </Badge>
515
+ </span>
516
+ )) : []}
517
+ </div>
518
+ </div>
519
+ </div>
520
+ </div>
521
+
522
+ {/* Preprocessing Strategy Section */}
523
+ <div className="bg-white rounded-lg border border-gray-200 p-6">
524
+ <h3 className="text-lg font-medium text-gray-900 mb-4 flex items-center gap-2">
525
+ <Wrench className="w-5 h-5 text-gray-500" />
526
+ Preprocessing Strategy
527
+ </h3>
528
+
529
+ <div className="space-y-6">
530
+ <div>
531
+ <div className="flex items-center justify-between mb-4">
532
+ <label className="block text-sm font-medium text-gray-700">
533
+ Training Strategy
534
+ </label>
535
+ <div className="flex items-center gap-2">
536
+ <input
537
+ type="checkbox"
538
+ id="useDistinctInference"
539
+ checked={useDistinctInference}
540
+ onChange={(e) => {
541
+ setUseDistinctInference(e.target.checked);
542
+ onUpdate(
543
+ training,
544
+ e.target.checked ? inference : undefined,
545
+ e.target.checked
546
+ );
547
+ }}
548
+ className="rounded border-gray-300 text-blue-600 focus:ring-blue-500"
549
+ />
550
+ <label htmlFor="useDistinctInference" className="text-sm text-gray-700">
551
+ Use different strategy for inference
552
+ </label>
553
+ </div>
554
+ </div>
555
+
556
+ <div className={useDistinctInference ? "grid grid-cols-2 gap-6" : ""}>
557
+ <div>
558
+ <select
559
+ value={training.method}
560
+ onChange={(e) => handleStrategyChange('training', e.target.value as PreprocessingStep['method'])}
561
+ className="w-full rounded-md border-gray-300 shadow-sm focus:border-blue-500 focus:ring-blue-500"
562
+ >
563
+ <option value="none">No preprocessing</option>
564
+ {constants.preprocessing_strategies[selectedType]?.map((strategy: { value: string; label: string; }) => (
565
+ <option key={strategy.value} value={strategy.value}>
566
+ {strategy.label}
567
+ </option>
568
+ ))}
569
+ </select>
570
+
571
+ {renderStrategySpecificInfo('training')}
572
+ {renderConstantValueInput('training')}
573
+ {(column.datatype === 'categorical' && training.method === 'categorical') && (
574
+ <div className="mt-4 space-y-4 bg-gray-50 rounded-lg p-4">
575
+ <div>
576
+ <label className="block text-sm font-medium text-gray-700 mb-1">
577
+ Minimum Category Instances
578
+ </label>
579
+ <input
580
+ type="number"
581
+ min="1"
582
+ value={training.params.categorical_min}
583
+ onChange={(e) => handleCategoricalParamChange('training', {
584
+ categorical_min: parseInt(e.target.value)
585
+ })}
586
+ className="w-full rounded-md border-gray-300 shadow-sm focus:border-blue-500 focus:ring-blue-500"
587
+ />
588
+ <p className="mt-1 text-sm text-gray-500">
589
+ Categories with fewer instances will be grouped as "OTHER"
590
+ </p>
591
+ </div>
592
+ </div>
593
+ )}
594
+ {(column.datatype === 'categorical' && training.method !== 'none') && (
595
+ <div className="mt-4 space-y-4 bg-gray-50 rounded-lg p-4">
596
+ <h4 className="text-sm font-medium text-gray-900 mb-2">Encoding</h4>
597
+ <div className="flex items-center gap-2">
598
+ <input
599
+ type="radio"
600
+ id="oneHotEncode"
601
+ name="encoding"
602
+ checked={training.params.one_hot}
603
+ onChange={() => handleCategoricalParamChange('training', {
604
+ one_hot: true,
605
+ ordinal_encoding: false
606
+ })}
607
+ className="rounded border-gray-300 text-blue-600 focus:ring-blue-500"
608
+ />
609
+ <label htmlFor="oneHotEncode" className="text-sm text-gray-700">
610
+ One-hot encode categories
611
+ </label>
612
+ </div>
613
+ <div className="flex items-center gap-2">
614
+ <input
615
+ type="radio"
616
+ id="ordinalEncode"
617
+ name="encoding"
618
+ checked={training.params.ordinal_encoding}
619
+ onChange={() => handleCategoricalParamChange('training', {
620
+ one_hot: false,
621
+ ordinal_encoding: true
622
+ })}
623
+ className="rounded border-gray-300 text-blue-600 focus:ring-blue-500"
624
+ />
625
+ <label htmlFor="ordinalEncode" className="text-sm text-gray-700">
626
+ Ordinal encode categories
627
+ </label>
628
+ </div>
629
+ </div>
630
+ )}
631
+ </div>
632
+
633
+ {useDistinctInference && (
634
+ <div>
635
+ <div className="flex items-center gap-2 mb-2">
636
+ <ArrowRight className="w-4 h-4 text-gray-400" />
637
+ <span className="text-sm font-medium text-gray-700">
638
+ Inference Strategy
639
+ </span>
640
+ </div>
641
+ <select
642
+ value={inference.method}
643
+ onChange={(e) => handleStrategyChange('inference', e.target.value as PreprocessingStep['method'])}
644
+ className="w-full rounded-md border-gray-300 shadow-sm focus:border-blue-500 focus:ring-blue-500"
645
+ >
646
+ <option value="none">No preprocessing</option>
647
+ {constants.preprocessing_strategies[selectedType]?.map((strategy: { value: string; label: string; }) => (
648
+ <option key={strategy.value} value={strategy.value}>
649
+ {strategy.label}
650
+ </option>
651
+ ))}
652
+ </select>
653
+
654
+ {renderConstantValueInput('inference')}
655
+ </div>
656
+ )}
657
+ </div>
658
+ </div>
659
+
660
+ {isNumericType(selectedType) && training.method !== 'none' && (
661
+ <div className="border-t pt-4">
662
+ <h4 className="text-sm font-medium text-gray-900 mb-2">Clip Values</h4>
663
+ <div className="grid grid-cols-2 gap-4">
664
+ <div>
665
+ <label className="block text-sm font-medium text-gray-700 mb-1">
666
+ Min Value
667
+ </label>
668
+ <input
669
+ type="number"
670
+ value={training.params?.clip?.min ?? ''}
671
+ onChange={(e) => {
672
+ handleClipChange('training', {
673
+ min: e.target.value ? Number(e.target.value) : undefined
674
+ });
675
+ }}
676
+ className="w-full rounded-md border-gray-300 shadow-sm focus:border-blue-500 focus:ring-blue-500"
677
+ placeholder="No minimum"
678
+ />
679
+ </div>
680
+ <div>
681
+ <label className="block text-sm font-medium text-gray-700 mb-1">
682
+ Max Value
683
+ </label>
684
+ <input
685
+ type="number"
686
+ value={training.params?.clip?.max ?? ''}
687
+ onChange={(e) => {
688
+ handleClipChange('training', {
689
+ max: e.target.value ? Number(e.target.value) : undefined
690
+ });
691
+ }}
692
+ className="w-full rounded-md border-gray-300 shadow-sm focus:border-blue-500 focus:ring-blue-500"
693
+ placeholder="No maximum"
694
+ />
695
+ </div>
696
+ </div>
697
+ </div>
698
+ )}
699
+ </div>
700
+ </div>
701
+
702
+ </div>
703
+ );
704
+ }