lecrapaud 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (63)
  1. lecrapaud/__init__.py +1 -0
  2. lecrapaud/api.py +271 -0
  3. lecrapaud/config.py +25 -0
  4. lecrapaud/db/__init__.py +1 -0
  5. lecrapaud/db/alembic/README +1 -0
  6. lecrapaud/db/alembic/env.py +78 -0
  7. lecrapaud/db/alembic/script.py.mako +26 -0
  8. lecrapaud/db/alembic/versions/2025_04_06_1738-7390745388e4_initial_setup.py +295 -0
  9. lecrapaud/db/alembic/versions/2025_04_06_1755-40cd8d3e798e_unique_constraint_for_data.py +30 -0
  10. lecrapaud/db/alembic/versions/2025_05_23_1724-2360941fa0bd_longer_string.py +52 -0
  11. lecrapaud/db/alembic/versions/2025_05_27_1159-b96396dcfaff_add_env_to_trading_tables.py +34 -0
  12. lecrapaud/db/alembic/versions/2025_05_27_1337-40cbfc215f7c_fix_nb_character_on_portfolio.py +39 -0
  13. lecrapaud/db/alembic/versions/2025_05_27_1526-3de994115317_to_datetime.py +36 -0
  14. lecrapaud/db/alembic/versions/2025_05_27_2003-25c227c684f8_add_fees_to_transactions.py +30 -0
  15. lecrapaud/db/alembic/versions/2025_05_27_2047-6b6f2d38e9bc_double_instead_of_float.py +132 -0
  16. lecrapaud/db/alembic/versions/2025_05_31_1111-c175e4a36d68_generalise_stock_to_group.py +36 -0
  17. lecrapaud/db/alembic/versions/2025_05_31_1256-5681095bfc27_create_investment_run_and_portfolio_.py +62 -0
  18. lecrapaud/db/alembic/versions/2025_05_31_1806-339927587383_add_investment_run_id.py +107 -0
  19. lecrapaud/db/alembic/versions/2025_05_31_1834-52b809a34371_make_nullablee.py +38 -0
  20. lecrapaud/db/alembic/versions/2025_05_31_1849-3b8550297e8e_change_date_to_datetime.py +44 -0
  21. lecrapaud/db/alembic/versions/2025_05_31_1852-e6b8c95d8243_add_date_to_portfolio_history.py +30 -0
  22. lecrapaud/db/alembic/versions/2025_06_10_1136-db8cdd83563a_addnewsandoptiontodata.py +32 -0
  23. lecrapaud/db/alembic/versions/2025_06_17_1652-c45f5e49fa2c_make_fields_nullable.py +89 -0
  24. lecrapaud/db/models/__init__.py +11 -0
  25. lecrapaud/db/models/base.py +181 -0
  26. lecrapaud/db/models/dataset.py +129 -0
  27. lecrapaud/db/models/feature.py +45 -0
  28. lecrapaud/db/models/feature_selection.py +125 -0
  29. lecrapaud/db/models/feature_selection_rank.py +79 -0
  30. lecrapaud/db/models/model.py +40 -0
  31. lecrapaud/db/models/model_selection.py +63 -0
  32. lecrapaud/db/models/model_training.py +62 -0
  33. lecrapaud/db/models/score.py +65 -0
  34. lecrapaud/db/models/target.py +67 -0
  35. lecrapaud/db/session.py +45 -0
  36. lecrapaud/directory_management.py +28 -0
  37. lecrapaud/experiment.py +64 -0
  38. lecrapaud/feature_engineering.py +846 -0
  39. lecrapaud/feature_selection.py +1167 -0
  40. lecrapaud/integrations/openai_integration.py +225 -0
  41. lecrapaud/jobs/__init__.py +13 -0
  42. lecrapaud/jobs/config.py +17 -0
  43. lecrapaud/jobs/scheduler.py +36 -0
  44. lecrapaud/jobs/tasks.py +57 -0
  45. lecrapaud/model_selection.py +1671 -0
  46. lecrapaud/predictions.py +292 -0
  47. lecrapaud/preprocessing.py +984 -0
  48. lecrapaud/search_space.py +848 -0
  49. lecrapaud/services/__init__.py +0 -0
  50. lecrapaud/services/embedding_categorical.py +71 -0
  51. lecrapaud/services/indicators.py +309 -0
  52. lecrapaud/speed_tests/experiments.py +139 -0
  53. lecrapaud/speed_tests/test-gpu-bilstm.ipynb +261 -0
  54. lecrapaud/speed_tests/test-gpu-resnet.ipynb +166 -0
  55. lecrapaud/speed_tests/test-gpu-transformers.ipynb +254 -0
  56. lecrapaud/speed_tests/tests.ipynb +145 -0
  57. lecrapaud/speed_tests/trash.py +37 -0
  58. lecrapaud/training.py +239 -0
  59. lecrapaud/utils.py +246 -0
  60. lecrapaud-0.1.0.dist-info/LICENSE +201 -0
  61. lecrapaud-0.1.0.dist-info/METADATA +105 -0
  62. lecrapaud-0.1.0.dist-info/RECORD +63 -0
  63. lecrapaud-0.1.0.dist-info/WHEEL +4 -0
@@ -0,0 +1,846 @@
+ """
+ Feature engineering module for data preprocessing and transformation.
+
+ Process
+ -------
+ FEAT ENG
+ - use business_analysis > get_table_summary to see which fields are more than 90% null
+ - use remove_constant_columns to drop constant columns
+ - use summarize_dataframe to drop further useless columns (dates, ids, data not available at prediction time, misc not useful)
+ - cast to numeric whatever can be cast to numeric
+
+ - define columns_boolean
+ - define groupby_columns_list and target_column for target encoding
+ - create the target(s)
+ - define columns_pca
+ - define columns_one_hot, columns_binary, columns_ordinal, columns_frequency
+
+
+ Todo
+ ----
+ - DONE: drop meaningless identifier columns
+ - DONE: PCA on embedding of deck
+ - DONE: maybe cyclic encoding for date columns
+
+ - DONE: ordinal/label encode (only 1 column) for tree-based methods when the number of categories is not too big
+ - DONE: frequency encoding for some categorical columns
+ - DONE: one-hot encoding for categorical columns
+ - DONE: binary encoding if big number of categories
+
+ - DONE: create another embedding column for textual data?
+ - DONE: create some booleans like has_website, has_linkedin_company_url, etc.
+
+ - target/mean encoding with a groupby on a very interesting categorical column
+ - do "true" target encoding on the target variable, e.g. with leave-one-out encoding
+
+ - better categorize some fields like country? For sourcing we do position, ext_position, company, ext_company, country, source, but only country is relevant here
+
+
+ Development
+ -----------
+ - use PCA to determine how many variables explain the variance, to set max_feature for feature selection
+ - could be nice to get linkedin info of founders (need to search reps in rails first) - and score!
+ - add created_from, utm_source, referrer when we have more data
+ - could be nice to get team_count, or dealroom info, but as of the moment of submission...
+ """
+
+ import pandas as pd
+ import numpy as np
+ from itertools import product
+ import joblib
+
+ from sklearn.compose import ColumnTransformer
+ from sklearn.decomposition import PCA
+ from category_encoders import BinaryEncoder, CountEncoder
+ from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
+ from sklearn.model_selection import train_test_split
+
+ from lecrapaud.integrations.openai_integration import (
+     truncate_text,
+     get_openai_embeddings,
+ )
+ from lecrapaud.feature_selection import get_features_by_types
+ from lecrapaud.utils import logger
+ from lecrapaud.db import Target, Feature, Dataset
+ from lecrapaud.config import PYTHON_ENV
+
+
+ # main function
+ class FeatureEngineeringEngine:
+     """
+     Feature engineering pipeline
+
+     Params needed
+     -------------
+     data
+     columns_boolean
+     columns_date
+     columns_te_groupby
+     columns_te_target
+     for_training
+     """
+
+     def __init__(
+         self,
+         data: pd.DataFrame,
+         columns_drop: list[str] = [],
+         columns_boolean: list[str] = [],
+         columns_date: list[str] = [],
+         columns_te_groupby: list[list[str]] = [],
+         columns_te_target: list[str] = [],
+         for_training: bool = True,
+         **kwargs,
+     ):
+         self.data = data
+         self.columns_drop = columns_drop
+         self.columns_boolean = columns_boolean
+         self.columns_date = columns_date
+         self.columns_te_groupby = columns_te_groupby
+         self.columns_te_target = columns_te_target
+         self.for_training = for_training
+
+     def run(self) -> pd.DataFrame:
+         # drop columns
+         self.data = self.data.drop(columns=self.columns_drop)
+
+         # convert object columns to numeric if possible
+         self.data = convert_object_columns_that_are_numeric(self.data)
+
+         # handle boolean features
+         self.data = self.boolean_encode_columns()
+
+         # handle missing values
+         self.data = (
+             self.fillna_at_training()
+             if self.for_training
+             else self.fillna_at_inference()
+         )
+
+         # target encoding
+         self.data = self.generate_target_encodings()
+
+         # cyclic encode dates
+         self.data = self.cyclic_encode_date()
+
+         return self.data
+
+     def cyclic_encode_date(self) -> pd.DataFrame:
+         """
+         Adds cyclic (sine and cosine) encoding for common date parts: day of week,
+         day of month, week, month, and day of year.
+
+         Uses:
+             self.data (pd.DataFrame): Input dataframe
+             self.columns_date (list[str]): List of datetime columns to encode
+
+         Returns:
+             pd.DataFrame: Updated dataframe with new cyclic features
+         """
+
+         df: pd.DataFrame = self.data
+         columns: list[str] = self.columns_date
+
+         def cyclic_encode(series, max_value):
+             sin_values = np.sin(2 * np.pi * series / max_value)
+             cos_values = np.cos(2 * np.pi * series / max_value)
+             return sin_values, cos_values
+
+         for col in columns:
+
+             df[col] = pd.to_datetime(df[col]).dt.normalize()
+             df[f"{col}_year"] = df[col].dt.isocalendar().year
+             df[f"{col}_month"] = df[col].dt.month
+             df[f"{col}_day"] = df[col].dt.day
+             df[f"{col}_week"] = df[col].dt.isocalendar().week
+             df[f"{col}_weekday"] = df[col].dt.weekday
+             df[f"{col}_yearday"] = df[col].dt.dayofyear
+             df[col] = pd.to_datetime(df[col]).map(pd.Timestamp.toordinal)
+
+             df[f"{col}_month_sin"], df[f"{col}_month_cos"] = cyclic_encode(
+                 df[f"{col}_month"], 12
+             )
+             df[f"{col}_day_sin"], df[f"{col}_day_cos"] = cyclic_encode(
+                 df[f"{col}_day"], 31
+             )
+             df[f"{col}_week_sin"], df[f"{col}_week_cos"] = cyclic_encode(
+                 df[f"{col}_week"], 52
+             )
+             df[f"{col}_weekday_sin"], df[f"{col}_weekday_cos"] = cyclic_encode(
+                 df[f"{col}_weekday"], 7
+             )
+             df[f"{col}_yearday_sin"], df[f"{col}_yearday_cos"] = cyclic_encode(
+                 df[f"{col}_yearday"], 365
+             )
+
+             # Drop the original column? TODO: not sure if we should drop it for time series
+             # df.drop(col, axis=1, inplace=True)
+
+         return df
+
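+     # Worked example: December (month 12) and January (month 1) are 11 apart
+     # as integers, but on the unit circle sin/cos(2*pi*12/12) = (0.0, 1.0) and
+     # sin/cos(2*pi*1/12) ≈ (0.5, 0.87), so the two encoded months end up close
+     # together, as cyclic features should.
+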
+     def boolean_encode_columns(self) -> pd.DataFrame:
+         """
+         Applies boolean encoding to a list of columns:
+         - Leaves a column as-is if it is already int with only 0 and 1
+         - Otherwise: sets 1 if a value is present (notna), 0 if null/NaN/None
+
+         Uses:
+             self.data (pd.DataFrame): Input dataframe
+             self.columns_boolean (list): List of column names to encode
+
+         Returns:
+             pd.DataFrame: Updated dataframe with encoded columns
+         """
+
+         df: pd.DataFrame = self.data
+         columns: list[str] = self.columns_boolean
+
+         for column in columns:
+             col = df[column]
+             if pd.api.types.is_integer_dtype(col) and set(
+                 col.dropna().unique()
+             ).issubset({0, 1}):
+                 continue  # already valid binary
+             df[column] = col.notna().astype(int)
+         return df
+
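+     # Example: a column like ["http://a.com", None, "http://b.com"] becomes
+     # [1, 0, 1], while an existing 0/1 integer column is left untouched.
+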
+     def generate_target_encodings(self) -> pd.DataFrame:
+         """
+         Generate target encoding features (e.g., mean, median) for specified targets and group-by combinations.
+
+         Uses:
+             self.data (pd.DataFrame): Input dataframe
+             self.columns_te_groupby (list of lists): Grouping keys, e.g., [["SECTOR", "DATE"], ["SUBINDUSTRY", "DATE"]]
+             self.columns_te_target (list): Target columns to aggregate (e.g., ["RET", "VOLUME", "RSI_14"])
+
+         Returns:
+             pd.DataFrame: Original dataframe with new encoded columns added
+         """
+
+         df: pd.DataFrame = self.data
+         columns_te_groupby: list[list[str]] = self.columns_te_groupby
+         columns_te_target: list[str] = self.columns_te_target
+         statistics: list[str] = ["mean", "median"]
+
+         df = df.copy()
+         new_feature_cols = {}
+         for group_cols, stat, target_col in product(
+             columns_te_groupby, statistics, columns_te_target
+         ):
+             col_name = f"{target_col}_{'_'.join(group_cols)}_{stat.upper()}"
+             new_feature_cols[col_name] = df.groupby(group_cols)[target_col].transform(
+                 stat
+             )
+
+         # merge all at once to improve performance
+         df = pd.concat([df, pd.DataFrame(new_feature_cols)], axis=1)
+         return df
+
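+     # Example: with columns_te_groupby=[["SECTOR", "DATE"]] and
+     # columns_te_target=["RET"], every row gains RET_SECTOR_DATE_MEAN and
+     # RET_SECTOR_DATE_MEDIAN, the mean/median RET of that row's (SECTOR, DATE) group.
+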
+     def fillna_at_training(self) -> pd.DataFrame:
+         """
+         Fill missing values in a DataFrame:
+         - Numeric columns: fill with mean
+         - Categorical columns: fill with mode (or "unknown" when the mode is rare or missing)
+         Handles both NaN and None.
+
+         Returns:
+             pd.DataFrame: Cleaned DataFrame with missing values filled
+         """
+
+         df: pd.DataFrame = self.data.copy()
+
+         for col in df.columns:
+             missing_count = df[col].isnull().sum()
+             if missing_count > 0:
+                 if pd.api.types.is_numeric_dtype(df[col]):
+                     df[col] = df[col].fillna(df[col].mean())
+                     logger.info(
+                         f"Filled {missing_count} NaN values in numeric column '{col}' with mean."
+                     )
+                 else:
+                     mode = df[col].mode()
+                     if not mode.empty:
+                         mode_value = mode[0]
+                         mode_count = (df[col] == mode_value).sum()
+                         if mode_count > 100:
+                             fill_value = mode_value
+                         else:
+                             fill_value = "unknown"
+                     else:
+                         fill_value = "unknown"
+
+                     df[col] = df[col].fillna(fill_value)
+                     logger.info(
+                         f"Filled {missing_count} NaN values in categorical column '{col}' with '{fill_value}'."
+                     )
+
+         return df
+
+     def fillna_at_inference(self) -> pd.DataFrame:
+
+         df: pd.DataFrame = self.data
+
+         missing_cols = df.columns[df.isnull().any()].tolist()
+
+         if missing_cols:
+             numeric_cols = [
+                 col for col in missing_cols if pd.api.types.is_numeric_dtype(df[col])
+             ]
+             non_numeric_cols = [col for col in missing_cols if col not in numeric_cols]
+
+             logger.warning(
+                 "Missing values found in inference data. "
+                 f"Filling with 0 for numeric columns: {numeric_cols}, "
+                 f"and 'unknown' for non-numeric columns: {non_numeric_cols}"
+             )
+
+             df[numeric_cols] = df[numeric_cols].fillna(0)
+             df[non_numeric_cols] = df[non_numeric_cols].fillna("unknown")
+
+         return df
+
+
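+ # Minimal usage sketch (hypothetical column names):
+ #
+ #     engine = FeatureEngineeringEngine(
+ #         data=df,
+ #         columns_drop=["id"],
+ #         columns_boolean=["has_website"],
+ #         columns_date=["created_at"],
+ #         columns_te_groupby=[["SECTOR", "DATE"]],
+ #         columns_te_target=["RET"],
+ #         for_training=True,
+ #     )
+ #     df_engineered = engine.run()
+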
+ class PreprocessFeature:
+
+     def __init__(
+         self,
+         data: pd.DataFrame,
+         dataset,
+         time_series: bool = False,
+         date_column: str | None = None,
+         group_column: str | None = None,
+         val_size: float = 0.2,
+         test_size: float = 0.2,
+         columns_pca: list[str] = [],
+         columns_onehot: list[str] = [],
+         columns_binary: list[str] = [],
+         columns_ordinal: list[str] = [],
+         columns_frequency: list[str] = [],
+         target_numbers: list = [],
+         target_clf: list = [],
+         **kwargs,
+     ):
+         self.data = data
+         self.dataset = dataset
+         self.columns_pca = columns_pca
+         self.columns_onehot = columns_onehot
+         self.columns_binary = columns_binary
+         self.columns_ordinal = columns_ordinal
+         self.columns_frequency = columns_frequency
+         self.target_numbers = target_numbers
+         self.target_clf = target_clf
+
+         self.time_series = time_series
+         self.date_column = date_column
+         self.group_column = group_column
+         self.val_size = val_size
+         self.test_size = test_size
+
+         self.dataset_dir = self.dataset.path
+         self.dataset_id = self.dataset.id
+         self.data_dir = f"{self.dataset_dir}/data"
+         self.preprocessing_dir = f"{self.dataset_dir}/preprocessing"
+
+     def run(self):
+         # Split
+         train, val, test = (
+             self.train_val_test_split_time_series()
+             if self.time_series
+             else self.train_val_test_split(
+                 stratify_col=f"target_{self.target_numbers[0]}"
+             )
+         )  # TODO: only stratifying first target for now
+
+         # PCA: fit on train only, then reuse the fitted PCAs for val and test
+         train, pcas = self.add_pca_features(train)
+         val, _ = self.add_pca_features(val, pcas=pcas)
+         test, _ = self.add_pca_features(test, pcas=pcas)
+
+         if PYTHON_ENV != "Test":
+             joblib.dump(pcas, f"{self.preprocessing_dir}/pcas.pkl")
+
+         # Encoding
+         train, transformer = self.encode_categorical_features(train)
+         val, _ = self.encode_categorical_features(
+             val,
+             transformer=transformer,
+         )
+         test, _ = self.encode_categorical_features(
+             test,
+             transformer=transformer,
+         )
+
+         if PYTHON_ENV != "Test":
+             joblib.dump(self.data, f"{self.data_dir}/full.pkl")
+             joblib.dump(transformer, f"{self.preprocessing_dir}/column_transformer.pkl")
+             summary = summarize_dataframe(train)
+             summary.to_csv(f"{self.dataset_dir}/feature_summary.csv", index=False)
+
+         return train, val, test
+
+     def inference(self):
+         # PCA
+         pcas = joblib.load(f"{self.preprocessing_dir}/pcas.pkl")
+         data, _ = self.add_pca_features(self.data, pcas=pcas)
+
+         # Encoding
+         transformer = joblib.load(f"{self.preprocessing_dir}/column_transformer.pkl")
+         data, _ = self.encode_categorical_features(
+             data,
+             transformer=transformer,
+         )
+         return data
+
+     def train_val_test_split_time_series(self):
+         df: pd.DataFrame = self.data
+         date_column: str = self.date_column
+         group_column: str = self.group_column
+         val_size: float = self.val_size
+         test_size: float = self.test_size
+
+         if not date_column:
+             raise ValueError("Please specify a date_column for time series")
+
+         if group_column:
+             df.sort_values([date_column, group_column], inplace=True)
+         else:
+             df.sort_values(date_column, inplace=True)
+
+         dates = df[date_column].unique()
+
+         val_first_id = int(len(dates) * (1 - val_size - test_size)) + 1
+         test_first_id = int(len(dates) * (1 - test_size)) + 1
+
+         train = df[df[date_column].isin(dates[:val_first_id])]
+         val = df[df[date_column].isin(dates[val_first_id:test_first_id])]
+         test = df[df[date_column].isin(dates[test_first_id:])]
+
+         dates = {}
+         for name, data in zip(["train", "val", "test"], [train, val, test]):
+             dates[f"{name}_start_date"] = (
+                 data[date_column].map(pd.Timestamp.fromordinal).iat[0]
+             )
+             dates[f"{name}_end_date"] = (
+                 data[date_column].map(pd.Timestamp.fromordinal).iat[-1]
+             )
+
+             start = dates[f"{name}_start_date"].strftime("%d/%m/%Y")
+             end = dates[f"{name}_end_date"].strftime("%d/%m/%Y")
+             logger.info(f"{data.shape} {name} data from {start} to {end}")
+
+         Dataset.update(
+             match_fields=["id"],
+             id=self.dataset_id,
+             train_size=len(train),
+             val_size=len(val),
+             test_size=len(test),
+             **dates,
+         )
+         return (
+             train.reset_index(drop=True),
+             val.reset_index(drop=True),
+             test.reset_index(drop=True),
+         )
+
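+     # Worked example: with 100 unique dates and val_size=test_size=0.2,
+     # val_first_id = int(100 * 0.6) + 1 = 61 and test_first_id = int(100 * 0.8) + 1 = 81,
+     # so train covers dates[:61], val covers dates[61:81], and test covers dates[81:],
+     # i.e. roughly 60/20/20 in chronological order with no shuffling across time.
+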
+     def train_val_test_split(
+         self,
+         random_state: int = 42,
+         stratify_col: str | None = None,
+     ) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
+         """
+         Splits a DataFrame into train, validation, and test sets.
+
+         Uses:
+             self.data (pd.DataFrame): The full dataset
+             self.val_size (float): Proportion of validation set (default 0.2)
+             self.test_size (float): Proportion of test set (default 0.2)
+
+         Parameters:
+             random_state (int): Random seed for reproducibility
+             stratify_col (str | None): Optional column to stratify on (for classification tasks)
+
+         Returns:
+             Tuple of (train_df, val_df, test_df)
+         """
+         df: pd.DataFrame = self.data
+         val_size: float = self.val_size
+         test_size: float = self.test_size
+
+         stratify_vals = df[stratify_col] if stratify_col else None
+
+         # First split: train vs (val + test)
+         train, temp = train_test_split(
+             df,
+             test_size=val_size + test_size,
+             random_state=random_state,
+             stratify=stratify_vals,
+         )
+
+         # Adjust stratify target for val/test split
+         stratify_temp = temp[stratify_col] if stratify_col else None
+
+         # Compute val and test sizes relative to temp
+         val_ratio = val_size / (val_size + test_size)
+
+         val, test = train_test_split(
+             temp,
+             test_size=1 - val_ratio,
+             random_state=random_state,
+             stratify=stratify_temp,
+         )
+
+         for name, data in zip(["train", "val", "test"], [train, val, test]):
+             logger.info(f"{data.shape} {name} data")
+
+         return (
+             train.reset_index(drop=True),
+             val.reset_index(drop=True),
+             test.reset_index(drop=True),
+         )
+
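+     # Worked example: with val_size=test_size=0.2, the first split holds out
+     # 40% of rows as temp; val_ratio = 0.2 / 0.4 = 0.5, so the second split
+     # cuts temp in half, yielding a 60/20/20 split of the original data.
+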
+     # embedding and pca
+     def add_pca_features(
+         self, df: pd.DataFrame, n_components: int = 5, pcas=None
+     ) -> tuple[pd.DataFrame, dict]:
+         """
+         Adds PCA components as new columns for each column in self.columns_pca,
+         embedding text columns first when necessary.
+         NEEDS A TRAIN/TEST SPLIT BEFORE APPLYING - LIKE ENCODING CATEGORICAL VARIABLES
+
+         Parameters:
+             df (pd.DataFrame): Input DataFrame
+             n_components (int): Number of PCA components to keep
+             pcas (dict, optional): Fitted PCA per column; if provided, applies transform only
+
+         Returns:
+             tuple: (DataFrame with new PCA columns added, dict of fitted PCAs per column)
+         """
+         columns: list[str] = self.columns_pca
+
+         pcas_dict = {}
+         for column in columns:
+             # Convert text to embeddings if necessary
+             if not isinstance(df[column].iloc[0], (np.ndarray, list)):
+                 sentences = df[column].astype(str).tolist()
+                 logger.info(
+                     f"Total sentences to embed for column {column}: {len(sentences)}"
+                 )
+
+                 # Truncate each sentence
+                 truncate_sentences = [truncate_text(sentence) for sentence in sentences]
+
+                 # embedding
+                 embedding_matrix = get_openai_embeddings(truncate_sentences)
+             else:
+                 logger.info(f"Column {column} already contains embeddings")
+                 # Stack the vectors into a 2D array
+                 embedding_matrix = np.vstack(df[column].values)
+
+             # Apply PCA
+             if pcas:
+                 pca = pcas[column]
+                 pca_features = pca.transform(embedding_matrix)
+             else:
+                 pca = PCA(n_components=n_components)
+                 pca_features = pca.fit_transform(embedding_matrix)
+
+             # Add PCA columns
+             for i in range(n_components):
+                 df[f"{column}_pca_{i+1}"] = pca_features[:, i]
+
+             # Drop the original column
+             df.drop(column, axis=1, inplace=True)
+             pcas_dict.update({column: pca})
+
+         return df, pcas_dict
+
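+     # Example: a text column "DECK" (hypothetical) is embedded with OpenAI
+     # embeddings, reduced to columns DECK_pca_1 ... DECK_pca_5, and dropped;
+     # val/test/inference data must reuse the train-fitted objects via
+     # pcas={"DECK": pca} instead of refitting.
+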
+     # encoding categorical features
+     def encode_categorical_features(
+         self,
+         df: pd.DataFrame,
+         transformer: ColumnTransformer | None = None,
+     ) -> tuple[pd.DataFrame, ColumnTransformer]:
+         """
+         Encodes categorical columns using one-hot, binary, ordinal, and frequency encoding.
+
+         Parameters:
+             df (pd.DataFrame): Input DataFrame
+             transformer (ColumnTransformer, optional): if provided, applies transform only
+
+         Uses:
+             self.columns_onehot (list[str]): Creates one binary column per category, for low-cardinality categorical features
+             self.columns_binary (list[str]): Converts categories into binary and splits bits across columns, for mid-to-high cardinality (e.g., 10–100 unique values)
+             self.columns_ordinal (list[str]): Assigns integer ranks to categories, for when order matters (e.g., low < medium < high)
+             self.columns_frequency (list[str]): Replaces each category with its frequency count, normalized to a proportion, for high-cardinality features where frequency is meaningful
+
+         Returns:
+             tuple: (transformed DataFrame, ColumnTransformer)
+         """
+         columns_onehot: list[str] = self.columns_onehot
+         columns_binary: list[str] = self.columns_binary
+         columns_ordinal: list[str] = self.columns_ordinal
+         columns_frequency: list[str] = self.columns_frequency
+
+         X = df.loc[:, ~df.columns.str.contains("^target_")]
+         y = df.loc[:, df.columns.str.contains("^target_")]
+         save_in_db = False
+
+         all_columns = (
+             columns_onehot + columns_binary + columns_ordinal + columns_frequency
+         )
+
+         if transformer:
+             transformed = transformer.transform(X)
+             logger.debug(f"Transformed shape: {len(transformed)} x {len(transformed[0])}")
+         else:
+             transformer = ColumnTransformer(
+                 transformers=[
+                     (
+                         "onehot",
+                         OneHotEncoder(handle_unknown="ignore", sparse_output=False),
+                         columns_onehot,
+                     ),
+                     (
+                         "ordinal",
+                         OrdinalEncoder(
+                             handle_unknown="use_encoded_value", unknown_value=-1
+                         ),
+                         columns_ordinal,
+                     ),
+                     ("binary", BinaryEncoder(handle_unknown="value"), columns_binary),
+                     ("freq", CountEncoder(normalize=True), columns_frequency),
+                 ],
+                 remainder="passthrough",
+             )
+             transformed = transformer.fit_transform(X)
+             save_in_db = True
+
+         # Build output column names
+         column_names = []
+
+         if columns_onehot:
+             column_names.extend(
+                 transformer.named_transformers_["onehot"]
+                 .get_feature_names_out(columns_onehot)
+                 .tolist()
+             )
+
+         if columns_ordinal:
+             column_names.extend(columns_ordinal)
+
+         if columns_binary:
+             column_names.extend(
+                 transformer.named_transformers_["binary"]
+                 .get_feature_names_out(columns_binary)
+                 .tolist()
+             )
+
+         if columns_frequency:
+             column_names.extend(columns_frequency)
+
+         # Add passthrough (non-encoded) columns
+         passthrough_columns = [col for col in X.columns if col not in all_columns]
+         column_names.extend(passthrough_columns)
+
+         X_transformed = pd.DataFrame(transformed, columns=column_names, index=df.index)
+
+         # Try to convert columns to best possible dtypes
+         X_transformed = X_transformed.convert_dtypes()
+         X_transformed.columns = X_transformed.columns.str.upper()
+
+         # Insert features in db
+         if save_in_db:
+             # TODO: in bulk
+             categorical_features, numerical_features = get_features_by_types(
+                 X_transformed
+             )
+             for feature in categorical_features:
+                 Feature.upsert(match_fields=["name"], name=feature, type="categorical")
+             for feature in numerical_features:
+                 Feature.upsert(match_fields=["name"], name=feature, type="numerical")
+             for target in y.columns:
+                 target_number = int(target.split("_")[1])
+                 target_type = (
+                     "classification"
+                     if target_number in self.target_clf
+                     else "regression"
+                 )
+                 # TODO: what about description here?
+                 Target.upsert(match_fields=["name", "type"], name=target, type=target_type)
+
+         return pd.concat([X_transformed, y], axis=1), transformer
+
+
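+ # Illustrative encoder choices (hypothetical columns), mirroring the guidance
+ # in encode_categorical_features:
+ #
+ #     preprocess = PreprocessFeature(
+ #         data=df,
+ #         dataset=dataset,
+ #         columns_onehot=["REGION"],          # few categories: one 0/1 column each
+ #         columns_binary=["CITY"],            # ~10-100 categories: bit columns
+ #         columns_ordinal=["FUNDING_STAGE"],  # ordered categories: integer ranks
+ #         columns_frequency=["SOURCE"],       # frequency itself is informative
+ #         target_numbers=[1],
+ #         target_clf=[1],
+ #     )
+ #     train, val, test = preprocess.run()
+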
+ # analysis & utils
+ def summarize_dataframe(
+     df: pd.DataFrame, sample_categorical_threshold: int = 15
+ ) -> pd.DataFrame:
+     summary = []
+
+     def is_hashable_series(series: pd.Series) -> bool:
+         try:
+             _ = series.dropna().unique()
+             return True
+         except TypeError:
+             return False
+
+     df = convert_object_columns_that_are_numeric(df)
+     df = df.convert_dtypes()
+
+     for col in df.columns:
+         total_missing = df[col].isna().sum()
+         col_data = df[col].dropna()
+         dtype = col_data.dtype
+
+         if col_data.empty:
+             summary.append(
+                 {
+                     "Column": col,
+                     "Dtype": dtype,
+                     "Type": "unknown",
+                     "Detail": "No non-null values",
+                     "Missing": total_missing,
+                 }
+             )
+             continue
+
+         # Case 1: Numeric columns
+         if pd.api.types.is_numeric_dtype(col_data):
+             unique_vals = col_data.nunique()
+
+             if set(col_data.unique()).issubset({0, 1}):
+                 col_type = "binary-categorical"
+                 detail = "0/1 values only"
+             elif (
+                 pd.api.types.is_integer_dtype(col_data)
+                 and unique_vals <= sample_categorical_threshold
+             ):
+                 col_type = "multi-categorical"
+                 top_vals = col_data.value_counts().head(10)
+                 detail = ", ".join(f"{k} ({v})" for k, v in top_vals.items())
+             else:
+                 col_type = "numeric"
+                 q = col_data.quantile([0, 0.25, 0.5, 0.75, 1])
+                 detail = (
+                     f"Min: {q.iloc[0]:.2f}, Q1: {q.iloc[1]:.2f}, Median: {q.iloc[2]:.2f}, "
+                     f"Q3: {q.iloc[3]:.2f}, Max: {q.iloc[4]:.2f}"
+                 )
+
+         # Case 2: Object or other hashable columns
+         elif is_hashable_series(col_data):
+             unique_vals = col_data.nunique()
+             if unique_vals <= sample_categorical_threshold:
+                 col_type = "object-categorical"
+                 top_vals = col_data.value_counts().head(10)
+                 detail = ", ".join(f"{k} ({v})" for k, v in top_vals.items())
+             else:
+                 col_type = "high-cardinality-categorical"
+                 detail = f"{unique_vals} unique values"
+
+         # Case 3: Unusable columns
+         else:
+             col_type = "non-hashable"
+             detail = f"Non-hashable type: {type(col_data.iloc[0])}"
+
+         summary.append(
+             {
+                 "Column": col,
+                 "Dtype": dtype,
+                 "Type": col_type,
+                 "Detail": detail,
+                 "Missing": total_missing,
+             }
+         )
+
+     return pd.DataFrame(summary)
+
+
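+ # Example output row (illustrative): a numeric price column might come back as
+ # Column="PRICE", Dtype=Float64, Type="numeric",
+ # Detail="Min: 1.00, Q1: 10.00, Median: 25.00, Q3: 60.00, Max: 500.00", Missing=3.
+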
+ def convert_object_columns_that_are_numeric(df: pd.DataFrame) -> pd.DataFrame:
+     """
+     Detect object columns that can be safely converted to numeric (float or int)
+     and convert them.
+
+     Returns:
+         pd.DataFrame: The dataframe with mostly-numeric object columns cast to numeric.
+     """
+
+     numeric_candidates = []
+
+     for col in df.select_dtypes(include=["object"]).columns:
+         try:
+             converted = pd.to_numeric(df[col], errors="coerce")
+             if converted.notna().sum() / len(df) > 0.9:  # at least 90% convertible
+                 numeric_candidates.append(col)
+         except Exception:
+             continue
+
+     for col in numeric_candidates:
+         df[col] = pd.to_numeric(df[col], errors="coerce")
+
+     return df
+
+
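+ # Example: an object column ["1.5", "2", "3.1", "4"] is fully convertible and
+ # gets cast to float, while ["1.5", "2", "oops", "3.1"] is only 75% convertible
+ # (below the 90% threshold) and stays as object.
+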
+ def traditional_descriptive_analysis(df: pd.DataFrame, group_column: str | None = None):
+     with pd.option_context("display.max_rows", None):
+         results = {}
+
+         # Shape
+         results["Shape"] = f"{df.shape[0]} rows × {df.shape[1]} columns"
+
+         # Duplicated rows
+         results["Duplicated rows"] = int(df.duplicated().sum())
+
+         # Duplicated columns
+         duplicated_cols = df.T[df.T.duplicated()].index.tolist()
+         results["Duplicated columns"] = (
+             ", ".join(duplicated_cols) if len(duplicated_cols) > 0 else "None"
+         )
+
+         # Missing values
+         missing = df.isnull().sum()
+         missing = missing[missing > 0].sort_values(ascending=False)
+         if len(missing) > 0:
+             results["Missing values"] = missing.to_frame("Missing Count").to_markdown()
+         else:
+             results["Missing values"] = "No missing values"
+
+         # Infinite values
+         inf = df.replace([np.inf, -np.inf], np.nan)
+         inf_count = inf.isnull().sum() - df.isnull().sum()
+         inf_count = inf_count[inf_count > 0].sort_values(ascending=False)
+         if len(inf_count) > 0:
+             results["Infinite values"] = inf_count.to_frame("Inf Count").to_markdown()
+         else:
+             results["Infinite values"] = "No infinite values"
+
+         # Constant columns
+         constant_cols = [col for col in df.columns if df[col].nunique() == 1]
+         results["Constant columns"] = (
+             ", ".join(constant_cols) if len(constant_cols) > 0 else "None"
+         )
+
+         # Data types
+         dtypes = df.dtypes.astype(str).sort_index()
+         results["Data types"] = dtypes.to_frame("Type").to_markdown()
+
+         # Unique values in group_column
+         if group_column is not None:
+             if group_column in df.columns:
+                 results[f"Unique values in '{group_column}'"] = int(
+                     df[group_column].nunique()
+                 )
+             else:
+                 results[f"Unique values in '{group_column}'"] = (
+                     f"❌ Column '{group_column}' not found"
+                 )
+
+         # Log all results
+         for title, content in results.items():
+             print(f"\n### {title}\n{content}")
+
+
+ def print_missing_values(df: pd.DataFrame):
+
+     missing = df.isnull().sum()
+     missing = missing[missing != 0].sort_values(ascending=False)
+     if len(missing):
+         logger.info(f"Missing values:\n{missing.to_string()}")
+     else:
+         logger.info("No missing values found")