upgini 1.2.123__py3-none-any.whl → 1.2.125__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- upgini/__about__.py +1 -1
- upgini/autofe/binary.py +4 -3
- upgini/dataset.py +3 -1
- upgini/features_enricher.py +107 -72
- upgini/metadata.py +4 -1
- upgini/normalizer/normalize_utils.py +2 -2
- upgini/search_task.py +12 -1
- upgini/utils/datetime_utils.py +103 -36
- upgini/utils/deduplicate_utils.py +2 -2
- {upgini-1.2.123.dist-info → upgini-1.2.125.dist-info}/METADATA +2 -1
- {upgini-1.2.123.dist-info → upgini-1.2.125.dist-info}/RECORD +13 -13
- {upgini-1.2.123.dist-info → upgini-1.2.125.dist-info}/WHEEL +0 -0
- {upgini-1.2.123.dist-info → upgini-1.2.125.dist-info}/licenses/LICENSE +0 -0
upgini/__about__.py
CHANGED
@@ -1 +1 @@
-__version__ = "1.2.123"
+__version__ = "1.2.125"
upgini/autofe/binary.py
CHANGED
@@ -1,5 +1,6 @@
 import abc
 from typing import Optional
+
 import Levenshtein
 import numpy as np
 import pandas as pd
@@ -201,7 +202,7 @@ class JaroWinklerSim1(StringSim):
     has_symmetry_importance: bool = True

     def _prepare_value(self, value: Optional[str]) -> Optional[str]:
-        return value
+        return value if value is not None and len(value) > 0 else None

     def _similarity(self, left: str, right: str) -> float:
         return jarowinkler_similarity(left, right)
@@ -216,7 +217,7 @@ class JaroWinklerSim2(StringSim):
     has_symmetry_importance: bool = True

     def _prepare_value(self, value: Optional[str]) -> Optional[str]:
-        return value[::-1] if value is not None else None
+        return value[::-1] if value is not None and len(value) > 0 else None

     def _similarity(self, left: str, right: str) -> float:
         return jarowinkler_similarity(left, right)
@@ -231,7 +232,7 @@ class LevenshteinSim(StringSim):
     has_symmetry_importance: bool = True

     def _prepare_value(self, value: Optional[str]) -> Optional[str]:
-        return value
+        return value if value is not None and len(value) > 0 else None

     def _similarity(self, left: str, right: str) -> float:
         return 1 - Levenshtein.distance(left, right) / max(len(left), len(right))
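Reviewer note: the `_prepare_value` changes above treat empty strings like missing values. A minimal sketch of the failure this avoids, assuming (as the `StringSim` operators appear to) that pairs with a `None`-prepared side are skipped before `_similarity` is called; the standalone helpers below are illustrative, not upgini's code:

import Levenshtein

def similarity(left: str, right: str) -> float:
    # mirrors LevenshteinSim._similarity; divides by max length
    return 1 - Levenshtein.distance(left, right) / max(len(left), len(right))

def prepare_value(value):
    # new behavior: "" is treated like None
    return value if value is not None and len(value) > 0 else None

for left, right in [("kitten", "sitting"), ("", "abc"), ("", "")]:
    left, right = prepare_value(left), prepare_value(right)
    if left is None or right is None:
        print("skipped")  # ("", "") no longer reaches the zero division
    else:
        print(similarity(left, right))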
upgini/dataset.py
CHANGED
@@ -151,7 +151,9 @@ class Dataset:
     def etalon_def_checked(self) -> Dict[str, str]:
         if self.etalon_def is None:
             self.etalon_def = {
-                v.value: k
+                v.value: k
+                for k, v in self.meaning_types_checked.items()
+                if v not in [FileColumnMeaningType.FEATURE, FileColumnMeaningType.DATE_FEATURE]
             }

         return self.etalon_def
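For context, `etalon_def` maps meaning-type values to column names, and the added filter keeps feature columns (including the new `DATE_FEATURE` type) out of it. A hypothetical illustration with invented column names and a trimmed-down enum:

from enum import Enum

class FileColumnMeaningType(Enum):
    DATE = "DATE"
    TARGET = "TARGET"
    FEATURE = "FEATURE"
    DATE_FEATURE = "DATE_FEATURE"  # new in 1.2.125

meaning_types = {
    "order_date": FileColumnMeaningType.DATE,
    "label": FileColumnMeaningType.TARGET,
    "price": FileColumnMeaningType.FEATURE,
    "signup_date": FileColumnMeaningType.DATE_FEATURE,
}

etalon_def = {
    v.value: k
    for k, v in meaning_types.items()
    if v not in [FileColumnMeaningType.FEATURE, FileColumnMeaningType.DATE_FEATURE]
}
print(etalon_def)  # {'DATE': 'order_date', 'TARGET': 'label'}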
upgini/features_enricher.py
CHANGED
@@ -76,7 +76,7 @@ from upgini.utils.custom_loss_utils import (
 )
 from upgini.utils.cv_utils import CVConfig, get_groups
 from upgini.utils.datetime_utils import (
-    DateTimeSearchKeyConverter,
+    DateTimeConverter,
     is_blocked_time_series,
     is_dates_distribution_valid,
     is_time_series,
@@ -220,7 +220,9 @@ class FeaturesEnricher(TransformerMixin):
        cv: CVType | None = None,
        loss: str | None = None,
        autodetect_search_keys: bool = True,
+       # deprecated, use text_features instead
        generate_features: list[str] | None = None,
+       text_features: list[str] | None = None,
        columns_for_online_api: list[str] | None = None,
        round_embeddings: int | None = None,
        logs_enabled: bool = True,
@@ -284,8 +286,7 @@ class FeaturesEnricher(TransformerMixin):
         self.country_code = country_code
         self.__validate_search_keys(search_keys, search_id)

-
-        self.model_task_type = ModelTaskType.parse(model_task_type)
+        self.model_task_type = ModelTaskType.parse(model_task_type)
         self.endpoint = endpoint
         self._search_task: SearchTask | None = None
         self.features_info: pd.DataFrame = self.EMPTY_FEATURES_INFO
@@ -306,10 +307,8 @@ class FeaturesEnricher(TransformerMixin):
             search_task = SearchTask(search_id, rest_client=self.rest_client, logger=self.logger)

             print(self.bundle.get("search_by_task_id_start"))
-            trace_id =
-
-            print(f"https://app.datadoghq.eu/logs?query=%40trace_id%3A{trace_id}")
-            with MDC(trace_id=trace_id):
+            trace_id = time.time_ns()
+            with MDC(correlation_id=trace_id):
                 try:
                     self.logger.debug(f"FeaturesEnricher created from existing search: {search_id}")
                     self._search_task = search_task.poll_result(trace_id, quiet=True, check_fit=True)
@@ -343,14 +342,14 @@ class FeaturesEnricher(TransformerMixin):
         self.shared_datasets = shared_datasets
         if shared_datasets is not None:
             self.runtime_parameters.properties["shared_datasets"] = ",".join(shared_datasets)
-        self.generate_features = generate_features
+        self.generate_features = text_features or generate_features
         self.round_embeddings = round_embeddings
-        if generate_features is not None:
-            if len(generate_features) > self.GENERATE_FEATURES_LIMIT:
+        if self.generate_features is not None:
+            if len(self.generate_features) > self.GENERATE_FEATURES_LIMIT:
                 msg = self.bundle.get("too_many_generate_features").format(self.GENERATE_FEATURES_LIMIT)
                 self.logger.error(msg)
                 raise ValidationError(msg)
-            self.runtime_parameters.properties["generate_features"] = ",".join(generate_features)
+            self.runtime_parameters.properties["generate_features"] = ",".join(self.generate_features)
         if round_embeddings is not None:
             if not isinstance(round_embeddings, int) or round_embeddings < 0:
                 msg = self.bundle.get("invalid_round_embeddings")
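Usage note: `text_features` is the new spelling and takes precedence when both are passed, since the constructor coalesces them (`self.generate_features = text_features or generate_features`). An illustrative constructor call; the search key and column names are invented and this is not a full working setup:

from upgini import FeaturesEnricher, SearchKey

enricher = FeaturesEnricher(
    search_keys={"signup_date": SearchKey.DATE},
    text_features=["job_title", "company_description"],  # preferred since 1.2.125
    # generate_features=["job_title", ...],              # deprecated alias, still accepted
)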
@@ -485,9 +484,9 @@ class FeaturesEnricher(TransformerMixin):
         stability_agg_func: str, optional (default="max")
             Function to aggregate stability values. Can be "max", "min", "mean".
         """
-        trace_id =
+        trace_id = time.time_ns()
         if self.print_trace_id:
-            print(f"https://app.datadoghq.eu/logs?query=%40trace_id%3A{trace_id}")
+            print(f"https://app.datadoghq.eu/logs?query=%40correlation_id%3A{trace_id}")
         start_time = time.time()
         auto_fe_parameters = AutoFEParameters() if auto_fe_parameters is None else auto_fe_parameters
         search_progress = SearchProgress(0.0, ProgressStage.START_FIT)
@@ -499,7 +498,7 @@ class FeaturesEnricher(TransformerMixin):
             progress_bar.progress = search_progress.to_progress_bar()
             progress_bar.display()

-        with MDC(trace_id=trace_id):
+        with MDC(correlation_id=trace_id):
             if len(args) > 0:
                 msg = f"WARNING: Unsupported positional arguments for fit: {args}"
                 self.logger.warning(msg)
@@ -644,11 +643,11 @@ class FeaturesEnricher(TransformerMixin):

         self.warning_counter.reset()
         auto_fe_parameters = AutoFEParameters() if auto_fe_parameters is None else auto_fe_parameters
-        trace_id =
+        trace_id = time.time_ns()
         if self.print_trace_id:
-            print(f"https://app.datadoghq.eu/logs?query=%40trace_id%3A{trace_id}")
+            print(f"https://app.datadoghq.eu/logs?query=%40correlation_id%3A{trace_id}")
         start_time = time.time()
-        with MDC(trace_id=trace_id):
+        with MDC(correlation_id=trace_id):
             if len(args) > 0:
                 msg = f"WARNING: Unsupported positional arguments for fit_transform: {args}"
                 self.logger.warning(msg)
@@ -746,8 +745,8 @@ class FeaturesEnricher(TransformerMixin):
     def transform(
         self,
         X: pd.DataFrame,
-        *args,
         y: pd.Series | None = None,
+        *args,
         exclude_features_sources: list[str] | None = None,
         keep_input: bool = True,
         trace_id: str | None = None,
@@ -788,9 +787,11 @@ class FeaturesEnricher(TransformerMixin):
             progress_bar.progress = search_progress.to_progress_bar()
             if new_progress:
                 progress_bar.display()
-        trace_id = trace_id or
+        trace_id = trace_id or time.time_ns()
+        if self.print_trace_id:
+            print(f"https://app.datadoghq.eu/logs?query=%40correlation_id%3A{trace_id}")
         search_id = self.search_id or (self._search_task.search_task_id if self._search_task is not None else None)
-        with MDC(trace_id=trace_id, search_id=search_id):
+        with MDC(correlation_id=trace_id, search_id=search_id):
             self.dump_input(trace_id, X)
             if len(args) > 0:
                 msg = f"WARNING: Unsupported positional arguments for transform: {args}"
@@ -905,10 +906,10 @@ class FeaturesEnricher(TransformerMixin):
             Dataframe with metrics calculated on train and validation datasets.
         """

-        trace_id = trace_id or
+        trace_id = trace_id or time.time_ns()
         start_time = time.time()
         search_id = self.search_id or (self._search_task.search_task_id if self._search_task is not None else None)
-        with MDC(trace_id=trace_id, search_id=search_id):
+        with MDC(correlation_id=trace_id, search_id=search_id):
             self.logger.info("Start calculate metrics")
             if len(args) > 0:
                 msg = f"WARNING: Unsupported positional arguments for calculate_metrics: {args}"
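All of the trace hunks above follow one pattern: a numeric trace id from `time.time_ns()` is bound to log records under the `correlation_id` key via the `MDC` context manager, so client logs match the printed Datadog query. A standalone sketch of the MDC pattern (a minimal contextvars-based stand-in, not upgini's implementation in upgini/mdc):

import contextvars
import logging
import time

_mdc = contextvars.ContextVar("mdc", default={})

class MDC:
    def __init__(self, **fields):
        self.fields = fields
    def __enter__(self):
        # merge the new fields into the ambient diagnostic context
        self.token = _mdc.set({**_mdc.get(), **self.fields})
    def __exit__(self, *exc):
        _mdc.reset(self.token)

class MDCFilter(logging.Filter):
    def filter(self, record):
        record.mdc = _mdc.get()  # attach the context to every record
        return True

logging.basicConfig(format="%(mdc)s %(message)s", level=logging.INFO)
logger = logging.getLogger("upgini-demo")
logger.addFilter(MDCFilter())

trace_id = time.time_ns()  # the new numeric trace-id scheme
with MDC(correlation_id=trace_id):
    logger.info("Start fit")  # logged as {'correlation_id': ...} Start fit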
@@ -1416,13 +1417,11 @@ class FeaturesEnricher(TransformerMixin):
         # Find latest eval set or earliest if all eval sets are before train set
         date_column = self._get_date_column(search_keys)

-        date_converter = DateTimeSearchKeyConverter(
+        date_converter = DateTimeConverter(
             date_column, self.date_format, self.logger, self.bundle, generate_cyclical_features=False
         )

-
-
-        x_date = X[date_column].dropna()
+        x_date = date_converter.to_date_ms(X).dropna()
         if len(x_date) == 0:
             self.logger.warning("Empty date column in X")
             return []
@@ -1435,8 +1434,7 @@ class FeaturesEnricher(TransformerMixin):
             if date_column not in eval_x.columns:
                 self.logger.warning(f"Date column not found in eval_set {i + 1}")
                 continue
-
-            eval_x_date = eval_x[date_column].dropna()
+            eval_x_date = date_converter.to_date_ms(eval_x).dropna()
             if len(eval_x_date) < 1000:
                 self.logger.warning(f"Eval_set {i} has less than 1000 rows. It will be ignored for stability check")
                 continue
@@ -1473,8 +1471,7 @@ class FeaturesEnricher(TransformerMixin):
         )
         checking_eval_set_df = checking_eval_set_df.copy()

-        checking_eval_set_df[date_column] = eval_set_dates[selected_eval_set_idx]
-        checking_eval_set_df = date_converter.convert(checking_eval_set_df)
+        checking_eval_set_df[date_column] = date_converter.to_date_ms(eval_set_dates[selected_eval_set_idx].to_frame())

         psi_values_sparse = calculate_sparsity_psi(
             checking_eval_set_df, cat_features, date_column, self.logger, model_task_type
@@ -1482,7 +1479,11 @@ class FeaturesEnricher(TransformerMixin):

         self.logger.info(f"PSI values by sparsity: {psi_values_sparse}")

-        unstable_by_sparsity = [
+        unstable_by_sparsity = [
+            feature
+            for feature, psi in psi_values_sparse.items()
+            if psi > stability_threshold
+        ]
         if unstable_by_sparsity:
             self.logger.info(f"Unstable by sparsity features ({stability_threshold}): {sorted(unstable_by_sparsity)}")

@@ -1492,7 +1493,11 @@ class FeaturesEnricher(TransformerMixin):

         self.logger.info(f"PSI values by value: {psi_values}")

-        unstable_by_value = [
+        unstable_by_value = [
+            feature
+            for feature, psi in psi_values.items()
+            if psi > stability_threshold
+        ]
         if unstable_by_value:
             self.logger.info(f"Unstable by value features ({stability_threshold}): {sorted(unstable_by_value)}")

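Both reshaped comprehensions implement the same rule: a feature is flagged unstable when its PSI (by sparsity or by value) exceeds `stability_threshold`. A toy example with invented feature names and PSI values:

psi_values = {"f_income": 0.03, "f_geo_risk": 0.41, "f_age": 0.11}
stability_threshold = 0.2

unstable_by_value = [
    feature
    for feature, psi in psi_values.items()
    if psi > stability_threshold
]
print(sorted(unstable_by_value))  # ['f_geo_risk']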
@@ -1746,9 +1751,11 @@ class FeaturesEnricher(TransformerMixin):
             not in (
                 excluding_search_keys
                 + list(self.fit_dropped_features)
-                + [
+                + [DateTimeConverter.DATETIME_COL, SYSTEM_RECORD_ID, ENTITY_SYSTEM_RECORD_ID]
             )
         ]
+        if self.baseline_score_column is not None and self.baseline_score_column not in client_features:
+            client_features.append(self.baseline_score_column)
         self.logger.info(f"Client features column on prepare data for metrics: {client_features}")

         selected_enriched_features = [c for c in self.feature_names_ if c not in client_features]
@@ -1996,7 +2003,7 @@ class FeaturesEnricher(TransformerMixin):
         date_column = self._get_date_column(search_keys)
         generated_features = []
         if date_column is not None:
-            converter = DateTimeSearchKeyConverter(
+            converter = DateTimeConverter(
                 date_column,
                 self.date_format,
                 self.logger,
@@ -2005,6 +2012,7 @@ class FeaturesEnricher(TransformerMixin):
             )
             # Leave original date column values
             df_with_date_features = converter.convert(df, keep_time=True)
+            # TODO check if this is correct
             df_with_date_features[date_column] = df[date_column]
             df = df_with_date_features
             generated_features = converter.generated_features
@@ -2035,15 +2043,17 @@ class FeaturesEnricher(TransformerMixin):

         # Sample after sorting by system_record_id for idempotency
         df.sort_values(by=SYSTEM_RECORD_ID, inplace=True)
-        df = self.__downsample_for_metrics(df)

-        if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
-            df = df.drop(columns=DateTimeSearchKeyConverter.DATETIME_COL)
+        if DateTimeConverter.DATETIME_COL in df.columns:
+            df = df.drop(columns=DateTimeConverter.DATETIME_COL)

         df = df.rename(columns=columns_renaming)
         generated_features = [columns_renaming.get(c, c) for c in generated_features]
         search_keys = {columns_renaming.get(k, k): v for k, v in search_keys.items()}

+        # It uses original columns names!
+        df = self.__downsample_for_metrics(df)
+
         train_df = df.query(f"{EVAL_SET_INDEX} == 0") if eval_set is not None else df
         X_sampled = train_df.drop(columns=[TARGET, EVAL_SET_INDEX], errors="ignore")
         y_sampled = train_df[TARGET].copy()
@@ -2387,7 +2397,7 @@ class FeaturesEnricher(TransformerMixin):
     def get_progress(self, trace_id: str | None = None, search_task: SearchTask | None = None) -> SearchProgress:
         search_task = search_task or self._search_task
         if search_task is not None:
-            trace_id = trace_id or
+            trace_id = trace_id or time.time_ns()
             return search_task.get_progress(trace_id)

     def display_transactional_transform_api(self, only_online_sources=False):
@@ -2415,7 +2425,7 @@ class FeaturesEnricher(TransformerMixin):
                 return "12345678"
             return "test_value"

-        file_metadata = self._search_task.get_file_metadata(
+        file_metadata = self._search_task.get_file_metadata(time.time_ns())

         def get_column_meta(column_name: str) -> FileColumnMetadata:
             for c in file_metadata.columns:
@@ -2509,7 +2519,7 @@ if response.status_code == 200:

         start_time = time.time()
         search_id = self.search_id or (self._search_task.search_task_id if self._search_task is not None else None)
-        with MDC(trace_id=trace_id, search_id=search_id):
+        with MDC(correlation_id=trace_id, search_id=search_id):
             self.logger.info("Start transform")

             validated_X, validated_y, validated_eval_set = self._validate_train_eval(
@@ -2598,7 +2608,7 @@ if response.status_code == 200:
         generated_features = []
         date_column = self._get_date_column(search_keys)
         if date_column is not None:
-            converter = DateTimeSearchKeyConverter(
+            converter = DateTimeConverter(
                 date_column,
                 self.date_format,
                 self.logger,
@@ -2655,8 +2665,8 @@ if response.status_code == 200:

             # Don't pass all features in backend on transform
             runtime_parameters = self._get_copy_of_runtime_parameters()
-            features_for_transform = self._search_task.get_features_for_transform()
-            if
+            features_for_transform = self._search_task.get_features_for_transform()
+            if features_for_transform:
                 missing_features_for_transform = [
                     columns_renaming.get(f) or f for f in features_for_transform if f not in df.columns
                 ]
@@ -2667,7 +2677,10 @@ if response.status_code == 200:
                 raise ValidationError(
                     self.bundle.get("missing_features_for_transform").format(missing_features_for_transform)
                 )
-
+            features_for_embeddings = self._search_task.get_features_for_embeddings()
+            if features_for_embeddings:
+                runtime_parameters.properties["features_for_embeddings"] = ",".join(features_for_embeddings)
             features_for_transform = [f for f in features_for_transform if f not in search_keys.keys()]

             columns_for_system_record_id = sorted(list(search_keys.keys()) + features_for_transform)
@@ -2728,8 +2741,17 @@ if response.status_code == 200:
             )
             df = converter.convert(df)

+            date_features = []
+            for col in features_for_transform:
+                if DateTimeConverter(col).is_datetime(df):
+                    df[col] = DateTimeConverter(col).to_date_string(df)
+                    date_features.append(col)
+
             meaning_types = {}
-            meaning_types.update(
+            meaning_types.update(
+                {col: FileColumnMeaningType.FEATURE for col in features_for_transform if col not in date_features}
+            )
+            meaning_types.update({col: FileColumnMeaningType.DATE_FEATURE for col in date_features})
             meaning_types.update({col: key.value for col, key in search_keys.items()})

             features_not_to_pass.extend(
@@ -2742,8 +2764,8 @@ if response.status_code == 200:
                 ]
             )

-            if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
-                df = df.drop(columns=DateTimeSearchKeyConverter.DATETIME_COL)
+            if DateTimeConverter.DATETIME_COL in df.columns:
+                df = df.drop(columns=DateTimeConverter.DATETIME_COL)

             # search keys might be changed after explode
             columns_for_system_record_id = sorted(list(search_keys.keys()) + features_for_transform)
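The new `date_features` loop probes each backend-requested column with `DateTimeConverter.is_datetime` and, when it parses, serializes it with `to_date_string` and tags it `DATE_FEATURE` instead of `FEATURE`. An approximation of that probe-and-tag flow in plain pandas (column names invented; the real code goes through `DateTimeConverter`):

import pandas as pd

df = pd.DataFrame({
    "last_login": ["2024-03-07 12:30:00", "2024-03-08 09:00:00"],
    "clicks": [3, 7],
})

date_features, plain_features = [], []
for col in df.columns:
    parsed = pd.to_datetime(df[col], errors="coerce") if df[col].dtype == object else None
    if parsed is not None and not parsed.isna().all():
        df[col] = parsed.dt.strftime("%Y-%m-%d")  # what to_date_string returns
        date_features.append(col)                 # -> FileColumnMeaningType.DATE_FEATURE
    else:
        plain_features.append(col)                # -> FileColumnMeaningType.FEATURE

print(date_features, plain_features)  # ['last_login'] ['clicks']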
@@ -2925,6 +2947,7 @@ if response.status_code == 200:
                 or c in self.search_keys
                 or c in (self.id_columns or [])
                 or c in [EVAL_SET_INDEX, TARGET]  # transform for metrics calculation
+                or c == self.baseline_score_column
             ]
         else:
             selected_input_columns = []
@@ -3123,7 +3146,7 @@ if response.status_code == 200:
         self.fit_generated_features = []

         if has_date:
-            converter = DateTimeSearchKeyConverter(
+            converter = DateTimeConverter(
                 maybe_date_column,
                 self.date_format,
                 self.logger,
@@ -3176,8 +3199,8 @@ if response.status_code == 200:
             self.TARGET_NAME,
             EVAL_SET_INDEX,
         ] + list(self.fit_search_keys.keys())
-        if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
-            non_feature_columns.append(DateTimeSearchKeyConverter.DATETIME_COL)
+        if DateTimeConverter.DATETIME_COL in df.columns:
+            non_feature_columns.append(DateTimeConverter.DATETIME_COL)

         features_columns = [c for c in df.columns if c not in non_feature_columns]

@@ -3264,15 +3287,27 @@ if response.status_code == 200:
             ENTITY_SYSTEM_RECORD_ID,
             SEARCH_KEY_UNNEST,
         ] + list(self.fit_search_keys.keys())
-        if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
-            non_feature_columns.append(DateTimeSearchKeyConverter.DATETIME_COL)
+        if DateTimeConverter.DATETIME_COL in df.columns:
+            non_feature_columns.append(DateTimeConverter.DATETIME_COL)

         features_columns = [c for c in df.columns if c not in non_feature_columns]

+        # find date features
+        date_features = []
+        for col in features_columns:
+            if DateTimeConverter(col).is_datetime(df):
+                df[col] = DateTimeConverter(col).to_date_string(df)
+                date_features.append(col)
+
         meaning_types = {
             **{col: key.value for col, key in self.fit_search_keys.items()},
-            **{
+            **{
+                str(c): FileColumnMeaningType.FEATURE
+                for c in df.columns
+                if c not in non_feature_columns and c not in date_features
+            },
         }
+        meaning_types.update({col: FileColumnMeaningType.DATE_FEATURE for col in date_features})
         meaning_types[self.TARGET_NAME] = FileColumnMeaningType.TARGET
         meaning_types[ENTITY_SYSTEM_RECORD_ID] = FileColumnMeaningType.ENTITY_SYSTEM_RECORD_ID
         if SEARCH_KEY_UNNEST in df.columns:
@@ -3293,8 +3328,8 @@ if response.status_code == 200:
             self.bundle,
         )

-        if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
-            df = df.drop(columns=DateTimeSearchKeyConverter.DATETIME_COL)
+        if DateTimeConverter.DATETIME_COL in df.columns:
+            df = df.drop(columns=DateTimeConverter.DATETIME_COL)

         meaning_types[SYSTEM_RECORD_ID] = FileColumnMeaningType.SYSTEM_RECORD_ID

@@ -3331,7 +3366,9 @@ if response.status_code == 200:
         dataset.columns_renaming = self.fit_columns_renaming

         self.passed_features = [
-            column
+            column
+            for column, meaning_type in meaning_types.items()
+            if meaning_type in [FileColumnMeaningType.FEATURE, FileColumnMeaningType.DATE_FEATURE]
         ]

         self._search_task = dataset.search(
@@ -3859,8 +3896,8 @@ if response.status_code == 200:
         X = Xy.drop(columns=TARGET)
         y = Xy[TARGET].copy()

-        if DateTimeSearchKeyConverter.DATETIME_COL in X.columns:
-            X.drop(columns=DateTimeSearchKeyConverter.DATETIME_COL, inplace=True)
+        if DateTimeConverter.DATETIME_COL in X.columns:
+            X.drop(columns=DateTimeConverter.DATETIME_COL, inplace=True)

         return X, y

@@ -3870,8 +3907,8 @@ if response.status_code == 200:
         X: pd.DataFrame, y: pd.Series, search_keys: dict[str, SearchKey], cv: CVType | None
     ) -> tuple[pd.DataFrame, pd.Series]:
         if cv not in [CVType.time_series, CVType.blocked_time_series]:
-            if DateTimeSearchKeyConverter.DATETIME_COL in X.columns:
-                date_column = DateTimeSearchKeyConverter.DATETIME_COL
+            if DateTimeConverter.DATETIME_COL in X.columns:
+                date_column = DateTimeConverter.DATETIME_COL
             else:
                 date_column = FeaturesEnricher._get_date_column(search_keys)
             sort_columns = [date_column] if date_column is not None else []
@@ -3899,8 +3936,8 @@ if response.status_code == 200:

         y = Xy[TARGET].copy()

-        if DateTimeSearchKeyConverter.DATETIME_COL in X.columns:
-            X.drop(columns=DateTimeSearchKeyConverter.DATETIME_COL, inplace=True)
+        if DateTimeConverter.DATETIME_COL in X.columns:
+            X.drop(columns=DateTimeConverter.DATETIME_COL, inplace=True)

         return X, y

@@ -3979,12 +4016,10 @@ if response.status_code == 200:
             maybe_date_col = SearchKey.find_key(self.search_keys, [SearchKey.DATE, SearchKey.DATETIME])
             if X is not None and maybe_date_col is not None and maybe_date_col in X.columns:
                 # TODO cast date column to single dtype
-                date_converter = DateTimeSearchKeyConverter(
-
-                )
-
-                min_date = converted_X[maybe_date_col].min()
-                max_date = converted_X[maybe_date_col].max()
+                date_converter = DateTimeConverter(maybe_date_col, self.date_format, generate_cyclical_features=False)
+                date_col_values = date_converter.to_date_ms(X)
+                min_date = date_col_values.min()
+                max_date = date_col_values.max()
                 self.logger.info(f"Dates interval is ({min_date}, {max_date})")

         except Exception:
@@ -4021,7 +4056,7 @@ if response.status_code == 200:
             self.__log_warning(bundle.get("current_date_added"))
             df[FeaturesEnricher.CURRENT_DATE] = datetime.date.today()
             search_keys[FeaturesEnricher.CURRENT_DATE] = SearchKey.DATE
-            converter =
+            converter = DateTimeConverter(FeaturesEnricher.CURRENT_DATE, generate_cyclical_features=False)
             df = converter.convert(df)
         return df

@@ -4152,8 +4187,8 @@ if response.status_code == 200:
             "__target",
             ENTITY_SYSTEM_RECORD_ID,
         ]
-        if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
-            date_column = DateTimeSearchKeyConverter.DATETIME_COL
+        if DateTimeConverter.DATETIME_COL in df.columns:
+            date_column = DateTimeConverter.DATETIME_COL
             sort_exclude_columns.append(FeaturesEnricher._get_date_column(search_keys))
         else:
             date_column = FeaturesEnricher._get_date_column(search_keys)
@@ -4953,7 +4988,7 @@ if response.status_code == 200:
         eval_set: tuple | None = None,
     ):
         def dump_task(X_, y_, eval_set_):
-            with MDC(trace_id=trace_id):
+            with MDC(correlation_id=trace_id):
                 try:
                     if isinstance(X_, pd.Series):
                         X_ = X_.to_frame()
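Several hunks in this file swap ad-hoc `floor("D").astype(np.int64) // 1_000_000` arithmetic for the new `to_date_ms`/`convert_datetime_to_date_ms` helpers. A sketch of the day-precision millisecond conversion for a nanosecond-unit series (other units scale differently, as the helper's `dt.unit` branches in datetime_utils show):

import numpy as np
import pandas as pd

dates = pd.Series(pd.to_datetime(["2024-03-07 15:42:10", "2024-03-08 01:05:00"]))
floored = dates.dt.floor("D")                    # drop the time component
date_ms = floored.astype(np.int64) // 1_000_000  # ns since epoch -> ms since epoch
print(date_ms.tolist())                          # [1709769600000, 1709856000000]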
upgini/metadata.py
CHANGED
@@ -36,6 +36,7 @@ class FileColumnMeaningType(Enum):
     SCORE = "SCORE"
     TARGET = "TARGET"
     FEATURE = "FEATURE"
+    DATE_FEATURE = "DATE_FEATURE"
     CUSTOM_KEY = "CUSTOM_KEY"
     COUNTRY = "COUNTRY"
     POSTAL_CODE = "POSTAL_CODE"
@@ -163,7 +164,9 @@ class ModelTaskType(Enum):
         return self in [ModelTaskType.BINARY, ModelTaskType.MULTICLASS]

     @staticmethod
-    def parse(task_type: Any) -> "ModelTaskType":
+    def parse(task_type: Any) -> Optional["ModelTaskType"]:
+        if task_type is None:
+            return None
         if isinstance(task_type, ModelTaskType):
             return task_type
         elif isinstance(task_type, str):
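`parse` is now `None`-tolerant, which is what let `FeaturesEnricher.__init__` drop its own guard around `ModelTaskType.parse(model_task_type)`. A hedged re-implementation of the contract; the string branch below is an assumption beyond what the hunk shows:

from enum import Enum
from typing import Any, Optional

class ModelTaskType(Enum):
    BINARY = "BINARY"
    MULTICLASS = "MULTICLASS"
    REGRESSION = "REGRESSION"

    @staticmethod
    def parse(task_type: Any) -> Optional["ModelTaskType"]:
        if task_type is None:
            return None  # new: callers no longer need their own None check
        if isinstance(task_type, ModelTaskType):
            return task_type
        elif isinstance(task_type, str):
            return ModelTaskType(task_type.upper())
        raise ValueError(f"Unsupported task type: {task_type}")

print(ModelTaskType.parse(None))      # None
print(ModelTaskType.parse("binary"))  # ModelTaskType.BINARY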
upgini/normalizer/normalize_utils.py
CHANGED
@@ -25,7 +25,7 @@ from upgini.metadata import (
 from upgini.resource_bundle import ResourceBundle, get_custom_bundle
 from upgini.utils import find_numbers_with_decimal_comma
 from upgini.utils.country_utils import CountrySearchKeyConverter
-from upgini.utils.datetime_utils import DateTimeSearchKeyConverter
+from upgini.utils.datetime_utils import DateTimeConverter
 from upgini.utils.ip_utils import IpSearchKeyConverter
 from upgini.utils.phone_utils import PhoneSearchKeyConverter
 from upgini.utils.postal_code_utils import PostalCodeSearchKeyConverter
@@ -89,7 +89,7 @@ class Normalizer:
             SYSTEM_RECORD_ID,
             ENTITY_SYSTEM_RECORD_ID,
             SEARCH_KEY_UNNEST,
-            DateTimeSearchKeyConverter.DATETIME_COL,
+            DateTimeConverter.DATETIME_COL,
         ]:
             self.columns_renaming[column] = column
             new_columns.append(column)
upgini/search_task.py
CHANGED
@@ -165,10 +165,21 @@ class SearchTask:

         return list(zero_hit_search_keys)

-    def get_features_for_transform(self) -> Optional[List[str]]:
+    def get_features_for_embeddings(self) -> Optional[List[str]]:
         if self.provider_metadata_v2 is None:
             return None

+        features_for_transform = set()
+        for meta in self.provider_metadata_v2:
+            if meta.features_used_for_embeddings is not None:
+                features_for_transform.update(meta.features_used_for_embeddings)
+
+        return list(features_for_transform)
+
+    def get_features_for_transform(self) -> List[str]:
+        if self.provider_metadata_v2 is None:
+            return []
+
         features_for_transform = set()
         for meta in self.provider_metadata_v2:
             if meta.features_used_for_embeddings is not None:
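The new `get_features_for_embeddings` unions `features_used_for_embeddings` across provider metadata entries, and `transform` forwards the result through `runtime_parameters`. An illustration with stand-in metadata objects, assuming only that each provider entry exposes an optional `features_used_for_embeddings` list as the hunk shows:

from dataclasses import dataclass
from typing import List, Optional

@dataclass
class ProviderMeta:
    features_used_for_embeddings: Optional[List[str]] = None

provider_metadata_v2 = [
    ProviderMeta(["description", "title"]),
    ProviderMeta(None),
    ProviderMeta(["title"]),
]

features = set()
for meta in provider_metadata_v2:
    if meta.features_used_for_embeddings is not None:
        features.update(meta.features_used_for_embeddings)

# transform() then forwards the union to the backend:
runtime_properties = {"features_for_embeddings": ",".join(sorted(features))}
print(runtime_properties)  # {'features_for_embeddings': 'description,title'}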
upgini/utils/datetime_utils.py
CHANGED
@@ -30,7 +30,7 @@ DATE_FORMATS = [
 DATETIME_PATTERN = r"^[\d\s\.\-:T/+]+$"


-class DateTimeSearchKeyConverter:
+class DateTimeConverter:
     DATETIME_COL = "_date_time"
     # MIN_SUPPORTED_DATE_TS = datetime.datetime(1999, 12, 31)  # 946684800000 # 2000-01-01
     MIN_SUPPORTED_DATE_TS = pd.to_datetime(datetime.datetime(1999, 12, 31)).tz_localize(None)
@@ -73,41 +73,99 @@ class DateTimeSearchKeyConverter:
         except Exception:
             return None

-    def convert(self, df: pd.DataFrame, keep_time=False) -> pd.DataFrame:
-        if len(df) == 0:
-            return df
-        df = df.copy()
-        if df[self.date_column].
+    def is_datetime(self, df: pd.DataFrame) -> bool:
+        if len(df) == 0 or df[self.date_column].isna().all():
+            return False
+
+        if pd.api.types.is_datetime64_any_dtype(df[self.date_column]):
+            return True
+
+        parsed = self.parse_datetime(df, raise_errors=False)
+        return parsed is not None and not parsed.isna().all()
+
+    def parse_datetime(self, df: pd.DataFrame, raise_errors=True) -> pd.Series | None:
+        df = df.copy()
+        if len(df) == 0 or df[self.date_column].isna().all():
+            return None
+
+        try:
+            if df[self.date_column].apply(lambda x: isinstance(x, datetime.datetime)).all():
+                parsed_datetime = df[self.date_column].apply(lambda x: x.replace(tzinfo=None))
+            elif isinstance(df[self.date_column].dropna().values[0], datetime.date):
+                parsed_datetime = pd.to_datetime(df[self.date_column], errors="coerce")
+            elif isinstance(df[self.date_column].dtype, pd.PeriodDtype):
+                parsed_datetime = df[self.date_column].dt.to_timestamp()
+            elif is_numeric_dtype(df[self.date_column]):
+                # 315532801 - 2524608001 - seconds
+                # 315532801000 - 2524608001000 - milliseconds
+                # 315532801000000 - 2524608001000000 - microseconds
+                # 315532801000000000 - 2524608001000000000 - nanoseconds
+                if df[self.date_column].apply(lambda x: 10**16 < x).all():
+                    parsed_datetime = pd.to_datetime(df[self.date_column], unit="ns")
+                elif df[self.date_column].apply(lambda x: 10**14 < x < 10**16).all():
+                    parsed_datetime = pd.to_datetime(df[self.date_column], unit="us")
+                elif df[self.date_column].apply(lambda x: 10**11 < x < 10**14).all():
+                    parsed_datetime = pd.to_datetime(df[self.date_column], unit="ms")
+                elif df[self.date_column].apply(lambda x: 10**8 < x < 10**11).all():
+                    parsed_datetime = pd.to_datetime(df[self.date_column], unit="s")
+                else:
+                    msg = self.bundle.get("unsupported_date_type").format(self.date_column)
+                    if raise_errors:
+                        raise ValidationError(msg)
+                    else:
+                        return None
+            else:
+                df[self.date_column] = df[self.date_column].astype("string").apply(self.clean_date)
+                parsed_datetime = self.parse_string_date(df, raise_errors)
+            parsed_datetime = parsed_datetime.dt.tz_localize(None)
+            return parsed_datetime
+        except Exception as e:
+            if raise_errors:
+                raise ValidationError(e)
             else:
-
-
+                return None
+
+    def to_date_string(self, df: pd.DataFrame) -> pd.Series:
+        parsed_datetime = self.parse_datetime(df)
+        if parsed_datetime is None:
+            return df[self.date_column]
+        return parsed_datetime.dt.strftime("%Y-%m-%d")
+
+    def to_date_ms(self, df: pd.DataFrame) -> pd.Series:
+        parsed_datetime = self.parse_datetime(df)
+        if parsed_datetime is None:
+            return df[self.date_column]
+        return self.convert_datetime_to_date_ms(parsed_datetime)
+
+    def convert_datetime_to_datetime_ms(self, date_col: pd.Series) -> pd.Series:
+        if date_col.dt.unit == "ns":
+            date_col = date_col.astype(np.int64) // 1_000_000
+        elif date_col.dt.unit == "us":
+            date_col = date_col.astype(np.int64) // 1_000
+        elif date_col.dt.unit == "ms":
+            date_col = date_col.astype(np.int64)
+        elif date_col.dt.unit == "s":
+            date_col = date_col.astype(np.int64) * 1_000
         else:
-
-
+            raise ValueError(f"Unsupported date unit: {date_col.dt.unit}")
+
+        return date_col.apply(self._int_to_opt).astype("Int64")
+
+    def convert_datetime_to_date_ms(self, date_col: pd.Series) -> pd.Series:
+        date_col = date_col.dt.floor("D")
+        return self.convert_datetime_to_datetime_ms(date_col)
+
+    def convert(self, df: pd.DataFrame, keep_time=False) -> pd.DataFrame:
+        df = df.copy()
+        parsed_datetime = self.parse_datetime(df)
+        if parsed_datetime is None:
+            return df
+
+        df[self.date_column] = parsed_datetime

         # If column with date is datetime then extract seconds of the day and minute of the hour
         # as additional features
         seconds = "datetime_seconds"
-        df[self.date_column] = df[self.date_column].dt.tz_localize(None)

         df = self.clean_old_dates(df)

@@ -182,21 +240,22 @@ class DateTimeSearchKeyConverter:
         df.drop(columns=seconds, inplace=True)

         if keep_time:
-            df[self.DATETIME_COL] = df[self.date_column]
-
-        df[self.date_column] = df[self.date_column].dt.floor("D").astype(np.int64) // 1_000_000
-        df[self.date_column] = df[self.date_column].apply(self._int_to_opt).astype("Int64")
+            df[self.DATETIME_COL] = self.convert_datetime_to_datetime_ms(df[self.date_column])
+        df[self.date_column] = self.convert_datetime_to_date_ms(df[self.date_column])

         self.logger.info(f"Date after convertion to timestamp: {df[self.date_column]}")

         return df

-    def parse_string_date(self, df: pd.DataFrame) -> pd.Series:
+    def parse_string_date(self, df: pd.DataFrame, raise_errors=True) -> pd.Series | None:
         if self.date_format is not None:
             try:
                 return pd.to_datetime(df[self.date_column], format=self.date_format)
             except ValueError as e:
-                raise ValidationError(e)
+                if raise_errors:
+                    raise ValidationError(e)
+                else:
+                    return None
         else:
             for date_format in DATE_FORMATS:
                 try:
@@ -204,9 +263,17 @@ class DateTimeSearchKeyConverter:
                 except ValueError:
                     pass
             try:
-                return pd.to_datetime(df[self.date_column])
+                # Suppress warning for intentional fallback to dateutil parsing
+                import warnings
+
+                with warnings.catch_warnings():
+                    warnings.filterwarnings("ignore", message="Could not infer format")
+                    return pd.to_datetime(df[self.date_column])
             except ValueError:
-                raise ValidationError(self.bundle.get("invalid_date_format").format(self.date_column))
+                if raise_errors:
+                    raise ValidationError(self.bundle.get("invalid_date_format").format(self.date_column))
+                else:
+                    return None

     def clean_old_dates(self, df: pd.DataFrame) -> pd.DataFrame:
         condition = df[self.date_column] <= self.MIN_SUPPORTED_DATE_TS
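The numeric branch of `parse_datetime` infers the epoch unit from order of magnitude; the commented ranges bracket timestamps between roughly 1980 and 2050 in each unit. A small demonstration under the same assumption that all values in a column share one unit (`infer_epoch_unit` is an illustrative helper, not part of the package):

import pandas as pd

def infer_epoch_unit(values: pd.Series) -> str:
    if (values > 10**16).all():
        return "ns"
    if values.between(10**14, 10**16, inclusive="neither").all():
        return "us"
    if values.between(10**11, 10**14, inclusive="neither").all():
        return "ms"
    if values.between(10**8, 10**11, inclusive="neither").all():
        return "s"
    raise ValueError("values do not fit a single supported epoch unit")

ts = pd.Series([1711929600, 1711933200])  # 2024-04-01, in seconds
unit = infer_epoch_unit(ts)
print(unit, pd.to_datetime(ts, unit=unit).tolist())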
upgini/utils/deduplicate_utils.py
CHANGED
@@ -14,7 +14,7 @@ from upgini.metadata import (
     SearchKey,
 )
 from upgini.resource_bundle import ResourceBundle, get_custom_bundle
-from upgini.utils.datetime_utils import DateTimeSearchKeyConverter
+from upgini.utils.datetime_utils import DateTimeConverter
 from upgini.utils.target_utils import define_task

@@ -104,7 +104,7 @@ def remove_fintech_duplicates(
     sub_df = pd.merge(sub_df, nonunique_target_rows, on=personal_cols)

     # Convert date columns for further checks
-    sub_df = DateTimeSearchKeyConverter(
+    sub_df = DateTimeConverter(
         date_col, date_format=date_format, logger=logger, bundle=bundle, generate_cyclical_features=False
     ).convert(sub_df)
     grouped_by_personal_cols = sub_df.groupby(personal_cols, group_keys=False)
{upgini-1.2.123.dist-info → upgini-1.2.125.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: upgini
-Version: 1.2.123
+Version: 1.2.125
 Summary: Intelligent data search & enrichment for Machine Learning
 Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
 Project-URL: Homepage, https://upgini.com/
@@ -30,6 +30,7 @@ Requires-Dist: ipywidgets>=8.1.0
 Requires-Dist: jarowinkler>=2.0.0
 Requires-Dist: levenshtein>=0.25.1
 Requires-Dist: lightgbm>=4.6.0
+Requires-Dist: more-itertools==10.7.0
 Requires-Dist: numpy<3.0.0,>=1.19.0
 Requires-Dist: pandas<3.0.0,>=1.1.0
 Requires-Dist: psutil>=5.9.0
{upgini-1.2.123.dist-info → upgini-1.2.125.dist-info}/RECORD
CHANGED
@@ -1,20 +1,20 @@
-upgini/__about__.py,sha256=
+upgini/__about__.py,sha256=khvL6Ma3KHnaaXtUCPR9kKBJFG5qg7emKoKVlrbEt0k,24
 upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
 upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
-upgini/dataset.py,sha256=
+upgini/dataset.py,sha256=Nm2ZmwyQqvTnymYpGUwyJWy7y2ebXlHMyYmGeGcyA_s,31652
 upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
-upgini/features_enricher.py,sha256=
+upgini/features_enricher.py,sha256=tmKeERG2b0YfJ47g-UXQQ3S-9tyagwUOhI4oqN3kG2w,233058
 upgini/http.py,sha256=-J_wOpnwVnT0ebPC6sOs6fN3AWtCD0LJLu6nlYmxaqk,44348
-upgini/metadata.py,sha256=
+upgini/metadata.py,sha256=CL9bFytdUZlbQYtTgNgAkt_sxO9klARQtULDBgb2Hlg,12575
 upgini/metrics.py,sha256=KCPE_apPN-9BIdv6GqASbJVaB_gBcy8wzNApAcyaGo4,46020
-upgini/search_task.py,sha256=
+upgini/search_task.py,sha256=5mL_qV5mVtDkIumM9xCOgfa9Lc2B8mxJ1qI21iaScnQ,18656
 upgini/spinner.py,sha256=4iMd-eIe_BnkqFEMIliULTbj6rNI2HkN_VJ4qYe0cUc,1118
 upgini/version_validator.py,sha256=DvbaAvuYFoJqYt0fitpsk6Xcv-H1BYDJYHUMxaKSH_Y,1509
 upgini/ads_management/__init__.py,sha256=qzyisOToVRP-tquAJD1PblZhNtMrOB8FiyF9JvfkvgE,50
 upgini/ads_management/ads_manager.py,sha256=igVbN2jz80Umb2BUJixmJVj-zx8unoKpecVo-R-nGdw,2648
 upgini/autofe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 upgini/autofe/all_operators.py,sha256=rdjF5eaE4bC6Q4eu_el5Z7ekYt8DjOFermz2bePPbUc,333
-upgini/autofe/binary.py,sha256=
+upgini/autofe/binary.py,sha256=o3TQuP3EnECAVIeToGczu4yJ4vX7BJ2iSCN9Ra1SZJI,7829
 upgini/autofe/date.py,sha256=RvexgrL1_6ISYPVrl9HUQmPgpVSGQsTNv8YhNQWs-5M,11329
 upgini/autofe/feature.py,sha256=W9sZHdz5Vi0H_oPyY5saZAPjyd5wunpULnCqrGLpQc4,16879
 upgini/autofe/groupby.py,sha256=IYmQV9uoCdRcpkeWZj_kI3ObzoNCNx3ff3h8sTL01tk,3603
@@ -35,7 +35,7 @@ upgini/data_source/data_source_publisher.py,sha256=qXQUYErhCmkWHm2FWgTL0FYZ2aJbx
 upgini/mdc/__init__.py,sha256=iHJlXQg6xRM1-ZOUtaPSJqw5SpQDszvxp4LyqviNLIQ,1027
 upgini/mdc/context.py,sha256=3u1B-jXt7tXEvNcV3qmR9SDCseudnY7KYsLclBdwVLk,1405
 upgini/normalizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-upgini/normalizer/normalize_utils.py,sha256=
+upgini/normalizer/normalize_utils.py,sha256=w9f_9udrwqbhXgFMTs2keuce-6X_j6h3D7EdNo_2X7g,8493
 upgini/resource_bundle/__init__.py,sha256=S5F2G47pnJd2LDpmFsjDqEwiKkP8Hm-hcseDbMka6Ko,8345
 upgini/resource_bundle/exceptions.py,sha256=5fRvx0_vWdE1-7HcSgF0tckB4A9AKyf5RiinZkInTsI,621
 upgini/resource_bundle/strings.properties,sha256=KcXm1Nl6c3zswL91tIbG0DjuuNpzxUdCg1cY9f2-9cg,29283
@@ -52,8 +52,8 @@ upgini/utils/config.py,sha256=zFdnjchykfp_1Tm3Qep7phLzXBpXIOzr2tIuXchRBLw,1754
 upgini/utils/country_utils.py,sha256=lY-eXWwFVegdVENFttbvLcgGDjFO17Sex8hd2PyJaRk,6937
 upgini/utils/custom_loss_utils.py,sha256=kieNZYBYZm5ZGBltF1F_jOSF4ea6C29rYuCyiDcqVNY,3857
 upgini/utils/cv_utils.py,sha256=w6FQb9nO8BWDx88EF83NpjPLarK4eR4ia0Wg0kLBJC4,3525
-upgini/utils/datetime_utils.py,sha256=
-upgini/utils/deduplicate_utils.py,sha256=
+upgini/utils/datetime_utils.py,sha256=l85UzSQLhtMeI2G6m-m8y8bCColCLSXNHb2-G6fKpLM,16988
+upgini/utils/deduplicate_utils.py,sha256=6czbn1q0p-lOmrNvbAzueBpDHmfIP4TfV4poWqbjX5w,11255
 upgini/utils/display_utils.py,sha256=uSG3JwpwCIgRJXsp-8ktuJ0Dh-WFti7IrRLMUfHfoDc,11973
 upgini/utils/email_utils.py,sha256=pZ2vCfNxLIPUhxr0-OlABNXm12jjU44isBk8kGmqQzA,5277
 upgini/utils/fallback_progress_bar.py,sha256=PDaKb8dYpVZaWMroNcOHsTc3pSjgi9mOm0--cOFTwJ0,1074
@@ -74,7 +74,7 @@ upgini/utils/target_utils.py,sha256=GCPn4QeJ83JJ_vyBJ3IhY5fyIRkLC9q9BE59S2FRO1I,
 upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
 upgini/utils/ts_utils.py,sha256=26vhC0pN7vLXK6R09EEkMK3Lwb9IVPH7LRdqFIQ3kPs,1383
 upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
-upgini-1.2.123.dist-info/METADATA,sha256=
-upgini-1.2.123.dist-info/WHEEL,sha256=
-upgini-1.2.123.dist-info/licenses/LICENSE,sha256=
-upgini-1.2.123.dist-info/RECORD,,
+upgini-1.2.125.dist-info/METADATA,sha256=CAoP8m15syLZEVmnYuUjUMI1Jo-XvMCGhz-CZnRYwy4,50781
+upgini-1.2.125.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
+upgini-1.2.125.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
+upgini-1.2.125.dist-info/RECORD,,
{upgini-1.2.123.dist-info → upgini-1.2.125.dist-info}/WHEEL
File without changes
{upgini-1.2.123.dist-info → upgini-1.2.125.dist-info}/licenses/LICENSE
File without changes