upgini 1.1.244a4__tar.gz → 1.1.244a6__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- {upgini-1.1.244a4/src/upgini.egg-info → upgini-1.1.244a6}/PKG-INFO +1 -1
- {upgini-1.1.244a4 → upgini-1.1.244a6}/setup.py +1 -1
- {upgini-1.1.244a4 → upgini-1.1.244a6}/src/upgini/features_enricher.py +97 -58
- {upgini-1.1.244a4 → upgini-1.1.244a6}/src/upgini/metadata.py +2 -1
- {upgini-1.1.244a4 → upgini-1.1.244a6}/src/upgini/metrics.py +1 -0
- {upgini-1.1.244a4 → upgini-1.1.244a6/src/upgini.egg-info}/PKG-INFO +1 -1
- {upgini-1.1.244a4 → upgini-1.1.244a6}/LICENSE +0 -0
- {upgini-1.1.244a4 → upgini-1.1.244a6}/README.md +0 -0
- {upgini-1.1.244a4 → upgini-1.1.244a6}/pyproject.toml +0 -0
- {upgini-1.1.244a4 → upgini-1.1.244a6}/setup.cfg +0 -0
- {upgini-1.1.244a4 → upgini-1.1.244a6}/src/upgini/__init__.py +0 -0
- {upgini-1.1.244a4 → upgini-1.1.244a6}/src/upgini/ads.py +0 -0
- {upgini-1.1.244a4 → upgini-1.1.244a6}/src/upgini/ads_management/__init__.py +0 -0
- {upgini-1.1.244a4 → upgini-1.1.244a6}/src/upgini/ads_management/ads_manager.py +0 -0
- {upgini-1.1.244a4 → upgini-1.1.244a6}/src/upgini/autofe/__init__.py +0 -0
- {upgini-1.1.244a4 → upgini-1.1.244a6}/src/upgini/autofe/all_operands.py +0 -0
- {upgini-1.1.244a4 → upgini-1.1.244a6}/src/upgini/autofe/binary.py +0 -0
- {upgini-1.1.244a4 → upgini-1.1.244a6}/src/upgini/autofe/feature.py +0 -0
- {upgini-1.1.244a4 → upgini-1.1.244a6}/src/upgini/autofe/groupby.py +0 -0
- {upgini-1.1.244a4 → upgini-1.1.244a6}/src/upgini/autofe/operand.py +0 -0
- {upgini-1.1.244a4 → upgini-1.1.244a6}/src/upgini/autofe/unary.py +0 -0
- {upgini-1.1.244a4 → upgini-1.1.244a6}/src/upgini/autofe/vector.py +0 -0
- {upgini-1.1.244a4 → upgini-1.1.244a6}/src/upgini/data_source/__init__.py +0 -0
- {upgini-1.1.244a4 → upgini-1.1.244a6}/src/upgini/data_source/data_source_publisher.py +0 -0
- {upgini-1.1.244a4 → upgini-1.1.244a6}/src/upgini/dataset.py +0 -0
- {upgini-1.1.244a4 → upgini-1.1.244a6}/src/upgini/errors.py +0 -0
- {upgini-1.1.244a4 → upgini-1.1.244a6}/src/upgini/fingerprint.js +0 -0
- {upgini-1.1.244a4 → upgini-1.1.244a6}/src/upgini/http.py +0 -0
- {upgini-1.1.244a4 → upgini-1.1.244a6}/src/upgini/mdc/__init__.py +0 -0
- {upgini-1.1.244a4 → upgini-1.1.244a6}/src/upgini/mdc/context.py +0 -0
- {upgini-1.1.244a4 → upgini-1.1.244a6}/src/upgini/normalizer/__init__.py +0 -0
- {upgini-1.1.244a4 → upgini-1.1.244a6}/src/upgini/normalizer/phone_normalizer.py +0 -0
- {upgini-1.1.244a4 → upgini-1.1.244a6}/src/upgini/resource_bundle/__init__.py +0 -0
- {upgini-1.1.244a4 → upgini-1.1.244a6}/src/upgini/resource_bundle/exceptions.py +0 -0
- {upgini-1.1.244a4 → upgini-1.1.244a6}/src/upgini/resource_bundle/strings.properties +0 -0
- {upgini-1.1.244a4 → upgini-1.1.244a6}/src/upgini/sampler/__init__.py +0 -0
- {upgini-1.1.244a4 → upgini-1.1.244a6}/src/upgini/sampler/base.py +0 -0
- {upgini-1.1.244a4 → upgini-1.1.244a6}/src/upgini/sampler/random_under_sampler.py +0 -0
- {upgini-1.1.244a4 → upgini-1.1.244a6}/src/upgini/sampler/utils.py +0 -0
- {upgini-1.1.244a4 → upgini-1.1.244a6}/src/upgini/search_task.py +0 -0
- {upgini-1.1.244a4 → upgini-1.1.244a6}/src/upgini/spinner.py +0 -0
- {upgini-1.1.244a4 → upgini-1.1.244a6}/src/upgini/utils/__init__.py +0 -0
- {upgini-1.1.244a4 → upgini-1.1.244a6}/src/upgini/utils/base_search_key_detector.py +0 -0
- {upgini-1.1.244a4 → upgini-1.1.244a6}/src/upgini/utils/blocked_time_series.py +0 -0
- {upgini-1.1.244a4 → upgini-1.1.244a6}/src/upgini/utils/country_utils.py +0 -0
- {upgini-1.1.244a4 → upgini-1.1.244a6}/src/upgini/utils/custom_loss_utils.py +0 -0
- {upgini-1.1.244a4 → upgini-1.1.244a6}/src/upgini/utils/cv_utils.py +0 -0
- {upgini-1.1.244a4 → upgini-1.1.244a6}/src/upgini/utils/datetime_utils.py +0 -0
- {upgini-1.1.244a4 → upgini-1.1.244a6}/src/upgini/utils/deduplicate_utils.py +0 -0
- {upgini-1.1.244a4 → upgini-1.1.244a6}/src/upgini/utils/display_utils.py +0 -0
- {upgini-1.1.244a4 → upgini-1.1.244a6}/src/upgini/utils/email_utils.py +0 -0
- {upgini-1.1.244a4 → upgini-1.1.244a6}/src/upgini/utils/fallback_progress_bar.py +0 -0
- {upgini-1.1.244a4 → upgini-1.1.244a6}/src/upgini/utils/features_validator.py +0 -0
- {upgini-1.1.244a4 → upgini-1.1.244a6}/src/upgini/utils/format.py +0 -0
- {upgini-1.1.244a4 → upgini-1.1.244a6}/src/upgini/utils/ip_utils.py +0 -0
- {upgini-1.1.244a4 → upgini-1.1.244a6}/src/upgini/utils/phone_utils.py +0 -0
- {upgini-1.1.244a4 → upgini-1.1.244a6}/src/upgini/utils/postal_code_utils.py +0 -0
- {upgini-1.1.244a4 → upgini-1.1.244a6}/src/upgini/utils/progress_bar.py +0 -0
- {upgini-1.1.244a4 → upgini-1.1.244a6}/src/upgini/utils/sklearn_ext.py +0 -0
- {upgini-1.1.244a4 → upgini-1.1.244a6}/src/upgini/utils/target_utils.py +0 -0
- {upgini-1.1.244a4 → upgini-1.1.244a6}/src/upgini/utils/track_info.py +0 -0
- {upgini-1.1.244a4 → upgini-1.1.244a6}/src/upgini/utils/warning_counter.py +0 -0
- {upgini-1.1.244a4 → upgini-1.1.244a6}/src/upgini/version_validator.py +0 -0
- {upgini-1.1.244a4 → upgini-1.1.244a6}/src/upgini.egg-info/SOURCES.txt +0 -0
- {upgini-1.1.244a4 → upgini-1.1.244a6}/src/upgini.egg-info/dependency_links.txt +0 -0
- {upgini-1.1.244a4 → upgini-1.1.244a6}/src/upgini.egg-info/requires.txt +0 -0
- {upgini-1.1.244a4 → upgini-1.1.244a6}/src/upgini.egg-info/top_level.txt +0 -0
- {upgini-1.1.244a4 → upgini-1.1.244a6}/tests/test_binary_dataset.py +0 -0
- {upgini-1.1.244a4 → upgini-1.1.244a6}/tests/test_blocked_time_series.py +0 -0
- {upgini-1.1.244a4 → upgini-1.1.244a6}/tests/test_categorical_dataset.py +0 -0
- {upgini-1.1.244a4 → upgini-1.1.244a6}/tests/test_continuous_dataset.py +0 -0
- {upgini-1.1.244a4 → upgini-1.1.244a6}/tests/test_country_utils.py +0 -0
- {upgini-1.1.244a4 → upgini-1.1.244a6}/tests/test_custom_loss_utils.py +0 -0
- {upgini-1.1.244a4 → upgini-1.1.244a6}/tests/test_datetime_utils.py +0 -0
- {upgini-1.1.244a4 → upgini-1.1.244a6}/tests/test_email_utils.py +0 -0
- {upgini-1.1.244a4 → upgini-1.1.244a6}/tests/test_etalon_validation.py +0 -0
- {upgini-1.1.244a4 → upgini-1.1.244a6}/tests/test_features_enricher.py +0 -0
- {upgini-1.1.244a4 → upgini-1.1.244a6}/tests/test_metrics.py +0 -0
- {upgini-1.1.244a4 → upgini-1.1.244a6}/tests/test_phone_utils.py +0 -0
- {upgini-1.1.244a4 → upgini-1.1.244a6}/tests/test_postal_code_utils.py +0 -0
- {upgini-1.1.244a4 → upgini-1.1.244a6}/tests/test_widget.py +0 -0
|
@@ -40,6 +40,7 @@ from upgini.metadata import (
|
|
|
40
40
|
EVAL_SET_INDEX,
|
|
41
41
|
ORIGINAL_INDEX,
|
|
42
42
|
RENAMED_INDEX,
|
|
43
|
+
SORT_ID,
|
|
43
44
|
SYSTEM_RECORD_ID,
|
|
44
45
|
TARGET,
|
|
45
46
|
CVType,
|
|
@@ -1299,7 +1300,11 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1299
1300
|
c
|
|
1300
1301
|
for c in X_sampled.columns.to_list()
|
|
1301
1302
|
if c
|
|
1302
|
-
not in (
|
|
1303
|
+
not in (
|
|
1304
|
+
excluding_search_keys
|
|
1305
|
+
+ list(self.fit_dropped_features)
|
|
1306
|
+
+ [DateTimeSearchKeyConverter.DATETIME_COL, SYSTEM_RECORD_ID]
|
|
1307
|
+
)
|
|
1303
1308
|
]
|
|
1304
1309
|
|
|
1305
1310
|
filtered_enriched_features = self.__filtered_enriched_features(
|
|
@@ -1307,10 +1312,10 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1307
1312
|
max_features,
|
|
1308
1313
|
)
|
|
1309
1314
|
|
|
1310
|
-
X_sorted, y_sorted = self.
|
|
1311
|
-
enriched_X_sorted, enriched_y_sorted = self.
|
|
1315
|
+
X_sorted, y_sorted = self._sort_by_system_record_id(X_sampled, y_sampled, self.cv)
|
|
1316
|
+
enriched_X_sorted, enriched_y_sorted = self._sort_by_system_record_id(enriched_X, y_sampled, self.cv)
|
|
1312
1317
|
|
|
1313
|
-
group_columns = sorted(self._get_group_columns(search_keys))
|
|
1318
|
+
group_columns = sorted(self._get_group_columns(enriched_X_sorted, search_keys))
|
|
1314
1319
|
groups = (
|
|
1315
1320
|
None
|
|
1316
1321
|
if not group_columns or self.cv != CVType.group_k_fold
|
|
@@ -1332,9 +1337,9 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1332
1337
|
fitting_eval_set_dict = dict()
|
|
1333
1338
|
for idx, eval_tuple in eval_set_sampled_dict.items():
|
|
1334
1339
|
eval_X_sampled, enriched_eval_X, eval_y_sampled = eval_tuple
|
|
1335
|
-
eval_X_sorted, eval_y_sorted = self.
|
|
1336
|
-
enriched_eval_X_sorted, enriched_eval_y_sorted = self.
|
|
1337
|
-
enriched_eval_X, eval_y_sampled,
|
|
1340
|
+
eval_X_sorted, eval_y_sorted = self._sort_by_system_record_id(eval_X_sampled, eval_y_sampled, self.cv)
|
|
1341
|
+
enriched_eval_X_sorted, enriched_eval_y_sorted = self._sort_by_system_record_id(
|
|
1342
|
+
enriched_eval_X, eval_y_sampled, self.cv
|
|
1338
1343
|
)
|
|
1339
1344
|
fitting_eval_X = eval_X_sorted[client_features].copy()
|
|
1340
1345
|
fitting_enriched_eval_X = enriched_eval_X_sorted[
|
|
@@ -1470,19 +1475,15 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1470
1475
|
self.df_with_original_index,
|
|
1471
1476
|
fit_features,
|
|
1472
1477
|
rows_to_drop=rows_to_drop,
|
|
1478
|
+
drop_system_record_id=False,
|
|
1473
1479
|
)
|
|
1474
1480
|
|
|
1475
|
-
|
|
1476
|
-
|
|
1477
|
-
]
|
|
1478
|
-
enriched_X = drop_existing_columns(enriched_Xy, TARGET)
|
|
1479
|
-
|
|
1480
|
-
Xy_sampled = original_df_sampled.query(f"{EVAL_SET_INDEX} == 0")
|
|
1481
|
-
else:
|
|
1482
|
-
Xy_sampled = original_df_sampled
|
|
1483
|
-
X_sampled = drop_existing_columns(Xy_sampled, [SYSTEM_RECORD_ID, EVAL_SET_INDEX, TARGET])
|
|
1481
|
+
x_columns = [c for c in self.df_with_original_index.columns if c not in [EVAL_SET_INDEX, TARGET]]
|
|
1482
|
+
X_sampled = enriched_Xy[x_columns].copy()
|
|
1483
|
+
y_sampled = enriched_Xy[TARGET].copy()
|
|
1484
|
+
enriched_X = drop_existing_columns(enriched_Xy, [TARGET, EVAL_SET_INDEX])
|
|
1485
|
+
|
|
1484
1486
|
search_keys = self.fit_search_keys
|
|
1485
|
-
y_sampled = Xy_sampled[TARGET].copy()
|
|
1486
1487
|
|
|
1487
1488
|
self.logger.info(f"Shape of enriched_X: {enriched_X.shape}")
|
|
1488
1489
|
self.logger.info(f"Shape of X after sampling: {X_sampled.shape}")
|
|
@@ -1495,10 +1496,9 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1495
1496
|
)
|
|
1496
1497
|
|
|
1497
1498
|
for idx in range(len(eval_set)):
|
|
1498
|
-
|
|
1499
|
-
|
|
1500
|
-
|
|
1501
|
-
eval_y_sampled = eval_Xy_sampled[TARGET].copy()
|
|
1499
|
+
eval_X_sampled = enriched_eval_sets[idx + 1][x_columns].copy()
|
|
1500
|
+
eval_y_sampled = enriched_eval_sets[idx + 1][TARGET].copy()
|
|
1501
|
+
enriched_eval_X = enriched_eval_sets[idx + 1].drop(columns=[TARGET, EVAL_SET_INDEX])
|
|
1502
1502
|
eval_set_sampled_dict[idx] = (eval_X_sampled, enriched_eval_X, eval_y_sampled)
|
|
1503
1503
|
|
|
1504
1504
|
self.__cached_sampled_datasets = (X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys)
|
|
@@ -1542,28 +1542,23 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1542
1542
|
n=Dataset.FIT_SAMPLE_WITH_EVAL_SET_ROWS, random_state=self.random_state
|
|
1543
1543
|
)
|
|
1544
1544
|
|
|
1545
|
-
|
|
1546
|
-
|
|
1547
|
-
)
|
|
1548
|
-
X_sampled, search_keys = self._extend_x(X_sampled, is_demo_dataset)
|
|
1549
|
-
y_sampled = df_with_eval_set_index.query(f"{EVAL_SET_INDEX} == 0").copy()[TARGET]
|
|
1545
|
+
df_extended, search_keys = self._extend_x(df_with_eval_set_index, is_demo_dataset)
|
|
1546
|
+
|
|
1550
1547
|
eval_set_sampled_dict = dict()
|
|
1551
|
-
for idx in range(len(eval_set)):
|
|
1552
|
-
eval_x_sampled = (
|
|
1553
|
-
df_with_eval_set_index[df_with_eval_set_index[EVAL_SET_INDEX] == (idx + 1)]
|
|
1554
|
-
.copy()
|
|
1555
|
-
.drop(columns=[EVAL_SET_INDEX, TARGET])
|
|
1556
|
-
)
|
|
1557
|
-
eval_x_sampled, _ = self._extend_x(eval_x_sampled, is_demo_dataset)
|
|
1558
|
-
eval_y_sampled = df_with_eval_set_index[df_with_eval_set_index[EVAL_SET_INDEX] == (idx + 1)].copy()[
|
|
1559
|
-
TARGET
|
|
1560
|
-
]
|
|
1561
|
-
eval_set_sampled_dict[idx] = (eval_x_sampled, eval_y_sampled)
|
|
1562
1548
|
|
|
1563
|
-
|
|
1549
|
+
df_extended = self.__add_fit_system_record_id(df_extended, dict(), search_keys)
|
|
1550
|
+
|
|
1551
|
+
x_columns = [
|
|
1552
|
+
c
|
|
1553
|
+
for c in df_extended.columns
|
|
1554
|
+
if c not in [TARGET, EVAL_SET_INDEX, DateTimeSearchKeyConverter.DATETIME_COL]
|
|
1555
|
+
]
|
|
1556
|
+
|
|
1557
|
+
tmp_target_name = "__target"
|
|
1558
|
+
df_extended.rename(columns={SYSTEM_RECORD_ID: SORT_ID, TARGET: tmp_target_name}, inplace=True)
|
|
1564
1559
|
|
|
1565
|
-
|
|
1566
|
-
|
|
1560
|
+
enriched_df = self.transform(
|
|
1561
|
+
df_extended,
|
|
1567
1562
|
exclude_features_sources=exclude_features_sources,
|
|
1568
1563
|
silent_mode=True,
|
|
1569
1564
|
trace_id=trace_id,
|
|
@@ -1571,16 +1566,21 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1571
1566
|
progress_bar=progress_bar,
|
|
1572
1567
|
progress_callback=progress_callback,
|
|
1573
1568
|
)
|
|
1574
|
-
if
|
|
1569
|
+
if enriched_df is None:
|
|
1575
1570
|
return None
|
|
1576
1571
|
|
|
1577
|
-
|
|
1578
|
-
|
|
1572
|
+
enriched_df.rename(columns={SORT_ID: SYSTEM_RECORD_ID, tmp_target_name: TARGET}, inplace=True)
|
|
1573
|
+
|
|
1574
|
+
enriched_Xy = enriched_df.query(f"{EVAL_SET_INDEX} == 0")
|
|
1575
|
+
X_sampled = enriched_Xy[x_columns].copy()
|
|
1576
|
+
y_sampled = enriched_Xy[TARGET].copy()
|
|
1577
|
+
enriched_X = enriched_Xy.drop(columns=[TARGET, EVAL_SET_INDEX])
|
|
1579
1578
|
|
|
1580
1579
|
for idx in range(len(eval_set)):
|
|
1581
|
-
|
|
1582
|
-
|
|
1583
|
-
|
|
1580
|
+
enriched_eval_xy = enriched_df.query(f"{EVAL_SET_INDEX} == {idx + 1}")
|
|
1581
|
+
eval_x_sampled = enriched_eval_xy[x_columns].copy()
|
|
1582
|
+
eval_y_sampled = enriched_eval_xy[TARGET].copy()
|
|
1583
|
+
enriched_eval_x = enriched_eval_xy.drop(columns=[TARGET, EVAL_SET_INDEX])
|
|
1584
1584
|
eval_set_sampled_dict[idx] = (eval_x_sampled, enriched_eval_x, eval_y_sampled)
|
|
1585
1585
|
else:
|
|
1586
1586
|
self.logger.info("Transform without eval_set")
|
|
@@ -1592,14 +1592,17 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1592
1592
|
self.logger.info(f"Downsampling from {num_samples} to {Dataset.FIT_SAMPLE_ROWS}")
|
|
1593
1593
|
df = df.sample(n=Dataset.FIT_SAMPLE_ROWS, random_state=self.random_state)
|
|
1594
1594
|
|
|
1595
|
-
|
|
1596
|
-
|
|
1597
|
-
|
|
1595
|
+
df_extended, search_keys = self._extend_x(df, is_demo_dataset)
|
|
1596
|
+
|
|
1597
|
+
df_extended = self.__add_fit_system_record_id(df_extended, dict(), search_keys)
|
|
1598
1598
|
|
|
1599
|
-
|
|
1599
|
+
x_columns = [c for c in df_extended if c not in [TARGET, DateTimeSearchKeyConverter.DATETIME_COL]]
|
|
1600
1600
|
|
|
1601
|
-
|
|
1602
|
-
|
|
1601
|
+
tmp_target_name = "__target"
|
|
1602
|
+
df_extended.rename(columns={SYSTEM_RECORD_ID: SORT_ID, TARGET: tmp_target_name}, inplace=True)
|
|
1603
|
+
|
|
1604
|
+
enriched_Xy = self.transform(
|
|
1605
|
+
df_extended,
|
|
1603
1606
|
exclude_features_sources=exclude_features_sources,
|
|
1604
1607
|
silent_mode=True,
|
|
1605
1608
|
trace_id=trace_id,
|
|
@@ -1607,9 +1610,15 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1607
1610
|
progress_bar=progress_bar,
|
|
1608
1611
|
progress_callback=progress_callback,
|
|
1609
1612
|
)
|
|
1610
|
-
if
|
|
1613
|
+
if enriched_Xy is None:
|
|
1611
1614
|
return None
|
|
1612
1615
|
|
|
1616
|
+
enriched_Xy.rename(columns={SORT_ID: SYSTEM_RECORD_ID, tmp_target_name: TARGET}, inplace=True)
|
|
1617
|
+
|
|
1618
|
+
X_sampled = enriched_Xy[x_columns].copy() # check that all columns are present
|
|
1619
|
+
y_sampled = enriched_Xy[TARGET].copy()
|
|
1620
|
+
enriched_X = enriched_Xy.drop(columns=TARGET)
|
|
1621
|
+
|
|
1613
1622
|
self.__cached_sampled_datasets = (X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys)
|
|
1614
1623
|
|
|
1615
1624
|
return self.__mk_sampled_data_tuple(X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys)
|
|
@@ -2153,7 +2162,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2153
2162
|
|
|
2154
2163
|
dataset = Dataset(
|
|
2155
2164
|
"tds_" + str(uuid.uuid4()),
|
|
2156
|
-
df=df,
|
|
2165
|
+
df=df,
|
|
2157
2166
|
model_task_type=model_task_type,
|
|
2158
2167
|
date_format=self.date_format,
|
|
2159
2168
|
random_state=self.random_state,
|
|
@@ -2323,7 +2332,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2323
2332
|
elif (
|
|
2324
2333
|
self.cv is None
|
|
2325
2334
|
and model_task_type != ModelTaskType.REGRESSION
|
|
2326
|
-
and self._get_group_columns(self.fit_search_keys)
|
|
2335
|
+
and self._get_group_columns(df, self.fit_search_keys)
|
|
2327
2336
|
):
|
|
2328
2337
|
msg = bundle.get("group_k_fold_in_classification")
|
|
2329
2338
|
self.__override_cv(CVType.group_k_fold, msg, print_warning=self.cv is not None)
|
|
@@ -2508,6 +2517,23 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2508
2517
|
Xy = pd.merge(Xy, enriched_X, left_index=True, right_index=True, how="inner", suffixes=("", "enriched"))
|
|
2509
2518
|
return Xy[X.columns].copy(), Xy[TARGET].copy()
|
|
2510
2519
|
|
|
2520
|
+
@staticmethod
|
|
2521
|
+
def _sort_by_system_record_id(
|
|
2522
|
+
X: pd.DataFrame, y: pd.Series, cv: Optional[CVType]
|
|
2523
|
+
) -> Tuple[pd.DataFrame, pd.Series]:
|
|
2524
|
+
if cv not in [CVType.time_series, CVType.blocked_time_series]:
|
|
2525
|
+
Xy = X.copy()
|
|
2526
|
+
Xy[TARGET] = y
|
|
2527
|
+
Xy = Xy.sort_values(by=SYSTEM_RECORD_ID).reset_index(drop=True)
|
|
2528
|
+
X = Xy.drop(columns=TARGET)
|
|
2529
|
+
y = Xy[TARGET].copy()
|
|
2530
|
+
|
|
2531
|
+
if DateTimeSearchKeyConverter.DATETIME_COL in X.columns:
|
|
2532
|
+
X.drop(columns=DateTimeSearchKeyConverter.DATETIME_COL, inplace=True)
|
|
2533
|
+
|
|
2534
|
+
return X, y
|
|
2535
|
+
|
|
2536
|
+
# Deprecated
|
|
2511
2537
|
@staticmethod
|
|
2512
2538
|
def _sort_by_keys(
|
|
2513
2539
|
X: pd.DataFrame, y: pd.Series, search_keys: Dict[str, SearchKey], cv: Optional[CVType]
|
|
@@ -2646,8 +2672,12 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2646
2672
|
return col
|
|
2647
2673
|
|
|
2648
2674
|
@staticmethod
|
|
2649
|
-
def _get_group_columns(search_keys: Dict[str, SearchKey]) -> List[str]:
|
|
2650
|
-
return [
|
|
2675
|
+
def _get_group_columns(df: pd.DataFrame, search_keys: Dict[str, SearchKey]) -> List[str]:
|
|
2676
|
+
return [
|
|
2677
|
+
col
|
|
2678
|
+
for col, t in search_keys.items()
|
|
2679
|
+
if t not in [SearchKey.DATE, SearchKey.DATETIME] and df[col].nunique() > 1
|
|
2680
|
+
]
|
|
2651
2681
|
|
|
2652
2682
|
@staticmethod
|
|
2653
2683
|
def _get_email_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
|
|
@@ -2684,7 +2714,15 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2684
2714
|
date_column = self._get_date_column(search_keys)
|
|
2685
2715
|
sort_columns = [date_column] if date_column is not None else []
|
|
2686
2716
|
|
|
2687
|
-
other_search_keys = sorted(
|
|
2717
|
+
other_search_keys = sorted(
|
|
2718
|
+
[
|
|
2719
|
+
sk
|
|
2720
|
+
for sk, key_type in search_keys.items()
|
|
2721
|
+
if key_type not in [SearchKey.DATE, SearchKey.DATETIME]
|
|
2722
|
+
and sk in df.columns
|
|
2723
|
+
and df[sk].nunique() > 1 # don't use constant keys for hash
|
|
2724
|
+
]
|
|
2725
|
+
)
|
|
2688
2726
|
|
|
2689
2727
|
search_keys_hash = "search_keys_hash"
|
|
2690
2728
|
if len(other_search_keys) > 0:
|
|
@@ -2751,6 +2789,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2751
2789
|
X: Optional[pd.DataFrame] = None,
|
|
2752
2790
|
is_transform=False,
|
|
2753
2791
|
rows_to_drop: Optional[pd.DataFrame] = None,
|
|
2792
|
+
drop_system_record_id=True,
|
|
2754
2793
|
) -> Tuple[pd.DataFrame, Dict[int, pd.DataFrame]]:
|
|
2755
2794
|
if result_features is None:
|
|
2756
2795
|
self.logger.error(f"result features not found by search_task_id: {self.get_search_id()}")
|
|
@@ -4,13 +4,14 @@ from typing import Dict, List, Optional, Set
|
|
|
4
4
|
from pydantic import BaseModel
|
|
5
5
|
|
|
6
6
|
SYSTEM_RECORD_ID = "system_record_id"
|
|
7
|
+
SORT_ID = "sort_id"
|
|
7
8
|
EVAL_SET_INDEX = "eval_set_index"
|
|
8
9
|
TARGET = "target"
|
|
9
10
|
COUNTRY = "country_iso_code"
|
|
10
11
|
RENAMED_INDEX = "index_col"
|
|
11
12
|
DEFAULT_INDEX = "index"
|
|
12
13
|
ORIGINAL_INDEX = "original_index"
|
|
13
|
-
SYSTEM_COLUMNS = {SYSTEM_RECORD_ID, EVAL_SET_INDEX, TARGET, COUNTRY}
|
|
14
|
+
SYSTEM_COLUMNS = {SYSTEM_RECORD_ID, EVAL_SET_INDEX, TARGET, COUNTRY, SORT_ID}
|
|
14
15
|
|
|
15
16
|
|
|
16
17
|
class FileColumnMeaningType(Enum):
|
|
@@ -318,6 +318,7 @@ class EstimatorWrapper:
|
|
|
318
318
|
X: pd.DataFrame,
|
|
319
319
|
scoring: Union[Callable, str, None] = None,
|
|
320
320
|
cat_features: Optional[List[str]] = None,
|
|
321
|
+
text_features: Optional[List[str]] = None,
|
|
321
322
|
add_params: Optional[Dict[str, Any]] = None,
|
|
322
323
|
groups: Optional[List[str]] = None,
|
|
323
324
|
) -> "EstimatorWrapper":
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|