upgini 1.1.244a5__py3-none-any.whl → 1.1.244a6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of upgini might be problematic.

upgini/features_enricher.py CHANGED
@@ -40,6 +40,7 @@ from upgini.metadata import (
  EVAL_SET_INDEX,
  ORIGINAL_INDEX,
  RENAMED_INDEX,
+ SORT_ID,
  SYSTEM_RECORD_ID,
  TARGET,
  CVType,
@@ -1299,7 +1300,11 @@ class FeaturesEnricher(TransformerMixin):
  c
  for c in X_sampled.columns.to_list()
  if c
- not in (excluding_search_keys + list(self.fit_dropped_features) + [DateTimeSearchKeyConverter.DATETIME_COL])
+ not in (
+ excluding_search_keys
+ + list(self.fit_dropped_features)
+ + [DateTimeSearchKeyConverter.DATETIME_COL, SYSTEM_RECORD_ID]
+ )
  ]

  filtered_enriched_features = self.__filtered_enriched_features(
@@ -1307,11 +1312,10 @@ class FeaturesEnricher(TransformerMixin):
  max_features,
  )

- X_sorted, y_sorted = self._sort_by_keys(X_sampled, y_sampled, search_keys, self.cv)
- enriched_X_extended, _ = self._extend_x(enriched_X, is_demo_dataset)
- enriched_X_sorted, enriched_y_sorted = self._sort_by_keys(enriched_X_extended, y_sampled, search_keys, self.cv)
+ X_sorted, y_sorted = self._sort_by_system_record_id(X_sampled, y_sampled, self.cv)
+ enriched_X_sorted, enriched_y_sorted = self._sort_by_system_record_id(enriched_X, y_sampled, self.cv)

- group_columns = sorted(self._get_group_columns(search_keys))
+ group_columns = sorted(self._get_group_columns(enriched_X_sorted, search_keys))
  groups = (
  None
  if not group_columns or self.cv != CVType.group_k_fold
@@ -1333,10 +1337,9 @@ class FeaturesEnricher(TransformerMixin):
  fitting_eval_set_dict = dict()
  for idx, eval_tuple in eval_set_sampled_dict.items():
  eval_X_sampled, enriched_eval_X, eval_y_sampled = eval_tuple
- eval_X_sorted, eval_y_sorted = self._sort_by_keys(eval_X_sampled, eval_y_sampled, search_keys, self.cv)
- enriched_eval_X_extended, _ = self._extend_x(enriched_eval_X, is_demo_dataset)
- enriched_eval_X_sorted, enriched_eval_y_sorted = self._sort_by_keys(
- enriched_eval_X_extended, eval_y_sampled, search_keys, self.cv
+ eval_X_sorted, eval_y_sorted = self._sort_by_system_record_id(eval_X_sampled, eval_y_sampled, self.cv)
+ enriched_eval_X_sorted, enriched_eval_y_sorted = self._sort_by_system_record_id(
+ enriched_eval_X, eval_y_sampled, self.cv
  )
  fitting_eval_X = eval_X_sorted[client_features].copy()
  fitting_enriched_eval_X = enriched_eval_X_sorted[
@@ -1472,19 +1475,15 @@ class FeaturesEnricher(TransformerMixin):
  self.df_with_original_index,
  fit_features,
  rows_to_drop=rows_to_drop,
+ drop_system_record_id=False,
  )

- original_df_sampled = self.df_with_original_index[
- self.df_with_original_index[SYSTEM_RECORD_ID].isin(fit_features[SYSTEM_RECORD_ID])
- ]
- enriched_X = drop_existing_columns(enriched_Xy, TARGET)
- if EVAL_SET_INDEX in original_df_sampled.columns:
- Xy_sampled = original_df_sampled.query(f"{EVAL_SET_INDEX} == 0")
- else:
- Xy_sampled = original_df_sampled
- X_sampled = drop_existing_columns(Xy_sampled, [SYSTEM_RECORD_ID, EVAL_SET_INDEX, TARGET])
+ x_columns = [c for c in self.df_with_original_index.columns if c not in [EVAL_SET_INDEX, TARGET]]
+ X_sampled = enriched_Xy[x_columns].copy()
+ y_sampled = enriched_Xy[TARGET].copy()
+ enriched_X = drop_existing_columns(enriched_Xy, [TARGET, EVAL_SET_INDEX])
+
  search_keys = self.fit_search_keys
- y_sampled = Xy_sampled[TARGET].copy()

  self.logger.info(f"Shape of enriched_X: {enriched_X.shape}")
  self.logger.info(f"Shape of X after sampling: {X_sampled.shape}")
@@ -1497,10 +1496,9 @@ class FeaturesEnricher(TransformerMixin):
  )

  for idx in range(len(eval_set)):
- enriched_eval_X = drop_existing_columns(enriched_eval_sets[idx + 1], TARGET)
- eval_Xy_sampled = original_df_sampled.query(f"{EVAL_SET_INDEX} == {idx + 1}")
- eval_X_sampled = drop_existing_columns(eval_Xy_sampled, [SYSTEM_RECORD_ID, EVAL_SET_INDEX, TARGET])
- eval_y_sampled = eval_Xy_sampled[TARGET].copy()
+ eval_X_sampled = enriched_eval_sets[idx + 1][x_columns].copy()
+ eval_y_sampled = enriched_eval_sets[idx + 1][TARGET].copy()
+ enriched_eval_X = enriched_eval_sets[idx + 1].drop(columns=[TARGET, EVAL_SET_INDEX])
  eval_set_sampled_dict[idx] = (eval_X_sampled, enriched_eval_X, eval_y_sampled)

  self.__cached_sampled_datasets = (X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys)
@@ -1544,28 +1542,23 @@ class FeaturesEnricher(TransformerMixin):
  n=Dataset.FIT_SAMPLE_WITH_EVAL_SET_ROWS, random_state=self.random_state
  )

- X_sampled = (
- df_with_eval_set_index.query(f"{EVAL_SET_INDEX} == 0").copy().drop(columns=[EVAL_SET_INDEX, TARGET])
- )
- X_sampled, search_keys = self._extend_x(X_sampled, is_demo_dataset)
- y_sampled = df_with_eval_set_index.query(f"{EVAL_SET_INDEX} == 0").copy()[TARGET]
+ df_extended, search_keys = self._extend_x(df_with_eval_set_index, is_demo_dataset)
+
  eval_set_sampled_dict = dict()
- for idx in range(len(eval_set)):
- eval_x_sampled = (
- df_with_eval_set_index[df_with_eval_set_index[EVAL_SET_INDEX] == (idx + 1)]
- .copy()
- .drop(columns=[EVAL_SET_INDEX, TARGET])
- )
- eval_x_sampled, _ = self._extend_x(eval_x_sampled, is_demo_dataset)
- eval_y_sampled = df_with_eval_set_index[df_with_eval_set_index[EVAL_SET_INDEX] == (idx + 1)].copy()[
- TARGET
- ]
- eval_set_sampled_dict[idx] = (eval_x_sampled, eval_y_sampled)

- df_with_eval_set_index.drop(columns=TARGET, inplace=True)
+ df_extended = self.__add_fit_system_record_id(df_extended, dict(), search_keys)
+
+ x_columns = [
+ c
+ for c in df_extended.columns
+ if c not in [TARGET, EVAL_SET_INDEX, DateTimeSearchKeyConverter.DATETIME_COL]
+ ]
+
+ tmp_target_name = "__target"
+ df_extended.rename(columns={SYSTEM_RECORD_ID: SORT_ID, TARGET: tmp_target_name}, inplace=True)

- enriched = self.transform(
- df_with_eval_set_index,
+ enriched_df = self.transform(
+ df_extended,
  exclude_features_sources=exclude_features_sources,
  silent_mode=True,
  trace_id=trace_id,
@@ -1573,16 +1566,21 @@ class FeaturesEnricher(TransformerMixin):
  progress_bar=progress_bar,
  progress_callback=progress_callback,
  )
- if enriched is None:
+ if enriched_df is None:
  return None

- enriched_X = enriched[enriched[EVAL_SET_INDEX] == 0].copy()
- enriched_X.drop(columns=EVAL_SET_INDEX, inplace=True)
+ enriched_df.rename(columns={SORT_ID: SYSTEM_RECORD_ID, tmp_target_name: TARGET}, inplace=True)
+
+ enriched_Xy = enriched_df.query(f"{EVAL_SET_INDEX} == 0")
+ X_sampled = enriched_Xy[x_columns].copy()
+ y_sampled = enriched_Xy[TARGET].copy()
+ enriched_X = enriched_Xy.drop(columns=[TARGET, EVAL_SET_INDEX])

  for idx in range(len(eval_set)):
- enriched_eval_x = enriched[enriched[EVAL_SET_INDEX] == (idx + 1)].copy()
- enriched_eval_x.drop(columns=EVAL_SET_INDEX, inplace=True)
- eval_x_sampled, eval_y_sampled = eval_set_sampled_dict[idx]
+ enriched_eval_xy = enriched_df.query(f"{EVAL_SET_INDEX} == {idx + 1}")
+ eval_x_sampled = enriched_eval_xy[x_columns].copy()
+ eval_y_sampled = enriched_eval_xy[TARGET].copy()
+ enriched_eval_x = enriched_eval_xy.drop(columns=[TARGET, EVAL_SET_INDEX])
  eval_set_sampled_dict[idx] = (eval_x_sampled, enriched_eval_x, eval_y_sampled)
  else:
  self.logger.info("Transform without eval_set")
@@ -1594,14 +1592,17 @@ class FeaturesEnricher(TransformerMixin):
  self.logger.info(f"Downsampling from {num_samples} to {Dataset.FIT_SAMPLE_ROWS}")
  df = df.sample(n=Dataset.FIT_SAMPLE_ROWS, random_state=self.random_state)

- X_sampled = df.copy().drop(columns=TARGET)
- X_sampled, search_keys = self._extend_x(X_sampled, is_demo_dataset)
- y_sampled = df.copy()[TARGET]
+ df_extended, search_keys = self._extend_x(df, is_demo_dataset)
+
+ df_extended = self.__add_fit_system_record_id(df_extended, dict(), search_keys)

- df.drop(columns=TARGET, inplace=True)
+ x_columns = [c for c in df_extended if c not in [TARGET, DateTimeSearchKeyConverter.DATETIME_COL]]

- enriched_X = self.transform(
- df,
+ tmp_target_name = "__target"
+ df_extended.rename(columns={SYSTEM_RECORD_ID: SORT_ID, TARGET: tmp_target_name}, inplace=True)
+
+ enriched_Xy = self.transform(
+ df_extended,
  exclude_features_sources=exclude_features_sources,
  silent_mode=True,
  trace_id=trace_id,
@@ -1609,9 +1610,15 @@ class FeaturesEnricher(TransformerMixin):
  progress_bar=progress_bar,
  progress_callback=progress_callback,
  )
- if enriched_X is None:
+ if enriched_Xy is None:
  return None

+ enriched_Xy.rename(columns={SORT_ID: SYSTEM_RECORD_ID, tmp_target_name: TARGET}, inplace=True)
+
+ X_sampled = enriched_Xy[x_columns].copy() # check that all columns are present
+ y_sampled = enriched_Xy[TARGET].copy()
+ enriched_X = enriched_Xy.drop(columns=TARGET)
+
  self.__cached_sampled_datasets = (X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys)

  return self.__mk_sampled_data_tuple(X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys)
@@ -2155,7 +2162,7 @@ class FeaturesEnricher(TransformerMixin):

  dataset = Dataset(
  "tds_" + str(uuid.uuid4()),
- df=df, # type: ignore
+ df=df,
  model_task_type=model_task_type,
  date_format=self.date_format,
  random_state=self.random_state,
@@ -2325,7 +2332,7 @@ class FeaturesEnricher(TransformerMixin):
  elif (
  self.cv is None
  and model_task_type != ModelTaskType.REGRESSION
- and self._get_group_columns(self.fit_search_keys)
+ and self._get_group_columns(df, self.fit_search_keys)
  ):
  msg = bundle.get("group_k_fold_in_classification")
  self.__override_cv(CVType.group_k_fold, msg, print_warning=self.cv is not None)
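
When _get_group_columns finds usable (non-date, non-constant) keys and the task is not regression, the enricher overrides the CV scheme to group k-fold. For context, a minimal scikit-learn sketch of group-aware splitting; the "phone" column here is a made-up stand-in for a search key used as the group, not something taken from upgini:

    import pandas as pd
    from sklearn.model_selection import GroupKFold

    df = pd.DataFrame({
        "phone": ["a", "a", "b", "b", "c", "c"],  # hypothetical search key used as a group
        "feature": [1, 2, 3, 4, 5, 6],
        "target": [0, 1, 0, 1, 0, 1],
    })

    # All rows that share a group value stay on the same side of every split.
    gkf = GroupKFold(n_splits=3)
    for train_idx, test_idx in gkf.split(df[["feature"]], df["target"], groups=df["phone"]):
        assert set(df["phone"].iloc[train_idx]).isdisjoint(set(df["phone"].iloc[test_idx]))
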
@@ -2510,6 +2517,23 @@ class FeaturesEnricher(TransformerMixin):
  Xy = pd.merge(Xy, enriched_X, left_index=True, right_index=True, how="inner", suffixes=("", "enriched"))
  return Xy[X.columns].copy(), Xy[TARGET].copy()

+ @staticmethod
+ def _sort_by_system_record_id(
+ X: pd.DataFrame, y: pd.Series, cv: Optional[CVType]
+ ) -> Tuple[pd.DataFrame, pd.Series]:
+ if cv not in [CVType.time_series, CVType.blocked_time_series]:
+ Xy = X.copy()
+ Xy[TARGET] = y
+ Xy = Xy.sort_values(by=SYSTEM_RECORD_ID).reset_index(drop=True)
+ X = Xy.drop(columns=TARGET)
+ y = Xy[TARGET].copy()
+
+ if DateTimeSearchKeyConverter.DATETIME_COL in X.columns:
+ X.drop(columns=DateTimeSearchKeyConverter.DATETIME_COL, inplace=True)
+
+ return X, y
+
+ # Deprecated
  @staticmethod
  def _sort_by_keys(
  X: pd.DataFrame, y: pd.Series, search_keys: Dict[str, SearchKey], cv: Optional[CVType]
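
The new _sort_by_system_record_id helper sorts X and y together by system_record_id (skipped for time-series CV, where the existing order must be preserved) and drops the temporary datetime column. A toy illustration of the same joint sort outside the class, with made-up data:

    import pandas as pd

    SYSTEM_RECORD_ID = "system_record_id"
    TARGET = "target"

    X = pd.DataFrame({SYSTEM_RECORD_ID: [3, 1, 2], "feature": ["c", "a", "b"]})
    y = pd.Series([30, 10, 20], name=TARGET)

    # Join X and y, sort by the record id, then split back so rows stay aligned.
    Xy = X.copy()
    Xy[TARGET] = y
    Xy = Xy.sort_values(by=SYSTEM_RECORD_ID).reset_index(drop=True)
    X_sorted = Xy.drop(columns=TARGET)
    y_sorted = Xy[TARGET].copy()

    print(X_sorted["feature"].tolist())  # ['a', 'b', 'c']
    print(y_sorted.tolist())             # [10, 20, 30]
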
@@ -2648,8 +2672,12 @@ class FeaturesEnricher(TransformerMixin):
  return col

  @staticmethod
- def _get_group_columns(search_keys: Dict[str, SearchKey]) -> List[str]:
- return [col for col, t in search_keys.items() if t not in [SearchKey.DATE, SearchKey.DATETIME]]
+ def _get_group_columns(df: pd.DataFrame, search_keys: Dict[str, SearchKey]) -> List[str]:
+ return [
+ col
+ for col, t in search_keys.items()
+ if t not in [SearchKey.DATE, SearchKey.DATETIME] and df[col].nunique() > 1
+ ]

  @staticmethod
  def _get_email_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
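
_get_group_columns now receives the dataframe so it can drop keys that are constant in the sample, since a single-valued group adds nothing to group k-fold. A small standalone version of the same filter, using a simplified stand-in for SearchKey:

    from enum import Enum
    import pandas as pd

    class SearchKey(Enum):  # simplified stand-in for upgini.metadata.SearchKey
        DATE = "date"
        PHONE = "phone"
        COUNTRY = "country"

    def get_group_columns(df: pd.DataFrame, search_keys: dict) -> list:
        # Keep only non-date keys that actually vary in this dataframe.
        return [
            col
            for col, t in search_keys.items()
            if t not in [SearchKey.DATE] and df[col].nunique() > 1
        ]

    df = pd.DataFrame({"dt": ["2023-01-01"] * 3, "phone": ["a", "b", "a"], "country": ["US"] * 3})
    keys = {"dt": SearchKey.DATE, "phone": SearchKey.PHONE, "country": SearchKey.COUNTRY}
    print(get_group_columns(df, keys))  # ['phone'] — 'country' is constant, 'dt' is a date key
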
@@ -2686,7 +2714,15 @@ class FeaturesEnricher(TransformerMixin):
  date_column = self._get_date_column(search_keys)
  sort_columns = [date_column] if date_column is not None else []

- other_search_keys = sorted([sk for sk in search_keys.keys() if sk != date_column and sk in df.columns])
+ other_search_keys = sorted(
+ [
+ sk
+ for sk, key_type in search_keys.items()
+ if key_type not in [SearchKey.DATE, SearchKey.DATETIME]
+ and sk in df.columns
+ and df[sk].nunique() > 1 # don't use constant keys for hash
+ ]
+ )

  search_keys_hash = "search_keys_hash"
  if len(other_search_keys) > 0:
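
Per the hunk above, only non-date keys that actually vary in the data feed the per-row search_keys_hash used for sorting. The hashing itself is not shown in this hunk, so the sketch below uses pandas' built-in row hashing purely as an illustration; it may differ from what upgini actually does:

    import pandas as pd
    from pandas.util import hash_pandas_object

    df = pd.DataFrame({
        "date": ["2023-01-01", "2023-01-02", "2023-01-01"],
        "phone": ["a", "b", "a"],
        "country": ["US", "US", "US"],  # constant -> excluded from the hash
    })

    other_search_keys = sorted(
        c for c in ["phone", "country"] if c in df.columns and df[c].nunique() > 1
    )

    if other_search_keys:
        # One hash per row over the varying key columns; identical key tuples hash identically.
        df["search_keys_hash"] = hash_pandas_object(df[other_search_keys], index=False)

    print(df)
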
@@ -2753,6 +2789,7 @@ class FeaturesEnricher(TransformerMixin):
  X: Optional[pd.DataFrame] = None,
  is_transform=False,
  rows_to_drop: Optional[pd.DataFrame] = None,
+ drop_system_record_id=True,
  ) -> Tuple[pd.DataFrame, Dict[int, pd.DataFrame]]:
  if result_features is None:
  self.logger.error(f"result features not found by search_task_id: {self.get_search_id()}")
upgini/metadata.py CHANGED
@@ -4,13 +4,14 @@ from typing import Dict, List, Optional, Set
  from pydantic import BaseModel

  SYSTEM_RECORD_ID = "system_record_id"
+ SORT_ID = "sort_id"
  EVAL_SET_INDEX = "eval_set_index"
  TARGET = "target"
  COUNTRY = "country_iso_code"
  RENAMED_INDEX = "index_col"
  DEFAULT_INDEX = "index"
  ORIGINAL_INDEX = "original_index"
- SYSTEM_COLUMNS = {SYSTEM_RECORD_ID, EVAL_SET_INDEX, TARGET, COUNTRY}
+ SYSTEM_COLUMNS = {SYSTEM_RECORD_ID, EVAL_SET_INDEX, TARGET, COUNTRY, SORT_ID}


  class FileColumnMeaningType(Enum):
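
Adding SORT_ID to SYSTEM_COLUMNS means it is treated as a service column like system_record_id. A trivial sketch of the kind of filtering such a set enables; the drop_system_columns helper is illustrative only, not part of upgini:

    import pandas as pd

    SYSTEM_RECORD_ID = "system_record_id"
    SORT_ID = "sort_id"
    EVAL_SET_INDEX = "eval_set_index"
    TARGET = "target"
    COUNTRY = "country_iso_code"
    SYSTEM_COLUMNS = {SYSTEM_RECORD_ID, EVAL_SET_INDEX, TARGET, COUNTRY, SORT_ID}

    def drop_system_columns(df: pd.DataFrame) -> pd.DataFrame:
        # Illustrative helper: keep only user-facing feature columns.
        return df[[c for c in df.columns if c not in SYSTEM_COLUMNS]]

    df = pd.DataFrame({SORT_ID: [1], TARGET: [0], "feature": [42]})
    print(drop_system_columns(df).columns.tolist())  # ['feature']
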
upgini/metrics.py CHANGED
@@ -318,6 +318,7 @@ class EstimatorWrapper:
  X: pd.DataFrame,
  scoring: Union[Callable, str, None] = None,
  cat_features: Optional[List[str]] = None,
+ text_features: Optional[List[str]] = None,
  add_params: Optional[Dict[str, Any]] = None,
  groups: Optional[List[str]] = None,
  ) -> "EstimatorWrapper":
upgini-1.1.244a6.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: upgini
- Version: 1.1.244a5
+ Version: 1.1.244a6
  Summary: Intelligent data search & enrichment for Machine Learning
  Home-page: https://upgini.com/
  Author: Upgini Developers
upgini-1.1.244a6.dist-info/RECORD CHANGED
@@ -2,11 +2,11 @@ upgini/__init__.py,sha256=asENHgEVHQBIkV-e_0IhE_ZWqkCG6398U3ZLrNzAH6k,407
  upgini/ads.py,sha256=mre6xn44wcC_fg63iLT_kTh4mViZqR9AKRJZAtpQz8Y,2592
  upgini/dataset.py,sha256=1AEKFg2ooGnlBzmxX6sw-sDJdDoT8HfHWNYacwGHZGI,50023
  upgini/errors.py,sha256=pdzQl3MKuK52yvncxMWMRWeSIOGhUFzpQoszoRFBOk0,958
- upgini/features_enricher.py,sha256=tNPPAobN091LHV7mcnqR-QRdmrR5KTS0BJExgcshw2Y,162680
+ upgini/features_enricher.py,sha256=r5k_djjmoVwTJ6Xg9-k9jZfAKTcr1h_wWkI64vkO34E,163572
  upgini/fingerprint.js,sha256=VygVIQlN1v4NGZfjHqtRogOw8zjTnnMNJg_f7M5iGQU,33442
  upgini/http.py,sha256=eSG4gOpmCGlXmB6KIPNzAG8tRZNUjyYpMeUeHw_2li4,42264
- upgini/metadata.py,sha256=FZ5CQluLLWrfrBVThSIes1SW6wcs7n50aNZwzYnHiF0,9584
- upgini/metrics.py,sha256=cb-IY4QyNWq2F8i3vCEDPN8wyblknjVf61JnZKsMY50,23777
+ upgini/metadata.py,sha256=55t0uQI910tzTcnwxZCUL1413BhTiSm8oqiwp-94NyA,9613
+ upgini/metrics.py,sha256=GFAZNu5V-xrALTiju0vMMdM-ckysnF23ogUtLJRNV08,23828
  upgini/search_task.py,sha256=5n4qGJmtu48s0-FHAtF3L5qVLMd1JVW3FJlM8dFbh-s,17063
  upgini/spinner.py,sha256=Dm1dQ5F_z_Ua2odLxZX7OypcOX9tSx_vE5MGaKtUmfw,1118
  upgini/version_validator.py,sha256=rDIncP6BEko4J2F2hUcMOtKm_vZbI4ICWcNcw8hrwM4,1400
@@ -54,8 +54,8 @@ upgini/utils/sklearn_ext.py,sha256=IMx2La70AXAggApVpT7sMEjWqVWon5AMZt4MARDsIMQ,4
  upgini/utils/target_utils.py,sha256=qyj-bGsIEl9X2Vc5gwXtsuRaocvId8bn46F7mZ9dy9A,1707
  upgini/utils/track_info.py,sha256=EPcJ13Jqa17_T0JjM37Ac9kWDz5Zk0GVsIZKutOb8aU,5207
  upgini/utils/warning_counter.py,sha256=dIWBB4dI5XRRJZudvIlqlIYKEiwLLPcXarsZuYRt338,227
- upgini-1.1.244a5.dist-info/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
- upgini-1.1.244a5.dist-info/METADATA,sha256=v5xc7nTR4KsbPxf03HhCLxmZzUsYkViCbvBBp4Oahwk,48264
- upgini-1.1.244a5.dist-info/WHEEL,sha256=pkctZYzUS4AYVn6dJ-7367OJZivF2e8RA9b_ZBjif18,92
- upgini-1.1.244a5.dist-info/top_level.txt,sha256=OFhTGiDIWKl5gFI49qvWq1R9IKflPaE2PekcbDXDtx4,7
- upgini-1.1.244a5.dist-info/RECORD,,
+ upgini-1.1.244a6.dist-info/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
+ upgini-1.1.244a6.dist-info/METADATA,sha256=s3qLjYZ-bw2atIM2spsBKOwRW-Hq6LEGRSp0eR8WOc0,48264
+ upgini-1.1.244a6.dist-info/WHEEL,sha256=pkctZYzUS4AYVn6dJ-7367OJZivF2e8RA9b_ZBjif18,92
+ upgini-1.1.244a6.dist-info/top_level.txt,sha256=OFhTGiDIWKl5gFI49qvWq1R9IKflPaE2PekcbDXDtx4,7
+ upgini-1.1.244a6.dist-info/RECORD,,