upgini 1.1.244a4__py3-none-any.whl → 1.1.244a6__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package as they appear in the public registries, and is provided for informational purposes only.

Potentially problematic release: this version of upgini might be problematic.

@@ -40,6 +40,7 @@ from upgini.metadata import (
  EVAL_SET_INDEX,
  ORIGINAL_INDEX,
  RENAMED_INDEX,
+ SORT_ID,
  SYSTEM_RECORD_ID,
  TARGET,
  CVType,
@@ -1299,7 +1300,11 @@ class FeaturesEnricher(TransformerMixin):
  c
  for c in X_sampled.columns.to_list()
  if c
- not in (excluding_search_keys + list(self.fit_dropped_features) + [DateTimeSearchKeyConverter.DATETIME_COL])
+ not in (
+ excluding_search_keys
+ + list(self.fit_dropped_features)
+ + [DateTimeSearchKeyConverter.DATETIME_COL, SYSTEM_RECORD_ID]
+ )
  ]

  filtered_enriched_features = self.__filtered_enriched_features(
@@ -1307,10 +1312,10 @@ class FeaturesEnricher(TransformerMixin):
  max_features,
  )

- X_sorted, y_sorted = self._sort_by_keys(X_sampled, y_sampled, search_keys, self.cv)
- enriched_X_sorted, enriched_y_sorted = self._sort_by_keys(enriched_X, y_sampled, search_keys, self.cv)
+ X_sorted, y_sorted = self._sort_by_system_record_id(X_sampled, y_sampled, self.cv)
+ enriched_X_sorted, enriched_y_sorted = self._sort_by_system_record_id(enriched_X, y_sampled, self.cv)

- group_columns = sorted(self._get_group_columns(search_keys))
+ group_columns = sorted(self._get_group_columns(enriched_X_sorted, search_keys))
  groups = (
  None
  if not group_columns or self.cv != CVType.group_k_fold
@@ -1332,9 +1337,9 @@ class FeaturesEnricher(TransformerMixin):
  fitting_eval_set_dict = dict()
  for idx, eval_tuple in eval_set_sampled_dict.items():
  eval_X_sampled, enriched_eval_X, eval_y_sampled = eval_tuple
- eval_X_sorted, eval_y_sorted = self._sort_by_keys(eval_X_sampled, eval_y_sampled, search_keys, self.cv)
- enriched_eval_X_sorted, enriched_eval_y_sorted = self._sort_by_keys(
- enriched_eval_X, eval_y_sampled, search_keys, self.cv
+ eval_X_sorted, eval_y_sorted = self._sort_by_system_record_id(eval_X_sampled, eval_y_sampled, self.cv)
+ enriched_eval_X_sorted, enriched_eval_y_sorted = self._sort_by_system_record_id(
+ enriched_eval_X, eval_y_sampled, self.cv
  )
  fitting_eval_X = eval_X_sorted[client_features].copy()
  fitting_enriched_eval_X = enriched_eval_X_sorted[
@@ -1470,19 +1475,15 @@ class FeaturesEnricher(TransformerMixin):
  self.df_with_original_index,
  fit_features,
  rows_to_drop=rows_to_drop,
+ drop_system_record_id=False,
  )

- original_df_sampled = self.df_with_original_index[
- self.df_with_original_index[SYSTEM_RECORD_ID].isin(fit_features[SYSTEM_RECORD_ID])
- ]
- enriched_X = drop_existing_columns(enriched_Xy, TARGET)
- if EVAL_SET_INDEX in original_df_sampled.columns:
- Xy_sampled = original_df_sampled.query(f"{EVAL_SET_INDEX} == 0")
- else:
- Xy_sampled = original_df_sampled
- X_sampled = drop_existing_columns(Xy_sampled, [SYSTEM_RECORD_ID, EVAL_SET_INDEX, TARGET])
+ x_columns = [c for c in self.df_with_original_index.columns if c not in [EVAL_SET_INDEX, TARGET]]
+ X_sampled = enriched_Xy[x_columns].copy()
+ y_sampled = enriched_Xy[TARGET].copy()
+ enriched_X = drop_existing_columns(enriched_Xy, [TARGET, EVAL_SET_INDEX])
+
  search_keys = self.fit_search_keys
- y_sampled = Xy_sampled[TARGET].copy()

  self.logger.info(f"Shape of enriched_X: {enriched_X.shape}")
  self.logger.info(f"Shape of X after sampling: {X_sampled.shape}")
@@ -1495,10 +1496,9 @@ class FeaturesEnricher(TransformerMixin):
  )

  for idx in range(len(eval_set)):
- enriched_eval_X = drop_existing_columns(enriched_eval_sets[idx + 1], TARGET)
- eval_Xy_sampled = original_df_sampled.query(f"{EVAL_SET_INDEX} == {idx + 1}")
- eval_X_sampled = drop_existing_columns(eval_Xy_sampled, [SYSTEM_RECORD_ID, EVAL_SET_INDEX, TARGET])
- eval_y_sampled = eval_Xy_sampled[TARGET].copy()
+ eval_X_sampled = enriched_eval_sets[idx + 1][x_columns].copy()
+ eval_y_sampled = enriched_eval_sets[idx + 1][TARGET].copy()
+ enriched_eval_X = enriched_eval_sets[idx + 1].drop(columns=[TARGET, EVAL_SET_INDEX])
  eval_set_sampled_dict[idx] = (eval_X_sampled, enriched_eval_X, eval_y_sampled)

  self.__cached_sampled_datasets = (X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys)
@@ -1542,28 +1542,23 @@ class FeaturesEnricher(TransformerMixin):
  n=Dataset.FIT_SAMPLE_WITH_EVAL_SET_ROWS, random_state=self.random_state
  )

- X_sampled = (
- df_with_eval_set_index.query(f"{EVAL_SET_INDEX} == 0").copy().drop(columns=[EVAL_SET_INDEX, TARGET])
- )
- X_sampled, search_keys = self._extend_x(X_sampled, is_demo_dataset)
- y_sampled = df_with_eval_set_index.query(f"{EVAL_SET_INDEX} == 0").copy()[TARGET]
+ df_extended, search_keys = self._extend_x(df_with_eval_set_index, is_demo_dataset)
+
  eval_set_sampled_dict = dict()
- for idx in range(len(eval_set)):
- eval_x_sampled = (
- df_with_eval_set_index[df_with_eval_set_index[EVAL_SET_INDEX] == (idx + 1)]
- .copy()
- .drop(columns=[EVAL_SET_INDEX, TARGET])
- )
- eval_x_sampled, _ = self._extend_x(eval_x_sampled, is_demo_dataset)
- eval_y_sampled = df_with_eval_set_index[df_with_eval_set_index[EVAL_SET_INDEX] == (idx + 1)].copy()[
- TARGET
- ]
- eval_set_sampled_dict[idx] = (eval_x_sampled, eval_y_sampled)

- df_with_eval_set_index.drop(columns=TARGET, inplace=True)
+ df_extended = self.__add_fit_system_record_id(df_extended, dict(), search_keys)
+
+ x_columns = [
+ c
+ for c in df_extended.columns
+ if c not in [TARGET, EVAL_SET_INDEX, DateTimeSearchKeyConverter.DATETIME_COL]
+ ]
+
+ tmp_target_name = "__target"
+ df_extended.rename(columns={SYSTEM_RECORD_ID: SORT_ID, TARGET: tmp_target_name}, inplace=True)

- enriched = self.transform(
- df_with_eval_set_index,
+ enriched_df = self.transform(
+ df_extended,
  exclude_features_sources=exclude_features_sources,
  silent_mode=True,
  trace_id=trace_id,
@@ -1571,16 +1566,21 @@ class FeaturesEnricher(TransformerMixin):
  progress_bar=progress_bar,
  progress_callback=progress_callback,
  )
- if enriched is None:
+ if enriched_df is None:
  return None

- enriched_X = enriched[enriched[EVAL_SET_INDEX] == 0].copy()
- enriched_X.drop(columns=EVAL_SET_INDEX, inplace=True)
+ enriched_df.rename(columns={SORT_ID: SYSTEM_RECORD_ID, tmp_target_name: TARGET}, inplace=True)
+
+ enriched_Xy = enriched_df.query(f"{EVAL_SET_INDEX} == 0")
+ X_sampled = enriched_Xy[x_columns].copy()
+ y_sampled = enriched_Xy[TARGET].copy()
+ enriched_X = enriched_Xy.drop(columns=[TARGET, EVAL_SET_INDEX])

  for idx in range(len(eval_set)):
- enriched_eval_x = enriched[enriched[EVAL_SET_INDEX] == (idx + 1)].copy()
- enriched_eval_x.drop(columns=EVAL_SET_INDEX, inplace=True)
- eval_x_sampled, eval_y_sampled = eval_set_sampled_dict[idx]
+ enriched_eval_xy = enriched_df.query(f"{EVAL_SET_INDEX} == {idx + 1}")
+ eval_x_sampled = enriched_eval_xy[x_columns].copy()
+ eval_y_sampled = enriched_eval_xy[TARGET].copy()
+ enriched_eval_x = enriched_eval_xy.drop(columns=[TARGET, EVAL_SET_INDEX])
  eval_set_sampled_dict[idx] = (eval_x_sampled, enriched_eval_x, eval_y_sampled)
  else:
  self.logger.info("Transform without eval_set")
@@ -1592,14 +1592,17 @@ class FeaturesEnricher(TransformerMixin):
  self.logger.info(f"Downsampling from {num_samples} to {Dataset.FIT_SAMPLE_ROWS}")
  df = df.sample(n=Dataset.FIT_SAMPLE_ROWS, random_state=self.random_state)

- X_sampled = df.copy().drop(columns=TARGET)
- X_sampled, search_keys = self._extend_x(X_sampled, is_demo_dataset)
- y_sampled = df.copy()[TARGET]
+ df_extended, search_keys = self._extend_x(df, is_demo_dataset)
+
+ df_extended = self.__add_fit_system_record_id(df_extended, dict(), search_keys)

- df.drop(columns=TARGET, inplace=True)
+ x_columns = [c for c in df_extended if c not in [TARGET, DateTimeSearchKeyConverter.DATETIME_COL]]

- enriched_X = self.transform(
- df,
+ tmp_target_name = "__target"
+ df_extended.rename(columns={SYSTEM_RECORD_ID: SORT_ID, TARGET: tmp_target_name}, inplace=True)
+
+ enriched_Xy = self.transform(
+ df_extended,
  exclude_features_sources=exclude_features_sources,
  silent_mode=True,
  trace_id=trace_id,
@@ -1607,9 +1610,15 @@ class FeaturesEnricher(TransformerMixin):
  progress_bar=progress_bar,
  progress_callback=progress_callback,
  )
- if enriched_X is None:
+ if enriched_Xy is None:
  return None

+ enriched_Xy.rename(columns={SORT_ID: SYSTEM_RECORD_ID, tmp_target_name: TARGET}, inplace=True)
+
+ X_sampled = enriched_Xy[x_columns].copy() # check that all columns are present
+ y_sampled = enriched_Xy[TARGET].copy()
+ enriched_X = enriched_Xy.drop(columns=TARGET)
+
  self.__cached_sampled_datasets = (X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys)

  return self.__mk_sampled_data_tuple(X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys)
@@ -2153,7 +2162,7 @@ class FeaturesEnricher(TransformerMixin):

  dataset = Dataset(
  "tds_" + str(uuid.uuid4()),
- df=df, # type: ignore
+ df=df,
  model_task_type=model_task_type,
  date_format=self.date_format,
  random_state=self.random_state,
@@ -2323,7 +2332,7 @@ class FeaturesEnricher(TransformerMixin):
  elif (
  self.cv is None
  and model_task_type != ModelTaskType.REGRESSION
- and self._get_group_columns(self.fit_search_keys)
+ and self._get_group_columns(df, self.fit_search_keys)
  ):
  msg = bundle.get("group_k_fold_in_classification")
  self.__override_cv(CVType.group_k_fold, msg, print_warning=self.cv is not None)
@@ -2508,6 +2517,23 @@ class FeaturesEnricher(TransformerMixin):
  Xy = pd.merge(Xy, enriched_X, left_index=True, right_index=True, how="inner", suffixes=("", "enriched"))
  return Xy[X.columns].copy(), Xy[TARGET].copy()

+ @staticmethod
+ def _sort_by_system_record_id(
+ X: pd.DataFrame, y: pd.Series, cv: Optional[CVType]
+ ) -> Tuple[pd.DataFrame, pd.Series]:
+ if cv not in [CVType.time_series, CVType.blocked_time_series]:
+ Xy = X.copy()
+ Xy[TARGET] = y
+ Xy = Xy.sort_values(by=SYSTEM_RECORD_ID).reset_index(drop=True)
+ X = Xy.drop(columns=TARGET)
+ y = Xy[TARGET].copy()
+
+ if DateTimeSearchKeyConverter.DATETIME_COL in X.columns:
+ X.drop(columns=DateTimeSearchKeyConverter.DATETIME_COL, inplace=True)
+
+ return X, y
+
+ # Deprecated
  @staticmethod
  def _sort_by_keys(
  X: pd.DataFrame, y: pd.Series, search_keys: Dict[str, SearchKey], cv: Optional[CVType]
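The new _sort_by_system_record_id above sorts X and y together by the system_record_id column (skipped for time-series and blocked time-series CV, where row order must be preserved) and strips the internal datetime helper column. A self-contained pandas approximation of that behaviour on toy data, for readers who want to see the effect outside the class:

    import pandas as pd

    SYSTEM_RECORD_ID, TARGET = "system_record_id", "target"

    X = pd.DataFrame({SYSTEM_RECORD_ID: [3, 1, 2], "feature": [0.3, 0.1, 0.2]})
    y = pd.Series([1, 0, 1])

    # keep X and y aligned while sorting both by the stable record id
    Xy = X.copy()
    Xy[TARGET] = y
    Xy = Xy.sort_values(by=SYSTEM_RECORD_ID).reset_index(drop=True)
    X_sorted, y_sorted = Xy.drop(columns=TARGET), Xy[TARGET].copy()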
@@ -2646,8 +2672,12 @@ class FeaturesEnricher(TransformerMixin):
  return col

  @staticmethod
- def _get_group_columns(search_keys: Dict[str, SearchKey]) -> List[str]:
- return [col for col, t in search_keys.items() if t not in [SearchKey.DATE, SearchKey.DATETIME]]
+ def _get_group_columns(df: pd.DataFrame, search_keys: Dict[str, SearchKey]) -> List[str]:
+ return [
+ col
+ for col, t in search_keys.items()
+ if t not in [SearchKey.DATE, SearchKey.DATETIME] and df[col].nunique() > 1
+ ]

  @staticmethod
  def _get_email_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
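_get_group_columns now takes the DataFrame as well, so it can skip search keys that are constant in the data (df[col].nunique() > 1): a single-valued key would collapse GroupKFold into one group. A toy illustration of the same filter (column names here are invented for the example):

    import pandas as pd

    df = pd.DataFrame({"country": ["US", "US", "US"], "phone": ["111", "222", "333"]})
    non_date_keys = ["country", "phone"]  # search keys that are not DATE/DATETIME
    group_columns = [col for col in non_date_keys if df[col].nunique() > 1]
    # -> ["phone"]; the constant "country" key is ignored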
@@ -2684,7 +2714,15 @@ class FeaturesEnricher(TransformerMixin):
  date_column = self._get_date_column(search_keys)
  sort_columns = [date_column] if date_column is not None else []

- other_search_keys = sorted([sk for sk in search_keys.keys() if sk != date_column and sk in df.columns])
+ other_search_keys = sorted(
+ [
+ sk
+ for sk, key_type in search_keys.items()
+ if key_type not in [SearchKey.DATE, SearchKey.DATETIME]
+ and sk in df.columns
+ and df[sk].nunique() > 1 # don't use constant keys for hash
+ ]
+ )

  search_keys_hash = "search_keys_hash"
  if len(other_search_keys) > 0:
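The same constant-key guard now applies when choosing which search keys feed the search_keys_hash used for sorting: date/datetime keys and single-valued keys are excluded, since they add nothing to a row hash. The actual hash computation is not shown in this diff, so the following is an assumed illustration only, building a row-wise hash over the selected key columns with pandas:

    import pandas as pd
    from pandas.util import hash_pandas_object

    df = pd.DataFrame({"email": ["a@x.com", "b@y.com"], "country": ["US", "US"]})
    # drop the constant "country" key before hashing
    other_search_keys = [c for c in ["email", "country"] if df[c].nunique() > 1]
    if other_search_keys:
        df["search_keys_hash"] = hash_pandas_object(df[other_search_keys].astype(str), index=False).values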
@@ -2751,6 +2789,7 @@ class FeaturesEnricher(TransformerMixin):
  X: Optional[pd.DataFrame] = None,
  is_transform=False,
  rows_to_drop: Optional[pd.DataFrame] = None,
+ drop_system_record_id=True,
  ) -> Tuple[pd.DataFrame, Dict[int, pd.DataFrame]]:
  if result_features is None:
  self.logger.error(f"result features not found by search_task_id: {self.get_search_id()}")
upgini/metadata.py CHANGED
@@ -4,13 +4,14 @@ from typing import Dict, List, Optional, Set
  from pydantic import BaseModel

  SYSTEM_RECORD_ID = "system_record_id"
+ SORT_ID = "sort_id"
  EVAL_SET_INDEX = "eval_set_index"
  TARGET = "target"
  COUNTRY = "country_iso_code"
  RENAMED_INDEX = "index_col"
  DEFAULT_INDEX = "index"
  ORIGINAL_INDEX = "original_index"
- SYSTEM_COLUMNS = {SYSTEM_RECORD_ID, EVAL_SET_INDEX, TARGET, COUNTRY}
+ SYSTEM_COLUMNS = {SYSTEM_RECORD_ID, EVAL_SET_INDEX, TARGET, COUNTRY, SORT_ID}


  class FileColumnMeaningType(Enum):
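Because SORT_ID is now part of SYSTEM_COLUMNS, any code that strips upgini's service columns from a frame will also drop sort_id. For example (toy frame, constants imported from the module changed above):

    import pandas as pd
    from upgini.metadata import SYSTEM_COLUMNS

    df = pd.DataFrame({"feature": [1, 2], "system_record_id": [10, 11], "sort_id": [0, 1], "target": [0, 1]})
    client_columns = [c for c in df.columns if c not in SYSTEM_COLUMNS]
    # -> ["feature"]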
upgini/metrics.py CHANGED
@@ -318,6 +318,7 @@ class EstimatorWrapper:
  X: pd.DataFrame,
  scoring: Union[Callable, str, None] = None,
  cat_features: Optional[List[str]] = None,
+ text_features: Optional[List[str]] = None,
  add_params: Optional[Dict[str, Any]] = None,
  groups: Optional[List[str]] = None,
  ) -> "EstimatorWrapper":
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: upgini
- Version: 1.1.244a4
+ Version: 1.1.244a6
  Summary: Intelligent data search & enrichment for Machine Learning
  Home-page: https://upgini.com/
  Author: Upgini Developers
@@ -2,11 +2,11 @@ upgini/__init__.py,sha256=asENHgEVHQBIkV-e_0IhE_ZWqkCG6398U3ZLrNzAH6k,407
  upgini/ads.py,sha256=mre6xn44wcC_fg63iLT_kTh4mViZqR9AKRJZAtpQz8Y,2592
  upgini/dataset.py,sha256=1AEKFg2ooGnlBzmxX6sw-sDJdDoT8HfHWNYacwGHZGI,50023
  upgini/errors.py,sha256=pdzQl3MKuK52yvncxMWMRWeSIOGhUFzpQoszoRFBOk0,958
- upgini/features_enricher.py,sha256=HZfwDvZvq4IIkguc5MoRpbLeFr3ZKwyYejZfhBteQf4,162494
+ upgini/features_enricher.py,sha256=r5k_djjmoVwTJ6Xg9-k9jZfAKTcr1h_wWkI64vkO34E,163572
  upgini/fingerprint.js,sha256=VygVIQlN1v4NGZfjHqtRogOw8zjTnnMNJg_f7M5iGQU,33442
  upgini/http.py,sha256=eSG4gOpmCGlXmB6KIPNzAG8tRZNUjyYpMeUeHw_2li4,42264
- upgini/metadata.py,sha256=FZ5CQluLLWrfrBVThSIes1SW6wcs7n50aNZwzYnHiF0,9584
- upgini/metrics.py,sha256=cb-IY4QyNWq2F8i3vCEDPN8wyblknjVf61JnZKsMY50,23777
+ upgini/metadata.py,sha256=55t0uQI910tzTcnwxZCUL1413BhTiSm8oqiwp-94NyA,9613
+ upgini/metrics.py,sha256=GFAZNu5V-xrALTiju0vMMdM-ckysnF23ogUtLJRNV08,23828
  upgini/search_task.py,sha256=5n4qGJmtu48s0-FHAtF3L5qVLMd1JVW3FJlM8dFbh-s,17063
  upgini/spinner.py,sha256=Dm1dQ5F_z_Ua2odLxZX7OypcOX9tSx_vE5MGaKtUmfw,1118
  upgini/version_validator.py,sha256=rDIncP6BEko4J2F2hUcMOtKm_vZbI4ICWcNcw8hrwM4,1400
@@ -54,8 +54,8 @@ upgini/utils/sklearn_ext.py,sha256=IMx2La70AXAggApVpT7sMEjWqVWon5AMZt4MARDsIMQ,4
  upgini/utils/target_utils.py,sha256=qyj-bGsIEl9X2Vc5gwXtsuRaocvId8bn46F7mZ9dy9A,1707
  upgini/utils/track_info.py,sha256=EPcJ13Jqa17_T0JjM37Ac9kWDz5Zk0GVsIZKutOb8aU,5207
  upgini/utils/warning_counter.py,sha256=dIWBB4dI5XRRJZudvIlqlIYKEiwLLPcXarsZuYRt338,227
- upgini-1.1.244a4.dist-info/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
- upgini-1.1.244a4.dist-info/METADATA,sha256=V7zuNuOujnaTi46Qfpe1FMhpEyyrWEAsy1roqyp0ctc,48264
- upgini-1.1.244a4.dist-info/WHEEL,sha256=pkctZYzUS4AYVn6dJ-7367OJZivF2e8RA9b_ZBjif18,92
- upgini-1.1.244a4.dist-info/top_level.txt,sha256=OFhTGiDIWKl5gFI49qvWq1R9IKflPaE2PekcbDXDtx4,7
- upgini-1.1.244a4.dist-info/RECORD,,
+ upgini-1.1.244a6.dist-info/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
+ upgini-1.1.244a6.dist-info/METADATA,sha256=s3qLjYZ-bw2atIM2spsBKOwRW-Hq6LEGRSp0eR8WOc0,48264
+ upgini-1.1.244a6.dist-info/WHEEL,sha256=pkctZYzUS4AYVn6dJ-7367OJZivF2e8RA9b_ZBjif18,92
+ upgini-1.1.244a6.dist-info/top_level.txt,sha256=OFhTGiDIWKl5gFI49qvWq1R9IKflPaE2PekcbDXDtx4,7
+ upgini-1.1.244a6.dist-info/RECORD,,