upgini 1.2.29a3__py3-none-any.whl → 1.2.29a5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of upgini might be problematic; consult the registry's advisory page for this release for more details.

upgini/__about__.py CHANGED
@@ -1 +1 @@
1
- __version__ = "1.2.29a3"
1
+ __version__ = "1.2.29a5"
@@ -2,7 +2,6 @@ import dataclasses
2
2
  import datetime
3
3
  import gc
4
4
  import hashlib
5
- import itertools
6
5
  import logging
7
6
  import numbers
8
7
  import os
@@ -54,6 +53,7 @@ from upgini.metadata import (
54
53
  SYSTEM_RECORD_ID,
55
54
  TARGET,
56
55
  CVType,
56
+ FeaturesMetadataV2,
57
57
  FileColumnMeaningType,
58
58
  ModelTaskType,
59
59
  RuntimeParameters,
@@ -95,6 +95,7 @@ from upgini.utils.email_utils import (
95
95
  EmailSearchKeyConverter,
96
96
  EmailSearchKeyDetector,
97
97
  )
98
+ from upgini.utils.feature_info import FeatureInfo, _round_shap_value
98
99
  from upgini.utils.features_validator import FeaturesValidator
99
100
  from upgini.utils.format import Format
100
101
  from upgini.utils.ip_utils import IpSearchKeyConverter
@@ -224,6 +225,7 @@ class FeaturesEnricher(TransformerMixin):
224
225
  client_visitorid: Optional[str] = None,
225
226
  custom_bundle_config: Optional[str] = None,
226
227
  add_date_if_missing: bool = True,
228
+ select_features: bool = False,
227
229
  **kwargs,
228
230
  ):
229
231
  self.bundle = get_custom_bundle(custom_bundle_config)
@@ -277,8 +279,11 @@ class FeaturesEnricher(TransformerMixin):
277
279
  self._relevant_data_sources_wo_links: pd.DataFrame = self.EMPTY_DATA_SOURCES
278
280
  self.metrics: Optional[pd.DataFrame] = None
279
281
  self.feature_names_ = []
282
+ self.client_feature_names_ = []
280
283
  self.feature_importances_ = []
281
284
  self.search_id = search_id
285
+ self.select_features = select_features
286
+
282
287
  if search_id:
283
288
  search_task = SearchTask(search_id, rest_client=self.rest_client, logger=self.logger)
284
289
 
@@ -999,9 +1004,10 @@ class FeaturesEnricher(TransformerMixin):
999
1004
  text_features=self.generate_features,
1000
1005
  has_date=has_date,
1001
1006
  )
1002
- etalon_metric, _ = baseline_estimator.cross_val_predict(
1007
+ etalon_cv_result = baseline_estimator.cross_val_predict(
1003
1008
  fitting_X, y_sorted, self.baseline_score_column
1004
1009
  )
1010
+ etalon_metric = etalon_cv_result.get_display_metric()
1005
1011
  if etalon_metric is None:
1006
1012
  self.logger.info(
1007
1013
  f"Baseline {metric} on train client features is None (maybe all features was removed)"
@@ -1033,9 +1039,9 @@ class FeaturesEnricher(TransformerMixin):
1033
1039
  text_features=self.generate_features,
1034
1040
  has_date=has_date,
1035
1041
  )
1036
- enriched_metric, enriched_shaps = enriched_estimator.cross_val_predict(
1037
- fitting_enriched_X, enriched_y_sorted
1038
- )
1042
+ enriched_cv_result = enriched_estimator.cross_val_predict(fitting_enriched_X, enriched_y_sorted)
1043
+ enriched_metric = enriched_cv_result.get_display_metric()
1044
+ enriched_shaps = enriched_cv_result.shap_values
1039
1045
 
1040
1046
  if enriched_shaps is not None:
1041
1047
  self._update_shap_values(enriched_shaps)
@@ -1048,7 +1054,7 @@ class FeaturesEnricher(TransformerMixin):
1048
1054
  else:
1049
1055
  self.logger.info(f"Enriched {metric} on train combined features: {enriched_metric}")
1050
1056
  if etalon_metric is not None and enriched_metric is not None:
1051
- uplift = (enriched_metric - etalon_metric) * multiplier
1057
+ uplift = (enriched_cv_result.metric - etalon_cv_result.metric) * multiplier
1052
1058
 
1053
1059
  train_metrics = {
1054
1060
  self.bundle.get("quality_metrics_segment_header"): self.bundle.get(
@@ -1091,9 +1097,10 @@ class FeaturesEnricher(TransformerMixin):
1091
1097
  f"Calculate baseline {metric} on eval set {idx + 1} "
1092
1098
  f"on client features: {eval_X_sorted.columns.to_list()}"
1093
1099
  )
1094
- etalon_eval_metric = baseline_estimator.calculate_metric(
1100
+ etalon_eval_results = baseline_estimator.calculate_metric(
1095
1101
  eval_X_sorted, eval_y_sorted, self.baseline_score_column
1096
1102
  )
1103
+ etalon_eval_metric = etalon_eval_results.get_display_metric()
1097
1104
  self.logger.info(
1098
1105
  f"Baseline {metric} on eval set {idx + 1} client features: {etalon_eval_metric}"
1099
1106
  )
@@ -1105,9 +1112,10 @@ class FeaturesEnricher(TransformerMixin):
1105
1112
  f"Calculate enriched {metric} on eval set {idx + 1} "
1106
1113
  f"on combined features: {enriched_eval_X_sorted.columns.to_list()}"
1107
1114
  )
1108
- enriched_eval_metric = enriched_estimator.calculate_metric(
1115
+ enriched_eval_results = enriched_estimator.calculate_metric(
1109
1116
  enriched_eval_X_sorted, enriched_eval_y_sorted
1110
1117
  )
1118
+ enriched_eval_metric = enriched_eval_results.get_display_metric()
1111
1119
  self.logger.info(
1112
1120
  f"Enriched {metric} on eval set {idx + 1} combined features: {enriched_eval_metric}"
1113
1121
  )
@@ -1115,7 +1123,7 @@ class FeaturesEnricher(TransformerMixin):
1115
1123
  enriched_eval_metric = None
1116
1124
 
1117
1125
  if etalon_eval_metric is not None and enriched_eval_metric is not None:
1118
- eval_uplift = (enriched_eval_metric - etalon_eval_metric) * multiplier
1126
+ eval_uplift = (enriched_eval_results.metric - etalon_eval_results.metric) * multiplier
1119
1127
  else:
1120
1128
  eval_uplift = None
1121
1129
 
@@ -1198,9 +1206,7 @@ class FeaturesEnricher(TransformerMixin):
1198
1206
 
1199
1207
  def _update_shap_values(self, new_shaps: Dict[str, float]):
1200
1208
  new_shaps = {
1201
- feature: self._round_shap_value(shap)
1202
- for feature, shap in new_shaps.items()
1203
- if feature in self.feature_names_
1209
+ feature: _round_shap_value(shap) for feature, shap in new_shaps.items() if feature in self.feature_names_
1204
1210
  }
1205
1211
  features_importances = list(new_shaps.items())
1206
1212
  features_importances.sort(key=lambda m: (-m[1], m[0]))
@@ -1437,7 +1443,8 @@ class FeaturesEnricher(TransformerMixin):
1437
1443
  client_features = [
1438
1444
  c
1439
1445
  for c in X_sampled.columns.to_list()
1440
- if c
1446
+ if (not self.select_features or c in self.feature_names_)
1447
+ and c
1441
1448
  not in (
1442
1449
  excluding_search_keys
1443
1450
  + list(self.fit_dropped_features)
@@ -2063,7 +2070,9 @@ class FeaturesEnricher(TransformerMixin):
2063
2070
 
2064
2071
  is_demo_dataset = hash_input(validated_X) in DEMO_DATASET_HASHES
2065
2072
 
2066
- columns_to_drop = [c for c in validated_X.columns if c in self.feature_names_]
2073
+ columns_to_drop = [
2074
+ c for c in validated_X.columns if c in self.feature_names_ and c not in self.client_feature_names_
2075
+ ]
2067
2076
  if len(columns_to_drop) > 0:
2068
2077
  msg = self.bundle.get("x_contains_enriching_columns").format(columns_to_drop)
2069
2078
  self.logger.warning(msg)
@@ -3490,15 +3499,7 @@ class FeaturesEnricher(TransformerMixin):
3490
3499
 
3491
3500
  return result_train, result_eval_sets
3492
3501
 
3493
- @staticmethod
3494
- def _round_shap_value(shap: float) -> float:
3495
- if shap > 0.0 and shap < 0.0001:
3496
- return 0.0001
3497
- else:
3498
- return round(shap, 4)
3499
-
3500
3502
  def __prepare_feature_importances(self, trace_id: str, x_columns: List[str], silent=False):
3501
- llm_source = "LLM with external data augmentation"
3502
3503
  if self._search_task is None:
3503
3504
  raise NotFittedError(self.bundle.get("transform_unfitted_enricher"))
3504
3505
  features_meta = self._search_task.get_all_features_metadata_v2()
@@ -3509,116 +3510,36 @@ class FeaturesEnricher(TransformerMixin):
3509
3510
  features_df = self._search_task.get_all_initial_raw_features(trace_id, metrics_calculation=True)
3510
3511
 
3511
3512
  self.feature_names_ = []
3513
+ self.client_feature_names_ = []
3512
3514
  self.feature_importances_ = []
3513
3515
  features_info = []
3514
3516
  features_info_without_links = []
3515
3517
  internal_features_info = []
3516
3518
 
3517
- def list_or_single(lst: List[str], single: str):
3518
- return lst or ([single] if single else [])
3519
-
3520
- def to_anchor(link: str, value: str) -> str:
3521
- if not value:
3522
- return ""
3523
- elif not link:
3524
- return value
3525
- elif value == llm_source:
3526
- return value
3527
- else:
3528
- return f"<a href='{link}' target='_blank' rel='noopener noreferrer'>{value}</a>"
3529
-
3530
- def make_links(names: List[str], links: List[str]):
3531
- all_links = [to_anchor(link, name) for name, link in itertools.zip_longest(names, links)]
3532
- return ",".join(all_links)
3533
-
3534
3519
  features_meta.sort(key=lambda m: (-m.shap_value, m.name))
3535
3520
  for feature_meta in features_meta:
3536
3521
  if feature_meta.name in original_names_dict.keys():
3537
3522
  feature_meta.name = original_names_dict[feature_meta.name]
3538
- # Use only enriched features
3523
+ # Use only important features
3539
3524
  if (
3540
- feature_meta.name in x_columns
3541
- or feature_meta.name == COUNTRY
3542
- or feature_meta.shap_value == 0.0
3543
- or feature_meta.name in self.fit_generated_features
3525
+ (feature_meta.shap_value == 0.0)
3526
+ or (feature_meta.name in self.fit_generated_features)
3527
+ or (feature_meta.name == COUNTRY)
3544
3528
  ):
3545
3529
  continue
3546
3530
 
3547
- feature_sample = []
3548
- self.feature_names_.append(feature_meta.name)
3549
- self.feature_importances_.append(self._round_shap_value(feature_meta.shap_value))
3550
- if feature_meta.name in features_df.columns:
3551
- feature_sample = np.random.choice(features_df[feature_meta.name].dropna().unique(), 3).tolist()
3552
- if len(feature_sample) > 0 and isinstance(feature_sample[0], float):
3553
- feature_sample = [round(f, 4) for f in feature_sample]
3554
- feature_sample = [str(f) for f in feature_sample]
3555
- feature_sample = ", ".join(feature_sample)
3556
- if len(feature_sample) > 30:
3557
- feature_sample = feature_sample[:30] + "..."
3558
-
3559
- internal_provider = feature_meta.data_provider or "Upgini"
3560
- providers = list_or_single(feature_meta.data_providers, feature_meta.data_provider)
3561
- provider_links = list_or_single(feature_meta.data_provider_links, feature_meta.data_provider_link)
3562
- if providers:
3563
- provider = make_links(providers, provider_links)
3564
- else:
3565
- provider = to_anchor("https://upgini.com", "Upgini")
3531
+ is_client_feature = feature_meta.name in x_columns
3532
+ # In select_features mode we select also from etalon features and need to show them
3533
+ if not self.select_features and is_client_feature:
3534
+ continue
3566
3535
 
3567
- internal_source = feature_meta.data_source or (
3568
- llm_source
3569
- if not feature_meta.name.endswith("_country") and not feature_meta.name.endswith("_postal_code")
3570
- else ""
3571
- )
3572
- sources = list_or_single(feature_meta.data_sources, feature_meta.data_source)
3573
- source_links = list_or_single(feature_meta.data_source_links, feature_meta.data_source_link)
3574
- if sources:
3575
- source = make_links(sources, source_links)
3576
- else:
3577
- source = internal_source
3536
+ self.feature_names_.append(feature_meta.name)
3537
+ self.feature_importances_.append(_round_shap_value(feature_meta.shap_value))
3578
3538
 
3579
- internal_feature_name = feature_meta.name
3580
- if feature_meta.doc_link:
3581
- feature_name = to_anchor(feature_meta.doc_link, feature_meta.name)
3582
- else:
3583
- feature_name = internal_feature_name
3584
-
3585
- features_info.append(
3586
- {
3587
- self.bundle.get("features_info_name"): feature_name,
3588
- self.bundle.get("features_info_shap"): self._round_shap_value(feature_meta.shap_value),
3589
- self.bundle.get("features_info_hitrate"): feature_meta.hit_rate,
3590
- self.bundle.get("features_info_value_preview"): feature_sample,
3591
- self.bundle.get("features_info_provider"): provider,
3592
- self.bundle.get("features_info_source"): source,
3593
- self.bundle.get("features_info_update_frequency"): feature_meta.update_frequency,
3594
- }
3595
- )
3596
- features_info_without_links.append(
3597
- {
3598
- self.bundle.get("features_info_name"): internal_feature_name,
3599
- self.bundle.get("features_info_shap"): self._round_shap_value(feature_meta.shap_value),
3600
- self.bundle.get("features_info_hitrate"): feature_meta.hit_rate,
3601
- self.bundle.get("features_info_value_preview"): feature_sample,
3602
- self.bundle.get("features_info_provider"): internal_provider,
3603
- self.bundle.get("features_info_source"): internal_source,
3604
- self.bundle.get("features_info_update_frequency"): feature_meta.update_frequency,
3605
- }
3606
- )
3607
- internal_features_info.append(
3608
- {
3609
- self.bundle.get("features_info_name"): internal_feature_name,
3610
- "feature_link": feature_meta.doc_link,
3611
- self.bundle.get("features_info_shap"): self._round_shap_value(feature_meta.shap_value),
3612
- self.bundle.get("features_info_hitrate"): feature_meta.hit_rate,
3613
- self.bundle.get("features_info_value_preview"): feature_sample,
3614
- self.bundle.get("features_info_provider"): internal_provider,
3615
- "provider_link": feature_meta.data_provider_link,
3616
- self.bundle.get("features_info_source"): internal_source,
3617
- "source_link": feature_meta.data_source_link,
3618
- self.bundle.get("features_info_commercial_schema"): feature_meta.commercial_schema or "",
3619
- self.bundle.get("features_info_update_frequency"): feature_meta.update_frequency,
3620
- }
3621
- )
3539
+ feature_info = FeatureInfo.from_metadata(feature_meta, features_df, is_client_feature)
3540
+ features_info.append(feature_info.to_row(self.bundle))
3541
+ features_info_without_links.append(feature_info.to_row_without_links(self.bundle))
3542
+ internal_features_info.append(feature_info.to_internal_row(self.bundle))
3622
3543
 
3623
3544
  if len(features_info) > 0:
3624
3545
  self.features_info = pd.DataFrame(features_info)
upgini/metrics.py CHANGED
@@ -1,5 +1,6 @@
1
1
  from __future__ import annotations
2
2
 
3
+ from dataclasses import dataclass
3
4
  import inspect
4
5
  import logging
5
6
  import re
@@ -210,6 +211,21 @@ SUPPORTED_CATBOOST_METRICS = {
210
211
  }
211
212
 
212
213
 
214
+ @dataclass
215
+ class _CrossValResults:
216
+ metric: Optional[float]
217
+ metric_std: Optional[float]
218
+ shap_values: Optional[Dict[str, float]]
219
+
220
+ def get_display_metric(self) -> Optional[str]:
221
+ if self.metric is None:
222
+ return None
223
+ elif self.metric_std is None:
224
+ return f"{self.metric:.3f}"
225
+ else:
226
+ return f"{self.metric:.3f} ± {self.metric_std:.3f}"
227
+
228
+
213
229
  class EstimatorWrapper:
214
230
  def __init__(
215
231
  self,
@@ -297,11 +313,11 @@ class EstimatorWrapper:
297
313
 
298
314
  def cross_val_predict(
299
315
  self, x: pd.DataFrame, y: np.ndarray, baseline_score_column: Optional[Any] = None
300
- ) -> Tuple[Optional[float], Optional[Dict[str, float]]]:
316
+ ) -> _CrossValResults:
301
317
  x, y, groups, fit_params = self._prepare_to_fit(x, y)
302
318
 
303
319
  if x.shape[1] == 0:
304
- return None
320
+ return _CrossValResults(metric=None, metric_std=None, shap_values=None)
305
321
 
306
322
  scorer = check_scoring(self.estimator, scoring=self.scorer)
307
323
 
@@ -326,7 +342,7 @@ class EstimatorWrapper:
326
342
 
327
343
  self.check_fold_metrics(metrics_by_fold)
328
344
 
329
- metric = np.mean(metrics_by_fold) * self.multiplier
345
+ metric, metric_std = self._calculate_metric_from_folds(metrics_by_fold)
330
346
 
331
347
  splits = self.cv.split(x, y, groups)
332
348
 
@@ -351,7 +367,7 @@ class EstimatorWrapper:
351
367
  else:
352
368
  average_shap_values = None
353
369
 
354
- return self.post_process_metric(metric), average_shap_values
370
+ return _CrossValResults(metric=metric, metric_std=metric_std, shap_values=average_shap_values)
355
371
 
356
372
  def process_shap_values(self, shap_values: Dict[str, float]) -> Dict[str, float]:
357
373
  return shap_values
@@ -367,17 +383,25 @@ class EstimatorWrapper:
367
383
  metric = 2 * metric - 1
368
384
  return metric
369
385
 
370
- def calculate_metric(self, x: pd.DataFrame, y: np.ndarray, baseline_score_column: Optional[Any] = None) -> float:
386
+ def calculate_metric(
387
+ self, x: pd.DataFrame, y: np.ndarray, baseline_score_column: Optional[Any] = None
388
+ ) -> _CrossValResults:
371
389
  x, y, _ = self._prepare_to_calculate(x, y)
372
390
  if baseline_score_column is not None and self.metric_name == "GINI":
373
- metric = roc_auc_score(y, x[baseline_score_column])
391
+ metric, metric_std = roc_auc_score(y, x[baseline_score_column]), None
374
392
  else:
375
393
  metrics = []
376
394
  for est in self.cv_estimators:
377
395
  metrics.append(self.scorer(est, x, y))
378
396
 
379
- metric = np.mean(metrics) * self.multiplier
380
- return self.post_process_metric(metric)
397
+ metric, metric_std = self._calculate_metric_from_folds(metrics)
398
+ return _CrossValResults(metric=metric, metric_std=metric_std, shap_values=None)
399
+
400
+ def _calculate_metric_from_folds(self, metrics_by_fold: List[float]) -> Tuple[float, float]:
401
+ metrics_by_fold = [self.post_process_metric(m) for m in metrics_by_fold]
402
+ metric = np.mean(metrics_by_fold) * self.multiplier
403
+ metric_std = np.std(metrics_by_fold) * np.abs(self.multiplier)
404
+ return metric, metric_std
381
405
 
382
406
  @staticmethod
383
407
  def create(
@@ -591,7 +615,7 @@ class CatBoostWrapper(EstimatorWrapper):
591
615
 
592
616
  def cross_val_predict(
593
617
  self, x: pd.DataFrame, y: np.ndarray, baseline_score_column: Optional[Any] = None
594
- ) -> Tuple[Optional[float], Optional[Dict[str, float]]]:
618
+ ) -> _CrossValResults:
595
619
  try:
596
620
  return super().cross_val_predict(x, y, baseline_score_column)
597
621
  except Exception as e:
@@ -111,7 +111,7 @@ class DateTimeSearchKeyConverter:
111
111
 
112
112
  # Define function to apply sine and cosine transformations
113
113
  def add_cyclical_features(df, column, period):
114
- period_suffix = f"_{period}" if column != 'day_in_quarter' else ""
114
+ period_suffix = f"_{period}" if column != "day_in_quarter" else ""
115
115
  sin_feature = f"datetime_{column}_sin{period_suffix}"
116
116
  cos_feature = f"datetime_{column}_cos{period_suffix}"
117
117
  df[sin_feature] = np.sin(2 * np.pi * df[column] / period)
@@ -137,6 +137,7 @@ class DateTimeSearchKeyConverter:
137
137
  quarter_end_month = np.where(quarter == 4, 1, month + 3)
138
138
 
139
139
  end = pd.to_datetime({"year": quarter_end_year, "month": quarter_end_month, "day": 1})
140
+ end.index = df.index
140
141
 
141
142
  df["days_in_quarter"] = (end - start).dt.days
142
143
 
@@ -0,0 +1,172 @@
1
+ from dataclasses import dataclass
2
+ import itertools
3
+ from typing import Dict, List
4
+
5
+ import numpy as np
6
+ import pandas as pd
7
+
8
+ from upgini.metadata import FeaturesMetadataV2
9
+ from upgini.resource_bundle import ResourceBundle
10
+
11
+
12
+ LLM_SOURCE = "LLM with external data augmentation"
13
+
14
+
15
+ @dataclass
16
+ class FeatureInfo:
17
+ name: str
18
+ internal_name: str
19
+ rounded_shap: float
20
+ hitrate: float
21
+ value_preview: str
22
+ provider: str
23
+ internal_provider: str
24
+ source: str
25
+ internal_source: str
26
+ update_frequency: str
27
+ commercial_schema: str
28
+ doc_link: str
29
+ data_provider_link: str
30
+ data_source_link: str
31
+
32
+ @staticmethod
33
+ def from_metadata(feature_meta: FeaturesMetadataV2, data: pd.DataFrame, is_client_feature: bool) -> "FeatureInfo":
34
+ return FeatureInfo(
35
+ name=_get_name(feature_meta),
36
+ internal_name=_get_internal_name(feature_meta),
37
+ rounded_shap=_round_shap_value(feature_meta.shap_value),
38
+ hitrate=feature_meta.hit_rate,
39
+ value_preview=_get_feature_sample(feature_meta, data),
40
+ provider=_get_provider(feature_meta, is_client_feature),
41
+ internal_provider=_get_internal_provider(feature_meta, is_client_feature),
42
+ source=_get_source(feature_meta, is_client_feature),
43
+ internal_source=_get_internal_source(feature_meta, is_client_feature),
44
+ update_frequency=feature_meta.update_frequency,
45
+ commercial_schema=feature_meta.commercial_schema,
46
+ doc_link=feature_meta.doc_link,
47
+ data_provider_link=feature_meta.data_provider_link,
48
+ data_source_link=feature_meta.data_source_link,
49
+ )
50
+
51
+ def to_row(self, bundle: ResourceBundle) -> Dict[str, str]:
52
+ return {
53
+ bundle.get("features_info_name"): self.name,
54
+ bundle.get("features_info_shap"): self.rounded_shap,
55
+ bundle.get("features_info_hitrate"): self.hitrate,
56
+ bundle.get("features_info_value_preview"): self.value_preview,
57
+ bundle.get("features_info_provider"): self.provider,
58
+ bundle.get("features_info_source"): self.source,
59
+ bundle.get("features_info_update_frequency"): self.update_frequency,
60
+ }
61
+
62
+ def to_row_without_links(self, bundle: ResourceBundle) -> Dict[str, str]:
63
+ return {
64
+ bundle.get("features_info_name"): self.internal_name,
65
+ bundle.get("features_info_shap"): self.rounded_shap,
66
+ bundle.get("features_info_hitrate"): self.hitrate,
67
+ bundle.get("features_info_value_preview"): self.value_preview,
68
+ bundle.get("features_info_provider"): self.internal_provider,
69
+ bundle.get("features_info_source"): self.internal_source,
70
+ bundle.get("features_info_update_frequency"): self.update_frequency,
71
+ }
72
+
73
+ def to_internal_row(self, bundle: ResourceBundle) -> Dict[str, str]:
74
+ return {
75
+ bundle.get("features_info_name"): self.internal_name,
76
+ "feature_link": self.doc_link,
77
+ bundle.get("features_info_shap"): self.rounded_shap,
78
+ bundle.get("features_info_hitrate"): self.hitrate,
79
+ bundle.get("features_info_value_preview"): self.value_preview,
80
+ bundle.get("features_info_provider"): self.internal_provider,
81
+ "provider_link": self.data_provider_link,
82
+ bundle.get("features_info_source"): self.internal_source,
83
+ "source_link": self.data_source_link,
84
+ bundle.get("features_info_commercial_schema"): self.commercial_schema or "",
85
+ bundle.get("features_info_update_frequency"): self.update_frequency,
86
+ }
87
+
88
+
89
+ def _get_feature_sample(feature_meta: FeaturesMetadataV2, data: pd.DataFrame) -> str:
90
+ if feature_meta.name in data.columns:
91
+ feature_sample = np.random.choice(data[feature_meta.name].dropna().unique(), 3).tolist()
92
+ if len(feature_sample) > 0 and isinstance(feature_sample[0], float):
93
+ feature_sample = [round(f, 4) for f in feature_sample]
94
+ feature_sample = [str(f) for f in feature_sample]
95
+ feature_sample = ", ".join(feature_sample)
96
+ if len(feature_sample) > 30:
97
+ feature_sample = feature_sample[:30] + "..."
98
+ else:
99
+ feature_sample = ""
100
+ return feature_sample
101
+
102
+
103
+ def _get_name(feature_meta: FeaturesMetadataV2) -> str:
104
+ if feature_meta.doc_link:
105
+ return _to_anchor(feature_meta.doc_link, feature_meta.name)
106
+ else:
107
+ return feature_meta.name
108
+
109
+
110
+ def _get_internal_name(feature_meta: FeaturesMetadataV2) -> str:
111
+ return feature_meta.name
112
+
113
+
114
+ def _get_provider(feature_meta: FeaturesMetadataV2, is_client_feature: bool) -> str:
115
+ providers = _list_or_single(feature_meta.data_providers, feature_meta.data_provider)
116
+ provider_links = _list_or_single(feature_meta.data_provider_links, feature_meta.data_provider_link)
117
+ if providers:
118
+ provider = _make_links(providers, provider_links)
119
+ else:
120
+ provider = "" if is_client_feature else _to_anchor("https://upgini.com", "Upgini")
121
+ return provider
122
+
123
+
124
+ def _get_internal_provider(feature_meta: FeaturesMetadataV2, is_client_feature: bool) -> str:
125
+ return "" if is_client_feature else (feature_meta.data_provider or "Upgini")
126
+
127
+
128
+ def _get_source(feature_meta: FeaturesMetadataV2, is_client_feature: bool) -> str:
129
+ sources = _list_or_single(feature_meta.data_sources, feature_meta.data_source)
130
+ source_links = _list_or_single(feature_meta.data_source_links, feature_meta.data_source_link)
131
+ if sources:
132
+ source = _make_links(sources, source_links)
133
+ else:
134
+ source = _get_internal_source(feature_meta, is_client_feature)
135
+ return source
136
+
137
+
138
+ def _get_internal_source(feature_meta: FeaturesMetadataV2, is_client_feature: bool) -> str:
139
+ return feature_meta.data_source or (
140
+ LLM_SOURCE
141
+ if not feature_meta.name.endswith("_country")
142
+ and not feature_meta.name.endswith("_postal_code")
143
+ and not is_client_feature
144
+ else ""
145
+ )
146
+
147
+
148
+ def _list_or_single(lst: List[str], single: str):
149
+ return lst or ([single] if single else [])
150
+
151
+
152
+ def _to_anchor(link: str, value: str) -> str:
153
+ if not value:
154
+ return ""
155
+ elif not link:
156
+ return value
157
+ elif value == LLM_SOURCE:
158
+ return value
159
+ else:
160
+ return f"<a href='{link}' target='_blank' rel='noopener noreferrer'>{value}</a>"
161
+
162
+
163
+ def _make_links(names: List[str], links: List[str]):
164
+ all_links = [_to_anchor(link, name) for name, link in itertools.zip_longest(names, links)]
165
+ return ",".join(all_links)
166
+
167
+
168
+ def _round_shap_value(shap: float) -> float:
169
+ if shap > 0.0 and shap < 0.0001:
170
+ return 0.0001
171
+ else:
172
+ return round(shap, 4)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: upgini
3
- Version: 1.2.29a3
3
+ Version: 1.2.29a5
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
6
6
  Project-URL: Homepage, https://upgini.com/
@@ -1,13 +1,13 @@
1
- upgini/__about__.py,sha256=g5pIOn0QIK7AYvLSK8cOcem2I_ZfKqz9pqOf071XTPQ,25
1
+ upgini/__about__.py,sha256=kPhSfYAPCirj2ias-m_3Zttb3tZcyZpwrnKTzNizAdQ,25
2
2
  upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
3
3
  upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
4
4
  upgini/dataset.py,sha256=iPFiMJtk4HF1ytw9wCQr8H9RfoOKj_TIo8XYZKWgcMc,31331
5
5
  upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
6
- upgini/features_enricher.py,sha256=YYboYw--vCH9VerkTnCF3njztU3fVLrhlhU3NfBVJYQ,194302
6
+ upgini/features_enricher.py,sha256=mEb1I_qnheDHNL2LEm-q9Yg3ZR_UPnavxe8H4JkVdvk,190405
7
7
  upgini/http.py,sha256=21asexflvavydzCOONJDGQBtQanCElrbnqLXakJ9Cu8,42880
8
8
  upgini/lazy_import.py,sha256=74gQ8JuA48BGRLxAo7lNHNKY2D2emMxrUxKGdxVGhuY,1012
9
9
  upgini/metadata.py,sha256=osmzdNESeh7yP3BZday6N9Q3eaIHfzhhRM1d6NSgcf0,11223
10
- upgini/metrics.py,sha256=PoY1fq6XYAHNzn-rmnwRQZjCoVYP5bJNmKhR0ST2Txk,34588
10
+ upgini/metrics.py,sha256=hr7UwLphbZ_FEglLuO2lzr_pFgxOJ4c3WBeg7H-fNqY,35521
11
11
  upgini/search_task.py,sha256=qxUxAD-bed-FpZYmTB_4orW7YJsW_O6a1TcgnZIRFr4,17307
12
12
  upgini/spinner.py,sha256=4iMd-eIe_BnkqFEMIliULTbj6rNI2HkN_VJ4qYe0cUc,1118
13
13
  upgini/version_validator.py,sha256=h1GViOWzULy5vf6M4dpTJuIk-4V38UCrTY1sb9yLa5I,1594
@@ -43,11 +43,12 @@ upgini/utils/blocked_time_series.py,sha256=Uqr3vp4YqNclj2-PzEYqVy763GSXHn86sbpIl
43
43
  upgini/utils/country_utils.py,sha256=lY-eXWwFVegdVENFttbvLcgGDjFO17Sex8hd2PyJaRk,6937
44
44
  upgini/utils/custom_loss_utils.py,sha256=kieNZYBYZm5ZGBltF1F_jOSF4ea6C29rYuCyiDcqVNY,3857
45
45
  upgini/utils/cv_utils.py,sha256=w6FQb9nO8BWDx88EF83NpjPLarK4eR4ia0Wg0kLBJC4,3525
46
- upgini/utils/datetime_utils.py,sha256=GiJhOXE4taDtC0PEBYloSN7jeLwN26AchOQnMSTUDpc,12996
46
+ upgini/utils/datetime_utils.py,sha256=_uINXZUZ2MXvpGFBcxk_kZKMa1Umd8nhs8Iam-Gbwo0,13025
47
47
  upgini/utils/deduplicate_utils.py,sha256=SMZx9IKIhWI5HqXepfKiQb3uDJrogQZtG6jcWuMo5Z4,8855
48
48
  upgini/utils/display_utils.py,sha256=DsBjJ8jEYAh8BPgfAbzq5imoGFV6IACP20PQ78BQCX0,11964
49
49
  upgini/utils/email_utils.py,sha256=j0Ug1R_0AnCg1Y92zIZ4XMwvKo3G5_pcOlBN1OH_gZs,5191
50
50
  upgini/utils/fallback_progress_bar.py,sha256=PDaKb8dYpVZaWMroNcOHsTc3pSjgi9mOm0--cOFTwJ0,1074
51
+ upgini/utils/feature_info.py,sha256=Tp_2g5-rCjY4NpzKhzxwNxuqH5FFL8vG94OU5kH6wzk,6702
51
52
  upgini/utils/features_validator.py,sha256=1Xj2ir5LzzYiX3NH8o88c2J6RTTetaTwu0MhjLTyuvM,3378
52
53
  upgini/utils/format.py,sha256=Yv5cvvSs2bOLUzzNu96Pu33VMDNbabio92QepUj41jU,243
53
54
  upgini/utils/ip_utils.py,sha256=Q6vb7Sr5Khx3Sq3eENjW2qCXKej_S5jZbneH6zEOkzQ,5171
@@ -58,7 +59,7 @@ upgini/utils/sklearn_ext.py,sha256=13jQS_k7v0aUtudXV6nGUEWjttPQzAW9AFYL5wgEz9k,4
58
59
  upgini/utils/target_utils.py,sha256=PU77nIhTz7IHbC4rpTpxrVxib6cdpRL9F1dhkjIffLY,10225
59
60
  upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
60
61
  upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
61
- upgini-1.2.29a3.dist-info/METADATA,sha256=Ek9umOS0JA_zCCYMq7PWIcokbDM59DB0lbwQappDk1g,48580
62
- upgini-1.2.29a3.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
63
- upgini-1.2.29a3.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
64
- upgini-1.2.29a3.dist-info/RECORD,,
62
+ upgini-1.2.29a5.dist-info/METADATA,sha256=ccMETf-MYrRSaOwmMEW0Smo9IjLU_b-LS6MTOvyilpc,48580
63
+ upgini-1.2.29a5.dist-info/WHEEL,sha256=zEMcRr9Kr03x1ozGwg5v9NQBKn3kndp6LSoSlVg-jhU,87
64
+ upgini-1.2.29a5.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
65
+ upgini-1.2.29a5.dist-info/RECORD,,
@@ -1,4 +1,4 @@
1
1
  Wheel-Version: 1.0
2
- Generator: hatchling 1.25.0
2
+ Generator: hatchling 1.24.2
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any