upgini 1.2.29a4__tar.gz → 1.2.29a6__tar.gz

This diff compares the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.

Potentially problematic release.


This version of upgini might be problematic. Click here for more details.

Files changed (67)
  1. {upgini-1.2.29a4 → upgini-1.2.29a6}/PKG-INFO +1 -1
  2. upgini-1.2.29a6/src/upgini/__about__.py +1 -0
  3. {upgini-1.2.29a4 → upgini-1.2.29a6}/src/upgini/features_enricher.py +41 -111
  4. {upgini-1.2.29a4 → upgini-1.2.29a6}/src/upgini/utils/datetime_utils.py +2 -1
  5. upgini-1.2.29a6/src/upgini/utils/feature_info.py +172 -0
  6. upgini-1.2.29a4/src/upgini/__about__.py +0 -1
  7. {upgini-1.2.29a4 → upgini-1.2.29a6}/.gitignore +0 -0
  8. {upgini-1.2.29a4 → upgini-1.2.29a6}/LICENSE +0 -0
  9. {upgini-1.2.29a4 → upgini-1.2.29a6}/README.md +0 -0
  10. {upgini-1.2.29a4 → upgini-1.2.29a6}/pyproject.toml +0 -0
  11. {upgini-1.2.29a4 → upgini-1.2.29a6}/src/upgini/__init__.py +0 -0
  12. {upgini-1.2.29a4 → upgini-1.2.29a6}/src/upgini/ads.py +0 -0
  13. {upgini-1.2.29a4 → upgini-1.2.29a6}/src/upgini/ads_management/__init__.py +0 -0
  14. {upgini-1.2.29a4 → upgini-1.2.29a6}/src/upgini/ads_management/ads_manager.py +0 -0
  15. {upgini-1.2.29a4 → upgini-1.2.29a6}/src/upgini/autofe/__init__.py +0 -0
  16. {upgini-1.2.29a4 → upgini-1.2.29a6}/src/upgini/autofe/all_operands.py +0 -0
  17. {upgini-1.2.29a4 → upgini-1.2.29a6}/src/upgini/autofe/binary.py +0 -0
  18. {upgini-1.2.29a4 → upgini-1.2.29a6}/src/upgini/autofe/date.py +0 -0
  19. {upgini-1.2.29a4 → upgini-1.2.29a6}/src/upgini/autofe/feature.py +0 -0
  20. {upgini-1.2.29a4 → upgini-1.2.29a6}/src/upgini/autofe/groupby.py +0 -0
  21. {upgini-1.2.29a4 → upgini-1.2.29a6}/src/upgini/autofe/operand.py +0 -0
  22. {upgini-1.2.29a4 → upgini-1.2.29a6}/src/upgini/autofe/unary.py +0 -0
  23. {upgini-1.2.29a4 → upgini-1.2.29a6}/src/upgini/autofe/vector.py +0 -0
  24. {upgini-1.2.29a4 → upgini-1.2.29a6}/src/upgini/data_source/__init__.py +0 -0
  25. {upgini-1.2.29a4 → upgini-1.2.29a6}/src/upgini/data_source/data_source_publisher.py +0 -0
  26. {upgini-1.2.29a4 → upgini-1.2.29a6}/src/upgini/dataset.py +0 -0
  27. {upgini-1.2.29a4 → upgini-1.2.29a6}/src/upgini/errors.py +0 -0
  28. {upgini-1.2.29a4 → upgini-1.2.29a6}/src/upgini/http.py +0 -0
  29. {upgini-1.2.29a4 → upgini-1.2.29a6}/src/upgini/lazy_import.py +0 -0
  30. {upgini-1.2.29a4 → upgini-1.2.29a6}/src/upgini/mdc/__init__.py +0 -0
  31. {upgini-1.2.29a4 → upgini-1.2.29a6}/src/upgini/mdc/context.py +0 -0
  32. {upgini-1.2.29a4 → upgini-1.2.29a6}/src/upgini/metadata.py +0 -0
  33. {upgini-1.2.29a4 → upgini-1.2.29a6}/src/upgini/metrics.py +0 -0
  34. {upgini-1.2.29a4 → upgini-1.2.29a6}/src/upgini/normalizer/__init__.py +0 -0
  35. {upgini-1.2.29a4 → upgini-1.2.29a6}/src/upgini/normalizer/normalize_utils.py +0 -0
  36. {upgini-1.2.29a4 → upgini-1.2.29a6}/src/upgini/resource_bundle/__init__.py +0 -0
  37. {upgini-1.2.29a4 → upgini-1.2.29a6}/src/upgini/resource_bundle/exceptions.py +0 -0
  38. {upgini-1.2.29a4 → upgini-1.2.29a6}/src/upgini/resource_bundle/strings.properties +0 -0
  39. {upgini-1.2.29a4 → upgini-1.2.29a6}/src/upgini/resource_bundle/strings_widget.properties +0 -0
  40. {upgini-1.2.29a4 → upgini-1.2.29a6}/src/upgini/sampler/__init__.py +0 -0
  41. {upgini-1.2.29a4 → upgini-1.2.29a6}/src/upgini/sampler/base.py +0 -0
  42. {upgini-1.2.29a4 → upgini-1.2.29a6}/src/upgini/sampler/random_under_sampler.py +0 -0
  43. {upgini-1.2.29a4 → upgini-1.2.29a6}/src/upgini/sampler/utils.py +0 -0
  44. {upgini-1.2.29a4 → upgini-1.2.29a6}/src/upgini/search_task.py +0 -0
  45. {upgini-1.2.29a4 → upgini-1.2.29a6}/src/upgini/spinner.py +0 -0
  46. {upgini-1.2.29a4 → upgini-1.2.29a6}/src/upgini/utils/Roboto-Regular.ttf +0 -0
  47. {upgini-1.2.29a4 → upgini-1.2.29a6}/src/upgini/utils/__init__.py +0 -0
  48. {upgini-1.2.29a4 → upgini-1.2.29a6}/src/upgini/utils/base_search_key_detector.py +0 -0
  49. {upgini-1.2.29a4 → upgini-1.2.29a6}/src/upgini/utils/blocked_time_series.py +0 -0
  50. {upgini-1.2.29a4 → upgini-1.2.29a6}/src/upgini/utils/country_utils.py +0 -0
  51. {upgini-1.2.29a4 → upgini-1.2.29a6}/src/upgini/utils/custom_loss_utils.py +0 -0
  52. {upgini-1.2.29a4 → upgini-1.2.29a6}/src/upgini/utils/cv_utils.py +0 -0
  53. {upgini-1.2.29a4 → upgini-1.2.29a6}/src/upgini/utils/deduplicate_utils.py +0 -0
  54. {upgini-1.2.29a4 → upgini-1.2.29a6}/src/upgini/utils/display_utils.py +0 -0
  55. {upgini-1.2.29a4 → upgini-1.2.29a6}/src/upgini/utils/email_utils.py +0 -0
  56. {upgini-1.2.29a4 → upgini-1.2.29a6}/src/upgini/utils/fallback_progress_bar.py +0 -0
  57. {upgini-1.2.29a4 → upgini-1.2.29a6}/src/upgini/utils/features_validator.py +0 -0
  58. {upgini-1.2.29a4 → upgini-1.2.29a6}/src/upgini/utils/format.py +0 -0
  59. {upgini-1.2.29a4 → upgini-1.2.29a6}/src/upgini/utils/ip_utils.py +0 -0
  60. {upgini-1.2.29a4 → upgini-1.2.29a6}/src/upgini/utils/phone_utils.py +0 -0
  61. {upgini-1.2.29a4 → upgini-1.2.29a6}/src/upgini/utils/postal_code_utils.py +0 -0
  62. {upgini-1.2.29a4 → upgini-1.2.29a6}/src/upgini/utils/progress_bar.py +0 -0
  63. {upgini-1.2.29a4 → upgini-1.2.29a6}/src/upgini/utils/sklearn_ext.py +0 -0
  64. {upgini-1.2.29a4 → upgini-1.2.29a6}/src/upgini/utils/target_utils.py +0 -0
  65. {upgini-1.2.29a4 → upgini-1.2.29a6}/src/upgini/utils/track_info.py +0 -0
  66. {upgini-1.2.29a4 → upgini-1.2.29a6}/src/upgini/utils/warning_counter.py +0 -0
  67. {upgini-1.2.29a4 → upgini-1.2.29a6}/src/upgini/version_validator.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: upgini
3
- Version: 1.2.29a4
3
+ Version: 1.2.29a6
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
6
6
  Project-URL: Homepage, https://upgini.com/
@@ -0,0 +1 @@
1
+ __version__ = "1.2.29a6"
@@ -54,6 +54,7 @@ from upgini.metadata import (
54
54
  SYSTEM_RECORD_ID,
55
55
  TARGET,
56
56
  CVType,
57
+ FeaturesMetadataV2,
57
58
  FileColumnMeaningType,
58
59
  ModelTaskType,
59
60
  RuntimeParameters,
@@ -95,6 +96,7 @@ from upgini.utils.email_utils import (
95
96
  EmailSearchKeyConverter,
96
97
  EmailSearchKeyDetector,
97
98
  )
99
+ from upgini.utils.feature_info import FeatureInfo, _round_shap_value
98
100
  from upgini.utils.features_validator import FeaturesValidator
99
101
  from upgini.utils.format import Format
100
102
  from upgini.utils.ip_utils import IpSearchKeyConverter
@@ -158,6 +160,10 @@ class FeaturesEnricher(TransformerMixin):
158
160
 
159
161
  shared_datasets: list of str, optional (default=None)
160
162
  List of private shared dataset ids for custom search
163
+
164
+ select_features: bool, optional (default=False)
165
+ If True, return only selected features both from input and data sources.
166
+ Otherwise, return all features from input and only selected features from data sources.
161
167
  """
162
168
 
163
169
  TARGET_NAME = "target"
@@ -224,6 +230,7 @@ class FeaturesEnricher(TransformerMixin):
224
230
  client_visitorid: Optional[str] = None,
225
231
  custom_bundle_config: Optional[str] = None,
226
232
  add_date_if_missing: bool = True,
233
+ select_features: bool = False,
227
234
  **kwargs,
228
235
  ):
229
236
  self.bundle = get_custom_bundle(custom_bundle_config)
@@ -277,8 +284,11 @@ class FeaturesEnricher(TransformerMixin):
277
284
  self._relevant_data_sources_wo_links: pd.DataFrame = self.EMPTY_DATA_SOURCES
278
285
  self.metrics: Optional[pd.DataFrame] = None
279
286
  self.feature_names_ = []
287
+ self.dropped_client_feature_names_ = []
280
288
  self.feature_importances_ = []
281
289
  self.search_id = search_id
290
+ self.select_features = select_features
291
+
282
292
  if search_id:
283
293
  search_task = SearchTask(search_id, rest_client=self.rest_client, logger=self.logger)
284
294
 
@@ -1201,9 +1211,7 @@ class FeaturesEnricher(TransformerMixin):
1201
1211
 
1202
1212
  def _update_shap_values(self, new_shaps: Dict[str, float]):
1203
1213
  new_shaps = {
1204
- feature: self._round_shap_value(shap)
1205
- for feature, shap in new_shaps.items()
1206
- if feature in self.feature_names_
1214
+ feature: _round_shap_value(shap) for feature, shap in new_shaps.items() if feature in self.feature_names_
1207
1215
  }
1208
1216
  features_importances = list(new_shaps.items())
1209
1217
  features_importances.sort(key=lambda m: (-m[1], m[0]))
@@ -1440,7 +1448,8 @@ class FeaturesEnricher(TransformerMixin):
1440
1448
  client_features = [
1441
1449
  c
1442
1450
  for c in X_sampled.columns.to_list()
1443
- if c
1451
+ if (not self.select_features or c in self.feature_names_)
1452
+ and c
1444
1453
  not in (
1445
1454
  excluding_search_keys
1446
1455
  + list(self.fit_dropped_features)
@@ -2066,7 +2075,9 @@ class FeaturesEnricher(TransformerMixin):
2066
2075
 
2067
2076
  is_demo_dataset = hash_input(validated_X) in DEMO_DATASET_HASHES
2068
2077
 
2069
- columns_to_drop = [c for c in validated_X.columns if c in self.feature_names_]
2078
+ columns_to_drop = [
2079
+ c for c in validated_X.columns if c in self.feature_names_ and c in self.dropped_client_feature_names_
2080
+ ]
2070
2081
  if len(columns_to_drop) > 0:
2071
2082
  msg = self.bundle.get("x_contains_enriching_columns").format(columns_to_drop)
2072
2083
  self.logger.warning(msg)
@@ -2322,11 +2333,13 @@ class FeaturesEnricher(TransformerMixin):
2322
2333
  else:
2323
2334
  result = enrich()
2324
2335
 
2325
- filtered_columns = self.__filtered_enriched_features(importance_threshold, max_features)
2326
- existing_filtered_columns = [
2327
- c for c in filtered_columns if c in result.columns and c not in validated_X.columns
2336
+ selecting_columns = [
2337
+ c
2338
+ for c in itertools.chain(validated_X.columns.tolist(), generated_features)
2339
+ if c not in self.dropped_client_feature_names_
2328
2340
  ]
2329
- selecting_columns = validated_X.columns.tolist() + generated_features + existing_filtered_columns
2341
+ filtered_columns = self.__filtered_enriched_features(importance_threshold, max_features)
2342
+ selecting_columns.extend(c for c in filtered_columns if c in result.columns and c not in validated_X.columns)
2330
2343
  if add_fit_system_record_id:
2331
2344
  selecting_columns.append(SORT_ID)
2332
2345
 
@@ -3493,15 +3506,7 @@ class FeaturesEnricher(TransformerMixin):
3493
3506
 
3494
3507
  return result_train, result_eval_sets
3495
3508
 
3496
- @staticmethod
3497
- def _round_shap_value(shap: float) -> float:
3498
- if shap > 0.0 and shap < 0.0001:
3499
- return 0.0001
3500
- else:
3501
- return round(shap, 4)
3502
-
3503
3509
  def __prepare_feature_importances(self, trace_id: str, x_columns: List[str], silent=False):
3504
- llm_source = "LLM with external data augmentation"
3505
3510
  if self._search_task is None:
3506
3511
  raise NotFittedError(self.bundle.get("transform_unfitted_enricher"))
3507
3512
  features_meta = self._search_task.get_all_features_metadata_v2()
@@ -3512,116 +3517,41 @@ class FeaturesEnricher(TransformerMixin):
3512
3517
  features_df = self._search_task.get_all_initial_raw_features(trace_id, metrics_calculation=True)
3513
3518
 
3514
3519
  self.feature_names_ = []
3520
+ self.dropped_client_feature_names_ = []
3515
3521
  self.feature_importances_ = []
3516
3522
  features_info = []
3517
3523
  features_info_without_links = []
3518
3524
  internal_features_info = []
3519
3525
 
3520
- def list_or_single(lst: List[str], single: str):
3521
- return lst or ([single] if single else [])
3522
-
3523
- def to_anchor(link: str, value: str) -> str:
3524
- if not value:
3525
- return ""
3526
- elif not link:
3527
- return value
3528
- elif value == llm_source:
3529
- return value
3530
- else:
3531
- return f"<a href='{link}' target='_blank' rel='noopener noreferrer'>{value}</a>"
3532
-
3533
- def make_links(names: List[str], links: List[str]):
3534
- all_links = [to_anchor(link, name) for name, link in itertools.zip_longest(names, links)]
3535
- return ",".join(all_links)
3536
-
3537
3526
  features_meta.sort(key=lambda m: (-m.shap_value, m.name))
3538
3527
  for feature_meta in features_meta:
3539
3528
  if feature_meta.name in original_names_dict.keys():
3540
3529
  feature_meta.name = original_names_dict[feature_meta.name]
3541
- # Use only enriched features
3530
+
3531
+ is_client_feature = feature_meta.name in x_columns
3532
+
3533
+ if feature_meta.shap_value == 0.0:
3534
+ if self.select_features:
3535
+ self.dropped_client_feature_names_.append(feature_meta.name)
3536
+ continue
3537
+
3538
+ # Use only important features
3542
3539
  if (
3543
- feature_meta.name in x_columns
3540
+ feature_meta.name in self.fit_generated_features
3544
3541
  or feature_meta.name == COUNTRY
3545
- or feature_meta.shap_value == 0.0
3546
- or feature_meta.name in self.fit_generated_features
3542
+ # In select_features mode we select also from etalon features and need to show them
3543
+ or (not self.select_features and is_client_feature)
3547
3544
  ):
3548
3545
  continue
3549
3546
 
3550
- feature_sample = []
3551
- self.feature_names_.append(feature_meta.name)
3552
- self.feature_importances_.append(self._round_shap_value(feature_meta.shap_value))
3553
- if feature_meta.name in features_df.columns:
3554
- feature_sample = np.random.choice(features_df[feature_meta.name].dropna().unique(), 3).tolist()
3555
- if len(feature_sample) > 0 and isinstance(feature_sample[0], float):
3556
- feature_sample = [round(f, 4) for f in feature_sample]
3557
- feature_sample = [str(f) for f in feature_sample]
3558
- feature_sample = ", ".join(feature_sample)
3559
- if len(feature_sample) > 30:
3560
- feature_sample = feature_sample[:30] + "..."
3561
-
3562
- internal_provider = feature_meta.data_provider or "Upgini"
3563
- providers = list_or_single(feature_meta.data_providers, feature_meta.data_provider)
3564
- provider_links = list_or_single(feature_meta.data_provider_links, feature_meta.data_provider_link)
3565
- if providers:
3566
- provider = make_links(providers, provider_links)
3567
- else:
3568
- provider = to_anchor("https://upgini.com", "Upgini")
3569
3547
 
3570
- internal_source = feature_meta.data_source or (
3571
- llm_source
3572
- if not feature_meta.name.endswith("_country") and not feature_meta.name.endswith("_postal_code")
3573
- else ""
3574
- )
3575
- sources = list_or_single(feature_meta.data_sources, feature_meta.data_source)
3576
- source_links = list_or_single(feature_meta.data_source_links, feature_meta.data_source_link)
3577
- if sources:
3578
- source = make_links(sources, source_links)
3579
- else:
3580
- source = internal_source
3548
+ self.feature_names_.append(feature_meta.name)
3549
+ self.feature_importances_.append(_round_shap_value(feature_meta.shap_value))
3581
3550
 
3582
- internal_feature_name = feature_meta.name
3583
- if feature_meta.doc_link:
3584
- feature_name = to_anchor(feature_meta.doc_link, feature_meta.name)
3585
- else:
3586
- feature_name = internal_feature_name
3587
-
3588
- features_info.append(
3589
- {
3590
- self.bundle.get("features_info_name"): feature_name,
3591
- self.bundle.get("features_info_shap"): self._round_shap_value(feature_meta.shap_value),
3592
- self.bundle.get("features_info_hitrate"): feature_meta.hit_rate,
3593
- self.bundle.get("features_info_value_preview"): feature_sample,
3594
- self.bundle.get("features_info_provider"): provider,
3595
- self.bundle.get("features_info_source"): source,
3596
- self.bundle.get("features_info_update_frequency"): feature_meta.update_frequency,
3597
- }
3598
- )
3599
- features_info_without_links.append(
3600
- {
3601
- self.bundle.get("features_info_name"): internal_feature_name,
3602
- self.bundle.get("features_info_shap"): self._round_shap_value(feature_meta.shap_value),
3603
- self.bundle.get("features_info_hitrate"): feature_meta.hit_rate,
3604
- self.bundle.get("features_info_value_preview"): feature_sample,
3605
- self.bundle.get("features_info_provider"): internal_provider,
3606
- self.bundle.get("features_info_source"): internal_source,
3607
- self.bundle.get("features_info_update_frequency"): feature_meta.update_frequency,
3608
- }
3609
- )
3610
- internal_features_info.append(
3611
- {
3612
- self.bundle.get("features_info_name"): internal_feature_name,
3613
- "feature_link": feature_meta.doc_link,
3614
- self.bundle.get("features_info_shap"): self._round_shap_value(feature_meta.shap_value),
3615
- self.bundle.get("features_info_hitrate"): feature_meta.hit_rate,
3616
- self.bundle.get("features_info_value_preview"): feature_sample,
3617
- self.bundle.get("features_info_provider"): internal_provider,
3618
- "provider_link": feature_meta.data_provider_link,
3619
- self.bundle.get("features_info_source"): internal_source,
3620
- "source_link": feature_meta.data_source_link,
3621
- self.bundle.get("features_info_commercial_schema"): feature_meta.commercial_schema or "",
3622
- self.bundle.get("features_info_update_frequency"): feature_meta.update_frequency,
3623
- }
3624
- )
3551
+ feature_info = FeatureInfo.from_metadata(feature_meta, features_df, is_client_feature)
3552
+ features_info.append(feature_info.to_row(self.bundle))
3553
+ features_info_without_links.append(feature_info.to_row_without_links(self.bundle))
3554
+ internal_features_info.append(feature_info.to_internal_row(self.bundle))
3625
3555
 
3626
3556
  if len(features_info) > 0:
3627
3557
  self.features_info = pd.DataFrame(features_info)
@@ -137,8 +137,9 @@ class DateTimeSearchKeyConverter:
137
137
  quarter_end_month = np.where(quarter == 4, 1, month + 3)
138
138
 
139
139
  end = pd.to_datetime({"year": quarter_end_year, "month": quarter_end_month, "day": 1})
140
+ end.index = df.index
140
141
 
141
- df["days_in_quarter"] = (end.reset_index(drop=True) - start.reset_index(drop=True)).dt.days
142
+ df["days_in_quarter"] = (end - start).dt.days
142
143
 
143
144
  add_cyclical_features(df, "day_in_quarter", df["days_in_quarter"]) # Days in the quarter
144
145
 
@@ -0,0 +1,172 @@
1
+ from dataclasses import dataclass
2
+ import itertools
3
+ from typing import Dict, List
4
+
5
+ import numpy as np
6
+ import pandas as pd
7
+
8
+ from upgini.metadata import FeaturesMetadataV2
9
+ from upgini.resource_bundle import ResourceBundle
10
+
11
+
12
+ LLM_SOURCE = "LLM with external data augmentation"
13
+
14
+
15
+ @dataclass
16
+ class FeatureInfo:
17
+ name: str
18
+ internal_name: str
19
+ rounded_shap: float
20
+ hitrate: float
21
+ value_preview: str
22
+ provider: str
23
+ internal_provider: str
24
+ source: str
25
+ internal_source: str
26
+ update_frequency: str
27
+ commercial_schema: str
28
+ doc_link: str
29
+ data_provider_link: str
30
+ data_source_link: str
31
+
32
+ @staticmethod
33
+ def from_metadata(feature_meta: FeaturesMetadataV2, data: pd.DataFrame, is_client_feature: bool) -> "FeatureInfo":
34
+ return FeatureInfo(
35
+ name=_get_name(feature_meta),
36
+ internal_name=_get_internal_name(feature_meta),
37
+ rounded_shap=_round_shap_value(feature_meta.shap_value),
38
+ hitrate=feature_meta.hit_rate,
39
+ value_preview=_get_feature_sample(feature_meta, data),
40
+ provider=_get_provider(feature_meta, is_client_feature),
41
+ internal_provider=_get_internal_provider(feature_meta, is_client_feature),
42
+ source=_get_source(feature_meta, is_client_feature),
43
+ internal_source=_get_internal_source(feature_meta, is_client_feature),
44
+ update_frequency=feature_meta.update_frequency,
45
+ commercial_schema=feature_meta.commercial_schema,
46
+ doc_link=feature_meta.doc_link,
47
+ data_provider_link=feature_meta.data_provider_link,
48
+ data_source_link=feature_meta.data_source_link,
49
+ )
50
+
51
+ def to_row(self, bundle: ResourceBundle) -> Dict[str, str]:
52
+ return {
53
+ bundle.get("features_info_name"): self.name,
54
+ bundle.get("features_info_shap"): self.rounded_shap,
55
+ bundle.get("features_info_hitrate"): self.hitrate,
56
+ bundle.get("features_info_value_preview"): self.value_preview,
57
+ bundle.get("features_info_provider"): self.provider,
58
+ bundle.get("features_info_source"): self.source,
59
+ bundle.get("features_info_update_frequency"): self.update_frequency,
60
+ }
61
+
62
+ def to_row_without_links(self, bundle: ResourceBundle) -> Dict[str, str]:
63
+ return {
64
+ bundle.get("features_info_name"): self.internal_name,
65
+ bundle.get("features_info_shap"): self.rounded_shap,
66
+ bundle.get("features_info_hitrate"): self.hitrate,
67
+ bundle.get("features_info_value_preview"): self.value_preview,
68
+ bundle.get("features_info_provider"): self.internal_provider,
69
+ bundle.get("features_info_source"): self.internal_source,
70
+ bundle.get("features_info_update_frequency"): self.update_frequency,
71
+ }
72
+
73
+ def to_internal_row(self, bundle: ResourceBundle) -> Dict[str, str]:
74
+ return {
75
+ bundle.get("features_info_name"): self.internal_name,
76
+ "feature_link": self.doc_link,
77
+ bundle.get("features_info_shap"): self.rounded_shap,
78
+ bundle.get("features_info_hitrate"): self.hitrate,
79
+ bundle.get("features_info_value_preview"): self.value_preview,
80
+ bundle.get("features_info_provider"): self.internal_provider,
81
+ "provider_link": self.data_provider_link,
82
+ bundle.get("features_info_source"): self.internal_source,
83
+ "source_link": self.data_source_link,
84
+ bundle.get("features_info_commercial_schema"): self.commercial_schema or "",
85
+ bundle.get("features_info_update_frequency"): self.update_frequency,
86
+ }
87
+
88
+
89
+ def _get_feature_sample(feature_meta: FeaturesMetadataV2, data: pd.DataFrame) -> str:
90
+ if feature_meta.name in data.columns:
91
+ feature_sample = np.random.choice(data[feature_meta.name].dropna().unique(), 3).tolist()
92
+ if len(feature_sample) > 0 and isinstance(feature_sample[0], float):
93
+ feature_sample = [round(f, 4) for f in feature_sample]
94
+ feature_sample = [str(f) for f in feature_sample]
95
+ feature_sample = ", ".join(feature_sample)
96
+ if len(feature_sample) > 30:
97
+ feature_sample = feature_sample[:30] + "..."
98
+ else:
99
+ feature_sample = ""
100
+ return feature_sample
101
+
102
+
103
+ def _get_name(feature_meta: FeaturesMetadataV2) -> str:
104
+ if feature_meta.doc_link:
105
+ return _to_anchor(feature_meta.doc_link, feature_meta.name)
106
+ else:
107
+ return feature_meta.name
108
+
109
+
110
+ def _get_internal_name(feature_meta: FeaturesMetadataV2) -> str:
111
+ return feature_meta.name
112
+
113
+
114
+ def _get_provider(feature_meta: FeaturesMetadataV2, is_client_feature: bool) -> str:
115
+ providers = _list_or_single(feature_meta.data_providers, feature_meta.data_provider)
116
+ provider_links = _list_or_single(feature_meta.data_provider_links, feature_meta.data_provider_link)
117
+ if providers:
118
+ provider = _make_links(providers, provider_links)
119
+ else:
120
+ provider = "" if is_client_feature else _to_anchor("https://upgini.com", "Upgini")
121
+ return provider
122
+
123
+
124
+ def _get_internal_provider(feature_meta: FeaturesMetadataV2, is_client_feature: bool) -> str:
125
+ return "" if is_client_feature else (feature_meta.data_provider or "Upgini")
126
+
127
+
128
+ def _get_source(feature_meta: FeaturesMetadataV2, is_client_feature: bool) -> str:
129
+ sources = _list_or_single(feature_meta.data_sources, feature_meta.data_source)
130
+ source_links = _list_or_single(feature_meta.data_source_links, feature_meta.data_source_link)
131
+ if sources:
132
+ source = _make_links(sources, source_links)
133
+ else:
134
+ source = _get_internal_source(feature_meta, is_client_feature)
135
+ return source
136
+
137
+
138
+ def _get_internal_source(feature_meta: FeaturesMetadataV2, is_client_feature: bool) -> str:
139
+ return feature_meta.data_source or (
140
+ LLM_SOURCE
141
+ if not feature_meta.name.endswith("_country")
142
+ and not feature_meta.name.endswith("_postal_code")
143
+ and not is_client_feature
144
+ else ""
145
+ )
146
+
147
+
148
+ def _list_or_single(lst: List[str], single: str):
149
+ return lst or ([single] if single else [])
150
+
151
+
152
+ def _to_anchor(link: str, value: str) -> str:
153
+ if not value:
154
+ return ""
155
+ elif not link:
156
+ return value
157
+ elif value == LLM_SOURCE:
158
+ return value
159
+ else:
160
+ return f"<a href='{link}' target='_blank' rel='noopener noreferrer'>{value}</a>"
161
+
162
+
163
+ def _make_links(names: List[str], links: List[str]):
164
+ all_links = [_to_anchor(link, name) for name, link in itertools.zip_longest(names, links)]
165
+ return ",".join(all_links)
166
+
167
+
168
+ def _round_shap_value(shap: float) -> float:
169
+ if shap > 0.0 and shap < 0.0001:
170
+ return 0.0001
171
+ else:
172
+ return round(shap, 4)
@@ -1 +0,0 @@
1
- __version__ = "1.2.29a4"
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes