upgini 1.2.29a4__tar.gz → 1.2.29a5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of upgini might be problematic. Click here for more details.

Files changed (67) hide show
  1. {upgini-1.2.29a4 → upgini-1.2.29a5}/PKG-INFO +1 -1
  2. upgini-1.2.29a5/src/upgini/__about__.py +1 -0
  3. {upgini-1.2.29a4 → upgini-1.2.29a5}/src/upgini/features_enricher.py +27 -109
  4. {upgini-1.2.29a4 → upgini-1.2.29a5}/src/upgini/utils/datetime_utils.py +2 -1
  5. upgini-1.2.29a5/src/upgini/utils/feature_info.py +172 -0
  6. upgini-1.2.29a4/src/upgini/__about__.py +0 -1
  7. {upgini-1.2.29a4 → upgini-1.2.29a5}/.gitignore +0 -0
  8. {upgini-1.2.29a4 → upgini-1.2.29a5}/LICENSE +0 -0
  9. {upgini-1.2.29a4 → upgini-1.2.29a5}/README.md +0 -0
  10. {upgini-1.2.29a4 → upgini-1.2.29a5}/pyproject.toml +0 -0
  11. {upgini-1.2.29a4 → upgini-1.2.29a5}/src/upgini/__init__.py +0 -0
  12. {upgini-1.2.29a4 → upgini-1.2.29a5}/src/upgini/ads.py +0 -0
  13. {upgini-1.2.29a4 → upgini-1.2.29a5}/src/upgini/ads_management/__init__.py +0 -0
  14. {upgini-1.2.29a4 → upgini-1.2.29a5}/src/upgini/ads_management/ads_manager.py +0 -0
  15. {upgini-1.2.29a4 → upgini-1.2.29a5}/src/upgini/autofe/__init__.py +0 -0
  16. {upgini-1.2.29a4 → upgini-1.2.29a5}/src/upgini/autofe/all_operands.py +0 -0
  17. {upgini-1.2.29a4 → upgini-1.2.29a5}/src/upgini/autofe/binary.py +0 -0
  18. {upgini-1.2.29a4 → upgini-1.2.29a5}/src/upgini/autofe/date.py +0 -0
  19. {upgini-1.2.29a4 → upgini-1.2.29a5}/src/upgini/autofe/feature.py +0 -0
  20. {upgini-1.2.29a4 → upgini-1.2.29a5}/src/upgini/autofe/groupby.py +0 -0
  21. {upgini-1.2.29a4 → upgini-1.2.29a5}/src/upgini/autofe/operand.py +0 -0
  22. {upgini-1.2.29a4 → upgini-1.2.29a5}/src/upgini/autofe/unary.py +0 -0
  23. {upgini-1.2.29a4 → upgini-1.2.29a5}/src/upgini/autofe/vector.py +0 -0
  24. {upgini-1.2.29a4 → upgini-1.2.29a5}/src/upgini/data_source/__init__.py +0 -0
  25. {upgini-1.2.29a4 → upgini-1.2.29a5}/src/upgini/data_source/data_source_publisher.py +0 -0
  26. {upgini-1.2.29a4 → upgini-1.2.29a5}/src/upgini/dataset.py +0 -0
  27. {upgini-1.2.29a4 → upgini-1.2.29a5}/src/upgini/errors.py +0 -0
  28. {upgini-1.2.29a4 → upgini-1.2.29a5}/src/upgini/http.py +0 -0
  29. {upgini-1.2.29a4 → upgini-1.2.29a5}/src/upgini/lazy_import.py +0 -0
  30. {upgini-1.2.29a4 → upgini-1.2.29a5}/src/upgini/mdc/__init__.py +0 -0
  31. {upgini-1.2.29a4 → upgini-1.2.29a5}/src/upgini/mdc/context.py +0 -0
  32. {upgini-1.2.29a4 → upgini-1.2.29a5}/src/upgini/metadata.py +0 -0
  33. {upgini-1.2.29a4 → upgini-1.2.29a5}/src/upgini/metrics.py +0 -0
  34. {upgini-1.2.29a4 → upgini-1.2.29a5}/src/upgini/normalizer/__init__.py +0 -0
  35. {upgini-1.2.29a4 → upgini-1.2.29a5}/src/upgini/normalizer/normalize_utils.py +0 -0
  36. {upgini-1.2.29a4 → upgini-1.2.29a5}/src/upgini/resource_bundle/__init__.py +0 -0
  37. {upgini-1.2.29a4 → upgini-1.2.29a5}/src/upgini/resource_bundle/exceptions.py +0 -0
  38. {upgini-1.2.29a4 → upgini-1.2.29a5}/src/upgini/resource_bundle/strings.properties +0 -0
  39. {upgini-1.2.29a4 → upgini-1.2.29a5}/src/upgini/resource_bundle/strings_widget.properties +0 -0
  40. {upgini-1.2.29a4 → upgini-1.2.29a5}/src/upgini/sampler/__init__.py +0 -0
  41. {upgini-1.2.29a4 → upgini-1.2.29a5}/src/upgini/sampler/base.py +0 -0
  42. {upgini-1.2.29a4 → upgini-1.2.29a5}/src/upgini/sampler/random_under_sampler.py +0 -0
  43. {upgini-1.2.29a4 → upgini-1.2.29a5}/src/upgini/sampler/utils.py +0 -0
  44. {upgini-1.2.29a4 → upgini-1.2.29a5}/src/upgini/search_task.py +0 -0
  45. {upgini-1.2.29a4 → upgini-1.2.29a5}/src/upgini/spinner.py +0 -0
  46. {upgini-1.2.29a4 → upgini-1.2.29a5}/src/upgini/utils/Roboto-Regular.ttf +0 -0
  47. {upgini-1.2.29a4 → upgini-1.2.29a5}/src/upgini/utils/__init__.py +0 -0
  48. {upgini-1.2.29a4 → upgini-1.2.29a5}/src/upgini/utils/base_search_key_detector.py +0 -0
  49. {upgini-1.2.29a4 → upgini-1.2.29a5}/src/upgini/utils/blocked_time_series.py +0 -0
  50. {upgini-1.2.29a4 → upgini-1.2.29a5}/src/upgini/utils/country_utils.py +0 -0
  51. {upgini-1.2.29a4 → upgini-1.2.29a5}/src/upgini/utils/custom_loss_utils.py +0 -0
  52. {upgini-1.2.29a4 → upgini-1.2.29a5}/src/upgini/utils/cv_utils.py +0 -0
  53. {upgini-1.2.29a4 → upgini-1.2.29a5}/src/upgini/utils/deduplicate_utils.py +0 -0
  54. {upgini-1.2.29a4 → upgini-1.2.29a5}/src/upgini/utils/display_utils.py +0 -0
  55. {upgini-1.2.29a4 → upgini-1.2.29a5}/src/upgini/utils/email_utils.py +0 -0
  56. {upgini-1.2.29a4 → upgini-1.2.29a5}/src/upgini/utils/fallback_progress_bar.py +0 -0
  57. {upgini-1.2.29a4 → upgini-1.2.29a5}/src/upgini/utils/features_validator.py +0 -0
  58. {upgini-1.2.29a4 → upgini-1.2.29a5}/src/upgini/utils/format.py +0 -0
  59. {upgini-1.2.29a4 → upgini-1.2.29a5}/src/upgini/utils/ip_utils.py +0 -0
  60. {upgini-1.2.29a4 → upgini-1.2.29a5}/src/upgini/utils/phone_utils.py +0 -0
  61. {upgini-1.2.29a4 → upgini-1.2.29a5}/src/upgini/utils/postal_code_utils.py +0 -0
  62. {upgini-1.2.29a4 → upgini-1.2.29a5}/src/upgini/utils/progress_bar.py +0 -0
  63. {upgini-1.2.29a4 → upgini-1.2.29a5}/src/upgini/utils/sklearn_ext.py +0 -0
  64. {upgini-1.2.29a4 → upgini-1.2.29a5}/src/upgini/utils/target_utils.py +0 -0
  65. {upgini-1.2.29a4 → upgini-1.2.29a5}/src/upgini/utils/track_info.py +0 -0
  66. {upgini-1.2.29a4 → upgini-1.2.29a5}/src/upgini/utils/warning_counter.py +0 -0
  67. {upgini-1.2.29a4 → upgini-1.2.29a5}/src/upgini/version_validator.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: upgini
3
- Version: 1.2.29a4
3
+ Version: 1.2.29a5
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
6
6
  Project-URL: Homepage, https://upgini.com/
@@ -0,0 +1 @@
1
+ __version__ = "1.2.29a5"
@@ -2,7 +2,6 @@ import dataclasses
2
2
  import datetime
3
3
  import gc
4
4
  import hashlib
5
- import itertools
6
5
  import logging
7
6
  import numbers
8
7
  import os
@@ -54,6 +53,7 @@ from upgini.metadata import (
54
53
  SYSTEM_RECORD_ID,
55
54
  TARGET,
56
55
  CVType,
56
+ FeaturesMetadataV2,
57
57
  FileColumnMeaningType,
58
58
  ModelTaskType,
59
59
  RuntimeParameters,
@@ -95,6 +95,7 @@ from upgini.utils.email_utils import (
95
95
  EmailSearchKeyConverter,
96
96
  EmailSearchKeyDetector,
97
97
  )
98
+ from upgini.utils.feature_info import FeatureInfo, _round_shap_value
98
99
  from upgini.utils.features_validator import FeaturesValidator
99
100
  from upgini.utils.format import Format
100
101
  from upgini.utils.ip_utils import IpSearchKeyConverter
@@ -224,6 +225,7 @@ class FeaturesEnricher(TransformerMixin):
224
225
  client_visitorid: Optional[str] = None,
225
226
  custom_bundle_config: Optional[str] = None,
226
227
  add_date_if_missing: bool = True,
228
+ select_features: bool = False,
227
229
  **kwargs,
228
230
  ):
229
231
  self.bundle = get_custom_bundle(custom_bundle_config)
@@ -277,8 +279,11 @@ class FeaturesEnricher(TransformerMixin):
277
279
  self._relevant_data_sources_wo_links: pd.DataFrame = self.EMPTY_DATA_SOURCES
278
280
  self.metrics: Optional[pd.DataFrame] = None
279
281
  self.feature_names_ = []
282
+ self.client_feature_names_ = []
280
283
  self.feature_importances_ = []
281
284
  self.search_id = search_id
285
+ self.select_features = select_features
286
+
282
287
  if search_id:
283
288
  search_task = SearchTask(search_id, rest_client=self.rest_client, logger=self.logger)
284
289
 
@@ -1201,9 +1206,7 @@ class FeaturesEnricher(TransformerMixin):
1201
1206
 
1202
1207
  def _update_shap_values(self, new_shaps: Dict[str, float]):
1203
1208
  new_shaps = {
1204
- feature: self._round_shap_value(shap)
1205
- for feature, shap in new_shaps.items()
1206
- if feature in self.feature_names_
1209
+ feature: _round_shap_value(shap) for feature, shap in new_shaps.items() if feature in self.feature_names_
1207
1210
  }
1208
1211
  features_importances = list(new_shaps.items())
1209
1212
  features_importances.sort(key=lambda m: (-m[1], m[0]))
@@ -1440,7 +1443,8 @@ class FeaturesEnricher(TransformerMixin):
1440
1443
  client_features = [
1441
1444
  c
1442
1445
  for c in X_sampled.columns.to_list()
1443
- if c
1446
+ if (not self.select_features or c in self.feature_names_)
1447
+ and c
1444
1448
  not in (
1445
1449
  excluding_search_keys
1446
1450
  + list(self.fit_dropped_features)
@@ -2066,7 +2070,9 @@ class FeaturesEnricher(TransformerMixin):
2066
2070
 
2067
2071
  is_demo_dataset = hash_input(validated_X) in DEMO_DATASET_HASHES
2068
2072
 
2069
- columns_to_drop = [c for c in validated_X.columns if c in self.feature_names_]
2073
+ columns_to_drop = [
2074
+ c for c in validated_X.columns if c in self.feature_names_ and c not in self.client_feature_names_
2075
+ ]
2070
2076
  if len(columns_to_drop) > 0:
2071
2077
  msg = self.bundle.get("x_contains_enriching_columns").format(columns_to_drop)
2072
2078
  self.logger.warning(msg)
@@ -3493,15 +3499,7 @@ class FeaturesEnricher(TransformerMixin):
3493
3499
 
3494
3500
  return result_train, result_eval_sets
3495
3501
 
3496
- @staticmethod
3497
- def _round_shap_value(shap: float) -> float:
3498
- if shap > 0.0 and shap < 0.0001:
3499
- return 0.0001
3500
- else:
3501
- return round(shap, 4)
3502
-
3503
3502
  def __prepare_feature_importances(self, trace_id: str, x_columns: List[str], silent=False):
3504
- llm_source = "LLM with external data augmentation"
3505
3503
  if self._search_task is None:
3506
3504
  raise NotFittedError(self.bundle.get("transform_unfitted_enricher"))
3507
3505
  features_meta = self._search_task.get_all_features_metadata_v2()
@@ -3512,116 +3510,36 @@ class FeaturesEnricher(TransformerMixin):
3512
3510
  features_df = self._search_task.get_all_initial_raw_features(trace_id, metrics_calculation=True)
3513
3511
 
3514
3512
  self.feature_names_ = []
3513
+ self.client_feature_names_ = []
3515
3514
  self.feature_importances_ = []
3516
3515
  features_info = []
3517
3516
  features_info_without_links = []
3518
3517
  internal_features_info = []
3519
3518
 
3520
- def list_or_single(lst: List[str], single: str):
3521
- return lst or ([single] if single else [])
3522
-
3523
- def to_anchor(link: str, value: str) -> str:
3524
- if not value:
3525
- return ""
3526
- elif not link:
3527
- return value
3528
- elif value == llm_source:
3529
- return value
3530
- else:
3531
- return f"<a href='{link}' target='_blank' rel='noopener noreferrer'>{value}</a>"
3532
-
3533
- def make_links(names: List[str], links: List[str]):
3534
- all_links = [to_anchor(link, name) for name, link in itertools.zip_longest(names, links)]
3535
- return ",".join(all_links)
3536
-
3537
3519
  features_meta.sort(key=lambda m: (-m.shap_value, m.name))
3538
3520
  for feature_meta in features_meta:
3539
3521
  if feature_meta.name in original_names_dict.keys():
3540
3522
  feature_meta.name = original_names_dict[feature_meta.name]
3541
- # Use only enriched features
3523
+ # Use only important features
3542
3524
  if (
3543
- feature_meta.name in x_columns
3544
- or feature_meta.name == COUNTRY
3545
- or feature_meta.shap_value == 0.0
3546
- or feature_meta.name in self.fit_generated_features
3525
+ (feature_meta.shap_value == 0.0)
3526
+ or (feature_meta.name in self.fit_generated_features)
3527
+ or (feature_meta.name == COUNTRY)
3547
3528
  ):
3548
3529
  continue
3549
3530
 
3550
- feature_sample = []
3551
- self.feature_names_.append(feature_meta.name)
3552
- self.feature_importances_.append(self._round_shap_value(feature_meta.shap_value))
3553
- if feature_meta.name in features_df.columns:
3554
- feature_sample = np.random.choice(features_df[feature_meta.name].dropna().unique(), 3).tolist()
3555
- if len(feature_sample) > 0 and isinstance(feature_sample[0], float):
3556
- feature_sample = [round(f, 4) for f in feature_sample]
3557
- feature_sample = [str(f) for f in feature_sample]
3558
- feature_sample = ", ".join(feature_sample)
3559
- if len(feature_sample) > 30:
3560
- feature_sample = feature_sample[:30] + "..."
3561
-
3562
- internal_provider = feature_meta.data_provider or "Upgini"
3563
- providers = list_or_single(feature_meta.data_providers, feature_meta.data_provider)
3564
- provider_links = list_or_single(feature_meta.data_provider_links, feature_meta.data_provider_link)
3565
- if providers:
3566
- provider = make_links(providers, provider_links)
3567
- else:
3568
- provider = to_anchor("https://upgini.com", "Upgini")
3531
+ is_client_feature = feature_meta.name in x_columns
3532
+ # In select_features mode we select also from etalon features and need to show them
3533
+ if not self.select_features and is_client_feature:
3534
+ continue
3569
3535
 
3570
- internal_source = feature_meta.data_source or (
3571
- llm_source
3572
- if not feature_meta.name.endswith("_country") and not feature_meta.name.endswith("_postal_code")
3573
- else ""
3574
- )
3575
- sources = list_or_single(feature_meta.data_sources, feature_meta.data_source)
3576
- source_links = list_or_single(feature_meta.data_source_links, feature_meta.data_source_link)
3577
- if sources:
3578
- source = make_links(sources, source_links)
3579
- else:
3580
- source = internal_source
3536
+ self.feature_names_.append(feature_meta.name)
3537
+ self.feature_importances_.append(_round_shap_value(feature_meta.shap_value))
3581
3538
 
3582
- internal_feature_name = feature_meta.name
3583
- if feature_meta.doc_link:
3584
- feature_name = to_anchor(feature_meta.doc_link, feature_meta.name)
3585
- else:
3586
- feature_name = internal_feature_name
3587
-
3588
- features_info.append(
3589
- {
3590
- self.bundle.get("features_info_name"): feature_name,
3591
- self.bundle.get("features_info_shap"): self._round_shap_value(feature_meta.shap_value),
3592
- self.bundle.get("features_info_hitrate"): feature_meta.hit_rate,
3593
- self.bundle.get("features_info_value_preview"): feature_sample,
3594
- self.bundle.get("features_info_provider"): provider,
3595
- self.bundle.get("features_info_source"): source,
3596
- self.bundle.get("features_info_update_frequency"): feature_meta.update_frequency,
3597
- }
3598
- )
3599
- features_info_without_links.append(
3600
- {
3601
- self.bundle.get("features_info_name"): internal_feature_name,
3602
- self.bundle.get("features_info_shap"): self._round_shap_value(feature_meta.shap_value),
3603
- self.bundle.get("features_info_hitrate"): feature_meta.hit_rate,
3604
- self.bundle.get("features_info_value_preview"): feature_sample,
3605
- self.bundle.get("features_info_provider"): internal_provider,
3606
- self.bundle.get("features_info_source"): internal_source,
3607
- self.bundle.get("features_info_update_frequency"): feature_meta.update_frequency,
3608
- }
3609
- )
3610
- internal_features_info.append(
3611
- {
3612
- self.bundle.get("features_info_name"): internal_feature_name,
3613
- "feature_link": feature_meta.doc_link,
3614
- self.bundle.get("features_info_shap"): self._round_shap_value(feature_meta.shap_value),
3615
- self.bundle.get("features_info_hitrate"): feature_meta.hit_rate,
3616
- self.bundle.get("features_info_value_preview"): feature_sample,
3617
- self.bundle.get("features_info_provider"): internal_provider,
3618
- "provider_link": feature_meta.data_provider_link,
3619
- self.bundle.get("features_info_source"): internal_source,
3620
- "source_link": feature_meta.data_source_link,
3621
- self.bundle.get("features_info_commercial_schema"): feature_meta.commercial_schema or "",
3622
- self.bundle.get("features_info_update_frequency"): feature_meta.update_frequency,
3623
- }
3624
- )
3539
+ feature_info = FeatureInfo.from_metadata(feature_meta, features_df, is_client_feature)
3540
+ features_info.append(feature_info.to_row(self.bundle))
3541
+ features_info_without_links.append(feature_info.to_row_without_links(self.bundle))
3542
+ internal_features_info.append(feature_info.to_internal_row(self.bundle))
3625
3543
 
3626
3544
  if len(features_info) > 0:
3627
3545
  self.features_info = pd.DataFrame(features_info)
@@ -137,8 +137,9 @@ class DateTimeSearchKeyConverter:
137
137
  quarter_end_month = np.where(quarter == 4, 1, month + 3)
138
138
 
139
139
  end = pd.to_datetime({"year": quarter_end_year, "month": quarter_end_month, "day": 1})
140
+ end.index = df.index
140
141
 
141
- df["days_in_quarter"] = (end.reset_index(drop=True) - start.reset_index(drop=True)).dt.days
142
+ df["days_in_quarter"] = (end - start).dt.days
142
143
 
143
144
  add_cyclical_features(df, "day_in_quarter", df["days_in_quarter"]) # Days in the quarter
144
145
 
@@ -0,0 +1,172 @@
1
+ from dataclasses import dataclass
2
+ import itertools
3
+ from typing import Dict, List
4
+
5
+ import numpy as np
6
+ import pandas as pd
7
+
8
+ from upgini.metadata import FeaturesMetadataV2
9
+ from upgini.resource_bundle import ResourceBundle
10
+
11
+
12
+ LLM_SOURCE = "LLM with external data augmentation"
13
+
14
+
15
@dataclass
class FeatureInfo:
    """Presentation-ready summary of one feature described by search metadata.

    Instances are normally created with :meth:`from_metadata` and then turned
    into dict rows whose keys are the strings returned by ``bundle.get(...)``
    for the ``features_info_*`` resource keys.
    """

    name: str  # feature name; an HTML anchor when the metadata has a doc link
    internal_name: str  # plain feature name without markup
    rounded_shap: float  # SHAP value after ``_round_shap_value``
    hitrate: float  # ``hit_rate`` copied from the metadata
    value_preview: str  # short value sample built by ``_get_feature_sample``
    provider: str  # provider text for display (may contain HTML anchors)
    internal_provider: str  # provider text without markup
    source: str  # source text for display (may contain HTML anchors)
    internal_source: str  # source text without markup
    update_frequency: str
    commercial_schema: str  # may be falsy; ``to_internal_row`` emits "" then
    doc_link: str
    data_provider_link: str
    data_source_link: str

    @classmethod
    def from_metadata(
        cls, feature_meta: FeaturesMetadataV2, data: pd.DataFrame, is_client_feature: bool
    ) -> "FeatureInfo":
        """Build an instance from a metadata record.

        Args:
            feature_meta: metadata record of the feature.
            data: frame used to draw the value preview from the column named
                like the feature (if such a column exists).
            is_client_feature: forwarded to the provider/source helpers, which
                use it to blank out the default provider and source.

        Returns:
            A new instance of ``cls`` (so subclasses get their own type).
        """
        return cls(
            name=_get_name(feature_meta),
            internal_name=_get_internal_name(feature_meta),
            rounded_shap=_round_shap_value(feature_meta.shap_value),
            hitrate=feature_meta.hit_rate,
            value_preview=_get_feature_sample(feature_meta, data),
            provider=_get_provider(feature_meta, is_client_feature),
            internal_provider=_get_internal_provider(feature_meta, is_client_feature),
            source=_get_source(feature_meta, is_client_feature),
            internal_source=_get_internal_source(feature_meta, is_client_feature),
            update_frequency=feature_meta.update_frequency,
            commercial_schema=feature_meta.commercial_schema,
            doc_link=feature_meta.doc_link,
            data_provider_link=feature_meta.data_provider_link,
            data_source_link=feature_meta.data_source_link,
        )

    def _public_row(self, bundle: ResourceBundle, name: str, provider: str, source: str) -> Dict[str, object]:
        """Build the seven-column row shared by the linked and link-free views.

        Key order is kept stable because it defines the column order of any
        table built from these rows.
        """
        return {
            bundle.get("features_info_name"): name,
            bundle.get("features_info_shap"): self.rounded_shap,
            bundle.get("features_info_hitrate"): self.hitrate,
            bundle.get("features_info_value_preview"): self.value_preview,
            bundle.get("features_info_provider"): provider,
            bundle.get("features_info_source"): source,
            bundle.get("features_info_update_frequency"): self.update_frequency,
        }

    def to_row(self, bundle: ResourceBundle) -> Dict[str, object]:
        """Return the row that uses the display (possibly linked) name, provider and source."""
        return self._public_row(bundle, self.name, self.provider, self.source)

    def to_row_without_links(self, bundle: ResourceBundle) -> Dict[str, object]:
        """Return the same columns as :meth:`to_row` but with the markup-free values."""
        return self._public_row(bundle, self.internal_name, self.internal_provider, self.internal_source)

    def to_internal_row(self, bundle: ResourceBundle) -> Dict[str, object]:
        """Return the extended row: markup-free values plus raw links and commercial schema.

        The ``feature_link``, ``provider_link`` and ``source_link`` keys are
        fixed strings and are not looked up in the bundle.
        """
        return {
            bundle.get("features_info_name"): self.internal_name,
            "feature_link": self.doc_link,
            bundle.get("features_info_shap"): self.rounded_shap,
            bundle.get("features_info_hitrate"): self.hitrate,
            bundle.get("features_info_value_preview"): self.value_preview,
            bundle.get("features_info_provider"): self.internal_provider,
            "provider_link": self.data_provider_link,
            bundle.get("features_info_source"): self.internal_source,
            "source_link": self.data_source_link,
            # A missing schema is rendered as an empty string rather than None.
            bundle.get("features_info_commercial_schema"): self.commercial_schema or "",
            bundle.get("features_info_update_frequency"): self.update_frequency,
        }
87
+
88
+
89
def _get_feature_sample(feature_meta: FeaturesMetadataV2, data: pd.DataFrame) -> str:
    """Return a short, comma-separated preview of values of the feature column.

    Three values are drawn at random (with replacement, via
    ``np.random.choice``) from the distinct non-null values of the column
    named ``feature_meta.name``. Floats are rounded to 4 decimals and the
    joined text is cut to 30 characters followed by ``"..."``.

    Returns an empty string when ``data`` has no such column or when the
    column holds no non-null values.
    """
    if feature_meta.name not in data.columns:
        return ""

    distinct_values = data[feature_meta.name].dropna().unique()
    # np.random.choice raises ValueError for an empty population, which would
    # abort the whole report for a column that is entirely null.
    if len(distinct_values) == 0:
        return ""

    sampled = np.random.choice(distinct_values, 3).tolist()
    if isinstance(sampled[0], float):
        sampled = [round(value, 4) for value in sampled]

    preview = ", ".join(str(value) for value in sampled)
    if len(preview) > 30:
        preview = preview[:30] + "..."
    return preview
101
+
102
+
103
def _get_name(feature_meta: FeaturesMetadataV2) -> str:
    """Return the feature name for display, linked to its docs when a doc link exists."""
    doc_link = feature_meta.doc_link
    return _to_anchor(doc_link, feature_meta.name) if doc_link else feature_meta.name
108
+
109
+
110
def _get_internal_name(feature_meta: FeaturesMetadataV2) -> str:
    """Return the feature name exactly as stored in the metadata (no markup)."""
    return feature_meta.name
112
+
113
+
114
def _get_provider(feature_meta: FeaturesMetadataV2, is_client_feature: bool) -> str:
    """Return the provider text for display.

    Uses the provider list (or the single provider) with matching links when
    any provider is present. Otherwise the result is empty for client
    features and an anchor to Upgini for everything else.
    """
    provider_names = _list_or_single(feature_meta.data_providers, feature_meta.data_provider)
    provider_urls = _list_or_single(feature_meta.data_provider_links, feature_meta.data_provider_link)
    if provider_names:
        return _make_links(provider_names, provider_urls)
    if is_client_feature:
        return ""
    return _to_anchor("https://upgini.com", "Upgini")
122
+
123
+
124
def _get_internal_provider(feature_meta: FeaturesMetadataV2, is_client_feature: bool) -> str:
    """Return the markup-free provider: empty for client features, else the provider or "Upgini"."""
    if is_client_feature:
        return ""
    return feature_meta.data_provider or "Upgini"
126
+
127
+
128
def _get_source(feature_meta: FeaturesMetadataV2, is_client_feature: bool) -> str:
    """Return the source text for display.

    Uses the source list (or the single source) with matching links when any
    source is present; otherwise falls back to the markup-free source.
    """
    source_names = _list_or_single(feature_meta.data_sources, feature_meta.data_source)
    source_urls = _list_or_single(feature_meta.data_source_links, feature_meta.data_source_link)
    if not source_names:
        return _get_internal_source(feature_meta, is_client_feature)
    return _make_links(source_names, source_urls)
136
+
137
+
138
def _get_internal_source(feature_meta: FeaturesMetadataV2, is_client_feature: bool) -> str:
    """Return the markup-free source of the feature.

    The metadata's own ``data_source`` wins when set. Without it, the result
    is ``LLM_SOURCE`` unless the feature name ends with ``_country`` or
    ``_postal_code`` or the feature is a client feature, in which case it is
    an empty string.
    """
    if feature_meta.data_source:
        return feature_meta.data_source

    has_geo_suffix = feature_meta.name.endswith(("_country", "_postal_code"))
    if has_geo_suffix or is_client_feature:
        return ""
    return LLM_SOURCE
146
+
147
+
148
+ def _list_or_single(lst: List[str], single: str):
149
+ return lst or ([single] if single else [])
150
+
151
+
152
+ def _to_anchor(link: str, value: str) -> str:
153
+ if not value:
154
+ return ""
155
+ elif not link:
156
+ return value
157
+ elif value == LLM_SOURCE:
158
+ return value
159
+ else:
160
+ return f"<a href='{link}' target='_blank' rel='noopener noreferrer'>{value}</a>"
161
+
162
+
163
def _make_links(names: List[str], links: List[str]):
    """Join ``names`` into a comma-separated string, anchoring each to its link.

    Names and links are paired positionally. A name without a link is kept as
    plain text. Entries that render to nothing (a link without a name, or an
    empty name) are skipped so the result has no stray commas.
    """
    anchors = (_to_anchor(link, name) for name, link in itertools.zip_longest(names, links))
    return ",".join(anchor for anchor in anchors if anchor)
166
+
167
+
168
+ def _round_shap_value(shap: float) -> float:
169
+ if shap > 0.0 and shap < 0.0001:
170
+ return 0.0001
171
+ else:
172
+ return round(shap, 4)
@@ -1 +0,0 @@
1
- __version__ = "1.2.29a4"
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes