upgini 1.2.48__py3-none-any.whl → 1.2.50__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


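This version of upgini might be problematic. More details are available on the registry page for this release (the original "Click here" link was not preserved in this text).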

upgini/__about__.py CHANGED
@@ -1 +1 @@
1
- __version__ = "1.2.48"
1
+ __version__ = "1.2.50"
upgini/dataset.py CHANGED
@@ -37,12 +37,18 @@ from upgini.metadata import (
37
37
  from upgini.resource_bundle import ResourceBundle, get_custom_bundle
38
38
  from upgini.search_task import SearchTask
39
39
  from upgini.utils.email_utils import EmailSearchKeyConverter
40
- from upgini.utils.target_utils import balance_undersample, balance_undersample_forced, balance_undersample_time_series
40
+ from upgini.utils.target_utils import (
41
+ balance_undersample,
42
+ balance_undersample_forced,
43
+ balance_undersample_time_series,
44
+ )
41
45
 
42
46
  try:
43
47
  from upgini.utils.progress_bar import CustomProgressBar as ProgressBar
44
48
  except Exception:
45
- from upgini.utils.fallback_progress_bar import CustomFallbackProgressBar as ProgressBar
49
+ from upgini.utils.fallback_progress_bar import (
50
+ CustomFallbackProgressBar as ProgressBar,
51
+ )
46
52
 
47
53
 
48
54
  class Dataset: # (pd.DataFrame):
@@ -347,7 +353,8 @@ class Dataset: # (pd.DataFrame):
347
353
  key
348
354
  for search_group in self.search_keys_checked
349
355
  for key in search_group
350
- if not self.columns_renaming.get(key).endswith(EmailSearchKeyConverter.ONE_DOMAIN_SUFFIX)
356
+ if key in self.columns_renaming
357
+ and not self.columns_renaming.get(key).endswith(EmailSearchKeyConverter.ONE_DOMAIN_SUFFIX)
351
358
  }
352
359
  ipv4_column = self.etalon_def_checked.get(FileColumnMeaningType.IP_ADDRESS.value)
353
360
  if (
@@ -400,7 +400,7 @@ class FeaturesEnricher(TransformerMixin):
400
400
  remove_outliers_calc_metrics: Optional[bool] = None,
401
401
  progress_callback: Optional[Callable[[SearchProgress], Any]] = None,
402
402
  search_id_callback: Optional[Callable[[str], Any]] = None,
403
- select_features: bool = False,
403
+ select_features: bool = True,
404
404
  **kwargs,
405
405
  ):
406
406
  """Fit to data.
@@ -543,7 +543,7 @@ class FeaturesEnricher(TransformerMixin):
543
543
  estimator: Optional[Any] = None,
544
544
  remove_outliers_calc_metrics: Optional[bool] = None,
545
545
  progress_callback: Optional[Callable[[SearchProgress], Any]] = None,
546
- select_features: bool = False,
546
+ select_features: bool = True,
547
547
  **kwargs,
548
548
  ) -> pd.DataFrame:
549
549
  """Fit to data, then transform it.
@@ -1486,8 +1486,8 @@ class FeaturesEnricher(TransformerMixin):
1486
1486
  for c in X_sampled.columns.to_list()
1487
1487
  if (
1488
1488
  not self.fit_select_features
1489
- or c in set(self.feature_names_).union(self.id_columns)
1490
- or (self.fit_columns_renaming or {}).get(c, c) in set(self.feature_names_).union(self.id_columns)
1489
+ or c in set(self.feature_names_).union(self.id_columns or [])
1490
+ or (self.fit_columns_renaming or {}).get(c, c) in set(self.feature_names_).union(self.id_columns or [])
1491
1491
  )
1492
1492
  and c
1493
1493
  not in (
@@ -2619,6 +2619,11 @@ if response.status_code == 200:
2619
2619
  self.generate_features = checked_generate_features
2620
2620
  self.runtime_parameters.properties["generate_features"] = ",".join(self.generate_features)
2621
2621
 
2622
+ if self.id_columns is not None:
2623
+ for id_column in self.id_columns:
2624
+ if id_column not in validated_X.columns:
2625
+ raise ValidationError(self.bundle.get("missing_id_column").format(id_column))
2626
+
2622
2627
  validate_scoring_argument(scoring)
2623
2628
 
2624
2629
  self.__log_debug_information(
@@ -3742,7 +3747,8 @@ if response.status_code == 200:
3742
3747
  self.feature_names_.append(feature_meta.name)
3743
3748
  self.feature_importances_.append(_round_shap_value(feature_meta.shap_value))
3744
3749
 
3745
- feature_info = FeatureInfo.from_metadata(feature_meta, features_df, is_client_feature)
3750
+ df_for_sample = features_df if feature_meta.name in features_df.columns else self.X
3751
+ feature_info = FeatureInfo.from_metadata(feature_meta, df_for_sample, is_client_feature)
3746
3752
  features_info.append(feature_info.to_row(self.bundle))
3747
3753
  features_info_without_links.append(feature_info.to_row_without_links(self.bundle))
3748
3754
  internal_features_info.append(feature_info.to_internal_row(self.bundle))
@@ -134,6 +134,7 @@ x_and_eval_x_diff_types=X and eval_set X has different types: {} and {}
134
134
  baseline_score_column_not_exists=baseline_score_column {} doesn't exist in input dataframe
135
135
  baseline_score_column_has_na=baseline_score_column contains NaN. Clear it and and retry
136
136
  missing_features_for_transform=Missing some features for transform that were presented on fit: {}
137
+ missing_id_column=Id column {} not found in X
137
138
  # target validation
138
139
  empty_target=Target is empty in all rows
139
140
  # non_numeric_target=Binary target should be numerical type
@@ -1,6 +1,6 @@
1
- from dataclasses import dataclass
2
1
  import itertools
3
- from typing import Dict, List
2
+ from dataclasses import dataclass
3
+ from typing import Dict, List, Optional
4
4
 
5
5
  import numpy as np
6
6
  import pandas as pd
@@ -8,7 +8,6 @@ import pandas as pd
8
8
  from upgini.metadata import FeaturesMetadataV2
9
9
  from upgini.resource_bundle import ResourceBundle
10
10
 
11
-
12
11
  LLM_SOURCE = "LLM with external data augmentation"
13
12
 
14
13
 
@@ -30,7 +29,9 @@ class FeatureInfo:
30
29
  data_source_link: str
31
30
 
32
31
  @staticmethod
33
- def from_metadata(feature_meta: FeaturesMetadataV2, data: pd.DataFrame, is_client_feature: bool) -> "FeatureInfo":
32
+ def from_metadata(
33
+ feature_meta: FeaturesMetadataV2, data: Optional[pd.DataFrame], is_client_feature: bool
34
+ ) -> "FeatureInfo":
34
35
  return FeatureInfo(
35
36
  name=_get_name(feature_meta),
36
37
  internal_name=_get_internal_name(feature_meta),
@@ -86,8 +87,8 @@ class FeatureInfo:
86
87
  }
87
88
 
88
89
 
89
- def _get_feature_sample(feature_meta: FeaturesMetadataV2, data: pd.DataFrame) -> str:
90
- if feature_meta.name in data.columns:
90
+ def _get_feature_sample(feature_meta: FeaturesMetadataV2, data: Optional[pd.DataFrame]) -> str:
91
+ if data is not None and feature_meta.name in data.columns:
91
92
  feature_sample = np.random.choice(data[feature_meta.name].dropna().unique(), 3).tolist()
92
93
  if len(feature_sample) > 0 and isinstance(feature_sample[0], float):
93
94
  feature_sample = [round(f, 4) for f in feature_sample]
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: upgini
3
- Version: 1.2.48
3
+ Version: 1.2.50
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
6
6
  Project-URL: Homepage, https://upgini.com/
@@ -1,9 +1,9 @@
1
- upgini/__about__.py,sha256=Zm4hDkoG9GX2KfPn6Wt1kdPl888Wv-de5OuQhdWNP9E,23
1
+ upgini/__about__.py,sha256=Mi5DzFmquYseHnFMuFvsBrEztpwNZnhZs1G4xpE08KQ,23
2
2
  upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
3
3
  upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
4
- upgini/dataset.py,sha256=QC3jncWS3wHe4CY7pWWDMO_3HKxGbi0EyPHXMdBtoQM,33456
4
+ upgini/dataset.py,sha256=vT4JyHmafLNbj54SySXr93f5hNS6-t94aFslbBy-7No,33535
5
5
  upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
6
- upgini/features_enricher.py,sha256=wQmmDAI2F7E2805iX-Cpc5v44QRVrCJV8x8j_Ujh38w,200242
6
+ upgini/features_enricher.py,sha256=O-0ZLFp1SPDNf5Yq-dysH8Jm-1c_LpNv2cIdXZ15nK8,200592
7
7
  upgini/http.py,sha256=danPeX7nTMa_70S-pk-4UUm5yOvXYlR84jgyjoHYBkU,43367
8
8
  upgini/lazy_import.py,sha256=74gQ8JuA48BGRLxAo7lNHNKY2D2emMxrUxKGdxVGhuY,1012
9
9
  upgini/metadata.py,sha256=-ibqiNjD7dTagqg53FoEJNEqvAYbwgfyn9PGTRQ_YKU,12054
@@ -30,7 +30,7 @@ upgini/normalizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU
30
30
  upgini/normalizer/normalize_utils.py,sha256=Ft2MwSgVoBilXAORAOYAuwPD79GOLfwn4qQE3IUFzzg,7218
31
31
  upgini/resource_bundle/__init__.py,sha256=S5F2G47pnJd2LDpmFsjDqEwiKkP8Hm-hcseDbMka6Ko,8345
32
32
  upgini/resource_bundle/exceptions.py,sha256=5fRvx0_vWdE1-7HcSgF0tckB4A9AKyf5RiinZkInTsI,621
33
- upgini/resource_bundle/strings.properties,sha256=0jZC0HjyQHeqFCHt6nn1kz7vV0oq92AYQJvy-soAwe4,27304
33
+ upgini/resource_bundle/strings.properties,sha256=0_KAExIi1u48N1CQ13LKJS3bgDlRs-MPOyU3VxcE-qY,27350
34
34
  upgini/resource_bundle/strings_widget.properties,sha256=gOdqvZWntP2LCza_tyVk1_yRYcG4c04K9sQOAVhF_gw,1577
35
35
  upgini/sampler/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
36
36
  upgini/sampler/base.py,sha256=7GpjYqjOp58vYcJLiX__1R5wjUlyQbxvHJ2klFnup_M,6389
@@ -48,7 +48,7 @@ upgini/utils/deduplicate_utils.py,sha256=SMZx9IKIhWI5HqXepfKiQb3uDJrogQZtG6jcWuM
48
48
  upgini/utils/display_utils.py,sha256=DsBjJ8jEYAh8BPgfAbzq5imoGFV6IACP20PQ78BQCX0,11964
49
49
  upgini/utils/email_utils.py,sha256=GbnhHJn1nhUBytmK6PophYqaoq4t7Lp6i0-O0Gd3RV8,5265
50
50
  upgini/utils/fallback_progress_bar.py,sha256=PDaKb8dYpVZaWMroNcOHsTc3pSjgi9mOm0--cOFTwJ0,1074
51
- upgini/utils/feature_info.py,sha256=Tp_2g5-rCjY4NpzKhzxwNxuqH5FFL8vG94OU5kH6wzk,6702
51
+ upgini/utils/feature_info.py,sha256=0rOXSyCj-sw-8migWP0ge8qrOzGU50dQvH0JUJUrDfQ,6766
52
52
  upgini/utils/features_validator.py,sha256=lEfmk4DoxZ4ooOE1HC0ZXtUb_lFKRFHIrnFULZ4_rL8,3746
53
53
  upgini/utils/format.py,sha256=Yv5cvvSs2bOLUzzNu96Pu33VMDNbabio92QepUj41jU,243
54
54
  upgini/utils/ip_utils.py,sha256=VORRmtKlItcbBVVK5SiwXD7J-6Y5rn7UQ5m6WcBXt7E,5698
@@ -59,7 +59,7 @@ upgini/utils/sklearn_ext.py,sha256=13jQS_k7v0aUtudXV6nGUEWjttPQzAW9AFYL5wgEz9k,4
59
59
  upgini/utils/target_utils.py,sha256=RlpKGss9kMibVSlA8iZuO_qxmyeplqzn7X8g6hiGGGs,14341
60
60
  upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
61
61
  upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
62
- upgini-1.2.48.dist-info/METADATA,sha256=kGKDKYGXPVY0vOhgpsz1bq3UiMJEy8szoDRToAvVMuA,49055
63
- upgini-1.2.48.dist-info/WHEEL,sha256=zEMcRr9Kr03x1ozGwg5v9NQBKn3kndp6LSoSlVg-jhU,87
64
- upgini-1.2.48.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
65
- upgini-1.2.48.dist-info/RECORD,,
62
+ upgini-1.2.50.dist-info/METADATA,sha256=PH8ms19Lbu3cuZxySGo9kcBeMkErCLGL8j8X3t2gxbw,49055
63
+ upgini-1.2.50.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
64
+ upgini-1.2.50.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
65
+ upgini-1.2.50.dist-info/RECORD,,
@@ -1,4 +1,4 @@
1
1
  Wheel-Version: 1.0
2
- Generator: hatchling 1.24.2
2
+ Generator: hatchling 1.25.0
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any