upgini 1.2.29a5__tar.gz → 1.2.29a7__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of upgini might be problematic. Click here for more details.

Files changed (67)
  1. {upgini-1.2.29a5 → upgini-1.2.29a7}/PKG-INFO +1 -1
  2. upgini-1.2.29a7/src/upgini/__about__.py +1 -0
  3. {upgini-1.2.29a5 → upgini-1.2.29a7}/src/upgini/features_enricher.py +45 -24
  4. {upgini-1.2.29a5 → upgini-1.2.29a7}/src/upgini/http.py +1 -1
  5. {upgini-1.2.29a5 → upgini-1.2.29a7}/src/upgini/utils/datetime_utils.py +6 -4
  6. {upgini-1.2.29a5 → upgini-1.2.29a7}/src/upgini/utils/email_utils.py +3 -2
  7. {upgini-1.2.29a5 → upgini-1.2.29a7}/src/upgini/utils/features_validator.py +13 -1
  8. upgini-1.2.29a5/src/upgini/__about__.py +0 -1
  9. {upgini-1.2.29a5 → upgini-1.2.29a7}/.gitignore +0 -0
  10. {upgini-1.2.29a5 → upgini-1.2.29a7}/LICENSE +0 -0
  11. {upgini-1.2.29a5 → upgini-1.2.29a7}/README.md +0 -0
  12. {upgini-1.2.29a5 → upgini-1.2.29a7}/pyproject.toml +0 -0
  13. {upgini-1.2.29a5 → upgini-1.2.29a7}/src/upgini/__init__.py +0 -0
  14. {upgini-1.2.29a5 → upgini-1.2.29a7}/src/upgini/ads.py +0 -0
  15. {upgini-1.2.29a5 → upgini-1.2.29a7}/src/upgini/ads_management/__init__.py +0 -0
  16. {upgini-1.2.29a5 → upgini-1.2.29a7}/src/upgini/ads_management/ads_manager.py +0 -0
  17. {upgini-1.2.29a5 → upgini-1.2.29a7}/src/upgini/autofe/__init__.py +0 -0
  18. {upgini-1.2.29a5 → upgini-1.2.29a7}/src/upgini/autofe/all_operands.py +0 -0
  19. {upgini-1.2.29a5 → upgini-1.2.29a7}/src/upgini/autofe/binary.py +0 -0
  20. {upgini-1.2.29a5 → upgini-1.2.29a7}/src/upgini/autofe/date.py +0 -0
  21. {upgini-1.2.29a5 → upgini-1.2.29a7}/src/upgini/autofe/feature.py +0 -0
  22. {upgini-1.2.29a5 → upgini-1.2.29a7}/src/upgini/autofe/groupby.py +0 -0
  23. {upgini-1.2.29a5 → upgini-1.2.29a7}/src/upgini/autofe/operand.py +0 -0
  24. {upgini-1.2.29a5 → upgini-1.2.29a7}/src/upgini/autofe/unary.py +0 -0
  25. {upgini-1.2.29a5 → upgini-1.2.29a7}/src/upgini/autofe/vector.py +0 -0
  26. {upgini-1.2.29a5 → upgini-1.2.29a7}/src/upgini/data_source/__init__.py +0 -0
  27. {upgini-1.2.29a5 → upgini-1.2.29a7}/src/upgini/data_source/data_source_publisher.py +0 -0
  28. {upgini-1.2.29a5 → upgini-1.2.29a7}/src/upgini/dataset.py +0 -0
  29. {upgini-1.2.29a5 → upgini-1.2.29a7}/src/upgini/errors.py +0 -0
  30. {upgini-1.2.29a5 → upgini-1.2.29a7}/src/upgini/lazy_import.py +0 -0
  31. {upgini-1.2.29a5 → upgini-1.2.29a7}/src/upgini/mdc/__init__.py +0 -0
  32. {upgini-1.2.29a5 → upgini-1.2.29a7}/src/upgini/mdc/context.py +0 -0
  33. {upgini-1.2.29a5 → upgini-1.2.29a7}/src/upgini/metadata.py +0 -0
  34. {upgini-1.2.29a5 → upgini-1.2.29a7}/src/upgini/metrics.py +0 -0
  35. {upgini-1.2.29a5 → upgini-1.2.29a7}/src/upgini/normalizer/__init__.py +0 -0
  36. {upgini-1.2.29a5 → upgini-1.2.29a7}/src/upgini/normalizer/normalize_utils.py +0 -0
  37. {upgini-1.2.29a5 → upgini-1.2.29a7}/src/upgini/resource_bundle/__init__.py +0 -0
  38. {upgini-1.2.29a5 → upgini-1.2.29a7}/src/upgini/resource_bundle/exceptions.py +0 -0
  39. {upgini-1.2.29a5 → upgini-1.2.29a7}/src/upgini/resource_bundle/strings.properties +0 -0
  40. {upgini-1.2.29a5 → upgini-1.2.29a7}/src/upgini/resource_bundle/strings_widget.properties +0 -0
  41. {upgini-1.2.29a5 → upgini-1.2.29a7}/src/upgini/sampler/__init__.py +0 -0
  42. {upgini-1.2.29a5 → upgini-1.2.29a7}/src/upgini/sampler/base.py +0 -0
  43. {upgini-1.2.29a5 → upgini-1.2.29a7}/src/upgini/sampler/random_under_sampler.py +0 -0
  44. {upgini-1.2.29a5 → upgini-1.2.29a7}/src/upgini/sampler/utils.py +0 -0
  45. {upgini-1.2.29a5 → upgini-1.2.29a7}/src/upgini/search_task.py +0 -0
  46. {upgini-1.2.29a5 → upgini-1.2.29a7}/src/upgini/spinner.py +0 -0
  47. {upgini-1.2.29a5 → upgini-1.2.29a7}/src/upgini/utils/Roboto-Regular.ttf +0 -0
  48. {upgini-1.2.29a5 → upgini-1.2.29a7}/src/upgini/utils/__init__.py +0 -0
  49. {upgini-1.2.29a5 → upgini-1.2.29a7}/src/upgini/utils/base_search_key_detector.py +0 -0
  50. {upgini-1.2.29a5 → upgini-1.2.29a7}/src/upgini/utils/blocked_time_series.py +0 -0
  51. {upgini-1.2.29a5 → upgini-1.2.29a7}/src/upgini/utils/country_utils.py +0 -0
  52. {upgini-1.2.29a5 → upgini-1.2.29a7}/src/upgini/utils/custom_loss_utils.py +0 -0
  53. {upgini-1.2.29a5 → upgini-1.2.29a7}/src/upgini/utils/cv_utils.py +0 -0
  54. {upgini-1.2.29a5 → upgini-1.2.29a7}/src/upgini/utils/deduplicate_utils.py +0 -0
  55. {upgini-1.2.29a5 → upgini-1.2.29a7}/src/upgini/utils/display_utils.py +0 -0
  56. {upgini-1.2.29a5 → upgini-1.2.29a7}/src/upgini/utils/fallback_progress_bar.py +0 -0
  57. {upgini-1.2.29a5 → upgini-1.2.29a7}/src/upgini/utils/feature_info.py +0 -0
  58. {upgini-1.2.29a5 → upgini-1.2.29a7}/src/upgini/utils/format.py +0 -0
  59. {upgini-1.2.29a5 → upgini-1.2.29a7}/src/upgini/utils/ip_utils.py +0 -0
  60. {upgini-1.2.29a5 → upgini-1.2.29a7}/src/upgini/utils/phone_utils.py +0 -0
  61. {upgini-1.2.29a5 → upgini-1.2.29a7}/src/upgini/utils/postal_code_utils.py +0 -0
  62. {upgini-1.2.29a5 → upgini-1.2.29a7}/src/upgini/utils/progress_bar.py +0 -0
  63. {upgini-1.2.29a5 → upgini-1.2.29a7}/src/upgini/utils/sklearn_ext.py +0 -0
  64. {upgini-1.2.29a5 → upgini-1.2.29a7}/src/upgini/utils/target_utils.py +0 -0
  65. {upgini-1.2.29a5 → upgini-1.2.29a7}/src/upgini/utils/track_info.py +0 -0
  66. {upgini-1.2.29a5 → upgini-1.2.29a7}/src/upgini/utils/warning_counter.py +0 -0
  67. {upgini-1.2.29a5 → upgini-1.2.29a7}/src/upgini/version_validator.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: upgini
3
- Version: 1.2.29a5
3
+ Version: 1.2.29a7
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
6
6
  Project-URL: Homepage, https://upgini.com/
@@ -0,0 +1 @@
1
+ __version__ = "1.2.29a7"
@@ -2,6 +2,7 @@ import dataclasses
2
2
  import datetime
3
3
  import gc
4
4
  import hashlib
5
+ import itertools
5
6
  import logging
6
7
  import numbers
7
8
  import os
@@ -53,7 +54,6 @@ from upgini.metadata import (
53
54
  SYSTEM_RECORD_ID,
54
55
  TARGET,
55
56
  CVType,
56
- FeaturesMetadataV2,
57
57
  FileColumnMeaningType,
58
58
  ModelTaskType,
59
59
  RuntimeParameters,
@@ -159,6 +159,10 @@ class FeaturesEnricher(TransformerMixin):
159
159
 
160
160
  shared_datasets: list of str, optional (default=None)
161
161
  List of private shared dataset ids for custom search
162
+
163
+ select_features: bool, optional (default=False)
164
+ If True, return only selected features both from input and data sources.
165
+ Otherwise, return all features from input and only selected features from data sources.
162
166
  """
163
167
 
164
168
  TARGET_NAME = "target"
@@ -279,7 +283,7 @@ class FeaturesEnricher(TransformerMixin):
279
283
  self._relevant_data_sources_wo_links: pd.DataFrame = self.EMPTY_DATA_SOURCES
280
284
  self.metrics: Optional[pd.DataFrame] = None
281
285
  self.feature_names_ = []
282
- self.client_feature_names_ = []
286
+ self.dropped_client_feature_names_ = []
283
287
  self.feature_importances_ = []
284
288
  self.search_id = search_id
285
289
  self.select_features = select_features
@@ -1443,7 +1447,11 @@ class FeaturesEnricher(TransformerMixin):
1443
1447
  client_features = [
1444
1448
  c
1445
1449
  for c in X_sampled.columns.to_list()
1446
- if (not self.select_features or c in self.feature_names_)
1450
+ if (
1451
+ not self.select_features
1452
+ or c in self.feature_names_
1453
+ or (self.fit_columns_renaming is not None and self.fit_columns_renaming.get(c) in self.feature_names_)
1454
+ )
1447
1455
  and c
1448
1456
  not in (
1449
1457
  excluding_search_keys
@@ -1660,7 +1668,10 @@ class FeaturesEnricher(TransformerMixin):
1660
1668
  generated_features = []
1661
1669
  if date_column is not None:
1662
1670
  converter = DateTimeSearchKeyConverter(date_column, self.date_format, self.logger, self.bundle)
1663
- df = converter.convert(df, keep_time=True)
1671
+ # Leave original date column values
1672
+ df_with_date_features = converter.convert(df, keep_time=True)
1673
+ df_with_date_features[date_column] = df[date_column]
1674
+ df = df_with_date_features
1664
1675
  generated_features = converter.generated_features
1665
1676
 
1666
1677
  email_columns = SearchKey.find_all_keys(search_keys, SearchKey.EMAIL)
@@ -1669,9 +1680,10 @@ class FeaturesEnricher(TransformerMixin):
1669
1680
  df = generator.generate(df)
1670
1681
  generated_features.extend(generator.generated_features)
1671
1682
 
1672
- normalizer = Normalizer(self.bundle, self.logger)
1673
- df, search_keys, generated_features = normalizer.normalize(df, search_keys, generated_features)
1674
- columns_renaming = normalizer.columns_renaming
1683
+ # normalizer = Normalizer(self.bundle, self.logger)
1684
+ # df, search_keys, generated_features = normalizer.normalize(df, search_keys, generated_features)
1685
+ # columns_renaming = normalizer.columns_renaming
1686
+ columns_renaming = {c: c for c in df.columns}
1675
1687
 
1676
1688
  df, _ = clean_full_duplicates(df, logger=self.logger, bundle=self.bundle)
1677
1689
 
@@ -2071,7 +2083,7 @@ class FeaturesEnricher(TransformerMixin):
2071
2083
  is_demo_dataset = hash_input(validated_X) in DEMO_DATASET_HASHES
2072
2084
 
2073
2085
  columns_to_drop = [
2074
- c for c in validated_X.columns if c in self.feature_names_ and c not in self.client_feature_names_
2086
+ c for c in validated_X.columns if c in self.feature_names_ and c in self.dropped_client_feature_names_
2075
2087
  ]
2076
2088
  if len(columns_to_drop) > 0:
2077
2089
  msg = self.bundle.get("x_contains_enriching_columns").format(columns_to_drop)
@@ -2101,7 +2113,7 @@ class FeaturesEnricher(TransformerMixin):
2101
2113
  date_column = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
2102
2114
  if date_column is not None:
2103
2115
  converter = DateTimeSearchKeyConverter(date_column, self.date_format, self.logger, bundle=self.bundle)
2104
- df = converter.convert(df)
2116
+ df = converter.convert(df, keep_time=True)
2105
2117
  self.logger.info(f"Date column after convertion: {df[date_column]}")
2106
2118
  generated_features.extend(converter.generated_features)
2107
2119
  else:
@@ -2196,11 +2208,12 @@ class FeaturesEnricher(TransformerMixin):
2196
2208
 
2197
2209
  if add_fit_system_record_id:
2198
2210
  df = self.__add_fit_system_record_id(df, search_keys, SYSTEM_RECORD_ID)
2199
- if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
2200
- df = df.drop(columns=DateTimeSearchKeyConverter.DATETIME_COL)
2201
2211
  df = df.rename(columns={SYSTEM_RECORD_ID: SORT_ID})
2202
2212
  features_not_to_pass.append(SORT_ID)
2203
2213
 
2214
+ if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
2215
+ df = df.drop(columns=DateTimeSearchKeyConverter.DATETIME_COL)
2216
+
2204
2217
  # search keys might be changed after explode
2205
2218
  columns_for_system_record_id = sorted(list(search_keys.keys()) + features_for_transform)
2206
2219
  df[SYSTEM_RECORD_ID] = pd.util.hash_pandas_object(df[columns_for_system_record_id], index=False).astype(
@@ -2219,7 +2232,7 @@ class FeaturesEnricher(TransformerMixin):
2219
2232
 
2220
2233
  combined_search_keys = combine_search_keys(search_keys.keys())
2221
2234
 
2222
- df_without_features = df.drop(columns=features_not_to_pass)
2235
+ df_without_features = df.drop(columns=features_not_to_pass, errors="ignore")
2223
2236
 
2224
2237
  df_without_features, full_duplicates_warning = clean_full_duplicates(
2225
2238
  df_without_features, self.logger, bundle=self.bundle
@@ -2328,11 +2341,15 @@ class FeaturesEnricher(TransformerMixin):
2328
2341
  else:
2329
2342
  result = enrich()
2330
2343
 
2344
+ selecting_columns = [
2345
+ c
2346
+ for c in itertools.chain(validated_X.columns.tolist(), generated_features)
2347
+ if c not in self.dropped_client_feature_names_
2348
+ ]
2331
2349
  filtered_columns = self.__filtered_enriched_features(importance_threshold, max_features)
2332
- existing_filtered_columns = [
2350
+ selecting_columns.extend(
2333
2351
  c for c in filtered_columns if c in result.columns and c not in validated_X.columns
2334
- ]
2335
- selecting_columns = validated_X.columns.tolist() + generated_features + existing_filtered_columns
2352
+ )
2336
2353
  if add_fit_system_record_id:
2337
2354
  selecting_columns.append(SORT_ID)
2338
2355
 
@@ -3510,7 +3527,7 @@ class FeaturesEnricher(TransformerMixin):
3510
3527
  features_df = self._search_task.get_all_initial_raw_features(trace_id, metrics_calculation=True)
3511
3528
 
3512
3529
  self.feature_names_ = []
3513
- self.client_feature_names_ = []
3530
+ self.dropped_client_feature_names_ = []
3514
3531
  self.feature_importances_ = []
3515
3532
  features_info = []
3516
3533
  features_info_without_links = []
@@ -3520,19 +3537,23 @@ class FeaturesEnricher(TransformerMixin):
3520
3537
  for feature_meta in features_meta:
3521
3538
  if feature_meta.name in original_names_dict.keys():
3522
3539
  feature_meta.name = original_names_dict[feature_meta.name]
3540
+
3541
+ is_client_feature = feature_meta.name in x_columns
3542
+
3543
+ if feature_meta.shap_value == 0.0:
3544
+ if self.select_features:
3545
+ self.dropped_client_feature_names_.append(feature_meta.name)
3546
+ continue
3547
+
3523
3548
  # Use only important features
3524
3549
  if (
3525
- (feature_meta.shap_value == 0.0)
3526
- or (feature_meta.name in self.fit_generated_features)
3527
- or (feature_meta.name == COUNTRY)
3550
+ feature_meta.name in self.fit_generated_features
3551
+ or feature_meta.name == COUNTRY
3552
+ # In select_features mode we select also from etalon features and need to show them
3553
+ or (not self.select_features and is_client_feature)
3528
3554
  ):
3529
3555
  continue
3530
3556
 
3531
- is_client_feature = feature_meta.name in x_columns
3532
- # In select_features mode we select also from etalon features and need to show them
3533
- if not self.select_features and is_client_feature:
3534
- continue
3535
-
3536
3557
  self.feature_names_.append(feature_meta.name)
3537
3558
  self.feature_importances_.append(_round_shap_value(feature_meta.shap_value))
3538
3559
 
@@ -882,7 +882,7 @@ class _RestClient:
882
882
  if content_type:
883
883
  headers[_RestClient.CONTENT_TYPE_HEADER_NAME] = content_type
884
884
  if trace_id:
885
- headers[_RestClient.TRACE_ID_HEADER_NAME] = trace_id
885
+ headers[_RestClient.TRACE_ID_HEADER_NAME] = str(trace_id)
886
886
  for header_key, header_value in additional_headers.items():
887
887
  headers[header_key] = header_value
888
888
  return headers
@@ -114,10 +114,12 @@ class DateTimeSearchKeyConverter:
114
114
  period_suffix = f"_{period}" if column != "day_in_quarter" else ""
115
115
  sin_feature = f"datetime_{column}_sin{period_suffix}"
116
116
  cos_feature = f"datetime_{column}_cos{period_suffix}"
117
- df[sin_feature] = np.sin(2 * np.pi * df[column] / period)
118
- df[cos_feature] = np.cos(2 * np.pi * df[column] / period)
119
- self.generated_features.append(sin_feature)
120
- self.generated_features.append(cos_feature)
117
+ if sin_feature not in df.columns:
118
+ df[sin_feature] = np.sin(2 * np.pi * df[column] / period)
119
+ self.generated_features.append(sin_feature)
120
+ if cos_feature not in df.columns:
121
+ df[cos_feature] = np.cos(2 * np.pi * df[column] / period)
122
+ self.generated_features.append(cos_feature)
121
123
 
122
124
  df["quarter"] = df[self.date_column].dt.quarter
123
125
 
@@ -38,8 +38,9 @@ class EmailDomainGenerator:
38
38
  def generate(self, df: pd.DataFrame) -> pd.DataFrame:
39
39
  for email_col in self.email_columns:
40
40
  domain_feature = email_col + self.DOMAIN_SUFFIX
41
- df[domain_feature] = df[email_col].apply(self._email_to_domain)
42
- self.generated_features.append(domain_feature)
41
+ if domain_feature not in df.columns:
42
+ df[domain_feature] = df[email_col].apply(self._email_to_domain).astype("string")
43
+ self.generated_features.append(domain_feature)
43
44
  return df
44
45
 
45
46
  @staticmethod
@@ -2,6 +2,7 @@ import logging
2
2
  from logging import Logger
3
3
  from typing import Dict, List, Optional, Tuple
4
4
 
5
+ import numpy as np
5
6
  import pandas as pd
6
7
  from pandas.api.types import is_integer_dtype, is_object_dtype, is_string_dtype
7
8
 
@@ -83,10 +84,21 @@ class FeaturesValidator:
83
84
  return [
84
85
  i
85
86
  for i in df
86
- if (is_object_dtype(df[i]) or is_string_dtype(df[i]) or is_integer_dtype(df[i]))
87
+ if (is_object_dtype(df[i]) or is_string_dtype(df[i]) or FeaturesValidator.__is_integer(df[i]))
87
88
  and (df[i].nunique(dropna=False) / row_count >= 0.85)
88
89
  ]
89
90
 
91
+ @staticmethod
92
+ def __is_integer(series: pd.Series) -> bool:
93
+ return (
94
+ is_integer_dtype(series)
95
+ or series.dropna()
96
+ .apply(
97
+ lambda f: (float.is_integer(f) and abs(f) < np.iinfo(np.int64).max) if isinstance(f, float) else False
98
+ )
99
+ .all()
100
+ )
101
+
90
102
  @staticmethod
91
103
  def find_constant_features(df: pd.DataFrame) -> List[str]:
92
104
  return [i for i in df if df[i].nunique() <= 1]
@@ -1 +0,0 @@
1
- __version__ = "1.2.29a5"
File without changes
File without changes
File without changes
File without changes
File without changes