upgini 1.2.29a6__tar.gz → 1.2.29a7__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of upgini might be problematic. Click here for more details.

Files changed (67)
  1. {upgini-1.2.29a6 → upgini-1.2.29a7}/PKG-INFO +1 -1
  2. upgini-1.2.29a7/src/upgini/__about__.py +1 -0
  3. {upgini-1.2.29a6 → upgini-1.2.29a7}/src/upgini/features_enricher.py +21 -12
  4. {upgini-1.2.29a6 → upgini-1.2.29a7}/src/upgini/http.py +1 -1
  5. {upgini-1.2.29a6 → upgini-1.2.29a7}/src/upgini/utils/datetime_utils.py +6 -4
  6. {upgini-1.2.29a6 → upgini-1.2.29a7}/src/upgini/utils/email_utils.py +3 -2
  7. {upgini-1.2.29a6 → upgini-1.2.29a7}/src/upgini/utils/features_validator.py +13 -1
  8. upgini-1.2.29a6/src/upgini/__about__.py +0 -1
  9. {upgini-1.2.29a6 → upgini-1.2.29a7}/.gitignore +0 -0
  10. {upgini-1.2.29a6 → upgini-1.2.29a7}/LICENSE +0 -0
  11. {upgini-1.2.29a6 → upgini-1.2.29a7}/README.md +0 -0
  12. {upgini-1.2.29a6 → upgini-1.2.29a7}/pyproject.toml +0 -0
  13. {upgini-1.2.29a6 → upgini-1.2.29a7}/src/upgini/__init__.py +0 -0
  14. {upgini-1.2.29a6 → upgini-1.2.29a7}/src/upgini/ads.py +0 -0
  15. {upgini-1.2.29a6 → upgini-1.2.29a7}/src/upgini/ads_management/__init__.py +0 -0
  16. {upgini-1.2.29a6 → upgini-1.2.29a7}/src/upgini/ads_management/ads_manager.py +0 -0
  17. {upgini-1.2.29a6 → upgini-1.2.29a7}/src/upgini/autofe/__init__.py +0 -0
  18. {upgini-1.2.29a6 → upgini-1.2.29a7}/src/upgini/autofe/all_operands.py +0 -0
  19. {upgini-1.2.29a6 → upgini-1.2.29a7}/src/upgini/autofe/binary.py +0 -0
  20. {upgini-1.2.29a6 → upgini-1.2.29a7}/src/upgini/autofe/date.py +0 -0
  21. {upgini-1.2.29a6 → upgini-1.2.29a7}/src/upgini/autofe/feature.py +0 -0
  22. {upgini-1.2.29a6 → upgini-1.2.29a7}/src/upgini/autofe/groupby.py +0 -0
  23. {upgini-1.2.29a6 → upgini-1.2.29a7}/src/upgini/autofe/operand.py +0 -0
  24. {upgini-1.2.29a6 → upgini-1.2.29a7}/src/upgini/autofe/unary.py +0 -0
  25. {upgini-1.2.29a6 → upgini-1.2.29a7}/src/upgini/autofe/vector.py +0 -0
  26. {upgini-1.2.29a6 → upgini-1.2.29a7}/src/upgini/data_source/__init__.py +0 -0
  27. {upgini-1.2.29a6 → upgini-1.2.29a7}/src/upgini/data_source/data_source_publisher.py +0 -0
  28. {upgini-1.2.29a6 → upgini-1.2.29a7}/src/upgini/dataset.py +0 -0
  29. {upgini-1.2.29a6 → upgini-1.2.29a7}/src/upgini/errors.py +0 -0
  30. {upgini-1.2.29a6 → upgini-1.2.29a7}/src/upgini/lazy_import.py +0 -0
  31. {upgini-1.2.29a6 → upgini-1.2.29a7}/src/upgini/mdc/__init__.py +0 -0
  32. {upgini-1.2.29a6 → upgini-1.2.29a7}/src/upgini/mdc/context.py +0 -0
  33. {upgini-1.2.29a6 → upgini-1.2.29a7}/src/upgini/metadata.py +0 -0
  34. {upgini-1.2.29a6 → upgini-1.2.29a7}/src/upgini/metrics.py +0 -0
  35. {upgini-1.2.29a6 → upgini-1.2.29a7}/src/upgini/normalizer/__init__.py +0 -0
  36. {upgini-1.2.29a6 → upgini-1.2.29a7}/src/upgini/normalizer/normalize_utils.py +0 -0
  37. {upgini-1.2.29a6 → upgini-1.2.29a7}/src/upgini/resource_bundle/__init__.py +0 -0
  38. {upgini-1.2.29a6 → upgini-1.2.29a7}/src/upgini/resource_bundle/exceptions.py +0 -0
  39. {upgini-1.2.29a6 → upgini-1.2.29a7}/src/upgini/resource_bundle/strings.properties +0 -0
  40. {upgini-1.2.29a6 → upgini-1.2.29a7}/src/upgini/resource_bundle/strings_widget.properties +0 -0
  41. {upgini-1.2.29a6 → upgini-1.2.29a7}/src/upgini/sampler/__init__.py +0 -0
  42. {upgini-1.2.29a6 → upgini-1.2.29a7}/src/upgini/sampler/base.py +0 -0
  43. {upgini-1.2.29a6 → upgini-1.2.29a7}/src/upgini/sampler/random_under_sampler.py +0 -0
  44. {upgini-1.2.29a6 → upgini-1.2.29a7}/src/upgini/sampler/utils.py +0 -0
  45. {upgini-1.2.29a6 → upgini-1.2.29a7}/src/upgini/search_task.py +0 -0
  46. {upgini-1.2.29a6 → upgini-1.2.29a7}/src/upgini/spinner.py +0 -0
  47. {upgini-1.2.29a6 → upgini-1.2.29a7}/src/upgini/utils/Roboto-Regular.ttf +0 -0
  48. {upgini-1.2.29a6 → upgini-1.2.29a7}/src/upgini/utils/__init__.py +0 -0
  49. {upgini-1.2.29a6 → upgini-1.2.29a7}/src/upgini/utils/base_search_key_detector.py +0 -0
  50. {upgini-1.2.29a6 → upgini-1.2.29a7}/src/upgini/utils/blocked_time_series.py +0 -0
  51. {upgini-1.2.29a6 → upgini-1.2.29a7}/src/upgini/utils/country_utils.py +0 -0
  52. {upgini-1.2.29a6 → upgini-1.2.29a7}/src/upgini/utils/custom_loss_utils.py +0 -0
  53. {upgini-1.2.29a6 → upgini-1.2.29a7}/src/upgini/utils/cv_utils.py +0 -0
  54. {upgini-1.2.29a6 → upgini-1.2.29a7}/src/upgini/utils/deduplicate_utils.py +0 -0
  55. {upgini-1.2.29a6 → upgini-1.2.29a7}/src/upgini/utils/display_utils.py +0 -0
  56. {upgini-1.2.29a6 → upgini-1.2.29a7}/src/upgini/utils/fallback_progress_bar.py +0 -0
  57. {upgini-1.2.29a6 → upgini-1.2.29a7}/src/upgini/utils/feature_info.py +0 -0
  58. {upgini-1.2.29a6 → upgini-1.2.29a7}/src/upgini/utils/format.py +0 -0
  59. {upgini-1.2.29a6 → upgini-1.2.29a7}/src/upgini/utils/ip_utils.py +0 -0
  60. {upgini-1.2.29a6 → upgini-1.2.29a7}/src/upgini/utils/phone_utils.py +0 -0
  61. {upgini-1.2.29a6 → upgini-1.2.29a7}/src/upgini/utils/postal_code_utils.py +0 -0
  62. {upgini-1.2.29a6 → upgini-1.2.29a7}/src/upgini/utils/progress_bar.py +0 -0
  63. {upgini-1.2.29a6 → upgini-1.2.29a7}/src/upgini/utils/sklearn_ext.py +0 -0
  64. {upgini-1.2.29a6 → upgini-1.2.29a7}/src/upgini/utils/target_utils.py +0 -0
  65. {upgini-1.2.29a6 → upgini-1.2.29a7}/src/upgini/utils/track_info.py +0 -0
  66. {upgini-1.2.29a6 → upgini-1.2.29a7}/src/upgini/utils/warning_counter.py +0 -0
  67. {upgini-1.2.29a6 → upgini-1.2.29a7}/src/upgini/version_validator.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: upgini
3
- Version: 1.2.29a6
3
+ Version: 1.2.29a7
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
6
6
  Project-URL: Homepage, https://upgini.com/
@@ -0,0 +1 @@
1
+ __version__ = "1.2.29a7"
@@ -54,7 +54,6 @@ from upgini.metadata import (
54
54
  SYSTEM_RECORD_ID,
55
55
  TARGET,
56
56
  CVType,
57
- FeaturesMetadataV2,
58
57
  FileColumnMeaningType,
59
58
  ModelTaskType,
60
59
  RuntimeParameters,
@@ -1448,7 +1447,11 @@ class FeaturesEnricher(TransformerMixin):
1448
1447
  client_features = [
1449
1448
  c
1450
1449
  for c in X_sampled.columns.to_list()
1451
- if (not self.select_features or c in self.feature_names_)
1450
+ if (
1451
+ not self.select_features
1452
+ or c in self.feature_names_
1453
+ or (self.fit_columns_renaming is not None and self.fit_columns_renaming.get(c) in self.feature_names_)
1454
+ )
1452
1455
  and c
1453
1456
  not in (
1454
1457
  excluding_search_keys
@@ -1665,7 +1668,10 @@ class FeaturesEnricher(TransformerMixin):
1665
1668
  generated_features = []
1666
1669
  if date_column is not None:
1667
1670
  converter = DateTimeSearchKeyConverter(date_column, self.date_format, self.logger, self.bundle)
1668
- df = converter.convert(df, keep_time=True)
1671
+ # Leave original date column values
1672
+ df_with_date_features = converter.convert(df, keep_time=True)
1673
+ df_with_date_features[date_column] = df[date_column]
1674
+ df = df_with_date_features
1669
1675
  generated_features = converter.generated_features
1670
1676
 
1671
1677
  email_columns = SearchKey.find_all_keys(search_keys, SearchKey.EMAIL)
@@ -1674,9 +1680,10 @@ class FeaturesEnricher(TransformerMixin):
1674
1680
  df = generator.generate(df)
1675
1681
  generated_features.extend(generator.generated_features)
1676
1682
 
1677
- normalizer = Normalizer(self.bundle, self.logger)
1678
- df, search_keys, generated_features = normalizer.normalize(df, search_keys, generated_features)
1679
- columns_renaming = normalizer.columns_renaming
1683
+ # normalizer = Normalizer(self.bundle, self.logger)
1684
+ # df, search_keys, generated_features = normalizer.normalize(df, search_keys, generated_features)
1685
+ # columns_renaming = normalizer.columns_renaming
1686
+ columns_renaming = {c: c for c in df.columns}
1680
1687
 
1681
1688
  df, _ = clean_full_duplicates(df, logger=self.logger, bundle=self.bundle)
1682
1689
 
@@ -2106,7 +2113,7 @@ class FeaturesEnricher(TransformerMixin):
2106
2113
  date_column = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
2107
2114
  if date_column is not None:
2108
2115
  converter = DateTimeSearchKeyConverter(date_column, self.date_format, self.logger, bundle=self.bundle)
2109
- df = converter.convert(df)
2116
+ df = converter.convert(df, keep_time=True)
2110
2117
  self.logger.info(f"Date column after convertion: {df[date_column]}")
2111
2118
  generated_features.extend(converter.generated_features)
2112
2119
  else:
@@ -2201,11 +2208,12 @@ class FeaturesEnricher(TransformerMixin):
2201
2208
 
2202
2209
  if add_fit_system_record_id:
2203
2210
  df = self.__add_fit_system_record_id(df, search_keys, SYSTEM_RECORD_ID)
2204
- if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
2205
- df = df.drop(columns=DateTimeSearchKeyConverter.DATETIME_COL)
2206
2211
  df = df.rename(columns={SYSTEM_RECORD_ID: SORT_ID})
2207
2212
  features_not_to_pass.append(SORT_ID)
2208
2213
 
2214
+ if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
2215
+ df = df.drop(columns=DateTimeSearchKeyConverter.DATETIME_COL)
2216
+
2209
2217
  # search keys might be changed after explode
2210
2218
  columns_for_system_record_id = sorted(list(search_keys.keys()) + features_for_transform)
2211
2219
  df[SYSTEM_RECORD_ID] = pd.util.hash_pandas_object(df[columns_for_system_record_id], index=False).astype(
@@ -2224,7 +2232,7 @@ class FeaturesEnricher(TransformerMixin):
2224
2232
 
2225
2233
  combined_search_keys = combine_search_keys(search_keys.keys())
2226
2234
 
2227
- df_without_features = df.drop(columns=features_not_to_pass)
2235
+ df_without_features = df.drop(columns=features_not_to_pass, errors="ignore")
2228
2236
 
2229
2237
  df_without_features, full_duplicates_warning = clean_full_duplicates(
2230
2238
  df_without_features, self.logger, bundle=self.bundle
@@ -2339,7 +2347,9 @@ class FeaturesEnricher(TransformerMixin):
2339
2347
  if c not in self.dropped_client_feature_names_
2340
2348
  ]
2341
2349
  filtered_columns = self.__filtered_enriched_features(importance_threshold, max_features)
2342
- selecting_columns.extend(c for c in filtered_columns if c in result.columns and c not in validated_X.columns)
2350
+ selecting_columns.extend(
2351
+ c for c in filtered_columns if c in result.columns and c not in validated_X.columns
2352
+ )
2343
2353
  if add_fit_system_record_id:
2344
2354
  selecting_columns.append(SORT_ID)
2345
2355
 
@@ -3544,7 +3554,6 @@ class FeaturesEnricher(TransformerMixin):
3544
3554
  ):
3545
3555
  continue
3546
3556
 
3547
-
3548
3557
  self.feature_names_.append(feature_meta.name)
3549
3558
  self.feature_importances_.append(_round_shap_value(feature_meta.shap_value))
3550
3559
 
@@ -882,7 +882,7 @@ class _RestClient:
882
882
  if content_type:
883
883
  headers[_RestClient.CONTENT_TYPE_HEADER_NAME] = content_type
884
884
  if trace_id:
885
- headers[_RestClient.TRACE_ID_HEADER_NAME] = trace_id
885
+ headers[_RestClient.TRACE_ID_HEADER_NAME] = str(trace_id)
886
886
  for header_key, header_value in additional_headers.items():
887
887
  headers[header_key] = header_value
888
888
  return headers
@@ -114,10 +114,12 @@ class DateTimeSearchKeyConverter:
114
114
  period_suffix = f"_{period}" if column != "day_in_quarter" else ""
115
115
  sin_feature = f"datetime_{column}_sin{period_suffix}"
116
116
  cos_feature = f"datetime_{column}_cos{period_suffix}"
117
- df[sin_feature] = np.sin(2 * np.pi * df[column] / period)
118
- df[cos_feature] = np.cos(2 * np.pi * df[column] / period)
119
- self.generated_features.append(sin_feature)
120
- self.generated_features.append(cos_feature)
117
+ if sin_feature not in df.columns:
118
+ df[sin_feature] = np.sin(2 * np.pi * df[column] / period)
119
+ self.generated_features.append(sin_feature)
120
+ if cos_feature not in df.columns:
121
+ df[cos_feature] = np.cos(2 * np.pi * df[column] / period)
122
+ self.generated_features.append(cos_feature)
121
123
 
122
124
  df["quarter"] = df[self.date_column].dt.quarter
123
125
 
@@ -38,8 +38,9 @@ class EmailDomainGenerator:
38
38
  def generate(self, df: pd.DataFrame) -> pd.DataFrame:
39
39
  for email_col in self.email_columns:
40
40
  domain_feature = email_col + self.DOMAIN_SUFFIX
41
- df[domain_feature] = df[email_col].apply(self._email_to_domain)
42
- self.generated_features.append(domain_feature)
41
+ if domain_feature not in df.columns:
42
+ df[domain_feature] = df[email_col].apply(self._email_to_domain).astype("string")
43
+ self.generated_features.append(domain_feature)
43
44
  return df
44
45
 
45
46
  @staticmethod
@@ -2,6 +2,7 @@ import logging
2
2
  from logging import Logger
3
3
  from typing import Dict, List, Optional, Tuple
4
4
 
5
+ import numpy as np
5
6
  import pandas as pd
6
7
  from pandas.api.types import is_integer_dtype, is_object_dtype, is_string_dtype
7
8
 
@@ -83,10 +84,21 @@ class FeaturesValidator:
83
84
  return [
84
85
  i
85
86
  for i in df
86
- if (is_object_dtype(df[i]) or is_string_dtype(df[i]) or is_integer_dtype(df[i]))
87
+ if (is_object_dtype(df[i]) or is_string_dtype(df[i]) or FeaturesValidator.__is_integer(df[i]))
87
88
  and (df[i].nunique(dropna=False) / row_count >= 0.85)
88
89
  ]
89
90
 
91
+ @staticmethod
92
+ def __is_integer(series: pd.Series) -> bool:
93
+ return (
94
+ is_integer_dtype(series)
95
+ or series.dropna()
96
+ .apply(
97
+ lambda f: (float.is_integer(f) and abs(f) < np.iinfo(np.int64).max) if isinstance(f, float) else False
98
+ )
99
+ .all()
100
+ )
101
+
90
102
  @staticmethod
91
103
  def find_constant_features(df: pd.DataFrame) -> List[str]:
92
104
  return [i for i in df if df[i].nunique() <= 1]
@@ -1 +0,0 @@
1
- __version__ = "1.2.29a6"
File without changes
File without changes
File without changes
File without changes
File without changes