upgini 1.1.275a99__tar.gz → 1.1.276__tar.gz

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of upgini might be problematic. Click here for more details.

Files changed (85)
  1. {upgini-1.1.275a99/src/upgini.egg-info → upgini-1.1.276}/PKG-INFO +1 -1
  2. {upgini-1.1.275a99 → upgini-1.1.276}/setup.py +1 -1
  3. {upgini-1.1.275a99 → upgini-1.1.276}/src/upgini/ads.py +6 -2
  4. {upgini-1.1.275a99 → upgini-1.1.276}/src/upgini/dataset.py +4 -3
  5. {upgini-1.1.275a99 → upgini-1.1.276}/src/upgini/features_enricher.py +6 -2
  6. {upgini-1.1.275a99 → upgini-1.1.276}/src/upgini/utils/__init__.py +3 -2
  7. {upgini-1.1.275a99 → upgini-1.1.276}/src/upgini/utils/country_utils.py +2 -2
  8. {upgini-1.1.275a99 → upgini-1.1.276}/src/upgini/utils/datetime_utils.py +4 -4
  9. {upgini-1.1.275a99 → upgini-1.1.276}/src/upgini/utils/email_utils.py +2 -2
  10. {upgini-1.1.275a99 → upgini-1.1.276}/src/upgini/utils/target_utils.py +1 -1
  11. {upgini-1.1.275a99 → upgini-1.1.276/src/upgini.egg-info}/PKG-INFO +1 -1
  12. {upgini-1.1.275a99 → upgini-1.1.276}/LICENSE +0 -0
  13. {upgini-1.1.275a99 → upgini-1.1.276}/README.md +0 -0
  14. {upgini-1.1.275a99 → upgini-1.1.276}/pyproject.toml +0 -0
  15. {upgini-1.1.275a99 → upgini-1.1.276}/setup.cfg +0 -0
  16. {upgini-1.1.275a99 → upgini-1.1.276}/src/upgini/__init__.py +0 -0
  17. {upgini-1.1.275a99 → upgini-1.1.276}/src/upgini/ads_management/__init__.py +0 -0
  18. {upgini-1.1.275a99 → upgini-1.1.276}/src/upgini/ads_management/ads_manager.py +0 -0
  19. {upgini-1.1.275a99 → upgini-1.1.276}/src/upgini/autofe/__init__.py +0 -0
  20. {upgini-1.1.275a99 → upgini-1.1.276}/src/upgini/autofe/all_operands.py +0 -0
  21. {upgini-1.1.275a99 → upgini-1.1.276}/src/upgini/autofe/binary.py +0 -0
  22. {upgini-1.1.275a99 → upgini-1.1.276}/src/upgini/autofe/date.py +0 -0
  23. {upgini-1.1.275a99 → upgini-1.1.276}/src/upgini/autofe/feature.py +0 -0
  24. {upgini-1.1.275a99 → upgini-1.1.276}/src/upgini/autofe/groupby.py +0 -0
  25. {upgini-1.1.275a99 → upgini-1.1.276}/src/upgini/autofe/operand.py +0 -0
  26. {upgini-1.1.275a99 → upgini-1.1.276}/src/upgini/autofe/unary.py +0 -0
  27. {upgini-1.1.275a99 → upgini-1.1.276}/src/upgini/autofe/vector.py +0 -0
  28. {upgini-1.1.275a99 → upgini-1.1.276}/src/upgini/data_source/__init__.py +0 -0
  29. {upgini-1.1.275a99 → upgini-1.1.276}/src/upgini/data_source/data_source_publisher.py +0 -0
  30. {upgini-1.1.275a99 → upgini-1.1.276}/src/upgini/errors.py +0 -0
  31. {upgini-1.1.275a99 → upgini-1.1.276}/src/upgini/fingerprint.js +0 -0
  32. {upgini-1.1.275a99 → upgini-1.1.276}/src/upgini/http.py +0 -0
  33. {upgini-1.1.275a99 → upgini-1.1.276}/src/upgini/mdc/__init__.py +0 -0
  34. {upgini-1.1.275a99 → upgini-1.1.276}/src/upgini/mdc/context.py +0 -0
  35. {upgini-1.1.275a99 → upgini-1.1.276}/src/upgini/metadata.py +0 -0
  36. {upgini-1.1.275a99 → upgini-1.1.276}/src/upgini/metrics.py +0 -0
  37. {upgini-1.1.275a99 → upgini-1.1.276}/src/upgini/normalizer/__init__.py +0 -0
  38. {upgini-1.1.275a99 → upgini-1.1.276}/src/upgini/normalizer/phone_normalizer.py +0 -0
  39. {upgini-1.1.275a99 → upgini-1.1.276}/src/upgini/resource_bundle/__init__.py +0 -0
  40. {upgini-1.1.275a99 → upgini-1.1.276}/src/upgini/resource_bundle/exceptions.py +0 -0
  41. {upgini-1.1.275a99 → upgini-1.1.276}/src/upgini/resource_bundle/strings.properties +0 -0
  42. {upgini-1.1.275a99 → upgini-1.1.276}/src/upgini/resource_bundle/strings_widget.properties +0 -0
  43. {upgini-1.1.275a99 → upgini-1.1.276}/src/upgini/sampler/__init__.py +0 -0
  44. {upgini-1.1.275a99 → upgini-1.1.276}/src/upgini/sampler/base.py +0 -0
  45. {upgini-1.1.275a99 → upgini-1.1.276}/src/upgini/sampler/random_under_sampler.py +0 -0
  46. {upgini-1.1.275a99 → upgini-1.1.276}/src/upgini/sampler/utils.py +0 -0
  47. {upgini-1.1.275a99 → upgini-1.1.276}/src/upgini/search_task.py +0 -0
  48. {upgini-1.1.275a99 → upgini-1.1.276}/src/upgini/spinner.py +0 -0
  49. {upgini-1.1.275a99 → upgini-1.1.276}/src/upgini/utils/base_search_key_detector.py +0 -0
  50. {upgini-1.1.275a99 → upgini-1.1.276}/src/upgini/utils/blocked_time_series.py +0 -0
  51. {upgini-1.1.275a99 → upgini-1.1.276}/src/upgini/utils/custom_loss_utils.py +0 -0
  52. {upgini-1.1.275a99 → upgini-1.1.276}/src/upgini/utils/cv_utils.py +0 -0
  53. {upgini-1.1.275a99 → upgini-1.1.276}/src/upgini/utils/deduplicate_utils.py +0 -0
  54. {upgini-1.1.275a99 → upgini-1.1.276}/src/upgini/utils/display_utils.py +0 -0
  55. {upgini-1.1.275a99 → upgini-1.1.276}/src/upgini/utils/fallback_progress_bar.py +0 -0
  56. {upgini-1.1.275a99 → upgini-1.1.276}/src/upgini/utils/features_validator.py +0 -0
  57. {upgini-1.1.275a99 → upgini-1.1.276}/src/upgini/utils/format.py +0 -0
  58. {upgini-1.1.275a99 → upgini-1.1.276}/src/upgini/utils/ip_utils.py +0 -0
  59. {upgini-1.1.275a99 → upgini-1.1.276}/src/upgini/utils/phone_utils.py +0 -0
  60. {upgini-1.1.275a99 → upgini-1.1.276}/src/upgini/utils/postal_code_utils.py +0 -0
  61. {upgini-1.1.275a99 → upgini-1.1.276}/src/upgini/utils/progress_bar.py +0 -0
  62. {upgini-1.1.275a99 → upgini-1.1.276}/src/upgini/utils/sklearn_ext.py +0 -0
  63. {upgini-1.1.275a99 → upgini-1.1.276}/src/upgini/utils/track_info.py +0 -0
  64. {upgini-1.1.275a99 → upgini-1.1.276}/src/upgini/utils/warning_counter.py +0 -0
  65. {upgini-1.1.275a99 → upgini-1.1.276}/src/upgini/version_validator.py +0 -0
  66. {upgini-1.1.275a99 → upgini-1.1.276}/src/upgini.egg-info/SOURCES.txt +0 -0
  67. {upgini-1.1.275a99 → upgini-1.1.276}/src/upgini.egg-info/dependency_links.txt +0 -0
  68. {upgini-1.1.275a99 → upgini-1.1.276}/src/upgini.egg-info/requires.txt +0 -0
  69. {upgini-1.1.275a99 → upgini-1.1.276}/src/upgini.egg-info/top_level.txt +0 -0
  70. {upgini-1.1.275a99 → upgini-1.1.276}/tests/test_autofe_operands.py +0 -0
  71. {upgini-1.1.275a99 → upgini-1.1.276}/tests/test_binary_dataset.py +0 -0
  72. {upgini-1.1.275a99 → upgini-1.1.276}/tests/test_blocked_time_series.py +0 -0
  73. {upgini-1.1.275a99 → upgini-1.1.276}/tests/test_categorical_dataset.py +0 -0
  74. {upgini-1.1.275a99 → upgini-1.1.276}/tests/test_continuous_dataset.py +0 -0
  75. {upgini-1.1.275a99 → upgini-1.1.276}/tests/test_country_utils.py +0 -0
  76. {upgini-1.1.275a99 → upgini-1.1.276}/tests/test_custom_loss_utils.py +0 -0
  77. {upgini-1.1.275a99 → upgini-1.1.276}/tests/test_datetime_utils.py +0 -0
  78. {upgini-1.1.275a99 → upgini-1.1.276}/tests/test_email_utils.py +0 -0
  79. {upgini-1.1.275a99 → upgini-1.1.276}/tests/test_etalon_validation.py +0 -0
  80. {upgini-1.1.275a99 → upgini-1.1.276}/tests/test_features_enricher.py +0 -0
  81. {upgini-1.1.275a99 → upgini-1.1.276}/tests/test_metrics.py +0 -0
  82. {upgini-1.1.275a99 → upgini-1.1.276}/tests/test_phone_utils.py +0 -0
  83. {upgini-1.1.275a99 → upgini-1.1.276}/tests/test_postal_code_utils.py +0 -0
  84. {upgini-1.1.275a99 → upgini-1.1.276}/tests/test_target_utils.py +0 -0
  85. {upgini-1.1.275a99 → upgini-1.1.276}/tests/test_widget.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: upgini
3
- Version: 1.1.275a99
3
+ Version: 1.1.276
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Home-page: https://upgini.com/
6
6
  Author: Upgini Developers
@@ -40,7 +40,7 @@ def send_log(msg: str):
40
40
 
41
41
 
42
42
  here = Path(__file__).parent.resolve()
43
- version = "1.1.275a99"
43
+ version = "1.1.276"
44
44
  try:
45
45
  send_log(f"Start setup PyLib version {version}")
46
46
  setup(
@@ -5,7 +5,7 @@ from typing import Dict, Optional
5
5
 
6
6
  import numpy as np
7
7
  import pandas as pd
8
- from pandas.api.types import is_string_dtype
8
+ from pandas.api.types import is_object_dtype, is_string_dtype
9
9
 
10
10
  from upgini import SearchKey
11
11
  from upgini.http import get_rest_client
@@ -34,7 +34,11 @@ def upload_user_ads(name: str, df: pd.DataFrame, search_keys: Dict[str, SearchKe
34
34
  if df[column_name].notnull().sum() < min_valid_rows_count:
35
35
  raise ValueError(bundle.get("ads_upload_to_many_empty_rows"))
36
36
  meaning_type = search_keys[column_name].value
37
- if meaning_type == FileColumnMeaningType.MSISDN and not is_string_dtype(df[column_name]):
37
+ if (
38
+ meaning_type == FileColumnMeaningType.MSISDN
39
+ and not is_string_dtype(df[column_name])
40
+ and not is_object_dtype(df[column_name])
41
+ ):
38
42
  df[column_name] = df[column_name].values.astype(np.int64).astype("string") # type: ignore
39
43
  else:
40
44
  meaning_type = FileColumnMeaningType.FEATURE
@@ -17,6 +17,7 @@ from pandas.api.types import (
17
17
  is_numeric_dtype,
18
18
  is_period_dtype,
19
19
  is_string_dtype,
20
+ is_object_dtype,
20
21
  )
21
22
 
22
23
  from upgini.errors import ValidationError
@@ -219,7 +220,7 @@ class Dataset: # (pd.DataFrame):
219
220
  """Check that string values less than maximum characters for LLM"""
220
221
  # self.logger.info("Validate too long string values")
221
222
  for col in self.data.columns:
222
- if is_string_dtype(self.data[col]):
223
+ if is_string_dtype(self.data[col]) or is_object_dtype(self.data[col]):
223
224
  max_length: int = self.data[col].astype("str").str.len().max()
224
225
  if max_length > self.MAX_STRING_FEATURE_LENGTH:
225
226
  self.data[col] = self.data[col].astype("str").str.slice(stop=self.MAX_STRING_FEATURE_LENGTH)
@@ -350,7 +351,7 @@ class Dataset: # (pd.DataFrame):
350
351
  if postal_code is not None and postal_code in self.data.columns:
351
352
  # self.logger.info("Normalize postal code")
352
353
 
353
- if is_string_dtype(self.data[postal_code]):
354
+ if is_string_dtype(self.data[postal_code]) or is_object_dtype(self.data[postal_code]):
354
355
  try:
355
356
  self.data[postal_code] = self.data[postal_code].astype("float64").astype("Int64").astype("string")
356
357
  except Exception:
@@ -821,7 +822,7 @@ class Dataset: # (pd.DataFrame):
821
822
  return DataType.INT
822
823
  elif is_float_dtype(pandas_data_type):
823
824
  return DataType.DECIMAL
824
- elif is_string_dtype(pandas_data_type):
825
+ elif is_string_dtype(pandas_data_type) or is_object_dtype(pandas_data_type):
825
826
  return DataType.STRING
826
827
  else:
827
828
  msg = self.bundle.get("dataset_invalid_column_type").format(column_name, pandas_data_type)
@@ -21,6 +21,7 @@ from pandas.api.types import (
21
21
  is_bool,
22
22
  is_datetime64_any_dtype,
23
23
  is_numeric_dtype,
24
+ is_object_dtype,
24
25
  is_period_dtype,
25
26
  is_string_dtype,
26
27
  )
@@ -2982,7 +2983,7 @@ class FeaturesEnricher(TransformerMixin):
2982
2983
 
2983
2984
  def __correct_target(self, df: pd.DataFrame) -> pd.DataFrame:
2984
2985
  target = df[self.TARGET_NAME]
2985
- if is_string_dtype(target):
2986
+ if is_string_dtype(target) or is_object_dtype(target):
2986
2987
  maybe_numeric_target = pd.to_numeric(target, errors="coerce")
2987
2988
  # If less than 5% is non numeric then leave this rows with NaN target and later it will be dropped
2988
2989
  if maybe_numeric_target.isna().sum() <= _num_samples(df) * 0.05:
@@ -3255,6 +3256,8 @@ class FeaturesEnricher(TransformerMixin):
3255
3256
  descriptions = []
3256
3257
  for m in autofe_meta:
3257
3258
  autofe_feature = Feature.from_formula(m.formula)
3259
+ orig_to_hashed = {base_column.original_name: base_column.hashed_name for base_column in m.base_columns}
3260
+ autofe_feature.rename_columns(orig_to_hashed)
3258
3261
  autofe_feature.set_display_index(m.display_index)
3259
3262
  if autofe_feature.op.is_vector:
3260
3263
  continue
@@ -3382,7 +3385,8 @@ class FeaturesEnricher(TransformerMixin):
3382
3385
  valid_search_keys[column_name] = SearchKey.CUSTOM_KEY
3383
3386
  else:
3384
3387
  if x[column_name].isnull().all() or (
3385
- is_string_dtype(x[column_name]) and (x[column_name].astype("string").str.strip() == "").all()
3388
+ (is_string_dtype(x[column_name]) or is_object_dtype(x[column_name]))
3389
+ and (x[column_name].astype("string").str.strip() == "").all()
3386
3390
  ):
3387
3391
  raise ValidationError(self.bundle.get("empty_search_key").format(column_name))
3388
3392
 
@@ -2,7 +2,7 @@ import itertools
2
2
  from typing import List, Tuple
3
3
 
4
4
  import pandas as pd
5
- from pandas.api.types import is_string_dtype
5
+ from pandas.api.types import is_string_dtype, is_object_dtype
6
6
 
7
7
 
8
8
  def combine_search_keys(search_keys: List[str]) -> List[Tuple[str]]:
@@ -20,5 +20,6 @@ def find_numbers_with_decimal_comma(df: pd.DataFrame) -> pd.DataFrame:
20
20
  return [
21
21
  col
22
22
  for col in tmp.columns
23
- if is_string_dtype(tmp[col]) and tmp[col].astype("string").str.match("^[0-9]+,[0-9]*$").any()
23
+ if (is_string_dtype(tmp[col]) or is_object_dtype(tmp[col]))
24
+ and tmp[col].astype("string").str.match("^[0-9]+,[0-9]*$").any()
24
25
  ]
@@ -1,5 +1,5 @@
1
1
  import pandas as pd
2
- from pandas.api.types import is_string_dtype
2
+ from pandas.api.types import is_string_dtype, is_object_dtype
3
3
 
4
4
  from upgini.utils.base_search_key_detector import BaseSearchKeyDetector
5
5
 
@@ -9,7 +9,7 @@ class CountrySearchKeyDetector(BaseSearchKeyDetector):
9
9
  return "country" in str(column_name).lower()
10
10
 
11
11
  def _is_search_key_by_values(self, column: pd.Series) -> bool:
12
- if not is_string_dtype(column):
12
+ if not is_string_dtype(column) and not is_object_dtype(column):
13
13
  return False
14
14
 
15
15
  all_count = len(column)
@@ -6,7 +6,10 @@ from typing import Dict, List, Optional
6
6
  import numpy as np
7
7
  import pandas as pd
8
8
  from dateutil.relativedelta import relativedelta
9
- from pandas.api.types import is_numeric_dtype, is_period_dtype, is_string_dtype
9
+ from pandas.api.types import (
10
+ is_numeric_dtype,
11
+ is_period_dtype,
12
+ )
10
13
 
11
14
  from upgini.errors import ValidationError
12
15
  from upgini.metadata import SearchKey
@@ -78,9 +81,6 @@ class DateTimeSearchKeyConverter:
78
81
  df[self.date_column] = df[self.date_column].apply(lambda x: x.replace(tzinfo=None))
79
82
  elif isinstance(df[self.date_column].values[0], datetime.date):
80
83
  df[self.date_column] = pd.to_datetime(df[self.date_column], errors="coerce")
81
- elif is_string_dtype(df[self.date_column]):
82
- df[self.date_column] = df[self.date_column].apply(self.clean_date)
83
- df[self.date_column] = self.parse_date(df)
84
84
  elif is_period_dtype(df[self.date_column]):
85
85
  df[self.date_column] = pd.to_datetime(df[self.date_column].astype("string"))
86
86
  elif is_numeric_dtype(df[self.date_column]):
@@ -4,7 +4,7 @@ from hashlib import sha256
4
4
  from typing import Dict, List, Optional
5
5
 
6
6
  import pandas as pd
7
- from pandas.api.types import is_string_dtype
7
+ from pandas.api.types import is_string_dtype, is_object_dtype
8
8
  from upgini.resource_bundle import bundle
9
9
 
10
10
  from upgini.metadata import SearchKey
@@ -18,7 +18,7 @@ class EmailSearchKeyDetector(BaseSearchKeyDetector):
18
18
  return str(column_name).lower() in ["email", "e_mail", "e-mail"]
19
19
 
20
20
  def _is_search_key_by_values(self, column: pd.Series) -> bool:
21
- if not is_string_dtype(column):
21
+ if not is_string_dtype(column) and not is_object_dtype:
22
22
  return False
23
23
  if not column.astype("string").str.contains("@").any():
24
24
  return False
@@ -107,7 +107,7 @@ def balance_undersample(
107
107
  min_class_count = vc[min_class_value]
108
108
 
109
109
  min_class_percent = imbalance_threshold / target_classes_count
110
- min_class_threshold = min_class_percent * count
110
+ min_class_threshold = int(min_class_percent * count)
111
111
 
112
112
  resampled_data = df
113
113
  df = df.copy().sort_values(by=SYSTEM_RECORD_ID)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: upgini
3
- Version: 1.1.275a99
3
+ Version: 1.1.276
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Home-page: https://upgini.com/
6
6
  Author: Upgini Developers
File without changes
File without changes
File without changes
File without changes
File without changes