upgini 1.1.275a1__tar.gz → 1.1.276__tar.gz

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of upgini might be problematic. Click here for more details.

Files changed (86):
  1. {upgini-1.1.275a1/src/upgini.egg-info → upgini-1.1.276}/PKG-INFO +2 -2
  2. {upgini-1.1.275a1 → upgini-1.1.276}/setup.py +2 -2
  3. {upgini-1.1.275a1 → upgini-1.1.276}/src/upgini/ads.py +6 -2
  4. {upgini-1.1.275a1 → upgini-1.1.276}/src/upgini/autofe/date.py +9 -2
  5. {upgini-1.1.275a1 → upgini-1.1.276}/src/upgini/data_source/data_source_publisher.py +1 -1
  6. {upgini-1.1.275a1 → upgini-1.1.276}/src/upgini/dataset.py +6 -13
  7. {upgini-1.1.275a1 → upgini-1.1.276}/src/upgini/features_enricher.py +156 -220
  8. {upgini-1.1.275a1 → upgini-1.1.276}/src/upgini/metadata.py +1 -9
  9. {upgini-1.1.275a1 → upgini-1.1.276}/src/upgini/metrics.py +12 -0
  10. {upgini-1.1.275a1 → upgini-1.1.276}/src/upgini/normalizer/phone_normalizer.py +2 -2
  11. {upgini-1.1.275a1 → upgini-1.1.276}/src/upgini/resource_bundle/strings.properties +2 -2
  12. {upgini-1.1.275a1 → upgini-1.1.276}/src/upgini/utils/__init__.py +3 -2
  13. upgini-1.1.276/src/upgini/utils/base_search_key_detector.py +25 -0
  14. {upgini-1.1.275a1 → upgini-1.1.276}/src/upgini/utils/country_utils.py +2 -2
  15. {upgini-1.1.275a1 → upgini-1.1.276}/src/upgini/utils/datetime_utils.py +7 -4
  16. {upgini-1.1.275a1 → upgini-1.1.276}/src/upgini/utils/deduplicate_utils.py +1 -11
  17. {upgini-1.1.275a1 → upgini-1.1.276}/src/upgini/utils/email_utils.py +2 -7
  18. {upgini-1.1.275a1 → upgini-1.1.276}/src/upgini/utils/features_validator.py +2 -1
  19. {upgini-1.1.275a1 → upgini-1.1.276}/src/upgini/utils/target_utils.py +1 -1
  20. {upgini-1.1.275a1 → upgini-1.1.276}/src/upgini/utils/track_info.py +25 -13
  21. {upgini-1.1.275a1 → upgini-1.1.276/src/upgini.egg-info}/PKG-INFO +2 -2
  22. {upgini-1.1.275a1 → upgini-1.1.276}/src/upgini.egg-info/requires.txt +1 -1
  23. {upgini-1.1.275a1 → upgini-1.1.276}/tests/test_autofe_operands.py +2 -1
  24. {upgini-1.1.275a1 → upgini-1.1.276}/tests/test_country_utils.py +4 -4
  25. {upgini-1.1.275a1 → upgini-1.1.276}/tests/test_email_utils.py +10 -8
  26. {upgini-1.1.275a1 → upgini-1.1.276}/tests/test_etalon_validation.py +2 -21
  27. {upgini-1.1.275a1 → upgini-1.1.276}/tests/test_features_enricher.py +18 -23
  28. {upgini-1.1.275a1 → upgini-1.1.276}/tests/test_phone_utils.py +6 -6
  29. {upgini-1.1.275a1 → upgini-1.1.276}/tests/test_postal_code_utils.py +6 -6
  30. upgini-1.1.275a1/src/upgini/utils/base_search_key_detector.py +0 -27
  31. {upgini-1.1.275a1 → upgini-1.1.276}/LICENSE +0 -0
  32. {upgini-1.1.275a1 → upgini-1.1.276}/README.md +0 -0
  33. {upgini-1.1.275a1 → upgini-1.1.276}/pyproject.toml +0 -0
  34. {upgini-1.1.275a1 → upgini-1.1.276}/setup.cfg +0 -0
  35. {upgini-1.1.275a1 → upgini-1.1.276}/src/upgini/__init__.py +0 -0
  36. {upgini-1.1.275a1 → upgini-1.1.276}/src/upgini/ads_management/__init__.py +0 -0
  37. {upgini-1.1.275a1 → upgini-1.1.276}/src/upgini/ads_management/ads_manager.py +0 -0
  38. {upgini-1.1.275a1 → upgini-1.1.276}/src/upgini/autofe/__init__.py +0 -0
  39. {upgini-1.1.275a1 → upgini-1.1.276}/src/upgini/autofe/all_operands.py +0 -0
  40. {upgini-1.1.275a1 → upgini-1.1.276}/src/upgini/autofe/binary.py +0 -0
  41. {upgini-1.1.275a1 → upgini-1.1.276}/src/upgini/autofe/feature.py +0 -0
  42. {upgini-1.1.275a1 → upgini-1.1.276}/src/upgini/autofe/groupby.py +0 -0
  43. {upgini-1.1.275a1 → upgini-1.1.276}/src/upgini/autofe/operand.py +0 -0
  44. {upgini-1.1.275a1 → upgini-1.1.276}/src/upgini/autofe/unary.py +0 -0
  45. {upgini-1.1.275a1 → upgini-1.1.276}/src/upgini/autofe/vector.py +0 -0
  46. {upgini-1.1.275a1 → upgini-1.1.276}/src/upgini/data_source/__init__.py +0 -0
  47. {upgini-1.1.275a1 → upgini-1.1.276}/src/upgini/errors.py +0 -0
  48. {upgini-1.1.275a1 → upgini-1.1.276}/src/upgini/fingerprint.js +0 -0
  49. {upgini-1.1.275a1 → upgini-1.1.276}/src/upgini/http.py +0 -0
  50. {upgini-1.1.275a1 → upgini-1.1.276}/src/upgini/mdc/__init__.py +0 -0
  51. {upgini-1.1.275a1 → upgini-1.1.276}/src/upgini/mdc/context.py +0 -0
  52. {upgini-1.1.275a1 → upgini-1.1.276}/src/upgini/normalizer/__init__.py +0 -0
  53. {upgini-1.1.275a1 → upgini-1.1.276}/src/upgini/resource_bundle/__init__.py +0 -0
  54. {upgini-1.1.275a1 → upgini-1.1.276}/src/upgini/resource_bundle/exceptions.py +0 -0
  55. {upgini-1.1.275a1 → upgini-1.1.276}/src/upgini/resource_bundle/strings_widget.properties +0 -0
  56. {upgini-1.1.275a1 → upgini-1.1.276}/src/upgini/sampler/__init__.py +0 -0
  57. {upgini-1.1.275a1 → upgini-1.1.276}/src/upgini/sampler/base.py +0 -0
  58. {upgini-1.1.275a1 → upgini-1.1.276}/src/upgini/sampler/random_under_sampler.py +0 -0
  59. {upgini-1.1.275a1 → upgini-1.1.276}/src/upgini/sampler/utils.py +0 -0
  60. {upgini-1.1.275a1 → upgini-1.1.276}/src/upgini/search_task.py +0 -0
  61. {upgini-1.1.275a1 → upgini-1.1.276}/src/upgini/spinner.py +0 -0
  62. {upgini-1.1.275a1 → upgini-1.1.276}/src/upgini/utils/blocked_time_series.py +0 -0
  63. {upgini-1.1.275a1 → upgini-1.1.276}/src/upgini/utils/custom_loss_utils.py +0 -0
  64. {upgini-1.1.275a1 → upgini-1.1.276}/src/upgini/utils/cv_utils.py +0 -0
  65. {upgini-1.1.275a1 → upgini-1.1.276}/src/upgini/utils/display_utils.py +0 -0
  66. {upgini-1.1.275a1 → upgini-1.1.276}/src/upgini/utils/fallback_progress_bar.py +0 -0
  67. {upgini-1.1.275a1 → upgini-1.1.276}/src/upgini/utils/format.py +0 -0
  68. {upgini-1.1.275a1 → upgini-1.1.276}/src/upgini/utils/ip_utils.py +0 -0
  69. {upgini-1.1.275a1 → upgini-1.1.276}/src/upgini/utils/phone_utils.py +0 -0
  70. {upgini-1.1.275a1 → upgini-1.1.276}/src/upgini/utils/postal_code_utils.py +0 -0
  71. {upgini-1.1.275a1 → upgini-1.1.276}/src/upgini/utils/progress_bar.py +0 -0
  72. {upgini-1.1.275a1 → upgini-1.1.276}/src/upgini/utils/sklearn_ext.py +0 -0
  73. {upgini-1.1.275a1 → upgini-1.1.276}/src/upgini/utils/warning_counter.py +0 -0
  74. {upgini-1.1.275a1 → upgini-1.1.276}/src/upgini/version_validator.py +0 -0
  75. {upgini-1.1.275a1 → upgini-1.1.276}/src/upgini.egg-info/SOURCES.txt +0 -0
  76. {upgini-1.1.275a1 → upgini-1.1.276}/src/upgini.egg-info/dependency_links.txt +0 -0
  77. {upgini-1.1.275a1 → upgini-1.1.276}/src/upgini.egg-info/top_level.txt +0 -0
  78. {upgini-1.1.275a1 → upgini-1.1.276}/tests/test_binary_dataset.py +0 -0
  79. {upgini-1.1.275a1 → upgini-1.1.276}/tests/test_blocked_time_series.py +0 -0
  80. {upgini-1.1.275a1 → upgini-1.1.276}/tests/test_categorical_dataset.py +0 -0
  81. {upgini-1.1.275a1 → upgini-1.1.276}/tests/test_continuous_dataset.py +0 -0
  82. {upgini-1.1.275a1 → upgini-1.1.276}/tests/test_custom_loss_utils.py +0 -0
  83. {upgini-1.1.275a1 → upgini-1.1.276}/tests/test_datetime_utils.py +0 -0
  84. {upgini-1.1.275a1 → upgini-1.1.276}/tests/test_metrics.py +0 -0
  85. {upgini-1.1.275a1 → upgini-1.1.276}/tests/test_target_utils.py +0 -0
  86. {upgini-1.1.275a1 → upgini-1.1.276}/tests/test_widget.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: upgini
3
- Version: 1.1.275a1
3
+ Version: 1.1.276
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Home-page: https://upgini.com/
6
6
  Author: Upgini Developers
@@ -28,7 +28,7 @@ Description-Content-Type: text/markdown
28
28
  License-File: LICENSE
29
29
  Requires-Dist: python-dateutil>=2.8.0
30
30
  Requires-Dist: requests>=2.8.0
31
- Requires-Dist: pandas<2.0.0,>=1.1.0
31
+ Requires-Dist: pandas<3.0.0,>=1.1.0
32
32
  Requires-Dist: numpy>=1.19.0
33
33
  Requires-Dist: scikit-learn>=1.3.0
34
34
  Requires-Dist: pydantic<2.0.0,>=1.8.2
@@ -40,7 +40,7 @@ def send_log(msg: str):
40
40
 
41
41
 
42
42
  here = Path(__file__).parent.resolve()
43
- version = "1.1.275a1"
43
+ version = "1.1.276"
44
44
  try:
45
45
  send_log(f"Start setup PyLib version {version}")
46
46
  setup(
@@ -77,7 +77,7 @@ try:
77
77
  install_requires=[
78
78
  "python-dateutil>=2.8.0",
79
79
  "requests>=2.8.0",
80
- "pandas>=1.1.0,<2.0.0",
80
+ "pandas>=1.1.0,<3.0.0",
81
81
  "numpy>=1.19.0",
82
82
  "scikit-learn>=1.3.0",
83
83
  "pydantic>=1.8.2,<2.0.0",
@@ -5,7 +5,7 @@ from typing import Dict, Optional
5
5
 
6
6
  import numpy as np
7
7
  import pandas as pd
8
- from pandas.api.types import is_string_dtype
8
+ from pandas.api.types import is_object_dtype, is_string_dtype
9
9
 
10
10
  from upgini import SearchKey
11
11
  from upgini.http import get_rest_client
@@ -34,7 +34,11 @@ def upload_user_ads(name: str, df: pd.DataFrame, search_keys: Dict[str, SearchKe
34
34
  if df[column_name].notnull().sum() < min_valid_rows_count:
35
35
  raise ValueError(bundle.get("ads_upload_to_many_empty_rows"))
36
36
  meaning_type = search_keys[column_name].value
37
- if meaning_type == FileColumnMeaningType.MSISDN and not is_string_dtype(df[column_name]):
37
+ if (
38
+ meaning_type == FileColumnMeaningType.MSISDN
39
+ and not is_string_dtype(df[column_name])
40
+ and not is_object_dtype(df[column_name])
41
+ ):
38
42
  df[column_name] = df[column_name].values.astype(np.int64).astype("string") # type: ignore
39
43
  else:
40
44
  meaning_type = FileColumnMeaningType.FEATURE
@@ -2,6 +2,7 @@ from typing import Any, Optional, Union
2
2
  import numpy as np
3
3
  import pandas as pd
4
4
  from pydantic import BaseModel
5
+ from pandas.core.arrays.timedeltas import TimedeltaArray
5
6
 
6
7
  from upgini.autofe.operand import PandasOperand
7
8
 
@@ -46,6 +47,7 @@ class DateDiffType2(PandasOperand, DateDiffMixin):
46
47
  future = right + (left.dt.year - right.dt.year).apply(
47
48
  lambda y: np.datetime64("NaT") if np.isnan(y) else pd.tseries.offsets.DateOffset(years=y)
48
49
  )
50
+ future = pd.to_datetime(future)
49
51
  before = future[future < left]
50
52
  future[future < left] = before + pd.tseries.offsets.DateOffset(years=1)
51
53
  diff = (future - left) / np.timedelta64(1, self.diff_unit)
@@ -72,8 +74,13 @@ class DateListDiff(PandasOperand, DateDiffMixin):
72
74
 
73
75
  return pd.Series(left - right.values).apply(lambda x: self._agg(self._diff(x)))
74
76
 
75
- def _diff(self, x):
76
- x = x / np.timedelta64(1, self.diff_unit)
77
+ def _diff(self, x: TimedeltaArray):
78
+ if self.diff_unit == "Y":
79
+ x = (x / 365 / 24 / 60 / 60 / 10**9).astype(int)
80
+ elif self.diff_unit == "M":
81
+ raise Exception("Unsupported difference unit: Month")
82
+ else:
83
+ x = x / np.timedelta64(1, self.diff_unit)
77
84
  return x[x > 0]
78
85
 
79
86
  def _agg(self, x):
@@ -48,6 +48,7 @@ class DataSourcePublisher:
48
48
  data_table_uri: str,
49
49
  search_keys: Dict[str, SearchKey],
50
50
  update_frequency: str,
51
+ exclude_from_autofe_generation: Optional[List[str]],
51
52
  secondary_search_keys: Optional[Dict[str, SearchKey]] = None,
52
53
  sort_column: Optional[str] = None,
53
54
  date_format: Optional[str] = None,
@@ -57,7 +58,6 @@ class DataSourcePublisher:
57
58
  join_date_abs_limit_days: Optional[int] = None,
58
59
  features_for_embeddings: Optional[List[str]] = DEFAULT_GENERATE_EMBEDDINGS,
59
60
  data_table_id_to_replace: Optional[str] = None,
60
- exclude_from_autofe_generation: Optional[List[str]] = None,
61
61
  _force_generation=False,
62
62
  _silent=False,
63
63
  ) -> str:
@@ -17,14 +17,13 @@ from pandas.api.types import (
17
17
  is_numeric_dtype,
18
18
  is_period_dtype,
19
19
  is_string_dtype,
20
+ is_object_dtype,
20
21
  )
21
22
 
22
23
  from upgini.errors import ValidationError
23
24
  from upgini.http import ProgressStage, SearchProgress, _RestClient
24
25
  from upgini.metadata import (
25
- ENTITY_SYSTEM_RECORD_ID,
26
26
  EVAL_SET_INDEX,
27
- SEARCH_KEY_UNNEST,
28
27
  SYSTEM_COLUMNS,
29
28
  SYSTEM_RECORD_ID,
30
29
  TARGET,
@@ -80,7 +79,6 @@ class Dataset: # (pd.DataFrame):
80
79
  path: Optional[str] = None,
81
80
  meaning_types: Optional[Dict[str, FileColumnMeaningType]] = None,
82
81
  search_keys: Optional[List[Tuple[str, ...]]] = None,
83
- unnest_search_keys: Optional[List[str]] = None,
84
82
  model_task_type: Optional[ModelTaskType] = None,
85
83
  random_state: Optional[int] = None,
86
84
  rest_client: Optional[_RestClient] = None,
@@ -115,7 +113,6 @@ class Dataset: # (pd.DataFrame):
115
113
  self.description = description
116
114
  self.meaning_types = meaning_types
117
115
  self.search_keys = search_keys
118
- self.unnest_search_keys = unnest_search_keys
119
116
  self.ignore_columns = []
120
117
  self.hierarchical_group_keys = []
121
118
  self.hierarchical_subgroup_keys = []
@@ -175,7 +172,7 @@ class Dataset: # (pd.DataFrame):
175
172
  new_columns = []
176
173
  dup_counter = 0
177
174
  for column in self.data.columns:
178
- if column in [TARGET, EVAL_SET_INDEX, SYSTEM_RECORD_ID, ENTITY_SYSTEM_RECORD_ID, SEARCH_KEY_UNNEST]:
175
+ if column in [TARGET, EVAL_SET_INDEX, SYSTEM_RECORD_ID]:
179
176
  self.columns_renaming[column] = column
180
177
  new_columns.append(column)
181
178
  continue
@@ -223,7 +220,7 @@ class Dataset: # (pd.DataFrame):
223
220
  """Check that string values less than maximum characters for LLM"""
224
221
  # self.logger.info("Validate too long string values")
225
222
  for col in self.data.columns:
226
- if is_string_dtype(self.data[col]):
223
+ if is_string_dtype(self.data[col]) or is_object_dtype(self.data[col]):
227
224
  max_length: int = self.data[col].astype("str").str.len().max()
228
225
  if max_length > self.MAX_STRING_FEATURE_LENGTH:
229
226
  self.data[col] = self.data[col].astype("str").str.slice(stop=self.MAX_STRING_FEATURE_LENGTH)
@@ -354,11 +351,9 @@ class Dataset: # (pd.DataFrame):
354
351
  if postal_code is not None and postal_code in self.data.columns:
355
352
  # self.logger.info("Normalize postal code")
356
353
 
357
- if is_string_dtype(self.data[postal_code]):
354
+ if is_string_dtype(self.data[postal_code]) or is_object_dtype(self.data[postal_code]):
358
355
  try:
359
- self.data[postal_code] = (
360
- self.data[postal_code].astype("string").astype("Float64").astype("Int64").astype("string")
361
- )
356
+ self.data[postal_code] = self.data[postal_code].astype("float64").astype("Int64").astype("string")
362
357
  except Exception:
363
358
  pass
364
359
  elif is_float_dtype(self.data[postal_code]):
@@ -808,8 +803,6 @@ class Dataset: # (pd.DataFrame):
808
803
  meaningType=meaning_type,
809
804
  minMaxValues=min_max_values,
810
805
  )
811
- if self.unnest_search_keys and column_meta.originalName in self.unnest_search_keys:
812
- column_meta.isUnnest = True
813
806
 
814
807
  columns.append(column_meta)
815
808
 
@@ -829,7 +822,7 @@ class Dataset: # (pd.DataFrame):
829
822
  return DataType.INT
830
823
  elif is_float_dtype(pandas_data_type):
831
824
  return DataType.DECIMAL
832
- elif is_string_dtype(pandas_data_type):
825
+ elif is_string_dtype(pandas_data_type) or is_object_dtype(pandas_data_type):
833
826
  return DataType.STRING
834
827
  else:
835
828
  msg = self.bundle.get("dataset_invalid_column_type").format(column_name, pandas_data_type)