upgini 1.1.275__py3-none-any.whl → 1.1.275a1__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of upgini might be problematic. Click here for more details.

upgini/ads.py CHANGED
@@ -5,7 +5,7 @@ from typing import Dict, Optional
5
5
 
6
6
  import numpy as np
7
7
  import pandas as pd
8
- from pandas.api.types import is_object_dtype, is_string_dtype
8
+ from pandas.api.types import is_string_dtype
9
9
 
10
10
  from upgini import SearchKey
11
11
  from upgini.http import get_rest_client
@@ -34,11 +34,7 @@ def upload_user_ads(name: str, df: pd.DataFrame, search_keys: Dict[str, SearchKe
34
34
  if df[column_name].notnull().sum() < min_valid_rows_count:
35
35
  raise ValueError(bundle.get("ads_upload_to_many_empty_rows"))
36
36
  meaning_type = search_keys[column_name].value
37
- if (
38
- meaning_type == FileColumnMeaningType.MSISDN
39
- and not is_string_dtype(df[column_name])
40
- and not is_object_dtype(df[column_name])
41
- ):
37
+ if meaning_type == FileColumnMeaningType.MSISDN and not is_string_dtype(df[column_name]):
42
38
  df[column_name] = df[column_name].values.astype(np.int64).astype("string") # type: ignore
43
39
  else:
44
40
  meaning_type = FileColumnMeaningType.FEATURE
upgini/autofe/date.py CHANGED
@@ -2,7 +2,6 @@ from typing import Any, Optional, Union
2
2
  import numpy as np
3
3
  import pandas as pd
4
4
  from pydantic import BaseModel
5
- from pandas.core.arrays.timedeltas import TimedeltaArray
6
5
 
7
6
  from upgini.autofe.operand import PandasOperand
8
7
 
@@ -47,7 +46,6 @@ class DateDiffType2(PandasOperand, DateDiffMixin):
47
46
  future = right + (left.dt.year - right.dt.year).apply(
48
47
  lambda y: np.datetime64("NaT") if np.isnan(y) else pd.tseries.offsets.DateOffset(years=y)
49
48
  )
50
- future = pd.to_datetime(future)
51
49
  before = future[future < left]
52
50
  future[future < left] = before + pd.tseries.offsets.DateOffset(years=1)
53
51
  diff = (future - left) / np.timedelta64(1, self.diff_unit)
@@ -74,13 +72,8 @@ class DateListDiff(PandasOperand, DateDiffMixin):
74
72
 
75
73
  return pd.Series(left - right.values).apply(lambda x: self._agg(self._diff(x)))
76
74
 
77
- def _diff(self, x: TimedeltaArray):
78
- if self.diff_unit == "Y":
79
- x = (x / 365 / 24 / 60 / 60 / 10**9).astype(int)
80
- elif self.diff_unit == "M":
81
- raise Exception("Unsupported difference unit: Month")
82
- else:
83
- x = x / np.timedelta64(1, self.diff_unit)
75
+ def _diff(self, x):
76
+ x = x / np.timedelta64(1, self.diff_unit)
84
77
  return x[x > 0]
85
78
 
86
79
  def _agg(self, x):
@@ -48,7 +48,6 @@ class DataSourcePublisher:
48
48
  data_table_uri: str,
49
49
  search_keys: Dict[str, SearchKey],
50
50
  update_frequency: str,
51
- exclude_from_autofe_generation: Optional[List[str]],
52
51
  secondary_search_keys: Optional[Dict[str, SearchKey]] = None,
53
52
  sort_column: Optional[str] = None,
54
53
  date_format: Optional[str] = None,
@@ -58,6 +57,7 @@ class DataSourcePublisher:
58
57
  join_date_abs_limit_days: Optional[int] = None,
59
58
  features_for_embeddings: Optional[List[str]] = DEFAULT_GENERATE_EMBEDDINGS,
60
59
  data_table_id_to_replace: Optional[str] = None,
60
+ exclude_from_autofe_generation: Optional[List[str]] = None,
61
61
  _force_generation=False,
62
62
  _silent=False,
63
63
  ) -> str:
upgini/dataset.py CHANGED
@@ -17,13 +17,14 @@ from pandas.api.types import (
17
17
  is_numeric_dtype,
18
18
  is_period_dtype,
19
19
  is_string_dtype,
20
- is_object_dtype,
21
20
  )
22
21
 
23
22
  from upgini.errors import ValidationError
24
23
  from upgini.http import ProgressStage, SearchProgress, _RestClient
25
24
  from upgini.metadata import (
25
+ ENTITY_SYSTEM_RECORD_ID,
26
26
  EVAL_SET_INDEX,
27
+ SEARCH_KEY_UNNEST,
27
28
  SYSTEM_COLUMNS,
28
29
  SYSTEM_RECORD_ID,
29
30
  TARGET,
@@ -79,6 +80,7 @@ class Dataset: # (pd.DataFrame):
79
80
  path: Optional[str] = None,
80
81
  meaning_types: Optional[Dict[str, FileColumnMeaningType]] = None,
81
82
  search_keys: Optional[List[Tuple[str, ...]]] = None,
83
+ unnest_search_keys: Optional[List[str]] = None,
82
84
  model_task_type: Optional[ModelTaskType] = None,
83
85
  random_state: Optional[int] = None,
84
86
  rest_client: Optional[_RestClient] = None,
@@ -113,6 +115,7 @@ class Dataset: # (pd.DataFrame):
113
115
  self.description = description
114
116
  self.meaning_types = meaning_types
115
117
  self.search_keys = search_keys
118
+ self.unnest_search_keys = unnest_search_keys
116
119
  self.ignore_columns = []
117
120
  self.hierarchical_group_keys = []
118
121
  self.hierarchical_subgroup_keys = []
@@ -172,7 +175,7 @@ class Dataset: # (pd.DataFrame):
172
175
  new_columns = []
173
176
  dup_counter = 0
174
177
  for column in self.data.columns:
175
- if column in [TARGET, EVAL_SET_INDEX, SYSTEM_RECORD_ID]:
178
+ if column in [TARGET, EVAL_SET_INDEX, SYSTEM_RECORD_ID, ENTITY_SYSTEM_RECORD_ID, SEARCH_KEY_UNNEST]:
176
179
  self.columns_renaming[column] = column
177
180
  new_columns.append(column)
178
181
  continue
@@ -220,7 +223,7 @@ class Dataset: # (pd.DataFrame):
220
223
  """Check that string values less than maximum characters for LLM"""
221
224
  # self.logger.info("Validate too long string values")
222
225
  for col in self.data.columns:
223
- if is_string_dtype(self.data[col]) or is_object_dtype(self.data[col]):
226
+ if is_string_dtype(self.data[col]):
224
227
  max_length: int = self.data[col].astype("str").str.len().max()
225
228
  if max_length > self.MAX_STRING_FEATURE_LENGTH:
226
229
  self.data[col] = self.data[col].astype("str").str.slice(stop=self.MAX_STRING_FEATURE_LENGTH)
@@ -351,9 +354,11 @@ class Dataset: # (pd.DataFrame):
351
354
  if postal_code is not None and postal_code in self.data.columns:
352
355
  # self.logger.info("Normalize postal code")
353
356
 
354
- if is_string_dtype(self.data[postal_code]) or is_object_dtype(self.data[postal_code]):
357
+ if is_string_dtype(self.data[postal_code]):
355
358
  try:
356
- self.data[postal_code] = self.data[postal_code].astype("float64").astype("Int64").astype("string")
359
+ self.data[postal_code] = (
360
+ self.data[postal_code].astype("string").astype("Float64").astype("Int64").astype("string")
361
+ )
357
362
  except Exception:
358
363
  pass
359
364
  elif is_float_dtype(self.data[postal_code]):
@@ -803,6 +808,8 @@ class Dataset: # (pd.DataFrame):
803
808
  meaningType=meaning_type,
804
809
  minMaxValues=min_max_values,
805
810
  )
811
+ if self.unnest_search_keys and column_meta.originalName in self.unnest_search_keys:
812
+ column_meta.isUnnest = True
806
813
 
807
814
  columns.append(column_meta)
808
815
 
@@ -822,7 +829,7 @@ class Dataset: # (pd.DataFrame):
822
829
  return DataType.INT
823
830
  elif is_float_dtype(pandas_data_type):
824
831
  return DataType.DECIMAL
825
- elif is_string_dtype(pandas_data_type) or is_object_dtype(pandas_data_type):
832
+ elif is_string_dtype(pandas_data_type):
826
833
  return DataType.STRING
827
834
  else:
828
835
  msg = self.bundle.get("dataset_invalid_column_type").format(column_name, pandas_data_type)