upgini 1.1.275__py3-none-any.whl → 1.1.275a1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- upgini/ads.py +2 -6
- upgini/autofe/date.py +2 -9
- upgini/data_source/data_source_publisher.py +1 -1
- upgini/dataset.py +13 -6
- upgini/features_enricher.py +220 -154
- upgini/metadata.py +9 -1
- upgini/metrics.py +0 -12
- upgini/normalizer/phone_normalizer.py +2 -2
- upgini/resource_bundle/strings.properties +2 -2
- upgini/utils/__init__.py +2 -3
- upgini/utils/base_search_key_detector.py +14 -12
- upgini/utils/country_utils.py +2 -2
- upgini/utils/datetime_utils.py +4 -7
- upgini/utils/deduplicate_utils.py +11 -1
- upgini/utils/email_utils.py +7 -2
- upgini/utils/features_validator.py +1 -2
- upgini/utils/target_utils.py +1 -1
- upgini/utils/track_info.py +13 -25
- {upgini-1.1.275.dist-info → upgini-1.1.275a1.dist-info}/METADATA +2 -2
- {upgini-1.1.275.dist-info → upgini-1.1.275a1.dist-info}/RECORD +23 -23
- {upgini-1.1.275.dist-info → upgini-1.1.275a1.dist-info}/LICENSE +0 -0
- {upgini-1.1.275.dist-info → upgini-1.1.275a1.dist-info}/WHEEL +0 -0
- {upgini-1.1.275.dist-info → upgini-1.1.275a1.dist-info}/top_level.txt +0 -0
upgini/ads.py
CHANGED
|
@@ -5,7 +5,7 @@ from typing import Dict, Optional
|
|
|
5
5
|
|
|
6
6
|
import numpy as np
|
|
7
7
|
import pandas as pd
|
|
8
|
-
from pandas.api.types import is_object_dtype, is_string_dtype
|
|
8
|
+
from pandas.api.types import is_string_dtype
|
|
9
9
|
|
|
10
10
|
from upgini import SearchKey
|
|
11
11
|
from upgini.http import get_rest_client
|
|
@@ -34,11 +34,7 @@ def upload_user_ads(name: str, df: pd.DataFrame, search_keys: Dict[str, SearchKe
|
|
|
34
34
|
if df[column_name].notnull().sum() < min_valid_rows_count:
|
|
35
35
|
raise ValueError(bundle.get("ads_upload_to_many_empty_rows"))
|
|
36
36
|
meaning_type = search_keys[column_name].value
|
|
37
|
-
if (
|
|
38
|
-
meaning_type == FileColumnMeaningType.MSISDN
|
|
39
|
-
and not is_string_dtype(df[column_name])
|
|
40
|
-
and not is_object_dtype(df[column_name])
|
|
41
|
-
):
|
|
37
|
+
if meaning_type == FileColumnMeaningType.MSISDN and not is_string_dtype(df[column_name]):
|
|
42
38
|
df[column_name] = df[column_name].values.astype(np.int64).astype("string") # type: ignore
|
|
43
39
|
else:
|
|
44
40
|
meaning_type = FileColumnMeaningType.FEATURE
|
upgini/autofe/date.py
CHANGED
|
@@ -2,7 +2,6 @@ from typing import Any, Optional, Union
|
|
|
2
2
|
import numpy as np
|
|
3
3
|
import pandas as pd
|
|
4
4
|
from pydantic import BaseModel
|
|
5
|
-
from pandas.core.arrays.timedeltas import TimedeltaArray
|
|
6
5
|
|
|
7
6
|
from upgini.autofe.operand import PandasOperand
|
|
8
7
|
|
|
@@ -47,7 +46,6 @@ class DateDiffType2(PandasOperand, DateDiffMixin):
|
|
|
47
46
|
future = right + (left.dt.year - right.dt.year).apply(
|
|
48
47
|
lambda y: np.datetime64("NaT") if np.isnan(y) else pd.tseries.offsets.DateOffset(years=y)
|
|
49
48
|
)
|
|
50
|
-
future = pd.to_datetime(future)
|
|
51
49
|
before = future[future < left]
|
|
52
50
|
future[future < left] = before + pd.tseries.offsets.DateOffset(years=1)
|
|
53
51
|
diff = (future - left) / np.timedelta64(1, self.diff_unit)
|
|
@@ -74,13 +72,8 @@ class DateListDiff(PandasOperand, DateDiffMixin):
|
|
|
74
72
|
|
|
75
73
|
return pd.Series(left - right.values).apply(lambda x: self._agg(self._diff(x)))
|
|
76
74
|
|
|
77
|
-
def _diff(self, x: TimedeltaArray):
|
|
78
|
-
if self.diff_unit == "Y":
|
|
79
|
-
x = (x / 365 / 24 / 60 / 60 / 10**9).astype(int)
|
|
80
|
-
elif self.diff_unit == "M":
|
|
81
|
-
raise Exception("Unsupported difference unit: Month")
|
|
82
|
-
else:
|
|
83
|
-
x = x / np.timedelta64(1, self.diff_unit)
|
|
75
|
+
def _diff(self, x):
|
|
76
|
+
x = x / np.timedelta64(1, self.diff_unit)
|
|
84
77
|
return x[x > 0]
|
|
85
78
|
|
|
86
79
|
def _agg(self, x):
|
|
upgini/data_source/data_source_publisher.py
CHANGED
|
@@ -48,7 +48,6 @@ class DataSourcePublisher:
|
|
|
48
48
|
data_table_uri: str,
|
|
49
49
|
search_keys: Dict[str, SearchKey],
|
|
50
50
|
update_frequency: str,
|
|
51
|
-
exclude_from_autofe_generation: Optional[List[str]],
|
|
52
51
|
secondary_search_keys: Optional[Dict[str, SearchKey]] = None,
|
|
53
52
|
sort_column: Optional[str] = None,
|
|
54
53
|
date_format: Optional[str] = None,
|
|
@@ -58,6 +57,7 @@ class DataSourcePublisher:
|
|
|
58
57
|
join_date_abs_limit_days: Optional[int] = None,
|
|
59
58
|
features_for_embeddings: Optional[List[str]] = DEFAULT_GENERATE_EMBEDDINGS,
|
|
60
59
|
data_table_id_to_replace: Optional[str] = None,
|
|
60
|
+
exclude_from_autofe_generation: Optional[List[str]] = None,
|
|
61
61
|
_force_generation=False,
|
|
62
62
|
_silent=False,
|
|
63
63
|
) -> str:
|
upgini/dataset.py
CHANGED
|
@@ -17,13 +17,14 @@ from pandas.api.types import (
|
|
|
17
17
|
is_numeric_dtype,
|
|
18
18
|
is_period_dtype,
|
|
19
19
|
is_string_dtype,
|
|
20
|
-
is_object_dtype,
|
|
21
20
|
)
|
|
22
21
|
|
|
23
22
|
from upgini.errors import ValidationError
|
|
24
23
|
from upgini.http import ProgressStage, SearchProgress, _RestClient
|
|
25
24
|
from upgini.metadata import (
|
|
25
|
+
ENTITY_SYSTEM_RECORD_ID,
|
|
26
26
|
EVAL_SET_INDEX,
|
|
27
|
+
SEARCH_KEY_UNNEST,
|
|
27
28
|
SYSTEM_COLUMNS,
|
|
28
29
|
SYSTEM_RECORD_ID,
|
|
29
30
|
TARGET,
|
|
@@ -79,6 +80,7 @@ class Dataset: # (pd.DataFrame):
|
|
|
79
80
|
path: Optional[str] = None,
|
|
80
81
|
meaning_types: Optional[Dict[str, FileColumnMeaningType]] = None,
|
|
81
82
|
search_keys: Optional[List[Tuple[str, ...]]] = None,
|
|
83
|
+
unnest_search_keys: Optional[List[str]] = None,
|
|
82
84
|
model_task_type: Optional[ModelTaskType] = None,
|
|
83
85
|
random_state: Optional[int] = None,
|
|
84
86
|
rest_client: Optional[_RestClient] = None,
|
|
@@ -113,6 +115,7 @@ class Dataset: # (pd.DataFrame):
|
|
|
113
115
|
self.description = description
|
|
114
116
|
self.meaning_types = meaning_types
|
|
115
117
|
self.search_keys = search_keys
|
|
118
|
+
self.unnest_search_keys = unnest_search_keys
|
|
116
119
|
self.ignore_columns = []
|
|
117
120
|
self.hierarchical_group_keys = []
|
|
118
121
|
self.hierarchical_subgroup_keys = []
|
|
@@ -172,7 +175,7 @@ class Dataset: # (pd.DataFrame):
|
|
|
172
175
|
new_columns = []
|
|
173
176
|
dup_counter = 0
|
|
174
177
|
for column in self.data.columns:
|
|
175
|
-
if column in [TARGET, EVAL_SET_INDEX, SYSTEM_RECORD_ID]:
|
|
178
|
+
if column in [TARGET, EVAL_SET_INDEX, SYSTEM_RECORD_ID, ENTITY_SYSTEM_RECORD_ID, SEARCH_KEY_UNNEST]:
|
|
176
179
|
self.columns_renaming[column] = column
|
|
177
180
|
new_columns.append(column)
|
|
178
181
|
continue
|
|
@@ -220,7 +223,7 @@ class Dataset: # (pd.DataFrame):
|
|
|
220
223
|
"""Check that string values less than maximum characters for LLM"""
|
|
221
224
|
# self.logger.info("Validate too long string values")
|
|
222
225
|
for col in self.data.columns:
|
|
223
|
-
if is_string_dtype(self.data[col]) or is_object_dtype(self.data[col]):
|
|
226
|
+
if is_string_dtype(self.data[col]):
|
|
224
227
|
max_length: int = self.data[col].astype("str").str.len().max()
|
|
225
228
|
if max_length > self.MAX_STRING_FEATURE_LENGTH:
|
|
226
229
|
self.data[col] = self.data[col].astype("str").str.slice(stop=self.MAX_STRING_FEATURE_LENGTH)
|
|
@@ -351,9 +354,11 @@ class Dataset: # (pd.DataFrame):
|
|
|
351
354
|
if postal_code is not None and postal_code in self.data.columns:
|
|
352
355
|
# self.logger.info("Normalize postal code")
|
|
353
356
|
|
|
354
|
-
if is_string_dtype(self.data[postal_code]) or is_object_dtype(self.data[postal_code]):
|
|
357
|
+
if is_string_dtype(self.data[postal_code]):
|
|
355
358
|
try:
|
|
356
|
-
self.data[postal_code] =
|
|
359
|
+
self.data[postal_code] = (
|
|
360
|
+
self.data[postal_code].astype("string").astype("Float64").astype("Int64").astype("string")
|
|
361
|
+
)
|
|
357
362
|
except Exception:
|
|
358
363
|
pass
|
|
359
364
|
elif is_float_dtype(self.data[postal_code]):
|
|
@@ -803,6 +808,8 @@ class Dataset: # (pd.DataFrame):
|
|
|
803
808
|
meaningType=meaning_type,
|
|
804
809
|
minMaxValues=min_max_values,
|
|
805
810
|
)
|
|
811
|
+
if self.unnest_search_keys and column_meta.originalName in self.unnest_search_keys:
|
|
812
|
+
column_meta.isUnnest = True
|
|
806
813
|
|
|
807
814
|
columns.append(column_meta)
|
|
808
815
|
|
|
@@ -822,7 +829,7 @@ class Dataset: # (pd.DataFrame):
|
|
|
822
829
|
return DataType.INT
|
|
823
830
|
elif is_float_dtype(pandas_data_type):
|
|
824
831
|
return DataType.DECIMAL
|
|
825
|
-
elif is_string_dtype(pandas_data_type) or is_object_dtype(pandas_data_type):
|
|
832
|
+
elif is_string_dtype(pandas_data_type):
|
|
826
833
|
return DataType.STRING
|
|
827
834
|
else:
|
|
828
835
|
msg = self.bundle.get("dataset_invalid_column_type").format(column_name, pandas_data_type)
|