upgini 1.1.275a1__tar.gz → 1.1.276__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- {upgini-1.1.275a1/src/upgini.egg-info → upgini-1.1.276}/PKG-INFO +2 -2
- {upgini-1.1.275a1 → upgini-1.1.276}/setup.py +2 -2
- {upgini-1.1.275a1 → upgini-1.1.276}/src/upgini/ads.py +6 -2
- {upgini-1.1.275a1 → upgini-1.1.276}/src/upgini/autofe/date.py +9 -2
- {upgini-1.1.275a1 → upgini-1.1.276}/src/upgini/data_source/data_source_publisher.py +1 -1
- {upgini-1.1.275a1 → upgini-1.1.276}/src/upgini/dataset.py +6 -13
- {upgini-1.1.275a1 → upgini-1.1.276}/src/upgini/features_enricher.py +156 -220
- {upgini-1.1.275a1 → upgini-1.1.276}/src/upgini/metadata.py +1 -9
- {upgini-1.1.275a1 → upgini-1.1.276}/src/upgini/metrics.py +12 -0
- {upgini-1.1.275a1 → upgini-1.1.276}/src/upgini/normalizer/phone_normalizer.py +2 -2
- {upgini-1.1.275a1 → upgini-1.1.276}/src/upgini/resource_bundle/strings.properties +2 -2
- {upgini-1.1.275a1 → upgini-1.1.276}/src/upgini/utils/__init__.py +3 -2
- upgini-1.1.276/src/upgini/utils/base_search_key_detector.py +25 -0
- {upgini-1.1.275a1 → upgini-1.1.276}/src/upgini/utils/country_utils.py +2 -2
- {upgini-1.1.275a1 → upgini-1.1.276}/src/upgini/utils/datetime_utils.py +7 -4
- {upgini-1.1.275a1 → upgini-1.1.276}/src/upgini/utils/deduplicate_utils.py +1 -11
- {upgini-1.1.275a1 → upgini-1.1.276}/src/upgini/utils/email_utils.py +2 -7
- {upgini-1.1.275a1 → upgini-1.1.276}/src/upgini/utils/features_validator.py +2 -1
- {upgini-1.1.275a1 → upgini-1.1.276}/src/upgini/utils/target_utils.py +1 -1
- {upgini-1.1.275a1 → upgini-1.1.276}/src/upgini/utils/track_info.py +25 -13
- {upgini-1.1.275a1 → upgini-1.1.276/src/upgini.egg-info}/PKG-INFO +2 -2
- {upgini-1.1.275a1 → upgini-1.1.276}/src/upgini.egg-info/requires.txt +1 -1
- {upgini-1.1.275a1 → upgini-1.1.276}/tests/test_autofe_operands.py +2 -1
- {upgini-1.1.275a1 → upgini-1.1.276}/tests/test_country_utils.py +4 -4
- {upgini-1.1.275a1 → upgini-1.1.276}/tests/test_email_utils.py +10 -8
- {upgini-1.1.275a1 → upgini-1.1.276}/tests/test_etalon_validation.py +2 -21
- {upgini-1.1.275a1 → upgini-1.1.276}/tests/test_features_enricher.py +18 -23
- {upgini-1.1.275a1 → upgini-1.1.276}/tests/test_phone_utils.py +6 -6
- {upgini-1.1.275a1 → upgini-1.1.276}/tests/test_postal_code_utils.py +6 -6
- upgini-1.1.275a1/src/upgini/utils/base_search_key_detector.py +0 -27
- {upgini-1.1.275a1 → upgini-1.1.276}/LICENSE +0 -0
- {upgini-1.1.275a1 → upgini-1.1.276}/README.md +0 -0
- {upgini-1.1.275a1 → upgini-1.1.276}/pyproject.toml +0 -0
- {upgini-1.1.275a1 → upgini-1.1.276}/setup.cfg +0 -0
- {upgini-1.1.275a1 → upgini-1.1.276}/src/upgini/__init__.py +0 -0
- {upgini-1.1.275a1 → upgini-1.1.276}/src/upgini/ads_management/__init__.py +0 -0
- {upgini-1.1.275a1 → upgini-1.1.276}/src/upgini/ads_management/ads_manager.py +0 -0
- {upgini-1.1.275a1 → upgini-1.1.276}/src/upgini/autofe/__init__.py +0 -0
- {upgini-1.1.275a1 → upgini-1.1.276}/src/upgini/autofe/all_operands.py +0 -0
- {upgini-1.1.275a1 → upgini-1.1.276}/src/upgini/autofe/binary.py +0 -0
- {upgini-1.1.275a1 → upgini-1.1.276}/src/upgini/autofe/feature.py +0 -0
- {upgini-1.1.275a1 → upgini-1.1.276}/src/upgini/autofe/groupby.py +0 -0
- {upgini-1.1.275a1 → upgini-1.1.276}/src/upgini/autofe/operand.py +0 -0
- {upgini-1.1.275a1 → upgini-1.1.276}/src/upgini/autofe/unary.py +0 -0
- {upgini-1.1.275a1 → upgini-1.1.276}/src/upgini/autofe/vector.py +0 -0
- {upgini-1.1.275a1 → upgini-1.1.276}/src/upgini/data_source/__init__.py +0 -0
- {upgini-1.1.275a1 → upgini-1.1.276}/src/upgini/errors.py +0 -0
- {upgini-1.1.275a1 → upgini-1.1.276}/src/upgini/fingerprint.js +0 -0
- {upgini-1.1.275a1 → upgini-1.1.276}/src/upgini/http.py +0 -0
- {upgini-1.1.275a1 → upgini-1.1.276}/src/upgini/mdc/__init__.py +0 -0
- {upgini-1.1.275a1 → upgini-1.1.276}/src/upgini/mdc/context.py +0 -0
- {upgini-1.1.275a1 → upgini-1.1.276}/src/upgini/normalizer/__init__.py +0 -0
- {upgini-1.1.275a1 → upgini-1.1.276}/src/upgini/resource_bundle/__init__.py +0 -0
- {upgini-1.1.275a1 → upgini-1.1.276}/src/upgini/resource_bundle/exceptions.py +0 -0
- {upgini-1.1.275a1 → upgini-1.1.276}/src/upgini/resource_bundle/strings_widget.properties +0 -0
- {upgini-1.1.275a1 → upgini-1.1.276}/src/upgini/sampler/__init__.py +0 -0
- {upgini-1.1.275a1 → upgini-1.1.276}/src/upgini/sampler/base.py +0 -0
- {upgini-1.1.275a1 → upgini-1.1.276}/src/upgini/sampler/random_under_sampler.py +0 -0
- {upgini-1.1.275a1 → upgini-1.1.276}/src/upgini/sampler/utils.py +0 -0
- {upgini-1.1.275a1 → upgini-1.1.276}/src/upgini/search_task.py +0 -0
- {upgini-1.1.275a1 → upgini-1.1.276}/src/upgini/spinner.py +0 -0
- {upgini-1.1.275a1 → upgini-1.1.276}/src/upgini/utils/blocked_time_series.py +0 -0
- {upgini-1.1.275a1 → upgini-1.1.276}/src/upgini/utils/custom_loss_utils.py +0 -0
- {upgini-1.1.275a1 → upgini-1.1.276}/src/upgini/utils/cv_utils.py +0 -0
- {upgini-1.1.275a1 → upgini-1.1.276}/src/upgini/utils/display_utils.py +0 -0
- {upgini-1.1.275a1 → upgini-1.1.276}/src/upgini/utils/fallback_progress_bar.py +0 -0
- {upgini-1.1.275a1 → upgini-1.1.276}/src/upgini/utils/format.py +0 -0
- {upgini-1.1.275a1 → upgini-1.1.276}/src/upgini/utils/ip_utils.py +0 -0
- {upgini-1.1.275a1 → upgini-1.1.276}/src/upgini/utils/phone_utils.py +0 -0
- {upgini-1.1.275a1 → upgini-1.1.276}/src/upgini/utils/postal_code_utils.py +0 -0
- {upgini-1.1.275a1 → upgini-1.1.276}/src/upgini/utils/progress_bar.py +0 -0
- {upgini-1.1.275a1 → upgini-1.1.276}/src/upgini/utils/sklearn_ext.py +0 -0
- {upgini-1.1.275a1 → upgini-1.1.276}/src/upgini/utils/warning_counter.py +0 -0
- {upgini-1.1.275a1 → upgini-1.1.276}/src/upgini/version_validator.py +0 -0
- {upgini-1.1.275a1 → upgini-1.1.276}/src/upgini.egg-info/SOURCES.txt +0 -0
- {upgini-1.1.275a1 → upgini-1.1.276}/src/upgini.egg-info/dependency_links.txt +0 -0
- {upgini-1.1.275a1 → upgini-1.1.276}/src/upgini.egg-info/top_level.txt +0 -0
- {upgini-1.1.275a1 → upgini-1.1.276}/tests/test_binary_dataset.py +0 -0
- {upgini-1.1.275a1 → upgini-1.1.276}/tests/test_blocked_time_series.py +0 -0
- {upgini-1.1.275a1 → upgini-1.1.276}/tests/test_categorical_dataset.py +0 -0
- {upgini-1.1.275a1 → upgini-1.1.276}/tests/test_continuous_dataset.py +0 -0
- {upgini-1.1.275a1 → upgini-1.1.276}/tests/test_custom_loss_utils.py +0 -0
- {upgini-1.1.275a1 → upgini-1.1.276}/tests/test_datetime_utils.py +0 -0
- {upgini-1.1.275a1 → upgini-1.1.276}/tests/test_metrics.py +0 -0
- {upgini-1.1.275a1 → upgini-1.1.276}/tests/test_target_utils.py +0 -0
- {upgini-1.1.275a1 → upgini-1.1.276}/tests/test_widget.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: upgini
|
|
3
|
-
Version: 1.1.275a1
|
|
3
|
+
Version: 1.1.276
|
|
4
4
|
Summary: Intelligent data search & enrichment for Machine Learning
|
|
5
5
|
Home-page: https://upgini.com/
|
|
6
6
|
Author: Upgini Developers
|
|
@@ -28,7 +28,7 @@ Description-Content-Type: text/markdown
|
|
|
28
28
|
License-File: LICENSE
|
|
29
29
|
Requires-Dist: python-dateutil>=2.8.0
|
|
30
30
|
Requires-Dist: requests>=2.8.0
|
|
31
|
-
Requires-Dist: pandas<
|
|
31
|
+
Requires-Dist: pandas<3.0.0,>=1.1.0
|
|
32
32
|
Requires-Dist: numpy>=1.19.0
|
|
33
33
|
Requires-Dist: scikit-learn>=1.3.0
|
|
34
34
|
Requires-Dist: pydantic<2.0.0,>=1.8.2
|
|
@@ -40,7 +40,7 @@ def send_log(msg: str):
|
|
|
40
40
|
|
|
41
41
|
|
|
42
42
|
here = Path(__file__).parent.resolve()
|
|
43
|
-
version = "1.1.275a1"
|
|
43
|
+
version = "1.1.276"
|
|
44
44
|
try:
|
|
45
45
|
send_log(f"Start setup PyLib version {version}")
|
|
46
46
|
setup(
|
|
@@ -77,7 +77,7 @@ try:
|
|
|
77
77
|
install_requires=[
|
|
78
78
|
"python-dateutil>=2.8.0",
|
|
79
79
|
"requests>=2.8.0",
|
|
80
|
-
"pandas>=1.1.0,<
|
|
80
|
+
"pandas>=1.1.0,<3.0.0",
|
|
81
81
|
"numpy>=1.19.0",
|
|
82
82
|
"scikit-learn>=1.3.0",
|
|
83
83
|
"pydantic>=1.8.2,<2.0.0",
|
|
@@ -5,7 +5,7 @@ from typing import Dict, Optional
|
|
|
5
5
|
|
|
6
6
|
import numpy as np
|
|
7
7
|
import pandas as pd
|
|
8
|
-
from pandas.api.types import is_string_dtype
|
|
8
|
+
from pandas.api.types import is_object_dtype, is_string_dtype
|
|
9
9
|
|
|
10
10
|
from upgini import SearchKey
|
|
11
11
|
from upgini.http import get_rest_client
|
|
@@ -34,7 +34,11 @@ def upload_user_ads(name: str, df: pd.DataFrame, search_keys: Dict[str, SearchKe
|
|
|
34
34
|
if df[column_name].notnull().sum() < min_valid_rows_count:
|
|
35
35
|
raise ValueError(bundle.get("ads_upload_to_many_empty_rows"))
|
|
36
36
|
meaning_type = search_keys[column_name].value
|
|
37
|
-
if
|
|
37
|
+
if (
|
|
38
|
+
meaning_type == FileColumnMeaningType.MSISDN
|
|
39
|
+
and not is_string_dtype(df[column_name])
|
|
40
|
+
and not is_object_dtype(df[column_name])
|
|
41
|
+
):
|
|
38
42
|
df[column_name] = df[column_name].values.astype(np.int64).astype("string") # type: ignore
|
|
39
43
|
else:
|
|
40
44
|
meaning_type = FileColumnMeaningType.FEATURE
|
|
@@ -2,6 +2,7 @@ from typing import Any, Optional, Union
|
|
|
2
2
|
import numpy as np
|
|
3
3
|
import pandas as pd
|
|
4
4
|
from pydantic import BaseModel
|
|
5
|
+
from pandas.core.arrays.timedeltas import TimedeltaArray
|
|
5
6
|
|
|
6
7
|
from upgini.autofe.operand import PandasOperand
|
|
7
8
|
|
|
@@ -46,6 +47,7 @@ class DateDiffType2(PandasOperand, DateDiffMixin):
|
|
|
46
47
|
future = right + (left.dt.year - right.dt.year).apply(
|
|
47
48
|
lambda y: np.datetime64("NaT") if np.isnan(y) else pd.tseries.offsets.DateOffset(years=y)
|
|
48
49
|
)
|
|
50
|
+
future = pd.to_datetime(future)
|
|
49
51
|
before = future[future < left]
|
|
50
52
|
future[future < left] = before + pd.tseries.offsets.DateOffset(years=1)
|
|
51
53
|
diff = (future - left) / np.timedelta64(1, self.diff_unit)
|
|
@@ -72,8 +74,13 @@ class DateListDiff(PandasOperand, DateDiffMixin):
|
|
|
72
74
|
|
|
73
75
|
return pd.Series(left - right.values).apply(lambda x: self._agg(self._diff(x)))
|
|
74
76
|
|
|
75
|
-
def _diff(self, x):
|
|
76
|
-
|
|
77
|
+
def _diff(self, x: TimedeltaArray):
|
|
78
|
+
if self.diff_unit == "Y":
|
|
79
|
+
x = (x / 365 / 24 / 60 / 60 / 10**9).astype(int)
|
|
80
|
+
elif self.diff_unit == "M":
|
|
81
|
+
raise Exception("Unsupported difference unit: Month")
|
|
82
|
+
else:
|
|
83
|
+
x = x / np.timedelta64(1, self.diff_unit)
|
|
77
84
|
return x[x > 0]
|
|
78
85
|
|
|
79
86
|
def _agg(self, x):
|
|
@@ -48,6 +48,7 @@ class DataSourcePublisher:
|
|
|
48
48
|
data_table_uri: str,
|
|
49
49
|
search_keys: Dict[str, SearchKey],
|
|
50
50
|
update_frequency: str,
|
|
51
|
+
exclude_from_autofe_generation: Optional[List[str]],
|
|
51
52
|
secondary_search_keys: Optional[Dict[str, SearchKey]] = None,
|
|
52
53
|
sort_column: Optional[str] = None,
|
|
53
54
|
date_format: Optional[str] = None,
|
|
@@ -57,7 +58,6 @@ class DataSourcePublisher:
|
|
|
57
58
|
join_date_abs_limit_days: Optional[int] = None,
|
|
58
59
|
features_for_embeddings: Optional[List[str]] = DEFAULT_GENERATE_EMBEDDINGS,
|
|
59
60
|
data_table_id_to_replace: Optional[str] = None,
|
|
60
|
-
exclude_from_autofe_generation: Optional[List[str]] = None,
|
|
61
61
|
_force_generation=False,
|
|
62
62
|
_silent=False,
|
|
63
63
|
) -> str:
|
|
@@ -17,14 +17,13 @@ from pandas.api.types import (
|
|
|
17
17
|
is_numeric_dtype,
|
|
18
18
|
is_period_dtype,
|
|
19
19
|
is_string_dtype,
|
|
20
|
+
is_object_dtype,
|
|
20
21
|
)
|
|
21
22
|
|
|
22
23
|
from upgini.errors import ValidationError
|
|
23
24
|
from upgini.http import ProgressStage, SearchProgress, _RestClient
|
|
24
25
|
from upgini.metadata import (
|
|
25
|
-
ENTITY_SYSTEM_RECORD_ID,
|
|
26
26
|
EVAL_SET_INDEX,
|
|
27
|
-
SEARCH_KEY_UNNEST,
|
|
28
27
|
SYSTEM_COLUMNS,
|
|
29
28
|
SYSTEM_RECORD_ID,
|
|
30
29
|
TARGET,
|
|
@@ -80,7 +79,6 @@ class Dataset: # (pd.DataFrame):
|
|
|
80
79
|
path: Optional[str] = None,
|
|
81
80
|
meaning_types: Optional[Dict[str, FileColumnMeaningType]] = None,
|
|
82
81
|
search_keys: Optional[List[Tuple[str, ...]]] = None,
|
|
83
|
-
unnest_search_keys: Optional[List[str]] = None,
|
|
84
82
|
model_task_type: Optional[ModelTaskType] = None,
|
|
85
83
|
random_state: Optional[int] = None,
|
|
86
84
|
rest_client: Optional[_RestClient] = None,
|
|
@@ -115,7 +113,6 @@ class Dataset: # (pd.DataFrame):
|
|
|
115
113
|
self.description = description
|
|
116
114
|
self.meaning_types = meaning_types
|
|
117
115
|
self.search_keys = search_keys
|
|
118
|
-
self.unnest_search_keys = unnest_search_keys
|
|
119
116
|
self.ignore_columns = []
|
|
120
117
|
self.hierarchical_group_keys = []
|
|
121
118
|
self.hierarchical_subgroup_keys = []
|
|
@@ -175,7 +172,7 @@ class Dataset: # (pd.DataFrame):
|
|
|
175
172
|
new_columns = []
|
|
176
173
|
dup_counter = 0
|
|
177
174
|
for column in self.data.columns:
|
|
178
|
-
if column in [TARGET, EVAL_SET_INDEX, SYSTEM_RECORD_ID
|
|
175
|
+
if column in [TARGET, EVAL_SET_INDEX, SYSTEM_RECORD_ID]:
|
|
179
176
|
self.columns_renaming[column] = column
|
|
180
177
|
new_columns.append(column)
|
|
181
178
|
continue
|
|
@@ -223,7 +220,7 @@ class Dataset: # (pd.DataFrame):
|
|
|
223
220
|
"""Check that string values less than maximum characters for LLM"""
|
|
224
221
|
# self.logger.info("Validate too long string values")
|
|
225
222
|
for col in self.data.columns:
|
|
226
|
-
if is_string_dtype(self.data[col]):
|
|
223
|
+
if is_string_dtype(self.data[col]) or is_object_dtype(self.data[col]):
|
|
227
224
|
max_length: int = self.data[col].astype("str").str.len().max()
|
|
228
225
|
if max_length > self.MAX_STRING_FEATURE_LENGTH:
|
|
229
226
|
self.data[col] = self.data[col].astype("str").str.slice(stop=self.MAX_STRING_FEATURE_LENGTH)
|
|
@@ -354,11 +351,9 @@ class Dataset: # (pd.DataFrame):
|
|
|
354
351
|
if postal_code is not None and postal_code in self.data.columns:
|
|
355
352
|
# self.logger.info("Normalize postal code")
|
|
356
353
|
|
|
357
|
-
if is_string_dtype(self.data[postal_code]):
|
|
354
|
+
if is_string_dtype(self.data[postal_code]) or is_object_dtype(self.data[postal_code]):
|
|
358
355
|
try:
|
|
359
|
-
self.data[postal_code] = (
|
|
360
|
-
self.data[postal_code].astype("string").astype("Float64").astype("Int64").astype("string")
|
|
361
|
-
)
|
|
356
|
+
self.data[postal_code] = self.data[postal_code].astype("float64").astype("Int64").astype("string")
|
|
362
357
|
except Exception:
|
|
363
358
|
pass
|
|
364
359
|
elif is_float_dtype(self.data[postal_code]):
|
|
@@ -808,8 +803,6 @@ class Dataset: # (pd.DataFrame):
|
|
|
808
803
|
meaningType=meaning_type,
|
|
809
804
|
minMaxValues=min_max_values,
|
|
810
805
|
)
|
|
811
|
-
if self.unnest_search_keys and column_meta.originalName in self.unnest_search_keys:
|
|
812
|
-
column_meta.isUnnest = True
|
|
813
806
|
|
|
814
807
|
columns.append(column_meta)
|
|
815
808
|
|
|
@@ -829,7 +822,7 @@ class Dataset: # (pd.DataFrame):
|
|
|
829
822
|
return DataType.INT
|
|
830
823
|
elif is_float_dtype(pandas_data_type):
|
|
831
824
|
return DataType.DECIMAL
|
|
832
|
-
elif is_string_dtype(pandas_data_type):
|
|
825
|
+
elif is_string_dtype(pandas_data_type) or is_object_dtype(pandas_data_type):
|
|
833
826
|
return DataType.STRING
|
|
834
827
|
else:
|
|
835
828
|
msg = self.bundle.get("dataset_invalid_column_type").format(column_name, pandas_data_type)
|