upgini 1.1.275a1__tar.gz → 1.1.275a99__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- {upgini-1.1.275a1/src/upgini.egg-info → upgini-1.1.275a99}/PKG-INFO +2 -2
- {upgini-1.1.275a1 → upgini-1.1.275a99}/setup.py +2 -2
- {upgini-1.1.275a1 → upgini-1.1.275a99}/src/upgini/autofe/date.py +9 -2
- {upgini-1.1.275a1 → upgini-1.1.275a99}/src/upgini/data_source/data_source_publisher.py +1 -1
- {upgini-1.1.275a1 → upgini-1.1.275a99}/src/upgini/dataset.py +2 -10
- {upgini-1.1.275a1 → upgini-1.1.275a99}/src/upgini/features_enricher.py +150 -218
- {upgini-1.1.275a1 → upgini-1.1.275a99}/src/upgini/metadata.py +1 -9
- {upgini-1.1.275a1 → upgini-1.1.275a99}/src/upgini/metrics.py +12 -0
- {upgini-1.1.275a1 → upgini-1.1.275a99}/src/upgini/normalizer/phone_normalizer.py +2 -2
- {upgini-1.1.275a1 → upgini-1.1.275a99}/src/upgini/resource_bundle/strings.properties +2 -2
- upgini-1.1.275a99/src/upgini/utils/base_search_key_detector.py +25 -0
- {upgini-1.1.275a1 → upgini-1.1.275a99}/src/upgini/utils/datetime_utils.py +3 -0
- {upgini-1.1.275a1 → upgini-1.1.275a99}/src/upgini/utils/deduplicate_utils.py +1 -11
- {upgini-1.1.275a1 → upgini-1.1.275a99}/src/upgini/utils/email_utils.py +0 -5
- {upgini-1.1.275a1 → upgini-1.1.275a99}/src/upgini/utils/features_validator.py +2 -1
- {upgini-1.1.275a1 → upgini-1.1.275a99}/src/upgini/utils/track_info.py +25 -13
- {upgini-1.1.275a1 → upgini-1.1.275a99/src/upgini.egg-info}/PKG-INFO +2 -2
- {upgini-1.1.275a1 → upgini-1.1.275a99}/src/upgini.egg-info/requires.txt +1 -1
- {upgini-1.1.275a1 → upgini-1.1.275a99}/tests/test_autofe_operands.py +2 -1
- {upgini-1.1.275a1 → upgini-1.1.275a99}/tests/test_country_utils.py +4 -4
- {upgini-1.1.275a1 → upgini-1.1.275a99}/tests/test_email_utils.py +10 -8
- {upgini-1.1.275a1 → upgini-1.1.275a99}/tests/test_etalon_validation.py +2 -21
- {upgini-1.1.275a1 → upgini-1.1.275a99}/tests/test_features_enricher.py +18 -23
- {upgini-1.1.275a1 → upgini-1.1.275a99}/tests/test_phone_utils.py +6 -6
- {upgini-1.1.275a1 → upgini-1.1.275a99}/tests/test_postal_code_utils.py +6 -6
- upgini-1.1.275a1/src/upgini/utils/base_search_key_detector.py +0 -27
- {upgini-1.1.275a1 → upgini-1.1.275a99}/LICENSE +0 -0
- {upgini-1.1.275a1 → upgini-1.1.275a99}/README.md +0 -0
- {upgini-1.1.275a1 → upgini-1.1.275a99}/pyproject.toml +0 -0
- {upgini-1.1.275a1 → upgini-1.1.275a99}/setup.cfg +0 -0
- {upgini-1.1.275a1 → upgini-1.1.275a99}/src/upgini/__init__.py +0 -0
- {upgini-1.1.275a1 → upgini-1.1.275a99}/src/upgini/ads.py +0 -0
- {upgini-1.1.275a1 → upgini-1.1.275a99}/src/upgini/ads_management/__init__.py +0 -0
- {upgini-1.1.275a1 → upgini-1.1.275a99}/src/upgini/ads_management/ads_manager.py +0 -0
- {upgini-1.1.275a1 → upgini-1.1.275a99}/src/upgini/autofe/__init__.py +0 -0
- {upgini-1.1.275a1 → upgini-1.1.275a99}/src/upgini/autofe/all_operands.py +0 -0
- {upgini-1.1.275a1 → upgini-1.1.275a99}/src/upgini/autofe/binary.py +0 -0
- {upgini-1.1.275a1 → upgini-1.1.275a99}/src/upgini/autofe/feature.py +0 -0
- {upgini-1.1.275a1 → upgini-1.1.275a99}/src/upgini/autofe/groupby.py +0 -0
- {upgini-1.1.275a1 → upgini-1.1.275a99}/src/upgini/autofe/operand.py +0 -0
- {upgini-1.1.275a1 → upgini-1.1.275a99}/src/upgini/autofe/unary.py +0 -0
- {upgini-1.1.275a1 → upgini-1.1.275a99}/src/upgini/autofe/vector.py +0 -0
- {upgini-1.1.275a1 → upgini-1.1.275a99}/src/upgini/data_source/__init__.py +0 -0
- {upgini-1.1.275a1 → upgini-1.1.275a99}/src/upgini/errors.py +0 -0
- {upgini-1.1.275a1 → upgini-1.1.275a99}/src/upgini/fingerprint.js +0 -0
- {upgini-1.1.275a1 → upgini-1.1.275a99}/src/upgini/http.py +0 -0
- {upgini-1.1.275a1 → upgini-1.1.275a99}/src/upgini/mdc/__init__.py +0 -0
- {upgini-1.1.275a1 → upgini-1.1.275a99}/src/upgini/mdc/context.py +0 -0
- {upgini-1.1.275a1 → upgini-1.1.275a99}/src/upgini/normalizer/__init__.py +0 -0
- {upgini-1.1.275a1 → upgini-1.1.275a99}/src/upgini/resource_bundle/__init__.py +0 -0
- {upgini-1.1.275a1 → upgini-1.1.275a99}/src/upgini/resource_bundle/exceptions.py +0 -0
- {upgini-1.1.275a1 → upgini-1.1.275a99}/src/upgini/resource_bundle/strings_widget.properties +0 -0
- {upgini-1.1.275a1 → upgini-1.1.275a99}/src/upgini/sampler/__init__.py +0 -0
- {upgini-1.1.275a1 → upgini-1.1.275a99}/src/upgini/sampler/base.py +0 -0
- {upgini-1.1.275a1 → upgini-1.1.275a99}/src/upgini/sampler/random_under_sampler.py +0 -0
- {upgini-1.1.275a1 → upgini-1.1.275a99}/src/upgini/sampler/utils.py +0 -0
- {upgini-1.1.275a1 → upgini-1.1.275a99}/src/upgini/search_task.py +0 -0
- {upgini-1.1.275a1 → upgini-1.1.275a99}/src/upgini/spinner.py +0 -0
- {upgini-1.1.275a1 → upgini-1.1.275a99}/src/upgini/utils/__init__.py +0 -0
- {upgini-1.1.275a1 → upgini-1.1.275a99}/src/upgini/utils/blocked_time_series.py +0 -0
- {upgini-1.1.275a1 → upgini-1.1.275a99}/src/upgini/utils/country_utils.py +0 -0
- {upgini-1.1.275a1 → upgini-1.1.275a99}/src/upgini/utils/custom_loss_utils.py +0 -0
- {upgini-1.1.275a1 → upgini-1.1.275a99}/src/upgini/utils/cv_utils.py +0 -0
- {upgini-1.1.275a1 → upgini-1.1.275a99}/src/upgini/utils/display_utils.py +0 -0
- {upgini-1.1.275a1 → upgini-1.1.275a99}/src/upgini/utils/fallback_progress_bar.py +0 -0
- {upgini-1.1.275a1 → upgini-1.1.275a99}/src/upgini/utils/format.py +0 -0
- {upgini-1.1.275a1 → upgini-1.1.275a99}/src/upgini/utils/ip_utils.py +0 -0
- {upgini-1.1.275a1 → upgini-1.1.275a99}/src/upgini/utils/phone_utils.py +0 -0
- {upgini-1.1.275a1 → upgini-1.1.275a99}/src/upgini/utils/postal_code_utils.py +0 -0
- {upgini-1.1.275a1 → upgini-1.1.275a99}/src/upgini/utils/progress_bar.py +0 -0
- {upgini-1.1.275a1 → upgini-1.1.275a99}/src/upgini/utils/sklearn_ext.py +0 -0
- {upgini-1.1.275a1 → upgini-1.1.275a99}/src/upgini/utils/target_utils.py +0 -0
- {upgini-1.1.275a1 → upgini-1.1.275a99}/src/upgini/utils/warning_counter.py +0 -0
- {upgini-1.1.275a1 → upgini-1.1.275a99}/src/upgini/version_validator.py +0 -0
- {upgini-1.1.275a1 → upgini-1.1.275a99}/src/upgini.egg-info/SOURCES.txt +0 -0
- {upgini-1.1.275a1 → upgini-1.1.275a99}/src/upgini.egg-info/dependency_links.txt +0 -0
- {upgini-1.1.275a1 → upgini-1.1.275a99}/src/upgini.egg-info/top_level.txt +0 -0
- {upgini-1.1.275a1 → upgini-1.1.275a99}/tests/test_binary_dataset.py +0 -0
- {upgini-1.1.275a1 → upgini-1.1.275a99}/tests/test_blocked_time_series.py +0 -0
- {upgini-1.1.275a1 → upgini-1.1.275a99}/tests/test_categorical_dataset.py +0 -0
- {upgini-1.1.275a1 → upgini-1.1.275a99}/tests/test_continuous_dataset.py +0 -0
- {upgini-1.1.275a1 → upgini-1.1.275a99}/tests/test_custom_loss_utils.py +0 -0
- {upgini-1.1.275a1 → upgini-1.1.275a99}/tests/test_datetime_utils.py +0 -0
- {upgini-1.1.275a1 → upgini-1.1.275a99}/tests/test_metrics.py +0 -0
- {upgini-1.1.275a1 → upgini-1.1.275a99}/tests/test_target_utils.py +0 -0
- {upgini-1.1.275a1 → upgini-1.1.275a99}/tests/test_widget.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: upgini
|
|
3
|
-
Version: 1.1.275a1
|
|
3
|
+
Version: 1.1.275a99
|
|
4
4
|
Summary: Intelligent data search & enrichment for Machine Learning
|
|
5
5
|
Home-page: https://upgini.com/
|
|
6
6
|
Author: Upgini Developers
|
|
@@ -28,7 +28,7 @@ Description-Content-Type: text/markdown
|
|
|
28
28
|
License-File: LICENSE
|
|
29
29
|
Requires-Dist: python-dateutil>=2.8.0
|
|
30
30
|
Requires-Dist: requests>=2.8.0
|
|
31
|
-
Requires-Dist: pandas<
|
|
31
|
+
Requires-Dist: pandas<3.0.0,>=1.1.0
|
|
32
32
|
Requires-Dist: numpy>=1.19.0
|
|
33
33
|
Requires-Dist: scikit-learn>=1.3.0
|
|
34
34
|
Requires-Dist: pydantic<2.0.0,>=1.8.2
|
|
@@ -40,7 +40,7 @@ def send_log(msg: str):
|
|
|
40
40
|
|
|
41
41
|
|
|
42
42
|
here = Path(__file__).parent.resolve()
|
|
43
|
-
version = "1.1.275a1"
|
|
43
|
+
version = "1.1.275a99"
|
|
44
44
|
try:
|
|
45
45
|
send_log(f"Start setup PyLib version {version}")
|
|
46
46
|
setup(
|
|
@@ -77,7 +77,7 @@ try:
|
|
|
77
77
|
install_requires=[
|
|
78
78
|
"python-dateutil>=2.8.0",
|
|
79
79
|
"requests>=2.8.0",
|
|
80
|
-
"pandas>=1.1.0,<
|
|
80
|
+
"pandas>=1.1.0,<3.0.0",
|
|
81
81
|
"numpy>=1.19.0",
|
|
82
82
|
"scikit-learn>=1.3.0",
|
|
83
83
|
"pydantic>=1.8.2,<2.0.0",
|
|
@@ -2,6 +2,7 @@ from typing import Any, Optional, Union
|
|
|
2
2
|
import numpy as np
|
|
3
3
|
import pandas as pd
|
|
4
4
|
from pydantic import BaseModel
|
|
5
|
+
from pandas.core.arrays.timedeltas import TimedeltaArray
|
|
5
6
|
|
|
6
7
|
from upgini.autofe.operand import PandasOperand
|
|
7
8
|
|
|
@@ -46,6 +47,7 @@ class DateDiffType2(PandasOperand, DateDiffMixin):
|
|
|
46
47
|
future = right + (left.dt.year - right.dt.year).apply(
|
|
47
48
|
lambda y: np.datetime64("NaT") if np.isnan(y) else pd.tseries.offsets.DateOffset(years=y)
|
|
48
49
|
)
|
|
50
|
+
future = pd.to_datetime(future)
|
|
49
51
|
before = future[future < left]
|
|
50
52
|
future[future < left] = before + pd.tseries.offsets.DateOffset(years=1)
|
|
51
53
|
diff = (future - left) / np.timedelta64(1, self.diff_unit)
|
|
@@ -72,8 +74,13 @@ class DateListDiff(PandasOperand, DateDiffMixin):
|
|
|
72
74
|
|
|
73
75
|
return pd.Series(left - right.values).apply(lambda x: self._agg(self._diff(x)))
|
|
74
76
|
|
|
75
|
-
def _diff(self, x):
|
|
76
|
-
|
|
77
|
+
def _diff(self, x: TimedeltaArray):
|
|
78
|
+
if self.diff_unit == "Y":
|
|
79
|
+
x = (x / 365 / 24 / 60 / 60 / 10**9).astype(int)
|
|
80
|
+
elif self.diff_unit == "M":
|
|
81
|
+
raise Exception("Unsupported difference unit: Month")
|
|
82
|
+
else:
|
|
83
|
+
x = x / np.timedelta64(1, self.diff_unit)
|
|
77
84
|
return x[x > 0]
|
|
78
85
|
|
|
79
86
|
def _agg(self, x):
|
|
@@ -48,6 +48,7 @@ class DataSourcePublisher:
|
|
|
48
48
|
data_table_uri: str,
|
|
49
49
|
search_keys: Dict[str, SearchKey],
|
|
50
50
|
update_frequency: str,
|
|
51
|
+
exclude_from_autofe_generation: Optional[List[str]],
|
|
51
52
|
secondary_search_keys: Optional[Dict[str, SearchKey]] = None,
|
|
52
53
|
sort_column: Optional[str] = None,
|
|
53
54
|
date_format: Optional[str] = None,
|
|
@@ -57,7 +58,6 @@ class DataSourcePublisher:
|
|
|
57
58
|
join_date_abs_limit_days: Optional[int] = None,
|
|
58
59
|
features_for_embeddings: Optional[List[str]] = DEFAULT_GENERATE_EMBEDDINGS,
|
|
59
60
|
data_table_id_to_replace: Optional[str] = None,
|
|
60
|
-
exclude_from_autofe_generation: Optional[List[str]] = None,
|
|
61
61
|
_force_generation=False,
|
|
62
62
|
_silent=False,
|
|
63
63
|
) -> str:
|
|
@@ -22,9 +22,7 @@ from pandas.api.types import (
|
|
|
22
22
|
from upgini.errors import ValidationError
|
|
23
23
|
from upgini.http import ProgressStage, SearchProgress, _RestClient
|
|
24
24
|
from upgini.metadata import (
|
|
25
|
-
ENTITY_SYSTEM_RECORD_ID,
|
|
26
25
|
EVAL_SET_INDEX,
|
|
27
|
-
SEARCH_KEY_UNNEST,
|
|
28
26
|
SYSTEM_COLUMNS,
|
|
29
27
|
SYSTEM_RECORD_ID,
|
|
30
28
|
TARGET,
|
|
@@ -80,7 +78,6 @@ class Dataset: # (pd.DataFrame):
|
|
|
80
78
|
path: Optional[str] = None,
|
|
81
79
|
meaning_types: Optional[Dict[str, FileColumnMeaningType]] = None,
|
|
82
80
|
search_keys: Optional[List[Tuple[str, ...]]] = None,
|
|
83
|
-
unnest_search_keys: Optional[List[str]] = None,
|
|
84
81
|
model_task_type: Optional[ModelTaskType] = None,
|
|
85
82
|
random_state: Optional[int] = None,
|
|
86
83
|
rest_client: Optional[_RestClient] = None,
|
|
@@ -115,7 +112,6 @@ class Dataset: # (pd.DataFrame):
|
|
|
115
112
|
self.description = description
|
|
116
113
|
self.meaning_types = meaning_types
|
|
117
114
|
self.search_keys = search_keys
|
|
118
|
-
self.unnest_search_keys = unnest_search_keys
|
|
119
115
|
self.ignore_columns = []
|
|
120
116
|
self.hierarchical_group_keys = []
|
|
121
117
|
self.hierarchical_subgroup_keys = []
|
|
@@ -175,7 +171,7 @@ class Dataset: # (pd.DataFrame):
|
|
|
175
171
|
new_columns = []
|
|
176
172
|
dup_counter = 0
|
|
177
173
|
for column in self.data.columns:
|
|
178
|
-
if column in [TARGET, EVAL_SET_INDEX, SYSTEM_RECORD_ID
|
|
174
|
+
if column in [TARGET, EVAL_SET_INDEX, SYSTEM_RECORD_ID]:
|
|
179
175
|
self.columns_renaming[column] = column
|
|
180
176
|
new_columns.append(column)
|
|
181
177
|
continue
|
|
@@ -356,9 +352,7 @@ class Dataset: # (pd.DataFrame):
|
|
|
356
352
|
|
|
357
353
|
if is_string_dtype(self.data[postal_code]):
|
|
358
354
|
try:
|
|
359
|
-
self.data[postal_code] = (
|
|
360
|
-
self.data[postal_code].astype("string").astype("Float64").astype("Int64").astype("string")
|
|
361
|
-
)
|
|
355
|
+
self.data[postal_code] = self.data[postal_code].astype("float64").astype("Int64").astype("string")
|
|
362
356
|
except Exception:
|
|
363
357
|
pass
|
|
364
358
|
elif is_float_dtype(self.data[postal_code]):
|
|
@@ -808,8 +802,6 @@ class Dataset: # (pd.DataFrame):
|
|
|
808
802
|
meaningType=meaning_type,
|
|
809
803
|
minMaxValues=min_max_values,
|
|
810
804
|
)
|
|
811
|
-
if self.unnest_search_keys and column_meta.originalName in self.unnest_search_keys:
|
|
812
|
-
column_meta.isUnnest = True
|
|
813
805
|
|
|
814
806
|
columns.append(column_meta)
|
|
815
807
|
|