upgini 1.1.278a1__py3-none-any.whl → 1.1.279__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- upgini/__about__.py +1 -0
- upgini/ads_management/ads_manager.py +4 -2
- upgini/autofe/all_operands.py +3 -2
- upgini/autofe/binary.py +2 -1
- upgini/autofe/date.py +2 -1
- upgini/autofe/feature.py +1 -1
- upgini/autofe/groupby.py +3 -1
- upgini/autofe/operand.py +4 -3
- upgini/autofe/unary.py +2 -1
- upgini/autofe/vector.py +2 -0
- upgini/dataset.py +6 -15
- upgini/errors.py +1 -1
- upgini/features_enricher.py +102 -214
- upgini/http.py +11 -10
- upgini/mdc/__init__.py +1 -3
- upgini/mdc/context.py +4 -6
- upgini/metadata.py +5 -10
- upgini/metrics.py +102 -100
- upgini/normalizer/phone_normalizer.py +1 -1
- upgini/resource_bundle/__init__.py +5 -5
- upgini/resource_bundle/strings.properties +0 -1
- upgini/sampler/base.py +1 -4
- upgini/sampler/random_under_sampler.py +2 -5
- upgini/search_task.py +4 -4
- upgini/spinner.py +1 -1
- upgini/utils/__init__.py +1 -1
- upgini/utils/base_search_key_detector.py +14 -16
- upgini/utils/blocked_time_series.py +4 -2
- upgini/utils/country_utils.py +1 -1
- upgini/utils/custom_loss_utils.py +3 -2
- upgini/utils/cv_utils.py +2 -2
- upgini/utils/datetime_utils.py +20 -15
- upgini/utils/deduplicate_utils.py +1 -11
- upgini/utils/email_utils.py +2 -7
- upgini/utils/fallback_progress_bar.py +1 -1
- upgini/utils/progress_bar.py +1 -1
- upgini/utils/sklearn_ext.py +14 -13
- upgini/utils/track_info.py +2 -2
- upgini/version_validator.py +2 -2
- {upgini-1.1.278a1.dist-info → upgini-1.1.279.dist-info}/METADATA +21 -23
- upgini-1.1.279.dist-info/RECORD +62 -0
- {upgini-1.1.278a1.dist-info → upgini-1.1.279.dist-info}/WHEEL +1 -2
- upgini/fingerprint.js +0 -8
- upgini-1.1.278a1.dist-info/RECORD +0 -63
- upgini-1.1.278a1.dist-info/top_level.txt +0 -1
- {upgini-1.1.278a1.dist-info → upgini-1.1.279.dist-info/licenses}/LICENSE +0 -0
upgini/__about__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "1.1.279"
|
upgini/autofe/all_operands.py
CHANGED
|
@@ -1,9 +1,10 @@
|
|
|
1
1
|
from typing import Dict
|
|
2
|
+
|
|
3
|
+
from upgini.autofe.binary import Add, Divide, Max, Min, Multiply, Sim, Subtract
|
|
2
4
|
from upgini.autofe.date import DateDiff, DateDiffType2, DateListDiff, DateListDiffBounded
|
|
3
5
|
from upgini.autofe.groupby import GroupByThenAgg, GroupByThenRank
|
|
4
6
|
from upgini.autofe.operand import Operand
|
|
5
|
-
from upgini.autofe.unary import Abs,
|
|
6
|
-
from upgini.autofe.binary import Min, Max, Add, Subtract, Multiply, Divide, Sim
|
|
7
|
+
from upgini.autofe.unary import Abs, Floor, Freq, Log, Residual, Sigmoid, Sqrt, Square
|
|
7
8
|
from upgini.autofe.vector import Mean, Sum
|
|
8
9
|
|
|
9
10
|
ALL_OPERANDS: Dict[str, Operand] = {
|
upgini/autofe/binary.py
CHANGED
|
@@ -1,9 +1,10 @@
|
|
|
1
|
-
from upgini.autofe.operand import PandasOperand, VectorizableMixin
|
|
2
1
|
import numpy as np
|
|
3
2
|
import pandas as pd
|
|
4
3
|
from numpy import dot
|
|
5
4
|
from numpy.linalg import norm
|
|
6
5
|
|
|
6
|
+
from upgini.autofe.operand import PandasOperand, VectorizableMixin
|
|
7
|
+
|
|
7
8
|
|
|
8
9
|
class Min(PandasOperand):
|
|
9
10
|
name = "min"
|
upgini/autofe/date.py
CHANGED
|
@@ -1,8 +1,9 @@
|
|
|
1
1
|
from typing import Any, Optional, Union
|
|
2
|
+
|
|
2
3
|
import numpy as np
|
|
3
4
|
import pandas as pd
|
|
4
|
-
from pydantic import BaseModel
|
|
5
5
|
from pandas.core.arrays.timedeltas import TimedeltaArray
|
|
6
|
+
from pydantic import BaseModel
|
|
6
7
|
|
|
7
8
|
from upgini.autofe.operand import PandasOperand
|
|
8
9
|
|
upgini/autofe/feature.py
CHANGED
upgini/autofe/groupby.py
CHANGED
|
@@ -1,7 +1,9 @@
|
|
|
1
|
-
from upgini.autofe.operand import PandasOperand, VectorizableMixin
|
|
2
1
|
from typing import Optional
|
|
2
|
+
|
|
3
3
|
import pandas as pd
|
|
4
4
|
|
|
5
|
+
from upgini.autofe.operand import PandasOperand, VectorizableMixin
|
|
6
|
+
|
|
5
7
|
|
|
6
8
|
class GroupByThenAgg(PandasOperand, VectorizableMixin):
|
|
7
9
|
agg: Optional[str]
|
upgini/autofe/operand.py
CHANGED
|
@@ -1,8 +1,9 @@
|
|
|
1
|
-
from pydantic import BaseModel
|
|
2
|
-
from typing import Dict, List, Optional, Tuple, Union
|
|
3
1
|
import abc
|
|
4
|
-
import pandas as pd
|
|
2
|
+
from typing import Dict, List, Optional, Tuple, Union
|
|
3
|
+
|
|
5
4
|
import numpy as np
|
|
5
|
+
import pandas as pd
|
|
6
|
+
from pydantic import BaseModel
|
|
6
7
|
|
|
7
8
|
|
|
8
9
|
class Operand(BaseModel):
|
upgini/autofe/unary.py
CHANGED
upgini/autofe/vector.py
CHANGED
upgini/dataset.py
CHANGED
|
@@ -15,17 +15,15 @@ from pandas.api.types import (
|
|
|
15
15
|
is_float_dtype,
|
|
16
16
|
is_integer_dtype,
|
|
17
17
|
is_numeric_dtype,
|
|
18
|
+
is_object_dtype,
|
|
18
19
|
is_period_dtype,
|
|
19
20
|
is_string_dtype,
|
|
20
|
-
is_object_dtype,
|
|
21
21
|
)
|
|
22
22
|
|
|
23
23
|
from upgini.errors import ValidationError
|
|
24
24
|
from upgini.http import ProgressStage, SearchProgress, _RestClient
|
|
25
25
|
from upgini.metadata import (
|
|
26
|
-
ENTITY_SYSTEM_RECORD_ID,
|
|
27
26
|
EVAL_SET_INDEX,
|
|
28
|
-
SEARCH_KEY_UNNEST,
|
|
29
27
|
SYSTEM_COLUMNS,
|
|
30
28
|
SYSTEM_RECORD_ID,
|
|
31
29
|
TARGET,
|
|
@@ -81,7 +79,6 @@ class Dataset: # (pd.DataFrame):
|
|
|
81
79
|
path: Optional[str] = None,
|
|
82
80
|
meaning_types: Optional[Dict[str, FileColumnMeaningType]] = None,
|
|
83
81
|
search_keys: Optional[List[Tuple[str, ...]]] = None,
|
|
84
|
-
unnest_search_keys: Optional[Dict[str, str]] = None,
|
|
85
82
|
model_task_type: Optional[ModelTaskType] = None,
|
|
86
83
|
random_state: Optional[int] = None,
|
|
87
84
|
rest_client: Optional[_RestClient] = None,
|
|
@@ -98,7 +95,7 @@ class Dataset: # (pd.DataFrame):
|
|
|
98
95
|
data = pd.read_csv(path, **kwargs)
|
|
99
96
|
else:
|
|
100
97
|
# try different separators: , ; \t ...
|
|
101
|
-
with open(path
|
|
98
|
+
with open(path) as csvfile:
|
|
102
99
|
sep = csv.Sniffer().sniff(csvfile.read(2048)).delimiter
|
|
103
100
|
kwargs["sep"] = sep
|
|
104
101
|
data = pd.read_csv(path, **kwargs)
|
|
@@ -116,7 +113,6 @@ class Dataset: # (pd.DataFrame):
|
|
|
116
113
|
self.description = description
|
|
117
114
|
self.meaning_types = meaning_types
|
|
118
115
|
self.search_keys = search_keys
|
|
119
|
-
self.unnest_search_keys = unnest_search_keys
|
|
120
116
|
self.ignore_columns = []
|
|
121
117
|
self.hierarchical_group_keys = []
|
|
122
118
|
self.hierarchical_subgroup_keys = []
|
|
@@ -176,7 +172,7 @@ class Dataset: # (pd.DataFrame):
|
|
|
176
172
|
new_columns = []
|
|
177
173
|
dup_counter = 0
|
|
178
174
|
for column in self.data.columns:
|
|
179
|
-
if column in [TARGET, EVAL_SET_INDEX, SYSTEM_RECORD_ID
|
|
175
|
+
if column in [TARGET, EVAL_SET_INDEX, SYSTEM_RECORD_ID]:
|
|
180
176
|
self.columns_renaming[column] = column
|
|
181
177
|
new_columns.append(column)
|
|
182
178
|
continue
|
|
@@ -255,7 +251,7 @@ class Dataset: # (pd.DataFrame):
|
|
|
255
251
|
@staticmethod
|
|
256
252
|
def _ip_to_int(ip: Optional[_BaseAddress]) -> Optional[int]:
|
|
257
253
|
try:
|
|
258
|
-
if isinstance(ip, IPv4Address
|
|
254
|
+
if isinstance(ip, (IPv4Address, IPv6Address)):
|
|
259
255
|
return int(ip)
|
|
260
256
|
except Exception:
|
|
261
257
|
pass
|
|
@@ -263,7 +259,7 @@ class Dataset: # (pd.DataFrame):
|
|
|
263
259
|
@staticmethod
|
|
264
260
|
def _ip_to_int_str(ip: Optional[_BaseAddress]) -> Optional[str]:
|
|
265
261
|
try:
|
|
266
|
-
if isinstance(ip, IPv4Address
|
|
262
|
+
if isinstance(ip, (IPv4Address, IPv6Address)):
|
|
267
263
|
return str(int(ip))
|
|
268
264
|
except Exception:
|
|
269
265
|
pass
|
|
@@ -357,9 +353,7 @@ class Dataset: # (pd.DataFrame):
|
|
|
357
353
|
|
|
358
354
|
if is_string_dtype(self.data[postal_code]) or is_object_dtype(self.data[postal_code]):
|
|
359
355
|
try:
|
|
360
|
-
self.data[postal_code] = (
|
|
361
|
-
self.data[postal_code].astype("string").astype("Float64").astype("Int64").astype("string")
|
|
362
|
-
)
|
|
356
|
+
self.data[postal_code] = self.data[postal_code].astype("float64").astype("Int64").astype("string")
|
|
363
357
|
except Exception:
|
|
364
358
|
pass
|
|
365
359
|
elif is_float_dtype(self.data[postal_code]):
|
|
@@ -809,9 +803,6 @@ class Dataset: # (pd.DataFrame):
|
|
|
809
803
|
meaningType=meaning_type,
|
|
810
804
|
minMaxValues=min_max_values,
|
|
811
805
|
)
|
|
812
|
-
if self.unnest_search_keys and column_meta.originalName in self.unnest_search_keys:
|
|
813
|
-
column_meta.isUnnest = True
|
|
814
|
-
column_meta.unnestKeyNames = self.unnest_search_keys[column_meta.originalName]
|
|
815
806
|
|
|
816
807
|
columns.append(column_meta)
|
|
817
808
|
|
upgini/errors.py
CHANGED
|
@@ -16,7 +16,7 @@ class UnauthorizedError(HttpError):
|
|
|
16
16
|
"""Unauthorized error from REST API."""
|
|
17
17
|
|
|
18
18
|
def __init__(self, message, status_code):
|
|
19
|
-
message = "Unauthorized, please check your authorization token ({})"
|
|
19
|
+
message = f"Unauthorized, please check your authorization token ({message})"
|
|
20
20
|
super(UnauthorizedError, self).__init__(message, status_code)
|
|
21
21
|
|
|
22
22
|
|