upgini 1.1.296__py3-none-any.whl → 1.1.296a3511.dev1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- upgini/__about__.py +1 -1
- upgini/autofe/all_operands.py +24 -6
- upgini/autofe/binary.py +81 -2
- upgini/data_source/data_source_publisher.py +0 -37
- upgini/features_enricher.py +1 -1
- {upgini-1.1.296.dist-info → upgini-1.1.296a3511.dev1.dist-info}/METADATA +1 -1
- {upgini-1.1.296.dist-info → upgini-1.1.296a3511.dev1.dist-info}/RECORD +9 -9
- {upgini-1.1.296.dist-info → upgini-1.1.296a3511.dev1.dist-info}/WHEEL +1 -1
- {upgini-1.1.296.dist-info → upgini-1.1.296a3511.dev1.dist-info}/licenses/LICENSE +0 -0
upgini/__about__.py
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__ = "1.1.296"
|
|
1
|
+
__version__ = "1.1.296a3511.dev1"
|
upgini/autofe/all_operands.py
CHANGED
|
@@ -1,8 +1,22 @@
|
|
|
1
1
|
from typing import Dict
|
|
2
2
|
|
|
3
|
-
from upgini.autofe.binary import
|
|
3
|
+
from upgini.autofe.binary import (
|
|
4
|
+
Add,
|
|
5
|
+
Combine,
|
|
6
|
+
CombineThenFreq,
|
|
7
|
+
Distance,
|
|
8
|
+
Divide,
|
|
9
|
+
JaroWinklerSim1,
|
|
10
|
+
JaroWinklerSim2,
|
|
11
|
+
LevenshteinSim,
|
|
12
|
+
Max,
|
|
13
|
+
Min,
|
|
14
|
+
Multiply,
|
|
15
|
+
Sim,
|
|
16
|
+
Subtract,
|
|
17
|
+
)
|
|
4
18
|
from upgini.autofe.date import DateDiff, DateDiffType2, DateListDiff, DateListDiffBounded, DatePercentile
|
|
5
|
-
from upgini.autofe.groupby import GroupByThenAgg, GroupByThenRank
|
|
19
|
+
from upgini.autofe.groupby import GroupByThenAgg, GroupByThenFreq, GroupByThenNUnique, GroupByThenRank
|
|
6
20
|
from upgini.autofe.operand import Operand
|
|
7
21
|
from upgini.autofe.unary import Abs, Floor, Freq, Log, Residual, Norm, Sigmoid, Sqrt, Square
|
|
8
22
|
from upgini.autofe.vector import Mean, Sum
|
|
@@ -32,10 +46,10 @@ ALL_OPERANDS: Dict[str, Operand] = {
|
|
|
32
46
|
GroupByThenAgg(name="GroupByThenMedian", agg="median"),
|
|
33
47
|
GroupByThenAgg(name="GroupByThenStd", output_type="float", agg="std"),
|
|
34
48
|
GroupByThenRank(),
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
49
|
+
Combine(),
|
|
50
|
+
CombineThenFreq(),
|
|
51
|
+
GroupByThenNUnique(),
|
|
52
|
+
GroupByThenFreq(),
|
|
39
53
|
Sim(),
|
|
40
54
|
DateDiff(),
|
|
41
55
|
DateDiffType2(),
|
|
@@ -51,6 +65,10 @@ ALL_OPERANDS: Dict[str, Operand] = {
|
|
|
51
65
|
DateListDiffBounded(diff_unit="Y", aggregation="count", lower_bound=60),
|
|
52
66
|
DatePercentile(),
|
|
53
67
|
Norm(),
|
|
68
|
+
JaroWinklerSim1(),
|
|
69
|
+
JaroWinklerSim2(),
|
|
70
|
+
LevenshteinSim(),
|
|
71
|
+
Distance(),
|
|
54
72
|
]
|
|
55
73
|
}
|
|
56
74
|
|
upgini/autofe/binary.py
CHANGED
|
@@ -1,7 +1,11 @@
|
|
|
1
|
+
import abc
|
|
2
|
+
from typing import Optional
|
|
3
|
+
import Levenshtein
|
|
1
4
|
import numpy as np
|
|
2
5
|
import pandas as pd
|
|
3
6
|
from numpy import dot
|
|
4
7
|
from numpy.linalg import norm
|
|
8
|
+
from jarowinkler import jarowinkler_similarity
|
|
5
9
|
|
|
6
10
|
from upgini.autofe.operand import PandasOperand, VectorizableMixin
|
|
7
11
|
|
|
@@ -130,8 +134,8 @@ class CombineThenFreq(PandasOperand):
|
|
|
130
134
|
self._loc(temp, value_counts)
|
|
131
135
|
|
|
132
136
|
|
|
133
|
-
class Sim(PandasOperand):
|
|
134
|
-
name = "sim"
|
|
137
|
+
class Distance(PandasOperand):
|
|
138
|
+
name = "dist"
|
|
135
139
|
is_binary = True
|
|
136
140
|
output_type = "float"
|
|
137
141
|
is_symmetrical = True
|
|
@@ -139,3 +143,78 @@ class Sim(PandasOperand):
|
|
|
139
143
|
|
|
140
144
|
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
141
145
|
return dot(left, right) / (norm(left) * norm(right))
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
class Sim(Distance):
|
|
149
|
+
name = "sim"
|
|
150
|
+
is_binary = True
|
|
151
|
+
output_type = "float"
|
|
152
|
+
is_symmetrical = True
|
|
153
|
+
has_symmetry_importance = True
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
class StringSim(PandasOperand, abc.ABC):
|
|
157
|
+
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
158
|
+
sims = []
|
|
159
|
+
for i in left.index:
|
|
160
|
+
left_i = self._prepare_value(left.get(i))
|
|
161
|
+
right_i = self._prepare_value(right.get(i))
|
|
162
|
+
if left_i is not None and right_i is not None:
|
|
163
|
+
sims.append(self._similarity(left_i, right_i))
|
|
164
|
+
else:
|
|
165
|
+
sims.append(None)
|
|
166
|
+
|
|
167
|
+
return pd.Series(sims, index=left.index)
|
|
168
|
+
|
|
169
|
+
@abc.abstractmethod
|
|
170
|
+
def _prepare_value(self, value: Optional[str]) -> Optional[str]:
|
|
171
|
+
pass
|
|
172
|
+
|
|
173
|
+
@abc.abstractmethod
|
|
174
|
+
def _similarity(self, left: str, right: str) -> float:
|
|
175
|
+
pass
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
class JaroWinklerSim1(StringSim):
|
|
179
|
+
name = "sim_jw1"
|
|
180
|
+
is_binary = True
|
|
181
|
+
input_type = "string"
|
|
182
|
+
output_type = "float"
|
|
183
|
+
is_symmetrical = True
|
|
184
|
+
has_symmetry_importance = True
|
|
185
|
+
|
|
186
|
+
def _prepare_value(self, value: Optional[str]) -> Optional[str]:
|
|
187
|
+
return value
|
|
188
|
+
|
|
189
|
+
def _similarity(self, left: str, right: str) -> float:
|
|
190
|
+
return jarowinkler_similarity(left, right)
|
|
191
|
+
|
|
192
|
+
|
|
193
|
+
class JaroWinklerSim2(StringSim):
|
|
194
|
+
name = "sim_jw2"
|
|
195
|
+
is_binary = True
|
|
196
|
+
input_type = "string"
|
|
197
|
+
output_type = "float"
|
|
198
|
+
is_symmetrical = True
|
|
199
|
+
has_symmetry_importance = True
|
|
200
|
+
|
|
201
|
+
def _prepare_value(self, value: Optional[str]) -> Optional[str]:
|
|
202
|
+
return value[::-1] if value is not None else None
|
|
203
|
+
|
|
204
|
+
def _similarity(self, left: str, right: str) -> float:
|
|
205
|
+
return jarowinkler_similarity(left, right)
|
|
206
|
+
|
|
207
|
+
|
|
208
|
+
class LevenshteinSim(StringSim):
|
|
209
|
+
name = "sim_lv"
|
|
210
|
+
is_binary = True
|
|
211
|
+
input_type = "string"
|
|
212
|
+
output_type = "float"
|
|
213
|
+
is_symmetrical = True
|
|
214
|
+
has_symmetry_importance = True
|
|
215
|
+
|
|
216
|
+
def _prepare_value(self, value: Optional[str]) -> Optional[str]:
|
|
217
|
+
return value
|
|
218
|
+
|
|
219
|
+
def _similarity(self, left: str, right: str) -> float:
|
|
220
|
+
return 1 - Levenshtein.distance(left, right) / max(len(left), len(right))
|
|
upgini/data_source/data_source_publisher.py
CHANGED
|
@@ -59,35 +59,9 @@ class DataSourcePublisher:
|
|
|
59
59
|
features_for_embeddings: Optional[List[str]] = DEFAULT_GENERATE_EMBEDDINGS,
|
|
60
60
|
data_table_id_to_replace: Optional[str] = None,
|
|
61
61
|
keep_features: Optional[List[str]] = None,
|
|
62
|
-
date_features: Optional[List[str]] = None,
|
|
63
|
-
date_vector_features: Optional[List[str]] = None,
|
|
64
62
|
_force_generation=False,
|
|
65
63
|
_silent=False,
|
|
66
64
|
) -> str:
|
|
67
|
-
"""Register new ADS
|
|
68
|
-
|
|
69
|
-
Parameters
|
|
70
|
-
----------
|
|
71
|
-
data_table_uri - str - table name in format {project_id}.{datasource_name}.{table_name}
|
|
72
|
-
|
|
73
|
-
search_keys - dict with column names as keys and SearchKey as value
|
|
74
|
-
|
|
75
|
-
update_frequency - str - (Monthly, Weekly, Daily, Annually, Quarterly)
|
|
76
|
-
|
|
77
|
-
exclude_from_autofe_generation - optional list of features that should be excluded from AutoFE
|
|
78
|
-
|
|
79
|
-
secondary_search_keys - optional dict of secondary search keys
|
|
80
|
-
|
|
81
|
-
sort_column - optional str - name of unique column that could be used for sort
|
|
82
|
-
|
|
83
|
-
date_format - optional str - format of date if it is present in search keys
|
|
84
|
-
|
|
85
|
-
...
|
|
86
|
-
|
|
87
|
-
data_table_id_to_replace - optional str - id of registered ADS that should be replaced by new table
|
|
88
|
-
|
|
89
|
-
keep_features - optional list - features that should not be removed from ADS (even if they are personal)
|
|
90
|
-
"""
|
|
91
65
|
trace_id = str(uuid.uuid4())
|
|
92
66
|
|
|
93
67
|
with MDC(trace_id=trace_id):
|
|
@@ -150,14 +124,6 @@ class DataSourcePublisher:
|
|
|
150
124
|
request["excludeFromGeneration"] = exclude_from_autofe_generation
|
|
151
125
|
if keep_features is not None:
|
|
152
126
|
request["keepFeatures"] = keep_features
|
|
153
|
-
if date_features is not None:
|
|
154
|
-
if date_format is None:
|
|
155
|
-
raise ValidationError("date_format should be presented if you use date features")
|
|
156
|
-
request["dateFeatures"] = date_features
|
|
157
|
-
if date_vector_features is not None:
|
|
158
|
-
if date_format is None:
|
|
159
|
-
raise ValidationError("date_format should be presented if you use date vector features")
|
|
160
|
-
request["dateVectorFeatures"] = date_vector_features
|
|
161
127
|
self.logger.info(f"Start registering data table {request}")
|
|
162
128
|
|
|
163
129
|
task_id = self._rest_client.register_ads(request, trace_id)
|
|
@@ -215,9 +181,6 @@ class DataSourcePublisher:
|
|
|
215
181
|
msg = f"Data table successfully registered with id: {data_table_id}"
|
|
216
182
|
self.logger.info(msg)
|
|
217
183
|
print(msg)
|
|
218
|
-
if "warnings" in status_response and status_response["warnings"]:
|
|
219
|
-
self.logger.warning(status_response["warnings"])
|
|
220
|
-
print(status_response["warnings"])
|
|
221
184
|
return data_table_id
|
|
222
185
|
except KeyboardInterrupt:
|
|
223
186
|
if task_id is not None:
|
upgini/features_enricher.py
CHANGED
|
@@ -2870,7 +2870,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2870
2870
|
self.logger.info(f"Dates interval is ({min_date}, {max_date})")
|
|
2871
2871
|
|
|
2872
2872
|
except Exception:
|
|
2873
|
-
self.logger.
|
|
2873
|
+
self.logger.exception("Failed to log debug information")
|
|
2874
2874
|
|
|
2875
2875
|
def __handle_index_search_keys(self, df: pd.DataFrame, search_keys: Dict[str, SearchKey]) -> pd.DataFrame:
|
|
2876
2876
|
index_names = df.index.names if df.index.names != [None] else [DEFAULT_INDEX]
|
|
{upgini-1.1.296.dist-info → upgini-1.1.296a3511.dev1.dist-info}/RECORD
CHANGED
|
@@ -1,9 +1,9 @@
|
|
|
1
|
-
upgini/__about__.py,sha256=
|
|
1
|
+
upgini/__about__.py,sha256=cimRDJH6AT3BICa6gKJosTY9mQiinR-sub8yCsdaiuY,34
|
|
2
2
|
upgini/__init__.py,sha256=ObEtjFkIssl83qeKNMLpIQygfwK8TzztwiI43YTsAP0,353
|
|
3
3
|
upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
|
|
4
4
|
upgini/dataset.py,sha256=7TLVVhGtjgx_9yaiaIUK3kZSe_R9wg5dY0d4F5qCGM4,45636
|
|
5
5
|
upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
|
|
6
|
-
upgini/features_enricher.py,sha256=
|
|
6
|
+
upgini/features_enricher.py,sha256=rLy6BwhL94VkRCp8W4RxJ0lBb7sZqTOsz-bUVI8nXZU,177530
|
|
7
7
|
upgini/http.py,sha256=bp6jWl422Icy3AhHMdCcJv5NjExE45gSMmzMTPJjPuk,42600
|
|
8
8
|
upgini/lazy_import.py,sha256=EwoM0msNGbSmWBhGbrLDny1DSnOlvTxCjmMKPxYlDms,610
|
|
9
9
|
upgini/metadata.py,sha256=qDAIO7NLSSQp_XiXCv3U4XJTLO0KH3YuQ8lvCLYPqzs,9781
|
|
@@ -14,8 +14,8 @@ upgini/version_validator.py,sha256=ddSKUK_-eGJB3NgrqOMoWJU-OxQ253WsNLp8aqJkaIM,1
|
|
|
14
14
|
upgini/ads_management/__init__.py,sha256=qzyisOToVRP-tquAJD1PblZhNtMrOB8FiyF9JvfkvgE,50
|
|
15
15
|
upgini/ads_management/ads_manager.py,sha256=igVbN2jz80Umb2BUJixmJVj-zx8unoKpecVo-R-nGdw,2648
|
|
16
16
|
upgini/autofe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
17
|
-
upgini/autofe/all_operands.py,sha256=
|
|
18
|
-
upgini/autofe/binary.py,sha256=
|
|
17
|
+
upgini/autofe/all_operands.py,sha256=ZS9A7u1gV7Bt8tHFfCiNx9u4q85I1ny4NIL-keyfWHY,2423
|
|
18
|
+
upgini/autofe/binary.py,sha256=P46AVcFRiMKYucK3_N34QDqVMbvv2OuyWAREx0bUNwg,6367
|
|
19
19
|
upgini/autofe/date.py,sha256=qzk0NT332Q0vR1eRwTuNiMSrGE3ulh6Ic3QLBZqSdvw,7284
|
|
20
20
|
upgini/autofe/feature.py,sha256=_V9B74B3ue7eAYXSOt9JKhVC9klkAKks22MwnBRye_w,12487
|
|
21
21
|
upgini/autofe/groupby.py,sha256=4WjDzQxqpZxB79Ih4ihMMI5GDxaFqiH6ZelfV82ClT4,3091
|
|
@@ -23,7 +23,7 @@ upgini/autofe/operand.py,sha256=MKEsl3zxpWzRDpTkE0sNJxTu62U20sWOvEKhPjUWS6s,2915
|
|
|
23
23
|
upgini/autofe/unary.py,sha256=ZWjLd-CUkNt_PpM8YuWLLipW1v_RdBlsl4JxXIVo9aM,3652
|
|
24
24
|
upgini/autofe/vector.py,sha256=dLxfAstJs-gw_OQ1xxoxcM6pVzORlV0HVzdzt7cLXVQ,606
|
|
25
25
|
upgini/data_source/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
26
|
-
upgini/data_source/data_source_publisher.py,sha256=
|
|
26
|
+
upgini/data_source/data_source_publisher.py,sha256=6paupnciqagACnSzjGSkA2a5i-c9ETvZheLqBYOJemk,17810
|
|
27
27
|
upgini/mdc/__init__.py,sha256=aM08nIWFc2gWdWUa3_IuEnNND0cQPkBGnYpRMnfFN8k,1019
|
|
28
28
|
upgini/mdc/context.py,sha256=3u1B-jXt7tXEvNcV3qmR9SDCseudnY7KYsLclBdwVLk,1405
|
|
29
29
|
upgini/normalizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
@@ -57,7 +57,7 @@ upgini/utils/sklearn_ext.py,sha256=13jQS_k7v0aUtudXV6nGUEWjttPQzAW9AFYL5wgEz9k,4
|
|
|
57
57
|
upgini/utils/target_utils.py,sha256=Y96_PJ5cC-WsEbeqg20v9uqywDQobLoTb-xoP7S3o4E,7807
|
|
58
58
|
upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
|
|
59
59
|
upgini/utils/warning_counter.py,sha256=dIWBB4dI5XRRJZudvIlqlIYKEiwLLPcXarsZuYRt338,227
|
|
60
|
-
upgini-1.1.
|
|
61
|
-
upgini-1.1.
|
|
62
|
-
upgini-1.1.
|
|
63
|
-
upgini-1.1.
|
|
60
|
+
upgini-1.1.296a3511.dev1.dist-info/METADATA,sha256=_tPLQLuHI0_j5lGzkGhMSftXbfpLi8aCTRpjT4fd8QQ,48127
|
|
61
|
+
upgini-1.1.296a3511.dev1.dist-info/WHEEL,sha256=zEMcRr9Kr03x1ozGwg5v9NQBKn3kndp6LSoSlVg-jhU,87
|
|
62
|
+
upgini-1.1.296a3511.dev1.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
|
|
63
|
+
upgini-1.1.296a3511.dev1.dist-info/RECORD,,
|
|
File without changes
|