upgini 1.1.299__py3-none-any.whl → 1.1.299a3511.dev6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- upgini/__about__.py +1 -1
- upgini/autofe/all_operands.py +26 -7
- upgini/autofe/binary.py +91 -2
- upgini/autofe/date.py +16 -3
- upgini/autofe/feature.py +3 -2
- upgini/autofe/unary.py +7 -0
- upgini/dataset.py +2 -11
- upgini/features_enricher.py +101 -222
- upgini/metadata.py +2 -10
- upgini/metrics.py +1 -1
- upgini/resource_bundle/strings.properties +0 -1
- upgini/utils/base_search_key_detector.py +12 -14
- upgini/utils/deduplicate_utils.py +1 -11
- upgini/utils/email_utils.py +0 -5
- {upgini-1.1.299.dist-info → upgini-1.1.299a3511.dev6.dist-info}/METADATA +5 -3
- {upgini-1.1.299.dist-info → upgini-1.1.299a3511.dev6.dist-info}/RECORD +18 -18
- {upgini-1.1.299.dist-info → upgini-1.1.299a3511.dev6.dist-info}/WHEEL +1 -1
- {upgini-1.1.299.dist-info → upgini-1.1.299a3511.dev6.dist-info}/licenses/LICENSE +0 -0
upgini/__about__.py
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__ = "1.1.299"
|
|
1
|
+
__version__ = "1.1.299a3511.dev6"
|
upgini/autofe/all_operands.py
CHANGED
|
@@ -1,6 +1,20 @@
|
|
|
1
1
|
from typing import Dict
|
|
2
2
|
|
|
3
|
-
from upgini.autofe.binary import
|
|
3
|
+
from upgini.autofe.binary import (
|
|
4
|
+
Add,
|
|
5
|
+
Combine,
|
|
6
|
+
CombineThenFreq,
|
|
7
|
+
Distance,
|
|
8
|
+
Divide,
|
|
9
|
+
JaroWinklerSim1,
|
|
10
|
+
JaroWinklerSim2,
|
|
11
|
+
LevenshteinSim,
|
|
12
|
+
Max,
|
|
13
|
+
Min,
|
|
14
|
+
Multiply,
|
|
15
|
+
Sim,
|
|
16
|
+
Subtract,
|
|
17
|
+
)
|
|
4
18
|
from upgini.autofe.date import (
|
|
5
19
|
DateDiff,
|
|
6
20
|
DateDiffType2,
|
|
@@ -9,9 +23,9 @@ from upgini.autofe.date import (
|
|
|
9
23
|
DatePercentile,
|
|
10
24
|
DatePercentileMethod2,
|
|
11
25
|
)
|
|
12
|
-
from upgini.autofe.groupby import GroupByThenAgg, GroupByThenRank
|
|
26
|
+
from upgini.autofe.groupby import GroupByThenAgg, GroupByThenFreq, GroupByThenNUnique, GroupByThenRank
|
|
13
27
|
from upgini.autofe.operand import Operand
|
|
14
|
-
from upgini.autofe.unary import Abs, Floor, Freq, Log, Residual, Norm, Sigmoid, Sqrt, Square
|
|
28
|
+
from upgini.autofe.unary import Abs, Embeddings, Floor, Freq, Log, Residual, Norm, Sigmoid, Sqrt, Square
|
|
15
29
|
from upgini.autofe.vector import Mean, Sum
|
|
16
30
|
|
|
17
31
|
ALL_OPERANDS: Dict[str, Operand] = {
|
|
@@ -39,10 +53,10 @@ ALL_OPERANDS: Dict[str, Operand] = {
|
|
|
39
53
|
GroupByThenAgg(name="GroupByThenMedian", agg="median"),
|
|
40
54
|
GroupByThenAgg(name="GroupByThenStd", output_type="float", agg="std"),
|
|
41
55
|
GroupByThenRank(),
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
56
|
+
Combine(),
|
|
57
|
+
CombineThenFreq(),
|
|
58
|
+
GroupByThenNUnique(),
|
|
59
|
+
GroupByThenFreq(),
|
|
46
60
|
Sim(),
|
|
47
61
|
DateDiff(),
|
|
48
62
|
DateDiffType2(),
|
|
@@ -59,6 +73,11 @@ ALL_OPERANDS: Dict[str, Operand] = {
|
|
|
59
73
|
DatePercentile(),
|
|
60
74
|
DatePercentileMethod2(),
|
|
61
75
|
Norm(),
|
|
76
|
+
JaroWinklerSim1(),
|
|
77
|
+
JaroWinklerSim2(),
|
|
78
|
+
LevenshteinSim(),
|
|
79
|
+
Distance(),
|
|
80
|
+
Embeddings(),
|
|
62
81
|
]
|
|
63
82
|
}
|
|
64
83
|
|
upgini/autofe/binary.py
CHANGED
|
@@ -1,7 +1,11 @@
|
|
|
1
|
+
import abc
|
|
2
|
+
from typing import Optional
|
|
3
|
+
import Levenshtein
|
|
1
4
|
import numpy as np
|
|
2
5
|
import pandas as pd
|
|
3
6
|
from numpy import dot
|
|
4
7
|
from numpy.linalg import norm
|
|
8
|
+
from jarowinkler import jarowinkler_similarity
|
|
5
9
|
|
|
6
10
|
from upgini.autofe.operand import PandasOperand, VectorizableMixin
|
|
7
11
|
|
|
@@ -130,7 +134,25 @@ class CombineThenFreq(PandasOperand):
|
|
|
130
134
|
self._loc(temp, value_counts)
|
|
131
135
|
|
|
132
136
|
|
|
133
|
-
class Sim(PandasOperand):
|
|
137
|
+
class Distance(PandasOperand):
|
|
138
|
+
name = "dist"
|
|
139
|
+
is_binary = True
|
|
140
|
+
output_type = "float"
|
|
141
|
+
is_symmetrical = True
|
|
142
|
+
has_symmetry_importance = True
|
|
143
|
+
|
|
144
|
+
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
145
|
+
return pd.Series(
|
|
146
|
+
1 - self.__dot(left, right) / (self.__dot(left, left) * self.__dot(right, right)), index=left.index
|
|
147
|
+
)
|
|
148
|
+
|
|
149
|
+
# row-wise dot product
|
|
150
|
+
def __dot(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
151
|
+
return (left * right).apply(np.sum)
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
# Left for backward compatibility
|
|
155
|
+
class Sim(Distance):
|
|
134
156
|
name = "sim"
|
|
135
157
|
is_binary = True
|
|
136
158
|
output_type = "float"
|
|
@@ -138,4 +160,71 @@ class Sim(PandasOperand):
|
|
|
138
160
|
has_symmetry_importance = True
|
|
139
161
|
|
|
140
162
|
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
141
|
-
return
|
|
163
|
+
return 1 - super().calculate_binary(left, right)
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
class StringSim(PandasOperand, abc.ABC):
|
|
167
|
+
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
168
|
+
sims = []
|
|
169
|
+
for i in left.index:
|
|
170
|
+
left_i = self._prepare_value(left.get(i))
|
|
171
|
+
right_i = self._prepare_value(right.get(i))
|
|
172
|
+
if left_i is not None and right_i is not None:
|
|
173
|
+
sims.append(self._similarity(left_i, right_i))
|
|
174
|
+
else:
|
|
175
|
+
sims.append(None)
|
|
176
|
+
|
|
177
|
+
return pd.Series(sims, index=left.index)
|
|
178
|
+
|
|
179
|
+
@abc.abstractmethod
|
|
180
|
+
def _prepare_value(self, value: Optional[str]) -> Optional[str]:
|
|
181
|
+
pass
|
|
182
|
+
|
|
183
|
+
@abc.abstractmethod
|
|
184
|
+
def _similarity(self, left: str, right: str) -> float:
|
|
185
|
+
pass
|
|
186
|
+
|
|
187
|
+
|
|
188
|
+
class JaroWinklerSim1(StringSim):
|
|
189
|
+
name = "sim_jw1"
|
|
190
|
+
is_binary = True
|
|
191
|
+
input_type = "string"
|
|
192
|
+
output_type = "float"
|
|
193
|
+
is_symmetrical = True
|
|
194
|
+
has_symmetry_importance = True
|
|
195
|
+
|
|
196
|
+
def _prepare_value(self, value: Optional[str]) -> Optional[str]:
|
|
197
|
+
return value
|
|
198
|
+
|
|
199
|
+
def _similarity(self, left: str, right: str) -> float:
|
|
200
|
+
return jarowinkler_similarity(left, right)
|
|
201
|
+
|
|
202
|
+
|
|
203
|
+
class JaroWinklerSim2(StringSim):
|
|
204
|
+
name = "sim_jw2"
|
|
205
|
+
is_binary = True
|
|
206
|
+
input_type = "string"
|
|
207
|
+
output_type = "float"
|
|
208
|
+
is_symmetrical = True
|
|
209
|
+
has_symmetry_importance = True
|
|
210
|
+
|
|
211
|
+
def _prepare_value(self, value: Optional[str]) -> Optional[str]:
|
|
212
|
+
return value[::-1] if value is not None else None
|
|
213
|
+
|
|
214
|
+
def _similarity(self, left: str, right: str) -> float:
|
|
215
|
+
return jarowinkler_similarity(left, right)
|
|
216
|
+
|
|
217
|
+
|
|
218
|
+
class LevenshteinSim(StringSim):
|
|
219
|
+
name = "sim_lv"
|
|
220
|
+
is_binary = True
|
|
221
|
+
input_type = "string"
|
|
222
|
+
output_type = "float"
|
|
223
|
+
is_symmetrical = True
|
|
224
|
+
has_symmetry_importance = True
|
|
225
|
+
|
|
226
|
+
def _prepare_value(self, value: Optional[str]) -> Optional[str]:
|
|
227
|
+
return value
|
|
228
|
+
|
|
229
|
+
def _similarity(self, left: str, right: str) -> float:
|
|
230
|
+
return 1 - Levenshtein.distance(left, right) / max(len(left), len(right))
|
upgini/autofe/date.py
CHANGED
|
@@ -43,6 +43,8 @@ class DateDiff(PandasOperand, DateDiffMixin):
|
|
|
43
43
|
is_binary = True
|
|
44
44
|
has_symmetry_importance = True
|
|
45
45
|
|
|
46
|
+
replace_negative: bool = False
|
|
47
|
+
|
|
46
48
|
def get_params(self) -> Dict[str, Optional[str]]:
|
|
47
49
|
res = super().get_params()
|
|
48
50
|
res.update(
|
|
@@ -50,6 +52,7 @@ class DateDiff(PandasOperand, DateDiffMixin):
|
|
|
50
52
|
"diff_unit": self.diff_unit,
|
|
51
53
|
"left_unit": self.left_unit,
|
|
52
54
|
"right_unit": self.right_unit,
|
|
55
|
+
"replace_negative": self.replace_negative,
|
|
53
56
|
}
|
|
54
57
|
)
|
|
55
58
|
return res
|
|
@@ -61,7 +64,8 @@ class DateDiff(PandasOperand, DateDiffMixin):
|
|
|
61
64
|
return self.__replace_negative(diff)
|
|
62
65
|
|
|
63
66
|
def __replace_negative(self, x: Union[pd.DataFrame, pd.Series]):
|
|
64
|
-
|
|
67
|
+
if self.replace_negative:
|
|
68
|
+
x[x < 0] = None
|
|
65
69
|
return x
|
|
66
70
|
|
|
67
71
|
|
|
@@ -101,13 +105,19 @@ _ext_aggregations = {"nunique": (lambda x: len(np.unique(x)), 0), "count": (len,
|
|
|
101
105
|
class DateListDiff(PandasOperand, DateDiffMixin):
|
|
102
106
|
is_binary = True
|
|
103
107
|
has_symmetry_importance = True
|
|
108
|
+
|
|
104
109
|
aggregation: str
|
|
110
|
+
replace_negative: bool = False
|
|
105
111
|
|
|
106
112
|
def get_params(self) -> Dict[str, Optional[str]]:
|
|
107
113
|
res = super().get_params()
|
|
108
114
|
res.update(
|
|
109
115
|
{
|
|
110
116
|
"aggregation": self.aggregation,
|
|
117
|
+
"diff_unit": self.diff_unit,
|
|
118
|
+
"left_unit": self.left_unit,
|
|
119
|
+
"right_unit": self.right_unit,
|
|
120
|
+
"replace_negative": self.replace_negative,
|
|
111
121
|
}
|
|
112
122
|
)
|
|
113
123
|
return res
|
|
@@ -125,7 +135,7 @@ class DateListDiff(PandasOperand, DateDiffMixin):
|
|
|
125
135
|
|
|
126
136
|
def _diff(self, x: TimedeltaArray):
|
|
127
137
|
x = self._convert_diff_to_unit(x)
|
|
128
|
-
return x[x > 0]
|
|
138
|
+
return x[x > 0] if self.replace_negative else x
|
|
129
139
|
|
|
130
140
|
def _agg(self, x):
|
|
131
141
|
method = getattr(np, self.aggregation, None)
|
|
@@ -157,7 +167,10 @@ class DateListDiffBounded(DateListDiff):
|
|
|
157
167
|
super().__init__(**data)
|
|
158
168
|
|
|
159
169
|
def _agg(self, x):
|
|
160
|
-
x = x[
|
|
170
|
+
x = x[
|
|
171
|
+
(x >= (self.lower_bound if self.lower_bound is not None else -np.inf))
|
|
172
|
+
& (x < (self.upper_bound if self.upper_bound is not None else np.inf))
|
|
173
|
+
]
|
|
161
174
|
return super()._agg(x)
|
|
162
175
|
|
|
163
176
|
|
upgini/autofe/feature.py
CHANGED
|
@@ -140,8 +140,9 @@ class Feature:
|
|
|
140
140
|
|
|
141
141
|
if self.alias:
|
|
142
142
|
components = ["f_autofe", self.alias]
|
|
143
|
-
elif shorten and not self.op.is_unary:
|
|
144
|
-
|
|
143
|
+
elif shorten and not (self.op.is_unary and all(isinstance(c, Column) for c in self.children)):
|
|
144
|
+
prev_name = [self.children[0].get_op_display_name()] if self.op.is_unary else []
|
|
145
|
+
components = ["f_autofe"] + prev_name + [self.get_op_display_name()]
|
|
145
146
|
else:
|
|
146
147
|
components = ["f_" + "_f_".join(self.get_columns(**kwargs))] + [
|
|
147
148
|
"autofe",
|
upgini/autofe/unary.py
CHANGED
|
@@ -125,3 +125,10 @@ class Norm(PandasOperand):
|
|
|
125
125
|
normalized_data = pd.Series(normalized_data[:, 0], index=data_dropna.index, name=data.name)
|
|
126
126
|
normalized_data = normalized_data.reindex(data.index)
|
|
127
127
|
return normalized_data
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
class Embeddings(PandasOperand):
|
|
131
|
+
name = "emb"
|
|
132
|
+
is_unary = True
|
|
133
|
+
input_type = "string"
|
|
134
|
+
output_type = "vector"
|
upgini/dataset.py
CHANGED
|
@@ -23,9 +23,7 @@ from pandas.api.types import (
|
|
|
23
23
|
from upgini.errors import ValidationError
|
|
24
24
|
from upgini.http import ProgressStage, SearchProgress, _RestClient
|
|
25
25
|
from upgini.metadata import (
|
|
26
|
-
ENTITY_SYSTEM_RECORD_ID,
|
|
27
26
|
EVAL_SET_INDEX,
|
|
28
|
-
SEARCH_KEY_UNNEST,
|
|
29
27
|
SYSTEM_COLUMNS,
|
|
30
28
|
SYSTEM_RECORD_ID,
|
|
31
29
|
TARGET,
|
|
@@ -81,7 +79,6 @@ class Dataset: # (pd.DataFrame):
|
|
|
81
79
|
path: Optional[str] = None,
|
|
82
80
|
meaning_types: Optional[Dict[str, FileColumnMeaningType]] = None,
|
|
83
81
|
search_keys: Optional[List[Tuple[str, ...]]] = None,
|
|
84
|
-
unnest_search_keys: Optional[Dict[str, str]] = None,
|
|
85
82
|
model_task_type: Optional[ModelTaskType] = None,
|
|
86
83
|
random_state: Optional[int] = None,
|
|
87
84
|
rest_client: Optional[_RestClient] = None,
|
|
@@ -116,7 +113,6 @@ class Dataset: # (pd.DataFrame):
|
|
|
116
113
|
self.description = description
|
|
117
114
|
self.meaning_types = meaning_types
|
|
118
115
|
self.search_keys = search_keys
|
|
119
|
-
self.unnest_search_keys = unnest_search_keys
|
|
120
116
|
self.ignore_columns = []
|
|
121
117
|
self.hierarchical_group_keys = []
|
|
122
118
|
self.hierarchical_subgroup_keys = []
|
|
@@ -176,7 +172,7 @@ class Dataset: # (pd.DataFrame):
|
|
|
176
172
|
new_columns = []
|
|
177
173
|
dup_counter = 0
|
|
178
174
|
for column in self.data.columns:
|
|
179
|
-
if column in [TARGET, EVAL_SET_INDEX, SYSTEM_RECORD_ID
|
|
175
|
+
if column in [TARGET, EVAL_SET_INDEX, SYSTEM_RECORD_ID]:
|
|
180
176
|
self.columns_renaming[column] = column
|
|
181
177
|
new_columns.append(column)
|
|
182
178
|
continue
|
|
@@ -357,9 +353,7 @@ class Dataset: # (pd.DataFrame):
|
|
|
357
353
|
|
|
358
354
|
if is_string_dtype(self.data[postal_code]) or is_object_dtype(self.data[postal_code]):
|
|
359
355
|
try:
|
|
360
|
-
self.data[postal_code] = (
|
|
361
|
-
self.data[postal_code].astype("string").astype("Float64").astype("Int64").astype("string")
|
|
362
|
-
)
|
|
356
|
+
self.data[postal_code] = self.data[postal_code].astype("float64").astype("Int64").astype("string")
|
|
363
357
|
except Exception:
|
|
364
358
|
pass
|
|
365
359
|
elif is_float_dtype(self.data[postal_code]):
|
|
@@ -809,9 +803,6 @@ class Dataset: # (pd.DataFrame):
|
|
|
809
803
|
meaningType=meaning_type,
|
|
810
804
|
minMaxValues=min_max_values,
|
|
811
805
|
)
|
|
812
|
-
if self.unnest_search_keys and column_meta.originalName in self.unnest_search_keys:
|
|
813
|
-
column_meta.isUnnest = True
|
|
814
|
-
column_meta.unnestKeyNames = self.unnest_search_keys[column_meta.originalName]
|
|
815
806
|
|
|
816
807
|
columns.append(column_meta)
|
|
817
808
|
|
upgini/features_enricher.py
CHANGED
|
@@ -11,7 +11,6 @@ import sys
|
|
|
11
11
|
import tempfile
|
|
12
12
|
import time
|
|
13
13
|
import uuid
|
|
14
|
-
from collections import Counter
|
|
15
14
|
from dataclasses import dataclass
|
|
16
15
|
from threading import Thread
|
|
17
16
|
from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union
|
|
@@ -46,11 +45,9 @@ from upgini.mdc import MDC
|
|
|
46
45
|
from upgini.metadata import (
|
|
47
46
|
COUNTRY,
|
|
48
47
|
DEFAULT_INDEX,
|
|
49
|
-
ENTITY_SYSTEM_RECORD_ID,
|
|
50
48
|
EVAL_SET_INDEX,
|
|
51
49
|
ORIGINAL_INDEX,
|
|
52
50
|
RENAMED_INDEX,
|
|
53
|
-
SEARCH_KEY_UNNEST,
|
|
54
51
|
SORT_ID,
|
|
55
52
|
SYSTEM_RECORD_ID,
|
|
56
53
|
TARGET,
|
|
@@ -251,7 +248,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
251
248
|
self.__cached_sampled_datasets: Optional[Tuple[pd.DataFrame, pd.DataFrame, pd.Series, Dict, Dict]] = None
|
|
252
249
|
|
|
253
250
|
validate_version(self.logger)
|
|
254
|
-
self.search_keys = search_keys or
|
|
251
|
+
self.search_keys = search_keys or dict()
|
|
255
252
|
self.country_code = country_code
|
|
256
253
|
self.__validate_search_keys(search_keys, search_id)
|
|
257
254
|
self.model_task_type = model_task_type
|
|
@@ -1203,7 +1200,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1203
1200
|
email_column = self._get_email_column(search_keys)
|
|
1204
1201
|
hem_column = self._get_hem_column(search_keys)
|
|
1205
1202
|
if email_column:
|
|
1206
|
-
converter = EmailSearchKeyConverter(email_column, hem_column, search_keys,
|
|
1203
|
+
converter = EmailSearchKeyConverter(email_column, hem_column, search_keys, self.logger)
|
|
1207
1204
|
extended_X = converter.convert(extended_X)
|
|
1208
1205
|
generated_features.extend(converter.generated_features)
|
|
1209
1206
|
if (
|
|
@@ -1356,7 +1353,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1356
1353
|
not in (
|
|
1357
1354
|
excluding_search_keys
|
|
1358
1355
|
+ list(self.fit_dropped_features)
|
|
1359
|
-
+ [DateTimeSearchKeyConverter.DATETIME_COL, SYSTEM_RECORD_ID
|
|
1356
|
+
+ [DateTimeSearchKeyConverter.DATETIME_COL, SYSTEM_RECORD_ID]
|
|
1360
1357
|
)
|
|
1361
1358
|
]
|
|
1362
1359
|
|
|
@@ -1420,7 +1417,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1420
1417
|
fitting_enriched_X[col].astype("string").str.replace(",", ".", regex=False).astype(np.float64)
|
|
1421
1418
|
)
|
|
1422
1419
|
|
|
1423
|
-
fitting_eval_set_dict =
|
|
1420
|
+
fitting_eval_set_dict = dict()
|
|
1424
1421
|
for idx, eval_tuple in eval_set_sampled_dict.items():
|
|
1425
1422
|
eval_X_sampled, enriched_eval_X, eval_y_sampled = eval_tuple
|
|
1426
1423
|
eval_X_sorted, eval_y_sorted = self._sort_by_system_record_id(eval_X_sampled, eval_y_sampled, self.cv)
|
|
@@ -1537,7 +1534,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1537
1534
|
def __sample_only_input(
|
|
1538
1535
|
self, validated_X: pd.DataFrame, validated_y: pd.Series, eval_set: Optional[List[tuple]], is_demo_dataset: bool
|
|
1539
1536
|
) -> _SampledDataForMetrics:
|
|
1540
|
-
eval_set_sampled_dict =
|
|
1537
|
+
eval_set_sampled_dict = dict()
|
|
1541
1538
|
|
|
1542
1539
|
df = validated_X.copy()
|
|
1543
1540
|
df[TARGET] = validated_y
|
|
@@ -1563,7 +1560,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1563
1560
|
df = df.sample(n=sample_rows, random_state=self.random_state)
|
|
1564
1561
|
|
|
1565
1562
|
df_extended, search_keys = self._extend_x(df, is_demo_dataset)
|
|
1566
|
-
df_extended = self.__add_fit_system_record_id(df_extended,
|
|
1563
|
+
df_extended = self.__add_fit_system_record_id(df_extended, dict(), search_keys)
|
|
1567
1564
|
|
|
1568
1565
|
train_df = df_extended.query(f"{EVAL_SET_INDEX} == 0") if eval_set is not None else df_extended
|
|
1569
1566
|
X_sampled = train_df.drop(columns=[TARGET, EVAL_SET_INDEX], errors="ignore")
|
|
@@ -1587,7 +1584,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1587
1584
|
trace_id: str,
|
|
1588
1585
|
remove_outliers_calc_metrics: Optional[bool],
|
|
1589
1586
|
) -> _SampledDataForMetrics:
|
|
1590
|
-
eval_set_sampled_dict =
|
|
1587
|
+
eval_set_sampled_dict = dict()
|
|
1591
1588
|
search_keys = self.fit_search_keys
|
|
1592
1589
|
|
|
1593
1590
|
rows_to_drop = None
|
|
@@ -1661,7 +1658,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1661
1658
|
progress_bar: Optional[ProgressBar],
|
|
1662
1659
|
progress_callback: Optional[Callable[[SearchProgress], Any]],
|
|
1663
1660
|
) -> _SampledDataForMetrics:
|
|
1664
|
-
eval_set_sampled_dict =
|
|
1661
|
+
eval_set_sampled_dict = dict()
|
|
1665
1662
|
if eval_set is not None:
|
|
1666
1663
|
self.logger.info("Transform with eval_set")
|
|
1667
1664
|
# concatenate X and eval_set with eval_set_index
|
|
@@ -1683,7 +1680,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1683
1680
|
self.logger.info(f"Downsampling from {num_samples} to {Dataset.FIT_SAMPLE_WITH_EVAL_SET_ROWS}")
|
|
1684
1681
|
df = df.sample(n=Dataset.FIT_SAMPLE_WITH_EVAL_SET_ROWS, random_state=self.random_state)
|
|
1685
1682
|
|
|
1686
|
-
eval_set_sampled_dict =
|
|
1683
|
+
eval_set_sampled_dict = dict()
|
|
1687
1684
|
|
|
1688
1685
|
tmp_target_name = "__target"
|
|
1689
1686
|
df = df.rename(columns={TARGET: tmp_target_name})
|
|
@@ -1946,38 +1943,11 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1946
1943
|
self.logger.info("Input dataset hasn't date column")
|
|
1947
1944
|
if self.add_date_if_missing:
|
|
1948
1945
|
df = self._add_current_date_as_key(df, search_keys, self.logger, self.bundle)
|
|
1949
|
-
|
|
1950
|
-
# Don't pass all features in backend on transform
|
|
1951
|
-
original_features_for_transform = []
|
|
1952
|
-
runtime_parameters = self._get_copy_of_runtime_parameters()
|
|
1953
|
-
features_not_to_pass = [column for column in df.columns if column not in search_keys.keys()]
|
|
1954
|
-
if len(features_not_to_pass) > 0:
|
|
1955
|
-
# Pass only features that need for transform
|
|
1956
|
-
features_for_transform = self._search_task.get_features_for_transform()
|
|
1957
|
-
if features_for_transform is not None and len(features_for_transform) > 0:
|
|
1958
|
-
file_metadata = self._search_task.get_file_metadata(trace_id)
|
|
1959
|
-
original_features_for_transform = [
|
|
1960
|
-
c.originalName or c.name for c in file_metadata.columns if c.name in features_for_transform
|
|
1961
|
-
]
|
|
1962
|
-
|
|
1963
|
-
runtime_parameters.properties["features_for_embeddings"] = ",".join(features_for_transform)
|
|
1964
|
-
|
|
1965
|
-
columns_for_system_record_id = sorted(list(search_keys.keys()) + (original_features_for_transform))
|
|
1966
|
-
|
|
1967
|
-
df[ENTITY_SYSTEM_RECORD_ID] = pd.util.hash_pandas_object(
|
|
1968
|
-
df[columns_for_system_record_id], index=False
|
|
1969
|
-
).astype("Float64")
|
|
1970
|
-
|
|
1971
|
-
# Explode multiple search keys
|
|
1972
|
-
df, unnest_search_keys = self._explode_multiple_search_keys(df, search_keys)
|
|
1973
|
-
|
|
1974
1946
|
email_column = self._get_email_column(search_keys)
|
|
1975
1947
|
hem_column = self._get_hem_column(search_keys)
|
|
1976
1948
|
email_converted_to_hem = False
|
|
1977
1949
|
if email_column:
|
|
1978
|
-
converter = EmailSearchKeyConverter(
|
|
1979
|
-
email_column, hem_column, search_keys, list(unnest_search_keys.keys()), self.logger
|
|
1980
|
-
)
|
|
1950
|
+
converter = EmailSearchKeyConverter(email_column, hem_column, search_keys, self.logger)
|
|
1981
1951
|
df = converter.convert(df)
|
|
1982
1952
|
generated_features.extend(converter.generated_features)
|
|
1983
1953
|
email_converted_to_hem = converter.email_converted_to_hem
|
|
@@ -1991,21 +1961,30 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1991
1961
|
generated_features = [f for f in generated_features if f in self.fit_generated_features]
|
|
1992
1962
|
|
|
1993
1963
|
meaning_types = {col: key.value for col, key in search_keys.items()}
|
|
1994
|
-
|
|
1995
|
-
for col in original_features_for_transform:
|
|
1996
|
-
meaning_types[col] = FileColumnMeaningType.FEATURE
|
|
1997
|
-
features_not_to_pass = [column for column in features_not_to_pass if column not in search_keys.keys()]
|
|
1964
|
+
non_keys_columns = [column for column in df.columns if column not in search_keys.keys()]
|
|
1998
1965
|
|
|
1999
1966
|
if email_converted_to_hem:
|
|
2000
|
-
|
|
1967
|
+
non_keys_columns.append(email_column)
|
|
2001
1968
|
|
|
2002
|
-
|
|
2003
|
-
|
|
1969
|
+
# Don't pass features in backend on transform
|
|
1970
|
+
original_features_for_transform = None
|
|
1971
|
+
runtime_parameters = self._get_copy_of_runtime_parameters()
|
|
1972
|
+
if len(non_keys_columns) > 0:
|
|
1973
|
+
# Pass only features that need for transform
|
|
1974
|
+
features_for_transform = self._search_task.get_features_for_transform()
|
|
1975
|
+
if features_for_transform is not None and len(features_for_transform) > 0:
|
|
1976
|
+
file_metadata = self._search_task.get_file_metadata(trace_id)
|
|
1977
|
+
original_features_for_transform = [
|
|
1978
|
+
c.originalName or c.name for c in file_metadata.columns if c.name in features_for_transform
|
|
1979
|
+
]
|
|
1980
|
+
non_keys_columns = [c for c in non_keys_columns if c not in original_features_for_transform]
|
|
1981
|
+
|
|
1982
|
+
runtime_parameters.properties["features_for_embeddings"] = ",".join(features_for_transform)
|
|
2004
1983
|
|
|
2005
1984
|
if add_fit_system_record_id:
|
|
2006
|
-
df = self.__add_fit_system_record_id(df,
|
|
1985
|
+
df = self.__add_fit_system_record_id(df, dict(), search_keys)
|
|
2007
1986
|
df = df.rename(columns={SYSTEM_RECORD_ID: SORT_ID})
|
|
2008
|
-
|
|
1987
|
+
non_keys_columns.append(SORT_ID)
|
|
2009
1988
|
|
|
2010
1989
|
columns_for_system_record_id = sorted(list(search_keys.keys()) + (original_features_for_transform or []))
|
|
2011
1990
|
|
|
@@ -2013,19 +1992,16 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2013
1992
|
"Float64"
|
|
2014
1993
|
)
|
|
2015
1994
|
meaning_types[SYSTEM_RECORD_ID] = FileColumnMeaningType.SYSTEM_RECORD_ID
|
|
2016
|
-
meaning_types[ENTITY_SYSTEM_RECORD_ID] = FileColumnMeaningType.ENTITY_SYSTEM_RECORD_ID
|
|
2017
|
-
if SEARCH_KEY_UNNEST in df.columns:
|
|
2018
|
-
meaning_types[SEARCH_KEY_UNNEST] = FileColumnMeaningType.UNNEST_KEY
|
|
2019
1995
|
|
|
2020
1996
|
df = df.reset_index(drop=True)
|
|
2021
|
-
system_columns_with_original_index = [SYSTEM_RECORD_ID
|
|
1997
|
+
system_columns_with_original_index = [SYSTEM_RECORD_ID] + generated_features
|
|
2022
1998
|
if add_fit_system_record_id:
|
|
2023
1999
|
system_columns_with_original_index.append(SORT_ID)
|
|
2024
2000
|
df_with_original_index = df[system_columns_with_original_index].copy()
|
|
2025
2001
|
|
|
2026
2002
|
combined_search_keys = combine_search_keys(search_keys.keys())
|
|
2027
2003
|
|
|
2028
|
-
df_without_features = df.drop(columns=
|
|
2004
|
+
df_without_features = df.drop(columns=non_keys_columns)
|
|
2029
2005
|
|
|
2030
2006
|
df_without_features = clean_full_duplicates(
|
|
2031
2007
|
df_without_features, self.logger, silent=silent_mode, bundle=self.bundle
|
|
@@ -2037,13 +2013,12 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2037
2013
|
dataset = Dataset(
|
|
2038
2014
|
"sample_" + str(uuid.uuid4()),
|
|
2039
2015
|
df=df_without_features,
|
|
2040
|
-
meaning_types=meaning_types,
|
|
2041
|
-
search_keys=combined_search_keys,
|
|
2042
|
-
unnest_search_keys=unnest_search_keys,
|
|
2043
2016
|
date_format=self.date_format,
|
|
2044
2017
|
rest_client=self.rest_client,
|
|
2045
2018
|
logger=self.logger,
|
|
2046
2019
|
)
|
|
2020
|
+
dataset.meaning_types = meaning_types
|
|
2021
|
+
dataset.search_keys = combined_search_keys
|
|
2047
2022
|
if email_converted_to_hem:
|
|
2048
2023
|
dataset.ignore_columns = [email_column]
|
|
2049
2024
|
|
|
@@ -2182,14 +2157,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2182
2157
|
|
|
2183
2158
|
key_types = search_keys.values()
|
|
2184
2159
|
|
|
2185
|
-
# Multiple search keys allowed only for PHONE, IP, POSTAL_CODE, EMAIL, HEM
|
|
2186
|
-
multi_keys = [key for key, count in Counter(key_types).items() if count > 1]
|
|
2187
|
-
for multi_key in multi_keys:
|
|
2188
|
-
if multi_key not in [SearchKey.PHONE, SearchKey.IP, SearchKey.POSTAL_CODE, SearchKey.EMAIL, SearchKey.HEM]:
|
|
2189
|
-
msg = self.bundle.get("unsupported_multi_key").format(multi_key)
|
|
2190
|
-
self.logger.warning(msg)
|
|
2191
|
-
raise ValidationError(msg)
|
|
2192
|
-
|
|
2193
2160
|
if SearchKey.DATE in key_types and SearchKey.DATETIME in key_types:
|
|
2194
2161
|
msg = self.bundle.get("date_and_datetime_simultanious")
|
|
2195
2162
|
self.logger.warning(msg)
|
|
@@ -2205,11 +2172,11 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2205
2172
|
self.logger.warning(msg)
|
|
2206
2173
|
raise ValidationError(msg)
|
|
2207
2174
|
|
|
2208
|
-
|
|
2209
|
-
|
|
2210
|
-
|
|
2211
|
-
|
|
2212
|
-
|
|
2175
|
+
for key_type in SearchKey.__members__.values():
|
|
2176
|
+
if key_type != SearchKey.CUSTOM_KEY and list(key_types).count(key_type) > 1:
|
|
2177
|
+
msg = self.bundle.get("multiple_search_key").format(key_type)
|
|
2178
|
+
self.logger.warning(msg)
|
|
2179
|
+
raise ValidationError(msg)
|
|
2213
2180
|
|
|
2214
2181
|
# non_personal_keys = set(SearchKey.__members__.values()) - set(SearchKey.personal_keys())
|
|
2215
2182
|
# if (
|
|
@@ -2347,7 +2314,14 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2347
2314
|
self.logger.info("Input dataset hasn't date column")
|
|
2348
2315
|
if self.add_date_if_missing:
|
|
2349
2316
|
df = self._add_current_date_as_key(df, self.fit_search_keys, self.logger, self.bundle)
|
|
2350
|
-
|
|
2317
|
+
email_column = self._get_email_column(self.fit_search_keys)
|
|
2318
|
+
hem_column = self._get_hem_column(self.fit_search_keys)
|
|
2319
|
+
email_converted_to_hem = False
|
|
2320
|
+
if email_column:
|
|
2321
|
+
converter = EmailSearchKeyConverter(email_column, hem_column, self.fit_search_keys, self.logger)
|
|
2322
|
+
df = converter.convert(df)
|
|
2323
|
+
self.fit_generated_features.extend(converter.generated_features)
|
|
2324
|
+
email_converted_to_hem = converter.email_converted_to_hem
|
|
2351
2325
|
if (
|
|
2352
2326
|
self.detect_missing_search_keys
|
|
2353
2327
|
and list(self.fit_search_keys.values()) == [SearchKey.DATE]
|
|
@@ -2356,37 +2330,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2356
2330
|
converter = IpToCountrySearchKeyConverter(self.fit_search_keys, self.logger)
|
|
2357
2331
|
df = converter.convert(df)
|
|
2358
2332
|
|
|
2359
|
-
# Explode multiple search keys
|
|
2360
2333
|
non_feature_columns = [self.TARGET_NAME, EVAL_SET_INDEX] + list(self.fit_search_keys.keys())
|
|
2361
|
-
meaning_types = {
|
|
2362
|
-
**{col: key.value for col, key in self.fit_search_keys.items()},
|
|
2363
|
-
**{str(c): FileColumnMeaningType.FEATURE for c in df.columns if c not in non_feature_columns},
|
|
2364
|
-
}
|
|
2365
|
-
meaning_types[self.TARGET_NAME] = FileColumnMeaningType.TARGET
|
|
2366
|
-
if eval_set is not None and len(eval_set) > 0:
|
|
2367
|
-
meaning_types[EVAL_SET_INDEX] = FileColumnMeaningType.EVAL_SET_INDEX
|
|
2368
|
-
df = self.__add_fit_system_record_id(df, meaning_types, self.fit_search_keys, ENTITY_SYSTEM_RECORD_ID)
|
|
2369
|
-
|
|
2370
|
-
# TODO check that this is correct for enrichment
|
|
2371
|
-
self.df_with_original_index = df.copy()
|
|
2372
|
-
|
|
2373
|
-
df, unnest_search_keys = self._explode_multiple_search_keys(df, self.fit_search_keys)
|
|
2374
|
-
|
|
2375
|
-
# Convert EMAIL to HEM after unnesting to do it only with one column
|
|
2376
|
-
email_column = self._get_email_column(self.fit_search_keys)
|
|
2377
|
-
hem_column = self._get_hem_column(self.fit_search_keys)
|
|
2378
|
-
email_converted_to_hem = False
|
|
2379
|
-
if email_column:
|
|
2380
|
-
converter = EmailSearchKeyConverter(
|
|
2381
|
-
email_column, hem_column, self.fit_search_keys, list(unnest_search_keys.keys()), self.logger
|
|
2382
|
-
)
|
|
2383
|
-
df = converter.convert(df)
|
|
2384
|
-
self.fit_generated_features.extend(converter.generated_features)
|
|
2385
|
-
email_converted_to_hem = converter.email_converted_to_hem
|
|
2386
|
-
|
|
2387
|
-
non_feature_columns = [self.TARGET_NAME, EVAL_SET_INDEX, ENTITY_SYSTEM_RECORD_ID, SEARCH_KEY_UNNEST] + list(
|
|
2388
|
-
self.fit_search_keys.keys()
|
|
2389
|
-
)
|
|
2390
2334
|
if email_converted_to_hem:
|
|
2391
2335
|
non_feature_columns.append(email_column)
|
|
2392
2336
|
if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
|
|
@@ -2410,14 +2354,12 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2410
2354
|
**{str(c): FileColumnMeaningType.FEATURE for c in df.columns if c not in non_feature_columns},
|
|
2411
2355
|
}
|
|
2412
2356
|
meaning_types[self.TARGET_NAME] = FileColumnMeaningType.TARGET
|
|
2413
|
-
meaning_types[ENTITY_SYSTEM_RECORD_ID] = FileColumnMeaningType.ENTITY_SYSTEM_RECORD_ID
|
|
2414
|
-
if SEARCH_KEY_UNNEST in df.columns:
|
|
2415
|
-
meaning_types[SEARCH_KEY_UNNEST] = FileColumnMeaningType.UNNEST_KEY
|
|
2416
2357
|
if eval_set is not None and len(eval_set) > 0:
|
|
2417
2358
|
meaning_types[EVAL_SET_INDEX] = FileColumnMeaningType.EVAL_SET_INDEX
|
|
2418
2359
|
|
|
2419
|
-
df = self.__add_fit_system_record_id(df, meaning_types, self.fit_search_keys
|
|
2360
|
+
df = self.__add_fit_system_record_id(df, meaning_types, self.fit_search_keys)
|
|
2420
2361
|
|
|
2362
|
+
self.df_with_original_index = df.copy()
|
|
2421
2363
|
df = df.reset_index(drop=True).sort_values(by=SYSTEM_RECORD_ID).reset_index(drop=True)
|
|
2422
2364
|
|
|
2423
2365
|
combined_search_keys = combine_search_keys(self.fit_search_keys.keys())
|
|
@@ -2425,15 +2367,14 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2425
2367
|
dataset = Dataset(
|
|
2426
2368
|
"tds_" + str(uuid.uuid4()),
|
|
2427
2369
|
df=df,
|
|
2428
|
-
meaning_types=meaning_types,
|
|
2429
|
-
search_keys=combined_search_keys,
|
|
2430
|
-
unnest_search_keys=unnest_search_keys,
|
|
2431
2370
|
model_task_type=model_task_type,
|
|
2432
2371
|
date_format=self.date_format,
|
|
2433
2372
|
random_state=self.random_state,
|
|
2434
2373
|
rest_client=self.rest_client,
|
|
2435
2374
|
logger=self.logger,
|
|
2436
2375
|
)
|
|
2376
|
+
dataset.meaning_types = meaning_types
|
|
2377
|
+
dataset.search_keys = combined_search_keys
|
|
2437
2378
|
if email_converted_to_hem:
|
|
2438
2379
|
dataset.ignore_columns = [email_column]
|
|
2439
2380
|
|
|
@@ -2803,10 +2744,9 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2803
2744
|
X: pd.DataFrame, y: pd.Series, cv: Optional[CVType]
|
|
2804
2745
|
) -> Tuple[pd.DataFrame, pd.Series]:
|
|
2805
2746
|
if cv not in [CVType.time_series, CVType.blocked_time_series]:
|
|
2806
|
-
record_id_column = ENTITY_SYSTEM_RECORD_ID if ENTITY_SYSTEM_RECORD_ID in X else SYSTEM_RECORD_ID
|
|
2807
2747
|
Xy = X.copy()
|
|
2808
2748
|
Xy[TARGET] = y
|
|
2809
|
-
Xy = Xy.sort_values(by=
|
|
2749
|
+
Xy = Xy.sort_values(by=SYSTEM_RECORD_ID).reset_index(drop=True)
|
|
2810
2750
|
X = Xy.drop(columns=TARGET)
|
|
2811
2751
|
y = Xy[TARGET].copy()
|
|
2812
2752
|
|
|
@@ -2985,19 +2925,15 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2985
2925
|
|
|
2986
2926
|
@staticmethod
|
|
2987
2927
|
def _get_email_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
|
|
2988
|
-
|
|
2989
|
-
|
|
2990
|
-
|
|
2991
|
-
if len(cols) == 1:
|
|
2992
|
-
return cols[0]
|
|
2928
|
+
for col, t in search_keys.items():
|
|
2929
|
+
if t == SearchKey.EMAIL:
|
|
2930
|
+
return col
|
|
2993
2931
|
|
|
2994
2932
|
@staticmethod
|
|
2995
2933
|
def _get_hem_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
|
|
2996
|
-
|
|
2997
|
-
|
|
2998
|
-
|
|
2999
|
-
if len(cols) == 1:
|
|
3000
|
-
return cols[0]
|
|
2934
|
+
for col, t in search_keys.items():
|
|
2935
|
+
if t == SearchKey.HEM:
|
|
2936
|
+
return col
|
|
3001
2937
|
|
|
3002
2938
|
@staticmethod
|
|
3003
2939
|
def _get_phone_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
|
|
@@ -3005,44 +2941,8 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3005
2941
|
if t == SearchKey.PHONE:
|
|
3006
2942
|
return col
|
|
3007
2943
|
|
|
3008
|
-
def _explode_multiple_search_keys(
|
|
3009
|
-
self, df: pd.DataFrame, search_keys: Dict[str, SearchKey]
|
|
3010
|
-
) -> Tuple[pd.DataFrame, Dict[str, List[str]]]:
|
|
3011
|
-
# find groups of multiple search keys
|
|
3012
|
-
search_key_names_by_type: Dict[SearchKey, str] = {}
|
|
3013
|
-
for key_name, key_type in search_keys.items():
|
|
3014
|
-
search_key_names_by_type[key_type] = search_key_names_by_type.get(key_type, []) + [key_name]
|
|
3015
|
-
search_key_names_by_type = {
|
|
3016
|
-
key_type: key_names for key_type, key_names in search_key_names_by_type.items() if len(key_names) > 1
|
|
3017
|
-
}
|
|
3018
|
-
if len(search_key_names_by_type) == 0:
|
|
3019
|
-
return df, {}
|
|
3020
|
-
|
|
3021
|
-
multiple_keys_columns = [col for cols in search_key_names_by_type.values() for col in cols]
|
|
3022
|
-
other_columns = [col for col in df.columns if col not in multiple_keys_columns]
|
|
3023
|
-
exploded_dfs = []
|
|
3024
|
-
unnest_search_keys = {}
|
|
3025
|
-
|
|
3026
|
-
for key_type, key_names in search_key_names_by_type.items():
|
|
3027
|
-
new_search_key = f"upgini_{key_type.name.lower()}_unnest"
|
|
3028
|
-
exploded_df = pd.melt(
|
|
3029
|
-
df, id_vars=other_columns, value_vars=key_names, var_name=SEARCH_KEY_UNNEST, value_name=new_search_key
|
|
3030
|
-
)
|
|
3031
|
-
exploded_dfs.append(exploded_df)
|
|
3032
|
-
for old_key in key_names:
|
|
3033
|
-
del search_keys[old_key]
|
|
3034
|
-
search_keys[new_search_key] = key_type
|
|
3035
|
-
unnest_search_keys[new_search_key] = key_names
|
|
3036
|
-
|
|
3037
|
-
df = pd.concat(exploded_dfs, ignore_index=True)
|
|
3038
|
-
return df, unnest_search_keys
|
|
3039
|
-
|
|
3040
2944
|
def __add_fit_system_record_id(
|
|
3041
|
-
self,
|
|
3042
|
-
df: pd.DataFrame,
|
|
3043
|
-
meaning_types: Dict[str, FileColumnMeaningType],
|
|
3044
|
-
search_keys: Dict[str, SearchKey],
|
|
3045
|
-
id_name: str,
|
|
2945
|
+
self, df: pd.DataFrame, meaning_types: Dict[str, FileColumnMeaningType], search_keys: Dict[str, SearchKey]
|
|
3046
2946
|
) -> pd.DataFrame:
|
|
3047
2947
|
# save original order or rows
|
|
3048
2948
|
original_index_name = df.index.name
|
|
@@ -3053,14 +2953,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3053
2953
|
|
|
3054
2954
|
# order by date and idempotent order by other keys
|
|
3055
2955
|
if self.cv not in [CVType.time_series, CVType.blocked_time_series]:
|
|
3056
|
-
sort_exclude_columns = [
|
|
3057
|
-
original_order_name,
|
|
3058
|
-
ORIGINAL_INDEX,
|
|
3059
|
-
EVAL_SET_INDEX,
|
|
3060
|
-
TARGET,
|
|
3061
|
-
"__target",
|
|
3062
|
-
ENTITY_SYSTEM_RECORD_ID,
|
|
3063
|
-
]
|
|
2956
|
+
sort_exclude_columns = [original_order_name, ORIGINAL_INDEX, EVAL_SET_INDEX, TARGET, "__target"]
|
|
3064
2957
|
if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
|
|
3065
2958
|
date_column = DateTimeSearchKeyConverter.DATETIME_COL
|
|
3066
2959
|
sort_exclude_columns.append(self._get_date_column(search_keys))
|
|
@@ -3098,18 +2991,14 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3098
2991
|
|
|
3099
2992
|
df = df.reset_index(drop=True).reset_index()
|
|
3100
2993
|
# system_record_id saves correct order for fit
|
|
3101
|
-
df = df.rename(columns={DEFAULT_INDEX:
|
|
2994
|
+
df = df.rename(columns={DEFAULT_INDEX: SYSTEM_RECORD_ID})
|
|
3102
2995
|
|
|
3103
2996
|
# return original order
|
|
3104
2997
|
df = df.set_index(ORIGINAL_INDEX)
|
|
3105
2998
|
df.index.name = original_index_name
|
|
3106
2999
|
df = df.sort_values(by=original_order_name).drop(columns=original_order_name)
|
|
3107
3000
|
|
|
3108
|
-
meaning_types[
|
|
3109
|
-
FileColumnMeaningType.SYSTEM_RECORD_ID
|
|
3110
|
-
if id_name == SYSTEM_RECORD_ID
|
|
3111
|
-
else FileColumnMeaningType.ENTITY_SYSTEM_RECORD_ID
|
|
3112
|
-
)
|
|
3001
|
+
meaning_types[SYSTEM_RECORD_ID] = FileColumnMeaningType.SYSTEM_RECORD_ID
|
|
3113
3002
|
return df
|
|
3114
3003
|
|
|
3115
3004
|
def __correct_target(self, df: pd.DataFrame) -> pd.DataFrame:
|
|
@@ -3164,11 +3053,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3164
3053
|
)
|
|
3165
3054
|
|
|
3166
3055
|
comparing_columns = X.columns if is_transform else df_with_original_index.columns
|
|
3167
|
-
dup_features = [
|
|
3168
|
-
c
|
|
3169
|
-
for c in comparing_columns
|
|
3170
|
-
if c in result_features.columns and c not in [SYSTEM_RECORD_ID, ENTITY_SYSTEM_RECORD_ID]
|
|
3171
|
-
]
|
|
3056
|
+
dup_features = [c for c in comparing_columns if c in result_features.columns and c != SYSTEM_RECORD_ID]
|
|
3172
3057
|
if len(dup_features) > 0:
|
|
3173
3058
|
self.logger.warning(f"X contain columns with same name as returned from backend: {dup_features}")
|
|
3174
3059
|
raise ValidationError(self.bundle.get("returned_features_same_as_passed").format(dup_features))
|
|
@@ -3179,7 +3064,8 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3179
3064
|
result_features = pd.merge(
|
|
3180
3065
|
df_with_original_index,
|
|
3181
3066
|
result_features,
|
|
3182
|
-
|
|
3067
|
+
left_on=SYSTEM_RECORD_ID,
|
|
3068
|
+
right_on=SYSTEM_RECORD_ID,
|
|
3183
3069
|
how="left" if is_transform else "inner",
|
|
3184
3070
|
)
|
|
3185
3071
|
result_features = result_features.set_index(original_index_name or DEFAULT_INDEX)
|
|
@@ -3190,7 +3076,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3190
3076
|
result_features = result_features[~result_features[SYSTEM_RECORD_ID].isin(rows_to_drop[SYSTEM_RECORD_ID])]
|
|
3191
3077
|
self.logger.info(f"After dropping target outliers size: {len(result_features)}")
|
|
3192
3078
|
|
|
3193
|
-
result_eval_sets =
|
|
3079
|
+
result_eval_sets = dict()
|
|
3194
3080
|
if not is_transform and EVAL_SET_INDEX in result_features.columns:
|
|
3195
3081
|
result_train_features = result_features.loc[result_features[EVAL_SET_INDEX] == 0].copy()
|
|
3196
3082
|
eval_set_indices = list(result_features[EVAL_SET_INDEX].unique())
|
|
@@ -3402,7 +3288,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3402
3288
|
if autofe_feature.op.is_vector:
|
|
3403
3289
|
continue
|
|
3404
3290
|
|
|
3405
|
-
description =
|
|
3291
|
+
description = dict()
|
|
3406
3292
|
|
|
3407
3293
|
feature_meta = get_feature_by_name(autofe_feature.get_display_name(shorten=True))
|
|
3408
3294
|
if feature_meta is None:
|
|
@@ -3568,13 +3454,13 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3568
3454
|
self.warning_counter.increment()
|
|
3569
3455
|
|
|
3570
3456
|
if len(valid_search_keys) == 1:
|
|
3571
|
-
|
|
3572
|
-
|
|
3573
|
-
|
|
3574
|
-
|
|
3575
|
-
|
|
3576
|
-
|
|
3577
|
-
|
|
3457
|
+
for k, v in valid_search_keys.items():
|
|
3458
|
+
# Show warning for country only if country is the only key
|
|
3459
|
+
if x[k].nunique() == 1 and (v != SearchKey.COUNTRY or len(valid_search_keys) == 1):
|
|
3460
|
+
msg = self.bundle.get("single_constant_search_key").format(v, x[k].values[0])
|
|
3461
|
+
print(msg)
|
|
3462
|
+
self.logger.warning(msg)
|
|
3463
|
+
self.warning_counter.increment()
|
|
3578
3464
|
|
|
3579
3465
|
self.logger.info(f"Prepared search keys: {valid_search_keys}")
|
|
3580
3466
|
|
|
@@ -3684,68 +3570,61 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3684
3570
|
def check_need_detect(search_key: SearchKey):
|
|
3685
3571
|
return not is_transform or search_key in self.fit_search_keys.values()
|
|
3686
3572
|
|
|
3687
|
-
|
|
3688
|
-
|
|
3689
|
-
|
|
3690
|
-
|
|
3691
|
-
|
|
3692
|
-
|
|
3693
|
-
self.autodetected_search_keys.update(new_keys)
|
|
3694
|
-
self.logger.info(f"Autodetected search key POSTAL_CODE in column {maybe_keys}")
|
|
3573
|
+
if SearchKey.POSTAL_CODE not in search_keys.values() and check_need_detect(SearchKey.POSTAL_CODE):
|
|
3574
|
+
maybe_key = PostalCodeSearchKeyDetector().get_search_key_column(sample)
|
|
3575
|
+
if maybe_key is not None:
|
|
3576
|
+
search_keys[maybe_key] = SearchKey.POSTAL_CODE
|
|
3577
|
+
self.autodetected_search_keys[maybe_key] = SearchKey.POSTAL_CODE
|
|
3578
|
+
self.logger.info(f"Autodetected search key POSTAL_CODE in column {maybe_key}")
|
|
3695
3579
|
if not silent_mode:
|
|
3696
|
-
print(self.bundle.get("postal_code_detected").format(
|
|
3580
|
+
print(self.bundle.get("postal_code_detected").format(maybe_key))
|
|
3697
3581
|
|
|
3698
3582
|
if (
|
|
3699
3583
|
SearchKey.COUNTRY not in search_keys.values()
|
|
3700
3584
|
and self.country_code is None
|
|
3701
3585
|
and check_need_detect(SearchKey.COUNTRY)
|
|
3702
3586
|
):
|
|
3703
|
-
maybe_key = CountrySearchKeyDetector().
|
|
3704
|
-
if maybe_key:
|
|
3705
|
-
search_keys[maybe_key
|
|
3706
|
-
self.autodetected_search_keys[maybe_key
|
|
3587
|
+
maybe_key = CountrySearchKeyDetector().get_search_key_column(sample)
|
|
3588
|
+
if maybe_key is not None:
|
|
3589
|
+
search_keys[maybe_key] = SearchKey.COUNTRY
|
|
3590
|
+
self.autodetected_search_keys[maybe_key] = SearchKey.COUNTRY
|
|
3707
3591
|
self.logger.info(f"Autodetected search key COUNTRY in column {maybe_key}")
|
|
3708
3592
|
if not silent_mode:
|
|
3709
3593
|
print(self.bundle.get("country_detected").format(maybe_key))
|
|
3710
3594
|
|
|
3711
3595
|
if (
|
|
3712
|
-
|
|
3713
|
-
SearchKey.HEM not in search_keys.values()
|
|
3596
|
+
SearchKey.EMAIL not in search_keys.values()
|
|
3597
|
+
and SearchKey.HEM not in search_keys.values()
|
|
3714
3598
|
and check_need_detect(SearchKey.HEM)
|
|
3715
3599
|
):
|
|
3716
|
-
|
|
3717
|
-
if
|
|
3600
|
+
maybe_key = EmailSearchKeyDetector().get_search_key_column(sample)
|
|
3601
|
+
if maybe_key is not None and maybe_key not in search_keys.keys():
|
|
3718
3602
|
if self.__is_registered or is_demo_dataset:
|
|
3719
|
-
|
|
3720
|
-
|
|
3721
|
-
self.
|
|
3722
|
-
self.logger.info(f"Autodetected search key EMAIL in column {maybe_keys}")
|
|
3603
|
+
search_keys[maybe_key] = SearchKey.EMAIL
|
|
3604
|
+
self.autodetected_search_keys[maybe_key] = SearchKey.EMAIL
|
|
3605
|
+
self.logger.info(f"Autodetected search key EMAIL in column {maybe_key}")
|
|
3723
3606
|
if not silent_mode:
|
|
3724
|
-
print(self.bundle.get("email_detected").format(
|
|
3607
|
+
print(self.bundle.get("email_detected").format(maybe_key))
|
|
3725
3608
|
else:
|
|
3726
3609
|
self.logger.warning(
|
|
3727
|
-
f"Autodetected search key EMAIL in column {
|
|
3728
|
-
" But not used because not registered user"
|
|
3610
|
+
f"Autodetected search key EMAIL in column {maybe_key}. But not used because not registered user"
|
|
3729
3611
|
)
|
|
3730
3612
|
if not silent_mode:
|
|
3731
|
-
print(self.bundle.get("email_detected_not_registered").format(
|
|
3613
|
+
print(self.bundle.get("email_detected_not_registered").format(maybe_key))
|
|
3732
3614
|
self.warning_counter.increment()
|
|
3733
3615
|
|
|
3734
|
-
|
|
3735
|
-
|
|
3736
|
-
|
|
3737
|
-
if maybe_keys:
|
|
3616
|
+
if SearchKey.PHONE not in search_keys.values() and check_need_detect(SearchKey.PHONE):
|
|
3617
|
+
maybe_key = PhoneSearchKeyDetector().get_search_key_column(sample)
|
|
3618
|
+
if maybe_key is not None and maybe_key not in search_keys.keys():
|
|
3738
3619
|
if self.__is_registered or is_demo_dataset:
|
|
3739
|
-
|
|
3740
|
-
|
|
3741
|
-
self.
|
|
3742
|
-
self.logger.info(f"Autodetected search key PHONE in column {maybe_keys}")
|
|
3620
|
+
search_keys[maybe_key] = SearchKey.PHONE
|
|
3621
|
+
self.autodetected_search_keys[maybe_key] = SearchKey.PHONE
|
|
3622
|
+
self.logger.info(f"Autodetected search key PHONE in column {maybe_key}")
|
|
3743
3623
|
if not silent_mode:
|
|
3744
|
-
print(self.bundle.get("phone_detected").format(
|
|
3624
|
+
print(self.bundle.get("phone_detected").format(maybe_key))
|
|
3745
3625
|
else:
|
|
3746
3626
|
self.logger.warning(
|
|
3747
|
-
f"Autodetected search key PHONE in column {
|
|
3748
|
-
"But not used because not registered user"
|
|
3627
|
+
f"Autodetected search key PHONE in column {maybe_key}. But not used because not registered user"
|
|
3749
3628
|
)
|
|
3750
3629
|
if not silent_mode:
|
|
3751
3630
|
print(self.bundle.get("phone_detected_not_registered"))
|
upgini/metadata.py
CHANGED
|
@@ -6,8 +6,6 @@ from typing import Dict, List, Optional, Set
|
|
|
6
6
|
from pydantic import BaseModel
|
|
7
7
|
|
|
8
8
|
SYSTEM_RECORD_ID = "system_record_id"
|
|
9
|
-
ENTITY_SYSTEM_RECORD_ID = "entity_system_record_id"
|
|
10
|
-
SEARCH_KEY_UNNEST = "search_key_unnest"
|
|
11
9
|
SORT_ID = "sort_id"
|
|
12
10
|
EVAL_SET_INDEX = "eval_set_index"
|
|
13
11
|
TARGET = "target"
|
|
@@ -15,7 +13,7 @@ COUNTRY = "country_iso_code"
|
|
|
15
13
|
RENAMED_INDEX = "index_col"
|
|
16
14
|
DEFAULT_INDEX = "index"
|
|
17
15
|
ORIGINAL_INDEX = "original_index"
|
|
18
|
-
SYSTEM_COLUMNS = {SYSTEM_RECORD_ID,
|
|
16
|
+
SYSTEM_COLUMNS = {SYSTEM_RECORD_ID, EVAL_SET_INDEX, TARGET, COUNTRY, SORT_ID}
|
|
19
17
|
|
|
20
18
|
|
|
21
19
|
class FileColumnMeaningType(Enum):
|
|
@@ -41,8 +39,6 @@ class FileColumnMeaningType(Enum):
|
|
|
41
39
|
POSTAL_CODE = "POSTAL_CODE"
|
|
42
40
|
SYSTEM_RECORD_ID = "SYSTEM_RECORD_ID"
|
|
43
41
|
EVAL_SET_INDEX = "EVAL_SET_INDEX"
|
|
44
|
-
ENTITY_SYSTEM_RECORD_ID = "ENTITY_SYSTEM_RECORD_ID"
|
|
45
|
-
UNNEST_KEY = "UNNEST_KEY"
|
|
46
42
|
|
|
47
43
|
|
|
48
44
|
class SearchKey(Enum):
|
|
@@ -188,10 +184,6 @@ class FileColumnMetadata(BaseModel):
|
|
|
188
184
|
meaningType: FileColumnMeaningType
|
|
189
185
|
minMaxValues: Optional[NumericInterval] = None
|
|
190
186
|
originalName: Optional[str]
|
|
191
|
-
# is this column contains keys from multiple key columns like msisdn1, msisdn2
|
|
192
|
-
isUnnest: bool = False
|
|
193
|
-
# list of original etalon key column names like msisdn1, msisdn2
|
|
194
|
-
unnestKeyNames: Optional[list[str]]
|
|
195
187
|
|
|
196
188
|
|
|
197
189
|
class FileMetadata(BaseModel):
|
|
@@ -289,7 +281,7 @@ class FeaturesFilter(BaseModel):
|
|
|
289
281
|
|
|
290
282
|
|
|
291
283
|
class RuntimeParameters(BaseModel):
|
|
292
|
-
properties: Dict[str, str] =
|
|
284
|
+
properties: Dict[str, str] = dict()
|
|
293
285
|
|
|
294
286
|
|
|
295
287
|
class SearchCustomization(BaseModel):
|
upgini/metrics.py
CHANGED
|
@@ -369,7 +369,7 @@ class EstimatorWrapper:
|
|
|
369
369
|
"logger": logger,
|
|
370
370
|
}
|
|
371
371
|
if estimator is None:
|
|
372
|
-
params =
|
|
372
|
+
params = dict()
|
|
373
373
|
params["has_time"] = has_date
|
|
374
374
|
# if metric_name.upper() in SUPPORTED_CATBOOST_METRICS:
|
|
375
375
|
# params["eval_metric"] = SUPPORTED_CATBOOST_METRICS[metric_name.upper()]
|
|
@@ -88,7 +88,6 @@ unsupported_search_key_type=Unsupported type of key in search_keys: {}
|
|
|
88
88
|
search_key_country_and_country_code=\nWARNING: SearchKey.COUNTRY and country_code parameter were passed simultaniously. Parameter country_code will be ignored
|
|
89
89
|
empty_search_key=Search key {} is empty. Please fill values or remove this search key
|
|
90
90
|
single_constant_search_key=\nWARNING: Constant value detected for the {} search key in the X dataframe: {}.\nThat search key will add constant features for different y values.\nPlease add extra search keys with non constant values, like the COUNTRY, POSTAL_CODE, DATE, PHONE NUMBER, EMAIL/HEM or IPv4
|
|
91
|
-
unsupported_multi_key=Search key {} cannot be used multiple times
|
|
92
91
|
unsupported_index_column=\nWARNING: Your column with name `index` was dropped because it's reserved name is booked for system needs.
|
|
93
92
|
date_string_without_format=Date column `{}` has string type, but date_format is not specified. Convert column to datetime type or pass date_format
|
|
94
93
|
invalid_date_format=Failed to parse date in column `{}`. Try to pass explicit date format in date_format argument of FeaturesEnricher constructor
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
from typing import List
|
|
1
|
+
from typing import List, Optional
|
|
2
2
|
|
|
3
3
|
import pandas as pd
|
|
4
4
|
|
|
@@ -10,18 +10,16 @@ class BaseSearchKeyDetector:
|
|
|
10
10
|
def _is_search_key_by_values(self, column: pd.Series) -> bool:
|
|
11
11
|
raise NotImplementedError
|
|
12
12
|
|
|
13
|
-
def
|
|
14
|
-
|
|
15
|
-
column_name
|
|
16
|
-
|
|
17
|
-
if self._is_search_key_by_name(column_name)
|
|
18
|
-
]
|
|
13
|
+
def _get_search_key_by_name(self, column_names: List[str]) -> Optional[str]:
|
|
14
|
+
for column_name in column_names:
|
|
15
|
+
if self._is_search_key_by_name(column_name):
|
|
16
|
+
return column_name
|
|
19
17
|
|
|
20
|
-
def
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
18
|
+
def get_search_key_column(self, df: pd.DataFrame) -> Optional[str]:
|
|
19
|
+
maybe_column = self._get_search_key_by_name(df.columns.to_list())
|
|
20
|
+
if maybe_column is not None:
|
|
21
|
+
return maybe_column
|
|
22
|
+
|
|
23
|
+
for column_name in df.columns:
|
|
25
24
|
if self._is_search_key_by_values(df[column_name]):
|
|
26
|
-
|
|
27
|
-
return list(set(columns_by_names + columns_by_values))
|
|
25
|
+
return column_name
|
|
@@ -3,15 +3,7 @@ from typing import Dict, List, Optional, Union
|
|
|
3
3
|
|
|
4
4
|
import pandas as pd
|
|
5
5
|
|
|
6
|
-
from upgini.metadata import
|
|
7
|
-
ENTITY_SYSTEM_RECORD_ID,
|
|
8
|
-
EVAL_SET_INDEX,
|
|
9
|
-
SORT_ID,
|
|
10
|
-
SYSTEM_RECORD_ID,
|
|
11
|
-
TARGET,
|
|
12
|
-
ModelTaskType,
|
|
13
|
-
SearchKey,
|
|
14
|
-
)
|
|
6
|
+
from upgini.metadata import EVAL_SET_INDEX, SORT_ID, SYSTEM_RECORD_ID, TARGET, ModelTaskType, SearchKey
|
|
15
7
|
from upgini.resource_bundle import ResourceBundle
|
|
16
8
|
from upgini.utils.datetime_utils import DateTimeSearchKeyConverter
|
|
17
9
|
from upgini.utils.target_utils import define_task
|
|
@@ -151,8 +143,6 @@ def clean_full_duplicates(
|
|
|
151
143
|
unique_columns = df.columns.tolist()
|
|
152
144
|
if SYSTEM_RECORD_ID in unique_columns:
|
|
153
145
|
unique_columns.remove(SYSTEM_RECORD_ID)
|
|
154
|
-
if ENTITY_SYSTEM_RECORD_ID in unique_columns:
|
|
155
|
-
unique_columns.remove(ENTITY_SYSTEM_RECORD_ID)
|
|
156
146
|
if SORT_ID in unique_columns:
|
|
157
147
|
unique_columns.remove(SORT_ID)
|
|
158
148
|
if EVAL_SET_INDEX in unique_columns:
|
upgini/utils/email_utils.py
CHANGED
|
@@ -38,13 +38,11 @@ class EmailSearchKeyConverter:
|
|
|
38
38
|
email_column: str,
|
|
39
39
|
hem_column: Optional[str],
|
|
40
40
|
search_keys: Dict[str, SearchKey],
|
|
41
|
-
unnest_search_keys: Optional[List[str]] = None,
|
|
42
41
|
logger: Optional[logging.Logger] = None,
|
|
43
42
|
):
|
|
44
43
|
self.email_column = email_column
|
|
45
44
|
self.hem_column = hem_column
|
|
46
45
|
self.search_keys = search_keys
|
|
47
|
-
self.unnest_search_keys = unnest_search_keys
|
|
48
46
|
if logger is not None:
|
|
49
47
|
self.logger = logger
|
|
50
48
|
else:
|
|
@@ -82,12 +80,9 @@ class EmailSearchKeyConverter:
|
|
|
82
80
|
del self.search_keys[self.email_column]
|
|
83
81
|
return df
|
|
84
82
|
self.search_keys[self.HEM_COLUMN_NAME] = SearchKey.HEM
|
|
85
|
-
self.unnest_search_keys.append(self.HEM_COLUMN_NAME)
|
|
86
83
|
self.email_converted_to_hem = True
|
|
87
84
|
|
|
88
85
|
del self.search_keys[self.email_column]
|
|
89
|
-
if self.email_column in self.unnest_search_keys:
|
|
90
|
-
self.unnest_search_keys.remove(self.email_column)
|
|
91
86
|
|
|
92
87
|
df[self.EMAIL_ONE_DOMAIN_COLUMN_NAME] = df[self.email_column].apply(self._email_to_one_domain)
|
|
93
88
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: upgini
|
|
3
|
-
Version: 1.1.
|
|
3
|
+
Version: 1.1.299a3511.dev6
|
|
4
4
|
Summary: Intelligent data search & enrichment for Machine Learning
|
|
5
5
|
Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
|
|
6
6
|
Project-URL: Homepage, https://upgini.com/
|
|
@@ -26,6 +26,8 @@ Requires-Python: <3.11,>=3.8
|
|
|
26
26
|
Requires-Dist: catboost>=1.0.3
|
|
27
27
|
Requires-Dist: fastparquet>=0.8.1
|
|
28
28
|
Requires-Dist: ipywidgets>=8.1.0
|
|
29
|
+
Requires-Dist: jarowinkler>=2.0.0
|
|
30
|
+
Requires-Dist: levenshtein>=0.25.1
|
|
29
31
|
Requires-Dist: lightgbm>=3.3.2
|
|
30
32
|
Requires-Dist: numpy>=1.19.0
|
|
31
33
|
Requires-Dist: pandas<3.0.0,>=1.1.0
|
|
@@ -131,7 +133,7 @@ Description-Content-Type: text/markdown
|
|
|
131
133
|
|Consumer Confidence index| 44 |22|-|Monthly|date, country|No
|
|
132
134
|
|World economic indicators|191 |41|-|Monthly|date, country|No
|
|
133
135
|
|Markets data|-|17|-|Monthly|date, datetime|No
|
|
134
|
-
|World mobile & fixed broadband network coverage and
|
|
136
|
+
|World mobile & fixed broadband network coverage and perfomance |167|-|3|Monthly|country, postal/ZIP code|No
|
|
135
137
|
|World demographic data |90|-|2|Annual|country, postal/ZIP code|No
|
|
136
138
|
|World house prices |44|-|3|Annual|country, postal/ZIP code|No
|
|
137
139
|
|Public social media profile data |104|-|-|Monthly|date, email/HEM, phone |Yes
|
|
@@ -840,4 +842,4 @@ Some convenient ways to start contributing are:
|
|
|
840
842
|
- [More perks for registered users](https://profile.upgini.com)
|
|
841
843
|
|
|
842
844
|
<sup>😔 Found mistype or a bug in code snippet? Our bad! <a href="https://github.com/upgini/upgini/issues/new?assignees=&title=readme%2Fbug">
|
|
843
|
-
Please report it here</a></sup>
|
|
845
|
+
Please report it here</a></sup>
|
|
@@ -1,26 +1,26 @@
|
|
|
1
|
-
upgini/__about__.py,sha256=
|
|
1
|
+
upgini/__about__.py,sha256=_0z3wkU1Qyf7uc0tWztaZ9d93IS373XBtHXVE9Apmzw,34
|
|
2
2
|
upgini/__init__.py,sha256=ObEtjFkIssl83qeKNMLpIQygfwK8TzztwiI43YTsAP0,353
|
|
3
3
|
upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
|
|
4
|
-
upgini/dataset.py,sha256=
|
|
4
|
+
upgini/dataset.py,sha256=7TLVVhGtjgx_9yaiaIUK3kZSe_R9wg5dY0d4F5qCGM4,45636
|
|
5
5
|
upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
|
|
6
|
-
upgini/features_enricher.py,sha256=
|
|
6
|
+
upgini/features_enricher.py,sha256=HQFLw3VyEsZfAt4xFnIYOnp3fzQSHAsyHzIm0gTJpOI,177543
|
|
7
7
|
upgini/http.py,sha256=bp6jWl422Icy3AhHMdCcJv5NjExE45gSMmzMTPJjPuk,42600
|
|
8
8
|
upgini/lazy_import.py,sha256=EwoM0msNGbSmWBhGbrLDny1DSnOlvTxCjmMKPxYlDms,610
|
|
9
|
-
upgini/metadata.py,sha256=
|
|
10
|
-
upgini/metrics.py,sha256=
|
|
9
|
+
upgini/metadata.py,sha256=qDAIO7NLSSQp_XiXCv3U4XJTLO0KH3YuQ8lvCLYPqzs,9781
|
|
10
|
+
upgini/metrics.py,sha256=DiDgdFvYu64ArlPEgjppZShK6yybWtIEbdPAhI3yO1I,30930
|
|
11
11
|
upgini/search_task.py,sha256=LtRJ9bCPjMo1gJ-sUDKERhDwGcWKImrzwVFHjkMSQHQ,17071
|
|
12
12
|
upgini/spinner.py,sha256=4iMd-eIe_BnkqFEMIliULTbj6rNI2HkN_VJ4qYe0cUc,1118
|
|
13
13
|
upgini/version_validator.py,sha256=ddSKUK_-eGJB3NgrqOMoWJU-OxQ253WsNLp8aqJkaIM,1389
|
|
14
14
|
upgini/ads_management/__init__.py,sha256=qzyisOToVRP-tquAJD1PblZhNtMrOB8FiyF9JvfkvgE,50
|
|
15
15
|
upgini/ads_management/ads_manager.py,sha256=igVbN2jz80Umb2BUJixmJVj-zx8unoKpecVo-R-nGdw,2648
|
|
16
16
|
upgini/autofe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
17
|
-
upgini/autofe/all_operands.py,sha256=
|
|
18
|
-
upgini/autofe/binary.py,sha256=
|
|
19
|
-
upgini/autofe/date.py,sha256=
|
|
20
|
-
upgini/autofe/feature.py,sha256=
|
|
17
|
+
upgini/autofe/all_operands.py,sha256=3LiH9iU-ArGmYpS8FHWH7yCFx40ILfvlSXJlKIa75BQ,2542
|
|
18
|
+
upgini/autofe/binary.py,sha256=ml0MszLARZqp3UGUqTGsVjT4DD69zTisfBBEqbZ7klU,6767
|
|
19
|
+
upgini/autofe/date.py,sha256=Qq11EGLFHJxy5DQF2V1CBMtH2j4g5RpinRcw-7SobMs,8442
|
|
20
|
+
upgini/autofe/feature.py,sha256=cPbLJYAfzT8VqMDOGuEOBslJEDTdVphozQf6fCD8uuk,13587
|
|
21
21
|
upgini/autofe/groupby.py,sha256=4WjDzQxqpZxB79Ih4ihMMI5GDxaFqiH6ZelfV82ClT4,3091
|
|
22
22
|
upgini/autofe/operand.py,sha256=MKEsl3zxpWzRDpTkE0sNJxTu62U20sWOvEKhPjUWS6s,2915
|
|
23
|
-
upgini/autofe/unary.py,sha256=
|
|
23
|
+
upgini/autofe/unary.py,sha256=B4wp8oKnlJ0nUng-DRMKSiF8MHlhAFYbgmo9Nd_0ZaA,3777
|
|
24
24
|
upgini/autofe/vector.py,sha256=dLxfAstJs-gw_OQ1xxoxcM6pVzORlV0HVzdzt7cLXVQ,606
|
|
25
25
|
upgini/data_source/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
26
26
|
upgini/data_source/data_source_publisher.py,sha256=1cQZrK630VztwGGDp41ec9gqIeUtkefaqSSQEitVWiM,19581
|
|
@@ -30,22 +30,22 @@ upgini/normalizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU
|
|
|
30
30
|
upgini/normalizer/phone_normalizer.py,sha256=EzTaahk6myRv6ZXgbyVFGY4kpo_2VlQgOrm5_lfbmNI,9996
|
|
31
31
|
upgini/resource_bundle/__init__.py,sha256=S5F2G47pnJd2LDpmFsjDqEwiKkP8Hm-hcseDbMka6Ko,8345
|
|
32
32
|
upgini/resource_bundle/exceptions.py,sha256=5fRvx0_vWdE1-7HcSgF0tckB4A9AKyf5RiinZkInTsI,621
|
|
33
|
-
upgini/resource_bundle/strings.properties,sha256=
|
|
33
|
+
upgini/resource_bundle/strings.properties,sha256=1oHurL4I83P2lXIavx9vSdKM8ZqncAPXH2IZf76bD6g,26292
|
|
34
34
|
upgini/resource_bundle/strings_widget.properties,sha256=gOdqvZWntP2LCza_tyVk1_yRYcG4c04K9sQOAVhF_gw,1577
|
|
35
35
|
upgini/sampler/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
36
36
|
upgini/sampler/base.py,sha256=7GpjYqjOp58vYcJLiX__1R5wjUlyQbxvHJ2klFnup_M,6389
|
|
37
37
|
upgini/sampler/random_under_sampler.py,sha256=TIbm7ATo-bCMF-IiS5sZeDC1ad1SYg0eY_rRmg84yIQ,4024
|
|
38
38
|
upgini/sampler/utils.py,sha256=PYOk3kKSnFlyxcpdtDNLBEEhTB4lO_iP7pQHqeUcmAc,20211
|
|
39
39
|
upgini/utils/__init__.py,sha256=O_KgzKiJjW3g4NoqZ7lAxUpoHcBi_gze6r3ndEjCH74,842
|
|
40
|
-
upgini/utils/base_search_key_detector.py,sha256=
|
|
40
|
+
upgini/utils/base_search_key_detector.py,sha256=UNs2uxEcD1N_mOtkx3k6U70DCajW-QEO2vZp41GF0mU,855
|
|
41
41
|
upgini/utils/blocked_time_series.py,sha256=Uqr3vp4YqNclj2-PzEYqVy763GSXHn86sbpIl1UOB4s,3382
|
|
42
42
|
upgini/utils/country_utils.py,sha256=yE8oRgMpXuJxPfQm4fioY6dg6700HgVnHSk4Cv9sUyM,6511
|
|
43
43
|
upgini/utils/custom_loss_utils.py,sha256=kieNZYBYZm5ZGBltF1F_jOSF4ea6C29rYuCyiDcqVNY,3857
|
|
44
44
|
upgini/utils/cv_utils.py,sha256=w6FQb9nO8BWDx88EF83NpjPLarK4eR4ia0Wg0kLBJC4,3525
|
|
45
45
|
upgini/utils/datetime_utils.py,sha256=Ujmu1ouwSFtG5SywQXJlmtDnGigAnIWPdE5Vx5NvgUM,10951
|
|
46
|
-
upgini/utils/deduplicate_utils.py,sha256=
|
|
46
|
+
upgini/utils/deduplicate_utils.py,sha256=6AbARehUCghJZ4PppFtrej2s3gFRruh41MEm6mzakHs,8607
|
|
47
47
|
upgini/utils/display_utils.py,sha256=A2ouB5eiZ-Kyt9ykYxkLQwyoRPrdYeJymwNTiajtFXs,10990
|
|
48
|
-
upgini/utils/email_utils.py,sha256=
|
|
48
|
+
upgini/utils/email_utils.py,sha256=PLufTO97Pg9PPsNqB9agcM6M98MIxKUgIgNn2mVwSQ0,3520
|
|
49
49
|
upgini/utils/fallback_progress_bar.py,sha256=PDaKb8dYpVZaWMroNcOHsTc3pSjgi9mOm0--cOFTwJ0,1074
|
|
50
50
|
upgini/utils/features_validator.py,sha256=PgKNt5dyqfErTvjtRNNUS9g7GFqHBtAtnsfA-V5UO1A,3307
|
|
51
51
|
upgini/utils/format.py,sha256=Yv5cvvSs2bOLUzzNu96Pu33VMDNbabio92QepUj41jU,243
|
|
@@ -57,7 +57,7 @@ upgini/utils/sklearn_ext.py,sha256=13jQS_k7v0aUtudXV6nGUEWjttPQzAW9AFYL5wgEz9k,4
|
|
|
57
57
|
upgini/utils/target_utils.py,sha256=Y96_PJ5cC-WsEbeqg20v9uqywDQobLoTb-xoP7S3o4E,7807
|
|
58
58
|
upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
|
|
59
59
|
upgini/utils/warning_counter.py,sha256=dIWBB4dI5XRRJZudvIlqlIYKEiwLLPcXarsZuYRt338,227
|
|
60
|
-
upgini-1.1.
|
|
61
|
-
upgini-1.1.
|
|
62
|
-
upgini-1.1.
|
|
63
|
-
upgini-1.1.
|
|
60
|
+
upgini-1.1.299a3511.dev6.dist-info/METADATA,sha256=KzZj0GPmhe4dHrujcrKXrqe3xtQCN7OMGYPUjLKJGpA,48230
|
|
61
|
+
upgini-1.1.299a3511.dev6.dist-info/WHEEL,sha256=zEMcRr9Kr03x1ozGwg5v9NQBKn3kndp6LSoSlVg-jhU,87
|
|
62
|
+
upgini-1.1.299a3511.dev6.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
|
|
63
|
+
upgini-1.1.299a3511.dev6.dist-info/RECORD,,
|
|
File without changes
|