upgini 1.1.299a3511.dev10__py3-none-any.whl → 1.1.300__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- upgini/__about__.py +1 -1
- upgini/autofe/all_operands.py +7 -26
- upgini/autofe/binary.py +2 -93
- upgini/autofe/date.py +4 -17
- upgini/autofe/feature.py +8 -10
- upgini/autofe/unary.py +0 -7
- upgini/dataset.py +11 -2
- upgini/features_enricher.py +223 -103
- upgini/metadata.py +10 -2
- upgini/metrics.py +1 -1
- upgini/resource_bundle/strings.properties +1 -0
- upgini/utils/base_search_key_detector.py +14 -12
- upgini/utils/deduplicate_utils.py +11 -1
- upgini/utils/email_utils.py +5 -0
- {upgini-1.1.299a3511.dev10.dist-info → upgini-1.1.300.dist-info}/METADATA +3 -5
- {upgini-1.1.299a3511.dev10.dist-info → upgini-1.1.300.dist-info}/RECORD +18 -18
- {upgini-1.1.299a3511.dev10.dist-info → upgini-1.1.300.dist-info}/WHEEL +1 -1
- {upgini-1.1.299a3511.dev10.dist-info → upgini-1.1.300.dist-info}/licenses/LICENSE +0 -0
upgini/__about__.py
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__ = "1.1.
|
|
1
|
+
__version__ = "1.1.300"
|
upgini/autofe/all_operands.py
CHANGED
|
@@ -1,20 +1,6 @@
|
|
|
1
1
|
from typing import Dict
|
|
2
2
|
|
|
3
|
-
from upgini.autofe.binary import
|
|
4
|
-
Add,
|
|
5
|
-
Combine,
|
|
6
|
-
CombineThenFreq,
|
|
7
|
-
Distance,
|
|
8
|
-
Divide,
|
|
9
|
-
JaroWinklerSim1,
|
|
10
|
-
JaroWinklerSim2,
|
|
11
|
-
LevenshteinSim,
|
|
12
|
-
Max,
|
|
13
|
-
Min,
|
|
14
|
-
Multiply,
|
|
15
|
-
Sim,
|
|
16
|
-
Subtract,
|
|
17
|
-
)
|
|
3
|
+
from upgini.autofe.binary import Add, Divide, Max, Min, Multiply, Sim, Subtract
|
|
18
4
|
from upgini.autofe.date import (
|
|
19
5
|
DateDiff,
|
|
20
6
|
DateDiffType2,
|
|
@@ -23,9 +9,9 @@ from upgini.autofe.date import (
|
|
|
23
9
|
DatePercentile,
|
|
24
10
|
DatePercentileMethod2,
|
|
25
11
|
)
|
|
26
|
-
from upgini.autofe.groupby import GroupByThenAgg,
|
|
12
|
+
from upgini.autofe.groupby import GroupByThenAgg, GroupByThenRank
|
|
27
13
|
from upgini.autofe.operand import Operand
|
|
28
|
-
from upgini.autofe.unary import Abs,
|
|
14
|
+
from upgini.autofe.unary import Abs, Floor, Freq, Log, Residual, Norm, Sigmoid, Sqrt, Square
|
|
29
15
|
from upgini.autofe.vector import Mean, Sum
|
|
30
16
|
|
|
31
17
|
ALL_OPERANDS: Dict[str, Operand] = {
|
|
@@ -53,10 +39,10 @@ ALL_OPERANDS: Dict[str, Operand] = {
|
|
|
53
39
|
GroupByThenAgg(name="GroupByThenMedian", agg="median"),
|
|
54
40
|
GroupByThenAgg(name="GroupByThenStd", output_type="float", agg="std"),
|
|
55
41
|
GroupByThenRank(),
|
|
56
|
-
Combine
|
|
57
|
-
CombineThenFreq
|
|
58
|
-
GroupByThenNUnique
|
|
59
|
-
GroupByThenFreq
|
|
42
|
+
Operand(name="Combine", has_symmetry_importance=True, output_type="object", is_categorical=True),
|
|
43
|
+
Operand(name="CombineThenFreq", has_symmetry_importance=True, output_type="float"),
|
|
44
|
+
Operand(name="GroupByThenNUnique", output_type="int", is_vectorizable=True, is_grouping=True),
|
|
45
|
+
Operand(name="GroupByThenFreq", output_type="float", is_grouping=True),
|
|
60
46
|
Sim(),
|
|
61
47
|
DateDiff(),
|
|
62
48
|
DateDiffType2(),
|
|
@@ -73,11 +59,6 @@ ALL_OPERANDS: Dict[str, Operand] = {
|
|
|
73
59
|
DatePercentile(),
|
|
74
60
|
DatePercentileMethod2(),
|
|
75
61
|
Norm(),
|
|
76
|
-
JaroWinklerSim1(),
|
|
77
|
-
JaroWinklerSim2(),
|
|
78
|
-
LevenshteinSim(),
|
|
79
|
-
Distance(),
|
|
80
|
-
Embeddings(),
|
|
81
62
|
]
|
|
82
63
|
}
|
|
83
64
|
|
upgini/autofe/binary.py
CHANGED
|
@@ -1,11 +1,7 @@
|
|
|
1
|
-
import abc
|
|
2
|
-
from typing import Optional
|
|
3
|
-
import Levenshtein
|
|
4
1
|
import numpy as np
|
|
5
2
|
import pandas as pd
|
|
6
3
|
from numpy import dot
|
|
7
4
|
from numpy.linalg import norm
|
|
8
|
-
from jarowinkler import jarowinkler_similarity
|
|
9
5
|
|
|
10
6
|
from upgini.autofe.operand import PandasOperand, VectorizableMixin
|
|
11
7
|
|
|
@@ -134,27 +130,7 @@ class CombineThenFreq(PandasOperand):
|
|
|
134
130
|
self._loc(temp, value_counts)
|
|
135
131
|
|
|
136
132
|
|
|
137
|
-
class
|
|
138
|
-
name = "dist"
|
|
139
|
-
is_binary = True
|
|
140
|
-
output_type = "float"
|
|
141
|
-
is_symmetrical = True
|
|
142
|
-
has_symmetry_importance = True
|
|
143
|
-
|
|
144
|
-
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
145
|
-
return pd.Series(
|
|
146
|
-
1 - self.__dot(left, right) / (self.__dot(left, left) * self.__dot(right, right)), index=left.index
|
|
147
|
-
)
|
|
148
|
-
|
|
149
|
-
# row-wise dot product
|
|
150
|
-
def __dot(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
151
|
-
res = (left.dropna() * right.dropna()).apply(np.sum)
|
|
152
|
-
res = res.reindex(left.index.union(right.index))
|
|
153
|
-
return res
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
# Left for backward compatibility
|
|
157
|
-
class Sim(Distance):
|
|
133
|
+
class Sim(PandasOperand):
|
|
158
134
|
name = "sim"
|
|
159
135
|
is_binary = True
|
|
160
136
|
output_type = "float"
|
|
@@ -162,71 +138,4 @@ class Sim(Distance):
|
|
|
162
138
|
has_symmetry_importance = True
|
|
163
139
|
|
|
164
140
|
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
165
|
-
return
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
class StringSim(PandasOperand, abc.ABC):
|
|
169
|
-
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
170
|
-
sims = []
|
|
171
|
-
for i in left.index:
|
|
172
|
-
left_i = self._prepare_value(left.get(i))
|
|
173
|
-
right_i = self._prepare_value(right.get(i))
|
|
174
|
-
if left_i is not None and right_i is not None:
|
|
175
|
-
sims.append(self._similarity(left_i, right_i))
|
|
176
|
-
else:
|
|
177
|
-
sims.append(None)
|
|
178
|
-
|
|
179
|
-
return pd.Series(sims, index=left.index)
|
|
180
|
-
|
|
181
|
-
@abc.abstractmethod
|
|
182
|
-
def _prepare_value(self, value: Optional[str]) -> Optional[str]:
|
|
183
|
-
pass
|
|
184
|
-
|
|
185
|
-
@abc.abstractmethod
|
|
186
|
-
def _similarity(self, left: str, right: str) -> float:
|
|
187
|
-
pass
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
class JaroWinklerSim1(StringSim):
|
|
191
|
-
name = "sim_jw1"
|
|
192
|
-
is_binary = True
|
|
193
|
-
input_type = "string"
|
|
194
|
-
output_type = "float"
|
|
195
|
-
is_symmetrical = True
|
|
196
|
-
has_symmetry_importance = True
|
|
197
|
-
|
|
198
|
-
def _prepare_value(self, value: Optional[str]) -> Optional[str]:
|
|
199
|
-
return value
|
|
200
|
-
|
|
201
|
-
def _similarity(self, left: str, right: str) -> float:
|
|
202
|
-
return jarowinkler_similarity(left, right)
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
class JaroWinklerSim2(StringSim):
|
|
206
|
-
name = "sim_jw2"
|
|
207
|
-
is_binary = True
|
|
208
|
-
input_type = "string"
|
|
209
|
-
output_type = "float"
|
|
210
|
-
is_symmetrical = True
|
|
211
|
-
has_symmetry_importance = True
|
|
212
|
-
|
|
213
|
-
def _prepare_value(self, value: Optional[str]) -> Optional[str]:
|
|
214
|
-
return value[::-1] if value is not None else None
|
|
215
|
-
|
|
216
|
-
def _similarity(self, left: str, right: str) -> float:
|
|
217
|
-
return jarowinkler_similarity(left, right)
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
class LevenshteinSim(StringSim):
|
|
221
|
-
name = "sim_lv"
|
|
222
|
-
is_binary = True
|
|
223
|
-
input_type = "string"
|
|
224
|
-
output_type = "float"
|
|
225
|
-
is_symmetrical = True
|
|
226
|
-
has_symmetry_importance = True
|
|
227
|
-
|
|
228
|
-
def _prepare_value(self, value: Optional[str]) -> Optional[str]:
|
|
229
|
-
return value
|
|
230
|
-
|
|
231
|
-
def _similarity(self, left: str, right: str) -> float:
|
|
232
|
-
return 1 - Levenshtein.distance(left, right) / max(len(left), len(right))
|
|
141
|
+
return dot(left, right) / (norm(left) * norm(right))
|
upgini/autofe/date.py
CHANGED
|
@@ -43,8 +43,6 @@ class DateDiff(PandasOperand, DateDiffMixin):
|
|
|
43
43
|
is_binary = True
|
|
44
44
|
has_symmetry_importance = True
|
|
45
45
|
|
|
46
|
-
replace_negative: bool = False
|
|
47
|
-
|
|
48
46
|
def get_params(self) -> Dict[str, Optional[str]]:
|
|
49
47
|
res = super().get_params()
|
|
50
48
|
res.update(
|
|
@@ -52,7 +50,6 @@ class DateDiff(PandasOperand, DateDiffMixin):
|
|
|
52
50
|
"diff_unit": self.diff_unit,
|
|
53
51
|
"left_unit": self.left_unit,
|
|
54
52
|
"right_unit": self.right_unit,
|
|
55
|
-
"replace_negative": self.replace_negative,
|
|
56
53
|
}
|
|
57
54
|
)
|
|
58
55
|
return res
|
|
@@ -64,8 +61,7 @@ class DateDiff(PandasOperand, DateDiffMixin):
|
|
|
64
61
|
return self.__replace_negative(diff)
|
|
65
62
|
|
|
66
63
|
def __replace_negative(self, x: Union[pd.DataFrame, pd.Series]):
|
|
67
|
-
|
|
68
|
-
x[x < 0] = None
|
|
64
|
+
x[x < 0] = None
|
|
69
65
|
return x
|
|
70
66
|
|
|
71
67
|
|
|
@@ -89,7 +85,7 @@ class DateDiffType2(PandasOperand, DateDiffMixin):
|
|
|
89
85
|
left = self._convert_to_date(left, self.left_unit)
|
|
90
86
|
right = self._convert_to_date(right, self.right_unit)
|
|
91
87
|
future = right + (left.dt.year - right.dt.year).apply(
|
|
92
|
-
lambda y:
|
|
88
|
+
lambda y: np.datetime64("NaT") if np.isnan(y) else pd.tseries.offsets.DateOffset(years=y)
|
|
93
89
|
)
|
|
94
90
|
future = pd.to_datetime(future)
|
|
95
91
|
before = future[future < left]
|
|
@@ -105,19 +101,13 @@ _ext_aggregations = {"nunique": (lambda x: len(np.unique(x)), 0), "count": (len,
|
|
|
105
101
|
class DateListDiff(PandasOperand, DateDiffMixin):
|
|
106
102
|
is_binary = True
|
|
107
103
|
has_symmetry_importance = True
|
|
108
|
-
|
|
109
104
|
aggregation: str
|
|
110
|
-
replace_negative: bool = False
|
|
111
105
|
|
|
112
106
|
def get_params(self) -> Dict[str, Optional[str]]:
|
|
113
107
|
res = super().get_params()
|
|
114
108
|
res.update(
|
|
115
109
|
{
|
|
116
110
|
"aggregation": self.aggregation,
|
|
117
|
-
"diff_unit": self.diff_unit,
|
|
118
|
-
"left_unit": self.left_unit,
|
|
119
|
-
"right_unit": self.right_unit,
|
|
120
|
-
"replace_negative": self.replace_negative,
|
|
121
111
|
}
|
|
122
112
|
)
|
|
123
113
|
return res
|
|
@@ -135,7 +125,7 @@ class DateListDiff(PandasOperand, DateDiffMixin):
|
|
|
135
125
|
|
|
136
126
|
def _diff(self, x: TimedeltaArray):
|
|
137
127
|
x = self._convert_diff_to_unit(x)
|
|
138
|
-
return x[x > 0]
|
|
128
|
+
return x[x > 0]
|
|
139
129
|
|
|
140
130
|
def _agg(self, x):
|
|
141
131
|
method = getattr(np, self.aggregation, None)
|
|
@@ -167,10 +157,7 @@ class DateListDiffBounded(DateListDiff):
|
|
|
167
157
|
super().__init__(**data)
|
|
168
158
|
|
|
169
159
|
def _agg(self, x):
|
|
170
|
-
x = x[
|
|
171
|
-
(x >= (self.lower_bound if self.lower_bound is not None else -np.inf))
|
|
172
|
-
& (x < (self.upper_bound if self.upper_bound is not None else np.inf))
|
|
173
|
-
]
|
|
160
|
+
x = x[(x >= (self.lower_bound or -np.inf)) & (x < (self.upper_bound or np.inf))]
|
|
174
161
|
return super()._agg(x)
|
|
175
162
|
|
|
176
163
|
|
upgini/autofe/feature.py
CHANGED
|
@@ -138,17 +138,15 @@ class Feature:
|
|
|
138
138
|
if self.cached_display_name is not None and cache:
|
|
139
139
|
return self.cached_display_name
|
|
140
140
|
|
|
141
|
-
should_stack_op = not isinstance(self.children[0], Column) if self.op.is_unary else False
|
|
142
|
-
prev_name = [self.children[0].get_op_display_name()] if should_stack_op else []
|
|
143
|
-
|
|
144
141
|
if self.alias:
|
|
145
142
|
components = ["f_autofe", self.alias]
|
|
146
|
-
elif shorten and
|
|
147
|
-
components = ["f_autofe"
|
|
143
|
+
elif shorten and not self.op.is_unary:
|
|
144
|
+
components = ["f_autofe", self.get_op_display_name()]
|
|
148
145
|
else:
|
|
149
|
-
components = (
|
|
150
|
-
|
|
151
|
-
|
|
146
|
+
components = ["f_" + "_f_".join(self.get_columns(**kwargs))] + [
|
|
147
|
+
"autofe",
|
|
148
|
+
self.get_op_display_name(),
|
|
149
|
+
]
|
|
152
150
|
components.extend([str(self.display_index)] if self.display_index is not None else [])
|
|
153
151
|
display_name = "_".join(components)
|
|
154
152
|
|
|
@@ -323,10 +321,10 @@ class FeatureGroup:
|
|
|
323
321
|
lower_order_names = [ch.get_display_name() for ch in lower_order_children]
|
|
324
322
|
if any(isinstance(f, Feature) for f in lower_order_children):
|
|
325
323
|
child_data = pd.concat(
|
|
326
|
-
[data[main_column
|
|
324
|
+
[data[main_column]] + [ch.calculate(data) for ch in lower_order_children],
|
|
327
325
|
axis=1,
|
|
328
326
|
)
|
|
329
|
-
child_data.columns =
|
|
327
|
+
child_data.columns = [main_column] + lower_order_names
|
|
330
328
|
else:
|
|
331
329
|
child_data = data[columns]
|
|
332
330
|
|
upgini/autofe/unary.py
CHANGED
|
@@ -125,10 +125,3 @@ class Norm(PandasOperand):
|
|
|
125
125
|
normalized_data = pd.Series(normalized_data[:, 0], index=data_dropna.index, name=data.name)
|
|
126
126
|
normalized_data = normalized_data.reindex(data.index)
|
|
127
127
|
return normalized_data
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
class Embeddings(PandasOperand):
|
|
131
|
-
name = "emb"
|
|
132
|
-
is_unary = True
|
|
133
|
-
input_type = "string"
|
|
134
|
-
output_type = "vector"
|
upgini/dataset.py
CHANGED
|
@@ -23,7 +23,9 @@ from pandas.api.types import (
|
|
|
23
23
|
from upgini.errors import ValidationError
|
|
24
24
|
from upgini.http import ProgressStage, SearchProgress, _RestClient
|
|
25
25
|
from upgini.metadata import (
|
|
26
|
+
ENTITY_SYSTEM_RECORD_ID,
|
|
26
27
|
EVAL_SET_INDEX,
|
|
28
|
+
SEARCH_KEY_UNNEST,
|
|
27
29
|
SYSTEM_COLUMNS,
|
|
28
30
|
SYSTEM_RECORD_ID,
|
|
29
31
|
TARGET,
|
|
@@ -79,6 +81,7 @@ class Dataset: # (pd.DataFrame):
|
|
|
79
81
|
path: Optional[str] = None,
|
|
80
82
|
meaning_types: Optional[Dict[str, FileColumnMeaningType]] = None,
|
|
81
83
|
search_keys: Optional[List[Tuple[str, ...]]] = None,
|
|
84
|
+
unnest_search_keys: Optional[Dict[str, str]] = None,
|
|
82
85
|
model_task_type: Optional[ModelTaskType] = None,
|
|
83
86
|
random_state: Optional[int] = None,
|
|
84
87
|
rest_client: Optional[_RestClient] = None,
|
|
@@ -113,6 +116,7 @@ class Dataset: # (pd.DataFrame):
|
|
|
113
116
|
self.description = description
|
|
114
117
|
self.meaning_types = meaning_types
|
|
115
118
|
self.search_keys = search_keys
|
|
119
|
+
self.unnest_search_keys = unnest_search_keys
|
|
116
120
|
self.ignore_columns = []
|
|
117
121
|
self.hierarchical_group_keys = []
|
|
118
122
|
self.hierarchical_subgroup_keys = []
|
|
@@ -172,7 +176,7 @@ class Dataset: # (pd.DataFrame):
|
|
|
172
176
|
new_columns = []
|
|
173
177
|
dup_counter = 0
|
|
174
178
|
for column in self.data.columns:
|
|
175
|
-
if column in [TARGET, EVAL_SET_INDEX, SYSTEM_RECORD_ID]:
|
|
179
|
+
if column in [TARGET, EVAL_SET_INDEX, SYSTEM_RECORD_ID, ENTITY_SYSTEM_RECORD_ID, SEARCH_KEY_UNNEST]:
|
|
176
180
|
self.columns_renaming[column] = column
|
|
177
181
|
new_columns.append(column)
|
|
178
182
|
continue
|
|
@@ -353,7 +357,9 @@ class Dataset: # (pd.DataFrame):
|
|
|
353
357
|
|
|
354
358
|
if is_string_dtype(self.data[postal_code]) or is_object_dtype(self.data[postal_code]):
|
|
355
359
|
try:
|
|
356
|
-
self.data[postal_code] =
|
|
360
|
+
self.data[postal_code] = (
|
|
361
|
+
self.data[postal_code].astype("string").astype("Float64").astype("Int64").astype("string")
|
|
362
|
+
)
|
|
357
363
|
except Exception:
|
|
358
364
|
pass
|
|
359
365
|
elif is_float_dtype(self.data[postal_code]):
|
|
@@ -803,6 +809,9 @@ class Dataset: # (pd.DataFrame):
|
|
|
803
809
|
meaningType=meaning_type,
|
|
804
810
|
minMaxValues=min_max_values,
|
|
805
811
|
)
|
|
812
|
+
if self.unnest_search_keys and column_meta.originalName in self.unnest_search_keys:
|
|
813
|
+
column_meta.isUnnest = True
|
|
814
|
+
column_meta.unnestKeyNames = self.unnest_search_keys[column_meta.originalName]
|
|
806
815
|
|
|
807
816
|
columns.append(column_meta)
|
|
808
817
|
|
upgini/features_enricher.py
CHANGED
|
@@ -11,6 +11,7 @@ import sys
|
|
|
11
11
|
import tempfile
|
|
12
12
|
import time
|
|
13
13
|
import uuid
|
|
14
|
+
from collections import Counter
|
|
14
15
|
from dataclasses import dataclass
|
|
15
16
|
from threading import Thread
|
|
16
17
|
from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union
|
|
@@ -45,9 +46,11 @@ from upgini.mdc import MDC
|
|
|
45
46
|
from upgini.metadata import (
|
|
46
47
|
COUNTRY,
|
|
47
48
|
DEFAULT_INDEX,
|
|
49
|
+
ENTITY_SYSTEM_RECORD_ID,
|
|
48
50
|
EVAL_SET_INDEX,
|
|
49
51
|
ORIGINAL_INDEX,
|
|
50
52
|
RENAMED_INDEX,
|
|
53
|
+
SEARCH_KEY_UNNEST,
|
|
51
54
|
SORT_ID,
|
|
52
55
|
SYSTEM_RECORD_ID,
|
|
53
56
|
TARGET,
|
|
@@ -248,7 +251,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
248
251
|
self.__cached_sampled_datasets: Optional[Tuple[pd.DataFrame, pd.DataFrame, pd.Series, Dict, Dict]] = None
|
|
249
252
|
|
|
250
253
|
validate_version(self.logger)
|
|
251
|
-
self.search_keys = search_keys or
|
|
254
|
+
self.search_keys = search_keys or {}
|
|
252
255
|
self.country_code = country_code
|
|
253
256
|
self.__validate_search_keys(search_keys, search_id)
|
|
254
257
|
self.model_task_type = model_task_type
|
|
@@ -1200,7 +1203,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1200
1203
|
email_column = self._get_email_column(search_keys)
|
|
1201
1204
|
hem_column = self._get_hem_column(search_keys)
|
|
1202
1205
|
if email_column:
|
|
1203
|
-
converter = EmailSearchKeyConverter(email_column, hem_column, search_keys, self.logger)
|
|
1206
|
+
converter = EmailSearchKeyConverter(email_column, hem_column, search_keys, [], self.logger)
|
|
1204
1207
|
extended_X = converter.convert(extended_X)
|
|
1205
1208
|
generated_features.extend(converter.generated_features)
|
|
1206
1209
|
if (
|
|
@@ -1353,7 +1356,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1353
1356
|
not in (
|
|
1354
1357
|
excluding_search_keys
|
|
1355
1358
|
+ list(self.fit_dropped_features)
|
|
1356
|
-
+ [DateTimeSearchKeyConverter.DATETIME_COL, SYSTEM_RECORD_ID]
|
|
1359
|
+
+ [DateTimeSearchKeyConverter.DATETIME_COL, SYSTEM_RECORD_ID, ENTITY_SYSTEM_RECORD_ID]
|
|
1357
1360
|
)
|
|
1358
1361
|
]
|
|
1359
1362
|
|
|
@@ -1417,7 +1420,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1417
1420
|
fitting_enriched_X[col].astype("string").str.replace(",", ".", regex=False).astype(np.float64)
|
|
1418
1421
|
)
|
|
1419
1422
|
|
|
1420
|
-
fitting_eval_set_dict =
|
|
1423
|
+
fitting_eval_set_dict = {}
|
|
1421
1424
|
for idx, eval_tuple in eval_set_sampled_dict.items():
|
|
1422
1425
|
eval_X_sampled, enriched_eval_X, eval_y_sampled = eval_tuple
|
|
1423
1426
|
eval_X_sorted, eval_y_sorted = self._sort_by_system_record_id(eval_X_sampled, eval_y_sampled, self.cv)
|
|
@@ -1534,7 +1537,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1534
1537
|
def __sample_only_input(
|
|
1535
1538
|
self, validated_X: pd.DataFrame, validated_y: pd.Series, eval_set: Optional[List[tuple]], is_demo_dataset: bool
|
|
1536
1539
|
) -> _SampledDataForMetrics:
|
|
1537
|
-
eval_set_sampled_dict =
|
|
1540
|
+
eval_set_sampled_dict = {}
|
|
1538
1541
|
|
|
1539
1542
|
df = validated_X.copy()
|
|
1540
1543
|
df[TARGET] = validated_y
|
|
@@ -1560,7 +1563,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1560
1563
|
df = df.sample(n=sample_rows, random_state=self.random_state)
|
|
1561
1564
|
|
|
1562
1565
|
df_extended, search_keys = self._extend_x(df, is_demo_dataset)
|
|
1563
|
-
df_extended = self.__add_fit_system_record_id(df_extended,
|
|
1566
|
+
df_extended = self.__add_fit_system_record_id(df_extended, {}, search_keys, SYSTEM_RECORD_ID)
|
|
1564
1567
|
|
|
1565
1568
|
train_df = df_extended.query(f"{EVAL_SET_INDEX} == 0") if eval_set is not None else df_extended
|
|
1566
1569
|
X_sampled = train_df.drop(columns=[TARGET, EVAL_SET_INDEX], errors="ignore")
|
|
@@ -1584,7 +1587,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1584
1587
|
trace_id: str,
|
|
1585
1588
|
remove_outliers_calc_metrics: Optional[bool],
|
|
1586
1589
|
) -> _SampledDataForMetrics:
|
|
1587
|
-
eval_set_sampled_dict =
|
|
1590
|
+
eval_set_sampled_dict = {}
|
|
1588
1591
|
search_keys = self.fit_search_keys
|
|
1589
1592
|
|
|
1590
1593
|
rows_to_drop = None
|
|
@@ -1598,8 +1601,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1598
1601
|
outliers = pd.merge(
|
|
1599
1602
|
self.df_with_original_index,
|
|
1600
1603
|
target_outliers_df,
|
|
1601
|
-
|
|
1602
|
-
right_on=SYSTEM_RECORD_ID,
|
|
1604
|
+
on=ENTITY_SYSTEM_RECORD_ID,
|
|
1603
1605
|
how="inner",
|
|
1604
1606
|
)
|
|
1605
1607
|
top_outliers = outliers.sort_values(by=TARGET, ascending=False)[TARGET].head(3)
|
|
@@ -1658,7 +1660,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1658
1660
|
progress_bar: Optional[ProgressBar],
|
|
1659
1661
|
progress_callback: Optional[Callable[[SearchProgress], Any]],
|
|
1660
1662
|
) -> _SampledDataForMetrics:
|
|
1661
|
-
eval_set_sampled_dict =
|
|
1663
|
+
eval_set_sampled_dict = {}
|
|
1662
1664
|
if eval_set is not None:
|
|
1663
1665
|
self.logger.info("Transform with eval_set")
|
|
1664
1666
|
# concatenate X and eval_set with eval_set_index
|
|
@@ -1680,7 +1682,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1680
1682
|
self.logger.info(f"Downsampling from {num_samples} to {Dataset.FIT_SAMPLE_WITH_EVAL_SET_ROWS}")
|
|
1681
1683
|
df = df.sample(n=Dataset.FIT_SAMPLE_WITH_EVAL_SET_ROWS, random_state=self.random_state)
|
|
1682
1684
|
|
|
1683
|
-
eval_set_sampled_dict =
|
|
1685
|
+
eval_set_sampled_dict = {}
|
|
1684
1686
|
|
|
1685
1687
|
tmp_target_name = "__target"
|
|
1686
1688
|
df = df.rename(columns={TARGET: tmp_target_name})
|
|
@@ -1943,11 +1945,38 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1943
1945
|
self.logger.info("Input dataset hasn't date column")
|
|
1944
1946
|
if self.add_date_if_missing:
|
|
1945
1947
|
df = self._add_current_date_as_key(df, search_keys, self.logger, self.bundle)
|
|
1948
|
+
|
|
1949
|
+
# Don't pass all features in backend on transform
|
|
1950
|
+
original_features_for_transform = []
|
|
1951
|
+
runtime_parameters = self._get_copy_of_runtime_parameters()
|
|
1952
|
+
features_not_to_pass = [column for column in df.columns if column not in search_keys.keys()]
|
|
1953
|
+
if len(features_not_to_pass) > 0:
|
|
1954
|
+
# Pass only features that need for transform
|
|
1955
|
+
features_for_transform = self._search_task.get_features_for_transform()
|
|
1956
|
+
if features_for_transform is not None and len(features_for_transform) > 0:
|
|
1957
|
+
file_metadata = self._search_task.get_file_metadata(trace_id)
|
|
1958
|
+
original_features_for_transform = [
|
|
1959
|
+
c.originalName or c.name for c in file_metadata.columns if c.name in features_for_transform
|
|
1960
|
+
]
|
|
1961
|
+
|
|
1962
|
+
runtime_parameters.properties["features_for_embeddings"] = ",".join(features_for_transform)
|
|
1963
|
+
|
|
1964
|
+
columns_for_system_record_id = sorted(list(search_keys.keys()) + (original_features_for_transform))
|
|
1965
|
+
|
|
1966
|
+
df[ENTITY_SYSTEM_RECORD_ID] = pd.util.hash_pandas_object(
|
|
1967
|
+
df[columns_for_system_record_id], index=False
|
|
1968
|
+
).astype("Float64")
|
|
1969
|
+
|
|
1970
|
+
# Explode multiple search keys
|
|
1971
|
+
df, unnest_search_keys = self._explode_multiple_search_keys(df, search_keys)
|
|
1972
|
+
|
|
1946
1973
|
email_column = self._get_email_column(search_keys)
|
|
1947
1974
|
hem_column = self._get_hem_column(search_keys)
|
|
1948
1975
|
email_converted_to_hem = False
|
|
1949
1976
|
if email_column:
|
|
1950
|
-
converter = EmailSearchKeyConverter(
|
|
1977
|
+
converter = EmailSearchKeyConverter(
|
|
1978
|
+
email_column, hem_column, search_keys, list(unnest_search_keys.keys()), self.logger
|
|
1979
|
+
)
|
|
1951
1980
|
df = converter.convert(df)
|
|
1952
1981
|
generated_features.extend(converter.generated_features)
|
|
1953
1982
|
email_converted_to_hem = converter.email_converted_to_hem
|
|
@@ -1961,30 +1990,21 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1961
1990
|
generated_features = [f for f in generated_features if f in self.fit_generated_features]
|
|
1962
1991
|
|
|
1963
1992
|
meaning_types = {col: key.value for col, key in search_keys.items()}
|
|
1964
|
-
non_keys_columns = [column for column in df.columns if column not in search_keys.keys()]
|
|
1993
|
+
# non_keys_columns = [column for column in df.columns if column not in search_keys.keys()]
|
|
1994
|
+
for col in original_features_for_transform:
|
|
1995
|
+
meaning_types[col] = FileColumnMeaningType.FEATURE
|
|
1996
|
+
features_not_to_pass = [column for column in features_not_to_pass if column not in search_keys.keys()]
|
|
1965
1997
|
|
|
1966
1998
|
if email_converted_to_hem:
|
|
1967
|
-
|
|
1999
|
+
features_not_to_pass.append(email_column)
|
|
1968
2000
|
|
|
1969
|
-
|
|
1970
|
-
|
|
1971
|
-
runtime_parameters = self._get_copy_of_runtime_parameters()
|
|
1972
|
-
if len(non_keys_columns) > 0:
|
|
1973
|
-
# Pass only features that need for transform
|
|
1974
|
-
features_for_transform = self._search_task.get_features_for_transform()
|
|
1975
|
-
if features_for_transform is not None and len(features_for_transform) > 0:
|
|
1976
|
-
file_metadata = self._search_task.get_file_metadata(trace_id)
|
|
1977
|
-
original_features_for_transform = [
|
|
1978
|
-
c.originalName or c.name for c in file_metadata.columns if c.name in features_for_transform
|
|
1979
|
-
]
|
|
1980
|
-
non_keys_columns = [c for c in non_keys_columns if c not in original_features_for_transform]
|
|
1981
|
-
|
|
1982
|
-
runtime_parameters.properties["features_for_embeddings"] = ",".join(features_for_transform)
|
|
2001
|
+
features_not_to_pass = [c for c in features_not_to_pass if c not in original_features_for_transform]
|
|
2002
|
+
columns_for_system_record_id = sorted(list(search_keys.keys()) + (original_features_for_transform))
|
|
1983
2003
|
|
|
1984
2004
|
if add_fit_system_record_id:
|
|
1985
|
-
df = self.__add_fit_system_record_id(df,
|
|
2005
|
+
df = self.__add_fit_system_record_id(df, {}, search_keys, SYSTEM_RECORD_ID)
|
|
1986
2006
|
df = df.rename(columns={SYSTEM_RECORD_ID: SORT_ID})
|
|
1987
|
-
|
|
2007
|
+
features_not_to_pass.append(SORT_ID)
|
|
1988
2008
|
|
|
1989
2009
|
columns_for_system_record_id = sorted(list(search_keys.keys()) + (original_features_for_transform or []))
|
|
1990
2010
|
|
|
@@ -1992,16 +2012,19 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1992
2012
|
"Float64"
|
|
1993
2013
|
)
|
|
1994
2014
|
meaning_types[SYSTEM_RECORD_ID] = FileColumnMeaningType.SYSTEM_RECORD_ID
|
|
2015
|
+
meaning_types[ENTITY_SYSTEM_RECORD_ID] = FileColumnMeaningType.ENTITY_SYSTEM_RECORD_ID
|
|
2016
|
+
if SEARCH_KEY_UNNEST in df.columns:
|
|
2017
|
+
meaning_types[SEARCH_KEY_UNNEST] = FileColumnMeaningType.UNNEST_KEY
|
|
1995
2018
|
|
|
1996
2019
|
df = df.reset_index(drop=True)
|
|
1997
|
-
system_columns_with_original_index = [SYSTEM_RECORD_ID] + generated_features
|
|
2020
|
+
system_columns_with_original_index = [SYSTEM_RECORD_ID, ENTITY_SYSTEM_RECORD_ID] + generated_features
|
|
1998
2021
|
if add_fit_system_record_id:
|
|
1999
2022
|
system_columns_with_original_index.append(SORT_ID)
|
|
2000
2023
|
df_with_original_index = df[system_columns_with_original_index].copy()
|
|
2001
2024
|
|
|
2002
2025
|
combined_search_keys = combine_search_keys(search_keys.keys())
|
|
2003
2026
|
|
|
2004
|
-
df_without_features = df.drop(columns=
|
|
2027
|
+
df_without_features = df.drop(columns=features_not_to_pass)
|
|
2005
2028
|
|
|
2006
2029
|
df_without_features = clean_full_duplicates(
|
|
2007
2030
|
df_without_features, self.logger, silent=silent_mode, bundle=self.bundle
|
|
@@ -2013,12 +2036,13 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2013
2036
|
dataset = Dataset(
|
|
2014
2037
|
"sample_" + str(uuid.uuid4()),
|
|
2015
2038
|
df=df_without_features,
|
|
2039
|
+
meaning_types=meaning_types,
|
|
2040
|
+
search_keys=combined_search_keys,
|
|
2041
|
+
unnest_search_keys=unnest_search_keys,
|
|
2016
2042
|
date_format=self.date_format,
|
|
2017
2043
|
rest_client=self.rest_client,
|
|
2018
2044
|
logger=self.logger,
|
|
2019
2045
|
)
|
|
2020
|
-
dataset.meaning_types = meaning_types
|
|
2021
|
-
dataset.search_keys = combined_search_keys
|
|
2022
2046
|
if email_converted_to_hem:
|
|
2023
2047
|
dataset.ignore_columns = [email_column]
|
|
2024
2048
|
|
|
@@ -2157,6 +2181,14 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2157
2181
|
|
|
2158
2182
|
key_types = search_keys.values()
|
|
2159
2183
|
|
|
2184
|
+
# Multiple search keys allowed only for PHONE, IP, POSTAL_CODE, EMAIL, HEM
|
|
2185
|
+
multi_keys = [key for key, count in Counter(key_types).items() if count > 1]
|
|
2186
|
+
for multi_key in multi_keys:
|
|
2187
|
+
if multi_key not in [SearchKey.PHONE, SearchKey.IP, SearchKey.POSTAL_CODE, SearchKey.EMAIL, SearchKey.HEM]:
|
|
2188
|
+
msg = self.bundle.get("unsupported_multi_key").format(multi_key)
|
|
2189
|
+
self.logger.warning(msg)
|
|
2190
|
+
raise ValidationError(msg)
|
|
2191
|
+
|
|
2160
2192
|
if SearchKey.DATE in key_types and SearchKey.DATETIME in key_types:
|
|
2161
2193
|
msg = self.bundle.get("date_and_datetime_simultanious")
|
|
2162
2194
|
self.logger.warning(msg)
|
|
@@ -2172,11 +2204,11 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2172
2204
|
self.logger.warning(msg)
|
|
2173
2205
|
raise ValidationError(msg)
|
|
2174
2206
|
|
|
2175
|
-
for key_type in SearchKey.__members__.values():
|
|
2176
|
-
|
|
2177
|
-
|
|
2178
|
-
|
|
2179
|
-
|
|
2207
|
+
# for key_type in SearchKey.__members__.values():
|
|
2208
|
+
# if key_type != SearchKey.CUSTOM_KEY and list(key_types).count(key_type) > 1:
|
|
2209
|
+
# msg = self.bundle.get("multiple_search_key").format(key_type)
|
|
2210
|
+
# self.logger.warning(msg)
|
|
2211
|
+
# raise ValidationError(msg)
|
|
2180
2212
|
|
|
2181
2213
|
# non_personal_keys = set(SearchKey.__members__.values()) - set(SearchKey.personal_keys())
|
|
2182
2214
|
# if (
|
|
@@ -2314,14 +2346,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2314
2346
|
self.logger.info("Input dataset hasn't date column")
|
|
2315
2347
|
if self.add_date_if_missing:
|
|
2316
2348
|
df = self._add_current_date_as_key(df, self.fit_search_keys, self.logger, self.bundle)
|
|
2317
|
-
|
|
2318
|
-
hem_column = self._get_hem_column(self.fit_search_keys)
|
|
2319
|
-
email_converted_to_hem = False
|
|
2320
|
-
if email_column:
|
|
2321
|
-
converter = EmailSearchKeyConverter(email_column, hem_column, self.fit_search_keys, self.logger)
|
|
2322
|
-
df = converter.convert(df)
|
|
2323
|
-
self.fit_generated_features.extend(converter.generated_features)
|
|
2324
|
-
email_converted_to_hem = converter.email_converted_to_hem
|
|
2349
|
+
|
|
2325
2350
|
if (
|
|
2326
2351
|
self.detect_missing_search_keys
|
|
2327
2352
|
and list(self.fit_search_keys.values()) == [SearchKey.DATE]
|
|
@@ -2330,7 +2355,37 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2330
2355
|
converter = IpToCountrySearchKeyConverter(self.fit_search_keys, self.logger)
|
|
2331
2356
|
df = converter.convert(df)
|
|
2332
2357
|
|
|
2358
|
+
# Explode multiple search keys
|
|
2333
2359
|
non_feature_columns = [self.TARGET_NAME, EVAL_SET_INDEX] + list(self.fit_search_keys.keys())
|
|
2360
|
+
meaning_types = {
|
|
2361
|
+
**{col: key.value for col, key in self.fit_search_keys.items()},
|
|
2362
|
+
**{str(c): FileColumnMeaningType.FEATURE for c in df.columns if c not in non_feature_columns},
|
|
2363
|
+
}
|
|
2364
|
+
meaning_types[self.TARGET_NAME] = FileColumnMeaningType.TARGET
|
|
2365
|
+
if eval_set is not None and len(eval_set) > 0:
|
|
2366
|
+
meaning_types[EVAL_SET_INDEX] = FileColumnMeaningType.EVAL_SET_INDEX
|
|
2367
|
+
df = self.__add_fit_system_record_id(df, meaning_types, self.fit_search_keys, ENTITY_SYSTEM_RECORD_ID)
|
|
2368
|
+
|
|
2369
|
+
# TODO check that this is correct for enrichment
|
|
2370
|
+
self.df_with_original_index = df.copy()
|
|
2371
|
+
|
|
2372
|
+
df, unnest_search_keys = self._explode_multiple_search_keys(df, self.fit_search_keys)
|
|
2373
|
+
|
|
2374
|
+
# Convert EMAIL to HEM after unnesting to do it only with one column
|
|
2375
|
+
email_column = self._get_email_column(self.fit_search_keys)
|
|
2376
|
+
hem_column = self._get_hem_column(self.fit_search_keys)
|
|
2377
|
+
email_converted_to_hem = False
|
|
2378
|
+
if email_column:
|
|
2379
|
+
converter = EmailSearchKeyConverter(
|
|
2380
|
+
email_column, hem_column, self.fit_search_keys, list(unnest_search_keys.keys()), self.logger
|
|
2381
|
+
)
|
|
2382
|
+
df = converter.convert(df)
|
|
2383
|
+
self.fit_generated_features.extend(converter.generated_features)
|
|
2384
|
+
email_converted_to_hem = converter.email_converted_to_hem
|
|
2385
|
+
|
|
2386
|
+
non_feature_columns = [self.TARGET_NAME, EVAL_SET_INDEX, ENTITY_SYSTEM_RECORD_ID, SEARCH_KEY_UNNEST] + list(
|
|
2387
|
+
self.fit_search_keys.keys()
|
|
2388
|
+
)
|
|
2334
2389
|
if email_converted_to_hem:
|
|
2335
2390
|
non_feature_columns.append(email_column)
|
|
2336
2391
|
if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
|
|
@@ -2354,12 +2409,14 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2354
2409
|
**{str(c): FileColumnMeaningType.FEATURE for c in df.columns if c not in non_feature_columns},
|
|
2355
2410
|
}
|
|
2356
2411
|
meaning_types[self.TARGET_NAME] = FileColumnMeaningType.TARGET
|
|
2412
|
+
meaning_types[ENTITY_SYSTEM_RECORD_ID] = FileColumnMeaningType.ENTITY_SYSTEM_RECORD_ID
|
|
2413
|
+
if SEARCH_KEY_UNNEST in df.columns:
|
|
2414
|
+
meaning_types[SEARCH_KEY_UNNEST] = FileColumnMeaningType.UNNEST_KEY
|
|
2357
2415
|
if eval_set is not None and len(eval_set) > 0:
|
|
2358
2416
|
meaning_types[EVAL_SET_INDEX] = FileColumnMeaningType.EVAL_SET_INDEX
|
|
2359
2417
|
|
|
2360
|
-
df = self.__add_fit_system_record_id(df, meaning_types, self.fit_search_keys)
|
|
2418
|
+
df = self.__add_fit_system_record_id(df, meaning_types, self.fit_search_keys, SYSTEM_RECORD_ID)
|
|
2361
2419
|
|
|
2362
|
-
self.df_with_original_index = df.copy()
|
|
2363
2420
|
df = df.reset_index(drop=True).sort_values(by=SYSTEM_RECORD_ID).reset_index(drop=True)
|
|
2364
2421
|
|
|
2365
2422
|
combined_search_keys = combine_search_keys(self.fit_search_keys.keys())
|
|
@@ -2367,14 +2424,15 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2367
2424
|
dataset = Dataset(
|
|
2368
2425
|
"tds_" + str(uuid.uuid4()),
|
|
2369
2426
|
df=df,
|
|
2427
|
+
meaning_types=meaning_types,
|
|
2428
|
+
search_keys=combined_search_keys,
|
|
2429
|
+
unnest_search_keys=unnest_search_keys,
|
|
2370
2430
|
model_task_type=model_task_type,
|
|
2371
2431
|
date_format=self.date_format,
|
|
2372
2432
|
random_state=self.random_state,
|
|
2373
2433
|
rest_client=self.rest_client,
|
|
2374
2434
|
logger=self.logger,
|
|
2375
2435
|
)
|
|
2376
|
-
dataset.meaning_types = meaning_types
|
|
2377
|
-
dataset.search_keys = combined_search_keys
|
|
2378
2436
|
if email_converted_to_hem:
|
|
2379
2437
|
dataset.ignore_columns = [email_column]
|
|
2380
2438
|
|
|
@@ -2744,9 +2802,10 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2744
2802
|
X: pd.DataFrame, y: pd.Series, cv: Optional[CVType]
|
|
2745
2803
|
) -> Tuple[pd.DataFrame, pd.Series]:
|
|
2746
2804
|
if cv not in [CVType.time_series, CVType.blocked_time_series]:
|
|
2805
|
+
record_id_column = ENTITY_SYSTEM_RECORD_ID if ENTITY_SYSTEM_RECORD_ID in X else SYSTEM_RECORD_ID
|
|
2747
2806
|
Xy = X.copy()
|
|
2748
2807
|
Xy[TARGET] = y
|
|
2749
|
-
Xy = Xy.sort_values(by=
|
|
2808
|
+
Xy = Xy.sort_values(by=record_id_column).reset_index(drop=True)
|
|
2750
2809
|
X = Xy.drop(columns=TARGET)
|
|
2751
2810
|
y = Xy[TARGET].copy()
|
|
2752
2811
|
|
|
@@ -2925,15 +2984,19 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2925
2984
|
|
|
2926
2985
|
@staticmethod
|
|
2927
2986
|
def _get_email_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
|
|
2928
|
-
for col, t in search_keys.items()
|
|
2929
|
-
|
|
2930
|
-
|
|
2987
|
+
cols = [col for col, t in search_keys.items() if t == SearchKey.EMAIL]
|
|
2988
|
+
if len(cols) > 1:
|
|
2989
|
+
raise Exception("More than one email column found after unnest")
|
|
2990
|
+
if len(cols) == 1:
|
|
2991
|
+
return cols[0]
|
|
2931
2992
|
|
|
2932
2993
|
@staticmethod
|
|
2933
2994
|
def _get_hem_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
|
|
2934
|
-
for col, t in search_keys.items()
|
|
2935
|
-
|
|
2936
|
-
|
|
2995
|
+
cols = [col for col, t in search_keys.items() if t == SearchKey.HEM]
|
|
2996
|
+
if len(cols) > 1:
|
|
2997
|
+
raise Exception("More than one hem column found after unnest")
|
|
2998
|
+
if len(cols) == 1:
|
|
2999
|
+
return cols[0]
|
|
2937
3000
|
|
|
2938
3001
|
@staticmethod
|
|
2939
3002
|
def _get_phone_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
|
|
@@ -2941,8 +3004,44 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2941
3004
|
if t == SearchKey.PHONE:
|
|
2942
3005
|
return col
|
|
2943
3006
|
|
|
3007
|
+
def _explode_multiple_search_keys(
|
|
3008
|
+
self, df: pd.DataFrame, search_keys: Dict[str, SearchKey]
|
|
3009
|
+
) -> Tuple[pd.DataFrame, Dict[str, List[str]]]:
|
|
3010
|
+
# find groups of multiple search keys
|
|
3011
|
+
search_key_names_by_type: Dict[SearchKey, str] = {}
|
|
3012
|
+
for key_name, key_type in search_keys.items():
|
|
3013
|
+
search_key_names_by_type[key_type] = search_key_names_by_type.get(key_type, []) + [key_name]
|
|
3014
|
+
search_key_names_by_type = {
|
|
3015
|
+
key_type: key_names for key_type, key_names in search_key_names_by_type.items() if len(key_names) > 1
|
|
3016
|
+
}
|
|
3017
|
+
if len(search_key_names_by_type) == 0:
|
|
3018
|
+
return df, {}
|
|
3019
|
+
|
|
3020
|
+
multiple_keys_columns = [col for cols in search_key_names_by_type.values() for col in cols]
|
|
3021
|
+
other_columns = [col for col in df.columns if col not in multiple_keys_columns]
|
|
3022
|
+
exploded_dfs = []
|
|
3023
|
+
unnest_search_keys = {}
|
|
3024
|
+
|
|
3025
|
+
for key_type, key_names in search_key_names_by_type.items():
|
|
3026
|
+
new_search_key = f"upgini_{key_type.name.lower()}_unnest"
|
|
3027
|
+
exploded_df = pd.melt(
|
|
3028
|
+
df, id_vars=other_columns, value_vars=key_names, var_name=SEARCH_KEY_UNNEST, value_name=new_search_key
|
|
3029
|
+
)
|
|
3030
|
+
exploded_dfs.append(exploded_df)
|
|
3031
|
+
for old_key in key_names:
|
|
3032
|
+
del search_keys[old_key]
|
|
3033
|
+
search_keys[new_search_key] = key_type
|
|
3034
|
+
unnest_search_keys[new_search_key] = key_names
|
|
3035
|
+
|
|
3036
|
+
df = pd.concat(exploded_dfs, ignore_index=True)
|
|
3037
|
+
return df, unnest_search_keys
|
|
3038
|
+
|
|
2944
3039
|
def __add_fit_system_record_id(
|
|
2945
|
-
self,
|
|
3040
|
+
self,
|
|
3041
|
+
df: pd.DataFrame,
|
|
3042
|
+
meaning_types: Dict[str, FileColumnMeaningType],
|
|
3043
|
+
search_keys: Dict[str, SearchKey],
|
|
3044
|
+
id_name: str,
|
|
2946
3045
|
) -> pd.DataFrame:
|
|
2947
3046
|
# save original order or rows
|
|
2948
3047
|
original_index_name = df.index.name
|
|
@@ -2953,7 +3052,14 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2953
3052
|
|
|
2954
3053
|
# order by date and idempotent order by other keys
|
|
2955
3054
|
if self.cv not in [CVType.time_series, CVType.blocked_time_series]:
|
|
2956
|
-
sort_exclude_columns = [
|
|
3055
|
+
sort_exclude_columns = [
|
|
3056
|
+
original_order_name,
|
|
3057
|
+
ORIGINAL_INDEX,
|
|
3058
|
+
EVAL_SET_INDEX,
|
|
3059
|
+
TARGET,
|
|
3060
|
+
"__target",
|
|
3061
|
+
ENTITY_SYSTEM_RECORD_ID,
|
|
3062
|
+
]
|
|
2957
3063
|
if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
|
|
2958
3064
|
date_column = DateTimeSearchKeyConverter.DATETIME_COL
|
|
2959
3065
|
sort_exclude_columns.append(self._get_date_column(search_keys))
|
|
@@ -2991,14 +3097,18 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2991
3097
|
|
|
2992
3098
|
df = df.reset_index(drop=True).reset_index()
|
|
2993
3099
|
# system_record_id saves correct order for fit
|
|
2994
|
-
df = df.rename(columns={DEFAULT_INDEX:
|
|
3100
|
+
df = df.rename(columns={DEFAULT_INDEX: id_name})
|
|
2995
3101
|
|
|
2996
3102
|
# return original order
|
|
2997
3103
|
df = df.set_index(ORIGINAL_INDEX)
|
|
2998
3104
|
df.index.name = original_index_name
|
|
2999
3105
|
df = df.sort_values(by=original_order_name).drop(columns=original_order_name)
|
|
3000
3106
|
|
|
3001
|
-
meaning_types[
|
|
3107
|
+
meaning_types[id_name] = (
|
|
3108
|
+
FileColumnMeaningType.SYSTEM_RECORD_ID
|
|
3109
|
+
if id_name == SYSTEM_RECORD_ID
|
|
3110
|
+
else FileColumnMeaningType.ENTITY_SYSTEM_RECORD_ID
|
|
3111
|
+
)
|
|
3002
3112
|
return df
|
|
3003
3113
|
|
|
3004
3114
|
def __correct_target(self, df: pd.DataFrame) -> pd.DataFrame:
|
|
@@ -3053,7 +3163,11 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3053
3163
|
)
|
|
3054
3164
|
|
|
3055
3165
|
comparing_columns = X.columns if is_transform else df_with_original_index.columns
|
|
3056
|
-
dup_features = [
|
|
3166
|
+
dup_features = [
|
|
3167
|
+
c
|
|
3168
|
+
for c in comparing_columns
|
|
3169
|
+
if c in result_features.columns and c not in [SYSTEM_RECORD_ID, ENTITY_SYSTEM_RECORD_ID]
|
|
3170
|
+
]
|
|
3057
3171
|
if len(dup_features) > 0:
|
|
3058
3172
|
self.logger.warning(f"X contain columns with same name as returned from backend: {dup_features}")
|
|
3059
3173
|
raise ValidationError(self.bundle.get("returned_features_same_as_passed").format(dup_features))
|
|
@@ -3064,8 +3178,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3064
3178
|
result_features = pd.merge(
|
|
3065
3179
|
df_with_original_index,
|
|
3066
3180
|
result_features,
|
|
3067
|
-
|
|
3068
|
-
right_on=SYSTEM_RECORD_ID,
|
|
3181
|
+
on=ENTITY_SYSTEM_RECORD_ID,
|
|
3069
3182
|
how="left" if is_transform else "inner",
|
|
3070
3183
|
)
|
|
3071
3184
|
result_features = result_features.set_index(original_index_name or DEFAULT_INDEX)
|
|
@@ -3076,7 +3189,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3076
3189
|
result_features = result_features[~result_features[SYSTEM_RECORD_ID].isin(rows_to_drop[SYSTEM_RECORD_ID])]
|
|
3077
3190
|
self.logger.info(f"After dropping target outliers size: {len(result_features)}")
|
|
3078
3191
|
|
|
3079
|
-
result_eval_sets =
|
|
3192
|
+
result_eval_sets = {}
|
|
3080
3193
|
if not is_transform and EVAL_SET_INDEX in result_features.columns:
|
|
3081
3194
|
result_train_features = result_features.loc[result_features[EVAL_SET_INDEX] == 0].copy()
|
|
3082
3195
|
eval_set_indices = list(result_features[EVAL_SET_INDEX].unique())
|
|
@@ -3288,7 +3401,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3288
3401
|
if autofe_feature.op.is_vector:
|
|
3289
3402
|
continue
|
|
3290
3403
|
|
|
3291
|
-
description =
|
|
3404
|
+
description = {}
|
|
3292
3405
|
|
|
3293
3406
|
feature_meta = get_feature_by_name(autofe_feature.get_display_name(shorten=True))
|
|
3294
3407
|
if feature_meta is None:
|
|
@@ -3454,13 +3567,13 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3454
3567
|
self.warning_counter.increment()
|
|
3455
3568
|
|
|
3456
3569
|
if len(valid_search_keys) == 1:
|
|
3457
|
-
|
|
3458
|
-
|
|
3459
|
-
|
|
3460
|
-
|
|
3461
|
-
|
|
3462
|
-
|
|
3463
|
-
|
|
3570
|
+
key, value = list(valid_search_keys.items())[0]
|
|
3571
|
+
# Show warning for country only if country is the only key
|
|
3572
|
+
if x[key].nunique() == 1:
|
|
3573
|
+
msg = self.bundle.get("single_constant_search_key").format(value, x[key].values[0])
|
|
3574
|
+
print(msg)
|
|
3575
|
+
self.logger.warning(msg)
|
|
3576
|
+
self.warning_counter.increment()
|
|
3464
3577
|
|
|
3465
3578
|
self.logger.info(f"Prepared search keys: {valid_search_keys}")
|
|
3466
3579
|
|
|
@@ -3570,61 +3683,68 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3570
3683
|
def check_need_detect(search_key: SearchKey):
|
|
3571
3684
|
return not is_transform or search_key in self.fit_search_keys.values()
|
|
3572
3685
|
|
|
3573
|
-
if SearchKey.POSTAL_CODE not in search_keys.values() and check_need_detect(SearchKey.POSTAL_CODE):
|
|
3574
|
-
|
|
3575
|
-
|
|
3576
|
-
|
|
3577
|
-
|
|
3578
|
-
|
|
3686
|
+
# if SearchKey.POSTAL_CODE not in search_keys.values() and check_need_detect(SearchKey.POSTAL_CODE):
|
|
3687
|
+
if check_need_detect(SearchKey.POSTAL_CODE):
|
|
3688
|
+
maybe_keys = PostalCodeSearchKeyDetector().get_search_key_columns(sample, search_keys)
|
|
3689
|
+
if maybe_keys:
|
|
3690
|
+
new_keys = {key: SearchKey.POSTAL_CODE for key in maybe_keys}
|
|
3691
|
+
search_keys.update(new_keys)
|
|
3692
|
+
self.autodetected_search_keys.update(new_keys)
|
|
3693
|
+
self.logger.info(f"Autodetected search key POSTAL_CODE in column {maybe_keys}")
|
|
3579
3694
|
if not silent_mode:
|
|
3580
|
-
print(self.bundle.get("postal_code_detected").format(
|
|
3695
|
+
print(self.bundle.get("postal_code_detected").format(maybe_keys))
|
|
3581
3696
|
|
|
3582
3697
|
if (
|
|
3583
3698
|
SearchKey.COUNTRY not in search_keys.values()
|
|
3584
3699
|
and self.country_code is None
|
|
3585
3700
|
and check_need_detect(SearchKey.COUNTRY)
|
|
3586
3701
|
):
|
|
3587
|
-
maybe_key = CountrySearchKeyDetector().
|
|
3588
|
-
if maybe_key
|
|
3589
|
-
search_keys[maybe_key] = SearchKey.COUNTRY
|
|
3590
|
-
self.autodetected_search_keys[maybe_key] = SearchKey.COUNTRY
|
|
3702
|
+
maybe_key = CountrySearchKeyDetector().get_search_key_columns(sample, search_keys)
|
|
3703
|
+
if maybe_key:
|
|
3704
|
+
search_keys[maybe_key[0]] = SearchKey.COUNTRY
|
|
3705
|
+
self.autodetected_search_keys[maybe_key[0]] = SearchKey.COUNTRY
|
|
3591
3706
|
self.logger.info(f"Autodetected search key COUNTRY in column {maybe_key}")
|
|
3592
3707
|
if not silent_mode:
|
|
3593
3708
|
print(self.bundle.get("country_detected").format(maybe_key))
|
|
3594
3709
|
|
|
3595
3710
|
if (
|
|
3596
|
-
SearchKey.EMAIL not in search_keys.values()
|
|
3597
|
-
|
|
3711
|
+
# SearchKey.EMAIL not in search_keys.values()
|
|
3712
|
+
SearchKey.HEM not in search_keys.values()
|
|
3598
3713
|
and check_need_detect(SearchKey.HEM)
|
|
3599
3714
|
):
|
|
3600
|
-
|
|
3601
|
-
if
|
|
3715
|
+
maybe_keys = EmailSearchKeyDetector().get_search_key_columns(sample, search_keys)
|
|
3716
|
+
if maybe_keys:
|
|
3602
3717
|
if self.__is_registered or is_demo_dataset:
|
|
3603
|
-
|
|
3604
|
-
|
|
3605
|
-
self.
|
|
3718
|
+
new_keys = {key: SearchKey.EMAIL for key in maybe_keys}
|
|
3719
|
+
search_keys.update(new_keys)
|
|
3720
|
+
self.autodetected_search_keys.update(new_keys)
|
|
3721
|
+
self.logger.info(f"Autodetected search key EMAIL in column {maybe_keys}")
|
|
3606
3722
|
if not silent_mode:
|
|
3607
|
-
print(self.bundle.get("email_detected").format(
|
|
3723
|
+
print(self.bundle.get("email_detected").format(maybe_keys))
|
|
3608
3724
|
else:
|
|
3609
3725
|
self.logger.warning(
|
|
3610
|
-
f"Autodetected search key EMAIL in column {
|
|
3726
|
+
f"Autodetected search key EMAIL in column {maybe_keys}."
|
|
3727
|
+
" But not used because not registered user"
|
|
3611
3728
|
)
|
|
3612
3729
|
if not silent_mode:
|
|
3613
|
-
print(self.bundle.get("email_detected_not_registered").format(
|
|
3730
|
+
print(self.bundle.get("email_detected_not_registered").format(maybe_keys))
|
|
3614
3731
|
self.warning_counter.increment()
|
|
3615
3732
|
|
|
3616
|
-
if SearchKey.PHONE not in search_keys.values() and check_need_detect(SearchKey.PHONE):
|
|
3617
|
-
|
|
3618
|
-
|
|
3733
|
+
# if SearchKey.PHONE not in search_keys.values() and check_need_detect(SearchKey.PHONE):
|
|
3734
|
+
if check_need_detect(SearchKey.PHONE):
|
|
3735
|
+
maybe_keys = PhoneSearchKeyDetector().get_search_key_columns(sample, search_keys)
|
|
3736
|
+
if maybe_keys:
|
|
3619
3737
|
if self.__is_registered or is_demo_dataset:
|
|
3620
|
-
|
|
3621
|
-
|
|
3622
|
-
self.
|
|
3738
|
+
new_keys = {key: SearchKey.PHONE for key in maybe_keys}
|
|
3739
|
+
search_keys.update(new_keys)
|
|
3740
|
+
self.autodetected_search_keys.update(new_keys)
|
|
3741
|
+
self.logger.info(f"Autodetected search key PHONE in column {maybe_keys}")
|
|
3623
3742
|
if not silent_mode:
|
|
3624
|
-
print(self.bundle.get("phone_detected").format(
|
|
3743
|
+
print(self.bundle.get("phone_detected").format(maybe_keys))
|
|
3625
3744
|
else:
|
|
3626
3745
|
self.logger.warning(
|
|
3627
|
-
f"Autodetected search key PHONE in column {
|
|
3746
|
+
f"Autodetected search key PHONE in column {maybe_keys}. "
|
|
3747
|
+
"But not used because not registered user"
|
|
3628
3748
|
)
|
|
3629
3749
|
if not silent_mode:
|
|
3630
3750
|
print(self.bundle.get("phone_detected_not_registered"))
|
upgini/metadata.py
CHANGED
|
@@ -6,6 +6,8 @@ from typing import Dict, List, Optional, Set
|
|
|
6
6
|
from pydantic import BaseModel
|
|
7
7
|
|
|
8
8
|
SYSTEM_RECORD_ID = "system_record_id"
|
|
9
|
+
ENTITY_SYSTEM_RECORD_ID = "entity_system_record_id"
|
|
10
|
+
SEARCH_KEY_UNNEST = "search_key_unnest"
|
|
9
11
|
SORT_ID = "sort_id"
|
|
10
12
|
EVAL_SET_INDEX = "eval_set_index"
|
|
11
13
|
TARGET = "target"
|
|
@@ -13,7 +15,7 @@ COUNTRY = "country_iso_code"
|
|
|
13
15
|
RENAMED_INDEX = "index_col"
|
|
14
16
|
DEFAULT_INDEX = "index"
|
|
15
17
|
ORIGINAL_INDEX = "original_index"
|
|
16
|
-
SYSTEM_COLUMNS = {SYSTEM_RECORD_ID, EVAL_SET_INDEX, TARGET, COUNTRY
|
|
18
|
+
SYSTEM_COLUMNS = {SYSTEM_RECORD_ID, ENTITY_SYSTEM_RECORD_ID, SEARCH_KEY_UNNEST, EVAL_SET_INDEX, TARGET, COUNTRY}
|
|
17
19
|
|
|
18
20
|
|
|
19
21
|
class FileColumnMeaningType(Enum):
|
|
@@ -39,6 +41,8 @@ class FileColumnMeaningType(Enum):
|
|
|
39
41
|
POSTAL_CODE = "POSTAL_CODE"
|
|
40
42
|
SYSTEM_RECORD_ID = "SYSTEM_RECORD_ID"
|
|
41
43
|
EVAL_SET_INDEX = "EVAL_SET_INDEX"
|
|
44
|
+
ENTITY_SYSTEM_RECORD_ID = "ENTITY_SYSTEM_RECORD_ID"
|
|
45
|
+
UNNEST_KEY = "UNNEST_KEY"
|
|
42
46
|
|
|
43
47
|
|
|
44
48
|
class SearchKey(Enum):
|
|
@@ -184,6 +188,10 @@ class FileColumnMetadata(BaseModel):
|
|
|
184
188
|
meaningType: FileColumnMeaningType
|
|
185
189
|
minMaxValues: Optional[NumericInterval] = None
|
|
186
190
|
originalName: Optional[str]
|
|
191
|
+
# is this column contains keys from multiple key columns like msisdn1, msisdn2
|
|
192
|
+
isUnnest: bool = False
|
|
193
|
+
# list of original etalon key column names like msisdn1, msisdn2
|
|
194
|
+
unnestKeyNames: Optional[list[str]]
|
|
187
195
|
|
|
188
196
|
|
|
189
197
|
class FileMetadata(BaseModel):
|
|
@@ -281,7 +289,7 @@ class FeaturesFilter(BaseModel):
|
|
|
281
289
|
|
|
282
290
|
|
|
283
291
|
class RuntimeParameters(BaseModel):
|
|
284
|
-
properties: Dict[str, str] =
|
|
292
|
+
properties: Dict[str, str] = {}
|
|
285
293
|
|
|
286
294
|
|
|
287
295
|
class SearchCustomization(BaseModel):
|
upgini/metrics.py
CHANGED
|
@@ -369,7 +369,7 @@ class EstimatorWrapper:
|
|
|
369
369
|
"logger": logger,
|
|
370
370
|
}
|
|
371
371
|
if estimator is None:
|
|
372
|
-
params =
|
|
372
|
+
params = {}
|
|
373
373
|
params["has_time"] = has_date
|
|
374
374
|
# if metric_name.upper() in SUPPORTED_CATBOOST_METRICS:
|
|
375
375
|
# params["eval_metric"] = SUPPORTED_CATBOOST_METRICS[metric_name.upper()]
|
|
@@ -88,6 +88,7 @@ unsupported_search_key_type=Unsupported type of key in search_keys: {}
|
|
|
88
88
|
search_key_country_and_country_code=\nWARNING: SearchKey.COUNTRY and country_code parameter were passed simultaniously. Parameter country_code will be ignored
|
|
89
89
|
empty_search_key=Search key {} is empty. Please fill values or remove this search key
|
|
90
90
|
single_constant_search_key=\nWARNING: Constant value detected for the {} search key in the X dataframe: {}.\nThat search key will add constant features for different y values.\nPlease add extra search keys with non constant values, like the COUNTRY, POSTAL_CODE, DATE, PHONE NUMBER, EMAIL/HEM or IPv4
|
|
91
|
+
unsupported_multi_key=Search key {} cannot be used multiple times
|
|
91
92
|
unsupported_index_column=\nWARNING: Your column with name `index` was dropped because it's reserved name is booked for system needs.
|
|
92
93
|
date_string_without_format=Date column `{}` has string type, but date_format is not specified. Convert column to datetime type or pass date_format
|
|
93
94
|
invalid_date_format=Failed to parse date in column `{}`. Try to pass explicit date format in date_format argument of FeaturesEnricher constructor
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
from typing import List
|
|
1
|
+
from typing import List
|
|
2
2
|
|
|
3
3
|
import pandas as pd
|
|
4
4
|
|
|
@@ -10,16 +10,18 @@ class BaseSearchKeyDetector:
|
|
|
10
10
|
def _is_search_key_by_values(self, column: pd.Series) -> bool:
|
|
11
11
|
raise NotImplementedError
|
|
12
12
|
|
|
13
|
-
def
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
13
|
+
def _get_search_keys_by_name(self, column_names: List[str]) -> List[str]:
|
|
14
|
+
return [
|
|
15
|
+
column_name
|
|
16
|
+
for column_name in column_names
|
|
17
|
+
if self._is_search_key_by_name(column_name)
|
|
18
|
+
]
|
|
17
19
|
|
|
18
|
-
def
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
for column_name in df.columns:
|
|
20
|
+
def get_search_key_columns(self, df: pd.DataFrame, existing_search_keys: List[str]) -> List[str]:
|
|
21
|
+
other_columns = [col for col in df.columns if col not in existing_search_keys]
|
|
22
|
+
columns_by_names = self._get_search_keys_by_name(other_columns)
|
|
23
|
+
columns_by_values = []
|
|
24
|
+
for column_name in other_columns:
|
|
24
25
|
if self._is_search_key_by_values(df[column_name]):
|
|
25
|
-
|
|
26
|
+
columns_by_values.append(column_name)
|
|
27
|
+
return list(set(columns_by_names + columns_by_values))
|
|
@@ -3,7 +3,15 @@ from typing import Dict, List, Optional, Union
|
|
|
3
3
|
|
|
4
4
|
import pandas as pd
|
|
5
5
|
|
|
6
|
-
from upgini.metadata import
|
|
6
|
+
from upgini.metadata import (
|
|
7
|
+
ENTITY_SYSTEM_RECORD_ID,
|
|
8
|
+
EVAL_SET_INDEX,
|
|
9
|
+
SORT_ID,
|
|
10
|
+
SYSTEM_RECORD_ID,
|
|
11
|
+
TARGET,
|
|
12
|
+
ModelTaskType,
|
|
13
|
+
SearchKey,
|
|
14
|
+
)
|
|
7
15
|
from upgini.resource_bundle import ResourceBundle
|
|
8
16
|
from upgini.utils.datetime_utils import DateTimeSearchKeyConverter
|
|
9
17
|
from upgini.utils.target_utils import define_task
|
|
@@ -143,6 +151,8 @@ def clean_full_duplicates(
|
|
|
143
151
|
unique_columns = df.columns.tolist()
|
|
144
152
|
if SYSTEM_RECORD_ID in unique_columns:
|
|
145
153
|
unique_columns.remove(SYSTEM_RECORD_ID)
|
|
154
|
+
if ENTITY_SYSTEM_RECORD_ID in unique_columns:
|
|
155
|
+
unique_columns.remove(ENTITY_SYSTEM_RECORD_ID)
|
|
146
156
|
if SORT_ID in unique_columns:
|
|
147
157
|
unique_columns.remove(SORT_ID)
|
|
148
158
|
if EVAL_SET_INDEX in unique_columns:
|
upgini/utils/email_utils.py
CHANGED
|
@@ -38,11 +38,13 @@ class EmailSearchKeyConverter:
|
|
|
38
38
|
email_column: str,
|
|
39
39
|
hem_column: Optional[str],
|
|
40
40
|
search_keys: Dict[str, SearchKey],
|
|
41
|
+
unnest_search_keys: Optional[List[str]] = None,
|
|
41
42
|
logger: Optional[logging.Logger] = None,
|
|
42
43
|
):
|
|
43
44
|
self.email_column = email_column
|
|
44
45
|
self.hem_column = hem_column
|
|
45
46
|
self.search_keys = search_keys
|
|
47
|
+
self.unnest_search_keys = unnest_search_keys
|
|
46
48
|
if logger is not None:
|
|
47
49
|
self.logger = logger
|
|
48
50
|
else:
|
|
@@ -80,9 +82,12 @@ class EmailSearchKeyConverter:
|
|
|
80
82
|
del self.search_keys[self.email_column]
|
|
81
83
|
return df
|
|
82
84
|
self.search_keys[self.HEM_COLUMN_NAME] = SearchKey.HEM
|
|
85
|
+
self.unnest_search_keys.append(self.HEM_COLUMN_NAME)
|
|
83
86
|
self.email_converted_to_hem = True
|
|
84
87
|
|
|
85
88
|
del self.search_keys[self.email_column]
|
|
89
|
+
if self.email_column in self.unnest_search_keys:
|
|
90
|
+
self.unnest_search_keys.remove(self.email_column)
|
|
86
91
|
|
|
87
92
|
df[self.EMAIL_ONE_DOMAIN_COLUMN_NAME] = df[self.email_column].apply(self._email_to_one_domain)
|
|
88
93
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: upgini
|
|
3
|
-
Version: 1.1.
|
|
3
|
+
Version: 1.1.300
|
|
4
4
|
Summary: Intelligent data search & enrichment for Machine Learning
|
|
5
5
|
Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
|
|
6
6
|
Project-URL: Homepage, https://upgini.com/
|
|
@@ -26,8 +26,6 @@ Requires-Python: <3.11,>=3.8
|
|
|
26
26
|
Requires-Dist: catboost>=1.0.3
|
|
27
27
|
Requires-Dist: fastparquet>=0.8.1
|
|
28
28
|
Requires-Dist: ipywidgets>=8.1.0
|
|
29
|
-
Requires-Dist: jarowinkler>=2.0.0
|
|
30
|
-
Requires-Dist: levenshtein>=0.25.1
|
|
31
29
|
Requires-Dist: lightgbm>=3.3.2
|
|
32
30
|
Requires-Dist: numpy>=1.19.0
|
|
33
31
|
Requires-Dist: pandas<3.0.0,>=1.1.0
|
|
@@ -133,7 +131,7 @@ Description-Content-Type: text/markdown
|
|
|
133
131
|
|Consumer Confidence index| 44 |22|-|Monthly|date, country|No
|
|
134
132
|
|World economic indicators|191 |41|-|Monthly|date, country|No
|
|
135
133
|
|Markets data|-|17|-|Monthly|date, datetime|No
|
|
136
|
-
|World mobile & fixed broadband network coverage and
|
|
134
|
+
|World mobile & fixed broadband network coverage and performance |167|-|3|Monthly|country, postal/ZIP code|No
|
|
137
135
|
|World demographic data |90|-|2|Annual|country, postal/ZIP code|No
|
|
138
136
|
|World house prices |44|-|3|Annual|country, postal/ZIP code|No
|
|
139
137
|
|Public social media profile data |104|-|-|Monthly|date, email/HEM, phone |Yes
|
|
@@ -842,4 +840,4 @@ Some convenient ways to start contributing are:
|
|
|
842
840
|
- [More perks for registered users](https://profile.upgini.com)
|
|
843
841
|
|
|
844
842
|
<sup>😔 Found mistype or a bug in code snippet? Our bad! <a href="https://github.com/upgini/upgini/issues/new?assignees=&title=readme%2Fbug">
|
|
845
|
-
Please report it here</a></sup>
|
|
843
|
+
Please report it here</a></sup>
|
|
@@ -1,26 +1,26 @@
|
|
|
1
|
-
upgini/__about__.py,sha256=
|
|
1
|
+
upgini/__about__.py,sha256=IcfjUPTVJrTs-NV6tcVvLWMSPbW14GYerq2VeSrrzc0,24
|
|
2
2
|
upgini/__init__.py,sha256=ObEtjFkIssl83qeKNMLpIQygfwK8TzztwiI43YTsAP0,353
|
|
3
3
|
upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
|
|
4
|
-
upgini/dataset.py,sha256=
|
|
4
|
+
upgini/dataset.py,sha256=MOzBVsvzlHLxNfPWtMaXC_jIPeW7_gUvbSGeXnsPgNI,46158
|
|
5
5
|
upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
|
|
6
|
-
upgini/features_enricher.py,sha256=
|
|
6
|
+
upgini/features_enricher.py,sha256=nweJzEV8ZbDN5wvf5Gdf-HgcNz7711gNgmRxz4ZUopI,183112
|
|
7
7
|
upgini/http.py,sha256=bp6jWl422Icy3AhHMdCcJv5NjExE45gSMmzMTPJjPuk,42600
|
|
8
8
|
upgini/lazy_import.py,sha256=EwoM0msNGbSmWBhGbrLDny1DSnOlvTxCjmMKPxYlDms,610
|
|
9
|
-
upgini/metadata.py,sha256=
|
|
10
|
-
upgini/metrics.py,sha256=
|
|
9
|
+
upgini/metadata.py,sha256=wOFCJruDBhC4Hiiiqf8GeHZnnm6rhJy8t6fg5B0Z4TQ,10209
|
|
10
|
+
upgini/metrics.py,sha256=Tu5cN8RlhOSSMWUTXRSkdl8SWBqR1N_2eJpBum9pZxc,30926
|
|
11
11
|
upgini/search_task.py,sha256=LtRJ9bCPjMo1gJ-sUDKERhDwGcWKImrzwVFHjkMSQHQ,17071
|
|
12
12
|
upgini/spinner.py,sha256=4iMd-eIe_BnkqFEMIliULTbj6rNI2HkN_VJ4qYe0cUc,1118
|
|
13
13
|
upgini/version_validator.py,sha256=ddSKUK_-eGJB3NgrqOMoWJU-OxQ253WsNLp8aqJkaIM,1389
|
|
14
14
|
upgini/ads_management/__init__.py,sha256=qzyisOToVRP-tquAJD1PblZhNtMrOB8FiyF9JvfkvgE,50
|
|
15
15
|
upgini/ads_management/ads_manager.py,sha256=igVbN2jz80Umb2BUJixmJVj-zx8unoKpecVo-R-nGdw,2648
|
|
16
16
|
upgini/autofe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
17
|
-
upgini/autofe/all_operands.py,sha256=
|
|
18
|
-
upgini/autofe/binary.py,sha256=
|
|
19
|
-
upgini/autofe/date.py,sha256=
|
|
20
|
-
upgini/autofe/feature.py,sha256=
|
|
17
|
+
upgini/autofe/all_operands.py,sha256=XbvgX2IU4aee9rJZ--d5MdmrfKhON_emle5-RU1qlEY,2506
|
|
18
|
+
upgini/autofe/binary.py,sha256=8FXPJxN7fnC5wphO0Dp1tQCa0lFMSDGQGvBMkSIVAcE,4155
|
|
19
|
+
upgini/autofe/date.py,sha256=8zYVhjl7jVS4xt-IjCgk9px2LHnACX2YlMlmDELlRTc,7943
|
|
20
|
+
upgini/autofe/feature.py,sha256=ayxiF8Ip1ww_pt_BC9Pk127fAHZ_3fuluulS1EYLolk,13423
|
|
21
21
|
upgini/autofe/groupby.py,sha256=4WjDzQxqpZxB79Ih4ihMMI5GDxaFqiH6ZelfV82ClT4,3091
|
|
22
22
|
upgini/autofe/operand.py,sha256=MKEsl3zxpWzRDpTkE0sNJxTu62U20sWOvEKhPjUWS6s,2915
|
|
23
|
-
upgini/autofe/unary.py,sha256=
|
|
23
|
+
upgini/autofe/unary.py,sha256=ZWjLd-CUkNt_PpM8YuWLLipW1v_RdBlsl4JxXIVo9aM,3652
|
|
24
24
|
upgini/autofe/vector.py,sha256=dLxfAstJs-gw_OQ1xxoxcM6pVzORlV0HVzdzt7cLXVQ,606
|
|
25
25
|
upgini/data_source/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
26
26
|
upgini/data_source/data_source_publisher.py,sha256=1cQZrK630VztwGGDp41ec9gqIeUtkefaqSSQEitVWiM,19581
|
|
@@ -30,22 +30,22 @@ upgini/normalizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU
|
|
|
30
30
|
upgini/normalizer/phone_normalizer.py,sha256=EzTaahk6myRv6ZXgbyVFGY4kpo_2VlQgOrm5_lfbmNI,9996
|
|
31
31
|
upgini/resource_bundle/__init__.py,sha256=S5F2G47pnJd2LDpmFsjDqEwiKkP8Hm-hcseDbMka6Ko,8345
|
|
32
32
|
upgini/resource_bundle/exceptions.py,sha256=5fRvx0_vWdE1-7HcSgF0tckB4A9AKyf5RiinZkInTsI,621
|
|
33
|
-
upgini/resource_bundle/strings.properties,sha256=
|
|
33
|
+
upgini/resource_bundle/strings.properties,sha256=6jYqcxj06ZopXwr5YYMGXX1QiNNJNFo2SuwAR0qleRk,26358
|
|
34
34
|
upgini/resource_bundle/strings_widget.properties,sha256=gOdqvZWntP2LCza_tyVk1_yRYcG4c04K9sQOAVhF_gw,1577
|
|
35
35
|
upgini/sampler/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
36
36
|
upgini/sampler/base.py,sha256=7GpjYqjOp58vYcJLiX__1R5wjUlyQbxvHJ2klFnup_M,6389
|
|
37
37
|
upgini/sampler/random_under_sampler.py,sha256=TIbm7ATo-bCMF-IiS5sZeDC1ad1SYg0eY_rRmg84yIQ,4024
|
|
38
38
|
upgini/sampler/utils.py,sha256=PYOk3kKSnFlyxcpdtDNLBEEhTB4lO_iP7pQHqeUcmAc,20211
|
|
39
39
|
upgini/utils/__init__.py,sha256=O_KgzKiJjW3g4NoqZ7lAxUpoHcBi_gze6r3ndEjCH74,842
|
|
40
|
-
upgini/utils/base_search_key_detector.py,sha256=
|
|
40
|
+
upgini/utils/base_search_key_detector.py,sha256=Inc6iGG-VXQdejWFfbekIkZk2ahC4k7CdGqzOkie6Bs,1021
|
|
41
41
|
upgini/utils/blocked_time_series.py,sha256=Uqr3vp4YqNclj2-PzEYqVy763GSXHn86sbpIl1UOB4s,3382
|
|
42
42
|
upgini/utils/country_utils.py,sha256=yE8oRgMpXuJxPfQm4fioY6dg6700HgVnHSk4Cv9sUyM,6511
|
|
43
43
|
upgini/utils/custom_loss_utils.py,sha256=kieNZYBYZm5ZGBltF1F_jOSF4ea6C29rYuCyiDcqVNY,3857
|
|
44
44
|
upgini/utils/cv_utils.py,sha256=w6FQb9nO8BWDx88EF83NpjPLarK4eR4ia0Wg0kLBJC4,3525
|
|
45
45
|
upgini/utils/datetime_utils.py,sha256=Ujmu1ouwSFtG5SywQXJlmtDnGigAnIWPdE5Vx5NvgUM,10951
|
|
46
|
-
upgini/utils/deduplicate_utils.py,sha256=
|
|
46
|
+
upgini/utils/deduplicate_utils.py,sha256=Zvs7zW4QzaERQmJNPrTVf2ZTVBkBLOycFCzyMwtXuV8,8770
|
|
47
47
|
upgini/utils/display_utils.py,sha256=A2ouB5eiZ-Kyt9ykYxkLQwyoRPrdYeJymwNTiajtFXs,10990
|
|
48
|
-
upgini/utils/email_utils.py,sha256=
|
|
48
|
+
upgini/utils/email_utils.py,sha256=aKHa4xVBSsEsiZtFCPj_DrUaFupceYfvJeP_e8w_D5E,3813
|
|
49
49
|
upgini/utils/fallback_progress_bar.py,sha256=PDaKb8dYpVZaWMroNcOHsTc3pSjgi9mOm0--cOFTwJ0,1074
|
|
50
50
|
upgini/utils/features_validator.py,sha256=PgKNt5dyqfErTvjtRNNUS9g7GFqHBtAtnsfA-V5UO1A,3307
|
|
51
51
|
upgini/utils/format.py,sha256=Yv5cvvSs2bOLUzzNu96Pu33VMDNbabio92QepUj41jU,243
|
|
@@ -57,7 +57,7 @@ upgini/utils/sklearn_ext.py,sha256=13jQS_k7v0aUtudXV6nGUEWjttPQzAW9AFYL5wgEz9k,4
|
|
|
57
57
|
upgini/utils/target_utils.py,sha256=Y96_PJ5cC-WsEbeqg20v9uqywDQobLoTb-xoP7S3o4E,7807
|
|
58
58
|
upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
|
|
59
59
|
upgini/utils/warning_counter.py,sha256=dIWBB4dI5XRRJZudvIlqlIYKEiwLLPcXarsZuYRt338,227
|
|
60
|
-
upgini-1.1.
|
|
61
|
-
upgini-1.1.
|
|
62
|
-
upgini-1.1.
|
|
63
|
-
upgini-1.1.
|
|
60
|
+
upgini-1.1.300.dist-info/METADATA,sha256=hLE9o5ZxxN1PnP4WR6ZUnp5yFpEb1cbVSwWEOiQDZBE,48153
|
|
61
|
+
upgini-1.1.300.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
|
|
62
|
+
upgini-1.1.300.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
|
|
63
|
+
upgini-1.1.300.dist-info/RECORD,,
|
|
File without changes
|