upgini 1.2.6a1__tar.gz → 1.2.8__tar.gz
This diff compares the contents of two publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- {upgini-1.2.6a1 → upgini-1.2.8}/PKG-INFO +1 -1
- upgini-1.2.8/src/upgini/__about__.py +1 -0
- {upgini-1.2.6a1 → upgini-1.2.8}/src/upgini/autofe/all_operands.py +2 -1
- {upgini-1.2.6a1 → upgini-1.2.8}/src/upgini/autofe/feature.py +44 -14
- {upgini-1.2.6a1 → upgini-1.2.8}/src/upgini/data_source/data_source_publisher.py +8 -1
- {upgini-1.2.6a1 → upgini-1.2.8}/src/upgini/features_enricher.py +3 -2
- {upgini-1.2.6a1 → upgini-1.2.8}/src/upgini/utils/ip_utils.py +1 -1
- upgini-1.2.6a1/src/upgini/__about__.py +0 -1
- {upgini-1.2.6a1 → upgini-1.2.8}/.gitignore +0 -0
- {upgini-1.2.6a1 → upgini-1.2.8}/LICENSE +0 -0
- {upgini-1.2.6a1 → upgini-1.2.8}/README.md +0 -0
- {upgini-1.2.6a1 → upgini-1.2.8}/pyproject.toml +0 -0
- {upgini-1.2.6a1 → upgini-1.2.8}/src/upgini/__init__.py +0 -0
- {upgini-1.2.6a1 → upgini-1.2.8}/src/upgini/ads.py +0 -0
- {upgini-1.2.6a1 → upgini-1.2.8}/src/upgini/ads_management/__init__.py +0 -0
- {upgini-1.2.6a1 → upgini-1.2.8}/src/upgini/ads_management/ads_manager.py +0 -0
- {upgini-1.2.6a1 → upgini-1.2.8}/src/upgini/autofe/__init__.py +0 -0
- {upgini-1.2.6a1 → upgini-1.2.8}/src/upgini/autofe/binary.py +0 -0
- {upgini-1.2.6a1 → upgini-1.2.8}/src/upgini/autofe/date.py +0 -0
- {upgini-1.2.6a1 → upgini-1.2.8}/src/upgini/autofe/groupby.py +0 -0
- {upgini-1.2.6a1 → upgini-1.2.8}/src/upgini/autofe/operand.py +0 -0
- {upgini-1.2.6a1 → upgini-1.2.8}/src/upgini/autofe/unary.py +0 -0
- {upgini-1.2.6a1 → upgini-1.2.8}/src/upgini/autofe/vector.py +0 -0
- {upgini-1.2.6a1 → upgini-1.2.8}/src/upgini/data_source/__init__.py +0 -0
- {upgini-1.2.6a1 → upgini-1.2.8}/src/upgini/dataset.py +0 -0
- {upgini-1.2.6a1 → upgini-1.2.8}/src/upgini/errors.py +0 -0
- {upgini-1.2.6a1 → upgini-1.2.8}/src/upgini/http.py +0 -0
- {upgini-1.2.6a1 → upgini-1.2.8}/src/upgini/lazy_import.py +0 -0
- {upgini-1.2.6a1 → upgini-1.2.8}/src/upgini/mdc/__init__.py +0 -0
- {upgini-1.2.6a1 → upgini-1.2.8}/src/upgini/mdc/context.py +0 -0
- {upgini-1.2.6a1 → upgini-1.2.8}/src/upgini/metadata.py +0 -0
- {upgini-1.2.6a1 → upgini-1.2.8}/src/upgini/metrics.py +0 -0
- {upgini-1.2.6a1 → upgini-1.2.8}/src/upgini/normalizer/__init__.py +0 -0
- {upgini-1.2.6a1 → upgini-1.2.8}/src/upgini/normalizer/normalize_utils.py +0 -0
- {upgini-1.2.6a1 → upgini-1.2.8}/src/upgini/resource_bundle/__init__.py +0 -0
- {upgini-1.2.6a1 → upgini-1.2.8}/src/upgini/resource_bundle/exceptions.py +0 -0
- {upgini-1.2.6a1 → upgini-1.2.8}/src/upgini/resource_bundle/strings.properties +0 -0
- {upgini-1.2.6a1 → upgini-1.2.8}/src/upgini/resource_bundle/strings_widget.properties +0 -0
- {upgini-1.2.6a1 → upgini-1.2.8}/src/upgini/sampler/__init__.py +0 -0
- {upgini-1.2.6a1 → upgini-1.2.8}/src/upgini/sampler/base.py +0 -0
- {upgini-1.2.6a1 → upgini-1.2.8}/src/upgini/sampler/random_under_sampler.py +0 -0
- {upgini-1.2.6a1 → upgini-1.2.8}/src/upgini/sampler/utils.py +0 -0
- {upgini-1.2.6a1 → upgini-1.2.8}/src/upgini/search_task.py +0 -0
- {upgini-1.2.6a1 → upgini-1.2.8}/src/upgini/spinner.py +0 -0
- {upgini-1.2.6a1 → upgini-1.2.8}/src/upgini/utils/__init__.py +0 -0
- {upgini-1.2.6a1 → upgini-1.2.8}/src/upgini/utils/base_search_key_detector.py +0 -0
- {upgini-1.2.6a1 → upgini-1.2.8}/src/upgini/utils/blocked_time_series.py +0 -0
- {upgini-1.2.6a1 → upgini-1.2.8}/src/upgini/utils/country_utils.py +0 -0
- {upgini-1.2.6a1 → upgini-1.2.8}/src/upgini/utils/custom_loss_utils.py +0 -0
- {upgini-1.2.6a1 → upgini-1.2.8}/src/upgini/utils/cv_utils.py +0 -0
- {upgini-1.2.6a1 → upgini-1.2.8}/src/upgini/utils/datetime_utils.py +0 -0
- {upgini-1.2.6a1 → upgini-1.2.8}/src/upgini/utils/deduplicate_utils.py +0 -0
- {upgini-1.2.6a1 → upgini-1.2.8}/src/upgini/utils/display_utils.py +0 -0
- {upgini-1.2.6a1 → upgini-1.2.8}/src/upgini/utils/email_utils.py +0 -0
- {upgini-1.2.6a1 → upgini-1.2.8}/src/upgini/utils/fallback_progress_bar.py +0 -0
- {upgini-1.2.6a1 → upgini-1.2.8}/src/upgini/utils/features_validator.py +0 -0
- {upgini-1.2.6a1 → upgini-1.2.8}/src/upgini/utils/format.py +0 -0
- {upgini-1.2.6a1 → upgini-1.2.8}/src/upgini/utils/phone_utils.py +0 -0
- {upgini-1.2.6a1 → upgini-1.2.8}/src/upgini/utils/postal_code_utils.py +0 -0
- {upgini-1.2.6a1 → upgini-1.2.8}/src/upgini/utils/progress_bar.py +0 -0
- {upgini-1.2.6a1 → upgini-1.2.8}/src/upgini/utils/sklearn_ext.py +0 -0
- {upgini-1.2.6a1 → upgini-1.2.8}/src/upgini/utils/target_utils.py +0 -0
- {upgini-1.2.6a1 → upgini-1.2.8}/src/upgini/utils/track_info.py +0 -0
- {upgini-1.2.6a1 → upgini-1.2.8}/src/upgini/utils/warning_counter.py +0 -0
- {upgini-1.2.6a1 → upgini-1.2.8}/src/upgini/version_validator.py +0 -0
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "1.2.8"
|
|
@@ -22,6 +22,9 @@ class Column:
|
|
|
22
22
|
def set_op_params(self, params: Dict[str, str]) -> "Column":
|
|
23
23
|
return self
|
|
24
24
|
|
|
25
|
+
def get_op_params(self, **kwargs):
|
|
26
|
+
return dict()
|
|
27
|
+
|
|
25
28
|
def rename_columns(self, mapping: Dict[str, str]) -> "Column":
|
|
26
29
|
self.name = self._unhash(mapping.get(self.name) or self.name)
|
|
27
30
|
return self
|
|
@@ -44,6 +47,10 @@ class Column:
|
|
|
44
47
|
def get_columns(self, **kwargs) -> List[str]:
|
|
45
48
|
return [self.name]
|
|
46
49
|
|
|
50
|
+
@property
|
|
51
|
+
def children(self) -> List[Union["Feature", "Column"]]:
|
|
52
|
+
return []
|
|
53
|
+
|
|
47
54
|
def infer_type(self, data: pd.DataFrame) -> DtypeObj:
|
|
48
55
|
return data[self.name].dtype
|
|
49
56
|
|
|
@@ -88,9 +95,30 @@ class Feature:
|
|
|
88
95
|
self.op.set_params(params)
|
|
89
96
|
|
|
90
97
|
for child in self.children:
|
|
91
|
-
|
|
98
|
+
child_params = {
|
|
99
|
+
k[len(child.get_display_name()) + 1 :]: v
|
|
100
|
+
for k, v in params.items()
|
|
101
|
+
if k.startswith(child.get_display_name())
|
|
102
|
+
}
|
|
103
|
+
if not child_params:
|
|
104
|
+
child_params = params
|
|
105
|
+
child.set_op_params(child_params)
|
|
92
106
|
return self
|
|
93
107
|
|
|
108
|
+
def get_op_params(self, **kwargs) -> Dict[str, str]:
|
|
109
|
+
return {
|
|
110
|
+
k: str(v)
|
|
111
|
+
for k, v in dict(
|
|
112
|
+
(
|
|
113
|
+
(f"{child.get_display_name(**kwargs)}_{k}", v)
|
|
114
|
+
for child in self.children
|
|
115
|
+
for k, v in child.get_op_params(**kwargs).items()
|
|
116
|
+
),
|
|
117
|
+
**(self.op.get_params() or {}),
|
|
118
|
+
).items()
|
|
119
|
+
if v is not None
|
|
120
|
+
}
|
|
121
|
+
|
|
94
122
|
def get_hash(self) -> str:
|
|
95
123
|
return hashlib.sha256(
|
|
96
124
|
"_".join([self.op.name] + [ch.get_display_name() for ch in self.children]).encode("utf-8")
|
|
@@ -326,24 +354,26 @@ class FeatureGroup:
|
|
|
326
354
|
return names
|
|
327
355
|
|
|
328
356
|
def calculate(self, data: pd.DataFrame, is_root=False) -> pd.DataFrame:
|
|
329
|
-
main_column = None if self.main_column_node is None else self.main_column_node.get_columns()[0]
|
|
330
357
|
if isinstance(self.op, PandasOperand):
|
|
331
|
-
|
|
332
|
-
lower_order_children = [
|
|
358
|
+
main_column = None if self.main_column_node is None else self.main_column_node.get_display_name()
|
|
359
|
+
lower_order_children = []
|
|
360
|
+
if self.main_column_node is not None:
|
|
361
|
+
lower_order_children.append(self.main_column_node)
|
|
362
|
+
lower_order_children.extend(
|
|
333
363
|
ch for f in self.children for ch in f.children if ch.get_display_name() != main_column
|
|
334
|
-
|
|
364
|
+
)
|
|
335
365
|
lower_order_names = [ch.get_display_name() for ch in lower_order_children]
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
340
|
-
|
|
341
|
-
child_data.columns = ([main_column] if main_column is not None else []) + lower_order_names
|
|
342
|
-
else:
|
|
343
|
-
child_data = data[columns]
|
|
366
|
+
child_data = pd.concat(
|
|
367
|
+
[ch.calculate(data) for ch in lower_order_children],
|
|
368
|
+
axis=1,
|
|
369
|
+
)
|
|
370
|
+
child_data.columns = lower_order_names
|
|
344
371
|
|
|
345
372
|
new_data = self.op.calculate_group(child_data, main_column=main_column)
|
|
346
|
-
new_data.rename(
|
|
373
|
+
new_data.rename(
|
|
374
|
+
columns=dict(zip((n for n in lower_order_names if n != main_column), self.get_display_names())),
|
|
375
|
+
inplace=True,
|
|
376
|
+
)
|
|
347
377
|
else:
|
|
348
378
|
raise NotImplementedError(f"Unrecognized operator {self.op.name}.")
|
|
349
379
|
|
|
@@ -64,6 +64,7 @@ class DataSourcePublisher:
|
|
|
64
64
|
date_features: Optional[List[str]] = None,
|
|
65
65
|
date_vector_features: Optional[List[str]] = None,
|
|
66
66
|
generate_runtime_embeddings: Optional[List[str]] = None,
|
|
67
|
+
exclude_raw: Optional[List[str]] = None,
|
|
67
68
|
_force_generation=False,
|
|
68
69
|
_silent=False,
|
|
69
70
|
) -> str:
|
|
@@ -88,6 +89,8 @@ class DataSourcePublisher:
|
|
|
88
89
|
features_for_embeddings - optional list of str - list of features that should be used for GPT features
|
|
89
90
|
generation
|
|
90
91
|
|
|
92
|
+
exclude_raw - optional list of str - list of features that should NOT be used as raw features
|
|
93
|
+
|
|
91
94
|
...
|
|
92
95
|
|
|
93
96
|
data_table_id_to_replace - optional str - id of registered ADS that should be replaced by new table
|
|
@@ -166,6 +169,8 @@ class DataSourcePublisher:
|
|
|
166
169
|
request["dateVectorFeatures"] = date_vector_features
|
|
167
170
|
if generate_runtime_embeddings is not None:
|
|
168
171
|
request["generateRuntimeEmbeddingsFeatures"] = generate_runtime_embeddings
|
|
172
|
+
if exclude_raw is not None:
|
|
173
|
+
request["excludeRaw"] = exclude_raw
|
|
169
174
|
self.logger.info(f"Start registering data table {request}")
|
|
170
175
|
|
|
171
176
|
task_id = self._rest_client.register_ads(request, trace_id)
|
|
@@ -281,6 +286,7 @@ class DataSourcePublisher:
|
|
|
281
286
|
date_vector_features: Optional[List[str]] = None,
|
|
282
287
|
exclude_from_autofe_generation: Optional[List[str]] = None,
|
|
283
288
|
generate_runtime_embeddings: Optional[List[str]] = None,
|
|
289
|
+
exclude_raw: Optional[List[str]] = None,
|
|
284
290
|
):
|
|
285
291
|
trace_id = str(uuid.uuid4())
|
|
286
292
|
with MDC(trace_id=trace_id):
|
|
@@ -336,6 +342,8 @@ class DataSourcePublisher:
|
|
|
336
342
|
request["excludeFromGenerationFeatures"] = exclude_from_autofe_generation
|
|
337
343
|
if generate_runtime_embeddings is not None:
|
|
338
344
|
request["generateRuntimeEmbeddingsFeatures"] = generate_runtime_embeddings
|
|
345
|
+
if exclude_raw is not None:
|
|
346
|
+
request["excludeRaw"] = exclude_raw
|
|
339
347
|
self.logger.info(f"Activating data tables with request {request}")
|
|
340
348
|
|
|
341
349
|
self._rest_client.activate_datatables(request, trace_id)
|
|
@@ -378,7 +386,6 @@ class DataSourcePublisher:
|
|
|
378
386
|
search_keys = [k.value.value for k in search_keys] if search_keys else None
|
|
379
387
|
request = {"bqTableId": bq_table_id, "searchKeys": search_keys}
|
|
380
388
|
task_id = self._rest_client.upload_online(request, trace_id)
|
|
381
|
-
print(f"Start polling management task_id={task_id} with trace_id={trace_id}")
|
|
382
389
|
with Spinner():
|
|
383
390
|
status_response = self._rest_client.poll_ads_management_task_status(task_id, trace_id)
|
|
384
391
|
while status_response["status"] not in self.FINAL_STATUSES:
|
|
@@ -2095,7 +2095,9 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2095
2095
|
features_not_to_pass = [
|
|
2096
2096
|
c
|
|
2097
2097
|
for c in df.columns
|
|
2098
|
-
if c not in search_keys.keys()
|
|
2098
|
+
if c not in search_keys.keys()
|
|
2099
|
+
and c not in features_for_transform
|
|
2100
|
+
and c not in [ENTITY_SYSTEM_RECORD_ID, SEARCH_KEY_UNNEST]
|
|
2099
2101
|
]
|
|
2100
2102
|
|
|
2101
2103
|
if add_fit_system_record_id:
|
|
@@ -3235,7 +3237,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3235
3237
|
]
|
|
3236
3238
|
)
|
|
3237
3239
|
|
|
3238
|
-
# TODO some columns not exists
|
|
3239
3240
|
all_other_columns = sorted_other_keys + other_columns
|
|
3240
3241
|
|
|
3241
3242
|
search_keys_hash = "search_keys_hash"
|
|
@@ -104,7 +104,7 @@ class IpSearchKeyConverter:
|
|
|
104
104
|
del self.search_keys[self.ip_column]
|
|
105
105
|
del self.columns_renaming[self.ip_column]
|
|
106
106
|
self.search_keys[ipv6] = SearchKey.IPV6_ADDRESS
|
|
107
|
-
self.columns_renaming[ipv6] = original_ip # could be
|
|
107
|
+
self.columns_renaming[ipv6] = original_ip # could be __unnest_ip...
|
|
108
108
|
|
|
109
109
|
return df
|
|
110
110
|
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
__version__ = "1.2.6a1"
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|