upgini 1.2.55a1__tar.gz → 1.2.56__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- {upgini-1.2.55a1 → upgini-1.2.56}/PKG-INFO +1 -1
- upgini-1.2.56/src/upgini/__about__.py +1 -0
- {upgini-1.2.55a1 → upgini-1.2.56}/src/upgini/dataset.py +8 -16
- {upgini-1.2.55a1 → upgini-1.2.56}/src/upgini/features_enricher.py +25 -15
- {upgini-1.2.55a1 → upgini-1.2.56}/src/upgini/metadata.py +24 -22
- {upgini-1.2.55a1 → upgini-1.2.56}/src/upgini/resource_bundle/strings.properties +0 -1
- {upgini-1.2.55a1 → upgini-1.2.56}/src/upgini/utils/email_utils.py +6 -6
- upgini-1.2.55a1/src/upgini/__about__.py +0 -1
- {upgini-1.2.55a1 → upgini-1.2.56}/.gitignore +0 -0
- {upgini-1.2.55a1 → upgini-1.2.56}/LICENSE +0 -0
- {upgini-1.2.55a1 → upgini-1.2.56}/README.md +0 -0
- {upgini-1.2.55a1 → upgini-1.2.56}/pyproject.toml +0 -0
- {upgini-1.2.55a1 → upgini-1.2.56}/src/upgini/__init__.py +0 -0
- {upgini-1.2.55a1 → upgini-1.2.56}/src/upgini/ads.py +0 -0
- {upgini-1.2.55a1 → upgini-1.2.56}/src/upgini/ads_management/__init__.py +0 -0
- {upgini-1.2.55a1 → upgini-1.2.56}/src/upgini/ads_management/ads_manager.py +0 -0
- {upgini-1.2.55a1 → upgini-1.2.56}/src/upgini/autofe/__init__.py +0 -0
- {upgini-1.2.55a1 → upgini-1.2.56}/src/upgini/autofe/all_operands.py +0 -0
- {upgini-1.2.55a1 → upgini-1.2.56}/src/upgini/autofe/binary.py +0 -0
- {upgini-1.2.55a1 → upgini-1.2.56}/src/upgini/autofe/date.py +0 -0
- {upgini-1.2.55a1 → upgini-1.2.56}/src/upgini/autofe/feature.py +0 -0
- {upgini-1.2.55a1 → upgini-1.2.56}/src/upgini/autofe/groupby.py +0 -0
- {upgini-1.2.55a1 → upgini-1.2.56}/src/upgini/autofe/operand.py +0 -0
- {upgini-1.2.55a1 → upgini-1.2.56}/src/upgini/autofe/unary.py +0 -0
- {upgini-1.2.55a1 → upgini-1.2.56}/src/upgini/autofe/vector.py +0 -0
- {upgini-1.2.55a1 → upgini-1.2.56}/src/upgini/data_source/__init__.py +0 -0
- {upgini-1.2.55a1 → upgini-1.2.56}/src/upgini/data_source/data_source_publisher.py +0 -0
- {upgini-1.2.55a1 → upgini-1.2.56}/src/upgini/errors.py +0 -0
- {upgini-1.2.55a1 → upgini-1.2.56}/src/upgini/http.py +0 -0
- {upgini-1.2.55a1 → upgini-1.2.56}/src/upgini/lazy_import.py +0 -0
- {upgini-1.2.55a1 → upgini-1.2.56}/src/upgini/mdc/__init__.py +0 -0
- {upgini-1.2.55a1 → upgini-1.2.56}/src/upgini/mdc/context.py +0 -0
- {upgini-1.2.55a1 → upgini-1.2.56}/src/upgini/metrics.py +0 -0
- {upgini-1.2.55a1 → upgini-1.2.56}/src/upgini/normalizer/__init__.py +0 -0
- {upgini-1.2.55a1 → upgini-1.2.56}/src/upgini/normalizer/normalize_utils.py +0 -0
- {upgini-1.2.55a1 → upgini-1.2.56}/src/upgini/resource_bundle/__init__.py +0 -0
- {upgini-1.2.55a1 → upgini-1.2.56}/src/upgini/resource_bundle/exceptions.py +0 -0
- {upgini-1.2.55a1 → upgini-1.2.56}/src/upgini/resource_bundle/strings_widget.properties +0 -0
- {upgini-1.2.55a1 → upgini-1.2.56}/src/upgini/sampler/__init__.py +0 -0
- {upgini-1.2.55a1 → upgini-1.2.56}/src/upgini/sampler/base.py +0 -0
- {upgini-1.2.55a1 → upgini-1.2.56}/src/upgini/sampler/random_under_sampler.py +0 -0
- {upgini-1.2.55a1 → upgini-1.2.56}/src/upgini/sampler/utils.py +0 -0
- {upgini-1.2.55a1 → upgini-1.2.56}/src/upgini/search_task.py +0 -0
- {upgini-1.2.55a1 → upgini-1.2.56}/src/upgini/spinner.py +0 -0
- {upgini-1.2.55a1 → upgini-1.2.56}/src/upgini/utils/Roboto-Regular.ttf +0 -0
- {upgini-1.2.55a1 → upgini-1.2.56}/src/upgini/utils/__init__.py +0 -0
- {upgini-1.2.55a1 → upgini-1.2.56}/src/upgini/utils/base_search_key_detector.py +0 -0
- {upgini-1.2.55a1 → upgini-1.2.56}/src/upgini/utils/blocked_time_series.py +0 -0
- {upgini-1.2.55a1 → upgini-1.2.56}/src/upgini/utils/country_utils.py +0 -0
- {upgini-1.2.55a1 → upgini-1.2.56}/src/upgini/utils/custom_loss_utils.py +0 -0
- {upgini-1.2.55a1 → upgini-1.2.56}/src/upgini/utils/cv_utils.py +0 -0
- {upgini-1.2.55a1 → upgini-1.2.56}/src/upgini/utils/datetime_utils.py +0 -0
- {upgini-1.2.55a1 → upgini-1.2.56}/src/upgini/utils/deduplicate_utils.py +0 -0
- {upgini-1.2.55a1 → upgini-1.2.56}/src/upgini/utils/display_utils.py +0 -0
- {upgini-1.2.55a1 → upgini-1.2.56}/src/upgini/utils/fallback_progress_bar.py +0 -0
- {upgini-1.2.55a1 → upgini-1.2.56}/src/upgini/utils/feature_info.py +0 -0
- {upgini-1.2.55a1 → upgini-1.2.56}/src/upgini/utils/features_validator.py +0 -0
- {upgini-1.2.55a1 → upgini-1.2.56}/src/upgini/utils/format.py +0 -0
- {upgini-1.2.55a1 → upgini-1.2.56}/src/upgini/utils/ip_utils.py +0 -0
- {upgini-1.2.55a1 → upgini-1.2.56}/src/upgini/utils/phone_utils.py +0 -0
- {upgini-1.2.55a1 → upgini-1.2.56}/src/upgini/utils/postal_code_utils.py +0 -0
- {upgini-1.2.55a1 → upgini-1.2.56}/src/upgini/utils/progress_bar.py +0 -0
- {upgini-1.2.55a1 → upgini-1.2.56}/src/upgini/utils/sklearn_ext.py +0 -0
- {upgini-1.2.55a1 → upgini-1.2.56}/src/upgini/utils/target_utils.py +0 -0
- {upgini-1.2.55a1 → upgini-1.2.56}/src/upgini/utils/track_info.py +0 -0
- {upgini-1.2.55a1 → upgini-1.2.56}/src/upgini/utils/warning_counter.py +0 -0
- {upgini-1.2.55a1 → upgini-1.2.56}/src/upgini/version_validator.py +0 -0
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "1.2.56"
|
|
@@ -587,23 +587,15 @@ class Dataset: # (pd.DataFrame):
|
|
|
587
587
|
if (
|
|
588
588
|
runtime_parameters is not None
|
|
589
589
|
and runtime_parameters.properties is not None
|
|
590
|
+
and "generate_features" in runtime_parameters.properties
|
|
590
591
|
):
|
|
591
|
-
|
|
592
|
-
|
|
593
|
-
|
|
594
|
-
for f in generate_features:
|
|
595
|
-
|
|
596
|
-
|
|
597
|
-
|
|
598
|
-
runtime_parameters.properties["generate_features"] = ",".join(renamed_generate_features)
|
|
599
|
-
if "columns_for_online_api" in runtime_parameters.properties:
|
|
600
|
-
columns_for_online_api = runtime_parameters.properties["columns_for_online_api"].split(",")
|
|
601
|
-
renamed_columns_for_online_api = []
|
|
602
|
-
for f in columns_for_online_api:
|
|
603
|
-
for new_column, orig_column in self.columns_renaming.items():
|
|
604
|
-
if f == orig_column:
|
|
605
|
-
renamed_columns_for_online_api.append(new_column)
|
|
606
|
-
runtime_parameters.properties["columns_for_online_api"] = ",".join(renamed_columns_for_online_api)
|
|
592
|
+
generate_features = runtime_parameters.properties["generate_features"].split(",")
|
|
593
|
+
renamed_generate_features = []
|
|
594
|
+
for f in generate_features:
|
|
595
|
+
for new_column, orig_column in self.columns_renaming.items():
|
|
596
|
+
if f == orig_column:
|
|
597
|
+
renamed_generate_features.append(new_column)
|
|
598
|
+
runtime_parameters.properties["generate_features"] = ",".join(renamed_generate_features)
|
|
607
599
|
|
|
608
600
|
return runtime_parameters
|
|
609
601
|
|
|
@@ -222,7 +222,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
222
222
|
loss: Optional[str] = None,
|
|
223
223
|
detect_missing_search_keys: bool = True,
|
|
224
224
|
generate_features: Optional[List[str]] = None,
|
|
225
|
-
columns_for_online_api: Optional[List[str]] = None,
|
|
226
225
|
round_embeddings: Optional[int] = None,
|
|
227
226
|
logs_enabled: bool = True,
|
|
228
227
|
raise_validation_error: bool = True,
|
|
@@ -346,9 +345,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
346
345
|
self.logger.error(msg)
|
|
347
346
|
raise ValidationError(msg)
|
|
348
347
|
self.runtime_parameters.properties["round_embeddings"] = round_embeddings
|
|
349
|
-
self.columns_for_online_api = columns_for_online_api
|
|
350
|
-
if columns_for_online_api is not None:
|
|
351
|
-
self.runtime_parameters.properties["columns_for_online_api"] = ",".join(columns_for_online_api)
|
|
352
348
|
maybe_downsampling_limit = self.runtime_parameters.properties.get("downsampling_limit")
|
|
353
349
|
if maybe_downsampling_limit is not None:
|
|
354
350
|
Dataset.FIT_SAMPLE_THRESHOLD = int(maybe_downsampling_limit)
|
|
@@ -2464,7 +2460,19 @@ if response.status_code == 200:
|
|
|
2464
2460
|
if add_fit_system_record_id:
|
|
2465
2461
|
selecting_columns.append(SORT_ID)
|
|
2466
2462
|
|
|
2467
|
-
|
|
2463
|
+
selecting_columns = list(set(selecting_columns))
|
|
2464
|
+
# sorting: first columns from X, then generated features, then enriched features
|
|
2465
|
+
sorted_selecting_columns = [c for c in validated_X.columns if c in selecting_columns]
|
|
2466
|
+
for c in generated_features:
|
|
2467
|
+
if c in selecting_columns and c not in sorted_selecting_columns:
|
|
2468
|
+
sorted_selecting_columns.append(c)
|
|
2469
|
+
for c in result.columns:
|
|
2470
|
+
if c in selecting_columns and c not in sorted_selecting_columns:
|
|
2471
|
+
sorted_selecting_columns.append(c)
|
|
2472
|
+
|
|
2473
|
+
self.logger.info(f"Transform sorted_selecting_columns: {sorted_selecting_columns}")
|
|
2474
|
+
|
|
2475
|
+
result = result[sorted_selecting_columns]
|
|
2468
2476
|
|
|
2469
2477
|
if self.country_added:
|
|
2470
2478
|
result = result.drop(columns=COUNTRY, errors="ignore")
|
|
@@ -2612,18 +2620,17 @@ if response.status_code == 200:
|
|
|
2612
2620
|
checked_generate_features = []
|
|
2613
2621
|
for gen_feature in self.generate_features:
|
|
2614
2622
|
if gen_feature not in x_columns:
|
|
2615
|
-
|
|
2616
|
-
|
|
2623
|
+
if gen_feature == self._get_phone_column(self.search_keys):
|
|
2624
|
+
raise ValidationError(
|
|
2625
|
+
self.bundle.get("missing_generate_feature").format(gen_feature, x_columns)
|
|
2626
|
+
)
|
|
2627
|
+
else:
|
|
2628
|
+
self.__log_warning(self.bundle.get("missing_generate_feature").format(gen_feature, x_columns))
|
|
2617
2629
|
else:
|
|
2618
2630
|
checked_generate_features.append(gen_feature)
|
|
2619
2631
|
self.generate_features = checked_generate_features
|
|
2620
2632
|
self.runtime_parameters.properties["generate_features"] = ",".join(self.generate_features)
|
|
2621
2633
|
|
|
2622
|
-
if self.columns_for_online_api is not None and len(self.columns_for_online_api) > 0:
|
|
2623
|
-
for column in self.columns_for_online_api:
|
|
2624
|
-
if column not in validated_X.columns:
|
|
2625
|
-
raise ValidationError(self.bundle.get("missing_column_for_online_api").format(column))
|
|
2626
|
-
|
|
2627
2634
|
if self.id_columns is not None:
|
|
2628
2635
|
for id_column in self.id_columns:
|
|
2629
2636
|
if id_column not in validated_X.columns:
|
|
@@ -3726,6 +3733,8 @@ if response.status_code == 200:
|
|
|
3726
3733
|
features_info_without_links = []
|
|
3727
3734
|
internal_features_info = []
|
|
3728
3735
|
|
|
3736
|
+
original_shaps = {original_names_dict.get(fm.name, fm.name): fm.shap_value for fm in features_meta}
|
|
3737
|
+
|
|
3729
3738
|
if updated_shaps is not None:
|
|
3730
3739
|
for fm in features_meta:
|
|
3731
3740
|
fm.shap_value = updated_shaps.get(fm.name, 0.0)
|
|
@@ -3737,15 +3746,16 @@ if response.status_code == 200:
|
|
|
3737
3746
|
|
|
3738
3747
|
is_client_feature = feature_meta.name in x_columns
|
|
3739
3748
|
|
|
3740
|
-
|
|
3749
|
+
# TODO make a decision about selected features based on special flag from mlb
|
|
3750
|
+
if original_shaps.get(feature_meta.name, 0.0) == 0.0:
|
|
3741
3751
|
if self.fit_select_features:
|
|
3742
3752
|
self.dropped_client_feature_names_.append(feature_meta.name)
|
|
3743
3753
|
continue
|
|
3744
3754
|
|
|
3745
3755
|
# Use only important features
|
|
3746
3756
|
if (
|
|
3747
|
-
feature_meta.name in self.fit_generated_features
|
|
3748
|
-
|
|
3757
|
+
# feature_meta.name in self.fit_generated_features or
|
|
3758
|
+
feature_meta.name == COUNTRY
|
|
3749
3759
|
# In select_features mode we select also from etalon features and need to show them
|
|
3750
3760
|
or (not self.fit_select_features and is_client_feature)
|
|
3751
3761
|
):
|
|
@@ -89,7 +89,7 @@ class SearchKey(Enum):
|
|
|
89
89
|
if meaning_type == FileColumnMeaningType.EMAIL:
|
|
90
90
|
return SearchKey.EMAIL
|
|
91
91
|
if meaning_type == FileColumnMeaningType.HEM:
|
|
92
|
-
return SearchKey.HEM
|
|
92
|
+
return SearchKey.HEM # TODO check that it wasn't EMAIL
|
|
93
93
|
if meaning_type == FileColumnMeaningType.IP_ADDRESS:
|
|
94
94
|
return SearchKey.IP
|
|
95
95
|
if meaning_type == FileColumnMeaningType.MSISDN:
|
|
@@ -105,27 +105,27 @@ class SearchKey(Enum):
|
|
|
105
105
|
if meaning_type == FileColumnMeaningType.POSTAL_CODE:
|
|
106
106
|
return SearchKey.POSTAL_CODE
|
|
107
107
|
if meaning_type == FileColumnMeaningType.IPV6_ADDRESS:
|
|
108
|
-
return SearchKey.IPV6_ADDRESS
|
|
109
|
-
if meaning_type == FileColumnMeaningType.IPV6_RANGE_FROM:
|
|
110
|
-
|
|
111
|
-
if meaning_type == FileColumnMeaningType.IPV6_RANGE_TO:
|
|
112
|
-
|
|
113
|
-
if meaning_type == FileColumnMeaningType.EMAIL_ONE_DOMAIN:
|
|
114
|
-
|
|
115
|
-
if meaning_type == FileColumnMeaningType.IP_RANGE_FROM:
|
|
116
|
-
|
|
117
|
-
if meaning_type == FileColumnMeaningType.IP_RANGE_TO:
|
|
118
|
-
|
|
119
|
-
if meaning_type == FileColumnMeaningType.MSISDN_RANGE_FROM:
|
|
120
|
-
|
|
121
|
-
if meaning_type == FileColumnMeaningType.MSISDN_RANGE_TO:
|
|
122
|
-
|
|
108
|
+
return SearchKey.IP
|
|
109
|
+
# if meaning_type == FileColumnMeaningType.IPV6_RANGE_FROM:
|
|
110
|
+
# return SearchKey.IPV6_RANGE_FROM
|
|
111
|
+
# if meaning_type == FileColumnMeaningType.IPV6_RANGE_TO:
|
|
112
|
+
# return SearchKey.IPV6_RANGE_TO
|
|
113
|
+
# if meaning_type == FileColumnMeaningType.EMAIL_ONE_DOMAIN:
|
|
114
|
+
# return SearchKey.EMAIL_ONE_DOMAIN
|
|
115
|
+
# if meaning_type == FileColumnMeaningType.IP_RANGE_FROM:
|
|
116
|
+
# return SearchKey.IP_RANGE_FROM
|
|
117
|
+
# if meaning_type == FileColumnMeaningType.IP_RANGE_TO:
|
|
118
|
+
# return SearchKey.IP_RANGE_TO
|
|
119
|
+
# if meaning_type == FileColumnMeaningType.MSISDN_RANGE_FROM:
|
|
120
|
+
# return SearchKey.MSISDN_RANGE_FROM
|
|
121
|
+
# if meaning_type == FileColumnMeaningType.MSISDN_RANGE_TO:
|
|
122
|
+
# return SearchKey.MSISDN_RANGE_TO
|
|
123
123
|
if meaning_type == FileColumnMeaningType.IP_BINARY:
|
|
124
|
-
return SearchKey.IP_BINARY
|
|
125
|
-
if meaning_type == FileColumnMeaningType.IP_RANGE_FROM_BINARY:
|
|
126
|
-
|
|
127
|
-
if meaning_type == FileColumnMeaningType.IP_RANGE_TO_BINARY:
|
|
128
|
-
|
|
124
|
+
return SearchKey.IP
|
|
125
|
+
# if meaning_type == FileColumnMeaningType.IP_RANGE_FROM_BINARY:
|
|
126
|
+
# return SearchKey.IP_RANGE_FROM_BINARY
|
|
127
|
+
# if meaning_type == FileColumnMeaningType.IP_RANGE_TO_BINARY:
|
|
128
|
+
# return SearchKey.IP_RANGE_TO_BINARY
|
|
129
129
|
|
|
130
130
|
@staticmethod
|
|
131
131
|
def find_key(search_keys: Dict[str, SearchKey], keys: Union[SearchKey, List[SearchKey]]) -> Optional[SearchKey]:
|
|
@@ -249,7 +249,9 @@ class FileMetadata(BaseModel):
|
|
|
249
249
|
for key in keys_group:
|
|
250
250
|
column = self.column_by_name(key)
|
|
251
251
|
if column:
|
|
252
|
-
|
|
252
|
+
search_key = SearchKey.from_meaning_type(column.meaningType)
|
|
253
|
+
if search_key is not None:
|
|
254
|
+
search_keys[search_key] = column.name
|
|
253
255
|
return search_keys
|
|
254
256
|
|
|
255
257
|
|
|
@@ -111,7 +111,6 @@ x_is_empty=X is empty
|
|
|
111
111
|
y_is_empty=y is empty
|
|
112
112
|
x_contains_reserved_column_name=Column name {} is reserved. Please rename column and try again
|
|
113
113
|
missing_generate_feature=Feature {} specified in `generate_features` is not present in input columns: {}
|
|
114
|
-
missing_column_for_online_api=Column {} specified in `columns_for_online_api` is not present in input columns: {}
|
|
115
114
|
x_unstable_by_date=Your training sample is unstable in number of rows per date. It is recommended to redesign the training sample
|
|
116
115
|
train_unstable_target=Your training sample contains an unstable target event, PSI = {}. This will lead to unstable scoring on deferred samples. It is recommended to redesign the training sample
|
|
117
116
|
eval_unstable_target=Your training and evaluation samples have a difference in target distribution. PSI = {}. The results will be unstable. It is recommended to redesign the training and evaluation samples
|
|
@@ -116,17 +116,17 @@ class EmailSearchKeyConverter:
|
|
|
116
116
|
else:
|
|
117
117
|
df[self.hem_column] = df[self.hem_column].astype("string").str.lower()
|
|
118
118
|
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
119
|
+
del self.search_keys[self.email_column]
|
|
120
|
+
if self.email_column in self.unnest_search_keys:
|
|
121
|
+
self.unnest_search_keys.remove(self.email_column)
|
|
122
122
|
|
|
123
123
|
one_domain_name = self.email_column + self.ONE_DOMAIN_SUFFIX
|
|
124
124
|
df[one_domain_name] = df[self.email_column].apply(self._email_to_one_domain)
|
|
125
125
|
self.columns_renaming[one_domain_name] = original_email_column
|
|
126
126
|
self.search_keys[one_domain_name] = SearchKey.EMAIL_ONE_DOMAIN
|
|
127
127
|
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
128
|
+
if self.email_converted_to_hem:
|
|
129
|
+
df = df.drop(columns=self.email_column)
|
|
130
|
+
del self.columns_renaming[self.email_column]
|
|
131
131
|
|
|
132
132
|
return df
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
__version__ = "1.2.55a1"
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|