upgini 1.2.58a1__py3-none-any.whl → 1.2.59__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- upgini/__about__.py +1 -1
- upgini/autofe/date.py +8 -0
- upgini/autofe/feature.py +1 -10
- upgini/data_source/data_source_publisher.py +1 -0
- upgini/dataset.py +16 -8
- upgini/features_enricher.py +74 -69
- upgini/resource_bundle/strings.properties +1 -1
- upgini/utils/email_utils.py +6 -6
- {upgini-1.2.58a1.dist-info → upgini-1.2.59.dist-info}/METADATA +2 -2
- {upgini-1.2.58a1.dist-info → upgini-1.2.59.dist-info}/RECORD +12 -14
- upgini/utils/mstats.py +0 -177
- upgini/utils/sort.py +0 -160
- {upgini-1.2.58a1.dist-info → upgini-1.2.59.dist-info}/WHEEL +0 -0
- {upgini-1.2.58a1.dist-info → upgini-1.2.59.dist-info}/licenses/LICENSE +0 -0
upgini/__about__.py
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__ = "1.2.58a1"
|
|
1
|
+
__version__ = "1.2.59"
|
upgini/autofe/date.py
CHANGED
|
@@ -64,6 +64,9 @@ class DateDiff(PandasOperand, DateDiffMixin):
|
|
|
64
64
|
return res
|
|
65
65
|
|
|
66
66
|
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
67
|
+
if left.isna().all() or right.isna().all():
|
|
68
|
+
return pd.Series([None] * len(left))
|
|
69
|
+
|
|
67
70
|
left = self._convert_to_date(left, self.left_unit)
|
|
68
71
|
right = self._convert_to_date(right, self.right_unit)
|
|
69
72
|
diff = self._convert_diff_to_unit(left.dt.date - right.dt.date)
|
|
@@ -142,6 +145,9 @@ class DateListDiff(PandasOperand, DateDiffMixin, ParametrizedOperand):
|
|
|
142
145
|
return cls(aggregation=aggregation)
|
|
143
146
|
|
|
144
147
|
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
148
|
+
if left.isna().all() or right.isna().all():
|
|
149
|
+
return pd.Series([None] * len(left), dtype=np.float64)
|
|
150
|
+
|
|
145
151
|
left = self._convert_to_date(left, self.left_unit)
|
|
146
152
|
right_mask = right.apply(lambda x: len(x) > 0)
|
|
147
153
|
mask = left.notna() & right.notna() & right_mask
|
|
@@ -230,6 +236,8 @@ class DatePercentileBase(PandasOperand, abc.ABC):
|
|
|
230
236
|
pass
|
|
231
237
|
|
|
232
238
|
def _perc(self, f, bounds):
|
|
239
|
+
if f is None or np.isnan(f):
|
|
240
|
+
return np.nan
|
|
233
241
|
hit = np.where(f >= np.array(bounds))[0]
|
|
234
242
|
if hit.size > 0:
|
|
235
243
|
return np.max(hit) + 1
|
upgini/autofe/feature.py
CHANGED
|
@@ -26,18 +26,9 @@ class Column:
|
|
|
26
26
|
return dict()
|
|
27
27
|
|
|
28
28
|
def rename_columns(self, mapping: Dict[str, str]) -> "Column":
|
|
29
|
-
self.name =
|
|
29
|
+
self.name = mapping.get(self.name) or self.name
|
|
30
30
|
return self
|
|
31
31
|
|
|
32
|
-
def _unhash(self, feature_name: str) -> str:
|
|
33
|
-
last_component_idx = feature_name.rfind("_")
|
|
34
|
-
if not feature_name.startswith("f_"):
|
|
35
|
-
return feature_name # etalon feature
|
|
36
|
-
elif last_component_idx == 1:
|
|
37
|
-
return feature_name[2:] # fully hashed name, cannot unhash
|
|
38
|
-
else:
|
|
39
|
-
return feature_name[2:last_component_idx]
|
|
40
|
-
|
|
41
32
|
def delete_data(self):
|
|
42
33
|
self.data = None
|
|
43
34
|
|
|
@@ -386,6 +386,7 @@ class DataSourcePublisher:
|
|
|
386
386
|
search_keys = [k.value.value for k in search_keys] if search_keys else None
|
|
387
387
|
request = {"bqTableId": bq_table_id, "searchKeys": search_keys}
|
|
388
388
|
task_id = self._rest_client.upload_online(request, trace_id)
|
|
389
|
+
print(f"Uploading online task created. task_id={task_id}")
|
|
389
390
|
with Spinner():
|
|
390
391
|
status_response = self._rest_client.poll_ads_management_task_status(task_id, trace_id)
|
|
391
392
|
while status_response["status"] not in self.FINAL_STATUSES:
|
upgini/dataset.py
CHANGED
|
@@ -587,15 +587,23 @@ class Dataset: # (pd.DataFrame):
|
|
|
587
587
|
if (
|
|
588
588
|
runtime_parameters is not None
|
|
589
589
|
and runtime_parameters.properties is not None
|
|
590
|
-
and "generate_features" in runtime_parameters.properties
|
|
591
590
|
):
|
|
592
|
-
generate_features
|
|
593
|
-
|
|
594
|
-
|
|
595
|
-
for
|
|
596
|
-
|
|
597
|
-
|
|
598
|
-
|
|
591
|
+
if "generate_features" in runtime_parameters.properties:
|
|
592
|
+
generate_features = runtime_parameters.properties["generate_features"].split(",")
|
|
593
|
+
renamed_generate_features = []
|
|
594
|
+
for f in generate_features:
|
|
595
|
+
for new_column, orig_column in self.columns_renaming.items():
|
|
596
|
+
if f == orig_column:
|
|
597
|
+
renamed_generate_features.append(new_column)
|
|
598
|
+
runtime_parameters.properties["generate_features"] = ",".join(renamed_generate_features)
|
|
599
|
+
if "columns_for_online_api" in runtime_parameters.properties:
|
|
600
|
+
columns_for_online_api = runtime_parameters.properties["columns_for_online_api"].split(",")
|
|
601
|
+
renamed_columns_for_online_api = []
|
|
602
|
+
for f in columns_for_online_api:
|
|
603
|
+
for new_column, orig_column in self.columns_renaming.items():
|
|
604
|
+
if f == orig_column:
|
|
605
|
+
renamed_columns_for_online_api.append(new_column)
|
|
606
|
+
runtime_parameters.properties["columns_for_online_api"] = ",".join(renamed_columns_for_online_api)
|
|
599
607
|
|
|
600
608
|
return runtime_parameters
|
|
601
609
|
|
upgini/features_enricher.py
CHANGED
|
@@ -112,7 +112,6 @@ try:
|
|
|
112
112
|
except Exception:
|
|
113
113
|
from upgini.utils.fallback_progress_bar import CustomFallbackProgressBar as ProgressBar
|
|
114
114
|
|
|
115
|
-
from upgini.utils.sort import sort_columns
|
|
116
115
|
from upgini.utils.target_utils import (
|
|
117
116
|
balance_undersample_forced,
|
|
118
117
|
calculate_psi,
|
|
@@ -223,6 +222,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
223
222
|
loss: Optional[str] = None,
|
|
224
223
|
detect_missing_search_keys: bool = True,
|
|
225
224
|
generate_features: Optional[List[str]] = None,
|
|
225
|
+
columns_for_online_api: Optional[List[str]] = None,
|
|
226
226
|
round_embeddings: Optional[int] = None,
|
|
227
227
|
logs_enabled: bool = True,
|
|
228
228
|
raise_validation_error: bool = True,
|
|
@@ -346,6 +346,9 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
346
346
|
self.logger.error(msg)
|
|
347
347
|
raise ValidationError(msg)
|
|
348
348
|
self.runtime_parameters.properties["round_embeddings"] = round_embeddings
|
|
349
|
+
self.columns_for_online_api = columns_for_online_api
|
|
350
|
+
if columns_for_online_api is not None:
|
|
351
|
+
self.runtime_parameters.properties["columns_for_online_api"] = ",".join(columns_for_online_api)
|
|
349
352
|
maybe_downsampling_limit = self.runtime_parameters.properties.get("downsampling_limit")
|
|
350
353
|
if maybe_downsampling_limit is not None:
|
|
351
354
|
Dataset.FIT_SAMPLE_THRESHOLD = int(maybe_downsampling_limit)
|
|
@@ -1258,7 +1261,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1258
1261
|
for feature, shap in new_shaps.items()
|
|
1259
1262
|
if feature in self.feature_names_ or renaming.get(feature, feature) in self.feature_names_
|
|
1260
1263
|
}
|
|
1261
|
-
self.__prepare_feature_importances(trace_id, x_columns, new_shaps)
|
|
1264
|
+
self.__prepare_feature_importances(trace_id, x_columns, new_shaps, silent=True)
|
|
1262
1265
|
|
|
1263
1266
|
if self.features_info_display_handle is not None:
|
|
1264
1267
|
try:
|
|
@@ -1735,7 +1738,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1735
1738
|
self.logger.info(f"Downsampling from {num_samples} to {sample_rows}")
|
|
1736
1739
|
df = df.sample(n=sample_rows, random_state=self.random_state)
|
|
1737
1740
|
|
|
1738
|
-
df = self.__add_fit_system_record_id(df, search_keys, SYSTEM_RECORD_ID
|
|
1741
|
+
df = self.__add_fit_system_record_id(df, search_keys, SYSTEM_RECORD_ID)
|
|
1739
1742
|
if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
|
|
1740
1743
|
df = df.drop(columns=DateTimeSearchKeyConverter.DATETIME_COL)
|
|
1741
1744
|
|
|
@@ -1874,13 +1877,9 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1874
1877
|
|
|
1875
1878
|
# downsample if need to eval_set threshold
|
|
1876
1879
|
num_samples = _num_samples(df)
|
|
1877
|
-
phone_column = self._get_phone_column(self.search_keys)
|
|
1878
1880
|
force_downsampling = (
|
|
1879
1881
|
not self.disable_force_downsampling
|
|
1880
|
-
and self.
|
|
1881
|
-
and phone_column is not None
|
|
1882
|
-
and self.fit_columns_renaming is not None
|
|
1883
|
-
and self.fit_columns_renaming.get(phone_column) in self.generate_features
|
|
1882
|
+
and self.columns_for_online_api is not None
|
|
1884
1883
|
and num_samples > Dataset.FORCE_SAMPLE_SIZE
|
|
1885
1884
|
)
|
|
1886
1885
|
if force_downsampling:
|
|
@@ -1916,7 +1915,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1916
1915
|
progress_bar=progress_bar,
|
|
1917
1916
|
progress_callback=progress_callback,
|
|
1918
1917
|
add_fit_system_record_id=True,
|
|
1919
|
-
target_name=tmp_target_name,
|
|
1920
1918
|
)
|
|
1921
1919
|
if enriched_df is None:
|
|
1922
1920
|
return None
|
|
@@ -1950,7 +1948,27 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1950
1948
|
df, _ = clean_full_duplicates(df, logger=self.logger, bundle=self.bundle)
|
|
1951
1949
|
|
|
1952
1950
|
num_samples = _num_samples(df)
|
|
1953
|
-
|
|
1951
|
+
force_downsampling = (
|
|
1952
|
+
not self.disable_force_downsampling
|
|
1953
|
+
and self.columns_for_online_api is not None
|
|
1954
|
+
and num_samples > Dataset.FORCE_SAMPLE_SIZE
|
|
1955
|
+
)
|
|
1956
|
+
if force_downsampling:
|
|
1957
|
+
self.logger.info(f"Force downsampling from {num_samples} to {Dataset.FORCE_SAMPLE_SIZE}")
|
|
1958
|
+
df = balance_undersample_forced(
|
|
1959
|
+
df=df,
|
|
1960
|
+
target_column=TARGET,
|
|
1961
|
+
id_columns=self.id_columns,
|
|
1962
|
+
date_column=self._get_date_column(self.search_keys),
|
|
1963
|
+
task_type=self.model_task_type,
|
|
1964
|
+
cv_type=self.cv,
|
|
1965
|
+
random_state=self.random_state,
|
|
1966
|
+
sample_size=Dataset.FORCE_SAMPLE_SIZE,
|
|
1967
|
+
logger=self.logger,
|
|
1968
|
+
bundle=self.bundle,
|
|
1969
|
+
warning_callback=self.__log_warning,
|
|
1970
|
+
)
|
|
1971
|
+
elif num_samples > Dataset.FIT_SAMPLE_THRESHOLD:
|
|
1954
1972
|
self.logger.info(f"Downsampling from {num_samples} to {Dataset.FIT_SAMPLE_ROWS}")
|
|
1955
1973
|
df = df.sample(n=Dataset.FIT_SAMPLE_ROWS, random_state=self.random_state)
|
|
1956
1974
|
|
|
@@ -1966,7 +1984,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1966
1984
|
progress_bar=progress_bar,
|
|
1967
1985
|
progress_callback=progress_callback,
|
|
1968
1986
|
add_fit_system_record_id=True,
|
|
1969
|
-
target_name=tmp_target_name,
|
|
1970
1987
|
)
|
|
1971
1988
|
if enriched_Xy is None:
|
|
1972
1989
|
return None
|
|
@@ -2128,7 +2145,6 @@ if response.status_code == 200:
|
|
|
2128
2145
|
progress_bar: Optional[ProgressBar] = None,
|
|
2129
2146
|
progress_callback: Optional[Callable[[SearchProgress], Any]] = None,
|
|
2130
2147
|
add_fit_system_record_id: bool = False,
|
|
2131
|
-
target_name: Optional[str] = None,
|
|
2132
2148
|
) -> Tuple[pd.DataFrame, Dict[str, str], List[str]]:
|
|
2133
2149
|
if self._search_task is None:
|
|
2134
2150
|
raise NotFittedError(self.bundle.get("transform_unfitted_enricher"))
|
|
@@ -2313,11 +2329,8 @@ if response.status_code == 200:
|
|
|
2313
2329
|
and c not in [ENTITY_SYSTEM_RECORD_ID, SEARCH_KEY_UNNEST]
|
|
2314
2330
|
]
|
|
2315
2331
|
|
|
2316
|
-
if add_fit_system_record_id
|
|
2317
|
-
|
|
2318
|
-
df = self.__add_fit_system_record_id(
|
|
2319
|
-
df, search_keys, SYSTEM_RECORD_ID, reversed_columns_renaming.get(target_name, target_name)
|
|
2320
|
-
)
|
|
2332
|
+
if add_fit_system_record_id:
|
|
2333
|
+
df = self.__add_fit_system_record_id(df, search_keys, SYSTEM_RECORD_ID)
|
|
2321
2334
|
df = df.rename(columns={SYSTEM_RECORD_ID: SORT_ID})
|
|
2322
2335
|
features_not_to_pass.append(SORT_ID)
|
|
2323
2336
|
|
|
@@ -2627,17 +2640,18 @@ if response.status_code == 200:
|
|
|
2627
2640
|
checked_generate_features = []
|
|
2628
2641
|
for gen_feature in self.generate_features:
|
|
2629
2642
|
if gen_feature not in x_columns:
|
|
2630
|
-
|
|
2631
|
-
|
|
2632
|
-
self.bundle.get("missing_generate_feature").format(gen_feature, x_columns)
|
|
2633
|
-
)
|
|
2634
|
-
else:
|
|
2635
|
-
self.__log_warning(self.bundle.get("missing_generate_feature").format(gen_feature, x_columns))
|
|
2643
|
+
msg = self.bundle.get("missing_generate_feature").format(gen_feature, x_columns)
|
|
2644
|
+
self.__log_warning(msg)
|
|
2636
2645
|
else:
|
|
2637
2646
|
checked_generate_features.append(gen_feature)
|
|
2638
2647
|
self.generate_features = checked_generate_features
|
|
2639
2648
|
self.runtime_parameters.properties["generate_features"] = ",".join(self.generate_features)
|
|
2640
2649
|
|
|
2650
|
+
if self.columns_for_online_api is not None and len(self.columns_for_online_api) > 0:
|
|
2651
|
+
for column in self.columns_for_online_api:
|
|
2652
|
+
if column not in validated_X.columns:
|
|
2653
|
+
raise ValidationError(self.bundle.get("missing_column_for_online_api").format(column))
|
|
2654
|
+
|
|
2641
2655
|
if self.id_columns is not None:
|
|
2642
2656
|
for id_column in self.id_columns:
|
|
2643
2657
|
if id_column not in validated_X.columns:
|
|
@@ -2761,7 +2775,7 @@ if response.status_code == 200:
|
|
|
2761
2775
|
self.__log_warning(full_duplicates_warning)
|
|
2762
2776
|
|
|
2763
2777
|
# Explode multiple search keys
|
|
2764
|
-
df = self.__add_fit_system_record_id(df, self.fit_search_keys, ENTITY_SYSTEM_RECORD_ID
|
|
2778
|
+
df = self.__add_fit_system_record_id(df, self.fit_search_keys, ENTITY_SYSTEM_RECORD_ID)
|
|
2765
2779
|
|
|
2766
2780
|
# TODO check that this is correct for enrichment
|
|
2767
2781
|
self.df_with_original_index = df.copy()
|
|
@@ -2843,7 +2857,7 @@ if response.status_code == 200:
|
|
|
2843
2857
|
if eval_set is not None and len(eval_set) > 0:
|
|
2844
2858
|
meaning_types[EVAL_SET_INDEX] = FileColumnMeaningType.EVAL_SET_INDEX
|
|
2845
2859
|
|
|
2846
|
-
df = self.__add_fit_system_record_id(df, self.fit_search_keys, SYSTEM_RECORD_ID
|
|
2860
|
+
df = self.__add_fit_system_record_id(df, self.fit_search_keys, SYSTEM_RECORD_ID)
|
|
2847
2861
|
|
|
2848
2862
|
if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
|
|
2849
2863
|
df = df.drop(columns=DateTimeSearchKeyConverter.DATETIME_COL)
|
|
@@ -2859,9 +2873,7 @@ if response.status_code == 200:
|
|
|
2859
2873
|
# Force downsampling to 7000 for API features generation
|
|
2860
2874
|
force_downsampling = (
|
|
2861
2875
|
not self.disable_force_downsampling
|
|
2862
|
-
and self.
|
|
2863
|
-
and phone_column is not None
|
|
2864
|
-
and self.fit_columns_renaming[phone_column] in self.generate_features
|
|
2876
|
+
and self.columns_for_online_api is not None
|
|
2865
2877
|
and len(df) > Dataset.FORCE_SAMPLE_SIZE
|
|
2866
2878
|
)
|
|
2867
2879
|
if force_downsampling:
|
|
@@ -3535,60 +3547,53 @@ if response.status_code == 200:
|
|
|
3535
3547
|
# meaning_types: Dict[str, FileColumnMeaningType],
|
|
3536
3548
|
search_keys: Dict[str, SearchKey],
|
|
3537
3549
|
id_name: str,
|
|
3538
|
-
target_name: str,
|
|
3539
3550
|
) -> pd.DataFrame:
|
|
3551
|
+
# save original order or rows
|
|
3540
3552
|
original_index_name = df.index.name
|
|
3541
3553
|
index_name = df.index.name or DEFAULT_INDEX
|
|
3542
3554
|
original_order_name = "original_order"
|
|
3543
|
-
# Save original index
|
|
3544
3555
|
df = df.reset_index().rename(columns={index_name: ORIGINAL_INDEX})
|
|
3545
|
-
# Save original order
|
|
3546
3556
|
df = df.reset_index().rename(columns={DEFAULT_INDEX: original_order_name})
|
|
3547
3557
|
|
|
3548
|
-
# order by date and idempotent order by other keys
|
|
3558
|
+
# order by date and idempotent order by other keys
|
|
3559
|
+
if self.cv not in [CVType.time_series, CVType.blocked_time_series]:
|
|
3560
|
+
sort_exclude_columns = [
|
|
3561
|
+
original_order_name,
|
|
3562
|
+
ORIGINAL_INDEX,
|
|
3563
|
+
EVAL_SET_INDEX,
|
|
3564
|
+
TARGET,
|
|
3565
|
+
"__target",
|
|
3566
|
+
ENTITY_SYSTEM_RECORD_ID,
|
|
3567
|
+
]
|
|
3568
|
+
if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
|
|
3569
|
+
date_column = DateTimeSearchKeyConverter.DATETIME_COL
|
|
3570
|
+
sort_exclude_columns.append(self._get_date_column(search_keys))
|
|
3571
|
+
else:
|
|
3572
|
+
date_column = self._get_date_column(search_keys)
|
|
3573
|
+
sort_columns = [date_column] if date_column is not None else []
|
|
3549
3574
|
|
|
3550
|
-
|
|
3551
|
-
|
|
3552
|
-
ORIGINAL_INDEX,
|
|
3553
|
-
EVAL_SET_INDEX,
|
|
3554
|
-
TARGET,
|
|
3555
|
-
"__target",
|
|
3556
|
-
ENTITY_SYSTEM_RECORD_ID,
|
|
3557
|
-
]
|
|
3558
|
-
if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
|
|
3559
|
-
date_column = DateTimeSearchKeyConverter.DATETIME_COL
|
|
3560
|
-
sort_exclude_columns.append(self._get_date_column(search_keys))
|
|
3561
|
-
else:
|
|
3562
|
-
date_column = self._get_date_column(search_keys)
|
|
3563
|
-
sort_exclude_columns.append(date_column)
|
|
3564
|
-
columns_to_sort = [date_column] if date_column is not None else []
|
|
3575
|
+
sorted_other_keys = sorted(search_keys, key=lambda x: str(search_keys.get(x)))
|
|
3576
|
+
sorted_other_keys = [k for k in sorted_other_keys if k not in sort_exclude_columns]
|
|
3565
3577
|
|
|
3566
|
-
|
|
3567
|
-
|
|
3568
|
-
|
|
3569
|
-
|
|
3570
|
-
|
|
3571
|
-
|
|
3578
|
+
other_columns = sorted(
|
|
3579
|
+
[
|
|
3580
|
+
c
|
|
3581
|
+
for c in df.columns
|
|
3582
|
+
if c not in sort_columns
|
|
3583
|
+
and c not in sorted_other_keys
|
|
3584
|
+
and c not in sort_exclude_columns
|
|
3585
|
+
and df[c].nunique() > 1
|
|
3586
|
+
]
|
|
3587
|
+
)
|
|
3572
3588
|
|
|
3573
|
-
|
|
3574
|
-
if duplicates.any():
|
|
3575
|
-
self.__log_warning(self.bundle.get("date_and_id_columns_duplicates").format(duplicates.sum()))
|
|
3576
|
-
do_sorting = False
|
|
3577
|
-
else:
|
|
3578
|
-
columns_to_hash = list(search_keys.keys()) + self.id_columns
|
|
3579
|
-
columns_to_hash = sort_columns(
|
|
3580
|
-
df[columns_to_hash], target_name, search_keys, self.model_task_type, sort_exclude_columns
|
|
3581
|
-
)
|
|
3582
|
-
else:
|
|
3583
|
-
columns_to_hash = sort_columns(df, target_name, search_keys, self.model_task_type, sort_exclude_columns)
|
|
3589
|
+
all_other_columns = sorted_other_keys + other_columns
|
|
3584
3590
|
|
|
3585
|
-
if do_sorting:
|
|
3586
3591
|
search_keys_hash = "search_keys_hash"
|
|
3587
|
-
if len(
|
|
3588
|
-
|
|
3589
|
-
|
|
3592
|
+
if len(all_other_columns) > 0:
|
|
3593
|
+
sort_columns.append(search_keys_hash)
|
|
3594
|
+
df[search_keys_hash] = pd.util.hash_pandas_object(df[all_other_columns], index=False)
|
|
3590
3595
|
|
|
3591
|
-
df = df.sort_values(by=
|
|
3596
|
+
df = df.sort_values(by=sort_columns)
|
|
3592
3597
|
|
|
3593
3598
|
if search_keys_hash in df.columns:
|
|
3594
3599
|
df.drop(columns=search_keys_hash, inplace=True)
|
|
@@ -35,7 +35,6 @@ trial_quota_limit_riched=You have reached the quota limit of trial data usage. P
|
|
|
35
35
|
loss_selection_warn=Loss `{0}` is not supported for feature selection with {1}
|
|
36
36
|
loss_calc_metrics_warn=Loss `{0}` is not supported for metrics calculation with {1}
|
|
37
37
|
multivariate_timeseries_detected=Multivariate TimeSeries detected. Blocked time series cross-validation split selected.\nMore details: https://github.com/upgini/upgini#-time-series-prediction-support
|
|
38
|
-
date_and_id_columns_duplicates=Found {} duplicate rows by date and id_columns
|
|
39
38
|
group_k_fold_in_classification=Using group K-fold cross-validation split for classification task.
|
|
40
39
|
current_date_added=No date/datetime column was detected in X to be used as a search key. The current date will be used to match the latest version of data sources
|
|
41
40
|
# Errors
|
|
@@ -112,6 +111,7 @@ x_is_empty=X is empty
|
|
|
112
111
|
y_is_empty=y is empty
|
|
113
112
|
x_contains_reserved_column_name=Column name {} is reserved. Please rename column and try again
|
|
114
113
|
missing_generate_feature=Feature {} specified in `generate_features` is not present in input columns: {}
|
|
114
|
+
missing_column_for_online_api=Column {} specified in `columns_for_online_api` is not present in input columns: {}
|
|
115
115
|
x_unstable_by_date=Your training sample is unstable in number of rows per date. It is recommended to redesign the training sample
|
|
116
116
|
train_unstable_target=Your training sample contains an unstable target event, PSI = {}. This will lead to unstable scoring on deferred samples. It is recommended to redesign the training sample
|
|
117
117
|
eval_unstable_target=Your training and evaluation samples have a difference in target distribution. PSI = {}. The results will be unstable. It is recommended to redesign the training and evaluation samples
|
upgini/utils/email_utils.py
CHANGED
|
@@ -116,17 +116,17 @@ class EmailSearchKeyConverter:
|
|
|
116
116
|
else:
|
|
117
117
|
df[self.hem_column] = df[self.hem_column].astype("string").str.lower()
|
|
118
118
|
|
|
119
|
-
del self.search_keys[self.email_column]
|
|
120
|
-
if self.email_column in self.unnest_search_keys:
|
|
121
|
-
|
|
119
|
+
# del self.search_keys[self.email_column]
|
|
120
|
+
# if self.email_column in self.unnest_search_keys:
|
|
121
|
+
# self.unnest_search_keys.remove(self.email_column)
|
|
122
122
|
|
|
123
123
|
one_domain_name = self.email_column + self.ONE_DOMAIN_SUFFIX
|
|
124
124
|
df[one_domain_name] = df[self.email_column].apply(self._email_to_one_domain)
|
|
125
125
|
self.columns_renaming[one_domain_name] = original_email_column
|
|
126
126
|
self.search_keys[one_domain_name] = SearchKey.EMAIL_ONE_DOMAIN
|
|
127
127
|
|
|
128
|
-
if self.email_converted_to_hem:
|
|
129
|
-
|
|
130
|
-
|
|
128
|
+
# if self.email_converted_to_hem:
|
|
129
|
+
# df = df.drop(columns=self.email_column)
|
|
130
|
+
# del self.columns_renaming[self.email_column]
|
|
131
131
|
|
|
132
132
|
return df
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: upgini
|
|
3
|
-
Version: 1.2.58a1
|
|
3
|
+
Version: 1.2.59
|
|
4
4
|
Summary: Intelligent data search & enrichment for Machine Learning
|
|
5
5
|
Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
|
|
6
6
|
Project-URL: Homepage, https://upgini.com/
|
|
@@ -34,7 +34,7 @@ Requires-Dist: pydantic<3.0.0,>1.0.0
|
|
|
34
34
|
Requires-Dist: pyjwt>=2.8.0
|
|
35
35
|
Requires-Dist: python-bidi==0.4.2
|
|
36
36
|
Requires-Dist: python-dateutil>=2.8.0
|
|
37
|
-
Requires-Dist: python-json-logger>=
|
|
37
|
+
Requires-Dist: python-json-logger>=3.3.0
|
|
38
38
|
Requires-Dist: requests>=2.8.0
|
|
39
39
|
Requires-Dist: scikit-learn>=1.3.0
|
|
40
40
|
Requires-Dist: xhtml2pdf<0.3.0,>=0.2.11
|
|
@@ -1,9 +1,9 @@
|
|
|
1
|
-
upgini/__about__.py,sha256=
|
|
1
|
+
upgini/__about__.py,sha256=sMabNMxFs5tlVPv0grJYMF0DEVkcmjezcGHIUNpibDk,23
|
|
2
2
|
upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
|
|
3
3
|
upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
|
|
4
|
-
upgini/dataset.py,sha256=
|
|
4
|
+
upgini/dataset.py,sha256=NP5vHqEfZQ1HWz3TcNAa_OhXG8wiMRdydm26D6UBiRU,34166
|
|
5
5
|
upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
|
|
6
|
-
upgini/features_enricher.py,sha256=
|
|
6
|
+
upgini/features_enricher.py,sha256=IXU6ahvQqMGLdZsrHCjOGEia1pBAgixfld3pNVPcGEM,202468
|
|
7
7
|
upgini/http.py,sha256=ud0Cp7h0jNeHuuZGpU_1dAAEiabGoJjGxc1X5oeBQr4,43496
|
|
8
8
|
upgini/lazy_import.py,sha256=74gQ8JuA48BGRLxAo7lNHNKY2D2emMxrUxKGdxVGhuY,1012
|
|
9
9
|
upgini/metadata.py,sha256=Jh6YTaS00m_nbaOY_owvlSyn9zgkErkqu8iTr9ZjKI8,12279
|
|
@@ -16,21 +16,21 @@ upgini/ads_management/ads_manager.py,sha256=igVbN2jz80Umb2BUJixmJVj-zx8unoKpecVo
|
|
|
16
16
|
upgini/autofe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
17
17
|
upgini/autofe/all_operands.py,sha256=v0_NozalvvzeojSAA0d7UJ5INS654ZVaLn4S8djK6Ac,329
|
|
18
18
|
upgini/autofe/binary.py,sha256=zMhtHVuGUAFLUqem-XiXqJj-GRXxS88tdz8tFuDfSNM,7659
|
|
19
|
-
upgini/autofe/date.py,sha256=
|
|
20
|
-
upgini/autofe/feature.py,sha256=
|
|
19
|
+
upgini/autofe/date.py,sha256=oykxfmny4LOr6m79IipOUCtk2JQSUdSCWHh8K9n7nek,10726
|
|
20
|
+
upgini/autofe/feature.py,sha256=zvRdlxCkaOsX0XiragNvh0tAPyOWut0MQTq5JGU5HtY,14749
|
|
21
21
|
upgini/autofe/groupby.py,sha256=G48_sQZw016eGx3cOy8YQrEIOp95puWqYUpFWd-gdeM,3595
|
|
22
22
|
upgini/autofe/operand.py,sha256=8Ttrfxv_H91dMbS7J55zxluzAJHfGXU_Y2xCh4OHwb8,4774
|
|
23
23
|
upgini/autofe/unary.py,sha256=T3E7F3dA_7o_rkdCFq7JV6nHLzcoHLHQTcxO7y5Opa4,4646
|
|
24
24
|
upgini/autofe/vector.py,sha256=udkg4pP7IIeLjt0Cg6rzEKUmGaubOnqsEz3bz9R6E44,7110
|
|
25
25
|
upgini/data_source/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
26
|
-
upgini/data_source/data_source_publisher.py,sha256=
|
|
26
|
+
upgini/data_source/data_source_publisher.py,sha256=0vaYz5v3KclJnA6jAWiTUiMQO5mbBTBINWV9jr2F5xM,22591
|
|
27
27
|
upgini/mdc/__init__.py,sha256=aM08nIWFc2gWdWUa3_IuEnNND0cQPkBGnYpRMnfFN8k,1019
|
|
28
28
|
upgini/mdc/context.py,sha256=3u1B-jXt7tXEvNcV3qmR9SDCseudnY7KYsLclBdwVLk,1405
|
|
29
29
|
upgini/normalizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
30
30
|
upgini/normalizer/normalize_utils.py,sha256=Ft2MwSgVoBilXAORAOYAuwPD79GOLfwn4qQE3IUFzzg,7218
|
|
31
31
|
upgini/resource_bundle/__init__.py,sha256=S5F2G47pnJd2LDpmFsjDqEwiKkP8Hm-hcseDbMka6Ko,8345
|
|
32
32
|
upgini/resource_bundle/exceptions.py,sha256=5fRvx0_vWdE1-7HcSgF0tckB4A9AKyf5RiinZkInTsI,621
|
|
33
|
-
upgini/resource_bundle/strings.properties,sha256=
|
|
33
|
+
upgini/resource_bundle/strings.properties,sha256=UXMiaFP3p-WdiXyZJN3O_OZstb-F33BWVDxDiofyxd4,27464
|
|
34
34
|
upgini/resource_bundle/strings_widget.properties,sha256=gOdqvZWntP2LCza_tyVk1_yRYcG4c04K9sQOAVhF_gw,1577
|
|
35
35
|
upgini/sampler/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
36
36
|
upgini/sampler/base.py,sha256=7GpjYqjOp58vYcJLiX__1R5wjUlyQbxvHJ2klFnup_M,6389
|
|
@@ -46,22 +46,20 @@ upgini/utils/cv_utils.py,sha256=w6FQb9nO8BWDx88EF83NpjPLarK4eR4ia0Wg0kLBJC4,3525
|
|
|
46
46
|
upgini/utils/datetime_utils.py,sha256=RVAk4_rakK8X9zjybK3-rj0to0e3elye8tnBuA4wTWU,13491
|
|
47
47
|
upgini/utils/deduplicate_utils.py,sha256=SMZx9IKIhWI5HqXepfKiQb3uDJrogQZtG6jcWuMo5Z4,8855
|
|
48
48
|
upgini/utils/display_utils.py,sha256=DsBjJ8jEYAh8BPgfAbzq5imoGFV6IACP20PQ78BQCX0,11964
|
|
49
|
-
upgini/utils/email_utils.py,sha256=
|
|
49
|
+
upgini/utils/email_utils.py,sha256=pZ2vCfNxLIPUhxr0-OlABNXm12jjU44isBk8kGmqQzA,5277
|
|
50
50
|
upgini/utils/fallback_progress_bar.py,sha256=PDaKb8dYpVZaWMroNcOHsTc3pSjgi9mOm0--cOFTwJ0,1074
|
|
51
51
|
upgini/utils/feature_info.py,sha256=0rOXSyCj-sw-8migWP0ge8qrOzGU50dQvH0JUJUrDfQ,6766
|
|
52
52
|
upgini/utils/features_validator.py,sha256=lEfmk4DoxZ4ooOE1HC0ZXtUb_lFKRFHIrnFULZ4_rL8,3746
|
|
53
53
|
upgini/utils/format.py,sha256=Yv5cvvSs2bOLUzzNu96Pu33VMDNbabio92QepUj41jU,243
|
|
54
54
|
upgini/utils/ip_utils.py,sha256=TSQ_qDsLlVnm09X1HacpabEf_HNqSWpxBF4Sdc2xs08,6580
|
|
55
|
-
upgini/utils/mstats.py,sha256=GjBAUacgfAoVQVFUrMiRYdVkmx93CIThLRNvYLLiV48,5765
|
|
56
55
|
upgini/utils/phone_utils.py,sha256=IrbztLuOJBiePqqxllfABWfYlfAjYevPhXKipl95wUI,10432
|
|
57
56
|
upgini/utils/postal_code_utils.py,sha256=5M0sUqH2DAr33kARWCTXR-ACyzWbjDq_-0mmEml6ZcU,1716
|
|
58
57
|
upgini/utils/progress_bar.py,sha256=N-Sfdah2Hg8lXP_fV9EfUTXz_PyRt4lo9fAHoUDOoLc,1550
|
|
59
58
|
upgini/utils/sklearn_ext.py,sha256=13jQS_k7v0aUtudXV6nGUEWjttPQzAW9AFYL5wgEz9k,44511
|
|
60
|
-
upgini/utils/sort.py,sha256=ACcYrRNu1MPoYyt1oEYXvdL_dXx9YtJoSLjeiBu46KU,5648
|
|
61
59
|
upgini/utils/target_utils.py,sha256=RlpKGss9kMibVSlA8iZuO_qxmyeplqzn7X8g6hiGGGs,14341
|
|
62
60
|
upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
|
|
63
61
|
upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
|
|
64
|
-
upgini-1.2.
|
|
65
|
-
upgini-1.2.
|
|
66
|
-
upgini-1.2.
|
|
67
|
-
upgini-1.2.
|
|
62
|
+
upgini-1.2.59.dist-info/METADATA,sha256=bAdAK5gEcGUD4_CWwP9ykvpSSlQ3ooTt598wpb1Qft8,49055
|
|
63
|
+
upgini-1.2.59.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
|
|
64
|
+
upgini-1.2.59.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
|
|
65
|
+
upgini-1.2.59.dist-info/RECORD,,
|
upgini/utils/mstats.py
DELETED
|
@@ -1,177 +0,0 @@
|
|
|
1
|
-
import warnings
|
|
2
|
-
from collections import namedtuple
|
|
3
|
-
|
|
4
|
-
import numpy as np
|
|
5
|
-
import numpy.ma as ma
|
|
6
|
-
import scipy
|
|
7
|
-
from joblib import Parallel, delayed
|
|
8
|
-
from numpy import ndarray
|
|
9
|
-
from psutil import cpu_count
|
|
10
|
-
|
|
11
|
-
np.seterr(divide="ignore")
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
warnings.simplefilter(action="ignore", category=RuntimeWarning)
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
def _find_repeats(arr):
|
|
18
|
-
# This function assumes it may clobber its input.
|
|
19
|
-
if len(arr) == 0:
|
|
20
|
-
return np.array(0, np.float64), np.array(0, np.intp)
|
|
21
|
-
|
|
22
|
-
# XXX This cast was previously needed for the Fortran implementation,
|
|
23
|
-
# should we ditch it?
|
|
24
|
-
arr = np.asarray(arr, np.float64).ravel()
|
|
25
|
-
arr.sort()
|
|
26
|
-
|
|
27
|
-
# Taken from NumPy 1.9's np.unique.
|
|
28
|
-
change = np.concatenate(([True], arr[1:] != arr[:-1]))
|
|
29
|
-
unique = arr[change]
|
|
30
|
-
change_idx = np.concatenate(np.nonzero(change) + ([arr.size],))
|
|
31
|
-
freq = np.diff(change_idx)
|
|
32
|
-
atleast2 = freq > 1
|
|
33
|
-
return unique[atleast2], freq[atleast2]
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
def find_repeats(arr):
|
|
37
|
-
# Make sure we get a copy. ma.compressed promises a "new array", but can
|
|
38
|
-
# actually return a reference.
|
|
39
|
-
compr = np.asarray(ma.compressed(arr), dtype=np.float64)
|
|
40
|
-
try:
|
|
41
|
-
need_copy = np.may_share_memory(compr, arr)
|
|
42
|
-
except AttributeError:
|
|
43
|
-
# numpy < 1.8.2 bug: np.may_share_memory([], []) raises,
|
|
44
|
-
# while in numpy 1.8.2 and above it just (correctly) returns False.
|
|
45
|
-
need_copy = False
|
|
46
|
-
if need_copy:
|
|
47
|
-
compr = compr.copy()
|
|
48
|
-
return _find_repeats(compr)
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
def rankdata(data, axis=None, use_missing=False):
|
|
52
|
-
def _rank1d(data, use_missing=False):
|
|
53
|
-
n = data.count()
|
|
54
|
-
rk = np.empty(data.size, dtype=float)
|
|
55
|
-
idx = data.argsort()
|
|
56
|
-
rk[idx[:n]] = np.arange(1, n + 1)
|
|
57
|
-
|
|
58
|
-
if use_missing:
|
|
59
|
-
rk[idx[n:]] = (n + 1) / 2.0
|
|
60
|
-
else:
|
|
61
|
-
rk[idx[n:]] = 0
|
|
62
|
-
|
|
63
|
-
repeats = find_repeats(data.copy())
|
|
64
|
-
for r in repeats[0]:
|
|
65
|
-
condition = (data == r).filled(False)
|
|
66
|
-
rk[condition] = rk[condition].mean()
|
|
67
|
-
return rk
|
|
68
|
-
|
|
69
|
-
data = ma.array(data, copy=False)
|
|
70
|
-
if axis is None:
|
|
71
|
-
if data.ndim > 1:
|
|
72
|
-
return _rank1d(data.ravel(), use_missing).reshape(data.shape)
|
|
73
|
-
else:
|
|
74
|
-
return _rank1d(data, use_missing)
|
|
75
|
-
else:
|
|
76
|
-
return ma.apply_along_axis(_rank1d, axis, data, use_missing).view(ndarray)
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
def _chk_asarray(a, axis):
|
|
80
|
-
# Always returns a masked array, raveled for axis=None
|
|
81
|
-
a = ma.asanyarray(a)
|
|
82
|
-
if axis is None:
|
|
83
|
-
a = ma.ravel(a)
|
|
84
|
-
outaxis = 0
|
|
85
|
-
else:
|
|
86
|
-
outaxis = axis
|
|
87
|
-
return a, outaxis
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
SpearmanrResult = namedtuple("SpearmanrResult", ("correlation", "pvalue"))
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
# Taken from scipy.mstats with following tweaks:
|
|
94
|
-
# 1. parallel pairwise computation
|
|
95
|
-
# 2. custom masking
|
|
96
|
-
def spearmanr(
    x, y=None, use_ties=True, axis=None, nan_policy="propagate", alternative="two-sided", mask_fn=ma.masked_invalid
):
    """Compute Spearman rank correlations and p-values, one pair of variables at a time.

    Parameters
    ----------
    x : array_like
        Observations. After optional stacking with ``y`` and axis handling the data
        is processed as an ``(n_obs, n_vars)`` array.
    y : array_like, optional
        Extra variable(s) stacked onto ``x``: as columns when the resolved axis is 0,
        as rows otherwise.
    use_ties : bool
        Must be true; ``False`` is rejected.
    axis : int or None
        ``None`` flattens the inputs. ``1`` means the variables are laid out in rows,
        so the data is transposed before processing.
    nan_policy : str
        Only ``"omit"`` changes the behaviour here: ``mask_fn`` is applied to the data
        and masked observations are then dropped separately for every pair.
    alternative : str
        Forwarded to SciPy's internal t-test helper that produces the p-values.
    mask_fn : callable
        Masking function applied to the data when ``nan_policy == "omit"``.

    Returns
    -------
    SpearmanrResult
        Scalars when exactly two variables are compared, otherwise
        ``(n_vars, n_vars)`` matrices (ones on the correlation diagonal, zeros on
        the p-value diagonal).

    Raises
    ------
    ValueError
        If ``use_ties`` is false.
    """
    if not use_ties:
        raise ValueError("`use_ties=False` is not supported in SciPy >= 1.2.0")

    # Always returns a masked array, raveled if axis=None
    x, axisout = _chk_asarray(x, axis)
    if y is not None:
        # Deal only with 2-D `x` case.
        y, _ = _chk_asarray(y, axis)
        if axisout == 0:
            x = ma.column_stack((x, y))
        else:
            # `vstack` is the canonical name of the `row_stack` alias.
            x = ma.vstack((x, y))

    if axisout == 1:
        # To simplify the code that follows (always use `n_obs, n_vars` shape)
        x = x.T

    if nan_policy == "omit":
        x = mask_fn(x)

    def _spearmanr_2cols(x):
        # Mask the same observations for all variables, and then drop those
        # observations (can't leave them masked, rankdata is weird).
        x = ma.mask_rowcols(x, axis=0)
        x = x[~x.mask.any(axis=1), :]

        # Nothing usable is left (e.g. a column was entirely NaN or Inf)
        if not np.any(x.data):
            return SpearmanrResult(np.nan, np.nan)

        m = ma.getmask(x)
        n_obs = x.shape[0]
        dof = n_obs - 2 - int(m.sum(axis=0)[0])
        if dof < 0:
            return SpearmanrResult(np.nan, np.nan)

        # Gets the ranks and their correlation matrix
        x_ranked = rankdata(x, axis=0)
        rs = ma.corrcoef(x_ranked, rowvar=False).data

        # rs can have elements equal to 1, so avoid zero division warnings
        with np.errstate(divide="ignore"):
            # clip the small negative values possibly caused by rounding
            # errors before taking the square root
            t = rs * np.sqrt((dof / ((rs + 1.0) * (1.0 - rs))).clip(0))

        t, prob = scipy.stats._mstats_basic._ttest_finish(dof, t, alternative)

        # For backwards compatibility, return scalars when comparing 2 columns
        if rs.shape == (2, 2):
            return SpearmanrResult(rs[1, 0], prob[1, 0])
        return SpearmanrResult(rs, prob)

    # Need to do this per pair of variables, otherwise the dropped observations
    # in a third column mess up the result for a pair.
    n_vars = x.shape[1]
    if n_vars == 2:
        return _spearmanr_2cols(x)

    pairs = [(var1, var2) for var1 in range(n_vars - 1) for var2 in range(var1 + 1, n_vars)]
    max_cpu_cores = cpu_count(logical=False)
    with np.errstate(divide="ignore"):
        results = Parallel(n_jobs=max_cpu_cores)(
            delayed(_spearmanr_2cols)(x[:, [var1, var2]]) for var1, var2 in pairs
        )

    rs = np.ones((n_vars, n_vars), dtype=float)
    prob = np.zeros((n_vars, n_vars), dtype=float)
    # Results come back in submission order, so walk them alongside the pairs
    # instead of repeatedly popping from the front of the list.
    for (var1, var2), result in zip(pairs, results):
        rs[var1, var2] = rs[var2, var1] = result.correlation
        prob[var1, var2] = prob[var2, var1] = result.pvalue

    return SpearmanrResult(rs, prob)
|
upgini/utils/sort.py
DELETED
|
@@ -1,160 +0,0 @@
|
|
|
1
|
-
import hashlib
|
|
2
|
-
from typing import Any, Dict, List
|
|
3
|
-
|
|
4
|
-
import numpy as np
|
|
5
|
-
import pandas as pd
|
|
6
|
-
from joblib import Parallel, delayed
|
|
7
|
-
from pandas.api.types import is_datetime64_any_dtype, is_numeric_dtype
|
|
8
|
-
from psutil import cpu_count
|
|
9
|
-
from scipy.stats import skew, spearmanr
|
|
10
|
-
|
|
11
|
-
from upgini.metadata import ModelTaskType, SearchKey
|
|
12
|
-
from upgini.utils import mstats
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
def sort_columns(
    df: pd.DataFrame,
    target_column: str,
    search_keys: Dict[str, SearchKey],
    model_task_type: ModelTaskType,
    exclude_columns: List[str],
) -> List[str]:
    """Return column names in sort order: search keys first, then features by score.

    Search-key columns are ordered by the string form of their search-key type.
    The remaining non-excluded columns with more than one distinct value are
    ordered by descending score from ``get_sort_columns_dict``; columns that
    receive no score are left out.
    """
    frame = df.copy()  # avoid side effects
    key_columns = [
        key
        for key in sorted(search_keys.keys(), key=lambda name: str(search_keys.get(name)))
        if key not in exclude_columns
    ]

    candidates = []
    for column in frame.columns:
        if column in key_columns or column in exclude_columns:
            continue
        if frame[column].nunique() > 1:
            candidates.append(column)
    candidates.sort()

    prepared_target = prepare_target(frame[target_column], model_task_type)
    scores = get_sort_columns_dict(frame[key_columns + candidates], prepared_target, key_columns, omit_nan=True)

    scored = [column for column in candidates if column in scores]
    ranked = sorted(scored, key=lambda column: scores[column], reverse=True)
    return key_columns + ranked
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
def get_sort_columns_dict(
|
|
44
|
-
df: pd.DataFrame,
|
|
45
|
-
target: pd.Series,
|
|
46
|
-
sorted_keys: List[str],
|
|
47
|
-
omit_nan: bool,
|
|
48
|
-
n_jobs: int | None = None,
|
|
49
|
-
) -> dict[str, Any]:
|
|
50
|
-
string_features = [c for c in df.select_dtypes(exclude=[np.number]).columns if c not in sorted_keys]
|
|
51
|
-
columns_for_sort = [c for c in df.columns if c not in sorted_keys + string_features]
|
|
52
|
-
if len(string_features) > 0:
|
|
53
|
-
if len(df) > len(df.drop(columns=string_features).drop_duplicates()):
|
|
54
|
-
# factorize string features
|
|
55
|
-
for c in string_features:
|
|
56
|
-
df[c] = df[c].factorize(sort=True)[0]
|
|
57
|
-
columns_for_sort.extend(string_features)
|
|
58
|
-
|
|
59
|
-
if len(columns_for_sort) == 0:
|
|
60
|
-
return {}
|
|
61
|
-
|
|
62
|
-
df = df[columns_for_sort]
|
|
63
|
-
hashes = [hash_series(df[col]) for col in columns_for_sort]
|
|
64
|
-
df = np.asarray(df, dtype=np.float32)
|
|
65
|
-
correlations = get_sort_columns_correlations(df, target, omit_nan, n_jobs)
|
|
66
|
-
|
|
67
|
-
sort_dict = {col: (corr, h) for col, corr, h in zip(columns_for_sort, correlations, hashes)}
|
|
68
|
-
return sort_dict
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
def get_sort_columns_correlations(df: np.ndarray, target: pd.Series, omit_nan: bool, n_jobs: int | None = None):
    """Return, per feature column, the largest of its target correlations (computed at precision 7)."""
    return np.max(get_target_correlations(df, target, omit_nan, n_jobs, precision=7), axis=0)
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
def get_target_correlations(
    df: np.ndarray, target: pd.Series, omit_nan: bool, n_jobs: int | None = None, precision: int = 15
):
    """Build a ``(2, n_features)`` table of absolute correlations with the target.

    Row 0 holds the Spearman values from ``calculate_spearman_corr_with_target``,
    row 1 the absolute Pearson values from ``np.corrcoef``. NaNs become 0 and
    every value is truncated to ``precision`` decimal places.
    """
    features = np.asarray(df, dtype=np.float32)

    spearman_row = np.nan_to_num(
        calculate_spearman_corr_with_target(features, target, omit_nan, n_jobs), copy=False
    )
    # The target is the last variable handed to corrcoef, so its row minus the
    # final (self-correlation) entry lists target-vs-feature values.
    pearson_row = np.nan_to_num(np.abs(np.corrcoef(features.T, target.T, rowvar=True)[-1, :-1]))

    combined = np.zeros((2, features.shape[1]))
    combined[0, :] = spearman_row
    combined[1, :] = pearson_row

    return np.trunc(combined * 10**precision) / (10**precision)
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
def corr_dict_from_sort_dict(sort_dict: dict[str, tuple[float, int]]) -> dict[str, float]:
    """Keep only the first element (the correlation) of every ``sort_dict`` entry."""
    correlations = {}
    for column, entry in sort_dict.items():
        correlations[column] = entry[0]
    return correlations
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
def calculate_spearman_corr_with_target(
|
|
97
|
-
X: pd.DataFrame | np.ndarray, y: pd.Series, omit_nan: bool = False, n_jobs: int | None = None
|
|
98
|
-
) -> np.ndarray:
|
|
99
|
-
if isinstance(X, pd.DataFrame):
|
|
100
|
-
X = np.asarray(X, dtype=np.float32)
|
|
101
|
-
|
|
102
|
-
if X.size == 0:
|
|
103
|
-
return np.ndarray(shape=(0,))
|
|
104
|
-
|
|
105
|
-
all_correlations = np.zeros(X.shape[1])
|
|
106
|
-
all_correlations.fill(np.nan)
|
|
107
|
-
cols2calc = np.where([c.size > 0 and not (c == c[0]).all() for c in X.T])[0]
|
|
108
|
-
|
|
109
|
-
if omit_nan:
|
|
110
|
-
results = Parallel(n_jobs=n_jobs or cpu_count(logical=False))(
|
|
111
|
-
delayed(mstats.spearmanr)(
|
|
112
|
-
X[:, i],
|
|
113
|
-
y,
|
|
114
|
-
nan_policy="omit",
|
|
115
|
-
axis=0,
|
|
116
|
-
)
|
|
117
|
-
for i in cols2calc
|
|
118
|
-
)
|
|
119
|
-
target_correlations = np.array([abs(res.correlation) for res in results])
|
|
120
|
-
else:
|
|
121
|
-
cols2calc = cols2calc[np.where(~np.isnan(X[:, cols2calc]).any(axis=0))[0]]
|
|
122
|
-
target_correlations = calculate_spearman(X[:, cols2calc], y, nan_policy="raise")
|
|
123
|
-
if isinstance(target_correlations, float):
|
|
124
|
-
target_correlations = np.abs([target_correlations])
|
|
125
|
-
else:
|
|
126
|
-
target_correlations = np.abs(target_correlations)[-1, :-1]
|
|
127
|
-
|
|
128
|
-
all_correlations[cols2calc] = target_correlations
|
|
129
|
-
|
|
130
|
-
return all_correlations
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
def calculate_spearman(X: np.ndarray, y: pd.Series | None, nan_policy: str):
|
|
134
|
-
features_num = X.shape[1]
|
|
135
|
-
if y is not None:
|
|
136
|
-
features_num += 1
|
|
137
|
-
|
|
138
|
-
if features_num < 2:
|
|
139
|
-
return 1.0
|
|
140
|
-
else:
|
|
141
|
-
return spearmanr(X, y, nan_policy=nan_policy).correlation
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
def hash_series(series: pd.Series) -> int:
    """Return a SHA-256 based integer fingerprint of a series (values and index)."""
    row_hashes = pd.util.hash_pandas_object(series, index=True).values
    digest = hashlib.sha256(row_hashes).digest()
    # Big-endian interpretation of the digest equals parsing its hex form in base 16.
    return int.from_bytes(digest, "big")
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
def prepare_target(target: pd.Series, model_task_type: ModelTaskType) -> pd.Series:
    """Prepare the target for correlation computation, keeping its name.

    A target that is not a regression target, or is neither numeric nor datetime,
    is replaced by the category codes of its string form. A regression target
    that is non-negative and has absolute skewness (rounded to 2 decimals) of at
    least 0.9 is transformed with ``log1p``.
    """
    original_name = target.name
    treat_as_labels = model_task_type != ModelTaskType.REGRESSION or not (
        is_numeric_dtype(target) or is_datetime64_any_dtype(target)
    )

    prepared = target
    if treat_as_labels:
        prepared = target.astype(str).astype("category").cat.codes
    elif model_task_type == ModelTaskType.REGRESSION:
        abs_skew = round(abs(skew(target)), 2)
        if target.min() >= 0 and abs_skew >= 0.9:
            prepared = np.log1p(target)

    return pd.Series(prepared, name=original_name)
|
|
File without changes
|
|
File without changes
|