upgini 1.2.58a1__tar.gz → 1.2.59a3818.dev1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- {upgini-1.2.58a1 → upgini-1.2.59a3818.dev1}/PKG-INFO +2 -2
- {upgini-1.2.58a1 → upgini-1.2.59a3818.dev1}/pyproject.toml +1 -1
- upgini-1.2.59a3818.dev1/src/upgini/__about__.py +1 -0
- {upgini-1.2.58a1 → upgini-1.2.59a3818.dev1}/src/upgini/autofe/date.py +8 -0
- {upgini-1.2.58a1 → upgini-1.2.59a3818.dev1}/src/upgini/autofe/feature.py +1 -10
- {upgini-1.2.58a1 → upgini-1.2.59a3818.dev1}/src/upgini/autofe/vector.py +1 -1
- {upgini-1.2.58a1 → upgini-1.2.59a3818.dev1}/src/upgini/features_enricher.py +40 -54
- {upgini-1.2.58a1 → upgini-1.2.59a3818.dev1}/src/upgini/resource_bundle/strings.properties +0 -1
- upgini-1.2.58a1/src/upgini/__about__.py +0 -1
- upgini-1.2.58a1/src/upgini/utils/mstats.py +0 -177
- upgini-1.2.58a1/src/upgini/utils/sort.py +0 -160
- {upgini-1.2.58a1 → upgini-1.2.59a3818.dev1}/.gitignore +0 -0
- {upgini-1.2.58a1 → upgini-1.2.59a3818.dev1}/LICENSE +0 -0
- {upgini-1.2.58a1 → upgini-1.2.59a3818.dev1}/README.md +0 -0
- {upgini-1.2.58a1 → upgini-1.2.59a3818.dev1}/src/upgini/__init__.py +0 -0
- {upgini-1.2.58a1 → upgini-1.2.59a3818.dev1}/src/upgini/ads.py +0 -0
- {upgini-1.2.58a1 → upgini-1.2.59a3818.dev1}/src/upgini/ads_management/__init__.py +0 -0
- {upgini-1.2.58a1 → upgini-1.2.59a3818.dev1}/src/upgini/ads_management/ads_manager.py +0 -0
- {upgini-1.2.58a1 → upgini-1.2.59a3818.dev1}/src/upgini/autofe/__init__.py +0 -0
- {upgini-1.2.58a1 → upgini-1.2.59a3818.dev1}/src/upgini/autofe/all_operands.py +0 -0
- {upgini-1.2.58a1 → upgini-1.2.59a3818.dev1}/src/upgini/autofe/binary.py +0 -0
- {upgini-1.2.58a1 → upgini-1.2.59a3818.dev1}/src/upgini/autofe/groupby.py +0 -0
- {upgini-1.2.58a1 → upgini-1.2.59a3818.dev1}/src/upgini/autofe/operand.py +0 -0
- {upgini-1.2.58a1 → upgini-1.2.59a3818.dev1}/src/upgini/autofe/unary.py +0 -0
- {upgini-1.2.58a1 → upgini-1.2.59a3818.dev1}/src/upgini/data_source/__init__.py +0 -0
- {upgini-1.2.58a1 → upgini-1.2.59a3818.dev1}/src/upgini/data_source/data_source_publisher.py +0 -0
- {upgini-1.2.58a1 → upgini-1.2.59a3818.dev1}/src/upgini/dataset.py +0 -0
- {upgini-1.2.58a1 → upgini-1.2.59a3818.dev1}/src/upgini/errors.py +0 -0
- {upgini-1.2.58a1 → upgini-1.2.59a3818.dev1}/src/upgini/http.py +0 -0
- {upgini-1.2.58a1 → upgini-1.2.59a3818.dev1}/src/upgini/lazy_import.py +0 -0
- {upgini-1.2.58a1 → upgini-1.2.59a3818.dev1}/src/upgini/mdc/__init__.py +0 -0
- {upgini-1.2.58a1 → upgini-1.2.59a3818.dev1}/src/upgini/mdc/context.py +0 -0
- {upgini-1.2.58a1 → upgini-1.2.59a3818.dev1}/src/upgini/metadata.py +0 -0
- {upgini-1.2.58a1 → upgini-1.2.59a3818.dev1}/src/upgini/metrics.py +0 -0
- {upgini-1.2.58a1 → upgini-1.2.59a3818.dev1}/src/upgini/normalizer/__init__.py +0 -0
- {upgini-1.2.58a1 → upgini-1.2.59a3818.dev1}/src/upgini/normalizer/normalize_utils.py +0 -0
- {upgini-1.2.58a1 → upgini-1.2.59a3818.dev1}/src/upgini/resource_bundle/__init__.py +0 -0
- {upgini-1.2.58a1 → upgini-1.2.59a3818.dev1}/src/upgini/resource_bundle/exceptions.py +0 -0
- {upgini-1.2.58a1 → upgini-1.2.59a3818.dev1}/src/upgini/resource_bundle/strings_widget.properties +0 -0
- {upgini-1.2.58a1 → upgini-1.2.59a3818.dev1}/src/upgini/sampler/__init__.py +0 -0
- {upgini-1.2.58a1 → upgini-1.2.59a3818.dev1}/src/upgini/sampler/base.py +0 -0
- {upgini-1.2.58a1 → upgini-1.2.59a3818.dev1}/src/upgini/sampler/random_under_sampler.py +0 -0
- {upgini-1.2.58a1 → upgini-1.2.59a3818.dev1}/src/upgini/sampler/utils.py +0 -0
- {upgini-1.2.58a1 → upgini-1.2.59a3818.dev1}/src/upgini/search_task.py +0 -0
- {upgini-1.2.58a1 → upgini-1.2.59a3818.dev1}/src/upgini/spinner.py +0 -0
- {upgini-1.2.58a1 → upgini-1.2.59a3818.dev1}/src/upgini/utils/Roboto-Regular.ttf +0 -0
- {upgini-1.2.58a1 → upgini-1.2.59a3818.dev1}/src/upgini/utils/__init__.py +0 -0
- {upgini-1.2.58a1 → upgini-1.2.59a3818.dev1}/src/upgini/utils/base_search_key_detector.py +0 -0
- {upgini-1.2.58a1 → upgini-1.2.59a3818.dev1}/src/upgini/utils/blocked_time_series.py +0 -0
- {upgini-1.2.58a1 → upgini-1.2.59a3818.dev1}/src/upgini/utils/country_utils.py +0 -0
- {upgini-1.2.58a1 → upgini-1.2.59a3818.dev1}/src/upgini/utils/custom_loss_utils.py +0 -0
- {upgini-1.2.58a1 → upgini-1.2.59a3818.dev1}/src/upgini/utils/cv_utils.py +0 -0
- {upgini-1.2.58a1 → upgini-1.2.59a3818.dev1}/src/upgini/utils/datetime_utils.py +0 -0
- {upgini-1.2.58a1 → upgini-1.2.59a3818.dev1}/src/upgini/utils/deduplicate_utils.py +0 -0
- {upgini-1.2.58a1 → upgini-1.2.59a3818.dev1}/src/upgini/utils/display_utils.py +0 -0
- {upgini-1.2.58a1 → upgini-1.2.59a3818.dev1}/src/upgini/utils/email_utils.py +0 -0
- {upgini-1.2.58a1 → upgini-1.2.59a3818.dev1}/src/upgini/utils/fallback_progress_bar.py +0 -0
- {upgini-1.2.58a1 → upgini-1.2.59a3818.dev1}/src/upgini/utils/feature_info.py +0 -0
- {upgini-1.2.58a1 → upgini-1.2.59a3818.dev1}/src/upgini/utils/features_validator.py +0 -0
- {upgini-1.2.58a1 → upgini-1.2.59a3818.dev1}/src/upgini/utils/format.py +0 -0
- {upgini-1.2.58a1 → upgini-1.2.59a3818.dev1}/src/upgini/utils/ip_utils.py +0 -0
- {upgini-1.2.58a1 → upgini-1.2.59a3818.dev1}/src/upgini/utils/phone_utils.py +0 -0
- {upgini-1.2.58a1 → upgini-1.2.59a3818.dev1}/src/upgini/utils/postal_code_utils.py +0 -0
- {upgini-1.2.58a1 → upgini-1.2.59a3818.dev1}/src/upgini/utils/progress_bar.py +0 -0
- {upgini-1.2.58a1 → upgini-1.2.59a3818.dev1}/src/upgini/utils/sklearn_ext.py +0 -0
- {upgini-1.2.58a1 → upgini-1.2.59a3818.dev1}/src/upgini/utils/target_utils.py +0 -0
- {upgini-1.2.58a1 → upgini-1.2.59a3818.dev1}/src/upgini/utils/track_info.py +0 -0
- {upgini-1.2.58a1 → upgini-1.2.59a3818.dev1}/src/upgini/utils/warning_counter.py +0 -0
- {upgini-1.2.58a1 → upgini-1.2.59a3818.dev1}/src/upgini/version_validator.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: upgini
|
|
3
|
-
Version: 1.2.58a1
|
|
3
|
+
Version: 1.2.59a3818.dev1
|
|
4
4
|
Summary: Intelligent data search & enrichment for Machine Learning
|
|
5
5
|
Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
|
|
6
6
|
Project-URL: Homepage, https://upgini.com/
|
|
@@ -34,7 +34,7 @@ Requires-Dist: pydantic<3.0.0,>1.0.0
|
|
|
34
34
|
Requires-Dist: pyjwt>=2.8.0
|
|
35
35
|
Requires-Dist: python-bidi==0.4.2
|
|
36
36
|
Requires-Dist: python-dateutil>=2.8.0
|
|
37
|
-
Requires-Dist: python-json-logger>=
|
|
37
|
+
Requires-Dist: python-json-logger>=3.3.0
|
|
38
38
|
Requires-Dist: requests>=2.8.0
|
|
39
39
|
Requires-Dist: scikit-learn>=1.3.0
|
|
40
40
|
Requires-Dist: xhtml2pdf<0.3.0,>=0.2.11
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "1.2.59a3818.dev1"
|
|
@@ -64,6 +64,9 @@ class DateDiff(PandasOperand, DateDiffMixin):
|
|
|
64
64
|
return res
|
|
65
65
|
|
|
66
66
|
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
67
|
+
if left.isna().all() or right.isna().all():
|
|
68
|
+
return pd.Series([None] * len(left))
|
|
69
|
+
|
|
67
70
|
left = self._convert_to_date(left, self.left_unit)
|
|
68
71
|
right = self._convert_to_date(right, self.right_unit)
|
|
69
72
|
diff = self._convert_diff_to_unit(left.dt.date - right.dt.date)
|
|
@@ -142,6 +145,9 @@ class DateListDiff(PandasOperand, DateDiffMixin, ParametrizedOperand):
|
|
|
142
145
|
return cls(aggregation=aggregation)
|
|
143
146
|
|
|
144
147
|
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
148
|
+
if left.isna().all() or right.isna().all():
|
|
149
|
+
return pd.Series([None] * len(left), dtype=np.float64)
|
|
150
|
+
|
|
145
151
|
left = self._convert_to_date(left, self.left_unit)
|
|
146
152
|
right_mask = right.apply(lambda x: len(x) > 0)
|
|
147
153
|
mask = left.notna() & right.notna() & right_mask
|
|
@@ -230,6 +236,8 @@ class DatePercentileBase(PandasOperand, abc.ABC):
|
|
|
230
236
|
pass
|
|
231
237
|
|
|
232
238
|
def _perc(self, f, bounds):
|
|
239
|
+
if f is None or np.isnan(f):
|
|
240
|
+
return np.nan
|
|
233
241
|
hit = np.where(f >= np.array(bounds))[0]
|
|
234
242
|
if hit.size > 0:
|
|
235
243
|
return np.max(hit) + 1
|
|
@@ -26,18 +26,9 @@ class Column:
|
|
|
26
26
|
return dict()
|
|
27
27
|
|
|
28
28
|
def rename_columns(self, mapping: Dict[str, str]) -> "Column":
|
|
29
|
-
self.name =
|
|
29
|
+
self.name = mapping.get(self.name) or self.name
|
|
30
30
|
return self
|
|
31
31
|
|
|
32
|
-
def _unhash(self, feature_name: str) -> str:
|
|
33
|
-
last_component_idx = feature_name.rfind("_")
|
|
34
|
-
if not feature_name.startswith("f_"):
|
|
35
|
-
return feature_name # etalon feature
|
|
36
|
-
elif last_component_idx == 1:
|
|
37
|
-
return feature_name[2:] # fully hashed name, cannot unhash
|
|
38
|
-
else:
|
|
39
|
-
return feature_name[2:last_component_idx]
|
|
40
|
-
|
|
41
32
|
def delete_data(self):
|
|
42
33
|
self.data = None
|
|
43
34
|
|
|
@@ -55,7 +55,7 @@ class TimeSeriesBase(PandasOperand, abc.ABC):
|
|
|
55
55
|
ts.set_index(date.name, inplace=True)
|
|
56
56
|
ts = ts[ts.index.notna()].sort_index()
|
|
57
57
|
ts = (
|
|
58
|
-
ts.groupby([c.name for c in data[1:-1]])
|
|
58
|
+
ts.groupby([c.name for c in data[1:-1]], group_keys=True)
|
|
59
59
|
.apply(self._shift)[data[-1].name]
|
|
60
60
|
.to_frame()
|
|
61
61
|
.reset_index()
|
|
@@ -112,7 +112,6 @@ try:
|
|
|
112
112
|
except Exception:
|
|
113
113
|
from upgini.utils.fallback_progress_bar import CustomFallbackProgressBar as ProgressBar
|
|
114
114
|
|
|
115
|
-
from upgini.utils.sort import sort_columns
|
|
116
115
|
from upgini.utils.target_utils import (
|
|
117
116
|
balance_undersample_forced,
|
|
118
117
|
calculate_psi,
|
|
@@ -1258,7 +1257,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1258
1257
|
for feature, shap in new_shaps.items()
|
|
1259
1258
|
if feature in self.feature_names_ or renaming.get(feature, feature) in self.feature_names_
|
|
1260
1259
|
}
|
|
1261
|
-
self.__prepare_feature_importances(trace_id, x_columns, new_shaps)
|
|
1260
|
+
self.__prepare_feature_importances(trace_id, x_columns, new_shaps, silent=True)
|
|
1262
1261
|
|
|
1263
1262
|
if self.features_info_display_handle is not None:
|
|
1264
1263
|
try:
|
|
@@ -1735,7 +1734,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1735
1734
|
self.logger.info(f"Downsampling from {num_samples} to {sample_rows}")
|
|
1736
1735
|
df = df.sample(n=sample_rows, random_state=self.random_state)
|
|
1737
1736
|
|
|
1738
|
-
df = self.__add_fit_system_record_id(df, search_keys, SYSTEM_RECORD_ID
|
|
1737
|
+
df = self.__add_fit_system_record_id(df, search_keys, SYSTEM_RECORD_ID)
|
|
1739
1738
|
if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
|
|
1740
1739
|
df = df.drop(columns=DateTimeSearchKeyConverter.DATETIME_COL)
|
|
1741
1740
|
|
|
@@ -1916,7 +1915,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1916
1915
|
progress_bar=progress_bar,
|
|
1917
1916
|
progress_callback=progress_callback,
|
|
1918
1917
|
add_fit_system_record_id=True,
|
|
1919
|
-
target_name=tmp_target_name,
|
|
1920
1918
|
)
|
|
1921
1919
|
if enriched_df is None:
|
|
1922
1920
|
return None
|
|
@@ -1966,7 +1964,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1966
1964
|
progress_bar=progress_bar,
|
|
1967
1965
|
progress_callback=progress_callback,
|
|
1968
1966
|
add_fit_system_record_id=True,
|
|
1969
|
-
target_name=tmp_target_name,
|
|
1970
1967
|
)
|
|
1971
1968
|
if enriched_Xy is None:
|
|
1972
1969
|
return None
|
|
@@ -2128,7 +2125,6 @@ if response.status_code == 200:
|
|
|
2128
2125
|
progress_bar: Optional[ProgressBar] = None,
|
|
2129
2126
|
progress_callback: Optional[Callable[[SearchProgress], Any]] = None,
|
|
2130
2127
|
add_fit_system_record_id: bool = False,
|
|
2131
|
-
target_name: Optional[str] = None,
|
|
2132
2128
|
) -> Tuple[pd.DataFrame, Dict[str, str], List[str]]:
|
|
2133
2129
|
if self._search_task is None:
|
|
2134
2130
|
raise NotFittedError(self.bundle.get("transform_unfitted_enricher"))
|
|
@@ -2313,11 +2309,8 @@ if response.status_code == 200:
|
|
|
2313
2309
|
and c not in [ENTITY_SYSTEM_RECORD_ID, SEARCH_KEY_UNNEST]
|
|
2314
2310
|
]
|
|
2315
2311
|
|
|
2316
|
-
if add_fit_system_record_id
|
|
2317
|
-
|
|
2318
|
-
df = self.__add_fit_system_record_id(
|
|
2319
|
-
df, search_keys, SYSTEM_RECORD_ID, reversed_columns_renaming.get(target_name, target_name)
|
|
2320
|
-
)
|
|
2312
|
+
if add_fit_system_record_id:
|
|
2313
|
+
df = self.__add_fit_system_record_id(df, search_keys, SYSTEM_RECORD_ID)
|
|
2321
2314
|
df = df.rename(columns={SYSTEM_RECORD_ID: SORT_ID})
|
|
2322
2315
|
features_not_to_pass.append(SORT_ID)
|
|
2323
2316
|
|
|
@@ -2761,7 +2754,7 @@ if response.status_code == 200:
|
|
|
2761
2754
|
self.__log_warning(full_duplicates_warning)
|
|
2762
2755
|
|
|
2763
2756
|
# Explode multiple search keys
|
|
2764
|
-
df = self.__add_fit_system_record_id(df, self.fit_search_keys, ENTITY_SYSTEM_RECORD_ID
|
|
2757
|
+
df = self.__add_fit_system_record_id(df, self.fit_search_keys, ENTITY_SYSTEM_RECORD_ID)
|
|
2765
2758
|
|
|
2766
2759
|
# TODO check that this is correct for enrichment
|
|
2767
2760
|
self.df_with_original_index = df.copy()
|
|
@@ -2843,7 +2836,7 @@ if response.status_code == 200:
|
|
|
2843
2836
|
if eval_set is not None and len(eval_set) > 0:
|
|
2844
2837
|
meaning_types[EVAL_SET_INDEX] = FileColumnMeaningType.EVAL_SET_INDEX
|
|
2845
2838
|
|
|
2846
|
-
df = self.__add_fit_system_record_id(df, self.fit_search_keys, SYSTEM_RECORD_ID
|
|
2839
|
+
df = self.__add_fit_system_record_id(df, self.fit_search_keys, SYSTEM_RECORD_ID)
|
|
2847
2840
|
|
|
2848
2841
|
if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
|
|
2849
2842
|
df = df.drop(columns=DateTimeSearchKeyConverter.DATETIME_COL)
|
|
@@ -3535,60 +3528,53 @@ if response.status_code == 200:
|
|
|
3535
3528
|
# meaning_types: Dict[str, FileColumnMeaningType],
|
|
3536
3529
|
search_keys: Dict[str, SearchKey],
|
|
3537
3530
|
id_name: str,
|
|
3538
|
-
target_name: str,
|
|
3539
3531
|
) -> pd.DataFrame:
|
|
3532
|
+
# save original order or rows
|
|
3540
3533
|
original_index_name = df.index.name
|
|
3541
3534
|
index_name = df.index.name or DEFAULT_INDEX
|
|
3542
3535
|
original_order_name = "original_order"
|
|
3543
|
-
# Save original index
|
|
3544
3536
|
df = df.reset_index().rename(columns={index_name: ORIGINAL_INDEX})
|
|
3545
|
-
# Save original order
|
|
3546
3537
|
df = df.reset_index().rename(columns={DEFAULT_INDEX: original_order_name})
|
|
3547
3538
|
|
|
3548
|
-
# order by date and idempotent order by other keys
|
|
3539
|
+
# order by date and idempotent order by other keys
|
|
3540
|
+
if self.cv not in [CVType.time_series, CVType.blocked_time_series]:
|
|
3541
|
+
sort_exclude_columns = [
|
|
3542
|
+
original_order_name,
|
|
3543
|
+
ORIGINAL_INDEX,
|
|
3544
|
+
EVAL_SET_INDEX,
|
|
3545
|
+
TARGET,
|
|
3546
|
+
"__target",
|
|
3547
|
+
ENTITY_SYSTEM_RECORD_ID,
|
|
3548
|
+
]
|
|
3549
|
+
if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
|
|
3550
|
+
date_column = DateTimeSearchKeyConverter.DATETIME_COL
|
|
3551
|
+
sort_exclude_columns.append(self._get_date_column(search_keys))
|
|
3552
|
+
else:
|
|
3553
|
+
date_column = self._get_date_column(search_keys)
|
|
3554
|
+
sort_columns = [date_column] if date_column is not None else []
|
|
3549
3555
|
|
|
3550
|
-
|
|
3551
|
-
|
|
3552
|
-
ORIGINAL_INDEX,
|
|
3553
|
-
EVAL_SET_INDEX,
|
|
3554
|
-
TARGET,
|
|
3555
|
-
"__target",
|
|
3556
|
-
ENTITY_SYSTEM_RECORD_ID,
|
|
3557
|
-
]
|
|
3558
|
-
if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
|
|
3559
|
-
date_column = DateTimeSearchKeyConverter.DATETIME_COL
|
|
3560
|
-
sort_exclude_columns.append(self._get_date_column(search_keys))
|
|
3561
|
-
else:
|
|
3562
|
-
date_column = self._get_date_column(search_keys)
|
|
3563
|
-
sort_exclude_columns.append(date_column)
|
|
3564
|
-
columns_to_sort = [date_column] if date_column is not None else []
|
|
3556
|
+
sorted_other_keys = sorted(search_keys, key=lambda x: str(search_keys.get(x)))
|
|
3557
|
+
sorted_other_keys = [k for k in sorted_other_keys if k not in sort_exclude_columns]
|
|
3565
3558
|
|
|
3566
|
-
|
|
3567
|
-
|
|
3568
|
-
|
|
3569
|
-
|
|
3570
|
-
|
|
3571
|
-
|
|
3559
|
+
other_columns = sorted(
|
|
3560
|
+
[
|
|
3561
|
+
c
|
|
3562
|
+
for c in df.columns
|
|
3563
|
+
if c not in sort_columns
|
|
3564
|
+
and c not in sorted_other_keys
|
|
3565
|
+
and c not in sort_exclude_columns
|
|
3566
|
+
and df[c].nunique() > 1
|
|
3567
|
+
]
|
|
3568
|
+
)
|
|
3572
3569
|
|
|
3573
|
-
|
|
3574
|
-
if duplicates.any():
|
|
3575
|
-
self.__log_warning(self.bundle.get("date_and_id_columns_duplicates").format(duplicates.sum()))
|
|
3576
|
-
do_sorting = False
|
|
3577
|
-
else:
|
|
3578
|
-
columns_to_hash = list(search_keys.keys()) + self.id_columns
|
|
3579
|
-
columns_to_hash = sort_columns(
|
|
3580
|
-
df[columns_to_hash], target_name, search_keys, self.model_task_type, sort_exclude_columns
|
|
3581
|
-
)
|
|
3582
|
-
else:
|
|
3583
|
-
columns_to_hash = sort_columns(df, target_name, search_keys, self.model_task_type, sort_exclude_columns)
|
|
3570
|
+
all_other_columns = sorted_other_keys + other_columns
|
|
3584
3571
|
|
|
3585
|
-
if do_sorting:
|
|
3586
3572
|
search_keys_hash = "search_keys_hash"
|
|
3587
|
-
if len(
|
|
3588
|
-
|
|
3589
|
-
|
|
3573
|
+
if len(all_other_columns) > 0:
|
|
3574
|
+
sort_columns.append(search_keys_hash)
|
|
3575
|
+
df[search_keys_hash] = pd.util.hash_pandas_object(df[all_other_columns], index=False)
|
|
3590
3576
|
|
|
3591
|
-
df = df.sort_values(by=
|
|
3577
|
+
df = df.sort_values(by=sort_columns)
|
|
3592
3578
|
|
|
3593
3579
|
if search_keys_hash in df.columns:
|
|
3594
3580
|
df.drop(columns=search_keys_hash, inplace=True)
|
|
@@ -35,7 +35,6 @@ trial_quota_limit_riched=You have reached the quota limit of trial data usage. P
|
|
|
35
35
|
loss_selection_warn=Loss `{0}` is not supported for feature selection with {1}
|
|
36
36
|
loss_calc_metrics_warn=Loss `{0}` is not supported for metrics calculation with {1}
|
|
37
37
|
multivariate_timeseries_detected=Multivariate TimeSeries detected. Blocked time series cross-validation split selected.\nMore details: https://github.com/upgini/upgini#-time-series-prediction-support
|
|
38
|
-
date_and_id_columns_duplicates=Found {} duplicate rows by date and id_columns
|
|
39
38
|
group_k_fold_in_classification=Using group K-fold cross-validation split for classification task.
|
|
40
39
|
current_date_added=No date/datetime column was detected in X to be used as a search key. The current date will be used to match the latest version of data sources
|
|
41
40
|
# Errors
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
__version__ = "1.2.58a1"
|
|
@@ -1,177 +0,0 @@
|
|
|
1
|
-
import warnings
|
|
2
|
-
from collections import namedtuple
|
|
3
|
-
|
|
4
|
-
import numpy as np
|
|
5
|
-
import numpy.ma as ma
|
|
6
|
-
import scipy
|
|
7
|
-
from joblib import Parallel, delayed
|
|
8
|
-
from numpy import ndarray
|
|
9
|
-
from psutil import cpu_count
|
|
10
|
-
|
|
11
|
-
np.seterr(divide="ignore")
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
warnings.simplefilter(action="ignore", category=RuntimeWarning)
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
def _find_repeats(arr):
|
|
18
|
-
# This function assumes it may clobber its input.
|
|
19
|
-
if len(arr) == 0:
|
|
20
|
-
return np.array(0, np.float64), np.array(0, np.intp)
|
|
21
|
-
|
|
22
|
-
# XXX This cast was previously needed for the Fortran implementation,
|
|
23
|
-
# should we ditch it?
|
|
24
|
-
arr = np.asarray(arr, np.float64).ravel()
|
|
25
|
-
arr.sort()
|
|
26
|
-
|
|
27
|
-
# Taken from NumPy 1.9's np.unique.
|
|
28
|
-
change = np.concatenate(([True], arr[1:] != arr[:-1]))
|
|
29
|
-
unique = arr[change]
|
|
30
|
-
change_idx = np.concatenate(np.nonzero(change) + ([arr.size],))
|
|
31
|
-
freq = np.diff(change_idx)
|
|
32
|
-
atleast2 = freq > 1
|
|
33
|
-
return unique[atleast2], freq[atleast2]
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
def find_repeats(arr):
|
|
37
|
-
# Make sure we get a copy. ma.compressed promises a "new array", but can
|
|
38
|
-
# actually return a reference.
|
|
39
|
-
compr = np.asarray(ma.compressed(arr), dtype=np.float64)
|
|
40
|
-
try:
|
|
41
|
-
need_copy = np.may_share_memory(compr, arr)
|
|
42
|
-
except AttributeError:
|
|
43
|
-
# numpy < 1.8.2 bug: np.may_share_memory([], []) raises,
|
|
44
|
-
# while in numpy 1.8.2 and above it just (correctly) returns False.
|
|
45
|
-
need_copy = False
|
|
46
|
-
if need_copy:
|
|
47
|
-
compr = compr.copy()
|
|
48
|
-
return _find_repeats(compr)
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
def rankdata(data, axis=None, use_missing=False):
|
|
52
|
-
def _rank1d(data, use_missing=False):
|
|
53
|
-
n = data.count()
|
|
54
|
-
rk = np.empty(data.size, dtype=float)
|
|
55
|
-
idx = data.argsort()
|
|
56
|
-
rk[idx[:n]] = np.arange(1, n + 1)
|
|
57
|
-
|
|
58
|
-
if use_missing:
|
|
59
|
-
rk[idx[n:]] = (n + 1) / 2.0
|
|
60
|
-
else:
|
|
61
|
-
rk[idx[n:]] = 0
|
|
62
|
-
|
|
63
|
-
repeats = find_repeats(data.copy())
|
|
64
|
-
for r in repeats[0]:
|
|
65
|
-
condition = (data == r).filled(False)
|
|
66
|
-
rk[condition] = rk[condition].mean()
|
|
67
|
-
return rk
|
|
68
|
-
|
|
69
|
-
data = ma.array(data, copy=False)
|
|
70
|
-
if axis is None:
|
|
71
|
-
if data.ndim > 1:
|
|
72
|
-
return _rank1d(data.ravel(), use_missing).reshape(data.shape)
|
|
73
|
-
else:
|
|
74
|
-
return _rank1d(data, use_missing)
|
|
75
|
-
else:
|
|
76
|
-
return ma.apply_along_axis(_rank1d, axis, data, use_missing).view(ndarray)
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
def _chk_asarray(a, axis):
|
|
80
|
-
# Always returns a masked array, raveled for axis=None
|
|
81
|
-
a = ma.asanyarray(a)
|
|
82
|
-
if axis is None:
|
|
83
|
-
a = ma.ravel(a)
|
|
84
|
-
outaxis = 0
|
|
85
|
-
else:
|
|
86
|
-
outaxis = axis
|
|
87
|
-
return a, outaxis
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
SpearmanrResult = namedtuple("SpearmanrResult", ("correlation", "pvalue"))
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
# Taken from scipy.mstats with following tweaks:
|
|
94
|
-
# 1. parallel pairwise computation
|
|
95
|
-
# 2. custom masking
|
|
96
|
-
def spearmanr(
|
|
97
|
-
x, y=None, use_ties=True, axis=None, nan_policy="propagate", alternative="two-sided", mask_fn=ma.masked_invalid
|
|
98
|
-
):
|
|
99
|
-
if not use_ties:
|
|
100
|
-
raise ValueError("`use_ties=False` is not supported in SciPy >= 1.2.0")
|
|
101
|
-
|
|
102
|
-
# Always returns a masked array, raveled if axis=None
|
|
103
|
-
x, axisout = _chk_asarray(x, axis)
|
|
104
|
-
if y is not None:
|
|
105
|
-
# Deal only with 2-D `x` case.
|
|
106
|
-
y, _ = _chk_asarray(y, axis)
|
|
107
|
-
if axisout == 0:
|
|
108
|
-
x = ma.column_stack((x, y))
|
|
109
|
-
else:
|
|
110
|
-
x = ma.row_stack((x, y))
|
|
111
|
-
|
|
112
|
-
if axisout == 1:
|
|
113
|
-
# To simplify the code that follow (always use `n_obs, n_vars` shape)
|
|
114
|
-
x = x.T
|
|
115
|
-
|
|
116
|
-
if nan_policy == "omit":
|
|
117
|
-
x = mask_fn(x)
|
|
118
|
-
|
|
119
|
-
def _spearmanr_2cols(x):
|
|
120
|
-
# Mask the same observations for all variables, and then drop those
|
|
121
|
-
# observations (can't leave them masked, rankdata is weird).
|
|
122
|
-
x = ma.mask_rowcols(x, axis=0)
|
|
123
|
-
x = x[~x.mask.any(axis=1), :]
|
|
124
|
-
|
|
125
|
-
# If either column is entirely NaN or Inf
|
|
126
|
-
if not np.any(x.data):
|
|
127
|
-
return SpearmanrResult(np.nan, np.nan)
|
|
128
|
-
|
|
129
|
-
m = ma.getmask(x)
|
|
130
|
-
n_obs = x.shape[0]
|
|
131
|
-
dof = n_obs - 2 - int(m.sum(axis=0)[0])
|
|
132
|
-
if dof < 0:
|
|
133
|
-
return SpearmanrResult(np.nan, np.nan)
|
|
134
|
-
|
|
135
|
-
# Gets the ranks and rank differences
|
|
136
|
-
x_ranked = rankdata(x, axis=0)
|
|
137
|
-
rs = ma.corrcoef(x_ranked, rowvar=False).data
|
|
138
|
-
|
|
139
|
-
# rs can have elements equal to 1, so avoid zero division warnings
|
|
140
|
-
with np.errstate(divide="ignore"):
|
|
141
|
-
# clip the small negative values possibly caused by rounding
|
|
142
|
-
# errors before taking the square root
|
|
143
|
-
t = rs * np.sqrt((dof / ((rs + 1.0) * (1.0 - rs))).clip(0))
|
|
144
|
-
|
|
145
|
-
t, prob = scipy.stats._mstats_basic._ttest_finish(dof, t, alternative)
|
|
146
|
-
|
|
147
|
-
# For backwards compatibility, return scalars when comparing 2 columns
|
|
148
|
-
if rs.shape == (2, 2):
|
|
149
|
-
return SpearmanrResult(rs[1, 0], prob[1, 0])
|
|
150
|
-
else:
|
|
151
|
-
return SpearmanrResult(rs, prob)
|
|
152
|
-
|
|
153
|
-
# Need to do this per pair of variables, otherwise the dropped observations
|
|
154
|
-
# in a third column mess up the result for a pair.
|
|
155
|
-
n_vars = x.shape[1]
|
|
156
|
-
if n_vars == 2:
|
|
157
|
-
return _spearmanr_2cols(x)
|
|
158
|
-
else:
|
|
159
|
-
max_cpu_cores = cpu_count(logical=False)
|
|
160
|
-
with np.errstate(divide="ignore"):
|
|
161
|
-
results = Parallel(n_jobs=max_cpu_cores)(
|
|
162
|
-
delayed(_spearmanr_2cols)(x[:, [var1, var2]])
|
|
163
|
-
for var1 in range(n_vars - 1)
|
|
164
|
-
for var2 in range(var1 + 1, n_vars)
|
|
165
|
-
)
|
|
166
|
-
|
|
167
|
-
rs = np.ones((n_vars, n_vars), dtype=float)
|
|
168
|
-
prob = np.zeros((n_vars, n_vars), dtype=float)
|
|
169
|
-
for var1 in range(n_vars - 1):
|
|
170
|
-
for var2 in range(var1 + 1, n_vars):
|
|
171
|
-
result = results.pop(0)
|
|
172
|
-
rs[var1, var2] = result.correlation
|
|
173
|
-
rs[var2, var1] = result.correlation
|
|
174
|
-
prob[var1, var2] = result.pvalue
|
|
175
|
-
prob[var2, var1] = result.pvalue
|
|
176
|
-
|
|
177
|
-
return SpearmanrResult(rs, prob)
|
|
@@ -1,160 +0,0 @@
|
|
|
1
|
-
import hashlib
|
|
2
|
-
from typing import Any, Dict, List
|
|
3
|
-
|
|
4
|
-
import numpy as np
|
|
5
|
-
import pandas as pd
|
|
6
|
-
from joblib import Parallel, delayed
|
|
7
|
-
from pandas.api.types import is_datetime64_any_dtype, is_numeric_dtype
|
|
8
|
-
from psutil import cpu_count
|
|
9
|
-
from scipy.stats import skew, spearmanr
|
|
10
|
-
|
|
11
|
-
from upgini.metadata import ModelTaskType, SearchKey
|
|
12
|
-
from upgini.utils import mstats
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
def sort_columns(
|
|
16
|
-
df: pd.DataFrame,
|
|
17
|
-
target_column: str,
|
|
18
|
-
search_keys: Dict[str, SearchKey],
|
|
19
|
-
model_task_type: ModelTaskType,
|
|
20
|
-
exclude_columns: List[str],
|
|
21
|
-
) -> List[str]:
|
|
22
|
-
df = df.copy() # avoid side effects
|
|
23
|
-
sorted_keys = sorted(search_keys.keys(), key=lambda x: str(search_keys.get(x)))
|
|
24
|
-
sorted_keys = [k for k in sorted_keys if k not in exclude_columns]
|
|
25
|
-
|
|
26
|
-
other_columns = sorted(
|
|
27
|
-
[
|
|
28
|
-
c
|
|
29
|
-
for c in df.columns
|
|
30
|
-
if c not in sorted_keys
|
|
31
|
-
and c not in exclude_columns
|
|
32
|
-
and df[c].nunique() > 1
|
|
33
|
-
]
|
|
34
|
-
)
|
|
35
|
-
|
|
36
|
-
target = prepare_target(df[target_column], model_task_type)
|
|
37
|
-
sort_dict = get_sort_columns_dict(df[sorted_keys + other_columns], target, sorted_keys, omit_nan=True)
|
|
38
|
-
other_columns = [c for c in other_columns if c in sort_dict]
|
|
39
|
-
columns_for_sort = sorted_keys + sorted(other_columns, key=lambda e: sort_dict[e], reverse=True)
|
|
40
|
-
return columns_for_sort
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
def get_sort_columns_dict(
|
|
44
|
-
df: pd.DataFrame,
|
|
45
|
-
target: pd.Series,
|
|
46
|
-
sorted_keys: List[str],
|
|
47
|
-
omit_nan: bool,
|
|
48
|
-
n_jobs: int | None = None,
|
|
49
|
-
) -> dict[str, Any]:
|
|
50
|
-
string_features = [c for c in df.select_dtypes(exclude=[np.number]).columns if c not in sorted_keys]
|
|
51
|
-
columns_for_sort = [c for c in df.columns if c not in sorted_keys + string_features]
|
|
52
|
-
if len(string_features) > 0:
|
|
53
|
-
if len(df) > len(df.drop(columns=string_features).drop_duplicates()):
|
|
54
|
-
# factorize string features
|
|
55
|
-
for c in string_features:
|
|
56
|
-
df[c] = df[c].factorize(sort=True)[0]
|
|
57
|
-
columns_for_sort.extend(string_features)
|
|
58
|
-
|
|
59
|
-
if len(columns_for_sort) == 0:
|
|
60
|
-
return {}
|
|
61
|
-
|
|
62
|
-
df = df[columns_for_sort]
|
|
63
|
-
hashes = [hash_series(df[col]) for col in columns_for_sort]
|
|
64
|
-
df = np.asarray(df, dtype=np.float32)
|
|
65
|
-
correlations = get_sort_columns_correlations(df, target, omit_nan, n_jobs)
|
|
66
|
-
|
|
67
|
-
sort_dict = {col: (corr, h) for col, corr, h in zip(columns_for_sort, correlations, hashes)}
|
|
68
|
-
return sort_dict
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
def get_sort_columns_correlations(df: np.ndarray, target: pd.Series, omit_nan: bool, n_jobs: int | None = None):
|
|
72
|
-
target_correlations = get_target_correlations(df, target, omit_nan, n_jobs, precision=7)
|
|
73
|
-
|
|
74
|
-
return np.max(target_correlations, axis=0)
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
def get_target_correlations(
|
|
78
|
-
df: np.ndarray, target: pd.Series, omit_nan: bool, n_jobs: int | None = None, precision: int = 15
|
|
79
|
-
):
|
|
80
|
-
df = np.asarray(df, dtype=np.float32)
|
|
81
|
-
target_correlations = np.zeros((2, df.shape[1]))
|
|
82
|
-
target_correlations[0, :] = np.nan_to_num(
|
|
83
|
-
calculate_spearman_corr_with_target(df, target, omit_nan, n_jobs), copy=False
|
|
84
|
-
)
|
|
85
|
-
target_correlations[1, :] = np.nan_to_num(np.abs(np.corrcoef(df.T, target.T, rowvar=True)[-1, :-1]))
|
|
86
|
-
|
|
87
|
-
target_correlations = np.trunc(target_correlations * 10**precision) / (10**precision)
|
|
88
|
-
|
|
89
|
-
return target_correlations
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
def corr_dict_from_sort_dict(sort_dict: dict[str, tuple[float, int]]) -> dict[str, float]:
|
|
93
|
-
return {k: v[0] for k, v in sort_dict.items()}
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
def calculate_spearman_corr_with_target(
|
|
97
|
-
X: pd.DataFrame | np.ndarray, y: pd.Series, omit_nan: bool = False, n_jobs: int | None = None
|
|
98
|
-
) -> np.ndarray:
|
|
99
|
-
if isinstance(X, pd.DataFrame):
|
|
100
|
-
X = np.asarray(X, dtype=np.float32)
|
|
101
|
-
|
|
102
|
-
if X.size == 0:
|
|
103
|
-
return np.ndarray(shape=(0,))
|
|
104
|
-
|
|
105
|
-
all_correlations = np.zeros(X.shape[1])
|
|
106
|
-
all_correlations.fill(np.nan)
|
|
107
|
-
cols2calc = np.where([c.size > 0 and not (c == c[0]).all() for c in X.T])[0]
|
|
108
|
-
|
|
109
|
-
if omit_nan:
|
|
110
|
-
results = Parallel(n_jobs=n_jobs or cpu_count(logical=False))(
|
|
111
|
-
delayed(mstats.spearmanr)(
|
|
112
|
-
X[:, i],
|
|
113
|
-
y,
|
|
114
|
-
nan_policy="omit",
|
|
115
|
-
axis=0,
|
|
116
|
-
)
|
|
117
|
-
for i in cols2calc
|
|
118
|
-
)
|
|
119
|
-
target_correlations = np.array([abs(res.correlation) for res in results])
|
|
120
|
-
else:
|
|
121
|
-
cols2calc = cols2calc[np.where(~np.isnan(X[:, cols2calc]).any(axis=0))[0]]
|
|
122
|
-
target_correlations = calculate_spearman(X[:, cols2calc], y, nan_policy="raise")
|
|
123
|
-
if isinstance(target_correlations, float):
|
|
124
|
-
target_correlations = np.abs([target_correlations])
|
|
125
|
-
else:
|
|
126
|
-
target_correlations = np.abs(target_correlations)[-1, :-1]
|
|
127
|
-
|
|
128
|
-
all_correlations[cols2calc] = target_correlations
|
|
129
|
-
|
|
130
|
-
return all_correlations
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
def calculate_spearman(X: np.ndarray, y: pd.Series | None, nan_policy: str):
|
|
134
|
-
features_num = X.shape[1]
|
|
135
|
-
if y is not None:
|
|
136
|
-
features_num += 1
|
|
137
|
-
|
|
138
|
-
if features_num < 2:
|
|
139
|
-
return 1.0
|
|
140
|
-
else:
|
|
141
|
-
return spearmanr(X, y, nan_policy=nan_policy).correlation
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
def hash_series(series: pd.Series) -> int:
|
|
145
|
-
return int(hashlib.sha256(pd.util.hash_pandas_object(series, index=True).values).hexdigest(), 16)
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
def prepare_target(target: pd.Series, model_task_type: ModelTaskType) -> pd.Series:
|
|
149
|
-
target_name = target.name
|
|
150
|
-
if model_task_type != ModelTaskType.REGRESSION or (
|
|
151
|
-
not is_numeric_dtype(target) and not is_datetime64_any_dtype(target)
|
|
152
|
-
):
|
|
153
|
-
target = target.astype(str).astype("category").cat.codes
|
|
154
|
-
|
|
155
|
-
elif model_task_type == ModelTaskType.REGRESSION:
|
|
156
|
-
skewness = round(abs(skew(target)), 2)
|
|
157
|
-
if (target.min() >= 0) and (skewness >= 0.9):
|
|
158
|
-
target = np.log1p(target)
|
|
159
|
-
|
|
160
|
-
return pd.Series(target, name=target_name)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{upgini-1.2.58a1 → upgini-1.2.59a3818.dev1}/src/upgini/resource_bundle/strings_widget.properties
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|