upgini 1.2.60__tar.gz → 1.2.60a3792.dev2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- {upgini-1.2.60 → upgini-1.2.60a3792.dev2}/PKG-INFO +1 -2
- {upgini-1.2.60 → upgini-1.2.60a3792.dev2}/pyproject.toml +0 -1
- upgini-1.2.60a3792.dev2/src/upgini/__about__.py +1 -0
- {upgini-1.2.60 → upgini-1.2.60a3792.dev2}/src/upgini/autofe/date.py +1 -1
- {upgini-1.2.60 → upgini-1.2.60a3792.dev2}/src/upgini/dataset.py +17 -7
- {upgini-1.2.60 → upgini-1.2.60a3792.dev2}/src/upgini/features_enricher.py +45 -107
- {upgini-1.2.60 → upgini-1.2.60a3792.dev2}/src/upgini/metrics.py +7 -4
- {upgini-1.2.60 → upgini-1.2.60a3792.dev2}/src/upgini/resource_bundle/strings.properties +0 -1
- {upgini-1.2.60 → upgini-1.2.60a3792.dev2}/src/upgini/utils/datetime_utils.py +0 -2
- {upgini-1.2.60 → upgini-1.2.60a3792.dev2}/src/upgini/utils/target_utils.py +57 -4
- upgini-1.2.60a3792.dev2/src/upgini/utils/ts_utils.py +47 -0
- upgini-1.2.60/src/upgini/__about__.py +0 -1
- upgini-1.2.60/src/upgini/utils/mstats.py +0 -177
- upgini-1.2.60/src/upgini/utils/sort.py +0 -172
- {upgini-1.2.60 → upgini-1.2.60a3792.dev2}/.gitignore +0 -0
- {upgini-1.2.60 → upgini-1.2.60a3792.dev2}/LICENSE +0 -0
- {upgini-1.2.60 → upgini-1.2.60a3792.dev2}/README.md +0 -0
- {upgini-1.2.60 → upgini-1.2.60a3792.dev2}/src/upgini/__init__.py +0 -0
- {upgini-1.2.60 → upgini-1.2.60a3792.dev2}/src/upgini/ads.py +0 -0
- {upgini-1.2.60 → upgini-1.2.60a3792.dev2}/src/upgini/ads_management/__init__.py +0 -0
- {upgini-1.2.60 → upgini-1.2.60a3792.dev2}/src/upgini/ads_management/ads_manager.py +0 -0
- {upgini-1.2.60 → upgini-1.2.60a3792.dev2}/src/upgini/autofe/__init__.py +0 -0
- {upgini-1.2.60 → upgini-1.2.60a3792.dev2}/src/upgini/autofe/all_operands.py +0 -0
- {upgini-1.2.60 → upgini-1.2.60a3792.dev2}/src/upgini/autofe/binary.py +0 -0
- {upgini-1.2.60 → upgini-1.2.60a3792.dev2}/src/upgini/autofe/feature.py +0 -0
- {upgini-1.2.60 → upgini-1.2.60a3792.dev2}/src/upgini/autofe/groupby.py +0 -0
- {upgini-1.2.60 → upgini-1.2.60a3792.dev2}/src/upgini/autofe/operand.py +0 -0
- {upgini-1.2.60 → upgini-1.2.60a3792.dev2}/src/upgini/autofe/unary.py +0 -0
- {upgini-1.2.60 → upgini-1.2.60a3792.dev2}/src/upgini/autofe/vector.py +0 -0
- {upgini-1.2.60 → upgini-1.2.60a3792.dev2}/src/upgini/data_source/__init__.py +0 -0
- {upgini-1.2.60 → upgini-1.2.60a3792.dev2}/src/upgini/data_source/data_source_publisher.py +0 -0
- {upgini-1.2.60 → upgini-1.2.60a3792.dev2}/src/upgini/errors.py +0 -0
- {upgini-1.2.60 → upgini-1.2.60a3792.dev2}/src/upgini/http.py +0 -0
- {upgini-1.2.60 → upgini-1.2.60a3792.dev2}/src/upgini/lazy_import.py +0 -0
- {upgini-1.2.60 → upgini-1.2.60a3792.dev2}/src/upgini/mdc/__init__.py +0 -0
- {upgini-1.2.60 → upgini-1.2.60a3792.dev2}/src/upgini/mdc/context.py +0 -0
- {upgini-1.2.60 → upgini-1.2.60a3792.dev2}/src/upgini/metadata.py +0 -0
- {upgini-1.2.60 → upgini-1.2.60a3792.dev2}/src/upgini/normalizer/__init__.py +0 -0
- {upgini-1.2.60 → upgini-1.2.60a3792.dev2}/src/upgini/normalizer/normalize_utils.py +0 -0
- {upgini-1.2.60 → upgini-1.2.60a3792.dev2}/src/upgini/resource_bundle/__init__.py +0 -0
- {upgini-1.2.60 → upgini-1.2.60a3792.dev2}/src/upgini/resource_bundle/exceptions.py +0 -0
- {upgini-1.2.60 → upgini-1.2.60a3792.dev2}/src/upgini/resource_bundle/strings_widget.properties +0 -0
- {upgini-1.2.60 → upgini-1.2.60a3792.dev2}/src/upgini/sampler/__init__.py +0 -0
- {upgini-1.2.60 → upgini-1.2.60a3792.dev2}/src/upgini/sampler/base.py +0 -0
- {upgini-1.2.60 → upgini-1.2.60a3792.dev2}/src/upgini/sampler/random_under_sampler.py +0 -0
- {upgini-1.2.60 → upgini-1.2.60a3792.dev2}/src/upgini/sampler/utils.py +0 -0
- {upgini-1.2.60 → upgini-1.2.60a3792.dev2}/src/upgini/search_task.py +0 -0
- {upgini-1.2.60 → upgini-1.2.60a3792.dev2}/src/upgini/spinner.py +0 -0
- {upgini-1.2.60 → upgini-1.2.60a3792.dev2}/src/upgini/utils/Roboto-Regular.ttf +0 -0
- {upgini-1.2.60 → upgini-1.2.60a3792.dev2}/src/upgini/utils/__init__.py +0 -0
- {upgini-1.2.60 → upgini-1.2.60a3792.dev2}/src/upgini/utils/base_search_key_detector.py +0 -0
- {upgini-1.2.60 → upgini-1.2.60a3792.dev2}/src/upgini/utils/blocked_time_series.py +0 -0
- {upgini-1.2.60 → upgini-1.2.60a3792.dev2}/src/upgini/utils/country_utils.py +0 -0
- {upgini-1.2.60 → upgini-1.2.60a3792.dev2}/src/upgini/utils/custom_loss_utils.py +0 -0
- {upgini-1.2.60 → upgini-1.2.60a3792.dev2}/src/upgini/utils/cv_utils.py +0 -0
- {upgini-1.2.60 → upgini-1.2.60a3792.dev2}/src/upgini/utils/deduplicate_utils.py +0 -0
- {upgini-1.2.60 → upgini-1.2.60a3792.dev2}/src/upgini/utils/display_utils.py +0 -0
- {upgini-1.2.60 → upgini-1.2.60a3792.dev2}/src/upgini/utils/email_utils.py +0 -0
- {upgini-1.2.60 → upgini-1.2.60a3792.dev2}/src/upgini/utils/fallback_progress_bar.py +0 -0
- {upgini-1.2.60 → upgini-1.2.60a3792.dev2}/src/upgini/utils/feature_info.py +0 -0
- {upgini-1.2.60 → upgini-1.2.60a3792.dev2}/src/upgini/utils/features_validator.py +0 -0
- {upgini-1.2.60 → upgini-1.2.60a3792.dev2}/src/upgini/utils/format.py +0 -0
- {upgini-1.2.60 → upgini-1.2.60a3792.dev2}/src/upgini/utils/ip_utils.py +0 -0
- {upgini-1.2.60 → upgini-1.2.60a3792.dev2}/src/upgini/utils/phone_utils.py +0 -0
- {upgini-1.2.60 → upgini-1.2.60a3792.dev2}/src/upgini/utils/postal_code_utils.py +0 -0
- {upgini-1.2.60 → upgini-1.2.60a3792.dev2}/src/upgini/utils/progress_bar.py +0 -0
- {upgini-1.2.60 → upgini-1.2.60a3792.dev2}/src/upgini/utils/sklearn_ext.py +0 -0
- {upgini-1.2.60 → upgini-1.2.60a3792.dev2}/src/upgini/utils/track_info.py +0 -0
- {upgini-1.2.60 → upgini-1.2.60a3792.dev2}/src/upgini/utils/warning_counter.py +0 -0
- {upgini-1.2.60 → upgini-1.2.60a3792.dev2}/src/upgini/version_validator.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: upgini
|
|
3
|
-
Version: 1.2.
|
|
3
|
+
Version: 1.2.60a3792.dev2
|
|
4
4
|
Summary: Intelligent data search & enrichment for Machine Learning
|
|
5
5
|
Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
|
|
6
6
|
Project-URL: Homepage, https://upgini.com/
|
|
@@ -30,7 +30,6 @@ Requires-Dist: jarowinkler>=2.0.0
|
|
|
30
30
|
Requires-Dist: levenshtein>=0.25.1
|
|
31
31
|
Requires-Dist: numpy<=1.26.4,>=1.19.0
|
|
32
32
|
Requires-Dist: pandas<3.0.0,>=1.1.0
|
|
33
|
-
Requires-Dist: psutil>=6.0.0
|
|
34
33
|
Requires-Dist: pydantic<3.0.0,>1.0.0
|
|
35
34
|
Requires-Dist: pyjwt>=2.8.0
|
|
36
35
|
Requires-Dist: python-bidi==0.4.2
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "1.2.60a3792.dev2"
|
|
@@ -40,7 +40,7 @@ from upgini.utils.email_utils import EmailSearchKeyConverter
|
|
|
40
40
|
from upgini.utils.target_utils import (
|
|
41
41
|
balance_undersample,
|
|
42
42
|
balance_undersample_forced,
|
|
43
|
-
|
|
43
|
+
balance_undersample_time_series_trunc,
|
|
44
44
|
)
|
|
45
45
|
|
|
46
46
|
try:
|
|
@@ -58,6 +58,8 @@ class Dataset: # (pd.DataFrame):
|
|
|
58
58
|
FIT_SAMPLE_THRESHOLD = 200_000
|
|
59
59
|
FIT_SAMPLE_WITH_EVAL_SET_ROWS = 200_000
|
|
60
60
|
FIT_SAMPLE_WITH_EVAL_SET_THRESHOLD = 200_000
|
|
61
|
+
FIT_SAMPLE_THRESHOLD_TS = 54_000
|
|
62
|
+
FIT_SAMPLE_ROWS_TS = 54_000
|
|
61
63
|
BINARY_MIN_SAMPLE_THRESHOLD = 5_000
|
|
62
64
|
MULTICLASS_MIN_SAMPLE_THRESHOLD = 25_000
|
|
63
65
|
IMBALANCE_THESHOLD = 0.6
|
|
@@ -301,7 +303,10 @@ class Dataset: # (pd.DataFrame):
|
|
|
301
303
|
)
|
|
302
304
|
|
|
303
305
|
# Resample over fit threshold
|
|
304
|
-
if
|
|
306
|
+
if self.cv_type is not None and self.cv_type.is_time_series():
|
|
307
|
+
sample_threshold = self.FIT_SAMPLE_THRESHOLD_TS
|
|
308
|
+
sample_rows = self.FIT_SAMPLE_ROWS_TS
|
|
309
|
+
elif not self.imbalanced and EVAL_SET_INDEX in self.data.columns:
|
|
305
310
|
sample_threshold = self.FIT_SAMPLE_WITH_EVAL_SET_THRESHOLD
|
|
306
311
|
sample_rows = self.FIT_SAMPLE_WITH_EVAL_SET_ROWS
|
|
307
312
|
else:
|
|
@@ -314,7 +319,7 @@ class Dataset: # (pd.DataFrame):
|
|
|
314
319
|
f"and will be downsampled to {sample_rows}"
|
|
315
320
|
)
|
|
316
321
|
if self.cv_type is not None and self.cv_type.is_time_series():
|
|
317
|
-
resampled_data =
|
|
322
|
+
resampled_data = balance_undersample_time_series_trunc(
|
|
318
323
|
df=self.data,
|
|
319
324
|
id_columns=self.id_columns,
|
|
320
325
|
date_column=next(
|
|
@@ -584,10 +589,7 @@ class Dataset: # (pd.DataFrame):
|
|
|
584
589
|
return search_customization
|
|
585
590
|
|
|
586
591
|
def _rename_generate_features(self, runtime_parameters: Optional[RuntimeParameters]) -> Optional[RuntimeParameters]:
|
|
587
|
-
if
|
|
588
|
-
runtime_parameters is not None
|
|
589
|
-
and runtime_parameters.properties is not None
|
|
590
|
-
):
|
|
592
|
+
if runtime_parameters is not None and runtime_parameters.properties is not None:
|
|
591
593
|
if "generate_features" in runtime_parameters.properties:
|
|
592
594
|
generate_features = runtime_parameters.properties["generate_features"].split(",")
|
|
593
595
|
renamed_generate_features = []
|
|
@@ -607,6 +609,13 @@ class Dataset: # (pd.DataFrame):
|
|
|
607
609
|
|
|
608
610
|
return runtime_parameters
|
|
609
611
|
|
|
612
|
+
def _set_sample_size(self, runtime_parameters: Optional[RuntimeParameters]) -> Optional[RuntimeParameters]:
|
|
613
|
+
if runtime_parameters is not None and runtime_parameters.properties is not None:
|
|
614
|
+
if self.cv_type is not None and self.cv_type.is_time_series():
|
|
615
|
+
runtime_parameters.properties["sample_size"] = self.FIT_SAMPLE_ROWS_TS
|
|
616
|
+
runtime_parameters.properties["iter0_sample_size"] = self.FIT_SAMPLE_ROWS_TS
|
|
617
|
+
return runtime_parameters
|
|
618
|
+
|
|
610
619
|
def _clean_generate_features(self, runtime_parameters: Optional[RuntimeParameters]) -> Optional[RuntimeParameters]:
|
|
611
620
|
if (
|
|
612
621
|
runtime_parameters is not None
|
|
@@ -638,6 +647,7 @@ class Dataset: # (pd.DataFrame):
|
|
|
638
647
|
file_metrics = FileMetrics()
|
|
639
648
|
|
|
640
649
|
runtime_parameters = self._rename_generate_features(runtime_parameters)
|
|
650
|
+
runtime_parameters = self._set_sample_size(runtime_parameters)
|
|
641
651
|
|
|
642
652
|
file_metadata = self.__construct_metadata(exclude_features_sources)
|
|
643
653
|
search_customization = self.__construct_search_customization(
|
|
@@ -112,7 +112,6 @@ try:
|
|
|
112
112
|
except Exception:
|
|
113
113
|
from upgini.utils.fallback_progress_bar import CustomFallbackProgressBar as ProgressBar
|
|
114
114
|
|
|
115
|
-
from upgini.utils.sort import sort_columns
|
|
116
115
|
from upgini.utils.target_utils import (
|
|
117
116
|
balance_undersample_forced,
|
|
118
117
|
calculate_psi,
|
|
@@ -1262,7 +1261,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1262
1261
|
for feature, shap in new_shaps.items()
|
|
1263
1262
|
if feature in self.feature_names_ or renaming.get(feature, feature) in self.feature_names_
|
|
1264
1263
|
}
|
|
1265
|
-
self.__prepare_feature_importances(trace_id, x_columns, new_shaps)
|
|
1264
|
+
self.__prepare_feature_importances(trace_id, x_columns, new_shaps, silent=True)
|
|
1266
1265
|
|
|
1267
1266
|
if self.features_info_display_handle is not None:
|
|
1268
1267
|
try:
|
|
@@ -1569,23 +1568,9 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1569
1568
|
|
|
1570
1569
|
fitting_eval_set_dict = {}
|
|
1571
1570
|
fitting_x_columns = fitting_X.columns.to_list()
|
|
1572
|
-
|
|
1573
|
-
fitting_x_columns = sort_columns(
|
|
1574
|
-
fitting_X, y_sorted, search_keys, self.model_task_type, sort_all_columns=True, logger=self.logger
|
|
1575
|
-
)
|
|
1576
|
-
fitting_X = fitting_X[fitting_x_columns]
|
|
1577
|
-
self.logger.info(f"Final sorted list of fitting X columns: {fitting_x_columns}")
|
|
1571
|
+
self.logger.info(f"Final list of fitting X columns: {fitting_x_columns}")
|
|
1578
1572
|
fitting_enriched_x_columns = fitting_enriched_X.columns.to_list()
|
|
1579
|
-
|
|
1580
|
-
fitting_enriched_X,
|
|
1581
|
-
enriched_y_sorted,
|
|
1582
|
-
search_keys,
|
|
1583
|
-
self.model_task_type,
|
|
1584
|
-
sort_all_columns=True,
|
|
1585
|
-
logger=self.logger,
|
|
1586
|
-
)
|
|
1587
|
-
fitting_enriched_X = fitting_enriched_X[fitting_enriched_x_columns]
|
|
1588
|
-
self.logger.info(f"Final sorted list of fitting enriched X columns: {fitting_enriched_x_columns}")
|
|
1573
|
+
self.logger.info(f"Final list of fitting enriched X columns: {fitting_enriched_x_columns}")
|
|
1589
1574
|
for idx, eval_tuple in eval_set_sampled_dict.items():
|
|
1590
1575
|
eval_X_sampled, enriched_eval_X, eval_y_sampled = eval_tuple
|
|
1591
1576
|
eval_X_sorted, eval_y_sorted = self._sort_by_system_record_id(eval_X_sampled, eval_y_sampled, self.cv)
|
|
@@ -1749,15 +1734,11 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1749
1734
|
if eval_set is not None
|
|
1750
1735
|
else (Dataset.FIT_SAMPLE_THRESHOLD, Dataset.FIT_SAMPLE_ROWS)
|
|
1751
1736
|
)
|
|
1752
|
-
|
|
1753
|
-
df = self.__add_fit_system_record_id(df, search_keys, SYSTEM_RECORD_ID, TARGET, columns_renaming, silent=True)
|
|
1754
|
-
# Sample after sorting by system_record_id for idempotency
|
|
1755
|
-
df.sort_values(by=SYSTEM_RECORD_ID, inplace=True)
|
|
1756
|
-
|
|
1757
1737
|
if num_samples > sample_threshold:
|
|
1758
1738
|
self.logger.info(f"Downsampling from {num_samples} to {sample_rows}")
|
|
1759
1739
|
df = df.sample(n=sample_rows, random_state=self.random_state)
|
|
1760
1740
|
|
|
1741
|
+
df = self.__add_fit_system_record_id(df, search_keys, SYSTEM_RECORD_ID)
|
|
1761
1742
|
if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
|
|
1762
1743
|
df = df.drop(columns=DateTimeSearchKeyConverter.DATETIME_COL)
|
|
1763
1744
|
|
|
@@ -1901,7 +1882,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1901
1882
|
and self.columns_for_online_api is not None
|
|
1902
1883
|
and num_samples > Dataset.FORCE_SAMPLE_SIZE
|
|
1903
1884
|
)
|
|
1904
|
-
# TODO: check that system_record_id was added before this step
|
|
1905
1885
|
if force_downsampling:
|
|
1906
1886
|
self.logger.info(f"Force downsampling from {num_samples} to {Dataset.FORCE_SAMPLE_SIZE}")
|
|
1907
1887
|
df = balance_undersample_forced(
|
|
@@ -1935,7 +1915,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1935
1915
|
progress_bar=progress_bar,
|
|
1936
1916
|
progress_callback=progress_callback,
|
|
1937
1917
|
add_fit_system_record_id=True,
|
|
1938
|
-
target_name=tmp_target_name,
|
|
1939
1918
|
)
|
|
1940
1919
|
if enriched_df is None:
|
|
1941
1920
|
return None
|
|
@@ -1974,7 +1953,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1974
1953
|
and self.columns_for_online_api is not None
|
|
1975
1954
|
and num_samples > Dataset.FORCE_SAMPLE_SIZE
|
|
1976
1955
|
)
|
|
1977
|
-
|
|
1978
1956
|
if force_downsampling:
|
|
1979
1957
|
self.logger.info(f"Force downsampling from {num_samples} to {Dataset.FORCE_SAMPLE_SIZE}")
|
|
1980
1958
|
df = balance_undersample_forced(
|
|
@@ -2006,7 +1984,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2006
1984
|
progress_bar=progress_bar,
|
|
2007
1985
|
progress_callback=progress_callback,
|
|
2008
1986
|
add_fit_system_record_id=True,
|
|
2009
|
-
target_name=tmp_target_name,
|
|
2010
1987
|
)
|
|
2011
1988
|
if enriched_Xy is None:
|
|
2012
1989
|
return None
|
|
@@ -2168,7 +2145,6 @@ if response.status_code == 200:
|
|
|
2168
2145
|
progress_bar: Optional[ProgressBar] = None,
|
|
2169
2146
|
progress_callback: Optional[Callable[[SearchProgress], Any]] = None,
|
|
2170
2147
|
add_fit_system_record_id: bool = False,
|
|
2171
|
-
target_name: Optional[str] = None,
|
|
2172
2148
|
) -> Tuple[pd.DataFrame, Dict[str, str], List[str]]:
|
|
2173
2149
|
if self._search_task is None:
|
|
2174
2150
|
raise NotFittedError(self.bundle.get("transform_unfitted_enricher"))
|
|
@@ -2353,16 +2329,8 @@ if response.status_code == 200:
|
|
|
2353
2329
|
and c not in [ENTITY_SYSTEM_RECORD_ID, SEARCH_KEY_UNNEST]
|
|
2354
2330
|
]
|
|
2355
2331
|
|
|
2356
|
-
if add_fit_system_record_id
|
|
2357
|
-
|
|
2358
|
-
df = self.__add_fit_system_record_id(
|
|
2359
|
-
df,
|
|
2360
|
-
search_keys,
|
|
2361
|
-
SYSTEM_RECORD_ID,
|
|
2362
|
-
reversed_columns_renaming.get(target_name, target_name),
|
|
2363
|
-
columns_renaming,
|
|
2364
|
-
silent=True,
|
|
2365
|
-
)
|
|
2332
|
+
if add_fit_system_record_id:
|
|
2333
|
+
df = self.__add_fit_system_record_id(df, search_keys, SYSTEM_RECORD_ID)
|
|
2366
2334
|
df = df.rename(columns={SYSTEM_RECORD_ID: SORT_ID})
|
|
2367
2335
|
features_not_to_pass.append(SORT_ID)
|
|
2368
2336
|
|
|
@@ -2807,9 +2775,7 @@ if response.status_code == 200:
|
|
|
2807
2775
|
self.__log_warning(full_duplicates_warning)
|
|
2808
2776
|
|
|
2809
2777
|
# Explode multiple search keys
|
|
2810
|
-
df = self.__add_fit_system_record_id(
|
|
2811
|
-
df, self.fit_search_keys, ENTITY_SYSTEM_RECORD_ID, TARGET, self.fit_columns_renaming
|
|
2812
|
-
)
|
|
2778
|
+
df = self.__add_fit_system_record_id(df, self.fit_search_keys, ENTITY_SYSTEM_RECORD_ID)
|
|
2813
2779
|
|
|
2814
2780
|
# TODO check that this is correct for enrichment
|
|
2815
2781
|
self.df_with_original_index = df.copy()
|
|
@@ -2891,9 +2857,7 @@ if response.status_code == 200:
|
|
|
2891
2857
|
if eval_set is not None and len(eval_set) > 0:
|
|
2892
2858
|
meaning_types[EVAL_SET_INDEX] = FileColumnMeaningType.EVAL_SET_INDEX
|
|
2893
2859
|
|
|
2894
|
-
df = self.__add_fit_system_record_id(
|
|
2895
|
-
df, self.fit_search_keys, SYSTEM_RECORD_ID, TARGET, self.fit_columns_renaming, silent=True
|
|
2896
|
-
)
|
|
2860
|
+
df = self.__add_fit_system_record_id(df, self.fit_search_keys, SYSTEM_RECORD_ID)
|
|
2897
2861
|
|
|
2898
2862
|
if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
|
|
2899
2863
|
df = df.drop(columns=DateTimeSearchKeyConverter.DATETIME_COL)
|
|
@@ -3580,82 +3544,56 @@ if response.status_code == 200:
|
|
|
3580
3544
|
def __add_fit_system_record_id(
|
|
3581
3545
|
self,
|
|
3582
3546
|
df: pd.DataFrame,
|
|
3547
|
+
# meaning_types: Dict[str, FileColumnMeaningType],
|
|
3583
3548
|
search_keys: Dict[str, SearchKey],
|
|
3584
3549
|
id_name: str,
|
|
3585
|
-
target_name: str,
|
|
3586
|
-
columns_renaming: Dict[str, str],
|
|
3587
|
-
silent: bool = False,
|
|
3588
3550
|
) -> pd.DataFrame:
|
|
3551
|
+
# save original order of rows
|
|
3589
3552
|
original_index_name = df.index.name
|
|
3590
3553
|
index_name = df.index.name or DEFAULT_INDEX
|
|
3591
3554
|
original_order_name = "original_order"
|
|
3592
|
-
# Save original index
|
|
3593
3555
|
df = df.reset_index().rename(columns={index_name: ORIGINAL_INDEX})
|
|
3594
|
-
# Save original order
|
|
3595
3556
|
df = df.reset_index().rename(columns={DEFAULT_INDEX: original_order_name})
|
|
3596
3557
|
|
|
3597
|
-
# order by date and idempotent order by other keys
|
|
3558
|
+
# order by date and idempotent order by other keys
|
|
3559
|
+
if self.cv not in [CVType.time_series, CVType.blocked_time_series]:
|
|
3560
|
+
sort_exclude_columns = [
|
|
3561
|
+
original_order_name,
|
|
3562
|
+
ORIGINAL_INDEX,
|
|
3563
|
+
EVAL_SET_INDEX,
|
|
3564
|
+
TARGET,
|
|
3565
|
+
"__target",
|
|
3566
|
+
ENTITY_SYSTEM_RECORD_ID,
|
|
3567
|
+
]
|
|
3568
|
+
if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
|
|
3569
|
+
date_column = DateTimeSearchKeyConverter.DATETIME_COL
|
|
3570
|
+
sort_exclude_columns.append(self._get_date_column(search_keys))
|
|
3571
|
+
else:
|
|
3572
|
+
date_column = self._get_date_column(search_keys)
|
|
3573
|
+
sort_columns = [date_column] if date_column is not None else []
|
|
3598
3574
|
|
|
3599
|
-
|
|
3600
|
-
|
|
3601
|
-
ORIGINAL_INDEX,
|
|
3602
|
-
EVAL_SET_INDEX,
|
|
3603
|
-
TARGET,
|
|
3604
|
-
"__target",
|
|
3605
|
-
ENTITY_SYSTEM_RECORD_ID,
|
|
3606
|
-
]
|
|
3607
|
-
if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
|
|
3608
|
-
date_column = DateTimeSearchKeyConverter.DATETIME_COL
|
|
3609
|
-
sort_exclude_columns.append(FeaturesEnricher._get_date_column(search_keys))
|
|
3610
|
-
else:
|
|
3611
|
-
date_column = FeaturesEnricher._get_date_column(search_keys)
|
|
3612
|
-
sort_exclude_columns.append(date_column)
|
|
3613
|
-
columns_to_sort = [date_column] if date_column is not None else []
|
|
3614
|
-
|
|
3615
|
-
do_sorting = True
|
|
3616
|
-
if self.id_columns and self.cv in [CVType.time_series, CVType.blocked_time_series]:
|
|
3617
|
-
# Check duplicates by date and id_columns
|
|
3618
|
-
reversed_columns_renaming = {v: k for k, v in columns_renaming.items()}
|
|
3619
|
-
renamed_id_columns = [reversed_columns_renaming.get(c, c) for c in self.id_columns]
|
|
3620
|
-
duplicate_check_columns = [c for c in renamed_id_columns if c in df.columns]
|
|
3621
|
-
if date_column is not None:
|
|
3622
|
-
duplicate_check_columns.append(date_column)
|
|
3575
|
+
sorted_other_keys = sorted(search_keys, key=lambda x: str(search_keys.get(x)))
|
|
3576
|
+
sorted_other_keys = [k for k in sorted_other_keys if k not in sort_exclude_columns]
|
|
3623
3577
|
|
|
3624
|
-
|
|
3625
|
-
|
|
3626
|
-
|
|
3627
|
-
|
|
3628
|
-
|
|
3629
|
-
|
|
3630
|
-
|
|
3631
|
-
|
|
3632
|
-
|
|
3633
|
-
do_sorting = False
|
|
3634
|
-
else:
|
|
3635
|
-
columns_to_hash = list(search_keys.keys()) + renamed_id_columns + [target_name]
|
|
3636
|
-
columns_to_hash = sort_columns(
|
|
3637
|
-
df[columns_to_hash],
|
|
3638
|
-
target_name,
|
|
3639
|
-
search_keys,
|
|
3640
|
-
self.model_task_type,
|
|
3641
|
-
sort_exclude_columns,
|
|
3642
|
-
logger=self.logger,
|
|
3643
|
-
)
|
|
3644
|
-
else:
|
|
3645
|
-
columns_to_hash = sort_columns(
|
|
3646
|
-
df, target_name, search_keys, self.model_task_type, sort_exclude_columns, logger=self.logger
|
|
3578
|
+
other_columns = sorted(
|
|
3579
|
+
[
|
|
3580
|
+
c
|
|
3581
|
+
for c in df.columns
|
|
3582
|
+
if c not in sort_columns
|
|
3583
|
+
and c not in sorted_other_keys
|
|
3584
|
+
and c not in sort_exclude_columns
|
|
3585
|
+
and df[c].nunique() > 1
|
|
3586
|
+
]
|
|
3647
3587
|
)
|
|
3648
|
-
|
|
3588
|
+
|
|
3589
|
+
all_other_columns = sorted_other_keys + other_columns
|
|
3590
|
+
|
|
3649
3591
|
search_keys_hash = "search_keys_hash"
|
|
3650
|
-
if len(
|
|
3651
|
-
|
|
3652
|
-
|
|
3653
|
-
|
|
3654
|
-
|
|
3655
|
-
df[search_keys_hash] = pd.util.hash_pandas_object(factorized_df[columns_to_hash], index=False)
|
|
3656
|
-
columns_to_sort.append(search_keys_hash)
|
|
3657
|
-
|
|
3658
|
-
df = df.sort_values(by=columns_to_sort)
|
|
3592
|
+
if len(all_other_columns) > 0:
|
|
3593
|
+
sort_columns.append(search_keys_hash)
|
|
3594
|
+
df[search_keys_hash] = pd.util.hash_pandas_object(df[all_other_columns], index=False)
|
|
3595
|
+
|
|
3596
|
+
df = df.sort_values(by=sort_columns)
|
|
3659
3597
|
|
|
3660
3598
|
if search_keys_hash in df.columns:
|
|
3661
3599
|
df.drop(columns=search_keys_hash, inplace=True)
|
|
@@ -30,8 +30,8 @@ except ImportError:
|
|
|
30
30
|
from sklearn.metrics._regression import (
|
|
31
31
|
_check_reg_targets,
|
|
32
32
|
check_consistent_length,
|
|
33
|
+
mean_squared_error,
|
|
33
34
|
)
|
|
34
|
-
from sklearn.metrics import mean_squared_error
|
|
35
35
|
from sklearn.model_selection import BaseCrossValidator
|
|
36
36
|
|
|
37
37
|
from upgini.errors import ValidationError
|
|
@@ -289,6 +289,9 @@ class EstimatorWrapper:
|
|
|
289
289
|
else:
|
|
290
290
|
x, y = self._remove_empty_target_rows(x, y)
|
|
291
291
|
|
|
292
|
+
# Make order of columns idempotent
|
|
293
|
+
x = x[sorted(x.columns)]
|
|
294
|
+
|
|
292
295
|
self.logger.info(f"After preparing data columns: {x.columns.to_list()}")
|
|
293
296
|
return x, y, groups
|
|
294
297
|
|
|
@@ -566,7 +569,7 @@ class CatBoostWrapper(EstimatorWrapper):
|
|
|
566
569
|
if all([isinstance(c, int) for c in estimator_cat_features]):
|
|
567
570
|
cat_features_idx = {x.columns.get_loc(c) for c in self.cat_features}
|
|
568
571
|
cat_features_idx.update(estimator_cat_features)
|
|
569
|
-
self.cat_features = [x.columns[idx] for idx in cat_features_idx]
|
|
572
|
+
self.cat_features = [x.columns[idx] for idx in sorted(cat_features_idx)]
|
|
570
573
|
elif all([isinstance(c, str) for c in estimator_cat_features]):
|
|
571
574
|
self.cat_features = list(set(self.cat_features + estimator_cat_features))
|
|
572
575
|
else:
|
|
@@ -937,13 +940,13 @@ def _ext_mean_squared_log_error(y_true, y_pred, *, sample_weight=None, multioutp
|
|
|
937
940
|
if (y_true < 0).any():
|
|
938
941
|
raise ValidationError(bundle.get("metrics_msle_negative_target"))
|
|
939
942
|
|
|
940
|
-
|
|
943
|
+
return mean_squared_error(
|
|
941
944
|
log1p(y_true),
|
|
942
945
|
log1p(y_pred.clip(0)),
|
|
943
946
|
sample_weight=sample_weight,
|
|
944
947
|
multioutput=multioutput,
|
|
948
|
+
squared=squared,
|
|
945
949
|
)
|
|
946
|
-
return mse if squared else np.sqrt(mse)
|
|
947
950
|
|
|
948
951
|
|
|
949
952
|
def fill_na_cat_features(df: pd.DataFrame, cat_features: List[str]) -> pd.DataFrame:
|
|
@@ -35,7 +35,6 @@ trial_quota_limit_riched=You have reached the quota limit of trial data usage. P
|
|
|
35
35
|
loss_selection_warn=Loss `{0}` is not supported for feature selection with {1}
|
|
36
36
|
loss_calc_metrics_warn=Loss `{0}` is not supported for metrics calculation with {1}
|
|
37
37
|
multivariate_timeseries_detected=Multivariate TimeSeries detected. Blocked time series cross-validation split selected.\nMore details: https://github.com/upgini/upgini#-time-series-prediction-support
|
|
38
|
-
date_and_id_columns_duplicates=Found {} duplicate rows by date and id_columns
|
|
39
38
|
group_k_fold_in_classification=Using group K-fold cross-validation split for classification task.
|
|
40
39
|
current_date_added=No date/datetime column was detected in X to be used as a search key. The current date will be used to match the latest version of data sources
|
|
41
40
|
# Errors
|
|
@@ -166,8 +166,6 @@ class DateTimeSearchKeyConverter:
|
|
|
166
166
|
|
|
167
167
|
# Drop intermediate columns if not needed
|
|
168
168
|
df.drop(columns=["second", "minute", "hour"], inplace=True)
|
|
169
|
-
else:
|
|
170
|
-
keep_time = False
|
|
171
169
|
|
|
172
170
|
for generated_feature in self.generated_features[:]:
|
|
173
171
|
if df[generated_feature].dropna().nunique() <= 1:
|
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
import itertools
|
|
1
2
|
import logging
|
|
2
3
|
from typing import Callable, List, Optional, Union
|
|
3
4
|
|
|
@@ -9,6 +10,7 @@ from upgini.errors import ValidationError
|
|
|
9
10
|
from upgini.metadata import SYSTEM_RECORD_ID, CVType, ModelTaskType
|
|
10
11
|
from upgini.resource_bundle import ResourceBundle, bundle, get_custom_bundle
|
|
11
12
|
from upgini.sampler.random_under_sampler import RandomUnderSampler
|
|
13
|
+
from upgini.utils.ts_utils import get_most_frequent_time_unit, trunc_datetime
|
|
12
14
|
|
|
13
15
|
TS_MIN_DIFFERENT_IDS_RATIO = 0.2
|
|
14
16
|
|
|
@@ -206,7 +208,7 @@ def balance_undersample_forced(
|
|
|
206
208
|
id_columns: List[str],
|
|
207
209
|
date_column: str,
|
|
208
210
|
task_type: ModelTaskType,
|
|
209
|
-
cv_type:
|
|
211
|
+
cv_type: CVType | None,
|
|
210
212
|
random_state: int,
|
|
211
213
|
sample_size: int = 7000,
|
|
212
214
|
logger: Optional[logging.Logger] = None,
|
|
@@ -240,7 +242,7 @@ def balance_undersample_forced(
|
|
|
240
242
|
df = df.copy().sort_values(by=SYSTEM_RECORD_ID)
|
|
241
243
|
if cv_type is not None and cv_type.is_time_series():
|
|
242
244
|
logger.warning(f"Sampling time series dataset from {len(df)} to {sample_size}")
|
|
243
|
-
resampled_data =
|
|
245
|
+
resampled_data = balance_undersample_time_series_trunc(
|
|
244
246
|
df,
|
|
245
247
|
id_columns=id_columns,
|
|
246
248
|
date_column=date_column,
|
|
@@ -279,6 +281,58 @@ def balance_undersample_forced(
|
|
|
279
281
|
return resampled_data
|
|
280
282
|
|
|
281
283
|
|
|
284
|
+
DEFAULT_HIGH_FREQ_TRUNC_LENGTHS = [pd.DateOffset(years=2, months=6), pd.DateOffset(years=2, days=7)]
|
|
285
|
+
DEFAULT_LOW_FREQ_TRUNC_LENGTHS = [pd.DateOffset(years=7), pd.DateOffset(years=5)]
|
|
286
|
+
DEFAULT_TIME_UNIT_THRESHOLD = pd.Timedelta(weeks=4)
|
|
287
|
+
|
|
288
|
+
|
|
289
|
+
def balance_undersample_time_series_trunc(
|
|
290
|
+
df: pd.DataFrame,
|
|
291
|
+
id_columns: List[str],
|
|
292
|
+
date_column: str,
|
|
293
|
+
sample_size: int,
|
|
294
|
+
random_state: int = 42,
|
|
295
|
+
logger: Optional[logging.Logger] = None,
|
|
296
|
+
highfreq_trunc_lengths: List[pd.DateOffset] = DEFAULT_HIGH_FREQ_TRUNC_LENGTHS,
|
|
297
|
+
lowfreq_trunc_lengths: List[pd.DateOffset] = DEFAULT_LOW_FREQ_TRUNC_LENGTHS,
|
|
298
|
+
time_unit_threshold: pd.Timedelta = DEFAULT_TIME_UNIT_THRESHOLD,
|
|
299
|
+
**kwargs,
|
|
300
|
+
):
|
|
301
|
+
# Convert date column to datetime
|
|
302
|
+
dates_df = df[id_columns + [date_column]].copy()
|
|
303
|
+
dates_df[date_column] = pd.to_datetime(dates_df[date_column], unit="ms")
|
|
304
|
+
|
|
305
|
+
time_unit = get_most_frequent_time_unit(dates_df, id_columns, date_column)
|
|
306
|
+
if logger is not None:
|
|
307
|
+
logger.info(f"Time unit: {time_unit}")
|
|
308
|
+
|
|
309
|
+
if time_unit is None:
|
|
310
|
+
if logger is not None:
|
|
311
|
+
logger.info("Cannot detect time unit, returning original dataset")
|
|
312
|
+
return df
|
|
313
|
+
|
|
314
|
+
if time_unit < time_unit_threshold:
|
|
315
|
+
for trunc_length in highfreq_trunc_lengths:
|
|
316
|
+
sampled_df = trunc_datetime(dates_df, id_columns, date_column, trunc_length, logger=logger)
|
|
317
|
+
if len(sampled_df) <= sample_size:
|
|
318
|
+
break
|
|
319
|
+
if len(sampled_df) > sample_size:
|
|
320
|
+
sampled_df = balance_undersample_time_series(
|
|
321
|
+
sampled_df, id_columns, date_column, sample_size, random_state, logger=logger, **kwargs
|
|
322
|
+
)
|
|
323
|
+
else:
|
|
324
|
+
for trunc_length in lowfreq_trunc_lengths:
|
|
325
|
+
sampled_df = trunc_datetime(dates_df, id_columns, date_column, trunc_length, logger=logger)
|
|
326
|
+
if len(sampled_df) <= sample_size:
|
|
327
|
+
break
|
|
328
|
+
if len(sampled_df) > sample_size:
|
|
329
|
+
sampled_df = balance_undersample_time_series(
|
|
330
|
+
sampled_df, id_columns, date_column, sample_size, random_state, logger=logger, **kwargs
|
|
331
|
+
)
|
|
332
|
+
|
|
333
|
+
return df.loc[sampled_df.index]
|
|
334
|
+
|
|
335
|
+
|
|
282
336
|
def balance_undersample_time_series(
|
|
283
337
|
df: pd.DataFrame,
|
|
284
338
|
id_columns: List[str],
|
|
@@ -318,8 +372,7 @@ def balance_undersample_time_series(
|
|
|
318
372
|
if len(id_counts) < min_different_ids:
|
|
319
373
|
if logger is not None:
|
|
320
374
|
logger.info(
|
|
321
|
-
f"Different ids count {len(id_counts)} for sample size {sample_size}"
|
|
322
|
-
f" is less than min different ids {min_different_ids}, sampling time window"
|
|
375
|
+
f"Different ids count {len(id_counts)} for sample size {sample_size} is less than min different ids {min_different_ids}, sampling time window"
|
|
323
376
|
)
|
|
324
377
|
date_counts = df.groupby(id_columns)[date_column].nunique().sort_values(ascending=False)
|
|
325
378
|
ids_to_sample = date_counts.index[:min_different_ids] if len(id_counts) > 0 else date_counts.index
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from typing import List, Optional
|
|
3
|
+
import pandas as pd
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def get_most_frequent_time_unit(df: pd.DataFrame, id_columns: List[str], date_column: str) -> Optional[pd.DateOffset]:
    """Return the most common gap between consecutive distinct dates as an offset.

    Gaps are computed separately inside every ``id_columns`` group (or over the
    whole frame when ``id_columns`` is empty) and pooled together. When several
    gaps are equally frequent, the smallest one is returned.

    Args:
        df: Frame holding the time series rows.
        id_columns: Columns identifying an individual series; may be empty.
        date_column: Name of the datetime column.

    Returns:
        The most frequent gap converted with ``pandas.tseries.frequencies.to_offset``,
        or ``None`` when no group has at least two distinct non-null dates.
    """
    groups = df.groupby(id_columns) if id_columns else [(None, df)]

    # Gaps between consecutive distinct dates, in seconds, pooled over all groups.
    all_diffs = []
    for _, group in groups:
        # Missing dates are dropped first: a NaT would produce a NaN gap that
        # cannot be turned into an offset.
        group_dates = group[date_column].dropna().sort_values().unique()
        if len(group_dates) > 1:
            gaps = pd.Series(group_dates[1:] - group_dates[:-1])
            all_diffs.extend(gaps.dt.total_seconds())

    if not all_diffs:
        return None

    # The mode is taken on the numeric gaps and only the winner is converted,
    # instead of building one offset object per gap.
    most_frequent_seconds = pd.Series(all_diffs).mode().min()
    most_frequent_unit = pd.tseries.frequencies.to_offset(pd.Timedelta(most_frequent_seconds, unit="s"))

    return most_frequent_unit if isinstance(most_frequent_unit, pd.DateOffset) else None
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def trunc_datetime(
    df: pd.DataFrame,
    id_columns: List[str],
    date_column: str,
    length: pd.DateOffset,
    logger: Optional[logging.Logger] = None,
) -> pd.DataFrame:
    """Keep only the rows whose date lies within ``length`` of the latest date.

    With ``id_columns`` the latest date is taken per group, otherwise over the
    whole frame. The comparison is strict, so a row exactly ``length`` before
    the latest date is dropped.
    """
    if logger is not None:
        logger.info(f"Truncating time series dataset to {length}")

    dates = df[date_column]
    if not id_columns:
        # One global cutoff for the whole frame.
        cutoff = dates.max() - length
    else:
        # Row-aligned cutoff: each row gets the cutoff of its own group.
        cutoff = df.groupby(id_columns)[date_column].transform(lambda series: series.max() - length)
    return df[dates > cutoff]
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
__version__ = "1.2.60"
|
|
@@ -1,177 +0,0 @@
|
|
|
1
|
-
import warnings
|
|
2
|
-
from collections import namedtuple
|
|
3
|
-
|
|
4
|
-
import numpy as np
|
|
5
|
-
import numpy.ma as ma
|
|
6
|
-
import scipy
|
|
7
|
-
from joblib import Parallel, delayed
|
|
8
|
-
from numpy import ndarray
|
|
9
|
-
from psutil import cpu_count
|
|
10
|
-
|
|
11
|
-
np.seterr(divide="ignore")
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
warnings.simplefilter(action="ignore", category=RuntimeWarning)
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
def _find_repeats(arr):
|
|
18
|
-
# This function assumes it may clobber its input.
|
|
19
|
-
if len(arr) == 0:
|
|
20
|
-
return np.array(0, np.float64), np.array(0, np.intp)
|
|
21
|
-
|
|
22
|
-
# XXX This cast was previously needed for the Fortran implementation,
|
|
23
|
-
# should we ditch it?
|
|
24
|
-
arr = np.asarray(arr, np.float64).ravel()
|
|
25
|
-
arr.sort()
|
|
26
|
-
|
|
27
|
-
# Taken from NumPy 1.9's np.unique.
|
|
28
|
-
change = np.concatenate(([True], arr[1:] != arr[:-1]))
|
|
29
|
-
unique = arr[change]
|
|
30
|
-
change_idx = np.concatenate(np.nonzero(change) + ([arr.size],))
|
|
31
|
-
freq = np.diff(change_idx)
|
|
32
|
-
atleast2 = freq > 1
|
|
33
|
-
return unique[atleast2], freq[atleast2]
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
def find_repeats(arr):
    """Find repeated values among the unmasked entries of ``arr``.

    The compressed data is copied first when it may share memory with ``arr``,
    because ``_find_repeats`` is allowed to sort its argument in place.
    """
    unmasked = np.asarray(ma.compressed(arr), dtype=np.float64)
    try:
        aliases_input = np.may_share_memory(unmasked, arr)
    except AttributeError:
        # numpy < 1.8.2 bug: np.may_share_memory([], []) raises,
        # while in numpy 1.8.2 and above it just (correctly) returns False.
        aliases_input = False
    return _find_repeats(unmasked.copy() if aliases_input else unmasked)
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
def rankdata(data, axis=None, use_missing=False):
    """Rank ``data`` (1 = smallest), averaging the ranks of tied values.

    Masked entries get rank 0, or the average rank ``(n + 1) / 2`` of the ``n``
    unmasked entries when ``use_missing`` is true. With ``axis=None`` the data
    is ranked as one flattened sequence and reshaped back; otherwise ranking is
    applied along ``axis``.
    """

    def _rank1d(data, use_missing=False):
        valid_count = data.count()
        order = data.argsort()
        ranks = np.empty(data.size, dtype=float)
        # The first ``valid_count`` positions of the sort order are ranked
        # 1..n; the remaining positions receive the placeholder rank.
        ranks[order[:valid_count]] = np.arange(1, valid_count + 1)
        ranks[order[valid_count:]] = (valid_count + 1) / 2.0 if use_missing else 0

        tied_values = find_repeats(data.copy())[0]
        for tied in tied_values:
            same_value = (data == tied).filled(False)
            ranks[same_value] = ranks[same_value].mean()
        return ranks

    data = ma.array(data, copy=False)
    if axis is not None:
        return ma.apply_along_axis(_rank1d, axis, data, use_missing).view(ndarray)
    if data.ndim > 1:
        return _rank1d(data.ravel(), use_missing).reshape(data.shape)
    return _rank1d(data, use_missing)
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
def _chk_asarray(a, axis):
|
|
80
|
-
# Always returns a masked array, raveled for axis=None
|
|
81
|
-
a = ma.asanyarray(a)
|
|
82
|
-
if axis is None:
|
|
83
|
-
a = ma.ravel(a)
|
|
84
|
-
outaxis = 0
|
|
85
|
-
else:
|
|
86
|
-
outaxis = axis
|
|
87
|
-
return a, outaxis
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
SpearmanrResult = namedtuple("SpearmanrResult", ("correlation", "pvalue"))
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
# Taken from scipy.mstats with following tweaks:
|
|
94
|
-
# 1. parallel pairwise computation
|
|
95
|
-
# 2. custom masking
|
|
96
|
-
def spearmanr(
    x, y=None, use_ties=True, axis=None, nan_policy="propagate", alternative="two-sided", mask_fn=ma.masked_invalid
):
    """Compute Spearman rank correlations and p-values on masked data.

    ``x`` (optionally stacked with ``y``) is arranged as ``(n_obs, n_vars)``.
    With exactly two variables a single correlation/p-value pair is returned;
    with more variables every pair of columns is evaluated independently
    through joblib and the results are assembled into symmetric matrices.

    Args:
        x: Observations; converted to a masked array.
        y: Optional second set of observations stacked next to ``x``.
        use_ties: Must be true; ``False`` raises ``ValueError``.
        axis: Axis along which observations run; ``None`` flattens the inputs.
        nan_policy: Only the value ``"omit"`` is acted on here, in which case
            ``mask_fn`` is applied to the data. Other values are not checked.
        alternative: Forwarded to the SciPy t-test helper.
        mask_fn: Callable producing the masked array used when
            ``nan_policy == "omit"``.

    Returns:
        ``SpearmanrResult(correlation, pvalue)``: scalars for two variables,
        ``(n_vars, n_vars)`` arrays otherwise.

    Raises:
        ValueError: If ``use_ties`` is false.
    """
    if not use_ties:
        raise ValueError("`use_ties=False` is not supported in SciPy >= 1.2.0")

    # Always returns a masked array, raveled if axis=None
    x, axisout = _chk_asarray(x, axis)
    if y is not None:
        # Deal only with 2-D `x` case.
        y, _ = _chk_asarray(y, axis)
        if axisout == 0:
            x = ma.column_stack((x, y))
        else:
            x = ma.row_stack((x, y))

    if axisout == 1:
        # To simplify the code that follows (always use `n_obs, n_vars` shape)
        x = x.T

    if nan_policy == "omit":
        x = mask_fn(x)

    def _spearmanr_2cols(x):
        # Mask the same observations for all variables, and then drop those
        # observations (can't leave them masked, rankdata is weird).
        x = ma.mask_rowcols(x, axis=0)
        x = x[~x.mask.any(axis=1), :]

        # If either column is entirely NaN or Inf
        # NOTE(review): the check below is true when every remaining value is
        # falsy (e.g. zero or nothing left) - confirm it matches the comment above.
        if not np.any(x.data):
            return SpearmanrResult(np.nan, np.nan)

        m = ma.getmask(x)
        n_obs = x.shape[0]
        # Degrees of freedom: observations minus 2, minus masked rows of column 0.
        dof = n_obs - 2 - int(m.sum(axis=0)[0])
        if dof < 0:
            return SpearmanrResult(np.nan, np.nan)

        # Gets the ranks and rank differences
        x_ranked = rankdata(x, axis=0)
        rs = ma.corrcoef(x_ranked, rowvar=False).data

        # rs can have elements equal to 1, so avoid zero division warnings
        with np.errstate(divide="ignore"):
            # clip the small negative values possibly caused by rounding
            # errors before taking the square root
            t = rs * np.sqrt((dof / ((rs + 1.0) * (1.0 - rs))).clip(0))

        # NOTE(review): `_ttest_finish` is a private SciPy helper - confirm it
        # exists in every supported SciPy version.
        t, prob = scipy.stats._mstats_basic._ttest_finish(dof, t, alternative)

        # For backwards compatibility, return scalars when comparing 2 columns
        if rs.shape == (2, 2):
            return SpearmanrResult(rs[1, 0], prob[1, 0])
        else:
            return SpearmanrResult(rs, prob)

    # Need to do this per pair of variables, otherwise the dropped observations
    # in a third column mess up the result for a pair.
    n_vars = x.shape[1]
    if n_vars == 2:
        return _spearmanr_2cols(x)
    else:
        max_cpu_cores = cpu_count(logical=False)
        with np.errstate(divide="ignore"):
            # One task per unordered column pair (var1 < var2), in row-major order.
            results = Parallel(n_jobs=max_cpu_cores)(
                delayed(_spearmanr_2cols)(x[:, [var1, var2]])
                for var1 in range(n_vars - 1)
                for var2 in range(var1 + 1, n_vars)
            )

        # Diagonal defaults: correlation 1 and p-value 0 of a variable with itself.
        rs = np.ones((n_vars, n_vars), dtype=float)
        prob = np.zeros((n_vars, n_vars), dtype=float)
        # Consume the results in the same pair order in which they were submitted.
        for var1 in range(n_vars - 1):
            for var2 in range(var1 + 1, n_vars):
                result = results.pop(0)
                rs[var1, var2] = result.correlation
                rs[var2, var1] = result.correlation
                prob[var1, var2] = result.pvalue
                prob[var2, var1] = result.pvalue

        return SpearmanrResult(rs, prob)
|
|
@@ -1,172 +0,0 @@
|
|
|
1
|
-
import hashlib
|
|
2
|
-
import logging
|
|
3
|
-
from typing import Any, Dict, List, Optional, Union
|
|
4
|
-
|
|
5
|
-
import numpy as np
|
|
6
|
-
import pandas as pd
|
|
7
|
-
from joblib import Parallel, delayed
|
|
8
|
-
from pandas.api.types import is_datetime64_any_dtype, is_numeric_dtype
|
|
9
|
-
from psutil import cpu_count
|
|
10
|
-
from scipy.stats import skew, spearmanr
|
|
11
|
-
|
|
12
|
-
from upgini.metadata import ModelTaskType, SearchKey
|
|
13
|
-
from upgini.utils import mstats
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
def sort_columns(
    df: pd.DataFrame,
    target_column: Union[str, pd.Series],
    search_keys: Dict[str, SearchKey],
    model_task_type: ModelTaskType,
    exclude_columns: Optional[List[str]] = None,
    sort_all_columns: bool = False,
    logger: Optional[logging.Logger] = None,
) -> List[str]:
    """Return column names in a deterministic order.

    Search-key columns come first, ordered by the string form of their
    ``SearchKey``. The remaining columns follow, ordered by descending sort key
    as computed by ``get_sort_columns_dict``.

    Args:
        df: Input frame; it is copied and never modified.
        target_column: Target series, or the name of the target column in ``df``.
        search_keys: Mapping of column name to its search key type.
        model_task_type: Task type, used to prepare the target.
        exclude_columns: Columns left out of the result.
        sort_all_columns: Also include columns with a single distinct value.
        logger: Logger for diagnostics; when omitted nothing is logged.

    Returns:
        Ordered list of column names.
    """
    if exclude_columns is None:
        exclude_columns = []
    if logger is None:
        # A private, unregistered logger silences output without changing the
        # level of the shared module logger for every other user.
        logger = logging.Logger(__name__, level=logging.FATAL)
    df = df.copy()  # avoid side effects

    # Check multiple search keys
    search_key_values = list(search_keys.values())
    if len(search_key_values) != len(set(search_key_values)):
        # Report through the supplied logger rather than the root logger.
        logger.warning(f"WARNING: Found duplicate SearchKey values in search_keys: {search_keys}")

    sorted_keys = sorted(search_keys.keys(), key=lambda x: str(search_keys.get(x)))
    sorted_keys = [k for k in sorted_keys if k in df.columns and k not in exclude_columns]

    other_columns = sorted(
        [
            c
            for c in df.columns
            if c not in sorted_keys and c not in exclude_columns and (df[c].nunique() > 1 or sort_all_columns)
        ]
    )
    target = target_column if isinstance(target_column, pd.Series) else df[target_column]
    target = prepare_target(target, model_task_type)
    sort_dict = get_sort_columns_dict(
        df[sorted_keys + other_columns], target, sorted_keys, omit_nan=True, sort_all_columns=sort_all_columns
    )
    # Columns without a sort key (e.g. skipped string features) are dropped.
    other_columns = [c for c in other_columns if c in sort_dict]
    return sorted_keys + sorted(other_columns, key=lambda e: sort_dict[e], reverse=True)
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
def get_sort_columns_dict(
    df: pd.DataFrame,
    target: pd.Series,
    sorted_keys: List[str],
    omit_nan: bool,
    n_jobs: Optional[int] = None,
    sort_all_columns: bool = False,
) -> Dict[str, Any]:
    """Map each sortable column to its ``(correlation, hash)`` sort key.

    Columns listed in ``sorted_keys`` are never included. Non-numeric columns
    are factorized and included only when the numeric columns alone contain
    duplicate rows, or when ``sort_all_columns`` is set.
    """
    non_numeric = df.select_dtypes(exclude=[np.number]).columns
    string_features = [name for name in non_numeric if name not in sorted_keys]
    skipped = sorted_keys + string_features
    columns_for_sort = [name for name in df.columns if name not in skipped]

    if string_features and (
        len(df) > len(df.drop(columns=string_features).drop_duplicates()) or sort_all_columns
    ):
        # factorize string features
        for name in string_features:
            df.loc[:, name] = pd.Series(df[name].factorize(sort=True)[0], index=df.index, dtype="int")
        columns_for_sort.extend(string_features)

    if not columns_for_sort:
        return {}

    selected = df[columns_for_sort]
    # Hashes are taken before the frame is converted to a float32 matrix.
    hashes = [hash_series(selected[name]) for name in columns_for_sort]
    matrix = np.asarray(selected, dtype=np.float32)
    correlations = get_sort_columns_correlations(matrix, target, omit_nan, n_jobs)

    return {name: (corr, digest) for name, corr, digest in zip(columns_for_sort, correlations, hashes)}
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
def get_sort_columns_correlations(df: np.ndarray, target: pd.Series, omit_nan: bool, n_jobs: Optional[int] = None):
    """Return, per column, the larger of its two target correlations (7-digit precision)."""
    per_method = get_target_correlations(df, target, omit_nan, n_jobs, precision=7)
    # Rows are correlation methods, columns are features.
    return np.max(per_method, axis=0)
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
def get_target_correlations(
    df: np.ndarray, target: pd.Series, omit_nan: bool, n_jobs: Optional[int] = None, precision: int = 15
):
    """Return a ``(2, n_columns)`` array of absolute correlations with ``target``.

    Row 0 holds the values from ``calculate_spearman_corr_with_target``, row 1
    the absolute Pearson coefficients from ``np.corrcoef``. NaNs are replaced
    through ``np.nan_to_num`` and values are truncated to ``precision``
    decimal digits.
    """
    features = np.asarray(df, dtype=np.float32)

    spearman_row = np.nan_to_num(
        calculate_spearman_corr_with_target(features, target, omit_nan, n_jobs), copy=False
    )
    # The last row of the correlation matrix belongs to the target; its final
    # entry (target with itself) is dropped.
    pearson_row = np.nan_to_num(np.abs(np.corrcoef(features.T, target.T, rowvar=True)[-1, :-1]))

    correlations = np.zeros((2, features.shape[1]))
    correlations[0, :] = spearman_row
    correlations[1, :] = pearson_row

    scale = 10**precision
    return np.trunc(correlations * scale) / scale
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
def calculate_spearman_corr_with_target(
    X: Union[pd.DataFrame, np.ndarray], y: pd.Series, omit_nan: bool = False, n_jobs: Optional[int] = None
) -> np.ndarray:
    """Return the absolute Spearman correlation of every column of ``X`` with ``y``.

    Constant columns keep NaN. With ``omit_nan`` each column is correlated
    separately in parallel, ignoring missing values; otherwise columns
    containing NaN are skipped (left as NaN) and the rest are computed in one
    call.
    """
    if isinstance(X, pd.DataFrame):
        X = np.asarray(X, dtype=np.float32)

    if X.size == 0:
        return np.ndarray(shape=(0,))

    all_correlations = np.full(X.shape[1], np.nan)
    # Only non-empty, non-constant columns are worth correlating.
    is_variable = [col.size > 0 and not (col == col[0]).all() for col in X.T]
    cols2calc = np.where(is_variable)[0]

    if omit_nan:
        workers = n_jobs or cpu_count(logical=False)
        results = Parallel(n_jobs=workers)(
            delayed(mstats.spearmanr)(
                X[:, i],
                y,
                nan_policy="omit",
                axis=0,
            )
            for i in cols2calc
        )
        target_correlations = np.array([abs(res.correlation) for res in results])
    else:
        nan_free = ~np.isnan(X[:, cols2calc]).any(axis=0)
        cols2calc = cols2calc[np.where(nan_free)[0]]
        target_correlations = calculate_spearman(X[:, cols2calc], y, nan_policy="raise")
        if isinstance(target_correlations, float):
            # A scalar result is wrapped so it can be assigned below.
            target_correlations = np.abs([target_correlations])
        else:
            # Matrix result: the last row is the target, without its own entry.
            target_correlations = np.abs(target_correlations)[-1, :-1]

    all_correlations[cols2calc] = target_correlations

    return all_correlations
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
def calculate_spearman(X: np.ndarray, y: Optional[pd.Series], nan_policy: str):
    """Return the Spearman correlation of the columns of ``X`` (plus ``y`` if given).

    When fewer than two variables are available there is nothing to correlate
    and ``1.0`` is returned.
    """
    features_num = X.shape[1] + (0 if y is None else 1)
    if features_num >= 2:
        return spearmanr(X, y, nan_policy=nan_policy).correlation
    return 1.0
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
def hash_series(series: pd.Series) -> int:
    """Return a SHA-256 based integer fingerprint of a series (values and index)."""
    row_hashes = pd.util.hash_pandas_object(series, index=True).values
    digest = hashlib.sha256(row_hashes).digest()
    # Big-endian interpretation of the digest equals parsing its hex form.
    return int.from_bytes(digest, "big")
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
def prepare_target(target: pd.Series, model_task_type: ModelTaskType) -> pd.Series:
    """Convert the target into a numeric series, keeping its name.

    Non-regression targets, and regression targets that are neither numeric
    nor datetime, are replaced by category codes of their string form. Other
    regression targets are passed through ``log1p`` when they are non-negative
    and have an absolute skewness (rounded to two digits) of at least 0.9.
    """
    target_name = target.name
    needs_encoding = model_task_type != ModelTaskType.REGRESSION or not (
        is_numeric_dtype(target) or is_datetime64_any_dtype(target)
    )
    if needs_encoding:
        target = target.astype(str).astype("category").cat.codes
    else:
        # Reached only for regression with a numeric or datetime target.
        skewness = round(abs(skew(target)), 2)
        if (target.min() >= 0) and (skewness >= 0.9):
            target = np.log1p(target)

    return pd.Series(target, name=target_name)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{upgini-1.2.60 → upgini-1.2.60a3792.dev2}/src/upgini/resource_bundle/strings_widget.properties
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|