upgini 1.2.38a3769.dev1__tar.gz → 1.2.38a3769.dev3__tar.gz
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- {upgini-1.2.38a3769.dev1 → upgini-1.2.38a3769.dev3}/PKG-INFO +1 -1
- upgini-1.2.38a3769.dev3/src/upgini/__about__.py +1 -0
- {upgini-1.2.38a3769.dev1 → upgini-1.2.38a3769.dev3}/src/upgini/dataset.py +1 -0
- {upgini-1.2.38a3769.dev1 → upgini-1.2.38a3769.dev3}/src/upgini/features_enricher.py +3 -2
- {upgini-1.2.38a3769.dev1 → upgini-1.2.38a3769.dev3}/src/upgini/utils/target_utils.py +14 -4
- upgini-1.2.38a3769.dev1/src/upgini/__about__.py +0 -1
- {upgini-1.2.38a3769.dev1 → upgini-1.2.38a3769.dev3}/.gitignore +0 -0
- {upgini-1.2.38a3769.dev1 → upgini-1.2.38a3769.dev3}/LICENSE +0 -0
- {upgini-1.2.38a3769.dev1 → upgini-1.2.38a3769.dev3}/README.md +0 -0
- {upgini-1.2.38a3769.dev1 → upgini-1.2.38a3769.dev3}/pyproject.toml +0 -0
- {upgini-1.2.38a3769.dev1 → upgini-1.2.38a3769.dev3}/src/upgini/__init__.py +0 -0
- {upgini-1.2.38a3769.dev1 → upgini-1.2.38a3769.dev3}/src/upgini/ads.py +0 -0
- {upgini-1.2.38a3769.dev1 → upgini-1.2.38a3769.dev3}/src/upgini/ads_management/__init__.py +0 -0
- {upgini-1.2.38a3769.dev1 → upgini-1.2.38a3769.dev3}/src/upgini/ads_management/ads_manager.py +0 -0
- {upgini-1.2.38a3769.dev1 → upgini-1.2.38a3769.dev3}/src/upgini/autofe/__init__.py +0 -0
- {upgini-1.2.38a3769.dev1 → upgini-1.2.38a3769.dev3}/src/upgini/autofe/all_operands.py +0 -0
- {upgini-1.2.38a3769.dev1 → upgini-1.2.38a3769.dev3}/src/upgini/autofe/binary.py +0 -0
- {upgini-1.2.38a3769.dev1 → upgini-1.2.38a3769.dev3}/src/upgini/autofe/date.py +0 -0
- {upgini-1.2.38a3769.dev1 → upgini-1.2.38a3769.dev3}/src/upgini/autofe/feature.py +0 -0
- {upgini-1.2.38a3769.dev1 → upgini-1.2.38a3769.dev3}/src/upgini/autofe/groupby.py +0 -0
- {upgini-1.2.38a3769.dev1 → upgini-1.2.38a3769.dev3}/src/upgini/autofe/operand.py +0 -0
- {upgini-1.2.38a3769.dev1 → upgini-1.2.38a3769.dev3}/src/upgini/autofe/unary.py +0 -0
- {upgini-1.2.38a3769.dev1 → upgini-1.2.38a3769.dev3}/src/upgini/autofe/vector.py +0 -0
- {upgini-1.2.38a3769.dev1 → upgini-1.2.38a3769.dev3}/src/upgini/data_source/__init__.py +0 -0
- {upgini-1.2.38a3769.dev1 → upgini-1.2.38a3769.dev3}/src/upgini/data_source/data_source_publisher.py +0 -0
- {upgini-1.2.38a3769.dev1 → upgini-1.2.38a3769.dev3}/src/upgini/errors.py +0 -0
- {upgini-1.2.38a3769.dev1 → upgini-1.2.38a3769.dev3}/src/upgini/http.py +0 -0
- {upgini-1.2.38a3769.dev1 → upgini-1.2.38a3769.dev3}/src/upgini/lazy_import.py +0 -0
- {upgini-1.2.38a3769.dev1 → upgini-1.2.38a3769.dev3}/src/upgini/mdc/__init__.py +0 -0
- {upgini-1.2.38a3769.dev1 → upgini-1.2.38a3769.dev3}/src/upgini/mdc/context.py +0 -0
- {upgini-1.2.38a3769.dev1 → upgini-1.2.38a3769.dev3}/src/upgini/metadata.py +0 -0
- {upgini-1.2.38a3769.dev1 → upgini-1.2.38a3769.dev3}/src/upgini/metrics.py +0 -0
- {upgini-1.2.38a3769.dev1 → upgini-1.2.38a3769.dev3}/src/upgini/normalizer/__init__.py +0 -0
- {upgini-1.2.38a3769.dev1 → upgini-1.2.38a3769.dev3}/src/upgini/normalizer/normalize_utils.py +0 -0
- {upgini-1.2.38a3769.dev1 → upgini-1.2.38a3769.dev3}/src/upgini/resource_bundle/__init__.py +0 -0
- {upgini-1.2.38a3769.dev1 → upgini-1.2.38a3769.dev3}/src/upgini/resource_bundle/exceptions.py +0 -0
- {upgini-1.2.38a3769.dev1 → upgini-1.2.38a3769.dev3}/src/upgini/resource_bundle/strings.properties +0 -0
- {upgini-1.2.38a3769.dev1 → upgini-1.2.38a3769.dev3}/src/upgini/resource_bundle/strings_widget.properties +0 -0
- {upgini-1.2.38a3769.dev1 → upgini-1.2.38a3769.dev3}/src/upgini/sampler/__init__.py +0 -0
- {upgini-1.2.38a3769.dev1 → upgini-1.2.38a3769.dev3}/src/upgini/sampler/base.py +0 -0
- {upgini-1.2.38a3769.dev1 → upgini-1.2.38a3769.dev3}/src/upgini/sampler/random_under_sampler.py +0 -0
- {upgini-1.2.38a3769.dev1 → upgini-1.2.38a3769.dev3}/src/upgini/sampler/utils.py +0 -0
- {upgini-1.2.38a3769.dev1 → upgini-1.2.38a3769.dev3}/src/upgini/search_task.py +0 -0
- {upgini-1.2.38a3769.dev1 → upgini-1.2.38a3769.dev3}/src/upgini/spinner.py +0 -0
- {upgini-1.2.38a3769.dev1 → upgini-1.2.38a3769.dev3}/src/upgini/utils/Roboto-Regular.ttf +0 -0
- {upgini-1.2.38a3769.dev1 → upgini-1.2.38a3769.dev3}/src/upgini/utils/__init__.py +0 -0
- {upgini-1.2.38a3769.dev1 → upgini-1.2.38a3769.dev3}/src/upgini/utils/base_search_key_detector.py +0 -0
- {upgini-1.2.38a3769.dev1 → upgini-1.2.38a3769.dev3}/src/upgini/utils/blocked_time_series.py +0 -0
- {upgini-1.2.38a3769.dev1 → upgini-1.2.38a3769.dev3}/src/upgini/utils/country_utils.py +0 -0
- {upgini-1.2.38a3769.dev1 → upgini-1.2.38a3769.dev3}/src/upgini/utils/custom_loss_utils.py +0 -0
- {upgini-1.2.38a3769.dev1 → upgini-1.2.38a3769.dev3}/src/upgini/utils/cv_utils.py +0 -0
- {upgini-1.2.38a3769.dev1 → upgini-1.2.38a3769.dev3}/src/upgini/utils/datetime_utils.py +0 -0
- {upgini-1.2.38a3769.dev1 → upgini-1.2.38a3769.dev3}/src/upgini/utils/deduplicate_utils.py +0 -0
- {upgini-1.2.38a3769.dev1 → upgini-1.2.38a3769.dev3}/src/upgini/utils/display_utils.py +0 -0
- {upgini-1.2.38a3769.dev1 → upgini-1.2.38a3769.dev3}/src/upgini/utils/email_utils.py +0 -0
- {upgini-1.2.38a3769.dev1 → upgini-1.2.38a3769.dev3}/src/upgini/utils/fallback_progress_bar.py +0 -0
- {upgini-1.2.38a3769.dev1 → upgini-1.2.38a3769.dev3}/src/upgini/utils/feature_info.py +0 -0
- {upgini-1.2.38a3769.dev1 → upgini-1.2.38a3769.dev3}/src/upgini/utils/features_validator.py +0 -0
- {upgini-1.2.38a3769.dev1 → upgini-1.2.38a3769.dev3}/src/upgini/utils/format.py +0 -0
- {upgini-1.2.38a3769.dev1 → upgini-1.2.38a3769.dev3}/src/upgini/utils/ip_utils.py +0 -0
- {upgini-1.2.38a3769.dev1 → upgini-1.2.38a3769.dev3}/src/upgini/utils/phone_utils.py +0 -0
- {upgini-1.2.38a3769.dev1 → upgini-1.2.38a3769.dev3}/src/upgini/utils/postal_code_utils.py +0 -0
- {upgini-1.2.38a3769.dev1 → upgini-1.2.38a3769.dev3}/src/upgini/utils/progress_bar.py +0 -0
- {upgini-1.2.38a3769.dev1 → upgini-1.2.38a3769.dev3}/src/upgini/utils/sklearn_ext.py +0 -0
- {upgini-1.2.38a3769.dev1 → upgini-1.2.38a3769.dev3}/src/upgini/utils/track_info.py +0 -0
- {upgini-1.2.38a3769.dev1 → upgini-1.2.38a3769.dev3}/src/upgini/utils/warning_counter.py +0 -0
- {upgini-1.2.38a3769.dev1 → upgini-1.2.38a3769.dev3}/src/upgini/version_validator.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: upgini
|
|
3
|
-
Version: 1.2.38a3769.dev1
|
|
3
|
+
Version: 1.2.38a3769.dev3
|
|
4
4
|
Summary: Intelligent data search & enrichment for Machine Learning
|
|
5
5
|
Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
|
|
6
6
|
Project-URL: Homepage, https://upgini.com/
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "1.2.38a3769.dev3"
|
|
@@ -281,8 +281,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
281
281
|
|
|
282
282
|
self.search_keys = search_keys or {}
|
|
283
283
|
self.id_columns = id_columns
|
|
284
|
-
if id_columns is not None:
|
|
285
|
-
self.search_keys.update({col: SearchKey.CUSTOM_KEY for col in id_columns})
|
|
286
284
|
self.country_code = country_code
|
|
287
285
|
self.__validate_search_keys(search_keys, search_id)
|
|
288
286
|
|
|
@@ -2657,6 +2655,9 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2657
2655
|
|
|
2658
2656
|
self.__adjust_cv(df)
|
|
2659
2657
|
|
|
2658
|
+
if self.id_columns is not None and self.cv is not None and self.cv.is_time_series():
|
|
2659
|
+
self.search_keys.update({col: SearchKey.CUSTOM_KEY for col in self.id_columns})
|
|
2660
|
+
|
|
2660
2661
|
df, fintech_warnings = remove_fintech_duplicates(
|
|
2661
2662
|
df, self.fit_search_keys, date_format=self.date_format, logger=self.logger, bundle=self.bundle
|
|
2662
2663
|
)
|
|
@@ -246,6 +246,7 @@ def balance_undersample_forced(
|
|
|
246
246
|
id_columns=id_columns,
|
|
247
247
|
date_column=date_column,
|
|
248
248
|
sample_size=sample_size,
|
|
249
|
+
random_state=random_state,
|
|
249
250
|
logger=logger,
|
|
250
251
|
)
|
|
251
252
|
elif task_type in [ModelTaskType.MULTICLASS, ModelTaskType.REGRESSION]:
|
|
@@ -284,19 +285,28 @@ def balance_undersample_time_series(
|
|
|
284
285
|
id_columns: List[str],
|
|
285
286
|
date_column: str,
|
|
286
287
|
sample_size: int,
|
|
288
|
+
random_state: int = 42,
|
|
287
289
|
min_different_ids_ratio: float = TS_MIN_DIFFERENT_IDS_RATIO,
|
|
290
|
+
prefer_recent_dates: bool = True,
|
|
288
291
|
logger: Optional[logging.Logger] = None,
|
|
289
292
|
):
|
|
290
293
|
def ensure_tuple(x):
|
|
291
294
|
return tuple([x]) if not isinstance(x, tuple) else x
|
|
292
295
|
|
|
296
|
+
random_state = np.random.RandomState(random_state)
|
|
297
|
+
|
|
293
298
|
ids_sort = df.groupby(id_columns)[date_column].aggregate(["max", "count"]).T.to_dict()
|
|
294
|
-
ids_sort = {
|
|
299
|
+
ids_sort = {
|
|
300
|
+
ensure_tuple(k): (
|
|
301
|
+
(v["max"], v["count"], random_state.rand()) if prefer_recent_dates else (v["count"], random_state.rand())
|
|
302
|
+
)
|
|
303
|
+
for k, v in ids_sort.items()
|
|
304
|
+
}
|
|
295
305
|
id_counts = df[id_columns].value_counts()
|
|
296
306
|
id_counts.index = [ensure_tuple(i) for i in id_counts.index]
|
|
297
307
|
id_counts = id_counts.sort_index(key=lambda x: [ids_sort[y] for y in x], ascending=False).cumsum()
|
|
298
308
|
id_counts = id_counts[id_counts <= sample_size]
|
|
299
|
-
min_different_ids = int(len(df[id_columns].drop_duplicates()) * min_different_ids_ratio)
|
|
309
|
+
min_different_ids = max(int(len(df[id_columns].drop_duplicates()) * min_different_ids_ratio), 1)
|
|
300
310
|
|
|
301
311
|
def id_mask(sample_index: pd.Index) -> pd.Index:
|
|
302
312
|
if isinstance(sample_index, pd.MultiIndex):
|
|
@@ -307,10 +317,10 @@ def balance_undersample_time_series(
|
|
|
307
317
|
if len(id_counts) < min_different_ids:
|
|
308
318
|
if logger is not None:
|
|
309
319
|
logger.info(
|
|
310
|
-
f"Different ids count {len(id_counts)} is less than min different ids {min_different_ids}, sampling time window"
|
|
320
|
+
f"Different ids count {len(id_counts)} for sample size {sample_size} is less than min different ids {min_different_ids}, sampling time window"
|
|
311
321
|
)
|
|
312
322
|
date_counts = df.groupby(id_columns)[date_column].nunique().sort_values(ascending=False)
|
|
313
|
-
ids_to_sample = date_counts.index[:min_different_ids]
|
|
323
|
+
ids_to_sample = date_counts.index[:min_different_ids] if len(id_counts) > 0 else date_counts.index
|
|
314
324
|
mask = id_mask(ids_to_sample)
|
|
315
325
|
df = df[mask]
|
|
316
326
|
sample_date_counts = df[date_column].value_counts().sort_index(ascending=False).cumsum()
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
__version__ = "1.2.38a3769.dev1"
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{upgini-1.2.38a3769.dev1 → upgini-1.2.38a3769.dev3}/src/upgini/ads_management/ads_manager.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{upgini-1.2.38a3769.dev1 → upgini-1.2.38a3769.dev3}/src/upgini/data_source/data_source_publisher.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{upgini-1.2.38a3769.dev1 → upgini-1.2.38a3769.dev3}/src/upgini/normalizer/normalize_utils.py
RENAMED
|
File without changes
|
|
File without changes
|
{upgini-1.2.38a3769.dev1 → upgini-1.2.38a3769.dev3}/src/upgini/resource_bundle/exceptions.py
RENAMED
|
File without changes
|
{upgini-1.2.38a3769.dev1 → upgini-1.2.38a3769.dev3}/src/upgini/resource_bundle/strings.properties
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{upgini-1.2.38a3769.dev1 → upgini-1.2.38a3769.dev3}/src/upgini/sampler/random_under_sampler.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{upgini-1.2.38a3769.dev1 → upgini-1.2.38a3769.dev3}/src/upgini/utils/base_search_key_detector.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{upgini-1.2.38a3769.dev1 → upgini-1.2.38a3769.dev3}/src/upgini/utils/fallback_progress_bar.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|