upgini 1.2.90__tar.gz → 1.2.91a3884.dev1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {upgini-1.2.90 → upgini-1.2.91a3884.dev1}/PKG-INFO +10 -1
- {upgini-1.2.90 → upgini-1.2.91a3884.dev1}/README.md +9 -0
- upgini-1.2.91a3884.dev1/src/upgini/__about__.py +1 -0
- {upgini-1.2.90 → upgini-1.2.91a3884.dev1}/src/upgini/dataset.py +55 -96
- {upgini-1.2.90 → upgini-1.2.91a3884.dev1}/src/upgini/features_enricher.py +207 -187
- {upgini-1.2.90 → upgini-1.2.91a3884.dev1}/src/upgini/metadata.py +3 -0
- {upgini-1.2.90 → upgini-1.2.91a3884.dev1}/src/upgini/metrics.py +11 -10
- {upgini-1.2.90 → upgini-1.2.91a3884.dev1}/src/upgini/resource_bundle/strings.properties +1 -0
- upgini-1.2.91a3884.dev1/src/upgini/utils/sample_utils.py +348 -0
- {upgini-1.2.90 → upgini-1.2.91a3884.dev1}/src/upgini/utils/target_utils.py +3 -199
- upgini-1.2.90/src/upgini/__about__.py +0 -1
- {upgini-1.2.90 → upgini-1.2.91a3884.dev1}/.gitignore +0 -0
- {upgini-1.2.90 → upgini-1.2.91a3884.dev1}/LICENSE +0 -0
- {upgini-1.2.90 → upgini-1.2.91a3884.dev1}/pyproject.toml +0 -0
- {upgini-1.2.90 → upgini-1.2.91a3884.dev1}/src/upgini/__init__.py +0 -0
- {upgini-1.2.90 → upgini-1.2.91a3884.dev1}/src/upgini/ads.py +0 -0
- {upgini-1.2.90 → upgini-1.2.91a3884.dev1}/src/upgini/ads_management/__init__.py +0 -0
- {upgini-1.2.90 → upgini-1.2.91a3884.dev1}/src/upgini/ads_management/ads_manager.py +0 -0
- {upgini-1.2.90 → upgini-1.2.91a3884.dev1}/src/upgini/autofe/__init__.py +0 -0
- {upgini-1.2.90 → upgini-1.2.91a3884.dev1}/src/upgini/autofe/all_operators.py +0 -0
- {upgini-1.2.90 → upgini-1.2.91a3884.dev1}/src/upgini/autofe/binary.py +0 -0
- {upgini-1.2.90 → upgini-1.2.91a3884.dev1}/src/upgini/autofe/date.py +0 -0
- {upgini-1.2.90 → upgini-1.2.91a3884.dev1}/src/upgini/autofe/feature.py +0 -0
- {upgini-1.2.90 → upgini-1.2.91a3884.dev1}/src/upgini/autofe/groupby.py +0 -0
- {upgini-1.2.90 → upgini-1.2.91a3884.dev1}/src/upgini/autofe/operator.py +0 -0
- {upgini-1.2.90 → upgini-1.2.91a3884.dev1}/src/upgini/autofe/timeseries/__init__.py +0 -0
- {upgini-1.2.90 → upgini-1.2.91a3884.dev1}/src/upgini/autofe/timeseries/base.py +0 -0
- {upgini-1.2.90 → upgini-1.2.91a3884.dev1}/src/upgini/autofe/timeseries/cross.py +0 -0
- {upgini-1.2.90 → upgini-1.2.91a3884.dev1}/src/upgini/autofe/timeseries/delta.py +0 -0
- {upgini-1.2.90 → upgini-1.2.91a3884.dev1}/src/upgini/autofe/timeseries/lag.py +0 -0
- {upgini-1.2.90 → upgini-1.2.91a3884.dev1}/src/upgini/autofe/timeseries/roll.py +0 -0
- {upgini-1.2.90 → upgini-1.2.91a3884.dev1}/src/upgini/autofe/timeseries/trend.py +0 -0
- {upgini-1.2.90 → upgini-1.2.91a3884.dev1}/src/upgini/autofe/timeseries/volatility.py +0 -0
- {upgini-1.2.90 → upgini-1.2.91a3884.dev1}/src/upgini/autofe/unary.py +0 -0
- {upgini-1.2.90 → upgini-1.2.91a3884.dev1}/src/upgini/autofe/utils.py +0 -0
- {upgini-1.2.90 → upgini-1.2.91a3884.dev1}/src/upgini/autofe/vector.py +0 -0
- {upgini-1.2.90 → upgini-1.2.91a3884.dev1}/src/upgini/data_source/__init__.py +0 -0
- {upgini-1.2.90 → upgini-1.2.91a3884.dev1}/src/upgini/data_source/data_source_publisher.py +0 -0
- {upgini-1.2.90 → upgini-1.2.91a3884.dev1}/src/upgini/errors.py +0 -0
- {upgini-1.2.90 → upgini-1.2.91a3884.dev1}/src/upgini/http.py +0 -0
- {upgini-1.2.90 → upgini-1.2.91a3884.dev1}/src/upgini/mdc/__init__.py +0 -0
- {upgini-1.2.90 → upgini-1.2.91a3884.dev1}/src/upgini/mdc/context.py +0 -0
- {upgini-1.2.90 → upgini-1.2.91a3884.dev1}/src/upgini/normalizer/__init__.py +0 -0
- {upgini-1.2.90 → upgini-1.2.91a3884.dev1}/src/upgini/normalizer/normalize_utils.py +0 -0
- {upgini-1.2.90 → upgini-1.2.91a3884.dev1}/src/upgini/resource_bundle/__init__.py +0 -0
- {upgini-1.2.90 → upgini-1.2.91a3884.dev1}/src/upgini/resource_bundle/exceptions.py +0 -0
- {upgini-1.2.90 → upgini-1.2.91a3884.dev1}/src/upgini/resource_bundle/strings_widget.properties +0 -0
- {upgini-1.2.90 → upgini-1.2.91a3884.dev1}/src/upgini/sampler/__init__.py +0 -0
- {upgini-1.2.90 → upgini-1.2.91a3884.dev1}/src/upgini/sampler/base.py +0 -0
- {upgini-1.2.90 → upgini-1.2.91a3884.dev1}/src/upgini/sampler/random_under_sampler.py +0 -0
- {upgini-1.2.90 → upgini-1.2.91a3884.dev1}/src/upgini/sampler/utils.py +0 -0
- {upgini-1.2.90 → upgini-1.2.91a3884.dev1}/src/upgini/search_task.py +0 -0
- {upgini-1.2.90 → upgini-1.2.91a3884.dev1}/src/upgini/spinner.py +0 -0
- {upgini-1.2.90 → upgini-1.2.91a3884.dev1}/src/upgini/utils/Roboto-Regular.ttf +0 -0
- {upgini-1.2.90 → upgini-1.2.91a3884.dev1}/src/upgini/utils/__init__.py +0 -0
- {upgini-1.2.90 → upgini-1.2.91a3884.dev1}/src/upgini/utils/base_search_key_detector.py +0 -0
- {upgini-1.2.90 → upgini-1.2.91a3884.dev1}/src/upgini/utils/blocked_time_series.py +0 -0
- {upgini-1.2.90 → upgini-1.2.91a3884.dev1}/src/upgini/utils/country_utils.py +0 -0
- {upgini-1.2.90 → upgini-1.2.91a3884.dev1}/src/upgini/utils/custom_loss_utils.py +0 -0
- {upgini-1.2.90 → upgini-1.2.91a3884.dev1}/src/upgini/utils/cv_utils.py +0 -0
- {upgini-1.2.90 → upgini-1.2.91a3884.dev1}/src/upgini/utils/datetime_utils.py +0 -0
- {upgini-1.2.90 → upgini-1.2.91a3884.dev1}/src/upgini/utils/deduplicate_utils.py +0 -0
- {upgini-1.2.90 → upgini-1.2.91a3884.dev1}/src/upgini/utils/display_utils.py +0 -0
- {upgini-1.2.90 → upgini-1.2.91a3884.dev1}/src/upgini/utils/email_utils.py +0 -0
- {upgini-1.2.90 → upgini-1.2.91a3884.dev1}/src/upgini/utils/fallback_progress_bar.py +0 -0
- {upgini-1.2.90 → upgini-1.2.91a3884.dev1}/src/upgini/utils/feature_info.py +0 -0
- {upgini-1.2.90 → upgini-1.2.91a3884.dev1}/src/upgini/utils/features_validator.py +0 -0
- {upgini-1.2.90 → upgini-1.2.91a3884.dev1}/src/upgini/utils/format.py +0 -0
- {upgini-1.2.90 → upgini-1.2.91a3884.dev1}/src/upgini/utils/ip_utils.py +0 -0
- {upgini-1.2.90 → upgini-1.2.91a3884.dev1}/src/upgini/utils/mstats.py +0 -0
- {upgini-1.2.90 → upgini-1.2.91a3884.dev1}/src/upgini/utils/phone_utils.py +0 -0
- {upgini-1.2.90 → upgini-1.2.91a3884.dev1}/src/upgini/utils/postal_code_utils.py +0 -0
- {upgini-1.2.90 → upgini-1.2.91a3884.dev1}/src/upgini/utils/progress_bar.py +0 -0
- {upgini-1.2.90 → upgini-1.2.91a3884.dev1}/src/upgini/utils/sklearn_ext.py +0 -0
- {upgini-1.2.90 → upgini-1.2.91a3884.dev1}/src/upgini/utils/sort.py +0 -0
- {upgini-1.2.90 → upgini-1.2.91a3884.dev1}/src/upgini/utils/track_info.py +0 -0
- {upgini-1.2.90 → upgini-1.2.91a3884.dev1}/src/upgini/utils/ts_utils.py +0 -0
- {upgini-1.2.90 → upgini-1.2.91a3884.dev1}/src/upgini/utils/warning_counter.py +0 -0
- {upgini-1.2.90 → upgini-1.2.91a3884.dev1}/src/upgini/version_validator.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.3
|
2
2
|
Name: upgini
|
3
|
-
Version: 1.2.90
|
3
|
+
Version: 1.2.91a3884.dev1
|
4
4
|
Summary: Intelligent data search & enrichment for Machine Learning
|
5
5
|
Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
|
6
6
|
Project-URL: Homepage, https://upgini.com/
|
@@ -808,6 +808,15 @@ enricher = FeaturesEnricher(
|
|
808
808
|
enricher.fit(X, y, remove_outliers_calc_metrics=False)
|
809
809
|
```
|
810
810
|
|
811
|
+
## Turn off generating features on search keys
|
812
|
+
Upgini tries to generate features from email, date and datetime search keys. This generation is enabled by default. To disable it, set the `generate_search_key_features` parameter of the `FeaturesEnricher` constructor to `False`:
|
813
|
+
|
814
|
+
```python
|
815
|
+
enricher = FeaturesEnricher(
|
816
|
+
search_keys={"date": SearchKey.DATE},
|
817
|
+
generate_search_key_features=False,
|
818
|
+
)
|
819
|
+
|
811
820
|
## 🔑 Open up all capabilities of Upgini
|
812
821
|
|
813
822
|
[Register](https://profile.upgini.com) and get a free API key for exclusive data sources and features: 600 mln+ phone numbers, 350 mln+ emails, 2^32 IP addresses
|
@@ -762,6 +762,15 @@ enricher = FeaturesEnricher(
|
|
762
762
|
enricher.fit(X, y, remove_outliers_calc_metrics=False)
|
763
763
|
```
|
764
764
|
|
765
|
+
## Turn off generating features on search keys
|
766
|
+
Upgini tries to generate features from email, date and datetime search keys. This generation is enabled by default. To disable it, set the `generate_search_key_features` parameter of the `FeaturesEnricher` constructor to `False`:
|
767
|
+
|
768
|
+
```python
|
769
|
+
enricher = FeaturesEnricher(
|
770
|
+
search_keys={"date": SearchKey.DATE},
|
771
|
+
generate_search_key_features=False,
|
772
|
+
)
|
773
|
+
|
765
774
|
## 🔑 Open up all capabilities of Upgini
|
766
775
|
|
767
776
|
[Register](https://profile.upgini.com) and get a free API key for exclusive data sources and features: 600 mln+ phone numbers, 350 mln+ emails, 2^32 IP addresses
|
@@ -0,0 +1 @@
|
|
1
|
+
__version__ = "1.2.91a3884.dev1"
|
@@ -38,11 +38,7 @@ from upgini.metadata import (
|
|
38
38
|
from upgini.resource_bundle import ResourceBundle, get_custom_bundle
|
39
39
|
from upgini.search_task import SearchTask
|
40
40
|
from upgini.utils.email_utils import EmailSearchKeyConverter
|
41
|
-
from upgini.utils.
|
42
|
-
balance_undersample,
|
43
|
-
balance_undersample_forced,
|
44
|
-
balance_undersample_time_series_trunc,
|
45
|
-
)
|
41
|
+
from upgini.utils.sample_utils import SampleColumns, SampleConfig, sample
|
46
42
|
|
47
43
|
try:
|
48
44
|
from upgini.utils.progress_bar import CustomProgressBar as ProgressBar
|
@@ -88,6 +84,7 @@ class Dataset: # (pd.DataFrame):
|
|
88
84
|
date_column: Optional[str] = None,
|
89
85
|
id_columns: Optional[List[str]] = None,
|
90
86
|
random_state: Optional[int] = None,
|
87
|
+
sample_config: Optional[SampleConfig] = None,
|
91
88
|
rest_client: Optional[_RestClient] = None,
|
92
89
|
logger: Optional[logging.Logger] = None,
|
93
90
|
bundle: Optional[ResourceBundle] = None,
|
@@ -95,6 +92,7 @@ class Dataset: # (pd.DataFrame):
|
|
95
92
|
**kwargs,
|
96
93
|
):
|
97
94
|
self.bundle = bundle or get_custom_bundle()
|
95
|
+
self.sample_config = sample_config or SampleConfig(force_sample_size=self.FORCE_SAMPLE_SIZE)
|
98
96
|
if df is not None:
|
99
97
|
data = df.copy()
|
100
98
|
elif path is not None:
|
@@ -233,109 +231,70 @@ class Dataset: # (pd.DataFrame):
|
|
233
231
|
raise ValidationError(self.bundle.get("dataset_invalid_timeseries_target").format(target.dtype))
|
234
232
|
|
235
233
|
def __resample(self, force_downsampling=False):
|
236
|
-
# self.logger.info("Resampling etalon")
|
237
|
-
# Resample imbalanced target. Only train segment (without eval_set)
|
238
|
-
if force_downsampling:
|
239
|
-
target_column = self.etalon_def_checked.get(FileColumnMeaningType.TARGET.value, TARGET)
|
240
|
-
self.data = balance_undersample_forced(
|
241
|
-
df=self.data,
|
242
|
-
target_column=target_column,
|
243
|
-
task_type=self.task_type,
|
244
|
-
cv_type=self.cv_type,
|
245
|
-
date_column=self.date_column,
|
246
|
-
id_columns=self.id_columns,
|
247
|
-
random_state=self.random_state,
|
248
|
-
sample_size=self.FORCE_SAMPLE_SIZE,
|
249
|
-
logger=self.logger,
|
250
|
-
bundle=self.bundle,
|
251
|
-
warning_callback=self.warning_callback,
|
252
|
-
)
|
253
|
-
return
|
254
234
|
|
255
|
-
if EVAL_SET_INDEX in self.data.columns:
|
235
|
+
if EVAL_SET_INDEX in self.data.columns and not force_downsampling:
|
256
236
|
train_segment = self.data[self.data[EVAL_SET_INDEX] == 0]
|
257
237
|
else:
|
258
238
|
train_segment = self.data
|
259
239
|
|
260
|
-
|
261
|
-
self.task_type == ModelTaskType.BINARY and len(train_segment) > self.BINARY_MIN_SAMPLE_THRESHOLD
|
262
|
-
):
|
263
|
-
count = len(train_segment)
|
264
|
-
target_column = self.etalon_def_checked.get(FileColumnMeaningType.TARGET.value, TARGET)
|
265
|
-
target = train_segment[target_column]
|
266
|
-
target_classes_count = target.nunique()
|
240
|
+
self.imbalanced = self.__is_imbalanced(train_segment)
|
267
241
|
|
268
|
-
|
269
|
-
|
270
|
-
|
271
|
-
|
272
|
-
|
273
|
-
|
242
|
+
sample_columns = SampleColumns(
|
243
|
+
ids=self.id_columns,
|
244
|
+
date=self.date_column,
|
245
|
+
target=self.etalon_def_checked.get(FileColumnMeaningType.TARGET.value, TARGET),
|
246
|
+
eval_set_index=EVAL_SET_INDEX,
|
247
|
+
)
|
274
248
|
|
275
|
-
|
276
|
-
|
277
|
-
|
249
|
+
self.data = sample(
|
250
|
+
train_segment if self.imbalanced else self.data, # for imbalanced data we will be doing transform anyway
|
251
|
+
self.task_type,
|
252
|
+
self.cv_type,
|
253
|
+
self.sample_config,
|
254
|
+
sample_columns,
|
255
|
+
self.random_state,
|
256
|
+
balance=self.imbalanced,
|
257
|
+
force_downsampling=force_downsampling,
|
258
|
+
logger=self.logger,
|
259
|
+
bundle=self.bundle,
|
260
|
+
warning_callback=self.warning_callback,
|
261
|
+
)
|
278
262
|
|
279
|
-
|
280
|
-
|
281
|
-
|
282
|
-
)
|
283
|
-
self.logger.warning(msg)
|
284
|
-
raise ValidationError(msg)
|
263
|
+
def __is_imbalanced(self, data: pd.DataFrame) -> bool:
|
264
|
+
if self.task_type is None or not self.task_type.is_classification():
|
265
|
+
return False
|
285
266
|
|
286
|
-
|
287
|
-
|
288
|
-
|
289
|
-
# If min class count less than 30% for binary or (60 / classes_count)% for multiclass
|
290
|
-
if min_class_count < min_class_threshold:
|
291
|
-
self.imbalanced = True
|
292
|
-
self.data = balance_undersample(
|
293
|
-
df=train_segment,
|
294
|
-
target_column=target_column,
|
295
|
-
task_type=self.task_type,
|
296
|
-
random_state=self.random_state,
|
297
|
-
binary_min_sample_threshold=self.BINARY_MIN_SAMPLE_THRESHOLD,
|
298
|
-
multiclass_min_sample_threshold=self.MULTICLASS_MIN_SAMPLE_THRESHOLD,
|
299
|
-
binary_bootstrap_loops=self.BINARY_BOOTSTRAP_LOOPS,
|
300
|
-
multiclass_bootstrap_loops=self.MULTICLASS_BOOTSTRAP_LOOPS,
|
301
|
-
logger=self.logger,
|
302
|
-
bundle=self.bundle,
|
303
|
-
warning_callback=self.warning_callback,
|
304
|
-
)
|
267
|
+
if self.task_type == ModelTaskType.BINARY and len(data) <= self.sample_config.binary_min_sample_threshold:
|
268
|
+
return False
|
305
269
|
|
306
|
-
|
307
|
-
|
308
|
-
|
309
|
-
|
310
|
-
elif not self.imbalanced and EVAL_SET_INDEX in self.data.columns:
|
311
|
-
sample_threshold = self.FIT_SAMPLE_WITH_EVAL_SET_THRESHOLD
|
312
|
-
sample_rows = self.FIT_SAMPLE_WITH_EVAL_SET_ROWS
|
313
|
-
else:
|
314
|
-
sample_threshold = self.FIT_SAMPLE_THRESHOLD
|
315
|
-
sample_rows = self.FIT_SAMPLE_ROWS
|
270
|
+
count = len(data)
|
271
|
+
target_column = self.etalon_def_checked.get(FileColumnMeaningType.TARGET.value, TARGET)
|
272
|
+
target = data[target_column]
|
273
|
+
target_classes_count = target.nunique()
|
316
274
|
|
317
|
-
if
|
318
|
-
self.
|
319
|
-
|
320
|
-
f"and will be downsampled to {sample_rows}"
|
275
|
+
if target_classes_count > self.MAX_MULTICLASS_CLASS_COUNT:
|
276
|
+
msg = self.bundle.get("dataset_to_many_multiclass_targets").format(
|
277
|
+
target_classes_count, self.MAX_MULTICLASS_CLASS_COUNT
|
321
278
|
)
|
322
|
-
|
323
|
-
|
324
|
-
|
325
|
-
|
326
|
-
|
327
|
-
|
328
|
-
|
329
|
-
|
330
|
-
|
331
|
-
|
332
|
-
|
333
|
-
|
334
|
-
|
335
|
-
|
336
|
-
|
337
|
-
|
338
|
-
|
279
|
+
self.logger.warning(msg)
|
280
|
+
raise ValidationError(msg)
|
281
|
+
|
282
|
+
vc = target.value_counts()
|
283
|
+
min_class_value = vc.index[len(vc) - 1]
|
284
|
+
min_class_count = vc[min_class_value]
|
285
|
+
|
286
|
+
if min_class_count < self.MIN_TARGET_CLASS_ROWS:
|
287
|
+
msg = self.bundle.get("dataset_rarest_class_less_min").format(
|
288
|
+
min_class_value, min_class_count, self.MIN_TARGET_CLASS_ROWS
|
289
|
+
)
|
290
|
+
self.logger.warning(msg)
|
291
|
+
raise ValidationError(msg)
|
292
|
+
|
293
|
+
min_class_percent = self.IMBALANCE_THESHOLD / target_classes_count
|
294
|
+
min_class_threshold = min_class_percent * count
|
295
|
+
|
296
|
+
# Imbalanced if the min class count is less than 30% for binary or (60 / classes_count)% for multiclass
|
297
|
+
return bool(min_class_count < min_class_threshold)
|
339
298
|
|
340
299
|
def __validate_dataset(self, validate_target: bool, silent_mode: bool):
|
341
300
|
"""Validate DataSet"""
|