upgini 1.2.114a4__tar.gz → 1.2.114a5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {upgini-1.2.114a4 → upgini-1.2.114a5}/PKG-INFO +2 -16
- {upgini-1.2.114a4 → upgini-1.2.114a5}/README.md +0 -14
- {upgini-1.2.114a4 → upgini-1.2.114a5}/pyproject.toml +1 -1
- upgini-1.2.114a5/src/upgini/__about__.py +1 -0
- {upgini-1.2.114a4 → upgini-1.2.114a5}/src/upgini/dataset.py +8 -72
- {upgini-1.2.114a4 → upgini-1.2.114a5}/src/upgini/features_enricher.py +273 -458
- {upgini-1.2.114a4 → upgini-1.2.114a5}/src/upgini/http.py +11 -0
- {upgini-1.2.114a4 → upgini-1.2.114a5}/src/upgini/metadata.py +0 -10
- {upgini-1.2.114a4 → upgini-1.2.114a5}/src/upgini/resource_bundle/strings.properties +1 -4
- {upgini-1.2.114a4 → upgini-1.2.114a5}/src/upgini/search_task.py +6 -0
- upgini-1.2.114a5/src/upgini/utils/config.py +43 -0
- {upgini-1.2.114a4 → upgini-1.2.114a5}/src/upgini/utils/display_utils.py +1 -1
- {upgini-1.2.114a4 → upgini-1.2.114a5}/src/upgini/utils/hash_utils.py +23 -1
- {upgini-1.2.114a4 → upgini-1.2.114a5}/src/upgini/utils/psi.py +6 -3
- {upgini-1.2.114a4 → upgini-1.2.114a5}/src/upgini/utils/sample_utils.py +16 -41
- {upgini-1.2.114a4 → upgini-1.2.114a5}/src/upgini/utils/target_utils.py +48 -2
- upgini-1.2.114a4/src/upgini/__about__.py +0 -1
- {upgini-1.2.114a4 → upgini-1.2.114a5}/.gitignore +0 -0
- {upgini-1.2.114a4 → upgini-1.2.114a5}/LICENSE +0 -0
- {upgini-1.2.114a4 → upgini-1.2.114a5}/src/upgini/__init__.py +0 -0
- {upgini-1.2.114a4 → upgini-1.2.114a5}/src/upgini/ads.py +0 -0
- {upgini-1.2.114a4 → upgini-1.2.114a5}/src/upgini/ads_management/__init__.py +0 -0
- {upgini-1.2.114a4 → upgini-1.2.114a5}/src/upgini/ads_management/ads_manager.py +0 -0
- {upgini-1.2.114a4 → upgini-1.2.114a5}/src/upgini/autofe/__init__.py +0 -0
- {upgini-1.2.114a4 → upgini-1.2.114a5}/src/upgini/autofe/all_operators.py +0 -0
- {upgini-1.2.114a4 → upgini-1.2.114a5}/src/upgini/autofe/binary.py +0 -0
- {upgini-1.2.114a4 → upgini-1.2.114a5}/src/upgini/autofe/date.py +0 -0
- {upgini-1.2.114a4 → upgini-1.2.114a5}/src/upgini/autofe/feature.py +0 -0
- {upgini-1.2.114a4 → upgini-1.2.114a5}/src/upgini/autofe/groupby.py +0 -0
- {upgini-1.2.114a4 → upgini-1.2.114a5}/src/upgini/autofe/operator.py +0 -0
- {upgini-1.2.114a4 → upgini-1.2.114a5}/src/upgini/autofe/timeseries/__init__.py +0 -0
- {upgini-1.2.114a4 → upgini-1.2.114a5}/src/upgini/autofe/timeseries/base.py +0 -0
- {upgini-1.2.114a4 → upgini-1.2.114a5}/src/upgini/autofe/timeseries/cross.py +0 -0
- {upgini-1.2.114a4 → upgini-1.2.114a5}/src/upgini/autofe/timeseries/delta.py +0 -0
- {upgini-1.2.114a4 → upgini-1.2.114a5}/src/upgini/autofe/timeseries/lag.py +0 -0
- {upgini-1.2.114a4 → upgini-1.2.114a5}/src/upgini/autofe/timeseries/roll.py +0 -0
- {upgini-1.2.114a4 → upgini-1.2.114a5}/src/upgini/autofe/timeseries/trend.py +0 -0
- {upgini-1.2.114a4 → upgini-1.2.114a5}/src/upgini/autofe/timeseries/volatility.py +0 -0
- {upgini-1.2.114a4 → upgini-1.2.114a5}/src/upgini/autofe/unary.py +0 -0
- {upgini-1.2.114a4 → upgini-1.2.114a5}/src/upgini/autofe/utils.py +0 -0
- {upgini-1.2.114a4 → upgini-1.2.114a5}/src/upgini/autofe/vector.py +0 -0
- {upgini-1.2.114a4 → upgini-1.2.114a5}/src/upgini/data_source/__init__.py +0 -0
- {upgini-1.2.114a4 → upgini-1.2.114a5}/src/upgini/data_source/data_source_publisher.py +0 -0
- {upgini-1.2.114a4 → upgini-1.2.114a5}/src/upgini/errors.py +0 -0
- {upgini-1.2.114a4 → upgini-1.2.114a5}/src/upgini/mdc/__init__.py +0 -0
- {upgini-1.2.114a4 → upgini-1.2.114a5}/src/upgini/mdc/context.py +0 -0
- {upgini-1.2.114a4 → upgini-1.2.114a5}/src/upgini/metrics.py +0 -0
- {upgini-1.2.114a4 → upgini-1.2.114a5}/src/upgini/normalizer/__init__.py +0 -0
- {upgini-1.2.114a4 → upgini-1.2.114a5}/src/upgini/normalizer/normalize_utils.py +0 -0
- {upgini-1.2.114a4 → upgini-1.2.114a5}/src/upgini/resource_bundle/__init__.py +0 -0
- {upgini-1.2.114a4 → upgini-1.2.114a5}/src/upgini/resource_bundle/exceptions.py +0 -0
- {upgini-1.2.114a4 → upgini-1.2.114a5}/src/upgini/resource_bundle/strings_widget.properties +0 -0
- {upgini-1.2.114a4 → upgini-1.2.114a5}/src/upgini/sampler/__init__.py +0 -0
- {upgini-1.2.114a4 → upgini-1.2.114a5}/src/upgini/sampler/base.py +0 -0
- {upgini-1.2.114a4 → upgini-1.2.114a5}/src/upgini/sampler/random_under_sampler.py +0 -0
- {upgini-1.2.114a4 → upgini-1.2.114a5}/src/upgini/sampler/utils.py +0 -0
- {upgini-1.2.114a4 → upgini-1.2.114a5}/src/upgini/spinner.py +0 -0
- {upgini-1.2.114a4 → upgini-1.2.114a5}/src/upgini/utils/Roboto-Regular.ttf +0 -0
- {upgini-1.2.114a4 → upgini-1.2.114a5}/src/upgini/utils/__init__.py +0 -0
- {upgini-1.2.114a4 → upgini-1.2.114a5}/src/upgini/utils/base_search_key_detector.py +0 -0
- {upgini-1.2.114a4 → upgini-1.2.114a5}/src/upgini/utils/blocked_time_series.py +0 -0
- {upgini-1.2.114a4 → upgini-1.2.114a5}/src/upgini/utils/country_utils.py +0 -0
- {upgini-1.2.114a4 → upgini-1.2.114a5}/src/upgini/utils/custom_loss_utils.py +0 -0
- {upgini-1.2.114a4 → upgini-1.2.114a5}/src/upgini/utils/cv_utils.py +0 -0
- {upgini-1.2.114a4 → upgini-1.2.114a5}/src/upgini/utils/datetime_utils.py +0 -0
- {upgini-1.2.114a4 → upgini-1.2.114a5}/src/upgini/utils/deduplicate_utils.py +0 -0
- {upgini-1.2.114a4 → upgini-1.2.114a5}/src/upgini/utils/email_utils.py +0 -0
- {upgini-1.2.114a4 → upgini-1.2.114a5}/src/upgini/utils/fallback_progress_bar.py +0 -0
- {upgini-1.2.114a4 → upgini-1.2.114a5}/src/upgini/utils/feature_info.py +0 -0
- {upgini-1.2.114a4 → upgini-1.2.114a5}/src/upgini/utils/features_validator.py +0 -0
- {upgini-1.2.114a4 → upgini-1.2.114a5}/src/upgini/utils/format.py +0 -0
- {upgini-1.2.114a4 → upgini-1.2.114a5}/src/upgini/utils/ip_utils.py +0 -0
- {upgini-1.2.114a4 → upgini-1.2.114a5}/src/upgini/utils/mstats.py +0 -0
- {upgini-1.2.114a4 → upgini-1.2.114a5}/src/upgini/utils/phone_utils.py +0 -0
- {upgini-1.2.114a4 → upgini-1.2.114a5}/src/upgini/utils/postal_code_utils.py +0 -0
- {upgini-1.2.114a4 → upgini-1.2.114a5}/src/upgini/utils/progress_bar.py +0 -0
- {upgini-1.2.114a4 → upgini-1.2.114a5}/src/upgini/utils/sklearn_ext.py +0 -0
- {upgini-1.2.114a4 → upgini-1.2.114a5}/src/upgini/utils/sort.py +0 -0
- {upgini-1.2.114a4 → upgini-1.2.114a5}/src/upgini/utils/track_info.py +0 -0
- {upgini-1.2.114a4 → upgini-1.2.114a5}/src/upgini/utils/ts_utils.py +0 -0
- {upgini-1.2.114a4 → upgini-1.2.114a5}/src/upgini/utils/warning_counter.py +0 -0
- {upgini-1.2.114a4 → upgini-1.2.114a5}/src/upgini/version_validator.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.3
|
2
2
|
Name: upgini
|
3
|
-
Version: 1.2.114a4
|
3
|
+
Version: 1.2.114a5
|
4
4
|
Summary: Intelligent data search & enrichment for Machine Learning
|
5
5
|
Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
|
6
6
|
Project-URL: Homepage, https://upgini.com/
|
@@ -38,7 +38,7 @@ Requires-Dist: python-bidi==0.4.2
|
|
38
38
|
Requires-Dist: python-dateutil>=2.8.0
|
39
39
|
Requires-Dist: python-json-logger>=3.3.0
|
40
40
|
Requires-Dist: requests>=2.8.0
|
41
|
-
Requires-Dist: scikit-learn
|
41
|
+
Requires-Dist: scikit-learn<1.8.0,>=1.3.0
|
42
42
|
Requires-Dist: scipy>=1.10.0
|
43
43
|
Requires-Dist: shap>=0.44.0
|
44
44
|
Requires-Dist: xhtml2pdf<0.3.0,>=0.2.11
|
@@ -769,20 +769,6 @@ enricher = FeaturesEnricher(
|
|
769
769
|
enriched_dataframe.fit(X, y)
|
770
770
|
```
|
771
771
|
|
772
|
-
### Return initial dataframe enriched with TOP external features by importance
|
773
|
-
|
774
|
-
`transform` and `fit_transform` methods of `FeaturesEnricher` can be used with two additional parameters:
|
775
|
-
- `importance_threshold`: float = 0 - only features with *importance >= threshold* will be added to the output dataframe
|
776
|
-
- `max_features`: int - only first TOP N features by importance will be returned, where *N = max_features*
|
777
|
-
|
778
|
-
And `keep_input=True` will keep all initial columns from search dataset X:
|
779
|
-
```python
|
780
|
-
enricher = FeaturesEnricher(
|
781
|
-
search_keys={"subscription_activation_date": SearchKey.DATE}
|
782
|
-
)
|
783
|
-
enriched_dataframe.fit_transform(X, y, keep_input=True, max_features=2)
|
784
|
-
```
|
785
|
-
|
786
772
|
### Exclude premium data sources from fit, transform and metrics calculation
|
787
773
|
|
788
774
|
`fit`, `fit_transform`, `transform` and `calculate_metrics` methods of `FeaturesEnricher` can be used with parameter `exclude_features_sources` that allows to exclude Trial or Paid features from Premium data sources:
|
@@ -723,20 +723,6 @@ enricher = FeaturesEnricher(
|
|
723
723
|
enriched_dataframe.fit(X, y)
|
724
724
|
```
|
725
725
|
|
726
|
-
### Return initial dataframe enriched with TOP external features by importance
|
727
|
-
|
728
|
-
`transform` and `fit_transform` methods of `FeaturesEnricher` can be used with two additional parameters:
|
729
|
-
- `importance_threshold`: float = 0 - only features with *importance >= threshold* will be added to the output dataframe
|
730
|
-
- `max_features`: int - only first TOP N features by importance will be returned, where *N = max_features*
|
731
|
-
|
732
|
-
And `keep_input=True` will keep all initial columns from search dataset X:
|
733
|
-
```python
|
734
|
-
enricher = FeaturesEnricher(
|
735
|
-
search_keys={"subscription_activation_date": SearchKey.DATE}
|
736
|
-
)
|
737
|
-
enriched_dataframe.fit_transform(X, y, keep_input=True, max_features=2)
|
738
|
-
```
|
739
|
-
|
740
726
|
### Exclude premium data sources from fit, transform and metrics calculation
|
741
727
|
|
742
728
|
`fit`, `fit_transform`, `transform` and `calculate_metrics` methods of `FeaturesEnricher` can be used with parameter `exclude_features_sources` that allows to exclude Trial or Paid features from Premium data sources:
|
@@ -0,0 +1 @@
|
|
1
|
+
__version__ = "1.2.114a5"
|
@@ -25,7 +25,6 @@ from upgini.metadata import (
|
|
25
25
|
AutoFEParameters,
|
26
26
|
CVType,
|
27
27
|
DataType,
|
28
|
-
FeaturesFilter,
|
29
28
|
FileColumnMeaningType,
|
30
29
|
FileColumnMetadata,
|
31
30
|
FileMetadata,
|
@@ -37,8 +36,9 @@ from upgini.metadata import (
|
|
37
36
|
)
|
38
37
|
from upgini.resource_bundle import ResourceBundle, get_custom_bundle
|
39
38
|
from upgini.search_task import SearchTask
|
39
|
+
from upgini.utils.config import SampleConfig
|
40
40
|
from upgini.utils.email_utils import EmailSearchKeyConverter
|
41
|
-
from upgini.utils.sample_utils import SampleColumns,
|
41
|
+
from upgini.utils.sample_utils import SampleColumns, sample
|
42
42
|
|
43
43
|
try:
|
44
44
|
from upgini.utils.progress_bar import CustomProgressBar as ProgressBar
|
@@ -51,9 +51,6 @@ except Exception:
|
|
51
51
|
class Dataset:
|
52
52
|
MIN_ROWS_COUNT = 100
|
53
53
|
MAX_ROWS = 200_000
|
54
|
-
IMBALANCE_THESHOLD = 0.6
|
55
|
-
MIN_TARGET_CLASS_ROWS = 100
|
56
|
-
MAX_MULTICLASS_CLASS_COUNT = 100
|
57
54
|
MIN_SUPPORTED_DATE_TS = 946684800000 # 2000-01-01
|
58
55
|
MAX_FEATURES_COUNT = 3500
|
59
56
|
MAX_UPLOADING_FILE_SIZE = 268435456 # 256 Mb
|
@@ -73,6 +70,7 @@ class Dataset:
|
|
73
70
|
cv_type: Optional[CVType] = None,
|
74
71
|
date_column: Optional[str] = None,
|
75
72
|
id_columns: Optional[List[str]] = None,
|
73
|
+
is_imbalanced: bool = False,
|
76
74
|
random_state: Optional[int] = None,
|
77
75
|
sample_config: Optional[SampleConfig] = None,
|
78
76
|
rest_client: Optional[_RestClient] = None,
|
@@ -117,8 +115,9 @@ class Dataset:
|
|
117
115
|
self.rest_client = rest_client
|
118
116
|
self.random_state = random_state
|
119
117
|
self.columns_renaming: Dict[str, str] = {}
|
120
|
-
self.imbalanced: bool = False
|
118
|
+
self.is_imbalanced: bool = False
|
121
119
|
self.id_columns = id_columns
|
120
|
+
self.is_imbalanced = is_imbalanced
|
122
121
|
self.date_column = date_column
|
123
122
|
if logger is not None:
|
124
123
|
self.logger = logger
|
@@ -239,8 +238,6 @@ class Dataset:
|
|
239
238
|
else:
|
240
239
|
train_segment = self.data
|
241
240
|
|
242
|
-
self.imbalanced = self.__is_imbalanced(train_segment)
|
243
|
-
|
244
241
|
sample_columns = SampleColumns(
|
245
242
|
ids=self.id_columns,
|
246
243
|
date=self.date_column,
|
@@ -249,55 +246,19 @@ class Dataset:
|
|
249
246
|
)
|
250
247
|
|
251
248
|
self.data = sample(
|
252
|
-
train_segment if self.imbalanced else self.data, # for imbalanced data we will be doing transform anyway
|
249
|
+
train_segment if self.is_imbalanced else self.data, # for imbalanced data we will be doing transform anyway
|
253
250
|
self.task_type,
|
254
251
|
self.cv_type,
|
255
252
|
self.sample_config,
|
256
253
|
sample_columns,
|
257
254
|
self.random_state,
|
258
|
-
balance=self.imbalanced,
|
255
|
+
balance=self.is_imbalanced,
|
259
256
|
force_downsampling=force_downsampling,
|
260
257
|
logger=self.logger,
|
261
258
|
bundle=self.bundle,
|
262
259
|
warning_callback=self.warning_callback,
|
263
260
|
)
|
264
261
|
|
265
|
-
def __is_imbalanced(self, data: pd.DataFrame) -> bool:
|
266
|
-
if self.task_type is None or not self.task_type.is_classification():
|
267
|
-
return False
|
268
|
-
|
269
|
-
if self.task_type == ModelTaskType.BINARY and len(data) <= self.sample_config.binary_min_sample_threshold:
|
270
|
-
return False
|
271
|
-
|
272
|
-
count = len(data)
|
273
|
-
target_column = self.etalon_def_checked.get(FileColumnMeaningType.TARGET.value, TARGET)
|
274
|
-
target = data[target_column]
|
275
|
-
target_classes_count = target.nunique()
|
276
|
-
|
277
|
-
if target_classes_count > self.MAX_MULTICLASS_CLASS_COUNT:
|
278
|
-
msg = self.bundle.get("dataset_to_many_multiclass_targets").format(
|
279
|
-
target_classes_count, self.MAX_MULTICLASS_CLASS_COUNT
|
280
|
-
)
|
281
|
-
self.logger.warning(msg)
|
282
|
-
raise ValidationError(msg)
|
283
|
-
|
284
|
-
vc = target.value_counts()
|
285
|
-
min_class_value = vc.index[len(vc) - 1]
|
286
|
-
min_class_count = vc[min_class_value]
|
287
|
-
|
288
|
-
if min_class_count < self.MIN_TARGET_CLASS_ROWS:
|
289
|
-
msg = self.bundle.get("dataset_rarest_class_less_min").format(
|
290
|
-
min_class_value, min_class_count, self.MIN_TARGET_CLASS_ROWS
|
291
|
-
)
|
292
|
-
self.logger.warning(msg)
|
293
|
-
raise ValidationError(msg)
|
294
|
-
|
295
|
-
min_class_percent = self.IMBALANCE_THESHOLD / target_classes_count
|
296
|
-
min_class_threshold = min_class_percent * count
|
297
|
-
|
298
|
-
# If min class count less than 30% for binary or (60 / classes_count)% for multiclass
|
299
|
-
return bool(min_class_count < min_class_threshold)
|
300
|
-
|
301
262
|
def __validate_dataset(self, validate_target: bool, silent_mode: bool):
|
302
263
|
"""Validate DataSet"""
|
303
264
|
# self.logger.info("validating etalon")
|
@@ -537,9 +498,6 @@ class Dataset:
|
|
537
498
|
return_scores: bool,
|
538
499
|
extract_features: bool,
|
539
500
|
accurate_model: Optional[bool] = None,
|
540
|
-
importance_threshold: Optional[float] = None,
|
541
|
-
max_features: Optional[int] = None,
|
542
|
-
filter_features: Optional[dict] = None,
|
543
501
|
runtime_parameters: Optional[RuntimeParameters] = None,
|
544
502
|
metrics_calculation: Optional[bool] = False,
|
545
503
|
auto_fe_parameters: Optional[AutoFEParameters] = None,
|
@@ -548,28 +506,12 @@ class Dataset:
|
|
548
506
|
search_customization = SearchCustomization(
|
549
507
|
extractFeatures=extract_features,
|
550
508
|
accurateModel=accurate_model,
|
551
|
-
importanceThreshold=importance_threshold,
|
552
|
-
maxFeatures=max_features,
|
553
509
|
returnScores=return_scores,
|
554
510
|
runtimeParameters=runtime_parameters,
|
555
511
|
metricsCalculation=metrics_calculation,
|
556
512
|
)
|
557
|
-
if filter_features:
|
558
|
-
if [
|
559
|
-
key
|
560
|
-
for key in filter_features
|
561
|
-
if key not in {"min_importance", "max_psi", "max_count", "selected_features"}
|
562
|
-
]:
|
563
|
-
raise ValidationError(self.bundle.get("dataset_invalid_filter"))
|
564
|
-
feature_filter = FeaturesFilter(
|
565
|
-
minImportance=filter_features.get("min_importance"),
|
566
|
-
maxPSI=filter_features.get("max_psi"),
|
567
|
-
maxCount=filter_features.get("max_count"),
|
568
|
-
selectedFeatures=filter_features.get("selected_features"),
|
569
|
-
)
|
570
|
-
search_customization.featuresFilter = feature_filter
|
571
513
|
|
572
|
-
search_customization.runtimeParameters.properties["etalon_imbalanced"] = self.imbalanced
|
514
|
+
search_customization.runtimeParameters.properties["etalon_imbalanced"] = self.is_imbalanced
|
573
515
|
if auto_fe_parameters is not None:
|
574
516
|
search_customization.runtimeParameters.properties["feature_generation_params.ts.gap_days"] = (
|
575
517
|
auto_fe_parameters.ts_gap_days
|
@@ -624,9 +566,6 @@ class Dataset:
|
|
624
566
|
extract_features: bool = False,
|
625
567
|
accurate_model: bool = False,
|
626
568
|
exclude_features_sources: Optional[List[str]] = None,
|
627
|
-
importance_threshold: Optional[float] = None, # deprecated
|
628
|
-
max_features: Optional[int] = None, # deprecated
|
629
|
-
filter_features: Optional[dict] = None, # deprecated
|
630
569
|
runtime_parameters: Optional[RuntimeParameters] = None,
|
631
570
|
auto_fe_parameters: Optional[AutoFEParameters] = None,
|
632
571
|
force_downsampling: bool = False,
|
@@ -643,9 +582,6 @@ class Dataset:
|
|
643
582
|
return_scores=return_scores,
|
644
583
|
extract_features=extract_features,
|
645
584
|
accurate_model=accurate_model,
|
646
|
-
importance_threshold=importance_threshold,
|
647
|
-
max_features=max_features,
|
648
|
-
filter_features=filter_features,
|
649
585
|
runtime_parameters=runtime_parameters,
|
650
586
|
auto_fe_parameters=auto_fe_parameters,
|
651
587
|
)
|