upgini 1.2.135a3__tar.gz → 1.2.137__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- {upgini-1.2.135a3 → upgini-1.2.137}/PKG-INFO +2 -1
- {upgini-1.2.135a3 → upgini-1.2.137}/pyproject.toml +1 -0
- upgini-1.2.137/src/upgini/__about__.py +1 -0
- {upgini-1.2.135a3 → upgini-1.2.137}/src/upgini/dataset.py +36 -1
- {upgini-1.2.135a3 → upgini-1.2.137}/src/upgini/features_enricher.py +20 -9
- {upgini-1.2.135a3 → upgini-1.2.137}/src/upgini/http.py +2 -2
- {upgini-1.2.135a3 → upgini-1.2.137}/src/upgini/metadata.py +12 -1
- {upgini-1.2.135a3 → upgini-1.2.137}/src/upgini/utils/datetime_utils.py +4 -0
- {upgini-1.2.135a3 → upgini-1.2.137}/src/upgini/utils/features_validator.py +1 -1
- upgini-1.2.135a3/src/upgini/__about__.py +0 -1
- {upgini-1.2.135a3 → upgini-1.2.137}/.gitignore +0 -0
- {upgini-1.2.135a3 → upgini-1.2.137}/LICENSE +0 -0
- {upgini-1.2.135a3 → upgini-1.2.137}/README.md +0 -0
- {upgini-1.2.135a3 → upgini-1.2.137}/src/upgini/__init__.py +0 -0
- {upgini-1.2.135a3 → upgini-1.2.137}/src/upgini/ads.py +0 -0
- {upgini-1.2.135a3 → upgini-1.2.137}/src/upgini/ads_management/__init__.py +0 -0
- {upgini-1.2.135a3 → upgini-1.2.137}/src/upgini/ads_management/ads_manager.py +0 -0
- {upgini-1.2.135a3 → upgini-1.2.137}/src/upgini/autofe/__init__.py +0 -0
- {upgini-1.2.135a3 → upgini-1.2.137}/src/upgini/autofe/all_operators.py +0 -0
- {upgini-1.2.135a3 → upgini-1.2.137}/src/upgini/autofe/binary.py +0 -0
- {upgini-1.2.135a3 → upgini-1.2.137}/src/upgini/autofe/date.py +0 -0
- {upgini-1.2.135a3 → upgini-1.2.137}/src/upgini/autofe/feature.py +0 -0
- {upgini-1.2.135a3 → upgini-1.2.137}/src/upgini/autofe/groupby.py +0 -0
- {upgini-1.2.135a3 → upgini-1.2.137}/src/upgini/autofe/operator.py +0 -0
- {upgini-1.2.135a3 → upgini-1.2.137}/src/upgini/autofe/timeseries/__init__.py +0 -0
- {upgini-1.2.135a3 → upgini-1.2.137}/src/upgini/autofe/timeseries/base.py +0 -0
- {upgini-1.2.135a3 → upgini-1.2.137}/src/upgini/autofe/timeseries/cross.py +0 -0
- {upgini-1.2.135a3 → upgini-1.2.137}/src/upgini/autofe/timeseries/delta.py +0 -0
- {upgini-1.2.135a3 → upgini-1.2.137}/src/upgini/autofe/timeseries/lag.py +0 -0
- {upgini-1.2.135a3 → upgini-1.2.137}/src/upgini/autofe/timeseries/roll.py +0 -0
- {upgini-1.2.135a3 → upgini-1.2.137}/src/upgini/autofe/timeseries/trend.py +0 -0
- {upgini-1.2.135a3 → upgini-1.2.137}/src/upgini/autofe/timeseries/volatility.py +0 -0
- {upgini-1.2.135a3 → upgini-1.2.137}/src/upgini/autofe/unary.py +0 -0
- {upgini-1.2.135a3 → upgini-1.2.137}/src/upgini/autofe/utils.py +0 -0
- {upgini-1.2.135a3 → upgini-1.2.137}/src/upgini/autofe/vector.py +0 -0
- {upgini-1.2.135a3 → upgini-1.2.137}/src/upgini/data_source/__init__.py +0 -0
- {upgini-1.2.135a3 → upgini-1.2.137}/src/upgini/data_source/data_source_publisher.py +0 -0
- {upgini-1.2.135a3 → upgini-1.2.137}/src/upgini/errors.py +0 -0
- {upgini-1.2.135a3 → upgini-1.2.137}/src/upgini/mdc/__init__.py +0 -0
- {upgini-1.2.135a3 → upgini-1.2.137}/src/upgini/mdc/context.py +0 -0
- {upgini-1.2.135a3 → upgini-1.2.137}/src/upgini/metrics.py +0 -0
- {upgini-1.2.135a3 → upgini-1.2.137}/src/upgini/normalizer/__init__.py +0 -0
- {upgini-1.2.135a3 → upgini-1.2.137}/src/upgini/normalizer/normalize_utils.py +0 -0
- {upgini-1.2.135a3 → upgini-1.2.137}/src/upgini/resource_bundle/__init__.py +0 -0
- {upgini-1.2.135a3 → upgini-1.2.137}/src/upgini/resource_bundle/exceptions.py +0 -0
- {upgini-1.2.135a3 → upgini-1.2.137}/src/upgini/resource_bundle/strings.properties +0 -0
- {upgini-1.2.135a3 → upgini-1.2.137}/src/upgini/resource_bundle/strings_widget.properties +0 -0
- {upgini-1.2.135a3 → upgini-1.2.137}/src/upgini/sampler/__init__.py +0 -0
- {upgini-1.2.135a3 → upgini-1.2.137}/src/upgini/sampler/base.py +0 -0
- {upgini-1.2.135a3 → upgini-1.2.137}/src/upgini/sampler/random_under_sampler.py +0 -0
- {upgini-1.2.135a3 → upgini-1.2.137}/src/upgini/sampler/utils.py +0 -0
- {upgini-1.2.135a3 → upgini-1.2.137}/src/upgini/search_task.py +0 -0
- {upgini-1.2.135a3 → upgini-1.2.137}/src/upgini/spinner.py +0 -0
- {upgini-1.2.135a3 → upgini-1.2.137}/src/upgini/utils/Roboto-Regular.ttf +0 -0
- {upgini-1.2.135a3 → upgini-1.2.137}/src/upgini/utils/__init__.py +0 -0
- {upgini-1.2.135a3 → upgini-1.2.137}/src/upgini/utils/base_search_key_detector.py +0 -0
- {upgini-1.2.135a3 → upgini-1.2.137}/src/upgini/utils/blocked_time_series.py +0 -0
- {upgini-1.2.135a3 → upgini-1.2.137}/src/upgini/utils/config.py +0 -0
- {upgini-1.2.135a3 → upgini-1.2.137}/src/upgini/utils/country_utils.py +0 -0
- {upgini-1.2.135a3 → upgini-1.2.137}/src/upgini/utils/custom_loss_utils.py +0 -0
- {upgini-1.2.135a3 → upgini-1.2.137}/src/upgini/utils/cv_utils.py +0 -0
- {upgini-1.2.135a3 → upgini-1.2.137}/src/upgini/utils/deduplicate_utils.py +0 -0
- {upgini-1.2.135a3 → upgini-1.2.137}/src/upgini/utils/display_utils.py +0 -0
- {upgini-1.2.135a3 → upgini-1.2.137}/src/upgini/utils/email_utils.py +0 -0
- {upgini-1.2.135a3 → upgini-1.2.137}/src/upgini/utils/fallback_progress_bar.py +0 -0
- {upgini-1.2.135a3 → upgini-1.2.137}/src/upgini/utils/feature_info.py +0 -0
- {upgini-1.2.135a3 → upgini-1.2.137}/src/upgini/utils/format.py +0 -0
- {upgini-1.2.135a3 → upgini-1.2.137}/src/upgini/utils/hash_utils.py +0 -0
- {upgini-1.2.135a3 → upgini-1.2.137}/src/upgini/utils/ip_utils.py +0 -0
- {upgini-1.2.135a3 → upgini-1.2.137}/src/upgini/utils/mstats.py +0 -0
- {upgini-1.2.135a3 → upgini-1.2.137}/src/upgini/utils/phone_utils.py +0 -0
- {upgini-1.2.135a3 → upgini-1.2.137}/src/upgini/utils/postal_code_utils.py +0 -0
- {upgini-1.2.135a3 → upgini-1.2.137}/src/upgini/utils/progress_bar.py +0 -0
- {upgini-1.2.135a3 → upgini-1.2.137}/src/upgini/utils/psi.py +0 -0
- {upgini-1.2.135a3 → upgini-1.2.137}/src/upgini/utils/sample_utils.py +0 -0
- {upgini-1.2.135a3 → upgini-1.2.137}/src/upgini/utils/sklearn_ext.py +0 -0
- {upgini-1.2.135a3 → upgini-1.2.137}/src/upgini/utils/sort.py +0 -0
- {upgini-1.2.135a3 → upgini-1.2.137}/src/upgini/utils/target_utils.py +0 -0
- {upgini-1.2.135a3 → upgini-1.2.137}/src/upgini/utils/track_info.py +0 -0
- {upgini-1.2.135a3 → upgini-1.2.137}/src/upgini/utils/ts_utils.py +0 -0
- {upgini-1.2.135a3 → upgini-1.2.137}/src/upgini/utils/warning_counter.py +0 -0
- {upgini-1.2.135a3 → upgini-1.2.137}/src/upgini/version_validator.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: upgini
|
|
3
|
-
Version: 1.2.135a3
|
|
3
|
+
Version: 1.2.137
|
|
4
4
|
Summary: Intelligent data search & enrichment for Machine Learning
|
|
5
5
|
Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
|
|
6
6
|
Project-URL: Homepage, https://upgini.com/
|
|
@@ -34,6 +34,7 @@ Requires-Dist: more-itertools==10.7.0
|
|
|
34
34
|
Requires-Dist: numpy<3.0.0,>=1.19.0
|
|
35
35
|
Requires-Dist: pandas<3.0.0,>=1.1.0
|
|
36
36
|
Requires-Dist: psutil>=5.9.0
|
|
37
|
+
Requires-Dist: pyarrow==18.1.0
|
|
37
38
|
Requires-Dist: pydantic<3.0.0,>1.0.0
|
|
38
39
|
Requires-Dist: pyjwt>=2.8.0
|
|
39
40
|
Requires-Dist: python-bidi==0.4.2
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "1.2.137"
|
|
@@ -7,6 +7,8 @@ from typing import Any, Callable, Dict, List, Optional, Tuple
|
|
|
7
7
|
|
|
8
8
|
import numpy as np
|
|
9
9
|
import pandas as pd
|
|
10
|
+
import pyarrow as pa
|
|
11
|
+
import pyarrow.parquet as pq
|
|
10
12
|
from pandas.api.types import (
|
|
11
13
|
is_float_dtype,
|
|
12
14
|
is_integer_dtype,
|
|
@@ -18,6 +20,7 @@ from pandas.api.types import (
|
|
|
18
20
|
from upgini.errors import ValidationError
|
|
19
21
|
from upgini.http import ProgressStage, SearchProgress, _RestClient
|
|
20
22
|
from upgini.metadata import (
|
|
23
|
+
CURRENT_DATE_COL,
|
|
21
24
|
ENTITY_SYSTEM_RECORD_ID,
|
|
22
25
|
EVAL_SET_INDEX,
|
|
23
26
|
SYSTEM_RECORD_ID,
|
|
@@ -38,6 +41,7 @@ from upgini.resource_bundle import ResourceBundle, get_custom_bundle
|
|
|
38
41
|
from upgini.search_task import SearchTask
|
|
39
42
|
from upgini.utils.config import SampleConfig
|
|
40
43
|
from upgini.utils.email_utils import EmailSearchKeyConverter
|
|
44
|
+
from upgini.utils.hash_utils import file_hash
|
|
41
45
|
from upgini.utils.sample_utils import SampleColumns, sample
|
|
42
46
|
|
|
43
47
|
try:
|
|
@@ -71,6 +75,7 @@ class Dataset:
|
|
|
71
75
|
date_column: Optional[str] = None,
|
|
72
76
|
id_columns: Optional[List[str]] = None,
|
|
73
77
|
is_imbalanced: bool = False,
|
|
78
|
+
dropped_columns: Optional[List[str]] = None,
|
|
74
79
|
random_state: Optional[int] = None,
|
|
75
80
|
sample_config: Optional[SampleConfig] = None,
|
|
76
81
|
rest_client: Optional[_RestClient] = None,
|
|
@@ -118,6 +123,7 @@ class Dataset:
|
|
|
118
123
|
self.is_imbalanced: bool = False
|
|
119
124
|
self.id_columns = id_columns
|
|
120
125
|
self.is_imbalanced = is_imbalanced
|
|
126
|
+
self.dropped_columns = dropped_columns
|
|
121
127
|
self.date_column = date_column
|
|
122
128
|
if logger is not None:
|
|
123
129
|
self.logger = logger
|
|
@@ -285,7 +291,7 @@ class Dataset:
|
|
|
285
291
|
for key in search_group
|
|
286
292
|
if key in self.columns_renaming
|
|
287
293
|
and not self.columns_renaming.get(key).endswith(EmailSearchKeyConverter.ONE_DOMAIN_SUFFIX)
|
|
288
|
-
and not self.columns_renaming.get(key) ==
|
|
294
|
+
and not self.columns_renaming.get(key) == CURRENT_DATE_COL
|
|
289
295
|
}
|
|
290
296
|
ipv4_column = self.etalon_def_checked.get(FileColumnMeaningType.IP_ADDRESS.value)
|
|
291
297
|
if (
|
|
@@ -467,6 +473,33 @@ class Dataset:
|
|
|
467
473
|
|
|
468
474
|
columns.append(column_meta)
|
|
469
475
|
|
|
476
|
+
current_date = int(pd.Timestamp(pd.Timestamp.now().date(), tz="UTC").timestamp() * 1000)
|
|
477
|
+
with tempfile.TemporaryDirectory() as tmp_dir:
|
|
478
|
+
if (
|
|
479
|
+
self.date_column is not None
|
|
480
|
+
and self.data[self.date_column].nunique() == 1
|
|
481
|
+
and self.data[self.date_column].iloc[0] == current_date
|
|
482
|
+
):
|
|
483
|
+
df_without_fake_date = self.data.drop(columns=[self.date_column])
|
|
484
|
+
else:
|
|
485
|
+
df_without_fake_date = self.data
|
|
486
|
+
parquet_file_path = f"{tmp_dir}/{self.dataset_name}.parquet"
|
|
487
|
+
|
|
488
|
+
# calculate deterministic digest for any environment
|
|
489
|
+
|
|
490
|
+
table = pa.Table.from_pandas(df_without_fake_date, preserve_index=False)
|
|
491
|
+
table = table.replace_schema_metadata({}) # remove all metadata
|
|
492
|
+
pq.write_table(
|
|
493
|
+
table,
|
|
494
|
+
parquet_file_path,
|
|
495
|
+
compression=None, # any compression will make it non-deterministic
|
|
496
|
+
data_page_size=0, # optional, to remove page layout variations
|
|
497
|
+
use_deprecated_int96_timestamps=False, # fix timestamp format
|
|
498
|
+
write_statistics=False, # remove statistics to make it deterministic
|
|
499
|
+
)
|
|
500
|
+
|
|
501
|
+
deterministic_digest = file_hash(parquet_file_path)
|
|
502
|
+
|
|
470
503
|
return FileMetadata(
|
|
471
504
|
name=self.dataset_name,
|
|
472
505
|
description=self.description,
|
|
@@ -476,6 +509,8 @@ class Dataset:
|
|
|
476
509
|
hierarchicalGroupKeys=self.hierarchical_group_keys,
|
|
477
510
|
hierarchicalSubgroupKeys=self.hierarchical_subgroup_keys,
|
|
478
511
|
taskType=self.task_type,
|
|
512
|
+
droppedColumns=self.dropped_columns,
|
|
513
|
+
deterministicDigest=deterministic_digest,
|
|
479
514
|
)
|
|
480
515
|
|
|
481
516
|
@staticmethod
|
|
@@ -44,6 +44,7 @@ from upgini.http import (
|
|
|
44
44
|
from upgini.mdc import MDC
|
|
45
45
|
from upgini.metadata import (
|
|
46
46
|
COUNTRY,
|
|
47
|
+
CURRENT_DATE_COL,
|
|
47
48
|
DEFAULT_INDEX,
|
|
48
49
|
ENTITY_SYSTEM_RECORD_ID,
|
|
49
50
|
EVAL_SET_INDEX,
|
|
@@ -167,7 +168,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
167
168
|
"""
|
|
168
169
|
|
|
169
170
|
TARGET_NAME = "target"
|
|
170
|
-
CURRENT_DATE = "current_date"
|
|
171
171
|
RANDOM_STATE = 42
|
|
172
172
|
CALCULATE_METRICS_THRESHOLD = 50_000_000
|
|
173
173
|
CALCULATE_METRICS_MIN_THRESHOLD = 500
|
|
@@ -1737,6 +1737,10 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1737
1737
|
|
|
1738
1738
|
self.logger.info(f"Excluding search keys: {excluding_search_keys}")
|
|
1739
1739
|
|
|
1740
|
+
file_meta = self._search_task.get_file_metadata(trace_id)
|
|
1741
|
+
fit_dropped_features = self.fit_dropped_features or file_meta.droppedColumns or []
|
|
1742
|
+
original_dropped_features = [columns_renaming.get(f, f) for f in fit_dropped_features]
|
|
1743
|
+
|
|
1740
1744
|
client_features = [
|
|
1741
1745
|
c
|
|
1742
1746
|
for c in validated_X.columns.to_list()
|
|
@@ -1744,7 +1748,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1744
1748
|
and c
|
|
1745
1749
|
not in (
|
|
1746
1750
|
excluding_search_keys
|
|
1747
|
-
+
|
|
1751
|
+
+ original_dropped_features
|
|
1748
1752
|
+ [DateTimeConverter.DATETIME_COL, SYSTEM_RECORD_ID, ENTITY_SYSTEM_RECORD_ID]
|
|
1749
1753
|
)
|
|
1750
1754
|
]
|
|
@@ -2955,8 +2959,10 @@ if response.status_code == 200:
|
|
|
2955
2959
|
trace_id: str,
|
|
2956
2960
|
is_transform: bool = False,
|
|
2957
2961
|
):
|
|
2958
|
-
|
|
2959
|
-
|
|
2962
|
+
file_meta = self._search_task.get_file_metadata(trace_id)
|
|
2963
|
+
fit_dropped_features = self.fit_dropped_features or file_meta.droppedColumns or []
|
|
2964
|
+
fit_input_columns = [c.originalName for c in file_meta.columns]
|
|
2965
|
+
original_dropped_features = [self.fit_columns_renaming.get(c, c) for c in fit_dropped_features]
|
|
2960
2966
|
new_columns_on_transform = [
|
|
2961
2967
|
c for c in validated_Xy.columns if c not in fit_input_columns and c not in original_dropped_features
|
|
2962
2968
|
]
|
|
@@ -2977,6 +2983,9 @@ if response.status_code == 200:
|
|
|
2977
2983
|
else:
|
|
2978
2984
|
selected_input_columns = []
|
|
2979
2985
|
|
|
2986
|
+
if DEFAULT_INDEX in selected_input_columns:
|
|
2987
|
+
selected_input_columns.remove(DEFAULT_INDEX)
|
|
2988
|
+
|
|
2980
2989
|
return selected_input_columns + selected_generated_features
|
|
2981
2990
|
|
|
2982
2991
|
def __validate_search_keys(self, search_keys: dict[str, SearchKey], search_id: str | None = None):
|
|
@@ -3161,7 +3170,7 @@ if response.status_code == 200:
|
|
|
3161
3170
|
|
|
3162
3171
|
if DEFAULT_INDEX in df.columns:
|
|
3163
3172
|
msg = self.bundle.get("unsupported_index_column")
|
|
3164
|
-
self.logger.
|
|
3173
|
+
self.logger.warning(msg)
|
|
3165
3174
|
print(msg)
|
|
3166
3175
|
self.fit_dropped_features.add(DEFAULT_INDEX)
|
|
3167
3176
|
df.drop(columns=DEFAULT_INDEX, inplace=True)
|
|
@@ -3380,6 +3389,7 @@ if response.status_code == 200:
|
|
|
3380
3389
|
cv_type=self.cv,
|
|
3381
3390
|
id_columns=self.__get_renamed_id_columns(),
|
|
3382
3391
|
is_imbalanced=self.imbalanced,
|
|
3392
|
+
dropped_columns=[self.fit_columns_renaming.get(f, f) for f in self.fit_dropped_features],
|
|
3383
3393
|
date_column=self._get_date_column(self.fit_search_keys),
|
|
3384
3394
|
date_format=self.date_format,
|
|
3385
3395
|
random_state=self.random_state,
|
|
@@ -4086,9 +4096,10 @@ if response.status_code == 200:
|
|
|
4086
4096
|
):
|
|
4087
4097
|
if not silent:
|
|
4088
4098
|
self.__log_warning(bundle.get("current_date_added"))
|
|
4089
|
-
df[
|
|
4090
|
-
|
|
4091
|
-
|
|
4099
|
+
df[CURRENT_DATE_COL] = datetime.date.today()
|
|
4100
|
+
# df[CURRENT_DATE_COL] = datetime.date(2025, 10, 15)
|
|
4101
|
+
search_keys[CURRENT_DATE_COL] = SearchKey.DATE
|
|
4102
|
+
converter = DateTimeConverter(CURRENT_DATE_COL, generate_cyclical_features=False)
|
|
4092
4103
|
df = converter.convert(df)
|
|
4093
4104
|
return df
|
|
4094
4105
|
|
|
@@ -4102,7 +4113,7 @@ if response.status_code == 200:
|
|
|
4102
4113
|
return [
|
|
4103
4114
|
col
|
|
4104
4115
|
for col, t in search_keys.items()
|
|
4105
|
-
if t not in [SearchKey.DATE, SearchKey.DATETIME] and df[col].dropna().nunique() > 1
|
|
4116
|
+
if t not in [SearchKey.DATE, SearchKey.DATETIME] and col in df.columns and df[col].dropna().nunique() > 1
|
|
4106
4117
|
]
|
|
4107
4118
|
|
|
4108
4119
|
@staticmethod
|
|
@@ -433,8 +433,8 @@ class _RestClient:
|
|
|
433
433
|
with open(file_path, "rb") as file:
|
|
434
434
|
content = file.read()
|
|
435
435
|
md5_hash.update(content)
|
|
436
|
-
|
|
437
|
-
metadata_with_md5 = pydantic_copy_method(metadata)(update={"checksumMD5":
|
|
436
|
+
digest_md5 = md5_hash.hexdigest()
|
|
437
|
+
metadata_with_md5 = pydantic_copy_method(metadata)(update={"checksumMD5": digest_md5})
|
|
438
438
|
|
|
439
439
|
digest_sha256 = file_hash(file_path)
|
|
440
440
|
metadata_with_md5 = pydantic_copy_method(metadata_with_md5)(update={"digest": digest_sha256})
|
|
@@ -12,10 +12,19 @@ SORT_ID = "sort_id"
|
|
|
12
12
|
EVAL_SET_INDEX = "eval_set_index"
|
|
13
13
|
TARGET = "target"
|
|
14
14
|
COUNTRY = "country_iso_code"
|
|
15
|
+
CURRENT_DATE_COL = "current_date_"
|
|
15
16
|
RENAMED_INDEX = "index_col"
|
|
16
17
|
DEFAULT_INDEX = "index"
|
|
17
18
|
ORIGINAL_INDEX = "original_index"
|
|
18
|
-
SYSTEM_COLUMNS = {
|
|
19
|
+
SYSTEM_COLUMNS = {
|
|
20
|
+
SYSTEM_RECORD_ID,
|
|
21
|
+
ENTITY_SYSTEM_RECORD_ID,
|
|
22
|
+
SEARCH_KEY_UNNEST,
|
|
23
|
+
EVAL_SET_INDEX,
|
|
24
|
+
TARGET,
|
|
25
|
+
COUNTRY,
|
|
26
|
+
CURRENT_DATE_COL,
|
|
27
|
+
}
|
|
19
28
|
|
|
20
29
|
|
|
21
30
|
class FileColumnMeaningType(Enum):
|
|
@@ -252,6 +261,8 @@ class FileMetadata(BaseModel):
|
|
|
252
261
|
rowsCount: Optional[int] = None
|
|
253
262
|
checksumMD5: Optional[str] = None
|
|
254
263
|
digest: Optional[str] = None
|
|
264
|
+
deterministicDigest: Optional[str] = None
|
|
265
|
+
droppedColumns: Optional[List[str]] = None
|
|
255
266
|
|
|
256
267
|
def column_by_name(self, name: str) -> Optional[FileColumnMetadata]:
|
|
257
268
|
for c in self.columns:
|
|
@@ -408,6 +408,10 @@ def is_dates_distribution_valid(
|
|
|
408
408
|
if maybe_date_col is None:
|
|
409
409
|
return
|
|
410
410
|
|
|
411
|
+
# Don't check if date column is constant
|
|
412
|
+
if X[maybe_date_col].nunique() <= 1:
|
|
413
|
+
return
|
|
414
|
+
|
|
411
415
|
if isinstance(X[maybe_date_col].dtype, pd.PeriodDtype):
|
|
412
416
|
dates = X[maybe_date_col].dt.to_timestamp().dt.date
|
|
413
417
|
elif pd.__version__ >= "2.0.0":
|
|
@@ -46,7 +46,7 @@ class FeaturesValidator:
|
|
|
46
46
|
|
|
47
47
|
columns_renaming = columns_renaming or {}
|
|
48
48
|
|
|
49
|
-
if one_hot_encoded_features:
|
|
49
|
+
if one_hot_encoded_features and len(one_hot_encoded_features) > 1:
|
|
50
50
|
msg = bundle.get("one_hot_encoded_features").format(
|
|
51
51
|
[columns_renaming.get(f, f) for f in one_hot_encoded_features]
|
|
52
52
|
)
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
__version__ = "1.2.135a3"
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|