upgini 1.2.136__py3-none-any.whl → 1.2.137__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- upgini/__about__.py +1 -1
- upgini/dataset.py +33 -1
- upgini/features_enricher.py +10 -6
- upgini/http.py +2 -2
- upgini/metadata.py +11 -1
- upgini/utils/datetime_utils.py +4 -0
- {upgini-1.2.136.dist-info → upgini-1.2.137.dist-info}/METADATA +2 -1
- {upgini-1.2.136.dist-info → upgini-1.2.137.dist-info}/RECORD +10 -10
- {upgini-1.2.136.dist-info → upgini-1.2.137.dist-info}/WHEEL +0 -0
- {upgini-1.2.136.dist-info → upgini-1.2.137.dist-info}/licenses/LICENSE +0 -0
upgini/__about__.py
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__ = "1.2.136"
|
|
1
|
+
__version__ = "1.2.137"
|
upgini/dataset.py
CHANGED
|
@@ -7,6 +7,8 @@ from typing import Any, Callable, Dict, List, Optional, Tuple
|
|
|
7
7
|
|
|
8
8
|
import numpy as np
|
|
9
9
|
import pandas as pd
|
|
10
|
+
import pyarrow as pa
|
|
11
|
+
import pyarrow.parquet as pq
|
|
10
12
|
from pandas.api.types import (
|
|
11
13
|
is_float_dtype,
|
|
12
14
|
is_integer_dtype,
|
|
@@ -18,6 +20,7 @@ from pandas.api.types import (
|
|
|
18
20
|
from upgini.errors import ValidationError
|
|
19
21
|
from upgini.http import ProgressStage, SearchProgress, _RestClient
|
|
20
22
|
from upgini.metadata import (
|
|
23
|
+
CURRENT_DATE_COL,
|
|
21
24
|
ENTITY_SYSTEM_RECORD_ID,
|
|
22
25
|
EVAL_SET_INDEX,
|
|
23
26
|
SYSTEM_RECORD_ID,
|
|
@@ -38,6 +41,7 @@ from upgini.resource_bundle import ResourceBundle, get_custom_bundle
|
|
|
38
41
|
from upgini.search_task import SearchTask
|
|
39
42
|
from upgini.utils.config import SampleConfig
|
|
40
43
|
from upgini.utils.email_utils import EmailSearchKeyConverter
|
|
44
|
+
from upgini.utils.hash_utils import file_hash
|
|
41
45
|
from upgini.utils.sample_utils import SampleColumns, sample
|
|
42
46
|
|
|
43
47
|
try:
|
|
@@ -287,7 +291,7 @@ class Dataset:
|
|
|
287
291
|
for key in search_group
|
|
288
292
|
if key in self.columns_renaming
|
|
289
293
|
and not self.columns_renaming.get(key).endswith(EmailSearchKeyConverter.ONE_DOMAIN_SUFFIX)
|
|
290
|
-
and not self.columns_renaming.get(key) ==
|
|
294
|
+
and not self.columns_renaming.get(key) == CURRENT_DATE_COL
|
|
291
295
|
}
|
|
292
296
|
ipv4_column = self.etalon_def_checked.get(FileColumnMeaningType.IP_ADDRESS.value)
|
|
293
297
|
if (
|
|
@@ -469,6 +473,33 @@ class Dataset:
|
|
|
469
473
|
|
|
470
474
|
columns.append(column_meta)
|
|
471
475
|
|
|
476
|
+
current_date = int(pd.Timestamp(pd.Timestamp.now().date(), tz="UTC").timestamp() * 1000)
|
|
477
|
+
with tempfile.TemporaryDirectory() as tmp_dir:
|
|
478
|
+
if (
|
|
479
|
+
self.date_column is not None
|
|
480
|
+
and self.data[self.date_column].nunique() == 1
|
|
481
|
+
and self.data[self.date_column].iloc[0] == current_date
|
|
482
|
+
):
|
|
483
|
+
df_without_fake_date = self.data.drop(columns=[self.date_column])
|
|
484
|
+
else:
|
|
485
|
+
df_without_fake_date = self.data
|
|
486
|
+
parquet_file_path = f"{tmp_dir}/{self.dataset_name}.parquet"
|
|
487
|
+
|
|
488
|
+
# calculate deterministic digest for any environment
|
|
489
|
+
|
|
490
|
+
table = pa.Table.from_pandas(df_without_fake_date, preserve_index=False)
|
|
491
|
+
table = table.replace_schema_metadata({}) # remove all metadata
|
|
492
|
+
pq.write_table(
|
|
493
|
+
table,
|
|
494
|
+
parquet_file_path,
|
|
495
|
+
compression=None, # any compression will make it non-deterministic
|
|
496
|
+
data_page_size=0, # optional, to remove page layout variations
|
|
497
|
+
use_deprecated_int96_timestamps=False, # fix timestamp format
|
|
498
|
+
write_statistics=False, # remove statistics to make it deterministic
|
|
499
|
+
)
|
|
500
|
+
|
|
501
|
+
deterministic_digest = file_hash(parquet_file_path)
|
|
502
|
+
|
|
472
503
|
return FileMetadata(
|
|
473
504
|
name=self.dataset_name,
|
|
474
505
|
description=self.description,
|
|
@@ -479,6 +510,7 @@ class Dataset:
|
|
|
479
510
|
hierarchicalSubgroupKeys=self.hierarchical_subgroup_keys,
|
|
480
511
|
taskType=self.task_type,
|
|
481
512
|
droppedColumns=self.dropped_columns,
|
|
513
|
+
deterministicDigest=deterministic_digest,
|
|
482
514
|
)
|
|
483
515
|
|
|
484
516
|
@staticmethod
|
upgini/features_enricher.py
CHANGED
|
@@ -44,6 +44,7 @@ from upgini.http import (
|
|
|
44
44
|
from upgini.mdc import MDC
|
|
45
45
|
from upgini.metadata import (
|
|
46
46
|
COUNTRY,
|
|
47
|
+
CURRENT_DATE_COL,
|
|
47
48
|
DEFAULT_INDEX,
|
|
48
49
|
ENTITY_SYSTEM_RECORD_ID,
|
|
49
50
|
EVAL_SET_INDEX,
|
|
@@ -167,7 +168,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
167
168
|
"""
|
|
168
169
|
|
|
169
170
|
TARGET_NAME = "target"
|
|
170
|
-
CURRENT_DATE = "current_date"
|
|
171
171
|
RANDOM_STATE = 42
|
|
172
172
|
CALCULATE_METRICS_THRESHOLD = 50_000_000
|
|
173
173
|
CALCULATE_METRICS_MIN_THRESHOLD = 500
|
|
@@ -2983,6 +2983,9 @@ if response.status_code == 200:
|
|
|
2983
2983
|
else:
|
|
2984
2984
|
selected_input_columns = []
|
|
2985
2985
|
|
|
2986
|
+
if DEFAULT_INDEX in selected_input_columns:
|
|
2987
|
+
selected_input_columns.remove(DEFAULT_INDEX)
|
|
2988
|
+
|
|
2986
2989
|
return selected_input_columns + selected_generated_features
|
|
2987
2990
|
|
|
2988
2991
|
def __validate_search_keys(self, search_keys: dict[str, SearchKey], search_id: str | None = None):
|
|
@@ -3167,7 +3170,7 @@ if response.status_code == 200:
|
|
|
3167
3170
|
|
|
3168
3171
|
if DEFAULT_INDEX in df.columns:
|
|
3169
3172
|
msg = self.bundle.get("unsupported_index_column")
|
|
3170
|
-
self.logger.
|
|
3173
|
+
self.logger.warning(msg)
|
|
3171
3174
|
print(msg)
|
|
3172
3175
|
self.fit_dropped_features.add(DEFAULT_INDEX)
|
|
3173
3176
|
df.drop(columns=DEFAULT_INDEX, inplace=True)
|
|
@@ -4093,9 +4096,10 @@ if response.status_code == 200:
|
|
|
4093
4096
|
):
|
|
4094
4097
|
if not silent:
|
|
4095
4098
|
self.__log_warning(bundle.get("current_date_added"))
|
|
4096
|
-
df[
|
|
4097
|
-
|
|
4098
|
-
|
|
4099
|
+
df[CURRENT_DATE_COL] = datetime.date.today()
|
|
4100
|
+
# df[CURRENT_DATE_COL] = datetime.date(2025, 10, 15)
|
|
4101
|
+
search_keys[CURRENT_DATE_COL] = SearchKey.DATE
|
|
4102
|
+
converter = DateTimeConverter(CURRENT_DATE_COL, generate_cyclical_features=False)
|
|
4099
4103
|
df = converter.convert(df)
|
|
4100
4104
|
return df
|
|
4101
4105
|
|
|
@@ -4109,7 +4113,7 @@ if response.status_code == 200:
|
|
|
4109
4113
|
return [
|
|
4110
4114
|
col
|
|
4111
4115
|
for col, t in search_keys.items()
|
|
4112
|
-
if t not in [SearchKey.DATE, SearchKey.DATETIME] and df[col].dropna().nunique() > 1
|
|
4116
|
+
if t not in [SearchKey.DATE, SearchKey.DATETIME] and col in df.columns and df[col].dropna().nunique() > 1
|
|
4113
4117
|
]
|
|
4114
4118
|
|
|
4115
4119
|
@staticmethod
|
upgini/http.py
CHANGED
|
@@ -433,8 +433,8 @@ class _RestClient:
|
|
|
433
433
|
with open(file_path, "rb") as file:
|
|
434
434
|
content = file.read()
|
|
435
435
|
md5_hash.update(content)
|
|
436
|
-
|
|
437
|
-
metadata_with_md5 = pydantic_copy_method(metadata)(update={"checksumMD5":
|
|
436
|
+
digest_md5 = md5_hash.hexdigest()
|
|
437
|
+
metadata_with_md5 = pydantic_copy_method(metadata)(update={"checksumMD5": digest_md5})
|
|
438
438
|
|
|
439
439
|
digest_sha256 = file_hash(file_path)
|
|
440
440
|
metadata_with_md5 = pydantic_copy_method(metadata_with_md5)(update={"digest": digest_sha256})
|
upgini/metadata.py
CHANGED
|
@@ -12,10 +12,19 @@ SORT_ID = "sort_id"
|
|
|
12
12
|
EVAL_SET_INDEX = "eval_set_index"
|
|
13
13
|
TARGET = "target"
|
|
14
14
|
COUNTRY = "country_iso_code"
|
|
15
|
+
CURRENT_DATE_COL = "current_date_"
|
|
15
16
|
RENAMED_INDEX = "index_col"
|
|
16
17
|
DEFAULT_INDEX = "index"
|
|
17
18
|
ORIGINAL_INDEX = "original_index"
|
|
18
|
-
SYSTEM_COLUMNS = {
|
|
19
|
+
SYSTEM_COLUMNS = {
|
|
20
|
+
SYSTEM_RECORD_ID,
|
|
21
|
+
ENTITY_SYSTEM_RECORD_ID,
|
|
22
|
+
SEARCH_KEY_UNNEST,
|
|
23
|
+
EVAL_SET_INDEX,
|
|
24
|
+
TARGET,
|
|
25
|
+
COUNTRY,
|
|
26
|
+
CURRENT_DATE_COL,
|
|
27
|
+
}
|
|
19
28
|
|
|
20
29
|
|
|
21
30
|
class FileColumnMeaningType(Enum):
|
|
@@ -252,6 +261,7 @@ class FileMetadata(BaseModel):
|
|
|
252
261
|
rowsCount: Optional[int] = None
|
|
253
262
|
checksumMD5: Optional[str] = None
|
|
254
263
|
digest: Optional[str] = None
|
|
264
|
+
deterministicDigest: Optional[str] = None
|
|
255
265
|
droppedColumns: Optional[List[str]] = None
|
|
256
266
|
|
|
257
267
|
def column_by_name(self, name: str) -> Optional[FileColumnMetadata]:
|
upgini/utils/datetime_utils.py
CHANGED
|
@@ -408,6 +408,10 @@ def is_dates_distribution_valid(
|
|
|
408
408
|
if maybe_date_col is None:
|
|
409
409
|
return
|
|
410
410
|
|
|
411
|
+
# Don't check if date column is constant
|
|
412
|
+
if X[maybe_date_col].nunique() <= 1:
|
|
413
|
+
return
|
|
414
|
+
|
|
411
415
|
if isinstance(X[maybe_date_col].dtype, pd.PeriodDtype):
|
|
412
416
|
dates = X[maybe_date_col].dt.to_timestamp().dt.date
|
|
413
417
|
elif pd.__version__ >= "2.0.0":
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: upgini
|
|
3
|
-
Version: 1.2.136
|
|
3
|
+
Version: 1.2.137
|
|
4
4
|
Summary: Intelligent data search & enrichment for Machine Learning
|
|
5
5
|
Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
|
|
6
6
|
Project-URL: Homepage, https://upgini.com/
|
|
@@ -34,6 +34,7 @@ Requires-Dist: more-itertools==10.7.0
|
|
|
34
34
|
Requires-Dist: numpy<3.0.0,>=1.19.0
|
|
35
35
|
Requires-Dist: pandas<3.0.0,>=1.1.0
|
|
36
36
|
Requires-Dist: psutil>=5.9.0
|
|
37
|
+
Requires-Dist: pyarrow==18.1.0
|
|
37
38
|
Requires-Dist: pydantic<3.0.0,>1.0.0
|
|
38
39
|
Requires-Dist: pyjwt>=2.8.0
|
|
39
40
|
Requires-Dist: python-bidi==0.4.2
|
|
@@ -1,11 +1,11 @@
|
|
|
1
|
-
upgini/__about__.py,sha256=
|
|
1
|
+
upgini/__about__.py,sha256=WAhVusMOc6hw9YR0UCWwVJJi3v2_uHEpPqxnSm9SguM,24
|
|
2
2
|
upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
|
|
3
3
|
upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
|
|
4
|
-
upgini/dataset.py,sha256=
|
|
4
|
+
upgini/dataset.py,sha256=D9JzJJkZLPP_dp8GOlGgMhTtrd5pvP-4cHIcqiY3q-E,33354
|
|
5
5
|
upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
|
|
6
|
-
upgini/features_enricher.py,sha256=
|
|
7
|
-
upgini/http.py,sha256
|
|
8
|
-
upgini/metadata.py,sha256=
|
|
6
|
+
upgini/features_enricher.py,sha256=g1daAwtECDpSsM2TtVmozU3crsdMQ_xC5PlBJAafAX0,236099
|
|
7
|
+
upgini/http.py,sha256=y26x4TQVYuEM3jz8JdASxSyBtvBemUkFf-FmX25sx-s,44356
|
|
8
|
+
upgini/metadata.py,sha256=BwUTCY-EUHqPtO0tGazHrk3wqhh-NfjNZhlBHW8bR78,12796
|
|
9
9
|
upgini/metrics.py,sha256=KCPE_apPN-9BIdv6GqASbJVaB_gBcy8wzNApAcyaGo4,46020
|
|
10
10
|
upgini/search_task.py,sha256=5mL_qV5mVtDkIumM9xCOgfa9Lc2B8mxJ1qI21iaScnQ,18656
|
|
11
11
|
upgini/spinner.py,sha256=4iMd-eIe_BnkqFEMIliULTbj6rNI2HkN_VJ4qYe0cUc,1118
|
|
@@ -52,7 +52,7 @@ upgini/utils/config.py,sha256=zFdnjchykfp_1Tm3Qep7phLzXBpXIOzr2tIuXchRBLw,1754
|
|
|
52
52
|
upgini/utils/country_utils.py,sha256=lY-eXWwFVegdVENFttbvLcgGDjFO17Sex8hd2PyJaRk,6937
|
|
53
53
|
upgini/utils/custom_loss_utils.py,sha256=kieNZYBYZm5ZGBltF1F_jOSF4ea6C29rYuCyiDcqVNY,3857
|
|
54
54
|
upgini/utils/cv_utils.py,sha256=w6FQb9nO8BWDx88EF83NpjPLarK4eR4ia0Wg0kLBJC4,3525
|
|
55
|
-
upgini/utils/datetime_utils.py,sha256=
|
|
55
|
+
upgini/utils/datetime_utils.py,sha256=B9zNaH2ZyV-lbBSTBdCZjc4zq1nVlejci40sf-TYfik,17102
|
|
56
56
|
upgini/utils/deduplicate_utils.py,sha256=CLX0QapRxB-ZVQT7yGvv1vSd2zac5SwRjCJavujdCps,11332
|
|
57
57
|
upgini/utils/display_utils.py,sha256=MoTqXZJvC6pAqgOaI3V0FG-IU_LnMfrn4TDcNvUqsdg,13316
|
|
58
58
|
upgini/utils/email_utils.py,sha256=pZ2vCfNxLIPUhxr0-OlABNXm12jjU44isBk8kGmqQzA,5277
|
|
@@ -74,7 +74,7 @@ upgini/utils/target_utils.py,sha256=CihpV6SC95HwtlMH60rGAUzVDa4Id0Bva8ySprmNHlE,
|
|
|
74
74
|
upgini/utils/track_info.py,sha256=NDKeQTUlZaYp15UoP-xLKGoDoJQ0drbDMwB0g9R0PUg,6427
|
|
75
75
|
upgini/utils/ts_utils.py,sha256=26vhC0pN7vLXK6R09EEkMK3Lwb9IVPH7LRdqFIQ3kPs,1383
|
|
76
76
|
upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
|
|
77
|
-
upgini-1.2.
|
|
78
|
-
upgini-1.2.136.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
|
79
|
-
upgini-1.2.136.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
|
|
80
|
-
upgini-1.2.136.dist-info/RECORD,,
|
|
77
|
+
upgini-1.2.137.dist-info/METADATA,sha256=qTIcTcJz2tn18BOyAa7SbP14-fOxG4l3rtQyCxG8wDI,51164
|
|
78
|
+
upgini-1.2.137.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
|
79
|
+
upgini-1.2.137.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
|
|
80
|
+
upgini-1.2.137.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|