upgini 1.2.136__py3-none-any.whl → 1.2.137__py3-none-any.whl

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in the respective public registry.

Potentially problematic release.


This version of upgini might be problematic. Click here for more details.

upgini/__about__.py CHANGED
@@ -1 +1 @@
1
- __version__ = "1.2.136"
1
+ __version__ = "1.2.137"
upgini/dataset.py CHANGED
@@ -7,6 +7,8 @@ from typing import Any, Callable, Dict, List, Optional, Tuple
7
7
 
8
8
  import numpy as np
9
9
  import pandas as pd
10
+ import pyarrow as pa
11
+ import pyarrow.parquet as pq
10
12
  from pandas.api.types import (
11
13
  is_float_dtype,
12
14
  is_integer_dtype,
@@ -18,6 +20,7 @@ from pandas.api.types import (
18
20
  from upgini.errors import ValidationError
19
21
  from upgini.http import ProgressStage, SearchProgress, _RestClient
20
22
  from upgini.metadata import (
23
+ CURRENT_DATE_COL,
21
24
  ENTITY_SYSTEM_RECORD_ID,
22
25
  EVAL_SET_INDEX,
23
26
  SYSTEM_RECORD_ID,
@@ -38,6 +41,7 @@ from upgini.resource_bundle import ResourceBundle, get_custom_bundle
38
41
  from upgini.search_task import SearchTask
39
42
  from upgini.utils.config import SampleConfig
40
43
  from upgini.utils.email_utils import EmailSearchKeyConverter
44
+ from upgini.utils.hash_utils import file_hash
41
45
  from upgini.utils.sample_utils import SampleColumns, sample
42
46
 
43
47
  try:
@@ -287,7 +291,7 @@ class Dataset:
287
291
  for key in search_group
288
292
  if key in self.columns_renaming
289
293
  and not self.columns_renaming.get(key).endswith(EmailSearchKeyConverter.ONE_DOMAIN_SUFFIX)
290
- and not self.columns_renaming.get(key) == "current_date"
294
+ and not self.columns_renaming.get(key) == CURRENT_DATE_COL
291
295
  }
292
296
  ipv4_column = self.etalon_def_checked.get(FileColumnMeaningType.IP_ADDRESS.value)
293
297
  if (
@@ -469,6 +473,33 @@ class Dataset:
469
473
 
470
474
  columns.append(column_meta)
471
475
 
476
+ current_date = int(pd.Timestamp(pd.Timestamp.now().date(), tz="UTC").timestamp() * 1000)
477
+ with tempfile.TemporaryDirectory() as tmp_dir:
478
+ if (
479
+ self.date_column is not None
480
+ and self.data[self.date_column].nunique() == 1
481
+ and self.data[self.date_column].iloc[0] == current_date
482
+ ):
483
+ df_without_fake_date = self.data.drop(columns=[self.date_column])
484
+ else:
485
+ df_without_fake_date = self.data
486
+ parquet_file_path = f"{tmp_dir}/{self.dataset_name}.parquet"
487
+
488
+ # calculate deterministic digest for any environment
489
+
490
+ table = pa.Table.from_pandas(df_without_fake_date, preserve_index=False)
491
+ table = table.replace_schema_metadata({}) # remove all metadata
492
+ pq.write_table(
493
+ table,
494
+ parquet_file_path,
495
+ compression=None, # any compression will make it non-deterministic
496
+ data_page_size=0, # optional, to remove page layout variations
497
+ use_deprecated_int96_timestamps=False, # fix timestamp format
498
+ write_statistics=False, # remove statistics to make it deterministic
499
+ )
500
+
501
+ deterministic_digest = file_hash(parquet_file_path)
502
+
472
503
  return FileMetadata(
473
504
  name=self.dataset_name,
474
505
  description=self.description,
@@ -479,6 +510,7 @@ class Dataset:
479
510
  hierarchicalSubgroupKeys=self.hierarchical_subgroup_keys,
480
511
  taskType=self.task_type,
481
512
  droppedColumns=self.dropped_columns,
513
+ deterministicDigest=deterministic_digest,
482
514
  )
483
515
 
484
516
  @staticmethod
@@ -44,6 +44,7 @@ from upgini.http import (
44
44
  from upgini.mdc import MDC
45
45
  from upgini.metadata import (
46
46
  COUNTRY,
47
+ CURRENT_DATE_COL,
47
48
  DEFAULT_INDEX,
48
49
  ENTITY_SYSTEM_RECORD_ID,
49
50
  EVAL_SET_INDEX,
@@ -167,7 +168,6 @@ class FeaturesEnricher(TransformerMixin):
167
168
  """
168
169
 
169
170
  TARGET_NAME = "target"
170
- CURRENT_DATE = "current_date"
171
171
  RANDOM_STATE = 42
172
172
  CALCULATE_METRICS_THRESHOLD = 50_000_000
173
173
  CALCULATE_METRICS_MIN_THRESHOLD = 500
@@ -2983,6 +2983,9 @@ if response.status_code == 200:
2983
2983
  else:
2984
2984
  selected_input_columns = []
2985
2985
 
2986
+ if DEFAULT_INDEX in selected_input_columns:
2987
+ selected_input_columns.remove(DEFAULT_INDEX)
2988
+
2986
2989
  return selected_input_columns + selected_generated_features
2987
2990
 
2988
2991
  def __validate_search_keys(self, search_keys: dict[str, SearchKey], search_id: str | None = None):
@@ -3167,7 +3170,7 @@ if response.status_code == 200:
3167
3170
 
3168
3171
  if DEFAULT_INDEX in df.columns:
3169
3172
  msg = self.bundle.get("unsupported_index_column")
3170
- self.logger.info(msg)
3173
+ self.logger.warning(msg)
3171
3174
  print(msg)
3172
3175
  self.fit_dropped_features.add(DEFAULT_INDEX)
3173
3176
  df.drop(columns=DEFAULT_INDEX, inplace=True)
@@ -4093,9 +4096,10 @@ if response.status_code == 200:
4093
4096
  ):
4094
4097
  if not silent:
4095
4098
  self.__log_warning(bundle.get("current_date_added"))
4096
- df[FeaturesEnricher.CURRENT_DATE] = datetime.date.today()
4097
- search_keys[FeaturesEnricher.CURRENT_DATE] = SearchKey.DATE
4098
- converter = DateTimeConverter(FeaturesEnricher.CURRENT_DATE, generate_cyclical_features=False)
4099
+ df[CURRENT_DATE_COL] = datetime.date.today()
4100
+ # df[CURRENT_DATE_COL] = datetime.date(2025, 10, 15)
4101
+ search_keys[CURRENT_DATE_COL] = SearchKey.DATE
4102
+ converter = DateTimeConverter(CURRENT_DATE_COL, generate_cyclical_features=False)
4099
4103
  df = converter.convert(df)
4100
4104
  return df
4101
4105
 
@@ -4109,7 +4113,7 @@ if response.status_code == 200:
4109
4113
  return [
4110
4114
  col
4111
4115
  for col, t in search_keys.items()
4112
- if t not in [SearchKey.DATE, SearchKey.DATETIME] and df[col].dropna().nunique() > 1
4116
+ if t not in [SearchKey.DATE, SearchKey.DATETIME] and col in df.columns and df[col].dropna().nunique() > 1
4113
4117
  ]
4114
4118
 
4115
4119
  @staticmethod
upgini/http.py CHANGED
@@ -433,8 +433,8 @@ class _RestClient:
433
433
  with open(file_path, "rb") as file:
434
434
  content = file.read()
435
435
  md5_hash.update(content)
436
- digest = md5_hash.hexdigest()
437
- metadata_with_md5 = pydantic_copy_method(metadata)(update={"checksumMD5": digest})
436
+ digest_md5 = md5_hash.hexdigest()
437
+ metadata_with_md5 = pydantic_copy_method(metadata)(update={"checksumMD5": digest_md5})
438
438
 
439
439
  digest_sha256 = file_hash(file_path)
440
440
  metadata_with_md5 = pydantic_copy_method(metadata_with_md5)(update={"digest": digest_sha256})
upgini/metadata.py CHANGED
@@ -12,10 +12,19 @@ SORT_ID = "sort_id"
12
12
  EVAL_SET_INDEX = "eval_set_index"
13
13
  TARGET = "target"
14
14
  COUNTRY = "country_iso_code"
15
+ CURRENT_DATE_COL = "current_date_"
15
16
  RENAMED_INDEX = "index_col"
16
17
  DEFAULT_INDEX = "index"
17
18
  ORIGINAL_INDEX = "original_index"
18
- SYSTEM_COLUMNS = {SYSTEM_RECORD_ID, ENTITY_SYSTEM_RECORD_ID, SEARCH_KEY_UNNEST, EVAL_SET_INDEX, TARGET, COUNTRY}
19
+ SYSTEM_COLUMNS = {
20
+ SYSTEM_RECORD_ID,
21
+ ENTITY_SYSTEM_RECORD_ID,
22
+ SEARCH_KEY_UNNEST,
23
+ EVAL_SET_INDEX,
24
+ TARGET,
25
+ COUNTRY,
26
+ CURRENT_DATE_COL,
27
+ }
19
28
 
20
29
 
21
30
  class FileColumnMeaningType(Enum):
@@ -252,6 +261,7 @@ class FileMetadata(BaseModel):
252
261
  rowsCount: Optional[int] = None
253
262
  checksumMD5: Optional[str] = None
254
263
  digest: Optional[str] = None
264
+ deterministicDigest: Optional[str] = None
255
265
  droppedColumns: Optional[List[str]] = None
256
266
 
257
267
  def column_by_name(self, name: str) -> Optional[FileColumnMetadata]:
@@ -408,6 +408,10 @@ def is_dates_distribution_valid(
408
408
  if maybe_date_col is None:
409
409
  return
410
410
 
411
+ # Don't check if date column is constant
412
+ if X[maybe_date_col].nunique() <= 1:
413
+ return
414
+
411
415
  if isinstance(X[maybe_date_col].dtype, pd.PeriodDtype):
412
416
  dates = X[maybe_date_col].dt.to_timestamp().dt.date
413
417
  elif pd.__version__ >= "2.0.0":
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: upgini
3
- Version: 1.2.136
3
+ Version: 1.2.137
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
6
6
  Project-URL: Homepage, https://upgini.com/
@@ -34,6 +34,7 @@ Requires-Dist: more-itertools==10.7.0
34
34
  Requires-Dist: numpy<3.0.0,>=1.19.0
35
35
  Requires-Dist: pandas<3.0.0,>=1.1.0
36
36
  Requires-Dist: psutil>=5.9.0
37
+ Requires-Dist: pyarrow==18.1.0
37
38
  Requires-Dist: pydantic<3.0.0,>1.0.0
38
39
  Requires-Dist: pyjwt>=2.8.0
39
40
  Requires-Dist: python-bidi==0.4.2
@@ -1,11 +1,11 @@
1
- upgini/__about__.py,sha256=geG4WOLTgtin9k9NxkyjPJlJc57re5T0iGsOwFw3cp4,24
1
+ upgini/__about__.py,sha256=WAhVusMOc6hw9YR0UCWwVJJi3v2_uHEpPqxnSm9SguM,24
2
2
  upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
3
3
  upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
4
- upgini/dataset.py,sha256=4bWKKFdFbFvdcb-JS4Nt2Je8eKqPg5QRLlmchQuY2aw,31870
4
+ upgini/dataset.py,sha256=D9JzJJkZLPP_dp8GOlGgMhTtrd5pvP-4cHIcqiY3q-E,33354
5
5
  upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
6
- upgini/features_enricher.py,sha256=9pBxZKWsXF_IhZXnswCQZGU-cqOOT1EqH-FPu1zJo4E,235950
7
- upgini/http.py,sha256=-J_wOpnwVnT0ebPC6sOs6fN3AWtCD0LJLu6nlYmxaqk,44348
8
- upgini/metadata.py,sha256=soRxxAG9gpOk77oOxCl35f9nWPDTaYyJEHgwKWhkS84,12666
6
+ upgini/features_enricher.py,sha256=g1daAwtECDpSsM2TtVmozU3crsdMQ_xC5PlBJAafAX0,236099
7
+ upgini/http.py,sha256=y26x4TQVYuEM3jz8JdASxSyBtvBemUkFf-FmX25sx-s,44356
8
+ upgini/metadata.py,sha256=BwUTCY-EUHqPtO0tGazHrk3wqhh-NfjNZhlBHW8bR78,12796
9
9
  upgini/metrics.py,sha256=KCPE_apPN-9BIdv6GqASbJVaB_gBcy8wzNApAcyaGo4,46020
10
10
  upgini/search_task.py,sha256=5mL_qV5mVtDkIumM9xCOgfa9Lc2B8mxJ1qI21iaScnQ,18656
11
11
  upgini/spinner.py,sha256=4iMd-eIe_BnkqFEMIliULTbj6rNI2HkN_VJ4qYe0cUc,1118
@@ -52,7 +52,7 @@ upgini/utils/config.py,sha256=zFdnjchykfp_1Tm3Qep7phLzXBpXIOzr2tIuXchRBLw,1754
52
52
  upgini/utils/country_utils.py,sha256=lY-eXWwFVegdVENFttbvLcgGDjFO17Sex8hd2PyJaRk,6937
53
53
  upgini/utils/custom_loss_utils.py,sha256=kieNZYBYZm5ZGBltF1F_jOSF4ea6C29rYuCyiDcqVNY,3857
54
54
  upgini/utils/cv_utils.py,sha256=w6FQb9nO8BWDx88EF83NpjPLarK4eR4ia0Wg0kLBJC4,3525
55
- upgini/utils/datetime_utils.py,sha256=3_FQoa_ywgEeznaEPN2kuH_ES-LZJWSN2AI39sM9NRg,16988
55
+ upgini/utils/datetime_utils.py,sha256=B9zNaH2ZyV-lbBSTBdCZjc4zq1nVlejci40sf-TYfik,17102
56
56
  upgini/utils/deduplicate_utils.py,sha256=CLX0QapRxB-ZVQT7yGvv1vSd2zac5SwRjCJavujdCps,11332
57
57
  upgini/utils/display_utils.py,sha256=MoTqXZJvC6pAqgOaI3V0FG-IU_LnMfrn4TDcNvUqsdg,13316
58
58
  upgini/utils/email_utils.py,sha256=pZ2vCfNxLIPUhxr0-OlABNXm12jjU44isBk8kGmqQzA,5277
@@ -74,7 +74,7 @@ upgini/utils/target_utils.py,sha256=CihpV6SC95HwtlMH60rGAUzVDa4Id0Bva8ySprmNHlE,
74
74
  upgini/utils/track_info.py,sha256=NDKeQTUlZaYp15UoP-xLKGoDoJQ0drbDMwB0g9R0PUg,6427
75
75
  upgini/utils/ts_utils.py,sha256=26vhC0pN7vLXK6R09EEkMK3Lwb9IVPH7LRdqFIQ3kPs,1383
76
76
  upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
77
- upgini-1.2.136.dist-info/METADATA,sha256=iszl9ghp_J6AHfTIZCtf5PliRGfKprpYxdwPfSUsEPg,51133
78
- upgini-1.2.136.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
79
- upgini-1.2.136.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
80
- upgini-1.2.136.dist-info/RECORD,,
77
+ upgini-1.2.137.dist-info/METADATA,sha256=qTIcTcJz2tn18BOyAa7SbP14-fOxG4l3rtQyCxG8wDI,51164
78
+ upgini-1.2.137.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
79
+ upgini-1.2.137.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
80
+ upgini-1.2.137.dist-info/RECORD,,