upgini 1.2.135a3__py3-none-any.whl → 1.2.137__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of upgini might be problematic. Click here for more details.

upgini/__about__.py CHANGED
@@ -1 +1 @@
1
- __version__ = "1.2.135a3"
1
+ __version__ = "1.2.137"
upgini/dataset.py CHANGED
@@ -7,6 +7,8 @@ from typing import Any, Callable, Dict, List, Optional, Tuple
7
7
 
8
8
  import numpy as np
9
9
  import pandas as pd
10
+ import pyarrow as pa
11
+ import pyarrow.parquet as pq
10
12
  from pandas.api.types import (
11
13
  is_float_dtype,
12
14
  is_integer_dtype,
@@ -18,6 +20,7 @@ from pandas.api.types import (
18
20
  from upgini.errors import ValidationError
19
21
  from upgini.http import ProgressStage, SearchProgress, _RestClient
20
22
  from upgini.metadata import (
23
+ CURRENT_DATE_COL,
21
24
  ENTITY_SYSTEM_RECORD_ID,
22
25
  EVAL_SET_INDEX,
23
26
  SYSTEM_RECORD_ID,
@@ -38,6 +41,7 @@ from upgini.resource_bundle import ResourceBundle, get_custom_bundle
38
41
  from upgini.search_task import SearchTask
39
42
  from upgini.utils.config import SampleConfig
40
43
  from upgini.utils.email_utils import EmailSearchKeyConverter
44
+ from upgini.utils.hash_utils import file_hash
41
45
  from upgini.utils.sample_utils import SampleColumns, sample
42
46
 
43
47
  try:
@@ -71,6 +75,7 @@ class Dataset:
71
75
  date_column: Optional[str] = None,
72
76
  id_columns: Optional[List[str]] = None,
73
77
  is_imbalanced: bool = False,
78
+ dropped_columns: Optional[List[str]] = None,
74
79
  random_state: Optional[int] = None,
75
80
  sample_config: Optional[SampleConfig] = None,
76
81
  rest_client: Optional[_RestClient] = None,
@@ -118,6 +123,7 @@ class Dataset:
118
123
  self.is_imbalanced: bool = False
119
124
  self.id_columns = id_columns
120
125
  self.is_imbalanced = is_imbalanced
126
+ self.dropped_columns = dropped_columns
121
127
  self.date_column = date_column
122
128
  if logger is not None:
123
129
  self.logger = logger
@@ -285,7 +291,7 @@ class Dataset:
285
291
  for key in search_group
286
292
  if key in self.columns_renaming
287
293
  and not self.columns_renaming.get(key).endswith(EmailSearchKeyConverter.ONE_DOMAIN_SUFFIX)
288
- and not self.columns_renaming.get(key) == "current_date"
294
+ and not self.columns_renaming.get(key) == CURRENT_DATE_COL
289
295
  }
290
296
  ipv4_column = self.etalon_def_checked.get(FileColumnMeaningType.IP_ADDRESS.value)
291
297
  if (
@@ -467,6 +473,33 @@ class Dataset:
467
473
 
468
474
  columns.append(column_meta)
469
475
 
476
+ current_date = int(pd.Timestamp(pd.Timestamp.now().date(), tz="UTC").timestamp() * 1000)
477
+ with tempfile.TemporaryDirectory() as tmp_dir:
478
+ if (
479
+ self.date_column is not None
480
+ and self.data[self.date_column].nunique() == 1
481
+ and self.data[self.date_column].iloc[0] == current_date
482
+ ):
483
+ df_without_fake_date = self.data.drop(columns=[self.date_column])
484
+ else:
485
+ df_without_fake_date = self.data
486
+ parquet_file_path = f"{tmp_dir}/{self.dataset_name}.parquet"
487
+
488
+ # calculate deterministic digest for any environment
489
+
490
+ table = pa.Table.from_pandas(df_without_fake_date, preserve_index=False)
491
+ table = table.replace_schema_metadata({}) # remove all metadata
492
+ pq.write_table(
493
+ table,
494
+ parquet_file_path,
495
+ compression=None, # any compression will make it non-deterministic
496
+ data_page_size=0, # optional, to remove page layout variations
497
+ use_deprecated_int96_timestamps=False, # fix timestamp format
498
+ write_statistics=False, # remove statistics to make it deterministic
499
+ )
500
+
501
+ deterministic_digest = file_hash(parquet_file_path)
502
+
470
503
  return FileMetadata(
471
504
  name=self.dataset_name,
472
505
  description=self.description,
@@ -476,6 +509,8 @@ class Dataset:
476
509
  hierarchicalGroupKeys=self.hierarchical_group_keys,
477
510
  hierarchicalSubgroupKeys=self.hierarchical_subgroup_keys,
478
511
  taskType=self.task_type,
512
+ droppedColumns=self.dropped_columns,
513
+ deterministicDigest=deterministic_digest,
479
514
  )
480
515
 
481
516
  @staticmethod
upgini/features_enricher.py CHANGED
@@ -44,6 +44,7 @@ from upgini.http import (
44
44
  from upgini.mdc import MDC
45
45
  from upgini.metadata import (
46
46
  COUNTRY,
47
+ CURRENT_DATE_COL,
47
48
  DEFAULT_INDEX,
48
49
  ENTITY_SYSTEM_RECORD_ID,
49
50
  EVAL_SET_INDEX,
@@ -167,7 +168,6 @@ class FeaturesEnricher(TransformerMixin):
167
168
  """
168
169
 
169
170
  TARGET_NAME = "target"
170
- CURRENT_DATE = "current_date"
171
171
  RANDOM_STATE = 42
172
172
  CALCULATE_METRICS_THRESHOLD = 50_000_000
173
173
  CALCULATE_METRICS_MIN_THRESHOLD = 500
@@ -1737,6 +1737,10 @@ class FeaturesEnricher(TransformerMixin):
1737
1737
 
1738
1738
  self.logger.info(f"Excluding search keys: {excluding_search_keys}")
1739
1739
 
1740
+ file_meta = self._search_task.get_file_metadata(trace_id)
1741
+ fit_dropped_features = self.fit_dropped_features or file_meta.droppedColumns or []
1742
+ original_dropped_features = [columns_renaming.get(f, f) for f in fit_dropped_features]
1743
+
1740
1744
  client_features = [
1741
1745
  c
1742
1746
  for c in validated_X.columns.to_list()
@@ -1744,7 +1748,7 @@ class FeaturesEnricher(TransformerMixin):
1744
1748
  and c
1745
1749
  not in (
1746
1750
  excluding_search_keys
1747
- + list(self.fit_dropped_features)
1751
+ + original_dropped_features
1748
1752
  + [DateTimeConverter.DATETIME_COL, SYSTEM_RECORD_ID, ENTITY_SYSTEM_RECORD_ID]
1749
1753
  )
1750
1754
  ]
@@ -2955,8 +2959,10 @@ if response.status_code == 200:
2955
2959
  trace_id: str,
2956
2960
  is_transform: bool = False,
2957
2961
  ):
2958
- fit_input_columns = [c.originalName for c in self._search_task.get_file_metadata(trace_id).columns]
2959
- original_dropped_features = [self.fit_columns_renaming.get(c, c) for c in self.fit_dropped_features]
2962
+ file_meta = self._search_task.get_file_metadata(trace_id)
2963
+ fit_dropped_features = self.fit_dropped_features or file_meta.droppedColumns or []
2964
+ fit_input_columns = [c.originalName for c in file_meta.columns]
2965
+ original_dropped_features = [self.fit_columns_renaming.get(c, c) for c in fit_dropped_features]
2960
2966
  new_columns_on_transform = [
2961
2967
  c for c in validated_Xy.columns if c not in fit_input_columns and c not in original_dropped_features
2962
2968
  ]
@@ -2977,6 +2983,9 @@ if response.status_code == 200:
2977
2983
  else:
2978
2984
  selected_input_columns = []
2979
2985
 
2986
+ if DEFAULT_INDEX in selected_input_columns:
2987
+ selected_input_columns.remove(DEFAULT_INDEX)
2988
+
2980
2989
  return selected_input_columns + selected_generated_features
2981
2990
 
2982
2991
  def __validate_search_keys(self, search_keys: dict[str, SearchKey], search_id: str | None = None):
@@ -3161,7 +3170,7 @@ if response.status_code == 200:
3161
3170
 
3162
3171
  if DEFAULT_INDEX in df.columns:
3163
3172
  msg = self.bundle.get("unsupported_index_column")
3164
- self.logger.info(msg)
3173
+ self.logger.warning(msg)
3165
3174
  print(msg)
3166
3175
  self.fit_dropped_features.add(DEFAULT_INDEX)
3167
3176
  df.drop(columns=DEFAULT_INDEX, inplace=True)
@@ -3380,6 +3389,7 @@ if response.status_code == 200:
3380
3389
  cv_type=self.cv,
3381
3390
  id_columns=self.__get_renamed_id_columns(),
3382
3391
  is_imbalanced=self.imbalanced,
3392
+ dropped_columns=[self.fit_columns_renaming.get(f, f) for f in self.fit_dropped_features],
3383
3393
  date_column=self._get_date_column(self.fit_search_keys),
3384
3394
  date_format=self.date_format,
3385
3395
  random_state=self.random_state,
@@ -4086,9 +4096,10 @@ if response.status_code == 200:
4086
4096
  ):
4087
4097
  if not silent:
4088
4098
  self.__log_warning(bundle.get("current_date_added"))
4089
- df[FeaturesEnricher.CURRENT_DATE] = datetime.date.today()
4090
- search_keys[FeaturesEnricher.CURRENT_DATE] = SearchKey.DATE
4091
- converter = DateTimeConverter(FeaturesEnricher.CURRENT_DATE, generate_cyclical_features=False)
4099
+ df[CURRENT_DATE_COL] = datetime.date.today()
4100
+ # df[CURRENT_DATE_COL] = datetime.date(2025, 10, 15)
4101
+ search_keys[CURRENT_DATE_COL] = SearchKey.DATE
4102
+ converter = DateTimeConverter(CURRENT_DATE_COL, generate_cyclical_features=False)
4092
4103
  df = converter.convert(df)
4093
4104
  return df
4094
4105
 
@@ -4102,7 +4113,7 @@ if response.status_code == 200:
4102
4113
  return [
4103
4114
  col
4104
4115
  for col, t in search_keys.items()
4105
- if t not in [SearchKey.DATE, SearchKey.DATETIME] and df[col].dropna().nunique() > 1
4116
+ if t not in [SearchKey.DATE, SearchKey.DATETIME] and col in df.columns and df[col].dropna().nunique() > 1
4106
4117
  ]
4107
4118
 
4108
4119
  @staticmethod
upgini/http.py CHANGED
@@ -433,8 +433,8 @@ class _RestClient:
433
433
  with open(file_path, "rb") as file:
434
434
  content = file.read()
435
435
  md5_hash.update(content)
436
- digest = md5_hash.hexdigest()
437
- metadata_with_md5 = pydantic_copy_method(metadata)(update={"checksumMD5": digest})
436
+ digest_md5 = md5_hash.hexdigest()
437
+ metadata_with_md5 = pydantic_copy_method(metadata)(update={"checksumMD5": digest_md5})
438
438
 
439
439
  digest_sha256 = file_hash(file_path)
440
440
  metadata_with_md5 = pydantic_copy_method(metadata_with_md5)(update={"digest": digest_sha256})
upgini/metadata.py CHANGED
@@ -12,10 +12,19 @@ SORT_ID = "sort_id"
12
12
  EVAL_SET_INDEX = "eval_set_index"
13
13
  TARGET = "target"
14
14
  COUNTRY = "country_iso_code"
15
+ CURRENT_DATE_COL = "current_date_"
15
16
  RENAMED_INDEX = "index_col"
16
17
  DEFAULT_INDEX = "index"
17
18
  ORIGINAL_INDEX = "original_index"
18
- SYSTEM_COLUMNS = {SYSTEM_RECORD_ID, ENTITY_SYSTEM_RECORD_ID, SEARCH_KEY_UNNEST, EVAL_SET_INDEX, TARGET, COUNTRY}
19
+ SYSTEM_COLUMNS = {
20
+ SYSTEM_RECORD_ID,
21
+ ENTITY_SYSTEM_RECORD_ID,
22
+ SEARCH_KEY_UNNEST,
23
+ EVAL_SET_INDEX,
24
+ TARGET,
25
+ COUNTRY,
26
+ CURRENT_DATE_COL,
27
+ }
19
28
 
20
29
 
21
30
  class FileColumnMeaningType(Enum):
@@ -252,6 +261,8 @@ class FileMetadata(BaseModel):
252
261
  rowsCount: Optional[int] = None
253
262
  checksumMD5: Optional[str] = None
254
263
  digest: Optional[str] = None
264
+ deterministicDigest: Optional[str] = None
265
+ droppedColumns: Optional[List[str]] = None
255
266
 
256
267
  def column_by_name(self, name: str) -> Optional[FileColumnMetadata]:
257
268
  for c in self.columns:
upgini/utils/datetime_utils.py CHANGED
@@ -408,6 +408,10 @@ def is_dates_distribution_valid(
408
408
  if maybe_date_col is None:
409
409
  return
410
410
 
411
+ # Don't check if date column is constant
412
+ if X[maybe_date_col].nunique() <= 1:
413
+ return
414
+
411
415
  if isinstance(X[maybe_date_col].dtype, pd.PeriodDtype):
412
416
  dates = X[maybe_date_col].dt.to_timestamp().dt.date
413
417
  elif pd.__version__ >= "2.0.0":
upgini/utils/features_validator.py CHANGED
@@ -46,7 +46,7 @@ class FeaturesValidator:
46
46
 
47
47
  columns_renaming = columns_renaming or {}
48
48
 
49
- if one_hot_encoded_features:
49
+ if one_hot_encoded_features and len(one_hot_encoded_features) > 1:
50
50
  msg = bundle.get("one_hot_encoded_features").format(
51
51
  [columns_renaming.get(f, f) for f in one_hot_encoded_features]
52
52
  )
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: upgini
3
- Version: 1.2.135a3
3
+ Version: 1.2.137
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
6
6
  Project-URL: Homepage, https://upgini.com/
@@ -34,6 +34,7 @@ Requires-Dist: more-itertools==10.7.0
34
34
  Requires-Dist: numpy<3.0.0,>=1.19.0
35
35
  Requires-Dist: pandas<3.0.0,>=1.1.0
36
36
  Requires-Dist: psutil>=5.9.0
37
+ Requires-Dist: pyarrow==18.1.0
37
38
  Requires-Dist: pydantic<3.0.0,>1.0.0
38
39
  Requires-Dist: pyjwt>=2.8.0
39
40
  Requires-Dist: python-bidi==0.4.2
@@ -1,11 +1,11 @@
1
- upgini/__about__.py,sha256=zKar2BSHx-M-ruW4V1DQDr70UfG-vBoiDPsQB4duA-c,26
1
+ upgini/__about__.py,sha256=WAhVusMOc6hw9YR0UCWwVJJi3v2_uHEpPqxnSm9SguM,24
2
2
  upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
3
3
  upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
4
- upgini/dataset.py,sha256=qXo1eAUM21jmNMYGa6IeGuRJmjU3jtCieS4sAVVIfXs,31721
4
+ upgini/dataset.py,sha256=D9JzJJkZLPP_dp8GOlGgMhTtrd5pvP-4cHIcqiY3q-E,33354
5
5
  upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
6
- upgini/features_enricher.py,sha256=dC71_-18cqymS26EhEnfYnZFGDvRtvW6Pga4ZtJN-BI,235485
7
- upgini/http.py,sha256=-J_wOpnwVnT0ebPC6sOs6fN3AWtCD0LJLu6nlYmxaqk,44348
8
- upgini/metadata.py,sha256=H3wiN37k-yqWZgbPD0tJzx8DzaCIkgmX5cybhByQWLg,12619
6
+ upgini/features_enricher.py,sha256=g1daAwtECDpSsM2TtVmozU3crsdMQ_xC5PlBJAafAX0,236099
7
+ upgini/http.py,sha256=y26x4TQVYuEM3jz8JdASxSyBtvBemUkFf-FmX25sx-s,44356
8
+ upgini/metadata.py,sha256=BwUTCY-EUHqPtO0tGazHrk3wqhh-NfjNZhlBHW8bR78,12796
9
9
  upgini/metrics.py,sha256=KCPE_apPN-9BIdv6GqASbJVaB_gBcy8wzNApAcyaGo4,46020
10
10
  upgini/search_task.py,sha256=5mL_qV5mVtDkIumM9xCOgfa9Lc2B8mxJ1qI21iaScnQ,18656
11
11
  upgini/spinner.py,sha256=4iMd-eIe_BnkqFEMIliULTbj6rNI2HkN_VJ4qYe0cUc,1118
@@ -52,13 +52,13 @@ upgini/utils/config.py,sha256=zFdnjchykfp_1Tm3Qep7phLzXBpXIOzr2tIuXchRBLw,1754
52
52
  upgini/utils/country_utils.py,sha256=lY-eXWwFVegdVENFttbvLcgGDjFO17Sex8hd2PyJaRk,6937
53
53
  upgini/utils/custom_loss_utils.py,sha256=kieNZYBYZm5ZGBltF1F_jOSF4ea6C29rYuCyiDcqVNY,3857
54
54
  upgini/utils/cv_utils.py,sha256=w6FQb9nO8BWDx88EF83NpjPLarK4eR4ia0Wg0kLBJC4,3525
55
- upgini/utils/datetime_utils.py,sha256=3_FQoa_ywgEeznaEPN2kuH_ES-LZJWSN2AI39sM9NRg,16988
55
+ upgini/utils/datetime_utils.py,sha256=B9zNaH2ZyV-lbBSTBdCZjc4zq1nVlejci40sf-TYfik,17102
56
56
  upgini/utils/deduplicate_utils.py,sha256=CLX0QapRxB-ZVQT7yGvv1vSd2zac5SwRjCJavujdCps,11332
57
57
  upgini/utils/display_utils.py,sha256=MoTqXZJvC6pAqgOaI3V0FG-IU_LnMfrn4TDcNvUqsdg,13316
58
58
  upgini/utils/email_utils.py,sha256=pZ2vCfNxLIPUhxr0-OlABNXm12jjU44isBk8kGmqQzA,5277
59
59
  upgini/utils/fallback_progress_bar.py,sha256=PDaKb8dYpVZaWMroNcOHsTc3pSjgi9mOm0--cOFTwJ0,1074
60
60
  upgini/utils/feature_info.py,sha256=SQTRbSxJDkh2G2c0KGBmOv8f69gVzWbTtcXn0_2Qb-8,7945
61
- upgini/utils/features_validator.py,sha256=RdRMisZYeJ8HVCKiKxqSyWjoLf_MsZNXxHIuWf6H2g4,4939
61
+ upgini/utils/features_validator.py,sha256=pAyS57-jYlihMOhweM12GOvCTJC13fTIJ9lQwgoON5c,4977
62
62
  upgini/utils/format.py,sha256=Yv5cvvSs2bOLUzzNu96Pu33VMDNbabio92QepUj41jU,243
63
63
  upgini/utils/hash_utils.py,sha256=mP2yHyzvDNdpa5g3B4MHzulxBeEz_ZSoGl1YF_VnAyE,5538
64
64
  upgini/utils/ip_utils.py,sha256=wmnnwVQdjX9o1cNQw6VQMk6maHhvsq6hNsZBYf9knrw,6585
@@ -74,7 +74,7 @@ upgini/utils/target_utils.py,sha256=CihpV6SC95HwtlMH60rGAUzVDa4Id0Bva8ySprmNHlE,
74
74
  upgini/utils/track_info.py,sha256=NDKeQTUlZaYp15UoP-xLKGoDoJQ0drbDMwB0g9R0PUg,6427
75
75
  upgini/utils/ts_utils.py,sha256=26vhC0pN7vLXK6R09EEkMK3Lwb9IVPH7LRdqFIQ3kPs,1383
76
76
  upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
77
- upgini-1.2.135a3.dist-info/METADATA,sha256=p6JAaluvHU1B6Al8fHzq5KsgyNQvznWkEDwE1fhHuZI,51135
78
- upgini-1.2.135a3.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
79
- upgini-1.2.135a3.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
80
- upgini-1.2.135a3.dist-info/RECORD,,
77
+ upgini-1.2.137.dist-info/METADATA,sha256=qTIcTcJz2tn18BOyAa7SbP14-fOxG4l3rtQyCxG8wDI,51164
78
+ upgini-1.2.137.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
79
+ upgini-1.2.137.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
80
+ upgini-1.2.137.dist-info/RECORD,,