upgini 1.1.316a3__py3-none-any.whl → 1.1.316a5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of upgini might be problematic; consult the package registry's advisory for more details.

upgini/__about__.py CHANGED
@@ -1 +1 @@
1
- __version__ = "1.1.316a3"
1
+ __version__ = "1.1.316a5"
upgini/autofe/date.py CHANGED
@@ -4,11 +4,16 @@ from typing import Any, Dict, List, Optional, Union
4
4
  import numpy as np
5
5
  import pandas as pd
6
6
  from pandas.core.arrays.timedeltas import TimedeltaArray
7
- from pydantic import BaseModel, validator
7
+ from pydantic import BaseModel, __version__ as pydantic_version
8
8
 
9
9
  from upgini.autofe.operand import PandasOperand
10
10
 
11
11
 
12
+ def get_pydantic_version():
13
+ major_version = int(pydantic_version.split('.')[0])
14
+ return major_version
15
+
16
+
12
17
  class DateDiffMixin(BaseModel):
13
18
  diff_unit: str = "D"
14
19
  left_unit: Optional[str] = None
@@ -246,12 +251,25 @@ class DatePercentile(DatePercentileBase):
246
251
  )
247
252
  return res
248
253
 
249
- @validator("zero_bounds", pre="true")
250
- def validate_bounds(cls, value):
251
- if value is None or isinstance(value, list):
254
+ # Check Pydantic version
255
+ if get_pydantic_version() >= 2:
256
+ # Use @field_validator for Pydantic 2.x
257
+ from pydantic import field_validator
258
+
259
+ @field_validator('zero_bounds', mode='before')
260
+ def parse_zero_bounds(cls, value):
261
+ if isinstance(value, str):
262
+ return value[1:-1].split(", ")
263
+ return value
264
+ else:
265
+ # Use @validator for Pydantic 1.x
266
+ from pydantic import validator
267
+
268
+ @validator('zero_bounds', pre=True)
269
+ def parse_zero_bounds(cls, value):
270
+ if isinstance(value, str):
271
+ return value[1:-1].split(", ")
252
272
  return value
253
- elif isinstance(value, str):
254
- return value[1:-1].split(", ")
255
273
 
256
274
  def _get_bounds(self, date_col: pd.Series) -> pd.Series:
257
275
  months = date_col.dt.month
upgini/autofe/feature.py CHANGED
@@ -82,9 +82,9 @@ class Feature:
82
82
  self.alias = alias
83
83
 
84
84
  def set_op_params(self, params: Optional[Dict[str, str]]) -> "Feature":
85
- obj_dict = self.op.model_dump().copy()
85
+ obj_dict = self.op.dict().copy()
86
86
  obj_dict.update(params or {})
87
- self.op = self.op.__class__.model_validate(obj_dict)
87
+ self.op = self.op.__class__.parse_obj(obj_dict)
88
88
  self.op.set_params(params)
89
89
 
90
90
  for child in self.children:
upgini/dataset.py CHANGED
@@ -18,6 +18,7 @@ from pandas.api.types import (
18
18
  from upgini.errors import ValidationError
19
19
  from upgini.http import ProgressStage, SearchProgress, _RestClient
20
20
  from upgini.metadata import (
21
+ ENTITY_SYSTEM_RECORD_ID,
21
22
  EVAL_SET_INDEX,
22
23
  SYSTEM_RECORD_ID,
23
24
  TARGET,
@@ -157,7 +158,11 @@ class Dataset: # (pd.DataFrame):
157
158
  raise ValidationError(self.bundle.get("dataset_too_few_rows").format(self.MIN_ROWS_COUNT))
158
159
 
159
160
  def __validate_max_row_count(self):
160
- if len(self.data) > self.MAX_ROWS:
161
+ if ENTITY_SYSTEM_RECORD_ID in self.data.columns:
162
+ rows_count = self.data[ENTITY_SYSTEM_RECORD_ID].nunique()
163
+ else:
164
+ rows_count = len(self.data)
165
+ if rows_count > self.MAX_ROWS:
161
166
  raise ValidationError(self.bundle.get("dataset_too_many_rows_registered").format(self.MAX_ROWS))
162
167
 
163
168
  def __target_value(self) -> pd.Series:
@@ -199,14 +204,14 @@ class Dataset: # (pd.DataFrame):
199
204
  elif self.task_type == ModelTaskType.REGRESSION:
200
205
  if not is_float_dtype(target):
201
206
  try:
202
- self.data[target_column] = self.data[target_column].astype("float")
207
+ self.data[target_column] = self.data[target_column].astype("float64")
203
208
  except ValueError:
204
209
  self.logger.exception("Failed to cast target to float for regression task type")
205
210
  raise ValidationError(self.bundle.get("dataset_invalid_regression_target").format(target.dtype))
206
211
  elif self.task_type == ModelTaskType.TIMESERIES:
207
212
  if not is_float_dtype(target):
208
213
  try:
209
- self.data[target_column] = self.data[target_column].astype("float")
214
+ self.data[target_column] = self.data[target_column].astype("float64")
210
215
  except ValueError:
211
216
  self.logger.exception("Failed to cast target to float for timeseries task type")
212
217
  raise ValidationError(self.bundle.get("dataset_invalid_timeseries_target").format(target.dtype))
@@ -2042,7 +2042,7 @@ class FeaturesEnricher(TransformerMixin):
2042
2042
 
2043
2043
  df[ENTITY_SYSTEM_RECORD_ID] = pd.util.hash_pandas_object(
2044
2044
  df[columns_for_system_record_id], index=False
2045
- ).astype("Float64")
2045
+ ).astype("float64")
2046
2046
 
2047
2047
  # Explode multiple search keys
2048
2048
  df, unnest_search_keys = self._explode_multiple_search_keys(df, search_keys, columns_renaming)
@@ -2108,7 +2108,7 @@ class FeaturesEnricher(TransformerMixin):
2108
2108
  # search keys might be changed after explode
2109
2109
  columns_for_system_record_id = sorted(list(search_keys.keys()) + features_for_transform)
2110
2110
  df[SYSTEM_RECORD_ID] = pd.util.hash_pandas_object(df[columns_for_system_record_id], index=False).astype(
2111
- "Float64"
2111
+ "float64"
2112
2112
  )
2113
2113
  meaning_types[SYSTEM_RECORD_ID] = FileColumnMeaningType.SYSTEM_RECORD_ID
2114
2114
  meaning_types[ENTITY_SYSTEM_RECORD_ID] = FileColumnMeaningType.ENTITY_SYSTEM_RECORD_ID
upgini/http.py CHANGED
@@ -440,18 +440,18 @@ class _RestClient:
440
440
  content = file.read()
441
441
  md5_hash.update(content)
442
442
  digest = md5_hash.hexdigest()
443
- metadata_with_md5 = metadata.model_copy(update={"checksumMD5": digest})
443
+ metadata_with_md5 = metadata.copy(update={"checksumMD5": digest})
444
444
 
445
445
  digest_sha256 = hashlib.sha256(
446
446
  pd.util.hash_pandas_object(pd.read_parquet(file_path, engine="fastparquet")).values
447
447
  ).hexdigest()
448
- metadata_with_md5 = metadata_with_md5.model_copy(update={"digest": digest_sha256})
448
+ metadata_with_md5 = metadata_with_md5.copy(update={"digest": digest_sha256})
449
449
 
450
450
  with open(file_path, "rb") as file:
451
451
  files = {
452
452
  "metadata": (
453
453
  "metadata.json",
454
- metadata_with_md5.model_dump_json(exclude_none=True).encode(),
454
+ metadata_with_md5.json(exclude_none=True).encode(),
455
455
  "application/json",
456
456
  ),
457
457
  "tracking": (
@@ -461,7 +461,7 @@ class _RestClient:
461
461
  ),
462
462
  "metrics": (
463
463
  "metrics.json",
464
- metrics.model_dump_json(exclude_none=True).encode(),
464
+ metrics.json(exclude_none=True).encode(),
465
465
  "application/json",
466
466
  ),
467
467
  "file": (metadata_with_md5.name, file, "application/octet-stream"),
@@ -469,7 +469,7 @@ class _RestClient:
469
469
  if search_customization is not None:
470
470
  files["customization"] = (
471
471
  "customization.json",
472
- search_customization.model_dump_json(exclude_none=True).encode(),
472
+ search_customization.json(exclude_none=True).encode(),
473
473
  "application/json",
474
474
  )
475
475
  additional_headers = {self.SEARCH_KEYS_HEADER_NAME: ",".join(self.search_keys_meaning_types(metadata))}
@@ -484,7 +484,7 @@ class _RestClient:
484
484
  def check_uploaded_file_v2(self, trace_id: str, file_upload_id: str, metadata: FileMetadata) -> bool:
485
485
  api_path = self.CHECK_UPLOADED_FILE_URL_FMT_V2.format(file_upload_id)
486
486
  response = self._with_unauth_retry(
487
- lambda: self._send_post_req(api_path, trace_id, metadata.model_dump_json(exclude_none=True))
487
+ lambda: self._send_post_req(api_path, trace_id, metadata.json(exclude_none=True))
488
488
  )
489
489
  return bool(response)
490
490
 
@@ -498,11 +498,11 @@ class _RestClient:
498
498
  ) -> SearchTaskResponse:
499
499
  api_path = self.INITIAL_SEARCH_WITHOUT_UPLOAD_URI_FMT_V2.format(file_upload_id)
500
500
  files = {
501
- "metadata": ("metadata.json", metadata.model_dump_json(exclude_none=True).encode(), "application/json"),
502
- "metrics": ("metrics.json", metrics.model_dump_json(exclude_none=True).encode(), "application/json"),
501
+ "metadata": ("metadata.json", metadata.json(exclude_none=True).encode(), "application/json"),
502
+ "metrics": ("metrics.json", metrics.json(exclude_none=True).encode(), "application/json"),
503
503
  }
504
504
  if search_customization is not None:
505
- files["customization"] = search_customization.model_dump_json(exclude_none=True).encode()
505
+ files["customization"] = search_customization.json(exclude_none=True).encode()
506
506
  additional_headers = {self.SEARCH_KEYS_HEADER_NAME: ",".join(self.search_keys_meaning_types(metadata))}
507
507
  response = self._with_unauth_retry(
508
508
  lambda: self._send_post_file_req_v2(
@@ -528,18 +528,18 @@ class _RestClient:
528
528
  content = file.read()
529
529
  md5_hash.update(content)
530
530
  digest = md5_hash.hexdigest()
531
- metadata_with_md5 = metadata.model_copy(update={"checksumMD5": digest})
531
+ metadata_with_md5 = metadata.copy(update={"checksumMD5": digest})
532
532
 
533
533
  digest_sha256 = hashlib.sha256(
534
534
  pd.util.hash_pandas_object(pd.read_parquet(file_path, engine="fastparquet")).values
535
535
  ).hexdigest()
536
- metadata_with_md5 = metadata_with_md5.model_copy(update={"digest": digest_sha256})
536
+ metadata_with_md5 = metadata_with_md5.copy(update={"digest": digest_sha256})
537
537
 
538
538
  with open(file_path, "rb") as file:
539
539
  files = {
540
540
  "metadata": (
541
541
  "metadata.json",
542
- metadata_with_md5.model_dump_json(exclude_none=True).encode(),
542
+ metadata_with_md5.json(exclude_none=True).encode(),
543
543
  "application/json",
544
544
  ),
545
545
  "tracking": (
@@ -549,7 +549,7 @@ class _RestClient:
549
549
  ),
550
550
  "metrics": (
551
551
  "metrics.json",
552
- metrics.model_dump_json(exclude_none=True).encode(),
552
+ metrics.json(exclude_none=True).encode(),
553
553
  "application/json",
554
554
  ),
555
555
  "file": (metadata_with_md5.name, file, "application/octet-stream"),
@@ -557,7 +557,7 @@ class _RestClient:
557
557
  if search_customization is not None:
558
558
  files["customization"] = (
559
559
  "customization.json",
560
- search_customization.model_dump_json(exclude_none=True).encode(),
560
+ search_customization.json(exclude_none=True).encode(),
561
561
  "application/json",
562
562
  )
563
563
 
@@ -581,11 +581,11 @@ class _RestClient:
581
581
  ) -> SearchTaskResponse:
582
582
  api_path = self.VALIDATION_SEARCH_WITHOUT_UPLOAD_URI_FMT_V2.format(file_upload_id, initial_search_task_id)
583
583
  files = {
584
- "metadata": ("metadata.json", metadata.model_dump_json(exclude_none=True).encode(), "application/json"),
585
- "metrics": ("metrics.json", metrics.model_dump_json(exclude_none=True).encode(), "application/json"),
584
+ "metadata": ("metadata.json", metadata.json(exclude_none=True).encode(), "application/json"),
585
+ "metrics": ("metrics.json", metrics.json(exclude_none=True).encode(), "application/json"),
586
586
  }
587
587
  if search_customization is not None:
588
- files["customization"] = search_customization.model_dump_json(exclude_none=True).encode()
588
+ files["customization"] = search_customization.json(exclude_none=True).encode()
589
589
  additional_headers = {self.SEARCH_KEYS_HEADER_NAME: ",".join(self.search_keys_meaning_types(metadata))}
590
590
  response = self._with_unauth_retry(
591
591
  lambda: self._send_post_file_req_v2(
@@ -649,7 +649,7 @@ class _RestClient:
649
649
  "file": (metadata.name, file, "application/octet-stream"),
650
650
  "metadata": (
651
651
  "metadata.json",
652
- metadata.model_dump_json(exclude_none=True).encode(),
652
+ metadata.json(exclude_none=True).encode(),
653
653
  "application/json",
654
654
  ),
655
655
  }
@@ -661,12 +661,12 @@ class _RestClient:
661
661
  def get_search_file_metadata(self, search_task_id: str, trace_id: str) -> FileMetadata:
662
662
  api_path = self.SEARCH_FILE_METADATA_URI_FMT_V2.format(search_task_id)
663
663
  response = self._with_unauth_retry(lambda: self._send_get_req(api_path, trace_id))
664
- return FileMetadata.model_validate(response)
664
+ return FileMetadata.parse_obj(response)
665
665
 
666
666
  def get_provider_search_metadata_v3(self, provider_search_task_id: str, trace_id: str) -> ProviderTaskMetadataV2:
667
667
  api_path = self.SEARCH_TASK_METADATA_FMT_V3.format(provider_search_task_id)
668
668
  response = self._with_unauth_retry(lambda: self._send_get_req(api_path, trace_id))
669
- return ProviderTaskMetadataV2.model_validate(response)
669
+ return ProviderTaskMetadataV2.parse_obj(response)
670
670
 
671
671
  def get_current_transform_usage(self, trace_id) -> TransformUsage:
672
672
  track_metrics = get_track_metrics(self.client_ip, self.client_visitorid)
@@ -25,7 +25,7 @@ class PostalCodeSearchKeyConverter:
25
25
  if is_string_dtype(df[self.postal_code_column]) or is_object_dtype(df[self.postal_code_column]):
26
26
  try:
27
27
  df[self.postal_code_column] = (
28
- df[self.postal_code_column].astype("string").astype("Float64").astype("Int64").astype("string")
28
+ df[self.postal_code_column].astype("string").astype("float64").astype("Int64").astype("string")
29
29
  )
30
30
  except Exception:
31
31
  pass
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: upgini
3
- Version: 1.1.316a3
3
+ Version: 1.1.316a5
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
6
6
  Project-URL: Homepage, https://upgini.com/
@@ -1,10 +1,10 @@
1
- upgini/__about__.py,sha256=xP_PhI7jmSCABPEedhQOlt9k8Njn3IHiI7PyPcsXGQQ,26
1
+ upgini/__about__.py,sha256=5SaWm460mZelKwFqDXwqvCt7MFnWhCTJ17oJESSgrVA,26
2
2
  upgini/__init__.py,sha256=Xs0YFVBu1KUdtZzbStGRPQtLt3YLzJnjx5nIUBlX8BE,415
3
3
  upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
4
- upgini/dataset.py,sha256=yAWIygHejxdKXOA4g3QjtCu0VRa9at-4nPPuugCr77U,30857
4
+ upgini/dataset.py,sha256=olZ-OHSfBNoBSCo7R5t7uCLukI2nO7afpx_A-HCiJLk,31067
5
5
  upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
6
- upgini/features_enricher.py,sha256=_d8ya5RRoYN0o6mV6gda-bLdOngQ4rb1SA51SlM_TG0,188002
7
- upgini/http.py,sha256=gCN5ru_I6JNHk-m6-Ckjhd23iMzOAzDSLb0tSEcxkC4,43068
6
+ upgini/features_enricher.py,sha256=9l8C3p6OaLkgE9O_kln_uJxqY1A7qqDgi5_l7X7ukeE,188002
7
+ upgini/http.py,sha256=21asexflvavydzCOONJDGQBtQanCElrbnqLXakJ9Cu8,42880
8
8
  upgini/lazy_import.py,sha256=74gQ8JuA48BGRLxAo7lNHNKY2D2emMxrUxKGdxVGhuY,1012
9
9
  upgini/metadata.py,sha256=osmzdNESeh7yP3BZday6N9Q3eaIHfzhhRM1d6NSgcf0,11223
10
10
  upgini/metrics.py,sha256=Tu5cN8RlhOSSMWUTXRSkdl8SWBqR1N_2eJpBum9pZxc,30926
@@ -16,8 +16,8 @@ upgini/ads_management/ads_manager.py,sha256=igVbN2jz80Umb2BUJixmJVj-zx8unoKpecVo
16
16
  upgini/autofe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
17
17
  upgini/autofe/all_operands.py,sha256=3LiH9iU-ArGmYpS8FHWH7yCFx40ILfvlSXJlKIa75BQ,2542
18
18
  upgini/autofe/binary.py,sha256=xRBT7RNqQ7pprz6cRpO1KnvZCb7PvU3QXBfaP6Omqi4,7425
19
- upgini/autofe/date.py,sha256=aKuEsguYSrFdFiLd6tBLVH4TiQ3JFMo_49_Ajp8eKQg,9208
20
- upgini/autofe/feature.py,sha256=CivPkE7YrAtDrgF8WhVPnDAnNDR8gbRQ-8_hXiQE6ew,14234
19
+ upgini/autofe/date.py,sha256=ku3kcmzpPmyUmpXHIBwT6JCIkaslRknW8DifUXvFnG8,9762
20
+ upgini/autofe/feature.py,sha256=gwGWY2UcX_0wHAvfEiu1rRU7GFZyzMWZIaPVcf6kD80,14223
21
21
  upgini/autofe/groupby.py,sha256=r-xl_keZZgm_tpiEoDhjYSkT6NHv7a4cRQR4wJ4uCp8,3263
22
22
  upgini/autofe/operand.py,sha256=uk883RaNqgXqtkaRqA1re1d9OFnnpv0JVvelYx09Yw0,2943
23
23
  upgini/autofe/unary.py,sha256=RiK-Fz3fgjPlqWWfro6x7qChjEZ8W8RTnl5-MT1kaQA,4218
@@ -51,13 +51,13 @@ upgini/utils/features_validator.py,sha256=PgKNt5dyqfErTvjtRNNUS9g7GFqHBtAtnsfA-V
51
51
  upgini/utils/format.py,sha256=Yv5cvvSs2bOLUzzNu96Pu33VMDNbabio92QepUj41jU,243
52
52
  upgini/utils/ip_utils.py,sha256=ZZj_uQFTHhagzt-MRew__ZBOp2DdnkMrachS7PElkSE,5143
53
53
  upgini/utils/phone_utils.py,sha256=IrbztLuOJBiePqqxllfABWfYlfAjYevPhXKipl95wUI,10432
54
- upgini/utils/postal_code_utils.py,sha256=C899tJS8qM_ps4I3g-Ve6qzIa22O_UqwNmGFoyy9sO8,1716
54
+ upgini/utils/postal_code_utils.py,sha256=5M0sUqH2DAr33kARWCTXR-ACyzWbjDq_-0mmEml6ZcU,1716
55
55
  upgini/utils/progress_bar.py,sha256=N-Sfdah2Hg8lXP_fV9EfUTXz_PyRt4lo9fAHoUDOoLc,1550
56
56
  upgini/utils/sklearn_ext.py,sha256=13jQS_k7v0aUtudXV6nGUEWjttPQzAW9AFYL5wgEz9k,44511
57
57
  upgini/utils/target_utils.py,sha256=BVtDmrmFMKerSUWaNOIEdzsYHIFiODdpnWbE50QDPDc,7864
58
58
  upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
59
59
  upgini/utils/warning_counter.py,sha256=dIWBB4dI5XRRJZudvIlqlIYKEiwLLPcXarsZuYRt338,227
60
- upgini-1.1.316a3.dist-info/METADATA,sha256=wqF_a0Mo2hFvIHf5cxVPquLOnkz0LHeIOmTdRUP7R9M,48232
61
- upgini-1.1.316a3.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
62
- upgini-1.1.316a3.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
63
- upgini-1.1.316a3.dist-info/RECORD,,
60
+ upgini-1.1.316a5.dist-info/METADATA,sha256=xGm11UrAxkdD9Fi3SYyek-IDOvUcDxA68Dy8cH0gQ3c,48232
61
+ upgini-1.1.316a5.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
62
+ upgini-1.1.316a5.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
63
+ upgini-1.1.316a5.dist-info/RECORD,,