upgini 1.1.274__tar.gz → 1.1.274a1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of upgini might be problematic. Click here for more details.

Files changed (85)
  1. {upgini-1.1.274/src/upgini.egg-info → upgini-1.1.274a1}/PKG-INFO +2 -2
  2. {upgini-1.1.274 → upgini-1.1.274a1}/setup.py +2 -2
  3. {upgini-1.1.274 → upgini-1.1.274a1}/src/upgini/autofe/date.py +2 -9
  4. {upgini-1.1.274 → upgini-1.1.274a1}/src/upgini/data_source/data_source_publisher.py +1 -1
  5. {upgini-1.1.274 → upgini-1.1.274a1}/src/upgini/features_enricher.py +27 -66
  6. {upgini-1.1.274 → upgini-1.1.274a1}/src/upgini/metrics.py +0 -12
  7. {upgini-1.1.274 → upgini-1.1.274a1}/src/upgini/normalizer/phone_normalizer.py +2 -2
  8. {upgini-1.1.274 → upgini-1.1.274a1}/src/upgini/resource_bundle/strings.properties +1 -2
  9. {upgini-1.1.274 → upgini-1.1.274a1}/src/upgini/utils/datetime_utils.py +0 -3
  10. {upgini-1.1.274 → upgini-1.1.274a1}/src/upgini/utils/track_info.py +13 -25
  11. {upgini-1.1.274 → upgini-1.1.274a1/src/upgini.egg-info}/PKG-INFO +2 -2
  12. {upgini-1.1.274 → upgini-1.1.274a1}/src/upgini.egg-info/requires.txt +1 -1
  13. {upgini-1.1.274 → upgini-1.1.274a1}/tests/test_autofe_operands.py +1 -2
  14. {upgini-1.1.274 → upgini-1.1.274a1}/tests/test_features_enricher.py +13 -18
  15. {upgini-1.1.274 → upgini-1.1.274a1}/LICENSE +0 -0
  16. {upgini-1.1.274 → upgini-1.1.274a1}/README.md +0 -0
  17. {upgini-1.1.274 → upgini-1.1.274a1}/pyproject.toml +0 -0
  18. {upgini-1.1.274 → upgini-1.1.274a1}/setup.cfg +0 -0
  19. {upgini-1.1.274 → upgini-1.1.274a1}/src/upgini/__init__.py +0 -0
  20. {upgini-1.1.274 → upgini-1.1.274a1}/src/upgini/ads.py +0 -0
  21. {upgini-1.1.274 → upgini-1.1.274a1}/src/upgini/ads_management/__init__.py +0 -0
  22. {upgini-1.1.274 → upgini-1.1.274a1}/src/upgini/ads_management/ads_manager.py +0 -0
  23. {upgini-1.1.274 → upgini-1.1.274a1}/src/upgini/autofe/__init__.py +0 -0
  24. {upgini-1.1.274 → upgini-1.1.274a1}/src/upgini/autofe/all_operands.py +0 -0
  25. {upgini-1.1.274 → upgini-1.1.274a1}/src/upgini/autofe/binary.py +0 -0
  26. {upgini-1.1.274 → upgini-1.1.274a1}/src/upgini/autofe/feature.py +0 -0
  27. {upgini-1.1.274 → upgini-1.1.274a1}/src/upgini/autofe/groupby.py +0 -0
  28. {upgini-1.1.274 → upgini-1.1.274a1}/src/upgini/autofe/operand.py +0 -0
  29. {upgini-1.1.274 → upgini-1.1.274a1}/src/upgini/autofe/unary.py +0 -0
  30. {upgini-1.1.274 → upgini-1.1.274a1}/src/upgini/autofe/vector.py +0 -0
  31. {upgini-1.1.274 → upgini-1.1.274a1}/src/upgini/data_source/__init__.py +0 -0
  32. {upgini-1.1.274 → upgini-1.1.274a1}/src/upgini/dataset.py +0 -0
  33. {upgini-1.1.274 → upgini-1.1.274a1}/src/upgini/errors.py +0 -0
  34. {upgini-1.1.274 → upgini-1.1.274a1}/src/upgini/fingerprint.js +0 -0
  35. {upgini-1.1.274 → upgini-1.1.274a1}/src/upgini/http.py +0 -0
  36. {upgini-1.1.274 → upgini-1.1.274a1}/src/upgini/mdc/__init__.py +0 -0
  37. {upgini-1.1.274 → upgini-1.1.274a1}/src/upgini/mdc/context.py +0 -0
  38. {upgini-1.1.274 → upgini-1.1.274a1}/src/upgini/metadata.py +0 -0
  39. {upgini-1.1.274 → upgini-1.1.274a1}/src/upgini/normalizer/__init__.py +0 -0
  40. {upgini-1.1.274 → upgini-1.1.274a1}/src/upgini/resource_bundle/__init__.py +0 -0
  41. {upgini-1.1.274 → upgini-1.1.274a1}/src/upgini/resource_bundle/exceptions.py +0 -0
  42. {upgini-1.1.274 → upgini-1.1.274a1}/src/upgini/resource_bundle/strings_widget.properties +0 -0
  43. {upgini-1.1.274 → upgini-1.1.274a1}/src/upgini/sampler/__init__.py +0 -0
  44. {upgini-1.1.274 → upgini-1.1.274a1}/src/upgini/sampler/base.py +0 -0
  45. {upgini-1.1.274 → upgini-1.1.274a1}/src/upgini/sampler/random_under_sampler.py +0 -0
  46. {upgini-1.1.274 → upgini-1.1.274a1}/src/upgini/sampler/utils.py +0 -0
  47. {upgini-1.1.274 → upgini-1.1.274a1}/src/upgini/search_task.py +0 -0
  48. {upgini-1.1.274 → upgini-1.1.274a1}/src/upgini/spinner.py +0 -0
  49. {upgini-1.1.274 → upgini-1.1.274a1}/src/upgini/utils/__init__.py +0 -0
  50. {upgini-1.1.274 → upgini-1.1.274a1}/src/upgini/utils/base_search_key_detector.py +0 -0
  51. {upgini-1.1.274 → upgini-1.1.274a1}/src/upgini/utils/blocked_time_series.py +0 -0
  52. {upgini-1.1.274 → upgini-1.1.274a1}/src/upgini/utils/country_utils.py +0 -0
  53. {upgini-1.1.274 → upgini-1.1.274a1}/src/upgini/utils/custom_loss_utils.py +0 -0
  54. {upgini-1.1.274 → upgini-1.1.274a1}/src/upgini/utils/cv_utils.py +0 -0
  55. {upgini-1.1.274 → upgini-1.1.274a1}/src/upgini/utils/deduplicate_utils.py +0 -0
  56. {upgini-1.1.274 → upgini-1.1.274a1}/src/upgini/utils/display_utils.py +0 -0
  57. {upgini-1.1.274 → upgini-1.1.274a1}/src/upgini/utils/email_utils.py +0 -0
  58. {upgini-1.1.274 → upgini-1.1.274a1}/src/upgini/utils/fallback_progress_bar.py +0 -0
  59. {upgini-1.1.274 → upgini-1.1.274a1}/src/upgini/utils/features_validator.py +0 -0
  60. {upgini-1.1.274 → upgini-1.1.274a1}/src/upgini/utils/format.py +0 -0
  61. {upgini-1.1.274 → upgini-1.1.274a1}/src/upgini/utils/ip_utils.py +0 -0
  62. {upgini-1.1.274 → upgini-1.1.274a1}/src/upgini/utils/phone_utils.py +0 -0
  63. {upgini-1.1.274 → upgini-1.1.274a1}/src/upgini/utils/postal_code_utils.py +0 -0
  64. {upgini-1.1.274 → upgini-1.1.274a1}/src/upgini/utils/progress_bar.py +0 -0
  65. {upgini-1.1.274 → upgini-1.1.274a1}/src/upgini/utils/sklearn_ext.py +0 -0
  66. {upgini-1.1.274 → upgini-1.1.274a1}/src/upgini/utils/target_utils.py +0 -0
  67. {upgini-1.1.274 → upgini-1.1.274a1}/src/upgini/utils/warning_counter.py +0 -0
  68. {upgini-1.1.274 → upgini-1.1.274a1}/src/upgini/version_validator.py +0 -0
  69. {upgini-1.1.274 → upgini-1.1.274a1}/src/upgini.egg-info/SOURCES.txt +0 -0
  70. {upgini-1.1.274 → upgini-1.1.274a1}/src/upgini.egg-info/dependency_links.txt +0 -0
  71. {upgini-1.1.274 → upgini-1.1.274a1}/src/upgini.egg-info/top_level.txt +0 -0
  72. {upgini-1.1.274 → upgini-1.1.274a1}/tests/test_binary_dataset.py +0 -0
  73. {upgini-1.1.274 → upgini-1.1.274a1}/tests/test_blocked_time_series.py +0 -0
  74. {upgini-1.1.274 → upgini-1.1.274a1}/tests/test_categorical_dataset.py +0 -0
  75. {upgini-1.1.274 → upgini-1.1.274a1}/tests/test_continuous_dataset.py +0 -0
  76. {upgini-1.1.274 → upgini-1.1.274a1}/tests/test_country_utils.py +0 -0
  77. {upgini-1.1.274 → upgini-1.1.274a1}/tests/test_custom_loss_utils.py +0 -0
  78. {upgini-1.1.274 → upgini-1.1.274a1}/tests/test_datetime_utils.py +0 -0
  79. {upgini-1.1.274 → upgini-1.1.274a1}/tests/test_email_utils.py +0 -0
  80. {upgini-1.1.274 → upgini-1.1.274a1}/tests/test_etalon_validation.py +0 -0
  81. {upgini-1.1.274 → upgini-1.1.274a1}/tests/test_metrics.py +0 -0
  82. {upgini-1.1.274 → upgini-1.1.274a1}/tests/test_phone_utils.py +0 -0
  83. {upgini-1.1.274 → upgini-1.1.274a1}/tests/test_postal_code_utils.py +0 -0
  84. {upgini-1.1.274 → upgini-1.1.274a1}/tests/test_target_utils.py +0 -0
  85. {upgini-1.1.274 → upgini-1.1.274a1}/tests/test_widget.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: upgini
3
- Version: 1.1.274
3
+ Version: 1.1.274a1
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Home-page: https://upgini.com/
6
6
  Author: Upgini Developers
@@ -28,7 +28,7 @@ Description-Content-Type: text/markdown
28
28
  License-File: LICENSE
29
29
  Requires-Dist: python-dateutil>=2.8.0
30
30
  Requires-Dist: requests>=2.8.0
31
- Requires-Dist: pandas<3.0.0,>=1.1.0
31
+ Requires-Dist: pandas<2.1.0,>=1.1.0
32
32
  Requires-Dist: numpy>=1.19.0
33
33
  Requires-Dist: scikit-learn>=1.3.0
34
34
  Requires-Dist: pydantic<2.0.0,>=1.8.2
@@ -40,7 +40,7 @@ def send_log(msg: str):
40
40
 
41
41
 
42
42
  here = Path(__file__).parent.resolve()
43
- version = "1.1.274"
43
+ version = "1.1.274a1"
44
44
  try:
45
45
  send_log(f"Start setup PyLib version {version}")
46
46
  setup(
@@ -77,7 +77,7 @@ try:
77
77
  install_requires=[
78
78
  "python-dateutil>=2.8.0",
79
79
  "requests>=2.8.0",
80
- "pandas>=1.1.0,<3.0.0",
80
+ "pandas>=1.1.0,<2.1.0",
81
81
  "numpy>=1.19.0",
82
82
  "scikit-learn>=1.3.0",
83
83
  "pydantic>=1.8.2,<2.0.0",
@@ -2,7 +2,6 @@ from typing import Any, Optional, Union
2
2
  import numpy as np
3
3
  import pandas as pd
4
4
  from pydantic import BaseModel
5
- from pandas.core.arrays.timedeltas import TimedeltaArray
6
5
 
7
6
  from upgini.autofe.operand import PandasOperand
8
7
 
@@ -47,7 +46,6 @@ class DateDiffType2(PandasOperand, DateDiffMixin):
47
46
  future = right + (left.dt.year - right.dt.year).apply(
48
47
  lambda y: np.datetime64("NaT") if np.isnan(y) else pd.tseries.offsets.DateOffset(years=y)
49
48
  )
50
- future = pd.to_datetime(future)
51
49
  before = future[future < left]
52
50
  future[future < left] = before + pd.tseries.offsets.DateOffset(years=1)
53
51
  diff = (future - left) / np.timedelta64(1, self.diff_unit)
@@ -74,13 +72,8 @@ class DateListDiff(PandasOperand, DateDiffMixin):
74
72
 
75
73
  return pd.Series(left - right.values).apply(lambda x: self._agg(self._diff(x)))
76
74
 
77
- def _diff(self, x: TimedeltaArray):
78
- if self.diff_unit == "Y":
79
- x = (x / 365 / 24 / 60 / 60 / 10**9).astype(int)
80
- elif self.diff_unit == "M":
81
- raise Exception("Unsupported difference unit: Month")
82
- else:
83
- x = x / np.timedelta64(1, self.diff_unit)
75
+ def _diff(self, x):
76
+ x = x / np.timedelta64(1, self.diff_unit)
84
77
  return x[x > 0]
85
78
 
86
79
  def _agg(self, x):
@@ -48,7 +48,6 @@ class DataSourcePublisher:
48
48
  data_table_uri: str,
49
49
  search_keys: Dict[str, SearchKey],
50
50
  update_frequency: str,
51
- exclude_from_autofe_generation: Optional[List[str]],
52
51
  secondary_search_keys: Optional[Dict[str, SearchKey]] = None,
53
52
  sort_column: Optional[str] = None,
54
53
  date_format: Optional[str] = None,
@@ -58,6 +57,7 @@ class DataSourcePublisher:
58
57
  join_date_abs_limit_days: Optional[int] = None,
59
58
  features_for_embeddings: Optional[List[str]] = DEFAULT_GENERATE_EMBEDDINGS,
60
59
  data_table_id_to_replace: Optional[str] = None,
60
+ exclude_from_autofe_generation: Optional[List[str]] = None,
61
61
  _force_generation=False,
62
62
  _silent=False,
63
63
  ) -> str:
@@ -1,5 +1,4 @@
1
1
  import dataclasses
2
- import datetime
3
2
  import gc
4
3
  import hashlib
5
4
  import itertools
@@ -147,7 +146,6 @@ class FeaturesEnricher(TransformerMixin):
147
146
  """
148
147
 
149
148
  TARGET_NAME = "target"
150
- CURRENT_DATE = "current_date"
151
149
  RANDOM_STATE = 42
152
150
  CALCULATE_METRICS_THRESHOLD = 50_000_000
153
151
  CALCULATE_METRICS_MIN_THRESHOLD = 500
@@ -209,7 +207,6 @@ class FeaturesEnricher(TransformerMixin):
209
207
  client_ip: Optional[str] = None,
210
208
  client_visitorid: Optional[str] = None,
211
209
  custom_bundle_config: Optional[str] = None,
212
- add_date_if_missing: bool = True,
213
210
  **kwargs,
214
211
  ):
215
212
  self.bundle = get_custom_bundle(custom_bundle_config)
@@ -320,7 +317,6 @@ class FeaturesEnricher(TransformerMixin):
320
317
  self.raise_validation_error = raise_validation_error
321
318
  self.exclude_columns = exclude_columns
322
319
  self.baseline_score_column = baseline_score_column
323
- self.add_date_if_missing = add_date_if_missing
324
320
 
325
321
  def _get_api_key(self):
326
322
  return self._api_key
@@ -424,9 +420,6 @@ class FeaturesEnricher(TransformerMixin):
424
420
 
425
421
  self.__validate_search_keys(self.search_keys, self.search_id)
426
422
 
427
- # Validate client estimator params
428
- self._get_client_cat_features(estimator, X, self.search_keys)
429
-
430
423
  try:
431
424
  self.X = X
432
425
  self.y = y
@@ -820,7 +813,6 @@ class FeaturesEnricher(TransformerMixin):
820
813
  trace_id = trace_id or str(uuid.uuid4())
821
814
  start_time = time.time()
822
815
  with MDC(trace_id=trace_id):
823
- self.logger.info("Start calculate metrics")
824
816
  if len(args) > 0:
825
817
  msg = f"WARNING: Unsupported positional arguments for calculate_metrics: {args}"
826
818
  self.logger.warning(msg)
@@ -872,9 +864,22 @@ class FeaturesEnricher(TransformerMixin):
872
864
  self.__display_support_link(msg)
873
865
  return None
874
866
 
875
- cat_features, search_keys_for_metrics = self._get_client_cat_features(
876
- estimator, effective_X, self.search_keys
877
- )
867
+ cat_features = None
868
+ search_keys_for_metrics = []
869
+ if (
870
+ estimator is not None
871
+ and hasattr(estimator, "get_param")
872
+ and estimator.get_param("cat_features") is not None
873
+ ):
874
+ cat_features = estimator.get_param("cat_features")
875
+ if len(cat_features) > 0 and isinstance(cat_features[0], int):
876
+ cat_features = [effective_X.columns[i] for i in cat_features]
877
+ for cat_feature in cat_features:
878
+ if cat_feature in self.search_keys:
879
+ if self.search_keys[cat_feature] in [SearchKey.COUNTRY, SearchKey.POSTAL_CODE]:
880
+ search_keys_for_metrics.append(cat_feature)
881
+ else:
882
+ raise ValidationError(self.bundle.get("cat_feature_search_key").format(cat_feature))
878
883
 
879
884
  prepared_data = self._prepare_data_for_metrics(
880
885
  trace_id=trace_id,
@@ -889,7 +894,6 @@ class FeaturesEnricher(TransformerMixin):
889
894
  search_keys_for_metrics=search_keys_for_metrics,
890
895
  progress_bar=progress_bar,
891
896
  progress_callback=progress_callback,
892
- cat_features=cat_features,
893
897
  )
894
898
  if prepared_data is None:
895
899
  return None
@@ -1265,29 +1269,6 @@ class FeaturesEnricher(TransformerMixin):
1265
1269
 
1266
1270
  return _cv, groups
1267
1271
 
1268
- def _get_client_cat_features(
1269
- self, estimator: Optional[Any], X: pd.DataFrame, search_keys: Dict[str, SearchKey]
1270
- ) -> Optional[List[str]]:
1271
- cat_features = None
1272
- search_keys_for_metrics = []
1273
- if (
1274
- estimator is not None
1275
- and hasattr(estimator, "get_param")
1276
- and estimator.get_param("cat_features") is not None
1277
- ):
1278
- cat_features = estimator.get_param("cat_features")
1279
- if len(cat_features) > 0:
1280
- if all([isinstance(f, int) for f in cat_features]):
1281
- cat_features = [X.columns[i] for i in cat_features]
1282
- self.logger.info(f"Collected categorical features {cat_features} from user estimator")
1283
- for cat_feature in cat_features:
1284
- if cat_feature in search_keys:
1285
- if search_keys[cat_feature] in [SearchKey.COUNTRY, SearchKey.POSTAL_CODE]:
1286
- search_keys_for_metrics.append(cat_feature)
1287
- else:
1288
- raise ValidationError(self.bundle.get("cat_feature_search_key").format(cat_feature))
1289
- return cat_features, search_keys_for_metrics
1290
-
1291
1272
  def _prepare_data_for_metrics(
1292
1273
  self,
1293
1274
  trace_id: str,
@@ -1302,7 +1283,6 @@ class FeaturesEnricher(TransformerMixin):
1302
1283
  search_keys_for_metrics: Optional[List[str]] = None,
1303
1284
  progress_bar: Optional[ProgressBar] = None,
1304
1285
  progress_callback: Optional[Callable[[SearchProgress], Any]] = None,
1305
- cat_features: Optional[List[str]] = None,
1306
1286
  ):
1307
1287
  is_input_same_as_fit, X, y, eval_set = self._is_input_same_as_fit(X, y, eval_set)
1308
1288
  is_demo_dataset = hash_input(X, y, eval_set) in DEMO_DATASET_HASHES
@@ -1360,8 +1340,9 @@ class FeaturesEnricher(TransformerMixin):
1360
1340
 
1361
1341
  # Detect and drop high cardinality columns in train
1362
1342
  columns_with_high_cardinality = FeaturesValidator.find_high_cardinality(fitting_X)
1363
- non_excluding_columns = (self.generate_features or []) + (cat_features or [])
1364
- columns_with_high_cardinality = [c for c in columns_with_high_cardinality if c not in non_excluding_columns]
1343
+ columns_with_high_cardinality = [
1344
+ c for c in columns_with_high_cardinality if c not in (self.generate_features or [])
1345
+ ]
1365
1346
  if len(columns_with_high_cardinality) > 0:
1366
1347
  self.logger.warning(
1367
1348
  f"High cardinality columns {columns_with_high_cardinality} will be dropped for metrics calculation"
@@ -1823,11 +1804,10 @@ class FeaturesEnricher(TransformerMixin):
1823
1804
  else:
1824
1805
  features_section = ""
1825
1806
 
1826
- search_id = self._search_task.search_task_id
1827
- api_example = f"""curl 'https://search.upgini.com/online/api/http_inference_trigger?search_id={search_id}' \\
1807
+ api_example = f"""curl 'https://inference-upgini.azurewebsites.net/api/http_inference_trigger' \\
1828
1808
  -H 'Authorization: {self.api_key}' \\
1829
1809
  -H 'Content-Type: application/json' \\
1830
- -d '{{"search_keys": {keys}{features_section}}}'"""
1810
+ -d '{{"search_id": "{self._search_task.search_task_id}", "search_keys": {keys}{features_section}}}'"""
1831
1811
  return api_example
1832
1812
 
1833
1813
  def _get_copy_of_runtime_parameters(self) -> RuntimeParameters:
@@ -1922,8 +1902,6 @@ class FeaturesEnricher(TransformerMixin):
1922
1902
  generated_features.extend(converter.generated_features)
1923
1903
  else:
1924
1904
  self.logger.info("Input dataset hasn't date column")
1925
- if self.add_date_if_missing:
1926
- df = self._add_current_date_as_key(df, search_keys, self.logger, self.bundle)
1927
1905
  email_column = self._get_email_column(search_keys)
1928
1906
  hem_column = self._get_hem_column(search_keys)
1929
1907
  email_converted_to_hem = False
@@ -2242,7 +2220,9 @@ class FeaturesEnricher(TransformerMixin):
2242
2220
  self.fit_search_keys = self.search_keys.copy()
2243
2221
  self.fit_search_keys = self.__prepare_search_keys(validated_X, self.fit_search_keys, is_demo_dataset)
2244
2222
 
2245
- validate_dates_distribution(validated_X, self.fit_search_keys, self.logger, self.bundle, self.warning_counter)
2223
+ validate_dates_distribution(
2224
+ validated_X, self.fit_search_keys, self.logger, self.bundle, self.warning_counter
2225
+ )
2246
2226
 
2247
2227
  maybe_date_column = self._get_date_column(self.fit_search_keys)
2248
2228
  has_date = maybe_date_column is not None
@@ -2293,8 +2273,6 @@ class FeaturesEnricher(TransformerMixin):
2293
2273
  self.fit_generated_features.extend(converter.generated_features)
2294
2274
  else:
2295
2275
  self.logger.info("Input dataset hasn't date column")
2296
- if self.add_date_if_missing:
2297
- df = self._add_current_date_as_key(df, self.fit_search_keys, self.logger, self.bundle)
2298
2276
  email_column = self._get_email_column(self.fit_search_keys)
2299
2277
  hem_column = self._get_hem_column(self.fit_search_keys)
2300
2278
  email_converted_to_hem = False
@@ -2875,25 +2853,6 @@ class FeaturesEnricher(TransformerMixin):
2875
2853
  if t in [SearchKey.DATE, SearchKey.DATETIME]:
2876
2854
  return col
2877
2855
 
2878
- @staticmethod
2879
- def _add_current_date_as_key(
2880
- df: pd.DataFrame, search_keys: Dict[str, SearchKey], logger: logging.Logger, bundle: ResourceBundle
2881
- ) -> pd.DataFrame:
2882
- if (
2883
- set(search_keys.values()) == {SearchKey.PHONE}
2884
- or set(search_keys.values()) == {SearchKey.EMAIL}
2885
- or set(search_keys.values()) == {SearchKey.HEM}
2886
- or set(search_keys.values()) == {SearchKey.COUNTRY, SearchKey.POSTAL_CODE}
2887
- ):
2888
- msg = bundle.get("current_date_added")
2889
- print(msg)
2890
- logger.warning(msg)
2891
- df[FeaturesEnricher.CURRENT_DATE] = datetime.date.today()
2892
- search_keys[FeaturesEnricher.CURRENT_DATE] = SearchKey.DATE
2893
- converter = DateTimeSearchKeyConverter(FeaturesEnricher.CURRENT_DATE, None, logger, bundle)
2894
- df = converter.convert(df)
2895
- return df
2896
-
2897
2856
  @staticmethod
2898
2857
  def _get_group_columns(df: pd.DataFrame, search_keys: Dict[str, SearchKey]) -> List[str]:
2899
2858
  return [
@@ -2944,7 +2903,9 @@ class FeaturesEnricher(TransformerMixin):
2944
2903
  [
2945
2904
  c
2946
2905
  for c in df.columns
2947
- if c not in sort_columns and c not in sort_exclude_columns and df[c].nunique() > 1
2906
+ if c not in sort_columns
2907
+ and c not in sort_exclude_columns
2908
+ and df[c].nunique() > 1
2948
2909
  ]
2949
2910
  # [
2950
2911
  # sk
@@ -1,4 +1,3 @@
1
- import inspect
2
1
  import logging
3
2
  import re
4
3
  from copy import deepcopy
@@ -382,11 +381,6 @@ class EstimatorWrapper:
382
381
  kwargs["estimator"] = estimator_copy
383
382
  if isinstance(estimator, CatBoostClassifier) or isinstance(estimator, CatBoostRegressor):
384
383
  if cat_features is not None:
385
- for cat_feature in cat_features:
386
- if cat_feature not in X.columns:
387
- logger.error(
388
- f"Client cat_feature `{cat_feature}` not found in X columns: {X.columns.to_list()}"
389
- )
390
384
  estimator_copy.set_params(
391
385
  cat_features=[X.columns.get_loc(cat_feature) for cat_feature in cat_features]
392
386
  )
@@ -653,12 +647,6 @@ class OtherEstimatorWrapper(EstimatorWrapper):
653
647
  def validate_scoring_argument(scoring: Union[Callable, str, None]):
654
648
  if isinstance(scoring, str) and scoring is not None:
655
649
  _get_scorer_by_name(scoring)
656
- elif isinstance(scoring, Callable):
657
- spec = inspect.getfullargspec(scoring)
658
- if len(spec.args) < 3:
659
- raise ValidationError(
660
- f"Invalid scoring function passed {scoring}. It should accept 3 input arguments: estimator, X, y"
661
- )
662
650
 
663
651
 
664
652
  def _get_scorer_by_name(scoring: str) -> Tuple[Callable, str, int]:
@@ -1,7 +1,7 @@
1
1
  from typing import Optional
2
2
 
3
3
  import pandas as pd
4
- from pandas.api.types import is_float_dtype, is_int64_dtype, is_string_dtype, is_object_dtype
4
+ from pandas.api.types import is_float_dtype, is_int64_dtype, is_string_dtype
5
5
 
6
6
  from upgini.errors import ValidationError
7
7
 
@@ -44,7 +44,7 @@ class PhoneNormalizer:
44
44
  Method will remove all non numeric chars from string and convert it to int.
45
45
  None will be set for phone numbers that couldn"t be converted to int
46
46
  """
47
- if is_string_dtype(self.df[self.phone_column_name]) or is_object_dtype(self.df[self.phone_column_name]):
47
+ if is_string_dtype(self.df[self.phone_column_name]):
48
48
  convert_func = self.phone_str_to_int_safe
49
49
  elif is_float_dtype(self.df[self.phone_column_name]):
50
50
  convert_func = self.phone_float_to_int_safe
@@ -38,7 +38,6 @@ loss_selection_warn=\nWARNING: Loss `{0}` is not supported for feature selection
38
38
  loss_calc_metrics_warn=\nWARNING: Loss `{0}` is not supported for metrics calculation with {1}
39
39
  multivariate_timeseries_detected=\nWARNING: Multivariate TimeSeries detected. Blocked time series cross-validation split selected.\nMore details: https://github.com/upgini/upgini#-time-series-prediction-support
40
40
  group_k_fold_in_classification=\nWARNING: Using group K-fold cross-validation split for classification task.
41
- current_date_added=\nWARNING: No date/datetime column was detected in X to be used as a search key. The current date will be used to match the latest version of data sources
42
41
 
43
42
  # Errors
44
43
  failed_search_by_task_id=Failed to retrieve the specified search results
@@ -159,7 +158,7 @@ dataset_invalid_multiclass_target=Unexpected dtype of target for multiclass task
159
158
  dataset_invalid_regression_target=Unexpected dtype of target for regression task type: {}. Expected float
160
159
  dataset_invalid_timeseries_target=Unexpected dtype of target for timeseries task type: {}. Expected float
161
160
  dataset_to_many_multiclass_targets=The number of target classes {} exceeds the allowed threshold: {}. Please, correct your data and try again
162
- dataset_rarest_class_less_min=Count of rows with the rarest class `{}` is {}, minimum count must be > {} for each class\nPlease, remove rows with rarest class from your dataframe
161
+ dataset_rarest_class_less_min=Frequency of the rarest class `{}` is {}, minimum frequency must be > {} for each class\nPlease, remove rows with rarest class from your dataframe
163
162
  dataset_rarest_class_less_threshold=\nWARNING: Target is imbalanced and will be undersampled to the rarest class. Frequency of the rarest class `{}` is {}\nMinimum number of observations for each class to avoid undersampling {} ({}%)
164
163
  dataset_date_features=\nWARNING: Columns {} is a datetime or period type but not used as a search key, removed from X
165
164
  dataset_too_many_features=Too many features. Maximum number of features is {}
@@ -100,9 +100,6 @@ class DateTimeSearchKeyConverter:
100
100
  msg = self.bundle.get("unsupported_date_type").format(self.date_column)
101
101
  self.logger.warning(msg)
102
102
  raise ValidationError(msg)
103
- else:
104
- df[self.date_column] = df[self.date_column].astype("string").apply(self.clean_date)
105
- df[self.date_column] = self.parse_date(df)
106
103
 
107
104
  # If column with date is datetime then extract seconds of the day and minute of the hour
108
105
  # as additional features
@@ -55,7 +55,7 @@ def _get_execution_ide() -> str:
55
55
  def get_track_metrics(client_ip: Optional[str] = None, client_visitorid: Optional[str] = None) -> dict:
56
56
  # default values
57
57
  track = {"ide": _get_execution_ide()}
58
- ident_res = "https://api64.ipify.org"
58
+ ident_res = "https://api.ipify.org"
59
59
 
60
60
  try:
61
61
  track["hostname"] = socket.gethostname()
@@ -74,20 +74,17 @@ def get_track_metrics(client_ip: Optional[str] = None, client_visitorid: Optiona
74
74
  display(
75
75
  Javascript(
76
76
  """
77
- async function getVisitorId() {
78
- return import('https://upgini.github.io/upgini/js/a.js')
77
+ import('https://upgini.github.io/upgini/js/a.js')
79
78
  .then(FingerprintJS => FingerprintJS.load())
80
79
  .then(fp => fp.get())
81
- .then(result => result.visitorId);
82
- }
80
+ .then(result => window.visitorId = result.visitorId);
83
81
  """
84
82
  )
85
83
  )
86
- track["visitorId"] = output.eval_js("getVisitorId()", timeout_sec=30)
84
+ track["visitorId"] = output.eval_js("window.visitorId", timeout_sec=10)
87
85
  except Exception as e:
88
86
  track["err"] = str(e)
89
- if "visitorId" not in track:
90
- track["visitorId"] = "None"
87
+ track["visitorId"] = "None"
91
88
  if client_ip:
92
89
  track["ip"] = client_ip
93
90
  else:
@@ -98,19 +95,16 @@ def get_track_metrics(client_ip: Optional[str] = None, client_visitorid: Optiona
98
95
  display(
99
96
  Javascript(
100
97
  f"""
101
- async function getIP() {{
102
- return fetch("{ident_res}")
98
+ fetch("{ident_res}")
103
99
  .then(response => response.text())
104
- .then(data => data);
105
- }}
100
+ .then(data => window.clientIP = data);
106
101
  """
107
102
  )
108
103
  )
109
- track["ip"] = output.eval_js("getIP()", timeout_sec=10)
104
+ track["ip"] = output.eval_js("window.clientIP", timeout_sec=10)
110
105
  except Exception as e:
111
106
  track["err"] = str(e)
112
- if "ip" not in track:
113
- track["ip"] = "0.0.0.0"
107
+ track["ip"] = "0.0.0.0"
114
108
 
115
109
  elif track["ide"] == "binder":
116
110
  try:
@@ -122,10 +116,8 @@ def get_track_metrics(client_ip: Optional[str] = None, client_visitorid: Optiona
122
116
  track["visitorId"] = sha256(os.environ["CLIENT_IP"].encode()).hexdigest()
123
117
  except Exception as e:
124
118
  track["err"] = str(e)
125
- if "ip" not in track:
126
- track["ip"] = "0.0.0.0"
127
- if "visitorId" not in track:
128
- track["visitorId"] = "None"
119
+ track["ip"] = "0.0.0.0"
120
+ track["visitorId"] = "None"
129
121
 
130
122
  elif track["ide"] == "kaggle":
131
123
  try:
@@ -144,8 +136,8 @@ def get_track_metrics(client_ip: Optional[str] = None, client_visitorid: Optiona
144
136
  raise Exception(err)
145
137
  except Exception as e:
146
138
  track["err"] = str(e)
147
- if "visitorId" not in track:
148
- track["visitorId"] = "None"
139
+ track["ip"] = "0.0.0.0"
140
+ track["visitorId"] = "None"
149
141
  else:
150
142
  try:
151
143
  if client_ip:
@@ -158,9 +150,5 @@ def get_track_metrics(client_ip: Optional[str] = None, client_visitorid: Optiona
158
150
  track["visitorId"] = sha256(str(getnode()).encode()).hexdigest()
159
151
  except Exception as e:
160
152
  track["err"] = str(e)
161
- if "visitorId" not in track:
162
- track["visitorId"] = "None"
163
- if "ip" not in track:
164
- track["ip"] = "0.0.0.0"
165
153
 
166
154
  return track
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: upgini
3
- Version: 1.1.274
3
+ Version: 1.1.274a1
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Home-page: https://upgini.com/
6
6
  Author: Upgini Developers
@@ -28,7 +28,7 @@ Description-Content-Type: text/markdown
28
28
  License-File: LICENSE
29
29
  Requires-Dist: python-dateutil>=2.8.0
30
30
  Requires-Dist: requests>=2.8.0
31
- Requires-Dist: pandas<3.0.0,>=1.1.0
31
+ Requires-Dist: pandas<2.1.0,>=1.1.0
32
32
  Requires-Dist: numpy>=1.19.0
33
33
  Requires-Dist: scikit-learn>=1.3.0
34
34
  Requires-Dist: pydantic<2.0.0,>=1.8.2
@@ -1,6 +1,6 @@
1
1
  python-dateutil>=2.8.0
2
2
  requests>=2.8.0
3
- pandas<3.0.0,>=1.1.0
3
+ pandas<2.1.0,>=1.1.0
4
4
  numpy>=1.19.0
5
5
  scikit-learn>=1.3.0
6
6
  pydantic<2.0.0,>=1.8.2
@@ -30,8 +30,7 @@ def test_date_diff_type2():
30
30
 
31
31
  operand = DateDiffType2(left_unit="s")
32
32
  expected_result = pd.Series([61.0, 182.0])
33
- actual = operand.calculate_binary(df.date1, df.date2)
34
- assert_series_equal(actual, expected_result)
33
+ assert_series_equal(operand.calculate_binary(df.date1, df.date2), expected_result)
35
34
 
36
35
 
37
36
  def test_date_diff_list():
@@ -246,7 +246,7 @@ def test_eval_set_with_diff_order_of_columns(requests_mock: Mocker):
246
246
  eval1_df = df[10000:11000].reset_index(drop=True)
247
247
  eval1_features = eval1_df.drop(columns="target")
248
248
  # shuffle columns
249
- eval1_features = eval1_features[list(eval1_features.columns)]
249
+ eval1_features = eval1_features[set(eval1_features.columns)]
250
250
  eval1_target = eval1_df["target"].reset_index(drop=True)
251
251
 
252
252
  eval2_df = df[11000:12000]
@@ -375,7 +375,7 @@ def test_saved_features_enricher(requests_mock: Mocker):
375
375
  url = "http://fake_url2"
376
376
 
377
377
  path_to_mock_features = os.path.join(
378
- os.path.dirname(os.path.realpath(__file__)), "test_data/binary/validation_features_v3.parquet"
378
+ os.path.dirname(os.path.realpath(__file__)), "test_data/binary/validation_features.parquet"
379
379
  )
380
380
 
381
381
  mock_default_requests(requests_mock, url)
@@ -462,7 +462,7 @@ def test_saved_features_enricher(requests_mock: Mocker):
462
462
  segment_header: [train_segment, eval_1_segment, eval_2_segment],
463
463
  rows_header: [10000, 1000, 1000],
464
464
  target_mean_header: [0.5044, 0.487, 0.486],
465
- enriched_gini: [0.021830, -0.006607, -0.018483],
465
+ enriched_gini: [-0.000136, 0.000000, -0.003728],
466
466
  }
467
467
  )
468
468
  print("Expected metrics: ")
@@ -487,13 +487,16 @@ def test_saved_features_enricher(requests_mock: Mocker):
487
487
  train_random_indices = random.choice(train_target.index, size=9000, replace=False)
488
488
  train_target.loc[train_random_indices] = 0
489
489
 
490
- metrics = enricher.calculate_metrics(train_features, train_target)
490
+ metrics = enricher.calculate_metrics(
491
+ train_features,
492
+ train_target
493
+ )
491
494
  expected_metrics = pd.DataFrame(
492
495
  {
493
496
  segment_header: [train_segment],
494
497
  rows_header: [10000],
495
498
  target_mean_header: [0.049],
496
- enriched_gini: [0.054454],
499
+ enriched_gini: [0.000985],
497
500
  }
498
501
  )
499
502
  print("Expected metrics: ")
@@ -2227,9 +2230,8 @@ def test_email_search_key(requests_mock: Mocker):
2227
2230
  "hashed_email_64ff8c",
2228
2231
  "email_one_domain_3b0a68",
2229
2232
  "email_domain_10c73f",
2230
- "current_date_b993c4",
2231
2233
  }
2232
- assert {"hashed_email_64ff8c", "email_one_domain_3b0a68", "current_date_b993c4"} == {
2234
+ assert {"hashed_email_64ff8c", "email_one_domain_3b0a68"} == {
2233
2235
  sk for sublist in self.search_keys for sk in sublist
2234
2236
  }
2235
2237
  raise TestException()
@@ -2274,18 +2276,10 @@ def test_composit_index_search_key(requests_mock: Mocker):
2274
2276
  **kwargs,
2275
2277
  ):
2276
2278
  self.validate()
2277
- assert set(self.columns.to_list()) == {
2278
- "system_record_id",
2279
- "country_aff64e",
2280
- "postal_code_13534a",
2281
- "current_date_b993c4",
2282
- "target",
2283
- }
2279
+ assert set(self.columns.to_list()) == {"system_record_id", "country_aff64e", "postal_code_13534a", "target"}
2284
2280
  assert "country_aff64e" in self.columns
2285
2281
  assert "postal_code_13534a"
2286
- assert {"country_aff64e", "postal_code_13534a", "current_date_b993c4"} == {
2287
- sk for sublist in self.search_keys for sk in sublist
2288
- }
2282
+ assert {"country_aff64e", "postal_code_13534a"} == {sk for sublist in self.search_keys for sk in sublist}
2289
2283
  # assert "country_fake_a" in self.columns
2290
2284
  # assert "postal_code_fake_a" in self.columns
2291
2285
  # assert {"country_fake_a", "postal_code_fake_a"} == {sk for sublist in self.search_keys for sk in sublist}
@@ -2658,4 +2652,5 @@ class DataFrameWrapper:
2658
2652
 
2659
2653
 
2660
2654
  class TestException(Exception):
2661
- pass
2655
+ def __init__(self):
2656
+ super().__init__()
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes