upgini 1.1.315a1__tar.gz → 1.1.316__tar.gz

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of upgini might be problematic. Click here for more details.

Files changed (65)
  1. {upgini-1.1.315a1 → upgini-1.1.316}/PKG-INFO +1 -1
  2. upgini-1.1.316/src/upgini/__about__.py +1 -0
  3. {upgini-1.1.315a1 → upgini-1.1.316}/src/upgini/autofe/binary.py +4 -1
  4. {upgini-1.1.315a1 → upgini-1.1.316}/src/upgini/autofe/unary.py +3 -0
  5. {upgini-1.1.315a1 → upgini-1.1.316}/src/upgini/data_source/data_source_publisher.py +9 -0
  6. {upgini-1.1.315a1 → upgini-1.1.316}/src/upgini/dataset.py +1 -1
  7. {upgini-1.1.315a1 → upgini-1.1.316}/src/upgini/features_enricher.py +42 -24
  8. {upgini-1.1.315a1 → upgini-1.1.316}/src/upgini/utils/datetime_utils.py +0 -1
  9. upgini-1.1.315a1/src/upgini/__about__.py +0 -1
  10. {upgini-1.1.315a1 → upgini-1.1.316}/.gitignore +0 -0
  11. {upgini-1.1.315a1 → upgini-1.1.316}/LICENSE +0 -0
  12. {upgini-1.1.315a1 → upgini-1.1.316}/README.md +0 -0
  13. {upgini-1.1.315a1 → upgini-1.1.316}/pyproject.toml +0 -0
  14. {upgini-1.1.315a1 → upgini-1.1.316}/src/upgini/__init__.py +0 -0
  15. {upgini-1.1.315a1 → upgini-1.1.316}/src/upgini/ads.py +0 -0
  16. {upgini-1.1.315a1 → upgini-1.1.316}/src/upgini/ads_management/__init__.py +0 -0
  17. {upgini-1.1.315a1 → upgini-1.1.316}/src/upgini/ads_management/ads_manager.py +0 -0
  18. {upgini-1.1.315a1 → upgini-1.1.316}/src/upgini/autofe/__init__.py +0 -0
  19. {upgini-1.1.315a1 → upgini-1.1.316}/src/upgini/autofe/all_operands.py +0 -0
  20. {upgini-1.1.315a1 → upgini-1.1.316}/src/upgini/autofe/date.py +0 -0
  21. {upgini-1.1.315a1 → upgini-1.1.316}/src/upgini/autofe/feature.py +0 -0
  22. {upgini-1.1.315a1 → upgini-1.1.316}/src/upgini/autofe/groupby.py +0 -0
  23. {upgini-1.1.315a1 → upgini-1.1.316}/src/upgini/autofe/operand.py +0 -0
  24. {upgini-1.1.315a1 → upgini-1.1.316}/src/upgini/autofe/vector.py +0 -0
  25. {upgini-1.1.315a1 → upgini-1.1.316}/src/upgini/data_source/__init__.py +0 -0
  26. {upgini-1.1.315a1 → upgini-1.1.316}/src/upgini/errors.py +0 -0
  27. {upgini-1.1.315a1 → upgini-1.1.316}/src/upgini/http.py +0 -0
  28. {upgini-1.1.315a1 → upgini-1.1.316}/src/upgini/lazy_import.py +0 -0
  29. {upgini-1.1.315a1 → upgini-1.1.316}/src/upgini/mdc/__init__.py +0 -0
  30. {upgini-1.1.315a1 → upgini-1.1.316}/src/upgini/mdc/context.py +0 -0
  31. {upgini-1.1.315a1 → upgini-1.1.316}/src/upgini/metadata.py +0 -0
  32. {upgini-1.1.315a1 → upgini-1.1.316}/src/upgini/metrics.py +0 -0
  33. {upgini-1.1.315a1 → upgini-1.1.316}/src/upgini/normalizer/__init__.py +0 -0
  34. {upgini-1.1.315a1 → upgini-1.1.316}/src/upgini/normalizer/normalize_utils.py +0 -0
  35. {upgini-1.1.315a1 → upgini-1.1.316}/src/upgini/resource_bundle/__init__.py +0 -0
  36. {upgini-1.1.315a1 → upgini-1.1.316}/src/upgini/resource_bundle/exceptions.py +0 -0
  37. {upgini-1.1.315a1 → upgini-1.1.316}/src/upgini/resource_bundle/strings.properties +0 -0
  38. {upgini-1.1.315a1 → upgini-1.1.316}/src/upgini/resource_bundle/strings_widget.properties +0 -0
  39. {upgini-1.1.315a1 → upgini-1.1.316}/src/upgini/sampler/__init__.py +0 -0
  40. {upgini-1.1.315a1 → upgini-1.1.316}/src/upgini/sampler/base.py +0 -0
  41. {upgini-1.1.315a1 → upgini-1.1.316}/src/upgini/sampler/random_under_sampler.py +0 -0
  42. {upgini-1.1.315a1 → upgini-1.1.316}/src/upgini/sampler/utils.py +0 -0
  43. {upgini-1.1.315a1 → upgini-1.1.316}/src/upgini/search_task.py +0 -0
  44. {upgini-1.1.315a1 → upgini-1.1.316}/src/upgini/spinner.py +0 -0
  45. {upgini-1.1.315a1 → upgini-1.1.316}/src/upgini/utils/__init__.py +0 -0
  46. {upgini-1.1.315a1 → upgini-1.1.316}/src/upgini/utils/base_search_key_detector.py +0 -0
  47. {upgini-1.1.315a1 → upgini-1.1.316}/src/upgini/utils/blocked_time_series.py +0 -0
  48. {upgini-1.1.315a1 → upgini-1.1.316}/src/upgini/utils/country_utils.py +0 -0
  49. {upgini-1.1.315a1 → upgini-1.1.316}/src/upgini/utils/custom_loss_utils.py +0 -0
  50. {upgini-1.1.315a1 → upgini-1.1.316}/src/upgini/utils/cv_utils.py +0 -0
  51. {upgini-1.1.315a1 → upgini-1.1.316}/src/upgini/utils/deduplicate_utils.py +0 -0
  52. {upgini-1.1.315a1 → upgini-1.1.316}/src/upgini/utils/display_utils.py +0 -0
  53. {upgini-1.1.315a1 → upgini-1.1.316}/src/upgini/utils/email_utils.py +0 -0
  54. {upgini-1.1.315a1 → upgini-1.1.316}/src/upgini/utils/fallback_progress_bar.py +0 -0
  55. {upgini-1.1.315a1 → upgini-1.1.316}/src/upgini/utils/features_validator.py +0 -0
  56. {upgini-1.1.315a1 → upgini-1.1.316}/src/upgini/utils/format.py +0 -0
  57. {upgini-1.1.315a1 → upgini-1.1.316}/src/upgini/utils/ip_utils.py +0 -0
  58. {upgini-1.1.315a1 → upgini-1.1.316}/src/upgini/utils/phone_utils.py +0 -0
  59. {upgini-1.1.315a1 → upgini-1.1.316}/src/upgini/utils/postal_code_utils.py +0 -0
  60. {upgini-1.1.315a1 → upgini-1.1.316}/src/upgini/utils/progress_bar.py +0 -0
  61. {upgini-1.1.315a1 → upgini-1.1.316}/src/upgini/utils/sklearn_ext.py +0 -0
  62. {upgini-1.1.315a1 → upgini-1.1.316}/src/upgini/utils/target_utils.py +0 -0
  63. {upgini-1.1.315a1 → upgini-1.1.316}/src/upgini/utils/track_info.py +0 -0
  64. {upgini-1.1.315a1 → upgini-1.1.316}/src/upgini/utils/warning_counter.py +0 -0
  65. {upgini-1.1.315a1 → upgini-1.1.316}/src/upgini/version_validator.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: upgini
3
- Version: 1.1.315a1
3
+ Version: 1.1.316
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
6
6
  Project-URL: Homepage, https://upgini.com/
@@ -0,0 +1 @@
1
+ __version__ = "1.1.316"
@@ -141,7 +141,7 @@ class Distance(PandasOperand):
141
141
 
142
142
  def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
143
143
  return pd.Series(
144
- 1 - self.__dot(left, right) / (self.__dot(left, left) * self.__dot(right, right)), index=left.index
144
+ 1 - self.__dot(left, right) / (self.__norm(left) * self.__norm(right)), index=left.index
145
145
  )
146
146
 
147
147
  # row-wise dot product
@@ -152,6 +152,9 @@ class Distance(PandasOperand):
152
152
  res = res.reindex(left.index.union(right.index))
153
153
  return res
154
154
 
155
+ def __norm(self, vector: pd.Series) -> pd.Series:
156
+ return np.sqrt(self.__dot(vector, vector))
157
+
155
158
 
156
159
  # Left for backward compatibility
157
160
  class Sim(Distance):
@@ -121,6 +121,9 @@ class Norm(PandasOperand):
121
121
 
122
122
  def calculate_unary(self, data: pd.Series) -> pd.Series:
123
123
  data_dropna = data.dropna()
124
+ if data_dropna.empty:
125
+ return data
126
+
124
127
  normalized_data = Normalizer().transform(data_dropna.to_frame().T).T
125
128
  normalized_data = pd.Series(normalized_data[:, 0], index=data_dropna.index, name=data.name)
126
129
  normalized_data = normalized_data.reindex(data.index)
@@ -63,6 +63,7 @@ class DataSourcePublisher:
63
63
  keep_features: Optional[List[str]] = None,
64
64
  date_features: Optional[List[str]] = None,
65
65
  date_vector_features: Optional[List[str]] = None,
66
+ generate_runtime_embeddings: Optional[List[str]] = None,
66
67
  _force_generation=False,
67
68
  _silent=False,
68
69
  ) -> str:
@@ -163,6 +164,8 @@ class DataSourcePublisher:
163
164
  if date_format is None:
164
165
  raise ValidationError("date_format should be presented if you use date vector features")
165
166
  request["dateVectorFeatures"] = date_vector_features
167
+ if generate_runtime_embeddings is not None:
168
+ request["generateRuntimeEmbeddingsFeatures"] = generate_runtime_embeddings
166
169
  self.logger.info(f"Start registering data table {request}")
167
170
 
168
171
  task_id = self._rest_client.register_ads(request, trace_id)
@@ -276,6 +279,8 @@ class DataSourcePublisher:
276
279
  client_emails: Optional[List[str]] = None,
277
280
  date_features: Optional[List[str]] = None,
278
281
  date_vector_features: Optional[List[str]] = None,
282
+ exclude_from_autofe_generation: Optional[List[str]] = None,
283
+ generate_runtime_embeddings: Optional[List[str]] = None,
279
284
  ):
280
285
  trace_id = str(uuid.uuid4())
281
286
  with MDC(trace_id=trace_id):
@@ -327,6 +332,10 @@ class DataSourcePublisher:
327
332
  request["dateFeatures"] = date_features
328
333
  if date_vector_features is not None:
329
334
  request["dateVectorFeatures"] = date_vector_features
335
+ if exclude_from_autofe_generation is not None:
336
+ request["excludeFromGenerationFeatures"] = exclude_from_autofe_generation
337
+ if generate_runtime_embeddings is not None:
338
+ request["generateRuntimeEmbeddingsFeatures"] = generate_runtime_embeddings
330
339
  self.logger.info(f"Activating data tables with request {request}")
331
340
 
332
341
  self._rest_client.activate_datatables(request, trace_id)
@@ -692,7 +692,7 @@ class Dataset: # (pd.DataFrame):
692
692
  parquet_file_path = f"{base_path}/{self.dataset_name}.parquet"
693
693
  self.data.to_parquet(path=parquet_file_path, index=False, compression="gzip", engine="fastparquet")
694
694
  uploading_file_size = Path(parquet_file_path).stat().st_size
695
- self.logger.info(f"Size of prepared uploading file: {uploading_file_size}")
695
+ self.logger.info(f"Size of prepared uploading file: {uploading_file_size}. {len(self.data)} rows")
696
696
  if uploading_file_size > self.MAX_UPLOADING_FILE_SIZE:
697
697
  raise ValidationError(self.bundle.get("dataset_too_big_file"))
698
698
  return parquet_file_path
@@ -846,17 +846,37 @@ class FeaturesEnricher(TransformerMixin):
846
846
  self.logger.warning(msg)
847
847
  print(msg)
848
848
 
849
+ if X is not None and y is None:
850
+ raise ValidationError("X passed without y")
851
+
849
852
  self.__validate_search_keys(self.search_keys, self.search_id)
850
853
  effective_X = X if X is not None else self.X
851
854
  effective_y = y if y is not None else self.y
852
855
  effective_eval_set = eval_set if eval_set is not None else self.eval_set
853
856
  effective_eval_set = self._check_eval_set(effective_eval_set, effective_X, self.bundle)
854
857
 
858
+ if (
859
+ self._search_task is None
860
+ or self._search_task.provider_metadata_v2 is None
861
+ or len(self._search_task.provider_metadata_v2) == 0
862
+ or effective_X is None
863
+ or effective_y is None
864
+ ):
865
+ raise ValidationError(self.bundle.get("metrics_unfitted_enricher"))
866
+
867
+ validated_X = self._validate_X(effective_X)
868
+ validated_y = self._validate_y(validated_X, effective_y)
869
+ validated_eval_set = (
870
+ [self._validate_eval_set_pair(validated_X, eval_pair) for eval_pair in effective_eval_set]
871
+ if effective_eval_set is not None
872
+ else None
873
+ )
874
+
855
875
  try:
856
876
  self.__log_debug_information(
857
- effective_X,
858
- effective_y,
859
- effective_eval_set,
877
+ validated_X,
878
+ validated_y,
879
+ validated_eval_set,
860
880
  exclude_features_sources=exclude_features_sources,
861
881
  cv=cv if cv is not None else self.cv,
862
882
  importance_threshold=importance_threshold,
@@ -866,21 +886,9 @@ class FeaturesEnricher(TransformerMixin):
866
886
  remove_outliers_calc_metrics=remove_outliers_calc_metrics,
867
887
  )
868
888
 
869
- if (
870
- self._search_task is None
871
- or self._search_task.provider_metadata_v2 is None
872
- or len(self._search_task.provider_metadata_v2) == 0
873
- or effective_X is None
874
- or effective_y is None
875
- ):
876
- raise ValidationError(self.bundle.get("metrics_unfitted_enricher"))
877
-
878
- if X is not None and y is None:
879
- raise ValidationError("X passed without y")
880
-
881
889
  validate_scoring_argument(scoring)
882
890
 
883
- self._validate_baseline_score(effective_X, effective_eval_set)
891
+ self._validate_baseline_score(validated_X, validated_eval_set)
884
892
 
885
893
  if self._has_paid_features(exclude_features_sources):
886
894
  msg = self.bundle.get("metrics_with_paid_features")
@@ -889,7 +897,7 @@ class FeaturesEnricher(TransformerMixin):
889
897
  return None
890
898
 
891
899
  cat_features, search_keys_for_metrics = self._get_client_cat_features(
892
- estimator, effective_X, self.search_keys
900
+ estimator, validated_X, self.search_keys
893
901
  )
894
902
 
895
903
  prepared_data = self._prepare_data_for_metrics(
@@ -1034,10 +1042,10 @@ class FeaturesEnricher(TransformerMixin):
1034
1042
  self.bundle.get("quality_metrics_rows_header"): _num_samples(effective_X),
1035
1043
  }
1036
1044
  if model_task_type in [ModelTaskType.BINARY, ModelTaskType.REGRESSION] and is_numeric_dtype(
1037
- effective_y
1045
+ validated_y
1038
1046
  ):
1039
1047
  train_metrics[self.bundle.get("quality_metrics_mean_target_header")] = round(
1040
- np.mean(effective_y), 4
1048
+ np.mean(validated_y), 4
1041
1049
  )
1042
1050
  if etalon_metric is not None:
1043
1051
  train_metrics[self.bundle.get("quality_metrics_baseline_header").format(metric)] = etalon_metric
@@ -1107,10 +1115,10 @@ class FeaturesEnricher(TransformerMixin):
1107
1115
  # self.bundle.get("quality_metrics_match_rate_header"): eval_hit_rate,
1108
1116
  }
1109
1117
  if model_task_type in [ModelTaskType.BINARY, ModelTaskType.REGRESSION] and is_numeric_dtype(
1110
- effective_eval_set[idx][1]
1118
+ validated_eval_set[idx][1]
1111
1119
  ):
1112
1120
  eval_metrics[self.bundle.get("quality_metrics_mean_target_header")] = round(
1113
- np.mean(effective_eval_set[idx][1]), 4
1121
+ np.mean(validated_eval_set[idx][1]), 4
1114
1122
  )
1115
1123
  if etalon_eval_metric is not None:
1116
1124
  eval_metrics[self.bundle.get("quality_metrics_baseline_header").format(metric)] = (
@@ -3158,6 +3166,7 @@ class FeaturesEnricher(TransformerMixin):
3158
3166
  if len(search_key_names_by_type) == 0:
3159
3167
  return df, {}
3160
3168
 
3169
+ self.logger.info(f"Start exploding dataset by {search_key_names_by_type}. Size before: {len(df)}")
3161
3170
  multiple_keys_columns = [col for cols in search_key_names_by_type.values() for col in cols]
3162
3171
  other_columns = [col for col in df.columns if col not in multiple_keys_columns]
3163
3172
  exploded_dfs = []
@@ -3176,6 +3185,7 @@ class FeaturesEnricher(TransformerMixin):
3176
3185
  columns_renaming[new_search_key] = new_search_key
3177
3186
 
3178
3187
  df = pd.concat(exploded_dfs, ignore_index=True)
3188
+ self.logger.info(f"Finished explosion. Size after: {len(df)}")
3179
3189
  return df, unnest_search_keys
3180
3190
 
3181
3191
  def __add_fit_system_record_id(
@@ -3209,18 +3219,26 @@ class FeaturesEnricher(TransformerMixin):
3209
3219
  date_column = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
3210
3220
  sort_columns = [date_column] if date_column is not None else []
3211
3221
 
3222
+ sorted_other_keys = sorted(search_keys, key=lambda x: str(search_keys.get(x)))
3223
+ sorted_other_keys = [k for k in sorted_other_keys if k not in sort_exclude_columns]
3224
+
3212
3225
  other_columns = sorted(
3213
3226
  [
3214
3227
  c
3215
3228
  for c in df.columns
3216
- if c not in sort_columns and c not in sort_exclude_columns and df[c].nunique() > 1
3229
+ if c not in sort_columns
3230
+ and c not in sorted_other_keys
3231
+ and c not in sort_exclude_columns
3232
+ and df[c].nunique() > 1
3217
3233
  ]
3218
3234
  )
3219
3235
 
3236
+ all_other_columns = sorted_other_keys + other_columns
3237
+
3220
3238
  search_keys_hash = "search_keys_hash"
3221
- if len(other_columns) > 0:
3239
+ if len(all_other_columns) > 0:
3222
3240
  sort_columns.append(search_keys_hash)
3223
- df[search_keys_hash] = pd.util.hash_pandas_object(df[other_columns], index=False)
3241
+ df[search_keys_hash] = pd.util.hash_pandas_object(df[all_other_columns], index=False)
3224
3242
 
3225
3243
  df = df.sort_values(by=sort_columns)
3226
3244
 
@@ -1,7 +1,6 @@
1
1
  import datetime
2
2
  import logging
3
3
  import re
4
- import pytz
5
4
  from typing import Dict, List, Optional
6
5
 
7
6
  import numpy as np
@@ -1 +0,0 @@
1
- __version__ = "1.1.315a1"
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes