upgini 1.1.280.dev0__py3-none-any.whl → 1.2.31__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of upgini might be problematic. Click here for more details.

Files changed (43) hide show
  1. upgini/__about__.py +1 -1
  2. upgini/__init__.py +4 -20
  3. upgini/autofe/all_operands.py +39 -9
  4. upgini/autofe/binary.py +148 -45
  5. upgini/autofe/date.py +197 -26
  6. upgini/autofe/feature.py +102 -19
  7. upgini/autofe/groupby.py +22 -22
  8. upgini/autofe/operand.py +9 -6
  9. upgini/autofe/unary.py +83 -41
  10. upgini/autofe/vector.py +8 -8
  11. upgini/data_source/data_source_publisher.py +128 -5
  12. upgini/dataset.py +50 -386
  13. upgini/features_enricher.py +931 -542
  14. upgini/http.py +27 -16
  15. upgini/lazy_import.py +35 -0
  16. upgini/metadata.py +84 -59
  17. upgini/metrics.py +164 -34
  18. upgini/normalizer/normalize_utils.py +197 -0
  19. upgini/resource_bundle/strings.properties +66 -51
  20. upgini/search_task.py +10 -4
  21. upgini/utils/Roboto-Regular.ttf +0 -0
  22. upgini/utils/base_search_key_detector.py +14 -12
  23. upgini/utils/country_utils.py +16 -0
  24. upgini/utils/custom_loss_utils.py +39 -36
  25. upgini/utils/datetime_utils.py +98 -45
  26. upgini/utils/deduplicate_utils.py +135 -112
  27. upgini/utils/display_utils.py +46 -15
  28. upgini/utils/email_utils.py +54 -16
  29. upgini/utils/feature_info.py +172 -0
  30. upgini/utils/features_validator.py +34 -20
  31. upgini/utils/ip_utils.py +100 -1
  32. upgini/utils/phone_utils.py +343 -0
  33. upgini/utils/postal_code_utils.py +34 -0
  34. upgini/utils/sklearn_ext.py +28 -19
  35. upgini/utils/target_utils.py +113 -57
  36. upgini/utils/warning_counter.py +1 -0
  37. upgini/version_validator.py +8 -4
  38. {upgini-1.1.280.dev0.dist-info → upgini-1.2.31.dist-info}/METADATA +31 -16
  39. upgini-1.2.31.dist-info/RECORD +65 -0
  40. upgini/normalizer/phone_normalizer.py +0 -340
  41. upgini-1.1.280.dev0.dist-info/RECORD +0 -62
  42. {upgini-1.1.280.dev0.dist-info → upgini-1.2.31.dist-info}/WHEEL +0 -0
  43. {upgini-1.1.280.dev0.dist-info → upgini-1.2.31.dist-info}/licenses/LICENSE +0 -0
@@ -3,7 +3,7 @@ from typing import Optional, Union
3
3
 
4
4
  import numpy as np
5
5
  import pandas as pd
6
- from pandas.api.types import is_numeric_dtype
6
+ from pandas.api.types import is_numeric_dtype, is_bool_dtype
7
7
 
8
8
  from upgini.errors import ValidationError
9
9
  from upgini.metadata import SYSTEM_RECORD_ID, ModelTaskType
@@ -24,49 +24,83 @@ def define_task(
24
24
  ) -> ModelTaskType:
25
25
  if logger is None:
26
26
  logger = logging.getLogger()
27
+
28
+ # Replace inf and -inf with NaN to handle extreme values correctly
29
+ y = y.replace([np.inf, -np.inf], np.nan, inplace=False)
30
+
31
+ # Drop NaN values from the target
27
32
  target = y.dropna()
33
+
34
+ # Check if target is numeric and finite
28
35
  if is_numeric_dtype(target):
29
36
  target = target.loc[np.isfinite(target)]
30
37
  else:
38
+ # If not numeric, drop empty strings as well
31
39
  target = target.loc[target != ""]
40
+
41
+ # Raise error if there are no valid values left in the target
32
42
  if len(target) == 0:
33
43
  raise ValidationError(bundle.get("empty_target"))
44
+
45
+ # Count unique values in the target
34
46
  target_items = target.nunique()
47
+
48
+ # Raise error if all target values are the same
35
49
  if target_items == 1:
36
50
  raise ValidationError(bundle.get("dataset_constant_target"))
51
+
52
+ reason = "" # Will store the reason for selecting the task type
53
+
54
+ # Binary classification case: exactly two unique values
37
55
  if target_items == 2:
38
56
  task = ModelTaskType.BINARY
57
+ reason = bundle.get("binary_target_reason")
39
58
  else:
59
+ # Attempt to convert target to numeric
40
60
  try:
41
61
  target = pd.to_numeric(target)
42
62
  is_numeric = True
43
63
  except Exception:
44
64
  is_numeric = False
45
65
 
46
- # If any value is non numeric - multiclass
66
+ # If target cannot be converted to numeric, assume multiclass classification
47
67
  if not is_numeric:
48
68
  task = ModelTaskType.MULTICLASS
69
+ reason = bundle.get("non_numeric_multiclass_reason")
49
70
  else:
71
+ # Multiclass classification: few unique values and integer encoding
50
72
  if target.nunique() <= 50 and is_int_encoding(target.unique()):
51
73
  task = ModelTaskType.MULTICLASS
74
+ reason = bundle.get("few_unique_label_multiclass_reason")
75
+ # Regression case: if there is date, assume regression
52
76
  elif has_date:
53
77
  task = ModelTaskType.REGRESSION
78
+ reason = bundle.get("date_search_key_regression_reason")
54
79
  else:
80
+ # Remove zero values and recalculate unique ratio
55
81
  non_zero_target = target[target != 0]
56
82
  target_items = non_zero_target.nunique()
57
83
  target_ratio = target_items / len(non_zero_target)
84
+
85
+ # Use unique_ratio to determine whether to classify as regression or multiclass
58
86
  if (
59
- (target.dtype.kind == "f" and np.any(target != target.astype(int))) # any non integer
87
+ (target.dtype.kind == "f" and np.any(target != target.astype(int))) # Non-integer float values
60
88
  or target_items > 50
61
- or target_ratio > 0.2
89
+ or target_ratio > 0.2 # If non-zero values have high ratio of uniqueness
62
90
  ):
63
91
  task = ModelTaskType.REGRESSION
92
+ reason = bundle.get("many_unique_label_regression_reason")
64
93
  else:
65
94
  task = ModelTaskType.MULTICLASS
95
+ reason = bundle.get("limited_int_multiclass_reason")
66
96
 
67
- logger.info(f"Detected task type: {task}")
97
+ # Log or print the reason for the selected task type
98
+ logger.info(f"Detected task type: {task} (Reason: {reason})")
99
+
100
+ # Print task type and reason if silent mode is off
68
101
  if not silent:
69
- print(bundle.get("target_type_detected").format(task))
102
+ print(bundle.get("target_type_detected").format(task, reason))
103
+
70
104
  return task
71
105
 
72
106
 
@@ -81,8 +115,8 @@ def balance_undersample(
81
115
  target_column: str,
82
116
  task_type: ModelTaskType,
83
117
  random_state: int,
84
- imbalance_threshold: int = 0.2,
85
- min_sample_threshold: int = 5000,
118
+ binary_min_sample_threshold: int = 5000,
119
+ multiclass_min_sample_threshold: int = 25000,
86
120
  binary_bootstrap_loops: int = 5,
87
121
  multiclass_bootstrap_loops: int = 2,
88
122
  logger: Optional[logging.Logger] = None,
@@ -96,52 +130,60 @@ def balance_undersample(
96
130
  if SYSTEM_RECORD_ID not in df.columns:
97
131
  raise Exception("System record id must be presented for undersampling")
98
132
 
99
- count = len(df)
133
+ # count = len(df)
100
134
  target = df[target_column].copy()
101
- target_classes_count = target.nunique()
135
+ # target_classes_count = target.nunique()
102
136
 
103
137
  vc = target.value_counts()
104
138
  max_class_value = vc.index[0]
105
139
  min_class_value = vc.index[len(vc) - 1]
106
140
  max_class_count = vc[max_class_value]
107
141
  min_class_count = vc[min_class_value]
142
+ num_classes = len(vc)
108
143
 
109
- min_class_percent = imbalance_threshold / target_classes_count
110
- min_class_threshold = int(min_class_percent * count)
144
+ # min_class_percent = imbalance_threshold / target_classes_count
145
+ # min_class_threshold = int(min_class_percent * count)
111
146
 
112
147
  resampled_data = df
113
148
  df = df.copy().sort_values(by=SYSTEM_RECORD_ID)
114
149
  if task_type == ModelTaskType.MULTICLASS:
115
- # Sort classes by rows count and find 25% quantile class
116
- classes = vc.index
117
- quantile25_idx = int(0.75 * len(classes)) - 1
118
- quantile25_class = classes[quantile25_idx]
119
- quantile25_class_cnt = vc[quantile25_class]
120
-
121
- if max_class_count > (quantile25_class_cnt * multiclass_bootstrap_loops):
122
- msg = bundle.get("imbalance_multiclass").format(quantile25_class, quantile25_class_cnt)
150
+ if len(df) > multiclass_min_sample_threshold and max_class_count > (
151
+ min_class_count * multiclass_bootstrap_loops
152
+ ):
153
+
154
+ # msg = bundle.get("imbalance_multiclass").format(min_class_value, min_class_count)
155
+ msg = bundle.get("imbalanced_target").format(min_class_value, min_class_count)
123
156
  logger.warning(msg)
124
157
  print(msg)
125
158
  if warning_counter:
126
159
  warning_counter.increment()
127
160
 
128
- # 25% and lower classes will stay as is. Higher classes will be downsampled
129
161
  sample_strategy = dict()
130
- for class_idx in range(quantile25_idx):
131
- # compare class count with count_of_quantile25_class * 2
132
- class_value = classes[class_idx]
162
+ for class_value in vc.index:
163
+ if class_value == min_class_value:
164
+ continue
133
165
  class_count = vc[class_value]
134
- sample_strategy[class_value] = min(class_count, quantile25_class_cnt * multiclass_bootstrap_loops)
166
+ sample_size = min(
167
+ class_count,
168
+ multiclass_bootstrap_loops
169
+ * (
170
+ min_class_count
171
+ + max((multiclass_min_sample_threshold - num_classes * min_class_count) / (num_classes - 1), 0)
172
+ ),
173
+ )
174
+ sample_strategy[class_value] = int(sample_size)
175
+ logger.info(f"Rebalance sample strategy: {sample_strategy}. Min class count: {min_class_count}")
135
176
  sampler = RandomUnderSampler(sampling_strategy=sample_strategy, random_state=random_state)
136
177
  X = df[SYSTEM_RECORD_ID]
137
178
  X = X.to_frame(SYSTEM_RECORD_ID)
138
179
  new_x, _ = sampler.fit_resample(X, target) # type: ignore
139
180
 
140
181
  resampled_data = df[df[SYSTEM_RECORD_ID].isin(new_x[SYSTEM_RECORD_ID])]
141
- elif len(df) > min_sample_threshold and min_class_count < min_sample_threshold / 2:
142
- msg = bundle.get("dataset_rarest_class_less_threshold").format(
143
- min_class_value, min_class_count, min_class_threshold, min_class_percent * 100
144
- )
182
+ elif len(df) > binary_min_sample_threshold:
183
+ # msg = bundle.get("dataset_rarest_class_less_threshold").format(
184
+ # min_class_value, min_class_count, min_class_threshold, min_class_percent * 100
185
+ # )
186
+ msg = bundle.get("imbalanced_target").format(min_class_value, min_class_count)
145
187
  logger.warning(msg)
146
188
  print(msg)
147
189
  if warning_counter:
@@ -150,48 +192,62 @@ def balance_undersample(
150
192
  # fill up to min_sample_threshold by majority class
151
193
  minority_class = df[df[target_column] == min_class_value]
152
194
  majority_class = df[df[target_column] != min_class_value]
153
- sample_size = min(len(majority_class), min_sample_threshold - min_class_count)
195
+ # sample_size = min(len(majority_class), min_sample_threshold - min_class_count)
196
+ sample_size = min(
197
+ max_class_count,
198
+ binary_bootstrap_loops * (min_class_count + max(binary_min_sample_threshold - 2 * min_class_count, 0)),
199
+ )
200
+ logger.info(
201
+ f"Min class count: {min_class_count}. Max class count: {max_class_count}."
202
+ f" Rebalance sample size: {sample_size}"
203
+ )
154
204
  sampled_majority_class = majority_class.sample(n=sample_size, random_state=random_state)
155
205
  resampled_data = df[
156
206
  (df[SYSTEM_RECORD_ID].isin(minority_class[SYSTEM_RECORD_ID]))
157
207
  | (df[SYSTEM_RECORD_ID].isin(sampled_majority_class[SYSTEM_RECORD_ID]))
158
208
  ]
159
209
 
160
- elif max_class_count > min_class_count * binary_bootstrap_loops:
161
- msg = bundle.get("dataset_rarest_class_less_threshold").format(
162
- min_class_value, min_class_count, min_class_threshold, min_class_percent * 100
163
- )
164
- logger.warning(msg)
165
- print(msg)
166
- if warning_counter:
167
- warning_counter.increment()
210
+ # elif max_class_count > min_class_count * binary_bootstrap_loops:
211
+ # msg = bundle.get("dataset_rarest_class_less_threshold").format(
212
+ # min_class_value, min_class_count, min_class_threshold, min_class_percent * 100
213
+ # )
214
+ # logger.warning(msg)
215
+ # print(msg)
216
+ # if warning_counter:
217
+ # warning_counter.increment()
168
218
 
169
- sampler = RandomUnderSampler(
170
- sampling_strategy={max_class_value: binary_bootstrap_loops * min_class_count}, random_state=random_state
171
- )
172
- X = df[SYSTEM_RECORD_ID]
173
- X = X.to_frame(SYSTEM_RECORD_ID)
174
- new_x, _ = sampler.fit_resample(X, target) # type: ignore
219
+ # sampler = RandomUnderSampler(
220
+ # sampling_strategy={max_class_value: binary_bootstrap_loops * min_class_count}, random_state=random_state
221
+ # )
222
+ # X = df[SYSTEM_RECORD_ID]
223
+ # X = X.to_frame(SYSTEM_RECORD_ID)
224
+ # new_x, _ = sampler.fit_resample(X, target) # type: ignore
175
225
 
176
- resampled_data = df[df[SYSTEM_RECORD_ID].isin(new_x[SYSTEM_RECORD_ID])]
226
+ # resampled_data = df[df[SYSTEM_RECORD_ID].isin(new_x[SYSTEM_RECORD_ID])]
177
227
 
178
228
  logger.info(f"Shape after rebalance resampling: {resampled_data}")
179
229
  return resampled_data
180
230
 
181
231
 
182
- def calculate_psi(expected: pd.Series, actual: pd.Series) -> float:
183
- df = pd.concat([expected, actual])
232
+ def calculate_psi(expected: pd.Series, actual: pd.Series) -> Union[float, Exception]:
233
+ try:
234
+ df = pd.concat([expected, actual])
235
+
236
+ if is_bool_dtype(df):
237
+ df = np.where(df, 1, 0)
184
238
 
185
- # Define the bins for the target variable
186
- df_min = df.min()
187
- df_max = df.max()
188
- bins = [df_min, (df_min + df_max) / 2, df_max]
239
+ # Define the bins for the target variable
240
+ df_min = df.min()
241
+ df_max = df.max()
242
+ bins = [df_min, (df_min + df_max) / 2, df_max]
189
243
 
190
- # Calculate the base distribution
191
- train_distribution = expected.value_counts(bins=bins, normalize=True).sort_index().values
244
+ # Calculate the base distribution
245
+ train_distribution = expected.value_counts(bins=bins, normalize=True).sort_index().values
192
246
 
193
- # Calculate the target distribution
194
- test_distribution = actual.value_counts(bins=bins, normalize=True).sort_index().values
247
+ # Calculate the target distribution
248
+ test_distribution = actual.value_counts(bins=bins, normalize=True).sort_index().values
195
249
 
196
- # Calculate the PSI
197
- return np.sum((train_distribution - test_distribution) * np.log(train_distribution / test_distribution))
250
+ # Calculate the PSI
251
+ return np.sum((train_distribution - test_distribution) * np.log(train_distribution / test_distribution))
252
+ except Exception as e:
253
+ return e
@@ -4,6 +4,7 @@ class WarningCounter:
4
4
 
5
5
  def increment(self):
6
6
  self._count += 1
7
+ return self._count
7
8
 
8
9
  def reset(self):
9
10
  self._count = 0
@@ -1,5 +1,6 @@
1
1
  import json
2
2
  import threading
3
+ from typing import Callable, Optional
3
4
 
4
5
  import requests
5
6
 
@@ -30,15 +31,18 @@ def get_version(package, url_pattern=URL_PATTERN):
30
31
  return version
31
32
 
32
33
 
33
- def validate_version(logger: logging.Logger):
34
+ def validate_version(logger: logging.Logger, warning_function: Optional[Callable[[str], None]] = None):
34
35
  def task():
35
36
  try:
36
37
  current_version = parse(__version__)
37
38
  latest_version = get_version("upgini")
38
- if current_version < latest_version: # type: ignore
39
+ if current_version < latest_version:
39
40
  msg = bundle.get("version_warning").format(current_version, latest_version)
40
- logger.warning(msg)
41
- print(msg)
41
+ if warning_function:
42
+ warning_function(msg)
43
+ else:
44
+ logger.warning(msg)
45
+ print(msg)
42
46
  except Exception:
43
47
  logger.warning("Failed to validate version", exc_info=True)
44
48
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: upgini
3
- Version: 1.1.280.dev0
3
+ Version: 1.2.31
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
6
6
  Project-URL: Homepage, https://upgini.com/
@@ -22,15 +22,17 @@ Classifier: Programming Language :: Python :: 3.9
22
22
  Classifier: Programming Language :: Python :: 3.10
23
23
  Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
24
24
  Classifier: Topic :: Scientific/Engineering :: Information Analysis
25
- Requires-Python: <3.11,>=3.8
25
+ Requires-Python: <3.12,>=3.8
26
26
  Requires-Dist: catboost>=1.0.3
27
27
  Requires-Dist: fastparquet>=0.8.1
28
28
  Requires-Dist: ipywidgets>=8.1.0
29
- Requires-Dist: lightgbm>=3.3.2
30
- Requires-Dist: numpy>=1.19.0
29
+ Requires-Dist: jarowinkler>=2.0.0
30
+ Requires-Dist: levenshtein>=0.25.1
31
+ Requires-Dist: numpy<=1.26.4,>=1.19.0
31
32
  Requires-Dist: pandas<3.0.0,>=1.1.0
32
- Requires-Dist: pydantic<2.0.0,>=1.8.2
33
+ Requires-Dist: pydantic<3.0.0,>1.0.0
33
34
  Requires-Dist: pyjwt>=2.8.0
35
+ Requires-Dist: python-bidi==0.4.2
34
36
  Requires-Dist: python-dateutil>=2.8.0
35
37
  Requires-Dist: python-json-logger>=2.0.2
36
38
  Requires-Dist: requests>=2.8.0
@@ -130,7 +132,7 @@ Description-Content-Type: text/markdown
130
132
  |Consumer Confidence index| 44 |22|-|Monthly|date, country|No
131
133
  |World economic indicators|191 |41|-|Monthly|date, country|No
132
134
  |Markets data|-|17|-|Monthly|date, datetime|No
133
- |World mobile & fixed broadband network coverage and perfomance |167|-|3|Monthly|country, postal/ZIP code|No
135
+ |World mobile & fixed broadband network coverage and performance |167|-|3|Monthly|country, postal/ZIP code|No
134
136
  |World demographic data |90|-|2|Annual|country, postal/ZIP code|No
135
137
  |World house prices |44|-|3|Annual|country, postal/ZIP code|No
136
138
  |Public social media profile data |104|-|-|Monthly|date, email/HEM, phone |Yes
@@ -143,7 +145,7 @@ Description-Content-Type: text/markdown
143
145
 
144
146
  ## 💼 Tutorials
145
147
 
146
- ### [Search of relevant external features & Automated feature generation for Salary predicton task (use as a template)](https://github.com/upgini/upgini/blob/main/notebooks/Upgini_Features_search%26generation.ipynb)
148
+ ### [Search of relevant external features & Automated feature generation for Salary prediction task (use as a template)](https://github.com/upgini/upgini/blob/main/notebooks/Upgini_Features_search%26generation.ipynb)
147
149
 
148
150
  * The goal is to predict salary for data science job posting based on information about employer and job description.
149
151
  * Following this guide, you'll learn how to **search & auto generate new relevant features with Upgini library**
@@ -257,7 +259,9 @@ We do dataset verification and cleaning under the hood, but still there are some
257
259
  *Search keys* columns will be used to match records from all potential external data sources / features.
258
260
  Define one or multiple columns as search keys with `FeaturesEnricher` class initialization.
259
261
  ```python
260
- from upgini import FeaturesEnricher, SearchKey
262
+ from upgini.features_enricher import FeaturesEnricher
263
+ from upgini.metadata import SearchKey
264
+
261
265
  enricher = FeaturesEnricher(
262
266
  search_keys={
263
267
  "subscription_activation_date": SearchKey.DATE,
@@ -343,7 +347,9 @@ enricher = FeaturesEnricher(
343
347
 
344
348
  For the meaning types <tt>SearchKey.DATE</tt>/<tt>SearchKey.DATETIME</tt> with dtypes <tt>object</tt> or <tt>string</tt> you have to clarify date/datetime format by passing <tt>date_format</tt> parameter to `FeaturesEnricher`. For example:
345
349
  ```python
346
- from upgini import FeaturesEnricher, SearchKey
350
+ from upgini.features_enricher import FeaturesEnricher
351
+ from upgini.metadata import SearchKey
352
+
347
353
  enricher = FeaturesEnricher(
348
354
  search_keys={
349
355
  "subscription_activation_date": SearchKey.DATE,
@@ -364,7 +370,9 @@ df["date"] = df.date.astype("datetime64").dt.tz_localize("Europe/Warsaw")
364
370
 
365
371
  Single country for the whole training dataset can be passed with `country_code` parameter:
366
372
  ```python
367
- from upgini import FeaturesEnricher, SearchKey
373
+ from upgini.features_enricher import FeaturesEnricher
374
+ from upgini.metadata import SearchKey
375
+
368
376
  enricher = FeaturesEnricher(
369
377
  search_keys={
370
378
  "subscription_activation_date": SearchKey.DATE,
@@ -383,7 +391,8 @@ Create instance of the `FeaturesEnricher` class and call:
383
391
  Let's try it out!
384
392
  ```python
385
393
  import pandas as pd
386
- from upgini import FeaturesEnricher, SearchKey
394
+ from upgini.features_enricher import FeaturesEnricher
395
+ from upgini.metadata import SearchKey
387
396
 
388
397
  # load labeled training dataset to initiate search
389
398
  train_df = pd.read_csv("customer_churn_prediction_train.csv")
@@ -474,7 +483,9 @@ We detect ML task under the hood based on label column values. Currently we supp
474
483
 
475
484
  But for certain search datasets you can pass parameter to `FeaturesEnricher` with correct ML task type:
476
485
  ```python
477
- from upgini import ModelTaskType
486
+ from upgini.features_enricher import FeaturesEnricher
487
+ from upgini.metadata import SearchKey, ModelTaskType
488
+
478
489
  enricher = FeaturesEnricher(
479
490
  search_keys={"subscription_activation_date": SearchKey.DATE},
480
491
  model_task_type=ModelTaskType.REGRESSION
@@ -487,7 +498,9 @@ enricher = FeaturesEnricher(
487
498
 
488
499
  To initiate feature search you can pass cross-validation type parameter to `FeaturesEnricher` with time series specific CV type:
489
500
  ```python
490
- from upgini.metadata import CVType
501
+ from upgini.features_enricher import FeaturesEnricher
502
+ from upgini.metadata import SearchKey, CVType
503
+
491
504
  enricher = FeaturesEnricher(
492
505
  search_keys={"sales_date": SearchKey.DATE},
493
506
  cv=CVType.time_series
@@ -621,7 +634,9 @@ But you can easily define new split by passing child of BaseCrossValidator to pa
621
634
 
622
635
  Example with more tips-and-tricks:
623
636
  ```python
624
- from upgini import FeaturesEnricher, SearchKey
637
+ from upgini.features_enricher import FeaturesEnricher
638
+ from upgini.metadata import SearchKey
639
+
625
640
  enricher = FeaturesEnricher(search_keys={"registration_date": SearchKey.DATE})
626
641
 
627
642
  # Fit with default setup for metrics calculation
@@ -794,7 +809,7 @@ You may publish ANY data which you consider as royalty / license free ([Open Dat
794
809
  2. Copy *Upgini API key* from profile and upload your data from Upgini python library with this key:
795
810
  ```python
796
811
  import pandas as pd
797
- from upgini import SearchKey
812
+ from upgini.metadata import SearchKey
798
813
  from upgini.ads import upload_user_ads
799
814
  import os
800
815
  os.environ["UPGINI_API_KEY"] = "your_long_string_api_key_goes_here"
@@ -839,4 +854,4 @@ Some convenient ways to start contributing are:
839
854
  - [More perks for registered users](https://profile.upgini.com)
840
855
 
841
856
  <sup>😔 Found mistype or a bug in code snippet? Our bad! <a href="https://github.com/upgini/upgini/issues/new?assignees=&title=readme%2Fbug">
842
- Please report it here.</a></sup>
857
+ Please report it here</a></sup>
@@ -0,0 +1,65 @@
1
+ upgini/__about__.py,sha256=ZMRxZM_8KClqm4X0jGVzsRbSK2eN35eEoOdQFqr5IU0,23
2
+ upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
3
+ upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
4
+ upgini/dataset.py,sha256=iPFiMJtk4HF1ytw9wCQr8H9RfoOKj_TIo8XYZKWgcMc,31331
5
+ upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
6
+ upgini/features_enricher.py,sha256=lNfu5Z40NmkkGJScKAwe_0VBtL8liePifuAlKE_flfA,192053
7
+ upgini/http.py,sha256=plZGTGoi1h2edd8Cnjt4eYB8t4NbBGnZz7DtPTByiNc,42885
8
+ upgini/lazy_import.py,sha256=74gQ8JuA48BGRLxAo7lNHNKY2D2emMxrUxKGdxVGhuY,1012
9
+ upgini/metadata.py,sha256=lUa2xYhBhnCeTqNt6lWc9iP_YuikYGIsDSn8Vwyjv1I,11235
10
+ upgini/metrics.py,sha256=hr7UwLphbZ_FEglLuO2lzr_pFgxOJ4c3WBeg7H-fNqY,35521
11
+ upgini/search_task.py,sha256=qxUxAD-bed-FpZYmTB_4orW7YJsW_O6a1TcgnZIRFr4,17307
12
+ upgini/spinner.py,sha256=4iMd-eIe_BnkqFEMIliULTbj6rNI2HkN_VJ4qYe0cUc,1118
13
+ upgini/version_validator.py,sha256=h1GViOWzULy5vf6M4dpTJuIk-4V38UCrTY1sb9yLa5I,1594
14
+ upgini/ads_management/__init__.py,sha256=qzyisOToVRP-tquAJD1PblZhNtMrOB8FiyF9JvfkvgE,50
15
+ upgini/ads_management/ads_manager.py,sha256=igVbN2jz80Umb2BUJixmJVj-zx8unoKpecVo-R-nGdw,2648
16
+ upgini/autofe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
17
+ upgini/autofe/all_operands.py,sha256=cCCB44qvkmuWyiRM5Xykx8tkHPIjQthrWyj67STWN80,2578
18
+ upgini/autofe/binary.py,sha256=zMhtHVuGUAFLUqem-XiXqJj-GRXxS88tdz8tFuDfSNM,7659
19
+ upgini/autofe/date.py,sha256=OpFc3Al0xO3qlESn2Uokfxw51ArVqmh3xngWwdrsaqE,9762
20
+ upgini/autofe/feature.py,sha256=eL7wABUhDKZzv3E-RPJNcyGwSfB0UptcfU2RbvsOks4,15082
21
+ upgini/autofe/groupby.py,sha256=r-xl_keZZgm_tpiEoDhjYSkT6NHv7a4cRQR4wJ4uCp8,3263
22
+ upgini/autofe/operand.py,sha256=uk883RaNqgXqtkaRqA1re1d9OFnnpv0JVvelYx09Yw0,2943
23
+ upgini/autofe/unary.py,sha256=T3E7F3dA_7o_rkdCFq7JV6nHLzcoHLHQTcxO7y5Opa4,4646
24
+ upgini/autofe/vector.py,sha256=ehcZUDqV71TfbU8EmKfdYp603gS2dJY_-fpr10ho5sI,663
25
+ upgini/data_source/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
26
+ upgini/data_source/data_source_publisher.py,sha256=X-8aGtVgzGmxyXkMVBoBLIGDMb4lYQaGZbxDnOd4A3Q,22516
27
+ upgini/mdc/__init__.py,sha256=aM08nIWFc2gWdWUa3_IuEnNND0cQPkBGnYpRMnfFN8k,1019
28
+ upgini/mdc/context.py,sha256=3u1B-jXt7tXEvNcV3qmR9SDCseudnY7KYsLclBdwVLk,1405
29
+ upgini/normalizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
30
+ upgini/normalizer/normalize_utils.py,sha256=Ft2MwSgVoBilXAORAOYAuwPD79GOLfwn4qQE3IUFzzg,7218
31
+ upgini/resource_bundle/__init__.py,sha256=S5F2G47pnJd2LDpmFsjDqEwiKkP8Hm-hcseDbMka6Ko,8345
32
+ upgini/resource_bundle/exceptions.py,sha256=5fRvx0_vWdE1-7HcSgF0tckB4A9AKyf5RiinZkInTsI,621
33
+ upgini/resource_bundle/strings.properties,sha256=bKw_rjZZTomLJhQBqiM7_P2EoRq45_Ng2gP4WE6MRBE,26921
34
+ upgini/resource_bundle/strings_widget.properties,sha256=gOdqvZWntP2LCza_tyVk1_yRYcG4c04K9sQOAVhF_gw,1577
35
+ upgini/sampler/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
36
+ upgini/sampler/base.py,sha256=7GpjYqjOp58vYcJLiX__1R5wjUlyQbxvHJ2klFnup_M,6389
37
+ upgini/sampler/random_under_sampler.py,sha256=TIbm7ATo-bCMF-IiS5sZeDC1ad1SYg0eY_rRmg84yIQ,4024
38
+ upgini/sampler/utils.py,sha256=PYOk3kKSnFlyxcpdtDNLBEEhTB4lO_iP7pQHqeUcmAc,20211
39
+ upgini/utils/Roboto-Regular.ttf,sha256=kqYnZjMRQMpbyLulIChCLSdgYa1XF8GsUIoRi2Gcauw,168260
40
+ upgini/utils/__init__.py,sha256=O_KgzKiJjW3g4NoqZ7lAxUpoHcBi_gze6r3ndEjCH74,842
41
+ upgini/utils/base_search_key_detector.py,sha256=Inc6iGG-VXQdejWFfbekIkZk2ahC4k7CdGqzOkie6Bs,1021
42
+ upgini/utils/blocked_time_series.py,sha256=Uqr3vp4YqNclj2-PzEYqVy763GSXHn86sbpIl1UOB4s,3382
43
+ upgini/utils/country_utils.py,sha256=lY-eXWwFVegdVENFttbvLcgGDjFO17Sex8hd2PyJaRk,6937
44
+ upgini/utils/custom_loss_utils.py,sha256=kieNZYBYZm5ZGBltF1F_jOSF4ea6C29rYuCyiDcqVNY,3857
45
+ upgini/utils/cv_utils.py,sha256=w6FQb9nO8BWDx88EF83NpjPLarK4eR4ia0Wg0kLBJC4,3525
46
+ upgini/utils/datetime_utils.py,sha256=F61i2vZCB6eUy4WwodDyPi50XKPbhOHsxDrU6tGa6CM,13133
47
+ upgini/utils/deduplicate_utils.py,sha256=SMZx9IKIhWI5HqXepfKiQb3uDJrogQZtG6jcWuMo5Z4,8855
48
+ upgini/utils/display_utils.py,sha256=DsBjJ8jEYAh8BPgfAbzq5imoGFV6IACP20PQ78BQCX0,11964
49
+ upgini/utils/email_utils.py,sha256=GbnhHJn1nhUBytmK6PophYqaoq4t7Lp6i0-O0Gd3RV8,5265
50
+ upgini/utils/fallback_progress_bar.py,sha256=PDaKb8dYpVZaWMroNcOHsTc3pSjgi9mOm0--cOFTwJ0,1074
51
+ upgini/utils/feature_info.py,sha256=Tp_2g5-rCjY4NpzKhzxwNxuqH5FFL8vG94OU5kH6wzk,6702
52
+ upgini/utils/features_validator.py,sha256=lEfmk4DoxZ4ooOE1HC0ZXtUb_lFKRFHIrnFULZ4_rL8,3746
53
+ upgini/utils/format.py,sha256=Yv5cvvSs2bOLUzzNu96Pu33VMDNbabio92QepUj41jU,243
54
+ upgini/utils/ip_utils.py,sha256=Q6vb7Sr5Khx3Sq3eENjW2qCXKej_S5jZbneH6zEOkzQ,5171
55
+ upgini/utils/phone_utils.py,sha256=IrbztLuOJBiePqqxllfABWfYlfAjYevPhXKipl95wUI,10432
56
+ upgini/utils/postal_code_utils.py,sha256=5M0sUqH2DAr33kARWCTXR-ACyzWbjDq_-0mmEml6ZcU,1716
57
+ upgini/utils/progress_bar.py,sha256=N-Sfdah2Hg8lXP_fV9EfUTXz_PyRt4lo9fAHoUDOoLc,1550
58
+ upgini/utils/sklearn_ext.py,sha256=13jQS_k7v0aUtudXV6nGUEWjttPQzAW9AFYL5wgEz9k,44511
59
+ upgini/utils/target_utils.py,sha256=PU77nIhTz7IHbC4rpTpxrVxib6cdpRL9F1dhkjIffLY,10225
60
+ upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
61
+ upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
62
+ upgini-1.2.31.dist-info/METADATA,sha256=_OJUvR8p-0uuVdltUq34yo_W5OZZvKOlID5OHlYY9Do,48578
63
+ upgini-1.2.31.dist-info/WHEEL,sha256=zEMcRr9Kr03x1ozGwg5v9NQBKn3kndp6LSoSlVg-jhU,87
64
+ upgini-1.2.31.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
65
+ upgini-1.2.31.dist-info/RECORD,,