upgini 1.1.280a3418.post2__py3-none-any.whl → 1.2.31__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- upgini/__about__.py +1 -1
- upgini/__init__.py +4 -20
- upgini/autofe/all_operands.py +39 -10
- upgini/autofe/binary.py +148 -45
- upgini/autofe/date.py +197 -26
- upgini/autofe/feature.py +102 -19
- upgini/autofe/groupby.py +22 -22
- upgini/autofe/operand.py +9 -6
- upgini/autofe/unary.py +78 -54
- upgini/autofe/vector.py +8 -8
- upgini/data_source/data_source_publisher.py +128 -5
- upgini/dataset.py +50 -386
- upgini/features_enricher.py +936 -541
- upgini/http.py +27 -16
- upgini/lazy_import.py +35 -0
- upgini/metadata.py +84 -59
- upgini/metrics.py +164 -34
- upgini/normalizer/normalize_utils.py +197 -0
- upgini/resource_bundle/strings.properties +66 -51
- upgini/search_task.py +10 -4
- upgini/utils/Roboto-Regular.ttf +0 -0
- upgini/utils/base_search_key_detector.py +14 -12
- upgini/utils/country_utils.py +16 -0
- upgini/utils/custom_loss_utils.py +39 -36
- upgini/utils/datetime_utils.py +98 -45
- upgini/utils/deduplicate_utils.py +135 -112
- upgini/utils/display_utils.py +46 -15
- upgini/utils/email_utils.py +54 -16
- upgini/utils/feature_info.py +172 -0
- upgini/utils/features_validator.py +34 -20
- upgini/utils/ip_utils.py +100 -1
- upgini/utils/phone_utils.py +343 -0
- upgini/utils/postal_code_utils.py +34 -0
- upgini/utils/sklearn_ext.py +28 -19
- upgini/utils/target_utils.py +113 -57
- upgini/utils/warning_counter.py +1 -0
- upgini/version_validator.py +8 -4
- {upgini-1.1.280a3418.post2.dist-info → upgini-1.2.31.dist-info}/METADATA +31 -16
- upgini-1.2.31.dist-info/RECORD +65 -0
- upgini/normalizer/phone_normalizer.py +0 -340
- upgini-1.1.280a3418.post2.dist-info/RECORD +0 -62
- {upgini-1.1.280a3418.post2.dist-info → upgini-1.2.31.dist-info}/WHEEL +0 -0
- {upgini-1.1.280a3418.post2.dist-info → upgini-1.2.31.dist-info}/licenses/LICENSE +0 -0
upgini/utils/target_utils.py
CHANGED
|
@@ -3,7 +3,7 @@ from typing import Optional, Union
|
|
|
3
3
|
|
|
4
4
|
import numpy as np
|
|
5
5
|
import pandas as pd
|
|
6
|
-
from pandas.api.types import is_numeric_dtype
|
|
6
|
+
from pandas.api.types import is_numeric_dtype, is_bool_dtype
|
|
7
7
|
|
|
8
8
|
from upgini.errors import ValidationError
|
|
9
9
|
from upgini.metadata import SYSTEM_RECORD_ID, ModelTaskType
|
|
@@ -24,49 +24,83 @@ def define_task(
|
|
|
24
24
|
) -> ModelTaskType:
|
|
25
25
|
if logger is None:
|
|
26
26
|
logger = logging.getLogger()
|
|
27
|
+
|
|
28
|
+
# Replace inf and -inf with NaN to handle extreme values correctly
|
|
29
|
+
y = y.replace([np.inf, -np.inf], np.nan, inplace=False)
|
|
30
|
+
|
|
31
|
+
# Drop NaN values from the target
|
|
27
32
|
target = y.dropna()
|
|
33
|
+
|
|
34
|
+
# Check if target is numeric and finite
|
|
28
35
|
if is_numeric_dtype(target):
|
|
29
36
|
target = target.loc[np.isfinite(target)]
|
|
30
37
|
else:
|
|
38
|
+
# If not numeric, drop empty strings as well
|
|
31
39
|
target = target.loc[target != ""]
|
|
40
|
+
|
|
41
|
+
# Raise error if there are no valid values left in the target
|
|
32
42
|
if len(target) == 0:
|
|
33
43
|
raise ValidationError(bundle.get("empty_target"))
|
|
44
|
+
|
|
45
|
+
# Count unique values in the target
|
|
34
46
|
target_items = target.nunique()
|
|
47
|
+
|
|
48
|
+
# Raise error if all target values are the same
|
|
35
49
|
if target_items == 1:
|
|
36
50
|
raise ValidationError(bundle.get("dataset_constant_target"))
|
|
51
|
+
|
|
52
|
+
reason = "" # Will store the reason for selecting the task type
|
|
53
|
+
|
|
54
|
+
# Binary classification case: exactly two unique values
|
|
37
55
|
if target_items == 2:
|
|
38
56
|
task = ModelTaskType.BINARY
|
|
57
|
+
reason = bundle.get("binary_target_reason")
|
|
39
58
|
else:
|
|
59
|
+
# Attempt to convert target to numeric
|
|
40
60
|
try:
|
|
41
61
|
target = pd.to_numeric(target)
|
|
42
62
|
is_numeric = True
|
|
43
63
|
except Exception:
|
|
44
64
|
is_numeric = False
|
|
45
65
|
|
|
46
|
-
# If
|
|
66
|
+
# If target cannot be converted to numeric, assume multiclass classification
|
|
47
67
|
if not is_numeric:
|
|
48
68
|
task = ModelTaskType.MULTICLASS
|
|
69
|
+
reason = bundle.get("non_numeric_multiclass_reason")
|
|
49
70
|
else:
|
|
71
|
+
# Multiclass classification: few unique values and integer encoding
|
|
50
72
|
if target.nunique() <= 50 and is_int_encoding(target.unique()):
|
|
51
73
|
task = ModelTaskType.MULTICLASS
|
|
74
|
+
reason = bundle.get("few_unique_label_multiclass_reason")
|
|
75
|
+
# Regression case: if there is date, assume regression
|
|
52
76
|
elif has_date:
|
|
53
77
|
task = ModelTaskType.REGRESSION
|
|
78
|
+
reason = bundle.get("date_search_key_regression_reason")
|
|
54
79
|
else:
|
|
80
|
+
# Remove zero values and recalculate unique ratio
|
|
55
81
|
non_zero_target = target[target != 0]
|
|
56
82
|
target_items = non_zero_target.nunique()
|
|
57
83
|
target_ratio = target_items / len(non_zero_target)
|
|
84
|
+
|
|
85
|
+
# Use unique_ratio to determine whether to classify as regression or multiclass
|
|
58
86
|
if (
|
|
59
|
-
(target.dtype.kind == "f" and np.any(target != target.astype(int))) #
|
|
87
|
+
(target.dtype.kind == "f" and np.any(target != target.astype(int))) # Non-integer float values
|
|
60
88
|
or target_items > 50
|
|
61
|
-
or target_ratio > 0.2
|
|
89
|
+
or target_ratio > 0.2 # If non-zero values have high ratio of uniqueness
|
|
62
90
|
):
|
|
63
91
|
task = ModelTaskType.REGRESSION
|
|
92
|
+
reason = bundle.get("many_unique_label_regression_reason")
|
|
64
93
|
else:
|
|
65
94
|
task = ModelTaskType.MULTICLASS
|
|
95
|
+
reason = bundle.get("limited_int_multiclass_reason")
|
|
66
96
|
|
|
67
|
-
|
|
97
|
+
# Log or print the reason for the selected task type
|
|
98
|
+
logger.info(f"Detected task type: {task} (Reason: {reason})")
|
|
99
|
+
|
|
100
|
+
# Print task type and reason if silent mode is off
|
|
68
101
|
if not silent:
|
|
69
|
-
print(bundle.get("target_type_detected").format(task))
|
|
102
|
+
print(bundle.get("target_type_detected").format(task, reason))
|
|
103
|
+
|
|
70
104
|
return task
|
|
71
105
|
|
|
72
106
|
|
|
@@ -81,8 +115,8 @@ def balance_undersample(
|
|
|
81
115
|
target_column: str,
|
|
82
116
|
task_type: ModelTaskType,
|
|
83
117
|
random_state: int,
|
|
84
|
-
|
|
85
|
-
|
|
118
|
+
binary_min_sample_threshold: int = 5000,
|
|
119
|
+
multiclass_min_sample_threshold: int = 25000,
|
|
86
120
|
binary_bootstrap_loops: int = 5,
|
|
87
121
|
multiclass_bootstrap_loops: int = 2,
|
|
88
122
|
logger: Optional[logging.Logger] = None,
|
|
@@ -96,52 +130,60 @@ def balance_undersample(
|
|
|
96
130
|
if SYSTEM_RECORD_ID not in df.columns:
|
|
97
131
|
raise Exception("System record id must be presented for undersampling")
|
|
98
132
|
|
|
99
|
-
count = len(df)
|
|
133
|
+
# count = len(df)
|
|
100
134
|
target = df[target_column].copy()
|
|
101
|
-
target_classes_count = target.nunique()
|
|
135
|
+
# target_classes_count = target.nunique()
|
|
102
136
|
|
|
103
137
|
vc = target.value_counts()
|
|
104
138
|
max_class_value = vc.index[0]
|
|
105
139
|
min_class_value = vc.index[len(vc) - 1]
|
|
106
140
|
max_class_count = vc[max_class_value]
|
|
107
141
|
min_class_count = vc[min_class_value]
|
|
142
|
+
num_classes = len(vc)
|
|
108
143
|
|
|
109
|
-
min_class_percent = imbalance_threshold / target_classes_count
|
|
110
|
-
min_class_threshold = int(min_class_percent * count)
|
|
144
|
+
# min_class_percent = imbalance_threshold / target_classes_count
|
|
145
|
+
# min_class_threshold = int(min_class_percent * count)
|
|
111
146
|
|
|
112
147
|
resampled_data = df
|
|
113
148
|
df = df.copy().sort_values(by=SYSTEM_RECORD_ID)
|
|
114
149
|
if task_type == ModelTaskType.MULTICLASS:
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
if max_class_count > (quantile25_class_cnt * multiclass_bootstrap_loops):
|
|
122
|
-
msg = bundle.get("imbalance_multiclass").format(quantile25_class, quantile25_class_cnt)
|
|
150
|
+
if len(df) > multiclass_min_sample_threshold and max_class_count > (
|
|
151
|
+
min_class_count * multiclass_bootstrap_loops
|
|
152
|
+
):
|
|
153
|
+
|
|
154
|
+
# msg = bundle.get("imbalance_multiclass").format(min_class_value, min_class_count)
|
|
155
|
+
msg = bundle.get("imbalanced_target").format(min_class_value, min_class_count)
|
|
123
156
|
logger.warning(msg)
|
|
124
157
|
print(msg)
|
|
125
158
|
if warning_counter:
|
|
126
159
|
warning_counter.increment()
|
|
127
160
|
|
|
128
|
-
# 25% and lower classes will stay as is. Higher classes will be downsampled
|
|
129
161
|
sample_strategy = dict()
|
|
130
|
-
for
|
|
131
|
-
|
|
132
|
-
|
|
162
|
+
for class_value in vc.index:
|
|
163
|
+
if class_value == min_class_value:
|
|
164
|
+
continue
|
|
133
165
|
class_count = vc[class_value]
|
|
134
|
-
|
|
166
|
+
sample_size = min(
|
|
167
|
+
class_count,
|
|
168
|
+
multiclass_bootstrap_loops
|
|
169
|
+
* (
|
|
170
|
+
min_class_count
|
|
171
|
+
+ max((multiclass_min_sample_threshold - num_classes * min_class_count) / (num_classes - 1), 0)
|
|
172
|
+
),
|
|
173
|
+
)
|
|
174
|
+
sample_strategy[class_value] = int(sample_size)
|
|
175
|
+
logger.info(f"Rebalance sample strategy: {sample_strategy}. Min class count: {min_class_count}")
|
|
135
176
|
sampler = RandomUnderSampler(sampling_strategy=sample_strategy, random_state=random_state)
|
|
136
177
|
X = df[SYSTEM_RECORD_ID]
|
|
137
178
|
X = X.to_frame(SYSTEM_RECORD_ID)
|
|
138
179
|
new_x, _ = sampler.fit_resample(X, target) # type: ignore
|
|
139
180
|
|
|
140
181
|
resampled_data = df[df[SYSTEM_RECORD_ID].isin(new_x[SYSTEM_RECORD_ID])]
|
|
141
|
-
elif len(df) >
|
|
142
|
-
msg = bundle.get("dataset_rarest_class_less_threshold").format(
|
|
143
|
-
|
|
144
|
-
)
|
|
182
|
+
elif len(df) > binary_min_sample_threshold:
|
|
183
|
+
# msg = bundle.get("dataset_rarest_class_less_threshold").format(
|
|
184
|
+
# min_class_value, min_class_count, min_class_threshold, min_class_percent * 100
|
|
185
|
+
# )
|
|
186
|
+
msg = bundle.get("imbalanced_target").format(min_class_value, min_class_count)
|
|
145
187
|
logger.warning(msg)
|
|
146
188
|
print(msg)
|
|
147
189
|
if warning_counter:
|
|
@@ -150,48 +192,62 @@ def balance_undersample(
|
|
|
150
192
|
# fill up to min_sample_threshold by majority class
|
|
151
193
|
minority_class = df[df[target_column] == min_class_value]
|
|
152
194
|
majority_class = df[df[target_column] != min_class_value]
|
|
153
|
-
sample_size = min(len(majority_class), min_sample_threshold - min_class_count)
|
|
195
|
+
# sample_size = min(len(majority_class), min_sample_threshold - min_class_count)
|
|
196
|
+
sample_size = min(
|
|
197
|
+
max_class_count,
|
|
198
|
+
binary_bootstrap_loops * (min_class_count + max(binary_min_sample_threshold - 2 * min_class_count, 0)),
|
|
199
|
+
)
|
|
200
|
+
logger.info(
|
|
201
|
+
f"Min class count: {min_class_count}. Max class count: {max_class_count}."
|
|
202
|
+
f" Rebalance sample size: {sample_size}"
|
|
203
|
+
)
|
|
154
204
|
sampled_majority_class = majority_class.sample(n=sample_size, random_state=random_state)
|
|
155
205
|
resampled_data = df[
|
|
156
206
|
(df[SYSTEM_RECORD_ID].isin(minority_class[SYSTEM_RECORD_ID]))
|
|
157
207
|
| (df[SYSTEM_RECORD_ID].isin(sampled_majority_class[SYSTEM_RECORD_ID]))
|
|
158
208
|
]
|
|
159
209
|
|
|
160
|
-
elif max_class_count > min_class_count * binary_bootstrap_loops:
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
210
|
+
# elif max_class_count > min_class_count * binary_bootstrap_loops:
|
|
211
|
+
# msg = bundle.get("dataset_rarest_class_less_threshold").format(
|
|
212
|
+
# min_class_value, min_class_count, min_class_threshold, min_class_percent * 100
|
|
213
|
+
# )
|
|
214
|
+
# logger.warning(msg)
|
|
215
|
+
# print(msg)
|
|
216
|
+
# if warning_counter:
|
|
217
|
+
# warning_counter.increment()
|
|
168
218
|
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
219
|
+
# sampler = RandomUnderSampler(
|
|
220
|
+
# sampling_strategy={max_class_value: binary_bootstrap_loops * min_class_count}, random_state=random_state
|
|
221
|
+
# )
|
|
222
|
+
# X = df[SYSTEM_RECORD_ID]
|
|
223
|
+
# X = X.to_frame(SYSTEM_RECORD_ID)
|
|
224
|
+
# new_x, _ = sampler.fit_resample(X, target) # type: ignore
|
|
175
225
|
|
|
176
|
-
|
|
226
|
+
# resampled_data = df[df[SYSTEM_RECORD_ID].isin(new_x[SYSTEM_RECORD_ID])]
|
|
177
227
|
|
|
178
228
|
logger.info(f"Shape after rebalance resampling: {resampled_data}")
|
|
179
229
|
return resampled_data
|
|
180
230
|
|
|
181
231
|
|
|
182
|
-
def calculate_psi(expected: pd.Series, actual: pd.Series) -> float:
|
|
183
|
-
|
|
232
|
+
def calculate_psi(expected: pd.Series, actual: pd.Series) -> Union[float, Exception]:
|
|
233
|
+
try:
|
|
234
|
+
df = pd.concat([expected, actual])
|
|
235
|
+
|
|
236
|
+
if is_bool_dtype(df):
|
|
237
|
+
df = np.where(df, 1, 0)
|
|
184
238
|
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
239
|
+
# Define the bins for the target variable
|
|
240
|
+
df_min = df.min()
|
|
241
|
+
df_max = df.max()
|
|
242
|
+
bins = [df_min, (df_min + df_max) / 2, df_max]
|
|
189
243
|
|
|
190
|
-
|
|
191
|
-
|
|
244
|
+
# Calculate the base distribution
|
|
245
|
+
train_distribution = expected.value_counts(bins=bins, normalize=True).sort_index().values
|
|
192
246
|
|
|
193
|
-
|
|
194
|
-
|
|
247
|
+
# Calculate the target distribution
|
|
248
|
+
test_distribution = actual.value_counts(bins=bins, normalize=True).sort_index().values
|
|
195
249
|
|
|
196
|
-
|
|
197
|
-
|
|
250
|
+
# Calculate the PSI
|
|
251
|
+
return np.sum((train_distribution - test_distribution) * np.log(train_distribution / test_distribution))
|
|
252
|
+
except Exception as e:
|
|
253
|
+
return e
|
upgini/utils/warning_counter.py
CHANGED
upgini/version_validator.py
CHANGED
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
import json
|
|
2
2
|
import threading
|
|
3
|
+
from typing import Callable, Optional
|
|
3
4
|
|
|
4
5
|
import requests
|
|
5
6
|
|
|
@@ -30,15 +31,18 @@ def get_version(package, url_pattern=URL_PATTERN):
|
|
|
30
31
|
return version
|
|
31
32
|
|
|
32
33
|
|
|
33
|
-
def validate_version(logger: logging.Logger):
|
|
34
|
+
def validate_version(logger: logging.Logger, warning_function: Optional[Callable[[str], None]] = None):
|
|
34
35
|
def task():
|
|
35
36
|
try:
|
|
36
37
|
current_version = parse(__version__)
|
|
37
38
|
latest_version = get_version("upgini")
|
|
38
|
-
if current_version < latest_version:
|
|
39
|
+
if current_version < latest_version:
|
|
39
40
|
msg = bundle.get("version_warning").format(current_version, latest_version)
|
|
40
|
-
|
|
41
|
-
|
|
41
|
+
if warning_function:
|
|
42
|
+
warning_function(msg)
|
|
43
|
+
else:
|
|
44
|
+
logger.warning(msg)
|
|
45
|
+
print(msg)
|
|
42
46
|
except Exception:
|
|
43
47
|
logger.warning("Failed to validate version", exc_info=True)
|
|
44
48
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: upgini
|
|
3
|
-
Version: 1.
|
|
3
|
+
Version: 1.2.31
|
|
4
4
|
Summary: Intelligent data search & enrichment for Machine Learning
|
|
5
5
|
Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
|
|
6
6
|
Project-URL: Homepage, https://upgini.com/
|
|
@@ -22,15 +22,17 @@ Classifier: Programming Language :: Python :: 3.9
|
|
|
22
22
|
Classifier: Programming Language :: Python :: 3.10
|
|
23
23
|
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
24
24
|
Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
|
25
|
-
Requires-Python: <3.
|
|
25
|
+
Requires-Python: <3.12,>=3.8
|
|
26
26
|
Requires-Dist: catboost>=1.0.3
|
|
27
27
|
Requires-Dist: fastparquet>=0.8.1
|
|
28
28
|
Requires-Dist: ipywidgets>=8.1.0
|
|
29
|
-
Requires-Dist:
|
|
30
|
-
Requires-Dist:
|
|
29
|
+
Requires-Dist: jarowinkler>=2.0.0
|
|
30
|
+
Requires-Dist: levenshtein>=0.25.1
|
|
31
|
+
Requires-Dist: numpy<=1.26.4,>=1.19.0
|
|
31
32
|
Requires-Dist: pandas<3.0.0,>=1.1.0
|
|
32
|
-
Requires-Dist: pydantic<
|
|
33
|
+
Requires-Dist: pydantic<3.0.0,>1.0.0
|
|
33
34
|
Requires-Dist: pyjwt>=2.8.0
|
|
35
|
+
Requires-Dist: python-bidi==0.4.2
|
|
34
36
|
Requires-Dist: python-dateutil>=2.8.0
|
|
35
37
|
Requires-Dist: python-json-logger>=2.0.2
|
|
36
38
|
Requires-Dist: requests>=2.8.0
|
|
@@ -130,7 +132,7 @@ Description-Content-Type: text/markdown
|
|
|
130
132
|
|Consumer Confidence index| 44 |22|-|Monthly|date, country|No
|
|
131
133
|
|World economic indicators|191 |41|-|Monthly|date, country|No
|
|
132
134
|
|Markets data|-|17|-|Monthly|date, datetime|No
|
|
133
|
-
|World mobile & fixed broadband network coverage and
|
|
135
|
+
|World mobile & fixed broadband network coverage and performance |167|-|3|Monthly|country, postal/ZIP code|No
|
|
134
136
|
|World demographic data |90|-|2|Annual|country, postal/ZIP code|No
|
|
135
137
|
|World house prices |44|-|3|Annual|country, postal/ZIP code|No
|
|
136
138
|
|Public social media profile data |104|-|-|Monthly|date, email/HEM, phone |Yes
|
|
@@ -143,7 +145,7 @@ Description-Content-Type: text/markdown
|
|
|
143
145
|
|
|
144
146
|
## 💼 Tutorials
|
|
145
147
|
|
|
146
|
-
### [Search of relevant external features & Automated feature generation for Salary
|
|
148
|
+
### [Search of relevant external features & Automated feature generation for Salary prediction task (use as a template)](https://github.com/upgini/upgini/blob/main/notebooks/Upgini_Features_search%26generation.ipynb)
|
|
147
149
|
|
|
148
150
|
* The goal is to predict the salary for a data science job posting based on information about the employer and the job description.
|
|
149
151
|
* Following this guide, you'll learn how to **search & auto generate new relevant features with Upgini library**
|
|
@@ -257,7 +259,9 @@ We do dataset verification and cleaning under the hood, but still there are some
|
|
|
257
259
|
*Search keys* columns will be used to match records from all potential external data sources / features.
|
|
258
260
|
Define one or multiple columns as search keys with `FeaturesEnricher` class initialization.
|
|
259
261
|
```python
|
|
260
|
-
from upgini import FeaturesEnricher
|
|
262
|
+
from upgini.features_enricher import FeaturesEnricher
|
|
263
|
+
from upgini.metadata import SearchKey
|
|
264
|
+
|
|
261
265
|
enricher = FeaturesEnricher(
|
|
262
266
|
search_keys={
|
|
263
267
|
"subscription_activation_date": SearchKey.DATE,
|
|
@@ -343,7 +347,9 @@ enricher = FeaturesEnricher(
|
|
|
343
347
|
|
|
344
348
|
For the meaning types <tt>SearchKey.DATE</tt>/<tt>SearchKey.DATETIME</tt> with dtypes <tt>object</tt> or <tt>string</tt> you have to clarify date/datetime format by passing <tt>date_format</tt> parameter to `FeaturesEnricher`. For example:
|
|
345
349
|
```python
|
|
346
|
-
from upgini import FeaturesEnricher
|
|
350
|
+
from upgini.features_enricher import FeaturesEnricher
|
|
351
|
+
from upgini.metadata import SearchKey
|
|
352
|
+
|
|
347
353
|
enricher = FeaturesEnricher(
|
|
348
354
|
search_keys={
|
|
349
355
|
"subscription_activation_date": SearchKey.DATE,
|
|
@@ -364,7 +370,9 @@ df["date"] = df.date.astype("datetime64").dt.tz_localize("Europe/Warsaw")
|
|
|
364
370
|
|
|
365
371
|
Single country for the whole training dataset can be passed with `country_code` parameter:
|
|
366
372
|
```python
|
|
367
|
-
from upgini import FeaturesEnricher
|
|
373
|
+
from upgini.features_enricher import FeaturesEnricher
|
|
374
|
+
from upgini.metadata import SearchKey
|
|
375
|
+
|
|
368
376
|
enricher = FeaturesEnricher(
|
|
369
377
|
search_keys={
|
|
370
378
|
"subscription_activation_date": SearchKey.DATE,
|
|
@@ -383,7 +391,8 @@ Create instance of the `FeaturesEnricher` class and call:
|
|
|
383
391
|
Let's try it out!
|
|
384
392
|
```python
|
|
385
393
|
import pandas as pd
|
|
386
|
-
from upgini import FeaturesEnricher
|
|
394
|
+
from upgini.features_enricher import FeaturesEnricher
|
|
395
|
+
from upgini.metadata import SearchKey
|
|
387
396
|
|
|
388
397
|
# load labeled training dataset to initiate search
|
|
389
398
|
train_df = pd.read_csv("customer_churn_prediction_train.csv")
|
|
@@ -474,7 +483,9 @@ We detect ML task under the hood based on label column values. Currently we supp
|
|
|
474
483
|
|
|
475
484
|
But for certain search datasets you can pass a parameter to `FeaturesEnricher` with the correct ML task type:
|
|
476
485
|
```python
|
|
477
|
-
from upgini import
|
|
486
|
+
from upgini.features_enricher import FeaturesEnricher
|
|
487
|
+
from upgini.metadata import SearchKey, ModelTaskType
|
|
488
|
+
|
|
478
489
|
enricher = FeaturesEnricher(
|
|
479
490
|
search_keys={"subscription_activation_date": SearchKey.DATE},
|
|
480
491
|
model_task_type=ModelTaskType.REGRESSION
|
|
@@ -487,7 +498,9 @@ enricher = FeaturesEnricher(
|
|
|
487
498
|
|
|
488
499
|
To initiate feature search you can pass cross-validation type parameter to `FeaturesEnricher` with time series specific CV type:
|
|
489
500
|
```python
|
|
490
|
-
from upgini.
|
|
501
|
+
from upgini.features_enricher import FeaturesEnricher
|
|
502
|
+
from upgini.metadata import SearchKey, CVType
|
|
503
|
+
|
|
491
504
|
enricher = FeaturesEnricher(
|
|
492
505
|
search_keys={"sales_date": SearchKey.DATE},
|
|
493
506
|
cv=CVType.time_series
|
|
@@ -621,7 +634,9 @@ But you can easily define new split by passing child of BaseCrossValidator to pa
|
|
|
621
634
|
|
|
622
635
|
Example with more tips-and-tricks:
|
|
623
636
|
```python
|
|
624
|
-
from upgini import FeaturesEnricher
|
|
637
|
+
from upgini.features_enricher import FeaturesEnricher
|
|
638
|
+
from upgini.metadata import SearchKey
|
|
639
|
+
|
|
625
640
|
enricher = FeaturesEnricher(search_keys={"registration_date": SearchKey.DATE})
|
|
626
641
|
|
|
627
642
|
# Fit with default setup for metrics calculation
|
|
@@ -794,7 +809,7 @@ You may publish ANY data which you consider as royalty / license free ([Open Dat
|
|
|
794
809
|
2. Copy *Upgini API key* from profile and upload your data from Upgini python library with this key:
|
|
795
810
|
```python
|
|
796
811
|
import pandas as pd
|
|
797
|
-
from upgini import SearchKey
|
|
812
|
+
from upgini.metadata import SearchKey
|
|
798
813
|
from upgini.ads import upload_user_ads
|
|
799
814
|
import os
|
|
800
815
|
os.environ["UPGINI_API_KEY"] = "your_long_string_api_key_goes_here"
|
|
@@ -839,4 +854,4 @@ Some convenient ways to start contributing are:
|
|
|
839
854
|
- [More perks for registered users](https://profile.upgini.com)
|
|
840
855
|
|
|
841
856
|
<sup>😔 Found a typo or a bug in a code snippet? Our bad! <a href="https://github.com/upgini/upgini/issues/new?assignees=&title=readme%2Fbug">
|
|
842
|
-
Please report it here
|
|
857
|
+
Please report it here</a></sup>
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
upgini/__about__.py,sha256=ZMRxZM_8KClqm4X0jGVzsRbSK2eN35eEoOdQFqr5IU0,23
|
|
2
|
+
upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
|
|
3
|
+
upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
|
|
4
|
+
upgini/dataset.py,sha256=iPFiMJtk4HF1ytw9wCQr8H9RfoOKj_TIo8XYZKWgcMc,31331
|
|
5
|
+
upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
|
|
6
|
+
upgini/features_enricher.py,sha256=lNfu5Z40NmkkGJScKAwe_0VBtL8liePifuAlKE_flfA,192053
|
|
7
|
+
upgini/http.py,sha256=plZGTGoi1h2edd8Cnjt4eYB8t4NbBGnZz7DtPTByiNc,42885
|
|
8
|
+
upgini/lazy_import.py,sha256=74gQ8JuA48BGRLxAo7lNHNKY2D2emMxrUxKGdxVGhuY,1012
|
|
9
|
+
upgini/metadata.py,sha256=lUa2xYhBhnCeTqNt6lWc9iP_YuikYGIsDSn8Vwyjv1I,11235
|
|
10
|
+
upgini/metrics.py,sha256=hr7UwLphbZ_FEglLuO2lzr_pFgxOJ4c3WBeg7H-fNqY,35521
|
|
11
|
+
upgini/search_task.py,sha256=qxUxAD-bed-FpZYmTB_4orW7YJsW_O6a1TcgnZIRFr4,17307
|
|
12
|
+
upgini/spinner.py,sha256=4iMd-eIe_BnkqFEMIliULTbj6rNI2HkN_VJ4qYe0cUc,1118
|
|
13
|
+
upgini/version_validator.py,sha256=h1GViOWzULy5vf6M4dpTJuIk-4V38UCrTY1sb9yLa5I,1594
|
|
14
|
+
upgini/ads_management/__init__.py,sha256=qzyisOToVRP-tquAJD1PblZhNtMrOB8FiyF9JvfkvgE,50
|
|
15
|
+
upgini/ads_management/ads_manager.py,sha256=igVbN2jz80Umb2BUJixmJVj-zx8unoKpecVo-R-nGdw,2648
|
|
16
|
+
upgini/autofe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
17
|
+
upgini/autofe/all_operands.py,sha256=cCCB44qvkmuWyiRM5Xykx8tkHPIjQthrWyj67STWN80,2578
|
|
18
|
+
upgini/autofe/binary.py,sha256=zMhtHVuGUAFLUqem-XiXqJj-GRXxS88tdz8tFuDfSNM,7659
|
|
19
|
+
upgini/autofe/date.py,sha256=OpFc3Al0xO3qlESn2Uokfxw51ArVqmh3xngWwdrsaqE,9762
|
|
20
|
+
upgini/autofe/feature.py,sha256=eL7wABUhDKZzv3E-RPJNcyGwSfB0UptcfU2RbvsOks4,15082
|
|
21
|
+
upgini/autofe/groupby.py,sha256=r-xl_keZZgm_tpiEoDhjYSkT6NHv7a4cRQR4wJ4uCp8,3263
|
|
22
|
+
upgini/autofe/operand.py,sha256=uk883RaNqgXqtkaRqA1re1d9OFnnpv0JVvelYx09Yw0,2943
|
|
23
|
+
upgini/autofe/unary.py,sha256=T3E7F3dA_7o_rkdCFq7JV6nHLzcoHLHQTcxO7y5Opa4,4646
|
|
24
|
+
upgini/autofe/vector.py,sha256=ehcZUDqV71TfbU8EmKfdYp603gS2dJY_-fpr10ho5sI,663
|
|
25
|
+
upgini/data_source/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
26
|
+
upgini/data_source/data_source_publisher.py,sha256=X-8aGtVgzGmxyXkMVBoBLIGDMb4lYQaGZbxDnOd4A3Q,22516
|
|
27
|
+
upgini/mdc/__init__.py,sha256=aM08nIWFc2gWdWUa3_IuEnNND0cQPkBGnYpRMnfFN8k,1019
|
|
28
|
+
upgini/mdc/context.py,sha256=3u1B-jXt7tXEvNcV3qmR9SDCseudnY7KYsLclBdwVLk,1405
|
|
29
|
+
upgini/normalizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
30
|
+
upgini/normalizer/normalize_utils.py,sha256=Ft2MwSgVoBilXAORAOYAuwPD79GOLfwn4qQE3IUFzzg,7218
|
|
31
|
+
upgini/resource_bundle/__init__.py,sha256=S5F2G47pnJd2LDpmFsjDqEwiKkP8Hm-hcseDbMka6Ko,8345
|
|
32
|
+
upgini/resource_bundle/exceptions.py,sha256=5fRvx0_vWdE1-7HcSgF0tckB4A9AKyf5RiinZkInTsI,621
|
|
33
|
+
upgini/resource_bundle/strings.properties,sha256=bKw_rjZZTomLJhQBqiM7_P2EoRq45_Ng2gP4WE6MRBE,26921
|
|
34
|
+
upgini/resource_bundle/strings_widget.properties,sha256=gOdqvZWntP2LCza_tyVk1_yRYcG4c04K9sQOAVhF_gw,1577
|
|
35
|
+
upgini/sampler/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
36
|
+
upgini/sampler/base.py,sha256=7GpjYqjOp58vYcJLiX__1R5wjUlyQbxvHJ2klFnup_M,6389
|
|
37
|
+
upgini/sampler/random_under_sampler.py,sha256=TIbm7ATo-bCMF-IiS5sZeDC1ad1SYg0eY_rRmg84yIQ,4024
|
|
38
|
+
upgini/sampler/utils.py,sha256=PYOk3kKSnFlyxcpdtDNLBEEhTB4lO_iP7pQHqeUcmAc,20211
|
|
39
|
+
upgini/utils/Roboto-Regular.ttf,sha256=kqYnZjMRQMpbyLulIChCLSdgYa1XF8GsUIoRi2Gcauw,168260
|
|
40
|
+
upgini/utils/__init__.py,sha256=O_KgzKiJjW3g4NoqZ7lAxUpoHcBi_gze6r3ndEjCH74,842
|
|
41
|
+
upgini/utils/base_search_key_detector.py,sha256=Inc6iGG-VXQdejWFfbekIkZk2ahC4k7CdGqzOkie6Bs,1021
|
|
42
|
+
upgini/utils/blocked_time_series.py,sha256=Uqr3vp4YqNclj2-PzEYqVy763GSXHn86sbpIl1UOB4s,3382
|
|
43
|
+
upgini/utils/country_utils.py,sha256=lY-eXWwFVegdVENFttbvLcgGDjFO17Sex8hd2PyJaRk,6937
|
|
44
|
+
upgini/utils/custom_loss_utils.py,sha256=kieNZYBYZm5ZGBltF1F_jOSF4ea6C29rYuCyiDcqVNY,3857
|
|
45
|
+
upgini/utils/cv_utils.py,sha256=w6FQb9nO8BWDx88EF83NpjPLarK4eR4ia0Wg0kLBJC4,3525
|
|
46
|
+
upgini/utils/datetime_utils.py,sha256=F61i2vZCB6eUy4WwodDyPi50XKPbhOHsxDrU6tGa6CM,13133
|
|
47
|
+
upgini/utils/deduplicate_utils.py,sha256=SMZx9IKIhWI5HqXepfKiQb3uDJrogQZtG6jcWuMo5Z4,8855
|
|
48
|
+
upgini/utils/display_utils.py,sha256=DsBjJ8jEYAh8BPgfAbzq5imoGFV6IACP20PQ78BQCX0,11964
|
|
49
|
+
upgini/utils/email_utils.py,sha256=GbnhHJn1nhUBytmK6PophYqaoq4t7Lp6i0-O0Gd3RV8,5265
|
|
50
|
+
upgini/utils/fallback_progress_bar.py,sha256=PDaKb8dYpVZaWMroNcOHsTc3pSjgi9mOm0--cOFTwJ0,1074
|
|
51
|
+
upgini/utils/feature_info.py,sha256=Tp_2g5-rCjY4NpzKhzxwNxuqH5FFL8vG94OU5kH6wzk,6702
|
|
52
|
+
upgini/utils/features_validator.py,sha256=lEfmk4DoxZ4ooOE1HC0ZXtUb_lFKRFHIrnFULZ4_rL8,3746
|
|
53
|
+
upgini/utils/format.py,sha256=Yv5cvvSs2bOLUzzNu96Pu33VMDNbabio92QepUj41jU,243
|
|
54
|
+
upgini/utils/ip_utils.py,sha256=Q6vb7Sr5Khx3Sq3eENjW2qCXKej_S5jZbneH6zEOkzQ,5171
|
|
55
|
+
upgini/utils/phone_utils.py,sha256=IrbztLuOJBiePqqxllfABWfYlfAjYevPhXKipl95wUI,10432
|
|
56
|
+
upgini/utils/postal_code_utils.py,sha256=5M0sUqH2DAr33kARWCTXR-ACyzWbjDq_-0mmEml6ZcU,1716
|
|
57
|
+
upgini/utils/progress_bar.py,sha256=N-Sfdah2Hg8lXP_fV9EfUTXz_PyRt4lo9fAHoUDOoLc,1550
|
|
58
|
+
upgini/utils/sklearn_ext.py,sha256=13jQS_k7v0aUtudXV6nGUEWjttPQzAW9AFYL5wgEz9k,44511
|
|
59
|
+
upgini/utils/target_utils.py,sha256=PU77nIhTz7IHbC4rpTpxrVxib6cdpRL9F1dhkjIffLY,10225
|
|
60
|
+
upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
|
|
61
|
+
upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
|
|
62
|
+
upgini-1.2.31.dist-info/METADATA,sha256=_OJUvR8p-0uuVdltUq34yo_W5OZZvKOlID5OHlYY9Do,48578
|
|
63
|
+
upgini-1.2.31.dist-info/WHEEL,sha256=zEMcRr9Kr03x1ozGwg5v9NQBKn3kndp6LSoSlVg-jhU,87
|
|
64
|
+
upgini-1.2.31.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
|
|
65
|
+
upgini-1.2.31.dist-info/RECORD,,
|