upgini 1.1.275a1__py3-none-any.whl → 1.1.275a99__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- upgini/autofe/date.py +9 -2
- upgini/data_source/data_source_publisher.py +1 -1
- upgini/dataset.py +2 -10
- upgini/features_enricher.py +150 -218
- upgini/metadata.py +1 -9
- upgini/metrics.py +12 -0
- upgini/normalizer/phone_normalizer.py +2 -2
- upgini/resource_bundle/strings.properties +2 -2
- upgini/utils/base_search_key_detector.py +12 -14
- upgini/utils/datetime_utils.py +3 -0
- upgini/utils/deduplicate_utils.py +1 -11
- upgini/utils/email_utils.py +0 -5
- upgini/utils/features_validator.py +2 -1
- upgini/utils/track_info.py +25 -13
- {upgini-1.1.275a1.dist-info → upgini-1.1.275a99.dist-info}/METADATA +2 -2
- {upgini-1.1.275a1.dist-info → upgini-1.1.275a99.dist-info}/RECORD +19 -19
- {upgini-1.1.275a1.dist-info → upgini-1.1.275a99.dist-info}/LICENSE +0 -0
- {upgini-1.1.275a1.dist-info → upgini-1.1.275a99.dist-info}/WHEEL +0 -0
- {upgini-1.1.275a1.dist-info → upgini-1.1.275a99.dist-info}/top_level.txt +0 -0
upgini/autofe/date.py
CHANGED
|
@@ -2,6 +2,7 @@ from typing import Any, Optional, Union
|
|
|
2
2
|
import numpy as np
|
|
3
3
|
import pandas as pd
|
|
4
4
|
from pydantic import BaseModel
|
|
5
|
+
from pandas.core.arrays.timedeltas import TimedeltaArray
|
|
5
6
|
|
|
6
7
|
from upgini.autofe.operand import PandasOperand
|
|
7
8
|
|
|
@@ -46,6 +47,7 @@ class DateDiffType2(PandasOperand, DateDiffMixin):
|
|
|
46
47
|
future = right + (left.dt.year - right.dt.year).apply(
|
|
47
48
|
lambda y: np.datetime64("NaT") if np.isnan(y) else pd.tseries.offsets.DateOffset(years=y)
|
|
48
49
|
)
|
|
50
|
+
future = pd.to_datetime(future)
|
|
49
51
|
before = future[future < left]
|
|
50
52
|
future[future < left] = before + pd.tseries.offsets.DateOffset(years=1)
|
|
51
53
|
diff = (future - left) / np.timedelta64(1, self.diff_unit)
|
|
@@ -72,8 +74,13 @@ class DateListDiff(PandasOperand, DateDiffMixin):
|
|
|
72
74
|
|
|
73
75
|
return pd.Series(left - right.values).apply(lambda x: self._agg(self._diff(x)))
|
|
74
76
|
|
|
75
|
-
def _diff(self, x):
|
|
76
|
-
|
|
77
|
+
def _diff(self, x: TimedeltaArray):
|
|
78
|
+
if self.diff_unit == "Y":
|
|
79
|
+
x = (x / 365 / 24 / 60 / 60 / 10**9).astype(int)
|
|
80
|
+
elif self.diff_unit == "M":
|
|
81
|
+
raise Exception("Unsupported difference unit: Month")
|
|
82
|
+
else:
|
|
83
|
+
x = x / np.timedelta64(1, self.diff_unit)
|
|
77
84
|
return x[x > 0]
|
|
78
85
|
|
|
79
86
|
def _agg(self, x):
|
|
@@ -48,6 +48,7 @@ class DataSourcePublisher:
|
|
|
48
48
|
data_table_uri: str,
|
|
49
49
|
search_keys: Dict[str, SearchKey],
|
|
50
50
|
update_frequency: str,
|
|
51
|
+
exclude_from_autofe_generation: Optional[List[str]],
|
|
51
52
|
secondary_search_keys: Optional[Dict[str, SearchKey]] = None,
|
|
52
53
|
sort_column: Optional[str] = None,
|
|
53
54
|
date_format: Optional[str] = None,
|
|
@@ -57,7 +58,6 @@ class DataSourcePublisher:
|
|
|
57
58
|
join_date_abs_limit_days: Optional[int] = None,
|
|
58
59
|
features_for_embeddings: Optional[List[str]] = DEFAULT_GENERATE_EMBEDDINGS,
|
|
59
60
|
data_table_id_to_replace: Optional[str] = None,
|
|
60
|
-
exclude_from_autofe_generation: Optional[List[str]] = None,
|
|
61
61
|
_force_generation=False,
|
|
62
62
|
_silent=False,
|
|
63
63
|
) -> str:
|
upgini/dataset.py
CHANGED
|
@@ -22,9 +22,7 @@ from pandas.api.types import (
|
|
|
22
22
|
from upgini.errors import ValidationError
|
|
23
23
|
from upgini.http import ProgressStage, SearchProgress, _RestClient
|
|
24
24
|
from upgini.metadata import (
|
|
25
|
-
ENTITY_SYSTEM_RECORD_ID,
|
|
26
25
|
EVAL_SET_INDEX,
|
|
27
|
-
SEARCH_KEY_UNNEST,
|
|
28
26
|
SYSTEM_COLUMNS,
|
|
29
27
|
SYSTEM_RECORD_ID,
|
|
30
28
|
TARGET,
|
|
@@ -80,7 +78,6 @@ class Dataset: # (pd.DataFrame):
|
|
|
80
78
|
path: Optional[str] = None,
|
|
81
79
|
meaning_types: Optional[Dict[str, FileColumnMeaningType]] = None,
|
|
82
80
|
search_keys: Optional[List[Tuple[str, ...]]] = None,
|
|
83
|
-
unnest_search_keys: Optional[List[str]] = None,
|
|
84
81
|
model_task_type: Optional[ModelTaskType] = None,
|
|
85
82
|
random_state: Optional[int] = None,
|
|
86
83
|
rest_client: Optional[_RestClient] = None,
|
|
@@ -115,7 +112,6 @@ class Dataset: # (pd.DataFrame):
|
|
|
115
112
|
self.description = description
|
|
116
113
|
self.meaning_types = meaning_types
|
|
117
114
|
self.search_keys = search_keys
|
|
118
|
-
self.unnest_search_keys = unnest_search_keys
|
|
119
115
|
self.ignore_columns = []
|
|
120
116
|
self.hierarchical_group_keys = []
|
|
121
117
|
self.hierarchical_subgroup_keys = []
|
|
@@ -175,7 +171,7 @@ class Dataset: # (pd.DataFrame):
|
|
|
175
171
|
new_columns = []
|
|
176
172
|
dup_counter = 0
|
|
177
173
|
for column in self.data.columns:
|
|
178
|
-
if column in [TARGET, EVAL_SET_INDEX, SYSTEM_RECORD_ID
|
|
174
|
+
if column in [TARGET, EVAL_SET_INDEX, SYSTEM_RECORD_ID]:
|
|
179
175
|
self.columns_renaming[column] = column
|
|
180
176
|
new_columns.append(column)
|
|
181
177
|
continue
|
|
@@ -356,9 +352,7 @@ class Dataset: # (pd.DataFrame):
|
|
|
356
352
|
|
|
357
353
|
if is_string_dtype(self.data[postal_code]):
|
|
358
354
|
try:
|
|
359
|
-
self.data[postal_code] = (
|
|
360
|
-
self.data[postal_code].astype("string").astype("Float64").astype("Int64").astype("string")
|
|
361
|
-
)
|
|
355
|
+
self.data[postal_code] = self.data[postal_code].astype("float64").astype("Int64").astype("string")
|
|
362
356
|
except Exception:
|
|
363
357
|
pass
|
|
364
358
|
elif is_float_dtype(self.data[postal_code]):
|
|
@@ -808,8 +802,6 @@ class Dataset: # (pd.DataFrame):
|
|
|
808
802
|
meaningType=meaning_type,
|
|
809
803
|
minMaxValues=min_max_values,
|
|
810
804
|
)
|
|
811
|
-
if self.unnest_search_keys and column_meta.originalName in self.unnest_search_keys:
|
|
812
|
-
column_meta.isUnnest = True
|
|
813
805
|
|
|
814
806
|
columns.append(column_meta)
|
|
815
807
|
|
upgini/features_enricher.py
CHANGED
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import dataclasses
|
|
2
|
+
import datetime
|
|
2
3
|
import gc
|
|
3
4
|
import hashlib
|
|
4
5
|
import itertools
|
|
@@ -10,7 +11,6 @@ import sys
|
|
|
10
11
|
import tempfile
|
|
11
12
|
import time
|
|
12
13
|
import uuid
|
|
13
|
-
from collections import Counter
|
|
14
14
|
from dataclasses import dataclass
|
|
15
15
|
from threading import Thread
|
|
16
16
|
from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union
|
|
@@ -44,11 +44,9 @@ from upgini.mdc import MDC
|
|
|
44
44
|
from upgini.metadata import (
|
|
45
45
|
COUNTRY,
|
|
46
46
|
DEFAULT_INDEX,
|
|
47
|
-
ENTITY_SYSTEM_RECORD_ID,
|
|
48
47
|
EVAL_SET_INDEX,
|
|
49
48
|
ORIGINAL_INDEX,
|
|
50
49
|
RENAMED_INDEX,
|
|
51
|
-
SEARCH_KEY_UNNEST,
|
|
52
50
|
SORT_ID,
|
|
53
51
|
SYSTEM_RECORD_ID,
|
|
54
52
|
TARGET,
|
|
@@ -149,6 +147,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
149
147
|
"""
|
|
150
148
|
|
|
151
149
|
TARGET_NAME = "target"
|
|
150
|
+
CURRENT_DATE = "current_date"
|
|
152
151
|
RANDOM_STATE = 42
|
|
153
152
|
CALCULATE_METRICS_THRESHOLD = 50_000_000
|
|
154
153
|
CALCULATE_METRICS_MIN_THRESHOLD = 500
|
|
@@ -210,6 +209,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
210
209
|
client_ip: Optional[str] = None,
|
|
211
210
|
client_visitorid: Optional[str] = None,
|
|
212
211
|
custom_bundle_config: Optional[str] = None,
|
|
212
|
+
add_date_if_missing: bool = True,
|
|
213
213
|
**kwargs,
|
|
214
214
|
):
|
|
215
215
|
self.bundle = get_custom_bundle(custom_bundle_config)
|
|
@@ -320,6 +320,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
320
320
|
self.raise_validation_error = raise_validation_error
|
|
321
321
|
self.exclude_columns = exclude_columns
|
|
322
322
|
self.baseline_score_column = baseline_score_column
|
|
323
|
+
self.add_date_if_missing = add_date_if_missing
|
|
323
324
|
|
|
324
325
|
def _get_api_key(self):
|
|
325
326
|
return self._api_key
|
|
@@ -423,6 +424,9 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
423
424
|
|
|
424
425
|
self.__validate_search_keys(self.search_keys, self.search_id)
|
|
425
426
|
|
|
427
|
+
# Validate client estimator params
|
|
428
|
+
self._get_client_cat_features(estimator, X, self.search_keys)
|
|
429
|
+
|
|
426
430
|
try:
|
|
427
431
|
self.X = X
|
|
428
432
|
self.y = y
|
|
@@ -816,6 +820,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
816
820
|
trace_id = trace_id or str(uuid.uuid4())
|
|
817
821
|
start_time = time.time()
|
|
818
822
|
with MDC(trace_id=trace_id):
|
|
823
|
+
self.logger.info("Start calculate metrics")
|
|
819
824
|
if len(args) > 0:
|
|
820
825
|
msg = f"WARNING: Unsupported positional arguments for calculate_metrics: {args}"
|
|
821
826
|
self.logger.warning(msg)
|
|
@@ -867,22 +872,9 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
867
872
|
self.__display_support_link(msg)
|
|
868
873
|
return None
|
|
869
874
|
|
|
870
|
-
cat_features =
|
|
871
|
-
|
|
872
|
-
|
|
873
|
-
estimator is not None
|
|
874
|
-
and hasattr(estimator, "get_param")
|
|
875
|
-
and estimator.get_param("cat_features") is not None
|
|
876
|
-
):
|
|
877
|
-
cat_features = estimator.get_param("cat_features")
|
|
878
|
-
if len(cat_features) > 0 and isinstance(cat_features[0], int):
|
|
879
|
-
cat_features = [effective_X.columns[i] for i in cat_features]
|
|
880
|
-
for cat_feature in cat_features:
|
|
881
|
-
if cat_feature in self.search_keys:
|
|
882
|
-
if self.search_keys[cat_feature] in [SearchKey.COUNTRY, SearchKey.POSTAL_CODE]:
|
|
883
|
-
search_keys_for_metrics.append(cat_feature)
|
|
884
|
-
else:
|
|
885
|
-
raise ValidationError(self.bundle.get("cat_feature_search_key").format(cat_feature))
|
|
875
|
+
cat_features, search_keys_for_metrics = self._get_client_cat_features(
|
|
876
|
+
estimator, effective_X, self.search_keys
|
|
877
|
+
)
|
|
886
878
|
|
|
887
879
|
prepared_data = self._prepare_data_for_metrics(
|
|
888
880
|
trace_id=trace_id,
|
|
@@ -897,6 +889,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
897
889
|
search_keys_for_metrics=search_keys_for_metrics,
|
|
898
890
|
progress_bar=progress_bar,
|
|
899
891
|
progress_callback=progress_callback,
|
|
892
|
+
cat_features=cat_features,
|
|
900
893
|
)
|
|
901
894
|
if prepared_data is None:
|
|
902
895
|
return None
|
|
@@ -1184,8 +1177,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1184
1177
|
search_keys = self.search_keys.copy()
|
|
1185
1178
|
search_keys = self.__prepare_search_keys(x, search_keys, is_demo_dataset, is_transform=True, silent_mode=True)
|
|
1186
1179
|
|
|
1187
|
-
unnest_search_keys = []
|
|
1188
|
-
|
|
1189
1180
|
extended_X = x.copy()
|
|
1190
1181
|
generated_features = []
|
|
1191
1182
|
date_column = self._get_date_column(search_keys)
|
|
@@ -1196,7 +1187,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1196
1187
|
email_column = self._get_email_column(search_keys)
|
|
1197
1188
|
hem_column = self._get_hem_column(search_keys)
|
|
1198
1189
|
if email_column:
|
|
1199
|
-
converter = EmailSearchKeyConverter(email_column, hem_column, search_keys,
|
|
1190
|
+
converter = EmailSearchKeyConverter(email_column, hem_column, search_keys, self.logger)
|
|
1200
1191
|
extended_X = converter.convert(extended_X)
|
|
1201
1192
|
generated_features.extend(converter.generated_features)
|
|
1202
1193
|
if (
|
|
@@ -1274,6 +1265,29 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1274
1265
|
|
|
1275
1266
|
return _cv, groups
|
|
1276
1267
|
|
|
1268
|
+
def _get_client_cat_features(
|
|
1269
|
+
self, estimator: Optional[Any], X: pd.DataFrame, search_keys: Dict[str, SearchKey]
|
|
1270
|
+
) -> Optional[List[str]]:
|
|
1271
|
+
cat_features = None
|
|
1272
|
+
search_keys_for_metrics = []
|
|
1273
|
+
if (
|
|
1274
|
+
estimator is not None
|
|
1275
|
+
and hasattr(estimator, "get_param")
|
|
1276
|
+
and estimator.get_param("cat_features") is not None
|
|
1277
|
+
):
|
|
1278
|
+
cat_features = estimator.get_param("cat_features")
|
|
1279
|
+
if len(cat_features) > 0:
|
|
1280
|
+
if all([isinstance(f, int) for f in cat_features]):
|
|
1281
|
+
cat_features = [X.columns[i] for i in cat_features]
|
|
1282
|
+
self.logger.info(f"Collected categorical features {cat_features} from user estimator")
|
|
1283
|
+
for cat_feature in cat_features:
|
|
1284
|
+
if cat_feature in search_keys:
|
|
1285
|
+
if search_keys[cat_feature] in [SearchKey.COUNTRY, SearchKey.POSTAL_CODE]:
|
|
1286
|
+
search_keys_for_metrics.append(cat_feature)
|
|
1287
|
+
else:
|
|
1288
|
+
raise ValidationError(self.bundle.get("cat_feature_search_key").format(cat_feature))
|
|
1289
|
+
return cat_features, search_keys_for_metrics
|
|
1290
|
+
|
|
1277
1291
|
def _prepare_data_for_metrics(
|
|
1278
1292
|
self,
|
|
1279
1293
|
trace_id: str,
|
|
@@ -1288,6 +1302,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1288
1302
|
search_keys_for_metrics: Optional[List[str]] = None,
|
|
1289
1303
|
progress_bar: Optional[ProgressBar] = None,
|
|
1290
1304
|
progress_callback: Optional[Callable[[SearchProgress], Any]] = None,
|
|
1305
|
+
cat_features: Optional[List[str]] = None,
|
|
1291
1306
|
):
|
|
1292
1307
|
is_input_same_as_fit, X, y, eval_set = self._is_input_same_as_fit(X, y, eval_set)
|
|
1293
1308
|
is_demo_dataset = hash_input(X, y, eval_set) in DEMO_DATASET_HASHES
|
|
@@ -1345,9 +1360,8 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1345
1360
|
|
|
1346
1361
|
# Detect and drop high cardinality columns in train
|
|
1347
1362
|
columns_with_high_cardinality = FeaturesValidator.find_high_cardinality(fitting_X)
|
|
1348
|
-
|
|
1349
|
-
|
|
1350
|
-
]
|
|
1363
|
+
non_excluding_columns = (self.generate_features or []) + (cat_features or [])
|
|
1364
|
+
columns_with_high_cardinality = [c for c in columns_with_high_cardinality if c not in non_excluding_columns]
|
|
1351
1365
|
if len(columns_with_high_cardinality) > 0:
|
|
1352
1366
|
self.logger.warning(
|
|
1353
1367
|
f"High cardinality columns {columns_with_high_cardinality} will be dropped for metrics calculation"
|
|
@@ -1809,10 +1823,11 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1809
1823
|
else:
|
|
1810
1824
|
features_section = ""
|
|
1811
1825
|
|
|
1812
|
-
|
|
1826
|
+
search_id = self._search_task.search_task_id
|
|
1827
|
+
api_example = f"""curl 'https://search.upgini.com/online/api/http_inference_trigger?search_id={search_id}' \\
|
|
1813
1828
|
-H 'Authorization: {self.api_key}' \\
|
|
1814
1829
|
-H 'Content-Type: application/json' \\
|
|
1815
|
-
-d '{{"
|
|
1830
|
+
-d '{{"search_keys": {keys}{features_section}}}'"""
|
|
1816
1831
|
return api_example
|
|
1817
1832
|
|
|
1818
1833
|
def _get_copy_of_runtime_parameters(self) -> RuntimeParameters:
|
|
@@ -1907,38 +1922,13 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1907
1922
|
generated_features.extend(converter.generated_features)
|
|
1908
1923
|
else:
|
|
1909
1924
|
self.logger.info("Input dataset hasn't date column")
|
|
1910
|
-
|
|
1911
|
-
|
|
1912
|
-
original_features_for_transform = []
|
|
1913
|
-
runtime_parameters = self._get_copy_of_runtime_parameters()
|
|
1914
|
-
features_not_to_pass = [column for column in df.columns if column not in search_keys.keys()]
|
|
1915
|
-
if len(features_not_to_pass) > 0:
|
|
1916
|
-
# Pass only features that need for transform
|
|
1917
|
-
features_for_transform = self._search_task.get_features_for_transform()
|
|
1918
|
-
if features_for_transform is not None and len(features_for_transform) > 0:
|
|
1919
|
-
file_metadata = self._search_task.get_file_metadata(trace_id)
|
|
1920
|
-
original_features_for_transform = [
|
|
1921
|
-
c.originalName or c.name for c in file_metadata.columns if c.name in features_for_transform
|
|
1922
|
-
]
|
|
1923
|
-
|
|
1924
|
-
runtime_parameters.properties["features_for_embeddings"] = ",".join(features_for_transform)
|
|
1925
|
-
|
|
1926
|
-
columns_for_system_record_id = sorted(list(search_keys.keys()) + (original_features_for_transform))
|
|
1927
|
-
|
|
1928
|
-
df[ENTITY_SYSTEM_RECORD_ID] = pd.util.hash_pandas_object(
|
|
1929
|
-
df[columns_for_system_record_id], index=False
|
|
1930
|
-
).astype("Float64")
|
|
1931
|
-
|
|
1932
|
-
# Explode multiple search keys
|
|
1933
|
-
df, unnest_search_keys = self._explode_multiple_search_keys(df, search_keys)
|
|
1934
|
-
|
|
1925
|
+
if self.add_date_if_missing:
|
|
1926
|
+
df = self._add_current_date_as_key(df, search_keys, self.logger, self.bundle)
|
|
1935
1927
|
email_column = self._get_email_column(search_keys)
|
|
1936
1928
|
hem_column = self._get_hem_column(search_keys)
|
|
1937
1929
|
email_converted_to_hem = False
|
|
1938
1930
|
if email_column:
|
|
1939
|
-
converter = EmailSearchKeyConverter(
|
|
1940
|
-
email_column, hem_column, search_keys, unnest_search_keys, self.logger
|
|
1941
|
-
)
|
|
1931
|
+
converter = EmailSearchKeyConverter(email_column, hem_column, search_keys, self.logger)
|
|
1942
1932
|
df = converter.convert(df)
|
|
1943
1933
|
generated_features.extend(converter.generated_features)
|
|
1944
1934
|
email_converted_to_hem = converter.email_converted_to_hem
|
|
@@ -1952,21 +1942,30 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1952
1942
|
generated_features = [f for f in generated_features if f in self.fit_generated_features]
|
|
1953
1943
|
|
|
1954
1944
|
meaning_types = {col: key.value for col, key in search_keys.items()}
|
|
1955
|
-
|
|
1956
|
-
for col in original_features_for_transform:
|
|
1957
|
-
meaning_types[col] = FileColumnMeaningType.FEATURE
|
|
1958
|
-
features_not_to_pass = [column for column in features_not_to_pass if column not in search_keys.keys()]
|
|
1945
|
+
non_keys_columns = [column for column in df.columns if column not in search_keys.keys()]
|
|
1959
1946
|
|
|
1960
1947
|
if email_converted_to_hem:
|
|
1961
|
-
|
|
1948
|
+
non_keys_columns.append(email_column)
|
|
1949
|
+
|
|
1950
|
+
# Don't pass features in backend on transform
|
|
1951
|
+
original_features_for_transform = None
|
|
1952
|
+
runtime_parameters = self._get_copy_of_runtime_parameters()
|
|
1953
|
+
if len(non_keys_columns) > 0:
|
|
1954
|
+
# Pass only features that need for transform
|
|
1955
|
+
features_for_transform = self._search_task.get_features_for_transform()
|
|
1956
|
+
if features_for_transform is not None and len(features_for_transform) > 0:
|
|
1957
|
+
file_metadata = self._search_task.get_file_metadata(trace_id)
|
|
1958
|
+
original_features_for_transform = [
|
|
1959
|
+
c.originalName or c.name for c in file_metadata.columns if c.name in features_for_transform
|
|
1960
|
+
]
|
|
1961
|
+
non_keys_columns = [c for c in non_keys_columns if c not in original_features_for_transform]
|
|
1962
1962
|
|
|
1963
|
-
|
|
1964
|
-
columns_for_system_record_id = sorted(list(search_keys.keys()) + (original_features_for_transform))
|
|
1963
|
+
runtime_parameters.properties["features_for_embeddings"] = ",".join(features_for_transform)
|
|
1965
1964
|
|
|
1966
1965
|
if add_fit_system_record_id:
|
|
1967
1966
|
df = self.__add_fit_system_record_id(df, dict(), search_keys)
|
|
1968
1967
|
df = df.rename(columns={SYSTEM_RECORD_ID: SORT_ID})
|
|
1969
|
-
|
|
1968
|
+
non_keys_columns.append(SORT_ID)
|
|
1970
1969
|
|
|
1971
1970
|
columns_for_system_record_id = sorted(list(search_keys.keys()) + (original_features_for_transform or []))
|
|
1972
1971
|
|
|
@@ -1974,19 +1973,16 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1974
1973
|
"Float64"
|
|
1975
1974
|
)
|
|
1976
1975
|
meaning_types[SYSTEM_RECORD_ID] = FileColumnMeaningType.SYSTEM_RECORD_ID
|
|
1977
|
-
meaning_types[ENTITY_SYSTEM_RECORD_ID] = FileColumnMeaningType.ENTITY_SYSTEM_RECORD_ID
|
|
1978
|
-
if SEARCH_KEY_UNNEST in df.columns:
|
|
1979
|
-
meaning_types[SEARCH_KEY_UNNEST] = FileColumnMeaningType.UNNEST_KEY
|
|
1980
1976
|
|
|
1981
1977
|
df = df.reset_index(drop=True)
|
|
1982
|
-
system_columns_with_original_index = [SYSTEM_RECORD_ID
|
|
1978
|
+
system_columns_with_original_index = [SYSTEM_RECORD_ID] + generated_features
|
|
1983
1979
|
if add_fit_system_record_id:
|
|
1984
1980
|
system_columns_with_original_index.append(SORT_ID)
|
|
1985
1981
|
df_with_original_index = df[system_columns_with_original_index].copy()
|
|
1986
1982
|
|
|
1987
1983
|
combined_search_keys = combine_search_keys(search_keys.keys())
|
|
1988
1984
|
|
|
1989
|
-
df_without_features = df.drop(columns=
|
|
1985
|
+
df_without_features = df.drop(columns=non_keys_columns)
|
|
1990
1986
|
|
|
1991
1987
|
df_without_features = clean_full_duplicates(
|
|
1992
1988
|
df_without_features, self.logger, silent=silent_mode, bundle=self.bundle
|
|
@@ -2142,14 +2138,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2142
2138
|
|
|
2143
2139
|
key_types = search_keys.values()
|
|
2144
2140
|
|
|
2145
|
-
# Multiple search keys allowed only for PHONE, IP, POSTAL_CODE, EMAIL, HEM
|
|
2146
|
-
multi_keys = [key for key, count in Counter(key_types).items() if count > 1]
|
|
2147
|
-
for multi_key in multi_keys:
|
|
2148
|
-
if multi_key not in [SearchKey.PHONE, SearchKey.IP, SearchKey.POSTAL_CODE, SearchKey.EMAIL, SearchKey.HEM]:
|
|
2149
|
-
msg = self.bundle.get("unsupported_multi_key").format(multi_key)
|
|
2150
|
-
self.logger.warning(msg)
|
|
2151
|
-
raise ValidationError(msg)
|
|
2152
|
-
|
|
2153
2141
|
if SearchKey.DATE in key_types and SearchKey.DATETIME in key_types:
|
|
2154
2142
|
msg = self.bundle.get("date_and_datetime_simultanious")
|
|
2155
2143
|
self.logger.warning(msg)
|
|
@@ -2165,11 +2153,11 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2165
2153
|
self.logger.warning(msg)
|
|
2166
2154
|
raise ValidationError(msg)
|
|
2167
2155
|
|
|
2168
|
-
|
|
2169
|
-
|
|
2170
|
-
|
|
2171
|
-
|
|
2172
|
-
|
|
2156
|
+
for key_type in SearchKey.__members__.values():
|
|
2157
|
+
if key_type != SearchKey.CUSTOM_KEY and list(key_types).count(key_type) > 1:
|
|
2158
|
+
msg = self.bundle.get("multiple_search_key").format(key_type)
|
|
2159
|
+
self.logger.warning(msg)
|
|
2160
|
+
raise ValidationError(msg)
|
|
2173
2161
|
|
|
2174
2162
|
# non_personal_keys = set(SearchKey.__members__.values()) - set(SearchKey.personal_keys())
|
|
2175
2163
|
# if (
|
|
@@ -2305,7 +2293,16 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2305
2293
|
self.fit_generated_features.extend(converter.generated_features)
|
|
2306
2294
|
else:
|
|
2307
2295
|
self.logger.info("Input dataset hasn't date column")
|
|
2308
|
-
|
|
2296
|
+
if self.add_date_if_missing:
|
|
2297
|
+
df = self._add_current_date_as_key(df, self.fit_search_keys, self.logger, self.bundle)
|
|
2298
|
+
email_column = self._get_email_column(self.fit_search_keys)
|
|
2299
|
+
hem_column = self._get_hem_column(self.fit_search_keys)
|
|
2300
|
+
email_converted_to_hem = False
|
|
2301
|
+
if email_column:
|
|
2302
|
+
converter = EmailSearchKeyConverter(email_column, hem_column, self.fit_search_keys, self.logger)
|
|
2303
|
+
df = converter.convert(df)
|
|
2304
|
+
self.fit_generated_features.extend(converter.generated_features)
|
|
2305
|
+
email_converted_to_hem = converter.email_converted_to_hem
|
|
2309
2306
|
if (
|
|
2310
2307
|
self.detect_missing_search_keys
|
|
2311
2308
|
and list(self.fit_search_keys.values()) == [SearchKey.DATE]
|
|
@@ -2314,37 +2311,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2314
2311
|
converter = IpToCountrySearchKeyConverter(self.fit_search_keys, self.logger)
|
|
2315
2312
|
df = converter.convert(df)
|
|
2316
2313
|
|
|
2317
|
-
# Explode multiple search keys
|
|
2318
2314
|
non_feature_columns = [self.TARGET_NAME, EVAL_SET_INDEX] + list(self.fit_search_keys.keys())
|
|
2319
|
-
meaning_types = {
|
|
2320
|
-
**{col: key.value for col, key in self.fit_search_keys.items()},
|
|
2321
|
-
**{str(c): FileColumnMeaningType.FEATURE for c in df.columns if c not in non_feature_columns},
|
|
2322
|
-
}
|
|
2323
|
-
meaning_types[self.TARGET_NAME] = FileColumnMeaningType.TARGET
|
|
2324
|
-
if eval_set is not None and len(eval_set) > 0:
|
|
2325
|
-
meaning_types[EVAL_SET_INDEX] = FileColumnMeaningType.EVAL_SET_INDEX
|
|
2326
|
-
df = self.__add_fit_system_record_id(df, meaning_types, self.fit_search_keys, ENTITY_SYSTEM_RECORD_ID)
|
|
2327
|
-
|
|
2328
|
-
# TODO check that this is correct for enrichment
|
|
2329
|
-
self.df_with_original_index = df.copy()
|
|
2330
|
-
|
|
2331
|
-
df, unnest_search_keys = self._explode_multiple_search_keys(df, self.fit_search_keys)
|
|
2332
|
-
|
|
2333
|
-
# Convert EMAIL to HEM after unnesting to do it only with one column
|
|
2334
|
-
email_column = self._get_email_column(self.fit_search_keys)
|
|
2335
|
-
hem_column = self._get_hem_column(self.fit_search_keys)
|
|
2336
|
-
email_converted_to_hem = False
|
|
2337
|
-
if email_column:
|
|
2338
|
-
converter = EmailSearchKeyConverter(
|
|
2339
|
-
email_column, hem_column, self.fit_search_keys, unnest_search_keys, self.logger
|
|
2340
|
-
)
|
|
2341
|
-
df = converter.convert(df)
|
|
2342
|
-
self.fit_generated_features.extend(converter.generated_features)
|
|
2343
|
-
email_converted_to_hem = converter.email_converted_to_hem
|
|
2344
|
-
|
|
2345
|
-
non_feature_columns = [self.TARGET_NAME, EVAL_SET_INDEX, ENTITY_SYSTEM_RECORD_ID, SEARCH_KEY_UNNEST] + list(
|
|
2346
|
-
self.fit_search_keys.keys()
|
|
2347
|
-
)
|
|
2348
2315
|
if email_converted_to_hem:
|
|
2349
2316
|
non_feature_columns.append(email_column)
|
|
2350
2317
|
if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
|
|
@@ -2368,14 +2335,12 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2368
2335
|
**{str(c): FileColumnMeaningType.FEATURE for c in df.columns if c not in non_feature_columns},
|
|
2369
2336
|
}
|
|
2370
2337
|
meaning_types[self.TARGET_NAME] = FileColumnMeaningType.TARGET
|
|
2371
|
-
meaning_types[ENTITY_SYSTEM_RECORD_ID] = FileColumnMeaningType.ENTITY_SYSTEM_RECORD_ID
|
|
2372
|
-
if SEARCH_KEY_UNNEST in df.columns:
|
|
2373
|
-
meaning_types[SEARCH_KEY_UNNEST] = FileColumnMeaningType.UNNEST_KEY
|
|
2374
2338
|
if eval_set is not None and len(eval_set) > 0:
|
|
2375
2339
|
meaning_types[EVAL_SET_INDEX] = FileColumnMeaningType.EVAL_SET_INDEX
|
|
2376
2340
|
|
|
2377
|
-
df = self.__add_fit_system_record_id(df, meaning_types, self.fit_search_keys
|
|
2341
|
+
df = self.__add_fit_system_record_id(df, meaning_types, self.fit_search_keys)
|
|
2378
2342
|
|
|
2343
|
+
self.df_with_original_index = df.copy()
|
|
2379
2344
|
df = df.reset_index(drop=True).sort_values(by=SYSTEM_RECORD_ID).reset_index(drop=True)
|
|
2380
2345
|
|
|
2381
2346
|
combined_search_keys = combine_search_keys(self.fit_search_keys.keys())
|
|
@@ -2383,15 +2348,14 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2383
2348
|
dataset = Dataset(
|
|
2384
2349
|
"tds_" + str(uuid.uuid4()),
|
|
2385
2350
|
df=df,
|
|
2386
|
-
meaning_types=meaning_types,
|
|
2387
|
-
search_keys=combined_search_keys,
|
|
2388
|
-
unnest_search_keys=unnest_search_keys,
|
|
2389
2351
|
model_task_type=model_task_type,
|
|
2390
2352
|
date_format=self.date_format,
|
|
2391
2353
|
random_state=self.random_state,
|
|
2392
2354
|
rest_client=self.rest_client,
|
|
2393
2355
|
logger=self.logger,
|
|
2394
2356
|
)
|
|
2357
|
+
dataset.meaning_types = meaning_types
|
|
2358
|
+
dataset.search_keys = combined_search_keys
|
|
2395
2359
|
if email_converted_to_hem:
|
|
2396
2360
|
dataset.ignore_columns = [email_column]
|
|
2397
2361
|
|
|
@@ -2911,6 +2875,25 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2911
2875
|
if t in [SearchKey.DATE, SearchKey.DATETIME]:
|
|
2912
2876
|
return col
|
|
2913
2877
|
|
|
2878
|
+
@staticmethod
|
|
2879
|
+
def _add_current_date_as_key(
|
|
2880
|
+
df: pd.DataFrame, search_keys: Dict[str, SearchKey], logger: logging.Logger, bundle: ResourceBundle
|
|
2881
|
+
) -> pd.DataFrame:
|
|
2882
|
+
if (
|
|
2883
|
+
set(search_keys.values()) == {SearchKey.PHONE}
|
|
2884
|
+
or set(search_keys.values()) == {SearchKey.EMAIL}
|
|
2885
|
+
or set(search_keys.values()) == {SearchKey.HEM}
|
|
2886
|
+
or set(search_keys.values()) == {SearchKey.COUNTRY, SearchKey.POSTAL_CODE}
|
|
2887
|
+
):
|
|
2888
|
+
msg = bundle.get("current_date_added")
|
|
2889
|
+
print(msg)
|
|
2890
|
+
logger.warning(msg)
|
|
2891
|
+
df[FeaturesEnricher.CURRENT_DATE] = datetime.date.today()
|
|
2892
|
+
search_keys[FeaturesEnricher.CURRENT_DATE] = SearchKey.DATE
|
|
2893
|
+
converter = DateTimeSearchKeyConverter(FeaturesEnricher.CURRENT_DATE, None, logger, bundle)
|
|
2894
|
+
df = converter.convert(df)
|
|
2895
|
+
return df
|
|
2896
|
+
|
|
2914
2897
|
@staticmethod
|
|
2915
2898
|
def _get_group_columns(df: pd.DataFrame, search_keys: Dict[str, SearchKey]) -> List[str]:
|
|
2916
2899
|
return [
|
|
@@ -2921,19 +2904,15 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2921
2904
|
|
|
2922
2905
|
@staticmethod
|
|
2923
2906
|
def _get_email_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
|
|
2924
|
-
|
|
2925
|
-
|
|
2926
|
-
|
|
2927
|
-
if len(cols) == 1:
|
|
2928
|
-
return cols[0]
|
|
2907
|
+
for col, t in search_keys.items():
|
|
2908
|
+
if t == SearchKey.EMAIL:
|
|
2909
|
+
return col
|
|
2929
2910
|
|
|
2930
2911
|
@staticmethod
|
|
2931
2912
|
def _get_hem_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
|
|
2932
|
-
|
|
2933
|
-
|
|
2934
|
-
|
|
2935
|
-
if len(cols) == 1:
|
|
2936
|
-
return cols[0]
|
|
2913
|
+
for col, t in search_keys.items():
|
|
2914
|
+
if t == SearchKey.HEM:
|
|
2915
|
+
return col
|
|
2937
2916
|
|
|
2938
2917
|
@staticmethod
|
|
2939
2918
|
def _get_phone_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
|
|
@@ -2941,42 +2920,8 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2941
2920
|
if t == SearchKey.PHONE:
|
|
2942
2921
|
return col
|
|
2943
2922
|
|
|
2944
|
-
def _explode_multiple_search_keys(self, df: pd.DataFrame, search_keys: Dict[str, SearchKey]) -> pd.DataFrame:
|
|
2945
|
-
# find groups of multiple search keys
|
|
2946
|
-
search_key_names_by_type: Dict[SearchKey, str] = dict()
|
|
2947
|
-
for key_name, key_type in search_keys.items():
|
|
2948
|
-
search_key_names_by_type[key_type] = search_key_names_by_type.get(key_type, []) + [key_name]
|
|
2949
|
-
search_key_names_by_type = {
|
|
2950
|
-
key_type: key_names for key_type, key_names in search_key_names_by_type.items() if len(key_names) > 1
|
|
2951
|
-
}
|
|
2952
|
-
if len(search_key_names_by_type) == 0:
|
|
2953
|
-
return df, []
|
|
2954
|
-
|
|
2955
|
-
multiple_keys_columns = [col for cols in search_key_names_by_type.values() for col in cols]
|
|
2956
|
-
other_columns = [col for col in df.columns if col not in multiple_keys_columns]
|
|
2957
|
-
exploded_dfs = []
|
|
2958
|
-
unnest_search_keys = []
|
|
2959
|
-
|
|
2960
|
-
for key_type, key_names in search_key_names_by_type.items():
|
|
2961
|
-
new_search_key = f"upgini_{key_type.name.lower()}_unnest"
|
|
2962
|
-
exploded_df = pd.melt(
|
|
2963
|
-
df, id_vars=other_columns, value_vars=key_names, var_name=SEARCH_KEY_UNNEST, value_name=new_search_key
|
|
2964
|
-
)
|
|
2965
|
-
exploded_dfs.append(exploded_df)
|
|
2966
|
-
for old_key in key_names:
|
|
2967
|
-
del search_keys[old_key]
|
|
2968
|
-
search_keys[new_search_key] = key_type
|
|
2969
|
-
unnest_search_keys.append(new_search_key)
|
|
2970
|
-
|
|
2971
|
-
df = pd.concat(exploded_dfs, ignore_index=True)
|
|
2972
|
-
return df, unnest_search_keys
|
|
2973
|
-
|
|
2974
2923
|
def __add_fit_system_record_id(
|
|
2975
|
-
self,
|
|
2976
|
-
df: pd.DataFrame,
|
|
2977
|
-
meaning_types: Dict[str, FileColumnMeaningType],
|
|
2978
|
-
search_keys: Dict[str, SearchKey],
|
|
2979
|
-
id_name: str,
|
|
2924
|
+
self, df: pd.DataFrame, meaning_types: Dict[str, FileColumnMeaningType], search_keys: Dict[str, SearchKey]
|
|
2980
2925
|
) -> pd.DataFrame:
|
|
2981
2926
|
# save original order or rows
|
|
2982
2927
|
original_index_name = df.index.name
|
|
@@ -3025,18 +2970,14 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3025
2970
|
|
|
3026
2971
|
df = df.reset_index(drop=True).reset_index()
|
|
3027
2972
|
# system_record_id saves correct order for fit
|
|
3028
|
-
df = df.rename(columns={DEFAULT_INDEX:
|
|
2973
|
+
df = df.rename(columns={DEFAULT_INDEX: SYSTEM_RECORD_ID})
|
|
3029
2974
|
|
|
3030
2975
|
# return original order
|
|
3031
2976
|
df = df.set_index(ORIGINAL_INDEX)
|
|
3032
2977
|
df.index.name = original_index_name
|
|
3033
2978
|
df = df.sort_values(by=original_order_name).drop(columns=original_order_name)
|
|
3034
2979
|
|
|
3035
|
-
meaning_types[
|
|
3036
|
-
FileColumnMeaningType.SYSTEM_RECORD_ID
|
|
3037
|
-
if id_name == SYSTEM_RECORD_ID
|
|
3038
|
-
else FileColumnMeaningType.ENTITY_SYSTEM_RECORD_ID
|
|
3039
|
-
)
|
|
2980
|
+
meaning_types[SYSTEM_RECORD_ID] = FileColumnMeaningType.SYSTEM_RECORD_ID
|
|
3040
2981
|
return df
|
|
3041
2982
|
|
|
3042
2983
|
def __correct_target(self, df: pd.DataFrame) -> pd.DataFrame:
|
|
@@ -3091,10 +3032,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3091
3032
|
)
|
|
3092
3033
|
|
|
3093
3034
|
comparing_columns = X.columns if is_transform else df_with_original_index.columns
|
|
3094
|
-
dup_features = [
|
|
3095
|
-
c for c in comparing_columns
|
|
3096
|
-
if c in result_features.columns and c not in [SYSTEM_RECORD_ID, ENTITY_SYSTEM_RECORD_ID]
|
|
3097
|
-
]
|
|
3035
|
+
dup_features = [c for c in comparing_columns if c in result_features.columns and c != SYSTEM_RECORD_ID]
|
|
3098
3036
|
if len(dup_features) > 0:
|
|
3099
3037
|
self.logger.warning(f"X contain columns with same name as returned from backend: {dup_features}")
|
|
3100
3038
|
raise ValidationError(self.bundle.get("returned_features_same_as_passed").format(dup_features))
|
|
@@ -3105,7 +3043,8 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3105
3043
|
result_features = pd.merge(
|
|
3106
3044
|
df_with_original_index,
|
|
3107
3045
|
result_features,
|
|
3108
|
-
|
|
3046
|
+
left_on=SYSTEM_RECORD_ID,
|
|
3047
|
+
right_on=SYSTEM_RECORD_ID,
|
|
3109
3048
|
how="left" if is_transform else "inner",
|
|
3110
3049
|
)
|
|
3111
3050
|
result_features = result_features.set_index(original_index_name or DEFAULT_INDEX)
|
|
@@ -3485,13 +3424,13 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3485
3424
|
self.warning_counter.increment()
|
|
3486
3425
|
|
|
3487
3426
|
if len(valid_search_keys) == 1:
|
|
3488
|
-
|
|
3489
|
-
|
|
3490
|
-
|
|
3491
|
-
|
|
3492
|
-
|
|
3493
|
-
|
|
3494
|
-
|
|
3427
|
+
for k, v in valid_search_keys.items():
|
|
3428
|
+
# Show warning for country only if country is the only key
|
|
3429
|
+
if x[k].nunique() == 1 and (v != SearchKey.COUNTRY or len(valid_search_keys) == 1):
|
|
3430
|
+
msg = self.bundle.get("single_constant_search_key").format(v, x[k].values[0])
|
|
3431
|
+
print(msg)
|
|
3432
|
+
self.logger.warning(msg)
|
|
3433
|
+
self.warning_counter.increment()
|
|
3495
3434
|
|
|
3496
3435
|
self.logger.info(f"Prepared search keys: {valid_search_keys}")
|
|
3497
3436
|
|
|
@@ -3601,68 +3540,61 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3601
3540
|
def check_need_detect(search_key: SearchKey):
|
|
3602
3541
|
return not is_transform or search_key in self.fit_search_keys.values()
|
|
3603
3542
|
|
|
3604
|
-
|
|
3605
|
-
|
|
3606
|
-
|
|
3607
|
-
|
|
3608
|
-
|
|
3609
|
-
|
|
3610
|
-
self.autodetected_search_keys.update(new_keys)
|
|
3611
|
-
self.logger.info(f"Autodetected search key POSTAL_CODE in column {maybe_keys}")
|
|
3543
|
+
if SearchKey.POSTAL_CODE not in search_keys.values() and check_need_detect(SearchKey.POSTAL_CODE):
|
|
3544
|
+
maybe_key = PostalCodeSearchKeyDetector().get_search_key_column(sample)
|
|
3545
|
+
if maybe_key is not None:
|
|
3546
|
+
search_keys[maybe_key] = SearchKey.POSTAL_CODE
|
|
3547
|
+
self.autodetected_search_keys[maybe_key] = SearchKey.POSTAL_CODE
|
|
3548
|
+
self.logger.info(f"Autodetected search key POSTAL_CODE in column {maybe_key}")
|
|
3612
3549
|
if not silent_mode:
|
|
3613
|
-
print(self.bundle.get("postal_code_detected").format(
|
|
3550
|
+
print(self.bundle.get("postal_code_detected").format(maybe_key))
|
|
3614
3551
|
|
|
3615
3552
|
if (
|
|
3616
3553
|
SearchKey.COUNTRY not in search_keys.values()
|
|
3617
3554
|
and self.country_code is None
|
|
3618
3555
|
and check_need_detect(SearchKey.COUNTRY)
|
|
3619
3556
|
):
|
|
3620
|
-
maybe_key = CountrySearchKeyDetector().
|
|
3621
|
-
if maybe_key:
|
|
3622
|
-
search_keys[maybe_key
|
|
3623
|
-
self.autodetected_search_keys[maybe_key
|
|
3557
|
+
maybe_key = CountrySearchKeyDetector().get_search_key_column(sample)
|
|
3558
|
+
if maybe_key is not None:
|
|
3559
|
+
search_keys[maybe_key] = SearchKey.COUNTRY
|
|
3560
|
+
self.autodetected_search_keys[maybe_key] = SearchKey.COUNTRY
|
|
3624
3561
|
self.logger.info(f"Autodetected search key COUNTRY in column {maybe_key}")
|
|
3625
3562
|
if not silent_mode:
|
|
3626
3563
|
print(self.bundle.get("country_detected").format(maybe_key))
|
|
3627
3564
|
|
|
3628
3565
|
if (
|
|
3629
|
-
|
|
3630
|
-
SearchKey.HEM not in search_keys.values()
|
|
3566
|
+
SearchKey.EMAIL not in search_keys.values()
|
|
3567
|
+
and SearchKey.HEM not in search_keys.values()
|
|
3631
3568
|
and check_need_detect(SearchKey.HEM)
|
|
3632
3569
|
):
|
|
3633
|
-
|
|
3634
|
-
if
|
|
3570
|
+
maybe_key = EmailSearchKeyDetector().get_search_key_column(sample)
|
|
3571
|
+
if maybe_key is not None and maybe_key not in search_keys.keys():
|
|
3635
3572
|
if self.__is_registered or is_demo_dataset:
|
|
3636
|
-
|
|
3637
|
-
|
|
3638
|
-
self.
|
|
3639
|
-
self.logger.info(f"Autodetected search key EMAIL in column {maybe_keys}")
|
|
3573
|
+
search_keys[maybe_key] = SearchKey.EMAIL
|
|
3574
|
+
self.autodetected_search_keys[maybe_key] = SearchKey.EMAIL
|
|
3575
|
+
self.logger.info(f"Autodetected search key EMAIL in column {maybe_key}")
|
|
3640
3576
|
if not silent_mode:
|
|
3641
|
-
print(self.bundle.get("email_detected").format(
|
|
3577
|
+
print(self.bundle.get("email_detected").format(maybe_key))
|
|
3642
3578
|
else:
|
|
3643
3579
|
self.logger.warning(
|
|
3644
|
-
f"Autodetected search key EMAIL in column {
|
|
3645
|
-
" But not used because not registered user"
|
|
3580
|
+
f"Autodetected search key EMAIL in column {maybe_key}. But not used because not registered user"
|
|
3646
3581
|
)
|
|
3647
3582
|
if not silent_mode:
|
|
3648
|
-
print(self.bundle.get("email_detected_not_registered").format(
|
|
3583
|
+
print(self.bundle.get("email_detected_not_registered").format(maybe_key))
|
|
3649
3584
|
self.warning_counter.increment()
|
|
3650
3585
|
|
|
3651
|
-
|
|
3652
|
-
|
|
3653
|
-
|
|
3654
|
-
if maybe_keys:
|
|
3586
|
+
if SearchKey.PHONE not in search_keys.values() and check_need_detect(SearchKey.PHONE):
|
|
3587
|
+
maybe_key = PhoneSearchKeyDetector().get_search_key_column(sample)
|
|
3588
|
+
if maybe_key is not None and maybe_key not in search_keys.keys():
|
|
3655
3589
|
if self.__is_registered or is_demo_dataset:
|
|
3656
|
-
|
|
3657
|
-
|
|
3658
|
-
self.
|
|
3659
|
-
self.logger.info(f"Autodetected search key PHONE in column {maybe_keys}")
|
|
3590
|
+
search_keys[maybe_key] = SearchKey.PHONE
|
|
3591
|
+
self.autodetected_search_keys[maybe_key] = SearchKey.PHONE
|
|
3592
|
+
self.logger.info(f"Autodetected search key PHONE in column {maybe_key}")
|
|
3660
3593
|
if not silent_mode:
|
|
3661
|
-
print(self.bundle.get("phone_detected").format(
|
|
3594
|
+
print(self.bundle.get("phone_detected").format(maybe_key))
|
|
3662
3595
|
else:
|
|
3663
3596
|
self.logger.warning(
|
|
3664
|
-
f"Autodetected search key PHONE in column {
|
|
3665
|
-
"But not used because not registered user"
|
|
3597
|
+
f"Autodetected search key PHONE in column {maybe_key}. But not used because not registered user"
|
|
3666
3598
|
)
|
|
3667
3599
|
if not silent_mode:
|
|
3668
3600
|
print(self.bundle.get("phone_detected_not_registered"))
|
upgini/metadata.py
CHANGED
|
@@ -4,8 +4,6 @@ from typing import Dict, List, Optional, Set
|
|
|
4
4
|
from pydantic import BaseModel
|
|
5
5
|
|
|
6
6
|
SYSTEM_RECORD_ID = "system_record_id"
|
|
7
|
-
ENTITY_SYSTEM_RECORD_ID = "entity_system_record_id"
|
|
8
|
-
SEARCH_KEY_UNNEST = "search_key_unnest"
|
|
9
7
|
SORT_ID = "sort_id"
|
|
10
8
|
EVAL_SET_INDEX = "eval_set_index"
|
|
11
9
|
TARGET = "target"
|
|
@@ -13,7 +11,7 @@ COUNTRY = "country_iso_code"
|
|
|
13
11
|
RENAMED_INDEX = "index_col"
|
|
14
12
|
DEFAULT_INDEX = "index"
|
|
15
13
|
ORIGINAL_INDEX = "original_index"
|
|
16
|
-
SYSTEM_COLUMNS = {SYSTEM_RECORD_ID,
|
|
14
|
+
SYSTEM_COLUMNS = {SYSTEM_RECORD_ID, EVAL_SET_INDEX, TARGET, COUNTRY, SORT_ID}
|
|
17
15
|
|
|
18
16
|
|
|
19
17
|
class FileColumnMeaningType(Enum):
|
|
@@ -39,8 +37,6 @@ class FileColumnMeaningType(Enum):
|
|
|
39
37
|
POSTAL_CODE = "POSTAL_CODE"
|
|
40
38
|
SYSTEM_RECORD_ID = "SYSTEM_RECORD_ID"
|
|
41
39
|
EVAL_SET_INDEX = "EVAL_SET_INDEX"
|
|
42
|
-
ENTITY_SYSTEM_RECORD_ID = "ENTITY_SYSTEM_RECORD_ID"
|
|
43
|
-
UNNEST_KEY = "UNNEST_KEY"
|
|
44
40
|
|
|
45
41
|
|
|
46
42
|
class SearchKey(Enum):
|
|
@@ -186,10 +182,6 @@ class FileColumnMetadata(BaseModel):
|
|
|
186
182
|
meaningType: FileColumnMeaningType
|
|
187
183
|
minMaxValues: Optional[NumericInterval] = None
|
|
188
184
|
originalName: Optional[str]
|
|
189
|
-
# is this column contains keys from multiple key columns like msisdn1, msisdn2
|
|
190
|
-
isUnnest: bool = False,
|
|
191
|
-
# list of original etalon key column names like msisdn1, msisdn2
|
|
192
|
-
unnestKeyNames: Optional[list[str]]
|
|
193
185
|
|
|
194
186
|
|
|
195
187
|
class FileMetadata(BaseModel):
|
upgini/metrics.py
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
import inspect
|
|
1
2
|
import logging
|
|
2
3
|
import re
|
|
3
4
|
from copy import deepcopy
|
|
@@ -381,6 +382,11 @@ class EstimatorWrapper:
|
|
|
381
382
|
kwargs["estimator"] = estimator_copy
|
|
382
383
|
if isinstance(estimator, CatBoostClassifier) or isinstance(estimator, CatBoostRegressor):
|
|
383
384
|
if cat_features is not None:
|
|
385
|
+
for cat_feature in cat_features:
|
|
386
|
+
if cat_feature not in X.columns:
|
|
387
|
+
logger.error(
|
|
388
|
+
f"Client cat_feature `{cat_feature}` not found in X columns: {X.columns.to_list()}"
|
|
389
|
+
)
|
|
384
390
|
estimator_copy.set_params(
|
|
385
391
|
cat_features=[X.columns.get_loc(cat_feature) for cat_feature in cat_features]
|
|
386
392
|
)
|
|
@@ -647,6 +653,12 @@ class OtherEstimatorWrapper(EstimatorWrapper):
|
|
|
647
653
|
def validate_scoring_argument(scoring: Union[Callable, str, None]):
|
|
648
654
|
if isinstance(scoring, str) and scoring is not None:
|
|
649
655
|
_get_scorer_by_name(scoring)
|
|
656
|
+
elif isinstance(scoring, Callable):
|
|
657
|
+
spec = inspect.getfullargspec(scoring)
|
|
658
|
+
if len(spec.args) < 3:
|
|
659
|
+
raise ValidationError(
|
|
660
|
+
f"Invalid scoring function passed {scoring}. It should accept 3 input arguments: estimator, X, y"
|
|
661
|
+
)
|
|
650
662
|
|
|
651
663
|
|
|
652
664
|
def _get_scorer_by_name(scoring: str) -> Tuple[Callable, str, int]:
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
from typing import Optional
|
|
2
2
|
|
|
3
3
|
import pandas as pd
|
|
4
|
-
from pandas.api.types import is_float_dtype, is_int64_dtype, is_string_dtype
|
|
4
|
+
from pandas.api.types import is_float_dtype, is_int64_dtype, is_string_dtype, is_object_dtype
|
|
5
5
|
|
|
6
6
|
from upgini.errors import ValidationError
|
|
7
7
|
|
|
@@ -44,7 +44,7 @@ class PhoneNormalizer:
|
|
|
44
44
|
Method will remove all non numeric chars from string and convert it to int.
|
|
45
45
|
None will be set for phone numbers that couldn"t be converted to int
|
|
46
46
|
"""
|
|
47
|
-
if is_string_dtype(self.df[self.phone_column_name]):
|
|
47
|
+
if is_string_dtype(self.df[self.phone_column_name]) or is_object_dtype(self.df[self.phone_column_name]):
|
|
48
48
|
convert_func = self.phone_str_to_int_safe
|
|
49
49
|
elif is_float_dtype(self.df[self.phone_column_name]):
|
|
50
50
|
convert_func = self.phone_float_to_int_safe
|
|
@@ -38,6 +38,7 @@ loss_selection_warn=\nWARNING: Loss `{0}` is not supported for feature selection
|
|
|
38
38
|
loss_calc_metrics_warn=\nWARNING: Loss `{0}` is not supported for metrics calculation with {1}
|
|
39
39
|
multivariate_timeseries_detected=\nWARNING: Multivariate TimeSeries detected. Blocked time series cross-validation split selected.\nMore details: https://github.com/upgini/upgini#-time-series-prediction-support
|
|
40
40
|
group_k_fold_in_classification=\nWARNING: Using group K-fold cross-validation split for classification task.
|
|
41
|
+
current_date_added=\nWARNING: No date/datetime column was detected in X to be used as a search key. The current date will be used to match the latest version of data sources
|
|
41
42
|
|
|
42
43
|
# Errors
|
|
43
44
|
failed_search_by_task_id=Failed to retrieve the specified search results
|
|
@@ -87,7 +88,6 @@ unsupported_search_key_type=Unsupported type of key in search_keys: {}
|
|
|
87
88
|
search_key_country_and_country_code=\nWARNING: SearchKey.COUNTRY and country_code parameter were passed simultaniously. Parameter country_code will be ignored
|
|
88
89
|
empty_search_key=Search key {} is empty. Please fill values or remove this search key
|
|
89
90
|
single_constant_search_key=\nWARNING: Constant value detected for the {} search key in the X dataframe: {}.\nThat search key will add constant features for different y values.\nPlease add extra search keys with non constant values, like the COUNTRY, POSTAL_CODE, DATE, PHONE NUMBER, EMAIL/HEM or IPv4
|
|
90
|
-
unsupported_multi_key=Search key {} cannot be used multiple times
|
|
91
91
|
unsupported_index_column=\nWARNING: Your column with name `index` was dropped because it's reserved name is booked for system needs.
|
|
92
92
|
date_string_without_format=Date column `{}` has string type, but date_format is not specified. Convert column to datetime type or pass date_format
|
|
93
93
|
invalid_date_format=Failed to parse date in column `{}`. Try to pass explicit date format in date_format argument of FeaturesEnricher constructor
|
|
@@ -159,7 +159,7 @@ dataset_invalid_multiclass_target=Unexpected dtype of target for multiclass task
|
|
|
159
159
|
dataset_invalid_regression_target=Unexpected dtype of target for regression task type: {}. Expected float
|
|
160
160
|
dataset_invalid_timeseries_target=Unexpected dtype of target for timeseries task type: {}. Expected float
|
|
161
161
|
dataset_to_many_multiclass_targets=The number of target classes {} exceeds the allowed threshold: {}. Please, correct your data and try again
|
|
162
|
-
dataset_rarest_class_less_min=
|
|
162
|
+
dataset_rarest_class_less_min=Count of rows with the rarest class `{}` is {}, minimum count must be > {} for each class\nPlease, remove rows with rarest class from your dataframe
|
|
163
163
|
dataset_rarest_class_less_threshold=\nWARNING: Target is imbalanced and will be undersampled to the rarest class. Frequency of the rarest class `{}` is {}\nMinimum number of observations for each class to avoid undersampling {} ({}%)
|
|
164
164
|
dataset_date_features=\nWARNING: Columns {} is a datetime or period type but not used as a search key, removed from X
|
|
165
165
|
dataset_too_many_features=Too many features. Maximum number of features is {}
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
from typing import List
|
|
1
|
+
from typing import List, Optional
|
|
2
2
|
|
|
3
3
|
import pandas as pd
|
|
4
4
|
|
|
@@ -10,18 +10,16 @@ class BaseSearchKeyDetector:
|
|
|
10
10
|
def _is_search_key_by_values(self, column: pd.Series) -> bool:
|
|
11
11
|
raise NotImplementedError()
|
|
12
12
|
|
|
13
|
-
def
|
|
14
|
-
|
|
15
|
-
column_name
|
|
16
|
-
|
|
17
|
-
if self._is_search_key_by_name(column_name)
|
|
18
|
-
]
|
|
13
|
+
def _get_search_key_by_name(self, column_names: List[str]) -> Optional[str]:
|
|
14
|
+
for column_name in column_names:
|
|
15
|
+
if self._is_search_key_by_name(column_name):
|
|
16
|
+
return column_name
|
|
19
17
|
|
|
20
|
-
def
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
18
|
+
def get_search_key_column(self, df: pd.DataFrame) -> Optional[str]:
|
|
19
|
+
maybe_column = self._get_search_key_by_name(df.columns.to_list())
|
|
20
|
+
if maybe_column is not None:
|
|
21
|
+
return maybe_column
|
|
22
|
+
|
|
23
|
+
for column_name in df.columns:
|
|
25
24
|
if self._is_search_key_by_values(df[column_name]):
|
|
26
|
-
|
|
27
|
-
return list(set(columns_by_names + columns_by_values))
|
|
25
|
+
return column_name
|
upgini/utils/datetime_utils.py
CHANGED
|
@@ -100,6 +100,9 @@ class DateTimeSearchKeyConverter:
|
|
|
100
100
|
msg = self.bundle.get("unsupported_date_type").format(self.date_column)
|
|
101
101
|
self.logger.warning(msg)
|
|
102
102
|
raise ValidationError(msg)
|
|
103
|
+
else:
|
|
104
|
+
df[self.date_column] = df[self.date_column].astype("string").apply(self.clean_date)
|
|
105
|
+
df[self.date_column] = self.parse_date(df)
|
|
103
106
|
|
|
104
107
|
# If column with date is datetime then extract seconds of the day and minute of the hour
|
|
105
108
|
# as additional features
|
|
@@ -3,15 +3,7 @@ from typing import Dict, List, Optional, Union
|
|
|
3
3
|
|
|
4
4
|
import pandas as pd
|
|
5
5
|
|
|
6
|
-
from upgini.metadata import
|
|
7
|
-
ENTITY_SYSTEM_RECORD_ID,
|
|
8
|
-
EVAL_SET_INDEX,
|
|
9
|
-
SORT_ID,
|
|
10
|
-
SYSTEM_RECORD_ID,
|
|
11
|
-
TARGET,
|
|
12
|
-
ModelTaskType,
|
|
13
|
-
SearchKey,
|
|
14
|
-
)
|
|
6
|
+
from upgini.metadata import EVAL_SET_INDEX, SORT_ID, SYSTEM_RECORD_ID, TARGET, ModelTaskType, SearchKey
|
|
15
7
|
from upgini.resource_bundle import ResourceBundle
|
|
16
8
|
from upgini.utils.datetime_utils import DateTimeSearchKeyConverter
|
|
17
9
|
from upgini.utils.target_utils import define_task
|
|
@@ -151,8 +143,6 @@ def clean_full_duplicates(
|
|
|
151
143
|
unique_columns = df.columns.tolist()
|
|
152
144
|
if SYSTEM_RECORD_ID in unique_columns:
|
|
153
145
|
unique_columns.remove(SYSTEM_RECORD_ID)
|
|
154
|
-
if ENTITY_SYSTEM_RECORD_ID in unique_columns:
|
|
155
|
-
unique_columns.remove(ENTITY_SYSTEM_RECORD_ID)
|
|
156
146
|
if SORT_ID in unique_columns:
|
|
157
147
|
unique_columns.remove(SORT_ID)
|
|
158
148
|
if EVAL_SET_INDEX in unique_columns:
|
upgini/utils/email_utils.py
CHANGED
|
@@ -38,13 +38,11 @@ class EmailSearchKeyConverter:
|
|
|
38
38
|
email_column: str,
|
|
39
39
|
hem_column: Optional[str],
|
|
40
40
|
search_keys: Dict[str, SearchKey],
|
|
41
|
-
unnest_search_keys: Optional[List[str]] = None,
|
|
42
41
|
logger: Optional[logging.Logger] = None,
|
|
43
42
|
):
|
|
44
43
|
self.email_column = email_column
|
|
45
44
|
self.hem_column = hem_column
|
|
46
45
|
self.search_keys = search_keys
|
|
47
|
-
self.unnest_search_keys = unnest_search_keys
|
|
48
46
|
if logger is not None:
|
|
49
47
|
self.logger = logger
|
|
50
48
|
else:
|
|
@@ -82,12 +80,9 @@ class EmailSearchKeyConverter:
|
|
|
82
80
|
del self.search_keys[self.email_column]
|
|
83
81
|
return df
|
|
84
82
|
self.search_keys[self.HEM_COLUMN_NAME] = SearchKey.HEM
|
|
85
|
-
self.unnest_search_keys.append(self.HEM_COLUMN_NAME)
|
|
86
83
|
self.email_converted_to_hem = True
|
|
87
84
|
|
|
88
85
|
del self.search_keys[self.email_column]
|
|
89
|
-
if self.email_column in self.unnest_search_keys:
|
|
90
|
-
self.unnest_search_keys.remove(self.email_column)
|
|
91
86
|
|
|
92
87
|
df[self.EMAIL_ONE_DOMAIN_COLUMN_NAME] = df[self.email_column].apply(self._email_to_one_domain)
|
|
93
88
|
|
|
@@ -81,7 +81,8 @@ class FeaturesValidator:
|
|
|
81
81
|
return [
|
|
82
82
|
i
|
|
83
83
|
for i in df
|
|
84
|
-
if (
|
|
84
|
+
if (is_object_dtype(df[i]) or is_string_dtype(df[i]) or is_integer_dtype(df[i]))
|
|
85
|
+
and (df[i].nunique(dropna=False) / row_count >= 0.85)
|
|
85
86
|
]
|
|
86
87
|
|
|
87
88
|
@staticmethod
|
upgini/utils/track_info.py
CHANGED
|
@@ -55,7 +55,7 @@ def _get_execution_ide() -> str:
|
|
|
55
55
|
def get_track_metrics(client_ip: Optional[str] = None, client_visitorid: Optional[str] = None) -> dict:
|
|
56
56
|
# default values
|
|
57
57
|
track = {"ide": _get_execution_ide()}
|
|
58
|
-
ident_res = "https://
|
|
58
|
+
ident_res = "https://api64.ipify.org"
|
|
59
59
|
|
|
60
60
|
try:
|
|
61
61
|
track["hostname"] = socket.gethostname()
|
|
@@ -74,17 +74,20 @@ def get_track_metrics(client_ip: Optional[str] = None, client_visitorid: Optiona
|
|
|
74
74
|
display(
|
|
75
75
|
Javascript(
|
|
76
76
|
"""
|
|
77
|
-
|
|
77
|
+
async function getVisitorId() {
|
|
78
|
+
return import('https://upgini.github.io/upgini/js/a.js')
|
|
78
79
|
.then(FingerprintJS => FingerprintJS.load())
|
|
79
80
|
.then(fp => fp.get())
|
|
80
|
-
.then(result =>
|
|
81
|
+
.then(result => result.visitorId);
|
|
82
|
+
}
|
|
81
83
|
"""
|
|
82
84
|
)
|
|
83
85
|
)
|
|
84
|
-
track["visitorId"] = output.eval_js("
|
|
86
|
+
track["visitorId"] = output.eval_js("getVisitorId()", timeout_sec=30)
|
|
85
87
|
except Exception as e:
|
|
86
88
|
track["err"] = str(e)
|
|
87
|
-
|
|
89
|
+
if "visitorId" not in track:
|
|
90
|
+
track["visitorId"] = "None"
|
|
88
91
|
if client_ip:
|
|
89
92
|
track["ip"] = client_ip
|
|
90
93
|
else:
|
|
@@ -95,16 +98,19 @@ def get_track_metrics(client_ip: Optional[str] = None, client_visitorid: Optiona
|
|
|
95
98
|
display(
|
|
96
99
|
Javascript(
|
|
97
100
|
f"""
|
|
98
|
-
|
|
101
|
+
async function getIP() {{
|
|
102
|
+
return fetch("{ident_res}")
|
|
99
103
|
.then(response => response.text())
|
|
100
|
-
.then(data =>
|
|
104
|
+
.then(data => data);
|
|
105
|
+
}}
|
|
101
106
|
"""
|
|
102
107
|
)
|
|
103
108
|
)
|
|
104
|
-
track["ip"] = output.eval_js("
|
|
109
|
+
track["ip"] = output.eval_js("getIP()", timeout_sec=10)
|
|
105
110
|
except Exception as e:
|
|
106
111
|
track["err"] = str(e)
|
|
107
|
-
|
|
112
|
+
if "ip" not in track:
|
|
113
|
+
track["ip"] = "0.0.0.0"
|
|
108
114
|
|
|
109
115
|
elif track["ide"] == "binder":
|
|
110
116
|
try:
|
|
@@ -116,8 +122,10 @@ def get_track_metrics(client_ip: Optional[str] = None, client_visitorid: Optiona
|
|
|
116
122
|
track["visitorId"] = sha256(os.environ["CLIENT_IP"].encode()).hexdigest()
|
|
117
123
|
except Exception as e:
|
|
118
124
|
track["err"] = str(e)
|
|
119
|
-
|
|
120
|
-
|
|
125
|
+
if "ip" not in track:
|
|
126
|
+
track["ip"] = "0.0.0.0"
|
|
127
|
+
if "visitorId" not in track:
|
|
128
|
+
track["visitorId"] = "None"
|
|
121
129
|
|
|
122
130
|
elif track["ide"] == "kaggle":
|
|
123
131
|
try:
|
|
@@ -136,8 +144,8 @@ def get_track_metrics(client_ip: Optional[str] = None, client_visitorid: Optiona
|
|
|
136
144
|
raise Exception(err)
|
|
137
145
|
except Exception as e:
|
|
138
146
|
track["err"] = str(e)
|
|
139
|
-
|
|
140
|
-
|
|
147
|
+
if "visitorId" not in track:
|
|
148
|
+
track["visitorId"] = "None"
|
|
141
149
|
else:
|
|
142
150
|
try:
|
|
143
151
|
if client_ip:
|
|
@@ -150,5 +158,9 @@ def get_track_metrics(client_ip: Optional[str] = None, client_visitorid: Optiona
|
|
|
150
158
|
track["visitorId"] = sha256(str(getnode()).encode()).hexdigest()
|
|
151
159
|
except Exception as e:
|
|
152
160
|
track["err"] = str(e)
|
|
161
|
+
if "visitorId" not in track:
|
|
162
|
+
track["visitorId"] = "None"
|
|
163
|
+
if "ip" not in track:
|
|
164
|
+
track["ip"] = "0.0.0.0"
|
|
153
165
|
|
|
154
166
|
return track
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: upgini
|
|
3
|
-
Version: 1.1.
|
|
3
|
+
Version: 1.1.275a99
|
|
4
4
|
Summary: Intelligent data search & enrichment for Machine Learning
|
|
5
5
|
Home-page: https://upgini.com/
|
|
6
6
|
Author: Upgini Developers
|
|
@@ -28,7 +28,7 @@ Description-Content-Type: text/markdown
|
|
|
28
28
|
License-File: LICENSE
|
|
29
29
|
Requires-Dist: python-dateutil >=2.8.0
|
|
30
30
|
Requires-Dist: requests >=2.8.0
|
|
31
|
-
Requires-Dist: pandas <
|
|
31
|
+
Requires-Dist: pandas <3.0.0,>=1.1.0
|
|
32
32
|
Requires-Dist: numpy >=1.19.0
|
|
33
33
|
Requires-Dist: scikit-learn >=1.3.0
|
|
34
34
|
Requires-Dist: pydantic <2.0.0,>=1.8.2
|
|
@@ -1,12 +1,12 @@
|
|
|
1
1
|
upgini/__init__.py,sha256=asENHgEVHQBIkV-e_0IhE_ZWqkCG6398U3ZLrNzAH6k,407
|
|
2
2
|
upgini/ads.py,sha256=mre6xn44wcC_fg63iLT_kTh4mViZqR9AKRJZAtpQz8Y,2592
|
|
3
|
-
upgini/dataset.py,sha256=
|
|
3
|
+
upgini/dataset.py,sha256=xb4gIANyGbdcuM8Awyq2pJPiH_3k_LEbETApJgAoRBA,45529
|
|
4
4
|
upgini/errors.py,sha256=pdzQl3MKuK52yvncxMWMRWeSIOGhUFzpQoszoRFBOk0,958
|
|
5
|
-
upgini/features_enricher.py,sha256=
|
|
5
|
+
upgini/features_enricher.py,sha256=A03SPhpJNxpZiAq6aSKiVOG6mqo3YrZ9MQRwkk8_OSg,176071
|
|
6
6
|
upgini/fingerprint.js,sha256=VygVIQlN1v4NGZfjHqtRogOw8zjTnnMNJg_f7M5iGQU,33442
|
|
7
7
|
upgini/http.py,sha256=zaO86LBBLmkieGbgYifk29eVoPCxXimZQ8YkQtKcM0I,42244
|
|
8
|
-
upgini/metadata.py,sha256=
|
|
9
|
-
upgini/metrics.py,sha256=
|
|
8
|
+
upgini/metadata.py,sha256=fwVxtkR6Mn4iRoOqV6BfMJvJrx65I3YwZUMbZjhPyOI,9673
|
|
9
|
+
upgini/metrics.py,sha256=tGzdn0jgup86OlH_GS4eoza8ZJZ9wgaJr7SaX3Upwzo,29652
|
|
10
10
|
upgini/search_task.py,sha256=tmJ17WUxv3J5NWrYUJB_NKdZ792Ifz8Z8UnDXeQnpss,17077
|
|
11
11
|
upgini/spinner.py,sha256=Dm1dQ5F_z_Ua2odLxZX7OypcOX9tSx_vE5MGaKtUmfw,1118
|
|
12
12
|
upgini/version_validator.py,sha256=rDIncP6BEko4J2F2hUcMOtKm_vZbI4ICWcNcw8hrwM4,1400
|
|
@@ -15,38 +15,38 @@ upgini/ads_management/ads_manager.py,sha256=fP4Yqx3h2Snw5X335TbXEwFoupq1RYsE7y0P
|
|
|
15
15
|
upgini/autofe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
16
16
|
upgini/autofe/all_operands.py,sha256=H66wqVLD-H9k8A4-q2wslhV9QaNxlb49f8YiT0Xfkps,2356
|
|
17
17
|
upgini/autofe/binary.py,sha256=f8LQqZi9zyaMUAv-jASMmWNA_vT05ncYCjZq0qx3USs,3972
|
|
18
|
-
upgini/autofe/date.py,sha256=
|
|
18
|
+
upgini/autofe/date.py,sha256=408p8P2OTPM2D3LsEGGtaiCepKGgM1BbOCQNRzAmI6c,4223
|
|
19
19
|
upgini/autofe/feature.py,sha256=2FQRGtIumNz60hFAjfLReaY18SI7HxzYZOoC5avzSjQ,11847
|
|
20
20
|
upgini/autofe/groupby.py,sha256=iXRfOmOc84ooSzRhsh9GmmG7rTafX0-ekXko8s9Qs68,3089
|
|
21
21
|
upgini/autofe/operand.py,sha256=dhtToPDGWtP_0u_RjayUpezJJZAgq_TzNbPH0bI9OXI,2805
|
|
22
22
|
upgini/autofe/unary.py,sha256=YRTzQLttbDdOnkogWBPnBexpu7uHWSLSFAxSCu3iFdY,3145
|
|
23
23
|
upgini/autofe/vector.py,sha256=5qhI_bdwaWM1l7fgCkx1tMt9R9gxWzoYCl-7WO4KiOs,604
|
|
24
24
|
upgini/data_source/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
25
|
-
upgini/data_source/data_source_publisher.py,sha256=
|
|
25
|
+
upgini/data_source/data_source_publisher.py,sha256=taRzyGgrPrTTSGw4Y-Ca5k4bf30aiTa68rxqT9zfqeI,16478
|
|
26
26
|
upgini/mdc/__init__.py,sha256=ETDh3JKbrDdPMOECiYLAa8lvKYe68mv4IY6fZa9FimA,1126
|
|
27
27
|
upgini/mdc/context.py,sha256=Sl1S_InKlzzRxYqwJ2k24lawJdCKWgGJ-RIRfvzWJrk,1468
|
|
28
28
|
upgini/normalizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
29
|
-
upgini/normalizer/phone_normalizer.py,sha256=
|
|
29
|
+
upgini/normalizer/phone_normalizer.py,sha256=_SYMX4GTgwzRXArK54Jp3vUBE5d4jZxSVyze-0tqzg0,9996
|
|
30
30
|
upgini/resource_bundle/__init__.py,sha256=hdvbqL0b0xMWbY6-kiYGsW1ro2GMiWpxxsO9uCv-h9Q,8379
|
|
31
31
|
upgini/resource_bundle/exceptions.py,sha256=5fRvx0_vWdE1-7HcSgF0tckB4A9AKyf5RiinZkInTsI,621
|
|
32
|
-
upgini/resource_bundle/strings.properties,sha256=
|
|
32
|
+
upgini/resource_bundle/strings.properties,sha256=1O779a0-Ai0j7W-Z5AznvjuV69YkJvgGhJda-6VMLOQ,26287
|
|
33
33
|
upgini/resource_bundle/strings_widget.properties,sha256=gOdqvZWntP2LCza_tyVk1_yRYcG4c04K9sQOAVhF_gw,1577
|
|
34
34
|
upgini/sampler/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
35
35
|
upgini/sampler/base.py,sha256=CC-DvPbrN7zp5--SVFuUqkVmdWM_5F7R0Do98ETV82U,6421
|
|
36
36
|
upgini/sampler/random_under_sampler.py,sha256=XU4c2swPIFxVXHOPpxgM2bUao0Xm-aoMmd6fKjIuV5s,4068
|
|
37
37
|
upgini/sampler/utils.py,sha256=PYOk3kKSnFlyxcpdtDNLBEEhTB4lO_iP7pQHqeUcmAc,20211
|
|
38
38
|
upgini/utils/__init__.py,sha256=dQ4-s8-sZ5eOBZ-mH3gEwDHTdI0wI1bUAVgVqUKKPx4,786
|
|
39
|
-
upgini/utils/base_search_key_detector.py,sha256=
|
|
39
|
+
upgini/utils/base_search_key_detector.py,sha256=DGwhXLvc8i5VZWMDr0rncFfV5GEHdsCSnLGon_W9TPs,859
|
|
40
40
|
upgini/utils/blocked_time_series.py,sha256=dMz5ewk3PsoeOrc3lDzInCVPS9u_2XQkV0W6PuMMjPg,3380
|
|
41
41
|
upgini/utils/country_utils.py,sha256=1KXhLSNqkNYVL3on8-zK0Arc_SspUH7AMZvGZICysOU,6462
|
|
42
42
|
upgini/utils/custom_loss_utils.py,sha256=DBslpjWGPt7xTeypt78baR59012SYphbPsO_YLKdilo,3972
|
|
43
43
|
upgini/utils/cv_utils.py,sha256=Tn01RJvpZGZh0PUQUimlBkV-AXwe7s6yjCNFtw352Uc,3525
|
|
44
|
-
upgini/utils/datetime_utils.py,sha256=
|
|
45
|
-
upgini/utils/deduplicate_utils.py,sha256=
|
|
44
|
+
upgini/utils/datetime_utils.py,sha256=XciFOIYI4Zi7PqQS8dHxuPDEtdtwXbOrWsiAa04v2J4,10511
|
|
45
|
+
upgini/utils/deduplicate_utils.py,sha256=6AbARehUCghJZ4PppFtrej2s3gFRruh41MEm6mzakHs,8607
|
|
46
46
|
upgini/utils/display_utils.py,sha256=LKoSwjrE0xgS5_cqVhc2og2CQ1UCZ1nTI2VKboIhoQA,10858
|
|
47
|
-
upgini/utils/email_utils.py,sha256=
|
|
47
|
+
upgini/utils/email_utils.py,sha256=3CvHXTSzlgLyGsQOXfRYVfFhfPy6OXG4uXOBWRaLfHg,3479
|
|
48
48
|
upgini/utils/fallback_progress_bar.py,sha256=cdbd1XGcWm4Ed4eAqV2_St3z7uC_kkH22gEyrN5ub6M,1090
|
|
49
|
-
upgini/utils/features_validator.py,sha256=
|
|
49
|
+
upgini/utils/features_validator.py,sha256=PgKNt5dyqfErTvjtRNNUS9g7GFqHBtAtnsfA-V5UO1A,3307
|
|
50
50
|
upgini/utils/format.py,sha256=Yv5cvvSs2bOLUzzNu96Pu33VMDNbabio92QepUj41jU,243
|
|
51
51
|
upgini/utils/ip_utils.py,sha256=Zf3F2cnQmOCH09QLQHetpjMFu1PnD0cTmDymn0SnSy8,1672
|
|
52
52
|
upgini/utils/phone_utils.py,sha256=JNSkF8G6mgsN8Czy11pamaJdsY6rBINEMpi7jbVt_RA,408
|
|
@@ -54,10 +54,10 @@ upgini/utils/postal_code_utils.py,sha256=_8CR9tBqsPptQsmMUvnrCAmBaMIQSWH3JfJ4ly3
|
|
|
54
54
|
upgini/utils/progress_bar.py,sha256=iNXyqT3vKCeHpfiG5HHwr7Lk2cTtKViM93Fl8iZnjGc,1564
|
|
55
55
|
upgini/utils/sklearn_ext.py,sha256=e1aMNXk1zUt7uFnl0FcUF0zOnaXSE7z5xBHmJPknUVs,44014
|
|
56
56
|
upgini/utils/target_utils.py,sha256=9K67tkY7LWhQMO-vbbPqBaO-KriAmg_6fVz5RQRaLQc,7802
|
|
57
|
-
upgini/utils/track_info.py,sha256=
|
|
57
|
+
upgini/utils/track_info.py,sha256=p8gmuHhLamZF5JG7K9DeK-PcytQhlFCR29lyRr-wq_U,5665
|
|
58
58
|
upgini/utils/warning_counter.py,sha256=dIWBB4dI5XRRJZudvIlqlIYKEiwLLPcXarsZuYRt338,227
|
|
59
|
-
upgini-1.1.
|
|
60
|
-
upgini-1.1.
|
|
61
|
-
upgini-1.1.
|
|
62
|
-
upgini-1.1.
|
|
63
|
-
upgini-1.1.
|
|
59
|
+
upgini-1.1.275a99.dist-info/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
|
|
60
|
+
upgini-1.1.275a99.dist-info/METADATA,sha256=6wFwtaOYKQ4o9mZBpQlJqST1_r1YaTkwqgYAi7zkkHM,48159
|
|
61
|
+
upgini-1.1.275a99.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
|
|
62
|
+
upgini-1.1.275a99.dist-info/top_level.txt,sha256=OFhTGiDIWKl5gFI49qvWq1R9IKflPaE2PekcbDXDtx4,7
|
|
63
|
+
upgini-1.1.275a99.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|