upgini 1.1.299a3511.dev10__py3-none-any.whl → 1.1.300__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of upgini might be problematic. Click here for more details.

upgini/__about__.py CHANGED
@@ -1 +1 @@
1
- __version__ = "1.1.299a3511.dev10"
1
+ __version__ = "1.1.300"
upgini/autofe/all_operands.py CHANGED
@@ -1,20 +1,6 @@
1
1
  from typing import Dict
2
2
 
3
- from upgini.autofe.binary import (
4
- Add,
5
- Combine,
6
- CombineThenFreq,
7
- Distance,
8
- Divide,
9
- JaroWinklerSim1,
10
- JaroWinklerSim2,
11
- LevenshteinSim,
12
- Max,
13
- Min,
14
- Multiply,
15
- Sim,
16
- Subtract,
17
- )
3
+ from upgini.autofe.binary import Add, Divide, Max, Min, Multiply, Sim, Subtract
18
4
  from upgini.autofe.date import (
19
5
  DateDiff,
20
6
  DateDiffType2,
@@ -23,9 +9,9 @@ from upgini.autofe.date import (
23
9
  DatePercentile,
24
10
  DatePercentileMethod2,
25
11
  )
26
- from upgini.autofe.groupby import GroupByThenAgg, GroupByThenFreq, GroupByThenNUnique, GroupByThenRank
12
+ from upgini.autofe.groupby import GroupByThenAgg, GroupByThenRank
27
13
  from upgini.autofe.operand import Operand
28
- from upgini.autofe.unary import Abs, Embeddings, Floor, Freq, Log, Residual, Norm, Sigmoid, Sqrt, Square
14
+ from upgini.autofe.unary import Abs, Floor, Freq, Log, Residual, Norm, Sigmoid, Sqrt, Square
29
15
  from upgini.autofe.vector import Mean, Sum
30
16
 
31
17
  ALL_OPERANDS: Dict[str, Operand] = {
@@ -53,10 +39,10 @@ ALL_OPERANDS: Dict[str, Operand] = {
53
39
  GroupByThenAgg(name="GroupByThenMedian", agg="median"),
54
40
  GroupByThenAgg(name="GroupByThenStd", output_type="float", agg="std"),
55
41
  GroupByThenRank(),
56
- Combine(),
57
- CombineThenFreq(),
58
- GroupByThenNUnique(),
59
- GroupByThenFreq(),
42
+ Operand(name="Combine", has_symmetry_importance=True, output_type="object", is_categorical=True),
43
+ Operand(name="CombineThenFreq", has_symmetry_importance=True, output_type="float"),
44
+ Operand(name="GroupByThenNUnique", output_type="int", is_vectorizable=True, is_grouping=True),
45
+ Operand(name="GroupByThenFreq", output_type="float", is_grouping=True),
60
46
  Sim(),
61
47
  DateDiff(),
62
48
  DateDiffType2(),
@@ -73,11 +59,6 @@ ALL_OPERANDS: Dict[str, Operand] = {
73
59
  DatePercentile(),
74
60
  DatePercentileMethod2(),
75
61
  Norm(),
76
- JaroWinklerSim1(),
77
- JaroWinklerSim2(),
78
- LevenshteinSim(),
79
- Distance(),
80
- Embeddings(),
81
62
  ]
82
63
  }
83
64
 
upgini/autofe/binary.py CHANGED
@@ -1,11 +1,7 @@
1
- import abc
2
- from typing import Optional
3
- import Levenshtein
4
1
  import numpy as np
5
2
  import pandas as pd
6
3
  from numpy import dot
7
4
  from numpy.linalg import norm
8
- from jarowinkler import jarowinkler_similarity
9
5
 
10
6
  from upgini.autofe.operand import PandasOperand, VectorizableMixin
11
7
 
@@ -134,27 +130,7 @@ class CombineThenFreq(PandasOperand):
134
130
  self._loc(temp, value_counts)
135
131
 
136
132
 
137
- class Distance(PandasOperand):
138
- name = "dist"
139
- is_binary = True
140
- output_type = "float"
141
- is_symmetrical = True
142
- has_symmetry_importance = True
143
-
144
- def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
145
- return pd.Series(
146
- 1 - self.__dot(left, right) / (self.__dot(left, left) * self.__dot(right, right)), index=left.index
147
- )
148
-
149
- # row-wise dot product
150
- def __dot(self, left: pd.Series, right: pd.Series) -> pd.Series:
151
- res = (left.dropna() * right.dropna()).apply(np.sum)
152
- res = res.reindex(left.index.union(right.index))
153
- return res
154
-
155
-
156
- # Left for backward compatibility
157
- class Sim(Distance):
133
+ class Sim(PandasOperand):
158
134
  name = "sim"
159
135
  is_binary = True
160
136
  output_type = "float"
@@ -162,71 +138,4 @@ class Sim(Distance):
162
138
  has_symmetry_importance = True
163
139
 
164
140
  def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
165
- return 1 - super().calculate_binary(left, right)
166
-
167
-
168
- class StringSim(PandasOperand, abc.ABC):
169
- def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
170
- sims = []
171
- for i in left.index:
172
- left_i = self._prepare_value(left.get(i))
173
- right_i = self._prepare_value(right.get(i))
174
- if left_i is not None and right_i is not None:
175
- sims.append(self._similarity(left_i, right_i))
176
- else:
177
- sims.append(None)
178
-
179
- return pd.Series(sims, index=left.index)
180
-
181
- @abc.abstractmethod
182
- def _prepare_value(self, value: Optional[str]) -> Optional[str]:
183
- pass
184
-
185
- @abc.abstractmethod
186
- def _similarity(self, left: str, right: str) -> float:
187
- pass
188
-
189
-
190
- class JaroWinklerSim1(StringSim):
191
- name = "sim_jw1"
192
- is_binary = True
193
- input_type = "string"
194
- output_type = "float"
195
- is_symmetrical = True
196
- has_symmetry_importance = True
197
-
198
- def _prepare_value(self, value: Optional[str]) -> Optional[str]:
199
- return value
200
-
201
- def _similarity(self, left: str, right: str) -> float:
202
- return jarowinkler_similarity(left, right)
203
-
204
-
205
- class JaroWinklerSim2(StringSim):
206
- name = "sim_jw2"
207
- is_binary = True
208
- input_type = "string"
209
- output_type = "float"
210
- is_symmetrical = True
211
- has_symmetry_importance = True
212
-
213
- def _prepare_value(self, value: Optional[str]) -> Optional[str]:
214
- return value[::-1] if value is not None else None
215
-
216
- def _similarity(self, left: str, right: str) -> float:
217
- return jarowinkler_similarity(left, right)
218
-
219
-
220
- class LevenshteinSim(StringSim):
221
- name = "sim_lv"
222
- is_binary = True
223
- input_type = "string"
224
- output_type = "float"
225
- is_symmetrical = True
226
- has_symmetry_importance = True
227
-
228
- def _prepare_value(self, value: Optional[str]) -> Optional[str]:
229
- return value
230
-
231
- def _similarity(self, left: str, right: str) -> float:
232
- return 1 - Levenshtein.distance(left, right) / max(len(left), len(right))
141
+ return dot(left, right) / (norm(left) * norm(right))
upgini/autofe/date.py CHANGED
@@ -43,8 +43,6 @@ class DateDiff(PandasOperand, DateDiffMixin):
43
43
  is_binary = True
44
44
  has_symmetry_importance = True
45
45
 
46
- replace_negative: bool = False
47
-
48
46
  def get_params(self) -> Dict[str, Optional[str]]:
49
47
  res = super().get_params()
50
48
  res.update(
@@ -52,7 +50,6 @@ class DateDiff(PandasOperand, DateDiffMixin):
52
50
  "diff_unit": self.diff_unit,
53
51
  "left_unit": self.left_unit,
54
52
  "right_unit": self.right_unit,
55
- "replace_negative": self.replace_negative,
56
53
  }
57
54
  )
58
55
  return res
@@ -64,8 +61,7 @@ class DateDiff(PandasOperand, DateDiffMixin):
64
61
  return self.__replace_negative(diff)
65
62
 
66
63
  def __replace_negative(self, x: Union[pd.DataFrame, pd.Series]):
67
- if self.replace_negative:
68
- x[x < 0] = None
64
+ x[x < 0] = None
69
65
  return x
70
66
 
71
67
 
@@ -89,7 +85,7 @@ class DateDiffType2(PandasOperand, DateDiffMixin):
89
85
  left = self._convert_to_date(left, self.left_unit)
90
86
  right = self._convert_to_date(right, self.right_unit)
91
87
  future = right + (left.dt.year - right.dt.year).apply(
92
- lambda y: pd.tseries.offsets.DateOffset(years=0 if np.isnan(y) else y)
88
+ lambda y: np.datetime64("NaT") if np.isnan(y) else pd.tseries.offsets.DateOffset(years=y)
93
89
  )
94
90
  future = pd.to_datetime(future)
95
91
  before = future[future < left]
@@ -105,19 +101,13 @@ _ext_aggregations = {"nunique": (lambda x: len(np.unique(x)), 0), "count": (len,
105
101
  class DateListDiff(PandasOperand, DateDiffMixin):
106
102
  is_binary = True
107
103
  has_symmetry_importance = True
108
-
109
104
  aggregation: str
110
- replace_negative: bool = False
111
105
 
112
106
  def get_params(self) -> Dict[str, Optional[str]]:
113
107
  res = super().get_params()
114
108
  res.update(
115
109
  {
116
110
  "aggregation": self.aggregation,
117
- "diff_unit": self.diff_unit,
118
- "left_unit": self.left_unit,
119
- "right_unit": self.right_unit,
120
- "replace_negative": self.replace_negative,
121
111
  }
122
112
  )
123
113
  return res
@@ -135,7 +125,7 @@ class DateListDiff(PandasOperand, DateDiffMixin):
135
125
 
136
126
  def _diff(self, x: TimedeltaArray):
137
127
  x = self._convert_diff_to_unit(x)
138
- return x[x > 0] if self.replace_negative else x
128
+ return x[x > 0]
139
129
 
140
130
  def _agg(self, x):
141
131
  method = getattr(np, self.aggregation, None)
@@ -167,10 +157,7 @@ class DateListDiffBounded(DateListDiff):
167
157
  super().__init__(**data)
168
158
 
169
159
  def _agg(self, x):
170
- x = x[
171
- (x >= (self.lower_bound if self.lower_bound is not None else -np.inf))
172
- & (x < (self.upper_bound if self.upper_bound is not None else np.inf))
173
- ]
160
+ x = x[(x >= (self.lower_bound or -np.inf)) & (x < (self.upper_bound or np.inf))]
174
161
  return super()._agg(x)
175
162
 
176
163
 
upgini/autofe/feature.py CHANGED
@@ -138,17 +138,15 @@ class Feature:
138
138
  if self.cached_display_name is not None and cache:
139
139
  return self.cached_display_name
140
140
 
141
- should_stack_op = not isinstance(self.children[0], Column) if self.op.is_unary else False
142
- prev_name = [self.children[0].get_op_display_name()] if should_stack_op else []
143
-
144
141
  if self.alias:
145
142
  components = ["f_autofe", self.alias]
146
- elif shorten and (not self.op.is_unary or should_stack_op):
147
- components = ["f_autofe"] + prev_name + [self.get_op_display_name()]
143
+ elif shorten and not self.op.is_unary:
144
+ components = ["f_autofe", self.get_op_display_name()]
148
145
  else:
149
- components = (
150
- ["f_" + "_f_".join(self.get_columns(**kwargs))] + ["autofe"] + prev_name + [self.get_op_display_name()]
151
- )
146
+ components = ["f_" + "_f_".join(self.get_columns(**kwargs))] + [
147
+ "autofe",
148
+ self.get_op_display_name(),
149
+ ]
152
150
  components.extend([str(self.display_index)] if self.display_index is not None else [])
153
151
  display_name = "_".join(components)
154
152
 
@@ -323,10 +321,10 @@ class FeatureGroup:
323
321
  lower_order_names = [ch.get_display_name() for ch in lower_order_children]
324
322
  if any(isinstance(f, Feature) for f in lower_order_children):
325
323
  child_data = pd.concat(
326
- [data[main_column or []]] + [ch.calculate(data) for ch in lower_order_children],
324
+ [data[main_column]] + [ch.calculate(data) for ch in lower_order_children],
327
325
  axis=1,
328
326
  )
329
- child_data.columns = ([main_column] if main_column is not None else []) + lower_order_names
327
+ child_data.columns = [main_column] + lower_order_names
330
328
  else:
331
329
  child_data = data[columns]
332
330
 
upgini/autofe/unary.py CHANGED
@@ -125,10 +125,3 @@ class Norm(PandasOperand):
125
125
  normalized_data = pd.Series(normalized_data[:, 0], index=data_dropna.index, name=data.name)
126
126
  normalized_data = normalized_data.reindex(data.index)
127
127
  return normalized_data
128
-
129
-
130
- class Embeddings(PandasOperand):
131
- name = "emb"
132
- is_unary = True
133
- input_type = "string"
134
- output_type = "vector"
upgini/dataset.py CHANGED
@@ -23,7 +23,9 @@ from pandas.api.types import (
23
23
  from upgini.errors import ValidationError
24
24
  from upgini.http import ProgressStage, SearchProgress, _RestClient
25
25
  from upgini.metadata import (
26
+ ENTITY_SYSTEM_RECORD_ID,
26
27
  EVAL_SET_INDEX,
28
+ SEARCH_KEY_UNNEST,
27
29
  SYSTEM_COLUMNS,
28
30
  SYSTEM_RECORD_ID,
29
31
  TARGET,
@@ -79,6 +81,7 @@ class Dataset: # (pd.DataFrame):
79
81
  path: Optional[str] = None,
80
82
  meaning_types: Optional[Dict[str, FileColumnMeaningType]] = None,
81
83
  search_keys: Optional[List[Tuple[str, ...]]] = None,
84
+ unnest_search_keys: Optional[Dict[str, str]] = None,
82
85
  model_task_type: Optional[ModelTaskType] = None,
83
86
  random_state: Optional[int] = None,
84
87
  rest_client: Optional[_RestClient] = None,
@@ -113,6 +116,7 @@ class Dataset: # (pd.DataFrame):
113
116
  self.description = description
114
117
  self.meaning_types = meaning_types
115
118
  self.search_keys = search_keys
119
+ self.unnest_search_keys = unnest_search_keys
116
120
  self.ignore_columns = []
117
121
  self.hierarchical_group_keys = []
118
122
  self.hierarchical_subgroup_keys = []
@@ -172,7 +176,7 @@ class Dataset: # (pd.DataFrame):
172
176
  new_columns = []
173
177
  dup_counter = 0
174
178
  for column in self.data.columns:
175
- if column in [TARGET, EVAL_SET_INDEX, SYSTEM_RECORD_ID]:
179
+ if column in [TARGET, EVAL_SET_INDEX, SYSTEM_RECORD_ID, ENTITY_SYSTEM_RECORD_ID, SEARCH_KEY_UNNEST]:
176
180
  self.columns_renaming[column] = column
177
181
  new_columns.append(column)
178
182
  continue
@@ -353,7 +357,9 @@ class Dataset: # (pd.DataFrame):
353
357
 
354
358
  if is_string_dtype(self.data[postal_code]) or is_object_dtype(self.data[postal_code]):
355
359
  try:
356
- self.data[postal_code] = self.data[postal_code].astype("float64").astype("Int64").astype("string")
360
+ self.data[postal_code] = (
361
+ self.data[postal_code].astype("string").astype("Float64").astype("Int64").astype("string")
362
+ )
357
363
  except Exception:
358
364
  pass
359
365
  elif is_float_dtype(self.data[postal_code]):
@@ -803,6 +809,9 @@ class Dataset: # (pd.DataFrame):
803
809
  meaningType=meaning_type,
804
810
  minMaxValues=min_max_values,
805
811
  )
812
+ if self.unnest_search_keys and column_meta.originalName in self.unnest_search_keys:
813
+ column_meta.isUnnest = True
814
+ column_meta.unnestKeyNames = self.unnest_search_keys[column_meta.originalName]
806
815
 
807
816
  columns.append(column_meta)
808
817
 
upgini/features_enricher.py CHANGED
@@ -11,6 +11,7 @@ import sys
11
11
  import tempfile
12
12
  import time
13
13
  import uuid
14
+ from collections import Counter
14
15
  from dataclasses import dataclass
15
16
  from threading import Thread
16
17
  from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union
@@ -45,9 +46,11 @@ from upgini.mdc import MDC
45
46
  from upgini.metadata import (
46
47
  COUNTRY,
47
48
  DEFAULT_INDEX,
49
+ ENTITY_SYSTEM_RECORD_ID,
48
50
  EVAL_SET_INDEX,
49
51
  ORIGINAL_INDEX,
50
52
  RENAMED_INDEX,
53
+ SEARCH_KEY_UNNEST,
51
54
  SORT_ID,
52
55
  SYSTEM_RECORD_ID,
53
56
  TARGET,
@@ -248,7 +251,7 @@ class FeaturesEnricher(TransformerMixin):
248
251
  self.__cached_sampled_datasets: Optional[Tuple[pd.DataFrame, pd.DataFrame, pd.Series, Dict, Dict]] = None
249
252
 
250
253
  validate_version(self.logger)
251
- self.search_keys = search_keys or dict()
254
+ self.search_keys = search_keys or {}
252
255
  self.country_code = country_code
253
256
  self.__validate_search_keys(search_keys, search_id)
254
257
  self.model_task_type = model_task_type
@@ -1200,7 +1203,7 @@ class FeaturesEnricher(TransformerMixin):
1200
1203
  email_column = self._get_email_column(search_keys)
1201
1204
  hem_column = self._get_hem_column(search_keys)
1202
1205
  if email_column:
1203
- converter = EmailSearchKeyConverter(email_column, hem_column, search_keys, self.logger)
1206
+ converter = EmailSearchKeyConverter(email_column, hem_column, search_keys, [], self.logger)
1204
1207
  extended_X = converter.convert(extended_X)
1205
1208
  generated_features.extend(converter.generated_features)
1206
1209
  if (
@@ -1353,7 +1356,7 @@ class FeaturesEnricher(TransformerMixin):
1353
1356
  not in (
1354
1357
  excluding_search_keys
1355
1358
  + list(self.fit_dropped_features)
1356
- + [DateTimeSearchKeyConverter.DATETIME_COL, SYSTEM_RECORD_ID]
1359
+ + [DateTimeSearchKeyConverter.DATETIME_COL, SYSTEM_RECORD_ID, ENTITY_SYSTEM_RECORD_ID]
1357
1360
  )
1358
1361
  ]
1359
1362
 
@@ -1417,7 +1420,7 @@ class FeaturesEnricher(TransformerMixin):
1417
1420
  fitting_enriched_X[col].astype("string").str.replace(",", ".", regex=False).astype(np.float64)
1418
1421
  )
1419
1422
 
1420
- fitting_eval_set_dict = dict()
1423
+ fitting_eval_set_dict = {}
1421
1424
  for idx, eval_tuple in eval_set_sampled_dict.items():
1422
1425
  eval_X_sampled, enriched_eval_X, eval_y_sampled = eval_tuple
1423
1426
  eval_X_sorted, eval_y_sorted = self._sort_by_system_record_id(eval_X_sampled, eval_y_sampled, self.cv)
@@ -1534,7 +1537,7 @@ class FeaturesEnricher(TransformerMixin):
1534
1537
  def __sample_only_input(
1535
1538
  self, validated_X: pd.DataFrame, validated_y: pd.Series, eval_set: Optional[List[tuple]], is_demo_dataset: bool
1536
1539
  ) -> _SampledDataForMetrics:
1537
- eval_set_sampled_dict = dict()
1540
+ eval_set_sampled_dict = {}
1538
1541
 
1539
1542
  df = validated_X.copy()
1540
1543
  df[TARGET] = validated_y
@@ -1560,7 +1563,7 @@ class FeaturesEnricher(TransformerMixin):
1560
1563
  df = df.sample(n=sample_rows, random_state=self.random_state)
1561
1564
 
1562
1565
  df_extended, search_keys = self._extend_x(df, is_demo_dataset)
1563
- df_extended = self.__add_fit_system_record_id(df_extended, dict(), search_keys)
1566
+ df_extended = self.__add_fit_system_record_id(df_extended, {}, search_keys, SYSTEM_RECORD_ID)
1564
1567
 
1565
1568
  train_df = df_extended.query(f"{EVAL_SET_INDEX} == 0") if eval_set is not None else df_extended
1566
1569
  X_sampled = train_df.drop(columns=[TARGET, EVAL_SET_INDEX], errors="ignore")
@@ -1584,7 +1587,7 @@ class FeaturesEnricher(TransformerMixin):
1584
1587
  trace_id: str,
1585
1588
  remove_outliers_calc_metrics: Optional[bool],
1586
1589
  ) -> _SampledDataForMetrics:
1587
- eval_set_sampled_dict = dict()
1590
+ eval_set_sampled_dict = {}
1588
1591
  search_keys = self.fit_search_keys
1589
1592
 
1590
1593
  rows_to_drop = None
@@ -1598,8 +1601,7 @@ class FeaturesEnricher(TransformerMixin):
1598
1601
  outliers = pd.merge(
1599
1602
  self.df_with_original_index,
1600
1603
  target_outliers_df,
1601
- left_on=SYSTEM_RECORD_ID,
1602
- right_on=SYSTEM_RECORD_ID,
1604
+ on=ENTITY_SYSTEM_RECORD_ID,
1603
1605
  how="inner",
1604
1606
  )
1605
1607
  top_outliers = outliers.sort_values(by=TARGET, ascending=False)[TARGET].head(3)
@@ -1658,7 +1660,7 @@ class FeaturesEnricher(TransformerMixin):
1658
1660
  progress_bar: Optional[ProgressBar],
1659
1661
  progress_callback: Optional[Callable[[SearchProgress], Any]],
1660
1662
  ) -> _SampledDataForMetrics:
1661
- eval_set_sampled_dict = dict()
1663
+ eval_set_sampled_dict = {}
1662
1664
  if eval_set is not None:
1663
1665
  self.logger.info("Transform with eval_set")
1664
1666
  # concatenate X and eval_set with eval_set_index
@@ -1680,7 +1682,7 @@ class FeaturesEnricher(TransformerMixin):
1680
1682
  self.logger.info(f"Downsampling from {num_samples} to {Dataset.FIT_SAMPLE_WITH_EVAL_SET_ROWS}")
1681
1683
  df = df.sample(n=Dataset.FIT_SAMPLE_WITH_EVAL_SET_ROWS, random_state=self.random_state)
1682
1684
 
1683
- eval_set_sampled_dict = dict()
1685
+ eval_set_sampled_dict = {}
1684
1686
 
1685
1687
  tmp_target_name = "__target"
1686
1688
  df = df.rename(columns={TARGET: tmp_target_name})
@@ -1943,11 +1945,38 @@ class FeaturesEnricher(TransformerMixin):
1943
1945
  self.logger.info("Input dataset hasn't date column")
1944
1946
  if self.add_date_if_missing:
1945
1947
  df = self._add_current_date_as_key(df, search_keys, self.logger, self.bundle)
1948
+
1949
+ # Don't pass all features in backend on transform
1950
+ original_features_for_transform = []
1951
+ runtime_parameters = self._get_copy_of_runtime_parameters()
1952
+ features_not_to_pass = [column for column in df.columns if column not in search_keys.keys()]
1953
+ if len(features_not_to_pass) > 0:
1954
+ # Pass only features that need for transform
1955
+ features_for_transform = self._search_task.get_features_for_transform()
1956
+ if features_for_transform is not None and len(features_for_transform) > 0:
1957
+ file_metadata = self._search_task.get_file_metadata(trace_id)
1958
+ original_features_for_transform = [
1959
+ c.originalName or c.name for c in file_metadata.columns if c.name in features_for_transform
1960
+ ]
1961
+
1962
+ runtime_parameters.properties["features_for_embeddings"] = ",".join(features_for_transform)
1963
+
1964
+ columns_for_system_record_id = sorted(list(search_keys.keys()) + (original_features_for_transform))
1965
+
1966
+ df[ENTITY_SYSTEM_RECORD_ID] = pd.util.hash_pandas_object(
1967
+ df[columns_for_system_record_id], index=False
1968
+ ).astype("Float64")
1969
+
1970
+ # Explode multiple search keys
1971
+ df, unnest_search_keys = self._explode_multiple_search_keys(df, search_keys)
1972
+
1946
1973
  email_column = self._get_email_column(search_keys)
1947
1974
  hem_column = self._get_hem_column(search_keys)
1948
1975
  email_converted_to_hem = False
1949
1976
  if email_column:
1950
- converter = EmailSearchKeyConverter(email_column, hem_column, search_keys, self.logger)
1977
+ converter = EmailSearchKeyConverter(
1978
+ email_column, hem_column, search_keys, list(unnest_search_keys.keys()), self.logger
1979
+ )
1951
1980
  df = converter.convert(df)
1952
1981
  generated_features.extend(converter.generated_features)
1953
1982
  email_converted_to_hem = converter.email_converted_to_hem
@@ -1961,30 +1990,21 @@ class FeaturesEnricher(TransformerMixin):
1961
1990
  generated_features = [f for f in generated_features if f in self.fit_generated_features]
1962
1991
 
1963
1992
  meaning_types = {col: key.value for col, key in search_keys.items()}
1964
- non_keys_columns = [column for column in df.columns if column not in search_keys.keys()]
1993
+ # non_keys_columns = [column for column in df.columns if column not in search_keys.keys()]
1994
+ for col in original_features_for_transform:
1995
+ meaning_types[col] = FileColumnMeaningType.FEATURE
1996
+ features_not_to_pass = [column for column in features_not_to_pass if column not in search_keys.keys()]
1965
1997
 
1966
1998
  if email_converted_to_hem:
1967
- non_keys_columns.append(email_column)
1999
+ features_not_to_pass.append(email_column)
1968
2000
 
1969
- # Don't pass features in backend on transform
1970
- original_features_for_transform = None
1971
- runtime_parameters = self._get_copy_of_runtime_parameters()
1972
- if len(non_keys_columns) > 0:
1973
- # Pass only features that need for transform
1974
- features_for_transform = self._search_task.get_features_for_transform()
1975
- if features_for_transform is not None and len(features_for_transform) > 0:
1976
- file_metadata = self._search_task.get_file_metadata(trace_id)
1977
- original_features_for_transform = [
1978
- c.originalName or c.name for c in file_metadata.columns if c.name in features_for_transform
1979
- ]
1980
- non_keys_columns = [c for c in non_keys_columns if c not in original_features_for_transform]
1981
-
1982
- runtime_parameters.properties["features_for_embeddings"] = ",".join(features_for_transform)
2001
+ features_not_to_pass = [c for c in features_not_to_pass if c not in original_features_for_transform]
2002
+ columns_for_system_record_id = sorted(list(search_keys.keys()) + (original_features_for_transform))
1983
2003
 
1984
2004
  if add_fit_system_record_id:
1985
- df = self.__add_fit_system_record_id(df, dict(), search_keys)
2005
+ df = self.__add_fit_system_record_id(df, {}, search_keys, SYSTEM_RECORD_ID)
1986
2006
  df = df.rename(columns={SYSTEM_RECORD_ID: SORT_ID})
1987
- non_keys_columns.append(SORT_ID)
2007
+ features_not_to_pass.append(SORT_ID)
1988
2008
 
1989
2009
  columns_for_system_record_id = sorted(list(search_keys.keys()) + (original_features_for_transform or []))
1990
2010
 
@@ -1992,16 +2012,19 @@ class FeaturesEnricher(TransformerMixin):
1992
2012
  "Float64"
1993
2013
  )
1994
2014
  meaning_types[SYSTEM_RECORD_ID] = FileColumnMeaningType.SYSTEM_RECORD_ID
2015
+ meaning_types[ENTITY_SYSTEM_RECORD_ID] = FileColumnMeaningType.ENTITY_SYSTEM_RECORD_ID
2016
+ if SEARCH_KEY_UNNEST in df.columns:
2017
+ meaning_types[SEARCH_KEY_UNNEST] = FileColumnMeaningType.UNNEST_KEY
1995
2018
 
1996
2019
  df = df.reset_index(drop=True)
1997
- system_columns_with_original_index = [SYSTEM_RECORD_ID] + generated_features
2020
+ system_columns_with_original_index = [SYSTEM_RECORD_ID, ENTITY_SYSTEM_RECORD_ID] + generated_features
1998
2021
  if add_fit_system_record_id:
1999
2022
  system_columns_with_original_index.append(SORT_ID)
2000
2023
  df_with_original_index = df[system_columns_with_original_index].copy()
2001
2024
 
2002
2025
  combined_search_keys = combine_search_keys(search_keys.keys())
2003
2026
 
2004
- df_without_features = df.drop(columns=non_keys_columns)
2027
+ df_without_features = df.drop(columns=features_not_to_pass)
2005
2028
 
2006
2029
  df_without_features = clean_full_duplicates(
2007
2030
  df_without_features, self.logger, silent=silent_mode, bundle=self.bundle
@@ -2013,12 +2036,13 @@ class FeaturesEnricher(TransformerMixin):
2013
2036
  dataset = Dataset(
2014
2037
  "sample_" + str(uuid.uuid4()),
2015
2038
  df=df_without_features,
2039
+ meaning_types=meaning_types,
2040
+ search_keys=combined_search_keys,
2041
+ unnest_search_keys=unnest_search_keys,
2016
2042
  date_format=self.date_format,
2017
2043
  rest_client=self.rest_client,
2018
2044
  logger=self.logger,
2019
2045
  )
2020
- dataset.meaning_types = meaning_types
2021
- dataset.search_keys = combined_search_keys
2022
2046
  if email_converted_to_hem:
2023
2047
  dataset.ignore_columns = [email_column]
2024
2048
 
@@ -2157,6 +2181,14 @@ class FeaturesEnricher(TransformerMixin):
2157
2181
 
2158
2182
  key_types = search_keys.values()
2159
2183
 
2184
+ # Multiple search keys allowed only for PHONE, IP, POSTAL_CODE, EMAIL, HEM
2185
+ multi_keys = [key for key, count in Counter(key_types).items() if count > 1]
2186
+ for multi_key in multi_keys:
2187
+ if multi_key not in [SearchKey.PHONE, SearchKey.IP, SearchKey.POSTAL_CODE, SearchKey.EMAIL, SearchKey.HEM]:
2188
+ msg = self.bundle.get("unsupported_multi_key").format(multi_key)
2189
+ self.logger.warning(msg)
2190
+ raise ValidationError(msg)
2191
+
2160
2192
  if SearchKey.DATE in key_types and SearchKey.DATETIME in key_types:
2161
2193
  msg = self.bundle.get("date_and_datetime_simultanious")
2162
2194
  self.logger.warning(msg)
@@ -2172,11 +2204,11 @@ class FeaturesEnricher(TransformerMixin):
2172
2204
  self.logger.warning(msg)
2173
2205
  raise ValidationError(msg)
2174
2206
 
2175
- for key_type in SearchKey.__members__.values():
2176
- if key_type != SearchKey.CUSTOM_KEY and list(key_types).count(key_type) > 1:
2177
- msg = self.bundle.get("multiple_search_key").format(key_type)
2178
- self.logger.warning(msg)
2179
- raise ValidationError(msg)
2207
+ # for key_type in SearchKey.__members__.values():
2208
+ # if key_type != SearchKey.CUSTOM_KEY and list(key_types).count(key_type) > 1:
2209
+ # msg = self.bundle.get("multiple_search_key").format(key_type)
2210
+ # self.logger.warning(msg)
2211
+ # raise ValidationError(msg)
2180
2212
 
2181
2213
  # non_personal_keys = set(SearchKey.__members__.values()) - set(SearchKey.personal_keys())
2182
2214
  # if (
@@ -2314,14 +2346,7 @@ class FeaturesEnricher(TransformerMixin):
2314
2346
  self.logger.info("Input dataset hasn't date column")
2315
2347
  if self.add_date_if_missing:
2316
2348
  df = self._add_current_date_as_key(df, self.fit_search_keys, self.logger, self.bundle)
2317
- email_column = self._get_email_column(self.fit_search_keys)
2318
- hem_column = self._get_hem_column(self.fit_search_keys)
2319
- email_converted_to_hem = False
2320
- if email_column:
2321
- converter = EmailSearchKeyConverter(email_column, hem_column, self.fit_search_keys, self.logger)
2322
- df = converter.convert(df)
2323
- self.fit_generated_features.extend(converter.generated_features)
2324
- email_converted_to_hem = converter.email_converted_to_hem
2349
+
2325
2350
  if (
2326
2351
  self.detect_missing_search_keys
2327
2352
  and list(self.fit_search_keys.values()) == [SearchKey.DATE]
@@ -2330,7 +2355,37 @@ class FeaturesEnricher(TransformerMixin):
2330
2355
  converter = IpToCountrySearchKeyConverter(self.fit_search_keys, self.logger)
2331
2356
  df = converter.convert(df)
2332
2357
 
2358
+ # Explode multiple search keys
2333
2359
  non_feature_columns = [self.TARGET_NAME, EVAL_SET_INDEX] + list(self.fit_search_keys.keys())
2360
+ meaning_types = {
2361
+ **{col: key.value for col, key in self.fit_search_keys.items()},
2362
+ **{str(c): FileColumnMeaningType.FEATURE for c in df.columns if c not in non_feature_columns},
2363
+ }
2364
+ meaning_types[self.TARGET_NAME] = FileColumnMeaningType.TARGET
2365
+ if eval_set is not None and len(eval_set) > 0:
2366
+ meaning_types[EVAL_SET_INDEX] = FileColumnMeaningType.EVAL_SET_INDEX
2367
+ df = self.__add_fit_system_record_id(df, meaning_types, self.fit_search_keys, ENTITY_SYSTEM_RECORD_ID)
2368
+
2369
+ # TODO check that this is correct for enrichment
2370
+ self.df_with_original_index = df.copy()
2371
+
2372
+ df, unnest_search_keys = self._explode_multiple_search_keys(df, self.fit_search_keys)
2373
+
2374
+ # Convert EMAIL to HEM after unnesting to do it only with one column
2375
+ email_column = self._get_email_column(self.fit_search_keys)
2376
+ hem_column = self._get_hem_column(self.fit_search_keys)
2377
+ email_converted_to_hem = False
2378
+ if email_column:
2379
+ converter = EmailSearchKeyConverter(
2380
+ email_column, hem_column, self.fit_search_keys, list(unnest_search_keys.keys()), self.logger
2381
+ )
2382
+ df = converter.convert(df)
2383
+ self.fit_generated_features.extend(converter.generated_features)
2384
+ email_converted_to_hem = converter.email_converted_to_hem
2385
+
2386
+ non_feature_columns = [self.TARGET_NAME, EVAL_SET_INDEX, ENTITY_SYSTEM_RECORD_ID, SEARCH_KEY_UNNEST] + list(
2387
+ self.fit_search_keys.keys()
2388
+ )
2334
2389
  if email_converted_to_hem:
2335
2390
  non_feature_columns.append(email_column)
2336
2391
  if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
@@ -2354,12 +2409,14 @@ class FeaturesEnricher(TransformerMixin):
2354
2409
  **{str(c): FileColumnMeaningType.FEATURE for c in df.columns if c not in non_feature_columns},
2355
2410
  }
2356
2411
  meaning_types[self.TARGET_NAME] = FileColumnMeaningType.TARGET
2412
+ meaning_types[ENTITY_SYSTEM_RECORD_ID] = FileColumnMeaningType.ENTITY_SYSTEM_RECORD_ID
2413
+ if SEARCH_KEY_UNNEST in df.columns:
2414
+ meaning_types[SEARCH_KEY_UNNEST] = FileColumnMeaningType.UNNEST_KEY
2357
2415
  if eval_set is not None and len(eval_set) > 0:
2358
2416
  meaning_types[EVAL_SET_INDEX] = FileColumnMeaningType.EVAL_SET_INDEX
2359
2417
 
2360
- df = self.__add_fit_system_record_id(df, meaning_types, self.fit_search_keys)
2418
+ df = self.__add_fit_system_record_id(df, meaning_types, self.fit_search_keys, SYSTEM_RECORD_ID)
2361
2419
 
2362
- self.df_with_original_index = df.copy()
2363
2420
  df = df.reset_index(drop=True).sort_values(by=SYSTEM_RECORD_ID).reset_index(drop=True)
2364
2421
 
2365
2422
  combined_search_keys = combine_search_keys(self.fit_search_keys.keys())
@@ -2367,14 +2424,15 @@ class FeaturesEnricher(TransformerMixin):
2367
2424
  dataset = Dataset(
2368
2425
  "tds_" + str(uuid.uuid4()),
2369
2426
  df=df,
2427
+ meaning_types=meaning_types,
2428
+ search_keys=combined_search_keys,
2429
+ unnest_search_keys=unnest_search_keys,
2370
2430
  model_task_type=model_task_type,
2371
2431
  date_format=self.date_format,
2372
2432
  random_state=self.random_state,
2373
2433
  rest_client=self.rest_client,
2374
2434
  logger=self.logger,
2375
2435
  )
2376
- dataset.meaning_types = meaning_types
2377
- dataset.search_keys = combined_search_keys
2378
2436
  if email_converted_to_hem:
2379
2437
  dataset.ignore_columns = [email_column]
2380
2438
 
@@ -2744,9 +2802,10 @@ class FeaturesEnricher(TransformerMixin):
2744
2802
  X: pd.DataFrame, y: pd.Series, cv: Optional[CVType]
2745
2803
  ) -> Tuple[pd.DataFrame, pd.Series]:
2746
2804
  if cv not in [CVType.time_series, CVType.blocked_time_series]:
2805
+ record_id_column = ENTITY_SYSTEM_RECORD_ID if ENTITY_SYSTEM_RECORD_ID in X else SYSTEM_RECORD_ID
2747
2806
  Xy = X.copy()
2748
2807
  Xy[TARGET] = y
2749
- Xy = Xy.sort_values(by=SYSTEM_RECORD_ID).reset_index(drop=True)
2808
+ Xy = Xy.sort_values(by=record_id_column).reset_index(drop=True)
2750
2809
  X = Xy.drop(columns=TARGET)
2751
2810
  y = Xy[TARGET].copy()
2752
2811
 
@@ -2925,15 +2984,19 @@ class FeaturesEnricher(TransformerMixin):
2925
2984
 
2926
2985
  @staticmethod
2927
2986
  def _get_email_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
2928
- for col, t in search_keys.items():
2929
- if t == SearchKey.EMAIL:
2930
- return col
2987
+ cols = [col for col, t in search_keys.items() if t == SearchKey.EMAIL]
2988
+ if len(cols) > 1:
2989
+ raise Exception("More than one email column found after unnest")
2990
+ if len(cols) == 1:
2991
+ return cols[0]
2931
2992
 
2932
2993
  @staticmethod
2933
2994
  def _get_hem_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
2934
- for col, t in search_keys.items():
2935
- if t == SearchKey.HEM:
2936
- return col
2995
+ cols = [col for col, t in search_keys.items() if t == SearchKey.HEM]
2996
+ if len(cols) > 1:
2997
+ raise Exception("More than one hem column found after unnest")
2998
+ if len(cols) == 1:
2999
+ return cols[0]
2937
3000
 
2938
3001
  @staticmethod
2939
3002
  def _get_phone_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
@@ -2941,8 +3004,44 @@ class FeaturesEnricher(TransformerMixin):
2941
3004
  if t == SearchKey.PHONE:
2942
3005
  return col
2943
3006
 
3007
+ def _explode_multiple_search_keys(
3008
+ self, df: pd.DataFrame, search_keys: Dict[str, SearchKey]
3009
+ ) -> Tuple[pd.DataFrame, Dict[str, List[str]]]:
3010
+ # find groups of multiple search keys
3011
+ search_key_names_by_type: Dict[SearchKey, str] = {}
3012
+ for key_name, key_type in search_keys.items():
3013
+ search_key_names_by_type[key_type] = search_key_names_by_type.get(key_type, []) + [key_name]
3014
+ search_key_names_by_type = {
3015
+ key_type: key_names for key_type, key_names in search_key_names_by_type.items() if len(key_names) > 1
3016
+ }
3017
+ if len(search_key_names_by_type) == 0:
3018
+ return df, {}
3019
+
3020
+ multiple_keys_columns = [col for cols in search_key_names_by_type.values() for col in cols]
3021
+ other_columns = [col for col in df.columns if col not in multiple_keys_columns]
3022
+ exploded_dfs = []
3023
+ unnest_search_keys = {}
3024
+
3025
+ for key_type, key_names in search_key_names_by_type.items():
3026
+ new_search_key = f"upgini_{key_type.name.lower()}_unnest"
3027
+ exploded_df = pd.melt(
3028
+ df, id_vars=other_columns, value_vars=key_names, var_name=SEARCH_KEY_UNNEST, value_name=new_search_key
3029
+ )
3030
+ exploded_dfs.append(exploded_df)
3031
+ for old_key in key_names:
3032
+ del search_keys[old_key]
3033
+ search_keys[new_search_key] = key_type
3034
+ unnest_search_keys[new_search_key] = key_names
3035
+
3036
+ df = pd.concat(exploded_dfs, ignore_index=True)
3037
+ return df, unnest_search_keys
3038
+
2944
3039
  def __add_fit_system_record_id(
2945
- self, df: pd.DataFrame, meaning_types: Dict[str, FileColumnMeaningType], search_keys: Dict[str, SearchKey]
3040
+ self,
3041
+ df: pd.DataFrame,
3042
+ meaning_types: Dict[str, FileColumnMeaningType],
3043
+ search_keys: Dict[str, SearchKey],
3044
+ id_name: str,
2946
3045
  ) -> pd.DataFrame:
2947
3046
  # save original order or rows
2948
3047
  original_index_name = df.index.name
@@ -2953,7 +3052,14 @@ class FeaturesEnricher(TransformerMixin):
2953
3052
 
2954
3053
  # order by date and idempotent order by other keys
2955
3054
  if self.cv not in [CVType.time_series, CVType.blocked_time_series]:
2956
- sort_exclude_columns = [original_order_name, ORIGINAL_INDEX, EVAL_SET_INDEX, TARGET, "__target"]
3055
+ sort_exclude_columns = [
3056
+ original_order_name,
3057
+ ORIGINAL_INDEX,
3058
+ EVAL_SET_INDEX,
3059
+ TARGET,
3060
+ "__target",
3061
+ ENTITY_SYSTEM_RECORD_ID,
3062
+ ]
2957
3063
  if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
2958
3064
  date_column = DateTimeSearchKeyConverter.DATETIME_COL
2959
3065
  sort_exclude_columns.append(self._get_date_column(search_keys))
@@ -2991,14 +3097,18 @@ class FeaturesEnricher(TransformerMixin):
2991
3097
 
2992
3098
  df = df.reset_index(drop=True).reset_index()
2993
3099
  # system_record_id saves correct order for fit
2994
- df = df.rename(columns={DEFAULT_INDEX: SYSTEM_RECORD_ID})
3100
+ df = df.rename(columns={DEFAULT_INDEX: id_name})
2995
3101
 
2996
3102
  # return original order
2997
3103
  df = df.set_index(ORIGINAL_INDEX)
2998
3104
  df.index.name = original_index_name
2999
3105
  df = df.sort_values(by=original_order_name).drop(columns=original_order_name)
3000
3106
 
3001
- meaning_types[SYSTEM_RECORD_ID] = FileColumnMeaningType.SYSTEM_RECORD_ID
3107
+ meaning_types[id_name] = (
3108
+ FileColumnMeaningType.SYSTEM_RECORD_ID
3109
+ if id_name == SYSTEM_RECORD_ID
3110
+ else FileColumnMeaningType.ENTITY_SYSTEM_RECORD_ID
3111
+ )
3002
3112
  return df
3003
3113
 
3004
3114
  def __correct_target(self, df: pd.DataFrame) -> pd.DataFrame:
@@ -3053,7 +3163,11 @@ class FeaturesEnricher(TransformerMixin):
3053
3163
  )
3054
3164
 
3055
3165
  comparing_columns = X.columns if is_transform else df_with_original_index.columns
3056
- dup_features = [c for c in comparing_columns if c in result_features.columns and c != SYSTEM_RECORD_ID]
3166
+ dup_features = [
3167
+ c
3168
+ for c in comparing_columns
3169
+ if c in result_features.columns and c not in [SYSTEM_RECORD_ID, ENTITY_SYSTEM_RECORD_ID]
3170
+ ]
3057
3171
  if len(dup_features) > 0:
3058
3172
  self.logger.warning(f"X contain columns with same name as returned from backend: {dup_features}")
3059
3173
  raise ValidationError(self.bundle.get("returned_features_same_as_passed").format(dup_features))
@@ -3064,8 +3178,7 @@ class FeaturesEnricher(TransformerMixin):
3064
3178
  result_features = pd.merge(
3065
3179
  df_with_original_index,
3066
3180
  result_features,
3067
- left_on=SYSTEM_RECORD_ID,
3068
- right_on=SYSTEM_RECORD_ID,
3181
+ on=ENTITY_SYSTEM_RECORD_ID,
3069
3182
  how="left" if is_transform else "inner",
3070
3183
  )
3071
3184
  result_features = result_features.set_index(original_index_name or DEFAULT_INDEX)
@@ -3076,7 +3189,7 @@ class FeaturesEnricher(TransformerMixin):
3076
3189
  result_features = result_features[~result_features[SYSTEM_RECORD_ID].isin(rows_to_drop[SYSTEM_RECORD_ID])]
3077
3190
  self.logger.info(f"After dropping target outliers size: {len(result_features)}")
3078
3191
 
3079
- result_eval_sets = dict()
3192
+ result_eval_sets = {}
3080
3193
  if not is_transform and EVAL_SET_INDEX in result_features.columns:
3081
3194
  result_train_features = result_features.loc[result_features[EVAL_SET_INDEX] == 0].copy()
3082
3195
  eval_set_indices = list(result_features[EVAL_SET_INDEX].unique())
@@ -3288,7 +3401,7 @@ class FeaturesEnricher(TransformerMixin):
3288
3401
  if autofe_feature.op.is_vector:
3289
3402
  continue
3290
3403
 
3291
- description = dict()
3404
+ description = {}
3292
3405
 
3293
3406
  feature_meta = get_feature_by_name(autofe_feature.get_display_name(shorten=True))
3294
3407
  if feature_meta is None:
@@ -3454,13 +3567,13 @@ class FeaturesEnricher(TransformerMixin):
3454
3567
  self.warning_counter.increment()
3455
3568
 
3456
3569
  if len(valid_search_keys) == 1:
3457
- for k, v in valid_search_keys.items():
3458
- # Show warning for country only if country is the only key
3459
- if x[k].nunique() == 1 and (v != SearchKey.COUNTRY or len(valid_search_keys) == 1):
3460
- msg = self.bundle.get("single_constant_search_key").format(v, x[k].values[0])
3461
- print(msg)
3462
- self.logger.warning(msg)
3463
- self.warning_counter.increment()
3570
+ key, value = list(valid_search_keys.items())[0]
3571
+ # Show warning for country only if country is the only key
3572
+ if x[key].nunique() == 1:
3573
+ msg = self.bundle.get("single_constant_search_key").format(value, x[key].values[0])
3574
+ print(msg)
3575
+ self.logger.warning(msg)
3576
+ self.warning_counter.increment()
3464
3577
 
3465
3578
  self.logger.info(f"Prepared search keys: {valid_search_keys}")
3466
3579
 
@@ -3570,61 +3683,68 @@ class FeaturesEnricher(TransformerMixin):
3570
3683
  def check_need_detect(search_key: SearchKey):
3571
3684
  return not is_transform or search_key in self.fit_search_keys.values()
3572
3685
 
3573
- if SearchKey.POSTAL_CODE not in search_keys.values() and check_need_detect(SearchKey.POSTAL_CODE):
3574
- maybe_key = PostalCodeSearchKeyDetector().get_search_key_column(sample)
3575
- if maybe_key is not None:
3576
- search_keys[maybe_key] = SearchKey.POSTAL_CODE
3577
- self.autodetected_search_keys[maybe_key] = SearchKey.POSTAL_CODE
3578
- self.logger.info(f"Autodetected search key POSTAL_CODE in column {maybe_key}")
3686
+ # if SearchKey.POSTAL_CODE not in search_keys.values() and check_need_detect(SearchKey.POSTAL_CODE):
3687
+ if check_need_detect(SearchKey.POSTAL_CODE):
3688
+ maybe_keys = PostalCodeSearchKeyDetector().get_search_key_columns(sample, search_keys)
3689
+ if maybe_keys:
3690
+ new_keys = {key: SearchKey.POSTAL_CODE for key in maybe_keys}
3691
+ search_keys.update(new_keys)
3692
+ self.autodetected_search_keys.update(new_keys)
3693
+ self.logger.info(f"Autodetected search key POSTAL_CODE in column {maybe_keys}")
3579
3694
  if not silent_mode:
3580
- print(self.bundle.get("postal_code_detected").format(maybe_key))
3695
+ print(self.bundle.get("postal_code_detected").format(maybe_keys))
3581
3696
 
3582
3697
  if (
3583
3698
  SearchKey.COUNTRY not in search_keys.values()
3584
3699
  and self.country_code is None
3585
3700
  and check_need_detect(SearchKey.COUNTRY)
3586
3701
  ):
3587
- maybe_key = CountrySearchKeyDetector().get_search_key_column(sample)
3588
- if maybe_key is not None:
3589
- search_keys[maybe_key] = SearchKey.COUNTRY
3590
- self.autodetected_search_keys[maybe_key] = SearchKey.COUNTRY
3702
+ maybe_key = CountrySearchKeyDetector().get_search_key_columns(sample, search_keys)
3703
+ if maybe_key:
3704
+ search_keys[maybe_key[0]] = SearchKey.COUNTRY
3705
+ self.autodetected_search_keys[maybe_key[0]] = SearchKey.COUNTRY
3591
3706
  self.logger.info(f"Autodetected search key COUNTRY in column {maybe_key}")
3592
3707
  if not silent_mode:
3593
3708
  print(self.bundle.get("country_detected").format(maybe_key))
3594
3709
 
3595
3710
  if (
3596
- SearchKey.EMAIL not in search_keys.values()
3597
- and SearchKey.HEM not in search_keys.values()
3711
+ # SearchKey.EMAIL not in search_keys.values()
3712
+ SearchKey.HEM not in search_keys.values()
3598
3713
  and check_need_detect(SearchKey.HEM)
3599
3714
  ):
3600
- maybe_key = EmailSearchKeyDetector().get_search_key_column(sample)
3601
- if maybe_key is not None and maybe_key not in search_keys.keys():
3715
+ maybe_keys = EmailSearchKeyDetector().get_search_key_columns(sample, search_keys)
3716
+ if maybe_keys:
3602
3717
  if self.__is_registered or is_demo_dataset:
3603
- search_keys[maybe_key] = SearchKey.EMAIL
3604
- self.autodetected_search_keys[maybe_key] = SearchKey.EMAIL
3605
- self.logger.info(f"Autodetected search key EMAIL in column {maybe_key}")
3718
+ new_keys = {key: SearchKey.EMAIL for key in maybe_keys}
3719
+ search_keys.update(new_keys)
3720
+ self.autodetected_search_keys.update(new_keys)
3721
+ self.logger.info(f"Autodetected search key EMAIL in column {maybe_keys}")
3606
3722
  if not silent_mode:
3607
- print(self.bundle.get("email_detected").format(maybe_key))
3723
+ print(self.bundle.get("email_detected").format(maybe_keys))
3608
3724
  else:
3609
3725
  self.logger.warning(
3610
- f"Autodetected search key EMAIL in column {maybe_key}. But not used because not registered user"
3726
+ f"Autodetected search key EMAIL in column {maybe_keys}."
3727
+ " But not used because not registered user"
3611
3728
  )
3612
3729
  if not silent_mode:
3613
- print(self.bundle.get("email_detected_not_registered").format(maybe_key))
3730
+ print(self.bundle.get("email_detected_not_registered").format(maybe_keys))
3614
3731
  self.warning_counter.increment()
3615
3732
 
3616
- if SearchKey.PHONE not in search_keys.values() and check_need_detect(SearchKey.PHONE):
3617
- maybe_key = PhoneSearchKeyDetector().get_search_key_column(sample)
3618
- if maybe_key is not None and maybe_key not in search_keys.keys():
3733
+ # if SearchKey.PHONE not in search_keys.values() and check_need_detect(SearchKey.PHONE):
3734
+ if check_need_detect(SearchKey.PHONE):
3735
+ maybe_keys = PhoneSearchKeyDetector().get_search_key_columns(sample, search_keys)
3736
+ if maybe_keys:
3619
3737
  if self.__is_registered or is_demo_dataset:
3620
- search_keys[maybe_key] = SearchKey.PHONE
3621
- self.autodetected_search_keys[maybe_key] = SearchKey.PHONE
3622
- self.logger.info(f"Autodetected search key PHONE in column {maybe_key}")
3738
+ new_keys = {key: SearchKey.PHONE for key in maybe_keys}
3739
+ search_keys.update(new_keys)
3740
+ self.autodetected_search_keys.update(new_keys)
3741
+ self.logger.info(f"Autodetected search key PHONE in column {maybe_keys}")
3623
3742
  if not silent_mode:
3624
- print(self.bundle.get("phone_detected").format(maybe_key))
3743
+ print(self.bundle.get("phone_detected").format(maybe_keys))
3625
3744
  else:
3626
3745
  self.logger.warning(
3627
- f"Autodetected search key PHONE in column {maybe_key}. But not used because not registered user"
3746
+ f"Autodetected search key PHONE in column {maybe_keys}. "
3747
+ "But not used because not registered user"
3628
3748
  )
3629
3749
  if not silent_mode:
3630
3750
  print(self.bundle.get("phone_detected_not_registered"))
upgini/metadata.py CHANGED
@@ -6,6 +6,8 @@ from typing import Dict, List, Optional, Set
6
6
  from pydantic import BaseModel
7
7
 
8
8
  SYSTEM_RECORD_ID = "system_record_id"
9
+ ENTITY_SYSTEM_RECORD_ID = "entity_system_record_id"
10
+ SEARCH_KEY_UNNEST = "search_key_unnest"
9
11
  SORT_ID = "sort_id"
10
12
  EVAL_SET_INDEX = "eval_set_index"
11
13
  TARGET = "target"
@@ -13,7 +15,7 @@ COUNTRY = "country_iso_code"
13
15
  RENAMED_INDEX = "index_col"
14
16
  DEFAULT_INDEX = "index"
15
17
  ORIGINAL_INDEX = "original_index"
16
- SYSTEM_COLUMNS = {SYSTEM_RECORD_ID, EVAL_SET_INDEX, TARGET, COUNTRY, SORT_ID}
18
+ SYSTEM_COLUMNS = {SYSTEM_RECORD_ID, ENTITY_SYSTEM_RECORD_ID, SEARCH_KEY_UNNEST, EVAL_SET_INDEX, TARGET, COUNTRY}
17
19
 
18
20
 
19
21
  class FileColumnMeaningType(Enum):
@@ -39,6 +41,8 @@ class FileColumnMeaningType(Enum):
39
41
  POSTAL_CODE = "POSTAL_CODE"
40
42
  SYSTEM_RECORD_ID = "SYSTEM_RECORD_ID"
41
43
  EVAL_SET_INDEX = "EVAL_SET_INDEX"
44
+ ENTITY_SYSTEM_RECORD_ID = "ENTITY_SYSTEM_RECORD_ID"
45
+ UNNEST_KEY = "UNNEST_KEY"
42
46
 
43
47
 
44
48
  class SearchKey(Enum):
@@ -184,6 +188,10 @@ class FileColumnMetadata(BaseModel):
184
188
  meaningType: FileColumnMeaningType
185
189
  minMaxValues: Optional[NumericInterval] = None
186
190
  originalName: Optional[str]
191
+ # is this column contains keys from multiple key columns like msisdn1, msisdn2
192
+ isUnnest: bool = False
193
+ # list of original etalon key column names like msisdn1, msisdn2
194
+ unnestKeyNames: Optional[list[str]]
187
195
 
188
196
 
189
197
  class FileMetadata(BaseModel):
@@ -281,7 +289,7 @@ class FeaturesFilter(BaseModel):
281
289
 
282
290
 
283
291
  class RuntimeParameters(BaseModel):
284
- properties: Dict[str, str] = dict()
292
+ properties: Dict[str, str] = {}
285
293
 
286
294
 
287
295
  class SearchCustomization(BaseModel):
upgini/metrics.py CHANGED
@@ -369,7 +369,7 @@ class EstimatorWrapper:
369
369
  "logger": logger,
370
370
  }
371
371
  if estimator is None:
372
- params = dict()
372
+ params = {}
373
373
  params["has_time"] = has_date
374
374
  # if metric_name.upper() in SUPPORTED_CATBOOST_METRICS:
375
375
  # params["eval_metric"] = SUPPORTED_CATBOOST_METRICS[metric_name.upper()]
@@ -88,6 +88,7 @@ unsupported_search_key_type=Unsupported type of key in search_keys: {}
88
88
  search_key_country_and_country_code=\nWARNING: SearchKey.COUNTRY and country_code parameter were passed simultaniously. Parameter country_code will be ignored
89
89
  empty_search_key=Search key {} is empty. Please fill values or remove this search key
90
90
  single_constant_search_key=\nWARNING: Constant value detected for the {} search key in the X dataframe: {}.\nThat search key will add constant features for different y values.\nPlease add extra search keys with non constant values, like the COUNTRY, POSTAL_CODE, DATE, PHONE NUMBER, EMAIL/HEM or IPv4
91
+ unsupported_multi_key=Search key {} cannot be used multiple times
91
92
  unsupported_index_column=\nWARNING: Your column with name `index` was dropped because it's reserved name is booked for system needs.
92
93
  date_string_without_format=Date column `{}` has string type, but date_format is not specified. Convert column to datetime type or pass date_format
93
94
  invalid_date_format=Failed to parse date in column `{}`. Try to pass explicit date format in date_format argument of FeaturesEnricher constructor
@@ -1,4 +1,4 @@
1
- from typing import List, Optional
1
+ from typing import List
2
2
 
3
3
  import pandas as pd
4
4
 
@@ -10,16 +10,18 @@ class BaseSearchKeyDetector:
10
10
  def _is_search_key_by_values(self, column: pd.Series) -> bool:
11
11
  raise NotImplementedError
12
12
 
13
- def _get_search_key_by_name(self, column_names: List[str]) -> Optional[str]:
14
- for column_name in column_names:
15
- if self._is_search_key_by_name(column_name):
16
- return column_name
13
+ def _get_search_keys_by_name(self, column_names: List[str]) -> List[str]:
14
+ return [
15
+ column_name
16
+ for column_name in column_names
17
+ if self._is_search_key_by_name(column_name)
18
+ ]
17
19
 
18
- def get_search_key_column(self, df: pd.DataFrame) -> Optional[str]:
19
- maybe_column = self._get_search_key_by_name(df.columns.to_list())
20
- if maybe_column is not None:
21
- return maybe_column
22
-
23
- for column_name in df.columns:
20
+ def get_search_key_columns(self, df: pd.DataFrame, existing_search_keys: List[str]) -> List[str]:
21
+ other_columns = [col for col in df.columns if col not in existing_search_keys]
22
+ columns_by_names = self._get_search_keys_by_name(other_columns)
23
+ columns_by_values = []
24
+ for column_name in other_columns:
24
25
  if self._is_search_key_by_values(df[column_name]):
25
- return column_name
26
+ columns_by_values.append(column_name)
27
+ return list(set(columns_by_names + columns_by_values))
@@ -3,7 +3,15 @@ from typing import Dict, List, Optional, Union
3
3
 
4
4
  import pandas as pd
5
5
 
6
- from upgini.metadata import EVAL_SET_INDEX, SORT_ID, SYSTEM_RECORD_ID, TARGET, ModelTaskType, SearchKey
6
+ from upgini.metadata import (
7
+ ENTITY_SYSTEM_RECORD_ID,
8
+ EVAL_SET_INDEX,
9
+ SORT_ID,
10
+ SYSTEM_RECORD_ID,
11
+ TARGET,
12
+ ModelTaskType,
13
+ SearchKey,
14
+ )
7
15
  from upgini.resource_bundle import ResourceBundle
8
16
  from upgini.utils.datetime_utils import DateTimeSearchKeyConverter
9
17
  from upgini.utils.target_utils import define_task
@@ -143,6 +151,8 @@ def clean_full_duplicates(
143
151
  unique_columns = df.columns.tolist()
144
152
  if SYSTEM_RECORD_ID in unique_columns:
145
153
  unique_columns.remove(SYSTEM_RECORD_ID)
154
+ if ENTITY_SYSTEM_RECORD_ID in unique_columns:
155
+ unique_columns.remove(ENTITY_SYSTEM_RECORD_ID)
146
156
  if SORT_ID in unique_columns:
147
157
  unique_columns.remove(SORT_ID)
148
158
  if EVAL_SET_INDEX in unique_columns:
@@ -38,11 +38,13 @@ class EmailSearchKeyConverter:
38
38
  email_column: str,
39
39
  hem_column: Optional[str],
40
40
  search_keys: Dict[str, SearchKey],
41
+ unnest_search_keys: Optional[List[str]] = None,
41
42
  logger: Optional[logging.Logger] = None,
42
43
  ):
43
44
  self.email_column = email_column
44
45
  self.hem_column = hem_column
45
46
  self.search_keys = search_keys
47
+ self.unnest_search_keys = unnest_search_keys
46
48
  if logger is not None:
47
49
  self.logger = logger
48
50
  else:
@@ -80,9 +82,12 @@ class EmailSearchKeyConverter:
80
82
  del self.search_keys[self.email_column]
81
83
  return df
82
84
  self.search_keys[self.HEM_COLUMN_NAME] = SearchKey.HEM
85
+ self.unnest_search_keys.append(self.HEM_COLUMN_NAME)
83
86
  self.email_converted_to_hem = True
84
87
 
85
88
  del self.search_keys[self.email_column]
89
+ if self.email_column in self.unnest_search_keys:
90
+ self.unnest_search_keys.remove(self.email_column)
86
91
 
87
92
  df[self.EMAIL_ONE_DOMAIN_COLUMN_NAME] = df[self.email_column].apply(self._email_to_one_domain)
88
93
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: upgini
3
- Version: 1.1.299a3511.dev10
3
+ Version: 1.1.300
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
6
6
  Project-URL: Homepage, https://upgini.com/
@@ -26,8 +26,6 @@ Requires-Python: <3.11,>=3.8
26
26
  Requires-Dist: catboost>=1.0.3
27
27
  Requires-Dist: fastparquet>=0.8.1
28
28
  Requires-Dist: ipywidgets>=8.1.0
29
- Requires-Dist: jarowinkler>=2.0.0
30
- Requires-Dist: levenshtein>=0.25.1
31
29
  Requires-Dist: lightgbm>=3.3.2
32
30
  Requires-Dist: numpy>=1.19.0
33
31
  Requires-Dist: pandas<3.0.0,>=1.1.0
@@ -133,7 +131,7 @@ Description-Content-Type: text/markdown
133
131
  |Consumer Confidence index| 44 |22|-|Monthly|date, country|No
134
132
  |World economic indicators|191 |41|-|Monthly|date, country|No
135
133
  |Markets data|-|17|-|Monthly|date, datetime|No
136
- |World mobile & fixed broadband network coverage and perfomance |167|-|3|Monthly|country, postal/ZIP code|No
134
+ |World mobile & fixed broadband network coverage and performance |167|-|3|Monthly|country, postal/ZIP code|No
137
135
  |World demographic data |90|-|2|Annual|country, postal/ZIP code|No
138
136
  |World house prices |44|-|3|Annual|country, postal/ZIP code|No
139
137
  |Public social media profile data |104|-|-|Monthly|date, email/HEM, phone |Yes
@@ -842,4 +840,4 @@ Some convenient ways to start contributing are:
842
840
  - [More perks for registered users](https://profile.upgini.com)
843
841
 
844
842
  <sup>😔 Found mistype or a bug in code snippet? Our bad! <a href="https://github.com/upgini/upgini/issues/new?assignees=&title=readme%2Fbug">
845
- Please report it here</a></sup>
843
+ Please report it here</a></sup>
@@ -1,26 +1,26 @@
1
- upgini/__about__.py,sha256=I0j7AWy1_EKycImUvMKvXo5YCpu6evDNqtJKxoVOX5c,35
1
+ upgini/__about__.py,sha256=IcfjUPTVJrTs-NV6tcVvLWMSPbW14GYerq2VeSrrzc0,24
2
2
  upgini/__init__.py,sha256=ObEtjFkIssl83qeKNMLpIQygfwK8TzztwiI43YTsAP0,353
3
3
  upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
4
- upgini/dataset.py,sha256=7TLVVhGtjgx_9yaiaIUK3kZSe_R9wg5dY0d4F5qCGM4,45636
4
+ upgini/dataset.py,sha256=MOzBVsvzlHLxNfPWtMaXC_jIPeW7_gUvbSGeXnsPgNI,46158
5
5
  upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
6
- upgini/features_enricher.py,sha256=HQFLw3VyEsZfAt4xFnIYOnp3fzQSHAsyHzIm0gTJpOI,177543
6
+ upgini/features_enricher.py,sha256=nweJzEV8ZbDN5wvf5Gdf-HgcNz7711gNgmRxz4ZUopI,183112
7
7
  upgini/http.py,sha256=bp6jWl422Icy3AhHMdCcJv5NjExE45gSMmzMTPJjPuk,42600
8
8
  upgini/lazy_import.py,sha256=EwoM0msNGbSmWBhGbrLDny1DSnOlvTxCjmMKPxYlDms,610
9
- upgini/metadata.py,sha256=qDAIO7NLSSQp_XiXCv3U4XJTLO0KH3YuQ8lvCLYPqzs,9781
10
- upgini/metrics.py,sha256=DiDgdFvYu64ArlPEgjppZShK6yybWtIEbdPAhI3yO1I,30930
9
+ upgini/metadata.py,sha256=wOFCJruDBhC4Hiiiqf8GeHZnnm6rhJy8t6fg5B0Z4TQ,10209
10
+ upgini/metrics.py,sha256=Tu5cN8RlhOSSMWUTXRSkdl8SWBqR1N_2eJpBum9pZxc,30926
11
11
  upgini/search_task.py,sha256=LtRJ9bCPjMo1gJ-sUDKERhDwGcWKImrzwVFHjkMSQHQ,17071
12
12
  upgini/spinner.py,sha256=4iMd-eIe_BnkqFEMIliULTbj6rNI2HkN_VJ4qYe0cUc,1118
13
13
  upgini/version_validator.py,sha256=ddSKUK_-eGJB3NgrqOMoWJU-OxQ253WsNLp8aqJkaIM,1389
14
14
  upgini/ads_management/__init__.py,sha256=qzyisOToVRP-tquAJD1PblZhNtMrOB8FiyF9JvfkvgE,50
15
15
  upgini/ads_management/ads_manager.py,sha256=igVbN2jz80Umb2BUJixmJVj-zx8unoKpecVo-R-nGdw,2648
16
16
  upgini/autofe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
17
- upgini/autofe/all_operands.py,sha256=3LiH9iU-ArGmYpS8FHWH7yCFx40ILfvlSXJlKIa75BQ,2542
18
- upgini/autofe/binary.py,sha256=9W1DL2kZEmgV1P-0BEy8JYj9u_xhiDPKfeEsFQfrlkU,6860
19
- upgini/autofe/date.py,sha256=y_ien-z_nyPQX7FyZFcrB8dshyWLwA-8bSR9c3mCZZQ,8423
20
- upgini/autofe/feature.py,sha256=Rs-1_6UhObbA2UMz-_QeHjV1u2hTEY0h43HQjnA2OQo,13689
17
+ upgini/autofe/all_operands.py,sha256=XbvgX2IU4aee9rJZ--d5MdmrfKhON_emle5-RU1qlEY,2506
18
+ upgini/autofe/binary.py,sha256=8FXPJxN7fnC5wphO0Dp1tQCa0lFMSDGQGvBMkSIVAcE,4155
19
+ upgini/autofe/date.py,sha256=8zYVhjl7jVS4xt-IjCgk9px2LHnACX2YlMlmDELlRTc,7943
20
+ upgini/autofe/feature.py,sha256=ayxiF8Ip1ww_pt_BC9Pk127fAHZ_3fuluulS1EYLolk,13423
21
21
  upgini/autofe/groupby.py,sha256=4WjDzQxqpZxB79Ih4ihMMI5GDxaFqiH6ZelfV82ClT4,3091
22
22
  upgini/autofe/operand.py,sha256=MKEsl3zxpWzRDpTkE0sNJxTu62U20sWOvEKhPjUWS6s,2915
23
- upgini/autofe/unary.py,sha256=B4wp8oKnlJ0nUng-DRMKSiF8MHlhAFYbgmo9Nd_0ZaA,3777
23
+ upgini/autofe/unary.py,sha256=ZWjLd-CUkNt_PpM8YuWLLipW1v_RdBlsl4JxXIVo9aM,3652
24
24
  upgini/autofe/vector.py,sha256=dLxfAstJs-gw_OQ1xxoxcM6pVzORlV0HVzdzt7cLXVQ,606
25
25
  upgini/data_source/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
26
26
  upgini/data_source/data_source_publisher.py,sha256=1cQZrK630VztwGGDp41ec9gqIeUtkefaqSSQEitVWiM,19581
@@ -30,22 +30,22 @@ upgini/normalizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU
30
30
  upgini/normalizer/phone_normalizer.py,sha256=EzTaahk6myRv6ZXgbyVFGY4kpo_2VlQgOrm5_lfbmNI,9996
31
31
  upgini/resource_bundle/__init__.py,sha256=S5F2G47pnJd2LDpmFsjDqEwiKkP8Hm-hcseDbMka6Ko,8345
32
32
  upgini/resource_bundle/exceptions.py,sha256=5fRvx0_vWdE1-7HcSgF0tckB4A9AKyf5RiinZkInTsI,621
33
- upgini/resource_bundle/strings.properties,sha256=1oHurL4I83P2lXIavx9vSdKM8ZqncAPXH2IZf76bD6g,26292
33
+ upgini/resource_bundle/strings.properties,sha256=6jYqcxj06ZopXwr5YYMGXX1QiNNJNFo2SuwAR0qleRk,26358
34
34
  upgini/resource_bundle/strings_widget.properties,sha256=gOdqvZWntP2LCza_tyVk1_yRYcG4c04K9sQOAVhF_gw,1577
35
35
  upgini/sampler/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
36
36
  upgini/sampler/base.py,sha256=7GpjYqjOp58vYcJLiX__1R5wjUlyQbxvHJ2klFnup_M,6389
37
37
  upgini/sampler/random_under_sampler.py,sha256=TIbm7ATo-bCMF-IiS5sZeDC1ad1SYg0eY_rRmg84yIQ,4024
38
38
  upgini/sampler/utils.py,sha256=PYOk3kKSnFlyxcpdtDNLBEEhTB4lO_iP7pQHqeUcmAc,20211
39
39
  upgini/utils/__init__.py,sha256=O_KgzKiJjW3g4NoqZ7lAxUpoHcBi_gze6r3ndEjCH74,842
40
- upgini/utils/base_search_key_detector.py,sha256=UNs2uxEcD1N_mOtkx3k6U70DCajW-QEO2vZp41GF0mU,855
40
+ upgini/utils/base_search_key_detector.py,sha256=Inc6iGG-VXQdejWFfbekIkZk2ahC4k7CdGqzOkie6Bs,1021
41
41
  upgini/utils/blocked_time_series.py,sha256=Uqr3vp4YqNclj2-PzEYqVy763GSXHn86sbpIl1UOB4s,3382
42
42
  upgini/utils/country_utils.py,sha256=yE8oRgMpXuJxPfQm4fioY6dg6700HgVnHSk4Cv9sUyM,6511
43
43
  upgini/utils/custom_loss_utils.py,sha256=kieNZYBYZm5ZGBltF1F_jOSF4ea6C29rYuCyiDcqVNY,3857
44
44
  upgini/utils/cv_utils.py,sha256=w6FQb9nO8BWDx88EF83NpjPLarK4eR4ia0Wg0kLBJC4,3525
45
45
  upgini/utils/datetime_utils.py,sha256=Ujmu1ouwSFtG5SywQXJlmtDnGigAnIWPdE5Vx5NvgUM,10951
46
- upgini/utils/deduplicate_utils.py,sha256=6AbARehUCghJZ4PppFtrej2s3gFRruh41MEm6mzakHs,8607
46
+ upgini/utils/deduplicate_utils.py,sha256=Zvs7zW4QzaERQmJNPrTVf2ZTVBkBLOycFCzyMwtXuV8,8770
47
47
  upgini/utils/display_utils.py,sha256=A2ouB5eiZ-Kyt9ykYxkLQwyoRPrdYeJymwNTiajtFXs,10990
48
- upgini/utils/email_utils.py,sha256=PLufTO97Pg9PPsNqB9agcM6M98MIxKUgIgNn2mVwSQ0,3520
48
+ upgini/utils/email_utils.py,sha256=aKHa4xVBSsEsiZtFCPj_DrUaFupceYfvJeP_e8w_D5E,3813
49
49
  upgini/utils/fallback_progress_bar.py,sha256=PDaKb8dYpVZaWMroNcOHsTc3pSjgi9mOm0--cOFTwJ0,1074
50
50
  upgini/utils/features_validator.py,sha256=PgKNt5dyqfErTvjtRNNUS9g7GFqHBtAtnsfA-V5UO1A,3307
51
51
  upgini/utils/format.py,sha256=Yv5cvvSs2bOLUzzNu96Pu33VMDNbabio92QepUj41jU,243
@@ -57,7 +57,7 @@ upgini/utils/sklearn_ext.py,sha256=13jQS_k7v0aUtudXV6nGUEWjttPQzAW9AFYL5wgEz9k,4
57
57
  upgini/utils/target_utils.py,sha256=Y96_PJ5cC-WsEbeqg20v9uqywDQobLoTb-xoP7S3o4E,7807
58
58
  upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
59
59
  upgini/utils/warning_counter.py,sha256=dIWBB4dI5XRRJZudvIlqlIYKEiwLLPcXarsZuYRt338,227
60
- upgini-1.1.299a3511.dev10.dist-info/METADATA,sha256=qdPriI7TNVOSVvPHj-0S4IibaSoOojqgyq6tjDH3OjE,48231
61
- upgini-1.1.299a3511.dev10.dist-info/WHEEL,sha256=zEMcRr9Kr03x1ozGwg5v9NQBKn3kndp6LSoSlVg-jhU,87
62
- upgini-1.1.299a3511.dev10.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
63
- upgini-1.1.299a3511.dev10.dist-info/RECORD,,
60
+ upgini-1.1.300.dist-info/METADATA,sha256=hLE9o5ZxxN1PnP4WR6ZUnp5yFpEb1cbVSwWEOiQDZBE,48153
61
+ upgini-1.1.300.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
62
+ upgini-1.1.300.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
63
+ upgini-1.1.300.dist-info/RECORD,,
@@ -1,4 +1,4 @@
1
1
  Wheel-Version: 1.0
2
- Generator: hatchling 1.24.2
2
+ Generator: hatchling 1.25.0
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any