upgini 1.1.299__py3-none-any.whl → 1.1.299a3511.dev7__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of upgini might be problematic. Click here for more details.

upgini/__about__.py CHANGED
@@ -1 +1 @@
1
- __version__ = "1.1.299"
1
+ __version__ = "1.1.299a3511.dev7"
upgini/autofe/all_operands.py CHANGED
@@ -1,6 +1,20 @@
1
1
  from typing import Dict
2
2
 
3
- from upgini.autofe.binary import Add, Divide, Max, Min, Multiply, Sim, Subtract
3
+ from upgini.autofe.binary import (
4
+ Add,
5
+ Combine,
6
+ CombineThenFreq,
7
+ Distance,
8
+ Divide,
9
+ JaroWinklerSim1,
10
+ JaroWinklerSim2,
11
+ LevenshteinSim,
12
+ Max,
13
+ Min,
14
+ Multiply,
15
+ Sim,
16
+ Subtract,
17
+ )
4
18
  from upgini.autofe.date import (
5
19
  DateDiff,
6
20
  DateDiffType2,
@@ -9,9 +23,9 @@ from upgini.autofe.date import (
9
23
  DatePercentile,
10
24
  DatePercentileMethod2,
11
25
  )
12
- from upgini.autofe.groupby import GroupByThenAgg, GroupByThenRank
26
+ from upgini.autofe.groupby import GroupByThenAgg, GroupByThenFreq, GroupByThenNUnique, GroupByThenRank
13
27
  from upgini.autofe.operand import Operand
14
- from upgini.autofe.unary import Abs, Floor, Freq, Log, Residual, Norm, Sigmoid, Sqrt, Square
28
+ from upgini.autofe.unary import Abs, Embeddings, Floor, Freq, Log, Residual, Norm, Sigmoid, Sqrt, Square
15
29
  from upgini.autofe.vector import Mean, Sum
16
30
 
17
31
  ALL_OPERANDS: Dict[str, Operand] = {
@@ -39,10 +53,10 @@ ALL_OPERANDS: Dict[str, Operand] = {
39
53
  GroupByThenAgg(name="GroupByThenMedian", agg="median"),
40
54
  GroupByThenAgg(name="GroupByThenStd", output_type="float", agg="std"),
41
55
  GroupByThenRank(),
42
- Operand(name="Combine", has_symmetry_importance=True, output_type="object", is_categorical=True),
43
- Operand(name="CombineThenFreq", has_symmetry_importance=True, output_type="float"),
44
- Operand(name="GroupByThenNUnique", output_type="int", is_vectorizable=True, is_grouping=True),
45
- Operand(name="GroupByThenFreq", output_type="float", is_grouping=True),
56
+ Combine(),
57
+ CombineThenFreq(),
58
+ GroupByThenNUnique(),
59
+ GroupByThenFreq(),
46
60
  Sim(),
47
61
  DateDiff(),
48
62
  DateDiffType2(),
@@ -59,6 +73,11 @@ ALL_OPERANDS: Dict[str, Operand] = {
59
73
  DatePercentile(),
60
74
  DatePercentileMethod2(),
61
75
  Norm(),
76
+ JaroWinklerSim1(),
77
+ JaroWinklerSim2(),
78
+ LevenshteinSim(),
79
+ Distance(),
80
+ Embeddings(),
62
81
  ]
63
82
  }
64
83
 
upgini/autofe/binary.py CHANGED
@@ -1,7 +1,11 @@
1
+ import abc
2
+ from typing import Optional
3
+ import Levenshtein
1
4
  import numpy as np
2
5
  import pandas as pd
3
6
  from numpy import dot
4
7
  from numpy.linalg import norm
8
+ from jarowinkler import jarowinkler_similarity
5
9
 
6
10
  from upgini.autofe.operand import PandasOperand, VectorizableMixin
7
11
 
@@ -130,7 +134,25 @@ class CombineThenFreq(PandasOperand):
130
134
  self._loc(temp, value_counts)
131
135
 
132
136
 
133
- class Sim(PandasOperand):
137
+ class Distance(PandasOperand):
138
+ name = "dist"
139
+ is_binary = True
140
+ output_type = "float"
141
+ is_symmetrical = True
142
+ has_symmetry_importance = True
143
+
144
+ def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
145
+ return pd.Series(
146
+ 1 - self.__dot(left, right) / (self.__dot(left, left) * self.__dot(right, right)), index=left.index
147
+ )
148
+
149
+ # row-wise dot product
150
+ def __dot(self, left: pd.Series, right: pd.Series) -> pd.Series:
151
+ return (left * right).apply(np.sum)
152
+
153
+
154
+ # Left for backward compatibility
155
+ class Sim(Distance):
134
156
  name = "sim"
135
157
  is_binary = True
136
158
  output_type = "float"
@@ -138,4 +160,71 @@ class Sim(PandasOperand):
138
160
  has_symmetry_importance = True
139
161
 
140
162
  def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
141
- return dot(left, right) / (norm(left) * norm(right))
163
+ return 1 - super().calculate_binary(left, right)
164
+
165
+
166
+ class StringSim(PandasOperand, abc.ABC):
167
+ def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
168
+ sims = []
169
+ for i in left.index:
170
+ left_i = self._prepare_value(left.get(i))
171
+ right_i = self._prepare_value(right.get(i))
172
+ if left_i is not None and right_i is not None:
173
+ sims.append(self._similarity(left_i, right_i))
174
+ else:
175
+ sims.append(None)
176
+
177
+ return pd.Series(sims, index=left.index)
178
+
179
+ @abc.abstractmethod
180
+ def _prepare_value(self, value: Optional[str]) -> Optional[str]:
181
+ pass
182
+
183
+ @abc.abstractmethod
184
+ def _similarity(self, left: str, right: str) -> float:
185
+ pass
186
+
187
+
188
+ class JaroWinklerSim1(StringSim):
189
+ name = "sim_jw1"
190
+ is_binary = True
191
+ input_type = "string"
192
+ output_type = "float"
193
+ is_symmetrical = True
194
+ has_symmetry_importance = True
195
+
196
+ def _prepare_value(self, value: Optional[str]) -> Optional[str]:
197
+ return value
198
+
199
+ def _similarity(self, left: str, right: str) -> float:
200
+ return jarowinkler_similarity(left, right)
201
+
202
+
203
+ class JaroWinklerSim2(StringSim):
204
+ name = "sim_jw2"
205
+ is_binary = True
206
+ input_type = "string"
207
+ output_type = "float"
208
+ is_symmetrical = True
209
+ has_symmetry_importance = True
210
+
211
+ def _prepare_value(self, value: Optional[str]) -> Optional[str]:
212
+ return value[::-1] if value is not None else None
213
+
214
+ def _similarity(self, left: str, right: str) -> float:
215
+ return jarowinkler_similarity(left, right)
216
+
217
+
218
+ class LevenshteinSim(StringSim):
219
+ name = "sim_lv"
220
+ is_binary = True
221
+ input_type = "string"
222
+ output_type = "float"
223
+ is_symmetrical = True
224
+ has_symmetry_importance = True
225
+
226
+ def _prepare_value(self, value: Optional[str]) -> Optional[str]:
227
+ return value
228
+
229
+ def _similarity(self, left: str, right: str) -> float:
230
+ return 1 - Levenshtein.distance(left, right) / max(len(left), len(right))
upgini/autofe/date.py CHANGED
@@ -43,6 +43,8 @@ class DateDiff(PandasOperand, DateDiffMixin):
43
43
  is_binary = True
44
44
  has_symmetry_importance = True
45
45
 
46
+ replace_negative: bool = False
47
+
46
48
  def get_params(self) -> Dict[str, Optional[str]]:
47
49
  res = super().get_params()
48
50
  res.update(
@@ -50,6 +52,7 @@ class DateDiff(PandasOperand, DateDiffMixin):
50
52
  "diff_unit": self.diff_unit,
51
53
  "left_unit": self.left_unit,
52
54
  "right_unit": self.right_unit,
55
+ "replace_negative": self.replace_negative,
53
56
  }
54
57
  )
55
58
  return res
@@ -61,7 +64,8 @@ class DateDiff(PandasOperand, DateDiffMixin):
61
64
  return self.__replace_negative(diff)
62
65
 
63
66
  def __replace_negative(self, x: Union[pd.DataFrame, pd.Series]):
64
- x[x < 0] = None
67
+ if self.replace_negative:
68
+ x[x < 0] = None
65
69
  return x
66
70
 
67
71
 
@@ -101,13 +105,19 @@ _ext_aggregations = {"nunique": (lambda x: len(np.unique(x)), 0), "count": (len,
101
105
  class DateListDiff(PandasOperand, DateDiffMixin):
102
106
  is_binary = True
103
107
  has_symmetry_importance = True
108
+
104
109
  aggregation: str
110
+ replace_negative: bool = False
105
111
 
106
112
  def get_params(self) -> Dict[str, Optional[str]]:
107
113
  res = super().get_params()
108
114
  res.update(
109
115
  {
110
116
  "aggregation": self.aggregation,
117
+ "diff_unit": self.diff_unit,
118
+ "left_unit": self.left_unit,
119
+ "right_unit": self.right_unit,
120
+ "replace_negative": self.replace_negative,
111
121
  }
112
122
  )
113
123
  return res
@@ -125,7 +135,7 @@ class DateListDiff(PandasOperand, DateDiffMixin):
125
135
 
126
136
  def _diff(self, x: TimedeltaArray):
127
137
  x = self._convert_diff_to_unit(x)
128
- return x[x > 0]
138
+ return x[x > 0] if self.replace_negative else x
129
139
 
130
140
  def _agg(self, x):
131
141
  method = getattr(np, self.aggregation, None)
@@ -157,7 +167,10 @@ class DateListDiffBounded(DateListDiff):
157
167
  super().__init__(**data)
158
168
 
159
169
  def _agg(self, x):
160
- x = x[(x >= (self.lower_bound or -np.inf)) & (x < (self.upper_bound or np.inf))]
170
+ x = x[
171
+ (x >= (self.lower_bound if self.lower_bound is not None else -np.inf))
172
+ & (x < (self.upper_bound if self.upper_bound is not None else np.inf))
173
+ ]
161
174
  return super()._agg(x)
162
175
 
163
176
 
upgini/autofe/feature.py CHANGED
@@ -138,15 +138,17 @@ class Feature:
138
138
  if self.cached_display_name is not None and cache:
139
139
  return self.cached_display_name
140
140
 
141
+ should_stack_op = not isinstance(self.children[0], Column) if self.op.is_unary else False
142
+ prev_name = [self.children[0].get_op_display_name()] if should_stack_op else []
143
+
141
144
  if self.alias:
142
145
  components = ["f_autofe", self.alias]
143
- elif shorten and not self.op.is_unary:
144
- components = ["f_autofe", self.get_op_display_name()]
146
+ elif shorten and (not self.op.is_unary or should_stack_op):
147
+ components = ["f_autofe"] + prev_name + [self.get_op_display_name()]
145
148
  else:
146
- components = ["f_" + "_f_".join(self.get_columns(**kwargs))] + [
147
- "autofe",
148
- self.get_op_display_name(),
149
- ]
149
+ components = (
150
+ ["f_" + "_f_".join(self.get_columns(**kwargs))] + ["autofe"] + prev_name + [self.get_op_display_name()]
151
+ )
150
152
  components.extend([str(self.display_index)] if self.display_index is not None else [])
151
153
  display_name = "_".join(components)
152
154
 
upgini/autofe/unary.py CHANGED
@@ -125,3 +125,10 @@ class Norm(PandasOperand):
125
125
  normalized_data = pd.Series(normalized_data[:, 0], index=data_dropna.index, name=data.name)
126
126
  normalized_data = normalized_data.reindex(data.index)
127
127
  return normalized_data
128
+
129
+
130
+ class Embeddings(PandasOperand):
131
+ name = "emb"
132
+ is_unary = True
133
+ input_type = "string"
134
+ output_type = "vector"
upgini/dataset.py CHANGED
@@ -23,9 +23,7 @@ from pandas.api.types import (
23
23
  from upgini.errors import ValidationError
24
24
  from upgini.http import ProgressStage, SearchProgress, _RestClient
25
25
  from upgini.metadata import (
26
- ENTITY_SYSTEM_RECORD_ID,
27
26
  EVAL_SET_INDEX,
28
- SEARCH_KEY_UNNEST,
29
27
  SYSTEM_COLUMNS,
30
28
  SYSTEM_RECORD_ID,
31
29
  TARGET,
@@ -81,7 +79,6 @@ class Dataset: # (pd.DataFrame):
81
79
  path: Optional[str] = None,
82
80
  meaning_types: Optional[Dict[str, FileColumnMeaningType]] = None,
83
81
  search_keys: Optional[List[Tuple[str, ...]]] = None,
84
- unnest_search_keys: Optional[Dict[str, str]] = None,
85
82
  model_task_type: Optional[ModelTaskType] = None,
86
83
  random_state: Optional[int] = None,
87
84
  rest_client: Optional[_RestClient] = None,
@@ -116,7 +113,6 @@ class Dataset: # (pd.DataFrame):
116
113
  self.description = description
117
114
  self.meaning_types = meaning_types
118
115
  self.search_keys = search_keys
119
- self.unnest_search_keys = unnest_search_keys
120
116
  self.ignore_columns = []
121
117
  self.hierarchical_group_keys = []
122
118
  self.hierarchical_subgroup_keys = []
@@ -176,7 +172,7 @@ class Dataset: # (pd.DataFrame):
176
172
  new_columns = []
177
173
  dup_counter = 0
178
174
  for column in self.data.columns:
179
- if column in [TARGET, EVAL_SET_INDEX, SYSTEM_RECORD_ID, ENTITY_SYSTEM_RECORD_ID, SEARCH_KEY_UNNEST]:
175
+ if column in [TARGET, EVAL_SET_INDEX, SYSTEM_RECORD_ID]:
180
176
  self.columns_renaming[column] = column
181
177
  new_columns.append(column)
182
178
  continue
@@ -357,9 +353,7 @@ class Dataset: # (pd.DataFrame):
357
353
 
358
354
  if is_string_dtype(self.data[postal_code]) or is_object_dtype(self.data[postal_code]):
359
355
  try:
360
- self.data[postal_code] = (
361
- self.data[postal_code].astype("string").astype("Float64").astype("Int64").astype("string")
362
- )
356
+ self.data[postal_code] = self.data[postal_code].astype("float64").astype("Int64").astype("string")
363
357
  except Exception:
364
358
  pass
365
359
  elif is_float_dtype(self.data[postal_code]):
@@ -809,9 +803,6 @@ class Dataset: # (pd.DataFrame):
809
803
  meaningType=meaning_type,
810
804
  minMaxValues=min_max_values,
811
805
  )
812
- if self.unnest_search_keys and column_meta.originalName in self.unnest_search_keys:
813
- column_meta.isUnnest = True
814
- column_meta.unnestKeyNames = self.unnest_search_keys[column_meta.originalName]
815
806
 
816
807
  columns.append(column_meta)
817
808
 
upgini/features_enricher.py CHANGED
@@ -11,7 +11,6 @@ import sys
11
11
  import tempfile
12
12
  import time
13
13
  import uuid
14
- from collections import Counter
15
14
  from dataclasses import dataclass
16
15
  from threading import Thread
17
16
  from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union
@@ -46,11 +45,9 @@ from upgini.mdc import MDC
46
45
  from upgini.metadata import (
47
46
  COUNTRY,
48
47
  DEFAULT_INDEX,
49
- ENTITY_SYSTEM_RECORD_ID,
50
48
  EVAL_SET_INDEX,
51
49
  ORIGINAL_INDEX,
52
50
  RENAMED_INDEX,
53
- SEARCH_KEY_UNNEST,
54
51
  SORT_ID,
55
52
  SYSTEM_RECORD_ID,
56
53
  TARGET,
@@ -251,7 +248,7 @@ class FeaturesEnricher(TransformerMixin):
251
248
  self.__cached_sampled_datasets: Optional[Tuple[pd.DataFrame, pd.DataFrame, pd.Series, Dict, Dict]] = None
252
249
 
253
250
  validate_version(self.logger)
254
- self.search_keys = search_keys or {}
251
+ self.search_keys = search_keys or dict()
255
252
  self.country_code = country_code
256
253
  self.__validate_search_keys(search_keys, search_id)
257
254
  self.model_task_type = model_task_type
@@ -1203,7 +1200,7 @@ class FeaturesEnricher(TransformerMixin):
1203
1200
  email_column = self._get_email_column(search_keys)
1204
1201
  hem_column = self._get_hem_column(search_keys)
1205
1202
  if email_column:
1206
- converter = EmailSearchKeyConverter(email_column, hem_column, search_keys, [], self.logger)
1203
+ converter = EmailSearchKeyConverter(email_column, hem_column, search_keys, self.logger)
1207
1204
  extended_X = converter.convert(extended_X)
1208
1205
  generated_features.extend(converter.generated_features)
1209
1206
  if (
@@ -1356,7 +1353,7 @@ class FeaturesEnricher(TransformerMixin):
1356
1353
  not in (
1357
1354
  excluding_search_keys
1358
1355
  + list(self.fit_dropped_features)
1359
- + [DateTimeSearchKeyConverter.DATETIME_COL, SYSTEM_RECORD_ID, ENTITY_SYSTEM_RECORD_ID]
1356
+ + [DateTimeSearchKeyConverter.DATETIME_COL, SYSTEM_RECORD_ID]
1360
1357
  )
1361
1358
  ]
1362
1359
 
@@ -1420,7 +1417,7 @@ class FeaturesEnricher(TransformerMixin):
1420
1417
  fitting_enriched_X[col].astype("string").str.replace(",", ".", regex=False).astype(np.float64)
1421
1418
  )
1422
1419
 
1423
- fitting_eval_set_dict = {}
1420
+ fitting_eval_set_dict = dict()
1424
1421
  for idx, eval_tuple in eval_set_sampled_dict.items():
1425
1422
  eval_X_sampled, enriched_eval_X, eval_y_sampled = eval_tuple
1426
1423
  eval_X_sorted, eval_y_sorted = self._sort_by_system_record_id(eval_X_sampled, eval_y_sampled, self.cv)
@@ -1537,7 +1534,7 @@ class FeaturesEnricher(TransformerMixin):
1537
1534
  def __sample_only_input(
1538
1535
  self, validated_X: pd.DataFrame, validated_y: pd.Series, eval_set: Optional[List[tuple]], is_demo_dataset: bool
1539
1536
  ) -> _SampledDataForMetrics:
1540
- eval_set_sampled_dict = {}
1537
+ eval_set_sampled_dict = dict()
1541
1538
 
1542
1539
  df = validated_X.copy()
1543
1540
  df[TARGET] = validated_y
@@ -1563,7 +1560,7 @@ class FeaturesEnricher(TransformerMixin):
1563
1560
  df = df.sample(n=sample_rows, random_state=self.random_state)
1564
1561
 
1565
1562
  df_extended, search_keys = self._extend_x(df, is_demo_dataset)
1566
- df_extended = self.__add_fit_system_record_id(df_extended, {}, search_keys, SYSTEM_RECORD_ID)
1563
+ df_extended = self.__add_fit_system_record_id(df_extended, dict(), search_keys)
1567
1564
 
1568
1565
  train_df = df_extended.query(f"{EVAL_SET_INDEX} == 0") if eval_set is not None else df_extended
1569
1566
  X_sampled = train_df.drop(columns=[TARGET, EVAL_SET_INDEX], errors="ignore")
@@ -1587,7 +1584,7 @@ class FeaturesEnricher(TransformerMixin):
1587
1584
  trace_id: str,
1588
1585
  remove_outliers_calc_metrics: Optional[bool],
1589
1586
  ) -> _SampledDataForMetrics:
1590
- eval_set_sampled_dict = {}
1587
+ eval_set_sampled_dict = dict()
1591
1588
  search_keys = self.fit_search_keys
1592
1589
 
1593
1590
  rows_to_drop = None
@@ -1661,7 +1658,7 @@ class FeaturesEnricher(TransformerMixin):
1661
1658
  progress_bar: Optional[ProgressBar],
1662
1659
  progress_callback: Optional[Callable[[SearchProgress], Any]],
1663
1660
  ) -> _SampledDataForMetrics:
1664
- eval_set_sampled_dict = {}
1661
+ eval_set_sampled_dict = dict()
1665
1662
  if eval_set is not None:
1666
1663
  self.logger.info("Transform with eval_set")
1667
1664
  # concatenate X and eval_set with eval_set_index
@@ -1683,7 +1680,7 @@ class FeaturesEnricher(TransformerMixin):
1683
1680
  self.logger.info(f"Downsampling from {num_samples} to {Dataset.FIT_SAMPLE_WITH_EVAL_SET_ROWS}")
1684
1681
  df = df.sample(n=Dataset.FIT_SAMPLE_WITH_EVAL_SET_ROWS, random_state=self.random_state)
1685
1682
 
1686
- eval_set_sampled_dict = {}
1683
+ eval_set_sampled_dict = dict()
1687
1684
 
1688
1685
  tmp_target_name = "__target"
1689
1686
  df = df.rename(columns={TARGET: tmp_target_name})
@@ -1946,38 +1943,11 @@ class FeaturesEnricher(TransformerMixin):
1946
1943
  self.logger.info("Input dataset hasn't date column")
1947
1944
  if self.add_date_if_missing:
1948
1945
  df = self._add_current_date_as_key(df, search_keys, self.logger, self.bundle)
1949
-
1950
- # Don't pass all features in backend on transform
1951
- original_features_for_transform = []
1952
- runtime_parameters = self._get_copy_of_runtime_parameters()
1953
- features_not_to_pass = [column for column in df.columns if column not in search_keys.keys()]
1954
- if len(features_not_to_pass) > 0:
1955
- # Pass only features that need for transform
1956
- features_for_transform = self._search_task.get_features_for_transform()
1957
- if features_for_transform is not None and len(features_for_transform) > 0:
1958
- file_metadata = self._search_task.get_file_metadata(trace_id)
1959
- original_features_for_transform = [
1960
- c.originalName or c.name for c in file_metadata.columns if c.name in features_for_transform
1961
- ]
1962
-
1963
- runtime_parameters.properties["features_for_embeddings"] = ",".join(features_for_transform)
1964
-
1965
- columns_for_system_record_id = sorted(list(search_keys.keys()) + (original_features_for_transform))
1966
-
1967
- df[ENTITY_SYSTEM_RECORD_ID] = pd.util.hash_pandas_object(
1968
- df[columns_for_system_record_id], index=False
1969
- ).astype("Float64")
1970
-
1971
- # Explode multiple search keys
1972
- df, unnest_search_keys = self._explode_multiple_search_keys(df, search_keys)
1973
-
1974
1946
  email_column = self._get_email_column(search_keys)
1975
1947
  hem_column = self._get_hem_column(search_keys)
1976
1948
  email_converted_to_hem = False
1977
1949
  if email_column:
1978
- converter = EmailSearchKeyConverter(
1979
- email_column, hem_column, search_keys, list(unnest_search_keys.keys()), self.logger
1980
- )
1950
+ converter = EmailSearchKeyConverter(email_column, hem_column, search_keys, self.logger)
1981
1951
  df = converter.convert(df)
1982
1952
  generated_features.extend(converter.generated_features)
1983
1953
  email_converted_to_hem = converter.email_converted_to_hem
@@ -1991,21 +1961,30 @@ class FeaturesEnricher(TransformerMixin):
1991
1961
  generated_features = [f for f in generated_features if f in self.fit_generated_features]
1992
1962
 
1993
1963
  meaning_types = {col: key.value for col, key in search_keys.items()}
1994
- # non_keys_columns = [column for column in df.columns if column not in search_keys.keys()]
1995
- for col in original_features_for_transform:
1996
- meaning_types[col] = FileColumnMeaningType.FEATURE
1997
- features_not_to_pass = [column for column in features_not_to_pass if column not in search_keys.keys()]
1964
+ non_keys_columns = [column for column in df.columns if column not in search_keys.keys()]
1998
1965
 
1999
1966
  if email_converted_to_hem:
2000
- features_not_to_pass.append(email_column)
1967
+ non_keys_columns.append(email_column)
2001
1968
 
2002
- features_not_to_pass = [c for c in features_not_to_pass if c not in original_features_for_transform]
2003
- columns_for_system_record_id = sorted(list(search_keys.keys()) + (original_features_for_transform))
1969
+ # Don't pass features in backend on transform
1970
+ original_features_for_transform = None
1971
+ runtime_parameters = self._get_copy_of_runtime_parameters()
1972
+ if len(non_keys_columns) > 0:
1973
+ # Pass only features that need for transform
1974
+ features_for_transform = self._search_task.get_features_for_transform()
1975
+ if features_for_transform is not None and len(features_for_transform) > 0:
1976
+ file_metadata = self._search_task.get_file_metadata(trace_id)
1977
+ original_features_for_transform = [
1978
+ c.originalName or c.name for c in file_metadata.columns if c.name in features_for_transform
1979
+ ]
1980
+ non_keys_columns = [c for c in non_keys_columns if c not in original_features_for_transform]
1981
+
1982
+ runtime_parameters.properties["features_for_embeddings"] = ",".join(features_for_transform)
2004
1983
 
2005
1984
  if add_fit_system_record_id:
2006
- df = self.__add_fit_system_record_id(df, {}, search_keys, SYSTEM_RECORD_ID)
1985
+ df = self.__add_fit_system_record_id(df, dict(), search_keys)
2007
1986
  df = df.rename(columns={SYSTEM_RECORD_ID: SORT_ID})
2008
- features_not_to_pass.append(SORT_ID)
1987
+ non_keys_columns.append(SORT_ID)
2009
1988
 
2010
1989
  columns_for_system_record_id = sorted(list(search_keys.keys()) + (original_features_for_transform or []))
2011
1990
 
@@ -2013,19 +1992,16 @@ class FeaturesEnricher(TransformerMixin):
2013
1992
  "Float64"
2014
1993
  )
2015
1994
  meaning_types[SYSTEM_RECORD_ID] = FileColumnMeaningType.SYSTEM_RECORD_ID
2016
- meaning_types[ENTITY_SYSTEM_RECORD_ID] = FileColumnMeaningType.ENTITY_SYSTEM_RECORD_ID
2017
- if SEARCH_KEY_UNNEST in df.columns:
2018
- meaning_types[SEARCH_KEY_UNNEST] = FileColumnMeaningType.UNNEST_KEY
2019
1995
 
2020
1996
  df = df.reset_index(drop=True)
2021
- system_columns_with_original_index = [SYSTEM_RECORD_ID, ENTITY_SYSTEM_RECORD_ID] + generated_features
1997
+ system_columns_with_original_index = [SYSTEM_RECORD_ID] + generated_features
2022
1998
  if add_fit_system_record_id:
2023
1999
  system_columns_with_original_index.append(SORT_ID)
2024
2000
  df_with_original_index = df[system_columns_with_original_index].copy()
2025
2001
 
2026
2002
  combined_search_keys = combine_search_keys(search_keys.keys())
2027
2003
 
2028
- df_without_features = df.drop(columns=features_not_to_pass)
2004
+ df_without_features = df.drop(columns=non_keys_columns)
2029
2005
 
2030
2006
  df_without_features = clean_full_duplicates(
2031
2007
  df_without_features, self.logger, silent=silent_mode, bundle=self.bundle
@@ -2037,13 +2013,12 @@ class FeaturesEnricher(TransformerMixin):
2037
2013
  dataset = Dataset(
2038
2014
  "sample_" + str(uuid.uuid4()),
2039
2015
  df=df_without_features,
2040
- meaning_types=meaning_types,
2041
- search_keys=combined_search_keys,
2042
- unnest_search_keys=unnest_search_keys,
2043
2016
  date_format=self.date_format,
2044
2017
  rest_client=self.rest_client,
2045
2018
  logger=self.logger,
2046
2019
  )
2020
+ dataset.meaning_types = meaning_types
2021
+ dataset.search_keys = combined_search_keys
2047
2022
  if email_converted_to_hem:
2048
2023
  dataset.ignore_columns = [email_column]
2049
2024
 
@@ -2182,14 +2157,6 @@ class FeaturesEnricher(TransformerMixin):
2182
2157
 
2183
2158
  key_types = search_keys.values()
2184
2159
 
2185
- # Multiple search keys allowed only for PHONE, IP, POSTAL_CODE, EMAIL, HEM
2186
- multi_keys = [key for key, count in Counter(key_types).items() if count > 1]
2187
- for multi_key in multi_keys:
2188
- if multi_key not in [SearchKey.PHONE, SearchKey.IP, SearchKey.POSTAL_CODE, SearchKey.EMAIL, SearchKey.HEM]:
2189
- msg = self.bundle.get("unsupported_multi_key").format(multi_key)
2190
- self.logger.warning(msg)
2191
- raise ValidationError(msg)
2192
-
2193
2160
  if SearchKey.DATE in key_types and SearchKey.DATETIME in key_types:
2194
2161
  msg = self.bundle.get("date_and_datetime_simultanious")
2195
2162
  self.logger.warning(msg)
@@ -2205,11 +2172,11 @@ class FeaturesEnricher(TransformerMixin):
2205
2172
  self.logger.warning(msg)
2206
2173
  raise ValidationError(msg)
2207
2174
 
2208
- # for key_type in SearchKey.__members__.values():
2209
- # if key_type != SearchKey.CUSTOM_KEY and list(key_types).count(key_type) > 1:
2210
- # msg = self.bundle.get("multiple_search_key").format(key_type)
2211
- # self.logger.warning(msg)
2212
- # raise ValidationError(msg)
2175
+ for key_type in SearchKey.__members__.values():
2176
+ if key_type != SearchKey.CUSTOM_KEY and list(key_types).count(key_type) > 1:
2177
+ msg = self.bundle.get("multiple_search_key").format(key_type)
2178
+ self.logger.warning(msg)
2179
+ raise ValidationError(msg)
2213
2180
 
2214
2181
  # non_personal_keys = set(SearchKey.__members__.values()) - set(SearchKey.personal_keys())
2215
2182
  # if (
@@ -2347,7 +2314,14 @@ class FeaturesEnricher(TransformerMixin):
2347
2314
  self.logger.info("Input dataset hasn't date column")
2348
2315
  if self.add_date_if_missing:
2349
2316
  df = self._add_current_date_as_key(df, self.fit_search_keys, self.logger, self.bundle)
2350
-
2317
+ email_column = self._get_email_column(self.fit_search_keys)
2318
+ hem_column = self._get_hem_column(self.fit_search_keys)
2319
+ email_converted_to_hem = False
2320
+ if email_column:
2321
+ converter = EmailSearchKeyConverter(email_column, hem_column, self.fit_search_keys, self.logger)
2322
+ df = converter.convert(df)
2323
+ self.fit_generated_features.extend(converter.generated_features)
2324
+ email_converted_to_hem = converter.email_converted_to_hem
2351
2325
  if (
2352
2326
  self.detect_missing_search_keys
2353
2327
  and list(self.fit_search_keys.values()) == [SearchKey.DATE]
@@ -2356,37 +2330,7 @@ class FeaturesEnricher(TransformerMixin):
2356
2330
  converter = IpToCountrySearchKeyConverter(self.fit_search_keys, self.logger)
2357
2331
  df = converter.convert(df)
2358
2332
 
2359
- # Explode multiple search keys
2360
2333
  non_feature_columns = [self.TARGET_NAME, EVAL_SET_INDEX] + list(self.fit_search_keys.keys())
2361
- meaning_types = {
2362
- **{col: key.value for col, key in self.fit_search_keys.items()},
2363
- **{str(c): FileColumnMeaningType.FEATURE for c in df.columns if c not in non_feature_columns},
2364
- }
2365
- meaning_types[self.TARGET_NAME] = FileColumnMeaningType.TARGET
2366
- if eval_set is not None and len(eval_set) > 0:
2367
- meaning_types[EVAL_SET_INDEX] = FileColumnMeaningType.EVAL_SET_INDEX
2368
- df = self.__add_fit_system_record_id(df, meaning_types, self.fit_search_keys, ENTITY_SYSTEM_RECORD_ID)
2369
-
2370
- # TODO check that this is correct for enrichment
2371
- self.df_with_original_index = df.copy()
2372
-
2373
- df, unnest_search_keys = self._explode_multiple_search_keys(df, self.fit_search_keys)
2374
-
2375
- # Convert EMAIL to HEM after unnesting to do it only with one column
2376
- email_column = self._get_email_column(self.fit_search_keys)
2377
- hem_column = self._get_hem_column(self.fit_search_keys)
2378
- email_converted_to_hem = False
2379
- if email_column:
2380
- converter = EmailSearchKeyConverter(
2381
- email_column, hem_column, self.fit_search_keys, list(unnest_search_keys.keys()), self.logger
2382
- )
2383
- df = converter.convert(df)
2384
- self.fit_generated_features.extend(converter.generated_features)
2385
- email_converted_to_hem = converter.email_converted_to_hem
2386
-
2387
- non_feature_columns = [self.TARGET_NAME, EVAL_SET_INDEX, ENTITY_SYSTEM_RECORD_ID, SEARCH_KEY_UNNEST] + list(
2388
- self.fit_search_keys.keys()
2389
- )
2390
2334
  if email_converted_to_hem:
2391
2335
  non_feature_columns.append(email_column)
2392
2336
  if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
@@ -2410,14 +2354,12 @@ class FeaturesEnricher(TransformerMixin):
2410
2354
  **{str(c): FileColumnMeaningType.FEATURE for c in df.columns if c not in non_feature_columns},
2411
2355
  }
2412
2356
  meaning_types[self.TARGET_NAME] = FileColumnMeaningType.TARGET
2413
- meaning_types[ENTITY_SYSTEM_RECORD_ID] = FileColumnMeaningType.ENTITY_SYSTEM_RECORD_ID
2414
- if SEARCH_KEY_UNNEST in df.columns:
2415
- meaning_types[SEARCH_KEY_UNNEST] = FileColumnMeaningType.UNNEST_KEY
2416
2357
  if eval_set is not None and len(eval_set) > 0:
2417
2358
  meaning_types[EVAL_SET_INDEX] = FileColumnMeaningType.EVAL_SET_INDEX
2418
2359
 
2419
- df = self.__add_fit_system_record_id(df, meaning_types, self.fit_search_keys, SYSTEM_RECORD_ID)
2360
+ df = self.__add_fit_system_record_id(df, meaning_types, self.fit_search_keys)
2420
2361
 
2362
+ self.df_with_original_index = df.copy()
2421
2363
  df = df.reset_index(drop=True).sort_values(by=SYSTEM_RECORD_ID).reset_index(drop=True)
2422
2364
 
2423
2365
  combined_search_keys = combine_search_keys(self.fit_search_keys.keys())
@@ -2425,15 +2367,14 @@ class FeaturesEnricher(TransformerMixin):
2425
2367
  dataset = Dataset(
2426
2368
  "tds_" + str(uuid.uuid4()),
2427
2369
  df=df,
2428
- meaning_types=meaning_types,
2429
- search_keys=combined_search_keys,
2430
- unnest_search_keys=unnest_search_keys,
2431
2370
  model_task_type=model_task_type,
2432
2371
  date_format=self.date_format,
2433
2372
  random_state=self.random_state,
2434
2373
  rest_client=self.rest_client,
2435
2374
  logger=self.logger,
2436
2375
  )
2376
+ dataset.meaning_types = meaning_types
2377
+ dataset.search_keys = combined_search_keys
2437
2378
  if email_converted_to_hem:
2438
2379
  dataset.ignore_columns = [email_column]
2439
2380
 
@@ -2803,10 +2744,9 @@ class FeaturesEnricher(TransformerMixin):
2803
2744
  X: pd.DataFrame, y: pd.Series, cv: Optional[CVType]
2804
2745
  ) -> Tuple[pd.DataFrame, pd.Series]:
2805
2746
  if cv not in [CVType.time_series, CVType.blocked_time_series]:
2806
- record_id_column = ENTITY_SYSTEM_RECORD_ID if ENTITY_SYSTEM_RECORD_ID in X else SYSTEM_RECORD_ID
2807
2747
  Xy = X.copy()
2808
2748
  Xy[TARGET] = y
2809
- Xy = Xy.sort_values(by=record_id_column).reset_index(drop=True)
2749
+ Xy = Xy.sort_values(by=SYSTEM_RECORD_ID).reset_index(drop=True)
2810
2750
  X = Xy.drop(columns=TARGET)
2811
2751
  y = Xy[TARGET].copy()
2812
2752
 
@@ -2985,19 +2925,15 @@ class FeaturesEnricher(TransformerMixin):
2985
2925
 
2986
2926
  @staticmethod
2987
2927
  def _get_email_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
2988
- cols = [col for col, t in search_keys.items() if t == SearchKey.EMAIL]
2989
- if len(cols) > 1:
2990
- raise Exception("More than one email column found after unnest")
2991
- if len(cols) == 1:
2992
- return cols[0]
2928
+ for col, t in search_keys.items():
2929
+ if t == SearchKey.EMAIL:
2930
+ return col
2993
2931
 
2994
2932
  @staticmethod
2995
2933
  def _get_hem_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
2996
- cols = [col for col, t in search_keys.items() if t == SearchKey.HEM]
2997
- if len(cols) > 1:
2998
- raise Exception("More than one hem column found after unnest")
2999
- if len(cols) == 1:
3000
- return cols[0]
2934
+ for col, t in search_keys.items():
2935
+ if t == SearchKey.HEM:
2936
+ return col
3001
2937
 
3002
2938
  @staticmethod
3003
2939
  def _get_phone_column(search_keys: Dict[str, SearchKey]) -> Optional[str]:
@@ -3005,44 +2941,8 @@ class FeaturesEnricher(TransformerMixin):
3005
2941
  if t == SearchKey.PHONE:
3006
2942
  return col
3007
2943
 
3008
- def _explode_multiple_search_keys(
3009
- self, df: pd.DataFrame, search_keys: Dict[str, SearchKey]
3010
- ) -> Tuple[pd.DataFrame, Dict[str, List[str]]]:
3011
- # find groups of multiple search keys
3012
- search_key_names_by_type: Dict[SearchKey, str] = {}
3013
- for key_name, key_type in search_keys.items():
3014
- search_key_names_by_type[key_type] = search_key_names_by_type.get(key_type, []) + [key_name]
3015
- search_key_names_by_type = {
3016
- key_type: key_names for key_type, key_names in search_key_names_by_type.items() if len(key_names) > 1
3017
- }
3018
- if len(search_key_names_by_type) == 0:
3019
- return df, {}
3020
-
3021
- multiple_keys_columns = [col for cols in search_key_names_by_type.values() for col in cols]
3022
- other_columns = [col for col in df.columns if col not in multiple_keys_columns]
3023
- exploded_dfs = []
3024
- unnest_search_keys = {}
3025
-
3026
- for key_type, key_names in search_key_names_by_type.items():
3027
- new_search_key = f"upgini_{key_type.name.lower()}_unnest"
3028
- exploded_df = pd.melt(
3029
- df, id_vars=other_columns, value_vars=key_names, var_name=SEARCH_KEY_UNNEST, value_name=new_search_key
3030
- )
3031
- exploded_dfs.append(exploded_df)
3032
- for old_key in key_names:
3033
- del search_keys[old_key]
3034
- search_keys[new_search_key] = key_type
3035
- unnest_search_keys[new_search_key] = key_names
3036
-
3037
- df = pd.concat(exploded_dfs, ignore_index=True)
3038
- return df, unnest_search_keys
3039
-
3040
2944
  def __add_fit_system_record_id(
3041
- self,
3042
- df: pd.DataFrame,
3043
- meaning_types: Dict[str, FileColumnMeaningType],
3044
- search_keys: Dict[str, SearchKey],
3045
- id_name: str,
2945
+ self, df: pd.DataFrame, meaning_types: Dict[str, FileColumnMeaningType], search_keys: Dict[str, SearchKey]
3046
2946
  ) -> pd.DataFrame:
3047
2947
  # save original order or rows
3048
2948
  original_index_name = df.index.name
@@ -3053,14 +2953,7 @@ class FeaturesEnricher(TransformerMixin):
3053
2953
 
3054
2954
  # order by date and idempotent order by other keys
3055
2955
  if self.cv not in [CVType.time_series, CVType.blocked_time_series]:
3056
- sort_exclude_columns = [
3057
- original_order_name,
3058
- ORIGINAL_INDEX,
3059
- EVAL_SET_INDEX,
3060
- TARGET,
3061
- "__target",
3062
- ENTITY_SYSTEM_RECORD_ID,
3063
- ]
2956
+ sort_exclude_columns = [original_order_name, ORIGINAL_INDEX, EVAL_SET_INDEX, TARGET, "__target"]
3064
2957
  if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
3065
2958
  date_column = DateTimeSearchKeyConverter.DATETIME_COL
3066
2959
  sort_exclude_columns.append(self._get_date_column(search_keys))
@@ -3098,18 +2991,14 @@ class FeaturesEnricher(TransformerMixin):
3098
2991
 
3099
2992
  df = df.reset_index(drop=True).reset_index()
3100
2993
  # system_record_id saves correct order for fit
3101
- df = df.rename(columns={DEFAULT_INDEX: id_name})
2994
+ df = df.rename(columns={DEFAULT_INDEX: SYSTEM_RECORD_ID})
3102
2995
 
3103
2996
  # return original order
3104
2997
  df = df.set_index(ORIGINAL_INDEX)
3105
2998
  df.index.name = original_index_name
3106
2999
  df = df.sort_values(by=original_order_name).drop(columns=original_order_name)
3107
3000
 
3108
- meaning_types[id_name] = (
3109
- FileColumnMeaningType.SYSTEM_RECORD_ID
3110
- if id_name == SYSTEM_RECORD_ID
3111
- else FileColumnMeaningType.ENTITY_SYSTEM_RECORD_ID
3112
- )
3001
+ meaning_types[SYSTEM_RECORD_ID] = FileColumnMeaningType.SYSTEM_RECORD_ID
3113
3002
  return df
3114
3003
 
3115
3004
  def __correct_target(self, df: pd.DataFrame) -> pd.DataFrame:
@@ -3164,11 +3053,7 @@ class FeaturesEnricher(TransformerMixin):
3164
3053
  )
3165
3054
 
3166
3055
  comparing_columns = X.columns if is_transform else df_with_original_index.columns
3167
- dup_features = [
3168
- c
3169
- for c in comparing_columns
3170
- if c in result_features.columns and c not in [SYSTEM_RECORD_ID, ENTITY_SYSTEM_RECORD_ID]
3171
- ]
3056
+ dup_features = [c for c in comparing_columns if c in result_features.columns and c != SYSTEM_RECORD_ID]
3172
3057
  if len(dup_features) > 0:
3173
3058
  self.logger.warning(f"X contain columns with same name as returned from backend: {dup_features}")
3174
3059
  raise ValidationError(self.bundle.get("returned_features_same_as_passed").format(dup_features))
@@ -3179,7 +3064,8 @@ class FeaturesEnricher(TransformerMixin):
3179
3064
  result_features = pd.merge(
3180
3065
  df_with_original_index,
3181
3066
  result_features,
3182
- on=ENTITY_SYSTEM_RECORD_ID,
3067
+ left_on=SYSTEM_RECORD_ID,
3068
+ right_on=SYSTEM_RECORD_ID,
3183
3069
  how="left" if is_transform else "inner",
3184
3070
  )
3185
3071
  result_features = result_features.set_index(original_index_name or DEFAULT_INDEX)
@@ -3190,7 +3076,7 @@ class FeaturesEnricher(TransformerMixin):
3190
3076
  result_features = result_features[~result_features[SYSTEM_RECORD_ID].isin(rows_to_drop[SYSTEM_RECORD_ID])]
3191
3077
  self.logger.info(f"After dropping target outliers size: {len(result_features)}")
3192
3078
 
3193
- result_eval_sets = {}
3079
+ result_eval_sets = dict()
3194
3080
  if not is_transform and EVAL_SET_INDEX in result_features.columns:
3195
3081
  result_train_features = result_features.loc[result_features[EVAL_SET_INDEX] == 0].copy()
3196
3082
  eval_set_indices = list(result_features[EVAL_SET_INDEX].unique())
@@ -3402,7 +3288,7 @@ class FeaturesEnricher(TransformerMixin):
3402
3288
  if autofe_feature.op.is_vector:
3403
3289
  continue
3404
3290
 
3405
- description = {}
3291
+ description = dict()
3406
3292
 
3407
3293
  feature_meta = get_feature_by_name(autofe_feature.get_display_name(shorten=True))
3408
3294
  if feature_meta is None:
@@ -3568,13 +3454,13 @@ class FeaturesEnricher(TransformerMixin):
3568
3454
  self.warning_counter.increment()
3569
3455
 
3570
3456
  if len(valid_search_keys) == 1:
3571
- key, value = list(valid_search_keys.items())[0]
3572
- # Show warning for country only if country is the only key
3573
- if x[key].nunique() == 1:
3574
- msg = self.bundle.get("single_constant_search_key").format(value, x[key].values[0])
3575
- print(msg)
3576
- self.logger.warning(msg)
3577
- self.warning_counter.increment()
3457
+ for k, v in valid_search_keys.items():
3458
+ # Show warning for country only if country is the only key
3459
+ if x[k].nunique() == 1 and (v != SearchKey.COUNTRY or len(valid_search_keys) == 1):
3460
+ msg = self.bundle.get("single_constant_search_key").format(v, x[k].values[0])
3461
+ print(msg)
3462
+ self.logger.warning(msg)
3463
+ self.warning_counter.increment()
3578
3464
 
3579
3465
  self.logger.info(f"Prepared search keys: {valid_search_keys}")
3580
3466
 
@@ -3684,68 +3570,61 @@ class FeaturesEnricher(TransformerMixin):
3684
3570
  def check_need_detect(search_key: SearchKey):
3685
3571
  return not is_transform or search_key in self.fit_search_keys.values()
3686
3572
 
3687
- # if SearchKey.POSTAL_CODE not in search_keys.values() and check_need_detect(SearchKey.POSTAL_CODE):
3688
- if check_need_detect(SearchKey.POSTAL_CODE):
3689
- maybe_keys = PostalCodeSearchKeyDetector().get_search_key_columns(sample, search_keys)
3690
- if maybe_keys:
3691
- new_keys = {key: SearchKey.POSTAL_CODE for key in maybe_keys}
3692
- search_keys.update(new_keys)
3693
- self.autodetected_search_keys.update(new_keys)
3694
- self.logger.info(f"Autodetected search key POSTAL_CODE in column {maybe_keys}")
3573
+ if SearchKey.POSTAL_CODE not in search_keys.values() and check_need_detect(SearchKey.POSTAL_CODE):
3574
+ maybe_key = PostalCodeSearchKeyDetector().get_search_key_column(sample)
3575
+ if maybe_key is not None:
3576
+ search_keys[maybe_key] = SearchKey.POSTAL_CODE
3577
+ self.autodetected_search_keys[maybe_key] = SearchKey.POSTAL_CODE
3578
+ self.logger.info(f"Autodetected search key POSTAL_CODE in column {maybe_key}")
3695
3579
  if not silent_mode:
3696
- print(self.bundle.get("postal_code_detected").format(maybe_keys))
3580
+ print(self.bundle.get("postal_code_detected").format(maybe_key))
3697
3581
 
3698
3582
  if (
3699
3583
  SearchKey.COUNTRY not in search_keys.values()
3700
3584
  and self.country_code is None
3701
3585
  and check_need_detect(SearchKey.COUNTRY)
3702
3586
  ):
3703
- maybe_key = CountrySearchKeyDetector().get_search_key_columns(sample, search_keys)
3704
- if maybe_key:
3705
- search_keys[maybe_key[0]] = SearchKey.COUNTRY
3706
- self.autodetected_search_keys[maybe_key[0]] = SearchKey.COUNTRY
3587
+ maybe_key = CountrySearchKeyDetector().get_search_key_column(sample)
3588
+ if maybe_key is not None:
3589
+ search_keys[maybe_key] = SearchKey.COUNTRY
3590
+ self.autodetected_search_keys[maybe_key] = SearchKey.COUNTRY
3707
3591
  self.logger.info(f"Autodetected search key COUNTRY in column {maybe_key}")
3708
3592
  if not silent_mode:
3709
3593
  print(self.bundle.get("country_detected").format(maybe_key))
3710
3594
 
3711
3595
  if (
3712
- # SearchKey.EMAIL not in search_keys.values()
3713
- SearchKey.HEM not in search_keys.values()
3596
+ SearchKey.EMAIL not in search_keys.values()
3597
+ and SearchKey.HEM not in search_keys.values()
3714
3598
  and check_need_detect(SearchKey.HEM)
3715
3599
  ):
3716
- maybe_keys = EmailSearchKeyDetector().get_search_key_columns(sample, search_keys)
3717
- if maybe_keys:
3600
+ maybe_key = EmailSearchKeyDetector().get_search_key_column(sample)
3601
+ if maybe_key is not None and maybe_key not in search_keys.keys():
3718
3602
  if self.__is_registered or is_demo_dataset:
3719
- new_keys = {key: SearchKey.EMAIL for key in maybe_keys}
3720
- search_keys.update(new_keys)
3721
- self.autodetected_search_keys.update(new_keys)
3722
- self.logger.info(f"Autodetected search key EMAIL in column {maybe_keys}")
3603
+ search_keys[maybe_key] = SearchKey.EMAIL
3604
+ self.autodetected_search_keys[maybe_key] = SearchKey.EMAIL
3605
+ self.logger.info(f"Autodetected search key EMAIL in column {maybe_key}")
3723
3606
  if not silent_mode:
3724
- print(self.bundle.get("email_detected").format(maybe_keys))
3607
+ print(self.bundle.get("email_detected").format(maybe_key))
3725
3608
  else:
3726
3609
  self.logger.warning(
3727
- f"Autodetected search key EMAIL in column {maybe_keys}."
3728
- " But not used because not registered user"
3610
+ f"Autodetected search key EMAIL in column {maybe_key}. But not used because not registered user"
3729
3611
  )
3730
3612
  if not silent_mode:
3731
- print(self.bundle.get("email_detected_not_registered").format(maybe_keys))
3613
+ print(self.bundle.get("email_detected_not_registered").format(maybe_key))
3732
3614
  self.warning_counter.increment()
3733
3615
 
3734
- # if SearchKey.PHONE not in search_keys.values() and check_need_detect(SearchKey.PHONE):
3735
- if check_need_detect(SearchKey.PHONE):
3736
- maybe_keys = PhoneSearchKeyDetector().get_search_key_columns(sample, search_keys)
3737
- if maybe_keys:
3616
+ if SearchKey.PHONE not in search_keys.values() and check_need_detect(SearchKey.PHONE):
3617
+ maybe_key = PhoneSearchKeyDetector().get_search_key_column(sample)
3618
+ if maybe_key is not None and maybe_key not in search_keys.keys():
3738
3619
  if self.__is_registered or is_demo_dataset:
3739
- new_keys = {key: SearchKey.PHONE for key in maybe_keys}
3740
- search_keys.update(new_keys)
3741
- self.autodetected_search_keys.update(new_keys)
3742
- self.logger.info(f"Autodetected search key PHONE in column {maybe_keys}")
3620
+ search_keys[maybe_key] = SearchKey.PHONE
3621
+ self.autodetected_search_keys[maybe_key] = SearchKey.PHONE
3622
+ self.logger.info(f"Autodetected search key PHONE in column {maybe_key}")
3743
3623
  if not silent_mode:
3744
- print(self.bundle.get("phone_detected").format(maybe_keys))
3624
+ print(self.bundle.get("phone_detected").format(maybe_key))
3745
3625
  else:
3746
3626
  self.logger.warning(
3747
- f"Autodetected search key PHONE in column {maybe_keys}. "
3748
- "But not used because not registered user"
3627
+ f"Autodetected search key PHONE in column {maybe_key}. But not used because not registered user"
3749
3628
  )
3750
3629
  if not silent_mode:
3751
3630
  print(self.bundle.get("phone_detected_not_registered"))
upgini/metadata.py CHANGED
@@ -6,8 +6,6 @@ from typing import Dict, List, Optional, Set
6
6
  from pydantic import BaseModel
7
7
 
8
8
  SYSTEM_RECORD_ID = "system_record_id"
9
- ENTITY_SYSTEM_RECORD_ID = "entity_system_record_id"
10
- SEARCH_KEY_UNNEST = "search_key_unnest"
11
9
  SORT_ID = "sort_id"
12
10
  EVAL_SET_INDEX = "eval_set_index"
13
11
  TARGET = "target"
@@ -15,7 +13,7 @@ COUNTRY = "country_iso_code"
15
13
  RENAMED_INDEX = "index_col"
16
14
  DEFAULT_INDEX = "index"
17
15
  ORIGINAL_INDEX = "original_index"
18
- SYSTEM_COLUMNS = {SYSTEM_RECORD_ID, ENTITY_SYSTEM_RECORD_ID, SEARCH_KEY_UNNEST, EVAL_SET_INDEX, TARGET, COUNTRY}
16
+ SYSTEM_COLUMNS = {SYSTEM_RECORD_ID, EVAL_SET_INDEX, TARGET, COUNTRY, SORT_ID}
19
17
 
20
18
 
21
19
  class FileColumnMeaningType(Enum):
@@ -41,8 +39,6 @@ class FileColumnMeaningType(Enum):
41
39
  POSTAL_CODE = "POSTAL_CODE"
42
40
  SYSTEM_RECORD_ID = "SYSTEM_RECORD_ID"
43
41
  EVAL_SET_INDEX = "EVAL_SET_INDEX"
44
- ENTITY_SYSTEM_RECORD_ID = "ENTITY_SYSTEM_RECORD_ID"
45
- UNNEST_KEY = "UNNEST_KEY"
46
42
 
47
43
 
48
44
  class SearchKey(Enum):
@@ -188,10 +184,6 @@ class FileColumnMetadata(BaseModel):
188
184
  meaningType: FileColumnMeaningType
189
185
  minMaxValues: Optional[NumericInterval] = None
190
186
  originalName: Optional[str]
191
- # is this column contains keys from multiple key columns like msisdn1, msisdn2
192
- isUnnest: bool = False
193
- # list of original etalon key column names like msisdn1, msisdn2
194
- unnestKeyNames: Optional[list[str]]
195
187
 
196
188
 
197
189
  class FileMetadata(BaseModel):
@@ -289,7 +281,7 @@ class FeaturesFilter(BaseModel):
289
281
 
290
282
 
291
283
  class RuntimeParameters(BaseModel):
292
- properties: Dict[str, str] = {}
284
+ properties: Dict[str, str] = dict()
293
285
 
294
286
 
295
287
  class SearchCustomization(BaseModel):
upgini/metrics.py CHANGED
@@ -369,7 +369,7 @@ class EstimatorWrapper:
369
369
  "logger": logger,
370
370
  }
371
371
  if estimator is None:
372
- params = {}
372
+ params = dict()
373
373
  params["has_time"] = has_date
374
374
  # if metric_name.upper() in SUPPORTED_CATBOOST_METRICS:
375
375
  # params["eval_metric"] = SUPPORTED_CATBOOST_METRICS[metric_name.upper()]
@@ -88,7 +88,6 @@ unsupported_search_key_type=Unsupported type of key in search_keys: {}
88
88
  search_key_country_and_country_code=\nWARNING: SearchKey.COUNTRY and country_code parameter were passed simultaniously. Parameter country_code will be ignored
89
89
  empty_search_key=Search key {} is empty. Please fill values or remove this search key
90
90
  single_constant_search_key=\nWARNING: Constant value detected for the {} search key in the X dataframe: {}.\nThat search key will add constant features for different y values.\nPlease add extra search keys with non constant values, like the COUNTRY, POSTAL_CODE, DATE, PHONE NUMBER, EMAIL/HEM or IPv4
91
- unsupported_multi_key=Search key {} cannot be used multiple times
92
91
  unsupported_index_column=\nWARNING: Your column with name `index` was dropped because it's reserved name is booked for system needs.
93
92
  date_string_without_format=Date column `{}` has string type, but date_format is not specified. Convert column to datetime type or pass date_format
94
93
  invalid_date_format=Failed to parse date in column `{}`. Try to pass explicit date format in date_format argument of FeaturesEnricher constructor
@@ -1,4 +1,4 @@
1
- from typing import List
1
+ from typing import List, Optional
2
2
 
3
3
  import pandas as pd
4
4
 
@@ -10,18 +10,16 @@ class BaseSearchKeyDetector:
10
10
  def _is_search_key_by_values(self, column: pd.Series) -> bool:
11
11
  raise NotImplementedError
12
12
 
13
- def _get_search_keys_by_name(self, column_names: List[str]) -> List[str]:
14
- return [
15
- column_name
16
- for column_name in column_names
17
- if self._is_search_key_by_name(column_name)
18
- ]
13
+ def _get_search_key_by_name(self, column_names: List[str]) -> Optional[str]:
14
+ for column_name in column_names:
15
+ if self._is_search_key_by_name(column_name):
16
+ return column_name
19
17
 
20
- def get_search_key_columns(self, df: pd.DataFrame, existing_search_keys: List[str]) -> List[str]:
21
- other_columns = [col for col in df.columns if col not in existing_search_keys]
22
- columns_by_names = self._get_search_keys_by_name(other_columns)
23
- columns_by_values = []
24
- for column_name in other_columns:
18
+ def get_search_key_column(self, df: pd.DataFrame) -> Optional[str]:
19
+ maybe_column = self._get_search_key_by_name(df.columns.to_list())
20
+ if maybe_column is not None:
21
+ return maybe_column
22
+
23
+ for column_name in df.columns:
25
24
  if self._is_search_key_by_values(df[column_name]):
26
- columns_by_values.append(column_name)
27
- return list(set(columns_by_names + columns_by_values))
25
+ return column_name
@@ -3,15 +3,7 @@ from typing import Dict, List, Optional, Union
3
3
 
4
4
  import pandas as pd
5
5
 
6
- from upgini.metadata import (
7
- ENTITY_SYSTEM_RECORD_ID,
8
- EVAL_SET_INDEX,
9
- SORT_ID,
10
- SYSTEM_RECORD_ID,
11
- TARGET,
12
- ModelTaskType,
13
- SearchKey,
14
- )
6
+ from upgini.metadata import EVAL_SET_INDEX, SORT_ID, SYSTEM_RECORD_ID, TARGET, ModelTaskType, SearchKey
15
7
  from upgini.resource_bundle import ResourceBundle
16
8
  from upgini.utils.datetime_utils import DateTimeSearchKeyConverter
17
9
  from upgini.utils.target_utils import define_task
@@ -151,8 +143,6 @@ def clean_full_duplicates(
151
143
  unique_columns = df.columns.tolist()
152
144
  if SYSTEM_RECORD_ID in unique_columns:
153
145
  unique_columns.remove(SYSTEM_RECORD_ID)
154
- if ENTITY_SYSTEM_RECORD_ID in unique_columns:
155
- unique_columns.remove(ENTITY_SYSTEM_RECORD_ID)
156
146
  if SORT_ID in unique_columns:
157
147
  unique_columns.remove(SORT_ID)
158
148
  if EVAL_SET_INDEX in unique_columns:
@@ -38,13 +38,11 @@ class EmailSearchKeyConverter:
38
38
  email_column: str,
39
39
  hem_column: Optional[str],
40
40
  search_keys: Dict[str, SearchKey],
41
- unnest_search_keys: Optional[List[str]] = None,
42
41
  logger: Optional[logging.Logger] = None,
43
42
  ):
44
43
  self.email_column = email_column
45
44
  self.hem_column = hem_column
46
45
  self.search_keys = search_keys
47
- self.unnest_search_keys = unnest_search_keys
48
46
  if logger is not None:
49
47
  self.logger = logger
50
48
  else:
@@ -82,12 +80,9 @@ class EmailSearchKeyConverter:
82
80
  del self.search_keys[self.email_column]
83
81
  return df
84
82
  self.search_keys[self.HEM_COLUMN_NAME] = SearchKey.HEM
85
- self.unnest_search_keys.append(self.HEM_COLUMN_NAME)
86
83
  self.email_converted_to_hem = True
87
84
 
88
85
  del self.search_keys[self.email_column]
89
- if self.email_column in self.unnest_search_keys:
90
- self.unnest_search_keys.remove(self.email_column)
91
86
 
92
87
  df[self.EMAIL_ONE_DOMAIN_COLUMN_NAME] = df[self.email_column].apply(self._email_to_one_domain)
93
88
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: upgini
3
- Version: 1.1.299
3
+ Version: 1.1.299a3511.dev7
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
6
6
  Project-URL: Homepage, https://upgini.com/
@@ -26,6 +26,8 @@ Requires-Python: <3.11,>=3.8
26
26
  Requires-Dist: catboost>=1.0.3
27
27
  Requires-Dist: fastparquet>=0.8.1
28
28
  Requires-Dist: ipywidgets>=8.1.0
29
+ Requires-Dist: jarowinkler>=2.0.0
30
+ Requires-Dist: levenshtein>=0.25.1
29
31
  Requires-Dist: lightgbm>=3.3.2
30
32
  Requires-Dist: numpy>=1.19.0
31
33
  Requires-Dist: pandas<3.0.0,>=1.1.0
@@ -131,7 +133,7 @@ Description-Content-Type: text/markdown
131
133
  |Consumer Confidence index| 44 |22|-|Monthly|date, country|No
132
134
  |World economic indicators|191 |41|-|Monthly|date, country|No
133
135
  |Markets data|-|17|-|Monthly|date, datetime|No
134
- |World mobile & fixed broadband network coverage and performance |167|-|3|Monthly|country, postal/ZIP code|No
136
+ |World mobile & fixed broadband network coverage and perfomance |167|-|3|Monthly|country, postal/ZIP code|No
135
137
  |World demographic data |90|-|2|Annual|country, postal/ZIP code|No
136
138
  |World house prices |44|-|3|Annual|country, postal/ZIP code|No
137
139
  |Public social media profile data |104|-|-|Monthly|date, email/HEM, phone |Yes
@@ -840,4 +842,4 @@ Some convenient ways to start contributing are:
840
842
  - [More perks for registered users](https://profile.upgini.com)
841
843
 
842
844
  <sup>😔 Found mistype or a bug in code snippet? Our bad! <a href="https://github.com/upgini/upgini/issues/new?assignees=&title=readme%2Fbug">
843
- Please report it here</a></sup>
845
+ Please report it here</a></sup>
@@ -1,26 +1,26 @@
1
- upgini/__about__.py,sha256=UQSkVroAOLINihC2MNtqGFKJfbFI7X9Lopc0G3W2z0I,24
1
+ upgini/__about__.py,sha256=jjXVyJxJgsQpkOsEHy8YVhJaQJ27frNdozj77ToT5ps,34
2
2
  upgini/__init__.py,sha256=ObEtjFkIssl83qeKNMLpIQygfwK8TzztwiI43YTsAP0,353
3
3
  upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
4
- upgini/dataset.py,sha256=MOzBVsvzlHLxNfPWtMaXC_jIPeW7_gUvbSGeXnsPgNI,46158
4
+ upgini/dataset.py,sha256=7TLVVhGtjgx_9yaiaIUK3kZSe_R9wg5dY0d4F5qCGM4,45636
5
5
  upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
6
- upgini/features_enricher.py,sha256=afPo71dYHp9edimm3TWeAzr6aY0sPYRkvdqR34cQP9A,183157
6
+ upgini/features_enricher.py,sha256=HQFLw3VyEsZfAt4xFnIYOnp3fzQSHAsyHzIm0gTJpOI,177543
7
7
  upgini/http.py,sha256=bp6jWl422Icy3AhHMdCcJv5NjExE45gSMmzMTPJjPuk,42600
8
8
  upgini/lazy_import.py,sha256=EwoM0msNGbSmWBhGbrLDny1DSnOlvTxCjmMKPxYlDms,610
9
- upgini/metadata.py,sha256=wOFCJruDBhC4Hiiiqf8GeHZnnm6rhJy8t6fg5B0Z4TQ,10209
10
- upgini/metrics.py,sha256=Tu5cN8RlhOSSMWUTXRSkdl8SWBqR1N_2eJpBum9pZxc,30926
9
+ upgini/metadata.py,sha256=qDAIO7NLSSQp_XiXCv3U4XJTLO0KH3YuQ8lvCLYPqzs,9781
10
+ upgini/metrics.py,sha256=DiDgdFvYu64ArlPEgjppZShK6yybWtIEbdPAhI3yO1I,30930
11
11
  upgini/search_task.py,sha256=LtRJ9bCPjMo1gJ-sUDKERhDwGcWKImrzwVFHjkMSQHQ,17071
12
12
  upgini/spinner.py,sha256=4iMd-eIe_BnkqFEMIliULTbj6rNI2HkN_VJ4qYe0cUc,1118
13
13
  upgini/version_validator.py,sha256=ddSKUK_-eGJB3NgrqOMoWJU-OxQ253WsNLp8aqJkaIM,1389
14
14
  upgini/ads_management/__init__.py,sha256=qzyisOToVRP-tquAJD1PblZhNtMrOB8FiyF9JvfkvgE,50
15
15
  upgini/ads_management/ads_manager.py,sha256=igVbN2jz80Umb2BUJixmJVj-zx8unoKpecVo-R-nGdw,2648
16
16
  upgini/autofe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
17
- upgini/autofe/all_operands.py,sha256=XbvgX2IU4aee9rJZ--d5MdmrfKhON_emle5-RU1qlEY,2506
18
- upgini/autofe/binary.py,sha256=8FXPJxN7fnC5wphO0Dp1tQCa0lFMSDGQGvBMkSIVAcE,4155
19
- upgini/autofe/date.py,sha256=8zYVhjl7jVS4xt-IjCgk9px2LHnACX2YlMlmDELlRTc,7943
20
- upgini/autofe/feature.py,sha256=ayxiF8Ip1ww_pt_BC9Pk127fAHZ_3fuluulS1EYLolk,13423
17
+ upgini/autofe/all_operands.py,sha256=3LiH9iU-ArGmYpS8FHWH7yCFx40ILfvlSXJlKIa75BQ,2542
18
+ upgini/autofe/binary.py,sha256=ml0MszLARZqp3UGUqTGsVjT4DD69zTisfBBEqbZ7klU,6767
19
+ upgini/autofe/date.py,sha256=Qq11EGLFHJxy5DQF2V1CBMtH2j4g5RpinRcw-7SobMs,8442
20
+ upgini/autofe/feature.py,sha256=COBHf62aCniOGuNF6K38SQRmT93Yb0uw6guUm5QDc3s,13646
21
21
  upgini/autofe/groupby.py,sha256=4WjDzQxqpZxB79Ih4ihMMI5GDxaFqiH6ZelfV82ClT4,3091
22
22
  upgini/autofe/operand.py,sha256=MKEsl3zxpWzRDpTkE0sNJxTu62U20sWOvEKhPjUWS6s,2915
23
- upgini/autofe/unary.py,sha256=ZWjLd-CUkNt_PpM8YuWLLipW1v_RdBlsl4JxXIVo9aM,3652
23
+ upgini/autofe/unary.py,sha256=B4wp8oKnlJ0nUng-DRMKSiF8MHlhAFYbgmo9Nd_0ZaA,3777
24
24
  upgini/autofe/vector.py,sha256=dLxfAstJs-gw_OQ1xxoxcM6pVzORlV0HVzdzt7cLXVQ,606
25
25
  upgini/data_source/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
26
26
  upgini/data_source/data_source_publisher.py,sha256=1cQZrK630VztwGGDp41ec9gqIeUtkefaqSSQEitVWiM,19581
@@ -30,22 +30,22 @@ upgini/normalizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU
30
30
  upgini/normalizer/phone_normalizer.py,sha256=EzTaahk6myRv6ZXgbyVFGY4kpo_2VlQgOrm5_lfbmNI,9996
31
31
  upgini/resource_bundle/__init__.py,sha256=S5F2G47pnJd2LDpmFsjDqEwiKkP8Hm-hcseDbMka6Ko,8345
32
32
  upgini/resource_bundle/exceptions.py,sha256=5fRvx0_vWdE1-7HcSgF0tckB4A9AKyf5RiinZkInTsI,621
33
- upgini/resource_bundle/strings.properties,sha256=6jYqcxj06ZopXwr5YYMGXX1QiNNJNFo2SuwAR0qleRk,26358
33
+ upgini/resource_bundle/strings.properties,sha256=1oHurL4I83P2lXIavx9vSdKM8ZqncAPXH2IZf76bD6g,26292
34
34
  upgini/resource_bundle/strings_widget.properties,sha256=gOdqvZWntP2LCza_tyVk1_yRYcG4c04K9sQOAVhF_gw,1577
35
35
  upgini/sampler/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
36
36
  upgini/sampler/base.py,sha256=7GpjYqjOp58vYcJLiX__1R5wjUlyQbxvHJ2klFnup_M,6389
37
37
  upgini/sampler/random_under_sampler.py,sha256=TIbm7ATo-bCMF-IiS5sZeDC1ad1SYg0eY_rRmg84yIQ,4024
38
38
  upgini/sampler/utils.py,sha256=PYOk3kKSnFlyxcpdtDNLBEEhTB4lO_iP7pQHqeUcmAc,20211
39
39
  upgini/utils/__init__.py,sha256=O_KgzKiJjW3g4NoqZ7lAxUpoHcBi_gze6r3ndEjCH74,842
40
- upgini/utils/base_search_key_detector.py,sha256=Inc6iGG-VXQdejWFfbekIkZk2ahC4k7CdGqzOkie6Bs,1021
40
+ upgini/utils/base_search_key_detector.py,sha256=UNs2uxEcD1N_mOtkx3k6U70DCajW-QEO2vZp41GF0mU,855
41
41
  upgini/utils/blocked_time_series.py,sha256=Uqr3vp4YqNclj2-PzEYqVy763GSXHn86sbpIl1UOB4s,3382
42
42
  upgini/utils/country_utils.py,sha256=yE8oRgMpXuJxPfQm4fioY6dg6700HgVnHSk4Cv9sUyM,6511
43
43
  upgini/utils/custom_loss_utils.py,sha256=kieNZYBYZm5ZGBltF1F_jOSF4ea6C29rYuCyiDcqVNY,3857
44
44
  upgini/utils/cv_utils.py,sha256=w6FQb9nO8BWDx88EF83NpjPLarK4eR4ia0Wg0kLBJC4,3525
45
45
  upgini/utils/datetime_utils.py,sha256=Ujmu1ouwSFtG5SywQXJlmtDnGigAnIWPdE5Vx5NvgUM,10951
46
- upgini/utils/deduplicate_utils.py,sha256=Zvs7zW4QzaERQmJNPrTVf2ZTVBkBLOycFCzyMwtXuV8,8770
46
+ upgini/utils/deduplicate_utils.py,sha256=6AbARehUCghJZ4PppFtrej2s3gFRruh41MEm6mzakHs,8607
47
47
  upgini/utils/display_utils.py,sha256=A2ouB5eiZ-Kyt9ykYxkLQwyoRPrdYeJymwNTiajtFXs,10990
48
- upgini/utils/email_utils.py,sha256=aKHa4xVBSsEsiZtFCPj_DrUaFupceYfvJeP_e8w_D5E,3813
48
+ upgini/utils/email_utils.py,sha256=PLufTO97Pg9PPsNqB9agcM6M98MIxKUgIgNn2mVwSQ0,3520
49
49
  upgini/utils/fallback_progress_bar.py,sha256=PDaKb8dYpVZaWMroNcOHsTc3pSjgi9mOm0--cOFTwJ0,1074
50
50
  upgini/utils/features_validator.py,sha256=PgKNt5dyqfErTvjtRNNUS9g7GFqHBtAtnsfA-V5UO1A,3307
51
51
  upgini/utils/format.py,sha256=Yv5cvvSs2bOLUzzNu96Pu33VMDNbabio92QepUj41jU,243
@@ -57,7 +57,7 @@ upgini/utils/sklearn_ext.py,sha256=13jQS_k7v0aUtudXV6nGUEWjttPQzAW9AFYL5wgEz9k,4
57
57
  upgini/utils/target_utils.py,sha256=Y96_PJ5cC-WsEbeqg20v9uqywDQobLoTb-xoP7S3o4E,7807
58
58
  upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
59
59
  upgini/utils/warning_counter.py,sha256=dIWBB4dI5XRRJZudvIlqlIYKEiwLLPcXarsZuYRt338,227
60
- upgini-1.1.299.dist-info/METADATA,sha256=IITBjTPICcuZmOUmDbmU_GdWOLVSSrDw_llm2mHGK3A,48153
61
- upgini-1.1.299.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
62
- upgini-1.1.299.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
63
- upgini-1.1.299.dist-info/RECORD,,
60
+ upgini-1.1.299a3511.dev7.dist-info/METADATA,sha256=4uKYgKYftXwbRamHS0DphtecHOppCn1BwqIEBLOudak,48230
61
+ upgini-1.1.299a3511.dev7.dist-info/WHEEL,sha256=zEMcRr9Kr03x1ozGwg5v9NQBKn3kndp6LSoSlVg-jhU,87
62
+ upgini-1.1.299a3511.dev7.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
63
+ upgini-1.1.299a3511.dev7.dist-info/RECORD,,
@@ -1,4 +1,4 @@
1
1
  Wheel-Version: 1.0
2
- Generator: hatchling 1.25.0
2
+ Generator: hatchling 1.24.2
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any