upgini 1.1.299a3511.dev10__tar.gz → 1.1.300__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of upgini might be problematic. Click here for more details.

Files changed (66)
  1. {upgini-1.1.299a3511.dev10 → upgini-1.1.300}/PKG-INFO +3 -5
  2. {upgini-1.1.299a3511.dev10 → upgini-1.1.300}/README.md +2 -2
  3. {upgini-1.1.299a3511.dev10 → upgini-1.1.300}/pyproject.toml +0 -3
  4. upgini-1.1.300/src/upgini/__about__.py +1 -0
  5. {upgini-1.1.299a3511.dev10 → upgini-1.1.300}/src/upgini/autofe/all_operands.py +7 -26
  6. {upgini-1.1.299a3511.dev10 → upgini-1.1.300}/src/upgini/autofe/binary.py +2 -93
  7. {upgini-1.1.299a3511.dev10 → upgini-1.1.300}/src/upgini/autofe/date.py +4 -17
  8. {upgini-1.1.299a3511.dev10 → upgini-1.1.300}/src/upgini/autofe/feature.py +8 -10
  9. {upgini-1.1.299a3511.dev10 → upgini-1.1.300}/src/upgini/autofe/unary.py +0 -7
  10. {upgini-1.1.299a3511.dev10 → upgini-1.1.300}/src/upgini/dataset.py +11 -2
  11. {upgini-1.1.299a3511.dev10 → upgini-1.1.300}/src/upgini/features_enricher.py +223 -103
  12. {upgini-1.1.299a3511.dev10 → upgini-1.1.300}/src/upgini/metadata.py +10 -2
  13. {upgini-1.1.299a3511.dev10 → upgini-1.1.300}/src/upgini/metrics.py +1 -1
  14. {upgini-1.1.299a3511.dev10 → upgini-1.1.300}/src/upgini/resource_bundle/strings.properties +1 -0
  15. upgini-1.1.300/src/upgini/utils/base_search_key_detector.py +27 -0
  16. {upgini-1.1.299a3511.dev10 → upgini-1.1.300}/src/upgini/utils/deduplicate_utils.py +11 -1
  17. {upgini-1.1.299a3511.dev10 → upgini-1.1.300}/src/upgini/utils/email_utils.py +5 -0
  18. upgini-1.1.299a3511.dev10/src/upgini/__about__.py +0 -1
  19. upgini-1.1.299a3511.dev10/src/upgini/utils/base_search_key_detector.py +0 -25
  20. {upgini-1.1.299a3511.dev10 → upgini-1.1.300}/.gitignore +0 -0
  21. {upgini-1.1.299a3511.dev10 → upgini-1.1.300}/LICENSE +0 -0
  22. {upgini-1.1.299a3511.dev10 → upgini-1.1.300}/src/upgini/__init__.py +0 -0
  23. {upgini-1.1.299a3511.dev10 → upgini-1.1.300}/src/upgini/ads.py +0 -0
  24. {upgini-1.1.299a3511.dev10 → upgini-1.1.300}/src/upgini/ads_management/__init__.py +0 -0
  25. {upgini-1.1.299a3511.dev10 → upgini-1.1.300}/src/upgini/ads_management/ads_manager.py +0 -0
  26. {upgini-1.1.299a3511.dev10 → upgini-1.1.300}/src/upgini/autofe/__init__.py +0 -0
  27. {upgini-1.1.299a3511.dev10 → upgini-1.1.300}/src/upgini/autofe/groupby.py +0 -0
  28. {upgini-1.1.299a3511.dev10 → upgini-1.1.300}/src/upgini/autofe/operand.py +0 -0
  29. {upgini-1.1.299a3511.dev10 → upgini-1.1.300}/src/upgini/autofe/vector.py +0 -0
  30. {upgini-1.1.299a3511.dev10 → upgini-1.1.300}/src/upgini/data_source/__init__.py +0 -0
  31. {upgini-1.1.299a3511.dev10 → upgini-1.1.300}/src/upgini/data_source/data_source_publisher.py +0 -0
  32. {upgini-1.1.299a3511.dev10 → upgini-1.1.300}/src/upgini/errors.py +0 -0
  33. {upgini-1.1.299a3511.dev10 → upgini-1.1.300}/src/upgini/http.py +0 -0
  34. {upgini-1.1.299a3511.dev10 → upgini-1.1.300}/src/upgini/lazy_import.py +0 -0
  35. {upgini-1.1.299a3511.dev10 → upgini-1.1.300}/src/upgini/mdc/__init__.py +0 -0
  36. {upgini-1.1.299a3511.dev10 → upgini-1.1.300}/src/upgini/mdc/context.py +0 -0
  37. {upgini-1.1.299a3511.dev10 → upgini-1.1.300}/src/upgini/normalizer/__init__.py +0 -0
  38. {upgini-1.1.299a3511.dev10 → upgini-1.1.300}/src/upgini/normalizer/phone_normalizer.py +0 -0
  39. {upgini-1.1.299a3511.dev10 → upgini-1.1.300}/src/upgini/resource_bundle/__init__.py +0 -0
  40. {upgini-1.1.299a3511.dev10 → upgini-1.1.300}/src/upgini/resource_bundle/exceptions.py +0 -0
  41. {upgini-1.1.299a3511.dev10 → upgini-1.1.300}/src/upgini/resource_bundle/strings_widget.properties +0 -0
  42. {upgini-1.1.299a3511.dev10 → upgini-1.1.300}/src/upgini/sampler/__init__.py +0 -0
  43. {upgini-1.1.299a3511.dev10 → upgini-1.1.300}/src/upgini/sampler/base.py +0 -0
  44. {upgini-1.1.299a3511.dev10 → upgini-1.1.300}/src/upgini/sampler/random_under_sampler.py +0 -0
  45. {upgini-1.1.299a3511.dev10 → upgini-1.1.300}/src/upgini/sampler/utils.py +0 -0
  46. {upgini-1.1.299a3511.dev10 → upgini-1.1.300}/src/upgini/search_task.py +0 -0
  47. {upgini-1.1.299a3511.dev10 → upgini-1.1.300}/src/upgini/spinner.py +0 -0
  48. {upgini-1.1.299a3511.dev10 → upgini-1.1.300}/src/upgini/utils/__init__.py +0 -0
  49. {upgini-1.1.299a3511.dev10 → upgini-1.1.300}/src/upgini/utils/blocked_time_series.py +0 -0
  50. {upgini-1.1.299a3511.dev10 → upgini-1.1.300}/src/upgini/utils/country_utils.py +0 -0
  51. {upgini-1.1.299a3511.dev10 → upgini-1.1.300}/src/upgini/utils/custom_loss_utils.py +0 -0
  52. {upgini-1.1.299a3511.dev10 → upgini-1.1.300}/src/upgini/utils/cv_utils.py +0 -0
  53. {upgini-1.1.299a3511.dev10 → upgini-1.1.300}/src/upgini/utils/datetime_utils.py +0 -0
  54. {upgini-1.1.299a3511.dev10 → upgini-1.1.300}/src/upgini/utils/display_utils.py +0 -0
  55. {upgini-1.1.299a3511.dev10 → upgini-1.1.300}/src/upgini/utils/fallback_progress_bar.py +0 -0
  56. {upgini-1.1.299a3511.dev10 → upgini-1.1.300}/src/upgini/utils/features_validator.py +0 -0
  57. {upgini-1.1.299a3511.dev10 → upgini-1.1.300}/src/upgini/utils/format.py +0 -0
  58. {upgini-1.1.299a3511.dev10 → upgini-1.1.300}/src/upgini/utils/ip_utils.py +0 -0
  59. {upgini-1.1.299a3511.dev10 → upgini-1.1.300}/src/upgini/utils/phone_utils.py +0 -0
  60. {upgini-1.1.299a3511.dev10 → upgini-1.1.300}/src/upgini/utils/postal_code_utils.py +0 -0
  61. {upgini-1.1.299a3511.dev10 → upgini-1.1.300}/src/upgini/utils/progress_bar.py +0 -0
  62. {upgini-1.1.299a3511.dev10 → upgini-1.1.300}/src/upgini/utils/sklearn_ext.py +0 -0
  63. {upgini-1.1.299a3511.dev10 → upgini-1.1.300}/src/upgini/utils/target_utils.py +0 -0
  64. {upgini-1.1.299a3511.dev10 → upgini-1.1.300}/src/upgini/utils/track_info.py +0 -0
  65. {upgini-1.1.299a3511.dev10 → upgini-1.1.300}/src/upgini/utils/warning_counter.py +0 -0
  66. {upgini-1.1.299a3511.dev10 → upgini-1.1.300}/src/upgini/version_validator.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: upgini
3
- Version: 1.1.299a3511.dev10
3
+ Version: 1.1.300
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
6
6
  Project-URL: Homepage, https://upgini.com/
@@ -26,8 +26,6 @@ Requires-Python: <3.11,>=3.8
26
26
  Requires-Dist: catboost>=1.0.3
27
27
  Requires-Dist: fastparquet>=0.8.1
28
28
  Requires-Dist: ipywidgets>=8.1.0
29
- Requires-Dist: jarowinkler>=2.0.0
30
- Requires-Dist: levenshtein>=0.25.1
31
29
  Requires-Dist: lightgbm>=3.3.2
32
30
  Requires-Dist: numpy>=1.19.0
33
31
  Requires-Dist: pandas<3.0.0,>=1.1.0
@@ -133,7 +131,7 @@ Description-Content-Type: text/markdown
133
131
  |Consumer Confidence index| 44 |22|-|Monthly|date, country|No
134
132
  |World economic indicators|191 |41|-|Monthly|date, country|No
135
133
  |Markets data|-|17|-|Monthly|date, datetime|No
136
- |World mobile & fixed broadband network coverage and perfomance |167|-|3|Monthly|country, postal/ZIP code|No
134
+ |World mobile & fixed broadband network coverage and performance |167|-|3|Monthly|country, postal/ZIP code|No
137
135
  |World demographic data |90|-|2|Annual|country, postal/ZIP code|No
138
136
  |World house prices |44|-|3|Annual|country, postal/ZIP code|No
139
137
  |Public social media profile data |104|-|-|Monthly|date, email/HEM, phone |Yes
@@ -842,4 +840,4 @@ Some convenient ways to start contributing are:
842
840
  - [More perks for registered users](https://profile.upgini.com)
843
841
 
844
842
  <sup>😔 Found mistype or a bug in code snippet? Our bad! <a href="https://github.com/upgini/upgini/issues/new?assignees=&title=readme%2Fbug">
845
- Please report it here</a></sup>
843
+ Please report it here</a></sup>
@@ -90,7 +90,7 @@
90
90
  |Consumer Confidence index| 44 |22|-|Monthly|date, country|No
91
91
  |World economic indicators|191 |41|-|Monthly|date, country|No
92
92
  |Markets data|-|17|-|Monthly|date, datetime|No
93
- |World mobile & fixed broadband network coverage and perfomance |167|-|3|Monthly|country, postal/ZIP code|No
93
+ |World mobile & fixed broadband network coverage and performance |167|-|3|Monthly|country, postal/ZIP code|No
94
94
  |World demographic data |90|-|2|Annual|country, postal/ZIP code|No
95
95
  |World house prices |44|-|3|Annual|country, postal/ZIP code|No
96
96
  |Public social media profile data |104|-|-|Monthly|date, email/HEM, phone |Yes
@@ -799,4 +799,4 @@ Some convenient ways to start contributing are:
799
799
  - [More perks for registered users](https://profile.upgini.com)
800
800
 
801
801
  <sup>😔 Found mistype or a bug in code snippet? Our bad! <a href="https://github.com/upgini/upgini/issues/new?assignees=&title=readme%2Fbug">
802
- Please report it here</a></sup>
802
+ Please report it here</a></sup>
@@ -49,9 +49,6 @@ dependencies = [
49
49
  "scikit-learn>=1.3.0",
50
50
  "python-bidi==0.4.2",
51
51
  "xhtml2pdf==0.2.11",
52
- "jarowinkler>=2.0.0",
53
- "levenshtein>=0.25.1",
54
- "python-bidi==0.4.2",
55
52
  ]
56
53
 
57
54
  [project.urls]
@@ -0,0 +1 @@
1
+ __version__ = "1.1.300"
@@ -1,20 +1,6 @@
1
1
  from typing import Dict
2
2
 
3
- from upgini.autofe.binary import (
4
- Add,
5
- Combine,
6
- CombineThenFreq,
7
- Distance,
8
- Divide,
9
- JaroWinklerSim1,
10
- JaroWinklerSim2,
11
- LevenshteinSim,
12
- Max,
13
- Min,
14
- Multiply,
15
- Sim,
16
- Subtract,
17
- )
3
+ from upgini.autofe.binary import Add, Divide, Max, Min, Multiply, Sim, Subtract
18
4
  from upgini.autofe.date import (
19
5
  DateDiff,
20
6
  DateDiffType2,
@@ -23,9 +9,9 @@ from upgini.autofe.date import (
23
9
  DatePercentile,
24
10
  DatePercentileMethod2,
25
11
  )
26
- from upgini.autofe.groupby import GroupByThenAgg, GroupByThenFreq, GroupByThenNUnique, GroupByThenRank
12
+ from upgini.autofe.groupby import GroupByThenAgg, GroupByThenRank
27
13
  from upgini.autofe.operand import Operand
28
- from upgini.autofe.unary import Abs, Embeddings, Floor, Freq, Log, Residual, Norm, Sigmoid, Sqrt, Square
14
+ from upgini.autofe.unary import Abs, Floor, Freq, Log, Residual, Norm, Sigmoid, Sqrt, Square
29
15
  from upgini.autofe.vector import Mean, Sum
30
16
 
31
17
  ALL_OPERANDS: Dict[str, Operand] = {
@@ -53,10 +39,10 @@ ALL_OPERANDS: Dict[str, Operand] = {
53
39
  GroupByThenAgg(name="GroupByThenMedian", agg="median"),
54
40
  GroupByThenAgg(name="GroupByThenStd", output_type="float", agg="std"),
55
41
  GroupByThenRank(),
56
- Combine(),
57
- CombineThenFreq(),
58
- GroupByThenNUnique(),
59
- GroupByThenFreq(),
42
+ Operand(name="Combine", has_symmetry_importance=True, output_type="object", is_categorical=True),
43
+ Operand(name="CombineThenFreq", has_symmetry_importance=True, output_type="float"),
44
+ Operand(name="GroupByThenNUnique", output_type="int", is_vectorizable=True, is_grouping=True),
45
+ Operand(name="GroupByThenFreq", output_type="float", is_grouping=True),
60
46
  Sim(),
61
47
  DateDiff(),
62
48
  DateDiffType2(),
@@ -73,11 +59,6 @@ ALL_OPERANDS: Dict[str, Operand] = {
73
59
  DatePercentile(),
74
60
  DatePercentileMethod2(),
75
61
  Norm(),
76
- JaroWinklerSim1(),
77
- JaroWinklerSim2(),
78
- LevenshteinSim(),
79
- Distance(),
80
- Embeddings(),
81
62
  ]
82
63
  }
83
64
 
@@ -1,11 +1,7 @@
1
- import abc
2
- from typing import Optional
3
- import Levenshtein
4
1
  import numpy as np
5
2
  import pandas as pd
6
3
  from numpy import dot
7
4
  from numpy.linalg import norm
8
- from jarowinkler import jarowinkler_similarity
9
5
 
10
6
  from upgini.autofe.operand import PandasOperand, VectorizableMixin
11
7
 
@@ -134,27 +130,7 @@ class CombineThenFreq(PandasOperand):
134
130
  self._loc(temp, value_counts)
135
131
 
136
132
 
137
- class Distance(PandasOperand):
138
- name = "dist"
139
- is_binary = True
140
- output_type = "float"
141
- is_symmetrical = True
142
- has_symmetry_importance = True
143
-
144
- def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
145
- return pd.Series(
146
- 1 - self.__dot(left, right) / (self.__dot(left, left) * self.__dot(right, right)), index=left.index
147
- )
148
-
149
- # row-wise dot product
150
- def __dot(self, left: pd.Series, right: pd.Series) -> pd.Series:
151
- res = (left.dropna() * right.dropna()).apply(np.sum)
152
- res = res.reindex(left.index.union(right.index))
153
- return res
154
-
155
-
156
- # Left for backward compatibility
157
- class Sim(Distance):
133
+ class Sim(PandasOperand):
158
134
  name = "sim"
159
135
  is_binary = True
160
136
  output_type = "float"
@@ -162,71 +138,4 @@ class Sim(Distance):
162
138
  has_symmetry_importance = True
163
139
 
164
140
  def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
165
- return 1 - super().calculate_binary(left, right)
166
-
167
-
168
- class StringSim(PandasOperand, abc.ABC):
169
- def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
170
- sims = []
171
- for i in left.index:
172
- left_i = self._prepare_value(left.get(i))
173
- right_i = self._prepare_value(right.get(i))
174
- if left_i is not None and right_i is not None:
175
- sims.append(self._similarity(left_i, right_i))
176
- else:
177
- sims.append(None)
178
-
179
- return pd.Series(sims, index=left.index)
180
-
181
- @abc.abstractmethod
182
- def _prepare_value(self, value: Optional[str]) -> Optional[str]:
183
- pass
184
-
185
- @abc.abstractmethod
186
- def _similarity(self, left: str, right: str) -> float:
187
- pass
188
-
189
-
190
- class JaroWinklerSim1(StringSim):
191
- name = "sim_jw1"
192
- is_binary = True
193
- input_type = "string"
194
- output_type = "float"
195
- is_symmetrical = True
196
- has_symmetry_importance = True
197
-
198
- def _prepare_value(self, value: Optional[str]) -> Optional[str]:
199
- return value
200
-
201
- def _similarity(self, left: str, right: str) -> float:
202
- return jarowinkler_similarity(left, right)
203
-
204
-
205
- class JaroWinklerSim2(StringSim):
206
- name = "sim_jw2"
207
- is_binary = True
208
- input_type = "string"
209
- output_type = "float"
210
- is_symmetrical = True
211
- has_symmetry_importance = True
212
-
213
- def _prepare_value(self, value: Optional[str]) -> Optional[str]:
214
- return value[::-1] if value is not None else None
215
-
216
- def _similarity(self, left: str, right: str) -> float:
217
- return jarowinkler_similarity(left, right)
218
-
219
-
220
- class LevenshteinSim(StringSim):
221
- name = "sim_lv"
222
- is_binary = True
223
- input_type = "string"
224
- output_type = "float"
225
- is_symmetrical = True
226
- has_symmetry_importance = True
227
-
228
- def _prepare_value(self, value: Optional[str]) -> Optional[str]:
229
- return value
230
-
231
- def _similarity(self, left: str, right: str) -> float:
232
- return 1 - Levenshtein.distance(left, right) / max(len(left), len(right))
141
+ return dot(left, right) / (norm(left) * norm(right))
@@ -43,8 +43,6 @@ class DateDiff(PandasOperand, DateDiffMixin):
43
43
  is_binary = True
44
44
  has_symmetry_importance = True
45
45
 
46
- replace_negative: bool = False
47
-
48
46
  def get_params(self) -> Dict[str, Optional[str]]:
49
47
  res = super().get_params()
50
48
  res.update(
@@ -52,7 +50,6 @@ class DateDiff(PandasOperand, DateDiffMixin):
52
50
  "diff_unit": self.diff_unit,
53
51
  "left_unit": self.left_unit,
54
52
  "right_unit": self.right_unit,
55
- "replace_negative": self.replace_negative,
56
53
  }
57
54
  )
58
55
  return res
@@ -64,8 +61,7 @@ class DateDiff(PandasOperand, DateDiffMixin):
64
61
  return self.__replace_negative(diff)
65
62
 
66
63
  def __replace_negative(self, x: Union[pd.DataFrame, pd.Series]):
67
- if self.replace_negative:
68
- x[x < 0] = None
64
+ x[x < 0] = None
69
65
  return x
70
66
 
71
67
 
@@ -89,7 +85,7 @@ class DateDiffType2(PandasOperand, DateDiffMixin):
89
85
  left = self._convert_to_date(left, self.left_unit)
90
86
  right = self._convert_to_date(right, self.right_unit)
91
87
  future = right + (left.dt.year - right.dt.year).apply(
92
- lambda y: pd.tseries.offsets.DateOffset(years=0 if np.isnan(y) else y)
88
+ lambda y: np.datetime64("NaT") if np.isnan(y) else pd.tseries.offsets.DateOffset(years=y)
93
89
  )
94
90
  future = pd.to_datetime(future)
95
91
  before = future[future < left]
@@ -105,19 +101,13 @@ _ext_aggregations = {"nunique": (lambda x: len(np.unique(x)), 0), "count": (len,
105
101
  class DateListDiff(PandasOperand, DateDiffMixin):
106
102
  is_binary = True
107
103
  has_symmetry_importance = True
108
-
109
104
  aggregation: str
110
- replace_negative: bool = False
111
105
 
112
106
  def get_params(self) -> Dict[str, Optional[str]]:
113
107
  res = super().get_params()
114
108
  res.update(
115
109
  {
116
110
  "aggregation": self.aggregation,
117
- "diff_unit": self.diff_unit,
118
- "left_unit": self.left_unit,
119
- "right_unit": self.right_unit,
120
- "replace_negative": self.replace_negative,
121
111
  }
122
112
  )
123
113
  return res
@@ -135,7 +125,7 @@ class DateListDiff(PandasOperand, DateDiffMixin):
135
125
 
136
126
  def _diff(self, x: TimedeltaArray):
137
127
  x = self._convert_diff_to_unit(x)
138
- return x[x > 0] if self.replace_negative else x
128
+ return x[x > 0]
139
129
 
140
130
  def _agg(self, x):
141
131
  method = getattr(np, self.aggregation, None)
@@ -167,10 +157,7 @@ class DateListDiffBounded(DateListDiff):
167
157
  super().__init__(**data)
168
158
 
169
159
  def _agg(self, x):
170
- x = x[
171
- (x >= (self.lower_bound if self.lower_bound is not None else -np.inf))
172
- & (x < (self.upper_bound if self.upper_bound is not None else np.inf))
173
- ]
160
+ x = x[(x >= (self.lower_bound or -np.inf)) & (x < (self.upper_bound or np.inf))]
174
161
  return super()._agg(x)
175
162
 
176
163
 
@@ -138,17 +138,15 @@ class Feature:
138
138
  if self.cached_display_name is not None and cache:
139
139
  return self.cached_display_name
140
140
 
141
- should_stack_op = not isinstance(self.children[0], Column) if self.op.is_unary else False
142
- prev_name = [self.children[0].get_op_display_name()] if should_stack_op else []
143
-
144
141
  if self.alias:
145
142
  components = ["f_autofe", self.alias]
146
- elif shorten and (not self.op.is_unary or should_stack_op):
147
- components = ["f_autofe"] + prev_name + [self.get_op_display_name()]
143
+ elif shorten and not self.op.is_unary:
144
+ components = ["f_autofe", self.get_op_display_name()]
148
145
  else:
149
- components = (
150
- ["f_" + "_f_".join(self.get_columns(**kwargs))] + ["autofe"] + prev_name + [self.get_op_display_name()]
151
- )
146
+ components = ["f_" + "_f_".join(self.get_columns(**kwargs))] + [
147
+ "autofe",
148
+ self.get_op_display_name(),
149
+ ]
152
150
  components.extend([str(self.display_index)] if self.display_index is not None else [])
153
151
  display_name = "_".join(components)
154
152
 
@@ -323,10 +321,10 @@ class FeatureGroup:
323
321
  lower_order_names = [ch.get_display_name() for ch in lower_order_children]
324
322
  if any(isinstance(f, Feature) for f in lower_order_children):
325
323
  child_data = pd.concat(
326
- [data[main_column or []]] + [ch.calculate(data) for ch in lower_order_children],
324
+ [data[main_column]] + [ch.calculate(data) for ch in lower_order_children],
327
325
  axis=1,
328
326
  )
329
- child_data.columns = ([main_column] if main_column is not None else []) + lower_order_names
327
+ child_data.columns = [main_column] + lower_order_names
330
328
  else:
331
329
  child_data = data[columns]
332
330
 
@@ -125,10 +125,3 @@ class Norm(PandasOperand):
125
125
  normalized_data = pd.Series(normalized_data[:, 0], index=data_dropna.index, name=data.name)
126
126
  normalized_data = normalized_data.reindex(data.index)
127
127
  return normalized_data
128
-
129
-
130
- class Embeddings(PandasOperand):
131
- name = "emb"
132
- is_unary = True
133
- input_type = "string"
134
- output_type = "vector"
@@ -23,7 +23,9 @@ from pandas.api.types import (
23
23
  from upgini.errors import ValidationError
24
24
  from upgini.http import ProgressStage, SearchProgress, _RestClient
25
25
  from upgini.metadata import (
26
+ ENTITY_SYSTEM_RECORD_ID,
26
27
  EVAL_SET_INDEX,
28
+ SEARCH_KEY_UNNEST,
27
29
  SYSTEM_COLUMNS,
28
30
  SYSTEM_RECORD_ID,
29
31
  TARGET,
@@ -79,6 +81,7 @@ class Dataset: # (pd.DataFrame):
79
81
  path: Optional[str] = None,
80
82
  meaning_types: Optional[Dict[str, FileColumnMeaningType]] = None,
81
83
  search_keys: Optional[List[Tuple[str, ...]]] = None,
84
+ unnest_search_keys: Optional[Dict[str, str]] = None,
82
85
  model_task_type: Optional[ModelTaskType] = None,
83
86
  random_state: Optional[int] = None,
84
87
  rest_client: Optional[_RestClient] = None,
@@ -113,6 +116,7 @@ class Dataset: # (pd.DataFrame):
113
116
  self.description = description
114
117
  self.meaning_types = meaning_types
115
118
  self.search_keys = search_keys
119
+ self.unnest_search_keys = unnest_search_keys
116
120
  self.ignore_columns = []
117
121
  self.hierarchical_group_keys = []
118
122
  self.hierarchical_subgroup_keys = []
@@ -172,7 +176,7 @@ class Dataset: # (pd.DataFrame):
172
176
  new_columns = []
173
177
  dup_counter = 0
174
178
  for column in self.data.columns:
175
- if column in [TARGET, EVAL_SET_INDEX, SYSTEM_RECORD_ID]:
179
+ if column in [TARGET, EVAL_SET_INDEX, SYSTEM_RECORD_ID, ENTITY_SYSTEM_RECORD_ID, SEARCH_KEY_UNNEST]:
176
180
  self.columns_renaming[column] = column
177
181
  new_columns.append(column)
178
182
  continue
@@ -353,7 +357,9 @@ class Dataset: # (pd.DataFrame):
353
357
 
354
358
  if is_string_dtype(self.data[postal_code]) or is_object_dtype(self.data[postal_code]):
355
359
  try:
356
- self.data[postal_code] = self.data[postal_code].astype("float64").astype("Int64").astype("string")
360
+ self.data[postal_code] = (
361
+ self.data[postal_code].astype("string").astype("Float64").astype("Int64").astype("string")
362
+ )
357
363
  except Exception:
358
364
  pass
359
365
  elif is_float_dtype(self.data[postal_code]):
@@ -803,6 +809,9 @@ class Dataset: # (pd.DataFrame):
803
809
  meaningType=meaning_type,
804
810
  minMaxValues=min_max_values,
805
811
  )
812
+ if self.unnest_search_keys and column_meta.originalName in self.unnest_search_keys:
813
+ column_meta.isUnnest = True
814
+ column_meta.unnestKeyNames = self.unnest_search_keys[column_meta.originalName]
806
815
 
807
816
  columns.append(column_meta)
808
817