upgini 1.1.296a3511.dev4__tar.gz → 1.1.296a3521.dev1__tar.gz

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of upgini might be problematic. Click here for more details.

Files changed (65)
  1. {upgini-1.1.296a3511.dev4 → upgini-1.1.296a3521.dev1}/PKG-INFO +1 -3
  2. {upgini-1.1.296a3511.dev4 → upgini-1.1.296a3521.dev1}/pyproject.toml +0 -2
  3. upgini-1.1.296a3521.dev1/src/upgini/__about__.py +1 -0
  4. {upgini-1.1.296a3511.dev4 → upgini-1.1.296a3521.dev1}/src/upgini/autofe/all_operands.py +15 -26
  5. {upgini-1.1.296a3511.dev4 → upgini-1.1.296a3521.dev1}/src/upgini/autofe/binary.py +2 -91
  6. {upgini-1.1.296a3511.dev4 → upgini-1.1.296a3521.dev1}/src/upgini/autofe/date.py +31 -15
  7. {upgini-1.1.296a3511.dev4 → upgini-1.1.296a3521.dev1}/src/upgini/autofe/unary.py +0 -7
  8. upgini-1.1.296a3511.dev4/src/upgini/__about__.py +0 -1
  9. {upgini-1.1.296a3511.dev4 → upgini-1.1.296a3521.dev1}/.gitignore +0 -0
  10. {upgini-1.1.296a3511.dev4 → upgini-1.1.296a3521.dev1}/LICENSE +0 -0
  11. {upgini-1.1.296a3511.dev4 → upgini-1.1.296a3521.dev1}/README.md +0 -0
  12. {upgini-1.1.296a3511.dev4 → upgini-1.1.296a3521.dev1}/src/upgini/__init__.py +0 -0
  13. {upgini-1.1.296a3511.dev4 → upgini-1.1.296a3521.dev1}/src/upgini/ads.py +0 -0
  14. {upgini-1.1.296a3511.dev4 → upgini-1.1.296a3521.dev1}/src/upgini/ads_management/__init__.py +0 -0
  15. {upgini-1.1.296a3511.dev4 → upgini-1.1.296a3521.dev1}/src/upgini/ads_management/ads_manager.py +0 -0
  16. {upgini-1.1.296a3511.dev4 → upgini-1.1.296a3521.dev1}/src/upgini/autofe/__init__.py +0 -0
  17. {upgini-1.1.296a3511.dev4 → upgini-1.1.296a3521.dev1}/src/upgini/autofe/feature.py +0 -0
  18. {upgini-1.1.296a3511.dev4 → upgini-1.1.296a3521.dev1}/src/upgini/autofe/groupby.py +0 -0
  19. {upgini-1.1.296a3511.dev4 → upgini-1.1.296a3521.dev1}/src/upgini/autofe/operand.py +0 -0
  20. {upgini-1.1.296a3511.dev4 → upgini-1.1.296a3521.dev1}/src/upgini/autofe/vector.py +0 -0
  21. {upgini-1.1.296a3511.dev4 → upgini-1.1.296a3521.dev1}/src/upgini/data_source/__init__.py +0 -0
  22. {upgini-1.1.296a3511.dev4 → upgini-1.1.296a3521.dev1}/src/upgini/data_source/data_source_publisher.py +0 -0
  23. {upgini-1.1.296a3511.dev4 → upgini-1.1.296a3521.dev1}/src/upgini/dataset.py +0 -0
  24. {upgini-1.1.296a3511.dev4 → upgini-1.1.296a3521.dev1}/src/upgini/errors.py +0 -0
  25. {upgini-1.1.296a3511.dev4 → upgini-1.1.296a3521.dev1}/src/upgini/features_enricher.py +0 -0
  26. {upgini-1.1.296a3511.dev4 → upgini-1.1.296a3521.dev1}/src/upgini/http.py +0 -0
  27. {upgini-1.1.296a3511.dev4 → upgini-1.1.296a3521.dev1}/src/upgini/lazy_import.py +0 -0
  28. {upgini-1.1.296a3511.dev4 → upgini-1.1.296a3521.dev1}/src/upgini/mdc/__init__.py +0 -0
  29. {upgini-1.1.296a3511.dev4 → upgini-1.1.296a3521.dev1}/src/upgini/mdc/context.py +0 -0
  30. {upgini-1.1.296a3511.dev4 → upgini-1.1.296a3521.dev1}/src/upgini/metadata.py +0 -0
  31. {upgini-1.1.296a3511.dev4 → upgini-1.1.296a3521.dev1}/src/upgini/metrics.py +0 -0
  32. {upgini-1.1.296a3511.dev4 → upgini-1.1.296a3521.dev1}/src/upgini/normalizer/__init__.py +0 -0
  33. {upgini-1.1.296a3511.dev4 → upgini-1.1.296a3521.dev1}/src/upgini/normalizer/phone_normalizer.py +0 -0
  34. {upgini-1.1.296a3511.dev4 → upgini-1.1.296a3521.dev1}/src/upgini/resource_bundle/__init__.py +0 -0
  35. {upgini-1.1.296a3511.dev4 → upgini-1.1.296a3521.dev1}/src/upgini/resource_bundle/exceptions.py +0 -0
  36. {upgini-1.1.296a3511.dev4 → upgini-1.1.296a3521.dev1}/src/upgini/resource_bundle/strings.properties +0 -0
  37. {upgini-1.1.296a3511.dev4 → upgini-1.1.296a3521.dev1}/src/upgini/resource_bundle/strings_widget.properties +0 -0
  38. {upgini-1.1.296a3511.dev4 → upgini-1.1.296a3521.dev1}/src/upgini/sampler/__init__.py +0 -0
  39. {upgini-1.1.296a3511.dev4 → upgini-1.1.296a3521.dev1}/src/upgini/sampler/base.py +0 -0
  40. {upgini-1.1.296a3511.dev4 → upgini-1.1.296a3521.dev1}/src/upgini/sampler/random_under_sampler.py +0 -0
  41. {upgini-1.1.296a3511.dev4 → upgini-1.1.296a3521.dev1}/src/upgini/sampler/utils.py +0 -0
  42. {upgini-1.1.296a3511.dev4 → upgini-1.1.296a3521.dev1}/src/upgini/search_task.py +0 -0
  43. {upgini-1.1.296a3511.dev4 → upgini-1.1.296a3521.dev1}/src/upgini/spinner.py +0 -0
  44. {upgini-1.1.296a3511.dev4 → upgini-1.1.296a3521.dev1}/src/upgini/utils/__init__.py +0 -0
  45. {upgini-1.1.296a3511.dev4 → upgini-1.1.296a3521.dev1}/src/upgini/utils/base_search_key_detector.py +0 -0
  46. {upgini-1.1.296a3511.dev4 → upgini-1.1.296a3521.dev1}/src/upgini/utils/blocked_time_series.py +0 -0
  47. {upgini-1.1.296a3511.dev4 → upgini-1.1.296a3521.dev1}/src/upgini/utils/country_utils.py +0 -0
  48. {upgini-1.1.296a3511.dev4 → upgini-1.1.296a3521.dev1}/src/upgini/utils/custom_loss_utils.py +0 -0
  49. {upgini-1.1.296a3511.dev4 → upgini-1.1.296a3521.dev1}/src/upgini/utils/cv_utils.py +0 -0
  50. {upgini-1.1.296a3511.dev4 → upgini-1.1.296a3521.dev1}/src/upgini/utils/datetime_utils.py +0 -0
  51. {upgini-1.1.296a3511.dev4 → upgini-1.1.296a3521.dev1}/src/upgini/utils/deduplicate_utils.py +0 -0
  52. {upgini-1.1.296a3511.dev4 → upgini-1.1.296a3521.dev1}/src/upgini/utils/display_utils.py +0 -0
  53. {upgini-1.1.296a3511.dev4 → upgini-1.1.296a3521.dev1}/src/upgini/utils/email_utils.py +0 -0
  54. {upgini-1.1.296a3511.dev4 → upgini-1.1.296a3521.dev1}/src/upgini/utils/fallback_progress_bar.py +0 -0
  55. {upgini-1.1.296a3511.dev4 → upgini-1.1.296a3521.dev1}/src/upgini/utils/features_validator.py +0 -0
  56. {upgini-1.1.296a3511.dev4 → upgini-1.1.296a3521.dev1}/src/upgini/utils/format.py +0 -0
  57. {upgini-1.1.296a3511.dev4 → upgini-1.1.296a3521.dev1}/src/upgini/utils/ip_utils.py +0 -0
  58. {upgini-1.1.296a3511.dev4 → upgini-1.1.296a3521.dev1}/src/upgini/utils/phone_utils.py +0 -0
  59. {upgini-1.1.296a3511.dev4 → upgini-1.1.296a3521.dev1}/src/upgini/utils/postal_code_utils.py +0 -0
  60. {upgini-1.1.296a3511.dev4 → upgini-1.1.296a3521.dev1}/src/upgini/utils/progress_bar.py +0 -0
  61. {upgini-1.1.296a3511.dev4 → upgini-1.1.296a3521.dev1}/src/upgini/utils/sklearn_ext.py +0 -0
  62. {upgini-1.1.296a3511.dev4 → upgini-1.1.296a3521.dev1}/src/upgini/utils/target_utils.py +0 -0
  63. {upgini-1.1.296a3511.dev4 → upgini-1.1.296a3521.dev1}/src/upgini/utils/track_info.py +0 -0
  64. {upgini-1.1.296a3511.dev4 → upgini-1.1.296a3521.dev1}/src/upgini/utils/warning_counter.py +0 -0
  65. {upgini-1.1.296a3511.dev4 → upgini-1.1.296a3521.dev1}/src/upgini/version_validator.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: upgini
3
- Version: 1.1.296a3511.dev4
3
+ Version: 1.1.296a3521.dev1
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
6
6
  Project-URL: Homepage, https://upgini.com/
@@ -26,8 +26,6 @@ Requires-Python: <3.11,>=3.8
26
26
  Requires-Dist: catboost>=1.0.3
27
27
  Requires-Dist: fastparquet>=0.8.1
28
28
  Requires-Dist: ipywidgets>=8.1.0
29
- Requires-Dist: jarowinkler>=2.0.0
30
- Requires-Dist: levenshtein>=0.25.1
31
29
  Requires-Dist: lightgbm>=3.3.2
32
30
  Requires-Dist: numpy>=1.19.0
33
31
  Requires-Dist: pandas<3.0.0,>=1.1.0
@@ -48,8 +48,6 @@ dependencies = [
48
48
  "requests>=2.8.0",
49
49
  "scikit-learn>=1.3.0",
50
50
  "xhtml2pdf==0.2.11",
51
- "jarowinkler>=2.0.0",
52
- "levenshtein>=0.25.1",
53
51
  ]
54
52
 
55
53
  [project.urls]
@@ -0,0 +1 @@
1
+ __version__ = "1.1.296a3521.dev1"
@@ -1,24 +1,17 @@
1
1
  from typing import Dict
2
2
 
3
- from upgini.autofe.binary import (
4
- Add,
5
- Combine,
6
- CombineThenFreq,
7
- Distance,
8
- Divide,
9
- JaroWinklerSim1,
10
- JaroWinklerSim2,
11
- LevenshteinSim,
12
- Max,
13
- Min,
14
- Multiply,
15
- Sim,
16
- Subtract,
3
+ from upgini.autofe.binary import Add, Divide, Max, Min, Multiply, Sim, Subtract
4
+ from upgini.autofe.date import (
5
+ DateDiff,
6
+ DateDiffType2,
7
+ DateListDiff,
8
+ DateListDiffBounded,
9
+ DatePercentile,
10
+ DatePercentileType2,
17
11
  )
18
- from upgini.autofe.date import DateDiff, DateDiffType2, DateListDiff, DateListDiffBounded, DatePercentile
19
- from upgini.autofe.groupby import GroupByThenAgg, GroupByThenFreq, GroupByThenNUnique, GroupByThenRank
12
+ from upgini.autofe.groupby import GroupByThenAgg, GroupByThenRank
20
13
  from upgini.autofe.operand import Operand
21
- from upgini.autofe.unary import Abs, Embeddings, Floor, Freq, Log, Residual, Norm, Sigmoid, Sqrt, Square
14
+ from upgini.autofe.unary import Abs, Floor, Freq, Log, Residual, Norm, Sigmoid, Sqrt, Square
22
15
  from upgini.autofe.vector import Mean, Sum
23
16
 
24
17
  ALL_OPERANDS: Dict[str, Operand] = {
@@ -46,10 +39,10 @@ ALL_OPERANDS: Dict[str, Operand] = {
46
39
  GroupByThenAgg(name="GroupByThenMedian", agg="median"),
47
40
  GroupByThenAgg(name="GroupByThenStd", output_type="float", agg="std"),
48
41
  GroupByThenRank(),
49
- Combine(),
50
- CombineThenFreq(),
51
- GroupByThenNUnique(),
52
- GroupByThenFreq(),
42
+ Operand(name="Combine", has_symmetry_importance=True, output_type="object", is_categorical=True),
43
+ Operand(name="CombineThenFreq", has_symmetry_importance=True, output_type="float"),
44
+ Operand(name="GroupByThenNUnique", output_type="int", is_vectorizable=True, is_grouping=True),
45
+ Operand(name="GroupByThenFreq", output_type="float", is_grouping=True),
53
46
  Sim(),
54
47
  DateDiff(),
55
48
  DateDiffType2(),
@@ -64,12 +57,8 @@ ALL_OPERANDS: Dict[str, Operand] = {
64
57
  DateListDiffBounded(diff_unit="Y", aggregation="count", lower_bound=45, upper_bound=60),
65
58
  DateListDiffBounded(diff_unit="Y", aggregation="count", lower_bound=60),
66
59
  DatePercentile(),
60
+ DatePercentileType2(),
67
61
  Norm(),
68
- JaroWinklerSim1(),
69
- JaroWinklerSim2(),
70
- LevenshteinSim(),
71
- Distance(),
72
- Embeddings(),
73
62
  ]
74
63
  }
75
64
 
@@ -1,11 +1,7 @@
1
- import abc
2
- from typing import Optional
3
- import Levenshtein
4
1
  import numpy as np
5
2
  import pandas as pd
6
3
  from numpy import dot
7
4
  from numpy.linalg import norm
8
- from jarowinkler import jarowinkler_similarity
9
5
 
10
6
  from upgini.autofe.operand import PandasOperand, VectorizableMixin
11
7
 
@@ -134,25 +130,7 @@ class CombineThenFreq(PandasOperand):
134
130
  self._loc(temp, value_counts)
135
131
 
136
132
 
137
- class Distance(PandasOperand):
138
- name = "dist"
139
- is_binary = True
140
- output_type = "float"
141
- is_symmetrical = True
142
- has_symmetry_importance = True
143
-
144
- def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
145
- return pd.Series(
146
- 1 - self.__dot(left, right) / (self.__dot(left, left) * self.__dot(right, right)), index=left.index
147
- )
148
-
149
- # row-wise dot product
150
- def __dot(self, left: pd.Series, right: pd.Series) -> pd.Series:
151
- return (left * right).apply(np.sum)
152
-
153
-
154
- # Left for backward compatibility
155
- class Sim(Distance):
133
+ class Sim(PandasOperand):
156
134
  name = "sim"
157
135
  is_binary = True
158
136
  output_type = "float"
@@ -160,71 +138,4 @@ class Sim(Distance):
160
138
  has_symmetry_importance = True
161
139
 
162
140
  def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
163
- return 1 - super().calculate_binary(left, right)
164
-
165
-
166
- class StringSim(PandasOperand, abc.ABC):
167
- def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
168
- sims = []
169
- for i in left.index:
170
- left_i = self._prepare_value(left.get(i))
171
- right_i = self._prepare_value(right.get(i))
172
- if left_i is not None and right_i is not None:
173
- sims.append(self._similarity(left_i, right_i))
174
- else:
175
- sims.append(None)
176
-
177
- return pd.Series(sims, index=left.index)
178
-
179
- @abc.abstractmethod
180
- def _prepare_value(self, value: Optional[str]) -> Optional[str]:
181
- pass
182
-
183
- @abc.abstractmethod
184
- def _similarity(self, left: str, right: str) -> float:
185
- pass
186
-
187
-
188
- class JaroWinklerSim1(StringSim):
189
- name = "sim_jw1"
190
- is_binary = True
191
- input_type = "string"
192
- output_type = "float"
193
- is_symmetrical = True
194
- has_symmetry_importance = True
195
-
196
- def _prepare_value(self, value: Optional[str]) -> Optional[str]:
197
- return value
198
-
199
- def _similarity(self, left: str, right: str) -> float:
200
- return jarowinkler_similarity(left, right)
201
-
202
-
203
- class JaroWinklerSim2(StringSim):
204
- name = "sim_jw2"
205
- is_binary = True
206
- input_type = "string"
207
- output_type = "float"
208
- is_symmetrical = True
209
- has_symmetry_importance = True
210
-
211
- def _prepare_value(self, value: Optional[str]) -> Optional[str]:
212
- return value[::-1] if value is not None else None
213
-
214
- def _similarity(self, left: str, right: str) -> float:
215
- return jarowinkler_similarity(left, right)
216
-
217
-
218
- class LevenshteinSim(StringSim):
219
- name = "sim_lv"
220
- is_binary = True
221
- input_type = "string"
222
- output_type = "float"
223
- is_symmetrical = True
224
- has_symmetry_importance = True
225
-
226
- def _prepare_value(self, value: Optional[str]) -> Optional[str]:
227
- return value
228
-
229
- def _similarity(self, left: str, right: str) -> float:
230
- return 1 - Levenshtein.distance(left, right) / max(len(left), len(right))
141
+ return dot(left, right) / (norm(left) * norm(right))
@@ -1,3 +1,4 @@
1
+ import abc
1
2
  from typing import Any, Dict, List, Optional, Union
2
3
 
3
4
  import numpy as np
@@ -159,11 +160,33 @@ class DateListDiffBounded(DateListDiff):
159
160
  return super()._agg(x)
160
161
 
161
162
 
162
- class DatePercentile(PandasOperand):
163
- name = "date_per"
163
+ class DatePercentileBase(PandasOperand, abc.ABC):
164
164
  is_binary = True
165
165
  output_type = "float"
166
166
 
167
+ def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
168
+ # Assuming that left is a date column, right is a feature column
169
+ left = pd.to_datetime(left, unit=self.date_unit)
170
+
171
+ bounds = self._get_bounds(left)
172
+
173
+ return right.index.to_series().apply(lambda i: self.__perc(right[i], bounds[i]))
174
+
175
+ @abc.abstractmethod
176
+ def _get_bounds(self, date_col: pd.Series) -> pd.Series:
177
+ pass
178
+
179
+ def __perc(self, f, bounds):
180
+ hit = np.where(f >= bounds)[0]
181
+ if hit.size > 0:
182
+ return np.max(hit) + 1
183
+ else:
184
+ return np.nan
185
+
186
+
187
+ class DatePercentile(DatePercentileBase):
188
+ name = "date_per"
189
+
167
190
  date_unit: Optional[str] = None
168
191
  zero_month: Optional[int]
169
192
  zero_year: Optional[int]
@@ -190,22 +213,15 @@ class DatePercentile(PandasOperand):
190
213
  elif isinstance(value, str):
191
214
  return value[1:-1].split(", ")
192
215
 
193
- def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
194
- # Assuming that left is a date column, right is a feature column
195
- left = pd.to_datetime(left, unit=self.date_unit)
196
- months = left.dt.month
197
- years = left.dt.year
216
+ def _get_bounds(self, date_col: pd.Series) -> pd.Series:
217
+ months = date_col.dt.month
218
+ years = date_col.dt.year
198
219
 
199
220
  month_diffs = 12 * (years - (self.zero_year or 0)) + (months - (self.zero_month or 0))
200
- bounds = month_diffs.apply(
221
+ return month_diffs.apply(
201
222
  lambda d: np.array(self.zero_bounds if self.zero_bounds is not None else []) + d * self.step
202
223
  )
203
224
 
204
- return right.index.to_series().apply(lambda i: self.__perc(right[i], bounds[i]))
205
225
 
206
- def __perc(self, f, bounds):
207
- hit = np.where(f >= bounds)[0]
208
- if hit.size > 0:
209
- return np.max(hit) + 1
210
- else:
211
- return np.nan
226
+ class DatePercentileType2(DatePercentileBase):
227
+ name = "date_per_type2"
@@ -125,10 +125,3 @@ class Norm(PandasOperand):
125
125
  normalized_data = pd.Series(normalized_data[:, 0], index=data_dropna.index, name=data.name)
126
126
  normalized_data = normalized_data.reindex(data.index)
127
127
  return normalized_data
128
-
129
-
130
- class Embeddings(PandasOperand):
131
- name = "emb"
132
- is_unary = True
133
- input_type = "string"
134
- output_type = "vector"
@@ -1 +0,0 @@
1
- __version__ = "1.1.296a3511.dev4"