upgini 1.1.295__tar.gz → 1.1.296a3511.dev1__tar.gz

This diff shows the changes between two publicly available package versions as they were released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Potentially problematic release.


This version of upgini might be problematic. Click here for more details.

Files changed (65)
  1. {upgini-1.1.295 → upgini-1.1.296a3511.dev1}/PKG-INFO +1 -1
  2. upgini-1.1.296a3511.dev1/src/upgini/__about__.py +1 -0
  3. {upgini-1.1.295 → upgini-1.1.296a3511.dev1}/src/upgini/autofe/all_operands.py +24 -6
  4. {upgini-1.1.295 → upgini-1.1.296a3511.dev1}/src/upgini/autofe/binary.py +81 -2
  5. upgini-1.1.295/src/upgini/__about__.py +0 -1
  6. {upgini-1.1.295 → upgini-1.1.296a3511.dev1}/.gitignore +0 -0
  7. {upgini-1.1.295 → upgini-1.1.296a3511.dev1}/LICENSE +0 -0
  8. {upgini-1.1.295 → upgini-1.1.296a3511.dev1}/README.md +0 -0
  9. {upgini-1.1.295 → upgini-1.1.296a3511.dev1}/pyproject.toml +0 -0
  10. {upgini-1.1.295 → upgini-1.1.296a3511.dev1}/src/upgini/__init__.py +0 -0
  11. {upgini-1.1.295 → upgini-1.1.296a3511.dev1}/src/upgini/ads.py +0 -0
  12. {upgini-1.1.295 → upgini-1.1.296a3511.dev1}/src/upgini/ads_management/__init__.py +0 -0
  13. {upgini-1.1.295 → upgini-1.1.296a3511.dev1}/src/upgini/ads_management/ads_manager.py +0 -0
  14. {upgini-1.1.295 → upgini-1.1.296a3511.dev1}/src/upgini/autofe/__init__.py +0 -0
  15. {upgini-1.1.295 → upgini-1.1.296a3511.dev1}/src/upgini/autofe/date.py +0 -0
  16. {upgini-1.1.295 → upgini-1.1.296a3511.dev1}/src/upgini/autofe/feature.py +0 -0
  17. {upgini-1.1.295 → upgini-1.1.296a3511.dev1}/src/upgini/autofe/groupby.py +0 -0
  18. {upgini-1.1.295 → upgini-1.1.296a3511.dev1}/src/upgini/autofe/operand.py +0 -0
  19. {upgini-1.1.295 → upgini-1.1.296a3511.dev1}/src/upgini/autofe/unary.py +0 -0
  20. {upgini-1.1.295 → upgini-1.1.296a3511.dev1}/src/upgini/autofe/vector.py +0 -0
  21. {upgini-1.1.295 → upgini-1.1.296a3511.dev1}/src/upgini/data_source/__init__.py +0 -0
  22. {upgini-1.1.295 → upgini-1.1.296a3511.dev1}/src/upgini/data_source/data_source_publisher.py +0 -0
  23. {upgini-1.1.295 → upgini-1.1.296a3511.dev1}/src/upgini/dataset.py +0 -0
  24. {upgini-1.1.295 → upgini-1.1.296a3511.dev1}/src/upgini/errors.py +0 -0
  25. {upgini-1.1.295 → upgini-1.1.296a3511.dev1}/src/upgini/features_enricher.py +0 -0
  26. {upgini-1.1.295 → upgini-1.1.296a3511.dev1}/src/upgini/http.py +0 -0
  27. {upgini-1.1.295 → upgini-1.1.296a3511.dev1}/src/upgini/lazy_import.py +0 -0
  28. {upgini-1.1.295 → upgini-1.1.296a3511.dev1}/src/upgini/mdc/__init__.py +0 -0
  29. {upgini-1.1.295 → upgini-1.1.296a3511.dev1}/src/upgini/mdc/context.py +0 -0
  30. {upgini-1.1.295 → upgini-1.1.296a3511.dev1}/src/upgini/metadata.py +0 -0
  31. {upgini-1.1.295 → upgini-1.1.296a3511.dev1}/src/upgini/metrics.py +0 -0
  32. {upgini-1.1.295 → upgini-1.1.296a3511.dev1}/src/upgini/normalizer/__init__.py +0 -0
  33. {upgini-1.1.295 → upgini-1.1.296a3511.dev1}/src/upgini/normalizer/phone_normalizer.py +0 -0
  34. {upgini-1.1.295 → upgini-1.1.296a3511.dev1}/src/upgini/resource_bundle/__init__.py +0 -0
  35. {upgini-1.1.295 → upgini-1.1.296a3511.dev1}/src/upgini/resource_bundle/exceptions.py +0 -0
  36. {upgini-1.1.295 → upgini-1.1.296a3511.dev1}/src/upgini/resource_bundle/strings.properties +0 -0
  37. {upgini-1.1.295 → upgini-1.1.296a3511.dev1}/src/upgini/resource_bundle/strings_widget.properties +0 -0
  38. {upgini-1.1.295 → upgini-1.1.296a3511.dev1}/src/upgini/sampler/__init__.py +0 -0
  39. {upgini-1.1.295 → upgini-1.1.296a3511.dev1}/src/upgini/sampler/base.py +0 -0
  40. {upgini-1.1.295 → upgini-1.1.296a3511.dev1}/src/upgini/sampler/random_under_sampler.py +0 -0
  41. {upgini-1.1.295 → upgini-1.1.296a3511.dev1}/src/upgini/sampler/utils.py +0 -0
  42. {upgini-1.1.295 → upgini-1.1.296a3511.dev1}/src/upgini/search_task.py +0 -0
  43. {upgini-1.1.295 → upgini-1.1.296a3511.dev1}/src/upgini/spinner.py +0 -0
  44. {upgini-1.1.295 → upgini-1.1.296a3511.dev1}/src/upgini/utils/__init__.py +0 -0
  45. {upgini-1.1.295 → upgini-1.1.296a3511.dev1}/src/upgini/utils/base_search_key_detector.py +0 -0
  46. {upgini-1.1.295 → upgini-1.1.296a3511.dev1}/src/upgini/utils/blocked_time_series.py +0 -0
  47. {upgini-1.1.295 → upgini-1.1.296a3511.dev1}/src/upgini/utils/country_utils.py +0 -0
  48. {upgini-1.1.295 → upgini-1.1.296a3511.dev1}/src/upgini/utils/custom_loss_utils.py +0 -0
  49. {upgini-1.1.295 → upgini-1.1.296a3511.dev1}/src/upgini/utils/cv_utils.py +0 -0
  50. {upgini-1.1.295 → upgini-1.1.296a3511.dev1}/src/upgini/utils/datetime_utils.py +0 -0
  51. {upgini-1.1.295 → upgini-1.1.296a3511.dev1}/src/upgini/utils/deduplicate_utils.py +0 -0
  52. {upgini-1.1.295 → upgini-1.1.296a3511.dev1}/src/upgini/utils/display_utils.py +0 -0
  53. {upgini-1.1.295 → upgini-1.1.296a3511.dev1}/src/upgini/utils/email_utils.py +0 -0
  54. {upgini-1.1.295 → upgini-1.1.296a3511.dev1}/src/upgini/utils/fallback_progress_bar.py +0 -0
  55. {upgini-1.1.295 → upgini-1.1.296a3511.dev1}/src/upgini/utils/features_validator.py +0 -0
  56. {upgini-1.1.295 → upgini-1.1.296a3511.dev1}/src/upgini/utils/format.py +0 -0
  57. {upgini-1.1.295 → upgini-1.1.296a3511.dev1}/src/upgini/utils/ip_utils.py +0 -0
  58. {upgini-1.1.295 → upgini-1.1.296a3511.dev1}/src/upgini/utils/phone_utils.py +0 -0
  59. {upgini-1.1.295 → upgini-1.1.296a3511.dev1}/src/upgini/utils/postal_code_utils.py +0 -0
  60. {upgini-1.1.295 → upgini-1.1.296a3511.dev1}/src/upgini/utils/progress_bar.py +0 -0
  61. {upgini-1.1.295 → upgini-1.1.296a3511.dev1}/src/upgini/utils/sklearn_ext.py +0 -0
  62. {upgini-1.1.295 → upgini-1.1.296a3511.dev1}/src/upgini/utils/target_utils.py +0 -0
  63. {upgini-1.1.295 → upgini-1.1.296a3511.dev1}/src/upgini/utils/track_info.py +0 -0
  64. {upgini-1.1.295 → upgini-1.1.296a3511.dev1}/src/upgini/utils/warning_counter.py +0 -0
  65. {upgini-1.1.295 → upgini-1.1.296a3511.dev1}/src/upgini/version_validator.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: upgini
3
- Version: 1.1.295
3
+ Version: 1.1.296a3511.dev1
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
6
6
  Project-URL: Homepage, https://upgini.com/
@@ -0,0 +1 @@
1
+ __version__ = "1.1.296a3511.dev1"
@@ -1,8 +1,22 @@
1
1
  from typing import Dict
2
2
 
3
- from upgini.autofe.binary import Add, Divide, Max, Min, Multiply, Sim, Subtract
3
+ from upgini.autofe.binary import (
4
+ Add,
5
+ Combine,
6
+ CombineThenFreq,
7
+ Distance,
8
+ Divide,
9
+ JaroWinklerSim1,
10
+ JaroWinklerSim2,
11
+ LevenshteinSim,
12
+ Max,
13
+ Min,
14
+ Multiply,
15
+ Sim,
16
+ Subtract,
17
+ )
4
18
  from upgini.autofe.date import DateDiff, DateDiffType2, DateListDiff, DateListDiffBounded, DatePercentile
5
- from upgini.autofe.groupby import GroupByThenAgg, GroupByThenRank
19
+ from upgini.autofe.groupby import GroupByThenAgg, GroupByThenFreq, GroupByThenNUnique, GroupByThenRank
6
20
  from upgini.autofe.operand import Operand
7
21
  from upgini.autofe.unary import Abs, Floor, Freq, Log, Residual, Norm, Sigmoid, Sqrt, Square
8
22
  from upgini.autofe.vector import Mean, Sum
@@ -32,10 +46,10 @@ ALL_OPERANDS: Dict[str, Operand] = {
32
46
  GroupByThenAgg(name="GroupByThenMedian", agg="median"),
33
47
  GroupByThenAgg(name="GroupByThenStd", output_type="float", agg="std"),
34
48
  GroupByThenRank(),
35
- Operand(name="Combine", has_symmetry_importance=True, output_type="object", is_categorical=True),
36
- Operand(name="CombineThenFreq", has_symmetry_importance=True, output_type="float"),
37
- Operand(name="GroupByThenNUnique", output_type="int", is_vectorizable=True, is_grouping=True),
38
- Operand(name="GroupByThenFreq", output_type="float", is_grouping=True),
49
+ Combine(),
50
+ CombineThenFreq(),
51
+ GroupByThenNUnique(),
52
+ GroupByThenFreq(),
39
53
  Sim(),
40
54
  DateDiff(),
41
55
  DateDiffType2(),
@@ -51,6 +65,10 @@ ALL_OPERANDS: Dict[str, Operand] = {
51
65
  DateListDiffBounded(diff_unit="Y", aggregation="count", lower_bound=60),
52
66
  DatePercentile(),
53
67
  Norm(),
68
+ JaroWinklerSim1(),
69
+ JaroWinklerSim2(),
70
+ LevenshteinSim(),
71
+ Distance(),
54
72
  ]
55
73
  }
56
74
 
@@ -1,7 +1,11 @@
1
+ import abc
2
+ from typing import Optional
3
+ import Levenshtein
1
4
  import numpy as np
2
5
  import pandas as pd
3
6
  from numpy import dot
4
7
  from numpy.linalg import norm
8
+ from jarowinkler import jarowinkler_similarity
5
9
 
6
10
  from upgini.autofe.operand import PandasOperand, VectorizableMixin
7
11
 
@@ -130,8 +134,8 @@ class CombineThenFreq(PandasOperand):
130
134
  self._loc(temp, value_counts)
131
135
 
132
136
 
133
- class Sim(PandasOperand):
134
- name = "sim"
137
+ class Distance(PandasOperand):
138
+ name = "dist"
135
139
  is_binary = True
136
140
  output_type = "float"
137
141
  is_symmetrical = True
@@ -139,3 +143,78 @@ class Sim(PandasOperand):
139
143
 
140
144
  def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
141
145
  return dot(left, right) / (norm(left) * norm(right))
146
+
147
+
148
+ class Sim(Distance):
149
+ name = "sim"
150
+ is_binary = True
151
+ output_type = "float"
152
+ is_symmetrical = True
153
+ has_symmetry_importance = True
154
+
155
+
156
+ class StringSim(PandasOperand, abc.ABC):
157
+ def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
158
+ sims = []
159
+ for i in left.index:
160
+ left_i = self._prepare_value(left.get(i))
161
+ right_i = self._prepare_value(right.get(i))
162
+ if left_i is not None and right_i is not None:
163
+ sims.append(self._similarity(left_i, right_i))
164
+ else:
165
+ sims.append(None)
166
+
167
+ return pd.Series(sims, index=left.index)
168
+
169
+ @abc.abstractmethod
170
+ def _prepare_value(self, value: Optional[str]) -> Optional[str]:
171
+ pass
172
+
173
+ @abc.abstractmethod
174
+ def _similarity(self, left: str, right: str) -> float:
175
+ pass
176
+
177
+
178
+ class JaroWinklerSim1(StringSim):
179
+ name = "sim_jw1"
180
+ is_binary = True
181
+ input_type = "string"
182
+ output_type = "float"
183
+ is_symmetrical = True
184
+ has_symmetry_importance = True
185
+
186
+ def _prepare_value(self, value: Optional[str]) -> Optional[str]:
187
+ return value
188
+
189
+ def _similarity(self, left: str, right: str) -> float:
190
+ return jarowinkler_similarity(left, right)
191
+
192
+
193
+ class JaroWinklerSim2(StringSim):
194
+ name = "sim_jw2"
195
+ is_binary = True
196
+ input_type = "string"
197
+ output_type = "float"
198
+ is_symmetrical = True
199
+ has_symmetry_importance = True
200
+
201
+ def _prepare_value(self, value: Optional[str]) -> Optional[str]:
202
+ return value[::-1] if value is not None else None
203
+
204
+ def _similarity(self, left: str, right: str) -> float:
205
+ return jarowinkler_similarity(left, right)
206
+
207
+
208
+ class LevenshteinSim(StringSim):
209
+ name = "sim_lv"
210
+ is_binary = True
211
+ input_type = "string"
212
+ output_type = "float"
213
+ is_symmetrical = True
214
+ has_symmetry_importance = True
215
+
216
+ def _prepare_value(self, value: Optional[str]) -> Optional[str]:
217
+ return value
218
+
219
+ def _similarity(self, left: str, right: str) -> float:
220
+ return 1 - Levenshtein.distance(left, right) / max(len(left), len(right))
@@ -1 +0,0 @@
1
- __version__ = "1.1.295"
File without changes
File without changes
File without changes