upgini 1.1.294__py3-none-any.whl → 1.1.296a3511.dev1__py3-none-any.whl

This diff shows the changes between two publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Potentially problematic release.


This version of upgini might be problematic. Click here for more details.

upgini/__about__.py CHANGED
@@ -1 +1 @@
1
- __version__ = "1.1.294"
1
+ __version__ = "1.1.296a3511.dev1"
@@ -1,8 +1,22 @@
1
1
  from typing import Dict
2
2
 
3
- from upgini.autofe.binary import Add, Divide, Max, Min, Multiply, Sim, Subtract
3
+ from upgini.autofe.binary import (
4
+ Add,
5
+ Combine,
6
+ CombineThenFreq,
7
+ Distance,
8
+ Divide,
9
+ JaroWinklerSim1,
10
+ JaroWinklerSim2,
11
+ LevenshteinSim,
12
+ Max,
13
+ Min,
14
+ Multiply,
15
+ Sim,
16
+ Subtract,
17
+ )
4
18
  from upgini.autofe.date import DateDiff, DateDiffType2, DateListDiff, DateListDiffBounded, DatePercentile
5
- from upgini.autofe.groupby import GroupByThenAgg, GroupByThenRank
19
+ from upgini.autofe.groupby import GroupByThenAgg, GroupByThenFreq, GroupByThenNUnique, GroupByThenRank
6
20
  from upgini.autofe.operand import Operand
7
21
  from upgini.autofe.unary import Abs, Floor, Freq, Log, Residual, Norm, Sigmoid, Sqrt, Square
8
22
  from upgini.autofe.vector import Mean, Sum
@@ -32,10 +46,10 @@ ALL_OPERANDS: Dict[str, Operand] = {
32
46
  GroupByThenAgg(name="GroupByThenMedian", agg="median"),
33
47
  GroupByThenAgg(name="GroupByThenStd", output_type="float", agg="std"),
34
48
  GroupByThenRank(),
35
- Operand(name="Combine", has_symmetry_importance=True, output_type="object", is_categorical=True),
36
- Operand(name="CombineThenFreq", has_symmetry_importance=True, output_type="float"),
37
- Operand(name="GroupByThenNUnique", output_type="int", is_vectorizable=True, is_grouping=True),
38
- Operand(name="GroupByThenFreq", output_type="float", is_grouping=True),
49
+ Combine(),
50
+ CombineThenFreq(),
51
+ GroupByThenNUnique(),
52
+ GroupByThenFreq(),
39
53
  Sim(),
40
54
  DateDiff(),
41
55
  DateDiffType2(),
@@ -51,6 +65,10 @@ ALL_OPERANDS: Dict[str, Operand] = {
51
65
  DateListDiffBounded(diff_unit="Y", aggregation="count", lower_bound=60),
52
66
  DatePercentile(),
53
67
  Norm(),
68
+ JaroWinklerSim1(),
69
+ JaroWinklerSim2(),
70
+ LevenshteinSim(),
71
+ Distance(),
54
72
  ]
55
73
  }
56
74
 
upgini/autofe/binary.py CHANGED
@@ -1,7 +1,11 @@
1
+ import abc
2
+ from typing import Optional
3
+ import Levenshtein
1
4
  import numpy as np
2
5
  import pandas as pd
3
6
  from numpy import dot
4
7
  from numpy.linalg import norm
8
+ from jarowinkler import jarowinkler_similarity
5
9
 
6
10
  from upgini.autofe.operand import PandasOperand, VectorizableMixin
7
11
 
@@ -130,8 +134,8 @@ class CombineThenFreq(PandasOperand):
130
134
  self._loc(temp, value_counts)
131
135
 
132
136
 
133
- class Sim(PandasOperand):
134
- name = "sim"
137
+ class Distance(PandasOperand):
138
+ name = "dist"
135
139
  is_binary = True
136
140
  output_type = "float"
137
141
  is_symmetrical = True
@@ -139,3 +143,78 @@ class Sim(PandasOperand):
139
143
 
140
144
  def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
141
145
  return dot(left, right) / (norm(left) * norm(right))
146
+
147
+
148
+ class Sim(Distance):
149
+ name = "sim"
150
+ is_binary = True
151
+ output_type = "float"
152
+ is_symmetrical = True
153
+ has_symmetry_importance = True
154
+
155
+
156
+ class StringSim(PandasOperand, abc.ABC):
157
+ def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
158
+ sims = []
159
+ for i in left.index:
160
+ left_i = self._prepare_value(left.get(i))
161
+ right_i = self._prepare_value(right.get(i))
162
+ if left_i is not None and right_i is not None:
163
+ sims.append(self._similarity(left_i, right_i))
164
+ else:
165
+ sims.append(None)
166
+
167
+ return pd.Series(sims, index=left.index)
168
+
169
+ @abc.abstractmethod
170
+ def _prepare_value(self, value: Optional[str]) -> Optional[str]:
171
+ pass
172
+
173
+ @abc.abstractmethod
174
+ def _similarity(self, left: str, right: str) -> float:
175
+ pass
176
+
177
+
178
+ class JaroWinklerSim1(StringSim):
179
+ name = "sim_jw1"
180
+ is_binary = True
181
+ input_type = "string"
182
+ output_type = "float"
183
+ is_symmetrical = True
184
+ has_symmetry_importance = True
185
+
186
+ def _prepare_value(self, value: Optional[str]) -> Optional[str]:
187
+ return value
188
+
189
+ def _similarity(self, left: str, right: str) -> float:
190
+ return jarowinkler_similarity(left, right)
191
+
192
+
193
+ class JaroWinklerSim2(StringSim):
194
+ name = "sim_jw2"
195
+ is_binary = True
196
+ input_type = "string"
197
+ output_type = "float"
198
+ is_symmetrical = True
199
+ has_symmetry_importance = True
200
+
201
+ def _prepare_value(self, value: Optional[str]) -> Optional[str]:
202
+ return value[::-1] if value is not None else None
203
+
204
+ def _similarity(self, left: str, right: str) -> float:
205
+ return jarowinkler_similarity(left, right)
206
+
207
+
208
+ class LevenshteinSim(StringSim):
209
+ name = "sim_lv"
210
+ is_binary = True
211
+ input_type = "string"
212
+ output_type = "float"
213
+ is_symmetrical = True
214
+ has_symmetry_importance = True
215
+
216
+ def _prepare_value(self, value: Optional[str]) -> Optional[str]:
217
+ return value
218
+
219
+ def _similarity(self, left: str, right: str) -> float:
220
+ return 1 - Levenshtein.distance(left, right) / max(len(left), len(right))
upgini/metrics.py CHANGED
@@ -679,6 +679,11 @@ def validate_scoring_argument(scoring: Union[Callable, str, None]):
679
679
  raise ValidationError(
680
680
  f"Invalid scoring function passed {scoring}. It should accept 3 input arguments: estimator, x, y"
681
681
  )
682
+ elif scoring is not None:
683
+ raise ValidationError(
684
+ f"Invalid scoring argument passed {scoring}. It should be string with scoring name or function"
685
+ " that accepts 3 input arguments: estimator, x, y"
686
+ )
682
687
 
683
688
 
684
689
  def _get_scorer_by_name(scoring: str) -> Tuple[Callable, str, int]:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: upgini
3
- Version: 1.1.294
3
+ Version: 1.1.296a3511.dev1
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
6
6
  Project-URL: Homepage, https://upgini.com/
@@ -1,4 +1,4 @@
1
- upgini/__about__.py,sha256=2vfGkEyxotARb9WDVVoCNdQiOrrhNqGA_vyl426_y5w,24
1
+ upgini/__about__.py,sha256=cimRDJH6AT3BICa6gKJosTY9mQiinR-sub8yCsdaiuY,34
2
2
  upgini/__init__.py,sha256=ObEtjFkIssl83qeKNMLpIQygfwK8TzztwiI43YTsAP0,353
3
3
  upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
4
4
  upgini/dataset.py,sha256=7TLVVhGtjgx_9yaiaIUK3kZSe_R9wg5dY0d4F5qCGM4,45636
@@ -7,15 +7,15 @@ upgini/features_enricher.py,sha256=rLy6BwhL94VkRCp8W4RxJ0lBb7sZqTOsz-bUVI8nXZU,1
7
7
  upgini/http.py,sha256=bp6jWl422Icy3AhHMdCcJv5NjExE45gSMmzMTPJjPuk,42600
8
8
  upgini/lazy_import.py,sha256=EwoM0msNGbSmWBhGbrLDny1DSnOlvTxCjmMKPxYlDms,610
9
9
  upgini/metadata.py,sha256=qDAIO7NLSSQp_XiXCv3U4XJTLO0KH3YuQ8lvCLYPqzs,9781
10
- upgini/metrics.py,sha256=DLvA2YLV4f7lnzBCcfZ5T4NkqAv3pbstbjTepavuT7U,30688
10
+ upgini/metrics.py,sha256=DiDgdFvYu64ArlPEgjppZShK6yybWtIEbdPAhI3yO1I,30930
11
11
  upgini/search_task.py,sha256=LtRJ9bCPjMo1gJ-sUDKERhDwGcWKImrzwVFHjkMSQHQ,17071
12
12
  upgini/spinner.py,sha256=4iMd-eIe_BnkqFEMIliULTbj6rNI2HkN_VJ4qYe0cUc,1118
13
13
  upgini/version_validator.py,sha256=ddSKUK_-eGJB3NgrqOMoWJU-OxQ253WsNLp8aqJkaIM,1389
14
14
  upgini/ads_management/__init__.py,sha256=qzyisOToVRP-tquAJD1PblZhNtMrOB8FiyF9JvfkvgE,50
15
15
  upgini/ads_management/ads_manager.py,sha256=igVbN2jz80Umb2BUJixmJVj-zx8unoKpecVo-R-nGdw,2648
16
16
  upgini/autofe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
17
- upgini/autofe/all_operands.py,sha256=cpwUfhZWF9QBfrUyJ0xZ72iGYyt1eXIZQ46FB-7ZDI4,2421
18
- upgini/autofe/binary.py,sha256=8FXPJxN7fnC5wphO0Dp1tQCa0lFMSDGQGvBMkSIVAcE,4155
17
+ upgini/autofe/all_operands.py,sha256=ZS9A7u1gV7Bt8tHFfCiNx9u4q85I1ny4NIL-keyfWHY,2423
18
+ upgini/autofe/binary.py,sha256=P46AVcFRiMKYucK3_N34QDqVMbvv2OuyWAREx0bUNwg,6367
19
19
  upgini/autofe/date.py,sha256=qzk0NT332Q0vR1eRwTuNiMSrGE3ulh6Ic3QLBZqSdvw,7284
20
20
  upgini/autofe/feature.py,sha256=_V9B74B3ue7eAYXSOt9JKhVC9klkAKks22MwnBRye_w,12487
21
21
  upgini/autofe/groupby.py,sha256=4WjDzQxqpZxB79Ih4ihMMI5GDxaFqiH6ZelfV82ClT4,3091
@@ -57,7 +57,7 @@ upgini/utils/sklearn_ext.py,sha256=13jQS_k7v0aUtudXV6nGUEWjttPQzAW9AFYL5wgEz9k,4
57
57
  upgini/utils/target_utils.py,sha256=Y96_PJ5cC-WsEbeqg20v9uqywDQobLoTb-xoP7S3o4E,7807
58
58
  upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
59
59
  upgini/utils/warning_counter.py,sha256=dIWBB4dI5XRRJZudvIlqlIYKEiwLLPcXarsZuYRt338,227
60
- upgini-1.1.294.dist-info/METADATA,sha256=mOES85LZo9_PlVh9tpbFPRp2QI16iINyIk5ywPby7Dc,48117
61
- upgini-1.1.294.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
62
- upgini-1.1.294.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
63
- upgini-1.1.294.dist-info/RECORD,,
60
+ upgini-1.1.296a3511.dev1.dist-info/METADATA,sha256=_tPLQLuHI0_j5lGzkGhMSftXbfpLi8aCTRpjT4fd8QQ,48127
61
+ upgini-1.1.296a3511.dev1.dist-info/WHEEL,sha256=zEMcRr9Kr03x1ozGwg5v9NQBKn3kndp6LSoSlVg-jhU,87
62
+ upgini-1.1.296a3511.dev1.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
63
+ upgini-1.1.296a3511.dev1.dist-info/RECORD,,
@@ -1,4 +1,4 @@
1
1
  Wheel-Version: 1.0
2
- Generator: hatchling 1.25.0
2
+ Generator: hatchling 1.24.2
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any