upgini 1.1.296__tar.gz → 1.1.296a3511.dev2__tar.gz

This diff compares the contents of two publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions exactly as they appear in their respective public registries.
Files changed (65)
  1. {upgini-1.1.296 → upgini-1.1.296a3511.dev2}/PKG-INFO +3 -1
  2. {upgini-1.1.296 → upgini-1.1.296a3511.dev2}/pyproject.toml +2 -0
  3. upgini-1.1.296a3511.dev2/src/upgini/__about__.py +1 -0
  4. {upgini-1.1.296 → upgini-1.1.296a3511.dev2}/src/upgini/autofe/all_operands.py +24 -6
  5. {upgini-1.1.296 → upgini-1.1.296a3511.dev2}/src/upgini/autofe/binary.py +81 -2
  6. {upgini-1.1.296 → upgini-1.1.296a3511.dev2}/src/upgini/data_source/data_source_publisher.py +0 -37
  7. {upgini-1.1.296 → upgini-1.1.296a3511.dev2}/src/upgini/features_enricher.py +1 -1
  8. upgini-1.1.296/src/upgini/__about__.py +0 -1
  9. {upgini-1.1.296 → upgini-1.1.296a3511.dev2}/.gitignore +0 -0
  10. {upgini-1.1.296 → upgini-1.1.296a3511.dev2}/LICENSE +0 -0
  11. {upgini-1.1.296 → upgini-1.1.296a3511.dev2}/README.md +0 -0
  12. {upgini-1.1.296 → upgini-1.1.296a3511.dev2}/src/upgini/__init__.py +0 -0
  13. {upgini-1.1.296 → upgini-1.1.296a3511.dev2}/src/upgini/ads.py +0 -0
  14. {upgini-1.1.296 → upgini-1.1.296a3511.dev2}/src/upgini/ads_management/__init__.py +0 -0
  15. {upgini-1.1.296 → upgini-1.1.296a3511.dev2}/src/upgini/ads_management/ads_manager.py +0 -0
  16. {upgini-1.1.296 → upgini-1.1.296a3511.dev2}/src/upgini/autofe/__init__.py +0 -0
  17. {upgini-1.1.296 → upgini-1.1.296a3511.dev2}/src/upgini/autofe/date.py +0 -0
  18. {upgini-1.1.296 → upgini-1.1.296a3511.dev2}/src/upgini/autofe/feature.py +0 -0
  19. {upgini-1.1.296 → upgini-1.1.296a3511.dev2}/src/upgini/autofe/groupby.py +0 -0
  20. {upgini-1.1.296 → upgini-1.1.296a3511.dev2}/src/upgini/autofe/operand.py +0 -0
  21. {upgini-1.1.296 → upgini-1.1.296a3511.dev2}/src/upgini/autofe/unary.py +0 -0
  22. {upgini-1.1.296 → upgini-1.1.296a3511.dev2}/src/upgini/autofe/vector.py +0 -0
  23. {upgini-1.1.296 → upgini-1.1.296a3511.dev2}/src/upgini/data_source/__init__.py +0 -0
  24. {upgini-1.1.296 → upgini-1.1.296a3511.dev2}/src/upgini/dataset.py +0 -0
  25. {upgini-1.1.296 → upgini-1.1.296a3511.dev2}/src/upgini/errors.py +0 -0
  26. {upgini-1.1.296 → upgini-1.1.296a3511.dev2}/src/upgini/http.py +0 -0
  27. {upgini-1.1.296 → upgini-1.1.296a3511.dev2}/src/upgini/lazy_import.py +0 -0
  28. {upgini-1.1.296 → upgini-1.1.296a3511.dev2}/src/upgini/mdc/__init__.py +0 -0
  29. {upgini-1.1.296 → upgini-1.1.296a3511.dev2}/src/upgini/mdc/context.py +0 -0
  30. {upgini-1.1.296 → upgini-1.1.296a3511.dev2}/src/upgini/metadata.py +0 -0
  31. {upgini-1.1.296 → upgini-1.1.296a3511.dev2}/src/upgini/metrics.py +0 -0
  32. {upgini-1.1.296 → upgini-1.1.296a3511.dev2}/src/upgini/normalizer/__init__.py +0 -0
  33. {upgini-1.1.296 → upgini-1.1.296a3511.dev2}/src/upgini/normalizer/phone_normalizer.py +0 -0
  34. {upgini-1.1.296 → upgini-1.1.296a3511.dev2}/src/upgini/resource_bundle/__init__.py +0 -0
  35. {upgini-1.1.296 → upgini-1.1.296a3511.dev2}/src/upgini/resource_bundle/exceptions.py +0 -0
  36. {upgini-1.1.296 → upgini-1.1.296a3511.dev2}/src/upgini/resource_bundle/strings.properties +0 -0
  37. {upgini-1.1.296 → upgini-1.1.296a3511.dev2}/src/upgini/resource_bundle/strings_widget.properties +0 -0
  38. {upgini-1.1.296 → upgini-1.1.296a3511.dev2}/src/upgini/sampler/__init__.py +0 -0
  39. {upgini-1.1.296 → upgini-1.1.296a3511.dev2}/src/upgini/sampler/base.py +0 -0
  40. {upgini-1.1.296 → upgini-1.1.296a3511.dev2}/src/upgini/sampler/random_under_sampler.py +0 -0
  41. {upgini-1.1.296 → upgini-1.1.296a3511.dev2}/src/upgini/sampler/utils.py +0 -0
  42. {upgini-1.1.296 → upgini-1.1.296a3511.dev2}/src/upgini/search_task.py +0 -0
  43. {upgini-1.1.296 → upgini-1.1.296a3511.dev2}/src/upgini/spinner.py +0 -0
  44. {upgini-1.1.296 → upgini-1.1.296a3511.dev2}/src/upgini/utils/__init__.py +0 -0
  45. {upgini-1.1.296 → upgini-1.1.296a3511.dev2}/src/upgini/utils/base_search_key_detector.py +0 -0
  46. {upgini-1.1.296 → upgini-1.1.296a3511.dev2}/src/upgini/utils/blocked_time_series.py +0 -0
  47. {upgini-1.1.296 → upgini-1.1.296a3511.dev2}/src/upgini/utils/country_utils.py +0 -0
  48. {upgini-1.1.296 → upgini-1.1.296a3511.dev2}/src/upgini/utils/custom_loss_utils.py +0 -0
  49. {upgini-1.1.296 → upgini-1.1.296a3511.dev2}/src/upgini/utils/cv_utils.py +0 -0
  50. {upgini-1.1.296 → upgini-1.1.296a3511.dev2}/src/upgini/utils/datetime_utils.py +0 -0
  51. {upgini-1.1.296 → upgini-1.1.296a3511.dev2}/src/upgini/utils/deduplicate_utils.py +0 -0
  52. {upgini-1.1.296 → upgini-1.1.296a3511.dev2}/src/upgini/utils/display_utils.py +0 -0
  53. {upgini-1.1.296 → upgini-1.1.296a3511.dev2}/src/upgini/utils/email_utils.py +0 -0
  54. {upgini-1.1.296 → upgini-1.1.296a3511.dev2}/src/upgini/utils/fallback_progress_bar.py +0 -0
  55. {upgini-1.1.296 → upgini-1.1.296a3511.dev2}/src/upgini/utils/features_validator.py +0 -0
  56. {upgini-1.1.296 → upgini-1.1.296a3511.dev2}/src/upgini/utils/format.py +0 -0
  57. {upgini-1.1.296 → upgini-1.1.296a3511.dev2}/src/upgini/utils/ip_utils.py +0 -0
  58. {upgini-1.1.296 → upgini-1.1.296a3511.dev2}/src/upgini/utils/phone_utils.py +0 -0
  59. {upgini-1.1.296 → upgini-1.1.296a3511.dev2}/src/upgini/utils/postal_code_utils.py +0 -0
  60. {upgini-1.1.296 → upgini-1.1.296a3511.dev2}/src/upgini/utils/progress_bar.py +0 -0
  61. {upgini-1.1.296 → upgini-1.1.296a3511.dev2}/src/upgini/utils/sklearn_ext.py +0 -0
  62. {upgini-1.1.296 → upgini-1.1.296a3511.dev2}/src/upgini/utils/target_utils.py +0 -0
  63. {upgini-1.1.296 → upgini-1.1.296a3511.dev2}/src/upgini/utils/track_info.py +0 -0
  64. {upgini-1.1.296 → upgini-1.1.296a3511.dev2}/src/upgini/utils/warning_counter.py +0 -0
  65. {upgini-1.1.296 → upgini-1.1.296a3511.dev2}/src/upgini/version_validator.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: upgini
3
- Version: 1.1.296
3
+ Version: 1.1.296a3511.dev2
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
6
6
  Project-URL: Homepage, https://upgini.com/
@@ -26,6 +26,8 @@ Requires-Python: <3.11,>=3.8
26
26
  Requires-Dist: catboost>=1.0.3
27
27
  Requires-Dist: fastparquet>=0.8.1
28
28
  Requires-Dist: ipywidgets>=8.1.0
29
+ Requires-Dist: jarowinkler>=2.0.0
30
+ Requires-Dist: levenshtein>=0.25.1
29
31
  Requires-Dist: lightgbm>=3.3.2
30
32
  Requires-Dist: numpy>=1.19.0
31
33
  Requires-Dist: pandas<3.0.0,>=1.1.0
@@ -48,6 +48,8 @@ dependencies = [
48
48
  "requests>=2.8.0",
49
49
  "scikit-learn>=1.3.0",
50
50
  "xhtml2pdf==0.2.11",
51
+ "jarowinkler>=2.0.0",
52
+ "levenshtein>=0.25.1",
51
53
  ]
52
54
 
53
55
  [project.urls]
@@ -0,0 +1 @@
1
+ __version__ = "1.1.296a3511.dev2"
@@ -1,8 +1,22 @@
1
1
  from typing import Dict
2
2
 
3
- from upgini.autofe.binary import Add, Divide, Max, Min, Multiply, Sim, Subtract
3
+ from upgini.autofe.binary import (
4
+ Add,
5
+ Combine,
6
+ CombineThenFreq,
7
+ Distance,
8
+ Divide,
9
+ JaroWinklerSim1,
10
+ JaroWinklerSim2,
11
+ LevenshteinSim,
12
+ Max,
13
+ Min,
14
+ Multiply,
15
+ Sim,
16
+ Subtract,
17
+ )
4
18
  from upgini.autofe.date import DateDiff, DateDiffType2, DateListDiff, DateListDiffBounded, DatePercentile
5
- from upgini.autofe.groupby import GroupByThenAgg, GroupByThenRank
19
+ from upgini.autofe.groupby import GroupByThenAgg, GroupByThenFreq, GroupByThenNUnique, GroupByThenRank
6
20
  from upgini.autofe.operand import Operand
7
21
  from upgini.autofe.unary import Abs, Floor, Freq, Log, Residual, Norm, Sigmoid, Sqrt, Square
8
22
  from upgini.autofe.vector import Mean, Sum
@@ -32,10 +46,10 @@ ALL_OPERANDS: Dict[str, Operand] = {
32
46
  GroupByThenAgg(name="GroupByThenMedian", agg="median"),
33
47
  GroupByThenAgg(name="GroupByThenStd", output_type="float", agg="std"),
34
48
  GroupByThenRank(),
35
- Operand(name="Combine", has_symmetry_importance=True, output_type="object", is_categorical=True),
36
- Operand(name="CombineThenFreq", has_symmetry_importance=True, output_type="float"),
37
- Operand(name="GroupByThenNUnique", output_type="int", is_vectorizable=True, is_grouping=True),
38
- Operand(name="GroupByThenFreq", output_type="float", is_grouping=True),
49
+ Combine(),
50
+ CombineThenFreq(),
51
+ GroupByThenNUnique(),
52
+ GroupByThenFreq(),
39
53
  Sim(),
40
54
  DateDiff(),
41
55
  DateDiffType2(),
@@ -51,6 +65,10 @@ ALL_OPERANDS: Dict[str, Operand] = {
51
65
  DateListDiffBounded(diff_unit="Y", aggregation="count", lower_bound=60),
52
66
  DatePercentile(),
53
67
  Norm(),
68
+ JaroWinklerSim1(),
69
+ JaroWinklerSim2(),
70
+ LevenshteinSim(),
71
+ Distance(),
54
72
  ]
55
73
  }
56
74
 
@@ -1,7 +1,11 @@
1
+ import abc
2
+ from typing import Optional
3
+ import Levenshtein
1
4
  import numpy as np
2
5
  import pandas as pd
3
6
  from numpy import dot
4
7
  from numpy.linalg import norm
8
+ from jarowinkler import jarowinkler_similarity
5
9
 
6
10
  from upgini.autofe.operand import PandasOperand, VectorizableMixin
7
11
 
@@ -130,8 +134,8 @@ class CombineThenFreq(PandasOperand):
130
134
  self._loc(temp, value_counts)
131
135
 
132
136
 
133
- class Sim(PandasOperand):
134
- name = "sim"
137
+ class Distance(PandasOperand):
138
+ name = "dist"
135
139
  is_binary = True
136
140
  output_type = "float"
137
141
  is_symmetrical = True
@@ -139,3 +143,78 @@ class Sim(PandasOperand):
139
143
 
140
144
  def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
141
145
  return dot(left, right) / (norm(left) * norm(right))
146
+
147
+
148
+ class Sim(Distance):
149
+ name = "sim"
150
+ is_binary = True
151
+ output_type = "float"
152
+ is_symmetrical = True
153
+ has_symmetry_importance = True
154
+
155
+
156
+ class StringSim(PandasOperand, abc.ABC):
157
+ def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
158
+ sims = []
159
+ for i in left.index:
160
+ left_i = self._prepare_value(left.get(i))
161
+ right_i = self._prepare_value(right.get(i))
162
+ if left_i is not None and right_i is not None:
163
+ sims.append(self._similarity(left_i, right_i))
164
+ else:
165
+ sims.append(None)
166
+
167
+ return pd.Series(sims, index=left.index)
168
+
169
+ @abc.abstractmethod
170
+ def _prepare_value(self, value: Optional[str]) -> Optional[str]:
171
+ pass
172
+
173
+ @abc.abstractmethod
174
+ def _similarity(self, left: str, right: str) -> float:
175
+ pass
176
+
177
+
178
+ class JaroWinklerSim1(StringSim):
179
+ name = "sim_jw1"
180
+ is_binary = True
181
+ input_type = "string"
182
+ output_type = "float"
183
+ is_symmetrical = True
184
+ has_symmetry_importance = True
185
+
186
+ def _prepare_value(self, value: Optional[str]) -> Optional[str]:
187
+ return value
188
+
189
+ def _similarity(self, left: str, right: str) -> float:
190
+ return jarowinkler_similarity(left, right)
191
+
192
+
193
+ class JaroWinklerSim2(StringSim):
194
+ name = "sim_jw2"
195
+ is_binary = True
196
+ input_type = "string"
197
+ output_type = "float"
198
+ is_symmetrical = True
199
+ has_symmetry_importance = True
200
+
201
+ def _prepare_value(self, value: Optional[str]) -> Optional[str]:
202
+ return value[::-1] if value is not None else None
203
+
204
+ def _similarity(self, left: str, right: str) -> float:
205
+ return jarowinkler_similarity(left, right)
206
+
207
+
208
+ class LevenshteinSim(StringSim):
209
+ name = "sim_lv"
210
+ is_binary = True
211
+ input_type = "string"
212
+ output_type = "float"
213
+ is_symmetrical = True
214
+ has_symmetry_importance = True
215
+
216
+ def _prepare_value(self, value: Optional[str]) -> Optional[str]:
217
+ return value
218
+
219
+ def _similarity(self, left: str, right: str) -> float:
220
+ return 1 - Levenshtein.distance(left, right) / max(len(left), len(right))
@@ -59,35 +59,9 @@ class DataSourcePublisher:
59
59
  features_for_embeddings: Optional[List[str]] = DEFAULT_GENERATE_EMBEDDINGS,
60
60
  data_table_id_to_replace: Optional[str] = None,
61
61
  keep_features: Optional[List[str]] = None,
62
- date_features: Optional[List[str]] = None,
63
- date_vector_features: Optional[List[str]] = None,
64
62
  _force_generation=False,
65
63
  _silent=False,
66
64
  ) -> str:
67
- """Register new ADS
68
-
69
- Parameters
70
- ----------
71
- data_table_uri - str - table name in format {project_id}.{datasource_name}.{table_name}
72
-
73
- search_keys - dict with column names as keys and SearchKey as value
74
-
75
- update_frequency - str - (Monthly, Weekly, Daily, Annually, Quarterly)
76
-
77
- exclude_from_autofe_generation - optional list of features that should be excluded from AutoFE
78
-
79
- secondary_search_keys - optional dict of secondary search keys
80
-
81
- sort_column - optional str - name of unique column that could be used for sort
82
-
83
- date_format - optional str - format of date if it is present in search keys
84
-
85
- ...
86
-
87
- data_table_id_to_replace - optional str - id of registered ADS that should be replaced by new table
88
-
89
- keep_features - optional list - features that should not be removed from ADS (even if they are personal)
90
- """
91
65
  trace_id = str(uuid.uuid4())
92
66
 
93
67
  with MDC(trace_id=trace_id):
@@ -150,14 +124,6 @@ class DataSourcePublisher:
150
124
  request["excludeFromGeneration"] = exclude_from_autofe_generation
151
125
  if keep_features is not None:
152
126
  request["keepFeatures"] = keep_features
153
- if date_features is not None:
154
- if date_format is None:
155
- raise ValidationError("date_format should be presented if you use date features")
156
- request["dateFeatures"] = date_features
157
- if date_vector_features is not None:
158
- if date_format is None:
159
- raise ValidationError("date_format should be presented if you use date vector features")
160
- request["dateVectorFeatures"] = date_vector_features
161
127
  self.logger.info(f"Start registering data table {request}")
162
128
 
163
129
  task_id = self._rest_client.register_ads(request, trace_id)
@@ -215,9 +181,6 @@ class DataSourcePublisher:
215
181
  msg = f"Data table successfully registered with id: {data_table_id}"
216
182
  self.logger.info(msg)
217
183
  print(msg)
218
- if "warnings" in status_response and status_response["warnings"]:
219
- self.logger.warning(status_response["warnings"])
220
- print(status_response["warnings"])
221
184
  return data_table_id
222
185
  except KeyboardInterrupt:
223
186
  if task_id is not None:
@@ -2870,7 +2870,7 @@ class FeaturesEnricher(TransformerMixin):
2870
2870
  self.logger.info(f"Dates interval is ({min_date}, {max_date})")
2871
2871
 
2872
2872
  except Exception:
2873
- self.logger.warning("Failed to log debug information", exc_info=True)
2873
+ self.logger.exception("Failed to log debug information")
2874
2874
 
2875
2875
  def __handle_index_search_keys(self, df: pd.DataFrame, search_keys: Dict[str, SearchKey]) -> pd.DataFrame:
2876
2876
  index_names = df.index.names if df.index.names != [None] else [DEFAULT_INDEX]
@@ -1 +0,0 @@
1
- __version__ = "1.1.296"
File without changes
File without changes
File without changes