upgini 1.1.278a2__py3-none-any.whl → 1.1.279__py3-none-any.whl

This diff compares the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
Files changed (45)
  1. upgini/__about__.py +1 -0
  2. upgini/ads_management/ads_manager.py +4 -2
  3. upgini/autofe/all_operands.py +3 -2
  4. upgini/autofe/binary.py +2 -1
  5. upgini/autofe/date.py +2 -1
  6. upgini/autofe/feature.py +1 -1
  7. upgini/autofe/groupby.py +3 -1
  8. upgini/autofe/operand.py +4 -3
  9. upgini/autofe/unary.py +2 -1
  10. upgini/autofe/vector.py +2 -0
  11. upgini/dataset.py +6 -15
  12. upgini/errors.py +1 -1
  13. upgini/features_enricher.py +104 -217
  14. upgini/http.py +11 -10
  15. upgini/mdc/__init__.py +1 -3
  16. upgini/mdc/context.py +4 -6
  17. upgini/metadata.py +5 -10
  18. upgini/metrics.py +102 -100
  19. upgini/normalizer/phone_normalizer.py +1 -1
  20. upgini/resource_bundle/__init__.py +5 -5
  21. upgini/resource_bundle/strings.properties +0 -1
  22. upgini/sampler/base.py +1 -4
  23. upgini/sampler/random_under_sampler.py +2 -5
  24. upgini/search_task.py +4 -4
  25. upgini/spinner.py +1 -1
  26. upgini/utils/__init__.py +1 -1
  27. upgini/utils/base_search_key_detector.py +14 -16
  28. upgini/utils/blocked_time_series.py +4 -2
  29. upgini/utils/country_utils.py +1 -1
  30. upgini/utils/custom_loss_utils.py +3 -2
  31. upgini/utils/cv_utils.py +2 -2
  32. upgini/utils/datetime_utils.py +20 -15
  33. upgini/utils/deduplicate_utils.py +1 -11
  34. upgini/utils/email_utils.py +2 -7
  35. upgini/utils/fallback_progress_bar.py +1 -1
  36. upgini/utils/progress_bar.py +1 -1
  37. upgini/utils/sklearn_ext.py +14 -13
  38. upgini/utils/track_info.py +2 -2
  39. upgini/version_validator.py +2 -2
  40. {upgini-1.1.278a2.dist-info → upgini-1.1.279.dist-info}/METADATA +21 -23
  41. upgini-1.1.279.dist-info/RECORD +62 -0
  42. {upgini-1.1.278a2.dist-info → upgini-1.1.279.dist-info}/WHEEL +1 -2
  43. upgini-1.1.278a2.dist-info/RECORD +0 -62
  44. upgini-1.1.278a2.dist-info/top_level.txt +0 -1
  45. {upgini-1.1.278a2.dist-info → upgini-1.1.279.dist-info/licenses}/LICENSE +0 -0
upgini/__about__.py ADDED
@@ -0,0 +1 @@
1
+ __version__ = "1.1.279"
upgini/ads_management/ads_manager.py CHANGED
@@ -1,9 +1,11 @@
1
1
  import time
2
- from typing import Dict, Optional
3
2
  import uuid
3
+ from typing import Dict, Optional
4
+
5
+ import pandas as pd
6
+
4
7
  from upgini.http import get_rest_client
5
8
  from upgini.spinner import Spinner
6
- import pandas as pd
7
9
 
8
10
 
9
11
  class AdsManager:
upgini/autofe/all_operands.py CHANGED
@@ -1,9 +1,10 @@
1
1
  from typing import Dict
2
+
3
+ from upgini.autofe.binary import Add, Divide, Max, Min, Multiply, Sim, Subtract
2
4
  from upgini.autofe.date import DateDiff, DateDiffType2, DateListDiff, DateListDiffBounded
3
5
  from upgini.autofe.groupby import GroupByThenAgg, GroupByThenRank
4
6
  from upgini.autofe.operand import Operand
5
- from upgini.autofe.unary import Abs, Log, Residual, Sqrt, Square, Sigmoid, Floor, Freq
6
- from upgini.autofe.binary import Min, Max, Add, Subtract, Multiply, Divide, Sim
7
+ from upgini.autofe.unary import Abs, Floor, Freq, Log, Residual, Sigmoid, Sqrt, Square
7
8
  from upgini.autofe.vector import Mean, Sum
8
9
 
9
10
  ALL_OPERANDS: Dict[str, Operand] = {
upgini/autofe/binary.py CHANGED
@@ -1,9 +1,10 @@
1
- from upgini.autofe.operand import PandasOperand, VectorizableMixin
2
1
  import numpy as np
3
2
  import pandas as pd
4
3
  from numpy import dot
5
4
  from numpy.linalg import norm
6
5
 
6
+ from upgini.autofe.operand import PandasOperand, VectorizableMixin
7
+
7
8
 
8
9
  class Min(PandasOperand):
9
10
  name = "min"
upgini/autofe/date.py CHANGED
@@ -1,8 +1,9 @@
1
1
  from typing import Any, Optional, Union
2
+
2
3
  import numpy as np
3
4
  import pandas as pd
4
- from pydantic import BaseModel
5
5
  from pandas.core.arrays.timedeltas import TimedeltaArray
6
+ from pydantic import BaseModel
6
7
 
7
8
  from upgini.autofe.operand import PandasOperand
8
9
 
upgini/autofe/feature.py CHANGED
@@ -215,7 +215,7 @@ class Feature:
215
215
  return Column(string)
216
216
 
217
217
  def is_trivial_char(c: str) -> bool:
218
- return not (c in "()+-*/,")
218
+ return c not in "()+-*/,"
219
219
 
220
220
  def find_prev(string: str) -> int:
221
221
  if string[-1] != ")":
upgini/autofe/groupby.py CHANGED
@@ -1,7 +1,9 @@
1
- from upgini.autofe.operand import PandasOperand, VectorizableMixin
2
1
  from typing import Optional
2
+
3
3
  import pandas as pd
4
4
 
5
+ from upgini.autofe.operand import PandasOperand, VectorizableMixin
6
+
5
7
 
6
8
  class GroupByThenAgg(PandasOperand, VectorizableMixin):
7
9
  agg: Optional[str]
upgini/autofe/operand.py CHANGED
@@ -1,8 +1,9 @@
1
- from pydantic import BaseModel
2
- from typing import Dict, List, Optional, Tuple, Union
3
1
  import abc
4
- import pandas as pd
2
+ from typing import Dict, List, Optional, Tuple, Union
3
+
5
4
  import numpy as np
5
+ import pandas as pd
6
+ from pydantic import BaseModel
6
7
 
7
8
 
8
9
  class Operand(BaseModel):
upgini/autofe/unary.py CHANGED
@@ -1,7 +1,8 @@
1
- from upgini.autofe.operand import PandasOperand, VectorizableMixin
2
1
  import numpy as np
3
2
  import pandas as pd
4
3
 
4
+ from upgini.autofe.operand import PandasOperand, VectorizableMixin
5
+
5
6
 
6
7
  class Abs(PandasOperand, VectorizableMixin):
7
8
  name = "abs"
upgini/autofe/vector.py CHANGED
@@ -1,5 +1,7 @@
1
1
  from typing import List
2
+
2
3
  import pandas as pd
4
+
3
5
  from upgini.autofe.operand import PandasOperand, VectorizableMixin
4
6
 
5
7
 
upgini/dataset.py CHANGED
@@ -15,17 +15,15 @@ from pandas.api.types import (
15
15
  is_float_dtype,
16
16
  is_integer_dtype,
17
17
  is_numeric_dtype,
18
+ is_object_dtype,
18
19
  is_period_dtype,
19
20
  is_string_dtype,
20
- is_object_dtype,
21
21
  )
22
22
 
23
23
  from upgini.errors import ValidationError
24
24
  from upgini.http import ProgressStage, SearchProgress, _RestClient
25
25
  from upgini.metadata import (
26
- ENTITY_SYSTEM_RECORD_ID,
27
26
  EVAL_SET_INDEX,
28
- SEARCH_KEY_UNNEST,
29
27
  SYSTEM_COLUMNS,
30
28
  SYSTEM_RECORD_ID,
31
29
  TARGET,
@@ -81,7 +79,6 @@ class Dataset: # (pd.DataFrame):
81
79
  path: Optional[str] = None,
82
80
  meaning_types: Optional[Dict[str, FileColumnMeaningType]] = None,
83
81
  search_keys: Optional[List[Tuple[str, ...]]] = None,
84
- unnest_search_keys: Optional[Dict[str, str]] = None,
85
82
  model_task_type: Optional[ModelTaskType] = None,
86
83
  random_state: Optional[int] = None,
87
84
  rest_client: Optional[_RestClient] = None,
@@ -98,7 +95,7 @@ class Dataset: # (pd.DataFrame):
98
95
  data = pd.read_csv(path, **kwargs)
99
96
  else:
100
97
  # try different separators: , ; \t ...
101
- with open(path, mode="r") as csvfile:
98
+ with open(path) as csvfile:
102
99
  sep = csv.Sniffer().sniff(csvfile.read(2048)).delimiter
103
100
  kwargs["sep"] = sep
104
101
  data = pd.read_csv(path, **kwargs)
@@ -116,7 +113,6 @@ class Dataset: # (pd.DataFrame):
116
113
  self.description = description
117
114
  self.meaning_types = meaning_types
118
115
  self.search_keys = search_keys
119
- self.unnest_search_keys = unnest_search_keys
120
116
  self.ignore_columns = []
121
117
  self.hierarchical_group_keys = []
122
118
  self.hierarchical_subgroup_keys = []
@@ -176,7 +172,7 @@ class Dataset: # (pd.DataFrame):
176
172
  new_columns = []
177
173
  dup_counter = 0
178
174
  for column in self.data.columns:
179
- if column in [TARGET, EVAL_SET_INDEX, SYSTEM_RECORD_ID, ENTITY_SYSTEM_RECORD_ID, SEARCH_KEY_UNNEST]:
175
+ if column in [TARGET, EVAL_SET_INDEX, SYSTEM_RECORD_ID]:
180
176
  self.columns_renaming[column] = column
181
177
  new_columns.append(column)
182
178
  continue
@@ -255,7 +251,7 @@ class Dataset: # (pd.DataFrame):
255
251
  @staticmethod
256
252
  def _ip_to_int(ip: Optional[_BaseAddress]) -> Optional[int]:
257
253
  try:
258
- if isinstance(ip, IPv4Address) or isinstance(ip, IPv6Address):
254
+ if isinstance(ip, (IPv4Address, IPv6Address)):
259
255
  return int(ip)
260
256
  except Exception:
261
257
  pass
@@ -263,7 +259,7 @@ class Dataset: # (pd.DataFrame):
263
259
  @staticmethod
264
260
  def _ip_to_int_str(ip: Optional[_BaseAddress]) -> Optional[str]:
265
261
  try:
266
- if isinstance(ip, IPv4Address) or isinstance(ip, IPv6Address):
262
+ if isinstance(ip, (IPv4Address, IPv6Address)):
267
263
  return str(int(ip))
268
264
  except Exception:
269
265
  pass
@@ -357,9 +353,7 @@ class Dataset: # (pd.DataFrame):
357
353
 
358
354
  if is_string_dtype(self.data[postal_code]) or is_object_dtype(self.data[postal_code]):
359
355
  try:
360
- self.data[postal_code] = (
361
- self.data[postal_code].astype("string").astype("Float64").astype("Int64").astype("string")
362
- )
356
+ self.data[postal_code] = self.data[postal_code].astype("float64").astype("Int64").astype("string")
363
357
  except Exception:
364
358
  pass
365
359
  elif is_float_dtype(self.data[postal_code]):
@@ -809,9 +803,6 @@ class Dataset: # (pd.DataFrame):
809
803
  meaningType=meaning_type,
810
804
  minMaxValues=min_max_values,
811
805
  )
812
- if self.unnest_search_keys and column_meta.originalName in self.unnest_search_keys:
813
- column_meta.isUnnest = True
814
- column_meta.unnestKeyNames = self.unnest_search_keys[column_meta.originalName]
815
806
 
816
807
  columns.append(column_meta)
817
808
 
upgini/errors.py CHANGED
@@ -16,7 +16,7 @@ class UnauthorizedError(HttpError):
16
16
  """Unauthorized error from REST API."""
17
17
 
18
18
  def __init__(self, message, status_code):
19
- message = "Unauthorized, please check your authorization token ({})".format(message)
19
+ message = f"Unauthorized, please check your authorization token ({message})"
20
20
  super(UnauthorizedError, self).__init__(message, status_code)
21
21
 
22
22