upgini 1.2.97__tar.gz → 1.2.98__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (79)
  1. {upgini-1.2.97 → upgini-1.2.98}/PKG-INFO +1 -1
  2. upgini-1.2.98/src/upgini/__about__.py +1 -0
  3. {upgini-1.2.97 → upgini-1.2.98}/src/upgini/autofe/feature.py +20 -10
  4. {upgini-1.2.97 → upgini-1.2.98}/src/upgini/autofe/unary.py +22 -1
  5. {upgini-1.2.97 → upgini-1.2.98}/src/upgini/features_enricher.py +1 -1
  6. {upgini-1.2.97 → upgini-1.2.98}/src/upgini/metrics.py +6 -6
  7. upgini-1.2.97/src/upgini/__about__.py +0 -1
  8. {upgini-1.2.97 → upgini-1.2.98}/.gitignore +0 -0
  9. {upgini-1.2.97 → upgini-1.2.98}/LICENSE +0 -0
  10. {upgini-1.2.97 → upgini-1.2.98}/README.md +0 -0
  11. {upgini-1.2.97 → upgini-1.2.98}/pyproject.toml +0 -0
  12. {upgini-1.2.97 → upgini-1.2.98}/src/upgini/__init__.py +0 -0
  13. {upgini-1.2.97 → upgini-1.2.98}/src/upgini/ads.py +0 -0
  14. {upgini-1.2.97 → upgini-1.2.98}/src/upgini/ads_management/__init__.py +0 -0
  15. {upgini-1.2.97 → upgini-1.2.98}/src/upgini/ads_management/ads_manager.py +0 -0
  16. {upgini-1.2.97 → upgini-1.2.98}/src/upgini/autofe/__init__.py +0 -0
  17. {upgini-1.2.97 → upgini-1.2.98}/src/upgini/autofe/all_operators.py +0 -0
  18. {upgini-1.2.97 → upgini-1.2.98}/src/upgini/autofe/binary.py +0 -0
  19. {upgini-1.2.97 → upgini-1.2.98}/src/upgini/autofe/date.py +0 -0
  20. {upgini-1.2.97 → upgini-1.2.98}/src/upgini/autofe/groupby.py +0 -0
  21. {upgini-1.2.97 → upgini-1.2.98}/src/upgini/autofe/operator.py +0 -0
  22. {upgini-1.2.97 → upgini-1.2.98}/src/upgini/autofe/timeseries/__init__.py +0 -0
  23. {upgini-1.2.97 → upgini-1.2.98}/src/upgini/autofe/timeseries/base.py +0 -0
  24. {upgini-1.2.97 → upgini-1.2.98}/src/upgini/autofe/timeseries/cross.py +0 -0
  25. {upgini-1.2.97 → upgini-1.2.98}/src/upgini/autofe/timeseries/delta.py +0 -0
  26. {upgini-1.2.97 → upgini-1.2.98}/src/upgini/autofe/timeseries/lag.py +0 -0
  27. {upgini-1.2.97 → upgini-1.2.98}/src/upgini/autofe/timeseries/roll.py +0 -0
  28. {upgini-1.2.97 → upgini-1.2.98}/src/upgini/autofe/timeseries/trend.py +0 -0
  29. {upgini-1.2.97 → upgini-1.2.98}/src/upgini/autofe/timeseries/volatility.py +0 -0
  30. {upgini-1.2.97 → upgini-1.2.98}/src/upgini/autofe/utils.py +0 -0
  31. {upgini-1.2.97 → upgini-1.2.98}/src/upgini/autofe/vector.py +0 -0
  32. {upgini-1.2.97 → upgini-1.2.98}/src/upgini/data_source/__init__.py +0 -0
  33. {upgini-1.2.97 → upgini-1.2.98}/src/upgini/data_source/data_source_publisher.py +0 -0
  34. {upgini-1.2.97 → upgini-1.2.98}/src/upgini/dataset.py +0 -0
  35. {upgini-1.2.97 → upgini-1.2.98}/src/upgini/errors.py +0 -0
  36. {upgini-1.2.97 → upgini-1.2.98}/src/upgini/http.py +0 -0
  37. {upgini-1.2.97 → upgini-1.2.98}/src/upgini/mdc/__init__.py +0 -0
  38. {upgini-1.2.97 → upgini-1.2.98}/src/upgini/mdc/context.py +0 -0
  39. {upgini-1.2.97 → upgini-1.2.98}/src/upgini/metadata.py +0 -0
  40. {upgini-1.2.97 → upgini-1.2.98}/src/upgini/normalizer/__init__.py +0 -0
  41. {upgini-1.2.97 → upgini-1.2.98}/src/upgini/normalizer/normalize_utils.py +0 -0
  42. {upgini-1.2.97 → upgini-1.2.98}/src/upgini/resource_bundle/__init__.py +0 -0
  43. {upgini-1.2.97 → upgini-1.2.98}/src/upgini/resource_bundle/exceptions.py +0 -0
  44. {upgini-1.2.97 → upgini-1.2.98}/src/upgini/resource_bundle/strings.properties +0 -0
  45. {upgini-1.2.97 → upgini-1.2.98}/src/upgini/resource_bundle/strings_widget.properties +0 -0
  46. {upgini-1.2.97 → upgini-1.2.98}/src/upgini/sampler/__init__.py +0 -0
  47. {upgini-1.2.97 → upgini-1.2.98}/src/upgini/sampler/base.py +0 -0
  48. {upgini-1.2.97 → upgini-1.2.98}/src/upgini/sampler/random_under_sampler.py +0 -0
  49. {upgini-1.2.97 → upgini-1.2.98}/src/upgini/sampler/utils.py +0 -0
  50. {upgini-1.2.97 → upgini-1.2.98}/src/upgini/search_task.py +0 -0
  51. {upgini-1.2.97 → upgini-1.2.98}/src/upgini/spinner.py +0 -0
  52. {upgini-1.2.97 → upgini-1.2.98}/src/upgini/utils/Roboto-Regular.ttf +0 -0
  53. {upgini-1.2.97 → upgini-1.2.98}/src/upgini/utils/__init__.py +0 -0
  54. {upgini-1.2.97 → upgini-1.2.98}/src/upgini/utils/base_search_key_detector.py +0 -0
  55. {upgini-1.2.97 → upgini-1.2.98}/src/upgini/utils/blocked_time_series.py +0 -0
  56. {upgini-1.2.97 → upgini-1.2.98}/src/upgini/utils/country_utils.py +0 -0
  57. {upgini-1.2.97 → upgini-1.2.98}/src/upgini/utils/custom_loss_utils.py +0 -0
  58. {upgini-1.2.97 → upgini-1.2.98}/src/upgini/utils/cv_utils.py +0 -0
  59. {upgini-1.2.97 → upgini-1.2.98}/src/upgini/utils/datetime_utils.py +0 -0
  60. {upgini-1.2.97 → upgini-1.2.98}/src/upgini/utils/deduplicate_utils.py +0 -0
  61. {upgini-1.2.97 → upgini-1.2.98}/src/upgini/utils/display_utils.py +0 -0
  62. {upgini-1.2.97 → upgini-1.2.98}/src/upgini/utils/email_utils.py +0 -0
  63. {upgini-1.2.97 → upgini-1.2.98}/src/upgini/utils/fallback_progress_bar.py +0 -0
  64. {upgini-1.2.97 → upgini-1.2.98}/src/upgini/utils/feature_info.py +0 -0
  65. {upgini-1.2.97 → upgini-1.2.98}/src/upgini/utils/features_validator.py +0 -0
  66. {upgini-1.2.97 → upgini-1.2.98}/src/upgini/utils/format.py +0 -0
  67. {upgini-1.2.97 → upgini-1.2.98}/src/upgini/utils/ip_utils.py +0 -0
  68. {upgini-1.2.97 → upgini-1.2.98}/src/upgini/utils/mstats.py +0 -0
  69. {upgini-1.2.97 → upgini-1.2.98}/src/upgini/utils/phone_utils.py +0 -0
  70. {upgini-1.2.97 → upgini-1.2.98}/src/upgini/utils/postal_code_utils.py +0 -0
  71. {upgini-1.2.97 → upgini-1.2.98}/src/upgini/utils/progress_bar.py +0 -0
  72. {upgini-1.2.97 → upgini-1.2.98}/src/upgini/utils/sample_utils.py +0 -0
  73. {upgini-1.2.97 → upgini-1.2.98}/src/upgini/utils/sklearn_ext.py +0 -0
  74. {upgini-1.2.97 → upgini-1.2.98}/src/upgini/utils/sort.py +0 -0
  75. {upgini-1.2.97 → upgini-1.2.98}/src/upgini/utils/target_utils.py +0 -0
  76. {upgini-1.2.97 → upgini-1.2.98}/src/upgini/utils/track_info.py +0 -0
  77. {upgini-1.2.97 → upgini-1.2.98}/src/upgini/utils/ts_utils.py +0 -0
  78. {upgini-1.2.97 → upgini-1.2.98}/src/upgini/utils/warning_counter.py +0 -0
  79. {upgini-1.2.97 → upgini-1.2.98}/src/upgini/version_validator.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: upgini
3
- Version: 1.2.97
3
+ Version: 1.2.98
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
6
6
  Project-URL: Homepage, https://upgini.com/
@@ -0,0 +1 @@
1
+ __version__ = "1.2.98"
@@ -18,10 +18,7 @@ class Column:
18
18
  self.data = data
19
19
  self.calculate_all = calculate_all
20
20
 
21
- def get_display_name(self, cache: bool = True, shorten: bool = False, **kwargs) -> str:
22
- return self.name
23
-
24
- def set_op_params(self, params: Dict[str, str]) -> "Column":
21
+ def set_op_params(self, params: Dict[str, str], **kwargs) -> "Column":
25
22
  return self
26
23
 
27
24
  def get_op_params(self, **kwargs):
@@ -37,8 +34,21 @@ class Column:
37
34
  def get_column_nodes(self) -> List["Column"]:
38
35
  return [self]
39
36
 
40
- def get_columns(self, **kwargs) -> List[str]:
41
- return [self.name]
37
+ def get_columns(self, unhash=False, **kwargs):
38
+ name = self.name
39
+ return [self._unhash(name) if unhash else name]
40
+
41
+ def get_display_name(self, cache: bool = True, shorten: bool = False, **kwargs) -> str:
42
+ return self.get_columns(**kwargs)[0]
43
+
44
+ def _unhash(self, feature_name: str) -> str:
45
+ last_component_idx = feature_name.rfind("_")
46
+ if not feature_name.startswith("f_"):
47
+ return feature_name # etalon feature
48
+ elif last_component_idx == 1:
49
+ return feature_name[2:] # fully hashed name, cannot unhash
50
+ else:
51
+ return feature_name[2:last_component_idx]
42
52
 
43
53
  @property
44
54
  def children(self) -> List[Union["Feature", "Column"]]:
@@ -81,7 +91,7 @@ class Feature:
81
91
  self.cached_display_name = cached_display_name
82
92
  self.alias = alias
83
93
 
84
- def set_op_params(self, params: Optional[Dict[str, str]]) -> "Feature":
94
+ def set_op_params(self, params: Optional[Dict[str, str]], **kwargs) -> "Feature":
85
95
  obj_dict = pydantic_dump_method(self.op)().copy()
86
96
  obj_dict.update(params or {})
87
97
  self.op = pydantic_parse_method(self.op.__class__)(obj_dict)
@@ -89,13 +99,13 @@ class Feature:
89
99
 
90
100
  for child in self.children:
91
101
  child_params = {
92
- k[len(child.get_display_name()) + 1 :]: v
102
+ k[len(child.get_display_name(**kwargs)) + 1 :]: v
93
103
  for k, v in params.items()
94
- if k.startswith(child.get_display_name())
104
+ if k.startswith(child.get_display_name(**kwargs))
95
105
  }
96
106
  if not child_params:
97
107
  child_params = params
98
- child.set_op_params(child_params)
108
+ child.set_op_params(child_params, **kwargs)
99
109
  return self
100
110
 
101
111
  def get_op_params(self, **kwargs) -> Dict[str, str]:
@@ -3,7 +3,7 @@ from typing import Dict, List, Optional
3
3
  import numpy as np
4
4
  import pandas as pd
5
5
 
6
- from upgini.autofe.operator import PandasOperator, VectorizableMixin
6
+ from upgini.autofe.operator import PandasOperator, ParametrizedOperator, VectorizableMixin
7
7
  from upgini.autofe.utils import pydantic_validator
8
8
 
9
9
 
@@ -198,3 +198,24 @@ class Cluster(PandasOperator):
198
198
  input_type: Optional[str] = "vector"
199
199
  output_type: Optional[str] = "category"
200
200
  is_categorical: bool = True
201
+
202
+
203
+ class OutlierDistance(PandasOperator, ParametrizedOperator):
204
+ name: str = "outlier_dist"
205
+ is_unary: bool = True
206
+ input_type: Optional[str] = "vector"
207
+ output_type: Optional[str] = "float"
208
+ class_value: Optional[str] = None
209
+
210
+ def to_formula(self) -> str:
211
+ return f"outlier_dist_{self.class_value if self.class_value is not None else 'all'}"
212
+
213
+ @classmethod
214
+ def from_formula(cls, formula: str) -> Optional["OutlierDistance"]:
215
+ if formula == "outlier_dist":
216
+ return cls()
217
+
218
+ if formula.startswith("outlier_dist_"):
219
+ class_value = formula.split("_")[-1]
220
+ return cls(class_value=None if class_value == "all" else class_value)
221
+ return None
@@ -4174,7 +4174,7 @@ if response.status_code == 200:
4174
4174
 
4175
4175
  description = {}
4176
4176
 
4177
- feature_meta = get_feature_by_name(autofe_feature.get_display_name(shorten=True))
4177
+ feature_meta = get_feature_by_name(autofe_feature.get_display_name(shorten=True, unhash=True))
4178
4178
  if feature_meta is None:
4179
4179
  self.logger.warning(f"Feature meta for display index {m.display_index} not found")
4180
4180
  continue
@@ -399,14 +399,14 @@ class EstimatorWrapper:
399
399
  self.converted_to_str.append(c)
400
400
  elif c in self.cat_features:
401
401
  if x[c].dtype == "bool" or (x[c].dtype == "category" and x[c].cat.categories.dtype == "bool"):
402
- x[c] = x[c].astype(np.int64)
402
+ x[c] = x[c].astype(pd.Int64Dtype())
403
403
  self.converted_to_int.append(c)
404
404
  elif x[c].dtype == "category" and is_integer_dtype(x[c].cat.categories):
405
405
  self.logger.info(
406
406
  f"Convert categorical feature {c} with integer categories"
407
407
  " to int64 and remove from cat_features"
408
408
  )
409
- x[c] = x[c].astype(np.int64)
409
+ x[c] = x[c].astype(pd.Int64Dtype())
410
410
  self.converted_to_int.append(c)
411
411
  self.cat_features.remove(c)
412
412
  elif is_float_dtype(x[c]) or (x[c].dtype == "category" and is_float_dtype(x[c].cat.categories)):
@@ -419,7 +419,7 @@ class EstimatorWrapper:
419
419
  else:
420
420
  if x[c].dtype == "bool" or (x[c].dtype == "category" and x[c].cat.categories.dtype == "bool"):
421
421
  self.logger.info(f"Convert bool feature {c} to int64")
422
- x[c] = x[c].astype(np.int64)
422
+ x[c] = x[c].astype(pd.Int64Dtype())
423
423
  self.converted_to_int.append(c)
424
424
  elif not is_valid_numeric_array_data(x[c]) and not is_numeric_dtype(x[c]):
425
425
  try:
@@ -442,7 +442,7 @@ class EstimatorWrapper:
442
442
  if self.converted_to_int:
443
443
  self.logger.info(f"Convert to int features on calculate metrics: {self.converted_to_int}")
444
444
  for c in self.converted_to_int:
445
- x[c] = x[c].astype(np.int64)
445
+ x[c] = x[c].astype(pd.Int64Dtype())
446
446
 
447
447
  if self.converted_to_str:
448
448
  self.logger.info(f"Convert to str features on calculate metrics: {self.converted_to_str}")
@@ -896,7 +896,7 @@ class LightGBMWrapper(EstimatorWrapper):
896
896
  x[c] = x[c].astype("category")
897
897
 
898
898
  for c in x.columns:
899
- if x[c].dtype not in ["category", "int64", "float64", "bool"]:
899
+ if x[c].dtype not in ["category", "int64", "float64", "bool", "Int64"]:
900
900
  self.logger.warning(f"Feature {c} is not numeric and will be dropped")
901
901
  self.dropped_features.append(c)
902
902
  x = x.drop(columns=c, errors="ignore")
@@ -987,7 +987,7 @@ class OtherEstimatorWrapper(EstimatorWrapper):
987
987
  x[c] = x[c].astype("category")
988
988
  params["cat_features"] = self.cat_features
989
989
  for c in x.columns:
990
- if x[c].dtype not in ["category", "int64", "float64", "bool"]:
990
+ if x[c].dtype not in ["category", "int64", "float64", "bool", "Int64"]:
991
991
  self.logger.warning(f"Feature {c} is not numeric and will be dropped")
992
992
  self.dropped_features.append(c)
993
993
  x = x.drop(columns=c, errors="ignore")
@@ -1 +0,0 @@
1
- __version__ = "1.2.97"
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes