upgini 1.1.274a3388.post2__tar.gz → 1.1.275a1__tar.gz

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.

Potentially problematic release.


This version of upgini might be problematic. Click here for more details.

Files changed (86)
  1. {upgini-1.1.274a3388.post2/src/upgini.egg-info → upgini-1.1.275a1}/PKG-INFO +1 -1
  2. {upgini-1.1.274a3388.post2 → upgini-1.1.275a1}/setup.py +1 -1
  3. {upgini-1.1.274a3388.post2 → upgini-1.1.275a1}/src/upgini/autofe/date.py +2 -17
  4. {upgini-1.1.274a3388.post2 → upgini-1.1.275a1}/src/upgini/autofe/feature.py +1 -70
  5. {upgini-1.1.274a3388.post2 → upgini-1.1.275a1}/src/upgini/autofe/operand.py +2 -10
  6. {upgini-1.1.274a3388.post2 → upgini-1.1.275a1}/src/upgini/dataset.py +10 -2
  7. {upgini-1.1.274a3388.post2 → upgini-1.1.275a1}/src/upgini/features_enricher.py +199 -92
  8. upgini-1.1.275a1/src/upgini/fingerprint.js +8 -0
  9. {upgini-1.1.274a3388.post2 → upgini-1.1.275a1}/src/upgini/metadata.py +9 -1
  10. {upgini-1.1.274a3388.post2 → upgini-1.1.275a1}/src/upgini/resource_bundle/strings.properties +1 -0
  11. upgini-1.1.275a1/src/upgini/utils/base_search_key_detector.py +27 -0
  12. {upgini-1.1.274a3388.post2 → upgini-1.1.275a1}/src/upgini/utils/deduplicate_utils.py +11 -1
  13. {upgini-1.1.274a3388.post2 → upgini-1.1.275a1}/src/upgini/utils/email_utils.py +5 -0
  14. {upgini-1.1.274a3388.post2 → upgini-1.1.275a1/src/upgini.egg-info}/PKG-INFO +1 -1
  15. {upgini-1.1.274a3388.post2 → upgini-1.1.275a1}/src/upgini.egg-info/SOURCES.txt +1 -0
  16. {upgini-1.1.274a3388.post2 → upgini-1.1.275a1}/tests/test_autofe_operands.py +0 -43
  17. {upgini-1.1.274a3388.post2 → upgini-1.1.275a1}/tests/test_country_utils.py +4 -4
  18. {upgini-1.1.274a3388.post2 → upgini-1.1.275a1}/tests/test_email_utils.py +8 -10
  19. {upgini-1.1.274a3388.post2 → upgini-1.1.275a1}/tests/test_etalon_validation.py +21 -2
  20. {upgini-1.1.274a3388.post2 → upgini-1.1.275a1}/tests/test_features_enricher.py +13 -3
  21. {upgini-1.1.274a3388.post2 → upgini-1.1.275a1}/tests/test_phone_utils.py +6 -6
  22. {upgini-1.1.274a3388.post2 → upgini-1.1.275a1}/tests/test_postal_code_utils.py +6 -6
  23. upgini-1.1.274a3388.post2/src/upgini/utils/base_search_key_detector.py +0 -25
  24. {upgini-1.1.274a3388.post2 → upgini-1.1.275a1}/LICENSE +0 -0
  25. {upgini-1.1.274a3388.post2 → upgini-1.1.275a1}/README.md +0 -0
  26. {upgini-1.1.274a3388.post2 → upgini-1.1.275a1}/pyproject.toml +0 -0
  27. {upgini-1.1.274a3388.post2 → upgini-1.1.275a1}/setup.cfg +0 -0
  28. {upgini-1.1.274a3388.post2 → upgini-1.1.275a1}/src/upgini/__init__.py +0 -0
  29. {upgini-1.1.274a3388.post2 → upgini-1.1.275a1}/src/upgini/ads.py +0 -0
  30. {upgini-1.1.274a3388.post2 → upgini-1.1.275a1}/src/upgini/ads_management/__init__.py +0 -0
  31. {upgini-1.1.274a3388.post2 → upgini-1.1.275a1}/src/upgini/ads_management/ads_manager.py +0 -0
  32. {upgini-1.1.274a3388.post2 → upgini-1.1.275a1}/src/upgini/autofe/__init__.py +0 -0
  33. {upgini-1.1.274a3388.post2 → upgini-1.1.275a1}/src/upgini/autofe/all_operands.py +0 -0
  34. {upgini-1.1.274a3388.post2 → upgini-1.1.275a1}/src/upgini/autofe/binary.py +0 -0
  35. {upgini-1.1.274a3388.post2 → upgini-1.1.275a1}/src/upgini/autofe/groupby.py +0 -0
  36. {upgini-1.1.274a3388.post2 → upgini-1.1.275a1}/src/upgini/autofe/unary.py +0 -0
  37. {upgini-1.1.274a3388.post2 → upgini-1.1.275a1}/src/upgini/autofe/vector.py +0 -0
  38. {upgini-1.1.274a3388.post2 → upgini-1.1.275a1}/src/upgini/data_source/__init__.py +0 -0
  39. {upgini-1.1.274a3388.post2 → upgini-1.1.275a1}/src/upgini/data_source/data_source_publisher.py +0 -0
  40. {upgini-1.1.274a3388.post2 → upgini-1.1.275a1}/src/upgini/errors.py +0 -0
  41. {upgini-1.1.274a3388.post2 → upgini-1.1.275a1}/src/upgini/http.py +0 -0
  42. {upgini-1.1.274a3388.post2 → upgini-1.1.275a1}/src/upgini/mdc/__init__.py +0 -0
  43. {upgini-1.1.274a3388.post2 → upgini-1.1.275a1}/src/upgini/mdc/context.py +0 -0
  44. {upgini-1.1.274a3388.post2 → upgini-1.1.275a1}/src/upgini/metrics.py +0 -0
  45. {upgini-1.1.274a3388.post2 → upgini-1.1.275a1}/src/upgini/normalizer/__init__.py +0 -0
  46. {upgini-1.1.274a3388.post2 → upgini-1.1.275a1}/src/upgini/normalizer/phone_normalizer.py +0 -0
  47. {upgini-1.1.274a3388.post2 → upgini-1.1.275a1}/src/upgini/resource_bundle/__init__.py +0 -0
  48. {upgini-1.1.274a3388.post2 → upgini-1.1.275a1}/src/upgini/resource_bundle/exceptions.py +0 -0
  49. {upgini-1.1.274a3388.post2 → upgini-1.1.275a1}/src/upgini/resource_bundle/strings_widget.properties +0 -0
  50. {upgini-1.1.274a3388.post2 → upgini-1.1.275a1}/src/upgini/sampler/__init__.py +0 -0
  51. {upgini-1.1.274a3388.post2 → upgini-1.1.275a1}/src/upgini/sampler/base.py +0 -0
  52. {upgini-1.1.274a3388.post2 → upgini-1.1.275a1}/src/upgini/sampler/random_under_sampler.py +0 -0
  53. {upgini-1.1.274a3388.post2 → upgini-1.1.275a1}/src/upgini/sampler/utils.py +0 -0
  54. {upgini-1.1.274a3388.post2 → upgini-1.1.275a1}/src/upgini/search_task.py +0 -0
  55. {upgini-1.1.274a3388.post2 → upgini-1.1.275a1}/src/upgini/spinner.py +0 -0
  56. {upgini-1.1.274a3388.post2 → upgini-1.1.275a1}/src/upgini/utils/__init__.py +0 -0
  57. {upgini-1.1.274a3388.post2 → upgini-1.1.275a1}/src/upgini/utils/blocked_time_series.py +0 -0
  58. {upgini-1.1.274a3388.post2 → upgini-1.1.275a1}/src/upgini/utils/country_utils.py +0 -0
  59. {upgini-1.1.274a3388.post2 → upgini-1.1.275a1}/src/upgini/utils/custom_loss_utils.py +0 -0
  60. {upgini-1.1.274a3388.post2 → upgini-1.1.275a1}/src/upgini/utils/cv_utils.py +0 -0
  61. {upgini-1.1.274a3388.post2 → upgini-1.1.275a1}/src/upgini/utils/datetime_utils.py +0 -0
  62. {upgini-1.1.274a3388.post2 → upgini-1.1.275a1}/src/upgini/utils/display_utils.py +0 -0
  63. {upgini-1.1.274a3388.post2 → upgini-1.1.275a1}/src/upgini/utils/fallback_progress_bar.py +0 -0
  64. {upgini-1.1.274a3388.post2 → upgini-1.1.275a1}/src/upgini/utils/features_validator.py +0 -0
  65. {upgini-1.1.274a3388.post2 → upgini-1.1.275a1}/src/upgini/utils/format.py +0 -0
  66. {upgini-1.1.274a3388.post2 → upgini-1.1.275a1}/src/upgini/utils/ip_utils.py +0 -0
  67. {upgini-1.1.274a3388.post2 → upgini-1.1.275a1}/src/upgini/utils/phone_utils.py +0 -0
  68. {upgini-1.1.274a3388.post2 → upgini-1.1.275a1}/src/upgini/utils/postal_code_utils.py +0 -0
  69. {upgini-1.1.274a3388.post2 → upgini-1.1.275a1}/src/upgini/utils/progress_bar.py +0 -0
  70. {upgini-1.1.274a3388.post2 → upgini-1.1.275a1}/src/upgini/utils/sklearn_ext.py +0 -0
  71. {upgini-1.1.274a3388.post2 → upgini-1.1.275a1}/src/upgini/utils/target_utils.py +0 -0
  72. {upgini-1.1.274a3388.post2 → upgini-1.1.275a1}/src/upgini/utils/track_info.py +0 -0
  73. {upgini-1.1.274a3388.post2 → upgini-1.1.275a1}/src/upgini/utils/warning_counter.py +0 -0
  74. {upgini-1.1.274a3388.post2 → upgini-1.1.275a1}/src/upgini/version_validator.py +0 -0
  75. {upgini-1.1.274a3388.post2 → upgini-1.1.275a1}/src/upgini.egg-info/dependency_links.txt +0 -0
  76. {upgini-1.1.274a3388.post2 → upgini-1.1.275a1}/src/upgini.egg-info/requires.txt +0 -0
  77. {upgini-1.1.274a3388.post2 → upgini-1.1.275a1}/src/upgini.egg-info/top_level.txt +0 -0
  78. {upgini-1.1.274a3388.post2 → upgini-1.1.275a1}/tests/test_binary_dataset.py +0 -0
  79. {upgini-1.1.274a3388.post2 → upgini-1.1.275a1}/tests/test_blocked_time_series.py +0 -0
  80. {upgini-1.1.274a3388.post2 → upgini-1.1.275a1}/tests/test_categorical_dataset.py +0 -0
  81. {upgini-1.1.274a3388.post2 → upgini-1.1.275a1}/tests/test_continuous_dataset.py +0 -0
  82. {upgini-1.1.274a3388.post2 → upgini-1.1.275a1}/tests/test_custom_loss_utils.py +0 -0
  83. {upgini-1.1.274a3388.post2 → upgini-1.1.275a1}/tests/test_datetime_utils.py +0 -0
  84. {upgini-1.1.274a3388.post2 → upgini-1.1.275a1}/tests/test_metrics.py +0 -0
  85. {upgini-1.1.274a3388.post2 → upgini-1.1.275a1}/tests/test_target_utils.py +0 -0
  86. {upgini-1.1.274a3388.post2 → upgini-1.1.275a1}/tests/test_widget.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: upgini
3
- Version: 1.1.274a3388.post2
3
+ Version: 1.1.275a1
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Home-page: https://upgini.com/
6
6
  Author: Upgini Developers
@@ -40,7 +40,7 @@ def send_log(msg: str):
40
40
 
41
41
 
42
42
  here = Path(__file__).parent.resolve()
43
- version = "1.1.274a3388-2"
43
+ version = "1.1.275a1"
44
44
  try:
45
45
  send_log(f"Start setup PyLib version {version}")
46
46
  setup(
@@ -1,9 +1,9 @@
1
- from typing import Any, List, Optional, Union
1
+ from typing import Any, Optional, Union
2
2
  import numpy as np
3
3
  import pandas as pd
4
4
  from pydantic import BaseModel
5
5
 
6
- from upgini.autofe.operand import MultiOperand, Operand, PandasOperand
6
+ from upgini.autofe.operand import PandasOperand
7
7
 
8
8
 
9
9
  class DateDiffMixin(BaseModel):
@@ -24,7 +24,6 @@ class DateDiff(PandasOperand, DateDiffMixin):
24
24
  name = "date_diff"
25
25
  is_binary = True
26
26
  has_symmetry_importance = True
27
- common_type = "date_diff"
28
27
 
29
28
  def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
30
29
  left = self._convert_to_date(left, self.left_unit)
@@ -40,7 +39,6 @@ class DateDiffType2(PandasOperand, DateDiffMixin):
40
39
  name = "date_diff_type2"
41
40
  is_binary = True
42
41
  has_symmetry_importance = True
43
- common_type = "date_diff"
44
42
 
45
43
  def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
46
44
  left = self._convert_to_date(left, self.left_unit)
@@ -62,7 +60,6 @@ class DateListDiff(PandasOperand, DateDiffMixin):
62
60
  is_binary = True
63
61
  has_symmetry_importance = True
64
62
  aggregation: str
65
- common_type = "date_diff_list"
66
63
 
67
64
  def __init__(self, **data: Any) -> None:
68
65
  if "name" not in data:
@@ -89,9 +86,6 @@ class DateListDiff(PandasOperand, DateDiffMixin):
89
86
 
90
87
  return method(x) if len(x) > 0 else default
91
88
 
92
- def make_multi_operand(self, operands: List[Operand]) -> Optional[MultiOperand]:
93
- return DateListDiffMulti(children=operands, aggregation="")
94
-
95
89
 
96
90
  class DateListDiffBounded(DateListDiff):
97
91
  lower_bound: Optional[int]
@@ -114,12 +108,3 @@ class DateListDiffBounded(DateListDiff):
114
108
  def _agg(self, x):
115
109
  x = x[(x >= (self.lower_bound or -np.inf)) & (x < (self.upper_bound or np.inf))]
116
110
  return super()._agg(x)
117
-
118
-
119
- class DateListDiffMulti(DateListDiff, MultiOperand):
120
- def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
121
- left = self._convert_to_date(left, self.left_unit)
122
- right = right.apply(lambda x: pd.arrays.DatetimeArray(self._convert_to_date(x, self.right_unit)))
123
-
124
- diff = pd.Series(left - right.values).apply(self._diff)
125
- return diff.apply(lambda x: [c._agg(x) for c in self.children])
@@ -1,15 +1,13 @@
1
1
  import hashlib
2
2
  import itertools
3
- import operator
4
3
  from typing import Dict, List, Optional, Tuple, Union
5
4
 
6
- from more_itertools import map_reduce
7
5
  import numpy as np
8
6
  import pandas as pd
9
7
  from pandas._typing import DtypeObj
10
8
 
11
9
  from upgini.autofe.all_operands import find_op
12
- from upgini.autofe.operand import MultiOperand, Operand, PandasOperand
10
+ from upgini.autofe.operand import Operand, PandasOperand
13
11
 
14
12
 
15
13
  class Column:
@@ -31,9 +29,6 @@ class Column:
31
29
  else:
32
30
  return feature_name[2:last_component_idx]
33
31
 
34
- def get_display_name(self, **kwargs):
35
- return self.name
36
-
37
32
  def delete_data(self):
38
33
  self.data = None
39
34
 
@@ -162,8 +157,6 @@ class Feature:
162
157
  else:
163
158
  new_data = new_data.replace([-np.inf, np.inf], np.nan)
164
159
 
165
- new_data = new_data.rename(self.get_display_name())
166
-
167
160
  if is_root:
168
161
  self.data = new_data
169
162
  return new_data
@@ -333,65 +326,3 @@ class FeatureGroup:
333
326
  self.main_column_node.delete_data()
334
327
  for child in self.children:
335
328
  child.delete_data()
336
-
337
-
338
- class OperandGroup:
339
- def __init__(self, operand: MultiOperand, children: List[Union[Column, Feature]]):
340
- self.op = operand
341
- self.children = children
342
- self.data: Optional[pd.DataFrame] = None
343
-
344
- def get_columns(self, **kwargs) -> List[str]:
345
- column_list = []
346
- seen = set()
347
- for child in self.children:
348
- columns = child.get_columns(**kwargs)
349
- column_list.extend([f for f in columns if f not in seen])
350
- seen.update(columns)
351
- return column_list
352
-
353
- def get_display_names(self, **kwargs) -> List[str]:
354
- names = [Feature(op, self.children).get_display_name(**kwargs) for op in self.op.children]
355
- return names
356
-
357
- def calculate(self, data: pd.DataFrame, is_root=False) -> pd.DataFrame:
358
- if isinstance(self.op, PandasOperand):
359
- if self.op.is_vector:
360
- ds = [child.calculate(data) for child in self.children]
361
- new_data = self.op.calculate(data=ds)
362
- else:
363
- d1 = self.children[0].calculate(data)
364
- d2 = None if len(self.children) < 2 else self.children[1].calculate(data)
365
- new_data = self.op.calculate(data=d1, left=d1, right=d2)
366
-
367
- new_data = pd.DataFrame(new_data.values.tolist())
368
- new_data.columns = self.get_display_names()
369
- else:
370
- raise NotImplementedError(f"Unrecognized operator {self.op.name}.")
371
-
372
- if is_root:
373
- self.data = new_data
374
- return new_data
375
-
376
- @staticmethod
377
- def make_groups(candidates: List[Feature]) -> List[Union[Feature, "FeatureGroup"]]:
378
- grouped_features = []
379
-
380
- for _, features in sorted(
381
- map_reduce(
382
- candidates, lambda f: (f.op.common_type or "", ",".join([c.get_display_name() for c in f.children]))
383
- ).items(),
384
- key=operator.itemgetter(0),
385
- ):
386
- feature_list = list(features)
387
- multi_op = feature_list[0].op.make_multi_operand([f.op for f in feature_list])
388
- if multi_op is not None:
389
- grouped_features.append(OperandGroup(multi_op, feature_list[0].children))
390
- else:
391
- grouped_features.extend(feature_list)
392
- return grouped_features
393
-
394
- def delete_data(self):
395
- self.data = None
396
- for child in self.children:
397
- child.delete_data()
@@ -5,9 +5,6 @@ import pandas as pd
5
5
  import numpy as np
6
6
 
7
7
 
8
- MAIN_COLUMN = "main_column"
9
-
10
-
11
8
  class Operand(BaseModel):
12
9
  name: str
13
10
  alias: Optional[str]
@@ -21,7 +18,6 @@ class Operand(BaseModel):
21
18
  is_binary: bool = False
22
19
  is_vector: bool = False
23
20
  is_distribution_dependent: bool = False
24
- common_type: Optional[str] = None
25
21
  params: Optional[Dict[str, str]]
26
22
 
27
23
  def set_params(self, params: Dict[str, str]):
@@ -31,8 +27,8 @@ class Operand(BaseModel):
31
27
  def get_params(self) -> Dict[str, str]:
32
28
  return self.params
33
29
 
34
- def make_multi_operand(self, operands: List["Operand"]) -> Optional["MultiOperand"]:
35
- return None
30
+
31
+ MAIN_COLUMN = "main_column"
36
32
 
37
33
 
38
34
  class PandasOperand(Operand, abc.ABC):
@@ -86,7 +82,3 @@ class VectorizableMixin(Operand):
86
82
  value_columns = [col for col in input_columns if col != group_column]
87
83
 
88
84
  return group_column, value_columns
89
-
90
-
91
- class MultiOperand(Operand):
92
- children: List[Operand]
@@ -22,7 +22,9 @@ from pandas.api.types import (
22
22
  from upgini.errors import ValidationError
23
23
  from upgini.http import ProgressStage, SearchProgress, _RestClient
24
24
  from upgini.metadata import (
25
+ ENTITY_SYSTEM_RECORD_ID,
25
26
  EVAL_SET_INDEX,
27
+ SEARCH_KEY_UNNEST,
26
28
  SYSTEM_COLUMNS,
27
29
  SYSTEM_RECORD_ID,
28
30
  TARGET,
@@ -78,6 +80,7 @@ class Dataset: # (pd.DataFrame):
78
80
  path: Optional[str] = None,
79
81
  meaning_types: Optional[Dict[str, FileColumnMeaningType]] = None,
80
82
  search_keys: Optional[List[Tuple[str, ...]]] = None,
83
+ unnest_search_keys: Optional[List[str]] = None,
81
84
  model_task_type: Optional[ModelTaskType] = None,
82
85
  random_state: Optional[int] = None,
83
86
  rest_client: Optional[_RestClient] = None,
@@ -112,6 +115,7 @@ class Dataset: # (pd.DataFrame):
112
115
  self.description = description
113
116
  self.meaning_types = meaning_types
114
117
  self.search_keys = search_keys
118
+ self.unnest_search_keys = unnest_search_keys
115
119
  self.ignore_columns = []
116
120
  self.hierarchical_group_keys = []
117
121
  self.hierarchical_subgroup_keys = []
@@ -171,7 +175,7 @@ class Dataset: # (pd.DataFrame):
171
175
  new_columns = []
172
176
  dup_counter = 0
173
177
  for column in self.data.columns:
174
- if column in [TARGET, EVAL_SET_INDEX, SYSTEM_RECORD_ID]:
178
+ if column in [TARGET, EVAL_SET_INDEX, SYSTEM_RECORD_ID, ENTITY_SYSTEM_RECORD_ID, SEARCH_KEY_UNNEST]:
175
179
  self.columns_renaming[column] = column
176
180
  new_columns.append(column)
177
181
  continue
@@ -352,7 +356,9 @@ class Dataset: # (pd.DataFrame):
352
356
 
353
357
  if is_string_dtype(self.data[postal_code]):
354
358
  try:
355
- self.data[postal_code] = self.data[postal_code].astype("float64").astype("Int64").astype("string")
359
+ self.data[postal_code] = (
360
+ self.data[postal_code].astype("string").astype("Float64").astype("Int64").astype("string")
361
+ )
356
362
  except Exception:
357
363
  pass
358
364
  elif is_float_dtype(self.data[postal_code]):
@@ -802,6 +808,8 @@ class Dataset: # (pd.DataFrame):
802
808
  meaningType=meaning_type,
803
809
  minMaxValues=min_max_values,
804
810
  )
811
+ if self.unnest_search_keys and column_meta.originalName in self.unnest_search_keys:
812
+ column_meta.isUnnest = True
805
813
 
806
814
  columns.append(column_meta)
807
815