upgini 1.2.32__tar.gz → 1.2.34__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of upgini might be problematic. Click here for more details.

Files changed (67)
  1. {upgini-1.2.32 → upgini-1.2.34}/PKG-INFO +3 -3
  2. {upgini-1.2.32 → upgini-1.2.34}/README.md +1 -1
  3. {upgini-1.2.32 → upgini-1.2.34}/pyproject.toml +1 -1
  4. upgini-1.2.34/src/upgini/__about__.py +1 -0
  5. {upgini-1.2.32 → upgini-1.2.34}/src/upgini/dataset.py +8 -2
  6. {upgini-1.2.32 → upgini-1.2.34}/src/upgini/features_enricher.py +48 -6
  7. {upgini-1.2.32 → upgini-1.2.34}/src/upgini/metadata.py +14 -0
  8. {upgini-1.2.32 → upgini-1.2.34}/src/upgini/utils/ip_utils.py +15 -0
  9. upgini-1.2.32/src/upgini/__about__.py +0 -1
  10. {upgini-1.2.32 → upgini-1.2.34}/.gitignore +0 -0
  11. {upgini-1.2.32 → upgini-1.2.34}/LICENSE +0 -0
  12. {upgini-1.2.32 → upgini-1.2.34}/src/upgini/__init__.py +0 -0
  13. {upgini-1.2.32 → upgini-1.2.34}/src/upgini/ads.py +0 -0
  14. {upgini-1.2.32 → upgini-1.2.34}/src/upgini/ads_management/__init__.py +0 -0
  15. {upgini-1.2.32 → upgini-1.2.34}/src/upgini/ads_management/ads_manager.py +0 -0
  16. {upgini-1.2.32 → upgini-1.2.34}/src/upgini/autofe/__init__.py +0 -0
  17. {upgini-1.2.32 → upgini-1.2.34}/src/upgini/autofe/all_operands.py +0 -0
  18. {upgini-1.2.32 → upgini-1.2.34}/src/upgini/autofe/binary.py +0 -0
  19. {upgini-1.2.32 → upgini-1.2.34}/src/upgini/autofe/date.py +0 -0
  20. {upgini-1.2.32 → upgini-1.2.34}/src/upgini/autofe/feature.py +0 -0
  21. {upgini-1.2.32 → upgini-1.2.34}/src/upgini/autofe/groupby.py +0 -0
  22. {upgini-1.2.32 → upgini-1.2.34}/src/upgini/autofe/operand.py +0 -0
  23. {upgini-1.2.32 → upgini-1.2.34}/src/upgini/autofe/unary.py +0 -0
  24. {upgini-1.2.32 → upgini-1.2.34}/src/upgini/autofe/vector.py +0 -0
  25. {upgini-1.2.32 → upgini-1.2.34}/src/upgini/data_source/__init__.py +0 -0
  26. {upgini-1.2.32 → upgini-1.2.34}/src/upgini/data_source/data_source_publisher.py +0 -0
  27. {upgini-1.2.32 → upgini-1.2.34}/src/upgini/errors.py +0 -0
  28. {upgini-1.2.32 → upgini-1.2.34}/src/upgini/http.py +0 -0
  29. {upgini-1.2.32 → upgini-1.2.34}/src/upgini/lazy_import.py +0 -0
  30. {upgini-1.2.32 → upgini-1.2.34}/src/upgini/mdc/__init__.py +0 -0
  31. {upgini-1.2.32 → upgini-1.2.34}/src/upgini/mdc/context.py +0 -0
  32. {upgini-1.2.32 → upgini-1.2.34}/src/upgini/metrics.py +0 -0
  33. {upgini-1.2.32 → upgini-1.2.34}/src/upgini/normalizer/__init__.py +0 -0
  34. {upgini-1.2.32 → upgini-1.2.34}/src/upgini/normalizer/normalize_utils.py +0 -0
  35. {upgini-1.2.32 → upgini-1.2.34}/src/upgini/resource_bundle/__init__.py +0 -0
  36. {upgini-1.2.32 → upgini-1.2.34}/src/upgini/resource_bundle/exceptions.py +0 -0
  37. {upgini-1.2.32 → upgini-1.2.34}/src/upgini/resource_bundle/strings.properties +0 -0
  38. {upgini-1.2.32 → upgini-1.2.34}/src/upgini/resource_bundle/strings_widget.properties +0 -0
  39. {upgini-1.2.32 → upgini-1.2.34}/src/upgini/sampler/__init__.py +0 -0
  40. {upgini-1.2.32 → upgini-1.2.34}/src/upgini/sampler/base.py +0 -0
  41. {upgini-1.2.32 → upgini-1.2.34}/src/upgini/sampler/random_under_sampler.py +0 -0
  42. {upgini-1.2.32 → upgini-1.2.34}/src/upgini/sampler/utils.py +0 -0
  43. {upgini-1.2.32 → upgini-1.2.34}/src/upgini/search_task.py +0 -0
  44. {upgini-1.2.32 → upgini-1.2.34}/src/upgini/spinner.py +0 -0
  45. {upgini-1.2.32 → upgini-1.2.34}/src/upgini/utils/Roboto-Regular.ttf +0 -0
  46. {upgini-1.2.32 → upgini-1.2.34}/src/upgini/utils/__init__.py +0 -0
  47. {upgini-1.2.32 → upgini-1.2.34}/src/upgini/utils/base_search_key_detector.py +0 -0
  48. {upgini-1.2.32 → upgini-1.2.34}/src/upgini/utils/blocked_time_series.py +0 -0
  49. {upgini-1.2.32 → upgini-1.2.34}/src/upgini/utils/country_utils.py +0 -0
  50. {upgini-1.2.32 → upgini-1.2.34}/src/upgini/utils/custom_loss_utils.py +0 -0
  51. {upgini-1.2.32 → upgini-1.2.34}/src/upgini/utils/cv_utils.py +0 -0
  52. {upgini-1.2.32 → upgini-1.2.34}/src/upgini/utils/datetime_utils.py +0 -0
  53. {upgini-1.2.32 → upgini-1.2.34}/src/upgini/utils/deduplicate_utils.py +0 -0
  54. {upgini-1.2.32 → upgini-1.2.34}/src/upgini/utils/display_utils.py +0 -0
  55. {upgini-1.2.32 → upgini-1.2.34}/src/upgini/utils/email_utils.py +0 -0
  56. {upgini-1.2.32 → upgini-1.2.34}/src/upgini/utils/fallback_progress_bar.py +0 -0
  57. {upgini-1.2.32 → upgini-1.2.34}/src/upgini/utils/feature_info.py +0 -0
  58. {upgini-1.2.32 → upgini-1.2.34}/src/upgini/utils/features_validator.py +0 -0
  59. {upgini-1.2.32 → upgini-1.2.34}/src/upgini/utils/format.py +0 -0
  60. {upgini-1.2.32 → upgini-1.2.34}/src/upgini/utils/phone_utils.py +0 -0
  61. {upgini-1.2.32 → upgini-1.2.34}/src/upgini/utils/postal_code_utils.py +0 -0
  62. {upgini-1.2.32 → upgini-1.2.34}/src/upgini/utils/progress_bar.py +0 -0
  63. {upgini-1.2.32 → upgini-1.2.34}/src/upgini/utils/sklearn_ext.py +0 -0
  64. {upgini-1.2.32 → upgini-1.2.34}/src/upgini/utils/target_utils.py +0 -0
  65. {upgini-1.2.32 → upgini-1.2.34}/src/upgini/utils/track_info.py +0 -0
  66. {upgini-1.2.32 → upgini-1.2.34}/src/upgini/utils/warning_counter.py +0 -0
  67. {upgini-1.2.32 → upgini-1.2.34}/src/upgini/version_validator.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: upgini
3
- Version: 1.2.32
3
+ Version: 1.2.34
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
6
6
  Project-URL: Homepage, https://upgini.com/
@@ -37,7 +37,7 @@ Requires-Dist: python-dateutil>=2.8.0
37
37
  Requires-Dist: python-json-logger>=2.0.2
38
38
  Requires-Dist: requests>=2.8.0
39
39
  Requires-Dist: scikit-learn>=1.3.0
40
- Requires-Dist: xhtml2pdf==0.2.11
40
+ Requires-Dist: xhtml2pdf<0.3.0,>=0.2.11
41
41
  Description-Content-Type: text/markdown
42
42
 
43
43
 
@@ -110,7 +110,7 @@ Description-Content-Type: text/markdown
110
110
  </tr>
111
111
  </table>
112
112
 
113
- ⭐️ [Simple Drag & Drop Search UI](https://upgini.com/upgini-widget):
113
+ ⭐️ [Simple Drag & Drop Search UI](https://www.upgini.com/data-search-widget):
114
114
  <a href="https://upgini.com/upgini-widget">
115
115
  <img width="710" alt="Drag & Drop Search UI" src="https://github.com/upgini/upgini/assets/95645411/36b6460c-51f3-400e-9f04-445b938bf45e">
116
116
  </a>
@@ -68,7 +68,7 @@
68
68
  </tr>
69
69
  </table>
70
70
 
71
- ⭐️ [Simple Drag & Drop Search UI](https://upgini.com/upgini-widget):
71
+ ⭐️ [Simple Drag & Drop Search UI](https://www.upgini.com/data-search-widget):
72
72
  <a href="https://upgini.com/upgini-widget">
73
73
  <img width="710" alt="Drag & Drop Search UI" src="https://github.com/upgini/upgini/assets/95645411/36b6460c-51f3-400e-9f04-445b938bf45e">
74
74
  </a>
@@ -47,7 +47,7 @@ dependencies = [
47
47
  "requests>=2.8.0",
48
48
  "scikit-learn>=1.3.0",
49
49
  "python-bidi==0.4.2",
50
- "xhtml2pdf==0.2.11",
50
+ "xhtml2pdf>=0.2.11,<0.3.0",
51
51
  "jarowinkler>=2.0.0",
52
52
  "levenshtein>=0.25.1",
53
53
  ]
@@ -0,0 +1 @@
1
+ __version__ = "1.2.34"
@@ -422,11 +422,11 @@ class Dataset: # (pd.DataFrame):
422
422
  + "".join("<tr>" + "".join(map(map_color, row[1:])) + "</tr>" for row in df_stats.itertuples())
423
423
  + "</table>"
424
424
  )
425
- print()
426
425
  display(HTML(html_stats))
427
- except (ImportError, NameError):
428
426
  print()
427
+ except (ImportError, NameError):
429
428
  print(df_stats)
429
+ print()
430
430
 
431
431
  if len(self.data) == 0:
432
432
  raise ValidationError(self.bundle.get("all_search_keys_invalid"))
@@ -494,11 +494,17 @@ class Dataset: # (pd.DataFrame):
494
494
  taskType=self.task_type,
495
495
  )
496
496
 
497
+ @staticmethod
498
+ def is_column_binary_type(column):
499
+ return column.apply(lambda x: x is None or isinstance(x, (bytes, bytearray))).all()
500
+
497
501
  def __get_data_type(self, pandas_data_type, column_name: str) -> DataType:
498
502
  if is_integer_dtype(pandas_data_type):
499
503
  return DataType.INT
500
504
  elif is_float_dtype(pandas_data_type):
501
505
  return DataType.DECIMAL
506
+ elif self.is_column_binary_type(self.data[column_name]):
507
+ return DataType.BYTES
502
508
  elif is_string_dtype(pandas_data_type) or is_object_dtype(pandas_data_type):
503
509
  return DataType.STRING
504
510
  else:
@@ -111,7 +111,11 @@ try:
111
111
  except Exception:
112
112
  from upgini.utils.fallback_progress_bar import CustomFallbackProgressBar as ProgressBar
113
113
 
114
- from upgini.utils.target_utils import calculate_psi, define_task
114
+ from upgini.utils.target_utils import (
115
+ balance_undersample_forced,
116
+ calculate_psi,
117
+ define_task,
118
+ )
115
119
  from upgini.utils.warning_counter import WarningCounter
116
120
  from upgini.version_validator import validate_version
117
121
 
@@ -967,6 +971,13 @@ class FeaturesEnricher(TransformerMixin):
967
971
  self.__log_warning(self.bundle.get("metrics_no_important_free_features"))
968
972
  return None
969
973
 
974
+ maybe_phone_column = self._get_phone_column(self.search_keys)
975
+ text_features = (
976
+ [f for f in self.generate_features if f != maybe_phone_column]
977
+ if self.generate_features is not None
978
+ else None
979
+ )
980
+
970
981
  print(self.bundle.get("metrics_start"))
971
982
  with Spinner():
972
983
  self._check_train_and_eval_target_distribution(y_sorted, fitting_eval_set_dict)
@@ -982,7 +993,7 @@ class FeaturesEnricher(TransformerMixin):
982
993
  fitting_enriched_X,
983
994
  scoring,
984
995
  groups=groups,
985
- text_features=self.generate_features,
996
+ text_features=text_features,
986
997
  has_date=has_date,
987
998
  )
988
999
  metric = wrapper.metric_name
@@ -1009,7 +1020,7 @@ class FeaturesEnricher(TransformerMixin):
1009
1020
  cat_features,
1010
1021
  add_params=custom_loss_add_params,
1011
1022
  groups=groups,
1012
- text_features=self.generate_features,
1023
+ text_features=text_features,
1013
1024
  has_date=has_date,
1014
1025
  )
1015
1026
  etalon_cv_result = baseline_estimator.cross_val_predict(
@@ -1044,7 +1055,7 @@ class FeaturesEnricher(TransformerMixin):
1044
1055
  cat_features,
1045
1056
  add_params=custom_loss_add_params,
1046
1057
  groups=groups,
1047
- text_features=self.generate_features,
1058
+ text_features=text_features,
1048
1059
  has_date=has_date,
1049
1060
  )
1050
1061
  enriched_cv_result = enriched_estimator.cross_val_predict(fitting_enriched_X, enriched_y_sorted)
@@ -1827,7 +1838,28 @@ class FeaturesEnricher(TransformerMixin):
1827
1838
 
1828
1839
  # downsample if need to eval_set threshold
1829
1840
  num_samples = _num_samples(df)
1830
- if num_samples > Dataset.FIT_SAMPLE_WITH_EVAL_SET_THRESHOLD:
1841
+ phone_column = self._get_phone_column(self.search_keys)
1842
+ force_downsampling = (
1843
+ not self.disable_force_downsampling
1844
+ and self.generate_features is not None
1845
+ and phone_column is not None
1846
+ and self.fit_columns_renaming is not None
1847
+ and self.fit_columns_renaming.get(phone_column) in self.generate_features
1848
+ and num_samples > Dataset.FORCE_SAMPLE_SIZE
1849
+ )
1850
+ if force_downsampling:
1851
+ self.logger.info(f"Force downsampling from {num_samples} to {Dataset.FORCE_SAMPLE_SIZE}")
1852
+ df = balance_undersample_forced(
1853
+ df=df,
1854
+ target_column=TARGET,
1855
+ task_type=self.model_task_type,
1856
+ random_state=self.random_state,
1857
+ sample_size=Dataset.FORCE_SAMPLE_SIZE,
1858
+ logger=self.logger,
1859
+ bundle=self.bundle,
1860
+ warning_callback=self.__log_warning,
1861
+ )
1862
+ elif num_samples > Dataset.FIT_SAMPLE_WITH_EVAL_SET_THRESHOLD:
1831
1863
  self.logger.info(f"Downsampling from {num_samples} to {Dataset.FIT_SAMPLE_WITH_EVAL_SET_ROWS}")
1832
1864
  df = df.sample(n=Dataset.FIT_SAMPLE_WITH_EVAL_SET_ROWS, random_state=self.random_state)
1833
1865
 
@@ -2063,6 +2095,15 @@ class FeaturesEnricher(TransformerMixin):
2063
2095
  self.__display_support_link(msg)
2064
2096
  return None, {c: c for c in X.columns}, []
2065
2097
 
2098
+ features_meta = self._search_task.get_all_features_metadata_v2()
2099
+ online_api_features = [fm.name for fm in features_meta if fm.from_online_api]
2100
+ if len(online_api_features) > 0:
2101
+ self.logger.warning(
2102
+ f"There are important features for transform, that generated by online API: {online_api_features}"
2103
+ )
2104
+ # TODO
2105
+ raise Exception("There are features selected that are paid. Contact support (sales@upgini.com)")
2106
+
2066
2107
  if not metrics_calculation:
2067
2108
  transform_usage = self.rest_client.get_current_transform_usage(trace_id)
2068
2109
  self.logger.info(f"Current transform usage: {transform_usage}. Transforming {len(X)} rows")
@@ -2708,8 +2749,9 @@ class FeaturesEnricher(TransformerMixin):
2708
2749
  and self.generate_features is not None
2709
2750
  and phone_column is not None
2710
2751
  and self.fit_columns_renaming[phone_column] in self.generate_features
2752
+ and len(df) > Dataset.FORCE_SAMPLE_SIZE
2711
2753
  )
2712
- if force_downsampling and len(df) > Dataset.FORCE_SAMPLE_SIZE:
2754
+ if force_downsampling:
2713
2755
  runtime_parameters.properties["fast_fit"] = True
2714
2756
 
2715
2757
  dataset = Dataset(
@@ -43,6 +43,9 @@ class FileColumnMeaningType(Enum):
43
43
  EVAL_SET_INDEX = "EVAL_SET_INDEX"
44
44
  ENTITY_SYSTEM_RECORD_ID = "ENTITY_SYSTEM_RECORD_ID"
45
45
  UNNEST_KEY = "UNNEST_KEY"
46
+ IP_BINARY = "IP_BINARY"
47
+ IP_RANGE_FROM_BINARY = "IP_RANGE_FROM_BINARY"
48
+ IP_RANGE_TO_BINARY = "IP_RANGE_TO_BINARY"
46
49
 
47
50
 
48
51
  class SearchKey(Enum):
@@ -60,6 +63,9 @@ class SearchKey(Enum):
60
63
  IPV6_ADDRESS = FileColumnMeaningType.IPV6_ADDRESS
61
64
  IPV6_RANGE_FROM = FileColumnMeaningType.IPV6_RANGE_FROM
62
65
  IPV6_RANGE_TO = FileColumnMeaningType.IPV6_RANGE_TO
66
+ IP_BINARY = FileColumnMeaningType.IP_BINARY
67
+ IP_RANGE_FROM_BINARY = FileColumnMeaningType.IP_RANGE_FROM_BINARY
68
+ IP_RANGE_TO_BINARY = FileColumnMeaningType.IP_RANGE_TO_BINARY
63
69
 
64
70
  # For data source registration. Don't use it for FeaturesEnricher
65
71
  EMAIL_ONE_DOMAIN = FileColumnMeaningType.EMAIL_ONE_DOMAIN
@@ -112,6 +118,12 @@ class SearchKey(Enum):
112
118
  return SearchKey.MSISDN_RANGE_FROM
113
119
  if meaning_type == FileColumnMeaningType.MSISDN_RANGE_TO:
114
120
  return SearchKey.MSISDN_RANGE_TO
121
+ if meaning_type == FileColumnMeaningType.IP_BINARY:
122
+ return SearchKey.IP_BINARY
123
+ if meaning_type == FileColumnMeaningType.IP_RANGE_FROM_BINARY:
124
+ return SearchKey.IP_RANGE_FROM_BINARY
125
+ if meaning_type == FileColumnMeaningType.IP_RANGE_TO_BINARY:
126
+ return SearchKey.IP_RANGE_TO_BINARY
115
127
 
116
128
  @staticmethod
117
129
  def find_key(search_keys: Dict[str, SearchKey], keys: Union[SearchKey, List[SearchKey]]) -> Optional[SearchKey]:
@@ -136,6 +148,7 @@ class DataType(Enum):
136
148
  DATE_TIME = "DATE_TIME"
137
149
  STRING = "STRING"
138
150
  BOOLEAN = "BOOLEAN"
151
+ BYTES = "BYTES"
139
152
 
140
153
 
141
154
  class ModelTaskType(Enum):
@@ -255,6 +268,7 @@ class FeaturesMetadataV2(BaseModel):
255
268
  data_source_links: Optional[List[str]] = None
256
269
  doc_link: Optional[str] = None
257
270
  update_frequency: Optional[str] = None
271
+ from_online_api: Optional[bool] = None
258
272
 
259
273
 
260
274
  class HitRateMetrics(BaseModel):
@@ -42,6 +42,16 @@ class IpSearchKeyConverter:
42
42
  except Exception:
43
43
  pass
44
44
 
45
+ @staticmethod
46
+ def _ip_to_binary(ip: Optional[_BaseAddress]) -> Optional[bytes]:
47
+ try:
48
+ if isinstance(ip, IPv6Address) and ip.ipv4_mapped is not None:
49
+ return ip.ipv4_mapped.packed
50
+ else:
51
+ return ip.packed
52
+ except Exception:
53
+ pass
54
+
45
55
  @staticmethod
46
56
  def _ip_to_int_str(ip: Optional[_BaseAddress]) -> Optional[str]:
47
57
  try:
@@ -100,11 +110,16 @@ class IpSearchKeyConverter:
100
110
  .astype("string")
101
111
  # .str.replace(".0", "", regex=False)
102
112
  )
113
+ ip_binary = self.ip_column + "_binary"
114
+ df[ip_binary] = df[self.ip_column].apply(self._ip_to_binary)
115
+
103
116
  df = df.drop(columns=self.ip_column)
104
117
  del self.search_keys[self.ip_column]
105
118
  del self.columns_renaming[self.ip_column]
106
119
  self.search_keys[ipv6] = SearchKey.IPV6_ADDRESS
120
+ self.search_keys[ip_binary] = SearchKey.IP_BINARY
107
121
  self.columns_renaming[ipv6] = original_ip # could be __unnest_ip...
122
+ self.columns_renaming[ip_binary] = original_ip
108
123
 
109
124
  return df
110
125
 
@@ -1 +0,0 @@
1
- __version__ = "1.2.32"
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes