upgini 1.2.34a3657.dev4__tar.gz → 1.2.36__tar.gz

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of upgini might be problematic. Click here for more details.

Files changed (67) hide show
  1. {upgini-1.2.34a3657.dev4 → upgini-1.2.36}/PKG-INFO +2 -2
  2. {upgini-1.2.34a3657.dev4 → upgini-1.2.36}/pyproject.toml +1 -1
  3. upgini-1.2.36/src/upgini/__about__.py +1 -0
  4. {upgini-1.2.34a3657.dev4 → upgini-1.2.36}/src/upgini/dataset.py +8 -2
  5. {upgini-1.2.34a3657.dev4 → upgini-1.2.36}/src/upgini/features_enricher.py +7 -6
  6. {upgini-1.2.34a3657.dev4 → upgini-1.2.36}/src/upgini/metadata.py +13 -0
  7. {upgini-1.2.34a3657.dev4 → upgini-1.2.36}/src/upgini/utils/ip_utils.py +15 -0
  8. upgini-1.2.34a3657.dev4/src/upgini/__about__.py +0 -1
  9. {upgini-1.2.34a3657.dev4 → upgini-1.2.36}/.gitignore +0 -0
  10. {upgini-1.2.34a3657.dev4 → upgini-1.2.36}/LICENSE +0 -0
  11. {upgini-1.2.34a3657.dev4 → upgini-1.2.36}/README.md +0 -0
  12. {upgini-1.2.34a3657.dev4 → upgini-1.2.36}/src/upgini/__init__.py +0 -0
  13. {upgini-1.2.34a3657.dev4 → upgini-1.2.36}/src/upgini/ads.py +0 -0
  14. {upgini-1.2.34a3657.dev4 → upgini-1.2.36}/src/upgini/ads_management/__init__.py +0 -0
  15. {upgini-1.2.34a3657.dev4 → upgini-1.2.36}/src/upgini/ads_management/ads_manager.py +0 -0
  16. {upgini-1.2.34a3657.dev4 → upgini-1.2.36}/src/upgini/autofe/__init__.py +0 -0
  17. {upgini-1.2.34a3657.dev4 → upgini-1.2.36}/src/upgini/autofe/all_operands.py +0 -0
  18. {upgini-1.2.34a3657.dev4 → upgini-1.2.36}/src/upgini/autofe/binary.py +0 -0
  19. {upgini-1.2.34a3657.dev4 → upgini-1.2.36}/src/upgini/autofe/date.py +0 -0
  20. {upgini-1.2.34a3657.dev4 → upgini-1.2.36}/src/upgini/autofe/feature.py +0 -0
  21. {upgini-1.2.34a3657.dev4 → upgini-1.2.36}/src/upgini/autofe/groupby.py +0 -0
  22. {upgini-1.2.34a3657.dev4 → upgini-1.2.36}/src/upgini/autofe/operand.py +0 -0
  23. {upgini-1.2.34a3657.dev4 → upgini-1.2.36}/src/upgini/autofe/unary.py +0 -0
  24. {upgini-1.2.34a3657.dev4 → upgini-1.2.36}/src/upgini/autofe/vector.py +0 -0
  25. {upgini-1.2.34a3657.dev4 → upgini-1.2.36}/src/upgini/data_source/__init__.py +0 -0
  26. {upgini-1.2.34a3657.dev4 → upgini-1.2.36}/src/upgini/data_source/data_source_publisher.py +0 -0
  27. {upgini-1.2.34a3657.dev4 → upgini-1.2.36}/src/upgini/errors.py +0 -0
  28. {upgini-1.2.34a3657.dev4 → upgini-1.2.36}/src/upgini/http.py +0 -0
  29. {upgini-1.2.34a3657.dev4 → upgini-1.2.36}/src/upgini/lazy_import.py +0 -0
  30. {upgini-1.2.34a3657.dev4 → upgini-1.2.36}/src/upgini/mdc/__init__.py +0 -0
  31. {upgini-1.2.34a3657.dev4 → upgini-1.2.36}/src/upgini/mdc/context.py +0 -0
  32. {upgini-1.2.34a3657.dev4 → upgini-1.2.36}/src/upgini/metrics.py +0 -0
  33. {upgini-1.2.34a3657.dev4 → upgini-1.2.36}/src/upgini/normalizer/__init__.py +0 -0
  34. {upgini-1.2.34a3657.dev4 → upgini-1.2.36}/src/upgini/normalizer/normalize_utils.py +0 -0
  35. {upgini-1.2.34a3657.dev4 → upgini-1.2.36}/src/upgini/resource_bundle/__init__.py +0 -0
  36. {upgini-1.2.34a3657.dev4 → upgini-1.2.36}/src/upgini/resource_bundle/exceptions.py +0 -0
  37. {upgini-1.2.34a3657.dev4 → upgini-1.2.36}/src/upgini/resource_bundle/strings.properties +0 -0
  38. {upgini-1.2.34a3657.dev4 → upgini-1.2.36}/src/upgini/resource_bundle/strings_widget.properties +0 -0
  39. {upgini-1.2.34a3657.dev4 → upgini-1.2.36}/src/upgini/sampler/__init__.py +0 -0
  40. {upgini-1.2.34a3657.dev4 → upgini-1.2.36}/src/upgini/sampler/base.py +0 -0
  41. {upgini-1.2.34a3657.dev4 → upgini-1.2.36}/src/upgini/sampler/random_under_sampler.py +0 -0
  42. {upgini-1.2.34a3657.dev4 → upgini-1.2.36}/src/upgini/sampler/utils.py +0 -0
  43. {upgini-1.2.34a3657.dev4 → upgini-1.2.36}/src/upgini/search_task.py +0 -0
  44. {upgini-1.2.34a3657.dev4 → upgini-1.2.36}/src/upgini/spinner.py +0 -0
  45. {upgini-1.2.34a3657.dev4 → upgini-1.2.36}/src/upgini/utils/Roboto-Regular.ttf +0 -0
  46. {upgini-1.2.34a3657.dev4 → upgini-1.2.36}/src/upgini/utils/__init__.py +0 -0
  47. {upgini-1.2.34a3657.dev4 → upgini-1.2.36}/src/upgini/utils/base_search_key_detector.py +0 -0
  48. {upgini-1.2.34a3657.dev4 → upgini-1.2.36}/src/upgini/utils/blocked_time_series.py +0 -0
  49. {upgini-1.2.34a3657.dev4 → upgini-1.2.36}/src/upgini/utils/country_utils.py +0 -0
  50. {upgini-1.2.34a3657.dev4 → upgini-1.2.36}/src/upgini/utils/custom_loss_utils.py +0 -0
  51. {upgini-1.2.34a3657.dev4 → upgini-1.2.36}/src/upgini/utils/cv_utils.py +0 -0
  52. {upgini-1.2.34a3657.dev4 → upgini-1.2.36}/src/upgini/utils/datetime_utils.py +0 -0
  53. {upgini-1.2.34a3657.dev4 → upgini-1.2.36}/src/upgini/utils/deduplicate_utils.py +0 -0
  54. {upgini-1.2.34a3657.dev4 → upgini-1.2.36}/src/upgini/utils/display_utils.py +0 -0
  55. {upgini-1.2.34a3657.dev4 → upgini-1.2.36}/src/upgini/utils/email_utils.py +0 -0
  56. {upgini-1.2.34a3657.dev4 → upgini-1.2.36}/src/upgini/utils/fallback_progress_bar.py +0 -0
  57. {upgini-1.2.34a3657.dev4 → upgini-1.2.36}/src/upgini/utils/feature_info.py +0 -0
  58. {upgini-1.2.34a3657.dev4 → upgini-1.2.36}/src/upgini/utils/features_validator.py +0 -0
  59. {upgini-1.2.34a3657.dev4 → upgini-1.2.36}/src/upgini/utils/format.py +0 -0
  60. {upgini-1.2.34a3657.dev4 → upgini-1.2.36}/src/upgini/utils/phone_utils.py +0 -0
  61. {upgini-1.2.34a3657.dev4 → upgini-1.2.36}/src/upgini/utils/postal_code_utils.py +0 -0
  62. {upgini-1.2.34a3657.dev4 → upgini-1.2.36}/src/upgini/utils/progress_bar.py +0 -0
  63. {upgini-1.2.34a3657.dev4 → upgini-1.2.36}/src/upgini/utils/sklearn_ext.py +0 -0
  64. {upgini-1.2.34a3657.dev4 → upgini-1.2.36}/src/upgini/utils/target_utils.py +0 -0
  65. {upgini-1.2.34a3657.dev4 → upgini-1.2.36}/src/upgini/utils/track_info.py +0 -0
  66. {upgini-1.2.34a3657.dev4 → upgini-1.2.36}/src/upgini/utils/warning_counter.py +0 -0
  67. {upgini-1.2.34a3657.dev4 → upgini-1.2.36}/src/upgini/version_validator.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: upgini
3
- Version: 1.2.34a3657.dev4
3
+ Version: 1.2.36
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
6
6
  Project-URL: Homepage, https://upgini.com/
@@ -37,7 +37,7 @@ Requires-Dist: python-dateutil>=2.8.0
37
37
  Requires-Dist: python-json-logger>=2.0.2
38
38
  Requires-Dist: requests>=2.8.0
39
39
  Requires-Dist: scikit-learn>=1.3.0
40
- Requires-Dist: xhtml2pdf==0.2.11
40
+ Requires-Dist: xhtml2pdf<0.3.0,>=0.2.11
41
41
  Description-Content-Type: text/markdown
42
42
 
43
43
 
@@ -47,7 +47,7 @@ dependencies = [
47
47
  "requests>=2.8.0",
48
48
  "scikit-learn>=1.3.0",
49
49
  "python-bidi==0.4.2",
50
- "xhtml2pdf==0.2.11",
50
+ "xhtml2pdf>=0.2.11,<0.3.0",
51
51
  "jarowinkler>=2.0.0",
52
52
  "levenshtein>=0.25.1",
53
53
  ]
@@ -0,0 +1 @@
1
+ __version__ = "1.2.36"
@@ -422,11 +422,11 @@ class Dataset: # (pd.DataFrame):
422
422
  + "".join("<tr>" + "".join(map(map_color, row[1:])) + "</tr>" for row in df_stats.itertuples())
423
423
  + "</table>"
424
424
  )
425
- print()
426
425
  display(HTML(html_stats))
427
- except (ImportError, NameError):
428
426
  print()
427
+ except (ImportError, NameError):
429
428
  print(df_stats)
429
+ print()
430
430
 
431
431
  if len(self.data) == 0:
432
432
  raise ValidationError(self.bundle.get("all_search_keys_invalid"))
@@ -494,11 +494,17 @@ class Dataset: # (pd.DataFrame):
494
494
  taskType=self.task_type,
495
495
  )
496
496
 
497
+ @staticmethod
498
+ def is_column_binary_type(column):
499
+ return column.apply(lambda x: x is None or isinstance(x, (bytes, bytearray))).all()
500
+
497
501
  def __get_data_type(self, pandas_data_type, column_name: str) -> DataType:
498
502
  if is_integer_dtype(pandas_data_type):
499
503
  return DataType.INT
500
504
  elif is_float_dtype(pandas_data_type):
501
505
  return DataType.DECIMAL
506
+ elif self.is_column_binary_type(self.data[column_name]):
507
+ return DataType.BYTES
502
508
  elif is_string_dtype(pandas_data_type) or is_object_dtype(pandas_data_type):
503
509
  return DataType.STRING
504
510
  else:
@@ -1844,7 +1844,8 @@ class FeaturesEnricher(TransformerMixin):
1844
1844
  not self.disable_force_downsampling
1845
1845
  and self.generate_features is not None
1846
1846
  and phone_column is not None
1847
- and self.fit_columns_renaming[phone_column] in self.generate_features
1847
+ and self.fit_columns_renaming is not None
1848
+ and self.fit_columns_renaming.get(phone_column) in self.generate_features
1848
1849
  and num_samples > Dataset.FORCE_SAMPLE_SIZE
1849
1850
  )
1850
1851
  if force_downsampling:
@@ -2868,7 +2869,7 @@ class FeaturesEnricher(TransformerMixin):
2868
2869
  df=autofe_description,
2869
2870
  internal_df=autofe_description,
2870
2871
  header=self.bundle.get("autofe_descriptions_header"),
2871
- display_id="autofe_descriptions",
2872
+ display_id=f"autofe_descriptions_{uuid.uuid4()}",
2872
2873
  )
2873
2874
 
2874
2875
  if self._has_paid_features(exclude_features_sources):
@@ -2909,10 +2910,10 @@ class FeaturesEnricher(TransformerMixin):
2909
2910
  progress_callback,
2910
2911
  )
2911
2912
  except Exception:
2912
- self.report_button_handle = self.__show_report_button(display_id="report_button")
2913
+ self.report_button_handle = self.__show_report_button(display_id=f"report_button_{uuid.uuid4()}")
2913
2914
  raise
2914
2915
 
2915
- self.report_button_handle = self.__show_report_button(display_id="report_button")
2916
+ self.report_button_handle = self.__show_report_button(display_id=f"report_button_{uuid.uuid4()}")
2916
2917
 
2917
2918
  if not self.warning_counter.has_warnings():
2918
2919
  self.__display_support_link(self.bundle.get("all_ok_community_invite"))
@@ -3929,14 +3930,14 @@ class FeaturesEnricher(TransformerMixin):
3929
3930
  self.features_info,
3930
3931
  self._features_info_without_links,
3931
3932
  self.bundle.get("relevant_features_header"),
3932
- display_id="features_info",
3933
+ display_id=f"features_info_{uuid.uuid4()}",
3933
3934
  )
3934
3935
 
3935
3936
  self.data_sources_display_handle = display_html_dataframe(
3936
3937
  self.relevant_data_sources,
3937
3938
  self._relevant_data_sources_wo_links,
3938
3939
  self.bundle.get("relevant_data_sources_header"),
3939
- display_id="data_sources",
3940
+ display_id=f"data_sources_{uuid.uuid4()}",
3940
3941
  )
3941
3942
  else:
3942
3943
  msg = self.bundle.get("features_info_zero_important_features")
@@ -43,6 +43,9 @@ class FileColumnMeaningType(Enum):
43
43
  EVAL_SET_INDEX = "EVAL_SET_INDEX"
44
44
  ENTITY_SYSTEM_RECORD_ID = "ENTITY_SYSTEM_RECORD_ID"
45
45
  UNNEST_KEY = "UNNEST_KEY"
46
+ IP_BINARY = "IP_BINARY"
47
+ IP_RANGE_FROM_BINARY = "IP_RANGE_FROM_BINARY"
48
+ IP_RANGE_TO_BINARY = "IP_RANGE_TO_BINARY"
46
49
 
47
50
 
48
51
  class SearchKey(Enum):
@@ -60,6 +63,9 @@ class SearchKey(Enum):
60
63
  IPV6_ADDRESS = FileColumnMeaningType.IPV6_ADDRESS
61
64
  IPV6_RANGE_FROM = FileColumnMeaningType.IPV6_RANGE_FROM
62
65
  IPV6_RANGE_TO = FileColumnMeaningType.IPV6_RANGE_TO
66
+ IP_BINARY = FileColumnMeaningType.IP_BINARY
67
+ IP_RANGE_FROM_BINARY = FileColumnMeaningType.IP_RANGE_FROM_BINARY
68
+ IP_RANGE_TO_BINARY = FileColumnMeaningType.IP_RANGE_TO_BINARY
63
69
 
64
70
  # For data source registration. Don't use it for FeaturesEnricher
65
71
  EMAIL_ONE_DOMAIN = FileColumnMeaningType.EMAIL_ONE_DOMAIN
@@ -112,6 +118,12 @@ class SearchKey(Enum):
112
118
  return SearchKey.MSISDN_RANGE_FROM
113
119
  if meaning_type == FileColumnMeaningType.MSISDN_RANGE_TO:
114
120
  return SearchKey.MSISDN_RANGE_TO
121
+ if meaning_type == FileColumnMeaningType.IP_BINARY:
122
+ return SearchKey.IP_BINARY
123
+ if meaning_type == FileColumnMeaningType.IP_RANGE_FROM_BINARY:
124
+ return SearchKey.IP_RANGE_FROM_BINARY
125
+ if meaning_type == FileColumnMeaningType.IP_RANGE_TO_BINARY:
126
+ return SearchKey.IP_RANGE_TO_BINARY
115
127
 
116
128
  @staticmethod
117
129
  def find_key(search_keys: Dict[str, SearchKey], keys: Union[SearchKey, List[SearchKey]]) -> Optional[SearchKey]:
@@ -136,6 +148,7 @@ class DataType(Enum):
136
148
  DATE_TIME = "DATE_TIME"
137
149
  STRING = "STRING"
138
150
  BOOLEAN = "BOOLEAN"
151
+ BYTES = "BYTES"
139
152
 
140
153
 
141
154
  class ModelTaskType(Enum):
@@ -42,6 +42,16 @@ class IpSearchKeyConverter:
42
42
  except Exception:
43
43
  pass
44
44
 
45
+ @staticmethod
46
+ def _ip_to_binary(ip: Optional[_BaseAddress]) -> Optional[bytes]:
47
+ try:
48
+ if isinstance(ip, IPv6Address) and ip.ipv4_mapped is not None:
49
+ return ip.ipv4_mapped.packed
50
+ else:
51
+ return ip.packed
52
+ except Exception:
53
+ pass
54
+
45
55
  @staticmethod
46
56
  def _ip_to_int_str(ip: Optional[_BaseAddress]) -> Optional[str]:
47
57
  try:
@@ -100,11 +110,16 @@ class IpSearchKeyConverter:
100
110
  .astype("string")
101
111
  # .str.replace(".0", "", regex=False)
102
112
  )
113
+ ip_binary = self.ip_column + "_binary"
114
+ df[ip_binary] = df[self.ip_column].apply(self._ip_to_binary)
115
+
103
116
  df = df.drop(columns=self.ip_column)
104
117
  del self.search_keys[self.ip_column]
105
118
  del self.columns_renaming[self.ip_column]
106
119
  self.search_keys[ipv6] = SearchKey.IPV6_ADDRESS
120
+ self.search_keys[ip_binary] = SearchKey.IP_BINARY
107
121
  self.columns_renaming[ipv6] = original_ip # could be __unnest_ip...
122
+ self.columns_renaming[ip_binary] = original_ip
108
123
 
109
124
  return df
110
125
 
@@ -1 +0,0 @@
1
- __version__ = "1.2.34a3657.dev4"
File without changes
File without changes
File without changes