upgini 1.2.34a3657.dev4__py3-none-any.whl → 1.2.36__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- upgini/__about__.py +1 -1
- upgini/dataset.py +8 -2
- upgini/features_enricher.py +7 -6
- upgini/metadata.py +13 -0
- upgini/utils/ip_utils.py +15 -0
- {upgini-1.2.34a3657.dev4.dist-info → upgini-1.2.36.dist-info}/METADATA +2 -2
- {upgini-1.2.34a3657.dev4.dist-info → upgini-1.2.36.dist-info}/RECORD +9 -9
- {upgini-1.2.34a3657.dev4.dist-info → upgini-1.2.36.dist-info}/WHEEL +1 -1
- {upgini-1.2.34a3657.dev4.dist-info → upgini-1.2.36.dist-info}/licenses/LICENSE +0 -0
upgini/__about__.py
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__ = "1.2.34a3657.dev4"
|
|
1
|
+
__version__ = "1.2.36"
|
upgini/dataset.py
CHANGED
|
@@ -422,11 +422,11 @@ class Dataset: # (pd.DataFrame):
|
|
|
422
422
|
+ "".join("<tr>" + "".join(map(map_color, row[1:])) + "</tr>" for row in df_stats.itertuples())
|
|
423
423
|
+ "</table>"
|
|
424
424
|
)
|
|
425
|
-
print()
|
|
426
425
|
display(HTML(html_stats))
|
|
427
|
-
except (ImportError, NameError):
|
|
428
426
|
print()
|
|
427
|
+
except (ImportError, NameError):
|
|
429
428
|
print(df_stats)
|
|
429
|
+
print()
|
|
430
430
|
|
|
431
431
|
if len(self.data) == 0:
|
|
432
432
|
raise ValidationError(self.bundle.get("all_search_keys_invalid"))
|
|
@@ -494,11 +494,17 @@ class Dataset: # (pd.DataFrame):
|
|
|
494
494
|
taskType=self.task_type,
|
|
495
495
|
)
|
|
496
496
|
|
|
497
|
+
@staticmethod
|
|
498
|
+
def is_column_binary_type(column):
|
|
499
|
+
return column.apply(lambda x: x is None or isinstance(x, (bytes, bytearray))).all()
|
|
500
|
+
|
|
497
501
|
def __get_data_type(self, pandas_data_type, column_name: str) -> DataType:
|
|
498
502
|
if is_integer_dtype(pandas_data_type):
|
|
499
503
|
return DataType.INT
|
|
500
504
|
elif is_float_dtype(pandas_data_type):
|
|
501
505
|
return DataType.DECIMAL
|
|
506
|
+
elif self.is_column_binary_type(self.data[column_name]):
|
|
507
|
+
return DataType.BYTES
|
|
502
508
|
elif is_string_dtype(pandas_data_type) or is_object_dtype(pandas_data_type):
|
|
503
509
|
return DataType.STRING
|
|
504
510
|
else:
|
upgini/features_enricher.py
CHANGED
|
@@ -1844,7 +1844,8 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1844
1844
|
not self.disable_force_downsampling
|
|
1845
1845
|
and self.generate_features is not None
|
|
1846
1846
|
and phone_column is not None
|
|
1847
|
-
and self.fit_columns_renaming
|
|
1847
|
+
and self.fit_columns_renaming is not None
|
|
1848
|
+
and self.fit_columns_renaming.get(phone_column) in self.generate_features
|
|
1848
1849
|
and num_samples > Dataset.FORCE_SAMPLE_SIZE
|
|
1849
1850
|
)
|
|
1850
1851
|
if force_downsampling:
|
|
@@ -2868,7 +2869,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2868
2869
|
df=autofe_description,
|
|
2869
2870
|
internal_df=autofe_description,
|
|
2870
2871
|
header=self.bundle.get("autofe_descriptions_header"),
|
|
2871
|
-
display_id="
|
|
2872
|
+
display_id=f"autofe_descriptions_{uuid.uuid4()}",
|
|
2872
2873
|
)
|
|
2873
2874
|
|
|
2874
2875
|
if self._has_paid_features(exclude_features_sources):
|
|
@@ -2909,10 +2910,10 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2909
2910
|
progress_callback,
|
|
2910
2911
|
)
|
|
2911
2912
|
except Exception:
|
|
2912
|
-
self.report_button_handle = self.__show_report_button(display_id="
|
|
2913
|
+
self.report_button_handle = self.__show_report_button(display_id=f"report_button_{uuid.uuid4()}")
|
|
2913
2914
|
raise
|
|
2914
2915
|
|
|
2915
|
-
self.report_button_handle = self.__show_report_button(display_id="
|
|
2916
|
+
self.report_button_handle = self.__show_report_button(display_id=f"report_button_{uuid.uuid4()}")
|
|
2916
2917
|
|
|
2917
2918
|
if not self.warning_counter.has_warnings():
|
|
2918
2919
|
self.__display_support_link(self.bundle.get("all_ok_community_invite"))
|
|
@@ -3929,14 +3930,14 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3929
3930
|
self.features_info,
|
|
3930
3931
|
self._features_info_without_links,
|
|
3931
3932
|
self.bundle.get("relevant_features_header"),
|
|
3932
|
-
display_id="
|
|
3933
|
+
display_id=f"features_info_{uuid.uuid4()}",
|
|
3933
3934
|
)
|
|
3934
3935
|
|
|
3935
3936
|
self.data_sources_display_handle = display_html_dataframe(
|
|
3936
3937
|
self.relevant_data_sources,
|
|
3937
3938
|
self._relevant_data_sources_wo_links,
|
|
3938
3939
|
self.bundle.get("relevant_data_sources_header"),
|
|
3939
|
-
display_id="
|
|
3940
|
+
display_id=f"data_sources_{uuid.uuid4()}",
|
|
3940
3941
|
)
|
|
3941
3942
|
else:
|
|
3942
3943
|
msg = self.bundle.get("features_info_zero_important_features")
|
upgini/metadata.py
CHANGED
|
@@ -43,6 +43,9 @@ class FileColumnMeaningType(Enum):
|
|
|
43
43
|
EVAL_SET_INDEX = "EVAL_SET_INDEX"
|
|
44
44
|
ENTITY_SYSTEM_RECORD_ID = "ENTITY_SYSTEM_RECORD_ID"
|
|
45
45
|
UNNEST_KEY = "UNNEST_KEY"
|
|
46
|
+
IP_BINARY = "IP_BINARY"
|
|
47
|
+
IP_RANGE_FROM_BINARY = "IP_RANGE_FROM_BINARY"
|
|
48
|
+
IP_RANGE_TO_BINARY = "IP_RANGE_TO_BINARY"
|
|
46
49
|
|
|
47
50
|
|
|
48
51
|
class SearchKey(Enum):
|
|
@@ -60,6 +63,9 @@ class SearchKey(Enum):
|
|
|
60
63
|
IPV6_ADDRESS = FileColumnMeaningType.IPV6_ADDRESS
|
|
61
64
|
IPV6_RANGE_FROM = FileColumnMeaningType.IPV6_RANGE_FROM
|
|
62
65
|
IPV6_RANGE_TO = FileColumnMeaningType.IPV6_RANGE_TO
|
|
66
|
+
IP_BINARY = FileColumnMeaningType.IP_BINARY
|
|
67
|
+
IP_RANGE_FROM_BINARY = FileColumnMeaningType.IP_RANGE_FROM_BINARY
|
|
68
|
+
IP_RANGE_TO_BINARY = FileColumnMeaningType.IP_RANGE_TO_BINARY
|
|
63
69
|
|
|
64
70
|
# For data source registration. Don't use it for FeaturesEnricher
|
|
65
71
|
EMAIL_ONE_DOMAIN = FileColumnMeaningType.EMAIL_ONE_DOMAIN
|
|
@@ -112,6 +118,12 @@ class SearchKey(Enum):
|
|
|
112
118
|
return SearchKey.MSISDN_RANGE_FROM
|
|
113
119
|
if meaning_type == FileColumnMeaningType.MSISDN_RANGE_TO:
|
|
114
120
|
return SearchKey.MSISDN_RANGE_TO
|
|
121
|
+
if meaning_type == FileColumnMeaningType.IP_BINARY:
|
|
122
|
+
return SearchKey.IP_BINARY
|
|
123
|
+
if meaning_type == FileColumnMeaningType.IP_RANGE_FROM_BINARY:
|
|
124
|
+
return SearchKey.IP_RANGE_FROM_BINARY
|
|
125
|
+
if meaning_type == FileColumnMeaningType.IP_RANGE_TO_BINARY:
|
|
126
|
+
return SearchKey.IP_RANGE_TO_BINARY
|
|
115
127
|
|
|
116
128
|
@staticmethod
|
|
117
129
|
def find_key(search_keys: Dict[str, SearchKey], keys: Union[SearchKey, List[SearchKey]]) -> Optional[SearchKey]:
|
|
@@ -136,6 +148,7 @@ class DataType(Enum):
|
|
|
136
148
|
DATE_TIME = "DATE_TIME"
|
|
137
149
|
STRING = "STRING"
|
|
138
150
|
BOOLEAN = "BOOLEAN"
|
|
151
|
+
BYTES = "BYTES"
|
|
139
152
|
|
|
140
153
|
|
|
141
154
|
class ModelTaskType(Enum):
|
upgini/utils/ip_utils.py
CHANGED
|
@@ -42,6 +42,16 @@ class IpSearchKeyConverter:
|
|
|
42
42
|
except Exception:
|
|
43
43
|
pass
|
|
44
44
|
|
|
45
|
+
@staticmethod
|
|
46
|
+
def _ip_to_binary(ip: Optional[_BaseAddress]) -> Optional[bytes]:
|
|
47
|
+
try:
|
|
48
|
+
if isinstance(ip, IPv6Address) and ip.ipv4_mapped is not None:
|
|
49
|
+
return ip.ipv4_mapped.packed
|
|
50
|
+
else:
|
|
51
|
+
return ip.packed
|
|
52
|
+
except Exception:
|
|
53
|
+
pass
|
|
54
|
+
|
|
45
55
|
@staticmethod
|
|
46
56
|
def _ip_to_int_str(ip: Optional[_BaseAddress]) -> Optional[str]:
|
|
47
57
|
try:
|
|
@@ -100,11 +110,16 @@ class IpSearchKeyConverter:
|
|
|
100
110
|
.astype("string")
|
|
101
111
|
# .str.replace(".0", "", regex=False)
|
|
102
112
|
)
|
|
113
|
+
ip_binary = self.ip_column + "_binary"
|
|
114
|
+
df[ip_binary] = df[self.ip_column].apply(self._ip_to_binary)
|
|
115
|
+
|
|
103
116
|
df = df.drop(columns=self.ip_column)
|
|
104
117
|
del self.search_keys[self.ip_column]
|
|
105
118
|
del self.columns_renaming[self.ip_column]
|
|
106
119
|
self.search_keys[ipv6] = SearchKey.IPV6_ADDRESS
|
|
120
|
+
self.search_keys[ip_binary] = SearchKey.IP_BINARY
|
|
107
121
|
self.columns_renaming[ipv6] = original_ip # could be __unnest_ip...
|
|
122
|
+
self.columns_renaming[ip_binary] = original_ip
|
|
108
123
|
|
|
109
124
|
return df
|
|
110
125
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: upgini
|
|
3
|
-
Version: 1.2.34a3657.dev4
|
|
3
|
+
Version: 1.2.36
|
|
4
4
|
Summary: Intelligent data search & enrichment for Machine Learning
|
|
5
5
|
Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
|
|
6
6
|
Project-URL: Homepage, https://upgini.com/
|
|
@@ -37,7 +37,7 @@ Requires-Dist: python-dateutil>=2.8.0
|
|
|
37
37
|
Requires-Dist: python-json-logger>=2.0.2
|
|
38
38
|
Requires-Dist: requests>=2.8.0
|
|
39
39
|
Requires-Dist: scikit-learn>=1.3.0
|
|
40
|
-
Requires-Dist: xhtml2pdf
|
|
40
|
+
Requires-Dist: xhtml2pdf<0.3.0,>=0.2.11
|
|
41
41
|
Description-Content-Type: text/markdown
|
|
42
42
|
|
|
43
43
|
|
|
@@ -1,12 +1,12 @@
|
|
|
1
|
-
upgini/__about__.py,sha256=
|
|
1
|
+
upgini/__about__.py,sha256=O-hCuvuPdLrMJex2Xy_kgI2WTMCu-UN2gSRGQ4sN2x4,23
|
|
2
2
|
upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
|
|
3
3
|
upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
|
|
4
|
-
upgini/dataset.py,sha256=
|
|
4
|
+
upgini/dataset.py,sha256=rUBE7_G7CLaaHAviFEyVPqjVSsX1DaLmi1dGFQR-eEo,32279
|
|
5
5
|
upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
|
|
6
|
-
upgini/features_enricher.py,sha256=
|
|
6
|
+
upgini/features_enricher.py,sha256=uSv54b42GrbEysDAkfbj1w2oKBroii3pY-O9LfpvzG4,195113
|
|
7
7
|
upgini/http.py,sha256=plZGTGoi1h2edd8Cnjt4eYB8t4NbBGnZz7DtPTByiNc,42885
|
|
8
8
|
upgini/lazy_import.py,sha256=74gQ8JuA48BGRLxAo7lNHNKY2D2emMxrUxKGdxVGhuY,1012
|
|
9
|
-
upgini/metadata.py,sha256=
|
|
9
|
+
upgini/metadata.py,sha256=sB5uU-fdz_dA6g-PO6A8FzwIfDbkcFOewcpNs2xZzoY,11943
|
|
10
10
|
upgini/metrics.py,sha256=hr7UwLphbZ_FEglLuO2lzr_pFgxOJ4c3WBeg7H-fNqY,35521
|
|
11
11
|
upgini/search_task.py,sha256=qxUxAD-bed-FpZYmTB_4orW7YJsW_O6a1TcgnZIRFr4,17307
|
|
12
12
|
upgini/spinner.py,sha256=4iMd-eIe_BnkqFEMIliULTbj6rNI2HkN_VJ4qYe0cUc,1118
|
|
@@ -51,7 +51,7 @@ upgini/utils/fallback_progress_bar.py,sha256=PDaKb8dYpVZaWMroNcOHsTc3pSjgi9mOm0-
|
|
|
51
51
|
upgini/utils/feature_info.py,sha256=Tp_2g5-rCjY4NpzKhzxwNxuqH5FFL8vG94OU5kH6wzk,6702
|
|
52
52
|
upgini/utils/features_validator.py,sha256=lEfmk4DoxZ4ooOE1HC0ZXtUb_lFKRFHIrnFULZ4_rL8,3746
|
|
53
53
|
upgini/utils/format.py,sha256=Yv5cvvSs2bOLUzzNu96Pu33VMDNbabio92QepUj41jU,243
|
|
54
|
-
upgini/utils/ip_utils.py,sha256=
|
|
54
|
+
upgini/utils/ip_utils.py,sha256=n_ZY2PPVsby6Iq3N_uZsBMWjD2i5cY8WnoEnGcgpYH4,5717
|
|
55
55
|
upgini/utils/phone_utils.py,sha256=IrbztLuOJBiePqqxllfABWfYlfAjYevPhXKipl95wUI,10432
|
|
56
56
|
upgini/utils/postal_code_utils.py,sha256=5M0sUqH2DAr33kARWCTXR-ACyzWbjDq_-0mmEml6ZcU,1716
|
|
57
57
|
upgini/utils/progress_bar.py,sha256=N-Sfdah2Hg8lXP_fV9EfUTXz_PyRt4lo9fAHoUDOoLc,1550
|
|
@@ -59,7 +59,7 @@ upgini/utils/sklearn_ext.py,sha256=13jQS_k7v0aUtudXV6nGUEWjttPQzAW9AFYL5wgEz9k,4
|
|
|
59
59
|
upgini/utils/target_utils.py,sha256=Ed5IXkPjV9AfAZQAwCYksAmKaPGQliplvDYS_yeWdfk,11330
|
|
60
60
|
upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
|
|
61
61
|
upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
|
|
62
|
-
upgini-1.2.
|
|
63
|
-
upgini-1.2.
|
|
64
|
-
upgini-1.2.
|
|
65
|
-
upgini-1.2.
|
|
62
|
+
upgini-1.2.36.dist-info/METADATA,sha256=zOfiwYvpA-hrD88zJ6_L2PVwF4D_rfhvql90zpc3p7M,48594
|
|
63
|
+
upgini-1.2.36.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
|
|
64
|
+
upgini-1.2.36.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
|
|
65
|
+
upgini-1.2.36.dist-info/RECORD,,
|
|
File without changes
|