upgini 1.2.33__tar.gz → 1.2.34__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- {upgini-1.2.33 → upgini-1.2.34}/PKG-INFO +2 -2
- {upgini-1.2.33 → upgini-1.2.34}/pyproject.toml +1 -1
- upgini-1.2.34/src/upgini/__about__.py +1 -0
- {upgini-1.2.33 → upgini-1.2.34}/src/upgini/dataset.py +8 -2
- {upgini-1.2.33 → upgini-1.2.34}/src/upgini/features_enricher.py +2 -1
- {upgini-1.2.33 → upgini-1.2.34}/src/upgini/metadata.py +13 -0
- {upgini-1.2.33 → upgini-1.2.34}/src/upgini/utils/ip_utils.py +15 -0
- upgini-1.2.33/src/upgini/__about__.py +0 -1
- {upgini-1.2.33 → upgini-1.2.34}/.gitignore +0 -0
- {upgini-1.2.33 → upgini-1.2.34}/LICENSE +0 -0
- {upgini-1.2.33 → upgini-1.2.34}/README.md +0 -0
- {upgini-1.2.33 → upgini-1.2.34}/src/upgini/__init__.py +0 -0
- {upgini-1.2.33 → upgini-1.2.34}/src/upgini/ads.py +0 -0
- {upgini-1.2.33 → upgini-1.2.34}/src/upgini/ads_management/__init__.py +0 -0
- {upgini-1.2.33 → upgini-1.2.34}/src/upgini/ads_management/ads_manager.py +0 -0
- {upgini-1.2.33 → upgini-1.2.34}/src/upgini/autofe/__init__.py +0 -0
- {upgini-1.2.33 → upgini-1.2.34}/src/upgini/autofe/all_operands.py +0 -0
- {upgini-1.2.33 → upgini-1.2.34}/src/upgini/autofe/binary.py +0 -0
- {upgini-1.2.33 → upgini-1.2.34}/src/upgini/autofe/date.py +0 -0
- {upgini-1.2.33 → upgini-1.2.34}/src/upgini/autofe/feature.py +0 -0
- {upgini-1.2.33 → upgini-1.2.34}/src/upgini/autofe/groupby.py +0 -0
- {upgini-1.2.33 → upgini-1.2.34}/src/upgini/autofe/operand.py +0 -0
- {upgini-1.2.33 → upgini-1.2.34}/src/upgini/autofe/unary.py +0 -0
- {upgini-1.2.33 → upgini-1.2.34}/src/upgini/autofe/vector.py +0 -0
- {upgini-1.2.33 → upgini-1.2.34}/src/upgini/data_source/__init__.py +0 -0
- {upgini-1.2.33 → upgini-1.2.34}/src/upgini/data_source/data_source_publisher.py +0 -0
- {upgini-1.2.33 → upgini-1.2.34}/src/upgini/errors.py +0 -0
- {upgini-1.2.33 → upgini-1.2.34}/src/upgini/http.py +0 -0
- {upgini-1.2.33 → upgini-1.2.34}/src/upgini/lazy_import.py +0 -0
- {upgini-1.2.33 → upgini-1.2.34}/src/upgini/mdc/__init__.py +0 -0
- {upgini-1.2.33 → upgini-1.2.34}/src/upgini/mdc/context.py +0 -0
- {upgini-1.2.33 → upgini-1.2.34}/src/upgini/metrics.py +0 -0
- {upgini-1.2.33 → upgini-1.2.34}/src/upgini/normalizer/__init__.py +0 -0
- {upgini-1.2.33 → upgini-1.2.34}/src/upgini/normalizer/normalize_utils.py +0 -0
- {upgini-1.2.33 → upgini-1.2.34}/src/upgini/resource_bundle/__init__.py +0 -0
- {upgini-1.2.33 → upgini-1.2.34}/src/upgini/resource_bundle/exceptions.py +0 -0
- {upgini-1.2.33 → upgini-1.2.34}/src/upgini/resource_bundle/strings.properties +0 -0
- {upgini-1.2.33 → upgini-1.2.34}/src/upgini/resource_bundle/strings_widget.properties +0 -0
- {upgini-1.2.33 → upgini-1.2.34}/src/upgini/sampler/__init__.py +0 -0
- {upgini-1.2.33 → upgini-1.2.34}/src/upgini/sampler/base.py +0 -0
- {upgini-1.2.33 → upgini-1.2.34}/src/upgini/sampler/random_under_sampler.py +0 -0
- {upgini-1.2.33 → upgini-1.2.34}/src/upgini/sampler/utils.py +0 -0
- {upgini-1.2.33 → upgini-1.2.34}/src/upgini/search_task.py +0 -0
- {upgini-1.2.33 → upgini-1.2.34}/src/upgini/spinner.py +0 -0
- {upgini-1.2.33 → upgini-1.2.34}/src/upgini/utils/Roboto-Regular.ttf +0 -0
- {upgini-1.2.33 → upgini-1.2.34}/src/upgini/utils/__init__.py +0 -0
- {upgini-1.2.33 → upgini-1.2.34}/src/upgini/utils/base_search_key_detector.py +0 -0
- {upgini-1.2.33 → upgini-1.2.34}/src/upgini/utils/blocked_time_series.py +0 -0
- {upgini-1.2.33 → upgini-1.2.34}/src/upgini/utils/country_utils.py +0 -0
- {upgini-1.2.33 → upgini-1.2.34}/src/upgini/utils/custom_loss_utils.py +0 -0
- {upgini-1.2.33 → upgini-1.2.34}/src/upgini/utils/cv_utils.py +0 -0
- {upgini-1.2.33 → upgini-1.2.34}/src/upgini/utils/datetime_utils.py +0 -0
- {upgini-1.2.33 → upgini-1.2.34}/src/upgini/utils/deduplicate_utils.py +0 -0
- {upgini-1.2.33 → upgini-1.2.34}/src/upgini/utils/display_utils.py +0 -0
- {upgini-1.2.33 → upgini-1.2.34}/src/upgini/utils/email_utils.py +0 -0
- {upgini-1.2.33 → upgini-1.2.34}/src/upgini/utils/fallback_progress_bar.py +0 -0
- {upgini-1.2.33 → upgini-1.2.34}/src/upgini/utils/feature_info.py +0 -0
- {upgini-1.2.33 → upgini-1.2.34}/src/upgini/utils/features_validator.py +0 -0
- {upgini-1.2.33 → upgini-1.2.34}/src/upgini/utils/format.py +0 -0
- {upgini-1.2.33 → upgini-1.2.34}/src/upgini/utils/phone_utils.py +0 -0
- {upgini-1.2.33 → upgini-1.2.34}/src/upgini/utils/postal_code_utils.py +0 -0
- {upgini-1.2.33 → upgini-1.2.34}/src/upgini/utils/progress_bar.py +0 -0
- {upgini-1.2.33 → upgini-1.2.34}/src/upgini/utils/sklearn_ext.py +0 -0
- {upgini-1.2.33 → upgini-1.2.34}/src/upgini/utils/target_utils.py +0 -0
- {upgini-1.2.33 → upgini-1.2.34}/src/upgini/utils/track_info.py +0 -0
- {upgini-1.2.33 → upgini-1.2.34}/src/upgini/utils/warning_counter.py +0 -0
- {upgini-1.2.33 → upgini-1.2.34}/src/upgini/version_validator.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: upgini
|
|
3
|
-
Version: 1.2.33
|
|
3
|
+
Version: 1.2.34
|
|
4
4
|
Summary: Intelligent data search & enrichment for Machine Learning
|
|
5
5
|
Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
|
|
6
6
|
Project-URL: Homepage, https://upgini.com/
|
|
@@ -37,7 +37,7 @@ Requires-Dist: python-dateutil>=2.8.0
|
|
|
37
37
|
Requires-Dist: python-json-logger>=2.0.2
|
|
38
38
|
Requires-Dist: requests>=2.8.0
|
|
39
39
|
Requires-Dist: scikit-learn>=1.3.0
|
|
40
|
-
Requires-Dist: xhtml2pdf
|
|
40
|
+
Requires-Dist: xhtml2pdf<0.3.0,>=0.2.11
|
|
41
41
|
Description-Content-Type: text/markdown
|
|
42
42
|
|
|
43
43
|
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "1.2.34"
|
|
@@ -422,11 +422,11 @@ class Dataset: # (pd.DataFrame):
|
|
|
422
422
|
+ "".join("<tr>" + "".join(map(map_color, row[1:])) + "</tr>" for row in df_stats.itertuples())
|
|
423
423
|
+ "</table>"
|
|
424
424
|
)
|
|
425
|
-
print()
|
|
426
425
|
display(HTML(html_stats))
|
|
427
|
-
except (ImportError, NameError):
|
|
428
426
|
print()
|
|
427
|
+
except (ImportError, NameError):
|
|
429
428
|
print(df_stats)
|
|
429
|
+
print()
|
|
430
430
|
|
|
431
431
|
if len(self.data) == 0:
|
|
432
432
|
raise ValidationError(self.bundle.get("all_search_keys_invalid"))
|
|
@@ -494,11 +494,17 @@ class Dataset: # (pd.DataFrame):
|
|
|
494
494
|
taskType=self.task_type,
|
|
495
495
|
)
|
|
496
496
|
|
|
497
|
+
@staticmethod
|
|
498
|
+
def is_column_binary_type(column):
|
|
499
|
+
return column.apply(lambda x: x is None or isinstance(x, (bytes, bytearray))).all()
|
|
500
|
+
|
|
497
501
|
def __get_data_type(self, pandas_data_type, column_name: str) -> DataType:
|
|
498
502
|
if is_integer_dtype(pandas_data_type):
|
|
499
503
|
return DataType.INT
|
|
500
504
|
elif is_float_dtype(pandas_data_type):
|
|
501
505
|
return DataType.DECIMAL
|
|
506
|
+
elif self.is_column_binary_type(self.data[column_name]):
|
|
507
|
+
return DataType.BYTES
|
|
502
508
|
elif is_string_dtype(pandas_data_type) or is_object_dtype(pandas_data_type):
|
|
503
509
|
return DataType.STRING
|
|
504
510
|
else:
|
|
@@ -1843,7 +1843,8 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1843
1843
|
not self.disable_force_downsampling
|
|
1844
1844
|
and self.generate_features is not None
|
|
1845
1845
|
and phone_column is not None
|
|
1846
|
-
and self.fit_columns_renaming
|
|
1846
|
+
and self.fit_columns_renaming is not None
|
|
1847
|
+
and self.fit_columns_renaming.get(phone_column) in self.generate_features
|
|
1847
1848
|
and num_samples > Dataset.FORCE_SAMPLE_SIZE
|
|
1848
1849
|
)
|
|
1849
1850
|
if force_downsampling:
|
|
@@ -43,6 +43,9 @@ class FileColumnMeaningType(Enum):
|
|
|
43
43
|
EVAL_SET_INDEX = "EVAL_SET_INDEX"
|
|
44
44
|
ENTITY_SYSTEM_RECORD_ID = "ENTITY_SYSTEM_RECORD_ID"
|
|
45
45
|
UNNEST_KEY = "UNNEST_KEY"
|
|
46
|
+
IP_BINARY = "IP_BINARY"
|
|
47
|
+
IP_RANGE_FROM_BINARY = "IP_RANGE_FROM_BINARY"
|
|
48
|
+
IP_RANGE_TO_BINARY = "IP_RANGE_TO_BINARY"
|
|
46
49
|
|
|
47
50
|
|
|
48
51
|
class SearchKey(Enum):
|
|
@@ -60,6 +63,9 @@ class SearchKey(Enum):
|
|
|
60
63
|
IPV6_ADDRESS = FileColumnMeaningType.IPV6_ADDRESS
|
|
61
64
|
IPV6_RANGE_FROM = FileColumnMeaningType.IPV6_RANGE_FROM
|
|
62
65
|
IPV6_RANGE_TO = FileColumnMeaningType.IPV6_RANGE_TO
|
|
66
|
+
IP_BINARY = FileColumnMeaningType.IP_BINARY
|
|
67
|
+
IP_RANGE_FROM_BINARY = FileColumnMeaningType.IP_RANGE_FROM_BINARY
|
|
68
|
+
IP_RANGE_TO_BINARY = FileColumnMeaningType.IP_RANGE_TO_BINARY
|
|
63
69
|
|
|
64
70
|
# For data source registration. Don't use it for FeaturesEnricher
|
|
65
71
|
EMAIL_ONE_DOMAIN = FileColumnMeaningType.EMAIL_ONE_DOMAIN
|
|
@@ -112,6 +118,12 @@ class SearchKey(Enum):
|
|
|
112
118
|
return SearchKey.MSISDN_RANGE_FROM
|
|
113
119
|
if meaning_type == FileColumnMeaningType.MSISDN_RANGE_TO:
|
|
114
120
|
return SearchKey.MSISDN_RANGE_TO
|
|
121
|
+
if meaning_type == FileColumnMeaningType.IP_BINARY:
|
|
122
|
+
return SearchKey.IP_BINARY
|
|
123
|
+
if meaning_type == FileColumnMeaningType.IP_RANGE_FROM_BINARY:
|
|
124
|
+
return SearchKey.IP_RANGE_FROM_BINARY
|
|
125
|
+
if meaning_type == FileColumnMeaningType.IP_RANGE_TO_BINARY:
|
|
126
|
+
return SearchKey.IP_RANGE_TO_BINARY
|
|
115
127
|
|
|
116
128
|
@staticmethod
|
|
117
129
|
def find_key(search_keys: Dict[str, SearchKey], keys: Union[SearchKey, List[SearchKey]]) -> Optional[SearchKey]:
|
|
@@ -136,6 +148,7 @@ class DataType(Enum):
|
|
|
136
148
|
DATE_TIME = "DATE_TIME"
|
|
137
149
|
STRING = "STRING"
|
|
138
150
|
BOOLEAN = "BOOLEAN"
|
|
151
|
+
BYTES = "BYTES"
|
|
139
152
|
|
|
140
153
|
|
|
141
154
|
class ModelTaskType(Enum):
|
|
@@ -42,6 +42,16 @@ class IpSearchKeyConverter:
|
|
|
42
42
|
except Exception:
|
|
43
43
|
pass
|
|
44
44
|
|
|
45
|
+
@staticmethod
|
|
46
|
+
def _ip_to_binary(ip: Optional[_BaseAddress]) -> Optional[bytes]:
|
|
47
|
+
try:
|
|
48
|
+
if isinstance(ip, IPv6Address) and ip.ipv4_mapped is not None:
|
|
49
|
+
return ip.ipv4_mapped.packed
|
|
50
|
+
else:
|
|
51
|
+
return ip.packed
|
|
52
|
+
except Exception:
|
|
53
|
+
pass
|
|
54
|
+
|
|
45
55
|
@staticmethod
|
|
46
56
|
def _ip_to_int_str(ip: Optional[_BaseAddress]) -> Optional[str]:
|
|
47
57
|
try:
|
|
@@ -100,11 +110,16 @@ class IpSearchKeyConverter:
|
|
|
100
110
|
.astype("string")
|
|
101
111
|
# .str.replace(".0", "", regex=False)
|
|
102
112
|
)
|
|
113
|
+
ip_binary = self.ip_column + "_binary"
|
|
114
|
+
df[ip_binary] = df[self.ip_column].apply(self._ip_to_binary)
|
|
115
|
+
|
|
103
116
|
df = df.drop(columns=self.ip_column)
|
|
104
117
|
del self.search_keys[self.ip_column]
|
|
105
118
|
del self.columns_renaming[self.ip_column]
|
|
106
119
|
self.search_keys[ipv6] = SearchKey.IPV6_ADDRESS
|
|
120
|
+
self.search_keys[ip_binary] = SearchKey.IP_BINARY
|
|
107
121
|
self.columns_renaming[ipv6] = original_ip # could be __unnest_ip...
|
|
122
|
+
self.columns_renaming[ip_binary] = original_ip
|
|
108
123
|
|
|
109
124
|
return df
|
|
110
125
|
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
__version__ = "1.2.33"
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|