upgini 1.1.312a4__py3-none-any.whl → 1.1.313__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of upgini might be problematic; see the advisory details accompanying this release for more information.

@@ -1,203 +0,0 @@
1
- import hashlib
2
- from logging import Logger, getLogger
3
- from typing import Dict, List
4
-
5
- import numpy as np
6
- import pandas as pd
7
- from pandas.api.types import is_bool_dtype as is_bool
8
- from pandas.api.types import is_datetime64_any_dtype as is_datetime
9
- from pandas.api.types import (
10
- is_float_dtype,
11
- is_numeric_dtype,
12
- is_object_dtype,
13
- is_period_dtype,
14
- is_string_dtype,
15
- )
16
-
17
- from upgini.errors import ValidationError
18
- from upgini.metadata import (
19
- ENTITY_SYSTEM_RECORD_ID,
20
- EVAL_SET_INDEX,
21
- SEARCH_KEY_UNNEST,
22
- SYSTEM_RECORD_ID,
23
- TARGET,
24
- SearchKey,
25
- )
26
- from upgini.resource_bundle import ResourceBundle, get_custom_bundle
27
- from upgini.utils import find_numbers_with_decimal_comma
28
- from upgini.utils.datetime_utils import DateTimeSearchKeyConverter
29
- from upgini.utils.phone_utils import PhoneSearchKeyConverter
30
- from upgini.utils.warning_counter import WarningCounter
31
-
32
-
33
class Normalizer:
    """Normalizes a client dataset before it is sent to the search backend.

    The pipeline (see :meth:`normalize`) renames columns to db-safe unique
    identifiers, removes datetime/period feature columns, truncates over-long
    string values, and coerces dtypes to server-supported ones.
    """

    # Server-side limit for a single string feature value, in characters.
    MAX_STRING_FEATURE_LENGTH = 24573

    def __init__(
        self,
        search_keys: Dict[str, "SearchKey"],
        generated_features: List[str],
        bundle: "ResourceBundle" = None,
        logger: Logger = None,
        warnings_counter: "WarningCounter" = None,
        silent_mode=False,
    ):
        self.search_keys = search_keys
        self.generated_features = generated_features
        self.bundle = bundle or get_custom_bundle()
        self.logger = logger or getLogger()
        self.warnings_counter = warnings_counter or WarningCounter()
        self.silent_mode = silent_mode
        # Maps new (safe) column name -> original column name.
        self.columns_renaming: Dict[str, str] = {}

    def normalize(self, df: pd.DataFrame) -> pd.DataFrame:
        """Runs the full normalization pipeline on a copy of ``df`` and returns it.

        Side effects: fills ``self.columns_renaming`` and re-keys
        ``self.search_keys`` to the renamed columns.
        """
        df = df.copy()
        df = self._rename_columns(df)
        df = self._remove_dates_from_features(df)
        df = self._cut_too_long_string_values(df)
        df = self._convert_bools(df)
        df = self._convert_float16(df)
        df = self._correct_decimal_comma(df)
        df = self._convert_phone_numbers(df)
        df = self.__convert_features_types(df)
        return df

    def _rename_columns(self, df: pd.DataFrame):
        """Renames feature columns to unique, db-safe identifiers.

        System columns and generated features keep their names. Every other
        column becomes ``lower(name[:250]) + "_" + sha256(name)[:6]`` with all
        characters outside ``[a-z0-9]`` replaced by ``_`` and an ``a`` prefix
        when the name does not start with a letter. The reverse mapping is
        stored in ``self.columns_renaming`` and ``self.search_keys`` is
        re-keyed accordingly.

        Raises:
            ValidationError: if a column has an empty name.
        """
        # Columns that must keep their original names.
        passthrough_columns = set(
            [
                TARGET,
                EVAL_SET_INDEX,
                SYSTEM_RECORD_ID,
                ENTITY_SYSTEM_RECORD_ID,
                SEARCH_KEY_UNNEST,
                DateTimeSearchKeyConverter.DATETIME_COL,
            ]
            + self.generated_features
        )
        new_columns = []
        seen_columns = set()  # O(1) duplicate check instead of scanning the list
        dup_counter = 0
        for column in df.columns:
            if column in passthrough_columns:
                self.columns_renaming[column] = column
                new_columns.append(column)
                seen_columns.add(column)
                continue

            new_column = str(column)
            if len(new_column) == 0:
                raise ValidationError(self.bundle.get("dataset_empty_column_names"))

            # Hash of the full original name: keeps the result unique relative
            # to server features even after truncation below.
            suffix = hashlib.sha256(new_column.encode()).hexdigest()[:6]
            # db limit for column length is 250 characters.
            new_column = f"{new_column[:250]}_{suffix}".lower()

            # If the name starts with a non-alphabetic symbol, prefix it with "a".
            if not ("a" <= new_column[0] <= "z"):
                new_column = "a" + new_column

            # Replace every character outside [a-z0-9] with "_" in one pass.
            new_column = "".join(
                c if ("a" <= c <= "z" or "0" <= c <= "9") else "_" for c in new_column
            )

            if new_column in seen_columns:
                new_column = f"{new_column}_{dup_counter}"
                dup_counter += 1
            new_columns.append(new_column)
            seen_columns.add(new_column)

            if new_column != column and column in self.search_keys:
                self.search_keys[new_column] = self.search_keys.pop(column)
            self.columns_renaming[new_column] = str(column)
        df.columns = new_columns
        return df

    def _get_features(self, df: pd.DataFrame) -> List[str]:
        """Returns feature column names: everything except search keys and system columns."""
        system_columns = [ENTITY_SYSTEM_RECORD_ID, EVAL_SET_INDEX, SEARCH_KEY_UNNEST, SYSTEM_RECORD_ID, TARGET]
        features = set(df.columns) - set(self.search_keys.keys()) - set(system_columns)
        return sorted(features)

    def _remove_dates_from_features(self, df: pd.DataFrame):
        """Drops datetime/period feature columns and warns the user about each removal."""
        removed_features = [
            f for f in self._get_features(df) if is_datetime(df[f]) or is_period_dtype(df[f])
        ]
        if removed_features:
            df.drop(columns=removed_features, inplace=True)
            msg = self.bundle.get("dataset_date_features").format(removed_features)
            self.logger.warning(msg)
            if not self.silent_mode:
                print(msg)
                self.warnings_counter.increment()

        return df

    def _cut_too_long_string_values(self, df: pd.DataFrame):
        """Truncates string values longer than MAX_STRING_FEATURE_LENGTH characters (LLM input cap)."""
        for col in df.columns:
            if is_string_dtype(df[col]) or is_object_dtype(df[col]):
                # .max() on an empty frame yields NaN; "NaN > limit" is False, so this is safe.
                max_length: int = df[col].astype("str").str.len().max()
                if max_length > self.MAX_STRING_FEATURE_LENGTH:
                    df[col] = df[col].astype("str").str.slice(stop=self.MAX_STRING_FEATURE_LENGTH)

        return df

    @staticmethod
    def _convert_bools(df: pd.DataFrame):
        """Converts bool columns to their string representation ("True"/"False")."""
        for col in df.columns:
            if is_bool(df[col]):
                df[col] = df[col].astype("str")
        return df

    @staticmethod
    def _convert_float16(df: pd.DataFrame):
        """Upcasts every float column (including float16/float32) to float64."""
        for col in df.columns:
            if is_float_dtype(df[col]):
                df[col] = df[col].astype("float64")
        return df

    def _correct_decimal_comma(self, df: pd.DataFrame):
        """Finds string columns holding numbers with a decimal comma and casts them to float64."""
        columns_to_fix = find_numbers_with_decimal_comma(df)
        if len(columns_to_fix) > 0:
            self.logger.warning(f"Convert strings with decimal comma to float: {columns_to_fix}")
            for col in columns_to_fix:
                df[col] = df[col].astype("string").str.replace(",", ".", regex=False).astype(np.float64)
        return df

    def _convert_phone_numbers(self, df: pd.DataFrame) -> pd.DataFrame:
        """Normalizes every PHONE search-key column, using a COUNTRY key column if one exists."""
        maybe_country_col = SearchKey.find_key(self.search_keys, SearchKey.COUNTRY)
        for phone_col in SearchKey.find_all_keys(self.search_keys, SearchKey.PHONE):
            converter = PhoneSearchKeyConverter(phone_col, maybe_country_col)
            df = converter.convert(df)
        return df

    def __convert_features_types(self, df: pd.DataFrame):
        """Casts every non-numeric feature column to pandas "string" dtype."""
        for f in self._get_features(df):
            if not is_numeric_dtype(df[f]):
                df[f] = df[f].astype("string")
        return df