upgini 1.2.136__py3-none-any.whl → 1.2.138__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- upgini/__about__.py +1 -1
- upgini/dataset.py +33 -1
- upgini/features_enricher.py +33 -11
- upgini/http.py +2 -2
- upgini/metadata.py +11 -1
- upgini/normalizer/normalize_utils.py +4 -4
- upgini/resource_bundle/strings.properties +1 -0
- upgini/utils/base_search_key_detector.py +5 -1
- upgini/utils/datetime_utils.py +15 -1
- {upgini-1.2.136.dist-info → upgini-1.2.138.dist-info}/METADATA +2 -1
- {upgini-1.2.136.dist-info → upgini-1.2.138.dist-info}/RECORD +13 -13
- {upgini-1.2.136.dist-info → upgini-1.2.138.dist-info}/WHEEL +0 -0
- {upgini-1.2.136.dist-info → upgini-1.2.138.dist-info}/licenses/LICENSE +0 -0
upgini/__about__.py
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__ = "1.2.136"
|
|
1
|
+
__version__ = "1.2.138"
|
upgini/dataset.py
CHANGED
|
@@ -7,6 +7,8 @@ from typing import Any, Callable, Dict, List, Optional, Tuple
|
|
|
7
7
|
|
|
8
8
|
import numpy as np
|
|
9
9
|
import pandas as pd
|
|
10
|
+
import pyarrow as pa
|
|
11
|
+
import pyarrow.parquet as pq
|
|
10
12
|
from pandas.api.types import (
|
|
11
13
|
is_float_dtype,
|
|
12
14
|
is_integer_dtype,
|
|
@@ -18,6 +20,7 @@ from pandas.api.types import (
|
|
|
18
20
|
from upgini.errors import ValidationError
|
|
19
21
|
from upgini.http import ProgressStage, SearchProgress, _RestClient
|
|
20
22
|
from upgini.metadata import (
|
|
23
|
+
CURRENT_DATE_COL,
|
|
21
24
|
ENTITY_SYSTEM_RECORD_ID,
|
|
22
25
|
EVAL_SET_INDEX,
|
|
23
26
|
SYSTEM_RECORD_ID,
|
|
@@ -38,6 +41,7 @@ from upgini.resource_bundle import ResourceBundle, get_custom_bundle
|
|
|
38
41
|
from upgini.search_task import SearchTask
|
|
39
42
|
from upgini.utils.config import SampleConfig
|
|
40
43
|
from upgini.utils.email_utils import EmailSearchKeyConverter
|
|
44
|
+
from upgini.utils.hash_utils import file_hash
|
|
41
45
|
from upgini.utils.sample_utils import SampleColumns, sample
|
|
42
46
|
|
|
43
47
|
try:
|
|
@@ -287,7 +291,7 @@ class Dataset:
|
|
|
287
291
|
for key in search_group
|
|
288
292
|
if key in self.columns_renaming
|
|
289
293
|
and not self.columns_renaming.get(key).endswith(EmailSearchKeyConverter.ONE_DOMAIN_SUFFIX)
|
|
290
|
-
and not self.columns_renaming.get(key) ==
|
|
294
|
+
and not self.columns_renaming.get(key) == CURRENT_DATE_COL
|
|
291
295
|
}
|
|
292
296
|
ipv4_column = self.etalon_def_checked.get(FileColumnMeaningType.IP_ADDRESS.value)
|
|
293
297
|
if (
|
|
@@ -469,6 +473,33 @@ class Dataset:
|
|
|
469
473
|
|
|
470
474
|
columns.append(column_meta)
|
|
471
475
|
|
|
476
|
+
current_date = int(pd.Timestamp(pd.Timestamp.now().date(), tz="UTC").timestamp() * 1000)
|
|
477
|
+
with tempfile.TemporaryDirectory() as tmp_dir:
|
|
478
|
+
if (
|
|
479
|
+
self.date_column is not None
|
|
480
|
+
and self.data[self.date_column].nunique() == 1
|
|
481
|
+
and self.data[self.date_column].iloc[0] == current_date
|
|
482
|
+
):
|
|
483
|
+
df_without_fake_date = self.data.drop(columns=[self.date_column])
|
|
484
|
+
else:
|
|
485
|
+
df_without_fake_date = self.data
|
|
486
|
+
parquet_file_path = f"{tmp_dir}/{self.dataset_name}.parquet"
|
|
487
|
+
|
|
488
|
+
# calculate deterministic digest for any environment
|
|
489
|
+
|
|
490
|
+
table = pa.Table.from_pandas(df_without_fake_date, preserve_index=False)
|
|
491
|
+
table = table.replace_schema_metadata({}) # remove all metadata
|
|
492
|
+
pq.write_table(
|
|
493
|
+
table,
|
|
494
|
+
parquet_file_path,
|
|
495
|
+
compression=None, # any compression will make it non-deterministic
|
|
496
|
+
data_page_size=0, # optional, to remove page layout variations
|
|
497
|
+
use_deprecated_int96_timestamps=False, # fix timestamp format
|
|
498
|
+
write_statistics=False, # remove statistics to make it deterministic
|
|
499
|
+
)
|
|
500
|
+
|
|
501
|
+
deterministic_digest = file_hash(parquet_file_path)
|
|
502
|
+
|
|
472
503
|
return FileMetadata(
|
|
473
504
|
name=self.dataset_name,
|
|
474
505
|
description=self.description,
|
|
@@ -479,6 +510,7 @@ class Dataset:
|
|
|
479
510
|
hierarchicalSubgroupKeys=self.hierarchical_subgroup_keys,
|
|
480
511
|
taskType=self.task_type,
|
|
481
512
|
droppedColumns=self.dropped_columns,
|
|
513
|
+
deterministicDigest=deterministic_digest,
|
|
482
514
|
)
|
|
483
515
|
|
|
484
516
|
@staticmethod
|
upgini/features_enricher.py
CHANGED
|
@@ -44,6 +44,7 @@ from upgini.http import (
|
|
|
44
44
|
from upgini.mdc import MDC
|
|
45
45
|
from upgini.metadata import (
|
|
46
46
|
COUNTRY,
|
|
47
|
+
CURRENT_DATE_COL,
|
|
47
48
|
DEFAULT_INDEX,
|
|
48
49
|
ENTITY_SYSTEM_RECORD_ID,
|
|
49
50
|
EVAL_SET_INDEX,
|
|
@@ -76,6 +77,7 @@ from upgini.utils.custom_loss_utils import (
|
|
|
76
77
|
)
|
|
77
78
|
from upgini.utils.cv_utils import CVConfig, get_groups
|
|
78
79
|
from upgini.utils.datetime_utils import (
|
|
80
|
+
DateSearchKeyDetector,
|
|
79
81
|
DateTimeConverter,
|
|
80
82
|
is_blocked_time_series,
|
|
81
83
|
is_dates_distribution_valid,
|
|
@@ -167,7 +169,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
167
169
|
"""
|
|
168
170
|
|
|
169
171
|
TARGET_NAME = "target"
|
|
170
|
-
CURRENT_DATE = "current_date"
|
|
171
172
|
RANDOM_STATE = 42
|
|
172
173
|
CALCULATE_METRICS_THRESHOLD = 50_000_000
|
|
173
174
|
CALCULATE_METRICS_MIN_THRESHOLD = 500
|
|
@@ -238,6 +239,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
238
239
|
generate_search_key_features: bool = True,
|
|
239
240
|
sample_config: SampleConfig | None = None,
|
|
240
241
|
print_trace_id: bool = False,
|
|
242
|
+
print_loaded_report: bool = True,
|
|
241
243
|
**kwargs,
|
|
242
244
|
):
|
|
243
245
|
self.bundle = get_custom_bundle(custom_bundle_config)
|
|
@@ -284,7 +286,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
284
286
|
self.id_columns = id_columns
|
|
285
287
|
self.id_columns_encoder = None
|
|
286
288
|
self.country_code = country_code
|
|
287
|
-
self.__validate_search_keys(search_keys, search_id)
|
|
289
|
+
self.__validate_search_keys(self.search_keys, search_id)
|
|
288
290
|
|
|
289
291
|
self.model_task_type = ModelTaskType.parse(model_task_type)
|
|
290
292
|
self.endpoint = endpoint
|
|
@@ -317,7 +319,8 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
317
319
|
self.fit_columns_renaming = {c.name: c.originalName for c in file_metadata.columns}
|
|
318
320
|
df = pd.DataFrame(columns=x_columns)
|
|
319
321
|
self.__prepare_feature_importances(trace_id, df, silent=True, update_selected_features=False)
|
|
320
|
-
|
|
322
|
+
if print_loaded_report:
|
|
323
|
+
self.__show_selected_features()
|
|
321
324
|
# TODO validate search_keys with search_keys from file_metadata
|
|
322
325
|
print(self.bundle.get("search_by_task_id_finish"))
|
|
323
326
|
self.logger.debug(f"Successfully initialized with search_id: {search_id}")
|
|
@@ -2983,6 +2986,9 @@ if response.status_code == 200:
|
|
|
2983
2986
|
else:
|
|
2984
2987
|
selected_input_columns = []
|
|
2985
2988
|
|
|
2989
|
+
if DEFAULT_INDEX in selected_input_columns:
|
|
2990
|
+
selected_input_columns.remove(DEFAULT_INDEX)
|
|
2991
|
+
|
|
2986
2992
|
return selected_input_columns + selected_generated_features
|
|
2987
2993
|
|
|
2988
2994
|
def __validate_search_keys(self, search_keys: dict[str, SearchKey], search_id: str | None = None):
|
|
@@ -3167,7 +3173,7 @@ if response.status_code == 200:
|
|
|
3167
3173
|
|
|
3168
3174
|
if DEFAULT_INDEX in df.columns:
|
|
3169
3175
|
msg = self.bundle.get("unsupported_index_column")
|
|
3170
|
-
self.logger.
|
|
3176
|
+
self.logger.warning(msg)
|
|
3171
3177
|
print(msg)
|
|
3172
3178
|
self.fit_dropped_features.add(DEFAULT_INDEX)
|
|
3173
3179
|
df.drop(columns=DEFAULT_INDEX, inplace=True)
|
|
@@ -3223,8 +3229,11 @@ if response.status_code == 200:
|
|
|
3223
3229
|
df, self.fit_search_keys, self.fit_generated_features
|
|
3224
3230
|
)
|
|
3225
3231
|
self.fit_columns_renaming = normalizer.columns_renaming
|
|
3226
|
-
if normalizer.
|
|
3227
|
-
|
|
3232
|
+
if normalizer.removed_datetime_features:
|
|
3233
|
+
original_removed_datetime_features = [
|
|
3234
|
+
self.fit_columns_renaming.get(f, f) for f in normalizer.removed_datetime_features
|
|
3235
|
+
]
|
|
3236
|
+
self.__log_warning(self.bundle.get("dataset_date_features").format(original_removed_datetime_features))
|
|
3228
3237
|
|
|
3229
3238
|
non_feature_columns = [
|
|
3230
3239
|
self.TARGET_NAME,
|
|
@@ -4090,12 +4099,14 @@ if response.status_code == 200:
|
|
|
4090
4099
|
or set(search_keys.values()) == {SearchKey.EMAIL}
|
|
4091
4100
|
or set(search_keys.values()) == {SearchKey.HEM}
|
|
4092
4101
|
or set(search_keys.values()) == {SearchKey.COUNTRY, SearchKey.POSTAL_CODE}
|
|
4102
|
+
or len(search_keys) == 0
|
|
4103
|
+
or set(search_keys.values()) == {SearchKey.CUSTOM_KEY}
|
|
4093
4104
|
):
|
|
4094
4105
|
if not silent:
|
|
4095
4106
|
self.__log_warning(bundle.get("current_date_added"))
|
|
4096
|
-
df[
|
|
4097
|
-
search_keys[
|
|
4098
|
-
converter = DateTimeConverter(
|
|
4107
|
+
df[CURRENT_DATE_COL] = datetime.date.today()
|
|
4108
|
+
search_keys[CURRENT_DATE_COL] = SearchKey.DATE
|
|
4109
|
+
converter = DateTimeConverter(CURRENT_DATE_COL, generate_cyclical_features=False)
|
|
4099
4110
|
df = converter.convert(df)
|
|
4100
4111
|
return df
|
|
4101
4112
|
|
|
@@ -4109,7 +4120,7 @@ if response.status_code == 200:
|
|
|
4109
4120
|
return [
|
|
4110
4121
|
col
|
|
4111
4122
|
for col, t in search_keys.items()
|
|
4112
|
-
if t not in [SearchKey.DATE, SearchKey.DATETIME] and df[col].dropna().nunique() > 1
|
|
4123
|
+
if t not in [SearchKey.DATE, SearchKey.DATETIME] and col in df.columns and df[col].dropna().nunique() > 1
|
|
4113
4124
|
]
|
|
4114
4125
|
|
|
4115
4126
|
@staticmethod
|
|
@@ -4777,7 +4788,8 @@ if response.status_code == 200:
|
|
|
4777
4788
|
else:
|
|
4778
4789
|
msg = self.bundle.get("unregistered_only_personal_keys")
|
|
4779
4790
|
self.logger.warning(msg + f" Provided search keys: {search_keys}")
|
|
4780
|
-
|
|
4791
|
+
# Current date will be added later
|
|
4792
|
+
# raise ValidationError(msg)
|
|
4781
4793
|
|
|
4782
4794
|
if (
|
|
4783
4795
|
len(valid_search_keys.values()) == 1
|
|
@@ -4896,6 +4908,16 @@ if response.status_code == 200:
|
|
|
4896
4908
|
search_key in self.fit_search_keys.values() and search_key not in search_keys.values()
|
|
4897
4909
|
)
|
|
4898
4910
|
|
|
4911
|
+
if check_need_detect(SearchKey.DATE) and check_need_detect(SearchKey.DATETIME):
|
|
4912
|
+
maybe_keys = DateSearchKeyDetector().get_search_key_columns(sample, search_keys)
|
|
4913
|
+
if len(maybe_keys) > 0:
|
|
4914
|
+
datetime_key = maybe_keys[0]
|
|
4915
|
+
search_keys[datetime_key] = SearchKey.DATETIME
|
|
4916
|
+
self.autodetected_search_keys[datetime_key] = SearchKey.DATETIME
|
|
4917
|
+
self.logger.info(f"Autodetected search key DATETIME in column {datetime_key}")
|
|
4918
|
+
if not silent_mode:
|
|
4919
|
+
print(self.bundle.get("datetime_detected").format(datetime_key))
|
|
4920
|
+
|
|
4899
4921
|
# if SearchKey.POSTAL_CODE not in search_keys.values() and check_need_detect(SearchKey.POSTAL_CODE):
|
|
4900
4922
|
if check_need_detect(SearchKey.POSTAL_CODE):
|
|
4901
4923
|
maybe_keys = PostalCodeSearchKeyDetector().get_search_key_columns(sample, search_keys)
|
upgini/http.py
CHANGED
|
@@ -433,8 +433,8 @@ class _RestClient:
|
|
|
433
433
|
with open(file_path, "rb") as file:
|
|
434
434
|
content = file.read()
|
|
435
435
|
md5_hash.update(content)
|
|
436
|
-
|
|
437
|
-
metadata_with_md5 = pydantic_copy_method(metadata)(update={"checksumMD5":
|
|
436
|
+
digest_md5 = md5_hash.hexdigest()
|
|
437
|
+
metadata_with_md5 = pydantic_copy_method(metadata)(update={"checksumMD5": digest_md5})
|
|
438
438
|
|
|
439
439
|
digest_sha256 = file_hash(file_path)
|
|
440
440
|
metadata_with_md5 = pydantic_copy_method(metadata_with_md5)(update={"digest": digest_sha256})
|
upgini/metadata.py
CHANGED
|
@@ -12,10 +12,19 @@ SORT_ID = "sort_id"
|
|
|
12
12
|
EVAL_SET_INDEX = "eval_set_index"
|
|
13
13
|
TARGET = "target"
|
|
14
14
|
COUNTRY = "country_iso_code"
|
|
15
|
+
CURRENT_DATE_COL = "current_date_"
|
|
15
16
|
RENAMED_INDEX = "index_col"
|
|
16
17
|
DEFAULT_INDEX = "index"
|
|
17
18
|
ORIGINAL_INDEX = "original_index"
|
|
18
|
-
SYSTEM_COLUMNS = {
|
|
19
|
+
SYSTEM_COLUMNS = {
|
|
20
|
+
SYSTEM_RECORD_ID,
|
|
21
|
+
ENTITY_SYSTEM_RECORD_ID,
|
|
22
|
+
SEARCH_KEY_UNNEST,
|
|
23
|
+
EVAL_SET_INDEX,
|
|
24
|
+
TARGET,
|
|
25
|
+
COUNTRY,
|
|
26
|
+
CURRENT_DATE_COL,
|
|
27
|
+
}
|
|
19
28
|
|
|
20
29
|
|
|
21
30
|
class FileColumnMeaningType(Enum):
|
|
@@ -252,6 +261,7 @@ class FileMetadata(BaseModel):
|
|
|
252
261
|
rowsCount: Optional[int] = None
|
|
253
262
|
checksumMD5: Optional[str] = None
|
|
254
263
|
digest: Optional[str] = None
|
|
264
|
+
deterministicDigest: Optional[str] = None
|
|
255
265
|
droppedColumns: Optional[List[str]] = None
|
|
256
266
|
|
|
257
267
|
def column_by_name(self, name: str) -> Optional[FileColumnMetadata]:
|
upgini/normalizer/normalize_utils.py
CHANGED
|
@@ -5,7 +5,6 @@ from typing import Dict, List, Tuple
|
|
|
5
5
|
import numpy as np
|
|
6
6
|
import pandas as pd
|
|
7
7
|
from pandas.api.types import is_bool_dtype as is_bool
|
|
8
|
-
from pandas.api.types import is_datetime64_any_dtype as is_datetime
|
|
9
8
|
from pandas.api.types import (
|
|
10
9
|
is_float_dtype,
|
|
11
10
|
is_numeric_dtype,
|
|
@@ -45,7 +44,7 @@ class Normalizer:
|
|
|
45
44
|
self.columns_renaming = {}
|
|
46
45
|
self.search_keys = {}
|
|
47
46
|
self.generated_features = []
|
|
48
|
-
self.
|
|
47
|
+
self.removed_datetime_features = []
|
|
49
48
|
|
|
50
49
|
def normalize(
|
|
51
50
|
self, df: pd.DataFrame, search_keys: Dict[str, SearchKey], generated_features: List[str]
|
|
@@ -134,8 +133,9 @@ class Normalizer:
|
|
|
134
133
|
features = self._get_features(df)
|
|
135
134
|
|
|
136
135
|
for f in features:
|
|
137
|
-
|
|
138
|
-
|
|
136
|
+
converter = DateTimeConverter(f)
|
|
137
|
+
if converter.is_datetime(df):
|
|
138
|
+
self.removed_datetime_features.append(f)
|
|
139
139
|
df.drop(columns=f, inplace=True)
|
|
140
140
|
|
|
141
141
|
return df
|
upgini/resource_bundle/strings.properties
CHANGED
|
@@ -210,6 +210,7 @@ features_info_zero_important_features=Oops, we can't find any relevant external
|
|
|
210
210
|
features_info_zero_hit_rate_search_keys=Oops, looks like values/formats of the search keys {} might be incorrect,\nas we won't be able to match any data source using these values\nPlease check docs https://github.com/upgini/upgini#-search-key-types-we-support-more-to-come or send us a help request in Support:
|
|
211
211
|
features_not_generated=Following features didn't pass checks for automated feature generation: {}
|
|
212
212
|
# Information
|
|
213
|
+
datetime_detected=Datetime detected in column `{}`. It will be used as a search key\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
|
|
213
214
|
postal_code_detected=Postal codes detected in column `{}`. It will be used as a search key\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
|
|
214
215
|
country_detected=Countries detected in column `{}`. It will be used as a search key\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
|
|
215
216
|
country_auto_determined=Search key country_code `{}` was automatically determined by client IP. \nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
|
upgini/utils/base_search_key_detector.py
CHANGED
|
@@ -24,4 +24,8 @@ class BaseSearchKeyDetector:
|
|
|
24
24
|
for column_name in other_columns:
|
|
25
25
|
if self._is_search_key_by_values(df[column_name]):
|
|
26
26
|
columns_by_values.append(column_name)
|
|
27
|
-
|
|
27
|
+
|
|
28
|
+
both = [col for col in columns_by_names if col in columns_by_values]
|
|
29
|
+
only_values = [col for col in columns_by_values if col not in columns_by_names]
|
|
30
|
+
only_names = [col for col in columns_by_names if col not in columns_by_values]
|
|
31
|
+
return both + only_values + only_names
|
upgini/utils/datetime_utils.py
CHANGED
|
@@ -10,6 +10,7 @@ from pandas.api.types import is_numeric_dtype
|
|
|
10
10
|
from upgini.errors import ValidationError
|
|
11
11
|
from upgini.metadata import EVAL_SET_INDEX, SearchKey
|
|
12
12
|
from upgini.resource_bundle import ResourceBundle, get_custom_bundle
|
|
13
|
+
from upgini.utils.base_search_key_detector import BaseSearchKeyDetector
|
|
13
14
|
|
|
14
15
|
DATE_FORMATS = [
|
|
15
16
|
"%Y-%m-%d",
|
|
@@ -29,6 +30,15 @@ DATE_FORMATS = [
|
|
|
29
30
|
DATETIME_PATTERN = r"^[\d\s\.\-:T/+]+$"
|
|
30
31
|
|
|
31
32
|
|
|
33
|
+
class DateSearchKeyDetector(BaseSearchKeyDetector):
|
|
34
|
+
def _is_search_key_by_name(self, column_name: str) -> bool:
|
|
35
|
+
lower_column_name = str(column_name).lower()
|
|
36
|
+
return "date" in lower_column_name or "time" in lower_column_name or "timestamp" in lower_column_name
|
|
37
|
+
|
|
38
|
+
def _is_search_key_by_values(self, column: pd.Series) -> bool:
|
|
39
|
+
return DateTimeConverter(column.name).is_datetime(column.to_frame(column.name))
|
|
40
|
+
|
|
41
|
+
|
|
32
42
|
class DateTimeConverter:
|
|
33
43
|
DATETIME_COL = "_date_time"
|
|
34
44
|
# MIN_SUPPORTED_DATE_TS = datetime.datetime(1999, 12, 31) # 946684800000 # 2000-01-01
|
|
@@ -80,7 +90,7 @@ class DateTimeConverter:
|
|
|
80
90
|
return True
|
|
81
91
|
|
|
82
92
|
parsed = self.parse_datetime(df, raise_errors=False)
|
|
83
|
-
return parsed is not None and
|
|
93
|
+
return parsed is not None and parsed.isna().mean() <= 0.5
|
|
84
94
|
|
|
85
95
|
def parse_datetime(self, df: pd.DataFrame, raise_errors=True) -> pd.Series | None:
|
|
86
96
|
if len(df) == 0 or df[self.date_column].isna().all():
|
|
@@ -408,6 +418,10 @@ def is_dates_distribution_valid(
|
|
|
408
418
|
if maybe_date_col is None:
|
|
409
419
|
return
|
|
410
420
|
|
|
421
|
+
# Don't check if date column is constant
|
|
422
|
+
if X[maybe_date_col].nunique() <= 1:
|
|
423
|
+
return
|
|
424
|
+
|
|
411
425
|
if isinstance(X[maybe_date_col].dtype, pd.PeriodDtype):
|
|
412
426
|
dates = X[maybe_date_col].dt.to_timestamp().dt.date
|
|
413
427
|
elif pd.__version__ >= "2.0.0":
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: upgini
|
|
3
|
-
Version: 1.2.136
|
|
3
|
+
Version: 1.2.138
|
|
4
4
|
Summary: Intelligent data search & enrichment for Machine Learning
|
|
5
5
|
Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
|
|
6
6
|
Project-URL: Homepage, https://upgini.com/
|
|
@@ -34,6 +34,7 @@ Requires-Dist: more-itertools==10.7.0
|
|
|
34
34
|
Requires-Dist: numpy<3.0.0,>=1.19.0
|
|
35
35
|
Requires-Dist: pandas<3.0.0,>=1.1.0
|
|
36
36
|
Requires-Dist: psutil>=5.9.0
|
|
37
|
+
Requires-Dist: pyarrow==18.1.0
|
|
37
38
|
Requires-Dist: pydantic<3.0.0,>1.0.0
|
|
38
39
|
Requires-Dist: pyjwt>=2.8.0
|
|
39
40
|
Requires-Dist: python-bidi==0.4.2
|
|
@@ -1,11 +1,11 @@
|
|
|
1
|
-
upgini/__about__.py,sha256=
|
|
1
|
+
upgini/__about__.py,sha256=XCQXAFpTbucTcUYjHmbmGVUtguvbpWMCujKCLALhk3U,24
|
|
2
2
|
upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
|
|
3
3
|
upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
|
|
4
|
-
upgini/dataset.py,sha256=
|
|
4
|
+
upgini/dataset.py,sha256=D9JzJJkZLPP_dp8GOlGgMhTtrd5pvP-4cHIcqiY3q-E,33354
|
|
5
5
|
upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
|
|
6
|
-
upgini/features_enricher.py,sha256=
|
|
7
|
-
upgini/http.py,sha256
|
|
8
|
-
upgini/metadata.py,sha256=
|
|
6
|
+
upgini/features_enricher.py,sha256=nb40GDtl7FEWn13z_oFXN_Q67Hh_XTOHoDY8ASgcDSw,237111
|
|
7
|
+
upgini/http.py,sha256=y26x4TQVYuEM3jz8JdASxSyBtvBemUkFf-FmX25sx-s,44356
|
|
8
|
+
upgini/metadata.py,sha256=BwUTCY-EUHqPtO0tGazHrk3wqhh-NfjNZhlBHW8bR78,12796
|
|
9
9
|
upgini/metrics.py,sha256=KCPE_apPN-9BIdv6GqASbJVaB_gBcy8wzNApAcyaGo4,46020
|
|
10
10
|
upgini/search_task.py,sha256=5mL_qV5mVtDkIumM9xCOgfa9Lc2B8mxJ1qI21iaScnQ,18656
|
|
11
11
|
upgini/spinner.py,sha256=4iMd-eIe_BnkqFEMIliULTbj6rNI2HkN_VJ4qYe0cUc,1118
|
|
@@ -35,10 +35,10 @@ upgini/data_source/data_source_publisher.py,sha256=CQi3fEukaStV-RiadSEvEFLThOlZJ
|
|
|
35
35
|
upgini/mdc/__init__.py,sha256=iHJlXQg6xRM1-ZOUtaPSJqw5SpQDszvxp4LyqviNLIQ,1027
|
|
36
36
|
upgini/mdc/context.py,sha256=3u1B-jXt7tXEvNcV3qmR9SDCseudnY7KYsLclBdwVLk,1405
|
|
37
37
|
upgini/normalizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
38
|
-
upgini/normalizer/normalize_utils.py,sha256=
|
|
38
|
+
upgini/normalizer/normalize_utils.py,sha256=oKevieBChYxtugocrbev8Uz2vqSZ_PB4Ibo55fHOylM,8452
|
|
39
39
|
upgini/resource_bundle/__init__.py,sha256=S5F2G47pnJd2LDpmFsjDqEwiKkP8Hm-hcseDbMka6Ko,8345
|
|
40
40
|
upgini/resource_bundle/exceptions.py,sha256=5fRvx0_vWdE1-7HcSgF0tckB4A9AKyf5RiinZkInTsI,621
|
|
41
|
-
upgini/resource_bundle/strings.properties,sha256=
|
|
41
|
+
upgini/resource_bundle/strings.properties,sha256=Dym1ymRH2uTeb0CfrO_lccoP9LWj7lxNGvsxmR7vkSw,29583
|
|
42
42
|
upgini/resource_bundle/strings_widget.properties,sha256=gOdqvZWntP2LCza_tyVk1_yRYcG4c04K9sQOAVhF_gw,1577
|
|
43
43
|
upgini/sampler/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
44
44
|
upgini/sampler/base.py,sha256=Fva2FEhLiNRPZ9Q6uOtJRtRzwsayjv7aphalAZO_4lc,6452
|
|
@@ -46,13 +46,13 @@ upgini/sampler/random_under_sampler.py,sha256=4mofmaRTmNwT_HqxecWJyfXdLKK0h9jMBw
|
|
|
46
46
|
upgini/sampler/utils.py,sha256=PYOk3kKSnFlyxcpdtDNLBEEhTB4lO_iP7pQHqeUcmAc,20211
|
|
47
47
|
upgini/utils/Roboto-Regular.ttf,sha256=kqYnZjMRQMpbyLulIChCLSdgYa1XF8GsUIoRi2Gcauw,168260
|
|
48
48
|
upgini/utils/__init__.py,sha256=O_KgzKiJjW3g4NoqZ7lAxUpoHcBi_gze6r3ndEjCH74,842
|
|
49
|
-
upgini/utils/base_search_key_detector.py,sha256=
|
|
49
|
+
upgini/utils/base_search_key_detector.py,sha256=DCmTbfhCdirJaDMpbKicFvhgVv6a9NNf9tXy3AbMCEg,1258
|
|
50
50
|
upgini/utils/blocked_time_series.py,sha256=Uqr3vp4YqNclj2-PzEYqVy763GSXHn86sbpIl1UOB4s,3382
|
|
51
51
|
upgini/utils/config.py,sha256=zFdnjchykfp_1Tm3Qep7phLzXBpXIOzr2tIuXchRBLw,1754
|
|
52
52
|
upgini/utils/country_utils.py,sha256=lY-eXWwFVegdVENFttbvLcgGDjFO17Sex8hd2PyJaRk,6937
|
|
53
53
|
upgini/utils/custom_loss_utils.py,sha256=kieNZYBYZm5ZGBltF1F_jOSF4ea6C29rYuCyiDcqVNY,3857
|
|
54
54
|
upgini/utils/cv_utils.py,sha256=w6FQb9nO8BWDx88EF83NpjPLarK4eR4ia0Wg0kLBJC4,3525
|
|
55
|
-
upgini/utils/datetime_utils.py,sha256=
|
|
55
|
+
upgini/utils/datetime_utils.py,sha256=pTi5doYjjjlQgwEVDNrJuKaAEC3Jtx78fjX4W7M_UV4,17615
|
|
56
56
|
upgini/utils/deduplicate_utils.py,sha256=CLX0QapRxB-ZVQT7yGvv1vSd2zac5SwRjCJavujdCps,11332
|
|
57
57
|
upgini/utils/display_utils.py,sha256=MoTqXZJvC6pAqgOaI3V0FG-IU_LnMfrn4TDcNvUqsdg,13316
|
|
58
58
|
upgini/utils/email_utils.py,sha256=pZ2vCfNxLIPUhxr0-OlABNXm12jjU44isBk8kGmqQzA,5277
|
|
@@ -74,7 +74,7 @@ upgini/utils/target_utils.py,sha256=CihpV6SC95HwtlMH60rGAUzVDa4Id0Bva8ySprmNHlE,
|
|
|
74
74
|
upgini/utils/track_info.py,sha256=NDKeQTUlZaYp15UoP-xLKGoDoJQ0drbDMwB0g9R0PUg,6427
|
|
75
75
|
upgini/utils/ts_utils.py,sha256=26vhC0pN7vLXK6R09EEkMK3Lwb9IVPH7LRdqFIQ3kPs,1383
|
|
76
76
|
upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
|
|
77
|
-
upgini-1.2.
|
|
78
|
-
upgini-1.2.
|
|
79
|
-
upgini-1.2.
|
|
80
|
-
upgini-1.2.
|
|
77
|
+
upgini-1.2.138.dist-info/METADATA,sha256=AQ9v8a70kG-cbEps3pwuy1Rl0vUX-bHq3L11k4r5mp0,51164
|
|
78
|
+
upgini-1.2.138.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
|
79
|
+
upgini-1.2.138.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
|
|
80
|
+
upgini-1.2.138.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|