upgini 1.2.118__py3-none-any.whl → 1.2.120a1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- upgini/__about__.py +1 -1
- upgini/dataset.py +2 -0
- upgini/features_enricher.py +67 -38
- upgini/http.py +11 -1
- upgini/utils/display_utils.py +12 -9
- {upgini-1.2.118.dist-info → upgini-1.2.120a1.dist-info}/METADATA +1 -1
- {upgini-1.2.118.dist-info → upgini-1.2.120a1.dist-info}/RECORD +9 -9
- {upgini-1.2.118.dist-info → upgini-1.2.120a1.dist-info}/WHEEL +0 -0
- {upgini-1.2.118.dist-info → upgini-1.2.120a1.dist-info}/licenses/LICENSE +0 -0
upgini/__about__.py
CHANGED
@@ -1 +1 @@
|
|
1
|
-
__version__ = "1.2.118"
|
1
|
+
__version__ = "1.2.120a1"
|
upgini/dataset.py
CHANGED
@@ -694,7 +694,9 @@ class Dataset:
|
|
694
694
|
|
695
695
|
def prepare_uploading_file(self, base_path: str) -> str:
|
696
696
|
parquet_file_path = f"{base_path}/{self.dataset_name}.parquet"
|
697
|
+
print("Before saving parquet file")
|
697
698
|
self.data.to_parquet(path=parquet_file_path, index=False, compression="gzip", engine="fastparquet")
|
699
|
+
print("After saving parquet file")
|
698
700
|
uploading_file_size = Path(parquet_file_path).stat().st_size
|
699
701
|
self.logger.info(f"Size of prepared uploading file: {uploading_file_size}. {len(self.data)} rows")
|
700
702
|
if uploading_file_size > self.MAX_UPLOADING_FILE_SIZE:
|
upgini/features_enricher.py
CHANGED
@@ -854,7 +854,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
854
854
|
raise e
|
855
855
|
finally:
|
856
856
|
self.logger.info(f"Transform elapsed time: {time.time() - start_time}")
|
857
|
-
|
857
|
+
|
858
858
|
return result
|
859
859
|
|
860
860
|
def calculate_metrics(
|
@@ -1741,7 +1741,8 @@ class FeaturesEnricher(TransformerMixin):
|
|
1741
1741
|
c
|
1742
1742
|
for c in (validated_X.columns.to_list() + generated_features)
|
1743
1743
|
if (not self.fit_select_features or c in set(self.feature_names_).union(self.id_columns or []))
|
1744
|
-
and c not in (
|
1744
|
+
and c
|
1745
|
+
not in (
|
1745
1746
|
excluding_search_keys
|
1746
1747
|
+ list(self.fit_dropped_features)
|
1747
1748
|
+ [DateTimeSearchKeyConverter.DATETIME_COL, SYSTEM_RECORD_ID, ENTITY_SYSTEM_RECORD_ID]
|
@@ -2215,7 +2216,8 @@ class FeaturesEnricher(TransformerMixin):
|
|
2215
2216
|
progress_callback=progress_callback,
|
2216
2217
|
add_fit_system_record_id=True,
|
2217
2218
|
)
|
2218
|
-
if enriched_df is None:
|
2219
|
+
if enriched_df is None or len(enriched_df) == 0 or len(enriched_df.columns) == 0:
|
2220
|
+
self.logger.warning(f"Empty enriched dataframe returned: {enriched_df}, returning None")
|
2219
2221
|
return None
|
2220
2222
|
|
2221
2223
|
x_columns = [
|
@@ -2519,7 +2521,7 @@ if response.status_code == 200:
|
|
2519
2521
|
if len(self.feature_names_) == 0:
|
2520
2522
|
msg = self.bundle.get("no_important_features_for_transform")
|
2521
2523
|
self.__log_warning(msg, show_support_link=True)
|
2522
|
-
return
|
2524
|
+
return None, {}, [], self.search_keys
|
2523
2525
|
|
2524
2526
|
self.__validate_search_keys(self.search_keys, self.search_id)
|
2525
2527
|
|
@@ -2527,7 +2529,7 @@ if response.status_code == 200:
|
|
2527
2529
|
msg = self.bundle.get("transform_with_paid_features")
|
2528
2530
|
self.logger.warning(msg)
|
2529
2531
|
self.__display_support_link(msg)
|
2530
|
-
return None, {
|
2532
|
+
return None, {}, [], self.search_keys
|
2531
2533
|
|
2532
2534
|
features_meta = self._search_task.get_all_features_metadata_v2()
|
2533
2535
|
online_api_features = [fm.name for fm in features_meta if fm.from_online_api and fm.shap_value > 0]
|
@@ -2550,7 +2552,7 @@ if response.status_code == 200:
|
|
2550
2552
|
self.logger.warning(msg)
|
2551
2553
|
print(msg)
|
2552
2554
|
show_request_quote_button()
|
2553
|
-
return None, {
|
2555
|
+
return None, {}, [], {}
|
2554
2556
|
else:
|
2555
2557
|
msg = self.bundle.get("transform_usage_info").format(
|
2556
2558
|
transform_usage.limit, transform_usage.transformed_rows
|
@@ -2620,14 +2622,33 @@ if response.status_code == 200:
|
|
2620
2622
|
|
2621
2623
|
# If there are no external features, we don't call backend on transform
|
2622
2624
|
external_features = [fm for fm in features_meta if fm.shap_value > 0 and fm.source != "etalon"]
|
2623
|
-
if
|
2625
|
+
if len(external_features) == 0:
|
2624
2626
|
self.logger.warning(
|
2625
2627
|
"No external features found, returning original dataframe"
|
2626
2628
|
f" with generated important features: {self.feature_names_}"
|
2627
2629
|
)
|
2628
|
-
|
2629
|
-
|
2630
|
-
|
2630
|
+
df = df.rename(columns=columns_renaming)
|
2631
|
+
generated_features = [columns_renaming.get(c, c) for c in generated_features]
|
2632
|
+
search_keys = {columns_renaming.get(c, c): t for c, t in search_keys.items()}
|
2633
|
+
selecting_columns = self._selecting_input_and_generated_columns(
|
2634
|
+
validated_Xy, generated_features, keep_input, trace_id
|
2635
|
+
)
|
2636
|
+
self.logger.warning(f"Filtered columns by existance in dataframe: {selecting_columns}")
|
2637
|
+
if add_fit_system_record_id:
|
2638
|
+
df = self._add_fit_system_record_id(
|
2639
|
+
df,
|
2640
|
+
search_keys,
|
2641
|
+
SYSTEM_RECORD_ID,
|
2642
|
+
TARGET,
|
2643
|
+
columns_renaming,
|
2644
|
+
self.id_columns,
|
2645
|
+
self.cv,
|
2646
|
+
self.model_task_type,
|
2647
|
+
self.logger,
|
2648
|
+
self.bundle,
|
2649
|
+
)
|
2650
|
+
selecting_columns.append(SYSTEM_RECORD_ID)
|
2651
|
+
return df[selecting_columns], columns_renaming, generated_features, search_keys
|
2631
2652
|
|
2632
2653
|
# Don't pass all features in backend on transform
|
2633
2654
|
runtime_parameters = self._get_copy_of_runtime_parameters()
|
@@ -2845,29 +2866,12 @@ if response.status_code == 200:
|
|
2845
2866
|
how="left",
|
2846
2867
|
)
|
2847
2868
|
|
2848
|
-
|
2849
|
-
|
2850
|
-
|
2851
|
-
selected_generated_features = [
|
2852
|
-
c for c in generated_features if not self.fit_select_features or c in self.feature_names_
|
2853
|
-
]
|
2854
|
-
if keep_input is True:
|
2855
|
-
selected_input_columns = [
|
2856
|
-
c
|
2857
|
-
for c in validated_Xy.columns
|
2858
|
-
if not self.fit_select_features
|
2859
|
-
or c in self.feature_names_
|
2860
|
-
or c in new_columns_on_transform
|
2861
|
-
or c in self.search_keys
|
2862
|
-
or c in (self.id_columns or [])
|
2863
|
-
or c in [EVAL_SET_INDEX, TARGET] # transform for metrics calculation
|
2864
|
-
]
|
2865
|
-
else:
|
2866
|
-
selected_input_columns = []
|
2867
|
-
|
2868
|
-
selecting_columns = selected_input_columns + selected_generated_features
|
2869
|
+
selecting_columns = self._selecting_input_and_generated_columns(
|
2870
|
+
validated_Xy, generated_features, keep_input, trace_id
|
2871
|
+
)
|
2869
2872
|
selecting_columns.extend(
|
2870
|
-
c
|
2873
|
+
c
|
2874
|
+
for c in result.columns
|
2871
2875
|
if c in self.feature_names_ and c not in selecting_columns and c not in validated_Xy.columns
|
2872
2876
|
)
|
2873
2877
|
if add_fit_system_record_id:
|
@@ -2895,6 +2899,35 @@ if response.status_code == 200:
|
|
2895
2899
|
|
2896
2900
|
return result, columns_renaming, generated_features, search_keys
|
2897
2901
|
|
2902
|
+
def _selecting_input_and_generated_columns(
|
2903
|
+
self,
|
2904
|
+
validated_Xy: pd.DataFrame,
|
2905
|
+
generated_features: list[str],
|
2906
|
+
keep_input: bool,
|
2907
|
+
trace_id: str,
|
2908
|
+
):
|
2909
|
+
fit_input_columns = [c.originalName for c in self._search_task.get_file_metadata(trace_id).columns]
|
2910
|
+
new_columns_on_transform = [c for c in validated_Xy.columns if c not in fit_input_columns]
|
2911
|
+
|
2912
|
+
selected_generated_features = [
|
2913
|
+
c for c in generated_features if not self.fit_select_features or c in self.feature_names_
|
2914
|
+
]
|
2915
|
+
if keep_input is True:
|
2916
|
+
selected_input_columns = [
|
2917
|
+
c
|
2918
|
+
for c in validated_Xy.columns
|
2919
|
+
if not self.fit_select_features
|
2920
|
+
or c in self.feature_names_
|
2921
|
+
or c in new_columns_on_transform
|
2922
|
+
or c in self.search_keys
|
2923
|
+
or c in (self.id_columns or [])
|
2924
|
+
or c in [EVAL_SET_INDEX, TARGET] # transform for metrics calculation
|
2925
|
+
]
|
2926
|
+
else:
|
2927
|
+
selected_input_columns = []
|
2928
|
+
|
2929
|
+
return selected_input_columns + selected_generated_features
|
2930
|
+
|
2898
2931
|
def __validate_search_keys(self, search_keys: dict[str, SearchKey], search_id: str | None = None):
|
2899
2932
|
if (search_keys is None or len(search_keys) == 0) and self.country_code is None:
|
2900
2933
|
if search_id:
|
@@ -3727,9 +3760,7 @@ if response.status_code == 200:
|
|
3727
3760
|
eval_types = validated_eval_X.dtypes
|
3728
3761
|
# Find columns with different types
|
3729
3762
|
diff_cols = [
|
3730
|
-
(col, x_types[col], eval_types[col])
|
3731
|
-
for col in x_types.index
|
3732
|
-
if x_types[col] != eval_types[col]
|
3763
|
+
(col, x_types[col], eval_types[col]) for col in x_types.index if x_types[col] != eval_types[col]
|
3733
3764
|
]
|
3734
3765
|
diff_col_names = [col for col, _, _ in diff_cols]
|
3735
3766
|
# print columns with different types
|
@@ -3815,9 +3846,7 @@ if response.status_code == 200:
|
|
3815
3846
|
return Xy[X.columns].copy(), Xy[TARGET].copy()
|
3816
3847
|
|
3817
3848
|
@staticmethod
|
3818
|
-
def _sort_by_system_record_id(
|
3819
|
-
X: pd.DataFrame, y: pd.Series, cv: CVType | None
|
3820
|
-
) -> tuple[pd.DataFrame, pd.Series]:
|
3849
|
+
def _sort_by_system_record_id(X: pd.DataFrame, y: pd.Series, cv: CVType | None) -> tuple[pd.DataFrame, pd.Series]:
|
3821
3850
|
if cv not in [CVType.time_series, CVType.blocked_time_series]:
|
3822
3851
|
record_id_column = ENTITY_SYSTEM_RECORD_ID if ENTITY_SYSTEM_RECORD_ID in X else SYSTEM_RECORD_ID
|
3823
3852
|
Xy = X.copy()
|
upgini/http.py
CHANGED
@@ -426,19 +426,26 @@ class _RestClient:
|
|
426
426
|
) -> SearchTaskResponse:
|
427
427
|
api_path = self.INITIAL_SEARCH_URI_FMT_V2
|
428
428
|
|
429
|
+
print("Before getting track metrics")
|
429
430
|
track_metrics = get_track_metrics(self.client_ip, self.client_visitorid)
|
431
|
+
print("After getting track metrics")
|
430
432
|
|
431
433
|
def open_and_send():
|
432
434
|
md5_hash = hashlib.md5()
|
435
|
+
print("Before opening file to calculate hashes")
|
433
436
|
with open(file_path, "rb") as file:
|
434
437
|
content = file.read()
|
435
438
|
md5_hash.update(content)
|
436
439
|
digest = md5_hash.hexdigest()
|
437
440
|
metadata_with_md5 = pydantic_copy_method(metadata)(update={"checksumMD5": digest})
|
441
|
+
print("After calculating md5")
|
438
442
|
|
443
|
+
print("Before calculating sha256")
|
439
444
|
digest_sha256 = file_hash(file_path)
|
445
|
+
print("After calculating sha256")
|
440
446
|
metadata_with_md5 = pydantic_copy_method(metadata_with_md5)(update={"digest": digest_sha256})
|
441
447
|
|
448
|
+
print("Before opening file to send")
|
442
449
|
with open(file_path, "rb") as file:
|
443
450
|
files = {
|
444
451
|
"metadata": (
|
@@ -466,9 +473,12 @@ class _RestClient:
|
|
466
473
|
)
|
467
474
|
additional_headers = {self.SEARCH_KEYS_HEADER_NAME: ",".join(self.search_keys_meaning_types(metadata))}
|
468
475
|
|
469
|
-
|
476
|
+
print("Before sending request for initial search")
|
477
|
+
response = self._send_post_file_req_v2(
|
470
478
|
api_path, files, trace_id=trace_id, additional_headers=additional_headers
|
471
479
|
)
|
480
|
+
print("After sending request")
|
481
|
+
return response
|
472
482
|
|
473
483
|
response = self._with_unauth_retry(open_and_send)
|
474
484
|
return SearchTaskResponse(response)
|
upgini/utils/display_utils.py
CHANGED
@@ -269,19 +269,22 @@ def make_html_report(
|
|
269
269
|
if search_keys is not None
|
270
270
|
else ""
|
271
271
|
}
|
272
|
-
{
|
273
|
-
|
274
|
-
|
272
|
+
{
|
273
|
+
"<h3>All relevant features. Accuracy after enrichment</h3>" + make_table(metrics_df)
|
274
|
+
if metrics_df is not None
|
275
|
+
else ""
|
275
276
|
}
|
276
|
-
{
|
277
|
-
|
278
|
-
|
277
|
+
{
|
278
|
+
"<h3>Relevant data sources</h3>" + make_table(relevant_datasources_df)
|
279
|
+
if len(relevant_datasources_df) > 0
|
280
|
+
else ""
|
279
281
|
}
|
280
282
|
<h3>All relevant features. Listing ({len(relevant_features_df)} items)</h3>
|
281
283
|
{make_table(relevant_features_df, wrap_long_string=25)}
|
282
|
-
{
|
283
|
-
|
284
|
-
|
284
|
+
{
|
285
|
+
"<h3>Description of AutoFE feature names</h3>" + make_table(autofe_descriptions_df, wrap_long_string=25)
|
286
|
+
if autofe_descriptions_df is not None and len(autofe_descriptions_df) > 0
|
287
|
+
else ""
|
285
288
|
}
|
286
289
|
<p>To buy found data sources, please contact: <a href='mailto:sales@upgini.com'>sales@upgini.com</a></p>
|
287
290
|
<p>Best regards, </br><b>Upgini Team</b></p>
|
@@ -1,10 +1,10 @@
|
|
1
|
-
upgini/__about__.py,sha256=
|
1
|
+
upgini/__about__.py,sha256=J4ou6xfTwIgzTXi7mnxG9WD4vn49_cFGZVdB8RZEIPM,26
|
2
2
|
upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
|
3
3
|
upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
|
4
|
-
upgini/dataset.py,sha256=
|
4
|
+
upgini/dataset.py,sha256=9xYeqp-Ti3-QcsucyxlDFOHQef6ZQsBX7bOZMCyT2rM,31665
|
5
5
|
upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
|
6
|
-
upgini/features_enricher.py,sha256=
|
7
|
-
upgini/http.py,sha256
|
6
|
+
upgini/features_enricher.py,sha256=C9pZKusj_QnG9coPVAa1a_88VC-lLR4Tre4uC10yt04,231852
|
7
|
+
upgini/http.py,sha256=CzDgSrYH6-R14G0d8xPyLalb-w42fjj9XOHVXh7leyM,44835
|
8
8
|
upgini/metadata.py,sha256=VzgtgEbPPtNxTrj9LM5qSDP3DujHwAXqbUSKBjPcb9c,12477
|
9
9
|
upgini/metrics.py,sha256=KCPE_apPN-9BIdv6GqASbJVaB_gBcy8wzNApAcyaGo4,46020
|
10
10
|
upgini/search_task.py,sha256=SAiUd1AytbA2Q6PSnnztr7oTRKpud1wQZ5YtKjsmQHU,18256
|
@@ -54,7 +54,7 @@ upgini/utils/custom_loss_utils.py,sha256=kieNZYBYZm5ZGBltF1F_jOSF4ea6C29rYuCyiDc
|
|
54
54
|
upgini/utils/cv_utils.py,sha256=w6FQb9nO8BWDx88EF83NpjPLarK4eR4ia0Wg0kLBJC4,3525
|
55
55
|
upgini/utils/datetime_utils.py,sha256=UL1ernnawW0LV9mPDpCIc6sFy0HUhFscWVNwfH4V7rI,14366
|
56
56
|
upgini/utils/deduplicate_utils.py,sha256=oZEiZeN-A92zwAPysV4OP9hO-niC2RLt-Dhc_hynBTU,11273
|
57
|
-
upgini/utils/display_utils.py,sha256=
|
57
|
+
upgini/utils/display_utils.py,sha256=uSG3JwpwCIgRJXsp-8ktuJ0Dh-WFti7IrRLMUfHfoDc,11973
|
58
58
|
upgini/utils/email_utils.py,sha256=pZ2vCfNxLIPUhxr0-OlABNXm12jjU44isBk8kGmqQzA,5277
|
59
59
|
upgini/utils/fallback_progress_bar.py,sha256=PDaKb8dYpVZaWMroNcOHsTc3pSjgi9mOm0--cOFTwJ0,1074
|
60
60
|
upgini/utils/feature_info.py,sha256=6vihytwKma_TlXtTn4l6Aj4kqlOj0ouLy-yWVV6VUw8,7551
|
@@ -74,7 +74,7 @@ upgini/utils/target_utils.py,sha256=GCPn4QeJ83JJ_vyBJ3IhY5fyIRkLC9q9BE59S2FRO1I,
|
|
74
74
|
upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
|
75
75
|
upgini/utils/ts_utils.py,sha256=26vhC0pN7vLXK6R09EEkMK3Lwb9IVPH7LRdqFIQ3kPs,1383
|
76
76
|
upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
|
77
|
-
upgini-1.2.
|
78
|
-
upgini-1.2.
|
79
|
-
upgini-1.2.
|
80
|
-
upgini-1.2.
|
77
|
+
upgini-1.2.120a1.dist-info/METADATA,sha256=Ai4c0bpRvXFgEYB78zVltQNbWv6HpPdc96IAw85kPJI,50745
|
78
|
+
upgini-1.2.120a1.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
|
79
|
+
upgini-1.2.120a1.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
|
80
|
+
upgini-1.2.120a1.dist-info/RECORD,,
|
File without changes
|
File without changes
|