workbench 0.8.173__py3-none-any.whl → 0.8.174__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of workbench has been flagged as potentially problematic.
- workbench/core/artifacts/data_capture_core.py +44 -11
- workbench/utils/monitor_utils.py +26 -37
- {workbench-0.8.173.dist-info → workbench-0.8.174.dist-info}/METADATA +1 -1
- {workbench-0.8.173.dist-info → workbench-0.8.174.dist-info}/RECORD +8 -8
- {workbench-0.8.173.dist-info → workbench-0.8.174.dist-info}/WHEEL +0 -0
- {workbench-0.8.173.dist-info → workbench-0.8.174.dist-info}/entry_points.txt +0 -0
- {workbench-0.8.173.dist-info → workbench-0.8.174.dist-info}/licenses/LICENSE +0 -0
- {workbench-0.8.173.dist-info → workbench-0.8.174.dist-info}/top_level.txt +0 -0

workbench/core/artifacts/data_capture_core.py CHANGED

@@ -2,6 +2,7 @@
 
 import logging
 import re
+import time
 from datetime import datetime
 from typing import Tuple
 import pandas as pd
@@ -14,6 +15,9 @@ from workbench.core.artifacts.endpoint_core import EndpointCore
 from workbench.core.cloud_platform.aws.aws_account_clamp import AWSAccountClamp
 from workbench.utils.monitor_utils import process_data_capture
 
+# Setup logging
+log = logging.getLogger("workbench")
+
 
 class DataCaptureCore:
     """Manages data capture configuration and retrieval for SageMaker endpoints"""
@@ -203,7 +207,7 @@ class DataCaptureCore:
         modes = [opt.get("CaptureMode") for opt in capture_options]
         return ["REQUEST" if m == "Input" else "RESPONSE" for m in modes if m]
 
-    def get_captured_data(self, from_date=None, add_timestamp=True) -> Tuple[pd.DataFrame, pd.DataFrame]:
+    def get_captured_data(self, from_date: str = None, add_timestamp: bool = True) -> Tuple[pd.DataFrame, pd.DataFrame]:
         """
         Read and process captured data from S3.
 
@@ -226,29 +230,58 @@ class DataCaptureCore:
             files = [f for f in files if self._file_date_filter(f, from_date_obj)]
             self.log.info(f"Processing {len(files)} files from {from_date} onwards.")
         else:
-            self.log.info(f"Processing all {len(files)} files
+            self.log.info(f"Processing all {len(files)} files...")
         files.sort()
 
-        #
-
-
+        # Get all timestamps in one batch if needed
+        timestamps = {}
+        if add_timestamp:
+            # Batch describe operation - much more efficient than per-file calls
+            timestamps = wr.s3.describe_objects(path=files)
+
+        # Process files using concurrent.futures
+        start_time = time.time()
+
+        def process_single_file(file_path):
+            """Process a single file and return input/output DataFrames."""
             try:
+                log.debug(f"Processing file: {file_path}...")
                 df = wr.s3.read_json(path=file_path, lines=True)
                 if not df.empty:
                     input_df, output_df = process_data_capture(df)
-                    if add_timestamp:
-                        timestamp =
-
-
-                    all_output_dfs.append(output_df)
+                    if add_timestamp and file_path in timestamps:
+                        output_df["timestamp"] = timestamps[file_path]["LastModified"]
+                    return input_df, output_df
+                return pd.DataFrame(), pd.DataFrame()
             except Exception as e:
                 self.log.warning(f"Error processing {file_path}: {e}")
+                return pd.DataFrame(), pd.DataFrame()
+
+        # Use ThreadPoolExecutor for I/O-bound operations
+        from concurrent.futures import ThreadPoolExecutor
+
+        max_workers = min(32, len(files))  # Cap at 32 threads or number of files
+
+        all_input_dfs, all_output_dfs = [], []
+        with ThreadPoolExecutor(max_workers=max_workers) as executor:
+            futures = [executor.submit(process_single_file, file_path) for file_path in files]
+            for future in futures:
+                input_df, output_df = future.result()
+                if not input_df.empty:
+                    all_input_dfs.append(input_df)
+                if not output_df.empty:
+                    all_output_dfs.append(output_df)
 
         if not all_input_dfs:
            self.log.warning("No valid data was processed.")
            return pd.DataFrame(), pd.DataFrame()
 
-
+        input_df = pd.concat(all_input_dfs, ignore_index=True)
+        output_df = pd.concat(all_output_dfs, ignore_index=True)
+
+        elapsed_time = time.time() - start_time
+        self.log.info(f"Processed {len(files)} files in {elapsed_time:.2f} seconds.")
+        return input_df, output_df
 
     def _file_date_filter(self, file_path, from_date_obj):
         """Extract date from S3 path and compare with from_date."""

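The rewritten get_captured_data() batches the S3 metadata lookup into a single wr.s3.describe_objects() call and fans the per-file reads out over a thread pool instead of looping sequentially. For readers who want the shape of that pattern outside the Workbench plumbing, here is a minimal, hypothetical sketch; read_one() is an invented stand-in for the wr.s3.read_json() plus process_data_capture() step and is not part of the package:

    # Minimal sketch of the thread-pool fan-out used in get_captured_data().
    # read_one() is a hypothetical placeholder, not Workbench code.
    import time
    from concurrent.futures import ThreadPoolExecutor
    from typing import List

    import pandas as pd

    def read_one(path: str) -> pd.DataFrame:
        """Placeholder: read and parse one capture file into a (possibly empty) DataFrame."""
        return pd.DataFrame()

    def read_all(paths: List[str]) -> pd.DataFrame:
        start = time.time()
        max_workers = min(32, max(len(paths), 1))  # cap the pool, mirroring the diff
        with ThreadPoolExecutor(max_workers=max_workers) as pool:
            futures = [pool.submit(read_one, p) for p in paths]
            frames = [f.result() for f in futures]
        frames = [df for df in frames if not df.empty]
        combined = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
        print(f"Processed {len(paths)} files in {time.time() - start:.2f} seconds")
        return combined

Because the work is I/O-bound (S3 reads), threads give real concurrency here despite the GIL, and the min(32, ...) cap keeps the pool from oversubscribing on large capture sets.
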
workbench/utils/monitor_utils.py CHANGED

@@ -76,55 +76,44 @@ def process_data_capture(df: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]:
     Returns:
         tuple[DataFrame, DataFrame]: Input and output DataFrames.
     """
+
+    def parse_endpoint_data(data: dict) -> pd.DataFrame:
+        """Parse endpoint data based on encoding type."""
+        encoding = data["encoding"].upper()
+
+        if encoding == "CSV":
+            return pd.read_csv(StringIO(data["data"]))
+        elif encoding == "JSON":
+            json_data = json.loads(data["data"])
+            if isinstance(json_data, dict):
+                return pd.DataFrame({k: [v] if not isinstance(v, list) else v for k, v in json_data.items()})
+            else:
+                return pd.DataFrame(json_data)
+        else:
+            return None  # Unknown encoding
+
     input_dfs = []
     output_dfs = []
 
-
+    # Use itertuples() instead of iterrows() for better performance
+    for row in df.itertuples(index=True):
         try:
-            capture_data = row
+            capture_data = row.captureData
 
             # Process input data if present
             if "endpointInput" in capture_data:
-
-
-
-                if encoding == "CSV":
-                    input_df = pd.read_csv(StringIO(input_data["data"]))
-                elif encoding == "JSON":
-                    json_data = json.loads(input_data["data"])
-                    if isinstance(json_data, dict):
-                        input_df = pd.DataFrame(
-                            {k: [v] if not isinstance(v, list) else v for k, v in json_data.items()}
-                        )
-                    else:
-                        input_df = pd.DataFrame(json_data)
-                else:
-                    continue  # Skip unknown encodings
-
-                input_dfs.append(input_df)
+                input_df = parse_endpoint_data(capture_data["endpointInput"])
+                if input_df is not None:
+                    input_dfs.append(input_df)
 
             # Process output data if present
             if "endpointOutput" in capture_data:
-
-
-
-                if encoding == "CSV":
-                    output_df = pd.read_csv(StringIO(output_data["data"]))
-                elif encoding == "JSON":
-                    json_data = json.loads(output_data["data"])
-                    if isinstance(json_data, dict):
-                        output_df = pd.DataFrame(
-                            {k: [v] if not isinstance(v, list) else v for k, v in json_data.items()}
-                        )
-                    else:
-                        output_df = pd.DataFrame(json_data)
-                else:
-                    continue  # Skip unknown encodings
-
-                output_dfs.append(output_df)
+                output_df = parse_endpoint_data(capture_data["endpointOutput"])
+                if output_df is not None:
+                    output_dfs.append(output_df)
 
         except Exception as e:
-            log.debug(f"Row {
+            log.debug(f"Row {row.Index}: Failed to process row: {e}")
             continue
 
     # Combine and return results

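The new parse_endpoint_data() helper folds the duplicated CSV/JSON branches into one function keyed on each record's encoding field, and the loop now walks rows with itertuples() and reads row.captureData directly. A hedged usage sketch: the record shape below follows SageMaker-style data capture (a captureData dict with endpointInput/endpointOutput entries, each carrying data and encoding), and the field values are invented for illustration:

    # Illustrative only: values are made up; the record shape assumes
    # SageMaker-style data capture (captureData -> endpointInput / endpointOutput).
    import pandas as pd

    from workbench.utils.monitor_utils import process_data_capture

    row = {
        "captureData": {
            "endpointInput": {"data": "feat_1,feat_2\n1.0,2.0", "encoding": "CSV"},
            "endpointOutput": {"data": '{"prediction": 0.87}', "encoding": "JSON"},
        }
    }
    input_df, output_df = process_data_capture(pd.DataFrame([row]))
    # input_df  -> one row with columns feat_1, feat_2
    # output_df -> one row with column prediction

Records whose encoding is neither CSV nor JSON come back from the helper as None and are skipped, matching the old code's continue behavior.
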
{workbench-0.8.173.dist-info → workbench-0.8.174.dist-info}/METADATA CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: workbench
-Version: 0.8.173
+Version: 0.8.174
 Summary: Workbench: A Dashboard and Python API for creating and deploying AWS SageMaker Model Pipelines
 Author-email: SuperCowPowers LLC <support@supercowpowers.com>
 License-Expression: MIT

{workbench-0.8.173.dist-info → workbench-0.8.174.dist-info}/RECORD CHANGED

@@ -51,7 +51,7 @@ workbench/core/artifacts/__init__.py,sha256=ps7rA_rbWnDbvWbg4kvu--IKMY8WmbPRyv4S
 workbench/core/artifacts/artifact.py,sha256=AtTw8wfMd-fi7cHJHsBAXHUk53kRW_6lyBwwsIbHw54,17750
 workbench/core/artifacts/athena_source.py,sha256=RNmCe7s6uH4gVHpcdJcL84aSbF5Q1ahJBLLGwHYRXEU,26081
 workbench/core/artifacts/cached_artifact_mixin.py,sha256=ngqFLZ4cQx_TFouXZgXZQsv_7W6XCvxVGXXSfzzaft8,3775
-workbench/core/artifacts/data_capture_core.py,sha256=
+workbench/core/artifacts/data_capture_core.py,sha256=jOMLK2N8jst6WBbZ-XEVwnGz2JKiKS5RY84hMv5o_g4,14579
 workbench/core/artifacts/data_source_abstract.py,sha256=5IRCzFVK-17cd4NXPMRfx99vQAmQ0WHE5jcm5RfsVTg,10619
 workbench/core/artifacts/data_source_factory.py,sha256=YL_tA5fsgubbB3dPF6T4tO0rGgz-6oo3ge4i_YXVC-M,2380
 workbench/core/artifacts/endpoint_core.py,sha256=lwgiz0jttW8C4YqcKaA8nf231WI3kol-nLnKcAbFJko,49049
@@ -221,7 +221,7 @@ workbench/utils/license_manager.py,sha256=sDuhk1mZZqUbFmnuFXehyGnui_ALxrmYBg7gYw
 workbench/utils/log_utils.py,sha256=7n1NJXO_jUX82e6LWAQug6oPo3wiPDBYsqk9gsYab_A,3167
 workbench/utils/markdown_utils.py,sha256=4lEqzgG4EVmLcvvKKNUwNxVCySLQKJTJmWDiaDroI1w,8306
 workbench/utils/model_utils.py,sha256=JeEztmFyDJ7yqRozDX0L6apuhLgKx1sgNlO5duB73qc,11938
-workbench/utils/monitor_utils.py,sha256=
+workbench/utils/monitor_utils.py,sha256=kVaJ7BgUXs3VPMFYfLC03wkIV4Dq-pEhoXS0wkJFxCc,7858
 workbench/utils/pandas_utils.py,sha256=uTUx-d1KYfjbS9PMQp2_9FogCV7xVZR6XLzU5YAGmfs,39371
 workbench/utils/performance_utils.py,sha256=WDNvz-bOdC99cDuXl0urAV4DJ7alk_V3yzKPwvqgST4,1329
 workbench/utils/pipeline_utils.py,sha256=yzR5tgAzz6zNqvxzZR6YqsbS7r3QDKzBXozaM_ADXlc,2171
@@ -288,9 +288,9 @@ workbench/web_interface/page_views/main_page.py,sha256=X4-KyGTKLAdxR-Zk2niuLJB2Y
 workbench/web_interface/page_views/models_page_view.py,sha256=M0bdC7bAzLyIaE2jviY12FF4abdMFZmg6sFuOY_LaGI,2650
 workbench/web_interface/page_views/page_view.py,sha256=Gh6YnpOGlUejx-bHZAf5pzqoQ1H1R0OSwOpGhOBO06w,455
 workbench/web_interface/page_views/pipelines_page_view.py,sha256=v2pxrIbsHBcYiblfius3JK766NZ7ciD2yPx0t3E5IJo,2656
-workbench-0.8.173.dist-info/licenses/LICENSE,sha256=
-workbench-0.8.173.dist-info/METADATA,sha256=
-workbench-0.8.173.dist-info/WHEEL,sha256=
-workbench-0.8.173.dist-info/entry_points.txt,sha256=
-workbench-0.8.173.dist-info/top_level.txt,sha256=
-workbench-0.8.173.dist-info/RECORD,,
+workbench-0.8.174.dist-info/licenses/LICENSE,sha256=z4QMMPlLJkZjU8VOKqJkZiQZCEZ--saIU2Z8-p3aVc0,1080
+workbench-0.8.174.dist-info/METADATA,sha256=ABNffIi526f83ZgwjI3kxSv0yhpeTo3ruM8fwmJL7ks,9210
+workbench-0.8.174.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+workbench-0.8.174.dist-info/entry_points.txt,sha256=zPFPruY9uayk8-wsKrhfnIyIB6jvZOW_ibyllEIsLWo,356
+workbench-0.8.174.dist-info/top_level.txt,sha256=Dhy72zTxaA_o_yRkPZx5zw-fwumnjGaeGf0hBN3jc_w,10
+workbench-0.8.174.dist-info/RECORD,,

The remaining dist-info files (WHEEL, entry_points.txt, licenses/LICENSE, top_level.txt) are unchanged between 0.8.173 and 0.8.174.