workbench 0.8.173__py3-none-any.whl → 0.8.174__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

workbench/core/artifacts/data_capture_core.py

@@ -2,6 +2,7 @@
 
 import logging
 import re
+import time
 from datetime import datetime
 from typing import Tuple
 import pandas as pd
@@ -14,6 +15,9 @@ from workbench.core.artifacts.endpoint_core import EndpointCore
 from workbench.core.cloud_platform.aws.aws_account_clamp import AWSAccountClamp
 from workbench.utils.monitor_utils import process_data_capture
 
+# Setup logging
+log = logging.getLogger("workbench")
+
 
 class DataCaptureCore:
     """Manages data capture configuration and retrieval for SageMaker endpoints"""
@@ -203,7 +207,7 @@ class DataCaptureCore:
         modes = [opt.get("CaptureMode") for opt in capture_options]
         return ["REQUEST" if m == "Input" else "RESPONSE" for m in modes if m]
 
-    def get_captured_data(self, from_date=None, add_timestamp=True) -> Tuple[pd.DataFrame, pd.DataFrame]:
+    def get_captured_data(self, from_date: str = None, add_timestamp: bool = True) -> Tuple[pd.DataFrame, pd.DataFrame]:
         """
         Read and process captured data from S3.
 
@@ -226,29 +230,58 @@ class DataCaptureCore:
             files = [f for f in files if self._file_date_filter(f, from_date_obj)]
             self.log.info(f"Processing {len(files)} files from {from_date} onwards.")
         else:
-            self.log.info(f"Processing all {len(files)} files.")
+            self.log.info(f"Processing all {len(files)} files...")
         files.sort()
 
-        # Process files
-        all_input_dfs, all_output_dfs = [], []
-        for file_path in files:
+        # Get all timestamps in one batch if needed
+        timestamps = {}
+        if add_timestamp:
+            # Batch describe operation - much more efficient than per-file calls
+            timestamps = wr.s3.describe_objects(path=files)
+
+        # Process files using concurrent.futures
+        start_time = time.time()
+
+        def process_single_file(file_path):
+            """Process a single file and return input/output DataFrames."""
             try:
+                log.debug(f"Processing file: {file_path}...")
                 df = wr.s3.read_json(path=file_path, lines=True)
                 if not df.empty:
                     input_df, output_df = process_data_capture(df)
-                    if add_timestamp:
-                        timestamp = wr.s3.describe_objects(path=file_path)[file_path]["LastModified"]
-                        output_df["timestamp"] = timestamp
-                    all_input_dfs.append(input_df)
-                    all_output_dfs.append(output_df)
+                    if add_timestamp and file_path in timestamps:
+                        output_df["timestamp"] = timestamps[file_path]["LastModified"]
+                    return input_df, output_df
+                return pd.DataFrame(), pd.DataFrame()
             except Exception as e:
                 self.log.warning(f"Error processing {file_path}: {e}")
+                return pd.DataFrame(), pd.DataFrame()
+
+        # Use ThreadPoolExecutor for I/O-bound operations
+        from concurrent.futures import ThreadPoolExecutor
+
+        max_workers = min(32, len(files))  # Cap at 32 threads or number of files
+
+        all_input_dfs, all_output_dfs = [], []
+        with ThreadPoolExecutor(max_workers=max_workers) as executor:
+            futures = [executor.submit(process_single_file, file_path) for file_path in files]
+            for future in futures:
+                input_df, output_df = future.result()
+                if not input_df.empty:
+                    all_input_dfs.append(input_df)
+                if not output_df.empty:
+                    all_output_dfs.append(output_df)
 
         if not all_input_dfs:
            self.log.warning("No valid data was processed.")
            return pd.DataFrame(), pd.DataFrame()
 
-        return pd.concat(all_input_dfs, ignore_index=True), pd.concat(all_output_dfs, ignore_index=True)
+        input_df = pd.concat(all_input_dfs, ignore_index=True)
+        output_df = pd.concat(all_output_dfs, ignore_index=True)
+
+        elapsed_time = time.time() - start_time
+        self.log.info(f"Processed {len(files)} files in {elapsed_time:.2f} seconds.")
+        return input_df, output_df
 
     def _file_date_filter(self, file_path, from_date_obj):
         """Extract date from S3 path and compare with from_date."""
workbench/utils/monitor_utils.py

@@ -76,55 +76,44 @@ def process_data_capture(df: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]:
     Returns:
         tuple[DataFrame, DataFrame]: Input and output DataFrames.
     """
+
+    def parse_endpoint_data(data: dict) -> pd.DataFrame:
+        """Parse endpoint data based on encoding type."""
+        encoding = data["encoding"].upper()
+
+        if encoding == "CSV":
+            return pd.read_csv(StringIO(data["data"]))
+        elif encoding == "JSON":
+            json_data = json.loads(data["data"])
+            if isinstance(json_data, dict):
+                return pd.DataFrame({k: [v] if not isinstance(v, list) else v for k, v in json_data.items()})
+            else:
+                return pd.DataFrame(json_data)
+        else:
+            return None  # Unknown encoding
+
     input_dfs = []
     output_dfs = []
 
-    for idx, row in df.iterrows():
+    # Use itertuples() instead of iterrows() for better performance
+    for row in df.itertuples(index=True):
         try:
-            capture_data = row["captureData"]
+            capture_data = row.captureData
 
             # Process input data if present
             if "endpointInput" in capture_data:
-                input_data = capture_data["endpointInput"]
-                encoding = input_data["encoding"].upper()
-
-                if encoding == "CSV":
-                    input_df = pd.read_csv(StringIO(input_data["data"]))
-                elif encoding == "JSON":
-                    json_data = json.loads(input_data["data"])
-                    if isinstance(json_data, dict):
-                        input_df = pd.DataFrame(
-                            {k: [v] if not isinstance(v, list) else v for k, v in json_data.items()}
-                        )
-                    else:
-                        input_df = pd.DataFrame(json_data)
-                else:
-                    continue  # Skip unknown encodings
-
-                input_dfs.append(input_df)
+                input_df = parse_endpoint_data(capture_data["endpointInput"])
+                if input_df is not None:
+                    input_dfs.append(input_df)
 
             # Process output data if present
             if "endpointOutput" in capture_data:
-                output_data = capture_data["endpointOutput"]
-                encoding = output_data["encoding"].upper()
-
-                if encoding == "CSV":
-                    output_df = pd.read_csv(StringIO(output_data["data"]))
-                elif encoding == "JSON":
-                    json_data = json.loads(output_data["data"])
-                    if isinstance(json_data, dict):
-                        output_df = pd.DataFrame(
-                            {k: [v] if not isinstance(v, list) else v for k, v in json_data.items()}
-                        )
-                    else:
-                        output_df = pd.DataFrame(json_data)
-                else:
-                    continue  # Skip unknown encodings
-
-                output_dfs.append(output_df)
+                output_df = parse_endpoint_data(capture_data["endpointOutput"])
+                if output_df is not None:
+                    output_dfs.append(output_df)
 
         except Exception as e:
-            log.debug(f"Row {idx}: Failed to process row: {e}")
+            log.debug(f"Row {row.Index}: Failed to process row: {e}")
             continue
 
     # Combine and return results
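
Aside: the loop above switches from iterrows() to itertuples(index=True). With itertuples each row arrives as a namedtuple, so columns are read as attributes (row.captureData) and the positional index is exposed as row.Index, which is why the debug message changes from idx to row.Index. A tiny sketch of that access pattern; the toy DataFrame is illustrative only:

import pandas as pd

# Illustrative stand-in for the capture records read from S3
df = pd.DataFrame({"captureData": [{"endpointInput": {"encoding": "CSV", "data": "x\n1\n"}}]})

for row in df.itertuples(index=True):
    # Columns become namedtuple attributes; the index value is row.Index
    print(row.Index, row.captureData["endpointInput"]["encoding"])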
workbench-0.8.174.dist-info/METADATA

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: workbench
-Version: 0.8.173
+Version: 0.8.174
 Summary: Workbench: A Dashboard and Python API for creating and deploying AWS SageMaker Model Pipelines
 Author-email: SuperCowPowers LLC <support@supercowpowers.com>
 License-Expression: MIT
workbench-0.8.174.dist-info/RECORD

@@ -51,7 +51,7 @@ workbench/core/artifacts/__init__.py,sha256=ps7rA_rbWnDbvWbg4kvu--IKMY8WmbPRyv4S
 workbench/core/artifacts/artifact.py,sha256=AtTw8wfMd-fi7cHJHsBAXHUk53kRW_6lyBwwsIbHw54,17750
 workbench/core/artifacts/athena_source.py,sha256=RNmCe7s6uH4gVHpcdJcL84aSbF5Q1ahJBLLGwHYRXEU,26081
 workbench/core/artifacts/cached_artifact_mixin.py,sha256=ngqFLZ4cQx_TFouXZgXZQsv_7W6XCvxVGXXSfzzaft8,3775
-workbench/core/artifacts/data_capture_core.py,sha256=VJL5AcXOx8PxY1Urw0AFm-czqvs55cDiwH_ZTcr2LS0,13207
+workbench/core/artifacts/data_capture_core.py,sha256=jOMLK2N8jst6WBbZ-XEVwnGz2JKiKS5RY84hMv5o_g4,14579
 workbench/core/artifacts/data_source_abstract.py,sha256=5IRCzFVK-17cd4NXPMRfx99vQAmQ0WHE5jcm5RfsVTg,10619
 workbench/core/artifacts/data_source_factory.py,sha256=YL_tA5fsgubbB3dPF6T4tO0rGgz-6oo3ge4i_YXVC-M,2380
 workbench/core/artifacts/endpoint_core.py,sha256=lwgiz0jttW8C4YqcKaA8nf231WI3kol-nLnKcAbFJko,49049
@@ -221,7 +221,7 @@ workbench/utils/license_manager.py,sha256=sDuhk1mZZqUbFmnuFXehyGnui_ALxrmYBg7gYw
 workbench/utils/log_utils.py,sha256=7n1NJXO_jUX82e6LWAQug6oPo3wiPDBYsqk9gsYab_A,3167
 workbench/utils/markdown_utils.py,sha256=4lEqzgG4EVmLcvvKKNUwNxVCySLQKJTJmWDiaDroI1w,8306
 workbench/utils/model_utils.py,sha256=JeEztmFyDJ7yqRozDX0L6apuhLgKx1sgNlO5duB73qc,11938
-workbench/utils/monitor_utils.py,sha256=LbfZImf4tHqYz9J8NnW_ggZP45Has_4QwXHQ-Wi3sLw,8381
+workbench/utils/monitor_utils.py,sha256=kVaJ7BgUXs3VPMFYfLC03wkIV4Dq-pEhoXS0wkJFxCc,7858
 workbench/utils/pandas_utils.py,sha256=uTUx-d1KYfjbS9PMQp2_9FogCV7xVZR6XLzU5YAGmfs,39371
 workbench/utils/performance_utils.py,sha256=WDNvz-bOdC99cDuXl0urAV4DJ7alk_V3yzKPwvqgST4,1329
 workbench/utils/pipeline_utils.py,sha256=yzR5tgAzz6zNqvxzZR6YqsbS7r3QDKzBXozaM_ADXlc,2171
@@ -288,9 +288,9 @@ workbench/web_interface/page_views/main_page.py,sha256=X4-KyGTKLAdxR-Zk2niuLJB2Y
 workbench/web_interface/page_views/models_page_view.py,sha256=M0bdC7bAzLyIaE2jviY12FF4abdMFZmg6sFuOY_LaGI,2650
 workbench/web_interface/page_views/page_view.py,sha256=Gh6YnpOGlUejx-bHZAf5pzqoQ1H1R0OSwOpGhOBO06w,455
 workbench/web_interface/page_views/pipelines_page_view.py,sha256=v2pxrIbsHBcYiblfius3JK766NZ7ciD2yPx0t3E5IJo,2656
-workbench-0.8.173.dist-info/licenses/LICENSE,sha256=z4QMMPlLJkZjU8VOKqJkZiQZCEZ--saIU2Z8-p3aVc0,1080
-workbench-0.8.173.dist-info/METADATA,sha256=b1gas8B3zXhFnVPVFB8vLCeqoeb8brx4rdMXRus-YJo,9210
-workbench-0.8.173.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-workbench-0.8.173.dist-info/entry_points.txt,sha256=zPFPruY9uayk8-wsKrhfnIyIB6jvZOW_ibyllEIsLWo,356
-workbench-0.8.173.dist-info/top_level.txt,sha256=Dhy72zTxaA_o_yRkPZx5zw-fwumnjGaeGf0hBN3jc_w,10
-workbench-0.8.173.dist-info/RECORD,,
+workbench-0.8.174.dist-info/licenses/LICENSE,sha256=z4QMMPlLJkZjU8VOKqJkZiQZCEZ--saIU2Z8-p3aVc0,1080
+workbench-0.8.174.dist-info/METADATA,sha256=ABNffIi526f83ZgwjI3kxSv0yhpeTo3ruM8fwmJL7ks,9210
+workbench-0.8.174.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+workbench-0.8.174.dist-info/entry_points.txt,sha256=zPFPruY9uayk8-wsKrhfnIyIB6jvZOW_ibyllEIsLWo,356
+workbench-0.8.174.dist-info/top_level.txt,sha256=Dhy72zTxaA_o_yRkPZx5zw-fwumnjGaeGf0hBN3jc_w,10
+workbench-0.8.174.dist-info/RECORD,,