workbench 0.8.173__py3-none-any.whl → 0.8.174__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of workbench has been flagged as potentially problematic.
- workbench/core/artifacts/data_capture_core.py +44 -11
- workbench/utils/monitor_utils.py +26 -37
- {workbench-0.8.173.dist-info → workbench-0.8.174.dist-info}/METADATA +1 -1
- {workbench-0.8.173.dist-info → workbench-0.8.174.dist-info}/RECORD +8 -8
- {workbench-0.8.173.dist-info → workbench-0.8.174.dist-info}/WHEEL +0 -0
- {workbench-0.8.173.dist-info → workbench-0.8.174.dist-info}/entry_points.txt +0 -0
- {workbench-0.8.173.dist-info → workbench-0.8.174.dist-info}/licenses/LICENSE +0 -0
- {workbench-0.8.173.dist-info → workbench-0.8.174.dist-info}/top_level.txt +0 -0

workbench/core/artifacts/data_capture_core.py CHANGED

@@ -2,6 +2,7 @@
 
 import logging
 import re
+import time
 from datetime import datetime
 from typing import Tuple
 import pandas as pd
@@ -14,6 +15,9 @@ from workbench.core.artifacts.endpoint_core import EndpointCore
 from workbench.core.cloud_platform.aws.aws_account_clamp import AWSAccountClamp
 from workbench.utils.monitor_utils import process_data_capture
 
+# Setup logging
+log = logging.getLogger("workbench")
+
 
 class DataCaptureCore:
     """Manages data capture configuration and retrieval for SageMaker endpoints"""
@@ -203,7 +207,7 @@ class DataCaptureCore:
         modes = [opt.get("CaptureMode") for opt in capture_options]
         return ["REQUEST" if m == "Input" else "RESPONSE" for m in modes if m]
 
-    def get_captured_data(self, from_date=None, add_timestamp=True) -> Tuple[pd.DataFrame, pd.DataFrame]:
+    def get_captured_data(self, from_date: str = None, add_timestamp: bool = True) -> Tuple[pd.DataFrame, pd.DataFrame]:
         """
         Read and process captured data from S3.
 
@@ -226,29 +230,58 @@ class DataCaptureCore:
             files = [f for f in files if self._file_date_filter(f, from_date_obj)]
             self.log.info(f"Processing {len(files)} files from {from_date} onwards.")
         else:
-            self.log.info(f"Processing all {len(files)} files
+            self.log.info(f"Processing all {len(files)} files...")
         files.sort()
 
-        #
-
-
+        # Get all timestamps in one batch if needed
+        timestamps = {}
+        if add_timestamp:
+            # Batch describe operation - much more efficient than per-file calls
+            timestamps = wr.s3.describe_objects(path=files)
+
+        # Process files using concurrent.futures
+        start_time = time.time()
+
+        def process_single_file(file_path):
+            """Process a single file and return input/output DataFrames."""
             try:
+                log.debug(f"Processing file: {file_path}...")
                 df = wr.s3.read_json(path=file_path, lines=True)
                 if not df.empty:
                     input_df, output_df = process_data_capture(df)
-                    if add_timestamp:
-                        timestamp =
-
-
-                    all_output_dfs.append(output_df)
+                    if add_timestamp and file_path in timestamps:
+                        output_df["timestamp"] = timestamps[file_path]["LastModified"]
+                    return input_df, output_df
+                return pd.DataFrame(), pd.DataFrame()
             except Exception as e:
                 self.log.warning(f"Error processing {file_path}: {e}")
+                return pd.DataFrame(), pd.DataFrame()
+
+        # Use ThreadPoolExecutor for I/O-bound operations
+        from concurrent.futures import ThreadPoolExecutor
+
+        max_workers = min(32, len(files))  # Cap at 32 threads or number of files
+
+        all_input_dfs, all_output_dfs = [], []
+        with ThreadPoolExecutor(max_workers=max_workers) as executor:
+            futures = [executor.submit(process_single_file, file_path) for file_path in files]
+            for future in futures:
+                input_df, output_df = future.result()
+                if not input_df.empty:
+                    all_input_dfs.append(input_df)
+                if not output_df.empty:
+                    all_output_dfs.append(output_df)
 
         if not all_input_dfs:
            self.log.warning("No valid data was processed.")
            return pd.DataFrame(), pd.DataFrame()
 
-
+        input_df = pd.concat(all_input_dfs, ignore_index=True)
+        output_df = pd.concat(all_output_dfs, ignore_index=True)
+
+        elapsed_time = time.time() - start_time
+        self.log.info(f"Processed {len(files)} files in {elapsed_time:.2f} seconds.")
+        return input_df, output_df
 
     def _file_date_filter(self, file_path, from_date_obj):
         """Extract date from S3 path and compare with from_date."""

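The rewritten get_captured_data() batches the S3 metadata lookup into a single wr.s3.describe_objects() call and fans the per-file reads out over a thread pool instead of looping sequentially. For readers who want the shape of that pattern outside the Workbench plumbing, here is a minimal, hypothetical sketch; read_one() is an invented stand-in for the wr.s3.read_json() plus process_data_capture() step and is not part of the package:

    # Minimal sketch of the thread-pool fan-out used in get_captured_data().
    # read_one() is a hypothetical placeholder, not Workbench code.
    import time
    from concurrent.futures import ThreadPoolExecutor
    from typing import List

    import pandas as pd

    def read_one(path: str) -> pd.DataFrame:
        """Placeholder: read and parse one capture file into a (possibly empty) DataFrame."""
        return pd.DataFrame()

    def read_all(paths: List[str]) -> pd.DataFrame:
        start = time.time()
        max_workers = min(32, max(len(paths), 1))  # cap the pool, mirroring the diff
        with ThreadPoolExecutor(max_workers=max_workers) as pool:
            futures = [pool.submit(read_one, p) for p in paths]
            frames = [f.result() for f in futures]
        frames = [df for df in frames if not df.empty]
        combined = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
        print(f"Processed {len(paths)} files in {time.time() - start:.2f} seconds")
        return combined

Because the work is I/O-bound (S3 reads), threads give real concurrency here despite the GIL, and the min(32, ...) cap keeps the pool from oversubscribing on large capture sets.
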
workbench/utils/monitor_utils.py CHANGED

@@ -76,55 +76,44 @@ def process_data_capture(df: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]:
     Returns:
         tuple[DataFrame, DataFrame]: Input and output DataFrames.
     """
+
+    def parse_endpoint_data(data: dict) -> pd.DataFrame:
+        """Parse endpoint data based on encoding type."""
+        encoding = data["encoding"].upper()
+
+        if encoding == "CSV":
+            return pd.read_csv(StringIO(data["data"]))
+        elif encoding == "JSON":
+            json_data = json.loads(data["data"])
+            if isinstance(json_data, dict):
+                return pd.DataFrame({k: [v] if not isinstance(v, list) else v for k, v in json_data.items()})
+            else:
+                return pd.DataFrame(json_data)
+        else:
+            return None  # Unknown encoding
+
     input_dfs = []
     output_dfs = []
 
-
+    # Use itertuples() instead of iterrows() for better performance
+    for row in df.itertuples(index=True):
         try:
-            capture_data = row
+            capture_data = row.captureData
 
             # Process input data if present
             if "endpointInput" in capture_data:
-
-
-
-                if encoding == "CSV":
-                    input_df = pd.read_csv(StringIO(input_data["data"]))
-                elif encoding == "JSON":
-                    json_data = json.loads(input_data["data"])
-                    if isinstance(json_data, dict):
-                        input_df = pd.DataFrame(
-                            {k: [v] if not isinstance(v, list) else v for k, v in json_data.items()}
-                        )
-                    else:
-                        input_df = pd.DataFrame(json_data)
-                else:
-                    continue  # Skip unknown encodings
-
-                input_dfs.append(input_df)
+                input_df = parse_endpoint_data(capture_data["endpointInput"])
+                if input_df is not None:
+                    input_dfs.append(input_df)
 
             # Process output data if present
             if "endpointOutput" in capture_data:
-
-
-
-                if encoding == "CSV":
-                    output_df = pd.read_csv(StringIO(output_data["data"]))
-                elif encoding == "JSON":
-                    json_data = json.loads(output_data["data"])
-                    if isinstance(json_data, dict):
-                        output_df = pd.DataFrame(
-                            {k: [v] if not isinstance(v, list) else v for k, v in json_data.items()}
-                        )
-                    else:
-                        output_df = pd.DataFrame(json_data)
-                else:
-                    continue  # Skip unknown encodings
-
-                output_dfs.append(output_df)
+                output_df = parse_endpoint_data(capture_data["endpointOutput"])
+                if output_df is not None:
+                    output_dfs.append(output_df)
 
         except Exception as e:
-            log.debug(f"Row {
+            log.debug(f"Row {row.Index}: Failed to process row: {e}")
             continue
 
     # Combine and return results

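The new parse_endpoint_data() helper folds the duplicated CSV/JSON branches into one function keyed on each record's encoding field, and the loop now walks rows with itertuples() and reads row.captureData directly. A hedged usage sketch: the record shape below follows SageMaker-style data capture (a captureData dict with endpointInput/endpointOutput entries, each carrying data and encoding), and the field values are invented for illustration:

    # Illustrative only: values are made up; the record shape assumes
    # SageMaker-style data capture (captureData -> endpointInput / endpointOutput).
    import pandas as pd

    from workbench.utils.monitor_utils import process_data_capture

    row = {
        "captureData": {
            "endpointInput": {"data": "feat_1,feat_2\n1.0,2.0", "encoding": "CSV"},
            "endpointOutput": {"data": '{"prediction": 0.87}', "encoding": "JSON"},
        }
    }
    input_df, output_df = process_data_capture(pd.DataFrame([row]))
    # input_df  -> one row with columns feat_1, feat_2
    # output_df -> one row with column prediction

Records whose encoding is neither CSV nor JSON come back from the helper as None and are skipped, matching the old code's continue behavior.
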
{workbench-0.8.173.dist-info → workbench-0.8.174.dist-info}/METADATA CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: workbench
-Version: 0.8.173
+Version: 0.8.174
 Summary: Workbench: A Dashboard and Python API for creating and deploying AWS SageMaker Model Pipelines
 Author-email: SuperCowPowers LLC <support@supercowpowers.com>
 License-Expression: MIT

{workbench-0.8.173.dist-info → workbench-0.8.174.dist-info}/RECORD CHANGED

@@ -51,7 +51,7 @@ workbench/core/artifacts/__init__.py,sha256=ps7rA_rbWnDbvWbg4kvu--IKMY8WmbPRyv4S
 workbench/core/artifacts/artifact.py,sha256=AtTw8wfMd-fi7cHJHsBAXHUk53kRW_6lyBwwsIbHw54,17750
 workbench/core/artifacts/athena_source.py,sha256=RNmCe7s6uH4gVHpcdJcL84aSbF5Q1ahJBLLGwHYRXEU,26081
 workbench/core/artifacts/cached_artifact_mixin.py,sha256=ngqFLZ4cQx_TFouXZgXZQsv_7W6XCvxVGXXSfzzaft8,3775
-workbench/core/artifacts/data_capture_core.py,sha256=
+workbench/core/artifacts/data_capture_core.py,sha256=jOMLK2N8jst6WBbZ-XEVwnGz2JKiKS5RY84hMv5o_g4,14579
 workbench/core/artifacts/data_source_abstract.py,sha256=5IRCzFVK-17cd4NXPMRfx99vQAmQ0WHE5jcm5RfsVTg,10619
 workbench/core/artifacts/data_source_factory.py,sha256=YL_tA5fsgubbB3dPF6T4tO0rGgz-6oo3ge4i_YXVC-M,2380
 workbench/core/artifacts/endpoint_core.py,sha256=lwgiz0jttW8C4YqcKaA8nf231WI3kol-nLnKcAbFJko,49049
@@ -221,7 +221,7 @@ workbench/utils/license_manager.py,sha256=sDuhk1mZZqUbFmnuFXehyGnui_ALxrmYBg7gYw
 workbench/utils/log_utils.py,sha256=7n1NJXO_jUX82e6LWAQug6oPo3wiPDBYsqk9gsYab_A,3167
 workbench/utils/markdown_utils.py,sha256=4lEqzgG4EVmLcvvKKNUwNxVCySLQKJTJmWDiaDroI1w,8306
 workbench/utils/model_utils.py,sha256=JeEztmFyDJ7yqRozDX0L6apuhLgKx1sgNlO5duB73qc,11938
-workbench/utils/monitor_utils.py,sha256=
+workbench/utils/monitor_utils.py,sha256=kVaJ7BgUXs3VPMFYfLC03wkIV4Dq-pEhoXS0wkJFxCc,7858
 workbench/utils/pandas_utils.py,sha256=uTUx-d1KYfjbS9PMQp2_9FogCV7xVZR6XLzU5YAGmfs,39371
 workbench/utils/performance_utils.py,sha256=WDNvz-bOdC99cDuXl0urAV4DJ7alk_V3yzKPwvqgST4,1329
 workbench/utils/pipeline_utils.py,sha256=yzR5tgAzz6zNqvxzZR6YqsbS7r3QDKzBXozaM_ADXlc,2171
@@ -288,9 +288,9 @@ workbench/web_interface/page_views/main_page.py,sha256=X4-KyGTKLAdxR-Zk2niuLJB2Y
 workbench/web_interface/page_views/models_page_view.py,sha256=M0bdC7bAzLyIaE2jviY12FF4abdMFZmg6sFuOY_LaGI,2650
 workbench/web_interface/page_views/page_view.py,sha256=Gh6YnpOGlUejx-bHZAf5pzqoQ1H1R0OSwOpGhOBO06w,455
 workbench/web_interface/page_views/pipelines_page_view.py,sha256=v2pxrIbsHBcYiblfius3JK766NZ7ciD2yPx0t3E5IJo,2656
-workbench-0.8.173.dist-info/licenses/LICENSE,sha256=
-workbench-0.8.173.dist-info/METADATA,sha256=
-workbench-0.8.173.dist-info/WHEEL,sha256=
-workbench-0.8.173.dist-info/entry_points.txt,sha256=
-workbench-0.8.173.dist-info/top_level.txt,sha256=
-workbench-0.8.173.dist-info/RECORD,,
+workbench-0.8.174.dist-info/licenses/LICENSE,sha256=z4QMMPlLJkZjU8VOKqJkZiQZCEZ--saIU2Z8-p3aVc0,1080
+workbench-0.8.174.dist-info/METADATA,sha256=ABNffIi526f83ZgwjI3kxSv0yhpeTo3ruM8fwmJL7ks,9210
+workbench-0.8.174.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+workbench-0.8.174.dist-info/entry_points.txt,sha256=zPFPruY9uayk8-wsKrhfnIyIB6jvZOW_ibyllEIsLWo,356
+workbench-0.8.174.dist-info/top_level.txt,sha256=Dhy72zTxaA_o_yRkPZx5zw-fwumnjGaeGf0hBN3jc_w,10
+workbench-0.8.174.dist-info/RECORD,,

The remaining dist-info files (WHEEL, entry_points.txt, licenses/LICENSE, top_level.txt) are unchanged between 0.8.173 and 0.8.174.