ragaai-catalyst 2.1.5b0__py3-none-any.whl → 2.1.5b2__py3-none-any.whl
This diff compares the contents of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.
- ragaai_catalyst/dataset.py +330 -0
- ragaai_catalyst/tracers/agentic_tracing/tracers/agent_tracer.py +20 -4
- ragaai_catalyst/tracers/agentic_tracing/tracers/base.py +36 -113
- ragaai_catalyst/tracers/agentic_tracing/tracers/custom_tracer.py +17 -2
- ragaai_catalyst/tracers/agentic_tracing/tracers/llm_tracer.py +52 -1
- ragaai_catalyst/tracers/agentic_tracing/tracers/main_tracer.py +16 -19
- ragaai_catalyst/tracers/agentic_tracing/tracers/tool_tracer.py +37 -3
- ragaai_catalyst/tracers/agentic_tracing/utils/model_costs.json +17 -1
- ragaai_catalyst/tracers/agentic_tracing/utils/system_monitor.py +215 -0
- ragaai_catalyst/tracers/distributed.py +46 -19
- ragaai_catalyst/tracers/tracer.py +2 -2
- {ragaai_catalyst-2.1.5b0.dist-info → ragaai_catalyst-2.1.5b2.dist-info}/METADATA +1 -1
- {ragaai_catalyst-2.1.5b0.dist-info → ragaai_catalyst-2.1.5b2.dist-info}/RECORD +16 -15
- {ragaai_catalyst-2.1.5b0.dist-info → ragaai_catalyst-2.1.5b2.dist-info}/LICENSE +0 -0
- {ragaai_catalyst-2.1.5b0.dist-info → ragaai_catalyst-2.1.5b2.dist-info}/WHEEL +0 -0
- {ragaai_catalyst-2.1.5b0.dist-info → ragaai_catalyst-2.1.5b2.dist-info}/top_level.txt +0 -0
ragaai_catalyst/dataset.py
CHANGED
@@ -1,4 +1,5 @@
 import os
+import json
 import requests
 from .utils import response_checker
 from typing import Union
@@ -271,3 +272,332 @@ class Dataset:
         except Exception as e:
             logger.error(f"Error in create_from_csv: {e}")
             raise
+
+    def add_rows(self, csv_path, dataset_name):
+        """
+        Add rows to an existing dataset from a CSV file.
+
+        Args:
+            csv_path (str): Path to the CSV file to be added
+            dataset_name (str): Name of the existing dataset to add rows to
+
+        Raises:
+            ValueError: If dataset does not exist or columns are incompatible
+        """
+        # Get existing dataset columns
+        existing_columns = self.get_dataset_columns(dataset_name)
+
+        # Read the CSV file to check columns
+        try:
+            import pandas as pd
+            df = pd.read_csv(csv_path)
+            csv_columns = df.columns.tolist()
+        except Exception as e:
+            logger.error(f"Failed to read CSV file: {e}")
+            raise ValueError(f"Unable to read CSV file: {e}")
+
+        # Check column compatibility
+        for column in existing_columns:
+            if column not in csv_columns:
+                df[column] = None
+
+        # Get presigned URL for the CSV
+        def get_presignedUrl():
+            headers = {
+                "Authorization": f"Bearer {os.getenv('RAGAAI_CATALYST_TOKEN')}",
+                "X-Project-Id": str(self.project_id),
+            }
+            try:
+                response = requests.get(
+                    f"{Dataset.BASE_URL}/v2/llm/dataset/csv/presigned-url",
+                    headers=headers,
+                    timeout=Dataset.TIMEOUT,
+                )
+                response.raise_for_status()
+                return response.json()
+            except requests.exceptions.RequestException as e:
+                logger.error(f"Failed to get presigned URL: {e}")
+                raise
+
+        try:
+            presignedUrl = get_presignedUrl()
+            if presignedUrl['success']:
+                url = presignedUrl['data']['presignedUrl']
+                filename = presignedUrl['data']['fileName']
+            else:
+                raise ValueError('Unable to fetch presignedUrl')
+        except Exception as e:
+            logger.error(f"Error in get_presignedUrl: {e}")
+            raise
+
+        # Upload CSV to presigned URL
+        def put_csv_to_presignedUrl(url):
+            headers = {
+                'Content-Type': 'text/csv',
+                'x-ms-blob-type': 'BlockBlob',
+            }
+            try:
+                with open(csv_path, 'rb') as file:
+                    response = requests.put(
+                        url,
+                        headers=headers,
+                        data=file,
+                        timeout=Dataset.TIMEOUT,
+                    )
+                    response.raise_for_status()
+                    return response
+            except requests.exceptions.RequestException as e:
+                logger.error(f"Failed to put CSV to presigned URL: {e}")
+                raise
+
+        try:
+            put_csv_response = put_csv_to_presignedUrl(url)
+            if put_csv_response.status_code not in (200, 201):
+                raise ValueError('Unable to put csv to the presignedUrl')
+        except Exception as e:
+            logger.error(f"Error in put_csv_to_presignedUrl: {e}")
+            raise
+
+        # Prepare schema mapping (assuming same mapping as original dataset)
+        def generate_schema_mapping(dataset_name):
+            headers = {
+                'Content-Type': 'application/json',
+                "Authorization": f"Bearer {os.getenv('RAGAAI_CATALYST_TOKEN')}",
+                "X-Project-Id": str(self.project_id),
+            }
+            json_data = {
+                "size": 12,
+                "page": "0",
+                "projectId": str(self.project_id),
+                "search": ""
+            }
+            try:
+                # First get dataset details
+                response = requests.post(
+                    f"{Dataset.BASE_URL}/v2/llm/dataset",
+                    headers=headers,
+                    json=json_data,
+                    timeout=Dataset.TIMEOUT,
+                )
+                response.raise_for_status()
+                datasets = response.json()["data"]["content"]
+                dataset_id = [dataset["id"] for dataset in datasets if dataset["name"]==dataset_name][0]
+
+                # Get dataset details to extract schema mapping
+                response = requests.get(
+                    f"{Dataset.BASE_URL}/v2/llm/dataset/{dataset_id}?initialCols=0",
+                    headers=headers,
+                    timeout=Dataset.TIMEOUT,
+                )
+                response.raise_for_status()
+
+                # Extract schema mapping
+                schema_mapping = {}
+                for col in response.json()["data"]["datasetColumnsResponses"]:
+                    schema_mapping[col["displayName"]] = {"columnType": col["columnType"]}
+
+                return schema_mapping
+            except requests.exceptions.RequestException as e:
+                logger.error(f"Failed to get schema mapping: {e}")
+                raise
+
+        # Upload CSV to elastic
+        try:
+            schema_mapping = generate_schema_mapping(dataset_name)
+
+            data = {
+                "projectId": str(self.project_id),
+                "datasetName": dataset_name,
+                "fileName": filename,
+                "schemaMapping": schema_mapping,
+                "opType": "update",  # Use update for adding rows
+                "description": "Adding new rows to dataset"
+            }
+
+            headers = {
+                'Content-Type': 'application/json',
+                'Authorization': f"Bearer {os.getenv('RAGAAI_CATALYST_TOKEN')}",
+                "X-Project-Id": str(self.project_id)
+            }
+
+            response = requests.post(
+                f"{Dataset.BASE_URL}/v2/llm/dataset/csv",
+                headers=headers,
+                json=data,
+                timeout=Dataset.TIMEOUT,
+            )
+
+            if response.status_code == 400:
+                raise ValueError(response.json().get("message", "Failed to add rows"))
+
+            response.raise_for_status()
+
+            # Check response
+            response_data = response.json()
+            if response_data.get('success', False):
+                print(f"{response_data['message']}")
+            else:
+                raise ValueError(response_data.get('message', 'Failed to add rows'))
+
+        except Exception as e:
+            logger.error(f"Error in add_rows_to_dataset: {e}")
+            raise
+
+    def add_columns(self, text_fields, dataset_name, column_name, provider, model, variables={}):
+        """
+        Add a column to a dataset with dynamically fetched model parameters
+
+        Args:
+            project_id (int): Project ID
+            dataset_id (int): Dataset ID
+            column_name (str): Name of the new column
+            provider (str): Name of the model provider
+            model (str): Name of the model
+        """
+        # First, get model parameters
+
+        # Validate text_fields input
+        if not isinstance(text_fields, list):
+            raise ValueError("text_fields must be a list of dictionaries")
+
+        for field in text_fields:
+            if not isinstance(field, dict) or 'role' not in field or 'content' not in field:
+                raise ValueError("Each text field must be a dictionary with 'role' and 'content' keys")
+
+        # First, get the dataset ID
+        headers = {
+            'Content-Type': 'application/json',
+            "Authorization": f"Bearer {os.getenv('RAGAAI_CATALYST_TOKEN')}",
+            "X-Project-Id": str(self.project_id),
+        }
+        json_data = {"size": 12, "page": "0", "projectId": str(self.project_id), "search": ""}
+
+        try:
+            # Get dataset list
+            response = requests.post(
+                f"{Dataset.BASE_URL}/v2/llm/dataset",
+                headers=headers,
+                json=json_data,
+                timeout=Dataset.TIMEOUT,
+            )
+            response.raise_for_status()
+            datasets = response.json()["data"]["content"]
+
+            # Find dataset ID
+            dataset_id = next((dataset["id"] for dataset in datasets if dataset["name"] == dataset_name), None)
+
+            if dataset_id is None:
+                raise ValueError(f"Dataset {dataset_name} not found")
+
+            parameters_url = f"{Dataset.BASE_URL}/playground/providers/models/parameters/list"
+
+            headers = {
+                'Content-Type': 'application/json',
+                "Authorization": f"Bearer {os.getenv('RAGAAI_CATALYST_TOKEN')}",
+                "X-Project-Id": str(self.project_id),
+            }
+
+            # Fetch model parameters
+            parameters_payload = {
+                "providerName": provider,
+                "modelName": model
+            }
+
+            # Get model parameters
+            params_response = requests.post(
+                parameters_url,
+                headers=headers,
+                json=parameters_payload,
+                timeout=30
+            )
+            params_response.raise_for_status()
+
+            # Extract parameters
+            all_parameters = params_response.json().get('data', [])
+
+            # Filter and transform parameters for add-column API
+            formatted_parameters = []
+            for param in all_parameters:
+                value = param.get('value')
+                param_type = param.get('type')
+
+                if value is None:
+                    formatted_param = {
+                        "name": param.get('name'),
+                        "value": None,  # Pass None if the value is null
+                        "type": param.get('type')
+                    }
+                else:
+                    # Improved type handling
+                    if param_type == "float":
+                        value = float(value)  # Ensure value is converted to float
+                    elif param_type == "int":
+                        value = int(value)  # Ensure value is converted to int
+                    elif param_type == "bool":
+                        value = bool(value)  # Ensure value is converted to bool
+                    elif param_type == "string":
+                        value = str(value)  # Ensure value is converted to string
+                    else:
+                        raise ValueError(f"Unsupported parameter type: {param_type}")  # Handle unsupported types
+
+                    formatted_param = {
+                        "name": param.get('name'),
+                        "value": value,
+                        "type": param.get('type')
+                    }
+                formatted_parameters.append(formatted_param)
+            dataset_id = next((dataset["id"] for dataset in datasets if dataset["name"] == dataset_name), None)
+
+            # Prepare payload for add column API
+            add_column_payload = {
+                "rowFilterList": [],
+                "columnName": column_name,
+                "datasetId": dataset_id,
+                "variables": variables,
+                "promptTemplate": {
+                    "textFields": text_fields,
+                    "modelSpecs": {
+                        "model": f"{provider}/{model}",
+                        "parameters": formatted_parameters
+                    }
+                }
+            }
+            if variables:
+                variable_specs = []
+                for key, values in variables.items():
+                    variable_specs.append({
+                        "name": key,
+                        "type": "string",
+                        "schema": "query"
+                    })
+                add_column_payload["promptTemplate"]["variableSpecs"] = variable_specs
+
+            # Make API call to add column
+            add_column_url = f"{Dataset.BASE_URL}/v2/llm/dataset/add-column"
+
+            response = requests.post(
+                add_column_url,
+                headers={
+                    'Content-Type': 'application/json',
+                    'Authorization': f"Bearer {os.getenv('RAGAAI_CATALYST_TOKEN')}",
+                    "X-Project-Id": str(self.project_id)
+                },
+                json=add_column_payload,
+                timeout=30
+            )
+
+            # Check response
+            response.raise_for_status()
+            response_data = response.json()
+
+            if response_data.get('success', False):
+                print(f"Column '{column_name}' added successfully to dataset '{dataset_name}'")
+            else:
+                raise ValueError(response_data.get('message', 'Failed to add column'))
+
+        except requests.exceptions.RequestException as e:
+            print(f"Error adding column: {e}")
+            raise
+
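The two methods above are new public API on `Dataset`. A minimal usage sketch follows; the constructor argument is assumed from the rest of the package rather than this diff, and the project name, dataset name, CSV path, provider, and model are illustrative placeholders.

```python
from ragaai_catalyst import Dataset

# Hypothetical project/dataset names, for illustration only.
dataset = Dataset(project_name="my-project")

# Append rows from a local CSV; columns missing from the CSV are filled with None
# before the file is uploaded via the presigned-URL flow shown above.
dataset.add_rows(csv_path="new_rows.csv", dataset_name="support-tickets")

# Add a model-generated column; each text field must be a dict with
# "role" and "content" keys, as enforced by add_columns.
dataset.add_columns(
    text_fields=[
        {"role": "system", "content": "Summarize the ticket."},
        {"role": "user", "content": "{{ticket_text}}"},
    ],
    dataset_name="support-tickets",
    column_name="summary",
    provider="openai",
    model="gpt-4o-mini",
)
```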
ragaai_catalyst/tracers/agentic_tracing/tracers/agent_tracer.py
CHANGED
@@ -44,6 +44,7 @@ class AgentTracerMixin:
         # Add auto instrument flags
         self.auto_instrument_agent = False
         self.auto_instrument_user_interaction = False
+        self.auto_instrument_file_io = False
         self.auto_instrument_network = False
 
     def trace_agent(
@@ -512,10 +513,22 @@ class AgentTracerMixin:
         network_calls = self.component_network_calls.get(kwargs["component_id"], [])
         interactions = []
         if self.auto_instrument_user_interaction:
-
-
-
-
+            input_output_interactions = []
+            for interaction in self.component_user_interaction.get(kwargs["component_id"], []):
+                if interaction["interaction_type"] in ["input", "output"]:
+                    input_output_interactions.append(interaction)
+            interactions.extend(input_output_interactions)
+        if self.auto_instrument_file_io:
+            file_io_interactions = []
+            for interaction in self.component_user_interaction.get(kwargs["component_id"], []):
+                if interaction["interaction_type"] in ["file_read", "file_write"]:
+                    file_io_interactions.append(interaction)
+            interactions.extend(file_io_interactions)
+
+        # Get start time
+        start_time = None
+        if "start_time" in kwargs:
+            start_time = kwargs["start_time"]
 
         # Get tags, metrics
         name = kwargs["name"]
@@ -621,3 +634,6 @@ class AgentTracerMixin:
 
     def instrument_network_calls(self):
         self.auto_instrument_network = True
+
+    def instrument_file_io_calls(self):
+        self.auto_instrument_file_io = True
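The same filtering pattern appears twice in the hunk above and again in custom_tracer.py below: recorded user interactions are only kept when the matching auto-instrument flag is set. A standalone sketch of that logic (not code from the package) reads:

```python
def filter_interactions(interactions, include_io, include_file_io):
    """Keep only the interaction types enabled by the instrumentation flags."""
    allowed = set()
    if include_io:
        allowed.update({"input", "output"})
    if include_file_io:
        allowed.update({"file_read", "file_write"})
    return [i for i in interactions if i["interaction_type"] in allowed]
```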
ragaai_catalyst/tracers/agentic_tracing/tracers/base.py
CHANGED
@@ -1,8 +1,5 @@
 import json
 import os
-import platform
-import psutil
-import pkg_resources
 from datetime import datetime
 from pathlib import Path
 from typing import List, Any, Dict
@@ -16,20 +13,9 @@ from ..data.data_structure import (
     Trace,
     Metadata,
     SystemInfo,
-    OSInfo,
-    EnvironmentInfo,
     Resources,
-    CPUResource,
-    MemoryResource,
-    DiskResource,
-    NetworkResource,
-    ResourceInfo,
-    MemoryInfo,
-    DiskInfo,
-    NetworkInfo,
     Component,
 )
-
 from ..upload.upload_agentic_traces import UploadAgenticTraces
 from ..upload.upload_code import upload_code
 from ..upload.upload_trace_metric import upload_trace_metric
@@ -37,9 +23,8 @@ from ..utils.file_name_tracker import TrackName
 from ..utils.zip_list_of_unique_files import zip_list_of_unique_files
 from ..utils.span_attributes import SpanAttributes
 from ..utils.create_dataset_schema import create_dataset_schema_with_trace
+from ..utils.system_monitor import SystemMonitor
 
-
-# Configure logging to show debug messages (which includes info messages as well)
 import logging
 
 logger = logging.getLogger(__name__)
@@ -76,12 +61,12 @@ class TracerJSONEncoder(json.JSONEncoder):
 class BaseTracer:
     def __init__(self, user_details):
         self.user_details = user_details
-        self.project_name = self.user_details["project_name"]
-        self.dataset_name = self.user_details["dataset_name"]
-        self.project_id = self.user_details["project_id"]
-        self.trace_name = self.user_details["trace_name"]
+        self.project_name = self.user_details["project_name"]
+        self.dataset_name = self.user_details["dataset_name"]
+        self.project_id = self.user_details["project_id"]
+        self.trace_name = self.user_details["trace_name"]
         self.visited_metrics = []
-        self.trace_metrics = []
+        self.trace_metrics = []
 
         # Initialize trace data
         self.trace_id = None
@@ -97,117 +82,60 @@ class BaseTracer:
         self.network_usage_list = []
         self.tracking_thread = None
         self.tracking = False
+        self.system_monitor = None
 
     def _get_system_info(self) -> SystemInfo:
-
-        os_info = OSInfo(
-            name=platform.system(),
-            version=platform.version(),
-            platform=platform.machine(),
-            kernel_version=platform.release(),
-        )
-
-        # Get Python environment info
-        installed_packages = [
-            f"{pkg.key}=={pkg.version}" for pkg in pkg_resources.working_set
-        ]
-        env_info = EnvironmentInfo(
-            name="Python",
-            version=platform.python_version(),
-            packages=installed_packages,
-            env_path=sys.prefix,
-            command_to_run=f"python {sys.argv[0]}",
-        )
-
-        return SystemInfo(
-            id=f"sys_{self.trace_id}",
-            os=os_info,
-            environment=env_info,
-            source_code="Path to the source code .zip file in format hashid.zip",  # TODO: Implement source code archiving
-        )
+        return self.system_monitor.get_system_info()
 
     def _get_resources(self) -> Resources:
-
-        cpu_info = ResourceInfo(
-            name=platform.processor(),
-            cores=psutil.cpu_count(logical=False),
-            threads=psutil.cpu_count(logical=True),
-        )
-        cpu = CPUResource(info=cpu_info, interval="5s", values=[psutil.cpu_percent()])
-
-        # Memory info
-        memory = psutil.virtual_memory()
-        mem_info = MemoryInfo(
-            total=memory.total / (1024**3),  # Convert to GB
-            free=memory.available / (1024**3),
-        )
-        mem = MemoryResource(info=mem_info, interval="5s", values=[memory.percent])
-
-        # Disk info
-        disk = psutil.disk_usage("/")
-        disk_info = DiskInfo(total=disk.total / (1024**3), free=disk.free / (1024**3))
-        disk_io = psutil.disk_io_counters()
-        disk_resource = DiskResource(
-            info=disk_info,
-            interval="5s",
-            read=[disk_io.read_bytes / (1024**2)],  # MB
-            write=[disk_io.write_bytes / (1024**2)],
-        )
-
-        # Network info
-        net_io = psutil.net_io_counters()
-        net_info = NetworkInfo(
-            upload_speed=net_io.bytes_sent / (1024**2),  # MB
-            download_speed=net_io.bytes_recv / (1024**2),
-        )
-        net = NetworkResource(
-            info=net_info,
-            interval="5s",
-            uploads=[net_io.bytes_sent / (1024**2)],
-            downloads=[net_io.bytes_recv / (1024**2)],
-        )
-
-        return Resources(cpu=cpu, memory=mem, disk=disk_resource, network=net)
+        return self.system_monitor.get_resources()
 
     def _track_memory_usage(self):
         self.memory_usage_list = []
         while self.tracking:
-
-            self.memory_usage_list.append(
-
+            usage = self.system_monitor.track_memory_usage()
+            self.memory_usage_list.append(usage)
+            try:
+                time.sleep(self.interval_time)
+            except Exception as e:
+                logger.warning(f"Sleep interrupted in memory tracking: {str(e)}")
 
     def _track_cpu_usage(self):
         self.cpu_usage_list = []
         while self.tracking:
-
-            self.cpu_usage_list.append(
-
+            usage = self.system_monitor.track_cpu_usage(self.interval_time)
+            self.cpu_usage_list.append(usage)
+            try:
+                time.sleep(self.interval_time)
+            except Exception as e:
+                logger.warning(f"Sleep interrupted in CPU tracking: {str(e)}")
 
     def _track_disk_usage(self):
         self.disk_usage_list = []
         while self.tracking:
-
-            self.disk_usage_list.append(
-
-
-
-
+            usage = self.system_monitor.track_disk_usage()
+            self.disk_usage_list.append(usage)
+            try:
+                time.sleep(self.interval_time)
+            except Exception as e:
+                logger.warning(f"Sleep interrupted in disk tracking: {str(e)}")
 
     def _track_network_usage(self):
         self.network_usage_list = []
         while self.tracking:
-
-            self.network_usage_list.append(
-
-
-
-
+            usage = self.system_monitor.track_network_usage()
+            self.network_usage_list.append(usage)
+            try:
+                time.sleep(self.interval_time)
+            except Exception as e:
+                logger.warning(f"Sleep interrupted in network tracking: {str(e)}")
 
     def start(self):
         """Initialize a new trace"""
         self.tracking = True
-        self.
-        self.
+        self.trace_id = str(uuid.uuid4())
+        self.system_monitor = SystemMonitor(self.trace_id)
+        threading.Thread(target=self._track_memory_usage).start()
         threading.Thread(target=self._track_cpu_usage).start()
         threading.Thread(target=self._track_disk_usage).start()
         threading.Thread(target=self._track_network_usage).start()
@@ -223,9 +151,6 @@ class BaseTracer:
             resources=self._get_resources(),
         )
 
-        # Generate a unique trace ID, when trace starts
-        self.trace_id = str(uuid.uuid4())
-
         # Get the start time
         self.start_time = datetime.now().astimezone().isoformat()
 
@@ -257,8 +182,6 @@ class BaseTracer:
 
         #track memory usage
         self.tracking = False
-        if self.tracking_thread is not None:
-            self.tracking_thread.join()
         self.trace.metadata.resources.memory.values = self.memory_usage_list
 
         #track cpu usage
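All of the per-resource sampling that BaseTracer previously did inline now goes through the new SystemMonitor (added in ragaai_catalyst/tracers/agentic_tracing/utils/system_monitor.py, +215 lines, not shown in this section). The interface implied by the calls above is roughly the following; the bodies are a hedged psutil-based sketch, not the shipped implementation.

```python
import psutil

class SystemMonitor:
    """Sketch of the interface BaseTracer now delegates to."""

    def __init__(self, trace_id):
        self.trace_id = trace_id

    def get_system_info(self):
        ...  # builds the SystemInfo structure previously assembled inline in BaseTracer

    def get_resources(self):
        ...  # builds the Resources structure previously assembled inline in BaseTracer

    def track_memory_usage(self):
        return psutil.virtual_memory().percent        # one memory-utilisation sample (%)

    def track_cpu_usage(self, interval):
        return psutil.cpu_percent(interval=interval)  # blocking CPU sample over `interval` seconds (%)

    def track_disk_usage(self):
        io = psutil.disk_io_counters()
        return {"read": io.read_bytes / (1024**2), "write": io.write_bytes / (1024**2)}   # MB

    def track_network_usage(self):
        io = psutil.net_io_counters()
        return {"uploads": io.bytes_sent / (1024**2), "downloads": io.bytes_recv / (1024**2)}  # MB
```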
ragaai_catalyst/tracers/agentic_tracing/tracers/custom_tracer.py
CHANGED
@@ -25,6 +25,7 @@ class CustomTracerMixin:
         self.auto_instrument_custom = False
         self.auto_instrument_user_interaction = False
         self.auto_instrument_network = False
+        self.auto_instrument_file_io = False
 
     def trace_custom(self, name: str = None, custom_type: str = "generic", version: str = "1.0.0", trace_variables: bool = True):
         def decorator(func):
@@ -246,8 +247,18 @@ class CustomTracerMixin:
 
         interactions = []
         if self.auto_instrument_user_interaction:
-
-
+            input_output_interactions = []
+            for interaction in self.component_user_interaction.get(kwargs["component_id"], []):
+                if interaction["interaction_type"] in ["input", "output"]:
+                    input_output_interactions.append(interaction)
+            interactions.extend(input_output_interactions)
+        if self.auto_instrument_file_io:
+            file_io_interactions = []
+            for interaction in self.component_user_interaction.get(kwargs["component_id"], []):
+                if interaction["interaction_type"] in ["file_read", "file_write"]:
+                    file_io_interactions.append(interaction)
+            interactions.extend(file_io_interactions)
+
         component = {
             "id": kwargs["component_id"],
             "hash_id": kwargs["hash_id"],
@@ -314,3 +325,7 @@ class CustomTracerMixin:
     def instrument_network_calls(self):
         """Enable auto-instrumentation for network calls"""
         self.auto_instrument_network = True
+
+    def instrument_file_io_calls(self):
+        """Enable auto-instrumentation for file IO calls"""
+        self.auto_instrument_file_io = True
|