ragaai-catalyst 2.1.4.1b0__py3-none-any.whl → 2.1.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ragaai_catalyst/__init__.py +23 -2
- ragaai_catalyst/dataset.py +462 -1
- ragaai_catalyst/evaluation.py +76 -7
- ragaai_catalyst/ragaai_catalyst.py +52 -10
- ragaai_catalyst/redteaming/__init__.py +7 -0
- ragaai_catalyst/redteaming/config/detectors.toml +13 -0
- ragaai_catalyst/redteaming/data_generator/scenario_generator.py +95 -0
- ragaai_catalyst/redteaming/data_generator/test_case_generator.py +120 -0
- ragaai_catalyst/redteaming/evaluator.py +125 -0
- ragaai_catalyst/redteaming/llm_generator.py +136 -0
- ragaai_catalyst/redteaming/llm_generator_old.py +83 -0
- ragaai_catalyst/redteaming/red_teaming.py +331 -0
- ragaai_catalyst/redteaming/requirements.txt +4 -0
- ragaai_catalyst/redteaming/tests/grok.ipynb +97 -0
- ragaai_catalyst/redteaming/tests/stereotype.ipynb +2258 -0
- ragaai_catalyst/redteaming/upload_result.py +38 -0
- ragaai_catalyst/redteaming/utils/issue_description.py +114 -0
- ragaai_catalyst/redteaming/utils/rt.png +0 -0
- ragaai_catalyst/redteaming_old.py +171 -0
- ragaai_catalyst/synthetic_data_generation.py +400 -22
- ragaai_catalyst/tracers/__init__.py +17 -1
- ragaai_catalyst/tracers/agentic_tracing/data/data_structure.py +4 -2
- ragaai_catalyst/tracers/agentic_tracing/tracers/agent_tracer.py +212 -148
- ragaai_catalyst/tracers/agentic_tracing/tracers/base.py +657 -247
- ragaai_catalyst/tracers/agentic_tracing/tracers/custom_tracer.py +50 -19
- ragaai_catalyst/tracers/agentic_tracing/tracers/llm_tracer.py +588 -177
- ragaai_catalyst/tracers/agentic_tracing/tracers/main_tracer.py +99 -100
- ragaai_catalyst/tracers/agentic_tracing/tracers/network_tracer.py +3 -3
- ragaai_catalyst/tracers/agentic_tracing/tracers/tool_tracer.py +230 -29
- ragaai_catalyst/tracers/agentic_tracing/upload/trace_uploader.py +358 -0
- ragaai_catalyst/tracers/agentic_tracing/upload/upload_agentic_traces.py +75 -20
- ragaai_catalyst/tracers/agentic_tracing/upload/upload_code.py +55 -11
- ragaai_catalyst/tracers/agentic_tracing/upload/upload_local_metric.py +74 -0
- ragaai_catalyst/tracers/agentic_tracing/upload/upload_trace_metric.py +47 -16
- ragaai_catalyst/tracers/agentic_tracing/utils/create_dataset_schema.py +4 -2
- ragaai_catalyst/tracers/agentic_tracing/utils/file_name_tracker.py +26 -3
- ragaai_catalyst/tracers/agentic_tracing/utils/llm_utils.py +182 -17
- ragaai_catalyst/tracers/agentic_tracing/utils/model_costs.json +1233 -497
- ragaai_catalyst/tracers/agentic_tracing/utils/span_attributes.py +81 -10
- ragaai_catalyst/tracers/agentic_tracing/utils/supported_llm_provider.toml +34 -0
- ragaai_catalyst/tracers/agentic_tracing/utils/system_monitor.py +215 -0
- ragaai_catalyst/tracers/agentic_tracing/utils/trace_utils.py +0 -32
- ragaai_catalyst/tracers/agentic_tracing/utils/unique_decorator.py +3 -1
- ragaai_catalyst/tracers/agentic_tracing/utils/zip_list_of_unique_files.py +73 -47
- ragaai_catalyst/tracers/distributed.py +300 -0
- ragaai_catalyst/tracers/exporters/__init__.py +3 -1
- ragaai_catalyst/tracers/exporters/dynamic_trace_exporter.py +160 -0
- ragaai_catalyst/tracers/exporters/ragaai_trace_exporter.py +129 -0
- ragaai_catalyst/tracers/langchain_callback.py +809 -0
- ragaai_catalyst/tracers/llamaindex_instrumentation.py +424 -0
- ragaai_catalyst/tracers/tracer.py +301 -55
- ragaai_catalyst/tracers/upload_traces.py +24 -7
- ragaai_catalyst/tracers/utils/convert_langchain_callbacks_output.py +61 -0
- ragaai_catalyst/tracers/utils/convert_llama_instru_callback.py +69 -0
- ragaai_catalyst/tracers/utils/extraction_logic_llama_index.py +74 -0
- ragaai_catalyst/tracers/utils/langchain_tracer_extraction_logic.py +82 -0
- ragaai_catalyst/tracers/utils/model_prices_and_context_window_backup.json +9365 -0
- ragaai_catalyst/tracers/utils/trace_json_converter.py +269 -0
- {ragaai_catalyst-2.1.4.1b0.dist-info → ragaai_catalyst-2.1.5.dist-info}/METADATA +367 -45
- ragaai_catalyst-2.1.5.dist-info/RECORD +97 -0
- {ragaai_catalyst-2.1.4.1b0.dist-info → ragaai_catalyst-2.1.5.dist-info}/WHEEL +1 -1
- ragaai_catalyst-2.1.4.1b0.dist-info/RECORD +0 -67
- {ragaai_catalyst-2.1.4.1b0.dist-info → ragaai_catalyst-2.1.5.dist-info}/LICENSE +0 -0
- {ragaai_catalyst-2.1.4.1b0.dist-info → ragaai_catalyst-2.1.5.dist-info}/top_level.txt +0 -0
ragaai_catalyst/synthetic_data_generation.py:

@@ -1,23 +1,28 @@
 import os
-
-import google.generativeai as genai
-import openai
-import PyPDF2
+import ast
 import csv
+import json
+import random
+import PyPDF2
 import markdown
 import pandas as pd
-import json
-from litellm import completion
 from tqdm import tqdm
-
-
+
+import openai
+import tiktoken
+import litellm
+import google.generativeai as genai
+from groq import Groq
+from litellm import completion
+
 from .internal_api_completion import api_completion as internal_api_completion
 from .proxy_call import api_completion as proxy_api_completion
-# from ragaai_catalyst import internal_api_completion
-# from ragaai_catalyst import proxy_call
-import ast
 
-
+from typing import Optional, List, Dict, Any
+
+import logging
+
+logger = logging.getLogger(__name__)
 
 class SyntheticDataGeneration:
     """
@@ -48,13 +53,18 @@ class SyntheticDataGeneration:
         Raises:
             ValueError: If an invalid provider is specified or API key is missing.
         """
+        text_validity = self.validate_input(text)
+        if text_validity:
+            raise ValueError(text_validity)
+
         BATCH_SIZE = 5  # Optimal batch size for maintaining response quality
         provider = model_config.get("provider")
         model = model_config.get("model")
         api_base = model_config.get("api_base")
+        api_version = model_config.get("api_version")
 
         # Initialize the appropriate client based on provider
-        self._initialize_client(provider, api_key, api_base, internal_llm_proxy=kwargs.get("internal_llm_proxy", None))
+        self._initialize_client(provider, api_key, api_base, api_version, internal_llm_proxy=kwargs.get("internal_llm_proxy", None))
 
         # Initialize progress bar
         pbar = tqdm(total=n, desc="Generating QA pairs")
@@ -68,7 +78,9 @@ class SyntheticDataGeneration:
             "No connection adapters",
             "Required API Keys are not set",
             "litellm.BadRequestError",
-            "litellm.AuthenticationError"
+            "litellm.AuthenticationError",
+            "Max retries exceeded"
+        ]
 
         for _ in range(num_batches):
             current_batch_size = min(BATCH_SIZE, n - len(all_responses))
@@ -77,7 +89,6 @@ class SyntheticDataGeneration:
 
             try:
                 system_message = self._get_system_message(question_type, current_batch_size)
-
                 if "internal_llm_proxy" in kwargs:
                     batch_df = self._generate_internal_response(text, system_message, model_config, kwargs)
                 else:
|
|
88
99
|
pbar.update(len(batch_df))
|
89
100
|
|
90
101
|
except Exception as e:
|
91
|
-
print(f"Batch generation failed
|
102
|
+
print(f"Batch generation failed:{str(e)}")
|
92
103
|
|
93
104
|
if any(error in str(e) for error in FAILURE_CASES):
|
94
105
|
raise Exception(f"{e}")
|
@@ -139,7 +150,7 @@ class SyntheticDataGeneration:
 
         return final_df
 
-    def _initialize_client(self, provider, api_key, api_base=None, internal_llm_proxy=None):
+    def _initialize_client(self, provider, api_key, api_base=None, api_version=None, internal_llm_proxy=None):
         """Initialize the appropriate client based on provider."""
         if not provider:
             raise ValueError("Model configuration must be provided with a valid provider and model.")
@@ -158,7 +169,17 @@ class SyntheticDataGeneration:
             if api_key is None and os.getenv("OPENAI_API_KEY") is None and internal_llm_proxy is None:
                 raise ValueError("API key must be provided for OpenAI.")
             openai.api_key = api_key or os.getenv("OPENAI_API_KEY")
-
+
+        elif provider == "azure":
+            if api_key is None and os.getenv("AZURE_API_KEY") is None and internal_llm_proxy is None:
+                raise ValueError("API key must be provided for Azure.")
+            litellm.api_key = api_key or os.getenv("AZURE_API_KEY")
+            if api_base is None and os.getenv("AZURE_API_BASE") is None and internal_llm_proxy is None:
+                raise ValueError("API Base must be provided for Azure.")
+            litellm.api_base = api_base or os.getenv("AZURE_API_BASE")
+            if api_version is None and os.getenv("AZURE_API_VERSION") is None and internal_llm_proxy is None:
+                raise ValueError("API version must be provided for Azure.")
+            litellm.api_version = api_version or os.getenv("AZURE_API_VERSION")
         else:
             raise ValueError(f"Provider is not recognized.")
 
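The hunk above wires Azure into client initialization. A minimal sketch of how the new path might be exercised, assuming the public QnA entry point is `generate_qna` (the method name, deployment, endpoint, and version values are not visible in this diff and are illustrative only):

```python
import os
from ragaai_catalyst import SyntheticDataGeneration

sdg = SyntheticDataGeneration()

# With provider="azure", _initialize_client now insists on a key, base URL and
# API version, falling back to AZURE_API_KEY / AZURE_API_BASE / AZURE_API_VERSION.
qna_df = sdg.generate_qna(                                   # assumed method name
    text="...long source document...",
    question_type="simple",
    n=5,
    model_config={
        "provider": "azure",
        "model": "gpt-4o-mini",                              # placeholder deployment
        "api_base": "https://my-resource.openai.azure.com",  # placeholder endpoint
        "api_version": "2024-02-15-preview",                 # placeholder version
    },
    api_key=os.getenv("AZURE_API_KEY"),
)
```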
@@ -189,7 +210,15 @@ class SyntheticDataGeneration:
                 kwargs=kwargs
             )
 
+    def validate_input(self,text):
 
+        if not text.strip():
+            return 'Empty Text provided for qna generation. Please provide valid text'
+        encoding = tiktoken.encoding_for_model("gpt-4")
+        tokens = encoding.encode(text)
+        if len(tokens)<5:
+            return 'Very Small Text provided for qna generation. Please provide longer text'
+        return False
 
 
     def _get_system_message(self, question_type, n):
@@ -207,7 +236,8 @@ class SyntheticDataGeneration:
             ValueError: If an invalid question type is specified.
         """
         if question_type == 'simple':
-            return f'''Generate a set of {n} very simple questions answerable in a single phrase.
+            return f'''Generate a set of {n} very simple questions answerable in a single phrase using the below text.
+                Only generate questions answerable from the text given, to cover all parts of the given document.
                 Also return the answers for the generated questions.
                 Return the response in a list of object format.
                 Each object in list should have Question and corresponding answer.
@@ -216,6 +246,7 @@ class SyntheticDataGeneration:
             '''
         elif question_type == 'mcq':
             return f'''Generate a set of {n} questions with 4 probable answers from the given text.
+                Only generate questions answerable from the text given, to cover all parts of the given document.
                 The options should not be longer than a phrase. There should be only 1 correct answer.
                 There should not be any ambiguity between correct and incorrect options.
                 Return the response in a list of object format.
@@ -225,6 +256,7 @@ class SyntheticDataGeneration:
             '''
         elif question_type == 'complex':
            return f'''Can you generate a set of {n} complex questions answerable in long form from the below texts.
+                Only generate questions answerable from the text given, to cover all parts of the given document.
                 Make sure the questions are important and provide new information to the user.
                 Return the response in a list of object format. Enclose any quotes in single quote.
                 Do not use double quotes within questions or answers.
@@ -274,10 +306,14 @@ class SyntheticDataGeneration:
         # Add optional parameters if they exist in model_config
         if "api_base" in model_config:
             completion_params["api_base"] = model_config["api_base"]
+        if "api_version" in model_config:
+            completion_params["api_version"] = model_config["api_version"]
         if "max_tokens" in model_config:
             completion_params["max_tokens"] = model_config["max_tokens"]
         if "temperature" in model_config:
             completion_params["temperature"] = model_config["temperature"]
+        if 'provider' in model_config:
+            completion_params['model'] = f'{model_config["provider"]}/{model_config["model"]}'
 
         # Make the API call using LiteLLM
         try:
@@ -298,6 +334,59 @@ class SyntheticDataGeneration:
 
         json_data = json.loads(content)
         return pd.DataFrame(json_data)
+
+    def _generate_raw_llm_response(self, text, system_message: Optional[str] = None, model_config: Dict[str, Any] = dict(), api_key=None):
+        """
+        Generate questions using LiteLLM which supports multiple providers (OpenAI, Groq, Gemini, etc.).
+
+        Args:
+            text (str): The input text to generate questions from.
+            system_message (str): The system message for the AI model.
+            model_config (dict): Configuration dictionary containing model details.
+                Required keys:
+                - model: The model identifier (e.g., "gpt-4", "gemini-pro", "mixtral-8x7b-32768")
+                Optional keys:
+                - api_base: Custom API base URL if needed
+                - max_tokens: Maximum tokens in response
+                - temperature: Temperature for response generation
+            api_key (str, optional): The API key for the model provider.
+
+        Returns:
+            pandas.DataFrame: A DataFrame containing the generated questions and answers.
+
+        Raises:
+            Exception: If there's an error in generating the response.
+        """
+        messages = [
+            {"role": "system", "content": system_message},
+            {"role": "user", "content": text}
+        ]
+
+        completion_params = {
+            "model": model_config.get("model", 'gpt-4o'),
+            "messages": messages,
+            "api_key": api_key
+        }
+
+        if "api_base" in model_config:
+            completion_params["api_base"] = model_config["api_base"]
+        if "api_version" in model_config:
+            completion_params["api_version"] = model_config["api_version"]
+        if "max_tokens" in model_config:
+            completion_params["max_tokens"] = model_config["max_tokens"]
+        if "temperature" in model_config:
+            completion_params["temperature"] = model_config["temperature"]
+        if 'provider' in model_config:
+            completion_params['model'] = f'{model_config["provider"]}/{model_config["model"]}'
+
+        try:
+            response = completion(**completion_params)
+        except Exception as e:
+            if any(error in str(e).lower() for error in ["invalid api key", "incorrect api key", "unauthorized", "authentication"]):
+                raise ValueError(f"Invalid API key provided for {model_config.get('provider', 'the specified')} provider")
+            raise Exception(f"Error calling LLM API: {str(e)}")
+
+        return response.choices[0].message.content
 
     def _parse_response(self, response, provider):
         """
@@ -318,9 +407,13 @@ class SyntheticDataGeneration:
             list_start_index = data.find('[')  # Find the index of the first '['
             substring_data = data[list_start_index:] if list_start_index != -1 else data  # Slice from the list start
             data = substring_data
-
+        elif provider == "azure":
+            data = response.choices[0].message.content.replace('\n', '')
+            list_start_index = data.find('[')  # Find the index of the first '['
+            substring_data = data[list_start_index:] if list_start_index != -1 else data  # Slice from the list start
+            data = substring_data
         else:
-            raise ValueError("Invalid provider. Choose 'groq', 'gemini', or 'openai'.")
+            raise ValueError("Invalid provider. Choose 'groq', 'gemini', 'azure' or 'openai'.")
         try:
             json_data = json.loads(data)
             return pd.DataFrame(json_data)
@@ -442,7 +535,292 @@ class SyntheticDataGeneration:
         Returns:
             list: A list of supported AI providers.
         """
-        return ['gemini', 'openai']
+        return ['gemini', 'openai','azure']
+
+    def _get_init_ex_gen_prompt(self):
+        prompt = '''
+        You are an expert example generator. Your task is to produce creative, relevant and varied examples according to the user instructions.
+
+        **Inputs**
+        User Instruction: The user will provide guidance on how to generate examples, possibly accompanied by their own examples.
+        User Examples[Optional]: The user may supply examples.
+        User Context[Optional]: The user may supply context to generate the examples from.
+        No of Examples: The total number of examples to produce.
+
+        **Steps to follow**
+        1. Carefully analyze the user's instruction
+        2. If user examples are provided, check whether the user's instructions refer to them specifically.
+        3. If user context is provided, understand it thoroughly and identify relevant parts to generate examples.
+        4. Comply with the system's guidelines to generate examples, incorporating any user examples or user context as needed.
+
+        **Output Format**:
+        - Present examples in a multiline string with each line a separate example.
+        - Avoid markdown or special formatting.
+        - Omit any boilerplate texts.
+
+        **Instructions for Diversity**:
+        - Vary the examples by context, tone, and (if applicable) technical complexity.
+        - Include edge cases or unconventional scenarios.
+        - Ensure no two examples are conceptually identical.
+
+        **Final Notes**:
+        - Focus on both originality and practical relevance.
+        - Avoid repetitiveness in the examples.
+        '''
+        return prompt
+
+    def _get_iter_ex_gen_prompt(self):
+        prompt = '''
+        You are an expert example generator. Your task is to produce creative, relevant and varied examples according to the user instructions.
+
+        **Inputs**
+        User Instruction: The user will provide guidance on how to generate examples, possibly accompanied by their own examples.
+        User Examples[Optional]: The user may supply examples.
+        User Context[Optional]: The user may supply context to generate the examples from.
+        No of Examples: The total number of examples to produce.
+        Relevant Examples: Any examples that are relevant to the user's instruction.
+        Irrelevant Examples: Any examples that are not relevant to the user's instruction.
+
+        **Steps to follow**
+        1. Carefully analyze the user's instruction
+        2. If user examples are provided, check whether the user's instructions refer to them specifically.
+        3. If user context is provided, understand it thoroughly and identify relevant parts to generate examples.
+        4. Review the relevant and irrelevant examples present, understanding the differences in them.
+        5. Comply with the user's instruction to generate examples, similar to relevant examples and dissimilar to irrelevant ones.
+
+        **Output Format**:
+        - Present examples in a multiline sting with each line a separate example.
+        - Avoid markdown or special formatting.
+        - Omit any boilerplate texts.
+
+        **Instructions for Diversity**:
+        - Vary the examples by context, tone, and (if applicable) technical complexity.
+        - Include edge cases or unconventional scenarios.
+        - Ensure no two examples are conceptually identical.
+
+        **Final Notes**:
+        - Focus on both originality and practical relevance.
+        - Avoid repetitiveness in the examples.
+        '''
+        return prompt
+
+    def _generate_examples_iter(
+        self,
+        user_instruction: str,
+        user_examples: Optional[List[str] | str] = None,
+        user_context: Optional[str] = None,
+        relevant_examples: List[str]=[],
+        irrelevant_examples: List[str]=[],
+        no_examples: Optional[int] = None,
+        model_config: Dict[str, Any] = dict(),
+        api_key: Optional[str] = None
+    ):
+        if no_examples is None:
+            no_examples = 5
+        relevant_examples_str = '\n'.join(relevant_examples)
+        irrelevant_examples_str = '\n'.join(irrelevant_examples)
+        user_message = f'**User Instruction:** {user_instruction}'
+        user_message += f'\n\n**No of Examples:** {no_examples}'
+        if user_examples:
+            if isinstance(user_examples, str):
+                user_examples_str = user_examples
+            elif isinstance(user_examples, list):
+                user_examples_str = "\n".join(user_examples)
+            else:
+                raise ValueError(f'Expected string or list of strings as user_examples got {type(user_examples)}')
+            user_message += f"\n\n**User Examples:** \n{user_examples_str}"
+        if relevant_examples:
+            user_message += f'\n\n**Relevant Examples:** \n{relevant_examples_str}'
+        if irrelevant_examples:
+            user_message += f'\n\n**Irrelevant Examples:** \n{irrelevant_examples_str}'
+        if user_context:
+            user_message += f'\n\n**User Context:** \n{user_context}'
+        system_prompt = self._get_iter_ex_gen_prompt()
+        return self._generate_raw_llm_response(user_message, system_prompt, model_config=model_config, api_key=api_key)
+
+    def _generate_examples(
+        self,
+        user_instruction:str,
+        user_examples:Optional[List[str]|str]=None,
+        user_context: Optional[str] = None,
+        no_examples:Optional[int]=None,
+        model_config: Dict[str, Any] = dict(),
+        api_key: Optional[str] = None
+    ):
+        if no_examples is None:
+            no_examples = 5
+        user_message = f"**User Instruction:** {user_instruction}"
+        if user_examples:
+            if isinstance(user_examples, str):
+                user_examples_str = user_examples
+            elif isinstance(user_examples, list):
+                user_examples_str = "\n".join(user_examples)
+            else:
+                raise ValueError(f'Expected string or list of strings as user_examples got {type(user_examples)}')
+            user_message += f"\n\n**User Examples:** \n{user_examples_str}"
+        if user_context:
+            user_message += f'\n\n**User Context:** \n{user_context}'
+        user_message += f'\n\n**No of Examples:** {no_examples}'
+        init_system_prompt = self._get_init_ex_gen_prompt()
+        return self._generate_raw_llm_response(user_message, init_system_prompt, model_config=model_config, api_key=api_key)
+
+    def _get_valid_examples(self, user_indices_str: str, examples: List[str]):
+        valid_examples = []
+        try:
+            user_indices = user_indices_str.strip().split(',')
+            for index_str in user_indices:
+                try:
+                    index = int(index_str)
+                    if index <= 0 or index > len(examples):
+                        continue
+                except ValueError as e:
+                    continue
+                valid_examples.append(examples[index-1])
+        except Exception as e:
+            print(f'Error: {e}')
+        return valid_examples
+
+    def generate_examples(
+        self,
+        user_instruction: str,
+        user_examples:Optional[List[str] | str] = None,
+        user_context: Optional[str] = None,
+        no_examples: Optional[int] = None,
+        model_config: Optional[Dict[str, Any]] = None,
+        api_key: Optional[str] = None,
+        max_iter: int = 0,
+        **kwargs
+    ):
+        if not model_config:
+            model_config = {}
+        provider = model_config.get("provider")
+        api_base = model_config.get("api_base")
+        api_version = model_config.get("api_version")
+        self._initialize_client(provider, api_key, api_base, api_version, internal_llm_proxy=kwargs.get("internal_llm_proxy", None))
+
+        if no_examples is None:
+            no_examples = 5
+        assert no_examples >= 0, 'The number of examples cannot be less than 0'
+        relevant_examples = []
+        irrelevant_examples = []
+        max_relevant_examples = 5
+        max_irrelevant_examples = 10
+        while len(relevant_examples) <= max_relevant_examples or len(irrelevant_examples) <= max_irrelevant_examples:
+            if max_iter <= 0:
+                break
+            if len(relevant_examples) > max_relevant_examples:
+                relevant_examples = random.sample(relevant_examples, max_relevant_examples)
+            if len(irrelevant_examples) > max_irrelevant_examples:
+                irrelevant_examples = random.sample(irrelevant_examples, max_irrelevant_examples)
+            if relevant_examples or irrelevant_examples:
+                examples_str = self._generate_examples_iter(
+                    user_instruction = user_instruction,
+                    user_examples = user_examples,
+                    relevant_examples = relevant_examples,
+                    irrelevant_examples = irrelevant_examples,
+                    model_config = model_config,
+                    api_key = api_key
+                )
+            else:
+                examples_str = self._generate_examples(
+                    user_instruction = user_instruction,
+                    user_examples = user_examples,
+                    user_context = user_context,
+                    model_config = model_config,
+                    api_key = api_key
+                )
+            examples = [example for example in examples_str.split('\n') if example.strip()]
+            print('Generated Examples:')
+            for i, example in enumerate(examples):
+                print(f'{i+1}. {example}')
+            relevant_indices = input('Enter the indices of relevant examples (comma-separated): ').strip()
+            if relevant_indices:
+                relevant_examples.extend(self._get_valid_examples(relevant_indices, examples))
+            irrelevant_indices = input('Enter the indices of irrelevant examples (comma-separated): ').strip()
+            if irrelevant_indices:
+                irrelevant_examples.extend(self._get_valid_examples(irrelevant_indices, examples))
+            max_iter -= 1
+        if len(relevant_examples) > max_relevant_examples:
+            fin_relevant_examples = random.sample(relevant_examples, max_relevant_examples)
+        else:
+            fin_relevant_examples = relevant_examples
+        if len(irrelevant_examples) > max_irrelevant_examples:
+            fin_irrelevant_examples = random.sample(irrelevant_examples, max_irrelevant_examples)
+        else:
+            fin_irrelevant_examples = irrelevant_examples
+        if relevant_examples or irrelevant_examples:
+            if len(relevant_examples) < no_examples:
+                more_no_examples = no_examples - len(relevant_examples)
+                final_examples_str = self._generate_examples_iter(
+                    user_instruction = user_instruction,
+                    user_examples = user_examples,
+                    user_context = user_context,
+                    relevant_examples = fin_relevant_examples,
+                    irrelevant_examples = fin_irrelevant_examples,
+                    no_examples = more_no_examples,
+                    model_config = model_config,
+                    api_key = api_key
+                )
+                final_examples = [example for example in final_examples_str.split('\n') if example.strip()]
+                final_examples.extend(relevant_examples)
+            else:
+                final_examples = random.sample(relevant_examples, no_examples)
+        else:
+            final_examples_str = self._generate_examples(
+                user_instruction = user_instruction,
+                user_examples = user_examples,
+                user_context = user_context,
+                no_examples = no_examples,
+                model_config = model_config,
+                api_key = api_key
+            )
+            final_examples = [example for example in final_examples_str.split('\n') if example.strip()]
+        return final_examples
+
+
+    def generate_examples_from_csv(
+        self,
+        csv_path: str,
+        dst_csv_path: Optional[str] = None,
+        no_examples: Optional[int] = None,
+        model_config: Optional[Dict[str, Any]] = None,
+        api_key: Optional[str] = None,
+        **kwargs
+    ):
+        if no_examples is None:
+            no_examples = 5
+        assert no_examples >= 0, 'The number of examples cannot be less than 0'
+        df = pd.read_csv(csv_path)
+        assert 'user_instruction' in df.columns, 'The csv must have a column named user_instruction'
+        fin_df_list = []
+        for i, row in df.iterrows():
+            user_instruction = row['user_instruction']
+            user_examples = row.get('user_examples')
+            user_context = row.get('user_context')
+            row_dict = row.to_dict()
+            try:
+                examples = self.generate_examples(
+                    user_instruction = user_instruction,
+                    user_examples = user_examples,
+                    user_context = user_context,
+                    no_examples = no_examples,
+                    model_config = model_config,
+                    api_key = api_key
+                )
+            except Exception as e:
+                continue
+            row_dict['generated_examples'] = examples
+            fin_df_list.append(row_dict)
+        fin_df = pd.DataFrame(fin_df_list)
+        csv_file, csv_ext = os.path.splitext(csv_path)
+        if not dst_csv_path:
+            dst_csv_path = csv_file + '_with_examples' + csv_ext
+        dst_dir = os.path.dirname(dst_csv_path)
+        if dst_dir:
+            os.makedirs(dst_dir, exist_ok=True)
+        fin_df.to_csv(dst_csv_path)
+        logger.info(f'CSV with generated examples saved at {dst_csv_path}')
+
 
 # Usage:
 # from synthetic_data_generation import SyntheticDataGeneration
ragaai_catalyst/tracers/__init__.py:

@@ -1,3 +1,19 @@
 from .tracer import Tracer
+from .distributed import (
+    init_tracing,
+    trace_agent,
+    trace_llm,
+    trace_tool,
+    current_span,
+    trace_custom,
+)
 
-__all__ = [
+__all__ = [
+    "Tracer",
+    "init_tracing",
+    "trace_agent",
+    "trace_llm",
+    "trace_tool",
+    "current_span",
+    "trace_custom"
+]
ragaai_catalyst/tracers/agentic_tracing/data/data_structure.py:

@@ -271,7 +271,7 @@ class ComponentInfo:
     cost: Optional[Dict[str, float]] = None
 
 class Trace:
-    def __init__(self, id: str, trace_name: str, project_name: str, start_time: str, end_time: str, metadata: Optional[Metadata] = None, data: Optional[List[Dict[str, Any]]] = None, replays: Optional[Dict[str, Any]] = None):
+    def __init__(self, id: str, trace_name: str, project_name: str, start_time: str, end_time: str, metadata: Optional[Metadata] = None, data: Optional[List[Dict[str, Any]]] = None, replays: Optional[Dict[str, Any]] = None, metrics: Optional[List[Dict[str, Any]]] = None):
         self.id = id
         self.trace_name = trace_name
         self.project_name = project_name
@@ -280,6 +280,7 @@ class Trace:
         self.metadata = metadata or Metadata()
         self.data = data or []
         self.replays = replays
+        self.metrics = metrics or []
 
     def to_dict(self):
         return {
@@ -288,7 +289,8 @@ class Trace:
             "project_name": self.project_name,
             "start_time": self.start_time,
             "end_time": self.end_time,
-            "metadata": self.metadata
+            "metadata": self.metadata,
             "data": self.data,
             "replays": self.replays,
+            "metrics": self.metrics
         }
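With the new field, metrics ride along in the serialized trace. A small sketch of the extended container, assuming `Metadata()` builds a usable default (its constructor is not shown in this diff) and using a hypothetical metric-dict shape:

```python
trace = Trace(
    id="trace-001",
    trace_name="qa-run",
    project_name="demo",
    start_time="2025-01-01T00:00:00Z",
    end_time="2025-01-01T00:00:05Z",
    metrics=[{"name": "accuracy", "score": 0.9}],  # hypothetical metric shape
)

payload = trace.to_dict()
assert payload["metrics"] == [{"name": "accuracy", "score": 0.9}]
```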