ragaai-catalyst 2.1.4.1b0__py3-none-any.whl → 2.1.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. ragaai_catalyst/__init__.py +23 -2
  2. ragaai_catalyst/dataset.py +462 -1
  3. ragaai_catalyst/evaluation.py +76 -7
  4. ragaai_catalyst/ragaai_catalyst.py +52 -10
  5. ragaai_catalyst/redteaming/__init__.py +7 -0
  6. ragaai_catalyst/redteaming/config/detectors.toml +13 -0
  7. ragaai_catalyst/redteaming/data_generator/scenario_generator.py +95 -0
  8. ragaai_catalyst/redteaming/data_generator/test_case_generator.py +120 -0
  9. ragaai_catalyst/redteaming/evaluator.py +125 -0
  10. ragaai_catalyst/redteaming/llm_generator.py +136 -0
  11. ragaai_catalyst/redteaming/llm_generator_old.py +83 -0
  12. ragaai_catalyst/redteaming/red_teaming.py +331 -0
  13. ragaai_catalyst/redteaming/requirements.txt +4 -0
  14. ragaai_catalyst/redteaming/tests/grok.ipynb +97 -0
  15. ragaai_catalyst/redteaming/tests/stereotype.ipynb +2258 -0
  16. ragaai_catalyst/redteaming/upload_result.py +38 -0
  17. ragaai_catalyst/redteaming/utils/issue_description.py +114 -0
  18. ragaai_catalyst/redteaming/utils/rt.png +0 -0
  19. ragaai_catalyst/redteaming_old.py +171 -0
  20. ragaai_catalyst/synthetic_data_generation.py +400 -22
  21. ragaai_catalyst/tracers/__init__.py +17 -1
  22. ragaai_catalyst/tracers/agentic_tracing/data/data_structure.py +4 -2
  23. ragaai_catalyst/tracers/agentic_tracing/tracers/agent_tracer.py +212 -148
  24. ragaai_catalyst/tracers/agentic_tracing/tracers/base.py +657 -247
  25. ragaai_catalyst/tracers/agentic_tracing/tracers/custom_tracer.py +50 -19
  26. ragaai_catalyst/tracers/agentic_tracing/tracers/llm_tracer.py +588 -177
  27. ragaai_catalyst/tracers/agentic_tracing/tracers/main_tracer.py +99 -100
  28. ragaai_catalyst/tracers/agentic_tracing/tracers/network_tracer.py +3 -3
  29. ragaai_catalyst/tracers/agentic_tracing/tracers/tool_tracer.py +230 -29
  30. ragaai_catalyst/tracers/agentic_tracing/upload/trace_uploader.py +358 -0
  31. ragaai_catalyst/tracers/agentic_tracing/upload/upload_agentic_traces.py +75 -20
  32. ragaai_catalyst/tracers/agentic_tracing/upload/upload_code.py +55 -11
  33. ragaai_catalyst/tracers/agentic_tracing/upload/upload_local_metric.py +74 -0
  34. ragaai_catalyst/tracers/agentic_tracing/upload/upload_trace_metric.py +47 -16
  35. ragaai_catalyst/tracers/agentic_tracing/utils/create_dataset_schema.py +4 -2
  36. ragaai_catalyst/tracers/agentic_tracing/utils/file_name_tracker.py +26 -3
  37. ragaai_catalyst/tracers/agentic_tracing/utils/llm_utils.py +182 -17
  38. ragaai_catalyst/tracers/agentic_tracing/utils/model_costs.json +1233 -497
  39. ragaai_catalyst/tracers/agentic_tracing/utils/span_attributes.py +81 -10
  40. ragaai_catalyst/tracers/agentic_tracing/utils/supported_llm_provider.toml +34 -0
  41. ragaai_catalyst/tracers/agentic_tracing/utils/system_monitor.py +215 -0
  42. ragaai_catalyst/tracers/agentic_tracing/utils/trace_utils.py +0 -32
  43. ragaai_catalyst/tracers/agentic_tracing/utils/unique_decorator.py +3 -1
  44. ragaai_catalyst/tracers/agentic_tracing/utils/zip_list_of_unique_files.py +73 -47
  45. ragaai_catalyst/tracers/distributed.py +300 -0
  46. ragaai_catalyst/tracers/exporters/__init__.py +3 -1
  47. ragaai_catalyst/tracers/exporters/dynamic_trace_exporter.py +160 -0
  48. ragaai_catalyst/tracers/exporters/ragaai_trace_exporter.py +129 -0
  49. ragaai_catalyst/tracers/langchain_callback.py +809 -0
  50. ragaai_catalyst/tracers/llamaindex_instrumentation.py +424 -0
  51. ragaai_catalyst/tracers/tracer.py +301 -55
  52. ragaai_catalyst/tracers/upload_traces.py +24 -7
  53. ragaai_catalyst/tracers/utils/convert_langchain_callbacks_output.py +61 -0
  54. ragaai_catalyst/tracers/utils/convert_llama_instru_callback.py +69 -0
  55. ragaai_catalyst/tracers/utils/extraction_logic_llama_index.py +74 -0
  56. ragaai_catalyst/tracers/utils/langchain_tracer_extraction_logic.py +82 -0
  57. ragaai_catalyst/tracers/utils/model_prices_and_context_window_backup.json +9365 -0
  58. ragaai_catalyst/tracers/utils/trace_json_converter.py +269 -0
  59. {ragaai_catalyst-2.1.4.1b0.dist-info → ragaai_catalyst-2.1.5.dist-info}/METADATA +367 -45
  60. ragaai_catalyst-2.1.5.dist-info/RECORD +97 -0
  61. {ragaai_catalyst-2.1.4.1b0.dist-info → ragaai_catalyst-2.1.5.dist-info}/WHEEL +1 -1
  62. ragaai_catalyst-2.1.4.1b0.dist-info/RECORD +0 -67
  63. {ragaai_catalyst-2.1.4.1b0.dist-info → ragaai_catalyst-2.1.5.dist-info}/LICENSE +0 -0
  64. {ragaai_catalyst-2.1.4.1b0.dist-info → ragaai_catalyst-2.1.5.dist-info}/top_level.txt +0 -0
ragaai_catalyst/synthetic_data_generation.py
@@ -1,23 +1,28 @@
 import os
-from groq import Groq
-import google.generativeai as genai
-import openai
-import PyPDF2
+import ast
 import csv
+import json
+import random
+import PyPDF2
 import markdown
 import pandas as pd
-import json
-from litellm import completion
 from tqdm import tqdm
-# import internal_api_completion
-# import proxy_call
+
+import openai
+import tiktoken
+import litellm
+import google.generativeai as genai
+from groq import Groq
+from litellm import completion
+
 from .internal_api_completion import api_completion as internal_api_completion
 from .proxy_call import api_completion as proxy_api_completion
-# from ragaai_catalyst import internal_api_completion
-# from ragaai_catalyst import proxy_call
-import ast
 
-# dotenv.load_dotenv()
+from typing import Optional, List, Dict, Any
+
+import logging
+
+logger = logging.getLogger(__name__)
 
 class SyntheticDataGeneration:
     """
@@ -48,13 +53,18 @@ class SyntheticDataGeneration:
         Raises:
             ValueError: If an invalid provider is specified or API key is missing.
         """
+        text_validity = self.validate_input(text)
+        if text_validity:
+            raise ValueError(text_validity)
+
         BATCH_SIZE = 5  # Optimal batch size for maintaining response quality
         provider = model_config.get("provider")
         model = model_config.get("model")
         api_base = model_config.get("api_base")
+        api_version = model_config.get("api_version")
 
         # Initialize the appropriate client based on provider
-        self._initialize_client(provider, api_key, api_base, internal_llm_proxy=kwargs.get("internal_llm_proxy", None))
+        self._initialize_client(provider, api_key, api_base, api_version, internal_llm_proxy=kwargs.get("internal_llm_proxy", None))
 
         # Initialize progress bar
         pbar = tqdm(total=n, desc="Generating QA pairs")
@@ -68,7 +78,9 @@ class SyntheticDataGeneration:
             "No connection adapters",
             "Required API Keys are not set",
             "litellm.BadRequestError",
-            "litellm.AuthenticationError"]
+            "litellm.AuthenticationError",
+            "Max retries exceeded"
+        ]
 
         for _ in range(num_batches):
             current_batch_size = min(BATCH_SIZE, n - len(all_responses))
@@ -77,7 +89,6 @@ class SyntheticDataGeneration:
 
             try:
                 system_message = self._get_system_message(question_type, current_batch_size)
-
                 if "internal_llm_proxy" in kwargs:
                     batch_df = self._generate_internal_response(text, system_message, model_config, kwargs)
                 else:
@@ -88,7 +99,7 @@ class SyntheticDataGeneration:
                 pbar.update(len(batch_df))
 
             except Exception as e:
-                print(f"Batch generation failed.")
+                print(f"Batch generation failed:{str(e)}")
 
                 if any(error in str(e) for error in FAILURE_CASES):
                     raise Exception(f"{e}")
@@ -139,7 +150,7 @@ class SyntheticDataGeneration:
 
         return final_df
 
-    def _initialize_client(self, provider, api_key, api_base=None, internal_llm_proxy=None):
+    def _initialize_client(self, provider, api_key, api_base=None, api_version=None, internal_llm_proxy=None):
         """Initialize the appropriate client based on provider."""
         if not provider:
             raise ValueError("Model configuration must be provided with a valid provider and model.")
@@ -158,7 +169,17 @@ class SyntheticDataGeneration:
             if api_key is None and os.getenv("OPENAI_API_KEY") is None and internal_llm_proxy is None:
                 raise ValueError("API key must be provided for OpenAI.")
             openai.api_key = api_key or os.getenv("OPENAI_API_KEY")
-
+
+        elif provider == "azure":
+            if api_key is None and os.getenv("AZURE_API_KEY") is None and internal_llm_proxy is None:
+                raise ValueError("API key must be provided for Azure.")
+            litellm.api_key = api_key or os.getenv("AZURE_API_KEY")
+            if api_base is None and os.getenv("AZURE_API_BASE") is None and internal_llm_proxy is None:
+                raise ValueError("API Base must be provided for Azure.")
+            litellm.api_base = api_base or os.getenv("AZURE_API_BASE")
+            if api_version is None and os.getenv("AZURE_API_VERSION") is None and internal_llm_proxy is None:
+                raise ValueError("API version must be provided for Azure.")
+            litellm.api_version = api_version or os.getenv("AZURE_API_VERSION")
         else:
             raise ValueError(f"Provider is not recognized.")
 
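For context on the new `azure` branch above, here is a minimal sketch of a `model_config` that would exercise it. The entry point name `generate_qna` comes from the package's public API whose body is patched in the earlier hunks; treat the exact call signature, and all deployment/endpoint/version strings, as placeholders. Per the fallbacks above, the `AZURE_API_KEY`, `AZURE_API_BASE`, and `AZURE_API_VERSION` environment variables can stand in for the explicit arguments.

```python
# Hypothetical invocation of the new Azure path; all values are placeholders.
from ragaai_catalyst.synthetic_data_generation import SyntheticDataGeneration

sdg = SyntheticDataGeneration()
qa_df = sdg.generate_qna(
    "...source document text...",
    question_type="simple",
    n=10,
    model_config={
        "provider": "azure",
        "model": "my-gpt-4o-deployment",                     # Azure deployment name
        "api_base": "https://my-resource.openai.azure.com",  # resource endpoint
        "api_version": "2024-02-15-preview",                 # API version
    },
    api_key="<AZURE_API_KEY>",
)
```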
@@ -189,7 +210,15 @@ class SyntheticDataGeneration:
                 kwargs=kwargs
             )
 
+    def validate_input(self,text):
 
+        if not text.strip():
+            return 'Empty Text provided for qna generation. Please provide valid text'
+        encoding = tiktoken.encoding_for_model("gpt-4")
+        tokens = encoding.encode(text)
+        if len(tokens)<5:
+            return 'Very Small Text provided for qna generation. Please provide longer text'
+        return False
 
 
     def _get_system_message(self, question_type, n):
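The new `validate_input` gate rejects inputs before any batch is attempted: empty strings and texts under five GPT-4 tokens (as counted by `tiktoken`) cause `generate_qna` to raise a `ValueError`. A standalone sketch of the same rule, with made-up sample strings:

```python
import tiktoken

# Mirror of the new validation rule: empty text or fewer than 5 GPT-4
# tokens is rejected up front, before any LLM call is made.
encoding = tiktoken.encoding_for_model("gpt-4")
for sample in ["", "Hi", "The quick brown fox jumps over the lazy dog."]:
    if not sample.strip():
        print(repr(sample), "-> rejected (empty)")
    elif len(encoding.encode(sample)) < 5:
        print(repr(sample), "-> rejected (too short)")
    else:
        print(repr(sample), "-> accepted")
```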
@@ -207,7 +236,8 @@ class SyntheticDataGeneration:
             ValueError: If an invalid question type is specified.
         """
         if question_type == 'simple':
-            return f'''Generate a set of {n} very simple questions answerable in a single phrase.
+            return f'''Generate a set of {n} very simple questions answerable in a single phrase using the below text.
+                Only generate questions answerable from the text given, to cover all parts of the given document.
                 Also return the answers for the generated questions.
                 Return the response in a list of object format.
                 Each object in list should have Question and corresponding answer.
@@ -216,6 +246,7 @@ class SyntheticDataGeneration:
             '''
         elif question_type == 'mcq':
             return f'''Generate a set of {n} questions with 4 probable answers from the given text.
+                Only generate questions answerable from the text given, to cover all parts of the given document.
                 The options should not be longer than a phrase. There should be only 1 correct answer.
                 There should not be any ambiguity between correct and incorrect options.
                 Return the response in a list of object format.
@@ -225,6 +256,7 @@ class SyntheticDataGeneration:
             '''
         elif question_type == 'complex':
             return f'''Can you generate a set of {n} complex questions answerable in long form from the below texts.
+                Only generate questions answerable from the text given, to cover all parts of the given document.
                 Make sure the questions are important and provide new information to the user.
                 Return the response in a list of object format. Enclose any quotes in single quote.
                 Do not use double quotes within questions or answers.
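All three prompt variants above steer the model toward the same "list of object" payload, which `_parse_response` later loads with `json.loads` and wraps in a DataFrame. An illustrative round trip (the payload string is invented, not taken from the diff):

```python
import json
import pandas as pd

# Illustrative model output in the requested "list of object" shape.
raw = '[{"Question": "What does the text describe?", "Answer": "A sample topic."}]'
df = pd.DataFrame(json.loads(raw))
print(df.columns.tolist())  # ['Question', 'Answer']
```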
@@ -274,10 +306,14 @@ class SyntheticDataGeneration:
         # Add optional parameters if they exist in model_config
         if "api_base" in model_config:
             completion_params["api_base"] = model_config["api_base"]
+        if "api_version" in model_config:
+            completion_params["api_version"] = model_config["api_version"]
         if "max_tokens" in model_config:
             completion_params["max_tokens"] = model_config["max_tokens"]
         if "temperature" in model_config:
             completion_params["temperature"] = model_config["temperature"]
+        if 'provider' in model_config:
+            completion_params['model'] = f'{model_config["provider"]}/{model_config["model"]}'
 
         # Make the API call using LiteLLM
         try:
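The new `provider` handling rewrites the model identifier into LiteLLM's `<provider>/<model>` routing form. A small sketch of just that rewrite (config values are placeholders):

```python
# The 'provider' key prefixes the model id so LiteLLM can route the call.
model_config = {"provider": "azure", "model": "my-gpt-4o-deployment"}

completion_params = {"model": model_config.get("model", "gpt-4o")}
if "provider" in model_config:
    completion_params["model"] = f'{model_config["provider"]}/{model_config["model"]}'

print(completion_params["model"])  # azure/my-gpt-4o-deployment
```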
@@ -298,6 +334,59 @@ class SyntheticDataGeneration:
 
         json_data = json.loads(content)
         return pd.DataFrame(json_data)
+
+    def _generate_raw_llm_response(self, text, system_message: Optional[str] = None, model_config: Dict[str, Any] = dict(), api_key=None):
+        """
+        Generate questions using LiteLLM which supports multiple providers (OpenAI, Groq, Gemini, etc.).
+
+        Args:
+            text (str): The input text to generate questions from.
+            system_message (str): The system message for the AI model.
+            model_config (dict): Configuration dictionary containing model details.
+                Required keys:
+                - model: The model identifier (e.g., "gpt-4", "gemini-pro", "mixtral-8x7b-32768")
+                Optional keys:
+                - api_base: Custom API base URL if needed
+                - max_tokens: Maximum tokens in response
+                - temperature: Temperature for response generation
+            api_key (str, optional): The API key for the model provider.
+
+        Returns:
+            pandas.DataFrame: A DataFrame containing the generated questions and answers.
+
+        Raises:
+            Exception: If there's an error in generating the response.
+        """
+        messages = [
+            {"role": "system", "content": system_message},
+            {"role": "user", "content": text}
+        ]
+
+        completion_params = {
+            "model": model_config.get("model", 'gpt-4o'),
+            "messages": messages,
+            "api_key": api_key
+        }
+
+        if "api_base" in model_config:
+            completion_params["api_base"] = model_config["api_base"]
+        if "api_version" in model_config:
+            completion_params["api_version"] = model_config["api_version"]
+        if "max_tokens" in model_config:
+            completion_params["max_tokens"] = model_config["max_tokens"]
+        if "temperature" in model_config:
+            completion_params["temperature"] = model_config["temperature"]
+        if 'provider' in model_config:
+            completion_params['model'] = f'{model_config["provider"]}/{model_config["model"]}'
+
+        try:
+            response = completion(**completion_params)
+        except Exception as e:
+            if any(error in str(e).lower() for error in ["invalid api key", "incorrect api key", "unauthorized", "authentication"]):
+                raise ValueError(f"Invalid API key provided for {model_config.get('provider', 'the specified')} provider")
+            raise Exception(f"Error calling LLM API: {str(e)}")
+
+        return response.choices[0].message.content
 
     def _parse_response(self, response, provider):
         """
318
407
  list_start_index = data.find('[') # Find the index of the first '['
319
408
  substring_data = data[list_start_index:] if list_start_index != -1 else data # Slice from the list start
320
409
  data = substring_data
321
-
410
+ elif provider == "azure":
411
+ data = response.choices[0].message.content.replace('\n', '')
412
+ list_start_index = data.find('[') # Find the index of the first '['
413
+ substring_data = data[list_start_index:] if list_start_index != -1 else data # Slice from the list start
414
+ data = substring_data
322
415
  else:
323
- raise ValueError("Invalid provider. Choose 'groq', 'gemini', or 'openai'.")
416
+ raise ValueError("Invalid provider. Choose 'groq', 'gemini', 'azure' or 'openai'.")
324
417
  try:
325
418
  json_data = json.loads(data)
326
419
  return pd.DataFrame(json_data)
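The `azure` branch reuses the same defensive parse as the other providers: strip newlines, then slice from the first `[` so any preamble the model emits before the JSON list is discarded. Standalone, with an invented response string:

```python
# Invented response with chatter before the JSON list, showing why the
# parser slices from the first '[' before calling json.loads.
data = 'Sure! Here are your questions: [{"Question": "Q1", "Answer": "A1"}]'
list_start_index = data.find('[')
data = data[list_start_index:] if list_start_index != -1 else data
print(data)  # [{"Question": "Q1", "Answer": "A1"}]
```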
@@ -442,7 +535,292 @@ class SyntheticDataGeneration:
         Returns:
             list: A list of supported AI providers.
         """
-        return ['gemini', 'openai']
+        return ['gemini', 'openai','azure']
+
+    def _get_init_ex_gen_prompt(self):
+        prompt = '''
+        You are an expert example generator. Your task is to produce creative, relevant and varied examples according to the user instructions.
+
+        **Inputs**
+        User Instruction: The user will provide guidance on how to generate examples, possibly accompanied by their own examples.
+        User Examples[Optional]: The user may supply examples.
+        User Context[Optional]: The user may supply context to generate the examples from.
+        No of Examples: The total number of examples to produce.
+
+        **Steps to follow**
+        1. Carefully analyze the user's instruction
+        2. If user examples are provided, check whether the user's instructions refer to them specifically.
+        3. If user context is provided, understand it thoroughly and identify relevant parts to generate examples.
+        4. Comply with the system's guidelines to generate examples, incorporating any user examples or user context as needed.
+
+        **Output Format**:
+        - Present examples in a multiline string with each line a separate example.
+        - Avoid markdown or special formatting.
+        - Omit any boilerplate texts.
+
+        **Instructions for Diversity**:
+        - Vary the examples by context, tone, and (if applicable) technical complexity.
+        - Include edge cases or unconventional scenarios.
+        - Ensure no two examples are conceptually identical.
+
+        **Final Notes**:
+        - Focus on both originality and practical relevance.
+        - Avoid repetitiveness in the examples.
+        '''
+        return prompt
+
+    def _get_iter_ex_gen_prompt(self):
+        prompt = '''
+        You are an expert example generator. Your task is to produce creative, relevant and varied examples according to the user instructions.
+
+        **Inputs**
+        User Instruction: The user will provide guidance on how to generate examples, possibly accompanied by their own examples.
+        User Examples[Optional]: The user may supply examples.
+        User Context[Optional]: The user may supply context to generate the examples from.
+        No of Examples: The total number of examples to produce.
+        Relevant Examples: Any examples that are relevant to the user's instruction.
+        Irrelevant Examples: Any examples that are not relevant to the user's instruction.
+
+        **Steps to follow**
+        1. Carefully analyze the user's instruction
+        2. If user examples are provided, check whether the user's instructions refer to them specifically.
+        3. If user context is provided, understand it thoroughly and identify relevant parts to generate examples.
+        4. Review the relevant and irrelevant examples present, understanding the differences in them.
+        5. Comply with the user's instruction to generate examples, similar to relevant examples and dissimilar to irrelevant ones.
+
+        **Output Format**:
+        - Present examples in a multiline string with each line a separate example.
+        - Avoid markdown or special formatting.
+        - Omit any boilerplate texts.
+
+        **Instructions for Diversity**:
+        - Vary the examples by context, tone, and (if applicable) technical complexity.
+        - Include edge cases or unconventional scenarios.
+        - Ensure no two examples are conceptually identical.
+
+        **Final Notes**:
+        - Focus on both originality and practical relevance.
+        - Avoid repetitiveness in the examples.
+        '''
+        return prompt
+
+    def _generate_examples_iter(
+        self,
+        user_instruction: str,
+        user_examples: Optional[List[str] | str] = None,
+        user_context: Optional[str] = None,
+        relevant_examples: List[str]=[],
+        irrelevant_examples: List[str]=[],
+        no_examples: Optional[int] = None,
+        model_config: Dict[str, Any] = dict(),
+        api_key: Optional[str] = None
+    ):
+        if no_examples is None:
+            no_examples = 5
+        relevant_examples_str = '\n'.join(relevant_examples)
+        irrelevant_examples_str = '\n'.join(irrelevant_examples)
+        user_message = f'**User Instruction:** {user_instruction}'
+        user_message += f'\n\n**No of Examples:** {no_examples}'
+        if user_examples:
+            if isinstance(user_examples, str):
+                user_examples_str = user_examples
+            elif isinstance(user_examples, list):
+                user_examples_str = "\n".join(user_examples)
+            else:
+                raise ValueError(f'Expected string or list of strings as user_examples got {type(user_examples)}')
+            user_message += f"\n\n**User Examples:** \n{user_examples_str}"
+        if relevant_examples:
+            user_message += f'\n\n**Relevant Examples:** \n{relevant_examples_str}'
+        if irrelevant_examples:
+            user_message += f'\n\n**Irrelevant Examples:** \n{irrelevant_examples_str}'
+        if user_context:
+            user_message += f'\n\n**User Context:** \n{user_context}'
+        system_prompt = self._get_iter_ex_gen_prompt()
+        return self._generate_raw_llm_response(user_message, system_prompt, model_config=model_config, api_key=api_key)
+
+    def _generate_examples(
+        self,
+        user_instruction:str,
+        user_examples:Optional[List[str]|str]=None,
+        user_context: Optional[str] = None,
+        no_examples:Optional[int]=None,
+        model_config: Dict[str, Any] = dict(),
+        api_key: Optional[str] = None
+    ):
+        if no_examples is None:
+            no_examples = 5
+        user_message = f"**User Instruction:** {user_instruction}"
+        if user_examples:
+            if isinstance(user_examples, str):
+                user_examples_str = user_examples
+            elif isinstance(user_examples, list):
+                user_examples_str = "\n".join(user_examples)
+            else:
+                raise ValueError(f'Expected string or list of strings as user_examples got {type(user_examples)}')
+            user_message += f"\n\n**User Examples:** \n{user_examples_str}"
+        if user_context:
+            user_message += f'\n\n**User Context:** \n{user_context}'
+        user_message += f'\n\n**No of Examples:** {no_examples}'
+        init_system_prompt = self._get_init_ex_gen_prompt()
+        return self._generate_raw_llm_response(user_message, init_system_prompt, model_config=model_config, api_key=api_key)
+
+    def _get_valid_examples(self, user_indices_str: str, examples: List[str]):
+        valid_examples = []
+        try:
+            user_indices = user_indices_str.strip().split(',')
+            for index_str in user_indices:
+                try:
+                    index = int(index_str)
+                    if index <= 0 or index > len(examples):
+                        continue
+                except ValueError as e:
+                    continue
+                valid_examples.append(examples[index-1])
+        except Exception as e:
+            print(f'Error: {e}')
+        return valid_examples
+
+    def generate_examples(
+        self,
+        user_instruction: str,
+        user_examples:Optional[List[str] | str] = None,
+        user_context: Optional[str] = None,
+        no_examples: Optional[int] = None,
+        model_config: Optional[Dict[str, Any]] = None,
+        api_key: Optional[str] = None,
+        max_iter: int = 0,
+        **kwargs
+    ):
+        if not model_config:
+            model_config = {}
+        provider = model_config.get("provider")
+        api_base = model_config.get("api_base")
+        api_version = model_config.get("api_version")
+        self._initialize_client(provider, api_key, api_base, api_version, internal_llm_proxy=kwargs.get("internal_llm_proxy", None))
+
+        if no_examples is None:
+            no_examples = 5
+        assert no_examples >= 0, 'The number of examples cannot be less than 0'
+        relevant_examples = []
+        irrelevant_examples = []
+        max_relevant_examples = 5
+        max_irrelevant_examples = 10
+        while len(relevant_examples) <= max_relevant_examples or len(irrelevant_examples) <= max_irrelevant_examples:
+            if max_iter <= 0:
+                break
+            if len(relevant_examples) > max_relevant_examples:
+                relevant_examples = random.sample(relevant_examples, max_relevant_examples)
+            if len(irrelevant_examples) > max_irrelevant_examples:
+                irrelevant_examples = random.sample(irrelevant_examples, max_irrelevant_examples)
+            if relevant_examples or irrelevant_examples:
+                examples_str = self._generate_examples_iter(
+                    user_instruction = user_instruction,
+                    user_examples = user_examples,
+                    relevant_examples = relevant_examples,
+                    irrelevant_examples = irrelevant_examples,
+                    model_config = model_config,
+                    api_key = api_key
+                )
+            else:
+                examples_str = self._generate_examples(
+                    user_instruction = user_instruction,
+                    user_examples = user_examples,
+                    user_context = user_context,
+                    model_config = model_config,
+                    api_key = api_key
+                )
+            examples = [example for example in examples_str.split('\n') if example.strip()]
+            print('Generated Examples:')
+            for i, example in enumerate(examples):
+                print(f'{i+1}. {example}')
+            relevant_indices = input('Enter the indices of relevant examples (comma-separated): ').strip()
+            if relevant_indices:
+                relevant_examples.extend(self._get_valid_examples(relevant_indices, examples))
+            irrelevant_indices = input('Enter the indices of irrelevant examples (comma-separated): ').strip()
+            if irrelevant_indices:
+                irrelevant_examples.extend(self._get_valid_examples(irrelevant_indices, examples))
+            max_iter -= 1
+        if len(relevant_examples) > max_relevant_examples:
+            fin_relevant_examples = random.sample(relevant_examples, max_relevant_examples)
+        else:
+            fin_relevant_examples = relevant_examples
+        if len(irrelevant_examples) > max_irrelevant_examples:
+            fin_irrelevant_examples = random.sample(irrelevant_examples, max_irrelevant_examples)
+        else:
+            fin_irrelevant_examples = irrelevant_examples
+        if relevant_examples or irrelevant_examples:
+            if len(relevant_examples) < no_examples:
+                more_no_examples = no_examples - len(relevant_examples)
+                final_examples_str = self._generate_examples_iter(
+                    user_instruction = user_instruction,
+                    user_examples = user_examples,
+                    user_context = user_context,
+                    relevant_examples = fin_relevant_examples,
+                    irrelevant_examples = fin_irrelevant_examples,
+                    no_examples = more_no_examples,
+                    model_config = model_config,
+                    api_key = api_key
+                )
+                final_examples = [example for example in final_examples_str.split('\n') if example.strip()]
+                final_examples.extend(relevant_examples)
+            else:
+                final_examples = random.sample(relevant_examples, no_examples)
+        else:
+            final_examples_str = self._generate_examples(
+                user_instruction = user_instruction,
+                user_examples = user_examples,
+                user_context = user_context,
+                no_examples = no_examples,
+                model_config = model_config,
+                api_key = api_key
+            )
+            final_examples = [example for example in final_examples_str.split('\n') if example.strip()]
+        return final_examples
+
+
+    def generate_examples_from_csv(
+        self,
+        csv_path: str,
+        dst_csv_path: Optional[str] = None,
+        no_examples: Optional[int] = None,
+        model_config: Optional[Dict[str, Any]] = None,
+        api_key: Optional[str] = None,
+        **kwargs
+    ):
+        if no_examples is None:
+            no_examples = 5
+        assert no_examples >= 0, 'The number of examples cannot be less than 0'
+        df = pd.read_csv(csv_path)
+        assert 'user_instruction' in df.columns, 'The csv must have a column named user_instruction'
+        fin_df_list = []
+        for i, row in df.iterrows():
+            user_instruction = row['user_instruction']
+            user_examples = row.get('user_examples')
+            user_context = row.get('user_context')
+            row_dict = row.to_dict()
+            try:
+                examples = self.generate_examples(
+                    user_instruction = user_instruction,
+                    user_examples = user_examples,
+                    user_context = user_context,
+                    no_examples = no_examples,
+                    model_config = model_config,
+                    api_key = api_key
+                )
+            except Exception as e:
+                continue
+            row_dict['generated_examples'] = examples
+            fin_df_list.append(row_dict)
+        fin_df = pd.DataFrame(fin_df_list)
+        csv_file, csv_ext = os.path.splitext(csv_path)
+        if not dst_csv_path:
+            dst_csv_path = csv_file + '_with_examples' + csv_ext
+        dst_dir = os.path.dirname(dst_csv_path)
+        if dst_dir:
+            os.makedirs(dst_dir, exist_ok=True)
+        fin_df.to_csv(dst_csv_path)
+        logger.info(f'CSV with generated examples saved at {dst_csv_path}')
+
 
 # Usage:
 # from synthetic_data_generation import SyntheticDataGeneration
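Taken together, these additions give two entry points: `generate_examples`, which runs a single pass when `max_iter` is 0 (the default) and otherwise loops with interactive `input()` prompts to collect relevant/irrelevant feedback, and `generate_examples_from_csv`, which applies it row by row. A sketch of non-interactive use; the model, key, and file names are placeholders:

```python
from ragaai_catalyst.synthetic_data_generation import SyntheticDataGeneration

sdg = SyntheticDataGeneration()

# Single-pass generation: with max_iter=0 the feedback loop is skipped.
examples = sdg.generate_examples(
    user_instruction="Generate short, realistic customer-support queries.",
    no_examples=5,
    model_config={"provider": "openai", "model": "gpt-4o-mini"},  # placeholder
    api_key="<OPENAI_API_KEY>",                                   # placeholder
)

# Batch variant: the CSV must have a 'user_instruction' column; output goes
# to '<input>_with_examples.csv' unless dst_csv_path is given.
sdg.generate_examples_from_csv(
    csv_path="instructions.csv",
    model_config={"provider": "openai", "model": "gpt-4o-mini"},
    api_key="<OPENAI_API_KEY>",
)
```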
ragaai_catalyst/tracers/__init__.py
@@ -1,3 +1,19 @@
 from .tracer import Tracer
+from .distributed import (
+    init_tracing,
+    trace_agent,
+    trace_llm,
+    trace_tool,
+    current_span,
+    trace_custom,
+)
 
-__all__ = ["Tracer"]
+__all__ = [
+    "Tracer",
+    "init_tracing",
+    "trace_agent",
+    "trace_llm",
+    "trace_tool",
+    "current_span",
+    "trace_custom"
+]
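This hunk confirms only the names exported from the new `distributed` module (defined in `ragaai_catalyst/tracers/distributed.py`, added in this release). The decorator signatures in the sketch below are assumptions, not shown anywhere in this diff:

```python
# Assumed usage of the new distributed-tracing exports; only the names are
# confirmed by this diff, the signatures are guesses.
from ragaai_catalyst.tracers import trace_tool, current_span

@trace_tool("vector_lookup")   # assumed: decorator accepts a span name
def vector_lookup(query: str) -> str:
    span = current_span()      # assumed: returns the active span handle
    return query.upper()
```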
ragaai_catalyst/tracers/agentic_tracing/data/data_structure.py
@@ -271,7 +271,7 @@ class ComponentInfo:
     cost: Optional[Dict[str, float]] = None
 
 class Trace:
-    def __init__(self, id: str, trace_name: str, project_name: str, start_time: str, end_time: str, metadata: Optional[Metadata] = None, data: Optional[List[Dict[str, Any]]] = None, replays: Optional[Dict[str, Any]] = None):
+    def __init__(self, id: str, trace_name: str, project_name: str, start_time: str, end_time: str, metadata: Optional[Metadata] = None, data: Optional[List[Dict[str, Any]]] = None, replays: Optional[Dict[str, Any]] = None, metrics: Optional[List[Dict[str, Any]]] = None):
         self.id = id
         self.trace_name = trace_name
         self.project_name = project_name
@@ -280,6 +280,7 @@ class Trace:
         self.metadata = metadata or Metadata()
         self.data = data or []
         self.replays = replays
+        self.metrics = metrics or []
 
     def to_dict(self):
         return {
@@ -288,7 +289,8 @@ class Trace:
             "project_name": self.project_name,
             "start_time": self.start_time,
             "end_time": self.end_time,
-            "metadata": self.metadata.to_dict() if self.metadata else None,
+            "metadata": self.metadata,
             "data": self.data,
             "replays": self.replays,
+            "metrics": self.metrics
         }
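After these changes a serialized trace carries a top-level `metrics` list (defaulting to `[]`), and `metadata` is emitted as-is rather than via a `.to_dict()` call. An illustrative shape of the `to_dict()` output; every value below is a placeholder:

```python
# Illustrative Trace.to_dict() output after this change.
trace_dict = {
    "id": "trace-123",
    "trace_name": "demo-trace",
    "project_name": "demo-project",
    "start_time": "2025-01-01T00:00:00Z",
    "end_time": "2025-01-01T00:00:05Z",
    "metadata": {},   # emitted as-is; no .to_dict() call anymore
    "data": [],
    "replays": None,
    "metrics": [],    # new field, defaults to []
}
```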