ragaai-catalyst 2.1.4.1b0__py3-none-any.whl → 2.1.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ragaai_catalyst/__init__.py +23 -2
- ragaai_catalyst/dataset.py +462 -1
- ragaai_catalyst/evaluation.py +76 -7
- ragaai_catalyst/ragaai_catalyst.py +52 -10
- ragaai_catalyst/redteaming/__init__.py +7 -0
- ragaai_catalyst/redteaming/config/detectors.toml +13 -0
- ragaai_catalyst/redteaming/data_generator/scenario_generator.py +95 -0
- ragaai_catalyst/redteaming/data_generator/test_case_generator.py +120 -0
- ragaai_catalyst/redteaming/evaluator.py +125 -0
- ragaai_catalyst/redteaming/llm_generator.py +136 -0
- ragaai_catalyst/redteaming/llm_generator_old.py +83 -0
- ragaai_catalyst/redteaming/red_teaming.py +331 -0
- ragaai_catalyst/redteaming/requirements.txt +4 -0
- ragaai_catalyst/redteaming/tests/grok.ipynb +97 -0
- ragaai_catalyst/redteaming/tests/stereotype.ipynb +2258 -0
- ragaai_catalyst/redteaming/upload_result.py +38 -0
- ragaai_catalyst/redteaming/utils/issue_description.py +114 -0
- ragaai_catalyst/redteaming/utils/rt.png +0 -0
- ragaai_catalyst/redteaming_old.py +171 -0
- ragaai_catalyst/synthetic_data_generation.py +400 -22
- ragaai_catalyst/tracers/__init__.py +17 -1
- ragaai_catalyst/tracers/agentic_tracing/data/data_structure.py +4 -2
- ragaai_catalyst/tracers/agentic_tracing/tracers/agent_tracer.py +212 -148
- ragaai_catalyst/tracers/agentic_tracing/tracers/base.py +657 -247
- ragaai_catalyst/tracers/agentic_tracing/tracers/custom_tracer.py +50 -19
- ragaai_catalyst/tracers/agentic_tracing/tracers/llm_tracer.py +588 -177
- ragaai_catalyst/tracers/agentic_tracing/tracers/main_tracer.py +99 -100
- ragaai_catalyst/tracers/agentic_tracing/tracers/network_tracer.py +3 -3
- ragaai_catalyst/tracers/agentic_tracing/tracers/tool_tracer.py +230 -29
- ragaai_catalyst/tracers/agentic_tracing/upload/trace_uploader.py +358 -0
- ragaai_catalyst/tracers/agentic_tracing/upload/upload_agentic_traces.py +75 -20
- ragaai_catalyst/tracers/agentic_tracing/upload/upload_code.py +55 -11
- ragaai_catalyst/tracers/agentic_tracing/upload/upload_local_metric.py +74 -0
- ragaai_catalyst/tracers/agentic_tracing/upload/upload_trace_metric.py +47 -16
- ragaai_catalyst/tracers/agentic_tracing/utils/create_dataset_schema.py +4 -2
- ragaai_catalyst/tracers/agentic_tracing/utils/file_name_tracker.py +26 -3
- ragaai_catalyst/tracers/agentic_tracing/utils/llm_utils.py +182 -17
- ragaai_catalyst/tracers/agentic_tracing/utils/model_costs.json +1233 -497
- ragaai_catalyst/tracers/agentic_tracing/utils/span_attributes.py +81 -10
- ragaai_catalyst/tracers/agentic_tracing/utils/supported_llm_provider.toml +34 -0
- ragaai_catalyst/tracers/agentic_tracing/utils/system_monitor.py +215 -0
- ragaai_catalyst/tracers/agentic_tracing/utils/trace_utils.py +0 -32
- ragaai_catalyst/tracers/agentic_tracing/utils/unique_decorator.py +3 -1
- ragaai_catalyst/tracers/agentic_tracing/utils/zip_list_of_unique_files.py +73 -47
- ragaai_catalyst/tracers/distributed.py +300 -0
- ragaai_catalyst/tracers/exporters/__init__.py +3 -1
- ragaai_catalyst/tracers/exporters/dynamic_trace_exporter.py +160 -0
- ragaai_catalyst/tracers/exporters/ragaai_trace_exporter.py +129 -0
- ragaai_catalyst/tracers/langchain_callback.py +809 -0
- ragaai_catalyst/tracers/llamaindex_instrumentation.py +424 -0
- ragaai_catalyst/tracers/tracer.py +301 -55
- ragaai_catalyst/tracers/upload_traces.py +24 -7
- ragaai_catalyst/tracers/utils/convert_langchain_callbacks_output.py +61 -0
- ragaai_catalyst/tracers/utils/convert_llama_instru_callback.py +69 -0
- ragaai_catalyst/tracers/utils/extraction_logic_llama_index.py +74 -0
- ragaai_catalyst/tracers/utils/langchain_tracer_extraction_logic.py +82 -0
- ragaai_catalyst/tracers/utils/model_prices_and_context_window_backup.json +9365 -0
- ragaai_catalyst/tracers/utils/trace_json_converter.py +269 -0
- {ragaai_catalyst-2.1.4.1b0.dist-info → ragaai_catalyst-2.1.5.dist-info}/METADATA +367 -45
- ragaai_catalyst-2.1.5.dist-info/RECORD +97 -0
- {ragaai_catalyst-2.1.4.1b0.dist-info → ragaai_catalyst-2.1.5.dist-info}/WHEEL +1 -1
- ragaai_catalyst-2.1.4.1b0.dist-info/RECORD +0 -67
- {ragaai_catalyst-2.1.4.1b0.dist-info → ragaai_catalyst-2.1.5.dist-info}/LICENSE +0 -0
- {ragaai_catalyst-2.1.4.1b0.dist-info → ragaai_catalyst-2.1.5.dist-info}/top_level.txt +0 -0
ragaai_catalyst/synthetic_data_generation.py:

@@ -1,23 +1,28 @@
 import os
-
-import google.generativeai as genai
-import openai
-import PyPDF2
+import ast
 import csv
+import json
+import random
+import PyPDF2
 import markdown
 import pandas as pd
-import json
-from litellm import completion
 from tqdm import tqdm
-
-
+
+import openai
+import tiktoken
+import litellm
+import google.generativeai as genai
+from groq import Groq
+from litellm import completion
+
 from .internal_api_completion import api_completion as internal_api_completion
 from .proxy_call import api_completion as proxy_api_completion
-# from ragaai_catalyst import internal_api_completion
-# from ragaai_catalyst import proxy_call
-import ast
 
-
+from typing import Optional, List, Dict, Any
+
+import logging
+
+logger = logging.getLogger(__name__)
 
 class SyntheticDataGeneration:
     """
@@ -48,13 +53,18 @@ class SyntheticDataGeneration:
         Raises:
             ValueError: If an invalid provider is specified or API key is missing.
         """
+        text_validity = self.validate_input(text)
+        if text_validity:
+            raise ValueError(text_validity)
+
         BATCH_SIZE = 5  # Optimal batch size for maintaining response quality
         provider = model_config.get("provider")
         model = model_config.get("model")
         api_base = model_config.get("api_base")
+        api_version = model_config.get("api_version")
 
         # Initialize the appropriate client based on provider
-        self._initialize_client(provider, api_key, api_base, internal_llm_proxy=kwargs.get("internal_llm_proxy", None))
+        self._initialize_client(provider, api_key, api_base, api_version, internal_llm_proxy=kwargs.get("internal_llm_proxy", None))
 
         # Initialize progress bar
         pbar = tqdm(total=n, desc="Generating QA pairs")
@@ -68,7 +78,9 @@ class SyntheticDataGeneration:
             "No connection adapters",
             "Required API Keys are not set",
             "litellm.BadRequestError",
-            "litellm.AuthenticationError"
+            "litellm.AuthenticationError",
+            "Max retries exceeded"
+        ]
 
         for _ in range(num_batches):
             current_batch_size = min(BATCH_SIZE, n - len(all_responses))
@@ -77,7 +89,6 @@ class SyntheticDataGeneration:
 
             try:
                 system_message = self._get_system_message(question_type, current_batch_size)
-
                 if "internal_llm_proxy" in kwargs:
                     batch_df = self._generate_internal_response(text, system_message, model_config, kwargs)
                 else:
|
|
88
99
|
pbar.update(len(batch_df))
|
89
100
|
|
90
101
|
except Exception as e:
|
91
|
-
print(f"Batch generation failed
|
102
|
+
print(f"Batch generation failed:{str(e)}")
|
92
103
|
|
93
104
|
if any(error in str(e) for error in FAILURE_CASES):
|
94
105
|
raise Exception(f"{e}")
|
@@ -139,7 +150,7 @@ class SyntheticDataGeneration:
 
         return final_df
 
-    def _initialize_client(self, provider, api_key, api_base=None, internal_llm_proxy=None):
+    def _initialize_client(self, provider, api_key, api_base=None, api_version=None, internal_llm_proxy=None):
         """Initialize the appropriate client based on provider."""
         if not provider:
             raise ValueError("Model configuration must be provided with a valid provider and model.")
@@ -158,7 +169,17 @@ class SyntheticDataGeneration:
             if api_key is None and os.getenv("OPENAI_API_KEY") is None and internal_llm_proxy is None:
                 raise ValueError("API key must be provided for OpenAI.")
             openai.api_key = api_key or os.getenv("OPENAI_API_KEY")
-
+
+        elif provider == "azure":
+            if api_key is None and os.getenv("AZURE_API_KEY") is None and internal_llm_proxy is None:
+                raise ValueError("API key must be provided for Azure.")
+            litellm.api_key = api_key or os.getenv("AZURE_API_KEY")
+            if api_base is None and os.getenv("AZURE_API_BASE") is None and internal_llm_proxy is None:
+                raise ValueError("API Base must be provided for Azure.")
+            litellm.api_base = api_base or os.getenv("AZURE_API_BASE")
+            if api_version is None and os.getenv("AZURE_API_VERSION") is None and internal_llm_proxy is None:
+                raise ValueError("API version must be provided for Azure.")
+            litellm.api_version = api_version or os.getenv("AZURE_API_VERSION")
         else:
             raise ValueError(f"Provider is not recognized.")
 
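The hunk above wires Azure into client initialization. A minimal sketch of how the new path might be exercised, assuming the public QnA entry point is `generate_qna` (the method name, deployment, endpoint, and version values are not visible in this diff and are illustrative only):

```python
import os
from ragaai_catalyst import SyntheticDataGeneration

sdg = SyntheticDataGeneration()

# With provider="azure", _initialize_client now insists on a key, base URL and
# API version, falling back to AZURE_API_KEY / AZURE_API_BASE / AZURE_API_VERSION.
qna_df = sdg.generate_qna(                                   # assumed method name
    text="...long source document...",
    question_type="simple",
    n=5,
    model_config={
        "provider": "azure",
        "model": "gpt-4o-mini",                              # placeholder deployment
        "api_base": "https://my-resource.openai.azure.com",  # placeholder endpoint
        "api_version": "2024-02-15-preview",                 # placeholder version
    },
    api_key=os.getenv("AZURE_API_KEY"),
)
```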
@@ -189,7 +210,15 @@ class SyntheticDataGeneration:
                 kwargs=kwargs
             )
 
+    def validate_input(self,text):
 
+        if not text.strip():
+            return 'Empty Text provided for qna generation. Please provide valid text'
+        encoding = tiktoken.encoding_for_model("gpt-4")
+        tokens = encoding.encode(text)
+        if len(tokens)<5:
+            return 'Very Small Text provided for qna generation. Please provide longer text'
+        return False
 
 
     def _get_system_message(self, question_type, n):
@@ -207,7 +236,8 @@ class SyntheticDataGeneration:
             ValueError: If an invalid question type is specified.
         """
         if question_type == 'simple':
-            return f'''Generate a set of {n} very simple questions answerable in a single phrase.
+            return f'''Generate a set of {n} very simple questions answerable in a single phrase using the below text.
+                Only generate questions answerable from the text given, to cover all parts of the given document.
                 Also return the answers for the generated questions.
                 Return the response in a list of object format.
                 Each object in list should have Question and corresponding answer.
@@ -216,6 +246,7 @@ class SyntheticDataGeneration:
             '''
         elif question_type == 'mcq':
             return f'''Generate a set of {n} questions with 4 probable answers from the given text.
+                Only generate questions answerable from the text given, to cover all parts of the given document.
                 The options should not be longer than a phrase. There should be only 1 correct answer.
                 There should not be any ambiguity between correct and incorrect options.
                 Return the response in a list of object format.
@@ -225,6 +256,7 @@ class SyntheticDataGeneration:
             '''
         elif question_type == 'complex':
            return f'''Can you generate a set of {n} complex questions answerable in long form from the below texts.
+                Only generate questions answerable from the text given, to cover all parts of the given document.
                 Make sure the questions are important and provide new information to the user.
                 Return the response in a list of object format. Enclose any quotes in single quote.
                 Do not use double quotes within questions or answers.
@@ -274,10 +306,14 @@ class SyntheticDataGeneration:
         # Add optional parameters if they exist in model_config
         if "api_base" in model_config:
             completion_params["api_base"] = model_config["api_base"]
+        if "api_version" in model_config:
+            completion_params["api_version"] = model_config["api_version"]
         if "max_tokens" in model_config:
             completion_params["max_tokens"] = model_config["max_tokens"]
         if "temperature" in model_config:
             completion_params["temperature"] = model_config["temperature"]
+        if 'provider' in model_config:
+            completion_params['model'] = f'{model_config["provider"]}/{model_config["model"]}'
 
         # Make the API call using LiteLLM
         try:
@@ -298,6 +334,59 @@ class SyntheticDataGeneration:
 
         json_data = json.loads(content)
         return pd.DataFrame(json_data)
+
+    def _generate_raw_llm_response(self, text, system_message: Optional[str] = None, model_config: Dict[str, Any] = dict(), api_key=None):
+        """
+        Generate questions using LiteLLM which supports multiple providers (OpenAI, Groq, Gemini, etc.).
+
+        Args:
+            text (str): The input text to generate questions from.
+            system_message (str): The system message for the AI model.
+            model_config (dict): Configuration dictionary containing model details.
+                Required keys:
+                - model: The model identifier (e.g., "gpt-4", "gemini-pro", "mixtral-8x7b-32768")
+                Optional keys:
+                - api_base: Custom API base URL if needed
+                - max_tokens: Maximum tokens in response
+                - temperature: Temperature for response generation
+            api_key (str, optional): The API key for the model provider.
+
+        Returns:
+            pandas.DataFrame: A DataFrame containing the generated questions and answers.
+
+        Raises:
+            Exception: If there's an error in generating the response.
+        """
+        messages = [
+            {"role": "system", "content": system_message},
+            {"role": "user", "content": text}
+        ]
+
+        completion_params = {
+            "model": model_config.get("model", 'gpt-4o'),
+            "messages": messages,
+            "api_key": api_key
+        }
+
+        if "api_base" in model_config:
+            completion_params["api_base"] = model_config["api_base"]
+        if "api_version" in model_config:
+            completion_params["api_version"] = model_config["api_version"]
+        if "max_tokens" in model_config:
+            completion_params["max_tokens"] = model_config["max_tokens"]
+        if "temperature" in model_config:
+            completion_params["temperature"] = model_config["temperature"]
+        if 'provider' in model_config:
+            completion_params['model'] = f'{model_config["provider"]}/{model_config["model"]}'
+
+        try:
+            response = completion(**completion_params)
+        except Exception as e:
+            if any(error in str(e).lower() for error in ["invalid api key", "incorrect api key", "unauthorized", "authentication"]):
+                raise ValueError(f"Invalid API key provided for {model_config.get('provider', 'the specified')} provider")
+            raise Exception(f"Error calling LLM API: {str(e)}")
+
+        return response.choices[0].message.content
 
     def _parse_response(self, response, provider):
         """
@@ -318,9 +407,13 @@ class SyntheticDataGeneration:
             list_start_index = data.find('[')  # Find the index of the first '['
             substring_data = data[list_start_index:] if list_start_index != -1 else data  # Slice from the list start
             data = substring_data
-
+        elif provider == "azure":
+            data = response.choices[0].message.content.replace('\n', '')
+            list_start_index = data.find('[')  # Find the index of the first '['
+            substring_data = data[list_start_index:] if list_start_index != -1 else data  # Slice from the list start
+            data = substring_data
         else:
-            raise ValueError("Invalid provider. Choose 'groq', 'gemini', or 'openai'.")
+            raise ValueError("Invalid provider. Choose 'groq', 'gemini', 'azure' or 'openai'.")
         try:
             json_data = json.loads(data)
             return pd.DataFrame(json_data)
@@ -442,7 +535,292 @@ class SyntheticDataGeneration:
         Returns:
             list: A list of supported AI providers.
         """
-        return ['gemini', 'openai']
+        return ['gemini', 'openai','azure']
+
+    def _get_init_ex_gen_prompt(self):
+        prompt = '''
+        You are an expert example generator. Your task is to produce creative, relevant and varied examples according to the user instructions.
+
+        **Inputs**
+        User Instruction: The user will provide guidance on how to generate examples, possibly accompanied by their own examples.
+        User Examples[Optional]: The user may supply examples.
+        User Context[Optional]: The user may supply context to generate the examples from.
+        No of Examples: The total number of examples to produce.
+
+        **Steps to follow**
+        1. Carefully analyze the user's instruction
+        2. If user examples are provided, check whether the user's instructions refer to them specifically.
+        3. If user context is provided, understand it thoroughly and identify relevant parts to generate examples.
+        4. Comply with the system's guidelines to generate examples, incorporating any user examples or user context as needed.
+
+        **Output Format**:
+        - Present examples in a multiline string with each line a separate example.
+        - Avoid markdown or special formatting.
+        - Omit any boilerplate texts.
+
+        **Instructions for Diversity**:
+        - Vary the examples by context, tone, and (if applicable) technical complexity.
+        - Include edge cases or unconventional scenarios.
+        - Ensure no two examples are conceptually identical.
+
+        **Final Notes**:
+        - Focus on both originality and practical relevance.
+        - Avoid repetitiveness in the examples.
+        '''
+        return prompt
+
+    def _get_iter_ex_gen_prompt(self):
+        prompt = '''
+        You are an expert example generator. Your task is to produce creative, relevant and varied examples according to the user instructions.
+
+        **Inputs**
+        User Instruction: The user will provide guidance on how to generate examples, possibly accompanied by their own examples.
+        User Examples[Optional]: The user may supply examples.
+        User Context[Optional]: The user may supply context to generate the examples from.
+        No of Examples: The total number of examples to produce.
+        Relevant Examples: Any examples that are relevant to the user's instruction.
+        Irrelevant Examples: Any examples that are not relevant to the user's instruction.
+
+        **Steps to follow**
+        1. Carefully analyze the user's instruction
+        2. If user examples are provided, check whether the user's instructions refer to them specifically.
+        3. If user context is provided, understand it thoroughly and identify relevant parts to generate examples.
+        4. Review the relevant and irrelevant examples present, understanding the differences in them.
+        5. Comply with the user's instruction to generate examples, similar to relevant examples and dissimilar to irrelevant ones.
+
+        **Output Format**:
+        - Present examples in a multiline sting with each line a separate example.
+        - Avoid markdown or special formatting.
+        - Omit any boilerplate texts.
+
+        **Instructions for Diversity**:
+        - Vary the examples by context, tone, and (if applicable) technical complexity.
+        - Include edge cases or unconventional scenarios.
+        - Ensure no two examples are conceptually identical.
+
+        **Final Notes**:
+        - Focus on both originality and practical relevance.
+        - Avoid repetitiveness in the examples.
+        '''
+        return prompt
+
+    def _generate_examples_iter(
+        self,
+        user_instruction: str,
+        user_examples: Optional[List[str] | str] = None,
+        user_context: Optional[str] = None,
+        relevant_examples: List[str]=[],
+        irrelevant_examples: List[str]=[],
+        no_examples: Optional[int] = None,
+        model_config: Dict[str, Any] = dict(),
+        api_key: Optional[str] = None
+    ):
+        if no_examples is None:
+            no_examples = 5
+        relevant_examples_str = '\n'.join(relevant_examples)
+        irrelevant_examples_str = '\n'.join(irrelevant_examples)
+        user_message = f'**User Instruction:** {user_instruction}'
+        user_message += f'\n\n**No of Examples:** {no_examples}'
+        if user_examples:
+            if isinstance(user_examples, str):
+                user_examples_str = user_examples
+            elif isinstance(user_examples, list):
+                user_examples_str = "\n".join(user_examples)
+            else:
+                raise ValueError(f'Expected string or list of strings as user_examples got {type(user_examples)}')
+            user_message += f"\n\n**User Examples:** \n{user_examples_str}"
+        if relevant_examples:
+            user_message += f'\n\n**Relevant Examples:** \n{relevant_examples_str}'
+        if irrelevant_examples:
+            user_message += f'\n\n**Irrelevant Examples:** \n{irrelevant_examples_str}'
+        if user_context:
+            user_message += f'\n\n**User Context:** \n{user_context}'
+        system_prompt = self._get_iter_ex_gen_prompt()
+        return self._generate_raw_llm_response(user_message, system_prompt, model_config=model_config, api_key=api_key)
+
+    def _generate_examples(
+        self,
+        user_instruction:str,
+        user_examples:Optional[List[str]|str]=None,
+        user_context: Optional[str] = None,
+        no_examples:Optional[int]=None,
+        model_config: Dict[str, Any] = dict(),
+        api_key: Optional[str] = None
+    ):
+        if no_examples is None:
+            no_examples = 5
+        user_message = f"**User Instruction:** {user_instruction}"
+        if user_examples:
+            if isinstance(user_examples, str):
+                user_examples_str = user_examples
+            elif isinstance(user_examples, list):
+                user_examples_str = "\n".join(user_examples)
+            else:
+                raise ValueError(f'Expected string or list of strings as user_examples got {type(user_examples)}')
+            user_message += f"\n\n**User Examples:** \n{user_examples_str}"
+        if user_context:
+            user_message += f'\n\n**User Context:** \n{user_context}'
+        user_message += f'\n\n**No of Examples:** {no_examples}'
+        init_system_prompt = self._get_init_ex_gen_prompt()
+        return self._generate_raw_llm_response(user_message, init_system_prompt, model_config=model_config, api_key=api_key)
+
+    def _get_valid_examples(self, user_indices_str: str, examples: List[str]):
+        valid_examples = []
+        try:
+            user_indices = user_indices_str.strip().split(',')
+            for index_str in user_indices:
+                try:
+                    index = int(index_str)
+                    if index <= 0 or index > len(examples):
+                        continue
+                except ValueError as e:
+                    continue
+                valid_examples.append(examples[index-1])
+        except Exception as e:
+            print(f'Error: {e}')
+        return valid_examples
+
+    def generate_examples(
+        self,
+        user_instruction: str,
+        user_examples:Optional[List[str] | str] = None,
+        user_context: Optional[str] = None,
+        no_examples: Optional[int] = None,
+        model_config: Optional[Dict[str, Any]] = None,
+        api_key: Optional[str] = None,
+        max_iter: int = 0,
+        **kwargs
+    ):
+        if not model_config:
+            model_config = {}
+        provider = model_config.get("provider")
+        api_base = model_config.get("api_base")
+        api_version = model_config.get("api_version")
+        self._initialize_client(provider, api_key, api_base, api_version, internal_llm_proxy=kwargs.get("internal_llm_proxy", None))
+
+        if no_examples is None:
+            no_examples = 5
+        assert no_examples >= 0, 'The number of examples cannot be less than 0'
+        relevant_examples = []
+        irrelevant_examples = []
+        max_relevant_examples = 5
+        max_irrelevant_examples = 10
+        while len(relevant_examples) <= max_relevant_examples or len(irrelevant_examples) <= max_irrelevant_examples:
+            if max_iter <= 0:
+                break
+            if len(relevant_examples) > max_relevant_examples:
+                relevant_examples = random.sample(relevant_examples, max_relevant_examples)
+            if len(irrelevant_examples) > max_irrelevant_examples:
+                irrelevant_examples = random.sample(irrelevant_examples, max_irrelevant_examples)
+            if relevant_examples or irrelevant_examples:
+                examples_str = self._generate_examples_iter(
+                    user_instruction = user_instruction,
+                    user_examples = user_examples,
+                    relevant_examples = relevant_examples,
+                    irrelevant_examples = irrelevant_examples,
+                    model_config = model_config,
+                    api_key = api_key
+                )
+            else:
+                examples_str = self._generate_examples(
+                    user_instruction = user_instruction,
+                    user_examples = user_examples,
+                    user_context = user_context,
+                    model_config = model_config,
+                    api_key = api_key
+                )
+            examples = [example for example in examples_str.split('\n') if example.strip()]
+            print('Generated Examples:')
+            for i, example in enumerate(examples):
+                print(f'{i+1}. {example}')
+            relevant_indices = input('Enter the indices of relevant examples (comma-separated): ').strip()
+            if relevant_indices:
+                relevant_examples.extend(self._get_valid_examples(relevant_indices, examples))
+            irrelevant_indices = input('Enter the indices of irrelevant examples (comma-separated): ').strip()
+            if irrelevant_indices:
+                irrelevant_examples.extend(self._get_valid_examples(irrelevant_indices, examples))
+            max_iter -= 1
+        if len(relevant_examples) > max_relevant_examples:
+            fin_relevant_examples = random.sample(relevant_examples, max_relevant_examples)
+        else:
+            fin_relevant_examples = relevant_examples
+        if len(irrelevant_examples) > max_irrelevant_examples:
+            fin_irrelevant_examples = random.sample(irrelevant_examples, max_irrelevant_examples)
+        else:
+            fin_irrelevant_examples = irrelevant_examples
+        if relevant_examples or irrelevant_examples:
+            if len(relevant_examples) < no_examples:
+                more_no_examples = no_examples - len(relevant_examples)
+                final_examples_str = self._generate_examples_iter(
+                    user_instruction = user_instruction,
+                    user_examples = user_examples,
+                    user_context = user_context,
+                    relevant_examples = fin_relevant_examples,
+                    irrelevant_examples = fin_irrelevant_examples,
+                    no_examples = more_no_examples,
+                    model_config = model_config,
+                    api_key = api_key
+                )
+                final_examples = [example for example in final_examples_str.split('\n') if example.strip()]
+                final_examples.extend(relevant_examples)
+            else:
+                final_examples = random.sample(relevant_examples, no_examples)
+        else:
+            final_examples_str = self._generate_examples(
+                user_instruction = user_instruction,
+                user_examples = user_examples,
+                user_context = user_context,
+                no_examples = no_examples,
+                model_config = model_config,
+                api_key = api_key
+            )
+            final_examples = [example for example in final_examples_str.split('\n') if example.strip()]
+        return final_examples
+
+
+    def generate_examples_from_csv(
+        self,
+        csv_path: str,
+        dst_csv_path: Optional[str] = None,
+        no_examples: Optional[int] = None,
+        model_config: Optional[Dict[str, Any]] = None,
+        api_key: Optional[str] = None,
+        **kwargs
+    ):
+        if no_examples is None:
+            no_examples = 5
+        assert no_examples >= 0, 'The number of examples cannot be less than 0'
+        df = pd.read_csv(csv_path)
+        assert 'user_instruction' in df.columns, 'The csv must have a column named user_instruction'
+        fin_df_list = []
+        for i, row in df.iterrows():
+            user_instruction = row['user_instruction']
+            user_examples = row.get('user_examples')
+            user_context = row.get('user_context')
+            row_dict = row.to_dict()
+            try:
+                examples = self.generate_examples(
+                    user_instruction = user_instruction,
+                    user_examples = user_examples,
+                    user_context = user_context,
+                    no_examples = no_examples,
+                    model_config = model_config,
+                    api_key = api_key
+                )
+            except Exception as e:
+                continue
+            row_dict['generated_examples'] = examples
+            fin_df_list.append(row_dict)
+        fin_df = pd.DataFrame(fin_df_list)
+        csv_file, csv_ext = os.path.splitext(csv_path)
+        if not dst_csv_path:
+            dst_csv_path = csv_file + '_with_examples' + csv_ext
+        dst_dir = os.path.dirname(dst_csv_path)
+        if dst_dir:
+            os.makedirs(dst_dir, exist_ok=True)
+        fin_df.to_csv(dst_csv_path)
+        logger.info(f'CSV with generated examples saved at {dst_csv_path}')
+
 
 # Usage:
 # from synthetic_data_generation import SyntheticDataGeneration
ragaai_catalyst/tracers/__init__.py:

@@ -1,3 +1,19 @@
 from .tracer import Tracer
+from .distributed import (
+    init_tracing,
+    trace_agent,
+    trace_llm,
+    trace_tool,
+    current_span,
+    trace_custom,
+)
 
-__all__ = [
+__all__ = [
+    "Tracer",
+    "init_tracing",
+    "trace_agent",
+    "trace_llm",
+    "trace_tool",
+    "current_span",
+    "trace_custom"
+]
ragaai_catalyst/tracers/agentic_tracing/data/data_structure.py:

@@ -271,7 +271,7 @@ class ComponentInfo:
     cost: Optional[Dict[str, float]] = None
 
 class Trace:
-    def __init__(self, id: str, trace_name: str, project_name: str, start_time: str, end_time: str, metadata: Optional[Metadata] = None, data: Optional[List[Dict[str, Any]]] = None, replays: Optional[Dict[str, Any]] = None):
+    def __init__(self, id: str, trace_name: str, project_name: str, start_time: str, end_time: str, metadata: Optional[Metadata] = None, data: Optional[List[Dict[str, Any]]] = None, replays: Optional[Dict[str, Any]] = None, metrics: Optional[List[Dict[str, Any]]] = None):
         self.id = id
         self.trace_name = trace_name
         self.project_name = project_name
@@ -280,6 +280,7 @@ class Trace:
         self.metadata = metadata or Metadata()
         self.data = data or []
         self.replays = replays
+        self.metrics = metrics or []
 
     def to_dict(self):
         return {
@@ -288,7 +289,8 @@ class Trace:
             "project_name": self.project_name,
             "start_time": self.start_time,
             "end_time": self.end_time,
-            "metadata": self.metadata
+            "metadata": self.metadata,
             "data": self.data,
             "replays": self.replays,
+            "metrics": self.metrics
         }
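With the new field, metrics ride along in the serialized trace. A small sketch of the extended container, assuming `Metadata()` builds a usable default (its constructor is not shown in this diff) and using a hypothetical metric-dict shape:

```python
trace = Trace(
    id="trace-001",
    trace_name="qa-run",
    project_name="demo",
    start_time="2025-01-01T00:00:00Z",
    end_time="2025-01-01T00:00:05Z",
    metrics=[{"name": "accuracy", "score": 0.9}],  # hypothetical metric shape
)

payload = trace.to_dict()
assert payload["metrics"] == [{"name": "accuracy", "score": 0.9}]
```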