ragaai-catalyst 2.0.5__py3-none-any.whl → 2.0.6b0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ragaai_catalyst/__init__.py +2 -1
- ragaai_catalyst/dataset.py +49 -60
- ragaai_catalyst/evaluation.py +47 -29
- ragaai_catalyst/guardrails_manager.py +233 -0
- ragaai_catalyst/internal_api_completion.py +83 -0
- ragaai_catalyst/proxy_call.py +1 -1
- ragaai_catalyst/synthetic_data_generation.py +201 -78
- ragaai_catalyst/tracers/llamaindex_callback.py +361 -0
- ragaai_catalyst/tracers/tracer.py +62 -28
- {ragaai_catalyst-2.0.5.dist-info → ragaai_catalyst-2.0.6b0.dist-info}/METADATA +139 -72
- {ragaai_catalyst-2.0.5.dist-info → ragaai_catalyst-2.0.6b0.dist-info}/RECORD +13 -10
- {ragaai_catalyst-2.0.5.dist-info → ragaai_catalyst-2.0.6b0.dist-info}/WHEEL +1 -1
- {ragaai_catalyst-2.0.5.dist-info → ragaai_catalyst-2.0.6b0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,361 @@
|
|
1
|
+
from llama_index.core.callbacks import CallbackManager, LlamaDebugHandler
|
2
|
+
from llama_index.core import Settings
|
3
|
+
from typing import List, Dict, Any, Optional
|
4
|
+
from datetime import datetime
|
5
|
+
from enum import Enum
|
6
|
+
import json
|
7
|
+
import uuid
|
8
|
+
import os
|
9
|
+
import requests
|
10
|
+
import tempfile
|
11
|
+
|
12
|
+
from ..ragaai_catalyst import RagaAICatalyst
|
13
|
+
|
14
|
+
class CustomEncoder(json.JSONEncoder):
    """JSON encoder that serializes Enum members by value, ordinary objects
    via their attribute dict, and falls back to ``str`` for everything else."""

    def default(self, obj):
        # Enum members serialize to their underlying value.
        if isinstance(obj, Enum):
            return obj.value
        # Objects with an attribute dict serialize as that dict.
        attrs = getattr(obj, "__dict__", None)
        if attrs is not None:
            return attrs
        # Last resort: the string representation.
        return str(obj)
|
21
|
+
|
22
|
+
|
23
|
+
class LlamaIndexTracer:
    """Capture LlamaIndex query events through a debug callback handler and
    upload each completed query's traces to RagaAI Catalyst.

    Expected ``user_detail`` keys: ``project_name``, ``project_id``,
    ``dataset_name`` and ``trace_user_detail`` (a dict that must itself
    contain a ``metadata`` dict).

    Typical usage::

        tracer = LlamaIndexTracer(user_detail).start()
        ...run LlamaIndex queries (each query auto-uploads on completion)...
        tracer.stop()
    """

    def __init__(self, user_detail):
        self.trace_handler = None
        self.callback_manager = (
            CallbackManager()
        )  # Ensure callback manager is initialized
        self._original_inits = {}  # Store original __init__ methods
        self.project_name = user_detail["project_name"]
        self.project_id = user_detail["project_id"]
        self.dataset_name = user_detail["dataset_name"]
        self.user_detail = user_detail["trace_user_detail"]
        self.base_url = f"{RagaAICatalyst.BASE_URL}"
        self.timeout = 10  # seconds; applied to every HTTP request below
        self.query_count = 0
        self._upload_task = None

    def start(self):
        """Start tracing - call this before your LlamaIndex operations."""
        outer_self = self  # Capture outer self reference for inner class

        class CustomTraceHandler(LlamaDebugHandler):
            """Records every event; flushes a query's events when it ends."""

            def __init__(self):
                super().__init__()
                self.traces: List[Dict[str, Any]] = []
                self.current_query_traces: List[Dict[str, Any]] = []
                self.in_query = False
                self.query_event_id = None

            def on_event_start(
                self,
                event_type: Optional[str],
                payload: Optional[Dict[str, Any]] = None,
                event_id: str = "",
                parent_id: str = "",
                **kwargs: Any
            ) -> None:
                trace = {
                    "event_type": event_type,
                    "timestamp": datetime.now().isoformat(),
                    "payload": payload,
                    "status": "started",
                    "event_id": event_id,
                    "parent_id": parent_id,
                }
                if event_type == "query":
                    # A new query begins: start collecting its events.
                    self.in_query = True
                    self.query_event_id = event_id
                    self.current_query_traces = []

                if self.in_query:
                    self.current_query_traces.append(trace)
                self.traces.append(trace)

            def on_event_end(
                self,
                event_type: Optional[str],
                payload: Optional[Dict[str, Any]] = None,
                event_id: str = "",
                **kwargs: Any
            ) -> None:
                trace = {
                    "event_type": event_type,
                    "timestamp": datetime.now().isoformat(),
                    "payload": payload,
                    "status": "completed",
                    "event_id": event_id,
                }
                if self.in_query:
                    self.current_query_traces.append(trace)
                self.traces.append(trace)

                # If this is the end of a query event, automatically save the traces
                if event_type == "query" and event_id == self.query_event_id:
                    self.in_query = False
                    outer_self._save_current_query_traces(self.current_query_traces)
                    self.current_query_traces = []

        self.trace_handler = CustomTraceHandler()
        self.callback_manager.add_handler(self.trace_handler)
        Settings.callback_manager = self.callback_manager

        # Monkey-patch LlamaIndex components
        self._monkey_patch()
        return self  # Return self to allow method chaining

    def _save_current_query_traces(self, query_traces):
        """Serialize one query's traces to a temp file and upload them."""
        self.query_count += 1
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"trace_query_{self.query_count}_{timestamp}.json"

        traces = self._add_traces_in_data(query_traces)

        # Write the tracer json files to a temporary directory.
        # BUG FIX: the per-query file name computed above was previously never
        # used, so every query overwrote the same literal temp path.
        temp_dir = tempfile.gettempdir()
        temp_file_path = os.path.join(temp_dir, filename)

        with open(temp_file_path, "w") as f:
            json.dump([traces], f, indent=2, cls=CustomEncoder)

        # Upload the traces
        self._create_dataset_schema_with_trace()
        presignedUrl = self._get_presigned_url()
        if presignedUrl is None:
            # _get_presigned_url returns None on any non-200 response; skip the
            # upload instead of issuing a PUT against a None URL.
            print("Could not obtain a presigned URL; skipping trace upload.")
            return
        self._put_presigned_url(presignedUrl, temp_file_path)
        self._insert_traces(presignedUrl)

    def _monkey_patch(self):
        """Monkey-patch LlamaIndex components to automatically include the callback manager"""
        from llama_index.core import VectorStoreIndex, ServiceContext
        from llama_index.llms.openai import OpenAI

        def make_new_init(original_init, callback_manager):
            def new_init(self, *args, **kwargs):
                # If 'callback_manager' is not provided, inject our tracer's callback manager
                if "callback_manager" not in kwargs:
                    kwargs["callback_manager"] = callback_manager
                original_init(self, *args, **kwargs)

            return new_init

        # Monkey-patch VectorStoreIndex
        self._original_inits["VectorStoreIndex"] = VectorStoreIndex.__init__
        VectorStoreIndex.__init__ = make_new_init(
            VectorStoreIndex.__init__, self.callback_manager
        )

        # Monkey-patch OpenAI LLM
        self._original_inits["OpenAI"] = OpenAI.__init__
        OpenAI.__init__ = make_new_init(OpenAI.__init__, self.callback_manager)

        # Monkey-patch ServiceContext
        self._original_inits["ServiceContext"] = ServiceContext.__init__
        ServiceContext.__init__ = make_new_init(
            ServiceContext.__init__, self.callback_manager
        )

        # To monkey-patch additional classes: import the class, store its
        # original __init__ in self._original_inits, then replace it with
        # make_new_init(SomeOtherClass.__init__, self.callback_manager).

    def stop(self):
        """Stop tracing and restore original methods"""
        self.callback_manager.remove_handler(self.trace_handler)
        self._restore_original_inits()
        # FIX: corrected the "uplaoded" typo in the user-facing message.
        print("Traces uploaded")
        self._upload_task = True

    def _restore_original_inits(self):
        """Restore the original __init__ methods of LlamaIndex components"""
        from llama_index.core import VectorStoreIndex, ServiceContext
        from llama_index.llms.openai import OpenAI

        # Restore only the classes that were actually patched.
        if "VectorStoreIndex" in self._original_inits:
            VectorStoreIndex.__init__ = self._original_inits["VectorStoreIndex"]
        if "OpenAI" in self._original_inits:
            OpenAI.__init__ = self._original_inits["OpenAI"]
        if "ServiceContext" in self._original_inits:
            ServiceContext.__init__ = self._original_inits["ServiceContext"]

    def _generate_trace_id(self):
        """
        Generate a random trace ID using UUID4.
        Returns a '0x'-prefixed hex string with no hyphens.
        """
        return '0x' + str(uuid.uuid4()).replace('-', '')

    def _get_user_passed_detail(self):
        """Return ``self.user_detail`` enriched with a fresh trace id and
        standard metadata. NOTE: mutates ``self.user_detail`` in place."""
        user_detail = self.user_detail
        user_detail["trace_id"] = self._generate_trace_id()
        metadata = user_detail["metadata"]
        metadata["log_source"] = "llamaindex_tracer"
        # NOTE(review): datetime.utcnow() is deprecated in Python 3.12 but is
        # kept because the backend appears to expect a naive UTC timestamp
        # with no offset — confirm before migrating to datetime.now(timezone.utc).
        metadata["recorded_on"] = datetime.utcnow().isoformat().replace('T', ' ')
        user_detail["metadata"] = metadata
        return user_detail

    def _add_traces_in_data(self, traces=None):
        """Attach *traces* (or all handler traces) to the user detail dict.

        Raises RuntimeError if no traces were given and start() was never called.
        """
        user_detail = self._get_user_passed_detail()
        if traces is None:
            if not self.trace_handler:
                raise RuntimeError("No traces available. Did you call start()?")
            traces = self.trace_handler.traces
        user_detail["traces"] = traces
        return user_detail

    def _create_dataset_schema_with_trace(self):
        """Create/refresh the trace dataset schema on the backend.

        Returns the final HTTP status code (retries once on 401).
        """
        SCHEMA_MAPPING_NEW = {
            "trace_id": {"columnType": "traceId"},
            "trace_uri": {"columnType": "traceUri"},
            "prompt": {"columnType": "prompt"},
            "response": {"columnType": "response"},
            "context": {"columnType": "context"},
            "llm_model": {"columnType": "pipeline"},
            "recorded_on": {"columnType": "metadata"},
            "embed_model": {"columnType": "pipeline"},
            "log_source": {"columnType": "metadata"},
            "vector_store": {"columnType": "pipeline"},
            "feedback": {"columnType": "feedBack"}
        }

        def make_request():
            headers = {
                "Content-Type": "application/json",
                "Authorization": f"Bearer {os.getenv('RAGAAI_CATALYST_TOKEN')}",
                "X-Project-Name": self.project_name,
            }
            payload = json.dumps({
                "datasetName": self.dataset_name,
                "schemaMapping": SCHEMA_MAPPING_NEW,
                "traceFolderUrl": None,
            })
            response = requests.request(
                "POST",
                f"{self.base_url}/v1/llm/dataset/logs",
                headers=headers,
                data=payload,
                timeout=self.timeout,
            )
            return response

        response = make_request()

        if response.status_code == 401:
            # get_token() # Fetch a new token and set it in the environment
            response = make_request()  # Retry the request
            if response.status_code != 200:
                return response.status_code
        return response.status_code

    def _get_presigned_url(self):
        """Fetch a single presigned upload URL for this dataset.

        Returns the URL string on HTTP 200, otherwise None (callers must check).
        """
        payload = json.dumps({
            "datasetName": self.dataset_name,
            "numFiles": 1,
        })
        headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {os.getenv('RAGAAI_CATALYST_TOKEN')}",
            "X-Project-Name": self.project_name,
        }

        # NOTE(review): GET with a request body is unusual but mirrors what the
        # backend expects here — confirm before changing to query parameters.
        response = requests.request(
            "GET",
            f"{self.base_url}/v1/llm/presigned-url",
            headers=headers,
            data=payload,
            timeout=self.timeout,
        )
        if response.status_code == 200:
            presignedUrls = response.json()["data"]["presignedUrls"][0]
            return presignedUrls

    def _put_presigned_url(self, presignedUrl, filename):
        """Upload the JSON trace file at *filename* to *presignedUrl*.

        Returns (response, status_code) on failure, None on success.
        """
        headers = {
            "Content-Type": "application/json",
        }

        if "blob.core.windows.net" in presignedUrl:  # Azure
            headers["x-ms-blob-type"] = "BlockBlob"
        print("Uploading traces...")
        with open(filename) as f:
            payload = f.read().replace("\n", "").replace("\r", "").encode()

        response = requests.request(
            "PUT",
            presignedUrl,
            headers=headers,
            data=payload,
            timeout=self.timeout,
        )
        # BUG FIX: the original condition used "or" ("!= 200 or != 201"),
        # which is always true, so even successful uploads hit this branch.
        if response.status_code not in (200, 201):
            return response, response.status_code

    def _insert_traces(self, presignedUrl):
        """Register the uploaded trace file with the backend."""
        headers = {
            "Authorization": f"Bearer {os.getenv('RAGAAI_CATALYST_TOKEN')}",
            "Content-Type": "application/json",
            "X-Project-Name": self.project_name,
        }
        payload = json.dumps({
            "datasetName": self.dataset_name,
            "presignedUrl": presignedUrl,
        })
        # NOTE(review): the response is intentionally not checked; a failed
        # insert is currently silent — consider surfacing the status code.
        response = requests.request(
            "POST",
            f"{self.base_url}/v1/llm/insert/trace",
            headers=headers,
            data=payload,
            timeout=self.timeout,
        )

    def _upload_traces(self, save_json_to_pwd=None):
        """Upload all collected traces, optionally saving a local JSON copy.

        NOTE(review): when save_json_to_pwd is falsy the local file is never
        written, yet _put_presigned_url tries to read it — this pre-existing
        path only works with save_json_to_pwd=True (as used from stop()).
        """
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"trace_{timestamp}.json"

        traces = self._add_traces_in_data()

        if save_json_to_pwd:
            with open(filename, "w") as f:
                json.dump([traces], f, indent=2, cls=CustomEncoder)
            # FIX: report the actual file name that was written.
            print(f"tracer is saved to {filename}")

        self._create_dataset_schema_with_trace()
        presignedUrl = self._get_presigned_url()
        if presignedUrl is None:
            # Mirror _save_current_query_traces: do not PUT to a None URL.
            print("Could not obtain a presigned URL; skipping trace upload.")
            return
        self._put_presigned_url(presignedUrl, filename)
        self._insert_traces(presignedUrl)
        # FIX: corrected the "uplaoded" typo in the user-facing message.
        print("Traces uploaded")

    def get_upload_status(self):
        """Check the status of the trace upload."""
        if self._upload_task is None:
            return "No upload task in progress."
        if self._upload_task:
            return "Upload completed"
|
@@ -17,7 +17,7 @@ from .instrumentators import (
|
|
17
17
|
LlamaIndexInstrumentor,
|
18
18
|
)
|
19
19
|
from .utils import get_unique_key
|
20
|
-
|
20
|
+
# from .llamaindex_callback import LlamaIndexTracer
|
21
21
|
from ..ragaai_catalyst import RagaAICatalyst
|
22
22
|
|
23
23
|
logger = logging.getLogger(__name__)
|
@@ -86,13 +86,19 @@ class Tracer:
|
|
86
86
|
logger.error(f"Failed to retrieve projects list: {e}")
|
87
87
|
raise
|
88
88
|
|
89
|
+
if tracer_type == "langchain":
|
90
|
+
self.raga_client = RagaExporter(project_name=self.project_name, dataset_name=self.dataset_name)
|
89
91
|
|
90
|
-
|
92
|
+
self._tracer_provider = self._setup_provider()
|
93
|
+
self._instrumentor = self._setup_instrumentor(tracer_type)
|
94
|
+
self.is_instrumented = False
|
95
|
+
self._upload_task = None
|
96
|
+
elif tracer_type == "llamaindex":
|
97
|
+
self._upload_task = None
|
98
|
+
from .llamaindex_callback import LlamaIndexTracer
|
91
99
|
|
92
|
-
|
93
|
-
|
94
|
-
self.is_instrumented = False
|
95
|
-
self._upload_task = None
|
100
|
+
else:
|
101
|
+
raise ValueError (f"Currently supported tracer types are 'langchain' and 'llamaindex'.")
|
96
102
|
|
97
103
|
def _improve_metadata(self, metadata, tracer_type):
|
98
104
|
if metadata is None:
|
@@ -142,34 +148,44 @@ class Tracer:
|
|
142
148
|
|
143
149
|
def start(self):
|
144
150
|
"""Start the tracer."""
|
145
|
-
if
|
146
|
-
self.
|
147
|
-
|
148
|
-
|
149
|
-
|
151
|
+
if self.tracer_type == "langchain":
|
152
|
+
if not self.is_instrumented:
|
153
|
+
self._instrumentor().instrument(tracer_provider=self._tracer_provider)
|
154
|
+
self.is_instrumented = True
|
155
|
+
print(f"Tracer started for project: {self.project_name}")
|
156
|
+
return self
|
157
|
+
elif self.tracer_type == "llamaindex":
|
158
|
+
from .llamaindex_callback import LlamaIndexTracer
|
159
|
+
return LlamaIndexTracer(self._pass_user_data()).start()
|
160
|
+
|
150
161
|
|
151
162
|
def stop(self):
|
152
163
|
"""Stop the tracer and initiate trace upload."""
|
153
|
-
if
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
164
|
+
if self.tracer_type == "langchain":
|
165
|
+
if not self.is_instrumented:
|
166
|
+
logger.warning("Tracer was not started. No traces to upload.")
|
167
|
+
return "No traces to upload"
|
168
|
+
|
169
|
+
print("Stopping tracer and initiating trace upload...")
|
170
|
+
self._cleanup()
|
171
|
+
self._upload_task = self._run_async(self._upload_traces())
|
172
|
+
return "Trace upload initiated. Use get_upload_status() to check the status."
|
173
|
+
elif self.tracer_type == "llamaindex":
|
174
|
+
from .llamaindex_callback import LlamaIndexTracer
|
175
|
+
return LlamaIndexTracer().stop()
|
161
176
|
|
162
177
|
def get_upload_status(self):
|
163
178
|
"""Check the status of the trace upload."""
|
164
|
-
if self.
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
179
|
+
if self.tracer_type == "langchain":
|
180
|
+
if self._upload_task is None:
|
181
|
+
return "No upload task in progress."
|
182
|
+
if self._upload_task.done():
|
183
|
+
try:
|
184
|
+
result = self._upload_task.result()
|
185
|
+
return f"Upload completed: {result}"
|
186
|
+
except Exception as e:
|
187
|
+
return f"Upload failed: {str(e)}"
|
188
|
+
return "Upload in progress..."
|
173
189
|
|
174
190
|
def _run_async(self, coroutine):
|
175
191
|
"""Run an asynchronous coroutine in a separate thread."""
|
@@ -246,3 +262,21 @@ class Tracer:
|
|
246
262
|
# Reset instrumentation flag
|
247
263
|
self.is_instrumented = False
|
248
264
|
# Note: We're not resetting all attributes here to allow for upload status checking
|
265
|
+
def _pass_user_data(self):
|
266
|
+
return {"project_name":self.project_name,
|
267
|
+
"project_id": self.project_id,
|
268
|
+
"dataset_name":self.dataset_name,
|
269
|
+
"trace_user_detail" : {
|
270
|
+
"project_id": self.project_id,
|
271
|
+
"trace_id": "",
|
272
|
+
"session_id": None,
|
273
|
+
"trace_type": self.tracer_type,
|
274
|
+
"traces": [],
|
275
|
+
"metadata": self.metadata,
|
276
|
+
"pipeline": {
|
277
|
+
"llm_model": self.pipeline["llm_model"],
|
278
|
+
"vector_store": self.pipeline["vector_store"],
|
279
|
+
"embed_model": self.pipeline["embed_model"]
|
280
|
+
}
|
281
|
+
}
|
282
|
+
}
|