llumo 0.2.24__py3-none-any.whl → 0.2.26__py3-none-any.whl
This diff compares publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
- llumo/__init__.py +6 -3
- llumo/callback.py +480 -0
- llumo/callbacks-0.py +258 -0
- llumo/client.py +383 -204
- llumo/llumoLogger.py +57 -0
- llumo/llumoSessionContext.py +366 -0
- llumo/openai.py +196 -50
- {llumo-0.2.24.dist-info → llumo-0.2.26.dist-info}/METADATA +1 -1
- llumo-0.2.26.dist-info/RECORD +20 -0
- llumo-0.2.24.dist-info/RECORD +0 -16
- {llumo-0.2.24.dist-info → llumo-0.2.26.dist-info}/WHEEL +0 -0
- {llumo-0.2.24.dist-info → llumo-0.2.26.dist-info}/licenses/LICENSE +0 -0
- {llumo-0.2.24.dist-info → llumo-0.2.26.dist-info}/top_level.txt +0 -0
llumo/llumoLogger.py
ADDED
@@ -0,0 +1,57 @@
+import requests
+
+
+class LlumoLogger:
+    def __init__(self, apiKey: str, playground: str):
+        self.apiKey = apiKey
+        self.playground = playground
+        self.workspaceID = None
+        self.playgroundID = None
+        self.userEmailID = None
+        self._authenticate()
+
+    def _authenticate(self):
+        url = "https://app.llumo.ai/api/get-playground-name"
+        try:
+            response = requests.post(
+                url,
+                headers={
+                    "Authorization": f"Bearer {self.apiKey}",
+                    "Content-Type": "application/json",
+                },
+                json={"playgroundName": self.playground},
+                timeout=10,
+            )
+
+            response.raise_for_status()
+            res_json = response.json()
+
+            # Navigate into the nested "data" structure
+            inner_data = res_json.get("data", {}).get("data", {})
+
+            self.workspaceID = inner_data.get("workspaceID")
+            self.playgroundID = inner_data.get("playgroundID")
+            self.userEmailID = inner_data.get("createdBy")
+
+            if not self.workspaceID or not self.playgroundID:
+                raise RuntimeError(
+                    f"Invalid response: workspaceID or playgroundID missing. Full response: {res_json}"
+                )
+
+        except requests.exceptions.RequestException as req_err:
+            raise RuntimeError(
+                f"Network or HTTP error during authentication: {req_err}"
+            )
+        except ValueError as json_err:
+            raise RuntimeError(f"Invalid JSON in authentication response: {json_err}")
+        except Exception as e:
+            raise RuntimeError(f"Authentication failed: {e}")
+
+    def getWorkspaceID(self):
+        return self.workspaceID
+
+    def getUserEmailID(self):
+        return self.userEmailID
+
+    def getPlaygroundID(self):
+        return self.playgroundID
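For orientation, a minimal usage sketch of the new logger, assuming a valid LLUMO API key and an existing playground name (both are placeholders below); it exercises only the constructor and the getters added in this file.

from llumo.llumoLogger import LlumoLogger

# Hypothetical credentials; replace with a real API key and playground name.
logger = LlumoLogger(apiKey="llumo-api-key", playground="My Playground")

# _authenticate() runs inside the constructor and raises RuntimeError on failure,
# so reaching this point means the IDs below are populated.
print(logger.getWorkspaceID())
print(logger.getPlaygroundID())
print(logger.getUserEmailID())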
llumo/llumoSessionContext.py
ADDED
@@ -0,0 +1,366 @@
+import contextvars
+import uuid
+from typing import Optional, List, Dict, Any
+from datetime import datetime, timezone
+import requests
+from .client import LlumoClient
+
+_ctxLogger = contextvars.ContextVar("ctxLogger")
+_ctxSessionID = contextvars.ContextVar("ctxSessionID")
+_ctxLlumoRun = contextvars.ContextVar("ctxLlumoRun")
+
+
+def getLogger():
+    return _ctxLogger.get()
+
+
+def getSessionID():
+    return _ctxSessionID.get()
+
+
+def getLlumoRun():
+    return _ctxLlumoRun.get()
+
+
+class LlumoSessionContext(LlumoClient):
+    def __init__(self, logger, sessionID: Optional[str] = None):
+        super().__init__(api_key=logger.apiKey, playground_id=logger.getPlaygroundID())
+        self.sessionID = sessionID or str(uuid.uuid4().hex[:14])
+        self.logger = logger
+        self.apiKey = logger.apiKey
+        self.threadLogger = None
+        self.threadSessionID = None
+        self.threadLlumoRun = None
+
+    def start(self):
+        self.threadLogger = _ctxLogger.set(self.logger)
+        self.threadSessionID = _ctxSessionID.set(self.sessionID)
+
+    def end(self):
+        if self.threadLogger:
+            _ctxLogger.reset(self.threadLogger)
+        if self.threadSessionID:
+            _ctxSessionID.reset(self.threadSessionID)
+        if self.threadLlumoRun:
+            _ctxLlumoRun.reset(self.threadLlumoRun)
+
+    def __enter__(self):
+        self.start()
+        return self
+
+    def __exit__(self, excType, excVal, excTb):
+        self.end()
+
+    def startLlumoRun(self, runName: str):
+        LlumoRunID = str(uuid.uuid4().hex[:16])
+        currentTime = datetime(2025, 8, 2, 10, 20, 15, tzinfo=timezone.utc)
+        createdAt = currentTime.strftime("%Y-%m-%dT%H:%M:%S.000Z")
+        llumoRun = {
+            "logID": LlumoRunID,
+            "runName": runName,
+            "sessionID": self.sessionID,
+            "playgroundID": self.logger.getPlaygroundID(),
+            "workspaceID": self.logger.getWorkspaceID(),
+            "source": "SDK",
+            "rowID": "",
+            "columnID": "",
+            "email": self.logger.getUserEmailID(),
+            "createdAt": createdAt,
+            "createdBy": self.logger.getUserEmailID(),
+            "status": "SUCCESS",
+            "flow": [],
+            "latency": 4200,
+            "feedback": "",
+            "dump": "",
+            "steps": [],
+        }
+        self.threadLlumoRun = _ctxLlumoRun.set(llumoRun)
+
+    def endLlumoRun(self):
+        run = getLlumoRun()
+        if run is None:
+            return
+
+        # STEP 1: Sort steps by timestamp
+        steps = run.get("steps", [])
+        sorted_steps = sorted(steps, key=lambda s: s.get("timestamp", 0))
+
+        # STEP 2: Remove timestamp from each step before sending
+        clean_steps = [
+            {k: v for k, v in step.items() if k != "timestamp"} for step in sorted_steps
+        ]
+        run["steps"] = clean_steps
+
+        print(run["runName"])  # optional debug log
+
+        # STEP 3: Send the payload
+        url = "https://app.llumo.ai/api/create-debug-log"
+        headers = {
+            "Authorization": f"Bearer {self.apiKey}",
+            "Content-Type": "application/json",
+        }
+
+        try:
+            response = requests.post(url, headers=headers, json=run, timeout=10)
+            response.raise_for_status()
+            # print(response.json())
+        except requests.exceptions.Timeout:
+            print("Request timed out.")
+        except requests.exceptions.RequestException as e:
+            print(f"Request failed: {e}")
+
+        # Cleanup
+        if self.threadLlumoRun:
+            _ctxLlumoRun.reset(self.threadLlumoRun)
+            self.threadLlumoRun = None
+
+    def logStep(
+        self,
+        stepType: str,
+        stepName: str,
+        metadata: Optional[dict] = None,
+    ):
+        print(f"logged: {stepType}")
+        run = getLlumoRun()
+        if run is None:
+            raise RuntimeError("No active run to log steps.")
+
+        # add step
+        stepData = {
+            "stepType": stepType,
+            "stepName": stepName,
+            "status": metadata.get("status", "SUCCESS"),
+            "message": metadata.get("message", ""),
+            "metadata": metadata or {},
+            "timestamp": datetime.now(timezone.utc).timestamp(),  # OPTIONAL
+        }
+        run["steps"].append(stepData)
+        # set to context vars again in llumo run
+        self.threadLlumoRun = _ctxLlumoRun.set(run)
+
+    def logLlmStep(
+        self,
+        stepName: str,
+        model: str,
+        provider: str,
+        inputTokens: int,
+        outputTokens: int,
+        temperature: float,
+        promptTruncated: bool,
+        latencyMs: int,
+        query: str,
+        output: str,
+        status: str,
+        message: str,
+    ):
+        metadata = {
+            "model": model,
+            "provider": provider,
+            "inputTokens": inputTokens,
+            "outputTokens": outputTokens,
+            "temperature": temperature,
+            "promptTruncated": promptTruncated,
+            "latencyMs": latencyMs,
+            "query": query,
+            "output": output,
+            "status": status,
+            "message": message,
+        }
+        self.logStep("LLM", stepName, metadata)
+
+    def logRetrieverStep(
+        self,
+        stepName: str,
+        retrieverSource: str,
+        queryVectorType: str,
+        topK: int,
+        matchedIDs: List[str],
+        query: str,
+        latencyMs: int,
+        status: str,
+        message: str,
+    ):
+        metadata = {
+            "retrieverSource": retrieverSource,
+            "queryVectorType": queryVectorType,
+            "topK": topK,
+            "matchedIDs": matchedIDs,
+            "query": query,
+            "latencyMs": latencyMs,
+            "status": status,
+            "message": message,
+        }
+        self.logStep("RETRIEVER", stepName, metadata)
+
+    def logAgentStep(
+        self,
+        stepName: str,
+        agentType: str,
+        agentName: str,
+        numStepsTaken: int,
+        tools: List[str],
+        query: str,
+        status: str,
+        message: str,
+    ):
+        metadata = {
+            "agentType": agentType,
+            "agentName": agentName,
+            "numStepsTaken": numStepsTaken,
+            "tools": tools,
+            "query": query,
+            "status": status,
+            "message": message,
+        }
+        self.logStep("AGENT", stepName, metadata)
+
+    def logToolSelectorStep(
+        self,
+        stepName: str,
+        selectorType: str,
+        toolsRanked: List[Dict[str, Any]],
+        selectedTool: str,
+        reasoning: str,
+        status: str,
+        message: str,
+    ):
+        metadata = {
+            "selectorType": selectorType,
+            "toolsRanked": toolsRanked,
+            "selectedTool": selectedTool,
+            "reasoning": reasoning,
+            "status": status,
+            "message": message,
+        }
+        self.logStep("TOOL_SELECTOR", stepName, metadata)
+
+    def logToolStep(
+        self,
+        stepName: str,
+        toolName: str,
+        input: Dict[str, Any],
+        output: str,
+        latencyMs: int,
+        status: str,
+        message: str,
+    ):
+        metadata = {
+            "toolName": toolName,
+            "input": input,
+            "output": output,
+            "latencyMs": latencyMs,
+            "status": status,
+            "message": message,
+        }
+        self.logStep("TOOL", stepName, metadata)
+
+    def logEvalStep(
+        self,
+        stepName: str,
+        output: str,
+        context: str,
+        query: str,
+        # total 7 keys add 4 more
+        messageHistory: str,
+        tools: str,
+        intermediateSteps: str,
+        groundTruth: str,
+        analyticsScore: Dict[str, float],
+        reasoning: Dict[str, str],
+        classification: Dict[str, str],
+        evalLabel: Dict[str, str],
+        latencyMs: int,
+        status: str,
+        message: str,
+    ):
+        metadata = {
+            "output": output,
+            "context": context,
+            "query": query,
+            "messageHistory": messageHistory,
+            "tools": tools,
+            "intermediateSteps": intermediateSteps,
+            "groundTruth": groundTruth,
+            "analyticsScore": analyticsScore,
+            "reasoning": reasoning,
+            "classification": classification,
+            "evalLabel": evalLabel,
+            "latencyMs": latencyMs,
+            "status": status,
+            "message": message,
+        }
+        self.logStep("EVAL", stepName, metadata)
+
+    def logFunctionCallStep(
+        self,
+        stepName: str,
+        functionName: str,
+        argsPassed: Dict[str, Any],
+        output: Dict[str, Any],
+        callMode: str,
+        latencyMs: int,
+        status: str,
+        message: str,
+    ):
+        metadata = {
+            "functionName": functionName,
+            "argsPassed": argsPassed,
+            "output": output,
+            "callMode": callMode,
+            "latencyMs": latencyMs,
+            "status": status,
+            "message": message,
+        }
+        self.logStep("FUNCTION_CALL", stepName, metadata)
+
+    def logCompressionStep(
+        self,
+        stepName: str,
+        prompt: str,
+        promptTemplate: str,
+        inputs: Dict[str, Any],
+        compressedPrompt: str,
+        inputToken: int,
+        compressedToken: int,
+        outputToken: int,
+        output: str,
+        compressedOutput: str,
+        latencyMs: int,
+        status: str,
+        message: str,
+    ):
+        metadata = {
+            "prompt": prompt,
+            "promptTemplate": promptTemplate,
+            "inputs": inputs,
+            "compressedPrompt": compressedPrompt,
+            "inputToken": inputToken,
+            "compressedToken": compressedToken,
+            "outputToken": outputToken,
+            "output": output,
+            "compressedOutput": compressedOutput,
+            "latencyMs": latencyMs,
+            "status": status,
+            "message": message,
+        }
+        self.logStep("COMPRESSION", stepName, metadata)
+
+    def logCustomScriptStep(
+        self,
+        stepName: str,
+        inputs: Dict[str, Any],
+        script: str,
+        output: str,
+        latencyMs: int,
+        status: str,
+        message: str,
+    ):
+        metadata = {
+            "inputs": inputs,
+            "script": script,
+            "output": output,
+            "latencyMs": latencyMs,
+            "status": status,
+            "message": message,
+        }
+        self.logStep("CUSTOM_SCRIPT", stepName, metadata)
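A short, hedged sketch of how the new session context appears to be driven, based only on the methods added above; the key, playground, model name, and numeric values are placeholders, and LlumoClient's constructor keywords are taken from the super().__init__ call in the diff.

from llumo.llumoLogger import LlumoLogger
from llumo.llumoSessionContext import LlumoSessionContext

logger = LlumoLogger(apiKey="llumo-api-key", playground="My Playground")

# The class is a context manager: start()/end() push and pop the context vars.
with LlumoSessionContext(logger) as session:
    session.startLlumoRun("demo-run")
    # Each log*Step helper builds a metadata dict and delegates to logStep().
    session.logLlmStep(
        stepName="LLM-demo",
        model="gpt-4o-mini",
        provider="openai",
        inputTokens=42,
        outputTokens=128,
        temperature=0.0,
        promptTruncated=False,
        latencyMs=900,
        query="What is LLUMO?",
        output="...",
        status="SUCCESS",
        message="",
    )
    session.endLlumoRun()  # sorts steps, strips timestamps, POSTs the run payload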
llumo/openai.py
CHANGED
@@ -1,11 +1,27 @@
+import time
 from openai import OpenAI as OpenAIClient
 from .client import LlumoClient
+from .llumoSessionContext import LlumoSessionContext
+from .llumoLogger import LLUMOLogger
+
+
+# evaluation function that uses LlumoClient
+def performEvaluation(data, api_key=None, evals=["Response Correctness"], **kwargs):
+    try:
+        client = LlumoClient(api_key=api_key)
+        results = client.evaluateMultiple(
+            data,
+            evals=evals,
+            createExperiment=kwargs.get("createExperiment", False),
+            playgroundID=kwargs.get("playgroundID"),
+            prompt_template="Give answer to the query: {{query}}, using context: {{context}}",
+            getDataFrame=False,
+        )
+        return results
+    except Exception as e:
+        print(f"Error in perform_evaluation: {e}")
+        raise
 
-# Dummy evaluation function that uses LlumoClient
-def evaluate_multiple(data, api_key=None,evals=["Response Correctness"]):
-    client = LlumoClient(api_key=api_key)
-    results= client.evaluateMultiple(data, evals=evals,createExperiment=False,prompt_template="Give answer to the query: {{query}}, using context: {{context}}",getDataFrame=False)
-    return results
 
 # Wrapper around ChatCompletion to allow custom fields like `.evaluation`
 class ChatCompletionWithEval:
@@ -22,57 +38,187 @@ class ChatCompletionWithEval:
     def __repr__(self):
         return repr(self._response)
 
-
-
+
+class OpenAI(OpenAIClient):
+    def __init__(self, api_key: str, session):
         super().__init__(api_key=api_key)
+        self.session = session
+        self.llumo_key = session.apiKey
 
         original_create = self.chat.completions.create
 
-
-
-
-
-
-
-
-
-
-
-""
+        def create_wrapper(*args, **kwargs):
+            context = kwargs.pop("context", None)
+            model = kwargs["model"]
+            create_experiment = kwargs.pop("createExperiment", False)
+
+            messages = kwargs.get("messages", [])
+            user_message = next(
+                (
+                    m.get("content")
+                    for m in reversed(messages)
+                    if m.get("role") == "user"
+                ),
+                "",
+            )
+
+            if not context or context.strip() == "":
+                context = user_message
+
+            # Get IDs from the session logger
+            playground_id = self.session.logger.getPlaygroundID()
+            workspace_id = self.session.logger.getWorkspaceID()
+
+            # Input Bias Evaluation
+            eval_input_bias = [
+                {
+                    "query": user_message,
+                    "context": context,
+                    "output": "",  # No output yet
+                }
+            ]
+            try:
+                start_time = time.time()
+                bias_evaluation_result = performEvaluation(
+                    eval_input_bias,
+                    api_key=self.llumo_key,
+                    evals=["Input Bias"],
+                    playgroundID=playground_id,
+                    workspaceID=workspace_id,
+                    createExperiment=create_experiment,
+                )
+                latency = int((time.time() - start_time) * 1000)
+                # Access the first result object
+                bias_evaluation = bias_evaluation_result[0]
+                # message = "-".join(
+                #     getattr(bias_evaluation, "edgeCases", {}).get("value", [])
+                # )
+                # self.session.logEvalStep(
+                #     stepName=f"EVAL-Input Bias",
+                #     output="",
+                #     context=context,
+                #     query=user_message,
+                #     messageHistory="",
+                #     tools="",
+                #     intermediateSteps="",
+                #     groundTruth="",
+                #     analyticsScore=getattr(bias_evaluation, "analyticsScore", {}),
+                #     reasoning=getattr(bias_evaluation, "reasoning", {}),
+                #     classification=getattr(bias_evaluation, "classification", {}),
+                #     evalLabel=getattr(bias_evaluation, "evalLabel", {}),
+                #     latencyMs=latency,
+                #     status="SUCCESS",
+                #     message=message,
+                # )
+            except Exception as e:
+                print(f"Input Bias evaluation failed: {e}")
+                self.session.logEvalStep(
+                    stepName=f"EVAL-FAILURE",
+                    output="",
+                    context=context,
+                    query=user_message,
+                    messageHistory="",
+                    tools="",
+                    intermediateSteps="",
+                    groundTruth="",
+                    analyticsScore={},
+                    reasoning={},
+                    classification={},
+                    evalLabel={},
+                    latencyMs=0,
+                    status="FAILURE",
+                    message="EVAL_ERROR",
                 )
 
-
-
-
-
-
-
-
-
-
-
-
-
+            start_time = time.time()
+            response = original_create(*args, **kwargs)
+            latency = int((time.time() - start_time) * 1000)
+            output_text = response.choices[0].message.content
+
+            self.session.logLlmStep(
+                stepName=f"LLM-{user_message[:30]}",
+                model=model,
+                provider="openai",
+                inputTokens=response.usage.prompt_tokens,
+                outputTokens=response.usage.completion_tokens,
+                temperature=kwargs.get("temperature", 0.0),
+                promptTruncated=False,
+                latencyMs=latency,
+                query=user_message,
+                output=output_text,
+                status="SUCCESS",
+                message="",
+            )
+
+            # Response Correctness Evaluation
+            eval_input_correctness = [
+                {
                     "query": user_message,
                     "context": context,
                     "output": output_text,
-}
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-#
-
-
+                }
+            ]
+            try:
+                start_time = time.time()
+                correctness_evaluation_result = performEvaluation(
+                    eval_input_correctness,
+                    api_key=self.llumo_key,
+                    evals=["Response Correctness"],
+                    playgroundID=playground_id,
+                    workspaceID=workspace_id,
+                    createExperiment=create_experiment,
+                )
+                latency = int((time.time() - start_time) * 1000)
+                # Access the first result object
+                correctness_evaluation = correctness_evaluation_result[0]
+                # message = "-".join(
+                #     getattr(correctness_evaluation, "edgeCases", {}).get("value", [])
+                # )
+                # self.session.logEvalStep(
+                #     stepName=f"EVAL-Response Correctness",
+                #     output=output_text,
+                #     context=context,
+                #     query=user_message,
+                #     messageHistory="",
+                #     tools="",
+                #     intermediateSteps="",
+                #     groundTruth="",
+                #     analyticsScore=getattr(
+                #         correctness_evaluation, "analyticsScore", {}
+                #     ),
+                #     reasoning=getattr(correctness_evaluation, "reasoning", {}),
+                #     classification=getattr(
+                #         correctness_evaluation, "classification", {}
+                #     ),
+                #     evalLabel=getattr(correctness_evaluation, "evalLabel", {}),
+                #     latencyMs=latency,
+                #     status="SUCCESS",
+                #     message=message,
+                # )
+            except Exception as e:
+                print(f"Response Correctness evaluation failed: {e}")
+                correctness_evaluation = None
+                self.session.logEvalStep(
+                    stepName=f"EVAL-FAILURE",
+                    output=output_text,
+                    context=context,
+                    query=user_message,
+                    messageHistory="",
+                    tools="",
+                    intermediateSteps="",
+                    groundTruth="",
+                    analyticsScore={},
+                    reasoning={},
+                    classification={},
+                    evalLabel={},
+                    latencyMs=0,
+                    status="FAILURE",
+                    message="EVAL_ERROR",
+                )
+
+            if correctness_evaluation is None:
+                return response
+
+            return ChatCompletionWithEval(response, correctness_evaluation)
+
+        self.chat.completions.create = create_wrapper
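A hedged usage sketch of the patched client, inferred from the wrapper above; note that the diff imports LLUMOLogger from .llumoLogger while that module defines LlumoLogger, and that the `.evaluation` attribute is only implied by the wrapper's comment (the accessor code falls outside this hunk), so treat both the import and that access as assumptions. Keys and the model name are placeholders.

from llumo.llumoLogger import LlumoLogger
from llumo.llumoSessionContext import LlumoSessionContext
from llumo.openai import OpenAI

logger = LlumoLogger(apiKey="llumo-api-key", playground="My Playground")

with LlumoSessionContext(logger) as session:
    session.startLlumoRun("openai-demo")
    client = OpenAI(api_key="openai-api-key", session=session)
    # create_wrapper pops the extra `context` / `createExperiment` kwargs, logs an
    # LLM step, and runs Input Bias / Response Correctness evaluations around the call.
    completion = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": "What is LLUMO?"}],
        context="LLUMO is an LLM evaluation platform.",
    )
    print(completion.choices[0].message.content)
    # Assumed custom field exposed by the ChatCompletionWithEval wrapper.
    print(getattr(completion, "evaluation", None))
    session.endLlumoRun()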