datarobot-moderations 11.1.12 (datarobot_moderations-11.1.12-py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datarobot_dome/__init__.py +11 -0
- datarobot_dome/async_http_client.py +248 -0
- datarobot_dome/chat_helper.py +227 -0
- datarobot_dome/constants.py +318 -0
- datarobot_dome/drum_integration.py +977 -0
- datarobot_dome/guard.py +736 -0
- datarobot_dome/guard_executor.py +755 -0
- datarobot_dome/guard_helpers.py +457 -0
- datarobot_dome/guards/__init__.py +11 -0
- datarobot_dome/guards/guard_llm_mixin.py +232 -0
- datarobot_dome/llm.py +148 -0
- datarobot_dome/metrics/__init__.py +11 -0
- datarobot_dome/metrics/citation_metrics.py +98 -0
- datarobot_dome/metrics/factory.py +52 -0
- datarobot_dome/metrics/metric_scorer.py +78 -0
- datarobot_dome/pipeline/__init__.py +11 -0
- datarobot_dome/pipeline/llm_pipeline.py +474 -0
- datarobot_dome/pipeline/pipeline.py +376 -0
- datarobot_dome/pipeline/vdb_pipeline.py +127 -0
- datarobot_dome/streaming.py +395 -0
- datarobot_moderations-11.1.12.dist-info/METADATA +113 -0
- datarobot_moderations-11.1.12.dist-info/RECORD +23 -0
- datarobot_moderations-11.1.12.dist-info/WHEEL +4 -0
datarobot_dome/guard_helpers.py
@@ -0,0 +1,457 @@
+# ---------------------------------------------------------------------------------
+# Copyright (c) 2025 DataRobot, Inc. and its affiliates. All rights reserved.
+# Last updated 2025.
+#
+# DataRobot, Inc. Confidential.
+# This is proprietary source code of DataRobot, Inc. and its affiliates.
+#
+# This file and its contents are subject to DataRobot Tool and Utility Agreement.
+# For details, see
+# https://www.datarobot.com/wp-content/uploads/2021/07/DataRobot-Tool-and-Utility-Agreement.pdf.
+# ---------------------------------------------------------------------------------
+import json
+import logging
+import os
+
+import pandas as pd
+import requests
+import tiktoken
+from deepeval.metrics import TaskCompletionMetric
+from deepeval.models import DeepEvalBaseLLM
+from deepeval.test_case import LLMTestCase
+from langchain_nvidia_ai_endpoints import ChatNVIDIA
+from langchain_nvidia_ai_endpoints import Model
+from langchain_nvidia_ai_endpoints import register_model
+from langchain_nvidia_ai_endpoints._statics import determine_model
+from langchain_openai import AzureChatOpenAI
+from langchain_openai import ChatOpenAI
+from llama_index.core.evaluation import FaithfulnessEvaluator
+from ragas import MultiTurnSample
+from ragas.messages import AIMessage
+from ragas.messages import HumanMessage
+from ragas.messages import ToolMessage
+from ragas.metrics import AgentGoalAccuracyWithoutReference
+from rouge_score import rouge_scorer
+
+from datarobot_dome.constants import AWS_MODEL_TO_AWS_MODEL_VERSION_MAP
+from datarobot_dome.constants import GOOGLE_MODEL_TO_GOOGLE_MODEL_VERSION_MAP
+from datarobot_dome.constants import LOGGER_NAME_PREFIX
+from datarobot_dome.constants import PROMPT_TOKEN_COUNT_COLUMN_NAME_FROM_USAGE
+from datarobot_dome.constants import RESPONSE_TOKEN_COUNT_COLUMN_NAME_FROM_USAGE
+from datarobot_dome.constants import AwsModel
+from datarobot_dome.constants import GoogleModel
+from datarobot_dome.constants import GuardLLMType
+from datarobot_dome.llm import DataRobotLLM
+
+# Ideally, we want to return confidence score between 0.0 and 100.0,
+# but for ROUGE-1 guard, UI allows the user to configure value between
+# 0 and 1, so making scaling factor 1.
+SCALING_FACTOR = 1
+DEFAULT_OPEN_AI_API_VERSION = "2023-03-15-preview"
+
+_logger = logging.getLogger(LOGGER_NAME_PREFIX + ".guard_helpers")
+
+
+def get_token_count(input: str, encoding: str = "cl100k_base") -> int:
+    """Get the token count for the input."""
+    if input is None:
+        return 0
+    encoding = tiktoken.get_encoding(encoding)
+    return len(encoding.encode(str(input), disallowed_special=()))
+
+
+def calculate_token_counts_for_cost_calculations(prompt_column_name, response_column_name, df):
+    # For either interface, prompt is part of the predictions_df, so prompt_column_name
+    # should be present in the df
+    df[PROMPT_TOKEN_COUNT_COLUMN_NAME_FROM_USAGE] = df[prompt_column_name].apply(
+        lambda x: get_token_count(x)
+    )
+    df[RESPONSE_TOKEN_COUNT_COLUMN_NAME_FROM_USAGE] = df[response_column_name].apply(
+        lambda x: get_token_count(x)
+    )
+    return df
+
+
+def get_citation_columns(columns: pd.Index) -> list:
+    """
+    Ensure that citation columns are returned in the order 0, 1, 2, etc
+    Order matters
+    """
+    index = 0
+    citation_columns = []
+    while True:
+        column_name = f"CITATION_CONTENT_{index}"
+        if column_name in columns:
+            citation_columns.append(column_name)
+            index += 1
+        else:
+            break
+    return citation_columns
+
+
+def nemo_response_stage_input_formatter(bot_message: str) -> list:
+    """
+    Format the input message for the Nemo guard during response guard stage.
+    only applicable to bot generated messages.
+    this format is only suitable for openai-based nemo guardrails.
+    """
+    messages = [
+        {"role": "context", "content": {"llm_output": bot_message}},
+        {"role": "user", "content": "just some place holder message"},
+    ]
+
+    return messages
+
+
+def nemo_response_stage_output_formatter(guard_message: dict) -> str:
+    """
+    Format the output message for the Nemo guard during response guard stage.
+    applicable to nemo guard generated messages.
+    this format is only suitable for openai-based nemo guardrails.
+    """
+    return guard_message["content"]
+
+
+def get_rouge_1_scorer():
+    return rouge_scorer.RougeScorer(["rouge1"], use_stemmer=True)
+
+
+def get_rouge_1_score(
+    scorer: rouge_scorer.RougeScorer,
+    llm_context: list[str],
+    llm_response: list[str],
+) -> float:
+    """Compute rouge score between list of context sent to LLM and its response.
+
+    Calculate ROUGE score between provided LLM context and LLM's response.
+    ROUGE is case insensitive, meaning that upper case letters are treated in same way as lower
+    case letters. ROUGE uses a random resampling algorithm which is non-deterministic, so we need
+    to fix seed.
+
+    Parameters
+    ----------
+    llm_context
+        context sent from vector database to Open-Source LLM
+    llm_response
+        response from the Open-Source LLM
+
+    Returns
+    -------
+    Rouge score between context and the answer
+    """
+    if (
+        llm_response is None
+        or len(llm_response) == 0
+        or llm_context is None
+        or len(llm_context) == 0
+    ):
+        return 0.0
+
+    valid_llm_responses = list(filter(None, llm_response))
+    if len(valid_llm_responses) == 0:
+        return 0.0
+
+    # Get only non None contexts for calculation
+    valid_llm_contexts = list(filter(None, llm_context))
+    if len(valid_llm_contexts) == 0:
+        return 0.0
+
+    response_to_score = " ".join([str(response) for response in valid_llm_responses])
+
+    # Adopt Greedy Strategy for Maximizing Rouge Score
+    # For each sentence keep max between sentence rouge1 precision and sentence rouge1 recall
+    # for given llm response. At the end calculate rouge1 precision and rouge1 recall
+    # for the entire block.
+    # rouge 1 precision = count of matching n-grams / count of context n-grams
+    # rouge 1 recall = count of matching n-grams / count of llm response n-grams
+    # According to detailed analysis of ROUGE: https://aclanthology.org/E17-2007.pdf
+    # High ROUGE score is hard to achieve, but greedy approach achieves acceptable results.
+    # TODO: https://github.com/Tiiiger/bert_score/ use bert_score instead.
+    # Rouge is broken because it doesn't care about semantics, it only compares token to token.
+    # We need to capture semantics and this will significantly boost results, because
+    # in order to get high rouge, LLM response needs to do "parroting", just mimicking the
+    # context as much as possible. Simple GPT paraphrasing with correct answer can break Rouge.
+
+    best_rouge_score = 0.0
+    # Greedy Strategy, pick best rouge score between each context sentence and llm response
+    for context_sentence in valid_llm_contexts:
+        sentence_score = scorer.score(str(context_sentence), response_to_score)
+        best_rouge_score = max(
+            best_rouge_score,
+            sentence_score["rouge1"].precision,
+            sentence_score["rouge1"].recall,
+        )
+
+    context_to_score = " ".join([str(context) for context in valid_llm_contexts])
+    # Compute Rouge between whole context (concatenated sentences) and llm response
+    block_score = scorer.score(context_to_score, response_to_score)
+    best_rouge_score = max(
+        best_rouge_score, block_score["rouge1"].precision, block_score["rouge1"].recall
+    )
+    return best_rouge_score * SCALING_FACTOR
+
+
+def get_llm_gateway_client(
+    model: str | None = None,
+    llm_id: str | None = None,
+    openai_deployment_id: str | None = None,
+) -> ChatOpenAI:
+    """The LLM gateway client enables chat completions with DR provided credentials and metering.
+
+    Providing model is always required due to openai's chat api.
+    llm_id and deployment_id override model if provided.
+    The hierarchy is: model < llm_id < deployment_id
+    """
+    datarobot_endpoint, datarobot_api_token = get_datarobot_endpoint_and_token()
+    client = ChatOpenAI(
+        # default model is required by ChatOpenAI
+        model=model or "azure/gpt-4o",
+        api_key=datarobot_api_token,
+        base_url=f"{datarobot_endpoint}/genai/llmgw",
+        max_retries=0,  # retries are handled by the LLM Gateway
+        default_headers={
+            # used for metering
+            "Client-Id": "moderations",
+        },
+        extra_body={
+            # optional model overrides
+            "deployment_id": openai_deployment_id,
+            "llm_id": llm_id,
+        },
+    )
+    return client
+
+
+def try_to_fallback_to_llm_gateway(
+    llm_id: str | None,
+    openai_deployment_id: str | None,
+    llm_type: GuardLLMType,
+    e: Exception,
+) -> ChatOpenAI:
+    # USE the LLM gateway if its runtime parameter is available and enabled
+    # DO NOT USE the gateway if user provided credentials are specified
+    # which is the case if no exception was raised trying to create the LLM
+    # DATAROBOT and NIM LLM types are not supported by the gateway
+    if not json.loads(os.environ.get("ENABLE_LLM_GATEWAY_INFERENCE", "false")) or llm_type in [
+        GuardLLMType.DATAROBOT,
+        GuardLLMType.NIM,
+    ]:
+        raise e
+    llm = get_llm_gateway_client(llm_id=llm_id, openai_deployment_id=openai_deployment_id)
+    return llm
+
+
+def get_azure_openai_client(
+    openai_api_key: str,
+    openai_api_base: str,
+    openai_deployment_id: str,
+) -> AzureChatOpenAI:
+    azure_openai_client = AzureChatOpenAI(
+        model=openai_deployment_id,
+        azure_endpoint=openai_api_base,
+        api_key=openai_api_key,
+        deployment_name=openai_deployment_id,
+        api_version=DEFAULT_OPEN_AI_API_VERSION,
+    )
+    return azure_openai_client
+
+
+def get_vertex_client(
+    google_model: GoogleModel,
+    google_service_account: dict,
+    google_region: str,
+):
+    from google.oauth2 import service_account
+    from llama_index.llms.vertex import Vertex
+
+    vertex_credentials = service_account.Credentials.from_service_account_info(
+        google_service_account,
+        scopes=["https://www.googleapis.com/auth/cloud-platform"],
+    )
+    return Vertex(
+        model=GOOGLE_MODEL_TO_GOOGLE_MODEL_VERSION_MAP[google_model],
+        credentials=vertex_credentials,
+        project=vertex_credentials.project_id,
+        location=google_region,
+    )
+
+
+def get_bedrock_client(
+    aws_model: AwsModel,
+    aws_access_key_id: str,
+    aws_secret_access_key: str,
+    aws_region: str,
+    aws_session_token: str | None,
+):
+    from llama_index.llms.bedrock_converse import BedrockConverse
+
+    return BedrockConverse(
+        model=AWS_MODEL_TO_AWS_MODEL_VERSION_MAP[aws_model],
+        aws_access_key_id=aws_access_key_id,
+        aws_secret_access_key=aws_secret_access_key,
+        aws_session_token=aws_session_token,
+        region_name=aws_region,
+    )
+
+
+def get_datarobot_endpoint_and_token():
+    datarobot_endpoint = os.environ.get("DATAROBOT_ENDPOINT", None)
+    if datarobot_endpoint is None:
+        raise ValueError(
+            "Missing DataRobot endpoint 'DATAROBOT_ENDPOINT' in environment variable,"
+            " can't create DataRobotLLM"
+        )
+
+    datarobot_api_token = os.environ.get("DATAROBOT_API_TOKEN", None)
+    if datarobot_api_token is None:
+        raise ValueError(
+            "Missing DataRobot API Token 'DATAROBOT_API_TOKEN' in environment variable,"
+            " can't create DataRobotLLM"
+        )
+    return datarobot_endpoint, datarobot_api_token
+
+
+def get_datarobot_llm(deployment):
+    datarobot_endpoint, datarobot_api_token = get_datarobot_endpoint_and_token()
+    return DataRobotLLM(
+        deployment,
+        datarobot_endpoint=datarobot_endpoint,
+        datarobot_api_token=datarobot_api_token,
+    )
+
+
+def get_nim_model_id_served_by_the_url(base_url: str, api_key: str):
+    models_url = f"{base_url}/directAccess/nim/v1/models/"
+    headers = {
+        "Authorization": f"Bearer {api_key}",
+        "Accept": "application/json",
+    }
+    response = requests.get(models_url, headers=headers)
+    response.raise_for_status()
+    json_response = response.json()
+
+    # We expect the API to adhere to OpenAI /v1/models spec, can't do
+    # all checks
+    for model in json_response["data"]:
+        # Let's get the first model id of the list to query
+        return model["id"]
+
+    # No models? Raise
+    raise Exception(f"The URL is not serving any models: {models_url}")
+
+
+def get_chat_nvidia_llm(api_key: str, base_url: str) -> ChatNVIDIA:
+    model_id = get_nim_model_id_served_by_the_url(base_url, api_key)
+    _logger.info(f"Found model {model_id} being served at url: {base_url}")
+    nim_model = determine_model(model_id)
+    if nim_model is None:
+        # Most likely a DataRobot NIM model, so first
+        # register it and then use it
+        chat_url = f"{base_url}/chat/completions"
+        nim_model = Model(
+            id=model_id,
+            model_type="chat",
+            client="ChatNVIDIA",
+            endpoint=chat_url,
+        )
+        # This registration is for the sake of NeMo guardrails to find
+        # the datarobot LLM
+        register_model(nim_model)
+    return ChatNVIDIA(model=nim_model.id, api_key=api_key)
+
+
+def calculate_faithfulness(
+    evaluator: FaithfulnessEvaluator,
+    llm_query: str,
+    llm_response: str,
+    llm_context: list[str],
+):
+    """Compute faithfulness score between list of context and LLM response for given metric.
+
+    Parameters
+    ----------
+    llm_query
+        query sent from vector database to Open-Source LLM
+    llm_response
+        response from the Open-Source LLM
+    llm_context
+        context sent from vector database to Open-Source LLM
+
+    Returns
+    -------
+    Faithfulness score: 1.0 if the response is faithful to the query, 0.0 otherwise.
+    """
+    if llm_response is None or llm_query is None or llm_context is None or len(llm_context) == 0:
+        return 0.0
+
+    # Get only non None contexts for calculation
+    valid_llm_contexts = list(filter(None, llm_context))
+    if len(valid_llm_contexts) == 0:
+        return 0.0
+
+    llm_contexts = [str(context) for context in valid_llm_contexts]
+    faithfulness_result = evaluator.evaluate(str(llm_query), str(llm_response), llm_contexts)
+    return faithfulness_result.score
+
+
+def calculate_agent_goal_accuracy(
+    scorer: AgentGoalAccuracyWithoutReference,
+    prompt: str,
+    interactions: str,
+    response: str,
+):
+    if interactions is None:
+        # If interactions are missing - we use prompt and response to gauge the
+        # goal accuracy
+        sample = MultiTurnSample(
+            user_input=[HumanMessage(content=prompt), AIMessage(content=response)]
+        )
+    else:
+        samples_dict = json.loads(interactions)
+        inputs = []
+        for message in samples_dict["user_input"]:
+            if message["type"] == "ai":
+                inputs.append(
+                    AIMessage(content=message["content"], tool_calls=message.get("tool_calls", []))
+                )
+            elif message["type"] == "human":
+                inputs.append(HumanMessage(content=message["content"]))
+            elif message["type"] == "tool":
+                inputs.append(ToolMessage(content=message["content"]))
+        sample = MultiTurnSample(user_input=inputs)
+    return scorer.multi_turn_score(sample)
+
+
+class ModerationDeepEvalLLM(DeepEvalBaseLLM):
+    def __init__(self, llm, *args, **kwargs):
+        self.llm = llm
+
+    def load_model(self, *args, **kwargs):
+        return self.llm
+
+    def generate(self, prompt: str) -> str:
+        return self.llm.invoke(prompt).content
+
+    async def a_generate(self, prompt: str) -> str:
+        res = await self.llm.ainvoke(prompt)
+        return res.content
+
+    def get_model_name(self):
+        return "DeepEval LLM for Moderation"
+
+
+def calculate_task_adherence(
+    scorer: TaskCompletionMetric,
+    prompt: str,
+    interactions: str,
+    response: str,
+):
+    # As discussed in ticket https://datarobot.atlassian.net/browse/RAPTOR-12657, the
+    # library will calculate the task completion metric based on input and output
+    # only and will not use tools information for now.
+    #
+    # But, we will keep the `interactions` parameter (unused) so that it will be easier
+    # to implement improvement whenever required.
+    _ = interactions
+    test_case = LLMTestCase(input=prompt, actual_output=response, tools_called=[])
+    return scorer.measure(test_case)
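The 457-line hunk above matches `datarobot_dome/guard_helpers.py` in the file list (the logger name `.guard_helpers` also points there). Its most involved piece is the greedy ROUGE-1 scoring used by the ROUGE-1 guard. The snippet below is a minimal usage sketch, not part of the published wheel: the context and response strings are invented, and it assumes the package is installed as `datarobot-moderations`.

```python
# Illustrative only: exercise the ROUGE-1 guard helpers defined in guard_helpers.py.
from datarobot_dome.guard_helpers import get_rouge_1_score, get_rouge_1_scorer

scorer = get_rouge_1_scorer()  # RougeScorer(["rouge1"], use_stemmer=True)

# Hypothetical vector-database context and LLM response.
llm_context = [
    "Guards score each prompt and response before it reaches the user.",
    "Moderation metrics are reported back to the DataRobot deployment.",
]
llm_response = ["Each response is scored by guards before it reaches the user."]

# Greedy strategy from the hunk: take the best of per-sentence and whole-block
# rouge1 precision/recall, scaled by SCALING_FACTOR (1), so the result is in [0, 1].
score = get_rouge_1_score(scorer, llm_context, llm_response)
print(f"ROUGE-1 guard score: {score:.3f}")
```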
@@ -0,0 +1,11 @@
+# ---------------------------------------------------------------------------------
+# Copyright (c) 2025 DataRobot, Inc. and its affiliates. All rights reserved.
+# Last updated 2025.
+#
+# DataRobot, Inc. Confidential.
+# This is proprietary source code of DataRobot, Inc. and its affiliates.
+#
+# This file and its contents are subject to DataRobot Tool and Utility Agreement.
+# For details, see
+# https://www.datarobot.com/wp-content/uploads/2021/07/DataRobot-Tool-and-Utility-Agreement.pdf.
+# ---------------------------------------------------------------------------------
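For completeness, here is a short sketch of the token-count helpers from the same `guard_helpers.py` hunk, applied to a small made-up predictions frame. The column names `promptText` and `completion` are placeholders chosen for illustration, not names taken from the package; the real names come from the pipeline configuration.

```python
# Illustrative only: prompt/response token counting used for cost metrics.
import pandas as pd

from datarobot_dome.guard_helpers import (
    calculate_token_counts_for_cost_calculations,
    get_token_count,
)

# get_token_count() tokenizes with tiktoken's cl100k_base encoding and returns 0 for None.
assert get_token_count(None) == 0
assert get_token_count("hello world") > 0

# Hypothetical predictions frame with a prompt column and a response column.
df = pd.DataFrame(
    {
        "promptText": ["What is a guard?", "Summarize the moderation policy."],
        "completion": ["A guard moderates LLM traffic.", None],
    }
)

# Adds the prompt/response token-count columns consumed downstream for cost calculations.
df = calculate_token_counts_for_cost_calculations("promptText", "completion", df)
print(df.columns.tolist())
```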