datarobot-moderations 11.1.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,457 @@
+ # ---------------------------------------------------------------------------------
+ # Copyright (c) 2025 DataRobot, Inc. and its affiliates. All rights reserved.
+ # Last updated 2025.
+ #
+ # DataRobot, Inc. Confidential.
+ # This is proprietary source code of DataRobot, Inc. and its affiliates.
+ #
+ # This file and its contents are subject to DataRobot Tool and Utility Agreement.
+ # For details, see
+ # https://www.datarobot.com/wp-content/uploads/2021/07/DataRobot-Tool-and-Utility-Agreement.pdf.
+ # ---------------------------------------------------------------------------------
+ import json
+ import logging
+ import os
+
+ import pandas as pd
+ import requests
+ import tiktoken
+ from deepeval.metrics import TaskCompletionMetric
+ from deepeval.models import DeepEvalBaseLLM
+ from deepeval.test_case import LLMTestCase
+ from langchain_nvidia_ai_endpoints import ChatNVIDIA
+ from langchain_nvidia_ai_endpoints import Model
+ from langchain_nvidia_ai_endpoints import register_model
+ from langchain_nvidia_ai_endpoints._statics import determine_model
+ from langchain_openai import AzureChatOpenAI
+ from langchain_openai import ChatOpenAI
+ from llama_index.core.evaluation import FaithfulnessEvaluator
+ from ragas import MultiTurnSample
+ from ragas.messages import AIMessage
+ from ragas.messages import HumanMessage
+ from ragas.messages import ToolMessage
+ from ragas.metrics import AgentGoalAccuracyWithoutReference
+ from rouge_score import rouge_scorer
+
+ from datarobot_dome.constants import AWS_MODEL_TO_AWS_MODEL_VERSION_MAP
+ from datarobot_dome.constants import GOOGLE_MODEL_TO_GOOGLE_MODEL_VERSION_MAP
+ from datarobot_dome.constants import LOGGER_NAME_PREFIX
+ from datarobot_dome.constants import PROMPT_TOKEN_COUNT_COLUMN_NAME_FROM_USAGE
+ from datarobot_dome.constants import RESPONSE_TOKEN_COUNT_COLUMN_NAME_FROM_USAGE
+ from datarobot_dome.constants import AwsModel
+ from datarobot_dome.constants import GoogleModel
+ from datarobot_dome.constants import GuardLLMType
+ from datarobot_dome.llm import DataRobotLLM
+
+ # Ideally, we want to return a confidence score between 0.0 and 100.0,
+ # but for the ROUGE-1 guard the UI lets the user configure a value between
+ # 0 and 1, so the scaling factor is 1.
+ SCALING_FACTOR = 1
+ DEFAULT_OPEN_AI_API_VERSION = "2023-03-15-preview"
+
+ _logger = logging.getLogger(LOGGER_NAME_PREFIX + ".guard_helpers")
+
+
+ def get_token_count(input: str, encoding: str = "cl100k_base") -> int:
+     """Get the token count for the input."""
+     if input is None:
+         return 0
+     encoder = tiktoken.get_encoding(encoding)
+     return len(encoder.encode(str(input), disallowed_special=()))
+
+
+ def calculate_token_counts_for_cost_calculations(prompt_column_name, response_column_name, df):
+     # For either interface, the prompt is part of the predictions_df, so prompt_column_name
+     # should be present in the df
+     df[PROMPT_TOKEN_COUNT_COLUMN_NAME_FROM_USAGE] = df[prompt_column_name].apply(
+         lambda x: get_token_count(x)
+     )
+     df[RESPONSE_TOKEN_COUNT_COLUMN_NAME_FROM_USAGE] = df[response_column_name].apply(
+         lambda x: get_token_count(x)
+     )
+     return df
+
+
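A minimal usage sketch (not part of the packaged module) of the token-count helpers above; the dataframe and column names are illustrative assumptions:

    import pandas as pd

    # Hypothetical predictions dataframe with one prompt/response pair.
    df = pd.DataFrame(
        {
            "promptText": ["What is MLOps?"],
            "completion": ["MLOps is the practice of operationalizing ML models."],
        }
    )
    # Adds the prompt/response token-count columns using tiktoken's cl100k_base encoding.
    df = calculate_token_counts_for_cost_calculations("promptText", "completion", df)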
+ def get_citation_columns(columns: pd.Index) -> list:
+     """
+     Return the citation columns in the order 0, 1, 2, etc.
+     Order matters.
+     """
+     index = 0
+     citation_columns = []
+     while True:
+         column_name = f"CITATION_CONTENT_{index}"
+         if column_name in columns:
+             citation_columns.append(column_name)
+             index += 1
+         else:
+             break
+     return citation_columns
+
+
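For illustration, a small self-contained check of the ordering behavior described in the docstring (the non-citation column name is an assumption):

    import pandas as pd

    columns = pd.Index(["CITATION_CONTENT_1", "prompt", "CITATION_CONTENT_0"])
    # Columns come back in index order, stopping at the first missing index.
    assert get_citation_columns(columns) == ["CITATION_CONTENT_0", "CITATION_CONTENT_1"]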
+ def nemo_response_stage_input_formatter(bot_message: str) -> list:
+     """
+     Format the input message for the Nemo guard during the response guard stage.
+     Only applicable to bot-generated messages.
+     This format is only suitable for OpenAI-based NeMo Guardrails.
+     """
+     messages = [
+         {"role": "context", "content": {"llm_output": bot_message}},
+         {"role": "user", "content": "just some place holder message"},
+     ]
+
+     return messages
+
+
+ def nemo_response_stage_output_formatter(guard_message: dict) -> str:
+     """
+     Format the output message for the Nemo guard during the response guard stage.
+     Applicable to messages generated by the Nemo guard.
+     This format is only suitable for OpenAI-based NeMo Guardrails.
+     """
+     return guard_message["content"]
+
+
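A brief sketch of the message shapes these formatters produce and consume; the example strings are assumptions:

    messages = nemo_response_stage_input_formatter("The bot's draft answer.")
    # messages == [
    #     {"role": "context", "content": {"llm_output": "The bot's draft answer."}},
    #     {"role": "user", "content": "just some place holder message"},
    # ]

    # The guard's reply is unwrapped back to plain text:
    assert nemo_response_stage_output_formatter({"role": "assistant", "content": "Yes"}) == "Yes"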
+ def get_rouge_1_scorer():
+     return rouge_scorer.RougeScorer(["rouge1"], use_stemmer=True)
+
+
+ def get_rouge_1_score(
+     scorer: rouge_scorer.RougeScorer,
+     llm_context: list[str],
+     llm_response: list[str],
+ ) -> float:
+     """Compute the ROUGE score between the list of context sent to the LLM and its response.
+
+     Calculate the ROUGE score between the provided LLM context and the LLM's response.
+     ROUGE is case insensitive, meaning that upper case letters are treated in the same way as
+     lower case letters. ROUGE uses a random resampling algorithm which is non-deterministic,
+     so we need to fix the seed.
+
+     Parameters
+     ----------
+     llm_context
+         context sent from the vector database to the Open-Source LLM
+     llm_response
+         response from the Open-Source LLM
+
+     Returns
+     -------
+     ROUGE score between the context and the answer
+     """
+     if (
+         llm_response is None
+         or len(llm_response) == 0
+         or llm_context is None
+         or len(llm_context) == 0
+     ):
+         return 0.0
+
+     valid_llm_responses = list(filter(None, llm_response))
+     if len(valid_llm_responses) == 0:
+         return 0.0
+
+     # Get only non-None contexts for calculation
+     valid_llm_contexts = list(filter(None, llm_context))
+     if len(valid_llm_contexts) == 0:
+         return 0.0
+
+     response_to_score = " ".join([str(response) for response in valid_llm_responses])
+
+     # Adopt a greedy strategy for maximizing the ROUGE score.
+     # For each context sentence, keep the max of sentence rouge1 precision and rouge1 recall
+     # for the given LLM response. At the end, calculate rouge1 precision and rouge1 recall
+     # for the entire block.
+     # rouge1 precision = count of matching n-grams / count of LLM response n-grams
+     # rouge1 recall = count of matching n-grams / count of context n-grams
+     # According to a detailed analysis of ROUGE (https://aclanthology.org/E17-2007.pdf),
+     # a high ROUGE score is hard to achieve, but the greedy approach achieves acceptable
+     # results.
+     # TODO: use bert_score instead (https://github.com/Tiiiger/bert_score/).
+     # ROUGE is limited because it doesn't capture semantics; it only compares tokens to
+     # tokens. Capturing semantics would significantly boost results, because to get a high
+     # ROUGE score the LLM response needs to "parrot" the context, mimicking it as much as
+     # possible. Simple GPT paraphrasing of a correct answer can break ROUGE.
+
+     best_rouge_score = 0.0
+     # Greedy strategy: pick the best ROUGE score between each context sentence and the
+     # LLM response
+     for context_sentence in valid_llm_contexts:
+         sentence_score = scorer.score(str(context_sentence), response_to_score)
+         best_rouge_score = max(
+             best_rouge_score,
+             sentence_score["rouge1"].precision,
+             sentence_score["rouge1"].recall,
+         )
+
+     context_to_score = " ".join([str(context) for context in valid_llm_contexts])
+     # Compute ROUGE between the whole context (concatenated sentences) and the LLM response
+     block_score = scorer.score(context_to_score, response_to_score)
+     best_rouge_score = max(
+         best_rouge_score, block_score["rouge1"].precision, block_score["rouge1"].recall
+     )
+     return best_rouge_score * SCALING_FACTOR
+
+
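A minimal usage sketch of the greedy ROUGE-1 helper, with made-up context and response strings:

    scorer = get_rouge_1_scorer()
    context = ["The Eiffel Tower is located in Paris.", "It was completed in 1889."]
    answer = ["The Eiffel Tower is in Paris and was finished in 1889."]
    # Best rouge1 precision/recall over each sentence and the whole block, in [0.0, 1.0]
    # because SCALING_FACTOR is 1.
    score = get_rouge_1_score(scorer, context, answer)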
+ def get_llm_gateway_client(
+     model: str | None = None,
+     llm_id: str | None = None,
+     openai_deployment_id: str | None = None,
+ ) -> ChatOpenAI:
+     """The LLM gateway client enables chat completions with DR-provided credentials and metering.
+
+     Providing a model is always required by OpenAI's chat API.
+     llm_id and deployment_id override model if provided.
+     The hierarchy is: model < llm_id < deployment_id
+     """
+     datarobot_endpoint, datarobot_api_token = get_datarobot_endpoint_and_token()
+     client = ChatOpenAI(
+         # default model is required by ChatOpenAI
+         model=model or "azure/gpt-4o",
+         api_key=datarobot_api_token,
+         base_url=f"{datarobot_endpoint}/genai/llmgw",
+         max_retries=0,  # retries are handled by the LLM Gateway
+         default_headers={
+             # used for metering
+             "Client-Id": "moderations",
+         },
+         extra_body={
+             # optional model overrides
+             "deployment_id": openai_deployment_id,
+             "llm_id": llm_id,
+         },
+     )
+     return client
+
+
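For illustration, one possible way to call the gateway client; the model identifier is an assumption, and DATAROBOT_ENDPOINT / DATAROBOT_API_TOKEN must already be set:

    # "azure/gpt-4o-mini" is an assumed identifier, not necessarily one the gateway exposes.
    client = get_llm_gateway_client(model="azure/gpt-4o-mini")
    reply = client.invoke("Summarize this guard's purpose in one sentence.")
    print(reply.content)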
+ def try_to_fallback_to_llm_gateway(
+     llm_id: str | None,
+     openai_deployment_id: str | None,
+     llm_type: GuardLLMType,
+     e: Exception,
+ ) -> ChatOpenAI:
+     # USE the LLM gateway if its runtime parameter is available and enabled.
+     # DO NOT USE the gateway if user-provided credentials are specified,
+     # which is the case if no exception was raised trying to create the LLM.
+     # DATAROBOT and NIM LLM types are not supported by the gateway.
+     if not json.loads(os.environ.get("ENABLE_LLM_GATEWAY_INFERENCE", "false")) or llm_type in [
+         GuardLLMType.DATAROBOT,
+         GuardLLMType.NIM,
+     ]:
+         raise e
+     llm = get_llm_gateway_client(llm_id=llm_id, openai_deployment_id=openai_deployment_id)
+     return llm
+
+
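A hedged sketch of the fallback pattern the comments describe: build the guard LLM from user-supplied credentials first, and only fall back to the gateway if that raises. The credential values and the GuardLLMType member used here are placeholders/assumptions:

    openai_api_key = "<azure-openai-api-key>"  # placeholder
    openai_api_base = "https://<your-resource>.openai.azure.com/"  # placeholder
    deployment_id = "gpt-4o"  # placeholder deployment name

    try:
        llm = get_azure_openai_client(openai_api_key, openai_api_base, deployment_id)
    except Exception as e:
        llm = try_to_fallback_to_llm_gateway(
            llm_id=None,
            openai_deployment_id=deployment_id,
            llm_type=GuardLLMType.OPENAI,  # assumed member; any type other than DATAROBOT/NIM
            e=e,
        )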
+ def get_azure_openai_client(
+     openai_api_key: str,
+     openai_api_base: str,
+     openai_deployment_id: str,
+ ) -> AzureChatOpenAI:
+     azure_openai_client = AzureChatOpenAI(
+         model=openai_deployment_id,
+         azure_endpoint=openai_api_base,
+         api_key=openai_api_key,
+         deployment_name=openai_deployment_id,
+         api_version=DEFAULT_OPEN_AI_API_VERSION,
+     )
+     return azure_openai_client
+
+
+ def get_vertex_client(
+     google_model: GoogleModel,
+     google_service_account: dict,
+     google_region: str,
+ ):
+     from google.oauth2 import service_account
+     from llama_index.llms.vertex import Vertex
+
+     vertex_credentials = service_account.Credentials.from_service_account_info(
+         google_service_account,
+         scopes=["https://www.googleapis.com/auth/cloud-platform"],
+     )
+     return Vertex(
+         model=GOOGLE_MODEL_TO_GOOGLE_MODEL_VERSION_MAP[google_model],
+         credentials=vertex_credentials,
+         project=vertex_credentials.project_id,
+         location=google_region,
+     )
+
+
+ def get_bedrock_client(
+     aws_model: AwsModel,
+     aws_access_key_id: str,
+     aws_secret_access_key: str,
+     aws_region: str,
+     aws_session_token: str | None,
+ ):
+     from llama_index.llms.bedrock_converse import BedrockConverse
+
+     return BedrockConverse(
+         model=AWS_MODEL_TO_AWS_MODEL_VERSION_MAP[aws_model],
+         aws_access_key_id=aws_access_key_id,
+         aws_secret_access_key=aws_secret_access_key,
+         aws_session_token=aws_session_token,
+         region_name=aws_region,
+     )
+
+
+ def get_datarobot_endpoint_and_token():
+     datarobot_endpoint = os.environ.get("DATAROBOT_ENDPOINT", None)
+     if datarobot_endpoint is None:
+         raise ValueError(
+             "Missing DataRobot endpoint 'DATAROBOT_ENDPOINT' in environment variable,"
+             " can't create DataRobotLLM"
+         )
+
+     datarobot_api_token = os.environ.get("DATAROBOT_API_TOKEN", None)
+     if datarobot_api_token is None:
+         raise ValueError(
+             "Missing DataRobot API Token 'DATAROBOT_API_TOKEN' in environment variable,"
+             " can't create DataRobotLLM"
+         )
+     return datarobot_endpoint, datarobot_api_token
+
+
+ def get_datarobot_llm(deployment):
+     datarobot_endpoint, datarobot_api_token = get_datarobot_endpoint_and_token()
+     return DataRobotLLM(
+         deployment,
+         datarobot_endpoint=datarobot_endpoint,
+         datarobot_api_token=datarobot_api_token,
+     )
+
+
+ def get_nim_model_id_served_by_the_url(base_url: str, api_key: str):
+     models_url = f"{base_url}/directAccess/nim/v1/models/"
+     headers = {
+         "Authorization": f"Bearer {api_key}",
+         "Accept": "application/json",
+     }
+     response = requests.get(models_url, headers=headers)
+     response.raise_for_status()
+     json_response = response.json()
+
+     # We expect the API to adhere to the OpenAI /v1/models spec; we can't do
+     # all checks
+     for model in json_response["data"]:
+         # Let's get the first model id in the list to query
+         return model["id"]
+
+     # No models? Raise
+     raise Exception(f"The URL is not serving any models: {models_url}")
+
+
+ def get_chat_nvidia_llm(api_key: str, base_url: str) -> ChatNVIDIA:
+     model_id = get_nim_model_id_served_by_the_url(base_url, api_key)
+     _logger.info(f"Found model {model_id} being served at url: {base_url}")
+     nim_model = determine_model(model_id)
+     if nim_model is None:
+         # Most likely a DataRobot NIM model, so first
+         # register it and then use it
+         chat_url = f"{base_url}/chat/completions"
+         nim_model = Model(
+             id=model_id,
+             model_type="chat",
+             client="ChatNVIDIA",
+             endpoint=chat_url,
+         )
+         # This registration is needed so that NeMo Guardrails can find
+         # the DataRobot LLM
+         register_model(nim_model)
+     return ChatNVIDIA(model=nim_model.id, api_key=api_key)
+
+
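One possible call pattern for the NIM helpers above; the URL and token are placeholders for a DataRobot-hosted NIM deployment:

    base_url = "https://app.datarobot.com/api/v2/deployments/<deployment-id>"  # placeholder
    api_key = "<datarobot-api-token>"  # placeholder
    chat_llm = get_chat_nvidia_llm(api_key=api_key, base_url=base_url)
    print(chat_llm.invoke("Hello!").content)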
+ def calculate_faithfulness(
+     evaluator: FaithfulnessEvaluator,
+     llm_query: str,
+     llm_response: str,
+     llm_context: list[str],
+ ):
+     """Compute the faithfulness score between the list of context and the LLM response.
+
+     Parameters
+     ----------
+     llm_query
+         query sent to the Open-Source LLM
+     llm_response
+         response from the Open-Source LLM
+     llm_context
+         context sent from the vector database to the Open-Source LLM
+
+     Returns
+     -------
+     Faithfulness score: 1.0 if the response is faithful to the query, 0.0 otherwise.
+     """
+     if llm_response is None or llm_query is None or llm_context is None or len(llm_context) == 0:
+         return 0.0
+
+     # Get only non-None contexts for calculation
+     valid_llm_contexts = list(filter(None, llm_context))
+     if len(valid_llm_contexts) == 0:
+         return 0.0
+
+     llm_contexts = [str(context) for context in valid_llm_contexts]
+     faithfulness_result = evaluator.evaluate(str(llm_query), str(llm_response), llm_contexts)
+     return faithfulness_result.score
+
+
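A sketch only: any llama_index-compatible LLM can back the FaithfulnessEvaluator; here the module's own DataRobot LLM wrapper is assumed, with `deployment` standing in for a deployment object obtained elsewhere:

    evaluator = FaithfulnessEvaluator(llm=get_datarobot_llm(deployment))
    score = calculate_faithfulness(
        evaluator,
        llm_query="What year was the Eiffel Tower completed?",
        llm_response="It was completed in 1889.",
        llm_context=["The Eiffel Tower was completed in 1889."],
    )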
+ def calculate_agent_goal_accuracy(
+     scorer: AgentGoalAccuracyWithoutReference,
+     prompt: str,
+     interactions: str,
+     response: str,
+ ):
+     if interactions is None:
+         # If interactions are missing, we use the prompt and response to gauge the
+         # goal accuracy
+         sample = MultiTurnSample(
+             user_input=[HumanMessage(content=prompt), AIMessage(content=response)]
+         )
+     else:
+         samples_dict = json.loads(interactions)
+         inputs = []
+         for message in samples_dict["user_input"]:
+             if message["type"] == "ai":
+                 inputs.append(
+                     AIMessage(content=message["content"], tool_calls=message.get("tool_calls", []))
+                 )
+             elif message["type"] == "human":
+                 inputs.append(HumanMessage(content=message["content"]))
+             elif message["type"] == "tool":
+                 inputs.append(ToolMessage(content=message["content"]))
+         sample = MultiTurnSample(user_input=inputs)
+     return scorer.multi_turn_score(sample)
+
+
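Based on how `interactions` is parsed above, this is the expected JSON shape (the conversation content is made up); `scorer`, `prompt`, and `response` are placeholders supplied by the calling guard:

    interactions = json.dumps(
        {
            "user_input": [
                {"type": "human", "content": "Book a table for two at 7 pm."},
                {"type": "ai", "content": "Booking now.", "tool_calls": []},
                {"type": "tool", "content": "Reservation confirmed."},
                {"type": "ai", "content": "Your table for two at 7 pm is booked."},
            ]
        }
    )
    accuracy = calculate_agent_goal_accuracy(scorer, prompt, interactions, response)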
+ class ModerationDeepEvalLLM(DeepEvalBaseLLM):
+     def __init__(self, llm, *args, **kwargs):
+         self.llm = llm
+
+     def load_model(self, *args, **kwargs):
+         return self.llm
+
+     def generate(self, prompt: str) -> str:
+         return self.llm.invoke(prompt).content
+
+     async def a_generate(self, prompt: str) -> str:
+         res = await self.llm.ainvoke(prompt)
+         return res.content
+
+     def get_model_name(self):
+         return "DeepEval LLM for Moderation"
+
+
+ def calculate_task_adherence(
+     scorer: TaskCompletionMetric,
+     prompt: str,
+     interactions: str,
+     response: str,
+ ):
+     # As discussed in ticket https://datarobot.atlassian.net/browse/RAPTOR-12657, the
+     # library will calculate the task completion metric based on input and output
+     # only and will not use tools information for now.
+     #
+     # However, we keep the `interactions` parameter (unused) so that it will be easier
+     # to implement this improvement whenever required.
+     _ = interactions
+     test_case = LLMTestCase(input=prompt, actual_output=response, tools_called=[])
+     return scorer.measure(test_case)
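A hedged sketch of how the DeepEval wrapper and the task-adherence helper might fit together; using the LLM gateway client as the judge model is an assumption:

    judge = ModerationDeepEvalLLM(get_llm_gateway_client())
    scorer = TaskCompletionMetric(model=judge)
    score = calculate_task_adherence(
        scorer,
        prompt="List three fruits.",
        interactions=None,
        response="Apples, bananas, and cherries.",
    )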
@@ -0,0 +1,11 @@
+ # ---------------------------------------------------------------------------------
+ # Copyright (c) 2025 DataRobot, Inc. and its affiliates. All rights reserved.
+ # Last updated 2025.
+ #
+ # DataRobot, Inc. Confidential.
+ # This is proprietary source code of DataRobot, Inc. and its affiliates.
+ #
+ # This file and its contents are subject to DataRobot Tool and Utility Agreement.
+ # For details, see
+ # https://www.datarobot.com/wp-content/uploads/2021/07/DataRobot-Tool-and-Utility-Agreement.pdf.
+ # ---------------------------------------------------------------------------------