rasa-pro 3.15.0a1__py3-none-any.whl → 3.15.0a3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- rasa/builder/constants.py +5 -0
- rasa/builder/copilot/models.py +80 -28
- rasa/builder/download.py +110 -0
- rasa/builder/evaluator/__init__.py +0 -0
- rasa/builder/evaluator/constants.py +15 -0
- rasa/builder/evaluator/copilot_executor.py +89 -0
- rasa/builder/evaluator/dataset/models.py +173 -0
- rasa/builder/evaluator/exceptions.py +4 -0
- rasa/builder/evaluator/response_classification/__init__.py +0 -0
- rasa/builder/evaluator/response_classification/constants.py +66 -0
- rasa/builder/evaluator/response_classification/evaluator.py +346 -0
- rasa/builder/evaluator/response_classification/langfuse_runner.py +463 -0
- rasa/builder/evaluator/response_classification/models.py +61 -0
- rasa/builder/evaluator/scripts/__init__.py +0 -0
- rasa/builder/evaluator/scripts/run_response_classification_evaluator.py +152 -0
- rasa/builder/jobs.py +208 -1
- rasa/builder/logging_utils.py +25 -24
- rasa/builder/main.py +6 -1
- rasa/builder/models.py +23 -0
- rasa/builder/project_generator.py +29 -10
- rasa/builder/service.py +104 -22
- rasa/builder/training_service.py +13 -1
- rasa/builder/validation_service.py +2 -1
- rasa/core/actions/action_clean_stack.py +32 -0
- rasa/core/actions/constants.py +4 -0
- rasa/core/actions/custom_action_executor.py +70 -12
- rasa/core/actions/grpc_custom_action_executor.py +41 -2
- rasa/core/actions/http_custom_action_executor.py +49 -25
- rasa/core/channels/voice_stream/voice_channel.py +14 -2
- rasa/dialogue_understanding/generator/llm_based_command_generator.py +6 -3
- rasa/dialogue_understanding/generator/single_step/compact_llm_command_generator.py +15 -7
- rasa/dialogue_understanding/generator/single_step/search_ready_llm_command_generator.py +15 -8
- rasa/dialogue_understanding/processor/command_processor.py +49 -7
- rasa/shared/providers/_configs/azure_openai_client_config.py +4 -5
- rasa/shared/providers/_configs/default_litellm_client_config.py +4 -4
- rasa/shared/providers/_configs/litellm_router_client_config.py +3 -2
- rasa/shared/providers/_configs/openai_client_config.py +5 -7
- rasa/shared/providers/_configs/rasa_llm_client_config.py +4 -4
- rasa/shared/providers/_configs/self_hosted_llm_client_config.py +4 -4
- rasa/shared/providers/llm/_base_litellm_client.py +42 -14
- rasa/shared/providers/llm/litellm_router_llm_client.py +38 -15
- rasa/shared/providers/llm/self_hosted_llm_client.py +34 -32
- rasa/shared/utils/configs.py +5 -8
- rasa/utils/endpoints.py +6 -0
- rasa/version.py +1 -1
- {rasa_pro-3.15.0a1.dist-info → rasa_pro-3.15.0a3.dist-info}/METADATA +12 -12
- {rasa_pro-3.15.0a1.dist-info → rasa_pro-3.15.0a3.dist-info}/RECORD +50 -37
- {rasa_pro-3.15.0a1.dist-info → rasa_pro-3.15.0a3.dist-info}/NOTICE +0 -0
- {rasa_pro-3.15.0a1.dist-info → rasa_pro-3.15.0a3.dist-info}/WHEEL +0 -0
- {rasa_pro-3.15.0a1.dist-info → rasa_pro-3.15.0a3.dist-info}/entry_points.txt +0 -0
rasa/builder/copilot/models.py
CHANGED
@@ -344,6 +344,55 @@ ChatMessage = Union[
 ]
 
 
+def create_chat_message_from_dict(message_data: Dict[str, Any]) -> ChatMessage:
+    """Parse a single chat message dictionary into a ChatMessage object.
+
+    This utility function manually parses a chat message dictionary into the
+    appropriate ChatMessage type based on its role field.
+
+    Args:
+        message_data: Dictionary containing chat message data
+
+    Returns:
+        Parsed ChatMessage object
+
+    Raises:
+        ValueError: If an unknown role is encountered
+
+    Example:
+        >>> message_data = {
+        ...     "role": "user",
+        ...     "content": [{"type": "text", "text": "Hello"}]
+        ... }
+        >>> message = create_chat_message_from_dict(message_data)
+        >>> isinstance(message, UserChatMessage)
+        True
+        >>> message.role
+        'user'
+    """
+    available_roles = [ROLE_USER, ROLE_COPILOT, ROLE_COPILOT_INTERNAL]
+    role = message_data.get("role")
+
+    if role == ROLE_USER:
+        return UserChatMessage(**message_data)
+    elif role == ROLE_COPILOT:
+        return CopilotChatMessage(**message_data)
+    elif role == ROLE_COPILOT_INTERNAL:
+        return InternalCopilotRequestChatMessage(**message_data)
+    else:
+        message = (
+            f"Unknown role '{role}' in chat message. "
+            f"Available roles are: {', '.join(available_roles)}."
+        )
+        structlogger.error(
+            "models.create_chat_message_from_dict.unknown_role",
+            event_info=message,
+            role=role,
+            available_roles=available_roles,
+        )
+        raise ValueError(message)
+
+
 class CopilotContext(BaseModel):
     """Model containing the context used by the copilot to generate a response."""
 
@@ -391,37 +440,40 @@ class CopilotRequest(BaseModel):
 
     @field_validator("copilot_chat_history", mode="before")
     @classmethod
-    def parse_chat_history(cls, v: List[Dict[str, Any]]) -> List[ChatMessage]:
+    def parse_chat_history(
+        cls, v: Union[List[Dict[str, Any]], List[ChatMessage]]
+    ) -> List[ChatMessage]:
         """Manually parse chat history messages based on role field."""
+        # If already parsed ChatMessage objects, return them as-is
+        if (
+            v
+            and isinstance(v, list)
+            and all(isinstance(item, ChatMessage) for item in v)
+        ):
+            return v  # type: ignore[return-value]
+
+        # Check for mixed types (some ChatMessage, some not)
+        if (
+            v
+            and isinstance(v, list)
+            and any(isinstance(item, ChatMessage) for item in v)
+        ):
+            message = (
+                "Mixed types in copilot_chat_history: cannot mix ChatMessage objects"
+                "with other types."
+            )
+            structlog.get_logger().error(
+                "copilot_request.parse_chat_history.mixed_types",
+                event_info=message,
+                chat_history_types=[type(item) for item in v],
+            )
+            raise ValueError(message)
+
+        # Otherwise, parse from dictionaries
         parsed_messages: List[ChatMessage] = []
-        available_roles = [ROLE_USER, ROLE_COPILOT, ROLE_COPILOT_INTERNAL]
         for message_data in v:
-            role = message_data.get("role")
-
-            if role == ROLE_USER:
-                parsed_messages.append(UserChatMessage(**message_data))
-
-            elif role == ROLE_COPILOT:
-                parsed_messages.append(CopilotChatMessage(**message_data))
-
-            elif role == ROLE_COPILOT_INTERNAL:
-                parsed_messages.append(
-                    InternalCopilotRequestChatMessage(**message_data)
-                )
-
-            else:
-                message = (
-                    f"Unknown role '{role}' in chat message. "
-                    f"Available roles are: {', '.join(available_roles)}."
-                )
-                structlogger.error(
-                    "copilot_request.parse_chat_history.unknown_role",
-                    event_info=message,
-                    role=role,
-                    available_roles=available_roles,
-                )
-                raise ValueError(message)
-
+            chat_message = create_chat_message_from_dict(message_data)
+            parsed_messages.append(chat_message)
         return parsed_messages
 
     @property
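Taken together, the two hunks replace the inline role dispatch in CopilotRequest.parse_chat_history with the shared module-level helper. A minimal sketch of the failure mode that helper guards, complementing the doctest above (the "bot" role is a deliberately invalid placeholder):

    from rasa.builder.copilot.models import create_chat_message_from_dict

    # Valid roles dispatch to the matching Pydantic model:
    msg = create_chat_message_from_dict(
        {"role": "user", "content": [{"type": "text", "text": "Hello"}]}
    )

    # Anything outside ROLE_USER / ROLE_COPILOT / ROLE_COPILOT_INTERNAL is
    # logged under "models.create_chat_message_from_dict.unknown_role" and rejected:
    try:
        create_chat_message_from_dict({"role": "bot"})
    except ValueError as err:
        print(err)  # Unknown role 'bot' in chat message. ...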
rasa/builder/download.py
CHANGED
@@ -1,11 +1,21 @@
 """Download utilities for bot projects."""
 
+import asyncio
 import io
 import os
 import sys
 import tarfile
+import tempfile
+from pathlib import Path
 from textwrap import dedent
 from typing import Dict, Optional
+from urllib.parse import urlparse
+
+import aiofiles
+import aiohttp
+
+from rasa.builder.constants import MAX_BACKUP_SIZE
+from rasa.builder.exceptions import ProjectGenerationError
 
 
 def _get_env_content() -> str:
@@ -138,3 +148,103 @@ def create_bot_project_archive(
 
     tar_buffer.seek(0)
     return tar_buffer.getvalue()
+
+
+def validate_s3_url(url: str) -> None:
+    """Validate that the URL is from an expected S3 domain for security.
+
+    Args:
+        url: The URL to validate
+
+    Raises:
+        ValueError: If the URL is not from an expected S3 domain
+    """
+    parsed = urlparse(url)
+    hostname = parsed.hostname
+
+    if not hostname:
+        raise ValueError("URL must have a valid hostname")
+
+    hostname = hostname.lower()
+    if not ("s3" in hostname and hostname.endswith(".amazonaws.com")):
+        raise ValueError(f"URL must be from an AWS S3 domain, got: {hostname}")
+
+
+async def download_backup_from_url(url: str) -> str:
+    """Download backup file from presigned URL to a temporary file.
+
+    Args:
+        url: Presigned URL to download from
+
+    Returns:
+        Path to the downloaded temporary file
+
+    Raises:
+        ProjectGenerationError: If download fails or file is too large
+    """
+    # Validate URL for security
+    validate_s3_url(url)
+
+    # Create temporary file path (using mktemp for path only, not creating the file)
+    temp_file_fd, temp_file_path = tempfile.mkstemp(suffix=".tar.gz")
+    os.close(temp_file_fd)  # Close the file descriptor immediately
+
+    try:
+        timeout = aiohttp.ClientTimeout(total=60)
+        async with aiohttp.ClientSession(timeout=timeout) as session:
+            async with session.get(url) as response:
+                if response.status != 200:
+                    raise ProjectGenerationError(
+                        f"Failed to download backup from presigned URL. "
+                        f"HTTP {response.status}: {response.reason}",
+                        attempts=1,
+                    )
+
+                # Check content length if available
+                content_length = response.headers.get("Content-Length")
+                if content_length and int(content_length) > MAX_BACKUP_SIZE:
+                    raise ProjectGenerationError(
+                        f"Backup file too large "
+                        f"({content_length} bytes > {MAX_BACKUP_SIZE} bytes). "
+                        f"Please provide a smaller backup file.",
+                        attempts=1,
+                    )
+
+                # Stream download to file using async file operations
+                downloaded_size = 0
+                async with aiofiles.open(temp_file_path, "wb") as f:
+                    async for chunk in response.content.iter_chunked(8192):
+                        downloaded_size += len(chunk)
+
+                        # Check size limit during download
+                        if downloaded_size > MAX_BACKUP_SIZE:
+                            raise ProjectGenerationError(
+                                f"Backup file too large "
+                                f"({downloaded_size} bytes > {MAX_BACKUP_SIZE} bytes).",
+                                attempts=1,
+                            )
+
+                        await f.write(chunk)
+
+        return temp_file_path
+
+    except ProjectGenerationError:
+        # Clean up temp file and re-raise ProjectGenerationError as-is
+        try:
+            Path(temp_file_path).unlink(missing_ok=True)
+        except Exception:
+            pass
+        raise
+    except asyncio.TimeoutError:
+        error_message = "Download timeout: Presigned URL may have expired."
+    except aiohttp.ClientError as exc:
+        error_message = f"Network error downloading backup: {exc}"
+    except Exception as exc:
+        error_message = f"Unexpected error downloading backup: {exc}"
+
+    # Clean up temp file and raise error
+    try:
+        Path(temp_file_path).unlink(missing_ok=True)
+    except Exception:
+        pass
+    raise ProjectGenerationError(error_message, attempts=1)
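For orientation, a sketch of how a caller might use the new pair; the restore_backup wrapper and the bucket URL are hypothetical placeholders, while the two function signatures come from the hunk above:

    import asyncio

    from rasa.builder.download import download_backup_from_url, validate_s3_url

    async def restore_backup(presigned_url: str) -> str:
        # validate_s3_url is also invoked inside download_backup_from_url;
        # calling it up front fails fast before any temp file is created.
        validate_s3_url(presigned_url)  # raises ValueError for non-S3 hosts
        # Streams to a temporary .tar.gz, enforcing MAX_BACKUP_SIZE, and
        # returns the temp file path.
        return await download_backup_from_url(presigned_url)

    # asyncio.run(restore_backup(
    #     "https://my-bucket.s3.eu-west-1.amazonaws.com/backup.tar.gz?X-Amz-..."
    # ))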
rasa/builder/evaluator/__init__.py
ADDED (empty file)
rasa/builder/evaluator/constants.py
ADDED
@@ -0,0 +1,15 @@
+"""Constants for the evaluator module."""
+
+from pathlib import Path
+
+# Base directory for the rasa package
+BASE_DIR = Path(__file__).parent.parent.parent
+
+# Response classification evaluation results directory
+RESPONSE_CLASSIFICATION_EVALUATION_RESULTS_DIR = (
+    BASE_DIR / "builder" / "evaluator" / "results"
+)
+# Default output filename
+DEFAULT_RESPONSE_CLASSIFICATION_EVALUATION_TEXT_OUTPUT_FILENAME = "run_results.txt"
+# Default YAML output filename
+RESPONSE_CLASSIFICATION_EVALUATION_YAML_OUTPUT_FILENAME = "run_results.yaml"
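Since this module lives at rasa/builder/evaluator/constants.py, the three .parent hops place BASE_DIR at the rasa package root, so results land under rasa/builder/evaluator/results. A quick standalone check of that path arithmetic:

    from pathlib import Path

    # Mirrors BASE_DIR = Path(__file__).parent.parent.parent for a module at
    # rasa/builder/evaluator/constants.py.
    module = Path("rasa/builder/evaluator/constants.py")
    base_dir = module.parent.parent.parent
    assert base_dir == Path("rasa")
    assert base_dir / "builder" / "evaluator" / "results" == Path(
        "rasa/builder/evaluator/results"
    )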
rasa/builder/evaluator/copilot_executor.py
ADDED
@@ -0,0 +1,89 @@
+"""Copilot execution utilities for evaluators.
+
+This module provides utilities for running copilot operations in evaluation contexts,
+independent of specific evaluation frameworks like Langfuse.
+"""
+
+from typing import List, Optional
+
+import structlog
+from pydantic import BaseModel
+
+from rasa.builder.config import COPILOT_HANDLER_ROLLING_BUFFER_SIZE
+from rasa.builder.copilot.models import (
+    CopilotContext,
+    CopilotGenerationContext,
+    GeneratedContent,
+    ReferenceSection,
+    ResponseCategory,
+)
+from rasa.builder.llm_service import llm_service
+
+structlogger = structlog.get_logger()
+
+
+class CopilotRunResult(BaseModel):
+    """Result from running the copilot with response handler."""
+
+    complete_response: Optional[str]
+    response_category: Optional[ResponseCategory]
+    reference_section: Optional[ReferenceSection]
+    generation_context: CopilotGenerationContext
+
+
+async def run_copilot_with_response_handler(
+    context: CopilotContext,
+) -> Optional[CopilotRunResult]:
+    """Run the copilot with response handler on the given context.
+
+    This function encapsulates the core copilot execution logic. It handles:
+    - Instantiating the copilot and response handler
+    - Generating a response and extracting the reference section from the given context
+    - Returning structured results
+
+    Args:
+        context: The copilot context to process.
+
+    Returns:
+        CopilotRunResult containing the complete response, category, and generation
+        context, or None if execution fails.
+
+    Raises:
+        Any exceptions from the copilot or response handler execution.
+    """
+    # Instantiate the copilot and response handler
+    copilot = llm_service.instantiate_copilot()
+    copilot_response_handler = llm_service.instantiate_handler(
+        COPILOT_HANDLER_ROLLING_BUFFER_SIZE
+    )
+
+    # Call the copilot to generate a response and handle it with the response
+    # handler
+    (original_stream, generation_context) = await copilot.generate_response(context)
+    intercepted_stream = copilot_response_handler.handle_response(original_stream)
+
+    # Exhaust the stream to get the complete response for evaluation
+    response_chunks: List[str] = []
+    response_category = None
+    async for chunk in intercepted_stream:
+        if not isinstance(chunk, GeneratedContent):
+            continue
+        response_chunks.append(chunk.content)
+        response_category = chunk.response_category
+
+    complete_response = "".join(response_chunks) if response_chunks else None
+
+    # Extract the reference section from the response handler
+    if generation_context.relevant_documents:
+        reference_section = copilot_response_handler.extract_references(
+            generation_context.relevant_documents
+        )
+    else:
+        reference_section = None
+
+    return CopilotRunResult(
+        complete_response=complete_response,
+        response_category=response_category,
+        reference_section=reference_section,
+        generation_context=generation_context,
+    )
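A sketch of how an evaluation loop might drive the new helper; evaluate_one is hypothetical, but the run_copilot_with_response_handler signature and the CopilotRunResult fields are taken from the file above:

    from rasa.builder.copilot.models import CopilotContext
    from rasa.builder.evaluator.copilot_executor import (
        run_copilot_with_response_handler,
    )

    async def evaluate_one(context: CopilotContext):
        result = await run_copilot_with_response_handler(context)
        if result is None:
            return None  # copilot execution failed
        # The response classification evaluator compares this predicted
        # category against the dataset entry's expected one.
        return result.response_category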
rasa/builder/evaluator/dataset/models.py
ADDED
@@ -0,0 +1,173 @@
+from typing import Any, Dict, List, Optional, Union
+
+import structlog
+from pydantic import BaseModel, Field, field_validator
+
+from rasa.builder.copilot.models import (
+    ChatMessage,
+    CopilotContext,
+    EventContent,
+    ReferenceEntry,
+    ResponseCategory,
+    create_chat_message_from_dict,
+)
+from rasa.builder.document_retrieval.models import Document
+from rasa.builder.shared.tracker_context import TrackerContext
+
+structlogger = structlog.get_logger()
+
+
+class DatasetInput(BaseModel):
+    """Model for the input field of a dataset entry."""
+
+    message: Optional[str] = None
+    tracker_event_attachments: List[EventContent] = Field(default_factory=list)
+
+
+class DatasetExpectedOutput(BaseModel):
+    """Model for the expected_output field of a dataset entry."""
+
+    answer: str
+    response_category: ResponseCategory
+    references: list[ReferenceEntry]
+
+
+class DatasetMetadataCopilotAdditionalContext(BaseModel):
+    """Model for the copilot_additional_context in metadata."""
+
+    relevant_documents: List[Document] = Field(default_factory=list)
+    relevant_assistant_files: Dict[str, str] = Field(default_factory=dict)
+    assistant_tracker_context: Optional[Dict[str, Any]] = None
+    assistant_logs: str = Field(default="")
+    copilot_chat_history: List[ChatMessage] = Field(default_factory=list)
+
+    @field_validator("copilot_chat_history", mode="before")
+    @classmethod
+    def parse_chat_history(
+        cls, v: Union[List[Dict[str, Any]], List[ChatMessage]]
+    ) -> List[ChatMessage]:
+        """Manually parse chat history messages based on role field."""
+        # If already parsed ChatMessage objects, return them as-is
+        if (
+            v
+            and isinstance(v, list)
+            and all(isinstance(item, ChatMessage) for item in v)
+        ):
+            return v  # type: ignore[return-value]
+
+        # Check for mixed types (some ChatMessage, some not)
+        if (
+            v
+            and isinstance(v, list)
+            and any(isinstance(item, ChatMessage) for item in v)
+        ):
+            message = (
+                "Mixed types in copilot_chat_history: cannot mix ChatMessage objects "
+                "with other types."
+            )
+            structlogger.error(
+                "dataset_entry.parse_chat_history.mixed_types",
+                event_info=message,
+                chat_history_types=[type(item) for item in v],
+            )
+            raise ValueError(message)
+
+        # Otherwise, parse from dictionaries
+        parsed_messages: List[ChatMessage] = []
+        for message_data in v:
+            chat_message = create_chat_message_from_dict(message_data)
+            parsed_messages.append(chat_message)
+        return parsed_messages
+
+
+class DatasetMetadata(BaseModel):
+    """Model for the metadata field of a dataset entry."""
+
+    ids: Dict[str, str] = Field(default_factory=dict)
+    copilot_additional_context: DatasetMetadataCopilotAdditionalContext = Field(
+        default_factory=DatasetMetadataCopilotAdditionalContext
+    )
+
+
+class DatasetEntry(BaseModel):
+    """Pydantic model for dataset entries from Langfuse ExperimentItem."""
+
+    # Basic fields from ExperimentItem
+    id: str
+    input: DatasetInput
+    expected_output: DatasetExpectedOutput
+    metadata: DatasetMetadata
+
+    def to_copilot_context(self) -> CopilotContext:
+        """Create a CopilotContext from the dataset entry.
+
+        Raises:
+            ValueError: If the metadata is None, as it's required for creating a valid
+                CopilotContext.
+
+        Returns:
+            CopilotContext with all the context information.
+        """
+        if self.metadata is None:
+            message = (
+                f"Cannot create CopilotContext from dataset item with id: {self.id}. "
+                f"Metadata is required but was None."
+            )
+            structlogger.error(
+                "dataset_entry.to_copilot_context.metadata_is_none",
+                event_info=message,
+                item_id=self.id,
+                item_metadata=self.metadata,
+            )
+            raise ValueError(message)
+
+        # Parse tracker context if available
+        tracker_context = None
+        if (
+            self.metadata.copilot_additional_context.assistant_tracker_context
+            is not None
+        ):
+            tracker_context = TrackerContext(
+                **self.metadata.copilot_additional_context.assistant_tracker_context
+            )
+
+        return CopilotContext(
+            tracker_context=tracker_context,
+            assistant_logs=self.metadata.copilot_additional_context.assistant_logs,
+            assistant_files=self.metadata.copilot_additional_context.relevant_assistant_files,
+            copilot_chat_history=self.metadata.copilot_additional_context.copilot_chat_history,
+        )
+
+    @classmethod
+    def from_raw_data(
+        cls,
+        id: str,
+        input_data: Dict[str, Any],
+        expected_output_data: Dict[str, Any],
+        metadata_data: Dict[str, Any],
+    ) -> "DatasetEntry":
+        """Create a DatasetEntry from raw dictionary data.
+
+        Args:
+            id: The dataset entry ID.
+            input_data: Raw input dictionary.
+            expected_output_data: Raw expected output dictionary.
+            metadata_data: Raw metadata dictionary with all the additional context
+                used to generate the Copilot response.
+
+        Returns:
+            DatasetEntry with parsed data.
+        """
+        # Use Pydantic's model_validate to parse nested structures
+        dataset_input = DatasetInput.model_validate(input_data)
+        dataset_expected_output = DatasetExpectedOutput.model_validate(
+            expected_output_data
+        )
+        dataset_metadata = DatasetMetadata.model_validate(metadata_data)
+
+        return cls(
+            id=id,
+            input=dataset_input,
+            expected_output=dataset_expected_output,
+            metadata=dataset_metadata,
+        )
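A minimal sketch of turning raw dataset dictionaries into a runnable context; all field values here are illustrative placeholders, and it assumes ResponseCategory is an enum so any member satisfies validation:

    from rasa.builder.copilot.models import ResponseCategory
    from rasa.builder.evaluator.dataset.models import DatasetEntry

    entry = DatasetEntry.from_raw_data(
        id="item-1",
        input_data={"message": "How do I add a flow?"},
        expected_output_data={
            "answer": "Add a flow by ...",
            # Assumption: ResponseCategory is an enum; pick any member.
            "response_category": list(ResponseCategory)[0],
            "references": [],
        },
        metadata_data={},  # both metadata fields have defaults
    )
    context = entry.to_copilot_context()  # feed to run_copilot_with_response_handler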
rasa/builder/evaluator/response_classification/__init__.py
ADDED (empty file)
rasa/builder/evaluator/response_classification/constants.py
ADDED
@@ -0,0 +1,66 @@
+"""Constants for the response classification evaluator."""
+
+from typing import List, Literal
+
+# Options for averaging methods for Response Classification Evaluator
+MICRO_AVERAGING_METHOD: Literal["micro"] = "micro"
+MACRO_AVERAGING_METHOD: Literal["macro"] = "macro"
+WEIGHTED_AVERAGING_METHOD: Literal["weighted"] = "weighted"
+
+AVERAGING_METHODS: List[Literal["micro", "macro", "weighted"]] = [
+    MICRO_AVERAGING_METHOD,
+    MACRO_AVERAGING_METHOD,
+    WEIGHTED_AVERAGING_METHOD,
+]
+
+# Overall evaluation metric names
+MICRO_PRECISION_METRIC = "micro_precision"
+MACRO_PRECISION_METRIC = "macro_precision"
+WEIGHTED_PRECISION_METRIC = "weighted_precision"
+
+MICRO_RECALL_METRIC = "micro_recall"
+MACRO_RECALL_METRIC = "macro_recall"
+WEIGHTED_RECALL_METRIC = "weighted_recall"
+
+MICRO_F1_METRIC = "micro_f1"
+MACRO_F1_METRIC = "macro_f1"
+WEIGHTED_F1_METRIC = "weighted_f1"
+
+# Skip count metric name due to invalid data
+SKIP_COUNT_METRIC = "skipped_items"
+
+# Per-class evaluation metric name templates
+PER_CLASS_PRECISION_METRIC_TEMPLATE = "{category}_precision"
+PER_CLASS_RECALL_METRIC_TEMPLATE = "{category}_recall"
+PER_CLASS_F1_METRIC_TEMPLATE = "{category}_f1"
+PER_CLASS_SUPPORT_METRIC_TEMPLATE = "{category}_support"
+
+# Description templates for evaluation metrics
+MICRO_PRECISION_DESCRIPTION = "Micro Precision: {value:.3f}"
+MACRO_PRECISION_DESCRIPTION = "Macro Precision: {value:.3f}"
+WEIGHTED_PRECISION_DESCRIPTION = "Weighted Precision: {value:.3f}"
+
+MICRO_RECALL_DESCRIPTION = "Micro Recall: {value:.3f}"
+MACRO_RECALL_DESCRIPTION = "Macro Recall: {value:.3f}"
+WEIGHTED_RECALL_DESCRIPTION = "Weighted Recall: {value:.3f}"
+
+MICRO_F1_DESCRIPTION = "Micro F1: {value:.3f}"
+MACRO_F1_DESCRIPTION = "Macro F1: {value:.3f}"
+WEIGHTED_F1_DESCRIPTION = "Weighted F1: {value:.3f}"
+
+# Skip count metric description
+SKIP_COUNT_DESCRIPTION = "Skipped {value} items due to invalid data"
+
+# Per-class description templates
+PER_CLASS_PRECISION_DESCRIPTION = "[{category}] Precision: {value:.3f}"
+PER_CLASS_RECALL_DESCRIPTION = "[{category}] Recall: {value:.3f}"
+PER_CLASS_F1_DESCRIPTION = "[{category}] F1: {value:.3f}"
+PER_CLASS_SUPPORT_DESCRIPTION = "[{category}] Support: {value}"
+
+# Experiment configuration
+EXPERIMENT_NAME = "Copilot Response Classification Evaluation"
+EXPERIMENT_DESCRIPTION = (
+    "Evaluating Copilot response classification performance with per-class metrics "
+    "and overall averages (micro, macro, weighted). The metric that are reported are: "
+    "precision, recall, F1, support."
+)
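The per-class names and descriptions are plain str.format templates, so the evaluator can stamp out one metric per response category. A quick sketch of the intended substitution (the "documentation" category name is a made-up example, not a value from the diff):

    from rasa.builder.evaluator.response_classification.constants import (
        PER_CLASS_F1_DESCRIPTION,
        PER_CLASS_F1_METRIC_TEMPLATE,
    )

    name = PER_CLASS_F1_METRIC_TEMPLATE.format(category="documentation")
    desc = PER_CLASS_F1_DESCRIPTION.format(category="documentation", value=0.875)
    assert name == "documentation_f1"
    assert desc == "[documentation] F1: 0.875"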