rasa-pro 3.15.0a1__py3-none-any.whl → 3.15.0a3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of rasa-pro might be problematic.
- rasa/builder/constants.py +5 -0
- rasa/builder/copilot/models.py +80 -28
- rasa/builder/download.py +110 -0
- rasa/builder/evaluator/__init__.py +0 -0
- rasa/builder/evaluator/constants.py +15 -0
- rasa/builder/evaluator/copilot_executor.py +89 -0
- rasa/builder/evaluator/dataset/models.py +173 -0
- rasa/builder/evaluator/exceptions.py +4 -0
- rasa/builder/evaluator/response_classification/__init__.py +0 -0
- rasa/builder/evaluator/response_classification/constants.py +66 -0
- rasa/builder/evaluator/response_classification/evaluator.py +346 -0
- rasa/builder/evaluator/response_classification/langfuse_runner.py +463 -0
- rasa/builder/evaluator/response_classification/models.py +61 -0
- rasa/builder/evaluator/scripts/__init__.py +0 -0
- rasa/builder/evaluator/scripts/run_response_classification_evaluator.py +152 -0
- rasa/builder/jobs.py +208 -1
- rasa/builder/logging_utils.py +25 -24
- rasa/builder/main.py +6 -1
- rasa/builder/models.py +23 -0
- rasa/builder/project_generator.py +29 -10
- rasa/builder/service.py +104 -22
- rasa/builder/training_service.py +13 -1
- rasa/builder/validation_service.py +2 -1
- rasa/core/actions/action_clean_stack.py +32 -0
- rasa/core/actions/constants.py +4 -0
- rasa/core/actions/custom_action_executor.py +70 -12
- rasa/core/actions/grpc_custom_action_executor.py +41 -2
- rasa/core/actions/http_custom_action_executor.py +49 -25
- rasa/core/channels/voice_stream/voice_channel.py +14 -2
- rasa/dialogue_understanding/generator/llm_based_command_generator.py +6 -3
- rasa/dialogue_understanding/generator/single_step/compact_llm_command_generator.py +15 -7
- rasa/dialogue_understanding/generator/single_step/search_ready_llm_command_generator.py +15 -8
- rasa/dialogue_understanding/processor/command_processor.py +49 -7
- rasa/shared/providers/_configs/azure_openai_client_config.py +4 -5
- rasa/shared/providers/_configs/default_litellm_client_config.py +4 -4
- rasa/shared/providers/_configs/litellm_router_client_config.py +3 -2
- rasa/shared/providers/_configs/openai_client_config.py +5 -7
- rasa/shared/providers/_configs/rasa_llm_client_config.py +4 -4
- rasa/shared/providers/_configs/self_hosted_llm_client_config.py +4 -4
- rasa/shared/providers/llm/_base_litellm_client.py +42 -14
- rasa/shared/providers/llm/litellm_router_llm_client.py +38 -15
- rasa/shared/providers/llm/self_hosted_llm_client.py +34 -32
- rasa/shared/utils/configs.py +5 -8
- rasa/utils/endpoints.py +6 -0
- rasa/version.py +1 -1
- {rasa_pro-3.15.0a1.dist-info → rasa_pro-3.15.0a3.dist-info}/METADATA +12 -12
- {rasa_pro-3.15.0a1.dist-info → rasa_pro-3.15.0a3.dist-info}/RECORD +50 -37
- {rasa_pro-3.15.0a1.dist-info → rasa_pro-3.15.0a3.dist-info}/NOTICE +0 -0
- {rasa_pro-3.15.0a1.dist-info → rasa_pro-3.15.0a3.dist-info}/WHEEL +0 -0
- {rasa_pro-3.15.0a1.dist-info → rasa_pro-3.15.0a3.dist-info}/entry_points.txt +0 -0
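For readers who want to reproduce this file-level comparison locally, a minimal sketch follows. It is not part of the diff; it assumes both wheels have already been downloaded from the registry, and the local filenames are inferred from the dist-info names above, so they may differ.

import zipfile

# Wheel archives are plain zip files, so comparing their file listings is enough
# to recover the "added / removed" part of the table above.
old = set(zipfile.ZipFile("rasa_pro-3.15.0a1-py3-none-any.whl").namelist())
new = set(zipfile.ZipFile("rasa_pro-3.15.0a3-py3-none-any.whl").namelist())

print("added:", sorted(new - old))
print("removed:", sorted(old - new))
print("common files:", len(old & new))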
rasa/builder/evaluator/response_classification/langfuse_runner.py (new file)
@@ -0,0 +1,463 @@
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

import langfuse
import structlog
import yaml  # type: ignore[import-untyped]
from langfuse import Evaluation
from langfuse._client.datasets import DatasetClient
from langfuse.experiment import (
    ExperimentItem,
    ExperimentItemResult,
    ExperimentResult,
)

from rasa.builder.copilot.models import (
    ResponseCategory,
)
from rasa.builder.evaluator.constants import (
    DEFAULT_RESPONSE_CLASSIFICATION_EVALUATION_TEXT_OUTPUT_FILENAME,
    RESPONSE_CLASSIFICATION_EVALUATION_RESULTS_DIR,
    RESPONSE_CLASSIFICATION_EVALUATION_YAML_OUTPUT_FILENAME,
)
from rasa.builder.evaluator.copilot_executor import (
    CopilotRunResult,
    run_copilot_with_response_handler,
)
from rasa.builder.evaluator.dataset.models import DatasetEntry
from rasa.builder.evaluator.response_classification.constants import (
    EXPERIMENT_DESCRIPTION,
    EXPERIMENT_NAME,
    MACRO_F1_DESCRIPTION,
    MACRO_F1_METRIC,
    MACRO_PRECISION_DESCRIPTION,
    MACRO_PRECISION_METRIC,
    MACRO_RECALL_DESCRIPTION,
    MACRO_RECALL_METRIC,
    MICRO_F1_DESCRIPTION,
    MICRO_F1_METRIC,
    MICRO_PRECISION_DESCRIPTION,
    MICRO_PRECISION_METRIC,
    MICRO_RECALL_DESCRIPTION,
    MICRO_RECALL_METRIC,
    PER_CLASS_F1_DESCRIPTION,
    PER_CLASS_F1_METRIC_TEMPLATE,
    PER_CLASS_PRECISION_DESCRIPTION,
    PER_CLASS_PRECISION_METRIC_TEMPLATE,
    PER_CLASS_RECALL_DESCRIPTION,
    PER_CLASS_RECALL_METRIC_TEMPLATE,
    PER_CLASS_SUPPORT_DESCRIPTION,
    PER_CLASS_SUPPORT_METRIC_TEMPLATE,
    SKIP_COUNT_DESCRIPTION,
    SKIP_COUNT_METRIC,
    WEIGHTED_F1_DESCRIPTION,
    WEIGHTED_F1_METRIC,
    WEIGHTED_PRECISION_DESCRIPTION,
    WEIGHTED_PRECISION_METRIC,
    WEIGHTED_RECALL_DESCRIPTION,
    WEIGHTED_RECALL_METRIC,
)
from rasa.builder.evaluator.response_classification.evaluator import (
    ResponseClassificationEvaluator,
)
from rasa.builder.evaluator.response_classification.models import (
    ClassificationResult,
    MetricsSummary,
)

structlogger = structlog.get_logger()


class ResponseClassificationLangfuseRunner:
    """Main class for running Langfuse evaluations on the classification evaluator."""

    def __init__(self, dataset_name: str, output_dir: Optional[str] = None):
        self._langfuse = langfuse.get_client()
        self._dataset = self._retrieve_dataset(dataset_name)
        self._output_dir = (
            Path(output_dir)
            if output_dir
            else RESPONSE_CLASSIFICATION_EVALUATION_RESULTS_DIR
        )

    def _retrieve_dataset(self, dataset_name: str) -> DatasetClient:
        """Get the dataset."""
        try:
            return self._langfuse.get_dataset(dataset_name)
        except Exception as e:
            structlogger.error(
                "langfuse_runner.init.dataset_not_found",
                event_info=f"Failed to get dataset '{dataset_name}'",
                dataset_name=dataset_name,
                error=str(e),
            )
            raise

    def run_experiment(self) -> ExperimentResult:
        """Run the experiment."""
        result = self._dataset.run_experiment(
            name=EXPERIMENT_NAME,
            description=EXPERIMENT_DESCRIPTION,
            task=self._run_copilot_task,
            run_evaluators=[self._run_classification_metrics_evaluator],
        )
        self._report_run_results_to_txt_file(result)
        self._langfuse.flush()
        return result

    async def _run_copilot_task(
        self,
        *,
        item: ExperimentItem,
        **kwargs: Dict[str, Any],
    ) -> Optional[CopilotRunResult]:
        """Copilot task function that processes each dataset item.

        Follows the langfuse.experiment.TaskFunction protocol. The function mimics the
        functionality of the `/copilot` endpoint.

        Args:
            item: The dataset item to process.
            kwargs: Additional keyword arguments.

        Returns:
            The result of the Copilot run, or None if the item could not be processed.
        """
        # Try to create the copilot context used for generating the response from the
        # dataset item. If the context cannot be created, skip the evaluation by
        # returning None.
        try:
            dataset_entry = DatasetEntry.from_raw_data(
                id=item.id,  # type: ignore[union-attr]
                input_data=item.input,  # type: ignore[union-attr]
                expected_output_data=item.expected_output,  # type: ignore[union-attr]
                metadata_data=item.metadata,  # type: ignore[union-attr]
            )
            context = dataset_entry.to_copilot_context()
        except Exception as e:
            structlogger.error(
                "langfuse_runner._task_function_run_copilot.context_creation_failed",
                event_info=(
                    f"Failed to create CopilotContext from dataset item with id: "
                    f"{item.id}. The Copilot cannot be run without a valid "  # type: ignore[union-attr]
                    f"CopilotContext. Skipping evaluation."
                ),
                item_id=item.id,  # type: ignore[union-attr]
                item_input=item.input,  # type: ignore[union-attr]
                item_expected_output=item.expected_output,  # type: ignore[union-attr]
                error=str(e),
            )
            return None

        # Run the evaluation. If the task fails, skip the evaluation by returning None.
        try:
            return await run_copilot_with_response_handler(context)
        except Exception as e:
            structlogger.error(
                "langfuse_runner._task_function_run_copilot.copilot_run_failed",
                event_info=(
                    f"Failed to run the copilot with response handler for dataset item "
                    f"with id: {item.id}. Skipping evaluation."  # type: ignore[union-attr]
                ),
                item_id=item.id,  # type: ignore[union-attr]
                item_input=item.input,  # type: ignore[union-attr]
                item_expected_output=item.expected_output,  # type: ignore[union-attr]
                error=str(e),
            )
            return None

    def _run_classification_metrics_evaluator(
        self, *, item_results: List[ExperimentItemResult], **kwargs: Dict[str, Any]
    ) -> List[Evaluation]:
        """Main evaluator function that calculates classification metrics.

        This function follows the langfuse.experiment.RunEvaluatorFunction protocol.
        It will be called after the langfuse.experiment.TaskFunction has been called
        for each item in the dataset.

        Args:
            item_results: The item results to evaluate.
            kwargs: Additional keyword arguments.

        Returns:
            A list of Langfuse Evaluation objects.
        """
        # Create a list of ClassificationResult objects from item_results
        classification_results, skip_count = (
            self._create_classification_results_from_dataset_items(item_results)
        )

        # Log a warning if any items were skipped due to invalid data, as this will
        # affect the overall metrics.
        if skip_count > 0:
            structlogger.warning(
                "langfuse_runner._run_classification_metrics_evaluator.skipped_items",
                event_info=(
                    f"Skipped {skip_count} items due to invalid data. This will affect "
                    f"the overall metrics."
                ),
                skipped_count=skip_count,
                total_items=len(item_results),
            )

        # Run the response classification evaluator on the classification results and
        # get the metrics summary
        evaluator = ResponseClassificationEvaluator()  # type: ignore[no-untyped-call]
        metrics_summary = evaluator.evaluate(classification_results)

        # Record the metrics in Langfuse.
        evaluations = self._create_langfuse_evaluation_objects(
            metrics_summary, skip_count
        )
        self._report_yaml_structured_results(evaluations)
        return evaluations

    def _create_classification_results_from_dataset_items(
        self, item_results: List[ExperimentItemResult]
    ) -> Tuple[List[ClassificationResult], int]:
        """Create a list of ClassificationResult objects from item results.

        Args:
            item_results: The item results to create ClassificationResult objects from.

        Returns:
            A tuple containing the list of ClassificationResult objects and the number
            of items that were skipped due to missing predicted or expected categories.
        """
        classification_results: List[ClassificationResult] = []
        skip_count = 0

        # Try to create a ClassificationResult from each item result. If either the
        # predicted or expected category is missing, skip the item.
        for item_result in item_results:
            # If the output is None, the task function resulted in an error, skip the
            # item.
            if (
                item_result.output is None
                or not isinstance(item_result.output, CopilotRunResult)
                or item_result.output.response_category is None
                or item_result.item.expected_output is None  # type: ignore[union-attr]
                or not isinstance(item_result.item.expected_output, dict)  # type: ignore[union-attr]
                or item_result.item.expected_output.get("response_category") is None  # type: ignore[union-attr]
            ):
                structlogger.error(
                    "langfuse_runner._create_classification_results_from_dataset_items"
                    ".invalid_item_result",
                    event_info=(
                        f"Cannot create a ClassificationResult from item result with "
                        f"id: {item_result.item.id}. This item will not be used for "  # type: ignore[union-attr]
                        f"evaluation."
                    ),
                    item_id=item_result.item.id,  # type: ignore[union-attr]
                    item_output=item_result.output,
                    item_expected_output=item_result.item.expected_output,  # type: ignore[union-attr]
                )
                skip_count += 1
                continue

            predicted_category = item_result.output.response_category.value
            expected_category = item_result.item.expected_output["response_category"]  # type: ignore[union-attr]
            classification_result = ClassificationResult(
                prediction=ResponseCategory(predicted_category),
                expected=ResponseCategory(expected_category),
            )
            classification_results.append(classification_result)

        return classification_results, skip_count

    def _create_langfuse_evaluation_objects(
        self, metrics_summary: MetricsSummary, skip_count: int
    ) -> List[Evaluation]:
        """Create Langfuse Evaluation objects from metrics summary."""
        evaluations: List[Evaluation] = []

        # Overall metrics
        evaluations.extend(
            [
                Evaluation(
                    name=MICRO_PRECISION_METRIC,
                    value=metrics_summary.overall.micro_precision,
                    comment=MICRO_PRECISION_DESCRIPTION.format(
                        value=metrics_summary.overall.micro_precision
                    ),
                ),
                Evaluation(
                    name=MACRO_PRECISION_METRIC,
                    value=metrics_summary.overall.macro_precision,
                    comment=MACRO_PRECISION_DESCRIPTION.format(
                        value=metrics_summary.overall.macro_precision
                    ),
                ),
                Evaluation(
                    name=WEIGHTED_PRECISION_METRIC,
                    value=metrics_summary.overall.weighted_avg_precision,
                    comment=WEIGHTED_PRECISION_DESCRIPTION.format(
                        value=metrics_summary.overall.weighted_avg_precision
                    ),
                ),
                Evaluation(
                    name=MICRO_RECALL_METRIC,
                    value=metrics_summary.overall.micro_recall,
                    comment=MICRO_RECALL_DESCRIPTION.format(
                        value=metrics_summary.overall.micro_recall
                    ),
                ),
                Evaluation(
                    name=MACRO_RECALL_METRIC,
                    value=metrics_summary.overall.macro_recall,
                    comment=MACRO_RECALL_DESCRIPTION.format(
                        value=metrics_summary.overall.macro_recall
                    ),
                ),
                Evaluation(
                    name=WEIGHTED_RECALL_METRIC,
                    value=metrics_summary.overall.weighted_avg_recall,
                    comment=WEIGHTED_RECALL_DESCRIPTION.format(
                        value=metrics_summary.overall.weighted_avg_recall
                    ),
                ),
                Evaluation(
                    name=MICRO_F1_METRIC,
                    value=metrics_summary.overall.micro_f1,
                    comment=MICRO_F1_DESCRIPTION.format(
                        value=metrics_summary.overall.micro_f1
                    ),
                ),
                Evaluation(
                    name=MACRO_F1_METRIC,
                    value=metrics_summary.overall.macro_f1,
                    comment=MACRO_F1_DESCRIPTION.format(
                        value=metrics_summary.overall.macro_f1
                    ),
                ),
                Evaluation(
                    name=WEIGHTED_F1_METRIC,
                    value=metrics_summary.overall.weighted_avg_f1,
                    comment=WEIGHTED_F1_DESCRIPTION.format(
                        value=metrics_summary.overall.weighted_avg_f1
                    ),
                ),
            ]
        )

        # Per-class metrics
        for category, per_class_metrics in metrics_summary.per_class.items():
            category_name = category.value.lower()
            evaluations.extend(
                [
                    Evaluation(
                        name=PER_CLASS_PRECISION_METRIC_TEMPLATE.format(
                            category=category_name
                        ),
                        value=per_class_metrics.precision,
                        comment=PER_CLASS_PRECISION_DESCRIPTION.format(
                            category=category.value,
                            value=per_class_metrics.precision,
                        ),
                    ),
                    Evaluation(
                        name=PER_CLASS_RECALL_METRIC_TEMPLATE.format(
                            category=category_name
                        ),
                        value=per_class_metrics.recall,
                        comment=PER_CLASS_RECALL_DESCRIPTION.format(
                            category=category.value,
                            value=per_class_metrics.recall,
                        ),
                    ),
                    Evaluation(
                        name=PER_CLASS_F1_METRIC_TEMPLATE.format(
                            category=category_name
                        ),
                        value=per_class_metrics.f1,
                        comment=PER_CLASS_F1_DESCRIPTION.format(
                            category=category.value,
                            value=per_class_metrics.f1,
                        ),
                    ),
                    Evaluation(
                        name=PER_CLASS_SUPPORT_METRIC_TEMPLATE.format(
                            category=category_name
                        ),
                        value=float(per_class_metrics.support),
                        comment=PER_CLASS_SUPPORT_DESCRIPTION.format(
                            category=category.value,
                            value=per_class_metrics.support,
                        ),
                    ),
                ]
            )

        # Record the number of items that were skipped due to invalid data
        evaluations.append(
            Evaluation(
                name=SKIP_COUNT_METRIC,
                value=skip_count,
                comment=SKIP_COUNT_DESCRIPTION.format(value=skip_count),
            )
        )

        return evaluations

    def _report_run_results_to_txt_file(self, result: ExperimentResult) -> None:
        result_str = result.format().replace("\\n", "\n")
        self._output_dir.mkdir(parents=True, exist_ok=True)

        # Add timestamp prefix to filename
        current_date = datetime.now().strftime("%Y%m%d_%H%M%S")
        timestamped_filename = (
            f"{current_date}_"
            f"{DEFAULT_RESPONSE_CLASSIFICATION_EVALUATION_TEXT_OUTPUT_FILENAME}"
        )
        output_path = self._output_dir / timestamped_filename

        with open(str(output_path), "w") as f:
            f.write(result_str)
        structlogger.info(
            "langfuse_runner._report_run_results.exported",
            event_info="Evaluation results exported to text file",
            text_file=output_path,
        )

    def _report_yaml_structured_results(self, evaluations: list[Evaluation]) -> None:
        """Export evaluation results to a YAML file with structured data."""
        # Ensure results directory exists
        self._output_dir.mkdir(parents=True, exist_ok=True)

        # Add timestamp prefix to filename
        current_date = datetime.now().strftime("%Y%m%d_%H%M%S")
        timestamped_filename = (
            f"{current_date}_"
            f"{RESPONSE_CLASSIFICATION_EVALUATION_YAML_OUTPUT_FILENAME}"
        )
        output_path = self._output_dir / timestamped_filename
        # Convert evaluations to structured data
        structured_data: Dict[str, Any] = {
            "experiment": {
                "name": EXPERIMENT_NAME,
                "description": EXPERIMENT_DESCRIPTION,
                "timestamp": datetime.now().isoformat(),
            },
            "metrics": [],
        }

        # Add each evaluation as a metric
        for evaluation in evaluations:
            metric_data: Dict[str, Any] = {
                "name": evaluation.name,
                "value": evaluation.value,
                "description": evaluation.comment,
            }
            structured_data["metrics"].append(metric_data)

        # Write to YAML file
        with open(str(output_path), "w") as f:
            yaml.dump(structured_data, f, default_flow_style=False, sort_keys=False)

        structlogger.info(
            "langfuse_runner._report_yaml_structured_results.exported",
            event_info="Evaluation results exported to YAML file",
            yaml_file=output_path,
            metrics_count=len(evaluations),
        )
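As a quick orientation for the new runner above, here is a minimal usage sketch. It is not part of the wheel; it assumes the required Langfuse (and model provider) credentials are exported, and the dataset name and output directory are illustrative.

# Illustrative sketch only; not part of the package. Assumes the LANGFUSE_* and other
# required environment variables are set and that the named dataset exists in Langfuse.
from rasa.builder.evaluator.response_classification.langfuse_runner import (
    ResponseClassificationLangfuseRunner,
)

runner = ResponseClassificationLangfuseRunner(
    dataset_name="copilot-responses",   # hypothetical dataset name
    output_dir="evaluation_results",    # optional; falls back to the package default
)
result = runner.run_experiment()        # runs the task + evaluator, writes the txt/yaml reports
print(result.format())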
rasa/builder/evaluator/response_classification/models.py (new file)
@@ -0,0 +1,61 @@
from typing import Dict

from pydantic import BaseModel, Field

from rasa.builder.copilot.models import ResponseCategory


class ClassificationResult(BaseModel):
    prediction: ResponseCategory
    expected: ResponseCategory


class PerClassMetrics(BaseModel):
    """Metrics for a single response category."""

    precision: float = Field(ge=0.0, le=1.0, description="Precision score")
    recall: float = Field(ge=0.0, le=1.0, description="Recall score")
    f1: float = Field(ge=0.0, le=1.0, description="F1 score")

    support: int = Field(ge=0, description="Number of actual occurrences.")

    true_positives: int = Field(ge=0, description="Number of true positives.")
    false_positives: int = Field(ge=0, description="Number of false positives.")
    false_negatives: int = Field(ge=0, description="Number of false negatives.")


class OverallClassificationMetrics(BaseModel):
    """Overall evaluation metrics."""

    micro_precision: float = Field(
        ge=0.0, le=1.0, description="Micro-averaged Precision"
    )
    macro_precision: float = Field(
        ge=0.0, le=1.0, description="Macro-averaged Precision"
    )
    weighted_avg_precision: float = Field(
        ge=0.0, le=1.0, description="Weighted Precision"
    )

    micro_recall: float = Field(ge=0.0, le=1.0, description="Micro-averaged Recall")
    macro_recall: float = Field(ge=0.0, le=1.0, description="Macro-averaged Recall")
    weighted_avg_recall: float = Field(ge=0.0, le=1.0, description="Weighted Recall")

    micro_f1: float = Field(ge=0.0, le=1.0, description="Micro-averaged F1 score")
    macro_f1: float = Field(ge=0.0, le=1.0, description="Macro-averaged F1 score")
    weighted_avg_f1: float = Field(ge=0.0, le=1.0, description="Weighted F1 score")

    support: int = Field(ge=0, description="Total number of occurrences.")

    true_positives: int = Field(ge=0, description="Total number of true positives.")
    false_positives: int = Field(ge=0, description="Total number of false positives.")
    false_negatives: int = Field(ge=0, description="Total number of false negatives.")


class MetricsSummary(BaseModel):
    """Complete metrics summary with per-class and overall metrics."""

    per_class: Dict[ResponseCategory, PerClassMetrics] = Field(
        description="Per-class metrics"
    )
    overall: OverallClassificationMetrics = Field(description="Overall metrics")
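The models above hold per-class and aggregated precision/recall/F1 values. A small self-contained sketch of how such numbers are derived from (prediction, expected) pairs follows; Category is a hypothetical stand-in for ResponseCategory, whose members are not visible in this diff, and the arithmetic follows the standard metric definitions rather than the package's own ResponseClassificationEvaluator.

# Self-contained illustration of the quantities these models hold. "Category" is a
# hypothetical stand-in for rasa.builder.copilot.models.ResponseCategory.
from collections import Counter
from enum import Enum


class Category(str, Enum):
    HELPFUL = "helpful"
    OFF_TOPIC = "off_topic"


pairs = [  # (prediction, expected)
    (Category.HELPFUL, Category.HELPFUL),
    (Category.HELPFUL, Category.OFF_TOPIC),
    (Category.OFF_TOPIC, Category.OFF_TOPIC),
]

# Count true positives, false positives, and false negatives per category.
tp, fp, fn = Counter(), Counter(), Counter()
for predicted, expected in pairs:
    if predicted == expected:
        tp[expected] += 1
    else:
        fp[predicted] += 1
        fn[expected] += 1


def safe_div(numerator: float, denominator: float) -> float:
    return numerator / denominator if denominator else 0.0


# Per-class values (cf. PerClassMetrics) ...
precision = {c: safe_div(tp[c], tp[c] + fp[c]) for c in Category}
recall = {c: safe_div(tp[c], tp[c] + fn[c]) for c in Category}

# ... and two of the aggregates (cf. OverallClassificationMetrics): macro averages
# treat every class equally, micro averages pool the counts across classes.
macro_precision = sum(precision.values()) / len(Category)
micro_precision = safe_div(sum(tp.values()), sum(tp.values()) + sum(fp.values()))
print(macro_precision, micro_precision)  # 0.75 vs. 0.666...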
rasa/builder/evaluator/scripts/__init__.py (file without changes)
rasa/builder/evaluator/scripts/run_response_classification_evaluator.py (new file)
@@ -0,0 +1,152 @@
#!/usr/bin/env python3
"""Response Classification Evaluator CLI.

A command-line tool for running response classification evaluation experiments using
Langfuse.

This script runs experiments on datasets and provides links to the results.
"""

import argparse
import asyncio
import os
import sys
from typing import Optional

import structlog
from langfuse.experiment import ExperimentResult

from rasa.builder.evaluator.constants import (
    DEFAULT_RESPONSE_CLASSIFICATION_EVALUATION_TEXT_OUTPUT_FILENAME,
    RESPONSE_CLASSIFICATION_EVALUATION_YAML_OUTPUT_FILENAME,
)
from rasa.builder.evaluator.response_classification.langfuse_runner import (
    ResponseClassificationLangfuseRunner,
)

# Configure structured logging
structlogger = structlog.get_logger()


def validate_environment() -> None:
    """Validate that all required environment variables are set."""
    required_vars = [
        "OPENAI_API_KEY",
        "INKEEP_API_KEY",
        "LANGFUSE_HOST",
        "LANGFUSE_PUBLIC_KEY",
        "LANGFUSE_SECRET_KEY",
    ]

    missing_vars = []
    for var in required_vars:
        if not os.getenv(var):
            missing_vars.append(var)

    if missing_vars:
        structlogger.error(
            "main.validate_environment.missing_variables",
            event_info=(
                "Missing required environment variables. Please set the following "
                f"environment variables: {missing_vars}."
            ),
            missing_variables=missing_vars,
        )
        sys.exit(1)


async def run_experiment(
    dataset_name: str, output_file: Optional[str] = None
) -> Optional[ExperimentResult]:
    """Run the response classification evaluation experiment."""
    try:
        structlogger.info(
            "main.run_experiment.starting",
            event_info="Starting response classification evaluation experiment",
            dataset_name=dataset_name,
        )

        # Initialize and run the experiment
        runner = ResponseClassificationLangfuseRunner(
            dataset_name=dataset_name, output_dir=output_file
        )
        result = runner.run_experiment()

        structlogger.info(
            "main.run_experiment.completed",
            event_info=(
                "Response classification evaluation experiment completed successfully",
            ),
            dataset_name=dataset_name,
        )

        structlogger.info("✅ Experiment completed successfully!")

        return result

    except Exception as e:
        structlogger.error(
            "main.run_experiment.failed",
            event_info="Response classification evaluation experiment failed",
            error=str(e),
            dataset_name=dataset_name,
        )
        structlogger.error(f"❌ Error running experiment: {e}")
        return None


def main() -> int:
    """Main entry point for the CLI."""
    parser = argparse.ArgumentParser(
        description="Run response classification evaluation experiments using Langfuse",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="Examples:\npython run_response_classification_evaluator.py my_dataset",
    )

    parser.add_argument(
        "--dataset-name",
        help="Name of the dataset to evaluate",
        required=True,
    )
    parser.add_argument(
        "--output-file",
        help=(
            "(Optional) Directory to write experiment results. Two files are created "
            "with a timestamp prefix (YYYYMMDD_HHMMSS_):\n"
            f"- {DEFAULT_RESPONSE_CLASSIFICATION_EVALUATION_TEXT_OUTPUT_FILENAME} "
            f"from Langfuse, and"
            f"- {RESPONSE_CLASSIFICATION_EVALUATION_YAML_OUTPUT_FILENAME} "
            "from the classifier."
        ),
    )

    args = parser.parse_args()

    # Validate environment variables
    validate_environment()

    structlogger.info(f"🔍 Dataset: {args.dataset_name}")
    structlogger.info("🚀 Starting evaluation...")

    # Run the experiment
    result = asyncio.run(run_experiment(args.dataset_name, args.output_file))

    if result is None:
        sys.exit(1)

    # Get experiment link
    structlogger.info(
        "✨ Evaluation complete!",
        dataset_run_id=result.dataset_run_id,
        dataset_run_url=result.dataset_run_url,
    )

    # Print formatted results:
    result_str = result.format().replace("\\n", "\n")
    structlogger.info(result_str)

    return 0


if __name__ == "__main__":
    main()
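The same evaluation can also be driven programmatically by reusing the coroutine the CLI wraps; a minimal sketch, assuming the required environment variables are set and that the dataset name and output directory (both illustrative) exist:

# Illustrative only: calls the script's own helpers instead of the argparse front end.
import asyncio

from rasa.builder.evaluator.scripts.run_response_classification_evaluator import (
    run_experiment,
    validate_environment,
)

validate_environment()  # exits if the OPENAI/INKEEP/LANGFUSE variables are missing
result = asyncio.run(run_experiment("copilot-responses", output_file="results"))
if result is not None:
    print(result.dataset_run_url)  # link to the run in the Langfuse UI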