rasa-pro 3.15.0a1__py3-none-any.whl → 3.15.0.dev20251027__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

@@ -0,0 +1,463 @@
+ from datetime import datetime
+ from pathlib import Path
+ from typing import Any, Dict, List, Optional, Tuple
+
+ import langfuse
+ import structlog
+ import yaml  # type: ignore[import-untyped]
+ from langfuse import Evaluation
+ from langfuse._client.datasets import DatasetClient
+ from langfuse.experiment import (
+     ExperimentItem,
+     ExperimentItemResult,
+     ExperimentResult,
+ )
+
+ from rasa.builder.copilot.models import (
+     ResponseCategory,
+ )
+ from rasa.builder.evaluator.constants import (
+     DEFAULT_RESPONSE_CLASSIFICATION_EVALUATION_TEXT_OUTPUT_FILENAME,
+     RESPONSE_CLASSIFICATION_EVALUATION_RESULTS_DIR,
+     RESPONSE_CLASSIFICATION_EVALUATION_YAML_OUTPUT_FILENAME,
+ )
+ from rasa.builder.evaluator.copilot_executor import (
+     CopilotRunResult,
+     run_copilot_with_response_handler,
+ )
+ from rasa.builder.evaluator.dataset.models import DatasetEntry
+ from rasa.builder.evaluator.response_classification.constants import (
+     EXPERIMENT_DESCRIPTION,
+     EXPERIMENT_NAME,
+     MACRO_F1_DESCRIPTION,
+     MACRO_F1_METRIC,
+     MACRO_PRECISION_DESCRIPTION,
+     MACRO_PRECISION_METRIC,
+     MACRO_RECALL_DESCRIPTION,
+     MACRO_RECALL_METRIC,
+     MICRO_F1_DESCRIPTION,
+     MICRO_F1_METRIC,
+     MICRO_PRECISION_DESCRIPTION,
+     MICRO_PRECISION_METRIC,
+     MICRO_RECALL_DESCRIPTION,
+     MICRO_RECALL_METRIC,
+     PER_CLASS_F1_DESCRIPTION,
+     PER_CLASS_F1_METRIC_TEMPLATE,
+     PER_CLASS_PRECISION_DESCRIPTION,
+     PER_CLASS_PRECISION_METRIC_TEMPLATE,
+     PER_CLASS_RECALL_DESCRIPTION,
+     PER_CLASS_RECALL_METRIC_TEMPLATE,
+     PER_CLASS_SUPPORT_DESCRIPTION,
+     PER_CLASS_SUPPORT_METRIC_TEMPLATE,
+     SKIP_COUNT_DESCRIPTION,
+     SKIP_COUNT_METRIC,
+     WEIGHTED_F1_DESCRIPTION,
+     WEIGHTED_F1_METRIC,
+     WEIGHTED_PRECISION_DESCRIPTION,
+     WEIGHTED_PRECISION_METRIC,
+     WEIGHTED_RECALL_DESCRIPTION,
+     WEIGHTED_RECALL_METRIC,
+ )
+ from rasa.builder.evaluator.response_classification.evaluator import (
+     ResponseClassificationEvaluator,
+ )
+ from rasa.builder.evaluator.response_classification.models import (
+     ClassificationResult,
+     MetricsSummary,
+ )
+
+ structlogger = structlog.get_logger()
+
+
+ class ResponseClassificationLangfuseRunner:
+     """Runs response classification evaluation experiments against a Langfuse dataset."""
+
+     def __init__(self, dataset_name: str, output_dir: Optional[str] = None):
+         self._langfuse = langfuse.get_client()
+         self._dataset = self._retrieve_dataset(dataset_name)
+         self._output_dir = (
+             Path(output_dir)
+             if output_dir
+             else RESPONSE_CLASSIFICATION_EVALUATION_RESULTS_DIR
+         )
+
+     def _retrieve_dataset(self, dataset_name: str) -> DatasetClient:
+         """Get the dataset."""
+         try:
+             return self._langfuse.get_dataset(dataset_name)
+         except Exception as e:
+             structlogger.error(
+                 "langfuse_runner.init.dataset_not_found",
+                 event_info=f"Failed to get dataset '{dataset_name}'",
+                 dataset_name=dataset_name,
+                 error=str(e),
+             )
+             raise
+
+     def run_experiment(self) -> ExperimentResult:
+         """Run the experiment."""
+         result = self._dataset.run_experiment(
+             name=EXPERIMENT_NAME,
+             description=EXPERIMENT_DESCRIPTION,
+             task=self._run_copilot_task,
+             run_evaluators=[self._run_classification_metrics_evaluator],
+         )
+         self._report_run_results_to_txt_file(result)
+         self._langfuse.flush()
+         return result
+
+     async def _run_copilot_task(
+         self,
+         *,
+         item: ExperimentItem,
+         **kwargs: Dict[str, Any],
+     ) -> Optional[CopilotRunResult]:
+         """Copilot task function that processes each dataset item.
+
+         Follows the langfuse.experiment.TaskFunction protocol. The function mimics
+         the functionality of the `/copilot` endpoint.
+
+         Args:
+             item: The dataset item to process.
+             kwargs: Additional keyword arguments.
+
+         Returns:
+             The copilot run result, or None if the item could not be processed.
+         """
+         # Try to create the copilot context used for generating the response from
+         # the dataset item. If the context cannot be created, skip the evaluation
+         # by returning None.
+         try:
+             dataset_entry = DatasetEntry.from_raw_data(
+                 id=item.id,  # type: ignore[union-attr]
+                 input_data=item.input,  # type: ignore[union-attr]
+                 expected_output_data=item.expected_output,  # type: ignore[union-attr]
+                 metadata_data=item.metadata,  # type: ignore[union-attr]
+             )
+             context = dataset_entry.to_copilot_context()
+         except Exception as e:
+             structlogger.error(
+                 "langfuse_runner._task_function_run_copilot.context_creation_failed",
+                 event_info=(
+                     f"Failed to create CopilotContext from dataset item with id: "
+                     f"{item.id}. The Copilot cannot be run without a valid "  # type: ignore[union-attr]
+                     f"CopilotContext. Skipping evaluation."
+                 ),
+                 item_id=item.id,  # type: ignore[union-attr]
+                 item_input=item.input,  # type: ignore[union-attr]
+                 item_expected_output=item.expected_output,  # type: ignore[union-attr]
+                 error=str(e),
+             )
+             return None
+
+         # Run the copilot. If the task fails, skip the evaluation by returning None.
+         try:
+             return await run_copilot_with_response_handler(context)
+         except Exception as e:
+             structlogger.error(
+                 "langfuse_runner._task_function_run_copilot.copilot_run_failed",
+                 event_info=(
+                     f"Failed to run the copilot with response handler for dataset "
+                     f"item with id: {item.id}. Skipping evaluation."  # type: ignore[union-attr]
+                 ),
+                 item_id=item.id,  # type: ignore[union-attr]
+                 item_input=item.input,  # type: ignore[union-attr]
+                 item_expected_output=item.expected_output,  # type: ignore[union-attr]
+                 error=str(e),
+             )
+             return None
+
+     def _run_classification_metrics_evaluator(
+         self, *, item_results: List[ExperimentItemResult], **kwargs: Dict[str, Any]
+     ) -> List[Evaluation]:
+         """Main evaluator function that calculates classification metrics.
+
+         This function follows the langfuse.experiment.RunEvaluatorFunction protocol.
+         It is called after the langfuse.experiment.TaskFunction has been called for
+         each item in the dataset.
+
+         Args:
+             item_results: The item results to evaluate.
+             kwargs: Additional keyword arguments.
+
+         Returns:
+             A list of Langfuse Evaluation objects.
+         """
+         # Create a list of ClassificationResult objects from item_results
+         classification_results, skip_count = (
+             self._create_classification_results_from_dataset_items(item_results)
+         )
+
+         # Log a warning if any items were skipped due to invalid data, as this will
+         # affect the overall metrics.
+         if skip_count > 0:
+             structlogger.warning(
+                 "langfuse_runner._run_classification_metrics_evaluator.skipped_items",
+                 event_info=(
+                     f"Skipped {skip_count} items due to invalid data. This will "
+                     f"affect the overall metrics."
+                 ),
+                 skipped_count=skip_count,
+                 total_items=len(item_results),
+             )
+
+         # Run the response classification evaluator on the classification results
+         # and get the metrics summary
+         evaluator = ResponseClassificationEvaluator()  # type: ignore[no-untyped-call]
+         metrics_summary = evaluator.evaluate(classification_results)
+
+         # Record the metrics in Langfuse.
+         evaluations = self._create_langfuse_evaluation_objects(
+             metrics_summary, skip_count
+         )
+         self._report_yaml_structured_results(evaluations)
+         return evaluations
+
+     def _create_classification_results_from_dataset_items(
+         self, item_results: List[ExperimentItemResult]
+     ) -> Tuple[List[ClassificationResult], int]:
+         """Create a list of ClassificationResult objects from item results.
+
+         Args:
+             item_results: The item results to create ClassificationResult objects
+                 from.
+
+         Returns:
+             A tuple containing the list of ClassificationResult objects and the
+             number of items that were skipped due to missing predicted or expected
+             categories.
+         """
+         classification_results: List[ClassificationResult] = []
+         skip_count = 0
+
+         # Try to create a ClassificationResult from each item result. If either the
+         # predicted or the expected category is missing, skip the item.
+         for item_result in item_results:
+             # If the output is None, the task function resulted in an error; skip
+             # the item.
+             if (
+                 item_result.output is None
+                 or not isinstance(item_result.output, CopilotRunResult)
+                 or item_result.output.response_category is None
+                 or item_result.item.expected_output is None  # type: ignore[union-attr]
+                 or not isinstance(item_result.item.expected_output, dict)  # type: ignore[union-attr]
+                 or item_result.item.expected_output.get("response_category") is None  # type: ignore[union-attr]
+             ):
+                 structlogger.error(
+                     "langfuse_runner._create_classification_results_from_dataset_items"
+                     ".invalid_item_result",
+                     event_info=(
+                         f"Cannot create a ClassificationResult from item result with "
+                         f"id: {item_result.item.id}. This item will not be used for "  # type: ignore[union-attr]
+                         f"evaluation."
+                     ),
+                     item_id=item_result.item.id,  # type: ignore[union-attr]
+                     item_output=item_result.output,
+                     item_expected_output=item_result.item.expected_output,  # type: ignore[union-attr]
+                 )
+                 skip_count += 1
+                 continue
+
+             predicted_category = item_result.output.response_category.value
+             expected_category = item_result.item.expected_output["response_category"]  # type: ignore[union-attr]
+             classification_result = ClassificationResult(
+                 prediction=ResponseCategory(predicted_category),
+                 expected=ResponseCategory(expected_category),
+             )
+             classification_results.append(classification_result)
+
+         return classification_results, skip_count
+
+     def _create_langfuse_evaluation_objects(
+         self, metrics_summary: MetricsSummary, skip_count: int
+     ) -> List[Evaluation]:
+         """Create Langfuse Evaluation objects from the metrics summary."""
+         evaluations: List[Evaluation] = []
+
+         # Overall metrics
+         evaluations.extend(
+             [
+                 Evaluation(
+                     name=MICRO_PRECISION_METRIC,
+                     value=metrics_summary.overall.micro_precision,
+                     comment=MICRO_PRECISION_DESCRIPTION.format(
+                         value=metrics_summary.overall.micro_precision
+                     ),
+                 ),
+                 Evaluation(
+                     name=MACRO_PRECISION_METRIC,
+                     value=metrics_summary.overall.macro_precision,
+                     comment=MACRO_PRECISION_DESCRIPTION.format(
+                         value=metrics_summary.overall.macro_precision
+                     ),
+                 ),
+                 Evaluation(
+                     name=WEIGHTED_PRECISION_METRIC,
+                     value=metrics_summary.overall.weighted_avg_precision,
+                     comment=WEIGHTED_PRECISION_DESCRIPTION.format(
+                         value=metrics_summary.overall.weighted_avg_precision
+                     ),
+                 ),
+                 Evaluation(
+                     name=MICRO_RECALL_METRIC,
+                     value=metrics_summary.overall.micro_recall,
+                     comment=MICRO_RECALL_DESCRIPTION.format(
+                         value=metrics_summary.overall.micro_recall
+                     ),
+                 ),
+                 Evaluation(
+                     name=MACRO_RECALL_METRIC,
+                     value=metrics_summary.overall.macro_recall,
+                     comment=MACRO_RECALL_DESCRIPTION.format(
+                         value=metrics_summary.overall.macro_recall
+                     ),
+                 ),
+                 Evaluation(
+                     name=WEIGHTED_RECALL_METRIC,
+                     value=metrics_summary.overall.weighted_avg_recall,
+                     comment=WEIGHTED_RECALL_DESCRIPTION.format(
+                         value=metrics_summary.overall.weighted_avg_recall
+                     ),
+                 ),
+                 Evaluation(
+                     name=MICRO_F1_METRIC,
+                     value=metrics_summary.overall.micro_f1,
+                     comment=MICRO_F1_DESCRIPTION.format(
+                         value=metrics_summary.overall.micro_f1
+                     ),
+                 ),
+                 Evaluation(
+                     name=MACRO_F1_METRIC,
+                     value=metrics_summary.overall.macro_f1,
+                     comment=MACRO_F1_DESCRIPTION.format(
+                         value=metrics_summary.overall.macro_f1
+                     ),
+                 ),
+                 Evaluation(
+                     name=WEIGHTED_F1_METRIC,
+                     value=metrics_summary.overall.weighted_avg_f1,
+                     comment=WEIGHTED_F1_DESCRIPTION.format(
+                         value=metrics_summary.overall.weighted_avg_f1
+                     ),
+                 ),
+             ]
+         )
+
+         # Per-class metrics
+         for category, per_class_metrics in metrics_summary.per_class.items():
+             category_name = category.value.lower()
+             evaluations.extend(
+                 [
+                     Evaluation(
+                         name=PER_CLASS_PRECISION_METRIC_TEMPLATE.format(
+                             category=category_name
+                         ),
+                         value=per_class_metrics.precision,
+                         comment=PER_CLASS_PRECISION_DESCRIPTION.format(
+                             category=category.value,
+                             value=per_class_metrics.precision,
+                         ),
+                     ),
+                     Evaluation(
+                         name=PER_CLASS_RECALL_METRIC_TEMPLATE.format(
+                             category=category_name
+                         ),
+                         value=per_class_metrics.recall,
+                         comment=PER_CLASS_RECALL_DESCRIPTION.format(
+                             category=category.value,
+                             value=per_class_metrics.recall,
+                         ),
+                     ),
+                     Evaluation(
+                         name=PER_CLASS_F1_METRIC_TEMPLATE.format(
+                             category=category_name
+                         ),
+                         value=per_class_metrics.f1,
+                         comment=PER_CLASS_F1_DESCRIPTION.format(
+                             category=category.value,
+                             value=per_class_metrics.f1,
+                         ),
+                     ),
+                     Evaluation(
+                         name=PER_CLASS_SUPPORT_METRIC_TEMPLATE.format(
+                             category=category_name
+                         ),
+                         value=float(per_class_metrics.support),
+                         comment=PER_CLASS_SUPPORT_DESCRIPTION.format(
+                             category=category.value,
+                             value=per_class_metrics.support,
+                         ),
+                     ),
+                 ]
+             )
+
+         # Record the number of items that were skipped due to invalid data
+         evaluations.append(
+             Evaluation(
+                 name=SKIP_COUNT_METRIC,
+                 value=skip_count,
+                 comment=SKIP_COUNT_DESCRIPTION.format(value=skip_count),
+             )
+         )
+
+         return evaluations
+
+     def _report_run_results_to_txt_file(self, result: ExperimentResult) -> None:
+         """Export the formatted experiment result to a timestamped text file."""
+         result_str = result.format().replace("\\n", "\n")
+         self._output_dir.mkdir(parents=True, exist_ok=True)
+
+         # Add timestamp prefix to filename
+         current_date = datetime.now().strftime("%Y%m%d_%H%M%S")
+         timestamped_filename = (
+             f"{current_date}_"
+             f"{DEFAULT_RESPONSE_CLASSIFICATION_EVALUATION_TEXT_OUTPUT_FILENAME}"
+         )
+         output_path = self._output_dir / timestamped_filename
+
+         with open(str(output_path), "w") as f:
+             f.write(result_str)
+
+         structlogger.info(
+             "langfuse_runner._report_run_results.exported",
+             event_info="Evaluation results exported to text file",
+             text_file=output_path,
+         )
+
+     def _report_yaml_structured_results(self, evaluations: list[Evaluation]) -> None:
+         """Export evaluation results to a YAML file with structured data."""
+         # Ensure the results directory exists
+         self._output_dir.mkdir(parents=True, exist_ok=True)
+
+         # Add timestamp prefix to filename
+         current_date = datetime.now().strftime("%Y%m%d_%H%M%S")
+         timestamped_filename = (
+             f"{current_date}_"
+             f"{RESPONSE_CLASSIFICATION_EVALUATION_YAML_OUTPUT_FILENAME}"
+         )
+         output_path = self._output_dir / timestamped_filename
+
+         # Convert evaluations to structured data
+         structured_data: Dict[str, Any] = {
+             "experiment": {
+                 "name": EXPERIMENT_NAME,
+                 "description": EXPERIMENT_DESCRIPTION,
+                 "timestamp": datetime.now().isoformat(),
+             },
+             "metrics": [],
+         }
+
+         # Add each evaluation as a metric
+         for evaluation in evaluations:
+             metric_data: Dict[str, Any] = {
+                 "name": evaluation.name,
+                 "value": evaluation.value,
+                 "description": evaluation.comment,
+             }
+             structured_data["metrics"].append(metric_data)
+
+         # Write to YAML file
+         with open(str(output_path), "w") as f:
+             yaml.dump(structured_data, f, default_flow_style=False, sort_keys=False)
+
+         structlogger.info(
+             "langfuse_runner._report_yaml_structured_results.exported",
+             event_info="Evaluation results exported to YAML file",
+             yaml_file=output_path,
+             metrics_count=len(evaluations),
+         )
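
For orientation, here is a minimal sketch of how the runner above could be driven programmatically, mirroring what the CLI later in this diff does. It assumes the Langfuse and copilot environment variables are already configured; the dataset name and output directory below are placeholders, not values shipped in this package.

import asyncio

from rasa.builder.evaluator.response_classification.langfuse_runner import (
    ResponseClassificationLangfuseRunner,
)


async def evaluate() -> None:
    # Placeholder dataset name and output directory, for illustration only.
    runner = ResponseClassificationLangfuseRunner(
        dataset_name="my-copilot-dataset",
        output_dir="evaluation_results",
    )
    # run_experiment() runs the copilot task for every dataset item, computes the
    # classification metrics, writes the timestamped text and YAML reports, and
    # flushes the Langfuse client.
    result = runner.run_experiment()
    print(result.format())


if __name__ == "__main__":
    asyncio.run(evaluate())
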
@@ -0,0 +1,61 @@
+ from typing import Dict
+
+ from pydantic import BaseModel, Field
+
+ from rasa.builder.copilot.models import ResponseCategory
+
+
+ class ClassificationResult(BaseModel):
+     prediction: ResponseCategory
+     expected: ResponseCategory
+
+
+ class PerClassMetrics(BaseModel):
+     """Metrics for a single response category."""
+
+     precision: float = Field(ge=0.0, le=1.0, description="Precision score")
+     recall: float = Field(ge=0.0, le=1.0, description="Recall score")
+     f1: float = Field(ge=0.0, le=1.0, description="F1 score")
+
+     support: int = Field(ge=0, description="Number of actual occurrences.")
+
+     true_positives: int = Field(ge=0, description="Number of true positives.")
+     false_positives: int = Field(ge=0, description="Number of false positives.")
+     false_negatives: int = Field(ge=0, description="Number of false negatives.")
+
+
+ class OverallClassificationMetrics(BaseModel):
+     """Overall evaluation metrics."""
+
+     micro_precision: float = Field(
+         ge=0.0, le=1.0, description="Micro-averaged Precision"
+     )
+     macro_precision: float = Field(
+         ge=0.0, le=1.0, description="Macro-averaged Precision"
+     )
+     weighted_avg_precision: float = Field(
+         ge=0.0, le=1.0, description="Weighted Precision"
+     )
+
+     micro_recall: float = Field(ge=0.0, le=1.0, description="Micro-averaged Recall")
+     macro_recall: float = Field(ge=0.0, le=1.0, description="Macro-averaged Recall")
+     weighted_avg_recall: float = Field(ge=0.0, le=1.0, description="Weighted Recall")
+
+     micro_f1: float = Field(ge=0.0, le=1.0, description="Micro-averaged F1 score")
+     macro_f1: float = Field(ge=0.0, le=1.0, description="Macro-averaged F1 score")
+     weighted_avg_f1: float = Field(ge=0.0, le=1.0, description="Weighted F1 score")
+
+     support: int = Field(ge=0, description="Total number of occurrences.")
+
+     true_positives: int = Field(ge=0, description="Total number of true positives.")
+     false_positives: int = Field(ge=0, description="Total number of false positives.")
+     false_negatives: int = Field(ge=0, description="Total number of false negatives.")
+
+
+ class MetricsSummary(BaseModel):
+     """Complete metrics summary with per-class and overall metrics."""
+
+     per_class: Dict[ResponseCategory, PerClassMetrics] = Field(
+         description="Per-class metrics"
+     )
+     overall: OverallClassificationMetrics = Field(description="Overall metrics")
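
The evaluator that fills these models is not part of this diff. As a rough illustration of how the per-class counts relate to the overall fields, micro-, macro-, and weighted-averaged precision can be derived from PerClassMetrics values as sketched below. This assumes the models above are importable from rasa.builder.evaluator.response_classification.models (the path the runner imports from); it is not the package's actual evaluator.

from typing import Dict

from rasa.builder.copilot.models import ResponseCategory
from rasa.builder.evaluator.response_classification.models import PerClassMetrics


def illustrate_precision_averages(
    per_class: Dict[ResponseCategory, PerClassMetrics],
) -> Dict[str, float]:
    """Roll per-class counts up into micro/macro/weighted precision (illustrative)."""
    total_tp = sum(m.true_positives for m in per_class.values())
    total_fp = sum(m.false_positives for m in per_class.values())
    total_support = sum(m.support for m in per_class.values())

    # Micro average: pool true/false positives across classes, then take one ratio.
    micro = total_tp / (total_tp + total_fp) if (total_tp + total_fp) else 0.0
    # Macro average: unweighted mean of the per-class precision scores.
    macro = (
        sum(m.precision for m in per_class.values()) / len(per_class)
        if per_class
        else 0.0
    )
    # Weighted average: per-class precision weighted by each class's support.
    weighted = (
        sum(m.precision * m.support for m in per_class.values()) / total_support
        if total_support
        else 0.0
    )
    return {"micro": micro, "macro": macro, "weighted": weighted}
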
@@ -0,0 +1,152 @@
+ #!/usr/bin/env python3
+ """Response Classification Evaluator CLI.
+
+ A command-line tool for running response classification evaluation experiments
+ using Langfuse.
+
+ This script runs experiments on datasets and provides links to the results.
+ """
+
+ import argparse
+ import asyncio
+ import os
+ import sys
+ from typing import Optional
+
+ import structlog
+ from langfuse.experiment import ExperimentResult
+
+ from rasa.builder.evaluator.constants import (
+     DEFAULT_RESPONSE_CLASSIFICATION_EVALUATION_TEXT_OUTPUT_FILENAME,
+     RESPONSE_CLASSIFICATION_EVALUATION_YAML_OUTPUT_FILENAME,
+ )
+ from rasa.builder.evaluator.response_classification.langfuse_runner import (
+     ResponseClassificationLangfuseRunner,
+ )
+
+ # Configure structured logging
+ structlogger = structlog.get_logger()
+
+
+ def validate_environment() -> None:
+     """Validate that all required environment variables are set."""
+     required_vars = [
+         "OPENAI_API_KEY",
+         "INKEEP_API_KEY",
+         "LANGFUSE_HOST",
+         "LANGFUSE_PUBLIC_KEY",
+         "LANGFUSE_SECRET_KEY",
+     ]
+
+     missing_vars = []
+     for var in required_vars:
+         if not os.getenv(var):
+             missing_vars.append(var)
+
+     if missing_vars:
+         structlogger.error(
+             "main.validate_environment.missing_variables",
+             event_info=(
+                 "Missing required environment variables. Please set the following "
+                 f"environment variables: {missing_vars}."
+             ),
+             missing_variables=missing_vars,
+         )
+         sys.exit(1)
+
+
+ async def run_experiment(
+     dataset_name: str, output_file: Optional[str] = None
+ ) -> Optional[ExperimentResult]:
+     """Run the response classification evaluation experiment."""
+     try:
+         structlogger.info(
+             "main.run_experiment.starting",
+             event_info="Starting response classification evaluation experiment",
+             dataset_name=dataset_name,
+         )
+
+         # Initialize and run the experiment
+         runner = ResponseClassificationLangfuseRunner(
+             dataset_name=dataset_name, output_dir=output_file
+         )
+         result = runner.run_experiment()
+
+         structlogger.info(
+             "main.run_experiment.completed",
+             event_info=(
+                 "Response classification evaluation experiment completed "
+                 "successfully"
+             ),
+             dataset_name=dataset_name,
+         )
+
+         structlogger.info("✅ Experiment completed successfully!")
+
+         return result
+
+     except Exception as e:
+         structlogger.error(
+             "main.run_experiment.failed",
+             event_info="Response classification evaluation experiment failed",
+             error=str(e),
+             dataset_name=dataset_name,
+         )
+         structlogger.error(f"❌ Error running experiment: {e}")
+         return None
+
+
+ def main() -> int:
+     """Main entry point for the CLI."""
+     parser = argparse.ArgumentParser(
+         description=(
+             "Run response classification evaluation experiments using Langfuse"
+         ),
+         formatter_class=argparse.RawDescriptionHelpFormatter,
+         epilog=(
+             "Examples:\n"
+             "python run_response_classification_evaluator.py "
+             "--dataset-name my_dataset"
+         ),
+     )
+
+     parser.add_argument(
+         "--dataset-name",
+         help="Name of the dataset to evaluate",
+         required=True,
+     )
+     parser.add_argument(
+         "--output-file",
+         help=(
+             "(Optional) Directory to write experiment results. Two files are "
+             "created with a timestamp prefix (YYYYMMDD_HHMMSS_):\n"
+             f"- {DEFAULT_RESPONSE_CLASSIFICATION_EVALUATION_TEXT_OUTPUT_FILENAME} "
+             "from Langfuse, and\n"
+             f"- {RESPONSE_CLASSIFICATION_EVALUATION_YAML_OUTPUT_FILENAME} "
+             "from the classifier."
+         ),
+     )
+
+     args = parser.parse_args()
+
+     # Validate environment variables
+     validate_environment()
+
+     structlogger.info(f"🔍 Dataset: {args.dataset_name}")
+     structlogger.info("🚀 Starting evaluation...")
+
+     # Run the experiment
+     result = asyncio.run(run_experiment(args.dataset_name, args.output_file))
+
+     if result is None:
+         sys.exit(1)
+
+     # Get experiment link
+     structlogger.info(
+         "✨ Evaluation complete!",
+         dataset_run_id=result.dataset_run_id,
+         dataset_run_url=result.dataset_run_url,
+     )
+
+     # Print formatted results
+     result_str = result.format().replace("\\n", "\n")
+     structlogger.info(result_str)
+
+     return 0
+
+
+ if __name__ == "__main__":
+     sys.exit(main())
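
Once a run has finished, the YAML report written by _report_yaml_structured_results can be consumed downstream. Below is a small sketch of reading it back; the path is a placeholder, since the real filename is a timestamp prefix followed by RESPONSE_CLASSIFICATION_EVALUATION_YAML_OUTPUT_FILENAME, whose value is not shown in this diff.

from pathlib import Path
from typing import Any, Dict

import yaml

# Placeholder path; the actual report name is "<YYYYMMDD_HHMMSS>_" plus the
# RESPONSE_CLASSIFICATION_EVALUATION_YAML_OUTPUT_FILENAME constant (not in this diff).
report_path = Path("results/20250101_120000_response_classification_report.yaml")

with open(report_path) as f:
    report: Dict[str, Any] = yaml.safe_load(f)

# The structure mirrors what _report_yaml_structured_results writes: an "experiment"
# block plus a "metrics" list of name/value/description entries.
print(report["experiment"]["name"], report["experiment"]["timestamp"])
for metric in report["metrics"]:
    print(f"{metric['name']}: {metric['value']}")
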