rasa-pro 3.15.0a1__py3-none-any.whl → 3.15.0a3__py3-none-any.whl

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their public registries.

Potentially problematic release: this version of rasa-pro might be problematic.

Files changed (50)
  1. rasa/builder/constants.py +5 -0
  2. rasa/builder/copilot/models.py +80 -28
  3. rasa/builder/download.py +110 -0
  4. rasa/builder/evaluator/__init__.py +0 -0
  5. rasa/builder/evaluator/constants.py +15 -0
  6. rasa/builder/evaluator/copilot_executor.py +89 -0
  7. rasa/builder/evaluator/dataset/models.py +173 -0
  8. rasa/builder/evaluator/exceptions.py +4 -0
  9. rasa/builder/evaluator/response_classification/__init__.py +0 -0
  10. rasa/builder/evaluator/response_classification/constants.py +66 -0
  11. rasa/builder/evaluator/response_classification/evaluator.py +346 -0
  12. rasa/builder/evaluator/response_classification/langfuse_runner.py +463 -0
  13. rasa/builder/evaluator/response_classification/models.py +61 -0
  14. rasa/builder/evaluator/scripts/__init__.py +0 -0
  15. rasa/builder/evaluator/scripts/run_response_classification_evaluator.py +152 -0
  16. rasa/builder/jobs.py +208 -1
  17. rasa/builder/logging_utils.py +25 -24
  18. rasa/builder/main.py +6 -1
  19. rasa/builder/models.py +23 -0
  20. rasa/builder/project_generator.py +29 -10
  21. rasa/builder/service.py +104 -22
  22. rasa/builder/training_service.py +13 -1
  23. rasa/builder/validation_service.py +2 -1
  24. rasa/core/actions/action_clean_stack.py +32 -0
  25. rasa/core/actions/constants.py +4 -0
  26. rasa/core/actions/custom_action_executor.py +70 -12
  27. rasa/core/actions/grpc_custom_action_executor.py +41 -2
  28. rasa/core/actions/http_custom_action_executor.py +49 -25
  29. rasa/core/channels/voice_stream/voice_channel.py +14 -2
  30. rasa/dialogue_understanding/generator/llm_based_command_generator.py +6 -3
  31. rasa/dialogue_understanding/generator/single_step/compact_llm_command_generator.py +15 -7
  32. rasa/dialogue_understanding/generator/single_step/search_ready_llm_command_generator.py +15 -8
  33. rasa/dialogue_understanding/processor/command_processor.py +49 -7
  34. rasa/shared/providers/_configs/azure_openai_client_config.py +4 -5
  35. rasa/shared/providers/_configs/default_litellm_client_config.py +4 -4
  36. rasa/shared/providers/_configs/litellm_router_client_config.py +3 -2
  37. rasa/shared/providers/_configs/openai_client_config.py +5 -7
  38. rasa/shared/providers/_configs/rasa_llm_client_config.py +4 -4
  39. rasa/shared/providers/_configs/self_hosted_llm_client_config.py +4 -4
  40. rasa/shared/providers/llm/_base_litellm_client.py +42 -14
  41. rasa/shared/providers/llm/litellm_router_llm_client.py +38 -15
  42. rasa/shared/providers/llm/self_hosted_llm_client.py +34 -32
  43. rasa/shared/utils/configs.py +5 -8
  44. rasa/utils/endpoints.py +6 -0
  45. rasa/version.py +1 -1
  46. {rasa_pro-3.15.0a1.dist-info → rasa_pro-3.15.0a3.dist-info}/METADATA +12 -12
  47. {rasa_pro-3.15.0a1.dist-info → rasa_pro-3.15.0a3.dist-info}/RECORD +50 -37
  48. {rasa_pro-3.15.0a1.dist-info → rasa_pro-3.15.0a3.dist-info}/NOTICE +0 -0
  49. {rasa_pro-3.15.0a1.dist-info → rasa_pro-3.15.0a3.dist-info}/WHEEL +0 -0
  50. {rasa_pro-3.15.0a1.dist-info → rasa_pro-3.15.0a3.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,346 @@
+ from typing import Dict, List, Literal, Optional
+
+ import structlog
+
+ from rasa.builder.copilot.models import ResponseCategory
+ from rasa.builder.evaluator.response_classification.constants import (
+     MACRO_AVERAGING_METHOD,
+     MICRO_AVERAGING_METHOD,
+     WEIGHTED_AVERAGING_METHOD,
+ )
+ from rasa.builder.evaluator.response_classification.models import (
+     ClassificationResult,
+     MetricsSummary,
+     OverallClassificationMetrics,
+     PerClassMetrics,
+ )
+
+ structlogger = structlog.get_logger()
+
+
+ class ResponseClassificationEvaluator:
+     def __init__(self):  # type: ignore[no-untyped-def]
+         self._classes: List[ResponseCategory] = [
+             ResponseCategory.COPILOT,
+             ResponseCategory.OUT_OF_SCOPE_DETECTION,
+             ResponseCategory.ROLEPLAY_DETECTION,
+             ResponseCategory.KNOWLEDGE_BASE_ACCESS_REQUESTED,
+             ResponseCategory.ERROR_FALLBACK,
+             # TODO: Add support for greetings and goodbyes once the orchestrator
+             # approach is implemented
+         ]
+         self._true_positives_per_class: Dict[ResponseCategory, int] = {
+             clazz: 0 for clazz in self._classes
+         }
+         self._false_positives_per_class: Dict[ResponseCategory, int] = {
+             clazz: 0 for clazz in self._classes
+         }
+         self._false_negatives_per_class: Dict[ResponseCategory, int] = {
+             clazz: 0 for clazz in self._classes
+         }
+         self._support_per_class: Dict[ResponseCategory, int] = {
+             clazz: 0 for clazz in self._classes
+         }
+
+         self._evaluated = False
+
+     @property
+     def metrics_summary(self) -> Optional[MetricsSummary]:
+         """Get the metrics summary.
+
+         Returns:
+             MetricsSummary with structured per-class and overall metrics if
+             the evaluator has been run on the data, otherwise None.
+         """
+         if not self._evaluated:
+             structlogger.warning(
+                 "evaluator.response_classification_evaluator"
+                 ".metrics_summary.not_evaluated",
+                 event_info="Evaluator not evaluated. Returning empty metrics summary.",
+             )
+             return None
+
+         return self._get_metrics_summary()
+
+     def reset(self) -> None:
+         self._true_positives_per_class = {clazz: 0 for clazz in self._classes}
+         self._false_positives_per_class = {clazz: 0 for clazz in self._classes}
+         self._false_negatives_per_class = {clazz: 0 for clazz in self._classes}
+         self._support_per_class = {clazz: 0 for clazz in self._classes}
+         self._evaluated = False
+
+     def evaluate(self, item_results: List[ClassificationResult]) -> MetricsSummary:
+         """Evaluate the classifier on the given item results."""
+         if self._evaluated:
+             structlogger.warning(
+                 "evaluator.response_classification_evaluator.evaluate.already_evaluated",
+                 event_info="Evaluator already evaluated. Resetting evaluator.",
+             )
+             self.reset()
+
+         for result in item_results:
+             # Skip and raise a warning if the class is not in the list of classes
+             if result.expected not in self._classes:
+                 structlogger.warning(
+                     "evaluator.response_classification_evaluator"
+                     ".evaluate.class_not_recognized",
+                     event_info=(
+                         f"Class '{result.expected}' is not recognized. "
+                         f"Skipping evaluation for this class."
+                     ),
+                     expected_class=result.expected,
+                     classes=self._classes,
+                 )
+                 continue
+
+             # Update support for the expected class
+             if result.expected in self._support_per_class:
+                 self._support_per_class[result.expected] += 1
+
+             # Calculate TP, FP, FN per class
+             for clazz in self._classes:
+                 if result.prediction == clazz and result.expected == clazz:
+                     self._true_positives_per_class[clazz] += 1
+
+                 elif result.prediction == clazz and result.expected != clazz:
+                     self._false_positives_per_class[clazz] += 1
+
+                 elif result.prediction != clazz and result.expected == clazz:
+                     self._false_negatives_per_class[clazz] += 1
+
+         self._evaluated = True
+         return self._get_metrics_summary()
+
+     def calculate_precision_per_class(self, clazz: ResponseCategory) -> float:
+         """Calculate precision for a specific response category."""
+         tp = self._true_positives_per_class.get(clazz, 0)
+         fp = self._false_positives_per_class.get(clazz, 0)
+
+         if tp + fp == 0:
+             return 0.0
+
+         return tp / (tp + fp)
+
+     def calculate_recall_per_class(self, clazz: ResponseCategory) -> float:
+         """Calculate recall for a specific response category."""
+         tp = self._true_positives_per_class.get(clazz, 0)
+         fn = self._false_negatives_per_class.get(clazz, 0)
+
+         if tp + fn == 0:
+             return 0.0
+
+         return tp / (tp + fn)
+
+     def calculate_f1_per_class(self, clazz: ResponseCategory) -> float:
+         """Calculate F1 score for a specific response category."""
+         precision = self.calculate_precision_per_class(clazz)
+         recall = self.calculate_recall_per_class(clazz)
+
+         if precision + recall == 0:
+             return 0.0
+
+         return 2 * (precision * recall) / (precision + recall)
+
+     def calculate_precision(
+         self, average: Literal["micro", "macro", "weighted"] = MICRO_AVERAGING_METHOD
+     ) -> float:
+         """Calculate precision with specified averaging method."""
+         if average == MICRO_AVERAGING_METHOD:
+             return self._calculate_micro_precision()
+         elif average == MACRO_AVERAGING_METHOD:
+             return self._calculate_macro_precision()
+         elif average == WEIGHTED_AVERAGING_METHOD:
+             return self._calculate_weighted_avg_precision()
+         else:
+             raise ValueError(f"Invalid averaging method: {average}")
+
+     def _calculate_micro_precision(self) -> float:
+         """Calculate micro-averaged precision.
+
+         Calculates the metric globally by aggregating the total true positives and
+         false positives across all classes. Each sample contributes equally to the
+         final score.
+         """
+         total_tp = sum(self._true_positives_per_class.values())
+         total_fp = sum(self._false_positives_per_class.values())
+
+         if total_tp + total_fp == 0:
+             return 0.0
+
+         return total_tp / (total_tp + total_fp)
+
+     def _calculate_macro_precision(self) -> float:
+         """Calculate macro-averaged precision.
+
+         Calculates the metric independently for each class and then takes the
+         unweighted average. Each class contributes equally.
+         """
+         precisions = [
+             self.calculate_precision_per_class(clazz) for clazz in self._classes
+         ]
+         return sum(precisions) / len(precisions) if precisions else 0.0
+
+     def _calculate_weighted_avg_precision(self) -> float:
+         """Calculate weighted-averaged precision.
+
+         Calculates the metric independently for each class and then takes the average
+         weighted by the class support (number of true samples per class).
+         """
+         total_support = sum(self._support_per_class.values())
+         if total_support == 0:
+             return 0.0
+
+         weighted_sum = 0.0
+         for clazz in self._classes:
+             precision = self.calculate_precision_per_class(clazz)
+             support = self._support_per_class.get(clazz, 0)
+             weighted_sum += precision * support
+
+         return weighted_sum / total_support
+
+     def calculate_recall(
+         self, average: Literal["micro", "macro", "weighted"] = MICRO_AVERAGING_METHOD
+     ) -> float:
+         """Calculate recall with specified averaging method."""
+         if average == MICRO_AVERAGING_METHOD:
+             return self._calculate_micro_recall()
+         elif average == MACRO_AVERAGING_METHOD:
+             return self._calculate_macro_recall()
+         elif average == WEIGHTED_AVERAGING_METHOD:
+             return self._calculate_weighted_avg_recall()
+         else:
+             raise ValueError(f"Invalid averaging method: {average}")
+
+     def _calculate_micro_recall(self) -> float:
+         """Calculate micro-averaged recall.
+
+         Calculates the metric globally by aggregating the total true positives and
+         false negatives across all classes. Each sample contributes equally to the
+         final score.
+         """
+         total_tp = sum(self._true_positives_per_class.values())
+         total_fn = sum(self._false_negatives_per_class.values())
+
+         if total_tp + total_fn == 0:
+             return 0.0
+
+         return total_tp / (total_tp + total_fn)
+
+     def _calculate_macro_recall(self) -> float:
+         """Calculate macro-averaged recall.
+
+         Calculates the metric independently for each class and then takes the
+         unweighted average. Each class contributes equally.
+         """
+         recalls = [self.calculate_recall_per_class(clazz) for clazz in self._classes]
+         return sum(recalls) / len(recalls) if recalls else 0.0
+
+     def _calculate_weighted_avg_recall(self) -> float:
+         """Calculate weighted-averaged recall.
+
+         Calculates the metric independently for each class and then takes the average
+         weighted by the class support (number of true samples per class).
+         """
+         total_support = sum(self._support_per_class.values())
+         if total_support == 0:
+             return 0.0
+
+         weighted_sum = 0.0
+         for clazz in self._classes:
+             recall = self.calculate_recall_per_class(clazz)
+             support = self._support_per_class.get(clazz, 0)
+             weighted_sum += recall * support
+
+         return weighted_sum / total_support
+
+     def calculate_f1(
+         self, average: Literal["micro", "macro", "weighted"] = MICRO_AVERAGING_METHOD
+     ) -> float:
+         """Calculate F1 score with specified averaging method."""
+         if average == MICRO_AVERAGING_METHOD:
+             return self._calculate_micro_f1()
+         elif average == MACRO_AVERAGING_METHOD:
+             return self._calculate_macro_f1()
+         elif average == WEIGHTED_AVERAGING_METHOD:
+             return self._calculate_weighted_avg_f1()
+         else:
+             raise ValueError(f"Invalid averaging method: {average}")
+
+     def _calculate_micro_f1(self) -> float:
+         """Calculate micro-averaged F1 score.
+
+         Calculates the metric globally by aggregating the total true positives, false
+         positives, and false negatives across all classes. Each sample contributes
+         equally to the final score.
+         """
+         micro_precision = self._calculate_micro_precision()
+         micro_recall = self._calculate_micro_recall()
+
+         if micro_precision + micro_recall == 0:
+             return 0.0
+
+         return 2 * (micro_precision * micro_recall) / (micro_precision + micro_recall)
+
+     def _calculate_macro_f1(self) -> float:
+         """Calculate macro-averaged F1 score.
+
+         Calculates the metric independently for each class and then takes the
+         unweighted average. Each class contributes equally.
+         """
+         f1_scores = [self.calculate_f1_per_class(clazz) for clazz in self._classes]
+         return sum(f1_scores) / len(f1_scores) if f1_scores else 0.0
+
+     def _calculate_weighted_avg_f1(self) -> float:
+         """Calculate weighted F1 score.
+
+         Calculates the metric independently for each class and then takes the average
+         weighted by the class support (number of true samples per class).
+         """
+         total_support = sum(self._support_per_class.values())
+         if total_support == 0:
+             return 0.0
+
+         weighted_sum = 0.0
+         for clazz in self._classes:
+             f1 = self.calculate_f1_per_class(clazz)
+             support = self._support_per_class.get(clazz, 0)
+             weighted_sum += f1 * support
+
+         return weighted_sum / total_support
+
+     def _get_metrics_summary(self) -> MetricsSummary:
+         """Get the metrics summary without Optional wrapper.
+
+         This method assumes the evaluator has been evaluated and will always
+         return a MetricsSummary.
+         """
+         # Build per-class metrics
+         per_class_metrics: Dict[ResponseCategory, PerClassMetrics] = {}
+         for clazz in self._classes:
+             per_class_metrics[clazz] = PerClassMetrics(
+                 precision=self.calculate_precision_per_class(clazz),
+                 recall=self.calculate_recall_per_class(clazz),
+                 f1=self.calculate_f1_per_class(clazz),
+                 support=self._support_per_class.get(clazz, 0),
+                 true_positives=self._true_positives_per_class.get(clazz, 0),
+                 false_positives=self._false_positives_per_class.get(clazz, 0),
+                 false_negatives=self._false_negatives_per_class.get(clazz, 0),
+             )
+
+         # Build overall metrics
+         overall_metrics = OverallClassificationMetrics(
+             micro_precision=self.calculate_precision(MICRO_AVERAGING_METHOD),
+             macro_precision=self.calculate_precision(MACRO_AVERAGING_METHOD),
+             weighted_avg_precision=self.calculate_precision(WEIGHTED_AVERAGING_METHOD),
+             micro_recall=self.calculate_recall(MICRO_AVERAGING_METHOD),
+             macro_recall=self.calculate_recall(MACRO_AVERAGING_METHOD),
+             weighted_avg_recall=self.calculate_recall(WEIGHTED_AVERAGING_METHOD),
+             micro_f1=self.calculate_f1(MICRO_AVERAGING_METHOD),
+             macro_f1=self.calculate_f1(MACRO_AVERAGING_METHOD),
+             weighted_avg_f1=self.calculate_f1(WEIGHTED_AVERAGING_METHOD),
+             support=sum(self._support_per_class.values()),
+             true_positives=sum(self._true_positives_per_class.values()),
+             false_positives=sum(self._false_positives_per_class.values()),
+             false_negatives=sum(self._false_negatives_per_class.values()),
+         )
+         return MetricsSummary(per_class=per_class_metrics, overall=overall_metrics)
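
For orientation, below is a small, self-contained sketch of the three averaging modes the new evaluator dispatches on ("micro", "macro", "weighted"). The class labels and TP/FP/FN counts are invented for illustration and are not taken from the package; only the arithmetic mirrors the methods in the diff above.

# Illustration only: hypothetical labels and counts, not data from rasa-pro.
from typing import Dict, Tuple

# Per-class (true positives, false positives, false negatives).
counts: Dict[str, Tuple[int, int, int]] = {
    "copilot": (8, 2, 1),
    "out_of_scope_detection": (3, 1, 2),
    "error_fallback": (0, 0, 4),
}
# Support = number of true samples per class = TP + FN.
support = {label: tp + fn for label, (tp, _fp, fn) in counts.items()}


def precision(tp: int, fp: int) -> float:
    return tp / (tp + fp) if tp + fp else 0.0


# Micro: pool TP/FP across all classes, so every sample weighs the same.
total_tp = sum(tp for tp, _, _ in counts.values())
total_fp = sum(fp for _, fp, _ in counts.values())
micro = precision(total_tp, total_fp)  # 11 / 14 ≈ 0.786

# Macro: average the per-class precisions, so every class weighs the same.
per_class = {label: precision(tp, fp) for label, (tp, fp, _) in counts.items()}
macro = sum(per_class.values()) / len(per_class)  # (0.8 + 0.75 + 0.0) / 3 ≈ 0.517

# Weighted: per-class precisions averaged by class support.
weighted = sum(per_class[c] * support[c] for c in counts) / sum(support.values())
# (0.8 * 9 + 0.75 * 5 + 0.0 * 4) / 18 ≈ 0.608

print(micro, macro, weighted)

Driving the evaluator itself would presumably look like ResponseClassificationEvaluator().evaluate(results), where each item in results is a ClassificationResult carrying a prediction and an expected ResponseCategory; the exact ClassificationResult constructor lives in the models.py module that is not shown in this diff, so that call shape is an assumption.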