DeepFabric 4.4.0 (deepfabric-4.4.0-py3-none-any.whl)

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (71)
  1. deepfabric/__init__.py +70 -0
  2. deepfabric/__main__.py +6 -0
  3. deepfabric/auth.py +382 -0
  4. deepfabric/builders.py +303 -0
  5. deepfabric/builders_agent.py +1304 -0
  6. deepfabric/cli.py +1288 -0
  7. deepfabric/config.py +899 -0
  8. deepfabric/config_manager.py +251 -0
  9. deepfabric/constants.py +94 -0
  10. deepfabric/dataset_manager.py +534 -0
  11. deepfabric/error_codes.py +581 -0
  12. deepfabric/evaluation/__init__.py +47 -0
  13. deepfabric/evaluation/backends/__init__.py +32 -0
  14. deepfabric/evaluation/backends/ollama_backend.py +137 -0
  15. deepfabric/evaluation/backends/tool_call_parsers.py +409 -0
  16. deepfabric/evaluation/backends/transformers_backend.py +326 -0
  17. deepfabric/evaluation/evaluator.py +845 -0
  18. deepfabric/evaluation/evaluators/__init__.py +13 -0
  19. deepfabric/evaluation/evaluators/base.py +104 -0
  20. deepfabric/evaluation/evaluators/builtin/__init__.py +5 -0
  21. deepfabric/evaluation/evaluators/builtin/tool_calling.py +93 -0
  22. deepfabric/evaluation/evaluators/registry.py +66 -0
  23. deepfabric/evaluation/inference.py +155 -0
  24. deepfabric/evaluation/metrics.py +397 -0
  25. deepfabric/evaluation/parser.py +304 -0
  26. deepfabric/evaluation/reporters/__init__.py +13 -0
  27. deepfabric/evaluation/reporters/base.py +56 -0
  28. deepfabric/evaluation/reporters/cloud_reporter.py +195 -0
  29. deepfabric/evaluation/reporters/file_reporter.py +61 -0
  30. deepfabric/evaluation/reporters/multi_reporter.py +56 -0
  31. deepfabric/exceptions.py +67 -0
  32. deepfabric/factory.py +26 -0
  33. deepfabric/generator.py +1084 -0
  34. deepfabric/graph.py +545 -0
  35. deepfabric/hf_hub.py +214 -0
  36. deepfabric/kaggle_hub.py +219 -0
  37. deepfabric/llm/__init__.py +41 -0
  38. deepfabric/llm/api_key_verifier.py +534 -0
  39. deepfabric/llm/client.py +1206 -0
  40. deepfabric/llm/errors.py +105 -0
  41. deepfabric/llm/rate_limit_config.py +262 -0
  42. deepfabric/llm/rate_limit_detector.py +278 -0
  43. deepfabric/llm/retry_handler.py +270 -0
  44. deepfabric/metrics.py +212 -0
  45. deepfabric/progress.py +262 -0
  46. deepfabric/prompts.py +290 -0
  47. deepfabric/schemas.py +1000 -0
  48. deepfabric/spin/__init__.py +6 -0
  49. deepfabric/spin/client.py +263 -0
  50. deepfabric/spin/models.py +26 -0
  51. deepfabric/stream_simulator.py +90 -0
  52. deepfabric/tools/__init__.py +5 -0
  53. deepfabric/tools/defaults.py +85 -0
  54. deepfabric/tools/loader.py +87 -0
  55. deepfabric/tools/mcp_client.py +677 -0
  56. deepfabric/topic_manager.py +303 -0
  57. deepfabric/topic_model.py +20 -0
  58. deepfabric/training/__init__.py +35 -0
  59. deepfabric/training/api_key_prompt.py +302 -0
  60. deepfabric/training/callback.py +363 -0
  61. deepfabric/training/metrics_sender.py +301 -0
  62. deepfabric/tree.py +438 -0
  63. deepfabric/tui.py +1267 -0
  64. deepfabric/update_checker.py +166 -0
  65. deepfabric/utils.py +150 -0
  66. deepfabric/validation.py +143 -0
  67. deepfabric-4.4.0.dist-info/METADATA +702 -0
  68. deepfabric-4.4.0.dist-info/RECORD +71 -0
  69. deepfabric-4.4.0.dist-info/WHEEL +4 -0
  70. deepfabric-4.4.0.dist-info/entry_points.txt +2 -0
  71. deepfabric-4.4.0.dist-info/licenses/LICENSE +201 -0
deepfabric/evaluation/metrics.py
@@ -0,0 +1,397 @@
+ """Metrics computation for model evaluation."""
+
+ from typing import Any
+
+ from pydantic import BaseModel, Field
+
+ from ..schemas import ToolDefinition
+
+ # Tolerance for numeric comparison
+ NUMERIC_TOLERANCE = 1e-6
+
+ # Type validation dispatch table
+ _TYPE_CHECKS = {
+     "str": lambda v: isinstance(v, str),
+     "int": lambda v: isinstance(v, int) and not isinstance(v, bool),
+     "float": lambda v: isinstance(v, int | float) and not isinstance(v, bool),
+     "bool": lambda v: isinstance(v, bool),
+     "list": lambda v: isinstance(v, list),
+     "dict": lambda v: isinstance(v, dict),
+ }
+
+
+ def _is_valid_type(schema_type: str, value: Any) -> bool:
+     """Check if value matches schema type.
+
+     Args:
+         schema_type: Schema type string ("str", "int", "float", "bool", "list", "dict")
+         value: Value to check
+
+     Returns:
+         True if value matches type, False otherwise
+     """
+     check = _TYPE_CHECKS.get(schema_type)
+     return check(value) if check else False
+
+
+ def _validate_parameter_types(
+     predicted_params: dict[str, Any],
+     tool_def: ToolDefinition,
+ ) -> bool:
+     """Validate parameter types against tool schema.
+
+     Checks that:
+     1. All required parameters are present
+     2. Parameter types match schema (with type coercion)
+     3. Ignores actual values - only validates structure
+
+     Args:
+         predicted_params: Parameters to validate
+         tool_def: Tool definition with schema
+
+     Returns:
+         True if types are valid, False otherwise
+     """
+     # Create lookup for parameters by name
+     schema_params = {p.name: p for p in tool_def.parameters}
+
+     # Check all required parameters are present
+     for param_name, param_schema in schema_params.items():
+         if param_schema.required and param_name not in predicted_params:
+             return False
+
+     # Check types for each predicted parameter
+     for param_name, predicted_value in predicted_params.items():
+         # Skip extra parameters not in schema (allow for flexibility)
+         if param_name not in schema_params:
+             continue
+
+         schema_param = schema_params[param_name]
+         if not _is_valid_type(schema_param.type, predicted_value):
+             return False
+
+     return True
+
+
+ class EvaluationMetrics(BaseModel):
+     """Computed evaluation metrics."""
+
+     tool_selection_accuracy: float = Field(
+         description="Accuracy of tool selection (0.0-1.0)",
+     )
+     parameter_accuracy: float = Field(
+         description="Accuracy of parameter extraction (0.0-1.0)",
+     )
+     execution_success_rate: float = Field(
+         description="Rate of valid tool calls (0.0-1.0)",
+     )
+     response_quality: float = Field(
+         description="Quality of final response (0.0-1.0)",
+     )
+     overall_score: float = Field(
+         description="Weighted overall score (0.0-1.0)",
+     )
+     samples_evaluated: int = Field(
+         description="Total number of samples evaluated",
+     )
+     samples_processed: int = Field(
+         description="Number of samples processed without system errors",
+     )
+     processing_errors: int = Field(
+         description="Number of samples that failed to process (system errors, timeouts)",
+     )
+
+
+ class SampleEvaluation(BaseModel):
+     """Evaluation result for a single sample."""
+
+     sample_id: int = Field(description="Sample index")
+     query: str = Field(description="Input query")
+     expected_tool: str | None = Field(
+         default=None,
+         description="Expected tool name",
+     )
+     predicted_tool: str | None = Field(
+         default=None,
+         description="Predicted tool name",
+     )
+     expected_parameters: dict[str, Any] = Field(
+         default_factory=dict,
+         description="Expected parameters",
+     )
+     predicted_parameters: dict[str, Any] = Field(
+         default_factory=dict,
+         description="Predicted parameters",
+     )
+     expected_answer: str | None = Field(
+         default=None,
+         description="Expected final answer",
+     )
+     predicted_answer: str | None = Field(
+         default=None,
+         description="Predicted final answer",
+     )
+     tool_selection_correct: bool = Field(
+         description="Whether tool selection was correct",
+     )
+     parameters_correct: bool = Field(
+         description="Whether parameters were correct",
+     )
+     execution_valid: bool = Field(
+         description="Whether the tool call could be executed",
+     )
+     response_score: float = Field(
+         description="Response quality score (0.0-1.0)",
+     )
+     error: str | None = Field(
+         default=None,
+         description="Error message if prediction failed",
+     )
+
+
+ def compute_tool_selection_accuracy(
+     evaluations: list[SampleEvaluation],
+ ) -> float:
+     """Compute tool selection accuracy.
+
+     Args:
+         evaluations: List of sample evaluations
+
+     Returns:
+         Accuracy score (0.0-1.0)
+     """
+     if not evaluations:
+         return 0.0
+
+     correct = sum(1 for e in evaluations if e.tool_selection_correct)
+     return correct / len(evaluations)
+
+
+ def compute_parameter_accuracy(
+     evaluations: list[SampleEvaluation],
+ ) -> float:
+     """Compute parameter extraction accuracy.
+
+     Args:
+         evaluations: List of sample evaluations
+
+     Returns:
+         Accuracy score (0.0-1.0)
+     """
+     if not evaluations:
+         return 0.0
+
+     correct = sum(1 for e in evaluations if e.parameters_correct)
+     return correct / len(evaluations)
+
+
+ def compute_execution_success_rate(
+     evaluations: list[SampleEvaluation],
+ ) -> float:
+     """Compute execution success rate.
+
+     Args:
+         evaluations: List of sample evaluations
+
+     Returns:
+         Success rate (0.0-1.0)
+     """
+     if not evaluations:
+         return 0.0
+
+     valid = sum(1 for e in evaluations if e.execution_valid)
+     return valid / len(evaluations)
+
+
+ def compute_response_quality(
+     evaluations: list[SampleEvaluation],
+ ) -> float:
+     """Compute average response quality.
+
+     Args:
+         evaluations: List of sample evaluations
+
+     Returns:
+         Average quality score (0.0-1.0)
+     """
+     if not evaluations:
+         return 0.0
+
+     total_score = sum(e.response_score for e in evaluations)
+     return total_score / len(evaluations)
+
+
+ def compute_overall_score(
+     tool_accuracy: float,
+     param_accuracy: float,
+     exec_success: float,
+     response_quality: float,
+     weights: dict[str, float] | None = None,
+ ) -> float:
+     """Compute weighted overall score.
+
+     Args:
+         tool_accuracy: Tool selection accuracy
+         param_accuracy: Parameter accuracy
+         exec_success: Execution success rate
+         response_quality: Response quality score
+         weights: Custom weights for each metric (defaults used if None)
+
+     Returns:
+         Weighted overall score (0.0-1.0)
+     """
+     # Default weights (response_quality excluded for tool-calling mode)
+     if weights is None:
+         weights = {
+             "tool_selection": 0.40,
+             "parameter_accuracy": 0.35,
+             "execution_success": 0.25,
+             "response_quality": 0.00,  # Not used for tool-calling evaluation
+         }
+
+     return (
+         tool_accuracy * weights.get("tool_selection", 0.0)
+         + param_accuracy * weights.get("parameter_accuracy", 0.0)
+         + exec_success * weights.get("execution_success", 0.0)
+         + response_quality * weights.get("response_quality", 0.0)
+     )
+
+
+ def compute_metrics(
+     evaluations: list[SampleEvaluation],
+     weights: dict[str, float] | None = None,
+ ) -> EvaluationMetrics:
+     """Compute all evaluation metrics from sample evaluations.
+
+     Args:
+         evaluations: List of sample evaluations
+         weights: Custom weights for overall score computation
+
+     Returns:
+         EvaluationMetrics with all computed scores
+     """
+     if not evaluations:
+         return EvaluationMetrics(
+             tool_selection_accuracy=0.0,
+             parameter_accuracy=0.0,
+             execution_success_rate=0.0,
+             response_quality=0.0,
+             overall_score=0.0,
+             samples_evaluated=0,
+             samples_processed=0,
+             processing_errors=0,
+         )
+
+     tool_acc = compute_tool_selection_accuracy(evaluations)
+     param_acc = compute_parameter_accuracy(evaluations)
+     exec_success = compute_execution_success_rate(evaluations)
+     resp_quality = compute_response_quality(evaluations)
+
+     overall = compute_overall_score(
+         tool_acc,
+         param_acc,
+         exec_success,
+         resp_quality,
+         weights,
+     )
+
+     # Count processing status (system errors vs successfully processed)
+     processed = sum(1 for e in evaluations if e.error is None)
+     errors = len(evaluations) - processed
+
+     return EvaluationMetrics(
+         tool_selection_accuracy=tool_acc,
+         parameter_accuracy=param_acc,
+         execution_success_rate=exec_success,
+         response_quality=resp_quality,
+         overall_score=overall,
+         samples_evaluated=len(evaluations),
+         samples_processed=processed,
+         processing_errors=errors,
+     )
+
+
+ def compare_parameters(  # noqa: PLR0911
+     expected: dict[str, Any],
+     predicted: dict[str, Any],
+     tool_name: str | None = None,
+     tool_definitions: list[ToolDefinition] | None = None,
+ ) -> bool:
+     """Compare expected and predicted parameters.
+
+     If tool schema is provided, validates parameter types and presence of required params.
+     Otherwise, performs value-based comparison (legacy behavior for backward compatibility).
+
+     Args:
+         expected: Expected parameters
+         predicted: Predicted parameters
+         tool_name: Name of the tool being called (for schema lookup)
+         tool_definitions: List of tool definitions with schemas
+
+     Returns:
+         True if parameters match (schema-aware) or values match (legacy), False otherwise
+     """
+     if not expected and not predicted:
+         return True
+
+     # Schema-aware validation if tool definition available
+     if tool_name and tool_definitions:
+         tool_def = next((t for t in tool_definitions if t.name == tool_name), None)
+         if tool_def:
+             return _validate_parameter_types(predicted, tool_def)
+
+     # Legacy value-based comparison (backward compatibility)
+     # Check if all expected keys are present
+     if set(expected.keys()) != set(predicted.keys()):
+         return False
+
+     # Compare values
+     for key, expected_val in expected.items():
+         predicted_val = predicted.get(key)
+
+         # Handle different types
+         if isinstance(expected_val, str) and isinstance(predicted_val, str):
+             # Case-insensitive string comparison
+             if expected_val.lower().strip() != predicted_val.lower().strip():
+                 return False
+         elif isinstance(expected_val, int | float) and isinstance(predicted_val, int | float):
+             # Numeric comparison with small tolerance
+             if abs(float(expected_val) - float(predicted_val)) > NUMERIC_TOLERANCE:
+                 return False
+         elif expected_val != predicted_val:
+             # Exact match for other types
+             return False
+
+     return True
+
+
+ def compute_response_similarity(
+     expected: str | None,
+     predicted: str | None,
+ ) -> float:
+     """Compute similarity between expected and predicted responses.
+
+     Uses simple word overlap for now. Can be enhanced with semantic similarity.
+
+     Args:
+         expected: Expected response
+         predicted: Predicted response
+
+     Returns:
+         Similarity score (0.0-1.0)
+     """
+     if not expected or not predicted:
+         return 0.0 if expected != predicted else 1.0
+
+     # Tokenize and normalize
+     expected_words = set(expected.lower().split())
+     predicted_words = set(predicted.lower().split())
+
+     # Compute Jaccard similarity
+     if not expected_words and not predicted_words:
+         return 1.0
+
+     intersection = expected_words & predicted_words
+     union = expected_words | predicted_words
+
+     return len(intersection) / len(union) if union else 0.0
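Illustrative usage (not part of the wheel): a minimal sketch of driving the metrics module above end to end, building SampleEvaluation records, reusing compare_parameters for the legacy value comparison, and aggregating with compute_metrics. The import path follows the file layout listed at the top of this diff; the queries, tool names, and scores are hypothetical.

from deepfabric.evaluation.metrics import (
    SampleEvaluation,
    compare_parameters,
    compute_metrics,
)

# Two hypothetical samples: one fully correct, one mispredicted.
evaluations = [
    SampleEvaluation(
        sample_id=0,
        query="What is the weather in Paris?",
        expected_tool="get_weather",
        predicted_tool="get_weather",
        expected_parameters={"city": "Paris"},
        predicted_parameters={"city": "paris"},
        tool_selection_correct=True,
        # Legacy value comparison: strings are matched case-insensitively.
        parameters_correct=compare_parameters({"city": "Paris"}, {"city": "paris"}),
        execution_valid=True,
        response_score=0.8,
    ),
    SampleEvaluation(
        sample_id=1,
        query="Convert 10 km to miles",
        expected_tool="convert_units",
        predicted_tool="calculator",
        tool_selection_correct=False,
        parameters_correct=False,
        execution_valid=False,
        response_score=0.2,
    ),
]

metrics = compute_metrics(evaluations)
print(metrics.tool_selection_accuracy)  # 0.5 (1 of 2 correct)
# Default weights: 0.40 * 0.5 + 0.35 * 0.5 + 0.25 * 0.5 + 0.00 * 0.5 = 0.5
print(metrics.overall_score)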
deepfabric/evaluation/parser.py
@@ -0,0 +1,304 @@
+ """Ground truth parsing from DeepFabric dataset samples."""
+
+ import json
+ import re
+
+ from typing import Any, Literal
+
+ from pydantic import BaseModel, Field
+
+ from ..schemas import Conversation, ToolDefinition
+
+
+ class ExpectedToolCall(BaseModel):
+     """A single expected tool call with its parameters."""
+
+     tool_name: str = Field(description="Name of the tool")
+     parameters: dict[str, Any] = Field(
+         default_factory=dict,
+         description="Parameter names and values",
+     )
+
+     def signature(self) -> str:
+         """Return a hashable signature for deduplication."""
+         params_str = json.dumps(self.parameters, sort_keys=True)
+         return f"{self.tool_name}:{params_str}"
+
+
+ class GroundTruth(BaseModel):
+     """Parsed ground truth from original dataset sample."""
+
+     query: str = Field(description="The user query")
+     expected_tool: str | None = Field(
+         default=None,
+         description="Expected tool name - first tool (None if no tool use). Kept for backwards compatibility.",
+     )
+     expected_parameters: dict[str, str | int | float | bool | list | dict] = Field(
+         default_factory=dict,
+         description="Expected tool parameters - first tool. Kept for backwards compatibility.",
+     )
+     expected_tools: list[ExpectedToolCall] = Field(
+         default_factory=list,
+         description="All unique expected tool calls (deduplicated by tool_name + parameters)",
+     )
+     tool_schema: ToolDefinition | None = Field(
+         default=None,
+         description="Tool schema from available_tools",
+     )
+     expected_answer: str | None = Field(
+         default=None,
+         description="Expected final answer if available",
+     )
+     conversation_type: Literal["basic", "chain_of_thought"] = Field(
+         description="Type of conversation",
+     )
+     reasoning_style: Literal["freetext", "agent", "structured", "hybrid"] | None = Field(
+         default=None,
+         description="Reasoning style if chain_of_thought",
+     )
+     agent_mode: Literal["single_turn", "multi_turn"] | None = Field(
+         default=None,
+         description="Agent mode if tools are used",
+     )
+     metadata: dict[str, str | int | float | bool] = Field(
+         default_factory=dict,
+         description="Additional metadata",
+     )
+
+
+ class GroundTruthParser:
+     """Parse ground truth from original DeepFabric JSONL format.
+
+     This parser extracts expected tools, parameters, and answers from
+     Conversation objects while handling all conversation types and agent modes.
+     """
+
+     def __init__(
+         self,
+         conversation_type: Literal["basic", "chain_of_thought"],
+         reasoning_style: Literal["freetext", "agent", "structured", "hybrid"] | None = None,
+         agent_mode: Literal["single_turn", "multi_turn"] | None = None,
+     ):
+         """Initialize parser with conversation configuration.
+
+         Args:
+             conversation_type: Type of conversation (basic, chain_of_thought)
+             reasoning_style: Reasoning style for chain_of_thought
+             agent_mode: Agent mode if tools are used
+         """
+         self.conversation_type: Literal["basic", "chain_of_thought"] = conversation_type
+         self.reasoning_style: Literal["freetext", "agent", "structured", "hybrid"] | None = (
+             reasoning_style
+         )
+         self.agent_mode: Literal["single_turn", "multi_turn"] | None = agent_mode
+
+     def parse(self, conversation: Conversation) -> GroundTruth:
+         """Extract ground truth from a conversation sample.
+
+         Args:
+             conversation: Conversation object from dataset
+
+         Returns:
+             GroundTruth with expected values
+
+         Raises:
+             ValueError: If conversation format is invalid
+         """
+         # Extract query from first user message
+         query = self._extract_query(conversation)
+
+         # Extract expected tool and parameters if tool_context present
+         expected_tool: str | None = None
+         expected_parameters: dict = {}
+         expected_tools: list[ExpectedToolCall] = []
+         tool_schema: ToolDefinition | None = None
+
+         executions = (
+             conversation.tool_context.executions
+             if conversation.tool_context is not None and conversation.tool_context.executions
+             else []
+         )
+         if executions:
+             # Get first tool execution for backwards compatibility
+             first_execution = executions[0]
+             expected_tool = first_execution.function_name
+             expected_parameters = first_execution.parsed_arguments
+
+             # Extract ALL tool executions and deduplicate
+             seen_signatures: set[str] = set()
+             for execution in executions:
+                 tool_call = ExpectedToolCall(
+                     tool_name=execution.function_name,
+                     parameters=execution.parsed_arguments,
+                 )
+                 sig = tool_call.signature()
+                 if sig not in seen_signatures:
+                     seen_signatures.add(sig)
+                     expected_tools.append(tool_call)
+
+         # Get tool schema from tools field (OpenAI format)
+         if conversation.tools:
+             available_tools = [ToolDefinition.from_openai(tool) for tool in conversation.tools]
+             tool_schema = self._get_tool_schema(available_tools, expected_tool)
+
+         # Extract expected answer
+         expected_answer = self._extract_expected_answer(conversation)
+
+         # Extract metadata
+         metadata_dict: dict[str, str | int | float | bool] = {}
+         if conversation.metadata:
+             # Filter to only simple types
+             for key, value in conversation.metadata.items():
+                 if isinstance(value, str | int | float | bool):
+                     metadata_dict[key] = value
+
+         return GroundTruth(
+             query=query,
+             expected_tool=expected_tool,
+             expected_parameters=expected_parameters,
+             expected_tools=expected_tools,
+             tool_schema=tool_schema,
+             expected_answer=expected_answer,
+             conversation_type=self.conversation_type,
+             reasoning_style=self.reasoning_style,
+             agent_mode=self.agent_mode,
+             metadata=metadata_dict,
+         )
+
+     def _extract_query(self, conversation: Conversation) -> str:
+         """Extract user query from conversation messages.
+
+         Args:
+             conversation: Conversation object
+
+         Returns:
+             User query string
+
+         Raises:
+             ValueError: If no user message found
+         """
+         # Find first user message
+         for message in conversation.messages:
+             if message.role == "user":
+                 return message.content
+
+         # Fallback to question field if present
+         if conversation.question:
+             return conversation.question
+
+         raise ValueError("No user query found in conversation")
+
+     def _get_tool_schema(
+         self,
+         available_tools: list[ToolDefinition],
+         tool_name: str,
+     ) -> ToolDefinition | None:
+         """Get tool schema by name from available tools.
+
+         Args:
+             available_tools: List of available tool definitions
+             tool_name: Name of tool to find
+
+         Returns:
+             ToolDefinition if found, None otherwise
+         """
+         for tool in available_tools:
+             if tool.name == tool_name:
+                 return tool
+         return None
+
+     def _extract_expected_answer(self, conversation: Conversation) -> str | None:
+         """Extract expected answer from conversation.
+
+         Args:
+             conversation: Conversation object
+
+         Returns:
+             Expected answer if available, None otherwise
+         """
+         # Check final_answer field first
+         if conversation.final_answer:
+             return conversation.final_answer
+
+         # For tool-calling conversations, answer is in last assistant message
+         # after tool execution
+         if conversation.tool_context:
+             # Find last assistant message
+             for message in reversed(conversation.messages):
+                 if message.role == "assistant" and not self._contains_tool_call(message.content):
+                     # Skip messages that contain tool calls
+                     return message.content
+
+         # For basic conversations, last assistant message is the answer
+         for message in reversed(conversation.messages):
+             if message.role == "assistant":
+                 return message.content
+
+         return None
+
+     def _contains_tool_call(self, content: str) -> bool:
+         """Check if message content contains a tool call.
+
+         Looks for common tool call patterns:
+         - XML: <tool_call>...</tool_call>
+         - JSON: {"tool_calls": ...}
+         - Function: function_name(...)
+
+         Args:
+             content: Message content
+
+         Returns:
+             True if tool call detected
+         """
+         # Check for XML tool call tags
+         if "<tool_call>" in content or "</tool_call>" in content:
+             return True
+
+         # Check for JSON tool calls
+         if "{" in content and "tool_calls" in content:
+             try:
+                 data = json.loads(content)
+                 if "tool_calls" in data or "function_call" in data:
+                     return True
+             except json.JSONDecodeError:
+                 pass
+
+         # Check for function call pattern: func_name(arg1, arg2)
+         func_pattern = r"\b[a-z_][a-z0-9_]*\s*\([^)]*\)"
+         return bool(re.search(func_pattern, content.lower()))
+
+
+ def parse_batch(
+     conversations: list[Conversation],
+     conversation_type: Literal["basic", "chain_of_thought"],
+     reasoning_style: Literal["freetext", "agent", "structured", "hybrid"] | None = None,
+     agent_mode: Literal["single_turn", "multi_turn"] | None = None,
+ ) -> list[GroundTruth]:
+     """Parse a batch of conversations to extract ground truth.
+
+     Args:
+         conversations: List of Conversation objects
+         conversation_type: Type of conversation
+         reasoning_style: Reasoning style if chain_of_thought
+         agent_mode: Agent mode if tools are used
+
+     Returns:
+         List of GroundTruth objects
+     """
+     parser = GroundTruthParser(
+         conversation_type=conversation_type,
+         reasoning_style=reasoning_style,
+         agent_mode=agent_mode,
+     )
+
+     ground_truths: list[GroundTruth] = []
+     for conversation in conversations:
+         try:
+             gt = parser.parse(conversation)
+             ground_truths.append(gt)
+         except ValueError as e:
+             # Log error but continue processing
+             print(f"Warning: Failed to parse conversation: {e}")
+             continue
+
+     return ground_truths
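As a companion to the parser, here is an illustrative glue sketch (not the package's own evaluator, whose actual pipeline lives in deepfabric/evaluation/evaluator.py) showing how GroundTruth objects from parse_batch() could be paired with model predictions and scored with the metrics module shown earlier. The score_predictions helper and the shape of the predictions list are hypothetical.

from deepfabric.evaluation.metrics import (
    SampleEvaluation,
    compare_parameters,
    compute_metrics,
    compute_response_similarity,
)


def score_predictions(ground_truths, predictions):
    """Pair GroundTruth objects with predictions and compute aggregate metrics.

    `ground_truths` is the list returned by parse_batch(); `predictions` is
    assumed to be a parallel list of dicts like
    {"tool": str | None, "parameters": dict, "answer": str | None}.
    """
    evaluations = []
    for i, (gt, pred) in enumerate(zip(ground_truths, predictions)):
        tool_ok = gt.expected_tool == pred.get("tool")
        # Use schema-aware comparison when the parser captured a tool schema.
        tool_defs = [gt.tool_schema] if gt.tool_schema else None
        params_ok = compare_parameters(
            gt.expected_parameters,
            pred.get("parameters", {}),
            tool_name=pred.get("tool"),
            tool_definitions=tool_defs,
        )
        evaluations.append(
            SampleEvaluation(
                sample_id=i,
                query=gt.query,
                expected_tool=gt.expected_tool,
                predicted_tool=pred.get("tool"),
                expected_parameters=gt.expected_parameters,
                predicted_parameters=pred.get("parameters", {}),
                expected_answer=gt.expected_answer,
                predicted_answer=pred.get("answer"),
                tool_selection_correct=tool_ok,
                parameters_correct=params_ok,
                # Treat any parsed tool call as "executable" for this sketch.
                execution_valid=pred.get("tool") is not None,
                response_score=compute_response_similarity(
                    gt.expected_answer, pred.get("answer")
                ),
            )
        )
    return compute_metrics(evaluations)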
deepfabric/evaluation/reporters/__init__.py
@@ -0,0 +1,13 @@
+ """Reporters for evaluation result output."""
+
+ from .base import BaseReporter
+ from .cloud_reporter import CloudReporter
+ from .file_reporter import FileReporter
+ from .multi_reporter import MultiReporter
+
+ __all__ = [
+     "BaseReporter",
+     "FileReporter",
+     "CloudReporter",
+     "MultiReporter",
+ ]