deepfabric-4.4.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepfabric/__init__.py +70 -0
- deepfabric/__main__.py +6 -0
- deepfabric/auth.py +382 -0
- deepfabric/builders.py +303 -0
- deepfabric/builders_agent.py +1304 -0
- deepfabric/cli.py +1288 -0
- deepfabric/config.py +899 -0
- deepfabric/config_manager.py +251 -0
- deepfabric/constants.py +94 -0
- deepfabric/dataset_manager.py +534 -0
- deepfabric/error_codes.py +581 -0
- deepfabric/evaluation/__init__.py +47 -0
- deepfabric/evaluation/backends/__init__.py +32 -0
- deepfabric/evaluation/backends/ollama_backend.py +137 -0
- deepfabric/evaluation/backends/tool_call_parsers.py +409 -0
- deepfabric/evaluation/backends/transformers_backend.py +326 -0
- deepfabric/evaluation/evaluator.py +845 -0
- deepfabric/evaluation/evaluators/__init__.py +13 -0
- deepfabric/evaluation/evaluators/base.py +104 -0
- deepfabric/evaluation/evaluators/builtin/__init__.py +5 -0
- deepfabric/evaluation/evaluators/builtin/tool_calling.py +93 -0
- deepfabric/evaluation/evaluators/registry.py +66 -0
- deepfabric/evaluation/inference.py +155 -0
- deepfabric/evaluation/metrics.py +397 -0
- deepfabric/evaluation/parser.py +304 -0
- deepfabric/evaluation/reporters/__init__.py +13 -0
- deepfabric/evaluation/reporters/base.py +56 -0
- deepfabric/evaluation/reporters/cloud_reporter.py +195 -0
- deepfabric/evaluation/reporters/file_reporter.py +61 -0
- deepfabric/evaluation/reporters/multi_reporter.py +56 -0
- deepfabric/exceptions.py +67 -0
- deepfabric/factory.py +26 -0
- deepfabric/generator.py +1084 -0
- deepfabric/graph.py +545 -0
- deepfabric/hf_hub.py +214 -0
- deepfabric/kaggle_hub.py +219 -0
- deepfabric/llm/__init__.py +41 -0
- deepfabric/llm/api_key_verifier.py +534 -0
- deepfabric/llm/client.py +1206 -0
- deepfabric/llm/errors.py +105 -0
- deepfabric/llm/rate_limit_config.py +262 -0
- deepfabric/llm/rate_limit_detector.py +278 -0
- deepfabric/llm/retry_handler.py +270 -0
- deepfabric/metrics.py +212 -0
- deepfabric/progress.py +262 -0
- deepfabric/prompts.py +290 -0
- deepfabric/schemas.py +1000 -0
- deepfabric/spin/__init__.py +6 -0
- deepfabric/spin/client.py +263 -0
- deepfabric/spin/models.py +26 -0
- deepfabric/stream_simulator.py +90 -0
- deepfabric/tools/__init__.py +5 -0
- deepfabric/tools/defaults.py +85 -0
- deepfabric/tools/loader.py +87 -0
- deepfabric/tools/mcp_client.py +677 -0
- deepfabric/topic_manager.py +303 -0
- deepfabric/topic_model.py +20 -0
- deepfabric/training/__init__.py +35 -0
- deepfabric/training/api_key_prompt.py +302 -0
- deepfabric/training/callback.py +363 -0
- deepfabric/training/metrics_sender.py +301 -0
- deepfabric/tree.py +438 -0
- deepfabric/tui.py +1267 -0
- deepfabric/update_checker.py +166 -0
- deepfabric/utils.py +150 -0
- deepfabric/validation.py +143 -0
- deepfabric-4.4.0.dist-info/METADATA +702 -0
- deepfabric-4.4.0.dist-info/RECORD +71 -0
- deepfabric-4.4.0.dist-info/WHEEL +4 -0
- deepfabric-4.4.0.dist-info/entry_points.txt +2 -0
- deepfabric-4.4.0.dist-info/licenses/LICENSE +201 -0
deepfabric/evaluation/metrics.py
@@ -0,0 +1,397 @@
"""Metrics computation for model evaluation."""

from typing import Any

from pydantic import BaseModel, Field

from ..schemas import ToolDefinition

# Tolerance for numeric comparison
NUMERIC_TOLERANCE = 1e-6

# Type validation dispatch table
_TYPE_CHECKS = {
    "str": lambda v: isinstance(v, str),
    "int": lambda v: isinstance(v, int) and not isinstance(v, bool),
    "float": lambda v: isinstance(v, int | float) and not isinstance(v, bool),
    "bool": lambda v: isinstance(v, bool),
    "list": lambda v: isinstance(v, list),
    "dict": lambda v: isinstance(v, dict),
}


def _is_valid_type(schema_type: str, value: Any) -> bool:
    """Check if value matches schema type.

    Args:
        schema_type: Schema type string ("str", "int", "float", "bool", "list", "dict")
        value: Value to check

    Returns:
        True if value matches type, False otherwise
    """
    check = _TYPE_CHECKS.get(schema_type)
    return check(value) if check else False


def _validate_parameter_types(
    predicted_params: dict[str, Any],
    tool_def: ToolDefinition,
) -> bool:
    """Validate parameter types against tool schema.

    Checks that:
    1. All required parameters are present
    2. Parameter types match schema (with type coercion)
    3. Ignores actual values - only validates structure

    Args:
        predicted_params: Parameters to validate
        tool_def: Tool definition with schema

    Returns:
        True if types are valid, False otherwise
    """
    # Create lookup for parameters by name
    schema_params = {p.name: p for p in tool_def.parameters}

    # Check all required parameters are present
    for param_name, param_schema in schema_params.items():
        if param_schema.required and param_name not in predicted_params:
            return False

    # Check types for each predicted parameter
    for param_name, predicted_value in predicted_params.items():
        # Skip extra parameters not in schema (allow for flexibility)
        if param_name not in schema_params:
            continue

        schema_param = schema_params[param_name]
        if not _is_valid_type(schema_param.type, predicted_value):
            return False

    return True


class EvaluationMetrics(BaseModel):
    """Computed evaluation metrics."""

    tool_selection_accuracy: float = Field(
        description="Accuracy of tool selection (0.0-1.0)",
    )
    parameter_accuracy: float = Field(
        description="Accuracy of parameter extraction (0.0-1.0)",
    )
    execution_success_rate: float = Field(
        description="Rate of valid tool calls (0.0-1.0)",
    )
    response_quality: float = Field(
        description="Quality of final response (0.0-1.0)",
    )
    overall_score: float = Field(
        description="Weighted overall score (0.0-1.0)",
    )
    samples_evaluated: int = Field(
        description="Total number of samples evaluated",
    )
    samples_processed: int = Field(
        description="Number of samples processed without system errors",
    )
    processing_errors: int = Field(
        description="Number of samples that failed to process (system errors, timeouts)",
    )


class SampleEvaluation(BaseModel):
    """Evaluation result for a single sample."""

    sample_id: int = Field(description="Sample index")
    query: str = Field(description="Input query")
    expected_tool: str | None = Field(
        default=None,
        description="Expected tool name",
    )
    predicted_tool: str | None = Field(
        default=None,
        description="Predicted tool name",
    )
    expected_parameters: dict[str, Any] = Field(
        default_factory=dict,
        description="Expected parameters",
    )
    predicted_parameters: dict[str, Any] = Field(
        default_factory=dict,
        description="Predicted parameters",
    )
    expected_answer: str | None = Field(
        default=None,
        description="Expected final answer",
    )
    predicted_answer: str | None = Field(
        default=None,
        description="Predicted final answer",
    )
    tool_selection_correct: bool = Field(
        description="Whether tool selection was correct",
    )
    parameters_correct: bool = Field(
        description="Whether parameters were correct",
    )
    execution_valid: bool = Field(
        description="Whether the tool call could be executed",
    )
    response_score: float = Field(
        description="Response quality score (0.0-1.0)",
    )
    error: str | None = Field(
        default=None,
        description="Error message if prediction failed",
    )


def compute_tool_selection_accuracy(
    evaluations: list[SampleEvaluation],
) -> float:
    """Compute tool selection accuracy.

    Args:
        evaluations: List of sample evaluations

    Returns:
        Accuracy score (0.0-1.0)
    """
    if not evaluations:
        return 0.0

    correct = sum(1 for e in evaluations if e.tool_selection_correct)
    return correct / len(evaluations)


def compute_parameter_accuracy(
    evaluations: list[SampleEvaluation],
) -> float:
    """Compute parameter extraction accuracy.

    Args:
        evaluations: List of sample evaluations

    Returns:
        Accuracy score (0.0-1.0)
    """
    if not evaluations:
        return 0.0

    correct = sum(1 for e in evaluations if e.parameters_correct)
    return correct / len(evaluations)


def compute_execution_success_rate(
    evaluations: list[SampleEvaluation],
) -> float:
    """Compute execution success rate.

    Args:
        evaluations: List of sample evaluations

    Returns:
        Success rate (0.0-1.0)
    """
    if not evaluations:
        return 0.0

    valid = sum(1 for e in evaluations if e.execution_valid)
    return valid / len(evaluations)


def compute_response_quality(
    evaluations: list[SampleEvaluation],
) -> float:
    """Compute average response quality.

    Args:
        evaluations: List of sample evaluations

    Returns:
        Average quality score (0.0-1.0)
    """
    if not evaluations:
        return 0.0

    total_score = sum(e.response_score for e in evaluations)
    return total_score / len(evaluations)


def compute_overall_score(
    tool_accuracy: float,
    param_accuracy: float,
    exec_success: float,
    response_quality: float,
    weights: dict[str, float] | None = None,
) -> float:
    """Compute weighted overall score.

    Args:
        tool_accuracy: Tool selection accuracy
        param_accuracy: Parameter accuracy
        exec_success: Execution success rate
        response_quality: Response quality score
        weights: Custom weights for each metric (defaults used if None)

    Returns:
        Weighted overall score (0.0-1.0)
    """
    # Default weights (response_quality excluded for tool-calling mode)
    if weights is None:
        weights = {
            "tool_selection": 0.40,
            "parameter_accuracy": 0.35,
            "execution_success": 0.25,
            "response_quality": 0.00,  # Not used for tool-calling evaluation
        }

    return (
        tool_accuracy * weights.get("tool_selection", 0.0)
        + param_accuracy * weights.get("parameter_accuracy", 0.0)
        + exec_success * weights.get("execution_success", 0.0)
        + response_quality * weights.get("response_quality", 0.0)
    )


def compute_metrics(
    evaluations: list[SampleEvaluation],
    weights: dict[str, float] | None = None,
) -> EvaluationMetrics:
    """Compute all evaluation metrics from sample evaluations.

    Args:
        evaluations: List of sample evaluations
        weights: Custom weights for overall score computation

    Returns:
        EvaluationMetrics with all computed scores
    """
    if not evaluations:
        return EvaluationMetrics(
            tool_selection_accuracy=0.0,
            parameter_accuracy=0.0,
            execution_success_rate=0.0,
            response_quality=0.0,
            overall_score=0.0,
            samples_evaluated=0,
            samples_processed=0,
            processing_errors=0,
        )

    tool_acc = compute_tool_selection_accuracy(evaluations)
    param_acc = compute_parameter_accuracy(evaluations)
    exec_success = compute_execution_success_rate(evaluations)
    resp_quality = compute_response_quality(evaluations)

    overall = compute_overall_score(
        tool_acc,
        param_acc,
        exec_success,
        resp_quality,
        weights,
    )

    # Count processing status (system errors vs successfully processed)
    processed = sum(1 for e in evaluations if e.error is None)
    errors = len(evaluations) - processed

    return EvaluationMetrics(
        tool_selection_accuracy=tool_acc,
        parameter_accuracy=param_acc,
        execution_success_rate=exec_success,
        response_quality=resp_quality,
        overall_score=overall,
        samples_evaluated=len(evaluations),
        samples_processed=processed,
        processing_errors=errors,
    )


def compare_parameters(  # noqa: PLR0911
    expected: dict[str, Any],
    predicted: dict[str, Any],
    tool_name: str | None = None,
    tool_definitions: list[ToolDefinition] | None = None,
) -> bool:
    """Compare expected and predicted parameters.

    If tool schema is provided, validates parameter types and presence of required params.
    Otherwise, performs value-based comparison (legacy behavior for backward compatibility).

    Args:
        expected: Expected parameters
        predicted: Predicted parameters
        tool_name: Name of the tool being called (for schema lookup)
        tool_definitions: List of tool definitions with schemas

    Returns:
        True if parameters match (schema-aware) or values match (legacy), False otherwise
    """
    if not expected and not predicted:
        return True

    # Schema-aware validation if tool definition available
    if tool_name and tool_definitions:
        tool_def = next((t for t in tool_definitions if t.name == tool_name), None)
        if tool_def:
            return _validate_parameter_types(predicted, tool_def)

    # Legacy value-based comparison (backward compatibility)
    # Check if all expected keys are present
    if set(expected.keys()) != set(predicted.keys()):
        return False

    # Compare values
    for key, expected_val in expected.items():
        predicted_val = predicted.get(key)

        # Handle different types
        if isinstance(expected_val, str) and isinstance(predicted_val, str):
            # Case-insensitive string comparison
            if expected_val.lower().strip() != predicted_val.lower().strip():
                return False
        elif isinstance(expected_val, int | float) and isinstance(predicted_val, int | float):
            # Numeric comparison with small tolerance
            if abs(float(expected_val) - float(predicted_val)) > NUMERIC_TOLERANCE:
                return False
        elif expected_val != predicted_val:
            # Exact match for other types
            return False

    return True


def compute_response_similarity(
    expected: str | None,
    predicted: str | None,
) -> float:
    """Compute similarity between expected and predicted responses.

    Uses simple word overlap for now. Can be enhanced with semantic similarity.

    Args:
        expected: Expected response
        predicted: Predicted response

    Returns:
        Similarity score (0.0-1.0)
    """
    if not expected or not predicted:
        return 0.0 if expected != predicted else 1.0

    # Tokenize and normalize
    expected_words = set(expected.lower().split())
    predicted_words = set(predicted.lower().split())

    # Compute Jaccard similarity
    if not expected_words and not predicted_words:
        return 1.0

    intersection = expected_words & predicted_words
    union = expected_words | predicted_words

    return len(intersection) / len(union) if union else 0.0
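For orientation, the sketch below (not part of the package) shows how the public helpers in metrics.py compose: it builds two hand-written SampleEvaluation records and aggregates them with compute_metrics. The queries, tool names, and parameter values are invented for illustration; without tool schemas, compare_parameters falls back to the value-based comparison shown above.

# Illustrative sketch, not package code. Assumes only the APIs defined in
# deepfabric/evaluation/metrics.py above; all sample values are made up.
from deepfabric.evaluation.metrics import (
    SampleEvaluation,
    compare_parameters,
    compute_metrics,
)

samples = [
    SampleEvaluation(
        sample_id=0,
        query="What's the weather in Paris?",
        expected_tool="get_weather",
        predicted_tool="get_weather",
        expected_parameters={"city": "Paris"},
        predicted_parameters={"city": "paris"},
        tool_selection_correct=True,
        # No tool schema passed, so this uses the legacy value comparison:
        # case-insensitive strings, numeric tolerance of 1e-6.
        parameters_correct=compare_parameters({"city": "Paris"}, {"city": "paris"}),
        execution_valid=True,
        response_score=0.0,
    ),
    SampleEvaluation(
        sample_id=1,
        query="Convert 10 km to miles",
        expected_tool="unit_convert",
        predicted_tool="calculator",
        tool_selection_correct=False,
        parameters_correct=False,
        execution_valid=False,
        response_score=0.0,
    ),
]

metrics = compute_metrics(samples)
# With the default weights, overall = 0.40*0.5 + 0.35*0.5 + 0.25*0.5 = 0.5.
print(metrics.overall_score, metrics.samples_processed, metrics.processing_errors)

Both samples have error=None, so samples_processed is 2 and processing_errors is 0; processing failures are tracked separately from scoring.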
deepfabric/evaluation/parser.py
@@ -0,0 +1,304 @@
"""Ground truth parsing from DeepFabric dataset samples."""

import json
import re

from typing import Any, Literal

from pydantic import BaseModel, Field

from ..schemas import Conversation, ToolDefinition


class ExpectedToolCall(BaseModel):
    """A single expected tool call with its parameters."""

    tool_name: str = Field(description="Name of the tool")
    parameters: dict[str, Any] = Field(
        default_factory=dict,
        description="Parameter names and values",
    )

    def signature(self) -> str:
        """Return a hashable signature for deduplication."""
        params_str = json.dumps(self.parameters, sort_keys=True)
        return f"{self.tool_name}:{params_str}"


class GroundTruth(BaseModel):
    """Parsed ground truth from original dataset sample."""

    query: str = Field(description="The user query")
    expected_tool: str | None = Field(
        default=None,
        description="Expected tool name - first tool (None if no tool use). Kept for backwards compatibility.",
    )
    expected_parameters: dict[str, str | int | float | bool | list | dict] = Field(
        default_factory=dict,
        description="Expected tool parameters - first tool. Kept for backwards compatibility.",
    )
    expected_tools: list[ExpectedToolCall] = Field(
        default_factory=list,
        description="All unique expected tool calls (deduplicated by tool_name + parameters)",
    )
    tool_schema: ToolDefinition | None = Field(
        default=None,
        description="Tool schema from available_tools",
    )
    expected_answer: str | None = Field(
        default=None,
        description="Expected final answer if available",
    )
    conversation_type: Literal["basic", "chain_of_thought"] = Field(
        description="Type of conversation",
    )
    reasoning_style: Literal["freetext", "agent", "structured", "hybrid"] | None = Field(
        default=None,
        description="Reasoning style if chain_of_thought",
    )
    agent_mode: Literal["single_turn", "multi_turn"] | None = Field(
        default=None,
        description="Agent mode if tools are used",
    )
    metadata: dict[str, str | int | float | bool] = Field(
        default_factory=dict,
        description="Additional metadata",
    )


class GroundTruthParser:
    """Parse ground truth from original DeepFabric JSONL format.

    This parser extracts expected tools, parameters, and answers from
    Conversation objects while handling all conversation types and agent modes.
    """

    def __init__(
        self,
        conversation_type: Literal["basic", "chain_of_thought"],
        reasoning_style: Literal["freetext", "agent", "structured", "hybrid"] | None = None,
        agent_mode: Literal["single_turn", "multi_turn"] | None = None,
    ):
        """Initialize parser with conversation configuration.

        Args:
            conversation_type: Type of conversation (basic, chain_of_thought)
            reasoning_style: Reasoning style for chain_of_thought
            agent_mode: Agent mode if tools are used
        """
        self.conversation_type: Literal["basic", "chain_of_thought"] = conversation_type
        self.reasoning_style: Literal["freetext", "agent", "structured", "hybrid"] | None = (
            reasoning_style
        )
        self.agent_mode: Literal["single_turn", "multi_turn"] | None = agent_mode

    def parse(self, conversation: Conversation) -> GroundTruth:
        """Extract ground truth from a conversation sample.

        Args:
            conversation: Conversation object from dataset

        Returns:
            GroundTruth with expected values

        Raises:
            ValueError: If conversation format is invalid
        """
        # Extract query from first user message
        query = self._extract_query(conversation)

        # Extract expected tool and parameters if tool_context present
        expected_tool: str | None = None
        expected_parameters: dict = {}
        expected_tools: list[ExpectedToolCall] = []
        tool_schema: ToolDefinition | None = None

        executions = (
            conversation.tool_context.executions
            if conversation.tool_context is not None and conversation.tool_context.executions
            else []
        )
        if executions:
            # Get first tool execution for backwards compatibility
            first_execution = executions[0]
            expected_tool = first_execution.function_name
            expected_parameters = first_execution.parsed_arguments

            # Extract ALL tool executions and deduplicate
            seen_signatures: set[str] = set()
            for execution in executions:
                tool_call = ExpectedToolCall(
                    tool_name=execution.function_name,
                    parameters=execution.parsed_arguments,
                )
                sig = tool_call.signature()
                if sig not in seen_signatures:
                    seen_signatures.add(sig)
                    expected_tools.append(tool_call)

            # Get tool schema from tools field (OpenAI format)
            if conversation.tools:
                available_tools = [ToolDefinition.from_openai(tool) for tool in conversation.tools]
                tool_schema = self._get_tool_schema(available_tools, expected_tool)

        # Extract expected answer
        expected_answer = self._extract_expected_answer(conversation)

        # Extract metadata
        metadata_dict: dict[str, str | int | float | bool] = {}
        if conversation.metadata:
            # Filter to only simple types
            for key, value in conversation.metadata.items():
                if isinstance(value, str | int | float | bool):
                    metadata_dict[key] = value

        return GroundTruth(
            query=query,
            expected_tool=expected_tool,
            expected_parameters=expected_parameters,
            expected_tools=expected_tools,
            tool_schema=tool_schema,
            expected_answer=expected_answer,
            conversation_type=self.conversation_type,
            reasoning_style=self.reasoning_style,
            agent_mode=self.agent_mode,
            metadata=metadata_dict,
        )

    def _extract_query(self, conversation: Conversation) -> str:
        """Extract user query from conversation messages.

        Args:
            conversation: Conversation object

        Returns:
            User query string

        Raises:
            ValueError: If no user message found
        """
        # Find first user message
        for message in conversation.messages:
            if message.role == "user":
                return message.content

        # Fallback to question field if present
        if conversation.question:
            return conversation.question

        raise ValueError("No user query found in conversation")

    def _get_tool_schema(
        self,
        available_tools: list[ToolDefinition],
        tool_name: str,
    ) -> ToolDefinition | None:
        """Get tool schema by name from available tools.

        Args:
            available_tools: List of available tool definitions
            tool_name: Name of tool to find

        Returns:
            ToolDefinition if found, None otherwise
        """
        for tool in available_tools:
            if tool.name == tool_name:
                return tool
        return None

    def _extract_expected_answer(self, conversation: Conversation) -> str | None:
        """Extract expected answer from conversation.

        Args:
            conversation: Conversation object

        Returns:
            Expected answer if available, None otherwise
        """
        # Check final_answer field first
        if conversation.final_answer:
            return conversation.final_answer

        # For tool-calling conversations, answer is in last assistant message
        # after tool execution
        if conversation.tool_context:
            # Find last assistant message
            for message in reversed(conversation.messages):
                if message.role == "assistant" and not self._contains_tool_call(message.content):
                    # Skip messages that contain tool calls
                    return message.content

        # For basic conversations, last assistant message is the answer
        for message in reversed(conversation.messages):
            if message.role == "assistant":
                return message.content

        return None

    def _contains_tool_call(self, content: str) -> bool:
        """Check if message content contains a tool call.

        Looks for common tool call patterns:
        - XML: <tool_call>...</tool_call>
        - JSON: {"tool_calls": ...}
        - Function: function_name(...)

        Args:
            content: Message content

        Returns:
            True if tool call detected
        """
        # Check for XML tool call tags
        if "<tool_call>" in content or "</tool_call>" in content:
            return True

        # Check for JSON tool calls
        if "{" in content and "tool_calls" in content:
            try:
                data = json.loads(content)
                if "tool_calls" in data or "function_call" in data:
                    return True
            except json.JSONDecodeError:
                pass

        # Check for function call pattern: func_name(arg1, arg2)
        func_pattern = r"\b[a-z_][a-z0-9_]*\s*\([^)]*\)"
        return bool(re.search(func_pattern, content.lower()))


def parse_batch(
    conversations: list[Conversation],
    conversation_type: Literal["basic", "chain_of_thought"],
    reasoning_style: Literal["freetext", "agent", "structured", "hybrid"] | None = None,
    agent_mode: Literal["single_turn", "multi_turn"] | None = None,
) -> list[GroundTruth]:
    """Parse a batch of conversations to extract ground truth.

    Args:
        conversations: List of Conversation objects
        conversation_type: Type of conversation
        reasoning_style: Reasoning style if chain_of_thought
        agent_mode: Agent mode if tools are used

    Returns:
        List of GroundTruth objects
    """
    parser = GroundTruthParser(
        conversation_type=conversation_type,
        reasoning_style=reasoning_style,
        agent_mode=agent_mode,
    )

    ground_truths: list[GroundTruth] = []
    for conversation in conversations:
        try:
            gt = parser.parse(conversation)
            ground_truths.append(gt)
        except ValueError as e:
            # Log error but continue processing
            print(f"Warning: Failed to parse conversation: {e}")
            continue

    return ground_truths
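A minimal usage sketch for the parser module follows (again, not part of the package). It assumes that Conversation in deepfabric.schemas is a Pydantic v2 model that will validate a minimal messages-only payload; real samples come from a DeepFabric JSONL dataset and may carry additional required fields, and the dataset path here is hypothetical.

# Illustrative sketch, not package code. Assumptions: Conversation is a
# Pydantic v2 model (model_validate available) and accepts this minimal
# payload; "dataset.jsonl" is a placeholder path.
import json

from deepfabric.evaluation.parser import parse_batch
from deepfabric.schemas import Conversation

with open("dataset.jsonl", encoding="utf-8") as f:
    conversations = [Conversation.model_validate(json.loads(line)) for line in f]

ground_truths = parse_batch(
    conversations,
    conversation_type="basic",
    agent_mode="single_turn",
)

for gt in ground_truths:
    # expected_tool / expected_parameters mirror the first tool execution
    # (kept for backwards compatibility); expected_tools holds the full
    # deduplicated list of expected calls.
    print(gt.query, gt.expected_tool, len(gt.expected_tools))

Samples that raise ValueError (for example, no user message) are skipped with a warning rather than aborting the batch, matching the behavior of parse_batch above.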
deepfabric/evaluation/reporters/__init__.py
@@ -0,0 +1,13 @@
"""Reporters for evaluation result output."""

from .base import BaseReporter
from .cloud_reporter import CloudReporter
from .file_reporter import FileReporter
from .multi_reporter import MultiReporter

__all__ = [
    "BaseReporter",
    "FileReporter",
    "CloudReporter",
    "MultiReporter",
]