pydantic-evals 1.0.17.tar.gz → 1.1.0.tar.gz

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
Files changed (24)
  1. {pydantic_evals-1.0.17 → pydantic_evals-1.1.0}/PKG-INFO +2 -2
  2. {pydantic_evals-1.0.17 → pydantic_evals-1.1.0}/pydantic_evals/reporting/__init__.py +64 -6
  3. {pydantic_evals-1.0.17 → pydantic_evals-1.1.0}/.gitignore +0 -0
  4. {pydantic_evals-1.0.17 → pydantic_evals-1.1.0}/LICENSE +0 -0
  5. {pydantic_evals-1.0.17 → pydantic_evals-1.1.0}/README.md +0 -0
  6. {pydantic_evals-1.0.17 → pydantic_evals-1.1.0}/pydantic_evals/__init__.py +0 -0
  7. {pydantic_evals-1.0.17 → pydantic_evals-1.1.0}/pydantic_evals/_utils.py +0 -0
  8. {pydantic_evals-1.0.17 → pydantic_evals-1.1.0}/pydantic_evals/dataset.py +0 -0
  9. {pydantic_evals-1.0.17 → pydantic_evals-1.1.0}/pydantic_evals/evaluators/__init__.py +0 -0
  10. {pydantic_evals-1.0.17 → pydantic_evals-1.1.0}/pydantic_evals/evaluators/_run_evaluator.py +0 -0
  11. {pydantic_evals-1.0.17 → pydantic_evals-1.1.0}/pydantic_evals/evaluators/common.py +0 -0
  12. {pydantic_evals-1.0.17 → pydantic_evals-1.1.0}/pydantic_evals/evaluators/context.py +0 -0
  13. {pydantic_evals-1.0.17 → pydantic_evals-1.1.0}/pydantic_evals/evaluators/evaluator.py +0 -0
  14. {pydantic_evals-1.0.17 → pydantic_evals-1.1.0}/pydantic_evals/evaluators/llm_as_a_judge.py +0 -0
  15. {pydantic_evals-1.0.17 → pydantic_evals-1.1.0}/pydantic_evals/evaluators/spec.py +0 -0
  16. {pydantic_evals-1.0.17 → pydantic_evals-1.1.0}/pydantic_evals/generation.py +0 -0
  17. {pydantic_evals-1.0.17 → pydantic_evals-1.1.0}/pydantic_evals/otel/__init__.py +0 -0
  18. {pydantic_evals-1.0.17 → pydantic_evals-1.1.0}/pydantic_evals/otel/_context_in_memory_span_exporter.py +0 -0
  19. {pydantic_evals-1.0.17 → pydantic_evals-1.1.0}/pydantic_evals/otel/_context_subtree.py +0 -0
  20. {pydantic_evals-1.0.17 → pydantic_evals-1.1.0}/pydantic_evals/otel/_errors.py +0 -0
  21. {pydantic_evals-1.0.17 → pydantic_evals-1.1.0}/pydantic_evals/otel/span_tree.py +0 -0
  22. {pydantic_evals-1.0.17 → pydantic_evals-1.1.0}/pydantic_evals/py.typed +0 -0
  23. {pydantic_evals-1.0.17 → pydantic_evals-1.1.0}/pydantic_evals/reporting/render_numbers.py +0 -0
  24. {pydantic_evals-1.0.17 → pydantic_evals-1.1.0}/pyproject.toml +0 -0
{pydantic_evals-1.0.17 → pydantic_evals-1.1.0}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: pydantic-evals
-Version: 1.0.17
+Version: 1.1.0
 Summary: Framework for evaluating stochastic code execution, especially code making use of LLMs
 Project-URL: Homepage, https://ai.pydantic.dev/evals
 Project-URL: Source, https://github.com/pydantic/pydantic-ai
@@ -30,7 +30,7 @@ Classifier: Topic :: Software Development :: Libraries :: Python Modules
 Requires-Python: >=3.10
 Requires-Dist: anyio>=0
 Requires-Dist: logfire-api>=3.14.1
-Requires-Dist: pydantic-ai-slim==1.0.17
+Requires-Dist: pydantic-ai-slim==1.1.0
 Requires-Dist: pydantic>=2.10
 Requires-Dist: pyyaml>=6.0.2
 Requires-Dist: rich>=13.9.4
{pydantic_evals-1.0.17 → pydantic_evals-1.1.0}/pydantic_evals/reporting/__init__.py

@@ -206,11 +206,70 @@ class EvaluationReport(Generic[InputsT, OutputT, MetadataT]):
             return ReportCaseAggregate.average(self.cases)
         return None
 
+    def render(
+        self,
+        width: int | None = None,
+        baseline: EvaluationReport[InputsT, OutputT, MetadataT] | None = None,
+        *,
+        include_input: bool = False,
+        include_metadata: bool = False,
+        include_expected_output: bool = False,
+        include_output: bool = False,
+        include_durations: bool = True,
+        include_total_duration: bool = False,
+        include_removed_cases: bool = False,
+        include_averages: bool = True,
+        include_errors: bool = True,
+        include_error_stacktrace: bool = False,
+        include_evaluator_failures: bool = True,
+        input_config: RenderValueConfig | None = None,
+        metadata_config: RenderValueConfig | None = None,
+        output_config: RenderValueConfig | None = None,
+        score_configs: dict[str, RenderNumberConfig] | None = None,
+        label_configs: dict[str, RenderValueConfig] | None = None,
+        metric_configs: dict[str, RenderNumberConfig] | None = None,
+        duration_config: RenderNumberConfig | None = None,
+        include_reasons: bool = False,
+    ) -> str:  # pragma: no cover
+        """Render this report to a nicely-formatted string, optionally comparing it to a baseline report.
+
+        If you want more control over the output, use `console_table` instead and pass it to `rich.Console.print`.
+        """
+        io_file = StringIO()
+        console = Console(width=width, file=io_file)
+        self.print(
+            width=width,
+            baseline=baseline,
+            console=console,
+            include_input=include_input,
+            include_metadata=include_metadata,
+            include_expected_output=include_expected_output,
+            include_output=include_output,
+            include_durations=include_durations,
+            include_total_duration=include_total_duration,
+            include_removed_cases=include_removed_cases,
+            include_averages=include_averages,
+            include_errors=include_errors,
+            include_error_stacktrace=include_error_stacktrace,
+            include_evaluator_failures=include_evaluator_failures,
+            input_config=input_config,
+            metadata_config=metadata_config,
+            output_config=output_config,
+            score_configs=score_configs,
+            label_configs=label_configs,
+            metric_configs=metric_configs,
+            duration_config=duration_config,
+            include_reasons=include_reasons,
+        )
+        Console(file=io_file)
+        return io_file.getvalue()
+
     def print(
         self,
         width: int | None = None,
         baseline: EvaluationReport[InputsT, OutputT, MetadataT] | None = None,
         *,
+        console: Console | None = None,
         include_input: bool = False,
         include_metadata: bool = False,
         include_expected_output: bool = False,
@@ -230,11 +289,14 @@ class EvaluationReport(Generic[InputsT, OutputT, MetadataT]):
         metric_configs: dict[str, RenderNumberConfig] | None = None,
         duration_config: RenderNumberConfig | None = None,
         include_reasons: bool = False,
-    ):  # pragma: no cover
+    ) -> None:  # pragma: no cover
         """Print this report to the console, optionally comparing it to a baseline report.
 
         If you want more control over the output, use `console_table` instead and pass it to `rich.Console.print`.
         """
+        if console is None:
+            console = Console(width=width)
+
         table = self.console_table(
             baseline=baseline,
             include_input=include_input,
@@ -255,7 +317,6 @@ class EvaluationReport(Generic[InputsT, OutputT, MetadataT]):
             duration_config=duration_config,
             include_reasons=include_reasons,
         )
-        console = Console(width=width)
         console.print(table)
         if include_errors and self.failures:
             failures_table = self.failures_table(
@@ -358,10 +419,7 @@ class EvaluationReport(Generic[InputsT, OutputT, MetadataT]):
 
     def __str__(self) -> str:  # pragma: lax no cover
         """Return a string representation of the report."""
-        table = self.console_table()
-        io_file = StringIO()
-        Console(file=io_file).print(table)
-        return io_file.getvalue()
+        return self.render()
 
 
 EvaluationReportAdapter = TypeAdapter(EvaluationReport[Any, Any, Any])
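
For orientation, a minimal sketch of how the changed API surface might be used: the new `render()` returns the formatted report as a string, and `print()` now accepts a caller-supplied rich `Console` instead of always creating its own. The dataset, case, and task below are illustrative stand-ins and not part of this diff; the sketch assumes the documented `Case`, `Dataset`, and `Dataset.evaluate_sync` entry points.

# Usage sketch only; names other than render()/print(console=...) are illustrative.
from rich.console import Console

from pydantic_evals import Case, Dataset


async def answer(question: str) -> str:
    # Hypothetical task under evaluation.
    return 'Paris'


dataset = Dataset(
    cases=[Case(inputs='What is the capital of France?', expected_output='Paris')]
)
report = dataset.evaluate_sync(answer)

# New in 1.1.0: render() returns the report table as a string.
text = report.render(width=120, include_output=True)
print(text)

# print() now takes an optional pre-configured Console; previously it always
# built Console(width=width) internally.
console = Console(record=True)
report.print(console=console, include_durations=True)
html = console.export_html()  # possible because this Console records output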