pydantic-evals 1.2.1__py3-none-any.whl → 1.4.0__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.

pydantic_evals/__init__.py CHANGED
@@ -4,16 +4,13 @@ This package provides functionality for:
 - Creating and loading test datasets with structured inputs and outputs
 - Evaluating model performance using various metrics and evaluators
 - Generating reports for evaluation results
-
-TODO(DavidM): Implement serialization of reports for later comparison, and add git hashes etc.
-  Note: I made pydantic_ai.evals.reports.EvalReport a BaseModel specifically to make this easier
-TODO(DavidM): Add commit hash, timestamp, and other metadata to reports (like pytest-speed does), possibly in a dedicated struct
-TODO(DavidM): Implement a CLI with some pytest-like filtering API to make it easier to run only specific cases
 """

-from .dataset import Case, Dataset
+from .dataset import Case, Dataset, increment_eval_metric, set_eval_attribute

 __all__ = (
     'Case',
     'Dataset',
+    'increment_eval_metric',
+    'set_eval_attribute',
 )
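The two new exports are context-aware helpers that a task function can call while it is being evaluated, attaching extra data to the current case. A minimal sketch of how they might be used, assuming the helpers are no-ops when no evaluation is active; the task and dataset below are illustrative, not part of the package:

    from pydantic_evals import Case, Dataset, increment_eval_metric, set_eval_attribute

    async def uppercase(inputs: str) -> str:
        # Inside an evaluation these calls record onto the current case's
        # results; outside one they are assumed to do nothing, so the task
        # remains usable on its own.
        set_eval_attribute('input_length', len(inputs))  # illustrative attribute
        increment_eval_metric('invocations', 1)          # illustrative metric
        return inputs.upper()

    dataset = Dataset(cases=[Case(name='upper', inputs='hello', expected_output='HELLO')])
    report = dataset.evaluate_sync(uppercase)
    report.print()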
pydantic_evals/dataset.py CHANGED
@@ -343,6 +343,8 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
             trace_id=trace_id,
         )
         if (averages := report.averages()) is not None and averages.assertions is not None:
+            experiment_metadata = {'n_cases': len(self.cases), 'averages': averages}
+            eval_span.set_attribute('logfire.experiment.metadata', experiment_metadata)
             eval_span.set_attribute('assertion_pass_rate', averages.assertions)
         return report
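Here eval_span is the root span created around the whole evaluation run, so a tracing backend now receives the case count and the report's aggregate row in a single attribute alongside the existing assertion_pass_rate. A hedged sketch of inspecting it with the standard OpenTelemetry in-memory exporter; the wiring below is plain OTel SDK, not pydantic-evals API, and it assumes the evaluation's spans are routed through the globally installed tracer provider:

    from opentelemetry import trace
    from opentelemetry.sdk.trace import TracerProvider
    from opentelemetry.sdk.trace.export import SimpleSpanProcessor
    from opentelemetry.sdk.trace.export.in_memory_span_exporter import InMemorySpanExporter

    # Collect finished spans in memory so their attributes can be inspected.
    exporter = InMemorySpanExporter()
    provider = TracerProvider()
    provider.add_span_processor(SimpleSpanProcessor(exporter))
    trace.set_tracer_provider(provider)

    # ... run dataset.evaluate_sync(task) here ...

    for span in exporter.get_finished_spans():
        metadata = span.attributes.get('logfire.experiment.metadata')
        if metadata is not None:
            # Depending on the exporter, the dict may arrive JSON-serialized.
            print(metadata)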
pydantic_evals/evaluators/llm_as_a_judge.py CHANGED
@@ -201,7 +201,7 @@ async def judge_output_expected(
     ).output


-def set_default_judge_model(model: models.Model | models.KnownModelName) -> None:  # pragma: no cover
+def set_default_judge_model(model: models.Model | models.KnownModelName) -> None:
     """Set the default model used for judging.

     This model is used if `None` is passed to the `model` argument of `judge_output` and `judge_input_output`.
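Dropping the pragma means set_default_judge_model is now exercised by the test suite; its behavior is unchanged. A short usage sketch, with an illustrative model name:

    from pydantic_evals.evaluators.llm_as_a_judge import set_default_judge_model

    # Subsequent judge_output / judge_input_output calls that leave their
    # model argument as None will use this model instead of the built-in default.
    set_default_judge_model('openai:gpt-4o')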
pydantic_evals/reporting/__init__.py CHANGED
@@ -289,12 +289,12 @@ class EvaluationReport(Generic[InputsT, OutputT, MetadataT]):
         metric_configs: dict[str, RenderNumberConfig] | None = None,
         duration_config: RenderNumberConfig | None = None,
         include_reasons: bool = False,
-    ) -> None:  # pragma: no cover
+    ) -> None:
         """Print this report to the console, optionally comparing it to a baseline report.

         If you want more control over the output, use `console_table` instead and pass it to `rich.Console.print`.
         """
-        if console is None:
+        if console is None:  # pragma: no branch
             console = Console(width=width)

         table = self.console_table(
@@ -318,7 +318,7 @@ class EvaluationReport(Generic[InputsT, OutputT, MetadataT]):
             include_reasons=include_reasons,
         )
         console.print(table)
-        if include_errors and self.failures:
+        if include_errors and self.failures:  # pragma: no cover
             failures_table = self.failures_table(
                 include_input=include_input,
                 include_metadata=include_metadata,
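The pragma shuffle tracks which paths of EvaluationReport.print the tests now reach: the default-Console branch is always taken in the suite, while the failures table renders only when a run actually produced failures. Passing an explicit console bypasses the default construction entirely; a sketch, assuming report came from a prior Dataset.evaluate_sync call:

    from rich.console import Console

    # Render onto an explicit console, e.g. to pin the width in CI logs
    # or to capture the output with Console(record=True).
    console = Console(width=120)
    report.print(console=console, include_input=True, include_metadata=True)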
pydantic_evals-1.2.1.dist-info/METADATA → pydantic_evals-1.4.0.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: pydantic-evals
-Version: 1.2.1
+Version: 1.4.0
 Summary: Framework for evaluating stochastic code execution, especially code making use of LLMs
 Project-URL: Homepage, https://ai.pydantic.dev/evals
 Project-URL: Source, https://github.com/pydantic/pydantic-ai
@@ -30,7 +30,7 @@ Classifier: Topic :: Software Development :: Libraries :: Python Modules
 Requires-Python: >=3.10
 Requires-Dist: anyio>=0
 Requires-Dist: logfire-api>=3.14.1
-Requires-Dist: pydantic-ai-slim==1.2.1
+Requires-Dist: pydantic-ai-slim==1.4.0
 Requires-Dist: pydantic>=2.10
 Requires-Dist: pyyaml>=6.0.2
 Requires-Dist: rich>=13.9.4
pydantic_evals-1.2.1.dist-info/RECORD → pydantic_evals-1.4.0.dist-info/RECORD CHANGED
@@ -1,6 +1,6 @@
-pydantic_evals/__init__.py,sha256=OKRbfhdc8UZPzrPJMZUQwvzIxLhXmEZxz1ZuD921fy4,839
+pydantic_evals/__init__.py,sha256=X5m0fcEZ4e8hVhToX5PludEp8t7NTBmdNInFFM5hM_I,504
 pydantic_evals/_utils.py,sha256=1muGTc2zqjwxqngz6quRSLoZM88onjp0Xgt-a9n2aPQ,4111
-pydantic_evals/dataset.py,sha256=hX9wrBvbWha1RLomaBY_mzKudWWKMT9doj8VPH8NflU,50437
+pydantic_evals/dataset.py,sha256=XobDGjjTj0oR5CARw8sWwC0KrIg0tpRzRiOkg8-Eeyc,50618
 pydantic_evals/generation.py,sha256=Qy03z7vGvE14cUBsqjorEx7Ar1KkR7Fb5SItZB429fc,3715
 pydantic_evals/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 pydantic_evals/evaluators/__init__.py,sha256=E_JT6o96Ef-oS_IZ1Hyy95NRLwz7EOHewp-o13IdXEM,1032
@@ -8,16 +8,16 @@ pydantic_evals/evaluators/_run_evaluator.py,sha256=uGmH67gCTeF9BSprCiBC4DtKEpKLr
 pydantic_evals/evaluators/common.py,sha256=Cc9RMsSf5P2gcq3IDwmZxgfo1xnu7HEehiAS2Hgibz4,11609
 pydantic_evals/evaluators/context.py,sha256=mTxcm0Hvkev9htpqwoJMCJIqEYBtY5g86SXcjoqQxHY,3884
 pydantic_evals/evaluators/evaluator.py,sha256=ylfKRytoM9KzbZkSsFkEEnsg4XhK4usuyy1Rb1emoPo,11474
-pydantic_evals/evaluators/llm_as_a_judge.py,sha256=BPdUfEsLPSxN2kJPt3dtJBRCBP46ctRoW_n24WubaB0,9567
+pydantic_evals/evaluators/llm_as_a_judge.py,sha256=4jAg-pAk7Ae5IFO1p3dar4Ncju__S6IORcH9LnU1fXs,9547
 pydantic_evals/evaluators/spec.py,sha256=szAUsY4gb8KK_l1R81HYrByh4Rawrjav7w9835FZg1w,6690
 pydantic_evals/otel/__init__.py,sha256=i2p3vDrOW039N4XM-UkozDhCm0ZmE6ZSs1yV5t03vd0,117
 pydantic_evals/otel/_context_in_memory_span_exporter.py,sha256=FrG0pXKjuvTp3bXNdrUyzdPkqm0DQWe4ehkiHaxSvz4,6742
 pydantic_evals/otel/_context_subtree.py,sha256=Iazp4w3IIBMCrkqWL-hTG-2QG_-2X81p794WG9MAsGk,1175
 pydantic_evals/otel/_errors.py,sha256=aW1414eTofpA7R_DUgOeT-gj7YA6OXmm8Y4oYeFukD4,268
 pydantic_evals/otel/span_tree.py,sha256=RzX4VGpEqc2QUhkyxMTXtBRo5yHHO1c0hI7QJJuiXPU,23043
-pydantic_evals/reporting/__init__.py,sha256=702W2BjMiXhKQz6T4sor6Zi2SjYTDQypCvealJrwTFA,54067
+pydantic_evals/reporting/__init__.py,sha256=LGPZRKyRAl7Apx44-UnYENsAltknakf3dcYkjwoTSFw,54088
 pydantic_evals/reporting/render_numbers.py,sha256=8SKlK3etbD7HnSWWHCE993ceCNLZCepVQ-SsqUIhyxk,6916
-pydantic_evals-1.2.1.dist-info/METADATA,sha256=jsE9ujRLvxt780e7as5aSMUfcE8Ns1VyclE0f9-spnE,7844
-pydantic_evals-1.2.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-pydantic_evals-1.2.1.dist-info/licenses/LICENSE,sha256=vA6Jc482lEyBBuGUfD1pYx-cM7jxvLYOxPidZ30t_PQ,1100
-pydantic_evals-1.2.1.dist-info/RECORD,,
+pydantic_evals-1.4.0.dist-info/METADATA,sha256=ErkaNhP07TA5vw0L7kvaGj4ZsFJ6kBzsVCcUJA9g95s,7844
+pydantic_evals-1.4.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+pydantic_evals-1.4.0.dist-info/licenses/LICENSE,sha256=vA6Jc482lEyBBuGUfD1pYx-cM7jxvLYOxPidZ30t_PQ,1100
+pydantic_evals-1.4.0.dist-info/RECORD,,