ragbits-evaluate 1.3.0__tar.gz → 1.4.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. {ragbits_evaluate-1.3.0 → ragbits_evaluate-1.4.0}/CHANGELOG.md +18 -0
  2. {ragbits_evaluate-1.3.0 → ragbits_evaluate-1.4.0}/PKG-INFO +2 -2
  3. {ragbits_evaluate-1.3.0 → ragbits_evaluate-1.4.0}/pyproject.toml +2 -11
  4. {ragbits_evaluate-1.3.0 → ragbits_evaluate-1.4.0}/src/ragbits/evaluate/metrics/question_answer.py +25 -8
  5. {ragbits_evaluate-1.3.0 → ragbits_evaluate-1.4.0}/.gitignore +0 -0
  6. {ragbits_evaluate-1.3.0 → ragbits_evaluate-1.4.0}/README.md +0 -0
  7. {ragbits_evaluate-1.3.0 → ragbits_evaluate-1.4.0}/src/ragbits/evaluate/__init__.py +0 -0
  8. {ragbits_evaluate-1.3.0 → ragbits_evaluate-1.4.0}/src/ragbits/evaluate/cli.py +0 -0
  9. {ragbits_evaluate-1.3.0 → ragbits_evaluate-1.4.0}/src/ragbits/evaluate/config.py +0 -0
  10. {ragbits_evaluate-1.3.0 → ragbits_evaluate-1.4.0}/src/ragbits/evaluate/dataloaders/__init__.py +0 -0
  11. {ragbits_evaluate-1.3.0 → ragbits_evaluate-1.4.0}/src/ragbits/evaluate/dataloaders/base.py +0 -0
  12. {ragbits_evaluate-1.3.0 → ragbits_evaluate-1.4.0}/src/ragbits/evaluate/dataloaders/document_search.py +0 -0
  13. {ragbits_evaluate-1.3.0 → ragbits_evaluate-1.4.0}/src/ragbits/evaluate/dataloaders/exceptions.py +0 -0
  14. {ragbits_evaluate-1.3.0 → ragbits_evaluate-1.4.0}/src/ragbits/evaluate/dataloaders/question_answer.py +0 -0
  15. {ragbits_evaluate-1.3.0 → ragbits_evaluate-1.4.0}/src/ragbits/evaluate/dataset_generator/__init__.py +0 -0
  16. {ragbits_evaluate-1.3.0 → ragbits_evaluate-1.4.0}/src/ragbits/evaluate/dataset_generator/pipeline.py +0 -0
  17. {ragbits_evaluate-1.3.0 → ragbits_evaluate-1.4.0}/src/ragbits/evaluate/dataset_generator/prompts/__init__.py +0 -0
  18. {ragbits_evaluate-1.3.0 → ragbits_evaluate-1.4.0}/src/ragbits/evaluate/dataset_generator/prompts/corpus_generation.py +0 -0
  19. {ragbits_evaluate-1.3.0 → ragbits_evaluate-1.4.0}/src/ragbits/evaluate/dataset_generator/prompts/qa.py +0 -0
  20. {ragbits_evaluate-1.3.0 → ragbits_evaluate-1.4.0}/src/ragbits/evaluate/dataset_generator/tasks/__init__.py +0 -0
  21. {ragbits_evaluate-1.3.0 → ragbits_evaluate-1.4.0}/src/ragbits/evaluate/dataset_generator/tasks/corpus_generation.py +0 -0
  22. {ragbits_evaluate-1.3.0 → ragbits_evaluate-1.4.0}/src/ragbits/evaluate/dataset_generator/tasks/filter/__init__.py +0 -0
  23. {ragbits_evaluate-1.3.0 → ragbits_evaluate-1.4.0}/src/ragbits/evaluate/dataset_generator/tasks/filter/base.py +0 -0
  24. {ragbits_evaluate-1.3.0 → ragbits_evaluate-1.4.0}/src/ragbits/evaluate/dataset_generator/tasks/filter/dont_know.py +0 -0
  25. {ragbits_evaluate-1.3.0 → ragbits_evaluate-1.4.0}/src/ragbits/evaluate/dataset_generator/tasks/text_generation/__init__.py +0 -0
  26. {ragbits_evaluate-1.3.0 → ragbits_evaluate-1.4.0}/src/ragbits/evaluate/dataset_generator/tasks/text_generation/base.py +0 -0
  27. {ragbits_evaluate-1.3.0 → ragbits_evaluate-1.4.0}/src/ragbits/evaluate/dataset_generator/tasks/text_generation/qa.py +0 -0
  28. {ragbits_evaluate-1.3.0 → ragbits_evaluate-1.4.0}/src/ragbits/evaluate/dataset_generator/utils.py +0 -0
  29. {ragbits_evaluate-1.3.0 → ragbits_evaluate-1.4.0}/src/ragbits/evaluate/evaluator.py +0 -0
  30. {ragbits_evaluate-1.3.0 → ragbits_evaluate-1.4.0}/src/ragbits/evaluate/factories/__init__.py +0 -0
  31. {ragbits_evaluate-1.3.0 → ragbits_evaluate-1.4.0}/src/ragbits/evaluate/metrics/__init__.py +0 -0
  32. {ragbits_evaluate-1.3.0 → ragbits_evaluate-1.4.0}/src/ragbits/evaluate/metrics/base.py +0 -0
  33. {ragbits_evaluate-1.3.0 → ragbits_evaluate-1.4.0}/src/ragbits/evaluate/metrics/document_search.py +0 -0
  34. {ragbits_evaluate-1.3.0 → ragbits_evaluate-1.4.0}/src/ragbits/evaluate/optimizer.py +0 -0
  35. {ragbits_evaluate-1.3.0 → ragbits_evaluate-1.4.0}/src/ragbits/evaluate/pipelines/__init__.py +0 -0
  36. {ragbits_evaluate-1.3.0 → ragbits_evaluate-1.4.0}/src/ragbits/evaluate/pipelines/base.py +0 -0
  37. {ragbits_evaluate-1.3.0 → ragbits_evaluate-1.4.0}/src/ragbits/evaluate/pipelines/document_search.py +0 -0
  38. {ragbits_evaluate-1.3.0 → ragbits_evaluate-1.4.0}/src/ragbits/evaluate/pipelines/question_answer.py +0 -0
  39. {ragbits_evaluate-1.3.0 → ragbits_evaluate-1.4.0}/src/ragbits/evaluate/py.typed +0 -0
  40. {ragbits_evaluate-1.3.0 → ragbits_evaluate-1.4.0}/src/ragbits/evaluate/utils.py +0 -0
  41. {ragbits_evaluate-1.3.0 → ragbits_evaluate-1.4.0}/tests/cli/test_run_evaluation.py +0 -0
  42. {ragbits_evaluate-1.3.0 → ragbits_evaluate-1.4.0}/tests/unit/test_evaluator.py +0 -0
  43. {ragbits_evaluate-1.3.0 → ragbits_evaluate-1.4.0}/tests/unit/test_metrics.py +0 -0
  44. {ragbits_evaluate-1.3.0 → ragbits_evaluate-1.4.0}/tests/unit/test_optimizer.py +0 -0
@@ -2,6 +2,24 @@
2
2
 
3
3
  ## Unreleased
4
4
 
5
+ ## 1.4.0 (2026-02-04)
6
+
7
+ ### Changed
8
+
9
+ - ragbits-core updated to version v1.4.0
10
+
11
+ - Feat: introduce agent evaluation pipelines and metrics (HotpotQA, HumanEval, GAIA) (#829)
12
+
13
+ - Feat: introduce agent simulation module with utilities for agent-to-agent conversation and evaluation scenarios (#857)
14
+
15
+ - Feat: add structured results to agent simulation with `SimulationResult`, `TurnResult`, `TaskResult`, and `ConversationMetrics` models (#885)
16
+
17
+ - Feat: add generic `DomainContext` and `DataSnapshot` for flexible agent simulation context (#884)
18
+
19
+ - Feat: add metrics collection system for agent simulation (`MetricCollector` protocol, `LatencyMetricCollector`, `TokenUsageMetricCollector`, `ToolUsageMetricCollector`) (#882)
20
+
21
+ - Fix: improve `continuous-eval` import compatibility for different package versions specified by constraints
22
+
5
23
  ## 1.3.0 (2025-09-11)
6
24
 
7
25
  ### Changed
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: ragbits-evaluate
3
- Version: 1.3.0
3
+ Version: 1.4.0
4
4
  Summary: Evaluation module for Ragbits components
5
5
  Project-URL: Homepage, https://github.com/deepsense-ai/ragbits
6
6
  Project-URL: Bug Reports, https://github.com/deepsense-ai/ragbits/issues
@@ -27,7 +27,7 @@ Requires-Dist: distilabel<2.0.0,>=1.5.0
27
27
  Requires-Dist: hydra-core<2.0.0,>=1.3.2
28
28
  Requires-Dist: neptune[optuna]<2.0.0,>=1.12.0
29
29
  Requires-Dist: optuna<5.0.0,>=4.0.0
30
- Requires-Dist: ragbits-core==1.3.0
30
+ Requires-Dist: ragbits-core==1.4.0
31
31
  Provides-Extra: relari
32
32
  Requires-Dist: continuous-eval<1.0.0,>=0.3.12; extra == 'relari'
33
33
  Description-Content-Type: text/markdown
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "ragbits-evaluate"
3
- version = "1.3.0"
3
+ version = "1.4.0"
4
4
  description = "Evaluation module for Ragbits components"
5
5
  readme = "README.md"
6
6
  requires-python = ">=3.10"
@@ -32,7 +32,7 @@ classifiers = [
32
32
  "Topic :: Scientific/Engineering :: Artificial Intelligence",
33
33
  "Topic :: Software Development :: Libraries :: Python Modules",
34
34
  ]
35
- dependencies = ["hydra-core>=1.3.2,<2.0.0", "neptune[optuna]>=1.12.0,<2.0.0", "optuna>=4.0.0,<5.0.0", "distilabel>=1.5.0,<2.0.0", "datasets>=3.0.1,<4.0.0", "ragbits-core==1.3.0"]
35
+ dependencies = ["hydra-core>=1.3.2,<2.0.0", "neptune[optuna]>=1.12.0,<2.0.0", "optuna>=4.0.0,<5.0.0", "distilabel>=1.5.0,<2.0.0", "datasets>=3.0.1,<4.0.0", "ragbits-core==1.4.0"]
36
36
 
37
37
  [project.urls]
38
38
  "Homepage" = "https://github.com/deepsense-ai/ragbits"
@@ -45,15 +45,6 @@ relari = [
45
45
  "continuous-eval>=0.3.12,<1.0.0",
46
46
  ]
47
47
 
48
- [tool.uv]
49
- dev-dependencies = [
50
- "pre-commit~=3.8.0",
51
- "pytest~=8.3.3",
52
- "pytest-cov~=5.0.0",
53
- "pytest-asyncio~=0.24.0",
54
- "pip-licenses>=4.0.0,<5.0.0"
55
- ]
56
-
57
48
  [build-system]
58
49
  requires = ["hatchling"]
59
50
  build-backend = "hatchling.build"
@@ -4,14 +4,6 @@ from asyncio import AbstractEventLoop
4
4
  from itertools import chain
5
5
  from typing import Generic, TypeVar
6
6
 
7
- from continuous_eval.llm_factory import LLMInterface
8
- from continuous_eval.metrics.base import LLMBasedMetric
9
- from continuous_eval.metrics.generation.text import (
10
- LLMBasedAnswerCorrectness,
11
- LLMBasedAnswerRelevance,
12
- LLMBasedFaithfulness,
13
- LLMBasedStyleConsistency,
14
- )
15
7
  from typing_extensions import Self
16
8
 
17
9
  from ragbits.agents.types import QuestionAnswerPromptOutputT
@@ -20,6 +12,31 @@ from ragbits.core.utils.helpers import batched
20
12
  from ragbits.evaluate.metrics.base import Metric
21
13
  from ragbits.evaluate.pipelines.question_answer import QuestionAnswerResult
22
14
 
15
+ try:
16
+ from continuous_eval.llm_factory import LLMInterface
17
+ from continuous_eval.metrics.base import LLMBasedMetric
18
+ from continuous_eval.metrics.generation.text import (
19
+ LLMBasedAnswerCorrectness,
20
+ LLMBasedAnswerRelevance,
21
+ LLMBasedFaithfulness,
22
+ LLMBasedStyleConsistency,
23
+ )
24
+ except ModuleNotFoundError:
25
+ from continuous_eval.llms.base import LLMInterface
26
+ from continuous_eval.metrics import Metric as LLMBasedMetric
27
+ from continuous_eval.metrics.generation.text import (
28
+ AnswerCorrectness as LLMBasedAnswerCorrectness,
29
+ )
30
+ from continuous_eval.metrics.generation.text import (
31
+ AnswerRelevance as LLMBasedAnswerRelevance,
32
+ )
33
+ from continuous_eval.metrics.generation.text import (
34
+ Faithfulness as LLMBasedFaithfulness,
35
+ )
36
+ from continuous_eval.metrics.generation.text import (
37
+ StyleConsistency as LLMBasedStyleConsistency,
38
+ )
39
+
23
40
  MetricT = TypeVar("MetricT", bound=LLMBasedMetric)
24
41
 
25
42