fba-bench-core 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52)
  1. fba_bench_core/__init__.py +11 -0
  2. fba_bench_core/agents/__init__.py +15 -0
  3. fba_bench_core/agents/base.py +83 -0
  4. fba_bench_core/agents/registry.py +16 -0
  5. fba_bench_core/benchmarking/__init__.py +6 -0
  6. fba_bench_core/benchmarking/core/__init__.py +1 -0
  7. fba_bench_core/benchmarking/engine/__init__.py +12 -0
  8. fba_bench_core/benchmarking/engine/core.py +135 -0
  9. fba_bench_core/benchmarking/engine/models.py +62 -0
  10. fba_bench_core/benchmarking/metrics/__init__.py +30 -0
  11. fba_bench_core/benchmarking/metrics/accuracy_score.py +27 -0
  12. fba_bench_core/benchmarking/metrics/aggregate.py +39 -0
  13. fba_bench_core/benchmarking/metrics/completeness.py +38 -0
  14. fba_bench_core/benchmarking/metrics/cost_efficiency.py +32 -0
  15. fba_bench_core/benchmarking/metrics/custom_scriptable.py +17 -0
  16. fba_bench_core/benchmarking/metrics/keyword_coverage.py +41 -0
  17. fba_bench_core/benchmarking/metrics/policy_compliance.py +18 -0
  18. fba_bench_core/benchmarking/metrics/registry.py +57 -0
  19. fba_bench_core/benchmarking/metrics/robustness.py +27 -0
  20. fba_bench_core/benchmarking/metrics/technical_performance.py +16 -0
  21. fba_bench_core/benchmarking/registry.py +48 -0
  22. fba_bench_core/benchmarking/scenarios/__init__.py +1 -0
  23. fba_bench_core/benchmarking/scenarios/base.py +36 -0
  24. fba_bench_core/benchmarking/scenarios/complex_marketplace.py +181 -0
  25. fba_bench_core/benchmarking/scenarios/multiturn_tool_use.py +176 -0
  26. fba_bench_core/benchmarking/scenarios/registry.py +18 -0
  27. fba_bench_core/benchmarking/scenarios/research_summarization.py +141 -0
  28. fba_bench_core/benchmarking/validators/__init__.py +24 -0
  29. fba_bench_core/benchmarking/validators/determinism_check.py +94 -0
  30. fba_bench_core/benchmarking/validators/fairness_balance.py +78 -0
  31. fba_bench_core/benchmarking/validators/outlier_detection.py +53 -0
  32. fba_bench_core/benchmarking/validators/registry.py +57 -0
  33. fba_bench_core/benchmarking/validators/reproducibility_metadata.py +74 -0
  34. fba_bench_core/benchmarking/validators/schema_adherence.py +70 -0
  35. fba_bench_core/benchmarking/validators/structural_consistency.py +74 -0
  36. fba_bench_core/config.py +152 -0
  37. fba_bench_core/domain/__init__.py +75 -0
  38. fba_bench_core/domain/events/__init__.py +230 -0
  39. fba_bench_core/domain/events/analytics.py +69 -0
  40. fba_bench_core/domain/events/base.py +59 -0
  41. fba_bench_core/domain/events/inventory.py +119 -0
  42. fba_bench_core/domain/events/marketing.py +102 -0
  43. fba_bench_core/domain/events/pricing.py +179 -0
  44. fba_bench_core/domain/models.py +296 -0
  45. fba_bench_core/exceptions/__init__.py +9 -0
  46. fba_bench_core/exceptions/base.py +46 -0
  47. fba_bench_core/services/__init__.py +12 -0
  48. fba_bench_core/services/base.py +52 -0
  49. fba_bench_core-1.0.0.dist-info/METADATA +152 -0
  50. fba_bench_core-1.0.0.dist-info/RECORD +52 -0
  51. fba_bench_core-1.0.0.dist-info/WHEEL +4 -0
  52. fba_bench_core-1.0.0.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,94 @@
+ """Determinism check validator."""
+
+ from collections import defaultdict
+ from typing import Any
+
+ from .registry import register_validator
+
+
+ @register_validator("determinism_check")
+ def determinism_check(
+     runs: list[dict[str, Any]], config: dict[str, Any]
+ ) -> dict[str, Any]:
+     """Check determinism across multiple runs."""
+     issues = []
+     tolerance = config.get("tolerance", 0.0)
+     fields = config.get("fields", ["value"])
+
+     # Group runs by runner_key and seed
+     groups = defaultdict(list)
+     for run in runs:
+         key = (run.get("runner_key"), run.get("seed"))
+         groups[key].append(run)
+
+     for (runner, seed), group_runs in groups.items():
+         if len(group_runs) < 2:
+             continue
+
+         for field in fields:
+             values = [
+                 (run.get("output") or {}).get(field)
+                 for run in group_runs
+                 if run.get("status") == "success"
+             ]
+             if len(values) < 2:
+                 continue
+
+             # Numeric values: flag groups whose spread exceeds the tolerance.
+             if all(isinstance(v, (int, float)) for v in values):
+                 min_val = min(values)
+                 max_val = max(values)
+                 if max_val - min_val > tolerance:
+                     issues.append(
+                         {
+                             "id": "determinism_mismatch",
+                             "severity": "error",
+                             "message": f"Determinism mismatch for runner '{runner}', seed {seed}, field '{field}': values {values} exceed tolerance {tolerance}",
+                             "context": {
+                                 "runner": runner,
+                                 "seed": seed,
+                                 "field": field,
+                                 "values": values,
+                             },
+                         }
+                     )
+             else:
+                 # Non-numeric (or mixed) values must match exactly.
+                 if not all(v == values[0] for v in values):
+                     issues.append(
+                         {
+                             "id": "determinism_mismatch",
+                             "severity": "error",
+                             "message": f"Determinism mismatch for runner '{runner}', seed {seed}, field '{field}': values {values} not identical",
+                             "context": {
+                                 "runner": runner,
+                                 "seed": seed,
+                                 "field": field,
+                                 "values": values,
+                             },
+                         }
+                     )
+
+     if not issues:
+         issues.append(
+             {
+                 "id": "determinism_ok",
+                 "severity": "info",
+                 "message": "All determinism checks passed",
+             }
+         )
+
+     # Count only the (runner, seed) groups that actually produced a mismatch.
+     mismatched_groups = {
+         (i["context"]["runner"], i["context"]["seed"])
+         for i in issues
+         if i["id"] == "determinism_mismatch"
+     }
+
+     return {
+         "issues": issues,
+         "summary": {
+             "total_groups_checked": len(groups),
+             "groups_with_issues": len(mismatched_groups),
+         },
+     }
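For orientation, a minimal sketch of exercising this validator directly (the run dicts and values are hypothetical; the import path follows the file list above):

    from fba_bench_core.benchmarking.validators.determinism_check import (
        determinism_check,
    )

    runs = [
        {"runner_key": "runner_a", "seed": 1, "status": "success", "output": {"value": 0.90}},
        {"runner_key": "runner_a", "seed": 1, "status": "success", "output": {"value": 0.95}},
    ]
    # Same (runner_key, seed) group, but a 0.05 spread in "value" with
    # tolerance 0.0, so the report carries a determinism_mismatch issue.
    report = determinism_check(runs, {"tolerance": 0.0, "fields": ["value"]})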
@@ -0,0 +1,78 @@
+ """Fairness balance validator."""
+
+ from collections import defaultdict
+ from typing import Any
+
+ from .registry import register_validator
+
+
+ @register_validator("fairness_balance")
+ def fairness_balance(
+     runs: list[dict[str, Any]], config: dict[str, Any]
+ ) -> dict[str, Any]:
+     """Validate fairness balance across runs."""
+     issues = []
+     group = config.get("group", "runner_key")
+     metric_path = config.get("metric_path", "metrics.accuracy")
+     threshold = config.get("threshold", 0.1)
+     min_group_size = config.get("min_group_size", 2)
+
+     groups = defaultdict(list)
+     for run in runs:
+         group_key = run.get(group)
+         if group_key:
+             groups[group_key].append(run)
+
+     for group_key, group_runs in groups.items():
+         if len(group_runs) < min_group_size:
+             continue
+
+         # Extract metric values for successful runs
+         values = []
+         for run in group_runs:
+             if run.get("status") == "success":
+                 # Walk the dotted metric path, tolerating missing keys.
+                 current = run
+                 for key in metric_path.split("."):
+                     current = current.get(key, {}) if isinstance(current, dict) else {}
+                 value = current if isinstance(current, (int, float)) else 0.0
+                 values.append(value)
+
+         if values:
+             min_val = min(values)
+             max_val = max(values)
+             mean = (min_val + max_val) / 2
+             # Guard against division by zero when both extremes are zero.
+             if mean and (max_val - min_val) / mean > threshold:
+                 issues.append(
+                     {
+                         "id": "fairness_imbalance",
+                         "severity": "warning",
+                         "message": f"Fairness imbalance in group '{group_key}': range {min_val}-{max_val} exceeds threshold {threshold}",
+                         "context": {
+                             "group": group_key,
+                             "metric": metric_path,
+                             "min": min_val,
+                             "max": max_val,
+                             "threshold": threshold,
+                         },
+                     }
+                 )
+             else:
+                 issues.append(
+                     {
+                         "id": "fairness_within_threshold",
+                         "severity": "info",
+                         "message": f"Fairness within threshold for group '{group_key}'",
+                     }
+                 )
+
+     return {
+         "issues": issues,
+         "summary": {
+             "groups_checked": len(groups),
+             "groups_with_imbalance": len(
+                 [i for i in issues if i["id"] == "fairness_imbalance"]
+             ),
+         },
+     }
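The imbalance test compares the relative spread (max - min) / mean of the group's min and max against the threshold. A hypothetical invocation (values invented for illustration):

    from fba_bench_core.benchmarking.validators.fairness_balance import fairness_balance

    runs = [
        {"runner_key": "runner_a", "status": "success", "metrics": {"accuracy": 0.90}},
        {"runner_key": "runner_a", "status": "success", "metrics": {"accuracy": 0.60}},
    ]
    # Relative spread: (0.90 - 0.60) / ((0.60 + 0.90) / 2) = 0.40 > 0.10,
    # so group "runner_a" is reported as a fairness_imbalance warning.
    report = fairness_balance(runs, {"threshold": 0.1})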
@@ -0,0 +1,53 @@
+ """Outlier detection validator."""
+
+ from typing import Any
+
+ from .registry import register_validator
+
+
+ @register_validator("outlier_detection")
+ def outlier_detection(
+     runs: list[dict[str, Any]], config: dict[str, Any]
+ ) -> dict[str, Any]:
+     """Detect outliers in runs."""
+     issues = []
+     k = config.get("k", 1.5)
+     field = config.get("field", "duration_ms")
+
+     durations = [run.get(field, 0) for run in runs if run.get("status") == "success"]
+     if len(durations) < 3:
+         return {"issues": [], "summary": {"checked": len(durations), "outliers": []}}
+
+     median = sorted(durations)[len(durations) // 2]
+     deviations = [abs(d - median) for d in durations]
+     mad = sorted(deviations)[len(deviations) // 2]
+
+     outlier_indices = []
+     for i, dev in enumerate(deviations):
+         if dev > k * mad:
+             outlier_indices.append(i)
+
+     for idx in outlier_indices:
+         issues.append(
+             {
+                 "id": "duration_outlier",
+                 "severity": "warning",
+                 "message": f"Outlier duration at index {idx}: {durations[idx]} (median: {median}, MAD: {mad})",
+                 "context": {
+                     "index": idx,
+                     "value": durations[idx],
+                     "median": median,
+                     "mad": mad,
+                 },
+             }
+         )
+
+     return {
+         "issues": issues,
+         "summary": {
+             "total_runs": len(durations),
+             "outliers": outlier_indices,
+             "median_duration": median,
+             "mad_duration": mad,
+         },
+     }
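The detector applies a median/MAD rule: a duration is flagged when its absolute deviation from the median exceeds k times the median absolute deviation. A hypothetical example:

    from fba_bench_core.benchmarking.validators.outlier_detection import outlier_detection

    runs = [
        {"status": "success", "duration_ms": d}
        for d in [100, 101, 99, 100, 500]
    ]
    # median = 100, MAD = 1; the deviation of 500 is 400 > 1.5 * 1,
    # so index 4 is flagged as a duration_outlier.
    report = outlier_detection(runs, {"k": 1.5, "field": "duration_ms"})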
@@ -0,0 +1,57 @@
+ """Registry for validators."""
+
+ from collections.abc import Callable
+
+
+ class ValidatorRegistry:
+     _validators: dict[str, Callable] = {}
+
+     @classmethod
+     def register(cls, name: str, validator_class: Callable) -> None:
+         """Register a validator class."""
+         cls._validators[name] = validator_class
+
+     @classmethod
+     def create_validator(cls, name: str, config=None) -> Callable | None:
+         """Create a validator instance."""
+         fn = cls._validators.get(name)
+         if fn:
+             return fn(config) if config is not None else fn()
+         return None
+
+     @classmethod
+     def get_validator(cls, name: str) -> Callable | None:
+         """Get a validator class by name."""
+         return cls._validators.get(name)
+
+     @classmethod
+     def list_validators(cls) -> list[str]:
+         """List all registered validator names."""
+         return list(cls._validators.keys())
+
+
+ # Global instance for function-based API
+ registry = ValidatorRegistry()
+
+
+ def get_validator(name: str) -> Callable:
+     """Get a validator by name, raising KeyError if not found."""
+     validator = registry.get_validator(name)
+     if validator is None:
+         raise KeyError(f"Validator '{name}' not found")
+     return validator
+
+
+ def list_validators() -> list[str]:
+     """List all registered validator names."""
+     return registry.list_validators()
+
+
+ def register_validator(name: str):
+     """Decorator to register a validator function with the given name."""
+
+     def decorator(func: Callable) -> Callable:
+         registry.register(name, func)
+         return func
+
+     return decorator
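Taken together, the decorator and the module-level helpers form a small function-based registration flow. A minimal sketch (the always_ok validator is invented for illustration):

    from typing import Any

    from fba_bench_core.benchmarking.validators.registry import (
        get_validator,
        list_validators,
        register_validator,
    )

    @register_validator("always_ok")
    def always_ok(runs: list[dict[str, Any]], config: dict[str, Any]) -> dict[str, Any]:
        return {"issues": [], "summary": {"total_runs": len(runs)}}

    assert "always_ok" in list_validators()
    validator = get_validator("always_ok")  # returns the registered function itself
    report = validator([], {})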
@@ -0,0 +1,74 @@
+ """Reproducibility metadata validator."""
+
+ from typing import Any
+
+ from .registry import register_validator
+
+
+ @register_validator("reproducibility_metadata")
+ def reproducibility_metadata(
+     runs: list[dict[str, Any]], config: dict[str, Any]
+ ) -> dict[str, Any]:
+     """Validate presence of reproducibility metadata."""
+     expected_seeds = config.get("expected_seeds", [])
+     config_digest = config.get("config_digest", "")
+     issues = []
+
+     for i, run in enumerate(runs):
+         seed = run.get("seed")
+         if seed is None:
+             issues.append(
+                 {
+                     "id": "missing_seed",
+                     "severity": "warning",
+                     "message": f"Run {i} missing seed for reproducibility",
+                     "context": {"index": i},
+                 }
+             )
+         elif expected_seeds and seed not in expected_seeds:
+             issues.append(
+                 {
+                     "id": "unexpected_seed",
+                     "severity": "warning",
+                     "message": f"Run {i} has unexpected seed {seed}, expected {expected_seeds}",
+                     "context": {"index": i, "seed": seed, "expected": expected_seeds},
+                 }
+             )
+
+         # Check for per-run config digest
+         run_digest = run.get("config_digest")
+         if run_digest is None:
+             issues.append(
+                 {
+                     "id": "per_run_digest_missing",
+                     "severity": "info",
+                     "message": f"Run {i} missing per-run config digest",
+                 }
+             )
+         elif config_digest and run_digest != config_digest:
+             issues.append(
+                 {
+                     "id": "config_digest_mismatch",
+                     "severity": "error",
+                     "message": f"Run {i} config digest {run_digest} does not match expected {config_digest}",
+                     "context": {
+                         "index": i,
+                         "expected": config_digest,
+                         "actual": run_digest,
+                     },
+                 }
+             )
+
+     return {
+         "issues": issues,
+         "summary": {
+             "total_runs": len(runs),
+             "missing_seeds": len([i for i in issues if i["id"] == "missing_seed"]),
+             "unexpected_seeds": len(
+                 [i for i in issues if i["id"] == "unexpected_seed"]
+             ),
+             "digest_mismatches": len(
+                 [i for i in issues if i["id"] == "config_digest_mismatch"]
+             ),
+         },
+     }
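A hypothetical invocation that triggers all three issue kinds (seeds and digests are invented):

    from fba_bench_core.benchmarking.validators.reproducibility_metadata import (
        reproducibility_metadata,
    )

    runs = [
        {"seed": 7, "config_digest": "abc123"},   # conforms
        {"seed": 99, "config_digest": "def456"},  # unexpected_seed + config_digest_mismatch
        {"config_digest": "abc123"},              # missing_seed
    ]
    report = reproducibility_metadata(
        runs, {"expected_seeds": [7, 13], "config_digest": "abc123"}
    )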
@@ -0,0 +1,70 @@
+ """Schema adherence validator."""
+
+ from typing import Any
+
+ from .registry import register_validator
+
+ # Map contract type names onto Python types for isinstance checks.
+ _TYPE_MAP: dict[str, Any] = {
+     "int": int,
+     "float": (int, float),
+     "str": str,
+     "bool": bool,
+     "dict": dict,
+     "list": list,
+ }
+
+
+ @register_validator("schema_adherence")
+ def schema_adherence(
+     runs: list[dict[str, Any]], config: dict[str, Any]
+ ) -> dict[str, Any]:
+     """Validate schema adherence of runs."""
+     contract = config.get("contract", {})
+     required_fields = contract.get("required", {})
+     issues = []
+
+     for i, run in enumerate(runs):
+         if not isinstance(run, dict):
+             issues.append(
+                 {
+                     "id": "invalid_run_type",
+                     "severity": "error",
+                     "message": f"Run {i} is not a dict: {type(run)}",
+                     "context": {"index": i, "type": type(run)},
+                 }
+             )
+             continue
+
+         for field_name, field_type in required_fields.items():
+             if field_name not in run:
+                 issues.append(
+                     {
+                         "id": "missing_field",
+                         "severity": "error",
+                         "message": f"Missing required field '{field_name}' in run {i}",
+                         "context": {"index": i, "field": field_name},
+                     }
+                 )
+             else:
+                 value = run[field_name]
+                 expected = _TYPE_MAP.get(field_type)
+                 if expected is not None and not isinstance(value, expected):
+                     issues.append(
+                         {
+                             "id": "schema_type_mismatch",
+                             "severity": "warning",
+                             "message": f"Field '{field_name}' in run {i} has type {type(value)} but expected {field_type}",
+                             "context": {
+                                 "index": i,
+                                 "field": field_name,
+                                 "expected": field_type,
+                                 "actual": type(value),
+                             },
+                         }
+                     )
+
+     return {
+         "issues": issues,
+         "summary": {"total_runs": len(runs), "validation_errors": len(issues)},
+     }
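A hypothetical contract and invocation (the field names are invented, though duration_ms and status mirror the structural checks below):

    from fba_bench_core.benchmarking.validators.schema_adherence import schema_adherence

    contract = {"required": {"duration_ms": "int", "status": "str"}}
    runs = [
        {"duration_ms": 120, "status": "success"},  # conforms
        {"duration_ms": "fast"},                    # schema_type_mismatch + missing_field
    ]
    report = schema_adherence(runs, {"contract": contract})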
@@ -0,0 +1,74 @@
+ """Structural consistency validator."""
+
+ from typing import Any
+
+ from .registry import register_validator
+
+
+ @register_validator("structural_consistency")
+ def structural_consistency(
+     runs: list[dict[str, Any]], config: dict[str, Any]
+ ) -> dict[str, Any]:
+     """Validate structural consistency across runs."""
+     issues = []
+     required_fields = [
+         "scenario_key",
+         "runner_key",
+         "status",
+         "duration_ms",
+         "metrics",
+         "output",
+     ]
+
+     for i, run in enumerate(runs):
+         if not isinstance(run, dict):
+             issues.append(
+                 {
+                     "id": "invalid_run_type",
+                     "severity": "error",
+                     "message": f"Run {i} is not a dict: {type(run)}",
+                     "context": {"index": i, "type": type(run)},
+                 }
+             )
+             continue
+
+         # Check required fields
+         for field in required_fields:
+             if field not in run:
+                 issues.append(
+                     {
+                         "id": "missing_field",
+                         "severity": "error",
+                         "message": f"Missing required field '{field}' in run {i}",
+                         "context": {"index": i, "field": field},
+                     }
+                 )
+
+         # Check duration_ms non-negative
+         duration = run.get("duration_ms")
+         if isinstance(duration, (int, float)) and duration < 0:
+             issues.append(
+                 {
+                     "id": "negative_duration",
+                     "severity": "warning",
+                     "message": f"Negative duration_ms {duration} in run {i}",
+                     "context": {"index": i, "duration": duration},
+                 }
+             )
+
+         # Output should only be present on successful runs
+         status = run.get("status")
+         if status != "success" and run.get("output") is not None:
+             issues.append(
+                 {
+                     "id": "unexpected_output_on_failure",
+                     "severity": "info",
+                     "message": f"Output present on non-success status '{status}' in run {i}",
+                     "context": {"index": i, "status": status},
+                 }
+             )
+
+     return {
+         "issues": issues,
+         "summary": {"total_runs": len(runs), "structural_issues": len(issues)},
+     }
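A hypothetical run that trips two of the checks above:

    from fba_bench_core.benchmarking.validators.structural_consistency import (
        structural_consistency,
    )

    run = {
        "scenario_key": "demo",
        "runner_key": "runner_a",
        "status": "failure",
        "duration_ms": -5,       # negative_duration
        "metrics": {},
        "output": {"value": 1},  # unexpected_output_on_failure (non-success status)
    }
    report = structural_consistency([run], {})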
@@ -0,0 +1,152 @@
+ """Typed configuration contracts for fba_bench_core.
+
+ Phase D:
+ - Introduces BaseAgentConfig and BaseServiceConfig as Pydantic models.
+ - Enforces typed metadata, forbids extra fields, and validates identifiers.
+ - Models are frozen (immutable) to prevent accidental mutation by consumers.
+
+ Downstream guidance:
+ - Subclass BaseAgentConfig / BaseServiceConfig to add domain-specific fields.
+ - Use model_copy(update={...}) to create modified copies rather than mutating.
+ """
+
+ from __future__ import annotations
+
+ import re
+
+ from pydantic import BaseModel, ConfigDict, Field, field_validator
+
+ # Allowed primitive metadata value types. Reject nested dicts/lists to avoid
+ # arbitrary deep structures hiding Any.
+ Primitive = str | int | float | bool
+
+
+ def _validate_slug(value: str, field_name: str) -> str:
+     """Ensure identifier uses a simple slug format (alphanum, hyphen, underscore)."""
+     if not isinstance(value, str) or not re.match(r"^[a-zA-Z0-9_-]+$", value):
+         raise ValueError(
+             f"{field_name!r} must be a slug (letters, digits, hyphen, underscore)."
+         )
+     return value
+
+
+ class BaseConfigModel(BaseModel):
+     """Shared base for configuration models.
+
+     Provides strict Pydantic model settings:
+     - extra="forbid": disallow unknown fields (prevents accidental additions).
+     - validate_assignment=True: validate when creating copies or assigning (still
+       compatible with frozen models).
+     - frozen=True: make instances immutable to avoid accidental runtime mutation.
+     """
+
+     model_config = ConfigDict(
+         extra="forbid",
+         validate_assignment=True,
+         frozen=True,
+     )
+
+
+ class BaseAgentConfig(BaseConfigModel):
+     """Base configuration contract for agents.
+
+     Fields:
+         agent_id: Unique identifier (slug) for the agent instance.
+         poll_interval_seconds: Optional polling interval (seconds) for
+             agents that poll external systems. Keep None if not used.
+         max_concurrent_tasks: Optional concurrency hint for schedulers.
+         default_region: Optional region/locale hint (e.g., "us-west-2").
+         metadata: Shallow mapping of simple metadata values (no nested dicts/lists).
+             Keys are strings and values are limited to primitive types.
+
+     Example:
+         class PricingAgentConfig(BaseAgentConfig):
+             pricing_tier: Literal["basic", "pro"] = "basic"
+     """
+
+     agent_id: str
+     poll_interval_seconds: int | None = None
+     max_concurrent_tasks: int | None = None
+     default_region: str | None = None
+     metadata: dict[str, Primitive] = Field(default_factory=dict)
+
+     @field_validator("agent_id")
+     @classmethod
+     def _check_agent_id(cls, v: str) -> str:
+         return _validate_slug(v, "agent_id")
+
+     @field_validator("poll_interval_seconds", "max_concurrent_tasks")
+     @classmethod
+     def _non_negative_ints(cls, v: int | None) -> int | None:
+         if v is None:
+             return v
+         if v < 0:
+             raise ValueError("must be non-negative")
+         return v
+
+     @field_validator("metadata")
+     @classmethod
+     def _validate_metadata(cls, v: dict[str, Primitive]) -> dict[str, Primitive]:
+         if not isinstance(v, dict):
+             raise ValueError("metadata must be a mapping of str -> primitive values")
+         for k, val in v.items():
+             if not isinstance(k, str):
+                 raise ValueError("metadata keys must be strings")
+             if not isinstance(val, (str, int, float, bool)):
+                 raise ValueError(
+                     "metadata values must be primitive types (str, int, float, bool)"
+                 )
+         return v
+
+
+ class BaseServiceConfig(BaseConfigModel):
+     """Base configuration contract for services.
+
+     Fields:
+         service_id: Unique identifier (slug) for the service instance.
+         poll_interval_seconds, max_concurrent_tasks, default_region, metadata:
+             same semantics as in BaseAgentConfig.
+
+     Example:
+         class CacheServiceConfig(BaseServiceConfig):
+             ttl_seconds: int = 300
+     """
+
+     service_id: str
+     poll_interval_seconds: int | None = None
+     max_concurrent_tasks: int | None = None
+     default_region: str | None = None
+     metadata: dict[str, Primitive] = Field(default_factory=dict)
+
+     @field_validator("service_id")
+     @classmethod
+     def _check_service_id(cls, v: str) -> str:
+         return _validate_slug(v, "service_id")
+
+     @field_validator("poll_interval_seconds", "max_concurrent_tasks")
+     @classmethod
+     def _non_negative_ints(cls, v: int | None) -> int | None:
+         if v is None:
+             return v
+         if v < 0:
+             raise ValueError("must be non-negative")
+         return v
+
+     @field_validator("metadata")
+     @classmethod
+     def _validate_metadata(cls, v: dict[str, Primitive]) -> dict[str, Primitive]:
+         # Reuse same validation semantics as agent metadata.
+         if not isinstance(v, dict):
+             raise ValueError("metadata must be a mapping of str -> primitive values")
+         for k, val in v.items():
+             if not isinstance(k, str):
+                 raise ValueError("metadata keys must be strings")
+             if not isinstance(val, (str, int, float, bool)):
+                 raise ValueError(
+                     "metadata values must be primitive types (str, int, float, bool)"
+                 )
+         return v
+
+
+ # End of file. Subclass these models in downstream packages to add domain-specific
+ # configuration while preserving validation and immutability guarantees.
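Following the downstream guidance in the module docstring, a minimal sketch of subclassing and copy-on-update (PricingAgentConfig comes from the docstring example; the field values are hypothetical):

    from typing import Literal

    from fba_bench_core.config import BaseAgentConfig

    class PricingAgentConfig(BaseAgentConfig):
        pricing_tier: Literal["basic", "pro"] = "basic"

    cfg = PricingAgentConfig(agent_id="pricing-agent-1", metadata={"team": "alpha"})
    # Frozen model: derive a modified copy instead of mutating in place.
    upgraded = cfg.model_copy(update={"pricing_tier": "pro"})
    # PricingAgentConfig(agent_id="bad id!") would raise: not a valid slug.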