PyPI - docent-python - Versions diffs - 0.1.21a0__py3-none-any.whl → 0.1.23a0__py3-none-any.whl - Mend

docent-python 0.1.21a0py3-none-any.whl → 0.1.23a0py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of docent-python might be problematic. Click here for more details.

Files changed (23) hide show

docent/_llm_util/data_models/llm_output.py +3 -0
docent/_llm_util/llm_cache.py +4 -4
docent/_llm_util/{prod_llms.py → llm_svc.py} +104 -86
docent/_llm_util/providers/preference_types.py +2 -2
docent/data_models/__init__.py +2 -2
docent/data_models/judge.py +7 -4
docent/judges/__init__.py +2 -0
docent/judges/analysis.py +77 -0
docent/judges/impl.py +484 -119
docent/judges/runner.py +66 -0
docent/judges/stats.py +205 -0
docent/judges/types.py +73 -2
docent/judges/util/meta_schema.json +3 -1
docent/judges/util/parse_output.py +8 -16
docent/judges/util/voting.py +61 -6
docent/sdk/client.py +72 -41
docent/trace.py +18 -0
{docent_python-0.1.21a0.dist-info → docent_python-0.1.23a0.dist-info}/METADATA +2 -1
{docent_python-0.1.21a0.dist-info → docent_python-0.1.23a0.dist-info}/RECORD +21 -20
docent/_llm_util/data_models/simple_svc.py +0 -79
docent/trace_2.py +0 -1842
{docent_python-0.1.21a0.dist-info → docent_python-0.1.23a0.dist-info}/WHEEL +0 -0
{docent_python-0.1.21a0.dist-info → docent_python-0.1.23a0.dist-info}/licenses/LICENSE.md +0 -0

docent/judges/runner.py ADDED Viewed

@@ -0,0 +1,66 @@
+import anyio
+from tqdm.auto import tqdm
+from docent._llm_util.llm_svc import BaseLLMService
+from docent._log_util import get_logger
+from docent.data_models.agent_run import AgentRun
+from docent.judges import (
+    JudgeResult,
+    JudgeResultCompletionCallback,
+    Rubric,
+)
+from docent.judges.impl import build_judge
+logger = get_logger(__name__)
+async def run_rubric(
+    agent_runs: list[AgentRun],
+    rubric: Rubric,
+    llm_svc: BaseLLMService,
+    callback: JudgeResultCompletionCallback | None = None,
+    *,
+    show_progress: bool = True,
+) -> list[JudgeResult | None]:
+    if not agent_runs:
+        raise ValueError("agent_runs must be a non-empty sequence")
+    if rubric.n_rollouts_per_input <= 0:
+        raise ValueError("rubric.n_rollouts_per_input must be greater than 0")
+    judge = build_judge(rubric, llm_svc)
+    logger.info(
+        "Running rubric %s version %s against %d agent runs",
+        rubric.id,
+        rubric.version,
+        len(agent_runs),
+    )
+    agent_results: list[JudgeResult | None] = [None for _ in agent_runs]
+    progress_bar = tqdm(
+        total=len(agent_runs), desc=f"Rubric {rubric.id}", disable=not show_progress
+    )
+    async def _run_single_judge(index: int, agent_run: AgentRun):
+        agent_results[index] = result = await judge(agent_run)
+        if callback is not None:
+            await callback(index, [result] if result is not None else None)
+        progress_bar.update()
+    try:
+        async with anyio.create_task_group() as tg:
+            for index, agent_run in enumerate(agent_runs):
+                tg.start_soon(_run_single_judge, index, agent_run)
+    finally:
+        progress_bar.close()
+    successful = sum(result is not None for result in agent_results)
+    logger.info(
+        "Finished rubric %s: produced %d/%d judge results",
+        rubric.id,
+        successful,
+        len(agent_results),
+    )
+    return agent_results

docent/judges/stats.py ADDED Viewed

@@ -0,0 +1,205 @@
+from typing import Iterator, List, Tuple
+from scipy import stats
+from docent._log_util import get_logger
+logger = get_logger(__name__)
+def print_stats_with_intervals(name: str, mean: float, std: float, confidence_levels: list[float]):
+    """Print statistics with confidence intervals at multiple confidence levels.
+    Args:
+        name: Name of the statistic
+        mean: Mean value
+        std: Standard deviation
+        confidence_levels: List of confidence levels (e.g., [0.90, 0.95, 0.99])
+    """
+    intervals_str = ", ".join(
+        [
+            f"{int(level*100)}% interval: [{mean - stats.norm.ppf((1+level)/2) * std:.4f}, {mean + stats.norm.ppf((1+level)/2) * std:.4f}]"  # type: ignore
+            for level in confidence_levels
+        ]
+    )
+    print(f"{name} mean: {mean:.4f}, std: {std:.4f}, {intervals_str}")
+def _bounded_compositions(total: int, parts: int, bound: int) -> Iterator[Tuple[int, ...]]:
+    """
+    Yield all tuples (x1,...,x_parts) of nonnegative ints summing to `total`
+    with each xk <= bound.
+    """
+    # Recursive backtracking with pruning by remaining capacity.
+    def rec(k: int, remaining: int, prefix: List[int]) -> Iterator[Tuple[int, ...]]:
+        if k == parts:
+            if remaining == 0:
+                yield tuple(prefix)
+            return
+        # Max we can put here is min(bound, remaining - min_needed_for_rest)
+        # The min needed for the rest is 0; also cannot exceed remaining.
+        max_here = min(bound, remaining)
+        # Optional pruning: ensure the rest can absorb what's left (always true since min=0)
+        for x in range(max_here + 1):
+            prefix.append(x)
+            yield from rec(k + 1, remaining - x, prefix)
+            prefix.pop()
+    yield from rec(0, total, [])
+def plurality_vectors(m: int, K: int, i: int) -> Iterator[Tuple[int, ...]]:
+    """
+    Generate all count vectors n = (n1,...,nm) of nonnegative integers with
+    sum(n) = K and STRICT plurality at index i:
+       n[i] > n[j] for all j != i.
+    Yields vectors in no particular order.
+    """
+    if not (0 <= i < m):
+        raise ValueError("i must be in [0, m).")
+    if m < 2 or K < 1:
+        return  # nothing to yield in degenerate cases
+    for ni in range(1, K + 1):  # at least 1 vote for the winner
+        rest_total = K - ni
+        cap = ni - 1  # strict plurality: others must be <= ni-1
+        # If cap < 0 but rest_total > 0, impossible
+        if cap < 0 and rest_total > 0:
+            continue
+        # Build the other m-1 counts under the cap
+        for others in _bounded_compositions(rest_total, m - 1, cap):
+            # Stitch back in ni at position i
+            vec = list(others[:i]) + [ni] + list(others[i:])
+            yield tuple(vec)
+def p_mode(n: int, p_v: list[float], idx: int) -> float:
+    """Probability that the modal sample of sampling Multinom(n, p_v) is the idxth one."""
+    count_vecs = plurality_vectors(len(p_v), n, idx)
+    return sum(stats.multinomial.pmf(vec, n, p_v) for vec in count_vecs)  # type: ignore
+# async def analyze_majority_judge(
+#     rubric: Rubric,
+#     agent_runs: list[AgentRun],
+#     matched_labels: dict[str, dict[str, Any]],  # agent_run_id -> gold label obj
+#     results_path: Path,
+#     samples_per_agent_run: int = 10,
+#     maj_k: int = 5,  # Does not affect data collection
+#     max_llm_concurrency: int = 100,
+# ):
+#     # if rubric.n_rollouts_per_input != 1:
+#     #     raise ValueError("You should use n_rollouts_per_input=1")
+#     if not results_path.exists():
+#         logger.info(f"Evaluating rubrics and saving results to {results_path}")
+#         max_conc_per_rubric = min(
+#             max_llm_concurrency, len(agent_runs) * rubric.n_rollouts_per_input
+#         )
+#         max_parallel_rubrics = max(1, max_llm_concurrency // max_conc_per_rubric)
+#         logger.info(
+#             f"Evaluating {samples_per_agent_run} samples per agent run, {max_conc_per_rubric} concurrent LLM calls per rubric, {max_parallel_rubrics} parallel rubrics"
+#         )
+#         await evaluate_multiple_rubrics(
+#             rubrics=[rubric] * samples_per_agent_run,
+#             agent_runs=agent_runs,
+#             llm_svc=SimpleLLMService(),
+#             output_path=results_path,
+#             max_concurrent_llm_calls_per_rubric=max_conc_per_rubric,
+#             max_parallel_rubrics=max_parallel_rubrics,
+#         )
+#     else:
+#         logger.info(f"Found existing results at {results_path}, loading them")
+#     rows = load_rubric_results_from_file(results_path)
+#     # Parse results into a flat dataframe
+#     parsed_results: list[dict[str, Any]] = []
+#     for row in rows:
+#         config_key = row.rubric.model_dump_json(
+#             exclude={
+#                 "rubric_text",
+#                 "id",
+#                 "version",
+#                 "system_prompt_template",
+#                 "citation_instructions",
+#                 "output_schema",
+#             }
+#         )
+#         for result in row.judge_results:
+#             if result is None:
+#                 print("found null")
+#                 continue
+#                 # raise ValueError("result is None")
+#             gold_label = matched_labels[result.agent_run_id]["label"]
+#             judge_label = result.output["label"]
+#             md = result.result_metadata or {}
+#             parsed_results.append(
+#                 {
+#                     "config_key": config_key,
+#                     "agent_run_id": result.agent_run_id,
+#                     "gold_label": gold_label,
+#                     "judge_label": judge_label,
+#                     "final_results": md.get("final_results"),
+#                     "is_correct": gold_label == judge_label,
+#                 }
+#             )
+#     df = pd.DataFrame(parsed_results)
+#     potential_labels = ["cheating", "future leakage", "not suspicious"]
+#     def _get_pred_dist(x: pd.Series):
+#         x = x.tolist()
+#         counts = [x.count(l) for l in potential_labels]
+#         assert np.sum(counts) == len(x)
+#         return {l: counts[i] / len(x) for i, l in enumerate(potential_labels)}
+#     n_ars = len(df.groupby("agent_run_id").count())
+#     p_correct = (
+#         df.groupby("agent_run_id")
+#         .agg(
+#             {
+#                 "gold_label": lambda x: x.iloc[0],
+#                 "judge_label": _get_pred_dist,
+#                 "is_correct": np.mean,
+#             }
+#         )
+#         .rename(columns={"judge_label": "pred_dist", "is_correct": "p_correct_naive"})
+#     )
+#     p_correct["p_correct_naive_var"] = p_correct["p_correct_naive"].apply(lambda x: x * (1 - x))
+#     p_correct["p_correct_majority"] = p_correct.apply(
+#         lambda row: p_mode(
+#             maj_k,
+#             [row["pred_dist"][l] for l in potential_labels],
+#             potential_labels.index(row["gold_label"]),
+#         ),
+#         axis=1,
+#     )
+#     p_correct["p_correct_majority_var"] = p_correct["p_correct_majority"].apply(
+#         lambda x: x * (1 - x)
+#     )
+#     p_correct.sort_values(by="p_correct_majority_var", ascending=False, inplace=True)
+#     overall_naive_mean = p_correct["p_correct_naive"].mean()
+#     overall_naive_std = np.sqrt(p_correct["p_correct_naive_var"].sum() / n_ars**2)
+#     overall_majority_mean = p_correct["p_correct_majority"].mean()
+#     overall_majority_std = np.sqrt(p_correct["p_correct_majority_var"].sum() / n_ars**2)
+#     confidence_levels = [0.5, 0.95]
+#     print_stats_with_intervals(
+#         "Overall naive", overall_naive_mean, overall_naive_std, confidence_levels
+#     )
+#     print_stats_with_intervals(
+#         f"Overall majority (k={maj_k})",
+#         overall_majority_mean,
+#         overall_majority_std,
+#         confidence_levels,
+#     )
+#     return p_correct

docent/judges/types.py CHANGED Viewed

@@ -1,7 +1,7 @@
 import enum
 import json
 from string import Formatter
-from typing import Any, Callable, Protocol
+from typing import Any, Callable, Literal, Protocol
 from uuid import uuid4
 from pydantic import BaseModel, Field, field_serializer, field_validator
@@ -19,12 +19,64 @@ DEFAULT_JUDGE_SYSTEM_PROMPT_TEMPLATE = """
 Here is a rubric that we are using to judge transcripts of AI agent runs.
 Rubric:
+<rubric>
 {rubric}
+</rubric>
 Agent run:
+<agent_run>
 {agent_run}
+</agent_run>
-Your response should convey your judgment of the agent run according to the criteria given in the rubric provided above. Your entire response must be a valid JSON string which can be parsed with python `json.loads` without any additional processing. Double quotes (`"`) in the middle of a string in the JSON object must be escaped with a backslash.
+Your goal is to judge the agent run according to the criteria given in the rubric. Start by faithfully following the decision procedure in extremely careful detail, step by step.
+When you are finished, output your final adjudication, surrounded by <response>...</response> tags. The response must be a valid JSON string which can be parsed with python `json.loads` without any additional processing. Double quotes (`"`) in the middle of a string in the JSON object must be escaped with a backslash.
+The JSON object you produce must adhere to the following schema:
+{output_schema}
+{citation_instructions}
+""".strip()
+DEFAULT_MULTI_TURN_JUDGE_SYSTEM_PROMPT_TEMPLATE = """
+Here is a rubric that we are using to judge transcripts of AI agent runs.
+Rubric:
+<rubric>
+{rubric}
+</rubric>
+Agent run:
+<agent_run>
+{agent_run}
+</agent_run>
+Your goal is to judge the agent run according to the criteria given in the rubric. Start by faithfully following the decision procedure in extremely careful detail, step by step. You must execute **one step in the decision procedure per assistant message turn**. After each turn, output a complete and detailed recount of all actions you took, and everything you discovered. Then call the `step_finished` tool.
+When you are finished going through the decision procedure, output your final adjudication, surrounded by <response>...</response> tags. The response must be a valid JSON string which can be parsed with python `json.loads` without any additional processing. Double quotes (`"`) in the middle of a string in the JSON object must be escaped with a backslash.
+The JSON object you produce must adhere to the following schema:
+{output_schema}
+{citation_instructions}
+""".strip()
+DEFAULT_EXPOSED_REASONING_JUDGE_SYSTEM_PROMPT_TEMPLATE = """
+Here is a rubric that we are using to judge transcripts of AI agent runs.
+Rubric:
+<rubric>
+{rubric}
+</rubric>
+Agent run:
+<agent_run>
+{agent_run}
+</agent_run>
+Your goal is to judge the agent run according to the criteria given in the rubric. Start by faithfully following the decision procedure in extremely careful detail, step by step. You must *fully externalize* your reasoning work by outputting details in the assistant message, surrounded by <reasoning>...</reasoning> tags. The reasoning section can be as messy as you need. You should use *high* reasoning effort.
+When you are finished, output your final adjudication in the assistant message, surrounded by <response>...</response> tags. The response must be a valid JSON string which can be parsed with python `json.loads` without any additional processing. Double quotes (`"`) in the middle of a string in the JSON object must be escaped with a backslash.
 The JSON object you produce must adhere to the following schema:
 {output_schema}
@@ -51,6 +103,11 @@ DEFAULT_JUDGE_OUTPUT_SCHEMA = {
 DEFAULT_JUDGE_MODEL = PUBLIC_PROVIDER_PREFERENCES.default_judge_models[0]
+class JudgeVariant(str, enum.Enum):
+    MAJORITY = "majority"
+    MULTI_REFLECT = "multi-reflect"
 class Rubric(BaseModel):
     """TODO(mengk): this should really be called JudgeConfig,
     but temporarily keeping this for consistency with docent_core."""
@@ -64,6 +121,11 @@ class Rubric(BaseModel):
     # What the judge actually does
     rubric_text: str
+    n_rollouts_per_input: int = 1
+    judge_variant: JudgeVariant = JudgeVariant.MAJORITY
+    # TODO(mengk): add this to the database
+    # No need right now because multi-turn is still very experimental.
+    rollout_type: Literal["single_turn", "multi_turn"] = "single_turn"
     # Default instructions for the judge
     system_prompt_template: str = DEFAULT_JUDGE_SYSTEM_PROMPT_TEMPLATE
@@ -129,6 +191,15 @@ class Rubric(BaseModel):
         return output_schema
+class MultiTurnRubric(Rubric):
+    system_prompt_template: str = DEFAULT_MULTI_TURN_JUDGE_SYSTEM_PROMPT_TEMPLATE
+    rollout_type: Literal["single_turn", "multi_turn"] = "multi_turn"
+class ExposedReasoningRubric(Rubric):
+    system_prompt_template: str = DEFAULT_EXPOSED_REASONING_JUDGE_SYSTEM_PROMPT_TEMPLATE
 class ResultType(enum.Enum):
     """Enum for the type of result that a judge result can have."""

docent/judges/util/meta_schema.json CHANGED Viewed

@@ -33,7 +33,9 @@
           },
           "enum": {
             "type": "array",
-            "items": { "type": "string" }
+            "items": {
+              "type": ["string", "integer", "boolean"]
+            }
           },
           "format": {
             "type": "string",

docent/judges/util/parse_output.py CHANGED Viewed

@@ -1,10 +1,8 @@
-import json
 from typing import Any, cast
 import jsonschema
 from docent._llm_util.data_models.exceptions import ValidationFailedException
-from docent._llm_util.data_models.llm_output import LLMOutput
 from docent._log_util import get_logger
 from docent.data_models.agent_run import AgentRun
 from docent.data_models.remove_invalid_citation_ranges import remove_invalid_citation_ranges
@@ -55,10 +53,8 @@ def _validate_rubric_output(
         )
-def parse_and_validate_llm_output(
-    llm_output: LLMOutput,
-    output_schema: dict[str, Any],
-    agent_run: AgentRun,
+def parse_and_validate_output_str(
+    output_str: str, output_schema: dict[str, Any], agent_run: AgentRun
 ) -> dict[str, Any]:
     """Parse and validate LLM output for rubric evaluation.
@@ -73,23 +69,19 @@ def parse_and_validate_llm_output(
     Raises:
         ValidationFailedException: If parsing or validation fails
     """
-    if llm_output.first_text is None:
-        raise ValidationFailedException("LLM output has no text", failed_output=None)
     try:
-        output = forgiving_json_loads(llm_output.first_text)
-    except json.JSONDecodeError as e:
+        output = forgiving_json_loads(output_str)
+    except Exception as e:
         raise ValidationFailedException(
-            f"Failed to parse JSON: {e}. Raw text: `{llm_output.first_text}`",
-            failed_output=llm_output.first_text,
+            f"Failed to parse JSON: {e}. Raw text: `{output_str}`",
+            failed_output=output_str,
         )
     if not isinstance(output, dict):
-        logger.error(f"Expected dict output, got {type(output)}")
-        logger.error(f"LLM output: {llm_output.first_text}")
         raise ValidationFailedException(
-            f"Expected dict output, got {type(output)}. Raw text: {llm_output.first_text}",
-            failed_output=llm_output.first_text,
+            f"Expected dict output, got {type(output)}. Raw text: {output_str}",
+            failed_output=output_str,
         )
     return _validate_rubric_output(cast(dict[str, Any], output), output_schema, agent_run)

docent/judges/util/voting.py CHANGED Viewed

@@ -1,11 +1,23 @@
 from collections import Counter
-from typing import Any, cast
+from typing import Any, TypedDict, cast
+import numpy as np
+class EstimateWithCI(TypedDict):
+    mean: float
+    var: float
+    n: int
+    ci_95: float
+JudgeOutputDistribution = dict[str | bool | int | float, EstimateWithCI]
 def get_agreement_keys(schema: dict[str, Any]) -> list[str]:
     """Get list of top-level keys in schema that we want to measure agreement on.
-    This includes enum, bool, and int fields. We skip float and strings.
+    This includes enum and bool fields.
     Args:
         schema: JSON schema dict
@@ -29,10 +41,7 @@ def get_agreement_keys(schema: dict[str, Any]) -> list[str]:
         # Include boolean fields
         if field_type == "boolean":
             agreement_keys.append(key)
-        # Include integer fields
-        elif field_type == "integer":
-            agreement_keys.append(key)
-        # Include enum fields (even strings)
+        # Include enum fields (strings and numbers must be in this category)
         elif "enum" in field_schema:
             agreement_keys.append(key)
@@ -82,3 +91,49 @@ def find_modal_result(indep_results: list[dict[str, Any]], agreement_keys: list[
     max_idx = indep_result_scores.index(max(indep_result_scores))
     return max_idx, agt_key_modes_and_counts
+def compute_output_distributions(
+    indep_results: list[dict[str, Any]], output_schema: dict[str, Any], agreement_keys: list[str]
+):
+    def _get_possible_values(key: str) -> list[str | bool | int | float]:
+        if "enum" in output_schema.get("properties", {}).get(key, {}):
+            return output_schema.get("properties", {}).get(key, {}).get("enum", [])
+        elif output_schema.get("properties", {}).get(key, {}).get("type") == "boolean":
+            return [True, False]
+        else:
+            return []
+    raw_counts: dict[str, dict[str | bool | int | float, int]] = {
+        key: {value: 0 for value in _get_possible_values(key)} for key in agreement_keys
+    }
+    # Collect counts for each possible value
+    for result in indep_results:
+        for key in agreement_keys:
+            if (value := result.get(key)) is not None:  # Could be none if the key is optional
+                assert (
+                    value in raw_counts[key]
+                ), "this should never happen; the value must be in possible values, since judge results have been validated against the schema"
+                raw_counts[key][value] += 1
+    distributions: dict[str, JudgeOutputDistribution] = {}
+    for agt_key in agreement_keys:
+        distributions[agt_key] = {}
+        # First normalize the counts to get probabilities
+        counts = raw_counts[agt_key]
+        total = sum(counts.values())
+        probs = {value: (count / total) if total > 0 else 0.0 for value, count in counts.items()}
+        for output_key, value in probs.items():
+            mean, estimate_var = value, (value * (1 - value))
+            # TODO(mengk): change to the wilson score interval
+            ci_95 = float(1.96 * np.sqrt(estimate_var / total)) if total > 0 else 0.0
+            distributions[agt_key][output_key] = {
+                "mean": mean,
+                "var": estimate_var,
+                "n": total,
+                "ci_95": ci_95,
+            }
+    return distributions

docent-python 0.1.21a0__py3-none-any.whl → 0.1.23a0__py3-none-any.whl

Potentially problematic release.

docent-python 0.1.21a0py3-none-any.whl → 0.1.23a0py3-none-any.whl