docent-python 0.1.19a0__py3-none-any.whl → 0.1.27a0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of docent-python might be problematic. Click here for more details.
- docent/_llm_util/__init__.py +0 -0
- docent/_llm_util/data_models/__init__.py +0 -0
- docent/_llm_util/data_models/exceptions.py +48 -0
- docent/_llm_util/data_models/llm_output.py +331 -0
- docent/_llm_util/llm_cache.py +193 -0
- docent/_llm_util/llm_svc.py +472 -0
- docent/_llm_util/model_registry.py +130 -0
- docent/_llm_util/providers/__init__.py +0 -0
- docent/_llm_util/providers/anthropic.py +537 -0
- docent/_llm_util/providers/common.py +41 -0
- docent/_llm_util/providers/google.py +530 -0
- docent/_llm_util/providers/openai.py +745 -0
- docent/_llm_util/providers/openrouter.py +375 -0
- docent/_llm_util/providers/preference_types.py +104 -0
- docent/_llm_util/providers/provider_registry.py +164 -0
- docent/data_models/__init__.py +2 -2
- docent/data_models/agent_run.py +1 -0
- docent/data_models/judge.py +7 -4
- docent/data_models/transcript.py +2 -0
- docent/data_models/util.py +170 -0
- docent/judges/__init__.py +23 -0
- docent/judges/analysis.py +77 -0
- docent/judges/impl.py +587 -0
- docent/judges/runner.py +129 -0
- docent/judges/stats.py +205 -0
- docent/judges/types.py +311 -0
- docent/judges/util/forgiving_json.py +108 -0
- docent/judges/util/meta_schema.json +86 -0
- docent/judges/util/meta_schema.py +29 -0
- docent/judges/util/parse_output.py +87 -0
- docent/judges/util/voting.py +139 -0
- docent/sdk/client.py +181 -44
- docent/trace.py +362 -44
- {docent_python-0.1.19a0.dist-info → docent_python-0.1.27a0.dist-info}/METADATA +11 -5
- docent_python-0.1.27a0.dist-info/RECORD +59 -0
- docent_python-0.1.19a0.dist-info/RECORD +0 -32
- {docent_python-0.1.19a0.dist-info → docent_python-0.1.27a0.dist-info}/WHEEL +0 -0
- {docent_python-0.1.19a0.dist-info → docent_python-0.1.27a0.dist-info}/licenses/LICENSE.md +0 -0
|
@@ -0,0 +1,170 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import Dict, Iterable, List, TypeVar
|
|
4
|
+
from uuid import uuid4
|
|
5
|
+
|
|
6
|
+
from pydantic import BaseModel
|
|
7
|
+
|
|
8
|
+
from docent.data_models.agent_run import AgentRun
|
|
9
|
+
|
|
10
|
+
T = TypeVar("T", bound=BaseModel)
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def _deep_copy_model(model: T) -> T:
|
|
14
|
+
"""Create a deep copy of a Pydantic v2 model.
|
|
15
|
+
|
|
16
|
+
Using `model_copy(deep=True)` ensures nested models are fully copied and
|
|
17
|
+
mutations do not affect the original instance.
|
|
18
|
+
"""
|
|
19
|
+
return model.model_copy(deep=True)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def _check_run_integrity(
    run: AgentRun, *, dup_suffix: str, ref_suffix: str, run_label: str
) -> None:
    """Validate internal ID consistency of `run`, raising ValueError on violation.

    Checks performed:
    - No duplicate transcript or transcript group ids.
    - Every `Transcript.transcript_group_id` (if set) references a group in the run.
    - Every `TranscriptGroup.parent_transcript_group_id` (if set) references a group in the run.
    - Every `TranscriptGroup.agent_run_id` matches `run.id`.

    Args:
        run: The `AgentRun` to validate.
        dup_suffix: Trailing phrase for duplicate-id error messages.
        ref_suffix: Trailing phrase for missing-reference error messages.
        run_label: How the run id is named in the agent_run_id mismatch message.

    Raises:
        ValueError: On the first integrity violation found.
    """
    transcript_ids = [str(t.id) for t in run.transcripts]
    if len(transcript_ids) != len(set(transcript_ids)):
        raise ValueError(f"Duplicate transcript ids detected {dup_suffix}")

    group_ids = [str(g.id) for g in run.transcript_groups]
    if len(group_ids) != len(set(group_ids)):
        raise ValueError(f"Duplicate transcript group ids detected {dup_suffix}")

    group_id_set = set(group_ids)
    for t in run.transcripts:
        if t.transcript_group_id is not None and str(t.transcript_group_id) not in group_id_set:
            raise ValueError(
                f"Transcript {t.id} references missing transcript_group_id {t.transcript_group_id}{ref_suffix}"
            )

    for g in run.transcript_groups:
        if (
            g.parent_transcript_group_id is not None
            and str(g.parent_transcript_group_id) not in group_id_set
        ):
            raise ValueError(
                f"TranscriptGroup {g.id} references missing parent_transcript_group_id {g.parent_transcript_group_id}{ref_suffix}"
            )
        if str(g.agent_run_id) != str(run.id):
            raise ValueError(
                f"TranscriptGroup {g.id} has agent_run_id {g.agent_run_id} which does not match {run_label} {run.id}"
            )


def clone_agent_run_with_random_ids(agent_run: AgentRun) -> AgentRun:
    """Clone an `AgentRun`, randomizing all IDs and fixing internal references.

    The following transformations are performed on the cloned instance:
    - Assign a new `AgentRun.id`.
    - Assign new `Transcript.id` values and update any references to them (none today).
    - Assign new `TranscriptGroup.id` values.
    - Update `Transcript.transcript_group_id` to the new group IDs where applicable.
    - Update `TranscriptGroup.agent_run_id` to the new `AgentRun.id`.
    - Update `TranscriptGroup.parent_transcript_group_id` to the new group IDs where applicable.

    Args:
        agent_run: The source `AgentRun` to clone.

    Returns:
        A new, independent `AgentRun` instance with randomized identifiers and consistent references.

    Raises:
        ValueError: If the source run (before cloning) or the clone (after
            remapping) contains duplicate ids, a `transcript_group_id` or
            `parent_transcript_group_id` that does not reference a group in the
            run, or a `TranscriptGroup.agent_run_id` that does not match the
            run's id.
    """
    # Validate source integrity before cloning so that every reference we are
    # about to remap is guaranteed to resolve.
    _check_run_integrity(
        agent_run,
        dup_suffix="in source AgentRun",
        ref_suffix="",
        run_label="AgentRun.id",
    )

    # Deep copy first so we never mutate the caller's instance.
    new_run = _deep_copy_model(agent_run)

    # 1) Randomize AgentRun ID.
    new_agent_run_id = str(uuid4())

    # 2) Pre-compute new IDs for transcripts and transcript groups without
    #    mutating yet, so cross-references can be remapped consistently.
    old_to_new_transcript_id: Dict[str, str] = {
        str(transcript.id): str(uuid4()) for transcript in new_run.transcripts
    }
    old_to_new_group_id: Dict[str, str] = {
        str(group.id): str(uuid4()) for group in new_run.transcript_groups
    }

    # 3) Mutate transcript groups: set new id, set agent_run_id, remap parents.
    for group in new_run.transcript_groups:
        old_group_id = str(group.id)

        # Direct indexing: the id was just recorded in the mapping above.
        group.id = old_to_new_group_id[old_group_id]

        # Ensure group points to the new agent run id.
        group.agent_run_id = new_agent_run_id

        # Remap parent id; raise if unknown (unreachable after pre-validation,
        # kept as a defensive check).
        if group.parent_transcript_group_id is not None:
            old_parent_id = str(group.parent_transcript_group_id)
            if old_parent_id not in old_to_new_group_id:
                raise ValueError(
                    f"TranscriptGroup {old_group_id} parent_transcript_group_id {old_parent_id} not found in this AgentRun"
                )
            group.parent_transcript_group_id = old_to_new_group_id[old_parent_id]

    # 4) Mutate transcripts: set new id, remap transcript_group_id.
    for transcript in new_run.transcripts:
        old_transcript_id = str(transcript.id)

        transcript.id = old_to_new_transcript_id[old_transcript_id]

        # Remap group reference; raise if unknown (defensive, see above).
        if transcript.transcript_group_id is not None:
            old_group_id_ref = str(transcript.transcript_group_id)
            if old_group_id_ref not in old_to_new_group_id:
                raise ValueError(
                    f"Transcript {old_transcript_id} references transcript_group_id {old_group_id_ref} not found in this AgentRun"
                )
            transcript.transcript_group_id = old_to_new_group_id[old_group_id_ref]

    # 5) Finally set the new run id.
    new_run.id = new_agent_run_id

    # Post-validate integrity on the cloned run to catch remapping bugs.
    _check_run_integrity(
        new_run,
        dup_suffix="after cloning",
        ref_suffix=" after cloning",
        run_label="cloned AgentRun.id",
    )

    return new_run
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
def clone_agent_runs_with_random_ids(agent_runs: Iterable[AgentRun]) -> List[AgentRun]:
    """Clone every `AgentRun` in `agent_runs`, each with fresh randomized IDs.

    Args:
        agent_runs: Iterable of `AgentRun` instances to clone.

    Returns:
        A list of cloned `AgentRun` instances with fresh IDs and consistent references.
    """
    return list(map(clone_agent_run_with_random_ids, agent_runs))
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
from docent.judges.impl import BaseJudge, MajorityVotingJudge, MultiReflectionJudge
|
|
2
|
+
from docent.judges.types import (
|
|
3
|
+
JudgeResult,
|
|
4
|
+
JudgeResultCompletionCallback,
|
|
5
|
+
JudgeResultWithCitations,
|
|
6
|
+
JudgeVariant,
|
|
7
|
+
ResultType,
|
|
8
|
+
Rubric,
|
|
9
|
+
)
|
|
10
|
+
|
|
11
|
+
# Public API of the docent.judges package; prefer importing these names from
# here rather than from the submodules that define them.
__all__ = [
    # Judges
    "MajorityVotingJudge",
    "MultiReflectionJudge",
    "BaseJudge",
    # Types
    "Rubric",
    "JudgeResult",
    "JudgeResultWithCitations",
    "JudgeResultCompletionCallback",
    "ResultType",
    "JudgeVariant",
]
|
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
import json
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
from typing import Any
|
|
4
|
+
|
|
5
|
+
import anyio
|
|
6
|
+
from pydantic import BaseModel
|
|
7
|
+
from pydantic_core import to_jsonable_python
|
|
8
|
+
from tqdm.auto import tqdm
|
|
9
|
+
|
|
10
|
+
from docent._log_util import get_logger
|
|
11
|
+
from docent.data_models.agent_run import AgentRun
|
|
12
|
+
from docent.judges.impl import BaseJudge
|
|
13
|
+
from docent.judges.util.voting import JudgeOutputDistribution
|
|
14
|
+
|
|
15
|
+
logger = get_logger(__name__)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class MultiReflectRollouts(BaseModel):
    """Judge rollout data and output distributions for a single agent run."""

    # ID of the agent run this object was produced for.
    agent_run_id: str

    # Raw first-step rollout outputs, one dict per rollout.
    first_step_rollouts: list[dict[str, Any]]
    # Metadata for each first-step rollout; presumably index-aligned with
    # first_step_rollouts — TODO confirm against the judge implementation.
    first_step_rollout_metadata: list[dict[str, Any] | None]
    # Each index in second_step_rollouts corresponds to an index in first_step_combinations
    # Step 2 rollouts are computed by passing each step 1 combo into the judge several times
    first_step_combinations: list[list[dict[str, Any]]] | None = None
    second_step_rollouts: list[list[dict[str, Any]]] | None = None
    second_step_rollout_metadata: list[list[dict[str, Any] | None]] | None = None

    # Estimated output distribution keyed by string; key semantics are defined
    # by the judge's estimate_output_distrs — NOTE(review): confirm key meaning.
    distributions: dict[str, JudgeOutputDistribution]
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
async def collect_judge_pvs(
    judge: BaseJudge,
    agent_runs: list[AgentRun],
    *,
    results_path: Path,
    estimate_output_distrs_kwargs: dict[str, Any],
):
    """Estimate judge output distributions for each agent run, checkpointing to disk.

    Runs `judge.estimate_output_distrs(...)` concurrently for every run in
    `agent_runs`. Each non-None result is wrapped in a `MultiReflectRollouts`
    keyed by the run id, and the full results dict is re-serialized to
    `results_path` after every completion so progress survives interruption.

    Args:
        judge: Judge used to produce the output distributions.
        agent_runs: Agent runs to evaluate.
        results_path: JSON file to write; must not already exist.
        estimate_output_distrs_kwargs: Extra keyword arguments forwarded to
            `judge.estimate_output_distrs`.

    Returns:
        Dict mapping agent run id to its `MultiReflectRollouts`.

    Raises:
        FileExistsError: If `results_path` already exists.
    """
    if results_path.exists():
        raise FileExistsError(f"Results path already exists: {results_path}")
    results_path.parent.mkdir(parents=True, exist_ok=True)

    results = dict[str, MultiReflectRollouts]()
    persist_lock = anyio.Lock()
    pbar = tqdm(total=len(agent_runs), desc="Processing agent runs")

    async def _persist():
        # Rewrite the entire results dict on each checkpoint; the lock prevents
        # interleaved writes from concurrently finishing tasks.
        async with persist_lock:
            with open(str(results_path), "w") as f:
                json.dump(to_jsonable_python(results), f, indent=2)

    async def _execute_for_agent_run(agent_run: AgentRun):
        result = await judge.estimate_output_distrs(agent_run, **estimate_output_distrs_kwargs)
        if result is None:
            # The judge produced nothing for this run; still count it as processed.
            pbar.update(1)
            return

        distrs, metadata = result
        results[agent_run.id] = MultiReflectRollouts.model_validate(
            {
                "agent_run_id": agent_run.id,
                "distributions": distrs,
                **metadata,
            }
        )
        await _persist()
        pbar.update(1)

    # Fix: close the progress bar even when a task raises, so an exception
    # doesn't leave a dangling tqdm instance on the terminal.
    try:
        async with anyio.create_task_group() as tg_outer:
            for agent_run in agent_runs:
                tg_outer.start_soon(_execute_for_agent_run, agent_run)
    finally:
        pbar.close()

    return results
|