agent-apprenticeship 0.1.0 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,4 +1,6 @@
1
1
  from __future__ import annotations
2
+ import os
3
+ import signal
2
4
  import shutil, subprocess
3
5
  from pathlib import Path
4
6
  import re
@@ -16,6 +18,35 @@ class AttemptResult(dict): pass
16
18
 
17
19
  CODEX_TRUST_RETRY_MESSAGE = "Codex refused to run because the workspace is not a trusted Git directory. Retrying with --skip-git-repo-check if supported."
18
20
 
21
+
22
def _signal_process_group(process: subprocess.Popen, *, force: bool) -> None:
    """Signal the child's process group, falling back to the child alone.

    Sends SIGKILL when *force* is true and SIGTERM otherwise.
    """
    try:
        os.killpg(process.pid, signal.SIGKILL if force else signal.SIGTERM)
    except Exception:
        # Deliberately broad best effort: the group signal can fail for many
        # reasons (API or signal not available, group already gone), and in
        # every case the remaining option is to kill the direct child.
        process.kill()


def _run_with_process_group_timeout(command, *, cwd: Path, timeout: int | None, shell: bool = False) -> subprocess.CompletedProcess:
    """Run *command* in its own session and stop its process group if it overruns.

    Args:
        command: Argument list, or a shell string when *shell* is true.
        cwd: Working directory for the child.
        timeout: Seconds to wait for completion; ``None`` waits indefinitely.
        shell: Passed through to ``subprocess.Popen``.

    Returns:
        A ``CompletedProcess`` carrying the exit code and the captured text
        of stdout and stderr.

    Raises:
        subprocess.TimeoutExpired: The command outlived *timeout*. The group
            is sent SIGTERM, given 5 seconds, then sent SIGKILL; whatever
            output was captured is attached to the exception.
    """
    # The context manager closes the capture pipes on every exit path.
    with subprocess.Popen(
        command,
        cwd=cwd,
        text=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        shell=shell,
        start_new_session=True,
    ) as process:
        try:
            try:
                stdout, stderr = process.communicate(timeout=timeout)
            except subprocess.TimeoutExpired as exc:
                _signal_process_group(process, force=False)
                try:
                    stdout, stderr = process.communicate(timeout=5)
                except subprocess.TimeoutExpired:
                    _signal_process_group(process, force=True)
                    stdout, stderr = process.communicate()
                raise subprocess.TimeoutExpired(command, timeout, output=stdout, stderr=stderr) from exc
            return subprocess.CompletedProcess(command, process.returncode, stdout, stderr)
        finally:
            # Reached with a live child only when the wait was cut short by
            # something other than the timeout path (e.g. KeyboardInterrupt).
            # The child was started in its own session, so stop its group
            # here instead of leaving it running after the caller unwinds.
            if process.poll() is None:
                _signal_process_group(process, force=True)
49
+
19
50
  def _attempt_dir(package_root: Path, attempt_kind: str) -> Path:
20
51
  p=package_root/'attempts'/attempt_kind
21
52
  (p/'artifacts').mkdir(parents=True, exist_ok=True)
@@ -292,7 +323,7 @@ def run_codex_attempt(package_root: Path, raw: RawTaskRecord, spec: TaskIntakeSp
292
323
  stdout=''
293
324
  stderr=''
294
325
  try:
295
- cp=subprocess.run(cmd, cwd=d, text=True, capture_output=True, timeout=timeout)
326
+ cp=_run_with_process_group_timeout(cmd, cwd=d, timeout=timeout)
296
327
  returncode=cp.returncode
297
328
  stdout=cp.stdout or ''
298
329
  stderr=cp.stderr or ''
@@ -308,7 +339,7 @@ def run_codex_attempt(package_root: Path, raw: RawTaskRecord, spec: TaskIntakeSp
308
339
  skip_git_repo_check_supported=True,
309
340
  ask_for_approval_supported=ask_supported,
310
341
  )
311
- cp=subprocess.run(retry_cmd, cwd=d, text=True, capture_output=True, timeout=timeout)
342
+ cp=_run_with_process_group_timeout(retry_cmd, cwd=d, timeout=timeout)
312
343
  cmd=retry_cmd
313
344
  returncode=cp.returncode
314
345
  stdout=(stdout or '') + "\n" + (cp.stdout or '')
@@ -338,6 +369,18 @@ def run_codex_attempt(package_root: Path, raw: RawTaskRecord, spec: TaskIntakeSp
338
369
  if contract_diagnostics:
339
370
  actual.metadata_json['apprentice_agent_contract_diagnostics']=contract_diagnostics
340
371
  op_error = _apprentice_operational_error(run_error, stdout, stderr, returncode)
372
+ if (
373
+ op_error
374
+ and isinstance(run_error, subprocess.TimeoutExpired)
375
+ and trace_valid
376
+ and actual.status == 'success'
377
+ ):
378
+ actual.metadata_json['apprentice_agent_warning'] = (
379
+ 'Apprentice Agent process timed out after producing required outputs; '
380
+ 'the produced trace and artifacts were preserved.'
381
+ )
382
+ trace.metadata_json['apprentice_agent_warning'] = actual.metadata_json['apprentice_agent_warning']
383
+ op_error = None
341
384
  if op_error and returncode not in (None, 0) and trace_valid and actual.status == 'success':
342
385
  op_error = f"Apprentice Agent exited nonzero after producing required outputs (exit code {returncode})."
343
386
  if op_error or not trace_valid:
@@ -387,7 +430,7 @@ def run_custom_attempt(package_root: Path, raw: RawTaskRecord, spec: TaskIntakeS
387
430
  run_error=None
388
431
  cp=None
389
432
  try:
390
- cp=subprocess.run(command, cwd=d, text=True, capture_output=True, timeout=timeout or settings.task_timeout_seconds, shell=True)
433
+ cp=_run_with_process_group_timeout(command, cwd=d, timeout=timeout or settings.task_timeout_seconds, shell=True)
391
434
  stdout=redact_secrets(cp.stdout or '')
392
435
  stderr=redact_secrets(cp.stderr or '')
393
436
  (d/'stdout.txt').write_text(stdout)
@@ -3,6 +3,8 @@ from __future__ import annotations
3
3
  import os
4
4
  import shlex
5
5
  import shutil
6
+ from contextlib import contextmanager
7
+ from contextvars import ContextVar
6
8
  from pathlib import Path
7
9
  from typing import Any, Literal
8
10
 
@@ -28,6 +30,7 @@ DATA_SHARING_LEVELS: tuple[str, ...] = ("standard", "full-context")
28
30
  DEFAULT_APP_HOME = Path("~/.agent-apprenticeship").expanduser()
29
31
  DEFAULT_PUBLIC_ECOSYSTEM_REPO = "Forsy-AI/agent-apprenticeship"
30
32
  DEFAULT_PUBLIC_ECOSYSTEM_URL = f"https://github.com/{DEFAULT_PUBLIC_ECOSYSTEM_REPO}"
33
+ _SETTINGS_OVERRIDE: ContextVar[Any] = ContextVar("agent_apprenticeship_settings_override", default=None)
31
34
 
32
35
 
33
36
  def normalize_mentor_mode(value: str | None, default: str = "model_assisted") -> str:
@@ -183,6 +186,15 @@ class Settings(BaseModel):
183
186
  llm_rubric_generation_enabled: bool = True
184
187
 
185
188
 
189
@contextmanager
def settings_override(settings: Settings):
    """Install *settings* in ``_SETTINGS_OVERRIDE`` for the duration of a ``with`` block.

    The value that was in place before entry is restored on exit, whether
    the block finishes normally or raises.
    """
    restore_token = _SETTINGS_OVERRIDE.set(settings)
    try:
        yield
    finally:
        # Resetting with the token puts back whatever value was replaced.
        _SETTINGS_OVERRIDE.reset(restore_token)
196
+
197
+
186
198
def app_home_from_env() -> Path:
    """Return the app home directory: ``AA_HOME`` if set and non-empty, else ``DEFAULT_APP_HOME``.

    A leading ``~`` in the chosen value is expanded.
    """
    configured_home = os.getenv("AA_HOME")
    chosen = configured_home if configured_home else DEFAULT_APP_HOME
    return Path(chosen).expanduser()
188
200
 
@@ -223,6 +235,10 @@ def default_settings(app_home: Path | None = None) -> Settings:
223
235
 
224
236
 
225
237
  def get_settings(root: Path | None = None) -> Settings:
238
+ if root is None:
239
+ override = _SETTINGS_OVERRIDE.get()
240
+ if override is not None:
241
+ return override
226
242
  load_local_env(root)
227
243
  stored = _stored_settings()
228
244
  home = Path(os.getenv("AA_HOME") or stored.get("app_home") or DEFAULT_APP_HOME).expanduser()
@@ -34,6 +34,12 @@ def _drop_local_pydantic_if_needed() -> dict[str, Any]:
34
34
  `openai`.
35
35
  """
36
36
  src = _repo_src_path()
37
+ # In an installed package, parents[1] is usually site-packages. Removing
38
+ # that path prevents the OpenAI SDK itself from being imported. Only strip
39
+ # the path when it looks like the old repo-local pydantic shim directory:
40
+ # it has pydantic but not the OpenAI SDK alongside it.
41
+ if not (Path(src) / "pydantic" / "__init__.py").exists() or (Path(src) / "openai").exists():
42
+ return {'removed_paths': [], 'removed_modules': {}}
37
43
  removed_paths = []
38
44
  for p in list(sys.path):
39
45
  if Path(p or '.').resolve().as_posix() == Path(src).resolve().as_posix():
@@ -8,7 +8,7 @@ from datetime import datetime, timezone
8
8
  from pathlib import Path
9
9
 
10
10
  from .bundle_exporter import create_contribution_bundle
11
- from .config import Settings, apprentice_agent_display_name, apprentice_agent_readiness_status, get_settings
11
+ from .config import Settings, apprentice_agent_display_name, apprentice_agent_readiness_status, get_settings, settings_override
12
12
  from .io import read_json, write_json
13
13
  from .loop import run_task
14
14
  from .mentor_checkpoints import write_mentor_checkpoints
@@ -18,6 +18,13 @@ from .schemas import RawTaskRecord
18
18
  from .session_events import append_session_event, backfill_session_event_task_ids, next_followup_index
19
19
 
20
20
 
21
class RunInterrupted(Exception):
    """Signals that a run was interrupted; carries the run's root directory.

    Attributes are documented inline in ``__init__``.
    """

    def __init__(self, run_root: Path, message: str = "Run interrupted by user."):
        # run_root: directory of the interrupted run, kept for the handler.
        self.run_root = run_root
        # message: human-readable reason, also used as the exception text.
        self.message = message
        super().__init__(message)
26
+
27
+
21
28
def slugify(text: str, fallback: str = "task") -> str:
    """Turn *text* into a lowercase, hyphen-separated slug of at most 64 characters.

    Every run of characters outside ``a-z0-9`` becomes a single hyphen.
    Hyphens are trimmed from both ends before and after truncation, and
    *fallback* is returned when nothing is left.
    """
    hyphenated = re.sub(r"[^a-z0-9]+", "-", text.lower())
    truncated = hyphenated.strip("-")[:64]
    # Truncation can leave a hyphen at the cut point, so trim again.
    slug = truncated.strip("-")
    return slug if slug else fallback
@@ -162,17 +169,15 @@ def _pre_attempt_checkpoint_callback(
162
169
  progress_callback: ProgressCallback | None,
163
170
  followup_index: int | None = None,
164
171
  ):
165
- if not _human_checkpoint_mode(settings):
166
- return None
167
-
168
172
  def _callback(_pkg: Path) -> None:
169
- write_mentor_checkpoints(
170
- run_root,
171
- settings,
172
- auto_approve=_checkpoint_auto_approve(settings),
173
- stages=("task_intake", "rubric"),
174
- preserve_interactive=followup_index is None,
175
- )
173
+ if _human_checkpoint_mode(settings):
174
+ write_mentor_checkpoints(
175
+ run_root,
176
+ settings,
177
+ auto_approve=_checkpoint_auto_approve(settings),
178
+ stages=("task_intake", "rubric"),
179
+ preserve_interactive=followup_index is None,
180
+ )
176
181
  append_progress_event(
177
182
  run_root,
178
183
  "apprentice_attempt_started",
@@ -188,6 +193,26 @@ def _pre_attempt_checkpoint_callback(
188
193
  return _callback
189
194
 
190
195
 
196
def _append_mentor_preparation_started(
    run_root: Path,
    settings: Settings,
    *,
    progress_callback: ProgressCallback | None,
    followup_index: int | None = None,
) -> None:
    """Append a ``mentor_preparation_started`` progress event for the run at *run_root*.

    When *followup_index* is truthy, the message names the follow-up and the
    index is attached as event metadata; otherwise the plain message is used
    and no metadata is sent.
    """
    if followup_index:
        event_message = f"Follow-up {followup_index} Mentor preparation started"
        event_metadata = {"followup_index": followup_index}
    else:
        event_message = "Mentor preparation started"
        event_metadata = None

    append_progress_event(
        run_root,
        "mentor_preparation_started",
        run_id=run_root.name,
        message=event_message,
        current_loop=1,
        maximum_improvement_loops=settings.max_improvement_loops,
        phase="mentor_preparation",
        metadata_json=event_metadata,
        callback=progress_callback,
    )
214
+
215
+
191
216
  def _revision_decision_callback(
192
217
  run_root: Path,
193
218
  settings: Settings,
@@ -300,6 +325,36 @@ def runner_for_settings(settings: Settings, override: str | None = None) -> str:
300
325
  raise RuntimeError(f"Unsupported Apprentice Agent: {settings.worker_agent}")
301
326
 
302
327
 
328
def _loop_settings_for_run(settings: Settings) -> Settings:
    """Return the settings to run the loop with, given the mentor mode.

    For ``expert_led`` mode, returns a copy with ``rubric_mode`` set to
    ``deterministic`` and the LLM intake, rubric-generation, evaluator,
    grader and verifier flags set to ``False``. Any other mode gets
    *settings* back unchanged.
    """
    if settings.mentor_mode == "expert_led":
        overrides = {"rubric_mode": "deterministic"}
        for llm_flag in (
            "llm_task_intake_enabled",
            "llm_rubric_generation_enabled",
            "llm_evaluator_enabled",
            "llm_grader_enabled",
            "llm_verifier_enabled",
        ):
            overrides[llm_flag] = False
        return settings.model_copy(update=overrides)
    return settings
341
+
342
+
343
def _settings_for_session(settings: Settings, session: dict) -> Settings:
    """Overlay the values stored in *session* onto *settings*.

    Truthy ``mentor_mode``, ``sensitive_info_masking`` and ``model_provider``
    entries are copied across as-is. A truthy ``max_improvement_loops`` is
    converted with ``int`` and applied to both ``max_improvement_loops`` and
    ``max_iterations``. Returns *settings* itself when the session supplies
    nothing to change, otherwise a copy with the updates.
    """
    overrides = {
        field: session[field]
        for field in ("mentor_mode", "sensitive_info_masking", "model_provider")
        if session.get(field)
    }

    stored_loops = session.get("max_improvement_loops")
    if stored_loops:
        loop_count = int(stored_loops)
        overrides["max_improvement_loops"] = loop_count
        overrides["max_iterations"] = loop_count

    if not overrides:
        return settings
    return settings.model_copy(update=overrides)
356
+
357
+
303
358
  def _session_status_for_package(pkg: Path) -> tuple[str, str | None]:
304
359
  actual_paths = sorted((pkg / "attempts").glob("*/actual_outputs.json"))
305
360
  if not actual_paths:
@@ -496,34 +551,44 @@ def run_prompt_task(
496
551
  **_experience_session_fields(experience_pack_refs),
497
552
  },
498
553
  )
499
- if not _human_checkpoint_mode(settings):
554
+ _append_mentor_preparation_started(run_root, settings, progress_callback=progress_callback)
555
+ revision_decider, checkpoint_state = _revision_decision_callback(
556
+ run_root,
557
+ settings,
558
+ progress_callback=progress_callback,
559
+ )
560
+ try:
561
+ with settings_override(_loop_settings_for_run(settings)):
562
+ pkg = run_task(
563
+ raw,
564
+ run_root,
565
+ runner=runner_for_settings(settings, runner),
566
+ max_iterations=settings.max_improvement_loops,
567
+ pre_attempt_callback=_pre_attempt_checkpoint_callback(
568
+ run_root,
569
+ settings,
570
+ progress_callback=progress_callback,
571
+ ),
572
+ revision_decision_callback=revision_decider,
573
+ )
574
+ except KeyboardInterrupt as exc:
500
575
  append_progress_event(
501
576
  run_root,
502
- "apprentice_attempt_started",
577
+ "run_interrupted",
503
578
  run_id=run_id,
504
- message="Apprentice attempt started",
579
+ message="Run interrupted by user.",
505
580
  current_loop=1,
506
581
  maximum_improvement_loops=settings.max_improvement_loops,
507
- phase="apprentice_attempt",
582
+ phase="interrupted",
583
+ run_status="partial",
584
+ task_status="partial",
585
+ operational_error="Run interrupted by user.",
508
586
  callback=progress_callback,
509
587
  )
510
- revision_decider, checkpoint_state = _revision_decision_callback(
511
- run_root,
512
- settings,
513
- progress_callback=progress_callback,
514
- )
515
- pkg = run_task(
516
- raw,
517
- run_root,
518
- runner=runner_for_settings(settings, runner),
519
- max_iterations=settings.max_improvement_loops,
520
- pre_attempt_callback=_pre_attempt_checkpoint_callback(
521
- run_root,
522
- settings,
523
- progress_callback=progress_callback,
524
- ),
525
- revision_decision_callback=revision_decider,
526
- )
588
+ session = read_json(run_root / "session.json") if (run_root / "session.json").exists() else {}
589
+ session.update({"run_status": "partial", "task_status": "partial", "status_reason": "Run interrupted by user."})
590
+ write_json(run_root / "session.json", {k: v for k, v in session.items() if v is not None})
591
+ raise RunInterrupted(run_root) from exc
527
592
  manifest = read_json(pkg / "package_manifest.json") if (pkg / "package_manifest.json").exists() else {}
528
593
  run_status, partial_reason = _session_status_for_package(pkg)
529
594
  actual_iterations = int(manifest.get("actual_iterations") or 1)
@@ -771,6 +836,8 @@ def continue_session(
771
836
  run_root = run_root_for(run_id, settings)
772
837
  if not run_root.exists():
773
838
  raise FileNotFoundError(f"Run not found: {run_id}")
839
+ session = read_json(run_root / "session.json") if (run_root / "session.json").exists() else {}
840
+ settings = _settings_for_session(settings, session)
774
841
  followup_index = next_followup_index(run_root)
775
842
  task_id = _session_task_id(run_root)
776
843
  append_progress_event(
@@ -836,7 +903,6 @@ def continue_session(
836
903
  )
837
904
  bundle: Path | None = None
838
905
  if run_loop:
839
- session = read_json(run_root / "session.json") if (run_root / "session.json").exists() else {}
840
906
  original = session.get("task_instruction") or ""
841
907
  combined = (
842
908
  "Continue the same Agent Apprenticeship session.\n\n"
@@ -848,37 +914,32 @@ def continue_session(
848
914
  combined,
849
915
  _asset_abs_refs(run_root, asset_refs),
850
916
  )
851
- if not _human_checkpoint_mode(settings):
852
- append_progress_event(
853
- run_root,
854
- "apprentice_attempt_started",
855
- run_id=run_root.name,
856
- message=f"Follow-up {followup_index} Apprentice attempt started",
857
- current_loop=1,
858
- maximum_improvement_loops=settings.max_improvement_loops,
859
- phase="apprentice_attempt",
860
- metadata_json={"followup_index": followup_index},
861
- callback=progress_callback,
862
- )
863
- revision_decider, checkpoint_state = _revision_decision_callback(
917
+ _append_mentor_preparation_started(
864
918
  run_root,
865
919
  settings,
866
920
  progress_callback=progress_callback,
867
921
  followup_index=followup_index,
868
922
  )
869
- pkg = run_task(
870
- raw,
923
+ revision_decider, checkpoint_state = _revision_decision_callback(
871
924
  run_root,
872
- runner=runner_for_settings(settings, runner),
873
- max_iterations=settings.max_improvement_loops,
874
- pre_attempt_callback=_pre_attempt_checkpoint_callback(
875
- run_root,
876
- settings,
877
- progress_callback=progress_callback,
878
- followup_index=followup_index,
879
- ),
880
- revision_decision_callback=revision_decider,
925
+ settings,
926
+ progress_callback=progress_callback,
927
+ followup_index=followup_index,
881
928
  )
929
+ with settings_override(_loop_settings_for_run(settings)):
930
+ pkg = run_task(
931
+ raw,
932
+ run_root,
933
+ runner=runner_for_settings(settings, runner),
934
+ max_iterations=settings.max_improvement_loops,
935
+ pre_attempt_callback=_pre_attempt_checkpoint_callback(
936
+ run_root,
937
+ settings,
938
+ progress_callback=progress_callback,
939
+ followup_index=followup_index,
940
+ ),
941
+ revision_decision_callback=revision_decider,
942
+ )
882
943
  manifest = read_json(pkg / "package_manifest.json") if (pkg / "package_manifest.json").exists() else {}
883
944
  status, _reason = _session_status_for_package(pkg)
884
945
  actual_iterations = int(manifest.get("actual_iterations") or 1)
@@ -4,8 +4,8 @@ from pathlib import Path
4
4
  from pydantic import BaseModel
5
5
  from .schemas import RawTaskRecord, TaskIntakeSpec, TaskIntakeQualityReport
6
6
  from .config import get_settings
7
- from .io import read_json
8
- from .openai_structured import get_model_provider_status, run_structured_role
7
+ from .io import read_json, write_json
8
+ from .openai_structured import extract_json_object, get_model_provider_status, run_structured_role
9
9
  from .public_sanitizer import sanitize_public_obj, sha256_text
10
10
 
11
11
  class LLMTaskIntakeOutput(BaseModel):
@@ -32,6 +32,46 @@ def _task_record_for_intake(raw: RawTaskRecord) -> dict:
32
32
  data=raw.model_dump(mode='json')
33
33
  return sanitize_public_obj(_drop_source_fields(data))
34
34
 
35
def _sanitize_intake_output_obj(obj):
    """Pass *obj* through ``_drop_source_fields`` and then ``sanitize_public_obj``."""
    without_source_fields = _drop_source_fields(obj)
    return sanitize_public_obj(without_source_fields)
37
+
38
def _sanitize_intake_role_artifacts(role_dir: Path) -> None:
    """Rewrite the intake role's output files in *role_dir* in sanitized form.

    JSON artifacts are read, sanitized and written back; a failure on one of
    them is ignored. Raw text outputs are replaced by the sanitized JSON
    object extracted from them. If that fails, the file is rewritten with
    every ``SOURCE_FIELD_KEYS`` entry replaced by ``reference_id``. Files
    that do not exist are skipped.
    """
    for json_name in ("parsed_output.json", "raw_parsed_output.json"):
        json_path = role_dir / json_name
        if not json_path.exists():
            continue
        try:
            sanitized_obj = _sanitize_intake_output_obj(read_json(json_path))
            write_json(json_path, sanitized_obj)
        except Exception:
            # Best effort: a file that cannot be sanitized is left as it is.
            pass

    for text_name in ("raw_output.txt", "raw_output.retry.txt"):
        text_path = role_dir / text_name
        if not text_path.exists():
            continue
        try:
            raw_text = text_path.read_text(errors="ignore")
            sanitized_obj = _sanitize_intake_output_obj(extract_json_object(raw_text))
            rendered = json.dumps(sanitized_obj, indent=2, sort_keys=True) + "\n"
            text_path.write_text(rendered)
        except Exception:
            # Fallback when no JSON object could be extracted or sanitized:
            # plain textual substitution of the source field keys.
            fallback_text = text_path.read_text(errors="ignore")
            for source_key in SOURCE_FIELD_KEYS:
                fallback_text = fallback_text.replace(source_key, "reference_id")
            text_path.write_text(fallback_text)
58
+
59
def _sanitize_spec_and_quality(
    spec: TaskIntakeSpec,
    quality: TaskIntakeQualityReport | None = None,
) -> tuple[TaskIntakeSpec, TaskIntakeQualityReport | None]:
    """Return sanitized copies of *spec* and, when given, *quality*.

    The spec copy gets its ``metadata_json`` sanitized and its
    ``expected_pay`` and ``expected_apprentice_pay`` cleared to ``None``.
    The quality report copy gets its ``metadata_json`` sanitized. A missing
    quality report stays ``None``.
    """
    clean_spec_metadata = _sanitize_intake_output_obj(spec.metadata_json or {})

    if quality is None:
        clean_quality = None
    else:
        clean_quality_metadata = _sanitize_intake_output_obj(quality.metadata_json or {})
        clean_quality = quality.model_copy(update={"metadata_json": clean_quality_metadata})

    clean_spec = spec.model_copy(
        update={
            "metadata_json": clean_spec_metadata,
            "expected_pay": None,
            "expected_apprentice_pay": None,
        }
    )
    return clean_spec, clean_quality
74
+
35
75
  def direct_task_sheet_metadata(raw: RawTaskRecord) -> dict[str, object]:
36
76
  payload=raw.raw_payload or {}
37
77
  expected_economic_value = raw.expected_economic_value or payload.get('expected_economic_value') or raw.expected_pay or payload.get('expected_pay')
@@ -94,12 +134,14 @@ def task_intake(raw: RawTaskRecord, role_root: Path | None=None) -> tuple[TaskIn
94
134
  model_override=settings.llm_task_intake_model if provider == 'openai' else None
95
135
  rr=run_structured_role('intake_agent', prompt, LLMTaskIntakeOutput, role_root/'intake_agent', allow_fallback=settings.allow_deterministic_eval_fallback, model_override=model_override, normalizer_context={'task_id': raw.raw_task_id.replace('raw_','task_'), 'task_title': raw.raw_title, 'task_instruction': raw.raw_description, 'model': model_override or settings.model_provider_model, 'provider':provider})
96
136
  if rr.live_call_ok and rr.structured_output_validation_ok:
137
+ _sanitize_intake_role_artifacts(role_root/'intake_agent')
97
138
  parsed=read_json(role_root/'intake_agent/parsed_output.json')
98
139
  spec=TaskIntakeSpec.model_validate(parsed['task_intake_spec'])
99
140
  spec=apply_direct_task_sheet_metadata(spec, raw)
100
141
  q=TaskIntakeQualityReport.model_validate(parsed['task_intake_quality_report'])
101
142
  spec.metadata_json.update({'intake_source':'llm','provider':rr.provider,'model':rr.model,'llm_prompt_ref_internal':str(role_root/'intake_agent/prompt.md'),'llm_response_ref_internal':str(role_root/'intake_agent/raw_output.txt'),'prompt_template_id':'task_intake_agent_v0','prompt_template_version':'0.1','prompt_hash':sha256_text(prompt),'public_response_summary':'Model-assisted task intake generated structured task spec.'})
102
143
  q.metadata_json.update({'intake_source':'llm','role_result_ref_internal':str(role_root/'intake_agent/role_result.json')})
144
+ spec, q = _sanitize_spec_and_quality(spec, q)
103
145
  return spec,q
104
146
  if settings.rubric_mode == 'llm_required' or settings.llm_fail_closed:
105
147
  raise RuntimeError(rr.error_message or 'Model task intake failed')
@@ -109,4 +151,5 @@ def task_intake(raw: RawTaskRecord, role_root: Path | None=None) -> tuple[TaskIn
109
151
  spec,q=deterministic_intake(raw)
110
152
  if settings.llm_task_intake_enabled and _mentor_provider_can_attempt():
111
153
  spec.metadata_json.update({'intake_source':'deterministic_fallback','llm_unavailable':True,'provider':_mentor_provider_id()})
154
+ spec, q = _sanitize_spec_and_quality(spec, q)
112
155
  return spec,q