nighthawk-python 0.4.0__tar.gz → 0.6.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nighthawk_python-0.6.0/.claude/rules/docs.md +232 -0
- nighthawk_python-0.6.0/.claude/rules/promptfoo.md +137 -0
- nighthawk_python-0.6.0/.claude/rules/tests.md +27 -0
- {nighthawk_python-0.4.0 → nighthawk_python-0.6.0}/.devcontainer/litellm-config.yaml +1 -1
- {nighthawk_python-0.4.0 → nighthawk_python-0.6.0}/.github/workflows/docs.yml +1 -1
- {nighthawk_python-0.4.0 → nighthawk_python-0.6.0}/.gitignore +6 -0
- {nighthawk_python-0.4.0 → nighthawk_python-0.6.0}/AGENTS.md +8 -5
- nighthawk_python-0.6.0/CHANGELOG.md +101 -0
- {nighthawk_python-0.4.0 → nighthawk_python-0.6.0}/CONTRIBUTING.md +43 -1
- nighthawk_python-0.6.0/PKG-INFO +115 -0
- nighthawk_python-0.6.0/README.md +80 -0
- nighthawk_python-0.6.0/docs/AGENTS.md +1 -0
- {nighthawk_python-0.4.0 → nighthawk_python-0.6.0}/docs/api.md +31 -4
- {nighthawk_python-0.4.0 → nighthawk_python-0.6.0}/docs/coding-agent-backends.md +144 -75
- nighthawk_python-0.6.0/docs/executors.md +72 -0
- nighthawk_python-0.6.0/docs/for-coding-agents.md +389 -0
- nighthawk_python-0.6.0/docs/index.md +69 -0
- nighthawk_python-0.6.0/docs/natural-blocks.md +472 -0
- nighthawk_python-0.6.0/docs/patterns.md +480 -0
- nighthawk_python-0.6.0/docs/philosophy.md +172 -0
- nighthawk_python-0.6.0/docs/pydantic-ai-providers.md +103 -0
- nighthawk_python-0.6.0/docs/quickstart.md +73 -0
- nighthawk_python-0.6.0/docs/roadmap.md +130 -0
- nighthawk_python-0.6.0/docs/runtime-configuration.md +100 -0
- nighthawk_python-0.4.0/docs/design.md → nighthawk_python-0.6.0/docs/specification.md +119 -51
- nighthawk_python-0.6.0/docs/verification.md +255 -0
- nighthawk_python-0.6.0/evals/promptfoo/assertions/binding_value.py +56 -0
- nighthawk_python-0.6.0/evals/promptfoo/assertions/outcome_kind.py +49 -0
- nighthawk_python-0.6.0/evals/promptfoo/assertions/raise_message.py +62 -0
- nighthawk_python-0.6.0/evals/promptfoo/evidence/2026-03-26-baseline-prompt-ab.md +32 -0
- nighthawk_python-0.6.0/evals/promptfoo/evidence/2026-03-26-baseline-regression.md +28 -0
- nighthawk_python-0.6.0/evals/promptfoo/evidence/2026-03-26-baseline-suffix-ab.md +37 -0
- nighthawk_python-0.6.0/evals/promptfoo/promptfooconfig-agents.yaml +44 -0
- nighthawk_python-0.6.0/evals/promptfoo/promptfooconfig-prompt-ab.yaml +63 -0
- nighthawk_python-0.6.0/evals/promptfoo/promptfooconfig-suffix-ab.yaml +63 -0
- nighthawk_python-0.6.0/evals/promptfoo/promptfooconfig.yaml +58 -0
- nighthawk_python-0.6.0/evals/promptfoo/prompts/eval_coding_agent.txt +8 -0
- nighthawk_python-0.6.0/evals/promptfoo/prompts/eval_default.txt +25 -0
- nighthawk_python-0.6.0/evals/promptfoo/prompts/eval_mutation_aware.txt +24 -0
- nighthawk_python-0.6.0/evals/promptfoo/prompts/eval_sequenced.txt +24 -0
- nighthawk_python-0.6.0/evals/promptfoo/provider.py +474 -0
- nighthawk_python-0.6.0/evals/promptfoo/research-result.md +569 -0
- nighthawk_python-0.6.0/evals/promptfoo/test_cases/binding_operations.yaml +141 -0
- nighthawk_python-0.6.0/evals/promptfoo/test_cases/edge_cases.yaml +117 -0
- nighthawk_python-0.6.0/evals/promptfoo/test_cases/loop_outcomes.yaml +77 -0
- nighthawk_python-0.6.0/evals/promptfoo/test_cases/multi_step.yaml +66 -0
- nighthawk_python-0.6.0/evals/promptfoo/test_cases/null_handling.yaml +35 -0
- nighthawk_python-0.6.0/evals/promptfoo/test_cases/outcome_kinds.yaml +151 -0
- nighthawk_python-0.6.0/evals/promptfoo/test_cases/tool_selection.yaml +287 -0
- {nighthawk_python-0.4.0 → nighthawk_python-0.6.0}/mkdocs.yml +20 -11
- {nighthawk_python-0.4.0 → nighthawk_python-0.6.0}/pyproject.toml +2 -1
- {nighthawk_python-0.4.0 → nighthawk_python-0.6.0}/src/nighthawk/backends/base.py +35 -1
- {nighthawk_python-0.4.0 → nighthawk_python-0.6.0}/src/nighthawk/backends/claude_code_cli.py +15 -54
- {nighthawk_python-0.4.0 → nighthawk_python-0.6.0}/src/nighthawk/backends/claude_code_sdk.py +12 -48
- nighthawk_python-0.6.0/src/nighthawk/backends/claude_code_settings.py +33 -0
- {nighthawk_python-0.4.0 → nighthawk_python-0.6.0}/src/nighthawk/backends/codex.py +13 -37
- {nighthawk_python-0.4.0 → nighthawk_python-0.6.0}/src/nighthawk/configuration.py +10 -6
- {nighthawk_python-0.4.0 → nighthawk_python-0.6.0}/src/nighthawk/natural/decorator.py +1 -2
- nighthawk_python-0.6.0/src/nighthawk/resilience/__init__.py +34 -0
- nighthawk_python-0.6.0/src/nighthawk/resilience/_circuit_breaker.py +204 -0
- nighthawk_python-0.6.0/src/nighthawk/resilience/_fallback.py +185 -0
- nighthawk_python-0.6.0/src/nighthawk/resilience/_retry.py +196 -0
- nighthawk_python-0.6.0/src/nighthawk/resilience/_timeout.py +111 -0
- nighthawk_python-0.6.0/src/nighthawk/resilience/_vote.py +139 -0
- {nighthawk_python-0.4.0 → nighthawk_python-0.6.0}/src/nighthawk/runtime/prompt.py +2 -2
- {nighthawk_python-0.4.0 → nighthawk_python-0.6.0}/src/nighthawk/runtime/runner.py +30 -23
- {nighthawk_python-0.4.0 → nighthawk_python-0.6.0}/src/nighthawk/runtime/scoping.py +2 -2
- {nighthawk_python-0.4.0 → nighthawk_python-0.6.0}/src/nighthawk/runtime/step_context.py +1 -1
- {nighthawk_python-0.4.0 → nighthawk_python-0.6.0}/src/nighthawk/runtime/step_contract.py +16 -31
- {nighthawk_python-0.4.0 → nighthawk_python-0.6.0}/src/nighthawk/testing.py +9 -4
- {nighthawk_python-0.4.0 → nighthawk_python-0.6.0}/src/nighthawk/tools/assignment.py +18 -2
- {nighthawk_python-0.4.0 → nighthawk_python-0.6.0}/src/nighthawk/tools/provided.py +2 -19
- {nighthawk_python-0.4.0 → nighthawk_python-0.6.0}/src/nighthawk/tools/registry.py +5 -0
- {nighthawk_python-0.4.0 → nighthawk_python-0.6.0}/tests/backends/test_claude_code_cli.py +7 -7
- {nighthawk_python-0.4.0 → nighthawk_python-0.6.0}/tests/backends/test_codex.py +19 -19
- {nighthawk_python-0.4.0 → nighthawk_python-0.6.0}/tests/conftest.py +1 -14
- nighthawk_python-0.6.0/tests/docs/test_coding_agent_examples.py +137 -0
- nighthawk_python-0.6.0/tests/docs/test_docs_architecture.py +218 -0
- {nighthawk_python-0.4.0 → nighthawk_python-0.6.0}/tests/docs/test_prompt_examples.py +24 -7
- {nighthawk_python-0.4.0 → nighthawk_python-0.6.0}/tests/execution/test_execution_outcome_prompt_fragment.py +12 -13
- nighthawk_python-0.6.0/tests/execution/test_infer_binding_types.py +91 -0
- {nighthawk_python-0.4.0 → nighthawk_python-0.6.0}/tests/execution/test_runtime.py +35 -13
- {nighthawk_python-0.4.0 → nighthawk_python-0.6.0}/tests/execution/test_variables_prompt.py +59 -8
- {nighthawk_python-0.4.0 → nighthawk_python-0.6.0}/tests/integration/skip_helpers.py +8 -11
- {nighthawk_python-0.4.0 → nighthawk_python-0.6.0}/tests/integration/test_carry_pattern.py +4 -4
- {nighthawk_python-0.4.0 → nighthawk_python-0.6.0}/tests/integration/test_claude_code_cli_integration.py +1 -1
- {nighthawk_python-0.4.0 → nighthawk_python-0.6.0}/tests/integration/test_codex_integration.py +2 -2
- {nighthawk_python-0.4.0 → nighthawk_python-0.6.0}/tests/integration/test_llm_integration.py +8 -71
- {nighthawk_python-0.4.0 → nighthawk_python-0.6.0}/tests/public/test_public_api.py +8 -8
- nighthawk_python-0.6.0/tests/resilience/__init__.py +0 -0
- nighthawk_python-0.6.0/tests/resilience/test_circuit_breaker.py +213 -0
- nighthawk_python-0.6.0/tests/resilience/test_composition.py +210 -0
- nighthawk_python-0.6.0/tests/resilience/test_fallback.py +228 -0
- nighthawk_python-0.6.0/tests/resilience/test_retry.py +203 -0
- nighthawk_python-0.6.0/tests/resilience/test_timeout.py +120 -0
- nighthawk_python-0.6.0/tests/resilience/test_vote.py +173 -0
- {nighthawk_python-0.4.0 → nighthawk_python-0.6.0}/tests/test_testing.py +1 -1
- nighthawk_python-0.6.0/tests/tools/__init__.py +0 -0
- nighthawk_python-0.6.0/tests/tools/test_assignment_async.py +265 -0
- {nighthawk_python-0.4.0 → nighthawk_python-0.6.0}/tests/tools/test_registry.py +0 -1
- {nighthawk_python-0.4.0 → nighthawk_python-0.6.0}/uv.lock +191 -179
- nighthawk_python-0.4.0/.claude/rules/docs.md +0 -108
- nighthawk_python-0.4.0/CHANGELOG.md +0 -65
- nighthawk_python-0.4.0/PKG-INFO +0 -96
- nighthawk_python-0.4.0/README.md +0 -62
- nighthawk_python-0.4.0/docs/for-coding-agents.md +0 -365
- nighthawk_python-0.4.0/docs/index.md +0 -134
- nighthawk_python-0.4.0/docs/providers.md +0 -118
- nighthawk_python-0.4.0/docs/quickstart.md +0 -110
- nighthawk_python-0.4.0/docs/roadmap.md +0 -73
- nighthawk_python-0.4.0/docs/tutorial.md +0 -941
- nighthawk_python-0.4.0/tests/docs/test_coding_agent_examples.py +0 -263
- nighthawk_python-0.4.0/tests/tools/test_assignment_async.py +0 -101
- {nighthawk_python-0.4.0 → nighthawk_python-0.6.0}/.claude/rules/coding.md +0 -0
- {nighthawk_python-0.4.0 → nighthawk_python-0.6.0}/.claude/settings.json +0 -0
- {nighthawk_python-0.4.0 → nighthawk_python-0.6.0}/.claude/unset_envs.sh +0 -0
- {nighthawk_python-0.4.0 → nighthawk_python-0.6.0}/.devcontainer/Dockerfile.devcontainer +0 -0
- {nighthawk_python-0.4.0 → nighthawk_python-0.6.0}/.devcontainer/Dockerfile.litellm +0 -0
- {nighthawk_python-0.4.0 → nighthawk_python-0.6.0}/.devcontainer/devcontainer.json +0 -0
- {nighthawk_python-0.4.0 → nighthawk_python-0.6.0}/.devcontainer/docker-compose.yaml +0 -0
- {nighthawk_python-0.4.0 → nighthawk_python-0.6.0}/.github/dependabot.yml +0 -0
- {nighthawk_python-0.4.0 → nighthawk_python-0.6.0}/.github/workflows/ci.yml +0 -0
- {nighthawk_python-0.4.0 → nighthawk_python-0.6.0}/.github/workflows/publish.yml +0 -0
- {nighthawk_python-0.4.0 → nighthawk_python-0.6.0}/.python-version +0 -0
- {nighthawk_python-0.4.0 → nighthawk_python-0.6.0}/CLAUDE.md +0 -0
- {nighthawk_python-0.4.0 → nighthawk_python-0.6.0}/LICENSE +0 -0
- {nighthawk_python-0.4.0 → nighthawk_python-0.6.0}/docs/assets/nighthawk_logo-128x128.png +0 -0
- {nighthawk_python-0.4.0/src/nighthawk/backends → nighthawk_python-0.6.0/evals/promptfoo/assertions}/__init__.py +0 -0
- {nighthawk_python-0.4.0 → nighthawk_python-0.6.0}/pyrightconfig.json +0 -0
- {nighthawk_python-0.4.0 → nighthawk_python-0.6.0}/src/nighthawk/__init__.py +0 -0
- {nighthawk_python-0.4.0/src/nighthawk/natural → nighthawk_python-0.6.0/src/nighthawk/backends}/__init__.py +0 -0
- {nighthawk_python-0.4.0 → nighthawk_python-0.6.0}/src/nighthawk/backends/mcp_boundary.py +0 -0
- {nighthawk_python-0.4.0 → nighthawk_python-0.6.0}/src/nighthawk/backends/mcp_server.py +0 -0
- {nighthawk_python-0.4.0 → nighthawk_python-0.6.0}/src/nighthawk/backends/tool_bridge.py +0 -0
- {nighthawk_python-0.4.0 → nighthawk_python-0.6.0}/src/nighthawk/errors.py +0 -0
- {nighthawk_python-0.4.0 → nighthawk_python-0.6.0}/src/nighthawk/identifier_path.py +0 -0
- {nighthawk_python-0.4.0 → nighthawk_python-0.6.0}/src/nighthawk/json_renderer.py +0 -0
- {nighthawk_python-0.4.0/src/nighthawk/runtime → nighthawk_python-0.6.0/src/nighthawk/natural}/__init__.py +0 -0
- {nighthawk_python-0.4.0 → nighthawk_python-0.6.0}/src/nighthawk/natural/blocks.py +0 -0
- {nighthawk_python-0.4.0 → nighthawk_python-0.6.0}/src/nighthawk/natural/transform.py +0 -0
- {nighthawk_python-0.4.0/src/nighthawk/tools → nighthawk_python-0.6.0/src/nighthawk/runtime}/__init__.py +0 -0
- {nighthawk_python-0.4.0 → nighthawk_python-0.6.0}/src/nighthawk/runtime/async_bridge.py +0 -0
- {nighthawk_python-0.4.0 → nighthawk_python-0.6.0}/src/nighthawk/runtime/step_executor.py +0 -0
- {nighthawk_python-0.4.0 → nighthawk_python-0.6.0}/src/nighthawk/runtime/tool_calls.py +0 -0
- {nighthawk_python-0.4.0/tests → nighthawk_python-0.6.0/src/nighthawk/tools}/__init__.py +0 -0
- {nighthawk_python-0.4.0 → nighthawk_python-0.6.0}/src/nighthawk/tools/contracts.py +0 -0
- {nighthawk_python-0.4.0 → nighthawk_python-0.6.0}/src/nighthawk/tools/execution.py +0 -0
- {nighthawk_python-0.4.0 → nighthawk_python-0.6.0}/src/nighthawk/ulid.py +0 -0
- {nighthawk_python-0.4.0/tests/backends → nighthawk_python-0.6.0/tests}/__init__.py +0 -0
- {nighthawk_python-0.4.0/tests/docs → nighthawk_python-0.6.0/tests/backends}/__init__.py +0 -0
- {nighthawk_python-0.4.0 → nighthawk_python-0.6.0}/tests/backends/test_claude_code_sdk.py +0 -0
- {nighthawk_python-0.4.0/tests/execution → nighthawk_python-0.6.0/tests/docs}/__init__.py +0 -0
- {nighthawk_python-0.4.0/tests/integration → nighthawk_python-0.6.0/tests/execution}/__init__.py +0 -0
- {nighthawk_python-0.4.0 → nighthawk_python-0.6.0}/tests/execution/prompt_test_helpers.py +0 -0
- {nighthawk_python-0.4.0 → nighthawk_python-0.6.0}/tests/execution/stub_executor.py +0 -0
- {nighthawk_python-0.4.0 → nighthawk_python-0.6.0}/tests/execution/test_globals_prompt.py +0 -0
- {nighthawk_python-0.4.0 → nighthawk_python-0.6.0}/tests/execution/test_natural_block_ordering.py +0 -0
- {nighthawk_python-0.4.0 → nighthawk_python-0.6.0}/tests/execution/test_natural_traceback.py +0 -0
- {nighthawk_python-0.4.0/tests/natural → nighthawk_python-0.6.0/tests/integration}/__init__.py +0 -0
- {nighthawk_python-0.4.0 → nighthawk_python-0.6.0}/tests/integration/test_claude_code_sdk_integration.py +0 -0
- {nighthawk_python-0.4.0/tests/public → nighthawk_python-0.6.0/tests/natural}/__init__.py +0 -0
- {nighthawk_python-0.4.0 → nighthawk_python-0.6.0}/tests/natural/test_blocks.py +0 -0
- {nighthawk_python-0.4.0/tests/tools → nighthawk_python-0.6.0/tests/public}/__init__.py +0 -0
- {nighthawk_python-0.4.0 → nighthawk_python-0.6.0}/tests/public/test_readme_example.py +0 -0
- {nighthawk_python-0.4.0 → nighthawk_python-0.6.0}/tests/test_renderer.py +0 -0
- {nighthawk_python-0.4.0 → nighthawk_python-0.6.0}/tests/tools/test_contracts.py +0 -0
- {nighthawk_python-0.4.0 → nighthawk_python-0.6.0}/tests/tools/test_tool_boundary.py +0 -0
|
@@ -0,0 +1,232 @@
|
|
|
1
|
+
---
|
|
2
|
+
paths:
|
|
3
|
+
- "docs/**/*.md"
|
|
4
|
+
---
|
|
5
|
+
|
|
6
|
+
# Documentation rules
|
|
7
|
+
|
|
8
|
+
## Canonical ownership
|
|
9
|
+
|
|
10
|
+
Each topic must have exactly one canonical owner file. Other files may restate the topic only as a deliberate derivative summary, quickstart, or routing pointer.
|
|
11
|
+
|
|
12
|
+
General rules:
|
|
13
|
+
|
|
14
|
+
- Prefer cross-references over duplication.
|
|
15
|
+
- If a topic appears in multiple files, one file must be the declared source of truth.
|
|
16
|
+
- Derivative documents may compress or subset canonical content when that serves a distinct audience.
|
|
17
|
+
- `for-coding-agents.md` is the main exception: it is a derivative operational guide for coding agents and may restate material from human-oriented docs.
|
|
18
|
+
|
|
19
|
+
Public API documentation layering:
|
|
20
|
+
|
|
21
|
+
- `api.md` owns the exhaustive inventory of the supported public API surface (existence, types, signatures, exceptions, docstrings). Low-level utilities and extension hooks belong here.
|
|
22
|
+
- `specification.md` owns API semantics, contracts, boundaries, and runtime behavior. It is the canonical source for *what a symbol means and how it behaves*.
|
|
23
|
+
- `quickstart.md`, `natural-blocks.md`, `patterns.md`, `runtime-configuration.md`, and `verification.md` are task-oriented. They cover *when to use* and *how to use* selected APIs -- not every public symbol. A public API having no coverage in these pages is expected when the symbol serves a narrow or advanced use case already documented by `api.md` and `specification.md`.
|
|
24
|
+
|
|
25
|
+
`docs/AGENTS.md` is governance-only content. It is a symlink to `.claude/rules/docs.md` and must not be published as a site page by accident. It is excluded from the published site via `exclude_docs` in `mkdocs.yml`.
|
|
26
|
+
|
|
27
|
+
## File roles and boundaries
|
|
28
|
+
|
|
29
|
+
| File | Audience | Role | Scope |
|
|
30
|
+
|---|---|---|---|
|
|
31
|
+
| `index.md` | First-time visitors | Landing page | Value proposition, one runnable example, entry path routing. No API, no how-to. |
|
|
32
|
+
| `quickstart.md` | New users | First success | Minimal setup, minimal example, minimal troubleshooting, explicit next-step link. |
|
|
33
|
+
| `natural-blocks.md` | Learners | What Natural blocks are | Natural block anatomy, prompt structure, read/write bindings, Pydantic model bindings, f-string injection, functions and discoverability, binding function design (principles and basic examples), responsibility split, structured output design. |
|
|
34
|
+
| `executors.md` | Learners / evaluators | Choose an execution backend | Capability / cost / latency matrix, decision tree, `StepExecutorConfiguration` basics, and routing to side references and `runtime-configuration.md`. |
|
|
35
|
+
| `runtime-configuration.md` | Learners | Configure execution | `nh.run()`, `nh.scope()`, configuration patching, prompt suffix fragments, context limits, JSON rendering style, and runtime execution identity. |
|
|
36
|
+
| `patterns.md` | Practitioners | Apply Natural blocks in workflows | Outcomes, deny frontmatter, error handling, custom exception types, async, carry pattern, cross-block composition, resilience patterns, and common mistakes. |
|
|
37
|
+
| `verification.md` | Practitioners | Verify and debug | Mock tests, integration tests, prompt inspection, diagnosing snipped markers, OpenTelemetry span hierarchy, step events, local trace inspection. |
|
|
38
|
+
| `pydantic-ai-providers.md` | Model configurers | Pydantic AI provider reference | Provider list, installation, model identifiers, credentials, model settings, provider-specific troubleshooting. No chooser. No custom backends. |
|
|
39
|
+
| `coding-agent-backends.md` | Backend users | Backend reference | Backend-specific settings, shared capabilities, skills, MCP tool exposure, working directory, troubleshooting. |
|
|
40
|
+
| `for-coding-agents.md` | Coding agents (LLMs) | Operational guide | Condensed, decision-oriented rules derived from human-oriented docs. Self-contained with absolute URLs. |
|
|
41
|
+
| `specification.md` | Implementors | Canonical spec | Syntax, state layers, tools, outcomes, frontmatter, runtime semantics, observability contract, and custom backend capability/protocol semantics. Numbered section headings. |
|
|
42
|
+
| `philosophy.md` | Evaluators | Design rationale | Execution model, harness evidence, design consequences (resilience, scoped execution contexts, tool exposure, multi-agent coordination, tradeoffs), runtime evaluation rationale, design landscape. |
|
|
43
|
+
| `api.md` | Developers | API reference | Auto-generated from docstrings, including protocol and extension-hook symbols. |
|
|
44
|
+
| `roadmap.md` | Contributors | Future directions | Ideas only. Remove when implemented. |
|
|
45
|
+
| `docs/AGENTS.md` | Coding agents editing docs | Documentation governance | Canonical ownership, page roles, routing rules, and docs test invariants. Symlink to `.claude/rules/docs.md`. |
|
|
46
|
+
|
|
47
|
+
## Content routing
|
|
48
|
+
|
|
49
|
+
List only topics that commonly drift across multiple files or are easy to misplace.
|
|
50
|
+
|
|
51
|
+
- **Credentials** -> `quickstart.md` (minimal first run), `pydantic-ai-providers.md` (Pydantic AI providers), `coding-agent-backends.md` (backend prerequisites)
|
|
52
|
+
- **Executor selection** -> `executors.md` (capability and cost tradeoffs), `coding-agent-backends.md` (backend behavior and constraints), `for-coding-agents.md` (block-level operational guidance), `quickstart.md` (minimal entry example only)
|
|
53
|
+
- **Runtime setup and scoping** (`nh.run()`, `nh.scope()`, configuration patching) -> `runtime-configuration.md` (canonical), `patterns.md` (usage-only references), `for-coding-agents.md` (condensed)
|
|
54
|
+
- **Context limits / JSON rendering / execution identity** -> `runtime-configuration.md` (canonical), `verification.md` (`<snipped>` diagnosis only), `specification.md` (formal semantics), `for-coding-agents.md` (condensed)
|
|
55
|
+
- **Bindings** -> `natural-blocks.md` (canonical), `specification.md` (formal definition), `for-coding-agents.md` (condensed)
|
|
56
|
+
- **Binding function design** -> `natural-blocks.md` (principles and basic examples), `patterns.md` (only when binding functions participate in multi-block patterns), `for-coding-agents.md` (condensed)
|
|
57
|
+
- **Resilience** -> `patterns.md` (canonical patterns), `for-coding-agents.md` (condensed operational rules), `philosophy.md` (positioning only)
|
|
58
|
+
- **Testing** -> `verification.md` (canonical patterns and examples), `for-coding-agents.md` (condensed operational rules), `specification.md` (testing is out of scope except for boundary statements)
|
|
59
|
+
- **Observability** -> `verification.md` (usage and debugging workflow), `specification.md` (specification and runtime semantics), `for-coding-agents.md` (normally omit; mention only when needed to explain execution constraints)
|
|
60
|
+
- **Deny frontmatter** -> `patterns.md` (standard patterns), `specification.md` (canonical specification), `for-coding-agents.md` (condensed operational rules)
|
|
61
|
+
- **Coding agent control** -> `philosophy.md` (execution model and design consequences), `coding-agent-backends.md` (configuration and constraints)
|
|
62
|
+
- **Coding agent backend impact** -> `philosophy.md` (execution model and design consequences), `index.md` (brief summary), `coding-agent-backends.md` (details)
|
|
63
|
+
- **Async** -> `patterns.md` (patterns), `specification.md` (specification), `for-coding-agents.md` (condensed rules)
|
|
64
|
+
- **Structured output / Pydantic models** -> `natural-blocks.md` (design guidelines), `specification.md` (type validation specification)
|
|
65
|
+
- **Custom backends** -> `specification.md` (semantics), `api.md` (protocol symbols), `executors.md` (chooser-level mention only)
|
|
66
|
+
- **Workflow engine comparison** -> `philosophy.md` (canonical, design landscape section), `index.md` (link), `executors.md` (link)
|
|
67
|
+
- **Tool exposure tradeoffs** -> `philosophy.md` (canonical, design consequences section), `index.md` (link), `executors.md` (link)
|
|
68
|
+
- **Docs governance** -> `docs/AGENTS.md` and `.claude/rules/docs.md`. No derivative restatement elsewhere.
|
|
69
|
+
|
|
70
|
+
## Shared writing guidelines
|
|
71
|
+
|
|
72
|
+
### General
|
|
73
|
+
|
|
74
|
+
- Headings: sentence case. Capitalize first word, proper nouns (Nighthawk, Natural, Pydantic), acronyms (LLM, JSON, MCP).
|
|
75
|
+
- Anchors: name-based (`#writing-guidelines`), not number-based (`#1-writing-guidelines`). Exception: `specification.md` is a specification document and may use numbered section headings as its stable citation hierarchy.
|
|
76
|
+
- Cross-references: relative links. Exception: `for-coding-agents.md` uses absolute URLs from `site_url`.
|
|
77
|
+
- Terminology: "task" = structural unit (contract), "judgment" = cognitive act. Use "one task per block".
|
|
78
|
+
- Code examples: self-contained and understandable without surrounding prose.
|
|
79
|
+
- Built-in tools (`nh_eval`, `nh_assign`): implementation details. Only `specification.md` may expose them.
|
|
80
|
+
- `@nh.tool`: `specification.md` documents it as part of the specification; `natural-blocks.md` may mention it with a "prefer binding functions" note; all other pages must not reference it.
|
|
81
|
+
- Package name: always `nighthawk-python` in `pip install` commands.
|
|
82
|
+
- When renaming a document or changing its role, update all inbound references, routing rules here, relevant `tests/docs`, and navigation metadata if applicable.
|
|
83
|
+
- When a governance file under `docs/` is not meant for publication, its MkDocs handling must be explicit.
|
|
84
|
+
- `natural-blocks.md` and `patterns.md` together replace the former `guide.md`.
|
|
85
|
+
- References to `design.md` should use `specification.md`.
|
|
86
|
+
- References to `providers.md` should use `pydantic-ai-providers.md`.
|
|
87
|
+
|
|
88
|
+
### Prerequisite notes
|
|
89
|
+
|
|
90
|
+
Pages in the Getting started, Patterns & verification, and Configuration nav groups must open with a short prerequisite note. This supports non-linear readers who jump directly to a topic. The note should be one sentence naming the assumed prior reading. Pages in Reference, Background, and Project groups (`specification.md`, `philosophy.md`, `roadmap.md`, `api.md`) are exempt -- they serve independent audiences that do not follow the learning path.
|
|
91
|
+
|
|
92
|
+
## Per-file rules
|
|
93
|
+
|
|
94
|
+
**`index.md`**
|
|
95
|
+
|
|
96
|
+
- Links list must stay in sync with `nav` in `mkdocs.yml`.
|
|
97
|
+
- One representative code example (Python + Natural block + binding function). The example must be self-contained and runnable (include executor setup and `nh.run()` context).
|
|
98
|
+
- Brief positioning summaries linking to `philosophy.md`. No comparisons or benchmarks.
|
|
99
|
+
|
|
100
|
+
**`quickstart.md`**
|
|
101
|
+
|
|
102
|
+
- Optimize for copy-paste.
|
|
103
|
+
- Include only the minimum needed for a first success.
|
|
104
|
+
- Retain one explicit sentence stating the trust model / hard constraint for Natural blocks and imported markdown.
|
|
105
|
+
- End with a next-page link to `natural-blocks.md`.
|
|
106
|
+
- No backend alternatives beyond a one-line link to `executors.md`.
|
|
107
|
+
|
|
108
|
+
**`natural-blocks.md`**
|
|
109
|
+
|
|
110
|
+
- Prerequisite note: "This page assumes you have completed [Quickstart](quickstart.md)."
|
|
111
|
+
- Owns Natural block anatomy, prompt structure, binding semantics, discoverability, binding function design (principles and basic examples), responsibility split, and structured output design guidelines.
|
|
112
|
+
- Binding function design includes principles and basic examples that are complete within a single block. Advanced multi-block patterns (carry, branching, resilience) belong in `patterns.md`.
|
|
113
|
+
- Owns migrated `prompt-example` test anchors (`basic-binding`, `fstring-injection`, `local-function-signature`, `global-function-reference`). Exception: `carry-pattern` belongs in `patterns.md`.
|
|
114
|
+
- Backend-agnostic: examples use the Quickstart default executor.
|
|
115
|
+
- Cross-reference `specification.md` for formal definitions.
|
|
116
|
+
- Ends with a routing sentence: "Choosing an executor is in [Executors](executors.md). Runtime configuration (`nh.run()`, `nh.scope()`, limits) is in [Runtime configuration](runtime-configuration.md)."
|
|
117
|
+
|
|
118
|
+
**`executors.md`**
|
|
119
|
+
|
|
120
|
+
- Prerequisite note: "This page assumes you have completed [Quickstart](quickstart.md) and [Natural blocks](natural-blocks.md)."
|
|
121
|
+
- Owns executor selection: capability matrix, decision tree, and `StepExecutorConfiguration` basics.
|
|
122
|
+
- Links to `philosophy.md` for positioning instead of duplicating it.
|
|
123
|
+
- Capability matrix must include relative cost and latency columns.
|
|
124
|
+
- Must include an explicit custom-backend routing subsection with a minimal `AgentStepExecutor.from_agent(agent=agent)` runnable example (3-5 lines). Direct `AsyncStepExecutor` implementation belongs in `specification.md`.
|
|
125
|
+
- Ends with routing: side trips to `pydantic-ai-providers.md` and `coding-agent-backends.md`, then next-step link to `runtime-configuration.md`.
|
|
126
|
+
- Runtime configuration topics (`nh.run()`, `nh.scope()`, configuration patching, prompt suffix, context limits, JSON rendering, execution identity) belong in `runtime-configuration.md`, not here.
|
|
127
|
+
|
|
128
|
+
**`runtime-configuration.md`**
|
|
129
|
+
|
|
130
|
+
- Prerequisite note: "This page assumes you have completed [Executors](executors.md)."
|
|
131
|
+
- Owns all runtime configuration: `nh.run()`, `nh.scope()`, configuration patching, prompt suffix fragments, context limits, JSON rendering style, and runtime execution identity.
|
|
132
|
+
- These topics are independent of executor choice. The page applies equally to Pydantic AI providers and coding agent backends.
|
|
133
|
+
- Cross-reference `specification.md` for formal semantics.
|
|
134
|
+
- Ends with next-step link to `patterns.md`.
|
|
135
|
+
|
|
136
|
+
**`patterns.md`**
|
|
137
|
+
|
|
138
|
+
- Prerequisite note: "This page assumes you have completed [Natural blocks](natural-blocks.md) and [Runtime configuration](runtime-configuration.md)."
|
|
139
|
+
- Owns outcomes, deny, async, carry, composition, resilience, and common mistakes.
|
|
140
|
+
- Scope: multi-block coordination and operational patterns. Single-block-complete topics belong in `natural-blocks.md`.
|
|
141
|
+
- Backend-agnostic: no backend-specific file layouts or credentials.
|
|
142
|
+
- Cross-reference `specification.md` for formal definitions.
|
|
143
|
+
|
|
144
|
+
**`verification.md`**
|
|
145
|
+
|
|
146
|
+
- Prerequisite note: "Mock testing is readable after [Natural blocks](natural-blocks.md); later sections assume [Patterns](patterns.md)."
|
|
147
|
+
- Owns mock tests, integration tests, prompt inspection, debugging workflow, and OpenTelemetry usage.
|
|
148
|
+
- Normative observability contracts belong in `specification.md`.
|
|
149
|
+
|
|
150
|
+
**`pydantic-ai-providers.md`**
|
|
151
|
+
|
|
152
|
+
- Prerequisite note: "See [Executors](executors.md) for choosing between providers, backends, and custom executors."
|
|
153
|
+
- Pure Pydantic AI provider reference: installation, model identifiers, model settings, troubleshooting.
|
|
154
|
+
- No chooser table.
|
|
155
|
+
- No custom backends.
|
|
156
|
+
|
|
157
|
+
**`coding-agent-backends.md`**
|
|
158
|
+
|
|
159
|
+
- Prerequisite note: "See [Executors](executors.md) for when to choose a coding agent backend over a provider-backed executor."
|
|
160
|
+
- Reference-first page: minimal orientation only, then backend-specific settings, skills, MCP, working directory, and troubleshooting.
|
|
161
|
+
- Must not become a second chooser page. Capability, latency, cost, and positioning comparisons belong in `executors.md` and `philosophy.md`.
|
|
162
|
+
- Shared capabilities section for common features.
|
|
163
|
+
- Reference `executors.md` for capability and cost comparisons.
|
|
164
|
+
|
|
165
|
+
**`specification.md`**
|
|
166
|
+
|
|
167
|
+
- All rules that applied to the former `design.md` still apply under the new name, `specification.md`.
|
|
168
|
+
- Numbered section headings remain stable.
|
|
169
|
+
- Owns custom backend capability/protocol semantics, placed as a subsection under Section 14 (Step executor) so that existing top-level section numbers stay unchanged.
|
|
170
|
+
- Includes a non-runnable skeletal shape of the `AsyncStepExecutor` protocol surface under Section 14. No runnable implementation example is required (the runnable `from_agent` example lives in `executors.md`).
|
|
171
|
+
|
|
172
|
+
**`philosophy.md`**
|
|
173
|
+
|
|
174
|
+
- Owns the cumulative argument: execution model, harness evidence, design consequences (resilience, scoped execution contexts, tool exposure, multi-agent coordination, tradeoffs), runtime evaluation rationale, and design landscape.
|
|
175
|
+
- External references acceptable. Prefer stable URLs with enough inline context to survive link rot.
|
|
176
|
+
- No how-to code examples for patterns owned by `natural-blocks.md` or `patterns.md`. Exception: positioning examples may reuse function names from those pages.
|
|
177
|
+
|
|
178
|
+
**`for-coding-agents.md`**
|
|
179
|
+
|
|
180
|
+
- Reader is a coding agent. Write for immediate applicability with runnable templates and decision rules.
|
|
181
|
+
- Information flows from human-oriented docs only; never introduce new product behavior here first.
|
|
182
|
+
- May derive from `natural-blocks.md`, `executors.md`, `patterns.md`, `runtime-configuration.md`, `verification.md`, `specification.md`, `pydantic-ai-providers.md`, and `coding-agent-backends.md`.
|
|
183
|
+
- Self-contained with absolute URLs from `site_url` (`https://kurusugawa-computer.github.io/nighthawk-python/`).
|
|
184
|
+
- Prefer decision rules over encyclopedic coverage.
|
|
185
|
+
- Recommend provider-backed executors by default and coding agent backends only for blocks that need autonomous long-horizon work.
|
|
186
|
+
- Keep trust-model constraints explicit.
|
|
187
|
+
- Condensation policy: compress tables and lists to inline summaries or subsets with links to canonical docs. Verbatim duplication only for compact, self-contained content.
|
|
188
|
+
- Common mistakes: subset of most impactful items with link to fuller guidance.
|
|
189
|
+
- Include resilience and scoped overrides.
|
|
190
|
+
- Omit observability except when needed to explain execution constraints.
|
|
191
|
+
- Omit exception hierarchy beyond `ExecutionError` unless a narrower rule is essential for safe coding.
|
|
192
|
+
- Published as a derivative operational reference under `Reference` in nav, not as a top-level learner-path peer.
|
|
193
|
+
- Absolute URLs use topic-based canonical owner mapping:
|
|
194
|
+
- Bindings, block anatomy, responsibility split, binding function design -> `/natural-blocks/`
|
|
195
|
+
- Carry, deny, async, resilience, multi-block composition -> `/patterns/`
|
|
196
|
+
- Executor selection, `StepExecutorConfiguration` basics -> `/executors/`
|
|
197
|
+
- `nh.run()`, `nh.scope()`, configuration patching, context limits, JSON rendering, execution identity -> `/runtime-configuration/`
|
|
198
|
+
- Provider-specific setup -> `/pydantic-ai-providers/`
|
|
199
|
+
- Coding agent backend config -> `/coding-agent-backends/`
|
|
200
|
+
- Spec references -> `/specification/`
|
|
201
|
+
|
|
202
|
+
**`api.md`**
|
|
203
|
+
|
|
204
|
+
- Exhaustive inventory of the supported public API surface. Every supported public symbol should appear here.
|
|
205
|
+
- Auto-generated from source docstrings. Hand-editing limited to `:::` directive structure.
|
|
206
|
+
- Use `members` filters to avoid duplicate rendering.
|
|
207
|
+
- A symbol appearing only in `api.md` (with no learner-facing page coverage) is acceptable. Task-oriented docs select symbols by pedagogical value, not completeness.
|
|
208
|
+
|
|
209
|
+
**`roadmap.md`**
|
|
210
|
+
|
|
211
|
+
- Future-facing only. Remove items once implemented.
|
|
212
|
+
- Each item should reference the relevant `specification.md` section where that helps maintain traceability.
|
|
213
|
+
|
|
214
|
+
**`docs/AGENTS.md`**
|
|
215
|
+
|
|
216
|
+
- Is a symlink to `.claude/rules/docs.md`. No separate synchronization step needed.
|
|
217
|
+
- Must not appear as an accidental published governance page. Exclude via `exclude_docs` in `mkdocs.yml`, listing `AGENTS.md` explicitly by filename.
|
|
218
|
+
|
|
219
|
+
## Documentation test invariants
|
|
220
|
+
|
|
221
|
+
- Treat executable or doctrinal claims in docs as testable when practical.
|
|
222
|
+
- `prompt-example` anchors live in `natural-blocks.md` by default (`basic-binding`, `fstring-injection`, `local-function-signature`, `global-function-reference`). Exception: `carry-pattern` lives in `patterns.md` (cross-block composition). Test file: `tests/docs/test_prompt_examples.py`.
|
|
223
|
+
- `for-coding-agents.md` operational examples and core doctrine are guarded by `tests/docs/test_coding_agent_examples.py`. Update the tests when changing executable guidance or non-negotiable rules.
|
|
224
|
+
- When a docs change invalidates an existing test, first decide whether the docs or the test is the canonical truth for that claim, then update both sides to match.
|
|
225
|
+
- Docs architecture regression tests (`tests/docs/test_docs_architecture.py`):
|
|
226
|
+
- Fail on stale references to deleted/renamed docs (`guide.md`, `design.md`, `providers.md`).
|
|
227
|
+
- Guard the canonical example relationship between `index.md` and `README.md` via fenced-code-block extraction + normalized exact match.
|
|
228
|
+
- Guard selected canonical-owner expectations where drift is likely.
|
|
229
|
+
- Automate `mkdocs.yml` nav entries vs `docs/` file existence as a pytest case.
|
|
230
|
+
- Guard that obsolete canonical pages do not remain published accidentally alongside their replacements.
|
|
231
|
+
- `mkdocs build` must succeed without `--strict`. Warnings are acceptable.
|
|
232
|
+
- All internal relative links must resolve.
|
|
@@ -0,0 +1,137 @@
|
|
|
1
|
+
---
|
|
2
|
+
paths:
|
|
3
|
+
- "evals/promptfoo/**"
|
|
4
|
+
---
|
|
5
|
+
|
|
6
|
+
# Prompt evaluation (promptfoo)
|
|
7
|
+
|
|
8
|
+
## Directory roles
|
|
9
|
+
|
|
10
|
+
| Directory | Purpose | Determinism | Cost |
|
|
11
|
+
|---|---|---|---|
|
|
12
|
+
| `evals/promptfoo/` | Prompt experimentation: system prompt variants, tool descriptions, suffix fragments, backend comparison | Non-deterministic; use `--repeat N` to measure stability. | API calls x N x providers. |
|
|
13
|
+
| `evals/promptfoo/outputs/` | Transient raw output (gitignored). Working files for in-progress analysis. | N/A | N/A |
|
|
14
|
+
| `evals/promptfoo/evidence/` | Committed eval evidence. Decision rationale for adopted/rejected variants. | N/A | N/A |
|
|
15
|
+
|
|
16
|
+
## Prompt changes workflow
|
|
17
|
+
|
|
18
|
+
1. Edit eval-layer prompts or provider config in `evals/promptfoo/`.
|
|
19
|
+
2. Run eval: `eval $PFOO eval --filter-providers "<provider>" --no-cache` (`$PFOO` is the shell variable defined in `CONTRIBUTING.md` "Prompt evaluation with promptfoo").
|
|
20
|
+
3. Compare against previous eval in the promptfoo DB or JSON output.
|
|
21
|
+
4. Once validated, port the change to the corresponding production code (see mapping below).
|
|
22
|
+
5. Run `uv run pytest -q` to confirm no regressions.
|
|
23
|
+
|
|
24
|
+
**`--filter-providers` caveat**: The flag takes a regex pattern, not an exact label. A pattern like `"gpt-5.4-mini"` matches every provider whose label contains that substring (e.g. both `openai-responses` and `codex` labels). Use an anchored or label-specific pattern (e.g. `"^gpt-5.4-mini"`, `"codex:"`) to target a single provider.
|
|
25
|
+
|
|
26
|
+
## Prompt variant cleanup (deletion timing and criteria)
|
|
27
|
+
|
|
28
|
+
- Delete rejected prompt variants immediately when any of the following is true:
|
|
29
|
+
- Deterministic regression (`N/N` failures across repeats, e.g. `--repeat 3`)
|
|
30
|
+
- Clear degradation versus baseline on primary metrics
|
|
31
|
+
- Temporary/scratch variants created for quick checks
|
|
32
|
+
- Keep a variant only if it has clear re-test value (e.g. flaky `1/N` behavior or meaningful metric trade-offs).
|
|
33
|
+
- Retention for kept variants is limited to `min(7 days, 2 experiment cycles)`.
|
|
34
|
+
- If not re-evaluated within this window, delete it.
|
|
35
|
+
- Run cleanup at three checkpoints:
|
|
36
|
+
1. Right after each experiment run
|
|
37
|
+
2. Before opening a PR
|
|
38
|
+
3. Before merge (final sweep)
|
|
39
|
+
- Before deleting, record a short rejection reason in the corresponding `evals/promptfoo/evidence/` file; do not keep dead prompt files just for history.
|
|
40
|
+
|
|
41
|
+
## Backend prerequisites
|
|
42
|
+
|
|
43
|
+
| Backend | Requirement |
|
|
44
|
+
|---|---|
|
|
45
|
+
| `openai-responses` | `OPENAI_API_KEY` environment variable. |
|
|
46
|
+
| `codex` | Pre-authenticated `codex` CLI (`codex login`) or `CODEX_API_KEY` environment variable. |
|
|
47
|
+
| `claude-code-cli` | Pre-authenticated `claude` CLI (`claude login`) or `ANTHROPIC_API_KEY` environment variable. |
|
|
48
|
+
| `claude-code-sdk` | `ANTHROPIC_API_KEY` environment variable. |
|
|
49
|
+
|
|
50
|
+
Evals that include a backend without its prerequisite will run but produce errors for that backend's test cases. Use `--filter-providers` to exclude unavailable backends.
|
|
51
|
+
|
|
52
|
+
## Eval-to-production mapping
|
|
53
|
+
|
|
54
|
+
Eval prompts are experimental copies of production code. Keep them in sync; divergence means eval results do not predict production behavior.
|
|
55
|
+
|
|
56
|
+
| Eval file | Production counterpart |
|
|
57
|
+
|---|---|
|
|
58
|
+
| `evals/promptfoo/prompts/eval_default.txt` | `configuration.py:DEFAULT_STEP_SYSTEM_PROMPT_TEMPLATE` |
|
|
59
|
+
| `evals/promptfoo/prompts/eval_coding_agent.txt` | No single counterpart; coding agent backends receive this prompt via `system_prompt_file` config. |
|
|
60
|
+
| Suffix variants in `evals/promptfoo/provider.py` (`_build_suffix_*`) | `step_contract.py:build_step_system_prompt_suffix_fragment` |
|
|
61
|
+
| Tool presets in `evals/promptfoo/provider.py` (`_build_tool_preset`) | `tools/registry.py` + `tools/assignment.py` |
|
|
62
|
+
|
|
63
|
+
## Eval evidence
|
|
64
|
+
|
|
65
|
+
### When to save evidence
|
|
66
|
+
|
|
67
|
+
| Save | Do not save |
|
|
68
|
+
|---|---|
|
|
69
|
+
| Eval that decided adoption or rejection of a prompt/suffix/tool variant | In-progress trial runs during development |
|
|
70
|
+
| Regression baseline update | Single-test filter runs |
|
|
71
|
+
| Eval backing a change merged via PR | Transient output already in `outputs/` |
|
|
72
|
+
|
|
73
|
+
### File path convention
|
|
74
|
+
|
|
75
|
+
```
|
|
76
|
+
evals/promptfoo/evidence/{YYYY-MM-DD}-{experiment-slug}.md
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
Examples: `2026-03-20-suffix-ab.md`, `2026-03-25-regression-v2.md`.
|
|
80
|
+
|
|
81
|
+
### File format
|
|
82
|
+
|
|
83
|
+
```markdown
|
|
84
|
+
---
|
|
85
|
+
eval_id: <promptfoo eval ID>
|
|
86
|
+
config: <YAML config file used>
|
|
87
|
+
date: YYYY-MM-DD
|
|
88
|
+
decision: <one-line summary of what was adopted/rejected>
|
|
89
|
+
---
|
|
90
|
+
|
|
91
|
+
## Providers tested
|
|
92
|
+
- <provider label> (<variants if A/B>)
|
|
93
|
+
|
|
94
|
+
## Results summary
|
|
95
|
+
| Variant | Pass | Fail | Error | Score | Latency |
|
|
96
|
+
|---------|------|------|-------|-------|---------|
|
|
97
|
+
| ... | | | | | |
|
|
98
|
+
|
|
99
|
+
## Decision rationale
|
|
100
|
+
<Why the chosen variant was adopted.>
|
|
101
|
+
|
|
102
|
+
## Rejected variants
|
|
103
|
+
- <variant>: <short rejection reason>
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
### Rules
|
|
107
|
+
|
|
108
|
+
- Only Markdown files (`*.md`) are committed under `evidence/`. Raw JSON stays in `outputs/` (gitignored).
|
|
109
|
+
- One file per experiment decision. If a follow-up eval revises a prior decision, create a new file; do not overwrite.
|
|
110
|
+
- Reference the `eval_id` so the full raw result can be retrieved from the promptfoo local DB (`promptfoo view`) or `-o` export if needed.
|
|
111
|
+
|
|
112
|
+
## Eval interpretation
|
|
113
|
+
|
|
114
|
+
- **Exit code 100**: promptfoo returns exit code 100 when any test case fails. This is not a system error; it signals "some assertions did not pass". CI scripts and background runners should treat exit 100 as "check results" rather than "eval crashed".
|
|
115
|
+
- **OpenAI 500 errors**: Transient; ignore unless persistent across runs.
|
|
116
|
+
- **Codex CLI errors**: Codex backend is unstable; isolate with `--filter-providers`.
|
|
117
|
+
- **Codex binding-not-returned**: Codex may return `None` for write bindings that `openai-responses` handles correctly. This is a distinct failure mode from flaky LLM non-determinism; it typically indicates the coding-agent backend did not invoke the assignment tool.
|
|
118
|
+
- **Mutation vs filter ambiguity**: Natural language instructions like "remove negative numbers" can be interpreted as either in-place mutation or filter-to-new-list. LLMs inconsistently choose between these, producing flaky failures where the binding value is the original unmodified collection. This pattern recurs across providers and suffix variants.
|
|
119
|
+
- **1/N failures (flaky)**: Inherent LLM non-determinism. Use `--repeat 3` to distinguish from deterministic regressions.
|
|
120
|
+
- **Deterministic failures** (N/N across repeats): Require prompt or code fix before merging.
|
|
121
|
+
|
|
122
|
+
## Baseline workflow
|
|
123
|
+
|
|
124
|
+
Run a full baseline when: (a) setting up the eval environment for the first time, (b) after a major production code change, or (c) when prior baselines are stale (> 2 weeks or across model version changes).
|
|
125
|
+
|
|
126
|
+
1. Verify prerequisites for each backend (see Backend prerequisites above).
|
|
127
|
+
2. Run all applicable configs in parallel with `--no-cache` and `-o outputs/baseline-{slug}.json`:
|
|
128
|
+
- `promptfooconfig.yaml` — regression across all available backends.
|
|
129
|
+
- `promptfooconfig-prompt-ab.yaml` — prompt/tool variant comparison.
|
|
130
|
+
- `promptfooconfig-suffix-ab.yaml` — suffix fragment comparison.
|
|
131
|
+
- `promptfooconfig-agents.yaml` — coding-agent backends (requires `codex` and `claude` CLIs).
|
|
132
|
+
3. For backends without prerequisites, either skip the config or use `--filter-providers` to exclude unavailable providers.
|
|
133
|
+
4. Create one evidence file per config under `evals/promptfoo/evidence/` with the `baseline-` slug prefix.
|
|
134
|
+
|
|
135
|
+
## Commands
|
|
136
|
+
|
|
137
|
+
See `CONTRIBUTING.md` "Prompt evaluation with promptfoo" for eval commands, config files, directory layout, and flags.
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
---
|
|
2
|
+
paths:
|
|
3
|
+
- "tests/**"
|
|
4
|
+
---
|
|
5
|
+
|
|
6
|
+
# Testing (pytest)
|
|
7
|
+
|
|
8
|
+
## Directory roles
|
|
9
|
+
|
|
10
|
+
| Directory | Purpose | Determinism | Cost |
|
|
11
|
+
|---|---|---|---|
|
|
12
|
+
| `tests/` | Pytest suite: unit tests (`ScriptedExecutor`) and integration tests (real LLM) | Unit: deterministic. Integration: non-deterministic but single-run. | Unit: free. Integration: API calls. |
|
|
13
|
+
| `src/nighthawk/testing.py` | Test utility API for deterministic Natural-function tests (`ScriptedExecutor`, `CallbackExecutor`, and response factories). | Deterministic. | Free. |
|
|
14
|
+
|
|
15
|
+
## Workflow
|
|
16
|
+
|
|
17
|
+
### Scope boundary (pytest vs promptfoo)
|
|
18
|
+
|
|
19
|
+
- Do not force prompt behavior validation into pytest-only checks.
|
|
20
|
+
- When prompt rendering, system prompt text, suffix generation, or tool-exposure behavior changes, follow `.claude/rules/promptfoo.md`.
|
|
21
|
+
|
|
22
|
+
### Python code changes (tools, executor, contracts)
|
|
23
|
+
|
|
24
|
+
1. Write or update unit tests in `tests/` first.
|
|
25
|
+
2. Prefer helpers from `nighthawk.testing` (for example `ScriptedExecutor`, `CallbackExecutor`, `pass_response`, `return_response`) when avoiding live LLM calls.
|
|
26
|
+
3. Run `uv run pytest -q`.
|
|
27
|
+
4. If the change affects prompt rendering or tool behavior, follow `.claude/rules/promptfoo.md` and run the relevant eval subset.
|
|
@@ -30,7 +30,7 @@ model_list:
|
|
|
30
30
|
additional_drop_params: ["context_management", "output_config"]
|
|
31
31
|
- model_name: claude-haiku-*
|
|
32
32
|
litellm_params:
|
|
33
|
-
model: openai/gpt-5-mini
|
|
33
|
+
model: openai/gpt-5.4-mini
|
|
34
34
|
api_key: os.environ/OPENAI_API_KEY
|
|
35
35
|
additional_drop_params: ["context_management", "output_config"]
|
|
36
36
|
litellm_settings:
|
|
@@ -13,7 +13,7 @@
|
|
|
13
13
|
|
|
14
14
|
### Allowed abbreviations (Glossary)
|
|
15
15
|
|
|
16
|
-
`Id` (Identifier), `DSL` (Domain Specific Language), `LLM` (Large Language Model), `NH`/`nh` (Nighthawk), `max` (maximum), `min` (minimum), loop indices `i`/`j`/`k
|
|
16
|
+
`Id` (Identifier), `DSL` (Domain Specific Language), `LLM` (Large Language Model), `NH`/`nh` (Nighthawk), `max` (maximum), `min` (minimum), loop indices `i`/`j`/`k`, type parameters `P` (ParamSpec), `R` (Return type), `T` (Type variable).
|
|
17
17
|
|
|
18
18
|
### Rules
|
|
19
19
|
|
|
@@ -37,7 +37,7 @@ Natural DSL sources and included markdown are trusted, repository-managed assets
|
|
|
37
37
|
|
|
38
38
|
- `src/nighthawk/`: Library package.
|
|
39
39
|
- `tests/`: Pytest suite.
|
|
40
|
-
- `docs/`:
|
|
40
|
+
- `docs/`: User-facing documentation (MkDocs).
|
|
41
41
|
- `.agents/`: `execplans/` (on request only), `PLANS.md` (format spec).
|
|
42
42
|
- `.devcontainer/`: Devcontainer definition.
|
|
43
43
|
- `pyproject.toml`, `uv.lock`: Metadata and locked dependencies.
|
|
@@ -51,13 +51,16 @@ Python 3.13+, `uv` for dependencies, `pytest` for tests. Prefer LSP-based toolin
|
|
|
51
51
|
| `uv run python` | Investigate interactively |
|
|
52
52
|
| `uv sync --all-extras --all-groups` | Install/sync dependencies |
|
|
53
53
|
| `uv run ruff format .` | Format |
|
|
54
|
-
| `uv run ruff check .` | Lint |
|
|
55
54
|
| `uv run ruff check --fix .` | Auto-fix lint |
|
|
56
55
|
| `uv run pyright` | Type check |
|
|
57
|
-
| `uv run pytest` | Full test suite |
|
|
58
56
|
| `uv run pytest -q` | Tests (quiet) |
|
|
59
|
-
| `
|
|
57
|
+
| `NIGHTHAWK_OPENAI_INTEGRATION_TESTS=1 uv run pytest -q` | Integration tests (OpenAI) |
|
|
58
|
+
| `NIGHTHAWK_CODEX_INTEGRATION_TESTS=1 uv run pytest -q` | Integration tests (Codex) |
|
|
59
|
+
| `NIGHTHAWK_CLAUDE_SDK_INTEGRATION_TESTS=1 uv run pytest -q` | Integration tests (Claude Code SDK) |
|
|
60
|
+
| `NIGHTHAWK_CLAUDE_CLI_INTEGRATION_TESTS=1 uv run pytest -q` | Integration tests (Claude Code CLI) |
|
|
60
61
|
|
|
61
62
|
`uv` hardlinking warnings do not indicate failure. Suppress: `export UV_LINK_MODE=copy`.
|
|
62
63
|
|
|
63
64
|
Environment: `OPENAI_API_KEY` (OpenAI), `CODEX_API_KEY` (Codex).
|
|
65
|
+
|
|
66
|
+
Promptfoo evaluation details (commands, configs, directory layout, flags): see `CONTRIBUTING.md` "Prompt evaluation with promptfoo".
|
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
All notable changes to this project will be documented in this file.
|
|
4
|
+
|
|
5
|
+
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
|
|
6
|
+
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
|
+
|
|
8
|
+
## [Unreleased]
|
|
9
|
+
|
|
10
|
+
### Added
|
|
11
|
+
- `nighthawk.resilience` module with composable function transformers for production resilience: `retrying` (tenacity-based), `fallback`, `vote`/`plurality`, `timeout`, `circuit_breaker`/`CircuitState`/`CircuitOpenError`.
|
|
12
|
+
- `tenacity>=9` as a core dependency.
|
|
13
|
+
- `BackendModelSettings` base class and `ClaudeCodeModelSettings` intermediate class extracting shared settings across coding agent backends.
|
|
14
|
+
|
|
15
|
+
### Changed
|
|
16
|
+
- Refactored backend settings hierarchy: extracted shared fields (`allowed_tool_names`, `working_directory`) into `BackendModelSettings` and Claude Code fields (`max_turns`, `permission_mode`, `setting_sources`) into `ClaudeCodeModelSettings`; renamed `claude_executable`/`codex_executable` to `executable`, `claude_max_turns` to `max_turns`.
|
|
17
|
+
- `nh_assign` now resolves type annotations via `get_type_hints` for plain classes and dataclasses, enabling type-mismatch retry beyond Pydantic models.
|
|
18
|
+
- Simplified intent hint formatting: dropped `intent: ` prefix from callable metadata comments in prompt context.
|
|
19
|
+
- Renamed `NIGHTHAWK_RUN_INTEGRATION_TESTS` to `NIGHTHAWK_OPENAI_INTEGRATION_TESTS` for consistency with other per-backend environment variables.
|
|
20
|
+
- Restructured documentation into sectioned navigation: split monolithic tutorial into focused guides (`natural-blocks`, `executors`, `runtime-configuration`, `patterns`, `verification`, `pydantic-ai-providers`); renamed `design.md` to `specification.md`; removed `practices.md`, `providers.md`, `tutorial.md`.
|
|
21
|
+
|
|
22
|
+
## [0.5.0]
|
|
23
|
+
|
|
24
|
+
### Added
|
|
25
|
+
- `evals/promptfoo/` evaluation harness for system prompt optimization using [promptfoo](https://www.promptfoo.dev/): custom Python provider, reusable assertions, and prompt variant A/B comparison support. See `CONTRIBUTING.md` for usage.
|
|
26
|
+
- `docs/philosophy.md`: design philosophy and motivation behind Nighthawk.
|
|
27
|
+
- `docs/practices.md`: practical patterns and binding function design guidance (extracted from tutorial).
|
|
28
|
+
|
|
29
|
+
### Changed
|
|
30
|
+
- Replaced `return_reference_path` with `return_expression` in step execution contract: return values are now specified as Python expressions evaluated against step locals/globals, consistent with `nh_eval`/`nh_assign` expression evaluation. This unblocks coding-agent backends (e.g. Claude Code CLI) that compute results via native tools without bridging values through `nh_assign`.
|
|
31
|
+
- `nh_assign` now infers binding types from initial values when no explicit annotation is provided, enabling type-mismatch retry for unannotated write bindings.
|
|
32
|
+
- Default `json_renderer_style` changed from `"strict"` to `"default"`, making truncation visible via `…` omission markers in prompt context and tool results.
|
|
33
|
+
- Merged `nh_exec` into `nh_eval`: `nh_eval` now handles expression evaluation, function calls, and in-place mutation. `nh_exec` is removed.
|
|
34
|
+
- Condensed system prompt: simplified tool selection guidance (single `nh_eval` tool), added execution order section, clarified tool result format.
|
|
35
|
+
- Condensed step execution contract (outcome prompt suffix) for reduced token usage.
|
|
36
|
+
- Improved `nh_assign` and `nh_eval` tool descriptions for LLM clarity.
|
|
37
|
+
- Restructured documentation: rewrote `index.md`, `tutorial.md`, `for-coding-agents.md`; cross-referenced specification and practice guides.
|
|
38
|
+
- Integration tests: replaced single `NIGHTHAWK_RUN_INTEGRATION_TESTS` gate with per-backend environment variables (`NIGHTHAWK_CODEX_INTEGRATION_TESTS`, `NIGHTHAWK_CLAUDE_SDK_INTEGRATION_TESTS`, `NIGHTHAWK_CLAUDE_CLI_INTEGRATION_TESTS`).
|
|
39
|
+
|
|
40
|
+
### Removed
|
|
41
|
+
- `nh_exec` tool (functionality absorbed by `nh_eval`).
|
|
42
|
+
- Three redundant OpenAI integration tests from `test_llm_integration.py` (covered by promptfoo evaluation harness).
|
|
43
|
+
- `pytest_sessionstart` credential-check hook (replaced by per-backend skip helpers).
|
|
44
|
+
|
|
45
|
+
## [0.4.0] - 2026-03-20
|
|
46
|
+
|
|
47
|
+
### Added
|
|
48
|
+
- `nighthawk.testing` module with test executors and convenience factories for deterministic Natural function testing without LLM API calls.
|
|
49
|
+
|
|
50
|
+
### Changed
|
|
51
|
+
- Rewrote testing documentation in `tutorial.md` (Section 8) and `for-coding-agents.md` (Section 8): replaced incorrect `TestModel` usage with `nighthawk.testing` utilities, added testing strategy guidance distinguishing mock tests (Python logic) from integration tests (Natural block judgment).
|
|
52
|
+
|
|
53
|
+
## [0.3.1] - 2026-03-19
|
|
54
|
+
|
|
55
|
+
### Changed
|
|
56
|
+
- Internal ID generation now uses `ulid.generate_ulid()` (ULID,
|
|
57
|
+
26-character Crockford Base32, timestamp-sortable) in a dedicated
|
|
58
|
+
module, replacing the former `generate_id` embedded in
|
|
59
|
+
`runtime.scoping`.
|
|
60
|
+
|
|
61
|
+
## [0.3.0] - 2026-03-18
|
|
62
|
+
|
|
63
|
+
### Added
|
|
64
|
+
- `system_prompt_suffix_fragment_scope` and `user_prompt_suffix_fragment_scope` context managers for lightweight prompt fragment management without full scope overhead.
|
|
65
|
+
- OpenTelemetry tracer now reports `instrumenting_library_version`.
|
|
66
|
+
|
|
67
|
+
### Changed
|
|
68
|
+
- Simplified OpenTelemetry span hierarchy: removed implicit `nighthawk.scope` spans and `nighthawk.step_executor` spans. `nighthawk.scope` spans are now emitted only for explicit `nh.scope()` calls.
|
|
69
|
+
- `nighthawk.run` span no longer includes `scope.id` attribute; only `run.id` is emitted.
|
|
70
|
+
- Trimmed `for-coding-agents.md` for coding-agent relevance: removed deprecated `@nh.tool` references, condensed the exception hierarchy and scoped overrides, and added debugging context to `StepContextLimits`.
|
|
71
|
+
|
|
72
|
+
## [0.2.0] - 2026-03-16
|
|
73
|
+
|
|
74
|
+
### Added
|
|
75
|
+
- Added compact step trace support for Natural block execution attempts:
|
|
76
|
+
- `StepTrace`
|
|
77
|
+
- `StepTraceError`
|
|
78
|
+
- `nighthawk.get_step_traces()`
|
|
79
|
+
|
|
80
|
+
### Changed
|
|
81
|
+
- Updated CI workflow setup (`setup-uv`) in project automation.
|
|
82
|
+
|
|
83
|
+
### Fixed
|
|
84
|
+
- License badge reference in README.
|
|
85
|
+
- Documentation formatting inconsistencies.
|
|
86
|
+
|
|
87
|
+
## [0.1.0] - 2026-03-13
|
|
88
|
+
|
|
89
|
+
### Added
|
|
90
|
+
- Initial public release of `nighthawk-python`.
|
|
91
|
+
- Natural DSL execution runtime with run/scope execution context model.
|
|
92
|
+
- Step executor abstraction and provider integration foundation.
|
|
93
|
+
- Core documentation and project scaffolding.
|
|
94
|
+
|
|
95
|
+
[Unreleased]: https://github.com/kurusugawa-computer/nighthawk-python/compare/v0.5.0...HEAD
|
|
96
|
+
[0.5.0]: https://github.com/kurusugawa-computer/nighthawk-python/compare/v0.4.0...v0.5.0
|
|
97
|
+
[0.4.0]: https://github.com/kurusugawa-computer/nighthawk-python/compare/v0.3.1...v0.4.0
|
|
98
|
+
[0.3.1]: https://github.com/kurusugawa-computer/nighthawk-python/compare/v0.3.0...v0.3.1
|
|
99
|
+
[0.3.0]: https://github.com/kurusugawa-computer/nighthawk-python/compare/v0.2.0...v0.3.0
|
|
100
|
+
[0.2.0]: https://github.com/kurusugawa-computer/nighthawk-python/compare/v0.1.0...v0.2.0
|
|
101
|
+
[0.1.0]: https://github.com/kurusugawa-computer/nighthawk-python/tree/v0.1.0
|
|
@@ -70,6 +70,48 @@ OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4318 uv run pytest -q tests/integra
|
|
|
70
70
|
|
|
71
71
|
Traces appear in the otel-tui terminal UI in real time.
|
|
72
72
|
|
|
73
|
+
### Prompt evaluation with promptfoo
|
|
74
|
+
|
|
75
|
+
System prompt, tool descriptions, and backend behavior are evaluated with [promptfoo](https://www.promptfoo.dev/). Requires Node.js (for `npx`). API keys must be loaded from `.env` first.
|
|
76
|
+
|
|
77
|
+
```bash
|
|
78
|
+
set -a; source .env; set +a
|
|
79
|
+
PFOO="cd evals/promptfoo && PROMPTFOO_PYTHON=\"$(uv python find)\" npx promptfoo@latest"
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
| Command | Purpose |
|
|
83
|
+
|---|---|
|
|
84
|
+
| `eval $PFOO eval` | Full regression (all backends, all tests) |
|
|
85
|
+
| `eval $PFOO eval -c promptfooconfig-prompt-ab.yaml` | Prompt A/B test (gpt-5.4-mini only) |
|
|
86
|
+
| `eval $PFOO eval -c promptfooconfig-agents.yaml` | Coding agent backends (reduced test set) |
|
|
87
|
+
| `eval $PFOO eval -c promptfooconfig.yaml --filter-pattern "P-BIND-001"` | Single test |
|
|
88
|
+
| `eval $PFOO eval -c promptfooconfig.yaml --filter-providers "claude-code-cli"` | Single backend |
|
|
89
|
+
| `eval $PFOO view` | Open results in browser |
|
|
90
|
+
|
|
91
|
+
#### Config files (`evals/promptfoo/`)
|
|
92
|
+
|
|
93
|
+
- `promptfooconfig.yaml` — Regression: winner prompt combo across openai-responses, claude-code-cli, and codex backends. All test cases.
|
|
94
|
+
- `promptfooconfig-prompt-ab.yaml` — A/B testing: 4 prompt/tool variants on gpt-5.4-mini. All test cases.
|
|
95
|
+
- `promptfooconfig-agents.yaml` — Coding agent only: claude-code-cli and codex with reduced test set.
|
|
96
|
+
|
|
97
|
+
#### Directory layout
|
|
98
|
+
|
|
99
|
+
- `provider.py` — Custom provider wrapping `AgentStepExecutor`. Handles tool preset installation, backend-specific model settings, and callable fixture resolution.
|
|
100
|
+
- `prompts/` — System prompt variants (`eval_default.txt`, `eval_sequenced.txt`, `eval_mutation_aware.txt`, `eval_coding_agent.txt`, etc.).
|
|
101
|
+
- `test_cases/` — YAML test suites: `binding_operations`, `tool_selection`, `outcome_kinds`, `edge_cases`, `loop_outcomes`, `multi_step`, `null_handling`, `tool_selection_core`.
|
|
102
|
+
- `assertions/` — Custom Python assertions: `binding_value.py`, `outcome_kind.py`, `raise_message.py`.
|
|
103
|
+
|
|
104
|
+
#### Adding tests
|
|
105
|
+
|
|
106
|
+
Each test case YAML entry needs: a `description` (with a `P-SLUG-NNN` Id), `vars` (`natural_program`, `input_bindings`, `output_binding_names`), and an `assert` list. Callable bindings use `"__callable:<key>"` resolved from `_CALLABLE_FIXTURE_REGISTRY` in `provider.py`.
|
|
107
|
+
|
|
108
|
+
#### Useful flags
|
|
109
|
+
|
|
110
|
+
- `--no-cache` — Skip cache (required after changing `provider.py` or prompts).
|
|
111
|
+
- `--filter-pattern "<regex>"` — Run only matching test descriptions.
|
|
112
|
+
- `--filter-providers "<regex>"` — Run only matching provider labels.
|
|
113
|
+
- `-o <path>.json` — Write structured results to file for analysis.
|
|
114
|
+
|
|
73
115
|
### Environment variables
|
|
74
116
|
|
|
75
117
|
- `OPENAI_API_KEY`: Required for OpenAI integration tests (also requires `pydantic-ai-slim[openai]`).
|
|
@@ -160,7 +202,7 @@ def run(
|
|
|
160
202
|
Example:
|
|
161
203
|
```python
|
|
162
204
|
executor = AgentStepExecutor.from_configuration(
|
|
163
|
-
configuration=StepExecutorConfiguration(model="openai-responses:gpt-5-mini"),
|
|
205
|
+
configuration=StepExecutorConfiguration(model="openai-responses:gpt-5.4-mini"),
|
|
164
206
|
)
|
|
165
207
|
with nighthawk.run(executor):
|
|
166
208
|
result = my_natural_function()
|