fluxloop-cli 0.2.19__tar.gz → 0.2.36__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {fluxloop_cli-0.2.19 → fluxloop_cli-0.2.36}/PKG-INFO +40 -3
- fluxloop_cli-0.2.36/README.md +116 -0
- {fluxloop_cli-0.2.19 → fluxloop_cli-0.2.36}/fluxloop_cli/__init__.py +1 -1
- {fluxloop_cli-0.2.19 → fluxloop_cli-0.2.36}/fluxloop_cli/arg_binder.py +103 -24
- {fluxloop_cli-0.2.19 → fluxloop_cli-0.2.36}/fluxloop_cli/commands/config.py +0 -1
- fluxloop_cli-0.2.36/fluxloop_cli/commands/evaluate.py +264 -0
- {fluxloop_cli-0.2.19 → fluxloop_cli-0.2.36}/fluxloop_cli/commands/init.py +78 -22
- {fluxloop_cli-0.2.19 → fluxloop_cli-0.2.36}/fluxloop_cli/commands/parse.py +175 -8
- {fluxloop_cli-0.2.19 → fluxloop_cli-0.2.36}/fluxloop_cli/commands/record.py +17 -11
- {fluxloop_cli-0.2.19 → fluxloop_cli-0.2.36}/fluxloop_cli/commands/run.py +213 -26
- {fluxloop_cli-0.2.19 → fluxloop_cli-0.2.36}/fluxloop_cli/commands/status.py +1 -2
- {fluxloop_cli-0.2.19 → fluxloop_cli-0.2.36}/fluxloop_cli/config_loader.py +0 -2
- fluxloop_cli-0.2.36/fluxloop_cli/conversation_supervisor.py +326 -0
- fluxloop_cli-0.2.36/fluxloop_cli/environment.py +71 -0
- fluxloop_cli-0.2.36/fluxloop_cli/evaluation/artifacts.py +115 -0
- {fluxloop_cli-0.2.19 → fluxloop_cli-0.2.36}/fluxloop_cli/evaluation/config.py +56 -3
- {fluxloop_cli-0.2.19 → fluxloop_cli-0.2.36}/fluxloop_cli/evaluation/engine/core.py +44 -27
- {fluxloop_cli-0.2.19 → fluxloop_cli-0.2.36}/fluxloop_cli/evaluation/engine/reporting/html.py +376 -15
- {fluxloop_cli-0.2.19 → fluxloop_cli-0.2.36}/fluxloop_cli/evaluation/llm.py +34 -0
- fluxloop_cli-0.2.36/fluxloop_cli/evaluation/prompts/__init__.py +53 -0
- fluxloop_cli-0.2.36/fluxloop_cli/evaluation/prompts/base.py +48 -0
- fluxloop_cli-0.2.36/fluxloop_cli/evaluation/prompts/information_completeness.py +45 -0
- fluxloop_cli-0.2.36/fluxloop_cli/evaluation/prompts/intent_recognition.py +46 -0
- fluxloop_cli-0.2.36/fluxloop_cli/evaluation/prompts/response_clarity.py +44 -0
- fluxloop_cli-0.2.36/fluxloop_cli/evaluation/prompts/response_consistency.py +46 -0
- fluxloop_cli-0.2.36/fluxloop_cli/evaluation/report/__init__.py +0 -0
- fluxloop_cli-0.2.36/fluxloop_cli/evaluation/report/aggregator.py +642 -0
- fluxloop_cli-0.2.36/fluxloop_cli/evaluation/report/generator.py +896 -0
- fluxloop_cli-0.2.36/fluxloop_cli/evaluation/report/pipeline.py +156 -0
- fluxloop_cli-0.2.36/fluxloop_cli/evaluation/report/renderer.py +479 -0
- {fluxloop_cli-0.2.19 → fluxloop_cli-0.2.36}/fluxloop_cli/evaluation/rules.py +59 -1
- fluxloop_cli-0.2.36/fluxloop_cli/evaluation/templates/report.html.j2 +7072 -0
- fluxloop_cli-0.2.36/fluxloop_cli/input_generator.py +261 -0
- {fluxloop_cli-0.2.19 → fluxloop_cli-0.2.36}/fluxloop_cli/llm_generator.py +23 -23
- {fluxloop_cli-0.2.19 → fluxloop_cli-0.2.36}/fluxloop_cli/main.py +0 -2
- {fluxloop_cli-0.2.19 → fluxloop_cli-0.2.36}/fluxloop_cli/project_paths.py +0 -1
- {fluxloop_cli-0.2.19 → fluxloop_cli-0.2.36}/fluxloop_cli/runner.py +476 -32
- fluxloop_cli-0.2.36/fluxloop_cli/templates.py +546 -0
- fluxloop_cli-0.2.36/fluxloop_cli/testing/__init__.py +24 -0
- fluxloop_cli-0.2.36/fluxloop_cli/testing/pytest_plugin.py +432 -0
- fluxloop_cli-0.2.36/fluxloop_cli/testing/types.py +188 -0
- fluxloop_cli-0.2.36/fluxloop_cli/token_usage.py +182 -0
- {fluxloop_cli-0.2.19 → fluxloop_cli-0.2.36}/fluxloop_cli.egg-info/PKG-INFO +40 -3
- {fluxloop_cli-0.2.19 → fluxloop_cli-0.2.36}/fluxloop_cli.egg-info/SOURCES.txt +25 -1
- {fluxloop_cli-0.2.19 → fluxloop_cli-0.2.36}/fluxloop_cli.egg-info/requires.txt +2 -0
- {fluxloop_cli-0.2.19 → fluxloop_cli-0.2.36}/pyproject.toml +9 -1
- {fluxloop_cli-0.2.19 → fluxloop_cli-0.2.36}/tests/test_arg_binder.py +48 -1
- fluxloop_cli-0.2.36/tests/test_conversation_supervisor.py +37 -0
- {fluxloop_cli-0.2.19 → fluxloop_cli-0.2.36}/tests/test_evaluate_command.py +193 -57
- {fluxloop_cli-0.2.19 → fluxloop_cli-0.2.36}/tests/test_evaluation_llm.py +5 -4
- fluxloop_cli-0.2.36/tests/test_init_pytest_template.py +62 -0
- {fluxloop_cli-0.2.19 → fluxloop_cli-0.2.36}/tests/test_input_generator.py +1 -3
- fluxloop_cli-0.2.36/tests/test_pytest_plugin.py +103 -0
- fluxloop_cli-0.2.36/tests/test_run_command.py +131 -0
- fluxloop_cli-0.2.36/tests/test_runner_multi_turn.py +243 -0
- fluxloop_cli-0.2.36/tests/test_testing_types.py +66 -0
- fluxloop_cli-0.2.19/README.md +0 -81
- fluxloop_cli-0.2.19/fluxloop_cli/commands/evaluate.py +0 -183
- fluxloop_cli-0.2.19/fluxloop_cli/evaluation/prompts/__init__.py +0 -182
- fluxloop_cli-0.2.19/fluxloop_cli/input_generator.py +0 -138
- fluxloop_cli-0.2.19/fluxloop_cli/templates.py +0 -538
- {fluxloop_cli-0.2.19 → fluxloop_cli-0.2.36}/fluxloop_cli/commands/__init__.py +0 -0
- {fluxloop_cli-0.2.19 → fluxloop_cli-0.2.36}/fluxloop_cli/commands/doctor.py +0 -0
- {fluxloop_cli-0.2.19 → fluxloop_cli-0.2.36}/fluxloop_cli/commands/generate.py +0 -0
- {fluxloop_cli-0.2.19 → fluxloop_cli-0.2.36}/fluxloop_cli/config_schema.py +0 -0
- {fluxloop_cli-0.2.19 → fluxloop_cli-0.2.36}/fluxloop_cli/constants.py +0 -0
- {fluxloop_cli-0.2.19 → fluxloop_cli-0.2.36}/fluxloop_cli/evaluation/__init__.py +0 -0
- {fluxloop_cli-0.2.19 → fluxloop_cli-0.2.36}/fluxloop_cli/evaluation/engine/__init__.py +0 -0
- {fluxloop_cli-0.2.19 → fluxloop_cli-0.2.36}/fluxloop_cli/evaluation/engine/analysis.py +0 -0
- {fluxloop_cli-0.2.19 → fluxloop_cli-0.2.36}/fluxloop_cli/evaluation/engine/reporting/__init__.py +0 -0
- {fluxloop_cli-0.2.19 → fluxloop_cli-0.2.36}/fluxloop_cli/evaluation/engine/reporting/markdown.py +0 -0
- {fluxloop_cli-0.2.19 → fluxloop_cli-0.2.36}/fluxloop_cli/evaluation/engine/success.py +0 -0
- {fluxloop_cli-0.2.19 → fluxloop_cli-0.2.36}/fluxloop_cli/target_loader.py +0 -0
- {fluxloop_cli-0.2.19 → fluxloop_cli-0.2.36}/fluxloop_cli/validators.py +0 -0
- {fluxloop_cli-0.2.19 → fluxloop_cli-0.2.36}/fluxloop_cli.egg-info/dependency_links.txt +0 -0
- {fluxloop_cli-0.2.19 → fluxloop_cli-0.2.36}/fluxloop_cli.egg-info/entry_points.txt +0 -0
- {fluxloop_cli-0.2.19 → fluxloop_cli-0.2.36}/fluxloop_cli.egg-info/top_level.txt +0 -0
- {fluxloop_cli-0.2.19 → fluxloop_cli-0.2.36}/setup.cfg +0 -0
- {fluxloop_cli-0.2.19 → fluxloop_cli-0.2.36}/tests/test_analysis_recommendations.py +0 -0
- {fluxloop_cli-0.2.19 → fluxloop_cli-0.2.36}/tests/test_config_command.py +0 -0
- {fluxloop_cli-0.2.19 → fluxloop_cli-0.2.36}/tests/test_prompt_library.py +0 -0
- {fluxloop_cli-0.2.19 → fluxloop_cli-0.2.36}/tests/test_success_criteria.py +0 -0
- {fluxloop_cli-0.2.19 → fluxloop_cli-0.2.36}/tests/test_target_loader.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: fluxloop-cli
|
|
3
|
-
Version: 0.2.19
|
|
3
|
+
Version: 0.2.36
|
|
4
4
|
Summary: FluxLoop CLI for running agent simulations
|
|
5
5
|
Author-email: FluxLoop Team <team@fluxloop.dev>
|
|
6
6
|
License: Apache-2.0
|
|
@@ -26,6 +26,8 @@ Requires-Dist: httpx>=0.24.0
|
|
|
26
26
|
Requires-Dist: rich>=13.0
|
|
27
27
|
Requires-Dist: python-dotenv>=1.0.0
|
|
28
28
|
Requires-Dist: fluxloop>=0.1.0
|
|
29
|
+
Requires-Dist: ruamel.yaml>=0.17.0
|
|
30
|
+
Requires-Dist: Jinja2>=3.0
|
|
29
31
|
Provides-Extra: dev
|
|
30
32
|
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
31
33
|
Requires-Dist: pytest-asyncio>=0.21.0; extra == "dev"
|
|
@@ -65,14 +67,49 @@ The legacy `setting.yaml` is still supported, but new projects created with
|
|
|
65
67
|
- `fluxloop init project` – scaffold a new project (configs, `.env`, examples)
|
|
66
68
|
- `fluxloop generate inputs` – produce input variations for the active project
|
|
67
69
|
- `fluxloop run experiment` – execute an experiment using `configs/simulation.yaml`
|
|
68
|
-
- `fluxloop parse experiment` – convert experiment outputs into readable artifacts
|
|
69
|
-
- `fluxloop evaluate experiment` –
|
|
70
|
+
- `fluxloop parse experiment` – convert experiment outputs into readable artifacts and emit structured per-trace JSON at `per_trace_analysis/per_trace.jsonl`
|
|
71
|
+
- `fluxloop evaluate experiment` – run the LLM-driven evaluation pipeline (LLM-PT → rule aggregation → LLM-OV → HTML render). Requires the parsed per-trace file (or `--per-trace`) and writes an interactive report to `evaluation_report/report.html` by default.
|
|
70
72
|
- `fluxloop config set-llm` – update LLM provider/model in `configs/input.yaml`
|
|
71
73
|
- `fluxloop record enable|disable|status` – toggle recording mode across `.env` and simulation config
|
|
72
74
|
- `fluxloop doctor` – summarize Python, FluxLoop CLI/MCP, and MCP index state for the active environment
|
|
75
|
+
- `--yes/-y` (for `fluxloop run experiment`) – skip the interactive confirmation prompt, ideal for CI and the Pytest bridge
|
|
76
|
+
|
|
77
|
+
### Multi-turn supervisor options
|
|
78
|
+
|
|
79
|
+
`fluxloop run experiment` supports multi-turn orchestration out of the box:
|
|
80
|
+
|
|
81
|
+
- Toggle with `--multi-turn/--no-multi-turn`
|
|
82
|
+
- Limit depth via `--max-turns`
|
|
83
|
+
- Control tool approvals with `--auto-approve-tools/--manual-approve-tools`
|
|
84
|
+
- Override the supervisor persona target: `--persona-override`
|
|
85
|
+
- Point at a specific LLM: `--supervisor-provider`, `--supervisor-model`, `--supervisor-temperature`, `--supervisor-api-key`
|
|
86
|
+
|
|
87
|
+
These flags override the values in `configs/simulation.yaml` (`multi_turn` block). When enabled, the runner consults the supervisor after every turn to decide whether to continue and to synthesize the next realistic user message.
|
|
88
|
+
|
|
89
|
+
**Scripted Playback Mode**: For deterministic multi-turn scenarios, switch `supervisor.provider` to `mock` and populate `supervisor.metadata.scripted_questions` with a list of user messages. FluxLoop will replay them sequentially and terminate when the script ends—ideal for regression testing and demos.
|
|
73
90
|
|
|
74
91
|
Run `fluxloop --help` or `fluxloop <command> --help` for more detail.
|
|
75
92
|
|
|
93
|
+
## Pytest Bridge (0.2.29+)
|
|
94
|
+
|
|
95
|
+
- `fluxloop init pytest-template [project_root]` creates `tests/test_fluxloop_smoke.py`, already wired to the new `fluxloop_runner` fixture.
|
|
96
|
+
- Fixtures live in `fluxloop_cli.testing.pytest_plugin` and return a `FluxLoopTestResult`, so you can assert on `total_runs`, `success_rate`, or call `require_success()`.
|
|
97
|
+
- Full guide + CI example: see `docs/guides/pytest_bridge.md` (includes GitHub Actions workflow at `examples/ci/fluxloop_pytest.yml`).
|
|
98
|
+
- Typical workflow:
|
|
99
|
+
1. `pip install -e packages/cli[dev]`
|
|
100
|
+
2. `fluxloop init pytest-template .`
|
|
101
|
+
3. `pytest -k fluxloop_smoke --maxfail=1`
|
|
102
|
+
|
|
103
|
+
## Evaluation Workflow
|
|
104
|
+
|
|
105
|
+
Evaluation now follows a three-step process so that multi-turn context is preserved:
|
|
106
|
+
|
|
107
|
+
1. `fluxloop run experiment` – produce `trace_summary.jsonl` (and optionally `observations.jsonl`).
|
|
108
|
+
2. `fluxloop parse experiment <experiment_dir>` – generate markdown summaries and a structured artifact at `per_trace_analysis/per_trace.jsonl`.
|
|
109
|
+
3. `fluxloop evaluate experiment <experiment_dir>` – consume that structured file, run LLM-based per-trace + overall analysis, and emit an interactive dashboard at `<experiment_dir>/evaluation_report/report.html` (override with `--output`).
|
|
110
|
+
|
|
111
|
+
`fluxloop evaluate` exits early with guidance when the per-trace artifact is missing. If you relocate the file, supply an explicit path with `--per-trace /path/to/per_trace.jsonl`.
|
|
112
|
+
|
|
76
113
|
## Quick Setup Script
|
|
77
114
|
|
|
78
115
|
To prepare a fresh checkout (create `.venv`, install dependencies, and run diagnostics):
|
|
@@ -0,0 +1,116 @@
|
|
|
1
|
+
# FluxLoop CLI
|
|
2
|
+
|
|
3
|
+
Command-line interface for running agent simulations.
|
|
4
|
+
|
|
5
|
+
## Installation
|
|
6
|
+
|
|
7
|
+
```
|
|
8
|
+
pip install fluxloop-cli
|
|
9
|
+
```
|
|
10
|
+
|
|
11
|
+
## Configuration Overview (v0.2.0)
|
|
12
|
+
|
|
13
|
+
FluxLoop CLI now stores experiment settings in four files under `configs/`:
|
|
14
|
+
|
|
15
|
+
- `configs/project.yaml` – project metadata, collector defaults
|
|
16
|
+
- `configs/input.yaml` – personas, base inputs, input generation options
|
|
17
|
+
- `configs/simulation.yaml` – runtime parameters (iterations, runner, replay args)
|
|
18
|
+
- `configs/evaluation.yaml` – evaluator definitions (rule-based, LLM judge, etc.)
|
|
19
|
+
|
|
20
|
+
The legacy `setting.yaml` is still supported, but new projects created with
|
|
21
|
+
`fluxloop init project` will generate the structured layout above.
|
|
22
|
+
|
|
23
|
+
## Key Commands
|
|
24
|
+
|
|
25
|
+
- `fluxloop init project` – scaffold a new project (configs, `.env`, examples)
|
|
26
|
+
- `fluxloop generate inputs` – produce input variations for the active project
|
|
27
|
+
- `fluxloop run experiment` – execute an experiment using `configs/simulation.yaml`
|
|
28
|
+
- `fluxloop parse experiment` – convert experiment outputs into readable artifacts and emit structured per-trace JSON at `per_trace_analysis/per_trace.jsonl`
|
|
29
|
+
- `fluxloop evaluate experiment` – run the LLM-driven evaluation pipeline (LLM-PT → rule aggregation → LLM-OV → HTML render). Requires the parsed per-trace file (or `--per-trace`) and writes an interactive report to `evaluation_report/report.html` by default.
|
|
30
|
+
- `fluxloop config set-llm` – update LLM provider/model in `configs/input.yaml`
|
|
31
|
+
- `fluxloop record enable|disable|status` – toggle recording mode across `.env` and simulation config
|
|
32
|
+
- `fluxloop doctor` – summarize Python, FluxLoop CLI/MCP, and MCP index state for the active environment
|
|
33
|
+
- `--yes/-y` (for `fluxloop run experiment`) – skip the interactive confirmation prompt, ideal for CI and the Pytest bridge
|
|
34
|
+
|
|
35
|
+
### Multi-turn supervisor options
|
|
36
|
+
|
|
37
|
+
`fluxloop run experiment` supports multi-turn orchestration out of the box:
|
|
38
|
+
|
|
39
|
+
- Toggle with `--multi-turn/--no-multi-turn`
|
|
40
|
+
- Limit depth via `--max-turns`
|
|
41
|
+
- Control tool approvals with `--auto-approve-tools/--manual-approve-tools`
|
|
42
|
+
- Override the supervisor persona target: `--persona-override`
|
|
43
|
+
- Point at a specific LLM: `--supervisor-provider`, `--supervisor-model`, `--supervisor-temperature`, `--supervisor-api-key`
|
|
44
|
+
|
|
45
|
+
These flags override the values in `configs/simulation.yaml` (`multi_turn` block). When enabled, the runner consults the supervisor after every turn to decide whether to continue and to synthesize the next realistic user message.
|
|
46
|
+
|
|
47
|
+
**Scripted Playback Mode**: For deterministic multi-turn scenarios, switch `supervisor.provider` to `mock` and populate `supervisor.metadata.scripted_questions` with a list of user messages. FluxLoop will replay them sequentially and terminate when the script ends—ideal for regression testing and demos.
|
|
48
|
+
|
|
49
|
+
Run `fluxloop --help` or `fluxloop <command> --help` for more detail.
|
|
50
|
+
|
|
51
|
+
## Pytest Bridge (0.2.29+)
|
|
52
|
+
|
|
53
|
+
- `fluxloop init pytest-template [project_root]` creates `tests/test_fluxloop_smoke.py`, already wired to the new `fluxloop_runner` fixture.
|
|
54
|
+
- Fixtures live in `fluxloop_cli.testing.pytest_plugin` and return a `FluxLoopTestResult`, so you can assert on `total_runs`, `success_rate`, or call `require_success()`.
|
|
55
|
+
- Full guide + CI example: see `docs/guides/pytest_bridge.md` (includes GitHub Actions workflow at `examples/ci/fluxloop_pytest.yml`).
|
|
56
|
+
- Typical workflow:
|
|
57
|
+
1. `pip install -e packages/cli[dev]`
|
|
58
|
+
2. `fluxloop init pytest-template .`
|
|
59
|
+
3. `pytest -k fluxloop_smoke --maxfail=1`
|
|
60
|
+
|
|
61
|
+
## Evaluation Workflow
|
|
62
|
+
|
|
63
|
+
Evaluation now follows a three-step process so that multi-turn context is preserved:
|
|
64
|
+
|
|
65
|
+
1. `fluxloop run experiment` – produce `trace_summary.jsonl` (and optionally `observations.jsonl`).
|
|
66
|
+
2. `fluxloop parse experiment <experiment_dir>` – generate markdown summaries and a structured artifact at `per_trace_analysis/per_trace.jsonl`.
|
|
67
|
+
3. `fluxloop evaluate experiment <experiment_dir>` – consume that structured file, run LLM-based per-trace + overall analysis, and emit an interactive dashboard at `<experiment_dir>/evaluation_report/report.html` (override with `--output`).
|
|
68
|
+
|
|
69
|
+
`fluxloop evaluate` exits early with guidance when the per-trace artifact is missing. If you relocate the file, supply an explicit path with `--per-trace /path/to/per_trace.jsonl`.
|
|
70
|
+
|
|
71
|
+
## Quick Setup Script
|
|
72
|
+
|
|
73
|
+
To prepare a fresh checkout (create `.venv`, install dependencies, and run diagnostics):
|
|
74
|
+
|
|
75
|
+
```
|
|
76
|
+
bash scripts/setup_fluxloop_env.sh --target-source-root path/to/your/source
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
Options:
|
|
80
|
+
|
|
81
|
+
- `--python PATH` – choose a specific interpreter (default `python3`)
|
|
82
|
+
- `--target-source-root PATH` – pre-populate VSCode `fluxloop.targetSourceRoot`
|
|
83
|
+
- `--skip-doctor` – skip the final `fluxloop doctor` check
|
|
84
|
+
|
|
85
|
+
After running the script, open the folder in VSCode and use `FluxLoop: Show Environment Info`
|
|
86
|
+
or `FluxLoop: Run Doctor` to confirm the environment.
|
|
87
|
+
|
|
88
|
+
## Runner Integration Patterns
|
|
89
|
+
|
|
90
|
+
Configure how FluxLoop calls your code in `configs/simulation.yaml`:
|
|
91
|
+
|
|
92
|
+
- Module + function: `module_path`/`function_name` or `target: "module:function"`
|
|
93
|
+
- Class.method (zero-arg ctor): `target: "module:Class.method"`
|
|
94
|
+
- Module-scoped instance method: `target: "module:instance.method"`
|
|
95
|
+
- Class.method with factory: add `factory: "module:make_instance"` (+ `factory_kwargs`)
|
|
96
|
+
- Async generators: set `runner.stream_output_path` if your streamed event shape differs (default `message.delta`).
|
|
97
|
+
|
|
98
|
+
See full examples: `packages/website/docs-cli/configuration/runner-targets.md`.
|
|
99
|
+
|
|
100
|
+
## Developing
|
|
101
|
+
|
|
102
|
+
Install dependencies and run tests:
|
|
103
|
+
|
|
104
|
+
```
|
|
105
|
+
python -m venv .venv
|
|
106
|
+
source .venv/bin/activate
|
|
107
|
+
pip install -e .[dev]
|
|
108
|
+
pytest
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
To package the CLI:
|
|
112
|
+
|
|
113
|
+
```
|
|
114
|
+
./build.sh
|
|
115
|
+
```
|
|
116
|
+
|
|
@@ -5,7 +5,9 @@ from __future__ import annotations
|
|
|
5
5
|
import inspect
|
|
6
6
|
import json
|
|
7
7
|
from pathlib import Path
|
|
8
|
-
from typing import Any, Callable, Dict, Optional
|
|
8
|
+
from typing import Any, Callable, Dict, Optional, Sequence
|
|
9
|
+
|
|
10
|
+
from fluxloop.schemas import ExperimentConfig, ReplayArgsConfig, PersonaConfig
|
|
9
11
|
|
|
10
12
|
|
|
11
13
|
class _AttrDict(dict):
|
|
@@ -26,8 +28,6 @@ class _AttrDict(dict):
|
|
|
26
28
|
except KeyError as exc: # pragma: no cover
|
|
27
29
|
raise AttributeError(item) from exc
|
|
28
30
|
|
|
29
|
-
from fluxloop.schemas import ExperimentConfig, ReplayArgsConfig
|
|
30
|
-
|
|
31
31
|
|
|
32
32
|
class _AwaitableNone:
|
|
33
33
|
"""Simple awaitable that resolves to ``None``."""
|
|
@@ -98,9 +98,17 @@ class ArgBinder:
|
|
|
98
98
|
*,
|
|
99
99
|
runtime_input: str,
|
|
100
100
|
iteration: int = 0,
|
|
101
|
+
conversation_state: Optional[Dict[str, Any]] = None,
|
|
102
|
+
persona: Optional[PersonaConfig] = None,
|
|
103
|
+
auto_approve: Optional[bool] = None,
|
|
101
104
|
) -> Dict[str, Any]:
|
|
102
105
|
"""Construct kwargs for calling *func* based on replay or inspection."""
|
|
103
106
|
|
|
107
|
+
signature = inspect.signature(func)
|
|
108
|
+
parameters = list(signature.parameters.values())
|
|
109
|
+
if parameters and parameters[0].name == "self":
|
|
110
|
+
parameters = parameters[1:]
|
|
111
|
+
|
|
104
112
|
if self._recording:
|
|
105
113
|
kwargs = self._recording.get("kwargs", {}).copy()
|
|
106
114
|
|
|
@@ -111,22 +119,42 @@ class ArgBinder:
|
|
|
111
119
|
try:
|
|
112
120
|
self._set_by_path(kwargs, replay.override_param_path, runtime_input)
|
|
113
121
|
except (KeyError, TypeError):
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
122
|
+
kwargs = self._bind_runtime_input(parameters, runtime_input)
|
|
123
|
+
else:
|
|
124
|
+
fallback = self._bind_runtime_input(parameters, runtime_input)
|
|
125
|
+
for key, value in fallback.items():
|
|
126
|
+
kwargs.setdefault(key, value)
|
|
117
127
|
self._restore_callables(kwargs, replay)
|
|
118
128
|
self._ensure_no_unmapped_callables(kwargs, replay)
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
129
|
+
kwargs = self._hydrate_structures(kwargs)
|
|
130
|
+
else:
|
|
131
|
+
kwargs = self._bind_runtime_input(parameters, runtime_input)
|
|
132
|
+
|
|
133
|
+
return self._inject_optional_kwargs(
|
|
134
|
+
parameters=parameters,
|
|
135
|
+
kwargs=kwargs,
|
|
136
|
+
conversation_state=conversation_state,
|
|
137
|
+
persona=persona,
|
|
138
|
+
auto_approve=auto_approve,
|
|
139
|
+
iteration=iteration,
|
|
140
|
+
)
|
|
126
141
|
|
|
127
|
-
|
|
128
|
-
|
|
142
|
+
def _bind_runtime_input(
|
|
143
|
+
self, parameters: Sequence[inspect.Parameter], runtime_input: str
|
|
144
|
+
) -> Dict[str, Any]:
|
|
145
|
+
candidate = self._find_runtime_parameter(parameters)
|
|
146
|
+
if candidate:
|
|
147
|
+
return {candidate: runtime_input}
|
|
148
|
+
if parameters:
|
|
149
|
+
return {parameters[0].name: runtime_input}
|
|
150
|
+
raise ValueError(
|
|
151
|
+
"Cannot determine where to bind runtime input for the provided function."
|
|
152
|
+
)
|
|
129
153
|
|
|
154
|
+
@staticmethod
|
|
155
|
+
def _find_runtime_parameter(
|
|
156
|
+
parameters: Sequence[inspect.Parameter],
|
|
157
|
+
) -> Optional[str]:
|
|
130
158
|
candidate_names = [
|
|
131
159
|
"input",
|
|
132
160
|
"input_text",
|
|
@@ -134,18 +162,70 @@ class ArgBinder:
|
|
|
134
162
|
"query",
|
|
135
163
|
"text",
|
|
136
164
|
"content",
|
|
165
|
+
"user_message",
|
|
137
166
|
]
|
|
167
|
+
for name in candidate_names:
|
|
168
|
+
for param in parameters:
|
|
169
|
+
if param.name == name:
|
|
170
|
+
return name
|
|
171
|
+
return None
|
|
138
172
|
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
173
|
+
def _inject_optional_kwargs(
|
|
174
|
+
self,
|
|
175
|
+
*,
|
|
176
|
+
parameters: Sequence[inspect.Parameter],
|
|
177
|
+
kwargs: Dict[str, Any],
|
|
178
|
+
conversation_state: Optional[Dict[str, Any]],
|
|
179
|
+
persona: Optional[PersonaConfig],
|
|
180
|
+
auto_approve: Optional[bool],
|
|
181
|
+
iteration: Optional[int],
|
|
182
|
+
) -> Dict[str, Any]:
|
|
183
|
+
param_names = {param.name for param in parameters}
|
|
184
|
+
|
|
185
|
+
def assign(value: Any, candidates: Sequence[str]) -> bool:
|
|
186
|
+
if value is None:
|
|
187
|
+
return False
|
|
188
|
+
for name in candidates:
|
|
189
|
+
if name in param_names and name not in kwargs:
|
|
190
|
+
kwargs[name] = value
|
|
191
|
+
return True
|
|
192
|
+
return False
|
|
193
|
+
|
|
194
|
+
if conversation_state is not None:
|
|
195
|
+
assign(conversation_state, ["conversation_state", "state", "dialog_state"])
|
|
196
|
+
if isinstance(conversation_state, dict):
|
|
197
|
+
metadata = conversation_state.get("metadata")
|
|
198
|
+
if metadata:
|
|
199
|
+
assign(
|
|
200
|
+
metadata,
|
|
201
|
+
["conversation_metadata", "state_metadata", "conversation_meta"],
|
|
202
|
+
)
|
|
203
|
+
turns = conversation_state.get("turns")
|
|
204
|
+
if turns:
|
|
205
|
+
assign(turns, ["messages", "history", "turns"])
|
|
206
|
+
|
|
207
|
+
if persona is not None:
|
|
208
|
+
assign(persona, ["persona", "user_persona", "persona_config"])
|
|
209
|
+
try:
|
|
210
|
+
persona_prompt = persona.to_prompt()
|
|
211
|
+
except Exception:
|
|
212
|
+
persona_prompt = None
|
|
213
|
+
if persona_prompt:
|
|
214
|
+
assign(
|
|
215
|
+
persona_prompt,
|
|
216
|
+
["persona_prompt", "persona_description", "persona_text"],
|
|
217
|
+
)
|
|
218
|
+
|
|
219
|
+
if auto_approve is not None:
|
|
220
|
+
assign(
|
|
221
|
+
auto_approve,
|
|
222
|
+
["auto_approve", "auto_approve_tools", "approve_tools", "autoapprove"],
|
|
223
|
+
)
|
|
142
224
|
|
|
143
|
-
if
|
|
144
|
-
|
|
225
|
+
if iteration is not None:
|
|
226
|
+
assign(iteration, ["iteration", "run_iteration", "loop_index"])
|
|
145
227
|
|
|
146
|
-
|
|
147
|
-
f"Cannot determine where to bind runtime input for function '{func.__name__}'."
|
|
148
|
-
)
|
|
228
|
+
return kwargs
|
|
149
229
|
|
|
150
230
|
def _restore_callables(self, kwargs: Dict[str, Any], replay: ReplayArgsConfig) -> None:
|
|
151
231
|
for param_name, provider in replay.callable_providers.items():
|
|
@@ -198,7 +278,6 @@ class ArgBinder:
|
|
|
198
278
|
|
|
199
279
|
def _record(args: Any, kwargs: Any) -> None:
|
|
200
280
|
messages.append((args, kwargs))
|
|
201
|
-
pretty = args[0] if len(args) == 1 and not kwargs else {"args": args, "kwargs": kwargs}
|
|
202
281
|
|
|
203
282
|
def send(*args: Any, **kwargs: Any) -> _AwaitableNone:
|
|
204
283
|
_record(args, kwargs)
|
|
@@ -14,7 +14,6 @@ from rich.syntax import Syntax
|
|
|
14
14
|
from rich.table import Table
|
|
15
15
|
|
|
16
16
|
from ..config_loader import load_experiment_config
|
|
17
|
-
from ..templates import create_env_file, create_gitignore, create_sample_agent
|
|
18
17
|
from ..constants import DEFAULT_CONFIG_PATH, DEFAULT_ROOT_DIR_NAME
|
|
19
18
|
from ..config_schema import CONFIG_SECTION_FILENAMES
|
|
20
19
|
from ..project_paths import (
|
|
@@ -0,0 +1,264 @@
|
|
|
1
|
+
"""Evaluate command for generating interactive reports."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import asyncio
|
|
6
|
+
import logging
|
|
7
|
+
import os
|
|
8
|
+
import shutil
|
|
9
|
+
from dataclasses import asdict
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
from typing import Dict, List, Optional
|
|
12
|
+
|
|
13
|
+
import typer
|
|
14
|
+
import yaml
|
|
15
|
+
from rich.console import Console
|
|
16
|
+
|
|
17
|
+
from ..environment import load_env_chain
|
|
18
|
+
from ..evaluation import load_evaluation_config
|
|
19
|
+
from ..evaluation.artifacts import load_per_trace_records, load_trace_summary_records
|
|
20
|
+
from ..evaluation.report.pipeline import ReportPipeline
|
|
21
|
+
|
|
22
|
+
console = Console()
|
|
23
|
+
app = typer.Typer(help="Evaluate experiment outputs and generate interactive reports.")
|
|
24
|
+
logger = logging.getLogger(__name__)
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def _load_yaml_file(path: Optional[Path]) -> dict:
|
|
28
|
+
if not path or not path.exists():
|
|
29
|
+
return {}
|
|
30
|
+
with path.open("r", encoding="utf-8") as handle:
|
|
31
|
+
data = yaml.safe_load(handle) or {}
|
|
32
|
+
if isinstance(data, dict):
|
|
33
|
+
return data
|
|
34
|
+
return {}
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def _resolve_project_root(config_path: Path) -> Path:
|
|
38
|
+
config_dir = config_path.parent
|
|
39
|
+
return config_dir.parent if config_dir.name == "configs" else config_dir
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def _find_config_file(config_path: Path, filename: str) -> Optional[Path]:
|
|
43
|
+
config_dir = config_path.parent
|
|
44
|
+
project_root = _resolve_project_root(config_path)
|
|
45
|
+
candidates = [
|
|
46
|
+
config_dir / filename,
|
|
47
|
+
project_root / "configs" / filename,
|
|
48
|
+
project_root / filename,
|
|
49
|
+
]
|
|
50
|
+
for candidate in candidates:
|
|
51
|
+
if candidate.exists():
|
|
52
|
+
return candidate
|
|
53
|
+
return None
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def _prepare_output_directory(path: Path, overwrite: bool) -> None:
|
|
57
|
+
if path.exists():
|
|
58
|
+
if not overwrite:
|
|
59
|
+
raise typer.BadParameter(
|
|
60
|
+
f"Output directory already exists: {path}. Use --overwrite to replace it."
|
|
61
|
+
)
|
|
62
|
+
shutil.rmtree(path)
|
|
63
|
+
path.mkdir(parents=True, exist_ok=True)
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def _load_generated_inputs_data(input_config: Dict[str, any], project_root: Path) -> Dict[str, any]:
|
|
67
|
+
"""
|
|
68
|
+
Load generated inputs (variations) from the configured inputs file, if available.
|
|
69
|
+
"""
|
|
70
|
+
|
|
71
|
+
inputs_file_value = input_config.get("inputs_file") or "inputs/generated.yaml"
|
|
72
|
+
inputs_path = Path(inputs_file_value)
|
|
73
|
+
if not inputs_path.is_absolute():
|
|
74
|
+
inputs_path = (project_root / inputs_path).resolve()
|
|
75
|
+
|
|
76
|
+
if not inputs_path.exists():
|
|
77
|
+
logger.debug("Generated inputs file not found at %s", inputs_path)
|
|
78
|
+
return {}
|
|
79
|
+
|
|
80
|
+
try:
|
|
81
|
+
with inputs_path.open("r", encoding="utf-8") as handle:
|
|
82
|
+
payload = yaml.safe_load(handle) or {}
|
|
83
|
+
except Exception as exc: # noqa: BLE001
|
|
84
|
+
logger.warning("Failed to load generated inputs file %s: %s", inputs_path, exc)
|
|
85
|
+
return {}
|
|
86
|
+
|
|
87
|
+
inputs_list: List[Dict[str, any]] = []
|
|
88
|
+
generation_cfg: Dict[str, any] = {}
|
|
89
|
+
if isinstance(payload, dict):
|
|
90
|
+
inputs_list = payload.get("inputs") or payload.get("variations") or []
|
|
91
|
+
generation_cfg = payload.get("generation_config") or {}
|
|
92
|
+
elif isinstance(payload, list):
|
|
93
|
+
inputs_list = payload
|
|
94
|
+
else:
|
|
95
|
+
logger.debug("Generated inputs file %s did not contain a supported structure", inputs_path)
|
|
96
|
+
return {}
|
|
97
|
+
|
|
98
|
+
variations: List[Dict[str, str]] = []
|
|
99
|
+
for entry in inputs_list:
|
|
100
|
+
if not isinstance(entry, dict):
|
|
101
|
+
continue
|
|
102
|
+
text = entry.get("input")
|
|
103
|
+
if not text:
|
|
104
|
+
continue
|
|
105
|
+
metadata = entry.get("metadata") or {}
|
|
106
|
+
persona = entry.get("persona") or metadata.get("persona")
|
|
107
|
+
strategy = entry.get("strategy") or metadata.get("variation_strategy") or metadata.get("strategy")
|
|
108
|
+
|
|
109
|
+
variations.append(
|
|
110
|
+
{
|
|
111
|
+
"persona": (persona or "unknown").strip(),
|
|
112
|
+
"strategy": (strategy or "base").strip(),
|
|
113
|
+
"input": text.strip(),
|
|
114
|
+
}
|
|
115
|
+
)
|
|
116
|
+
|
|
117
|
+
generator_model = generation_cfg.get("model") or generation_cfg.get("generator_model")
|
|
118
|
+
provider = generation_cfg.get("provider")
|
|
119
|
+
if generator_model and provider and "/" not in str(generator_model):
|
|
120
|
+
generator_model = f"{provider}/{generator_model}"
|
|
121
|
+
|
|
122
|
+
return {
|
|
123
|
+
"path": str(inputs_path),
|
|
124
|
+
"variations": variations,
|
|
125
|
+
"generator_model": generator_model or input_config.get("input_generation", {})
|
|
126
|
+
.get("llm", {})
|
|
127
|
+
.get("model"),
|
|
128
|
+
"strategies": generation_cfg.get("strategies") or input_config.get("variation_strategies", []),
|
|
129
|
+
}
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
@app.command()
def experiment(
    experiment_dir: Path = typer.Argument(
        ...,
        help="Path to the experiment output directory",
        exists=True,
        dir_okay=True,
        file_okay=False,
        resolve_path=True,
    ),
    config: Path = typer.Option(
        Path("configs/evaluation.yaml"),
        "--config",
        "-c",
        help="Path to evaluation configuration file",
    ),
    output: Path = typer.Option(
        Path("evaluation_report"),
        "--output",
        "-o",
        help="Output directory name (relative to the experiment directory)",
    ),
    overwrite: bool = typer.Option(
        False,
        "--overwrite",
        help="Overwrite output directory if it already exists",
    ),
    llm_api_key: Optional[str] = typer.Option(
        None,
        "--llm-api-key",
        help="LLM API key for report generation (optional)",
        envvar="FLUXLOOP_LLM_API_KEY",
    ),
    per_trace: Optional[Path] = typer.Option(
        None,
        "--per-trace",
        help="Path to structured per-trace JSONL generated by `fluxloop parse`",
    ),
    verbose: bool = typer.Option(
        False,
        "--verbose",
        help="Enable verbose logging",
    ),
):
    """
    Evaluate experiment outputs and generate an interactive HTML report.
    """
    # NOTE: the docstring above is kept verbatim on purpose -- Typer surfaces a
    # command's docstring as its help text, so it is user-facing output.

    def _anchor_to_cwd(candidate: Path) -> Path:
        # Relative paths are interpreted against the current working directory;
        # either way the result is fully resolved.
        if candidate.is_absolute():
            return candidate.resolve()
        return (Path.cwd() / candidate).resolve()

    log_level = logging.DEBUG if verbose else logging.INFO
    logging.basicConfig(level=log_level, format="%(message)s")

    # Guard kept even though the argument is declared with exists=True.
    resolved_experiment_dir = experiment_dir.resolve()
    if not resolved_experiment_dir.is_dir():
        raise typer.BadParameter(f"Experiment directory not found: {resolved_experiment_dir}")

    config_path = _anchor_to_cwd(config)
    project_root = _resolve_project_root(config_path)

    # Without an explicit --per-trace, fall back to the per-trace JSONL
    # expected inside the experiment directory.
    if per_trace is None:
        per_trace_path = resolved_experiment_dir / "per_trace_analysis" / "per_trace.jsonl"
    else:
        per_trace_path = _anchor_to_cwd(per_trace)

    per_trace_records = load_per_trace_records(resolved_experiment_dir, per_trace_path)
    trace_records = [item.trace for item in per_trace_records]
    if not trace_records:
        raise typer.BadParameter("No traces found in per-trace artifacts.")

    trace_summary_path = resolved_experiment_dir / "trace_summary.jsonl"
    trace_summaries = load_trace_summary_records(resolved_experiment_dir, trace_summary_path)
    if not trace_summaries:
        raise typer.BadParameter("No traces found in trace summary artifacts.")

    # Any failure while loading the evaluation config is reported to the user
    # as a bad parameter rather than as a raw traceback.
    try:
        evaluation_config = load_evaluation_config(config_path)
    except FileNotFoundError as missing:
        raise typer.BadParameter(str(missing)) from missing
    except Exception as failure:  # noqa: BLE001
        raise typer.BadParameter(f"Failed to load evaluation config: {failure}") from failure

    def _warn_env_failure(path: Path, exc: Exception) -> None:
        # Environment loading problems are logged as warnings, not raised.
        console.log(
            f"[yellow]Warning:[/yellow] Failed to load environment from {path}: {exc}"
        )

    load_env_chain(
        evaluation_config.get_source_dir(),
        refresh_config=True,
        on_error=_warn_env_failure,
    )

    # Re-check the process environment after load_env_chain has run, so a key
    # that only became available there can still be picked up.
    api_key = llm_api_key
    if api_key is None:
        api_key = os.getenv("FLUXLOOP_LLM_API_KEY") or os.getenv("OPENAI_API_KEY")

    # A relative --output is placed inside the experiment directory.
    if output.is_absolute():
        output_dir = output
    else:
        output_dir = resolved_experiment_dir / output
    _prepare_output_directory(output_dir, overwrite)

    input_config_path = _find_config_file(config_path, "input.yaml")
    project_config_path = _find_config_file(config_path, "project.yaml")

    input_config = _load_yaml_file(input_config_path)
    project_config = _load_yaml_file(project_config_path)
    generated_inputs = _load_generated_inputs_data(input_config, project_root)

    # The report name falls back to the experiment directory name when the
    # project config does not provide one.
    report_name = project_config.get("name") or resolved_experiment_dir.name
    config_bundle = {
        "name": report_name,
        "evaluation": asdict(evaluation_config),
        "input": input_config,
        "generated_inputs": generated_inputs,
    }

    pipeline = ReportPipeline(
        config=config_bundle,
        output_dir=output_dir,
        api_key=api_key,
    )

    banner = (
        f"📊 Evaluating experiment at [cyan]{resolved_experiment_dir}[/cyan]",
        f"⚙️ Config: [magenta]{config_path}[/magenta]",
        f"🧵 Per-trace data: [blue]{per_trace_path}[/blue]",
        f"📄 Trace summary: [blue]{trace_summary_path}[/blue]",
        f"📁 Output: [green]{output_dir}[/green]",
    )
    console.print("\n".join(banner))

    # The pipeline's run() is awaited to completion on a fresh event loop.
    artifacts = asyncio.run(pipeline.run(trace_records, trace_summaries))
    console.print(f"\n✅ Report ready: [bold cyan]{artifacts.html_path}[/bold cyan]")
|
|
262
|
+
|
|
263
|
+
|
|
264
|
+
|