fluxloop-cli 0.2.19__tar.gz → 0.2.36__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (83) hide show
  1. {fluxloop_cli-0.2.19 → fluxloop_cli-0.2.36}/PKG-INFO +40 -3
  2. fluxloop_cli-0.2.36/README.md +116 -0
  3. {fluxloop_cli-0.2.19 → fluxloop_cli-0.2.36}/fluxloop_cli/__init__.py +1 -1
  4. {fluxloop_cli-0.2.19 → fluxloop_cli-0.2.36}/fluxloop_cli/arg_binder.py +103 -24
  5. {fluxloop_cli-0.2.19 → fluxloop_cli-0.2.36}/fluxloop_cli/commands/config.py +0 -1
  6. fluxloop_cli-0.2.36/fluxloop_cli/commands/evaluate.py +264 -0
  7. {fluxloop_cli-0.2.19 → fluxloop_cli-0.2.36}/fluxloop_cli/commands/init.py +78 -22
  8. {fluxloop_cli-0.2.19 → fluxloop_cli-0.2.36}/fluxloop_cli/commands/parse.py +175 -8
  9. {fluxloop_cli-0.2.19 → fluxloop_cli-0.2.36}/fluxloop_cli/commands/record.py +17 -11
  10. {fluxloop_cli-0.2.19 → fluxloop_cli-0.2.36}/fluxloop_cli/commands/run.py +213 -26
  11. {fluxloop_cli-0.2.19 → fluxloop_cli-0.2.36}/fluxloop_cli/commands/status.py +1 -2
  12. {fluxloop_cli-0.2.19 → fluxloop_cli-0.2.36}/fluxloop_cli/config_loader.py +0 -2
  13. fluxloop_cli-0.2.36/fluxloop_cli/conversation_supervisor.py +326 -0
  14. fluxloop_cli-0.2.36/fluxloop_cli/environment.py +71 -0
  15. fluxloop_cli-0.2.36/fluxloop_cli/evaluation/artifacts.py +115 -0
  16. {fluxloop_cli-0.2.19 → fluxloop_cli-0.2.36}/fluxloop_cli/evaluation/config.py +56 -3
  17. {fluxloop_cli-0.2.19 → fluxloop_cli-0.2.36}/fluxloop_cli/evaluation/engine/core.py +44 -27
  18. {fluxloop_cli-0.2.19 → fluxloop_cli-0.2.36}/fluxloop_cli/evaluation/engine/reporting/html.py +376 -15
  19. {fluxloop_cli-0.2.19 → fluxloop_cli-0.2.36}/fluxloop_cli/evaluation/llm.py +34 -0
  20. fluxloop_cli-0.2.36/fluxloop_cli/evaluation/prompts/__init__.py +53 -0
  21. fluxloop_cli-0.2.36/fluxloop_cli/evaluation/prompts/base.py +48 -0
  22. fluxloop_cli-0.2.36/fluxloop_cli/evaluation/prompts/information_completeness.py +45 -0
  23. fluxloop_cli-0.2.36/fluxloop_cli/evaluation/prompts/intent_recognition.py +46 -0
  24. fluxloop_cli-0.2.36/fluxloop_cli/evaluation/prompts/response_clarity.py +44 -0
  25. fluxloop_cli-0.2.36/fluxloop_cli/evaluation/prompts/response_consistency.py +46 -0
  26. fluxloop_cli-0.2.36/fluxloop_cli/evaluation/report/__init__.py +0 -0
  27. fluxloop_cli-0.2.36/fluxloop_cli/evaluation/report/aggregator.py +642 -0
  28. fluxloop_cli-0.2.36/fluxloop_cli/evaluation/report/generator.py +896 -0
  29. fluxloop_cli-0.2.36/fluxloop_cli/evaluation/report/pipeline.py +156 -0
  30. fluxloop_cli-0.2.36/fluxloop_cli/evaluation/report/renderer.py +479 -0
  31. {fluxloop_cli-0.2.19 → fluxloop_cli-0.2.36}/fluxloop_cli/evaluation/rules.py +59 -1
  32. fluxloop_cli-0.2.36/fluxloop_cli/evaluation/templates/report.html.j2 +7072 -0
  33. fluxloop_cli-0.2.36/fluxloop_cli/input_generator.py +261 -0
  34. {fluxloop_cli-0.2.19 → fluxloop_cli-0.2.36}/fluxloop_cli/llm_generator.py +23 -23
  35. {fluxloop_cli-0.2.19 → fluxloop_cli-0.2.36}/fluxloop_cli/main.py +0 -2
  36. {fluxloop_cli-0.2.19 → fluxloop_cli-0.2.36}/fluxloop_cli/project_paths.py +0 -1
  37. {fluxloop_cli-0.2.19 → fluxloop_cli-0.2.36}/fluxloop_cli/runner.py +476 -32
  38. fluxloop_cli-0.2.36/fluxloop_cli/templates.py +546 -0
  39. fluxloop_cli-0.2.36/fluxloop_cli/testing/__init__.py +24 -0
  40. fluxloop_cli-0.2.36/fluxloop_cli/testing/pytest_plugin.py +432 -0
  41. fluxloop_cli-0.2.36/fluxloop_cli/testing/types.py +188 -0
  42. fluxloop_cli-0.2.36/fluxloop_cli/token_usage.py +182 -0
  43. {fluxloop_cli-0.2.19 → fluxloop_cli-0.2.36}/fluxloop_cli.egg-info/PKG-INFO +40 -3
  44. {fluxloop_cli-0.2.19 → fluxloop_cli-0.2.36}/fluxloop_cli.egg-info/SOURCES.txt +25 -1
  45. {fluxloop_cli-0.2.19 → fluxloop_cli-0.2.36}/fluxloop_cli.egg-info/requires.txt +2 -0
  46. {fluxloop_cli-0.2.19 → fluxloop_cli-0.2.36}/pyproject.toml +9 -1
  47. {fluxloop_cli-0.2.19 → fluxloop_cli-0.2.36}/tests/test_arg_binder.py +48 -1
  48. fluxloop_cli-0.2.36/tests/test_conversation_supervisor.py +37 -0
  49. {fluxloop_cli-0.2.19 → fluxloop_cli-0.2.36}/tests/test_evaluate_command.py +193 -57
  50. {fluxloop_cli-0.2.19 → fluxloop_cli-0.2.36}/tests/test_evaluation_llm.py +5 -4
  51. fluxloop_cli-0.2.36/tests/test_init_pytest_template.py +62 -0
  52. {fluxloop_cli-0.2.19 → fluxloop_cli-0.2.36}/tests/test_input_generator.py +1 -3
  53. fluxloop_cli-0.2.36/tests/test_pytest_plugin.py +103 -0
  54. fluxloop_cli-0.2.36/tests/test_run_command.py +131 -0
  55. fluxloop_cli-0.2.36/tests/test_runner_multi_turn.py +243 -0
  56. fluxloop_cli-0.2.36/tests/test_testing_types.py +66 -0
  57. fluxloop_cli-0.2.19/README.md +0 -81
  58. fluxloop_cli-0.2.19/fluxloop_cli/commands/evaluate.py +0 -183
  59. fluxloop_cli-0.2.19/fluxloop_cli/evaluation/prompts/__init__.py +0 -182
  60. fluxloop_cli-0.2.19/fluxloop_cli/input_generator.py +0 -138
  61. fluxloop_cli-0.2.19/fluxloop_cli/templates.py +0 -538
  62. {fluxloop_cli-0.2.19 → fluxloop_cli-0.2.36}/fluxloop_cli/commands/__init__.py +0 -0
  63. {fluxloop_cli-0.2.19 → fluxloop_cli-0.2.36}/fluxloop_cli/commands/doctor.py +0 -0
  64. {fluxloop_cli-0.2.19 → fluxloop_cli-0.2.36}/fluxloop_cli/commands/generate.py +0 -0
  65. {fluxloop_cli-0.2.19 → fluxloop_cli-0.2.36}/fluxloop_cli/config_schema.py +0 -0
  66. {fluxloop_cli-0.2.19 → fluxloop_cli-0.2.36}/fluxloop_cli/constants.py +0 -0
  67. {fluxloop_cli-0.2.19 → fluxloop_cli-0.2.36}/fluxloop_cli/evaluation/__init__.py +0 -0
  68. {fluxloop_cli-0.2.19 → fluxloop_cli-0.2.36}/fluxloop_cli/evaluation/engine/__init__.py +0 -0
  69. {fluxloop_cli-0.2.19 → fluxloop_cli-0.2.36}/fluxloop_cli/evaluation/engine/analysis.py +0 -0
  70. {fluxloop_cli-0.2.19 → fluxloop_cli-0.2.36}/fluxloop_cli/evaluation/engine/reporting/__init__.py +0 -0
  71. {fluxloop_cli-0.2.19 → fluxloop_cli-0.2.36}/fluxloop_cli/evaluation/engine/reporting/markdown.py +0 -0
  72. {fluxloop_cli-0.2.19 → fluxloop_cli-0.2.36}/fluxloop_cli/evaluation/engine/success.py +0 -0
  73. {fluxloop_cli-0.2.19 → fluxloop_cli-0.2.36}/fluxloop_cli/target_loader.py +0 -0
  74. {fluxloop_cli-0.2.19 → fluxloop_cli-0.2.36}/fluxloop_cli/validators.py +0 -0
  75. {fluxloop_cli-0.2.19 → fluxloop_cli-0.2.36}/fluxloop_cli.egg-info/dependency_links.txt +0 -0
  76. {fluxloop_cli-0.2.19 → fluxloop_cli-0.2.36}/fluxloop_cli.egg-info/entry_points.txt +0 -0
  77. {fluxloop_cli-0.2.19 → fluxloop_cli-0.2.36}/fluxloop_cli.egg-info/top_level.txt +0 -0
  78. {fluxloop_cli-0.2.19 → fluxloop_cli-0.2.36}/setup.cfg +0 -0
  79. {fluxloop_cli-0.2.19 → fluxloop_cli-0.2.36}/tests/test_analysis_recommendations.py +0 -0
  80. {fluxloop_cli-0.2.19 → fluxloop_cli-0.2.36}/tests/test_config_command.py +0 -0
  81. {fluxloop_cli-0.2.19 → fluxloop_cli-0.2.36}/tests/test_prompt_library.py +0 -0
  82. {fluxloop_cli-0.2.19 → fluxloop_cli-0.2.36}/tests/test_success_criteria.py +0 -0
  83. {fluxloop_cli-0.2.19 → fluxloop_cli-0.2.36}/tests/test_target_loader.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: fluxloop-cli
3
- Version: 0.2.19
3
+ Version: 0.2.36
4
4
  Summary: FluxLoop CLI for running agent simulations
5
5
  Author-email: FluxLoop Team <team@fluxloop.dev>
6
6
  License: Apache-2.0
@@ -26,6 +26,8 @@ Requires-Dist: httpx>=0.24.0
26
26
  Requires-Dist: rich>=13.0
27
27
  Requires-Dist: python-dotenv>=1.0.0
28
28
  Requires-Dist: fluxloop>=0.1.0
29
+ Requires-Dist: ruamel.yaml>=0.17.0
30
+ Requires-Dist: Jinja2>=3.0
29
31
  Provides-Extra: dev
30
32
  Requires-Dist: pytest>=7.0; extra == "dev"
31
33
  Requires-Dist: pytest-asyncio>=0.21.0; extra == "dev"
@@ -65,14 +67,49 @@ The legacy `setting.yaml` is still supported, but new projects created with
65
67
  - `fluxloop init project` – scaffold a new project (configs, `.env`, examples)
66
68
  - `fluxloop generate inputs` – produce input variations for the active project
67
69
  - `fluxloop run experiment` – execute an experiment using `configs/simulation.yaml`
68
- - `fluxloop parse experiment` – convert experiment outputs into readable artifacts
69
- - `fluxloop evaluate experiment` – score experiment outputs using rule-based and LLM evaluators, generate reports with success criteria, analysis, and customizable templates
70
+ - `fluxloop parse experiment` – convert experiment outputs into readable artifacts and emit structured per-trace JSON at `per_trace_analysis/per_trace.jsonl`
71
+ - `fluxloop evaluate experiment` – run the LLM-driven evaluation pipeline (LLM-PT → rule aggregation → LLM-OV → HTML render). Requires the parsed per-trace file (or `--per-trace`) and writes an interactive report to `evaluation_report/report.html` by default.
70
72
  - `fluxloop config set-llm` – update LLM provider/model in `configs/input.yaml`
71
73
  - `fluxloop record enable|disable|status` – toggle recording mode across `.env` and simulation config
72
74
  - `fluxloop doctor` – summarize Python, FluxLoop CLI/MCP, and MCP index state for the active environment
75
+ - `--yes/-y` (for `fluxloop run experiment`) – skip the interactive confirmation prompt, ideal for CI and the Pytest bridge
76
+
77
+ ### Multi-turn supervisor options
78
+
79
+ `fluxloop run experiment` supports multi-turn orchestration out of the box:
80
+
81
+ - Toggle with `--multi-turn/--no-multi-turn`
82
+ - Limit depth via `--max-turns`
83
+ - Control tool approvals with `--auto-approve-tools/--manual-approve-tools`
84
+ - Override the supervisor persona target: `--persona-override`
85
+ - Point at a specific LLM: `--supervisor-provider`, `--supervisor-model`, `--supervisor-temperature`, `--supervisor-api-key`
86
+
87
+ These flags override the values in `configs/simulation.yaml` (`multi_turn` block). When enabled, the runner consults the supervisor after every turn to decide whether to continue and to synthesize the next realistic user message.
88
+
89
+ **Scripted Playback Mode**: For deterministic multi-turn scenarios, switch `supervisor.provider` to `mock` and populate `supervisor.metadata.scripted_questions` with a list of user messages. FluxLoop will replay them sequentially and terminate when the script ends—ideal for regression testing and demos.
73
90
 
74
91
  Run `fluxloop --help` or `fluxloop <command> --help` for more detail.
75
92
 
93
+ ## Pytest Bridge (0.2.29+)
94
+
95
+ - `fluxloop init pytest-template [project_root]` creates `tests/test_fluxloop_smoke.py`, already wired to the new `fluxloop_runner` fixture.
96
+ - Fixtures live in `fluxloop_cli.testing.pytest_plugin` and return a `FluxLoopTestResult`, so you can assert on `total_runs`, `success_rate`, or call `require_success()`.
97
+ - Full guide + CI example: see `docs/guides/pytest_bridge.md` (includes GitHub Actions workflow at `examples/ci/fluxloop_pytest.yml`).
98
+ - Typical workflow:
99
+ 1. `pip install -e packages/cli[dev]`
100
+ 2. `fluxloop init pytest-template .`
101
+ 3. `pytest -k fluxloop_smoke --maxfail=1`
102
+
103
+ ## Evaluation Workflow
104
+
105
+ Evaluation is now a two-step process (parse, then evaluate) that follows the experiment run, so that multi-turn context is preserved:
106
+
107
+ 1. `fluxloop run experiment` – produce `trace_summary.jsonl` (and optionally `observations.jsonl`).
108
+ 2. `fluxloop parse experiment <experiment_dir>` – generate markdown summaries and a structured artifact at `per_trace_analysis/per_trace.jsonl`.
109
+ 3. `fluxloop evaluate experiment <experiment_dir>` – consume that structured file, run LLM-based per-trace + overall analysis, and emit an interactive dashboard at `<experiment_dir>/evaluation_report/report.html` (override with `--output`).
110
+
111
+ `fluxloop evaluate` exits early with guidance when the per-trace artifact is missing. If you relocate the file, supply an explicit path with `--per-trace /path/to/per_trace.jsonl`.
112
+
76
113
  ## Quick Setup Script
77
114
 
78
115
  To prepare a fresh checkout (create `.venv`, install dependencies, and run diagnostics):
@@ -0,0 +1,116 @@
1
+ # FluxLoop CLI
2
+
3
+ Command-line interface for running agent simulations.
4
+
5
+ ## Installation
6
+
7
+ ```
8
+ pip install fluxloop-cli
9
+ ```
10
+
11
+ ## Configuration Overview (v0.2.0)
12
+
13
+ FluxLoop CLI now stores experiment settings in four files under `configs/`:
14
+
15
+ - `configs/project.yaml` – project metadata, collector defaults
16
+ - `configs/input.yaml` – personas, base inputs, input generation options
17
+ - `configs/simulation.yaml` – runtime parameters (iterations, runner, replay args)
18
+ - `configs/evaluation.yaml` – evaluator definitions (rule-based, LLM judge, etc.)
19
+
20
+ The legacy `setting.yaml` is still supported, but new projects created with
21
+ `fluxloop init project` will generate the structured layout above.
22
+
23
+ ## Key Commands
24
+
25
+ - `fluxloop init project` – scaffold a new project (configs, `.env`, examples)
26
+ - `fluxloop generate inputs` – produce input variations for the active project
27
+ - `fluxloop run experiment` – execute an experiment using `configs/simulation.yaml`
28
+ - `fluxloop parse experiment` – convert experiment outputs into readable artifacts and emit structured per-trace JSON at `per_trace_analysis/per_trace.jsonl`
29
+ - `fluxloop evaluate experiment` – run the LLM-driven evaluation pipeline (LLM-PT → rule aggregation → LLM-OV → HTML render). Requires the parsed per-trace file (or `--per-trace`) and writes an interactive report to `evaluation_report/report.html` by default.
30
+ - `fluxloop config set-llm` – update LLM provider/model in `configs/input.yaml`
31
+ - `fluxloop record enable|disable|status` – toggle recording mode across `.env` and simulation config
32
+ - `fluxloop doctor` – summarize Python, FluxLoop CLI/MCP, and MCP index state for the active environment
33
+ - `--yes/-y` (for `fluxloop run experiment`) – skip the interactive confirmation prompt, ideal for CI and the Pytest bridge
34
+
35
+ ### Multi-turn supervisor options
36
+
37
+ `fluxloop run experiment` supports multi-turn orchestration out of the box:
38
+
39
+ - Toggle with `--multi-turn/--no-multi-turn`
40
+ - Limit depth via `--max-turns`
41
+ - Control tool approvals with `--auto-approve-tools/--manual-approve-tools`
42
+ - Override the supervisor persona target: `--persona-override`
43
+ - Point at a specific LLM: `--supervisor-provider`, `--supervisor-model`, `--supervisor-temperature`, `--supervisor-api-key`
44
+
45
+ These flags override the values in `configs/simulation.yaml` (`multi_turn` block). When enabled, the runner consults the supervisor after every turn to decide whether to continue and to synthesize the next realistic user message.
46
+
47
+ **Scripted Playback Mode**: For deterministic multi-turn scenarios, switch `supervisor.provider` to `mock` and populate `supervisor.metadata.scripted_questions` with a list of user messages. FluxLoop will replay them sequentially and terminate when the script ends—ideal for regression testing and demos.
48
+
49
+ Run `fluxloop --help` or `fluxloop <command> --help` for more detail.
50
+
51
+ ## Pytest Bridge (0.2.29+)
52
+
53
+ - `fluxloop init pytest-template [project_root]` creates `tests/test_fluxloop_smoke.py`, already wired to the new `fluxloop_runner` fixture.
54
+ - Fixtures live in `fluxloop_cli.testing.pytest_plugin` and return a `FluxLoopTestResult`, so you can assert on `total_runs`, `success_rate`, or call `require_success()`.
55
+ - Full guide + CI example: see `docs/guides/pytest_bridge.md` (includes GitHub Actions workflow at `examples/ci/fluxloop_pytest.yml`).
56
+ - Typical workflow:
57
+ 1. `pip install -e packages/cli[dev]`
58
+ 2. `fluxloop init pytest-template .`
59
+ 3. `pytest -k fluxloop_smoke --maxfail=1`
60
+
61
+ ## Evaluation Workflow
62
+
63
+ Evaluation is now a two-step process (parse, then evaluate) that follows the experiment run, so that multi-turn context is preserved:
64
+
65
+ 1. `fluxloop run experiment` – produce `trace_summary.jsonl` (and optionally `observations.jsonl`).
66
+ 2. `fluxloop parse experiment <experiment_dir>` – generate markdown summaries and a structured artifact at `per_trace_analysis/per_trace.jsonl`.
67
+ 3. `fluxloop evaluate experiment <experiment_dir>` – consume that structured file, run LLM-based per-trace + overall analysis, and emit an interactive dashboard at `<experiment_dir>/evaluation_report/report.html` (override with `--output`).
68
+
69
+ `fluxloop evaluate` exits early with guidance when the per-trace artifact is missing. If you relocate the file, supply an explicit path with `--per-trace /path/to/per_trace.jsonl`.
70
+
71
+ ## Quick Setup Script
72
+
73
+ To prepare a fresh checkout (create `.venv`, install dependencies, and run diagnostics):
74
+
75
+ ```
76
+ bash scripts/setup_fluxloop_env.sh --target-source-root path/to/your/source
77
+ ```
78
+
79
+ Options:
80
+
81
+ - `--python PATH` – choose a specific interpreter (default `python3`)
82
+ - `--target-source-root PATH` – pre-populate VSCode `fluxloop.targetSourceRoot`
83
+ - `--skip-doctor` – skip the final `fluxloop doctor` check
84
+
85
+ After running the script, open the folder in VSCode and use `FluxLoop: Show Environment Info`
86
+ or `FluxLoop: Run Doctor` to confirm the environment.
87
+
88
+ ## Runner Integration Patterns
89
+
90
+ Configure how FluxLoop calls your code in `configs/simulation.yaml`:
91
+
92
+ - Module + function: `module_path`/`function_name` or `target: "module:function"`
93
+ - Class.method (zero-arg ctor): `target: "module:Class.method"`
94
+ - Module-scoped instance method: `target: "module:instance.method"`
95
+ - Class.method with factory: add `factory: "module:make_instance"` (+ `factory_kwargs`)
96
+ - Async generators: set `runner.stream_output_path` if your streamed event shape differs (default `message.delta`).
97
+
98
+ See full examples: `packages/website/docs-cli/configuration/runner-targets.md`.
99
+
100
+ ## Developing
101
+
102
+ Install dependencies and run tests:
103
+
104
+ ```
105
+ python -m venv .venv
106
+ source .venv/bin/activate
107
+ pip install -e .[dev]
108
+ pytest
109
+ ```
110
+
111
+ To package the CLI:
112
+
113
+ ```
114
+ ./build.sh
115
+ ```
116
+
@@ -2,7 +2,7 @@
2
2
  FluxLoop CLI - Command-line interface for running agent simulations.
3
3
  """
4
4
 
5
- __version__ = "0.2.19"
5
+ __version__ = "0.2.36"
6
6
 
7
7
  from .main import app
8
8
 
@@ -5,7 +5,9 @@ from __future__ import annotations
5
5
  import inspect
6
6
  import json
7
7
  from pathlib import Path
8
- from typing import Any, Callable, Dict, Optional
8
+ from typing import Any, Callable, Dict, Optional, Sequence
9
+
10
+ from fluxloop.schemas import ExperimentConfig, ReplayArgsConfig, PersonaConfig
9
11
 
10
12
 
11
13
  class _AttrDict(dict):
@@ -26,8 +28,6 @@ class _AttrDict(dict):
26
28
  except KeyError as exc: # pragma: no cover
27
29
  raise AttributeError(item) from exc
28
30
 
29
- from fluxloop.schemas import ExperimentConfig, ReplayArgsConfig
30
-
31
31
 
32
32
  class _AwaitableNone:
33
33
  """Simple awaitable that resolves to ``None``."""
@@ -98,9 +98,17 @@ class ArgBinder:
98
98
  *,
99
99
  runtime_input: str,
100
100
  iteration: int = 0,
101
+ conversation_state: Optional[Dict[str, Any]] = None,
102
+ persona: Optional[PersonaConfig] = None,
103
+ auto_approve: Optional[bool] = None,
101
104
  ) -> Dict[str, Any]:
102
105
  """Construct kwargs for calling *func* based on replay or inspection."""
103
106
 
107
+ signature = inspect.signature(func)
108
+ parameters = list(signature.parameters.values())
109
+ if parameters and parameters[0].name == "self":
110
+ parameters = parameters[1:]
111
+
104
112
  if self._recording:
105
113
  kwargs = self._recording.get("kwargs", {}).copy()
106
114
 
@@ -111,22 +119,42 @@ class ArgBinder:
111
119
  try:
112
120
  self._set_by_path(kwargs, replay.override_param_path, runtime_input)
113
121
  except (KeyError, TypeError):
114
- # If path missing, fall back to plain binding
115
- return self._bind_by_signature(func, runtime_input)
116
-
122
+ kwargs = self._bind_runtime_input(parameters, runtime_input)
123
+ else:
124
+ fallback = self._bind_runtime_input(parameters, runtime_input)
125
+ for key, value in fallback.items():
126
+ kwargs.setdefault(key, value)
117
127
  self._restore_callables(kwargs, replay)
118
128
  self._ensure_no_unmapped_callables(kwargs, replay)
119
- return self._hydrate_structures(kwargs)
120
-
121
- return self._bind_by_signature(func, runtime_input)
122
-
123
- def _bind_by_signature(self, func: Callable, runtime_input: str) -> Dict[str, Any]:
124
- signature = inspect.signature(func)
125
- parameters = list(signature.parameters.values())
129
+ kwargs = self._hydrate_structures(kwargs)
130
+ else:
131
+ kwargs = self._bind_runtime_input(parameters, runtime_input)
132
+
133
+ return self._inject_optional_kwargs(
134
+ parameters=parameters,
135
+ kwargs=kwargs,
136
+ conversation_state=conversation_state,
137
+ persona=persona,
138
+ auto_approve=auto_approve,
139
+ iteration=iteration,
140
+ )
126
141
 
127
- if parameters and parameters[0].name == "self":
128
- parameters = parameters[1:]
142
+ def _bind_runtime_input(
143
+ self, parameters: Sequence[inspect.Parameter], runtime_input: str
144
+ ) -> Dict[str, Any]:
145
+ candidate = self._find_runtime_parameter(parameters)
146
+ if candidate:
147
+ return {candidate: runtime_input}
148
+ if parameters:
149
+ return {parameters[0].name: runtime_input}
150
+ raise ValueError(
151
+ "Cannot determine where to bind runtime input for the provided function."
152
+ )
129
153
 
154
+ @staticmethod
155
+ def _find_runtime_parameter(
156
+ parameters: Sequence[inspect.Parameter],
157
+ ) -> Optional[str]:
130
158
  candidate_names = [
131
159
  "input",
132
160
  "input_text",
@@ -134,18 +162,70 @@ class ArgBinder:
134
162
  "query",
135
163
  "text",
136
164
  "content",
165
+ "user_message",
137
166
  ]
167
+ for name in candidate_names:
168
+ for param in parameters:
169
+ if param.name == name:
170
+ return name
171
+ return None
138
172
 
139
- for param in parameters:
140
- if param.name in candidate_names:
141
- return {param.name: runtime_input}
173
+ def _inject_optional_kwargs(
174
+ self,
175
+ *,
176
+ parameters: Sequence[inspect.Parameter],
177
+ kwargs: Dict[str, Any],
178
+ conversation_state: Optional[Dict[str, Any]],
179
+ persona: Optional[PersonaConfig],
180
+ auto_approve: Optional[bool],
181
+ iteration: Optional[int],
182
+ ) -> Dict[str, Any]:
183
+ param_names = {param.name for param in parameters}
184
+
185
+ def assign(value: Any, candidates: Sequence[str]) -> bool:
186
+ if value is None:
187
+ return False
188
+ for name in candidates:
189
+ if name in param_names and name not in kwargs:
190
+ kwargs[name] = value
191
+ return True
192
+ return False
193
+
194
+ if conversation_state is not None:
195
+ assign(conversation_state, ["conversation_state", "state", "dialog_state"])
196
+ if isinstance(conversation_state, dict):
197
+ metadata = conversation_state.get("metadata")
198
+ if metadata:
199
+ assign(
200
+ metadata,
201
+ ["conversation_metadata", "state_metadata", "conversation_meta"],
202
+ )
203
+ turns = conversation_state.get("turns")
204
+ if turns:
205
+ assign(turns, ["messages", "history", "turns"])
206
+
207
+ if persona is not None:
208
+ assign(persona, ["persona", "user_persona", "persona_config"])
209
+ try:
210
+ persona_prompt = persona.to_prompt()
211
+ except Exception:
212
+ persona_prompt = None
213
+ if persona_prompt:
214
+ assign(
215
+ persona_prompt,
216
+ ["persona_prompt", "persona_description", "persona_text"],
217
+ )
218
+
219
+ if auto_approve is not None:
220
+ assign(
221
+ auto_approve,
222
+ ["auto_approve", "auto_approve_tools", "approve_tools", "autoapprove"],
223
+ )
142
224
 
143
- if parameters:
144
- return {parameters[0].name: runtime_input}
225
+ if iteration is not None:
226
+ assign(iteration, ["iteration", "run_iteration", "loop_index"])
145
227
 
146
- raise ValueError(
147
- f"Cannot determine where to bind runtime input for function '{func.__name__}'."
148
- )
228
+ return kwargs
149
229
 
150
230
  def _restore_callables(self, kwargs: Dict[str, Any], replay: ReplayArgsConfig) -> None:
151
231
  for param_name, provider in replay.callable_providers.items():
@@ -198,7 +278,6 @@ class ArgBinder:
198
278
 
199
279
  def _record(args: Any, kwargs: Any) -> None:
200
280
  messages.append((args, kwargs))
201
- pretty = args[0] if len(args) == 1 and not kwargs else {"args": args, "kwargs": kwargs}
202
281
 
203
282
  def send(*args: Any, **kwargs: Any) -> _AwaitableNone:
204
283
  _record(args, kwargs)
@@ -14,7 +14,6 @@ from rich.syntax import Syntax
14
14
  from rich.table import Table
15
15
 
16
16
  from ..config_loader import load_experiment_config
17
- from ..templates import create_env_file, create_gitignore, create_sample_agent
18
17
  from ..constants import DEFAULT_CONFIG_PATH, DEFAULT_ROOT_DIR_NAME
19
18
  from ..config_schema import CONFIG_SECTION_FILENAMES
20
19
  from ..project_paths import (
@@ -0,0 +1,264 @@
1
+ """Evaluate command for generating interactive reports."""
2
+
3
+ from __future__ import annotations
4
+
5
import asyncio
import logging
import os
import shutil
from dataclasses import asdict
from pathlib import Path
from typing import Any, Dict, List, Optional

import typer
import yaml
from rich.console import Console

from ..environment import load_env_chain
from ..evaluation import load_evaluation_config
from ..evaluation.artifacts import load_per_trace_records, load_trace_summary_records
from ..evaluation.report.pipeline import ReportPipeline
21
+
22
+ console = Console()
23
+ app = typer.Typer(help="Evaluate experiment outputs and generate interactive reports.")
24
+ logger = logging.getLogger(__name__)
25
+
26
+
27
+ def _load_yaml_file(path: Optional[Path]) -> dict:
28
+ if not path or not path.exists():
29
+ return {}
30
+ with path.open("r", encoding="utf-8") as handle:
31
+ data = yaml.safe_load(handle) or {}
32
+ if isinstance(data, dict):
33
+ return data
34
+ return {}
35
+
36
+
37
+ def _resolve_project_root(config_path: Path) -> Path:
38
+ config_dir = config_path.parent
39
+ return config_dir.parent if config_dir.name == "configs" else config_dir
40
+
41
+
42
def _find_config_file(config_path: Path, filename: str) -> Optional[Path]:
    """Locate *filename* relative to the evaluation config at *config_path*.

    Directories are probed in order: the config's own directory, then
    ``<project_root>/configs``, then the project root itself. The first
    existing path wins; ``None`` is returned when none of them exists.
    """
    root = _resolve_project_root(config_path)
    search_dirs = (config_path.parent, root / "configs", root)
    return next(
        (directory / filename for directory in search_dirs if (directory / filename).exists()),
        None,
    )
54
+
55
+
56
+ def _prepare_output_directory(path: Path, overwrite: bool) -> None:
57
+ if path.exists():
58
+ if not overwrite:
59
+ raise typer.BadParameter(
60
+ f"Output directory already exists: {path}. Use --overwrite to replace it."
61
+ )
62
+ shutil.rmtree(path)
63
+ path.mkdir(parents=True, exist_ok=True)
64
+
65
+
66
def _load_generated_inputs_data(input_config: Dict[str, Any], project_root: Path) -> Dict[str, Any]:
    """
    Load generated inputs (variations) from the configured inputs file, if available.

    Args:
        input_config: Parsed input configuration. ``inputs_file`` selects the file
            to read (default ``inputs/generated.yaml``).
        project_root: Base directory used to resolve a relative ``inputs_file``.

    Returns:
        An empty dict when the file is missing, unreadable, or has an unsupported
        top-level structure. Otherwise a dict with the keys ``path``,
        ``variations``, ``generator_model`` and ``strategies``.
    """

    inputs_file_value = input_config.get("inputs_file") or "inputs/generated.yaml"
    inputs_path = Path(inputs_file_value)
    if not inputs_path.is_absolute():
        inputs_path = (project_root / inputs_path).resolve()

    if not inputs_path.exists():
        logger.debug("Generated inputs file not found at %s", inputs_path)
        return {}

    try:
        with inputs_path.open("r", encoding="utf-8") as handle:
            payload = yaml.safe_load(handle) or {}
    except Exception as exc:  # noqa: BLE001 - best-effort load; the report still renders without it
        logger.warning("Failed to load generated inputs file %s: %s", inputs_path, exc)
        return {}

    inputs_list: List[Any] = []
    generation_cfg: Dict[str, Any] = {}
    if isinstance(payload, dict):
        inputs_list = payload.get("inputs") or payload.get("variations") or []
        raw_generation_cfg = payload.get("generation_config")
        # A scalar or list here would break the .get() lookups further down.
        if isinstance(raw_generation_cfg, dict):
            generation_cfg = raw_generation_cfg
    elif isinstance(payload, list):
        inputs_list = payload
    else:
        logger.debug("Generated inputs file %s did not contain a supported structure", inputs_path)
        return {}

    if not isinstance(inputs_list, list):
        # e.g. ``inputs: 3`` in hand-edited YAML; nothing usable to iterate.
        inputs_list = []

    variations: List[Dict[str, str]] = []
    for entry in inputs_list:
        if not isinstance(entry, dict):
            continue
        text = entry.get("input")
        if not text:
            continue
        metadata = entry.get("metadata")
        if not isinstance(metadata, dict):
            metadata = {}
        persona = entry.get("persona") or metadata.get("persona")
        strategy = entry.get("strategy") or metadata.get("variation_strategy") or metadata.get("strategy")

        # YAML may yield non-string scalars (numbers, mappings); coerce before stripping.
        variations.append(
            {
                "persona": str(persona or "unknown").strip(),
                "strategy": str(strategy or "base").strip(),
                "input": str(text).strip(),
            }
        )

    generator_model = generation_cfg.get("model") or generation_cfg.get("generator_model")
    provider = generation_cfg.get("provider")
    if generator_model and provider and "/" not in str(generator_model):
        generator_model = f"{provider}/{generator_model}"

    if not generator_model:
        # Fall back to the model configured for input generation; either level
        # may be absent or explicitly null in the YAML.
        generation_section = input_config.get("input_generation")
        llm_section = generation_section.get("llm") if isinstance(generation_section, dict) else None
        generator_model = llm_section.get("model") if isinstance(llm_section, dict) else None

    return {
        "path": str(inputs_path),
        "variations": variations,
        "generator_model": generator_model,
        "strategies": generation_cfg.get("strategies") or input_config.get("variation_strategies", []),
    }
130
+
131
+
132
@app.command()
def experiment(
    experiment_dir: Path = typer.Argument(
        ...,
        help="Path to the experiment output directory",
        exists=True,
        dir_okay=True,
        file_okay=False,
        resolve_path=True,
    ),
    config: Path = typer.Option(
        Path("configs/evaluation.yaml"),
        "--config",
        "-c",
        help="Path to evaluation configuration file",
    ),
    output: Path = typer.Option(
        Path("evaluation_report"),
        "--output",
        "-o",
        help="Output directory name (relative to the experiment directory)",
    ),
    overwrite: bool = typer.Option(
        False,
        "--overwrite",
        help="Overwrite output directory if it already exists",
    ),
    llm_api_key: Optional[str] = typer.Option(
        None,
        "--llm-api-key",
        help="LLM API key for report generation (optional)",
        envvar="FLUXLOOP_LLM_API_KEY",
    ),
    per_trace: Optional[Path] = typer.Option(
        None,
        "--per-trace",
        help="Path to structured per-trace JSONL generated by `fluxloop parse`",
    ),
    verbose: bool = typer.Option(
        False,
        "--verbose",
        help="Enable verbose logging",
    ),
):
    """
    Evaluate experiment outputs and generate an interactive HTML report.
    """
    # NOTE: the docstring above is surfaced by Typer as the command's help text,
    # so implementation notes live in comments instead.

    def _absolute(candidate: Path) -> Path:
        # Relative paths are anchored at the current working directory.
        if candidate.is_absolute():
            return candidate.resolve()
        return (Path.cwd() / candidate).resolve()

    def _warn_env_failure(path: Path, exc: Exception) -> None:
        console.log(
            f"[yellow]Warning:[/yellow] Failed to load environment from {path}: {exc}"
        )

    log_level = logging.DEBUG if verbose else logging.INFO
    logging.basicConfig(level=log_level, format="%(message)s")

    experiment_root = experiment_dir.resolve()
    if not experiment_root.is_dir():
        raise typer.BadParameter(f"Experiment directory not found: {experiment_root}")

    config_path = _absolute(config)
    project_root = _resolve_project_root(config_path)

    # Default to the artifact that `fluxloop parse` writes inside the experiment directory.
    if per_trace is None:
        per_trace_path = experiment_root / "per_trace_analysis" / "per_trace.jsonl"
    else:
        per_trace_path = _absolute(per_trace)

    per_trace_records = load_per_trace_records(experiment_root, per_trace_path)
    trace_records = [item.trace for item in per_trace_records]
    if not trace_records:
        raise typer.BadParameter("No traces found in per-trace artifacts.")

    trace_summary_path = experiment_root / "trace_summary.jsonl"
    trace_summaries = load_trace_summary_records(experiment_root, trace_summary_path)
    if not trace_summaries:
        raise typer.BadParameter("No traces found in trace summary artifacts.")

    try:
        evaluation_config = load_evaluation_config(config_path)
    except FileNotFoundError as exc:
        raise typer.BadParameter(str(exc)) from exc
    except Exception as exc:  # noqa: BLE001
        raise typer.BadParameter(f"Failed to load evaluation config: {exc}") from exc

    load_env_chain(
        evaluation_config.get_source_dir(),
        refresh_config=True,
        on_error=_warn_env_failure,
    )

    # Environment lookups happen after load_env_chain so values it loaded are visible here.
    if llm_api_key is None:
        llm_api_key = os.getenv("FLUXLOOP_LLM_API_KEY") or os.getenv("OPENAI_API_KEY")

    if output.is_absolute():
        output_dir = output
    else:
        output_dir = experiment_root / output
    _prepare_output_directory(output_dir, overwrite)

    input_config_path = _find_config_file(config_path, "input.yaml")
    project_config_path = _find_config_file(config_path, "project.yaml")

    input_config = _load_yaml_file(input_config_path)
    project_config = _load_yaml_file(project_config_path)
    generated_inputs = _load_generated_inputs_data(input_config, project_root)

    report_name = project_config.get("name") or experiment_root.name
    config_bundle = {
        "name": report_name,
        "evaluation": asdict(evaluation_config),
        "input": input_config,
        "generated_inputs": generated_inputs,
    }

    pipeline = ReportPipeline(
        config=config_bundle,
        output_dir=output_dir,
        api_key=llm_api_key,
    )

    banner = "\n".join(
        (
            f"📊 Evaluating experiment at [cyan]{experiment_root}[/cyan]",
            f"⚙️ Config: [magenta]{config_path}[/magenta]",
            f"🧵 Per-trace data: [blue]{per_trace_path}[/blue]",
            f"📄 Trace summary: [blue]{trace_summary_path}[/blue]",
            f"📁 Output: [green]{output_dir}[/green]",
        )
    )
    console.print(banner)

    artifacts = asyncio.run(pipeline.run(trace_records, trace_summaries))
    console.print(f"\n✅ Report ready: [bold cyan]{artifacts.html_path}[/bold cyan]")
262
+
263
+
264
+