agentx-kit 0.2.0__tar.gz → 0.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (73) hide show
  1. {agentx_kit-0.2.0 → agentx_kit-0.3.0}/PKG-INFO +29 -1
  2. {agentx_kit-0.2.0 → agentx_kit-0.3.0}/README.md +21 -0
  3. {agentx_kit-0.2.0 → agentx_kit-0.3.0}/pyproject.toml +3 -1
  4. {agentx_kit-0.2.0 → agentx_kit-0.3.0}/src/agentx/__init__.py +12 -1
  5. {agentx_kit-0.2.0 → agentx_kit-0.3.0}/src/agentx/cli.py +40 -0
  6. agentx_kit-0.3.0/src/agentx/dashboard/__init__.py +40 -0
  7. agentx_kit-0.3.0/src/agentx/dashboard/app.py +270 -0
  8. agentx_kit-0.3.0/src/agentx/insights/__init__.py +18 -0
  9. agentx_kit-0.3.0/src/agentx/insights/analyze.py +85 -0
  10. agentx_kit-0.3.0/src/agentx/insights/log.py +88 -0
  11. agentx_kit-0.3.0/src/agentx/insights/optimize.py +80 -0
  12. agentx_kit-0.3.0/src/agentx/insights/tokens.py +97 -0
  13. agentx_kit-0.3.0/tests/test_insights.py +94 -0
  14. {agentx_kit-0.2.0 → agentx_kit-0.3.0}/.github/workflows/publish.yml +0 -0
  15. {agentx_kit-0.2.0 → agentx_kit-0.3.0}/.gitignore +0 -0
  16. {agentx_kit-0.2.0 → agentx_kit-0.3.0}/DESIGN.md +0 -0
  17. {agentx_kit-0.2.0 → agentx_kit-0.3.0}/LICENSE +0 -0
  18. {agentx_kit-0.2.0 → agentx_kit-0.3.0}/RESEARCH.md +0 -0
  19. {agentx_kit-0.2.0 → agentx_kit-0.3.0}/src/agentx/config.py +0 -0
  20. {agentx_kit-0.2.0 → agentx_kit-0.3.0}/src/agentx/frameworks/__init__.py +0 -0
  21. {agentx_kit-0.2.0 → agentx_kit-0.3.0}/src/agentx/frameworks/crewai_agent.py +0 -0
  22. {agentx_kit-0.2.0 → agentx_kit-0.3.0}/src/agentx/frameworks/langchain_agent.py +0 -0
  23. {agentx_kit-0.2.0 → agentx_kit-0.3.0}/src/agentx/guardrails.py +0 -0
  24. {agentx_kit-0.2.0 → agentx_kit-0.3.0}/src/agentx/memory/__init__.py +0 -0
  25. {agentx_kit-0.2.0 → agentx_kit-0.3.0}/src/agentx/memory/store.py +0 -0
  26. {agentx_kit-0.2.0 → agentx_kit-0.3.0}/src/agentx/observability.py +0 -0
  27. {agentx_kit-0.2.0 → agentx_kit-0.3.0}/src/agentx/prompts/__init__.py +0 -0
  28. {agentx_kit-0.2.0 → agentx_kit-0.3.0}/src/agentx/prompts/templates.py +0 -0
  29. {agentx_kit-0.2.0 → agentx_kit-0.3.0}/src/agentx/providers/__init__.py +0 -0
  30. {agentx_kit-0.2.0 → agentx_kit-0.3.0}/src/agentx/providers/base.py +0 -0
  31. {agentx_kit-0.2.0 → agentx_kit-0.3.0}/src/agentx/providers/factory.py +0 -0
  32. {agentx_kit-0.2.0 → agentx_kit-0.3.0}/src/agentx/providers/registry.py +0 -0
  33. {agentx_kit-0.2.0 → agentx_kit-0.3.0}/src/agentx/rag/__init__.py +0 -0
  34. {agentx_kit-0.2.0 → agentx_kit-0.3.0}/src/agentx/rag/pipeline.py +0 -0
  35. {agentx_kit-0.2.0 → agentx_kit-0.3.0}/src/agentx/reliability.py +0 -0
  36. {agentx_kit-0.2.0 → agentx_kit-0.3.0}/src/agentx/scaffold/__init__.py +0 -0
  37. {agentx_kit-0.2.0 → agentx_kit-0.3.0}/src/agentx/scaffold/generator.py +0 -0
  38. {agentx_kit-0.2.0 → agentx_kit-0.3.0}/src/agentx/scaffold/prompts_store.py +0 -0
  39. {agentx_kit-0.2.0 → agentx_kit-0.3.0}/src/agentx/scaffold/spec.py +0 -0
  40. {agentx_kit-0.2.0 → agentx_kit-0.3.0}/src/agentx/scaffold/templates/Dockerfile.j2 +0 -0
  41. {agentx_kit-0.2.0 → agentx_kit-0.3.0}/src/agentx/scaffold/templates/README.md.j2 +0 -0
  42. {agentx_kit-0.2.0 → agentx_kit-0.3.0}/src/agentx/scaffold/templates/ci.yml.j2 +0 -0
  43. {agentx_kit-0.2.0 → agentx_kit-0.3.0}/src/agentx/scaffold/templates/docker-compose.yml.j2 +0 -0
  44. {agentx_kit-0.2.0 → agentx_kit-0.3.0}/src/agentx/scaffold/templates/dockerignore.j2 +0 -0
  45. {agentx_kit-0.2.0 → agentx_kit-0.3.0}/src/agentx/scaffold/templates/env.example.j2 +0 -0
  46. {agentx_kit-0.2.0 → agentx_kit-0.3.0}/src/agentx/scaffold/templates/evals/dataset.json.j2 +0 -0
  47. {agentx_kit-0.2.0 → agentx_kit-0.3.0}/src/agentx/scaffold/templates/evals/run_evals.py.j2 +0 -0
  48. {agentx_kit-0.2.0 → agentx_kit-0.3.0}/src/agentx/scaffold/templates/gitignore.j2 +0 -0
  49. {agentx_kit-0.2.0 → agentx_kit-0.3.0}/src/agentx/scaffold/templates/mcp_servers.json.j2 +0 -0
  50. {agentx_kit-0.2.0 → agentx_kit-0.3.0}/src/agentx/scaffold/templates/pkg/__init__.py.j2 +0 -0
  51. {agentx_kit-0.2.0 → agentx_kit-0.3.0}/src/agentx/scaffold/templates/pkg/agents.py.j2 +0 -0
  52. {agentx_kit-0.2.0 → agentx_kit-0.3.0}/src/agentx/scaffold/templates/pkg/config.py.j2 +0 -0
  53. {agentx_kit-0.2.0 → agentx_kit-0.3.0}/src/agentx/scaffold/templates/pkg/guardrails.py.j2 +0 -0
  54. {agentx_kit-0.2.0 → agentx_kit-0.3.0}/src/agentx/scaffold/templates/pkg/main.py.j2 +0 -0
  55. {agentx_kit-0.2.0 → agentx_kit-0.3.0}/src/agentx/scaffold/templates/pkg/memory.py.j2 +0 -0
  56. {agentx_kit-0.2.0 → agentx_kit-0.3.0}/src/agentx/scaffold/templates/pkg/observability.py.j2 +0 -0
  57. {agentx_kit-0.2.0 → agentx_kit-0.3.0}/src/agentx/scaffold/templates/pkg/prompts.py.j2 +0 -0
  58. {agentx_kit-0.2.0 → agentx_kit-0.3.0}/src/agentx/scaffold/templates/pkg/rag.py.j2 +0 -0
  59. {agentx_kit-0.2.0 → agentx_kit-0.3.0}/src/agentx/scaffold/templates/pkg/server.py.j2 +0 -0
  60. {agentx_kit-0.2.0 → agentx_kit-0.3.0}/src/agentx/scaffold/templates/pkg/tools.py.j2 +0 -0
  61. {agentx_kit-0.2.0 → agentx_kit-0.3.0}/src/agentx/scaffold/templates/pyproject.toml.j2 +0 -0
  62. {agentx_kit-0.2.0 → agentx_kit-0.3.0}/src/agentx/scaffold/templates/skills_seed.json.j2 +0 -0
  63. {agentx_kit-0.2.0 → agentx_kit-0.3.0}/src/agentx/scaffold/wizard.py +0 -0
  64. {agentx_kit-0.2.0 → agentx_kit-0.3.0}/src/agentx/skills/__init__.py +0 -0
  65. {agentx_kit-0.2.0 → agentx_kit-0.3.0}/src/agentx/skills/registry.py +0 -0
  66. {agentx_kit-0.2.0 → agentx_kit-0.3.0}/src/agentx/structured.py +0 -0
  67. {agentx_kit-0.2.0 → agentx_kit-0.3.0}/src/agentx/tools/__init__.py +0 -0
  68. {agentx_kit-0.2.0 → agentx_kit-0.3.0}/src/agentx/tools/builtin.py +0 -0
  69. {agentx_kit-0.2.0 → agentx_kit-0.3.0}/src/agentx/tools/mcp.py +0 -0
  70. {agentx_kit-0.2.0 → agentx_kit-0.3.0}/tests/test_enterprise.py +0 -0
  71. {agentx_kit-0.2.0 → agentx_kit-0.3.0}/tests/test_prompts.py +0 -0
  72. {agentx_kit-0.2.0 → agentx_kit-0.3.0}/tests/test_providers.py +0 -0
  73. {agentx_kit-0.2.0 → agentx_kit-0.3.0}/tests/test_scaffold.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: agentx-kit
3
- Version: 0.2.0
3
+ Version: 0.3.0
4
4
  Summary: An open-source, provider-agnostic agentic framework + interactive project scaffolder for LangChain and CrewAI. Pick your LLM provider, agents, RAG, memory, MCP tools and skills — generate a ready-to-run uv project.
5
5
  Project-URL: Homepage, https://github.com/muhammadyahiya/agentx-kit
6
6
  Project-URL: Repository, https://github.com/muhammadyahiya/agentx-kit
@@ -48,7 +48,10 @@ Requires-Dist: mcp>=1.2.0; extra == 'all'
48
48
  Requires-Dist: openinference-instrumentation-langchain>=0.1.0; extra == 'all'
49
49
  Requires-Dist: opentelemetry-exporter-otlp-proto-http>=1.20.0; extra == 'all'
50
50
  Requires-Dist: opentelemetry-sdk>=1.20.0; extra == 'all'
51
+ Requires-Dist: pandas>=2.0.0; extra == 'all'
51
52
  Requires-Dist: sse-starlette>=2.0.0; extra == 'all'
53
+ Requires-Dist: streamlit>=1.40.0; extra == 'all'
54
+ Requires-Dist: tiktoken>=0.7.0; extra == 'all'
52
55
  Requires-Dist: uvicorn[standard]>=0.29.0; extra == 'all'
53
56
  Provides-Extra: anthropic
54
57
  Requires-Dist: langchain-anthropic>=0.2.0; extra == 'anthropic'
@@ -58,6 +61,10 @@ Provides-Extra: bedrock
58
61
  Requires-Dist: langchain-aws>=0.2.0; extra == 'bedrock'
59
62
  Provides-Extra: crewai
60
63
  Requires-Dist: crewai>=0.70.0; extra == 'crewai'
64
+ Provides-Extra: dashboard
65
+ Requires-Dist: pandas>=2.0.0; extra == 'dashboard'
66
+ Requires-Dist: streamlit>=1.40.0; extra == 'dashboard'
67
+ Requires-Dist: tiktoken>=0.7.0; extra == 'dashboard'
61
68
  Provides-Extra: dev
62
69
  Requires-Dist: pytest-cov>=5.0.0; extra == 'dev'
63
70
  Requires-Dist: pytest>=8.0.0; extra == 'dev'
@@ -233,6 +240,26 @@ agentx prompt remove reviewer
233
240
  A blank `system_prompt` is auto-derived from the agent's role + goal. You can also
234
241
  just open `prompts.json` in an editor — the CLI is a convenience, not a gate.
235
242
 
243
+ ## 📊 Prompt dashboard (observability + optimization)
244
+ A Streamlit workbench to **understand and refine how your prompts talk to the LLM** —
245
+ launch it any time:
246
+
247
+ ```bash
248
+ pip install "agentx-kit[dashboard]"
249
+ agentx dashboard # opens http://localhost:8501
250
+ agentx prompt set assistant -d # edit a prompt AND open the dashboard
251
+ ```
252
+
253
+ It gives you, live as you edit:
254
+ - **Token count, context-window utilization gauge, and cost estimate** (tiktoken-accurate).
255
+ - **Quality score (0–100)** with a checklist (role / goal / output-format / examples / constraints / specificity) and **concrete suggestions + limit warnings**.
256
+ - **✨ One-click LLM optimization** — refines the prompt while preserving intent, shows a **diff + rationale + token delta**, and can **apply the result straight back to `prompts.json`**.
257
+ - **▶️ Test run** — send the prompt to the model and see the response with **tokens in/out, latency, and cost**.
258
+ - **📈 Usage trends** — tokens, cost, and latency over time, logged locally to `.agentx/insights.jsonl`.
259
+
260
+ Run it inside a generated AgentX project and it reads/writes that project's
261
+ `prompts.json`; run it anywhere else for a free-form prompt scratchpad.
262
+
236
263
  ## 🏢 Enterprise pack
237
264
  Generate a production-shaped project with one flag — informed by a survey of
238
265
  CrewAI/LangGraph/create-llama/AgentStack/agno/pydantic-ai (see [RESEARCH.md](RESEARCH.md)):
@@ -281,6 +308,7 @@ llm = build_resilient_chat("openai", "gpt-4o-mini", fallbacks=[("anthropic", "cl
281
308
  | `mcp` | `langchain-mcp-adapters` | MCP tools |
282
309
  | `observability` | `opentelemetry-*`, `openinference-*` | tracing |
283
310
  | `server` | `fastapi`, `uvicorn` | serving |
311
+ | `dashboard` | `streamlit`, `tiktoken`, `pandas` | prompt observability dashboard |
284
312
  | `all` | everything above | kitchen sink |
285
313
 
286
314
  See [DESIGN.md](DESIGN.md) for the architecture and [RESEARCH.md](RESEARCH.md) for the competitive analysis behind these features.
@@ -138,6 +138,26 @@ agentx prompt remove reviewer
138
138
  A blank `system_prompt` is auto-derived from the agent's role + goal. You can also
139
139
  just open `prompts.json` in an editor — the CLI is a convenience, not a gate.
140
140
 
141
+ ## 📊 Prompt dashboard (observability + optimization)
142
+ A Streamlit workbench to **understand and refine how your prompts talk to the LLM** —
143
+ launch it any time:
144
+
145
+ ```bash
146
+ pip install "agentx-kit[dashboard]"
147
+ agentx dashboard # opens http://localhost:8501
148
+ agentx prompt set assistant -d # edit a prompt AND open the dashboard
149
+ ```
150
+
151
+ It gives you, live as you edit:
152
+ - **Token count, context-window utilization gauge, and cost estimate** (tiktoken-accurate).
153
+ - **Quality score (0–100)** with a checklist (role / goal / output-format / examples / constraints / specificity) and **concrete suggestions + limit warnings**.
154
+ - **✨ One-click LLM optimization** — refines the prompt while preserving intent, shows a **diff + rationale + token delta**, and can **apply the result straight back to `prompts.json`**.
155
+ - **▶️ Test run** — send the prompt to the model and see the response with **tokens in/out, latency, and cost**.
156
+ - **📈 Usage trends** — tokens, cost, and latency over time, logged locally to `.agentx/insights.jsonl`.
157
+
158
+ Run it inside a generated AgentX project and it reads/writes that project's
159
+ `prompts.json`; run it anywhere else for a free-form prompt scratchpad.
160
+
141
161
  ## 🏢 Enterprise pack
142
162
  Generate a production-shaped project with one flag — informed by a survey of
143
163
  CrewAI/LangGraph/create-llama/AgentStack/agno/pydantic-ai (see [RESEARCH.md](RESEARCH.md)):
@@ -186,6 +206,7 @@ llm = build_resilient_chat("openai", "gpt-4o-mini", fallbacks=[("anthropic", "cl
186
206
  | `mcp` | `langchain-mcp-adapters` | MCP tools |
187
207
  | `observability` | `opentelemetry-*`, `openinference-*` | tracing |
188
208
  | `server` | `fastapi`, `uvicorn` | serving |
209
+ | `dashboard` | `streamlit`, `tiktoken`, `pandas` | prompt observability dashboard |
189
210
  | `all` | everything above | kitchen sink |
190
211
 
191
212
  See [DESIGN.md](DESIGN.md) for the architecture and [RESEARCH.md](RESEARCH.md) for the competitive analysis behind these features.
@@ -1,7 +1,7 @@
1
1
  [project]
2
2
  # PyPI distribution name (import name + CLI stay `agentx`; `agentx` was taken).
3
3
  name = "agentx-kit"
4
- version = "0.2.0"
4
+ version = "0.3.0"
5
5
  description = "An open-source, provider-agnostic agentic framework + interactive project scaffolder for LangChain and CrewAI. Pick your LLM provider, agents, RAG, memory, MCP tools and skills — generate a ready-to-run uv project."
6
6
  readme = "README.md"
7
7
  requires-python = ">=3.10,<3.14"
@@ -59,6 +59,7 @@ observability = [
59
59
  "openinference-instrumentation-langchain>=0.1.0",
60
60
  ]
61
61
  server = ["fastapi>=0.110.0", "uvicorn[standard]>=0.29.0", "sse-starlette>=2.0.0"]
62
+ dashboard = ["streamlit>=1.40.0", "tiktoken>=0.7.0", "pandas>=2.0.0"]
62
63
 
63
64
  # ---- Bundles ----
64
65
  all = [
@@ -71,6 +72,7 @@ all = [
71
72
  "opentelemetry-sdk>=1.20.0", "opentelemetry-exporter-otlp-proto-http>=1.20.0",
72
73
  "openinference-instrumentation-langchain>=0.1.0",
73
74
  "fastapi>=0.110.0", "uvicorn[standard]>=0.29.0", "sse-starlette>=2.0.0",
75
+ "streamlit>=1.40.0", "tiktoken>=0.7.0", "pandas>=2.0.0",
74
76
  ]
75
77
  dev = ["pytest>=8.0.0", "pytest-cov>=5.0.0"]
76
78
 
@@ -16,7 +16,7 @@ is enough to get started.
16
16
  """
17
17
  from __future__ import annotations
18
18
 
19
- __version__ = "0.2.0"
19
+ __version__ = "0.3.0"
20
20
 
21
21
  from .providers import ( # noqa: E402
22
22
  ProviderSpec,
@@ -33,6 +33,12 @@ from .reliability import ( # noqa: E402
33
33
  build_resilient_chat,
34
34
  )
35
35
  from .structured import structured_model # noqa: E402
36
+ from .insights import ( # noqa: E402
37
+ analyze_prompt,
38
+ count_tokens,
39
+ estimate_cost,
40
+ optimize_prompt,
41
+ )
36
42
 
37
43
  __all__ = [
38
44
  "__version__",
@@ -52,4 +58,9 @@ __all__ = [
52
58
  "apply_guards",
53
59
  "GuardrailError",
54
60
  "structured_model",
61
+ # prompt insights
62
+ "analyze_prompt",
63
+ "optimize_prompt",
64
+ "count_tokens",
65
+ "estimate_cost",
55
66
  ]
@@ -46,6 +46,28 @@ def providers() -> None:
46
46
  console.print(table)
47
47
 
48
48
 
49
@app.command()
def dashboard(
    port: int = typer.Option(8501, "--port", help="Port for the dashboard server."),
    provider: str = typer.Option(None, "--provider", help="Default provider to preselect."),
    model: str = typer.Option(None, "--model", help="Default model to preselect."),
    project: Path = typer.Option(None, "--project", help="Project dir (default: cwd; auto-detects prompts.json)."),
) -> None:
    """Launch the prompt observability & optimization dashboard (Streamlit).

    A workbench to edit a prompt and see token usage, context-window utilization,
    cost, quality suggestions, one-click LLM optimization, and test runs — live.
    """
    # Function-scope import: the dashboard package is only loaded for this command.
    from .dashboard import launch

    console.print(f"[cyan]Launching AgentX dashboard on http://localhost:{port} …[/] (Ctrl+C to stop)")
    try:
        exit_code = launch(port=port, provider=provider, model=model, project=str(project) if project else None)
    except RuntimeError as exc:
        console.print(f"[red]{exc}[/]")
        raise typer.Exit(1) from exc
    # launch() returns the Streamlit process's return code; propagate a failure
    # so `agentx dashboard` does not report success when the server did not run.
    if exit_code:
        raise typer.Exit(exit_code)
69
+
70
+
49
71
  def _result_panel(result, spec: ProjectSpec) -> None:
50
72
  lines = [f"[bold green]✓[/] Project '{spec.slug}' created at:", f" {result.target_dir}", ""]
51
73
  lines += [f" • {m}" for m in result.messages]
@@ -156,6 +178,20 @@ def _read_text_arg(text: str | None, from_file: Path | None) -> str:
156
178
  return (text or "").strip()
157
179
 
158
180
 
181
def _maybe_launch_dashboard(launch_flag: bool, project_dir: Path) -> None:
    """Start the prompt dashboard for ``project_dir`` when ``launch_flag`` is set.

    Without the flag only a one-line tip is printed. A ``RuntimeError`` raised by
    the launcher is printed in yellow rather than propagated.
    """
    if launch_flag:
        from .dashboard import launch

        console.print("[cyan]Opening dashboard…[/]")
        try:
            launch(project=str(project_dir))
        except RuntimeError as launch_error:
            console.print(f"[yellow]{launch_error}[/]")
        return

    console.print(" [dim]Tip: run `agentx dashboard` to tune this prompt live.[/]")
193
+
194
+
159
195
  @prompt_app.command("list")
160
196
  def prompt_list(project: Path = typer.Option(None, "--project", help="Project dir (default: search from cwd).")) -> None:
161
197
  """List agents and their (resolved) prompts."""
@@ -177,6 +213,7 @@ def prompt_set(
177
213
  text: str = typer.Option("", "--text", "-t", help="New system prompt text."),
178
214
  from_file: Path = typer.Option(None, "--file", "-f", help="Read prompt text from a file."),
179
215
  project: Path = typer.Option(None, "--project"),
216
+ dash: bool = typer.Option(False, "--dashboard", "-d", help="Open the dashboard after saving."),
180
217
  ) -> None:
181
218
  """Set/replace an existing agent's system prompt."""
182
219
  path = _resolve_prompts_file(project)
@@ -190,6 +227,7 @@ def prompt_set(
190
227
  console.print(f"[red]{exc}[/]")
191
228
  raise typer.Exit(1) from exc
192
229
  console.print(f"[green]✓[/] Updated prompt for '{agent}'.")
230
+ _maybe_launch_dashboard(dash, path.parent)
193
231
 
194
232
 
195
233
  @prompt_app.command("add")
@@ -200,6 +238,7 @@ def prompt_add(
200
238
  text: str = typer.Option("", "--text", "-t", help="System prompt (blank = auto from role/goal)."),
201
239
  from_file: Path = typer.Option(None, "--file", "-f"),
202
240
  project: Path = typer.Option(None, "--project"),
241
+ dash: bool = typer.Option(False, "--dashboard", "-d", help="Open the dashboard after saving."),
203
242
  ) -> None:
204
243
  """Add a new agent; the project picks it up automatically on next run."""
205
244
  path = _resolve_prompts_file(project)
@@ -209,6 +248,7 @@ def prompt_add(
209
248
  console.print(f"[red]{exc}[/]")
210
249
  raise typer.Exit(1) from exc
211
250
  console.print(f"[green]✓[/] Added agent '{agent}'. It will run on next start — no code changes needed.")
251
+ _maybe_launch_dashboard(dash, path.parent)
212
252
 
213
253
 
214
254
  @prompt_app.command("remove")
@@ -0,0 +1,40 @@
1
+ """The AgentX prompt-observability dashboard (Streamlit)."""
2
+ from __future__ import annotations
3
+
4
+ import os
5
+ import subprocess
6
+ import sys
7
+ from pathlib import Path
8
+
9
+ APP = Path(__file__).parent / "app.py"
10
+
11
+
12
def launch(port: int = 8501, provider: str | None = None, model: str | None = None,
           project: str | None = None, headless: bool = False) -> int:
    """Launch the Streamlit dashboard. Raises a helpful error if Streamlit is absent.

    Args:
        port: Value passed to Streamlit's ``--server.port``.
        provider: Exported to the app as ``AGENTX_DASH_PROVIDER`` when given.
        model: Exported to the app as ``AGENTX_DASH_MODEL`` when given.
        project: Exported as ``AGENTX_DASH_PROJECT``; defaults to the current
            working directory.
        headless: When true, adds ``--server.headless true``.

    Returns:
        The return code of the ``streamlit run`` child process.

    Raises:
        RuntimeError: If the ``streamlit`` package cannot be found.
    """
    # Probe for Streamlit without importing it: the real import happens in the
    # child process, so loading the package here would only slow the CLI down.
    from importlib.util import find_spec

    if find_spec("streamlit") is None:
        raise RuntimeError(
            "The dashboard needs Streamlit. Install it with:\n"
            " pip install 'agentx-kit[dashboard]'"
        )

    # Settings reach the Streamlit script through environment variables.
    env = os.environ.copy()
    if provider:
        env["AGENTX_DASH_PROVIDER"] = provider
    if model:
        env["AGENTX_DASH_MODEL"] = model
    env["AGENTX_DASH_PROJECT"] = str(project or Path.cwd())

    # Run Streamlit with the current interpreter so the child uses the same environment.
    cmd = [
        sys.executable, "-m", "streamlit", "run", str(APP),
        "--server.port", str(port),
        "--browser.gatherUsageStats", "false",
    ]
    if headless:
        cmd += ["--server.headless", "true"]
    return subprocess.run(cmd, env=env).returncode
38
+
39
+
40
+ __all__ = ["launch", "APP"]
@@ -0,0 +1,270 @@
1
+ """AgentX — Prompt Observability & Optimization Dashboard (Streamlit).
2
+
3
+ Launched via ``agentx dashboard``. An observability workbench for understanding
4
+ and refining how your prompts interact with the LLM:
5
+
6
+ • live token count, context-window utilization, and cost estimate
7
+ • a heuristic quality score with concrete suggestions + limit warnings
8
+ • one-click LLM optimization (refine while preserving intent) with a diff
9
+ • a test run showing response, tokens in/out, latency, and cost
10
+ • usage trends over time, logged locally to .agentx/insights.jsonl
11
+
12
+ If run inside a generated AgentX project, it reads/writes that project's
13
+ ``prompts.json`` so optimizations can be applied in place.
14
+ """
15
+ from __future__ import annotations
16
+
17
+ import difflib
18
+ import os
19
+ import time
20
+ from pathlib import Path
21
+
22
+ import streamlit as st
23
+
24
+ from agentx.insights import (
25
+ analyze_prompt,
26
+ count_tokens,
27
+ estimate_cost,
28
+ get_log,
29
+ optimize_prompt,
30
+ prompt_hash,
31
+ )
32
+ from agentx.insights.tokens import context_window
33
+ from agentx.providers import all_specs, get_spec
34
+
35
+ st.set_page_config(page_title="AgentX Prompt Dashboard", page_icon="🧬", layout="wide")
36
+
37
+ _PROJECT = Path(os.getenv("AGENTX_DASH_PROJECT", "."))
38
+ _DEFAULT_PROVIDER = os.getenv("AGENTX_DASH_PROVIDER", "openai")
39
+ _DEFAULT_MODEL = os.getenv("AGENTX_DASH_MODEL", "")
40
+
41
+
42
+ # --------------------------------------------------------------------------- #
43
+ # prompts.json integration (optional)
44
+ # --------------------------------------------------------------------------- #
45
def _load_prompts_store():
    """Locate and load the project's ``prompts.json``, if there is one.

    Returns:
        ``(prompts_store, path, data)`` — the ``agentx.scaffold.prompts_store``
        module, the path it found for ``_PROJECT``, and the loaded contents —
        or ``(None, None, None)`` when no file is found or loading fails.
    """
    try:
        from agentx.scaffold import prompts_store

        path = prompts_store.find_prompts_file(_PROJECT)
        if path:
            return prompts_store, path, prompts_store.load(path)
    except Exception as exc:  # noqa: BLE001 - the dashboard must still open in free-form mode
        # Stay best-effort, but say why the project prompts are unavailable instead
        # of silently behaving as if no prompts.json existed.
        st.sidebar.warning(f"Could not load prompts.json: {exc}")
    return None, None, None
55
+
56
+
57
def _log():
    """Return the insight log for ``.agentx/insights.jsonl`` under the project dir."""
    log_path = _PROJECT / ".agentx" / "insights.jsonl"
    return get_log(log_path)
59
+
60
+
61
+ # --------------------------------------------------------------------------- #
62
+ # Sidebar — provider/model + prompt source
63
+ # --------------------------------------------------------------------------- #
64
def _sidebar():
    """Render the sidebar and return the current selection.

    Returns:
        ``(provider, model, store, path, agent_name)``. ``store`` and ``path``
        come from ``_load_prompts_store()`` and are ``None`` when no
        ``prompts.json`` was loaded; ``agent_name`` is ``None`` in free-form mode.
    """
    st.sidebar.header("🧬 AgentX Dashboard")
    specs = all_specs()
    ids = [s.id for s in specs]
    provider = st.sidebar.selectbox(
        "Provider", ids, index=ids.index(_DEFAULT_PROVIDER) if _DEFAULT_PROVIDER in ids else 0,
    )
    # An explicit model from the launcher wins over the provider's default.
    default_model = _DEFAULT_MODEL or get_spec(provider).default_model
    model = st.sidebar.text_input("Model", value=default_model)

    store, path, data = _load_prompts_store()
    agent_name = None
    if store and data and data.get("agents"):
        st.sidebar.success(f"Project prompts: {path.parent.name}/prompts.json")
        choices = ["Free-form"] + list(data["agents"])
        source = st.sidebar.selectbox("Prompt source", choices)
        if source != "Free-form":
            agent_name = source
            meta = data["agents"][agent_name]
            loaded = meta.get("system_prompt") or ""
            # Copy the stored prompt into the editor only when the selected agent
            # changes, so edits made in the editor are not overwritten on every rerun.
            if st.session_state.get("_loaded_agent") != agent_name:
                st.session_state["prompt_text"] = loaded
                st.session_state["_loaded_agent"] = agent_name
    else:
        st.sidebar.info("No prompts.json found — running in free-form mode. "
                        "Run inside an AgentX project to edit its prompts.")
    return provider, model, store, path, agent_name
94
+
95
+
96
+ # --------------------------------------------------------------------------- #
97
+ # Panels
98
+ # --------------------------------------------------------------------------- #
99
def _metrics_row(text: str, model: str):
    """Show token count, context window, utilization and input cost; return the token count."""
    n_tokens = count_tokens(text, model)
    window = context_window(model)
    # A falsy window would divide by zero; report 0% instead.
    usage = n_tokens / window if window else 0.0
    input_cost = estimate_cost(n_tokens, 0, model)

    col_tokens, col_window, col_usage, col_cost = st.columns(4)
    col_tokens.metric("Tokens", f"{n_tokens:,}")
    col_window.metric("Context window", f"{window:,}")
    col_usage.metric("Utilization", f"{usage:.1%}")
    col_cost.metric("Est. input cost", f"${input_cost:.5f}")

    # The bar is clamped at 100% while the caption keeps the real figure.
    bar_fill = min(1.0, usage)
    st.progress(bar_fill, text=f"Context window usage: {usage:.1%}")
    return n_tokens
111
+
112
+
113
def _analysis_panel(text: str, model: str):
    """Render the quality score, the check list, suggestions and warnings for ``text``."""
    report = analyze_prompt(text, model)
    score = report.quality_score
    if score >= 75:
        badge = "🟢"
    elif score >= 50:
        badge = "🟡"
    else:
        badge = "🔴"
    st.subheader(f"{badge} Prompt quality: {score}/100")

    columns = st.columns(2)
    # Display names for the check keys; unknown keys are shown as-is.
    labels = {
        "has_role": "Role defined", "has_goal": "Goal stated",
        "has_output_format": "Output format", "has_examples": "Examples",
        "has_constraints": "Constraints", "not_vague": "Specific (not vague)",
        "reasonable_length": "Reasonable length",
    }
    # Alternate the checks between the two columns.
    for position, (check, passed) in enumerate(report.checks.items()):
        mark = "✅" if passed else "⬜"
        columns[position % 2].markdown(f"{mark} {labels.get(check, check)}")

    if report.suggestions:
        st.markdown("##### 💡 Suggestions")
        for tip in report.suggestions:
            st.markdown(f"- {tip}")
    for message in report.warnings:
        st.warning(message)
134
+
135
+
136
def _use_improved_prompt(improved: str) -> None:
    """Button callback: make the optimized text the editor's current content."""
    st.session_state["prompt_text"] = improved
    st.session_state.pop("opt_result", None)


def _apply_improved_prompt(store, path, agent_name: str, improved: str) -> None:
    """Button callback: save the optimized text to prompts.json and load it into the editor."""
    try:
        store.set_prompt(path, agent_name, improved)
    except Exception as exc:  # noqa: BLE001 - report any save failure in the UI
        st.session_state["_opt_flash"] = ("error", f"Could not save to prompts.json: {exc}")
        return
    st.session_state["prompt_text"] = improved
    st.session_state.pop("opt_result", None)
    st.session_state["_opt_flash"] = (
        "success",
        f"Saved to prompts.json → {agent_name}. Your project picks it up on next run.",
    )


def _optimize_panel(text: str, provider: str, model: str, store, path, agent_name):
    """Render the LLM optimization tab.

    Runs ``optimize_prompt`` on demand, keeps the latest result in
    ``st.session_state["opt_result"]``, shows the improved prompt with its token
    delta, rationale and diff, and offers to adopt or save it.

    Args:
        text: The prompt currently in the editor.
        provider: Provider id passed to ``optimize_prompt``.
        model: Model name used for optimization and token counting.
        store: Prompts-store module, or ``None`` outside a project.
        path: Path of the loaded ``prompts.json``, or ``None``.
        agent_name: Selected agent, or ``None`` in free-form mode.
    """
    st.subheader("✨ Optimize prompt")

    # One-shot message left behind by the save callback on the previous run.
    flash = st.session_state.pop("_opt_flash", None)
    if flash:
        level, message = flash
        (st.success if level == "success" else st.error)(message)

    feedback = st.text_input("Optional feedback (tone, format, length, focus…)", key="opt_feedback")
    if st.button("Optimize with LLM", type="primary"):
        with st.spinner("Refining prompt (preserving intent)…"):
            result = optimize_prompt(text, provider, model, feedback=feedback)
        if not result.ok:
            st.error(f"Optimization failed: {result.error}")
        else:
            st.session_state["opt_result"] = {"improved": result.improved, "rationale": result.rationale}
            _log().record(kind="optimize", model=model, prompt_hash=prompt_hash(text),
                          tokens_in=count_tokens(text, model), tokens_out=count_tokens(result.improved, model),
                          note="prompt optimization")

    res = st.session_state.get("opt_result")
    if not res:
        return

    before = count_tokens(text, model)
    after = count_tokens(res["improved"], model)
    delta = after - before
    st.caption(f"Tokens: {before} → {after} ({'+' if delta >= 0 else ''}{delta})")
    st.markdown("**Improved prompt**")
    st.code(res["improved"])
    if res["rationale"]:
        with st.expander("Why these changes?"):
            st.markdown(res["rationale"])
    with st.expander("Diff (original → improved)"):
        diff = difflib.unified_diff(
            text.splitlines(), res["improved"].splitlines(),
            fromfile="original", tofile="improved", lineterm="",
        )
        st.code("\n".join(diff) or "(no line-level changes)", language="diff")

    # The editor widget (key "prompt_text") is created before this panel runs, and
    # Streamlit forbids assigning a widget's session-state key after the widget
    # exists in the same run. Callbacks run before the next script run, so the
    # state change is made there; the rerun follows automatically.
    cols = st.columns(2)
    cols[0].button("Use as current prompt", on_click=_use_improved_prompt, args=(res["improved"],))
    if agent_name and store and path:
        cols[1].button(
            f"💾 Apply to '{agent_name}' in prompts.json",
            on_click=_apply_improved_prompt,
            args=(store, path, agent_name, res["improved"]),
        )
179
+
180
+
181
def _message_text(content) -> str:
    """Flatten a chat-message ``content`` value into plain text.

    Strings pass through. For a list, string items and the ``"text"`` entry of
    dict items are concatenated; anything else is converted with ``str``.
    """
    if isinstance(content, str):
        return content
    if isinstance(content, list):
        parts = []
        for item in content:
            if isinstance(item, str):
                parts.append(item)
            elif isinstance(item, dict) and isinstance(item.get("text"), str):
                parts.append(item["text"])
        return "".join(parts)
    return str(content)


def _run_panel(text: str, provider: str, model: str):
    """Render the test-run tab: send the prompt to the model and report the outcome.

    Shows the reply with estimated tokens in/out, latency and cost, and records
    the run in the insight log. Any failure is shown as an error message.

    Args:
        text: System prompt from the editor; omitted from the request when blank.
        provider: Provider id passed to ``get_chat_model``.
        model: Model name used for the call, token counting and cost estimate.
    """
    st.subheader("▶️ Test run")
    user_msg = st.text_area("User message", value="Hello! Introduce yourself.", height=80, key="run_user")
    if st.button("Run against the model"):
        try:
            from agentx import get_chat_model
            from langchain_core.messages import HumanMessage, SystemMessage

            llm = get_chat_model(provider, model)
            messages = [SystemMessage(text), HumanMessage(user_msg)] if text.strip() else [HumanMessage(user_msg)]
            # perf_counter is monotonic, so the measured latency cannot be skewed
            # by wall-clock adjustments the way time.time() can.
            t0 = time.perf_counter()
            with st.spinner("Calling the model…"):
                resp = llm.invoke(messages)
            latency = int((time.perf_counter() - t0) * 1000)
            # Message content may be a list of content blocks rather than a string;
            # flatten it before counting tokens and rendering.
            reply = _message_text(getattr(resp, "content", str(resp)))
            tin = count_tokens(text + user_msg, model)
            tout = count_tokens(reply, model)
            cost = estimate_cost(tin, tout, model)
            _log().record(kind="run", model=model, prompt_hash=prompt_hash(text),
                          tokens_in=tin, tokens_out=tout, cost_usd=cost, latency_ms=latency)
            m1, m2, m3, m4 = st.columns(4)
            m1.metric("Tokens in", f"{tin:,}")
            m2.metric("Tokens out", f"{tout:,}")
            m3.metric("Latency", f"{latency} ms")
            m4.metric("Est. cost", f"${cost:.5f}")
            st.markdown("**Response**")
            st.markdown(reply)
        except Exception as exc:  # noqa: BLE001
            st.error(f"Run failed: {exc}\n\nCheck your provider extra is installed and credentials are set.")
210
+
211
+
212
def _trends_panel():
    """Render aggregate usage metrics and per-run charts from the insight log."""
    st.subheader("📊 Usage & trends")
    insight_log = _log()
    totals = insight_log.aggregate()

    col_runs, col_tokens, col_cost, col_latency = st.columns(4)
    col_runs.metric("Runs", totals["runs"])
    col_tokens.metric("Total tokens", f"{totals['total_tokens']:,}")
    col_cost.metric("Total cost", f"${totals['total_cost_usd']:.4f}")
    col_latency.metric("Avg latency", f"{totals['avg_latency_ms']} ms")

    # Only events of kind "run" are charted.
    run_events = [event for event in insight_log.events() if event.get("kind") == "run"]
    if not run_events:
        st.info("No runs logged yet — use **Test run** to populate trends.")
        return

    try:
        import pandas as pd

        frame = pd.DataFrame(run_events)
        frame["ts"] = pd.to_datetime(frame["ts"])
        frame = frame.set_index("ts")
        charts = (
            ("###### Tokens per run", ["tokens_in", "tokens_out"], 200),
            ("###### Cost (USD) per run", ["cost_usd"], 160),
            ("###### Latency (ms) per run", ["latency_ms"], 160),
        )
        for heading, columns, height in charts:
            st.markdown(heading)
            st.line_chart(frame[columns], height=height)
    except Exception:  # noqa: BLE001 - pandas optional
        # Fall back to dumping the 20 most recent run events.
        st.write(run_events[-20:])
239
+
240
+
241
+ # --------------------------------------------------------------------------- #
242
def main():
    """Build the page: sidebar, prompt editor, live metrics, and the four tabs."""
    provider, model, store, path, agent_name = _sidebar()

    st.title("🧬 Prompt Observability & Optimization")
    st.caption("Edit a prompt, see token/cost/limits live, get suggestions, optimize, and test — all in one place.")

    # The editor is bound to session state through its key alone. Passing
    # `value=` as well, while the sidebar sets the same key through the Session
    # State API, makes Streamlit warn that the widget has both a default value
    # and a session-state value. With no stored value the widget starts empty.
    text = st.text_area(
        "System prompt",
        height=240, key="prompt_text",
        placeholder="You are a helpful assistant. Your goal is to…",
    )

    _metrics_row(text, model)
    st.divider()

    tab_analyze, tab_optimize, tab_run, tab_trends = st.tabs(
        ["🔎 Analysis", "✨ Optimize", "▶️ Test run", "📊 Trends"]
    )
    with tab_analyze:
        _analysis_panel(text, model)
    with tab_optimize:
        _optimize_panel(text, provider, model, store, path, agent_name)
    with tab_run:
        _run_panel(text, provider, model)
    with tab_trends:
        _trends_panel()
268
+
269
+
270
+ main()
@@ -0,0 +1,18 @@
1
+ """Prompt insights: token/cost analysis, quality heuristics, LLM optimization, logging."""
2
+ from .analyze import PromptAnalysis, analyze_prompt
3
+ from .log import InsightEvent, InsightLog, get_log, prompt_hash
4
+ from .optimize import OptimizationResult, optimize_prompt
5
+ from .tokens import (
6
+ TokenReport,
7
+ context_window,
8
+ count_tokens,
9
+ estimate_cost,
10
+ utilization,
11
+ )
12
+
13
+ __all__ = [
14
+ "analyze_prompt", "PromptAnalysis",
15
+ "optimize_prompt", "OptimizationResult",
16
+ "count_tokens", "estimate_cost", "context_window", "utilization", "TokenReport",
17
+ "InsightLog", "InsightEvent", "get_log", "prompt_hash",
18
+ ]
@@ -0,0 +1,85 @@
1
+ """Heuristic prompt analysis — quality score, suggestions, and limit warnings.
2
+
3
+ Offline and fast (no LLM). Encodes widely-recommended prompt-engineering levers
4
+ (2025–2026): a clear role + goal, explicit output format, examples, constraints,
5
+ lean/keyword-led wording, and context-window awareness. Use this for instant
6
+ feedback while editing; use :mod:`agentx.insights.optimize` for an LLM rewrite.
7
+ """
8
+ from __future__ import annotations
9
+
10
+ import re
11
+ from dataclasses import dataclass, field
12
+
13
+ from .tokens import context_window, count_tokens, utilization
14
+
15
+ _VAGUE = ("something", "stuff", "etc", "and so on", "good", "nice", "appropriate", "as needed")
16
+ _FORMAT_HINTS = ("json", "markdown", "bullet", "list", "table", "format", "schema", "output:", "respond with")
17
+ _EXAMPLE_HINTS = ("example", "e.g.", "for instance", "input:", "output:")
18
+ _CONSTRAINT_HINTS = ("must", "do not", "don't", "never", "only", "limit", "at most", "no more than", "avoid")
19
+ _ROLE_HINTS = ("you are", "act as", "your role", "as a ")
20
+ _GOAL_HINTS = ("your goal", "your task", "objective", "you should", "help the user", "your job")
21
+
22
+
23
@dataclass
class PromptAnalysis:
    """Outcome of a heuristic prompt analysis (see ``analyze_prompt``)."""

    # Token count of the analysed prompt.
    tokens: int
    # Character length of the analysed prompt.
    chars: int
    # Weighted sum of the passed checks, 0-100.
    quality_score: int
    # Check name -> whether the prompt passed it.
    checks: dict[str, bool] = field(default_factory=dict)
    # Human-readable improvement hints, one per failed check that has one.
    suggestions: list[str] = field(default_factory=list)
    # Length / context-window warnings.
    warnings: list[str] = field(default_factory=list)
31
+
32
+
33
def _has_hint(low: str, hints: tuple[str, ...]) -> bool:
    """Return True if any hint occurs in ``low`` starting at the beginning of a word.

    A plain substring test lets "information" satisfy "format", "has a "
    satisfy "as a " and "suitable" satisfy "table". Requiring that the match is
    not preceded by a word character removes those false positives while still
    accepting inflections such as "examples" or "formatted".
    """
    return any(re.search(rf"(?<!\w){re.escape(hint)}", low) for hint in hints)


def analyze_prompt(text: str, model: str = "gpt-4o-mini") -> PromptAnalysis:
    """Score a prompt and return actionable suggestions + limit warnings.

    Args:
        text: The prompt to analyse; ``None``/empty is treated as ``""``.
        model: Model id used for token counting and context-window lookup.

    Returns:
        A ``PromptAnalysis`` with the token and character counts, the
        per-check results, a weighted 0-100 score, one suggestion per failed
        check (except the length check, which is reported through warnings),
        and warnings for prompts over 1500 tokens, prompts using at least half
        of the context window, and prompts under 5 tokens.
    """
    text = text or ""
    low = text.lower()
    tokens = count_tokens(text, model)

    checks = {
        "has_role": _has_hint(low, _ROLE_HINTS),
        "has_goal": _has_hint(low, _GOAL_HINTS),
        "has_output_format": _has_hint(low, _FORMAT_HINTS),
        "has_examples": _has_hint(low, _EXAMPLE_HINTS),
        "has_constraints": _has_hint(low, _CONSTRAINT_HINTS),
        # Vague words must match as whole words/phrases.
        "not_vague": not any(re.search(rf"\b{re.escape(w)}\b", low) for w in _VAGUE),
        "reasonable_length": 5 <= tokens <= 1500,
    }
    # Weighted score (role + goal + format matter most); weights sum to 100.
    weights = {
        "has_role": 18, "has_goal": 18, "has_output_format": 18,
        "has_examples": 14, "has_constraints": 14, "not_vague": 10, "reasonable_length": 8,
    }
    score = sum(w for k, w in weights.items() if checks[k])

    # One suggestion per failed check, emitted in this order.
    advice = {
        "has_role": "Open with an explicit role, e.g. “You are a senior support agent…”.",
        "has_goal": "State the goal/task clearly so the model knows what success looks like.",
        "has_output_format": "Specify the output format (JSON/markdown/bullets) — reduces retries and tokens.",
        "has_examples": "Add 1–2 short input→output examples (few-shot) for tricky tasks.",
        "has_constraints": "Add constraints/guardrails (length caps, “do not …”) to keep output on-task.",
        "not_vague": "Replace vague words (e.g. “good”, “appropriate”, “stuff”) with concrete criteria.",
    }
    suggestions: list[str] = [msg for name, msg in advice.items() if not checks[name]]

    warnings: list[str] = []
    win = context_window(model)
    util = utilization(tokens, model)
    if tokens > 1500:
        warnings.append(
            f"Prompt is long ({tokens} tokens). Lead with keywords, trim boilerplate, and move "
            "stable context into a cached prefix to cut cost."
        )
    if util >= 0.5:
        warnings.append(f"Prompt already uses {util:.0%} of {model}'s {win:,}-token context window.")
    if tokens < 5:
        warnings.append("Prompt is very short — likely under-specified.")

    return PromptAnalysis(
        tokens=tokens, chars=len(text), quality_score=score,
        checks=checks, suggestions=suggestions, warnings=warnings,
    )
@@ -0,0 +1,88 @@
1
+ """Interaction log — append prompt edits/runs/optimizations for the dashboard.
2
+
3
+ A local JSONL at ``.agentx/insights.jsonl`` (project-local). Powers the usage
4
+ and trend charts: tokens in/out, cost, latency, model, per event.
5
+ """
6
+ from __future__ import annotations
7
+
8
+ import hashlib
9
+ import json
10
+ import threading
11
+ from dataclasses import asdict, dataclass, field
12
+ from datetime import datetime, timezone
13
+ from pathlib import Path
14
+
15
+ _lock = threading.Lock()
16
+
17
+
18
def prompt_hash(text: str) -> str:
    """Return a short stable id for ``text``: the first 10 hex chars of its SHA-256.

    ``None``/empty input is hashed as the empty string.
    """
    payload = (text or "").encode("utf-8")
    digest = hashlib.sha256(payload).hexdigest()
    return digest[:10]
20
+
21
+
22
def _now() -> str:
    """Return the current UTC time as a timezone-aware ISO-8601 string."""
    current = datetime.now(tz=timezone.utc)
    return current.isoformat()
24
+
25
+
26
@dataclass
class InsightEvent:
    """One row of the insights log.

    Every field has a default, so callers set only what they know.
    """

    # Creation timestamp, filled in by _now() when the event is built.
    ts: str = field(default_factory=_now)
    # Event type: run | edit | optimize
    kind: str = "run"
    # Model id the event relates to.
    model: str = ""
    # Short prompt identifier (see prompt_hash).
    prompt_hash: str = ""
    # Token counts in each direction.
    tokens_in: int = 0
    tokens_out: int = 0
    # Cost in USD and latency in milliseconds for this event.
    cost_usd: float = 0.0
    latency_ms: int = 0
    # Free-form annotation.
    note: str = ""
37
+
38
+
39
class InsightLog:
    """Append-only JSONL log of prompt events, one JSON object per line."""

    def __init__(self, path: str | Path = ".agentx/insights.jsonl"):
        """Remember ``path`` and create its parent directory if needed."""
        self.path = Path(path)
        self.path.parent.mkdir(parents=True, exist_ok=True)

    def add(self, event: InsightEvent) -> InsightEvent:
        """Append ``event`` as one JSON line and return it."""
        line = json.dumps(asdict(event)) + "\n"
        # The module-level lock serialises writers within this process.
        with _lock, self.path.open("a", encoding="utf-8") as fh:
            fh.write(line)
        return event

    def record(self, **kwargs) -> InsightEvent:
        """Build an ``InsightEvent`` from ``kwargs`` and append it."""
        return self.add(InsightEvent(**kwargs))

    def events(self, limit: int | None = None) -> list[dict]:
        """Return logged events, oldest first.

        Blank lines, lines that are not valid JSON, and JSON values that are
        not objects are skipped, so callers can rely on every row being a dict.

        Args:
            limit: Keep only the last ``limit`` rows. ``None``, ``0`` or a
                negative value returns every row.
        """
        try:
            raw = self.path.read_text(encoding="utf-8")
        except FileNotFoundError:
            # Reading directly avoids the exists()-then-read race.
            return []
        rows: list[dict] = []
        for line in raw.splitlines():
            line = line.strip()
            if not line:
                continue
            try:
                row = json.loads(line)
            except json.JSONDecodeError:
                continue
            if isinstance(row, dict):
                rows.append(row)
        if limit is None or limit <= 0:
            return rows
        return rows[-limit:]

    def aggregate(self) -> dict:
        """Summarise the log: event/run counts, token and cost totals, mean latency.

        Token, cost and latency figures cover ``kind == "run"`` rows only.
        Missing or ``null`` numeric fields count as zero, and rows without a
        latency are left out of the average.
        """
        rows = self.events()
        runs = [r for r in rows if r.get("kind") == "run"]
        total_tokens = sum((r.get("tokens_in") or 0) + (r.get("tokens_out") or 0) for r in runs)
        total_cost = round(sum((r.get("cost_usd") or 0.0) for r in runs), 6)
        lat = [r["latency_ms"] for r in runs if r.get("latency_ms")]
        return {
            "events": len(rows),
            "runs": len(runs),
            "total_tokens": total_tokens,
            "total_cost_usd": total_cost,
            "avg_latency_ms": round(sum(lat) / len(lat)) if lat else 0,
            "optimizations": sum(1 for r in rows if r.get("kind") == "optimize"),
        }

    def clear(self) -> None:
        """Delete the log file; a missing file is not an error."""
        with _lock:
            try:
                self.path.unlink()
            except FileNotFoundError:
                pass
85
+
86
+
87
def get_log(path: str | Path = ".agentx/insights.jsonl") -> InsightLog:
    """Return a new ``InsightLog`` bound to ``path``."""
    log = InsightLog(path)
    return log
@@ -0,0 +1,80 @@
1
+ """LLM-backed prompt refinement — rewrite a prompt while preserving intent.
2
+
3
+ Implements the "iterative refinement" pattern: improve an existing prompt by
4
+ applying best practices (clear role/goal, explicit output format, constraints,
5
+ lean wording) and any user feedback, *without* changing the original intent.
6
+ """
7
+ from __future__ import annotations
8
+
9
+ import logging
10
+ from dataclasses import dataclass
11
+
12
+ from ..providers import get_chat_model
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
+ _OPTIMIZER_SYSTEM = (
17
+ "You are a senior prompt engineer. You rewrite system prompts to be clearer, "
18
+ "more reliable, and more token-efficient WITHOUT changing their intent.\n"
19
+ "Apply: an explicit role + goal, a specified output format, concrete constraints, "
20
+ "lean keyword-led wording, and few-shot examples only if they add value. "
21
+ "Remove redundancy and vagueness. Keep it as short as possible while complete."
22
+ )
23
+
24
+ _OPTIMIZER_HUMAN = (
25
+ "Rewrite the PROMPT below.\n\n"
26
+ "PROMPT:\n{prompt}\n\n"
27
+ "{feedback_block}"
28
+ "Respond in EXACTLY this format:\n"
29
+ "===IMPROVED===\n<the improved prompt only>\n===RATIONALE===\n"
30
+ "<3-5 bullet points explaining the key changes>"
31
+ )
32
+
33
+
34
@dataclass
class OptimizationResult:
    """What ``optimize_prompt`` hands back: the rewrite plus its status."""

    # The prompt exactly as it was passed in.
    original: str
    # The rewritten prompt (the original again when optimization failed).
    improved: str
    # The model's explanation of the changes; may be empty.
    rationale: str
    # False when the prompt was empty or the optimization raised.
    ok: bool = True
    # Error text accompanying ok=False.
    error: str = ""
41
+
42
+
43
def optimize_prompt(
    prompt: str,
    provider: str | None = None,
    model: str | None = None,
    feedback: str = "",
    temperature: float = 0.3,
) -> OptimizationResult:
    """Return an LLM-refined version of ``prompt`` + a rationale. Never raises.

    Args:
        prompt: The prompt to rewrite. Empty/whitespace-only input yields a
            failed result without calling the model.
        provider: Passed through to ``get_chat_model``.
        model: Passed through to ``get_chat_model``.
        feedback: Optional extra guidance for the rewrite; ``None`` or blank
            means no feedback block is added to the request.
        temperature: Sampling temperature passed to ``get_chat_model``.

    Returns:
        An ``OptimizationResult``. On any failure ``ok`` is False, ``error``
        carries the message and ``improved`` equals the original prompt.
    """
    if not (prompt or "").strip():
        return OptimizationResult(prompt, prompt, "", ok=False, error="Empty prompt.")
    # Normalise before touching it: this line sits outside the try block, so a
    # None feedback would otherwise break the never-raises contract.
    feedback = feedback or ""
    feedback_block = f"Apply this feedback: {feedback}\n\n" if feedback.strip() else ""
    try:
        # Imported lazily; an ImportError is reported through the result.
        from langchain_core.prompts import ChatPromptTemplate

        chain = ChatPromptTemplate.from_messages(
            [("system", _OPTIMIZER_SYSTEM), ("human", _OPTIMIZER_HUMAN)]
        ) | get_chat_model(provider, model, temperature=temperature)
        raw = chain.invoke({"prompt": prompt, "feedback_block": feedback_block}).content
        improved, rationale = _parse(raw, fallback=prompt)
        return OptimizationResult(prompt, improved, rationale)
    except Exception as exc:  # noqa: BLE001 - deliberate: never raise to the caller
        logger.warning("Prompt optimization failed: %s", exc)
        return OptimizationResult(prompt, prompt, "", ok=False, error=str(exc))
66
+
67
+
68
def _parse(raw: str, fallback: str) -> tuple[str, str]:
    """Split a model reply into ``(improved_prompt, rationale)``.

    The reply is expected to contain ``===IMPROVED===`` followed by the
    rewritten prompt and, optionally, ``===RATIONALE===`` followed by the
    explanation. Without the first marker the whole reply is taken as the
    improved prompt. Whenever the improved text would be empty, ``fallback``
    is returned instead, so the caller never receives a blank prompt.
    """
    text = raw or ""
    _, marker, rest = text.partition("===IMPROVED===")
    if not marker:
        return text.strip() or fallback, ""
    improved, _, rationale = rest.partition("===RATIONALE===")
    return improved.strip() or fallback, rationale.strip()
@@ -0,0 +1,97 @@
1
+ """Token counting, cost estimation, and context-window utilization.
2
+
3
+ Uses ``tiktoken`` when available for accurate counts; otherwise falls back to a
4
+ ~4-chars/token heuristic. Pricing and context windows are approximate, editable
5
+ defaults — override per your contract. Cost is *derived* from tokens (the
6
+ industry convention; OTel GenAI standardises tokens, not cost).
7
+ """
8
+ from __future__ import annotations
9
+
10
+ from dataclasses import dataclass
11
+
12
+ # Approximate context windows (tokens). Matched by substring on the model id.
13
+ _CONTEXT_WINDOWS: dict[str, int] = {
14
+ "gpt-4o": 128_000, "gpt-4.1": 1_000_000, "gpt-4": 128_000, "o1": 200_000, "o3": 200_000,
15
+ "gpt-3.5": 16_385,
16
+ "claude-3-5": 200_000, "claude-3": 200_000, "claude": 200_000,
17
+ "gemini-1.5": 1_000_000, "gemini-2": 1_000_000, "gemini": 1_000_000,
18
+ "llama-3.3": 128_000, "llama-3.1": 128_000, "llama3": 8_192, "llama": 8_192,
19
+ "mixtral": 32_768, "mistral": 32_768, "qwen": 32_768,
20
+ }
21
+
22
+ # Approximate USD per 1K tokens, as (input, output). Defaults are conservative.
23
+ _PRICING: dict[str, tuple[float, float]] = {
24
+ "gpt-4o-mini": (0.00015, 0.0006), "gpt-4o": (0.0025, 0.01), "gpt-4.1": (0.002, 0.008),
25
+ "gpt-4": (0.03, 0.06), "gpt-3.5": (0.0005, 0.0015),
26
+ "claude-3-5-sonnet": (0.003, 0.015), "claude-3-5-haiku": (0.0008, 0.004),
27
+ "claude-3-opus": (0.015, 0.075), "claude": (0.003, 0.015),
28
+ "gemini-1.5-flash": (0.000075, 0.0003), "gemini-1.5-pro": (0.00125, 0.005), "gemini": (0.000075, 0.0003),
29
+ "llama": (0.0, 0.0), "qwen": (0.0, 0.0), "mistral": (0.0, 0.0),
30
+ }
31
+
32
+ _DEFAULT_WINDOW = 8_192
33
+ _DEFAULT_PRICE = (0.0, 0.0)
34
+
35
+
36
def _match(model: str, table: dict):
    """Return the ``table`` value whose key is a substring of ``model``, or None.

    The lookup is case-insensitive on ``model`` and tries longer keys first, so
    "gpt-4o-mini" is preferred over "gpt-4".
    """
    needle = (model or "").lower()
    by_length = sorted(table, key=len, reverse=True)
    return next((table[key] for key in by_length if key in needle), None)
45
+
46
+
47
def count_tokens(text: str, model: str = "gpt-4o-mini") -> int:
    """Count tokens in ``text`` for ``model``. Accurate via tiktoken if installed.

    Args:
        text: Text to measure; ``None``/empty returns 0.
        model: Model id used to pick the tiktoken encoding. Ids tiktoken does
            not know use ``o200k_base`` for gpt-4o / gpt-4.1 names and
            ``cl100k_base`` otherwise.

    Returns:
        The token count, or a ~4-chars/token estimate (minimum 1) when tiktoken
        is missing or fails for any reason. Never raises.
    """
    if not text:
        return 0
    try:
        import tiktoken

        try:
            enc = tiktoken.encoding_for_model(model)
        except KeyError:
            # Compare case-insensitively and tolerate a None model id.
            name = (model or "").lower()
            newer_family = "gpt-4o" in name or "gpt-4.1" in name
            enc = tiktoken.get_encoding("o200k_base" if newer_family else "cl100k_base")
        # By default encode() raises ValueError on text that spells a special
        # token (e.g. "<|endoftext|>"), which would silently downgrade us to the
        # heuristic. Prompts are plain text, so encode such spans as ordinary text.
        return len(enc.encode(text, disallowed_special=()))
    except Exception:  # noqa: BLE001 - tiktoken absent or model unknown
        # ~4 chars/token heuristic with a small floor.
        return max(1, round(len(text) / 4))
62
+
63
+
64
def context_window(model: str) -> int:
    """Return the approximate context window (tokens) for ``model``.

    Falls back to ``_DEFAULT_WINDOW`` when no table entry matches.
    """
    window = _match(model, _CONTEXT_WINDOWS)
    return window if window else _DEFAULT_WINDOW
66
+
67
+
68
def utilization(tokens: int, model: str) -> float:
    """Return ``tokens`` as a fraction of the model's context window, rounded to 4 places.

    Yields 0.0 if the window is reported as zero.
    """
    window = context_window(model)
    if not window:
        return 0.0
    return round(tokens / window, 4)
72
+
73
+
74
def estimate_cost(input_tokens: int, output_tokens: int = 0, model: str = "gpt-4o-mini") -> float:
    """Estimate the USD cost of a call from its token counts.

    Uses the per-1K-token (input, output) rates matched for ``model``, or
    ``_DEFAULT_PRICE`` when nothing matches. The result is rounded to 6 places.
    """
    rates = _match(model, _PRICING) or _DEFAULT_PRICE
    input_rate, output_rate = rates
    total = input_tokens / 1000 * input_rate + output_tokens / 1000 * output_rate
    return round(total, 6)
78
+
79
+
80
@dataclass
class TokenReport:
    """Token count, context-window usage and input-cost estimate for one text."""

    model: str
    tokens: int
    context_window: int
    utilization: float
    est_input_cost: float

    @classmethod
    def for_text(cls, text: str, model: str) -> "TokenReport":
        """Build a report for ``text`` as it would be sent to ``model``."""
        token_count = count_tokens(text, model)
        window = context_window(model)
        used = utilization(token_count, model)
        # Input-only estimate: output tokens are passed as 0.
        input_cost = estimate_cost(token_count, 0, model)
        return cls(
            model=model,
            tokens=token_count,
            context_window=window,
            utilization=used,
            est_input_cost=input_cost,
        )
@@ -0,0 +1,94 @@
1
+ """Tests for the prompt-insights core (token/cost, analysis, log). No live LLM."""
2
+ import pytest
3
+
4
+ from agentx.insights import (
5
+ analyze_prompt,
6
+ context_window,
7
+ count_tokens,
8
+ estimate_cost,
9
+ get_log,
10
+ prompt_hash,
11
+ utilization,
12
+ )
13
+
14
+
15
+ # ----- tokens -----
16
def test_count_tokens_scales_with_length():
    """Empty text counts as zero and longer text yields more tokens."""
    model = "gpt-4o-mini"
    assert count_tokens("", model) == 0
    few = count_tokens("hello world", model)
    many = count_tokens("hello world " * 100, model)
    assert few > 0
    assert many > few
21
+
22
+
23
def test_context_window_and_utilization():
    """Known models get a large window, unknown ones the default; utilization is a small fraction."""
    assert context_window("gpt-4o-mini") >= 100_000
    # Unmatched model ids fall back to the default window.
    assert context_window("totally-unknown-model") == 8192
    used = utilization(1000, "gpt-4o-mini")
    assert used >= 0.0
    assert used < 0.1
27
+
28
+
29
def test_estimate_cost_monotonic():
    """gpt-4o costs more than gpt-4o-mini for the same input, and zero tokens cost nothing."""
    mini_cost = estimate_cost(1000, 0, "gpt-4o-mini")
    full_cost = estimate_cost(1000, 0, "gpt-4o")
    assert mini_cost >= 0
    assert full_cost > mini_cost
    assert estimate_cost(0, 0, "gpt-4o") == 0.0
34
+
35
+
36
+ # ----- analysis -----
37
def test_analyze_good_prompt_scores_high():
    """A prompt with role, goal, format, constraint and example scores at least 70."""
    prompt = (
        "You are a senior support agent. Your goal is to resolve billing issues. "
        "Respond in JSON with fields {reason, action}. Do not invent policy. "
        "Example: input: 'refund?' output: {\"reason\": \"...\", \"action\": \"...\"}."
    )
    analysis = analyze_prompt(prompt, "gpt-4o-mini")
    assert analysis.quality_score >= 70
    for check in ("has_role", "has_goal", "has_output_format"):
        assert analysis.checks[check]
46
+
47
+
48
def test_analyze_poor_prompt_has_suggestions():
    """A vague three-word prompt scores low, fails the vagueness check and gets suggestions."""
    analysis = analyze_prompt("do good stuff", "gpt-4o-mini")
    assert analysis.quality_score < 50
    assert analysis.suggestions
    # 'good'/'stuff' are on the vague-word list.
    assert analysis.checks["not_vague"] is False
53
+
54
+
55
def test_analyze_long_prompt_warns():
    """A 4000-word prompt triggers the 'long prompt' warning."""
    analysis = analyze_prompt("word " * 4000, "gpt-4o-mini")
    lowered = [warning.lower() for warning in analysis.warnings]
    assert any("long" in warning for warning in lowered)
58
+
59
+
60
+ # ----- log -----
61
def test_insight_log_roundtrip(tmp_path):
    """Recorded events are read back and aggregated over 'run' rows only."""
    log = get_log(tmp_path / ".agentx" / "insights.jsonl")
    run_events = (
        {"tokens_in": 100, "tokens_out": 50, "cost_usd": 0.001, "latency_ms": 420},
        {"tokens_in": 200, "tokens_out": 80, "cost_usd": 0.002, "latency_ms": 380},
    )
    for fields in run_events:
        log.record(kind="run", model="gpt-4o-mini", **fields)
    log.record(kind="optimize", model="gpt-4o-mini", tokens_in=100, tokens_out=90)

    summary = log.aggregate()
    assert summary["runs"] == 2
    # The optimize event's tokens must not be counted.
    assert summary["total_tokens"] == 100 + 50 + 200 + 80
    assert summary["optimizations"] == 1
    assert summary["avg_latency_ms"] == 400
71
+
72
+
73
def test_prompt_hash_stable():
    """Equal text hashes equally; different text hashes differently."""
    first = prompt_hash("abc")
    again = prompt_hash("abc")
    other = prompt_hash("abd")
    assert first == again
    assert first != other
76
+
77
+
78
+ # ----- dashboard launcher import is lazy/graceful -----
79
def test_dashboard_launch_requires_streamlit(monkeypatch):
    """launch() raises a RuntimeError naming the dashboard extra when streamlit cannot be imported."""
    import builtins

    from agentx import dashboard

    original_import = builtins.__import__

    def import_without_streamlit(name, *args, **kwargs):
        # Let every import through except the top-level ``streamlit`` package.
        if name != "streamlit":
            return original_import(name, *args, **kwargs)
        raise ImportError("no streamlit")

    monkeypatch.setattr(builtins, "__import__", import_without_streamlit)
    with pytest.raises(RuntimeError) as raised:
        dashboard.launch()
    assert "agentx-kit[dashboard]" in str(raised.value)
File without changes
File without changes
File without changes
File without changes