riptide-watergraph 0.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (73) hide show
  1. riptide_watergraph/__init__.py +82 -0
  2. riptide_watergraph/cli.py +364 -0
  3. riptide_watergraph/config.py +58 -0
  4. riptide_watergraph/evaluation/__init__.py +18 -0
  5. riptide_watergraph/evaluation/runner.py +135 -0
  6. riptide_watergraph/evaluation/suite.py +51 -0
  7. riptide_watergraph/gateway/__init__.py +7 -0
  8. riptide_watergraph/gateway/demo_gateway.py +177 -0
  9. riptide_watergraph/gateway/litellm_gateway.py +106 -0
  10. riptide_watergraph/gateway/resilient.py +72 -0
  11. riptide_watergraph/graph/__init__.py +6 -0
  12. riptide_watergraph/graph/builder.py +164 -0
  13. riptide_watergraph/graph/nodes.py +1012 -0
  14. riptide_watergraph/graph/state.py +63 -0
  15. riptide_watergraph/graph/waves.py +35 -0
  16. riptide_watergraph/guardrails/__init__.py +12 -0
  17. riptide_watergraph/guardrails/injection.py +39 -0
  18. riptide_watergraph/guardrails/pii.py +41 -0
  19. riptide_watergraph/guardrails/pipeline.py +43 -0
  20. riptide_watergraph/interfaces/__init__.py +37 -0
  21. riptide_watergraph/interfaces/agent.py +17 -0
  22. riptide_watergraph/interfaces/embedding.py +12 -0
  23. riptide_watergraph/interfaces/gateway.py +72 -0
  24. riptide_watergraph/interfaces/guardrail.py +32 -0
  25. riptide_watergraph/interfaces/memory.py +54 -0
  26. riptide_watergraph/interfaces/reflector.py +33 -0
  27. riptide_watergraph/interfaces/reranker.py +18 -0
  28. riptide_watergraph/interfaces/swarm.py +48 -0
  29. riptide_watergraph/interfaces/tools.py +65 -0
  30. riptide_watergraph/mcp/__init__.py +18 -0
  31. riptide_watergraph/mcp/adapter.py +66 -0
  32. riptide_watergraph/mcp/client.py +57 -0
  33. riptide_watergraph/mcp/stdio.py +83 -0
  34. riptide_watergraph/memory/__init__.py +26 -0
  35. riptide_watergraph/memory/embedding.py +48 -0
  36. riptide_watergraph/memory/inmemory.py +59 -0
  37. riptide_watergraph/memory/jsonfile.py +143 -0
  38. riptide_watergraph/memory/pgvector.py +101 -0
  39. riptide_watergraph/memory/ranking.py +128 -0
  40. riptide_watergraph/memory/reflection.py +80 -0
  41. riptide_watergraph/memory/rerank.py +27 -0
  42. riptide_watergraph/memory/types.py +49 -0
  43. riptide_watergraph/observability/__init__.py +13 -0
  44. riptide_watergraph/observability/cost.py +121 -0
  45. riptide_watergraph/observability/tracing.py +78 -0
  46. riptide_watergraph/py.typed +0 -0
  47. riptide_watergraph/server/__init__.py +9 -0
  48. riptide_watergraph/server/app.py +578 -0
  49. riptide_watergraph/server/static/app.js +1139 -0
  50. riptide_watergraph/server/static/index.html +49 -0
  51. riptide_watergraph/server/static/styles.css +329 -0
  52. riptide_watergraph/service.py +447 -0
  53. riptide_watergraph/swarm/__init__.py +13 -0
  54. riptide_watergraph/swarm/cost.py +46 -0
  55. riptide_watergraph/swarm/heuristic_composer.py +75 -0
  56. riptide_watergraph/swarm/llm_composer.py +110 -0
  57. riptide_watergraph/swarm/plan_composer.py +50 -0
  58. riptide_watergraph/swarm/role_library.py +324 -0
  59. riptide_watergraph/swarm/roles.py +127 -0
  60. riptide_watergraph/swarm/static_composer.py +27 -0
  61. riptide_watergraph/tools/__init__.py +6 -0
  62. riptide_watergraph/tools/dev_tools.py +298 -0
  63. riptide_watergraph/tools/enterprise.py +96 -0
  64. riptide_watergraph/tools/examples.py +179 -0
  65. riptide_watergraph/tools/library.py +925 -0
  66. riptide_watergraph/tools/registry.py +114 -0
  67. riptide_watergraph/workflows.py +154 -0
  68. riptide_watergraph-0.9.0.dist-info/METADATA +470 -0
  69. riptide_watergraph-0.9.0.dist-info/RECORD +73 -0
  70. riptide_watergraph-0.9.0.dist-info/WHEEL +5 -0
  71. riptide_watergraph-0.9.0.dist-info/entry_points.txt +3 -0
  72. riptide_watergraph-0.9.0.dist-info/licenses/LICENSE +21 -0
  73. riptide_watergraph-0.9.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,82 @@
1
+ """Riptide-Watergraph — a 'like water', layered multi-agent framework on LangGraph.
2
+
3
+ Public surface (Stage 1):
4
+
5
+ from riptide_watergraph import build_graph, LiteLLMGateway, InMemoryMemory
6
+ from riptide_watergraph import default_registry, SingleAgentComposer
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ __version__ = "0.9.0"
12
+
13
+ from .gateway import DemoGateway, LiteLLMGateway, ResilientGateway
14
+ from .graph import build_graph
15
+ from .guardrails import (
16
+ GuardrailPipeline,
17
+ PiiGuardrail,
18
+ PromptInjectionGuardrail,
19
+ default_guardrails,
20
+ )
21
+ from .interfaces import (
22
+ Agent,
23
+ CompletionResult,
24
+ Guardrail,
25
+ GuardrailResult,
26
+ Memory,
27
+ Message,
28
+ ModelGateway,
29
+ Reflector,
30
+ SwarmComposer,
31
+ SwarmDecision,
32
+ ToolRegistry,
33
+ ToolSpec,
34
+ Trajectory,
35
+ )
36
+ from .mcp import FakeMcpClient, McpToolInfo, register_mcp_tools
37
+ from .memory import InMemoryMemory, JsonFileMemory, LLMReflector, MemoryType
38
+ from .observability import CostTracker, UsageRecord
39
+ from .swarm import HeuristicSwarmComposer, LLMSwarmComposer, SingleAgentComposer
40
+ from .tools import StaticToolRegistry, default_registry
41
+
42
+ __all__ = [
43
+ "__version__",
44
+ "build_graph",
45
+ "LiteLLMGateway",
46
+ "DemoGateway",
47
+ "ResilientGateway",
48
+ "InMemoryMemory",
49
+ "JsonFileMemory",
50
+ "LLMReflector",
51
+ "MemoryType",
52
+ "StaticToolRegistry",
53
+ "default_registry",
54
+ "SingleAgentComposer",
55
+ "HeuristicSwarmComposer",
56
+ "LLMSwarmComposer",
57
+ # guardrails + observability (Stage 4)
58
+ "GuardrailPipeline",
59
+ "default_guardrails",
60
+ "PiiGuardrail",
61
+ "PromptInjectionGuardrail",
62
+ "Guardrail",
63
+ "GuardrailResult",
64
+ "CostTracker",
65
+ "UsageRecord",
66
+ # interfaces
67
+ "Agent",
68
+ "ModelGateway",
69
+ "Message",
70
+ "CompletionResult",
71
+ "Memory",
72
+ "Reflector",
73
+ "Trajectory",
74
+ "ToolRegistry",
75
+ "ToolSpec",
76
+ "SwarmComposer",
77
+ "SwarmDecision",
78
+ # MCP tool interop
79
+ "register_mcp_tools",
80
+ "FakeMcpClient",
81
+ "McpToolInfo",
82
+ ]
@@ -0,0 +1,364 @@
1
+ """Command-line entrypoint.
2
+
3
+ ``riptide run "<task>"`` runs a task end-to-end (guardrails -> recall -> orchestrate ->
4
+ worker/swarm -> approval -> finalize -> reflect -> output), attributing usage to a
5
+ tenant. ``riptide costs`` prints the per-tenant cost dashboard.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import argparse
11
+ import json
12
+ import sys
13
+ import time
14
+ import uuid
15
+ from pathlib import Path
16
+ from typing import Any
17
+
18
+ from langgraph.checkpoint.sqlite import SqliteSaver
19
+ from langgraph.types import Command
20
+
21
+ from .config import get_settings
22
+ from .evaluation import EvalRunner
23
+ from .gateway import DemoGateway, LiteLLMGateway, ResilientGateway
24
+ from .graph import build_graph
25
+ from .guardrails import default_guardrails
26
+ from .memory import HashingEmbedding, JsonFileMemory, LexicalOverlapReranker
27
+ from .memory.reflection import LLMReflector
28
+ from .observability.cost import (
29
+ BudgetExceeded,
30
+ CostTracker,
31
+ UsageRecord,
32
+ cost_from_usage,
33
+ estimate_tokens,
34
+ )
35
+ from .observability.tracing import init_tracing
36
+ from .service import enforce_budget
37
+ from .interfaces import SwarmComposer
38
+ from .swarm import HeuristicSwarmComposer, LLMSwarmComposer, SingleAgentComposer
39
+ from .tools import default_registry
40
+
41
+
42
def _prompt_approval(payload: dict[str, Any]) -> bool:
    """Show a pending side-effecting tool call and ask the operator to approve it.

    Prints the ``tool``, ``arguments`` and ``subtask`` entries of *payload*
    (a missing key prints as ``None``) and then reads one line from stdin.

    Returns:
        ``True`` only when the trimmed, lower-cased reply is ``y`` or ``yes``;
        any other reply, including an empty one, counts as a refusal.
    """
    print("\n HUMAN APPROVAL REQUIRED")
    for label in ("tool", "arguments", "subtask"):
        print(f" {label}: {payload.get(label)}")
    answer = input(" Approve? [y/N] ")
    normalized = answer.strip().lower()
    return normalized == "y" or normalized == "yes"
50
+
51
+
52
def _prompt_clarification(payload: dict[str, Any]) -> str:
    """Relay a worker's clarifying question to the operator and return the answer.

    Prints the ``subtask`` and ``question`` entries of *payload* (a missing
    key prints as ``None``), then reads one line from stdin.

    Returns:
        The operator's reply with surrounding whitespace removed.
    """
    banner = "\n CLARIFICATION REQUESTED"
    print(banner)
    print(f" subtask: {payload.get('subtask')}")
    print(f" question: {payload.get('question')}")
    raw_answer = input(" Your answer: ")
    return raw_answer.strip()
58
+
59
+
60
def _run_task(
    task: str,
    *,
    auto_approve: bool,
    offline: bool = False,
    memory_on: bool = True,
    single: bool = False,
    tenant_id: str = "default",
    guardrails_on: bool = True,
    llm_composer: bool = False,
    critic: bool = False,
    supervisor: bool = False,
    react_steps: int = 1,
    vote_k: int = 1,
    final_schema: dict[str, Any] | None = None,
) -> int:
    """Run one task through a freshly built graph and return an exit code.

    Args:
        task: The task text handed to the graph.
        auto_approve: Answer approval interrupts with ``approved=True`` and
            clarification interrupts with a canned "best assumption" reply
            instead of prompting on stdin.
        offline: Use ``DemoGateway`` instead of ``LiteLLMGateway``.
        memory_on: Attach a per-tenant ``JsonFileMemory`` and an
            ``LLMReflector``; when false both are ``None``.
        single: Use ``SingleAgentComposer`` (takes precedence over
            *llm_composer*).
        tenant_id: Tenant used for the budget check, the memory path, the
            graph input and usage recording.
        guardrails_on: Attach ``default_guardrails()``; otherwise ``None``.
        llm_composer: Use ``LLMSwarmComposer`` instead of the heuristic one.
        critic: Forwarded to ``build_graph`` as ``enable_critic``.
        supervisor: Forwarded to ``build_graph`` as ``enable_supervisor``.
        react_steps: Forwarded to ``build_graph`` as ``max_steps``.
        vote_k: Forwarded to ``build_graph`` as ``vote_k``.
        final_schema: Forwarded to ``build_graph`` as ``final_schema``.

    Returns:
        ``2`` when ``enforce_budget`` raises ``BudgetExceeded`` (nothing is
        run), otherwise ``0`` after the result is printed and usage recorded.
    """
    settings = get_settings()
    init_tracing(settings)
    # The checkpoint database lives in a directory that may not exist yet.
    Path(settings.checkpoint_path).parent.mkdir(parents=True, exist_ok=True)

    try:
        enforce_budget(settings, tenant_id)
    except BudgetExceeded as exc:
        print(f" BUDGET EXCEEDED: {exc}")
        return 2

    # Per-role models fall back to the default model when left empty.
    model = settings.riptide_watergraph_model
    planner_model = settings.planner_model or model
    worker_model = settings.worker_model or model

    if offline:
        base_gateway = DemoGateway()
    else:
        base_gateway = LiteLLMGateway(default_model=model)
    # Wrap with timeout + retry so transient API failures don't crash the run.
    gateway = ResilientGateway(base_gateway)
    registry = default_registry()

    composer: SwarmComposer
    if single:
        composer = SingleAgentComposer(model=planner_model)
    elif llm_composer:
        composer = LLMSwarmComposer(gateway, model=planner_model)
    else:
        composer = HeuristicSwarmComposer(model=planner_model)

    # Stage 2 + 4: per-tenant persistent memory (lessons never leak across tenants),
    # with hybrid dense+lexical retrieval (offline embedder) and reranking.
    memory = None
    reflector = None
    if memory_on:
        memory = JsonFileMemory(
            settings.tenant_memory_path(tenant_id),
            embedding=HashingEmbedding(),
            reranker=LexicalOverlapReranker(),
        )
        reflector = LLMReflector(gateway, model=planner_model)

    guardrails = default_guardrails() if guardrails_on else None

    # A fresh thread id per run doubles as the session id in the graph input.
    thread_id = str(uuid.uuid4())
    config = {"configurable": {"thread_id": thread_id}}

    with SqliteSaver.from_conn_string(settings.checkpoint_path) as checkpointer:
        graph = build_graph(
            gateway=gateway,
            registry=registry,
            composer=composer,
            model=model,
            checkpointer=checkpointer,
            memory=memory,
            reflector=reflector,
            guardrails=guardrails,
            planner_model=planner_model,
            worker_model=worker_model,
            enable_critic=critic,
            enable_supervisor=supervisor,
            max_steps=react_steps,
            vote_k=vote_k,
            final_schema=final_schema,
        )

        print(f" tenant={tenant_id} thread={thread_id}")
        initial_state = {"task": task, "session_id": thread_id, "tenant_id": tenant_id}
        result = graph.invoke(initial_state, config)

        # Resume loop: handle approval and clarification interrupts. Only the
        # first pending interrupt is inspected on each pass.
        while "__interrupt__" in result:
            payload = result["__interrupt__"][0].value
            is_clarification = (
                isinstance(payload, dict) and payload.get("type") == "clarification"
            )
            if is_clarification:
                if auto_approve:
                    answer = "(no clarification available; proceed with your best assumption)"
                    print(f" auto-clarify: {payload.get('question')}")
                else:
                    answer = _prompt_clarification(payload)
                resume_value = {"answer": answer}
            else:
                if auto_approve:
                    approved = True
                    print(f" auto-approved: {payload.get('tool')}")
                else:
                    approved = _prompt_approval(payload)
                resume_value = {"approved": approved}
            result = graph.invoke(Command(resume=resume_value), config)

        _print_result(result, memory_on=memory_on, memory=memory)
        _record_usage(settings, tenant_id, task, result)
    return 0
162
+
163
+
164
def _print_result(result: dict, *, memory_on: bool, memory) -> None:
    """Print a human-readable summary of a finished run to stdout.

    A blocked run prints only the guardrail violations and the final answer.
    Otherwise the summary covers, in order and only when present: the swarm
    composition, the subtask -> role mapping, critic verdicts, guardrail
    violations, recalled lessons, the final answer, structured output,
    tool-call validity and (when memory is enabled) the learning outcome.

    Args:
        result: Final graph state; every key is optional.
        memory_on: Whether memory was enabled for the run.
        memory: The memory store or ``None``; only ``len(memory)`` is used.
    """
    answer_text = result.get("final_answer") or "(none)"

    if result.get("blocked"):
        violations = result.get("guard_violations") or []
        print(f" BLOCKED by guardrails: {', '.join(violations)}")
        print("\n FINAL ANSWER\n" + answer_text)
        return

    decision = result.get("swarm_decision") or {}
    if decision:
        print(
            f" composition: {decision.get('mode')} "
            f"(parallelism={decision.get('parallelism')}) - {decision.get('rationale')}"
        )

    roles = result.get("roles") or []
    plan = result.get("plan") or []
    if roles:
        pairings = []
        for idx, role in enumerate(roles):
            # A role without a matching plan entry is shown against "?".
            subtask = plan[idx] if idx < len(plan) else "?"
            pairings.append(f"{subtask} -> {role}")
        print(" roles: " + ", ".join(pairings))

    verdicts = result.get("verdicts") or []
    if verdicts:
        n_pass = len([v for v in verdicts if v.get("verdict") == "pass"])
        print(f" critic: {n_pass}/{len(verdicts)} subtasks verified")

    for tag in ("guard_violations", "guard_violations_out"):
        flagged = result.get(tag)
        if flagged:
            print(f" guardrails ({tag}): {', '.join(flagged)}")

    recalled = result.get("recalled_lessons") or []
    if recalled:
        print(f"\n recalled {len(recalled)} lesson(s):")
        for lesson in recalled:
            print(f" - {lesson}")

    print("\n FINAL ANSWER\n" + answer_text)

    structured = result.get("structured_output")
    if structured:
        print("\n STRUCTURED OUTPUT\n" + json.dumps(structured, indent=2))

    metrics = result.get("metrics") or {}
    total = metrics.get("tool_calls_total", 0)
    valid = metrics.get("tool_calls_valid", 0)
    if total:
        print(f"\n tool-call validity: {valid}/{total} = {valid / total:.0%}")

    if not memory_on or memory is None:
        return
    stored = result.get("stored_lessons") or []
    outcome = "success" if result.get("success") else "needs-improvement"
    print(
        f" outcome: {outcome}; learned {len(stored)} lesson(s) "
        f"(memory now holds {len(memory)})"
    )
212
+
213
+
214
def _record_usage(settings, tenant_id: str, task: str, result: dict) -> None:
    """Append one usage record for this run to the tenant usage log.

    Args:
        settings: Settings object; ``usage_log_path`` and
            ``riptide_watergraph_model`` are read from it.
        tenant_id: Tenant the run is attributed to.
        task: The task text that was run.
        result: Final graph state. ``swarm_decision``, ``results``,
            ``final_answer``, ``metrics`` and ``blocked`` are read; all are
            optional and may be ``None``.
    """
    decision = result.get("swarm_decision") or {}

    # Text used for the token estimate: task, every subtask output and the
    # final answer. Parts are space-separated so the task is not glued onto
    # the first output, and ``None`` outputs are tolerated.
    outputs = [r.get("output") or "" for r in (result.get("results") or [])]
    parts = [task, *outputs, result.get("final_answer") or ""]
    blob = " ".join(part for part in parts if part)

    # Prefer real token usage from the gateway; fall back to the composer estimate.
    usage = (result.get("metrics") or {}).get("usage") or {}
    actual_total = int(usage.get("total_tokens", 0) or 0)
    if actual_total > 0:
        cost = cost_from_usage(settings.riptide_watergraph_model, usage)
    else:
        # ``or 0.0`` also covers an explicit ``None`` estimate, which
        # ``float()`` would reject.
        cost = float(decision.get("estimated_cost_usd") or 0.0)

    tracker = CostTracker(settings.usage_log_path)
    tracker.record(
        UsageRecord(
            tenant_id=tenant_id,
            task=task,
            mode=decision.get("mode", "single"),
            est_tokens=estimate_tokens(blob),
            actual_tokens=actual_total,
            cost_usd=cost,
            blocked=bool(result.get("blocked")),
            ts=time.time(),
        )
    )
241
+
242
+
243
def _show_costs() -> int:
    """Print the per-tenant cost table, most expensive tenant first.

    Reads the totals from ``CostTracker(...).by_tenant()`` on the configured
    usage log. Prints a short notice instead of a table when there are none.

    Returns:
        Always ``0``.
    """
    tracker = CostTracker(get_settings().usage_log_path)
    totals = tracker.by_tenant()
    if not totals:
        print("no usage recorded yet.")
        return 0

    header = f"{'tenant':<16}{'runs':>6}{'tokens':>10}{'cost_usd':>12}{'blocked':>9}"
    print(header)
    print("-" * 53)
    by_cost_desc = sorted(totals.values(), key=lambda x: x.cost_usd, reverse=True)
    for row in by_cost_desc:
        left = f"{row.tenant_id:<16}{row.runs:>6}{row.est_tokens:>10}"
        right = f"{row.cost_usd:>12.6f}{row.blocked:>9}"
        print(left + right)
    return 0
255
+
256
+
257
def _run_eval(offline: bool) -> int:
    """Run the evaluation suite and print a per-task table plus a summary.

    Args:
        offline: Passed to ``EvalRunner``. When false, any failure is reported
            with a setup hint instead of a traceback.

    Returns:
        ``0`` when the pass rate is exactly 100%, otherwise ``1`` (including
        a failed real-model run).

    Raises:
        Exception: Whatever ``EvalRunner`` raised, when running offline.
    """
    try:
        report = EvalRunner(offline=offline).run()
    except Exception as exc:  # noqa: BLE001 - surface a friendly hint for real runs
        if offline:
            raise
        print(f" real-model eval failed: {exc}")
        # The model is read from the ``riptide_watergraph_model`` setting, so
        # the environment variable to set is RIPTIDE_WATERGRAPH_MODEL.
        print(' hint: pip install -e ".[litellm]", set OPENAI_API_KEY and '
              "RIPTIDE_WATERGRAPH_MODEL, or use --offline.")
        return 1

    rule = "-" * 60
    print(f"{'task':<14}{'pass':>6}{'mode':>10}{'tool_valid':>12} notes")
    print(rule)
    for r in report.results:
        # Tasks that made no tool calls have no validity rate to show.
        rate = "-" if r.tool_valid_rate is None else f"{r.tool_valid_rate:.0%}"
        mark = "PASS" if r.passed else "FAIL"
        print(f"{r.task_id:<14}{mark:>6}{r.mode:>10}{rate:>12} {r.notes}")
    print(rule)
    print(f" pass rate: {report.n_passed}/{report.n_total} = {report.pass_rate:.0%}")
    print(f" routing: {report.modes}; blocked: {report.blocked}; "
          f"self-learning recall: {report.learning_recall}")
    return 0 if report.pass_rate == 1.0 else 1
278
+
279
+
280
+ def main(argv: list[str] | None = None) -> int:
281
+ parser = argparse.ArgumentParser(prog="riptide-watergraph")
282
+ sub = parser.add_subparsers(dest="command", required=True)
283
+
284
+ run_p = sub.add_parser("run", help="Run a task end-to-end.")
285
+ run_p.add_argument("task", help="The task for the agent to perform.")
286
+ run_p.add_argument("--auto-approve", action="store_true",
287
+ help="Approve side-effecting tools without prompting (for CI).")
288
+ run_p.add_argument("--offline", action="store_true",
289
+ help="Use the deterministic offline gateway (no API key).")
290
+ run_p.add_argument("--no-memory", action="store_true",
291
+ help="Disable long-term memory recall + reflection.")
292
+ run_p.add_argument("--single", action="store_true",
293
+ help="Force a single agent (skip the swarm composer).")
294
+ run_p.add_argument("--tenant", default="default",
295
+ help="Tenant id for memory isolation + cost attribution.")
296
+ run_p.add_argument("--no-guardrails", action="store_true",
297
+ help="Disable input/output guardrails for this run.")
298
+ run_p.add_argument("--llm-composer", action="store_true",
299
+ help="Use the LLM swarm composer (plan + dependencies) instead "
300
+ "of the heuristic one.")
301
+ run_p.add_argument("--critic", action="store_true",
302
+ help="Add a critic agent that verifies each subtask result.")
303
+ run_p.add_argument("--supervisor", action="store_true",
304
+ help="Add a supervisor that re-plans corrective subtasks (implies "
305
+ "--critic).")
306
+ run_p.add_argument("--react", type=int, default=1, metavar="N",
307
+ help="Max think->act->observe steps per subtask (default 1).")
308
+ run_p.add_argument("--vote", type=int, default=1, metavar="K",
309
+ help="Self-consistency samples for direct answers (default 1).")
310
+ run_p.add_argument("--schema", metavar="PATH",
311
+ help="Path to a JSON Schema file; finalize emits a validated "
312
+ "structured output matching it.")
313
+
314
+ sub.add_parser("costs", help="Show the per-tenant cost dashboard.")
315
+
316
+ eval_p = sub.add_parser("eval", help="Run the evaluation suite and report metrics.")
317
+ eval_p.add_argument("--offline", action="store_true",
318
+ help="Evaluate with the deterministic offline gateway.")
319
+
320
+ serve_p = sub.add_parser("serve", help="Run the HTTP service (needs the [server] extra).")
321
+ serve_p.add_argument("--host", default="127.0.0.1")
322
+ serve_p.add_argument("--port", type=int, default=8000)
323
+
324
+ args = parser.parse_args(argv)
325
+ if args.command == "run":
326
+ final_schema = json.loads(Path(args.schema).read_text()) if args.schema else None
327
+ return _run_task(
328
+ args.task,
329
+ auto_approve=args.auto_approve,
330
+ offline=args.offline,
331
+ memory_on=not args.no_memory,
332
+ single=args.single,
333
+ tenant_id=args.tenant,
334
+ guardrails_on=not args.no_guardrails,
335
+ llm_composer=args.llm_composer,
336
+ critic=args.critic,
337
+ supervisor=args.supervisor,
338
+ react_steps=args.react,
339
+ vote_k=args.vote,
340
+ final_schema=final_schema,
341
+ )
342
+ if args.command == "costs":
343
+ return _show_costs()
344
+ if args.command == "eval":
345
+ return _run_eval(args.offline)
346
+ if args.command == "serve":
347
+ return _serve(args.host, args.port)
348
+ parser.print_help()
349
+ return 1
350
+
351
+
352
def _serve(host: str, port: int) -> int:
    """Start the HTTP service with uvicorn on *host*:*port*.

    uvicorn is imported lazily so the rest of the CLI works without it.

    Returns:
        ``1`` (after printing an install hint) when uvicorn cannot be
        imported, otherwise ``0`` once ``uvicorn.run`` returns.
    """
    try:
        import uvicorn
    except ImportError:
        print('the HTTP server needs the [server] extra: pip install -e ".[server]"')
        return 1
    else:
        address = f"http://{host}:{port}"
        print(f" serving riptide-watergraph on {address}")
        app_target = "riptide_watergraph.server:app"
        uvicorn.run(app_target, host=host, port=port)
        return 0
361
+
362
+
363
+ if __name__ == "__main__":
364
+ sys.exit(main())
@@ -0,0 +1,58 @@
1
+ """Runtime configuration via pydantic-settings (reads from env / .env)."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from pydantic_settings import BaseSettings, SettingsConfigDict
6
+
7
+
8
class Settings(BaseSettings):
    """Framework settings. All fields overridable by environment variables.

    Values are also read from a ``.env`` file. Variable names carry no prefix
    and are matched case-insensitively; unknown variables are ignored.
    """

    model_config = SettingsConfigDict(
        env_file=".env",
        env_prefix="",
        extra="ignore",
        case_sensitive=False,
    )

    # Default model string passed to LiteLLM (orchestrator/worker/finalizer).
    riptide_watergraph_model: str = "gpt-4o-mini"
    # Optional per-role model routing (Phase C). Empty => use the default model.
    planner_model: str = ""  # orchestrator + finalize (the "thinking" steps)
    worker_model: str = ""  # workers (often a cheaper model)

    # Checkpoint database path for the LangGraph SqliteSaver.
    checkpoint_path: str = ".riptide_watergraph/checkpoints.sqlite"

    # Persistent long-term memory store (Stage 2: lessons accumulate here across runs).
    memory_path: str = ".riptide_watergraph/memory.json"

    # Stage 4: multi-tenancy + cost attribution.
    tenant_id: str = "default"
    data_dir: str = ".riptide_watergraph"  # base dir for per-tenant memory + usage log

    # Sandbox root the agentic developer tools (read_file/write_file/run_*) are confined to.
    # All file paths are resolved under this dir; ``..``/absolute escapes are refused.
    workspace_dir: str = ".riptide_watergraph/workspace"
    # Phase D: per-tenant spend ceiling in USD (0 = unlimited). Runs are refused once a
    # tenant's accumulated cost reaches this.
    tenant_budget_usd: float = 0.0

    # Observability
    langfuse_public_key: str | None = None
    langfuse_secret_key: str | None = None
    langfuse_host: str = "https://cloud.langfuse.com"
    riptide_watergraph_disable_tracing: bool = False

    def tenant_memory_path(self, tenant_id: str) -> str:
        """Per-tenant memory namespace so lessons never leak across tenants.

        Args:
            tenant_id: Tenant identifier; used as a single directory name
                under ``<data_dir>/tenants``.

        Returns:
            ``<data_dir>/tenants/<tenant_id>/memory.json``.

        Raises:
            ValueError: If *tenant_id* is empty, is ``.`` or ``..``, or
                contains a path separator or NUL byte. Such an id would
                resolve outside its own tenant directory and defeat the
                isolation this method exists to provide.
        """
        unsafe = (
            not tenant_id
            or tenant_id in (".", "..")
            or any(ch in tenant_id for ch in ("/", "\\", "\0"))
        )
        if unsafe:
            raise ValueError(f"invalid tenant id: {tenant_id!r}")
        return f"{self.data_dir}/tenants/{tenant_id}/memory.json"

    @property
    def usage_log_path(self) -> str:
        """Path of the usage log (``usage.jsonl``) under ``data_dir``."""
        return f"{self.data_dir}/usage.jsonl"
54
+
55
+
56
def get_settings() -> Settings:
    """Build and return a new ``Settings`` instance from the environment / .env.

    Nothing is cached here: each call constructs a fresh object.
    """
    loaded = Settings()
    return loaded
@@ -0,0 +1,18 @@
1
+ """Offline evaluation harness — measure the framework on a task suite.
2
+
3
+ The research consensus is to run your own evals on your task distribution rather than
4
+ trust vendor benchmarks. This harness makes the framework's behavior measurable:
5
+ pass rate, single-vs-swarm routing, guardrail blocking, tool-call validity, and
6
+ self-learning gain — deterministically, offline.
7
+ """
8
+
9
+ from .runner import EvalReport, EvalResult, EvalRunner
10
+ from .suite import EvalTask, default_suite
11
+
12
+ __all__ = [
13
+ "EvalTask",
14
+ "default_suite",
15
+ "EvalRunner",
16
+ "EvalResult",
17
+ "EvalReport",
18
+ ]
@@ -0,0 +1,135 @@
1
+ """Eval runner: build a graph, run the suite, score and aggregate.
2
+
3
+ Offline + deterministic by default (DemoGateway), so the suite doubles as a behavioral
4
+ regression gate in CI. Pass ``offline=False`` to evaluate against a real model.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from pydantic import BaseModel, Field
10
+
11
+ from ..config import get_settings
12
+ from ..gateway import DemoGateway, LiteLLMGateway, ResilientGateway
13
+ from ..graph import build_graph
14
+ from ..guardrails import default_guardrails
15
+ from ..memory import HashingEmbedding, InMemoryMemory, LexicalOverlapReranker
16
+ from ..memory.reflection import LLMReflector
17
+ from ..swarm import HeuristicSwarmComposer
18
+ from ..tools import default_registry
19
+ from .suite import EvalTask, default_suite
20
+
21
+
22
class EvalResult(BaseModel):
    """Outcome of running a single evaluation task."""

    # Identifier of the task this result belongs to.
    task_id: str
    # Whether the task met its expectations.
    passed: bool
    # One of: single | swarm | blocked
    mode: str
    # True when the run ended with the state's "blocked" flag set.
    blocked: bool = False
    # valid / total tool calls; None when the task made no tool calls.
    tool_valid_rate: float | None = None
    # Failure reason; empty when the task passed.
    notes: str = ""
29
+
30
+
31
class EvalReport(BaseModel):
    """Aggregate of all task results from one evaluation run."""

    # Per-task results, in the order the tasks were run.
    results: list[EvalResult] = Field(default_factory=list)
    # n_passed / n_total, or 0.0 when there are no results.
    pass_rate: float = 0.0
    n_passed: int = 0
    n_total: int = 0
    # How many tasks ended in each mode (mode name -> count).
    modes: dict[str, int] = Field(default_factory=dict)
    # Number of tasks that were blocked.
    blocked: int = 0
    # Did a repeated task recall a prior lesson?
    learning_recall: bool = False
39
+
40
+
41
class EvalRunner:
    """Runs the task suite through a freshly built graph.

    Offline runs use ``DemoGateway``; real runs wrap ``LiteLLMGateway`` in
    ``ResilientGateway``. Each ``run()`` uses a new in-memory store, and the
    learning probe uses a separate one.
    """

    def __init__(self, *, offline: bool = True, model: str | None = None) -> None:
        """Choose the gateway mode and the model name.

        Args:
            offline: Use the offline demo gateway instead of a real model.
            model: Model name; when omitted, ``"demo"`` offline or the
                configured ``riptide_watergraph_model`` for a real run.
        """
        self.offline = offline
        # For a real run, default to the configured model rather than a placeholder.
        self.model = model or ("demo" if offline else get_settings().riptide_watergraph_model)

    def _gateway(self):
        """Return the gateway matching the ``offline`` flag."""
        if self.offline:
            return DemoGateway()
        # Real model: wrap LiteLLM in the resilient gateway (timeouts + retries).
        return ResilientGateway(LiteLLMGateway(default_model=self.model))

    def _build(self, memory):
        """Build a graph around *memory* with the default registry and guardrails."""
        gateway = self._gateway()
        return build_graph(
            gateway=gateway,
            registry=default_registry(),
            composer=HeuristicSwarmComposer(model=self.model),
            model=self.model,
            memory=memory,
            reflector=LLMReflector(gateway, model=self.model),
            guardrails=default_guardrails(),
        )

    def run(self, suite: list[EvalTask] | None = None) -> EvalReport:
        """Run every task in *suite* and aggregate the results.

        Args:
            suite: Tasks to run. ``None`` selects ``default_suite()``; an
                explicitly empty list is honored and yields an empty report
                with a pass rate of 0.0.

        Returns:
            The aggregated report, including the learning-recall probe.
        """
        # ``is None`` rather than ``or``: an empty list is a valid suite and
        # must not be swapped for the default one.
        if suite is None:
            suite = default_suite()
        memory = InMemoryMemory(
            embedding=HashingEmbedding(), reranker=LexicalOverlapReranker()
        )
        graph = self._build(memory)

        results = [self._run_task(graph, t) for t in suite]
        report = EvalReport(
            results=results,
            n_total=len(results),
            n_passed=sum(1 for r in results if r.passed),
            blocked=sum(1 for r in results if r.blocked),
            learning_recall=self._probe_learning(),
        )
        report.pass_rate = (report.n_passed / report.n_total) if report.n_total else 0.0
        for r in results:
            report.modes[r.mode] = report.modes.get(r.mode, 0) + 1
        return report

    def _run_task(self, graph, task: EvalTask) -> EvalResult:
        """Invoke *graph* on one task and score the resulting state.

        The task id is used as both session id and thread id.
        """
        state = graph.invoke(
            {"task": task.prompt, "session_id": task.id, "tenant_id": "eval"},
            {"configurable": {"thread_id": task.id}},
        )
        blocked = bool(state.get("blocked"))
        decision = state.get("swarm_decision") or {}
        mode = "blocked" if blocked else decision.get("mode", "single")

        metrics = state.get("metrics") or {}
        total = metrics.get("tool_calls_total", 0)
        valid = metrics.get("tool_calls_valid", 0)
        # No tool calls means there is no rate to report.
        rate = (valid / total) if total else None

        passed, notes = self._score(task, state, blocked, mode)
        return EvalResult(
            task_id=task.id, passed=passed, mode=mode, blocked=blocked,
            tool_valid_rate=rate, notes=notes,
        )

    @staticmethod
    def _score(task: EvalTask, state: dict, blocked: bool, mode: str) -> tuple[bool, str]:
        """Compare a run against the task's expectations.

        Checks run in order: expected block, unexpected block, expected mode,
        expected substring. The substring is searched case-insensitively in
        the prompt, the subtask outputs and the final answer.

        Returns:
            ``(passed, notes)``; *notes* is empty on success.
        """
        if task.expect_blocked:
            return (blocked, "" if blocked else "expected block, was allowed")
        if blocked:
            return (False, "unexpectedly blocked")
        if task.expect_mode and mode != task.expect_mode:
            return (False, f"expected {task.expect_mode}, got {mode}")
        if task.expect_substring:
            blob = (
                task.prompt
                + " ".join(r.get("output", "") for r in (state.get("results") or []))
                + (state.get("final_answer") or "")
            ).lower()
            if task.expect_substring.lower() not in blob:
                return (False, f"missing expected {task.expect_substring!r}")
        return (True, "")

    def _probe_learning(self) -> bool:
        """Run one task twice; the second run should recall the first run's lesson."""
        memory = InMemoryMemory(
            embedding=HashingEmbedding(), reranker=LexicalOverlapReranker()
        )
        graph = self._build(memory)
        cfg1 = {"configurable": {"thread_id": "probe-1"}}
        cfg2 = {"configurable": {"thread_id": "probe-2"}}
        graph.invoke({"task": "compute 7 * 7", "session_id": "p1", "tenant_id": "eval"}, cfg1)
        s2 = graph.invoke({"task": "compute 7 * 7", "session_id": "p2", "tenant_id": "eval"}, cfg2)
        return bool(s2.get("recalled_lessons"))