generflow-core 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,306 @@
1
+ """Data source resolvers: turn `src="name"` references into live data.
2
+
3
+ Each source type (rest, sql, graphql, mcp) implements a Resolver.
4
+ The registry maps source names → resolvers. The SSE pipeline emits
5
+ `data.fill` events as each `src=` ref resolves.
6
+ """
7
+ from __future__ import annotations
8
+
9
+ import asyncio
10
+ import hashlib
11
+ import json
12
+ import time
13
+ from abc import ABC, abstractmethod
14
+ from pathlib import Path
15
+ from typing import Any
16
+
17
+ from .config import AppConfig, DataSource
18
+
19
+
20
class ResolverError(Exception):
    """Error raised by this module when a source cannot be resolved or fetched."""
22
+
23
+
24
class Resolver(ABC):
    """Abstract base class for data-source backends.

    Concrete subclasses override the ``type`` class attribute and implement
    the ``fetch`` coroutine.
    """

    # Source-type string of this resolver; subclasses replace the placeholder.
    type: str = "base"

    @abstractmethod
    async def fetch(self, source: DataSource, params: dict[str, Any]) -> Any:
        """Execute the source and return its data. `params` are runtime args."""
        ...
30
+
31
+
32
+ # ── Simple in-memory cache (per-process, TTL-based) ────────────────────────
33
+
34
+ _CACHE: dict[tuple[str, frozenset], tuple[float, Any]] = {}
35
+
36
+
37
+ def _cache_key(name: str, params: dict) -> tuple[str, frozenset]:
38
+ # stable key from sorted params
39
+ return (name, frozenset(params.items()))
40
+
41
+
42
def _cache_get(name: str, params: dict, ttl: int) -> Any | None:
    """Look up a cached value for ``(name, params)``.

    Returns None when caching is disabled (``ttl <= 0``), when there is no
    entry, or when the entry has expired (the stale entry is evicted).
    """
    if ttl <= 0:
        return None
    key = _cache_key(name, params)
    entry = _CACHE.get(key)
    if entry is None:
        return None
    deadline, payload = entry
    if time.time() <= deadline:
        return payload
    # Expired: drop it so the cache does not keep stale data around.
    _CACHE.pop(key, None)
    return None
54
+
55
+
56
def _cache_put(name: str, params: dict, ttl: int, value: Any) -> None:
    """Cache ``value`` for ``(name, params)`` for ``ttl`` seconds.

    Does nothing when ``ttl`` is zero or negative.
    """
    if ttl > 0:
        deadline = time.time() + ttl
        _CACHE[_cache_key(name, params)] = (deadline, value)
60
+
61
+
62
def cache_clear() -> None:
    """Remove every entry from the in-process source cache."""
    _CACHE.clear()
64
+
65
+
66
+ # ── Resolver registry ──────────────────────────────────────────────────────
67
+
68
+ _RESOLVERS: dict[str, type[Resolver]] = {}
69
+
70
+
71
def register_resolver(t: str, cls: type[Resolver]) -> None:
    """Register ``cls`` as the resolver for source type ``t``.

    A later registration for the same ``t`` replaces the earlier one.
    """
    _RESOLVERS.update({t: cls})
73
+
74
+
75
def get_resolver(t: str) -> Resolver:
    """Instantiate the resolver registered for source type ``t``.

    Raises:
        ResolverError: if no resolver is registered under ``t``.
    """
    resolver_cls = _RESOLVERS.get(t)
    if resolver_cls is not None:
        # A fresh instance is created on every call.
        return resolver_cls()
    raise ResolverError(f"Unknown source type: {t!r}")
80
+
81
+
82
+ # ── REST resolver ─────────────────────────────────────────────────────────
83
+
84
class RestResolver(Resolver):
    """Resolver for HTTP/REST sources, backed by ``httpx``.

    Config keys read from ``source.config``: ``url`` (required), ``method``
    (default ``"GET"``), ``params``, ``body``, ``headers`` and ``auth``
    (only the ``"bearer:<token>"`` string form is recognised).
    """

    type = "rest"

    async def fetch(self, source: DataSource, params: dict) -> Any:
        """Perform the HTTP request and return the decoded JSON body.

        Args:
            source: Source definition; see the class docstring for config keys.
            params: Runtime arguments. Each ``$name`` placeholder in the URL is
                replaced by the matching value, and every runtime argument not
                already present in the configured ``params`` is also sent as a
                query parameter.

        Raises:
            ResolverError: if the source has no ``url``.
            Exception: errors from ``httpx`` (transport failures, non-2xx
                status via ``raise_for_status``) and from JSON decoding
                propagate unchanged.
        """
        import re

        import httpx

        method = source.config.get("method", "GET").upper()
        url = source.config.get("url", "")
        if not url:
            raise ResolverError(f"Source {source.name!r}: missing url")

        # Interpolate params into the URL (/orders/$order_id) in ONE pass with
        # longest placeholder first. Sequential str.replace let "$order"
        # clobber the prefix of "$order_id" and could re-expand a "$x" that
        # appeared inside an already substituted value.
        substitutions = {f"${k}": str(v) for k, v in params.items()}
        if substitutions:
            ordered = sorted(substitutions, key=len, reverse=True)
            placeholder = re.compile("|".join(re.escape(p) for p in ordered))
            url = placeholder.sub(lambda m: substitutions[m.group(0)], url)

        # Configured query params win over runtime params of the same name.
        query = dict(source.config.get("params", {}))
        query.update({k: v for k, v in params.items() if k not in query})

        body = source.config.get("body")
        headers = dict(source.config.get("headers", {}))
        auth = source.config.get("auth")
        if isinstance(auth, str) and auth.startswith("bearer:"):
            headers["Authorization"] = f"Bearer {auth[7:]}"

        async with httpx.AsyncClient(timeout=10.0) as client:
            r = await client.request(method, url, params=query, json=body, headers=headers)
            r.raise_for_status()
            return r.json()


register_resolver("rest", RestResolver)
110
+
111
+
112
+ # ── SQL resolver ──────────────────────────────────────────────────────────
113
+
114
class SqlResolver(Resolver):
    """Resolver for SQL sources, implemented with the stdlib ``sqlite3`` module.

    Config keys read from ``source.config``: ``connection`` or ``dsn``
    (default ``":memory:"``; a leading ``sqlite:///`` is stripped), ``query``
    (required) and ``bind`` (names of runtime params bound positionally).
    """

    type = "sql"

    async def fetch(self, source: DataSource, params: dict) -> Any:
        """Run the configured query and return the rows as a list of dicts.

        Args:
            source: Source definition; see the class docstring for config keys.
            params: Runtime arguments. Values are looked up by the names in
                ``bind``; a name missing from ``params`` is bound as ``None``.

        Returns:
            One ``{column: value}`` dict per result row (empty list when the
            statement produces no result set).

        Raises:
            ResolverError: if the source has no ``query``.
            sqlite3.Error: database errors propagate unchanged.
        """
        # use sqlite3 (stdlib) for portability; production would use asyncpg
        import sqlite3
        from contextlib import closing

        dsn = source.config.get("connection") or source.config.get("dsn", ":memory:")
        if isinstance(dsn, str) and dsn.startswith("sqlite:///"):
            dsn = dsn[len("sqlite:///"):]
        query = source.config.get("query", "")
        if not query:
            raise ResolverError(f"Source {source.name!r}: missing query")
        # bind params from runtime args
        positional = [params.get(b) for b in source.config.get("bind", [])]

        def _run_query() -> list[dict]:
            # Connect, execute and close in the same worker thread: sqlite3
            # connections must not be shared across threads by default.
            with closing(sqlite3.connect(dsn)) as conn:
                cur = conn.execute(query, positional)
                cols = [d[0] for d in cur.description] if cur.description else []
                return [dict(zip(cols, row)) for row in cur.fetchall()]

        # sqlite3 is blocking; run it off the event loop so other coroutines
        # (e.g. concurrent source resolutions) are not stalled by a slow query.
        return await asyncio.get_running_loop().run_in_executor(None, _run_query)


register_resolver("sql", SqlResolver)
139
+
140
+
141
+ # ── GraphQL resolver ──────────────────────────────────────────────────────
142
+
143
class GraphqlResolver(Resolver):
    """Resolver for GraphQL sources: POSTs ``{query, variables}`` as JSON.

    Config keys read from ``source.config``: ``url`` (required), ``query``,
    ``variables`` and ``auth`` (only the ``"bearer:<token>"`` string form is
    recognised). Uses ``urllib`` from the stdlib to avoid a dependency.
    """

    type = "graphql"

    async def fetch(self, source: DataSource, params: dict) -> Any:
        """Send the GraphQL request and return the ``data`` member of the reply.

        Args:
            source: Source definition; see the class docstring for config keys.
            params: Runtime arguments; merged over the configured ``variables``
                (runtime values win on a name clash).

        Returns:
            ``payload["data"]``, or ``{}`` when the reply has no ``data`` key.

        Raises:
            ResolverError: if the source has no ``url`` or the reply contains
                an ``errors`` member.
            Exception: ``urllib`` and JSON decoding errors propagate unchanged.
        """
        import urllib.request

        endpoint = source.config.get("url", "")
        if not endpoint:
            raise ResolverError(f"Source {source.name!r}: missing url")
        query = source.config.get("query", "")
        variables = dict(source.config.get("variables", {}))
        variables.update(params)
        body = json.dumps({"query": query, "variables": variables}).encode("utf-8")
        headers = {"Content-Type": "application/json"}
        auth = source.config.get("auth")
        if isinstance(auth, str) and auth.startswith("bearer:"):
            headers["Authorization"] = f"Bearer {auth[7:]}"
        req = urllib.request.Request(endpoint, data=body, headers=headers, method="POST")

        def _post() -> bytes:
            # Open, read AND close inside the worker thread: reading the body
            # is blocking network I/O too, and the response must be closed to
            # release the socket.
            with urllib.request.urlopen(req, timeout=10) as resp:
                return resp.read()

        # get_running_loop() is the supported call inside a coroutine;
        # get_event_loop() is deprecated in this context.
        raw = await asyncio.get_running_loop().run_in_executor(None, _post)
        payload = json.loads(raw.decode("utf-8"))
        if "errors" in payload:
            raise ResolverError(f"GraphQL errors: {payload['errors']}")
        return payload.get("data", {})


register_resolver("graphql", GraphqlResolver)
172
+
173
+
174
+ # ── MCP resolver ──────────────────────────────────────────────────────────
175
+
176
class McpResolver(Resolver):
    """MCP (Model Context Protocol) tool caller.

    Talks to an MCP server over stdio: the server is spawned as a child
    process, receives ``initialize``, ``notifications/initialized`` and one
    ``tools/call`` request on stdin, and its stdout is scanned for the reply.
    HTTP transport is not implemented here.

    Config keys read from ``source.config``: ``command`` and ``tool`` (both
    required), ``args`` (default tool arguments) and ``env`` (extra
    environment variables for the child process).
    """

    type = "mcp"

    async def fetch(self, source: DataSource, params: dict) -> Any:
        """Call the configured MCP tool and return its JSON-RPC ``result``.

        Args:
            source: Source definition; see the class docstring for config keys.
            params: Runtime arguments; merged over the configured ``args``
                (runtime values win on a name clash).

        Raises:
            ResolverError: if ``command`` or ``tool`` is missing, or the call
                fails as described in ``_call_stdio``.
        """
        command = source.config.get("command", "")
        tool = source.config.get("tool", "")
        if not command or not tool:
            raise ResolverError(f"Source {source.name!r}: missing command or tool")
        args = dict(source.config.get("args", {}))
        args.update(params)
        # spawn the MCP server, send tools/call, collect result
        return await self._call_stdio(command, tool, args, source.config.get("env", {}))

    async def _call_stdio(self, command: str, tool: str, arguments: dict, env_overrides: dict) -> Any:
        """Spawn ``command``, perform one ``tools/call`` and return its result.

        Args:
            command: Command line of the MCP server; split shell-style, so
                quoted arguments containing spaces stay intact.
            tool: Name of the tool to call.
            arguments: Arguments passed to the tool.
            env_overrides: Variables layered over the current environment.

        Raises:
            ResolverError: on an empty command, a 15 second timeout, a JSON-RPC
                ``error`` reply, or when no reply with the call id is found.
        """
        import os
        import shlex
        import subprocess
        from contextlib import suppress

        env = {**os.environ, **env_overrides}
        # shlex honours quoting ('server --root "/my dir"'); str.split() tore
        # such arguments apart. Non-POSIX mode keeps Windows backslashes intact.
        argv = shlex.split(command, posix=(os.name != "nt"))
        if not argv:
            raise ResolverError(f"MCP command {command!r} is empty")
        proc = await asyncio.create_subprocess_exec(
            *argv,
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            env=env,
        )
        # Minimal MCP handshake: initialize + tools/call
        init_msg = {
            "jsonrpc": "2.0", "id": 1, "method": "initialize",
            "params": {"protocolVersion": "2024-11-05", "capabilities": {}, "clientInfo": {"name": "generflow", "version": "0.1.0"}},
        }
        initialized = {"jsonrpc": "2.0", "method": "notifications/initialized"}
        call_msg = {
            "jsonrpc": "2.0", "id": 2, "method": "tools/call",
            "params": {"name": tool, "arguments": arguments},
        }
        # All three messages are written up front, one JSON document per line.
        messages = json.dumps(init_msg) + "\n" + json.dumps(initialized) + "\n" + json.dumps(call_msg) + "\n"
        try:
            stdout, _stderr = await asyncio.wait_for(proc.communicate(messages.encode()), timeout=15.0)
        except asyncio.TimeoutError as exc:
            # The child may have exited between the timeout and the kill.
            with suppress(ProcessLookupError):
                proc.kill()
            # Reap the child so it does not linger as a zombie process.
            await proc.wait()
            raise ResolverError(f"MCP call to {command!r} timed out") from exc
        # parse responses: only the reply carrying the tools/call id (2) matters
        for line in stdout.decode("utf-8", errors="replace").splitlines():
            line = line.strip()
            if not line or not line.startswith("{"):
                continue
            try:
                msg = json.loads(line)
            except json.JSONDecodeError:
                # Servers may print non-protocol output; skip lines that only look like JSON.
                continue
            if msg.get("id") == 2:
                if "error" in msg:
                    raise ResolverError(f"MCP error: {msg['error']}")
                return msg.get("result", {})
        raise ResolverError(f"MCP server {command!r} returned no response")


register_resolver("mcp", McpResolver)
242
+
243
+
244
+ # ── File resolver ─────────────────────────────────────────────────────────
245
class FileResolver(Resolver):
    """Read structured data from a local file.

    Supports JSONL (one JSON object per line, default) and JSON documents.
    Config:
      path: path of the file to read; it is used exactly as given, so any
        placeholder expansion has to happen before it reaches this resolver
      format: "jsonl" (default) or "json"
    """

    type = "file"

    async def fetch(self, source: DataSource, params: dict) -> Any:
        """Load and parse the configured file.

        Args:
            source: Source definition; see the class docstring for config keys.
            params: Runtime arguments; not used by this resolver.

        Returns:
            The parsed JSON document for ``format="json"``, or a list with one
            parsed value per non-blank line for ``format="jsonl"``.

        Raises:
            ResolverError: if ``path`` is missing, does not name an existing
                file, or ``format`` is unknown.
            json.JSONDecodeError: malformed content propagates unchanged.
        """
        path_str = source.config.get("path", "")
        if not path_str:
            raise ResolverError(f"Source {source.name!r}: missing 'path'")
        path = Path(path_str)
        # is_file() also rejects directories, which exists() let through only
        # to fail later with an unrelated OSError.
        if not path.is_file():
            raise ResolverError(f"Source {source.name!r}: file not found: {path}")
        fmt = source.config.get("format", "jsonl")
        if fmt not in ("json", "jsonl"):
            raise ResolverError(f"Source {source.name!r}: unknown format {fmt!r}")
        # JSON is UTF-8 by specification; do not depend on the platform's
        # default encoding. "utf-8-sig" additionally drops a leading BOM,
        # which json.loads would otherwise reject.
        text = path.read_text(encoding="utf-8-sig")
        if fmt == "json":
            return json.loads(text)
        stripped_lines = (raw.strip() for raw in text.splitlines())
        return [json.loads(entry) for entry in stripped_lines if entry]


register_resolver("file", FileResolver)
279
+
280
+
281
+ # ── Top-level resolver: name → data ────────────────────────────────────────
282
+
283
async def resolve_source(
    config: AppConfig, name: str, params: dict | None = None
) -> tuple[Any, str]:
    """Resolve a named source. Returns (data, fetched_at_iso).

    Honors cache TTLs. Raises ResolverError on failure (caller decides
    whether to fall back to a HITL gate).

    Args:
        config: Application config; ``config.source(name)`` supplies the
            source definition.
        name: Name of the source to resolve.
        params: Runtime arguments for the source; ``None`` means no arguments.

    Returns:
        ``(data, fetched_at)`` where ``fetched_at`` is the ISO timestamp of
        the moment the data was actually fetched. For a cache hit this is the
        original fetch time, not the time of the lookup.
    """
    src = config.source(name)
    if src is None:
        raise ResolverError(f"Unknown source: {name!r}")
    params = params or {}
    ttl = src.cache_seconds
    # Entries written here are tagged triples so that (a) the real fetch time
    # travels with the data and (b) a legitimately-None result is still a
    # cache hit (the bare value None is indistinguishable from a miss).
    tag = "generflow.resolved"
    cached = _cache_get(name, params, ttl)
    if cached is not None:
        if isinstance(cached, tuple) and len(cached) == 3 and cached[0] == tag:
            return cached[1], cached[2]
        # Untagged entry stored by some other writer: no fetch time is known.
        return cached, _iso_now()
    resolver = get_resolver(src.type)
    data = await resolver.fetch(src, params)
    fetched_at = _iso_now()
    _cache_put(name, params, ttl, (tag, data, fetched_at))
    return data, fetched_at
302
+
303
+
304
+ def _iso_now() -> str:
305
+ import datetime
306
+ return datetime.datetime.utcnow().isoformat() + "Z"
@@ -0,0 +1,22 @@
1
+ """HITL module: human-in-the-loop gates for confidence, PII, ambiguity, missing sources."""
2
+ from .gates import (
3
+ Decision,
4
+ GateResult,
5
+ ambiguity_gate,
6
+ confidence_gate,
7
+ missing_source_gate,
8
+ pii_gate,
9
+ redact,
10
+ scan_pii,
11
+ )
12
+
13
+ __all__ = [
14
+ "Decision",
15
+ "GateResult",
16
+ "ambiguity_gate",
17
+ "confidence_gate",
18
+ "missing_source_gate",
19
+ "pii_gate",
20
+ "redact",
21
+ "scan_pii",
22
+ ]
@@ -0,0 +1,165 @@
1
+ """HITL gates: confidence, PII, ambiguity, missing-source.
2
+
3
+ Each gate returns a decision:
4
+ - ALLOW: proceed, no user interaction
5
+ - CONFIRM: show the user a preview, wait for approval
6
+ - CLARIFY: ask a question, get more info
7
+ - REJECT: hard-fail, don't render / don't execute
8
+
9
+ Gates run during SSE streaming:
10
+ - Confidence gate runs per node (after spec.line)
11
+ - PII gate runs on data.fill values (redact + confirm)
12
+ - Ambiguity gate runs when a component name isn't in the registry
13
+ - Missing-source gate runs when a `src=` ref can't be resolved
14
+ """
15
+ from __future__ import annotations
16
+
17
+ import re
18
+ from dataclasses import dataclass
19
+ from enum import Enum
20
+ from typing import Any
21
+
22
+
23
class Decision(str, Enum):
    """Possible outcomes of a HITL gate (members are also plain strings)."""

    ALLOW = "allow"      # proceed, no user interaction
    CONFIRM = "confirm"  # show the user a preview, wait for approval
    CLARIFY = "clarify"  # ask a question, get more info
    REJECT = "reject"    # hard-fail, don't render / don't execute
28
+
29
+
30
@dataclass
class GateResult:
    """Result of running one gate: the decision plus supporting details."""

    decision: Decision
    reason: str = ""
    redacted_value: Any = None  # for PII gate
    question: str = ""  # for CLARIFY
    options: list[str] | None = None  # for CLARIFY
    confidence: float = 1.0

    def to_dict(self) -> dict:
        """Return the result as a plain dict, with the decision as its string value."""
        payload: dict = {"decision": self.decision.value}
        # Key order is fixed and differs from the field declaration order.
        for attr in ("reason", "confidence", "question", "options", "redacted_value"):
            payload[attr] = getattr(self, attr)
        return payload
48
+
49
+
50
+ # ── PII detection ─────────────────────────────────────────────────────────
51
+
52
+ _PII_PATTERNS: list[tuple[str, re.Pattern]] = [
53
+ ("email", re.compile(r"\b[\w.+-]+@[\w-]+(?:\.[\w.-]+)?\b")),
54
+ ("phone_us", re.compile(r"\b\d{3}[-.\s]?\d{3}[-.\s]?\d{4}\b")),
55
+ ("ssn", re.compile(r"\b\d{3}-\d{2}-\d{4}\b")),
56
+ ("credit_card", re.compile(r"\b(?:\d[ -]*?){13,16}\b")),
57
+ ("ipv4", re.compile(r"\b(?:\d{1,3}\.){3}\d{1,3}\b")),
58
+ ]
59
+
60
+
61
def scan_pii(value: Any) -> list[str]:
    """Return the list of PII kinds detected in `value`.

    Dicts and lists are scanned through their JSON rendering; every other
    value through ``str(value)``. ``None`` yields an empty list.
    """
    if value is None:
        return []
    if isinstance(value, (dict, list)):
        import json
        text = json.dumps(value, default=str)
    else:
        text = str(value)
    # One entry per pattern that matches anywhere, in pattern-table order.
    return [kind for kind, pattern in _PII_PATTERNS if pattern.search(text)]
75
+
76
+
77
def redact(value: Any) -> Any:
    """Replace PII patterns with placeholder text.

    ``None`` is returned unchanged; any other value is converted with
    ``str()`` and the redacted string is returned.
    """
    if value is None:
        return value
    text = str(value)
    # Patterns are applied one after another, each on the previous result.
    for kind, pattern in _PII_PATTERNS:
        placeholder = f"[REDACTED:{kind}]"
        text = pattern.sub(placeholder, text)
    return text
85
+
86
+
87
+ # ── Confidence gate ───────────────────────────────────────────────────────
88
+
89
def confidence_gate(value: Any, threshold: float = 0.7) -> GateResult:
    """Heuristic confidence score for a value.

    Real implementation would use the LLM's logprobs or a self-reported
    score. For v1, we use a proxy: plain strings and simple values score
    high; empty strings, hedging words and TBD-like markers score low.

    Args:
        value: The value to score.
        threshold: Minimum confidence required for ``ALLOW``. Previously this
            argument was accepted but ignored; with the default of 0.7 the
            decisions are unchanged.

    Returns:
        ``ALLOW`` when the heuristic confidence is at least ``threshold``,
        otherwise ``CONFIRM``. ``reason`` names the heuristic that lowered the
        score, or the threshold itself when no heuristic fired.
    """
    reason = ""
    if isinstance(value, str):
        s = value.strip()
        lowered = s.lower()
        if not s:
            confidence, reason = 0.0, "empty"
        # Heuristic: hedging words or a trailing-off "..." → low confidence
        elif any(t in lowered for t in ("maybe", "perhaps", "might be", "...")):
            confidence, reason = 0.5, "uncertain phrasing"
        # TBD-like markers ("..." is already handled by the branch above)
        elif any(t in s for t in ("TODO", "TBD", "???")):
            confidence, reason = 0.4, "TBD marker"
        else:
            # otherwise assume confident
            confidence = 0.9
    elif isinstance(value, (int, float, bool)):
        confidence = 0.95
    elif isinstance(value, (list, dict)):
        confidence = 0.85
    else:
        confidence = 0.8

    if confidence >= threshold:
        return GateResult(Decision.ALLOW, reason=reason, confidence=confidence)
    return GateResult(
        Decision.CONFIRM,
        reason=reason or "below confidence threshold",
        confidence=confidence,
    )
113
+
114
+
115
+ # ── PII gate ──────────────────────────────────────────────────────────────
116
+
117
def pii_gate(value: Any) -> GateResult:
    """Gate a value on detected PII.

    Returns ``ALLOW`` when nothing is detected; otherwise ``CONFIRM`` with the
    detected kinds in ``reason`` and a redacted rendering of the value.
    """
    kinds = scan_pii(value)
    if kinds:
        detected = ", ".join(kinds)
        return GateResult(
            Decision.CONFIRM,
            reason=f"PII detected: {detected}",
            confidence=0.3,
            redacted_value=redact(value),
        )
    return GateResult(Decision.ALLOW, confidence=1.0)
127
+
128
+
129
+ # ── Ambiguity gate ───────────────────────────────────────────────────────
130
+
131
def ambiguity_gate(
    name: str,
    candidates: list[str],
    threshold: float = 0.8,
) -> GateResult:
    """Decide what to do with a component name that is not in the registry.

    No candidates → REJECT; exactly one → CONFIRM that single suggestion;
    several → CLARIFY with at most five of them. Every option list ends with
    "skip". `threshold` is accepted but not consulted by this implementation.
    """
    if not candidates:
        return GateResult(Decision.REJECT, reason=f"Unknown component: {name}", confidence=0.0)

    if len(candidates) == 1:
        suggestion = candidates[0]
        return GateResult(
            Decision.CONFIRM,
            reason=f"Unknown component '{name}', did you mean '{suggestion}'?",
            confidence=0.6,
            options=[suggestion, "skip"],
        )

    choices = candidates[:5] + ["skip"]
    return GateResult(
        Decision.CLARIFY,
        reason=f"Unknown component '{name}'",
        confidence=0.4,
        question=f"Component '{name}' is not in the registry. Which did you mean?",
        options=choices,
    )
154
+
155
+
156
+ # ── Missing-source gate ──────────────────────────────────────────────────
157
+
158
def missing_source_gate(ref_name: str) -> GateResult:
    """Build the CLARIFY result for a `src=` reference that has no binding.

    The user is offered two options: skip (render a placeholder) or abort.
    """
    why = f"Data source '{ref_name}' is not bound"
    prompt = f"I referenced a data source '{ref_name}' but no binding exists. Skip or abort?"
    return GateResult(
        Decision.CLARIFY,
        reason=why,
        confidence=0.0,
        question=prompt,
        options=["skip (render placeholder)", "abort"],
    )