preppergpt 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +93 -0
- package/bin/preppergpt.js +8 -0
- package/compose/preppergpt.yaml +232 -0
- package/docs/hardware.md +15 -0
- package/docs/model-sources.md +12 -0
- package/docs/preppergpt-local-parity-map.md +16 -0
- package/docs/publishing.md +24 -0
- package/installer/cli.mjs +225 -0
- package/installer/install.sh +18 -0
- package/installer/lib/detect.mjs +128 -0
- package/installer/lib/paths.mjs +26 -0
- package/installer/lib/planner.mjs +175 -0
- package/installer/lib/render.mjs +76 -0
- package/installer/lib/util.mjs +84 -0
- package/package.json +48 -0
- package/profiles/models.json +277 -0
- package/services/comfyui/flux-kontext-edit-openwebui-nodes.json +46 -0
- package/services/comfyui/flux-kontext-edit-openwebui-workflow.json +245 -0
- package/services/comfyui/flux-kontext-mask-edit-openwebui-nodes.json +51 -0
- package/services/comfyui/flux-kontext-mask-edit-openwebui-workflow.json +322 -0
- package/services/comfyui/flux2-klein-9b-openwebui-nodes.json +58 -0
- package/services/comfyui/flux2-klein-9b-openwebui-workflow.json +141 -0
- package/services/comfyui/image-invert-edit-openwebui-nodes.json +23 -0
- package/services/comfyui/image-invert-edit-openwebui-workflow.json +52 -0
- package/services/deep-research/Dockerfile +7 -0
- package/services/deep-research/app.py +1913 -0
- package/services/local-agent/Dockerfile +17 -0
- package/services/local-agent/app.py +2311 -0
- package/services/local-scheduler/Dockerfile +8 -0
- package/services/local-scheduler/app.py +15774 -0
- package/services/local-vision/Dockerfile +11 -0
- package/services/local-vision/app.py +888 -0
- package/services/searxng/settings.yml +16 -0
- package/themes/preppergpt/custom.css +15 -0
- package/themes/preppergpt/static/favicon.svg +5 -0
- package/themes/preppergpt/static/logo.svg +6 -0
|
@@ -0,0 +1,1913 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
import base64
|
|
3
|
+
import html
|
|
4
|
+
import io
|
|
5
|
+
import json
|
|
6
|
+
import os
|
|
7
|
+
import re
|
|
8
|
+
import textwrap
|
|
9
|
+
import time
|
|
10
|
+
import uuid
|
|
11
|
+
import zipfile
|
|
12
|
+
from dataclasses import dataclass, asdict
|
|
13
|
+
from html.parser import HTMLParser
|
|
14
|
+
from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer
|
|
15
|
+
from pathlib import Path
|
|
16
|
+
from threading import Lock
|
|
17
|
+
from urllib import parse, request, error
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
# --- Identifiers and service endpoints (each overridable via environment) ---
MODEL_ID = os.environ.get("DEEP_RESEARCH_MODEL_ID", "deep-research-glm52")
GLM_MODEL = os.environ.get("DEEP_RESEARCH_MODEL", "glm52-q4-local")
GLM_BASE_URL = os.environ.get("DEEP_RESEARCH_GLM_BASE_URL", "http://127.0.0.1:11441/v1")
SEARXNG_URL = os.environ.get("DEEP_RESEARCH_SEARXNG_URL", "http://127.0.0.1:18080/search")
TIKA_URL = os.environ.get("DEEP_RESEARCH_TIKA_URL", "http://127.0.0.1:9998/tika")
LOCAL_APP_CONNECTOR_URL = os.environ.get("DEEP_RESEARCH_LOCAL_APP_CONNECTOR_URL", "http://127.0.0.1:18042")
PUBLIC_BASE_URL = os.environ.get("DEEP_RESEARCH_PUBLIC_BASE_URL", "http://127.0.0.1:18041")

# Directory used for persisted data; created at import time below.
STORAGE = Path(os.environ.get("DEEP_RESEARCH_STORAGE", "/data"))

# --- Default limits; int() raises at import if an override is not numeric ---
MAX_QUERIES = int(os.environ.get("DEEP_RESEARCH_MAX_QUERIES", "12"))
MAX_RESULTS = int(os.environ.get("DEEP_RESEARCH_MAX_RESULTS", "120"))
MAX_SOURCES = int(os.environ.get("DEEP_RESEARCH_MAX_SOURCES", "40"))
MAX_SNIPPETS = int(os.environ.get("DEEP_RESEARCH_MAX_SNIPPETS", "28"))
MAX_TOKENS = int(os.environ.get("DEEP_RESEARCH_MAX_TOKENS", "1600"))
MAX_EXCERPT_CHARS = int(os.environ.get("DEEP_RESEARCH_MAX_EXCERPT_CHARS", "1600"))
MAX_LOCAL_DOCUMENTS = int(os.environ.get("DEEP_RESEARCH_MAX_LOCAL_DOCUMENTS", "20"))
MAX_CONNECTOR_SOURCES = int(os.environ.get("DEEP_RESEARCH_MAX_CONNECTOR_SOURCES", "12"))
MAX_DOCUMENT_CHARS = int(os.environ.get("DEEP_RESEARCH_MAX_DOCUMENT_CHARS", "200000"))

# --- Timeouts (seconds) and download cap (bytes; default is 3 MiB) ---
FETCH_TIMEOUT = int(os.environ.get("DEEP_RESEARCH_FETCH_TIMEOUT_SECONDS", "20"))
GLM_TIMEOUT = int(os.environ.get("DEEP_RESEARCH_GLM_TIMEOUT_SECONDS", "21600"))
MAX_FETCH_BYTES = int(os.environ.get("DEEP_RESEARCH_MAX_FETCH_BYTES", str(3 * 1024 * 1024)))

# Held around every chat-completion request so only one runs at a time.
LLM_LOCK = Lock()
STORAGE.mkdir(parents=True, exist_ok=True)
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
class TextExtractor(HTMLParser):
    """HTML parser that gathers visible text and ignores non-content elements.

    Text inside ``script``, ``style``, ``noscript`` and ``svg`` elements is
    dropped. Every other text node is whitespace-normalized and stored as one
    entry in ``parts``.
    """

    # Elements whose text content must not appear in the extracted output.
    _HIDDEN_TAGS = frozenset({"script", "style", "noscript", "svg"})

    def __init__(self):
        super().__init__()
        # Nesting depth of currently open hidden elements.
        self.skip = 0
        # Collected text fragments in document order.
        self.parts = []

    def handle_starttag(self, tag, attrs):
        """Increase the hidden-element depth when a hidden tag opens."""
        if tag in self._HIDDEN_TAGS:
            self.skip += 1

    def handle_endtag(self, tag):
        """Decrease the hidden-element depth, never going below zero."""
        if self.skip and tag in self._HIDDEN_TAGS:
            self.skip -= 1

    def handle_data(self, data):
        """Store whitespace-collapsed text unless inside a hidden element."""
        if self.skip:
            return
        collapsed = " ".join(data.split())
        if collapsed:
            self.parts.append(collapsed)

    def text(self):
        """Return all collected fragments joined with newlines."""
        return "\n".join(self.parts)
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
@dataclass
class Source:
    """One research source together with its fetch status and excerpts."""

    sid: str  # citation identifier assigned by the code that creates the source
    title: str
    url: str
    engine: str = ""  # origin of the source (search engine or local provider)
    score: float = 0.0
    snippet: str = ""
    fetched: bool = False
    error: str = ""
    text_chars: int = 0  # length of the extracted text
    excerpts: list[str] = None

    def __post_init__(self):
        # A None default avoids sharing one list object between instances.
        self.excerpts = [] if self.excerpts is None else self.excerpts
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def now() -> int:
    """Return the current Unix time truncated to whole seconds."""
    seconds = time.time()
    return int(seconds)
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def int_setting(overrides: dict, key: str, default: int, minimum: int = 0, maximum: int | None = None) -> int:
|
|
92
|
+
try:
|
|
93
|
+
value = int(overrides.get(key, default))
|
|
94
|
+
except (TypeError, ValueError):
|
|
95
|
+
value = default
|
|
96
|
+
value = max(minimum, value)
|
|
97
|
+
if maximum is not None:
|
|
98
|
+
value = min(value, maximum)
|
|
99
|
+
return value
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
def read_json(handler: BaseHTTPRequestHandler) -> dict:
    """Read and decode the JSON request body of ``handler``.

    The body length comes from the ``Content-Length`` header. A missing,
    empty, zero or negative length yields an empty object without touching
    the stream; a negative length must never reach ``read()`` because
    ``read(-1)`` blocks until the peer closes the connection.

    Raises:
        ValueError: If ``Content-Length`` is not an integer or the body is
            not valid JSON (``json.JSONDecodeError`` is a ``ValueError``).
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    length = int(handler.headers.get("Content-Length", "0") or "0")
    raw = handler.rfile.read(length) if length > 0 else b"{}"
    # NOTE(review): the decoded value is returned as-is, so a JSON array or
    # scalar body is not rejected here despite the ``dict`` annotation.
    return json.loads(raw.decode("utf-8") or "{}")
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
def send_json(handler: BaseHTTPRequestHandler, status: int, payload: dict):
    """Serialize ``payload`` as UTF-8 JSON and write it as the HTTP response.

    Non-ASCII characters are emitted verbatim rather than escaped, and the
    response allows any origin.
    """
    encoded = json.dumps(payload, ensure_ascii=False).encode("utf-8")
    response_headers = (
        ("Content-Type", "application/json; charset=utf-8"),
        ("Content-Length", str(len(encoded))),
        ("Access-Control-Allow-Origin", "*"),
    )
    handler.send_response(status)
    for header_name, header_value in response_headers:
        handler.send_header(header_name, header_value)
    handler.end_headers()
    handler.wfile.write(encoded)
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
def send_bytes(handler: BaseHTTPRequestHandler, status: int, body: bytes, content_type: str):
    """Write ``body`` as the HTTP response with the given content type.

    The response carries an exact ``Content-Length`` and allows any origin.
    """
    response_headers = {
        "Content-Type": content_type,
        "Content-Length": str(len(body)),
        "Access-Control-Allow-Origin": "*",
    }
    handler.send_response(status)
    for header_name, header_value in response_headers.items():
        handler.send_header(header_name, header_value)
    handler.end_headers()
    handler.wfile.write(body)
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
def http_json(url: str, payload: dict | None = None, timeout: int = 60, headers: dict | None = None) -> dict:
|
|
128
|
+
data = None
|
|
129
|
+
req_headers = {"User-Agent": "openwebui-deep-research/0.1"}
|
|
130
|
+
if headers:
|
|
131
|
+
req_headers.update(headers)
|
|
132
|
+
if payload is not None:
|
|
133
|
+
data = json.dumps(payload).encode("utf-8")
|
|
134
|
+
req_headers["Content-Type"] = "application/json"
|
|
135
|
+
req = request.Request(url, data=data, headers=req_headers)
|
|
136
|
+
with request.urlopen(req, timeout=timeout) as resp:
|
|
137
|
+
return json.loads(resp.read().decode("utf-8") or "{}")
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
def http_bytes(url: str, timeout: int = FETCH_TIMEOUT) -> tuple[bytes, str]:
    """Download ``url`` and return ``(body, content_type)``.

    The body is capped at ``MAX_FETCH_BYTES``; a larger response is truncated
    at exactly that size rather than losing the chunk that crosses the limit.
    The content type has its parameters removed and is stripped and
    lowercased, because media types are case-insensitive and callers compare
    against lowercase literals.

    Args:
        url: Address to fetch.
        timeout: Timeout in seconds passed to ``urlopen``.
    """
    req = request.Request(url, headers={"User-Agent": "Mozilla/5.0 openwebui-deep-research/0.1"})
    with request.urlopen(req, timeout=timeout) as resp:
        header = resp.headers.get("Content-Type", "application/octet-stream")
        content_type = header.split(";")[0].strip().lower() or "application/octet-stream"
        chunks = []
        remaining = MAX_FETCH_BYTES
        while remaining > 0:
            # Never request more than what is left under the cap.
            chunk = resp.read(min(65536, remaining))
            if not chunk:
                break
            chunks.append(chunk)
            remaining -= len(chunk)
        return b"".join(chunks), content_type
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
def tika_extract(raw: bytes, content_type: str) -> str:
    """Send ``raw`` to the service at ``TIKA_URL`` via HTTP PUT and return its reply.

    The request declares ``content_type`` for the uploaded bytes. The reply
    is decoded as UTF-8 with undecodable bytes replaced. The timeout is at
    least 60 seconds, or ``FETCH_TIMEOUT`` when that is larger.
    """
    wait_seconds = max(60, FETCH_TIMEOUT)
    upload = request.Request(
        TIKA_URL,
        data=raw,
        method="PUT",
        headers={"Content-Type": content_type},
    )
    with request.urlopen(upload, timeout=wait_seconds) as reply:
        extracted = reply.read()
    return extracted.decode("utf-8", errors="replace")
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
def html_to_text(raw: bytes) -> str:
    """Convert an HTML document given as bytes into cleaned plain text.

    The bytes are decoded as UTF-8 with undecodable sequences replaced, then
    parsed with ``TextExtractor`` and normalized with ``clean_text``.
    """
    text = raw.decode("utf-8", errors="replace")
    parser = TextExtractor()
    parser.feed(text)
    # close() flushes text the parser is still buffering, e.g. a trailing
    # fragment with an unterminated "&" that feed() holds back in case more
    # input follows.
    parser.close()
    return clean_text(parser.text())
|
|
168
|
+
|
|
169
|
+
|
|
170
|
+
def clean_text(text: str) -> str:
    """Normalize line endings and whitespace in ``text``.

    CRLF and lone CR become a single newline, so Windows line endings are not
    doubled into blank lines. Runs of spaces and tabs collapse to one space,
    three or more consecutive newlines collapse to two, and leading and
    trailing whitespace is removed.
    """
    text = re.sub(r"\r\n?", "\n", text)
    text = re.sub(r"[ \t]+", " ", text)
    text = re.sub(r"\n{3,}", "\n\n", text)
    return text.strip()
|
|
175
|
+
|
|
176
|
+
|
|
177
|
+
def sentence_split(text: str) -> list[str]:
    """Split ``text`` into cleaned sentence-like chunks longer than 40 characters.

    Chunks are separated by whitespace that follows ``.``, ``!`` or ``?``, or
    by one or more newlines.
    """
    sentences = []
    for piece in re.split(r"(?<=[.!?])\s+|\n+", text):
        cleaned = clean_text(piece)
        if len(cleaned) > 40:
            sentences.append(cleaned)
    return sentences
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
def terms(question: str) -> set[str]:
    """Return the distinct lowercase keywords of ``question``.

    A keyword starts with a letter or digit and continues with at least two
    more letters, digits, underscores or hyphens. Common filler words are
    excluded.
    """
    ignored = {
        "about", "after", "again", "against", "also", "because", "before", "being", "between", "could", "does",
        "from", "have", "into", "more", "most", "over", "should", "than", "that", "their", "there", "these",
        "this", "through", "what", "when", "where", "which", "while", "with", "would", "your", "deep", "research",
    }
    tokens = re.findall(r"[a-zA-Z0-9][a-zA-Z0-9_-]{2,}", question.lower())
    return set(tokens) - ignored
|
|
189
|
+
|
|
190
|
+
|
|
191
|
+
def score_text(text: str, wanted: set[str]) -> float:
    """Return the fraction of ``wanted`` terms that occur in ``text``.

    Matching is a case-insensitive substring test against the lowercased
    text. Empty text or an empty term set scores ``0.0``.
    """
    if not (text and wanted):
        return 0.0
    haystack = text.lower()
    matched = [term for term in wanted if term in haystack]
    return len(matched) / max(1, len(wanted))
|
|
197
|
+
|
|
198
|
+
|
|
199
|
+
def make_queries(question: str, limit: int) -> list[str]:
    """Build up to ``limit`` distinct search queries derived from ``question``.

    The first query is the cleaned question itself; the rest append fixed
    phrases aimed at different kinds of evidence.
    """
    base = clean_text(question)
    suffixes = (
        "",
        " primary source",
        " official documentation",
        " data report",
        " analysis",
        " criticism limitations",
        " controversy",
        " recent developments",
        " statistics",
        " expert review",
        " site:gov OR site:edu",
        " filetype:pdf",
    )
    # dict.fromkeys removes duplicates while keeping first-seen order.
    unique = list(dict.fromkeys(base + suffix for suffix in suffixes))
    return unique[:limit]
|
|
220
|
+
|
|
221
|
+
|
|
222
|
+
def normalize_sites(value) -> list[str]:
    """Normalize a user-supplied site list into unique bare host names.

    Args:
        value: Either a string of hosts/URLs separated by whitespace or
            commas, or a list of such entries. Anything else yields ``[]``.

    Returns:
        Lowercase host names in first-seen order with credentials, port and
        a leading ``www.`` removed. Entries that cannot be parsed or do not
        look like a dotted host name are skipped.
    """
    if not value:
        return []
    if isinstance(value, str):
        items = re.split(r"[\s,]+", value)
    elif isinstance(value, list):
        items = value
    else:
        return []

    sites = []
    for item in items:
        text = str(item).strip().lower()
        if not text:
            continue
        try:
            parsed = parse.urlparse(text if "://" in text else f"https://{text}")
        except ValueError:
            # urlparse rejects malformed bracketed hosts such as "[bad";
            # skip the entry instead of failing the whole request.
            continue
        host = (parsed.netloc or parsed.path.split("/", 1)[0]).split("@")[-1].split(":", 1)[0]
        host = host.removeprefix("www.")
        if re.fullmatch(r"[a-z0-9.-]+\.[a-z]{2,}", host) and host not in sites:
            sites.append(host)
    return sites
|
|
243
|
+
|
|
244
|
+
|
|
245
|
+
def source_host(url: str) -> str:
    """Return the lowercase host of ``url`` without credentials, port or ``www.``.

    Any failure while parsing yields an empty string.
    """
    try:
        netloc = parse.urlparse(url).netloc.lower()
        without_credentials = netloc.split("@")[-1]
        bare_host = without_credentials.split(":", 1)[0]
        return bare_host.removeprefix("www.")
    except Exception:
        return ""
|
|
251
|
+
|
|
252
|
+
|
|
253
|
+
def matches_site(url: str, sites: list[str]) -> bool:
    """Report whether the host of ``url`` equals or is a subdomain of any listed site."""
    host = source_host(url)
    for site in sites:
        if host == site or host.endswith(f".{site}"):
            return True
    return False
|
|
256
|
+
|
|
257
|
+
|
|
258
|
+
def source_policy(overrides: dict) -> dict:
    """Derive the site policy (mode, include list, exclude list) from ``overrides``.

    Include sites come from the first non-empty of ``sites``,
    ``include_sites``, ``allowed_sites`` or ``domains``; exclude sites from
    ``exclude_sites`` or ``blocked_sites``. The mode is ``site_mode`` or
    ``source_mode`` when given; otherwise it is inferred from the include
    list and the ``restrict_sites`` flag. Unknown modes, and site-based modes
    without any include site, fall back to ``default``.
    """
    include_raw = None
    for key in ("sites", "include_sites", "allowed_sites", "domains"):
        include_raw = overrides.get(key)
        if include_raw:
            break
    include_sites = normalize_sites(include_raw)
    exclude_sites = normalize_sites(overrides.get("exclude_sites") or overrides.get("blocked_sites"))

    mode = str(overrides.get("site_mode") or overrides.get("source_mode") or "").strip().lower()
    if not mode:
        if not include_sites:
            mode = "default"
        elif overrides.get("restrict_sites"):
            mode = "restrict"
        else:
            mode = "prioritize"
    if mode not in {"default", "restrict", "prioritize"}:
        mode = "default"
    if mode != "default" and not include_sites:
        mode = "default"

    descriptions = {
        "restrict": "Restrict to listed sites",
        "prioritize": "Prioritize listed sites while allowing the broader web",
    }
    return {
        "mode": mode,
        "include_sites": include_sites,
        "exclude_sites": exclude_sites,
        "description": descriptions.get(mode, "Use broad local web search"),
    }
|
|
285
|
+
|
|
286
|
+
|
|
287
|
+
def make_policy_queries(question: str, limit: int, policy: dict) -> list[str]:
    """Build up to ``limit`` search queries that honor the site policy.

    In ``restrict`` mode every query carries a ``site:`` filter built from at
    most the first eight include sites. In ``prioritize`` mode each base
    query appears twice, first with the filter and then without. Any other
    mode, or an empty include list, yields the plain base queries.
    """
    if limit <= 0:
        return []

    include_sites = policy.get("include_sites") or []
    mode = policy.get("mode")
    if not include_sites or mode not in ("restrict", "prioritize"):
        return make_queries(question, limit)

    site_expr = " OR ".join(f"site:{site}" for site in include_sites[:8])
    base_queries = make_queries(question, limit)
    if mode == "restrict":
        return [f"{query} {site_expr}" for query in base_queries][:limit]

    # prioritize: interleave filtered and unfiltered forms, then truncate.
    interleaved = [
        variant
        for query in base_queries
        for variant in (f"{query} {site_expr}", query)
    ]
    return interleaved[:limit]
|
|
309
|
+
|
|
310
|
+
|
|
311
|
+
def document_inputs(overrides: dict) -> list[dict]:
    """Return the caller-supplied local document entries, capped at ``MAX_LOCAL_DOCUMENTS``.

    The entries come from the first non-empty of ``documents``, ``files``,
    ``local_documents`` or ``local_sources``. A single dict is treated as a
    one-element list, non-list values yield ``[]``, and non-dict list items
    are dropped.
    """
    supplied = []
    for key in ("documents", "files", "local_documents", "local_sources"):
        candidate = overrides.get(key)
        if candidate:
            supplied = candidate
            break

    if isinstance(supplied, dict):
        supplied = [supplied]
    if not isinstance(supplied, list):
        return []
    entries = [entry for entry in supplied if isinstance(entry, dict)]
    return entries[:MAX_LOCAL_DOCUMENTS]
|
|
324
|
+
|
|
325
|
+
|
|
326
|
+
def document_text(item: dict) -> str:
    """Extract the plain text of one local document entry.

    Inline text under ``text``, ``content``, ``body`` or ``markdown`` is
    preferred. Otherwise base64 data under ``content_base64`` or
    ``data_base64`` is decoded and converted according to its media type:
    HTML is parsed, other ``text/*`` types are decoded as UTF-8, and
    everything else is sent to Tika. The media type is compared after
    stripping and lowercasing because media types are case-insensitive.

    Returns:
        Cleaned text truncated to ``MAX_DOCUMENT_CHARS``, or ``""`` when no
        usable content exists or decoding/extraction fails.
    """
    for key in ("text", "content", "body", "markdown"):
        value = item.get(key)
        if isinstance(value, str) and value.strip():
            return clean_text(value)[:MAX_DOCUMENT_CHARS]

    encoded = item.get("content_base64") or item.get("data_base64")
    if isinstance(encoded, str) and encoded.strip():
        try:
            raw = base64.b64decode(encoded, validate=True)
            declared = str(item.get("content_type") or item.get("mime_type") or "text/plain")
            content_type = declared.split(";")[0].strip().lower() or "text/plain"
            if content_type in {"text/html", "application/xhtml+xml"}:
                return html_to_text(raw)[:MAX_DOCUMENT_CHARS]
            if content_type.startswith("text/"):
                return clean_text(raw.decode("utf-8", errors="replace"))[:MAX_DOCUMENT_CHARS]
            return clean_text(tika_extract(raw, content_type))[:MAX_DOCUMENT_CHARS]
        except Exception:
            # Best effort: an undecodable or unextractable document is
            # treated as empty rather than failing the whole request.
            return ""
    return ""
|
|
345
|
+
|
|
346
|
+
|
|
347
|
+
def document_plan_items(overrides: dict) -> list[dict]:
    """Summarize each usable local document as ``title``, ``chars`` and ``source``.

    Documents whose text comes back empty are omitted. The fallback title and
    source use the document's 1-based position among the supplied entries.
    """
    summaries = []
    for position, entry in enumerate(document_inputs(overrides), start=1):
        body = document_text(entry)
        if not body:
            continue
        raw_title = entry.get("title") or entry.get("name") or entry.get("filename") or f"Local document {position}"
        raw_source = entry.get("url") or entry.get("source") or f"local-document://{position}"
        summaries.append(
            {
                "title": clean_text(str(raw_title))[:220],
                "chars": len(body),
                "source": clean_text(str(raw_source))[:500],
            }
        )
    return summaries
|
|
361
|
+
|
|
362
|
+
|
|
363
|
+
def local_document_sources(overrides: dict, wanted: set[str]) -> list[Source]:
    """Turn the supplied local documents into ``Source`` objects with ids ``D1``, ``D2``, ...

    Each source is marked fetched with score 1.0 and carries up to five
    sentences ranked by overlap with ``wanted``. When no sentence qualifies,
    the first 1000 characters of the text are used instead.
    """
    collected = []
    for position, entry in enumerate(document_inputs(overrides), start=1):
        body = document_text(entry)
        if not body:
            continue
        raw_title = entry.get("title") or entry.get("name") or entry.get("filename") or f"Local document {position}"
        raw_url = entry.get("url") or entry.get("source") or f"local-document://{position}"
        document = Source(
            sid=f"D{len(collected) + 1}",
            title=clean_text(str(raw_title))[:220],
            url=clean_text(str(raw_url))[:500],
            engine="local-document",
            score=1.0,
            snippet=body[:800],
            fetched=True,
            text_chars=len(body),
        )
        # sorted() is stable, so ties keep document order.
        ranked = sorted(
            sentence_split(body[:MAX_DOCUMENT_CHARS]),
            key=lambda sentence: score_text(sentence, wanted),
            reverse=True,
        )
        document.excerpts = ranked[:5] or [body[:1000]]
        collected.append(document)
    return collected
|
|
385
|
+
|
|
386
|
+
|
|
387
|
+
def connector_inputs(overrides: dict, question: str) -> list[dict]:
    """Normalize caller-supplied connector settings into local-app search requests.

    Connector entries come from the first non-empty of ``connectors``,
    ``apps``, ``connector_sources`` or ``local_app_connectors``, plus
    ``local_app_search`` when set. ``True`` means "search with the question",
    a string is used as the query, and a dict may provide ``type``/
    ``connector``, ``query``/``q``, ``limit``/``max_results`` and ``url``.
    Only local-app connector types are kept.

    Returns:
        At most ``MAX_CONNECTOR_SOURCES`` dicts with ``type``, ``query``,
        ``limit`` (clamped to ``1..MAX_CONNECTOR_SOURCES``) and ``url``.
    """
    raw = []
    for key in ("connectors", "apps", "connector_sources", "local_app_connectors"):
        value = overrides.get(key)
        if value:
            raw = value
            break

    explicit = overrides.get("local_app_search")
    if explicit:
        if isinstance(raw, list):
            raw = [*raw, explicit]
        elif raw:
            raw = [raw, explicit]
        else:
            raw = explicit

    if raw is True:
        raw = [{"type": "local_app", "query": question}]
    elif isinstance(raw, dict):
        raw = [raw]
    elif isinstance(raw, str):
        raw = [{"type": "local_app", "query": raw}]
    elif not isinstance(raw, list):
        raw = []

    connectors = []
    for item in raw:
        if item is True:
            item = {"type": "local_app", "query": question}
        elif isinstance(item, str):
            item = {"type": "local_app", "query": item}
        if not isinstance(item, dict):
            continue
        connector_type = str(item.get("type") or item.get("connector") or "local_app").strip().lower()
        if connector_type not in {"local_app", "local-app", "local_app_connector", "local-app-connector"}:
            continue
        query = clean_text(str(item.get("query") or item.get("q") or question))
        if not query:
            continue
        try:
            requested_limit = int(item.get("limit") or item.get("max_results") or 5)
        except (TypeError, ValueError, OverflowError):
            # A malformed limit falls back to the default instead of
            # failing the whole request.
            requested_limit = 5
        connectors.append(
            {
                "type": "local_app",
                "query": query,
                "limit": max(1, min(MAX_CONNECTOR_SOURCES, requested_limit)),
                # NOTE(review): the connector URL is caller-controlled, so
                # this service can be pointed at arbitrary hosts — confirm
                # that only trusted clients can reach this endpoint.
                "url": str(item.get("url") or LOCAL_APP_CONNECTOR_URL).rstrip("/"),
            }
        )
        if len(connectors) >= MAX_CONNECTOR_SOURCES:
            break
    return connectors
|
|
438
|
+
|
|
439
|
+
|
|
440
|
+
def connector_plan_items(overrides: dict, question: str) -> list[dict]:
    """Return the normalized connector requests reduced to their plan fields.

    Each entry holds exactly ``type``, ``query``, ``limit`` and ``url``.
    """
    plan_fields = ("type", "query", "limit", "url")
    return [
        {field_name: connector[field_name] for field_name in plan_fields}
        for connector in connector_inputs(overrides, question)
    ]
|
|
450
|
+
|
|
451
|
+
|
|
452
|
+
def local_app_note_text(connector_url: str, item: dict) -> str:
    """Return the text of a connector search hit.

    For hits whose ``source`` is ``local-app-note`` and that carry an ``id``,
    the full note is requested from ``{connector_url}/local-app/notes/{id}``
    and its content is combined with its tags. On any failure, and for every
    other hit, the hit's own ``snippet`` is used.
    """
    note_id = item.get("id")
    if item.get("source") == "local-app-note" and note_id:
        try:
            # safe="" also escapes "/", so an id cannot add or change path
            # segments of the request.
            quoted_id = parse.quote(str(note_id), safe="")
            note = http_json(f"{connector_url}/local-app/notes/{quoted_id}", timeout=30).get("note", {})
            text = note.get("content") or item.get("snippet") or ""
            tags = note.get("tags") if isinstance(note.get("tags"), list) else []
            return clean_text("\n".join([str(text), " ".join(str(tag) for tag in tags)]))
        except Exception:
            # Best effort: fall through to the snippet below.
            pass
    return clean_text(str(item.get("snippet") or ""))
|
|
463
|
+
|
|
464
|
+
|
|
465
|
+
def connector_sources(overrides: dict, question: str, wanted: set[str]) -> list[Source]:
    """Query the configured local-app connectors and return hits as ``Source`` objects.

    Each connector is searched via ``{url}/local-app/search``. Hits are
    de-duplicated by ``source:id``, given ids ``C1``, ``C2``, ..., marked
    fetched with score 1.0, and carry up to five sentences ranked by overlap
    with ``wanted``. Connectors that fail or return an unexpected payload are
    skipped. At most ``MAX_CONNECTOR_SOURCES`` sources are returned.
    """
    sources = []
    seen = set()
    for connector in connector_inputs(overrides, question):
        params = parse.urlencode({"q": connector["query"], "limit": connector["limit"]})
        try:
            response = http_json(f"{connector['url']}/local-app/search?{params}", timeout=30)
        except Exception:
            continue
        items = response.get("data") if isinstance(response, dict) else None
        if not isinstance(items, list):
            # Missing, null or malformed "data" is treated as no results.
            continue
        for item in items:
            if not isinstance(item, dict):
                continue
            source_key = f"{item.get('source')}:{item.get('id')}"
            if source_key in seen:
                continue
            seen.add(source_key)
            text = local_app_note_text(connector["url"], item)
            if not text:
                continue
            title = clean_text(str(item.get("title") or source_key or "Local app connector item"))[:220]
            url = clean_text(str(item.get("url") or f"local-app://{source_key}"))[:500]
            source = Source(
                sid=f"C{len(sources) + 1}",
                title=title,
                url=url,
                engine=f"local-app-connector:{item.get('source') or 'item'}",
                score=1.0,
                snippet=text[:800],
                fetched=True,
                text_chars=len(text),
            )
            candidates = sentence_split(text[:MAX_DOCUMENT_CHARS])
            candidates.sort(key=lambda sentence: score_text(sentence, wanted), reverse=True)
            source.excerpts = candidates[:5] or [text[:1000]]
            sources.append(source)
            if len(sources) >= MAX_CONNECTOR_SOURCES:
                return sources
    return sources
|
|
501
|
+
|
|
502
|
+
|
|
503
|
+
def build_plan(question: str, overrides: dict | None = None) -> dict:
    """Assemble the research plan for ``question``.

    The plan holds the search queries (caller-supplied ``queries`` when that
    is a non-empty list, otherwise generated from the site policy), the site
    policy, summaries of local documents and connectors, the effective
    limits, and a fixed review checklist.
    """
    overrides = overrides or {}
    max_queries = int_setting(overrides, "max_queries", MAX_QUERIES)
    policy = source_policy(overrides)

    supplied_queries = overrides.get("queries")
    if isinstance(supplied_queries, list) and supplied_queries:
        cleaned = (clean_text(str(entry)) for entry in supplied_queries)
        queries = [entry for entry in cleaned if entry][:max_queries]
    else:
        queries = make_policy_queries(question, max_queries, policy)

    created_at = now()
    local_documents = document_plan_items(overrides)
    local_connectors = connector_plan_items(overrides, question)
    limits = {
        "max_queries": max_queries,
        "max_results": int_setting(overrides, "max_results", MAX_RESULTS),
        "max_sources": int_setting(overrides, "max_sources", MAX_SOURCES),
        "max_local_documents": min(MAX_LOCAL_DOCUMENTS, len(document_inputs(overrides))),
        "max_connector_sources": MAX_CONNECTOR_SOURCES,
        "max_document_chars": MAX_DOCUMENT_CHARS,
        "max_snippets": int_setting(overrides, "max_snippets", MAX_SNIPPETS, minimum=1),
        "max_tokens": int_setting(overrides, "max_tokens", MAX_TOKENS, minimum=1, maximum=8192),
        "max_excerpt_chars": int_setting(
            overrides, "max_excerpt_chars", MAX_EXCERPT_CHARS, minimum=80, maximum=10_000
        ),
    }
    review_checklist = [
        "Confirm the question, constraints, and expected output.",
        "Confirm whether the listed sites should be restricted, prioritized, or ignored.",
        "Confirm whether local documents should be included as private sources.",
        "Confirm whether local app connector sources should be included as private sources.",
        "Increase max_sources or max_tokens for broader reports; lower them for faster local runs.",
    ]
    return {
        "question": question,
        "created_at": created_at,
        "queries": queries,
        "source_policy": policy,
        "local_documents": local_documents,
        "local_connectors": local_connectors,
        "limits": limits,
        "review_checklist": review_checklist,
    }
|
|
540
|
+
|
|
541
|
+
|
|
542
|
+
def searxng_search(query: str, count: int) -> list[dict]:
    """Run ``query`` against the search endpoint at ``SEARXNG_URL`` and return up to ``count`` results.

    The request asks for JSON output in all languages and uses a 45-second
    timeout.
    """
    query_string = parse.urlencode({"q": query, "format": "json", "language": "all"})
    response = http_json(f"{SEARXNG_URL}?{query_string}", timeout=45)
    hits = response.get("results", [])
    return hits[:count]
|
|
546
|
+
|
|
547
|
+
|
|
548
|
+
def dedupe_sources(results: list[dict], max_sources: int, policy: dict | None = None) -> list[Source]:
    """Convert raw search results into unique ``Source`` objects.

    Results are processed in order. A result is skipped when it is not a
    dict, has no http(s) URL, matches an excluded site, falls outside the
    include list in ``restrict`` mode, or repeats an already seen URL (URL
    fragments are ignored). Sources get ids ``S1``, ``S2``, ... and the list
    stops at ``max_sources``.

    Malformed fields do not abort processing: a non-numeric score becomes
    ``0.0`` and non-string titles/snippets are converted to strings.
    """
    if max_sources <= 0:
        return []

    policy = policy or {}
    mode = policy.get("mode", "default")
    include_sites = policy.get("include_sites") or []
    exclude_sites = policy.get("exclude_sites") or []
    seen = set()
    sources = []
    for item in results:
        if not isinstance(item, dict):
            continue
        url = item.get("url") or item.get("parsed_url")
        if not isinstance(url, str) or not url.startswith(("http://", "https://")):
            continue
        normalized = re.sub(r"#.*$", "", url)
        if exclude_sites and matches_site(normalized, exclude_sites):
            continue
        if mode == "restrict" and include_sites and not matches_site(normalized, include_sites):
            continue
        if normalized in seen:
            continue
        seen.add(normalized)
        try:
            score = float(item.get("score") or 0.0)
        except (TypeError, ValueError):
            score = 0.0
        sources.append(
            Source(
                sid=f"S{len(sources) + 1}",
                title=clean_text(str(item.get("title") or normalized))[:220],
                url=normalized,
                engine=str(item.get("engine") or ""),
                score=score,
                snippet=clean_text(str(item.get("content") or item.get("snippet") or ""))[:800],
            )
        )
        if len(sources) >= max_sources:
            break
    return sources
|
|
584
|
+
|
|
585
|
+
|
|
586
|
+
def fetch_source(source: Source, wanted: set[str]):
    """Download ``source.url``, extract its text and fill in the source in place.

    HTML is parsed, other ``text/*`` content is decoded as UTF-8, and
    anything else goes through Tika. On success ``fetched``, ``text_chars``
    and ``excerpts`` (up to five sentences ranked by overlap with ``wanted``)
    are set. On any exception only ``error`` is set, truncated to 500
    characters.
    """
    try:
        payload, media_type = http_bytes(source.url)
        if media_type in {"text/html", "application/xhtml+xml"}:
            body = html_to_text(payload)
        elif media_type.startswith("text/"):
            body = clean_text(payload.decode("utf-8", errors="replace"))
        else:
            body = clean_text(tika_extract(payload, media_type))

        source.fetched = bool(body)
        source.text_chars = len(body)
        # sorted() is stable, so ties keep document order.
        ranked = sorted(
            sentence_split(body[:200_000]),
            key=lambda sentence: score_text(sentence, wanted),
            reverse=True,
        )
        fallback = [body[:1000]] if body else []
        source.excerpts = ranked[:5] or fallback
    except Exception as exc:
        source.error = str(exc)[:500]
|
|
602
|
+
|
|
603
|
+
|
|
604
|
+
def glm_chat(messages: list[dict], max_tokens: int = 1024, temperature: float = 0.2) -> str:
    """Send a non-streaming chat completion request and return the reply text.

    The request goes to ``{GLM_BASE_URL}/chat/completions`` for model
    ``GLM_MODEL``. ``LLM_LOCK`` is held for the whole request, so calls are
    serialized.

    Returns:
        The stripped content of the first choice, or ``""`` when the response
        has no choices or the content is missing, null or not a string.
    """
    payload = {
        "model": GLM_MODEL,
        "messages": messages,
        "max_tokens": max_tokens,
        "temperature": temperature,
        "stream": False,
    }
    with LLM_LOCK:
        data = http_json(f"{GLM_BASE_URL}/chat/completions", payload=payload, timeout=GLM_TIMEOUT)

    # Walk the response defensively: an empty "choices" list or a null
    # "content" must yield an empty answer, not an exception.
    choices = data.get("choices") if isinstance(data, dict) else None
    first = choices[0] if isinstance(choices, list) and choices else {}
    message = first.get("message") if isinstance(first, dict) else None
    content = message.get("content") if isinstance(message, dict) else None
    return content.strip() if isinstance(content, str) else ""
|
|
615
|
+
|
|
616
|
+
|
|
617
|
+
def report_markdown(
    run_id: str,
    question: str,
    plan: dict,
    activity: list[dict],
    sources: list[Source],
    answer: str,
    review: dict | None = None,
    citation_audit: dict | None = None,
) -> str:
    """Render a research run as a Markdown report.

    Sections, in order: title and creation time, question, research plan
    (policy, limits, local documents, connectors, queries), activity history,
    optional review, optional citation audit, answer, and the source list.
    The result always ends with exactly one newline.
    """
    stamp_format = "%Y-%m-%d %H:%M:%S UTC"

    def joined(values) -> str:
        # Comma-separated list, or a placeholder when nothing is listed.
        return ", ".join(values or []) or "(none)"

    created = time.strftime(stamp_format, time.gmtime(plan.get("created_at") or now()))
    policy = plan.get("source_policy") or {}
    limits = plan.get("limits") or {}

    out = [
        f"# Deep Research Report: {run_id}",
        "",
        f"Created: {created}",
        "",
        "## Question",
        "",
        question,
        "",
        "## Research Plan",
        "",
        f"- Source mode: {policy.get('mode', 'default')} ({policy.get('description', 'Use broad local web search')})",
        f"- Include sites: {joined(policy.get('include_sites'))}",
        f"- Exclude sites: {joined(policy.get('exclude_sites'))}",
        f"- Limits: {json.dumps(limits, sort_keys=True)}",
        "",
        "### Local Documents",
        "",
    ]

    document_rows = [
        f"- {document.get('title')} ({document.get('chars')} chars, {document.get('source')})"
        for document in (plan.get("local_documents") or [])
    ]
    out += document_rows or ["- (none)"]

    out += ["", "### Local App Connectors", ""]
    connector_rows = [
        f"- {connector.get('type')} query `{connector.get('query')}` "
        f"(limit {connector.get('limit')}, {connector.get('url')})"
        for connector in (plan.get("local_connectors") or [])
    ]
    out += connector_rows or ["- (none)"]

    out += ["", "### Queries", ""]
    query_rows = [f"- {query}" for query in plan.get("queries", [])]
    out += query_rows or ["- (no search queries; local dry run)"]

    out += ["", "## Activity History", ""]
    if activity:
        for event in activity:
            clock = time.strftime("%H:%M:%S", time.gmtime(event.get("ts") or now()))
            phase = event.get("phase", "step")
            out.append(f"- {clock} [{phase}] {event.get('message', '')}")
    else:
        out.append("- (no activity recorded)")

    if review:
        updated_at = int(review.get("updated_at") or 0)
        if updated_at:
            updated = time.strftime(stamp_format, time.gmtime(updated_at))
        else:
            updated = "(unknown)"
        out += [
            "",
            "## Review",
            "",
            f"- Status: {review.get('status', 'reviewed')}",
            f"- Revision count: {review.get('revision_count', 0)}",
            f"- Updated: {updated}",
            f"- Reviewer: {review.get('reviewer') or '(local user)'}",
            f"- Note: {review.get('note') or '(none)'}",
        ]

    if citation_audit:
        out += [
            "",
            "## Citation Audit",
            "",
            f"- Status: {citation_audit.get('status', 'unknown')}",
            f"- Citation count: {citation_audit.get('citation_count', 0)}",
            f"- Valid citations: {joined(citation_audit.get('valid_citation_ids'))}",
            f"- Invalid citations: {joined(citation_audit.get('invalid_citation_ids'))}",
            f"- Uncited sources: {joined(citation_audit.get('uncited_source_ids'))}",
        ]

    out += ["", "## Answer", "", answer, "", "## Sources Used", ""]
    if sources:
        for source in sources:
            if source.fetched:
                status = "fetched"
            else:
                status = f"failed: {source.error or 'not fetched'}"
            out.append(f"- [{source.sid}] {source.title} - {source.url} ({status}, {source.text_chars} chars)")
    else:
        out.append("- (no sources used)")

    return "\n".join(out).strip() + "\n"
|
|
715
|
+
|
|
716
|
+
|
|
717
|
+
def html_text_block(text: str) -> str:
    """Render plain text as HTML-escaped paragraphs.

    The text is escaped first and then split on runs of two or more
    newlines. Every non-blank chunk becomes a ``<p>`` element, and the
    newlines left inside a chunk are turned into ``<br>``. When no chunk
    survives (empty, ``None`` or whitespace-only input) a
    ``<p>No content.</p>`` placeholder is returned instead.
    """
    safe_text = html.escape(text or "")
    chunks = (piece.strip() for piece in re.split(r"\n{2,}", safe_text))
    rendered = ["<p>" + chunk.replace("\n", "<br>") + "</p>" for chunk in chunks if chunk]
    if not rendered:
        return "<p>No content.</p>"
    return "\n".join(rendered)
|
|
726
|
+
|
|
727
|
+
|
|
728
|
+
def report_html(
    run_id: str,
    question: str,
    plan: dict,
    activity: list[dict],
    sources: list[Source],
    answer: str,
    markdown: str,
    review: dict | None = None,
    citation_audit: dict | None = None,
) -> str:
    """Render the standalone HTML report for one research run.

    The page has a sidebar (plan, review, citation audit, local documents,
    local app connectors, queries, download links) and a main column
    (answer, citation audit, source cards, activity timeline, raw Markdown).

    Args:
        run_id: Run identifier, shown in the sidebar and in ``data-run-id``.
        question: Research question; the first 160 characters are the title.
        plan: Plan dict; ``created_at``, ``source_policy``, ``limits``,
            ``local_documents``, ``local_connectors`` and ``queries`` are read.
        activity: Event dicts; ``ts``, ``phase`` and ``message`` are read.
        sources: Sources rendered as cards.
        answer: Answer text, rendered through ``html_text_block``.
        markdown: Markdown report, embedded escaped in a ``<pre>`` block.
        review: Optional review dict (``status``, ``revision_count``,
            ``updated_at``, ``reviewer``, ``note``).
        citation_audit: Optional citation audit dict.

    Returns:
        The complete HTML document as a string.
    """

    def esc(value) -> str:
        # html.escape() only accepts str. Values pulled out of the plan and
        # activity dicts may be None or numbers, which the Markdown report
        # tolerates via f-strings; coerce here so the HTML export does too.
        return html.escape("" if value is None else str(value))

    def joined(values) -> str:
        # Comma-separated list with a "(none)" placeholder; items are coerced
        # so a non-string entry cannot break str.join().
        return ", ".join(str(value) for value in values or []) or "(none)"

    def table_rows(rows) -> str:
        # One <tr> per (label, value) pair, both escaped.
        return "\n".join(f"<tr><th>{esc(label)}</th><td>{esc(value)}</td></tr>" for label, value in rows)

    created = time.strftime("%Y-%m-%d %H:%M:%S UTC", time.gmtime(plan.get("created_at") or now()))
    policy = plan.get("source_policy") or {}
    limits = plan.get("limits") or {}
    local_documents = plan.get("local_documents") or []
    local_connectors = plan.get("local_connectors") or []
    queries = plan.get("queries") or []
    review = review or {}
    review_status = str(review.get("status") or "unreviewed")

    def link(filename: str, label: str) -> str:
        return f'<a href="{esc(filename)}" download>{esc(label)}</a>'

    # Relative links: the sibling artifacts live next to report.html.
    download_links = " ".join(
        [
            link("report.md", "Markdown"),
            link("report.html", "HTML"),
            link("report.docx", "Word DOCX"),
            link("report.doc", "Word DOC"),
            link("report.pdf", "PDF"),
            link("report-bundle.zip", "Bundle ZIP"),
            link("source-pack.json", "JSON"),
            link("citation-audit.json", "Citation Audit"),
            link("activity.json", "Activity"),
        ]
    )

    source_cards = []
    for source in sources:
        status = "Fetched" if source.fetched else f"Failed: {source.error or 'not fetched'}"
        # Prefer the first extracted excerpt; fall back to the search snippet.
        excerpt = source.excerpts[0] if source.excerpts else source.snippet
        source_cards.append(
            "\n".join(
                [
                    '<article class="source-card" data-source-id="' + esc(source.sid) + '">',
                    '<div class="source-card__head">',
                    f"<span>{esc(source.sid)}</span>",
                    f"<strong>{esc(source.title)}</strong>",
                    "</div>",
                    f'<a href="{esc(source.url)}">{esc(source.url)}</a>',
                    f"<p>{esc(status)} · {source.text_chars} chars · {esc(source.engine or 'source')}</p>",
                    f"<blockquote>{esc(str(excerpt or '')[:900])}</blockquote>",
                    "</article>",
                ]
            )
        )
    if not source_cards:
        source_cards.append('<p class="empty">No source excerpts were used for this dry-run report.</p>')

    activity_items = []
    for event in activity:
        timestamp = time.strftime("%H:%M:%S", time.gmtime(event.get("ts") or now()))
        activity_items.append(
            "<li><time>"
            + esc(timestamp)
            + "</time><span>"
            + esc(event.get("phase", "step"))
            + "</span><p>"
            + esc(event.get("message", ""))
            + "</p></li>"
        )
    if not activity_items:
        activity_items.append("<li><time>--:--:--</time><span>idle</span><p>No activity recorded.</p></li>")

    plan_rows = [
        ("Source mode", f"{policy.get('mode', 'default')} - {policy.get('description', 'Use broad local web search')}"),
        ("Include sites", joined(policy.get("include_sites"))),
        ("Exclude sites", joined(policy.get("exclude_sites"))),
        ("Local documents", str(len(local_documents))),
        ("Local app connectors", str(len(local_connectors))),
        ("Limits", json.dumps(limits, sort_keys=True)),
    ]
    plan_table = table_rows(plan_rows)

    review_updated_at = int(review.get("updated_at") or 0)
    review_updated = (
        time.strftime("%Y-%m-%d %H:%M:%S UTC", time.gmtime(review_updated_at))
        if review_updated_at
        else "(not reviewed)"
    )
    review_rows = [
        ("Status", review_status),
        ("Revisions", str(review.get("revision_count") or 0)),
        ("Updated", review_updated),
        ("Reviewer", str(review.get("reviewer") or "(local user)")),
        ("Note", str(review.get("note") or "(none)")),
    ]
    review_table = table_rows(review_rows)
    citation_audit = citation_audit or {}
    citation_rows = [
        ("Status", str(citation_audit.get("status") or "unknown")),
        ("Citation count", str(citation_audit.get("citation_count") or 0)),
        ("Valid citations", joined(citation_audit.get("valid_citation_ids"))),
        ("Invalid citations", joined(citation_audit.get("invalid_citation_ids"))),
        ("Uncited sources", joined(citation_audit.get("uncited_source_ids"))),
    ]
    citation_table = table_rows(citation_rows)

    connector_items = "\n".join(
        f"<li>{esc(item.get('type', 'local_app'))}: {esc(item.get('query', ''))} "
        f"(limit {esc(item.get('limit', ''))}, {esc(item.get('url', ''))})</li>"
        for item in local_connectors
    ) or "<li>(none)</li>"
    document_items = "\n".join(
        f"<li>{esc(item.get('title', 'Local document'))} ({esc(item.get('chars', 0))} chars)</li>"
        for item in local_documents
    ) or "<li>(none)</li>"
    query_items = "\n".join(f"<li>{esc(query)}</li>" for query in queries) or "<li>(no web queries)</li>"

    markdown_escaped = esc(markdown)
    return (
        "<!doctype html>\n"
        "<html><head><meta charset=\"utf-8\"><title>Deep Research Report</title>"
        "<meta name=\"viewport\" content=\"width=device-width,initial-scale=1\">"
        "<style>"
        ":root{color-scheme:light;--ink:#151719;--muted:#5a6472;--line:#d9dee7;--bg:#f7f8fa;"
        "--panel:#fff;--accent:#0f766e;--blue:#1d4ed8;--amber:#b45309}"
        "*{box-sizing:border-box}body{margin:0;background:var(--bg);color:var(--ink);"
        "font-family:Inter,ui-sans-serif,system-ui,-apple-system,BlinkMacSystemFont,'Segoe UI',sans-serif;line-height:1.5}"
        "a{color:var(--blue);text-decoration:none}a:hover{text-decoration:underline}"
        ".shell{display:grid;grid-template-columns:minmax(260px,320px) minmax(0,1fr);min-height:100vh}"
        "aside{border-right:1px solid var(--line);background:#fff;padding:24px;position:sticky;top:0;height:100vh;overflow:auto}"
        "main{padding:32px;max-width:1100px}.eyebrow{font-size:12px;text-transform:uppercase;letter-spacing:.08em;color:var(--accent);font-weight:700}"
        "h1{font-size:32px;line-height:1.15;margin:8px 0 12px}h2{font-size:20px;margin:0 0 14px}h3{font-size:15px;margin:0 0 8px}"
        ".meta{color:var(--muted);font-size:14px}.download-bar{display:flex;flex-wrap:wrap;gap:8px;margin:18px 0}"
        ".download-bar a{border:1px solid var(--line);border-radius:6px;padding:7px 10px;background:#fff;font-size:13px}"
        ".review-badge{display:inline-flex;align-items:center;border:1px solid #99f6e4;background:#e6fffb;color:#0f766e;border-radius:999px;padding:4px 9px;font-size:12px;font-weight:700}"
        "section{margin:0 0 24px}.panel{background:var(--panel);border:1px solid var(--line);border-radius:8px;padding:18px}"
        "table{border-collapse:collapse;width:100%;font-size:14px}th,td{border-top:1px solid var(--line);padding:9px 0;text-align:left;vertical-align:top}"
        "th{width:150px;color:var(--muted);font-weight:600}.answer p{margin:0 0 12px}.source-list{display:grid;gap:12px}"
        ".source-card{border:1px solid var(--line);border-radius:8px;background:#fff;padding:14px}.source-card__head{display:flex;gap:10px;align-items:flex-start}"
        ".source-card__head span{background:#e6fffb;color:#0f766e;border:1px solid #99f6e4;border-radius:999px;padding:2px 7px;font-size:12px;font-weight:700}"
        ".source-card p,.source-card a{font-size:13px}.source-card blockquote{margin:10px 0 0;border-left:3px solid var(--accent);padding-left:10px;color:#29313b}"
        ".activity-timeline{list-style:none;padding:0;margin:0;display:grid;gap:10px}.activity-timeline li{border-left:3px solid var(--line);padding-left:12px}"
        ".activity-timeline time{font-size:12px;color:var(--muted);margin-right:8px}.activity-timeline span{font-size:12px;color:var(--amber);font-weight:700;text-transform:uppercase}"
        ".raw-markdown{white-space:pre-wrap;word-wrap:break-word;background:#101418;color:#f7f8fa;border-radius:8px;padding:16px;overflow:auto;font-size:13px}"
        ".empty{color:var(--muted)}ul{padding-left:18px;margin-top:8px}@media(max-width:820px){.shell{grid-template-columns:1fr}aside{position:relative;height:auto;border-right:0;border-bottom:1px solid var(--line)}main{padding:20px}h1{font-size:25px}}"
        "</style></head><body>"
        f'<div class="shell" data-report-shell="deep-research-report" data-run-id="{esc(run_id)}" data-review-status="{esc(review_status)}">'
        "<aside>"
        '<div class="eyebrow">Deep Research</div>'
        f"<h1>{esc(question[:160] or 'Research report')}</h1>"
        f'<p class="meta">Run {esc(run_id)}<br>Created {esc(created)}</p>'
        f'<p><span class="review-badge">Review: {esc(review_status)}</span></p>'
        f'<nav class="download-bar" aria-label="Downloads">{download_links}</nav>'
        "<section><h2>Plan</h2><table>" + plan_table + "</table></section>"
        '<section><h2>Review</h2><table data-review-panel="deep-research-review">' + review_table + "</table></section>"
        '<section><h2>Citation Audit</h2><table data-citation-audit="deep-research-citations">' + citation_table + "</table></section>"
        "<section><h3>Local Documents</h3><ul>" + document_items + "</ul></section>"
        "<section><h3>Local App Connectors</h3><ul>" + connector_items + "</ul></section>"
        "<section><h3>Queries</h3><ul>" + query_items + "</ul></section>"
        "</aside><main>"
        '<section class="panel answer" id="answer"><h2>Answer</h2>'
        + html_text_block(answer)
        + "</section>"
        '<section class="panel" id="citation-audit"><h2>Citation Audit</h2><table data-citation-audit-panel="deep-research-citations">'
        + citation_table
        + "</table></section>"
        '<section class="panel" id="sources"><h2>Sources Used</h2><div class="source-list">'
        + "\n".join(source_cards)
        + "</div></section>"
        '<section class="panel" id="activity"><h2>Activity History</h2><ol class="activity-timeline">'
        + "\n".join(activity_items)
        + "</ol></section>"
        '<section class="panel" id="raw"><h2>Raw Markdown</h2><pre class="raw-markdown">'
        + markdown_escaped
        + "</pre></section>"
        "</main></div></body></html>\n"
    )
|
|
913
|
+
|
|
914
|
+
|
|
915
|
+
def pdf_escape(text: str) -> str:
    """Make *text* safe to place inside a PDF literal string.

    Characters that cannot be encoded as Latin-1 are replaced with ``?``;
    backslashes and both parentheses are then prefixed with a backslash.
    """
    latin_only = text.encode("latin-1", errors="replace").decode("latin-1")
    # A single translate pass gives the same result as replacing the
    # backslash first and the parentheses afterwards.
    return latin_only.translate(str.maketrans({"\\": "\\\\", "(": "\\(", ")": "\\)"}))
|
|
917
|
+
|
|
918
|
+
|
|
919
|
+
def docx_escape(text: str) -> str:
    """Prepare *text* for embedding in the generated DOCX XML.

    The value is converted with ``str()``, the control characters matched by
    the pattern below are removed (tab, line feed and carriage return are
    kept), and ``&``, ``<`` and ``>`` are escaped. Quotes are left as they are.
    """
    return html.escape(re.sub(r"[\x00-\x08\x0b\x0c\x0e-\x1f]", "", str(text)), quote=False)
|
|
922
|
+
|
|
923
|
+
|
|
924
|
+
def docx_paragraph(text: str, style: str | None = None) -> str:
|
|
925
|
+
if not text:
|
|
926
|
+
return "<w:p/>"
|
|
927
|
+
style_xml = f'<w:pPr><w:pStyle w:val="{docx_escape(style)}"/></w:pPr>' if style else ""
|
|
928
|
+
return f'<w:p>{style_xml}<w:r><w:t xml:space="preserve">{docx_escape(text)}</w:t></w:r></w:p>'
|
|
929
|
+
|
|
930
|
+
|
|
931
|
+
def report_docx(markdown: str) -> bytes:
    """Build a minimal DOCX package from the Markdown report.

    Each Markdown line becomes one paragraph: ``# `` maps to the ``Title``
    style, ``## `` to ``Heading1``, ``### `` to ``Heading2``, ``- `` to a
    bullet-prefixed plain paragraph, blank lines to empty paragraphs and
    everything else to a plain paragraph. Inline Markdown is not interpreted.

    Args:
        markdown: The Markdown report text.

    Returns:
        The bytes of a ZIP archive containing the content types, package
        relationships, core/app properties, the document, its styles and the
        document-level relationships.
    """
    paragraphs = []
    for raw_line in markdown.splitlines():
        line = raw_line.rstrip()
        if not line:
            paragraphs.append(docx_paragraph(""))
        elif line.startswith("# "):
            paragraphs.append(docx_paragraph(line[2:].strip(), "Title"))
        elif line.startswith("## "):
            paragraphs.append(docx_paragraph(line[3:].strip(), "Heading1"))
        elif line.startswith("### "):
            paragraphs.append(docx_paragraph(line[4:].strip(), "Heading2"))
        elif line.startswith("- "):
            paragraphs.append(docx_paragraph(f"• {line[2:].strip()}"))
        else:
            paragraphs.append(docx_paragraph(line))

    document_xml = (
        '<?xml version="1.0" encoding="UTF-8" standalone="yes"?>'
        '<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">'
        "<w:body>"
        + "".join(paragraphs)
        + '<w:sectPr><w:pgSz w:w="12240" w:h="15840"/><w:pgMar w:top="1440" w:right="1440" w:bottom="1440" w:left="1440" w:header="720" w:footer="720" w:gutter="0"/></w:sectPr>'
        "</w:body></w:document>"
    )
    styles_xml = (
        '<?xml version="1.0" encoding="UTF-8" standalone="yes"?>'
        '<w:styles xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">'
        '<w:style w:type="paragraph" w:styleId="Normal"><w:name w:val="Normal"/></w:style>'
        '<w:style w:type="paragraph" w:styleId="Title"><w:name w:val="Title"/><w:basedOn w:val="Normal"/>'
        '<w:rPr><w:b/><w:sz w:val="32"/></w:rPr></w:style>'
        '<w:style w:type="paragraph" w:styleId="Heading1"><w:name w:val="heading 1"/><w:basedOn w:val="Normal"/>'
        '<w:rPr><w:b/><w:sz w:val="28"/></w:rPr></w:style>'
        '<w:style w:type="paragraph" w:styleId="Heading2"><w:name w:val="heading 2"/><w:basedOn w:val="Normal"/>'
        '<w:rPr><w:b/><w:sz w:val="24"/></w:rPr></w:style>'
        "</w:styles>"
    )
    # The same timestamp is used for both the created and modified properties.
    created = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime(now()))
    core_xml = (
        '<?xml version="1.0" encoding="UTF-8" standalone="yes"?>'
        '<cp:coreProperties xmlns:cp="http://schemas.openxmlformats.org/package/2006/metadata/core-properties" '
        'xmlns:dc="http://purl.org/dc/elements/1.1/" '
        'xmlns:dcterms="http://purl.org/dc/terms/" '
        'xmlns:dcmitype="http://purl.org/dc/dcmitype/" '
        'xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">'
        "<dc:title>Deep Research Report</dc:title>"
        "<dc:creator>OpenWebUI Local Deep Research</dc:creator>"
        f'<dcterms:created xsi:type="dcterms:W3CDTF">{created}</dcterms:created>'
        f'<dcterms:modified xsi:type="dcterms:W3CDTF">{created}</dcterms:modified>'
        "</cp:coreProperties>"
    )
    app_xml = (
        '<?xml version="1.0" encoding="UTF-8" standalone="yes"?>'
        '<Properties xmlns="http://schemas.openxmlformats.org/officeDocument/2006/extended-properties" '
        'xmlns:vt="http://schemas.openxmlformats.org/officeDocument/2006/docPropsVTypes">'
        "<Application>OpenWebUI Local Deep Research</Application>"
        "</Properties>"
    )
    content_types_xml = (
        '<?xml version="1.0" encoding="UTF-8" standalone="yes"?>'
        '<Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">'
        '<Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>'
        '<Default Extension="xml" ContentType="application/xml"/>'
        '<Override PartName="/word/document.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml"/>'
        '<Override PartName="/word/styles.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.styles+xml"/>'
        '<Override PartName="/docProps/core.xml" ContentType="application/vnd.openxmlformats-package.core-properties+xml"/>'
        '<Override PartName="/docProps/app.xml" ContentType="application/vnd.openxmlformats-officedocument.extended-properties+xml"/>'
        "</Types>"
    )
    root_rels_xml = (
        '<?xml version="1.0" encoding="UTF-8" standalone="yes"?>'
        '<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">'
        '<Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="word/document.xml"/>'
        '<Relationship Id="rId2" Type="http://schemas.openxmlformats.org/package/2006/relationships/metadata/core-properties" Target="docProps/core.xml"/>'
        '<Relationship Id="rId3" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/extended-properties" Target="docProps/app.xml"/>'
        "</Relationships>"
    )
    # The styles part is only attached to the document through this
    # relationship; with an empty relationships file word/styles.xml is an
    # orphan and the Title/Heading style definitions above are not applied.
    document_rels_xml = (
        '<?xml version="1.0" encoding="UTF-8" standalone="yes"?>'
        '<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">'
        '<Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/styles" Target="styles.xml"/>'
        "</Relationships>"
    )

    buffer = io.BytesIO()
    with zipfile.ZipFile(buffer, "w", compression=zipfile.ZIP_DEFLATED) as archive:
        archive.writestr("[Content_Types].xml", content_types_xml)
        archive.writestr("_rels/.rels", root_rels_xml)
        archive.writestr("docProps/core.xml", core_xml)
        archive.writestr("docProps/app.xml", app_xml)
        archive.writestr("word/document.xml", document_xml)
        archive.writestr("word/styles.xml", styles_xml)
        archive.writestr("word/_rels/document.xml.rels", document_rels_xml)
    return buffer.getvalue()
|
|
1023
|
+
|
|
1024
|
+
|
|
1025
|
+
def report_pdf(markdown: str) -> bytes:
    """Render the Markdown report as a plain-text PDF.

    Lines are wrapped at 92 characters and laid out 54 per US-Letter page in
    10 pt Helvetica. Markdown syntax is not interpreted; the text is written
    as-is after ``pdf_escape``.

    Args:
        markdown: The Markdown report text.

    Returns:
        The bytes of a PDF 1.4 file with one page object and one content
        stream per page (a single empty page for empty input).
    """
    wrapped_lines = []
    for line in markdown.splitlines():
        if not line:
            wrapped_lines.append("")
            continue
        # textwrap.wrap() returns [] for whitespace-only input; keep the row.
        wrapped_lines.extend(textwrap.wrap(line, width=92, replace_whitespace=False) or [""])

    pages = [wrapped_lines[index : index + 54] for index in range(0, len(wrapped_lines), 54)] or [[]]
    objects: dict[int, bytes] = {
        1: b"<< /Type /Catalog /Pages 2 0 R >>",
        # The content streams are Latin-1 bytes. Without an explicit encoding
        # the font's built-in encoding is used, which maps bytes >= 0x80 to
        # different glyphs, so accented characters come out wrong.
        3: b"<< /Type /Font /Subtype /Type1 /BaseFont /Helvetica /Encoding /WinAnsiEncoding >>",
    }
    kids = []
    next_id = 4  # ids 1-3 are the catalog, the page tree and the font
    for page_lines in pages:
        page_id = next_id
        content_id = next_id + 1
        next_id += 2
        kids.append(f"{page_id} 0 R")
        # Start at the top-left margin with a 13 pt leading; T* moves down one line.
        commands = ["BT", "/F1 10 Tf", "54 756 Td", "13 TL"]
        for line in page_lines:
            commands.append(f"({pdf_escape(line)}) Tj")
            commands.append("T*")
        commands.append("ET")
        stream = "\n".join(commands).encode("latin-1", errors="replace")
        objects[page_id] = (
            f"<< /Type /Page /Parent 2 0 R /Resources << /Font << /F1 3 0 R >> >> "
            f"/MediaBox [0 0 612 792] /Contents {content_id} 0 R >>"
        ).encode("ascii")
        objects[content_id] = b"<< /Length " + str(len(stream)).encode("ascii") + b" >>\nstream\n" + stream + b"\nendstream"
    objects[2] = f"<< /Type /Pages /Kids [{' '.join(kids)}] /Count {len(kids)} >>".encode("ascii")

    output = bytearray(b"%PDF-1.4\n")
    offsets = [0]  # entry 0 stands for the free-list head of the xref table
    for object_id in range(1, max(objects) + 1):
        offsets.append(len(output))
        output.extend(f"{object_id} 0 obj\n".encode("ascii"))
        output.extend(objects[object_id])
        output.extend(b"\nendobj\n")
    xref_at = len(output)
    output.extend(f"xref\n0 {len(offsets)}\n".encode("ascii"))
    output.extend(b"0000000000 65535 f \n")
    for offset in offsets[1:]:
        output.extend(f"{offset:010d} 00000 n \n".encode("ascii"))
    output.extend(
        f"trailer\n<< /Size {len(offsets)} /Root 1 0 R >>\nstartxref\n{xref_at}\n%%EOF\n".encode("ascii")
    )
    return bytes(output)
|
|
1074
|
+
|
|
1075
|
+
|
|
1076
|
+
def source_from_payload(item: dict) -> Source:
    """Build a ``Source`` from a stored payload dict.

    Missing or falsy fields fall back to defaults: text fields to ``""``
    (``title`` to ``"Untitled source"``), ``score`` to ``0.0``,
    ``text_chars`` to ``0``, and ``excerpts`` to an empty list unless the
    stored value is a list.
    """

    def text(key: str, fallback: str = "") -> str:
        return str(item.get(key) or fallback)

    stored_excerpts = item.get("excerpts")
    if not isinstance(stored_excerpts, list):
        stored_excerpts = []

    return Source(
        sid=text("sid"),
        title=text("title", "Untitled source"),
        url=text("url"),
        engine=text("engine"),
        score=float(item.get("score") or 0.0),
        snippet=text("snippet"),
        fetched=bool(item.get("fetched")),
        error=text("error"),
        text_chars=int(item.get("text_chars") or 0),
        excerpts=stored_excerpts,
    )
|
|
1089
|
+
|
|
1090
|
+
|
|
1091
|
+
def source_pack_markdown(run_id: str, question: str, queries: list[str], sources: list[Source], answer: str) -> str:
    """Format the source pack as a Markdown document.

    The output lists the question, the queries as bullets, one ``###``
    section per source (URL, fetch status, optional search snippet and every
    excerpt as a blockquote) and finally the answer.
    """
    out = [f"# Source Pack: {run_id}", "", f"Question: {question}", "", "## Queries", ""]
    out += [f"- {query}" for query in queries]
    out += ["", "## Sources", ""]
    for src in sources:
        if src.fetched:
            status = "fetched"
        else:
            status = f"failed: {src.error or 'not fetched'}"
        out += [f"### [{src.sid}] {src.title}", f"- URL: {src.url}", f"- Status: {status}", ""]
        if src.snippet:
            out += ["Search snippet:", "", f"> {src.snippet}", ""]
        for excerpt in src.excerpts:
            out += [f"> {excerpt}", ""]
    out += ["## Final Answer", "", answer]
    return "\n".join(out)
|
|
1104
|
+
|
|
1105
|
+
|
|
1106
|
+
def citation_audit_for_answer(run_id: str, answer: str, sources: list[Source], review: dict | None = None) -> dict:
    """Compare the ``[S<n>]`` citations in *answer* with the run's sources.

    Args:
        run_id: Run identifier copied into the result.
        answer: Answer text scanned for ``[S<digits>]`` markers.
        sources: Sources of the run; those with an empty ``sid`` are ignored.
        review: Optional review dict; only its ``status`` is read.

    Returns:
        A dict with the distinct cited ids in order of first appearance,
        their split into valid (matching a source id) and invalid ids, the
        source ids never cited, matching counts, metadata for the validly
        cited sources, and static privacy flags. ``status`` is
        ``"needs_review"`` when any cited id matches no source and
        ``"ready"`` otherwise, including when the answer cites nothing.
    """
    source_ids = [source.sid for source in sources if source.sid]
    source_id_set = set(source_ids)
    # dict.fromkeys() de-duplicates while keeping first-appearance order.
    cited_ids = list(
        dict.fromkeys(f"S{match.group(1)}" for match in re.finditer(r"\[S(\d+)\]", answer or ""))
    )
    # Build the lookup sets once instead of once per comprehension iteration.
    cited_id_set = set(cited_ids)
    valid_ids = [citation_id for citation_id in cited_ids if citation_id in source_id_set]
    invalid_ids = [citation_id for citation_id in cited_ids if citation_id not in source_id_set]
    uncited_ids = [source_id for source_id in source_ids if source_id not in cited_id_set]
    valid_id_set = set(valid_ids)
    cited_sources = [
        {
            "id": source.sid,
            "title": source.title,
            "url": source.url,
            "engine": source.engine,
            "fetched": source.fetched,
            "text_chars": source.text_chars,
            "excerpt_count": len(source.excerpts or []),
        }
        for source in sources
        if source.sid in valid_id_set
    ]
    status = "ready" if not invalid_ids else "needs_review"
    return {
        "source": "deep-research-citation-audit",
        "run_id": run_id,
        "generated_at": now(),
        "status": status,
        "review_status": (review or {}).get("status", "unreviewed"),
        "citation_count": len(cited_ids),
        "source_count": len(source_ids),
        "valid_citation_count": len(valid_ids),
        "invalid_citation_count": len(invalid_ids),
        "uncited_source_count": len(uncited_ids),
        "cited_source_ids": cited_ids,
        "valid_citation_ids": valid_ids,
        "invalid_citation_ids": invalid_ids,
        "uncited_source_ids": uncited_ids,
        "all_citations_valid": not invalid_ids,
        "has_citations": bool(cited_ids),
        "cited_sources": cited_sources,
        "privacy": {
            "local_only": True,
            "derived_from_source_pack": True,
            "prompt_bodies_excluded": True,
        },
    }
|
|
1155
|
+
|
|
1156
|
+
|
|
1157
|
+
# Artifact filenames that write_run_bundle() adds to report-bundle.zip, in
# this order, whenever they exist in the run directory.
RUN_BUNDLE_FILES = (
    "report.md",
    "report.html",
    "report.docx",
    "report.doc",
    "report.pdf",
    "source-pack.md",
    "source-pack.json",
    "citation-audit.json",
    "activity.json",
    "revisions.json",
)
|
|
1169
|
+
|
|
1170
|
+
|
|
1171
|
+
def write_run_bundle(run_dir: Path, run_id: str):
    """Write ``report-bundle.zip`` for a run.

    Every name in ``RUN_BUNDLE_FILES`` that exists in *run_dir* is added to
    the archive, together with a ``manifest.json`` listing each included
    file's name, content type and size plus static privacy flags. An
    existing bundle is overwritten.
    """
    files = [
        {
            "filename": name,
            "content_type": RUN_FILE_TYPES.get(name, "application/octet-stream"),
            "bytes": (run_dir / name).stat().st_size,
        }
        for name in RUN_BUNDLE_FILES
        if (run_dir / name).exists()
    ]
    privacy = {
        "local_only": True,
        "includes_source_pack": True,
        "includes_private_local_sources_if_used": True,
        "prompt_bodies_excluded": True,
    }
    manifest = {
        "source": "deep-research-report-bundle",
        "run_id": run_id,
        "generated_at": now(),
        "files": files,
        "privacy": privacy,
    }
    manifest_text = json.dumps(manifest, indent=2, ensure_ascii=False) + "\n"
    with zipfile.ZipFile(run_dir / "report-bundle.zip", "w", zipfile.ZIP_DEFLATED) as archive:
        archive.writestr("manifest.json", manifest_text)
        for entry in files:
            # Store each file flat, under its bare filename.
            archive.write(run_dir / entry["filename"], entry["filename"])
|
|
1200
|
+
|
|
1201
|
+
|
|
1202
|
+
def write_run_artifacts(run_dir: Path, payload: dict):
    """Normalise *payload* and write every artifact of a run into *run_dir*.

    The payload is updated in place with the normalised ``run_id``,
    ``question``, ``queries``, ``plan``, ``activity``, ``answer``,
    ``sources`` and a freshly computed ``citation_audit``. The JSON and
    Markdown source pack, the activity log, the citation audit, the report
    in Markdown/HTML/DOC/DOCX/PDF form and finally the ZIP bundle are then
    written, in that order.
    """
    run_dir.mkdir(parents=True, exist_ok=True)

    def list_field(key: str) -> list:
        # Anything that is not a list is treated as absent.
        value = payload.get(key)
        return value if isinstance(value, list) else []

    run_id = str(payload.get("run_id") or run_dir.name)
    question = str(payload.get("question") or "")
    queries = list_field("queries")
    plan = payload.get("plan")
    if not isinstance(plan, dict):
        # No usable stored plan: synthesise a minimal one.
        plan = {
            "question": question,
            "created_at": payload.get("created_at") or now(),
            "queries": queries,
            "source_policy": source_policy({}),
        }
    activity = list_field("activity")
    answer = str(payload.get("answer") or "")
    review = payload.get("review")
    if not isinstance(review, dict):
        review = None
    sources = [source_from_payload(entry) for entry in payload.get("sources") or [] if isinstance(entry, dict)]
    citation_audit = citation_audit_for_answer(run_id, answer, sources, review=review)

    payload.update(
        {
            "run_id": run_id,
            "question": question,
            "queries": queries,
            "plan": plan,
            "activity": activity,
            "answer": answer,
            "sources": [asdict(source) for source in sources],
            "citation_audit": citation_audit,
        }
    )

    def write_json(filename: str, data) -> None:
        (run_dir / filename).write_text(json.dumps(data, indent=2, ensure_ascii=False), encoding="utf-8")

    write_json("source-pack.json", payload)
    write_json("activity.json", activity)
    write_json("citation-audit.json", citation_audit)
    (run_dir / "source-pack.md").write_text(
        source_pack_markdown(run_id, question, queries, sources, answer), encoding="utf-8"
    )

    report_md = report_markdown(
        run_id, question, plan, activity, sources, answer, review=review, citation_audit=citation_audit
    )
    (run_dir / "report.md").write_text(report_md, encoding="utf-8")
    html_report = report_html(
        run_id, question, plan, activity, sources, answer, report_md, review=review, citation_audit=citation_audit
    )
    # report.doc receives the same HTML document as report.html.
    for html_name in ("report.html", "report.doc"):
        (run_dir / html_name).write_text(html_report, encoding="utf-8")
    (run_dir / "report.docx").write_bytes(report_docx(report_md))
    (run_dir / "report.pdf").write_bytes(report_pdf(report_md))
    # The bundle goes last so that it picks up the files written above.
    write_run_bundle(run_dir, run_id)
|
|
1245
|
+
|
|
1246
|
+
|
|
1247
|
+
def source_pack(
    run_dir: Path,
    run_id: str,
    question: str,
    queries: list[str],
    sources: list[Source],
    answer: str,
    plan: dict | None = None,
    activity: list[dict] | None = None,
):
    """Assemble a run payload and write all of its artifacts.

    A missing or empty *plan* is replaced by a minimal plan built from the
    question, the current time, the queries and ``source_policy({})``; a
    missing *activity* becomes an empty list. The resulting payload is handed
    to ``write_run_artifacts``.
    """
    run_dir.mkdir(parents=True, exist_ok=True)
    if not plan:
        plan = {"question": question, "created_at": now(), "queries": queries, "source_policy": source_policy({})}
    write_run_artifacts(
        run_dir,
        {
            "run_id": run_id,
            "question": question,
            "created_at": now(),
            "queries": queries,
            "plan": plan,
            "activity": activity or [],
            "answer": answer,
            "sources": [asdict(source) for source in sources],
        },
    )
|
|
1271
|
+
|
|
1272
|
+
|
|
1273
|
+
def runs_root() -> Path:
    """Return the ``runs`` directory under ``STORAGE``, creating it if needed."""
    root = STORAGE / "runs"
    root.mkdir(parents=True, exist_ok=True)
    return root
|
|
1277
|
+
|
|
1278
|
+
|
|
1279
|
+
def load_run_summary(run_dir: Path) -> dict | None:
    """Build the listing summary for one run directory.

    Reads ``source-pack.json`` from ``run_dir`` and derives counts, review
    and citation-audit status, an answer preview, and the URLs of every
    artifact file that is present.

    The pack is treated as untrusted on-disk data: a missing, unreadable,
    non-UTF-8, non-JSON or non-object pack yields ``None`` instead of an
    exception, and wrongly typed fields fall back to neutral defaults, so a
    single damaged run cannot break the whole run listing.

    Args:
        run_dir: Directory of the run to summarize.

    Returns:
        The summary dict, or ``None`` when no usable source pack exists.
    """

    def as_dict(value):
        # Only real JSON objects are trusted for ``.get`` lookups.
        return value if isinstance(value, dict) else {}

    def as_int(value):
        try:
            return int(value or 0)
        except (TypeError, ValueError, OverflowError):
            return 0

    def count(value):
        try:
            return len(value or ())
        except TypeError:
            return 0

    pack_path = run_dir / "source-pack.json"
    try:
        pack = json.loads(pack_path.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        # OSError: missing/unreadable file. ValueError covers both
        # json.JSONDecodeError and UnicodeDecodeError.
        return None
    if not isinstance(pack, dict):
        return None

    run_id = str(pack.get("run_id") or run_dir.name)
    plan = as_dict(pack.get("plan"))
    review = as_dict(pack.get("review"))
    citation_audit = as_dict(pack.get("citation_audit"))
    policy = as_dict(plan.get("source_policy"))

    # Prefer the recorded timestamp; fall back to the pack file's mtime.
    created_at = as_int(pack.get("created_at")) or as_int(plan.get("created_at"))
    if not created_at:
        try:
            created_at = int(pack_path.stat().st_mtime)
        except OSError:
            created_at = 0

    created_at_iso = ""
    if created_at:
        try:
            created_at_iso = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime(created_at))
        except (OverflowError, OSError, ValueError):
            # Timestamp outside the platform's supported range.
            created_at_iso = ""

    base_url = f"{PUBLIC_BASE_URL}/runs/{run_id}"

    artifacts = []
    for filename, content_type in RUN_FILE_TYPES.items():
        try:
            size = (run_dir / filename).stat().st_size
        except OSError:
            # Not present (or vanished between listing and stat): skip it.
            continue
        artifacts.append(
            {
                "filename": filename,
                "content_type": content_type,
                "bytes": size,
                "url": f"{base_url}/{filename}",
            }
        )

    answer = str(pack.get("answer") or "")
    return {
        "id": run_id,
        "question": clean_text(str(pack.get("question") or plan.get("question") or "")),
        "created_at": created_at,
        "created_at_iso": created_at_iso,
        # Always a string so callers can safely join/search it.
        "source_mode": str(policy.get("mode") or "default"),
        "source_count": count(pack.get("sources")),
        "activity_count": count(pack.get("activity")),
        "local_document_count": count(plan.get("local_documents")),
        "local_connector_count": count(plan.get("local_connectors")),
        "review_status": review.get("status", "unreviewed"),
        "revision_count": as_int(review.get("revision_count")),
        "citation_audit_status": citation_audit.get("status"),
        "citation_count": as_int(citation_audit.get("citation_count")),
        "invalid_citation_count": as_int(citation_audit.get("invalid_citation_count")),
        "answer_preview": clean_text(answer)[:360],
        "artifacts": artifacts,
        "report_url": f"{base_url}/report.html",
        "source_pack_url": f"{base_url}/source-pack.json",
        "citation_audit_url": f"{base_url}/citation-audit.json",
        "review_url": f"{base_url}/review",
    }
|
|
1333
|
+
|
|
1334
|
+
|
|
1335
|
+
def list_run_summaries(query: str = "", limit: int = 50) -> list[dict]:
    """Return summaries of stored runs, newest first, optionally filtered.

    Args:
        query: Whitespace-separated search terms. A run matches only when
            every term occurs (case-insensitively) in its id, question,
            answer preview or source mode.
        limit: Maximum number of summaries; clamped to the range 1..500,
            with a falsy value treated as 50.

    Returns:
        The matching summaries sorted by ``created_at`` descending.
    """
    needles = [piece for piece in re.split(r"\s+", clean_text(query).lower()) if piece]
    searchable_keys = ("id", "question", "answer_preview", "source_mode")

    matched = []
    for entry in runs_root().iterdir():
        summary = load_run_summary(entry) if entry.is_dir() else None
        if not summary:
            continue
        blob = " ".join([summary.get(key, "") for key in searchable_keys]).lower()
        if needles and any(needle not in blob for needle in needles):
            continue
        matched.append(summary)

    matched.sort(key=lambda entry: entry.get("created_at") or 0, reverse=True)
    cap = max(1, min(500, int(limit or 50)))
    return matched[:cap]
|
|
1358
|
+
|
|
1359
|
+
|
|
1360
|
+
def report_library_html(runs: list[dict], query: str = "") -> str:
    """Render the searchable report library page as a complete HTML document.

    Args:
        runs: Run summaries; each contributes one card. Keys read are
            ``id``, ``created_at_iso``, ``report_url``, ``question``,
            ``answer_preview``, the five count fields and ``artifacts``
            (a list of dicts with ``url`` and ``filename``).
        query: Current search text, echoed (escaped) into the search box.

    Returns:
        The HTML page. When ``runs`` is empty a placeholder paragraph is
        rendered instead of cards.
    """
    esc = html.escape
    counters = (
        ("Sources", "source_count"),
        ("Events", "activity_count"),
        ("Docs", "local_document_count"),
        ("Apps", "local_connector_count"),
        ("Revisions", "revision_count"),
    )

    def render_card(run: dict) -> str:
        # All dynamic values pass through html.escape before interpolation.
        links = " ".join(
            f'<a href="{esc(artifact["url"])}">{esc(artifact["filename"])}</a>'
            for artifact in run.get("artifacts", [])
        )
        run_id = esc(run.get("id", ""))
        stamp = esc(run.get("created_at_iso", ""))
        href = esc(run.get("report_url", ""))
        heading = esc(run.get("question", "") or "Untitled research run")
        preview = esc(run.get("answer_preview", ""))
        rows = [
            f'<article class="run-card" data-run-id="{run_id}">',
            '<div class="run-card__main">',
            f"<time>{stamp}</time>",
            f'<h2><a href="{href}">{heading}</a></h2>',
            f"<p>{preview}</p>",
            "</div>",
            '<dl class="run-card__meta">',
        ]
        for label, key in counters:
            rows.append(f"<div><dt>{label}</dt><dd>{esc(str(run.get(key, 0)))}</dd></div>")
        rows.extend(
            [
                "</dl>",
                f'<nav class="artifact-links">{links}</nav>',
                "</article>",
            ]
        )
        return "\n".join(rows)

    cards = [render_card(run) for run in runs]
    if not cards:
        cards = ['<p class="empty">No research runs match this search.</p>']

    stylesheet = "".join(
        [
            ":root{--ink:#151719;--muted:#5a6472;--line:#d9dee7;--bg:#f7f8fa;--panel:#fff;--accent:#0f766e;--blue:#1d4ed8}",
            "*{box-sizing:border-box}body{margin:0;background:var(--bg);color:var(--ink);font-family:Inter,ui-sans-serif,system-ui,-apple-system,BlinkMacSystemFont,'Segoe UI',sans-serif;line-height:1.5}",
            "main{max-width:1120px;margin:0 auto;padding:32px 20px}a{color:var(--blue);text-decoration:none}a:hover{text-decoration:underline}",
            ".top{display:flex;align-items:flex-end;justify-content:space-between;gap:16px;margin-bottom:20px}.eyebrow{font-size:12px;text-transform:uppercase;letter-spacing:.08em;color:var(--accent);font-weight:700}",
            "h1{margin:4px 0 0;font-size:32px}.search{display:flex;gap:8px}.search input{border:1px solid var(--line);border-radius:6px;padding:8px 10px;min-width:260px}.search button{border:1px solid var(--line);border-radius:6px;background:#fff;padding:8px 12px}",
            ".run-list{display:grid;gap:12px}.run-card{background:var(--panel);border:1px solid var(--line);border-radius:8px;padding:16px;display:grid;grid-template-columns:minmax(0,1fr) auto;gap:14px}",
            ".run-card time{font-size:12px;color:var(--muted)}.run-card h2{font-size:18px;margin:4px 0 8px}.run-card p{margin:0;color:#29313b}.run-card__meta{display:grid;grid-template-columns:repeat(5,1fr);gap:8px;margin:0}.run-card__meta div{border:1px solid var(--line);border-radius:6px;padding:8px;min-width:62px;text-align:center}.run-card__meta dt{font-size:11px;color:var(--muted)}.run-card__meta dd{font-weight:700;margin:0}.artifact-links{grid-column:1/-1;display:flex;flex-wrap:wrap;gap:8px}.artifact-links a{border:1px solid var(--line);border-radius:6px;padding:6px 8px;background:#fff;font-size:13px}.empty{background:#fff;border:1px solid var(--line);border-radius:8px;padding:18px;color:var(--muted)}",
            "@media(max-width:760px){.top{display:block}.search{margin-top:14px}.search input{min-width:0;width:100%}.run-card{grid-template-columns:1fr}.run-card__meta{grid-template-columns:repeat(2,1fr)}}",
        ]
    )

    search_form = (
        '<form class="search" method="get" action="/runs/index.html">'
        f'<input name="q" value="{esc(query)}" placeholder="Search reports">'
        '<button type="submit">Search</button></form></div>'
    )

    pieces = [
        "<!doctype html>\n",
        '<html><head><meta charset="utf-8"><title>Deep Research Library</title>',
        '<meta name="viewport" content="width=device-width,initial-scale=1">',
        "<style>",
        stylesheet,
        "</style></head><body>",
        '<main data-report-library="deep-research-runs">',
        '<div class="top"><div><div class="eyebrow">Deep Research</div><h1>Report Library</h1></div>',
        search_form,
        '<section class="run-list">',
        "\n".join(cards),
        "</section></main></body></html>\n",
    ]
    return "".join(pieces)
|
|
1411
|
+
|
|
1412
|
+
|
|
1413
|
+
def delete_run(run_id: str) -> bool:
    """Delete one run directory and everything inside it.

    ``run_id`` arrives from the request path, so it is validated before any
    filesystem change: the resolved target must be a direct child of the
    runs root. This rejects ``.``, ``..``, empty ids, absolute paths and
    symlinks that point outside the root, any of which would otherwise let
    a DELETE request remove data outside the intended run.

    Args:
        run_id: Name of the run directory under the runs root.

    Returns:
        ``True`` when the run existed and was removed, ``False`` when the
        id is invalid or no such run directory exists.

    Raises:
        OSError: If removal fails partway (e.g. a permission error).
    """
    import shutil  # local import: only this function needs it

    root = runs_root().resolve()
    target = (root / run_id).resolve()
    if target.parent != root or not target.is_dir():
        return False

    # rmtree removes arbitrarily deep trees and unlinks symlinks without
    # following them; the previous hand-rolled loop failed on any
    # directory nested two levels deep.
    shutil.rmtree(target)
    return True
|
|
1427
|
+
|
|
1428
|
+
|
|
1429
|
+
def review_run(run_id: str, payload: dict) -> dict:
    """Store a reviewer's revised answer for a run and regenerate artifacts.

    The revised text is taken from the first non-empty of the payload keys
    ``revised_answer``, ``answer``, ``markdown`` and ``content``. A revision
    record is appended to ``revisions.json``, the pack's answer, activity
    log and review block are updated (the first answer is kept under
    ``original_answer``), and ``write_run_artifacts`` rewrites the outputs.

    Args:
        run_id: Identifier of the run directory under the runs root.
        payload: Request body with the revised answer and optional
            ``note``/``review_note`` and ``reviewer``.

    Returns:
        A dict with ``status``, the ``review`` block, the new ``revision``,
        the refreshed run summary and the artifact URLs.

    Raises:
        FileNotFoundError: If the run directory or its source pack is missing.
        ValueError: If no non-empty revised answer was supplied.
    """
    run_dir = runs_root() / run_id
    pack_path = run_dir / "source-pack.json"
    if not (run_dir.is_dir() and pack_path.exists()):
        raise FileNotFoundError("run not found")

    pack = json.loads(pack_path.read_text(encoding="utf-8"))

    answer_keys = ("revised_answer", "answer", "markdown", "content")
    raw_answer = next((payload.get(key) for key in answer_keys if payload.get(key)), "")
    revised_answer = clean_text(str(raw_answer))
    if not revised_answer:
        raise ValueError("review requires a non-empty revised answer")

    raw_note = payload.get("note") or payload.get("review_note") or ""
    note = clean_text(str(raw_note))[:1000]
    reviewer = clean_text(str(payload.get("reviewer") or "local-user"))[:120]
    revision_id = str(uuid.uuid4())
    updated_at = now()

    # Load the existing revision history; anything unusable starts fresh.
    revisions_path = run_dir / "revisions.json"
    revisions = []
    if revisions_path.exists():
        try:
            loaded = json.loads(revisions_path.read_text(encoding="utf-8"))
        except json.JSONDecodeError:
            loaded = None
        if isinstance(loaded, list):
            revisions = loaded

    previous_answer = str(pack.get("answer") or "")
    # Only the very first review captures the original answer.
    pack.setdefault("original_answer", previous_answer)

    revision = {
        "id": revision_id,
        "updated_at": updated_at,
        "reviewer": reviewer,
        "note": note,
        "answer_chars": len(revised_answer),
        "previous_answer_preview": clean_text(previous_answer)[:360],
    }
    revisions.append(revision)
    revisions_path.write_text(json.dumps(revisions, indent=2, ensure_ascii=False), encoding="utf-8")

    activity = pack.get("activity")
    if not isinstance(activity, list):
        activity = []
    activity.append(
        {
            "ts": updated_at,
            "phase": "review",
            "message": clean_text(f"Reviewed report saved as revision {revision_id}. {note}"),
        }
    )

    pack["activity"] = activity
    pack["answer"] = revised_answer
    pack["review"] = {
        "status": "reviewed",
        "updated_at": updated_at,
        "revision_count": len(revisions),
        "latest_revision_id": revision_id,
        "reviewer": reviewer,
        "note": note,
        "original_answer_preserved": bool(pack.get("original_answer")),
    }

    write_run_artifacts(run_dir, pack)

    artifact_files = {
        "report_md": "report.md",
        "report_html": "report.html",
        "report_docx": "report.docx",
        "report_doc": "report.doc",
        "report_pdf": "report.pdf",
        "report_bundle": "report-bundle.zip",
        "citation_audit": "citation-audit.json",
        "revisions": "revisions.json",
    }
    return {
        "status": True,
        "review": pack["review"],
        "revision": revision,
        "run": load_run_summary(run_dir),
        "artifacts": {
            key: f"{PUBLIC_BASE_URL}/runs/{run_id}/{filename}"
            for key, filename in artifact_files.items()
        },
    }
|
|
1516
|
+
|
|
1517
|
+
|
|
1518
|
+
def synthesize(
    question: str,
    sources: list[Source],
    run_id: str,
    max_tokens: int,
    max_snippets: int,
    max_excerpt_chars: int,
    extractive_only: bool = False,
) -> str:
    """Turn gathered sources into the answer text for a research run.

    Args:
        question: The research question.
        sources: Sources whose ``sid``, ``title``, ``url`` and ``excerpts``
            are read.
        run_id: Accepted for interface compatibility; not used here.
        max_tokens: Token budget passed to ``glm_chat``.
        max_snippets: Number of evidence snippets placed in the prompt.
        max_excerpt_chars: Each excerpt is truncated to this many characters.
        extractive_only: When true, skip the model call and return a quoted
            evidence summary instead.

    Returns:
        The answer text. If no source has excerpts, a fixed apology string;
        if the model call raises, a fallback message that includes the
        error and lists up to 20 sources.
    """
    evidence = []
    for src in sources:
        if not src.excerpts:
            continue
        # At most three excerpts per source.
        evidence.extend(
            f"[{src.sid}] {src.title}\nURL: {src.url}\nExcerpt: {passage[:max_excerpt_chars]}"
            for passage in src.excerpts[:3]
        )
        if len(evidence) >= max_snippets:
            break
    if not evidence:
        return "I could not gather enough source text to produce a grounded deep research answer."

    if extractive_only:
        lines = [
            "Answer",
            "",
            f"Local extractive research summary for: {question}",
            "",
            "Evidence",
            "",
        ]
        line_cap = 2 + (max_snippets * 2)
        for src in sources:
            if src.excerpts:
                lines.append(f"- [{src.sid}] {src.title}: {src.excerpts[0][:max_excerpt_chars]}")
                if len(lines) >= line_cap:
                    break
        lines += [
            "",
            "Contradictions or uncertainty",
            "",
            "- This fast extractive mode quotes the available local evidence and does not ask GLM to infer beyond it.",
            "",
            "Sources to inspect",
            "",
        ]
        lines += [f"- [{src.sid}] {src.title}: {src.url}" for src in sources[:max_snippets]]
        return "\n".join(lines)

    evidence_text = "\n\n".join(evidence[:max_snippets])
    prompt = "".join(
        [
            "You are writing a deep research answer. Use only the provided sources. ",
            "Cite claims with source ids like [S1]. Identify disagreements, uncertainty, and source quality issues. ",
            "Do not cite a source unless the cited sentence is supported by its excerpt.\n\n",
            f"Question:\n{question}\n\n",
            "Evidence:\n",
            evidence_text,
            "\n\n",
            "Write a concise but thorough answer with sections: Answer, Evidence, Contradictions or uncertainty, Sources to inspect.",
        ]
    )
    conversation = [
        {"role": "system", "content": "You are a careful research analyst. You always cite sources accurately."},
        {"role": "user", "content": prompt},
    ]
    try:
        return glm_chat(conversation, max_tokens=max_tokens, temperature=0.2)
    except Exception as exc:
        # Best effort: a failed model call still returns the source list.
        source_lines = [f"- [{src.sid}] {src.title}: {src.url}" for src in sources[:20]]
        listed = "\n".join(source_lines)
        return (
            "Deep research gathered sources, but GLM synthesis did not complete.\n\n"
            f"Error: {exc}\n\n"
            "Source pack contains the extracted evidence for manual review.\n\n"
            f"{listed}"
        )
|
|
1592
|
+
|
|
1593
|
+
|
|
1594
|
+
def build_research(question: str, progress=None, overrides: dict | None = None) -> str:
    """Run one complete research pass and return the answer text.

    Steps, in order: build the plan, load local document and connector
    sources, run every planned search, dedupe and fetch the web sources,
    rank and renumber all sources, synthesize the answer, flag citation ids
    that match no source, write the run artifacts, and append Markdown
    links to those artifacts.

    Args:
        question: The research question.
        progress: Optional callable invoked with each progress message.
        overrides: Optional per-request settings passed to the planner and
            source loaders; ``extractive_only`` is also read here.

    Returns:
        The answer followed by links to the source pack and report files.
    """
    overrides = overrides or {}
    plan = build_plan(question, overrides)
    limits = plan["limits"]
    max_results, max_sources, max_snippets, max_tokens, max_excerpt_chars = (
        limits[key]
        for key in ("max_results", "max_sources", "max_snippets", "max_tokens", "max_excerpt_chars")
    )
    run_id = str(uuid.uuid4())
    run_dir = STORAGE / "runs" / run_id
    activity = []

    def say(message: str, phase: str = "progress"):
        # Record every message in the activity log; stream it when asked to.
        activity.append({"ts": now(), "phase": phase, "message": clean_text(message)})
        if progress:
            progress(message)

    queries = plan["queries"]
    policy = plan["source_policy"]
    wanted = terms(question)
    query_total = len(queries)

    say(f"Research run `{run_id}` started.\n\n", "start")
    say(
        f"Search plan: {query_total} queries, source mode `{policy['mode']}`, up to {max_sources} sources.\n\n",
        "plan",
    )

    document_sources = local_document_sources(overrides, wanted)
    say(f"Loaded {len(document_sources)} local document sources.\n\n", "documents")
    app_sources = connector_sources(overrides, question, wanted)
    say(f"Loaded {len(app_sources)} local app connector sources.\n\n", "connectors")

    # Split the overall result budget evenly across queries (at least 1 each).
    per_query = max(1, max_results // max(1, query_total))
    hits = []
    for position, query in enumerate(queries, start=1):
        say(f"Searching {position}/{query_total}: `{query}`\n\n", "search")
        try:
            hits.extend(searxng_search(query, per_query))
        except Exception as exc:
            # A failed query is logged and skipped; the run continues.
            say(f"Search failed for `{query}`: {exc}\n\n", "search_error")

    web_sources = dedupe_sources(hits, max_sources, policy)
    web_total = len(web_sources)
    say(f"Reading {web_total} deduplicated web sources.\n\n", "read")
    for position, source in enumerate(web_sources, start=1):
        say(f"Reading {position}/{web_total} [{source.sid}] {source.title}\n\n", "read")
        fetch_source(source, wanted)

    # Rank richest sources first, then renumber so ids follow the ranking.
    sources = document_sources + app_sources + web_sources
    sources.sort(key=lambda entry: (len(entry.excerpts), entry.score, entry.text_chars), reverse=True)
    for rank, source in enumerate(sources, start=1):
        source.sid = f"S{rank}"

    say("Synthesizing answer with GLM 5.2.\n\n", "synthesize")
    answer = synthesize(
        question,
        sources,
        run_id,
        max_tokens=max_tokens,
        max_snippets=max_snippets,
        max_excerpt_chars=max_excerpt_chars,
        extractive_only=bool(overrides.get("extractive_only")),
    )

    cited = set(re.findall(r"\[S(\d+)\]", answer))
    known = {str(number) for number in range(1, len(sources) + 1)}
    unknown = sorted(cited - known)
    if unknown:
        answer += "\n\nCitation verification note: these citation ids were not in the source pack: " + ", ".join(unknown)

    say("Writing source pack, activity log, and downloadable reports.\n\n", "write_artifacts")
    source_pack(run_dir, run_id, question, queries, sources, answer, plan=plan, activity=activity)

    base_url = f"{PUBLIC_BASE_URL}/runs/{run_id}"

    def link(label: str, filename: str) -> str:
        return f"[{label}]({base_url}/{filename})"

    pack_links = " | ".join(
        [
            link("Markdown", "source-pack.md"),
            link("JSON", "source-pack.json"),
        ]
    )
    artifact_links = " | ".join(
        [
            link("Report Markdown", "report.md"),
            link("HTML", "report.html"),
            link("Word DOCX", "report.docx"),
            link("Word DOC", "report.doc"),
            link("PDF", "report.pdf"),
            link("Bundle ZIP", "report-bundle.zip"),
            link("Citation Audit", "citation-audit.json"),
            link("Activity", "activity.json"),
        ]
    )
    answer += f"\n\nSource pack: {pack_links}" + f"\n\nResearch artifacts: {artifact_links}"
    return answer
|
|
1681
|
+
|
|
1682
|
+
|
|
1683
|
+
def chunk_payload(content: str, finish_reason=None) -> dict:
    """Build one ``chat.completion.chunk`` object for the streaming response.

    Args:
        content: Text for this chunk; empty content yields an empty delta.
        finish_reason: Value stored on the single choice (``None`` for
            intermediate chunks).

    Returns:
        The chunk dict with a fresh ``chatcmpl-`` id and current timestamp.
    """
    chunk = {
        "id": f"chatcmpl-{uuid.uuid4().hex[:12]}",
        "object": "chat.completion.chunk",
        "created": now(),
        "model": MODEL_ID,
    }
    delta = {"content": content} if content else {}
    chunk["choices"] = [{"index": 0, "delta": delta, "finish_reason": finish_reason}]
    return chunk
|
|
1691
|
+
|
|
1692
|
+
|
|
1693
|
+
def last_user_message(payload: dict) -> str:
    """Return the text of the most recent user message in a chat payload.

    Messages are scanned from last to first. String content is returned
    unchanged; list content is flattened by joining the ``text`` of each
    dict part with newlines (parts without text contribute an empty
    string). A user message whose content is neither a string nor a list
    is skipped and the search continues with earlier messages.

    The payload comes straight from the request body, so malformed shapes
    are tolerated instead of raising: a non-list ``messages`` value,
    non-dict message entries, and parts whose ``text`` is not a string.

    Args:
        payload: Decoded request body; only ``messages`` is read.

    Returns:
        The message text, or ``""`` when no usable user message exists.
    """
    messages = payload.get("messages")
    if not isinstance(messages, (list, tuple)):
        return ""

    for message in reversed(messages):
        if not isinstance(message, dict) or message.get("role") != "user":
            continue
        content = message.get("content", "")
        if isinstance(content, str):
            return content
        if isinstance(content, list):
            texts = []
            for part in content:
                if not isinstance(part, dict):
                    continue
                text = part.get("text", "")
                # A null or non-string "text" would make str.join raise.
                texts.append(text if isinstance(text, str) else "")
            return "\n".join(texts)
    return ""
|
|
1703
|
+
|
|
1704
|
+
|
|
1705
|
+
def documents_from_messages(messages: list[dict]) -> list[dict]:
    """Collect documents attached to chat messages as content parts.

    Only list-valued message content is inspected. A part qualifies when
    its ``type`` (case-insensitive) is ``file``, ``input_file``,
    ``document`` or ``local_document`` and it carries either text
    (``text``/``content``) or base64 data (``content_base64``/
    ``data_base64``).

    Args:
        messages: Chat messages from the request payload.

    Returns:
        Document dicts with ``title``, ``text``, ``content_base64``,
        ``content_type`` and ``source``, truncated to
        ``MAX_LOCAL_DOCUMENTS`` entries.
    """
    accepted_types = {"file", "input_file", "document", "local_document"}
    collected = []
    for message in messages:
        parts = message.get("content")
        if not isinstance(parts, list):
            continue
        for part in parts:
            if not isinstance(part, dict):
                continue
            if str(part.get("type") or "").lower() not in accepted_types:
                continue
            body = part.get("text") or part.get("content")
            blob = part.get("content_base64") or part.get("data_base64")
            if not (body or blob):
                continue
            # 1-based position, used for the fallback title and source.
            ordinal = len(collected) + 1
            collected.append(
                {
                    "title": part.get("title") or part.get("name") or part.get("filename") or f"Message document {ordinal}",
                    "text": body,
                    "content_base64": blob,
                    "content_type": part.get("content_type") or part.get("mime_type"),
                    "source": part.get("url") or part.get("source") or f"message-document://{ordinal}",
                }
            )
    return collected[:MAX_LOCAL_DOCUMENTS]
|
|
1731
|
+
|
|
1732
|
+
|
|
1733
|
+
def merge_message_documents(payload: dict, overrides: dict) -> dict:
    """Add documents attached to chat messages to the research overrides.

    Args:
        payload: Request body; its ``messages`` are scanned for documents.
        overrides: Current override settings.

    Returns:
        ``overrides`` itself when the messages carry no documents;
        otherwise a shallow copy whose ``documents`` holds the existing
        document inputs followed by the message documents, capped at
        ``MAX_LOCAL_DOCUMENTS``.
    """
    attached = documents_from_messages(payload.get("messages") or [])
    if attached:
        # Copy so the caller's overrides dict is left untouched.
        combined = dict(overrides)
        already_present = document_inputs(combined)
        combined["documents"] = (list(already_present) + list(attached))[:MAX_LOCAL_DOCUMENTS]
        return combined
    return overrides
|
|
1741
|
+
|
|
1742
|
+
|
|
1743
|
+
# Downloadable files of a run, mapped to the Content-Type they are served
# with. The HTTP file route only serves names listed here, and run summaries
# list artifacts in this order.
RUN_FILE_TYPES = dict(
    (
        ("source-pack.md", "text/markdown; charset=utf-8"),
        ("source-pack.json", "application/json; charset=utf-8"),
        ("citation-audit.json", "application/json; charset=utf-8"),
        ("activity.json", "application/json; charset=utf-8"),
        ("report.md", "text/markdown; charset=utf-8"),
        ("report.html", "text/html; charset=utf-8"),
        ("report.docx", "application/vnd.openxmlformats-officedocument.wordprocessingml.document"),
        ("report.doc", "application/msword; charset=utf-8"),
        ("report.pdf", "application/pdf"),
        ("report-bundle.zip", "application/zip"),
        ("revisions.json", "application/json; charset=utf-8"),
    )
)
|
|
1756
|
+
|
|
1757
|
+
|
|
1758
|
+
class Handler(BaseHTTPRequestHandler):
    """HTTP request handler for the deep research sidecar.

    Routes (``/v1/runs...`` is accepted as an alias of ``/runs...``):

    * ``GET /health``, ``GET /models`` and ``GET /v1/models``
    * ``GET /runs`` (JSON listing) and ``GET /runs/index.html`` (HTML library)
    * ``GET /runs/<id>`` (summary) and ``GET /runs/<id>/<file>`` (artifact)
    * ``DELETE /runs/<id>``
    * ``POST /runs/<id>/review``
    * ``POST /research/plan`` and ``POST /chat/completions`` (each also
      under ``/v1``); chat completions optionally stream as SSE.
    """

    server_version = "openwebui-deep-research/0.1"

    # ----------------------------------------------------------- helpers

    @staticmethod
    def _strip_v1(path):
        """Map ``/v1/runs...`` onto ``/runs...``; leave other paths alone."""
        return path[3:] if path.startswith("/v1/runs") else path

    @staticmethod
    def _int_param(query, name, default):
        """Read an integer query parameter, using ``default`` if absent or invalid."""
        raw = (query.get(name) or [""])[0]
        try:
            return int(raw) if raw else default
        except ValueError:
            return default

    @staticmethod
    def _is_plain_run_id(run_id):
        """Reject path segments that would escape the runs directory."""
        return run_id not in {".", ".."} and "\\" not in run_id

    @staticmethod
    def _research_overrides(payload, fallback):
        """Pick the ``deep_research`` settings from the payload or its metadata."""
        metadata = payload.get("metadata")
        # ``metadata`` may be null or missing; only a dict is consulted.
        nested = metadata.get("deep_research") if isinstance(metadata, dict) else None
        return payload.get("deep_research") or nested or fallback

    # ---------------------------------------------------------- handlers

    def log_message(self, fmt, *args):
        """Write the access-log line to stdout, flushed immediately."""
        print("%s - - [%s] %s" % (self.client_address[0], self.log_date_time_string(), fmt % args), flush=True)

    def do_OPTIONS(self):
        """Answer CORS preflight requests."""
        # NOTE(review): a wildcard origin combined with DELETE lets any web
        # page the user visits call this API; confirm that is intended.
        self.send_response(204)
        self.send_header("Access-Control-Allow-Origin", "*")
        self.send_header("Access-Control-Allow-Headers", "Authorization, Content-Type")
        self.send_header("Access-Control-Allow-Methods", "GET, POST, DELETE, OPTIONS")
        self.end_headers()

    def do_GET(self):
        """Serve health, model listing, run listings, summaries and artifacts."""
        parsed = parse.urlparse(self.path)
        path = self._strip_v1(parsed.path)
        query = parse.parse_qs(parsed.query)

        if path == "/health":
            return send_json(self, 200, {"status": "ok", "model": MODEL_ID})

        if path in {"/v1/models", "/models"}:
            return send_json(
                self,
                200,
                {
                    "object": "list",
                    "data": [{"id": MODEL_ID, "object": "model", "created": now(), "owned_by": "local"}],
                    "models": [{"name": MODEL_ID, "model": MODEL_ID, "type": "model"}],
                },
            )

        if path in {"/runs", "/runs/"}:
            search_query = (query.get("q") or [""])[0]
            limit = self._int_param(query, "limit", 50)
            return send_json(self, 200, {"data": list_run_summaries(search_query, limit)})

        if path == "/runs/index.html":
            search_query = (query.get("q") or [""])[0]
            limit = self._int_param(query, "limit", 100)
            html_body = report_library_html(list_run_summaries(search_query, limit), search_query).encode("utf-8")
            return send_bytes(self, 200, html_body, "text/html; charset=utf-8")

        match = re.match(r"^/runs/([^/]+)$", path)
        if match:
            run_id = match.group(1)
            summary = load_run_summary(runs_root() / run_id) if self._is_plain_run_id(run_id) else None
            if not summary:
                return send_json(self, 404, {"error": "not found"})
            return send_json(self, 200, {"run": summary})

        match = re.match(r"^/runs/([^/]+)/([a-z0-9-]+\.(?:md|json|html|docx|doc|pdf|zip))$", path)
        if match:
            run_id, filename = match.groups()
            # Only whitelisted artifact names inside a real run directory.
            if filename not in RUN_FILE_TYPES or not self._is_plain_run_id(run_id):
                return send_json(self, 404, {"error": "not found"})
            file_path = STORAGE / "runs" / run_id / filename
            if not file_path.is_file():
                return send_json(self, 404, {"error": "not found"})
            body = file_path.read_bytes()
            return send_bytes(self, 200, body, RUN_FILE_TYPES[filename])

        return send_json(self, 404, {"error": "not found"})

    def do_DELETE(self):
        """Delete one run: ``DELETE /runs/<id>``."""
        path = self._strip_v1(parse.urlparse(self.path).path)
        match = re.match(r"^/runs/([^/]+)$", path)
        if not match or not self._is_plain_run_id(match.group(1)):
            # ``.``/``..`` would address the runs root or its parent.
            return send_json(self, 404, {"error": "not found"})
        if not delete_run(match.group(1)):
            return send_json(self, 404, {"error": "not found"})
        return send_json(self, 200, {"status": True})

    def do_POST(self):
        """Handle review submissions, plan previews and chat completions."""
        path = self._strip_v1(parse.urlparse(self.path).path)

        match = re.match(r"^/runs/([0-9a-f-]+)/review$", path)
        if match:
            try:
                return send_json(self, 200, review_run(match.group(1), read_json(self)))
            except FileNotFoundError:
                return send_json(self, 404, {"error": "not found"})
            except ValueError as exc:
                return send_json(self, 400, {"error": {"message": str(exc), "type": "bad_request"}})
            except Exception as exc:
                return send_json(self, 500, {"error": {"message": str(exc), "type": "server_error"}})

        if path in {"/v1/research/plan", "/research/plan"}:
            try:
                payload = read_json(self)
                question = clean_text(payload.get("question") or last_user_message(payload))
                if not question:
                    return send_json(self, 400, {"error": {"message": "No question found"}})
                # For plan previews the payload itself is the last-resort overrides.
                overrides = self._research_overrides(payload, payload)
                overrides = merge_message_documents(payload, overrides)
                return send_json(self, 200, {"plan": build_plan(question, overrides)})
            except Exception as exc:
                return send_json(self, 500, {"error": {"message": str(exc), "type": "server_error"}})

        if path not in {"/v1/chat/completions", "/chat/completions"}:
            return send_json(self, 404, {"error": "not found"})

        # Set once the SSE headers are out; after that no second HTTP
        # response may be written on this connection.
        streaming = False
        try:
            payload = read_json(self)
            question = last_user_message(payload).strip()
            if not question:
                return send_json(self, 400, {"error": {"message": "No user message found"}})

            overrides = self._research_overrides(payload, {})
            overrides = merge_message_documents(payload, overrides)

            if payload.get("stream"):
                self.send_response(200)
                self.send_header("Content-Type", "text/event-stream; charset=utf-8")
                self.send_header("Cache-Control", "no-cache")
                self.send_header("Connection", "keep-alive")
                self.send_header("Access-Control-Allow-Origin", "*")
                self.end_headers()
                streaming = True

                def emit(text: str):
                    event = "data: " + json.dumps(chunk_payload(text), ensure_ascii=False) + "\n\n"
                    self.wfile.write(event.encode("utf-8"))
                    self.wfile.flush()

                try:
                    answer = build_research(question, progress=emit, overrides=overrides)
                except BrokenPipeError:
                    raise
                except Exception as exc:
                    # Report the failure inside the stream so the client
                    # still receives a well-formed, terminated event stream.
                    answer = f"Deep research failed: {exc}"
                emit(answer)
                done = "data: " + json.dumps(chunk_payload("", "stop")) + "\n\n" + "data: [DONE]\n\n"
                self.wfile.write(done.encode("utf-8"))
                self.wfile.flush()
                return

            answer = build_research(question, overrides=overrides)
            return send_json(
                self,
                200,
                {
                    "id": f"chatcmpl-{uuid.uuid4().hex[:12]}",
                    "object": "chat.completion",
                    "created": now(),
                    "model": MODEL_ID,
                    "choices": [{"index": 0, "message": {"role": "assistant", "content": answer}, "finish_reason": "stop"}],
                    "usage": {"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0},
                },
            )
        except BrokenPipeError:
            # Client went away; nothing left to answer.
            return
        except Exception as exc:
            if streaming:
                # The 200 response has already started; a JSON error
                # response here would corrupt the stream.
                return
            return send_json(self, 500, {"error": {"message": str(exc), "type": "server_error"}})
|
|
1902
|
+
|
|
1903
|
+
|
|
1904
|
+
def main():
    """Start the threaded HTTP server and serve requests until interrupted.

    The bind address comes from ``DEEP_RESEARCH_HOST`` (default
    ``127.0.0.1``) and ``DEEP_RESEARCH_PORT`` (default ``18041``).
    """
    env = os.environ
    host = env.get("DEEP_RESEARCH_HOST", "127.0.0.1")
    port = int(env.get("DEEP_RESEARCH_PORT", "18041"))
    address = (host, port)
    server = ThreadingHTTPServer(address, Handler)
    print(f"deep research sidecar listening on http://{host}:{port}", flush=True)
    server.serve_forever()


if __name__ == "__main__":
    main()
|