@spinabot/brigade 1.9.0 → 1.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +12 -10
- package/dist/agents/agent-loop.d.ts +55 -0
- package/dist/agents/agent-loop.d.ts.map +1 -1
- package/dist/agents/agent-loop.js +90 -1
- package/dist/agents/agent-loop.js.map +1 -1
- package/dist/agents/channels/inbound-pipeline.d.ts +22 -0
- package/dist/agents/channels/inbound-pipeline.d.ts.map +1 -1
- package/dist/agents/channels/inbound-pipeline.js +31 -1
- package/dist/agents/channels/inbound-pipeline.js.map +1 -1
- package/dist/agents/channels/media-capture.d.ts +69 -6
- package/dist/agents/channels/media-capture.d.ts.map +1 -1
- package/dist/agents/channels/media-capture.js +125 -8
- package/dist/agents/channels/media-capture.js.map +1 -1
- package/dist/agents/channels/telegram/media.d.ts.map +1 -1
- package/dist/agents/channels/telegram/media.js +16 -4
- package/dist/agents/channels/telegram/media.js.map +1 -1
- package/dist/agents/channels/whatsapp/media.d.ts +19 -0
- package/dist/agents/channels/whatsapp/media.d.ts.map +1 -1
- package/dist/agents/channels/whatsapp/media.js +37 -2
- package/dist/agents/channels/whatsapp/media.js.map +1 -1
- package/dist/agents/media-understanding/anthropic-adapter.d.ts +49 -0
- package/dist/agents/media-understanding/anthropic-adapter.d.ts.map +1 -0
- package/dist/agents/media-understanding/anthropic-adapter.js +162 -0
- package/dist/agents/media-understanding/anthropic-adapter.js.map +1 -0
- package/dist/agents/media-understanding/config.d.ts +57 -0
- package/dist/agents/media-understanding/config.d.ts.map +1 -0
- package/dist/agents/media-understanding/config.js +289 -0
- package/dist/agents/media-understanding/config.js.map +1 -0
- package/dist/agents/media-understanding/gemini-adapter.d.ts +57 -0
- package/dist/agents/media-understanding/gemini-adapter.d.ts.map +1 -0
- package/dist/agents/media-understanding/gemini-adapter.js +343 -0
- package/dist/agents/media-understanding/gemini-adapter.js.map +1 -0
- package/dist/agents/media-understanding/index.d.ts +58 -0
- package/dist/agents/media-understanding/index.d.ts.map +1 -0
- package/dist/agents/media-understanding/index.js +275 -0
- package/dist/agents/media-understanding/index.js.map +1 -0
- package/dist/agents/media-understanding/pi-adapter.d.ts +72 -0
- package/dist/agents/media-understanding/pi-adapter.d.ts.map +1 -0
- package/dist/agents/media-understanding/pi-adapter.js +160 -0
- package/dist/agents/media-understanding/pi-adapter.js.map +1 -0
- package/dist/agents/media-understanding/types.d.ts +189 -0
- package/dist/agents/media-understanding/types.d.ts.map +1 -0
- package/dist/agents/media-understanding/types.js +51 -0
- package/dist/agents/media-understanding/types.js.map +1 -0
- package/dist/agents/session-wiring.d.ts +11 -0
- package/dist/agents/session-wiring.d.ts.map +1 -1
- package/dist/agents/session-wiring.js +1 -0
- package/dist/agents/session-wiring.js.map +1 -1
- package/dist/agents/tools/analyze-media-tool.d.ts +263 -0
- package/dist/agents/tools/analyze-media-tool.d.ts.map +1 -0
- package/dist/agents/tools/analyze-media-tool.js +2321 -0
- package/dist/agents/tools/analyze-media-tool.js.map +1 -0
- package/dist/agents/tools/doc-shared.d.ts +187 -0
- package/dist/agents/tools/doc-shared.d.ts.map +1 -0
- package/dist/agents/tools/doc-shared.js +484 -0
- package/dist/agents/tools/doc-shared.js.map +1 -0
- package/dist/agents/tools/edit-document-tool.d.ts +133 -0
- package/dist/agents/tools/edit-document-tool.d.ts.map +1 -0
- package/dist/agents/tools/edit-document-tool.js +815 -0
- package/dist/agents/tools/edit-document-tool.js.map +1 -0
- package/dist/agents/tools/image-downscale.d.ts +93 -0
- package/dist/agents/tools/image-downscale.d.ts.map +1 -0
- package/dist/agents/tools/image-downscale.js +257 -0
- package/dist/agents/tools/image-downscale.js.map +1 -0
- package/dist/agents/tools/make-document-tool.d.ts +114 -0
- package/dist/agents/tools/make-document-tool.d.ts.map +1 -0
- package/dist/agents/tools/make-document-tool.js +542 -0
- package/dist/agents/tools/make-document-tool.js.map +1 -0
- package/dist/agents/tools/media-cache.d.ts +56 -0
- package/dist/agents/tools/media-cache.d.ts.map +1 -0
- package/dist/agents/tools/media-cache.js +133 -0
- package/dist/agents/tools/media-cache.js.map +1 -0
- package/dist/agents/tools/ooxml-images.d.ts +107 -0
- package/dist/agents/tools/ooxml-images.d.ts.map +1 -0
- package/dist/agents/tools/ooxml-images.js +308 -0
- package/dist/agents/tools/ooxml-images.js.map +1 -0
- package/dist/agents/tools/registry.d.ts +12 -0
- package/dist/agents/tools/registry.d.ts.map +1 -1
- package/dist/agents/tools/registry.js +47 -0
- package/dist/agents/tools/registry.js.map +1 -1
- package/dist/buildstamp.json +1 -1
- package/dist/cli/commands/doctor.d.ts.map +1 -1
- package/dist/cli/commands/doctor.js +41 -0
- package/dist/cli/commands/doctor.js.map +1 -1
- package/dist/core/console-stream.d.ts.map +1 -1
- package/dist/core/console-stream.js +7 -5
- package/dist/core/console-stream.js.map +1 -1
- package/dist/core/server.js +6 -1
- package/dist/core/server.js.map +1 -1
- package/dist/system-prompt/assembler.d.ts.map +1 -1
- package/dist/system-prompt/assembler.js +25 -1
- package/dist/system-prompt/assembler.js.map +1 -1
- package/dist/system-prompt/guidance.d.ts +30 -0
- package/dist/system-prompt/guidance.d.ts.map +1 -1
- package/dist/system-prompt/guidance.js +50 -0
- package/dist/system-prompt/guidance.js.map +1 -1
- package/package.json +9 -1
|
@@ -0,0 +1,2321 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* `analyze_media` tool — comprehensive media + document understanding.
|
|
3
|
+
*
|
|
4
|
+
* The model hands this tool a local file PATH or a URL (+ an optional
|
|
5
|
+
* `question`) and the tool RESOLVES the input into content the CURRENT turn's
|
|
6
|
+
* model can reason about against that question. It auto-detects the kind by
|
|
7
|
+
* extension / MIME and dispatches per-format.
|
|
8
|
+
*
|
|
9
|
+
* ─────────────────────────────────────────────────────────────────────────
|
|
10
|
+
* WHY THIS DESIGN (STEP-0 investigation findings — read before changing)
|
|
11
|
+
* ─────────────────────────────────────────────────────────────────────────
|
|
12
|
+
* 1. TOOL-RESULT CONTENT SHAPE. Pi types a tool's `AgentToolResult.content`
|
|
13
|
+
* as `(TextContent | ImageContent)[]` — TEXT or IMAGE only. There is NO
|
|
14
|
+
* `document` / `pdf` / `video` content-block type anywhere in the Pi SDK,
|
|
15
|
+
* and `Model.input` is `("text" | "image")[]` — the whole SDK content model
|
|
16
|
+
* is text + image. `ImageContent` is `{ type:"image"; data:<base64>;
|
|
17
|
+
* mimeType }`. So an IMAGE can flow to the model as a real multimodal block
|
|
18
|
+
* (the same shape `payload-mutators.ts` prunes from history, proving image
|
|
19
|
+
* blocks reach the provider); a PDF/DOCX/PPTX/XLSX/HTML/VIDEO can NOT be
|
|
20
|
+
* returned as a native non-text block. They must become TEXT.
|
|
21
|
+
*
|
|
22
|
+
* 2. DIRECT-PROVIDER UNDERSTANDING (the gap-closer). For modalities Pi can't
|
|
23
|
+
* carry — VIDEO, native/scanned PDF, and images on a text-only current
|
|
24
|
+
* model — the tool calls a provider REST API DIRECTLY via the
|
|
25
|
+
* media-understanding subsystem (`agents/media-understanding/`): it ships
|
|
26
|
+
* the media bytes + the question to Gemini (video → Files API; image/pdf →
|
|
27
|
+
* inline) or Anthropic (pdf → native `document` block with OCR; image →
|
|
28
|
+
* image block) and gets back TEXT, which it returns for the current model.
|
|
29
|
+
* Keys are resolved through Brigade's existing credential store
|
|
30
|
+
* (`readBrigadeCredentials`), never invented here. This bypasses Pi's
|
|
31
|
+
* text+image content cap WITHOUT needing a Pi aux-model runtime.
|
|
32
|
+
*
|
|
33
|
+
* 3. REUSE. HTML → markdown reuses the existing readability/linkedom extractor
|
|
34
|
+
* (`web-fetch-utils.ts`); URL fetches route through the SSRF guard
|
|
35
|
+
* (`guardedFetch`, `infra/net/fetch-guard.ts`) with size + content-type
|
|
36
|
+
* caps; local paths reuse the outbound media-path guard
|
|
37
|
+
* (`security/media-path-guard.ts`) PLUS a workspace/cwd/cache root scoping
|
|
38
|
+
* so secrets/system files outside allowed roots are refused (the same
|
|
39
|
+
* posture the `read`/path-write guards enforce). Untrusted bytes are
|
|
40
|
+
* wrapped in the external-content envelope (`security/external-content.ts`).
|
|
41
|
+
*
|
|
42
|
+
* ─────────────────────────────────────────────────────────────────────────
|
|
43
|
+
* PER-FORMAT BEHAVIOUR
|
|
44
|
+
* ─────────────────────────────────────────────────────────────────────────
|
|
45
|
+
* • image (png/jpg/jpeg/webp/gif/bmp/heic/heif): when the CURRENT model is
|
|
46
|
+
* vision-capable, returned as an IMAGE block so the model sees it (cheap —
|
|
47
|
+
* no extra call). When the current model is text-only, the tool routes the
|
|
48
|
+
* image to a vision-capable provider and returns the resulting TEXT — via
|
|
49
|
+
* the Pi SDK against ANY keyed provider with an image-capable model
|
|
50
|
+
* (OpenAI / OpenRouter / Groq / xAI / Mistral / Ollama / …), or the bespoke
|
|
51
|
+
* google/anthropic REST adapters — so vision works on any model + any
|
|
52
|
+
* configured provider. HEIC/HEIF cannot be transcoded without a native dep,
|
|
53
|
+
* so they are passed through with their declared mime — most providers
|
|
54
|
+
* reject HEIC, so the tool warns. Capped by `maxBytes`.
|
|
55
|
+
* • audio (mp3/wav/m4a/ogg/oga/flac/aac/opus): routed to the media-
|
|
56
|
+
* understanding subsystem (Gemini inline — audio is GEMINI-ONLY because Pi's
|
|
57
|
+
* content model is text + image, so no Pi-drivable provider can ingest an
|
|
58
|
+
* audio block) and the TEXT transcription / summary is returned, so voice
|
|
59
|
+
* notes work. Needs a Google/Gemini key; with none the tool returns a clear
|
|
60
|
+
* "configure a Gemini key" message (NOT a provider 400).
|
|
61
|
+
* • pdf: when an understanding provider key is configured, the PDF is sent
|
|
62
|
+
* NATIVELY (Anthropic `document` block — OCRs scanned pages + reads layout;
|
|
63
|
+
* or Gemini inline) and the provider's TEXT answer is returned, so scanned
|
|
64
|
+
* / no-text-layer PDFs now work. With no key (or `mode:"text"`) it falls
|
|
65
|
+
* back to per-page text extraction (`unpdf`, zero native deps) honoring a
|
|
66
|
+
* `pages` range. `mode:"provider"` forces the provider path.
|
|
67
|
+
* • docx: unzip (`fflate`) → concatenate `word/document.xml` text runs.
|
|
68
|
+
* • pptx: unzip → per-slide text (`ppt/slides/slideN.xml`), slide-numbered,
|
|
69
|
+
* honoring `pages` as a slide range.
|
|
70
|
+
* • xlsx: unzip → `xl/sharedStrings.xml` + each `xl/worksheets/sheetN.xml`
|
|
71
|
+
* → CSV-ish per-sheet text.
|
|
72
|
+
* • html (or a URL returning HTML): readability/linkedom → markdown.
|
|
73
|
+
* • video (mp4/webm/mov/…): always routed to the media-understanding
|
|
74
|
+
* subsystem (Gemini via the Files API: upload → poll ACTIVE →
|
|
75
|
+
* generateContent with a fileData part), and the model's TEXT description
|
|
76
|
+
* is returned. Needs a Google/Gemini key; with none the tool returns a
|
|
77
|
+
* clear "configure a Gemini key" message.
|
|
78
|
+
*
|
|
79
|
+
* The user's `question` is ALWAYS echoed back as a leading text block so the
|
|
80
|
+
* model knows what to do with the resolved content.
|
|
81
|
+
*
|
|
82
|
+
* SECURITY POSTURE: read capability — NOT owner-only — but it MUST honour the
|
|
83
|
+
* path guard (local) + SSRF guard (URL). Registered for every sender; no
|
|
84
|
+
* mutation, no spend.
|
|
85
|
+
*/
|
|
86
|
+
import fs from "node:fs";
|
|
87
|
+
import fsp from "node:fs/promises";
|
|
88
|
+
import os from "node:os";
|
|
89
|
+
import path from "node:path";
|
|
90
|
+
import { Type } from "typebox";
|
|
91
|
+
import { guardedFetch, SsrfBlockedError } from "../../infra/net/fetch-guard.js";
|
|
92
|
+
import { validateOutboundMediaPath } from "../../security/media-path-guard.js";
|
|
93
|
+
import { wrapWebContent } from "../../security/external-content.js";
|
|
94
|
+
import { downscaleImageToBudget, isDownscalableImageMime, } from "./image-downscale.js";
|
|
95
|
+
import { extractOoxmlImages, resolveSlideOrder, } from "./ooxml-images.js";
|
|
96
|
+
import { mediaCacheKey, readMediaCache, writeMediaCache, } from "./media-cache.js";
|
|
97
|
+
import { resolveCacheDir, resolveOsCacheDir, resolveStateDir, DEFAULT_AGENT_ID, } from "../../config/paths.js";
|
|
98
|
+
import { runMediaUnderstanding as defaultRunMediaUnderstanding, resolvePiModel, MediaUnderstandingUnavailableError, } from "../media-understanding/index.js";
|
|
99
|
+
import { buildMediaUnderstandingConfig } from "../media-understanding/config.js";
|
|
100
|
+
import { composeFetchBody, extractBasicHtmlContent, extractReadableContent, } from "./web-fetch-utils.js";
|
|
101
|
+
import { truncateText } from "./web-shared.js";
|
|
102
|
+
import { BrigadeToolInputError, jsonResult } from "./common.js";
|
|
103
|
+
/* ─────────────────────────── tunables ─────────────────────────── */
|
|
104
|
+
/** Byte cap applied by default to ANY source read (image bytes, document bytes, fetched body). */
const DEFAULT_MAX_BYTES = 12 * 2 ** 20; // 12 MiB
/** Hard upper bound: an explicit `maxBytes` is clamped to this value. */
const MAX_BYTES_CEILING = 48 * 2 ** 20; // 48 MiB
/** Tighter default for image blocks, which are the most token-expensive. */
const DEFAULT_IMAGE_MAX_BYTES = 8 * 2 ** 20; // 8 MiB
/** Character budget for extracted text handed back to the model (keeps the turn bounded). */
const DEFAULT_MAX_CHARS = 60_000;
/** HTTP timeout, per request, for URL sources. */
const FETCH_TIMEOUT_MS = 45_000;
/** Upper bound on images in one batch (`sources[]`). Matches the field cap. */
const MAX_BATCH_IMAGES = 20;
/** Upper bound on non-image (document/text) sources in one batch. */
const MAX_BATCH_DOCS = 10;
/**
 * Upper bound on EMBEDDED images surfaced from one OOXML document when
 * `includeImages` is set. Decks can carry many pictures (the failure case had
 * 35), so the count is capped — reusing the batch cap — to keep the turn
 * bounded; truncation is reported as "showing N of M". Per-image and total
 * byte budgets go through the existing image downscale path.
 */
const MAX_EMBEDDED_IMAGES = MAX_BATCH_IMAGES;
/**
 * Combined byte budget for ALL embedded image blocks of a single document, so
 * many large pictures cannot blow the turn even when under the count cap.
 * Images are first downscaled to the per-image budget; once the running total
 * would pass this ceiling the remaining images are dropped (and reported).
 */
const EMBEDDED_IMAGES_TOTAL_BYTES = 24 * 2 ** 20; // 24 MiB
|
|
133
|
+
/**
 * Image MIME types the understanding providers reliably accept on an image
 * block. Anthropic's Messages API takes ONLY jpeg / png / gif / webp and
 * answers 400 for anything else (e.g. image/bmp, image/tiff); Gemini and the
 * Pi-driven providers are similarly conservative. Any other (decodable) raster
 * format is therefore re-encoded to JPEG via the downscale path before being
 * routed to a provider. `image/heic` / `image/heif` are deliberately absent:
 * they cannot be decoded without a native dep, so they pass through with their
 * declared mime plus a warning.
 */
const PROVIDER_SAFE_IMAGE_MIME = new Set(["jpeg", "png", "gif", "webp"].map((subtype) => `image/${subtype}`));
|
|
149
|
+
/**
 * Extension → kind lookup. Keys are lowercase with no leading dot. The table is
 * declared as ordered `[kind, extensions]` groups and flattened into a plain
 * object, so key order matches the order written here.
 */
const EXT_KIND = Object.fromEntries([
    // images
    ["image", ["png", "jpg", "jpeg", "webp", "gif", "bmp", "heic", "heif"]],
    // documents
    ["pdf", ["pdf"]],
    ["docx", ["docx"]],
    ["pptx", ["pptx"]],
    ["xlsx", ["xlsx"]],
    // OpenDocument + e-book + rich-text + notebook (broader than either rival)
    ["odt", ["odt"]],
    ["ods", ["ods"]],
    ["odp", ["odp"]],
    ["epub", ["epub"]],
    ["rtf", ["rtf"]],
    ["ipynb", ["ipynb"]],
    // markup
    ["html", ["html", "htm"]],
    // video — `.webm` can hold audio OR video; by extension it is treated as video.
    ["video", ["mp4", "webm", "mov", "m4v", "mkv", "avi", "mpeg", "mpg"]],
    // audio (voice notes + clips). `.ogg` / `.oga` are treated as audio here. When
    // the extension guess is wrong the model can pass an explicit `kind`
    // (e.g. kind:"audio" for an audio-only `.webm`).
    ["audio", ["mp3", "wav", "m4a", "oga", "ogg", "flac", "aac", "opus"]],
    // plain / structured text. Read as UTF-8, wrapped in the untrusted-content
    // envelope, returned as text. (Both rival tools accept these; Brigade used
    // to reject them outright.)
    ["text", [
            "txt", "text", "log", "csv", "tsv",
            "json", "jsonl", "ndjson", "json5", "xml",
            "yaml", "yml", "toml", "ini", "cfg", "conf", "env", "properties",
            "md", "markdown", "mdx", "rst", "tex", "srt", "vtt",
        ]],
    // source code (also read as text)
    ["text", [
            "js", "mjs", "cjs", "jsx", "ts", "tsx", "mts", "cts",
            "py", "rb", "go", "rs", "java", "kt", "kts",
            "c", "h", "cc", "cpp", "cxx", "hpp", "cs",
            "php", "swift", "scala",
            "sh", "bash", "zsh", "fish", "ps1", "bat",
            "sql", "r", "lua", "pl", "dart", "ex", "exs", "clj", "hs",
            "css", "scss", "sass", "less", "svg",
        ]],
].flatMap(([kind, extensions]) => extensions.map((extension) => [extension, kind])));
|
|
270
|
+
/**
 * Map a MIME type (parameters such as `; charset=…` are ignored, matching is
 * case-insensitive) to a media kind.
 *
 * Exact MIME types are resolved BEFORE the generic prefix / suffix families.
 * Otherwise a specific type is swallowed by a broader rule:
 *   - `image/svg+xml` would hit the `image/` prefix (SVG is markup → "text"),
 *   - `text/rtf` would hit the `text/` prefix (→ should be "rtf"),
 *   - `application/x-ipynb+json` would hit the `+json` suffix (→ should be "ipynb").
 *
 * @param {string | undefined | null} mime Raw content-type value.
 * @returns {string | undefined} The kind, or undefined when nothing matches.
 */
function kindFromMime(mime) {
    if (!mime)
        return undefined;
    const m = mime.split(";")[0]?.trim().toLowerCase() ?? "";
    // 1) Exact types — most specific, so they must win over the families below.
    switch (m) {
        case "application/pdf":
            return "pdf";
        case "text/html":
        case "application/xhtml+xml":
            return "html";
        case "image/svg+xml":
            return "text";
        case "application/rtf":
        case "text/rtf":
            return "rtf";
        case "application/x-ipynb+json":
            return "ipynb";
        case "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
            return "docx";
        case "application/vnd.openxmlformats-officedocument.presentationml.presentation":
            return "pptx";
        case "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet":
            return "xlsx";
        case "application/vnd.oasis.opendocument.text":
            return "odt";
        case "application/vnd.oasis.opendocument.spreadsheet":
            return "ods";
        case "application/vnd.oasis.opendocument.presentation":
            return "odp";
        case "application/epub+zip":
            return "epub";
        case "application/json":
        case "application/ld+json":
        case "application/xml":
        case "application/x-ndjson":
        case "application/x-yaml":
        case "application/yaml":
        case "application/toml":
        case "application/x-sh":
            return "text";
        default:
            break;
    }
    // 2) Top-level media families.
    if (m.startsWith("image/"))
        return "image";
    if (m.startsWith("video/"))
        return "video";
    if (m.startsWith("audio/"))
        return "audio";
    // 3) Remaining structured-text types: any `text/*`, or a `+json` / `+xml`
    //    structured-syntax suffix.
    if (m.startsWith("text/") || /\+json$/.test(m) || /\+xml$/.test(m)) {
        return "text";
    }
    return undefined;
}
|
|
321
|
+
/**
 * Extract the lowercase file extension (without the dot) from a local path or
 * from the pathname of an http(s) URL. Query strings and fragments of a URL are
 * not considered because only `URL#pathname` is inspected. Returns "" when
 * there is no extension.
 */
export function extensionOf(source) {
    let candidate = source;
    try {
        const isHttpUrl = /^https?:\/\//i.test(source);
        if (isHttpUrl) {
            candidate = new URL(source).pathname;
        }
    }
    catch {
        /* unparsable URL — fall back to treating the raw string as a path */
    }
    const dotted = path.extname(candidate).toLowerCase();
    return dotted.replace(/^\./, "");
}
|
|
334
|
+
/**
 * Image MIME type for a (lowercase, dot-less) extension. Yields a bare MIME
 * string — no `data:` prefix — because Pi's ImageContent wants raw base64 plus
 * a separate mimeType. Anything not listed falls back to "image/png".
 */
function imageMimeFromExt(ext) {
    if (ext === "jpg" || ext === "jpeg") {
        return "image/jpeg";
    }
    // These extensions are spelled exactly like their MIME subtype.
    const sameAsSubtype = new Set(["webp", "gif", "bmp", "heic", "heif"]);
    return sameAsSubtype.has(ext) ? `image/${ext}` : "image/png";
}
|
|
354
|
+
/**
 * Video MIME type for a (lowercase, dot-less) extension; used when a local
 * video carries no declared MIME. Unlisted extensions fall back to "video/mp4".
 */
function videoMimeFromExt(ext) {
    const mimeByExt = new Map([
        ["webm", "video/webm"],
        ["mov", "video/quicktime"],
        ["m4v", "video/x-m4v"],
        ["mkv", "video/x-matroska"],
        ["avi", "video/x-msvideo"],
        ["mpeg", "video/mpeg"],
        ["mpg", "video/mpeg"],
    ]);
    return mimeByExt.get(ext) ?? "video/mp4";
}
|
|
374
|
+
/**
 * Audio MIME type for a (lowercase, dot-less) extension; used when a local
 * audio file carries no declared MIME. Unlisted extensions (including "mp3")
 * fall back to "audio/mpeg".
 */
function audioMimeFromExt(ext) {
    const known = new Map([
        ["wav", "audio/wav"],
        ["m4a", "audio/mp4"],
        ["aac", "audio/aac"],
        ["flac", "audio/flac"],
        ["oga", "audio/ogg"],
        ["ogg", "audio/ogg"],
        ["opus", "audio/opus"],
    ]);
    if (known.has(ext)) {
        return known.get(ext);
    }
    return "audio/mpeg";
}
|
|
394
|
+
/**
 * Resolve the media kind for a source.
 *
 * Precedence: a recognised explicit `override` (case-insensitive) wins; then
 * the extension of `source` via `EXT_KIND`; then the MIME type (URL responses)
 * via `kindFromMime`. An unrecognised override is ignored and detection
 * continues.
 *
 * The extension lookup only honours OWN keys of `EXT_KIND`. A bare
 * `EXT_KIND[ext]` also resolves inherited `Object.prototype` members, so a file
 * such as `notes.constructor` or `x.__proto__` would yield a function / object
 * instead of a kind string.
 *
 * @param {{ source: string, mime?: string, override?: string }} args
 * @returns {string | undefined} The kind, or undefined when nothing matches (unsupported).
 */
export function detectKind(args) {
    if (args.override) {
        const k = args.override.toLowerCase();
        const overridable = [
            "image",
            "pdf",
            "docx",
            "pptx",
            "xlsx",
            "html",
            "video",
            "audio",
            "text",
            "odt",
            "ods",
            "odp",
            "epub",
            "rtf",
            "ipynb",
        ];
        if (overridable.includes(k)) {
            return k;
        }
    }
    const ext = extensionOf(args.source);
    if (ext && Object.prototype.hasOwnProperty.call(EXT_KIND, ext)) {
        return EXT_KIND[ext];
    }
    return kindFromMime(args.mime);
}
|
|
424
|
+
/* ─────────────────────────── params ─────────────────────────── */
|
|
425
|
+
// Parameter schema for the `analyze_media` tool. Every field is optional at
// the schema level. The description strings are model-facing text, so their
// wording is part of the tool's behaviour. Enumerated fields are built by
// mapping plain value lists to `Type.Literal`.
const AnalyzeMediaParams = Type.Object({
    // ── what to analyze ──
    source: Type.Optional(Type.String({
        description: "Local file PATH or http(s) URL to analyze. Images, PDF, DOCX, PPTX, XLSX, HTML, plain/structured text, audio (voice notes), and video are auto-detected by extension/MIME. For a single file. Use `sources` to analyze several at once.",
    })),
    sources: Type.Optional(Type.Array(Type.String(), {
        description: "Several local PATHs / http(s) URLs to analyze together in ONE call (e.g. compare photos, or read many files). Images are shown as multiple image blocks; documents/text are concatenated under per-file labels. Caps: 20 images / 10 documents per call. When set, takes precedence over `source`.",
    })),
    // ── what to ask about it ──
    question: Type.Optional(Type.String({
        description: "What to analyze / extract / answer about the media. Optional but strongly encouraged — it is echoed to the model alongside the resolved content.",
    })),
    prompt: Type.Optional(Type.String({
        description: "Alias for `question`. Use one or the other.",
    })),
    // ── extraction scoping ──
    pages: Type.Optional(Type.String({
        description: 'Page (PDF) or slide (PPTX) range to limit extraction, e.g. "1-5", "3", or "2-". 1-indexed. Ignored for other kinds.',
    })),
    includeImages: Type.Optional(Type.Boolean({
        description: "For an Office document (PPTX/DOCX/XLSX): whether to ALSO extract the embedded images (wireframes / screenshots / diagrams / charts inside the file) and show them to the model alongside the text — so you SEE the visuals, not just read titles. DEFAULT TRUE: analyzing an Office doc on a vision model returns its embedded images automatically, no flag needed. For a PPTX, `pages` scopes which slides' images come back (e.g. pages:\"8-13\"). Images are labeled by slide where known and capped (≈20); use `pages` to scope a big deck. Set `false` to skip images (text only, cheaper). NEVER unzip the file with bash/python to get its images — this tool already does it.",
    })),
    language: Type.Optional(Type.String({
        description: 'Optional spoken-language hint for AUDIO transcription (e.g. "es", "Spanish", "en-US"). Improves accuracy for non-English voice notes; ignored for non-audio kinds.',
    })),
    // ── understanding-provider routing ──
    provider: Type.Optional(Type.Union(["google", "anthropic"].map((value) => Type.Literal(value)), {
        description: "Optional provider override for understanding video / native-PDF / text-only-model images (else auto-selected from configured keys). google = Gemini.",
    })),
    model: Type.Optional(Type.String({
        description: "Optional provider model id override for the understanding call (e.g. gemini-2.5-pro, claude-sonnet-4-5). Ignored for the local text-extraction path.",
    })),
    mode: Type.Optional(Type.Union(["auto", "provider", "text"].map((value) => Type.Literal(value)), {
        description: 'PDF handling: "auto" (default — provider when a key is configured, else local text extraction), "provider" (force the native provider path), or "text" (force local unpdf text extraction).',
    })),
    // ── budgets ──
    maxBytes: Type.Optional(Type.Integer({
        description: `Optional cap on bytes read from the source (default ${DEFAULT_MAX_BYTES}, ceiling ${MAX_BYTES_CEILING}).`,
        minimum: 1024,
    })),
    maxTokens: Type.Optional(Type.Integer({
        description: "Optional cap on the provider answer length (output tokens) for the understanding call (image-via-provider / PDF / audio / video). Default ~4096; clamped to a sane window. Ignored for the local text-extraction path.",
        minimum: 64,
    })),
    // ── kind override ──
    kind: Type.Optional(Type.Union([
        "image",
        "pdf",
        "docx",
        "pptx",
        "xlsx",
        "html",
        "video",
        "audio",
        "text",
        // OpenDocument + e-book + rich-text + notebook — `detectKind` already
        // routes these, so the override must accept them too (lets the model
        // force e.g. kind:"epub" to rescue a mis-detected / extension-less file).
        "odt",
        "ods",
        "odp",
        "epub",
        "rtf",
        "ipynb",
    ].map((value) => Type.Literal(value)), {
        description: "Optional override of the auto-detected kind (use when the extension/MIME is wrong or missing). Use \"audio\" for a voice note whose extension is ambiguous (e.g. .ogg/.webm); \"text\" to force plain/structured-text reading; odt/ods/odp/epub/rtf/ipynb to force an OpenDocument / e-book / rich-text / notebook read.",
    })),
});
|
|
487
|
+
/**
 * Best-effort answer to "can the current model consume an IMAGE block?".
 *
 * An explicit boolean `ctx.imageInput` is trusted as-is. Otherwise the model id
 * is matched (case-insensitively) against a small, self-contained heuristic so
 * no heavy model resolution happens on the hot path:
 *   - known text-only / no-vision markers → false
 *   - major multimodal families           → true
 *   - anything else, a missing id, or a missing ctx → undefined
 *     (the caller decides; "assume yes, note it")
 *
 * @param {{ imageInput?: boolean, modelId?: string } | undefined} ctx
 * @returns {boolean | undefined}
 */
export function modelLikelySeesImages(ctx) {
    if (!ctx) {
        return undefined;
    }
    if (typeof ctx.imageInput === "boolean") {
        return ctx.imageInput;
    }
    const normalizedId = (ctx.modelId ?? "").toLowerCase();
    if (normalizedId === "") {
        return undefined;
    }
    // Explicit text-only markers are checked first so they win over the broad
    // family match below.
    const textOnlyMarkers = [
        /\b(text-only|no-?vision)\b/,
        /(^|[/-])(o1-mini|o3-mini)([-/]|$)/,
        /(^|[/-])gpt-3\.5/,
    ];
    if (textOnlyMarkers.some((marker) => marker.test(normalizedId))) {
        return false;
    }
    // Vision-capable families; no match means "unknown", not "no".
    const multimodalFamily = /(claude|gpt-4|gpt-5|gemini|llava|pixtral|qwen.*vl|grok-(?:2|3|4)|gpt-4o)/;
    return multimodalFamily.test(normalizedId) ? true : undefined;
}
|
|
517
|
+
/**
 * Collect the directories a local source path is allowed to live under.
 *
 * Roots are resolved to absolute paths and de-duplicated; they are returned in
 * insertion order: workspace, cwd, cache dir, temp dirs, state-dir subtrees,
 * OS cache dir (+ channel subtrees), the macOS Messages attachments dir and,
 * only when `opts.ownerLocalAccess` is set, the operator's home directory.
 *
 * @param {{ workspaceDir?: string, cwd?: string, ownerLocalAccess?: boolean }} opts
 * @returns {string[]} absolute root directories
 */
function allowedLocalRoots(opts) {
    const collected = new Set();
    // Resolve one candidate and remember it; blank or unresolvable values are skipped.
    function remember(candidate) {
        if (!candidate) {
            return;
        }
        try {
            collected.add(path.resolve(candidate));
        }
        catch {
            /* ignore */
        }
    }
    remember(opts.workspaceDir);
    remember(opts.cwd);
    remember(resolveCacheDir());
    remember(process.env.TMPDIR || process.env.TEMP || process.env.TMP || "");
    try {
        remember(os.tmpdir());
    }
    catch {
        /* ignore */
    }
    // In FILESYSTEM mode inbound attachments and generated media land in the
    // state dir's media/cache subtrees; admit them so a file that was just
    // received can be analyzed.
    try {
        for (const subtree of ["channels", "cache", "captures", "workspace"]) {
            remember(path.join(resolveStateDir(), subtree));
        }
    }
    catch {
        /* ignore */
    }
    // In CONVEX mode inbound channel media is written under the OS cache dir
    // (`resolveOsCacheDir()/channels/<id>/...`), and BlueBubbles writes to
    // `resolveOsCacheDir()/bluebubbles/<acct>/inbound-media` in both modes.
    // Listing the OS cache root and both subtrees explicitly also covers the
    // window before the mode peek has settled. `validateOutboundMediaPath`
    // still refuses secrets / system files / credential dirs independently.
    try {
        const osCacheRoot = resolveOsCacheDir();
        remember(osCacheRoot);
        for (const subtree of ["channels", "bluebubbles"]) {
            remember(path.join(osCacheRoot, subtree));
        }
    }
    catch {
        /* ignore */
    }
    // The native iMessage adapter surfaces the bridge's on-disk attachment
    // path as-is (no copy into a cache dir), so inbound iMessage documents
    // live under ~/Library/Messages/Attachments. Only meaningful on macOS;
    // elsewhere the path simply never exists.
    try {
        remember(path.join(os.homedir(), "Library", "Messages", "Attachments"));
    }
    catch {
        /* ignore */
    }
    // OWNER local turns only: the operator routinely points at files in
    // Downloads / Desktop / Documents, so the whole home dir is admitted. The
    // `ownerLocalAccess` gate keeps this off for untrusted remote senders, and
    // `validateOutboundMediaPath` still denies ~/.ssh, ~/.aws, .env,
    // brigade.json and similar even for the owner.
    if (opts.ownerLocalAccess) {
        try {
            remember(os.homedir());
        }
        catch {
            /* ignore */
        }
    }
    return Array.from(collected).filter((root) => root.length > 0);
}
|
|
613
|
+
/**
 * True when `resolved` is one of `roots` or lies beneath one of them.
 *
 * Containment is decided with `path.relative`: the path is outside a root when
 * the relative path is absolute (e.g. another drive on Windows) or climbs out
 * through a leading `..` SEGMENT. Only a whole `..` segment counts — an entry
 * whose name merely begins with two dots (`..cache`, `..hidden.txt`) is still
 * inside the root.
 *
 * @param {string} resolved absolute path to test
 * @param {Iterable<string>} roots absolute root directories
 * @returns {boolean}
 */
function isInsideAnyRoot(resolved, roots) {
    const parentPrefix = `..${path.sep}`;
    for (const root of roots) {
        const rel = path.relative(root, resolved);
        if (rel === "") {
            return true; // the root itself
        }
        if (path.isAbsolute(rel)) {
            continue; // no relative route from this root
        }
        if (rel !== ".." && !rel.startsWith(parentPrefix)) {
            return true;
        }
    }
    return false;
}
|
|
622
|
+
/**
 * Read a LOCAL file with the same safety posture as `read` / outbound media:
 *   1. media-path guard (`validateOutboundMediaPath`) — refuses the path
 *      outright when the verdict is not ok.
 *   2. allowed-root scoping — the path must sit under one of
 *      `allowedLocalRoots(opts)`, otherwise the read is refused.
 * The path goes through `fs.realpathSync` before the root check so a symlink
 * is judged by its target; when that fails (e.g. the file does not exist) the
 * plain resolved path is used and the later `stat` reports "file not found".
 *
 * Only the first `opts.maxBytes` bytes are read from disk when the file is
 * larger than the cap, so an oversized file is never fully loaded into memory.
 *
 * @param {string} source local path as supplied by the caller
 * @param {object} opts `maxBytes` plus whatever `allowedLocalRoots` reads
 * @returns {Promise<{ bytes: Buffer, truncated: boolean }>}
 * @throws {BrigadeToolInputError} refused path, outside roots, missing,
 *   not a regular file, or empty
 */
async function acquireLocalBytes(source, opts) {
    const verdict = validateOutboundMediaPath(source);
    if (!verdict.ok) {
        throw new BrigadeToolInputError(verdict.reason ?? "refusing to read that path");
    }
    let resolved;
    try {
        resolved = fs.realpathSync(path.resolve(source));
    }
    catch {
        resolved = path.resolve(source);
    }
    const roots = allowedLocalRoots(opts);
    if (!isInsideAnyRoot(resolved, roots)) {
        throw new BrigadeToolInputError("refusing to read a path outside the allowed roots (workspace / current dir / cache / temp). " +
            "Move the file into the workspace, or pass a URL.");
    }
    let stat;
    try {
        stat = await fsp.stat(resolved);
    }
    catch {
        throw new BrigadeToolInputError(`file not found: ${source}`);
    }
    if (!stat.isFile())
        throw new BrigadeToolInputError(`not a file: ${source}`);
    if (stat.size === 0)
        throw new BrigadeToolInputError(`file is empty: ${source}`);
    const cap = opts.maxBytes;
    // Fits under the cap (or there is no usable numeric cap): read it whole.
    // The length re-check covers a file that grew after `stat`.
    if (!(stat.size > cap)) {
        const whole = await fsp.readFile(resolved);
        const truncated = whole.length > cap;
        return { bytes: truncated ? whole.subarray(0, cap) : whole, truncated };
    }
    // Larger than the cap: pull only the leading `cap` bytes off disk.
    const wanted = Math.max(0, Math.floor(cap));
    const handle = await fsp.open(resolved, "r");
    try {
        const head = Buffer.alloc(wanted);
        let filled = 0;
        // A single read may return fewer bytes than requested; loop until the
        // buffer is full or the file ends.
        while (filled < wanted) {
            const { bytesRead } = await handle.read(head, filled, wanted - filled, filled);
            if (bytesRead === 0)
                break;
            filled += bytesRead;
        }
        return { bytes: head.subarray(0, filled), truncated: filled >= wanted };
    }
    finally {
        await handle.close();
    }
}
|
|
663
|
+
/**
 * Fetch a URL through the SSRF guard (`guardedFetch`) with size + timeout
 * caps. The body is read via `readBodyCapped`, so at most `opts.maxBytes`
 * bytes are kept.
 *
 * On an HTTP error status the unread response body is cancelled before the
 * error is raised, so the underlying connection is not left waiting on a body
 * nobody will consume.
 *
 * @param {string} source http(s) URL
 * @param {{ maxBytes: number, signal?: AbortSignal }} opts
 * @returns {Promise<{ bytes: Buffer, mime: string | undefined, truncated: boolean }>}
 *   `mime` is the raw `content-type` response header, if any.
 * @throws {BrigadeToolInputError} when the response status is >= 400
 */
async function acquireUrlBytes(source, opts) {
    const { response } = await guardedFetch(source, {
        method: "GET",
        headers: {
            accept: "*/*",
            "user-agent": "Mozilla/5.0 (compatible; Brigade/1.0; +https://brigade.spinabot.com)",
        },
        timeoutMs: FETCH_TIMEOUT_MS,
        // Only forward a signal when the caller supplied one.
        ...(opts.signal ? { signal: opts.signal } : {}),
    });
    if (response.status >= 400) {
        try {
            await response.body?.cancel();
        }
        catch {
            /* best-effort cleanup — the HTTP error below is what matters */
        }
        throw new BrigadeToolInputError(`fetch failed: HTTP ${response.status} for ${source}`);
    }
    const mime = response.headers.get("content-type") ?? undefined;
    const capped = await readBodyCapped(response, opts.maxBytes);
    return { bytes: capped.buf, mime, truncated: capped.truncated };
}
|
|
685
|
+
/**
 * Drain a Response body into a single Buffer, keeping at most `maxBytes`.
 *
 * With no readable stream on the response the whole `arrayBuffer()` is taken
 * and sliced. Otherwise chunks are pulled one at a time; the chunk that would
 * overflow the cap is clipped, the reader is cancelled, and reading stops.
 *
 * @param {Response} response
 * @param {number} maxBytes
 * @returns {Promise<{ buf: Buffer, truncated: boolean }>}
 */
async function readBodyCapped(response, maxBytes) {
    const stream = response.body;
    if (!stream) {
        const whole = Buffer.from(await response.arrayBuffer());
        if (whole.length > maxBytes) {
            return { buf: whole.subarray(0, maxBytes), truncated: true };
        }
        return { buf: whole, truncated: false };
    }
    const reader = stream.getReader();
    const pieces = [];
    let received = 0;
    let clipped = false;
    let step = await reader.read();
    while (!step.done) {
        if (step.value) {
            const piece = Buffer.from(step.value);
            const room = maxBytes - received;
            if (piece.length > room) {
                // Keep only what still fits, then stop the transfer.
                pieces.push(piece.subarray(0, room));
                clipped = true;
                try {
                    await reader.cancel();
                }
                catch {
                    /* ignore */
                }
                break;
            }
            pieces.push(piece);
            received += piece.length;
        }
        step = await reader.read();
    }
    return { buf: Buffer.concat(pieces), truncated: clipped };
}
|
|
720
|
+
/* ─────────────────────────── page-range parsing ─────────────────────────── */
|
|
721
|
+
/**
 * Turn a 1-indexed page/slide spec into a predicate over 1-indexed numbers.
 *
 * Accepted forms: `"3"` (one page), `"1-5"`, `"2-"` (to the end) and `"-4"`
 * (from the start). Range bounds are clamped to `[1, total]`. A blank or
 * unparseable spec yields a predicate that accepts everything; this function
 * is best-effort and does not throw on malformed strings. Exported for tests.
 *
 * @param {string | undefined} spec
 * @param {number} total number of pages/slides available
 * @returns {(n: number) => boolean}
 */
export function parsePageRange(spec, total) {
    const acceptAll = () => true;
    if (!spec || !spec.trim()) {
        return acceptAll;
    }
    const trimmed = spec.trim();
    const range = /^(\d+)?\s*-\s*(\d+)?$/.exec(trimmed);
    if (range !== null) {
        const [, fromText, toText] = range;
        // A missing bound means "open ended" on that side.
        const first = fromText ? Math.max(1, parseInt(fromText, 10)) : 1;
        const last = toText ? Math.min(total, parseInt(toText, 10)) : total;
        return (page) => first <= page && page <= last;
    }
    if (!/^\d+$/.test(trimmed)) {
        return acceptAll;
    }
    const only = parseInt(trimmed, 10);
    if (Number.isFinite(only)) {
        return (page) => page === only;
    }
    return acceptAll;
}
|
|
741
|
+
/* ─────────────────────────── XML text helpers (docx/pptx/xlsx) ─────────────────────────── */
|
|
742
|
+
/**
 * Decode the 5 predefined XML entities (`&lt;` `&gt;` `&quot;` `&apos;`
 * `&amp;`) plus decimal (`&#NN;`) and hexadecimal (`&#xHH;`) character
 * references.
 *
 * Everything is decoded in ONE left-to-right pass, so the output of one
 * replacement is never re-scanned: `&amp;lt;` becomes the text `&lt;` and
 * `&#38;amp;` becomes `&amp;`, not `<` / `&`. Unknown entity names are left
 * untouched; numeric references go through `safeCodePoint`.
 *
 * @param {string} s XML text content
 * @returns {string}
 */
function decodeXmlEntities(s) {
    const named = { lt: "<", gt: ">", quot: '"', apos: "'", amp: "&" };
    return s.replace(/&(?:(lt|gt|quot|apos|amp)|#x([0-9a-fA-F]+)|#(\d+));/g, (_m, name, hex, dec) => {
        if (name !== undefined) {
            return named[name];
        }
        return hex !== undefined
            ? safeCodePoint(parseInt(hex, 16))
            : safeCodePoint(parseInt(dec, 10));
    });
}
|
|
753
|
+
/**
 * Convert a numeric character reference to its string.
 *
 * @param {number} code candidate Unicode code point
 * @returns {string} the character, or "" when `code` is not a finite number
 *   in the range 0..0x10FFFF
 */
function safeCodePoint(code) {
    const inRange = Number.isFinite(code) && code >= 0 && code <= 0x10ffff;
    if (!inRange) {
        return "";
    }
    return String.fromCodePoint(code);
}
|
|
756
|
+
/**
 * Pull text from OOXML `<a:t>` / `<w:t>` / `<t>` run elements in document
 * order. Works for Word (`w:t`), PowerPoint (`a:t`), and Excel shared strings
 * (`t`).
 *
 * Paragraph ends (`</w:p>`, `</a:p>`) and explicit line breaks (`<w:br/>`,
 * `<a:br/>`) become newlines. Break markers and text runs are tokenized by ONE
 * regex so each break is emitted at its position between the runs. Matching
 * only the runs would discard the breaks, because they sit outside the
 * `<t>` elements.
 *
 * @param {string} xml raw part XML
 * @returns {string} run text (entity-decoded) with "\n" at each boundary
 */
function ooxmlRunsToText(xml) {
    // Alternatives 1-2: boundary markers (no capture group).
    // Alternative 3: <prefix:t ...>text</prefix:t> or bare <t ...>text</t>.
    const tokenRe = /<\/(?:w|a):p>|<(?:w|a):br\s*\/?>|<(?:[a-zA-Z]+:)?t(?:\s[^>]*)?>([\s\S]*?)<\/(?:[a-zA-Z]+:)?t>/g;
    const out = [];
    for (const token of xml.matchAll(tokenRe)) {
        const runText = token[1];
        // An undefined capture means a boundary marker matched, not a run.
        out.push(runText === undefined ? "\n" : decodeXmlEntities(runText));
    }
    return out.join("");
}
|
|
778
|
+
/**
 * Unzip an archive held in memory and return what `unzipSync` produces.
 * fflate is imported lazily so the unzip cost stays off the cold-start path.
 *
 * @param {Buffer | Uint8Array} bytes raw archive bytes
 * @returns {Promise<object>} the archive entries (indexed by entry name elsewhere in this file)
 * @throws {BrigadeToolInputError} when the bytes cannot be unzipped
 */
async function unzipEntries(bytes) {
    const fflate = await import("fflate");
    try {
        const archive = new Uint8Array(bytes);
        return fflate.unzipSync(archive);
    }
    catch {
        // A corrupt / non-zip input makes fflate throw; surface a clean
        // tool-input error so the model sees a usable message instead of a
        // raw library throw.
        throw new BrigadeToolInputError("could not read the file as an Office document (corrupt, password-protected, or not a real .docx/.pptx/.xlsx)");
    }
}
|
|
791
|
+
/**
 * Return one archive entry decoded to a string via fflate's `strFromU8`.
 *
 * @param {object} entries unzipped entries, indexed by name
 * @param {string} name entry path inside the archive
 * @returns {Promise<string | undefined>} `undefined` when the entry is absent
 */
async function entryText(entries, name) {
    const raw = entries[name];
    if (raw) {
        const fflate = await import("fflate");
        return fflate.strFromU8(raw);
    }
    return undefined;
}
|
|
798
|
+
/* ─────────────────────────── per-format extractors ─────────────────────────── */
|
|
799
|
+
/**
 * DOCX → text. Unzips the package, reads `word/document.xml`, flattens its
 * runs with `ooxmlRunsToText`, collapses runs of 3+ newlines to a blank line,
 * and trims.
 *
 * @param {Buffer | Uint8Array} bytes raw .docx bytes
 * @returns {Promise<string>}
 * @throws {BrigadeToolInputError} when the main document part is missing or
 *   yields no text
 */
async function extractDocx(bytes) {
    const archive = await unzipEntries(bytes);
    const documentXml = await entryText(archive, "word/document.xml");
    if (!documentXml) {
        throw new BrigadeToolInputError("not a valid .docx (missing word/document.xml)");
    }
    const collapsed = ooxmlRunsToText(documentXml).replace(/\n{3,}/g, "\n\n");
    const body = collapsed.trim();
    if (body.length === 0) {
        throw new BrigadeToolInputError("no extractable text in the .docx");
    }
    return body;
}
|
|
809
|
+
/**
 * PPTX → text, one labelled section per slide.
 *
 * Slides are the `ppt/slides/slideN.xml` parts, ordered by N. Sections are
 * labelled by POSITION in that ordering (1-based), and `pages` filters on the
 * same position numbers.
 *
 * @param {Buffer | Uint8Array} bytes raw .pptx bytes
 * @param {string} [pages] optional range spec understood by `parsePageRange`
 * @returns {Promise<string>}
 * @throws {BrigadeToolInputError} when no slide parts exist or nothing was
 *   selected/extracted
 */
async function extractPptx(bytes, pages) {
    const archive = await unzipEntries(bytes);
    const slideParts = Object.keys(archive)
        .filter((entryName) => /^ppt\/slides\/slide\d+\.xml$/.test(entryName))
        .sort((left, right) => slideNum(left) - slideNum(right));
    if (slideParts.length === 0) {
        throw new BrigadeToolInputError("not a valid .pptx (no slides found)");
    }
    const wanted = parsePageRange(pages, slideParts.length);
    const sections = [];
    for (const [index, slidePart] of slideParts.entries()) {
        const slideNo = index + 1;
        if (wanted(slideNo)) {
            const xml = await entryText(archive, slidePart);
            let body = "";
            if (xml) {
                body = ooxmlRunsToText(xml).replace(/\n{3,}/g, "\n\n").trim();
            }
            sections.push(`--- Slide ${slideNo} ---\n${body}`);
        }
    }
    const deckText = sections.join("\n\n").trim();
    if (!deckText) {
        throw new BrigadeToolInputError("no extractable text in the .pptx");
    }
    return deckText;
}
|
|
832
|
+
/**
 * Numeric suffix of a `slideN.xml` part name, used as a sort key.
 *
 * @param {string} name archive entry name
 * @returns {number} N, or 0 when the name does not end in `slide<digits>.xml`
 */
function slideNum(name) {
    const digits = name.match(/slide(\d+)\.xml$/)?.[1];
    return digits === undefined ? 0 : parseInt(digits, 10);
}
|
|
836
|
+
/**
 * XLSX → CSV-like text, one labelled section per worksheet.
 *
 * The shared-strings table (`xl/sharedStrings.xml`, optional) is flattened
 * first, one entry per `<si>`, because worksheet cells refer to it by index.
 * Worksheets are the `xl/worksheets/sheetN.xml` parts ordered by N and
 * labelled by their 1-based position in that ordering.
 *
 * @param {Buffer | Uint8Array} bytes raw .xlsx bytes
 * @returns {Promise<string>}
 * @throws {BrigadeToolInputError} when there are no worksheet parts or no data
 */
async function extractXlsx(bytes) {
    const archive = await unzipEntries(bytes);
    const sharedXml = await entryText(archive, "xl/sharedStrings.xml");
    // One <si> may hold several <t> runs; ooxmlRunsToText joins them.
    const sharedStrings = sharedXml
        ? Array.from(sharedXml.matchAll(/<si\b[^>]*>([\s\S]*?)<\/si>/g), (hit) => ooxmlRunsToText(hit[1] ?? ""))
        : [];
    const worksheetParts = Object.keys(archive)
        .filter((entryName) => /^xl\/worksheets\/sheet\d+\.xml$/.test(entryName))
        .sort((left, right) => sheetNum(left) - sheetNum(right));
    if (worksheetParts.length === 0) {
        throw new BrigadeToolInputError("not a valid .xlsx (no worksheets found)");
    }
    const lines = [];
    for (const [index, worksheetPart] of worksheetParts.entries()) {
        const xml = await entryText(archive, worksheetPart);
        if (xml) {
            lines.push(`--- Sheet ${index + 1} ---`, sheetXmlToCsv(xml, sharedStrings));
        }
    }
    const workbookText = lines.join("\n").replace(/\n{3,}/g, "\n\n").trim();
    if (!workbookText) {
        throw new BrigadeToolInputError("no extractable data in the .xlsx");
    }
    return workbookText;
}
|
|
867
|
+
/**
 * Numeric suffix of a `sheetN.xml` part name, used as a sort key.
 *
 * @param {string} name archive entry name
 * @returns {number} N, or 0 when the name does not end in `sheet<digits>.xml`
 */
function sheetNum(name) {
    const digits = name.match(/sheet(\d+)\.xml$/)?.[1];
    return digits === undefined ? 0 : parseInt(digits, 10);
}
|
|
871
|
+
/**
 * Turn a worksheet XML into CSV-ish rows. Each `<row>` becomes a line; each
 * `<c>` cell is resolved — `t="s"` cells index into the shared-string table,
 * inline / numeric cells use their inline `<t>` (or `<v>`). Best-effort: cells
 * are emitted in document order separated by commas (column gaps are not
 * reconstructed — text fidelity over grid fidelity).
 *
 * Self-closing elements are matched explicitly and FIRST. `<c .../>` (an
 * empty, often style-only cell) yields an empty field and `<row .../>` an
 * empty line. If the open-tag pattern were tried first it would treat the
 * trailing `/` as part of the attributes and swallow the following cell/row,
 * dropping the empty cell and resolving the next value with the wrong cell's
 * attributes.
 *
 * @param {string} xml worksheet part XML
 * @param {string[]} shared shared-string table
 * @returns {string} rows joined by "\n"
 */
function sheetXmlToCsv(xml, shared) {
    const rowRe = /<row\b[^>]*\/>|<row\b[^>]*>([\s\S]*?)<\/row>/g;
    // Group 1: attrs of a self-closing cell. Groups 2/3: attrs + inner XML of
    // a cell with content.
    const cellRe = /<c\b([^>]*)\/>|<c\b([^>]*)>([\s\S]*?)<\/c>/g;
    const rows = [];
    for (const rowMatch of xml.matchAll(rowRe)) {
        const rowXml = rowMatch[1] ?? "";
        const cells = [];
        for (const cellMatch of rowXml.matchAll(cellRe)) {
            const attrs = cellMatch[1] ?? cellMatch[2] ?? "";
            const inner = cellMatch[3] ?? "";
            const isShared = /\bt="s"/.test(attrs);
            const vMatch = /<v\b[^>]*>([\s\S]*?)<\/v>/.exec(inner);
            const inlineT = /<t\b[^>]*>([\s\S]*?)<\/t>/.exec(inner);
            let value = "";
            if (isShared && vMatch) {
                // <v> holds an index into the shared-string table.
                const idx = parseInt(vMatch[1] ?? "", 10);
                value = Number.isFinite(idx) ? shared[idx] ?? "" : "";
            }
            else if (inlineT) {
                value = decodeXmlEntities(inlineT[1] ?? "");
            }
            else if (vMatch) {
                value = decodeXmlEntities(vMatch[1] ?? "");
            }
            // CSV-escape: wrap in quotes when it contains a comma / quote / newline.
            if (/[",\n]/.test(value))
                value = `"${value.replace(/"/g, '""')}"`;
            cells.push(value);
        }
        rows.push(cells.join(","));
    }
    return rows.join("\n");
}
|
|
914
|
+
/**
 * PDF → per-page text via unpdf (imported lazily). Pages selected by `pages`
 * are emitted as `--- Page N ---` sections; a page with no text still gets its
 * header.
 *
 * @param {Buffer | Uint8Array} bytes raw PDF bytes
 * @param {string} [pages] optional range spec understood by `parsePageRange`
 * @returns {Promise<{ text: string, totalPages: number }>}
 * @throws {BrigadeToolInputError} when the document cannot be opened
 */
async function extractPdf(bytes, pages) {
    const unpdf = await import("unpdf");
    let doc;
    try {
        doc = await unpdf.getDocumentProxy(new Uint8Array(bytes));
    }
    catch {
        throw new BrigadeToolInputError("could not parse the PDF (corrupt or password-protected?)");
    }
    const extraction = await unpdf.extractText(doc, { mergePages: false });
    const totalPages = extraction.totalPages;
    // The result is normalised to an array in case a single merged value comes back.
    const pageTexts = Array.isArray(extraction.text) ? extraction.text : [String(extraction.text)];
    const wanted = parsePageRange(pages, totalPages);
    const sections = [];
    for (const [index, raw] of pageTexts.entries()) {
        const pageNo = index + 1;
        if (wanted(pageNo)) {
            sections.push(`--- Page ${pageNo} ---\n${(raw ?? "").trim()}`);
        }
    }
    return { text: sections.join("\n\n").trim(), totalPages };
}
|
|
938
|
+
/**
 * HTML bytes → markdown text.
 *
 * The shared readability extractor is tried first; if its promise rejects or
 * it yields null/undefined, the basic extractor is used instead. The result is
 * rendered through `composeFetchBody` in markdown mode, capped at
 * `DEFAULT_MAX_CHARS`.
 *
 * @param {Buffer} bytes raw HTML bytes (decoded as UTF-8)
 * @param {string} [baseUrl] passed through to the readability extractor
 * @returns {Promise<string>}
 */
async function extractHtml(bytes, baseUrl) {
    const markup = bytes.toString("utf8");
    let content = await extractReadableContent(markup, baseUrl).catch(() => null);
    if (content === null || content === undefined) {
        content = extractBasicHtmlContent(markup);
    }
    const rendered = composeFetchBody(content, {
        extractMode: "markdown",
        maxChars: DEFAULT_MAX_CHARS,
    });
    return rendered.text;
}
|
|
949
|
+
/* ── extra document formats (ODF / EPUB / RTF / IPYNB) — broader than rivals ── */
|
|
950
|
+
/**
 * Flatten OpenDocument `content.xml` markup to plain text.
 *
 * Block markers are rewritten to whitespace first — `<text:line-break/>` and
 * the ends of `<text:p>`, `<text:h>` and `<table:table-row>` become newlines;
 * `<text:tab/>` and the end of `<table:table-cell>` become tabs. Every
 * remaining tag is then dropped, entities are decoded, trailing blanks before
 * a newline are removed and runs of 3+ newlines collapse to one blank line.
 *
 * @param {string} xml content.xml markup
 * @returns {string}
 */
function odfXmlToText(xml) {
    const blockMarkers = [
        [/<text:line-break\s*\/?>/g, "\n"],
        [/<text:tab\s*\/?>/g, "\t"],
        [/<\/text:p>/g, "\n"],
        [/<\/text:h>/g, "\n"],
        [/<\/table:table-row>/g, "\n"],
        [/<\/table:table-cell>/g, "\t"],
    ];
    let marked = xml;
    for (const [pattern, whitespace] of blockMarkers) {
        marked = marked.replace(pattern, whitespace);
    }
    const untagged = marked.replace(/<[^>]+>/g, "");
    const decoded = decodeXmlEntities(untagged);
    const tidy = decoded.replace(/[ \t]+\n/g, "\n").replace(/\n{3,}/g, "\n\n");
    return tidy.trim();
}
|
|
972
|
+
/**
 * OpenDocument (odt/ods/odp) → text taken from the package's `content.xml`.
 *
 * @param {Buffer | Uint8Array} bytes raw document bytes
 * @param {string} kind extension used in error messages ("odt" / "ods" / "odp")
 * @returns {Promise<string>}
 * @throws {BrigadeToolInputError} when `content.xml` is missing or holds no text
 */
async function extractOpenDocument(bytes, kind) {
    const archive = await unzipEntries(bytes);
    const contentXml = await entryText(archive, "content.xml");
    if (!contentXml) {
        throw new BrigadeToolInputError(`not a valid .${kind} (missing content.xml — corrupt or not an OpenDocument file)`);
    }
    const body = odfXmlToText(contentXml);
    if (body) {
        return body;
    }
    throw new BrigadeToolInputError(`no extractable text in the .${kind}`);
}
|
|
983
|
+
/**
 * EPUB → concatenated readable text. An EPUB is a zip of XHTML "chapters"; we
 * read them in spine order (from the OPF manifest) when resolvable, else fall
 * back to every `.x?html` entry sorted by name. Each chapter's markup is run
 * through the basic HTML extractor so only the readable text survives.
 *
 * Robustness notes:
 *  - A manifest href with a malformed percent-escape is looked up verbatim
 *    instead of letting `decodeURIComponent` throw a raw `URIError`.
 *  - Archive membership is an own-property check, so an href such as
 *    `constructor` cannot resolve to an inherited object member.
 *  - Work is bounded: chapters stop being read once the joined text exceeds
 *    `DEFAULT_MAX_CHARS` (tracked incrementally, not by re-joining).
 *
 * @param {Buffer | Uint8Array} bytes raw .epub bytes
 * @returns {Promise<string>} chapter texts joined by blank lines
 * @throws {BrigadeToolInputError} when no chapter yields text
 */
async function extractEpub(bytes) {
    const entries = await unzipEntries(bytes);
    const names = Object.keys(entries);
    const hasEntry = (name) => Object.prototype.hasOwnProperty.call(entries, name) && Boolean(entries[name]);
    const decodeHref = (href) => {
        try {
            return decodeURIComponent(href);
        }
        catch {
            return href; // malformed %-sequence — use the raw value
        }
    };
    // Resolve spine order via the OPF (content.opf) when present.
    const opfName = names.find((n) => /\.opf$/i.test(n));
    let ordered = [];
    if (opfName) {
        const opf = (await entryText(entries, opfName)) ?? "";
        // Manifest hrefs are relative to the OPF's own directory.
        const opfDir = opfName.includes("/") ? opfName.slice(0, opfName.lastIndexOf("/") + 1) : "";
        // manifest: id → href
        const idToHref = new Map();
        const itemRe = /<item\b[^>]*\bid="([^"]+)"[^>]*\bhref="([^"]+)"[^>]*\/?>/g;
        let im;
        while ((im = itemRe.exec(opf)) !== null) {
            idToHref.set(im[1], im[2]);
        }
        // also handle href-before-id ordering
        const itemRe2 = /<item\b[^>]*\bhref="([^"]+)"[^>]*\bid="([^"]+)"[^>]*\/?>/g;
        while ((im = itemRe2.exec(opf)) !== null) {
            if (!idToHref.has(im[2]))
                idToHref.set(im[2], im[1]);
        }
        const spineRe = /<itemref\b[^>]*\bidref="([^"]+)"/g;
        let sm;
        while ((sm = spineRe.exec(opf)) !== null) {
            const href = idToHref.get(sm[1]);
            if (href) {
                const full = decodeHref(opfDir + href).replace(/^\.\//, "");
                if (hasEntry(full))
                    ordered.push(full);
            }
        }
    }
    if (ordered.length === 0) {
        ordered = names.filter((n) => /\.x?html?$/i.test(n)).sort();
    }
    const parts = [];
    let joinedLength = 0; // length of parts.join("\n\n"), kept up to date
    for (const name of ordered) {
        const html = (await entryText(entries, name)) ?? "";
        if (!html.trim())
            continue;
        const extracted = extractBasicHtmlContent(html);
        const { text } = composeFetchBody(extracted, { extractMode: "markdown", maxChars: DEFAULT_MAX_CHARS });
        const chapter = text.trim();
        if (chapter) {
            joinedLength += chapter.length + (parts.length > 0 ? 2 : 0);
            parts.push(chapter);
        }
        if (joinedLength > DEFAULT_MAX_CHARS)
            break; // bound the work
    }
    const joined = parts.join("\n\n").trim();
    if (!joined)
        throw new BrigadeToolInputError("no extractable text in the .epub");
    return joined;
}
|
|
1042
|
+
/**
 * RTF → plain text. Best-effort — fidelity is text, not layout.
 *
 * The input is tokenized in ONE left-to-right pass (control word, `\'hh` hex
 * escape, control symbol, group brace, raw line ending, text run) while a
 * stack tracks per-group state. A single pass means each escape is interpreted
 * exactly once: an escaped `\\`, `\{` or `\}` cannot be re-read as markup.
 *
 *  - Groups opened by a listed destination word (fonts / colours / styles /
 *    info / pictures / objects / …) or flagged with the `\*` "ignorable"
 *    marker produce no text, at any nesting depth.
 *  - `\par`, `\line`, `\row` → newline; `\tab`, `\cell` → tab; a few
 *    typographic control words map to their characters. `\pard` only resets
 *    paragraph formatting and emits nothing.
 *  - `\'hh` → the byte as a latin1 character.
 *  - `\uN` → the code unit (negative N wraps by 65536). The fallback that
 *    follows it — `\ucN` characters, default 1, each a plain character or a
 *    `\'hh` escape — is dropped. Out-of-range N is ignored, not thrown on.
 *  - Raw CR/LF in the file are formatting of the RTF source, not text.
 *
 * @param {Buffer} bytes raw .rtf bytes
 * @returns {string} extracted text (may be empty)
 * @throws {BrigadeToolInputError} when the `{\rtf` header is missing
 */
function extractRtf(bytes) {
    const rtf = bytes.toString("latin1");
    if (!/^\s*{\\rtf/i.test(rtf)) {
        throw new BrigadeToolInputError("not a valid .rtf (missing the {\\rtf header)");
    }
    // Destinations that carry no body text.
    const skippedDestinations = new Set([
        "fonttbl", "colortbl", "stylesheet", "info", "pict", "object", "themedata",
        "colorschememapping", "latentstyles", "datastore", "generator",
    ]);
    // Control words that stand for literal output.
    const wordText = new Map([
        ["par", "\n"], ["line", "\n"], ["row", "\n"],
        ["tab", "\t"], ["cell", "\t"],
        ["emdash", "\u2014"], ["endash", "\u2013"], ["bullet", "\u2022"],
        ["lquote", "\u2018"], ["rquote", "\u2019"],
        ["ldblquote", "\u201c"], ["rdblquote", "\u201d"],
    ]);
    // 1: control word, 2: its numeric argument, 3: hex escape, 4: control
    // symbol, 5: group brace, (raw CR/LF: no group), 6: text run.
    const tokenRe = /\\([a-zA-Z]{1,32})(-?\d{1,10})?[ ]?|\\'([0-9a-fA-F]{2})|\\([^a-zA-Z])|([{}])|[\r\n]+|([^\\{}\r\n]+)/g;
    const out = [];
    const groups = []; // saved state of each enclosing group
    let ignoring = false; // inside a destination that yields no text
    let fallbackWidth = 1; // current \ucN: fallback chars after each \uN
    let toSkip = 0; // fallback chars still to drop
    for (const token of rtf.matchAll(tokenRe)) {
        const [, word, arg, hex, symbol, brace, text] = token;
        if (brace !== undefined) {
            toSkip = 0;
            if (brace === "{") {
                groups.push({ ignoring, fallbackWidth });
            }
            else {
                const saved = groups.pop();
                if (saved)
                    ({ ignoring, fallbackWidth } = saved);
            }
        }
        else if (word !== undefined) {
            toSkip = 0;
            if (skippedDestinations.has(word.toLowerCase())) {
                ignoring = true;
            }
            else if (ignoring) {
                // nothing inside an ignored destination matters
            }
            else if (wordText.has(word)) {
                out.push(wordText.get(word));
            }
            else if (word === "uc") {
                const width = parseInt(arg ?? "", 10);
                fallbackWidth = Number.isFinite(width) && width >= 0 ? width : 1;
            }
            else if (word === "u" && arg !== undefined) {
                let code = parseInt(arg, 10);
                if (code < 0)
                    code += 65536; // RTF emits negative for >32767
                if (code >= 0 && code <= 0x10ffff)
                    out.push(String.fromCodePoint(code));
                toSkip = fallbackWidth;
            }
        }
        else if (hex !== undefined) {
            if (toSkip > 0)
                toSkip -= 1; // this escape was the \uN fallback
            else if (!ignoring)
                out.push(String.fromCharCode(parseInt(hex, 16)));
        }
        else if (symbol !== undefined) {
            toSkip = 0;
            if (symbol === "*") {
                ignoring = true; // {\*\dest ...}: ignorable destination
            }
            else if (!ignoring) {
                if (symbol === "\\" || symbol === "{" || symbol === "}")
                    out.push(symbol);
                else if (symbol === "~")
                    out.push("\u00a0");
                else if (symbol === "_")
                    out.push("-");
                else if (symbol === "\n" || symbol === "\r")
                    out.push("\n"); // backslash + line ending == \par
            }
        }
        else if (text !== undefined) {
            let run = text;
            if (toSkip > 0) {
                const dropped = Math.min(toSkip, run.length);
                run = run.slice(dropped);
                toSkip -= dropped;
            }
            if (run && !ignoring)
                out.push(run);
        }
    }
    return out.join("").replace(/[ \t]+\n/g, "\n").replace(/\n{3,}/g, "\n\n").trim();
}
|
|
1077
|
+
/**
 * Jupyter notebook (.ipynb) → text. Walks `cells[]`, joining each cell's
 * `source` (string or string[]) under a per-cell label. Markdown/raw cells are
 * emitted as-is; every other cell type is labelled "code" and fenced so the
 * model can tell code from prose. Cell OUTPUTS are skipped — only the authored
 * source is returned.
 *
 * Labels use the cell's 1-based position in `cells[]`, so numbering has gaps
 * where empty or malformed cells were skipped. A top-level JSON value that is
 * not an object with a `cells` array (including `null`) is reported as a
 * notebook without cells; a `null` / non-object cell is skipped rather than
 * crashing the walk.
 *
 * @param {Buffer} bytes raw notebook bytes (UTF-8 JSON)
 * @returns {string}
 * @throws {BrigadeToolInputError} on invalid JSON, no cells, or no source text
 */
function extractIpynb(bytes) {
    let nb;
    try {
        nb = JSON.parse(bytes.toString("utf8"));
    }
    catch {
        throw new BrigadeToolInputError("not a valid .ipynb (could not parse the notebook JSON)");
    }
    const cells = Array.isArray(nb?.cells) ? nb.cells : [];
    if (cells.length === 0)
        throw new BrigadeToolInputError("the notebook has no cells");
    const parts = [];
    for (const [index, cell] of cells.entries()) {
        const n = index + 1;
        const type = typeof cell?.cell_type === "string" ? cell.cell_type : "code";
        const source = cell?.source;
        const src = Array.isArray(source)
            ? source.join("")
            : typeof source === "string"
                ? source
                : "";
        if (!src.trim())
            continue;
        if (type === "markdown" || type === "raw") {
            parts.push(`--- Cell ${n} (${type}) ---\n${src.trim()}`);
        }
        else {
            parts.push(`--- Cell ${n} (code) ---\n\`\`\`\n${src.trim()}\n\`\`\``);
        }
    }
    const joined = parts.join("\n\n").trim();
    if (!joined)
        throw new BrigadeToolInputError("no source text found in the notebook cells");
    return joined;
}
|
|
1118
|
+
export function makeAnalyzeMediaTool(opts = {}) {
|
|
1119
|
+
const acquireUrl = opts.acquireUrl ?? acquireUrlBytes;
|
|
1120
|
+
const acquireLocal = opts.acquireLocal ?? acquireLocalBytes;
|
|
1121
|
+
const runUnderstanding = opts.runMediaUnderstanding ?? defaultRunMediaUnderstanding;
|
|
1122
|
+
const downscaleImage = opts.downscaleImage ?? downscaleImageToBudget;
|
|
1123
|
+
// Result cache: ON by default. A test-injected read/write seam overrides the
|
|
1124
|
+
// disk implementation; `resultCache:false` disables it entirely.
|
|
1125
|
+
const cacheEnabled = opts.resultCache !== false;
|
|
1126
|
+
const readCache = opts.readCache ?? readMediaCache;
|
|
1127
|
+
const writeCache = opts.writeCache ?? writeMediaCache;
|
|
1128
|
+
const agentId = opts.agentId ?? DEFAULT_AGENT_ID;
|
|
1129
|
+
// Lazily resolve the media-understanding config (key resolution + per-kind
|
|
1130
|
+
// defaults) from Brigade's credential store the first time it is needed, so
|
|
1131
|
+
// constructing the tool never touches the auth store. A test-injected config
|
|
1132
|
+
// short-circuits this.
|
|
1133
|
+
let muConfig = opts.mediaUnderstandingConfig;
|
|
1134
|
+
const getMuConfig = () => {
|
|
1135
|
+
if (!muConfig)
|
|
1136
|
+
muConfig = buildMediaUnderstandingConfig(agentId);
|
|
1137
|
+
return muConfig;
|
|
1138
|
+
};
|
|
1139
|
+
return {
|
|
1140
|
+
name: "analyze_media",
|
|
1141
|
+
label: "Analyze Media",
|
|
1142
|
+
displaySummary: "analyzing media",
|
|
1143
|
+
// Read capability — NOT owner-only. It reads a file/URL the operator
|
|
1144
|
+
// pointed at and hands content to the model; it never mutates state or
|
|
1145
|
+
// spends. The path guard + SSRF guard are the real safety boundary, and
|
|
1146
|
+
// they run for EVERY sender regardless of owner status.
|
|
1147
|
+
ownerOnly: false,
|
|
1148
|
+
description: [
|
|
1149
|
+
"Understand a local file or URL: images, PDF, DOCX, PPTX, XLSX, ODT/ODS/ODP, EPUB, RTF, Jupyter (.ipynb), HTML, plain/structured text (txt/csv/json/xml/yaml/md/log/source code), audio (voice notes), and video (auto-detected by extension/MIME).",
|
|
1150
|
+
"Pass `source` (a single local path or http(s) URL) — or `sources` (an array) to analyze several at once — and a `question` describing what to analyze.",
|
|
1151
|
+
"Images are shown to a vision model (or, on a text-only model, understood via any configured provider with an image-capable model) and oversize images are DOWNSCALED to fit (never truncated); PDF is read natively when a provider key is configured (scanned PDFs work) else extracted to text; office/e-book/notebook/text files are extracted to text; AUDIO is transcribed/summarized via a Google/Gemini key (with an optional `language` hint); VIDEO is understood via a Google/Gemini key.",
|
|
1152
|
+
"Use `pages` to limit a PDF/PPTX range (e.g. \"1-5\"). Use this instead of bash/curl — it applies the SSRF guard for URLs and the path guard for local files.",
|
|
1153
|
+
].join(" "),
|
|
1154
|
+
parameters: AnalyzeMediaParams,
|
|
1155
|
+
execute: async (_toolCallId, args, signal) => {
|
|
1156
|
+
// Resolve the source LIST. `sources[]` (new, batch) wins; else the single
|
|
1157
|
+
// `source` (back-compat) becomes a one-element list. De-dupe blanks.
|
|
1158
|
+
const list = (Array.isArray(args.sources) && args.sources.length > 0
|
|
1159
|
+
? args.sources
|
|
1160
|
+
: args.source
|
|
1161
|
+
? [args.source]
|
|
1162
|
+
: [])
|
|
1163
|
+
.map((s) => (s ?? "").trim())
|
|
1164
|
+
.filter((s) => s.length > 0);
|
|
1165
|
+
if (list.length === 0)
|
|
1166
|
+
throw new BrigadeToolInputError("source required");
|
|
1167
|
+
// Single source → the exact existing behaviour (one result, image block
|
|
1168
|
+
// or text). Multiple → the batch merge.
|
|
1169
|
+
if (list.length === 1)
|
|
1170
|
+
return analyzeOne(list[0], args, signal);
|
|
1171
|
+
return analyzeBatch(list, args, signal);
|
|
1172
|
+
},
|
|
1173
|
+
};
|
|
1174
|
+
/* ── single-source pipeline (the original per-source path) ── */
|
|
1175
|
+
/**
 * Analyze ONE source end-to-end → a complete tool result (image or text).
 *
 * Steps: choose the byte budgets, acquire the bytes (URL fetch or local read),
 * detect the media kind, then dispatch to the matching per-kind handler.
 *
 * @param source A local path or an http(s) URL (anything not matching
 *   `http://` / `https://` is treated as a path).
 * @param args Tool-call arguments. Read here: `question` / `prompt`, `kind`,
 *   `maxBytes`, `pages`, `mode`, `language`, `includeImages`, `provider`,
 *   `model`, `maxTokens`.
 * @param signal Optional abort signal; forwarded to the URL fetch and to the
 *   handlers that accept one.
 * @returns The handler's tool result, or a `failure(...)` result when the
 *   kind cannot be detected or is not one this dispatcher handles.
 * @throws BrigadeToolInputError when acquisition fails with an
 *   `SsrfBlockedError`; any other acquisition error is rethrown unchanged.
 */
async function analyzeOne(source, args, signal) {
    const question = (args.question ?? args.prompt ?? "").trim();
    const isUrl = /^https?:\/\//i.test(source);
    const sourceType = isUrl ? "url" : "path";
    // Does the source LOOK like an image? An explicit `kind` is authoritative
    // (it is also what `detectKind` is given as its override below); only when
    // the caller gave none does the file extension decide. Letting the
    // extension win over an explicit non-image kind would read e.g. a
    // `kind:"pdf"` source named `*.png` at the absolute ceiling instead of the
    // caller's byte cap.
    const looksImage = args.kind
        ? args.kind === "image"
        : EXT_KIND[extensionOf(source)] === "image";
    // The byte BUDGET an image must fit into (downscaled if larger).
    const imageBudget = clampBytes(args.maxBytes, true);
    const maxBytes = clampBytes(args.maxBytes, looksImage);
    // An image is read whole (up to the absolute ceiling) so the image handler
    // can DOWNSCALE it to `imageBudget`; cutting an image mid-stream would
    // corrupt it. Non-image sources keep the ordinary cap, where a byte prefix
    // is acceptable.
    const readCap = looksImage ? MAX_BYTES_CEILING : maxBytes;
    // Acquire bytes with the guard that matches the source type.
    let acquired;
    try {
        acquired = isUrl
            ? await acquireUrl(source, {
                maxBytes: readCap,
                ...(signal ? { signal } : {}),
            })
            : await acquireLocal(source, {
                ...(opts.workspaceDir ? { workspaceDir: opts.workspaceDir } : {}),
                ...(opts.cwd ? { cwd: opts.cwd } : {}),
                ...(opts.ownerLocalAccess ? { ownerLocalAccess: true } : {}),
                maxBytes: readCap,
            });
    }
    catch (err) {
        if (err instanceof SsrfBlockedError) {
            throw new BrigadeToolInputError(`refused to fetch the URL: ${err.reason}`);
        }
        throw err;
    }
    // Detect kind (override → ext → MIME).
    const kind = detectKind({
        source,
        ...(args.kind ? { override: args.kind } : {}),
        ...(acquired.mime ? { mime: acquired.mime } : {}),
    });
    // Fields every handler receives.
    const base = { source, sourceType, bytes: acquired.bytes, question };
    // Optional provider overrides, forwarded only when actually set.
    const providerArgs = {
        ...(args.provider ? { provider: args.provider } : {}),
        ...(args.model ? { model: args.model } : {}),
        ...(args.maxTokens !== undefined ? { maxTokens: args.maxTokens } : {}),
        ...(signal ? { signal } : {}),
    };
    const readAsPlainText = () => handleTextPlain({
        ...base,
        truncated: acquired.truncated,
        ...(acquired.mime ? { mime: acquired.mime } : {}),
    });
    // Shared "cannot handle this" result. The list names every kind the
    // dispatcher below accepts.
    const unsupported = () => failure({
        source,
        sourceType,
        ...(acquired.mime ? { mimeType: acquired.mime } : {}),
        bytes: acquired.bytes.length,
        message: "Unsupported or undetectable media type. Supported: image (png/jpg/jpeg/webp/gif/bmp/heic), pdf, docx, pptx, xlsx, odt/ods/odp, epub, rtf, ipynb, html, text (txt/csv/json/xml/md/yaml/log/source), audio, video. " +
            "Pass an explicit `kind` if the extension/MIME is missing.",
    });
    if (!kind) {
        // Last resort: an unknown extension/MIME whose bytes look like UTF-8
        // text is read as the `text` kind rather than rejected. Binary that is
        // not a known kind stays unsupported.
        return looksLikeUtf8Text(acquired.bytes) ? readAsPlainText() : unsupported();
    }
    // Dispatch per kind.
    switch (kind) {
        case "image":
            return handleImage({
                ...base,
                truncated: acquired.truncated,
                mime: acquired.mime,
                imageBudget,
                modelContext: opts.modelContext,
                ...providerArgs,
            });
        case "video":
            return handleVideo({
                ...base,
                mime: acquired.mime,
                ...providerArgs,
            });
        case "audio":
            return handleAudio({
                ...base,
                mime: acquired.mime,
                ...(args.language ? { language: args.language } : {}),
                ...providerArgs,
            });
        case "pdf":
            return handlePdf({
                ...base,
                truncated: acquired.truncated,
                mime: acquired.mime,
                pages: args.pages,
                mode: args.mode ?? "auto",
                ...providerArgs,
            });
        case "text":
            return readAsPlainText();
        case "docx":
        case "pptx":
        case "xlsx":
        case "html":
        case "odt":
        case "ods":
        case "odp":
        case "epub":
        case "rtf":
        case "ipynb":
            return handleTextExtract({
                kind,
                ...base,
                truncated: acquired.truncated,
                mime: acquired.mime,
                pages: args.pages,
                // Embedded-image surfacing is DEFAULT-ON: only an explicit
                // `includeImages:false` opts out (text-only, cheaper).
                includeImages: args.includeImages !== false,
                modelContext: opts.modelContext,
                imageBudget,
                ...providerArgs,
            });
        default:
            // A kind this dispatcher has no handler for: return a clean
            // failure instead of falling off the switch and returning
            // `undefined` to callers that read `.content` / `.details`.
            return unsupported();
    }
}
|
|
1347
|
+
/* ── batch (multi-source) pipeline ── */
|
|
1348
|
+
/**
 * Analyze MULTIPLE sources in one call. Images are pushed as N image blocks
 * into a single tool result (Pi tool-result content is an array of blocks);
 * non-image sources are reduced to their TEXT and concatenated under per-file
 * labels. Caps: {@link MAX_BATCH_IMAGES} images / {@link MAX_BATCH_DOCS}
 * non-image sources; anything beyond the caps is skipped and noted at the end.
 * Each source is analyzed with the same `args` through `analyzeOne`, one
 * after another.
 *
 * A per-source failure is reported inline (labeled) and never aborts the
 * batch: this covers both a failure RESULT from `analyzeOne` and an error it
 * THROWS (for example a refused URL). The one exception is cancellation —
 * when `signal` is aborted the error is rethrown so the call stops.
 *
 * @param sources Non-empty list of local paths / http(s) URLs.
 * @param args Tool-call arguments; `question` / `prompt` and `kind` are read
 *   here, the rest is forwarded to `analyzeOne`.
 * @param signal Optional abort signal.
 * @returns One merged tool result; `details.ok` is true when at least one
 *   source was analyzed successfully.
 */
async function analyzeBatch(sources, args, signal) {
    const question = (args.question ?? args.prompt ?? "").trim();
    // Partition by the cheap up-front signal (explicit kind, else extension).
    // Nothing is fetched to classify, so an image recognizable only by MIME
    // lands in the non-image group.
    const imageSources = [];
    const otherSources = [];
    for (const s of sources) {
        const k = args.kind ?? EXT_KIND[extensionOf(s)];
        if (k === "image") {
            imageSources.push(s);
        }
        else {
            otherSources.push(s);
        }
    }
    const cappedImages = imageSources.slice(0, MAX_BATCH_IMAGES);
    const cappedOthers = otherSources.slice(0, MAX_BATCH_DOCS);
    const overflow = [];
    if (imageSources.length > MAX_BATCH_IMAGES) {
        overflow.push(`${imageSources.length - MAX_BATCH_IMAGES} image(s)`);
    }
    if (otherSources.length > MAX_BATCH_DOCS) {
        overflow.push(`${otherSources.length - MAX_BATCH_DOCS} document(s)`);
    }
    // Run one source; turn a thrown error into an inline failure result so a
    // single bad source cannot reject the whole batch.
    const runOne = async (src) => {
        try {
            return await analyzeOne(src, args, signal);
        }
        catch (err) {
            // Cancellation must still propagate.
            if (signal?.aborted) {
                throw err;
            }
            const reason = err instanceof Error ? err.message : String(err);
            const message = `Could not analyze this source: ${reason}`;
            return {
                content: [{ type: "text", text: message }],
                details: { ok: false, source: src, returned: "none", message },
            };
        }
    };
    const content = [];
    let anyOk = false;
    let imageCount = 0;
    let textCount = 0;
    const lead = question
        ? `Analyze the ${sources.length} attached sources and answer this:\n${question}`
        : `Analyze the ${sources.length} attached sources and describe / summarize what they contain.`;
    content.push({ type: "text", text: lead });
    // Images first → each becomes its own labeled text + image block.
    for (let i = 0; i < cappedImages.length; i++) {
        const src = cappedImages[i];
        const label = `--- Image ${i + 1}: ${basenameOf(src)} ---`;
        const one = await runOne(src);
        const img = one.content.find((b) => b.type === "image");
        if (img) {
            content.push({ type: "text", text: label });
            content.push(img);
            imageCount += 1;
            anyOk = anyOk || one.details.ok;
        }
        else {
            // No image block came back: carry the text instead. That text is
            // either an explanation of a failure or — when the image was
            // understood by a provider on a text-only model — a successful
            // description, which must count as a success.
            content.push({ type: "text", text: `${label}\n${firstText(one)}` });
            if (one.details.ok) {
                textCount += 1;
                anyOk = true;
            }
        }
    }
    // Non-image sources → concatenated labeled text extractions.
    for (let i = 0; i < cappedOthers.length; i++) {
        const src = cappedOthers[i];
        const label = `--- File ${i + 1}: ${basenameOf(src)} ---`;
        const one = await runOne(src);
        content.push({ type: "text", text: `${label}\n${firstText(one)}` });
        textCount += 1;
        anyOk = anyOk || one.details.ok;
    }
    if (overflow.length > 0) {
        content.push({
            type: "text",
            text: `(Note: ${overflow.join(" and ")} beyond the per-call cap of ${MAX_BATCH_IMAGES} images / ${MAX_BATCH_DOCS} documents were skipped. Split into multiple calls.)`,
        });
    }
    return {
        content,
        details: {
            ok: anyOk,
            source: sources.join(", "),
            sourceType: sources.every((s) => /^https?:\/\//i.test(s)) ? "url" : "path",
            returned: imageCount > 0 ? "image" : textCount > 0 ? "text" : "none",
            bytes: 0,
            message: `Batch of ${sources.length} sources: ${imageCount} image block(s), ${textCount} text extraction(s).`,
        },
    };
}
|
|
1432
|
+
/* ── media-understanding helpers (shared by image/video/pdf provider paths) ── */
|
|
1433
|
+
/**
 * Ask the media-understanding subsystem about `p.bytes` (of kind `p.kind`)
 * and shape the provider's TEXT answer into a tool result.
 *
 * Outcomes, all returned (nothing from the provider call is thrown to the
 * caller):
 *   - `{ ok: true, result }` — a text tool result, from the cache or a fresh call.
 *   - `{ ok: false, unavailable: true, message }` — the call raised
 *     `MediaUnderstandingUnavailableError`, so the caller may fall back.
 *   - `{ ok: false, unavailable: false, result }` — any other error, wrapped
 *     in a `failure(...)` result with `p.failureGuidance` appended when given.
 *
 * Reads from `p`: kind, bytes, mimeType, source, sourceType, question and the
 * optional provider, model, maxTokens, signal, note, failureGuidance.
 */
async function understandViaProvider(p) {
    const cfg = getMuConfig();
    // One shaper for cache hits and fresh answers so both return the same form.
    const toSuccess = (text, resolvedProvider, resolvedModel, fromCache) => {
        const promptText = buildPromptText(p.question, p.kind);
        // The answer can echo instructions embedded in the media itself, so it
        // goes inside the untrusted-content envelope.
        const wrapped = wrapWebContent(text, "web_fetch", { includeWarning: true });
        const notes = [];
        if (p.note) {
            notes.push(p.note);
        }
        if (fromCache) {
            notes.push("cached result");
        }
        let lead = promptText;
        if (notes.length > 0) {
            lead = `${promptText}\n\n(${notes.join("; ")})`;
        }
        const details = {
            ok: true,
            source: p.source,
            sourceType: p.sourceType,
            kind: p.kind,
            mimeType: p.mimeType,
            bytes: p.bytes.length,
            returned: "text",
            provider: resolvedProvider,
            providerModel: resolvedModel,
        };
        return {
            ok: true,
            result: {
                content: [{ type: "text", text: `${lead}\n\n${wrapped}` }],
                details,
            },
        };
    };
    // The cache key is built from the REQUEST (bytes, question, requested
    // provider/model/maxTokens, kind); the provider/model that actually
    // answered are stored in the cached value.
    let cacheKey = "";
    if (cacheEnabled) {
        const keyParts = {
            bytes: p.bytes,
            question: p.question,
            provider: p.provider ?? "auto",
            kind: p.kind,
        };
        if (p.model) {
            keyParts.model = p.model;
        }
        if (p.maxTokens !== undefined) {
            keyParts.maxTokens = p.maxTokens;
        }
        cacheKey = mediaCacheKey(keyParts);
        // A failed cache read is treated as a miss.
        const hit = await readCache(cacheKey).catch(() => undefined);
        if (hit) {
            return toSuccess(hit.text, hit.provider, hit.model, true);
        }
    }
    try {
        const request = {
            kind: p.kind,
            bytes: p.bytes,
            mimeType: p.mimeType,
            cfg,
        };
        if (p.question) {
            request.prompt = p.question;
        }
        if (p.maxTokens !== undefined) {
            request.maxTokens = p.maxTokens;
        }
        if (p.provider) {
            request.provider = p.provider;
        }
        if (p.model) {
            request.model = p.model;
        }
        if (p.signal) {
            request.signal = p.signal;
        }
        const res = await runUnderstanding(request);
        if (cacheEnabled) {
            // Best-effort persist: not awaited, and a write failure is ignored.
            const value = { text: res.text, provider: res.provider, model: res.model };
            void writeCache(cacheKey, value).catch(() => { });
        }
        return toSuccess(res.text, res.provider, res.model, false);
    }
    catch (err) {
        if (err instanceof MediaUnderstandingUnavailableError) {
            return { ok: false, unavailable: true, message: err.message };
        }
        // Anything else becomes a clean failure result.
        const msg = err instanceof Error ? err.message : String(err);
        const guidance = p.failureGuidance ? ` ${p.failureGuidance}` : "";
        const result = failure({
            source: p.source,
            sourceType: p.sourceType,
            kind: p.kind,
            mimeType: p.mimeType,
            bytes: p.bytes.length,
            message: `Provider media-understanding call failed: ${msg}.${guidance}`,
        });
        return { ok: false, unavailable: false, result };
    }
}
|
|
1527
|
+
/* ── handlers (closures so they share `opts`) ── */
|
|
1528
|
+
/**
 * Image handler.
 *
 * Flow:
 *   1. Resolve the MIME type (declared MIME, else from the extension).
 *   2. If the image is over `p.imageBudget` or arrived truncated, try to
 *      DOWNSCALE it (HEIC/HEIF and non-downscalable MIME types are skipped).
 *   3. When the current model is known NOT to accept images
 *      (`modelLikelySeesImages` returned `false`), route the image through
 *      `understandViaProvider` and return its text; with no provider
 *      available, return a text-only result explaining what to change.
 *   4. Otherwise return a text block plus an image block (raw base64, no
 *      `data:` prefix), with any warnings collected along the way.
 *
 * @param p Reads: source, sourceType, bytes, truncated, mime, question,
 *   imageBudget, modelContext and the optional provider, model, maxTokens,
 *   signal.
 * @returns A tool result `{ content, details }`.
 */
async function handleImage(p) {
    const ext = extensionOf(p.source);
    let mimeType = (p.mime?.split(";")[0]?.trim() || imageMimeFromExt(ext)).toLowerCase();
    const isHeic = /heic|heif/.test(mimeType) || ext === "heic" || ext === "heif";
    const sees = modelLikelySeesImages(p.modelContext);
    const promptText = buildPromptText(p.question, "image");
    const warnings = [];
    let bytes = p.bytes;
    let imageTruncated = p.truncated;
    // Set when an over-budget image could not be re-encoded and is therefore
    // passed on at its original size.
    let oversizePassThrough = false;
    // Re-encode the current bytes to fit the budget and adopt the result. A
    // successful re-encode yields a valid image, so the truncation flag clears.
    // Throws when the image cannot be decoded.
    const reencode = async () => {
        const ds = await downscaleImage(bytes, {
            maxBytes: p.imageBudget,
            sourceMime: mimeType,
        });
        bytes = ds.bytes;
        mimeType = ds.mimeType;
        imageTruncated = false;
        return ds;
    };
    // DOWNSCALE rather than truncate: a truncated image is a broken payload.
    // Only pay the decode/encode when the image is over budget or arrived
    // truncated; a small intact image is shipped untouched.
    if (!isHeic && isDownscalableImageMime(mimeType)) {
        const overBudget = bytes.length > p.imageBudget;
        if (overBudget || imageTruncated) {
            try {
                const ds = await reencode();
                if (ds.resized) {
                    warnings.push(`The image exceeded the byte budget, so it was downscaled to ${ds.width}×${ds.height} (re-encoded as JPEG) to fit — detail may be reduced. Raise \`maxBytes\` for a higher-resolution pass.`);
                }
            }
            catch {
                // Could not decode: keep the original bytes. A truncated image
                // gets the truncation warning further down; an intact but
                // over-budget one is flagged here so it does not go out
                // oversize without any notice.
                oversizePassThrough = overBudget && !imageTruncated;
            }
        }
    }
    if (isHeic) {
        warnings.push("This is a HEIC/HEIF image. Brigade cannot transcode it without a native dependency, so it is passed through as-is — many models reject HEIC. If the model cannot read it, ask the operator to convert it to JPEG/PNG.");
    }
    if (sees === false) {
        // The current model is text-only: instead of attaching a block it
        // would reject, have a provider describe the image and return text.
        //
        // A MIME type outside PROVIDER_SAFE_IMAGE_MIME that is still
        // downscalable is re-encoded first so the provider receives a format
        // it accepts, even when the image was small enough to skip the
        // downscale step above.
        if (!isHeic && !PROVIDER_SAFE_IMAGE_MIME.has(mimeType) && isDownscalableImageMime(mimeType)) {
            try {
                await reencode();
            }
            catch {
                // Undecodable — continue with the original bytes/MIME; a
                // provider rejection surfaces as a clean failure result.
            }
        }
        const viaProvider = await understandViaProvider({
            kind: "image",
            source: p.source,
            sourceType: p.sourceType,
            bytes,
            mimeType,
            question: p.question,
            ...(p.maxTokens !== undefined ? { maxTokens: p.maxTokens } : {}),
            ...(p.provider ? { provider: p.provider } : {}),
            ...(p.model ? { model: p.model } : {}),
            ...(p.signal ? { signal: p.signal } : {}),
            note: "The current model is text-only, so the image was understood by a vision-capable provider and the description is below.",
            // Appended to a provider-failure message so the model is told
            // what would unblock the image instead of getting a bare error.
            failureGuidance: "To read this image, the turn needs either a vision-capable model (e.g. a Claude / GPT-4o / Gemini model) or a working media-understanding provider key — check the configured key/quota and retry, or switch models.",
        });
        // Success and provider failure both carry a ready-made result.
        if (viaProvider.ok || !viaProvider.unavailable) {
            return viaProvider.result;
        }
        // No provider available — say so plainly and attach nothing.
        warnings.push("The current model does not appear to accept images, so the image is NOT being attached. Switch to a vision-capable model (e.g. a Claude / GPT-4o / Gemini model), or configure a Google/Anthropic key so Brigade can understand images on any model.");
        return {
            content: [{ type: "text", text: `${promptText}\n\n${warnings.join("\n\n")}` }],
            details: {
                ok: false,
                source: p.source,
                sourceType: p.sourceType,
                kind: "image",
                mimeType,
                bytes: p.bytes.length,
                returned: "none",
                warning: warnings.join(" "),
            },
        };
    }
    if (sees === undefined) {
        warnings.push("Note: Brigade could not confirm this model is vision-capable. If you cannot see the image, switch to a vision-capable model.");
    }
    if (imageTruncated) {
        // Still set only when the image arrived truncated and could not be
        // re-encoded — the attached block may be corrupt.
        warnings.push("The image was truncated at the byte cap and could not be re-encoded, so it may be corrupt — raise `maxBytes` if it does not render.");
    }
    if (oversizePassThrough) {
        warnings.push("The image is larger than the byte budget and could not be re-encoded to fit, so it is attached at its original size — the model may reject it.");
    }
    const text = warnings.length > 0 ? `${promptText}\n\n${warnings.join("\n\n")}` : promptText;
    return {
        // The image block carries raw base64 (NO data: prefix).
        content: [
            { type: "text", text },
            { type: "image", data: bytes.toString("base64"), mimeType },
        ],
        details: {
            ok: true,
            source: p.source,
            sourceType: p.sourceType,
            kind: "image",
            mimeType,
            bytes: bytes.length,
            returned: "image",
            ...(imageTruncated ? { truncated: true } : {}),
            ...(warnings.length > 0 ? { warning: warnings.join(" ") } : {}),
        },
    };
}
|
|
1666
|
+
/**
 * Video handler. The video is handed to `understandViaProvider` and the
 * provider's TEXT is returned; no video block is placed in the tool result.
 *
 * An explicit `provider: "anthropic"` override is rejected up front with a
 * dedicated message. When no provider is available, the provider's own
 * "unavailable" message is returned as a text-only, not-ok result.
 *
 * @param p Reads: source, sourceType, bytes, mime, question and the optional
 *   provider, model, maxTokens, signal.
 * @returns A tool result `{ content, details }`.
 */
async function handleVideo(p) {
    const declaredMime = p.mime?.split(";")[0]?.trim().toLowerCase();
    const mimeType = declaredMime || videoMimeFromExt(extensionOf(p.source));
    // Shared shape for the two "nothing was analyzed" outcomes.
    const notAnalyzed = (message) => {
        const promptText = buildPromptText(p.question, "video");
        return {
            content: [{ type: "text", text: `${promptText}\n\n${message}` }],
            details: {
                ok: false,
                source: p.source,
                sourceType: p.sourceType,
                kind: "video",
                mimeType,
                bytes: p.bytes.length,
                returned: "none",
                message,
            },
        };
    };
    // Say precisely why an Anthropic override cannot work instead of letting
    // a generic "needs a key" message stand in for it.
    if (p.provider === "anthropic") {
        return notAnalyzed("Anthropic cannot analyze video — it has no video ingestion. Video understanding needs a Google/Gemini key. " +
            "Drop the `provider` override (or set it to \"google\") and configure a Gemini key.");
    }
    const request = {
        kind: "video",
        source: p.source,
        sourceType: p.sourceType,
        bytes: p.bytes,
        mimeType,
        question: p.question,
    };
    if (p.maxTokens !== undefined) {
        request.maxTokens = p.maxTokens;
    }
    if (p.provider) {
        request.provider = p.provider;
    }
    if (p.model) {
        request.model = p.model;
    }
    if (p.signal) {
        request.signal = p.signal;
    }
    const viaProvider = await understandViaProvider(request);
    // Success and provider failure both carry a ready-made result.
    if (viaProvider.ok || !viaProvider.unavailable) {
        return viaProvider.result;
    }
    // No provider available — pass on its explanation.
    return notAnalyzed(viaProvider.message);
}
|
|
1723
|
+
/**
 * Audio handler (voice notes + clips). The audio is routed to the
 * media-understanding subsystem through `understandViaProvider` and the
 * resulting TEXT (transcription / summary) is returned; audio is never packed
 * into an image block. When no provider is available, the provider's
 * "unavailable" message is returned as a text-only, not-ok result.
 *
 * @param p Reads: source, sourceType, bytes, mime, question, language and the
 *   optional provider, model, maxTokens, signal.
 * @returns A tool result `{ content, details }`.
 */
async function handleAudio(p) {
    const declaredMime = p.mime?.split(";")[0]?.trim().toLowerCase();
    const mimeType = declaredMime || audioMimeFromExt(extensionOf(p.source));
    // The language hint travels inside the prompt text: `buildAudioPrompt`
    // combines it with the question and the result is sent as the question.
    const audioPrompt = buildAudioPrompt(p.question, p.language);
    const request = {
        kind: "audio",
        source: p.source,
        sourceType: p.sourceType,
        bytes: p.bytes,
        mimeType,
        question: audioPrompt,
    };
    if (p.maxTokens !== undefined) {
        request.maxTokens = p.maxTokens;
    }
    if (p.provider) {
        request.provider = p.provider;
    }
    if (p.model) {
        request.model = p.model;
    }
    if (p.signal) {
        request.signal = p.signal;
    }
    const viaProvider = await understandViaProvider(request);
    // Success and provider failure both carry a ready-made result.
    if (viaProvider.ok || !viaProvider.unavailable) {
        return viaProvider.result;
    }
    // No capable provider — return its explanation under the original question.
    const promptText = buildPromptText(p.question, "audio");
    const details = {
        ok: false,
        source: p.source,
        sourceType: p.sourceType,
        kind: "audio",
        mimeType,
        bytes: p.bytes.length,
        returned: "none",
        message: viaProvider.message,
    };
    return {
        content: [{ type: "text", text: `${promptText}\n\n${viaProvider.message}` }],
        details,
    };
}
|
|
1769
|
+
/**
 * PDF handler. Chooses between the provider path (the PDF bytes are sent to
 * `understandViaProvider` as `application/pdf` and its TEXT answer is
 * returned) and local text extraction through `handleTextExtract` (which is
 * given `pages`).
 *
 *   - `mode: "text"`     → always local extraction.
 *   - `mode: "provider"` → always the provider; an unavailable provider or a
 *                          failed call is returned as a failure, never
 *                          replaced by local extraction.
 *   - any other mode     → the provider when a key resolves (for the
 *                          requested provider, else for "anthropic" or
 *                          "google"); if that attempt is not ok, or no key
 *                          resolves, local extraction.
 *
 * @param p Reads: source, sourceType, bytes, truncated, mime, question,
 *   pages, mode and the optional provider, model, maxTokens, signal.
 * @returns A tool result `{ content, details }`.
 */
async function handlePdf(p) {
    // Local extraction: the fallback, and the forced path for mode:"text".
    const extractLocally = () => {
        const request = {
            kind: "pdf",
            source: p.source,
            sourceType: p.sourceType,
            bytes: p.bytes,
            truncated: p.truncated,
        };
        if (p.mime) {
            request.mime = p.mime;
        }
        request.question = p.question;
        if (p.pages) {
            request.pages = p.pages;
        }
        return handleTextExtract(request);
    };
    if (p.mode === "text") {
        return extractLocally();
    }
    const cfg = getMuConfig();
    // Is there a key for a usable provider? (Key lookup only.)
    let providerAvailable;
    if (p.provider) {
        providerAvailable = Boolean(safeResolveKey(cfg, p.provider));
    }
    else {
        providerAvailable = Boolean(safeResolveKey(cfg, "anthropic")) || Boolean(safeResolveKey(cfg, "google"));
    }
    const forced = p.mode === "provider";
    if (!forced && !providerAvailable) {
        return extractLocally();
    }
    const request = {
        kind: "pdf",
        source: p.source,
        sourceType: p.sourceType,
        bytes: p.bytes,
        mimeType: "application/pdf",
        question: p.question,
    };
    if (p.maxTokens !== undefined) {
        request.maxTokens = p.maxTokens;
    }
    if (p.provider) {
        request.provider = p.provider;
    }
    if (p.model) {
        request.model = p.model;
    }
    if (p.signal) {
        request.signal = p.signal;
    }
    request.note = "This PDF was read natively by a provider (handles scanned pages + layout)." +
        (p.pages ? " The `pages` range is not applied on the native path." : "");
    const viaProvider = await understandViaProvider(request);
    if (viaProvider.ok) {
        return viaProvider.result;
    }
    if (!forced) {
        // Non-forced mode and the provider attempt was not ok → local extraction.
        return extractLocally();
    }
    // mode:"provider": the native path was explicitly requested, so report
    // the problem instead of silently extracting.
    if (viaProvider.unavailable) {
        return failure({
            source: p.source,
            sourceType: p.sourceType,
            kind: "pdf",
            ...(p.mime ? { mimeType: p.mime } : {}),
            bytes: p.bytes.length,
            message: viaProvider.message,
        });
    }
    return viaProvider.result;
}
|
|
1832
|
+
/**
 * Handler for plain / structured text sources. Decodes the bytes as UTF-8,
 * drops a leading byte-order mark, and reports a failure when nothing but
 * whitespace remains. Otherwise the decoded text is clamped to
 * `DEFAULT_MAX_CHARS`, wrapped in the untrusted-content envelope (the file can
 * carry injected instructions), and returned as a single text block after the
 * prompt line. No provider call is made on this path.
 */
async function handleTextPlain(p) {
    // `Buffer.toString("utf8")` substitutes U+FFFD for invalid sequences
    // instead of throwing, so near-text binary degrades rather than erroring.
    const decoded = p.bytes.toString("utf8");
    const body = decoded.startsWith("\uFEFF") ? decoded.slice(1) : decoded;
    // Source fields shared by the failure record and the success details, in
    // the order they are reported.
    const sourceFields = {
        source: p.source,
        sourceType: p.sourceType,
        kind: "text",
        ...(p.mime ? { mimeType: p.mime } : {}),
        bytes: p.bytes.length,
    };
    if (body.trim().length === 0) {
        return failure({
            ...sourceFields,
            message: "The file is empty or contains no readable text.",
        });
    }
    const clamp = truncateText(body, DEFAULT_MAX_CHARS);
    const envelope = wrapWebContent(clamp.text, "web_fetch", { includeWarning: true });
    const instruction = buildPromptText(p.question, "text");
    // Truncated either upstream (byte cap) or here (char cap).
    const wasTruncated = p.truncated || clamp.truncated;
    let truncationNote = "";
    if (wasTruncated) {
        truncationNote = "\n\n(Content was truncated to fit the turn — raise `maxBytes` for more.)";
    }
    const details = { ok: true, ...sourceFields, returned: "text" };
    if (wasTruncated)
        details.truncated = true;
    return {
        content: [{ type: "text", text: `${instruction}${truncationNote}\n\n${envelope}` }],
        details,
    };
}
|
|
1879
|
+
/**
 * Extract text from a document (`p.kind` selects the extractor), clamp it to
 * `DEFAULT_MAX_CHARS`, wrap it in the untrusted-content envelope and return it
 * as a text block. For docx/pptx/xlsx with `p.includeImages` set, the blocks
 * produced by `attachEmbeddedImages` are appended after the text.
 *
 * A `BrigadeToolInputError` from an extractor, or an extraction that yields
 * only whitespace, becomes a failure result; any other extractor error is
 * rethrown.
 */
async function handleTextExtract(p) {
    // Source fields reported by every result (failure or success), in output order.
    const describeSource = () => ({
        source: p.source,
        sourceType: p.sourceType,
        kind: p.kind,
        ...(p.mime ? { mimeType: p.mime } : {}),
        bytes: p.bytes.length,
    });
    let extracted = "";
    let pdfPageCount;
    try {
        switch (p.kind) {
            case "pdf": {
                const pdf = await extractPdf(p.bytes, p.pages);
                extracted = pdf.text;
                pdfPageCount = pdf.totalPages;
                break;
            }
            case "docx":
                extracted = await extractDocx(p.bytes);
                break;
            case "pptx":
                extracted = await extractPptx(p.bytes, p.pages);
                break;
            case "xlsx":
                extracted = await extractXlsx(p.bytes);
                break;
            case "html": {
                // Only a URL source has a meaningful base; otherwise use a blank one.
                const baseUrl = p.sourceType === "url" ? p.source : "about:blank";
                extracted = await extractHtml(p.bytes, baseUrl);
                break;
            }
            case "odt":
            case "ods":
            case "odp":
                extracted = await extractOpenDocument(p.bytes, p.kind);
                break;
            case "epub":
                extracted = await extractEpub(p.bytes);
                break;
            case "rtf":
                extracted = extractRtf(p.bytes);
                break;
            case "ipynb":
                extracted = extractIpynb(p.bytes);
                break;
        }
    }
    catch (err) {
        // Only input errors are reported as a tool failure; anything else propagates.
        if (!(err instanceof BrigadeToolInputError))
            throw err;
        return failure({ ...describeSource(), message: err.message });
    }
    if (extracted.trim().length === 0) {
        const message = p.kind === "pdf"
            ? "No selectable text found — the PDF may be a scanned image. Image-only PDFs need OCR, which this tool does not perform."
            : `No extractable text found in the ${p.kind}.`;
        return failure({ ...describeSource(), message });
    }
    const clamp = truncateText(extracted, DEFAULT_MAX_CHARS);
    // The document was pointed at by the operator but can still carry injected
    // instructions, so it goes inside the untrusted-content envelope.
    // `web_fetch` is the envelope source used for that.
    const envelope = wrapWebContent(clamp.text, "web_fetch", { includeWarning: true });
    const instruction = buildPromptText(p.question, p.kind);
    const wasTruncated = p.truncated || clamp.truncated;
    const remarks = [];
    if (pdfPageCount !== undefined) {
        remarks.push(`PDF total pages: ${pdfPageCount}.`);
    }
    if (p.pages && (p.kind === "pdf" || p.kind === "pptx")) {
        const unit = p.kind === "pdf" ? "pages" : "slides";
        remarks.push(`Limited to ${unit} "${p.pages}".`);
    }
    if (wasTruncated) {
        remarks.push("Content was truncated to fit the turn — raise `maxBytes` / narrow `pages` for more.");
    }
    const remarkBlock = remarks.length > 0 ? `\n\n(${remarks.join(" ")})` : "";
    // Embedded images (OOXML only): when requested, surface the pictures
    // inside the document alongside the extracted text.
    const isOoxml = p.kind === "docx" || p.kind === "pptx" || p.kind === "xlsx";
    let embedded;
    if (isOoxml && p.includeImages) {
        embedded = await attachEmbeddedImages({
            kind: p.kind,
            bytes: p.bytes,
            pages: p.pages,
            imageBudget: p.imageBudget ?? DEFAULT_IMAGE_MAX_BYTES,
            ...(p.modelContext ? { modelContext: p.modelContext } : {}),
            question: p.question,
            source: p.source,
            sourceType: p.sourceType,
            ...(p.provider ? { provider: p.provider } : {}),
            ...(p.model ? { model: p.model } : {}),
            ...(p.maxTokens !== undefined ? { maxTokens: p.maxTokens } : {}),
            ...(p.signal ? { signal: p.signal } : {}),
        });
    }
    const content = [
        { type: "text", text: `${instruction}${remarkBlock}\n\n${envelope}` },
    ];
    let returned = "text";
    if (embedded) {
        content.push(...embedded.content);
        if (embedded.imageCount > 0)
            returned = "image";
    }
    const details = { ok: true, ...describeSource(), returned };
    if (p.pages)
        details.pages = p.pages;
    if (wasTruncated)
        details.truncated = true;
    return { content, details };
}
|
|
2004
|
+
/**
 * Surface a document's EMBEDDED images as extra content blocks.
 *
 * Unzips the OOXML buffer and collects images via `extractOoxmlImages`
 * (capped at `MAX_EMBEDDED_IMAGES`; for a PPTX with `pages`, scoped to that
 * slide range). Each downscalable image is shrunk to `e.imageBudget`, and a
 * running total of `EMBEDDED_IMAGES_TOTAL_BYTES` drops further images once the
 * first one has been kept (dropped images are reported, not thrown). Then:
 *  • vision model (or unknown) → a lead note plus a label + image block per image.
 *  • text-only model → each image is routed through `understandViaProvider`
 *    and its TEXT is emitted; with no usable provider, a single note instead.
 *  • nothing decodable → only a short note about what was skipped/omitted.
 *
 * @param e Request: `kind`, `bytes`, `pages`, `imageBudget`, `question`,
 *   `source`, `sourceType`, and optional `modelContext`, `provider`, `model`,
 *   `maxTokens`, `signal`.
 * @returns `{ content, imageCount }` where `imageCount` is the number of raw
 *   image blocks emitted (0 on the text-only and nothing-decodable paths), or
 *   `undefined` when there is nothing to report or when unzipping, page-range
 *   resolution, or image extraction fails. Embedded images are a best-effort
 *   add-on that must never break the text result the caller already has.
 */
async function attachEmbeddedImages(e) {
    let entries;
    try {
        entries = await unzipEntries(e.bytes);
    }
    catch {
        // Corrupt/locked zip — the text path already reported the real failure
        // (or succeeded); embedded images are best-effort, so just skip them.
        return undefined;
    }
    // For PPTX, scope the embedded images by the same `pages` slide range the
    // text honored. DOCX/XLSX pass no range (all images, capped).
    let inRange;
    if (e.kind === "pptx" && e.pages) {
        try {
            inRange = parsePageRange(e.pages, countPptxSlides(entries));
        }
        catch {
            // A range that cannot be resolved against this deck's slide count
            // must not reject the whole call: the extracted text already
            // stands, so skip the images like any other failure here.
            return undefined;
        }
    }
    let extraction;
    try {
        extraction = extractOoxmlImages(entries, e.kind, {
            cap: MAX_EMBEDDED_IMAGES,
            ...(inRange ? { inRange } : {}),
        });
    }
    catch {
        return undefined;
    }
    if (extraction.images.length === 0 && extraction.skipped === 0)
        return undefined;
    const content = [];
    const sees = modelLikelySeesImages(e.modelContext);
    // Downscale/re-encode to the per-image budget. On a decode error the
    // original bytes are kept: a vision block may still render and a provider
    // may still sniff it.
    const shrink = async (bytes, mime) => {
        try {
            const ds = await downscaleImage(bytes, { maxBytes: e.imageBudget, sourceMime: mime });
            return { bytes: ds.bytes, mime: ds.mimeType };
        }
        catch {
            return { bytes, mime };
        }
    };
    // Enforce the running total-byte ceiling so a deck full of big pictures
    // can't blow the turn even under the count cap. The first image is always
    // kept; later ones that would exceed the ceiling are counted as dropped.
    const prepared = [];
    let runningBytes = 0;
    let droppedForBytes = 0;
    for (const img of extraction.images) {
        const fitted = isDownscalableImageMime(img.mime)
            ? await shrink(img.bytes, img.mime)
            : { bytes: img.bytes, mime: img.mime };
        if (runningBytes + fitted.bytes.length > EMBEDDED_IMAGES_TOTAL_BYTES && prepared.length > 0) {
            droppedForBytes += 1;
            continue;
        }
        runningBytes += fitted.bytes.length;
        prepared.push({ ...img, bytes: fitted.bytes, mime: fitted.mime });
    }
    // Header note: count, "N of M", dropped-for-budget, and skipped.
    const emitted = prepared.length;
    const headerBits = [];
    if (emitted > 0) {
        headerBits.push(`${emitted} embedded image${emitted === 1 ? "" : "s"} from this ${e.kind}`);
    }
    if (extraction.matched > emitted + droppedForBytes) {
        headerBits.push(`showing ${emitted} of ${extraction.matched} embedded images — pass \`pages=\` to scope` +
            (e.kind === "pptx" ? " to specific slides" : ""));
    }
    if (droppedForBytes > 0) {
        headerBits.push(`${droppedForBytes} omitted to stay within the size budget`);
    }
    if (extraction.skipped > 0) {
        headerBits.push(`${extraction.skipped} skipped (unsupported format)`);
    }
    if (emitted === 0) {
        // Nothing decodable to show (e.g. every embed was skipped). Report it
        // as a note; do NOT throw.
        if (headerBits.length > 0) {
            content.push({ type: "text", text: `(Embedded images: ${headerBits.join("; ")}.)` });
        }
        return { content, imageCount: 0 };
    }
    if (sees === false) {
        // TEXT-ONLY current model. Don't ship raw blocks it will reject — route
        // each embed through the understanding provider and emit the TEXT. With
        // no key, a single honest note (the text extraction already stands).
        const cfg = getMuConfig();
        const providerAvailable = e.provider
            ? Boolean(safeResolveKey(cfg, e.provider))
            : Boolean(safeResolveKey(cfg, "anthropic")) ||
                Boolean(safeResolveKey(cfg, "google")) ||
                Boolean(safeResolvePiImageModel(cfg));
        if (!providerAvailable) {
            content.push({
                type: "text",
                text: `(This ${e.kind} has ${headerBits.join("; ")}, but the current model is text-only so they are not attached. ` +
                    "Switch to a vision-capable model to SEE them, or configure a Google/Anthropic key so Brigade can describe embedded images on any model.)",
            });
            return { content, imageCount: 0 };
        }
        content.push({
            type: "text",
            text: `(The current model is text-only, so the ${headerBits.join("; ")} were understood by a vision-capable provider; descriptions follow.)`,
        });
        for (const img of prepared) {
            // Re-encode a raster whose MIME is not in the provider-safe set so
            // the provider accepts it; if that fails the bytes go as-is and a
            // rejection surfaces in the per-image note below.
            const needsReencode = !PROVIDER_SAFE_IMAGE_MIME.has(img.mime) && isDownscalableImageMime(img.mime);
            const safe = needsReencode
                ? await shrink(img.bytes, img.mime)
                : { bytes: img.bytes, mime: img.mime };
            const viaProvider = await understandViaProvider({
                kind: "image",
                source: `${e.source}#${img.label.replace(/\s+/g, "-")}`,
                sourceType: e.sourceType,
                bytes: safe.bytes,
                mimeType: safe.mime,
                question: e.question,
                ...(e.maxTokens !== undefined ? { maxTokens: e.maxTokens } : {}),
                ...(e.provider ? { provider: e.provider } : {}),
                ...(e.model ? { model: e.model } : {}),
                ...(e.signal ? { signal: e.signal } : {}),
            });
            if (viaProvider.ok) {
                content.push({ type: "text", text: `--- ${img.label} ---\n${firstText(viaProvider.result)}` });
            }
            else {
                content.push({
                    type: "text",
                    text: `--- ${img.label} ---\n(Could not understand this embedded image: ${viaProvider.unavailable ? viaProvider.message : firstText(viaProvider.result)})`,
                });
            }
        }
        // Only text was emitted on this path, so no image blocks are counted.
        return { content, imageCount: 0 };
    }
    // VISION model (or unknown — assume yes). Emit each image as a labeled
    // block. `sees === undefined` adds one short uncertainty note.
    const lead = `(Embedded images from this ${e.kind}: ${headerBits.join("; ")}.` +
        (sees === undefined
            ? " Note: Brigade could not confirm this model is vision-capable; if you cannot see them, switch models.)"
            : ")");
    content.push({ type: "text", text: lead });
    for (const img of prepared) {
        content.push({ type: "text", text: `--- ${img.label} ---` });
        content.push({ type: "image", data: img.bytes.toString("base64"), mimeType: img.mime });
    }
    return { content, imageCount: emitted };
}
|
|
2183
|
+
}
|
|
2184
|
+
/* ─────────────────────────── small helpers ─────────────────────────── */
|
|
2185
|
+
/**
 * Probe the mu-config for a provider key without ever throwing: returns the
 * resolved key, or "" when it is missing/falsy or the lookup throws.
 */
function safeResolveKey(cfg, provider) {
    let key;
    try {
        key = cfg.resolveKey(provider);
    }
    catch {
        key = "";
    }
    return key ? key : "";
}
|
|
2194
|
+
/**
 * Probe whether `resolvePiModel("image", cfg)` yields a model. Used so
 * embedded images can still be understood on a text-only current model when
 * neither google nor anthropic is keyed. Returns `false` instead of throwing
 * when the lookup fails.
 */
function safeResolvePiImageModel(cfg) {
    let found = false;
    try {
        found = !!resolvePiModel("image", cfg);
    }
    catch {
        found = false;
    }
    return found;
}
|
|
2208
|
+
/**
 * Count the slides of an already-unzipped PPTX, for clamping a `pages` range
 * on the embedded-image path. The presentation order from `resolveSlideOrder`
 * is preferred; when it is empty or cannot be resolved, the number of
 * `ppt/slides/slideN.xml` entries is used instead.
 */
function countPptxSlides(entries) {
    let orderedCount = 0;
    try {
        orderedCount = resolveSlideOrder(entries).length;
    }
    catch {
        // Presentation graph unreadable: fall back to the filename count.
        orderedCount = 0;
    }
    if (orderedCount > 0)
        return orderedCount;
    const slideFile = /^ppt\/slides\/slide\d+\.xml$/;
    let fileCount = 0;
    for (const name of Object.keys(entries)) {
        if (slideFile.test(name))
            fileCount += 1;
    }
    return fileCount;
}
|
|
2225
|
+
/**
 * Normalize a requested byte cap. Anything that is not a finite number falls
 * back to the default (the image default when `looksImage` is set); a finite
 * number is floored and clamped to the range [1024, MAX_BYTES_CEILING].
 */
function clampBytes(requested, looksImage = false) {
    // Number.isFinite does not coerce, so non-numbers are rejected here too.
    if (!Number.isFinite(requested)) {
        if (looksImage)
            return DEFAULT_IMAGE_MAX_BYTES;
        return DEFAULT_MAX_BYTES;
    }
    const wholeBytes = Math.floor(requested);
    const underCeiling = Math.min(MAX_BYTES_CEILING, wholeBytes);
    return Math.max(1024, underCeiling);
}
|
|
2231
|
+
/**
 * Build the instruction line the model reads before the content: names what is
 * being analyzed (by `kind`) and either asks the caller's `question` or, when
 * there is none, asks for a description/summary.
 */
function buildPromptText(question, kind) {
    let subject;
    switch (kind) {
        case "image":
            subject = "the image below";
            break;
        case "video":
            subject = "the video referenced below";
            break;
        case "audio":
            subject = "the audio referenced below";
            break;
        case "text":
            subject = "the text content below";
            break;
        default:
            // Every other kind is a document whose text was extracted.
            subject = `the extracted ${kind} content below`;
    }
    if (!question) {
        return `Analyze ${subject} and describe / summarize what it contains.`;
    }
    return `Analyze ${subject} and answer this:\n${question}`;
}
|
|
2246
|
+
/**
 * Build the provider prompt for an AUDIO call, folding in an optional spoken-
 * language hint and the caller's question/context. The hint is expressed in
 * the instruction text itself. When the caller gives no question (missing,
 * nullish, or blank), the prompt defaults to transcribe-then-summarize.
 *
 * @param question Caller's question/context; may be omitted.
 * @param language Optional spoken-language hint; may be omitted.
 * @returns The prompt text: the question (or the default instruction),
 *   followed by the language clause when a hint was given.
 */
export function buildAudioPrompt(question, language) {
    // Both arguments are optional: a nullish value behaves like an empty string
    // rather than throwing on `.trim()`.
    const ask = (question ?? "").trim();
    const lang = (language ?? "").trim();
    const langClause = lang
        ? ` The spoken language is ${lang} — transcribe in ${lang} and preserve it.`
        : "";
    const base = ask || "Transcribe this audio, then briefly summarize what is said.";
    return `${base}${langClause}`;
}
|
|
2262
|
+
/**
 * Heuristic: do these bytes look like UTF-8 text, so that an unknown
 * extension/MIME can be read as the `text` kind rather than rejected? Only the
 * first 4096 bytes are inspected. The sample is rejected when it is empty,
 * contains a NUL byte, has more than 5% C0 control bytes (tab, LF, FF and CR
 * are allowed), or decodes to more than 1% U+FFFD replacement characters.
 * Conservative: a false negative just means the file is not read as text.
 */
export function looksLikeUtf8Text(bytes) {
    if (bytes.length === 0)
        return false;
    // `subarray` clamps the end to the buffer length.
    const head = bytes.subarray(0, 4096);
    let controlCount = 0;
    for (let i = 0; i < head.length; i += 1) {
        const byte = head[i];
        if (byte === 0)
            return false; // NUL → binary
        const isAllowedWhitespace = byte === 9 || byte === 10 || byte === 12 || byte === 13;
        if (byte < 0x20 && !isAllowedWhitespace)
            controlCount += 1;
    }
    if (controlCount / head.length > 0.05)
        return false;
    // Invalid UTF-8 sequences decode to U+FFFD; a small share is tolerated
    // (for example a multi-byte character cut at the sample boundary).
    const text = head.toString("utf8");
    const replacementCount = text.split("\uFFFD").length - 1;
    if (replacementCount === 0)
        return true;
    return replacementCount / text.length <= 0.01;
}
|
|
2291
|
+
/**
 * Build a failed tool result: `d` merged over `{ ok: false, returned: "none" }`
 * and passed through `jsonResult`.
 */
function failure(d) {
    const payload = { ok: false, returned: "none" };
    Object.assign(payload, d);
    return jsonResult(payload);
}
|
|
2294
|
+
/**
 * Short display name for a source. For an http(s) URL this is the last
 * non-empty path segment, or the hostname when the path has none. For anything
 * else (or a URL that fails to parse) it is the last `/`- or `\`-separated
 * component after trailing separators are dropped, or the source itself when
 * that component is empty.
 */
function basenameOf(source) {
    try {
        if (/^https?:\/\//i.test(source)) {
            const { pathname, hostname } = new URL(source);
            const segments = pathname.split("/").filter((segment) => segment.length > 0);
            const lastSegment = segments[segments.length - 1];
            return lastSegment ? lastSegment : hostname;
        }
    }
    catch {
        /* unparseable URL: treat it like a path below */
    }
    const withoutTrailingSeparators = source.replace(/[\\/]+$/, "");
    const components = withoutTrailingSeparators.split(/[\\/]/);
    const tail = components[components.length - 1];
    return tail ? tail : source;
}
|
|
2310
|
+
/**
 * Join the text of every `type: "text"` block in a result's `content`, one per
 * line, and trim the outcome. Non-text blocks are ignored.
 */
function firstText(r) {
    const texts = r.content.reduce((collected, block) => {
        if (block.type === "text")
            collected.push(block.text);
        return collected;
    }, []);
    return texts.join("\n").trim();
}
|
|
2318
|
+
// Image byte cap is applied where the image handler runs; export the constant
|
|
2319
|
+
// so callers/tests can reference the tighter image default.
|
|
2320
|
+
export { DEFAULT_IMAGE_MAX_BYTES, DEFAULT_MAX_BYTES, DEFAULT_MAX_CHARS };
|
|
2321
|
+
//# sourceMappingURL=analyze-media-tool.js.map
|