@robzilla1738/agentswarm 0.3.0 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45) hide show
  1. package/README.md +51 -11
  2. package/dist/agent.js +18 -2
  3. package/dist/cli.js +39 -8
  4. package/dist/config.js +62 -6
  5. package/dist/crawltools.js +247 -0
  6. package/dist/deepseek.js +125 -10
  7. package/dist/executor.js +993 -144
  8. package/dist/hub.js +85 -6
  9. package/dist/journal.js +61 -11
  10. package/dist/memory.js +84 -0
  11. package/dist/pdftext.js +211 -0
  12. package/dist/prompts.js +124 -23
  13. package/dist/report.js +289 -0
  14. package/dist/run.js +15 -2
  15. package/dist/sandbox.js +11 -0
  16. package/dist/searchcore.js +244 -0
  17. package/dist/state.js +85 -3
  18. package/dist/tools.js +392 -25
  19. package/dist/util.js +85 -0
  20. package/dist/webtools.js +327 -66
  21. package/package.json +3 -2
  22. package/ui/out/404/index.html +1 -1
  23. package/ui/out/404.html +1 -1
  24. package/ui/out/_next/static/chunks/532-35122e93f37719b9.js +1 -0
  25. package/ui/out/_next/static/chunks/677-721ce1c8b7a6a317.js +1 -0
  26. package/ui/out/_next/static/chunks/app/page-dc9f6744d203e76c.js +1 -0
  27. package/ui/out/_next/static/chunks/app/run/page-3674e103981703a2.js +1 -0
  28. package/ui/out/_next/static/chunks/app/settings/page-41a5d8ba43ecfd4a.js +1 -0
  29. package/ui/out/_next/static/css/d95c2ba395730031.css +3 -0
  30. package/ui/out/fonts/PlanetKosmos.ttf +0 -0
  31. package/ui/out/index.html +1 -1
  32. package/ui/out/index.txt +3 -3
  33. package/ui/out/run/index.html +1 -1
  34. package/ui/out/run/index.txt +3 -3
  35. package/ui/out/settings/index.html +1 -1
  36. package/ui/out/settings/index.txt +3 -3
  37. package/ui/out/_next/static/chunks/383-289a866b246b41cc.js +0 -1
  38. package/ui/out/_next/static/chunks/619-ba102abea3e3d0e4.js +0 -1
  39. package/ui/out/_next/static/chunks/677-7ab85a6f38c3a235.js +0 -1
  40. package/ui/out/_next/static/chunks/app/page-0fda5b8e77d90b84.js +0 -1
  41. package/ui/out/_next/static/chunks/app/run/page-07aab6b1224c3c8c.js +0 -1
  42. package/ui/out/_next/static/chunks/app/settings/page-528482d468d84cfa.js +0 -1
  43. package/ui/out/_next/static/css/e2c82b53bf4519e8.css +0 -3
  44. /package/ui/out/_next/static/{Rm5Fhkds2-wIOnVlME55J → 7_pihFubDGD40BCy2ynlr}/_buildManifest.js +0 -0
  45. /package/ui/out/_next/static/{Rm5Fhkds2-wIOnVlME55J → 7_pihFubDGD40BCy2ynlr}/_ssgManifest.js +0 -0
package/dist/hub.js CHANGED
@@ -34,6 +34,7 @@ var __importStar = (this && this.__importStar) || (function () {
34
34
  })();
35
35
  Object.defineProperty(exports, "__esModule", { value: true });
36
36
  exports.startHub = startHub;
37
+ exports.publicConfig = publicConfig;
37
38
  const fs = __importStar(require("fs"));
38
39
  const http = __importStar(require("http"));
39
40
  const os = __importStar(require("os"));
@@ -41,6 +42,8 @@ const path = __importStar(require("path"));
41
42
  const url_1 = require("url");
42
43
  const config_1 = require("./config");
43
44
  const control_1 = require("./control");
45
+ const crawltools_1 = require("./crawltools");
46
+ const webtools_1 = require("./webtools");
44
47
  const deepseek_1 = require("./deepseek");
45
48
  const providers_1 = require("./providers");
46
49
  const journal_1 = require("./journal");
@@ -81,9 +84,16 @@ function startHub(opts) {
81
84
  async function handle(req, res, opts) {
82
85
  const url = new url_1.URL(req.url || "/", `http://localhost:${opts.port}`);
83
86
  const p = url.pathname;
84
- res.setHeader("access-control-allow-origin", "*");
85
- res.setHeader("access-control-allow-methods", "GET, POST, DELETE, OPTIONS");
86
- res.setHeader("access-control-allow-headers", "content-type");
87
+ // Localhost-only CORS. The hub launches runs and reads reports with the
88
+ // operator's keys, so a random website's JS must never get a readable
89
+ // response. The dev UI on another localhost port is the one legitimate
90
+ // cross-origin client; everyone else gets no CORS headers at all.
91
+ const origin = String(req.headers.origin || "");
92
+ if (/^https?:\/\/(localhost|127\.0\.0\.1|\[::1\])(:\d+)?$/.test(origin)) {
93
+ res.setHeader("access-control-allow-origin", origin);
94
+ res.setHeader("access-control-allow-methods", "GET, POST, DELETE, OPTIONS");
95
+ res.setHeader("access-control-allow-headers", "content-type");
96
+ }
87
97
  if (req.method === "OPTIONS") {
88
98
  res.writeHead(204);
89
99
  res.end();
@@ -157,6 +167,51 @@ async function api(req, res, url, opts) {
157
167
  const r = await (0, sandbox_1.testSandbox)(cfg, kind);
158
168
  return sendJson(res, 200, { kind, ...r });
159
169
  }
170
+ // Settings diagnostics: prove the search engines / crawl backend actually
171
+ // work with the saved keys before a mission depends on them.
172
+ if (p === "/api/search/test" && method === "POST") {
173
+ const q = "open source vector database";
174
+ const probe = async (engine, fn) => {
175
+ try {
176
+ const hits = await fn();
177
+ return { engine, ok: hits.length > 0, detail: `${hits.length} result(s)` };
178
+ }
179
+ catch (e) {
180
+ return { engine, ok: false, detail: (0, util_1.errMsg)(e) };
181
+ }
182
+ };
183
+ const checks = [probe("duckduckgo", () => (0, webtools_1.ddgSearch)(q, 3)), probe("bing", () => (0, webtools_1.bingSearch)(q, 3))];
184
+ if (cfg.tinyfishApiKey)
185
+ checks.push(probe("tinyfish", () => (0, webtools_1.tinyfishSearch)(cfg, q, 3)));
186
+ const engines = await Promise.all(checks);
187
+ return sendJson(res, 200, { ok: engines.some((e) => e.ok), engines });
188
+ }
189
+ if (p === "/api/crawl/test" && method === "POST") {
190
+ const backend = (0, crawltools_1.resolveCrawlBackend)(cfg);
191
+ if (!backend) {
192
+ return sendJson(res, 200, { ok: false, backend: null, detail: "no crawl backend configured — add a key first" });
193
+ }
194
+ try {
195
+ if ((0, crawltools_1.hasScrapeBackend)(cfg)) {
196
+ const text = await (0, crawltools_1.scrapeUrl)(cfg, "https://example.com/");
197
+ return sendJson(res, 200, {
198
+ ok: Boolean(text && text.length > 50),
199
+ backend,
200
+ detail: text ? `scraped ${text.length} chars` : "empty scrape result",
201
+ });
202
+ }
203
+ // deepcrawl has no single-page scrape — smoke a 1-page crawl instead.
204
+ const out = await (0, crawltools_1.crawlSite)(cfg, { url: "https://example.com/", maxPages: 1 });
205
+ return sendJson(res, 200, {
206
+ ok: out.pages.length > 0,
207
+ backend,
208
+ detail: out.pages.length ? `crawled ${out.pages.length} page(s)` : out.warnings.join("; ") || "no pages",
209
+ });
210
+ }
211
+ catch (e) {
212
+ return sendJson(res, 200, { ok: false, backend, detail: (0, util_1.errMsg)(e) });
213
+ }
214
+ }
160
215
  if (p === "/api/models" && method === "GET") {
161
216
  try {
162
217
  const models = await (0, deepseek_1.listModels)(cfg);
@@ -255,7 +310,7 @@ async function api(req, res, url, opts) {
255
310
  return sendJson(res, 200, { events, live: (0, run_1.isRunLive)(id) });
256
311
  }
257
312
  if (sub === "/stream" && method === "GET") {
258
- return streamEvents(res, id);
313
+ return streamEvents(res, id, url.searchParams.get("quiet") === "1");
259
314
  }
260
315
  if (sub === "/note" && method === "POST") {
261
316
  const body = await readBody(req);
@@ -295,6 +350,14 @@ async function api(req, res, url, opts) {
295
350
  res.end(fs.readFileSync(file));
296
351
  return;
297
352
  }
353
+ if (sub === "/plan" && method === "GET") {
354
+ const file = path.join((0, config_1.runDir)(id), "artifacts", "mission-plan.md");
355
+ if (!fs.existsSync(file))
356
+ return sendJson(res, 404, { error: "no plan yet" });
357
+ res.writeHead(200, { "content-type": "text/markdown; charset=utf-8" });
358
+ res.end(fs.readFileSync(file));
359
+ return;
360
+ }
298
361
  if (sub === "/artifacts" && method === "GET") {
299
362
  return sendJson(res, 200, { artifacts: listArtifactFiles(id) });
300
363
  }
@@ -316,7 +379,7 @@ async function api(req, res, url, opts) {
316
379
  }
317
380
  sendJson(res, 404, { error: "not found" });
318
381
  }
319
- function streamEvents(res, id) {
382
+ function streamEvents(res, id, quiet = false) {
320
383
  res.writeHead(200, {
321
384
  "content-type": "text/event-stream",
322
385
  "cache-control": "no-cache, no-transform",
@@ -335,6 +398,9 @@ function streamEvents(res, id) {
335
398
  return;
336
399
  }
337
400
  for (const ev of evs) {
401
+ // quiet mode: skip streaming chatter for clients rendering many agents.
402
+ if (quiet && ev.type === "agent.delta")
403
+ continue;
338
404
  res.write(`data: ${JSON.stringify(ev)}\n\n`);
339
405
  }
340
406
  };
@@ -392,7 +458,15 @@ function publicConfig(cfg) {
392
458
  tinyfishKeySet: Boolean(cfg.tinyfishApiKey),
393
459
  tinyfishKeyMasked: (0, config_1.maskKey)(cfg.tinyfishApiKey),
394
460
  searchBackend: cfg.searchBackend,
395
- searchkitCmd: cfg.searchkitCmd,
461
+ crawlBackend: cfg.crawlBackend,
462
+ crawlResolved: (0, crawltools_1.resolveCrawlBackend)(cfg),
463
+ firecrawlKeySet: Boolean(cfg.firecrawlApiKey),
464
+ firecrawlKeyMasked: (0, config_1.maskKey)(cfg.firecrawlApiKey),
465
+ contextdevKeySet: Boolean(cfg.contextdevApiKey),
466
+ contextdevKeyMasked: (0, config_1.maskKey)(cfg.contextdevApiKey),
467
+ deepcrawlKeySet: Boolean(cfg.deepcrawlApiKey),
468
+ deepcrawlKeyMasked: (0, config_1.maskKey)(cfg.deepcrawlApiKey),
469
+ deepcrawlBaseUrl: cfg.deepcrawlBaseUrl,
396
470
  sandboxRuntime: cfg.sandboxRuntime,
397
471
  sandboxResolved: (0, sandbox_1.resolveSandboxKind)(cfg),
398
472
  sandboxImage: cfg.sandboxImage,
@@ -416,6 +490,9 @@ function publicConfig(cfg) {
416
490
  reasoningEffort: cfg.reasoningEffort,
417
491
  safeMode: cfg.safeMode,
418
492
  contextTokenLimit: cfg.contextTokenLimit,
493
+ contextWindows: cfg.contextWindows,
494
+ cheapModel: cfg.cheapModel,
495
+ strongModel: cfg.strongModel,
419
496
  knownModels,
420
497
  pricing: cfg.pricing,
421
498
  };
@@ -469,6 +546,8 @@ function snapshot(state, id) {
469
546
  operatorNotes: state.operatorNotes,
470
547
  usageByModel: Object.fromEntries(state.usageByModel),
471
548
  cost: state.cost,
549
+ budgetSeries: state.budgetSeries,
550
+ planExcerpt: state.planExcerpt,
472
551
  finalSummary: state.finalSummary,
473
552
  finalReportPath: state.finalReportPath,
474
553
  live: (0, run_1.isRunLive)(id),
package/dist/journal.js CHANGED
@@ -33,7 +33,7 @@ var __importStar = (this && this.__importStar) || (function () {
33
33
  };
34
34
  })();
35
35
  Object.defineProperty(exports, "__esModule", { value: true });
36
- exports.Journal = void 0;
36
+ exports.Journal = exports.TeamJournal = void 0;
37
37
  exports.eventsFile = eventsFile;
38
38
  exports.readEvents = readEvents;
39
39
  exports.lastSeq = lastSeq;
@@ -41,14 +41,36 @@ exports.readNewEvents = readNewEvents;
41
41
  const fs = __importStar(require("fs"));
42
42
  const path = __importStar(require("path"));
43
43
  /**
44
- * Append-only event journal. events.jsonl is the single source of truth for a
45
- * run: the executor writes it, the terminal renderer and the hub (web UI) read
46
- * and tail it. Tolerant of a torn final line after a crash.
44
+ * A child swarm's view of its parent's journal: same file, same sequence,
45
+ * every event stamped with the owning team's task id so the reducer can
46
+ * partition team activity away from the root run.
47
47
  */
48
class TeamJournal {
    /** The journal this view forwards to (everything is written through it). */
    inner;
    /** Task id stamped onto every event appended through this view. */
    teamId;
    /**
     * @param inner  journal-like object providing append/flush/degraded
     * @param teamId id added to each appended event as `teamId`
     */
    constructor(inner, teamId) {
        this.inner = inner;
        this.teamId = teamId;
    }
    /**
     * Append an event through the wrapped journal with `teamId` added.
     * The payload is spread after the stamp, so a payload that already
     * carries its own `teamId` keeps it.
     * @returns whatever the wrapped journal's append returns
     */
    append(type, payload = {}) {
        return this.inner.append(type, { teamId: this.teamId, ...payload });
    }
    /** Delegate the asynchronous flush to the wrapped journal. */
    flush() {
        return this.inner.flush();
    }
    /**
     * Delegate the synchronous last-gasp flush to the wrapped journal, so exit
     * paths that only hold the team view can still flush. A wrapped object
     * without flushSync is tolerated (no-op).
     */
    flushSync() {
        this.inner.flushSync?.();
    }
    /** Mirrors the wrapped journal's `degraded` flag. */
    get degraded() {
        return this.inner.degraded;
    }
}
65
+ exports.TeamJournal = TeamJournal;
48
66
  class Journal {
49
67
  file;
50
68
  seq;
51
69
  chain = Promise.resolve();
70
+ buf = "";
71
+ failures = 0;
72
+ /** Set after repeated append failures: the source of truth is no longer being persisted. */
73
+ degraded = false;
52
74
  onEvent;
53
75
  constructor(runDirPath, startSeq) {
54
76
  this.file = path.join(runDirPath, "events.jsonl");
@@ -56,12 +78,8 @@ class Journal {
56
78
  }
57
79
  append(type, payload = {}) {
58
80
  const ev = { seq: this.seq++, t: Date.now(), type, ...payload };
59
- const line = JSON.stringify(ev) + "\n";
60
- this.chain = this.chain
61
- .then(() => fs.promises.appendFile(this.file, line, "utf8"))
62
- .catch(() => {
63
- /* never break the run on journal IO; next append retries the chain */
64
- });
81
+ this.buf += JSON.stringify(ev) + "\n";
82
+ this.chain = this.chain.then(() => this.drain());
65
83
  try {
66
84
  this.onEvent?.(ev);
67
85
  }
@@ -70,8 +88,40 @@ class Journal {
70
88
  }
71
89
  return ev;
72
90
  }
91
+ async drain() {
92
+ if (!this.buf)
93
+ return;
94
+ const chunk = this.buf;
95
+ this.buf = "";
96
+ try {
97
+ await fs.promises.appendFile(this.file, chunk, "utf8");
98
+ this.failures = 0;
99
+ }
100
+ catch (e) {
101
+ // Keep the unwritten events buffered so the next append/flush retries
102
+ // them in order; after repeated failures, stop pretending it's fine.
103
+ this.buf = chunk + this.buf;
104
+ this.failures++;
105
+ if (this.failures >= 5 && !this.degraded) {
106
+ this.degraded = true;
107
+ process.stderr.write(`agentswarm: journal writes are failing (${String(e)}); run state is no longer durable\n`);
108
+ }
109
+ }
110
+ }
73
111
  flush() {
74
- return this.chain;
112
+ return this.chain.then(() => this.drain());
113
+ }
114
+ /** Last-gasp synchronous flush for signal handlers and exit paths. */
115
+ flushSync() {
116
+ if (!this.buf)
117
+ return;
118
+ try {
119
+ fs.appendFileSync(this.file, this.buf, "utf8");
120
+ this.buf = "";
121
+ }
122
+ catch {
123
+ /* nothing left to do */
124
+ }
75
125
  }
76
126
  }
77
127
  exports.Journal = Journal;
package/dist/memory.js ADDED
@@ -0,0 +1,84 @@
1
+ "use strict";
2
+ var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
3
+ if (k2 === undefined) k2 = k;
4
+ var desc = Object.getOwnPropertyDescriptor(m, k);
5
+ if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
6
+ desc = { enumerable: true, get: function() { return m[k]; } };
7
+ }
8
+ Object.defineProperty(o, k2, desc);
9
+ }) : (function(o, m, k, k2) {
10
+ if (k2 === undefined) k2 = k;
11
+ o[k2] = m[k];
12
+ }));
13
+ var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
14
+ Object.defineProperty(o, "default", { enumerable: true, value: v });
15
+ }) : function(o, v) {
16
+ o["default"] = v;
17
+ });
18
+ var __importStar = (this && this.__importStar) || (function () {
19
+ var ownKeys = function(o) {
20
+ ownKeys = Object.getOwnPropertyNames || function (o) {
21
+ var ar = [];
22
+ for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
23
+ return ar;
24
+ };
25
+ return ownKeys(o);
26
+ };
27
+ return function (mod) {
28
+ if (mod && mod.__esModule) return mod;
29
+ var result = {};
30
+ if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
31
+ __setModuleDefault(result, mod);
32
+ return result;
33
+ };
34
+ })();
35
+ Object.defineProperty(exports, "__esModule", { value: true });
36
+ exports.memoryFile = memoryFile;
37
+ exports.loadMemory = loadMemory;
38
+ exports.appendMemory = appendMemory;
39
+ exports.memoryBlock = memoryBlock;
40
+ const crypto = __importStar(require("crypto"));
41
+ const fs = __importStar(require("fs"));
42
+ const path = __importStar(require("path"));
43
+ const config_1 = require("./config");
44
+ const util_1 = require("./util");
45
+ const MAX_ENTRIES = 20;
46
/**
 * Path of the memory file for a workspace: `<home>/memory/<id>.json`, where
 * `<id>` is the first 12 hex characters of the SHA-1 of the resolved
 * workspace path and `<home>` is whatever config's home() returns.
 */
function memoryFile(cwd) {
    const workspace = path.resolve(cwd);
    const digest = crypto.createHash("sha1").update(workspace).digest("hex");
    const fileName = `${digest.slice(0, 12)}.json`;
    return path.join((0, config_1.home)(), "memory", fileName);
}
50
/**
 * Read the stored entries for a workspace. Any read or parse failure, or a
 * file whose `entries` field is not an array, yields an empty list.
 */
function loadMemory(cwd) {
    let parsed;
    try {
        parsed = JSON.parse(fs.readFileSync(memoryFile(cwd), "utf8"));
    }
    catch {
        return [];
    }
    const stored = parsed?.entries;
    if (!Array.isArray(stored)) {
        return [];
    }
    return stored;
}
59
/**
 * Record an entry for a workspace, keeping only the newest MAX_ENTRIES.
 * An entry with a runId replaces any stored entry with the same runId
 * (interim → final). Failures are swallowed: memory is best-effort.
 */
function appendMemory(cwd, entry) {
    try {
        const kept = [];
        for (const stored of loadMemory(cwd)) {
            const sameRun = Boolean(entry.runId) && stored.runId === entry.runId;
            if (!sameRun) {
                kept.push(stored);
            }
        }
        kept.push(entry);
        const entries = kept.slice(-MAX_ENTRIES);
        // The original note says writeJson writes via temp file + rename, so a
        // crash mid-write keeps the prior history (not verifiable from here).
        (0, util_1.writeJson)(memoryFile(cwd), { cwd: path.resolve(cwd), entries });
    }
    catch {
        /* memory is best-effort */
    }
}
71
/**
 * Prompt block for the conductor, or "" when there's no history.
 *
 * Summarises the last 8 stored entries, one line each, and clips the result
 * to 4000 characters. Entries come straight from a JSON file on disk, so
 * each field is read defensively: a malformed entry (bad date, missing
 * decisions list, missing text) degrades to placeholders instead of throwing
 * and breaking prompt construction.
 */
function memoryBlock(cwd) {
    const entries = loadMemory(cwd).filter((e) => e !== null && typeof e === "object");
    if (!entries.length)
        return "";
    const lines = entries.slice(-8).map((e) => {
        // toISOString() throws a RangeError on an invalid date, so check first.
        const finished = new Date(e.finishedAt);
        const when = Number.isNaN(finished.getTime()) ? "unknown date" : finished.toISOString().slice(0, 10);
        const keyDecisions = Array.isArray(e.keyDecisions) ? e.keyDecisions : [];
        const decisions = keyDecisions.length
            ? ` Decisions: ${keyDecisions.map((d) => (0, util_1.oneLine)(String(d), 100)).join("; ")}`
            : "";
        const mission = (0, util_1.oneLine)(String(e.mission ?? ""), 100);
        const summary = (0, util_1.oneLine)(String(e.summary ?? ""), 200);
        return `- [${when}, ${e.status ?? "unknown"}] "${mission}" — ${summary}${decisions}`;
    });
    return (0, util_1.clip)(`PRIOR RUNS IN THIS WORKSPACE (build on them; don't redo settled decisions without reason):\n${lines.join("\n")}`, 4000);
}
package/dist/pdftext.js ADDED
@@ -0,0 +1,211 @@
1
+ "use strict";
2
+ var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
3
+ if (k2 === undefined) k2 = k;
4
+ var desc = Object.getOwnPropertyDescriptor(m, k);
5
+ if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
6
+ desc = { enumerable: true, get: function() { return m[k]; } };
7
+ }
8
+ Object.defineProperty(o, k2, desc);
9
+ }) : (function(o, m, k, k2) {
10
+ if (k2 === undefined) k2 = k;
11
+ o[k2] = m[k];
12
+ }));
13
+ var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
14
+ Object.defineProperty(o, "default", { enumerable: true, value: v });
15
+ }) : function(o, v) {
16
+ o["default"] = v;
17
+ });
18
+ var __importStar = (this && this.__importStar) || (function () {
19
+ var ownKeys = function(o) {
20
+ ownKeys = Object.getOwnPropertyNames || function (o) {
21
+ var ar = [];
22
+ for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
23
+ return ar;
24
+ };
25
+ return ownKeys(o);
26
+ };
27
+ return function (mod) {
28
+ if (mod && mod.__esModule) return mod;
29
+ var result = {};
30
+ if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
31
+ __setModuleDefault(result, mod);
32
+ return result;
33
+ };
34
+ })();
35
+ Object.defineProperty(exports, "__esModule", { value: true });
36
+ exports.extractPdfText = extractPdfText;
37
+ const zlib = __importStar(require("zlib"));
38
/**
 * Minimal zero-dependency PDF text extraction: inflate FlateDecode content
 * streams (Node's built-in zlib) and interpret the text-showing operators
 * (Tj / TJ / ' / "). Good enough for most digitally-produced text PDFs;
 * returns null for scanned, encrypted, or exotic-encoding documents so the
 * caller can tell the agent to find an HTML source instead.
 *
 * @param buf Buffer holding the whole PDF file
 * @returns `{ text, pages }`, or null when the buffer is not a PDF or fewer
 *          than 40 printable non-space characters could be recovered
 */
function extractPdfText(buf) {
    if (buf.subarray(0, 5).toString("latin1") !== "%PDF-")
        return null;
    // latin1 preserves bytes 1:1, so stream offsets in the string match the buffer.
    const raw = buf.toString("latin1");
    // Count "/Type /Page" objects; "\b" rejects "/Pages" (the page-tree nodes).
    const pages = (raw.match(/\/Type\s*\/Page\b/g) || []).length || 1;
    let text = "";
    const streamRe = /<<([\s\S]{0,2000}?)>>\s*stream\r?\n/g;
    let m;
    while ((m = streamRe.exec(raw))) {
        const dict = m[1];
        const start = m.index + m[0].length;
        const end = raw.indexOf("endstream", start);
        if (end < 0)
            continue;
        streamRe.lastIndex = end;
        const isFlate = /FlateDecode/.test(dict);
        // Only plain or Flate-compressed streams are supported.
        if (/\/Filter/.test(dict) && !isFlate)
            continue;
        // Drop the end-of-line bytes that precede "endstream".
        let len = end;
        while (len > start && (raw[len - 1] === "\n" || raw[len - 1] === "\r"))
            len--;
        let data;
        if (isFlate) {
            data = inflateStreamBody(buf, start, len, end);
            if (!data)
                continue;
        }
        else {
            data = buf.subarray(start, len);
        }
        const content = data.toString("latin1");
        if (!/\bBT\b/.test(content))
            continue; // not a text content stream
        const extracted = extractFromContent(content);
        if (extracted.trim())
            text += extracted + "\n";
    }
    const cleaned = text
        .replace(/[^\S\n]+/g, " ")
        .replace(/ ?\n ?/g, "\n")
        .replace(/\n{3,}/g, "\n\n")
        .trim();
    // CID/Type0 fonts yield glyph-index garbage; require a body of real characters.
    // Keep printable ASCII, newlines, and everything from U+00A0 upwards.
    const printable = cleaned.replace(/[^\x20-\x7E\n\u00A0-\uFFFF]/g, "");
    if (printable.replace(/\s/g, "").length < 40)
        return null;
    return { text: printable, pages };
}
/**
 * Inflate a Flate stream body whose exact end is ambiguous. Compressed data
 * may itself end in 0x0A/0x0D bytes, which the EOL trim before "endstream"
 * cannot tell apart from the line ending. Try the fully trimmed slice first
 * (the common case), then restore trimmed bytes one at a time (a handful at
 * most) until zlib accepts the data. Returns null when nothing inflates.
 */
function inflateStreamBody(buf, start, trimmedEnd, rawEnd) {
    const lastStop = Math.min(rawEnd, trimmedEnd + 8);
    for (let stop = trimmedEnd; stop <= lastStop; stop++) {
        try {
            return zlib.inflateSync(buf.subarray(start, stop));
        }
        catch {
            // truncated or not a zlib stream at this length — try one byte more
        }
    }
    return null;
}
94
/**
 * Scan a decoded content stream and return the text it shows.
 *
 * String operands (literal "(...)" and hex "<...>") are collected until an
 * operator token is reached: Tj/TJ append them, ' and " append them on a new
 * line, and the positioning operators (Td, TD, T*, Tm) plus ET append them
 * and end the current line. Every operator clears the collected operands.
 * A number of -180 or less between strings is treated as a word gap.
 */
function extractFromContent(src) {
    const OPERATOR_CHAR = /[A-Za-z'"*]/;
    const LINE_ENDING_OPS = new Set(["Td", "TD", "T*", "Tm", "ET"]);
    const total = src.length;
    let text = "";
    let operands = [];
    let pos = 0;
    while (pos < total) {
        const c = src[pos];
        if (c === "(") {
            const parsed = parseLiteralString(src, pos);
            operands.push(parsed[0]);
            pos = parsed[1];
            continue;
        }
        if (c === "<" && src[pos + 1] !== "<") {
            const closing = src.indexOf(">", pos + 1);
            if (closing < 0) {
                break;
            }
            operands.push(decodeHexString(src.slice(pos + 1, closing)));
            pos = closing + 1;
            continue;
        }
        if (c === "%") {
            // comment: skip up to (not past) the end of the line
            while (pos < total && src[pos] !== "\n" && src[pos] !== "\r") {
                pos++;
            }
            continue;
        }
        if (OPERATOR_CHAR.test(c)) {
            let stop = pos;
            while (stop < total && OPERATOR_CHAR.test(src[stop])) {
                stop++;
            }
            const operator = src.slice(pos, stop);
            const shown = operands.join("");
            if (operator === "Tj" || operator === "TJ") {
                text += shown;
            }
            else if (operator === "'" || operator === '"') {
                text += "\n" + shown;
            }
            else if (LINE_ENDING_OPS.has(operator)) {
                text += shown;
                if (!text.endsWith("\n")) {
                    text += "\n";
                }
            }
            operands = [];
            pos = stop;
            continue;
        }
        const startsNumber = c === "-" || c === "." || (c >= "0" && c <= "9");
        if (startsNumber) {
            let stop = pos + 1;
            while (stop < total && /[0-9.]/.test(src[stop])) {
                stop++;
            }
            // Large negative kerning inside a TJ array is a word gap.
            const value = parseFloat(src.slice(pos, stop));
            const hasOperands = operands.length > 0;
            if (value <= -180 && hasOperands && !operands[operands.length - 1].endsWith(" ")) {
                operands.push(" ");
            }
            pos = stop;
            continue;
        }
        pos++;
    }
    return text;
}
155
/**
 * PDF literal string: balanced parens, backslash escapes, octal codes.
 *
 * @param src   content-stream text
 * @param start index of the opening "(" in `src`
 * @returns `[decoded, next]` where `next` is the index just past the closing
 *          ")" (or `src.length` when the string is never closed)
 */
function parseLiteralString(src, start) {
    const ESCAPES = { n: "\n", r: "\r", t: "\t", b: "\b", f: "\f", "(": "(", ")": ")", "\\": "\\" };
    let out = "";
    let depth = 0;
    let i = start;
    for (; i < src.length; i++) {
        const ch = src[i];
        if (ch === "\\") {
            const next = src[i + 1];
            if (next >= "0" && next <= "7") {
                // Up to three octal digits; high-order overflow is ignored.
                let oct = "";
                for (let k = 1; k <= 3 && src[i + k] >= "0" && src[i + k] <= "7"; k++)
                    oct += src[i + k];
                out += String.fromCharCode(parseInt(oct, 8) & 0xff);
                i += oct.length;
            }
            else if (next === "\n" || next === "\r") {
                // Backslash + end-of-line is a line continuation: emit nothing.
                i++;
                if (next === "\r" && src[i + 1] === "\n")
                    i++;
            }
            else {
                // Known escape, or the escaped character itself for unknown ones.
                out += ESCAPES[next] ?? next ?? "";
                i++;
            }
        }
        else if (ch === "(") {
            depth++;
            // The outermost "(" is the delimiter; nested ones are content.
            if (depth > 1)
                out += ch;
        }
        else if (ch === ")") {
            depth--;
            if (depth === 0) {
                i++;
                break;
            }
            out += ch;
        }
        else {
            out += ch;
        }
    }
    return [out, i];
}
196
/**
 * Decode the inside of a PDF hex string ("<...>" without the brackets).
 * Non-hex characters are ignored and an odd final digit is read as if
 * followed by 0. Bytes starting with the UTF-16BE BOM (FE FF) are decoded as
 * two-byte code units; otherwise each byte becomes one character.
 */
function decodeHexString(hex) {
    const digits = hex.replace(/[^0-9a-fA-F]/g, "");
    const padded = digits.length % 2 === 0 ? digits : `${digits}0`;
    const bytes = (padded.match(/../g) ?? []).map((pair) => parseInt(pair, 16));
    const hasUtf16Bom = bytes.length >= 2 && bytes[0] === 0xfe && bytes[1] === 0xff;
    if (!hasUtf16Bom) {
        return Array.from(bytes, (b) => String.fromCharCode(b)).join("");
    }
    const units = [];
    for (let k = 2; k + 1 < bytes.length; k += 2) {
        units.push(String.fromCharCode((bytes[k] << 8) | bytes[k + 1]));
    }
    return units.join("");
}