@chllming/wave-orchestration 0.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68) hide show
  1. package/CHANGELOG.md +41 -0
  2. package/README.md +549 -0
  3. package/docs/agents/wave-deploy-verifier-role.md +34 -0
  4. package/docs/agents/wave-documentation-role.md +30 -0
  5. package/docs/agents/wave-evaluator-role.md +43 -0
  6. package/docs/agents/wave-infra-role.md +34 -0
  7. package/docs/agents/wave-integration-role.md +32 -0
  8. package/docs/agents/wave-launcher-role.md +37 -0
  9. package/docs/context7/bundles.json +91 -0
  10. package/docs/plans/component-cutover-matrix.json +112 -0
  11. package/docs/plans/component-cutover-matrix.md +49 -0
  12. package/docs/plans/context7-wave-orchestrator.md +130 -0
  13. package/docs/plans/current-state.md +44 -0
  14. package/docs/plans/master-plan.md +16 -0
  15. package/docs/plans/migration.md +23 -0
  16. package/docs/plans/wave-orchestrator.md +254 -0
  17. package/docs/plans/waves/wave-0.md +165 -0
  18. package/docs/reference/github-packages-setup.md +52 -0
  19. package/docs/reference/migration-0.2-to-0.5.md +622 -0
  20. package/docs/reference/npmjs-trusted-publishing.md +55 -0
  21. package/docs/reference/repository-guidance.md +18 -0
  22. package/docs/reference/runtime-config/README.md +85 -0
  23. package/docs/reference/runtime-config/claude.md +105 -0
  24. package/docs/reference/runtime-config/codex.md +81 -0
  25. package/docs/reference/runtime-config/opencode.md +93 -0
  26. package/docs/research/agent-context-sources.md +57 -0
  27. package/docs/roadmap.md +626 -0
  28. package/package.json +53 -0
  29. package/releases/manifest.json +101 -0
  30. package/scripts/context7-api-check.sh +21 -0
  31. package/scripts/context7-export-env.sh +52 -0
  32. package/scripts/research/agent-context-archive.mjs +472 -0
  33. package/scripts/research/generate-agent-context-indexes.mjs +85 -0
  34. package/scripts/research/import-agent-context-archive.mjs +793 -0
  35. package/scripts/research/manifests/harness-and-blackboard-2026-03-21.mjs +201 -0
  36. package/scripts/wave-autonomous.mjs +13 -0
  37. package/scripts/wave-cli-bootstrap.mjs +27 -0
  38. package/scripts/wave-dashboard.mjs +11 -0
  39. package/scripts/wave-human-feedback.mjs +11 -0
  40. package/scripts/wave-launcher.mjs +11 -0
  41. package/scripts/wave-local-executor.mjs +13 -0
  42. package/scripts/wave-orchestrator/agent-state.mjs +416 -0
  43. package/scripts/wave-orchestrator/autonomous.mjs +367 -0
  44. package/scripts/wave-orchestrator/clarification-triage.mjs +605 -0
  45. package/scripts/wave-orchestrator/config.mjs +848 -0
  46. package/scripts/wave-orchestrator/context7.mjs +464 -0
  47. package/scripts/wave-orchestrator/coord-cli.mjs +286 -0
  48. package/scripts/wave-orchestrator/coordination-store.mjs +987 -0
  49. package/scripts/wave-orchestrator/coordination.mjs +768 -0
  50. package/scripts/wave-orchestrator/dashboard-renderer.mjs +254 -0
  51. package/scripts/wave-orchestrator/dashboard-state.mjs +473 -0
  52. package/scripts/wave-orchestrator/dep-cli.mjs +219 -0
  53. package/scripts/wave-orchestrator/docs-queue.mjs +75 -0
  54. package/scripts/wave-orchestrator/executors.mjs +385 -0
  55. package/scripts/wave-orchestrator/feedback.mjs +372 -0
  56. package/scripts/wave-orchestrator/install.mjs +540 -0
  57. package/scripts/wave-orchestrator/launcher.mjs +3879 -0
  58. package/scripts/wave-orchestrator/ledger.mjs +332 -0
  59. package/scripts/wave-orchestrator/local-executor.mjs +263 -0
  60. package/scripts/wave-orchestrator/replay.mjs +246 -0
  61. package/scripts/wave-orchestrator/roots.mjs +10 -0
  62. package/scripts/wave-orchestrator/routing-state.mjs +542 -0
  63. package/scripts/wave-orchestrator/shared.mjs +405 -0
  64. package/scripts/wave-orchestrator/terminals.mjs +209 -0
  65. package/scripts/wave-orchestrator/traces.mjs +1094 -0
  66. package/scripts/wave-orchestrator/wave-files.mjs +1923 -0
  67. package/scripts/wave.mjs +103 -0
  68. package/wave.config.json +115 -0
@@ -0,0 +1,793 @@
1
+ #!/usr/bin/env node
2
+ import fs from "node:fs/promises";
3
+ import { createRequire } from "node:module";
4
+ import os from "node:os";
5
+ import path from "node:path";
6
+ import process from "node:process";
7
+ import { pathToFileURL } from "node:url";
8
+ import { Readability } from "@mozilla/readability";
9
+ import { JSDOM } from "jsdom";
10
+ import { getDocument } from "pdfjs-dist/legacy/build/pdf.mjs";
11
+
12
// Paths and run metadata resolved once at startup.
// NOTE(review): assumes the script is invoked from the repository root — TODO confirm.
const REPO_ROOT = process.cwd();
const DEFAULT_OUTPUT_ROOT = path.join(REPO_ROOT, "docs/research/agent-context-cache");
// ISO date (YYYY-MM-DD) stamped into the generated Markdown notes.
const TODAY = new Date().toISOString().slice(0, 10);
const require = createRequire(import.meta.url);
// pdf.js wants a file:// URL (with trailing slash) pointing at its bundled standard fonts.
const PDFJS_PACKAGE_DIR = path.dirname(require.resolve("pdfjs-dist/package.json"));
const STANDARD_FONT_DATA_URL = `${pathToFileURL(path.join(PDFJS_PACKAGE_DIR, "standard_fonts")).href}/`;
18
+
19
/** Print CLI usage to stderr. */
function usage() {
  const message =
    "Usage: node scripts/research/import-agent-context-archive.mjs <manifest-module> [--output-root <dir>] [--only <slug1,slug2>]";
  console.error(message);
}
24
+
25
/**
 * Parse CLI arguments.
 *
 * @param {string[]} argv - Arguments after the script name (process.argv.slice(2)).
 * @returns {{manifestPath: string, outputRoot: string, only: Set<string>|null}}
 *   Absolute manifest path, output root, and optional slug filter.
 * @throws {Error} On an unknown flag or a flag with a missing value.
 *   Exits the process (status 1) when the manifest path is absent.
 */
function parseArgs(argv) {
  const args = [...argv];
  const manifestPath = args.shift();
  if (!manifestPath) {
    usage();
    process.exit(1);
  }

  // Consume the value for a flag. Previously a missing value silently became
  // "" (so --output-root resolved to the repo root and --only matched nothing);
  // fail loudly instead.
  const takeValue = (flag) => {
    const value = args.shift();
    if (value === undefined) {
      throw new Error(`Missing value for ${flag}`);
    }
    return value;
  };

  let outputRoot = DEFAULT_OUTPUT_ROOT;
  let only = null;

  while (args.length > 0) {
    const arg = args.shift();
    if (arg === "--") {
      continue;
    }
    if (arg === "--output-root") {
      outputRoot = path.resolve(REPO_ROOT, takeValue(arg));
      continue;
    }
    if (arg === "--only") {
      only = new Set(
        takeValue(arg)
          .split(",")
          .map((part) => part.trim())
          .filter(Boolean),
      );
      continue;
    }
    throw new Error(`Unknown argument: ${arg}`);
  }

  return {
    manifestPath: path.resolve(REPO_ROOT, manifestPath),
    outputRoot,
    only,
  };
}
64
+
65
/** Collapse all whitespace runs (incl. non-breaking spaces) to single spaces and trim. */
function normalizeWhitespace(value) {
  const text = String(value ?? "");
  return text.replace(/\u00a0/g, " ").replace(/\s+/g, " ").trim();
}
71
+
72
/**
 * Clean one author name: strip footnote markers and flip "Last, First" to "First Last".
 */
function normalizeAuthorName(value) {
  const cleaned = normalizeWhitespace(value)
    .replace(/[∗*†‡§¶‖]+$/g, "")
    .trim();
  const segments = cleaned
    .split(",")
    .map((segment) => segment.trim())
    .filter(Boolean);
  // Exactly two comma-separated parts is treated as "Last, First".
  return segments.length === 2 ? `${segments[1]} ${segments[0]}` : cleaned;
}
85
+
86
/** Pull the first 19xx/20xx year out of a string, or null if none. */
function extractYear(value) {
  const hit = /\b(19|20)\d{2}\b/.exec(String(value ?? ""));
  return hit ? Number(hit[0]) : null;
}
90
+
91
/** Escape pipe characters so a value can sit inside a Markdown table cell. */
function escapeInlinePipes(value) {
  return String(value ?? "").split("|").join("\\|");
}
94
+
95
/** Escape a value for a YAML single-quoted scalar (' doubles to ''). */
function escapeYamlSingleQuoted(value) {
  return String(value ?? "").split("'").join("''");
}
98
+
99
/** Normalize an array of strings, dropping empties; non-arrays yield []. */
function normalizeList(value) {
  if (!Array.isArray(value)) {
    return [];
  }
  const result = [];
  for (const entry of value) {
    const text = normalizeWhitespace(entry);
    if (text) {
      result.push(text);
    }
  }
  return result;
}
107
+
108
/**
 * Tidy extracted text for Markdown: normalize newlines, drop NUL bytes and
 * trailing line whitespace, and cap blank runs at one empty line.
 */
function sanitizeTextForMarkdown(value) {
  let text = String(value ?? "").replace(/\r\n/g, "\n");
  text = text.replaceAll("\u0000", "");
  text = text.replace(/[ \t]+\n/g, "\n");
  text = text.replace(/\n{3,}/g, "\n\n");
  return text.trim();
}
117
+
118
/**
 * Strip the Jina-style proxy envelope from fetched text: return everything
 * after the "Markdown Content:" marker, or drop the Title/URL Source header
 * lines when the marker is absent.
 */
function extractProxyMarkdownContent(value) {
  const normalized = String(value ?? "").replace(/\r\n/g, "\n");
  const marker = "\nMarkdown Content:\n";
  const at = normalized.indexOf(marker);
  if (at !== -1) {
    return normalized.slice(at + marker.length).trim();
  }
  return normalized
    .replace(/^Title:.*?\n/m, "")
    .replace(/^URL Source:.*?\n/m, "")
    .trim();
}
130
+
131
/** Resolve true when the path is accessible, false otherwise (never rejects). */
async function fileExists(p) {
  try {
    await fs.access(p);
    return true;
  } catch {
    return false;
  }
}
137
+
138
/**
 * GET a URL as text with a browser-like User-Agent, following redirects.
 * @throws {Error} On any non-2xx response.
 */
async function fetchText(url) {
  const headers = {
    "User-Agent": "Mozilla/5.0",
    Accept: "text/html,application/xhtml+xml,application/xml,text/plain;q=0.9,*/*;q=0.8",
  };
  const response = await fetch(url, { redirect: "follow", headers });
  if (response.ok) {
    return await response.text();
  }
  throw new Error(`HTTP ${response.status} for ${url}`);
}
151
+
152
/**
 * GET a URL as a Buffer (PDF-leaning Accept header), following redirects.
 * @throws {Error} On any non-2xx response.
 */
async function fetchBuffer(url) {
  const headers = {
    "User-Agent": "Mozilla/5.0",
    Accept: "application/pdf,application/octet-stream,*/*;q=0.8",
  };
  const response = await fetch(url, { redirect: "follow", headers });
  if (!response.ok) {
    throw new Error(`HTTP ${response.status} for ${url}`);
  }
  return Buffer.from(await response.arrayBuffer());
}
166
+
167
/** Map an arXiv abs/html URL to its PDF URL, or null for anything else. */
function deriveArxivPdfUrl(url) {
  const match = /^https?:\/\/arxiv\.org\/(?:abs|html)\/([^?#/]+(?:v\d+)?)\/?$/.exec(String(url));
  return match ? `https://arxiv.org/pdf/${match[1]}.pdf` : null;
}
174
+
175
/** Resolve a candidate URL against a base; null when the pair is unparsable. */
function resolveUrlMaybe(baseUrl, candidate) {
  try {
    return String(new URL(candidate, baseUrl));
  } catch {
    return null;
  }
}
182
+
183
/**
 * Find a PDF link in a parsed document: prefer the citation_pdf_url meta tag,
 * then any anchor whose href ends in .pdf, then any anchor labeled "pdf".
 * Returns an absolute URL or null.
 */
function discoverPdfFromHtml(doc, baseUrl) {
  const metaPdf = doc.querySelector('meta[name="citation_pdf_url"]')?.getAttribute("content");
  if (metaPdf) {
    return resolveUrlMaybe(baseUrl, metaPdf);
  }

  const links = [...doc.querySelectorAll("a[href]")].map((anchor) => ({
    href: resolveUrlMaybe(baseUrl, anchor.getAttribute("href")),
    text: normalizeWhitespace(anchor.textContent),
  }));

  const direct = links.find(({ href }) => href && /\.pdf(?:[?#].*)?$/i.test(href));
  if (direct) {
    return direct.href;
  }

  const labeled = links.find(({ href, text }) => href && text.toLowerCase().includes("pdf"));
  return labeled?.href ?? null;
}
208
+
209
// Parse scholarly metadata (title, authors, year, venue, PDF link) out of an
// HTML page using Google Scholar-style citation_* meta tags with generic
// fallbacks. Returns the live JSDOM instance alongside the metadata.
function parseMetadataFromHtml(html, url) {
  const dom = new JSDOM(html, { url });
  const { document } = dom.window;
  const meta = {
    // citation_title meta wins; <title> text is the fallback.
    title:
      document.querySelector('meta[name="citation_title"]')?.getAttribute("content") ??
      document.querySelector("title")?.textContent ??
      null,
    // One citation_author meta per author; normalized to "First Last".
    authors: [...document.querySelectorAll('meta[name="citation_author"]')]
      .map((element) => normalizeAuthorName(element.getAttribute("content")))
      .filter(Boolean),
    // First parsable 19xx/20xx year across the known date meta tags.
    year:
      extractYear(
        document.querySelector('meta[name="citation_publication_date"]')?.getAttribute("content"),
      ) ??
      extractYear(document.querySelector('meta[name="citation_date"]')?.getAttribute("content")) ??
      extractYear(
        document.querySelector('meta[property="article:published_time"]')?.getAttribute("content"),
      ) ??
      null,
    // Conference title preferred over journal title, then Dublin Core source.
    venue:
      document.querySelector('meta[name="citation_conference_title"]')?.getAttribute("content") ??
      document.querySelector('meta[name="citation_journal_title"]')?.getAttribute("content") ??
      document.querySelector('meta[name="dc.source"]')?.getAttribute("content") ??
      null,
    pdfUrl: discoverPdfFromHtml(document, url),
  };

  return { dom, meta };
}
239
+
240
// Run Mozilla Readability over the page and flatten the detected article into
// paragraph-separated plain text. Returns null when Readability finds no article.
function collectReadableTextFromHtml(html, url) {
  const sourceDom = new JSDOM(html, { url });
  const article = new Readability(sourceDom.window.document).parse();
  if (!article) {
    return null;
  }

  // Re-parse the cleaned article HTML and pull text block-by-block so that
  // headings, list items, table rows, etc. become separate paragraphs.
  const articleDom = new JSDOM(article.content);
  const blockSelectors = "h1,h2,h3,h4,h5,h6,p,li,blockquote,pre,tr";
  const blocks = [...articleDom.window.document.querySelectorAll(blockSelectors)]
    .map((node) => normalizeWhitespace(node.textContent))
    .filter(Boolean);
  if (blocks.length === 0) {
    // No block elements survived cleaning; fall back to Readability's raw text.
    return sanitizeTextForMarkdown(article.textContent ?? "");
  }
  return sanitizeTextForMarkdown(blocks.join("\n\n"));
}
257
+
258
/**
 * Decide whether a space belongs between two adjacent PDF text chunks,
 * based on the preceding text, the next text, and the horizontal gap.
 */
function shouldInsertSpace(prevText, nextText, currentX, previousEndX) {
  if (!prevText) {
    return false;
  }
  // No space after joining characters, nor before closing punctuation.
  const endsWithJoiner = /[-/([{]$/.test(prevText);
  const startsWithPunct = /^[,.;:!?)}\]]/.test(nextText);
  if (endsWithJoiner || startsWithPunct) {
    return false;
  }
  // Without position data, default to inserting a space.
  if (currentX == null || previousEndX == null) {
    return true;
  }
  const gap = currentX - previousEndX;
  return gap > 0.8;
}
273
+
274
// Join raw pdf.js text chunks into one line and tighten spacing around
// punctuation and brackets before Markdown sanitization.
function finalizePdfLine(chunks) {
  return sanitizeTextForMarkdown(
    chunks
      .join("")
      .replace(/\s+([,.;:!?])/g, "$1")
      .replace(/([([{])\s+/g, "$1")
      .replace(/\s+([)\]}])/g, "$1")
      // NOTE(review): the joined string contains no "\n" (chunks are glyph
      // runs plus inserted spaces), so this de-hyphenation pattern likely
      // never matches — confirm intent before relying on it.
      .replace(/-\s+\n/g, "-"),
  );
}
284
+
285
/**
 * Extract plain text from a PDF buffer with pdf.js, grouping glyph runs into
 * lines per page and returning "### Page N" sections joined by blank lines.
 *
 * console.warn/console.log are temporarily patched to silence known-noisy
 * pdf.js font warnings. Fix: document loading now happens inside the
 * try/finally, so the originals are restored even when getDocument rejects
 * (previously a corrupt PDF left console permanently patched).
 *
 * @param {Buffer} buffer - Raw PDF bytes.
 * @returns {Promise<string>} Page-sectioned text; empty string for no text.
 */
async function extractPdfText(buffer) {
  const originalWarn = console.warn;
  const originalLog = console.log;
  console.warn = (...args) => {
    const message = String(args[0] ?? "");
    if (message.includes("Unable to load font data") || message.includes("standardFontDataUrl")) {
      return;
    }
    originalWarn(...args);
  };
  console.log = (...args) => {
    const message = String(args[0] ?? "");
    if (message.startsWith("Warning: TT:")) {
      return;
    }
    originalLog(...args);
  };

  const sections = [];

  try {
    // Load inside the try so the console patches above are always undone.
    const pdf = await getDocument({
      data: new Uint8Array(buffer),
      disableWorker: true,
      standardFontDataUrl: STANDARD_FONT_DATA_URL,
    }).promise;

    for (let pageNumber = 1; pageNumber <= pdf.numPages; pageNumber += 1) {
      const page = await pdf.getPage(pageNumber);
      const textContent = await page.getTextContent();
      const lines = [];
      let currentLine = [];
      let lastY = null;
      let lastXEnd = null;
      let lastHeight = null;

      for (const item of textContent.items) {
        // Only glyph items carry `str`; skip marked-content items.
        if (!("str" in item)) {
          continue;
        }
        const text = String(item.str ?? "");
        if (!text.trim()) {
          // Whitespace-only item: honor an explicit end-of-line, else drop it.
          if (item.hasEOL && currentLine.length > 0) {
            lines.push(finalizePdfLine(currentLine));
            currentLine = [];
            lastY = null;
            lastXEnd = null;
            lastHeight = null;
          }
          continue;
        }

        const transform = Array.isArray(item.transform) ? item.transform : [];
        const currentX = Number(transform[4] ?? 0);
        const currentY = Number(transform[5] ?? 0);
        const currentHeight = Number(item.height ?? 0);
        const currentWidth = Number(item.width ?? 0);

        // A vertical jump larger than ~60% of the glyph height starts a new line.
        const yThreshold = Math.max(2, (lastHeight ?? currentHeight) * 0.6);
        const shouldBreakLine = lastY != null && Math.abs(currentY - lastY) > yThreshold;
        if (shouldBreakLine && currentLine.length > 0) {
          lines.push(finalizePdfLine(currentLine));
          currentLine = [];
          lastXEnd = null;
        }

        if (shouldInsertSpace(currentLine.at(-1) ?? "", text, currentX, lastXEnd)) {
          currentLine.push(" ");
        }
        currentLine.push(text);

        lastY = currentY;
        lastHeight = currentHeight || lastHeight;
        lastXEnd = currentX + currentWidth;

        if (item.hasEOL) {
          lines.push(finalizePdfLine(currentLine));
          currentLine = [];
          lastY = null;
          lastXEnd = null;
          lastHeight = null;
        }
      }

      if (currentLine.length > 0) {
        lines.push(finalizePdfLine(currentLine));
      }

      const pageText = sanitizeTextForMarkdown(lines.filter(Boolean).join("\n\n"));
      if (pageText) {
        sections.push(`### Page ${pageNumber}\n\n${pageText}`);
      }
    }
  } finally {
    console.warn = originalWarn;
    console.log = originalLog;
  }

  return sections.join("\n\n");
}
384
+
385
/**
 * Heuristically pull an author list out of extracted PDF text.
 *
 * Scans the first few pages for the window between the title line and the
 * "Abstract" heading, keeping short, mostly-capitalized lines that do not
 * look like affiliations, URLs, or the title itself; falls back to scanning
 * the whole first page.
 *
 * @param {string} extractedText - "### Page N"-sectioned text from extractPdfText.
 * @param {string} title - Paper title, used to locate the title line.
 * @returns {string|null} Comma-joined candidate author names, or null.
 */
function inferAuthorsFromPdfText(extractedText, title) {
  const titleFragments = title
    .toLowerCase()
    .split(/\s+/)
    .filter((word) => word.length > 3);
  const pages = extractedText
    .split(/^### Page \d+\s*$/m)
    .map((page) =>
      page
        .split("\n")
        .map((line) => normalizeWhitespace(line))
        .filter(Boolean),
    )
    .filter((pageLines) => pageLines.length > 0);

  // Lowercased markers suggesting a line is an affiliation/venue, not names.
  const affiliationWords = [
    "@",
    "abstract",
    "introduction",
    "university",
    "università",
    "institute",
    "department",
    "research",
    "laboratory",
    "lab",
    "college",
    "school",
    "center",
    "centre",
    "microsoft",
    "google",
    "amazon",
    "meta",
    "berkeley",
    "cambridge",
    "italy",
    "united kingdom",
    "united states",
    "usa",
    "uk",
    "doi",
    "arxiv",
    "proceedings",
  ];

  const selectCandidates = (probeLines) => {
    const cleanAuthorLine = (line) => {
      if (line.includes(";")) {
        // "Name, Affiliation; Name, Affiliation" style: keep the name segments.
        return line
          .split(";")
          .map((segment) => segment.split(",")[0]?.trim())
          .filter(Boolean)
          .join(", ");
      }

      const commaParts = line
        .split(",")
        .map((part) => part.trim())
        .filter(Boolean);
      if (commaParts.length >= 2) {
        // Strip trailing affiliation-looking parts ("..., MIT, USA").
        while (commaParts.length > 0) {
          const tail = commaParts.at(-1)?.toLowerCase() ?? "";
          if (affiliationWords.some((word) => tail.includes(word))) {
            commaParts.pop();
            continue;
          }
          break;
        }
        return commaParts.join(", ");
      }
      return line;
    };

    const candidates = [];
    for (const rawLine of probeLines) {
      const line = rawLine.replace(/[∗*†‡§¶‖0-9]+$/g, "").trim();
      if (!line || line.length < 6 || line.length > 160) {
        continue;
      }
      const cleanedLine = cleanAuthorLine(line);
      const lower = cleanedLine.toLowerCase();
      if (titleFragments.slice(0, 3).every((word) => lower.includes(word))) {
        continue;
      }
      if (affiliationWords.some((word) => lower.includes(word))) {
        continue;
      }
      // Any URL disqualifies the line. This substring also matches "https://",
      // so the previous separate https:// check was dead code (removed).
      if (line.includes("http")) {
        continue;
      }

      const tokens = cleanedLine.replaceAll(",", " ").split(/\s+/).filter(Boolean);
      if (tokens.length < 2 || tokens.length > 24) {
        continue;
      }
      // Require ~60% of tokens to look like capitalized name parts or initials.
      const uppercaseishTokens = tokens.filter(
        (token) => /^[A-Z][A-Za-z.'-]*$/.test(token) || /^[A-Z]\.$/.test(token),
      );
      if (uppercaseishTokens.length < Math.ceil(tokens.length * 0.6)) {
        continue;
      }
      candidates.push(cleanedLine);
    }
    return candidates.length > 0 ? [...new Set(candidates)].join(", ") : null;
  };

  // Prefer the window between the title line and "Abstract" on early pages.
  for (const lines of pages.slice(0, 4)) {
    const titleLineIndex = lines.findIndex((line) => {
      const lower = line.toLowerCase();
      return titleFragments.slice(0, 4).every((word) => lower.includes(word));
    });
    if (titleLineIndex < 0) {
      continue;
    }
    const titleEndIndex = lines.findIndex(
      (line, index) => index > titleLineIndex && line.toLowerCase().startsWith("abstract"),
    );
    const probeLines = lines.slice(
      Math.max(0, titleLineIndex + 1),
      titleEndIndex > 0 ? titleEndIndex : Math.min(lines.length, titleLineIndex + 24),
    );
    const candidates = selectCandidates(probeLines);
    if (candidates) {
      return candidates;
    }
  }

  return selectCandidates(pages[0] ?? []);
}
518
+
519
/**
 * Fetch an HTML source page and fill in any missing fields on `resolved`
 * (authors, year, venue, PDF URL, extracted text). Only empty fields are
 * written; existing values are never overwritten.
 */
async function enrichFromHtmlSource(url, resolved, options = {}) {
  const html = await fetchText(url);
  const { meta } = parseMetadataFromHtml(html, url);
  const { authors, year, venue, pdfUrl } = meta;

  if (!resolved.authors && authors.length > 0) {
    resolved.authors = authors.join(", ");
  }
  if (!resolved.year && year) {
    resolved.year = year;
  }
  if (!resolved.venue && venue) {
    resolved.venue = normalizeWhitespace(venue);
  }
  const wantsPdfDiscovery = !options.preferHtml && !resolved.sourcePdf;
  if (wantsPdfDiscovery && pdfUrl) {
    resolved.sourcePdf = pdfUrl;
  }
  // Extract readable text only when preferred, or when no PDF is available.
  const needsText = !resolved.extractedText && (options.preferHtml || !resolved.sourcePdf);
  if (needsText) {
    const readable = collectReadableTextFromHtml(html, url);
    if (readable) {
      resolved.extractedText = readable;
    }
  }
}
541
+
542
/**
 * Last-resort text extraction from an HTML URL. Sets resolved.extractedText
 * and returns true on success; logs a warning and returns false otherwise.
 * PDF URLs are refused outright.
 */
async function tryHtmlFallback(url, resolved, slug, label) {
  if (!url) {
    return false;
  }
  if (String(url).toLowerCase().endsWith(".pdf")) {
    return false;
  }
  try {
    const html = await fetchText(url);
    const text = collectReadableTextFromHtml(html, url);
    if (!text) {
      return false;
    }
    resolved.extractedText = text;
    return true;
  } catch (error) {
    console.error(`[warn] ${slug}: could not extract ${label} HTML (${error.message})`);
    return false;
  }
}
558
+
559
/**
 * Build the Markdown metadata table rows for one entry, skipping any row
 * whose value is empty/null.
 */
function renderMetadataRows(entry, resolved) {
  const linkOrNull = (href, label) => (href ? `[${label}](${href})` : null);
  const rows = [
    ["Content type", entry.kind === "article" ? "Article" : "Paper / report"],
    ["Authors", resolved.authors],
    ["Year", String(resolved.year)],
    ["Venue", resolved.venue],
    ["Research bucket", entry.researchBucket],
    ["Maps to", entry.mapsTo],
    ["Harness fit", entry.fit],
    ["Source page", linkOrNull(entry.sourcePage, "Open source")],
    ["Source PDF", linkOrNull(resolved.sourcePdf, "Open PDF")],
    ["Additional source", linkOrNull(entry.additionalSource, "Open source")],
    ["Additional PDF", linkOrNull(entry.additionalPdf, "Open PDF")],
    ["Notes", entry.notes ?? null],
  ];

  const lines = [];
  for (const [field, value] of rows) {
    if (value) {
      lines.push(`| ${field} | ${escapeInlinePipes(value)} |`);
    }
  }
  return lines.join("\n");
}
580
+
581
// Render the final Markdown document (YAML frontmatter + note + metadata
// table + extracted text) for one manifest entry.
// NOTE: the template literal below is output-sensitive — its lines must stay
// at column 0 so the YAML frontmatter and Markdown render correctly.
function renderMarkdown(entry, resolved) {
  const kind = entry.kind === "article" ? "article" : "paper";
  const summary = `${kind === "article" ? "Cached article text" : "Converted paper text"} and source links for ${entry.title}.`;
  // YAML list items for the frontmatter read_when block, already indented.
  const readWhenLines =
    kind === "article"
      ? [
          " - Reviewing current harness guidance in the local archive",
          " - You want the extracted article text with source links preserved",
        ]
      : [
          " - Reviewing harness and coordination research source material in the docs tree",
          " - You want the extracted paper text with source links preserved",
        ];
  const topics = normalizeList(entry.topics);
  // Optional topics block; ends with "\n" so "kind:" starts on its own line.
  const topicBlock =
    topics.length > 0 ? `topics:\n${topics.map((topic) => ` - ${topic}`).join("\n")}\n` : "";
  const sourceLabel = kind === "article" ? "source page" : "source document";

  return `---
summary: '${escapeYamlSingleQuoted(summary)}'
read_when:
${readWhenLines.join("\n")}
${topicBlock}kind: '${kind}'
title: '${escapeYamlSingleQuoted(entry.title)}'
---
# ${entry.title}

<Note>
Converted from the ${sourceLabel} on ${TODAY}. The repo does not retain downloaded source files; they were fetched transiently, converted to Markdown, and deleted after extraction.
</Note>

## Metadata

| Field | Value |
| --- | --- |
${renderMetadataRows(entry, resolved)}

## Extracted text
${resolved.extractedText}
`;
}
622
+
623
/**
 * Dynamically import a manifest module and return its entries array
 * (named export `paperManifest`, or the default export).
 * @throws {Error} When neither export is an array.
 */
async function loadManifest(manifestPath) {
  const moduleUrl = pathToFileURL(manifestPath).href;
  const loaded = await import(moduleUrl);
  const entries = loaded.paperManifest ?? loaded.default;
  if (Array.isArray(entries)) {
    return entries;
  }
  throw new Error(`Manifest ${manifestPath} did not export paperManifest/default array`);
}
632
+
633
/**
 * Populate resolved.extractedText from an explicit text-source URL, when one
 * is configured and no text has been extracted yet. Supports the Jina proxy
 * markdown format and an optional start marker to trim leading boilerplate.
 */
async function hydrateTextSource(entry, resolved) {
  if (resolved.extractedText || !entry.textSourceUrl) {
    return;
  }
  const raw = await fetchText(entry.textSourceUrl);
  const isProxyMarkdown = entry.textSourceFormat === "jina-markdown";
  let text = isProxyMarkdown ? extractProxyMarkdownContent(raw) : raw;

  const marker = entry.textStartMarker;
  if (marker) {
    const at = text.indexOf(marker);
    if (at >= 0) {
      text = text.slice(at).trimStart();
    }
  }
  resolved.extractedText = sanitizeTextForMarkdown(text);
}
648
+
649
/**
 * Resolve metadata and extracted text for one manifest entry and write the
 * rendered Markdown to <outputRoot>/(articles|papers)/<slug>.md.
 *
 * Resolution order: explicit text source -> source-page metadata/HTML ->
 * additional source -> PDF download -> HTML fallbacks.
 *
 * @returns {Promise<{slug: string, outPath: string, authors: string|null, sourcePdf: string|null}>}
 * @throws {Error} When no extractable source is found.
 */
async function processEntry(entry, outputRoot, tempDir) {
  const preferHtml = Boolean(entry.preferHtml);
  const resolved = {
    title: entry.title,
    sourcePdf: preferHtml ? null : (entry.sourcePdf ?? deriveArxivPdfUrl(entry.sourcePage)),
    authors: entry.authors ?? null,
    year: entry.year ?? null,
    venue: entry.venue ?? null,
    extractedText: null,
  };

  // Shared helper for the two PDF attempts below (previously duplicated):
  // download the PDF, mirror it transiently under tempDir, extract its text.
  // The temp copy is now removed even when extraction throws.
  const extractPdfFromUrl = async (url) => {
    const pdfBuffer = await fetchBuffer(url);
    const tempPdfPath = path.join(tempDir, `${entry.slug}.pdf`);
    await fs.writeFile(tempPdfPath, pdfBuffer);
    try {
      return await extractPdfText(pdfBuffer);
    } finally {
      await fs.rm(tempPdfPath, { force: true });
    }
  };

  await hydrateTextSource(entry, resolved);

  if (
    entry.sourcePage &&
    entry.skipSourcePageFetch !== true &&
    !String(entry.sourcePage).toLowerCase().endsWith(".pdf")
  ) {
    try {
      await enrichFromHtmlSource(entry.sourcePage, resolved, { preferHtml });
    } catch (error) {
      console.error(
        `[warn] ${entry.slug}: could not parse source page metadata (${error.message})`,
      );
    }
  }

  // Try to derive an arXiv PDF URL from the additional source if needed.
  if (!resolved.sourcePdf && entry.additionalSource) {
    const derivedPdf = deriveArxivPdfUrl(entry.additionalSource);
    if (derivedPdf) {
      resolved.sourcePdf = derivedPdf;
    }
  }

  // Consult the additional HTML source only while something is still missing.
  if (
    entry.additionalSource &&
    (!resolved.authors ||
      !resolved.venue ||
      !resolved.year ||
      !resolved.sourcePdf ||
      !resolved.extractedText) &&
    !String(entry.additionalSource).toLowerCase().endsWith(".pdf")
  ) {
    try {
      await enrichFromHtmlSource(entry.additionalSource, resolved, { preferHtml });
    } catch (error) {
      console.error(
        `[warn] ${entry.slug}: could not parse additional source metadata (${error.message})`,
      );
    }
  }

  // PDF extraction: primary PDF first, then the additional PDF on failure.
  if (!resolved.extractedText && typeof resolved.sourcePdf === "string" && resolved.sourcePdf) {
    try {
      resolved.extractedText = await extractPdfFromUrl(resolved.sourcePdf);
    } catch (error) {
      console.error(`[warn] ${entry.slug}: could not extract PDF (${error.message})`);
      if (entry.additionalPdf && entry.additionalPdf !== resolved.sourcePdf) {
        try {
          resolved.extractedText = await extractPdfFromUrl(entry.additionalPdf);
        } catch (innerError) {
          console.error(
            `[warn] ${entry.slug}: could not extract additional PDF (${innerError.message})`,
          );
        }
      }
    }
  }

  if (!resolved.extractedText && entry.additionalSource) {
    await tryHtmlFallback(entry.additionalSource, resolved, entry.slug, "additional source");
  }

  if (!resolved.extractedText && entry.sourcePage && entry.skipSourcePageFetch !== true) {
    await tryHtmlFallback(entry.sourcePage, resolved, entry.slug, "source");
  }

  if (!resolved.extractedText) {
    throw new Error(`${entry.slug}: no extractable source found`);
  }

  // Fill remaining metadata gaps before rendering.
  if (!resolved.authors && entry.kind !== "article") {
    resolved.authors =
      inferAuthorsFromPdfText(resolved.extractedText, entry.title) ?? "See extracted text";
  }
  if (!resolved.year) {
    resolved.year = "Unknown";
  }
  if (!resolved.venue) {
    resolved.venue = "Unknown";
  }

  const markdown = renderMarkdown(entry, resolved);
  const kindDir = path.join(outputRoot, entry.kind === "article" ? "articles" : "papers");
  await fs.mkdir(kindDir, { recursive: true });
  const outPath = path.join(kindDir, `${entry.slug}.md`);
  await fs.writeFile(outPath, `${markdown.trimEnd()}\n`, "utf8");
  return {
    slug: entry.slug,
    outPath,
    authors: resolved.authors,
    sourcePdf: resolved.sourcePdf ?? null,
  };
}
761
+
762
/**
 * Entry point: parse args, load the manifest, process each selected entry
 * through a shared temp directory, then print a slug/path/authors TSV line
 * per result on stdout. The temp directory is removed even on failure.
 */
async function main() {
  const { manifestPath, outputRoot, only } = parseArgs(process.argv.slice(2));
  const manifest = await loadManifest(manifestPath);
  const selected = only == null ? manifest : manifest.filter((entry) => only.has(entry.slug));
  if (selected.length === 0) {
    throw new Error("No manifest entries selected");
  }

  await fs.mkdir(outputRoot, { recursive: true });
  const tempDir = await fs.mkdtemp(path.join(os.tmpdir(), "wave-agent-context-import-"));

  const results = [];
  try {
    for (const entry of selected) {
      console.error(`[archive] ${entry.slug}`);
      const result = await processEntry(entry, outputRoot, tempDir);
      results.push(result);
    }
  } finally {
    const tempStillThere = await fileExists(tempDir);
    if (tempStillThere) {
      await fs.rm(tempDir, { recursive: true, force: true });
    }
  }

  for (const { slug, outPath, authors } of results) {
    console.log(`${slug}\t${path.relative(REPO_ROOT, outPath)}\t${authors}`);
  }
}
789
+
790
// Top-level error handler. Uses optional chaining so a nullish rejection
// value (e.g. `throw undefined`) cannot crash the handler itself; non-Error
// values fall back to their string form.
main().catch((error) => {
  console.error(error?.stack ?? String(error));
  process.exit(1);
});