@swarmclawai/swarmclaw 1.3.4 → 1.3.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (73):
  1. package/README.md +20 -76
  2. package/package.json +3 -2
  3. package/skills/swarmclaw.md +17 -0
  4. package/src/app/api/agents/[id]/dream/route.ts +45 -0
  5. package/src/app/api/knowledge/[id]/route.ts +48 -49
  6. package/src/app/api/knowledge/hygiene/route.ts +13 -0
  7. package/src/app/api/knowledge/route.ts +70 -42
  8. package/src/app/api/knowledge/sources/[id]/archive/route.ts +15 -0
  9. package/src/app/api/knowledge/sources/[id]/restore/route.ts +10 -0
  10. package/src/app/api/knowledge/sources/[id]/route.ts +1 -0
  11. package/src/app/api/knowledge/sources/[id]/supersede/route.ts +26 -0
  12. package/src/app/api/knowledge/sources/[id]/sync/route.ts +17 -0
  13. package/src/app/api/knowledge/sources/route.ts +1 -0
  14. package/src/app/api/knowledge/upload/route.ts +3 -51
  15. package/src/app/api/memory/dream/[id]/route.ts +19 -0
  16. package/src/app/api/memory/dream/route.ts +34 -0
  17. package/src/app/knowledge/layout.tsx +1 -1
  18. package/src/app/knowledge/page.tsx +2 -22
  19. package/src/app/protocols/page.tsx +21 -2
  20. package/src/cli/index.js +16 -0
  21. package/src/cli/spec.js +5 -0
  22. package/src/components/agents/agent-sheet.tsx +65 -0
  23. package/src/components/chat/message-bubble.tsx +10 -0
  24. package/src/components/knowledge/grounding-panel.tsx +99 -0
  25. package/src/components/knowledge/knowledge-detail.tsx +402 -0
  26. package/src/components/knowledge/knowledge-list.tsx +351 -126
  27. package/src/components/knowledge/knowledge-sheet.tsx +208 -119
  28. package/src/components/memory/dream-history.tsx +155 -0
  29. package/src/components/memory/memory-card.tsx +7 -0
  30. package/src/components/memory/memory-detail.tsx +46 -0
  31. package/src/components/runs/run-list.tsx +23 -0
  32. package/src/lib/server/api-routes.test.ts +43 -2
  33. package/src/lib/server/chat-execution/chat-execution-disabled.test.ts +14 -31
  34. package/src/lib/server/chat-execution/chat-execution-eval-history.test.ts +11 -34
  35. package/src/lib/server/chat-execution/chat-execution-grounding.test.ts +108 -0
  36. package/src/lib/server/chat-execution/chat-execution-session-sync.test.ts +35 -36
  37. package/src/lib/server/chat-execution/chat-execution-types.ts +8 -1
  38. package/src/lib/server/chat-execution/chat-execution.ts +1 -0
  39. package/src/lib/server/chat-execution/chat-turn-finalization.ts +21 -1
  40. package/src/lib/server/chat-execution/chat-turn-stream-execution.ts +6 -1
  41. package/src/lib/server/chat-execution/post-stream-finalization.ts +15 -3
  42. package/src/lib/server/chat-execution/prompt-sections.ts +29 -3
  43. package/src/lib/server/chat-execution/stream-agent-chat.ts +6 -1
  44. package/src/lib/server/execution-engine/task-attempt.ts +8 -2
  45. package/src/lib/server/knowledge-import.ts +159 -0
  46. package/src/lib/server/knowledge-sources.test.ts +261 -0
  47. package/src/lib/server/knowledge-sources.ts +1284 -0
  48. package/src/lib/server/memory/dream-cycles.ts +49 -0
  49. package/src/lib/server/memory/dream-idle-callback.ts +38 -0
  50. package/src/lib/server/memory/dream-service.ts +315 -0
  51. package/src/lib/server/memory/memory-db.ts +37 -2
  52. package/src/lib/server/protocols/protocol-agent-turn.ts +7 -0
  53. package/src/lib/server/protocols/protocol-run-lifecycle.ts +19 -6
  54. package/src/lib/server/protocols/protocol-service.test.ts +99 -0
  55. package/src/lib/server/protocols/protocol-step-helpers.ts +7 -1
  56. package/src/lib/server/protocols/protocol-step-processors.ts +16 -3
  57. package/src/lib/server/protocols/protocol-types.ts +4 -0
  58. package/src/lib/server/runtime/daemon-state/core.ts +6 -1
  59. package/src/lib/server/runtime/run-ledger.test.ts +120 -0
  60. package/src/lib/server/runtime/run-ledger.ts +27 -1
  61. package/src/lib/server/runtime/session-run-manager/drain.ts +5 -0
  62. package/src/lib/server/runtime/session-run-manager/state.ts +19 -2
  63. package/src/lib/server/storage-normalization.ts +5 -0
  64. package/src/lib/server/storage.ts +15 -0
  65. package/src/lib/server/test-utils/run-with-temp-data-dir.ts +15 -2
  66. package/src/stores/slices/ui-slice.ts +4 -0
  67. package/src/types/agent.ts +7 -0
  68. package/src/types/dream.ts +45 -0
  69. package/src/types/index.ts +1 -0
  70. package/src/types/message.ts +3 -0
  71. package/src/types/misc.ts +131 -0
  72. package/src/types/protocol.ts +4 -0
  73. package/src/types/run.ts +4 -1
@@ -1,5 +1,5 @@
1
1
  import { CONTEXT_OVERFLOW_RE } from '@/lib/providers/error-classification'
2
- import type { ProviderType } from '@/types'
2
+ import type { KnowledgeRetrievalTrace, ProviderType } from '@/types'
3
3
  import { getEnabledCapabilityIds } from '@/lib/capability-selection'
4
4
  import { isLocalOpenClawEndpoint } from '@/lib/openclaw/openclaw-endpoint'
5
5
  import { streamAgentChat } from '@/lib/server/chat-execution/stream-agent-chat'
@@ -42,6 +42,7 @@ export interface ExecutedPreparedChatTurn {
42
42
  outputTokens: number
43
43
  received: boolean
44
44
  }
45
+ knowledgeRetrievalTrace?: KnowledgeRetrievalTrace | null
45
46
  }
46
47
 
47
48
  export async function executePreparedChatTurn(params: {
@@ -90,6 +91,7 @@ export async function executePreparedChatTurn(params: {
90
91
  let responseCacheHit = false
91
92
  let responseCacheInput: LlmResponseCacheKeyInput | null = null
92
93
  let durationMs = 0
94
+ let knowledgeRetrievalTrace: KnowledgeRetrievalTrace | null = null
93
95
  const startTs = Date.now()
94
96
  const endLlmPerf = perf.start('chat-execution', 'llm-round-trip', {
95
97
  sessionId,
@@ -111,6 +113,7 @@ export async function executePreparedChatTurn(params: {
111
113
  responseCacheHit,
112
114
  durationMs,
113
115
  directUsage,
116
+ knowledgeRetrievalTrace: null,
114
117
  }
115
118
  }
116
119
 
@@ -157,6 +160,7 @@ export async function executePreparedChatTurn(params: {
157
160
  promptMode,
158
161
  })
159
162
  fullResponse = result.finalResponse || result.fullText
163
+ knowledgeRetrievalTrace = result.knowledgeRetrievalTrace || null
160
164
  } else {
161
165
  let directHistorySnapshot = isAutoRunNoHistory
162
166
  ? (heartbeatLightContext ? [] : getSessionMessages(sessionId).slice(-6))
@@ -298,5 +302,6 @@ export async function executePreparedChatTurn(params: {
298
302
  responseCacheHit,
299
303
  durationMs,
300
304
  directUsage,
305
+ knowledgeRetrievalTrace,
301
306
  }
302
307
  }
@@ -5,7 +5,7 @@
5
5
  * usage recording, forced external service summary, capability hooks,
6
6
  * and OpenClaw sync.
7
7
  */
8
- import type { Session, UsageRecord } from '@/types'
8
+ import type { KnowledgeRetrievalTrace, Session, UsageRecord } from '@/types'
9
9
  import { log } from '@/lib/server/logger'
10
10
  import type { ChatTurnState } from '@/lib/server/chat-execution/chat-turn-state'
11
11
 
@@ -51,6 +51,7 @@ export interface PostStreamResult {
51
51
  fullText: string
52
52
  finalResponse: string
53
53
  toolEvents: import('@/types').MessageToolEvent[]
54
+ knowledgeRetrievalTrace?: KnowledgeRetrievalTrace | null
54
55
  }
55
56
 
56
57
  export interface FinalizeStreamResultOpts {
@@ -70,6 +71,7 @@ export interface FinalizeStreamResultOpts {
70
71
  cleanup: () => Promise<void>
71
72
  runId: string
72
73
  classification?: MessageClassification | null
74
+ knowledgeRetrievalTrace?: KnowledgeRetrievalTrace | null
73
75
  }
74
76
 
75
77
  export async function finalizeStreamResult(opts: FinalizeStreamResultOpts): Promise<PostStreamResult> {
@@ -138,7 +140,12 @@ export async function finalizeStreamResult(opts: FinalizeStreamResultOpts): Prom
138
140
  const finalResponse = await resolveAndSummarize()
139
141
  await emitLlmOutputHook(finalResponse)
140
142
  await cleanup()
141
- return { fullText: state.fullText, finalResponse, toolEvents: state.streamedToolEvents }
143
+ return {
144
+ fullText: state.fullText,
145
+ finalResponse,
146
+ toolEvents: state.streamedToolEvents,
147
+ knowledgeRetrievalTrace: opts.knowledgeRetrievalTrace || null,
148
+ }
142
149
  }
143
150
 
144
151
  // Strip leaked classification JSON from model output (e.g. `{ "isDeliverableTask": true, ... }`)
@@ -212,5 +219,10 @@ export async function finalizeStreamResult(opts: FinalizeStreamResultOpts): Prom
212
219
 
213
220
  await cleanup()
214
221
 
215
- return { fullText: state.fullText, finalResponse, toolEvents: state.streamedToolEvents }
222
+ return {
223
+ fullText: state.fullText,
224
+ finalResponse,
225
+ toolEvents: state.streamedToolEvents,
226
+ knowledgeRetrievalTrace: opts.knowledgeRetrievalTrace || null,
227
+ }
216
228
  }
@@ -8,7 +8,7 @@
8
8
 
9
9
  import fs from 'node:fs'
10
10
  import path from 'node:path'
11
- import type { Session, Agent } from '@/types'
11
+ import type { KnowledgeRetrievalTrace, Session, Agent } from '@/types'
12
12
  import type { PromptMode } from '@/lib/server/chat-execution/prompt-mode'
13
13
  import type { MessageClassification } from '@/lib/server/chat-execution/message-classifier'
14
14
  import type { ActiveProjectContext } from '@/lib/server/project-context'
@@ -428,6 +428,7 @@ export function buildSuggestionsSection(
428
428
  export interface ProactiveMemoryResult {
429
429
  section: string | null
430
430
  injectedIds: Record<string, number>
431
+ knowledgeTrace?: KnowledgeRetrievalTrace | null
431
432
  }
432
433
 
433
434
  export async function buildProactiveMemorySection(
@@ -438,22 +439,28 @@ export async function buildProactiveMemorySection(
438
439
  isMinimalPrompt: boolean,
439
440
  currentThreadRecallRequest: boolean,
440
441
  ): Promise<ProactiveMemoryResult> {
441
- const noResult: ProactiveMemoryResult = { section: null, injectedIds: {} }
442
+ const noResult: ProactiveMemoryResult = { section: null, injectedIds: {}, knowledgeTrace: null }
442
443
  if (isMinimalPrompt || !session.agentId || currentThreadRecallRequest || message.length <= 12) return noResult
443
444
  if (!agent?.proactiveMemory) return noResult
444
445
  try {
445
446
  const { getMemoryDb } = await import('@/lib/server/memory/memory-db')
446
447
  const { buildSessionMemoryScopeFilter } = await import('@/lib/server/memory/session-memory-scope')
448
+ const { buildKnowledgeRetrievalTrace } = await import('@/lib/server/knowledge-sources')
447
449
  const memDb = getMemoryDb()
448
450
  const recalled = memDb.search(message, session.agentId, {
449
451
  scope: buildSessionMemoryScopeFilter(session, agent.memoryScopeMode || null, activeProjectRoot),
450
452
  })
453
+ const knowledgeTrace = await buildKnowledgeRetrievalTrace({
454
+ query: message,
455
+ viewerAgentId: session.agentId,
456
+ })
451
457
 
452
458
  // Dedup: skip memories already injected 2+ times in this session
453
459
  const priorCounts = session.injectedMemoryIds || {}
454
460
  const filtered = recalled.filter((entry) => (priorCounts[entry.id] || 0) < 2)
455
461
 
456
462
  const topRecalled = filtered.slice(0, 3)
463
+ const sections: string[] = []
457
464
  if (topRecalled.length > 0) {
458
465
  // Track injection counts
459
466
  const updatedCounts: Record<string, number> = { ...priorCounts }
@@ -464,9 +471,28 @@ export async function buildProactiveMemorySection(
464
471
  const recalledLines = topRecalled.map((entry) =>
465
472
  `- ${entry.abstract || entry.content.slice(0, 300)}`,
466
473
  )
474
+ sections.push(`## Recalled Context\nRelevant memories from previous interactions:\n${recalledLines.join('\n')}`)
475
+ if (knowledgeTrace?.hits.length) {
476
+ const groundingLines = knowledgeTrace.hits.map((hit) =>
477
+ `- [${hit.chunkIndex + 1}/${hit.chunkCount}] ${hit.sourceTitle}: ${hit.snippet}`,
478
+ )
479
+ sections.push(`## Source Grounding\nSource-backed knowledge retrieved for this turn:\n${groundingLines.join('\n')}`)
480
+ }
467
481
  return {
468
- section: `## Recalled Context\nRelevant memories from previous interactions:\n${recalledLines.join('\n')}`,
482
+ section: sections.join('\n\n'),
469
483
  injectedIds: updatedCounts,
484
+ knowledgeTrace,
485
+ }
486
+ }
487
+
488
+ if (knowledgeTrace?.hits.length) {
489
+ const groundingLines = knowledgeTrace.hits.map((hit) =>
490
+ `- [${hit.chunkIndex + 1}/${hit.chunkCount}] ${hit.sourceTitle}: ${hit.snippet}`,
491
+ )
492
+ return {
493
+ section: `## Source Grounding\nSource-backed knowledge retrieved for this turn:\n${groundingLines.join('\n')}`,
494
+ injectedIds: priorCounts,
495
+ knowledgeTrace,
470
496
  }
471
497
  }
472
498
  } catch { /* non-critical */ }
@@ -36,7 +36,7 @@ import { log } from '@/lib/server/logger'
36
36
  import { logExecution } from '@/lib/server/execution-log'
37
37
  import { buildCurrentDateTimePromptContext } from '@/lib/server/prompt-runtime-context'
38
38
  import { expandExtensionIds } from '@/lib/server/tool-aliases'
39
- import type { ExecutionBrief, Session, Message } from '@/types'
39
+ import type { ExecutionBrief, KnowledgeRetrievalTrace, Session, Message } from '@/types'
40
40
  import { getEnabledCapabilityIds } from '@/lib/capability-selection'
41
41
  import { enqueueSystemEvent } from '@/lib/server/runtime/system-events'
42
42
  import { resolveActiveProjectContext } from '@/lib/server/project-context'
@@ -199,6 +199,7 @@ export interface StreamAgentChatResult {
199
199
  finalResponse: string
200
200
  /** Tool events emitted during the streamed run. */
201
201
  toolEvents: import('@/types').MessageToolEvent[]
202
+ knowledgeRetrievalTrace?: KnowledgeRetrievalTrace | null
202
203
  }
203
204
 
204
205
  type LangChainContentPart =
@@ -267,6 +268,7 @@ async function streamAgentChatCore(opts: StreamAgentChatOpts): Promise<StreamAge
267
268
  preferMinimalPrompt: lightweightDirectChat,
268
269
  })
269
270
  const isMinimalPrompt = promptMode === 'minimal'
271
+ let knowledgeRetrievalTrace: KnowledgeRetrievalTrace | null = null
270
272
 
271
273
  // Resolve agent's thinking level for provider-native params
272
274
  let agentThinkingLevel: 'minimal' | 'low' | 'medium' | 'high' | undefined
@@ -309,6 +311,7 @@ async function streamAgentChatCore(opts: StreamAgentChatOpts): Promise<StreamAge
309
311
  fullText: requestedToolPreflightResponse,
310
312
  finalResponse: requestedToolPreflightResponse,
311
313
  toolEvents: [],
314
+ knowledgeRetrievalTrace: null,
312
315
  }
313
316
  }
314
317
  const runtime = loadRuntimeSettings()
@@ -490,6 +493,7 @@ async function streamAgentChatCore(opts: StreamAgentChatOpts): Promise<StreamAge
490
493
  isMinimalPrompt, currentThreadRecallRequest,
491
494
  )
492
495
  if (memoryResult.section) promptParts.push(memoryResult.section)
496
+ knowledgeRetrievalTrace = memoryResult.knowledgeTrace || null
493
497
  // Persist injection dedup counts so repeated memories are suppressed
494
498
  if (Object.keys(memoryResult.injectedIds).length > 0) {
495
499
  session.injectedMemoryIds = memoryResult.injectedIds
@@ -1269,5 +1273,6 @@ async function streamAgentChatCore(opts: StreamAgentChatOpts): Promise<StreamAge
1269
1273
  cleanup,
1270
1274
  runId,
1271
1275
  classification,
1276
+ knowledgeRetrievalTrace,
1272
1277
  })
1273
1278
  }
@@ -3,7 +3,7 @@ import { WORKSPACE_DIR } from '@/lib/server/data-dir'
3
3
  import { log } from '@/lib/server/logger'
4
4
  import { loadSettings } from '@/lib/server/settings/settings-repository'
5
5
  import { loadSessions } from '@/lib/server/sessions/session-repository'
6
- import { appendPersistedRunEvent, persistRun } from '@/lib/server/runtime/run-ledger'
6
+ import { appendPersistedRunEvent, buildRetrievalSummary, persistRun } from '@/lib/server/runtime/run-ledger'
7
7
  import { notify } from '@/lib/server/ws-hub'
8
8
  import { captureGuardianCheckpoint } from '@/lib/server/agents/guardian'
9
9
  import {
@@ -68,6 +68,7 @@ function notifyExecutionState(sessionId: string): void {
68
68
  }
69
69
 
70
70
  function emitStatus(run: SessionRunRecord, status: SessionRunStatus, extra?: Record<string, unknown>): void {
71
+ const { citations, retrievalTrace, ...eventExtra } = extra || {}
71
72
  appendPersistedRunEvent({
72
73
  runId: run.id,
73
74
  sessionId: run.sessionId,
@@ -78,6 +79,8 @@ function emitStatus(run: SessionRunRecord, status: SessionRunStatus, extra?: Rec
78
79
  phase: 'status',
79
80
  status,
80
81
  summary: run.resultPreview || run.error || undefined,
82
+ citations: citations as import('@/types').KnowledgeCitation[] | undefined,
83
+ retrievalTrace: (retrievalTrace as import('@/types').KnowledgeRetrievalTrace | undefined) || undefined,
81
84
  event: {
82
85
  t: 'md',
83
86
  text: JSON.stringify({
@@ -90,7 +93,7 @@ function emitStatus(run: SessionRunRecord, status: SessionRunStatus, extra?: Rec
90
93
  status,
91
94
  source: run.source,
92
95
  internal: run.internal,
93
- ...extra,
96
+ ...eventExtra,
94
97
  },
95
98
  }),
96
99
  },
@@ -268,6 +271,7 @@ export function enqueueTaskAttemptExecution(
268
271
  run.endedAt = Date.now()
269
272
  run.error = controller.signal.aborted ? (run.error || 'Cancelled') : result.error
270
273
  run.resultPreview = result.text?.slice(0, 280)
274
+ run.retrievalSummary = buildRetrievalSummary(result.citations)
271
275
  if (typeof result.inputTokens === 'number') run.totalInputTokens = result.inputTokens
272
276
  if (typeof result.outputTokens === 'number') run.totalOutputTokens = result.outputTokens
273
277
  if (typeof result.estimatedCost === 'number') run.estimatedCost = result.estimatedCost
@@ -275,6 +279,8 @@ export function enqueueTaskAttemptExecution(
275
279
  emitStatus(run, run.status, {
276
280
  hasText: !!result.text,
277
281
  error: run.error || null,
282
+ citations: result.citations,
283
+ retrievalTrace: result.retrievalTrace,
278
284
  })
279
285
  return result
280
286
  } catch (err: unknown) {
@@ -0,0 +1,159 @@
1
+ import fs from 'fs'
2
+ import path from 'path'
3
+ import * as cheerio from 'cheerio'
4
+
5
/**
 * Lower-cased file extensions (including the leading dot) that are imported
 * as UTF-8 text. Grouped by category for readability; membership is all that
 * matters.
 */
const TEXT_EXTS = new Set<string>([
  // Prose and tabular / structured data
  '.txt', '.md', '.markdown',
  '.csv', '.tsv',
  '.json', '.jsonl',
  // Markup and configuration formats
  '.html', '.htm', '.xml',
  '.yaml', '.yml', '.toml', '.ini', '.cfg',
  // Source code
  '.js', '.ts', '.tsx', '.jsx',
  '.py', '.go', '.rs', '.java',
  '.c', '.cpp', '.h',
  '.rb', '.php',
  '.sh', '.bash', '.zsh',
  '.sql', '.r', '.swift', '.kt',
  // Environment, logs and ignore files
  '.env', '.log', '.conf', '.properties',
  '.gitignore', '.dockerignore',
])

/** Largest raw payload, in bytes, accepted for a knowledge import (10 MiB). */
export const MAX_KNOWLEDGE_IMPORT_BYTES = 10 * 1024 * 1024

/** Largest number of characters kept after normalization before truncating. */
export const MAX_KNOWLEDGE_CONTENT_CHARS = 500_000
15
+
16
/**
 * Reports whether a file should be imported as plain text.
 *
 * @param filename File name or path; only its extension is inspected.
 * @returns `true` when the lower-cased extension is listed in `TEXT_EXTS`,
 *   or when `path.extname` yields no extension at all.
 */
export function isKnowledgeTextFile(filename: string): boolean {
  const extension = path.extname(filename).toLowerCase()
  // Extension-less names are treated as text.
  if (extension === '') return true
  return TEXT_EXTS.has(extension)
}
20
+
21
/**
 * Builds a human-readable title from a file name.
 *
 * The extension is dropped, runs of `-`/`_` become single spaces, a space is
 * inserted at lower-to-upper camelCase boundaries, and the first character of
 * every word is upper-cased.
 *
 * @param filename File name or path to derive the title from.
 * @returns The derived title, or `'Knowledge Source'` when nothing is left
 *   after trimming.
 */
export function deriveKnowledgeTitle(filename: string): string {
  const stem = path.basename(filename, path.extname(filename))
  const withSpaces = stem.replace(/[-_]+/g, ' ')
  const camelSplit = withSpaces.replace(/([a-z])([A-Z])/g, '$1 $2')
  const titled = camelSplit
    .replace(/\b\w/g, (letter) => letter.toUpperCase())
    .trim()

  if (titled) return titled
  return 'Knowledge Source'
}
29
+
30
/**
 * Normalizes imported text before it is stored.
 *
 * Strips a leading byte-order mark, converts CRLF line endings to LF, trims
 * surrounding whitespace, and truncates anything longer than
 * `MAX_KNOWLEDGE_CONTENT_CHARS`, appending a truncation marker.
 *
 * @param content Raw text; falsy values are treated as the empty string.
 * @returns The normalized (and possibly truncated) text.
 */
function normalizeKnowledgeContent(content: string): string {
  const normalized = String(content || '')
    .replace(/^\uFEFF/, '')
    .replace(/\r\n/g, '\n')
    .trim()

  if (normalized.length <= MAX_KNOWLEDGE_CONTENT_CHARS) return normalized

  // `slice` counts UTF-16 code units. If the cut would land between the two
  // halves of a surrogate pair, back off by one unit so the kept text never
  // ends in a lone high surrogate.
  let cutIndex = MAX_KNOWLEDGE_CONTENT_CHARS
  const lastKeptUnit = normalized.charCodeAt(cutIndex - 1)
  if (lastKeptUnit >= 0xd800 && lastKeptUnit <= 0xdbff) cutIndex -= 1

  return `${normalized.slice(0, cutIndex)}\n\n[... truncated at 500k characters]`
}
39
+
40
/**
 * Best-effort text extraction from a PDF buffer.
 *
 * Loads `pdf-parse` dynamically and runs its default export on the buffer.
 * If the module cannot be loaded, has no callable default export, or parsing
 * throws, a placeholder description is returned instead of an error.
 *
 * @param buffer Raw PDF bytes.
 * @param filePathHint Optional location appended to the placeholder text.
 * @returns Normalized extracted text, or the normalized placeholder.
 */
async function extractPdfText(buffer: Buffer, filePathHint?: string): Promise<string> {
  type PdfParseFn = (input: Buffer) => Promise<{ text?: string }>

  try {
    const loaded = await import('pdf-parse') as unknown as { default?: PdfParseFn }
    const parse = loaded.default
    if (typeof parse !== 'function') {
      throw new Error('pdf-parse loader unavailable')
    }
    const parsed = await parse(buffer)
    return normalizeKnowledgeContent(parsed.text || '')
  } catch {
    // Deliberate fallback: any failure above yields a readable placeholder.
    const savedAt = filePathHint ? `\n\nSaved at: ${filePathHint}` : ''
    return normalizeKnowledgeContent(
      `[PDF document]\n\nUnable to extract text automatically.${savedAt}`,
    )
  }
}
55
+
56
/**
 * Reduces an HTML document to readable plain text.
 *
 * Removes script/style/noscript/svg/nav/footer/header elements, then takes the
 * text of the first `<main>`, else the first `<article>`, else `<body>`, else
 * `<html>`. Non-breaking spaces become regular spaces, blank lines are
 * dropped, and the remaining trimmed lines are joined with blank lines.
 *
 * @param html Raw HTML markup.
 * @returns The trimmed `<title>` text (or `null` when empty) and the
 *   normalized body text.
 */
function htmlToReadableText(html: string): { title: string | null; content: string } {
  const $ = cheerio.load(html)
  $('script, style, noscript, svg, nav, footer, header').remove()

  const pageTitle = $('title').first().text().trim()

  // Pick the most specific content container available, in priority order.
  let container = $('html').first()
  for (const selector of ['main', 'article', 'body']) {
    const candidate = $(selector).first()
    if (candidate.length) {
      container = candidate
      break
    }
  }

  const paragraphs: string[] = []
  for (const rawLine of container.text().replace(/\u00a0/g, ' ').split('\n')) {
    const line = rawLine.trim()
    if (line) paragraphs.push(line)
  }

  return {
    title: pageTitle || null,
    content: normalizeKnowledgeContent(paragraphs.join('\n\n')),
  }
}
82
+
83
/**
 * Converts an in-memory file into normalized knowledge text.
 *
 * @param buffer Raw file bytes.
 * @param filename Name used to decide how the bytes are interpreted.
 * @param filePathHint Optional location mentioned in placeholder output.
 * @returns `''` for an empty buffer; extracted text for `.pdf`; the UTF-8
 *   decoded content for text files; otherwise a `[Binary file: …]` placeholder.
 * @throws Error when the buffer exceeds `MAX_KNOWLEDGE_IMPORT_BYTES`.
 */
export async function extractKnowledgeTextFromBuffer(
  buffer: Buffer,
  filename: string,
  filePathHint?: string,
): Promise<string> {
  const byteCount = buffer.length
  if (byteCount === 0) return ''
  if (byteCount > MAX_KNOWLEDGE_IMPORT_BYTES) {
    throw new Error('File too large. Maximum 10MB.')
  }

  const isPdf = path.extname(filename).toLowerCase() === '.pdf'
  if (isPdf) return extractPdfText(buffer, filePathHint)

  const rawText = isKnowledgeTextFile(filename)
    ? buffer.toString('utf-8')
    : `[Binary file: ${filename}]${filePathHint ? `\n\nSaved at: ${filePathHint}` : ''}`
  return normalizeKnowledgeContent(rawText)
}
106
+
107
/**
 * Reads a file from disk and converts it into normalized knowledge text.
 *
 * The file size is checked with `stat` before reading so that an oversized
 * regular file is rejected without first being loaded into memory.
 *
 * @param filePath Path of the file to read; also passed on as the path hint.
 * @param filename Optional display name; defaults to the basename of `filePath`.
 * @returns The text produced by `extractKnowledgeTextFromBuffer`.
 * @throws Error when the file exceeds `MAX_KNOWLEDGE_IMPORT_BYTES`, or when
 *   the file cannot be stat'ed or read.
 */
export async function extractKnowledgeTextFromFile(filePath: string, filename?: string): Promise<string> {
  const stats = await fs.promises.stat(filePath)
  if (stats.isFile() && stats.size > MAX_KNOWLEDGE_IMPORT_BYTES) {
    // Same message the buffer-level check raises, just raised before the read.
    throw new Error('File too large. Maximum 10MB.')
  }

  const buffer = await fs.promises.readFile(filePath)
  return extractKnowledgeTextFromBuffer(buffer, filename || path.basename(filePath), filePath)
}
111
+
112
/**
 * Reads a response body into memory, giving up as soon as more than
 * `maxBytes` have been received.
 */
async function readResponseBodyCapped(response: Response, maxBytes: number): Promise<Buffer> {
  const tooLarge = (): Error => new Error('Remote document is too large. Maximum 10MB.')

  if (!response.body) {
    const whole = Buffer.from(await response.arrayBuffer())
    if (whole.length > maxBytes) throw tooLarge()
    return whole
  }

  const reader = response.body.getReader()
  const chunks: Uint8Array[] = []
  let received = 0
  for (;;) {
    const step = await reader.read()
    if (step.done) break
    received += step.value.byteLength
    if (received > maxBytes) {
      // Stop the transfer; a failure to cancel must not mask the size error.
      await reader.cancel().catch(() => undefined)
      throw tooLarge()
    }
    chunks.push(step.value)
  }
  return Buffer.concat(chunks, received)
}

/**
 * Downloads a URL and converts the response into normalized knowledge text.
 *
 * PDFs (by content type or by a `.pdf` URL path) go through `extractPdfText`;
 * HTML (by content type or by an `<html>`/`<body>` tag in the body) is reduced
 * with `htmlToReadableText`; anything else is treated as plain text.
 *
 * The 10MB limit is enforced on the bytes actually received, not only on the
 * `Content-Length` header, so chunked or mis-declared responses cannot bypass it.
 *
 * NOTE(review): the URL is fetched as given; if it can come from untrusted
 * input, callers presumably need to restrict internal/private targets — confirm.
 *
 * @param sourceUrl Absolute URL to fetch.
 * @returns The page title (HTML only, else `null`), the normalized content,
 *   and the response `content-type` header (or `null`).
 * @throws Error when the response status is not OK or the document is too large.
 */
export async function extractKnowledgeTextFromUrl(sourceUrl: string): Promise<{
  title: string | null
  content: string
  contentType: string | null
}> {
  const response = await fetch(sourceUrl, {
    headers: {
      'user-agent': 'SwarmClaw/knowledge-import',
      accept: 'text/html, text/plain, application/json, application/pdf, */*',
    },
  })

  if (!response.ok) {
    throw new Error(`URL fetch failed (${response.status})`)
  }

  const contentType = response.headers.get('content-type')
  // Cheap early rejection when the server declares an oversized body.
  const contentLength = Number.parseInt(response.headers.get('content-length') || '', 10)
  if (Number.isFinite(contentLength) && contentLength > MAX_KNOWLEDGE_IMPORT_BYTES) {
    throw new Error('Remote document is too large. Maximum 10MB.')
  }

  const body = await readResponseBodyCapped(response, MAX_KNOWLEDGE_IMPORT_BYTES)

  // Keep the original raw-suffix check, and also look at the URL path alone so
  // that `…/file.pdf?download=1` or `…/file.pdf#page=2` is recognized.
  let urlLooksLikePdf = sourceUrl.toLowerCase().endsWith('.pdf')
  if (!urlLooksLikePdf) {
    try {
      urlLooksLikePdf = new URL(sourceUrl).pathname.toLowerCase().endsWith('.pdf')
    } catch {
      // Not parseable as an absolute URL; the raw suffix check stands.
    }
  }

  if ((contentType || '').includes('application/pdf') || urlLooksLikePdf) {
    return {
      title: null,
      content: await extractPdfText(body, sourceUrl),
      contentType,
    }
  }

  // Decode as UTF-8 (BOM stripped, invalid sequences replaced), as `response.text()` does.
  const text = new TextDecoder('utf-8').decode(body)
  const looksLikeHtml = (contentType || '').includes('text/html') || /<html[\s>]|<body[\s>]/i.test(text)
  if (looksLikeHtml) {
    const parsed = htmlToReadableText(text)
    return {
      title: parsed.title,
      content: parsed.content,
      contentType,
    }
  }

  return {
    title: null,
    content: normalizeKnowledgeContent(text),
    contentType,
  }
}