@swarmclawai/swarmclaw 1.3.4 → 1.3.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +20 -76
- package/package.json +3 -2
- package/skills/swarmclaw.md +17 -0
- package/src/app/api/agents/[id]/dream/route.ts +45 -0
- package/src/app/api/knowledge/[id]/route.ts +48 -49
- package/src/app/api/knowledge/hygiene/route.ts +13 -0
- package/src/app/api/knowledge/route.ts +70 -42
- package/src/app/api/knowledge/sources/[id]/archive/route.ts +15 -0
- package/src/app/api/knowledge/sources/[id]/restore/route.ts +10 -0
- package/src/app/api/knowledge/sources/[id]/route.ts +1 -0
- package/src/app/api/knowledge/sources/[id]/supersede/route.ts +26 -0
- package/src/app/api/knowledge/sources/[id]/sync/route.ts +17 -0
- package/src/app/api/knowledge/sources/route.ts +1 -0
- package/src/app/api/knowledge/upload/route.ts +3 -51
- package/src/app/api/memory/dream/[id]/route.ts +19 -0
- package/src/app/api/memory/dream/route.ts +34 -0
- package/src/app/knowledge/layout.tsx +1 -1
- package/src/app/knowledge/page.tsx +2 -22
- package/src/app/protocols/page.tsx +21 -2
- package/src/cli/index.js +16 -0
- package/src/cli/spec.js +5 -0
- package/src/components/agents/agent-sheet.tsx +65 -0
- package/src/components/chat/message-bubble.tsx +10 -0
- package/src/components/knowledge/grounding-panel.tsx +99 -0
- package/src/components/knowledge/knowledge-detail.tsx +402 -0
- package/src/components/knowledge/knowledge-list.tsx +351 -126
- package/src/components/knowledge/knowledge-sheet.tsx +208 -119
- package/src/components/memory/dream-history.tsx +155 -0
- package/src/components/memory/memory-card.tsx +7 -0
- package/src/components/memory/memory-detail.tsx +46 -0
- package/src/components/runs/run-list.tsx +23 -0
- package/src/lib/server/api-routes.test.ts +43 -2
- package/src/lib/server/chat-execution/chat-execution-disabled.test.ts +14 -31
- package/src/lib/server/chat-execution/chat-execution-eval-history.test.ts +11 -34
- package/src/lib/server/chat-execution/chat-execution-grounding.test.ts +108 -0
- package/src/lib/server/chat-execution/chat-execution-session-sync.test.ts +35 -36
- package/src/lib/server/chat-execution/chat-execution-types.ts +8 -1
- package/src/lib/server/chat-execution/chat-execution.ts +1 -0
- package/src/lib/server/chat-execution/chat-turn-finalization.ts +21 -1
- package/src/lib/server/chat-execution/chat-turn-stream-execution.ts +6 -1
- package/src/lib/server/chat-execution/post-stream-finalization.ts +15 -3
- package/src/lib/server/chat-execution/prompt-sections.ts +29 -3
- package/src/lib/server/chat-execution/stream-agent-chat.ts +6 -1
- package/src/lib/server/execution-engine/task-attempt.ts +8 -2
- package/src/lib/server/knowledge-import.ts +159 -0
- package/src/lib/server/knowledge-sources.test.ts +261 -0
- package/src/lib/server/knowledge-sources.ts +1284 -0
- package/src/lib/server/memory/dream-cycles.ts +49 -0
- package/src/lib/server/memory/dream-idle-callback.ts +38 -0
- package/src/lib/server/memory/dream-service.ts +315 -0
- package/src/lib/server/memory/memory-db.ts +37 -2
- package/src/lib/server/protocols/protocol-agent-turn.ts +7 -0
- package/src/lib/server/protocols/protocol-run-lifecycle.ts +19 -6
- package/src/lib/server/protocols/protocol-service.test.ts +99 -0
- package/src/lib/server/protocols/protocol-step-helpers.ts +7 -1
- package/src/lib/server/protocols/protocol-step-processors.ts +16 -3
- package/src/lib/server/protocols/protocol-types.ts +4 -0
- package/src/lib/server/runtime/daemon-state/core.ts +6 -1
- package/src/lib/server/runtime/run-ledger.test.ts +120 -0
- package/src/lib/server/runtime/run-ledger.ts +27 -1
- package/src/lib/server/runtime/session-run-manager/drain.ts +5 -0
- package/src/lib/server/runtime/session-run-manager/state.ts +19 -2
- package/src/lib/server/storage-normalization.ts +5 -0
- package/src/lib/server/storage.ts +15 -0
- package/src/lib/server/test-utils/run-with-temp-data-dir.ts +15 -2
- package/src/stores/slices/ui-slice.ts +4 -0
- package/src/types/agent.ts +7 -0
- package/src/types/dream.ts +45 -0
- package/src/types/index.ts +1 -0
- package/src/types/message.ts +3 -0
- package/src/types/misc.ts +131 -0
- package/src/types/protocol.ts +4 -0
- package/src/types/run.ts +4 -1
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import { CONTEXT_OVERFLOW_RE } from '@/lib/providers/error-classification'
|
|
2
|
-
import type { ProviderType } from '@/types'
|
|
2
|
+
import type { KnowledgeRetrievalTrace, ProviderType } from '@/types'
|
|
3
3
|
import { getEnabledCapabilityIds } from '@/lib/capability-selection'
|
|
4
4
|
import { isLocalOpenClawEndpoint } from '@/lib/openclaw/openclaw-endpoint'
|
|
5
5
|
import { streamAgentChat } from '@/lib/server/chat-execution/stream-agent-chat'
|
|
@@ -42,6 +42,7 @@ export interface ExecutedPreparedChatTurn {
|
|
|
42
42
|
outputTokens: number
|
|
43
43
|
received: boolean
|
|
44
44
|
}
|
|
45
|
+
knowledgeRetrievalTrace?: KnowledgeRetrievalTrace | null
|
|
45
46
|
}
|
|
46
47
|
|
|
47
48
|
export async function executePreparedChatTurn(params: {
|
|
@@ -90,6 +91,7 @@ export async function executePreparedChatTurn(params: {
|
|
|
90
91
|
let responseCacheHit = false
|
|
91
92
|
let responseCacheInput: LlmResponseCacheKeyInput | null = null
|
|
92
93
|
let durationMs = 0
|
|
94
|
+
let knowledgeRetrievalTrace: KnowledgeRetrievalTrace | null = null
|
|
93
95
|
const startTs = Date.now()
|
|
94
96
|
const endLlmPerf = perf.start('chat-execution', 'llm-round-trip', {
|
|
95
97
|
sessionId,
|
|
@@ -111,6 +113,7 @@ export async function executePreparedChatTurn(params: {
|
|
|
111
113
|
responseCacheHit,
|
|
112
114
|
durationMs,
|
|
113
115
|
directUsage,
|
|
116
|
+
knowledgeRetrievalTrace: null,
|
|
114
117
|
}
|
|
115
118
|
}
|
|
116
119
|
|
|
@@ -157,6 +160,7 @@ export async function executePreparedChatTurn(params: {
|
|
|
157
160
|
promptMode,
|
|
158
161
|
})
|
|
159
162
|
fullResponse = result.finalResponse || result.fullText
|
|
163
|
+
knowledgeRetrievalTrace = result.knowledgeRetrievalTrace || null
|
|
160
164
|
} else {
|
|
161
165
|
let directHistorySnapshot = isAutoRunNoHistory
|
|
162
166
|
? (heartbeatLightContext ? [] : getSessionMessages(sessionId).slice(-6))
|
|
@@ -298,5 +302,6 @@ export async function executePreparedChatTurn(params: {
|
|
|
298
302
|
responseCacheHit,
|
|
299
303
|
durationMs,
|
|
300
304
|
directUsage,
|
|
305
|
+
knowledgeRetrievalTrace,
|
|
301
306
|
}
|
|
302
307
|
}
|
|
@@ -5,7 +5,7 @@
|
|
|
5
5
|
* usage recording, forced external service summary, capability hooks,
|
|
6
6
|
* and OpenClaw sync.
|
|
7
7
|
*/
|
|
8
|
-
import type { Session, UsageRecord } from '@/types'
|
|
8
|
+
import type { KnowledgeRetrievalTrace, Session, UsageRecord } from '@/types'
|
|
9
9
|
import { log } from '@/lib/server/logger'
|
|
10
10
|
import type { ChatTurnState } from '@/lib/server/chat-execution/chat-turn-state'
|
|
11
11
|
|
|
@@ -51,6 +51,7 @@ export interface PostStreamResult {
|
|
|
51
51
|
fullText: string
|
|
52
52
|
finalResponse: string
|
|
53
53
|
toolEvents: import('@/types').MessageToolEvent[]
|
|
54
|
+
knowledgeRetrievalTrace?: KnowledgeRetrievalTrace | null
|
|
54
55
|
}
|
|
55
56
|
|
|
56
57
|
export interface FinalizeStreamResultOpts {
|
|
@@ -70,6 +71,7 @@ export interface FinalizeStreamResultOpts {
|
|
|
70
71
|
cleanup: () => Promise<void>
|
|
71
72
|
runId: string
|
|
72
73
|
classification?: MessageClassification | null
|
|
74
|
+
knowledgeRetrievalTrace?: KnowledgeRetrievalTrace | null
|
|
73
75
|
}
|
|
74
76
|
|
|
75
77
|
export async function finalizeStreamResult(opts: FinalizeStreamResultOpts): Promise<PostStreamResult> {
|
|
@@ -138,7 +140,12 @@ export async function finalizeStreamResult(opts: FinalizeStreamResultOpts): Prom
|
|
|
138
140
|
const finalResponse = await resolveAndSummarize()
|
|
139
141
|
await emitLlmOutputHook(finalResponse)
|
|
140
142
|
await cleanup()
|
|
141
|
-
return {
|
|
143
|
+
return {
|
|
144
|
+
fullText: state.fullText,
|
|
145
|
+
finalResponse,
|
|
146
|
+
toolEvents: state.streamedToolEvents,
|
|
147
|
+
knowledgeRetrievalTrace: opts.knowledgeRetrievalTrace || null,
|
|
148
|
+
}
|
|
142
149
|
}
|
|
143
150
|
|
|
144
151
|
// Strip leaked classification JSON from model output (e.g. `{ "isDeliverableTask": true, ... }`)
|
|
@@ -212,5 +219,10 @@ export async function finalizeStreamResult(opts: FinalizeStreamResultOpts): Prom
|
|
|
212
219
|
|
|
213
220
|
await cleanup()
|
|
214
221
|
|
|
215
|
-
return {
|
|
222
|
+
return {
|
|
223
|
+
fullText: state.fullText,
|
|
224
|
+
finalResponse,
|
|
225
|
+
toolEvents: state.streamedToolEvents,
|
|
226
|
+
knowledgeRetrievalTrace: opts.knowledgeRetrievalTrace || null,
|
|
227
|
+
}
|
|
216
228
|
}
|
|
@@ -8,7 +8,7 @@
|
|
|
8
8
|
|
|
9
9
|
import fs from 'node:fs'
|
|
10
10
|
import path from 'node:path'
|
|
11
|
-
import type { Session, Agent } from '@/types'
|
|
11
|
+
import type { KnowledgeRetrievalTrace, Session, Agent } from '@/types'
|
|
12
12
|
import type { PromptMode } from '@/lib/server/chat-execution/prompt-mode'
|
|
13
13
|
import type { MessageClassification } from '@/lib/server/chat-execution/message-classifier'
|
|
14
14
|
import type { ActiveProjectContext } from '@/lib/server/project-context'
|
|
@@ -428,6 +428,7 @@ export function buildSuggestionsSection(
|
|
|
428
428
|
export interface ProactiveMemoryResult {
|
|
429
429
|
section: string | null
|
|
430
430
|
injectedIds: Record<string, number>
|
|
431
|
+
knowledgeTrace?: KnowledgeRetrievalTrace | null
|
|
431
432
|
}
|
|
432
433
|
|
|
433
434
|
export async function buildProactiveMemorySection(
|
|
@@ -438,22 +439,28 @@ export async function buildProactiveMemorySection(
|
|
|
438
439
|
isMinimalPrompt: boolean,
|
|
439
440
|
currentThreadRecallRequest: boolean,
|
|
440
441
|
): Promise<ProactiveMemoryResult> {
|
|
441
|
-
const noResult: ProactiveMemoryResult = { section: null, injectedIds: {} }
|
|
442
|
+
const noResult: ProactiveMemoryResult = { section: null, injectedIds: {}, knowledgeTrace: null }
|
|
442
443
|
if (isMinimalPrompt || !session.agentId || currentThreadRecallRequest || message.length <= 12) return noResult
|
|
443
444
|
if (!agent?.proactiveMemory) return noResult
|
|
444
445
|
try {
|
|
445
446
|
const { getMemoryDb } = await import('@/lib/server/memory/memory-db')
|
|
446
447
|
const { buildSessionMemoryScopeFilter } = await import('@/lib/server/memory/session-memory-scope')
|
|
448
|
+
const { buildKnowledgeRetrievalTrace } = await import('@/lib/server/knowledge-sources')
|
|
447
449
|
const memDb = getMemoryDb()
|
|
448
450
|
const recalled = memDb.search(message, session.agentId, {
|
|
449
451
|
scope: buildSessionMemoryScopeFilter(session, agent.memoryScopeMode || null, activeProjectRoot),
|
|
450
452
|
})
|
|
453
|
+
const knowledgeTrace = await buildKnowledgeRetrievalTrace({
|
|
454
|
+
query: message,
|
|
455
|
+
viewerAgentId: session.agentId,
|
|
456
|
+
})
|
|
451
457
|
|
|
452
458
|
// Dedup: skip memories already injected 2+ times in this session
|
|
453
459
|
const priorCounts = session.injectedMemoryIds || {}
|
|
454
460
|
const filtered = recalled.filter((entry) => (priorCounts[entry.id] || 0) < 2)
|
|
455
461
|
|
|
456
462
|
const topRecalled = filtered.slice(0, 3)
|
|
463
|
+
const sections: string[] = []
|
|
457
464
|
if (topRecalled.length > 0) {
|
|
458
465
|
// Track injection counts
|
|
459
466
|
const updatedCounts: Record<string, number> = { ...priorCounts }
|
|
@@ -464,9 +471,28 @@ export async function buildProactiveMemorySection(
|
|
|
464
471
|
const recalledLines = topRecalled.map((entry) =>
|
|
465
472
|
`- ${entry.abstract || entry.content.slice(0, 300)}`,
|
|
466
473
|
)
|
|
474
|
+
sections.push(`## Recalled Context\nRelevant memories from previous interactions:\n${recalledLines.join('\n')}`)
|
|
475
|
+
if (knowledgeTrace?.hits.length) {
|
|
476
|
+
const groundingLines = knowledgeTrace.hits.map((hit) =>
|
|
477
|
+
`- [${hit.chunkIndex + 1}/${hit.chunkCount}] ${hit.sourceTitle}: ${hit.snippet}`,
|
|
478
|
+
)
|
|
479
|
+
sections.push(`## Source Grounding\nSource-backed knowledge retrieved for this turn:\n${groundingLines.join('\n')}`)
|
|
480
|
+
}
|
|
467
481
|
return {
|
|
468
|
-
section:
|
|
482
|
+
section: sections.join('\n\n'),
|
|
469
483
|
injectedIds: updatedCounts,
|
|
484
|
+
knowledgeTrace,
|
|
485
|
+
}
|
|
486
|
+
}
|
|
487
|
+
|
|
488
|
+
if (knowledgeTrace?.hits.length) {
|
|
489
|
+
const groundingLines = knowledgeTrace.hits.map((hit) =>
|
|
490
|
+
`- [${hit.chunkIndex + 1}/${hit.chunkCount}] ${hit.sourceTitle}: ${hit.snippet}`,
|
|
491
|
+
)
|
|
492
|
+
return {
|
|
493
|
+
section: `## Source Grounding\nSource-backed knowledge retrieved for this turn:\n${groundingLines.join('\n')}`,
|
|
494
|
+
injectedIds: priorCounts,
|
|
495
|
+
knowledgeTrace,
|
|
470
496
|
}
|
|
471
497
|
}
|
|
472
498
|
} catch { /* non-critical */ }
|
|
@@ -36,7 +36,7 @@ import { log } from '@/lib/server/logger'
|
|
|
36
36
|
import { logExecution } from '@/lib/server/execution-log'
|
|
37
37
|
import { buildCurrentDateTimePromptContext } from '@/lib/server/prompt-runtime-context'
|
|
38
38
|
import { expandExtensionIds } from '@/lib/server/tool-aliases'
|
|
39
|
-
import type { ExecutionBrief, Session, Message } from '@/types'
|
|
39
|
+
import type { ExecutionBrief, KnowledgeRetrievalTrace, Session, Message } from '@/types'
|
|
40
40
|
import { getEnabledCapabilityIds } from '@/lib/capability-selection'
|
|
41
41
|
import { enqueueSystemEvent } from '@/lib/server/runtime/system-events'
|
|
42
42
|
import { resolveActiveProjectContext } from '@/lib/server/project-context'
|
|
@@ -199,6 +199,7 @@ export interface StreamAgentChatResult {
|
|
|
199
199
|
finalResponse: string
|
|
200
200
|
/** Tool events emitted during the streamed run. */
|
|
201
201
|
toolEvents: import('@/types').MessageToolEvent[]
|
|
202
|
+
knowledgeRetrievalTrace?: KnowledgeRetrievalTrace | null
|
|
202
203
|
}
|
|
203
204
|
|
|
204
205
|
type LangChainContentPart =
|
|
@@ -267,6 +268,7 @@ async function streamAgentChatCore(opts: StreamAgentChatOpts): Promise<StreamAge
|
|
|
267
268
|
preferMinimalPrompt: lightweightDirectChat,
|
|
268
269
|
})
|
|
269
270
|
const isMinimalPrompt = promptMode === 'minimal'
|
|
271
|
+
let knowledgeRetrievalTrace: KnowledgeRetrievalTrace | null = null
|
|
270
272
|
|
|
271
273
|
// Resolve agent's thinking level for provider-native params
|
|
272
274
|
let agentThinkingLevel: 'minimal' | 'low' | 'medium' | 'high' | undefined
|
|
@@ -309,6 +311,7 @@ async function streamAgentChatCore(opts: StreamAgentChatOpts): Promise<StreamAge
|
|
|
309
311
|
fullText: requestedToolPreflightResponse,
|
|
310
312
|
finalResponse: requestedToolPreflightResponse,
|
|
311
313
|
toolEvents: [],
|
|
314
|
+
knowledgeRetrievalTrace: null,
|
|
312
315
|
}
|
|
313
316
|
}
|
|
314
317
|
const runtime = loadRuntimeSettings()
|
|
@@ -490,6 +493,7 @@ async function streamAgentChatCore(opts: StreamAgentChatOpts): Promise<StreamAge
|
|
|
490
493
|
isMinimalPrompt, currentThreadRecallRequest,
|
|
491
494
|
)
|
|
492
495
|
if (memoryResult.section) promptParts.push(memoryResult.section)
|
|
496
|
+
knowledgeRetrievalTrace = memoryResult.knowledgeTrace || null
|
|
493
497
|
// Persist injection dedup counts so repeated memories are suppressed
|
|
494
498
|
if (Object.keys(memoryResult.injectedIds).length > 0) {
|
|
495
499
|
session.injectedMemoryIds = memoryResult.injectedIds
|
|
@@ -1269,5 +1273,6 @@ async function streamAgentChatCore(opts: StreamAgentChatOpts): Promise<StreamAge
|
|
|
1269
1273
|
cleanup,
|
|
1270
1274
|
runId,
|
|
1271
1275
|
classification,
|
|
1276
|
+
knowledgeRetrievalTrace,
|
|
1272
1277
|
})
|
|
1273
1278
|
}
|
|
@@ -3,7 +3,7 @@ import { WORKSPACE_DIR } from '@/lib/server/data-dir'
|
|
|
3
3
|
import { log } from '@/lib/server/logger'
|
|
4
4
|
import { loadSettings } from '@/lib/server/settings/settings-repository'
|
|
5
5
|
import { loadSessions } from '@/lib/server/sessions/session-repository'
|
|
6
|
-
import { appendPersistedRunEvent, persistRun } from '@/lib/server/runtime/run-ledger'
|
|
6
|
+
import { appendPersistedRunEvent, buildRetrievalSummary, persistRun } from '@/lib/server/runtime/run-ledger'
|
|
7
7
|
import { notify } from '@/lib/server/ws-hub'
|
|
8
8
|
import { captureGuardianCheckpoint } from '@/lib/server/agents/guardian'
|
|
9
9
|
import {
|
|
@@ -68,6 +68,7 @@ function notifyExecutionState(sessionId: string): void {
|
|
|
68
68
|
}
|
|
69
69
|
|
|
70
70
|
function emitStatus(run: SessionRunRecord, status: SessionRunStatus, extra?: Record<string, unknown>): void {
|
|
71
|
+
const { citations, retrievalTrace, ...eventExtra } = extra || {}
|
|
71
72
|
appendPersistedRunEvent({
|
|
72
73
|
runId: run.id,
|
|
73
74
|
sessionId: run.sessionId,
|
|
@@ -78,6 +79,8 @@ function emitStatus(run: SessionRunRecord, status: SessionRunStatus, extra?: Rec
|
|
|
78
79
|
phase: 'status',
|
|
79
80
|
status,
|
|
80
81
|
summary: run.resultPreview || run.error || undefined,
|
|
82
|
+
citations: citations as import('@/types').KnowledgeCitation[] | undefined,
|
|
83
|
+
retrievalTrace: (retrievalTrace as import('@/types').KnowledgeRetrievalTrace | undefined) || undefined,
|
|
81
84
|
event: {
|
|
82
85
|
t: 'md',
|
|
83
86
|
text: JSON.stringify({
|
|
@@ -90,7 +93,7 @@ function emitStatus(run: SessionRunRecord, status: SessionRunStatus, extra?: Rec
|
|
|
90
93
|
status,
|
|
91
94
|
source: run.source,
|
|
92
95
|
internal: run.internal,
|
|
93
|
-
...
|
|
96
|
+
...eventExtra,
|
|
94
97
|
},
|
|
95
98
|
}),
|
|
96
99
|
},
|
|
@@ -268,6 +271,7 @@ export function enqueueTaskAttemptExecution(
|
|
|
268
271
|
run.endedAt = Date.now()
|
|
269
272
|
run.error = controller.signal.aborted ? (run.error || 'Cancelled') : result.error
|
|
270
273
|
run.resultPreview = result.text?.slice(0, 280)
|
|
274
|
+
run.retrievalSummary = buildRetrievalSummary(result.citations)
|
|
271
275
|
if (typeof result.inputTokens === 'number') run.totalInputTokens = result.inputTokens
|
|
272
276
|
if (typeof result.outputTokens === 'number') run.totalOutputTokens = result.outputTokens
|
|
273
277
|
if (typeof result.estimatedCost === 'number') run.estimatedCost = result.estimatedCost
|
|
@@ -275,6 +279,8 @@ export function enqueueTaskAttemptExecution(
|
|
|
275
279
|
emitStatus(run, run.status, {
|
|
276
280
|
hasText: !!result.text,
|
|
277
281
|
error: run.error || null,
|
|
282
|
+
citations: result.citations,
|
|
283
|
+
retrievalTrace: result.retrievalTrace,
|
|
278
284
|
})
|
|
279
285
|
return result
|
|
280
286
|
} catch (err: unknown) {
|
|
@@ -0,0 +1,159 @@
|
|
|
1
|
+
import fs from 'fs'
|
|
2
|
+
import path from 'path'
|
|
3
|
+
import * as cheerio from 'cheerio'
|
|
4
|
+
|
|
5
|
+
/**
 * Lower-cased file extensions (with leading dot) whose bytes are imported by
 * decoding them as UTF-8 text.
 */
const TEXT_EXTS = new Set<string>([
  // Prose and structured data
  '.txt', '.md', '.markdown',
  '.csv', '.tsv', '.json', '.jsonl',
  // Markup and configuration
  '.html', '.htm', '.xml',
  '.yaml', '.yml', '.toml', '.ini', '.cfg',
  // Source code
  '.js', '.ts', '.tsx', '.jsx',
  '.py', '.go', '.rs', '.java', '.c', '.cpp', '.h',
  '.rb', '.php', '.sh', '.bash', '.zsh',
  '.sql', '.r', '.swift', '.kt',
  // Environment, logs and ignore files
  '.env', '.log', '.conf', '.properties',
  '.gitignore', '.dockerignore',
])

/** Largest payload, in bytes, accepted for a single knowledge import (10 MiB). */
export const MAX_KNOWLEDGE_IMPORT_BYTES = 10 * 1024 ** 2

/** Largest number of characters kept from a single imported document. */
export const MAX_KNOWLEDGE_CONTENT_CHARS = 500_000
|
|
15
|
+
|
|
16
|
+
/**
 * Reports whether a file should be imported by decoding its bytes as text.
 *
 * The decision is made purely from the file name: a name with no extension is
 * accepted, otherwise the lower-cased extension must be listed in `TEXT_EXTS`.
 *
 * @param filename File name or path; only its extension is inspected.
 * @returns `true` when the file is treated as text.
 */
export function isKnowledgeTextFile(filename: string): boolean {
  const extension = path.extname(filename).toLowerCase()
  // Names without any extension are always treated as text.
  if (extension === '') return true
  return TEXT_EXTS.has(extension)
}
|
|
20
|
+
|
|
21
|
+
/**
 * Builds a human-readable title from a file name.
 *
 * The extension is dropped, runs of `-`/`_` become spaces, camelCase
 * boundaries are split, whitespace runs are collapsed, and the first letter of
 * every word is upper-cased.
 *
 * Word detection is Unicode-aware: a letter is only capitalised when it is not
 * preceded by another letter, a digit, or an apostrophe. The previous
 * ASCII-only `\b\w` pattern capitalised mid-word after apostrophes and
 * non-ASCII letters ("don't" -> "Don'T", "résumé" -> "RéSumé").
 *
 * @param filename File name or path to derive the title from.
 * @returns The derived title, or `'Knowledge Source'` when nothing usable remains.
 */
export function deriveKnowledgeTitle(filename: string): string {
  const stem = path.basename(filename, path.extname(filename))
  const title = stem
    .replace(/[-_]+/g, ' ')
    // Split camelCase on any lower->upper letter transition, not just ASCII.
    .replace(/(\p{Ll})(\p{Lu})/gu, '$1 $2')
    // "a - b" would otherwise leave several consecutive spaces behind.
    .replace(/\s+/g, ' ')
    // Capitalise only true word starts (see the doc comment above).
    .replace(/(?<![\p{L}\p{N}'\u2019])\p{L}/gu, (char) => char.toUpperCase())
    .trim()
  return title || 'Knowledge Source'
}
|
|
29
|
+
|
|
30
|
+
/**
 * Normalises imported text before it is stored.
 *
 * - strips a leading byte-order mark,
 * - converts both `\r\n` and lone `\r` line endings to `\n`
 *   (previously only `\r\n` was handled, so CR-only files kept stray `\r`),
 * - trims surrounding whitespace,
 * - truncates to `MAX_KNOWLEDGE_CONTENT_CHARS` and appends a truncation marker.
 *
 * @param content Raw text; falsy values are treated as the empty string.
 * @returns The normalised, possibly truncated text.
 */
function normalizeKnowledgeContent(content: string): string {
  const normalized = String(content || '')
    .replace(/^\uFEFF/, '')
    .replace(/\r\n?/g, '\n')
    .trim()

  if (normalized.length <= MAX_KNOWLEDGE_CONTENT_CHARS) return normalized

  // Never cut between the two halves of a surrogate pair: a dangling high
  // surrogate is not valid text and turns into U+FFFD when encoded as UTF-8.
  let cut = MAX_KNOWLEDGE_CONTENT_CHARS
  const lastUnit = normalized.charCodeAt(cut - 1)
  if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) cut -= 1

  return `${normalized.slice(0, cut)}\n\n[... truncated at 500k characters]`
}
|
|
39
|
+
|
|
40
|
+
/**
 * Extracts the text of a PDF using the lazily imported `pdf-parse` module.
 *
 * This is best-effort: any failure (module missing, unexpected module shape,
 * parse error) is swallowed and a placeholder description is returned instead.
 *
 * @param buffer Raw PDF bytes.
 * @param filePathHint Optional location mentioned in the placeholder text.
 * @returns Normalised extracted text, or the normalised placeholder on failure.
 */
async function extractPdfText(buffer: Buffer, filePathHint?: string): Promise<string> {
  type PdfParseFn = (input: Buffer) => Promise<{ text?: string }>

  try {
    const loaded = await import('pdf-parse') as unknown as { default?: PdfParseFn }
    const parse = loaded.default
    if (typeof parse !== 'function') throw new Error('pdf-parse loader unavailable')

    const { text } = await parse(buffer)
    return normalizeKnowledgeContent(text || '')
  } catch {
    const savedAt = filePathHint ? `\n\nSaved at: ${filePathHint}` : ''
    return normalizeKnowledgeContent(`[PDF document]\n\nUnable to extract text automatically.${savedAt}`)
  }
}
|
|
55
|
+
|
|
56
|
+
/**
 * Reduces an HTML document to its `<title>` and readable body text.
 *
 * Non-content elements (scripts, styles, navigation, header/footer, SVG) are
 * removed, then text is taken from the first `<main>`, else the first
 * `<article>`, else `<body>`, else the whole document. Lines are trimmed,
 * empty lines are dropped, and the rest are joined as separate paragraphs.
 *
 * cheerio's `.text()` concatenates text nodes without any separator, so
 * explicit newlines are injected around block-level elements and in place of
 * `<br>`. Without this, adjacent blocks in minified HTML fuse together
 * (`<p>a</p><p>b</p>` used to yield "ab").
 *
 * @param html Raw HTML markup.
 * @returns The page title (or `null` when absent/empty) and the normalised text.
 */
function htmlToReadableText(html: string): { title: string | null; content: string } {
  const $ = cheerio.load(html)
  $('script, style, noscript, svg, nav, footer, header').remove()

  const title = $('title').first().text().trim() || null

  // Make block boundaries visible to .text(); surplus blank lines are removed
  // by the filter(Boolean) step below.
  $('br').replaceWith('\n')
  $('p, div, section, article, li, tr, td, th, h1, h2, h3, h4, h5, h6, blockquote, pre, dt, dd')
    .prepend('\n')
    .append('\n')

  const root = $('main').first().length
    ? $('main').first()
    : $('article').first().length
      ? $('article').first()
      : $('body').first().length
        ? $('body').first()
        : $('html').first()

  const text = root
    .text()
    .replace(/\u00a0/g, ' ')
    .split('\n')
    .map((line) => line.trim())
    .filter(Boolean)
    .join('\n\n')

  return {
    title,
    content: normalizeKnowledgeContent(text),
  }
}
|
|
82
|
+
|
|
83
|
+
/**
 * Converts an in-memory file into normalised knowledge text.
 *
 * Empty buffers yield an empty string. `.pdf` files go through PDF text
 * extraction, files recognised by `isKnowledgeTextFile` are decoded as UTF-8,
 * and everything else is replaced by a short "[Binary file: …]" placeholder.
 *
 * @param buffer File contents.
 * @param filename Name used to pick the extraction strategy.
 * @param filePathHint Optional location mentioned in placeholder text.
 * @returns The normalised text for the file.
 * @throws Error when the buffer exceeds `MAX_KNOWLEDGE_IMPORT_BYTES`.
 */
export async function extractKnowledgeTextFromBuffer(
  buffer: Buffer,
  filename: string,
  filePathHint?: string,
): Promise<string> {
  const byteLength = buffer.length
  if (byteLength === 0) return ''
  if (byteLength > MAX_KNOWLEDGE_IMPORT_BYTES) throw new Error('File too large. Maximum 10MB.')

  if (path.extname(filename).toLowerCase() === '.pdf') return extractPdfText(buffer, filePathHint)

  const body = isKnowledgeTextFile(filename)
    ? buffer.toString('utf-8')
    : `[Binary file: ${filename}]${filePathHint ? `\n\nSaved at: ${filePathHint}` : ''}`
  return normalizeKnowledgeContent(body)
}
|
|
106
|
+
|
|
107
|
+
/**
 * Reads a file from disk and converts it into normalised knowledge text.
 *
 * The file size is checked with `stat` before reading, so an oversized file is
 * rejected without first being loaded into memory. The buffer-level check in
 * `extractKnowledgeTextFromBuffer` still runs afterwards.
 *
 * @param filePath Path of the file to read.
 * @param filename Optional display name used to choose the extraction
 *   strategy; defaults to the base name of `filePath`.
 * @returns The normalised text for the file.
 * @throws Error when the file exceeds `MAX_KNOWLEDGE_IMPORT_BYTES`, or when
 *   the file cannot be stat'ed or read.
 */
export async function extractKnowledgeTextFromFile(filePath: string, filename?: string): Promise<string> {
  const { size } = await fs.promises.stat(filePath)
  if (size > MAX_KNOWLEDGE_IMPORT_BYTES) {
    throw new Error('File too large. Maximum 10MB.')
  }

  const buffer = await fs.promises.readFile(filePath)
  return extractKnowledgeTextFromBuffer(buffer, filename || path.basename(filePath), filePath)
}
|
|
111
|
+
|
|
112
|
+
/**
 * Reads a fetch response body into memory, aborting once it grows past
 * `maxBytes` so a missing or understated Content-Length cannot cause an
 * unbounded download.
 */
async function readKnowledgeResponseBody(response: Response, maxBytes: number): Promise<Buffer> {
  const tooLarge = 'Remote document is too large. Maximum 10MB.'
  const reader = response.body?.getReader()
  if (!reader) {
    // No readable stream exposed: fall back to buffering, then validate.
    const whole = Buffer.from(await response.arrayBuffer())
    if (whole.length > maxBytes) throw new Error(tooLarge)
    return whole
  }

  const chunks: Uint8Array[] = []
  let received = 0
  for (;;) {
    const step = await reader.read()
    if (step.done) break
    received += step.value.byteLength
    if (received > maxBytes) {
      // Stop the transfer; a failure to cancel must not mask the size error.
      await reader.cancel().catch(() => undefined)
      throw new Error(tooLarge)
    }
    chunks.push(step.value)
  }
  return Buffer.concat(chunks)
}

/**
 * Downloads a URL and converts the response into normalised knowledge text.
 *
 * PDFs (detected by content type or a `.pdf` URL path) go through PDF text
 * extraction, HTML is reduced to readable text plus its `<title>`, and any
 * other response is treated as plain text.
 *
 * Changes from the previous implementation:
 * - the 10MB cap is enforced while streaming the body, not only via the
 *   Content-Length header, which is absent for chunked responses;
 * - `.pdf` detection also looks at the URL path, so `…/doc.pdf?download=1`
 *   is recognised, and the content-type match is case-insensitive.
 *
 * NOTE(review): `sourceUrl` is fetched as given, with no timeout and no
 * restriction on host or scheme. If it can come from untrusted input, confirm
 * that callers guard against requests to internal addresses.
 *
 * @param sourceUrl Absolute URL of the document to import.
 * @returns The extracted title (HTML only), normalised content, and the raw
 *   Content-Type header value (or `null`).
 * @throws Error when the response status is not OK or the document exceeds
 *   `MAX_KNOWLEDGE_IMPORT_BYTES`.
 */
export async function extractKnowledgeTextFromUrl(sourceUrl: string): Promise<{
  title: string | null
  content: string
  contentType: string | null
}> {
  const response = await fetch(sourceUrl, {
    headers: {
      'user-agent': 'SwarmClaw/knowledge-import',
      accept: 'text/html, text/plain, application/json, application/pdf, */*',
    },
  })

  if (!response.ok) {
    throw new Error(`URL fetch failed (${response.status})`)
  }

  const contentType = response.headers.get('content-type')
  const contentLength = Number.parseInt(response.headers.get('content-length') || '', 10)
  if (Number.isFinite(contentLength) && contentLength > MAX_KNOWLEDGE_IMPORT_BYTES) {
    // Fast path: reject on the declared size and release the connection.
    await response.body?.cancel().catch(() => undefined)
    throw new Error('Remote document is too large. Maximum 10MB.')
  }

  const body = await readKnowledgeResponseBody(response, MAX_KNOWLEDGE_IMPORT_BYTES)

  // Header values are case-insensitive; compare on a lower-cased copy.
  const mediaType = (contentType || '').toLowerCase()

  // Look at the URL path so a query string or fragment does not hide ".pdf";
  // the raw-string check is kept for backward compatibility.
  let urlPath = sourceUrl
  try {
    urlPath = new URL(sourceUrl).pathname
  } catch {
    // Not parseable as an absolute URL: fall back to the raw string.
  }
  const looksLikePdf = mediaType.includes('application/pdf')
    || urlPath.toLowerCase().endsWith('.pdf')
    || sourceUrl.toLowerCase().endsWith('.pdf')

  if (looksLikePdf) {
    return {
      title: null,
      content: await extractPdfText(body, sourceUrl),
      contentType,
    }
  }

  // Response.text() always decodes as UTF-8 and drops a leading BOM;
  // TextDecoder's defaults do the same.
  const text = new TextDecoder('utf-8').decode(body)
  const looksLikeHtml = mediaType.includes('text/html') || /<html[\s>]|<body[\s>]/i.test(text)
  if (looksLikeHtml) {
    const parsed = htmlToReadableText(text)
    return {
      title: parsed.title,
      content: parsed.content,
      contentType,
    }
  }

  return {
    title: null,
    content: normalizeKnowledgeContent(text),
    contentType,
  }
}
|