@lota-sdk/core 0.1.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/infrastructure/schema/00_workstream.surql +55 -0
- package/infrastructure/schema/01_memory.surql +47 -0
- package/infrastructure/schema/02_execution_plan.surql +62 -0
- package/infrastructure/schema/03_learned_skill.surql +32 -0
- package/infrastructure/schema/04_runtime_bootstrap.surql +8 -0
- package/package.json +128 -0
- package/src/ai/definitions.ts +308 -0
- package/src/bifrost/bifrost.ts +256 -0
- package/src/config/agent-defaults.ts +99 -0
- package/src/config/constants.ts +33 -0
- package/src/config/env-shapes.ts +122 -0
- package/src/config/logger.ts +29 -0
- package/src/config/model-constants.ts +31 -0
- package/src/config/search.ts +17 -0
- package/src/config/workstream-defaults.ts +68 -0
- package/src/db/base.service.ts +55 -0
- package/src/db/cursor-pagination.ts +73 -0
- package/src/db/memory-query-builder.ts +207 -0
- package/src/db/memory-store.helpers.ts +118 -0
- package/src/db/memory-store.rows.ts +29 -0
- package/src/db/memory-store.ts +974 -0
- package/src/db/memory-types.ts +193 -0
- package/src/db/memory.ts +505 -0
- package/src/db/record-id.ts +78 -0
- package/src/db/service.ts +932 -0
- package/src/db/startup.ts +152 -0
- package/src/db/tables.ts +20 -0
- package/src/document/org-document-chunking.ts +224 -0
- package/src/document/parsing.ts +40 -0
- package/src/embeddings/provider.ts +76 -0
- package/src/index.ts +302 -0
- package/src/queues/context-compaction.queue.ts +82 -0
- package/src/queues/document-processor.queue.ts +118 -0
- package/src/queues/memory-consolidation.queue.ts +65 -0
- package/src/queues/post-chat-memory.queue.ts +128 -0
- package/src/queues/recent-activity-title-refinement.queue.ts +69 -0
- package/src/queues/regular-chat-memory-digest.config.ts +12 -0
- package/src/queues/regular-chat-memory-digest.queue.ts +73 -0
- package/src/queues/skill-extraction.config.ts +9 -0
- package/src/queues/skill-extraction.queue.ts +62 -0
- package/src/redis/connection.ts +176 -0
- package/src/redis/index.ts +30 -0
- package/src/redis/org-memory-lock.ts +43 -0
- package/src/redis/redis-lease-lock.ts +158 -0
- package/src/runtime/agent-contract.ts +1 -0
- package/src/runtime/agent-prompt-context.ts +119 -0
- package/src/runtime/agent-runtime-policy.ts +192 -0
- package/src/runtime/agent-stream-helpers.ts +117 -0
- package/src/runtime/agent-types.ts +22 -0
- package/src/runtime/approval-continuation.ts +16 -0
- package/src/runtime/chat-attachments.ts +46 -0
- package/src/runtime/chat-message.ts +10 -0
- package/src/runtime/chat-request-routing.ts +21 -0
- package/src/runtime/chat-run-orchestration.ts +25 -0
- package/src/runtime/chat-run-registry.ts +20 -0
- package/src/runtime/chat-types.ts +18 -0
- package/src/runtime/context-compaction-constants.ts +11 -0
- package/src/runtime/context-compaction-runtime.ts +86 -0
- package/src/runtime/context-compaction.ts +909 -0
- package/src/runtime/execution-plan.ts +59 -0
- package/src/runtime/helper-model.ts +405 -0
- package/src/runtime/indexed-repositories-policy.ts +28 -0
- package/src/runtime/instruction-sections.ts +8 -0
- package/src/runtime/llm-content.ts +71 -0
- package/src/runtime/memory-block.ts +264 -0
- package/src/runtime/memory-digest-policy.ts +14 -0
- package/src/runtime/memory-format.ts +8 -0
- package/src/runtime/memory-pipeline.ts +570 -0
- package/src/runtime/memory-prompts-fact.ts +47 -0
- package/src/runtime/memory-prompts-parse.ts +3 -0
- package/src/runtime/memory-prompts-update.ts +37 -0
- package/src/runtime/memory-scope.ts +43 -0
- package/src/runtime/plugin-types.ts +10 -0
- package/src/runtime/retrieval-adapters.ts +25 -0
- package/src/runtime/retrieval-pipeline.ts +3 -0
- package/src/runtime/runtime-extensions.ts +154 -0
- package/src/runtime/skill-extraction-policy.ts +3 -0
- package/src/runtime/team-consultation-orchestrator.ts +245 -0
- package/src/runtime/team-consultation-prompts.ts +32 -0
- package/src/runtime/title-helpers.ts +12 -0
- package/src/runtime/turn-lifecycle.ts +28 -0
- package/src/runtime/workstream-chat-helpers.ts +187 -0
- package/src/runtime/workstream-routing-policy.ts +301 -0
- package/src/runtime/workstream-state.ts +261 -0
- package/src/services/attachment.service.ts +159 -0
- package/src/services/chat-attachments.service.ts +17 -0
- package/src/services/chat-run-registry.service.ts +3 -0
- package/src/services/context-compaction-runtime.ts +13 -0
- package/src/services/context-compaction.service.ts +115 -0
- package/src/services/document-chunk.service.ts +141 -0
- package/src/services/execution-plan.service.ts +890 -0
- package/src/services/learned-skill.service.ts +328 -0
- package/src/services/memory-assessment.service.ts +43 -0
- package/src/services/memory.service.ts +807 -0
- package/src/services/memory.utils.ts +84 -0
- package/src/services/mutating-approval.service.ts +110 -0
- package/src/services/recent-activity-title.service.ts +74 -0
- package/src/services/recent-activity.service.ts +397 -0
- package/src/services/workstream-change-tracker.service.ts +313 -0
- package/src/services/workstream-message.service.ts +283 -0
- package/src/services/workstream-title.service.ts +58 -0
- package/src/services/workstream-turn-preparation.ts +1340 -0
- package/src/services/workstream-turn.ts +37 -0
- package/src/services/workstream.service.ts +854 -0
- package/src/services/workstream.types.ts +118 -0
- package/src/storage/attachment-parser.ts +101 -0
- package/src/storage/attachment-storage.service.ts +391 -0
- package/src/storage/attachments.types.ts +11 -0
- package/src/storage/attachments.utils.ts +58 -0
- package/src/storage/generated-document-storage.service.ts +55 -0
- package/src/system-agents/agent-result.ts +27 -0
- package/src/system-agents/context-compacter.agent.ts +46 -0
- package/src/system-agents/delegated-agent-factory.ts +177 -0
- package/src/system-agents/helper-agent-options.ts +20 -0
- package/src/system-agents/memory-reranker.agent.ts +38 -0
- package/src/system-agents/memory.agent.ts +58 -0
- package/src/system-agents/recent-activity-title-refiner.agent.ts +53 -0
- package/src/system-agents/regular-chat-memory-digest.agent.ts +75 -0
- package/src/system-agents/researcher.agent.ts +34 -0
- package/src/system-agents/skill-extractor.agent.ts +88 -0
- package/src/system-agents/skill-manager.agent.ts +80 -0
- package/src/system-agents/title-generator.agent.ts +42 -0
- package/src/system-agents/workstream-tracker.agent.ts +58 -0
- package/src/tools/execution-plan.tool.ts +163 -0
- package/src/tools/fetch-webpage.tool.ts +132 -0
- package/src/tools/firecrawl-client.ts +12 -0
- package/src/tools/memory-block.tool.ts +55 -0
- package/src/tools/read-file-parts.tool.ts +80 -0
- package/src/tools/remember-memory.tool.ts +85 -0
- package/src/tools/research-topic.tool.ts +15 -0
- package/src/tools/search-tools.ts +55 -0
- package/src/tools/search-web.tool.ts +175 -0
- package/src/tools/team-think.tool.ts +125 -0
- package/src/tools/tool-contract.ts +21 -0
- package/src/tools/user-questions.tool.ts +18 -0
- package/src/utils/async.ts +50 -0
- package/src/utils/date-time.ts +34 -0
- package/src/utils/error.ts +10 -0
- package/src/utils/errors.ts +28 -0
- package/src/utils/hono-error-handler.ts +71 -0
- package/src/utils/string.ts +51 -0
- package/src/workers/bootstrap.ts +44 -0
- package/src/workers/memory-consolidation.worker.ts +318 -0
- package/src/workers/regular-chat-memory-digest.helpers.ts +100 -0
- package/src/workers/regular-chat-memory-digest.runner.ts +363 -0
- package/src/workers/regular-chat-memory-digest.worker.ts +22 -0
- package/src/workers/skill-extraction.runner.ts +331 -0
- package/src/workers/skill-extraction.worker.ts +22 -0
- package/src/workers/utils/repo-indexer-chunker.ts +331 -0
- package/src/workers/utils/repo-structure-extractor.ts +645 -0
- package/src/workers/utils/repomix-process-concurrency.ts +65 -0
- package/src/workers/utils/sandbox-error.ts +5 -0
- package/src/workers/worker-utils.ts +182 -0
|
@@ -0,0 +1,152 @@
|
|
|
1
|
+
import { BoundQuery, RecordId } from 'surrealdb'
|
|
2
|
+
import { z } from 'zod'
|
|
3
|
+
|
|
4
|
+
import type { SurrealDBService, SurrealDatabaseLogger } from '../db/service'
|
|
5
|
+
import { TABLES } from '../db/tables'
|
|
6
|
+
import { getErrorMessage } from '../utils/error'
|
|
7
|
+
|
|
8
|
+
// Key of the single bootstrap-status row stored in TABLES.RUNTIME_BOOTSTRAP.
const DATABASE_BOOTSTRAP_KEY = 'database-schema-ready'
// Pause between retry attempts while waiting for connectivity / schema readiness.
const DEFAULT_RETRY_DELAY_MS = 1_000
// Default overall deadline: 3 minutes.
const DEFAULT_MAX_WAIT_MS = 3 * 60 * 1_000
// Log the first attempt, then only every Nth attempt, to keep startup logs quiet.
const RETRY_LOG_INTERVAL = 5

// Shape of the bootstrap row. Timestamp fields accept Date | string | number
// because the driver may deserialize datetimes differently across transports.
const RuntimeBootstrapRecordSchema = z.object({
  id: z.unknown(),
  key: z.string(),
  schemaFingerprint: z.string(),
  readyAt: z.union([z.date(), z.string(), z.number()]),
  updatedAt: z.union([z.date(), z.string(), z.number()]),
})

// Minimal logger surface these startup helpers need.
type StartupLogger = Pick<SurrealDatabaseLogger, 'info' | 'warn' | 'error'>
|
|
22
|
+
|
|
23
|
+
function shouldLogRetry(attempt: number): boolean {
|
|
24
|
+
return attempt === 1 || attempt % RETRY_LOG_INTERVAL === 0
|
|
25
|
+
}
|
|
26
|
+
|
|
27
|
+
export async function connectWithStartupRetry(params: {
|
|
28
|
+
connect: () => Promise<void>
|
|
29
|
+
label: string
|
|
30
|
+
logger?: StartupLogger
|
|
31
|
+
retryDelayMs?: number
|
|
32
|
+
maxWaitMs?: number
|
|
33
|
+
}): Promise<void> {
|
|
34
|
+
const retryDelayMs = params.retryDelayMs ?? DEFAULT_RETRY_DELAY_MS
|
|
35
|
+
const maxWaitMs = params.maxWaitMs ?? DEFAULT_MAX_WAIT_MS
|
|
36
|
+
const startedAt = Date.now()
|
|
37
|
+
|
|
38
|
+
let attempt = 0
|
|
39
|
+
let lastError: unknown = null
|
|
40
|
+
|
|
41
|
+
while (Date.now() - startedAt <= maxWaitMs) {
|
|
42
|
+
attempt += 1
|
|
43
|
+
|
|
44
|
+
try {
|
|
45
|
+
await params.connect()
|
|
46
|
+
return
|
|
47
|
+
} catch (error) {
|
|
48
|
+
lastError = error
|
|
49
|
+
if (shouldLogRetry(attempt)) {
|
|
50
|
+
params.logger?.warn?.(
|
|
51
|
+
`Waiting for ${params.label} (${attempt}, elapsed=${Date.now() - startedAt}ms): ${getErrorMessage(error)}`,
|
|
52
|
+
)
|
|
53
|
+
}
|
|
54
|
+
await Bun.sleep(retryDelayMs)
|
|
55
|
+
}
|
|
56
|
+
}
|
|
57
|
+
|
|
58
|
+
params.logger?.error?.(`Timed out waiting for ${params.label}: ${getErrorMessage(lastError)}`)
|
|
59
|
+
throw lastError instanceof Error ? lastError : new Error(`Timed out waiting for ${params.label}`)
|
|
60
|
+
}
|
|
61
|
+
|
|
62
|
+
async function readDatabaseBootstrapRecord(
|
|
63
|
+
databaseService: SurrealDBService,
|
|
64
|
+
): Promise<z.infer<typeof RuntimeBootstrapRecordSchema> | null> {
|
|
65
|
+
return await databaseService.queryOne(
|
|
66
|
+
new BoundQuery(
|
|
67
|
+
`SELECT *
|
|
68
|
+
FROM ${TABLES.RUNTIME_BOOTSTRAP}
|
|
69
|
+
WHERE key = $key
|
|
70
|
+
LIMIT 1`,
|
|
71
|
+
{ key: DATABASE_BOOTSTRAP_KEY },
|
|
72
|
+
),
|
|
73
|
+
RuntimeBootstrapRecordSchema,
|
|
74
|
+
)
|
|
75
|
+
}
|
|
76
|
+
|
|
77
|
+
export async function waitForDatabaseBootstrap(params: {
|
|
78
|
+
databaseService: SurrealDBService
|
|
79
|
+
expectedFingerprint?: string | null
|
|
80
|
+
label: string
|
|
81
|
+
logger?: StartupLogger
|
|
82
|
+
connect?: () => Promise<void>
|
|
83
|
+
retryDelayMs?: number
|
|
84
|
+
maxWaitMs?: number
|
|
85
|
+
}): Promise<void> {
|
|
86
|
+
const expectedFingerprint = params.expectedFingerprint?.trim()
|
|
87
|
+
if (!expectedFingerprint) {
|
|
88
|
+
return
|
|
89
|
+
}
|
|
90
|
+
|
|
91
|
+
const retryDelayMs = params.retryDelayMs ?? DEFAULT_RETRY_DELAY_MS
|
|
92
|
+
const maxWaitMs = params.maxWaitMs ?? DEFAULT_MAX_WAIT_MS
|
|
93
|
+
const startedAt = Date.now()
|
|
94
|
+
|
|
95
|
+
let attempt = 0
|
|
96
|
+
let lastError: unknown = null
|
|
97
|
+
|
|
98
|
+
while (Date.now() - startedAt <= maxWaitMs) {
|
|
99
|
+
attempt += 1
|
|
100
|
+
|
|
101
|
+
try {
|
|
102
|
+
if (params.connect) {
|
|
103
|
+
await params.connect()
|
|
104
|
+
}
|
|
105
|
+
|
|
106
|
+
const record = await readDatabaseBootstrapRecord(params.databaseService)
|
|
107
|
+
if (record?.schemaFingerprint === expectedFingerprint) {
|
|
108
|
+
return
|
|
109
|
+
}
|
|
110
|
+
|
|
111
|
+
if (shouldLogRetry(attempt)) {
|
|
112
|
+
const currentFingerprint =
|
|
113
|
+
typeof record?.schemaFingerprint === 'string' && record.schemaFingerprint.length > 0
|
|
114
|
+
? record.schemaFingerprint
|
|
115
|
+
: 'missing'
|
|
116
|
+
params.logger?.info?.(
|
|
117
|
+
`Waiting for ${params.label} schema readiness (${attempt}, expected=${expectedFingerprint}, current=${currentFingerprint})`,
|
|
118
|
+
)
|
|
119
|
+
}
|
|
120
|
+
} catch (error) {
|
|
121
|
+
lastError = error
|
|
122
|
+
if (shouldLogRetry(attempt)) {
|
|
123
|
+
params.logger?.warn?.(`Waiting for ${params.label} schema readiness (${attempt}): ${getErrorMessage(error)}`)
|
|
124
|
+
}
|
|
125
|
+
}
|
|
126
|
+
|
|
127
|
+
await Bun.sleep(retryDelayMs)
|
|
128
|
+
}
|
|
129
|
+
|
|
130
|
+
if (lastError instanceof Error) {
|
|
131
|
+
throw lastError
|
|
132
|
+
}
|
|
133
|
+
|
|
134
|
+
throw new Error(`Timed out waiting for ${params.label} schema readiness`)
|
|
135
|
+
}
|
|
136
|
+
|
|
137
|
+
export async function publishDatabaseBootstrap(params: {
|
|
138
|
+
databaseService: SurrealDBService
|
|
139
|
+
schemaFingerprint: string
|
|
140
|
+
}): Promise<void> {
|
|
141
|
+
await params.databaseService.upsert(
|
|
142
|
+
TABLES.RUNTIME_BOOTSTRAP,
|
|
143
|
+
new RecordId(TABLES.RUNTIME_BOOTSTRAP, DATABASE_BOOTSTRAP_KEY),
|
|
144
|
+
{
|
|
145
|
+
key: DATABASE_BOOTSTRAP_KEY,
|
|
146
|
+
schemaFingerprint: params.schemaFingerprint,
|
|
147
|
+
readyAt: new Date(),
|
|
148
|
+
updatedAt: new Date(),
|
|
149
|
+
},
|
|
150
|
+
RuntimeBootstrapRecordSchema,
|
|
151
|
+
)
|
|
152
|
+
}
|
package/src/db/tables.ts
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
// Canonical SurrealDB table names used across the SDK, centralized so queries
// and services reference a single source of truth.
export const TABLES = {
  WORKSTREAM_MESSAGE: 'workstreamMessage',
  WORKSTREAM: 'workstream',
  RUNTIME_BOOTSTRAP: 'runtimeBootstrap',
  WORKSTREAM_ATTACHMENT: 'workstreamAttachment',
  MEMORY: 'memory',
  MEMORY_RELATION: 'memoryRelation',
  MEMORY_HISTORY: 'memoryHistory',
  LEARNED_SKILL: 'learnedSkill',
  PLAN: 'plan',
  PLAN_TASK: 'planTask',
  PLAN_EVENT: 'planEvent',
  ORGANIZATION: 'organization',
  USER: 'user',
  ORG_ACTION: 'orgAction',
  RECENT_ACTIVITY_EVENT: 'recentActivityEvent',
  RECENT_ACTIVITY: 'recentActivity',
} as const

// Any known table name; the `(string & {})` arm still accepts arbitrary
// strings while preserving autocomplete for the literal union.
export type DatabaseTable = (typeof TABLES)[keyof typeof TABLES] | (string & {})
|
|
@@ -0,0 +1,224 @@
|
|
|
1
|
+
import { normalizeKey, normalizeTextBody, normalizeWhitespace } from './parsing'
|
|
2
|
+
|
|
3
|
+
// A single chunk produced from a parsed document.
export type ParsedDocumentChunk = {
  chunkKey: string // stable key, unique within the document
  chunkIndex: number // 0-based position across the whole document
  content: string
  sectionPath?: string // e.g. "Guide > Setup > Install" for markdown sections
  pageStart?: number // page range for paged (PDF-like) sources
  pageEnd?: number
}

// Chunk before its document-wide index is assigned (see withChunkIndexes).
type ChunkBase = Omit<ParsedDocumentChunk, 'chunkIndex'>

// One page of extracted text from a paged document.
type TextPage = { pageNumber: number; text: string }

// Target chunk sizes (in characters) per source format.
const MARKDOWN_CHUNK_CHARS = 1_700
const TEXT_CHUNK_CHARS = 1_600
const PDF_CHUNK_CHARS = 1_400
|
|
19
|
+
|
|
20
|
+
function joinSectionPath(parts: Array<string | undefined>): string | undefined {
|
|
21
|
+
const normalized = parts.map((part) => normalizeWhitespace(part ?? '')).filter((part) => part.length > 0)
|
|
22
|
+
return normalized.length > 0 ? normalized.join(' > ') : undefined
|
|
23
|
+
}
|
|
24
|
+
|
|
25
|
+
function splitParagraphUnits(text: string): string[] {
|
|
26
|
+
const normalized = normalizeTextBody(text)
|
|
27
|
+
if (!normalized) return []
|
|
28
|
+
|
|
29
|
+
return normalized
|
|
30
|
+
.split(/\n{2,}/)
|
|
31
|
+
.map((value) => value.trim())
|
|
32
|
+
.filter((value) => value.length > 0)
|
|
33
|
+
}
|
|
34
|
+
|
|
35
|
+
function splitOversizedUnit(unit: string, maxChars: number): string[] {
|
|
36
|
+
const normalized = normalizeTextBody(unit)
|
|
37
|
+
if (!normalized) return []
|
|
38
|
+
if (normalized.length <= maxChars) return [normalized]
|
|
39
|
+
|
|
40
|
+
const sentenceParts = normalized
|
|
41
|
+
.split(/(?<=[.!?])\s+/)
|
|
42
|
+
.map((value) => value.trim())
|
|
43
|
+
.filter((value) => value.length > 0)
|
|
44
|
+
|
|
45
|
+
if (sentenceParts.length > 1) {
|
|
46
|
+
return sentenceParts.flatMap((part) => splitOversizedUnit(part, maxChars))
|
|
47
|
+
}
|
|
48
|
+
|
|
49
|
+
const lineParts = normalized
|
|
50
|
+
.split(/\n+/)
|
|
51
|
+
.map((value) => value.trim())
|
|
52
|
+
.filter((value) => value.length > 0)
|
|
53
|
+
|
|
54
|
+
if (lineParts.length > 1) {
|
|
55
|
+
return lineParts.flatMap((part) => splitOversizedUnit(part, maxChars))
|
|
56
|
+
}
|
|
57
|
+
|
|
58
|
+
const chunks: string[] = []
|
|
59
|
+
let cursor = 0
|
|
60
|
+
while (cursor < normalized.length) {
|
|
61
|
+
let end = Math.min(cursor + maxChars, normalized.length)
|
|
62
|
+
if (end < normalized.length) {
|
|
63
|
+
const breakAt = normalized.lastIndexOf(' ', end)
|
|
64
|
+
if (breakAt > cursor + Math.floor(maxChars * 0.55)) {
|
|
65
|
+
end = breakAt
|
|
66
|
+
}
|
|
67
|
+
}
|
|
68
|
+
|
|
69
|
+
const slice = normalized.slice(cursor, end).trim()
|
|
70
|
+
if (slice) chunks.push(slice)
|
|
71
|
+
cursor = end
|
|
72
|
+
while (cursor < normalized.length && normalized[cursor] === ' ') {
|
|
73
|
+
cursor += 1
|
|
74
|
+
}
|
|
75
|
+
}
|
|
76
|
+
|
|
77
|
+
return chunks
|
|
78
|
+
}
|
|
79
|
+
|
|
80
|
+
function buildChunkBodies(units: string[], maxChars: number): string[] {
|
|
81
|
+
const chunks: string[] = []
|
|
82
|
+
let current = ''
|
|
83
|
+
|
|
84
|
+
const commit = () => {
|
|
85
|
+
const value = normalizeTextBody(current)
|
|
86
|
+
if (value) chunks.push(value)
|
|
87
|
+
current = ''
|
|
88
|
+
}
|
|
89
|
+
|
|
90
|
+
for (const unit of units) {
|
|
91
|
+
const normalized = normalizeTextBody(unit)
|
|
92
|
+
if (!normalized) continue
|
|
93
|
+
|
|
94
|
+
if (normalized.length > maxChars) {
|
|
95
|
+
if (current) commit()
|
|
96
|
+
for (const split of splitOversizedUnit(normalized, maxChars)) {
|
|
97
|
+
const value = normalizeTextBody(split)
|
|
98
|
+
if (value) chunks.push(value)
|
|
99
|
+
}
|
|
100
|
+
continue
|
|
101
|
+
}
|
|
102
|
+
|
|
103
|
+
const nextValue = current ? `${current}\n\n${normalized}` : normalized
|
|
104
|
+
if (nextValue.length > maxChars && current) {
|
|
105
|
+
commit()
|
|
106
|
+
current = normalized
|
|
107
|
+
continue
|
|
108
|
+
}
|
|
109
|
+
|
|
110
|
+
current = nextValue
|
|
111
|
+
}
|
|
112
|
+
|
|
113
|
+
commit()
|
|
114
|
+
return chunks
|
|
115
|
+
}
|
|
116
|
+
|
|
117
|
+
function toChunkEntries(params: {
|
|
118
|
+
prefix: string
|
|
119
|
+
contents: string[]
|
|
120
|
+
sectionPath?: string
|
|
121
|
+
pageStart?: number
|
|
122
|
+
pageEnd?: number
|
|
123
|
+
}): ChunkBase[] {
|
|
124
|
+
return params.contents.map((content, index) => ({
|
|
125
|
+
chunkKey: `${params.prefix}:${String(index + 1).padStart(3, '0')}`,
|
|
126
|
+
content,
|
|
127
|
+
sectionPath: params.sectionPath,
|
|
128
|
+
pageStart: params.pageStart,
|
|
129
|
+
pageEnd: params.pageEnd,
|
|
130
|
+
}))
|
|
131
|
+
}
|
|
132
|
+
|
|
133
|
+
function withChunkIndexes(chunks: ChunkBase[]): ParsedDocumentChunk[] {
|
|
134
|
+
return chunks.map((chunk, index) => ({ ...chunk, chunkIndex: index }))
|
|
135
|
+
}
|
|
136
|
+
|
|
137
|
+
export function chunkPlainTextDocument(params: {
|
|
138
|
+
text: string
|
|
139
|
+
chunkChars?: number
|
|
140
|
+
chunkKeyPrefix?: string
|
|
141
|
+
sectionPath?: string
|
|
142
|
+
}): ParsedDocumentChunk[] {
|
|
143
|
+
const chunkChars = params.chunkChars ?? TEXT_CHUNK_CHARS
|
|
144
|
+
const prefix = params.chunkKeyPrefix ?? 'text'
|
|
145
|
+
const units = splitParagraphUnits(params.text)
|
|
146
|
+
const contents = buildChunkBodies(units, chunkChars)
|
|
147
|
+
return withChunkIndexes(toChunkEntries({ prefix, contents, sectionPath: params.sectionPath }))
|
|
148
|
+
}
|
|
149
|
+
|
|
150
|
+
/**
 * Chunks a markdown document by heading sections. Each heading (#..######)
 * closes the previous section and updates a heading stack, so every chunk
 * carries a "H1 > H2 > ..." sectionPath. Section bodies are paragraph-packed
 * via buildChunkBodies.
 */
export function chunkMarkdownDocument(params: {
  text: string
  chunkChars?: number
  chunkKeyPrefix?: string
  baseSectionPath?: string
}): ParsedDocumentChunk[] {
  const chunkChars = params.chunkChars ?? MARKDOWN_CHUNK_CHARS
  const prefix = params.chunkKeyPrefix ?? 'markdown'
  const lines = normalizeTextBody(params.text).split('\n')
  const chunks: ChunkBase[] = []
  // headingStack[i] is the active heading at depth i+1; entries beyond the
  // current heading's depth are discarded on each new heading.
  const headingStack: string[] = []
  let currentLines: string[] = []
  let sectionCounter = 0

  // Emits the accumulated section lines as chunks (no-op when empty).
  const flushCurrent = () => {
    const content = normalizeTextBody(currentLines.join('\n'))
    if (!content) {
      currentLines = []
      return
    }

    sectionCounter += 1
    const sectionPath = joinSectionPath([params.baseSectionPath, ...headingStack])
    const sectionKeyBase = sectionPath ? normalizeKey(sectionPath) : 'section'
    // Counter suffix keeps keys unique even when two sections share a path.
    const sectionKey = `${sectionKeyBase}-${String(sectionCounter).padStart(3, '0')}`
    const contents = buildChunkBodies(splitParagraphUnits(content), chunkChars)
    chunks.push(...toChunkEntries({ prefix: `${prefix}:${sectionKey}`, contents, sectionPath }))
    currentLines = []
  }

  for (const line of lines) {
    const headingMatch = line.match(/^(#{1,6})\s+(.+?)\s*$/)
    if (!headingMatch) {
      currentLines.push(line)
      continue
    }

    // New heading: close the previous section, then truncate the stack to
    // this depth and record the new heading (the heading line itself starts
    // the next section's content).
    flushCurrent()
    const depth = headingMatch[1].length
    const headingText = normalizeWhitespace(headingMatch[2])
    headingStack.splice(depth - 1)
    headingStack[depth - 1] = headingText
    currentLines = [line]
  }

  flushCurrent()
  return withChunkIndexes(chunks)
}
|
|
198
|
+
|
|
199
|
+
export function chunkPagedDocument(params: {
|
|
200
|
+
pages: TextPage[]
|
|
201
|
+
chunkChars?: number
|
|
202
|
+
chunkKeyPrefix?: string
|
|
203
|
+
}): ParsedDocumentChunk[] {
|
|
204
|
+
const chunkChars = params.chunkChars ?? PDF_CHUNK_CHARS
|
|
205
|
+
const prefix = params.chunkKeyPrefix ?? 'page'
|
|
206
|
+
const chunks: ChunkBase[] = []
|
|
207
|
+
|
|
208
|
+
for (const page of params.pages) {
|
|
209
|
+
const pageText = normalizeTextBody(page.text)
|
|
210
|
+
if (!pageText) continue
|
|
211
|
+
|
|
212
|
+
const contents = buildChunkBodies(splitParagraphUnits(pageText), chunkChars)
|
|
213
|
+
chunks.push(
|
|
214
|
+
...toChunkEntries({
|
|
215
|
+
prefix: `${prefix}:${String(page.pageNumber).padStart(4, '0')}`,
|
|
216
|
+
contents,
|
|
217
|
+
pageStart: page.pageNumber,
|
|
218
|
+
pageEnd: page.pageNumber,
|
|
219
|
+
}),
|
|
220
|
+
)
|
|
221
|
+
}
|
|
222
|
+
|
|
223
|
+
return withChunkIndexes(chunks)
|
|
224
|
+
}
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
export function normalizeWhitespace(value: string): string {
|
|
2
|
+
return value.replace(/\s+/g, ' ').trim()
|
|
3
|
+
}
|
|
4
|
+
|
|
5
|
+
export function normalizeTextBody(value: string): string {
|
|
6
|
+
return value.replaceAll(String.fromCharCode(0), '').replace(/\r/g, '').trim()
|
|
7
|
+
}
|
|
8
|
+
|
|
9
|
+
export function normalizeKey(value: string): string {
|
|
10
|
+
return normalizeWhitespace(value)
|
|
11
|
+
.toLowerCase()
|
|
12
|
+
.replace(/[^\w\s.-]/g, '')
|
|
13
|
+
.replace(/\s+/g, '-')
|
|
14
|
+
.slice(0, 120)
|
|
15
|
+
}
|
|
16
|
+
|
|
17
|
+
export function makeMemoryKey(kind: string, rawKey: string): string {
|
|
18
|
+
const normalized = normalizeKey(rawKey)
|
|
19
|
+
return normalized ? `${kind}:${normalized}` : `${kind}:item`
|
|
20
|
+
}
|
|
21
|
+
|
|
22
|
+
export function truncateForModel(value: string, maxChars: number): string {
|
|
23
|
+
if (value.length <= maxChars) return value
|
|
24
|
+
return `${value.slice(0, maxChars)}\n\n[...truncated due to size...]`
|
|
25
|
+
}
|
|
26
|
+
|
|
27
|
+
export function dedupeStrings(items: string[], limit: number): string[] {
|
|
28
|
+
const out: string[] = []
|
|
29
|
+
const seen = new Set<string>()
|
|
30
|
+
for (const raw of items) {
|
|
31
|
+
const value = normalizeWhitespace(raw)
|
|
32
|
+
if (!value) continue
|
|
33
|
+
const key = value.toLowerCase()
|
|
34
|
+
if (seen.has(key)) continue
|
|
35
|
+
seen.add(key)
|
|
36
|
+
out.push(value)
|
|
37
|
+
if (out.length >= limit) break
|
|
38
|
+
}
|
|
39
|
+
return out
|
|
40
|
+
}
|
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
import { embed, embedMany } from 'ai'
|
|
2
|
+
|
|
3
|
+
import { bifrostEmbeddingModel } from '../bifrost/bifrost'
|
|
4
|
+
import { env } from '../config/env-shapes'
|
|
5
|
+
|
|
6
|
+
const SUPPORTED_EMBEDDING_PREFIXES = ['openai/', 'openrouter/'] as const
|
|
7
|
+
|
|
8
|
+
function resolveEmbeddingModel(modelId: string) {
|
|
9
|
+
const normalized = modelId.trim()
|
|
10
|
+
if (!normalized) {
|
|
11
|
+
throw new Error('[embeddings-provider] Model id is required.')
|
|
12
|
+
}
|
|
13
|
+
|
|
14
|
+
if (!SUPPORTED_EMBEDDING_PREFIXES.some((prefix) => normalized.startsWith(prefix))) {
|
|
15
|
+
throw new Error(
|
|
16
|
+
`[embeddings-provider] Unsupported model id "${modelId}". Use one of: ${SUPPORTED_EMBEDDING_PREFIXES.join(', ')}*.`,
|
|
17
|
+
)
|
|
18
|
+
}
|
|
19
|
+
|
|
20
|
+
return bifrostEmbeddingModel(normalized)
|
|
21
|
+
}
|
|
22
|
+
|
|
23
|
+
class ProviderEmbeddings {
|
|
24
|
+
private _model: ReturnType<typeof resolveEmbeddingModel> | null = null
|
|
25
|
+
|
|
26
|
+
private getModel() {
|
|
27
|
+
if (!this._model) {
|
|
28
|
+
this._model = resolveEmbeddingModel(env.AI_EMBEDDING_MODEL)
|
|
29
|
+
}
|
|
30
|
+
return this._model
|
|
31
|
+
}
|
|
32
|
+
|
|
33
|
+
async embedQuery(text: string): Promise<number[]> {
|
|
34
|
+
const input = text.trim()
|
|
35
|
+
if (!input) return []
|
|
36
|
+
|
|
37
|
+
const result = await embed({ model: this.getModel(), value: input, maxRetries: 2 })
|
|
38
|
+
|
|
39
|
+
return result.embedding.map((value) => Number(value))
|
|
40
|
+
}
|
|
41
|
+
|
|
42
|
+
async embedDocuments(values: string[]): Promise<number[][]> {
|
|
43
|
+
if (values.length === 0) return []
|
|
44
|
+
|
|
45
|
+
const normalized = values.map((value) => value.trim())
|
|
46
|
+
const nonEmptyEntries = normalized
|
|
47
|
+
.map((value, index) => ({ value, index }))
|
|
48
|
+
.filter((entry) => entry.value.length > 0)
|
|
49
|
+
|
|
50
|
+
if (nonEmptyEntries.length === 0) {
|
|
51
|
+
return normalized.map(() => [])
|
|
52
|
+
}
|
|
53
|
+
|
|
54
|
+
const result = await embedMany({
|
|
55
|
+
model: this.getModel(),
|
|
56
|
+
values: nonEmptyEntries.map((entry) => entry.value),
|
|
57
|
+
maxRetries: 2,
|
|
58
|
+
})
|
|
59
|
+
|
|
60
|
+
const embeddingsByIndex = new Map<number, number[]>()
|
|
61
|
+
result.embeddings.forEach((embedding, index) => {
|
|
62
|
+
const entry = nonEmptyEntries.at(index)
|
|
63
|
+
if (!entry) return
|
|
64
|
+
embeddingsByIndex.set(
|
|
65
|
+
entry.index,
|
|
66
|
+
embedding.map((value) => Number(value)),
|
|
67
|
+
)
|
|
68
|
+
})
|
|
69
|
+
|
|
70
|
+
return normalized.map((_, index) => embeddingsByIndex.get(index) ?? [])
|
|
71
|
+
}
|
|
72
|
+
}
|
|
73
|
+
|
|
74
|
+
export function createDefaultEmbeddings(): ProviderEmbeddings {
|
|
75
|
+
return new ProviderEmbeddings()
|
|
76
|
+
}
|