@lota-sdk/core 0.1.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (153)
  1. package/infrastructure/schema/00_workstream.surql +55 -0
  2. package/infrastructure/schema/01_memory.surql +47 -0
  3. package/infrastructure/schema/02_execution_plan.surql +62 -0
  4. package/infrastructure/schema/03_learned_skill.surql +32 -0
  5. package/infrastructure/schema/04_runtime_bootstrap.surql +8 -0
  6. package/package.json +128 -0
  7. package/src/ai/definitions.ts +308 -0
  8. package/src/bifrost/bifrost.ts +256 -0
  9. package/src/config/agent-defaults.ts +99 -0
  10. package/src/config/constants.ts +33 -0
  11. package/src/config/env-shapes.ts +122 -0
  12. package/src/config/logger.ts +29 -0
  13. package/src/config/model-constants.ts +31 -0
  14. package/src/config/search.ts +17 -0
  15. package/src/config/workstream-defaults.ts +68 -0
  16. package/src/db/base.service.ts +55 -0
  17. package/src/db/cursor-pagination.ts +73 -0
  18. package/src/db/memory-query-builder.ts +207 -0
  19. package/src/db/memory-store.helpers.ts +118 -0
  20. package/src/db/memory-store.rows.ts +29 -0
  21. package/src/db/memory-store.ts +974 -0
  22. package/src/db/memory-types.ts +193 -0
  23. package/src/db/memory.ts +505 -0
  24. package/src/db/record-id.ts +78 -0
  25. package/src/db/service.ts +932 -0
  26. package/src/db/startup.ts +152 -0
  27. package/src/db/tables.ts +20 -0
  28. package/src/document/org-document-chunking.ts +224 -0
  29. package/src/document/parsing.ts +40 -0
  30. package/src/embeddings/provider.ts +76 -0
  31. package/src/index.ts +302 -0
  32. package/src/queues/context-compaction.queue.ts +82 -0
  33. package/src/queues/document-processor.queue.ts +118 -0
  34. package/src/queues/memory-consolidation.queue.ts +65 -0
  35. package/src/queues/post-chat-memory.queue.ts +128 -0
  36. package/src/queues/recent-activity-title-refinement.queue.ts +69 -0
  37. package/src/queues/regular-chat-memory-digest.config.ts +12 -0
  38. package/src/queues/regular-chat-memory-digest.queue.ts +73 -0
  39. package/src/queues/skill-extraction.config.ts +9 -0
  40. package/src/queues/skill-extraction.queue.ts +62 -0
  41. package/src/redis/connection.ts +176 -0
  42. package/src/redis/index.ts +30 -0
  43. package/src/redis/org-memory-lock.ts +43 -0
  44. package/src/redis/redis-lease-lock.ts +158 -0
  45. package/src/runtime/agent-contract.ts +1 -0
  46. package/src/runtime/agent-prompt-context.ts +119 -0
  47. package/src/runtime/agent-runtime-policy.ts +192 -0
  48. package/src/runtime/agent-stream-helpers.ts +117 -0
  49. package/src/runtime/agent-types.ts +22 -0
  50. package/src/runtime/approval-continuation.ts +16 -0
  51. package/src/runtime/chat-attachments.ts +46 -0
  52. package/src/runtime/chat-message.ts +10 -0
  53. package/src/runtime/chat-request-routing.ts +21 -0
  54. package/src/runtime/chat-run-orchestration.ts +25 -0
  55. package/src/runtime/chat-run-registry.ts +20 -0
  56. package/src/runtime/chat-types.ts +18 -0
  57. package/src/runtime/context-compaction-constants.ts +11 -0
  58. package/src/runtime/context-compaction-runtime.ts +86 -0
  59. package/src/runtime/context-compaction.ts +909 -0
  60. package/src/runtime/execution-plan.ts +59 -0
  61. package/src/runtime/helper-model.ts +405 -0
  62. package/src/runtime/indexed-repositories-policy.ts +28 -0
  63. package/src/runtime/instruction-sections.ts +8 -0
  64. package/src/runtime/llm-content.ts +71 -0
  65. package/src/runtime/memory-block.ts +264 -0
  66. package/src/runtime/memory-digest-policy.ts +14 -0
  67. package/src/runtime/memory-format.ts +8 -0
  68. package/src/runtime/memory-pipeline.ts +570 -0
  69. package/src/runtime/memory-prompts-fact.ts +47 -0
  70. package/src/runtime/memory-prompts-parse.ts +3 -0
  71. package/src/runtime/memory-prompts-update.ts +37 -0
  72. package/src/runtime/memory-scope.ts +43 -0
  73. package/src/runtime/plugin-types.ts +10 -0
  74. package/src/runtime/retrieval-adapters.ts +25 -0
  75. package/src/runtime/retrieval-pipeline.ts +3 -0
  76. package/src/runtime/runtime-extensions.ts +154 -0
  77. package/src/runtime/skill-extraction-policy.ts +3 -0
  78. package/src/runtime/team-consultation-orchestrator.ts +245 -0
  79. package/src/runtime/team-consultation-prompts.ts +32 -0
  80. package/src/runtime/title-helpers.ts +12 -0
  81. package/src/runtime/turn-lifecycle.ts +28 -0
  82. package/src/runtime/workstream-chat-helpers.ts +187 -0
  83. package/src/runtime/workstream-routing-policy.ts +301 -0
  84. package/src/runtime/workstream-state.ts +261 -0
  85. package/src/services/attachment.service.ts +159 -0
  86. package/src/services/chat-attachments.service.ts +17 -0
  87. package/src/services/chat-run-registry.service.ts +3 -0
  88. package/src/services/context-compaction-runtime.ts +13 -0
  89. package/src/services/context-compaction.service.ts +115 -0
  90. package/src/services/document-chunk.service.ts +141 -0
  91. package/src/services/execution-plan.service.ts +890 -0
  92. package/src/services/learned-skill.service.ts +328 -0
  93. package/src/services/memory-assessment.service.ts +43 -0
  94. package/src/services/memory.service.ts +807 -0
  95. package/src/services/memory.utils.ts +84 -0
  96. package/src/services/mutating-approval.service.ts +110 -0
  97. package/src/services/recent-activity-title.service.ts +74 -0
  98. package/src/services/recent-activity.service.ts +397 -0
  99. package/src/services/workstream-change-tracker.service.ts +313 -0
  100. package/src/services/workstream-message.service.ts +283 -0
  101. package/src/services/workstream-title.service.ts +58 -0
  102. package/src/services/workstream-turn-preparation.ts +1340 -0
  103. package/src/services/workstream-turn.ts +37 -0
  104. package/src/services/workstream.service.ts +854 -0
  105. package/src/services/workstream.types.ts +118 -0
  106. package/src/storage/attachment-parser.ts +101 -0
  107. package/src/storage/attachment-storage.service.ts +391 -0
  108. package/src/storage/attachments.types.ts +11 -0
  109. package/src/storage/attachments.utils.ts +58 -0
  110. package/src/storage/generated-document-storage.service.ts +55 -0
  111. package/src/system-agents/agent-result.ts +27 -0
  112. package/src/system-agents/context-compacter.agent.ts +46 -0
  113. package/src/system-agents/delegated-agent-factory.ts +177 -0
  114. package/src/system-agents/helper-agent-options.ts +20 -0
  115. package/src/system-agents/memory-reranker.agent.ts +38 -0
  116. package/src/system-agents/memory.agent.ts +58 -0
  117. package/src/system-agents/recent-activity-title-refiner.agent.ts +53 -0
  118. package/src/system-agents/regular-chat-memory-digest.agent.ts +75 -0
  119. package/src/system-agents/researcher.agent.ts +34 -0
  120. package/src/system-agents/skill-extractor.agent.ts +88 -0
  121. package/src/system-agents/skill-manager.agent.ts +80 -0
  122. package/src/system-agents/title-generator.agent.ts +42 -0
  123. package/src/system-agents/workstream-tracker.agent.ts +58 -0
  124. package/src/tools/execution-plan.tool.ts +163 -0
  125. package/src/tools/fetch-webpage.tool.ts +132 -0
  126. package/src/tools/firecrawl-client.ts +12 -0
  127. package/src/tools/memory-block.tool.ts +55 -0
  128. package/src/tools/read-file-parts.tool.ts +80 -0
  129. package/src/tools/remember-memory.tool.ts +85 -0
  130. package/src/tools/research-topic.tool.ts +15 -0
  131. package/src/tools/search-tools.ts +55 -0
  132. package/src/tools/search-web.tool.ts +175 -0
  133. package/src/tools/team-think.tool.ts +125 -0
  134. package/src/tools/tool-contract.ts +21 -0
  135. package/src/tools/user-questions.tool.ts +18 -0
  136. package/src/utils/async.ts +50 -0
  137. package/src/utils/date-time.ts +34 -0
  138. package/src/utils/error.ts +10 -0
  139. package/src/utils/errors.ts +28 -0
  140. package/src/utils/hono-error-handler.ts +71 -0
  141. package/src/utils/string.ts +51 -0
  142. package/src/workers/bootstrap.ts +44 -0
  143. package/src/workers/memory-consolidation.worker.ts +318 -0
  144. package/src/workers/regular-chat-memory-digest.helpers.ts +100 -0
  145. package/src/workers/regular-chat-memory-digest.runner.ts +363 -0
  146. package/src/workers/regular-chat-memory-digest.worker.ts +22 -0
  147. package/src/workers/skill-extraction.runner.ts +331 -0
  148. package/src/workers/skill-extraction.worker.ts +22 -0
  149. package/src/workers/utils/repo-indexer-chunker.ts +331 -0
  150. package/src/workers/utils/repo-structure-extractor.ts +645 -0
  151. package/src/workers/utils/repomix-process-concurrency.ts +65 -0
  152. package/src/workers/utils/sandbox-error.ts +5 -0
  153. package/src/workers/worker-utils.ts +182 -0
@@ -0,0 +1,331 @@
1
+ import { toTimestamp } from '@lota-sdk/shared/runtime/chat-message-metadata'
2
+ import { BoundQuery } from 'surrealdb'
3
+ import { z } from 'zod'
4
+
5
+ import { serverLogger } from '../config/logger'
6
+ import { ensureRecordId, recordIdToString } from '../db/record-id'
7
+ import type { RecordIdRef } from '../db/record-id'
8
+ import { databaseService } from '../db/service'
9
+ import { TABLES } from '../db/tables'
10
+ import { createDefaultEmbeddings } from '../embeddings/provider'
11
+ import type { SkillExtractionJob } from '../queues/skill-extraction.queue'
12
+ import { createHelperModelRuntime } from '../runtime/helper-model'
13
+ import { getRuntimeAdapters, withConfiguredWorkspaceMemoryLock } from '../runtime/runtime-extensions'
14
+ import { learnedSkillService } from '../services/learned-skill.service'
15
+ import { createSkillExtractorAgent, SkillExtractionOutputSchema } from '../system-agents/skill-extractor.agent'
16
+ import type { SkillCandidate } from '../system-agents/skill-extractor.agent'
17
+ import { createSkillManagerAgent, SkillManagerOutputSchema } from '../system-agents/skill-manager.agent'
18
+ import { buildDigestTranscript, resolveWorkspaceBootstrapCutoff } from './regular-chat-memory-digest.helpers'
19
+
20
// Hard cap on each helper-model call (both extraction and per-candidate manager passes).
const SKILL_EXTRACTION_TIMEOUT_MS = 10 * 60 * 1000
// Minimum number of new messages required before a run does any LLM work.
const MIN_MESSAGE_THRESHOLD = 10

// SurrealDB may surface timestamps as Date, ISO string, or epoch number.
const RecordTimestampSchema = z.union([z.date(), z.string(), z.number()])
const MessageRoleSchema = z.enum(['system', 'user', 'assistant'])
// Message parts/metadata are passed through opaquely; only container shape is validated.
const MessagePartSchema = z.record(z.string(), z.unknown())
const MessageMetadataSchema = z.record(z.string(), z.unknown()).nullish()

// Row shape produced by the SELECT in listEligibleMessages
// (record ids are stringified in SurrealQL via type::string).
const WorkstreamMessageRowSchema = z.object({
  id: z.string(),
  workstreamId: z.string(),
  role: MessageRoleSchema,
  parts: z.array(MessagePartSchema).optional(),
  metadata: MessageMetadataSchema,
  createdAt: RecordTimestampSchema,
})

// Position of the last processed message; persisted through the workspace
// provider's background cursor so later runs only read newer messages.
interface DigestCursor {
  createdAt: Date
  id: string
}

// Normalized message consumed by buildDigestTranscript.
interface DigestMessage {
  source: 'workstream'
  sourceId: string
  role: 'system' | 'user' | 'assistant'
  parts: Array<Record<string, unknown>>
  metadata?: Record<string, unknown>
  cursor: DigestCursor
}

// Run summary returned to the worker for logging/metrics.
interface SkillExtractionRunResult {
  skipped: boolean
  processedMessages: number
  extractedSkills: number
}

// Module-level singletons shared across all runs in this worker process.
const embeddings = createDefaultEmbeddings()

const helperModelRuntime = createHelperModelRuntime()
60
+
61
+ function mapWorkstreamRow(row: z.infer<typeof WorkstreamMessageRowSchema>): DigestMessage {
62
+ return {
63
+ source: 'workstream',
64
+ sourceId: row.workstreamId,
65
+ role: row.role,
66
+ parts: row.parts ?? [],
67
+ metadata: row.metadata ?? undefined,
68
+ cursor: { createdAt: new Date(toTimestamp(row.createdAt)), id: row.id },
69
+ }
70
+ }
71
+
72
+ function compareMessageOrder(left: DigestMessage, right: DigestMessage): number {
73
+ const timeDiff = left.cursor.createdAt.getTime() - right.cursor.createdAt.getTime()
74
+ if (timeDiff !== 0) return timeDiff
75
+ return left.cursor.id.localeCompare(right.cursor.id)
76
+ }
77
+
78
// Returns the record ids of every workstream owned by the organization.
// Ids are stringified in SurrealQL (type::string) and re-validated /
// re-wrapped here so downstream queries receive proper record references.
async function listWorkstreamIdsForOrg(orgRef: RecordIdRef): Promise<RecordIdRef[]> {
  const EntityIdRowSchema = z.string().trim().min(1)
  const ids = await databaseService.query<unknown>(
    new BoundQuery(
      `SELECT VALUE type::string(id) FROM ${TABLES.WORKSTREAM}
      WHERE organizationId = $organizationId`,
      { organizationId: orgRef },
    ),
  )
  // Reject empty/whitespace ids before wrapping; throws on malformed rows.
  return ids.map((value) => ensureRecordId(EntityIdRowSchema.parse(value), TABLES.WORKSTREAM))
}
89
+
90
// Fetches all messages newer than the persisted cursor (keyset pagination on
// (createdAt, id)), or — on a first run — newer than the onboarding cutoff.
// Returns [] when there are no workstreams or when neither bound is
// available, which prevents a cursorless run from scanning all history.
async function listEligibleMessages(params: {
  workstreamIds: RecordIdRef[]
  cursor: DigestCursor | null
  onboardingCutoff: Date | null
}): Promise<DigestMessage[]> {
  if (params.workstreamIds.length === 0) return []

  let query: BoundQuery | null = null
  if (params.cursor) {
    const cursorRowId = ensureRecordId(params.cursor.id, TABLES.WORKSTREAM_MESSAGE)
    // Strict keyset predicate: rows after the cursor position, with createdAt
    // ties broken by id — mirrors the ORDER BY so no row is skipped or repeated.
    query = new BoundQuery(
      `SELECT type::string(id) AS id, type::string(workstreamId) AS workstreamId, role, parts, metadata, createdAt FROM ${TABLES.WORKSTREAM_MESSAGE}
      WHERE workstreamId IN $workstreamIds
      AND (
        createdAt > $cursorCreatedAt
        OR (createdAt = $cursorCreatedAt AND id > $cursorRowId)
      )
      ORDER BY createdAt ASC, id ASC`,
      { workstreamIds: params.workstreamIds, cursorCreatedAt: params.cursor.createdAt, cursorRowId },
    )
  } else if (params.onboardingCutoff) {
    // First run: take everything created after onboarding completed.
    query = new BoundQuery(
      `SELECT type::string(id) AS id, type::string(workstreamId) AS workstreamId, role, parts, metadata, createdAt FROM ${TABLES.WORKSTREAM_MESSAGE}
      WHERE workstreamIds IN $workstreamIds
      AND createdAt > $onboardingCutoff
      ORDER BY createdAt ASC, id ASC`,
      { workstreamIds: params.workstreamIds, onboardingCutoff: params.onboardingCutoff },
    )
  }

  if (!query) return []

  const rows = await databaseService.query<unknown>(query)
  // Validate each raw row before mapping; throws on unexpected shapes.
  return rows.map((row) => mapWorkstreamRow(WorkstreamMessageRowSchema.parse(row)))
}
125
+
126
+ function buildExtractionPrompt(params: { workspaceName: string; transcript: string; existingSkills: string }): string {
127
+ return [
128
+ `Workspace name: ${params.workspaceName}`,
129
+ '',
130
+ 'Existing learned skills:',
131
+ params.existingSkills || 'No existing learned skills.',
132
+ '',
133
+ 'Recent conversation transcript:',
134
+ params.transcript || 'No transcript.',
135
+ ].join('\n')
136
+ }
137
+
138
+ function buildManagerPrompt(params: {
139
+ candidate: SkillCandidate
140
+ existingSkill: { name: string; description: string; instructions: string; version: number } | null
141
+ }): string {
142
+ const parts = [
143
+ 'Candidate skill:',
144
+ `Name: ${params.candidate.name}`,
145
+ `Description: ${params.candidate.description}`,
146
+ `Instructions: ${params.candidate.instructions}`,
147
+ `Triggers: ${params.candidate.triggers.join(', ')}`,
148
+ `Confidence: ${params.candidate.confidence}`,
149
+ ]
150
+
151
+ if (params.existingSkill) {
152
+ parts.push(
153
+ '',
154
+ 'Most similar existing skill:',
155
+ `Name: ${params.existingSkill.name}`,
156
+ `Description: ${params.existingSkill.description}`,
157
+ `Instructions: ${params.existingSkill.instructions}`,
158
+ `Version: ${params.existingSkill.version}`,
159
+ )
160
+ } else {
161
+ parts.push('', 'No similar existing skill found.')
162
+ }
163
+
164
+ return parts.join('\n')
165
+ }
166
+
167
+ export async function runSkillExtraction(data: SkillExtractionJob): Promise<SkillExtractionRunResult> {
168
+ const orgRef = ensureRecordId(data.orgId, TABLES.ORGANIZATION)
169
+ const orgId = recordIdToString(orgRef, TABLES.ORGANIZATION)
170
+ const workspaceProvider = getRuntimeAdapters().services?.workspaceProvider
171
+ const cursorAwareWorkspaceProvider =
172
+ workspaceProvider?.getBackgroundCursor && workspaceProvider.setBackgroundCursor
173
+ ? (workspaceProvider as typeof workspaceProvider & {
174
+ getBackgroundCursor: NonNullable<typeof workspaceProvider.getBackgroundCursor>
175
+ setBackgroundCursor: NonNullable<typeof workspaceProvider.setBackgroundCursor>
176
+ })
177
+ : undefined
178
+ if (!cursorAwareWorkspaceProvider) {
179
+ serverLogger.info`Skipping skill extraction for ${orgId}: workspaceProvider background cursor methods are not configured`
180
+ return { skipped: true, processedMessages: 0, extractedSkills: 0 }
181
+ }
182
+
183
+ return await withConfiguredWorkspaceMemoryLock(orgId, async () => {
184
+ const workspace = await cursorAwareWorkspaceProvider.getWorkspace(orgRef)
185
+ const lifecycleState = await cursorAwareWorkspaceProvider.getLifecycleState?.(workspace)
186
+ if (lifecycleState?.bootstrapActive ?? false) {
187
+ serverLogger.info`Skipping skill extraction for ${orgId}: onboarding is not completed`
188
+ return { skipped: true, processedMessages: 0, extractedSkills: 0 }
189
+ }
190
+ const projectionState = await cursorAwareWorkspaceProvider.readProfileProjectionState?.(workspace)
191
+
192
+ const existingCursor = await cursorAwareWorkspaceProvider.getBackgroundCursor('skill-extraction', orgRef)
193
+ const onboardingCutoff = resolveWorkspaceBootstrapCutoff({
194
+ hasExistingCursor: existingCursor !== null,
195
+ bootstrapCompletedAt: lifecycleState?.bootstrapCompletedAt,
196
+ })
197
+
198
+ const workstreamIds = await listWorkstreamIdsForOrg(orgRef)
199
+ const messages = await listEligibleMessages({ workstreamIds, cursor: existingCursor, onboardingCutoff })
200
+
201
+ if (messages.length < MIN_MESSAGE_THRESHOLD) {
202
+ serverLogger.info`Skipping skill extraction for ${orgId}: only ${messages.length} messages (threshold: ${MIN_MESSAGE_THRESHOLD})`
203
+ return { skipped: true, processedMessages: messages.length, extractedSkills: 0 }
204
+ }
205
+
206
+ const sortedMessages = [...messages].sort(compareMessageOrder)
207
+ const { transcript } = buildDigestTranscript({ messages: sortedMessages })
208
+
209
+ const existingSkills = await learnedSkillService.listForOrg(orgId)
210
+ const existingSkillsSummary =
211
+ existingSkills.length > 0
212
+ ? existingSkills.map((skill, i) => `${i + 1}. ${skill.name}: ${skill.description}`).join('\n')
213
+ : 'None'
214
+
215
+ const extraction = await helperModelRuntime.generateHelperStructured({
216
+ tag: 'skill-extraction',
217
+ createAgent: createSkillExtractorAgent,
218
+ timeoutMs: SKILL_EXTRACTION_TIMEOUT_MS,
219
+ messages: [
220
+ {
221
+ role: 'user',
222
+ content: buildExtractionPrompt({
223
+ workspaceName: projectionState?.workspaceName || 'Workspace',
224
+ transcript,
225
+ existingSkills: existingSkillsSummary,
226
+ }),
227
+ },
228
+ ],
229
+ schema: SkillExtractionOutputSchema,
230
+ })
231
+
232
+ const skillCandidates = extraction.candidates.filter((c) => c.classification === 'skill')
233
+ let extractedSkills = 0
234
+
235
+ for (const candidate of skillCandidates) {
236
+ try {
237
+ const hash = learnedSkillService.generateHash(candidate.description, candidate.instructions)
238
+ const existingByHash = await learnedSkillService.findByHash(orgId, hash)
239
+ if (existingByHash) {
240
+ serverLogger.info`Skipping duplicate skill candidate ${candidate.name} (hash match)`
241
+ continue
242
+ }
243
+
244
+ const mostSimilar = await learnedSkillService.findMostSimilar(orgId, candidate.description)
245
+
246
+ const managerResult = await helperModelRuntime.generateHelperStructured({
247
+ tag: 'skill-manager',
248
+ createAgent: createSkillManagerAgent,
249
+ timeoutMs: SKILL_EXTRACTION_TIMEOUT_MS,
250
+ messages: [
251
+ {
252
+ role: 'user',
253
+ content: buildManagerPrompt({
254
+ candidate,
255
+ existingSkill: mostSimilar
256
+ ? {
257
+ name: mostSimilar.name,
258
+ description: mostSimilar.description,
259
+ instructions: mostSimilar.instructions,
260
+ version: mostSimilar.version,
261
+ }
262
+ : null,
263
+ }),
264
+ },
265
+ ],
266
+ schema: SkillManagerOutputSchema,
267
+ })
268
+
269
+ if (managerResult.decision === 'discard') {
270
+ serverLogger.info`Discarding skill candidate ${candidate.name}: ${managerResult.reason}`
271
+ continue
272
+ }
273
+
274
+ const embedding = await embeddings.embedQuery(candidate.description)
275
+ if (embedding.length === 0) {
276
+ serverLogger.warn`Skipping skill candidate ${candidate.name}: empty embedding`
277
+ continue
278
+ }
279
+
280
+ if (managerResult.decision === 'add') {
281
+ await learnedSkillService.create({
282
+ name: candidate.name,
283
+ description: candidate.description,
284
+ instructions: candidate.instructions,
285
+ triggers: candidate.triggers,
286
+ tags: candidate.tags,
287
+ examples: candidate.examples,
288
+ sourceType: 'conversation',
289
+ organizationId: orgId,
290
+ agentId: candidate.agentId,
291
+ confidence: candidate.confidence,
292
+ embedding,
293
+ hash,
294
+ })
295
+ extractedSkills++
296
+ serverLogger.info`Added new learned skill: ${candidate.name}`
297
+ } else if (mostSimilar && managerResult.mergedSkill) {
298
+ const merged = managerResult.mergedSkill
299
+ const mergedHash = learnedSkillService.generateHash(merged.description, merged.instructions)
300
+ const mergedEmbedding = await embeddings.embedQuery(merged.description)
301
+
302
+ await learnedSkillService.update(mostSimilar.id, {
303
+ name: merged.name,
304
+ description: merged.description,
305
+ instructions: merged.instructions,
306
+ triggers: merged.triggers,
307
+ tags: merged.tags,
308
+ examples: merged.examples,
309
+ confidence: merged.confidence,
310
+ version: mostSimilar.version + 1,
311
+ embedding: mergedEmbedding,
312
+ hash: mergedHash,
313
+ })
314
+ extractedSkills++
315
+ serverLogger.info`Merged skill candidate into ${mostSimilar.name} (v${mostSimilar.version + 1})`
316
+ }
317
+ } catch (candidateError) {
318
+ serverLogger.warn`Failed to process skill candidate ${candidate.name}: ${candidateError}`
319
+ }
320
+ }
321
+
322
+ const lastMessage = sortedMessages.at(-1)
323
+ if (lastMessage) {
324
+ await cursorAwareWorkspaceProvider.setBackgroundCursor('skill-extraction', orgRef, lastMessage.cursor)
325
+ }
326
+
327
+ serverLogger.info`Skill extraction completed for ${orgId}: messages=${messages.length}, extracted=${extractedSkills}`
328
+
329
+ return { skipped: false, processedMessages: messages.length, extractedSkills }
330
+ })
331
+ }
@@ -0,0 +1,22 @@
1
+ import type { SandboxedJob } from 'bullmq'
2
+
3
+ import { serverLogger } from '../config/logger'
4
+ import type { SkillExtractionJob } from '../queues/skill-extraction.queue'
5
+ import { initializeSandboxedWorkerRuntime } from './bootstrap'
6
+ import { runSkillExtraction } from './skill-extraction.runner'
7
+ import { toSandboxedWorkerError } from './utils/sandbox-error'
8
+ import { createTracedWorkerProcessor } from './worker-utils'
9
+
10
+ await initializeSandboxedWorkerRuntime()
11
+
12
+ const handler = async (job: SandboxedJob<SkillExtractionJob>) => {
13
+ try {
14
+ await runSkillExtraction(job.data)
15
+ } catch (error) {
16
+ const serialized = toSandboxedWorkerError(error, 'Skill extraction failed')
17
+ serverLogger.error`${serialized.message}`
18
+ throw serialized
19
+ }
20
+ }
21
+
22
+ export default createTracedWorkerProcessor('skill-extraction', handler)
@@ -0,0 +1,331 @@
1
// Default upper bound on characters per emitted chunk.
export const DEFAULT_REPOMIX_CHUNK_MAX_CHARS = 250_000
// Hard floor for maxChars so pathological options cannot produce confetti chunks.
const MIN_REPOMIX_CHUNK_MAX_CHARS = 4_000
// Default threshold below which a chunk is "tiny" and merged into a neighbor.
export const DEFAULT_REPOMIX_CHUNK_MIN_CHARS = 10_000
// Length of the '\n\n' separator inserted between sections when joining.
const SECTION_SEPARATOR_LENGTH = 2
// Matches repomix per-file section headers, e.g. "## File: src/index.ts".
const FILE_SECTION_HEADER_SOURCE = '^## File:\\s+(.+)$'

// One parsed section of repomix output: either the preamble before the first
// file header, or a single "## File: ..." section (filePath set for files).
interface RepomixSection {
  kind: 'preamble' | 'file'
  content: string
  filePath?: string
}

// A chunk of repomix output sized for an LLM context window, plus
// bookkeeping metadata about what it contains.
export interface RepomixContextChunk {
  // 1-based position within the emitted chunk list.
  index: number
  totalChunks: number
  content: string
  charLength: number
  // Rough chars/3 heuristic — see estimateTokenCountFromChars.
  tokenEstimate: number
  sectionCount: number
  fileCount: number
  firstFilePath: string | null
  lastFilePath: string | null
}

interface RepomixChunkOptions {
  maxChars?: number
  minChunkChars?: number
  // Keep ``` fences balanced when splitting file bodies (default: true).
  preserveCodeFenceIntegrity?: boolean
}
30
+
31
+ function estimateTokenCountFromChars(text: string): number {
32
+ if (!text) return 0
33
+ return Math.ceil(text.length / 3)
34
+ }
35
+
36
+ function normalizeMaxChars(value?: number): number {
37
+ if (typeof value !== 'number' || !Number.isFinite(value)) {
38
+ return DEFAULT_REPOMIX_CHUNK_MAX_CHARS
39
+ }
40
+ return Math.max(MIN_REPOMIX_CHUNK_MAX_CHARS, Math.floor(value))
41
+ }
42
+
43
+ function normalizeMinChunkChars(value: number | undefined, maxChars: number): number {
44
+ if (typeof value !== 'number' || !Number.isFinite(value)) {
45
+ return Math.min(DEFAULT_REPOMIX_CHUNK_MIN_CHARS, Math.floor(maxChars * 0.35))
46
+ }
47
+ const normalized = Math.max(512, Math.floor(value))
48
+ return Math.min(normalized, Math.floor(maxChars * 0.6))
49
+ }
50
+
51
+ async function splitTextByCharBudget(text: string, maxChars: number): Promise<string[]> {
52
+ const source = text.trim()
53
+ if (!source) return []
54
+ if (source.length <= maxChars) return [source]
55
+
56
+ const chunks: string[] = []
57
+ let cursor = 0
58
+
59
+ while (cursor < source.length) {
60
+ let end = Math.min(source.length, cursor + maxChars)
61
+
62
+ if (end < source.length) {
63
+ const breakCandidates = [
64
+ source.lastIndexOf('\n## File: ', end),
65
+ source.lastIndexOf('\n```\n', end),
66
+ source.lastIndexOf('\n\n', end),
67
+ source.lastIndexOf('\n', end),
68
+ source.lastIndexOf(' ', end),
69
+ ]
70
+ const preferred = breakCandidates.find((position) => position > cursor + Math.floor(maxChars * 0.35))
71
+ if (typeof preferred === 'number' && preferred > cursor) {
72
+ end = preferred
73
+ }
74
+ }
75
+
76
+ const chunk = source.slice(cursor, end).trim()
77
+ if (chunk) {
78
+ chunks.push(chunk)
79
+ }
80
+ cursor = end
81
+ }
82
+
83
+ return chunks
84
+ }
85
+
86
+ function splitBodyByBudget(params: { body: string; budget: number; minChunkChars: number }): string[] {
87
+ const body = params.body.trim()
88
+ if (!body) return []
89
+ if (body.length <= params.budget) return [body]
90
+
91
+ const chunks: string[] = []
92
+ let cursor = 0
93
+ while (cursor < body.length) {
94
+ let end = Math.min(body.length, cursor + params.budget)
95
+ if (end < body.length) {
96
+ const breakAt = body.lastIndexOf('\n', end)
97
+ if (breakAt > cursor + Math.floor(params.minChunkChars / 2)) {
98
+ end = breakAt
99
+ }
100
+ }
101
+
102
+ const part = body.slice(cursor, end).trim()
103
+ if (part) {
104
+ chunks.push(part)
105
+ }
106
+ cursor = end
107
+ }
108
+
109
+ return chunks
110
+ }
111
+
112
+ function splitFileBody(
113
+ body: string,
114
+ options: { budget: number; minChunkChars: number; preserveCodeFenceIntegrity: boolean },
115
+ ): string[] {
116
+ const source = body.trim()
117
+ if (!source) return []
118
+
119
+ if (!options.preserveCodeFenceIntegrity) {
120
+ return splitBodyByBudget({ body: source, budget: options.budget, minChunkChars: options.minChunkChars })
121
+ }
122
+
123
+ const fenceHeaderMatch = source.match(/^```[^\n]*\n/)
124
+ const hasTrailingFence = source.endsWith('\n```')
125
+ if (!fenceHeaderMatch || !hasTrailingFence) {
126
+ return splitBodyByBudget({ body: source, budget: options.budget, minChunkChars: options.minChunkChars })
127
+ }
128
+
129
+ const openingFence = fenceHeaderMatch[0].trimEnd()
130
+ const innerStart = fenceHeaderMatch[0].length
131
+ const innerEnd = Math.max(innerStart, source.length - '\n```'.length)
132
+ const innerBody = source.slice(innerStart, innerEnd)
133
+ const wrapperChars = openingFence.length + '\n'.length + '\n```'.length
134
+ const innerBudget = Math.max(1_000, options.budget - wrapperChars)
135
+ const innerChunks = splitBodyByBudget({ body: innerBody, budget: innerBudget, minChunkChars: options.minChunkChars })
136
+
137
+ if (innerChunks.length === 0) {
138
+ return [source]
139
+ }
140
+
141
+ return innerChunks.map((chunk) => `${openingFence}\n${chunk}\n\`\`\``)
142
+ }
143
+
144
+ function mergeTinyTailParts(parts: string[], options: { minChunkChars: number; maxChars: number }): string[] {
145
+ if (parts.length <= 1) return parts
146
+ const merged = [...parts]
147
+ while (merged.length > 1) {
148
+ const last = merged[merged.length - 1] ?? ''
149
+ if (last.length >= options.minChunkChars) break
150
+ const previous = merged[merged.length - 2] ?? ''
151
+ const combinedLength = previous.length + SECTION_SEPARATOR_LENGTH + last.length
152
+ if (combinedLength > options.maxChars) break
153
+ merged.splice(merged.length - 2, 2, `${previous}\n\n${last}`.trim())
154
+ }
155
+ return merged
156
+ }
157
+
158
// Splits a parsed section that exceeds maxChars. Preamble sections use the
// generic text splitter; file sections keep their "## File:" header on every
// piece and gain a "(part i/n)" marker so readers can reassemble them.
async function splitOversizedSection(
  section: RepomixSection,
  options: { maxChars: number; minChunkChars: number; preserveCodeFenceIntegrity: boolean },
): Promise<RepomixSection[]> {
  if (section.content.length <= options.maxChars) {
    return [section]
  }

  if (section.kind !== 'file' || !section.filePath) {
    const chunks = await splitTextByCharBudget(section.content, options.maxChars)
    return chunks.map((content) => ({ kind: section.kind, content }))
  }

  const header = `## File: ${section.filePath}`
  // Strip the header before splitting so it is not duplicated mid-body.
  const body = section.content.startsWith(header) ? section.content.slice(header.length).trimStart() : section.content
  // Worst-case length of the "(part i/n)" marker re-added to each piece
  // (sized for up to 3-digit part counts).
  const partPrefixTemplate = '\n(part 000/000)\n'
  const bodyBudget = Math.max(1_000, options.maxChars - header.length - partPrefixTemplate.length)

  const bodyParts = splitFileBody(body, {
    budget: bodyBudget,
    minChunkChars: options.minChunkChars,
    preserveCodeFenceIntegrity: options.preserveCodeFenceIntegrity,
  })
  // Fold an undersized final part back into its predecessor before numbering.
  const normalizedParts = mergeTinyTailParts(bodyParts, { minChunkChars: options.minChunkChars, maxChars: bodyBudget })
  const totalParts = Math.max(1, normalizedParts.length)

  return normalizedParts.map((chunk, index) => ({
    kind: 'file',
    filePath: section.filePath,
    content: `${header}\n(part ${index + 1}/${totalParts})\n${chunk}`.trim(),
  }))
}
190
+
191
+ function parseRepomixSections(repomixOutput: string): RepomixSection[] {
192
+ const source = repomixOutput.trim()
193
+ if (!source) return []
194
+
195
+ const matches = Array.from(source.matchAll(new RegExp(FILE_SECTION_HEADER_SOURCE, 'gm')))
196
+ if (matches.length === 0) {
197
+ return [{ kind: 'preamble', content: source }]
198
+ }
199
+
200
+ const sections: RepomixSection[] = []
201
+ const firstMatch = matches.at(0)
202
+ const firstIndex = firstMatch ? firstMatch.index : 0
203
+ if (firstIndex > 0) {
204
+ const preamble = source.slice(0, firstIndex).trim()
205
+ if (preamble) {
206
+ sections.push({ kind: 'preamble', content: preamble })
207
+ }
208
+ }
209
+
210
+ for (const [index, match] of matches.entries()) {
211
+ const start = match.index
212
+ const nextStart = matches[index + 1]?.index ?? source.length
213
+ const content = source.slice(start, nextStart).trim()
214
+ if (!content) continue
215
+ const filePath = (match[1] ?? '').trim()
216
+ sections.push({ kind: 'file', content, filePath: filePath || undefined })
217
+ }
218
+
219
+ return sections
220
+ }
221
+
222
+ function mergeTinyChunks(
223
+ chunks: Omit<RepomixContextChunk, 'index' | 'totalChunks'>[],
224
+ options: { minChunkChars: number; maxChars: number },
225
+ ): Omit<RepomixContextChunk, 'index' | 'totalChunks'>[] {
226
+ if (chunks.length <= 1) return chunks
227
+ const merged: Omit<RepomixContextChunk, 'index' | 'totalChunks'>[] = []
228
+
229
+ for (const chunk of chunks) {
230
+ const previous = merged.at(-1)
231
+ if (
232
+ previous &&
233
+ chunk.charLength < options.minChunkChars &&
234
+ previous.charLength + SECTION_SEPARATOR_LENGTH + chunk.charLength <= options.maxChars
235
+ ) {
236
+ const combinedContent = `${previous.content}\n\n${chunk.content}`.trim()
237
+ merged[merged.length - 1] = {
238
+ ...previous,
239
+ content: combinedContent,
240
+ charLength: combinedContent.length,
241
+ tokenEstimate: estimateTokenCountFromChars(combinedContent),
242
+ sectionCount: previous.sectionCount + chunk.sectionCount,
243
+ fileCount: previous.fileCount + chunk.fileCount,
244
+ firstFilePath: previous.firstFilePath,
245
+ lastFilePath: chunk.lastFilePath ?? previous.lastFilePath,
246
+ }
247
+ continue
248
+ }
249
+ merged.push(chunk)
250
+ }
251
+
252
+ return merged
253
+ }
254
+
255
/**
 * Chunks repomix output into LLM-sized pieces.
 *
 * Pipeline: parse into preamble/file sections → split any section larger
 * than maxChars → greedily pack consecutive sections into chunks up to
 * maxChars → merge tiny chunks into neighbors → assign 1-based
 * index/totalChunks metadata.
 */
export async function chunkRepomixOutput(
  repomixOutput: string,
  options: RepomixChunkOptions = {},
): Promise<RepomixContextChunk[]> {
  const maxChars = normalizeMaxChars(options.maxChars)
  const minChunkChars = normalizeMinChunkChars(options.minChunkChars, maxChars)
  const preserveCodeFenceIntegrity = options.preserveCodeFenceIntegrity ?? true

  const rawSections = parseRepomixSections(repomixOutput)
  const splitSections = await Promise.all(
    rawSections.map(
      async (section) => await splitOversizedSection(section, { maxChars, minChunkChars, preserveCodeFenceIntegrity }),
    ),
  )
  const sections = splitSections.flat()

  if (sections.length === 0) return []

  // Accumulator state for the greedy packing pass below.
  const chunks: Omit<RepomixContextChunk, 'index' | 'totalChunks'>[] = []
  let currentParts: string[] = []
  let currentCharLength = 0
  let currentSectionCount = 0
  let currentFileCount = 0
  let currentFirstFilePath: string | null = null
  let currentLastFilePath: string | null = null

  // Emits the accumulated sections as one chunk and resets the accumulator.
  const flushCurrent = () => {
    if (currentParts.length === 0) return

    const content = currentParts.join('\n\n').trim()
    if (!content) return

    chunks.push({
      content,
      charLength: content.length,
      tokenEstimate: estimateTokenCountFromChars(content),
      sectionCount: currentSectionCount,
      fileCount: currentFileCount,
      firstFilePath: currentFirstFilePath,
      lastFilePath: currentLastFilePath,
    })

    currentParts = []
    currentCharLength = 0
    currentSectionCount = 0
    currentFileCount = 0
    currentFirstFilePath = null
    currentLastFilePath = null
  }

  for (const section of sections) {
    const sectionText = section.content.trim()
    if (!sectionText) continue

    // Flush first when adding this section (plus the '\n\n' joiner) would
    // overflow the budget; an oversized lone section still becomes its own chunk.
    const addedSeparator = currentParts.length > 0 ? SECTION_SEPARATOR_LENGTH : 0
    const projected = currentCharLength + sectionText.length + addedSeparator
    if (currentParts.length > 0 && projected > maxChars) {
      flushCurrent()
    }

    currentParts.push(sectionText)
    currentCharLength += sectionText.length + (currentParts.length > 1 ? SECTION_SEPARATOR_LENGTH : 0)
    currentSectionCount += 1
    if (section.kind === 'file') {
      currentFileCount += 1
      if (!currentFirstFilePath) {
        currentFirstFilePath = section.filePath ?? null
      }
      currentLastFilePath = section.filePath ?? currentLastFilePath
    }
  }

  flushCurrent()
  const normalizedChunks = mergeTinyChunks(chunks, { minChunkChars, maxChars })
  const totalChunks = normalizedChunks.length
  return normalizedChunks.map((chunk, index) => ({ ...chunk, index: index + 1, totalChunks }))
}