@remnic/core 9.3.593 → 9.3.594
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/access-cli.js +2 -2
- package/dist/buffer-surprise.js +2 -2
- package/dist/{chunk-7ZGA7YTS.js → chunk-7WU3FML2.js} +2 -2
- package/dist/{chunk-5JRF2PZA.js → chunk-BF7ZRHH2.js} +2 -2
- package/dist/{chunk-KVE7R4CG.js → chunk-WCYKT2DE.js} +21 -1
- package/dist/chunk-WCYKT2DE.js.map +1 -0
- package/dist/index.js +3 -3
- package/dist/orchestrator.js +2 -2
- package/dist/schemas.d.ts +22 -22
- package/dist/semantic-chunking.js +1 -1
- package/dist/transfer/types.d.ts +12 -12
- package/package.json +1 -1
- package/src/semantic-chunking.ts +25 -0
- package/dist/chunk-KVE7R4CG.js.map +0 -1
- /package/dist/{chunk-7ZGA7YTS.js.map → chunk-7WU3FML2.js.map} +0 -0
- /package/dist/{chunk-5JRF2PZA.js.map → chunk-BF7ZRHH2.js.map} +0 -0
package/dist/access-cli.js
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import {
|
|
2
2
|
Orchestrator
|
|
3
|
-
} from "./chunk-7ZGA7YTS.js";
|
|
3
|
+
} from "./chunk-7WU3FML2.js";
|
|
4
4
|
import "./chunk-5RIRL3XL.js";
|
|
5
5
|
import "./chunk-KVEVLBKC.js";
|
|
6
6
|
import "./chunk-BFBF3XEF.js";
|
|
@@ -116,7 +116,7 @@ import "./chunk-3ONXXHQO.js";
|
|
|
116
116
|
import "./chunk-LMDRGRJ2.js";
|
|
117
117
|
import "./chunk-OKTXM5H4.js";
|
|
118
118
|
import "./chunk-3UXOZBHV.js";
|
|
119
|
-
import "./chunk-KVE7R4CG.js";
|
|
119
|
+
import "./chunk-WCYKT2DE.js";
|
|
120
120
|
import "./chunk-4WMCPJWX.js";
|
|
121
121
|
import "./chunk-EVZFIAPG.js";
|
|
122
122
|
import "./chunk-CSKLPDN6.js";
|
package/dist/buffer-surprise.js
CHANGED
|
@@ -274,7 +274,7 @@ import {
|
|
|
274
274
|
} from "./chunk-LMDRGRJ2.js";
|
|
275
275
|
import {
|
|
276
276
|
semanticChunkContent
|
|
277
|
-
} from "./chunk-KVE7R4CG.js";
|
|
277
|
+
} from "./chunk-WCYKT2DE.js";
|
|
278
278
|
import {
|
|
279
279
|
chunkContent
|
|
280
280
|
} from "./chunk-4WMCPJWX.js";
|
|
@@ -12405,4 +12405,4 @@ export {
|
|
|
12405
12405
|
resolvePersistedMemoryRelativePath,
|
|
12406
12406
|
Orchestrator
|
|
12407
12407
|
};
|
|
12408
|
-
//# sourceMappingURL=chunk-7ZGA7YTS.js.map
|
|
12408
|
+
//# sourceMappingURL=chunk-7WU3FML2.js.map
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import {
|
|
2
2
|
cosineSimilarity
|
|
3
|
-
} from "./chunk-KVE7R4CG.js";
|
|
3
|
+
} from "./chunk-WCYKT2DE.js";
|
|
4
4
|
|
|
5
5
|
// src/buffer-surprise.ts
|
|
6
6
|
var DEFAULT_SURPRISE_K = 5;
|
|
@@ -64,4 +64,4 @@ export {
|
|
|
64
64
|
DEFAULT_SURPRISE_K,
|
|
65
65
|
computeSurprise
|
|
66
66
|
};
|
|
67
|
-
//# sourceMappingURL=chunk-5JRF2PZA.js.map
|
|
67
|
+
//# sourceMappingURL=chunk-BF7ZRHH2.js.map
|
|
@@ -103,6 +103,17 @@ async function batchEmbed(sentences, embedFn, batchSize) {
|
|
|
103
103
|
}
|
|
104
104
|
return allEmbeddings;
|
|
105
105
|
}
|
|
106
|
+
function findEmbeddingDimensionMismatch(embeddings) {
|
|
107
|
+
if (embeddings.length <= 1) return null;
|
|
108
|
+
const expected = embeddings[0].length;
|
|
109
|
+
for (let i = 1; i < embeddings.length; i++) {
|
|
110
|
+
const actual = embeddings[i].length;
|
|
111
|
+
if (actual !== expected) {
|
|
112
|
+
return { expected, actual, index: i };
|
|
113
|
+
}
|
|
114
|
+
}
|
|
115
|
+
return null;
|
|
116
|
+
}
|
|
106
117
|
function buildSegments(sentences, boundaries) {
|
|
107
118
|
const sorted = [...boundaries].sort((a, b) => a - b);
|
|
108
119
|
const segments = [];
|
|
@@ -221,6 +232,15 @@ async function semanticChunkContent(content, embedFn, config) {
|
|
|
221
232
|
`Semantic chunking failed: expected ${sentences.length} embeddings but received ${embeddings.length}`
|
|
222
233
|
);
|
|
223
234
|
}
|
|
235
|
+
const dimensionMismatch = findEmbeddingDimensionMismatch(embeddings);
|
|
236
|
+
if (dimensionMismatch) {
|
|
237
|
+
if (cfg.fallbackToRecursive) {
|
|
238
|
+
return buildRecursiveFallback(content, cfg);
|
|
239
|
+
}
|
|
240
|
+
throw new Error(
|
|
241
|
+
`Semantic chunking failed: embedding vectors have mismatched dimensions (${dimensionMismatch.expected} vs ${dimensionMismatch.actual} at index ${dimensionMismatch.index})`
|
|
242
|
+
);
|
|
243
|
+
}
|
|
224
244
|
const similarities = [];
|
|
225
245
|
for (let i = 0; i < sentences.length - 1; i++) {
|
|
226
246
|
similarities.push(cosineSimilarity(embeddings[i], embeddings[i + 1]));
|
|
@@ -317,4 +337,4 @@ export {
|
|
|
317
337
|
findLocalMinima,
|
|
318
338
|
semanticChunkContent
|
|
319
339
|
};
|
|
320
|
-
//# sourceMappingURL=chunk-KVE7R4CG.js.map
|
|
340
|
+
//# sourceMappingURL=chunk-WCYKT2DE.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"sources":["../src/semantic-chunking.ts"],"sourcesContent":["/**\n * Semantic Chunking with Smoothing-Based Topic Boundaries (Issue #368)\n *\n * An optional alternative to the recursive chunker in chunking.ts.\n * Uses sentence embeddings + cosine similarity + smoothing to detect\n * natural topic boundaries, producing more coherent chunks.\n */\n\nimport { chunkContent, type Chunk, type ChunkResult } from \"./chunking.js\";\n\n// ---------------------------------------------------------------------------\n// Configuration\n// ---------------------------------------------------------------------------\n\nexport interface SemanticChunkingConfig {\n /** Target tokens per chunk. Default: 200. */\n targetTokens: number;\n /** Minimum tokens for a segment before merging with neighbor. Default: 100. */\n minTokens: number;\n /** Maximum tokens for a segment before recursive splitting. Default: 400. */\n maxTokens: number;\n /** Window size for the moving-average smoothing filter. Default: 3. */\n smoothingWindowSize: number;\n /** How many standard deviations below the mean constitutes a boundary. Default: 1.0. */\n boundaryThresholdStdDevs: number;\n /** Batch size for embedding requests. Default: 32. */\n embeddingBatchSize: number;\n /** Fall back to recursive chunking when embeddings are unavailable. Default: true. */\n fallbackToRecursive: boolean;\n}\n\nexport const DEFAULT_SEMANTIC_CHUNKING_CONFIG: SemanticChunkingConfig = {\n targetTokens: 200,\n minTokens: 100,\n maxTokens: 400,\n smoothingWindowSize: 3,\n boundaryThresholdStdDevs: 1.0,\n embeddingBatchSize: 32,\n fallbackToRecursive: true,\n};\n\n// ---------------------------------------------------------------------------\n// Result types\n// ---------------------------------------------------------------------------\n\nexport interface SemanticChunk extends Chunk {\n /** Optional topic hint derived from position. 
*/\n topicLabel?: string;\n /** Cosine similarity score at the trailing boundary of this chunk. */\n boundaryScore: number;\n}\n\nexport interface SemanticChunkResult {\n /** Whether content was split into multiple chunks. */\n chunked: boolean;\n /** The chunks produced. */\n chunks: SemanticChunk[];\n /** Sentence indices where topic splits occurred. */\n boundaries: number[];\n /** Which algorithm produced the result. */\n method: \"semantic\" | \"recursive-fallback\";\n}\n\n// ---------------------------------------------------------------------------\n// Embedding function signature\n// ---------------------------------------------------------------------------\n\n/** Caller-provided function that embeds an array of texts, returning vectors. */\nexport type EmbedFn = (texts: string[]) => Promise<number[][]>;\n\n// ---------------------------------------------------------------------------\n// Math utilities (exported for testing)\n// ---------------------------------------------------------------------------\n\n/**\n * Cosine similarity between two vectors.\n * Returns a value in [-1, 1]. 
Identical direction = 1, orthogonal = 0.\n *\n * NOTE: This duplicates cosineSimilarity in recall-mmr.ts and embedding-fallback.ts.\n * Consider extracting to a shared math utility in a future refactor.\n */\nexport function cosineSimilarity(a: number[], b: number[]): number {\n if (a.length !== b.length) {\n throw new Error(\n `cosineSimilarity: vector length mismatch (${a.length} vs ${b.length})`,\n );\n }\n if (a.length === 0) return 0;\n\n let dot = 0;\n let magA = 0;\n let magB = 0;\n for (let i = 0; i < a.length; i++) {\n dot += a[i] * b[i];\n magA += a[i] * a[i];\n magB += b[i] * b[i];\n }\n\n const denom = Math.sqrt(magA) * Math.sqrt(magB);\n if (denom === 0) return 0;\n return dot / denom;\n}\n\n/**\n * Arithmetic mean of a numeric series.\n */\nexport function mean(series: number[]): number {\n if (series.length === 0) return 0;\n let sum = 0;\n for (const v of series) sum += v;\n return sum / series.length;\n}\n\n/**\n * Population standard deviation of a numeric series.\n */\nexport function stddev(series: number[]): number {\n if (series.length === 0) return 0;\n const m = mean(series);\n let sumSq = 0;\n for (const v of series) {\n const d = v - m;\n sumSq += d * d;\n }\n return Math.sqrt(sumSq / series.length);\n}\n\n/**\n * Simple moving average over a 1D series.\n * The window is centered: for window size W, each output[i] averages\n * series[i - floor(W/2) .. 
i + floor(W/2)], clamped to bounds.\n *\n * Even window sizes are rounded up to the next odd value so the window\n * is symmetric around the center point (Finding 4, PR #420).\n */\nexport function movingAverage(series: number[], windowSize: number): number[] {\n if (series.length === 0) return [];\n if (windowSize < 1) windowSize = 1;\n // Round even values up to the next odd so the window is symmetric.\n if (windowSize % 2 === 0) windowSize = windowSize + 1;\n\n const halfW = Math.floor(windowSize / 2);\n const result: number[] = new Array(series.length);\n\n for (let i = 0; i < series.length; i++) {\n const lo = Math.max(0, i - halfW);\n const hi = Math.min(series.length - 1, i + halfW);\n let sum = 0;\n for (let j = lo; j <= hi; j++) sum += series[j];\n result[i] = sum / (hi - lo + 1);\n }\n return result;\n}\n\n/**\n * Find indices in the series that are local minima AND below the threshold.\n * A local minimum is a point lower than both its immediate neighbors\n * (or lower-or-equal at series boundaries).\n */\nexport function findLocalMinima(\n series: number[],\n threshold: number,\n): number[] {\n if (series.length <= 2) return [];\n\n const minima: number[] = [];\n for (let i = 1; i < series.length - 1; i++) {\n if (\n series[i] < series[i - 1] &&\n series[i] < series[i + 1] &&\n series[i] < threshold\n ) {\n minima.push(i);\n }\n }\n return minima;\n}\n\n// ---------------------------------------------------------------------------\n// Sentence tokenizer\n// ---------------------------------------------------------------------------\n\n/**\n * Split text into sentences at punctuation boundaries.\n * Preserves punctuation with the preceding sentence.\n */\nfunction splitSentences(text: string): string[] {\n const sentences: string[] = [];\n const sentenceRegex = /[^.!?]*[.!?]+(?:\\s+|$)/g;\n\n let match: RegExpExecArray | null;\n let lastIndex = 0;\n\n while ((match = sentenceRegex.exec(text)) !== null) {\n sentences.push(match[0].trim());\n lastIndex = 
sentenceRegex.lastIndex;\n }\n\n if (lastIndex < text.length) {\n const remaining = text.slice(lastIndex).trim();\n if (remaining) {\n sentences.push(remaining);\n }\n }\n\n return sentences.filter((s) => s.length > 0);\n}\n\n// ---------------------------------------------------------------------------\n// Token estimation\n// ---------------------------------------------------------------------------\n\n/** Rough token estimate: ~4 chars per token for English. */\nfunction estimateTokens(text: string): number {\n return Math.ceil(text.length / 4);\n}\n\n// ---------------------------------------------------------------------------\n// Core semantic chunking\n// ---------------------------------------------------------------------------\n\n/**\n * Batch-embed sentences using the provided embed function.\n * Respects the configured batch size.\n */\nasync function batchEmbed(\n sentences: string[],\n embedFn: EmbedFn,\n batchSize: number,\n): Promise<number[][]> {\n const allEmbeddings: number[][] = [];\n\n for (let i = 0; i < sentences.length; i += batchSize) {\n const batch = sentences.slice(i, i + batchSize);\n const batchResult = await embedFn(batch);\n for (const vec of batchResult) {\n allEmbeddings.push(vec);\n }\n }\n\n return allEmbeddings;\n}\n\nfunction findEmbeddingDimensionMismatch(\n embeddings: number[][],\n): { expected: number; actual: number; index: number } | null {\n if (embeddings.length <= 1) return null;\n const expected = embeddings[0].length;\n for (let i = 1; i < embeddings.length; i++) {\n const actual = embeddings[i].length;\n if (actual !== expected) {\n return { expected, actual, index: i };\n }\n }\n return null;\n}\n\n/**\n * Build segments from boundary indices.\n * boundaries are sentence indices at which splits occur (i.e., the split\n * happens AFTER the boundary index sentence).\n */\nfunction buildSegments(\n sentences: string[],\n boundaries: number[],\n): string[][] {\n const sorted = [...boundaries].sort((a, b) => a - b);\n 
const segments: string[][] = [];\n let start = 0;\n\n for (const b of sorted) {\n // Split after sentence at index b: segment is [start .. b]\n const splitPoint = b + 1;\n if (splitPoint > start && splitPoint <= sentences.length) {\n segments.push(sentences.slice(start, splitPoint));\n start = splitPoint;\n }\n }\n\n // Remaining sentences\n if (start < sentences.length) {\n segments.push(sentences.slice(start));\n }\n\n return segments;\n}\n\n/**\n * Merge short segments (below minTokens) with their neighbor.\n * Prefers merging forward; falls back to merging backward.\n */\nfunction mergeShortSegments(\n segments: string[][],\n minTokens: number,\n): string[][] {\n if (segments.length <= 1) return segments;\n\n const merged: string[][] = [];\n let buffer: string[] = [];\n\n for (let i = 0; i < segments.length; i++) {\n buffer = [...buffer, ...segments[i]];\n const tokenCount = estimateTokens(buffer.join(\" \"));\n\n if (tokenCount >= minTokens || i === segments.length - 1) {\n merged.push(buffer);\n buffer = [];\n }\n }\n\n // If the last merge left a dangling buffer, attach it to the last segment\n if (buffer.length > 0) {\n if (merged.length > 0) {\n merged[merged.length - 1] = [...merged[merged.length - 1], ...buffer];\n } else {\n merged.push(buffer);\n }\n }\n\n return merged;\n}\n\n/**\n * Split an oversized segment using recursive chunking.\n */\nfunction splitLongSegment(\n segment: string[],\n maxTokens: number,\n targetTokens: number,\n): SemanticChunk[] {\n const text = segment.join(\" \");\n // Cap targetTokens to maxTokens so recursive splitting never produces\n // segments larger than the configured maximum (Finding 2, PR #420).\n const cappedTarget = Math.min(targetTokens, maxTokens);\n const result: ChunkResult = chunkContent(text, {\n targetTokens: cappedTarget,\n minTokens: Math.min(cappedTarget, maxTokens),\n overlapSentences: 0,\n });\n\n return result.chunks.map((c) => ({\n content: c.content,\n index: c.index,\n tokenCount: c.tokenCount,\n 
boundaryScore: 0,\n }));\n}\n\n/**\n * Semantic chunking with smoothing-based topic boundary detection.\n *\n * @param content - Full text to chunk.\n * @param embedFn - Async function that embeds an array of texts.\n * @param config - Optional partial config overrides.\n * @returns SemanticChunkResult\n */\nexport async function semanticChunkContent(\n content: string,\n embedFn: EmbedFn,\n config?: Partial<SemanticChunkingConfig>,\n): Promise<SemanticChunkResult> {\n const cfg: SemanticChunkingConfig = {\n ...DEFAULT_SEMANTIC_CHUNKING_CONFIG,\n ...config,\n };\n\n // Guard against non-positive batch size which would cause an infinite loop\n const batchSize = Math.max(1, cfg.embeddingBatchSize);\n\n // --- Empty / trivially short input ---\n if (!content || content.trim().length === 0) {\n return {\n chunked: false,\n chunks: [],\n boundaries: [],\n method: \"semantic\",\n };\n }\n\n const sentences = splitSentences(content);\n\n if (sentences.length <= 1) {\n const tokenCount = estimateTokens(content);\n return {\n chunked: false,\n chunks: [\n {\n content: content.trim(),\n index: 0,\n tokenCount,\n boundaryScore: 1,\n },\n ],\n boundaries: [],\n method: \"semantic\",\n };\n }\n\n // If total tokens is short enough, return as single chunk\n const totalTokens = estimateTokens(content);\n if (totalTokens <= cfg.minTokens) {\n return {\n chunked: false,\n chunks: [\n {\n content: content.trim(),\n index: 0,\n tokenCount: totalTokens,\n boundaryScore: 1,\n },\n ],\n boundaries: [],\n method: \"semantic\",\n };\n }\n\n // --- Attempt embedding ---\n let embeddings: number[][];\n try {\n embeddings = await batchEmbed(sentences, embedFn, batchSize);\n } catch {\n // Embedding failed — fall back if configured\n if (cfg.fallbackToRecursive) {\n return buildRecursiveFallback(content, cfg);\n }\n throw new Error(\n \"Semantic chunking failed: embedding function threw and fallbackToRecursive is disabled\",\n );\n }\n\n if (embeddings.length !== sentences.length) {\n if 
(cfg.fallbackToRecursive) {\n return buildRecursiveFallback(content, cfg);\n }\n throw new Error(\n `Semantic chunking failed: expected ${sentences.length} embeddings but received ${embeddings.length}`,\n );\n }\n\n const dimensionMismatch = findEmbeddingDimensionMismatch(embeddings);\n if (dimensionMismatch) {\n if (cfg.fallbackToRecursive) {\n return buildRecursiveFallback(content, cfg);\n }\n throw new Error(\n `Semantic chunking failed: embedding vectors have mismatched dimensions ` +\n `(${dimensionMismatch.expected} vs ${dimensionMismatch.actual} at index ${dimensionMismatch.index})`,\n );\n }\n\n // --- Compute pairwise cosine similarity ---\n const similarities: number[] = [];\n for (let i = 0; i < sentences.length - 1; i++) {\n similarities.push(cosineSimilarity(embeddings[i], embeddings[i + 1]));\n }\n\n // If only one pair (2 sentences), nothing to smooth or split meaningfully.\n // However, if the combined content exceeds maxTokens, apply recursive splitting.\n if (similarities.length <= 1) {\n if (totalTokens > cfg.maxTokens) {\n return buildRecursiveFallback(content, cfg);\n }\n return {\n chunked: false,\n chunks: [\n {\n content: content.trim(),\n index: 0,\n tokenCount: totalTokens,\n boundaryScore: similarities.length === 1 ? 
similarities[0] : 1,\n },\n ],\n boundaries: [],\n method: \"semantic\",\n };\n }\n\n // --- Smooth the similarity series ---\n const smoothed = movingAverage(similarities, cfg.smoothingWindowSize);\n\n // --- Detect boundaries: local minima below (mean - k * stddev) ---\n const m = mean(smoothed);\n const s = stddev(smoothed);\n const threshold = m - cfg.boundaryThresholdStdDevs * s;\n const rawBoundaries = findLocalMinima(smoothed, threshold);\n\n // --- Build segments, merge short, split long ---\n let segments = buildSegments(sentences, rawBoundaries);\n segments = mergeShortSegments(segments, cfg.minTokens);\n\n // --- Convert segments to chunks, splitting oversized ones ---\n const chunks: SemanticChunk[] = [];\n const finalBoundaries: number[] = [];\n let sentenceOffset = 0;\n\n for (let segIdx = 0; segIdx < segments.length; segIdx++) {\n const segment = segments[segIdx];\n const segText = segment.join(\" \");\n const segTokens = estimateTokens(segText);\n\n if (segTokens > cfg.maxTokens) {\n // Recursive split for oversized segment\n const subChunks = splitLongSegment(segment, cfg.maxTokens, cfg.targetTokens);\n for (const sc of subChunks) {\n chunks.push({\n ...sc,\n index: chunks.length,\n });\n }\n } else {\n // Compute boundary score: the similarity at the trailing edge\n const trailingSentenceIdx = sentenceOffset + segment.length - 1;\n let bScore = 1;\n if (\n trailingSentenceIdx < similarities.length &&\n segIdx < segments.length - 1\n ) {\n bScore = smoothed[trailingSentenceIdx] ?? similarities[trailingSentenceIdx] ?? 
1;\n }\n\n chunks.push({\n content: segText,\n index: chunks.length,\n tokenCount: segTokens,\n boundaryScore: bScore,\n });\n }\n\n // Record boundaries (all but the last segment produce a boundary)\n if (segIdx < segments.length - 1) {\n finalBoundaries.push(sentenceOffset + segment.length - 1);\n }\n sentenceOffset += segment.length;\n }\n\n return {\n chunked: chunks.length > 1,\n chunks,\n boundaries: finalBoundaries,\n method: \"semantic\",\n };\n}\n\n// ---------------------------------------------------------------------------\n// Recursive fallback helper\n// ---------------------------------------------------------------------------\n\nfunction buildRecursiveFallback(\n content: string,\n cfg: SemanticChunkingConfig,\n): SemanticChunkResult {\n // Cap targetTokens to maxTokens so the recursive fallback path honours the\n // same constraint as splitLongSegment (PR #439 post-merge cursor[bot] finding).\n const cappedTarget = Math.min(cfg.targetTokens, cfg.maxTokens);\n const result: ChunkResult = chunkContent(content, {\n targetTokens: cappedTarget,\n minTokens: Math.min(cfg.minTokens, cappedTarget),\n overlapSentences: 0,\n });\n\n return {\n chunked: result.chunked,\n chunks: result.chunks.map((c) => ({\n ...c,\n boundaryScore: 0,\n })),\n boundaries: [],\n method: \"recursive-fallback\",\n 
};\n}\n"],"mappings":";;;;;AA+BO,IAAM,mCAA2D;AAAA,EACtE,cAAc;AAAA,EACd,WAAW;AAAA,EACX,WAAW;AAAA,EACX,qBAAqB;AAAA,EACrB,0BAA0B;AAAA,EAC1B,oBAAoB;AAAA,EACpB,qBAAqB;AACvB;AA0CO,SAAS,iBAAiB,GAAa,GAAqB;AACjE,MAAI,EAAE,WAAW,EAAE,QAAQ;AACzB,UAAM,IAAI;AAAA,MACR,6CAA6C,EAAE,MAAM,OAAO,EAAE,MAAM;AAAA,IACtE;AAAA,EACF;AACA,MAAI,EAAE,WAAW,EAAG,QAAO;AAE3B,MAAI,MAAM;AACV,MAAI,OAAO;AACX,MAAI,OAAO;AACX,WAAS,IAAI,GAAG,IAAI,EAAE,QAAQ,KAAK;AACjC,WAAO,EAAE,CAAC,IAAI,EAAE,CAAC;AACjB,YAAQ,EAAE,CAAC,IAAI,EAAE,CAAC;AAClB,YAAQ,EAAE,CAAC,IAAI,EAAE,CAAC;AAAA,EACpB;AAEA,QAAM,QAAQ,KAAK,KAAK,IAAI,IAAI,KAAK,KAAK,IAAI;AAC9C,MAAI,UAAU,EAAG,QAAO;AACxB,SAAO,MAAM;AACf;AAKO,SAAS,KAAK,QAA0B;AAC7C,MAAI,OAAO,WAAW,EAAG,QAAO;AAChC,MAAI,MAAM;AACV,aAAW,KAAK,OAAQ,QAAO;AAC/B,SAAO,MAAM,OAAO;AACtB;AAKO,SAAS,OAAO,QAA0B;AAC/C,MAAI,OAAO,WAAW,EAAG,QAAO;AAChC,QAAM,IAAI,KAAK,MAAM;AACrB,MAAI,QAAQ;AACZ,aAAW,KAAK,QAAQ;AACtB,UAAM,IAAI,IAAI;AACd,aAAS,IAAI;AAAA,EACf;AACA,SAAO,KAAK,KAAK,QAAQ,OAAO,MAAM;AACxC;AAUO,SAAS,cAAc,QAAkB,YAA8B;AAC5E,MAAI,OAAO,WAAW,EAAG,QAAO,CAAC;AACjC,MAAI,aAAa,EAAG,cAAa;AAEjC,MAAI,aAAa,MAAM,EAAG,cAAa,aAAa;AAEpD,QAAM,QAAQ,KAAK,MAAM,aAAa,CAAC;AACvC,QAAM,SAAmB,IAAI,MAAM,OAAO,MAAM;AAEhD,WAAS,IAAI,GAAG,IAAI,OAAO,QAAQ,KAAK;AACtC,UAAM,KAAK,KAAK,IAAI,GAAG,IAAI,KAAK;AAChC,UAAM,KAAK,KAAK,IAAI,OAAO,SAAS,GAAG,IAAI,KAAK;AAChD,QAAI,MAAM;AACV,aAAS,IAAI,IAAI,KAAK,IAAI,IAAK,QAAO,OAAO,CAAC;AAC9C,WAAO,CAAC,IAAI,OAAO,KAAK,KAAK;AAAA,EAC/B;AACA,SAAO;AACT;AAOO,SAAS,gBACd,QACA,WACU;AACV,MAAI,OAAO,UAAU,EAAG,QAAO,CAAC;AAEhC,QAAM,SAAmB,CAAC;AAC1B,WAAS,IAAI,GAAG,IAAI,OAAO,SAAS,GAAG,KAAK;AAC1C,QACE,OAAO,CAAC,IAAI,OAAO,IAAI,CAAC,KACxB,OAAO,CAAC,IAAI,OAAO,IAAI,CAAC,KACxB,OAAO,CAAC,IAAI,WACZ;AACA,aAAO,KAAK,CAAC;AAAA,IACf;AAAA,EACF;AACA,SAAO;AACT;AAUA,SAAS,eAAe,MAAwB;AAC9C,QAAM,YAAsB,CAAC;AAC7B,QAAM,gBAAgB;AAEtB,MAAI;AACJ,MAAI,YAAY;AAEhB,UAAQ,QAAQ,cAAc,KAAK,IAAI,OAAO,MAAM;AAClD,cAAU,KAAK,MAAM,CAAC,EAAE,KAAK,CAAC;AAC9B,gBAAY,cAAc;AAAA,EAC5B;AAEA,MAAI,YAAY,KAAK,QAAQ;AAC3B,UAAM,YAAY,KAAK,MAAM,SAAS,EAAE,KAAK;AAC7C,QAAI,WAAW;AACb,gBAAU,KAAK
,SAAS;AAAA,IAC1B;AAAA,EACF;AAEA,SAAO,UAAU,OAAO,CAAC,MAAM,EAAE,SAAS,CAAC;AAC7C;AAOA,SAAS,eAAe,MAAsB;AAC5C,SAAO,KAAK,KAAK,KAAK,SAAS,CAAC;AAClC;AAUA,eAAe,WACb,WACA,SACA,WACqB;AACrB,QAAM,gBAA4B,CAAC;AAEnC,WAAS,IAAI,GAAG,IAAI,UAAU,QAAQ,KAAK,WAAW;AACpD,UAAM,QAAQ,UAAU,MAAM,GAAG,IAAI,SAAS;AAC9C,UAAM,cAAc,MAAM,QAAQ,KAAK;AACvC,eAAW,OAAO,aAAa;AAC7B,oBAAc,KAAK,GAAG;AAAA,IACxB;AAAA,EACF;AAEA,SAAO;AACT;AAEA,SAAS,+BACP,YAC4D;AAC5D,MAAI,WAAW,UAAU,EAAG,QAAO;AACnC,QAAM,WAAW,WAAW,CAAC,EAAE;AAC/B,WAAS,IAAI,GAAG,IAAI,WAAW,QAAQ,KAAK;AAC1C,UAAM,SAAS,WAAW,CAAC,EAAE;AAC7B,QAAI,WAAW,UAAU;AACvB,aAAO,EAAE,UAAU,QAAQ,OAAO,EAAE;AAAA,IACtC;AAAA,EACF;AACA,SAAO;AACT;AAOA,SAAS,cACP,WACA,YACY;AACZ,QAAM,SAAS,CAAC,GAAG,UAAU,EAAE,KAAK,CAAC,GAAG,MAAM,IAAI,CAAC;AACnD,QAAM,WAAuB,CAAC;AAC9B,MAAI,QAAQ;AAEZ,aAAW,KAAK,QAAQ;AAEtB,UAAM,aAAa,IAAI;AACvB,QAAI,aAAa,SAAS,cAAc,UAAU,QAAQ;AACxD,eAAS,KAAK,UAAU,MAAM,OAAO,UAAU,CAAC;AAChD,cAAQ;AAAA,IACV;AAAA,EACF;AAGA,MAAI,QAAQ,UAAU,QAAQ;AAC5B,aAAS,KAAK,UAAU,MAAM,KAAK,CAAC;AAAA,EACtC;AAEA,SAAO;AACT;AAMA,SAAS,mBACP,UACA,WACY;AACZ,MAAI,SAAS,UAAU,EAAG,QAAO;AAEjC,QAAM,SAAqB,CAAC;AAC5B,MAAI,SAAmB,CAAC;AAExB,WAAS,IAAI,GAAG,IAAI,SAAS,QAAQ,KAAK;AACxC,aAAS,CAAC,GAAG,QAAQ,GAAG,SAAS,CAAC,CAAC;AACnC,UAAM,aAAa,eAAe,OAAO,KAAK,GAAG,CAAC;AAElD,QAAI,cAAc,aAAa,MAAM,SAAS,SAAS,GAAG;AACxD,aAAO,KAAK,MAAM;AAClB,eAAS,CAAC;AAAA,IACZ;AAAA,EACF;AAGA,MAAI,OAAO,SAAS,GAAG;AACrB,QAAI,OAAO,SAAS,GAAG;AACrB,aAAO,OAAO,SAAS,CAAC,IAAI,CAAC,GAAG,OAAO,OAAO,SAAS,CAAC,GAAG,GAAG,MAAM;AAAA,IACtE,OAAO;AACL,aAAO,KAAK,MAAM;AAAA,IACpB;AAAA,EACF;AAEA,SAAO;AACT;AAKA,SAAS,iBACP,SACA,WACA,cACiB;AACjB,QAAM,OAAO,QAAQ,KAAK,GAAG;AAG7B,QAAM,eAAe,KAAK,IAAI,cAAc,SAAS;AACrD,QAAM,SAAsB,aAAa,MAAM;AAAA,IAC7C,cAAc;AAAA,IACd,WAAW,KAAK,IAAI,cAAc,SAAS;AAAA,IAC3C,kBAAkB;AAAA,EACpB,CAAC;AAED,SAAO,OAAO,OAAO,IAAI,CAAC,OAAO;AAAA,IAC/B,SAAS,EAAE;AAAA,IACX,OAAO,EAAE;AAAA,IACT,YAAY,EAAE;AAAA,IACd,eAAe;AAAA,EACjB,EAAE;AACJ;AAUA,eAAsB,qBACpB,SACA,SACA,QAC8B;AAC9B,QAAM,MAA8B;AAAA,IAClC,GAAG;AAAA,IACH,GAAG;AAAA,EACL;AAGA,QAAM,YAAY,KAAK,IAAI,GAAG,IAAI,
kBAAkB;AAGpD,MAAI,CAAC,WAAW,QAAQ,KAAK,EAAE,WAAW,GAAG;AAC3C,WAAO;AAAA,MACL,SAAS;AAAA,MACT,QAAQ,CAAC;AAAA,MACT,YAAY,CAAC;AAAA,MACb,QAAQ;AAAA,IACV;AAAA,EACF;AAEA,QAAM,YAAY,eAAe,OAAO;AAExC,MAAI,UAAU,UAAU,GAAG;AACzB,UAAM,aAAa,eAAe,OAAO;AACzC,WAAO;AAAA,MACL,SAAS;AAAA,MACT,QAAQ;AAAA,QACN;AAAA,UACE,SAAS,QAAQ,KAAK;AAAA,UACtB,OAAO;AAAA,UACP;AAAA,UACA,eAAe;AAAA,QACjB;AAAA,MACF;AAAA,MACA,YAAY,CAAC;AAAA,MACb,QAAQ;AAAA,IACV;AAAA,EACF;AAGA,QAAM,cAAc,eAAe,OAAO;AAC1C,MAAI,eAAe,IAAI,WAAW;AAChC,WAAO;AAAA,MACL,SAAS;AAAA,MACT,QAAQ;AAAA,QACN;AAAA,UACE,SAAS,QAAQ,KAAK;AAAA,UACtB,OAAO;AAAA,UACP,YAAY;AAAA,UACZ,eAAe;AAAA,QACjB;AAAA,MACF;AAAA,MACA,YAAY,CAAC;AAAA,MACb,QAAQ;AAAA,IACV;AAAA,EACF;AAGA,MAAI;AACJ,MAAI;AACF,iBAAa,MAAM,WAAW,WAAW,SAAS,SAAS;AAAA,EAC7D,QAAQ;AAEN,QAAI,IAAI,qBAAqB;AAC3B,aAAO,uBAAuB,SAAS,GAAG;AAAA,IAC5C;AACA,UAAM,IAAI;AAAA,MACR;AAAA,IACF;AAAA,EACF;AAEA,MAAI,WAAW,WAAW,UAAU,QAAQ;AAC1C,QAAI,IAAI,qBAAqB;AAC3B,aAAO,uBAAuB,SAAS,GAAG;AAAA,IAC5C;AACA,UAAM,IAAI;AAAA,MACR,sCAAsC,UAAU,MAAM,4BAA4B,WAAW,MAAM;AAAA,IACrG;AAAA,EACF;AAEA,QAAM,oBAAoB,+BAA+B,UAAU;AACnE,MAAI,mBAAmB;AACrB,QAAI,IAAI,qBAAqB;AAC3B,aAAO,uBAAuB,SAAS,GAAG;AAAA,IAC5C;AACA,UAAM,IAAI;AAAA,MACR,2EACM,kBAAkB,QAAQ,OAAO,kBAAkB,MAAM,aAAa,kBAAkB,KAAK;AAAA,IACrG;AAAA,EACF;AAGA,QAAM,eAAyB,CAAC;AAChC,WAAS,IAAI,GAAG,IAAI,UAAU,SAAS,GAAG,KAAK;AAC7C,iBAAa,KAAK,iBAAiB,WAAW,CAAC,GAAG,WAAW,IAAI,CAAC,CAAC,CAAC;AAAA,EACtE;AAIA,MAAI,aAAa,UAAU,GAAG;AAC5B,QAAI,cAAc,IAAI,WAAW;AAC/B,aAAO,uBAAuB,SAAS,GAAG;AAAA,IAC5C;AACA,WAAO;AAAA,MACL,SAAS;AAAA,MACT,QAAQ;AAAA,QACN;AAAA,UACE,SAAS,QAAQ,KAAK;AAAA,UACtB,OAAO;AAAA,UACP,YAAY;AAAA,UACZ,eAAe,aAAa,WAAW,IAAI,aAAa,CAAC,IAAI;AAAA,QAC/D;AAAA,MACF;AAAA,MACA,YAAY,CAAC;AAAA,MACb,QAAQ;AAAA,IACV;AAAA,EACF;AAGA,QAAM,WAAW,cAAc,cAAc,IAAI,mBAAmB;AAGpE,QAAM,IAAI,KAAK,QAAQ;AACvB,QAAM,IAAI,OAAO,QAAQ;AACzB,QAAM,YAAY,IAAI,IAAI,2BAA2B;AACrD,QAAM,gBAAgB,gBAAgB,UAAU,SAAS;AAGzD,MAAI,WAAW,cAAc,WAAW,aAAa;AACrD,aAAW,mBAAmB,UAAU,IAAI,SAAS;AAGrD,QAAM,SAA0B,CAAC;AACjC,QAAM,kBAA4B,CAAC;AACnC,MAAI,iBAAiB;AAErB,WAAS,SAAS,GAAG,SAAS,S
AAS,QAAQ,UAAU;AACvD,UAAM,UAAU,SAAS,MAAM;AAC/B,UAAM,UAAU,QAAQ,KAAK,GAAG;AAChC,UAAM,YAAY,eAAe,OAAO;AAExC,QAAI,YAAY,IAAI,WAAW;AAE7B,YAAM,YAAY,iBAAiB,SAAS,IAAI,WAAW,IAAI,YAAY;AAC3E,iBAAW,MAAM,WAAW;AAC1B,eAAO,KAAK;AAAA,UACV,GAAG;AAAA,UACH,OAAO,OAAO;AAAA,QAChB,CAAC;AAAA,MACH;AAAA,IACF,OAAO;AAEL,YAAM,sBAAsB,iBAAiB,QAAQ,SAAS;AAC9D,UAAI,SAAS;AACb,UACE,sBAAsB,aAAa,UACnC,SAAS,SAAS,SAAS,GAC3B;AACA,iBAAS,SAAS,mBAAmB,KAAK,aAAa,mBAAmB,KAAK;AAAA,MACjF;AAEA,aAAO,KAAK;AAAA,QACV,SAAS;AAAA,QACT,OAAO,OAAO;AAAA,QACd,YAAY;AAAA,QACZ,eAAe;AAAA,MACjB,CAAC;AAAA,IACH;AAGA,QAAI,SAAS,SAAS,SAAS,GAAG;AAChC,sBAAgB,KAAK,iBAAiB,QAAQ,SAAS,CAAC;AAAA,IAC1D;AACA,sBAAkB,QAAQ;AAAA,EAC5B;AAEA,SAAO;AAAA,IACL,SAAS,OAAO,SAAS;AAAA,IACzB;AAAA,IACA,YAAY;AAAA,IACZ,QAAQ;AAAA,EACV;AACF;AAMA,SAAS,uBACP,SACA,KACqB;AAGrB,QAAM,eAAe,KAAK,IAAI,IAAI,cAAc,IAAI,SAAS;AAC7D,QAAM,SAAsB,aAAa,SAAS;AAAA,IAChD,cAAc;AAAA,IACd,WAAW,KAAK,IAAI,IAAI,WAAW,YAAY;AAAA,IAC/C,kBAAkB;AAAA,EACpB,CAAC;AAED,SAAO;AAAA,IACL,SAAS,OAAO;AAAA,IAChB,QAAQ,OAAO,OAAO,IAAI,CAAC,OAAO;AAAA,MAChC,GAAG;AAAA,MACH,eAAe;AAAA,IACjB,EAAE;AAAA,IACF,YAAY,CAAC;AAAA,IACb,QAAQ;AAAA,EACV;AACF;","names":[]}
|
package/dist/index.js
CHANGED
|
@@ -162,7 +162,7 @@ import {
|
|
|
162
162
|
import {
|
|
163
163
|
DEFAULT_SURPRISE_K,
|
|
164
164
|
computeSurprise
|
|
165
|
-
} from "./chunk-5JRF2PZA.js";
|
|
165
|
+
} from "./chunk-BF7ZRHH2.js";
|
|
166
166
|
import {
|
|
167
167
|
getMemoryForActiveMemory,
|
|
168
168
|
recallForActiveMemory
|
|
@@ -182,7 +182,7 @@ import {
|
|
|
182
182
|
saveTaxonomy,
|
|
183
183
|
validateSlug,
|
|
184
184
|
validateTaxonomy
|
|
185
|
-
} from "./chunk-7ZGA7YTS.js";
|
|
185
|
+
} from "./chunk-7WU3FML2.js";
|
|
186
186
|
import "./chunk-5RIRL3XL.js";
|
|
187
187
|
import {
|
|
188
188
|
migrateFromEngram,
|
|
@@ -442,7 +442,7 @@ import {
|
|
|
442
442
|
runConnectorPollOnce
|
|
443
443
|
} from "./chunk-OKTXM5H4.js";
|
|
444
444
|
import "./chunk-3UXOZBHV.js";
|
|
445
|
-
import "./chunk-KVE7R4CG.js";
|
|
445
|
+
import "./chunk-WCYKT2DE.js";
|
|
446
446
|
import "./chunk-4WMCPJWX.js";
|
|
447
447
|
import {
|
|
448
448
|
SmartBuffer
|
package/dist/orchestrator.js
CHANGED
|
@@ -26,7 +26,7 @@ import {
|
|
|
26
26
|
sanitizeSessionKeyForFilename,
|
|
27
27
|
shouldFilterLifecycleRecallCandidate,
|
|
28
28
|
summarizeGraphShadowComparison
|
|
29
|
-
} from "./chunk-7ZGA7YTS.js";
|
|
29
|
+
} from "./chunk-7WU3FML2.js";
|
|
30
30
|
import "./chunk-5RIRL3XL.js";
|
|
31
31
|
import "./chunk-KVEVLBKC.js";
|
|
32
32
|
import "./chunk-BFBF3XEF.js";
|
|
@@ -139,7 +139,7 @@ import "./chunk-3ONXXHQO.js";
|
|
|
139
139
|
import "./chunk-LMDRGRJ2.js";
|
|
140
140
|
import "./chunk-OKTXM5H4.js";
|
|
141
141
|
import "./chunk-3UXOZBHV.js";
|
|
142
|
-
import "./chunk-KVE7R4CG.js";
|
|
142
|
+
import "./chunk-WCYKT2DE.js";
|
|
143
143
|
import "./chunk-4WMCPJWX.js";
|
|
144
144
|
import "./chunk-EVZFIAPG.js";
|
|
145
145
|
import "./chunk-CSKLPDN6.js";
|
package/dist/schemas.d.ts
CHANGED
|
@@ -275,12 +275,12 @@ declare const EntityMentionSchema: z.ZodObject<{
|
|
|
275
275
|
title: z.ZodString;
|
|
276
276
|
facts: z.ZodArray<z.ZodString, "many">;
|
|
277
277
|
}, "strip", z.ZodTypeAny, {
|
|
278
|
-
title: string;
|
|
279
278
|
key: string;
|
|
279
|
+
title: string;
|
|
280
280
|
facts: string[];
|
|
281
281
|
}, {
|
|
282
|
-
title: string;
|
|
283
282
|
key: string;
|
|
283
|
+
title: string;
|
|
284
284
|
facts: string[];
|
|
285
285
|
}>, "many">>>;
|
|
286
286
|
}, "strip", z.ZodTypeAny, {
|
|
@@ -288,8 +288,8 @@ declare const EntityMentionSchema: z.ZodObject<{
|
|
|
288
288
|
name: string;
|
|
289
289
|
facts: string[];
|
|
290
290
|
structuredSections?: {
|
|
291
|
-
title: string;
|
|
292
291
|
key: string;
|
|
292
|
+
title: string;
|
|
293
293
|
facts: string[];
|
|
294
294
|
}[] | null | undefined;
|
|
295
295
|
promptedByQuestion?: string | null | undefined;
|
|
@@ -298,8 +298,8 @@ declare const EntityMentionSchema: z.ZodObject<{
|
|
|
298
298
|
name: string;
|
|
299
299
|
facts: string[];
|
|
300
300
|
structuredSections?: {
|
|
301
|
-
title: string;
|
|
302
301
|
key: string;
|
|
302
|
+
title: string;
|
|
303
303
|
facts: string[];
|
|
304
304
|
}[] | null | undefined;
|
|
305
305
|
promptedByQuestion?: string | null | undefined;
|
|
@@ -584,12 +584,12 @@ declare const ProactiveExtractionResultSchema: z.ZodObject<{
|
|
|
584
584
|
title: z.ZodString;
|
|
585
585
|
facts: z.ZodArray<z.ZodString, "many">;
|
|
586
586
|
}, "strip", z.ZodTypeAny, {
|
|
587
|
-
title: string;
|
|
588
587
|
key: string;
|
|
588
|
+
title: string;
|
|
589
589
|
facts: string[];
|
|
590
590
|
}, {
|
|
591
|
-
title: string;
|
|
592
591
|
key: string;
|
|
592
|
+
title: string;
|
|
593
593
|
facts: string[];
|
|
594
594
|
}>, "many">>>;
|
|
595
595
|
}, "strip", z.ZodTypeAny, {
|
|
@@ -597,8 +597,8 @@ declare const ProactiveExtractionResultSchema: z.ZodObject<{
|
|
|
597
597
|
name: string;
|
|
598
598
|
facts: string[];
|
|
599
599
|
structuredSections?: {
|
|
600
|
-
title: string;
|
|
601
600
|
key: string;
|
|
601
|
+
title: string;
|
|
602
602
|
facts: string[];
|
|
603
603
|
}[] | null | undefined;
|
|
604
604
|
promptedByQuestion?: string | null | undefined;
|
|
@@ -607,8 +607,8 @@ declare const ProactiveExtractionResultSchema: z.ZodObject<{
|
|
|
607
607
|
name: string;
|
|
608
608
|
facts: string[];
|
|
609
609
|
structuredSections?: {
|
|
610
|
-
title: string;
|
|
611
610
|
key: string;
|
|
611
|
+
title: string;
|
|
612
612
|
facts: string[];
|
|
613
613
|
}[] | null | undefined;
|
|
614
614
|
promptedByQuestion?: string | null | undefined;
|
|
@@ -665,8 +665,8 @@ declare const ProactiveExtractionResultSchema: z.ZodObject<{
|
|
|
665
665
|
name: string;
|
|
666
666
|
facts: string[];
|
|
667
667
|
structuredSections?: {
|
|
668
|
-
title: string;
|
|
669
668
|
key: string;
|
|
669
|
+
title: string;
|
|
670
670
|
facts: string[];
|
|
671
671
|
}[] | null | undefined;
|
|
672
672
|
promptedByQuestion?: string | null | undefined;
|
|
@@ -714,8 +714,8 @@ declare const ProactiveExtractionResultSchema: z.ZodObject<{
|
|
|
714
714
|
name: string;
|
|
715
715
|
facts: string[];
|
|
716
716
|
structuredSections?: {
|
|
717
|
-
title: string;
|
|
718
717
|
key: string;
|
|
718
|
+
title: string;
|
|
719
719
|
facts: string[];
|
|
720
720
|
}[] | null | undefined;
|
|
721
721
|
promptedByQuestion?: string | null | undefined;
|
|
@@ -952,12 +952,12 @@ declare const ExtractionResultSchema: z.ZodObject<{
|
|
|
952
952
|
title: z.ZodString;
|
|
953
953
|
facts: z.ZodArray<z.ZodString, "many">;
|
|
954
954
|
}, "strip", z.ZodTypeAny, {
|
|
955
|
-
title: string;
|
|
956
955
|
key: string;
|
|
956
|
+
title: string;
|
|
957
957
|
facts: string[];
|
|
958
958
|
}, {
|
|
959
|
-
title: string;
|
|
960
959
|
key: string;
|
|
960
|
+
title: string;
|
|
961
961
|
facts: string[];
|
|
962
962
|
}>, "many">>>;
|
|
963
963
|
}, "strip", z.ZodTypeAny, {
|
|
@@ -965,8 +965,8 @@ declare const ExtractionResultSchema: z.ZodObject<{
|
|
|
965
965
|
name: string;
|
|
966
966
|
facts: string[];
|
|
967
967
|
structuredSections?: {
|
|
968
|
-
title: string;
|
|
969
968
|
key: string;
|
|
969
|
+
title: string;
|
|
970
970
|
facts: string[];
|
|
971
971
|
}[] | null | undefined;
|
|
972
972
|
promptedByQuestion?: string | null | undefined;
|
|
@@ -975,8 +975,8 @@ declare const ExtractionResultSchema: z.ZodObject<{
|
|
|
975
975
|
name: string;
|
|
976
976
|
facts: string[];
|
|
977
977
|
structuredSections?: {
|
|
978
|
-
title: string;
|
|
979
978
|
key: string;
|
|
979
|
+
title: string;
|
|
980
980
|
facts: string[];
|
|
981
981
|
}[] | null | undefined;
|
|
982
982
|
promptedByQuestion?: string | null | undefined;
|
|
@@ -1047,8 +1047,8 @@ declare const ExtractionResultSchema: z.ZodObject<{
|
|
|
1047
1047
|
name: string;
|
|
1048
1048
|
facts: string[];
|
|
1049
1049
|
structuredSections?: {
|
|
1050
|
-
title: string;
|
|
1051
1050
|
key: string;
|
|
1051
|
+
title: string;
|
|
1052
1052
|
facts: string[];
|
|
1053
1053
|
}[] | null | undefined;
|
|
1054
1054
|
promptedByQuestion?: string | null | undefined;
|
|
@@ -1102,8 +1102,8 @@ declare const ExtractionResultSchema: z.ZodObject<{
|
|
|
1102
1102
|
name: string;
|
|
1103
1103
|
facts: string[];
|
|
1104
1104
|
structuredSections?: {
|
|
1105
|
-
title: string;
|
|
1106
1105
|
key: string;
|
|
1106
|
+
title: string;
|
|
1107
1107
|
facts: string[];
|
|
1108
1108
|
}[] | null | undefined;
|
|
1109
1109
|
promptedByQuestion?: string | null | undefined;
|
|
@@ -1172,12 +1172,12 @@ declare const ConsolidationResultSchema: z.ZodObject<{
|
|
|
1172
1172
|
title: z.ZodString;
|
|
1173
1173
|
facts: z.ZodArray<z.ZodString, "many">;
|
|
1174
1174
|
}, "strip", z.ZodTypeAny, {
|
|
1175
|
-
title: string;
|
|
1176
1175
|
key: string;
|
|
1176
|
+
title: string;
|
|
1177
1177
|
facts: string[];
|
|
1178
1178
|
}, {
|
|
1179
|
-
title: string;
|
|
1180
1179
|
key: string;
|
|
1180
|
+
title: string;
|
|
1181
1181
|
facts: string[];
|
|
1182
1182
|
}>, "many">>>;
|
|
1183
1183
|
}, "strip", z.ZodTypeAny, {
|
|
@@ -1185,8 +1185,8 @@ declare const ConsolidationResultSchema: z.ZodObject<{
|
|
|
1185
1185
|
name: string;
|
|
1186
1186
|
facts: string[];
|
|
1187
1187
|
structuredSections?: {
|
|
1188
|
-
title: string;
|
|
1189
1188
|
key: string;
|
|
1189
|
+
title: string;
|
|
1190
1190
|
facts: string[];
|
|
1191
1191
|
}[] | null | undefined;
|
|
1192
1192
|
promptedByQuestion?: string | null | undefined;
|
|
@@ -1195,8 +1195,8 @@ declare const ConsolidationResultSchema: z.ZodObject<{
|
|
|
1195
1195
|
name: string;
|
|
1196
1196
|
facts: string[];
|
|
1197
1197
|
structuredSections?: {
|
|
1198
|
-
title: string;
|
|
1199
1198
|
key: string;
|
|
1199
|
+
title: string;
|
|
1200
1200
|
facts: string[];
|
|
1201
1201
|
}[] | null | undefined;
|
|
1202
1202
|
promptedByQuestion?: string | null | undefined;
|
|
@@ -1215,8 +1215,8 @@ declare const ConsolidationResultSchema: z.ZodObject<{
|
|
|
1215
1215
|
name: string;
|
|
1216
1216
|
facts: string[];
|
|
1217
1217
|
structuredSections?: {
|
|
1218
|
-
title: string;
|
|
1219
1218
|
key: string;
|
|
1219
|
+
title: string;
|
|
1220
1220
|
facts: string[];
|
|
1221
1221
|
}[] | null | undefined;
|
|
1222
1222
|
promptedByQuestion?: string | null | undefined;
|
|
@@ -1235,8 +1235,8 @@ declare const ConsolidationResultSchema: z.ZodObject<{
|
|
|
1235
1235
|
name: string;
|
|
1236
1236
|
facts: string[];
|
|
1237
1237
|
structuredSections?: {
|
|
1238
|
-
title: string;
|
|
1239
1238
|
key: string;
|
|
1239
|
+
title: string;
|
|
1240
1240
|
facts: string[];
|
|
1241
1241
|
}[] | null | undefined;
|
|
1242
1242
|
promptedByQuestion?: string | null | undefined;
|
package/dist/transfer/types.d.ts
CHANGED
|
@@ -313,13 +313,13 @@ declare const CapsuleBlockSchema: z.ZodObject<{
|
|
|
313
313
|
peerProfiles: boolean;
|
|
314
314
|
}>;
|
|
315
315
|
}, "strip", z.ZodTypeAny, {
|
|
316
|
-
schemaVersion: string;
|
|
317
316
|
includes: {
|
|
318
317
|
procedural: boolean;
|
|
319
318
|
taxonomy: boolean;
|
|
320
319
|
identityAnchors: boolean;
|
|
321
320
|
peerProfiles: boolean;
|
|
322
321
|
};
|
|
322
|
+
schemaVersion: string;
|
|
323
323
|
id: string;
|
|
324
324
|
description: string;
|
|
325
325
|
version: string;
|
|
@@ -334,13 +334,13 @@ declare const CapsuleBlockSchema: z.ZodObject<{
|
|
|
334
334
|
directAnswerEnabled: boolean;
|
|
335
335
|
};
|
|
336
336
|
}, {
|
|
337
|
-
schemaVersion: string;
|
|
338
337
|
includes: {
|
|
339
338
|
procedural: boolean;
|
|
340
339
|
taxonomy: boolean;
|
|
341
340
|
identityAnchors: boolean;
|
|
342
341
|
peerProfiles: boolean;
|
|
343
342
|
};
|
|
343
|
+
schemaVersion: string;
|
|
344
344
|
id: string;
|
|
345
345
|
description: string;
|
|
346
346
|
version: string;
|
|
@@ -464,13 +464,13 @@ declare const ExportManifestV2Schema: z.ZodObject<{
|
|
|
464
464
|
peerProfiles: boolean;
|
|
465
465
|
}>;
|
|
466
466
|
}, "strip", z.ZodTypeAny, {
|
|
467
|
-
schemaVersion: string;
|
|
468
467
|
includes: {
|
|
469
468
|
procedural: boolean;
|
|
470
469
|
taxonomy: boolean;
|
|
471
470
|
identityAnchors: boolean;
|
|
472
471
|
peerProfiles: boolean;
|
|
473
472
|
};
|
|
473
|
+
schemaVersion: string;
|
|
474
474
|
id: string;
|
|
475
475
|
description: string;
|
|
476
476
|
version: string;
|
|
@@ -485,13 +485,13 @@ declare const ExportManifestV2Schema: z.ZodObject<{
|
|
|
485
485
|
directAnswerEnabled: boolean;
|
|
486
486
|
};
|
|
487
487
|
}, {
|
|
488
|
-
schemaVersion: string;
|
|
489
488
|
includes: {
|
|
490
489
|
procedural: boolean;
|
|
491
490
|
taxonomy: boolean;
|
|
492
491
|
identityAnchors: boolean;
|
|
493
492
|
peerProfiles: boolean;
|
|
494
493
|
};
|
|
494
|
+
schemaVersion: string;
|
|
495
495
|
id: string;
|
|
496
496
|
description: string;
|
|
497
497
|
version: string;
|
|
@@ -518,13 +518,13 @@ declare const ExportManifestV2Schema: z.ZodObject<{
|
|
|
518
518
|
pluginVersion: string;
|
|
519
519
|
includesTranscripts: boolean;
|
|
520
520
|
capsule: {
|
|
521
|
-
schemaVersion: string;
|
|
522
521
|
includes: {
|
|
523
522
|
procedural: boolean;
|
|
524
523
|
taxonomy: boolean;
|
|
525
524
|
identityAnchors: boolean;
|
|
526
525
|
peerProfiles: boolean;
|
|
527
526
|
};
|
|
527
|
+
schemaVersion: string;
|
|
528
528
|
id: string;
|
|
529
529
|
description: string;
|
|
530
530
|
version: string;
|
|
@@ -551,13 +551,13 @@ declare const ExportManifestV2Schema: z.ZodObject<{
|
|
|
551
551
|
pluginVersion: string;
|
|
552
552
|
includesTranscripts: boolean;
|
|
553
553
|
capsule: {
|
|
554
|
-
schemaVersion: string;
|
|
555
554
|
includes: {
|
|
556
555
|
procedural: boolean;
|
|
557
556
|
taxonomy: boolean;
|
|
558
557
|
identityAnchors: boolean;
|
|
559
558
|
peerProfiles: boolean;
|
|
560
559
|
};
|
|
560
|
+
schemaVersion: string;
|
|
561
561
|
id: string;
|
|
562
562
|
description: string;
|
|
563
563
|
version: string;
|
|
@@ -683,13 +683,13 @@ declare const ExportBundleV2Schema: z.ZodObject<{
|
|
|
683
683
|
peerProfiles: boolean;
|
|
684
684
|
}>;
|
|
685
685
|
}, "strip", z.ZodTypeAny, {
|
|
686
|
-
schemaVersion: string;
|
|
687
686
|
includes: {
|
|
688
687
|
procedural: boolean;
|
|
689
688
|
taxonomy: boolean;
|
|
690
689
|
identityAnchors: boolean;
|
|
691
690
|
peerProfiles: boolean;
|
|
692
691
|
};
|
|
692
|
+
schemaVersion: string;
|
|
693
693
|
id: string;
|
|
694
694
|
description: string;
|
|
695
695
|
version: string;
|
|
@@ -704,13 +704,13 @@ declare const ExportBundleV2Schema: z.ZodObject<{
|
|
|
704
704
|
directAnswerEnabled: boolean;
|
|
705
705
|
};
|
|
706
706
|
}, {
|
|
707
|
-
schemaVersion: string;
|
|
708
707
|
includes: {
|
|
709
708
|
procedural: boolean;
|
|
710
709
|
taxonomy: boolean;
|
|
711
710
|
identityAnchors: boolean;
|
|
712
711
|
peerProfiles: boolean;
|
|
713
712
|
};
|
|
713
|
+
schemaVersion: string;
|
|
714
714
|
id: string;
|
|
715
715
|
description: string;
|
|
716
716
|
version: string;
|
|
@@ -737,13 +737,13 @@ declare const ExportBundleV2Schema: z.ZodObject<{
|
|
|
737
737
|
pluginVersion: string;
|
|
738
738
|
includesTranscripts: boolean;
|
|
739
739
|
capsule: {
|
|
740
|
-
schemaVersion: string;
|
|
741
740
|
includes: {
|
|
742
741
|
procedural: boolean;
|
|
743
742
|
taxonomy: boolean;
|
|
744
743
|
identityAnchors: boolean;
|
|
745
744
|
peerProfiles: boolean;
|
|
746
745
|
};
|
|
746
|
+
schemaVersion: string;
|
|
747
747
|
id: string;
|
|
748
748
|
description: string;
|
|
749
749
|
version: string;
|
|
@@ -770,13 +770,13 @@ declare const ExportBundleV2Schema: z.ZodObject<{
|
|
|
770
770
|
pluginVersion: string;
|
|
771
771
|
includesTranscripts: boolean;
|
|
772
772
|
capsule: {
|
|
773
|
-
schemaVersion: string;
|
|
774
773
|
includes: {
|
|
775
774
|
procedural: boolean;
|
|
776
775
|
taxonomy: boolean;
|
|
777
776
|
identityAnchors: boolean;
|
|
778
777
|
peerProfiles: boolean;
|
|
779
778
|
};
|
|
779
|
+
schemaVersion: string;
|
|
780
780
|
id: string;
|
|
781
781
|
description: string;
|
|
782
782
|
version: string;
|
|
@@ -815,13 +815,13 @@ declare const ExportBundleV2Schema: z.ZodObject<{
|
|
|
815
815
|
pluginVersion: string;
|
|
816
816
|
includesTranscripts: boolean;
|
|
817
817
|
capsule: {
|
|
818
|
-
schemaVersion: string;
|
|
819
818
|
includes: {
|
|
820
819
|
procedural: boolean;
|
|
821
820
|
taxonomy: boolean;
|
|
822
821
|
identityAnchors: boolean;
|
|
823
822
|
peerProfiles: boolean;
|
|
824
823
|
};
|
|
824
|
+
schemaVersion: string;
|
|
825
825
|
id: string;
|
|
826
826
|
description: string;
|
|
827
827
|
version: string;
|
|
@@ -854,13 +854,13 @@ declare const ExportBundleV2Schema: z.ZodObject<{
|
|
|
854
854
|
pluginVersion: string;
|
|
855
855
|
includesTranscripts: boolean;
|
|
856
856
|
capsule: {
|
|
857
|
-
schemaVersion: string;
|
|
858
857
|
includes: {
|
|
859
858
|
procedural: boolean;
|
|
860
859
|
taxonomy: boolean;
|
|
861
860
|
identityAnchors: boolean;
|
|
862
861
|
peerProfiles: boolean;
|
|
863
862
|
};
|
|
863
|
+
schemaVersion: string;
|
|
864
864
|
id: string;
|
|
865
865
|
description: string;
|
|
866
866
|
version: string;
|
package/package.json
CHANGED
package/src/semantic-chunking.ts
CHANGED
|
@@ -241,6 +241,20 @@ async function batchEmbed(
|
|
|
241
241
|
return allEmbeddings;
|
|
242
242
|
}
|
|
243
243
|
|
|
244
|
+
function findEmbeddingDimensionMismatch(
|
|
245
|
+
embeddings: number[][],
|
|
246
|
+
): { expected: number; actual: number; index: number } | null {
|
|
247
|
+
if (embeddings.length <= 1) return null;
|
|
248
|
+
const expected = embeddings[0].length;
|
|
249
|
+
for (let i = 1; i < embeddings.length; i++) {
|
|
250
|
+
const actual = embeddings[i].length;
|
|
251
|
+
if (actual !== expected) {
|
|
252
|
+
return { expected, actual, index: i };
|
|
253
|
+
}
|
|
254
|
+
}
|
|
255
|
+
return null;
|
|
256
|
+
}
|
|
257
|
+
|
|
244
258
|
/**
|
|
245
259
|
* Build segments from boundary indices.
|
|
246
260
|
* boundaries are sentence indices at which splits occur (i.e., the split
|
|
@@ -423,6 +437,17 @@ export async function semanticChunkContent(
|
|
|
423
437
|
);
|
|
424
438
|
}
|
|
425
439
|
|
|
440
|
+
const dimensionMismatch = findEmbeddingDimensionMismatch(embeddings);
|
|
441
|
+
if (dimensionMismatch) {
|
|
442
|
+
if (cfg.fallbackToRecursive) {
|
|
443
|
+
return buildRecursiveFallback(content, cfg);
|
|
444
|
+
}
|
|
445
|
+
throw new Error(
|
|
446
|
+
`Semantic chunking failed: embedding vectors have mismatched dimensions ` +
|
|
447
|
+
`(${dimensionMismatch.expected} vs ${dimensionMismatch.actual} at index ${dimensionMismatch.index})`,
|
|
448
|
+
);
|
|
449
|
+
}
|
|
450
|
+
|
|
426
451
|
// --- Compute pairwise cosine similarity ---
|
|
427
452
|
const similarities: number[] = [];
|
|
428
453
|
for (let i = 0; i < sentences.length - 1; i++) {
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
{"version":3,"sources":["../src/semantic-chunking.ts"],"sourcesContent":["/**\n * Semantic Chunking with Smoothing-Based Topic Boundaries (Issue #368)\n *\n * An optional alternative to the recursive chunker in chunking.ts.\n * Uses sentence embeddings + cosine similarity + smoothing to detect\n * natural topic boundaries, producing more coherent chunks.\n */\n\nimport { chunkContent, type Chunk, type ChunkResult } from \"./chunking.js\";\n\n// ---------------------------------------------------------------------------\n// Configuration\n// ---------------------------------------------------------------------------\n\nexport interface SemanticChunkingConfig {\n /** Target tokens per chunk. Default: 200. */\n targetTokens: number;\n /** Minimum tokens for a segment before merging with neighbor. Default: 100. */\n minTokens: number;\n /** Maximum tokens for a segment before recursive splitting. Default: 400. */\n maxTokens: number;\n /** Window size for the moving-average smoothing filter. Default: 3. */\n smoothingWindowSize: number;\n /** How many standard deviations below the mean constitutes a boundary. Default: 1.0. */\n boundaryThresholdStdDevs: number;\n /** Batch size for embedding requests. Default: 32. */\n embeddingBatchSize: number;\n /** Fall back to recursive chunking when embeddings are unavailable. Default: true. */\n fallbackToRecursive: boolean;\n}\n\nexport const DEFAULT_SEMANTIC_CHUNKING_CONFIG: SemanticChunkingConfig = {\n targetTokens: 200,\n minTokens: 100,\n maxTokens: 400,\n smoothingWindowSize: 3,\n boundaryThresholdStdDevs: 1.0,\n embeddingBatchSize: 32,\n fallbackToRecursive: true,\n};\n\n// ---------------------------------------------------------------------------\n// Result types\n// ---------------------------------------------------------------------------\n\nexport interface SemanticChunk extends Chunk {\n /** Optional topic hint derived from position. 
*/\n topicLabel?: string;\n /** Cosine similarity score at the trailing boundary of this chunk. */\n boundaryScore: number;\n}\n\nexport interface SemanticChunkResult {\n /** Whether content was split into multiple chunks. */\n chunked: boolean;\n /** The chunks produced. */\n chunks: SemanticChunk[];\n /** Sentence indices where topic splits occurred. */\n boundaries: number[];\n /** Which algorithm produced the result. */\n method: \"semantic\" | \"recursive-fallback\";\n}\n\n// ---------------------------------------------------------------------------\n// Embedding function signature\n// ---------------------------------------------------------------------------\n\n/** Caller-provided function that embeds an array of texts, returning vectors. */\nexport type EmbedFn = (texts: string[]) => Promise<number[][]>;\n\n// ---------------------------------------------------------------------------\n// Math utilities (exported for testing)\n// ---------------------------------------------------------------------------\n\n/**\n * Cosine similarity between two vectors.\n * Returns a value in [-1, 1]. 
Identical direction = 1, orthogonal = 0.\n *\n * NOTE: This duplicates cosineSimilarity in recall-mmr.ts and embedding-fallback.ts.\n * Consider extracting to a shared math utility in a future refactor.\n */\nexport function cosineSimilarity(a: number[], b: number[]): number {\n if (a.length !== b.length) {\n throw new Error(\n `cosineSimilarity: vector length mismatch (${a.length} vs ${b.length})`,\n );\n }\n if (a.length === 0) return 0;\n\n let dot = 0;\n let magA = 0;\n let magB = 0;\n for (let i = 0; i < a.length; i++) {\n dot += a[i] * b[i];\n magA += a[i] * a[i];\n magB += b[i] * b[i];\n }\n\n const denom = Math.sqrt(magA) * Math.sqrt(magB);\n if (denom === 0) return 0;\n return dot / denom;\n}\n\n/**\n * Arithmetic mean of a numeric series.\n */\nexport function mean(series: number[]): number {\n if (series.length === 0) return 0;\n let sum = 0;\n for (const v of series) sum += v;\n return sum / series.length;\n}\n\n/**\n * Population standard deviation of a numeric series.\n */\nexport function stddev(series: number[]): number {\n if (series.length === 0) return 0;\n const m = mean(series);\n let sumSq = 0;\n for (const v of series) {\n const d = v - m;\n sumSq += d * d;\n }\n return Math.sqrt(sumSq / series.length);\n}\n\n/**\n * Simple moving average over a 1D series.\n * The window is centered: for window size W, each output[i] averages\n * series[i - floor(W/2) .. 
i + floor(W/2)], clamped to bounds.\n *\n * Even window sizes are rounded up to the next odd value so the window\n * is symmetric around the center point (Finding 4, PR #420).\n */\nexport function movingAverage(series: number[], windowSize: number): number[] {\n if (series.length === 0) return [];\n if (windowSize < 1) windowSize = 1;\n // Round even values up to the next odd so the window is symmetric.\n if (windowSize % 2 === 0) windowSize = windowSize + 1;\n\n const halfW = Math.floor(windowSize / 2);\n const result: number[] = new Array(series.length);\n\n for (let i = 0; i < series.length; i++) {\n const lo = Math.max(0, i - halfW);\n const hi = Math.min(series.length - 1, i + halfW);\n let sum = 0;\n for (let j = lo; j <= hi; j++) sum += series[j];\n result[i] = sum / (hi - lo + 1);\n }\n return result;\n}\n\n/**\n * Find indices in the series that are local minima AND below the threshold.\n * A local minimum is a point lower than both its immediate neighbors\n * (or lower-or-equal at series boundaries).\n */\nexport function findLocalMinima(\n series: number[],\n threshold: number,\n): number[] {\n if (series.length <= 2) return [];\n\n const minima: number[] = [];\n for (let i = 1; i < series.length - 1; i++) {\n if (\n series[i] < series[i - 1] &&\n series[i] < series[i + 1] &&\n series[i] < threshold\n ) {\n minima.push(i);\n }\n }\n return minima;\n}\n\n// ---------------------------------------------------------------------------\n// Sentence tokenizer\n// ---------------------------------------------------------------------------\n\n/**\n * Split text into sentences at punctuation boundaries.\n * Preserves punctuation with the preceding sentence.\n */\nfunction splitSentences(text: string): string[] {\n const sentences: string[] = [];\n const sentenceRegex = /[^.!?]*[.!?]+(?:\\s+|$)/g;\n\n let match: RegExpExecArray | null;\n let lastIndex = 0;\n\n while ((match = sentenceRegex.exec(text)) !== null) {\n sentences.push(match[0].trim());\n lastIndex = 
sentenceRegex.lastIndex;\n }\n\n if (lastIndex < text.length) {\n const remaining = text.slice(lastIndex).trim();\n if (remaining) {\n sentences.push(remaining);\n }\n }\n\n return sentences.filter((s) => s.length > 0);\n}\n\n// ---------------------------------------------------------------------------\n// Token estimation\n// ---------------------------------------------------------------------------\n\n/** Rough token estimate: ~4 chars per token for English. */\nfunction estimateTokens(text: string): number {\n return Math.ceil(text.length / 4);\n}\n\n// ---------------------------------------------------------------------------\n// Core semantic chunking\n// ---------------------------------------------------------------------------\n\n/**\n * Batch-embed sentences using the provided embed function.\n * Respects the configured batch size.\n */\nasync function batchEmbed(\n sentences: string[],\n embedFn: EmbedFn,\n batchSize: number,\n): Promise<number[][]> {\n const allEmbeddings: number[][] = [];\n\n for (let i = 0; i < sentences.length; i += batchSize) {\n const batch = sentences.slice(i, i + batchSize);\n const batchResult = await embedFn(batch);\n for (const vec of batchResult) {\n allEmbeddings.push(vec);\n }\n }\n\n return allEmbeddings;\n}\n\n/**\n * Build segments from boundary indices.\n * boundaries are sentence indices at which splits occur (i.e., the split\n * happens AFTER the boundary index sentence).\n */\nfunction buildSegments(\n sentences: string[],\n boundaries: number[],\n): string[][] {\n const sorted = [...boundaries].sort((a, b) => a - b);\n const segments: string[][] = [];\n let start = 0;\n\n for (const b of sorted) {\n // Split after sentence at index b: segment is [start .. 
b]\n const splitPoint = b + 1;\n if (splitPoint > start && splitPoint <= sentences.length) {\n segments.push(sentences.slice(start, splitPoint));\n start = splitPoint;\n }\n }\n\n // Remaining sentences\n if (start < sentences.length) {\n segments.push(sentences.slice(start));\n }\n\n return segments;\n}\n\n/**\n * Merge short segments (below minTokens) with their neighbor.\n * Prefers merging forward; falls back to merging backward.\n */\nfunction mergeShortSegments(\n segments: string[][],\n minTokens: number,\n): string[][] {\n if (segments.length <= 1) return segments;\n\n const merged: string[][] = [];\n let buffer: string[] = [];\n\n for (let i = 0; i < segments.length; i++) {\n buffer = [...buffer, ...segments[i]];\n const tokenCount = estimateTokens(buffer.join(\" \"));\n\n if (tokenCount >= minTokens || i === segments.length - 1) {\n merged.push(buffer);\n buffer = [];\n }\n }\n\n // If the last merge left a dangling buffer, attach it to the last segment\n if (buffer.length > 0) {\n if (merged.length > 0) {\n merged[merged.length - 1] = [...merged[merged.length - 1], ...buffer];\n } else {\n merged.push(buffer);\n }\n }\n\n return merged;\n}\n\n/**\n * Split an oversized segment using recursive chunking.\n */\nfunction splitLongSegment(\n segment: string[],\n maxTokens: number,\n targetTokens: number,\n): SemanticChunk[] {\n const text = segment.join(\" \");\n // Cap targetTokens to maxTokens so recursive splitting never produces\n // segments larger than the configured maximum (Finding 2, PR #420).\n const cappedTarget = Math.min(targetTokens, maxTokens);\n const result: ChunkResult = chunkContent(text, {\n targetTokens: cappedTarget,\n minTokens: Math.min(cappedTarget, maxTokens),\n overlapSentences: 0,\n });\n\n return result.chunks.map((c) => ({\n content: c.content,\n index: c.index,\n tokenCount: c.tokenCount,\n boundaryScore: 0,\n }));\n}\n\n/**\n * Semantic chunking with smoothing-based topic boundary detection.\n *\n * @param content - Full text 
to chunk.\n * @param embedFn - Async function that embeds an array of texts.\n * @param config - Optional partial config overrides.\n * @returns SemanticChunkResult\n */\nexport async function semanticChunkContent(\n content: string,\n embedFn: EmbedFn,\n config?: Partial<SemanticChunkingConfig>,\n): Promise<SemanticChunkResult> {\n const cfg: SemanticChunkingConfig = {\n ...DEFAULT_SEMANTIC_CHUNKING_CONFIG,\n ...config,\n };\n\n // Guard against non-positive batch size which would cause an infinite loop\n const batchSize = Math.max(1, cfg.embeddingBatchSize);\n\n // --- Empty / trivially short input ---\n if (!content || content.trim().length === 0) {\n return {\n chunked: false,\n chunks: [],\n boundaries: [],\n method: \"semantic\",\n };\n }\n\n const sentences = splitSentences(content);\n\n if (sentences.length <= 1) {\n const tokenCount = estimateTokens(content);\n return {\n chunked: false,\n chunks: [\n {\n content: content.trim(),\n index: 0,\n tokenCount,\n boundaryScore: 1,\n },\n ],\n boundaries: [],\n method: \"semantic\",\n };\n }\n\n // If total tokens is short enough, return as single chunk\n const totalTokens = estimateTokens(content);\n if (totalTokens <= cfg.minTokens) {\n return {\n chunked: false,\n chunks: [\n {\n content: content.trim(),\n index: 0,\n tokenCount: totalTokens,\n boundaryScore: 1,\n },\n ],\n boundaries: [],\n method: \"semantic\",\n };\n }\n\n // --- Attempt embedding ---\n let embeddings: number[][];\n try {\n embeddings = await batchEmbed(sentences, embedFn, batchSize);\n } catch {\n // Embedding failed — fall back if configured\n if (cfg.fallbackToRecursive) {\n return buildRecursiveFallback(content, cfg);\n }\n throw new Error(\n \"Semantic chunking failed: embedding function threw and fallbackToRecursive is disabled\",\n );\n }\n\n if (embeddings.length !== sentences.length) {\n if (cfg.fallbackToRecursive) {\n return buildRecursiveFallback(content, cfg);\n }\n throw new Error(\n `Semantic chunking failed: expected 
${sentences.length} embeddings but received ${embeddings.length}`,\n );\n }\n\n // --- Compute pairwise cosine similarity ---\n const similarities: number[] = [];\n for (let i = 0; i < sentences.length - 1; i++) {\n similarities.push(cosineSimilarity(embeddings[i], embeddings[i + 1]));\n }\n\n // If only one pair (2 sentences), nothing to smooth or split meaningfully.\n // However, if the combined content exceeds maxTokens, apply recursive splitting.\n if (similarities.length <= 1) {\n if (totalTokens > cfg.maxTokens) {\n return buildRecursiveFallback(content, cfg);\n }\n return {\n chunked: false,\n chunks: [\n {\n content: content.trim(),\n index: 0,\n tokenCount: totalTokens,\n boundaryScore: similarities.length === 1 ? similarities[0] : 1,\n },\n ],\n boundaries: [],\n method: \"semantic\",\n };\n }\n\n // --- Smooth the similarity series ---\n const smoothed = movingAverage(similarities, cfg.smoothingWindowSize);\n\n // --- Detect boundaries: local minima below (mean - k * stddev) ---\n const m = mean(smoothed);\n const s = stddev(smoothed);\n const threshold = m - cfg.boundaryThresholdStdDevs * s;\n const rawBoundaries = findLocalMinima(smoothed, threshold);\n\n // --- Build segments, merge short, split long ---\n let segments = buildSegments(sentences, rawBoundaries);\n segments = mergeShortSegments(segments, cfg.minTokens);\n\n // --- Convert segments to chunks, splitting oversized ones ---\n const chunks: SemanticChunk[] = [];\n const finalBoundaries: number[] = [];\n let sentenceOffset = 0;\n\n for (let segIdx = 0; segIdx < segments.length; segIdx++) {\n const segment = segments[segIdx];\n const segText = segment.join(\" \");\n const segTokens = estimateTokens(segText);\n\n if (segTokens > cfg.maxTokens) {\n // Recursive split for oversized segment\n const subChunks = splitLongSegment(segment, cfg.maxTokens, cfg.targetTokens);\n for (const sc of subChunks) {\n chunks.push({\n ...sc,\n index: chunks.length,\n });\n }\n } else {\n // Compute boundary score: 
the similarity at the trailing edge\n const trailingSentenceIdx = sentenceOffset + segment.length - 1;\n let bScore = 1;\n if (\n trailingSentenceIdx < similarities.length &&\n segIdx < segments.length - 1\n ) {\n bScore = smoothed[trailingSentenceIdx] ?? similarities[trailingSentenceIdx] ?? 1;\n }\n\n chunks.push({\n content: segText,\n index: chunks.length,\n tokenCount: segTokens,\n boundaryScore: bScore,\n });\n }\n\n // Record boundaries (all but the last segment produce a boundary)\n if (segIdx < segments.length - 1) {\n finalBoundaries.push(sentenceOffset + segment.length - 1);\n }\n sentenceOffset += segment.length;\n }\n\n return {\n chunked: chunks.length > 1,\n chunks,\n boundaries: finalBoundaries,\n method: \"semantic\",\n };\n}\n\n// ---------------------------------------------------------------------------\n// Recursive fallback helper\n// ---------------------------------------------------------------------------\n\nfunction buildRecursiveFallback(\n content: string,\n cfg: SemanticChunkingConfig,\n): SemanticChunkResult {\n // Cap targetTokens to maxTokens so the recursive fallback path honours the\n // same constraint as splitLongSegment (PR #439 post-merge cursor[bot] finding).\n const cappedTarget = Math.min(cfg.targetTokens, cfg.maxTokens);\n const result: ChunkResult = chunkContent(content, {\n targetTokens: cappedTarget,\n minTokens: Math.min(cfg.minTokens, cappedTarget),\n overlapSentences: 0,\n });\n\n return {\n chunked: result.chunked,\n chunks: result.chunks.map((c) => ({\n ...c,\n boundaryScore: 0,\n })),\n boundaries: [],\n method: \"recursive-fallback\",\n 
};\n}\n"],"mappings":";;;;;AA+BO,IAAM,mCAA2D;AAAA,EACtE,cAAc;AAAA,EACd,WAAW;AAAA,EACX,WAAW;AAAA,EACX,qBAAqB;AAAA,EACrB,0BAA0B;AAAA,EAC1B,oBAAoB;AAAA,EACpB,qBAAqB;AACvB;AA0CO,SAAS,iBAAiB,GAAa,GAAqB;AACjE,MAAI,EAAE,WAAW,EAAE,QAAQ;AACzB,UAAM,IAAI;AAAA,MACR,6CAA6C,EAAE,MAAM,OAAO,EAAE,MAAM;AAAA,IACtE;AAAA,EACF;AACA,MAAI,EAAE,WAAW,EAAG,QAAO;AAE3B,MAAI,MAAM;AACV,MAAI,OAAO;AACX,MAAI,OAAO;AACX,WAAS,IAAI,GAAG,IAAI,EAAE,QAAQ,KAAK;AACjC,WAAO,EAAE,CAAC,IAAI,EAAE,CAAC;AACjB,YAAQ,EAAE,CAAC,IAAI,EAAE,CAAC;AAClB,YAAQ,EAAE,CAAC,IAAI,EAAE,CAAC;AAAA,EACpB;AAEA,QAAM,QAAQ,KAAK,KAAK,IAAI,IAAI,KAAK,KAAK,IAAI;AAC9C,MAAI,UAAU,EAAG,QAAO;AACxB,SAAO,MAAM;AACf;AAKO,SAAS,KAAK,QAA0B;AAC7C,MAAI,OAAO,WAAW,EAAG,QAAO;AAChC,MAAI,MAAM;AACV,aAAW,KAAK,OAAQ,QAAO;AAC/B,SAAO,MAAM,OAAO;AACtB;AAKO,SAAS,OAAO,QAA0B;AAC/C,MAAI,OAAO,WAAW,EAAG,QAAO;AAChC,QAAM,IAAI,KAAK,MAAM;AACrB,MAAI,QAAQ;AACZ,aAAW,KAAK,QAAQ;AACtB,UAAM,IAAI,IAAI;AACd,aAAS,IAAI;AAAA,EACf;AACA,SAAO,KAAK,KAAK,QAAQ,OAAO,MAAM;AACxC;AAUO,SAAS,cAAc,QAAkB,YAA8B;AAC5E,MAAI,OAAO,WAAW,EAAG,QAAO,CAAC;AACjC,MAAI,aAAa,EAAG,cAAa;AAEjC,MAAI,aAAa,MAAM,EAAG,cAAa,aAAa;AAEpD,QAAM,QAAQ,KAAK,MAAM,aAAa,CAAC;AACvC,QAAM,SAAmB,IAAI,MAAM,OAAO,MAAM;AAEhD,WAAS,IAAI,GAAG,IAAI,OAAO,QAAQ,KAAK;AACtC,UAAM,KAAK,KAAK,IAAI,GAAG,IAAI,KAAK;AAChC,UAAM,KAAK,KAAK,IAAI,OAAO,SAAS,GAAG,IAAI,KAAK;AAChD,QAAI,MAAM;AACV,aAAS,IAAI,IAAI,KAAK,IAAI,IAAK,QAAO,OAAO,CAAC;AAC9C,WAAO,CAAC,IAAI,OAAO,KAAK,KAAK;AAAA,EAC/B;AACA,SAAO;AACT;AAOO,SAAS,gBACd,QACA,WACU;AACV,MAAI,OAAO,UAAU,EAAG,QAAO,CAAC;AAEhC,QAAM,SAAmB,CAAC;AAC1B,WAAS,IAAI,GAAG,IAAI,OAAO,SAAS,GAAG,KAAK;AAC1C,QACE,OAAO,CAAC,IAAI,OAAO,IAAI,CAAC,KACxB,OAAO,CAAC,IAAI,OAAO,IAAI,CAAC,KACxB,OAAO,CAAC,IAAI,WACZ;AACA,aAAO,KAAK,CAAC;AAAA,IACf;AAAA,EACF;AACA,SAAO;AACT;AAUA,SAAS,eAAe,MAAwB;AAC9C,QAAM,YAAsB,CAAC;AAC7B,QAAM,gBAAgB;AAEtB,MAAI;AACJ,MAAI,YAAY;AAEhB,UAAQ,QAAQ,cAAc,KAAK,IAAI,OAAO,MAAM;AAClD,cAAU,KAAK,MAAM,CAAC,EAAE,KAAK,CAAC;AAC9B,gBAAY,cAAc;AAAA,EAC5B;AAEA,MAAI,YAAY,KAAK,QAAQ;AAC3B,UAAM,YAAY,KAAK,MAAM,SAAS,EAAE,KAAK;AAC7C,QAAI,WAAW;AACb,gBAAU,KAAK
,SAAS;AAAA,IAC1B;AAAA,EACF;AAEA,SAAO,UAAU,OAAO,CAAC,MAAM,EAAE,SAAS,CAAC;AAC7C;AAOA,SAAS,eAAe,MAAsB;AAC5C,SAAO,KAAK,KAAK,KAAK,SAAS,CAAC;AAClC;AAUA,eAAe,WACb,WACA,SACA,WACqB;AACrB,QAAM,gBAA4B,CAAC;AAEnC,WAAS,IAAI,GAAG,IAAI,UAAU,QAAQ,KAAK,WAAW;AACpD,UAAM,QAAQ,UAAU,MAAM,GAAG,IAAI,SAAS;AAC9C,UAAM,cAAc,MAAM,QAAQ,KAAK;AACvC,eAAW,OAAO,aAAa;AAC7B,oBAAc,KAAK,GAAG;AAAA,IACxB;AAAA,EACF;AAEA,SAAO;AACT;AAOA,SAAS,cACP,WACA,YACY;AACZ,QAAM,SAAS,CAAC,GAAG,UAAU,EAAE,KAAK,CAAC,GAAG,MAAM,IAAI,CAAC;AACnD,QAAM,WAAuB,CAAC;AAC9B,MAAI,QAAQ;AAEZ,aAAW,KAAK,QAAQ;AAEtB,UAAM,aAAa,IAAI;AACvB,QAAI,aAAa,SAAS,cAAc,UAAU,QAAQ;AACxD,eAAS,KAAK,UAAU,MAAM,OAAO,UAAU,CAAC;AAChD,cAAQ;AAAA,IACV;AAAA,EACF;AAGA,MAAI,QAAQ,UAAU,QAAQ;AAC5B,aAAS,KAAK,UAAU,MAAM,KAAK,CAAC;AAAA,EACtC;AAEA,SAAO;AACT;AAMA,SAAS,mBACP,UACA,WACY;AACZ,MAAI,SAAS,UAAU,EAAG,QAAO;AAEjC,QAAM,SAAqB,CAAC;AAC5B,MAAI,SAAmB,CAAC;AAExB,WAAS,IAAI,GAAG,IAAI,SAAS,QAAQ,KAAK;AACxC,aAAS,CAAC,GAAG,QAAQ,GAAG,SAAS,CAAC,CAAC;AACnC,UAAM,aAAa,eAAe,OAAO,KAAK,GAAG,CAAC;AAElD,QAAI,cAAc,aAAa,MAAM,SAAS,SAAS,GAAG;AACxD,aAAO,KAAK,MAAM;AAClB,eAAS,CAAC;AAAA,IACZ;AAAA,EACF;AAGA,MAAI,OAAO,SAAS,GAAG;AACrB,QAAI,OAAO,SAAS,GAAG;AACrB,aAAO,OAAO,SAAS,CAAC,IAAI,CAAC,GAAG,OAAO,OAAO,SAAS,CAAC,GAAG,GAAG,MAAM;AAAA,IACtE,OAAO;AACL,aAAO,KAAK,MAAM;AAAA,IACpB;AAAA,EACF;AAEA,SAAO;AACT;AAKA,SAAS,iBACP,SACA,WACA,cACiB;AACjB,QAAM,OAAO,QAAQ,KAAK,GAAG;AAG7B,QAAM,eAAe,KAAK,IAAI,cAAc,SAAS;AACrD,QAAM,SAAsB,aAAa,MAAM;AAAA,IAC7C,cAAc;AAAA,IACd,WAAW,KAAK,IAAI,cAAc,SAAS;AAAA,IAC3C,kBAAkB;AAAA,EACpB,CAAC;AAED,SAAO,OAAO,OAAO,IAAI,CAAC,OAAO;AAAA,IAC/B,SAAS,EAAE;AAAA,IACX,OAAO,EAAE;AAAA,IACT,YAAY,EAAE;AAAA,IACd,eAAe;AAAA,EACjB,EAAE;AACJ;AAUA,eAAsB,qBACpB,SACA,SACA,QAC8B;AAC9B,QAAM,MAA8B;AAAA,IAClC,GAAG;AAAA,IACH,GAAG;AAAA,EACL;AAGA,QAAM,YAAY,KAAK,IAAI,GAAG,IAAI,kBAAkB;AAGpD,MAAI,CAAC,WAAW,QAAQ,KAAK,EAAE,WAAW,GAAG;AAC3C,WAAO;AAAA,MACL,SAAS;AAAA,MACT,QAAQ,CAAC;AAAA,MACT,YAAY,CAAC;AAAA,MACb,QAAQ;AAAA,IACV;AAAA,EACF;AAEA,QAAM,YAAY,eAAe,OAAO;AAExC,MAAI,UAAU,UAAU,GAAG;AACzB,UAAM,aAAa,eAAe,OAAO;AACzC,WAAO;AAAA,MA
CL,SAAS;AAAA,MACT,QAAQ;AAAA,QACN;AAAA,UACE,SAAS,QAAQ,KAAK;AAAA,UACtB,OAAO;AAAA,UACP;AAAA,UACA,eAAe;AAAA,QACjB;AAAA,MACF;AAAA,MACA,YAAY,CAAC;AAAA,MACb,QAAQ;AAAA,IACV;AAAA,EACF;AAGA,QAAM,cAAc,eAAe,OAAO;AAC1C,MAAI,eAAe,IAAI,WAAW;AAChC,WAAO;AAAA,MACL,SAAS;AAAA,MACT,QAAQ;AAAA,QACN;AAAA,UACE,SAAS,QAAQ,KAAK;AAAA,UACtB,OAAO;AAAA,UACP,YAAY;AAAA,UACZ,eAAe;AAAA,QACjB;AAAA,MACF;AAAA,MACA,YAAY,CAAC;AAAA,MACb,QAAQ;AAAA,IACV;AAAA,EACF;AAGA,MAAI;AACJ,MAAI;AACF,iBAAa,MAAM,WAAW,WAAW,SAAS,SAAS;AAAA,EAC7D,QAAQ;AAEN,QAAI,IAAI,qBAAqB;AAC3B,aAAO,uBAAuB,SAAS,GAAG;AAAA,IAC5C;AACA,UAAM,IAAI;AAAA,MACR;AAAA,IACF;AAAA,EACF;AAEA,MAAI,WAAW,WAAW,UAAU,QAAQ;AAC1C,QAAI,IAAI,qBAAqB;AAC3B,aAAO,uBAAuB,SAAS,GAAG;AAAA,IAC5C;AACA,UAAM,IAAI;AAAA,MACR,sCAAsC,UAAU,MAAM,4BAA4B,WAAW,MAAM;AAAA,IACrG;AAAA,EACF;AAGA,QAAM,eAAyB,CAAC;AAChC,WAAS,IAAI,GAAG,IAAI,UAAU,SAAS,GAAG,KAAK;AAC7C,iBAAa,KAAK,iBAAiB,WAAW,CAAC,GAAG,WAAW,IAAI,CAAC,CAAC,CAAC;AAAA,EACtE;AAIA,MAAI,aAAa,UAAU,GAAG;AAC5B,QAAI,cAAc,IAAI,WAAW;AAC/B,aAAO,uBAAuB,SAAS,GAAG;AAAA,IAC5C;AACA,WAAO;AAAA,MACL,SAAS;AAAA,MACT,QAAQ;AAAA,QACN;AAAA,UACE,SAAS,QAAQ,KAAK;AAAA,UACtB,OAAO;AAAA,UACP,YAAY;AAAA,UACZ,eAAe,aAAa,WAAW,IAAI,aAAa,CAAC,IAAI;AAAA,QAC/D;AAAA,MACF;AAAA,MACA,YAAY,CAAC;AAAA,MACb,QAAQ;AAAA,IACV;AAAA,EACF;AAGA,QAAM,WAAW,cAAc,cAAc,IAAI,mBAAmB;AAGpE,QAAM,IAAI,KAAK,QAAQ;AACvB,QAAM,IAAI,OAAO,QAAQ;AACzB,QAAM,YAAY,IAAI,IAAI,2BAA2B;AACrD,QAAM,gBAAgB,gBAAgB,UAAU,SAAS;AAGzD,MAAI,WAAW,cAAc,WAAW,aAAa;AACrD,aAAW,mBAAmB,UAAU,IAAI,SAAS;AAGrD,QAAM,SAA0B,CAAC;AACjC,QAAM,kBAA4B,CAAC;AACnC,MAAI,iBAAiB;AAErB,WAAS,SAAS,GAAG,SAAS,SAAS,QAAQ,UAAU;AACvD,UAAM,UAAU,SAAS,MAAM;AAC/B,UAAM,UAAU,QAAQ,KAAK,GAAG;AAChC,UAAM,YAAY,eAAe,OAAO;AAExC,QAAI,YAAY,IAAI,WAAW;AAE7B,YAAM,YAAY,iBAAiB,SAAS,IAAI,WAAW,IAAI,YAAY;AAC3E,iBAAW,MAAM,WAAW;AAC1B,eAAO,KAAK;AAAA,UACV,GAAG;AAAA,UACH,OAAO,OAAO;AAAA,QAChB,CAAC;AAAA,MACH;AAAA,IACF,OAAO;AAEL,YAAM,sBAAsB,iBAAiB,QAAQ,SAAS;AAC9D,UAAI,SAAS;AACb,UACE,sBAAsB,aAAa,UACnC,SAAS,SAAS,SAAS,GAC3B;AACA,iBAAS,SAAS,mBAAmB,KAAK,aAAa,mBAAmB,KAAK;AAAA,MACjF;AAEA,aAAO,KAAK;A
AAA,QACV,SAAS;AAAA,QACT,OAAO,OAAO;AAAA,QACd,YAAY;AAAA,QACZ,eAAe;AAAA,MACjB,CAAC;AAAA,IACH;AAGA,QAAI,SAAS,SAAS,SAAS,GAAG;AAChC,sBAAgB,KAAK,iBAAiB,QAAQ,SAAS,CAAC;AAAA,IAC1D;AACA,sBAAkB,QAAQ;AAAA,EAC5B;AAEA,SAAO;AAAA,IACL,SAAS,OAAO,SAAS;AAAA,IACzB;AAAA,IACA,YAAY;AAAA,IACZ,QAAQ;AAAA,EACV;AACF;AAMA,SAAS,uBACP,SACA,KACqB;AAGrB,QAAM,eAAe,KAAK,IAAI,IAAI,cAAc,IAAI,SAAS;AAC7D,QAAM,SAAsB,aAAa,SAAS;AAAA,IAChD,cAAc;AAAA,IACd,WAAW,KAAK,IAAI,IAAI,WAAW,YAAY;AAAA,IAC/C,kBAAkB;AAAA,EACpB,CAAC;AAED,SAAO;AAAA,IACL,SAAS,OAAO;AAAA,IAChB,QAAQ,OAAO,OAAO,IAAI,CAAC,OAAO;AAAA,MAChC,GAAG;AAAA,MACH,eAAe;AAAA,IACjB,EAAE;AAAA,IACF,YAAY,CAAC;AAAA,IACb,QAAQ;AAAA,EACV;AACF;","names":[]}
|
|
File without changes
|
|
File without changes
|