@knolo/core 3.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/agent.d.ts +53 -0
- package/dist/agent.js +175 -0
- package/dist/builder.d.ts +20 -0
- package/dist/builder.js +196 -0
- package/dist/index.d.ts +21 -0
- package/dist/index.js +13 -0
- package/dist/indexer.d.ts +23 -0
- package/dist/indexer.js +71 -0
- package/dist/pack.d.ts +35 -0
- package/dist/pack.js +175 -0
- package/dist/patch.d.ts +22 -0
- package/dist/patch.js +35 -0
- package/dist/quality/diversify.d.ts +13 -0
- package/dist/quality/diversify.js +41 -0
- package/dist/quality/proximity.d.ts +2 -0
- package/dist/quality/proximity.js +31 -0
- package/dist/quality/signature.d.ts +3 -0
- package/dist/quality/signature.js +24 -0
- package/dist/quality/similarity.d.ts +3 -0
- package/dist/quality/similarity.js +27 -0
- package/dist/query.d.ts +41 -0
- package/dist/query.js +463 -0
- package/dist/rank.d.ts +21 -0
- package/dist/rank.js +31 -0
- package/dist/router.d.ts +28 -0
- package/dist/router.js +74 -0
- package/dist/routing_profile.d.ts +19 -0
- package/dist/routing_profile.js +102 -0
- package/dist/semantic.d.ts +7 -0
- package/dist/semantic.js +98 -0
- package/dist/tokenize.d.ts +24 -0
- package/dist/tokenize.js +53 -0
- package/dist/tool_gate.d.ts +3 -0
- package/dist/tool_gate.js +8 -0
- package/dist/tool_parse.d.ts +2 -0
- package/dist/tool_parse.js +102 -0
- package/dist/tools.d.ts +27 -0
- package/dist/tools.js +34 -0
- package/dist/trace.d.ts +45 -0
- package/dist/trace.js +12 -0
- package/dist/utils/utf8.d.ts +8 -0
- package/dist/utils/utf8.js +72 -0
- package/package.json +39 -0
package/dist/agent.d.ts
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
import type { Pack } from './pack.js';
import type { QueryOptions } from './query.js';
/**
 * System prompt source: either literal lines (joined with '\n' at render
 * time) or a markdown template containing {{placeholder}} tokens.
 */
export type AgentPromptTemplate = string[] | {
    format: 'markdown';
    template: string;
};
/**
 * Tool access policy: 'allow' permits only the listed tools,
 * 'deny' permits everything except the listed tools.
 */
export type AgentToolPolicy = {
    mode: 'allow' | 'deny';
    tools: string[];
};
/** Retrieval options baked into an agent; callers may override most at query time (namespace is pinned). */
export type AgentRetrievalDefaults = {
    namespace: string[];
    topK?: number;
    queryExpansion?: QueryOptions['queryExpansion'];
    semantic?: Omit<NonNullable<QueryOptions['semantic']>, 'queryEmbedding' | 'enabled' | 'force'> & {
        enabled?: boolean;
    };
    minScore?: number;
    requirePhrases?: string[];
    source?: string[];
};
/** A single agent definition (schema version 1). */
export type AgentDefinitionV1 = {
    id: string;
    version: 1;
    name?: string;
    description?: string;
    systemPrompt: AgentPromptTemplate;
    retrievalDefaults: AgentRetrievalDefaults;
    toolPolicy?: AgentToolPolicy;
    metadata?: Record<string, string | number | boolean | null>;
};
/** Registry of agents carried in a pack's meta section. */
export type AgentRegistry = {
    version: 1;
    agents: AgentDefinitionV1[];
};
/** Input to resolveAgent: target agent, optional query overrides, optional prompt patch values. */
export type ResolveAgentInput = {
    agentId: string;
    query?: QueryOptions;
    patch?: Record<string, string | number | boolean>;
};
/** Output of resolveAgent: the definition, the rendered prompt, and the merged retrieval options. */
export type ResolvedAgent = {
    agent: AgentDefinitionV1;
    systemPrompt: string;
    retrievalOptions: QueryOptions;
};
export declare function validateAgentRegistry(reg: AgentRegistry): void;
export declare function validateAgentDefinition(agent: AgentDefinitionV1): void;
export declare function listAgents(pack: Pack): string[];
export declare function getAgent(pack: Pack, agentId: string): AgentDefinitionV1 | undefined;
export declare function buildSystemPrompt(agent: AgentDefinitionV1, patch?: Record<string, string | number | boolean>): string;
export declare function resolveAgent(pack: Pack, input: ResolveAgentInput): ResolvedAgent;
export declare function isToolAllowed(agent: AgentDefinitionV1, toolId: string): boolean;
export declare function assertToolAllowed(agent: AgentDefinitionV1, toolId: string): void;
|
package/dist/agent.js
ADDED
|
@@ -0,0 +1,175 @@
|
|
|
1
|
+
import { validateQueryOptions } from './query.js';
|
|
2
|
+
/**
 * Validate an agent registry envelope: object shape, version tag, and the
 * agents array. Each agent is validated individually and ids must be unique.
 * Throws an Error describing the first problem found.
 */
export function validateAgentRegistry(reg) {
    const isObject = Boolean(reg) && typeof reg === 'object';
    if (!isObject) {
        throw new Error('agent registry must be an object.');
    }
    if (reg.version !== 1) {
        throw new Error('agent registry version must be 1.');
    }
    if (!Array.isArray(reg.agents)) {
        throw new Error('agent registry agents must be an array.');
    }
    const knownIds = new Set();
    reg.agents.forEach((agent) => {
        validateAgentDefinition(agent);
        if (knownIds.has(agent.id)) {
            throw new Error(`agent id must be unique: ${agent.id}`);
        }
        knownIds.add(agent.id);
    });
}
|
|
21
|
+
/**
 * Validate a single agent definition (schema v1).
 * Checks in order: object shape, id (non-empty, slug-like), version,
 * system prompt shape, retrievalDefaults (namespace required, topK a
 * positive integer), the optional toolPolicy, and finally runs the
 * retrieval defaults through validateQueryOptions as a synthetic
 * QueryOptions object. Throws an Error on the first failed check.
 */
export function validateAgentDefinition(agent) {
    if (!agent || typeof agent !== 'object') {
        throw new Error('agent definition must be an object.');
    }
    if (typeof agent.id !== 'string' || !agent.id.trim()) {
        throw new Error('agent id must be a non-empty string.');
    }
    // Slug: lowercase alphanumeric runs separated by single '.', '_' or '-'.
    if (!/^[a-z0-9]+(?:[._-][a-z0-9]+)*$/.test(agent.id)) {
        throw new Error(`agent id must be slug-like: ${agent.id}`);
    }
    if (agent.version !== 1) {
        throw new Error(`agent ${agent.id} version must be 1.`);
    }
    validateSystemPrompt(agent);
    const defaults = agent.retrievalDefaults;
    if (!defaults || typeof defaults !== 'object') {
        throw new Error(`agent ${agent.id} retrievalDefaults must be an object.`);
    }
    if (!Array.isArray(defaults.namespace) ||
        defaults.namespace.length === 0 ||
        defaults.namespace.some((ns) => typeof ns !== 'string' || !ns.trim())) {
        throw new Error(`agent ${agent.id} retrievalDefaults.namespace must be a non-empty string array.`);
    }
    if (defaults.topK !== undefined &&
        (!Number.isInteger(defaults.topK) || defaults.topK < 1)) {
        throw new Error(`agent ${agent.id} retrievalDefaults.topK must be a positive integer.`);
    }
    if (agent.toolPolicy) {
        const { mode, tools } = agent.toolPolicy;
        if (mode !== 'allow' && mode !== 'deny') {
            throw new Error(`agent ${agent.id} toolPolicy.mode must be "allow" or "deny".`);
        }
        if (!Array.isArray(tools) ||
            tools.some((tool) => typeof tool !== 'string' || !tool.trim())) {
            throw new Error(`agent ${agent.id} toolPolicy.tools must be a string array.`);
        }
        if (new Set(tools).size !== tools.length) {
            throw new Error(`agent ${agent.id} toolPolicy.tools must contain unique values.`);
        }
    }
    // Reuse the query validator so agent defaults obey the same rules
    // as live query options.
    const syntheticOpts = {
        namespace: defaults.namespace,
        topK: defaults.topK,
        queryExpansion: defaults.queryExpansion,
        semantic: defaults.semantic,
        minScore: defaults.minScore,
        requirePhrases: defaults.requirePhrases,
        source: defaults.source,
    };
    validateQueryOptions(syntheticOpts);
}
|
|
72
|
+
/** Return the ids of all agents registered in a pack; empty array when none. */
export function listAgents(pack) {
    const agents = pack.meta.agents?.agents;
    return agents && agents.length ? agents.map((a) => a.id) : [];
}
|
|
78
|
+
/** Look up an agent definition by id; returns undefined when absent. */
export function getAgent(pack, agentId) {
    const registry = pack.meta.agents;
    if (!registry) {
        return undefined;
    }
    return registry.agents.find((candidate) => candidate.id === agentId);
}
|
|
81
|
+
// Matches {{ key }} placeholders; keys are limited to [A-Za-z0-9_.-].
// Hoisted so the validation and substitution use one shared definition
// (the original duplicated the literal and scanned the template twice).
const PLACEHOLDER_RE = /\{\{\s*([A-Za-z0-9_.-]+)\s*\}\}/g;
/**
 * Render an agent's system prompt.
 * Array templates are joined with '\n'. Markdown templates have each
 * {{placeholder}} replaced with String(patch[key]); a placeholder with no
 * matching patch key throws, in order of appearance, before any output is
 * produced (the throw aborts the substitution).
 *
 * @param agent Agent definition whose systemPrompt is rendered.
 * @param patch Values substituted into markdown placeholders.
 * @returns The rendered prompt string.
 * @throws Error when a markdown placeholder has no patch value.
 */
export function buildSystemPrompt(agent, patch = {}) {
    const template = agent.systemPrompt;
    if (Array.isArray(template)) {
        return template.join('\n');
    }
    // Single pass: validate and substitute together. `replace` with a /g
    // regex resets lastIndex, so the shared regex carries no state between calls.
    return template.template.replace(PLACEHOLDER_RE, (_match, key) => {
        if (!(key in patch)) {
            throw new Error(`agent ${agent.id} system prompt missing patch value for placeholder: ${key}`);
        }
        return String(patch[key]);
    });
}
|
|
95
|
+
/**
 * Resolve an agent into a rendered system prompt plus merged retrieval options.
 * Caller-supplied query options override the agent's defaults, except
 * `namespace`, which is always pinned to the agent's configured namespaces.
 * `queryExpansion`, `semantic`, and `semantic.blend` are shallow-merged with
 * caller keys winning; the merge artifacts are deleted again below when
 * neither side actually provided the option. Throws when the agent id is
 * unknown or the merged options fail validateQueryOptions.
 */
export function resolveAgent(pack, input) {
    const agent = getAgent(pack, input.agentId);
    if (!agent) {
        throw new Error(`agent not found: ${input.agentId}`);
    }
    // Flatten the agent's retrieval defaults into QueryOptions shape.
    const defaults = {
        namespace: agent.retrievalDefaults.namespace,
        topK: agent.retrievalDefaults.topK,
        queryExpansion: agent.retrievalDefaults.queryExpansion,
        semantic: agent.retrievalDefaults.semantic,
        minScore: agent.retrievalDefaults.minScore,
        requirePhrases: agent.retrievalDefaults.requirePhrases,
        source: agent.retrievalDefaults.source,
    };
    const caller = input.query ?? {};
    const retrievalOptions = {
        ...defaults,
        ...caller,
        // Namespace is not overridable: agents stay scoped to their own data.
        namespace: defaults.namespace,
        queryExpansion: {
            ...(defaults.queryExpansion ?? {}),
            ...(caller.queryExpansion ?? {}),
        },
        semantic: {
            ...(defaults.semantic ?? {}),
            ...(caller.semantic ?? {}),
            blend: {
                ...(defaults.semantic?.blend ?? {}),
                ...(caller.semantic?.blend ?? {}),
            },
        },
    };
    // The spreads above always materialize empty objects; strip them back out
    // when neither the agent nor the caller supplied the corresponding option.
    if (!defaults.queryExpansion && !caller.queryExpansion)
        delete retrievalOptions.queryExpansion;
    if (!defaults.semantic && !caller.semantic)
        delete retrievalOptions.semantic;
    if (retrievalOptions.semantic &&
        !defaults.semantic?.blend &&
        !caller.semantic?.blend) {
        delete retrievalOptions.semantic.blend;
    }
    validateQueryOptions(retrievalOptions);
    return {
        agent,
        systemPrompt: buildSystemPrompt(agent, input.patch),
        retrievalOptions,
    };
}
|
|
143
|
+
/**
 * Decide whether an agent's tool policy permits toolId.
 * No policy means everything is allowed; an 'allow' policy permits only
 * listed tools; a 'deny' policy permits everything except listed tools.
 */
export function isToolAllowed(agent, toolId) {
    const policy = agent.toolPolicy;
    if (!policy) {
        return true;
    }
    const listed = policy.tools.includes(toolId);
    return policy.mode === 'allow' ? listed : !listed;
}
|
|
153
|
+
/** Throw when the agent's tool policy forbids toolId; no-op otherwise. */
export function assertToolAllowed(agent, toolId) {
    if (isToolAllowed(agent, toolId)) {
        return;
    }
    throw new Error(`agent ${agent.id} does not allow tool: ${toolId}`);
}
|
|
158
|
+
/**
 * Validate agent.systemPrompt: either a non-empty array of strings with some
 * non-whitespace content, or a { format: 'markdown', template } object whose
 * template is a non-empty string. Throws on the first violation.
 */
function validateSystemPrompt(agent) {
    const prompt = agent.systemPrompt;
    if (Array.isArray(prompt)) {
        const allStrings = prompt.every((line) => typeof line === 'string');
        if (prompt.length === 0 || !allStrings) {
            throw new Error(`agent ${agent.id} systemPrompt must be a non-empty string array.`);
        }
        if (prompt.join('').trim() === '') {
            throw new Error(`agent ${agent.id} systemPrompt must not be empty.`);
        }
        return;
    }
    const isValidMarkdown = Boolean(prompt) &&
        prompt.format === 'markdown' &&
        typeof prompt.template === 'string' &&
        prompt.template.trim() !== '';
    if (!isValidMarkdown) {
        throw new Error(`agent ${agent.id} systemPrompt markdown template must be a non-empty string.`);
    }
}
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
import type { AgentDefinitionV1, AgentRegistry } from './agent.js';
/** One input document for buildPack; only `text` is required. */
export type BuildInputDoc = {
    id?: string;
    heading?: string;
    namespace?: string;
    text: string;
};
/** Options for buildPack: an optional agent registry (or bare agent array) and an optional semantic section. */
export type BuildPackOptions = {
    agents?: AgentRegistry | AgentDefinitionV1[];
    semantic?: {
        enabled: boolean;
        modelId: string;
        /** One embedding per input doc/block; all vectors must share the same dimension. */
        embeddings: Float32Array[];
        /** Only 'int8_l2norm' is supported by the builder. */
        quantization?: {
            type: 'int8_l2norm';
            perVectorScale?: true;
        };
    };
};
/** Build a serialized `.knolo` pack (binary Uint8Array) from input docs. */
export declare function buildPack(docs: BuildInputDoc[], opts?: BuildPackOptions): Promise<Uint8Array>;
|
package/dist/builder.js
ADDED
|
@@ -0,0 +1,196 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* builder.ts
|
|
3
|
+
*
|
|
4
|
+
* Build `.knolo` packs from input docs. Persists headings/docIds/token lengths
|
|
5
|
+
* and stores avgBlockLen in meta for stable query-time normalization.
|
|
6
|
+
*/
|
|
7
|
+
import { buildIndex } from './indexer.js';
|
|
8
|
+
import { tokenize } from './tokenize.js';
|
|
9
|
+
import { getTextEncoder } from './utils/utf8.js';
|
|
10
|
+
import { encodeScaleF16, quantizeEmbeddingInt8L2Norm } from './semantic.js';
|
|
11
|
+
import { validateAgentRegistry } from './agent.js';
|
|
12
|
+
/**
 * Build a `.knolo` pack from input docs.
 *
 * Binary layout (every length prefix is a little-endian uint32):
 *   [len][meta JSON] [len][lexicon JSON] [count][postings as uint32...]
 *   [len][blocks JSON], then optionally [len][semantic JSON] [len][semantic blob]
 * when semantic embeddings were supplied.
 *
 * @param docs Array of { text, id?, heading?, namespace? }.
 * @param opts Optional agent registry and semantic embedding section.
 * @returns The serialized pack bytes.
 */
export async function buildPack(docs, opts = {}) {
    const normalizedDocs = validateDocs(docs);
    // Prepare blocks (strip MD) and carry heading/docId for optional boosts.
    const blocks = normalizedDocs.map((d, i) => ({
        id: i,
        text: stripMd(d.text),
        heading: d.heading,
    }));
    // Build index
    const { lexicon, postings } = buildIndex(blocks);
    // Per-block token lengths and avgBlockLen are persisted so query-time
    // length normalization is stable (see file header comment).
    const blockTokenLens = blocks.map((b) => tokenize(b.text).length);
    const totalTokens = blockTokenLens.reduce((sum, len) => sum + len, 0);
    const avgBlockLen = blocks.length ? totalTokens / blocks.length : 1;
    const agents = normalizeAgents(opts.agents);
    const meta = {
        version: 3,
        stats: {
            docs: normalizedDocs.length,
            blocks: blocks.length,
            terms: lexicon.length,
            avgBlockLen,
        },
        // Only include the agents key when a registry was actually provided.
        ...(agents ? { agents } : {}),
    };
    // Persist blocks as objects to optionally carry heading/docId/token length.
    const blocksPayload = blocks.map((b, i) => ({
        text: b.text,
        heading: b.heading ?? null,
        docId: normalizedDocs[i]?.id ?? null,
        namespace: normalizedDocs[i]?.namespace ?? null,
        len: blockTokenLens[i] ?? 0,
    }));
    // Encode sections
    const enc = getTextEncoder();
    const metaBytes = enc.encode(JSON.stringify(meta));
    const lexBytes = enc.encode(JSON.stringify(lexicon));
    const blocksBytes = enc.encode(JSON.stringify(blocksPayload));
    const semanticEnabled = Boolean(opts.semantic?.enabled);
    const semanticSection = semanticEnabled && opts.semantic
        ? buildSemanticSection(blocks.length, opts.semantic)
        : undefined;
    const semBytes = semanticSection
        ? enc.encode(JSON.stringify(semanticSection.semJson))
        : undefined;
    const semBlob = semanticSection?.semBlob;
    // Pre-compute the total byte size: four length-prefixed sections plus the
    // optional semantic JSON/blob pair.
    const totalLength = 4 +
        metaBytes.length +
        4 +
        lexBytes.length +
        4 +
        postings.length * 4 +
        4 +
        blocksBytes.length +
        (semanticEnabled && semBytes && semBlob
            ? 4 + semBytes.length + 4 + semBlob.length
            : 0);
    const out = new Uint8Array(totalLength);
    const dv = new DataView(out.buffer);
    let offset = 0;
    // meta
    dv.setUint32(offset, metaBytes.length, true);
    offset += 4;
    out.set(metaBytes, offset);
    offset += metaBytes.length;
    // lexicon
    dv.setUint32(offset, lexBytes.length, true);
    offset += 4;
    out.set(lexBytes, offset);
    offset += lexBytes.length;
    // postings (alignment-safe via DataView)
    // Note: the prefix here is the ELEMENT count, not a byte length.
    dv.setUint32(offset, postings.length, true);
    offset += 4;
    for (let i = 0; i < postings.length; i++) {
        dv.setUint32(offset, postings[i], true);
        offset += 4;
    }
    // blocks
    dv.setUint32(offset, blocksBytes.length, true);
    offset += 4;
    out.set(blocksBytes, offset);
    offset += blocksBytes.length;
    if (semanticEnabled && semBytes && semBlob) {
        dv.setUint32(offset, semBytes.length, true);
        offset += 4;
        out.set(semBytes, offset);
        offset += semBytes.length;
        dv.setUint32(offset, semBlob.length, true);
        offset += 4;
        out.set(semBlob, offset);
    }
    return out;
}
|
|
104
|
+
/**
 * Accept either a full registry object or a bare array of agent definitions;
 * wrap arrays into a { version: 1 } registry, validate, and return it.
 * Returns undefined when no agents were supplied.
 */
function normalizeAgents(input) {
    if (!input) {
        return undefined;
    }
    let registry;
    if (Array.isArray(input)) {
        registry = { version: 1, agents: input };
    } else {
        registry = input;
    }
    validateAgentRegistry(registry);
    return registry;
}
|
|
113
|
+
/**
 * Quantize per-block embeddings to int8 (L2-normalized, one float16 scale per
 * vector) and pack them into a single blob plus a JSON descriptor carrying
 * byte offsets into that blob. Throws when the embedding count, vector dims,
 * or quantization type are inconsistent.
 */
function buildSemanticSection(blockCount, semantic) {
    const { embeddings } = semantic;
    if (!Array.isArray(embeddings) || embeddings.length !== blockCount) {
        throw new Error(`semantic.embeddings must be provided with one embedding per block (expected ${blockCount}).`);
    }
    // Only int8_l2norm is implemented; reject anything else explicitly.
    const quantizationType = semantic.quantization?.type ?? 'int8_l2norm';
    if (quantizationType !== 'int8_l2norm') {
        throw new Error(`Unsupported semantic quantization type: ${quantizationType}`);
    }
    // The first vector fixes the dimensionality for the whole section.
    const dims = embeddings[0]?.length ?? 0;
    if (!dims)
        throw new Error('semantic.embeddings must contain vectors with non-zero dimensions.');
    const vecs = new Int8Array(embeddings.length * dims);
    const scales = new Uint16Array(embeddings.length);
    for (let i = 0; i < embeddings.length; i++) {
        const embedding = embeddings[i];
        if (!(embedding instanceof Float32Array)) {
            throw new Error(`semantic.embeddings[${i}] must be a Float32Array.`);
        }
        if (embedding.length !== dims) {
            throw new Error(`semantic.embeddings[${i}] dims mismatch: expected ${dims}, got ${embedding.length}.`);
        }
        const { q, scale } = quantizeEmbeddingInt8L2Norm(embedding);
        vecs.set(q, i * dims);
        // Scales are stored as float16 bit patterns in a Uint16Array.
        scales[i] = encodeScaleF16(scale);
    }
    // Blob layout: all int8 vectors first, then the float16 scales.
    const vecByteOffset = 0;
    const vecByteLength = vecs.byteLength;
    const scalesByteOffset = vecByteLength;
    const scalesByteLength = scales.byteLength;
    const semBlob = new Uint8Array(vecByteLength + scalesByteLength);
    semBlob.set(new Uint8Array(vecs.buffer, vecs.byteOffset, vecByteLength), vecByteOffset);
    semBlob.set(new Uint8Array(scales.buffer, scales.byteOffset, scalesByteLength), scalesByteOffset);
    // Descriptor mirrored into the pack's semantic JSON section.
    const semJson = {
        version: 1,
        modelId: semantic.modelId,
        dims,
        encoding: 'int8_l2norm',
        perVectorScale: true,
        blocks: {
            vectors: { byteOffset: vecByteOffset, length: vecs.length },
            scales: {
                byteOffset: scalesByteOffset,
                length: scales.length,
                encoding: 'float16',
            },
        },
    };
    return { semJson, semBlob };
}
|
|
163
|
+
/**
 * Ensure docs is an array of { text, id?, heading?, namespace? } objects with
 * a non-empty text and string-typed optional fields. Returns a shallow copy
 * of the list (same doc references); throws on the first invalid entry.
 */
function validateDocs(docs) {
    if (!Array.isArray(docs)) {
        throw new Error('buildPack expects an array of docs: [{ text, id?, heading?, namespace? }, ...]');
    }
    return docs.map((doc, i) => {
        const fail = (detail) => {
            throw new Error(`Invalid doc at index ${i}: ${detail}`);
        };
        if (!doc || typeof doc !== 'object') {
            fail('expected an object with a string "text" field.');
        }
        if (typeof doc.text !== 'string' || doc.text.trim() === '') {
            fail('"text" must be a non-empty string.');
        }
        // Optional fields must be strings when present.
        for (const field of ['id', 'heading', 'namespace']) {
            if (doc[field] !== undefined && typeof doc[field] !== 'string') {
                fail(`"${field}" must be a string when provided.`);
            }
        }
        return doc;
    });
}
|
|
186
|
+
/** Strip Markdown syntax with lightweight regexes (no deps). */
function stripMd(md) {
    const text = md
        .replace(/```[\s\S]*?```/g, ' ')            // fenced code blocks
        .replace(/`[^`]*`/g, ' ')                   // inline code spans
        .replace(/[\*_~]+/g, ' ')                   // emphasis/strikethrough marks
        .replace(/^#+\s*/gm, '')                    // leading heading hashes
        .replace(/\[([^\]]+)\]\([^)]+\)/g, '$1')    // links -> keep the label
        .replace(/[\[\]()]/g, ' ')                  // leftover brackets/parens
        .replace(/\s+/g, ' ');                      // collapse runs of whitespace
    return text.trim();
}
|
package/dist/index.d.ts
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
/**
 * Public type and runtime surface of @knolo/core: pack mounting/building,
 * querying, agents, routing, tools, and tracing. Type-only names are
 * re-exported with `export type`; runtime exports mirror dist/index.js.
 */
export { mountPack, hasSemantic } from './pack.js';
export { query, lexConfidence, validateQueryOptions, validateSemanticQueryOptions, } from './query.js';
export { makeContextPatch } from './patch.js';
export { buildPack } from './builder.js';
export { quantizeEmbeddingInt8L2Norm, encodeScaleF16, decodeScaleF16, } from './semantic.js';
export { listAgents, getAgent, resolveAgent, buildSystemPrompt, isToolAllowed, assertToolAllowed, validateAgentRegistry, validateAgentDefinition, } from './agent.js';
export type { MountOptions, PackMeta, Pack } from './pack.js';
export type { QueryOptions, Hit } from './query.js';
export type { ContextPatch } from './patch.js';
export type { BuildInputDoc, BuildPackOptions } from './builder.js';
export type { AgentPromptTemplate, AgentToolPolicy, AgentRetrievalDefaults, AgentDefinitionV1, AgentRegistry, ResolveAgentInput, ResolvedAgent, } from './agent.js';
export { parseToolCallV1FromText } from './tool_parse.js';
export { nowIso, createTrace } from './trace.js';
export { assertToolCallAllowed } from './tool_gate.js';
export { getAgentRoutingProfileV1, getPackRoutingProfilesV1, } from './routing_profile.js';
export { isRouteDecisionV1, validateRouteDecisionV1, selectAgentIdFromRouteDecisionV1, } from './router.js';
export { isToolCallV1, isToolResultV1 } from './tools.js';
export type { ToolId, ToolCallV1, ToolResultErrorV1, ToolResultV1, ToolSpecV1, } from './tools.js';
export type { TraceEventV1 } from './trace.js';
export type { AgentRoutingProfileV1 } from './routing_profile.js';
export type { RouteCandidateV1, RouteDecisionV1 } from './router.js';
|
package/dist/index.js
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
// src/index.ts
// Runtime re-exports for @knolo/core. Type-only exports live in index.d.ts.
export { mountPack, hasSemantic } from './pack.js';
export { query, lexConfidence, validateQueryOptions, validateSemanticQueryOptions, } from './query.js';
export { makeContextPatch } from './patch.js';
export { buildPack } from './builder.js';
export { quantizeEmbeddingInt8L2Norm, encodeScaleF16, decodeScaleF16, } from './semantic.js';
export { listAgents, getAgent, resolveAgent, buildSystemPrompt, isToolAllowed, assertToolAllowed, validateAgentRegistry, validateAgentDefinition, } from './agent.js';
export { parseToolCallV1FromText } from './tool_parse.js';
export { nowIso, createTrace } from './trace.js';
export { assertToolCallAllowed } from './tool_gate.js';
export { getAgentRoutingProfileV1, getPackRoutingProfilesV1, } from './routing_profile.js';
export { isRouteDecisionV1, validateRouteDecisionV1, selectAgentIdFromRouteDecisionV1, } from './router.js';
export { isToolCallV1, isToolResultV1 } from './tools.js';
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
/** A unit of indexable text; `id` is its position in the pack's block list. */
export type Block = {
    id: number;
    text: string;
    heading?: string;
};
/** Output of buildIndex: term -> termId pairs plus the flattened postings stream. */
export type IndexBuildResult = {
    lexicon: Array<[string, number]>;
    postings: Uint32Array;
};
/**
 * Build an inverted index from an array of blocks. The postings list format
 * encodes, for each term, a header containing the termId, followed by
 * sequences of blockId and positions for that term, with zeros as delimiters.
 * The structure looks like:
 *
 * [termId, blockId+1, pos, pos, 0, blockId+1, pos, 0, 0, termId, ...]
 *
 * Block IDs are stored as bid+1 so that 0 can remain a sentinel delimiter.
 * Each block section ends with a 0, and each term section ends with a 0. The
 * entire array can be streamed sequentially without needing to know the sizes
 * of individual lists ahead of time.
 */
export declare function buildIndex(blocks: Block[]): IndexBuildResult;
|
package/dist/indexer.js
ADDED
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* indexer.ts
|
|
3
|
+
*
|
|
4
|
+
* Implements a basic inverted index builder. Given an array of blocks, it
|
|
5
|
+
* produces a lexicon mapping each unique term to a term identifier and a
|
|
6
|
+
* flattened postings array. This representation is intentionally naïve to
|
|
7
|
+
* prioritise clarity and portability over maximum compression. The pack
|
|
8
|
+
* builder can later swap this implementation for a more compact format.
|
|
9
|
+
*/
|
|
10
|
+
import { tokenize } from "./tokenize.js";
|
|
11
|
+
/**
|
|
12
|
+
* Build an inverted index from an array of blocks. The postings list format
|
|
13
|
+
* encodes, for each term, a header containing the termId, followed by
|
|
14
|
+
* sequences of blockId and positions for that term, with zeros as delimiters.
|
|
15
|
+
* The structure looks like:
|
|
16
|
+
*
|
|
17
|
+
* [termId, blockId+1, pos, pos, 0, blockId+1, pos, 0, 0, termId, ...]
|
|
18
|
+
*
|
|
19
|
+
* Block IDs are stored as bid+1 so that 0 can remain a sentinel delimiter.
|
|
20
|
+
* Each block section ends with a 0, and each term section ends with a 0. The
|
|
21
|
+
* entire array can be streamed sequentially without needing to know the sizes
|
|
22
|
+
* of individual lists ahead of time.
|
|
23
|
+
*/
|
|
24
|
+
export function buildIndex(blocks) {
    // term -> termId; ids start at 1 so 0 stays free as the stream sentinel.
    const termIds = new Map();
    // termId -> (blockId -> positions), both in first-seen insertion order,
    // which determines the serialization order below.
    const postingsByTerm = new Map();
    function internTerm(term) {
        const existing = termIds.get(term);
        if (existing !== undefined) {
            return existing;
        }
        const id = termIds.size + 1; // term IDs start at 1
        termIds.set(term, id);
        return id;
    }
    for (const block of blocks) {
        // First gather positions per term within this single block.
        const positionsInBlock = new Map();
        for (const token of tokenize(block.text)) {
            const tid = internTerm(token.term);
            const bucket = positionsInBlock.get(tid);
            if (bucket) {
                bucket.push(token.pos);
            } else {
                positionsInBlock.set(tid, [token.pos]);
            }
        }
        // Then fold the per-block map into the global posting structure.
        for (const [tid, positions] of positionsInBlock) {
            let perBlock = postingsByTerm.get(tid);
            if (!perBlock) {
                perBlock = new Map();
                postingsByTerm.set(tid, perBlock);
            }
            perBlock.set(block.id, positions);
        }
    }
    // Serialize: [termId, blockId+1, pos..., 0, ..., 0] per term (0 = sentinel).
    const flat = [];
    for (const [tid, perBlock] of postingsByTerm) {
        flat.push(tid);
        for (const [bid, positions] of perBlock) {
            flat.push(bid + 1);
            for (const pos of positions) {
                flat.push(pos);
            }
            flat.push(0); // end of this block's positions
        }
        flat.push(0); // end of term
    }
    return { lexicon: Array.from(termIds.entries()), postings: new Uint32Array(flat) };
}
|
package/dist/pack.d.ts
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
import type { AgentRegistry } from './agent.js';
/** Input to mountPack: raw pack bytes, or a string source (presumably a path/URL — see pack.js). */
export type MountOptions = {
    src: string | ArrayBufferLike | Uint8Array;
};
/** Decoded JSON meta section of a pack: format version, corpus stats, optional agent registry. */
export type PackMeta = {
    version: number;
    stats: {
        docs: number;
        blocks: number;
        terms: number;
        avgBlockLen?: number;
    };
    agents?: AgentRegistry;
};
/** A mounted pack: decoded lexicon/postings/blocks plus optional per-block extras. */
export type Pack = {
    meta: PackMeta;
    lexicon: Map<string, number>;
    postings: Uint32Array;
    blocks: string[];
    headings?: (string | null)[];
    docIds?: (string | null)[];
    namespaces?: (string | null)[];
    blockTokenLens?: number[];
    /** Present only when the pack was built with semantic embeddings. */
    semantic?: {
        version: 1;
        modelId: string;
        dims: number;
        encoding: 'int8_l2norm';
        perVectorScale: boolean;
        vecs: Int8Array;
        scales?: Uint16Array;
    };
};
export declare function hasSemantic(pack: Pack): boolean;
export declare function mountPack(opts: MountOptions): Promise<Pack>;
|