npm - @tobilu/qmd - Versions diffs - 2.0.0 → 2.1.0 - Mend

@tobilu/qmd 2.0.0 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (27) hide show

package/dist/store.d.ts CHANGED Viewed

@@ -18,6 +18,8 @@ export declare const DEFAULT_RERANK_MODEL = "ExpedientFalcon/qwen3-reranker:0.6b
 export declare const DEFAULT_QUERY_MODEL = "Qwen/Qwen3-1.7B";
 export declare const DEFAULT_GLOB = "**/*.md";
 export declare const DEFAULT_MULTI_GET_MAX_BYTES: number;
+export declare const DEFAULT_EMBED_MAX_DOCS_PER_BATCH = 64;
+export declare const DEFAULT_EMBED_MAX_BATCH_BYTES: number;
 export declare const CHUNK_SIZE_TOKENS = 900;
 export declare const CHUNK_OVERLAP_TOKENS: number;
 export declare const CHUNK_SIZE_CHARS: number;
@@ -76,6 +78,20 @@ export declare function isInsideCodeFence(pos: number, fences: CodeFenceRegion[]
  * @returns The best position to cut at
  */
 export declare function findBestCutoff(breakPoints: BreakPoint[], targetCharPos: number, windowChars?: number, decayFactor?: number, codeFences?: CodeFenceRegion[]): number;
+export type ChunkStrategy = "auto" | "regex";
+/**
+ * Merge two sets of break points (e.g. regex + AST), keeping the highest
+ * score at each position. Result is sorted by position.
+ */
+export declare function mergeBreakPoints(a: BreakPoint[], b: BreakPoint[]): BreakPoint[];
+/**
+ * Core chunk algorithm that operates on precomputed break points and code fences.
+ * This is the shared implementation used by both regex-only and AST-aware chunking.
+ */
+export declare function chunkDocumentWithBreakPoints(content: string, breakPoints: BreakPoint[], codeFences: CodeFenceRegion[], maxChars?: number, overlapChars?: number, windowChars?: number): {
+    text: string;
+    pos: number;
+}[];
 export declare const STRONG_SIGNAL_MIN_SCORE = 0.85;
 export declare const STRONG_SIGNAL_MIN_GAP = 0.15;
 export declare const RERANK_CANDIDATE_LIMIT = 40;
@@ -118,6 +134,8 @@ export declare function normalizePathSeparators(path: string): string;
 export declare function getRelativePathFromPrefix(path: string, prefix: string): string | null;
 export declare function resolve(...paths: string[]): string;
 export declare function enableProductionMode(): void;
+/** Reset production mode flag — only for testing. */
+export declare function _resetProductionModeForTesting(): void;
 export declare function getDefaultDbPath(indexName?: string): string;
 export declare function getPwd(): string;
 export declare function getRealPath(path: string): string;
@@ -311,16 +329,20 @@ export type EmbedResult = {
     errors: number;
     durationMs: number;
 };
+export type EmbedOptions = {
+    force?: boolean;
+    model?: string;
+    maxDocsPerBatch?: number;
+    maxBatchBytes?: number;
+    chunkStrategy?: ChunkStrategy;
+    onProgress?: (info: EmbedProgress) => void;
+};
 /**
  * Generate vector embeddings for documents that need them.
  * Pure function — no console output, no db lifecycle management.
  * Uses the store's LlamaCpp instance if set, otherwise the global singleton.
  */
-export declare function generateEmbeddings(store: Store, options?: {
-    force?: boolean;
-    model?: string;
-    onProgress?: (info: EmbedProgress) => void;
-}): Promise<EmbedResult>;
+export declare function generateEmbeddings(store: Store, options?: EmbedOptions): Promise<EmbedResult>;
 /**
  * Create a new store instance with the given database path.
  * If no path is provided, uses the default path (~/.cache/qmd/index.sqlite).
@@ -505,15 +527,34 @@ export declare function deactivateDocument(db: Database, collectionName: string,
  */
 export declare function getActiveDocumentPaths(db: Database, collectionName: string): string[];
 export { formatQueryForEmbedding, formatDocForEmbedding };
+/**
+ * Chunk a document using regex-only break point detection.
+ * This is the sync, backward-compatible API used by tests and legacy callers.
+ */
 export declare function chunkDocument(content: string, maxChars?: number, overlapChars?: number, windowChars?: number): {
     text: string;
     pos: number;
 }[];
+/**
+ * Async AST-aware chunking. Detects language from filepath, computes AST
+ * break points for supported code files, merges with regex break points,
+ * and delegates to the shared chunk algorithm.
+ *
+ * Falls back to regex-only when strategy is "regex", filepath is absent,
+ * or language is unsupported.
+ */
+export declare function chunkDocumentAsync(content: string, maxChars?: number, overlapChars?: number, windowChars?: number, filepath?: string, chunkStrategy?: ChunkStrategy): Promise<{
+    text: string;
+    pos: number;
+}[]>;
 /**
  * Chunk a document by actual token count using the LLM tokenizer.
  * More accurate than character-based chunking but requires async.
+ *
+ * When filepath and chunkStrategy are provided, uses AST-aware break points
+ * for supported code files.
  */
-export declare function chunkDocumentByTokens(content: string, maxTokens?: number, overlapTokens?: number, windowTokens?: number): Promise<{
+export declare function chunkDocumentByTokens(content: string, maxTokens?: number, overlapTokens?: number, windowTokens?: number, filepath?: string, chunkStrategy?: ChunkStrategy, signal?: AbortSignal): Promise<{
     text: string;
     pos: number;
     tokens: number;
@@ -640,6 +681,7 @@ export declare function getCollectionsWithoutContext(db: Database): {
  * Useful for suggesting where context might be needed.
  */
 export declare function getTopLevelPathsWithoutContext(db: Database, collectionName: string): string[];
+export declare function sanitizeFTS5Term(term: string): string;
 /**
  * Validate that a vec/hyde query doesn't use lex-only syntax.
  * Returns error message if invalid, null if valid.
@@ -665,6 +707,12 @@ export declare function clearAllEmbeddings(db: Database): void;
 /**
  * Insert a single embedding into both content_vectors and vectors_vec tables.
  * The hash_seq key is formatted as "hash_seq" for the vectors_vec table.
+ *
+ * content_vectors is inserted first so that getHashesForEmbedding (which checks
+ * only content_vectors) won't re-select the hash on a crash between the two inserts.
+ *
+ * vectors_vec uses DELETE + INSERT instead of INSERT OR REPLACE because sqlite-vec's
+ * vec0 virtual tables silently ignore the OR REPLACE conflict clause.
  */
 export declare function insertEmbedding(db: Database, hash: string, seq: number, pos: number, embedding: Float32Array, model: string, embeddedAt: string): void;
 export declare function expandQuery(query: string, model: string | undefined, db: Database, intent?: string, llmOverride?: LlamaCpp): Promise<ExpandedQuery[]>;
@@ -763,6 +811,7 @@ export interface HybridQueryOptions {
     explain?: boolean;
     intent?: string;
     skipRerank?: boolean;
+    chunkStrategy?: ChunkStrategy;
     hooks?: SearchHooks;
 }
 export interface HybridQueryResult {
@@ -836,6 +885,7 @@ export interface StructuredSearchOptions {
     intent?: string;
     /** Skip LLM reranking, use only RRF scores */
     skipRerank?: boolean;
+    chunkStrategy?: ChunkStrategy;
     hooks?: SearchHooks;
 }
 /**