@tobilu/qmd 2.0.0 → 2.1.0

This diff shows the changes between two publicly available versions of this package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in the respective public registry.
package/dist/store.d.ts CHANGED
@@ -18,6 +18,8 @@ export declare const DEFAULT_RERANK_MODEL = "ExpedientFalcon/qwen3-reranker:0.6b
18
18
  export declare const DEFAULT_QUERY_MODEL = "Qwen/Qwen3-1.7B";
19
19
  export declare const DEFAULT_GLOB = "**/*.md";
20
20
  export declare const DEFAULT_MULTI_GET_MAX_BYTES: number;
21
+ export declare const DEFAULT_EMBED_MAX_DOCS_PER_BATCH = 64;
22
+ export declare const DEFAULT_EMBED_MAX_BATCH_BYTES: number;
21
23
  export declare const CHUNK_SIZE_TOKENS = 900;
22
24
  export declare const CHUNK_OVERLAP_TOKENS: number;
23
25
  export declare const CHUNK_SIZE_CHARS: number;
@@ -76,6 +78,20 @@ export declare function isInsideCodeFence(pos: number, fences: CodeFenceRegion[]
76
78
  * @returns The best position to cut at
77
79
  */
78
80
  export declare function findBestCutoff(breakPoints: BreakPoint[], targetCharPos: number, windowChars?: number, decayFactor?: number, codeFences?: CodeFenceRegion[]): number;
81
+ export type ChunkStrategy = "auto" | "regex";
82
+ /**
83
+ * Merge two sets of break points (e.g. regex + AST), keeping the highest
84
+ * score at each position. Result is sorted by position.
85
+ */
86
+ export declare function mergeBreakPoints(a: BreakPoint[], b: BreakPoint[]): BreakPoint[];
87
+ /**
88
+ * Core chunk algorithm that operates on precomputed break points and code fences.
89
+ * This is the shared implementation used by both regex-only and AST-aware chunking.
90
+ */
91
+ export declare function chunkDocumentWithBreakPoints(content: string, breakPoints: BreakPoint[], codeFences: CodeFenceRegion[], maxChars?: number, overlapChars?: number, windowChars?: number): {
92
+ text: string;
93
+ pos: number;
94
+ }[];
79
95
  export declare const STRONG_SIGNAL_MIN_SCORE = 0.85;
80
96
  export declare const STRONG_SIGNAL_MIN_GAP = 0.15;
81
97
  export declare const RERANK_CANDIDATE_LIMIT = 40;
@@ -118,6 +134,8 @@ export declare function normalizePathSeparators(path: string): string;
118
134
  export declare function getRelativePathFromPrefix(path: string, prefix: string): string | null;
119
135
  export declare function resolve(...paths: string[]): string;
120
136
  export declare function enableProductionMode(): void;
137
+ /** Reset production mode flag — only for testing. */
138
+ export declare function _resetProductionModeForTesting(): void;
121
139
  export declare function getDefaultDbPath(indexName?: string): string;
122
140
  export declare function getPwd(): string;
123
141
  export declare function getRealPath(path: string): string;
@@ -311,16 +329,20 @@ export type EmbedResult = {
311
329
  errors: number;
312
330
  durationMs: number;
313
331
  };
332
+ export type EmbedOptions = {
333
+ force?: boolean;
334
+ model?: string;
335
+ maxDocsPerBatch?: number;
336
+ maxBatchBytes?: number;
337
+ chunkStrategy?: ChunkStrategy;
338
+ onProgress?: (info: EmbedProgress) => void;
339
+ };
314
340
  /**
315
341
  * Generate vector embeddings for documents that need them.
316
342
  * Pure function — no console output, no db lifecycle management.
317
343
  * Uses the store's LlamaCpp instance if set, otherwise the global singleton.
318
344
  */
319
- export declare function generateEmbeddings(store: Store, options?: {
320
- force?: boolean;
321
- model?: string;
322
- onProgress?: (info: EmbedProgress) => void;
323
- }): Promise<EmbedResult>;
345
+ export declare function generateEmbeddings(store: Store, options?: EmbedOptions): Promise<EmbedResult>;
324
346
  /**
325
347
  * Create a new store instance with the given database path.
326
348
  * If no path is provided, uses the default path (~/.cache/qmd/index.sqlite).
@@ -505,15 +527,34 @@ export declare function deactivateDocument(db: Database, collectionName: string,
505
527
  */
506
528
  export declare function getActiveDocumentPaths(db: Database, collectionName: string): string[];
507
529
  export { formatQueryForEmbedding, formatDocForEmbedding };
530
+ /**
531
+ * Chunk a document using regex-only break point detection.
532
+ * This is the sync, backward-compatible API used by tests and legacy callers.
533
+ */
508
534
  export declare function chunkDocument(content: string, maxChars?: number, overlapChars?: number, windowChars?: number): {
509
535
  text: string;
510
536
  pos: number;
511
537
  }[];
538
+ /**
539
+ * Async AST-aware chunking. Detects language from filepath, computes AST
540
+ * break points for supported code files, merges with regex break points,
541
+ * and delegates to the shared chunk algorithm.
542
+ *
543
+ * Falls back to regex-only when strategy is "regex", filepath is absent,
544
+ * or language is unsupported.
545
+ */
546
+ export declare function chunkDocumentAsync(content: string, maxChars?: number, overlapChars?: number, windowChars?: number, filepath?: string, chunkStrategy?: ChunkStrategy): Promise<{
547
+ text: string;
548
+ pos: number;
549
+ }[]>;
512
550
  /**
513
551
  * Chunk a document by actual token count using the LLM tokenizer.
514
552
  * More accurate than character-based chunking but requires async.
553
+ *
554
+ * When filepath and chunkStrategy are provided, uses AST-aware break points
555
+ * for supported code files.
515
556
  */
516
- export declare function chunkDocumentByTokens(content: string, maxTokens?: number, overlapTokens?: number, windowTokens?: number): Promise<{
557
+ export declare function chunkDocumentByTokens(content: string, maxTokens?: number, overlapTokens?: number, windowTokens?: number, filepath?: string, chunkStrategy?: ChunkStrategy, signal?: AbortSignal): Promise<{
517
558
  text: string;
518
559
  pos: number;
519
560
  tokens: number;
@@ -640,6 +681,7 @@ export declare function getCollectionsWithoutContext(db: Database): {
640
681
  * Useful for suggesting where context might be needed.
641
682
  */
642
683
  export declare function getTopLevelPathsWithoutContext(db: Database, collectionName: string): string[];
684
+ export declare function sanitizeFTS5Term(term: string): string;
643
685
  /**
644
686
  * Validate that a vec/hyde query doesn't use lex-only syntax.
645
687
  * Returns error message if invalid, null if valid.
@@ -665,6 +707,12 @@ export declare function clearAllEmbeddings(db: Database): void;
665
707
  /**
666
708
  * Insert a single embedding into both content_vectors and vectors_vec tables.
667
709
  * The hash_seq key is formatted as "hash_seq" for the vectors_vec table.
710
+ *
711
+ * content_vectors is inserted first so that getHashesForEmbedding (which checks
712
+ * only content_vectors) won't re-select the hash on a crash between the two inserts.
713
+ *
714
+ * vectors_vec uses DELETE + INSERT instead of INSERT OR REPLACE because sqlite-vec's
715
+ * vec0 virtual tables silently ignore the OR REPLACE conflict clause.
668
716
  */
669
717
  export declare function insertEmbedding(db: Database, hash: string, seq: number, pos: number, embedding: Float32Array, model: string, embeddedAt: string): void;
670
718
  export declare function expandQuery(query: string, model: string | undefined, db: Database, intent?: string, llmOverride?: LlamaCpp): Promise<ExpandedQuery[]>;
@@ -763,6 +811,7 @@ export interface HybridQueryOptions {
763
811
  explain?: boolean;
764
812
  intent?: string;
765
813
  skipRerank?: boolean;
814
+ chunkStrategy?: ChunkStrategy;
766
815
  hooks?: SearchHooks;
767
816
  }
768
817
  export interface HybridQueryResult {
@@ -836,6 +885,7 @@ export interface StructuredSearchOptions {
836
885
  intent?: string;
837
886
  /** Skip LLM reranking, use only RRF scores */
838
887
  skipRerank?: boolean;
888
+ chunkStrategy?: ChunkStrategy;
839
889
  hooks?: SearchHooks;
840
890
  }
841
891
  /**